### Code for filtering the species level trait data and scalling it up to the Ecoregion # This includes one example for quantitative data (Wood Density), and one example for qualitative data (Latex presence) ### Loading Required Packages library(rgbif) library(dplyr) library(purrr) library(readr) library(magrittr) library(taxize) library(data.table) library(rgdal) library(raster) library(sp) #### Example code for quantitative trait data per ecoregion (using Wood Density; WD) #### # Open wood density dataset with mean values per species derived from Zanne et al. (2009) # the subset of this dataset consists of all species occurring in Tropical and Extratropical South America and Central America. Values are species-level means. zanne=read.csv(file.choose(),header=T, sep=",") head(zanne) names(zanne)[1]="species" # checking for duplicated species names sum(duplicated(zanne$species)) dim(zanne) # count spaces in taxa name strings to search for spelling errors in species names countSpaces <- function(s) { sapply(gregexpr(" ", s), function(p) { sum(p>=0) } ) } zanne[countSpaces(zanne$species)>1,] zanne[countSpaces(zanne$species)<1,] # no problems detected # get ditribution data for the selected species from GBIF user <- "your gbif.org username" pwd <- "your gbif.org password" email <- "your email" setwd("your selected path") write.csv(zanne$species, file="wd-sp.csv") file_url <- "your selected path/wd-sp.csv" gbif_taxon_keys <- readr::read_csv(file_url) %>% pull("x") %>% # use fewer names if you want to just test taxize::get_gbifid_(method="backbone") %>% # match names to the GBIF backbone to get taxonkeys imap(~ .x %>% mutate(original_sciname = .y)) %>% # add original name back into data.frame bind_rows() %T>% # combine all data.frames into one readr::write_tsv(path = "all_matches.tsv") %>% # save as side effect for you to inspect if you want filter(matchtype == "EXACT" & status == "ACCEPTED") %>% # get only accepted and matched names filter(kingdom == "Plantae") %>% # remove anything that might have matched to a non-plant pull(usagekey) # get the gbif taxonkeys occ_download( pred_in("taxonKey", gbif_taxon_keys), format = "SIMPLE_CSV", user=user,pwd=pwd,email=email ) # download data from GBIF site and unzip # read the downloaded GBIF data gbif.data=fread("D:/Work/Ongoing/Projects/Neotropical Large Herbivores and Ecorregions/Trait Compilation/Final Trait Compilation/WD/GBIF/0238294-200613084148143/0238294-200613084148143.csv", head=T, quote="", stringsAsFactors = FALSE,select = c("gbifID","order","family","genus","species","decimalLatitude","decimalLongitude","taxonKey")) # Exclude all coordinates with NAs gbif.data=gbif.data[is.na(gbif.data$decimalLatitude)==F,] dim(gbif.data) # load the Ecorregions shape file (downloaded from https://ecoregions.appspot.com/) shape = readOGR(dsn = "your path") # subset the Neotropics neot=shape[shape$REALM=="Neotropic",] names(gbif.data) # transform the gbif.data data.frame into a SpatialPointsDataFrame gbif.data=data.frame(gbif.data) coordinates(gbif.data) <- c("decimalLongitude", "decimalLatitude") # set the projection to match the projection in the Ecorregions shapefile proj4string(gbif.data) <- shape@proj4string class(gbif.data) # Crop the GBIF data to include only occurrences within the Neotropical realm library(raster) ext = extent(neot) gbif.neo = crop(gbif.data, ext) # for each occurrence in gbif.neo, get the ecorregion within which it is located over.neo=over(gbif.neo, neot) gbif.neo$ECO_NAME=over.neo$ECO_NAME gbif.neo$ECO_NAME=as.character(gbif.neo$ECO_NAME) tt=gbif.neo[is.na(gbif.neo$ECO_NAME)==F, ] tt=data.frame(tt) dim(tt) # get the saved taxon keys, saved during in the specified folder when sending the data request to GBIF keys=fread("D:/Work/Ongoing/Projects/Neotropical Large Herbivores and Ecorregions/Trait Compilation/Final Trait Compilation/WD/GBIF/all_matches.tsv") head(keys) # merge the WD means per species with the GBIF keys bb=zanne names(bb)[1]="original_sciname" cc=merge(keys, bb, by="original_sciname", all.x=T) # according to the GBIF tutorial for downloading the data using the api in R, the usagekey in the keys table is the same as the taxonKeys in the occurrence data table # thus, we change the column name of the usagekey in the cc table to "taxonkey" so that they match names(cc)[2]="taxonKey" cc$taxonKey=as.character(cc$taxonKey) gbif.neo$taxonKey=as.character(gbif.neo$taxonKey) # to merge the WD data with the occurrence data is necessary to aggregate the WD data per taxonKey names(cc) dd=aggregate(cc$wd, by=list(cc$taxonKey), mean, na.rm=T) head(dd) names(dd)=c("taxonKey", "wd") head(dd) # now we can merge the WD and the species distribution data from GBIF ee=merge(gbif.neo, dd, by="taxonKey", all.x=T) names(gbif.neo) ee$ECO_NAME=as.character(ee$ECO_NAME) # Create species abundance per Ecorigion matrix, here, already using the species names corrected by GBIF gg=as.matrix(table(ee$ECO_NAME, ee$species)) head(gg[,1:3]) dim(gg) # calculating the total abundance of the ecorregion tot.abun=apply(gg, 1, sum) head(tot.abun) pp=data.frame(tot.abun) head(pp) names(pp)="nind.wd" ## calculating the abundance-weighted WD for each species occurrence in each ecorregion # first, aggregate the wood density data using the names corrected by GBIF names(ee) qq=aggregate(ee$wd, by=list(ee$species), mean, na.rm=T) names(qq)=c("sp","wd") # check if the species names in the columns of gg are in the same order as whose in the lines of qq qq=qq[order(colnames(gg)),] sum(colnames(gg)!=qq$sp) # multiply the WD values per species by the species abundance matrix rr=sweep(gg, MARGIN=2, qq$wd, '*') # Calculate the sum across species of the WD mutiplied by abundance per ecorregion ss=apply(rr, 1, sum, na.rm=T) head(ss) ss1=data.frame(ss) # merge to the values of total abundance per ecoregion pp1=merge(pp,ss1,by=0,all=T) head(pp1) # then calculate the abundance weighted mean WD of the ecoregions pp1$wd.z=pp1$ss/pp1$nind.wd # merge with the ecoregion data data1=read.csv(file.choose(),header=T) data2=merge(data1,pp3,by="ECO_NAME", all=T) ### Now for latex as an example of cathegorical trait ###### Code for getting ecoregion level trait data # Read .csv file with the species by trait matrix with species names in the first colum and latex presence-absence in the second (tagged: 1 for presence; 0 for absence) latex01=read.csv(file.choose(),header=T, sep=";") ### 2) Get ditribution data for your species from GBIF # codes in this section are the same as those used for requesting distribution data for Wood Density species ### 3) Download the .csv file with the ocurrences from the GBIF webpage to a folder and load in R gbif.data=fread(file.choose(), head=T, quote="", stringsAsFactors = FALSE) # browse to file # Exclude all coordinates with NAs gbif.data=gbif.data[is.na(gbif.data$decimalLatitude)==F,] ### 4) Match occurrences with Ecoregions names(gbif.data) # transform the gbif.data data.frame into a SpatialPointsDataFrame coordinates(gbif.data) <- c("decimalLongitude", "decimalLatitude") # set the projection to match the projection in the Ecorregions shape proj4string(gbif.data) <- shape@proj4string # Crop the GBIF data to include only occurrences within the Neotropical realm gbif.neo = crop(gbif.data, ext) dim(gbif.neo) # get the Ecoregion corresponding to each occurrence location in gbif.neo gbif.neo$ECO_NAME=over(gbif.neo, neot)$ECO_NAME gbif.neo$ECO_NAME=as.character(gbif.neo$ECO_NAME) tt=gbif.neo[is.na(gbif.neo$ECO_NAME)==F, ] tt=data.frame(tt) dim(tt) # get the saved species key per species data to merge with latex data keys=fread("D:/Work/Ongoing/Projects/Neotropical Large Herbivores and Ecorregions/Trait Compilation/Final Trait Compilation/Leaf Defences/Latex/GBIF/all_matches.tsv") # use the column "original_sciname" in the gbif.neo file to merge the occurrence and the trait data names(latex)[1]="original_sciname" #calculate latescence per taxon keys cc=merge(keys,latex, by="original_sciname", all.x=T) head(cc, 15) names(gbif.neo) names(cc) names(cc)[2]="taxonKey" class(cc$taxonKey) cc$taxonKey=as.character(cc$taxonKey) head(cc) class(gbif.neo$taxonKey) gbif.neo$taxonKey=as.character(gbif.neo$taxonKey) cc1=data.frame(cc) # Check for NAs # the name of the column containin the presence (1) absence (0) was called latex01 cc1[is.na(cc1$latex01),] # exclude NAs cc1=cc1[is.na(cc1$latex01)==F,] # Aggregate trait to the taxonkey level to make sure there are no duplicates # We used the maximum value (1) to assign presence to the species in the case of divergence, as previously defined dd=aggregate(cc1[,30], by=list(cc1$taxonKey), max, na.rm=T) names(dd)=c("taxonKey", "ltx") # Merging the distribution and the trait data via GBIF taxon key ee=merge(gbif.neo, dd, by="taxonKey", all.x=T) names(gbif.neo) # check for NAs sum(is.na(ee$ltx)) # Exclude NAs ee=ee[is.na(ee$ltx)==F,] ee$ECO_NAME=as.character(ee$ECO_NAME) # build a matrix with the number of species occurrence per ecorregion gg=as.matrix(table(ee$ECO_NAME, ee$species)) # determine the presence of latex per species using the names corrected by GBIF and matching the occcurrence data ff=ee[,c(11,50)] ff=ff[,c(1,2)] hh=aggregate(ff[,2], by=list(ff$species), max, na.rm=T) hh=data.frame(hh) names(hh)=c("species","ltx") # get the total number of individuals per ecoregion jj2=apply(gg,1,sum, na.rm=T) names(jj2)=c("ECO_NAME","nind.ltx") # multiply the matrices by the latescence data # put the two matrices in the same order gg=gg[,order(hh$species)] # check if names match sum(hh[,1]==colnames(gg)) sum(hh[,1]!=colnames(gg)) # multipli one matrix by the other ll=sweep(gg, MARGIN=2, hh[,2], "*") head(ll) mm=data.frame(apply(ll,1,sum, na.rm=T)) head(mm) names(mm)="lxt.pres.ind" head(jj2) row.names(jj2)=jj2$ECO_NAME head(mm) nn=merge(jj2,mm,by=0,all.x=T,all.y=T) head(nn) # calculate number of plants without latex nn$lxt.abs.ind=nn$nind.ltx-nn$lxt.pres.ind # and the proportion of individuals with latex nn$prop.ltx.ind=round(nn$lxt.pres.ind/nn$nind.ltx,3) # merge to the dataset containing the predictor variables data3=merge(data2,nn,by="ECO_NAME",all=T)