### Code for filtering the species level trait data and scalling it up to the Ecoregion

# This includes one example for quantitative data (Wood Density), and one example for qualitative data (Latex presence)

### Loading Required Packages

library(rgbif)

library(dplyr)

library(purrr)

library(readr) 

library(magrittr)

library(taxize)

library(data.table)

library(rgdal)

library(raster)

library(sp)



#### Example code for quantitative trait data per ecoregion (using Wood Density; WD) ####

# Open wood density dataset with mean values per species derived from Zanne et al. (2009)

# the subset of this dataset consists of all species occurring in Tropical and Extratropical South America and Central America. Values are species-level means.

zanne=read.csv(file.choose(),header=T, sep=",")

head(zanne)

names(zanne)[1]="species"


# checking for duplicated species names

sum(duplicated(zanne$species))

dim(zanne)



# count spaces in taxa name strings to search for spelling errors in species names

countSpaces <- function(s) { sapply(gregexpr(" ", s), function(p) { sum(p>=0) } ) }

zanne[countSpaces(zanne$species)>1,]

zanne[countSpaces(zanne$species)<1,]

# no problems detected



# get ditribution data for the selected species from GBIF 

user <- "your gbif.org username"
pwd <- "your gbif.org password"
email <- "your email"


setwd("your selected path")

write.csv(zanne$species, file="wd-sp.csv")

file_url <- "your selected path/wd-sp.csv"

gbif_taxon_keys <- 
readr::read_csv(file_url) %>% 
pull("x") %>% # use fewer names if you want to just test 
taxize::get_gbifid_(method="backbone") %>% # match names to the GBIF backbone to get taxonkeys
imap(~ .x %>% mutate(original_sciname = .y)) %>% # add original name back into data.frame
bind_rows() %T>% # combine all data.frames into one
readr::write_tsv(path = "all_matches.tsv") %>% # save as side effect for you to inspect if you want
filter(matchtype == "EXACT" & status == "ACCEPTED") %>% # get only accepted and matched names
filter(kingdom == "Plantae") %>% # remove anything that might have matched to a non-plant
pull(usagekey) # get the gbif taxonkeys



occ_download(
pred_in("taxonKey", gbif_taxon_keys),
format = "SIMPLE_CSV",
user=user,pwd=pwd,email=email
)



# download data from GBIF site and unzip

# read the downloaded GBIF data

gbif.data=fread("D:/Work/Ongoing/Projects/Neotropical Large Herbivores and Ecorregions/Trait Compilation/Final Trait Compilation/WD/GBIF/0238294-200613084148143/0238294-200613084148143.csv", head=T, quote="", stringsAsFactors = FALSE,select = c("gbifID","order","family","genus","species","decimalLatitude","decimalLongitude","taxonKey"))


# Exclude all coordinates with NAs

gbif.data=gbif.data[is.na(gbif.data$decimalLatitude)==F,]

dim(gbif.data)



# load the Ecorregions shape file (downloaded from https://ecoregions.appspot.com/)

shape = readOGR(dsn = "your path")

# subset the Neotropics

neot=shape[shape$REALM=="Neotropic",]

names(gbif.data)






# transform the gbif.data data.frame into a SpatialPointsDataFrame

gbif.data=data.frame(gbif.data)

coordinates(gbif.data) <- c("decimalLongitude", "decimalLatitude")



# set the projection to match the projection in the Ecorregions shapefile

proj4string(gbif.data) <- shape@proj4string

class(gbif.data)



# Crop the GBIF data to include only occurrences within the Neotropical realm

library(raster)

ext = extent(neot)

gbif.neo = crop(gbif.data, ext)



# for each occurrence in gbif.neo, get the ecorregion within which it is located

over.neo=over(gbif.neo, neot)

gbif.neo$ECO_NAME=over.neo$ECO_NAME

gbif.neo$ECO_NAME=as.character(gbif.neo$ECO_NAME)

tt=gbif.neo[is.na(gbif.neo$ECO_NAME)==F, ]

tt=data.frame(tt)

dim(tt)


# get the saved taxon keys, saved during in the specified folder when sending the data request to GBIF

keys=fread("D:/Work/Ongoing/Projects/Neotropical Large Herbivores and Ecorregions/Trait Compilation/Final Trait Compilation/WD/GBIF/all_matches.tsv")

head(keys)


# merge the WD means per species with the GBIF keys

bb=zanne

names(bb)[1]="original_sciname"

cc=merge(keys, bb, by="original_sciname", all.x=T)


# according to the GBIF tutorial for downloading the data using the api in R, the usagekey in the keys table is the same as the taxonKeys in the occurrence data table

# thus, we change the column name of the usagekey in the cc table to "taxonkey" so that they match

names(cc)[2]="taxonKey"

cc$taxonKey=as.character(cc$taxonKey)

gbif.neo$taxonKey=as.character(gbif.neo$taxonKey)

# to merge the WD data with the occurrence data is necessary to aggregate the WD data per taxonKey

names(cc)

dd=aggregate(cc$wd, by=list(cc$taxonKey), mean, na.rm=T)

head(dd)

names(dd)=c("taxonKey", "wd")

head(dd)

# now we can merge the WD and the species distribution data from GBIF

ee=merge(gbif.neo, dd, by="taxonKey", all.x=T)

names(gbif.neo)

ee$ECO_NAME=as.character(ee$ECO_NAME)


# Create species abundance per Ecorigion matrix, here, already using the species names corrected by GBIF

gg=as.matrix(table(ee$ECO_NAME, ee$species))

head(gg[,1:3])

dim(gg)

# calculating the total abundance of the ecorregion

tot.abun=apply(gg, 1, sum)

head(tot.abun)

pp=data.frame(tot.abun)

head(pp)

names(pp)="nind.wd"



## calculating the abundance-weighted WD for each species occurrence in each ecorregion

# first, aggregate the wood density data using the names corrected by GBIF

names(ee)

qq=aggregate(ee$wd, by=list(ee$species), mean, na.rm=T)

names(qq)=c("sp","wd")



# check if the species names in the columns of gg are in the same order as whose in the lines of qq

qq=qq[order(colnames(gg)),]

sum(colnames(gg)!=qq$sp)

# multiply the WD values per species by the species abundance matrix 

rr=sweep(gg, MARGIN=2, qq$wd, '*')

# Calculate the sum across species of the WD mutiplied by abundance per ecorregion

ss=apply(rr, 1, sum, na.rm=T)

head(ss)

ss1=data.frame(ss)


# merge to the values of total abundance per ecoregion

pp1=merge(pp,ss1,by=0,all=T)

head(pp1)



# then calculate the abundance weighted mean WD of the ecoregions

pp1$wd.z=pp1$ss/pp1$nind.wd


# merge with the ecoregion data


data1=read.csv(file.choose(),header=T)

data2=merge(data1,pp3,by="ECO_NAME", all=T)



### Now for latex as an example of cathegorical trait


###### Code for getting ecoregion level trait data

# Read .csv file with the species by trait matrix with species names in the first colum and latex presence-absence in the second (tagged: 1 for presence; 0 for absence)

latex01=read.csv(file.choose(),header=T, sep=";")

### 2) Get ditribution data for your species from GBIF 

# codes in this section are the same as those used for requesting distribution data for Wood Density species  

### 3) Download the .csv file with the ocurrences from the GBIF webpage to a folder and load in R

gbif.data=fread(file.choose(), head=T, quote="", stringsAsFactors = FALSE)

# browse to file

# Exclude all coordinates with NAs

gbif.data=gbif.data[is.na(gbif.data$decimalLatitude)==F,]


### 4) Match occurrences with Ecoregions

names(gbif.data)

# transform the gbif.data data.frame into a SpatialPointsDataFrame

coordinates(gbif.data) <- c("decimalLongitude", "decimalLatitude")

# set the projection to match the projection in the Ecorregions shape

proj4string(gbif.data) <- shape@proj4string

# Crop the GBIF data to include only occurrences within the Neotropical realm

gbif.neo = crop(gbif.data, ext)

dim(gbif.neo)



# get the Ecoregion corresponding to each occurrence location in gbif.neo

gbif.neo$ECO_NAME=over(gbif.neo, neot)$ECO_NAME

gbif.neo$ECO_NAME=as.character(gbif.neo$ECO_NAME)

tt=gbif.neo[is.na(gbif.neo$ECO_NAME)==F, ]

tt=data.frame(tt)

dim(tt)






# get the saved species key per species data to merge with latex data

keys=fread("D:/Work/Ongoing/Projects/Neotropical Large Herbivores and Ecorregions/Trait Compilation/Final Trait Compilation/Leaf Defences/Latex/GBIF/all_matches.tsv")



# use the column "original_sciname" in the gbif.neo file to merge the occurrence and the trait data

names(latex)[1]="original_sciname"


#calculate latescence per taxon keys

cc=merge(keys,latex, by="original_sciname", all.x=T)

head(cc, 15)

names(gbif.neo)

names(cc)

names(cc)[2]="taxonKey"

class(cc$taxonKey)

cc$taxonKey=as.character(cc$taxonKey)

head(cc)

class(gbif.neo$taxonKey)

gbif.neo$taxonKey=as.character(gbif.neo$taxonKey)

cc1=data.frame(cc)





# Check for NAs

# the name of the column containin the presence (1) absence (0) was called latex01

cc1[is.na(cc1$latex01),]

# exclude NAs

cc1=cc1[is.na(cc1$latex01)==F,]

# Aggregate trait to the taxonkey level to make sure there are no duplicates

# We used the maximum value (1) to assign presence to the species in the case of divergence, as previously defined

dd=aggregate(cc1[,30], by=list(cc1$taxonKey), max, na.rm=T)

names(dd)=c("taxonKey", "ltx")


# Merging the distribution and the trait data via GBIF taxon key

ee=merge(gbif.neo, dd, by="taxonKey", all.x=T)

names(gbif.neo)

# check for NAs

sum(is.na(ee$ltx))

# Exclude NAs

ee=ee[is.na(ee$ltx)==F,]

ee$ECO_NAME=as.character(ee$ECO_NAME)


# build a matrix with the number of species occurrence per ecorregion 

gg=as.matrix(table(ee$ECO_NAME, ee$species))


# determine the presence of latex per species using the names corrected by GBIF and matching the occcurrence data

ff=ee[,c(11,50)]

ff=ff[,c(1,2)]

hh=aggregate(ff[,2], by=list(ff$species), max, na.rm=T)

hh=data.frame(hh)

names(hh)=c("species","ltx")


# get the total number of individuals per ecoregion

jj2=apply(gg,1,sum, na.rm=T)

names(jj2)=c("ECO_NAME","nind.ltx")



# multiply the matrices by the latescence data 

# put the two matrices in the same order

gg=gg[,order(hh$species)]

# check if names match

sum(hh[,1]==colnames(gg))

sum(hh[,1]!=colnames(gg))



# multipli one matrix by the other


ll=sweep(gg, MARGIN=2, hh[,2], "*")

head(ll)

mm=data.frame(apply(ll,1,sum, na.rm=T))

head(mm)

names(mm)="lxt.pres.ind"

head(jj2)

row.names(jj2)=jj2$ECO_NAME

head(mm)

nn=merge(jj2,mm,by=0,all.x=T,all.y=T)

head(nn)

# calculate number of plants without latex

nn$lxt.abs.ind=nn$nind.ltx-nn$lxt.pres.ind

# and the proportion of individuals with latex

nn$prop.ltx.ind=round(nn$lxt.pres.ind/nn$nind.ltx,3)





# merge to the dataset containing the predictor variables

data3=merge(data2,nn,by="ECO_NAME",all=T)