#********************************************************************************
#********************************************************************************
# 
## R scripts for manuscript:
##
##          Reptile diversity patterns under climate and land use change scenarios 
##                      in a subtropical montane landscape in Mexico 
##
##  Daniel G. Ramírez-Arce, Leticia M. Ochoa-Ochoa, Andrés Lira-Noriega & Carlos Martorell
##
## This R code serves as an example for computing species distribution models, functional diversity analysis 
## and generalized additive models as performed in the manuscript.
# The following R scripts include three parts:  
# (1). Script for computing species distribution models.
# (2). Script for calculating taxonomic diversity, functional diversity and functional diversity 
#      standardized effect sizes.
# (3). Script for computing generalized additive models to:
#      a) Model the relationships between taxonomic, functional diversity, functional diversity
#         standardized effect sizes, and climatic, topographic, and land use/cover variables.
#      b) Compare mean taxonomic and functional diversity between seven climate and land use change scenarios.
# 
#
#*******************************************************************************
#*******************************************************************************

#Loading libraries#
library(raster)
library(spocc)
library(scrubr)
library(rgeos)
library(ellipsenm)
library(magrittr)
library(mapview)
library(kuenm)
library(maps)
library(sp)
library(terra)
library(picante)
library(SpatialPack)
library(msm)
library(MuMIn)
library(mgcv)
library(readr)
library(mgcViz)
library(extrafont)
library(spdep)
library(parallel)
library(foreach)
library(doParallel)
library(knitr)
library(rmdformats)
library(matrixStats)
library(ntbox)
library(raster)
library(dplyr)
library(purrr)
library(ggpubr)
library(nortest)
library(gstat)
library(nlme)
library(lmtest)
library(DHARMa)
library(gawdis)
library(NbClust)
library(mFD)
library(emmeans)
library(gratia)

#Assigning a working directory#
setwd("write_the_path_of_your_working_directory_here")


####################################################################################
#
# (1). Computing species distribution models.
#
# NOTE: Occurrence data was obtained from several sources including literature and 
# biological collection databases, and there is no R script for that part of the
# process. The occurrence data file for each species must be a csv file with three columns (species_name,
# longitude, latitude) and several rows representing each occurrence observed and its
# respective coordinates.
####################################################################################


#####Division of occurrence data into training (70%) and internal validation data (30%)#####

species1_occ_data <- read.csv("species1_occ_data.csv") #The csv file for the species occurrence data is uploaded here#

species1_samp <- sample(nrow(species1_occ_data),
                       size =ceiling(nrow(species1_occ_data)*0.7))
species1_train <- species1_occ_data[species1_samp,1:3]
write.csv(species1_train, "species1_train.csv") #Save the training data in a separate csv file#
species1_test <-  species1_occ_data[-species1_samp,1:3]
write.csv(species1_test, "species1_test.csv") #Save the test data in a separate csv file#

#NOTE: This is performed separately for each species that is going to be modeled#


#####Environmental selection with three methods#####

bio1<-raster('Bio01.TIF')  #Each climatic variable is uploaded. These variables are raster files downloaded from Chelsa climatology and clipped to Mexico#
bio2<-raster('Bio02.TIF')
bio3<-raster('Bio03.TIF')
bio4<-raster('Bio04.TIF')
bio5<-raster('Bio05.TIF')
bio6<-raster('Bio06.TIF')
bio7<-raster('Bio07.TIF')
bio8<-raster('Bio08.TIF')
bio9<-raster('Bio09.TIF')
bio10<-raster('Bio10.TIF')
bio11<-raster('Bio11.TIF')
bio12<-raster('Bio12.TIF')
bio13<-raster('Bio13.TIF')
bio14<-raster('Bio14.TIF')
bio15<-raster('Bio15.TIF')
bio16<-raster('Bio16.TIF')
bio17<-raster('Bio17.TIF')
bio18<-raster('Bio18.TIF')
bio19<-raster('Bio19.TIF')

bios_stack<-stack(bio1,bio2,bio3,bio4,bio5,bio6,bio7,bio8,bio9,bio10,bio11,bio12,bio13,bio14,bio15,bio16,bio17,bio18,bio19)

species1_occ<-read.csv("species1_occ_data.csv") #Upload the species occurrence data here#

species1_envir <- raster::extract(bios_stack,species1_occ[,c("longitude","latitude")]) #Extract the environmental data in each species' occurrence data point#
species1_occ_envir <- data.frame(species1_occ,species1_envir) #Create a single dataframe with both occurrence and environmental data#

#Method_1: Spearman correlations#
species1_occ_envir_spea <- cor(species1_occ_envir[,-(1:3)],method = "spearman")
species1_occ_envir_spea  %>% as.data.frame()
species1_occ_envir_filtroSpea <- ntbox::correlation_finder(species1_occ_envir_spea,
                                                  threshold = 0.8,
                                                  verbose = F)
cat("Less correlated variables under a 0.8 threshold",
    species1_occ_envir_filtroSpea$descriptors) #This gives the names of the variables selected by this method#

#Method_2: Variance inflation factor#
vif <- function(evars) {
  nvars <- 1:ncol(evars)
  tabla_vif <- nvars %>% purrr::map_df(function(x){
    modelo <- lm(evars[,x]~.,data=evars[-x])
    rsqrt <- summary(modelo)[["r.squared"]]
    vif <- 1/(1-rsqrt)
    rr <- data.frame(Variable=colnames(evars)[x],VIF=vif)
    return(rr)
  })
  return(tabla_vif)}

select_by_vif <- function(evars,umbral=10){
  busca <- TRUE
  vals <- vif(evars)
  variables <- colnames(evars)
  while (busca) {
    order_vif <- order(vals$VIF,decreasing = T)
    new_df <- vals[order_vif,]
    variables <- variables[order_vif]
    if(new_df$VIF[1]>umbral){
      variables <- new_df$Variable[-1]
      vals <- vif(evars[,variables])
    } else{
      busca <- FALSE
    }
  }
  return(vals)
}

vif(evars = species1_occ_envir[,-(1:3)])
select_by_vif(evars=species1_occ_envir[,-(1:3)],umbral = 10) #This gives the names of the variables selected by this method#

#Method_3: For the third set, we ran an initial model with default settings in Maxent 
#using all 19 variables, and select all variables presenting a cumulative importance value of 95%.
#This was done in the Maxent graphical user interface, so there is not a code for this part#

#NOTE: This process is performed separately for each species that is going to be modeled#


#####Creation of the accessibility area (M)#####

species1_occ_data <- read.csv("species1_occ_data.csv") #Upload the species occurrence data#
points<-species1_occ_data[c(2,3)] #Take out the coordinates data#
xy_species<-coordinates(points) #Take out the coordinates data#
Species_points<-SpatialPointsDataFrame(coords = xy_species, data = points, proj4string = CRS("+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0"))
ecoreg <- rgdal::readOGR("Terrestrial_ecorregions") #Upload shape data of ecoregions#
mapview(ecoreg)
mapview(Species_points)

ecoreg.sub <- ecoreg[!is.na(sp::over(ecoreg, sp::geometry(Species_points))), ] #Select all ecoregions where there is occurrence data points, in order to create the M area# 
plot(ecoreg.sub)

writeOGR(ecoreg.sub, "species1/M", "species1_M", driver = "ESRI Shapefile") #Save the M area as a shape file if desired#

var_mask <- mask(crop(bios_stack, ecoreg.sub), ecoreg.sub) #All environmental variables (raster files) are clipped according to the M area#

rnames <- paste0("Environmental_variables_M/", names(bios_stack), ".asc")
Species_variables <- lapply(1:nlayers(var_mask), function(x) {
  writeRaster(var_mask[[x]], filename = rnames[x], format = "ascii") 
}) # Save the 19 clipped environmental variables (raster files) according to the M area. Change raster format if desired#

#NOTE1: From these clipped raster files, create three environmental datasets according to the 
#variable selection procedure perfomed with the three methods described above. Each set must be named
#"set1", "set2" and "set3". Then, those three folders must be included in a single folder named 
#"Calibration_data", which is going to be used for the following analysis#

#NOTE2: This process is performed separately for each species that is going to be modeled#

#####Model calibration and selection#####

occ_joint <- "species1_occ_data.csv" #Upload species occurrence data
occ_tra <- "species1_train.csv" #Upload species train data#
M_var_dir <- "Calibration_data" #Upload folder with three environmental datasets#
batch_cal <- "Candidate_models" #This is the name of a batch file that is created along with candidate models#
out_dir <- "Candidate_Models" #This is a folder where candidate models are going to be saved#
reg_mult <- c(0.1,0.4,0.7,1,2,3,4,5) #Select specific regularization multipliers#
f_clas <- c("l","q","h","lq","lqh") #Select the feature classes combinations#
args <- NULL
maxent_path <- "Write here the path of the Maxent folder"
wait <- FALSE
run <- TRUE

Species_cal<-kuenm_cal(occ.joint = occ_joint, occ.tra = occ_tra, M.var.dir = M_var_dir, batch = batch_cal,
                       out.dir = out_dir, reg.mult = reg_mult, f.clas = f_clas, args = args,
                       maxent.path = maxent_path, wait = wait, run = run) #Model calibration#

occ_test <- "species1_test.csv" #Upload species test data#
out_eval <- "Calibration_results" #This is a folder where results will be saved#
threshold <- 5
rand_percent <- 50
iterations <- 100
kept <- TRUE
selection <- "OR_AICc"
paral_proc <- FALSE

Species_ceval <- kuenm_ceval(path = out_dir, occ.joint = occ_joint, occ.tra = occ_tra,
                             occ.test = occ_test, batch = batch_cal,
                             out.eval = out_eval, threshold = threshold,
                             rand.percent = rand_percent, iterations = iterations,
                             kept = kept, selection = selection,
                             parallel.proc = paral_proc) #Model selection#

#NOTE1: Final models are built with the parameterization of the best model selected. 
#In our case, this was done in the Maxent graphical user interface, so there is not a code for this part.
#Climatic variables for years 2050 and 2070 are used to make projections to future scenarios.#

#NOTE2: This process is performed separately for each species that is going to be modeled#

#FINAL NOTE: From the suitability maps generated in Maxent, binary presence/absence maps 
#are created for each scenario in order to use them in subsequent analyses. These maps must be in a tiff 
#file format and stored in a folder for the following analysis#



####################################################################################
#
# (2). Computing taxonomic and functional diversity analysis.
#
# NOTE: Here, the creation of presence-absence matrices for each scenario is explained.
# These correspond to the seven files names "Species_PAM": SpeciesPAM_Current-2015.csv,
# SpeciesPAM_RCP2.6-2050.csv, SpeciesPAM_RCP2.6-2070.csv, SpeciesPAM_RCP4.5-2050.csv, 
# SpeciesPAM_RCP4.5-2070.csv, SpeciesPAM_RCP8.5-2050.csv, SpeciesPAM_RCP8.5-2070.csv
#
####################################################################################


#####Creation of presence-absence matrices from species distribution models#####

setwd("write here the name of the folder with all species presence/absence maps")
Species_maps <- list.files(pattern = "\\.tif$")
raster_stack <- stack(Species_maps)
names(raster_stack)
species_points <- rasterToPoints(raster_stack)  #This create a data frame with species' presence-absence data from all pixels in the region# 
write.csv(species_points, "SpeciesPAM_Current-2015.csv") #Save the presence-absence matrix#

#NOTE: This process is performed separately for each scenario# 


#####Calculation of taxonomic and functional diversity#####

speciesPAM <- read.csv("SpeciesPAM_Current-2015.csv", header = TRUE, row.names = 1) #Upload species presence-absence dataframe for an specific scenario#
speciesPAM <- as.matrix(speciesPAM) #Convert to matrix format#
speciesTraits <- read.csv("Speciesxtraits.csv", header = TRUE, row.names = 1) #Upload species traits dataframe#
traitstable <- read.csv("Speciesxtraits_info.csv", header = TRUE) #Upload species traits information (see Magneville et al., 2021)#

#Pre-processing of trait data#
hist(speciesTraits[, 1], main = colnames(speciesTraits)[1]) #Create an histogram to check for normality for continuous traits#
hist(log(speciesTraits[, 1]), main = paste("Log of ", colnames(speciesTraits)[1])) #Create an histogram with a logarithmic scale to check if this helps for normality#
speciesTraits$trait1 <- log(speciesTraits$trait1) #Convert continuous trait to logarithmic scale for normality#

#NOTE: This is done for each continuous trait#

speciesTraits$trait4 <- as.factor(speciesTraits$trait4) #Convert to factor if necessary#
plot(speciesTraits$trait4) #Check the number of observations for each category for categorical traits# 

#NOTE: This is done for each categorical trait#

#Summary of traits and presence-absence matrix data#
sp_traits_summ <- sp.tr.summary(
  tr_cat     = traitstable,   
  sp_tr      = speciesTraits, 
  stop_if_NA = FALSE)
sp_traits_summ

sp_abun_summ <- asb.sp.summary(asb_sp_w = speciesPAM)
sp_abun_summ

#Calculation of functional distances between species#
sp_dist_comm2 <- gawdis(speciesTraits, w.type = "optimized", 
                            opti.maxiter = 300) #Creation of functional dissimilarity matrix with gawdis function (de Bello et al. 2020)#
sp_dist_comm2

attr(sp_dist_comm2, "correls") #Contribution of each trait to the functional dissimilarity matrix#
attr(sp_dist_comm2, "weights") #Weight of each trait to the functional dissimilarity matrix#

#Creation and evaluation of the best functional space#
fspaces_quality_comm <- quality.fspaces(
  sp_dist             = sp_dist_comm2,
  maxdim_pcoa         = 10,
  deviation_weighting = c("absolute","squared"),
  fdist_scaling       = c(FALSE,TRUE),
  fdendro             = "average")
fspaces_quality_comm$quality_fspaces
apply(fspaces_quality_comm$quality_fspaces, 2, which.min)

#Functional diversity indexes calculation#
sp_faxes_coord_comm <- fspaces_quality_comm$"details_fspaces"$"sp_pc_coord" #Extract species coordinates from functional space#

alpha_fd_indices_comm <- alpha.fd.multidim(
  sp_faxes_coord   = sp_faxes_coord_comm[ , c("PC1", "PC2", "PC3", "PC4", "PC5")],
  asb_sp_w         = speciesPAM,
  ind_vect         = c("fdis"),
  scaling          = TRUE,
  check_input      = TRUE,
  details_returned = TRUE)
alpha_fd_indices_comm$functional_diversity_indices
write.csv(alpha_fd_indices_comm$functional_diversity_indices, 
          "TD_FD_results.csv") #Save the species richness and functional dispersion index results#

#NOTE: This process is performed separately for each scenario# 


#####Calculation of functional diversity SES#####

#Creation of null models#
cores=detectCores()
cl <- makeCluster(cores-1) #To not overload your computer#
registerDoParallel(cl) #To perform null models in parallel to optimize computational performance#
resul <- foreach(i=1:999, .combine=cbind) %dopar% {
  library(mFD)
  library(picante)
  library(gawdis)
  setwd("write here the name of the folder with your presence-absence matrix and trait data")
  speciesPAM <- read.csv("SpeciesPAM_Current-2015.csv", header = TRUE, row.names = 1)
  speciesTraits <- read.csv("Speciesxtraits.csv", header = TRUE, row.names = 1)
  speciesTraits$trait1 <- log(speciesTraits$trait1)
  speciesTraits$trait2 <- log(speciesTraits$trait2)
  speciesTraits$trait3 <- log(speciesTraits$trait3)
  speciesTraits$trait4 <- as.factor(speciesTraits$trait4)
  speciesTraits$trait5 <- as.factor(speciesTraits$trait5)
  speciesTraits$trait6 <- as.factor(speciesTraits$trait6)
  speciesTraits$trait7 <- as.factor(speciesTraits$trait7)
  speciesTraits$trait8 <- as.factor(speciesTraits$trait8)
  speciesPAMmat <- as.matrix(speciesPAM)
  randomizedPA <- randomizeMatrix(samp = speciesPAMmat, null.model = "richness")
  sp_dist_comm2 <- gawdis(speciesTraits, w.type = "optimized", 
                          opti.maxiter = 300)
  fspaces_quality_comm <- quality.fspaces(
    sp_dist             = sp_dist_comm2,
    maxdim_pcoa         = 10,
    deviation_weighting = c("absolute","squared"),
    fdist_scaling       = c(FALSE,TRUE),
    fdendro             = "average")
  sp_faxes_coord_comm <- fspaces_quality_comm$"details_fspaces"$"sp_pc_coord"
  fd_exp <- alpha.fd.multidim(
    sp_faxes_coord   = sp_faxes_coord_comm[ , c("PC1", "PC2", "PC3", "PC4", "PC5")],
    asb_sp_w         = randomizedPA,
    ind_vect         = c("fdis"),
    scaling          = TRUE,
    check_input      = TRUE,
    details_returned = TRUE)
  fd_exp$functional_diversity_indices
}
View(resul)

resulFDis <- resul[, grepl("^fdis", names(resul))] #This gives the functional dispersion index expected for the 999 null models#
obsFDis <- alpha_fd_indices_comm$functional_diversity_indices$fdis #This is the observed functional dispersion index from your data#

stopCluster(cl = cl)

#Calculation of functional dispersion SES#
meanNullFDis <- rowMeans(resulFDis)
meanNullFDis
ESFDis <- obsFDis - meanNullFDis
sdNullFDis <- apply(resulFDis, 1, sd)
SESFDis <- ESFDis / sdNullFDis
data.frame(SESFDis)
write.csv(SESFDis, "FDisSES_current.csv") #Save the functional dispersion SES#

#NOTE: This process is performed separately for each scenario#


####################################################################################
#
# (3). Computing Generalized additive models.
#
# NOTE: All climatic, topographic and land use/cover variables data was extracted from 
# raster files in QGIS and included in a dataframe with the taxonomic and functional diversity 
# results from the last section. Climatic variables were obtained from CHELSA Climatology
# (https://chelsa-climate.org/; Karger et al., 2017), topographic variables from EarthEnv
# (https://www.earthenv.org/; Amatulli et al., 2018) and lan use/cover variables from 
# Mendoza-Ponce et al. (2018; 2020). 
#
# a) To explore the relationships between taxonomic diversity, functional diversity, 
#    functional diversity standardized effect sizes and climatic, topographic, and 
#    land use/cover variables the "Environment_GAM.csv" file is used. 
#
# b) To compare diversity dimensions between scenarios, the "Scenarios_GAM.csv" file
#    is used.
#
####################################################################################


##### a) Diversity-environmental relationships #####

datos <- read.csv("Environment_GAM.csv",header = TRUE, row.names = 1)
datos <- na.omit(datos) #Eliminate NAs from data if any#
interval <- floor(nrow(datos) / 1000) #Select a subsample from all pixels for following analysis#
sampled_indices <- seq(1, nrow(datos), by = interval)
datos_samp <- datos[sampled_indices, ]

min_max_scale <- function(x){(x - min(x)) / (max(x) - min(x))} #Standarization of variables between 0 and 1 (except for land use/cover)#
datos_samp[,6:25] <- lapply(datos_samp[6:25], min_max_scale)

gam_model<-gam(Rich ~ s(Bio1, k=10, bs="tp") + 
                 s(Bio4, k=10, bs="tp") + 
                 s(Bio12, k=10, bs="tp") + 
                 s(Bio15, k=10, bs="tp") +
                 s(tpi, k=10, bs="tp") +
                 LUC + 
                 s(x, y, bs="gp", m=2), 
               family=poisson(), data=datos_samp, method = "REML")	#Generalized additive model#

summary(gam_model) #Summary of model results#
appraise(gam_model) #Diagnosis plots of model#
draw(gam_model) #Partial effect plots#

model_spt <- simulateResiduals(fittedModel = gam_model, n = 1000)
testSpatialAutocorrelation(model_spt, datos_samp$x, datos_samp$y) #Check for spatial autocorrelation#

gam_model_emm <- emmeans(gam_model, ~LUC) #Calculation of Estimated marginal means for land use/cover variable#
contrast(gam_model_emm, method = "pairwise") #Pairwise comparisons of estimated marginal means between land uses/covers#
gam_plot_emm <- plot(gam_model_emm, comparisons = TRUE) #Plot estimated marginal means for each land use/cover#

#NOTE: This is separately done for species richness, functional diversity and functional diversity SES#


##### b) Diversity comparisons between scenarios #####

data <- read.csv("Scenarios_GAM.csv",header = TRUE) #Upload dataframe with taxonomic/functional diversity and "Scenario" variable#
data <- na.omit(data) #Eliminate NAs from data if any#
interval <- floor(nrow(data) / 1000) #Select a subsample from all pixels for following analysis#
sampled_indices <- seq(1, nrow(data), by = interval)
data_samp <- data[sampled_indices, ]

gam_model_sce <- gam(Richness~ Scenario + s(Longitude, Latitude, k=100, bs="gp", m=2),
                     family=poisson(), data=data_samp, method = "REML") #Generalized additive model#
summary(gam_model_sce) #Summary of model results#
appraise(gam_model_sce) #Diagnosis plots of model#

gam_emm <- emmeans(gam_model_sce, ~Scenario) #Calculation of Estimated marginal means#
contrast(gam_emm, method = "pairwise") #Pairwise comparisons of estimated marginal means between scenarios#
plot_emm <- plot(gam_emm, comparisons = TRUE) #Plot estimated marginal means for each scenario#

#NOTE: This is separately done for species richness and functional diversity#


################################################END OF CODE###############################################################