################################################################################
#
#   BIRD-HABITAT MODELLING (PLUMMER ET AL 2020, J APPL ECOL)
#
#   R code used to develop, run and evaluate bird-habitat models presented in
#   the paper
#
#   2020-03-29 Kate E Plummer (kate.plummer@bto.org)
#
################################################################################



## SET UP ======================================================================

## Set working directory for loading data and saving outputs
wd <- getwd()

## Load required packages
library(openxlsx) # for writing/reading .xlsx output files
library(reshape)  # for data manipulation
library(tictoc)   # for measuring time elapsed per species run
library(glmmTMB)  # for mixed modelling
library(arm)      # for 'rescale' standardising modelled covariates
library(AER)      # for 'dispersiontest' measuring model dispersion
library(bbmle)    # for 'AICtab' comparing model error structures
library(caret)    # for 'postResample' x-val performance calcs
library(Rmisc)    # for 'CI' confidence interval calc
library(Hmisc)    # for 'wtd.mean' weighted mean calc
library(weights)  # for 'wtd.t.test' weighted t-test
library(pastecs)  # for descriptive statistics
library(mgcv)     # for general additive modelling
library(devtools) # for reading online code

## Function to assign significance stars to p values
sigstar<- function(x){
  symnum(x, corr = FALSE, na = FALSE,
         cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
         symbols = c("***", "** ", "*  ", ".  ", "   "))
}

sessionInfo()
# R version 3.6.1 (2019-07-05)
# Platform: x86_64-w64-mingw32/x64 (64-bit)
# Running under: Windows 10 x64 (build 18362)
# 
# Matrix products: default
# 
# locale:
# [1] LC_COLLATE=English_United Kingdom.1252 
# [2] LC_CTYPE=English_United Kingdom.1252   
# [3] LC_MONETARY=English_United Kingdom.1252
# [4] LC_NUMERIC=C                           
# [5] LC_TIME=English_United Kingdom.1252    
# 
# attached base packages:
# [1] stats4    stats     graphics  grDevices utils     datasets  methods  
# [8] base     
# 
# other attached packages:
#  [1] devtools_2.2.2  usethis_1.5.1   mgcv_1.8-31     nlme_3.1-145   
#  [5] pastecs_1.3.21  weights_1.0.1   mice_3.8.0      gdata_2.18.0   
#  [9] Hmisc_4.4-0     Formula_1.2-3   Rmisc_1.5       plyr_1.8.6     
# [13] caret_6.0-86    ggplot2_3.3.0   lattice_0.20-40 bbmle_1.0.23.1 
# [17] AER_1.2-9       survival_3.1-11 sandwich_2.5-1  lmtest_0.9-37  
# [21] zoo_1.8-7       car_3.0-7       carData_3.0-3   arm_1.10-1     
# [25] lme4_1.1-21     Matrix_1.2-17   MASS_7.3-51.4   glmmTMB_1.0.1  
# [29] tictoc_1.0      reshape_0.8.8   openxlsx_4.1.4 



## LOAD DATA ===================================================================

## All urban site data, including 34 urban landscape metrics
moddf  <- read.csv(paste0(wd, "/Urban_metrics_per_site.csv"))

## Bird abundance per urban site for 57 species  
birddf <- read.csv(paste0(wd, "/Birds_per_site.csv"))

## Bird species names and traits
specdf <- read.csv(paste0(wd, "/Bird_species_list.csv"))



## SCALE VARIABLES FOR ANALYSIS ================================================

## Species list
spList <- as.character(specdf$species)

## List of 34 landscape metrics 
metList <- c("pc_building", "pc_built", "pc_roads", "pc_rail"
             , "pc_otherman", "pc_grassrough", "pc_grassmanaged", "pc_verge"
             , "pc_natural", "pc_gardens", "pc_scrub"
             , "pc_trees", "pc_woodbroad", "pc_woodconifer", "pc_woodmix"
             , "pc_waterinland", "pc_arable", "den_woods", "den_waterbodies"
             , "sz_buildings", "sz_woods", "sz_waterbodies", "sz_gardens"
             , "nn_buildings", "nn_woods", "nn_waterbodies", "ric_greenspace"
             , "den_greenspace", "sz_greenspace", "buf_urban", "buf_wood"
             , "buf_grass", "buf_coast", "climate")

## Replicate dataframe for variable scaling
rmoddf <- moddf

## Rescale all predictor terms to mean of 0 and SD of 0.5
rmoddf[,metList] <- sapply(rmoddf[,metList], rescale) 

## Log of area for offset
rmoddf$logarea <- log(rmoddf$areakm2)



## CREATE .XLSX WORKBOOK TO STORE ALL MODELLING OUTPUTS ========================

## Name of .xlsx workbook of all outputs from the current run 
filename <- "Modelling outputs"

## Create workbook
options("openxlsx.fontName" = "Calibri")
options("openxlsx.numFmt" = "0.000")
wb <- createWorkbook()
bottom <- length(metList)*length(spList)

## Add tab for error structure results
addWorksheet(wb, "Error structure")
setColWidths(wb, 1, cols= 1:9, widths= c(9,25,11,10,12,9,9,9,9))
freezePane(wb, 1, firstRow= TRUE)
addStyle(wb, 1, rows= 1:length(spList)+1, cols= 4:9
         , style= createStyle(halign="right"), gridExpand=TRUE) 

## Add tab for univariate model outputs
addWorksheet(wb, "Univariate mod table")
setColWidths(wb, 2, cols= 1:10, widths= c(9,25,17,9,9,9,9,9,5,11))
freezePane(wb, 2, firstRow= TRUE)
addStyle(wb, 2, rows= 1:bottom, cols= 3 , style= createStyle(halign="left"))
addStyle(wb, 2, rows= 1:bottom, cols= 4:8
         , style= createStyle(halign="right"), gridExpand=TRUE)

## Add tab for landscape metric summary results
addWorksheet(wb, "Metric summary table")
setColWidths(wb, 3, cols= 1:14, widths= c(17,9,9,9,9,9,9,9,9,9,9,5,9,5))
freezePane(wb, 3, firstRow= TRUE)
addStyle(wb, 3, rows= 1:length(metList)+1, cols= 2:13
         , style= createStyle(halign="right"), gridExpand=TRUE)

## Add tab for predictive model outputs
addWorksheet(wb, "Predictive mod table")
setColWidths(wb, 4, cols= 1:7, widths= c(9,25,17,9,9,9,9))
freezePane(wb, 4, firstRow= TRUE)
addStyle(wb, 4, rows= 1:bottom, cols= 3 , style= createStyle(halign="left"))
addStyle(wb, 2, rows= 1:bottom, cols= 4:7
         , style= createStyle(halign="right"), gridExpand=TRUE)

## Add tab for cross-validation results
addWorksheet(wb, "X-validation table")
setColWidths(wb, 5, cols= 1:8, widths= c(9,25,9,9,9,9,9,9))
freezePane(wb, 5, firstRow= TRUE)
addStyle(wb, 5, rows= 1:length(spList)+1, cols= 3:8
         , style= createStyle(halign="right"), gridExpand=TRUE)



## CREATE LOOP TO CYCLE THROUGH EVERY SPECIES ==================================
## =============================================================================

## Number of bird species (n = 57)
n <- length(spList) 

## Start timer for running all species
tic(paste0("Total runtime"))

## Open loop
for (i in 1:n) {
  
  ## Species of interest
  sp <- spList[i]
  
  ## Start timer
  tic(paste0("Runtime for species #", i, " of ", n))
  
  ## Text update
  cat("\n-------------------------------------------------------------------------------------\n"
      , paste0("*** Running urban landscape model for species #", i
               , " -- ", sp, " (", specdf$sp_name[i], ")"), "\n\n")
  
## =============================================================================
## =============================================================================

  
  
## SPECIES i DATASET FOR MODELLING =============================================
  
## Merge sp i data with dataset for analysis:
spdat <- birddf[,c("sid", sp)]
SPi   <- merge(rmoddf, spdat, by= "sid")


  
## DETERMINE APPROPRIATE ERROR STRUCTURE =======================================
  
# Identify most appropriate eror structure accounting for over-dispersion
# and/or zero inflation by comparing fit of Poisson, zero-inflated Poisson,
# negative binomial and zero-inflated negative binomial

## Update text
cat("* RUNNING 4 GLOBAL MODELS TO IDENTIFY THE BEST ERROR STRUCTURE\n")

  
## Global model formula
global <- paste(sp, "~"                          # selected species
                , paste(metList, collapse=" + ") # all landscape metrics
                , "+ offset(logarea)"            # offset for densities
                , "+ (1|grid100)")               # 100km-square random effect
global <- as.formula(global)  

## Test for overdispersion using glm of global and AER::dispersiontest
global_norand <- update(global, . ~ . -(1|grid100))  
glmtest       <- glm(global_norand, family=poisson, data= SPi)   
dispersion    <- dispersiontest(glmtest)$estimate
dispersion_p  <- dispersiontest(glmtest)$p                                                                       

## Compare Poisson, NegBinomial, ZIP, ZINB error structures for global model 
errstructList   <- c("pois", "zipois", "nbin", "zinbin")
gmodList        <- vector("list", length(errstructList)) 
names(gmodList) <- errstructList 

cat("Poisson...")
gmodList[["pois"]]   <- glmmTMB(global, family=poisson, zi=~0, data= SPi)
cat("negative binomial...")
gmodList[["zipois"]] <- glmmTMB(global, family=poisson, zi=~1, data= SPi)
cat("ZI Poisson...")
gmodList[["nbin"]]   <- glmmTMB(global, family=nbinom2, zi=~0, data= SPi)
cat("ZI negative binomial...")
gmodList[["zinbin"]] <- glmmTMB(global, family=nbinom2, zi=~1, data= SPi)

## dAIC model comparison table using bbmle::AICtab 
aiccomp <- AICtab(gmodList)

## Top model, i.e. error structure to use in all further modelling
top  <- attr(aiccomp, "row.names")[1]
gmod <- gmodList[[top]]

## Reformat dAIC model comparison results
d <- merge(data.frame(errstruct= errstructList)
           , data.frame(errstruct=attr(aiccomp, "row.names")
                        , dAIC= aiccomp$dAIC)
           , by="errstruct")
d$species <- sp

## Combine all error structure comparison results to export
errstats <- cast(species ~ errstruct, value= "dAIC", data= d)
errstats$errstruct    <- top
errstats$dispersion   <- dispersion
errstats$dispersion_p <- dispersion_p
errstats$sp_name      <- specdf$sp_name[i]

## Standardise error results table order
errstats <- errstats[,c("species", "sp_name", "errstruct"
                        , "dispersion", "dispersion_p", errstructList)]

# ## Print results
# print(errstats[,3:9], digits=4, row.names= FALSE)

## Add results to dataframe for all species
if("errdf" %in% ls() == TRUE)  { errdf <- rbind(errdf, errstats) } 
if("errdf" %in% ls() == FALSE) { errdf <- errstats }

## Update data in .xlsx wookbook and save
writeData(wb, 1, errdf)
saveWorkbook(wb, paste0(wd, "/", filename, ".xlsx"), TRUE)

## Text update
cat("\n\noutput saved\n\n")



## USE ERROR STRUCTURE IDENTIFIED TO FIT UNIVARIATE MODELS =====================

# Fit 34 univariate models for each of the different landscape metrics
# to determine their influence on species-specific abundance

## Update text
cat("* RUNNING 34 LANDSCAPE METRIC UNIVARIATE MODELS\n")

## Create list to store all univarate model outputs
uniMods        <- vector("list", length(metList)) 
names(uniMods) <- metList 
nmet           <- length(metList)

## Univariate model formuli
formuli <- list()
for (j in 1:nmet) {
  uvmod <- paste(sp, "~", metList[j], "+ offset(logarea) + (1|grid100)")
  formuli[[j]] <- as.formula(uvmod)
} 

## Fit all univariate models using the top global model error structure
for( j in 1:nmet) {
  cat(paste0(j, "..."))
  uniMods[[j]] <- update(gmod, formuli[[j]])
}

## List of model coefficient tables per model
uniMods_coefs <- lapply(uniMods, function(x) summary(x)$coefficients[[1]])

## Extract univariate model results of interest for export
unistats <- data.frame(
  species= sp
  , sp_name= specdf$sp_name[i]
  , metric = sapply(uniMods_coefs, function(x) rownames(x)[-1])
  , AIC    = sapply(uniMods, AIC)
  , est    = sapply(uniMods_coefs, function(x) x[2,1])
  , sterr  = sapply(uniMods_coefs, function(x) x[2,2])
  , zvalue = sapply(uniMods_coefs, function(x) x[2,3])
  , p      = sapply(uniMods_coefs, function(x) x[2,4])
)
unistats$sig       <- sigstar(unistats$p)
unistats$direction <- factor(ifelse(unistats$est>0, "pos", "neg"))

## Add results to dataframe for all species
if("unidf" %in% ls() == TRUE)  { unidf <- rbind(unidf, unistats) } 
if("unidf" %in% ls() == FALSE) { unidf <- unistats }

## Update data in .xlsx wookbook and save
writeData(wb, 2, unidf)
saveWorkbook(wb, paste0(wd, "/", filename, ".xlsx"), TRUE)

## Text update
cat("\n\noutput saved\n\n")



## FIT PREDICTIVE MODEL WITH ALL SIGNIFICANT METRICS ===========================

# Fit species predictive model using all landscape metrics which have a 
# significant (p < 0.05) influence

## Update text
cat("* FITTING SPECIES PREDICTIVE MODEL\n")


## List of landscape metrics with a significant effect
sigList <- as.character(unistats$metric[unistats$p <=0.05])

## Formula for predictive model
predFormula <- paste(sp, "~"                     # selected species
                , paste(sigList, collapse=" + ") # significant landscape metrics
                , "+ offset(logarea)"            # offset for densities
                , "+ (1|grid100)")               # 100km-square random effect
predFormula <- as.formula(predFormula) 


## Fit predictive model using the top global model error structure
pmod <- update(gmod, predFormula)

## Predictive model coefficient table
coeftab <- summary(pmod)$coefficients[[1]]

## Extract predictive model results for export
pmodstats <- data.frame(
  species= sp
  , sp_name= specdf$sp_name[i]
  , metric = c("INTERCEPT", row.names(coeftab)[-1])
  , est    = coeftab[,1]
  , sterr  = coeftab[,2]
  , zvalue = coeftab[,3]
  , p      = coeftab[,4]
)

## Add results to dataframe for all species
if("pmoddf" %in% ls() == TRUE)  { pmoddf <- rbind(pmoddf, pmodstats) } 
if("pmoddf" %in% ls() == FALSE) { pmoddf <- pmodstats }

## Update data in .xlsx wookbook and save
writeData(wb, 4, pmoddf)
saveWorkbook(wb, paste0(wd, "/", filename, ".xlsx"), TRUE)

## Text update
cat("\noutput saved\n\n")



## BLOCK 10-FOLD CROSS VALIDATION ==============================================

# Use block 10-fold cross validation (following Roberts et al 2017) to assess
# model predictive ability while accounting for spatial dependencies

## Update text
cat("* RUNNING 10-FOLD CROSS VALIDATION\n")

## Dataframe to collect results per fold
xvals <- data.frame(xval_fold= 1:10, rs= NA, MAE= NA)

## Perform 10 fold cross validation
cat("Fold ")
for(k in 1:10){
  cat(paste0(k, "..."))
  ## Split data by fold 
  testData    <- droplevels(subset(SPi, xval_fold==k))
  trainData   <- droplevels(subset(SPi, xval_fold!=k))
  
  ## Model training data
  mod <- update(pmod, data=trainData)
  
  ## Predict values for test data
  preds <-predict(mod, newdata= testData, allow.new.levels= T, type= "response")
  
  ## Model performance measures using observed v predicted
  obs          <- testData[,sp]
  xvals$rs[k]  <- cor(obs, preds, method="spearman")
  xvals$MAE[k] <- postResample(obs, preds)[3]      

}  

## Calculate and store model performance measures using averages across folds
xvalstats <- data.frame(
  species= sp
  , sp_name= specdf$sp_name[i]
  , rs=     mean(na.omit(xvals$rs))
  , rs_lci= CI(na.omit(xvals$rs))[["lower"]]
  , rs_uci= CI(na.omit(xvals$rs))[["upper"]]
  , MAE=     mean(na.omit(xvals$MAE))
  , MAE_lci= CI(na.omit(xvals$MAE))[["lower"]]
  , MAE_uci= CI(na.omit(xvals$MAE))[["upper"]]
)
  
## Add results to dataframe for all species
if("xvaldf" %in% ls() == TRUE)  { xvaldf <- rbind(xvaldf, xvalstats) } 
if("xvaldf" %in% ls() == FALSE) { xvaldf <- xvalstats }

## Update data in .xlsx wookbook and save
writeData(wb, 5, xvaldf)
saveWorkbook(wb, paste0(wd, "/", filename, ".xlsx"), TRUE)

## Text update
cat("\n\noutput saved\n\n")



## CLEAR SPECIES-SPECIFIC DATA FROM ENVIRONMENT ================================

## Data to keep loaded
keep <- c("wd", "sigstar"
          ,"birddf", "specdf", "rmoddf"
          , "wb", "filename"
          , "spList", "metList", "n"
          , "errdf", "unidf", "pmoddf", "xvaldf")

## Remove everything except the data and functions to keep
rm(list=setdiff(ls(), keep))

## Clear environment
gc()
toc()


## CLOSE LOOP ==================================================================
## =============================================================================

}
toc()

## =============================================================================
## =============================================================================



## DATA SUMMARIES AND STATS PER METRIC ACROSS ALL SPECIES ======================

## Read in results table from univariate modelling of all species
#unidf <- read.xlsx(paste0(wd, "/", filename, ".xlsx"), 2)
#unidf$metric <- factor(unidf$metric, levels=metList)

## Patterns of response to each metric (for results and FIG2)
metsumdf <- data.frame(
  metric = metList
  , neg_tot   = with(unidf, table(metric, direction))[,"neg"]
  , neg_sig   = with(subset(unidf, p<=0.05), table(metric, direction))[,"neg"]
  , neg_pcsig = with(subset(unidf, p<=0.05)
                     , table(metric, direction))[,"neg"]/57*100
  , pos_tot   = with(unidf, table(metric, direction))[,"pos"]
  , pos_sig   = with(subset(unidf, p<=0.05), table(metric, direction))[,"pos"]
  , pos_pcsig = with(subset(unidf, p<=0.05)
                     , table(metric, direction))[,"pos"]/57*100
)

## Inverse variance weights
unidf$wt <- 1/sqrt(unidf$sterr)

## List of species results for each metric
estList <- split(unidf[,c("metric", "est", "wt")], f = unidf$metric)

## Functions for calculating weighted sterr and 95% ci
wtd.se <- function(est, wt) {
  sqrt(wtd.var(est, wt)/length(na.omit(est)))  }
wtd.ci <- function(est, wt) {
  wtd.se(est, wt) * qt(0.975,length(x)-1)  }

## Weighted means and error per metric
wtdf <- data.frame(
  metric = metList
  , wtmean = sapply(estList, function(x) wtd.mean(x$est, x$wt))
  , wtse   = sapply(estList, function(x) wtd.se(x$est, x$wt))
  , wtci   = sapply(estList, function(x) wtd.ci(x$est, x$wt))
)
  
## Weighted one-sample t-test per metric
metstats <- lapply(estList, function(x) wtd.t.test(x$est, 0, x$wt)$coefficients)
metstats <- data.frame(do.call("rbind", metstats))
names(metstats) <- c("tstatistic", "df", "p")
metstats$sig    <- sigstar(metstats$p)

## Combine all metric stats
wtdf <- cbind(wtdf, metstats)

## Combine tables for pattern of responses and inv var weighting statistics
metdf <- cbind(metsumdf, wtdf[-1])

## Add data into .xlsx wookbook and save
#wb <- loadWorkbook(paste0(wd, "/", filename, ".xlsx"))
writeData(wb, 3, metdf)
addStyle(wb, 3, rows= 1:length(metList), cols= c(2:7,12)
         , style= createStyle(numFmt= "0"), gridExpand=TRUE, stack=T)

saveWorkbook(wb, paste0(wd, "/", filename, ".xlsx"), TRUE)



## EXAMINING VARIATION IN MODEL PREDICTIVE ABILITY =============================

## Summaries across species
# filename <- "Modelling outputs"
# xvaldf  <- read.xlsx(paste0(wd, "/", filename, ".xlsx")
#                       , sheet= 5)
# pmoddf  <- read.xlsx(paste0(wd, "/", filename, ".xlsx")
#                      , sheet= 4)

## rs descriptive statistics
round(stat.desc(xvaldf$rs, desc=FALSE), 3)
round(stat.desc(xvaldf$rs, desc=FALSE), 3)

## Predictive model outcomes for outlying species with CIs crossing zero
subset(xvaldf, MAE_lci<0)
spout   <- xvaldf$species[xvaldf$MAE_lci<0]
spoutdf <- subset(pmoddf, species %in% spout)
subset(xvaldf, species %in% spout)
spoutdf[order(abs(spoutdf$est), decreasing=T),]
 # Four waterbirds with high effect sizes for water variables
 # (pc_waterinland, nn_waterbodies)

## Preditive model accuracy for all other species
apply(subset(xvaldf, MAE_lci>0, select= MAE), 2
      , function(x) {c(mean= mean(x), se= se(x))})



## EFFECT OF SPECIES HABITAT PREFERENCE ON MODEL PREDICTIVE ABILITY ============

# Model predictive ability (rs) in response to species habitat preference 

## Add explanatory variables for testing to predictive ability data
xvaldf <- merge(xvaldf, specdf[,c("species", "dom_habitat", "site_freq")]
                , by= "species")

## Examine relationship before modelling
plot(rs ~ dom_habitat, data=xvaldf)

## Fit linear model
mod1 <- lm(rs ~ dom_habitat, data=xvaldf)

## Model outcome
summary(mod1)
anova(mod1)



## EFFECT OF SPECIES RELATIVE SCARCITY ON MODEL PREDICTIVE ABILITY =============

# Model predictive ability (rs) in response to relative scarcity (frequency of 
# occurence in urban sites)  

## Examine relationship before modelling
plot(rs ~ site_freq, data=xvaldf)
  # Indicates a non-linear relationship, so will use a GAM for modelling

# Fit GAM
mod2 <- gam(rs ~ s(site_freq, k=10), data=xvaldf, gamma= 1.4) 

# Model outcome
summary(mod2)

## Use first derivative of the fitted trend to determine the range of values 
## over which site_freq is having a significant effect 

## Load functions to produce derivatives via finite differences, sourced from  
## Gavin Simpson's github for example application and further explanation, see:
## https://www.fromthebottomoftheheap.net/2011/06/12/additive-modelling-and-the-hadcrut3v-global-mean-temperature-series/
source_url("https://raw.githubusercontent.com/gavinsimpson/random_code/master/derivFun.R")

# Predictions for y across the full range of x values
preds <- predict(mod2, data.frame(site_freq= 1:482), se.fit=T)

## Calculate first derivative and 95% CI
mod2.d    <- Deriv(mod2, n=482)
mod2.dci  <- confint(mod2.d)

## Significance is assigned where zero is not within the confidence range
mod2.dsig <- signifD(preds$fit
                     , d = mod2.d$site_freq$deriv
                     , upper = mod2.dci$site_freq$upper
                     , lower = mod2.dci$site_freq$lower
                     )
range(which(!is.na(mod2.dsig$incr))) 
range(which(!is.na(mod2.dsig$decr))) 

## Plot first derivatives with 95% ci and with areas of significance highlighted
plot(mod2.d, sizer = TRUE, alpha = 0.05)

## Plot relationship between species rs and site_freq values 
plot(rs~site_freq, data= xvaldf
     , xlab= "Species frequency of occurence", ylab= "Mean rs (95% CI)")

## Add modelled non-linear relationship and area of significant
x   <- 1:482
y   <- preds$fit
yci <- preds$se.fit
lines(x, y)
lines(x, mod2.dsig$incr, lwd = 3, col="red")
lines(x, y + yci, lty=2)  
lines(x, y - yci, lty=2)  



## OPEN COMPLETE .xlsx RESULTS FILE ============================================

openXL(paste0(wd, "/", filename, ".xlsx"))



## END =========================================================================