################################################################################ # # BIRD-HABITAT MODELLING (PLUMMER ET AL 2020, J APPL ECOL) # # R code used to develop, run and evaluate bird-habitat models presented in # the paper # # 2020-03-29 Kate E Plummer (kate.plummer@bto.org) # ################################################################################ ## SET UP ====================================================================== ## Set working directory for loading data and saving outputs wd <- getwd() ## Load required packages library(openxlsx) # for writing/reading .xlsx output files library(reshape) # for data manipulation library(tictoc) # for measuring time elapsed per species run library(glmmTMB) # for mixed modelling library(arm) # for 'rescale' standardising modelled covariates library(AER) # for 'dispersiontest' measuring model dispersion library(bbmle) # for 'AICtab' comparing model error structures library(caret) # for 'postResample' x-val performance calcs library(Rmisc) # for 'CI' confidence interval calc library(Hmisc) # for 'wtd.mean' weighted mean calc library(weights) # for 'wtd.t.test' weighted t-test library(pastecs) # for descriptive statistics library(mgcv) # for general additive modelling library(devtools) # for reading online code ## Function to assign significance stars to p values sigstar<- function(x){ symnum(x, corr = FALSE, na = FALSE, cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1), symbols = c("***", "** ", "* ", ". ", " ")) } sessionInfo() # R version 3.6.1 (2019-07-05) # Platform: x86_64-w64-mingw32/x64 (64-bit) # Running under: Windows 10 x64 (build 18362) # # Matrix products: default # # locale: # [1] LC_COLLATE=English_United Kingdom.1252 # [2] LC_CTYPE=English_United Kingdom.1252 # [3] LC_MONETARY=English_United Kingdom.1252 # [4] LC_NUMERIC=C # [5] LC_TIME=English_United Kingdom.1252 # # attached base packages: # [1] stats4 stats graphics grDevices utils datasets methods # [8] base # # other attached packages: # [1] devtools_2.2.2 usethis_1.5.1 mgcv_1.8-31 nlme_3.1-145 # [5] pastecs_1.3.21 weights_1.0.1 mice_3.8.0 gdata_2.18.0 # [9] Hmisc_4.4-0 Formula_1.2-3 Rmisc_1.5 plyr_1.8.6 # [13] caret_6.0-86 ggplot2_3.3.0 lattice_0.20-40 bbmle_1.0.23.1 # [17] AER_1.2-9 survival_3.1-11 sandwich_2.5-1 lmtest_0.9-37 # [21] zoo_1.8-7 car_3.0-7 carData_3.0-3 arm_1.10-1 # [25] lme4_1.1-21 Matrix_1.2-17 MASS_7.3-51.4 glmmTMB_1.0.1 # [29] tictoc_1.0 reshape_0.8.8 openxlsx_4.1.4 ## LOAD DATA =================================================================== ## All urban site data, including 34 urban landscape metrics moddf <- read.csv(paste0(wd, "/Urban_metrics_per_site.csv")) ## Bird abundance per urban site for 57 species birddf <- read.csv(paste0(wd, "/Birds_per_site.csv")) ## Bird species names and traits specdf <- read.csv(paste0(wd, "/Bird_species_list.csv")) ## SCALE VARIABLES FOR ANALYSIS ================================================ ## Species list spList <- as.character(specdf$species) ## List of 34 landscape metrics metList <- c("pc_building", "pc_built", "pc_roads", "pc_rail" , "pc_otherman", "pc_grassrough", "pc_grassmanaged", "pc_verge" , "pc_natural", "pc_gardens", "pc_scrub" , "pc_trees", "pc_woodbroad", "pc_woodconifer", "pc_woodmix" , "pc_waterinland", "pc_arable", "den_woods", "den_waterbodies" , "sz_buildings", "sz_woods", "sz_waterbodies", "sz_gardens" , "nn_buildings", "nn_woods", "nn_waterbodies", "ric_greenspace" , "den_greenspace", "sz_greenspace", "buf_urban", "buf_wood" , "buf_grass", "buf_coast", "climate") ## Replicate dataframe for variable scaling rmoddf <- moddf ## Rescale all predictor terms to mean of 0 and SD of 0.5 rmoddf[,metList] <- sapply(rmoddf[,metList], rescale) ## Log of area for offset rmoddf$logarea <- log(rmoddf$areakm2) ## CREATE .XLSX WORKBOOK TO STORE ALL MODELLING OUTPUTS ======================== ## Name of .xlsx workbook of all outputs from the current run filename <- "Modelling outputs" ## Create workbook options("openxlsx.fontName" = "Calibri") options("openxlsx.numFmt" = "0.000") wb <- createWorkbook() bottom <- length(metList)*length(spList) ## Add tab for error structure results addWorksheet(wb, "Error structure") setColWidths(wb, 1, cols= 1:9, widths= c(9,25,11,10,12,9,9,9,9)) freezePane(wb, 1, firstRow= TRUE) addStyle(wb, 1, rows= 1:length(spList)+1, cols= 4:9 , style= createStyle(halign="right"), gridExpand=TRUE) ## Add tab for univariate model outputs addWorksheet(wb, "Univariate mod table") setColWidths(wb, 2, cols= 1:10, widths= c(9,25,17,9,9,9,9,9,5,11)) freezePane(wb, 2, firstRow= TRUE) addStyle(wb, 2, rows= 1:bottom, cols= 3 , style= createStyle(halign="left")) addStyle(wb, 2, rows= 1:bottom, cols= 4:8 , style= createStyle(halign="right"), gridExpand=TRUE) ## Add tab for landscape metric summary results addWorksheet(wb, "Metric summary table") setColWidths(wb, 3, cols= 1:14, widths= c(17,9,9,9,9,9,9,9,9,9,9,5,9,5)) freezePane(wb, 3, firstRow= TRUE) addStyle(wb, 3, rows= 1:length(metList)+1, cols= 2:13 , style= createStyle(halign="right"), gridExpand=TRUE) ## Add tab for predictive model outputs addWorksheet(wb, "Predictive mod table") setColWidths(wb, 4, cols= 1:7, widths= c(9,25,17,9,9,9,9)) freezePane(wb, 4, firstRow= TRUE) addStyle(wb, 4, rows= 1:bottom, cols= 3 , style= createStyle(halign="left")) addStyle(wb, 2, rows= 1:bottom, cols= 4:7 , style= createStyle(halign="right"), gridExpand=TRUE) ## Add tab for cross-validation results addWorksheet(wb, "X-validation table") setColWidths(wb, 5, cols= 1:8, widths= c(9,25,9,9,9,9,9,9)) freezePane(wb, 5, firstRow= TRUE) addStyle(wb, 5, rows= 1:length(spList)+1, cols= 3:8 , style= createStyle(halign="right"), gridExpand=TRUE) ## CREATE LOOP TO CYCLE THROUGH EVERY SPECIES ================================== ## ============================================================================= ## Number of bird species (n = 57) n <- length(spList) ## Start timer for running all species tic(paste0("Total runtime")) ## Open loop for (i in 1:n) { ## Species of interest sp <- spList[i] ## Start timer tic(paste0("Runtime for species #", i, " of ", n)) ## Text update cat("\n-------------------------------------------------------------------------------------\n" , paste0("*** Running urban landscape model for species #", i , " -- ", sp, " (", specdf$sp_name[i], ")"), "\n\n") ## ============================================================================= ## ============================================================================= ## SPECIES i DATASET FOR MODELLING ============================================= ## Merge sp i data with dataset for analysis: spdat <- birddf[,c("sid", sp)] SPi <- merge(rmoddf, spdat, by= "sid") ## DETERMINE APPROPRIATE ERROR STRUCTURE ======================================= # Identify most appropriate eror structure accounting for over-dispersion # and/or zero inflation by comparing fit of Poisson, zero-inflated Poisson, # negative binomial and zero-inflated negative binomial ## Update text cat("* RUNNING 4 GLOBAL MODELS TO IDENTIFY THE BEST ERROR STRUCTURE\n") ## Global model formula global <- paste(sp, "~" # selected species , paste(metList, collapse=" + ") # all landscape metrics , "+ offset(logarea)" # offset for densities , "+ (1|grid100)") # 100km-square random effect global <- as.formula(global) ## Test for overdispersion using glm of global and AER::dispersiontest global_norand <- update(global, . ~ . -(1|grid100)) glmtest <- glm(global_norand, family=poisson, data= SPi) dispersion <- dispersiontest(glmtest)$estimate dispersion_p <- dispersiontest(glmtest)$p ## Compare Poisson, NegBinomial, ZIP, ZINB error structures for global model errstructList <- c("pois", "zipois", "nbin", "zinbin") gmodList <- vector("list", length(errstructList)) names(gmodList) <- errstructList cat("Poisson...") gmodList[["pois"]] <- glmmTMB(global, family=poisson, zi=~0, data= SPi) cat("negative binomial...") gmodList[["zipois"]] <- glmmTMB(global, family=poisson, zi=~1, data= SPi) cat("ZI Poisson...") gmodList[["nbin"]] <- glmmTMB(global, family=nbinom2, zi=~0, data= SPi) cat("ZI negative binomial...") gmodList[["zinbin"]] <- glmmTMB(global, family=nbinom2, zi=~1, data= SPi) ## dAIC model comparison table using bbmle::AICtab aiccomp <- AICtab(gmodList) ## Top model, i.e. error structure to use in all further modelling top <- attr(aiccomp, "row.names")[1] gmod <- gmodList[[top]] ## Reformat dAIC model comparison results d <- merge(data.frame(errstruct= errstructList) , data.frame(errstruct=attr(aiccomp, "row.names") , dAIC= aiccomp$dAIC) , by="errstruct") d$species <- sp ## Combine all error structure comparison results to export errstats <- cast(species ~ errstruct, value= "dAIC", data= d) errstats$errstruct <- top errstats$dispersion <- dispersion errstats$dispersion_p <- dispersion_p errstats$sp_name <- specdf$sp_name[i] ## Standardise error results table order errstats <- errstats[,c("species", "sp_name", "errstruct" , "dispersion", "dispersion_p", errstructList)] # ## Print results # print(errstats[,3:9], digits=4, row.names= FALSE) ## Add results to dataframe for all species if("errdf" %in% ls() == TRUE) { errdf <- rbind(errdf, errstats) } if("errdf" %in% ls() == FALSE) { errdf <- errstats } ## Update data in .xlsx wookbook and save writeData(wb, 1, errdf) saveWorkbook(wb, paste0(wd, "/", filename, ".xlsx"), TRUE) ## Text update cat("\n\noutput saved\n\n") ## USE ERROR STRUCTURE IDENTIFIED TO FIT UNIVARIATE MODELS ===================== # Fit 34 univariate models for each of the different landscape metrics # to determine their influence on species-specific abundance ## Update text cat("* RUNNING 34 LANDSCAPE METRIC UNIVARIATE MODELS\n") ## Create list to store all univarate model outputs uniMods <- vector("list", length(metList)) names(uniMods) <- metList nmet <- length(metList) ## Univariate model formuli formuli <- list() for (j in 1:nmet) { uvmod <- paste(sp, "~", metList[j], "+ offset(logarea) + (1|grid100)") formuli[[j]] <- as.formula(uvmod) } ## Fit all univariate models using the top global model error structure for( j in 1:nmet) { cat(paste0(j, "...")) uniMods[[j]] <- update(gmod, formuli[[j]]) } ## List of model coefficient tables per model uniMods_coefs <- lapply(uniMods, function(x) summary(x)$coefficients[[1]]) ## Extract univariate model results of interest for export unistats <- data.frame( species= sp , sp_name= specdf$sp_name[i] , metric = sapply(uniMods_coefs, function(x) rownames(x)[-1]) , AIC = sapply(uniMods, AIC) , est = sapply(uniMods_coefs, function(x) x[2,1]) , sterr = sapply(uniMods_coefs, function(x) x[2,2]) , zvalue = sapply(uniMods_coefs, function(x) x[2,3]) , p = sapply(uniMods_coefs, function(x) x[2,4]) ) unistats$sig <- sigstar(unistats$p) unistats$direction <- factor(ifelse(unistats$est>0, "pos", "neg")) ## Add results to dataframe for all species if("unidf" %in% ls() == TRUE) { unidf <- rbind(unidf, unistats) } if("unidf" %in% ls() == FALSE) { unidf <- unistats } ## Update data in .xlsx wookbook and save writeData(wb, 2, unidf) saveWorkbook(wb, paste0(wd, "/", filename, ".xlsx"), TRUE) ## Text update cat("\n\noutput saved\n\n") ## FIT PREDICTIVE MODEL WITH ALL SIGNIFICANT METRICS =========================== # Fit species predictive model using all landscape metrics which have a # significant (p < 0.05) influence ## Update text cat("* FITTING SPECIES PREDICTIVE MODEL\n") ## List of landscape metrics with a significant effect sigList <- as.character(unistats$metric[unistats$p <=0.05]) ## Formula for predictive model predFormula <- paste(sp, "~" # selected species , paste(sigList, collapse=" + ") # significant landscape metrics , "+ offset(logarea)" # offset for densities , "+ (1|grid100)") # 100km-square random effect predFormula <- as.formula(predFormula) ## Fit predictive model using the top global model error structure pmod <- update(gmod, predFormula) ## Predictive model coefficient table coeftab <- summary(pmod)$coefficients[[1]] ## Extract predictive model results for export pmodstats <- data.frame( species= sp , sp_name= specdf$sp_name[i] , metric = c("INTERCEPT", row.names(coeftab)[-1]) , est = coeftab[,1] , sterr = coeftab[,2] , zvalue = coeftab[,3] , p = coeftab[,4] ) ## Add results to dataframe for all species if("pmoddf" %in% ls() == TRUE) { pmoddf <- rbind(pmoddf, pmodstats) } if("pmoddf" %in% ls() == FALSE) { pmoddf <- pmodstats } ## Update data in .xlsx wookbook and save writeData(wb, 4, pmoddf) saveWorkbook(wb, paste0(wd, "/", filename, ".xlsx"), TRUE) ## Text update cat("\noutput saved\n\n") ## BLOCK 10-FOLD CROSS VALIDATION ============================================== # Use block 10-fold cross validation (following Roberts et al 2017) to assess # model predictive ability while accounting for spatial dependencies ## Update text cat("* RUNNING 10-FOLD CROSS VALIDATION\n") ## Dataframe to collect results per fold xvals <- data.frame(xval_fold= 1:10, rs= NA, MAE= NA) ## Perform 10 fold cross validation cat("Fold ") for(k in 1:10){ cat(paste0(k, "...")) ## Split data by fold testData <- droplevels(subset(SPi, xval_fold==k)) trainData <- droplevels(subset(SPi, xval_fold!=k)) ## Model training data mod <- update(pmod, data=trainData) ## Predict values for test data preds <-predict(mod, newdata= testData, allow.new.levels= T, type= "response") ## Model performance measures using observed v predicted obs <- testData[,sp] xvals$rs[k] <- cor(obs, preds, method="spearman") xvals$MAE[k] <- postResample(obs, preds)[3] } ## Calculate and store model performance measures using averages across folds xvalstats <- data.frame( species= sp , sp_name= specdf$sp_name[i] , rs= mean(na.omit(xvals$rs)) , rs_lci= CI(na.omit(xvals$rs))[["lower"]] , rs_uci= CI(na.omit(xvals$rs))[["upper"]] , MAE= mean(na.omit(xvals$MAE)) , MAE_lci= CI(na.omit(xvals$MAE))[["lower"]] , MAE_uci= CI(na.omit(xvals$MAE))[["upper"]] ) ## Add results to dataframe for all species if("xvaldf" %in% ls() == TRUE) { xvaldf <- rbind(xvaldf, xvalstats) } if("xvaldf" %in% ls() == FALSE) { xvaldf <- xvalstats } ## Update data in .xlsx wookbook and save writeData(wb, 5, xvaldf) saveWorkbook(wb, paste0(wd, "/", filename, ".xlsx"), TRUE) ## Text update cat("\n\noutput saved\n\n") ## CLEAR SPECIES-SPECIFIC DATA FROM ENVIRONMENT ================================ ## Data to keep loaded keep <- c("wd", "sigstar" ,"birddf", "specdf", "rmoddf" , "wb", "filename" , "spList", "metList", "n" , "errdf", "unidf", "pmoddf", "xvaldf") ## Remove everything except the data and functions to keep rm(list=setdiff(ls(), keep)) ## Clear environment gc() toc() ## CLOSE LOOP ================================================================== ## ============================================================================= } toc() ## ============================================================================= ## ============================================================================= ## DATA SUMMARIES AND STATS PER METRIC ACROSS ALL SPECIES ====================== ## Read in results table from univariate modelling of all species #unidf <- read.xlsx(paste0(wd, "/", filename, ".xlsx"), 2) #unidf$metric <- factor(unidf$metric, levels=metList) ## Patterns of response to each metric (for results and FIG2) metsumdf <- data.frame( metric = metList , neg_tot = with(unidf, table(metric, direction))[,"neg"] , neg_sig = with(subset(unidf, p<=0.05), table(metric, direction))[,"neg"] , neg_pcsig = with(subset(unidf, p<=0.05) , table(metric, direction))[,"neg"]/57*100 , pos_tot = with(unidf, table(metric, direction))[,"pos"] , pos_sig = with(subset(unidf, p<=0.05), table(metric, direction))[,"pos"] , pos_pcsig = with(subset(unidf, p<=0.05) , table(metric, direction))[,"pos"]/57*100 ) ## Inverse variance weights unidf$wt <- 1/sqrt(unidf$sterr) ## List of species results for each metric estList <- split(unidf[,c("metric", "est", "wt")], f = unidf$metric) ## Functions for calculating weighted sterr and 95% ci wtd.se <- function(est, wt) { sqrt(wtd.var(est, wt)/length(na.omit(est))) } wtd.ci <- function(est, wt) { wtd.se(est, wt) * qt(0.975,length(x)-1) } ## Weighted means and error per metric wtdf <- data.frame( metric = metList , wtmean = sapply(estList, function(x) wtd.mean(x$est, x$wt)) , wtse = sapply(estList, function(x) wtd.se(x$est, x$wt)) , wtci = sapply(estList, function(x) wtd.ci(x$est, x$wt)) ) ## Weighted one-sample t-test per metric metstats <- lapply(estList, function(x) wtd.t.test(x$est, 0, x$wt)$coefficients) metstats <- data.frame(do.call("rbind", metstats)) names(metstats) <- c("tstatistic", "df", "p") metstats$sig <- sigstar(metstats$p) ## Combine all metric stats wtdf <- cbind(wtdf, metstats) ## Combine tables for pattern of responses and inv var weighting statistics metdf <- cbind(metsumdf, wtdf[-1]) ## Add data into .xlsx wookbook and save #wb <- loadWorkbook(paste0(wd, "/", filename, ".xlsx")) writeData(wb, 3, metdf) addStyle(wb, 3, rows= 1:length(metList), cols= c(2:7,12) , style= createStyle(numFmt= "0"), gridExpand=TRUE, stack=T) saveWorkbook(wb, paste0(wd, "/", filename, ".xlsx"), TRUE) ## EXAMINING VARIATION IN MODEL PREDICTIVE ABILITY ============================= ## Summaries across species # filename <- "Modelling outputs" # xvaldf <- read.xlsx(paste0(wd, "/", filename, ".xlsx") # , sheet= 5) # pmoddf <- read.xlsx(paste0(wd, "/", filename, ".xlsx") # , sheet= 4) ## rs descriptive statistics round(stat.desc(xvaldf$rs, desc=FALSE), 3) round(stat.desc(xvaldf$rs, desc=FALSE), 3) ## Predictive model outcomes for outlying species with CIs crossing zero subset(xvaldf, MAE_lci<0) spout <- xvaldf$species[xvaldf$MAE_lci<0] spoutdf <- subset(pmoddf, species %in% spout) subset(xvaldf, species %in% spout) spoutdf[order(abs(spoutdf$est), decreasing=T),] # Four waterbirds with high effect sizes for water variables # (pc_waterinland, nn_waterbodies) ## Preditive model accuracy for all other species apply(subset(xvaldf, MAE_lci>0, select= MAE), 2 , function(x) {c(mean= mean(x), se= se(x))}) ## EFFECT OF SPECIES HABITAT PREFERENCE ON MODEL PREDICTIVE ABILITY ============ # Model predictive ability (rs) in response to species habitat preference ## Add explanatory variables for testing to predictive ability data xvaldf <- merge(xvaldf, specdf[,c("species", "dom_habitat", "site_freq")] , by= "species") ## Examine relationship before modelling plot(rs ~ dom_habitat, data=xvaldf) ## Fit linear model mod1 <- lm(rs ~ dom_habitat, data=xvaldf) ## Model outcome summary(mod1) anova(mod1) ## EFFECT OF SPECIES RELATIVE SCARCITY ON MODEL PREDICTIVE ABILITY ============= # Model predictive ability (rs) in response to relative scarcity (frequency of # occurence in urban sites) ## Examine relationship before modelling plot(rs ~ site_freq, data=xvaldf) # Indicates a non-linear relationship, so will use a GAM for modelling # Fit GAM mod2 <- gam(rs ~ s(site_freq, k=10), data=xvaldf, gamma= 1.4) # Model outcome summary(mod2) ## Use first derivative of the fitted trend to determine the range of values ## over which site_freq is having a significant effect ## Load functions to produce derivatives via finite differences, sourced from ## Gavin Simpson's github for example application and further explanation, see: ## https://www.fromthebottomoftheheap.net/2011/06/12/additive-modelling-and-the-hadcrut3v-global-mean-temperature-series/ source_url("https://raw.githubusercontent.com/gavinsimpson/random_code/master/derivFun.R") # Predictions for y across the full range of x values preds <- predict(mod2, data.frame(site_freq= 1:482), se.fit=T) ## Calculate first derivative and 95% CI mod2.d <- Deriv(mod2, n=482) mod2.dci <- confint(mod2.d) ## Significance is assigned where zero is not within the confidence range mod2.dsig <- signifD(preds$fit , d = mod2.d$site_freq$deriv , upper = mod2.dci$site_freq$upper , lower = mod2.dci$site_freq$lower ) range(which(!is.na(mod2.dsig$incr))) range(which(!is.na(mod2.dsig$decr))) ## Plot first derivatives with 95% ci and with areas of significance highlighted plot(mod2.d, sizer = TRUE, alpha = 0.05) ## Plot relationship between species rs and site_freq values plot(rs~site_freq, data= xvaldf , xlab= "Species frequency of occurence", ylab= "Mean rs (95% CI)") ## Add modelled non-linear relationship and area of significant x <- 1:482 y <- preds$fit yci <- preds$se.fit lines(x, y) lines(x, mod2.dsig$incr, lwd = 3, col="red") lines(x, y + yci, lty=2) lines(x, y - yci, lty=2) ## OPEN COMPLETE .xlsx RESULTS FILE ============================================ openXL(paste0(wd, "/", filename, ".xlsx")) ## END =========================================================================