

# geom_hex # hexbin for lots of points, or .png

# --------------------------------------------------------------------------------------------------------------------------------
# functions

# negated %in%: TRUE for every element of x that has no match in table
"%!in%" <- function(x, table) !(x %in% table)


# --------------------------------------------------------------------------------------------------------------------------------
# options

# keep strings as character (not factor) when data frames are created or read;
# this is already the default from R 4.0 on, so the call only matters on older R
options(stringsAsFactors = FALSE)


# --------------------------------------------------------------------------------------------------------------------------------
# packages

# install.packages("caret", dependencies = c("Imports", "Depends", "Suggests"))
packages <- c("corrplot", "ggplot2", "grid", "gridExtra", "scales", "RColorBrewer", "maps", 
                        "plyr", "reshape2", "caret", "GGally", "choroplethr", "choroplethrMaps", "ggthemes") 
sapply(packages, require, character.only = TRUE)


# --------------------------------------------------------------------------------------------------------------------------------
# functions

# Capitalise the first letter of every space-separated word of a single string
# and lower-case everything else, e.g. "gREAT plains" -> "Great Plains".
# Only the first element of x is used.
simpleCap <- function(x) {
  words <- strsplit(tolower(x), " ")[[1]]
  initials <- toupper(substr(words, 1, 1))
  remainder <- substring(words, 2)
  paste(paste0(initials, remainder), collapse = " ")
}

# tools::toTitleCase(tolower(x))

# Remove boxplot outliers of `limit_var` within each combination of `by_vars`.
#
# dat:       data frame holding `limit_var` and all `by_vars`
# form:      ignored -- kept only so existing calls keep working; the formula is
#            always rebuilt as `limit_var ~ by_var1 + by_var2 + ...`
# limit_var: name (string) of the numeric column to filter on
# by_vars:   character vector of grouping column names
#
# Returns `dat` restricted to the rows whose `limit_var` lies within the boxplot
# whiskers of its group, with the helper columns "level", "lower" and "upper"
# added. Rows with a missing `limit_var` (or without limits) are dropped.
rm_box_outliers <- function(dat, form, limit_var, by_vars) {
  # get boxplot outlier limits (whisker ends are rows 1 and 5 of $stats)
  form <- formula(paste(limit_var, "~", paste(by_vars, collapse = " + ")))
  box_stats <- boxplot(form, data = dat, plot = FALSE)
  sts_df <- data.frame(lower = box_stats$stats[1, ], upper = box_stats$stats[5, ])
  sts_df$level <- box_stats$names

  # attach the limits of its group to every row
  dat$level <- interaction(dat[, by_vars])
  dat <- merge(dat, sts_df, by = "level", all.x = TRUE)

  # exclude data outside of limits; which() discards NA comparisons, which
  # would otherwise be returned as rows consisting entirely of NA
  dat_split <- split(dat, f = dat$level)
  dat_split <- lapply(dat_split, function(x) {
    inside <- x[, limit_var] >= x[, "lower"] & x[, limit_var] <= x[, "upper"]
    x[which(inside), , drop = FALSE]
  })
  dat_limits <- do.call(rbind.data.frame, dat_split)
  rownames(dat_limits) <- NULL
  dat_limits
}


# --------------------------------------------------------------------------------------------------------------------------------
# load training data and model prediction data

project_dir <- "path-to-output-directory"
data_dir <- "path-to-input-directory"
setwd(project_dir)

# model inputs/outputs, read relative to project_dir
load(file = "data/data_PP.Rdata") # combined_PP
load(file = "data/final_cubist_predictions_exp.Rdata") # final_cubist_predictions_exp
#load("data/files_pred.Rdata") # files_pred (cleaned RCP scenario data for 2050 and 2070)

# county lookup: title-cased level-1 ecoregion name, plus "region" (= FIPS) as merge key
eco_county <- read.csv(file = file.path(data_dir, "data/eco_county_single.csv")) # changed FIPS 46102 to 46113
eco_county$ecoregion <- factor(tools::toTitleCase(tolower(eco_county$NA_L1NAME)))
eco_county$region <- eco_county$FIPS

political_regions <- read.csv(file = file.path(data_dir, "data/state_regions.csv"))

# add ecoregion (by county) and political Region/Division (by state); every row of combined_PP is kept
combined_PP <- merge(x = eco_county[, c("region", "ecoregion")], y = combined_PP,
                     by = "region", all.y = TRUE)
combined_PP <- merge(x = political_regions[, c("state", "Region", "Division")], y = combined_PP,
                     by = "state", all.y = TRUE)

save(combined_PP, file = "data/combined_PP_ecoregion_political.Rdata", compress = "gzip")
#load("data/combined_PP_ecoregion_political.Rdata")


# --------------------------------------------------------------------------------------------------------------------------------
# create diffs between present and 2050 and present and 2070 first, then reshape and aggregate by state or bioregion

# attach region/state/ecoregion/Region/Division to the model predictions (full outer join on county)
preds <- merge(combined_PP[, c("region", "state", "ecoregion", "Region", "Division")], final_cubist_predictions_exp, by = "region", all = TRUE)

# column window of the 14 future predictions that belong to one response;
# it is advanced by 14 before first use, so the first window is columns 76:89
from <- 62 # window start counter
to <- 75 # window end counter

for (present in responseColumns) {

  from <- from + 14
  to <- to + 14

  # no future columns left: end the loop normally instead of raising an error,
  # which would abort the whole script when it is sourced
  if (from > ncol(preds)) {
    message("Completed")
    break
  }

  for (future in colnames(preds)[from:to]) {
    cat(future, "vs", present, "\n")

    # absolute change: future prediction minus present-day value
    preds[, paste0("diff_", present, "_vs_", future)] <- preds[, future] - preds[, present]
    # preds[, paste0("diff_", present, "_vs_", future)] <- ((preds[, future] - preds[, present]) / preds[, "area"]) * 100

  } # end of future loop

} # end of present loop


# long format: one row per county (id columns) and per non-id column of preds
preds_long <- melt(preds, id.vars = colnames(preds[, c("region", "Region", "Division", "ecoregion", "state", "area")]), variable.name = "variable", value.name = "pred")

# split into two data frames: 1) contemporary variables, 2) predictions and differences
# NOTE(review): 69 is hard-coded as the number of present-day variable columns and
# 3066 as the number of rows of preds (presumably counties) -- confirm both against the data;
# rep() fails if nrow(contemp) is not a multiple of / equal to 69*3066
contemp <- preds_long[preds_long$variable %in% levels(preds_long$variable)[1:69], ]
contemp$year <- rep("present", times = 69*3066)
contemp$scenario <- rep("current", times = 69*3066)    
contemp <- droplevels(contemp)
pred_and_diff <- preds_long[!(preds_long$variable %in% levels(preds_long$variable)[1:69]), ]
pred_and_diff <- droplevels(pred_and_diff)

# create year and scenario variables
# year: "20" + the last two digits of the column name
# scenario: the four word-characters that follow an underscore and precede the trailing digits
pred_and_diff$year <- paste0("20", gsub(".+(\\d{2})$", "\\1", pred_and_diff$variable))
pred_and_diff$scenario <- gsub(".+\\_(\\w{4})\\d+$", "\\1", pred_and_diff$variable)     

# recombine predictions and differences with contemporary variables
preds_long_all <- rbind.fill(contemp, pred_and_diff)
preds_long_all$year <- factor(preds_long_all$year)
preds_long_all$scenario <- factor(preds_long_all$scenario)

# create response variable and order factor levels
# NOTE(review): the labels are assigned purely by row position -- this assumes the rows are still
# ordered as melt() produced them (present block, then 14 prediction columns per response,
# then 14 diff columns per response, 3066 rows each); verify before reusing with other data
responseColumns_diff <- paste0("diff.", responseColumns)
preds_long_all$response <- factor(c(rep(responseColumns, each = 3066), rep(c(responseColumns, responseColumns_diff), each = 14*3066))) 
preds_long_all$response <- factor(preds_long_all$response, levels = c(responseColumns, responseColumns_diff))

# remove "variable"
preds_long_all <- preds_long_all[, !(colnames(preds_long_all) %in% "variable")]

# average of scenarios
# (must be computed here: save() below stores preds_long_all_avg and fails with
# "object not found" if it does not exist)
preds_long_all_avg <- ddply(preds_long_all, .(region, state, Region, Division, ecoregion, response, year), summarise, 
    pred_avg = mean(pred, na.rm = TRUE))
        
save(preds_long_all, preds_long_all_avg, file = "data/plots_final_response_predictions.Rdata", compress = "gzip")
#load("data/plots_final_response_predictions.Rdata")


# --------------------------------------------------------------------------------------------------------------------------------
# load manipulated new data (7 RCP scenarios for 2050 and 2070)

# read every scenario file whose name contains ac, gd, gs or he into a named list;
# list names are the leading word-characters of the file name (everything before the first dot)
RCP_data_path <- "~/Dropbox/US County Phylodiversity/data/County Data"
RCPfilenames <- list.files(
  path = RCP_data_path,
  pattern = "ac|gd|gs|he",
  full.names = TRUE
)
RCP <- setNames(
  lapply(RCPfilenames, read.csv),
  gsub("^(\\w+)\\..+", "\\1", basename(RCPfilenames))
)

# --------------------------------------------------------------------------------------------------------------------------------
# clean manipulated new data

# keep only the counties that are present in the training data
# NOTE(review): rows are filtered on column "FIPS" here, while the imputation step
# below reads column "fips" -- confirm the scenario files really carry both spellings
RCP <- lapply(RCP, function(scenario_df) {
  scenario_df[scenario_df$FIPS %in% combined_PP$fips, ]
})

# keep only the columns that are predictors in the training data
pred_variables <- colnames(combined_PP)[
  !(colnames(combined_PP) %in% c(responseColumns, "Region", "Division", "ecoregion"))
]
RCP <- lapply(RCP, function(scenario_df) {
  scenario_df[, colnames(scenario_df) %in% pred_variables]
})

# full state names -> two-letter abbreviations; anything unmatched is set to "DC"
RCP <- lapply(RCP, function(scenario_df) {
  scenario_df$state <- state.abb[match(scenario_df$state, state.name)]
  scenario_df$state[is.na(scenario_df$state)] <- "DC"
  scenario_df
})

# impute missing predictor values with caret's bagged-tree imputation;
# "state" and "fips" are left out of the imputation and "fips" is returned as "region"
RCP <- lapply(RCP, function(scenario_df) {
  predictors <- scenario_df[, colnames(scenario_df) %!in% c("state", "fips")]
  imputer <- preProcess(predictors, method = "bagImpute")
  data.frame(
    state = scenario_df$state,
    region = scenario_df$fips,
    predict(imputer, newdata = predictors)
  )
})

# add the "totalOntree" columns needed for the diversity metrics
# NOTE(review): the columns are bound by position, which assumes every scenario table
# has the same rows in the same order as combined_PP -- verify
RCP <- lapply(RCP, function(scenario_df) {
  data.frame(
    scenario_df,
    totalOntree = combined_PP$totalOntree,
    totalOntree.n = combined_PP$totalOntree.n,
    totalOntree.i = combined_PP$totalOntree.i,
    totalOntree.e = combined_PP$totalOntree.e,
    totalOntree.int = combined_PP$totalOntree.int
  )
})

# centre longitude and latitude on their per-scenario means, then add their product
# as the lat/long interaction term
RCP <- lapply(RCP, function(scenario_df) {
  scenario_df$longitude <- scenario_df$longitude - mean(scenario_df$longitude, na.rm = TRUE)
  scenario_df$latitude <- scenario_df$latitude - mean(scenario_df$latitude, na.rm = TRUE)
  scenario_df$long_x_lat <- scenario_df$longitude * scenario_df$latitude
  scenario_df
})

# save
save(RCP, file = "data/RCP_cleaned.Rdata", compress = "gzip")
#load("data/RCP_cleaned.Rdata")


#######################################################################################
#### FIG_1: TOTAL for BOXPLOTS and CHOROPLETH PLOTS 
#######################################################################################
# --------------------------------------------------------------------------------------------------------------------------------
# predicted changes (delta) in plant biodiversity relative to current values (ac45, gs45, gd45, he45, averaged), for all 7 metrics

# USE % OF COUNTIES AS RESPONSE SCALE

# --------------------------------------------------------------------------------------------
# present day values
# present-day rows of the 7 "all species" metrics; pred_original is the baseline value
preds_long_response7 <- preds_long_all[
  preds_long_all$response %in% c("species", "family", "genus", "PD.ALL", "MPD.ALL", "PD.s.ALL", "MPD.s.ALL") &
    preds_long_all$year %in% "present" &
    preds_long_all$scenario %in% "current", ]
preds_long_response7 <- droplevels(preds_long_response7)
preds_long_response7$pred_original <- preds_long_response7$pred

# display names for the metrics
preds_long_response7$response <- factor(
  x = preds_long_response7$response,
  levels = c("species", "genus", "family", "PD.ALL", "MPD.ALL", "PD.s.ALL", "MPD.s.ALL"),
  labels = c("Species", "Genus", "Family", "PD (all)", "MPD (all)", "PD (std. all)", "MPD (std. all)")
)

# --------------------------------------------------------------------------------------------
# differences (future - present) of the same metrics for the four RCP 4.5 scenarios
preds_long_response7_diff <- preds_long_all[
  preds_long_all$response %in% c("diff.species", "diff.family", "diff.genus", "diff.PD.ALL", "diff.MPD.ALL", "diff.PD.s.ALL", "diff.MPD.s.ALL") &
    preds_long_all$year %in% c("2050", "2070") &
    preds_long_all$scenario %in% c("ac45", "gd45", "gs45", "he45"), ]
preds_long_response7_diff <- droplevels(preds_long_response7_diff)

# --------------------------------------------------------------------------------------------
# same display names as the present-day table, so both can be merged on "response"
preds_long_response7_diff$response <- factor(
  x = preds_long_response7_diff$response,
  levels = c("diff.species", "diff.genus", "diff.family", "diff.PD.ALL", "diff.MPD.ALL", "diff.PD.s.ALL", "diff.MPD.s.ALL"),
  labels = c("Species", "Genus", "Family", "PD (all)", "MPD (all)", "PD (std. all)", "MPD (std. all)")
)

# short codes for ecoregions and census divisions
preds_long_response7_diff$ecoregion <- factor(
  x = preds_long_response7_diff$ecoregion,
  levels = c("Eastern Temperate Forests", "Great Plains", "Marine West Coast Forest", "Mediterranean California", "North American Deserts",
             "Northern Forests", "Northwestern Forested Mountains", "Southern Semiarid Highlands", "Temperate Sierras", "Tropical Wet Forests"),
  labels = c("ETF", "GP", "MWCF", "MC", "NAD", "NF", "NFM", "SSH", "TS", "TWF")
)

preds_long_response7_diff$Division <- factor(
  x = preds_long_response7_diff$Division,
  levels = c("South Atlantic", "East South Central", "Mountain", "East North Central", "West North Central",
             "West South Central", "New England", "Middle Atlantic", "Pacific"),
  labels = c("SA", "ESC", "M", "ENC", "WNC", "WSC", "NE", "MA", "P")
)

# --------------------------------------------------------------------------------------------
# attach the present-day baseline to every future difference
preds_long_response7_diff <- merge(
  x = preds_long_response7_diff,
  y = preds_long_response7[, c("region", "response", "pred_original")],
  by = c("region", "response"),
  all.x = TRUE
)

# difference expressed as a percentage of the present-day value
preds_long_response7_diff$diff_percent <-
  (preds_long_response7_diff$pred / preds_long_response7_diff$pred_original) * 100

# mean over the scenarios, per county, metric and year
preds_long_response7_diff_avg <- ddply(
  .data = preds_long_response7_diff,
  .variables = .(region, state, Region, Division, ecoregion, response, year),
  .fun = summarise,
  diff_percent_avg = mean(diff_percent, na.rm = TRUE)
)
# -------------------------------------------------------------------------------------------- 

    
#######################################################################################
# FIG_1: Second and third panels: choropleth maps

# NEED TO USE THE DATA THAT IS AVERAGED OVER THE 4 SCENARIOS TO CREATE THE PLOTS DATA
# have two columns of maps, one with (future 2050 - present) and the other with (future 2070 - present)

# --------------------------------------------------------------------------------------------------------------------------------
# predicted changes (delta) in plant biodiversity relative to current values (ac45, gd45, gs45, he45, averaged), for all 7 metrics

# unify factor names
# work on a copy whose metric names are short enough to become column-name prefixes
preds_long_response7_diff_avg_new <- preds_long_response7_diff_avg
preds_long_response7_diff_avg_new$response <- factor(
  x = preds_long_response7_diff_avg_new$response,
  levels = c("Species", "Genus", "Family", "PD (all)", "MPD (all)", "PD (std. all)", "MPD (std. all)"),
  labels = c("species", "genus", "family", "PD", "MPD", "PD.s", "MPD.s")
)

# wide format: one column per metric/year combination, one row per remaining id combination
preds_long_response7_diff_avg_wide <- dcast(
  data = preds_long_response7_diff_avg_new,
  formula = ... ~ response + year,
  value.var = "diff_percent_avg"
)

# average of scenarios
#preds_long_response7_diff_avg <- ddply(preds_long_response7_diff, .(region, state, Region, Division, ecoregion, area, response, year), summarise, 
#    pred_avg = mean(pred, na.rm = TRUE))
# reshape "response" and "year" back to wide format
#preds_long_response7_diff_avg_wide <- dcast(preds_long_response7_diff_avg, ... ~ response + year, value.var = "pred_avg")

save(
  preds_long_response7_diff, preds_long_response7_diff_avg, preds_long_response7_diff_avg_wide,
  file = "data/preds_long_response7_diff_avg_TOTAL.Rdata",
  compress = "gzip"
)
#load("data/preds_long_response7_diff_avg_TOTAL.Rdata")


#######################################################################################
# Fig 3-4 present to present final 4x4 panel plots (ALL)

# ----------------------------------------------------------------
# long format of the training data: one row per county and response column.
# The argument is spelled measure.vars (the former measure_vars was silently
# swallowed by `...`); intersect() keeps the columns melt() used by default.
combined_PP_long <- melt(combined_PP, 
    id.vars = colnames(combined_PP)[colnames(combined_PP) %!in% responseColumns],
    measure.vars = intersect(colnames(combined_PP), responseColumns), 
    variable.name = "response",
    value.name = "metric")

# the 7 "all species" metrics together with the four plotted predictors
response4x4 <- c("species", "genus", "family", "PD.ALL", "MPD.ALL", "PD.s.ALL", "MPD.s.ALL")
combined_PP_long_4x4 <- combined_PP_long[combined_PP_long$response %in% response4x4, 
    c("response", "metric", "bio07.mn", "bio15.mn", "hwsd.t_clay.mn", "enow.coast")]
combined_PP_long_4x4 <- droplevels(combined_PP_long_4x4)

# second melt: one row per response value and predictor value
combined_PP_long_4x4_long <- melt(combined_PP_long_4x4, 
    id.vars = c("response", "metric"), variable.name = "predictor", value.name = "env_var")
# NOTE(review): these labels use "Std." while the prediction tables use "std." -- left unchanged
# because plotting code outside this file may match on the exact strings
combined_PP_long_4x4_long$response <- factor(combined_PP_long_4x4_long$response, 
    levels = response4x4,
    labels = c("Species", "Genus", "Family", "PD (all)", "MPD (all)", "PD (Std. all)", "MPD (Std. all)"))
# (stray empty argument after the first one removed)
combined_PP_long_4x4_long$predictor <- factor(combined_PP_long_4x4_long$predictor, 
    levels = c("bio07.mn", "bio15.mn", "hwsd.t_clay.mn", "enow.coast"),
    labels = c("BIO 07 (mn)", "BIO 15 (mn)", "Clay (mn)", "Coast"))
str(combined_PP_long_4x4_long)
 
save(combined_PP_long_4x4_long, file = "data/explanatory_present2present_4x4_panel.Rdata", compress = "gzip")
#load("data/explanatory_present2present_4x4_panel.Rdata")


# -----------------------------------------------------------------
#combined_PP_long_box <- combined_PP_long[combined_PP_long$response %in% c("species", "family", "PD.ALL", "MPD.ALL"), 
#    c("response", "metric", "bio07.mn", "bio15.mn", "gmted2010.elev_mean.mn", "hwsd.t_clay.mn", "enow.coast")]
#combined_PP_long_box <- droplevels(combined_PP_long_box)

#combined_PP_long_box$response <- factor(combined_PP_long_box$response, 
#    levels = c("species", "family", "PD.ALL", "MPD.ALL"),
#    labels = c("Species", "Family", "PD (all)", "MPD (all)"))



#######################################################################################
# Fig S9 present to present final 4x4 panel plots (NAT versus INTRO)

 # use species.i and species.n etc. and color by .i and .n
 
# long format of the training data: one row per county and response column.
# The argument is spelled measure.vars (the former measure_vars was silently
# swallowed by `...`); intersect() keeps the columns melt() used by default.
combined_PP_long <- melt(combined_PP, 
    id.vars = colnames(combined_PP)[colnames(combined_PP) %!in% responseColumns],
    measure.vars = intersect(colnames(combined_PP), responseColumns), 
    variable.name = "variable",  # VARIABLE
    value.name = "metric")

# native (.n / .NAT) and introduced (.i / .INTRO) versions of the 7 metrics
response4x4_ni <- c("species.n", "species.i", "genus.n", "genus.i", "family.n", "family.i", "PD.NAT", "PD.INTRO", "MPD.NAT", "MPD.INTRO", "PD.s.NAT", "PD.s.INTRO", "MPD.s.NAT", "MPD.s.INTRO")
combined_PP_long_4x4_ni <- combined_PP_long[combined_PP_long$variable %in% response4x4_ni,  c("variable", "metric", "bio07.mn", "bio15.mn", "hwsd.t_clay.mn", "enow.coast")]
combined_PP_long_4x4_ni <- droplevels(combined_PP_long_4x4_ni)

# second melt: one row per response value and predictor value
combined_PP_long_4x4_ni_long <- melt(combined_PP_long_4x4_ni, id.vars = c("variable", "metric"), variable.name = "predictor", value.name = "env_var")

combined_PP_long_4x4_ni_long$response <- factor(combined_PP_long_4x4_ni_long$variable, levels = response4x4_ni)
# Origin: the part after the first dot of the name; the standardised metrics
# ("PD.s.NAT") therefore yield "s.NAT" / "s.INTRO", which are mapped below as well
combined_PP_long_4x4_ni_long$Origin <- gsub("\\w+\\.(\\w+)", "\\1", as.character(combined_PP_long_4x4_ni_long$variable))
combined_PP_long_4x4_ni_long[combined_PP_long_4x4_ni_long$Origin %in% c("n", "NAT", "s.NAT"), "Origin"] <- "native" 
combined_PP_long_4x4_ni_long[combined_PP_long_4x4_ni_long$Origin %in% c("i", "INTRO", "s.INTRO"), "Origin"] <- "introduced"
combined_PP_long_4x4_ni_long$Origin <- factor(combined_PP_long_4x4_ni_long$Origin)

# (stray empty argument after the first one removed)
combined_PP_long_4x4_ni_long$predictor <- factor(combined_PP_long_4x4_ni_long$predictor, 
    levels = c("bio07.mn", "bio15.mn", "hwsd.t_clay.mn", "enow.coast"),
    labels = c("BIO 07 (mn)", "BIO 15 (mn)", "Clay (mn)", "Coast"))
str(combined_PP_long_4x4_ni_long)

save(combined_PP_long_4x4_ni_long, file = "data/explanatory_present2present_4x4_panel_NAT_INTRO.Rdata", compress = "gzip")
#load("data/explanatory_present2present_4x4_panel_NAT_INTRO.Rdata")



#######################################################################################
#### FIG_S5-S6 NATIVE VS NON-NATIVE for BOXPLOTS (FIG_S6) and CHOROPLETH PLOTS (FIG_S5)
#######################################################################################
# --------------------------------------------------------------------------------------------------------------------------------
# predicted changes (delta) in plant biodiversity relative to current values (ac45, gs45, gd45, he45, averaged), for all 7 metrics

# USE % OF COUNTIES AS RESPONSE SCALE

# --------------------------------------------------------------------------------------------
# present day values
# present-day rows of the native (.n / .NAT) and introduced (.i / .INTRO) metrics;
# pred_original is the baseline value
preds_long_response7_ni <- preds_long_all[with(preds_long_all, response %in% c("species.n", "family.n", "genus.n", "species.i", "family.i", "genus.i", "PD.NAT", "MPD.NAT", "PD.s.NAT", "MPD.s.NAT", "PD.INTRO", "MPD.INTRO", "PD.s.INTRO", "MPD.s.INTRO") & 
                                                                                                     year %in% c("present") &
                                                                                                     scenario %in% c("current")), ]
preds_long_response7_ni <- droplevels(preds_long_response7_ni)
preds_long_response7_ni$pred_original <- preds_long_response7_ni$pred

# native versus introduced: split the suffix off the response name.
# The label is "Introduced" (not "Invasive") so that it matches the label of the
# diff table below and NAT_INTRO can be used as a merge key.
preds_long_response7_ni$NAT_INTRO <- gsub(".+\\.(\\w+)$", "\\1", preds_long_response7_ni$response)
preds_long_response7_ni$NAT_INTRO <- factor(ifelse(preds_long_response7_ni$NAT_INTRO %in% c("n", "NAT"), "Native", "Introduced"))
preds_long_response7_ni$response <- gsub("(.+)\\.\\w+$", "\\1", preds_long_response7_ni$response)

preds_long_response7_ni$response <- factor(preds_long_response7_ni$response)
    
# --------------------------------------------------------------------------------------------
# differences (future - present) for the four RCP 4.5 scenarios
preds_long_response7_diff_ni <- preds_long_all[with(preds_long_all, response %in% c("diff.species.n", "diff.family.n", "diff.genus.n", "diff.species.i", "diff.family.i", "diff.genus.i", "diff.PD.NAT", "diff.MPD.NAT", "diff.PD.s.NAT", "diff.MPD.s.NAT", "diff.PD.INTRO", "diff.MPD.INTRO", "diff.PD.s.INTRO", "diff.MPD.s.INTRO") & 
                                                                                                     year %in% c("2050", "2070") &
                                                                                                     scenario %in% c("ac45", "gd45", "gs45", "he45")), ]
preds_long_response7_diff_ni <- droplevels(preds_long_response7_diff_ni)

# native versus introduced
preds_long_response7_diff_ni$NAT_INTRO <- gsub(".+\\.(\\w+)$", "\\1", preds_long_response7_diff_ni$response)
preds_long_response7_diff_ni$NAT_INTRO <- factor(ifelse(preds_long_response7_diff_ni$NAT_INTRO %in% c("n", "NAT"), "Native", "Introduced"))
preds_long_response7_diff_ni$response <- gsub("(.+)\\.\\w+$", "\\1", preds_long_response7_diff_ni$response)

# --------------------------------------------------------------------------------------------
# unify response factor names (same names as in the present-day table)
preds_long_response7_diff_ni$response <- factor(preds_long_response7_diff_ni$response,  
    levels = c("diff.species", "diff.genus", "diff.family", "diff.PD", "diff.MPD", "diff.PD.s", "diff.MPD.s"), 
    labels = c("species", "genus", "family", "PD", "MPD", "PD.s", "MPD.s")) 
    
# abbreviate eco-region and division factor names
preds_long_response7_diff_ni$ecoregion <- factor(preds_long_response7_diff_ni$ecoregion,  
    levels = c("Eastern Temperate Forests", "Great Plains", "Marine West Coast Forest", "Mediterranean California", "North American Deserts",
                     "Northern Forests", "Northwestern Forested Mountains", "Southern Semiarid Highlands", "Temperate Sierras", "Tropical Wet Forests"), 
    labels = c("ETF", "GP", "MWCF", "MC", "NAD", "NF", "NFM", "SSH", "TS", "TWF"))

preds_long_response7_diff_ni$Division <- factor(preds_long_response7_diff_ni$Division,  
    levels = c("South Atlantic", "East South Central", "Mountain", "East North Central", "West North Central",
                     "West South Central", "New England", "Middle Atlantic", "Pacific" ), 
    labels = c("SA", "ESC", "M", "ENC", "WNC", "WSC", "NE", "MA", "P"))

# --------------------------------------------------------------------------------------------            
# merge present day values with future prediction data.
# NAT_INTRO has to be part of the key: once the suffix is stripped, the present-day
# table holds a native AND an introduced row per (region, response), so merging on
# region + response alone duplicated every diff row and paired native differences
# with introduced baselines (and vice versa).
# Note: merge() places the key columns first, so NAT_INTRO now follows region and response.
preds_long_response7_diff_ni <- merge(preds_long_response7_diff_ni, 
                                      preds_long_response7_ni[, c("region", "response", "NAT_INTRO", "pred_original")], 
                                      by = c("region", "response", "NAT_INTRO"), all.x = TRUE)

# create % difference
preds_long_response7_diff_ni$diff_percent <- with(preds_long_response7_diff_ni, (pred / pred_original) * 100)

# average of scenarios
preds_long_response7_diff_avg_ni <- ddply(preds_long_response7_diff_ni, .(region, state, Region, Division, ecoregion, NAT_INTRO, response, year), summarise, 
    diff_percent_avg = mean(diff_percent, na.rm = TRUE))


#######################################################################################
# FIG_S5: Second and third panels: choropleth maps

# NEED TO USE THE DATA THAT IS AVERAGED OVER THE 4 SCENARIOS TO CREATE THE PLOTS DATA
# have two columns of maps, one with (future 2050 - present) and the other with (future 2070 - present)

# --------------------------------------------------------------------------------------------------------------------------------
# predicted changes (delta) in plant biodiversity relative to current values (ac45, gd45, gs45, he45, averaged), for all 7 metrics

# reshape "response", "year", and "NAT/INTRO" back to wide format
# wide format: one column per metric/year/origin combination
preds_long_response7_diff_avg_wide_ni <- dcast(
  data = preds_long_response7_diff_avg_ni,
  formula = ... ~ response + year + NAT_INTRO,
  value.var = "diff_percent_avg"
)

# subset to include only 2070
#preds_long_response7_diff_avg_wide_ni <- preds_long_response7_diff_avg_wide_ni[preds_long_response7_diff_avg_wide_ni$year %in% "2070", ]
            
# average of scenarios
#preds_long_response7_diff_avg <- ddply(preds_long_response7_diff, .(region, state, Region, Division, ecoregion, area, response, year), summarise, 
#    pred_avg = mean(pred, na.rm = TRUE))
# reshape "response" and "year" back to wide format
#preds_long_response7_diff_avg_wide <- dcast(preds_long_response7_diff_avg, ... ~ response + year, value.var = "pred_avg")


save(
  preds_long_response7_diff_ni, preds_long_response7_diff_avg_ni, preds_long_response7_diff_avg_wide_ni,
  file = "data/preds_long_response7_diff_avg_NAT_INTRO.Rdata",
  compress = "gzip"
)
#load("data/preds_long_response7_diff_avg_NAT_INTRO.Rdata")


#######################################################################################
#### FIG_S7: TOTAL for BOXPLOTS and CHOROPLETH PLOTS for NON-RCP45
#######################################################################################
# --------------------------------------------------------------------------------------------------------------------------------
# predicted changes (delta) in plant biodiversity relative to current values (he26, he60, he85, averaged), for all 7 metrics

# USE % OF COUNTIES AS RESPONSE SCALE

# --------------------------------------------------------------------------------------------
# present day values
# present-day rows of the 7 "all species" metrics; pred_original is the baseline value
preds_long_response7 <- preds_long_all[
  preds_long_all$response %in% c("species", "family", "genus", "PD.ALL", "MPD.ALL", "PD.s.ALL", "MPD.s.ALL") &
    preds_long_all$year %in% "present" &
    preds_long_all$scenario %in% "current", ]
preds_long_response7 <- droplevels(preds_long_response7)
preds_long_response7$pred_original <- preds_long_response7$pred

# display names for the metrics
preds_long_response7$response <- factor(
  x = preds_long_response7$response,
  levels = c("species", "genus", "family", "PD.ALL", "MPD.ALL", "PD.s.ALL", "MPD.s.ALL"),
  labels = c("Species", "Genus", "Family", "PD (all)", "MPD (all)", "PD (std. all)", "MPD (std. all)")
)

# --------------------------------------------------------------------------------------------
# differences (future - present) of the same metrics for the scenarios he26, he60 and he85
preds_long_response7_diff_nonRCP45 <- preds_long_all[
  preds_long_all$response %in% c("diff.species", "diff.family", "diff.genus", "diff.PD.ALL", "diff.MPD.ALL", "diff.PD.s.ALL", "diff.MPD.s.ALL") &
    preds_long_all$year %in% c("2050", "2070") &
    preds_long_all$scenario %in% c("he26", "he60", "he85"), ]
preds_long_response7_diff_nonRCP45 <- droplevels(preds_long_response7_diff_nonRCP45)

# --------------------------------------------------------------------------------------------
# same display names as the present-day table, so both can be merged on "response"
preds_long_response7_diff_nonRCP45$response <- factor(
  x = preds_long_response7_diff_nonRCP45$response,
  levels = c("diff.species", "diff.genus", "diff.family", "diff.PD.ALL", "diff.MPD.ALL", "diff.PD.s.ALL", "diff.MPD.s.ALL"),
  labels = c("Species", "Genus", "Family", "PD (all)", "MPD (all)", "PD (std. all)", "MPD (std. all)")
)

# short codes for ecoregions and census divisions
preds_long_response7_diff_nonRCP45$ecoregion <- factor(
  x = preds_long_response7_diff_nonRCP45$ecoregion,
  levels = c("Eastern Temperate Forests", "Great Plains", "Marine West Coast Forest", "Mediterranean California", "North American Deserts",
             "Northern Forests", "Northwestern Forested Mountains", "Southern Semiarid Highlands", "Temperate Sierras", "Tropical Wet Forests"),
  labels = c("ETF", "GP", "MWCF", "MC", "NAD", "NF", "NFM", "SSH", "TS", "TWF")
)

preds_long_response7_diff_nonRCP45$Division <- factor(
  x = preds_long_response7_diff_nonRCP45$Division,
  levels = c("South Atlantic", "East South Central", "Mountain", "East North Central", "West North Central",
             "West South Central", "New England", "Middle Atlantic", "Pacific"),
  labels = c("SA", "ESC", "M", "ENC", "WNC", "WSC", "NE", "MA", "P")
)

# --------------------------------------------------------------------------------------------
# attach the present-day baseline to every future difference
preds_long_response7_diff_nonRCP45 <- merge(
  x = preds_long_response7_diff_nonRCP45,
  y = preds_long_response7[, c("region", "response", "pred_original")],
  by = c("region", "response"),
  all.x = TRUE
)

# difference expressed as a percentage of the present-day value
preds_long_response7_diff_nonRCP45$diff_percent <-
  (preds_long_response7_diff_nonRCP45$pred / preds_long_response7_diff_nonRCP45$pred_original) * 100

# mean over the scenarios, per county, metric and year
preds_long_response7_diff_avg_nonRCP45 <- ddply(
  .data = preds_long_response7_diff_nonRCP45,
  .variables = .(region, state, Region, Division, ecoregion, response, year),
  .fun = summarise,
  diff_percent_avg = mean(diff_percent, na.rm = TRUE)
)
# -------------------------------------------------------------------------------------------- 

#######################################################################################
# FIG_S7: Second and third panels: choropleth maps

# NEED TO USE THE DATA THAT IS AVERAGED OVER THE 3 SCENARIOS TO CREATE THE PLOTS DATA
# have two columns of maps, one with (future 2050 - present) and the other with (future 2070 - present)

# --------------------------------------------------------------------------------------------------------------------------------
# predicted changes (delta) in plant biodiversity relative to current values (he26, he60, he85, averaged), for all 7 metrics

# unify factor names
# work on a copy whose metric names are short enough to become column-name prefixes
preds_long_response7_diff_avg_new_nonRCP45 <- preds_long_response7_diff_avg_nonRCP45
preds_long_response7_diff_avg_new_nonRCP45$response <- factor(
  x = preds_long_response7_diff_avg_new_nonRCP45$response,
  levels = c("Species", "Genus", "Family", "PD (all)", "MPD (all)", "PD (std. all)", "MPD (std. all)"),
  labels = c("species", "genus", "family", "PD", "MPD", "PD.s", "MPD.s")
)

# wide format: one column per metric/year combination
preds_long_response7_diff_avg_wide_nonRCP45 <- dcast(
  data = preds_long_response7_diff_avg_new_nonRCP45,
  formula = ... ~ response + year,
  value.var = "diff_percent_avg"
)

# average of scenarios
#preds_long_response7_diff_avg_nonRCP45 <- ddply(preds_long_response7_diff_nonRCP45, .(region, state, Region, Division, ecoregion, area, response, year), summarise, 
#    pred_avg = mean(pred, na.rm = TRUE))
# reshape "response" and "year" back to wide format
#preds_long_response7_diff_avg_wide_nonRCP45 <- dcast(preds_long_response7_diff_avg_nonRCP45, ... ~ response + year, value.var = "pred_avg")

save(
  preds_long_response7_diff_nonRCP45, preds_long_response7_diff_avg_nonRCP45, preds_long_response7_diff_avg_wide_nonRCP45,
  file = "data/preds_long_response7_diff_avg_TOTAL_nonRCP45.Rdata",
  compress = "gzip"
)
#load("data/preds_long_response7_diff_avg_TOTAL_nonRCP45.Rdata")



#######################################################################################
#### FIG_S8 NATIVE VS NON-NATIVE for BOXPLOTS and CHOROPLETH PLOTS
#######################################################################################
# --------------------------------------------------------------------------------------------------------------------------------
# predicted changes (delta) in plant biodiversity relative to current values (he26, he60, he85, averaged), for all 7 metrics

# USE % OF COUNTIES AS RESPONSE SCALE

# --------------------------------------------------------------------------------------------
# Present-day baseline for the native / non-native metrics.
# Keeps the rows of `preds_long_all` for the 14 origin-specific responses under
# the "current" scenario at year "present", stores the prediction as
# `pred_original`, and splits the origin suffix off the response name.
preds_long_response7_ni <- local({
    origin_responses <- c("species.n", "family.n", "genus.n",
                          "species.i", "family.i", "genus.i",
                          "PD.NAT", "MPD.NAT", "PD.s.NAT", "MPD.s.NAT",
                          "PD.INTRO", "MPD.INTRO", "PD.s.INTRO", "MPD.s.INTRO")
    is_baseline <- preds_long_all[["response"]] %in% origin_responses &
        preds_long_all[["year"]] %in% c("present") &
        preds_long_all[["scenario"]] %in% c("current")
    present <- droplevels(preds_long_all[is_baseline, ])
    present$pred_original <- present$pred

    # last dot-separated token of the response name encodes the origin
    origin_suffix <- gsub(".+\\.(\\w+)$", "\\1", present$response)
    present$NAT_INTRO <- factor(ifelse(origin_suffix %in% c("n", "NAT"), "Native", "Invasive"))

    # what is left after removing that token is the metric name
    present$response <- factor(gsub("(.+)\\.\\w+$", "\\1", present$response))
    present
})
    
# --------------------------------------------------------------------------------------------
# Future-minus-present differences for the native / non-native metrics.
# Keeps the "diff.*" origin-specific responses for 2050 and 2070 under the
# he26, he60 and he85 scenarios, then splits the origin suffix off the
# response name (the metric name is left as a character column here).
preds_long_response7_diff_ni_nonRCP45 <- local({
    origin_diff_responses <- c("diff.species.n", "diff.family.n", "diff.genus.n",
                               "diff.species.i", "diff.family.i", "diff.genus.i",
                               "diff.PD.NAT", "diff.MPD.NAT", "diff.PD.s.NAT", "diff.MPD.s.NAT",
                               "diff.PD.INTRO", "diff.MPD.INTRO", "diff.PD.s.INTRO", "diff.MPD.s.INTRO")
    is_future <- preds_long_all[["response"]] %in% origin_diff_responses &
        preds_long_all[["year"]] %in% c("2050", "2070") &
        preds_long_all[["scenario"]] %in% c("he26", "he60", "he85")
    future <- droplevels(preds_long_all[is_future, ])

    # last dot-separated token of the response name encodes the origin
    origin_suffix <- gsub(".+\\.(\\w+)$", "\\1", future$response)
    future$NAT_INTRO <- factor(ifelse(origin_suffix %in% c("n", "NAT"), "Native", "Introduced"))

    # strip that token so only the metric name remains
    future$response <- gsub("(.+)\\.\\w+$", "\\1", future$response)
    future
})

# --------------------------------------------------------------------------------------------
# Recode three labelling columns of the future-difference table. Each map is
# written as c(old = new); the order of the map fixes the factor level order.

# response: drop the "diff." prefix
preds_long_response7_diff_ni_nonRCP45[["response"]] <- local({
    metric_names <- c("diff.species" = "species", "diff.genus" = "genus", "diff.family" = "family",
                      "diff.PD" = "PD", "diff.MPD" = "MPD",
                      "diff.PD.s" = "PD.s", "diff.MPD.s" = "MPD.s")
    factor(preds_long_response7_diff_ni_nonRCP45[["response"]],
           levels = names(metric_names), labels = unname(metric_names))
})

# ecoregion: abbreviate
preds_long_response7_diff_ni_nonRCP45[["ecoregion"]] <- local({
    ecoregion_abbrev <- c("Eastern Temperate Forests" = "ETF",
                          "Great Plains" = "GP",
                          "Marine West Coast Forest" = "MWCF",
                          "Mediterranean California" = "MC",
                          "North American Deserts" = "NAD",
                          "Northern Forests" = "NF",
                          "Northwestern Forested Mountains" = "NFM",
                          "Southern Semiarid Highlands" = "SSH",
                          "Temperate Sierras" = "TS",
                          "Tropical Wet Forests" = "TWF")
    factor(preds_long_response7_diff_ni_nonRCP45[["ecoregion"]],
           levels = names(ecoregion_abbrev), labels = unname(ecoregion_abbrev))
})

# Division: abbreviate
preds_long_response7_diff_ni_nonRCP45[["Division"]] <- local({
    division_abbrev <- c("South Atlantic" = "SA",
                         "East South Central" = "ESC",
                         "Mountain" = "M",
                         "East North Central" = "ENC",
                         "West North Central" = "WNC",
                         "West South Central" = "WSC",
                         "New England" = "NE",
                         "Middle Atlantic" = "MA",
                         "Pacific" = "P")
    factor(preds_long_response7_diff_ni_nonRCP45[["Division"]],
           levels = names(division_abbrev), labels = unname(division_abbrev))
})

# --------------------------------------------------------------------------------------------            
# merge present day values with future prediction data
#
# The present-day table holds one row per region x metric x origin (native and
# non-native), so origin must be part of the join key. Joining on region and
# response alone pairs every future row with BOTH baselines: the table doubles
# and each % difference is partly computed against the wrong origin's baseline.
preds_long_response7_diff_ni_nonRCP45 <- local({
    future_cols <- colnames(preds_long_response7_diff_ni_nonRCP45)
    key_cols <- c("region", "response")

    baseline <- preds_long_response7_ni[, c(key_cols, "NAT_INTRO", "pred_original")]
    # The present-day table labels non-native taxa "Invasive" while the future
    # table uses "Introduced"; map everything that is not "Native" onto the
    # future table's label so the origin key actually matches.
    baseline$NAT_INTRO <- factor(ifelse(baseline$NAT_INTRO %in% "Native", "Native", "Introduced"))

    joined <- merge(preds_long_response7_diff_ni_nonRCP45, baseline,
                    by = c(key_cols, "NAT_INTRO"), all.x = TRUE)

    # merge() moves all key columns to the front; put NAT_INTRO back where it
    # was so the column layout matches the region/response-keyed merge.
    joined[, c(key_cols, setdiff(future_cols, key_cols), "pred_original")]
})

# create % difference (future change expressed as a percentage of the present-day value)
preds_long_response7_diff_ni_nonRCP45$diff_percent <- with(preds_long_response7_diff_ni_nonRCP45, (pred / pred_original) * 100)

# average of scenarios (scenario is the only grouping variable left out)
preds_long_response7_diff_avg_ni_nonRCP45 <- ddply(preds_long_response7_diff_ni_nonRCP45, .(region, state, Region, Division, ecoregion, NAT_INTRO, response, year), summarise, 
    diff_percent_avg = mean(diff_percent, na.rm = TRUE))


#######################################################################################
# FIG_S8: Second and third panels: choropleth maps

# NEED TO USE THE DATA THAT IS AVERAGED OVER THE 3 SCENARIOS TO CREATE THE PLOTS DATA
# have two columns of maps, one with (future 2050 - present) and the other with (future 2070 - present)

# --------------------------------------------------------------------------------------------------------------------------------
# predicted changes (delta) in plant biodiversity relative to current values (he26, he60, he85, averaged), for all 7 metrics

# Spread metric x year x origin into columns (one column per combination);
# every other column of the scenario-averaged table stays as an id column.
preds_long_response7_diff_avg_wide_ni_nonRCP45 <- dcast(
    data = preds_long_response7_diff_avg_ni_nonRCP45,
    formula = ... ~ response + year + NAT_INTRO,
    value.var = "diff_percent_avg")

# subset to include only 2070
#preds_long_response7_diff_avg_wide_ni <- preds_long_response7_diff_avg_wide_ni[preds_long_response7_diff_avg_wide_ni$year %in% "2070", ]
            
# average of scenarios
#preds_long_response7_diff_avg <- ddply(preds_long_response7_diff, .(region, state, Region, Division, ecoregion, area, response, year), summarise, 
#    pred_avg = mean(pred, na.rm = TRUE))
# reshape "response" and "year" back to wide format
#preds_long_response7_diff_avg_wide <- dcast(preds_long_response7_diff_avg, ... ~ response + year, value.var = "pred_avg")


# Write the per-scenario, scenario-averaged and wide native/introduced tables
# to one gzip-compressed file.
save(list = c("preds_long_response7_diff_ni_nonRCP45",
              "preds_long_response7_diff_avg_ni_nonRCP45",
              "preds_long_response7_diff_avg_wide_ni_nonRCP45"),
     file = "data/preds_long_response7_diff_avg_NAT_INTRO_nonRCP45.Rdata",
     compress = "gzip")
#load("data/preds_long_response7_diff_avg_NAT_INTRO_nonRCP45.Rdata")










































#############################################################################################
#############################################################################################
#############################################################################################
#######################################################################################

# -----------------------------------------------------------------
#combined_PP_long_box_ni <- combined_PP_long[combined_PP_long$variable %in% c("species.n", "species.i", "family.n", "family.i", "PD.NAT", "PD.INTRO", "MPD.NAT", "MPD.INTRO"), #c("variable", "metric", "bio07.mn", "bio15.mn", "gmted2010.elev_mean.mn", "hwsd.t_clay.mn", "enow.coast")]
#combined_PP_long_box_ni <- droplevels(combined_PP_long_box_ni)

#combined_PP_long_box_ni$response <- factor(gsub("(\\w+)\\.\\w+", "\\1", as.character(combined_PP_long_box_ni$variable)), 
#    levels = c("species", "family", "PD", "MPD"), labels = c("Species", "Family", "PD", "MPD"))
#combined_PP_long_box_ni$Origin <- gsub("\\w+\\.(\\w+)", "\\1", as.character(combined_PP_long_box_ni$variable))
#combined_PP_long_box_ni[combined_PP_long_box_ni$Origin %in% c("n", "NAT"), "Origin"] <- "native" 
#combined_PP_long_box_ni[combined_PP_long_box_ni$Origin %in% c("i", "INTRO"), "Origin"] <- "introduced"
#combined_PP_long_box_ni$Origin <- factor(combined_PP_long_box_ni$Origin)

# plot_dat$glaciation <- factor(ifelse(plot_dat$glaciation %in% 2:3, "yes", "no"))
#combined_PP_long_box_ni$enow.coast <- factor(combined_PP_long_box_ni$enow.coast, levels = 0:1, labels = c("No", "Yes"))
#str(combined_PP_long_box_ni)



# diff to diff final 4x4 panel plots (ALL)

# RESPONSES
# --------------------------------------------------------------------------------------------
# Present-day baseline for the four all-taxa metrics.
# Keeps the "current" / "present" rows of `preds_long_all`, stores the
# prediction as `pred_original`, and relabels `response` with display names.
preds_long_response7 <- local({
    display_names <- c("species" = "Species", "family" = "Family",
                       "PD.ALL" = "PD (all)", "MPD.ALL" = "MPD (all)")
    is_baseline <- preds_long_all[["response"]] %in% c("species", "family", "PD.ALL", "MPD.ALL") &
        preds_long_all[["year"]] %in% c("present") &
        preds_long_all[["scenario"]] %in% c("current")
    present <- droplevels(preds_long_all[is_baseline, ])
    present$pred_original <- present$pred
    present$response <- factor(present$response,
                               levels = names(display_names), labels = unname(display_names))
    present
})
    
# --------------------------------------------------------------------------------------------
# Future-minus-present differences for the four all-taxa metrics:
# years 2050 and 2070 under the ac45, gd45, gs45 and he45 scenarios.
preds_long_response7_diff <- local({
    is_future <- preds_long_all[["response"]] %in% c("diff.species", "diff.family", "diff.PD.ALL", "diff.MPD.ALL") &
        preds_long_all[["year"]] %in% c("2050", "2070") &
        preds_long_all[["scenario"]] %in% c("ac45", "gd45", "gs45", "he45")
    droplevels(preds_long_all[is_future, ])
})

# Mean prediction over scenarios for each region x year x metric.
preds_long_response7_diff_avg <- ddply(
    .data = preds_long_response7_diff,
    .variables = .(region, year, response),
    .fun = summarise,
    pred_avg = mean(pred, na.rm = TRUE))

# Relabel the metrics with the same display names the present-day table uses.
preds_long_response7_diff_avg[["response"]] <- local({
    display_names <- c("diff.species" = "Species", "diff.family" = "Family",
                       "diff.PD.ALL" = "PD (all)", "diff.MPD.ALL" = "MPD (all)")
    factor(preds_long_response7_diff_avg[["response"]],
           levels = names(display_names), labels = unname(display_names))
})
    
# --------------------------------------------------------------------------------------------            
# Attach the present-day value of each region x metric to the scenario-averaged
# future differences (left join), then express the averaged difference as a
# percentage of that present-day value.
preds_long_response7_diff_avg <- local({
    baseline <- preds_long_response7[, c("region", "response", "pred_original")]
    joined <- merge(x = preds_long_response7_diff_avg, y = baseline,
                    by = c("region", "response"), all.x = TRUE)
    joined$diff_percent_avg <- (joined[["pred_avg"]] / joined[["pred_original"]]) * 100
    joined
})

# -------------------------------------------------------------------------------------------- 
# PREDICTORS

# get 2050 and 2070 average values (over scenarios) for environmental predictors
# Columns kept from every scenario table: the region id plus four predictors.
predVars <- c("region", "bio07.mn", "bio15.mn", "gmted2010.elev_mean.mn", "hwsd.t_clay.mn")
# Restrict each element of `files_pred_plot` to those columns.
files_pred_num <- lapply(files_pred_plot, function(x) x[, predVars])
#
# Keep (by list name) only these eight scenario/year tables.
ScenarioKeep <- c("ac4550", "gd4550", "gs4550", "he4550", "ac4570", "gd4570", "gs4570", "he4570")
files_pred_num <- files_pred_num[ScenarioKeep]
#
# Split by the year suffix of the list names.
y2050 <- files_pred_num[grep("50$", names(files_pred_num))] # 2050
y2070 <- files_pred_num[grep("70$", names(files_pred_num))] # 2070
# Element-wise mean of the tables: they are added cell by cell and divided by
# their count. The `region` column goes through the same arithmetic.
# NOTE(review): assumes `region` is numeric and that all tables list the same
# regions in the same row order -- nothing here aligns rows; confirm upstream.
y2050_avg <- Reduce(`+`, y2050) / length(y2050)
y2070_avg <- Reduce(`+`, y2070) / length(y2070)
# write.csv(y2050_avg, file = "y2050_avg_RCP45.csv", row.names = FALSE)
# write.csv(y2070_avg, file = "y2070_avg_RCP45.csv", row.names = FALSE)

# create diffs between 2050 - present and 2070 - present for environmental predictors
# Both tables are sorted by region and subtracted cell by cell; `combined_PP`
# supplies the present-day values. The subtraction also hits `region`, so that
# column is restored on the following line, and the predictor columns are then
# prefixed with "diff_<year>_" (the rename relies on `region` being column 1).
# NOTE(review): assumes `combined_PP` and the averaged tables contain exactly
# the same regions, once each -- otherwise rows are silently mispaired.
diff_2050 <- Reduce(`-`, list(y2050_avg[order(y2050_avg$region), ], combined_PP[order(combined_PP$region), predVars]))
diff_2050$region <- y2050_avg[order(y2050_avg$region), "region"]
colnames(diff_2050) <- c("region", paste("diff_2050", colnames(diff_2050)[-1], sep="_"))
diff_2070 <- Reduce(`-`, list(y2070_avg[order(y2070_avg$region), ], combined_PP[order(combined_PP$region), predVars]))
diff_2070$region <- y2070_avg[order(y2070_avg$region), "region"]
colnames(diff_2070) <- c("region", paste("diff_2070", colnames(diff_2070)[-1], sep="_"))

# create % diffs
# Cell-by-cell division of each difference by the present-day value, times 100.
# `region` is divided as well and therefore restored afterwards.
perc_diff_2050 <- Reduce(`/`, list(diff_2050[order(diff_2050$region), ], combined_PP[order(combined_PP$region), predVars])) * 100
perc_diff_2050$region <- diff_2050[order(diff_2050$region), "region"]
perc_diff_2070 <- Reduce(`/`, list(diff_2070[order(diff_2070$region), ], combined_PP[order(combined_PP$region), predVars])) * 100
perc_diff_2070$region <- diff_2070[order(diff_2070$region), "region"]

# merge years and reshape to long
perc_diffs_50_70 <- merge(perc_diff_2050, perc_diff_2070, by = "region")
perc_diffs_50_70_long <- melt(perc_diffs_50_70, id.vars = "region", value.name = "env_var")
# Column names look like "diff_2050_bio07.mn": the four digits directly before
# an underscore give the year, everything after that underscore the predictor.
perc_diffs_50_70_long$year <- factor(gsub(".+(\\d{4})\\_.+", "\\1", perc_diffs_50_70_long$variable))
perc_diffs_50_70_long$predictor <- factor(gsub(".+\\d{4}\\_(.+)", "\\1", perc_diffs_50_70_long$variable))
# Drop the now-redundant `variable` column (`%!in%` is the negated `%in%`
# defined at the top of this file).
perc_diffs_50_70_long <- perc_diffs_50_70_long[ colnames(perc_diffs_50_70_long) %!in% "variable"]

# -------------------------------------------------------------------------------------------- 
# Pair every averaged response difference with the predictor % differences of
# the same region and year (left join on the response table).
preds_long_all_4x4_plot <- merge(x = preds_long_response7_diff_avg,
                                 y = perc_diffs_50_70_long,
                                 by = c("region", "year"),
                                 all.x = TRUE)

# Replace predictor column names with display labels; the order of the map
# fixes the factor level order.
preds_long_all_4x4_plot[["predictor"]] <- local({
    predictor_labels <- c("bio07.mn" = "BIO 07 (mn)",
                          "bio15.mn" = "BIO 15 (mn)",
                          "gmted2010.elev_mean.mn" = "Elevation (mn)",
                          "hwsd.t_clay.mn" = "Clay (mn)")
    factor(preds_long_all_4x4_plot[["predictor"]],
           levels = names(predictor_labels), labels = unname(predictor_labels))
})
str(preds_long_all_4x4_plot)

save(list = "preds_long_all_4x4_plot",
     file = "data/explanatory_diff2diff_4x4_panel.Rdata",
     compress = "gzip")
#load("data/explanatory_diff2diff_4x4_panel.Rdata")

 
#######################################################################################
# regressions

# predictors: 2050 and 2070 differences side by side, one row per region
diff_2050_2070 <- merge(diff_2050, diff_2070, by = "region", all = TRUE) # predictors

# responses: one column per metric x year.
# The short column names are derived from the labels dcast() generates
# ("<response>_<year>") instead of being assigned by position, so a change in
# factor-level or year ordering cannot silently attach a name to the wrong
# column. The expected set is still enforced, and the columns are returned in
# the fixed order below.
preds_long_response7_diff_avg_wide <- local({
    expected_cols <- c("region", "Species_2050", "Species_2070", "Family_2050", "Family_2070",
                       "PD_2050", "PD_2070", "MPD_2050", "MPD_2070")
    wide <- dcast(preds_long_response7_diff_avg, region ~ response + year, value.var = "pred_avg") # responses
    # "PD (all)_2050" -> "PD_2050", "MPD (all)_2070" -> "MPD_2070", ...
    colnames(wide) <- sub(" (all)", "", colnames(wide), fixed = TRUE)
    # fail loudly if a metric/year column is missing or unexpected
    stopifnot(setequal(colnames(wide), expected_cols))
    wide[, expected_cols]
})

# responses and predictors in one table for the regressions
regDat <- merge(preds_long_response7_diff_avg_wide, diff_2050_2070, by = "region", all = TRUE)

save(regDat, file = "data/regDat_RCP45.Rdata")
#load("data/regDat_RCP45.Rdata")




