
# --------------------------------------------------------------------------------------------------------------------------------
# functions

merge_all <- function(dflist, by.var) {
    Reduce(function(...) merge(..., by = by.var, all = TRUE), dflist)
}

"%!in%" <- Negate("%in%")


# --------------------------------------------------------------------------------------------------------------------------------
# options

options(stringsAsFactors = FALSE)


# --------------------------------------------------------------------------------------------------------------------------------
# packages

# install.packages("caret", dependencies = c("Imports", "Depends", "Suggests"))
packages <- c("caret", "e1071", "xgboost", "subselect", "ipred", "parallel", "doParallel", "corrplot", "choroplethr", "choroplethrMaps") 
sapply(packages, require, character.only = TRUE)


# --------------------------------------------------------------------------------------------------------------------------------
# parallel processing

cluster <- makeCluster(detectCores() - 3) # convention to leave 1 core for OS
registerDoParallel(cluster)


# --------------------------------------------------------------------------------------------------------------------------------
# load training data and trained model data

project_dir <- "path-to-output-directory"
#data_dir <- "path-to-input-directory"
setwd(project_dir)

load("data/data_PP.Rdata")
load("data/final_cubist_trained_models.Rdata")


# --------------------------------------------------------------------------------------------------------------------------------
# load manipulated new data

pred_data_path <- "path-to-RCP-scenario-data-directory"
filenames <- list.files(path = pred_data_path, pattern = "ac|gd|gs|he", full.names = TRUE)
files_pred <- lapply(filenames, read.csv)
names(files_pred) <- gsub("^(\\w+)\\..+", "\\1", basename(filenames))


# --------------------------------------------------------------------------------------------------------------------------------
# clean manipulated new data

# omit observations not in trained model
files_pred <- lapply(files_pred, function(x) { x <- x[x$FIPS %in% combined_PP$fips, ] ; x })

# omit variables not in trained model
pred_variables <- colnames(combined_PP)[colnames(combined_PP) %!in% c(responseColumns, "fips", "region")]
files_pred <- lapply(files_pred, function(x) { x <- x[, colnames(x) %in% pred_variables] ; x })

# convert state names to abbreviations
files_pred <- lapply(files_pred, function(x) { x$state <- state.abb[match(x$state, state.name)] ; x$state[is.na(x$state)] <- "DC" ; x })

# impute missing data
files_pred <- lapply(files_pred, function(x) { x <- data.frame(state = x$state, predict(preProcess(x[, colnames(x) %!in% "state"], method = "bagImpute"), newdata = x[, colnames(x) %!in% "state"])) ; x })
      
# include extra variable "totalOntree" for diversity metrics
files_pred <- lapply(files_pred, function(x) { 
	x <- data.frame(x, totalOntree = combined_PP$totalOntree,
	                   totalOntree.n = combined_PP$totalOntree.n,
	                   totalOntree.i = combined_PP$totalOntree.i,
	                   totalOntree.e = combined_PP$totalOntree.e,
	                   totalOntree.int = combined_PP$totalOntree.int)
	x 
	})

# include interaction term for lat/long
files_pred <- lapply(files_pred, function(x) { x$longitude <- x$longitude - mean(x$longitude, na.rm = TRUE); x } )
files_pred <- lapply(files_pred, function(x) { x$latitude <- x$latitude - mean(x$latitude, na.rm = TRUE); x } )
files_pred <- lapply(files_pred, function(x) { x$long_x_lat <- x$longitude * x$latitude; x } )

# save
save(files_pred, file = "data/files_pred.Rdata", compress = "gzip")


# --------------------------------------------------------------------------------------------------------------------------------
# generate new predictions from the complete data

# load cleaned scenario data
load("data/files_pred.Rdata")

# pad out predictions with NAs to length of original data (3066)
pad.NA <- function(preds, present = which(!is.na(combined_PP$totalOntree.e)), missing = which(is.na(combined_PP$totalOntree.e))) {
    length(preds) <- length(c(present, missing))
    df <- data.frame(preds)
    rownames(df) <- c(present, missing)
    df <- df[order(as.numeric(row.names(df))), ]
    return(df)
}


final_cubist_predictions <- combined_PP[, c("region", responseColumns)]


for (i in names(final_cubist_models)) {
    print(i)
    
        for (j in names(files_pred)) {
            print(j)
            
            if (i %in% grep("species|genus|family", names(final_cubist_models), value = TRUE)) { # no totalOntree
                
                final_cubist_predictions[, paste(i, j, sep = "_")] <- predict(final_cubist_models[[i]], 
                    newdata = files_pred[[j]][, colnames(files_pred[[j]]) %!in% c("totalOntree", paste("totalOntree", c("n", "i", "e", "int"), sep = "."))],
                    na.action = "na.exclude")
                    
                } else
            if (i %in% c("PD.ALL",  "MPD.ALL", "PD.s.ALL", "MPD.s.ALL")) { # blank suffix
            	
            	final_cubist_predictions[, paste(i, j, sep = "_")] <- predict(final_cubist_models[[i]], 
                    newdata = files_pred[[j]][, colnames(files_pred[[j]]) %!in% paste("totalOntree", c("n", "i", "e", "int"), sep = ".")],
                    na.action = "na.exclude")
            	
            	} else
            if (i %in% c("PD.NAT",  "MPD.NAT", "PD.s.NAT", "MPD.s.NAT")) { # n 
            	
            	final_cubist_predictions[, paste(i, j, sep = "_")] <- predict(final_cubist_models[[i]], 
                    newdata = files_pred[[j]][, colnames(files_pred[[j]]) %!in% c("totalOntree", paste("totalOntree", c("i", "e", "int"), sep = "."))],
                    na.action = "na.exclude")
            	
            	} else
            if (i %in% c("PD.INTRO",  "MPD.INTRO", "PD.s.INTRO", "MPD.s.INTRO")) { # int
            	
            	final_cubist_predictions[, paste(i, j, sep = "_")] <- predict(final_cubist_models[[i]], 
                    newdata = files_pred[[j]][, colnames(files_pred[[j]]) %!in% c("totalOntree", paste("totalOntree", c("n", "i", "e"), sep = "."))],
                    na.action = "na.exclude")
            	
            	}	
            	
        } # j loop end
} # i loop end


# --------------------------------------------------------------------------------------------------------------------------------
# save output

# add area variable
final_cubist_predictions <- data.frame(area = combined_PP$area, final_cubist_predictions)


save(final_cubist_predictions, file = "data/final_cubist_predictions.Rdata", compress = "gzip") 

# exponentiate the logged responses and predictions
scenarios <- c("ac4550", "ac4570", "gd4550", "gd4570", "gs4550", "gs4570", "he2650", "he2670", "he4550", "he4570", "he6050", "he6070", "he8550", "he8570")
all_logged_columns <- c(logged_variables, apply(expand.grid(logged_variables, scenarios), 1, paste, collapse = "_"))
#grep_variables <- paste(all_logged_columns, collapse = "|")
#logged_index <- grepl(grep_variables, colnames(final_cubist_predictions))
final_cubist_predictions_exp <- final_cubist_predictions
final_cubist_predictions_exp[, all_logged_columns] <- lapply(final_cubist_predictions[, all_logged_columns], exp)


save(final_cubist_predictions_exp, file = "data/final_cubist_predictions_exp.Rdata", compress = "gzip")

write.csv(final_cubist_predictions_exp, file = "data/final_cubist_predictions_exp.csv", row.names = FALSE)


# --------------------------------------------------------------------------------------------------------------------------------
# de-register parallel processing cluster

stopCluster(cluster)
registerDoSEQ()


