 
# --------------------------------------------------------------------------------------------------------------------------------
# functions

merge_all <- function(dflist, by.var) {
    Reduce(function(...) merge(..., by = by.var, all = TRUE), dflist)
}

"%!in%" <- Negate("%in%")


# --------------------------------------------------------------------------------------------------------------------------------
# options

options(stringsAsFactors = FALSE)


# --------------------------------------------------------------------------------------------------------------------------------
# packages

# install.packages("caret", dependencies = c("Imports", "Depends", "Suggests"))
packages <- c("caret", "e1071", "subselect", "ipred", "parallel", "doParallel", "corrplot", "choroplethr", "choroplethrMaps", "rowr") 
sapply(packages, require, character.only = TRUE)


# --------------------------------------------------------------------------------------------------------------------------------
# parallel processing

cluster <- makeCluster(detectCores() - 1) # convention to leave 1 core for OS
registerDoParallel(cluster)


# --------------------------------------------------------------------------------------------------------------------------------
# load data

project_dir <- "path-to-output-directory"
#data_dir <- "path-to-input-directory"
setwd(project_dir)

load("data/data_PP.Rdata")


# --------------------------------------------------------------------------------------------------------------------------------
# variables with zero minima (just a check)

zero_min_vars <- vector()
for (i in seq_along(responseColumns)) {
	if (min(combined_PP[, responseColumns[i]]) == 0) {
		zero_min_vars[i] <- responseColumns[i]
		} 
	else {
		NULL
		}
	}
	
na.omit((zero_min_vars[16:45]))


# --------------------------------------------------------------------------------------------------------------------------------
# tuning parameters

modelMethods <- c("cubist", "rf", "xgbLinear")
tuningParams <- lapply(modelMethods, modelLookup)
names(tuningParams) <- modelMethods
tuningParams


# --------------------------------------------------------------------------------------------------------------------------------
# models

# Cubist: a rule-based model that uses linear regression trees (cubist) 
# committees = 100, neighbors = 9,  nzv not removed

# -----------------------------------------------------------
# comparsion of all versus n+i

# load("data/cv_data_PP_NEW.Rdata")
# str(cv_data_PP, max.level = 1)

# -----------------------------------------------------------
# taxonomic diversity

set.seed(7)
cv_cubist_species <- train(species ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = cv_data_PP$species_PP$dataset_PP[, colnames(cv_data_PP$species_PP$dataset_PP) %!in% 
    c("fips", "region", "totalOntree", paste("totalOntree", c("n", "i", "e", "int"), sep = "."), responseColumns[responseColumns %!in% "species"])])

set.seed(7)
cv_cubist_genus <- train(genus ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = cv_data_PP$genus_PP$dataset_PP[, colnames(cv_data_PP$genus_PP$dataset_PP) %!in% 
    c("fips", "region", "totalOntree", paste("totalOntree", c("n", "i", "e", "int"), sep = "."), responseColumns[responseColumns %!in% "genus"])])

set.seed(7)
cv_cubist_family <- train(family ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = cv_data_PP$family_PP$dataset_PP[, colnames(cv_data_PP$family_PP$dataset_PP) %!in% 
    c("fips", "region", "totalOntree", paste("totalOntree", c("n", "i", "e", "int"), sep = "."), responseColumns[responseColumns %!in% "family"])])

set.seed(7)
cv_cubist_species.n <- train(species.n ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = cv_data_PP$species.n_PP$dataset_PP[, colnames(cv_data_PP$species.n_PP$dataset_PP) %!in% 
    c("fips", "region", "totalOntree", paste("totalOntree", c("n", "i", "e", "int"), sep = "."), responseColumns[responseColumns %!in% "species.n"])])

set.seed(7)
cv_cubist_genus.n <- train(genus.n ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = cv_data_PP$genus.n_PP$dataset_PP[, colnames(cv_data_PP$genus.n_PP$dataset_PP) %!in% 
    c("fips", "region", "totalOntree", paste("totalOntree", c("n", "i", "e", "int"), sep = "."), responseColumns[responseColumns %!in% "genus.n"])])

set.seed(7)
cv_cubist_family.n <- train(family.n ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = cv_data_PP$family.n_PP$dataset_PP[, colnames(cv_data_PP$family.n_PP$dataset_PP) %!in% 
    c("fips", "region", "totalOntree", paste("totalOntree", c("n", "i", "e", "int"), sep = "."), responseColumns[responseColumns %!in% "family.n"])])

set.seed(7)
cv_cubist_species.i <- train(species.i ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = cv_data_PP$species.i_PP$dataset_PP[, colnames(cv_data_PP$species.i_PP$dataset_PP) %!in% 
    c("fips", "region", "totalOntree", paste("totalOntree", c("n", "i", "e", "int"), sep = "."), responseColumns[responseColumns %!in% "species.i"])])

set.seed(7)
cv_cubist_genus.i <- train(genus.i ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = cv_data_PP$genus.i_PP$dataset_PP[, colnames(cv_data_PP$genus.i_PP$dataset_PP) %!in% 
    c("fips", "region", "totalOntree", paste("totalOntree", c("n", "i", "e", "int"), sep = "."), responseColumns[responseColumns %!in% "genus.i"])])

set.seed(7)
cv_cubist_family.i <- train(family.i ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = cv_data_PP$family.i_PP$dataset_PP[, colnames(cv_data_PP$family.i_PP$dataset_PP) %!in% 
    c("fips", "region", "totalOntree", paste("totalOntree", c("n", "i", "e", "int"), sep = "."), responseColumns[responseColumns %!in% "family.i"])])

# -----------------------------------------------------------
# alpha diversity (ALL)

set.seed(7)
cv_cubist_PD.ALL <- train(PD.ALL ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = cv_data_PP$PD.ALL_PP$dataset_PP[, colnames(cv_data_PP$PD.ALL_PP$dataset_PP) %!in% 
    c("fips", "region", paste("totalOntree", c("n", "i", "e", "int"), sep = "."), responseColumns[responseColumns %!in% "PD.ALL"])])

set.seed(7)
cv_cubist_MPD.ALL <- train(MPD.ALL ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = cv_data_PP$MPD.ALL_PP$dataset_PP[, colnames(cv_data_PP$MPD.ALL_PP$dataset_PP) %!in% 
    c("fips", "region", paste("totalOntree", c("n", "i", "e", "int"), sep = "."), responseColumns[responseColumns %!in% "MPD.ALL"])])

set.seed(7)
cv_cubist_PD.s.ALL <- train(PD.s.ALL ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = cv_data_PP$PD.s.ALL_PP$dataset_PP[, colnames(cv_data_PP$PD.s.ALL_PP$dataset_PP) %!in% 
    c("fips", "region", paste("totalOntree", c("n", "i", "e", "int"), sep = "."), responseColumns[responseColumns %!in% "PD.s.ALL"])])

set.seed(7)
cv_cubist_MPD.s.ALL <- train(MPD.s.ALL ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = cv_data_PP$MPD.s.ALL_PP$dataset_PP[, colnames(cv_data_PP$MPD.s.ALL_PP$dataset_PP) %!in% 
    c("fips", "region", paste("totalOntree", c("n", "i", "e", "int"), sep = "."), responseColumns[responseColumns %!in% "MPD.s.ALL"])])

        
# -----------------------------------------------------------
# alpha diversity (NAT)

set.seed(7)
cv_cubist_PD.NAT <- train(PD.NAT ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = cv_data_PP$PD.NAT_PP$dataset_PP[, colnames(cv_data_PP$PD.NAT_PP$dataset_PP) %!in% 
    c("fips", "region", "totalOntree", paste("totalOntree", c("i", "e", "int"), sep = "."), responseColumns[responseColumns %!in% "PD.NAT"])])

set.seed(7)
cv_cubist_MPD.NAT <- train(MPD.NAT ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = cv_data_PP$MPD.NAT_PP$dataset_PP[, colnames(cv_data_PP$MPD.NAT_PP$dataset_PP) %!in% 
    c("fips", "region", "totalOntree", paste("totalOntree", c("i", "e", "int"), sep = "."), responseColumns[responseColumns %!in% "MPD.NAT"])])

set.seed(7)
cv_cubist_PD.s.NAT <- train(PD.s.NAT ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = cv_data_PP$PD.s.NAT_PP$dataset_PP[, colnames(cv_data_PP$PD.s.NAT_PP$dataset_PP) %!in% 
    c("fips", "region", "totalOntree", paste("totalOntree", c("i", "e", "int"), sep = "."), responseColumns[responseColumns %!in% "PD.s.NAT"])])

set.seed(7)
cv_cubist_MPD.s.NAT <- train(MPD.s.NAT ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = cv_data_PP$MPD.s.NAT_PP$dataset_PP[, colnames(cv_data_PP$MPD.s.NAT_PP$dataset_PP) %!in% 
    c("fips", "region", "totalOntree", paste("totalOntree", c("i", "e", "int"), sep = "."), responseColumns[responseColumns %!in% "MPD.s.NAT"])])


# -----------------------------------------------------------
# alpha diversity (INTRO)

set.seed(7)
cv_cubist_PD.INTRO <- train(PD.INTRO ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = cv_data_PP$PD.INTRO_PP$dataset_PP[, colnames(cv_data_PP$PD.INTRO_PP$dataset_PP) %!in% 
    c("fips", "region", "totalOntree", paste("totalOntree", c("n", "i", "e"), sep = "."), responseColumns[responseColumns %!in% "PD.INTRO"])])

set.seed(7)
cv_cubist_MPD.INTRO <- train(MPD.INTRO ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = cv_data_PP$MPD.INTRO_PP$dataset_PP[, colnames(cv_data_PP$MPD.INTRO_PP$dataset_PP) %!in% 
    c("fips", "region", "totalOntree", paste("totalOntree", c("n", "i", "e"), sep = "."), responseColumns[responseColumns %!in% "MPD.INTRO"])])

set.seed(7)
cv_cubist_PD.s.INTRO <- train(PD.s.INTRO ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = cv_data_PP$PD.s.INTRO_PP$dataset_PP[, colnames(cv_data_PP$PD.s.INTRO_PP$dataset_PP) %!in% 
    c("fips", "region", "totalOntree", paste("totalOntree", c("n", "i", "e"), sep = "."), responseColumns[responseColumns %!in% "PD.s.INTRO"])])

set.seed(7)
cv_cubist_MPD.s.INTRO <- train(MPD.s.INTRO ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = cv_data_PP$MPD.s.INTRO_PP$dataset_PP[, colnames(cv_data_PP$MPD.s.INTRO_PP$dataset_PP) %!in% 
    c("fips", "region", "totalOntree", paste("totalOntree", c("n", "i", "e"), sep = "."), responseColumns[responseColumns %!in% "MPD.s.INTRO"])])

   
# -----------------------------------------------------------
# store and save models

cv_cubist_models <- list(
    "species" = cv_cubist_species,
    "genus" = cv_cubist_genus,
    "family" = cv_cubist_family, 
    "species.n" = cv_cubist_species.n,
    "genus.n" = cv_cubist_genus.n,
    "family.n" = cv_cubist_family.n,
    "species.i" = cv_cubist_species.i,
    "genus.i" = cv_cubist_genus.i,
    "family.i" = cv_cubist_family.i,
    #
    "PD.ALL" = cv_cubist_PD.ALL,
    "MPD.ALL" = cv_cubist_MPD.ALL,
    "PD.s.ALL" = cv_cubist_PD.s.ALL,
    "MPD.s.ALL" = cv_cubist_MPD.s.ALL,
    #
    "PD.NAT" = cv_cubist_PD.NAT,
    "MPD.NAT" = cv_cubist_MPD.NAT,
    "PD.s.NAT" = cv_cubist_PD.s.NAT,
    "MPD.s.NAT" = cv_cubist_MPD.s.NAT,
    #
    "PD.INTRO" = cv_cubist_PD.INTRO,
    "MPD.INTRO" = cv_cubist_MPD.INTRO,
    "PD.s.INTRO" = cv_cubist_PD.s.INTRO,
    "MPD.s.INTRO" = cv_cubist_MPD.s.INTRO
)


save(cv_cubist_models, file = "data/cv_cubist_trained_models.Rdata", compress = "gzip")


# --------------------------------------------------------------------------------------------------------------------------------
# summarize model output

# summarize accuracy of models
#results <- resamples(list(lm = cv_cubist_models$species, BstLm = fit.BstLm, pcr = fit.pcr, xgbLinear = fit.xgbLinear, pls = fit.pls)) 
#summary(results)
#dotplot(results)

# differences between models
#diffs <- diff(results)
#summary(diffs)
#dotplot(diffs)

# summarize best model
#print(cv_cubist_models$species)


# --------------------------------------------------------------------------------------------------------------------------------
# validation

load("data/cv_cubist_trained_models.Rdata")
str(cv_cubist_models, max.level = 1)


validate <- function(cv_trained_model, validation_data, response_var) {
    # estimate skill of best model on the validation dataset and calculate RMSE and R^2
    preds <- predict(cv_trained_model, newdata = validation_data) # listwise deletion for newdata
    rmse <- sqrt(mean((preds - validation_data[, response_var])^2)) 
    R2 <- cor(preds, validation_data[, response_var])^2
    list(preds = preds, rmse = rmse, R2 = R2)
}

val_species <- validate(cv_cubist_models$species, validation_data = cv_data_PP$species_PP$validation_PP, response_var = "species")
val_genus <- validate(cv_cubist_models$genus, validation_data = cv_data_PP$genus_PP$validation_PP, response_var = "genus")
val_family <- validate(cv_cubist_models$family, validation_data = cv_data_PP$family_PP$validation_PP, response_var = "family")
val_species.n <- validate(cv_cubist_models$species.n, validation_data = cv_data_PP$species.n_PP$validation_PP, response_var = "species.n")
val_genus.n <- validate(cv_cubist_models$genus.n, validation_data = cv_data_PP$genus.n_PP$validation_PP, response_var = "genus.n")
val_family.n <- validate(cv_cubist_models$family.n, validation_data = cv_data_PP$family.n_PP$validation_PP, response_var = "family.n")
val_species.i <- validate(cv_cubist_models$species.i, validation_data = cv_data_PP$species.i_PP$validation_PP, response_var = "species.i")
val_genus.i <- validate(cv_cubist_models$genus.i, validation_data = cv_data_PP$genus.i_PP$validation_PP, response_var = "genus.i")
val_family.i <- validate(cv_cubist_models$family.i, validation_data = cv_data_PP$family.i_PP$validation_PP, response_var = "family.i")
#
val_PD.ALL <- validate(cv_cubist_models$PD.ALL, validation_data = cv_data_PP$PD.ALL_PP$validation_PP, response_var = "PD.ALL")
val_MPD.ALL <- validate(cv_cubist_models$MPD.ALL, validation_data = cv_data_PP$MPD.ALL_PP$validation_PP, response_var = "MPD.ALL")
val_PD.s.ALL <- validate(cv_cubist_models$PD.s.ALL, validation_data = cv_data_PP$PD.s.ALL_PP$validation_PP, response_var = "PD.s.ALL")
val_MPD.s.ALL <- validate(cv_cubist_models$MPD.s.ALL, validation_data = cv_data_PP$MPD.s.ALL_PP$validation_PP, response_var = "MPD.s.ALL")
#
val_PD.NAT <- validate(cv_cubist_models$PD.NAT, validation_data = cv_data_PP$PD.NAT_PP$validation_PP, response_var = "PD.NAT")
val_MPD.NAT <- validate(cv_cubist_models$MPD.NAT, validation_data = cv_data_PP$MPD.NAT_PP$validation_PP, response_var = "MPD.NAT")
val_PD.s.NAT <- validate(cv_cubist_models$PD.s.NAT, validation_data = cv_data_PP$PD.s.NAT_PP$validation_PP, response_var = "PD.s.NAT")
val_MPD.s.NAT <- validate(cv_cubist_models$MPD.s.NAT, validation_data = cv_data_PP$MPD.s.NAT_PP$validation_PP, response_var = "MPD.s.NAT")
#
val_PD.INTRO <- validate(cv_cubist_models$PD.INTRO, validation_data = cv_data_PP$PD.INTRO_PP$validation_PP, response_var = "PD.INTRO")
val_MPD.INTRO <- validate(cv_cubist_models$MPD.INTRO, validation_data = cv_data_PP$MPD.INTRO_PP$validation_PP, response_var = "MPD.INTRO")
val_PD.s.INTRO <- validate(cv_cubist_models$PD.s.INTRO, validation_data = cv_data_PP$PD.s.INTRO_PP$validation_PP, response_var = "PD.s.INTRO")
val_MPD.s.INTRO <- validate(cv_cubist_models$MPD.s.INTRO, validation_data = cv_data_PP$MPD.s.INTRO_PP$validation_PP, response_var = "MPD.s.INTRO") 

val_estimates <- list(
    val_species, val_genus, val_family, val_species.n, val_genus.n, val_family.n, val_species.i, val_genus.i, val_family.i,
    val_PD.ALL, val_MPD.ALL, val_PD.s.ALL, val_MPD.s.ALL,
    val_PD.NAT, val_MPD.NAT, val_PD.s.NAT, val_MPD.s.NAT,
    val_PD.INTRO, val_MPD.INTRO, val_PD.s.INTRO, val_MPD.s.INTRO,
    )

val_df <- data.frame(
    response = c("species", "genus", "family", "species.n", "genus.n", "family.n", "species.i", "genus.i", "family.i", 
        "PD.ALL", "MPD.ALL", "PD.s.ALL", "MPD.s.ALL",
        "PD.NAT", "MPD.NAT", "PD.s.NAT", "MPD.s.NAT",
        "PD.INTRO", "MPD.INTRO", "PD.s.INTRO", "MPD.s.INTRO"
        ),
    # rmse = sapply(val_estimates, function(x) x$rmse),
    R2 = sapply(val_estimates, function(x) round(x$R2, digits = 2))
    )

write.csv(val_df, file = "data/validation_cubist_R2.csv", row.names = FALSE)

# plot observed versus predicted
pdf("figures/validation.pdf", height = 12, width = 21)
op <- par(mfrow = c(7, 10), mar = c(4, 4, 1, 1) + 0.1)
  plot(val_species$preds, cv_data_PP$species_PP$validation_PP$species)
  plot(val_genus$preds, cv_data_PP$genus_PP$validation_PP$genus) 
  plot(val_family$preds, cv_data_PP$family_PP$validation_PP$family)   
  plot(val_species.n$preds, cv_data_PP$species.n_PP$validation_PP$species.n)
  plot(val_genus.n$preds, cv_data_PP$genus.n_PP$validation_PP$genus.n) 
  plot(val_family.n$preds, cv_data_PP$family.n_PP$validation_PP$family.n)   
  plot(val_species.i$preds, cv_data_PP$species.i_PP$validation_PP$species.i)
  plot(val_genus.i$preds, cv_data_PP$genus.i_PP$validation_PP$genus.i) 
  plot(val_family.i$preds, cv_data_PP$family.i_PP$validation_PP$family.i)
  #
  plot(val_PD.ALL$preds, cv_data_PP$PD.ALL_PP$validation_PP$PD.ALL) 
  plot(val_MPD.ALL$preds, cv_data_PP$MPD.ALL_PP$validation_PP$MPD.ALL) 
  plot(val_PD.s.ALL$preds, cv_data_PP$PD.s.ALL_PP$validation_PP$PD.s.ALL) 
  plot(val_MPD.s.ALL$preds, cv_data_PP$MPD.s.ALL_PP$validation_PP$MPD.s.ALL) 
  #
  plot(val_PD.NAT$preds, cv_data_PP$PD.NAT_PP$validation_PP$PD.NAT) 
  plot(val_MPD.NAT$preds, cv_data_PP$MPD.NAT_PP$validation_PP$MPD.NAT)  
  plot(val_PD.s.NAT$preds, cv_data_PP$PD.s.NAT_PP$validation_PP$PD.s.NAT) 
  plot(val_MPD.s.NAT$preds, cv_data_PP$MPD.s.NAT_PP$validation_PP$MPD.s.NAT) 
  #
  plot(val_PD.INTRO$preds, cv_data_PP$PD.INTRO_PP$validation_PP$PD.INTRO) 
  plot(val_MPD.INTRO$preds, cv_data_PP$MPD.INTRO_PP$validation_PP$MPD.INTRO)  
  plot(val_PD.s.INTRO$preds, cv_data_PP$PD.s.INTRO_PP$validation_PP$PD.s.INTRO) 
  plot(val_MPD.s.INTRO$preds, cv_data_PP$MPD.s.INTRO_PP$validation_PP$MPD.s.INTRO) 
par(op)
dev.off()


# --------------------------------------------------------------------------------------------------------------------------------
# de-register parallel processing cluster

stopCluster(cluster)
registerDoSEQ()



