
# --------------------------------------------------------------------------------------------------------------------------------
# functions

merge_all <- function(dflist, by.var) {
    Reduce(function(...) merge(..., by = by.var, all = TRUE), dflist)
}

"%!in%" <- Negate("%in%")


# --------------------------------------------------------------------------------------------------------------------------------
# options

options(stringsAsFactors = FALSE)


# --------------------------------------------------------------------------------------------------------------------------------
# packages

# install.packages("caret", dependencies = c("Imports", "Depends", "Suggests"))
packages <- c("caret", "e1071", "xgboost", "subselect", "ipred", "parallel", 
                       "doParallel", "corrplot", "choroplethr", "choroplethrMaps", "rowr") 
sapply(packages, require, character.only = TRUE)


# --------------------------------------------------------------------------------------------------------------------------------
# parallel processing

cluster <- makeCluster(detectCores() - 3) # convention to leave 1 core for OS
registerDoParallel(cluster)


# --------------------------------------------------------------------------------------------------------------------------------
# load data

project_dir <- "path-to-output-directory"
#data_dir <- "path-to-input-directory"
setwd(project_dir)

load("data/data_PP.Rdata")


# --------------------------------------------------------------------------------------------------------------------------------
# final models fitted to complete data

# these take 10-15 mins each

# committees = 100, neighbors = 9, nzv not removed

# -----------------------------------------------------------
# taxonomic diversity

set.seed(7)
final_cubist_species <- train(species ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = combined_PP[, colnames(combined_PP) %!in% c("fips", "region", "totalOntree", paste("totalOntree", c("n", "i", "e", "int"), sep = "."),
        responseColumns[responseColumns %!in% "species"])])

set.seed(7)
final_cubist_genus <- train(genus ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = combined_PP[, colnames(combined_PP) %!in% c("fips", "region", "totalOntree", paste("totalOntree", c("n", "i", "e", "int"), sep = "."),
        responseColumns[responseColumns %!in% "genus"])])

set.seed(7)
final_cubist_family <- train(family ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = combined_PP[, colnames(combined_PP) %!in% c("fips", "region", "totalOntree", paste("totalOntree", c("n", "i", "e", "int"), sep = "."), 
        responseColumns[responseColumns %!in% "family"])])

set.seed(7)
final_cubist_species.n <- train(species.n ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = combined_PP[, colnames(combined_PP) %!in% c("fips", "region", "totalOntree", paste("totalOntree", c("n", "i", "e", "int"), sep = "."), 
        responseColumns[responseColumns %!in% "species.n"])])

set.seed(7)
final_cubist_genus.n <- train(genus.n ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = combined_PP[, colnames(combined_PP) %!in% c("fips", "region", "totalOntree", paste("totalOntree", c("n", "i", "e", "int"), sep = "."), 
        responseColumns[responseColumns %!in% "genus.n"])])

set.seed(7)
final_cubist_family.n <- train(family.n ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = combined_PP[, colnames(combined_PP) %!in% c("fips", "region", "totalOntree", paste("totalOntree", c("n", "i", "e", "int"), sep = "."), 
        responseColumns[responseColumns %!in% "family.n"])])

set.seed(7)
final_cubist_species.i <- train(species.i ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = combined_PP[, colnames(combined_PP) %!in% c("fips", "region", "totalOntree", paste("totalOntree", c("n", "i", "e", "int"), sep = "."),
        responseColumns[responseColumns %!in% "species.i"])])

set.seed(7)
final_cubist_genus.i <- train(genus.i ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = combined_PP[, colnames(combined_PP) %!in% c("fips", "region", "totalOntree", paste("totalOntree", c("n", "i", "e", "int"), sep = "."), 
        responseColumns[responseColumns %!in% "genus.i"])])

set.seed(7)
final_cubist_family.i <- train(family.i ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = combined_PP[, colnames(combined_PP) %!in% c("fips", "region", "totalOntree", paste("totalOntree", c("n", "i", "e", "int"), sep = "."), 
        responseColumns[responseColumns %!in% "family.i"])])

# -----------------------------------------------------------
# alpha diversity (ALL)

set.seed(7)
final_cubist_PD.ALL <- train(PD.ALL ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = combined_PP[, colnames(combined_PP) %!in% c("fips", "region", paste("totalOntree", c("n", "i", "e", "int"), sep = "."), 
        responseColumns[responseColumns %!in% "PD.ALL"])])

set.seed(7)
final_cubist_MPD.ALL <- train(MPD.ALL ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = combined_PP[, colnames(combined_PP) %!in% c("fips", "region", paste("totalOntree", c("n", "i", "e", "int"), sep = "."), 
        responseColumns[responseColumns %!in% "MPD.ALL"])])

set.seed(7)
final_cubist_PD.s.ALL <- train(PD.s.ALL ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = combined_PP[, colnames(combined_PP) %!in% c("fips", "region", paste("totalOntree", c("n", "i", "e", "int"), sep = "."),
        responseColumns[responseColumns %!in% "PD.s.ALL"])])

set.seed(7)
final_cubist_MPD.s.ALL <- train(MPD.s.ALL ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = combined_PP[, colnames(combined_PP) %!in% c("fips", "region", paste("totalOntree", c("n", "i", "e", "int"), sep = "."),
        responseColumns[responseColumns %!in% "MPD.s.ALL"])])


# -----------------------------------------------------------
# alpha diversity (NAT)

set.seed(7)
final_cubist_PD.NAT <- train(PD.NAT ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = combined_PP[, colnames(combined_PP) %!in% c("fips", "region", "totalOntree", paste("totalOntree", c("i", "e", "int"), sep = "."), 
        responseColumns[responseColumns %!in% "PD.NAT"])])

set.seed(7)
final_cubist_MPD.NAT <- train(MPD.NAT ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = combined_PP[, colnames(combined_PP) %!in% c("fips", "region", "totalOntree", paste("totalOntree", c("i", "e", "int"), sep = "."), 
        responseColumns[responseColumns %!in% "MPD.NAT"])])

set.seed(7)
final_cubist_PD.s.NAT <- train(PD.s.NAT ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = combined_PP[, colnames(combined_PP) %!in% c("fips", "region", "totalOntree", paste("totalOntree", c("i", "e", "int"), sep = "."),
        responseColumns[responseColumns %!in% "PD.s.NAT"])])

set.seed(7)
final_cubist_MPD.s.NAT <- train(MPD.s.NAT ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = combined_PP[, colnames(combined_PP) %!in% c("fips", "region", "totalOntree", paste("totalOntree", c("i", "e", "int"), sep = "."),
        responseColumns[responseColumns %!in% "MPD.s.NAT"])])


# -----------------------------------------------------------
# alpha diversity (INTRO)

set.seed(7)
final_cubist_PD.INTRO <- train(PD.INTRO ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = combined_PP[, colnames(combined_PP) %!in% c("fips", "region", "totalOntree", paste("totalOntree", c("n", "i", "e"), sep = "."), 
        responseColumns[responseColumns %!in% "PD.INTRO"])])

set.seed(7)
final_cubist_MPD.INTRO <- train(MPD.INTRO ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = combined_PP[, colnames(combined_PP) %!in% c("fips", "region", "totalOntree", paste("totalOntree", c("n", "i", "e"), sep = "."), 
        responseColumns[responseColumns %!in% "MPD.INTRO"])])

set.seed(7)
final_cubist_PD.s.INTRO <- train(PD.s.INTRO ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = combined_PP[, colnames(combined_PP) %!in% c("fips", "region", "totalOntree", paste("totalOntree", c("n", "i", "e"), sep = "."),
        responseColumns[responseColumns %!in% "PD.s.INTRO"])])

set.seed(7)
final_cubist_MPD.s.INTRO <- train(MPD.s.INTRO ~ . , method = "cubist", metric = metric, trControl = control, na.action = "na.exclude", 
    tuneGrid = data.frame(committees = 100, neighbors = 9),
    data = combined_PP[, colnames(combined_PP) %!in% c("fips", "region", "totalOntree", paste("totalOntree", c("n", "i", "e"), sep = "."),
        responseColumns[responseColumns %!in% "MPD.s.INTRO"])])

                        
# -----------------------------------------------------------
# store and save models

final_cubist_models <- list(
    "species" = final_cubist_species,
    "genus" = final_cubist_genus,
    "family" = final_cubist_family, 
    "species.n" = final_cubist_species.n,
    "genus.n" = final_cubist_genus.n,
    "family.n" = final_cubist_family.n,
    "species.i" = final_cubist_species.i,
    "genus.i" = final_cubist_genus.i,
    "family.i" = final_cubist_family.i,
    #
    "PD.ALL" = final_cubist_PD.ALL,
    "MPD.ALL" = final_cubist_MPD.ALL,
    "PD.s.ALL" = final_cubist_PD.s.ALL,
    "MPD.s.ALL" = final_cubist_MPD.s.ALL,
    #
    "PD.NAT" = final_cubist_PD.NAT,
    "MPD.NAT" = final_cubist_MPD.NAT,
    "PD.s.NAT" = final_cubist_PD.s.NAT,
    "MPD.s.NAT" = final_cubist_MPD.s.NAT,
    #
    "PD.INTRO" = final_cubist_PD.INTRO,
    "MPD.INTRO" = final_cubist_MPD.INTRO,
    "PD.s.INTRO" = final_cubist_PD.s.INTRO,
    "MPD.s.INTRO" = final_cubist_MPD.s.INTRO
)

save(final_cubist_models, file = "data/final_cubist_trained_models.Rdata", compress = "gzip")


# --------------------------------------------------------------------------------------------------------------------------------
# variable importance in the complete dataset
    
final_varImp <- lapply(final_cubist_models, varImp)
final_varImp_all <- lapply(final_varImp, function(x) data.frame(variables = rownames(x$importance), importance = round(x$importance$Overall, digits = 1)))

# overall importance over all the models
final_varImp_all_merged <- merge_all(final_varImp_all, by = "variables")
colnames(final_varImp_all_merged)[-1] <- paste(names(final_cubist_models), gsub("(\\w+)\\..+", "\\1", colnames(final_varImp_all_merged)[-1]), sep = "_")
summed_importance <- rowSums(final_varImp_all_merged[, -1], na.rm = TRUE)
mean_importance <- round(rowMeans(final_varImp_all_merged[, -1], na.rm = TRUE), digits = 1)
final_varImp_all_merged <- data.frame(final_varImp_all_merged, summed_importance, mean_importance)
final_varImp_all_merged <- final_varImp_all_merged[order(final_varImp_all_merged$mean_importance, decreasing = TRUE), ]
write.csv(final_varImp_all_merged, file = "data/final_cubist_varImp_merged.csv", row.names = FALSE)

# overall importance for each model
for (i in seq_along(final_varImp_all)) {
    colnames(final_varImp_all[[i]]) <- paste(names(final_cubist_models)[i], colnames(final_varImp_all[[i]]), sep = "_")
}
final_varImp_df <- do.call(cbind.fill, final_varImp_all)
final_varImp_df[nrow(final_varImp_df), 1:18] <- NA
write.csv(final_varImp_df, file = "data/final_cubist_varImp.csv", row.names = FALSE)

# The Cubist output contains variable usage statistics. It gives the percentage of times where each variable was used in a condition and/or a linear model. Note that this output will probably be inconsistent with the rules shown in the output from summary.cubist. At each split of the tree, Cubist saves a linear model (after feature selection) that is allowed to have terms for each variable used in the current split or any split above it. Quinlan (1992) discusses a smoothing algorithm where each model prediction is a linear combination of the parent and child model along the tree. As such, the final prediction is a function of all the linear models from the initial node to the terminal node. The percentages shown in the Cubist output reflects all the models involved in prediction (as opposed to the terminal models shown in the output). The variable importance used here is a linear combination of the usage in the rule conditions and the model.


# --------------------------------------------------------------------------------------------------------------------------------
# de-register parallel processing cluster

stopCluster(cluster)
registerDoSEQ()


