
# compare models using cross validation

# --------------------------------------------------------------------------------------------------------------------------------
# functions

# Outer-join every data frame in `dflist` on the key column(s) `by.var`.
#
# dflist: list of data frames, folded pairwise from left to right.
# by.var: column name(s) handed to merge() as `by`.
# Returns the accumulated merge(..., all = TRUE) result, so keys that occur in
# only some of the inputs are kept.
merge_all <- function(dflist, by.var) {
    full_join_pair <- function(acc, nxt) {
        merge(acc, nxt, by = by.var, all = TRUE)
    }
    Reduce(full_join_pair, dflist)
}

# Infix complement of %in%: TRUE for each element of the left operand that does
# not occur in the right operand.
"%!in%" <- function(x, table) !(x %in% table)

# --------------------------------------------------------------------------------------------------------------------------------
# options

# Keep character columns as character when data frames are built (already the
# default from R 4.0 onwards; set explicitly for older versions).
options(stringsAsFactors = FALSE)

# --------------------------------------------------------------------------------------------------------------------------------
# packages

# install.packages("caret", dependencies = c("Imports", "Depends", "Suggests"))
packages <- c("caret", "e1071", "subselect", "ipred", "parallel", "doParallel", "corrplot", "rowr", "ggplot2")

# Attach every package and stop immediately if any of them is unavailable.
# require() only returns FALSE for a missing package, which would let the
# script carry on and fail much later, part-way through the model fits.
loaded <- vapply(packages, require, logical(1), character.only = TRUE)
if (!all(loaded)) {
    stop("required package(s) not available: ",
         paste(packages[!loaded], collapse = ", "), call. = FALSE)
}


# --------------------------------------------------------------------------------------------------------------------------------
# load data

project_dir <- "path-to-output-directory"
data_dir <- "path-to-input-directory"

# load pre-processed data
load(file.path(data_dir, "data_PP.Rdata"))

# load() restores objects into the workspace by name. The rest of this script
# reads `cv_data_PP` and `responseColumns`, so confirm they are available now
# instead of failing with "object not found" further down.
if (!exists("cv_data_PP") || !exists("responseColumns")) {
    stop("expected objects 'cv_data_PP' and 'responseColumns' are not available after loading data_PP.Rdata",
         call. = FALSE)
}


# --------------------------------------------------------------------------------------------------------------------------------
# tuning parameters

# https://machinelearningmastery.com/tune-machine-learning-algorithms-in-r/

# Candidate caret methods, and the tuning parameters modelLookup() reports for
# each one. The list is keyed by method name and displayed for reference.
modelMethods <- c(
    "cubist", "rf", "xgbLinear", "rqnc", "gamSpline", "penalized", "BstLm",
    "simpls", "widekernelpls", "glmnet", "gaussprPoly", "pcr", "lm"
)
tuningParams <- setNames(lapply(modelMethods, modelLookup), modelMethods)
tuningParams

# --------------------------------------------------------------------------------------------------------------------------------
# Shared settings for every train() call below: models are scored on RMSE using
# repeated cross-validation (number = 10, repeats = 10), with parallel
# execution permitted.
metric <- "RMSE"
control <- trainControl(
    method = "repeatedcv",
    number = 10,
    repeats = 10,
    allowParallel = TRUE
)

# --------------------------------------------------------------------------------------------------------------------------------
# data for family response
dat_family <- local({
    # Columns removed from the modelling table: "fips", "region", the four
    # totalOntree.* columns, and every response column other than "family".
    pp_table <- cv_data_PP$family_PP$dataset_PP
    other_responses <- responseColumns[responseColumns %!in% "family"]
    dropped <- c(
        "fips", "region",
        paste("totalOntree", c("n", "i", "e", "int"), sep = "."),
        other_responses
    )
    pp_table[, colnames(pp_table) %!in% dropped]
})
   
# --------------------------------------------------------------------------------------------------------------------------------
# parallel processing
# Leave one core for the OS, but never ask for fewer than one worker:
# detectCores() may return NA or 1, and makeCluster() needs a positive count.
cluster <- makeCluster(max(1L, detectCores() - 1L, na.rm = TRUE))
registerDoParallel(cluster)

# -------------------------------------------------------------------------------------------------------------------------------- 

# Fit each candidate method on `dat_family` with the shared `metric` and
# `control` settings. The seed is reset to the same value before every train()
# call (presumably so that all methods are resampled on the same folds --
# confirm against the caret documentation).
#
# The fits run inside tryCatch() so that `finally` always shuts the workers
# down and restores the sequential backend, even when one of the fits fails.
# The braced expression is evaluated in the calling environment, so the
# cv_*_family objects are still created at top level.
tryCatch({
    cat("starting model fitting", "\n")

    set.seed(7)
    cv_cubist_family <- train(family ~ . , method = "cubist", metric = metric, trControl = control,
        na.action = "na.exclude",
        tuneGrid = data.frame(committees = 100, neighbors = 9),
        data = dat_family)

    set.seed(7)
    # Number of predictors = every column except the response. Counting the
    # names (rather than ncol() of a subset) also works when only one predictor
    # is left, where the subset would collapse to a vector.
    mtry <- floor(sqrt(sum(!(colnames(dat_family) %in% "family"))))
    tunegrid <- expand.grid(.mtry = mtry) # grid or random search?
    cv_rf_family <- train(family ~ . , method = "rf", metric = metric, trControl = control,
        na.action = "na.exclude",
        tuneGrid = tunegrid,
        data = dat_family)

    set.seed(7)
    cv_xgbLinear_family <- train(family ~ . , method = "xgbLinear", metric = metric, trControl = control,
        na.action = "na.exclude",
        data = dat_family)

    set.seed(7)
    cv_rqnc_family <- train(family ~ . , method = "rqnc", metric = metric, trControl = control,
        na.action = "na.exclude",
        data = dat_family)

    set.seed(7)
    cv_gamSpline_family <- train(family ~ . , method = "gamSpline", metric = metric, trControl = control,
        na.action = "na.exclude",
        data = dat_family)

    set.seed(7)
    cv_penalized_family <- train(family ~ . , method = "penalized", metric = metric, trControl = control,
        na.action = "na.exclude",
        tuneGrid = data.frame(lambda1 = 0, lambda2 = 1),
        data = dat_family)

    set.seed(7)
    cv_BstLm_family <- train(family ~ . , method = "BstLm", metric = metric, trControl = control,
        na.action = "na.exclude",
        data = dat_family)

    set.seed(7)
    cv_simpls_family <- train(family ~ . , method = "simpls", metric = metric, trControl = control,
        na.action = "na.exclude",
        data = dat_family)

    set.seed(7)
    cv_widekernelpls_family <- train(family ~ . , method = "widekernelpls", metric = metric, trControl = control,
        na.action = "na.exclude",
        data = dat_family)

    set.seed(7)
    cv_glmnet_family <- train(family ~ . , method = "glmnet", metric = metric, trControl = control,
        na.action = "na.exclude",
        data = dat_family)

    set.seed(7)
    cv_gaussprPoly_family <- train(family ~ . , method = "gaussprPoly", metric = metric, trControl = control,
        na.action = "na.exclude",
        data = dat_family)

    set.seed(7)
    cv_pcr_family <- train(family ~ . , method = "pcr", metric = metric, trControl = control,
        na.action = "na.exclude",
        data = dat_family)

    set.seed(7)
    cv_lm_family <- train(family ~ . , method = "lm", metric = metric, trControl = control,
        na.action = "na.exclude",
        data = dat_family)

    cat("finished model fitting", "\n")
}, finally = {
    # de-register parallel processing cluster
    stopCluster(cluster)
    registerDoSEQ()
})


# --------------------------------------------------------------------------------------------------------------------------------
# save output

# Collect the fitted train objects in one list keyed by caret method name.
cv_models_comp <- list(
    cubist = cv_cubist_family,
    rf = cv_rf_family,
    xgbLinear = cv_xgbLinear_family,
    rqnc = cv_rqnc_family,
    gamSpline = cv_gamSpline_family,
    penalized = cv_penalized_family,
    BstLm = cv_BstLm_family,
    simpls = cv_simpls_family,
    widekernelpls = cv_widekernelpls_family,
    glmnet = cv_glmnet_family,
    gaussprPoly = cv_gaussprPoly_family,
    pcr = cv_pcr_family,
    lm = cv_lm_family
)

# Create the output directory when it does not exist yet, so that save() cannot
# fail on a missing path after all the models have been fitted. An existing
# directory is left untouched.
dir.create(project_dir, showWarnings = FALSE, recursive = TRUE)
save(cv_models_comp, file = file.path(project_dir, "cv_models_comp_family.Rdata"), compress = "gzip")

    

