
# --------------------------------------------------------------------------------------------------------------------------------
# functions

# Outer-join every data frame in `dflist` on the key column(s) `by.var`.
#
# dflist: list of data frames, joined pairwise from left to right.
# by.var: name(s) of the key column(s) passed to merge() as `by`.
# Returns one data frame; rows without a match on either side are kept (all = TRUE).
merge_all <- function(dflist, by.var) {
    outer_join <- function(left, right) {
        merge(left, right, by = by.var, all = TRUE)
    }
    Reduce(outer_join, dflist)
}

# Negated %in%: TRUE for each element of the left operand that does not occur in the right operand.
"%!in%" <- function(x, table) !(x %in% table)

# Summarise the percentage of missing values in each column of `dat`.
#
# dat: a data frame.
# Returns a data frame with one row per column of `dat`:
#   variable - column name (always character, independent of the stringsAsFactors option)
#   missing  - percentage of NA values, rounded to one decimal place
# Rows are sorted by decreasing `missing`; row names are reset.
# A data frame without columns yields a zero-row result that still has both columns.
missing.cases <- function(dat) {
    # vapply() guarantees a numeric vector even for zero columns, where sapply() returns a list
    missingCases <- vapply(
        dat,
        function(x) round((sum(is.na(x)) / length(x)) * 100, digits = 1),
        numeric(1)
    )
    missingCases_df <- data.frame(
        variable = names(dat),
        missing = unname(missingCases),
        stringsAsFactors = FALSE
    )
    missingCases_df <- missingCases_df[order(missingCases_df$missing, decreasing = TRUE), ]
    rownames(missingCases_df) <- NULL
    missingCases_df
}


# --------------------------------------------------------------------------------------------------------------------------------
# options

# keep character columns as character vectors instead of factors when data frames are
# created or read (this is already the default from R 4.0.0 onwards)
options(stringsAsFactors = FALSE)


# --------------------------------------------------------------------------------------------------------------------------------
# packages

# install.packages("caret", dependencies = c("Imports", "Depends", "Suggests"))
# packages attached by this script, in the order listed
packages <- c("caret", "e1071", "xgboost", "subselect", "ipred", "parallel", "doParallel", "corrplot")
# library() stops with an error as soon as a package is missing, whereas require() only
# returns FALSE and lets the script fail later with a less obvious message
invisible(lapply(packages, library, character.only = TRUE))

# --------------------------------------------------------------------------------------------------------------------------------
# parallel processing

# leave 3 cores free for the OS and other work, but always start at least one worker:
# detectCores() can return NA, and on machines with 3 or fewer cores the subtraction
# would otherwise request a non-positive number of workers
cluster <- makeCluster(max(1L, detectCores() - 3L, na.rm = TRUE))
# make the cluster the backend used by foreach/%dopar%
registerDoParallel(cluster)


# --------------------------------------------------------------------------------------------------------------------------------
# load all data files

# NOTE: both values look like placeholders - replace them with real paths before running
# project_dir: root for output; the pre-processed .Rdata file is saved under its "data" subfolder
project_dir <- "path-to-output-directory"
# data_dir: root folder that contains the input CSV files read below
data_dir <- "path-to-input-directory"

# NOTE: Oglala Lakota has its new (as of 2015) FIPS code of X46102, while Ogalala Lakota (misspelled) has FIPS code of Shannon X46113, therefore remove FIPS of X46102

# ----------------------------------------------------------------------------
# county data
countyDat <- read.csv(file.path(data_dir, "County Data/Master County Dataset.csv"))
# countyDat <- countyDat[countyDat$FIPS %!in% "X46102", ] # remove Oglala County 
# copy the key into a lower-case `fips` column, the name used to merge all tables later on
countyDat$fips <- countyDat$FIPS

# ----------------------------------------------------------------------------
# taxonomic data
# presumably one row per county with a `fips` key, since the table is merged on `fips` later - TODO confirm
taxonDat <- read.csv(file.path(data_dir, "Diversity Data/taxonomic/county_richness0501.csv"))

# ----------------------------------------------------------------------------
# alpha diversity data (ALL)
alphaDat_ALL <- local({
    alpha <- read.csv(file.path(data_dir, "Diversity Data/alpha/all_species/ALL_county_diversity.ML.csv"))

    # a zero `ontree` count is set to NA, so the ratio ontree / total is NA for those rows
    alpha[, "ontree"] <- replace(alpha[, "ontree"], alpha[, "ontree"] == 0, NA)
    alpha$totalOntree <- alpha[, "ontree"] / alpha[, "total"]

    # drop the raw counts and the `.p` columns
    alpha <- alpha[, colnames(alpha) %!in% c("total", "ontree", "PD.p", "MPD.p", "MNTD.p")]

    # append ".ALL" to every column whose name contains PD, MPD or MNTD
    is_metric <- grepl("PD|MPD|MNTD", colnames(alpha))
    colnames(alpha)[is_metric] <- paste(colnames(alpha)[is_metric], "ALL", sep = ".")

    alpha
})

# ----------------------------------------------------------------------------
# alpha diversity data (NAT)
alphaDat_NAT <- local({
    alpha <- read.csv(file.path(data_dir, "Diversity Data/alpha/native_species/NAT.NAT_county.diversity.ML.csv"))

    # drop the raw counts and the `.p` columns
    alpha <- alpha[, colnames(alpha) %!in% c("total.n", "ontree.n", "PD.p", "MPD.p", "MNTD.p")]

    # append ".NAT" to every column whose name contains PD, MPD or MNTD
    is_metric <- grepl("PD|MPD|MNTD", colnames(alpha))
    colnames(alpha)[is_metric] <- paste(colnames(alpha)[is_metric], "NAT", sep = ".")

    alpha
})

# ----------------------------------------------------------------------------
# alpha diversity data (INTRO)
alphaDat_INTRO <- local({
    alpha <- read.csv(file.path(data_dir, "Diversity Data/alpha/nonnative_species/INTRO.INTRO_county.diversity.ML.csv"))

    # a zero `ontree.i` count is set to NA, so the ratio ontree.i / total.i is NA for those rows
    alpha[, "ontree.i"] <- replace(alpha[, "ontree.i"], alpha[, "ontree.i"] == 0, NA)
    alpha$totalOntree.int <- alpha[, "ontree.i"] / alpha[, "total.i"]

    # drop the raw counts and the `.p` columns
    alpha <- alpha[, colnames(alpha) %!in% c("total.i", "ontree.i", "PD.p", "MPD.p", "MNTD.p")]

    # append ".INTRO" to every column whose name contains PD, MPD or MNTD
    is_metric <- grepl("PD|MPD|MNTD", colnames(alpha))
    colnames(alpha)[is_metric] <- paste(colnames(alpha)[is_metric], "INTRO", sep = ".")

    alpha
})

# ----------------------------------------------------------------------------
# merge
# outer-join the five tables on the county key; the argument is spelled out as `by.var`
# (the formal name in merge_all) instead of relying on partial matching of `by`
dat <- merge_all(list(countyDat, taxonDat, alphaDat_ALL, alphaDat_NAT, alphaDat_INTRO), by.var = "fips")


# --------------------------------------------------------------------------------------------------------------------------------
# basic cleaning

dat <- dat[!(dat$fips %in% "X11031"), ] # exclude Montgomery county in DC, as it doesn't exist

# `state.y` is presumably the suffixed copy merge() produced for a `state` column present in
# more than one table - TODO confirm; it is copied to a plain `state` column here
dat$state <- dat$state.y # keep state variable name consistent

# numeric version of the FIPS code: the regex drops the single word character in front of the
# digits (e.g. "X11031" becomes "11031") and the remainder is converted to a number
dat$region <- as.numeric(gsub("\\w(\\d+)", "\\1", dat$fips))

# str(dat)
# write.csv(dat, file = file.path(project_dir, "raw_data.csv"), row.names = FALSE)


# --------------------------------------------------------------------------------------------------------------------------------
# missing data

# missing.cases(dat)
# write.csv(missing.cases(dat), file = file.path(project_dir, "missing_observations.csv"), row.names = FALSE)


# --------------------------------------------------------------------------------------------------------------------------------
# exclude irrelevant variables

# columns removed by exact name
excludeCols <- c("FIPS", "state.x", "county.x", "fips_state", "fips_county", "state_county", 
    "pop2010", "popdensity", "usda.prct.reserved", "noaa.coast", "state.y", "county.y", "sc")
datExcluded <- dat[, !(colnames(dat) %in% excludeCols)]

# exclude all variance variables and sample size variables
# NOTE(review): the pattern is a regular expression, so every "." matches any single character
# rather than a literal dot (".ln" also matches e.g. "_ln") - confirm no wanted column is dropped
datExcluded <- datExcluded[, !grepl(".vr|.ln|.md|aridity.|gdd5.|petmn.|usda.|_caco3", colnames(datExcluded))] # _caco3 is linearly dependent with _ph

# create factors
facVars <- c("state", "fips")
datExcluded[, facVars] <- lapply(datExcluded[, facVars], factor)

# exclude missing cases for response variables (all responses have same pattern of missingness)   ################
# despite its name, responseMissing is TRUE for the rows where `species` is NOT missing (rows kept)
responseMissing <- complete.cases(dat$species) 
datNA <- datExcluded[responseMissing, ] # 44 counties excluded
# drop factor levels that no longer occur after the row filter
datNA <- droplevels(datNA)

# --------------------------------------------------------------------------------------------------------------------------------
# outliers

# gather response variable names
# richness responses at species, genus and family level, without suffix and with the
# ".n" and ".i" suffixes
taxonResponses <- c(
    "species", "genus", "family",
    "species.n", "genus.n", "family.n",
    "species.i", "genus.i", "family.i"
)
alphaMetrics <- c("PD", "MPD", "PD.s", "MPD.s")
alphaSuffix <- c("ALL", "NAT", "INTRO")
# every metric/suffix combination, metrics varying fastest: PD.ALL, MPD.ALL, ..., MPD.s.INTRO
alphaResponses <- paste(
    rep(alphaMetrics, times = length(alphaSuffix)),
    rep(alphaSuffix, each = length(alphaMetrics)),
    sep = "."
)
totalOntree_variables <- paste0("totalOntree", c("", ".n", ".i", ".e", ".int"))
# full list of response columns: taxonomic responses first, then the alpha diversity responses
responseColumns <- c(taxonResponses, alphaResponses)

# 48489 - Willacy, Texas is a major outlier for MPDs models (area = 1916.84)
# 48433 - Stonewall, Texas is a major outlier for MPD.ni.nii when logged
# lots of zero values, mostly in Texas, for beta diversity metrics

# convert zeros to NA in every alpha diversity response (the columns listed in alphaResponses)
# NOTE(review): the previous wording of this comment also named beta responses and totalOntree,
# but only alphaResponses are converted here - confirm that this is the intended set
zero2NA_vars <- alphaResponses
datNA[zero2NA_vars] <- lapply(
    datNA[zero2NA_vars],
    function(response) replace(response, response == 0, NA)
)


# --------------------------------------------------------------------------------------------------------------------------------
# log transform some response variables

# responses that are replaced in place by their natural logarithm
# NOTE(review): species.i, genus.i and family.i are not part of the zero-to-NA conversion of the
# alpha responses, so a zero count would become -Inf here - confirm these columns contain no zeros
logged_variables <- c("species.i", "genus.i", "family.i", "MPD.ALL", "MPD.NAT", "PD.INTRO")
datNA[logged_variables] <- lapply(
    datNA[logged_variables],
    function(response) log(response)
)


# --------------------------------------------------------------------------------------------------------------------------------
# create interaction variables

# centre both coordinates on their means (ignoring NA), then add the product of the centred
# coordinates as an interaction column
datNA[["longitude"]] <- datNA[["longitude"]] - mean(datNA[["longitude"]], na.rm = TRUE)
datNA[["latitude"]] <- datNA[["latitude"]] - mean(datNA[["latitude"]], na.rm = TRUE)
datNA[["long_x_lat"]] <- datNA[["longitude"]] * datNA[["latitude"]]


# --------------------------------------------------------------------------------------------------------------------------------
# set up cross validation and perform pre-processing

# run algorithms using repeated 10-fold cross validation
# caret resampling settings: 10 folds, repeated 10 times, parallel execution allowed
control <- trainControl(method = "repeatedcv", number = 10, repeats = 10, allowParallel = TRUE)
# name of the performance metric saved together with `control` for the model-fitting step
metric <- "RMSE"


# --------------------------------------------------------------------------------------------------------------------------------
# exclude  linearly dependent variables

# datNACorr <- cor(datNA[, colnames(datNA) %!in% c("fips", "region", "state", responseCols)], use = "pairwise.complete.obs")
# linComb <- findLinearCombos(datNACorr) 
# datNA <- datNA[, -linComb$remove]
    

# --------------------------------------------------------------------------------------------------------------------------------
# perform pre-processing for cross validation

# Split the rows with an observed response `y` into a training and a validation set and
# pre-process the predictors of both.
#
# y:            name of the response column; rows where it is NA are dropped, and its values
#               are passed to createDataPartition() to draw the split.
# dat:          data frame with predictors and responses (defaults to datNA).
# responseCols: names of all response columns; they are kept out of the pre-processing.
# tOt:          one more column kept out of the pre-processing and re-attached with the responses.
#
# Returns list(dataset_PP, validation_PP): the training rows (p = 0.80) and the remaining
# validation rows, each with pre-processed predictors merged back onto the untouched
# response columns by "fips".
# Side effect: calls set.seed(7), so the global RNG state is reset on every call.
cv_pre_process <- function(y, dat = datNA, responseCols = responseColumns, tOt = "totalOntree.e") {

    # keep only the rows where the response is observed
    resp <- na.omit(dat[, y])
    dat <- dat[!is.na(dat[, y]), ]

    # fixed seed, so the partition is reproducible
    set.seed(7)
    train_rows <- createDataPartition(resp, times = 1, p = 0.80, list = FALSE, groups = 5)
    dataset <- dat[train_rows, ]
    validation <- dat[-train_rows, ]

    # responses and `tOt` are not predictors; both splits share the same columns
    held_out <- c(responseCols, tOt)
    is_predictor <- colnames(dataset) %!in% held_out

    # fit the pre-processing ("zv" and "bagImpute") on the training predictors only
    preProc <- preProcess(dataset[, is_predictor], method = c("zv", "bagImpute")) # "nzv", "corr"

    # apply the fitted pre-processing to both splits
    dataset_imputed <- predict(preProc, newdata = dataset[, is_predictor])
    validation_imputed <- predict(preProc, newdata = validation[, is_predictor])

    # re-attach the response columns by county key
    with_responses <- function(imputed, source_rows) {
        merge(imputed, source_rows[, c("fips", held_out)], by = "fips", all.x = TRUE)
    }

    list(
        dataset_PP = with_responses(dataset_imputed, dataset),
        validation_PP = with_responses(validation_imputed, validation)
    )
}

    
# run the cross-validation pre-processing once per response variable; the elements follow the
# order of responseColumns (taxonomic responses, then ALL / NAT / INTRO alpha responses) and
# are named "<response>_PP"
cv_data_PP <- setNames(
    lapply(responseColumns, function(response) cv_pre_process(response)),
    paste0(responseColumns, "_PP")
)


# --------------------------------------------------------------------------------------------------------------------------------
# perform pre-processing for complete dataset

# Pre-process the predictors of the complete data set (no train/validation split).
#
# dat:          data frame with predictors and responses.
# responseCols: names of all response columns; they are kept out of the pre-processing.
# tOt:          one more column kept out of the pre-processing and re-attached with the responses.
#
# Returns the pre-processed predictors ("zv" and "bagImpute") merged back onto the untouched
# response columns by "fips".
pre_process <- function(dat, responseCols = responseColumns, tOt = "totalOntree.e") {

    held_out <- c(responseCols, tOt)
    predictors <- dat[, colnames(dat) %!in% held_out]

    # fit the pre-processing on the predictors and apply it to the same rows
    preProc <- preProcess(predictors, method = c("zv", "bagImpute")) # "nzv", "corr"
    imputed <- predict(preProc, newdata = predictors)

    # re-attach the response columns by county key
    merge(imputed, dat[, c("fips", held_out)], by = "fips", all.x = TRUE)
}

# pre-processed version of the full cleaned data set (all rows of datNA, no validation hold-out)
combined_PP <- pre_process(datNA)


# --------------------------------------------------------------------------------------------------------------------------------
# save pre-processed data and cv options

# make sure the "data" output folder exists before writing: save() does not create missing
# directories and would otherwise fail only after all the pre-processing has been done
dir.create(file.path(project_dir, "data"), recursive = TRUE, showWarnings = FALSE)

# store the response name vectors, the cross-validation settings and both pre-processed
# data sets in one gzip-compressed file
save(
  responseColumns,
  taxonResponses,
  alphaResponses,
  totalOntree_variables,
  logged_variables,
  control, 
  metric, 
  cv_data_PP,
  combined_PP,
  file = file.path(project_dir, "data/data_PP.Rdata"),
  compress = "gzip"
)

  
# --------------------------------------------------------------------------------------------------------------------------------
# de-register parallel processing cluster

# shut down the worker processes started with makeCluster()
stopCluster(cluster)
# register the sequential foreach backend again, so later parallel-capable calls do not
# try to use the stopped cluster
registerDoSEQ()



