##Machine Learning code:
#Weakened Kuroshio intrusion reverses picophytoplankton biomass trends in a subtropical marginal sea under ocean warming
#Author: Weinan Li

##### Packages ----------------------------------------------------------------
library(tidymodels)
library(vip)
library(usemodels)
library(ranger)
library(stringr)

setwd(
  dirname(
    rstudioapi::getActiveDocumentContext()$path
  )
)

##### Build a random forest model using the “range” package -------------------
path <- paste0(getwd(),"/results")
dir.create(path)
dir.create(paste0(path, "/metrics/"))
dir.create(paste0(path, "/predictions/"))
#Read and preprocess the data
scs_df <- read.csv("myscsdata.csv") %>% 
  rename(E_Wind = eastward_wind, N_Wind = northward_wind) %>% 
  dplyr::select(Cruise, Station, Dino:Pras, Lon, Lat, "SST0", "SSS0", "Chl0", "PAR0", "MLD", "BD", "Dist2C", "KIS", "E_Wind", "N_Wind") %>% 
  mutate(across(Dino:Pras, ~ log(abs(.x) + 1)),#log(+1)–transformed
         across(c(Chl0, MLD, BD), ~ log(.x))) %>%  #Chl0, MLD, BD, log()–transformed
  mutate_if(is.character, factor) %>% 
  drop_na(Dino:Pras)
#Response variable
pig <- names(scs_df)[3:11]


#Create an empty list to store the best ranger models
rf_models <- list()
for(target_pig in pig){
  cat("Processing:", target_pig, "\n")
  #model_df
  model_df <- scs_df %>% 
    dplyr::select(Cruise, Station, paste(target_pig), Lon, Lat, "SST0", "SSS0", "Chl0", "PAR0", "MLD", "BD", "Dist2C", "KIS", "E_Wind", "N_Wind")
  
  #Build model ————————————————————————
  #1.Data splitting
  set.seed(123)
  model_split <- initial_split(model_df, strata = paste(target_pig))
  model_train <- training(model_split)
  model_test <- testing(model_split)
  #2.Resampling and cross-validation, with 25 iterations by default.
  set.seed(234)
  model_folds <- bootstraps(model_train, strata = paste(target_pig))
  # model_folds
  #3.workflow
  use_ranger(as.formula(paste(target_pig, "~ .")), data = model_train)
  
  #
  ranger_recipe <- 
    recipe(formula = as.formula(paste(target_pig, "~ .")), data = model_train) %>% 
    update_role(Cruise, Station, new_role = "ID") %>% 
    step_impute_knn("SST0", "SSS0", "Chl0", "PAR0", "MLD", "BD", "Dist2C", "KIS", "E_Wind", "N_Wind")#KNN imputation for NA values
  #Hyperparameter: mtry, trees, min_n, Given the obs and computational cost, fix min_n=10.
  ranger_spec <- 
    rand_forest(mtry = tune(), min_n = 10, trees = tune()) %>% 
    set_mode("regression") %>% 
    set_engine("ranger") 
  
  ranger_workflow <- 
    workflow() %>% 
    add_recipe(ranger_recipe) %>% 
    add_model(ranger_spec) 
  # 4.Create a tuning grid
  ranger_grid <- grid_regular(
    mtry(range = c(2, 10)),         
    # min_n(range = c(5, 30)),        
    trees(range = c(500, 2000)),    
    levels = 4                     
  )
  #5.run
  set.seed(19335)
  ranger_tune <-
    tune_grid(ranger_workflow, 
              resamples = model_folds, 
              grid = ranger_grid,
              # control = control_grid(save_pred = TRUE)
              metrics = metric_set(rmse, rsq, mae)
    )
  
  
  #Extract the RMSE and R² for all models and write.csv
  collect_metrics(ranger_tune) %>% 
    mutate(pigment = paste(target_pig)) %>% 
    write.csv(paste0(path, "/metrics/", target_pig, "_metrics.csv"), row.names = F)
  
  #6.Finalize the workflow with the best hyperparameter
  final_workflow <- ranger_workflow %>%
    finalize_workflow(select_best(ranger_tune, metric = "rmse"))
  
  
  #7.last_fit
  model_fit <- last_fit(final_workflow, model_split)
  

  collect_predictions(model_fit) %>% 
    rename(real = paste(target_pig)) %>% 
    mutate(pigment = paste(target_pig)) %>% 
    write.csv(paste0(path, "/predictions/", target_pig, "_prediction.csv"), row.names = F)
  
  #feature importance
  imp_spec <- ranger_spec %>%
    finalize_model(select_best(ranger_tune)) %>% 
    set_engine("ranger", importance = "permutation") 

  final_fit <- workflow() %>%
    add_recipe(ranger_recipe) %>%
    add_model(imp_spec) %>%
    fit(model_train)

  rf_models[[target_pig]] <- extract_fit_parsnip(final_fit)$fit

  ranger_model <- extract_fit_parsnip(final_fit)$fit

  data.frame(
    variable = names(ranger_model$variable.importance),
    importance = ranger_model$variable.importance
  ) %>%
    arrange(desc(importance)) %>%
    mutate(pigment = paste(target_pig)) %>% 
    write.csv(paste0(path, "/predictions/", target_pig, "_importance.csv"), row.names = F)
}
#Save models
saveRDS(rf_models, file = paste0(path, "/rf_models_all.RDS"))


