# Package manager (install if needed).
# require() is used here only as an existence check to bootstrap pacman;
# all other loading goes through pacman::p_load().
if (!require(pacman)) install.packages('pacman')
# Load data-handling, spatial and plotting packages.
# TRUE/FALSE spelled out (T/F are reassignable variables, not keywords).
pacman::p_load(dplyr, tidyr, stringr, lubridate,
               terra, sf, rnaturalearth, ggplot2,
               update = FALSE)
Changes in the terrestrial water cycle: data- and ML-driven development of predictive capability.
The purpose of this document is to show how to load and visualize the data generated by the research, and how some of the procedures in this study were performed.
Setup
First, load the packages required to reproduce the code below.
Load data
Load compiled database for modelling
df <- read.table('./Compiled_database.txt', sep = ';') %>% as_tibble()Also load the catchments boundaries to help with data visualization.
Note: WGS84 is used here as the coordinate reference system for the figures.
show
# Read world country polygons, dropping Antarctica by name rather than
# by positional index ([-12, ]): a positional drop silently removes the
# wrong country if rnaturalearth reorders rows between versions.
world <- ne_countries(scale = 50, returnclass = 'sf')
world <- world[world$continent != 'Antarctica', ]
# Read the study-catchment boundaries shapefile
catchments <- read_sf('./Study_catchments.shp')
# Plot the catchments (semi-transparent orange) over a grey world basemap
ggplot() +
  geom_sf(data = world,
          fill = 'grey80',
          lwd = 0.2) +
  geom_sf(data = catchments,
          lwd = 0.2,
          color = 'transparent',
          fill = alpha('orangered', 0.2)) +
  theme_void()
Organize data for modelling
Split train and test sets (80-20%)
# Split train and test sets (80-20%); seed fixed for reproducibility.
set.seed(1)
# seq_len() is safer than 1:nrow(df) (1:0 counts down if df were empty)
trainids <- sample(seq_len(nrow(df)), size = 0.8 * nrow(df))
trainDat <- df[trainids, ]
testDat <- df[-trainids, ]
Save predictors (candidates) and target (response) names, and organize variables to make it easier to create variable combinations.
show
# Target (response) variable: change in median runoff
response <- 'dif_medR'
# Exclude runoff columns from modelling
# NOTE(review): selection is by column position in `df` — confirm these
# indices still match the compiled database layout.
candidates <- names(df)[c(4:5, 7:40, 42:43, 45:75)]
# Reorder :)
# Reorder so related predictors (means, amplitudes, change terms) sit
# together; `candnames` below is aligned one-to-one with this order.
candidates <- candidates[c(1,3, 2,4, 18,21, 19,22, 20,23, 5, 36, 24:29,
35:34, 7:17, 6, 30:33, 67:69, 37,39, 38,40,
53,56, 54,57, 55,58, 41, 66, 59:65, 42:52)]
# Names
# Human-readable labels for each candidate predictor (same order as the
# reordered `candidates` vector above).
candnames <- c('Mean precipitation', 'Amp. precipitation',
'Mean temperature', 'Amp. temperature',
'Mean rel. humidity', 'Amp. rel. humidity',
'Mean inc. SWR', 'Amp. inc. SWR',
'Mean inc. LWR', 'Amp. inc. LWR',
'Timing P-T', 'CO2 conc.', 'Mean LAI', 'Amp. LAI',
'Mean SM L1', 'Amp. SM L1', 'Mean SM Tot', 'Amp. SM Tot',
'Irrigated areas', 'Volume reserved', 'Agriculture', 'Forest',
'Grassland', 'Shrubland', 'Sparse vegetation', 'Wetland',
'Settlement', 'Bare areas', 'Water', 'Snow and ice',
'Population density', 'Avg slope',
'Bulk density', 'Clay frac.', 'Sand frac.',
'Soil org. carbon',
'Aridity index', 'Rgwb0', 'Teq',
'Change mean precipitation', 'Change amp. precipitation',
'Change mean temperature', 'Change amp. temperature',
'Change mean rel. humidity', 'Change amp. rel. humidity',
'Change mean inc. SWR', 'Change amp. inc. SWR',
'Change mean inc. LWR', 'Change amp. inc. LWR',
'Change timing P-T', 'Change CO2 conc.',
'Change mean LAI', 'Change amp. LAI',
'Change mean SM L1', 'Change amp. SM L1',
'Change mean SM Tot', 'Change amp. SM Tot',
'Change irrigated areas', 'Change agriculture', 'Change forest',
'Change grassland', 'Change shrubland', 'Change sparse vegetation',
'Change wetland', 'Change settlement', 'Change bare areas',
'Change water', 'Change snow and ice',
'Change population density')
# Predictor groups used to build the nested variable combinations below
# (indices map onto `candnames` above).
vP <- candidates[c(1:2, 40:41)]   # precipitation: mean/amp + changes
vT <- candidates[c(3:4, 42:43)]   # temperature: mean/amp + changes
vTI <- candidates[c(11, 50)]      # timing P-T + change
vNSP<- candidates[c(5:10, 44:49)] # humidity and SWR/LWR radiation + changes
vSLP<- candidates[32]             # average slope
vLAI<- candidates[c(13:14, 52:53)] # LAI: mean/amp + changes
vLUC<- candidates[c(21:30, 59:68)] # land use/cover classes + changes
vSM <- candidates[c(15:18, 54:57)] # soil moisture (L1 and total) + changes
vSG <- candidates[c(33:36)]       # soil properties: bulk dens., clay, sand, SOC
vPD <- candidates[c(31, 69)]      # population density + change
vCO2<- candidates[c(12, 51)]      # CO2 concentration + change
vIA <- candidates[c(19, 58)]      # irrigated areas + change
vRSV<- candidates[20]             # reservoir volume
vIQH<- candidates[c(37:39)]
Modelling
Load ML packages
# Load modelling packages.
# doParallel (and its dependency `parallel`) is added here because the
# tuning sections below call makePSOCKcluster()/registerDoParallel(),
# which were otherwise never attached anywhere in this document.
pacman::p_load(tidymodels, workflowsets,
               tune, finetune,
               rules, bonsai,
               stacks, bundle,
               doParallel)
Creating models en masse
# Set models: parsnip specifications for every learner entered in the
# workflow set; all hyperparameters marked tune() are optimized later.
# Elastic-net linear regression (glmnet)
lm_fit <- linear_reg(mode = 'regression',
engine = 'glmnet',
penalty = tune(),
mixture = tune())
# Cubist rule-based committee model
cub_fit <- cubist_rules(engine = 'Cubist',
committees = tune(),
neighbors = tune(),
max_rules = tune())
# Gradient-boosted trees (xgboost)
xgb_fit <- boost_tree(mode = 'regression',
engine = 'xgboost',
tree_depth = tune(),
trees = tune(),
mtry = tune(),
learn_rate = tune(),
min_n = tune(),
loss_reduction = tune(),
sample_size = tune(),
stop_iter = tune())
# Gradient-boosted trees (lightgbm, via bonsai)
lgb_fit <- boost_tree(mode = 'regression',
engine = 'lightgbm',
tree_depth = tune(),
trees = tune(),
mtry = tune(),
learn_rate = tune(),
min_n = tune(),
loss_reduction = tune())
# Random forest (ranger)
rf_fit <- rand_forest(mode = 'regression',
engine = 'ranger',
mtry = tune(),
trees = tune(),
min_n = tune())
# Bayesian additive regression trees (dbarts)
bart_fit <- bart(mode = 'regression',
engine = 'dbarts',
trees = tune(),
prior_terminal_node_coef = tune(),
prior_terminal_node_expo = tune(),
prior_outcome_range = tune())
# Support vector machine with radial-basis kernel
svm_fit1 <- svm_rbf(mode = 'regression',
engine = 'kernlab',
cost = tune(),
rbf_sigma = tune(),
margin = tune())
# Support vector machine with polynomial kernel
svm_fit2 <- svm_poly(mode = 'regression',
engine = 'kernlab',
cost = tune(),
degree = tune(),
scale_factor = tune(),
margin = tune())
# Resampling scheme: 10-fold cross-validation repeated three times,
# restricted to the response + candidate predictor columns.
set.seed(1)
train_folds <- vfold_cv(trainDat[, c(response, candidates)],
                        v = 10, repeats = 3)
# Create formulas for training.
# Nested predictor combinations, growing from a minimal climate set (c01)
# to every candidate (formula_all); each formula regresses `response` on
# the union of the predictor-group vectors defined above.
preproc_list <- list(
'formula_c01' = reformulate(c(vP, vT, vTI, vSLP, vLUC), response),
'formula_c02' = reformulate(c(vP, vT, vTI, vSLP, vLUC, vLAI), response),
'formula_c03' = reformulate(c(vP, vT, vTI, vSLP, vLAI), response),
'formula_c04' = reformulate(c(vP, vT, vTI, vSLP, vLAI, vNSP), response),
'formula_c05' = reformulate(c(vP, vT, vTI, vSLP, vLAI, vNSP, vSM), response),
'formula_c06' = reformulate(c(vP, vT, vTI, vSLP, vLAI, vNSP, vSM, vSG), response),
'formula_c07' = reformulate(c(vP, vT, vTI, vSLP, vLAI, vNSP, vSM, vSG, vPD), response),
'formula_c08' = reformulate(c(vP, vT, vTI, vSLP, vLAI, vNSP, vSM, vSG, vPD, vCO2), response),
'formula_c09' = reformulate(c(vP, vT, vTI, vSLP, vLAI, vNSP, vSM, vSG, vPD, vCO2, vIA), response),
'formula_c10' = reformulate(c(vP, vT, vTI, vSLP, vLAI, vNSP, vSM, vSG, vPD, vCO2, vIA, vRSV), response),
'formula_c11' = reformulate(c(vP, vT, vTI, vSLP, vLAI, vNSP, vSM, vSG, vPD, vCO2, vIA, vRSV, vIQH), response),
'formula_all' = reformulate(candidates, response)
)
# Set workflows: cross every preprocessing formula with every model spec
# (12 formulas x 8 models = 96 workflows to tune).
all_workflows <- workflow_set(
preproc = preproc_list,
models = list(
lm = lm_fit,
cubist = cub_fit,
rf = rf_fit,
bart = bart_fit,
xgboost = xgb_fit,
lightgbm = lgb_fit,
svm_rbf = svm_fit1,
svm_poly = svm_fit2
)
)
# Control hyperparameter tuning via racing (poor candidates are dropped
# early instead of being evaluated on every resample).
grid_ctrl <-
  control_race(
    verbose = TRUE,      # TRUE spelled out; T is a reassignable variable
    verbose_elim = TRUE,
    save_pred = TRUE,
    burn_in = 5,         # resamples evaluated before any elimination
    parallel_over = "everything",
    save_workflow = TRUE # retained workflows are needed by stacks later
  )
# Go parallel for speed.
# The cluster helpers are namespaced explicitly: `parallel` and
# `doParallel` are never attached in this document, so the bare calls
# would fail. max(1L, ...) guards machines with four or fewer cores.
cl <- parallel::makePSOCKcluster(max(1L, parallel::detectCores() - 4L))
doParallel::registerDoParallel(cl)
# Fit: tune every workflow with ANOVA-based racing over a 16-point grid,
# scoring on RMSE and R-squared.
all_workflows <- all_workflows %>%
  workflow_map(resamples = train_folds,
               grid = 16,
               fn = 'tune_race_anova',
               verbose = TRUE,
               seed = 1,
               metrics = metric_set(rmse, rsq),
               control = grid_ctrl)
# End parallel
parallel::stopCluster(cl)
# For each workflow id: pick the hyperparameter set with the best RMSE,
# finalize the workflow with it, and refit on the full training data.
tuned_models <- lapply(unique(all_workflows$wflow_id), function(wid) {
  # Best hyperparameters from the racing results
  race_res <- extract_workflow_set_result(all_workflows, wid)
  bestfit <- select_best(race_res, metric = 'rmse')
  # Refit the finalized workflow on all of trainDat
  wf <- extract_workflow(all_workflows, wid)
  final_fit <- fit(finalize_workflow(wf, dplyr::select(bestfit, -.config)),
                   data = trainDat)
  # One row per workflow: id, fitted model, and its hyperparameters
  bind_cols(tibble(wflow_id = wid, fit = list(final_fit)), bestfit)
})
# Performance of every tuned model on the held-out test set.
# `testDat[[response]]` replaces the hard-coded 'dif_medR' column so this
# block stays consistent with the `response` variable used everywhere else.
test_wf <- lapply(tuned_models, function(x){
  # Keep a 6-statistic subset of the hydroGOF goodness-of-fit summary
  gof <- hydroGOF::gof(
    sim = predict(x$fit, testDat) %>% unlist(),
    obs = testDat[[response]]
  )[,1][c(1,2,4,9,17,19)]
  gof %>% t() %>% as_tibble() %>% mutate(wflow_id = x$wflow_id)
}) %>% bind_rows() %>% arrange(substr(wflow_id, 13, 25))
Example of model ensembling:
show
# Return to sequential foreach processing before stacking (namespaced:
# foreach/doParallel are not reliably attached at this point).
foreach::registerDoSEQ()
# Candidate set: every tuned workflow for variable combination c04
data_st <-
  stacks() %>%
  add_candidates(all_workflows %>%
                   filter(grepl(wflow_id, pattern = 'c04')))
# Blend member predictions with non-negative stacking weights
# (TRUE spelled out; T is a reassignable variable)
model_st <-
  data_st %>%
  blend_predictions(non_negative = TRUE)
# Refit the retained members on the full training data
model_st <-
  model_st %>%
  fit_members()
# Assess performance on the held-out test set
hydroGOF::gof(
  sim = predict(model_st, testDat) %>% unlist(),
  obs = testDat$dif_medR
)[,1][c(1,2,4,9,17,19)]
Feed-forward and recursive feature elimination were performed using the best Cubist and its best hyperparameter set:
show
### Feed-forward selection (CAST::ffs) with the tuned Cubist
### hyperparameters (committees = 75, neighbors = 7).
# caret and CAST are required here but were never loaded above
pacman::p_load(caret, CAST)
cl <- parallel::makePSOCKcluster(max(1L, parallel::detectCores() - 4L))
doParallel::registerDoParallel(cl)
set.seed(100)
# NOTE(review): `ctrl` was referenced but never defined anywhere in this
# document; a repeated-CV train control mirroring the RFE setup below is
# assumed — confirm against the original analysis.
ctrl <- caret::trainControl(method = "repeatedcv", repeats = 5, number = 10)
model_ffs <- ffs(trainDat[,candidates],
                 trainDat[,response] %>% unlist %>% as.numeric,
                 method = "cubist",
                 metric = "RMSE",
                 trControl = ctrl,
                 importance = TRUE,
                 minVar = 1,
                 tuneGrid = expand.grid(committees = 75, neighbors = 7))
parallel::stopCluster(cl)
# Plot the selection path and list the selected predictors
plot_ffs(model_ffs); model_ffs$selectedvars
# Assess performance on the held-out test set
hydroGOF::gof(sim = predict(model_ffs, testDat),
              obs = testDat$dif_medR)[,1][c(1,2,4,9,17,19)]
### Backward selection — recursive feature elimination (caret::rfe).
# caret is namespaced explicitly since it is never attached above.
ctrlrf <- caret::rfeControl(functions = caret::caretFuncs,
                            method = "repeatedcv",
                            repeats = 5, number = 10,
                            verbose = FALSE)
# Candidate predictor-subset sizes to evaluate
subsets <- c(2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50)
# parallel/doParallel namespaced (never attached); guard low-core machines
cl <- parallel::makePSOCKcluster(max(1L, parallel::detectCores() - 4L))
doParallel::registerDoParallel(cl)
set.seed(100)
model_rfe <- caret::rfe(x = trainDat[,candidates] %>% as.data.frame(),
                        y = trainDat[,response] %>% unlist %>% as.numeric,
                        sizes = subsets,
                        rfeControl = ctrlrf,
                        method = 'cubist',
                        tuneGrid = expand.grid(committees = 75, neighbors = 7))
parallel::stopCluster(cl)
# Assess performance on the held-out test set
hydroGOF::gof(sim = predict(model_rfe, testDat),
              obs = testDat$dif_medR)[,1][c(1,2,4,9,17,19)]
Other relevant data/outputs
Outputs from this work are available in .rds (R native) format in /Output/: global prediction stacks, trained models, etc.
# Catalogue of saved outputs; each call below loads (and prints) the
# corresponding object from /Output/.
# Original data split
readRDS('./Output/original_split_df.rds')
# Train and test set performances
readRDS('./Output/df_train_perf.rds')
readRDS('./Output/df_test_perf.rds')
# Shap values
readRDS('./Output/agshap.rds')
# AOI output variable from CAST
readRDS('./Output/AOA_RMSExDI.rds')
# World predictors stack for variable combination c04
rast('./Output/Global_prediction_stack.tif')
# Predicted stack with AOI (AoA and DI columns)
readRDS('./Output/Global_predicted_stack.rds')
# World shap
readRDS('./Output/agshap_world.rds')
# RFE and FFS models
readRDS('./Output/model_rfe.rds')
readRDS('./Output/model_ffs.rds')
# Final trained model (ensemble; lightgbm member restored below)
model_st <- readRDS('./Output/Model_ensemble.rds')
model_st$member_fits$formula_c04_lightgbm_1_14 <-
  readRDS('./Output/Model_Lgbm.rds') %>% bundle::unbundle()
Making predictions
# Best fitted model (ensemble for variable combination c04)
model_st <- readRDS('./Output/Model_ensemble.rds')
# The lightgbm member is stored bundled; restore it before predicting
model_st$member_fits$formula_c04_lightgbm_1_14 <-
  readRDS('./Output/Model_Lgbm.rds') %>% bundle::unbundle()
# Selecting variable combination c04
var_ids <- candidates %in% c(vP, vT, vTI, vSLP, vLAI, vNSP)
predictors <- candidates[var_ids]
# Create a table of predicted and observed data
tibble(Observed = df$dif_medR,
       Predicted = predict(model_st,
                           new_data = df[,predictors])$.pred) %>%
  # Plot predicted vs observed
  ggplot() +
  geom_point(aes(x = Observed, y = Predicted))
# # Predicting the rasterStack
# rast('./Output/Global_prediction_stack.tif') %>%
#   as.data.frame(xy = T) %>%
#   as_tibble() %>%
#   mutate(difR = predict(model_st, new_data = .)$.pred)