##############NDVI EXTRACTION############################################################
options(repos = c(CRAN = "https://cloud.r-project.org"))

install.packages(c("remotes","pkgbuild"))
pkgbuild::has_build_tools(debug = TRUE)   # should be TRUE (Rtools installed)

remotes::install_github("ranghetti/sen2r", upgrade = "never")
library(sen2r)
sen2r::check_sen2r_deps()



library(terra); library(sf); library(readxl)
library(dplyr); library(stringr); library(lubridate)

# --- 1) Load points from Excel ---

pts_df <- readxl::read_excel("C:/Users/LENOVO/Documents/Master_Thesis_Project/Master_Thesis_Giriteka/R_files/thesis_database.xlsx") %>%
  mutate(date = as.Date(date))         # ensure proper Date class

# Make sf points in WGS84
pts <- st_as_sf(pts_df, coords = c("lon","lat"), crs = 4326)

# --- 2) AOI & time window (pad a few days to catch nearest clear scene) ---
aoi_3857 <- st_transform(pts, 3857) |> st_buffer(1000) |> st_union()
aoi      <- st_transform(aoi_3857, 4326)              # polygon in WGS84
timewin  <- range(pts$date) + c(-7, +7)               # tolerance = ±7 days

# --- 3) Sentinel-2 -> NDVI (cloud/shadow masked), 10 m, Float32 output ---
out_dir <- "s2_ndvi_out"
dir.create(out_dir, showWarnings = FALSE)

ndvi_files <- sen2r(
  gui = FALSE,
  timewindow = timewin,
  extent = aoi, extent_name = "aoi",
  list_indices = "NDVI", index_source = "BOA",     # NDVI from surface reflectance
  list_prods   = NA,
  res_s2 = "10m", res = c(10,10), unit = "Meter",
  mask_type = "cloud_and_shadow", max_mask = 60,   # relax/tighten if needed
  clip_on_extent = TRUE, extent_as_mask = TRUE,
  index_datatype = "Float32",                      # NDVI in -1..1 (recommended)
  path_out = out_dir, path_indices = out_dir,
  parallel = TRUE
)

# --- 4) Build a SpatRaster stack & read acquisition dates from filenames ---
ndvi_files <- list.files(out_dir, pattern = "NDVI.*\\.tif$", recursive = TRUE, full.names = TRUE)
r_ndvi <- rast(ndvi_files)

# Extract date from sen2r file names (YYYYMMDD inside)
get_date <- function(x) {
  m <- str_match(basename(x), "(\\d{4})(\\d{2})(\\d{2})")[,2:4]
  as.Date(paste(m[,1], m[,2], m[,3], sep="-"))
}
layer_dates <- get_date(ndvi_files)

# --- 5) For each point, find nearest scene to its target date (within tolerance) ---
tol_days <- 14  
match_idx <- sapply(pts$date, function(d) {
  dif <- abs(as.numeric(layer_dates - d))
  i <- which.min(dif)
  if (length(i)==0 || dif[i] > tol_days) NA_integer_ else i
})

# --- 6) Extract NDVI at points () ---
# small buffer to be robust to GPS pixel alignment (10 m = one pixel)
buf_m <- 10
pts_buff <- st_transform(pts, 3857) |> st_buffer(buf_m) |> st_transform(4326)

ndvi_val <- vector("numeric", nrow(pts))
for (i in seq_len(nrow(pts))) {
  if (is.na(match_idx[i])) { ndvi_val[i] <- NA_real_; next }
  lyr <- r_ndvi[[ match_idx[i] ]]

  v <- terra::extract(lyr, vect(pts_buff[i,]), fun=mean, na.rm=TRUE)[,2]
  ndvi_val[i] <- v
}

# --- 7) Save tidy results ---
out <- pts_df %>%
  mutate(acq_date = ifelse(is.na(match_idx), NA, layer_dates[match_idx]),
         ndvi = ndvi_val)
write.csv(out, file = "ndvi_at_points.csv", row.names = FALSE)

############################MODEL TRAINING#################################################################
# Load necessary libraries
library(randomForest)
library(xgboost)
library(gbm)
library(caret)
library(ggplot2)
library(corrplot)
library(openxlsx)
library(mice)
library(dplyr)
library(pdp) 
library(MLmetrics)  
library(tidyr)
library(writexl)
library(gridExtra)
library(ggcorrplot)
library(patchwork)
library(ragg)
library(broom)      
library(viridis)    
library(scales)



# ---- feature importance plot function ----
plot_feature_importance <- function(df, title, fill_color, x_label = "Importance") {
  ggplot(df, aes(x = reorder(Feature, Importance), y = Importance)) +
    geom_bar(stat = "identity", fill = fill_color, width = 0.6) +
    coord_flip() +
    geom_text(aes(label = round(Importance, 2)), hjust = -0.1, size = 4.5) +
    theme_minimal(base_size = 14) +
    labs(title = title, x = "Features", y = x_label) +
    theme(
      plot.title = element_text(size = 16, face = "bold"),
      axis.title = element_text(size = 14),
      axis.text = element_text(size = 12),
      panel.grid.major.y = element_blank()
    ) +
    scale_y_continuous(expand = expansion(mult = c(0, 0.15)))
}


setwd("/Users/vinkelvinendawuheme/Documents/Master_Thesis_Project/Master_Thesis_Giriteka/R_files/ndvi_out_localSAFE_maxcover")

data <- read.csv("ndvi_at_points_localSAFE_maxcover.csv")

data_clean <- data[, c("month", "soil", "average_rainfall", "Anthropisation", "total_yield", "ndvi_extracted","longitude","latitude")]

# ---------------------- Data base management ---------------------- #

sum(is.na(data_clean))  

# Imputation with the mice package
imputed_data <- mice(data_clean, m = 5, method = 'pmm', seed = 500)
data_clean <- complete(imputed_data)  # To get a completed dataset after imputation

# Conversions: only true categories -> factors; coordinates stay numeric
data_clean <- data_clean %>%
  mutate(
    month = as.factor(month),
    soil = as.factor(soil),
    Anthropisation = as.factor(Anthropisation),
    longitude = as.numeric(longitude),
    latitude  = as.numeric(latitude)
  )
# ---------------------- Descriptive analysis ---------------------- #
# Descriptive statistics for the total yield
summary(data_clean$total_yield)

# Reshape data to long format
data_long <- data_clean %>%
  select(total_yield, ndvi_extracted, average_rainfall) %>%
  pivot_longer(
    cols = everything(),
    names_to = "Variable",
    values_to = "Value"
  )

# Create combined histogram with dynamic binwidth
combined_histogram <- ggplot(data_long, aes(x = Value)) +
  geom_histogram(
    bins = 30,  # for better resolution
    fill = "steelblue", 
    color = "black", 
    alpha = 0.7
  ) +
  facet_wrap(
    ~ Variable, 
    scales = "free",  # Allowing better representation for each variable
    ncol = 1  # Adjust as necessary for better layout
  ) +
  labs(
    title = "Variables distributions",
    x = "Value",
    y = "Frequency"
  ) +
  theme_minimal() +
  theme(
    strip.text = element_text(size = 12, face = "bold"),  
    axis.text = element_text(size = 10)
  )

# Save the plot
ggsave("improved_combined_histogram.png", combined_histogram, width = 8, height = 8)


print(combined_histogram)

# Boxplot to visualize the relationship between month and Total Productivity
ggplot(data_clean, aes(x = month, y = total_yield, fill = month)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Boxplot of Total Productivity by Month", x = "Month", y = "Total Productivity")


# Summary Statistics for Total Productivity by Soil and Anthropisation
summary_stats <- data_clean %>%
  group_by(soil, Anthropisation) %>%
  summarise(
    mean_productivity = mean(total_yield, na.rm = TRUE),
    median_productivity = median(total_yield, na.rm = TRUE),
    sd_productivity = sd(total_yield, na.rm = TRUE),
    min_productivity = min(total_yield, na.rm = TRUE),
    max_productivity = max(total_yield, na.rm = TRUE)
  )
print(summary_stats)

# Boxplot for Total Productivity by Month and Anthropisation
ggplot(data_clean, aes(x = month, y = total_yield, fill = Anthropisation)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Boxplot of Total Productivity by Month and Anthropisation",
       x = "Month", y = "Total Productivity") +
  facet_wrap(~Anthropisation)

# Boxplot for Total Productivity by Soil and Anthropisation
ggplot(data_clean, aes(x = soil, y = total_yield, fill = Anthropisation)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Boxplot of Total Productivity by Soil and Anthropisation",
       x = "Soil", y = "Total Productivity") +
  facet_wrap(~Anthropisation)

# Boxplot for Total Productivity by Anthropisation and Soil
ggplot(data_clean, aes(x = Anthropisation, y = total_yield, fill = soil)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Boxplot of Total Productivity by Anthropisation and Soil",
       x = "Anthropisation", y = "Total Productivity") +
  facet_wrap(~soil)

# Additional descriptive statistics for Anthropisation
summary_by_anthropisation <- data_clean %>%
  group_by(Anthropisation) %>%
  summarise(
    Mean = mean(total_yield, na.rm = TRUE),
    Median = median(total_yield, na.rm = TRUE),
    SD = sd(total_yield, na.rm = TRUE),
    Min = min(total_yield, na.rm = TRUE),
    Max = max(total_yield, na.rm = TRUE)
  )
print(summary_by_anthropisation)

# Additional descriptive statistics for soil
summary_by_soil <- data_clean %>%
  group_by(soil) %>%
  summarise(
    Mean = mean(total_yield, na.rm = TRUE),
    Median = median(total_yield, na.rm = TRUE),
    SD = sd(total_yield, na.rm = TRUE),
    Min = min(total_yield, na.rm = TRUE),
    Max = max(total_yield, na.rm = TRUE)
  )
print(summary_by_soil)

# Select only numeric columns for correlation analysis
numeric_vars <- data_clean[, sapply(data_clean, is.numeric)]

# Define the function to calculate correlation and p-values
cor.mtest <- function(mat) {
  mat <- as.matrix(mat)
  n <- ncol(mat)
  p.mat <- matrix(NA, n, n)
  diag(p.mat) <- 0
  for (i in 1:(n - 1)) {
    for (j in (i + 1):n) {
      test <- cor.test(mat[, i], mat[, j])
      p.mat[i, j] <- test$p.value
      p.mat[j, i] <- test$p.value
    }
  }
  colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
  return(p.mat)
}

# Select only numeric columns from the data
numeric_vars <- data_clean[, sapply(data_clean, is.numeric)]

# Calculate correlation matrix
correlation_matrix <- cor(numeric_vars, use = "complete.obs")

# Calculate p-values
p_values_matrix <- cor.mtest(numeric_vars)

# Save correlation and p-values to a data frame for reporting
correlation_values <- as.data.frame(as.table(correlation_matrix))
p_values <- as.data.frame(as.table(p_values_matrix))
correlation_summary <- merge(correlation_values, p_values, by = c("Var1", "Var2"))
colnames(correlation_summary) <- c("Variable1", "Variable2", "Correlation", "P_Value")

# Save the correlation summary as an Excel file
write.xlsx(correlation_summary, "Updated_Correlation_Summary.xlsx", rowNames = FALSE)

# Extract specific correlations and their p-values for reporting
cor_total_yield_ndvi <- correlation_matrix["total_yield", "ndvi_extracted"]
p_value_total_yield_ndvi <- p_values_matrix["total_yield", "ndvi_extracted"]

cor_total_yield_precip <- correlation_matrix["total_yield", "average_rainfall"]
p_value_total_yield_precip <- p_values_matrix["total_yield", "average_rainfall"]

cor_ndvi_precip <- correlation_matrix["ndvi_extracted", "average_rainfall"]
p_value_ndvi_precip <- p_values_matrix["ndvi_extracted", "average_rainfall"]

# Print specific correlations and p-values for documentation
cat("Correlation between Total Yield and NDVI Extracted: r =", round(cor_total_yield_ndvi, 2), 
    "with p-value =", round(p_value_total_yield_ndvi, 3), "/n")
cat("Correlation between Total Yield and Average Rainfall: r =", round(cor_total_yield_precip, 2), 
    "with p-value =", round(p_value_total_yield_precip, 3), "/n")
cat("Correlation between NDVI Extracted and Average Rainfall: r =", round(cor_ndvi_precip, 2), 
    "with p-value =", round(p_value_ndvi_precip, 3), "/n")

# display the correlation matrix with numerical values
ggcorrplot(
  correlation_matrix,
  method = "square",       
  lab = TRUE,         
  lab_size = 7,          
  tl.cex = 16,             
  tl.srt = 45,             
  colors = c("blue", "white", "red"), 
  show.legend = TRUE
)


ggsave("Updated_Correlation_Matrix_Numeric.png", width = 8, height = 8)

# Definition of  the evaluation metrics function
evaluation_metrics <- function(actual, predicted) {
  rmse <- sqrt(mean((actual - predicted)^2))  # RMSE
  mae <- mean(abs(actual - predicted))  # MAE
  r_squared <- 1 - (sum((actual - predicted)^2) / sum((actual - mean(actual))^2))  # R²
  return(list(RMSE = rmse, MAE = mae, R2 = r_squared))
}



# Function to calculate summary statistics
calculate_summary_stats <- function(column) {
  tibble(
    Minimum = min(column, na.rm = TRUE),
    `1st Quartile` = quantile(column, 0.25, na.rm = TRUE),
    Median = median(column, na.rm = TRUE),
    Mean = mean(column, na.rm = TRUE),
    `3rd Quartile` = quantile(column, 0.75, na.rm = TRUE),
    Maximum = max(column, na.rm = TRUE),
    `Std. Dev` = sd(column, na.rm = TRUE),
    `Coeff. of Variation (%)` = (sd(column, na.rm = TRUE) / mean(column, na.rm = TRUE)) * 100
  )
}

# Apply the function to continuous variables and reshape the data
summary_table <- data_clean %>%
  select(total_yield, ndvi_extracted, average_rainfall) %>%  
  pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value") %>%  
  group_by(Variable) %>%
  summarise(calculate_summary_stats(Value), .groups = "drop") %>% 
  pivot_longer(cols = -Variable, names_to = "Statistic", values_to = "Value") %>% 
  pivot_wider(names_from = Statistic, values_from = Value) 

# Write the summary table to an Excel file
write.xlsx(
  summary_table,
  "Corrected_Summary_Table.xlsx",
  sheetName = "Summary Stats",
  rowNames = FALSE
)


print(summary_table)

# ---------------------- Data preparation for Machine Learning ---------------------- #
# Training and Testing dataset split

set.seed(123)
trainIndex <- createDataPartition(data_clean$total_yield, p = 0.7, list = FALSE)
trainData <- data_clean[trainIndex, ]
testData <- data_clean[-trainIndex, ]

# ---------------------- Random Forest Model Tuning ---------------------- #

rf_tune <- tuneRF(trainData[, -which(names(trainData) == "total_yield")], 
                  trainData$total_yield, 
                  stepFactor = 1.5, 
                  improve = 0.01, 
                  ntreeTry = 500,  # Number of trees for tuning
                  trace = TRUE)

# Extract the best `mtry` based on cross-validation
best_mtry <- rf_tune[which.min(rf_tune[, 2]), 1]


set.seed(123)  # reproducible RF training
rf_model <- randomForest(
  total_yield ~ ., 
  data = trainData, 
  mtry = best_mtry, 
  ntree = 500
)

# Random Forest Predictions
rf_predictions <- predict(rf_model, testData)

# Evaluate Random Forest model
rf_metrics <- evaluation_metrics(testData$total_yield, rf_predictions)

cat("Random Forest Metrics: \n")
cat("RMSE: ", rf_metrics$RMSE, "\n")
cat("MAE: ", rf_metrics$MAE, "\n")
cat("R²: ", rf_metrics$R2, "\n")

# ---------------------- Gradient Boosting Model Tuning ---------------------- #

gbm_model <- gbm(
  total_yield ~ ., 
  data = trainData, 
  distribution = "gaussian", 
  n.trees = 2000, 
  interaction.depth = 5,
  shrinkage = 0.001,  
  cv.folds = 5, 
  verbose = FALSE
)

# Select the best number of trees via cross-validation
best_iter <- gbm.perf(gbm_model, method = "cv")

# Gradient Boosting Predictions with the optimal number of trees
gbm_predictions <- predict(gbm_model, testData, n.trees = best_iter)

# Evaluate Gradient Boosting model
gbm_metrics <- evaluation_metrics(testData$total_yield, gbm_predictions)

# Print Gradient Boosting results
cat("Gradient Boosting Metrics:\n")
cat("RMSE: ", gbm_metrics$RMSE, "\n")
cat("MAE: ", gbm_metrics$MAE, "\n")
cat("R²: ", gbm_metrics$R2, "\n")


# ---------------------- XGBoost Model Tuning ---------------------- #

# One-hot encode categorical variables
train_matrix <- model.matrix(total_yield ~ . - 1, data = trainData)
test_matrix <- model.matrix(total_yield ~ . - 1, data = testData)

# Convert data to DMatrix for XGBoost
xgb_train_matrix <- xgb.DMatrix(data = train_matrix, label = trainData$total_yield)
xgb_test_matrix <- xgb.DMatrix(data = test_matrix)

# Define XGBoost parameters, including regularization terms
params <- list(
  objective = "reg:squarederror",
  eta = 0.01,  # Low learning rate for smoother learning
  max_depth = 6,  # Maximum tree depth
  min_child_weight = 1,  # Controls over fitting
  subsample = 0.8,  # Fraction of samples to use for each tree
  colsample_bytree = 0.8,  # Fraction of features to use
  gamma = 0,  # Minimum loss reduction to create a new split
  lambda = 1,  # L2 regularization (Ridge)
  alpha = 0.1  # L1 regularization (Lasso)
)

# Perform cross-validation for XGBoost to determine the best number of boosting rounds
xgb_cv <- xgb.cv(
  params = params,
  data = xgb_train_matrix,
  nrounds = 1000,  # Initial high number of trees
  nfold = 5,  # 5-fold cross-validation
  early_stopping_rounds = 10,  # Stop if no improvement
  verbose = TRUE
)

# Select the best number of iterations from cross-validation
best_nrounds <- xgb_cv$best_iteration

# Train the final XGBoost model with the optimal number of boosting rounds
xgb_model_final <- xgboost(
  params = params, 
  data = xgb_train_matrix, 
  nrounds = best_nrounds, 
  verbose = 0  # Turn off verbose output
)

# XGBoost Predictions
xgb_predictions <- predict(xgb_model_final, xgb_test_matrix)

# Evaluate XGBoost model
xgb_metrics <- evaluation_metrics(testData$total_yield, xgb_predictions)

# Print XGBoost results
cat("XGBoost Metrics:\n")
cat("RMSE: ", xgb_metrics$RMSE, "\n")
cat("MAE: ", xgb_metrics$MAE, "\n")
cat("R²: ", xgb_metrics$R2, "\n")


# ---------------------- Preparing Actuals and Predictions (TEST SET ONLY) ---------------------- #

# Build per-model data frames using the test split and the predictions/metrics already computed
rf_lab   <- sprintf("Random Forest: Predictions vs. Actuals\nRMSE = %.2f | MAE = %.2f | R² = %.2f",
                    rf_metrics$RMSE, rf_metrics$MAE, rf_metrics$R2)
gbm_lab  <- sprintf("Gradient Boosting: Predictions vs. Actuals\nRMSE = %.2f | MAE = %.2f | R² = %.2f",
                    gbm_metrics$RMSE, gbm_metrics$MAE, gbm_metrics$R2)
xgb_lab  <- sprintf("XGBoost: Predictions vs. Actuals\nRMSE = %.2f | MAE = %.2f | R² = %.2f",
                    xgb_metrics$RMSE, xgb_metrics$MAE, xgb_metrics$R2)

rf_df  <- data.frame(actual = testData$total_yield, predicted = rf_predictions)
gbm_df <- data.frame(actual = testData$total_yield, predicted = gbm_predictions)
xgb_df <- data.frame(actual = testData$total_yield, predicted = xgb_predictions)



make_scatter_clean <- function(df, ttl, point_color) {
  
  ggplot(df, aes(x = actual, y = predicted)) +
    geom_point(color = point_color, alpha = 0.6, size = 1.8) +
    
    
    geom_abline(slope = 1, intercept = 0,
                linetype = "dashed", color = "red", linewidth = 0.8) +
    
    
    geom_smooth(method = "lm", se = TRUE, color = "black",
                fill = "grey60", alpha = 0.25, linewidth = 0.9) +
    
    labs(title = ttl, x = "Actual Values", y = "Predicted Values") +
    
    theme_minimal(base_size = 13) +
    theme(
      plot.title = element_text(size = 14, face = "bold"),
      axis.title = element_text(size = 12),
      axis.text  = element_text(size = 10)
    ) +
    
    Le 
  scale_x_continuous(limits = c(0, 1100)) +
    scale_y_continuous(limits = c(0, 900))
}

# Palet colorblind-friendly
col_rf  <- viridis(3)[1]
col_gbm <- viridis(3)[2]
col_xgb <- viridis(3)[3]


rf_plot_clean  <- make_scatter_clean(rf_df,  rf_lab,  col_rf)
gbm_plot_clean <- make_scatter_clean(gbm_df, gbm_lab, col_gbm)
xgb_plot_clean <- make_scatter_clean(xgb_df, xgb_lab, col_xgb)

# Final combination 
fig_clean <- grid.arrange(rf_plot_clean, gbm_plot_clean, xgb_plot_clean, ncol = 1)


ggsave("Comparison_Predictions_vs_Actuals_clean.png",
       fig_clean, width = 10, height = 15, dpi = 600)


# ---------------------- Partial Dependence Plots (FUSED) ---------------------- #
# --- PDP: Random Forest ---
pdp_rf_ndvi   <- partial(
  rf_model, pred.var = "ndvi_extracted",
  train = trainData, grid.resolution = 25
)
pdp_rf_precip <- partial(
  rf_model, pred.var = "average_rainfall",
  train = trainData, grid.resolution = 25
)

# --- PDP: GBM (Using the best trees) ---
pdp_gbm_ndvi   <- partial(
  gbm_model, pred.var = "ndvi_extracted",
  train = trainData, n.trees = best_iter, grid.resolution = 25
)
pdp_gbm_precip <- partial(
  gbm_model, pred.var = "average_rainfall",
  train = trainData, n.trees = best_iter, grid.resolution = 25
)

# --- PDP: XGBoost ( using the matrix 'model.matrix') ---
train_matrix_pdp <- model.matrix(total_yield ~ . - 1, data = trainData)
train_df_pdp     <- as.data.frame(train_matrix_pdp)

pdp_xgb_ndvi   <- partial(
  xgb_model_final, pred.var = "ndvi_extracted",
  train = train_df_pdp, grid.resolution = 25
)
pdp_xgb_precip <- partial(
  xgb_model_final, pred.var = "average_rainfall",
  train = train_df_pdp, grid.resolution = 25
)


theme_set(theme_minimal(base_size = 13))

# Étiquettes "k" (0.8k, 1.0k, 1.2k). 
lab_short <- label_number(accuracy = 0.1, scale_cut = cut_short_scale())

# Étiquettes entières (800, 900, 1000, …) -> choisis l’un des deux jeux de labels.
lab_int   <- label_number(accuracy = 1, big.mark = " ")

style_pdp <- function(p, ttl, var = c("ndvi","precip"), use_short_labels = TRUE){
  var <- match.arg(var)
  g <- autoplot(p) +
    ggtitle(ttl) +
    labs(x = NULL, y = "ŷ") +
    coord_cartesian(expand = 0) +
    theme_minimal(base_size = 12) +
    theme(
      plot.title = element_text(face = "bold", size = 13),
      axis.title = element_text(size = 11),
      axis.text  = element_text(size = 10),
      plot.margin = margin(6, 8, 6, 8)
    )
  
  if (var == "ndvi") {
    g <- g + scale_x_continuous(
      limits = c(0, 0.7),
      breaks = seq(0, 0.7, by = 0.1),
      labels = number_format(accuracy = 0.1)
    )
  } else {
    rng   <- range(trainData$average_rainfall, na.rm = TRUE)
    brks  <- pretty(rng, n = 6)
    labsf <- if (use_short_labels) lab_short else lab_int
    g <- g + scale_x_continuous(
      limits = range(brks),
      breaks = brks,
      labels = labsf
    ) +
      theme(axis.text.x = element_text(angle = 35, hjust = 1))
  }
  g
}

# Sous-figures
plot_rf_ndvi   <- style_pdp(pdp_rf_ndvi,   "RF — NDVI",   "ndvi")
plot_gbm_ndvi  <- style_pdp(pdp_gbm_ndvi,  "GBM — NDVI",  "ndvi")
plot_xgb_ndvi  <- style_pdp(pdp_xgb_ndvi,  "XGB — NDVI",  "ndvi")

plot_rf_precip   <- style_pdp(pdp_rf_precip,   "RF — Precip.",   "precip", use_short_labels = TRUE)
plot_gbm_precip  <- style_pdp(pdp_gbm_precip,  "GBM — Precip.",  "precip", use_short_labels = TRUE)
plot_xgb_precip  <- style_pdp(pdp_xgb_precip,  "XGB — Precip.",  "precip", use_short_labels = TRUE)

# Assemblage paysage (2 lignes × 3 colonnes)
row_ndvi   <- plot_rf_ndvi + plot_gbm_ndvi + plot_xgb_ndvi
row_precip <- plot_rf_precip + plot_gbm_precip + plot_xgb_precip

pdp_fused_landscape <- (row_ndvi / row_precip) +
  plot_annotation(
    title   = "",
    caption = ""
  ) &
  theme(
    plot.title   = element_text(hjust = 0.5, size = 15, face = "bold"),
    plot.caption = element_text(size = 10),
    panel.spacing = unit(10, "pt")
  )

# === Sauvegardes propres ===
# 1) PNG HiDPI (A4 paysage) – nécessite ragg (install.packages('ragg') si besoin)
ggsave(
  "Figure_PDP_NDVI_Precip_FUSED_Landscape.png",
  plot   = pdp_fused_landscape,
  width  = 11.69, height = 8.27, units = "in",
  dpi    = 900, bg = "white",
  device = ragg::agg_png
)

# 2) PDF vectoriel (idéal pour LaTeX / Word)
ggsave(
  "Figure_PDP_NDVI_Precip_FUSED_Landscape.pdf",
  plot   = pdp_fused_landscape,
  width  = 11.69, height = 8.27, units = "in"
)

# ---------------------- Feature Importance Analysis ---------------------- #
# Fit a separate RF on the FULL dataset only for importance
set.seed(123)
rf_model_full <- randomForest(
  total_yield ~ .,
  data = data_clean,
  importance = TRUE,
  ntree = 500
)

# Extract RF importance
rf_importance_raw <- importance(rf_model_full)
rf_importance <- data.frame(
  Feature = rownames(rf_importance_raw),
  Importance = rf_importance_raw[, "IncNodePurity"]
)

# GBM importance (from the tuned GBM used for test metrics)
gbm_importance_raw <- summary(gbm_model, plotit = FALSE)
gbm_importance <- data.frame(
  Feature = gbm_importance_raw$var,
  Importance = gbm_importance_raw$rel.inf
)

# XGBoost importance (from the tuned xgb model used for test metrics)
xgb_importance_raw <- xgb.importance(model = xgb_model_final)
xgb_importance <- xgb_importance_raw |>
  arrange(desc(Gain)) |>
  transmute(Feature = Feature, Importance = Gain) |>
  head(8)

xgb_plot <- plot_feature_importance(
  df = xgb_importance,
  title = "Feature Importance - XGBoost",
  fill_color = "forestgreen",
  x_label = "Gain"
)

# Build the three importance plots using the helper
rf_plot <- plot_feature_importance(
  df = rf_importance,
  title = "Feature Importance - Random Forest",
  fill_color = "steelblue"
)

gbm_plot <- plot_feature_importance(
  df = gbm_importance,
  title = "Feature Importance - Gradient Boosting Machine",
  fill_color = "darkorange",
  x_label = "Relative Influence"
)

xgb_plot <- plot_feature_importance(
  df = xgb_importance,
  title = "Feature Importance - XGBoost",
  fill_color = "forestgreen",
  x_label = "Gain"
)

# ───────────────────── Affichage ou Sauvegarde ─────────────────────
library(gridExtra)

# Affichage côte à côte ou verticalement
grid.arrange(rf_plot, gbm_plot, xgb_plot, ncol = 1)

# Optionnel : sauvegarde
ggsave("Final_Feature_Importance_Plots.png", width = 10, height = 12)
ggsave("Feature_Importance_AllModels.png", plot = grid.arrange(rf_plot, gbm_plot, xgb_plot, ncol = 1), 
       width = 8, height = 10, dpi = 300)


####################################SENSITIVITY ANALYSIS####################################################################################################################################################################################

#############################################
### 1. LOAD LIBRARIES
#############################################

library(randomForest)
library(xgboost)
library(gbm)
library(caret)
library(ggplot2)
library(dplyr)
library(tidyr)
library(openxlsx)

#############################################
### 2. FIXED TRAIN/TEST SPLIT
#############################################

set.seed(123)
trainIndex <- createDataPartition(data_clean$total_yield, p = 0.8, list = FALSE)

train_original <- data_clean[trainIndex, ]
test_original  <- data_clean[-trainIndex, ]

#############################################
### 3. DEFINE SENSITIVITY GRID
#############################################

sample_sizes <- seq(100, 1000, by = 100)

grid <- expand.grid(
  n_estimators = c(100, 200, 300),
  max_depth = c(3, 5),
  subsample = c(0.6, 0.8)
)

results <- list()

#############################################
### 4. METRICS FUNCTION
#############################################

evaluation_metrics <- function(actual, predicted) {
  rmse <- sqrt(mean((actual - predicted)^2))
  mae <- mean(abs(actual - predicted))
  r_squared <- 1 - (sum((actual - predicted)^2) /
                      sum((actual - mean(actual))^2))
  return(c(RMSE = rmse, MAE = mae, R2 = r_squared))
}

#############################################
### 5. CORRECTED SENSITIVITY LOOP
#############################################

for (size in sample_sizes) {
  
  boot_train <- train_original[
    sample(1:nrow(train_original), size = size, replace = TRUE),
  ]
  
  for (i in 1:nrow(grid)) {
    
    n_estimators <- grid$n_estimators[i]
    max_depth <- grid$max_depth[i]
    subsample <- grid$subsample[i]
    
    # RANDOM FOREST
    rf_model <- randomForest(
      total_yield ~ ., 
      data = boot_train,
      ntree = n_estimators,
      maxnodes = max_depth * 10
    )
    rf_pred <- predict(rf_model, test_original)
    rf_metrics <- evaluation_metrics(test_original$total_yield, rf_pred)
    
    # GBM
    gbm_model <- gbm(
      total_yield ~ ., 
      data = boot_train,
      distribution = "gaussian",
      n.trees = n_estimators,
      interaction.depth = max_depth,
      shrinkage = 0.05,
      bag.fraction = subsample,
      train.fraction = 1,
      verbose = FALSE
    )
    gbm_pred <- predict(gbm_model, test_original, n.trees = n_estimators)
    gbm_metrics <- evaluation_metrics(test_original$total_yield, gbm_pred)
    
    # XGBOOST
    train_matrix <- model.matrix(total_yield ~ . - 1, data = boot_train)
    test_matrix  <- model.matrix(total_yield  ~ . - 1, data = test_original)
    
    xgb_train <- xgb.DMatrix(data = train_matrix, label = boot_train$total_yield)
    
    xgb_model <- xgboost(
      data = xgb_train,
      objective = "reg:squarederror",
      nrounds = n_estimators,
      max_depth = max_depth,
      subsample = subsample,
      eta = 0.05,
      verbose = 0
    )
    
    xgb_pred <- predict(xgb_model, test_matrix)
    xgb_metrics <- evaluation_metrics(test_original$total_yield, xgb_pred)
    
    # STORE METRICS
    results[[paste(size, i, sep = "_")]] <- data.frame(
      SampleSize = size,
      n_estimators = n_estimators,
      max_depth = max_depth,
      subsample = subsample,
      Model = c("RF", "GBM", "XGBoost"),
      RMSE = c(rf_metrics["RMSE"], gbm_metrics["RMSE"], xgb_metrics["RMSE"]),
      MAE = c(rf_metrics["MAE"], gbm_metrics["MAE"], xgb_metrics["MAE"]),
      R2 = c(rf_metrics["R2"], gbm_metrics["R2"], xgb_metrics["R2"])
    )
  }
}

results_df <- bind_rows(results)
#############################################
### 5.1. ADD CLEAN LABELS FOR FACETS
#############################################

results_df <- results_df %>%
  mutate(
    n_estimators_lab = factor(
      n_estimators,
      levels = c(100, 200, 300),
      labels = c("Trees = 100", "Trees = 200", "Trees = 300")
    ),
    
    max_depth_lab = factor(
      max_depth,
      levels = c(3, 5),
      labels = c("Max depth = 3", "Max depth = 5")
    ),
    
    subsample_lab = factor(
      subsample,
      levels = c(0.6, 0.8),
      labels = c("Subsample = 0.6", "Subsample = 0.8")
    )
  )

#############################################
### 6. EXPORT RESULTS TO EXCEL FILES
#############################################

# 6.1 Full table
write.xlsx(results_df, "sensitivity_results_corrected.xlsx", overwrite = TRUE)

# 6.2 Summary table
results_summary <- results_df %>%
  group_by(Model) %>%
  summarise(
    Mean_RMSE = mean(RMSE),
    Mean_MAE  = mean(MAE),
    Mean_R2   = mean(R2),
    .groups = "drop"
  )
write.xlsx(results_summary, "results_summary.xlsx", overwrite = TRUE)

# 6.3 Best model configurations according to RMSE
best_by_rmse <- results_df %>%
  group_by(Model) %>%
  slice_min(RMSE)
write.xlsx(best_by_rmse, "best_by_rmse.xlsx", overwrite = TRUE)

# 6.4 Best model configurations according to R²
best_by_r2 <- results_df %>%
  group_by(Model) %>%
  slice_max(R2)
write.xlsx(best_by_r2, "best_by_r2.xlsx", overwrite = TRUE)

#############################################
### 7. SENSITIVITY PLOTS (OPTIONAL)
#############################################

plot_sensitivity <- function(data, metric, metric_label, title, file_name) {
  
  p <- ggplot(data, aes(x = SampleSize, y = !!sym(metric), color = Model)) +
    geom_line(linewidth = 0.9) + 
    geom_point(size = 1.8) +
    facet_grid(
      rows = vars(n_estimators_lab),
      cols = vars(max_depth_lab, subsample_lab),
      scales = "free_y",
      switch = "y"
    ) +
    labs(
      title = title,
      x = "Sample Size",
      y = metric_label,
      color = "Model"
    ) +
    theme_bw(base_size = 13) +
    theme(
      strip.placement = "outside",
      strip.background = element_rect(fill = "white", color = "black"),
      strip.text.y.left = element_text(angle = 0, face = "bold"),
      panel.spacing = unit(0.7, "lines")
    )
  
  ggsave(file_name, p, width = 18, height = 22, bg = "white")
}


plot_sensitivity(results_df, "RMSE", "RMSE", 
                 "Sensitivity Analysis: RMSE",
                 "Sensitivity_Analysis_RMSE_corrected.png")

plot_sensitivity(results_df, "MAE", "MAE", 
                 "Sensitivity Analysis: MAE",
                 "Sensitivity_Analysis_MAE_corrected.png")

plot_sensitivity(results_df, "R2", "R²", 
                 "Sensitivity Analysis: R²",
                 "Sensitivity_Analysis_R2_corrected.png")
