#!/usr/bin/env Rscript

# =============================================================================
# Adaptive Capacity (municipality-level) processing and clustering
# =============================================================================
# - English code/comments (Spanish column names preserved).
# - Produces:
#   (1) Long table joined with lat_bin (Comuna, lat_bin, Adaptive.Capacity*, Value)
#   (2) Optional wide table with lat_bin appended to municipality column names
#   (3) K-means clustering (centers=3) on scaled municipality vectors + plots
#
# IMPORTANT:
# - write.csv() does NOT support `sep=`. Use write.csv2() for semicolon-separated,
#   or readr::write_delim() / write_csv2().
# =============================================================================

suppressPackageStartupMessages({
  library(dplyr)
  library(tidyr)
  library(readr)
  library(stringi)
  library(ggplot2)
  library(factoextra)
  library(ggrepel)
})

# -----------------------------
# Configuration
# -----------------------------
# Input files (relative paths are resolved against the working directory).
PATH_ADAPTIVE <- "Adaptive_Capacity_SCCoastalFisheries.csv"
PATH_LATBINS  <- "lat_bins.csv"
PATH_COLORS   <- "Caletas_Comuna_Color.csv"

# Output files: the long table (section 3) and the wide table (section 4).
OUT_LONG_LATBIN <- "adaptive_long_with_lat_bin_withMissingDATA.csv"
OUT_WIDE_LATBIN <- "adaptive_with_lat_bin_withMissingDATA.csv"

# Clustering input (already curated/cleaned); read in section 5.
PATH_ADAPTIVE_CLUSTER <- "adaptive_with_lat_bin_withMissingDATA_clean(nolatbin).csv"

# Clustering parameters passed to kmeans(): number of centers and number of
# random starts.
K_CENTERS <- 3
K_NSTART  <- 20

# Column positions of the municipality block in the adaptive table.
# These positions are applied AFTER the first column of the input file has
# been dropped (see section 1), so columns 1-3 are descriptors and 4:86 are
# municipalities.
MUNICIPALITY_COLS_START <- 4
MUNICIPALITY_COLS_END   <- 86

# -----------------------------
# Helpers
# -----------------------------
normalize_text <- function(x) {
  # Standardize municipality names so they can be used as join keys.
  #
  # Args:
  #   x: vector of names (coerced with as.character(); NA stays NA).
  # Returns:
  #   Character vector of the same length with dots replaced by spaces,
  #   accents removed, and letters upper-cased.
  #
  # Accents are stripped BEFORE upper-casing: toupper() is locale-dependent
  # and may leave non-ASCII letters untouched (e.g. in a C/POSIX locale),
  # which would produce mixed-case keys such as "CONCEPCIoN". After
  # transliteration only ASCII letters remain, so toupper() is reliable.
  x <- as.character(x)
  x <- gsub("\\.", " ", x)
  x <- stringi::stri_trans_general(x, "Latin-ASCII")
  toupper(x)
}

zscore_safe <- function(x) {
  # Robust z-score of a vector, ignoring NAs when estimating mean and sd.
  #
  # Args:
  #   x: values to standardize (coerced with as.numeric()).
  # Returns:
  #   Numeric vector of length(x). Positions that were NA stay NA. The whole
  #   result is NA_real_ when the z-score is undefined:
  #     - fewer than 2 non-missing values, or
  #     - zero spread (all non-missing values identical), which would
  #       otherwise divide 0 by 0 and yield NaN.
  x <- as.numeric(x)
  valid <- x[!is.na(x)]
  undefined <- rep(NA_real_, length(x))

  if (length(valid) < 2 || min(valid) == max(valid)) return(undefined)

  spread <- stats::sd(valid)
  # Covers non-finite input (sd is NaN) and any residual exact-zero sd.
  if (is.na(spread) || spread == 0) return(undefined)

  (x - mean(valid)) / spread
}

# -----------------------------
# 1) Load Adaptive Capacity table
# -----------------------------
# Read the raw table. `encoding` must be spelled exactly "UTF-8": other
# spellings such as "utf-8" are not recognized, so accented municipality
# names in the header would not be marked as UTF-8 on non-UTF-8 locales.
adaptive_capacity <- read.csv(
  PATH_ADAPTIVE,
  encoding = "UTF-8",
  sep = ",",
  check.names = FALSE
)

# Drop the first column (it appears to be a row index in the source file).
adaptive_capacity <- adaptive_capacity %>% select(-1)

# Fail early, with a readable message, if the configured municipality range
# does not fit the table that was actually loaded.
if (MUNICIPALITY_COLS_START < 2 ||
    MUNICIPALITY_COLS_END < MUNICIPALITY_COLS_START ||
    ncol(adaptive_capacity) < MUNICIPALITY_COLS_END) {
  stop(
    "Municipality column range ", MUNICIPALITY_COLS_START, ":",
    MUNICIPALITY_COLS_END, " is not valid for a table with ",
    ncol(adaptive_capacity), " columns (after dropping the index column).",
    call. = FALSE
  )
}

municipality_idx <- seq.int(MUNICIPALITY_COLS_START, MUNICIPALITY_COLS_END)

# Convert municipality columns to numeric. all_of() makes it explicit that
# the positions come from an external vector, not from column names.
adaptive_capacity <- adaptive_capacity %>%
  mutate(across(all_of(municipality_idx), ~ as.numeric(as.character(.x))))

# Row-wise z-scoring: each row is one variable, standardized across the
# municipality columns. apply() returns one column per input row, hence t().
adaptive_capacity_data <- adaptive_capacity[, municipality_idx]
adaptive_capacity_sta  <- t(apply(as.matrix(adaptive_capacity_data), 1, zscore_safe))
adaptive_capacity_sta  <- as.data.frame(adaptive_capacity_sta)
# zscore_safe() drops names, so restore the municipality column names.
names(adaptive_capacity_sta) <- names(adaptive_capacity_data)

# Descriptor columns (everything before the municipality block) + z-scores.
# drop = FALSE keeps a data frame even if there is a single descriptor column.
adaptive_capacity_final <- bind_cols(
  adaptive_capacity[, seq_len(MUNICIPALITY_COLS_START - 1), drop = FALSE],
  adaptive_capacity_sta
)

# -----------------------------
# 2) Pivot to long format: (variable x comuna)
# -----------------------------
# One row per (variable, municipality) pair: the municipality column names
# become the "Comuna" column and the cell contents become "Value".
adaptive_long <- pivot_longer(
  adaptive_capacity_final,
  cols      = MUNICIPALITY_COLS_START:MUNICIPALITY_COLS_END,
  names_to  = "Comuna",
  values_to = "Value"
)

# Bring "Comuna" to the normalized form used as the join key.
adaptive_long[["Comuna"]] <- normalize_text(adaptive_long[["Comuna"]])

# -----------------------------
# 3) Load lat_bin table and join
# -----------------------------
# `encoding` must be spelled exactly "UTF-8" to be recognized; "Comuna" is
# the join key, so accented names must be decoded consistently on every
# platform before normalize_text() is applied.
lat_bins <- read.csv(
  PATH_LATBINS,
  encoding = "UTF-8",
  sep = ";",
  check.names = FALSE
) %>%
  mutate(Comuna = normalize_text(Comuna))

# full_join() also brings in lat_bins rows with no adaptive data; those rows
# have Value = NA and are removed, together with any other missing values,
# by the final filter.
adaptive_pivot <- adaptive_long %>%
  full_join(lat_bins, by = "Comuna") %>%
  mutate(Value = as.numeric(Value)) %>%
  filter(!is.na(Value))

# Export long table (semicolon-delimited, UTF-8).
# Quotes are escaped by doubling, which is write_delim()'s default; the
# deprecated `quote_escape` argument is therefore not passed.
readr::write_delim(
  adaptive_pivot,
  file = OUT_LONG_LATBIN,
  delim = ";",
  na = ""
)

# -----------------------------
# 4) OPTIONAL: Wide table with lat_bin appended to municipality column names
# -----------------------------
# This is only needed if you need a municipality-by-variable matrix where each municipality
# name carries lat_bin in the column name (for traceability in some pipelines).

# Map: Comuna -> lat_bin (named vector)
lat_map <- setNames(lat_bins$lat_bin, lat_bins$Comuna)

# Original municipality column names and their normalized (join-key) form
current_names_raw <- names(adaptive_capacity_final)[MUNICIPALITY_COLS_START:MUNICIPALITY_COLS_END]
current_names_std <- normalize_text(current_names_raw)

# Look up the lat_bin of every municipality in one step. Single-bracket
# indexing returns NA for names that are absent from lat_map, whereas
# `lat_map[[name]]` on an atomic vector raises "subscript out of bounds",
# which would abort the script instead of skipping the municipality.
lat_bin_by_column <- unname(lat_map[current_names_std])

# Start from the descriptor columns, then append one column per municipality
adaptive_transformed <- adaptive_capacity_final[, 1:(MUNICIPALITY_COLS_START - 1)]

for (i in seq_along(current_names_raw)) {
  lb <- lat_bin_by_column[i]

  # If lat_bin is missing (municipality not in lat_bins, or NA there), skip
  if (length(lb) != 1 || is.na(lb)) next

  # New column name: "<NORMALIZED COMUNA>_<lat_bin>"
  new_col_name <- paste0(current_names_std[i], "_", lb)

  adaptive_transformed[[new_col_name]] <- adaptive_capacity_final[[current_names_raw[i]]]
}

# Semicolon-delimited export. Quote doubling is write_delim()'s default, so
# the deprecated `quote_escape` argument is not passed.
readr::write_delim(
  adaptive_transformed,
  file = OUT_WIDE_LATBIN,
  delim = ";",
  na = ""
)

# -----------------------------
# 5) Clustering (kmeans) + plotting
# -----------------------------
# `encoding` must be spelled exactly "UTF-8" to be recognized by read.csv().
adaptive_data <- read.csv(
  PATH_ADAPTIVE_CLUSTER,
  encoding = "UTF-8",
  sep = ",",
  header = TRUE,
  check.names = FALSE
)

# Load colors (optional), expected columns: Comuna, Color
clusters <- read.csv(
  PATH_COLORS,
  encoding = "UTF-8",
  sep = ";",
  header = TRUE,
  check.names = FALSE
) %>%
  mutate(Comuna = normalize_text(Comuna))

# One (Comuna, Color) pair per distinct combination
comunas_colors <- clusters %>%
  select(Comuna, Color) %>%
  distinct()

# Municipality columns are detected as "everything after the first 3
# descriptor columns", so the curated file may hold any number of them.
if (ncol(adaptive_data) < 4) {
  stop(
    "Clustering input has ", ncol(adaptive_data),
    " columns; expected 3 descriptor columns followed by municipalities.",
    call. = FALSE
  )
}
# drop = FALSE keeps a data frame even with a single municipality column.
municipality_block <- adaptive_data[, 4:ncol(adaptive_data), drop = FALSE]

# Transpose so that rows = municipalities and columns = variables
data_transposed <- t(municipality_block)

# scale() standardizes column-wise, i.e. each variable across municipalities.
# Remaining NA/NaN cells (missing data or zero-variance variables) are set
# to 0, which is the column mean after scaling.
data_scaled_df <- as.data.frame(scale(data_transposed))
data_scaled_df[is.na(data_scaled_df)] <- 0

# K-means. The result depends on the random starts; call set.seed() before
# this line if reproducible cluster assignments are required.
km <- kmeans(data_scaled_df, centers = K_CENTERS, nstart = K_NSTART)

# Color vector aligned to municipality rownames (NA where no color is known)
row_comunas <- normalize_text(rownames(data_scaled_df))
colors_vec <- comunas_colors$Color[match(row_comunas, comunas_colors$Comuna)]

# NOTE(review): `col` does not appear to be a named parameter of
# fviz_cluster(), so it is forwarded through `...`; confirm that it colors
# the points as intended.
p_cluster <- fviz_cluster(
  km,
  data = data_scaled_df,
  geom = "point",
  repel = TRUE,
  col = colors_vec
) +
  # Municipality labels are added here because geom = "point" draws none.
  geom_text_repel(aes(label = rownames(data_scaled_df)), size = 2) +
  theme(text = element_text(size = 9))

print(p_cluster)

# Final status report: completion marker followed by one line per exported file.
cat(
  "Done.\n",
  "Exported:\n",
  paste0(" - ", c(OUT_LONG_LATBIN, OUT_WIDE_LATBIN), "\n"),
  sep = ""
)