# =============================================================================
# SERNAPESCA benthic landings (2015–2022) – Interannual beta diversity workflow
# =============================================================================
# Notes:
# - Column names are preserved as in the original dataset (Spanish).
# - This script:
#   1) computes a municipality-level "controlled frequency" (trips per landing site)
#   2) keeps only municipalities whose controlled frequency reaches the
#      threshold in at least one year
#   3) computes Bray–Curtis beta diversity from adjusted landings
#   4) exports analysis tables used in the manuscript
#
# Outputs (default):
# - beta_div_fullmatrix.csv
# - Interannual_beta_diversity.csv
# - dispersion_beta_diversity.csv
# - results_lat_bin_beta_diversity_adjacent.csv
# - landings_similarity_2015-2022_CanCoqHigOval.csv
# =============================================================================

suppressPackageStartupMessages({
  library(readr)
  library(dplyr)
  library(tidyr)
  library(purrr)
  library(vegan)
  library(ggplot2)
  library(forcats)
})

# -----------------------------
# Configuration
# -----------------------------
# Input file, read with read.csv2() in the "Load data" section.
DATA_PATH <- "SERNAPESCA_benthic_landings_2015_2022_publish_subset.csv"

# Municipality filtering (controlled frequency)
# Bin width used by the controlled-frequency histograms.
CONTROLLED_FREQ_BINWIDTH <- 0.1
# Minimum controlled frequency passed to filter_municipalities_by_threshold().
CONTROLLED_FREQ_THRESHOLD <- 12  # e.g., >= 12 trips/year (approx. monthly reporting)

# Beta diversity settings
# Dissimilarity index handed to vegdist() as `method`.
BETA_METHOD <- "bray"
# Year pairing for the municipality analysis: "adjacent" compares each
# available year with the next one, "full" compares every pair of years.
PAIR_MODE <- "full"  # "adjacent" or "full"

# lat_bin value left out of the analyses: its rows are filtered out before the
# municipality step and the bin is skipped in the lat_bin step.
EXCLUDE_LAT_BIN <- -24

# Municipalities used for the focused plots / exports
FOCAL_COMUNAS <- c("LA HIGUERA", "COQUIMBO", "CANELA", "OVALLE")

# -----------------------------
# Helpers
# -----------------------------
as_numeric_comma <- function(x) {
  # Parse numbers written with a decimal comma ("1,23" -> 1.23).
  #
  # Args:
  #   x: character, factor or numeric vector.
  # Returns:
  #   A double vector with one element per element of `x`. Text that cannot be
  #   parsed becomes NA (as.numeric() warns about the coercion). Every comma is
  #   replaced by a dot, so thousands separators are not supported.
  #
  # Numeric input is passed through untouched: sending a double through
  # as.character() keeps only 15 significant digits, so converting a column
  # that is already numeric would silently alter its values.
  if (is.numeric(x)) {
    return(as.numeric(x))
  }
  as.numeric(gsub(",", ".", x, fixed = TRUE))
}

validate_required_cols <- function(df, cols) {
  # Abort when `df` lacks any of the column names listed in `cols`.
  #
  # Args:
  #   df:   data frame to check.
  #   cols: character vector of column names that must be present.
  # Returns:
  #   NULL (invisibly) when every column is present; otherwise raises an error
  #   naming the absent columns.
  absent <- setdiff(cols, names(df))
  if (length(absent) == 0) {
    return(invisible(NULL))
  }
  stop("Missing required columns: ", toString(absent))
}

compute_controlled_frequency <- function(df) {
  # Controlled frequency per municipality-year: number of records divided by
  # the number of distinct landing sites.
  #
  # Args:
  #   df: data frame with columns Comuna, Year and CaletadeDesembarque.
  # Returns:
  #   One row per Comuna-Year with Num_Viajes (record count), Num_Caletas
  #   (distinct CaletadeDesembarque values) and Frecuencia_Controlada
  #   (Num_Viajes / Num_Caletas).
  df %>%
    group_by(Comuna, Year) %>%
    summarise(
      Num_Viajes  = n(),
      Num_Caletas = n_distinct(CaletadeDesembarque),
      .groups = "drop"
    ) %>%
    mutate(Frecuencia_Controlada = Num_Viajes / Num_Caletas)
}

filter_municipalities_by_threshold <- function(freq_df, threshold = 12) {
  # Select the municipalities that reach the controlled-frequency threshold in
  # at least one year (the per-municipality maximum is compared, so a single
  # qualifying year is enough).
  #
  # Args:
  #   freq_df:   table with columns Comuna and Frecuencia_Controlada.
  #   threshold: minimum controlled frequency required.
  # Returns:
  #   Vector of Comuna values whose maximum Frecuencia_Controlada is
  #   >= threshold.
  peak_by_comuna <- summarise(
    group_by(freq_df, Comuna),
    max_freq = max(Frecuencia_Controlada, na.rm = TRUE),
    .groups = "drop"
  )

  pull(filter(peak_by_comuna, max_freq >= threshold), Comuna)
}

community_matrix_for_years <- function(df_comuna, year_1, year_2) {
  # Year x species table of summed adjusted landings for two years.
  #
  # Args:
  #   df_comuna: data frame with columns Year, spp_scname and
  #              Desembarques.Ajustados (numeric).
  #   year_1, year_2: the two years to keep.
  # Returns:
  #   A tibble with a Year column plus one column per species; species absent
  #   in one of the years are filled with 0.
  yearly_totals <- df_comuna %>%
    filter(Year %in% c(year_1, year_2)) %>%
    group_by(Year, spp_scname) %>%
    summarise(
      Desembarques.Ajustados = sum(Desembarques.Ajustados, na.rm = TRUE),
      .groups = "drop"
    )

  pivot_wider(
    yearly_totals,
    names_from  = spp_scname,
    values_from = Desembarques.Ajustados,
    values_fill = list(Desembarques.Ajustados = 0)
  )
}

bray_between_two_rows <- function(mat) {
  # Dissimilarity between the first two rows of a Year x species table.
  #
  # Args:
  #   mat: table with a Year column followed by species columns.
  # Returns:
  #   The first distance computed by vegdist() with method BETA_METHOD, or
  #   NA_real_ when `mat` has fewer than two rows or fewer than two columns.
  has_two_rows <- nrow(mat) >= 2
  has_two_cols <- ncol(mat) >= 2
  if (!(has_two_rows && has_two_cols)) {
    return(NA_real_)
  }

  abundances <- select(mat, -Year)
  as.numeric(vegdist(abundances, method = BETA_METHOD))[1]
}

compute_beta_diversity_for_comuna <- function(df, comuna, mode = c("adjacent", "full")) {
  # Interannual dissimilarity of the landed species composition for one
  # municipality.
  #
  # Args:
  #   df:     landings table with columns Comuna, Year, spp_scname and
  #           Desembarques.Ajustados (numeric, or text with a decimal comma).
  #   comuna: value of Comuna to analyse.
  #   mode:   "adjacent" pairs each available year with the next available
  #           year; "full" uses every combination of two available years.
  # Returns:
  #   A tibble with columns Comuna, Year_1, Year_2 and Beta_Diversity, one row
  #   per year pair whose dissimilarity is not NA. An empty tibble() is
  #   returned when fewer than two years have data.
  mode <- match.arg(mode)

  df_comuna <- df %>%
    filter(Comuna == comuna) %>%
    mutate(
      # Re-parsing an already numeric column through text would round it to
      # 15 significant digits, so only text input goes through the parser.
      Desembarques.Ajustados = if (is.numeric(Desembarques.Ajustados)) {
        as.numeric(Desembarques.Ajustados)
      } else {
        as_numeric_comma(Desembarques.Ajustados)
      }
    ) %>%
    filter(!is.na(Desembarques.Ajustados))

  years <- sort(unique(df_comuna$Year))
  if (length(years) < 2) return(tibble())

  if (mode == "adjacent") {
    pairs <- tibble(
      Year_1 = years[-length(years)],
      Year_2 = years[-1]
    )
  } else {
    # combn() returns one column per combination. The tibble is built from its
    # two rows directly, because as_tibble() on an unnamed matrix is deprecated
    # and only works through compatibility name repair.
    year_combos <- combn(years, 2)
    pairs <- tibble(
      Year_1 = year_combos[1, ],
      Year_2 = year_combos[2, ]
    )
  }

  map_dfr(seq_len(nrow(pairs)), function(i) {
    y1 <- pairs$Year_1[i]
    y2 <- pairs$Year_2[i]
    mat <- community_matrix_for_years(df_comuna, y1, y2)
    beta <- bray_between_two_rows(mat)

    # Pairs without a usable dissimilarity are left out of the result.
    if (is.na(beta)) return(tibble())
    tibble(Comuna = comuna, Year_1 = y1, Year_2 = y2, Beta_Diversity = beta)
  })
}

compute_beta_diversity_by_latbin_adjacent <- function(df) {
  # Dissimilarity between consecutive available years for every lat_bin.
  #
  # Args:
  #   df: landings table with columns lat_bin, Year, spp_scname and
  #       Desembarques.Ajustados (numeric, or text with a decimal comma).
  # Returns:
  #   A tibble with columns lat_bin, Year_1, Year_2 and Beta_Diversity, one row
  #   per adjacent year pair whose dissimilarity is not NA. Bins listed in
  #   EXCLUDE_LAT_BIN and bins with fewer than two years of data are skipped.
  lat_bins <- sort(unique(df$lat_bin))
  # %in% instead of != so that a NULL or NA EXCLUDE_LAT_BIN means "exclude
  # nothing" rather than silently dropping every bin.
  lat_bins <- lat_bins[!(lat_bins %in% EXCLUDE_LAT_BIN)]

  map_dfr(lat_bins, function(lb) {
    df_lb <- df %>%
      filter(lat_bin == lb) %>%
      mutate(
        # Only text input is parsed; re-parsing numeric values through text
        # would round them to 15 significant digits.
        Desembarques.Ajustados = if (is.numeric(Desembarques.Ajustados)) {
          as.numeric(Desembarques.Ajustados)
        } else {
          as_numeric_comma(Desembarques.Ajustados)
        }
      ) %>%
      filter(!is.na(Desembarques.Ajustados))

    years <- sort(unique(df_lb$Year))
    if (length(years) < 2) return(tibble())

    pairs <- tibble(
      Year_1 = years[-length(years)],
      Year_2 = years[-1]
    )

    map_dfr(seq_len(nrow(pairs)), function(i) {
      y1 <- pairs$Year_1[i]
      y2 <- pairs$Year_2[i]

      # Same Year x species aggregation as the municipality analysis.
      mat <- community_matrix_for_years(df_lb, y1, y2)

      beta <- bray_between_two_rows(mat)
      if (is.na(beta)) return(tibble())

      tibble(lat_bin = lb, Year_1 = y1, Year_2 = y2, Beta_Diversity = beta)
    })
  })
}

# -----------------------------
# Load data
# -----------------------------
# read.csv2() defaults to ';' as field separator and ',' as decimal mark.
df <- read.csv2(DATA_PATH, fileEncoding = "UTF-8")

required_cols <- c("Comuna","Year","CaletadeDesembarque","lat_bin","spp_scname","Desembarques.Ajustados")
validate_required_cols(df, required_cols)

# Convert numeric-like fields used in analyses.
# - any_of(): Desembarque.Tons.. is not a required column, so it is converted
#   only when the file contains it instead of failing with "object not found".
# - Columns that were already parsed as numbers are left untouched, because a
#   second pass through text would round them to 15 significant digits.
df <- df %>%
  mutate(
    across(
      any_of(c("Desembarque.Tons..", "Desembarques.Ajustados")),
      ~ if (is.numeric(.x)) .x else as_numeric_comma(.x)
    )
  )

# -----------------------------
# 1) Controlled frequency (trips per landing site)
# -----------------------------
freq_df <- compute_controlled_frequency(df)

# Histogram of the controlled frequency over all municipality-years.
p_hist_all <- ggplot(freq_df, aes(x = Frecuencia_Controlada)) +
  geom_histogram(binwidth = CONTROLLED_FREQ_BINWIDTH, color = "black") +
  labs(
    title = "Controlled trip frequency (trips per landing site) – all municipality-year",
    x = "Controlled frequency (trips / landing site)",
    y = "Count of municipality-year"
  ) +
  theme_minimal()

# Same histogram with one panel per year; only the title differs.
p_hist_year <- p_hist_all +
  facet_wrap(~ Year, ncol = 1, scales = "free_y") +
  labs(title = "Controlled trip frequency (trips per landing site) – by year")

# Yearly distribution of the controlled frequency.
p_box <- ggplot(freq_df, aes(x = factor(Year), y = Frecuencia_Controlada)) +
  geom_boxplot() +
  labs(
    title = "Distribution of controlled trip frequency by year",
    x = "Year",
    y = "Controlled frequency (trips / landing site)"
  ) +
  theme_minimal()

# Draw the three figures in the original order.
print(p_hist_all)
print(p_hist_year)
print(p_box)

# -----------------------------
# 2) Filter municipalities
# -----------------------------
keep_comunas <- filter_municipalities_by_threshold(freq_df, threshold = CONTROLLED_FREQ_THRESHOLD)

# Stop here with an explicit message: with no municipality left, the later
# steps would fail on missing result columns instead.
if (length(keep_comunas) == 0) {
  stop(
    "No municipality reaches a controlled frequency of ",
    CONTROLLED_FREQ_THRESHOLD, "; nothing left to analyse.",
    call. = FALSE
  )
}

# Rows without a lat_bin are dropped (as the previous `lat_bin != ...` filter
# did implicitly). %in% keeps all remaining rows when EXCLUDE_LAT_BIN is NULL.
df_filtered <- df %>%
  filter(Comuna %in% keep_comunas) %>%
  filter(!is.na(lat_bin), !(lat_bin %in% EXCLUDE_LAT_BIN))

cat("Municipalities retained:", n_distinct(df_filtered$Comuna), "\n")

# -----------------------------
# 3) Beta diversity by municipality
# -----------------------------
comunas <- sort(unique(df_filtered$Comuna))

results <- map_dfr(comunas, ~ compute_beta_diversity_for_comuna(df_filtered, .x, mode = PAIR_MODE))

# When no year pair yields a value, map_dfr() returns a tibble without columns
# and the mutate() below would fail with "object 'Year_1' not found".
if (nrow(results) == 0) {
  stop(
    "No interannual beta diversity could be computed for the retained municipalities.",
    call. = FALSE
  )
}

results <- results %>%
  mutate(
    Year_Pair = paste(Year_1, Year_2, sep = "-"),
    Similarity = 1 - Beta_Diversity
  )

write.csv(results, "Interannual_beta_diversity.csv", row.names = FALSE, quote = FALSE)

# Summary metrics per commune (CV, mean, sd, variance)
cv_results <- results %>%
  group_by(Comuna) %>%
  summarise(
    CV_Beta_Diversity   = sd(Beta_Diversity, na.rm = TRUE) / mean(Beta_Diversity, na.rm = TRUE),
    Mean_Beta_Diversity = mean(Beta_Diversity, na.rm = TRUE),
    SD_Beta_Diversity   = sd(Beta_Diversity, na.rm = TRUE),
    variance            = var(Beta_Diversity, na.rm = TRUE),
    .groups = "drop"
  )

write.csv(cv_results, "dispersion_beta_diversity.csv", row.names = FALSE, quote = FALSE)

# Optional export used earlier: the same table as Interannual_beta_diversity.csv,
# written with default quoting. It holds whatever pairs PAIR_MODE selected.
write.csv(results, "beta_div_fullmatrix.csv", row.names = FALSE)

# -----------------------------
# 4) Beta diversity by lat_bin (adjacent years)
# -----------------------------
results_lat_bin <- compute_beta_diversity_by_latbin_adjacent(df_filtered)

# An empty result has no Year_1 / Year_2 columns, so the mutate() below would
# fail with an unrelated "object not found" error.
if (nrow(results_lat_bin) == 0) {
  stop(
    "No adjacent-year beta diversity could be computed for any lat_bin.",
    call. = FALSE
  )
}

results_lat_bin <- results_lat_bin %>%
  mutate(Year_Pair = paste(Year_1, Year_2, sep = "-"))

write.csv(results_lat_bin, "results_lat_bin_beta_diversity_adjacent.csv", row.names = FALSE, quote = FALSE)

# Coefficient of variation of the adjacent-year dissimilarity per lat_bin.
cv_results_lat_bin <- results_lat_bin %>%
  group_by(lat_bin) %>%
  summarise(
    CV_Beta_Diversity = sd(Beta_Diversity, na.rm = TRUE) / mean(Beta_Diversity, na.rm = TRUE),
    .groups = "drop"
  )

print(cv_results_lat_bin)

# -----------------------------
# 5) Focused analysis: four municipalities
# -----------------------------
# Year pairs of the focal municipalities, with the gap between the two years.
results_focal <- mutate(
  filter(results, Comuna %in% FOCAL_COMUNAS),
  Year_Difference = abs(Year_2 - Year_1)
)

write.csv(results_focal, "landings_similarity_2015-2022_CanCoqHigOval.csv", row.names = FALSE)

# Mean similarity per first year of the pair and municipality (trend overlay).
mean_similarity <- summarise(
  group_by(results_focal, Year_1, Comuna),
  mean_similarity = mean(Similarity, na.rm = TRUE),
  .groups = "drop"
)

# Mean similarity per temporal distance and municipality, with a
# normal-approximation 95% interval (mean +/- 1.96 * standard error).
mean_similarity_by_diff <- summarise(
  group_by(results_focal, Year_Difference, Comuna),
  mean_similarity = mean(Similarity, na.rm = TRUE),
  sd_similarity   = sd(Similarity, na.rm = TRUE),
  n               = n(),
  se_similarity   = sd_similarity / sqrt(n),
  lower_ci        = mean_similarity - 1.96 * se_similarity,
  upper_ci        = mean_similarity + 1.96 * se_similarity,
  .groups = "drop"
)

# Boxplots of similarity by first year, overlaid with the raw points and the
# per-municipality mean trend.
p_focal_box <- ggplot(
  results_focal,
  aes(x = as.factor(Year_1), y = Similarity, fill = Comuna)
) +
  geom_boxplot(alpha = 0.6, outlier.shape = NA) +
  geom_jitter(position = position_jitter(width = 0.2), alpha = 0.35) +
  geom_line(
    data = mean_similarity,
    aes(x = as.factor(Year_1), y = mean_similarity, color = Comuna, group = Comuna),
    linewidth = 1
  ) +
  geom_point(
    data = mean_similarity,
    aes(x = as.factor(Year_1), y = mean_similarity, color = Comuna),
    size = 2
  ) +
  labs(x = "Year", y = "Similarity (1 - beta diversity)") +
  theme_minimal()

# Similarity vs temporal distance
p_focal_diff <- ggplot(
  mean_similarity_by_diff,
  aes(x = Year_Difference, y = mean_similarity, color = Comuna)
) +
  geom_line(linewidth = 1.1) +
  geom_point(size = 2.5) +
  geom_ribbon(
    aes(ymin = lower_ci, ymax = upper_ci, fill = Comuna),
    alpha = 0.2, color = NA
  ) +
  labs(x = "Temporal distance (years)", y = "Mean similarity (1 - beta diversity)") +
  theme_minimal()

# Draw the figures in the original order.
print(p_focal_box)
print(p_focal_diff)

# Completion message followed by the list of exported files.
cat(
  "Done.\n",
  "Exported: Interannual_beta_diversity.csv, dispersion_beta_diversity.csv, beta_div_fullmatrix.csv,\n",
  "          results_lat_bin_beta_diversity_adjacent.csv, landings_similarity_2015-2022_CanCoqHigOval.csv\n",
  sep = ""
)