# Combine isotope data with clumped data and apply clustering and grouping
# Project "Saiwan_data_processing"
# Author: Niels J. de Winter
# Study: "Living on the edge: Response of rudist bivalves (Hippuritida) to hot and highly seasonal climate in the low-latitude Saiwan site, Oman"

# Load packages
require(tidyverse)
require(ggpubr)
require(cluster)
# require(ggforce)
source("calculate_temp_and_d18Osw_NJW.R") # load d18Oc-d18Ow conversion functions

# Load stable and clumped isotope data
# Stable isotope datapoints: keep only the rows that carry a d18Oc value
Saiwan_SI <- subset(
    read.csv("3_Combine_monthly_SI_data/3_Saiwan_SI_dated.csv"),
    !is.na(d18O)
)
# Clumped isotope data: keep only rows without a missing value in any column
Clumped_data <- read.csv("1_Results_Clumped_data_processing/Clumped_TS_intcorr_review.csv")
Clumped_data <- Clumped_data[complete.cases(Clumped_data), ]

# Attach constant analytical uncertainties to every clumped measurement.
# All three values are based on the IAEA-C2 standard; each scalar is recycled
# over all rows.
Clumped_data[c("D47sd", "d18Osd", "d13Csd")] <- list(0.053, 0.096, 0.053)

# Reconstruct temperature (T47, degrees C) and water d18O for every measurement.
# Temperature uses the inverted Omnivariant Generalized Least Squares equation
# by Daëron et al., 2023 (https://doi.org/10.1016/j.chemgeo.2023.121881).
# Uncertainties on the calibration are not taken into account at this point.
Clumped_data$T47 <- with(
    Clumped_data,
    1e3 / (0.21265 + sqrt(23.4427 * D47_corrETF - 4.0427)) - 273.15
)
# The conversion function is called with temperature in Kelvin (T47 + 273.15)
Clumped_data$Final.d18Ow <- with(
    Clumped_data,
    d18Ow_from_d18O_T(Final.d18O, T47 + 273.15, mineral = "calcite")
)

# Assign clusters to clumped data based on location in the shell.
# The sample name (SA.STD) is matched against the sampling-position codes:
#   R_08 ... R_11 -> "R_08-R_11"
#   R_01 ... R_07 -> "R_01-R_07"
#   RB_2          -> "RB_2"
# Names matching none of the patterns keep their original sample name.
# case_when() returns the first matching branch, so the priority is the same
# as an if / else-if chain, but the whole column is handled in one vectorised
# step. This also works when Clumped_data has zero rows, where a
# `for (i in 1:length(x))` loop would iterate over c(1, 0) and fail.
Clumped_data$location_cluster <- case_when(
    grepl("R_08|R_09|R_10|R_11", Clumped_data$SA.STD) ~ "R_08-R_11",
    grepl("R_0[1-7]", Clumped_data$SA.STD) ~ "R_01-R_07",
    grepl("RB_2", Clumped_data$SA.STD) ~ "RB_2",
    TRUE ~ as.character(Clumped_data$SA.STD)
)

# ---------------------------------------------------------------------------
# Clustering by location
# ---------------------------------------------------------------------------

# Summary statistics for each location cluster (one output row per cluster).
# Later entries reuse the summary columns defined before them, so the order of
# the entries matters (it also fixes the column order of the exported table).
Clumped_location_clusters <- summarise(
    group_by(Clumped_data, location_cluster),
    # Number of measurements in the cluster
    N = n(),
    # Mean and standard deviation of the stable isotope values
    d18Omean = mean(Final.d18O),
    d18Osd = sd(Final.d18O),
    d13Cmean = mean(Final.d13C),
    d13Csd = sd(Final.d13C),
    # Clumped isotope value: mean, SD, standard error and confidence level
    # (0.95 quantile of Student's t with N - 1 degrees of freedom times the SE)
    D47mean = mean(D47_corrETF),
    D47sd = sd(D47_corrETF),
    D47se = D47sd / sqrt(N),
    D47CL = qt(p = 0.95, df = N - 1) * D47se,
    # Temperature: mean and SD of the per-measurement reconstructions
    Tmean = mean(T47),
    Tsd = sd(T47),
    # Temperature bounds from the D47 confidence level
    # (a higher D47 yields a lower temperature, hence "+" for Tmin)
    Tmin = 1e3 / (0.21265 + sqrt(23.4427 * (D47mean + D47CL) - 4.0427)) - 273.15,
    Tmax = 1e3 / (0.21265 + sqrt(23.4427 * (D47mean - D47CL) - 4.0427)) - 273.15,
    # Water d18O from mean d18Oc and temperature (Kelvin), with bounds that
    # combine d18Oc -/+ 1 SD with Tmin / Tmax
    dwmean = d18Ow_from_d18O_T(d18Omean, Tmean + 273.15, mineral = "calcite"),
    dwmin = d18Ow_from_d18O_T(d18Omean - d18Osd, Tmin + 273.15, mineral = "calcite"),
    dwmax = d18Ow_from_d18O_T(d18Omean + d18Osd, Tmax + 273.15, mineral = "calcite")
)

# Give the clumped data columns the short names used in the remainder of the
# script (new name = old name)
Clumped_data <- rename(
    Clumped_data,
    D47 = D47_corrETF,
    d18O = Final.d18O,
    d13C = Final.d13C,
    d18Ow = Final.d18Ow
)

# Cross plot of d13C against d18O with clustering based on location in the shell
SI_clumped_location_clusters <- ggplot(data = Clumped_data) +
    # Individual clumped measurements with their d13C uncertainty (vertical)
    geom_pointrange(
        mapping = aes(
            x = d18O,
            y = d13C,
            ymin = d13C - d13Csd,
            ymax = d13C + d13Csd,
            shape = as.factor(location_cluster)
        ),
        alpha = 0.5
    ) +
    # ... and their d18O uncertainty (horizontal)
    geom_errorbarh(
        mapping = aes(
            y = d13C,
            xmin = d18O - d18Osd,
            xmax = d18O + d18Osd
        ),
        alpha = 0.5
    ) +
    # Stable isotope datapoints of specimen HU_027 as a faint background
    geom_pointrange(
        data = Saiwan_SI |> filter(specimen_name == "HU_027"),
        mapping = aes(
            x = d18O,
            y = d13C,
            ymin = d13C - d13C_SD,
            ymax = d13C + d13C_SD
        ),
        alpha = 0.2
    ) +
    # Cluster means, coloured by mean temperature, with +/- 1 SD in d13C
    geom_pointrange(
        data = Clumped_location_clusters,
        mapping = aes(
            x = d18Omean,
            y = d13Cmean,
            ymin = d13Cmean - d13Csd,
            ymax = d13Cmean + d13Csd,
            color = Tmean,
            shape = as.factor(location_cluster)
        ),
        size = 1.5
    ) +
    # ... and +/- 1 SD in d18O
    geom_errorbarh(
        data = Clumped_location_clusters,
        mapping = aes(
            y = d13Cmean,
            xmin = d18Omean - d18Osd,
            xmax = d18Omean + d18Osd,
            color = Tmean
        ),
        height = 0
    ) +
    # Label each cluster mean with its d18Ow value and half the dwmin-dwmax range
    geom_label(
        data = Clumped_location_clusters,
        mapping = aes(
            x = d18Omean,
            y = d13Cmean,
            label = paste(round(dwmean, 2), "±", round((dwmax - dwmin) / 2, 2), "\u2030 VSMOW")
        ),
        nudge_x = 0.5,
        nudge_y = 0.1,
        show.legend = FALSE
    ) +
    scale_y_continuous(
        name = expression(paste(delta^13 * "C ", "(\u2030 VPDB)")),
        breaks = seq(from = -0.5, to = 2.5, by = 0.5)
    ) +
    scale_x_continuous(
        name = expression(paste(delta^18 * "O ", "(\u2030 VPDB)")),
        breaks = seq(from = -7, to = -4, by = 0.5)
    ) +
    scale_shape_manual(values = 15:18) +
    scale_color_distiller(palette = "RdYlBu") +
    theme_minimal() +
    ggtitle("Clustering based on location in the shell")

# ---------------------------------------------------------------------------
# Statistical clustering options
# ---------------------------------------------------------------------------

# Both clustering methods operate on the stable isotope values only
clustering_features <- select(Clumped_data, d18O, d13C)

# Apply K-means clustering based on 3 and 4 clusters
# kmeans() starts from randomly chosen centres, so without a fixed seed the
# cluster numbering (and possibly the cluster membership) changes between runs,
# which makes the exported tables and figures irreproducible.
# NOTE(review): the seed value is arbitrary; consider also passing `nstart` to
# kmeans() to reduce the dependence on a single random start.
set.seed(1)
kmeans_fit3 <- kmeans(clustering_features, centers = 3)
Clumped_data$kmeans3 <- kmeans_fit3$cluster # Add clustering to data
kmeans_fit4 <- kmeans(clustering_features, centers = 4)
Clumped_data$kmeans4 <- kmeans_fit4$cluster # Add clustering to data

# Apply PAM (Partitioning Around Medoids) based on 3 and 4 clusters
pam_fit3 <- pam(clustering_features, k = 3)
Clumped_data$pam3 <- pam_fit3$cluster # Add clustering to data
pam_fit4 <- pam(clustering_features, k = 4)
Clumped_data$pam4 <- pam_fit4$cluster # Add clustering to data

# Summarize clustering results

# Summary statistics per cluster for one clustering solution.
#   data        data frame with columns d18O, d13C, D47 and T47 plus the
#               cluster ID column
#   cluster_col unquoted name of the cluster ID column (e.g. kmeans3)
# Returns one row per cluster: the cluster ID followed by N, means and SDs of
# d18O, d13C, D47 and T47, the D47 standard error and confidence level,
# temperature bounds and water d18O (mean and bounds).
summarise_clusters <- function(data, cluster_col) {
    # Inverted OGLS calibration as used for T47 in this script (D47 -> degrees C)
    temp_from_D47 <- function(D47) {
        10 ^ 3 / (0.21265 + sqrt(23.4427 * D47 - 4.0427)) - 273.15
    }

    data |>
        group_by({{ cluster_col }}) |>
        summarise(
            N = n(),
            d18Omean = mean(d18O),
            d18Osd = sd(d18O),
            d13Cmean = mean(d13C),
            d13Csd = sd(d13C),
            D47mean = mean(D47),
            D47sd = sd(D47),
            D47se = D47sd / sqrt(N),
            # 0.95 quantile of Student's t (N - 1 df) times the standard error
            D47CL = qt(0.95, N - 1) * D47se,
            Tmean = mean(T47),
            Tsd = sd(T47),
            # A higher D47 yields a lower temperature, hence "+" for Tmin
            Tmin = temp_from_D47(D47mean + D47CL),
            Tmax = temp_from_D47(D47mean - D47CL),
            # Water d18O; the conversion is called with temperature in Kelvin
            dwmean = d18Ow_from_d18O_T(d18Omean, Tmean + 273.15, mineral = "calcite"),
            dwmin = d18Ow_from_d18O_T(d18Omean - d18Osd, Tmin + 273.15, mineral = "calcite"),
            dwmax = d18Ow_from_d18O_T(d18Omean + d18Osd, Tmax + 273.15, mineral = "calcite")
        )
}

Clumped_kmeans3 <- summarise_clusters(Clumped_data, kmeans3)
Clumped_kmeans4 <- summarise_clusters(Clumped_data, kmeans4)
Clumped_pam3 <- summarise_clusters(Clumped_data, pam3)
Clumped_pam4 <- summarise_clusters(Clumped_data, pam4)

# Write clustering results: every table goes to its own CSV file (without row
# names), in the order listed. local() keeps the helper variables out of the
# global environment.
local({
    exports <- list(
        # Cluster summaries for k-means (k = 3 and k = 4)
        "7_Results_clumped_clustering/Clumped_TS_clusters_kmeans3.csv" = Clumped_kmeans3,
        "7_Results_clumped_clustering/Clumped_TS_clusters_kmeans4.csv" = Clumped_kmeans4,
        # Cluster summaries for PAM (k = 3 and k = 4)
        "7_Results_clumped_clustering/Clumped_TS_clusters_pam3.csv" = Clumped_pam3,
        "7_Results_clumped_clustering/Clumped_TS_clusters_pam4.csv" = Clumped_pam4,
        # Cluster summary by location in the shell
        "7_Results_clumped_clustering/Clumped_TS_clusters_location.csv" = Clumped_location_clusters,
        # All data with cluster IDs
        "7_Results_clumped_clustering/Clumped_TS_intcorr_cluster_review.csv" = Clumped_data
    )
    for (csv_path in names(exports)) {
        write.csv(exports[[csv_path]], csv_path, row.names = FALSE)
    }
})

# ---------------------------------------------------------------------------
# Plots for clustering appendix
# ---------------------------------------------------------------------------

# Plot clustering for k = 3 and k = 4

# Build one d13C-d18O cross plot for a single clustering solution.
#   samples          individual clumped measurements (columns d18O, d13C,
#                    d18Osd, d13Csd and the cluster ID column)
#   cluster_summary  per-cluster summary table (columns d18Omean, d18Osd,
#                    d13Cmean, d13Csd, Tmean and the cluster ID column)
#   background       stable isotope datapoints drawn faintly behind the cluster
#                    means (columns d18O, d13C, d13C_SD)
#   cluster_col      unquoted name of the cluster ID column (e.g. kmeans3)
#   title            plot title
# All panels share the same layer order (measurements, background data, cluster
# means), so the cluster means are always drawn on top.
plot_cluster_panel <- function(samples, cluster_summary, background, cluster_col, title) {
    ggplot(samples) +
        # Individual measurements with their d13C uncertainty (vertical)
        geom_pointrange(
            aes(
                x = d18O,
                y = d13C,
                ymin = d13C - d13Csd,
                ymax = d13C + d13Csd,
                shape = as.factor({{ cluster_col }})
            ),
            alpha = 0.5
        ) +
        # ... and their d18O uncertainty (horizontal)
        geom_errorbarh(
            aes(
                y = d13C,
                xmin = d18O - d18Osd,
                xmax = d18O + d18Osd
            ),
            alpha = 0.5
        ) +
        # Background stable isotope record
        geom_pointrange(data = background,
            aes(
                x = d18O,
                y = d13C,
                ymin = d13C - d13C_SD,
                ymax = d13C + d13C_SD
            ),
            alpha = 0.2
        ) +
        # Cluster means, coloured by mean temperature, with +/- 1 SD in d13C
        geom_pointrange(data = cluster_summary,
            aes(
                x = d18Omean,
                y = d13Cmean,
                ymin = d13Cmean - d13Csd,
                ymax = d13Cmean + d13Csd,
                color = Tmean,
                shape = as.factor({{ cluster_col }})
            ),
            size = 1.5
        ) +
        # ... and +/- 1 SD in d18O
        geom_errorbarh(data = cluster_summary,
            aes(
                y = d13Cmean,
                xmin = d18Omean - d18Osd,
                xmax = d18Omean + d18Osd,
                color = Tmean
            ),
            height = 0
        ) +
        scale_y_continuous(expression(paste(delta^13 * "C ", "(\u2030 VPDB)")),
            breaks = seq(-0.5, 2.5, 0.5)
        ) +
        scale_x_continuous(expression(paste(delta^18 * "O ", "(\u2030 VPDB)")),
            breaks = seq(-7, -4, 0.5)
        ) +
        scale_shape_manual(values = c(15:18)) +
        scale_color_distiller(palette = "RdYlBu") +
        theme_minimal() +
        ggtitle(title)
}

SI_clumped_kmeans3 <- plot_cluster_panel(
    samples = Clumped_data,
    cluster_summary = Clumped_kmeans3,
    background = filter(Saiwan_SI, specimen_name == "HU_027"),
    cluster_col = kmeans3,
    title = "K-means clustering applying 3 clusters"
)

SI_clumped_kmeans4 <- plot_cluster_panel(
    samples = Clumped_data,
    cluster_summary = Clumped_kmeans4,
    background = filter(Saiwan_SI, specimen_name == "HU_027"),
    cluster_col = kmeans4,
    title = "K-means clustering applying 4 clusters"
)

SI_clumped_pam3 <- plot_cluster_panel(
    samples = Clumped_data,
    cluster_summary = Clumped_pam3,
    background = filter(Saiwan_SI, specimen_name == "HU_027"),
    cluster_col = pam3,
    title = "PAM clustering applying 3 clusters"
)

SI_clumped_pam4 <- plot_cluster_panel(
    samples = Clumped_data,
    cluster_summary = Clumped_pam4,
    background = filter(Saiwan_SI, specimen_name == "HU_027"),
    cluster_col = pam4,
    title = "PAM clustering applying 4 clusters"
)

# ------------------------------------------------------------------------------
# Figure S5
# ------------------------------------------------------------------------------

# Arrange the four clustering panels in a 2 x 2 grid (k-means on the top row,
# PAM on the bottom row) with a single shared legend
SI_clumped_clustering_plot_combined <- ggarrange(
    plotlist = list(
        SI_clumped_kmeans3,
        SI_clumped_kmeans4,
        SI_clumped_pam3,
        SI_clumped_pam4
    ),
    nrow = 2,
    ncol = 2,
    common.legend = TRUE
)