# Check clumped data by intensity, reproducibility and outliers
# Project "Saiwan_data_processing"
# Author: Niels J. de Winter
# Study: "Living on the edge: Response of rudist bivalves (Hippuritida) to hot and highly seasonal climate in the low-latitude Saiwan site, Oman"

require(tidyverse)
require(ggpubr) 
source("calculate_temp_and_d18Osw_NJW.R") # load d18Oc-d18Ow conversion functions

# Load archive data
# Read one instrument archive and harmonise it for the steps below.
#   path          - CSV export of the instrument archive
#   archive_label - value stored in the `Archive` column ("PLUS" or "PACMAN")
# Returns the archive with `Date` parsed as date-time, an `Archive` label,
# a `Sample_or_Standard` flag and the raw D47 column renamed to `D47_raw`.
read_clumped_archive <- function(path, archive_label) {
    read.csv(path) %>%
        mutate(
            Archive = archive_label,
            # lubridate is called through its namespace so parsing does not
            # depend on library(tidyverse) attaching it (only tidyverse >= 2.0.0 does)
            Date = lubridate::as_datetime(Date, format = "%d/%m/%Y %H:%M"), # Format date and time
            Sample_or_Standard = ifelse(SA.STD %in% c("ETH-1", "ETH-2", "ETH-3"), "ETH Standard", "Sample") # Separate samples from standards
        ) %>%
        rename(D47_raw = D47.Raw)
}

PLUS_data <- read_clumped_archive("0_raw_data/Clumped_archive_PLUS.csv", "PLUS") # PLUS archive
PACMAN_data <- read_clumped_archive("0_raw_data/Clumped_archive_PACMAN.csv", "PACMAN") # PACMAN archive

# Missing IDs are set to "Najat" to prevent false NAs
# NOTE(review): presumably the ID "NA" in the CSV is parsed as a missing value by read.csv() - confirm
PLUS_data$ID <- replace(PLUS_data$ID, is.na(PLUS_data$ID), "Najat")
PACMAN_data$ID <- replace(PACMAN_data$ID, is.na(PACMAN_data$ID), "Najat")

# ------------------------------------------------------------------------------
# Correct D47 data based on standards with same intensity

# Offset-correct raw D47 values against intensity-matched ETH-3 standards.
#
# For every row of `data` (samples and standards alike), the non-outlier ETH-3
# aliquots of the same archive whose mass-44 intensity (`Sa.m44`) lies within
# +/- `window` of that row's intensity are looked up. If at least `min_n` such
# aliquots exist, the raw D47 value is shifted so that their mean equals
# `accepted`; otherwise the row is left uncorrected (NA).
#
#   data     - archive with columns Sa.m44, SA.STD, OUTLIER and D47_raw
#   window   - half-width of the intensity matching window
#   min_n    - minimum number of matched ETH-3 aliquots required
#   accepted - accepted D47 value for ETH-3
#
# Returns `data` with two columns added:
#   D47_offset_corr - offset-corrected D47 (NA when too few ETH-3 matched)
#   D47_offset_N    - number of ETH-3 aliquots used (0 when too few matched)
correct_D47_offset <- function(data, window = 1000, min_n = 5, accepted = 0.6132) {
    # The candidate standards are the same for every row, so select them once
    ETH3 <- filter(data, SA.STD == "ETH-3", OUTLIER == 0)
    n_rows <- nrow(data)
    D47_offset_corr <- rep(NA_real_, n_rows)
    D47_offset_N <- rep(0, n_rows)
    # seq_len() skips the loop for an empty archive (1:nrow() would iterate over 1 and 0)
    for(row in seq_len(n_rows)){
        intensity <- data$Sa.m44[row]
        # which() discards NA comparisons (missing intensities), as filter() does
        matched <- which(
            ETH3$Sa.m44 >= intensity - window &
            ETH3$Sa.m44 <= intensity + window
        )
        if(length(matched) >= min_n){ # Only correct with at least min_n ETH-3 aliquots
            D47_offset_corr[row] <- data$D47_raw[row] - mean(ETH3$D47_raw[matched]) + accepted
            D47_offset_N[row] <- length(matched)
        }
    }
    data$D47_offset_corr <- D47_offset_corr
    data$D47_offset_N <- D47_offset_N
    data
}

# PACMAN archive (lost 2 check standards)
PACMAN_data <- correct_D47_offset(PACMAN_data)

# Create PLUS archive without 2022 data
PLUS_data2 <- filter(
    PLUS_data,
    Date < lubridate::as_datetime("01/01/2022", format = "%d/%m/%Y")
)

# PLUS archive without 2022 data (lost 9 samples + 1 check standard)
PLUS_data2 <- correct_D47_offset(PLUS_data2)

# ------------------------------------------------------------------------------
# Reapply ETF for entire measurement window

# Accepted D47 values of the ETH standards used as anchors for the ETF
ETH_accepted <- c("ETH-1" = 0.2052, "ETH-2" = 0.2085, "ETH-3" = 0.6132)

# Add column `D47_accepted`: the accepted value for ETH standard rows, NA for all other rows
add_accepted_D47 <- function(data) {
    accepted <- unname(ETH_accepted[match(data$SA.STD, names(ETH_accepted))])
    data$D47_accepted <- ifelse(data$Sample_or_Standard == "ETH Standard", accepted, NA)
    data
}

# Fit the ETF: linear regression of accepted on offset-corrected D47,
# using the non-outlier ETH standards of the whole measurement window.
# Rows without an offset-corrected value (NA) are dropped by lm().
fit_ETF <- function(data) {
    lm(D47_accepted ~ D47_offset_corr,
        data = filter(
            data,
            Sample_or_Standard == "ETH Standard",
            OUTLIER == 0
        )
    )
}

# Apply a fitted ETF (intercept + slope) to a vector of offset-corrected D47 values
apply_ETF <- function(ETF, D47_offset_corr) {
    coef(ETF)[[1]] + D47_offset_corr * coef(ETF)[[2]]
}

# PACMAN Archive
PACMAN_data <- add_accepted_D47(PACMAN_data) # Add accepted values
PACMAN_ETF <- fit_ETF(PACMAN_data) # Calculate ETF
PACMAN_data$D47_corrETF <- apply_ETF(PACMAN_ETF, PACMAN_data$D47_offset_corr) # Apply ETF

# PLUS Archive without 2022 data
PLUS_data2 <- add_accepted_D47(PLUS_data2) # Add accepted values
PLUS_ETF2 <- fit_ETF(PLUS_data2) # Calculate ETF
PLUS_data2$D47_corrETF <- apply_ETF(PLUS_ETF2, PLUS_data2$D47_offset_corr) # Apply ETF

# Combine archives without 2022 data
# The two archives hold different measurements, so they are stacked row-wise.
# bind_rows() does this directly; a full_join() without `by` would instead try
# to match rows on every shared column (including floating-point measurements)
# and only stacks them because the `Archive` label never matches.
Archives_clumped <- bind_rows(
    PLUS_data2,
    PACMAN_data
)
Archives_clumped$ID[which(is.na(Archives_clumped$ID))] <- "Najat" # Replace NA by Najat to prevent false NAs

# Export intensity-corrected data from archives
write.csv(Archives_clumped, "Clumped_archives2021_intcorr.csv", row.names = FALSE)

# ------------------------------------------------------------------------------
# Group data by run
# One row per run, sample/standard class and archive, holding the mean
# measurement time, intensity, weight and isotope values of that group.
# The grouping columns (Run, Sample_or_Standard, Archive) are carried over by
# group_by() itself, so they are not recomputed inside summarize().
# NOTE(review): the means are taken without na.rm, so a group containing any
# missing value (e.g. a D47_corrETF left NA by the offset correction) yields
# NA, whereas STDstats uses na.rm = TRUE - confirm which is intended.
Rungroups <- Archives_clumped %>%
    group_by(Run, Sample_or_Standard, Archive) %>%
    summarize(
        Date = mean(Date),
        Int_mean = mean(Sa.m44),
        Weight_mean = mean(Weight),
        D47_mean = mean(D47_corrETF),
        d18O_mean = mean(Final.d18O),
        d13C_mean = mean(Final.d13C),
        .groups = "drop_last" # Same grouping of the result as the default, stated explicitly
    )

# Plot intensities vs clumped values for samples and standards after intensity correction

# Build the intensity vs corrected D47 plot for one archive: semi-transparent
# points plus a linear fit, both split by sample/standard class.
#   archive_name - value of the `Archive` column to plot ("PLUS" or "PACMAN")
plot_int_vs_D47 <- function(archive_name) {
    archive_rows <- filter(Archives_clumped, Archive == archive_name)
    point_layer <- geom_point(
        aes(x = Sa.m44, y = D47_corrETF, color = Sample_or_Standard),
        alpha = 0.3
    )
    trend_layer <- geom_smooth(
        aes(
            x = Sa.m44,
            y = D47_corrETF,
            fill = Sample_or_Standard,
            color = Sample_or_Standard
        ),
        method = "lm"
    )
    ggplot(archive_rows) +
        point_layer +
        trend_layer +
        scale_x_continuous("Intensity 44") +
        scale_y_continuous("D47 (I-CDES)") +
        theme_minimal() +
        ggtitle(paste0("Corrected clumped value vs\nintensities measured on ", archive_name))
}

Int_D47_plot_PLUS_corr <- plot_int_vs_D47("PLUS")
Int_D47_plot_PACMAN_corr <- plot_int_vs_D47("PACMAN")

# Place both archives side by side with a shared legend
Int_D47_plot_combined <- ggarrange(
    Int_D47_plot_PLUS_corr,
    Int_D47_plot_PACMAN_corr,
    ncol = 2,
    common.legend = TRUE
)

# Isolate the corrected sample data: rows with ID "Najat" that are not outliers
# and are not IAEA-C2 or MERCK, reduced to the date, sample name and isotope columns
Clumped_TS <- Archives_clumped %>%
    filter(
        ID == "Najat",
        SA.STD != "IAEA-C2",
        SA.STD != "MERCK",
        OUTLIER == 0
    ) %>%
    select(Date, SA.STD, Final.D47, D47_corrETF, Final.d18O, Final.d13C)

# Per-standard statistics: number of non-outlier aliquots plus mean and
# standard deviation of corrected D47, d18O and d13C (missing values ignored)
STDstats <- Archives_clumped %>%
    filter(
        SA.STD %in% c("ETH-1", "ETH-2", "ETH-3", "IAEA-C2", "MERCK"),
        OUTLIER == 0
    ) %>%
    group_by(SA.STD) %>%
    summarize(
        N = n(),
        D47_mean = mean(D47_corrETF, na.rm = TRUE),
        D47_sd = sd(D47_corrETF, na.rm = TRUE),
        d18O_mean = mean(Final.d18O, na.rm = TRUE),
        d18O_sd = sd(Final.d18O, na.rm = TRUE),
        d13C_mean = mean(Final.d13C, na.rm = TRUE),
        d13C_sd = sd(Final.d13C, na.rm = TRUE)
    )

# Export the corrected sample data and the standard statistics
write.csv(x = Clumped_TS, file = "Clumped_TS_intcorr.csv", row.names = FALSE)
write.csv(x = STDstats, file = "STDstats_intcorr.csv", row.names = FALSE)