#Load dependencies----------------------------------------------------------------------------
library(vegan) #For diversity calculation
library(dplyr) #For data manipulation
set.seed(54) #Set seed (Forgot to do earlier)
#---------------------------------------------------------------------------------------------

#Inputs---------------------------------------------------------------------------------------
inpDir="/Volumes/TOSHI_EXT/Projects/Mycobiome/Data/OTU Tables/Archaea" #The directory containing the OTU tables, of which L8 will be pulled in this version
outDir="/Volumes/TOSHI_EXT/Projects/Mycobiome/Output/tOut_Rev" #Output directory
metaF<-"/Volumes/TOSHI_EXT/Projects/Mycobiome/Data/metadata.csv" #The meta data file containing group information
abrev="Archaea_" #Substring used as an abreviation when saving the output
#---------------------------------------------------------------------------------------------

#Functions----------------------------------------------------------------------------
#This function is designed to pull only the species level (L8) taxonomic file from the directory
load_csv_file <- function(directory) {
  # Get a list of files in the directory
  files <- list.files(directory)
  
  # Find the file that ends with 'L8.csv'
  csv_file <- files[grep("L8.csv$", files)]
  
  # Check if a matching file is found
  if (length(csv_file) == 0) {
    stop("No file found with 'L8.csv' extension in the directory.")
  }
  
  # Load the csv file into a dataframe
  file_path <- file.path(directory, csv_file)
  df <- read.csv(file_path, row.names = 1, check.names = FALSE)
  
  return(df)
}
#---------------------------------------------------------------------------------------------

#Running----------------------------------------------------------------------------
#Load the files--- 
combDF=load_csv_file(inpDir) #First the OTU table (this time L8)
abund_tab_t<-t(combDF) #Transpose the table for downstream compatibility
meta_tab <- read.csv(metaF, check.names = FALSE) #Now load the meta data table

#Fixing some nomenclature issues (group2 is the combined variable that the data will be split on)
#Remove the VDR prefix as all comparisons are VDR
meta_tab$Group2 <- gsub("\\.VDR", "", meta_tab$Group2)
#Replace instances of "DEFA" with "PC for consistency with previous owrk.
meta_tab <- meta_tab %>%
  mutate(Group2 = gsub("DEFA", "PC", Group2))

#Calculations --- 
#Use Vegan package function vegdist to get a dissimilary measure (Bray–Curtis dissimilarity) btwn rows (samples) of abundance df
bray<-vegdist(abund_tab_t, "bray")
#Based on the bray distances, again use the Vegan package to run an Analysis of Similarities (ANOSIM) based on genotype/sex subgroups (both).
fit_b <- anosim(bray, meta_tab$Group2,permutations = 999)

#List all the groups possible in the data
values <- c("M-Loxp", "F-Loxp", "M-PC", "F-PC", "M-Lyz", "F-Lyz", "M-IEC", "F-IEC")
result_list <- list() #Initialize an empty list to store results

# Loop over all pairs - we want to run ANOSIM on subsets of the data containing only two groups
# In theory this runs on all sets in the data set, but we are only really interested in some (matching by sex, comparing with controls, etc)
for (i in 1:(length(values) - 1)) {
  for (j in (i + 1):length(values)) {
    pair <- c(values[i], values[j])
    print(pair)
    
    # Subset meta data such that it only contains samples in one of the two groups
    sub_meta <- meta_tab %>% filter(Group2 %in% c(values[i], values[j]))
    
    #... now perform the same subset on the abundance table
    # Creating the subset based on SampleID
    subset_indices <- sub_meta$SampleID #Get ID of samples to drop
    subset_abund_tab <- abund_tab_t[subset_indices, , drop = FALSE] #Then drop these samples from the abundance table
     
    #Use Vegan package function vegdist to get a dissimilary measure (Bray–Curtis dissimilarity) btwn rows (samples) of abundance df
    sub_bray <- vegdist(subset_abund_tab, "bray")
    
    # Perform ANOSIM for the subset
    sub_anosim <- anosim(sub_bray, sub_meta$Group2, permutations = 999)
    
    # Extract and print p-value
    p_value <- sub_anosim$signif
    testStat <- sub_anosim$statistic
    print("ANOSIM P Value...")
    print(p_value)
    print("ANOSIM test statistic...")
    print(testStat)
    
    #Add these results to the list so they can be exported lated
    toAdd<-c(values[i],values[j],p_value,testStat)
    result_list <- c(result_list, list(toAdd))
  }
}

#Cast the result list to a dataframe for easy export
result_df <- as.data.frame(do.call(rbind, result_list))

# Provide meaningful column names
colnames(result_df) <- c("Group1", "Group2", "p_value", "testStat")

#Actually export
tOut <- file.path(outDir, paste(abrev, "_subAnalysis_Stats.csv", sep = "")) #Build the name of the output plot
write.csv(result_df, tOut, row.names = FALSE)