#Packages
library(ANCOMBC)
library(tidyverse)
library(caret)
library(DT)
library(TreeSummarizedExperiment)
library("phyloseq")
library("ggplot2")      # graphics
library("readxl")       # necessary to import the data from Excel file
library("dplyr")        # filter and reformat data frames
library("tibble")       # Needed for converting column to row names
library('mia')
#Functions
breakDownTaxonomy <- function(otuDataFrame) {
  otuCount <- nrow(otuDataFrame)
  newCols <- c("OTU", paste0("Level", 1:8))
  newDataFrame <- data.frame(matrix(ncol = length(newCols), nrow = otuCount))
  colnames(newDataFrame) <- newCols
  
  taxonomicLevels <- strsplit(rownames(otuDataFrame), split = ";", fixed = TRUE)
  
  for (i in 1:otuCount) {
    newDataFrame[i, "OTU"] <- paste0("otu", i)
    
    for (j in 1:8) {
      if (length(taxonomicLevels[[i]]) >= j) {
        newDataFrame[i, paste0("Level", j)] <- taxonomicLevels[[i]][j]
      } else {
        newDataFrame[i, paste0("Level", j)] <- NA
      }
    }
  }
  
  return(newDataFrame)
}
select_columns <- function(df, str1, str2, str3) {
  cols <- c(str1, grep(str2, names(df), value = TRUE), grep(str3, names(df), value = TRUE))
  selected_df <- df[, cols, drop = FALSE]
  return(selected_df)
}
runANCOMBC <- function(inpDir, inpTax, outDir, metaF, refLevel) {
  # Load the data to be usable by ANCOMBC
  metaData <- read.csv(metaF, row.names = 1)
  metaData <- metaData[, c("Gender", "Group2")]
  
  if (refLevel == "M.VDR-Loxp") {
    metaData$Group2 <- as.factor(metaData$Group2)
    metaData$Group2 <- relevel(metaData$Group2, ref = "M.VDR-Loxp")
    refLevel2<-"F.VDR-Loxp"
    refLevel3<-"Group2M"
  } else if (refLevel == "F.VDR-Loxp") {
    metaData$Group2 <- as.factor(metaData$Group2)
    metaData$Group2 <- relevel(metaData$Group2, ref = "F.VDR-Loxp")
    refLevel2<-"M.VDR-Loxp"
    refLevel3<-"Group2F"
  }
  
  myOTU <- read.csv(file.path(inpDir, inpTax), row.names = 1)
  taxaTable <- breakDownTaxonomy(myOTU)
  
  # Set row names of myOTU
  row.names(myOTU) <- taxaTable$OTU
  
  # Set row names of taxaTable and remove OTU column
  taxaTable <- data.frame(taxaTable[-1], row.names = taxaTable$OTU)
  
  otu_mat <- as.matrix(myOTU)
  tax_mat <- as.matrix(taxaTable)
  
  OTU <- otu_table(otu_mat, taxa_are_rows = TRUE)
  TAX <- tax_table(tax_mat)
  samples <- sample_data(metaData)
  carbom <- phyloseq(OTU, TAX, samples)
  
  tse <- makeTreeSummarizedExperimentFromPhyloseq(carbom)
  
  library(ANCOMBC)
  set.seed(123)
  output <- ancombc2(data = tse, assay_name = "counts", tax_level = NULL,
                     fix_formula = "Group2", rand_formula = NULL,
                     p_adj_method = "holm", pseudo = 0, pseudo_sens = TRUE,
                     prv_cut = 0.10, lib_cut = 1000, s0_perc = 0.05, struc_zero = FALSE, neg_lb = FALSE,
                     alpha = 0.05, n_cl = 2, verbose = TRUE,
                     global = FALSE, pairwise = FALSE, 
                     dunnet = FALSE, trend = FALSE,
                     iter_control = list(tol = 1e-5, max_iter = 20, 
                                         verbose = FALSE),
                     em_control = list(tol = 1e-5, max_iter = 100),
                     lme_control = NULL, mdfdr_control = NULL, 
                     trend_control = NULL)
  res_prim <- output$res
  tab_sens <- output$pseudo_sens_tab
  
  df_subset <- select_columns(res_prim, "taxon", refLevel2, refLevel3)
  
  # Intersect with the taxatable
  # Convert row names to a column in taxaTable
  taxaTable$taxon <- row.names(taxaTable)
  row.names(taxaTable) <- NULL
  
  # Subset taxaTable to include only rows with matching taxa values
  subset_taxaTable <- subset(taxaTable, taxon %in% df_subset$taxon)
  
  # Merge subset_taxaTable and df_subset based on matching taxa values
  merged_df <- merge(subset_taxaTable, df_subset, by = "taxon", all.x = TRUE)
  
  # Reorder columns to have columns from subset_taxaTable at the start
  reordered_cols <- c("taxon", names(subset_taxaTable)[-1], names(df_subset)[-1])
  merged_df <- merged_df[, reordered_cols]
  merged_df <- subset(merged_df, select = -c(taxon, taxon.1))
  
  # Rename the "Level2", "Level3", etc. columns to appropriate phylogenetic ranks
  new_colnames <- c("kingdom", "phylum", "class", "order", "family", "genus", "species")
  for (col in colnames(merged_df)) {
    if (!grepl("Level", col)) {
      new_colnames <- c(new_colnames, col)
    }
  }
  colnames(merged_df) <- new_colnames
  
  # Finally, to export
  substring <- strsplit(inpTax, "\\.csv$")[[1]][1]
  tOut <- paste("ANCOM_BC_vsLOXP_", refLevel, "_Output_", substring, ".csv", sep="")
  fullOut <- file.path(outDir, tOut)
  write.csv(merged_df, fullOut, row.names = TRUE)
}
runANCOMBC_within <- function(inpDir, inpTax, outDir, metaF, refLevel) {
  # Load the data to be usable by ANCOMBC
  metaData <- read.csv(metaF, row.names = 1)
  metaData <- metaData[, c("Gender", "Group2")]
  
  if (refLevel == "M.VDR-Lyz") {
    metaData$Group2 <- as.factor(metaData$Group2)
    metaData$Group2 <- relevel(metaData$Group2, ref = "M.VDR-Loxp")
    refLevel2<-"F.VDR-Lyz"
    refLevel3<-"F.VDR-Lyz"
  } else if (refLevel == "M.VDR-IEC") {
    metaData$Group2 <- as.factor(metaData$Group2)
    metaData$Group2 <- relevel(metaData$Group2, ref = "F.VDR-Loxp")
    refLevel2<-"F.VDR-IEC"
    refLevel3<-"F.VDR-IEC"
  }
  else if (refLevel == "M.VDR-DEFA") {
    metaData$Group2 <- as.factor(metaData$Group2)
    metaData$Group2 <- relevel(metaData$Group2, ref = "F.VDR-Loxp")
    refLevel2<-"F.VDR-DEFA"
    refLevel3<-"F.VDR-DEFA"
  }
  
  myOTU <- read.csv(file.path(inpDir, inpTax), row.names = 1)
  taxaTable <- breakDownTaxonomy(myOTU)
  
  # Set row names of myOTU
  row.names(myOTU) <- taxaTable$OTU
  
  # Set row names of taxaTable and remove OTU column
  taxaTable <- data.frame(taxaTable[-1], row.names = taxaTable$OTU)
  
  otu_mat <- as.matrix(myOTU)
  tax_mat <- as.matrix(taxaTable)
  
  OTU <- otu_table(otu_mat, taxa_are_rows = TRUE)
  TAX <- tax_table(tax_mat)
  samples <- sample_data(metaData)
  carbom <- phyloseq(OTU, TAX, samples)
  
  tse <- makeTreeSummarizedExperimentFromPhyloseq(carbom)
  
  library(ANCOMBC)
  set.seed(123)
  output <- ancombc2(data = tse, assay_name = "counts", tax_level = NULL,
                     fix_formula = "Group2", rand_formula = NULL,
                     p_adj_method = "holm", pseudo = 0, pseudo_sens = TRUE,
                     prv_cut = 0.10, lib_cut = 1000, s0_perc = 0.05, struc_zero = FALSE, neg_lb = FALSE,
                     alpha = 0.05, n_cl = 2, verbose = TRUE,
                     global = FALSE, pairwise = FALSE, 
                     dunnet = FALSE, trend = FALSE,
                     iter_control = list(tol = 1e-5, max_iter = 20, 
                                         verbose = FALSE),
                     em_control = list(tol = 1e-5, max_iter = 100),
                     lme_control = NULL, mdfdr_control = NULL, 
                     trend_control = NULL)
  res_prim <- output$res
  tab_sens <- output$pseudo_sens_tab
  
  df_subset <- select_columns(res_prim, "taxon", refLevel2, refLevel3)
  
  # Intersect with the taxatable
  # Convert row names to a column in taxaTable
  taxaTable$taxon <- row.names(taxaTable)
  row.names(taxaTable) <- NULL
  
  # Subset taxaTable to include only rows with matching taxa values
  subset_taxaTable <- subset(taxaTable, taxon %in% df_subset$taxon)
  
  # Merge subset_taxaTable and df_subset based on matching taxa values
  merged_df <- merge(subset_taxaTable, df_subset, by = "taxon", all.x = TRUE)
  
  # Reorder columns to have columns from subset_taxaTable at the start
  reordered_cols <- c("taxon", names(subset_taxaTable)[-1], names(df_subset)[-1])
  merged_df <- merged_df[, reordered_cols]
  merged_df <- subset(merged_df, select = -c(taxon, taxon.1))
  
  # Rename the "Level2", "Level3", etc. columns to appropriate phylogenetic ranks
  new_colnames <- c("kingdom", "phylum", "class", "order", "family", "genus", "species")
  for (col in colnames(merged_df)) {
    if (!grepl("Level", col)) {
      new_colnames <- c(new_colnames, col)
    }
  }
  colnames(merged_df) <- new_colnames
  
  # Finally, to export
  substring <- strsplit(inpTax, "\\.csv$")[[1]][1]
  tOut <- paste("ANCOM_BC_selfPair_", refLevel, "_Output_", substring, ".csv", sep="")
  fullOut <- file.path(outDir, tOut)
  write.csv(merged_df, fullOut, row.names = TRUE)
}
runANCOMBC_group1 <- function(inpDir, inpTax, outDir, metaF, refLevel) {
  # Load the data to be usable by ANCOMBC
  metaData <- read.csv(metaF, row.names = 1)
  metaData <- metaData[, c("Gender", "Group")]
  
  metaData$Group <- as.factor(metaData$Group)
  metaData$Group <- relevel(metaData$Group, ref = "VDR-Loxp")
  
  myOTU <- read.csv(file.path(inpDir, inpTax), row.names = 1)
  taxaTable <- breakDownTaxonomy(myOTU)
  
  # Set row names of myOTU
  row.names(myOTU) <- taxaTable$OTU
  
  # Set row names of taxaTable and remove OTU column
  taxaTable <- data.frame(taxaTable[-1], row.names = taxaTable$OTU)
  
  otu_mat <- as.matrix(myOTU)
  tax_mat <- as.matrix(taxaTable)
  
  OTU <- otu_table(otu_mat, taxa_are_rows = TRUE)
  TAX <- tax_table(tax_mat)
  samples <- sample_data(metaData)
  carbom <- phyloseq(OTU, TAX, samples)
  
  tse <- makeTreeSummarizedExperimentFromPhyloseq(carbom)
  
  library(ANCOMBC)
  set.seed(123)
  output <- ancombc2(data = tse, assay_name = "counts", tax_level = NULL,
                     fix_formula = "Group", rand_formula = NULL,
                     p_adj_method = "holm", pseudo = 0, pseudo_sens = TRUE,
                     prv_cut = 0.10, lib_cut = 1000, s0_perc = 0.05, struc_zero = FALSE, neg_lb = FALSE,
                     alpha = 0.05, n_cl = 2, verbose = TRUE,
                     global = FALSE, pairwise = FALSE, 
                     dunnet = FALSE, trend = FALSE,
                     iter_control = list(tol = 1e-5, max_iter = 20, 
                                         verbose = FALSE),
                     em_control = list(tol = 1e-5, max_iter = 100),
                     lme_control = NULL, mdfdr_control = NULL, 
                     trend_control = NULL)
  res_prim <- output$res
  tab_sens <- output$pseudo_sens_tab
  
  df_subset <- res_prim
  
  # Intersect with the taxatable
  # Convert row names to a column in taxaTable
  taxaTable$taxon <- row.names(taxaTable)
  row.names(taxaTable) <- NULL
  
  # Subset taxaTable to include only rows with matching taxa values
  subset_taxaTable <- subset(taxaTable, taxon %in% df_subset$taxon)
  
  # Merge subset_taxaTable and df_subset based on matching taxa values
  merged_df <- merge(subset_taxaTable, df_subset, by = "taxon", all.x = TRUE)
  
  # Reorder columns to have columns from subset_taxaTable at the start
  reordered_cols <- c("taxon", names(subset_taxaTable)[-1], names(df_subset)[-1])
  merged_df <- merged_df[, reordered_cols]
  merged_df <- subset(merged_df, select = -c(taxon, taxon.1))
  
  # Rename the "Level2", "Level3", etc. columns to appropriate phylogenetic ranks
  new_colnames <- c("kingdom", "phylum", "class", "order", "family", "genus", "species")
  for (col in colnames(merged_df)) {
    if (!grepl("Level", col)) {
      new_colnames <- c(new_colnames, col)
    }
  }
  colnames(merged_df) <- new_colnames
  
  # Finally, to export
  substring <- strsplit(inpTax, "\\.csv$")[[1]][1]
  tOut <- paste("ANCOM_BC_Group1_", "_Output_", substring, ".csv", sep="")
  fullOut <- file.path(outDir, tOut)
  write.csv(merged_df, fullOut, row.names = TRUE)
}

#Inputs-------------
inpDir <- "C:/Users/Duncan/Desktop/UIC/SunRotation/Mycobiome/Data/OTU_Tables/Archaea_Raw"
outDir <- "C:/Users/Duncan/Desktop/UIC/SunRotation/Mycobiome/Output/ANCOM_Archaea"
metaF <- "C:/Users/Duncan/Desktop/UIC/SunRotation/Mycobiome/Data/metadata.csv"
ref1<-"M.VDR-Loxp"
ref2<-"F.VDR-Loxp"

file_names <- list.files(inpDir, pattern = "\\.csv$", full.names = TRUE)
# Loop over the file names
for (file_name in file_names) {
  file_name_without_dir <- basename(file_name)
  
  if (file_name_without_dir=="Eukaryota_L2.csv"){
    next
  }
  
  print("Running on...")
  print(file_name_without_dir)
  # Perform operations on the data...
  runANCOMBC(inpDir,file_name_without_dir,outDir,metaF,ref1)
  
  print("Individual tests...")
  toInd<-c("M.VDR-DEFA","M.VDR-IEC","M.VDR-Lyz")
  for (indI in toInd){
    print(indI)
    runANCOMBC_within(inpDir,file_name_without_dir,outDir,metaF,indI)
  }
  
  print("Group without gender...")
  runANCOMBC_group1(inpDir,file_name_without_dir,outDir,metaF,indI)
  
}