#!/usr/bin/env Rscript

#Load necessary libraries
library(DESeq2)
library(ggplot2)
library(ggrepel)
library(EnhancedVolcano)
library(pheatmap)

# Directory holding the count files. The script reads its inputs from here and
# writes its PDF/CSV outputs here too, because every path below is relative.
# Defaults to the original analysis directory; pass a different directory as
# the first command-line argument to override it, e.g.
#   Rscript this_script.R /path/to/counts
cli_args <- commandArgs(trailingOnly = TRUE)
data_dir <- if (length(cli_args) >= 1L) {
  cli_args[[1L]]
} else {
  "/home/farshchian/December_2023_data/trimmed_data/Analysis2"
}
# Fail early with a clear message instead of a generic setwd() error.
if (!dir.exists(data_dir)) {
  stop("Count-file directory does not exist: ", data_dir, call. = FALSE)
}
setwd(data_dir)

# Count files grouped by experimental condition. The list names are the
# condition labels; each element lists that condition's replicate files.
sample_files <- list(
  Control = c("PZ2-CTL.counts", "PZ2-CTL-II.counts"),
  Gemcitabine = c("PZ2-GEM.counts", "PZ2-GEM-II.counts"),
  Gemcitabine_EV = c("PZ2-GEM-EV.counts", "PZ2-GEM-EV-II.counts"),
  sTRAIL = c("PZ2-Strail.counts", "PZ2-Strail-II.counts"),
  Combo = c("PZ2-GEM-STRAIL.counts", "PZ2-GEM-STRAIL-II.counts")
)

# One row per count file, mapping it to its condition.
# Each condition name is repeated once per file it actually owns, so adding or
# removing a replicate in `sample_files` cannot shift labels onto the wrong
# files (a fixed `each = 2` would mislabel whenever group sizes differ).
samples <- data.frame(
  condition = rep(names(sample_files), times = lengths(sample_files)),
  filename = unlist(sample_files),
  # Keep file names as plain character strings on every R version.
  stringsAsFactors = FALSE
)

# Read one header-less, two-column (gene, count) file and return its counts.
#
# Arguments:
#   file            Path to the count file (whitespace-delimited, no header).
#   referenceGenes  Gene identifiers, in order, that the file's first column
#                   must match exactly; pass NULL to skip the check.
# Returns:
#   The count column, in file order.
# Errors:
#   Stops if the gene column is not identical to `referenceGenes`.
read_count_file <- function(file, referenceGenes) {
  counts <- read.table(
    file,
    header = FALSE,
    col.names = c("gene", "count"),
    # Gene identifiers may contain quote characters or '#'. read.table's
    # defaults would treat those as quoting/comment markers and corrupt or
    # reject the row, so both features are switched off.
    quote = "",
    comment.char = ""
  )
  if (!is.null(referenceGenes) && !identical(counts$gene, referenceGenes)) {
    stop("Gene order or content differs in file: ", file, call. = FALSE)
  }
  counts$count
}

# Read the first file to get the reference list of genes that every other
# file must match. Quote and comment handling are disabled so identifiers
# containing quote characters or '#' are read verbatim.
referenceGenes <- read.table(
  samples$filename[1],
  header = FALSE,
  col.names = c("gene", "count"),
  quote = "",
  comment.char = ""
)$gene

# Load the count column of every file (each is checked against the reference
# gene list) and bind them into a genes x samples matrix.
countDataList <- lapply(samples$filename, read_count_file, referenceGenes = referenceGenes)
countMatrix <- do.call(cbind, countDataList)
rownames(countMatrix) <- referenceGenes
# Label columns with the sample name (file name minus the ".counts" suffix)
# so downstream tables and plots can be traced back to their input file.
colnames(countMatrix) <- sub("\\.counts$", "", samples$filename)

# Drop summary rows such as "__no_feature" / "__ambiguous". These look like the
# trailer rows htseq-count appends (presumably the source of these files --
# confirm); they are not genes and must not be tested. No-op if none exist.
is_summary_row <- startsWith(as.character(referenceGenes), "__")
if (any(is_summary_row)) {
  message("Dropping ", sum(is_summary_row), " non-gene summary row(s) from the count matrix.")
  countMatrix <- countMatrix[!is_summary_row, , drop = FALSE]
}

# Fix the order of the condition factor. "Control" is listed first, which
# makes it the factor's first (reference) level.
condition_levels <- c("Control", "Gemcitabine", "Gemcitabine_EV", "sTRAIL", "Combo")
samples$condition <- factor(samples$condition, levels = condition_levels)

# Per-sample annotation: a single `condition` column, one row per matrix column.
colData <- DataFrame(condition = samples$condition)

# Build the DESeqDataSet with a one-factor design and run the DESeq pipeline
# on it in a single step.
dds <- DESeq(
  DESeqDataSetFromMatrix(
    countData = countMatrix,
    colData = colData,
    design = ~ condition
  )
)

# Every unordered pair of condition levels. Each list element is a length-2
# character vector in factor-level order; the list itself is unnamed.
comparisons <- combn(x = levels(samples$condition), m = 2, simplify = FALSE)

# Genes to label on the MA plots.
genes_of_interest <- c(
  "PTGS2", "TOP2A", "MKI67", "COL1A1", "POSTN", "SPINK1", "IL6",
  "FN1", "SOX9", "IL33", "IL37", "FOSB", "SNAI1", "NR5A2", "CXCL8",
  "PINCR", "KIF20A", "CXCR4", "POU3F2"
)

# For every pairwise comparison: extract the DESeq2 results, draw an MA plot
# with the genes of interest labeled, and save the plot (PDF) and the results
# table (CSV) into the working directory.
for (i in seq_along(comparisons)) {

  # combn() lists each pair in factor-level order; swap the pair so the
  # contrast reads "later level vs earlier level" (e.g. treatment vs Control).
  reference_level <- comparisons[[i]][1]
  test_level <- comparisons[[i]][2]
  contrast <- c("condition", test_level, reference_level)

  # Extract results using the contrast.
  # NOTE(review): results() is left at its default `alpha` (0.1) while the plot
  # below highlights padj <= 0.05. If 0.05 is the intended FDR cutoff, consider
  # passing alpha = 0.05 here -- this would change the reported padj values.
  res <- results(dds, contrast = contrast)

  # Plain results table: this is exactly what gets written to the CSV.
  res_table <- as.data.frame(res)

  # Plotting copy with the extra helper columns.
  resdata <- res_table
  resdata$log10baseMean <- log10(resdata$baseMean + 1) # +1 avoids log10(0)
  resdata$gene <- rownames(resdata)
  resdata$in_interest <- resdata$gene %in% genes_of_interest

  ma_title <- paste(test_level, "vs", reference_level, "MA Plot")

  # MA plot: mean expression (log10) on x, log2 fold change on y. The colour
  # scale covers padj in [0, 0.05] only; points outside that range or with
  # NA padj are expected to fall back to the `na.value` grey.
  ma_plot <- ggplot(data = resdata, aes(x = log10baseMean, y = log2FoldChange)) +
    geom_point(aes(color = padj), alpha = 0.4) +
    scale_color_gradient(
      low = "blue", high = "red", na.value = "grey50",
      limits = c(0, 0.05) # spelled out; `limit` relied on partial matching
    ) +
    geom_hline(yintercept = 0, linetype = "dashed") +
    geom_vline(xintercept = 0, linetype = "dashed") +
    geom_label_repel(
      data = subset(resdata, in_interest & !is.na(padj)),
      aes(label = gene), size = 3,
      box.padding = unit(0.35, "lines"),
      point.padding = unit(0.5, "lines")
    ) +
    labs(title = ma_title, color = "Adjusted P value") +
    theme_minimal()

  # Shared prefix of both output files, e.g. "GemcitabinevsControl".
  output_prefix <- paste0(test_level, "vs", reference_level)

  # Save the MA plot to a PDF file. `finally` closes the device even if
  # rendering fails, so a broken plot cannot leave a dangling graphics device.
  pdf(file = paste0(output_prefix, "_labeled_MA_plot_present.pdf"))
  tryCatch(print(ma_plot), finally = dev.off())

  # Save the results to a CSV file (gene names are written as row names).
  write.csv(res_table, file = paste0(output_prefix, "_DESeq2_results.csv"))
}

# Successful completion message
cat("Differential expression analysis completed successfully.\n")
