#### 1. Directory

conda activate R-4.1.2
cd /mnt/05b41c36-2c2a-47ff-8539-154dec9a2e7d/Colaboraciones/Roberto_Elizondo_UdeC/2024/


#### 2. Data

R


## 1. Load data

# Load necessary library for data manipulation
library(dplyr)

# Read the CSV file
data <- read.csv("Affymetrix.csv", header = TRUE)

# Check the first few rows to ensure it's read correctly
head(data)

# Select the desired columns
selected_data <- data %>% 
  select(SYMBOL, `HC1.CEL`, `HC2.CEL`, `HC3.CEL`, `OAC1.CEL`, `OAC2.CEL`, `OAC3.CEL`, `OAC4.CEL`, `OAC1mito.CEL`, `OAC2mito.CEL`, `OAC3mito.CEL`, `OAC4mito.CEL`)

# Set the SYMBOL column as row names
row.names(selected_data) <- selected_data$SYMBOL

# Remove the SYMBOL column from the data frame
selected_data <- selected_data[, -which(names(selected_data) == "SYMBOL")]

# Update column names by removing "CEL"
names(selected_data) <- gsub(".CEL", "", names(selected_data))

# View the updated column names
print(names(selected_data))

# View the first few rows of the updated data
print(head(selected_data))

# Drop columns that start with "HC"
selected_data <- selected_data %>%
  select(-starts_with("HC"))


## 2. Limma 

library(limma)

# Create a vector indicating the Subject for each sample
# Alternating between subjects for "OAC" and "mito" samples
Subject <- factor(rep(1:4, times=2))
Subject

# Create a vector indicating the Treatment for each sample
# Assuming the first half of the samples are "OAC" and the second half are "mito"
Treatment <- factor(rep(c("OAC", "mito"), each=4))

# Your design matrix as previously defined
design <- model.matrix(~Subject + Treatment)

# Fit linear models to each gene using the design matrix
fit <- lmFit(selected_data, design)

# Assuming 'TreatmentOAC' is the column for the OAC treatment in your design matrix
# and 'mito' is the baseline level (and thus not explicitly present in the design matrix)
contrast.matrix <- makeContrasts(OACvsMito = TreatmentOAC, levels = design)

# Re-fit the models using the contrast matrix
fit2 <- contrasts.fit(fit, contrast.matrix)

# Apply empirical Bayes smoothing
fit2 <- eBayes(fit2)

# Extract the top differentially expressed genes, focusing on the "OAC" vs. "mito" comparison
topGenes <- topTable(fit2, coef="OACvsMito", adjust="BH", sort.by="P", number=Inf)

# Filter to keep only genes with adj.P.Val < 0.05
significantGenes <- topGenes[topGenes$adj.P.Val < 0.05, ]

# View the table of significant genes
print(significantGenes)

# Save the table of significant genes to a CSV file
write.csv(significantGenes, "significant_genes.csv", row.names = FALSE)

# Save the entire table of analyzed genes to a CSV file
write.csv(topGenes, "all_genes.csv", row.names = TRUE)



#####################
### Z-score plots ###
#####################

cd /mnt/05b41c36-2c2a-47ff-8539-154dec9a2e7d/Colaboraciones/Roberto_Elizondo_UdeC/2024/
conda activate base

python3

import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.stats import zscore


# Load the CSV containing the gene expression data
all_genes = pd.read_csv('all_genes.csv')

# Assuming 'logFC' is the column with the fold change values
# Multiply the fold change values by -1
all_genes['logFC'] = all_genes['logFC'] * -1

# Save the modified dataframe back to a CSV file
all_genes.to_csv('all_genes_corrected_FoldChange.csv', index=False)

# Read the files to get the correct column names and correct any issues
all_genes = pd.read_csv('all_genes_corrected_FoldChange.csv')
affymetrix_intensities = pd.read_csv('Affymetrix_RMA_intensities.csv')

# The gene column in all_genes.csv might have a different name; let's check the columns
gene_column_name = all_genes.columns[0]  # Assuming the first column is the gene names

# Filter the significant genes from all_genes.csv
significant_genes = all_genes[all_genes['adj.P.Val'] < 0.05]

# Get the gene names from the filtered dataframe
significant_gene_names = significant_genes[gene_column_name]

# Assuming the first column in Affymetrix_RMA_intensities.csv is the gene names, set it as index
affymetrix_intensities.set_index(affymetrix_intensities.columns[0], inplace=True)

# Now we will intersect the significant genes with the affymetrix values
# and keep only the rows for significant genes
significant_expression_data = affymetrix_intensities.loc[significant_gene_names]

# Set the font scale for the heatmap
sns.set(font_scale=1.5)

# Create a heatmap with clustering on rows (genes) and columns (samples)
heatmap = sns.clustermap(significant_expression_data, cmap='vlag', z_score=0, figsize=(10, 15),
                         row_cluster=True, col_cluster=False)

# Add title and legend annotation to the heatmap
plt.title("RMA intensities")

# Save the plot as a PDF file
heatmap.savefig('significant_genes_heatmap_clustered.pdf')


#####################
### Gene pathways ###
#####################

# pip install gprofiler-official

from gprofiler import GProfiler
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Initialize g:Profiler client
gp = GProfiler(return_dataframe=True)

# List of genes for pathway analysis
gene_list = [
    "UCP2", "RCAN2", "FOS", "EGR1", "IER2", "ZFP36", "TSPYL6", "MX2",
    "DDX60L", "HERC5", "IFIT2", "RSAD2", "CMPK2", "IFIH1", "IFIT3",
    "RTP4", "USP18", "IFI44", "IFIT1", "MX1", "OAS1", "USP41",
    "DDX58", "CCNE2", "EXO1", "AL589947.1", "FGL2"
]

# Perform pathway analysis
pathway_results = gp.profile(organism='hsapiens', query=gene_list)

# Filter for GO:BP terms
go_bp_results = pathway_results[(pathway_results['source'] == 'GO:BP') & (pathway_results['p_value'] < 0.01)]
go_bp_results['-log10_p_value'] = -np.log10(go_bp_results['p_value'])

# Filter for Transcription Factors (TF) - This requires specific knowledge of how TFs are represented in your results
tf_results = pathway_results[(pathway_results['source'] == 'TF') & (pathway_results['p_value'] < 0.01)]
tf_results['-log10_p_value'] = -np.log10(tf_results['p_value'])

# Filter for KEGG pathways
kegg_results = pathway_results[(pathway_results['source'] == 'KEGG') & (pathway_results['p_value'] < 0.01)]
kegg_results['-log10_p_value'] = -np.log10(kegg_results['p_value'])

# Filter for Reactome (REAC) pathways
reac_results = pathway_results[(pathway_results['source'] == 'REAC') & (pathway_results['p_value'] < 0.01)]
reac_results['-log10_p_value'] = -np.log10(reac_results['p_value'])


# Plot for GO:BP terms
plt.figure(figsize=(10, 8))
sns.barplot(x='-log10_p_value', y='name', data=go_bp_results)
plt.xlabel('-log10 P-Value')
plt.ylabel('GO Biological Process')
plt.title('GO:BP Analysis of Selected Genes')
plt.tight_layout()
plt.savefig('go_bp_analysis.pdf')
plt.close()

# Plot for TF - Uncomment and modify based on how TFs are represented in your results
plt.figure(figsize=(10, 12))
sns.barplot(x='-log10_p_value', y='name', data=tf_results)
plt.xlabel('-log10 P-Value')
plt.ylabel('Transcription Factors')
plt.title('TF Analysis of Selected Genes')
plt.tight_layout()
plt.savefig('tf_analysis.pdf')
plt.close()

# Plot for KEGG pathways
plt.figure(figsize=(9, 5.5))
sns.barplot(x='-log10_p_value', y='name', data=kegg_results)
plt.xlabel('-log10 P-Value')
plt.ylabel('KEGG Pathway')
plt.title('KEGG Pathway Analysis of Selected Genes')
plt.tight_layout()
plt.savefig('kegg_pathway_analysis.pdf')
plt.close()

# Plot for Reactome (REAC) pathways
plt.figure(figsize=(7, 4.0))
sns.barplot(x='-log10_p_value', y='name', data=reac_results)
plt.xlabel('-log10 P-Value')
plt.ylabel('Reactome Pathway')
plt.title('Reactome Pathway Analysis of Selected Genes')
plt.tight_layout()
plt.savefig('reactome_pathway_analysis.pdf')
plt.close()



############
### GSEA ###
############

import pandas as pd
import numpy as np
import gseapy as gp

gene_list = [
    "UCP2", "RCAN2", "FOS", "EGR1", "IER2", "ZFP36", "TSPYL6", "MX2",
    "DDX60L", "HERC5", "IFIT2", "RSAD2", "CMPK2", "IFIH1", "IFIT3",
    "RTP4", "USP18", "IFI44", "IFIT1", "MX1", "OAS1", "USP41",
    "DDX58", "CCNE2", "EXO1", "AL589947.1", "FGL2"
]


# Mock ranking metric for demonstration (e.g., log fold change)
np.random.seed(42)  # For reproducibility
ranking_metric = np.random.randn(len(gene_list))  # Generating random values as mock ranking metric

# Create a Pandas Series with gene names as index and the mock ranking metric as values
ranked_gene_list = pd.Series(ranking_metric, index=gene_list)

# Define the path to the gene sets database or use one of the MSigDB collections
gene_sets = 'h.all.v7.2.symbols.gmt'  # Example: Hallmark gene sets from MSigDB

# Run GSEA
gsea_results = gp.prerank(rnk=ranked_gene_list, gene_sets=gene_sets,
                          min_size=15, max_size=500, outdir='gsea_output',
                          permutation_num=1000, figsize=(6,6))




###################
### VolcanoPlot ###
###################


library(EnhancedVolcano)
library(magrittr)

#load table

directory <- "/home/ccarrilp/Desktop/Affimetrix_Robert_2023/con_carlos"
out_directory <- "/home/ccarrilp/Desktop/Affimetrix_Robert_2023/con_carlos"

data1 <- read.csv(file = "/home/ccarrilp/Desktop/Affimetrix_Robert_2023/con_carlos/21_03_2024/all_genes_corrected_FoldChange.csv", header = TRUE, sep = ",")

volcano_plot <- EnhancedVolcano(data1,
                                lab = data1$X,
                                x = "logFC",
                                y = "adj.P.Val",
                                title = "OACvsOACmito",
                                xlab = bquote(~Log[2]~ "fold change"),
                                pCutoff = 0.05,
                                FCcutoff = 1,
                                xlim = c(-5,5),
                                ylim = c(0, -log10(10e-06)),
                                pointSize = 4.0,
                                labSize = 6.0,
                                labCol = "black",
                                labFace = "bold",
                                boxedLabels = TRUE,
                                colAlpha = 4/5,
                                legendPosition = "right",
                                legendLabSize = 12,
                                legendIconSize = 4.0,
                                drawConnectors = TRUE,
                                widthConnectors = 0.75,
                                colConnectors = "black")
                                #lengthConnectors = 10,
                                #max.overlaps = 20)
volcano_plot
