Load Required Libraries

library(readxl)   
## Warning: package 'readxl' was built under R version 4.3.2
library(calibrate) 
## Loading required package: MASS
## Warning: package 'MASS' was built under R version 4.3.2
library(dplyr)    
## Warning: package 'dplyr' was built under R version 4.3.2
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)  
## Warning: package 'ggplot2' was built under R version 4.3.3
# Load the configuration
config <- yaml::read_yaml("config.yml")

# Set base directory and subdirectories
base_directory <- config$base_directory
data_directory <- file.path(base_directory, config$data_directory)
results_directory <- file.path(base_directory, config$results_directory)


source(paste0(base_directory,"/common_functions/common_functions.R"))  # Custom functions

#Import Results from Excel Files

# Load data from different sheets
HP <- as.data.frame(read_excel(paste0(data_directory,"/22_0322_alphaSyn_fly_pilot_Classes.xlsx"),
                               sheet = "HILIC-pos", skip = 4), stringsAsFactors = FALSE)
## New names:
## • `` -> `...23`
## • `p value` -> `p value...25`
## • `p value` -> `p value...28`
HN <- as.data.frame(read_excel(paste0(data_directory,"/22_0322_alphaSyn_fly_pilot_Classes.xlsx"),
                               sheet = "HILIC-neg", skip = 4), stringsAsFactors = FALSE)
## New names:
## • `` -> `...23`
## • `p value` -> `p value...25`
## • `p value` -> `p value...28`
CP <- as.data.frame(read_excel(paste0(data_directory,"/22_0322_alphaSyn_fly_pilot_Classes.xlsx"),
                               sheet = "C8-pos", skip = 4), stringsAsFactors = FALSE)
## New names:
## • `` -> `...23`
## • `p value` -> `p value...25`
## • `p value` -> `p value...28`
CN <- as.data.frame(read_excel(paste0(data_directory,"/22_0322_alphaSyn_fly_pilot_Classes.xlsx"),
                               sheet = "C18-neg", skip = 4), stringsAsFactors = FALSE)
## New names:
## • `` -> `...23`
## • `p value` -> `p value...25`
## • `p value` -> `p value...28`

#Import metadata

# Load metadata from the same Excel file
HP_metadata <- as.data.frame(read_excel(paste0(data_directory,"/22_0322_alphaSyn_fly_pilot_Classes.xlsx"),
                                        sheet = "HILIC-pos", range = "J1:V5"), stringsAsFactors = FALSE)
## New names:
## • `` -> `...1`
HN_metadata <- as.data.frame(read_excel(paste0(data_directory,"/22_0322_alphaSyn_fly_pilot_Classes.xlsx"),
                                        sheet = "HILIC-neg", range = "J1:V5"), stringsAsFactors = FALSE)
## New names:
## • `` -> `...1`
CP_metadata <- as.data.frame(read_excel(paste0(data_directory,"/22_0322_alphaSyn_fly_pilot_Classes.xlsx"),
                                        sheet = "C8-pos", range = "J1:V5"), stringsAsFactors = FALSE)
## New names:
## • `` -> `...1`
CN_metadata <- as.data.frame(read_excel(paste0(data_directory,"/22_0322_alphaSyn_fly_pilot_Classes.xlsx"),
                                        sheet = "C18-neg", range = "J1:V5"), stringsAsFactors = FALSE)
## New names:
## • `` -> `...1`
# Extract study metadata from HN sheet
s_m <- as.character(HN_metadata[2, -1])

#Combine and Clean Data

# Combine all datasets into one dataframe
df <- rbind(HP, HN, CP, CN)

# Remove irrelevant columns by their names
df <- df[, !colnames(df) %in% c("...23", "0.05")]

# Rename specific columns for clarity
colnames(df)[colnames(df) == "p value...25"] <- "p value_Syn_vs_Ctrl"
colnames(df)[colnames(df) == "p value...28"] <- "p value_AKI_KD_Syn_vs_AKI_KD_Ctrl"

#Identify and Handle Duplicated Metabolites Metabolites annotated in more than one

#Duplicated metabolites
dup_metabs <- df$Metabolite[!is.na(df$Metabolite)][duplicated(df$Metabolite[!is.na(df$Metabolite)])]
print(dup_metabs)
##  [1] "Aspartic acid"          "Cytidine"               "Glutamic acid"         
##  [4] "Glutathione"            "Guanosine"              "Hexose"                
##  [7] "N-Acetylglutamic acid"  "Nicotinic acid"         "Pantothenic acid"      
## [10] "Uric acid"              "Xanthine"               "LPC 14:0"              
## [13] "LPC 16:1"               "LPC 16:0"               "LPC 18:3"              
## [16] "LPC 18:2"               "LPC 18:1"               "LPC 18:0"              
## [19] "LPE 16:1"               "LPE 18:3"               "LPE 18:2"              
## [22] "LPE 18:1"               "LPE 18:0"               "PC 30:1"               
## [25] "PC 30:0"                "PC 32:2"                "PC 32:0"               
## [28] "PC 34:4"                "PC 34:1"                "PC 36:3"               
## [31] "PC P-34:1 or PC O-34:2" "PC P-34:0 or PC O-34:1" "PC P-36:2 or PC O-36:3"
## [34] "PC P-36:1 or PC O-36:2" "PE 32:1"                "PE 34:2"               
## [37] "PE 34:0"                "PE 36:4"                "PE 36:3"               
## [40] "PE 36:2"                "PE 38:6"                "PE P-34:2 or PE O-34:3"
## [43] "PE P-34:1 or PE O-34:2" "PE P-36:2 or PE O-36:3" "PE P-36:1 or PE O-36:2"
## [46] "Palmitoyl-EA"           "Cer 18:1;O2/16:0"       "DG 34:3"               
## [49] "DG 36:3"                "DG 36:4"                "TG 44:1"               
## [52] "TG 44:2"                "TG 46:2"                "Hexose"                
## [55] "Sebacic acid"           "Myristoleic acid"       "Palmitoyl-EA"          
## [58] "CAR 14:0"               "N-Acetylalanine"        "2-Aminooctanoic acid"
# Read decision file for duplicates and remove unwanted duplicates
toRemove <- read.csv(paste0(data_directory,"/dup_metabs_decision.csv"))
table(toRemove$Decision)
## 
##      KEEP 
##   60   58
dim(df)
## [1] 568  26
for(r in 1:nrow(toRemove)){
  if(toRemove$Decision[r] == ""){
    loc_toRm <- which(toRemove$Method[r] == df$Method & toRemove$Compound_ID[r] == df$Compound_ID)
    print(df[loc_toRm,c("Method","Metabolite")])
    df <- df[!(toRemove$Method[r] == df$Method & toRemove$Compound_ID[r] == df$Compound_ID),]
  }
}
##      Method           Metabolite
## 558 C18-neg 2-Aminooctanoic acid
##       Method    Metabolite
## 57 HILIC-pos Aspartic acid
##      Method Metabolite
## 549 C18-neg   CAR 14:0
##        Method       Metabolite
## 252 HILIC-pos Cer 18:1;O2/16:0
##        Method Metabolite
## 296 HILIC-neg   Cytidine
##        Method Metabolite
## 173 HILIC-pos    DG 34:3
##        Method Metabolite
## 175 HILIC-pos    DG 36:3
##        Method Metabolite
## 174 HILIC-pos    DG 36:4
##       Method    Metabolite
## 63 HILIC-pos Glutamic acid
##        Method  Metabolite
## 103 HILIC-pos Glutathione
##        Method Metabolite
## 306 HILIC-neg  Guanosine
##       Method Metabolite
## 17 HILIC-pos     Hexose
##      Method Metabolite
## 493 C18-neg     Hexose
##     Method Metabolite
## 347 C8-pos   LPC 14:0
##     Method Metabolite
## 349 C8-pos   LPC 16:0
##     Method Metabolite
## 348 C8-pos   LPC 16:1
##     Method Metabolite
## 353 C8-pos   LPC 18:0
##     Method Metabolite
## 352 C8-pos   LPC 18:1
##     Method Metabolite
## 351 C8-pos   LPC 18:2
##     Method Metabolite
## 350 C8-pos   LPC 18:3
##     Method Metabolite
## 355 C8-pos   LPE 16:1
##     Method Metabolite
## 360 C8-pos   LPE 18:0
##     Method Metabolite
## 359 C8-pos   LPE 18:1
##     Method Metabolite
## 358 C8-pos   LPE 18:2
##     Method Metabolite
## 357 C8-pos   LPE 18:3
##        Method       Metabolite
## 151 HILIC-pos Myristoleic acid
##      Method      Metabolite
## 557 C18-neg N-Acetylalanine
##       Method            Metabolite
## 84 HILIC-pos N-Acetylglutamic acid
##        Method     Metabolite
## 140 HILIC-pos Nicotinic acid
##        Method   Metabolite
## 155 HILIC-pos Palmitoyl-EA
##      Method   Metabolite
## 548 C18-neg Palmitoyl-EA
##       Method       Metabolite
## 90 HILIC-pos Pantothenic acid
##        Method Metabolite
## 203 HILIC-pos    PC 30:0
##        Method Metabolite
## 202 HILIC-pos    PC 30:1
##        Method Metabolite
## 207 HILIC-pos    PC 32:0
##        Method Metabolite
## 206 HILIC-pos    PC 32:2
##        Method Metabolite
## 212 HILIC-pos    PC 34:1
##        Method Metabolite
## 211 HILIC-pos    PC 34:4
##        Method Metabolite
## 217 HILIC-pos    PC 36:3
##        Method             Metabolite
## 195 HILIC-pos PC P-34:0 or PC O-34:1
##        Method             Metabolite
## 194 HILIC-pos PC P-34:1 or PC O-34:2
##        Method             Metabolite
## 199 HILIC-pos PC P-36:1 or PC O-36:2
##        Method             Metabolite
## 198 HILIC-pos PC P-36:2 or PC O-36:3
##        Method Metabolite
## 236 HILIC-pos    PE 32:1
##        Method Metabolite
## 239 HILIC-pos    PE 34:0
##        Method Metabolite
## 237 HILIC-pos    PE 34:2
##        Method Metabolite
## 243 HILIC-pos    PE 36:2
##        Method Metabolite
## 242 HILIC-pos    PE 36:3
##        Method Metabolite
## 241 HILIC-pos    PE 36:4
##        Method Metabolite
## 244 HILIC-pos    PE 38:6
##        Method             Metabolite
## 230 HILIC-pos PE P-34:1 or PE O-34:2
##        Method             Metabolite
## 229 HILIC-pos PE P-34:2 or PE O-34:3
##        Method             Metabolite
## 233 HILIC-pos PE P-36:1 or PE O-36:2
##        Method             Metabolite
## 232 HILIC-pos PE P-36:2 or PE O-36:3
##      Method   Metabolite
## 502 C18-neg Sebacic acid
##        Method Metabolite
## 177 HILIC-pos    TG 44:1
##        Method Metabolite
## 176 HILIC-pos    TG 44:2
##        Method Metabolite
## 178 HILIC-pos    TG 46:2
##        Method Metabolite
## 342 HILIC-neg  Uric acid
##       Method Metabolite
## 30 HILIC-pos   Xanthine
dim(df)
## [1] 508  26
#Check any duplicates left:
df$Metabolite[!is.na(df$Metabolite)][duplicated(df$Metabolite[!is.na(df$Metabolite)])]
## character(0)

#Remove Unwanted Data

# Remove internal standards and incorrectly identified metabolites
df <- df[!grepl("nternal", df$HMDB_ID), ]
df <- df[!grepl("xypurinol", df$Metabolite), ]

# Remove rows with unknown metabolites
df <- df[!is.na(df$Metabolite), ]

#Split Data into Metadata and Values

# Separate the metadata and measurement data
m <- df[, c(1:10, 23:26)]
a <- as.matrix(df[, c(11:22)])

#Impute values

# Drop metabolites with no measurements
toDrop <- which(rowSums(is.na(a)) == ncol(a))
if(length(toDrop) > 0){
  m <- m[-toDrop, ]
  a <- a[-toDrop, ]
}

# Apply custom imputation methods
a <- rm.zeros(a)
a <- fun.impute(a)
a_backup <- a

#Impute Missing Values and Remove Zero-Variance Metabolites

#Drop metabolites with no measurements
toDrop <- which(apply(a,1,FUN = function(x){return(sum(is.na(x)))}) == ncol(a))
#Empty metabolites
if(length(toDrop) > 0){
  m[toDrop,]
  a <- a[-toDrop,]
  m <- m[-toDrop,]
  
}

a <- rm.zeros(a)
a <- fun.impute(a)
a_backup <- a

# Remove zero-variance metabolites (often internal standards if IS-scaled data)
toKeep <- apply(a, 1, sd, na.rm = TRUE) != 0
a <- a[toKeep, ]

#Perform PCA (Principal Component Analysis)

# Conduct PCA on the log-transformed data
pcaResults <- prcomp(t(log(a)), scale. = TRUE, center = TRUE)

# Create a dataframe with PCA results and additional metadata
PC <- as.data.frame(pcaResults$x) %>%
      mutate(names = rownames(pcaResults$x), s_m = s_m) %>%
      mutate(median = apply(a, 2, median, na.rm = TRUE),
             mean = apply(a, 2, mean, na.rm = TRUE),
             sum = apply(a, 2, sum, na.rm = TRUE))

# Plot PCA results
ggplot(PC, aes(x = PC1, y = PC2, color = s_m)) + 
  geom_point() + 
  theme(legend.title = element_blank()) + 
  ggtitle("PCA Scores Plot for Known Metabolites (HP, HN, CP, CN)") + 
  scale_x_continuous(name = paste0("[PCA1] ", var.fun(pcaResults)[1], " %")) +
  scale_y_continuous(name = paste0("[PCA2] ", var.fun(pcaResults)[2], " %"))

#Write

pdf(paste0(results_directory,"/pca_knowns_HP-HN-CP-CN.pdf"),8,8)
ggplot(PC, aes(x = PC1, y = PC2, color=s_m)) + 
  geom_point() + theme(legend.title = element_blank()) + ggtitle("PCA Scores plot knowns HP and HN, CP, CN") + 
  scale_x_continuous(name=paste0("[PCA1] ", var.fun(pcaResults)[1], " %")) +
  scale_y_continuous(name=paste0("[PCA2] ", var.fun(pcaResults)[2], " %"))
dev.off()
## png 
##   2