library(readxl)
## Warning: package 'readxl' was built under R version 4.3.2
library(calibrate)
## Loading required package: MASS
## Warning: package 'MASS' was built under R version 4.3.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.2
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
# Load the configuration
config <- yaml::read_yaml("config.yml")
# Set base directory and subdirectories
base_directory <- config$base_directory
data_directory <- file.path(base_directory, config$data_directory)
results_directory <- file.path(base_directory, config$results_directory)
source(paste0(base_directory,"/common_functions/common_functions.R")) # Custom functions
#Import Results from Excel Files
# Load data from different sheets
HP <- as.data.frame(read_excel(paste0(data_directory,"/22_0322_alphaSyn_fly_pilot_Classes.xlsx"),
sheet = "HILIC-pos", skip = 4), stringsAsFactors = FALSE)
## New names:
## • `` -> `...23`
## • `p value` -> `p value...25`
## • `p value` -> `p value...28`
HN <- as.data.frame(read_excel(paste0(data_directory,"/22_0322_alphaSyn_fly_pilot_Classes.xlsx"),
sheet = "HILIC-neg", skip = 4), stringsAsFactors = FALSE)
## New names:
## • `` -> `...23`
## • `p value` -> `p value...25`
## • `p value` -> `p value...28`
CP <- as.data.frame(read_excel(paste0(data_directory,"/22_0322_alphaSyn_fly_pilot_Classes.xlsx"),
sheet = "C8-pos", skip = 4), stringsAsFactors = FALSE)
## New names:
## • `` -> `...23`
## • `p value` -> `p value...25`
## • `p value` -> `p value...28`
CN <- as.data.frame(read_excel(paste0(data_directory,"/22_0322_alphaSyn_fly_pilot_Classes.xlsx"),
sheet = "C18-neg", skip = 4), stringsAsFactors = FALSE)
## New names:
## • `` -> `...23`
## • `p value` -> `p value...25`
## • `p value` -> `p value...28`
#Import metadata
# Load metadata from the same Excel file
HP_metadata <- as.data.frame(read_excel(paste0(data_directory,"/22_0322_alphaSyn_fly_pilot_Classes.xlsx"),
sheet = "HILIC-pos", range = "J1:V5"), stringsAsFactors = FALSE)
## New names:
## • `` -> `...1`
HN_metadata <- as.data.frame(read_excel(paste0(data_directory,"/22_0322_alphaSyn_fly_pilot_Classes.xlsx"),
sheet = "HILIC-neg", range = "J1:V5"), stringsAsFactors = FALSE)
## New names:
## • `` -> `...1`
CP_metadata <- as.data.frame(read_excel(paste0(data_directory,"/22_0322_alphaSyn_fly_pilot_Classes.xlsx"),
sheet = "C8-pos", range = "J1:V5"), stringsAsFactors = FALSE)
## New names:
## • `` -> `...1`
CN_metadata <- as.data.frame(read_excel(paste0(data_directory,"/22_0322_alphaSyn_fly_pilot_Classes.xlsx"),
sheet = "C18-neg", range = "J1:V5"), stringsAsFactors = FALSE)
## New names:
## • `` -> `...1`
# Extract study metadata from HN sheet
s_m <- as.character(HN_metadata[2, -1])
#Combine and Clean Data
# Combine all datasets into one dataframe
df <- rbind(HP, HN, CP, CN)
# Remove irrelevant columns by their names
df <- df[, !colnames(df) %in% c("...23", "0.05")]
# Rename specific columns for clarity
colnames(df)[colnames(df) == "p value...25"] <- "p value_Syn_vs_Ctrl"
colnames(df)[colnames(df) == "p value...28"] <- "p value_AKI_KD_Syn_vs_AKI_KD_Ctrl"
#Identify and Handle Duplicated Metabolites Metabolites annotated in more than one
#Duplicated metabolites
dup_metabs <- df$Metabolite[!is.na(df$Metabolite)][duplicated(df$Metabolite[!is.na(df$Metabolite)])]
print(dup_metabs)
## [1] "Aspartic acid" "Cytidine" "Glutamic acid"
## [4] "Glutathione" "Guanosine" "Hexose"
## [7] "N-Acetylglutamic acid" "Nicotinic acid" "Pantothenic acid"
## [10] "Uric acid" "Xanthine" "LPC 14:0"
## [13] "LPC 16:1" "LPC 16:0" "LPC 18:3"
## [16] "LPC 18:2" "LPC 18:1" "LPC 18:0"
## [19] "LPE 16:1" "LPE 18:3" "LPE 18:2"
## [22] "LPE 18:1" "LPE 18:0" "PC 30:1"
## [25] "PC 30:0" "PC 32:2" "PC 32:0"
## [28] "PC 34:4" "PC 34:1" "PC 36:3"
## [31] "PC P-34:1 or PC O-34:2" "PC P-34:0 or PC O-34:1" "PC P-36:2 or PC O-36:3"
## [34] "PC P-36:1 or PC O-36:2" "PE 32:1" "PE 34:2"
## [37] "PE 34:0" "PE 36:4" "PE 36:3"
## [40] "PE 36:2" "PE 38:6" "PE P-34:2 or PE O-34:3"
## [43] "PE P-34:1 or PE O-34:2" "PE P-36:2 or PE O-36:3" "PE P-36:1 or PE O-36:2"
## [46] "Palmitoyl-EA" "Cer 18:1;O2/16:0" "DG 34:3"
## [49] "DG 36:3" "DG 36:4" "TG 44:1"
## [52] "TG 44:2" "TG 46:2" "Hexose"
## [55] "Sebacic acid" "Myristoleic acid" "Palmitoyl-EA"
## [58] "CAR 14:0" "N-Acetylalanine" "2-Aminooctanoic acid"
# Read decision file for duplicates and remove unwanted duplicates
toRemove <- read.csv(paste0(data_directory,"/dup_metabs_decision.csv"))
table(toRemove$Decision)
##
## KEEP
## 60 58
dim(df)
## [1] 568 26
for(r in 1:nrow(toRemove)){
if(toRemove$Decision[r] == ""){
loc_toRm <- which(toRemove$Method[r] == df$Method & toRemove$Compound_ID[r] == df$Compound_ID)
print(df[loc_toRm,c("Method","Metabolite")])
df <- df[!(toRemove$Method[r] == df$Method & toRemove$Compound_ID[r] == df$Compound_ID),]
}
}
## Method Metabolite
## 558 C18-neg 2-Aminooctanoic acid
## Method Metabolite
## 57 HILIC-pos Aspartic acid
## Method Metabolite
## 549 C18-neg CAR 14:0
## Method Metabolite
## 252 HILIC-pos Cer 18:1;O2/16:0
## Method Metabolite
## 296 HILIC-neg Cytidine
## Method Metabolite
## 173 HILIC-pos DG 34:3
## Method Metabolite
## 175 HILIC-pos DG 36:3
## Method Metabolite
## 174 HILIC-pos DG 36:4
## Method Metabolite
## 63 HILIC-pos Glutamic acid
## Method Metabolite
## 103 HILIC-pos Glutathione
## Method Metabolite
## 306 HILIC-neg Guanosine
## Method Metabolite
## 17 HILIC-pos Hexose
## Method Metabolite
## 493 C18-neg Hexose
## Method Metabolite
## 347 C8-pos LPC 14:0
## Method Metabolite
## 349 C8-pos LPC 16:0
## Method Metabolite
## 348 C8-pos LPC 16:1
## Method Metabolite
## 353 C8-pos LPC 18:0
## Method Metabolite
## 352 C8-pos LPC 18:1
## Method Metabolite
## 351 C8-pos LPC 18:2
## Method Metabolite
## 350 C8-pos LPC 18:3
## Method Metabolite
## 355 C8-pos LPE 16:1
## Method Metabolite
## 360 C8-pos LPE 18:0
## Method Metabolite
## 359 C8-pos LPE 18:1
## Method Metabolite
## 358 C8-pos LPE 18:2
## Method Metabolite
## 357 C8-pos LPE 18:3
## Method Metabolite
## 151 HILIC-pos Myristoleic acid
## Method Metabolite
## 557 C18-neg N-Acetylalanine
## Method Metabolite
## 84 HILIC-pos N-Acetylglutamic acid
## Method Metabolite
## 140 HILIC-pos Nicotinic acid
## Method Metabolite
## 155 HILIC-pos Palmitoyl-EA
## Method Metabolite
## 548 C18-neg Palmitoyl-EA
## Method Metabolite
## 90 HILIC-pos Pantothenic acid
## Method Metabolite
## 203 HILIC-pos PC 30:0
## Method Metabolite
## 202 HILIC-pos PC 30:1
## Method Metabolite
## 207 HILIC-pos PC 32:0
## Method Metabolite
## 206 HILIC-pos PC 32:2
## Method Metabolite
## 212 HILIC-pos PC 34:1
## Method Metabolite
## 211 HILIC-pos PC 34:4
## Method Metabolite
## 217 HILIC-pos PC 36:3
## Method Metabolite
## 195 HILIC-pos PC P-34:0 or PC O-34:1
## Method Metabolite
## 194 HILIC-pos PC P-34:1 or PC O-34:2
## Method Metabolite
## 199 HILIC-pos PC P-36:1 or PC O-36:2
## Method Metabolite
## 198 HILIC-pos PC P-36:2 or PC O-36:3
## Method Metabolite
## 236 HILIC-pos PE 32:1
## Method Metabolite
## 239 HILIC-pos PE 34:0
## Method Metabolite
## 237 HILIC-pos PE 34:2
## Method Metabolite
## 243 HILIC-pos PE 36:2
## Method Metabolite
## 242 HILIC-pos PE 36:3
## Method Metabolite
## 241 HILIC-pos PE 36:4
## Method Metabolite
## 244 HILIC-pos PE 38:6
## Method Metabolite
## 230 HILIC-pos PE P-34:1 or PE O-34:2
## Method Metabolite
## 229 HILIC-pos PE P-34:2 or PE O-34:3
## Method Metabolite
## 233 HILIC-pos PE P-36:1 or PE O-36:2
## Method Metabolite
## 232 HILIC-pos PE P-36:2 or PE O-36:3
## Method Metabolite
## 502 C18-neg Sebacic acid
## Method Metabolite
## 177 HILIC-pos TG 44:1
## Method Metabolite
## 176 HILIC-pos TG 44:2
## Method Metabolite
## 178 HILIC-pos TG 46:2
## Method Metabolite
## 342 HILIC-neg Uric acid
## Method Metabolite
## 30 HILIC-pos Xanthine
dim(df)
## [1] 508 26
#Check any duplicates left:
df$Metabolite[!is.na(df$Metabolite)][duplicated(df$Metabolite[!is.na(df$Metabolite)])]
## character(0)
#Remove Unwanted Data
# Remove internal standards and incorrectly identified metabolites
df <- df[!grepl("nternal", df$HMDB_ID), ]
df <- df[!grepl("xypurinol", df$Metabolite), ]
# Remove rows with unknown metabolites
df <- df[!is.na(df$Metabolite), ]
#Split Data into Metadata and Values
# Separate the metadata and measurement data
m <- df[, c(1:10, 23:26)]
a <- as.matrix(df[, c(11:22)])
#Impute values
# Drop metabolites with no measurements
toDrop <- which(rowSums(is.na(a)) == ncol(a))
if(length(toDrop) > 0){
m <- m[-toDrop, ]
a <- a[-toDrop, ]
}
# Apply custom imputation methods
a <- rm.zeros(a)
a <- fun.impute(a)
a_backup <- a
#Impute Missing Values and Remove Zero-Variance Metabolites
#Drop metabolites with no measurements
toDrop <- which(apply(a,1,FUN = function(x){return(sum(is.na(x)))}) == ncol(a))
#Empty metabolites
if(length(toDrop) > 0){
m[toDrop,]
a <- a[-toDrop,]
m <- m[-toDrop,]
}
a <- rm.zeros(a)
a <- fun.impute(a)
a_backup <- a
# Remove zero-variance metabolites (often internal standards if IS-scaled data)
toKeep <- apply(a, 1, sd, na.rm = TRUE) != 0
a <- a[toKeep, ]
#Perform PCA (Principal Component Analysis)
# Conduct PCA on the log-transformed data
pcaResults <- prcomp(t(log(a)), scale. = TRUE, center = TRUE)
# Create a dataframe with PCA results and additional metadata
PC <- as.data.frame(pcaResults$x) %>%
mutate(names = rownames(pcaResults$x), s_m = s_m) %>%
mutate(median = apply(a, 2, median, na.rm = TRUE),
mean = apply(a, 2, mean, na.rm = TRUE),
sum = apply(a, 2, sum, na.rm = TRUE))
# Plot PCA results
ggplot(PC, aes(x = PC1, y = PC2, color = s_m)) +
geom_point() +
theme(legend.title = element_blank()) +
ggtitle("PCA Scores Plot for Known Metabolites (HP, HN, CP, CN)") +
scale_x_continuous(name = paste0("[PCA1] ", var.fun(pcaResults)[1], " %")) +
scale_y_continuous(name = paste0("[PCA2] ", var.fun(pcaResults)[2], " %"))
#Write
pdf(paste0(results_directory,"/pca_knowns_HP-HN-CP-CN.pdf"),8,8)
ggplot(PC, aes(x = PC1, y = PC2, color=s_m)) +
geom_point() + theme(legend.title = element_blank()) + ggtitle("PCA Scores plot knowns HP and HN, CP, CN") +
scale_x_continuous(name=paste0("[PCA1] ", var.fun(pcaResults)[1], " %")) +
scale_y_continuous(name=paste0("[PCA2] ", var.fun(pcaResults)[2], " %"))
dev.off()
## png
## 2