#library(GEOquery) ## go to https://github.com/seandavi/GEOquery for installation details
#library(R.utils)
library(reshape2)
library(ggplot2)
library(limma)
library(dplyr)
library(tidyverse)
#library(DMwR)
library(readxl)
library(MultiAssayExperiment)
library(S4Vectors)
library(SummarizedExperiment)
library(DT)
library(randomForest)
library(e1071)
library("bnstruct")
library(rminer)
library(ggpubr)
setwd("~/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/clean_up_folder/analysis")
# install.packages("VANData_1.0.0.tgz", repos=NULL,type="source")
# install.packages("VAN_1.0.0.tgz", repos=NULL,type="source")
library(R.utils)
library(reshape2)
library(ggplot2)
library(limma)
library(dplyr)
library(tidyverse)
library(DMwR)
library(readxl)
library(magrittr)
library(tidyverse)
library(gage)
library(knitr)
library(DT)
library(janitor)
library(KEGGREST)
library(MASS)
library("AnnotationDbi")
library("org.Hs.eg.db")
library(dplyr)
library(reshape)
library(ggplot2)
library(plyr)
library(gtable)
library(grid)
library(pheatmap)
library(reshape2)
library(plotly)
library(UpSetR)
library(MatchIt)
require(pathview)
library(STRINGdb)
## Kegg sets data
require(gage)
require(gageData)
# kegg = kegg.gsets(species = "hsa", id.type = "kegg")
# kegg.sets.hsa = kegg$kg.sets
# go.hs=go.gsets(species="human")
# go.bp=go.hs$go.sets[go.hs$go.subs$BP]
# go.mf=go.hs$go.sets[go.hs$go.subs$MF]
# go.bpmf=go.hs$go.sets[go.hs$go.subs$BP&go.hs$go.subs$MF] #not equal, wired
# go.bpmf2=append(go.bp,go.mf)
require(gage)
require(gageData)
library(clusterProfiler)
library(enrichplot)
adj_to_p <- function(df) {
#df <- topTable(fit, coef = comparison, number = nrow(fit))
if ((df %>%filter(adj.P.Val < 0.05) %>% nrow()) == 0){
r <- 1
} else {
r <- df%>%mutate(rn = row_number()) %>% filter(adj.P.Val <= 0.05) %>% pull(rn)%>%max() + 1
}
n <- nrow(df) + 1
lp10 <- -log10(r / n * 0.05)
lp10
}
load("all_omics_lv_v5.RData")
metabolite_expression=assay(all_omics@ExperimentList$metabolite, "log2_norm")
metabolite_expression1=as.data.frame(metabolite_expression)
metabolite_expression1=apply(metabolite_expression,2,as.numeric)
rownames(metabolite_expression1)=rownames(metabolite_expression)
dim(metabolite_expression1)
## [1] 155 107
#20220321 add the age match as before
infodt1 = colData(all_omics)
infodt1$ihd=as.numeric(as.vector(ifelse(infodt1$Condition=="IHD"&infodt1$Diabetes=="Yes",1,ifelse(infodt1$Condition=="Donor",0,2))))
infodt1$Age=as.numeric(infodt1$Age)#need to change to numeric here
ihddt=infodt1[infodt1$ihd!=2,]
dim(ihddt)
## [1] 42 14
## [1] 30 14
## [1] "1_LV_IHD_Yes_41_M" "2_LV_IHD_Yes_45_M" "3_LV_IHD_Yes_46_F"
## [4] "4_LV_IHD_Yes_49_M" "5_LV_IHD_Yes_50_M" "6_LV_IHD_Yes_51_M"
## [7] "7_LV_IHD_Yes_51_M" "8_LV_IHD_Yes_53_M" "9_LV_IHD_Yes_55_M"
## [10] "10_LV_IHD_Yes_56_M" "11_LV_IHD_Yes_56_M" "12_LV_IHD_Yes_59_M"
## [13] "13_LV_IHD_Yes_59_M" "14_LV_IHD_Yes_65_M" "94_LV_Donor_No_47_M"
## [16] "95_LV_Donor_No_47_F" "96_LV_Donor_No_48_M" "97_LV_Donor_No_49_F"
## [19] "98_LV_Donor_No_51_F" "99_LV_Donor_No_53_M" "100_LV_Donor_No_53_F"
## [22] "101_LV_Donor_No_54_M" "102_LV_Donor_No_54_F" "104_LV_Donor_No_55_F"
## [25] "105_LV_Donor_No_56_M" "106_LV_Donor_No_60_M" "107_LV_Donor_No_62_M"
## [28] "108_LV_Donor_No_62_F" "109_LV_Donor_No_65_M" "110_LV_Donor_No_65_F"
indexx=ihddt$new_filename
metabolite_expression1=metabolite_expression1[,colnames(metabolite_expression1)%in%indexx]
dim(metabolite_expression1)
## [1] 155 30
#get the "lv" "ihd-dm" "ihd-no dm" non-imputed data
experiment_data_noimputation=metabolite_expression1[,union(grep("_IHD_Yes",colnames(metabolite_expression1)),grep("Donor_No",colnames(metabolite_expression1)))]
#colnames(experiment_data_noimputation)
dim(experiment_data_noimputation)
## [1] 155 30
#filtering out proteins with >25% samples missing
experiment_data_noimputation=experiment_data_noimputation[which(rowMeans(!is.na(experiment_data_noimputation)) > 0.25), ]
dim(experiment_data_noimputation)
## [1] 154 30
diabetes=grep("Yes",colnames(experiment_data_noimputation))
experiment_data_noimputation1=as.data.frame(t(experiment_data_noimputation))
rownames(experiment_data_noimputation1)=1:dim(experiment_data_noimputation1)[1]
experiment_data_noimputation1$y=ifelse(rownames(experiment_data_noimputation1) %in% diabetes,1,0)
#de analysis-- should be on the non-imputed data
groupname<-as.factor(experiment_data_noimputation1$y)
design <- model.matrix(~ groupname + 0)
fit <- lmFit(experiment_data_noimputation, design)
cont.matrix <- makeContrasts(groupname1-groupname0, levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2)
tT <- topTable(fit2)
df1 <- topTable(fit2, number=nrow(fit2), genelist=rownames(experiment_data_noimputation))
df1$name=rownames(df1)
df1$neglogP_value = -log10(df1$P.Value)
#add samples
colnames(experiment_data_noimputation)
## [1] "1_LV_IHD_Yes_41_M" "2_LV_IHD_Yes_45_M" "3_LV_IHD_Yes_46_F"
## [4] "4_LV_IHD_Yes_49_M" "5_LV_IHD_Yes_50_M" "6_LV_IHD_Yes_51_M"
## [7] "7_LV_IHD_Yes_51_M" "8_LV_IHD_Yes_53_M" "9_LV_IHD_Yes_55_M"
## [10] "10_LV_IHD_Yes_56_M" "11_LV_IHD_Yes_56_M" "12_LV_IHD_Yes_59_M"
## [13] "13_LV_IHD_Yes_59_M" "14_LV_IHD_Yes_65_M" "94_LV_Donor_No_47_M"
## [16] "95_LV_Donor_No_47_F" "96_LV_Donor_No_48_M" "97_LV_Donor_No_49_F"
## [19] "98_LV_Donor_No_51_F" "99_LV_Donor_No_53_M" "100_LV_Donor_No_53_F"
## [22] "101_LV_Donor_No_54_M" "102_LV_Donor_No_54_F" "104_LV_Donor_No_55_F"
## [25] "105_LV_Donor_No_56_M" "106_LV_Donor_No_60_M" "107_LV_Donor_No_62_M"
## [28] "108_LV_Donor_No_62_F" "109_LV_Donor_No_65_M" "110_LV_Donor_No_65_F"
## [1] 2.129634
highlight_df=df1
highlight_df$colorcode=as.character(ifelse(highlight_df$neglogP_value>adj_to_p(df1)&highlight_df$logFC>0,1,ifelse(highlight_df$neglogP_value>adj_to_p(df1)&highlight_df$logFC<0,-1,0)))
dim(highlight_df)
## [1] 154 10
## [1] 154 10
g2=ggplot2::ggplot(highlight_df, aes(logFC,neglogP_value,label=name))+
geom_point(aes(x=logFC,y=neglogP_value,color=colorcode)) +
xlab("log2 fold change") + ylab("-log10 p-value")+ggtitle("IHD-T2DM vs Donor") +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),aspect.ratio = 1) +
geom_hline(yintercept = adj_to_p(df1), linetype = "dashed", colour = "red")+scale_color_manual(values = c("blue","gray","red"))
#ggsave(filename="plot2.pdf",g2)
ggplotly(g2)
#pathway analysis: name mapping to change the metabolites names
nameref=read_excel("/Users/yzha0247/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/Yunwei202110/BH 2020 scMerge_impute AMIDE and HILIC 14.12.21_Normalised.xlsx",sheet = 4)
sum(rownames(df1) %in% nameref$Metabolite)
## [1] 154
df1_new=df1[which(rownames(df1) %in% nameref$Metabolite),]
df1_new$name=rownames(df1_new)
df1_new$name[as.numeric(na.omit(match(nameref$Metabolite,df1_new$name,nomatch = NA)))] <- nameref$`KEGG ID`[as.numeric(na.omit(match(df1_new$name,nameref$Metabolite,nomatch = NA)))]
dim(df1_new)
## [1] 154 9
df2=df1_new[!is.na(df1_new$name),] #not matching are noted as "-" so that i didnt delete anything
dim(df2)
## [1] 154 9
fc=df2$logFC
names(fc)=df2$name
fc=sort(fc,decreasing = TRUE)
#name mapping for pathways
meta_pathway_name=read_excel("/Users/yzha0247/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/Yunwei202110/2020 Human Heart Metabolomic functionally annotated metabolites.xlsx")
meta_pathway_name=meta_pathway_name[,c("KEGG pathway","KEGG")]
length(unique(meta_pathway_name$`KEGG pathway`))
## [1] 137
## [1] 143
library(clusterProfiler)
kk2<-GSEA(fc, TERM2GENE=meta_pathway_name,pvalueCutoff = 1,seed = 12,nPerm = 1000, minGSSize = 3,maxGSSize = 500,pAdjustMethod = "none")
datatable(as.data.frame(kk2)%>% mutate_if(is.numeric, ~round(., 3)))
ggplotdt=as.data.frame(kk2)
ggplotdt$p.adjust=as.numeric(ggplotdt$p.adjust)
ggplotdtup=ggplotdt[ggplotdt$enrichmentScore>0,]
ggplotdtupsub=ggplotdtup[order(ggplotdtup$p.adjust),]
ggplotdtupsub=ggplotdtupsub[1:10,]
ggplotdtdown=ggplotdt[ggplotdt$enrichmentScore<0,]
ggplotdtdownsub=ggplotdtdown[order(ggplotdtdown$p.adjust),]
ggplotdtdownsub=ggplotdtdownsub[1:10,]
ggplotdt=rbind.data.frame(ggplotdtupsub,ggplotdtdownsub)
gg=ggplot(ggplotdt,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=pvalue,size=pvalue))+coord_flip()+theme_bw()+ggtitle("ihd-dm vs donor pathway enrichment analysis")+
scale_colour_gradient(high = "red", low = "blue")+theme(text = element_text(size = 10))
gg
load("all_omics_lv_v5.RData")
metabolite_expression=assay(all_omics@ExperimentList$metabolite, "log2_norm")
metabolite_expression1=as.data.frame(metabolite_expression)
metabolite_expression1=apply(metabolite_expression,2,as.numeric)
rownames(metabolite_expression1)=rownames(metabolite_expression)
dim(metabolite_expression1)
## [1] 155 107
#20220321 add the age match as before
infodt1 = colData(all_omics)
infodt1$ihd=as.numeric(as.vector(ifelse(infodt1$Condition=="IHD"&infodt1$Diabetes=="No",1,ifelse(infodt1$Condition=="Donor",0,2))))
infodt1$Age=as.numeric(infodt1$Age)#need to change to numeric here
ihddt=infodt1[infodt1$ihd!=2,]
dim(ihddt)
## [1] 44 14
## [1] 32 14
## [1] "15_LV_IHD_No_41_M" "16_LV_IHD_No_42_F" "17_LV_IHD_No_43_F"
## [4] "18_LV_IHD_No_45_M" "19_LV_IHD_No_47_F" "20_LV_IHD_No_48_F"
## [7] "21_LV_IHD_No_49_F" "22_LV_IHD_No_50_M" "23_LV_IHD_No_50_M"
## [10] "24_LV_IHD_No_54_M" "25_LV_IHD_No_54_M" "26_LV_IHD_No_54_F"
## [13] "27_LV_IHD_No_62_F" "28_LV_IHD_No_62_M" "29_LV_IHD_No_62_M"
## [16] "30_LV_IHD_No_66_M" "94_LV_Donor_No_47_M" "95_LV_Donor_No_47_F"
## [19] "96_LV_Donor_No_48_M" "97_LV_Donor_No_49_F" "98_LV_Donor_No_51_F"
## [22] "99_LV_Donor_No_53_M" "100_LV_Donor_No_53_F" "101_LV_Donor_No_54_M"
## [25] "102_LV_Donor_No_54_F" "104_LV_Donor_No_55_F" "105_LV_Donor_No_56_M"
## [28] "106_LV_Donor_No_60_M" "107_LV_Donor_No_62_M" "108_LV_Donor_No_62_F"
## [31] "109_LV_Donor_No_65_M" "110_LV_Donor_No_65_F"
indexx=ihddt$new_filename
metabolite_expression1=metabolite_expression1[,colnames(metabolite_expression1)%in%indexx]
dim(metabolite_expression1)
## [1] 155 32
#get the "lv" "ihd-dm" "ihd-no dm" non-imputed data
experiment_data_noimputation=metabolite_expression1[,union(grep("_IHD_No",colnames(metabolite_expression1)),grep("Donor_No",colnames(metabolite_expression1)))]
#colnames(experiment_data_noimputation)
dim(experiment_data_noimputation)
## [1] 155 32
#filtering out proteins with >25% samples missing
experiment_data_noimputation=experiment_data_noimputation[which(rowMeans(!is.na(experiment_data_noimputation)) > 0.25), ]
dim(experiment_data_noimputation)
## [1] 154 32
diabetes=grep("IHD",colnames(experiment_data_noimputation))
experiment_data_noimputation1=as.data.frame(t(experiment_data_noimputation))
rownames(experiment_data_noimputation1)=1:dim(experiment_data_noimputation1)[1]
experiment_data_noimputation1$y=ifelse(rownames(experiment_data_noimputation1) %in% diabetes,1,0)
#de analysis-- should be on the non-imputed data
groupname<-as.factor(experiment_data_noimputation1$y)
design <- model.matrix(~ groupname + 0)
fit <- lmFit(experiment_data_noimputation, design)
cont.matrix <- makeContrasts(groupname1-groupname0, levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2)
tT <- topTable(fit2)
df1 <- topTable(fit2, number=nrow(fit2), genelist=rownames(experiment_data_noimputation))
df1$name=rownames(df1)
df1$neglogP_value = -log10(df1$P.Value)
#add samples
colnames(experiment_data_noimputation)
## [1] "15_LV_IHD_No_41_M" "16_LV_IHD_No_42_F" "17_LV_IHD_No_43_F"
## [4] "18_LV_IHD_No_45_M" "19_LV_IHD_No_47_F" "20_LV_IHD_No_48_F"
## [7] "21_LV_IHD_No_49_F" "22_LV_IHD_No_50_M" "23_LV_IHD_No_50_M"
## [10] "24_LV_IHD_No_54_M" "25_LV_IHD_No_54_M" "26_LV_IHD_No_54_F"
## [13] "27_LV_IHD_No_62_F" "28_LV_IHD_No_62_M" "29_LV_IHD_No_62_M"
## [16] "30_LV_IHD_No_66_M" "94_LV_Donor_No_47_M" "95_LV_Donor_No_47_F"
## [19] "96_LV_Donor_No_48_M" "97_LV_Donor_No_49_F" "98_LV_Donor_No_51_F"
## [22] "99_LV_Donor_No_53_M" "100_LV_Donor_No_53_F" "101_LV_Donor_No_54_M"
## [25] "102_LV_Donor_No_54_F" "104_LV_Donor_No_55_F" "105_LV_Donor_No_56_M"
## [28] "106_LV_Donor_No_60_M" "107_LV_Donor_No_62_M" "108_LV_Donor_No_62_F"
## [31] "109_LV_Donor_No_65_M" "110_LV_Donor_No_65_F"
## [1] 2.093422
highlight_df=df1
highlight_df$colorcode=as.character(ifelse(highlight_df$neglogP_value>adj_to_p(df1)&highlight_df$logFC>0,1,ifelse(highlight_df$neglogP_value>adj_to_p(df1)&highlight_df$logFC<0,-1,0)))
dim(highlight_df)
## [1] 154 10
## [1] 154 10
g5=ggplot2::ggplot(highlight_df, aes(logFC,neglogP_value,label=name))+
geom_point(aes(x=logFC,y=neglogP_value,color=colorcode)) +
xlab("log2 fold change") + ylab("-log10 p-value")+ggtitle("IHD-T2DM vs Donor") +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),aspect.ratio = 1) +
geom_hline(yintercept = adj_to_p(df1), linetype = "dashed", colour = "red")+scale_color_manual(values = c("blue","gray","red"))
#ggsave(filename="plot5.pdf",g5)
ggplotly(g5)
#pathway analysis: name mapping to change the metabolites names
nameref=read_excel("/Users/yzha0247/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/Yunwei202110/BH 2020 scMerge_impute AMIDE and HILIC 14.12.21_Normalised.xlsx",sheet = 4)
df1_new=df1[which(rownames(df1) %in% nameref$Metabolite),]
df1_new$name=rownames(df1_new)
df1_new$name[as.numeric(na.omit(match(nameref$Metabolite,df1_new$name,nomatch = NA)))] <- nameref$`KEGG ID`[as.numeric(na.omit(match(df1_new$name,nameref$Metabolite,nomatch = NA)))]
dim(df1_new)
## [1] 154 9
## [1] 154 9
fc=df2$logFC
names(fc)=df2$name
fc=sort(fc,decreasing = TRUE)
#name mapping for pathways
meta_pathway_name=read_excel("/Users/yzha0247/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/Yunwei202110/2020 Human Heart Metabolomic functionally annotated metabolites.xlsx")
meta_pathway_name=meta_pathway_name[,c("KEGG pathway","KEGG")]
length(unique(meta_pathway_name$`KEGG pathway`))
## [1] 137
## [1] 143
library(clusterProfiler)
kk2<-GSEA(fc, TERM2GENE=meta_pathway_name,pvalueCutoff = 1,seed = 12,nPerm = 1000, minGSSize = 3,maxGSSize = 500,pAdjustMethod = "none")
datatable(as.data.frame(kk2)%>% mutate_if(is.numeric, ~round(., 3)))
ggplotdt=as.data.frame(kk2)
ggplotdt$p.adjust=as.numeric(ggplotdt$p.adjust)
ggplotdtup=ggplotdt[ggplotdt$enrichmentScore>0,]
ggplotdtupsub=ggplotdtup[order(ggplotdtup$p.adjust),]
ggplotdtupsub=ggplotdtupsub[1:10,]
ggplotdtdown=ggplotdt[ggplotdt$enrichmentScore<0,]
ggplotdtdownsub=ggplotdtdown[order(ggplotdtdown$p.adjust),]
ggplotdtdownsub=ggplotdtdownsub[1:10,]
ggplotdt=rbind.data.frame(ggplotdtupsub,ggplotdtdownsub)
gg=ggplot(ggplotdt,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=pvalue,size=pvalue))+coord_flip()+theme_bw()+ggtitle("ihd-no dm vs donor pathway enrichment analysis")+
scale_colour_gradient(high = "red", low = "blue")+theme(text = element_text(size = 10))
gg
## [1] 71 11
## [1] 71 11
common_name=c(ihddm_dt2$Description,ihdnodm_dt2$Description)
plotmerge=merge(ihddm_dt2,ihdnodm_dt2,by="Description")
plotmerge$pvalue.x=-log10(as.numeric(plotmerge$pvalue.x))
plotmerge$pvalue.y=-log10(as.numeric(plotmerge$pvalue.y))
# annotation=read_excel("/Users/yzha0247/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/Yunwei202110/IHD-DM paper figure 3a pathways to annotate.xlsx")
# plotmerge$mito=as.character(ifelse(plotmerge$Description%in% ben_mito$Description,1,0))
# plotmerge$annotation=as.character(ifelse(plotmerge$Description%in% annotation$`Amino acid metabolism`,1,0))
# ggplotly(ggplot(plotmerge,aes(pvalue.x,pvalue.y,label=Description,color=mito))+geom_point()+ggtitle("kegg+mito pathways ihd-dm vs ihd-no dm")+xlab("neg_log10_pval in ind-dm vs donor")+ylab("neg_log10_pval in ind-no dm vs donor")+theme_bw())
library(ggrepel)
gg=ggplot(plotmerge,aes(pvalue.x,pvalue.y,label=Description))+geom_point()+ggtitle("kegg+mito pathways ihd-dm vs ihd-no dm")+xlab("neg_log10_pval in ind-dm vs donor")+ylab("neg_log10_pval in ind-no dm vs donor")+theme_bw()+theme(aspect.ratio = 1)
#gg=ggplot(plotmerge,aes(pvalue.x,pvalue.y,label=Description,color=mito))+geom_point()+ geom_text_repel(aes(x=pvalue.x,y=pvalue.y,label=Description),subset(plotmerge,plotmerge$annotation==1), size=3)+ggtitle("kegg+mito pathways ihd-dm vs ihd-no dm")+xlab("neg_log10_pval in ind-dm vs donor")+ylab("neg_log10_pval in ind-no dm vs donor")+ scale_color_manual(values=c("black", "seagreen"))+theme_bw()+theme(aspect.ratio = 1)
ggplotly(gg)
g1=ggplot(plotmerge, aes(pvalue.x,pvalue.y,label=Description))+
geom_point() +
xlab("neg_log10_pval in ind-dm vs donor") + ylab("neg_log10_pval in ind-no dm vs donor")+ggtitle("Pathways ihd-dm vs ihd-no dm") +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),aspect.ratio = 1)+geom_text_repel(aes(x=pvalue.x,y=pvalue.y,label=Description),subset(plotmerge,(pvalue.x>1.2|pvalue.y>1.2)))
#ggsave(filename="figure3b.pdf",g1,height = 10,width = 10)