#library(GEOquery) ## go to https://github.com/seandavi/GEOquery for installation details
#library(R.utils)
library(reshape2)
library(ggplot2)
library(limma)
library(dplyr)
library(tidyverse)
#library(DMwR)
library(readxl)
library(MultiAssayExperiment)
library(S4Vectors)
library(SummarizedExperiment)
library(DT)
library(randomForest)
library(e1071)
library("bnstruct")
#library(rminer)
library(ggpubr)
setwd("/Users/MQ10004787/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/clean_up_folder/analysis")
# install.packages("VANData_1.0.0.tgz", repos=NULL,type="source")
# install.packages("VAN_1.0.0.tgz", repos=NULL,type="source")
library(R.utils)
library(reshape2)
library(ggplot2)
library(limma)
library(dplyr)
library(tidyverse)
#library(DMwR)
library(readxl)
library(magrittr)
library(tidyverse)
library(gage)
library(knitr)
library(DT)
library(janitor)
library(KEGGREST)
library(MASS)
library("AnnotationDbi")
library("org.Hs.eg.db")
library(dplyr)
library(reshape)
library(ggplot2)
library(plyr)
library(gtable)
library(grid)
library(pheatmap)
library(reshape2)
library(plotly)
library(UpSetR)
library(MatchIt)
require(pathview)
#library(STRINGdb)
## Kegg sets data
require(gage)
require(gageData)
kegg = kegg.gsets(species = "hsa", id.type = "kegg")
kegg.sets.hsa = kegg$kg.sets
# go.hs=go.gsets(species="human")
# go.bp=go.hs$go.sets[go.hs$go.subs$BP]
# go.mf=go.hs$go.sets[go.hs$go.subs$MF]
# go.bpmf=go.hs$go.sets[go.hs$go.subs$BP&go.hs$go.subs$MF] #not equal, wired
# go.bpmf2=append(go.bp,go.mf)
require(gage)
require(gageData)
library(clusterProfiler)
library(enrichplot)
library(GO.db)
# #failed install
# if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#
# BiocManager::install("GOfuncR")
#
# if (!requireNamespace("devtools", quietly=TRUE))
# install.packages("devtools")
# devtools::install_github("sgrote/GOfuncR")
load("all_omics_lv_v5.RData")
protein_expression=assay(all_omics@ExperimentList$protein, "log2")
protein_expression1=as.data.frame(protein_expression)
dim(protein_expression1)
## [1] 9037 111
#changed 20220224: no overall filtering
protein_expression2=protein_expression1
dim(protein_expression2)
## [1] 9037 111
## [1] 4094 111
#20220321 add the age match as before
infodt1 = colData(all_omics)
infodt1$Age=as.numeric(infodt1$Age)
infodt1$ihd=as.numeric(as.vector(ifelse(infodt1$Condition=="IHD"&infodt1$Diabetes=="Yes",1,ifelse(infodt1$Condition=="Donor",0,2))))
ihddt=infodt1[infodt1$ihd!=2,]
ihddt=ihddt[(ihddt$ihd==0&ihddt$Age>=47)|ihddt$ihd==1,]
dim(ihddt)
## [1] 30 14
## [1] "1_LV_IHD_Yes_41_M" "2_LV_IHD_Yes_45_M" "3_LV_IHD_Yes_46_F"
## [4] "4_LV_IHD_Yes_49_M" "5_LV_IHD_Yes_50_M" "6_LV_IHD_Yes_51_M"
## [7] "7_LV_IHD_Yes_51_M" "8_LV_IHD_Yes_53_M" "9_LV_IHD_Yes_55_M"
## [10] "10_LV_IHD_Yes_56_M" "11_LV_IHD_Yes_56_M" "12_LV_IHD_Yes_59_M"
## [13] "13_LV_IHD_Yes_59_M" "14_LV_IHD_Yes_65_M" "94_LV_Donor_No_47_M"
## [16] "95_LV_Donor_No_47_F" "96_LV_Donor_No_48_M" "97_LV_Donor_No_49_F"
## [19] "98_LV_Donor_No_51_F" "99_LV_Donor_No_53_M" "100_LV_Donor_No_53_F"
## [22] "101_LV_Donor_No_54_M" "102_LV_Donor_No_54_F" "104_LV_Donor_No_55_F"
## [25] "105_LV_Donor_No_56_M" "106_LV_Donor_No_60_M" "107_LV_Donor_No_62_M"
## [28] "108_LV_Donor_No_62_F" "109_LV_Donor_No_65_M" "110_LV_Donor_No_65_F"
indexx=ihddt$new_filename
protein_expression2=protein_expression2[,colnames(protein_expression2)%in%indexx]
dim(protein_expression2)
## [1] 9037 30
#get the "lv" "ihd-dm" "ihd-no dm" non-imputed data
experiment_data_noimputation=protein_expression2[,union(grep("_IHD_Yes",colnames(protein_expression2)),grep("Donor_No",colnames(protein_expression2)))]
#colnames(experiment_data_noimputation)
dim(experiment_data_noimputation)
## [1] 9037 30
#filtering out proteins with >25% samples missing
experiment_data_noimputation=experiment_data_noimputation[which(rowMeans(!is.na(experiment_data_noimputation)) > 0.25), ]
dim(experiment_data_noimputation)
## [1] 4120 30
diabetes=grep("Yes",colnames(experiment_data_noimputation))
experiment_data_noimputation1=as.data.frame(t(experiment_data_noimputation))
rownames(experiment_data_noimputation1)=1:dim(experiment_data_noimputation1)[1]
experiment_data_noimputation1$y=ifelse(rownames(experiment_data_noimputation1) %in% diabetes,1,0)
#de analysis-- should be on the non-imputed data
groupname<-as.factor(experiment_data_noimputation1$y)
design <- model.matrix(~ groupname + 0)
fit <- lmFit(experiment_data_noimputation, design)
cont.matrix <- makeContrasts(groupname1-groupname0, levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2)
tT <- topTable(fit2)
df1 <- topTable(fit2, number=nrow(fit2), genelist=rownames(experiment_data_noimputation))
df1$name=rownames(df1)
df1$neglogP_value = -log10(df1$P.Value)
# g1=ggplot2::ggplot(df1, aes(logFC,neglogP_value,label=name))+
# geom_point(size = 1, alpha=1) +
# xlab("log2 fold change") + ylab("-log10 p-value")+ggtitle("IHD-T2DM vs Donor") +
# theme_bw(base_size = 16) +
# geom_hline(yintercept = 1.3, linetype = "dashed", colour = "brown3")+theme(aspect.ratio = 1)
#add fdr plot
adj_to_p <- function(df) {
#df <- topTable(fit, coef = comparison, number = nrow(fit))
if ((df %>%filter(adj.P.Val < 0.05) %>% nrow()) == 0){
r <- 1
} else {
r <- df%>%mutate(rn = row_number()) %>% filter(adj.P.Val <= 0.05) %>% pull(rn)%>%max() + 1
}
n <- nrow(df) + 1
lp10 <- -log10(r / n * 0.05)
lp10
}
#add samples
colnames(experiment_data_noimputation)
## [1] "2_LV_IHD_Yes_45_M" "3_LV_IHD_Yes_46_F" "4_LV_IHD_Yes_49_M"
## [4] "6_LV_IHD_Yes_51_M" "5_LV_IHD_Yes_50_M" "7_LV_IHD_Yes_51_M"
## [7] "14_LV_IHD_Yes_65_M" "9_LV_IHD_Yes_55_M" "12_LV_IHD_Yes_59_M"
## [10] "13_LV_IHD_Yes_59_M" "10_LV_IHD_Yes_56_M" "11_LV_IHD_Yes_56_M"
## [13] "8_LV_IHD_Yes_53_M" "1_LV_IHD_Yes_41_M" "110_LV_Donor_No_65_F"
## [16] "101_LV_Donor_No_54_M" "94_LV_Donor_No_47_M" "98_LV_Donor_No_51_F"
## [19] "105_LV_Donor_No_56_M" "100_LV_Donor_No_53_F" "109_LV_Donor_No_65_M"
## [22] "99_LV_Donor_No_53_M" "97_LV_Donor_No_49_F" "108_LV_Donor_No_62_F"
## [25] "102_LV_Donor_No_54_F" "96_LV_Donor_No_48_M" "106_LV_Donor_No_60_M"
## [28] "95_LV_Donor_No_47_F" "107_LV_Donor_No_62_M" "104_LV_Donor_No_55_F"
## [1] 1.808823
highlight_df=df1
highlight_df$colorcode=as.character(ifelse(highlight_df$neglogP_value>adj_to_p(df1)&highlight_df$logFC>0,1,ifelse(highlight_df$neglogP_value>adj_to_p(df1)&highlight_df$logFC<0,-1,0)))
dim(highlight_df)
## [1] 4120 10
## [1] 4101 10
g1=ggplot2::ggplot(highlight_df, aes(logFC,neglogP_value,label=name))+
geom_point(aes(x=logFC,y=neglogP_value,color=colorcode)) +
xlab("log2 fold change") + ylab("-log10 p-value")+ggtitle("IHD-T2DM vs Donor") +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),aspect.ratio = 1) +
geom_hline(yintercept = adj_to_p(df1), linetype = "dashed", colour = "red")+scale_color_manual(values = c("blue","gray","red"))
ggplotly(g1)
#pathway analysis
#pathway analysis as in our shiny:delete missing values
experiment_data_pathway=na.omit(experiment_data_noimputation)
dim(experiment_data_pathway)
## [1] 2153 30
##With voom for finding pathways
groupname<-as.factor(experiment_data_noimputation1$y)
design <- model.matrix(~ groupname + 0)
log2.cpm <- voom(experiment_data_pathway, design, normalize.method = "quantile")
fit <- lmFit(log2.cpm, design)
cont.matrix <- makeContrasts(groupname1-groupname0, levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2)
limma.res=topTable(fit2,n=Inf,sort="p", adjust.method = "fdr")
fc_new <- limma.res$logFC
limma.res$entrez = mapIds(org.Hs.eg.db,
keys=row.names(limma.res),
column="ENTREZID",
keytype=keytypes(org.Hs.eg.db)[2],
multiVals="first")
names(fc_new) <- limma.res$entrez
geneList<-na.omit(fc_new)
geneList = sort(geneList, decreasing = TRUE)
kk2 <- gseGO(geneList = geneList,
OrgDb = org.Hs.eg.db,
ont = "BP",
minGSSize = 100,
maxGSSize = 500,
pvalueCutoff = 0.05,
verbose = FALSE)
kk2_dt=as.data.frame(kk2)
kk2_dt[,c(4,5,7,8)]=round(kk2_dt[,c(4,5,7,8)],4)
datatable(kk2_dt)
BP_namelist=kk2_dt$Description
#https://yulab-smu.top/biomedical-knowledge-mining-book/clusterprofiler-go.html
#gene ontology over-representation test
ego2 <- enrichGO(gene = names(geneList),
OrgDb = org.Hs.eg.db,
keyType = 'ENTREZID',
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.01,
qvalueCutoff = 0.05)
ego2_dt=as.data.frame(ego2)
ego2_dt[,5:7]=round(ego2_dt[,5:7],4)
datatable(ego2_dt)
# Let is suppose I have a collection of genesets called : HALLMARK Now let is suppose there is a specific geneset there called: E2F_targets
#
# BgRatio, M/N.
#
# M = size of the geneset (eg size of the E2F_targets); (is the number of genes within that distribution that are annotated (either directly or indirectly) to the node of interest).
#
# N = size of all of the unique genes in the collection of genesets (example the HALLMARK collection); (is the total number of genes in the background distribution (universe)
#
# GeneRatio is k/n.
#
# k = size of the overlap of 'a vector of gene id' you input with the specific geneset (eg E2F_targets), only unique genes; (the number of genes within that list n, which are annotated to the node.
#
# n = size of the overlap of 'a vector of gene id' you input with all the members of the collection of genesets (eg the HALLMARK collection),only unique genes; is the size of the list of genes of interest
#adj pval and qval
#https://support.bioconductor.org/p/96329/
#adj pval is the fdr adjusted pval, qval from another package is another adjustment method
#the BH approach is slightly more conservative than qvalue
#enrichgo:
# Over Representation Analysis (ORA) (Boyle et al. 2004) is a widely used approach to determine whether known biological functions or processes are over-represented (= enriched) in an experimentally-derived gene list, e.g. a list of differentially expressed genes (DEGs).
#
# The p-value can be calculated by hypergeometric distribution.
#Leading edge analysis reports Tags to indicate the percentage of genes contributing to the enrichment score, List to indicate where in the list the enrichment score is attained and Signal for enrichment signal strength.
#It would also be very interesting to get the core enriched genes that contribute to the enrichment. Our packages (clusterProfiler, DOSE, meshes and ReactomePA) support leading edge analysis and report core enriched genes in GSEA analysis.
#get pathway maps
result_path <- enrichGO(gene = names(geneList),
OrgDb = org.Hs.eg.db,
keyType = 'ENTREZID',
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff =1,
qvalueCutoff =1)
result_path_dt=as.data.frame(result_path)
result_path_dt2=result_path_dt[,c("Description","geneID")]
result_path_dt3=data.frame(matrix(ncol=2))
for(i in 1:nrow(result_path_dt2)){
X2=unlist(strsplit(result_path_dt2$geneID[i],"/"))
X1=rep(result_path_dt2$Description[i],length(X2))
dt=cbind(X1,X2)
result_path_dt3=rbind(result_path_dt3,dt)
}
kk3 <- gseGO(geneList = geneList,
OrgDb = org.Hs.eg.db,
ont = "MF",
minGSSize = 100,
maxGSSize = 500,
pvalueCutoff = 0.05,
verbose = FALSE)
kk3_dt=as.data.frame(kk3)
kk3_dt[,c(4,5,7,8)]=round(kk3_dt[,c(4,5,7,8)],4)
datatable(as.data.frame(kk3_dt))
#https://yulab-smu.top/biomedical-knowledge-mining-book/clusterprofiler-go.html
#gene ontology over-representation test
ego3 <- enrichGO(gene = names(geneList),
OrgDb = org.Hs.eg.db,
keyType = 'ENTREZID',
ont = "MF",
pAdjustMethod = "BH",
pvalueCutoff = 0.01,
qvalueCutoff = 0.05)
ego3_dt=as.data.frame(ego3)
ego3_dt[,5:7]=round(ego3_dt[,5:7],4)
datatable(as.data.frame(ego3_dt))
MF_namelist=kk3_dt$Description
#get pathway maps
MFresult_path <- enrichGO(gene = names(geneList),
OrgDb = org.Hs.eg.db,
keyType = 'ENTREZID',
ont = "MF",
pAdjustMethod = "BH",
pvalueCutoff =1,
qvalueCutoff =1)
MFresult_path_dt=as.data.frame(MFresult_path)
MFresult_path_dt2=MFresult_path_dt[,c("Description","geneID")]
MFresult_path_dt3=data.frame(matrix(ncol=2))
for(i in 1:nrow(MFresult_path_dt2)){
X2=unlist(strsplit(MFresult_path_dt2$geneID[i],"/"))
X1=rep(MFresult_path_dt2$Description[i],length(X2))
dt=cbind(X1,X2)
MFresult_path_dt3=rbind(MFresult_path_dt3,dt)
}
#new things 20220907: add customised pathway analysis results
name=c("GO:0030198","GO:0055007","GO:0030239","GO:0060047","GO:0007010","GO:0031012","GO:0043292","GO:0005856")
AnnotationDbi::select(GO.db, keys=name, columns=c("TERM","ONTOLOGY"), keytype="GOID")
## GOID TERM ONTOLOGY
## 1 GO:0030198 extracellular matrix organization BP
## 2 GO:0055007 cardiac muscle cell differentiation BP
## 3 GO:0030239 myofibril assembly BP
## 4 GO:0060047 heart contraction BP
## 5 GO:0007010 cytoskeleton organization BP
## 6 GO:0031012 extracellular matrix CC
## 7 GO:0043292 contractile fiber CC
## 8 GO:0005856 cytoskeleton CC
pathwayIDs=c("extracellular matrix organization","cardiac muscle cell differentiation", "myofibril assembly", "heart contraction" ,"cytoskeleton organization","extracellular matrix" ,"contractile fiber", "cytoskeleton")
all_name=c(name,GOBPOFFSPRING[["GO:0030198"]],GOBPOFFSPRING[["GO:0055007"]],GOBPOFFSPRING[["GO:0030239"]],GOBPOFFSPRING[["GO:0060047"]],GOBPOFFSPRING[["GO:0007010"]],GOCCOFFSPRING[["GO:0031012"]],GOCCOFFSPRING[["GO:0043292"]],GOCCOFFSPRING[["GO:0005856"]])
all_name2=AnnotationDbi::select(GO.db, keys=all_name, columns=c("TERM","ONTOLOGY"), keytype="GOID")
GO_v<- AnnotationDbi::select(org.Hs.eg.db, keys=all_name2$GOID, columns=c("GO","ONTOLOGY","SYMBOL","ENTREZID"),keytype="GO")
pp2=merge(all_name2,GO_v,by.x="GOID",by.y="GO",all=TRUE)
datatable(pp2%>%group_by(GOID)%>%summarise(n=n()))
gene_list = sort(fc_new, decreasing = TRUE)
names(gene_list)=as.character(names(gene_list))
pathway_list3=pp2[,c("TERM","ENTREZID")]
colnames(pathway_list3)=c("pathwayIDs","geneIDs")
sum(names(gene_list)%in% pathway_list3$geneIDs)
## [1] 871
## [1] 1282
## [1] 2153
#pathway_list3%>%group_by(pathwayIDs)%>%summarise(n=n())
set.seed(20220907)
gse <- GSEA(gene_list, TERM2GENE = pathway_list3,nPerm = 1000, minGSSize = 3, maxGSSize = 500, pAdjustMethod = "none",pvalueCutoff = 0.05)
figure4a=as.data.frame(gse)
figure4av2=figure4a
figure4av2[,4:9]=round(figure4av2[,4:9],3)
datatable(figure4av2)
#new things 20220905: add customised pathway analysis results
name=c("GO:0030198","GO:0055007","GO:0030239","GO:0060047","GO:0007010","GO:0031012","GO:0043292","GO:0005856")
#get pathway maps
#new things 20220905: add customised pathway analysis results
result_path <- enrichGO(gene = names(geneList),
OrgDb = org.Hs.eg.db,
keyType = 'ENTREZID',
ont = "BP",
pAdjustMethod = "none",
pvalueCutoff =1,
qvalueCutoff =1)
result_path_dt=as.data.frame(result_path)
gse_result=gseGO(geneList = geneList,
OrgDb = org.Hs.eg.db,
ont = "BP",
minGSSize = 3,
maxGSSize = 500,
pvalueCutoff = 1,
pAdjustMethod = "none",
verbose = FALSE)
gse_result2=as.data.frame(gse_result)#about 200 enrichgo not in gsego
result_path_dt=gse_result2
result_path_dt2=result_path_dt[,c("Description","core_enrichment","ID")]
colnames(result_path_dt2)=c("Description","geneID","ID")
# "GO:0007010"%in%result_path_dt2$ID
# "biological process"%in%result_path_dt2$Description
# "cellular process"%in%result_path_dt2$Description
gse_dt3=data.frame(matrix(ncol=3))
for(i in 1:nrow(result_path_dt2)){
X2=unlist(strsplit(result_path_dt2$geneID[i],"/"))
X1=rep(result_path_dt2$Description[i],length(X2))
X3=rep(result_path_dt2$ID[i],length(X2))
dt=cbind(X3,X1,X2)
gse_dt3=rbind(gse_dt3,dt)
}
BP_result=gse_dt3
"GO:0007010"%in%BP_result$X3
gse_result=gseGO(geneList = geneList,
OrgDb = org.Hs.eg.db,
ont = "MF",
minGSSize = 3,
maxGSSize = 500,
pvalueCutoff = 1,
pAdjustMethod = "none",
verbose = FALSE)
gse_result2=as.data.frame(gse_result)
result_path_dt=gse_result2
result_path_dt2=result_path_dt[,c("Description","core_enrichment","ID")]
colnames(result_path_dt2)=c("Description","geneID","ID")
gse_dt3=data.frame(matrix(ncol=3))
for(i in 1:nrow(result_path_dt2)){
X2=unlist(strsplit(result_path_dt2$geneID[i],"/"))
X1=rep(result_path_dt2$Description[i],length(X2))
X3=rep(result_path_dt2$ID[i],length(X2))
dt=cbind(X3,X1,X2)
gse_dt3=rbind(gse_dt3,dt)
}
MF_result=gse_dt3
gse_result=gseGO(geneList = geneList,
OrgDb = org.Hs.eg.db,
ont = "CC",
minGSSize = 3,
maxGSSize = 500,
pvalueCutoff = 1,
pAdjustMethod = "none",
verbose = FALSE)
gse_result2=as.data.frame(gse_result)
result_path_dt=gse_result2
result_path_dt2=result_path_dt[,c("Description","core_enrichment","ID")]
colnames(result_path_dt2)=c("Description","geneID","ID")
gse_dt3=data.frame(matrix(ncol=3))
for(i in 1:nrow(result_path_dt2)){
X2=unlist(strsplit(result_path_dt2$geneID[i],"/"))
X1=rep(result_path_dt2$Description[i],length(X2))
X3=rep(result_path_dt2$ID[i],length(X2))
dt=cbind(X3,X1,X2)
gse_dt3=rbind(gse_dt3,dt)
}
CC_result=gse_dt3
gene_list = sort(fc_new, decreasing = TRUE)
names(gene_list)=as.character(names(gene_list))
pathway_list=rbind.data.frame(BP_result,MF_result,CC_result)
colnames(pathway_list)=c("pathwayIDs","geneIDs","GOID")
pathway_list2=pathway_list[pathway_list$GOID%in%name,]
pathway_list3=pathway_list2[,1:2]
sum(names(gene_list)%in% pathway_list3$geneIDs)
length(gene_list)
pathway_list3%>%group_by(pathwayIDs)%>%summarise(n=n())
set.seed(20220905)
gse <- GSEA(gene_list, TERM2GENE = pathway_list3,nPerm = 1000, minGSSize = 3, maxGSSize = 500, pAdjustMethod = "none",pvalueCutoff = 0.05)
figure4a=as.data.frame(gse)
kegg_gene_list<-na.omit(fc_new)
kegg_gene_list = sort(kegg_gene_list, decreasing = TRUE)
set.seed(202205)
kk2 <- gseKEGG(geneList = kegg_gene_list,
organism = "hsa",
nPerm = 10000,
minGSSize = 3,
maxGSSize = 800,
pvalueCutoff = 0.05,
pAdjustMethod = "BH",
keyType = "ncbi-geneid")
datatable(as.data.frame(kk2))
both2=as.data.frame(kk2)
both2=both2[-grep("disease",both2$Description),]
ggplot(both2,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=p.adjust,size=p.adjust))+coord_flip()+theme_bw()+ggtitle("GSEA for significantly downregulated and upregulated pathways")+labs(y="Enrichment score")+
scale_colour_gradient(high = "red", low = "blue")
#get pathway maps
KEGGresult_path <- enrichKEGG(gene = names(geneList),
organism = "hsa", keyType = "kegg", pvalueCutoff = 1, pAdjustMethod = "BH",minGSSize = 0, maxGSSize = 50000000000, qvalueCutoff = 1)
KEGGresult_path_dt=as.data.frame(KEGGresult_path)
KEGGresult_path_dt2=KEGGresult_path_dt[,c("Description","geneID")]
KEGGresult_path_dt2=KEGGresult_path_dt2[-grep("disease",KEGGresult_path_dt2$Description),]
KEGGresult_path_dt3=data.frame(matrix(ncol=2))
for(i in 1:nrow(KEGGresult_path_dt2)){
X2=unlist(strsplit(KEGGresult_path_dt2$geneID[i],"/"))
X1=rep(KEGGresult_path_dt2$Description[i],length(X2))
dt=cbind(X1,X2)
KEGGresult_path_dt3=rbind(KEGGresult_path_dt3,dt)
}
print(paste("The total number of kegg pathways is",length(unique(KEGGresult_path_dt3$X1))))
## [1] "The total number of kegg pathways is 325"
#get all kegg pathways with the mapping
kegg_full_list=download_KEGG("hsa", keggType = "KEGG", keyType = "kegg")
head(kegg_full_list[[1]])
## from to
## 1 hsa00010 10327
## 2 hsa00010 124
## 3 hsa00010 125
## 4 hsa00010 126
## 5 hsa00010 127
## 6 hsa00010 128
## from to
## 1 hsa01100 Metabolic pathways
## 2 hsa01200 Carbon metabolism
## 3 hsa01210 2-Oxocarboxylic acid metabolism
## 4 hsa01212 Fatty acid metabolism
## 5 hsa01230 Biosynthesis of amino acids
## 6 hsa01232 Nucleotide metabolism
## [1] 355
## [1] 355 2
kegg_full_dt=merge(kegg_full_list1,kegg_full_list2,by="from")
kegg_full_dt=kegg_full_dt[,c(3,2)]
length(unique(kegg_full_dt$to.y))
## [1] 355
## GSEA analysis
fc.kegg.p <- gage(fc_new, gsets = kegg.sets.hsa, ref = NULL, samp = NULL, rank.test = TRUE)
upreg <- data.frame(fc.kegg.p[[1]]) %>%
rownames_to_column(var = "pathway") %>%
mutate(KEGGpathway = substr(pathway, start = 1, stop = 8)) %>%
mutate(pathwayname = word(pathway, 2, -1)) %>%
select(pathway, KEGGpathway, pathwayname, set.size, stat.mean, p.val, q.val) %>%
mutate(pathid = substr(KEGGpathway, start = 4, stop = 8))
upreg_selected = upreg %>%
filter(q.val < 0.05) %>%
filter(set.size >= 5)
upreg_selected$neglogP_Value = -log10(upreg_selected$p.val)
upreg_selected$neglogQ_value = -log10(upreg_selected$q.val)
downreg <- data.frame(fc.kegg.p[[2]]) %>%
rownames_to_column(var = "pathway") %>%
mutate(KEGGpathway = substr(pathway, start = 1, stop = 8)) %>%
mutate(pathwayname = word(pathway, 2, -1)) %>%
select(pathway, KEGGpathway, pathwayname, set.size, stat.mean, p.val, q.val) %>%
mutate(pathid = substr(KEGGpathway, start = 4, stop = 8))
downreg_selected = downreg %>%
filter(q.val < 0.05) %>%
filter(set.size >= 5)
downreg_selected$neglogP_Value = -log10(downreg_selected$p.val)
downreg_selected$neglogQ_value = -log10(downreg_selected$q.val)
both = rbind(downreg_selected, upreg_selected)
both$absolute = abs(both$stat.mean)
#datatable(both)
both2 = both %>% filter(pathwayname != "Metabolic pathways" & pathwayname != "Retrograde endocannabinoid signaling")
both2=both2[-grep("disease",both2$pathway),]
both2[,c(5:7,9:11)]=round(both2[,c(5:7,9:11)],4)
datatable(both2)
#saveRDS(both, file = "both_selected.rds")
# pv.out.list.fat <- sapply(both2$pathid[1:2],
# function(pid) pathview(gene.data = fc_new,
# pathway.id = pid,
# species = "hsa",
# out.suffix="human",
# low = list(gene = "deepskyblue1", cpd = "green"),
# high = list(gene = "red", cpd = "yellow")))
library(ggrepel )
ggplot(both2, aes(x = stat.mean, y = -log10(p.val))) + geom_point( aes(color = set.size), size = 3) + labs(x = "Enrichment score", y = "neglog10p", title = "GSEA for significantly downregulated and upregulated pathways") +
geom_text_repel(aes(label=pathwayname), size=3) +
scale_colour_gradient(high = "red", low = "blue") +
scale_x_continuous(
labels = scales::number_format(accuracy = 0.01))
ggplot(both2,aes(x = reorder(pathwayname, stat.mean),y=stat.mean))+geom_point(aes(color=-log10(p.val),size=-log10(p.val)))+coord_flip()+theme_bw()+ggtitle("GSEA for significantly downregulated and upregulated pathways")+labs(y="Enrichment score")+
scale_colour_gradient(high = "red", low = "blue")
library(clusterProfiler)
library(enrichplot)
performGSEA_mito <- function(gene_list,gene.pathway,pval){
# gse <- GSEA(gene_list,
# TERM2GENE = gene.pathway,
# pAdjustMethod = "none",
# scoreType = "pos",
# nPermSimple = 10000
# )
gse <- GSEA(gene_list, TERM2GENE = gene.pathway,nPerm = 1000, minGSSize = 5, maxGSSize = 500, pAdjustMethod = "BH",pvalueCutoff = pval)
GSEresults.rounded <- gse@result[,1:9]
GSEresults.rounded[,4:8] <- signif(GSEresults.rounded[,4:8],4)
GSEresults.rounded[,6:7] <- sapply(GSEresults.rounded[,6:7], format, scientific = TRUE)
table <- DT::datatable(GSEresults.rounded)
dot <- clusterProfiler::dotplot(gse, showCategory=10, split=".sign") + facet_grid(.~.sign)
gse <- enrichplot::pairwise_termsim(gse)
enrich <- emapplot(gse, showCategory = 10)
tree <- treeplot(gse) #https://github.com/YuLab-SMU/ggtree/issues/399 #might be a dplyr issue
return(list(table = table, dot = dot, enrich = enrich, tree = tree,table2=GSEresults.rounded))
}
###
mito_genes <- read_excel("Human.MitoCarta3.0.xls", sheet = 4)
mito_genes <- mito_genes %>% dplyr::select(MitoPathway,Genes) %>% na.omit
gene_sets <- mito_genes$Genes %>% strsplit(", ")
gene.pathway <- data.frame(geneIDs = character(), pathwayIDs = integer())
for(i in 1:length(gene_sets)){
new_data <- data.frame(geneIDs = gene_sets[[i]], pathwayIDs = i)
gene.pathway <- rbind(gene.pathway,new_data)
}
gene.pathway$geneIDs <- mapIds(org.Hs.eg.db, keys = gene.pathway$geneIDs, keytype = "SYMBOL", column = "ENTREZID")
pathway.names <- data.frame(pathwayIDs = 1:length(gene_sets), pathwayNames = mito_genes$MitoPathway)
####
gene_list = sort(fc_new, decreasing = TRUE)
names(gene_list)=as.character(names(gene_list))
gene.pathway2=gene.pathway
gene.pathway2$pathwayIDs=pathway.names$pathwayNames[match(gene.pathway2$pathwayIDs,pathway.names$pathwayIDs)]
gene.pathway2=gene.pathway2[,c(2,1)]#need to put term gene in order
##Ben's mito to delete unused previous mito pathways
ben_mito=read_excel("/Users/MQ10004787/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/Yunwei202110/IHD-DM paper figure 3a mitochondrial pathway list.xlsx")
ben_mito$Description%in%gene.pathway2$pathwayIDs
## [1] TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE FALSE TRUE FALSE TRUE
## [13] TRUE TRUE TRUE FALSE TRUE TRUE FALSE TRUE FALSE TRUE TRUE FALSE
## [25] FALSE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
## [37] FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE
## [49] FALSE TRUE FALSE TRUE TRUE FALSE TRUE FALSE TRUE TRUE TRUE FALSE
## [61] FALSE TRUE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [73] FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [1] 3904 2
## [1] 2414 2
###
#KEGGresult_path_dt3 is the old one
colnames(kegg_full_dt)=colnames(gene.pathway2)
gene.pathway3=rbind.data.frame(gene.pathway2,kegg_full_dt)
set.seed(202205)
gseaResults_mito <-performGSEA_mito(gene_list,gene.pathway3,0.05)
gseaResults_mito$table
#gseaResults_mito$dot
#gseaResults_mito$enrich
#gseaResults_mito$tree
#Normalized enrichment scores (NES)
#https://www.biostars.org/p/169648/
#my plot
ggplotdt=gseaResults_mito$table2
ggplotdt$p.adjust=as.numeric(ggplotdt$p.adjust)
ggplotdtup=ggplotdt[ggplotdt$enrichmentScore>0,]
ggplotdtupsub=ggplotdtup[order(-abs(ggplotdtup$enrichmentScore)),]
ggplotdtdown=ggplotdt[ggplotdt$enrichmentScore<0,]
ggplotdtdownsub=ggplotdtdown[order(ggplotdtdown$enrichmentScore),]
ggplotdt=rbind.data.frame(ggplotdtupsub[1:10,],ggplotdtdownsub[1:10,])
ggplot(ggplotdt,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=p.adjust,size=p.adjust))+coord_flip()+theme_bw()+ggtitle("pathway enrichment analysis")+
scale_colour_gradient(high = "red", low = "blue")
paste("proportion of mito pathways in all pathways used",dim(gene.pathway2)[1]/dim(gene.pathway3)[1])
## [1] "proportion of mito pathways in all pathways used 0.0621380215706968"
paste("proportion of significant mito pathways in all significant pathways with threshold 0.05 is",sum(unique(gene.pathway2$pathwayIDs)%in% gseaResults_mito$table2$Description)/nrow(gseaResults_mito$table2))
## [1] "proportion of significant mito pathways in all significant pathways with threshold 0.05 is 0.377049180327869"
#one proportion z test, h0:p>p0
paste("perform statistical test to see if this is come up by chance: one-proportion z-test for greater shows:")
## [1] "perform statistical test to see if this is come up by chance: one-proportion z-test for greater shows:"
prop.test(x =sum(unique(gene.pathway2$pathwayIDs)%in% gseaResults_mito$table2$Description) , n = nrow(gseaResults_mito$table2), p = dim(gene.pathway2)[1]/dim(gene.pathway3)[1], correct = FALSE,
alternative = "greater")
##
## 1-sample proportions test without continuity correction
##
## data: sum(unique(gene.pathway2$pathwayIDs) %in% gseaResults_mito$table2$Description) out of nrow(gseaResults_mito$table2), null probability dim(gene.pathway2)[1]/dim(gene.pathway3)[1]
## X-squared = 103.8, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is greater than 0.06213802
## 95 percent confidence interval:
## 0.2822577 1.0000000
## sample estimates:
## p
## 0.3770492
## [1] 23
## [1] 61
#plot all the significant mito pathways
ggplotdt2=rbind.data.frame(ggplotdtupsub,ggplotdtdownsub)
ggplotdt2=ggplotdt2[ggplotdt2$Description%in%gene.pathway2$pathwayIDs,]
ggplot(ggplotdt2,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=p.adjust,size=p.adjust))+coord_flip()+theme_bw()+ggtitle("mitochondrail pathway enrichment analysis")+
scale_colour_gradient(high = "red", low = "blue")+ylim(c(-1,1))
library(clusterProfiler)
library(enrichplot)
performGSEA_mito <- function(gene_list,gene.pathway,pval){
# gse <- GSEA(gene_list,
# TERM2GENE = gene.pathway,
# pAdjustMethod = "none",
# scoreType = "pos",
# nPermSimple = 10000
# )
gse <- GSEA(gene_list, TERM2GENE = gene.pathway,nPerm = 1000, minGSSize = 3, maxGSSize = 500, pAdjustMethod = "none",pvalueCutoff = pval)
GSEresults.rounded <- gse@result[,1:9]
GSEresults.rounded[,4:8] <- signif(GSEresults.rounded[,4:8],4)
GSEresults.rounded[,6:7] <- sapply(GSEresults.rounded[,6:7], format, scientific = TRUE)
table <- DT::datatable(GSEresults.rounded)
dot <- clusterProfiler::dotplot(gse, showCategory=10, split=".sign") + facet_grid(.~.sign)
gse <- enrichplot::pairwise_termsim(gse)
enrich <- emapplot(gse, showCategory = 10)
tree <- treeplot(gse) #https://github.com/YuLab-SMU/ggtree/issues/399 #might be a dplyr issue
return(list(table = table, dot = dot, enrich = enrich, tree = tree,table2=GSEresults.rounded))
}
###
mito_genes <- read_excel("Human.MitoCarta3.0.xls", sheet = 4)
mito_genes <- mito_genes %>% dplyr::select(MitoPathway,Genes) %>% na.omit
gene_sets <- mito_genes$Genes %>% strsplit(", ")
gene.pathway <- data.frame(geneIDs = character(), pathwayIDs = integer())
for(i in 1:length(gene_sets)){
new_data <- data.frame(geneIDs = gene_sets[[i]], pathwayIDs = i)
gene.pathway <- rbind(gene.pathway,new_data)
}
gene.pathway$geneIDs <- mapIds(org.Hs.eg.db, keys = gene.pathway$geneIDs, keytype = "SYMBOL", column = "ENTREZID")
pathway.names <- data.frame(pathwayIDs = 1:length(gene_sets), pathwayNames = mito_genes$MitoPathway)
####
gene_list = sort(fc_new, decreasing = TRUE)
names(gene_list)=as.character(names(gene_list))
gene.pathway2=gene.pathway
gene.pathway2$pathwayIDs=pathway.names$pathwayNames[match(gene.pathway2$pathwayIDs,pathway.names$pathwayIDs)]
gene.pathway2=gene.pathway2[,c(2,1)]#need to put term gene in order
##Ben's mito to delete unused previous mito pathways
ben_mito=read_excel("/Users/MQ10004787/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/Yunwei202110/IHD-noDM vs donor KEGG mito pathways 27.6.22.xlsx")
ben_mito$Description%in%gene.pathway2$pathwayIDs
## [1] TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE
## [13] TRUE TRUE FALSE TRUE TRUE TRUE FALSE TRUE FALSE TRUE TRUE TRUE
## [25] TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE
## [37] TRUE FALSE FALSE TRUE FALSE TRUE TRUE TRUE FALSE TRUE TRUE FALSE
## [49] FALSE TRUE FALSE TRUE TRUE FALSE TRUE FALSE FALSE TRUE FALSE TRUE
## [61] TRUE FALSE TRUE TRUE FALSE TRUE FALSE TRUE FALSE FALSE TRUE FALSE
## [73] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [85] TRUE FALSE FALSE FALSE FALSE FALSE
## [1] 3904 2
## [1] 2416 2
###
colnames(kegg_full_dt)=colnames(gene.pathway2)
gene.pathway3=rbind.data.frame(gene.pathway2,kegg_full_dt)
set.seed(202205)
gseaResults_mito <-performGSEA_mito(gene_list,gene.pathway3,0.05)
#my plot
ggplotdt=gseaResults_mito$table2
ggplotdt$p.adjust=as.numeric(ggplotdt$p.adjust)
ggplotdtup=ggplotdt[ggplotdt$enrichmentScore>0,]
ggplotdtupsub=ggplotdtup[order(ggplotdtup$p.adjust),]
ggplotdtdown=ggplotdt[ggplotdt$enrichmentScore<0,]
ggplotdtdownsub=ggplotdtdown[order(ggplotdtdown$p.adjust),]
ggplotdt=rbind.data.frame(ggplotdtupsub[1:10,],ggplotdtdownsub[1:10,])
ggplotdt$pvalue=as.numeric(ggplotdt$pvalue)
gg=ggplot(ggplotdt,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=pvalue,size=pvalue))+coord_flip()+theme_bw()+ggtitle("ihd-dm vs donor pathway enrichment analysis")+
scale_colour_gradient(high = "red", low = "blue")+theme(text = element_text(size = 10))
gg
load("all_omics_lv_v5.RData")
protein_expression=assay(all_omics@ExperimentList$protein, "log2")
protein_expression1=as.data.frame(protein_expression)
dim(protein_expression1)
## [1] 9037 111
#changed 20220224: no overall filtering
protein_expression2=protein_expression1
dim(protein_expression2)
## [1] 9037 111
## [1] 4094 111
#20220321 add the age match as before
infodt1 = colData(all_omics)
infodt1$ihd=as.numeric(as.vector(ifelse(infodt1$Condition=="IHD"&infodt1$Diabetes=="No",1,ifelse(infodt1$Condition=="Donor",0,2))))
ihddt=infodt1[infodt1$ihd!=2,]
ihddt=ihddt[(ihddt$ihd==0&ihddt$Age>=47)|ihddt$ihd==1,]
dim(ihddt)
## [1] 32 14
## [1] "15_LV_IHD_No_41_M" "16_LV_IHD_No_42_F" "17_LV_IHD_No_43_F"
## [4] "18_LV_IHD_No_45_M" "19_LV_IHD_No_47_F" "20_LV_IHD_No_48_F"
## [7] "21_LV_IHD_No_49_F" "22_LV_IHD_No_50_M" "23_LV_IHD_No_50_M"
## [10] "24_LV_IHD_No_54_M" "25_LV_IHD_No_54_M" "26_LV_IHD_No_54_F"
## [13] "27_LV_IHD_No_62_F" "28_LV_IHD_No_62_M" "29_LV_IHD_No_62_M"
## [16] "30_LV_IHD_No_66_M" "94_LV_Donor_No_47_M" "95_LV_Donor_No_47_F"
## [19] "96_LV_Donor_No_48_M" "97_LV_Donor_No_49_F" "98_LV_Donor_No_51_F"
## [22] "99_LV_Donor_No_53_M" "100_LV_Donor_No_53_F" "101_LV_Donor_No_54_M"
## [25] "102_LV_Donor_No_54_F" "104_LV_Donor_No_55_F" "105_LV_Donor_No_56_M"
## [28] "106_LV_Donor_No_60_M" "107_LV_Donor_No_62_M" "108_LV_Donor_No_62_F"
## [31] "109_LV_Donor_No_65_M" "110_LV_Donor_No_65_F"
indexx=ihddt$new_filename
protein_expression2=protein_expression2[,colnames(protein_expression2)%in%indexx]
dim(protein_expression2)
## [1] 9037 32
#get the "lv" "ihd-dm" "ihd-no dm" non-imputed data
experiment_data_noimputation=protein_expression2[,union(grep("_IHD_No",colnames(protein_expression2)),grep("Donor_No",colnames(protein_expression2)))]
#colnames(experiment_data_noimputation)
dim(experiment_data_noimputation)
## [1] 9037 32
#filtering out proteins with >25% samples missing
experiment_data_noimputation=experiment_data_noimputation[which(rowMeans(!is.na(experiment_data_noimputation)) > 0.25), ]
dim(experiment_data_noimputation)
## [1] 4052 32
diabetes=grep("IHD",colnames(experiment_data_noimputation))
experiment_data_noimputation1=as.data.frame(t(experiment_data_noimputation))
rownames(experiment_data_noimputation1)=1:dim(experiment_data_noimputation1)[1]
experiment_data_noimputation1$y=ifelse(rownames(experiment_data_noimputation1) %in% diabetes,1,0)
#de analysis-- should be on the non-imputed data
groupname<-as.factor(experiment_data_noimputation1$y)
design <- model.matrix(~ groupname + 0)
fit <- lmFit(experiment_data_noimputation, design)
cont.matrix <- makeContrasts(groupname1-groupname0, levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2)
tT <- topTable(fit2)
df1 <- topTable(fit2, number=nrow(fit2), genelist=rownames(experiment_data_noimputation))
df1$name=rownames(df1)
df1$neglogP_value = -log10(df1$P.Value)
# g1=ggplot2::ggplot(df1, aes(logFC,neglogP_value,label=name))+
# geom_point(size = 1, alpha=1) +
# xlab("log2 fold change") + ylab("-log10 p-value")+ggtitle("IHD-T2DM vs Donor") +
# theme_bw(base_size = 16) +
# geom_hline(yintercept = 1.3, linetype = "dashed", colour = "brown3")+theme(aspect.ratio = 1)
#add fdr plot
adj_to_p <- function(df) {
#df <- topTable(fit, coef = comparison, number = nrow(fit))
if ((df %>%filter(adj.P.Val < 0.05) %>% nrow()) == 0){
r <- 1
} else {
r <- df%>%mutate(rn = row_number()) %>% filter(adj.P.Val <= 0.05) %>% pull(rn)%>%max() + 1
}
n <- nrow(df) + 1
lp10 <- -log10(r / n * 0.05)
lp10
}
#add samples
colnames(experiment_data_noimputation)
## [1] "17_LV_IHD_No_43_F" "26_LV_IHD_No_54_F" "27_LV_IHD_No_62_F"
## [4] "24_LV_IHD_No_54_M" "16_LV_IHD_No_42_F" "22_LV_IHD_No_50_M"
## [7] "21_LV_IHD_No_49_F" "19_LV_IHD_No_47_F" "20_LV_IHD_No_48_F"
## [10] "23_LV_IHD_No_50_M" "28_LV_IHD_No_62_M" "15_LV_IHD_No_41_M"
## [13] "29_LV_IHD_No_62_M" "18_LV_IHD_No_45_M" "30_LV_IHD_No_66_M"
## [16] "25_LV_IHD_No_54_M" "110_LV_Donor_No_65_F" "101_LV_Donor_No_54_M"
## [19] "94_LV_Donor_No_47_M" "98_LV_Donor_No_51_F" "105_LV_Donor_No_56_M"
## [22] "100_LV_Donor_No_53_F" "109_LV_Donor_No_65_M" "99_LV_Donor_No_53_M"
## [25] "97_LV_Donor_No_49_F" "108_LV_Donor_No_62_F" "102_LV_Donor_No_54_F"
## [28] "96_LV_Donor_No_48_M" "106_LV_Donor_No_60_M" "95_LV_Donor_No_47_F"
## [31] "107_LV_Donor_No_62_M" "104_LV_Donor_No_55_F"
## [1] 2.683497
highlight_df=df1
highlight_df$colorcode=as.character(ifelse(highlight_df$neglogP_value>adj_to_p(df1)&highlight_df$logFC>0,1,ifelse(highlight_df$neglogP_value>adj_to_p(df1)&highlight_df$logFC<0,-1,0)))
dim(highlight_df)
## [1] 4052 10
## [1] 4051 10
g1=ggplot2::ggplot(highlight_df, aes(logFC,neglogP_value,label=name))+
geom_point(aes(x=logFC,y=neglogP_value,color=colorcode)) +
xlab("log2 fold change") + ylab("-log10 p-value")+ggtitle("IHD-NO DM vs Donor") +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),aspect.ratio = 1) +
geom_hline(yintercept = adj_to_p(df1), linetype = "dashed", colour = "red")+scale_color_manual(values = c("blue","gray","red"))
ggplotly(g1)
#pathway analysis
#pathway analysis as in our shiny:delete missing values
experiment_data_pathway=na.omit(experiment_data_noimputation)
dim(experiment_data_pathway)
## [1] 2212 32
##With voom for finding pathways
groupname<-as.factor(experiment_data_noimputation1$y)
design <- model.matrix(~ groupname + 0)
log2.cpm <- voom(experiment_data_pathway, design, normalize.method = "quantile")
fit <- lmFit(log2.cpm, design)
cont.matrix <- makeContrasts(groupname1-groupname0, levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2)
limma.res=topTable(fit2,n=Inf,sort="p", adjust.method = "fdr")
fc_new <- limma.res$logFC
limma.res$entrez = mapIds(org.Hs.eg.db,
keys=row.names(limma.res),
column="ENTREZID",
keytype=keytypes(org.Hs.eg.db)[2],
multiVals="first")
names(fc_new) <- limma.res$entrez
geneList<-na.omit(fc_new)
geneList = sort(geneList, decreasing = TRUE)
kk2 <- gseGO(geneList = geneList,
OrgDb = org.Hs.eg.db,
ont = "BP",
minGSSize = 100,
maxGSSize = 500,
pvalueCutoff = 0.05,
verbose = FALSE)
kk2_dt=as.data.frame(kk2)
kk2_dt[,c(4,5,7,8)]=round(kk2_dt[,c(4,5,7,8)],4)
datatable(kk2_dt)
BP_namelist=kk2_dt$Description
#https://yulab-smu.top/biomedical-knowledge-mining-book/clusterprofiler-go.html
#gene ontology over-representation test
ego2 <- enrichGO(gene = names(geneList),
OrgDb = org.Hs.eg.db,
keyType = 'ENTREZID',
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.01,
qvalueCutoff = 0.05)
ego2_dt=as.data.frame(ego2)
ego2_dt[,5:7]=round(ego2_dt[,5:7],4)
datatable(ego2_dt)
# Let is suppose I have a collection of genesets called : HALLMARK Now let is suppose there is a specific geneset there called: E2F_targets
#
# BgRatio, M/N.
#
# M = size of the geneset (eg size of the E2F_targets); (is the number of genes within that distribution that are annotated (either directly or indirectly) to the node of interest).
#
# N = size of all of the unique genes in the collection of genesets (example the HALLMARK collection); (is the total number of genes in the background distribution (universe)
#
# GeneRatio is k/n.
#
# k = size of the overlap of 'a vector of gene id' you input with the specific geneset (eg E2F_targets), only unique genes; (the number of genes within that list n, which are annotated to the node.
#
# n = size of the overlap of 'a vector of gene id' you input with all the members of the collection of genesets (eg the HALLMARK collection),only unique genes; is the size of the list of genes of interest
#adj pval and qval
#https://support.bioconductor.org/p/96329/
#adj pval is the fdr adjusted pval, qval from another package is another adjustment method
#the BH approach is slightly more conservative than qvalue
#get pathway maps
result_path <- enrichGO(gene = names(geneList),
OrgDb = org.Hs.eg.db,
keyType = 'ENTREZID',
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff =1,
qvalueCutoff =1)
result_path_dt=as.data.frame(result_path)
result_path_dt2=result_path_dt[,c("Description","geneID")]
result_path_dt3=data.frame(matrix(ncol=2))
for(i in 1:nrow(result_path_dt2)){
X2=unlist(strsplit(result_path_dt2$geneID[i],"/"))
X1=rep(result_path_dt2$Description[i],length(X2))
dt=cbind(X1,X2)
result_path_dt3=rbind(result_path_dt3,dt)
}
kk3 <- gseGO(geneList = geneList,
OrgDb = org.Hs.eg.db,
ont = "MF",
minGSSize = 100,
maxGSSize = 500,
pvalueCutoff = 0.05,
verbose = FALSE)
kk3_dt=as.data.frame(kk3)
kk3_dt[,c(4,5,7,8)]=round(kk3_dt[,c(4,5,7,8)],4)
datatable(as.data.frame(kk3_dt))
#https://yulab-smu.top/biomedical-knowledge-mining-book/clusterprofiler-go.html
#gene ontology over-representation test
ego3 <- enrichGO(gene = names(geneList),
OrgDb = org.Hs.eg.db,
keyType = 'ENTREZID',
ont = "MF",
pAdjustMethod = "BH",
pvalueCutoff = 0.01,
qvalueCutoff = 0.05)
ego3_dt=as.data.frame(ego3)
ego3_dt[,5:7]=round(ego3_dt[,5:7],4)
datatable(as.data.frame(ego3_dt))
MF_namelist=kk3_dt$Description
#get pathway maps
MFresult_path <- enrichGO(gene = names(geneList),
OrgDb = org.Hs.eg.db,
keyType = 'ENTREZID',
ont = "MF",
pAdjustMethod = "BH",
pvalueCutoff =1,
qvalueCutoff =1)
MFresult_path_dt=as.data.frame(MFresult_path)
MFresult_path_dt2=MFresult_path_dt[,c("Description","geneID")]
MFresult_path_dt3=data.frame(matrix(ncol=2))
for(i in 1:nrow(MFresult_path_dt2)){
X2=unlist(strsplit(MFresult_path_dt2$geneID[i],"/"))
X1=rep(MFresult_path_dt2$Description[i],length(X2))
dt=cbind(X1,X2)
MFresult_path_dt3=rbind(MFresult_path_dt3,dt)
}
#new things 20220907: add customised pathway analysis results
name=c("GO:0030198","GO:0055007","GO:0030239","GO:0060047","GO:0007010","GO:0031012","GO:0043292","GO:0005856")
AnnotationDbi::select(GO.db, keys=name, columns=c("TERM","ONTOLOGY"), keytype="GOID")
## GOID TERM ONTOLOGY
## 1 GO:0030198 extracellular matrix organization BP
## 2 GO:0055007 cardiac muscle cell differentiation BP
## 3 GO:0030239 myofibril assembly BP
## 4 GO:0060047 heart contraction BP
## 5 GO:0007010 cytoskeleton organization BP
## 6 GO:0031012 extracellular matrix CC
## 7 GO:0043292 contractile fiber CC
## 8 GO:0005856 cytoskeleton CC
pathwayIDs=c("extracellular matrix organization","cardiac muscle cell differentiation", "myofibril assembly", "heart contraction" ,"cytoskeleton organization","extracellular matrix" ,"contractile fiber", "cytoskeleton")
all_name=c(name,GOBPOFFSPRING[["GO:0030198"]],GOBPOFFSPRING[["GO:0055007"]],GOBPOFFSPRING[["GO:0030239"]],GOBPOFFSPRING[["GO:0060047"]],GOBPOFFSPRING[["GO:0007010"]],GOCCOFFSPRING[["GO:0031012"]],GOCCOFFSPRING[["GO:0043292"]],GOCCOFFSPRING[["GO:0005856"]])
all_name2=AnnotationDbi::select(GO.db, keys=all_name, columns=c("TERM","ONTOLOGY"), keytype="GOID")
GO_v<- AnnotationDbi::select(org.Hs.eg.db, keys=all_name2$GOID, columns=c("GO","ONTOLOGY","SYMBOL","ENTREZID"),keytype="GO")
pp2=merge(all_name2,GO_v,by.x="GOID",by.y="GO",all=TRUE)
datatable(pp2%>%group_by(GOID)%>%summarise(n=n()))
gene_list = sort(fc_new, decreasing = TRUE)
names(gene_list)=as.character(names(gene_list))
pathway_list3=pp2[,c("TERM","ENTREZID")]
colnames(pathway_list3)=c("pathwayIDs","geneIDs")
sum(names(gene_list)%in% pathway_list3$geneIDs)
## [1] 891
## [1] 1321
## [1] 2212
#pathway_list3%>%group_by(pathwayIDs)%>%summarise(n=n())
set.seed(20220907)
gse <- GSEA(gene_list, TERM2GENE = pathway_list3,nPerm = 1000, minGSSize = 3, maxGSSize = 500, pAdjustMethod = "none",pvalueCutoff = 0.05)
figure4b=as.data.frame(gse)
figure4bv2=figure4b
figure4bv2[,4:9]=round(figure4bv2[,4:9],3)
datatable(figure4bv2)
#new things 20220905: add customised pathway analysis results
name=c("GO:0030198","GO:0055007","GO:0030239","GO:0060047","GO:0007010","GO:0031012","GO:0043292","GO:0005856")
#get pathway maps
result_path <- enrichGO(gene = names(geneList),
OrgDb = org.Hs.eg.db,
keyType = 'ENTREZID',
ont = "BP",
pAdjustMethod = "none",
pvalueCutoff =1,
qvalueCutoff =1)
result_path_dt=as.data.frame(result_path)
gse_result=gseGO(geneList = geneList,
OrgDb = org.Hs.eg.db,
ont = "BP",
minGSSize = 3,
maxGSSize = 500,
pvalueCutoff = 1,
pAdjustMethod = "none",
verbose = FALSE)
gse_result2=as.data.frame(gse_result)#about 200 enrichgo not in gsego
result_path_dt=gse_result2
result_path_dt2=result_path_dt[,c("Description","core_enrichment","ID")]
colnames(result_path_dt2)=c("Description","geneID","ID")
# "GO:0007010"%in%result_path_dt2$ID
# "biological process"%in%result_path_dt2$Description
# "cellular process"%in%result_path_dt2$Description
gse_dt3=data.frame(matrix(ncol=3))
for(i in 1:nrow(result_path_dt2)){
X2=unlist(strsplit(result_path_dt2$geneID[i],"/"))
X1=rep(result_path_dt2$Description[i],length(X2))
X3=rep(result_path_dt2$ID[i],length(X2))
dt=cbind(X3,X1,X2)
gse_dt3=rbind(gse_dt3,dt)
}
BP_result=gse_dt3
"GO:0007010"%in%BP_result$X3
gse_result=gseGO(geneList = geneList,
OrgDb = org.Hs.eg.db,
ont = "MF",
minGSSize = 3,
maxGSSize = 500,
pvalueCutoff = 1,
pAdjustMethod = "none",
verbose = FALSE)
gse_result2=as.data.frame(gse_result)
result_path_dt=gse_result2
result_path_dt2=result_path_dt[,c("Description","core_enrichment","ID")]
colnames(result_path_dt2)=c("Description","geneID","ID")
gse_dt3=data.frame(matrix(ncol=3))
for(i in 1:nrow(result_path_dt2)){
X2=unlist(strsplit(result_path_dt2$geneID[i],"/"))
X1=rep(result_path_dt2$Description[i],length(X2))
X3=rep(result_path_dt2$ID[i],length(X2))
dt=cbind(X3,X1,X2)
gse_dt3=rbind(gse_dt3,dt)
}
MF_result=gse_dt3
gse_result=gseGO(geneList = geneList,
OrgDb = org.Hs.eg.db,
ont = "CC",
minGSSize = 3,
maxGSSize = 500,
pvalueCutoff = 1,
pAdjustMethod = "none",
verbose = FALSE)
gse_result2=as.data.frame(gse_result)
result_path_dt=gse_result2
result_path_dt2=result_path_dt[,c("Description","core_enrichment","ID")]
colnames(result_path_dt2)=c("Description","geneID","ID")
gse_dt3=data.frame(matrix(ncol=3))
for(i in 1:nrow(result_path_dt2)){
X2=unlist(strsplit(result_path_dt2$geneID[i],"/"))
X1=rep(result_path_dt2$Description[i],length(X2))
X3=rep(result_path_dt2$ID[i],length(X2))
dt=cbind(X3,X1,X2)
gse_dt3=rbind(gse_dt3,dt)
}
CC_result=gse_dt3
gene_list = sort(fc_new, decreasing = TRUE)
names(gene_list)=as.character(names(gene_list))
pathway_list=rbind.data.frame(BP_result,MF_result,CC_result)
colnames(pathway_list)=c("pathwayIDs","geneIDs","GOID")
pathway_list2=pathway_list[pathway_list$GOID%in%name,]
pathway_list3=pathway_list2[,1:2]
set.seed(20220905)
gse <- GSEA(gene_list, TERM2GENE = pathway_list3,nPerm = 1000, minGSSize = 3, maxGSSize = 500, pAdjustMethod = "none",pvalueCutoff = 0.05)
figure4b=as.data.frame(gse)
kegg_gene_list<-na.omit(fc_new)
kegg_gene_list = sort(kegg_gene_list, decreasing = TRUE)
set.seed(202205)
kk2 <- gseKEGG(geneList = kegg_gene_list,
organism = "hsa",
nPerm = 10000,
minGSSize = 3,
maxGSSize = 800,
pvalueCutoff = 0.05,
pAdjustMethod = "BH",
keyType = "ncbi-geneid")
datatable(as.data.frame(kk2))
both2=as.data.frame(kk2)
both2=both2[-grep("disease",both2$Description),]
ggplot(both2,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=p.adjust,size=p.adjust))+coord_flip()+theme_bw()+ggtitle("GSEA for significantly downregulated and upregulated pathways")+labs(y="Enrichment score")+
scale_colour_gradient(high = "red", low = "blue")
#get pathway maps
KEGGresult_path <- enrichKEGG(gene = names(geneList),
organism = "hsa", keyType = "kegg", pvalueCutoff = 1, pAdjustMethod = "BH",minGSSize = 0, maxGSSize = 3000, qvalueCutoff = 1)
KEGGresult_path_dt=as.data.frame(KEGGresult_path)
KEGGresult_path_dt2=KEGGresult_path_dt[,c("Description","geneID")]
KEGGresult_path_dt2=KEGGresult_path_dt2[-grep("disease",KEGGresult_path_dt2$Description),]
KEGGresult_path_dt3=data.frame(matrix(ncol=2))
for(i in 1:nrow(KEGGresult_path_dt2)){
X2=unlist(strsplit(KEGGresult_path_dt2$geneID[i],"/"))
X1=rep(KEGGresult_path_dt2$Description[i],length(X2))
dt=cbind(X1,X2)
KEGGresult_path_dt3=rbind(KEGGresult_path_dt3,dt)
}
## GSEA analysis
fc.kegg.p <- gage(fc_new, gsets = kegg.sets.hsa, ref = NULL, samp = NULL, rank.test = TRUE)
upreg <- data.frame(fc.kegg.p[[1]]) %>%
rownames_to_column(var = "pathway") %>%
mutate(KEGGpathway = substr(pathway, start = 1, stop = 8)) %>%
mutate(pathwayname = word(pathway, 2, -1)) %>%
select(pathway, KEGGpathway, pathwayname, set.size, stat.mean, p.val, q.val) %>%
mutate(pathid = substr(KEGGpathway, start = 4, stop = 8))
upreg_selected = upreg %>%
filter(q.val < 0.05) %>%
filter(set.size >= 5)
upreg_selected$neglogP_Value = -log10(upreg_selected$p.val)
upreg_selected$neglogQ_value = -log10(upreg_selected$q.val)
downreg <- data.frame(fc.kegg.p[[2]]) %>%
rownames_to_column(var = "pathway") %>%
mutate(KEGGpathway = substr(pathway, start = 1, stop = 8)) %>%
mutate(pathwayname = word(pathway, 2, -1)) %>%
select(pathway, KEGGpathway, pathwayname, set.size, stat.mean, p.val, q.val) %>%
mutate(pathid = substr(KEGGpathway, start = 4, stop = 8))
downreg_selected = downreg %>%
filter(q.val < 0.05) %>%
filter(set.size >= 5)
downreg_selected$neglogP_Value = -log10(downreg_selected$p.val)
downreg_selected$neglogQ_value = -log10(downreg_selected$q.val)
both = rbind(downreg_selected, upreg_selected)
both$absolute = abs(both$stat.mean)
#datatable(both)
both2 = both %>% filter(pathwayname != "Metabolic pathways" & pathwayname != "Retrograde endocannabinoid signaling")
both2=both2[-grep("disease",both2$pathway),]
both2[,c(5:7,9:11)]=round(both2[,c(5:7,9:11)],4)
datatable(both2)
#saveRDS(both, file = "both_selected.rds")
# pv.out.list.fat <- sapply(both$pathid,
# function(pid) pathview(gene.data = fc,
# pathway.id = pid,
# species = "hsa",
# out.suffix="human",
# low = list(gene = "deepskyblue1", cpd = "green"),
# high = list(gene = "red", cpd = "yellow")))
# library(ggrepel )
# ggplot(both2, aes(x = stat.mean, y = -log10(p.val))) + geom_point( aes(color = set.size), size = 3) + labs(x = "Enrichment score", y = "neglog10p", title = "GSEA for significantly downregulated and upregulated pathways") +
# geom_text_repel(aes(label=pathwayname), size=3) +
# scale_colour_gradient(high = "red", low = "blue") +
# scale_x_continuous(
# labels = scales::number_format(accuracy = 0.01))
ggplot(both2,aes(x = reorder(pathwayname, stat.mean),y=stat.mean))+geom_point(aes(color=-log10(p.val),size=-log10(p.val)))+coord_flip()+theme_bw()+ggtitle("GSEA for significantly downregulated and upregulated pathways")+labs(y="Enrichment score")+
scale_colour_gradient(high = "red", low = "blue")
library(clusterProfiler)
library(enrichplot)
performGSEA_mito <- function(gene_list,gene.pathway,pval){
# gse <- GSEA(gene_list,
# TERM2GENE = gene.pathway,
# pAdjustMethod = "none",
# scoreType = "pos",
# nPermSimple = 10000
# )
gse <- GSEA(gene_list, TERM2GENE = gene.pathway,nPerm = 1000, minGSSize = 5, maxGSSize = 500, pAdjustMethod = "BH",pvalueCutoff = pval)
GSEresults.rounded <- gse@result[,1:9]
GSEresults.rounded[,4:8] <- signif(GSEresults.rounded[,4:8],4)
GSEresults.rounded[,6:7] <- sapply(GSEresults.rounded[,6:7], format, scientific = TRUE)
table <- DT::datatable(GSEresults.rounded)
dot <- clusterProfiler::dotplot(gse, showCategory=10, split=".sign") + facet_grid(.~.sign)
gse <- enrichplot::pairwise_termsim(gse)
enrich <- emapplot(gse, showCategory = 10)
tree <- treeplot(gse) #https://github.com/YuLab-SMU/ggtree/issues/399 #might be a dplyr issue
return(list(table = table, dot = dot, enrich = enrich, tree = tree,table2=GSEresults.rounded))
}
###
mito_genes <- read_excel("Human.MitoCarta3.0.xls", sheet = 4)
mito_genes <- mito_genes %>% dplyr::select(MitoPathway,Genes) %>% na.omit
gene_sets <- mito_genes$Genes %>% strsplit(", ")
gene.pathway <- data.frame(geneIDs = character(), pathwayIDs = integer())
for(i in 1:length(gene_sets)){
new_data <- data.frame(geneIDs = gene_sets[[i]], pathwayIDs = i)
gene.pathway <- rbind(gene.pathway,new_data)
}
gene.pathway$geneIDs <- mapIds(org.Hs.eg.db, keys = gene.pathway$geneIDs, keytype = "SYMBOL", column = "ENTREZID")
pathway.names <- data.frame(pathwayIDs = 1:length(gene_sets), pathwayNames = mito_genes$MitoPathway)
####
gene_list = sort(fc_new, decreasing = TRUE)
names(gene_list)=as.character(names(gene_list))
gene.pathway2=gene.pathway
gene.pathway2$pathwayIDs=pathway.names$pathwayNames[match(gene.pathway2$pathwayIDs,pathway.names$pathwayIDs)]
gene.pathway2=gene.pathway2[,c(2,1)]#need to put term gene in order
##Ben's mito to delete unused previous mito pathways
ben_mito=read_excel("/Users/MQ10004787/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/Yunwei202110/IHD-noDM vs donor KEGG mito pathways 27.6.22.xlsx")
ben_mito$Description%in%gene.pathway2$pathwayIDs
## [1] TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE
## [13] TRUE TRUE FALSE TRUE TRUE TRUE FALSE TRUE FALSE TRUE TRUE TRUE
## [25] TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE
## [37] TRUE FALSE FALSE TRUE FALSE TRUE TRUE TRUE FALSE TRUE TRUE FALSE
## [49] FALSE TRUE FALSE TRUE TRUE FALSE TRUE FALSE FALSE TRUE FALSE TRUE
## [61] TRUE FALSE TRUE TRUE FALSE TRUE FALSE TRUE FALSE FALSE TRUE FALSE
## [73] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [85] TRUE FALSE FALSE FALSE FALSE FALSE
## [1] 3904 2
## [1] 2416 2
###
colnames(kegg_full_dt)=colnames(gene.pathway2)
gene.pathway3=rbind.data.frame(gene.pathway2,kegg_full_dt)
set.seed(202205)
gseaResults_mito <-performGSEA_mito(gene_list,gene.pathway3,0.1)
gseaResults_mito$table
#gseaResults_mito$dot
#gseaResults_mito$enrich
#gseaResults_mito$tree
#Normalized enrichment scores (NES)
#https://www.biostars.org/p/169648/
#my plot
ggplotdt=gseaResults_mito$table2
ggplotdt$p.adjust=as.numeric(ggplotdt$p.adjust)
ggplotdtup=ggplotdt[ggplotdt$enrichmentScore>0,]
ggplotdtupsub=ggplotdtup[order(-abs(ggplotdtup$enrichmentScore)),]
ggplotdtdown=ggplotdt[ggplotdt$enrichmentScore<0,]
ggplotdtdownsub=ggplotdtdown[order(ggplotdtdown$enrichmentScore),]
ggplotdt=rbind.data.frame(ggplotdtupsub[1:2,],ggplotdtdownsub[1:10,])
ggplot(ggplotdt,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=p.adjust,size=p.adjust))+coord_flip()+theme_bw()+ggtitle("pathway enrichment analysis")+
scale_colour_gradient(high = "red", low = "blue")
## [1] 0.3333333
paste("proportion of mito pathways in all pathways used",dim(gene.pathway2)[1]/dim(gene.pathway3)[1])
## [1] "proportion of mito pathways in all pathways used 0.0621863015109006"
paste("proportion of significant mito pathways in all significant pathways with threshold 0.05 is",sum(unique(gene.pathway2$pathwayIDs)%in% gseaResults_mito$table2$Description)/nrow(gseaResults_mito$table2))
## [1] "proportion of significant mito pathways in all significant pathways with threshold 0.05 is 0.225806451612903"
#one proportion z test, h0:p>p0
paste("perform statistical test to see if this is come up by chance: one-proportion z-test for greater shows:")
## [1] "perform statistical test to see if this is come up by chance: one-proportion z-test for greater shows:"
prop.test(x =sum(unique(gene.pathway2$pathwayIDs)%in% gseaResults_mito$table2$Description) , n = nrow(gseaResults_mito$table2), p = dim(gene.pathway2)[1]/dim(gene.pathway3)[1], correct = FALSE,
alternative = "greater")
##
## 1-sample proportions test without continuity correction
##
## data: sum(unique(gene.pathway2$pathwayIDs) %in% gseaResults_mito$table2$Description) out of nrow(gseaResults_mito$table2), null probability dim(gene.pathway2)[1]/dim(gene.pathway3)[1]
## X-squared = 14.231, df = 1, p-value = 8.086e-05
## alternative hypothesis: true p is greater than 0.0621863
## 95 percent confidence interval:
## 0.1273292 1.0000000
## sample estimates:
## p
## 0.2258065
## [1] 7
## [1] 31
#plot all the significant mito pathways
ggplotdt2=rbind.data.frame(ggplotdtupsub,ggplotdtdownsub)
ggplotdt2=ggplotdt2[ggplotdt2$Description%in%gene.pathway2$pathwayIDs,]
ggplot(ggplotdt2,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=p.adjust,size=p.adjust))+coord_flip()+theme_bw()+ggtitle("mitochondrail pathway enrichment analysis")+
scale_colour_gradient(high = "red", low = "blue")+ylim(c(-1,1))
library(clusterProfiler)
library(enrichplot)
performGSEA_mito <- function(gene_list,gene.pathway,pval){
# gse <- GSEA(gene_list,
# TERM2GENE = gene.pathway,
# pAdjustMethod = "none",
# scoreType = "pos",
# nPermSimple = 10000
# )
gse <- GSEA(gene_list, TERM2GENE = gene.pathway,nPerm = 1000, minGSSize = 3, maxGSSize = 500, pAdjustMethod = "none",pvalueCutoff = pval)
GSEresults.rounded <- gse@result[,1:9]
GSEresults.rounded[,4:8] <- signif(GSEresults.rounded[,4:8],4)
GSEresults.rounded[,6:7] <- sapply(GSEresults.rounded[,6:7], format, scientific = TRUE)
table <- DT::datatable(GSEresults.rounded)
dot <- clusterProfiler::dotplot(gse, showCategory=10, split=".sign") + facet_grid(.~.sign)
gse <- enrichplot::pairwise_termsim(gse)
enrich <- emapplot(gse, showCategory = 10)
tree <- treeplot(gse) #https://github.com/YuLab-SMU/ggtree/issues/399 #might be a dplyr issue
return(list(table = table, dot = dot, enrich = enrich, tree = tree,table2=GSEresults.rounded))
}
###
mito_genes <- read_excel("Human.MitoCarta3.0.xls", sheet = 4)
mito_genes <- mito_genes %>% dplyr::select(MitoPathway,Genes) %>% na.omit
gene_sets <- mito_genes$Genes %>% strsplit(", ")
gene.pathway <- data.frame(geneIDs = character(), pathwayIDs = integer())
for(i in 1:length(gene_sets)){
new_data <- data.frame(geneIDs = gene_sets[[i]], pathwayIDs = i)
gene.pathway <- rbind(gene.pathway,new_data)
}
gene.pathway$geneIDs <- mapIds(org.Hs.eg.db, keys = gene.pathway$geneIDs, keytype = "SYMBOL", column = "ENTREZID")
pathway.names <- data.frame(pathwayIDs = 1:length(gene_sets), pathwayNames = mito_genes$MitoPathway)
####
gene_list = sort(fc_new, decreasing = TRUE)
names(gene_list)=as.character(names(gene_list))
gene.pathway2=gene.pathway
gene.pathway2$pathwayIDs=pathway.names$pathwayNames[match(gene.pathway2$pathwayIDs,pathway.names$pathwayIDs)]
gene.pathway2=gene.pathway2[,c(2,1)]#need to put term gene in order
##Ben's mito to delete unused previous mito pathways
ben_mito=read_excel("/Users/MQ10004787/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/Yunwei202110/IHD-noDM vs donor KEGG mito pathways 27.6.22.xlsx")
ben_mito$Description%in%gene.pathway2$pathwayIDs
## [1] TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE
## [13] TRUE TRUE FALSE TRUE TRUE TRUE FALSE TRUE FALSE TRUE TRUE TRUE
## [25] TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE
## [37] TRUE FALSE FALSE TRUE FALSE TRUE TRUE TRUE FALSE TRUE TRUE FALSE
## [49] FALSE TRUE FALSE TRUE TRUE FALSE TRUE FALSE FALSE TRUE FALSE TRUE
## [61] TRUE FALSE TRUE TRUE FALSE TRUE FALSE TRUE FALSE FALSE TRUE FALSE
## [73] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [85] TRUE FALSE FALSE FALSE FALSE FALSE
## [1] 3904 2
## [1] 2416 2
###
colnames(kegg_full_dt)=colnames(gene.pathway2)
gene.pathway3=rbind.data.frame(gene.pathway2,kegg_full_dt)
set.seed(202205)
gseaResults_mito <-performGSEA_mito(gene_list,gene.pathway3,0.05)
ggplotdt=gseaResults_mito$table2
ggplotdt$p.adjust=as.numeric(ggplotdt$p.adjust)
ggplotdtup=ggplotdt[ggplotdt$enrichmentScore>0,]
ggplotdtupsub=ggplotdtup[order(ggplotdtup$p.adjust),]
ggplotdtdown=ggplotdt[ggplotdt$enrichmentScore<0,]
ggplotdtdownsub=ggplotdtdown[order(ggplotdtdown$p.adjust),]
ggplotdt=rbind.data.frame(ggplotdtupsub[1:10,],ggplotdtdownsub[1:10,])
ggplotdt$pvalue=as.numeric(ggplotdt$pvalue)
gg=ggplot(ggplotdt,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=pvalue,size=pvalue))+coord_flip()+theme_bw()+ggtitle("ihd-no dm vs donor pathway enrichment analysis")+
scale_colour_gradient(high = "red", low = "blue")+theme(text = element_text(size = 10))
gg
## [1] 90 9
## [1] 67 9
common_name=c(ihddm_dt2$Description,ihdnodm_dt2$Description)
plotmerge=merge(ihddm_dt2,ihdnodm_dt2,by="Description")
plotmerge$pvalue.x=-log10(as.numeric(plotmerge$pvalue.x))
plotmerge$pvalue.y=-log10(as.numeric(plotmerge$pvalue.y))
annotation=read_excel("/Users/MQ10004787/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/Yunwei202110/IHD-DM paper figure 3a pathways to annotate.xlsx")
plotmerge$mito=as.character(ifelse(plotmerge$Description%in% ben_mito$Description,1,0))
plotmerge$annotation=as.character(ifelse(plotmerge$Description%in% annotation$`Amino acid metabolism`,1,0))
ggplotly(ggplot(plotmerge,aes(pvalue.x,pvalue.y,label=Description,color=mito))+geom_point()+ggtitle("kegg+mito pathways ihd-dm vs ihd-no dm")+xlab("neg_log10_pval in ind-dm vs donor")+ylab("neg_log10_pval in ind-no dm vs donor")+theme_bw())
library(ggrepel)
#gg=ggplot(plotmerge,aes(pvalue.x,pvalue.y,label=Description,color=mito))+geom_point()+ geom_text_repel(aes(x=pvalue.x,y=pvalue.y,label=Description),subset(plotmerge,((pvalue.x>1&pvalue.y<2)|(pvalue.x<1&pvalue.y>1)|(pvalue.x>2&pvalue.y>2))), size=3)+ggtitle("kegg+mito pathways ihd-dm vs ihd-no dm")+xlab("neg_log10_pval in ind-dm vs donor")+ylab("neg_log10_pval in ind-no dm vs donor")+ scale_color_manual(values=c("black", "seagreen"))+theme_bw()
gg=ggplot(plotmerge,aes(pvalue.x,pvalue.y,label=Description,color=mito))+geom_point()+ geom_text_repel(aes(x=pvalue.x,y=pvalue.y,label=Description),subset(plotmerge,plotmerge$annotation==1), size=3)+ggtitle("kegg+mito pathways ihd-dm vs ihd-no dm")+xlab("neg_log10_pval in ind-dm vs donor")+ylab("neg_log10_pval in ind-no dm vs donor")+ scale_color_manual(values=c("black", "seagreen"))+theme_bw()+theme(aspect.ratio = 1)
gg
selected_mito=read_excel("/Users/MQ10004787/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/clean_up_folder/analysis/Figure 3c and d IHD-DM and IHD-No DM final pathways to plot_230224.xlsx",sheet = 1)
names=selected_mito$ID
ggplotdt=ihddm_dt2[ihddm_dt2$Description%in%names,]
ggplotdt$pvalue=as.numeric(ggplotdt$pvalue)
ggplotdtup=ggplotdt[ggplotdt$enrichmentScore>0,]
ggplotdtupsub=ggplotdtup[order(-abs(ggplotdtup$enrichmentScore)),]
ggplotdtdown=ggplotdt[ggplotdt$enrichmentScore<0,]
ggplotdtdownsub=ggplotdtdown[order(ggplotdtdown$enrichmentScore),]
ggplotdt=rbind.data.frame(ggplotdtupsub,ggplotdtdownsub)
gg=ggplot(ggplotdt,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=pvalue,size=pvalue))+coord_flip()+theme_bw()+ggtitle("ihd-dm vs donor pathway enrichment analysis")+
scale_colour_gradient(high = "blue", low = "red")+theme(text = element_text(size = 20))+guides(size = guide_legend(reverse=TRUE))+scale_size_continuous(trans = "reverse")
gg
#ggsave("figure3c.pdf",plot = gg,height = 10,width = 10)
gg=ggplot(ggplotdt,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=pvalue),size=5)+coord_flip()+theme_bw()+ggtitle("ihd-dm vs donor pathway enrichment analysis")+
scale_colour_gradient(high = "red", low = "blue")+theme(text = element_text(size = 20))+scale_color_continuous(limits=c(0,1),breaks=c(0.01,0.05,0.1,0.5,1))
gg
#ggsave("figure3cv1.pdf",plot = gg,height = 10,width = 10)
gg=ggplot(ggplotdt,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=pvalue),size=5)+coord_flip()+theme_bw()+ggtitle("ihd-dm vs donor pathway enrichment analysis")+
scale_colour_gradient(high = "red", low = "blue",limits=c(0,1),breaks=c(0.01,0.05,0.1,0.5,1))+theme(text = element_text(size = 20))
gg
selected_mito=read_excel("/Users/MQ10004787/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/clean_up_folder/analysis/Figure 3c and d IHD-DM and IHD-No DM final pathways to plot_230224.xlsx",sheet = 2)
names=selected_mito$ID
ggplotdt=ihdnodm_dt2[ihdnodm_dt2$Description%in%names,]
ggplotdt$pvalue=as.numeric(ggplotdt$pvalue)
ggplotdtup=ggplotdt[ggplotdt$enrichmentScore>0,]
ggplotdtupsub=ggplotdtup[order(-abs(ggplotdtup$enrichmentScore)),]
ggplotdtdown=ggplotdt[ggplotdt$enrichmentScore<0,]
ggplotdtdownsub=ggplotdtdown[order(ggplotdtdown$enrichmentScore),]
ggplotdt=rbind.data.frame(ggplotdtupsub,ggplotdtdownsub)
gg=ggplot(ggplotdt,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=pvalue,size=pvalue))+coord_flip()+theme_bw()+ggtitle("ihd-no dm vs donor pathway enrichment analysis")+
scale_colour_gradient(high = "blue", low = "red")+theme(text = element_text(size = 20))+guides(size = guide_legend(reverse=TRUE))+scale_size_continuous(trans = "reverse")
gg
#ggsave("figure3d.pdf",plot = gg,height = 7,width = 10)
gg=ggplot(ggplotdt,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=pvalue),size=5)+coord_flip()+theme_bw()+ggtitle("ihd-no dm vs donor pathway enrichment analysis")+
scale_colour_gradient(high = "red", low = "blue")+theme(text = element_text(size = 20))+scale_color_continuous(limits=c(0,1),breaks=c(0.01,0.05,0.1,0.5,1))
gg
names=read_excel("/Users/MQ10004787/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/clean_up_folder/analysis/Figure 3e and f GO nodes 27.2.23.xlsx",sheet = 1,col_names=TRUE)
names=as.data.frame(names)
ggplotdt=figure4a[figure4a$Description%in%as.vector(names[,1]),]
ggplotdt$pvalue=as.numeric(ggplotdt$pvalue)
ggplotdtup=ggplotdt[ggplotdt$enrichmentScore>0,]
ggplotdtupsub=ggplotdtup[order(-abs(ggplotdtup$enrichmentScore)),]
ggplotdtdown=ggplotdt[ggplotdt$enrichmentScore<0,]
ggplotdtdownsub=ggplotdtdown[order(ggplotdtdown$enrichmentScore),]
ggplotdt=rbind.data.frame(ggplotdtupsub,ggplotdtdownsub)
gg=ggplot(ggplotdt,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=pvalue),size=5)+coord_flip()+theme_bw()+ggtitle("ihd-dm vs donor pathway enrichment analysis")+
scale_colour_gradient(high = "red", low = "blue")+theme(text = element_text(size = 20))+scale_color_continuous(limits=c(0,1),breaks=c(0.01,0.05,0.1,0.5,1))
gg
#ggsave("figure4av1.pdf",plot = gg,height = 7,width = 10)
ggplot(ggplotdt,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=pvalue,size=pvalue))+coord_flip()+theme_bw()+ggtitle("ihd-dm vs donor pathway enrichment analysis")+
scale_colour_gradient(high = "red", low = "blue")+theme(text = element_text(size = 20))+scale_color_continuous(limits=c(0,1),breaks=c(0.01,0.05,0.1,0.5,1))+guides(size = guide_legend(reverse=TRUE))+scale_size_continuous(range=c(1,10),breaks = c(0.01,0.05,0.1,0.5,1),limits = c(0,1))
ggplot(ggplotdt,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=pvalue,size=pvalue))+coord_flip()+theme_bw()+ggtitle("ihd-dm vs donor pathway enrichment analysis")+
scale_colour_gradient(high = "red", low = "blue")+theme(text = element_text(size = 20))+scale_color_continuous(limits=c(0,1),breaks=c(0.01,0.05,0.1,0.5,1))+scale_size_continuous(range=c(1,10),breaks = c(0.01,0.05,0.1,0.5,1),limits = c(0,1))
ggplot(ggplotdt,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=pvalue,size=pvalue))+coord_flip()+theme_bw()+ggtitle("ihd-dm vs donor pathway enrichment analysis")+
scale_colour_gradient(high = "red", low = "blue")+scale_color_continuous(limits=c(0,1),breaks=c(0.01,0.05,0.1,0.5,1))+guides(size = guide_legend(reverse=TRUE))+scale_size_continuous(range=c(1,6),breaks = c(0,0.01,0.05,0.1,0.5,1,2),trans = "reverse")+theme(axis.text.x = element_text(angle = 60, hjust = 1),panel.background =element_rect(fill = "white"),panel.grid.major = element_line(size = 0.1, linetype = 'solid',
colour = "black"),
panel.grid.minor = element_line(size = 0.1, linetype = 'solid',
colour = "white"),legend.key.size = unit(1, 'cm'), #change legend key size
legend.key.height = unit(0.5, 'cm'), #change legend key height
legend.key.width = unit(1, 'cm'), #change legend key width
legend.title = element_text(size=6), #change legend title font size
legend.text = element_text(size=5),text = element_text(size=15))
gg=ggplot(ggplotdt,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=pvalue,size=pvalue))+coord_flip()+theme_bw()+ggtitle("ihd-dm vs donor pathway enrichment analysis")+
scale_colour_gradient(high = "red", low = "blue")+theme(text = element_text(size = 20))+scale_color_continuous(limits=c(0,1),breaks=c(0.01,0.05,0.1,0.5,1))+scale_size_continuous(breaks = c(0.01,0.05,0.1,0.5,1),limits = c(1,0),trans = "reverse")
gg
#ggsave("figure4av2.pdf",plot = gg,height = 7,width = 10)
gg=ggplot(ggplotdt,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=pvalue,size=pvalue))+coord_flip()+theme_bw()+ggtitle("ihd-dm vs donor pathway enrichment analysis")+
scale_colour_gradient(high = "blue", low = "red")+theme(text = element_text(size = 20))+scale_size_continuous(trans = "reverse")+guides(size = guide_legend(reverse=TRUE))
gg
names=read_excel("/Users/MQ10004787/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/clean_up_folder/analysis/Figure 3e and f GO nodes 27.2.23.xlsx",sheet = 2,col_names=TRUE)
names=as.data.frame(names)
ggplotdt=figure4b[figure4b$Description%in%as.vector(names[,1]),]
ggplotdt$pvalue=as.numeric(ggplotdt$pvalue)
ggplotdtup=ggplotdt[ggplotdt$enrichmentScore>0,]
ggplotdtupsub=ggplotdtup[order(-abs(ggplotdtup$enrichmentScore)),]
ggplotdtdown=ggplotdt[ggplotdt$enrichmentScore<0,]
ggplotdtdownsub=ggplotdtdown[order(ggplotdtdown$enrichmentScore),]
ggplotdt=rbind.data.frame(ggplotdtupsub,ggplotdtdownsub)
gg=ggplot(ggplotdt,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=pvalue,size=pvalue))+coord_flip()+theme_bw()+ggtitle("ihd-no dm vs donor pathway enrichment analysis")+
scale_colour_gradient(high = "blue", low = "red")+theme(text = element_text(size = 20))+scale_size_continuous(trans = "reverse")+guides(size = guide_legend(reverse=TRUE))
gg
#ggsave("figure3f.pdf",plot = gg,height = 7,width = 13)
gg=ggplot(ggplotdt,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=pvalue),size=5)+coord_flip()+theme_bw()+ggtitle("ihd-no dm vs donor pathway enrichment analysis")+
scale_colour_gradient(high = "red", low = "blue")+theme(text = element_text(size = 20))+scale_color_continuous(limits=c(0,1),breaks=c(0.01,0.05,0.1,0.5,1))
gg
## R version 4.3.1 (2023-06-16)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.4.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Europe/Berlin
## tzcode source: internal
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] ggrepel_0.9.3 GO.db_3.17.0
## [3] enrichplot_1.20.0 clusterProfiler_4.8.1
## [5] gageData_2.38.0 MatchIt_4.5.4
## [7] UpSetR_1.4.0 plotly_4.10.2
## [9] pheatmap_1.0.12 gtable_0.3.3
## [11] plyr_1.8.8 reshape_0.8.9
## [13] org.Hs.eg.db_3.17.0 AnnotationDbi_1.62.1
## [15] MASS_7.3-60 KEGGREST_1.40.0
## [17] janitor_2.2.0 knitr_1.43
## [19] gage_2.50.0 magrittr_2.0.3
## [21] R.utils_2.12.2 R.oo_1.25.0
## [23] R.methodsS3_1.8.2 ggpubr_0.6.0
## [25] bnstruct_1.0.14 igraph_1.5.0
## [27] bitops_1.0-7 e1071_1.7-13
## [29] randomForest_4.7-1.1 DT_0.28
## [31] MultiAssayExperiment_1.26.0 SummarizedExperiment_1.30.2
## [33] Biobase_2.60.0 GenomicRanges_1.52.0
## [35] GenomeInfoDb_1.36.1 IRanges_2.34.0
## [37] S4Vectors_0.38.1 BiocGenerics_0.46.0
## [39] MatrixGenerics_1.12.2 matrixStats_1.0.0
## [41] readxl_1.4.2 lubridate_1.9.2
## [43] forcats_1.0.0 stringr_1.5.0
## [45] purrr_1.0.1 readr_2.1.4
## [47] tidyr_1.3.0 tibble_3.2.1
## [49] tidyverse_2.0.0 dplyr_1.1.2
## [51] limma_3.56.2 ggplot2_3.4.2
## [53] reshape2_1.4.4
##
## loaded via a namespace (and not attached):
## [1] splines_4.3.1 ggplotify_0.1.0 cellranger_1.1.0
## [4] polyclip_1.10-4 graph_1.78.0 lifecycle_1.0.3
## [7] rstatix_0.7.2 lattice_0.21-8 crosstalk_1.2.0
## [10] backports_1.4.1 sass_0.4.6 rmarkdown_2.22
## [13] jquerylib_0.1.4 yaml_2.3.7 cowplot_1.1.1
## [16] DBI_1.1.3 RColorBrewer_1.1-3 abind_1.4-5
## [19] zlibbioc_1.46.0 ggraph_2.1.0 RCurl_1.98-1.12
## [22] yulab.utils_0.0.6 tweenr_2.0.2 GenomeInfoDbData_1.2.10
## [25] tidytree_0.4.2 codetools_0.2-19 DelayedArray_0.26.3
## [28] DOSE_3.26.1 ggforce_0.4.1 tidyselect_1.2.0
## [31] aplot_0.1.10 farver_2.1.1 viridis_0.6.3
## [34] jsonlite_1.8.5 ellipsis_0.3.2 tidygraph_1.2.3
## [37] ggnewscale_0.4.9 tools_4.3.1 treeio_1.24.1
## [40] Rcpp_1.0.10 glue_1.6.2 gridExtra_2.3
## [43] xfun_0.39 qvalue_2.32.0 withr_2.5.0
## [46] fastmap_1.1.1 fansi_1.0.4 digest_0.6.31
## [49] timechange_0.2.0 R6_2.5.1 gridGraphics_0.5-1
## [52] colorspace_2.1-0 RSQLite_2.3.1 utf8_1.2.3
## [55] generics_0.1.3 data.table_1.14.8 class_7.3-22
## [58] graphlayouts_1.0.0 httr_1.4.6 htmlwidgets_1.6.2
## [61] S4Arrays_1.0.4 scatterpie_0.2.1 pkgconfig_2.0.3
## [64] blob_1.2.4 XVector_0.40.0 shadowtext_0.1.2
## [67] htmltools_0.5.5 carData_3.0-5 fgsea_1.26.0
## [70] scales_1.2.1 png_0.1-8 snakecase_0.11.0
## [73] ggfun_0.1.1 rstudioapi_0.14 tzdb_0.4.0
## [76] nlme_3.1-162 curl_5.0.1 proxy_0.4-27
## [79] cachem_1.0.8 parallel_4.3.1 HDO.db_0.99.1
## [82] pillar_1.9.0 vctrs_0.6.3 car_3.1-2
## [85] evaluate_0.21 cli_3.6.1 compiler_4.3.1
## [88] rlang_1.1.1 crayon_1.5.2 ggsignif_0.6.4
## [91] labeling_0.4.2 stringi_1.7.12 viridisLite_0.4.2
## [94] BiocParallel_1.34.2 munsell_0.5.0 Biostrings_2.68.1
## [97] lazyeval_0.2.2 GOSemSim_2.26.0 Matrix_1.5-4.1
## [100] hms_1.1.3 patchwork_1.1.2 bit64_4.0.5
## [103] highr_0.10 broom_1.0.5 memoise_2.0.1
## [106] bslib_0.5.0 ggtree_3.8.0 fastmatch_1.1-3
## [109] bit_4.0.5 downloader_0.4 gson_0.1.0
## [112] ape_5.7-1