• this file is created 0916, to get supp figure s5 which is the same as figure3 but use DCM instead, the donors are those >=21 donors.
  • this file is updated 1006 with Ben’s list of pathways to show.
#library(GEOquery)  ## go to https://github.com/seandavi/GEOquery for installation details
#library(R.utils)
library(reshape2)
library(ggplot2)
library(limma)
library(dplyr)
library(tidyverse)
#library(DMwR)
library(readxl)
library(MultiAssayExperiment)
library(S4Vectors)
library(SummarizedExperiment)
library(DT)
library(randomForest)
library(e1071)
library("bnstruct")
library(rminer)
library(ggpubr)
setwd("/Users/yzha0247/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/clean_up_folder/analysis")
# install.packages("VANData_1.0.0.tgz", repos=NULL,type="source")
# install.packages("VAN_1.0.0.tgz", repos=NULL,type="source") 

library(R.utils)
library(reshape2)
library(ggplot2)
library(limma)
library(dplyr)
library(tidyverse)
library(DMwR)
library(readxl)
library(magrittr)
library(tidyverse)
library(gage)
library(knitr)
library(DT)
library(janitor)
library(KEGGREST)
library(MASS)
library("AnnotationDbi")
library("org.Hs.eg.db")
library(dplyr)
library(reshape)
library(ggplot2)
library(plyr)
library(gtable)
library(grid)
library(pheatmap)
library(reshape2)
library(plotly)
library(UpSetR)
library(MatchIt)
require(pathview)
library(STRINGdb)
## Kegg sets data
require(gage)
require(gageData)
kegg = kegg.gsets(species = "hsa", id.type = "kegg")
kegg.sets.hsa = kegg$kg.sets

# go.hs=go.gsets(species="human")
# go.bp=go.hs$go.sets[go.hs$go.subs$BP]
# go.mf=go.hs$go.sets[go.hs$go.subs$MF]
# go.bpmf=go.hs$go.sets[go.hs$go.subs$BP&go.hs$go.subs$MF] #not equal, wired
# go.bpmf2=append(go.bp,go.mf)
require(gage)
require(gageData)
library(clusterProfiler)
library(enrichplot)
library(GO.db)
# #failed install
# if (!require("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# 
# BiocManager::install("GOfuncR")
# 
# if (!requireNamespace("devtools", quietly=TRUE))
#     install.packages("devtools")
# devtools::install_github("sgrote/GOfuncR")

1 DCM-DM VS Donor

1.1 DE analysis and standard pipeline

## [1] 9037  111
## [1] 9037  111
## [1] 4094  111
## [1] 29 14
##  [1] "31_LV_DCM_Yes_22_F"   "32_LV_DCM_Yes_31_F"   "33_LV_DCM_Yes_53_M"  
##  [4] "34_LV_DCM_Yes_56_M"   "35_LV_DCM_Yes_56_M"   "36_LV_DCM_Yes_58_M"  
##  [7] "37_LV_DCM_Yes_59_M"   "38_LV_DCM_Yes_61_F"   "39_LV_DCM_Yes_63_F"  
## [10] "90_LV_Donor_No_21_M"  "91_LV_Donor_No_23_F"  "92_LV_Donor_No_23_M" 
## [13] "93_LV_Donor_No_25_F"  "94_LV_Donor_No_47_M"  "95_LV_Donor_No_47_F" 
## [16] "96_LV_Donor_No_48_M"  "97_LV_Donor_No_49_F"  "98_LV_Donor_No_51_F" 
## [19] "99_LV_Donor_No_53_M"  "100_LV_Donor_No_53_F" "101_LV_Donor_No_54_M"
## [22] "102_LV_Donor_No_54_F" "104_LV_Donor_No_55_F" "105_LV_Donor_No_56_M"
## [25] "106_LV_Donor_No_60_M" "107_LV_Donor_No_62_M" "108_LV_Donor_No_62_F"
## [28] "109_LV_Donor_No_65_M" "110_LV_Donor_No_65_F"
## [1] 9037   29
## [1] 9037   29
## [1] 4051   29
##  [1] "31_LV_DCM_Yes_22_F"   "33_LV_DCM_Yes_53_M"   "37_LV_DCM_Yes_59_M"  
##  [4] "32_LV_DCM_Yes_31_F"   "35_LV_DCM_Yes_56_M"   "39_LV_DCM_Yes_63_F"  
##  [7] "34_LV_DCM_Yes_56_M"   "36_LV_DCM_Yes_58_M"   "38_LV_DCM_Yes_61_F"  
## [10] "110_LV_Donor_No_65_F" "101_LV_Donor_No_54_M" "91_LV_Donor_No_23_F" 
## [13] "94_LV_Donor_No_47_M"  "98_LV_Donor_No_51_F"  "105_LV_Donor_No_56_M"
## [16] "90_LV_Donor_No_21_M"  "100_LV_Donor_No_53_F" "109_LV_Donor_No_65_M"
## [19] "99_LV_Donor_No_53_M"  "93_LV_Donor_No_25_F"  "97_LV_Donor_No_49_F" 
## [22] "92_LV_Donor_No_23_M"  "108_LV_Donor_No_62_F" "102_LV_Donor_No_54_F"
## [25] "96_LV_Donor_No_48_M"  "106_LV_Donor_No_60_M" "95_LV_Donor_No_47_F" 
## [28] "107_LV_Donor_No_62_M" "104_LV_Donor_No_55_F"
## [1] 2.175502
## [1] 4051   10
## [1] 4037   10
## [1] 2048   29

# Let is suppose I have a collection of genesets called : HALLMARK Now let is suppose there is a specific geneset there called: E2F_targets
# 
# BgRatio, M/N.
# 
# M = size of the geneset (eg size of the E2F_targets); (is the number of genes within that distribution that are annotated (either directly or indirectly) to the node of interest).
# 
# N = size of all of the unique genes in the collection of genesets (example the HALLMARK collection); (is the total number of genes in the background distribution (universe)
# 
# GeneRatio is k/n.
# 
# k = size of the overlap of 'a vector of gene id' you input with the specific geneset (eg E2F_targets), only unique genes; (the number of genes within that list n, which are annotated to the node.
# 
# n = size of the overlap of 'a vector of gene id' you input with all the members of the collection of genesets (eg the HALLMARK collection),only unique genes; is the size of the list of genes of interest

#adj pval and qval
#https://support.bioconductor.org/p/96329/
#adj pval is the fdr adjusted pval, qval from another package is another adjustment method
#the BH approach is slightly more conservative than qvalue

#enrichgo:
# Over Representation Analysis (ORA) (Boyle et al. 2004) is a widely used approach to determine whether known biological functions or processes are over-represented (= enriched) in an experimentally-derived gene list, e.g. a list of differentially expressed genes (DEGs).
# 
# The p-value can be calculated by hypergeometric distribution.

#Leading edge analysis reports Tags to indicate the percentage of genes contributing to the enrichment score, List to indicate where in the list the enrichment score is attained and Signal for enrichment signal strength.

#It would also be very interesting to get the core enriched genes that contribute to the enrichment. Our packages (clusterProfiler, DOSE, meshes and ReactomePA) support leading edge analysis and report core enriched genes in GSEA analysis.

#get pathway maps
result_path <- enrichGO(gene         = names(geneList),
                OrgDb         = org.Hs.eg.db,
                keyType       = 'ENTREZID',
                ont           = "BP",
                pAdjustMethod = "BH",
                pvalueCutoff  =1,
                qvalueCutoff  =1)
result_path_dt=as.data.frame(result_path)
result_path_dt2=result_path_dt[,c("Description","geneID")]

result_path_dt3=data.frame(matrix(ncol=2))
for(i in 1:nrow(result_path_dt2)){
X2=unlist(strsplit(result_path_dt2$geneID[i],"/"))
X1=rep(result_path_dt2$Description[i],length(X2))
dt=cbind(X1,X2)
result_path_dt3=rbind(result_path_dt3,dt)
}

1.2 check list for ecm & scm gene groups

##         GOID                                TERM ONTOLOGY
## 1 GO:0030198   extracellular matrix organization       BP
## 2 GO:0055007 cardiac muscle cell differentiation       BP
## 3 GO:0030239                  myofibril assembly       BP
## 4 GO:0060047                   heart contraction       BP
## 5 GO:0007010           cytoskeleton organization       BP
## 6 GO:0031012                extracellular matrix       CC
## 7 GO:0043292                   contractile fiber       CC
## 8 GO:0005856                        cytoskeleton       CC
## [1] 810
## [1] 1238
## [1] 2048

1.3 continue for DE analysis and standard pipeline

## [1] "The total number of kegg pathways is 318"
##       from    to
## 1 hsa00010 10327
## 2 hsa00010   124
## 3 hsa00010   125
## 4 hsa00010   126
## 5 hsa00010   127
## 6 hsa00010   128
##       from                                       to
## 1 hsa00010             Glycolysis / Gluconeogenesis
## 2 hsa00020                Citrate cycle (TCA cycle)
## 3 hsa00030                Pentose phosphate pathway
## 4 hsa00040 Pentose and glucuronate interconversions
## 5 hsa00051          Fructose and mannose metabolism
## 6 hsa00052                     Galactose metabolism
## [1] 347
## [1] 558   2
## [1] 347

1.4 Part(1)

1.4.1 Part(1) GO+mito

  • combine the pathways together to do the gsea analysis
  • table with pathway enrichment results
library(clusterProfiler)
library(enrichplot)
performGSEA_mito <- function(gene_list,gene.pathway,pval){
  # gse <- GSEA(gene_list, 
  #           TERM2GENE = gene.pathway,
  #           pAdjustMethod = "none", 
  #           scoreType = "pos",
  #           nPermSimple = 10000
  #           )
  gse <- GSEA(gene_list, TERM2GENE = gene.pathway,nPerm = 1000, minGSSize = 5, maxGSSize = 500, pAdjustMethod = "BH",pvalueCutoff = pval)
  GSEresults.rounded <- gse@result[,1:9]
  GSEresults.rounded[,4:8] <- signif(GSEresults.rounded[,4:8],4)
  GSEresults.rounded[,6:7] <- sapply(GSEresults.rounded[,6:7], format, scientific = TRUE)
  table <- DT::datatable(GSEresults.rounded)
  dot <- clusterProfiler::dotplot(gse, showCategory=10, split=".sign") + facet_grid(.~.sign)
  gse <- enrichplot::pairwise_termsim(gse)
  enrich <- emapplot(gse, showCategory = 10)
  tree <- treeplot(gse) #https://github.com/YuLab-SMU/ggtree/issues/399 #might be a dplyr issue
  return(list(table = table, dot = dot, enrich = enrich, tree = tree,table2=GSEresults.rounded))
}

###
mito_genes <- read_excel("Human.MitoCarta3.0.xls", sheet = 4)
mito_genes <- mito_genes %>% dplyr::select(MitoPathway,Genes) %>% na.omit
gene_sets <- mito_genes$Genes %>% strsplit(", ")
gene.pathway <- data.frame(geneIDs = character(), pathwayIDs = integer())
for(i in 1:length(gene_sets)){
  new_data <- data.frame(geneIDs = gene_sets[[i]], pathwayIDs = i)
  gene.pathway <- rbind(gene.pathway,new_data)
}

gene.pathway$geneIDs <- mapIds(org.Hs.eg.db, keys = gene.pathway$geneIDs, keytype = "SYMBOL", column = "ENTREZID")
pathway.names <- data.frame(pathwayIDs = 1:length(gene_sets), pathwayNames = mito_genes$MitoPathway)
####

gene_list = sort(fc_new, decreasing = TRUE)
names(gene_list)=as.character(names(gene_list))

gene.pathway2=gene.pathway
gene.pathway2$pathwayIDs=pathway.names$pathwayNames[match(gene.pathway2$pathwayIDs,pathway.names$pathwayIDs)]
gene.pathway2=gene.pathway2[,c(2,1)]#need to put term gene in order

##Ben's mito to delete unused previous mito pathways
ben_mito=read_excel("/Users/yzha0247/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/Yunwei202110/IHD-DM paper figure 3a mitochondrial pathway list.xlsx")
ben_mito$Description%in%gene.pathway2$pathwayIDs
deletion_index=which(!gene.pathway2$pathwayIDs%in%ben_mito$Description)
dim(gene.pathway2)
gene.pathway2=gene.pathway2[-deletion_index,]
dim(gene.pathway2)
###

colnames(result_path_dt3)=colnames(gene.pathway2)
colnames(MFresult_path_dt3)=colnames(gene.pathway2)
gene.pathway3=rbind.data.frame(gene.pathway2,result_path_dt3,MFresult_path_dt3)
set.seed(202205)
gseaResults_mito <-performGSEA_mito(gene_list,gene.pathway3,0.1)
gseaResults_mito$table
#gseaResults_mito$dot
#gseaResults_mito$enrich
#gseaResults_mito$tree

#Normalized enrichment scores (NES)
#https://www.biostars.org/p/169648/
  • pathway plot
  • Part(2) for plotting all the enriched mito pathways

1.4.2 Part(1) KEGG+mito

  • combine the pathways together to do the gsea analysis
  • full pathway table
library(clusterProfiler)
library(enrichplot)
performGSEA_mito <- function(gene_list,gene.pathway,pval){
  # gse <- GSEA(gene_list, 
  #           TERM2GENE = gene.pathway,
  #           pAdjustMethod = "none", 
  #           scoreType = "pos",
  #           nPermSimple = 10000
  #           )
  gse <- GSEA(gene_list, TERM2GENE = gene.pathway,nPerm = 1000, minGSSize = 5, maxGSSize = 500, pAdjustMethod = "BH",pvalueCutoff = pval)
  GSEresults.rounded <- gse@result[,1:9]
  GSEresults.rounded[,4:8] <- signif(GSEresults.rounded[,4:8],4)
  GSEresults.rounded[,6:7] <- sapply(GSEresults.rounded[,6:7], format, scientific = TRUE)
  table <- DT::datatable(GSEresults.rounded)
  dot <- clusterProfiler::dotplot(gse, showCategory=10, split=".sign") + facet_grid(.~.sign)
  gse <- enrichplot::pairwise_termsim(gse)
  enrich <- emapplot(gse, showCategory = 10)
  tree <- treeplot(gse) #https://github.com/YuLab-SMU/ggtree/issues/399 #might be a dplyr issue
  return(list(table = table, dot = dot, enrich = enrich, tree = tree,table2=GSEresults.rounded))
}

###
mito_genes <- read_excel("Human.MitoCarta3.0.xls", sheet = 4)
mito_genes <- mito_genes %>% dplyr::select(MitoPathway,Genes) %>% na.omit
gene_sets <- mito_genes$Genes %>% strsplit(", ")
gene.pathway <- data.frame(geneIDs = character(), pathwayIDs = integer())
for(i in 1:length(gene_sets)){
  new_data <- data.frame(geneIDs = gene_sets[[i]], pathwayIDs = i)
  gene.pathway <- rbind(gene.pathway,new_data)
}

gene.pathway$geneIDs <- mapIds(org.Hs.eg.db, keys = gene.pathway$geneIDs, keytype = "SYMBOL", column = "ENTREZID")
pathway.names <- data.frame(pathwayIDs = 1:length(gene_sets), pathwayNames = mito_genes$MitoPathway)
####

gene_list = sort(fc_new, decreasing = TRUE)
names(gene_list)=as.character(names(gene_list))

gene.pathway2=gene.pathway
gene.pathway2$pathwayIDs=pathway.names$pathwayNames[match(gene.pathway2$pathwayIDs,pathway.names$pathwayIDs)]
gene.pathway2=gene.pathway2[,c(2,1)]#need to put term gene in order

##Ben's mito to delete unused previous mito pathways
ben_mito=read_excel("/Users/yzha0247/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/Yunwei202110/IHD-DM paper figure 3a mitochondrial pathway list.xlsx")
ben_mito$Description%in%gene.pathway2$pathwayIDs
deletion_index=which(!gene.pathway2$pathwayIDs%in%ben_mito$Description)
dim(gene.pathway2)
gene.pathway2=gene.pathway2[-deletion_index,]
dim(gene.pathway2)
###

#KEGGresult_path_dt3 is the old one
colnames(kegg_full_dt)=colnames(gene.pathway2)
gene.pathway3=rbind.data.frame(gene.pathway2,kegg_full_dt)
set.seed(202205)
gseaResults_mito <-performGSEA_mito(gene_list,gene.pathway3,0.05)
gseaResults_mito$table
#gseaResults_mito$dot
#gseaResults_mito$enrich
#gseaResults_mito$tree

#Normalized enrichment scores (NES)
#https://www.biostars.org/p/169648/
  • pathway enrichment plot
  • Part(2) for plotting all enriched mito pathways

1.4.3 KEGG+mito relax threshold for the figure

library(clusterProfiler)
library(enrichplot)
performGSEA_mito <- function(gene_list,gene.pathway,pval){
  # gse <- GSEA(gene_list, 
  #           TERM2GENE = gene.pathway,
  #           pAdjustMethod = "none", 
  #           scoreType = "pos",
  #           nPermSimple = 10000
  #           )
  gse <- GSEA(gene_list, TERM2GENE = gene.pathway,nPerm = 1000, minGSSize = 3, maxGSSize = 500, pAdjustMethod = "none",pvalueCutoff = pval)
  GSEresults.rounded <- gse@result[,1:9]
  GSEresults.rounded[,4:8] <- signif(GSEresults.rounded[,4:8],4)
  GSEresults.rounded[,6:7] <- sapply(GSEresults.rounded[,6:7], format, scientific = TRUE)
  table <- DT::datatable(GSEresults.rounded)
  dot <- clusterProfiler::dotplot(gse, showCategory=10, split=".sign") + facet_grid(.~.sign)
  gse <- enrichplot::pairwise_termsim(gse)
  enrich <- emapplot(gse, showCategory = 10)
  tree <- treeplot(gse) #https://github.com/YuLab-SMU/ggtree/issues/399 #might be a dplyr issue
  return(list(table = table, dot = dot, enrich = enrich, tree = tree,table2=GSEresults.rounded))
}

###
mito_genes <- read_excel("Human.MitoCarta3.0.xls", sheet = 4)
mito_genes <- mito_genes %>% dplyr::select(MitoPathway,Genes) %>% na.omit
gene_sets <- mito_genes$Genes %>% strsplit(", ")
gene.pathway <- data.frame(geneIDs = character(), pathwayIDs = integer())
for(i in 1:length(gene_sets)){
  new_data <- data.frame(geneIDs = gene_sets[[i]], pathwayIDs = i)
  gene.pathway <- rbind(gene.pathway,new_data)
}

gene.pathway$geneIDs <- mapIds(org.Hs.eg.db, keys = gene.pathway$geneIDs, keytype = "SYMBOL", column = "ENTREZID")
pathway.names <- data.frame(pathwayIDs = 1:length(gene_sets), pathwayNames = mito_genes$MitoPathway)
####

gene_list = sort(fc_new, decreasing = TRUE)
names(gene_list)=as.character(names(gene_list))

gene.pathway2=gene.pathway
gene.pathway2$pathwayIDs=pathway.names$pathwayNames[match(gene.pathway2$pathwayIDs,pathway.names$pathwayIDs)]
gene.pathway2=gene.pathway2[,c(2,1)]#need to put term gene in order

##Ben's mito to delete unused previous mito pathways
ben_mito=read_excel("/Users/yzha0247/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/clean_up_folder/analysis/IHD-noDM vs donor KEGG mito pathways 27.6.22.xlsx")
ben_mito$Description%in%gene.pathway2$pathwayIDs
##  [1]  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE
## [13]  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE
## [25]  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE
## [37]  TRUE FALSE FALSE  TRUE FALSE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE
## [49] FALSE  TRUE FALSE  TRUE  TRUE FALSE  TRUE FALSE FALSE  TRUE FALSE  TRUE
## [61]  TRUE FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE FALSE  TRUE FALSE
## [73] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [85]  TRUE FALSE FALSE FALSE FALSE FALSE
## [1] 3904    2
## [1] 2416    2

2 DCM-NO DM VS Donor

2.1 DE analysis and standard pipeline

## [1] 9037  111
## [1] 9037  111
## [1] 4094  111
## [1] 38 14
##  [1] "40_LV_DCM_No_43_F"    "41_LV_DCM_No_43_M"    "42_LV_DCM_No_45_M"   
##  [4] "43_LV_DCM_No_49_M"    "44_LV_DCM_No_50_F"    "45_LV_DCM_No_51_M"   
##  [7] "46_LV_DCM_No_53_F"    "47_LV_DCM_No_53_M"    "48_LV_DCM_No_54_M"   
## [10] "49_LV_DCM_No_54_M"    "50_LV_DCM_No_55_M"    "51_LV_DCM_No_56_M"   
## [13] "52_LV_DCM_No_58_F"    "53_LV_DCM_No_58_M"    "54_LV_DCM_No_60_F"   
## [16] "55_LV_DCM_No_62_M"    "56_LV_DCM_No_52_M"    "90_LV_Donor_No_21_M" 
## [19] "91_LV_Donor_No_23_F"  "92_LV_Donor_No_23_M"  "93_LV_Donor_No_25_F" 
## [22] "94_LV_Donor_No_47_M"  "95_LV_Donor_No_47_F"  "96_LV_Donor_No_48_M" 
## [25] "97_LV_Donor_No_49_F"  "98_LV_Donor_No_51_F"  "99_LV_Donor_No_53_M" 
## [28] "100_LV_Donor_No_53_F" "101_LV_Donor_No_54_M" "102_LV_Donor_No_54_F"
## [31] "103_LV_DCM_No_44_M"   "104_LV_Donor_No_55_F" "105_LV_Donor_No_56_M"
## [34] "106_LV_Donor_No_60_M" "107_LV_Donor_No_62_M" "108_LV_Donor_No_62_F"
## [37] "109_LV_Donor_No_65_M" "110_LV_Donor_No_65_F"
## [1] 9037   38
## [1] 9037   38
## [1] 4075   38
##  [1] "50_LV_DCM_No_55_M"    "51_LV_DCM_No_56_M"    "46_LV_DCM_No_53_F"   
##  [4] "49_LV_DCM_No_54_M"    "55_LV_DCM_No_62_M"    "103_LV_DCM_No_44_M"  
##  [7] "44_LV_DCM_No_50_F"    "40_LV_DCM_No_43_F"    "41_LV_DCM_No_43_M"   
## [10] "54_LV_DCM_No_60_F"    "52_LV_DCM_No_58_F"    "48_LV_DCM_No_54_M"   
## [13] "45_LV_DCM_No_51_M"    "47_LV_DCM_No_53_M"    "53_LV_DCM_No_58_M"   
## [16] "56_LV_DCM_No_52_M"    "43_LV_DCM_No_49_M"    "42_LV_DCM_No_45_M"   
## [19] "110_LV_Donor_No_65_F" "101_LV_Donor_No_54_M" "91_LV_Donor_No_23_F" 
## [22] "94_LV_Donor_No_47_M"  "98_LV_Donor_No_51_F"  "105_LV_Donor_No_56_M"
## [25] "90_LV_Donor_No_21_M"  "100_LV_Donor_No_53_F" "109_LV_Donor_No_65_M"
## [28] "99_LV_Donor_No_53_M"  "93_LV_Donor_No_25_F"  "97_LV_Donor_No_49_F" 
## [31] "92_LV_Donor_No_23_M"  "108_LV_Donor_No_62_F" "102_LV_Donor_No_54_F"
## [34] "96_LV_Donor_No_48_M"  "106_LV_Donor_No_60_M" "95_LV_Donor_No_47_F" 
## [37] "107_LV_Donor_No_62_M" "104_LV_Donor_No_55_F"
## [1] 1.97475
## [1] 4075   10
## [1] 4072   10
## [1] 1090   38

# Let is suppose I have a collection of genesets called : HALLMARK Now let is suppose there is a specific geneset there called: E2F_targets
# 
# BgRatio, M/N.
# 
# M = size of the geneset (eg size of the E2F_targets); (is the number of genes within that distribution that are annotated (either directly or indirectly) to the node of interest).
# 
# N = size of all of the unique genes in the collection of genesets (example the HALLMARK collection); (is the total number of genes in the background distribution (universe)
# 
# GeneRatio is k/n.
# 
# k = size of the overlap of 'a vector of gene id' you input with the specific geneset (eg E2F_targets), only unique genes; (the number of genes within that list n, which are annotated to the node.
# 
# n = size of the overlap of 'a vector of gene id' you input with all the members of the collection of genesets (eg the HALLMARK collection),only unique genes; is the size of the list of genes of interest

#adj pval and qval
#https://support.bioconductor.org/p/96329/
#adj pval is the fdr adjusted pval, qval from another package is another adjustment method
#the BH approach is slightly more conservative than qvalue


#get pathway maps
result_path <- enrichGO(gene         = names(geneList),
                OrgDb         = org.Hs.eg.db,
                keyType       = 'ENTREZID',
                ont           = "BP",
                pAdjustMethod = "BH",
                pvalueCutoff  =1,
                qvalueCutoff  =1)
result_path_dt=as.data.frame(result_path)
result_path_dt2=result_path_dt[,c("Description","geneID")]

result_path_dt3=data.frame(matrix(ncol=2))
for(i in 1:nrow(result_path_dt2)){
X2=unlist(strsplit(result_path_dt2$geneID[i],"/"))
X1=rep(result_path_dt2$Description[i],length(X2))
dt=cbind(X1,X2)
result_path_dt3=rbind(result_path_dt3,dt)
}

2.2 check list for ecm & scm gene groups

##         GOID                                TERM ONTOLOGY
## 1 GO:0030198   extracellular matrix organization       BP
## 2 GO:0055007 cardiac muscle cell differentiation       BP
## 3 GO:0030239                  myofibril assembly       BP
## 4 GO:0060047                   heart contraction       BP
## 5 GO:0007010           cytoskeleton organization       BP
## 6 GO:0031012                extracellular matrix       CC
## 7 GO:0043292                   contractile fiber       CC
## 8 GO:0005856                        cytoskeleton       CC
## [1] 401
## [1] 689
## [1] 1090

2.3 continue for DE analysis and standard pipeline

2.4 Part(1)

2.4.1 Part(1) GO+mito

  • combine the pathways together to do the gsea analysis
  • full pathway table
library(clusterProfiler)
library(enrichplot)
performGSEA_mito <- function(gene_list,gene.pathway,pval){
  # gse <- GSEA(gene_list, 
  #           TERM2GENE = gene.pathway,
  #           pAdjustMethod = "none", 
  #           scoreType = "pos",
  #           nPermSimple = 10000
  #           )
  gse <- GSEA(gene_list, TERM2GENE = gene.pathway,nPerm = 1000, minGSSize = 5, maxGSSize = 500, pAdjustMethod = "BH",pvalueCutoff = pval)
  GSEresults.rounded <- gse@result[,1:9]
  GSEresults.rounded[,4:8] <- signif(GSEresults.rounded[,4:8],4)
  GSEresults.rounded[,6:7] <- sapply(GSEresults.rounded[,6:7], format, scientific = TRUE)
  table <- DT::datatable(GSEresults.rounded)
  dot <- clusterProfiler::dotplot(gse, showCategory=10, split=".sign") + facet_grid(.~.sign)
  gse <- enrichplot::pairwise_termsim(gse)
  enrich <- emapplot(gse, showCategory = 10)
  tree <- treeplot(gse) #https://github.com/YuLab-SMU/ggtree/issues/399 #might be a dplyr issue
  return(list(table = table, dot = dot, enrich = enrich, tree = tree,table2=GSEresults.rounded))
}

###
mito_genes <- read_excel("Human.MitoCarta3.0.xls", sheet = 4)
mito_genes <- mito_genes %>% dplyr::select(MitoPathway,Genes) %>% na.omit
gene_sets <- mito_genes$Genes %>% strsplit(", ")
gene.pathway <- data.frame(geneIDs = character(), pathwayIDs = integer())
for(i in 1:length(gene_sets)){
  new_data <- data.frame(geneIDs = gene_sets[[i]], pathwayIDs = i)
  gene.pathway <- rbind(gene.pathway,new_data)
}

gene.pathway$geneIDs <- mapIds(org.Hs.eg.db, keys = gene.pathway$geneIDs, keytype = "SYMBOL", column = "ENTREZID")
pathway.names <- data.frame(pathwayIDs = 1:length(gene_sets), pathwayNames = mito_genes$MitoPathway)
####

gene_list = sort(fc_new, decreasing = TRUE)
names(gene_list)=as.character(names(gene_list))

gene.pathway2=gene.pathway
gene.pathway2$pathwayIDs=pathway.names$pathwayNames[match(gene.pathway2$pathwayIDs,pathway.names$pathwayIDs)]
gene.pathway2=gene.pathway2[,c(2,1)]#need to put term gene in order

##Ben's mito to delete unused previous mito pathways
ben_mito=read_excel("/Users/yzha0247/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/Yunwei202110/IHD-noDM vs donor KEGG mito pathways 27.6.22.xlsx")
ben_mito$Description%in%gene.pathway2$pathwayIDs
deletion_index=which(!gene.pathway2$pathwayIDs%in%ben_mito$Description)
dim(gene.pathway2)
gene.pathway2=gene.pathway2[-deletion_index,]
dim(gene.pathway2)
###

colnames(result_path_dt3)=colnames(gene.pathway2)
colnames(MFresult_path_dt3)=colnames(gene.pathway2)
gene.pathway3=rbind.data.frame(gene.pathway2,result_path_dt3,MFresult_path_dt3)
set.seed(202205)
gseaResults_mito <-performGSEA_mito(gene_list,gene.pathway3,0.1)
gseaResults_mito$table
#gseaResults_mito$dot
#gseaResults_mito$enrich
#gseaResults_mito$tree

#Normalized enrichment scores (NES)
#https://www.biostars.org/p/169648/
  • pathway plot
  • Part(2) for plotting all enriched mito pathways

2.4.2 Part(1) KEGG+mito

  • combine the pathways together to do the gsea analysis
  • pathway table
library(clusterProfiler)
library(enrichplot)
performGSEA_mito <- function(gene_list,gene.pathway,pval){
  # gse <- GSEA(gene_list, 
  #           TERM2GENE = gene.pathway,
  #           pAdjustMethod = "none", 
  #           scoreType = "pos",
  #           nPermSimple = 10000
  #           )
  gse <- GSEA(gene_list, TERM2GENE = gene.pathway,nPerm = 1000, minGSSize = 5, maxGSSize = 500, pAdjustMethod = "BH",pvalueCutoff = pval)
  GSEresults.rounded <- gse@result[,1:9]
  GSEresults.rounded[,4:8] <- signif(GSEresults.rounded[,4:8],4)
  GSEresults.rounded[,6:7] <- sapply(GSEresults.rounded[,6:7], format, scientific = TRUE)
  table <- DT::datatable(GSEresults.rounded)
  dot <- clusterProfiler::dotplot(gse, showCategory=10, split=".sign") + facet_grid(.~.sign)
  gse <- enrichplot::pairwise_termsim(gse)
  enrich <- emapplot(gse, showCategory = 10)
  tree <- treeplot(gse) #https://github.com/YuLab-SMU/ggtree/issues/399 #might be a dplyr issue
  return(list(table = table, dot = dot, enrich = enrich, tree = tree,table2=GSEresults.rounded))
}

###
mito_genes <- read_excel("Human.MitoCarta3.0.xls", sheet = 4)
mito_genes <- mito_genes %>% dplyr::select(MitoPathway,Genes) %>% na.omit
gene_sets <- mito_genes$Genes %>% strsplit(", ")
gene.pathway <- data.frame(geneIDs = character(), pathwayIDs = integer())
for(i in 1:length(gene_sets)){
  new_data <- data.frame(geneIDs = gene_sets[[i]], pathwayIDs = i)
  gene.pathway <- rbind(gene.pathway,new_data)
}

gene.pathway$geneIDs <- mapIds(org.Hs.eg.db, keys = gene.pathway$geneIDs, keytype = "SYMBOL", column = "ENTREZID")
pathway.names <- data.frame(pathwayIDs = 1:length(gene_sets), pathwayNames = mito_genes$MitoPathway)
####

gene_list = sort(fc_new, decreasing = TRUE)
names(gene_list)=as.character(names(gene_list))

gene.pathway2=gene.pathway
gene.pathway2$pathwayIDs=pathway.names$pathwayNames[match(gene.pathway2$pathwayIDs,pathway.names$pathwayIDs)]
gene.pathway2=gene.pathway2[,c(2,1)]#need to put term gene in order

##Ben's mito to delete unused previous mito pathways
ben_mito=read_excel("/Users/yzha0247/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/Yunwei202110/IHD-noDM vs donor KEGG mito pathways 27.6.22.xlsx")
ben_mito$Description%in%gene.pathway2$pathwayIDs
deletion_index=which(!gene.pathway2$pathwayIDs%in%ben_mito$Description)
dim(gene.pathway2)
gene.pathway2=gene.pathway2[-deletion_index,]
dim(gene.pathway2)
###


colnames(kegg_full_dt)=colnames(gene.pathway2)
gene.pathway3=rbind.data.frame(gene.pathway2,kegg_full_dt)
set.seed(202205)
gseaResults_mito <-performGSEA_mito(gene_list,gene.pathway3,0.1)
gseaResults_mito$table
#gseaResults_mito$dot
#gseaResults_mito$enrich
#gseaResults_mito$tree

#Normalized enrichment scores (NES)
#https://www.biostars.org/p/169648/
  • pathway plot
  • Part(2) for plotting enriched mito pathways

2.4.3 Part(1) KEGG+mito threshold relax for our figure

library(clusterProfiler)
library(enrichplot)
performGSEA_mito <- function(gene_list,gene.pathway,pval){
  # gse <- GSEA(gene_list, 
  #           TERM2GENE = gene.pathway,
  #           pAdjustMethod = "none", 
  #           scoreType = "pos",
  #           nPermSimple = 10000
  #           )
  gse <- GSEA(gene_list, TERM2GENE = gene.pathway,nPerm = 1000, minGSSize = 3, maxGSSize = 500, pAdjustMethod = "none",pvalueCutoff = pval)
  GSEresults.rounded <- gse@result[,1:9]
  GSEresults.rounded[,4:8] <- signif(GSEresults.rounded[,4:8],4)
  GSEresults.rounded[,6:7] <- sapply(GSEresults.rounded[,6:7], format, scientific = TRUE)
  table <- DT::datatable(GSEresults.rounded)
  dot <- clusterProfiler::dotplot(gse, showCategory=10, split=".sign") + facet_grid(.~.sign)
  gse <- enrichplot::pairwise_termsim(gse)
  enrich <- emapplot(gse, showCategory = 10)
  tree <- treeplot(gse) #https://github.com/YuLab-SMU/ggtree/issues/399 #might be a dplyr issue
  return(list(table = table, dot = dot, enrich = enrich, tree = tree,table2=GSEresults.rounded))
}

###
mito_genes <- read_excel("Human.MitoCarta3.0.xls", sheet = 4)
mito_genes <- mito_genes %>% dplyr::select(MitoPathway,Genes) %>% na.omit
gene_sets <- mito_genes$Genes %>% strsplit(", ")
gene.pathway <- data.frame(geneIDs = character(), pathwayIDs = integer())
for(i in 1:length(gene_sets)){
  new_data <- data.frame(geneIDs = gene_sets[[i]], pathwayIDs = i)
  gene.pathway <- rbind(gene.pathway,new_data)
}

gene.pathway$geneIDs <- mapIds(org.Hs.eg.db, keys = gene.pathway$geneIDs, keytype = "SYMBOL", column = "ENTREZID")
pathway.names <- data.frame(pathwayIDs = 1:length(gene_sets), pathwayNames = mito_genes$MitoPathway)
####

gene_list = sort(fc_new, decreasing = TRUE)
names(gene_list)=as.character(names(gene_list))

gene.pathway2=gene.pathway
gene.pathway2$pathwayIDs=pathway.names$pathwayNames[match(gene.pathway2$pathwayIDs,pathway.names$pathwayIDs)]
gene.pathway2=gene.pathway2[,c(2,1)]#need to put term gene in order

##Ben's mito to delete unused previous mito pathways
ben_mito=read_excel("/Users/yzha0247/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/clean_up_folder/analysis/IHD-noDM vs donor KEGG mito pathways 27.6.22.xlsx")
ben_mito$Description%in%gene.pathway2$pathwayIDs
##  [1]  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE
## [13]  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE
## [25]  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE
## [37]  TRUE FALSE FALSE  TRUE FALSE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE
## [49] FALSE  TRUE FALSE  TRUE  TRUE FALSE  TRUE FALSE FALSE  TRUE FALSE  TRUE
## [61]  TRUE FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE FALSE  TRUE FALSE
## [73] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [85]  TRUE FALSE FALSE FALSE FALSE FALSE
## [1] 3904    2
## [1] 2416    2

3 Figure3A

## [1] 336   9
## [1] 285   9

4 Figure3C

5 Figure3D

6 Figure3e

  • this plot the scm and ecm pathways for ihd-dm vs donor

7 Figure3f

  • this plot the scm and ecm pathways for ihd-no dm vs donor