this file is created 0916, to get supp figure s5 which is the same as figure3 but use DCM instead, the donors are those >=21 donors.
this file is updated 1006 with Ben’s list of pathways to show.

#library(GEOquery)  ## go to https://github.com/seandavi/GEOquery for installation details
#library(R.utils)
library(reshape2)
library(ggplot2)
library(limma)
library(dplyr)
library(tidyverse)
#library(DMwR)
library(readxl)
library(MultiAssayExperiment)
library(S4Vectors)
library(SummarizedExperiment)
library(DT)
library(randomForest)
library(e1071)
library("bnstruct")
library(rminer)
library(ggpubr)
setwd("/Users/yzha0247/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/clean_up_folder/analysis")
# install.packages("VANData_1.0.0.tgz", repos=NULL,type="source")
# install.packages("VAN_1.0.0.tgz", repos=NULL,type="source") 

library(R.utils)
library(reshape2)
library(ggplot2)
library(limma)
library(dplyr)
library(tidyverse)
library(DMwR)
library(readxl)
library(magrittr)
library(tidyverse)
library(gage)
library(knitr)
library(DT)
library(janitor)
library(KEGGREST)
library(MASS)
library("AnnotationDbi")
library("org.Hs.eg.db")
library(dplyr)
library(reshape)
library(ggplot2)
library(plyr)
library(gtable)
library(grid)
library(pheatmap)
library(reshape2)
library(plotly)
library(UpSetR)
library(MatchIt)
require(pathview)
library(STRINGdb)
## Kegg sets data
require(gage)
require(gageData)
kegg = kegg.gsets(species = "hsa", id.type = "kegg")
kegg.sets.hsa = kegg$kg.sets

# go.hs=go.gsets(species="human")
# go.bp=go.hs$go.sets[go.hs$go.subs$BP]
# go.mf=go.hs$go.sets[go.hs$go.subs$MF]
# go.bpmf=go.hs$go.sets[go.hs$go.subs$BP&go.hs$go.subs$MF] #not equal, wired
# go.bpmf2=append(go.bp,go.mf)
require(gage)
require(gageData)
library(clusterProfiler)
library(enrichplot)
library(GO.db)
# #failed install
# if (!require("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# 
# BiocManager::install("GOfuncR")
# 
# if (!requireNamespace("devtools", quietly=TRUE))
#     install.packages("devtools")
# devtools::install_github("sgrote/GOfuncR")

1 DCM-DM VS Donor

1.1 DE analysis and standard pipeline

load("all_omics_lv_v5.RData")
protein_expression=assay(all_omics@ExperimentList$protein, "log2")
protein_expression1=as.data.frame(protein_expression)

dim(protein_expression1)

## [1] 9037  111

#changed 20220224: no overall filtering
protein_expression2=protein_expression1
dim(protein_expression2)

## [1] 9037  111

dim(protein_expression2[which(rowMeans(!is.na(protein_expression2)) > 0.25), ])

## [1] 4094  111

#20220321 add the age match as before
infodt1 = colData(all_omics)
infodt1$Age=as.numeric(infodt1$Age)
infodt1$ihd=as.numeric(as.vector(ifelse(infodt1$Condition=="DCM"&infodt1$Diabetes=="Yes",1,ifelse(infodt1$Condition=="Donor",0,2))))
ihddt=infodt1[infodt1$ihd!=2,]
ihddt=ihddt[(ihddt$ihd==0&ihddt$Age>=21)|ihddt$ihd==1,]
dim(ihddt)

## [1] 29 14

ihddt$new_filename

##  [1] "31_LV_DCM_Yes_22_F"   "32_LV_DCM_Yes_31_F"   "33_LV_DCM_Yes_53_M"  
##  [4] "34_LV_DCM_Yes_56_M"   "35_LV_DCM_Yes_56_M"   "36_LV_DCM_Yes_58_M"  
##  [7] "37_LV_DCM_Yes_59_M"   "38_LV_DCM_Yes_61_F"   "39_LV_DCM_Yes_63_F"  
## [10] "90_LV_Donor_No_21_M"  "91_LV_Donor_No_23_F"  "92_LV_Donor_No_23_M" 
## [13] "93_LV_Donor_No_25_F"  "94_LV_Donor_No_47_M"  "95_LV_Donor_No_47_F" 
## [16] "96_LV_Donor_No_48_M"  "97_LV_Donor_No_49_F"  "98_LV_Donor_No_51_F" 
## [19] "99_LV_Donor_No_53_M"  "100_LV_Donor_No_53_F" "101_LV_Donor_No_54_M"
## [22] "102_LV_Donor_No_54_F" "104_LV_Donor_No_55_F" "105_LV_Donor_No_56_M"
## [25] "106_LV_Donor_No_60_M" "107_LV_Donor_No_62_M" "108_LV_Donor_No_62_F"
## [28] "109_LV_Donor_No_65_M" "110_LV_Donor_No_65_F"

indexx=ihddt$new_filename
protein_expression2=protein_expression2[,colnames(protein_expression2)%in%indexx]
dim(protein_expression2)

## [1] 9037   29

#get the "lv" "ihd-dm" "ihd-no dm" non-imputed data
experiment_data_noimputation=protein_expression2[,union(grep("_DCM_Yes",colnames(protein_expression2)),grep("Donor_No",colnames(protein_expression2)))]
#colnames(experiment_data_noimputation)
dim(experiment_data_noimputation)

## [1] 9037   29

#filtering out proteins with >25% samples missing
experiment_data_noimputation=experiment_data_noimputation[which(rowMeans(!is.na(experiment_data_noimputation)) > 0.25), ]
dim(experiment_data_noimputation)

## [1] 4051   29

diabetes=grep("Yes",colnames(experiment_data_noimputation))
experiment_data_noimputation1=as.data.frame(t(experiment_data_noimputation))
rownames(experiment_data_noimputation1)=1:dim(experiment_data_noimputation1)[1]
experiment_data_noimputation1$y=ifelse(rownames(experiment_data_noimputation1) %in% diabetes,1,0)


#de analysis-- should be on the non-imputed data

    
    groupname<-as.factor(experiment_data_noimputation1$y)
    design <- model.matrix(~ groupname + 0)
    fit <- lmFit(experiment_data_noimputation, design)
    cont.matrix <- makeContrasts(groupname1-groupname0, levels=design)
    fit2 <- contrasts.fit(fit, cont.matrix)
    fit2 <- eBayes(fit2)
    tT <- topTable(fit2)
    df1 <- topTable(fit2, number=nrow(fit2), genelist=rownames(experiment_data_noimputation))
    df1$name=rownames(df1)
    df1$neglogP_value = -log10(df1$P.Value)

    # g1=ggplot2::ggplot(df1, aes(logFC,neglogP_value,label=name))+
    #   geom_point(size = 1, alpha=1) +
    #   xlab("log2 fold change") + ylab("-log10 p-value")+ggtitle("IHD-T2DM vs Donor") +
    #   theme_bw(base_size = 16) +
    #   geom_hline(yintercept = 1.3, linetype = "dashed", colour = "brown3")+theme(aspect.ratio = 1)
    
    #add fdr plot
adj_to_p <- function(df) {
  #df <- topTable(fit, coef = comparison, number = nrow(fit))
  if ((df %>%filter(adj.P.Val < 0.05) %>% nrow()) == 0){
    r <- 1
  } else {
    r <- df%>%mutate(rn = row_number()) %>% filter(adj.P.Val <= 0.05) %>% pull(rn)%>%max() + 1
  }
  n <- nrow(df) + 1
  lp10 <- -log10(r / n * 0.05)
  lp10
} 

#add samples
colnames(experiment_data_noimputation)

##  [1] "31_LV_DCM_Yes_22_F"   "33_LV_DCM_Yes_53_M"   "37_LV_DCM_Yes_59_M"  
##  [4] "32_LV_DCM_Yes_31_F"   "35_LV_DCM_Yes_56_M"   "39_LV_DCM_Yes_63_F"  
##  [7] "34_LV_DCM_Yes_56_M"   "36_LV_DCM_Yes_58_M"   "38_LV_DCM_Yes_61_F"  
## [10] "110_LV_Donor_No_65_F" "101_LV_Donor_No_54_M" "91_LV_Donor_No_23_F" 
## [13] "94_LV_Donor_No_47_M"  "98_LV_Donor_No_51_F"  "105_LV_Donor_No_56_M"
## [16] "90_LV_Donor_No_21_M"  "100_LV_Donor_No_53_F" "109_LV_Donor_No_65_M"
## [19] "99_LV_Donor_No_53_M"  "93_LV_Donor_No_25_F"  "97_LV_Donor_No_49_F" 
## [22] "92_LV_Donor_No_23_M"  "108_LV_Donor_No_62_F" "102_LV_Donor_No_54_F"
## [25] "96_LV_Donor_No_48_M"  "106_LV_Donor_No_60_M" "95_LV_Donor_No_47_F" 
## [28] "107_LV_Donor_No_62_M" "104_LV_Donor_No_55_F"

#add fdr plot
adj_to_p(df1)

## [1] 2.175502

highlight_df=df1
highlight_df$colorcode=as.character(ifelse(highlight_df$neglogP_value>adj_to_p(df1)&highlight_df$logFC>0,1,ifelse(highlight_df$neglogP_value>adj_to_p(df1)&highlight_df$logFC<0,-1,0)))
dim(highlight_df)

## [1] 4051   10

highlight_df=na.omit(highlight_df)
dim(highlight_df)

## [1] 4037   10

g1=ggplot2::ggplot(highlight_df, aes(logFC,neglogP_value,label=name))+
      geom_point(aes(x=logFC,y=neglogP_value,color=colorcode)) +
      xlab("log2 fold change") + ylab("-log10 p-value")+ggtitle("DCM-T2DM vs Donor") +
      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),aspect.ratio = 1) +
      geom_hline(yintercept = adj_to_p(df1), linetype = "dashed", colour = "red")+scale_color_manual(values = c("blue","gray","red"))
ggplotly(g1)

df1[,c(2:7,9)]=round(df1[,c(2:7,9)],4)
datatable(df1)

#pathway analysis
#pathway analysis as in our shiny:delete missing values
experiment_data_pathway=na.omit(experiment_data_noimputation)
dim(experiment_data_pathway)

## [1] 2048   29

##With voom for finding pathways
groupname<-as.factor(experiment_data_noimputation1$y)
design <- model.matrix(~ groupname + 0)
log2.cpm <- voom(experiment_data_pathway, design, normalize.method = "quantile")
fit <- lmFit(log2.cpm, design)
cont.matrix <- makeContrasts(groupname1-groupname0, levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2)
limma.res=topTable(fit2,n=Inf,sort="p", adjust.method = "fdr")
fc_new <- limma.res$logFC
limma.res$entrez = mapIds(org.Hs.eg.db,
                     keys=row.names(limma.res), 
                     column="ENTREZID",
                     keytype=keytypes(org.Hs.eg.db)[2],
                     multiVals="first")
names(fc_new) <- limma.res$entrez

geneList<-na.omit(fc_new)
geneList = sort(geneList, decreasing = TRUE)
kk2 <- gseGO(geneList     = geneList,
              OrgDb        = org.Hs.eg.db,
              ont          = "BP",
              minGSSize    = 100,
              maxGSSize    = 500,
              pvalueCutoff = 0.05,
              verbose      = FALSE)
kk2_dt=as.data.frame(kk2)
kk2_dt[,c(4,5,7,8)]=round(kk2_dt[,c(4,5,7,8)],4)
datatable(kk2_dt)

clusterProfiler::dotplot(kk2, showCategory=10, split=".sign") + facet_grid(.~.sign)

BP_namelist=kk2_dt$Description
#https://yulab-smu.top/biomedical-knowledge-mining-book/clusterprofiler-go.html
#gene ontology over-representation test
ego2 <- enrichGO(gene         = names(geneList),
                OrgDb         = org.Hs.eg.db,
                keyType       = 'ENTREZID',
                ont           = "BP",
                pAdjustMethod = "BH",
                pvalueCutoff  = 0.01,
                qvalueCutoff  = 0.05)
ego2_dt=as.data.frame(ego2)
ego2_dt[,5:7]=round(ego2_dt[,5:7],4)
datatable(ego2_dt)

# Let is suppose I have a collection of genesets called : HALLMARK Now let is suppose there is a specific geneset there called: E2F_targets
# 
# BgRatio, M/N.
# 
# M = size of the geneset (eg size of the E2F_targets); (is the number of genes within that distribution that are annotated (either directly or indirectly) to the node of interest).
# 
# N = size of all of the unique genes in the collection of genesets (example the HALLMARK collection); (is the total number of genes in the background distribution (universe)
# 
# GeneRatio is k/n.
# 
# k = size of the overlap of 'a vector of gene id' you input with the specific geneset (eg E2F_targets), only unique genes; (the number of genes within that list n, which are annotated to the node.
# 
# n = size of the overlap of 'a vector of gene id' you input with all the members of the collection of genesets (eg the HALLMARK collection),only unique genes; is the size of the list of genes of interest

#adj pval and qval
#https://support.bioconductor.org/p/96329/
#adj pval is the fdr adjusted pval, qval from another package is another adjustment method
#the BH approach is slightly more conservative than qvalue

#enrichgo:
# Over Representation Analysis (ORA) (Boyle et al. 2004) is a widely used approach to determine whether known biological functions or processes are over-represented (= enriched) in an experimentally-derived gene list, e.g. a list of differentially expressed genes (DEGs).
# 
# The p-value can be calculated by hypergeometric distribution.

#Leading edge analysis reports Tags to indicate the percentage of genes contributing to the enrichment score, List to indicate where in the list the enrichment score is attained and Signal for enrichment signal strength.

#It would also be very interesting to get the core enriched genes that contribute to the enrichment. Our packages (clusterProfiler, DOSE, meshes and ReactomePA) support leading edge analysis and report core enriched genes in GSEA analysis.

#get pathway maps
result_path <- enrichGO(gene         = names(geneList),
                OrgDb         = org.Hs.eg.db,
                keyType       = 'ENTREZID',
                ont           = "BP",
                pAdjustMethod = "BH",
                pvalueCutoff  =1,
                qvalueCutoff  =1)
result_path_dt=as.data.frame(result_path)
result_path_dt2=result_path_dt[,c("Description","geneID")]

result_path_dt3=data.frame(matrix(ncol=2))
for(i in 1:nrow(result_path_dt2)){
X2=unlist(strsplit(result_path_dt2$geneID[i],"/"))
X1=rep(result_path_dt2$Description[i],length(X2))
dt=cbind(X1,X2)
result_path_dt3=rbind(result_path_dt3,dt)
}

kk3 <- gseGO(geneList     = geneList,
              OrgDb        = org.Hs.eg.db,
              ont          = "MF",
              minGSSize    = 100,
              maxGSSize    = 500,
              pvalueCutoff = 0.05,
              verbose      = FALSE)
kk3_dt=as.data.frame(kk3)
kk3_dt[,c(4,5,7,8)]=round(kk3_dt[,c(4,5,7,8)],4)
datatable(as.data.frame(kk3_dt))

clusterProfiler::dotplot(kk3, showCategory=10, split=".sign") + facet_grid(.~.sign)

#https://yulab-smu.top/biomedical-knowledge-mining-book/clusterprofiler-go.html
#gene ontology over-representation test
ego3 <- enrichGO(gene         = names(geneList),
                OrgDb         = org.Hs.eg.db,
                keyType       = 'ENTREZID',
                ont           = "MF",
                pAdjustMethod = "BH",
                pvalueCutoff  = 0.01,
                qvalueCutoff  = 0.05)
ego3_dt=as.data.frame(ego3)
ego3_dt[,5:7]=round(ego3_dt[,5:7],4)
datatable(as.data.frame(ego3_dt))

MF_namelist=kk3_dt$Description

#get pathway maps
MFresult_path <- enrichGO(gene         = names(geneList),
                OrgDb         = org.Hs.eg.db,
                keyType       = 'ENTREZID',
                ont           = "MF",
                pAdjustMethod = "BH",
                pvalueCutoff  =1,
                qvalueCutoff  =1)
MFresult_path_dt=as.data.frame(MFresult_path)
MFresult_path_dt2=MFresult_path_dt[,c("Description","geneID")]

MFresult_path_dt3=data.frame(matrix(ncol=2))
for(i in 1:nrow(MFresult_path_dt2)){
X2=unlist(strsplit(MFresult_path_dt2$geneID[i],"/"))
X1=rep(MFresult_path_dt2$Description[i],length(X2))
dt=cbind(X1,X2)
MFresult_path_dt3=rbind(MFresult_path_dt3,dt)
}

1.2 check list for ecm & scm gene groups

#new things 20220907: add customised pathway analysis results
name=c("GO:0030198","GO:0055007","GO:0030239","GO:0060047","GO:0007010","GO:0031012","GO:0043292","GO:0005856")
AnnotationDbi::select(GO.db, keys=name, columns=c("TERM","ONTOLOGY"), keytype="GOID")

##         GOID                                TERM ONTOLOGY
## 1 GO:0030198   extracellular matrix organization       BP
## 2 GO:0055007 cardiac muscle cell differentiation       BP
## 3 GO:0030239                  myofibril assembly       BP
## 4 GO:0060047                   heart contraction       BP
## 5 GO:0007010           cytoskeleton organization       BP
## 6 GO:0031012                extracellular matrix       CC
## 7 GO:0043292                   contractile fiber       CC
## 8 GO:0005856                        cytoskeleton       CC

pathwayIDs=c("extracellular matrix organization","cardiac muscle cell differentiation", "myofibril assembly", "heart contraction" ,"cytoskeleton organization","extracellular matrix"  ,"contractile fiber", "cytoskeleton")

all_name=c(name,GOBPOFFSPRING[["GO:0030198"]],GOBPOFFSPRING[["GO:0055007"]],GOBPOFFSPRING[["GO:0030239"]],GOBPOFFSPRING[["GO:0060047"]],GOBPOFFSPRING[["GO:0007010"]],GOCCOFFSPRING[["GO:0031012"]],GOCCOFFSPRING[["GO:0043292"]],GOCCOFFSPRING[["GO:0005856"]])

all_name2=AnnotationDbi::select(GO.db, keys=all_name, columns=c("TERM","ONTOLOGY"), keytype="GOID")


GO_v<- AnnotationDbi::select(org.Hs.eg.db, keys=all_name2$GOID, columns=c("GO","ONTOLOGY","SYMBOL","ENTREZID"),keytype="GO")

pp2=merge(all_name2,GO_v,by.x="GOID",by.y="GO",all=TRUE)
datatable(pp2%>%group_by(GOID)%>%summarise(n=n()))

gene_list = sort(fc_new, decreasing = TRUE)
names(gene_list)=as.character(names(gene_list))

pathway_list3=pp2[,c("TERM","ENTREZID")]
colnames(pathway_list3)=c("pathwayIDs","geneIDs")
sum(names(gene_list)%in% pathway_list3$geneIDs)

## [1] 810

sum(!names(gene_list)%in% pathway_list3$geneIDs)

## [1] 1238

length(gene_list)

## [1] 2048

#pathway_list3%>%group_by(pathwayIDs)%>%summarise(n=n())

set.seed(20220907)
gse <- GSEA(gene_list, TERM2GENE = pathway_list3,nPerm = 1000, minGSSize = 3, maxGSSize = 500, pAdjustMethod = "none",pvalueCutoff = 0.05)
figure4a=as.data.frame(gse)
figure4av2=figure4a
figure4av2[,4:9]=round(figure4av2[,4:9],3)
datatable(figure4av2)

1.3 continue for DE analysis and standard pipeline

kegg_gene_list<-na.omit(fc_new)
kegg_gene_list = sort(kegg_gene_list, decreasing = TRUE)
set.seed(202205)
kk2 <- gseKEGG(geneList     = kegg_gene_list,
               organism     = "hsa",
               nPerm        = 10000,
               minGSSize    = 3,
               maxGSSize    = 800,
               pvalueCutoff = 0.05,
               pAdjustMethod = "BH",
               keyType       = "ncbi-geneid")
datatable(as.data.frame(kk2))

clusterProfiler::dotplot(kk2, showCategory=10, split=".sign") + facet_grid(.~.sign)

both2=as.data.frame(kk2)
both2=both2[-grep("disease",both2$Description),]
ggplot(both2,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=p.adjust,size=p.adjust))+coord_flip()+theme_bw()+ggtitle("GSEA for significantly downregulated and upregulated pathways")+labs(y="Enrichment score")+
  scale_colour_gradient(high = "red", low = "blue")

#get pathway maps
KEGGresult_path <- enrichKEGG(gene         = names(geneList),
              organism = "hsa", keyType = "kegg", pvalueCutoff = 1, pAdjustMethod = "BH",minGSSize = 0, maxGSSize = 50000000000, qvalueCutoff = 1)
KEGGresult_path_dt=as.data.frame(KEGGresult_path)
KEGGresult_path_dt2=KEGGresult_path_dt[,c("Description","geneID")]

KEGGresult_path_dt2=KEGGresult_path_dt2[-grep("disease",KEGGresult_path_dt2$Description),]

KEGGresult_path_dt3=data.frame(matrix(ncol=2))
for(i in 1:nrow(KEGGresult_path_dt2)){
X2=unlist(strsplit(KEGGresult_path_dt2$geneID[i],"/"))
X1=rep(KEGGresult_path_dt2$Description[i],length(X2))
dt=cbind(X1,X2)
KEGGresult_path_dt3=rbind(KEGGresult_path_dt3,dt)
}
print(paste("The total number of kegg pathways is",length(unique(KEGGresult_path_dt3$X1))))

## [1] "The total number of kegg pathways is 318"

#get all kegg pathways with the mapping
kegg_full_list=download_KEGG("hsa", keggType = "KEGG", keyType = "kegg")
head(kegg_full_list[[1]])

##       from    to
## 1 hsa00010 10327
## 2 hsa00010   124
## 3 hsa00010   125
## 4 hsa00010   126
## 5 hsa00010   127
## 6 hsa00010   128

head(kegg_full_list[[2]])

##       from                                       to
## 1 hsa00010             Glycolysis / Gluconeogenesis
## 2 hsa00020                Citrate cycle (TCA cycle)
## 3 hsa00030                Pentose phosphate pathway
## 4 hsa00040 Pentose and glucuronate interconversions
## 5 hsa00051          Fructose and mannose metabolism
## 6 hsa00052                     Galactose metabolism

kegg_full_list1=kegg_full_list[[1]]
length(unique(kegg_full_list1$from))

## [1] 347

kegg_full_list2=kegg_full_list[[2]]
dim(kegg_full_list2)

## [1] 558   2

kegg_full_dt=merge(kegg_full_list1,kegg_full_list2,by="from")
kegg_full_dt=kegg_full_dt[,c(3,2)]
length(unique(kegg_full_dt$to.y))

## [1] 347

## GSEA analysis
fc.kegg.p <- gage(fc_new, gsets = kegg.sets.hsa, ref = NULL, samp = NULL, rank.test = TRUE)

upreg <- data.frame(fc.kegg.p[[1]]) %>%
  rownames_to_column(var = "pathway") %>%
  mutate(KEGGpathway = substr(pathway, start = 1, stop = 8)) %>%
  mutate(pathwayname = word(pathway, 2, -1)) %>%
  select(pathway, KEGGpathway, pathwayname, set.size, stat.mean, p.val, q.val) %>%
  mutate(pathid = substr(KEGGpathway, start = 4, stop = 8))
upreg_selected = upreg %>%
  filter(q.val < 0.05) %>%
  filter(set.size >= 5)
upreg_selected$neglogP_Value = -log10(upreg_selected$p.val)
upreg_selected$neglogQ_value = -log10(upreg_selected$q.val)

downreg <- data.frame(fc.kegg.p[[2]]) %>%
  rownames_to_column(var = "pathway") %>%
  mutate(KEGGpathway = substr(pathway, start = 1, stop = 8)) %>%
  mutate(pathwayname = word(pathway, 2, -1)) %>%
  select(pathway, KEGGpathway, pathwayname, set.size, stat.mean, p.val, q.val) %>%
  mutate(pathid = substr(KEGGpathway, start = 4, stop = 8))
downreg_selected = downreg %>%
  filter(q.val < 0.05) %>%
  filter(set.size >= 5)
downreg_selected$neglogP_Value = -log10(downreg_selected$p.val)
downreg_selected$neglogQ_value = -log10(downreg_selected$q.val)


both = rbind(downreg_selected, upreg_selected)
both$absolute = abs(both$stat.mean)
#datatable(both)
both2 = both %>% filter(pathwayname != "Metabolic pathways" & pathwayname != "Retrograde endocannabinoid signaling")
both2=both2[-grep("disease",both2$pathway),]
both2[,c(5:7,9:11)]=round(both2[,c(5:7,9:11)],4)
datatable(both2)

#saveRDS(both, file = "both_selected.rds")

pv.out.list.fat <- sapply(both2$pathid[1:2],
                              function(pid) pathview(gene.data =  fc_new,
                                                    pathway.id = pid,
                                                    species = "hsa",
                                                    out.suffix="human",
                                                    low = list(gene = "deepskyblue1", cpd = "green"),
                                                    high = list(gene = "red", cpd = "yellow")))

library(ggrepel )
ggplot(both2, aes(x = stat.mean, y = -log10(p.val))) + geom_point( aes(color = set.size), size = 3) + labs(x = "Enrichment score", y = "neglog10p", title = "GSEA for significantly downregulated and upregulated pathways") + 
  geom_text_repel(aes(label=pathwayname), size=3) +
  scale_colour_gradient(high = "red", low = "blue") +
  scale_x_continuous(
  labels = scales::number_format(accuracy = 0.01))

ggplot(both2,aes(x = reorder(pathwayname, stat.mean),y=stat.mean))+geom_point(aes(color=-log10(p.val),size=-log10(p.val)))+coord_flip()+theme_bw()+ggtitle("GSEA for significantly downregulated and upregulated pathways")+labs(y="Enrichment score")+
  scale_colour_gradient(high = "red", low = "blue")

KEGG_namelist=both2$pathwayname

1.4 Part(1)

1.4.1 Part(1) GO+mito

combine the pathways together to do the gsea analysis
table with pathway enrichment results

library(clusterProfiler)
library(enrichplot)
performGSEA_mito <- function(gene_list,gene.pathway,pval){
  # gse <- GSEA(gene_list, 
  #           TERM2GENE = gene.pathway,
  #           pAdjustMethod = "none", 
  #           scoreType = "pos",
  #           nPermSimple = 10000
  #           )
  gse <- GSEA(gene_list, TERM2GENE = gene.pathway,nPerm = 1000, minGSSize = 5, maxGSSize = 500, pAdjustMethod = "BH",pvalueCutoff = pval)
  GSEresults.rounded <- gse@result[,1:9]
  GSEresults.rounded[,4:8] <- signif(GSEresults.rounded[,4:8],4)
  GSEresults.rounded[,6:7] <- sapply(GSEresults.rounded[,6:7], format, scientific = TRUE)
  table <- DT::datatable(GSEresults.rounded)
  dot <- clusterProfiler::dotplot(gse, showCategory=10, split=".sign") + facet_grid(.~.sign)
  gse <- enrichplot::pairwise_termsim(gse)
  enrich <- emapplot(gse, showCategory = 10)
  tree <- treeplot(gse) #https://github.com/YuLab-SMU/ggtree/issues/399 #might be a dplyr issue
  return(list(table = table, dot = dot, enrich = enrich, tree = tree,table2=GSEresults.rounded))
}

###
mito_genes <- read_excel("Human.MitoCarta3.0.xls", sheet = 4)
mito_genes <- mito_genes %>% dplyr::select(MitoPathway,Genes) %>% na.omit
gene_sets <- mito_genes$Genes %>% strsplit(", ")
gene.pathway <- data.frame(geneIDs = character(), pathwayIDs = integer())
for(i in 1:length(gene_sets)){
  new_data <- data.frame(geneIDs = gene_sets[[i]], pathwayIDs = i)
  gene.pathway <- rbind(gene.pathway,new_data)
}

gene.pathway$geneIDs <- mapIds(org.Hs.eg.db, keys = gene.pathway$geneIDs, keytype = "SYMBOL", column = "ENTREZID")
pathway.names <- data.frame(pathwayIDs = 1:length(gene_sets), pathwayNames = mito_genes$MitoPathway)
####

gene_list = sort(fc_new, decreasing = TRUE)
names(gene_list)=as.character(names(gene_list))

gene.pathway2=gene.pathway
gene.pathway2$pathwayIDs=pathway.names$pathwayNames[match(gene.pathway2$pathwayIDs,pathway.names$pathwayIDs)]
gene.pathway2=gene.pathway2[,c(2,1)]#need to put term gene in order

##Ben's mito to delete unused previous mito pathways
ben_mito=read_excel("/Users/yzha0247/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/Yunwei202110/IHD-DM paper figure 3a mitochondrial pathway list.xlsx")
ben_mito$Description%in%gene.pathway2$pathwayIDs
deletion_index=which(!gene.pathway2$pathwayIDs%in%ben_mito$Description)
dim(gene.pathway2)
gene.pathway2=gene.pathway2[-deletion_index,]
dim(gene.pathway2)
###

colnames(result_path_dt3)=colnames(gene.pathway2)
colnames(MFresult_path_dt3)=colnames(gene.pathway2)
gene.pathway3=rbind.data.frame(gene.pathway2,result_path_dt3,MFresult_path_dt3)
set.seed(202205)
gseaResults_mito <-performGSEA_mito(gene_list,gene.pathway3,0.1)
gseaResults_mito$table
#gseaResults_mito$dot
#gseaResults_mito$enrich
#gseaResults_mito$tree

#Normalized enrichment scores (NES)
#https://www.biostars.org/p/169648/

pathway plot

#my plot
ggplotdt=gseaResults_mito$table2
ggplotdt$p.adjust=as.numeric(ggplotdt$p.adjust)
ggplotdtup=ggplotdt[ggplotdt$enrichmentScore>0,]
ggplotdtupsub=ggplotdtup[order(-abs(ggplotdtup$enrichmentScore)),]
ggplotdtdown=ggplotdt[ggplotdt$enrichmentScore<0,]
ggplotdtdownsub=ggplotdtdown[order(ggplotdtdown$enrichmentScore),]

ggplotdt=rbind.data.frame(ggplotdtupsub[1:10,],ggplotdtdownsub[1:10,])
ggplot(ggplotdt,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=p.adjust,size=p.adjust))+coord_flip()+theme_bw()+ggtitle("pathway enrichment analysis")+
  scale_colour_gradient(high = "red", low = "blue")+ylim(c(-1,1.2))

Part(2) for plotting all the enriched mito pathways

#plot all the significant mito pathways
ggplotdt2=rbind.data.frame(ggplotdtupsub,ggplotdtdownsub)
ggplotdt2=ggplotdt2[ggplotdt2$Description%in%gene.pathway2$pathwayIDs,]
ggplot(ggplotdt2,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=p.adjust,size=p.adjust))+coord_flip()+theme_bw()+ggtitle("mitochondrail pathway enrichment analysis")+
  scale_colour_gradient(high = "red", low = "blue")+ylim(c(-1,1))

1.4.2 Part(1) KEGG+mito

combine the pathways together to do the gsea analysis
full pathway table

library(clusterProfiler)
library(enrichplot)
performGSEA_mito <- function(gene_list,gene.pathway,pval){
  # gse <- GSEA(gene_list, 
  #           TERM2GENE = gene.pathway,
  #           pAdjustMethod = "none", 
  #           scoreType = "pos",
  #           nPermSimple = 10000
  #           )
  gse <- GSEA(gene_list, TERM2GENE = gene.pathway,nPerm = 1000, minGSSize = 5, maxGSSize = 500, pAdjustMethod = "BH",pvalueCutoff = pval)
  GSEresults.rounded <- gse@result[,1:9]
  GSEresults.rounded[,4:8] <- signif(GSEresults.rounded[,4:8],4)
  GSEresults.rounded[,6:7] <- sapply(GSEresults.rounded[,6:7], format, scientific = TRUE)
  table <- DT::datatable(GSEresults.rounded)
  dot <- clusterProfiler::dotplot(gse, showCategory=10, split=".sign") + facet_grid(.~.sign)
  gse <- enrichplot::pairwise_termsim(gse)
  enrich <- emapplot(gse, showCategory = 10)
  tree <- treeplot(gse) #https://github.com/YuLab-SMU/ggtree/issues/399 #might be a dplyr issue
  return(list(table = table, dot = dot, enrich = enrich, tree = tree,table2=GSEresults.rounded))
}

###
mito_genes <- read_excel("Human.MitoCarta3.0.xls", sheet = 4)
mito_genes <- mito_genes %>% dplyr::select(MitoPathway,Genes) %>% na.omit
gene_sets <- mito_genes$Genes %>% strsplit(", ")
gene.pathway <- data.frame(geneIDs = character(), pathwayIDs = integer())
for(i in 1:length(gene_sets)){
  new_data <- data.frame(geneIDs = gene_sets[[i]], pathwayIDs = i)
  gene.pathway <- rbind(gene.pathway,new_data)
}

gene.pathway$geneIDs <- mapIds(org.Hs.eg.db, keys = gene.pathway$geneIDs, keytype = "SYMBOL", column = "ENTREZID")
pathway.names <- data.frame(pathwayIDs = 1:length(gene_sets), pathwayNames = mito_genes$MitoPathway)
####

gene_list = sort(fc_new, decreasing = TRUE)
names(gene_list)=as.character(names(gene_list))

gene.pathway2=gene.pathway
gene.pathway2$pathwayIDs=pathway.names$pathwayNames[match(gene.pathway2$pathwayIDs,pathway.names$pathwayIDs)]
gene.pathway2=gene.pathway2[,c(2,1)]#need to put term gene in order

##Ben's mito to delete unused previous mito pathways
ben_mito=read_excel("/Users/yzha0247/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/Yunwei202110/IHD-DM paper figure 3a mitochondrial pathway list.xlsx")
ben_mito$Description%in%gene.pathway2$pathwayIDs
deletion_index=which(!gene.pathway2$pathwayIDs%in%ben_mito$Description)
dim(gene.pathway2)
gene.pathway2=gene.pathway2[-deletion_index,]
dim(gene.pathway2)
###

#KEGGresult_path_dt3 is the old one
colnames(kegg_full_dt)=colnames(gene.pathway2)
gene.pathway3=rbind.data.frame(gene.pathway2,kegg_full_dt)
set.seed(202205)
gseaResults_mito <-performGSEA_mito(gene_list,gene.pathway3,0.05)
gseaResults_mito$table
#gseaResults_mito$dot
#gseaResults_mito$enrich
#gseaResults_mito$tree

#Normalized enrichment scores (NES)
#https://www.biostars.org/p/169648/

pathway enrichment plot

#my plot
ggplotdt=gseaResults_mito$table2
ggplotdt$p.adjust=as.numeric(ggplotdt$p.adjust)
ggplotdtup=ggplotdt[ggplotdt$enrichmentScore>0,]
ggplotdtupsub=ggplotdtup[order(-abs(ggplotdtup$enrichmentScore)),]
ggplotdtdown=ggplotdt[ggplotdt$enrichmentScore<0,]
ggplotdtdownsub=ggplotdtdown[order(ggplotdtdown$enrichmentScore),]

ggplotdt=rbind.data.frame(ggplotdtupsub[1:10,],ggplotdtdownsub[1:10,])
ggplot(ggplotdt,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=p.adjust,size=p.adjust))+coord_flip()+theme_bw()+ggtitle("pathway enrichment analysis")+
  scale_colour_gradient(high = "red", low = "blue")


ihddm_topnames=ggplotdt$Description
ihddm_dt=gseaResults_mito$table2

Part(2) for plotting all enriched mito pathways

#plot all the significant mito pathways
ggplotdt2=rbind.data.frame(ggplotdtupsub,ggplotdtdownsub)
ggplotdt2=ggplotdt2[ggplotdt2$Description%in%gene.pathway2$pathwayIDs,]
ggplot(ggplotdt2,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=p.adjust,size=p.adjust))+coord_flip()+theme_bw()+ggtitle("mitochondrail pathway enrichment analysis")+
  scale_colour_gradient(high = "red", low = "blue")+ylim(c(-1,1))

1.4.3 KEGG+mito relax threshold for the figure

library(clusterProfiler)
library(enrichplot)
performGSEA_mito <- function(gene_list,gene.pathway,pval){
  # gse <- GSEA(gene_list, 
  #           TERM2GENE = gene.pathway,
  #           pAdjustMethod = "none", 
  #           scoreType = "pos",
  #           nPermSimple = 10000
  #           )
  gse <- GSEA(gene_list, TERM2GENE = gene.pathway,nPerm = 1000, minGSSize = 3, maxGSSize = 500, pAdjustMethod = "none",pvalueCutoff = pval)
  GSEresults.rounded <- gse@result[,1:9]
  GSEresults.rounded[,4:8] <- signif(GSEresults.rounded[,4:8],4)
  GSEresults.rounded[,6:7] <- sapply(GSEresults.rounded[,6:7], format, scientific = TRUE)
  table <- DT::datatable(GSEresults.rounded)
  dot <- clusterProfiler::dotplot(gse, showCategory=10, split=".sign") + facet_grid(.~.sign)
  gse <- enrichplot::pairwise_termsim(gse)
  enrich <- emapplot(gse, showCategory = 10)
  tree <- treeplot(gse) #https://github.com/YuLab-SMU/ggtree/issues/399 #might be a dplyr issue
  return(list(table = table, dot = dot, enrich = enrich, tree = tree,table2=GSEresults.rounded))
}

###
mito_genes <- read_excel("Human.MitoCarta3.0.xls", sheet = 4)
mito_genes <- mito_genes %>% dplyr::select(MitoPathway,Genes) %>% na.omit
gene_sets <- mito_genes$Genes %>% strsplit(", ")
gene.pathway <- data.frame(geneIDs = character(), pathwayIDs = integer())
for(i in 1:length(gene_sets)){
  new_data <- data.frame(geneIDs = gene_sets[[i]], pathwayIDs = i)
  gene.pathway <- rbind(gene.pathway,new_data)
}

gene.pathway$geneIDs <- mapIds(org.Hs.eg.db, keys = gene.pathway$geneIDs, keytype = "SYMBOL", column = "ENTREZID")
pathway.names <- data.frame(pathwayIDs = 1:length(gene_sets), pathwayNames = mito_genes$MitoPathway)
####

gene_list = sort(fc_new, decreasing = TRUE)
names(gene_list)=as.character(names(gene_list))

gene.pathway2=gene.pathway
gene.pathway2$pathwayIDs=pathway.names$pathwayNames[match(gene.pathway2$pathwayIDs,pathway.names$pathwayIDs)]
gene.pathway2=gene.pathway2[,c(2,1)]#need to put term gene in order

##Ben's mito to delete unused previous mito pathways
ben_mito=read_excel("/Users/yzha0247/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/clean_up_folder/analysis/IHD-noDM vs donor KEGG mito pathways 27.6.22.xlsx")
ben_mito$Description%in%gene.pathway2$pathwayIDs

##  [1]  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE
## [13]  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE
## [25]  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE
## [37]  TRUE FALSE FALSE  TRUE FALSE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE
## [49] FALSE  TRUE FALSE  TRUE  TRUE FALSE  TRUE FALSE FALSE  TRUE FALSE  TRUE
## [61]  TRUE FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE FALSE  TRUE FALSE
## [73] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [85]  TRUE FALSE FALSE FALSE FALSE FALSE

deletion_index=which(!gene.pathway2$pathwayIDs%in%ben_mito$Description)
dim(gene.pathway2)

## [1] 3904    2

gene.pathway2=gene.pathway2[-deletion_index,]
dim(gene.pathway2)

## [1] 2416    2

###


colnames(kegg_full_dt)=colnames(gene.pathway2)
gene.pathway3=rbind.data.frame(gene.pathway2,kegg_full_dt)
set.seed(202205)
gseaResults_mito <-performGSEA_mito(gene_list,gene.pathway3,1)

#my plot
ggplotdt=gseaResults_mito$table2
ggplotdt$p.adjust=as.numeric(ggplotdt$p.adjust)
ggplotdtup=ggplotdt[ggplotdt$enrichmentScore>0,]
ggplotdtupsub=ggplotdtup[order(ggplotdtup$p.adjust),]
ggplotdtdown=ggplotdt[ggplotdt$enrichmentScore<0,]
ggplotdtdownsub=ggplotdtdown[order(ggplotdtdown$p.adjust),]

ggplotdt=rbind.data.frame(ggplotdtupsub[1:10,],ggplotdtdownsub[1:10,])

ggplotdt$pvalue=as.numeric(ggplotdt$pvalue)
gg=ggplot(ggplotdt,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=pvalue,size=pvalue))+coord_flip()+theme_bw()+ggtitle("dcm-dm vs donor pathway enrichment analysis")+
  scale_colour_gradient(high = "red", low = "blue")+theme(text = element_text(size = 10))

gg

ihddm_topnames2=ggplotdt$Description
ihddm_dt2=gseaResults_mito$table2
datatable(ihddm_dt2%>% mutate_if(is.numeric, ~round(., 3)))

2 DCM-NO DM VS Donor

2.1 DE analysis and standard pipeline

load("all_omics_lv_v5.RData")
protein_expression=assay(all_omics@ExperimentList$protein, "log2")
protein_expression1=as.data.frame(protein_expression)

dim(protein_expression1)

## [1] 9037  111

#changed 20220224: no overall filtering
protein_expression2=protein_expression1
dim(protein_expression2)

## [1] 9037  111

dim(protein_expression2[which(rowMeans(!is.na(protein_expression2)) > 0.25), ])

## [1] 4094  111

#20220321 add the age match as before
infodt1 = colData(all_omics)
infodt1$ihd=as.numeric(as.vector(ifelse(infodt1$Condition=="DCM"&infodt1$Diabetes=="No",1,ifelse(infodt1$Condition=="Donor",0,2))))
ihddt=infodt1[infodt1$ihd!=2,]
ihddt=ihddt[(ihddt$ihd==0&ihddt$Age>=21)|ihddt$ihd==1,]
dim(ihddt)

## [1] 38 14

ihddt$new_filename

##  [1] "40_LV_DCM_No_43_F"    "41_LV_DCM_No_43_M"    "42_LV_DCM_No_45_M"   
##  [4] "43_LV_DCM_No_49_M"    "44_LV_DCM_No_50_F"    "45_LV_DCM_No_51_M"   
##  [7] "46_LV_DCM_No_53_F"    "47_LV_DCM_No_53_M"    "48_LV_DCM_No_54_M"   
## [10] "49_LV_DCM_No_54_M"    "50_LV_DCM_No_55_M"    "51_LV_DCM_No_56_M"   
## [13] "52_LV_DCM_No_58_F"    "53_LV_DCM_No_58_M"    "54_LV_DCM_No_60_F"   
## [16] "55_LV_DCM_No_62_M"    "56_LV_DCM_No_52_M"    "90_LV_Donor_No_21_M" 
## [19] "91_LV_Donor_No_23_F"  "92_LV_Donor_No_23_M"  "93_LV_Donor_No_25_F" 
## [22] "94_LV_Donor_No_47_M"  "95_LV_Donor_No_47_F"  "96_LV_Donor_No_48_M" 
## [25] "97_LV_Donor_No_49_F"  "98_LV_Donor_No_51_F"  "99_LV_Donor_No_53_M" 
## [28] "100_LV_Donor_No_53_F" "101_LV_Donor_No_54_M" "102_LV_Donor_No_54_F"
## [31] "103_LV_DCM_No_44_M"   "104_LV_Donor_No_55_F" "105_LV_Donor_No_56_M"
## [34] "106_LV_Donor_No_60_M" "107_LV_Donor_No_62_M" "108_LV_Donor_No_62_F"
## [37] "109_LV_Donor_No_65_M" "110_LV_Donor_No_65_F"

indexx=ihddt$new_filename
protein_expression2=protein_expression2[,colnames(protein_expression2)%in%indexx]
dim(protein_expression2)

## [1] 9037   38

#get the "lv" "ihd-dm" "ihd-no dm" non-imputed data
experiment_data_noimputation=protein_expression2[,union(grep("_DCM_No",colnames(protein_expression2)),grep("Donor_No",colnames(protein_expression2)))]
#colnames(experiment_data_noimputation)
dim(experiment_data_noimputation)

## [1] 9037   38

#filtering out proteins with >25% samples missing
experiment_data_noimputation=experiment_data_noimputation[which(rowMeans(!is.na(experiment_data_noimputation)) > 0.25), ]
dim(experiment_data_noimputation)

## [1] 4075   38

diabetes=grep("DCM",colnames(experiment_data_noimputation))
experiment_data_noimputation1=as.data.frame(t(experiment_data_noimputation))
rownames(experiment_data_noimputation1)=1:dim(experiment_data_noimputation1)[1]
experiment_data_noimputation1$y=ifelse(rownames(experiment_data_noimputation1) %in% diabetes,1,0)


#de analysis-- should be on the non-imputed data

    
    groupname<-as.factor(experiment_data_noimputation1$y)
    design <- model.matrix(~ groupname + 0)
    fit <- lmFit(experiment_data_noimputation, design)
    cont.matrix <- makeContrasts(groupname1-groupname0, levels=design)
    fit2 <- contrasts.fit(fit, cont.matrix)
    fit2 <- eBayes(fit2)
    tT <- topTable(fit2)
    df1 <- topTable(fit2, number=nrow(fit2), genelist=rownames(experiment_data_noimputation))
    df1$name=rownames(df1)
    df1$neglogP_value = -log10(df1$P.Value)

    # g1=ggplot2::ggplot(df1, aes(logFC,neglogP_value,label=name))+
    #   geom_point(size = 1, alpha=1) +
    #   xlab("log2 fold change") + ylab("-log10 p-value")+ggtitle("IHD-T2DM vs Donor") +
    #   theme_bw(base_size = 16) +
    #   geom_hline(yintercept = 1.3, linetype = "dashed", colour = "brown3")+theme(aspect.ratio = 1)
    
    #add fdr plot
adj_to_p <- function(df) {
  #df <- topTable(fit, coef = comparison, number = nrow(fit))
  if ((df %>%filter(adj.P.Val < 0.05) %>% nrow()) == 0){
    r <- 1
  } else {
    r <- df%>%mutate(rn = row_number()) %>% filter(adj.P.Val <= 0.05) %>% pull(rn)%>%max() + 1
  }
  n <- nrow(df) + 1
  lp10 <- -log10(r / n * 0.05)
  lp10
} 

#add samples
colnames(experiment_data_noimputation)

##  [1] "50_LV_DCM_No_55_M"    "51_LV_DCM_No_56_M"    "46_LV_DCM_No_53_F"   
##  [4] "49_LV_DCM_No_54_M"    "55_LV_DCM_No_62_M"    "103_LV_DCM_No_44_M"  
##  [7] "44_LV_DCM_No_50_F"    "40_LV_DCM_No_43_F"    "41_LV_DCM_No_43_M"   
## [10] "54_LV_DCM_No_60_F"    "52_LV_DCM_No_58_F"    "48_LV_DCM_No_54_M"   
## [13] "45_LV_DCM_No_51_M"    "47_LV_DCM_No_53_M"    "53_LV_DCM_No_58_M"   
## [16] "56_LV_DCM_No_52_M"    "43_LV_DCM_No_49_M"    "42_LV_DCM_No_45_M"   
## [19] "110_LV_Donor_No_65_F" "101_LV_Donor_No_54_M" "91_LV_Donor_No_23_F" 
## [22] "94_LV_Donor_No_47_M"  "98_LV_Donor_No_51_F"  "105_LV_Donor_No_56_M"
## [25] "90_LV_Donor_No_21_M"  "100_LV_Donor_No_53_F" "109_LV_Donor_No_65_M"
## [28] "99_LV_Donor_No_53_M"  "93_LV_Donor_No_25_F"  "97_LV_Donor_No_49_F" 
## [31] "92_LV_Donor_No_23_M"  "108_LV_Donor_No_62_F" "102_LV_Donor_No_54_F"
## [34] "96_LV_Donor_No_48_M"  "106_LV_Donor_No_60_M" "95_LV_Donor_No_47_F" 
## [37] "107_LV_Donor_No_62_M" "104_LV_Donor_No_55_F"

#add fdr plot
adj_to_p(df1)

## [1] 1.97475

highlight_df=df1
highlight_df$colorcode=as.character(ifelse(highlight_df$neglogP_value>adj_to_p(df1)&highlight_df$logFC>0,1,ifelse(highlight_df$neglogP_value>adj_to_p(df1)&highlight_df$logFC<0,-1,0)))
dim(highlight_df)

## [1] 4075   10

highlight_df=na.omit(highlight_df)
dim(highlight_df)

## [1] 4072   10

g1=ggplot2::ggplot(highlight_df, aes(logFC,neglogP_value,label=name))+
      geom_point(aes(x=logFC,y=neglogP_value,color=colorcode)) +
      xlab("log2 fold change") + ylab("-log10 p-value")+ggtitle("DCM-NO DM vs Donor") +
      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),aspect.ratio = 1) +
      geom_hline(yintercept = adj_to_p(df1), linetype = "dashed", colour = "red")+scale_color_manual(values = c("blue","gray","red"))
ggplotly(g1)

df1[,c(2:7,9)]=round(df1[,c(2:7,9)],4)
datatable(df1)

#pathway analysis
#pathway analysis as in our shiny:delete missing values
experiment_data_pathway=na.omit(experiment_data_noimputation)
dim(experiment_data_pathway)

## [1] 1090   38

##With voom for finding pathways
groupname<-as.factor(experiment_data_noimputation1$y)
design <- model.matrix(~ groupname + 0)
log2.cpm <- voom(experiment_data_pathway, design, normalize.method = "quantile")
fit <- lmFit(log2.cpm, design)
cont.matrix <- makeContrasts(groupname1-groupname0, levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2)
limma.res=topTable(fit2,n=Inf,sort="p", adjust.method = "fdr")
fc_new <- limma.res$logFC
limma.res$entrez = mapIds(org.Hs.eg.db,
                     keys=row.names(limma.res), 
                     column="ENTREZID",
                     keytype=keytypes(org.Hs.eg.db)[2],
                     multiVals="first")
names(fc_new) <- limma.res$entrez

geneList<-na.omit(fc_new)
geneList = sort(geneList, decreasing = TRUE)
kk2 <- gseGO(geneList     = geneList,
              OrgDb        = org.Hs.eg.db,
              ont          = "BP",
              minGSSize    = 100,
              maxGSSize    = 500,
              pvalueCutoff = 0.05,
              verbose      = FALSE)
kk2_dt=as.data.frame(kk2)
kk2_dt[,c(4,5,7,8)]=round(kk2_dt[,c(4,5,7,8)],4)
datatable(kk2_dt)

clusterProfiler::dotplot(kk2, showCategory=10, split=".sign") + facet_grid(.~.sign)

BP_namelist=kk2_dt$Description
#https://yulab-smu.top/biomedical-knowledge-mining-book/clusterprofiler-go.html
#gene ontology over-representation test
ego2 <- enrichGO(gene         = names(geneList),
                OrgDb         = org.Hs.eg.db,
                keyType       = 'ENTREZID',
                ont           = "BP",
                pAdjustMethod = "BH",
                pvalueCutoff  = 0.01,
                qvalueCutoff  = 0.05)
ego2_dt=as.data.frame(ego2)
ego2_dt[,5:7]=round(ego2_dt[,5:7],4)
datatable(ego2_dt)

# Let is suppose I have a collection of genesets called : HALLMARK Now let is suppose there is a specific geneset there called: E2F_targets
# 
# BgRatio, M/N.
# 
# M = size of the geneset (eg size of the E2F_targets); (is the number of genes within that distribution that are annotated (either directly or indirectly) to the node of interest).
# 
# N = size of all of the unique genes in the collection of genesets (example the HALLMARK collection); (is the total number of genes in the background distribution (universe)
# 
# GeneRatio is k/n.
# 
# k = size of the overlap of 'a vector of gene id' you input with the specific geneset (eg E2F_targets), only unique genes; (the number of genes within that list n, which are annotated to the node.
# 
# n = size of the overlap of 'a vector of gene id' you input with all the members of the collection of genesets (eg the HALLMARK collection),only unique genes; is the size of the list of genes of interest

#adj pval and qval
#https://support.bioconductor.org/p/96329/
#adj pval is the fdr adjusted pval, qval from another package is another adjustment method
#the BH approach is slightly more conservative than qvalue


#get pathway maps
result_path <- enrichGO(gene         = names(geneList),
                OrgDb         = org.Hs.eg.db,
                keyType       = 'ENTREZID',
                ont           = "BP",
                pAdjustMethod = "BH",
                pvalueCutoff  =1,
                qvalueCutoff  =1)
result_path_dt=as.data.frame(result_path)
result_path_dt2=result_path_dt[,c("Description","geneID")]

result_path_dt3=data.frame(matrix(ncol=2))
for(i in 1:nrow(result_path_dt2)){
X2=unlist(strsplit(result_path_dt2$geneID[i],"/"))
X1=rep(result_path_dt2$Description[i],length(X2))
dt=cbind(X1,X2)
result_path_dt3=rbind(result_path_dt3,dt)
}

kk3 <- gseGO(geneList     = geneList,
              OrgDb        = org.Hs.eg.db,
              ont          = "MF",
              minGSSize    = 100,
              maxGSSize    = 500,
              pvalueCutoff = 0.05,
              verbose      = FALSE)
kk3_dt=as.data.frame(kk3)
kk3_dt[,c(4,5,7,8)]=round(kk3_dt[,c(4,5,7,8)],4)
datatable(as.data.frame(kk3_dt))

clusterProfiler::dotplot(kk3, showCategory=10, split=".sign") + facet_grid(.~.sign)

#https://yulab-smu.top/biomedical-knowledge-mining-book/clusterprofiler-go.html
#gene ontology over-representation test
ego3 <- enrichGO(gene         = names(geneList),
                OrgDb         = org.Hs.eg.db,
                keyType       = 'ENTREZID',
                ont           = "MF",
                pAdjustMethod = "BH",
                pvalueCutoff  = 0.01,
                qvalueCutoff  = 0.05)
ego3_dt=as.data.frame(ego3)
ego3_dt[,5:7]=round(ego3_dt[,5:7],4)
datatable(as.data.frame(ego3_dt))

MF_namelist=kk3_dt$Description

#get pathway maps
MFresult_path <- enrichGO(gene         = names(geneList),
                OrgDb         = org.Hs.eg.db,
                keyType       = 'ENTREZID',
                ont           = "MF",
                pAdjustMethod = "BH",
                pvalueCutoff  =1,
                qvalueCutoff  =1)
MFresult_path_dt=as.data.frame(MFresult_path)
MFresult_path_dt2=MFresult_path_dt[,c("Description","geneID")]

MFresult_path_dt3=data.frame(matrix(ncol=2))
for(i in 1:nrow(MFresult_path_dt2)){
X2=unlist(strsplit(MFresult_path_dt2$geneID[i],"/"))
X1=rep(MFresult_path_dt2$Description[i],length(X2))
dt=cbind(X1,X2)
MFresult_path_dt3=rbind(MFresult_path_dt3,dt)
}

2.2 check list for ecm & scm gene groups

#new things 20220907: add customised pathway analysis results
name=c("GO:0030198","GO:0055007","GO:0030239","GO:0060047","GO:0007010","GO:0031012","GO:0043292","GO:0005856")
AnnotationDbi::select(GO.db, keys=name, columns=c("TERM","ONTOLOGY"), keytype="GOID")

##         GOID                                TERM ONTOLOGY
## 1 GO:0030198   extracellular matrix organization       BP
## 2 GO:0055007 cardiac muscle cell differentiation       BP
## 3 GO:0030239                  myofibril assembly       BP
## 4 GO:0060047                   heart contraction       BP
## 5 GO:0007010           cytoskeleton organization       BP
## 6 GO:0031012                extracellular matrix       CC
## 7 GO:0043292                   contractile fiber       CC
## 8 GO:0005856                        cytoskeleton       CC

pathwayIDs=c("extracellular matrix organization","cardiac muscle cell differentiation", "myofibril assembly", "heart contraction" ,"cytoskeleton organization","extracellular matrix"  ,"contractile fiber", "cytoskeleton")

all_name=c(name,GOBPOFFSPRING[["GO:0030198"]],GOBPOFFSPRING[["GO:0055007"]],GOBPOFFSPRING[["GO:0030239"]],GOBPOFFSPRING[["GO:0060047"]],GOBPOFFSPRING[["GO:0007010"]],GOCCOFFSPRING[["GO:0031012"]],GOCCOFFSPRING[["GO:0043292"]],GOCCOFFSPRING[["GO:0005856"]])

all_name2=AnnotationDbi::select(GO.db, keys=all_name, columns=c("TERM","ONTOLOGY"), keytype="GOID")


GO_v<- AnnotationDbi::select(org.Hs.eg.db, keys=all_name2$GOID, columns=c("GO","ONTOLOGY","SYMBOL","ENTREZID"),keytype="GO")

pp2=merge(all_name2,GO_v,by.x="GOID",by.y="GO",all=TRUE)
datatable(pp2%>%group_by(GOID)%>%summarise(n=n()))

gene_list = sort(fc_new, decreasing = TRUE)
names(gene_list)=as.character(names(gene_list))

pathway_list3=pp2[,c("TERM","ENTREZID")]
colnames(pathway_list3)=c("pathwayIDs","geneIDs")
sum(names(gene_list)%in% pathway_list3$geneIDs)

## [1] 401

sum(!names(gene_list)%in% pathway_list3$geneIDs)

## [1] 689

length(gene_list)

## [1] 1090

#pathway_list3%>%group_by(pathwayIDs)%>%summarise(n=n())

set.seed(20220907)
gse <- GSEA(gene_list, TERM2GENE = pathway_list3,nPerm = 1000, minGSSize = 3, maxGSSize = 500, pAdjustMethod = "none",pvalueCutoff = 0.05)
figure4b=as.data.frame(gse)
figure4bv2=figure4b
figure4bv2[,4:9]=round(figure4bv2[,4:9],3)
datatable(figure4bv2)

2.3 continue for DE analysis and standard pipeline

kegg_gene_list<-na.omit(fc_new)
kegg_gene_list = sort(kegg_gene_list, decreasing = TRUE)
set.seed(202205)
kk2 <- gseKEGG(geneList     = kegg_gene_list,
               organism     = "hsa",
               nPerm        = 10000,
               minGSSize    = 3,
               maxGSSize    = 800,
               pvalueCutoff = 0.05,
               pAdjustMethod = "BH",
               keyType       = "ncbi-geneid")
datatable(as.data.frame(kk2))

clusterProfiler::dotplot(kk2, showCategory=10, split=".sign") + facet_grid(.~.sign)

both2=as.data.frame(kk2)
both2=both2[-grep("disease",both2$Description),]
ggplot(both2,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=p.adjust,size=p.adjust))+coord_flip()+theme_bw()+ggtitle("GSEA for significantly downregulated and upregulated pathways")+labs(y="Enrichment score")+
  scale_colour_gradient(high = "red", low = "blue")

#get pathway maps
KEGGresult_path <- enrichKEGG(gene         = names(geneList),
              organism = "hsa", keyType = "kegg", pvalueCutoff = 1, pAdjustMethod = "BH",minGSSize = 0, maxGSSize = 3000, qvalueCutoff = 1)
KEGGresult_path_dt=as.data.frame(KEGGresult_path)
KEGGresult_path_dt2=KEGGresult_path_dt[,c("Description","geneID")]

KEGGresult_path_dt2=KEGGresult_path_dt2[-grep("disease",KEGGresult_path_dt2$Description),]

KEGGresult_path_dt3=data.frame(matrix(ncol=2))
for(i in 1:nrow(KEGGresult_path_dt2)){
X2=unlist(strsplit(KEGGresult_path_dt2$geneID[i],"/"))
X1=rep(KEGGresult_path_dt2$Description[i],length(X2))
dt=cbind(X1,X2)
KEGGresult_path_dt3=rbind(KEGGresult_path_dt3,dt)
}

## GSEA analysis
fc.kegg.p <- gage(fc_new, gsets = kegg.sets.hsa, ref = NULL, samp = NULL, rank.test = TRUE)

upreg <- data.frame(fc.kegg.p[[1]]) %>%
  rownames_to_column(var = "pathway") %>%
  mutate(KEGGpathway = substr(pathway, start = 1, stop = 8)) %>%
  mutate(pathwayname = word(pathway, 2, -1)) %>%
  select(pathway, KEGGpathway, pathwayname, set.size, stat.mean, p.val, q.val) %>%
  mutate(pathid = substr(KEGGpathway, start = 4, stop = 8))
upreg_selected = upreg %>%
  filter(q.val < 0.05) %>%
  filter(set.size >= 5)
upreg_selected$neglogP_Value = -log10(upreg_selected$p.val)
upreg_selected$neglogQ_value = -log10(upreg_selected$q.val)

downreg <- data.frame(fc.kegg.p[[2]]) %>%
  rownames_to_column(var = "pathway") %>%
  mutate(KEGGpathway = substr(pathway, start = 1, stop = 8)) %>%
  mutate(pathwayname = word(pathway, 2, -1)) %>%
  select(pathway, KEGGpathway, pathwayname, set.size, stat.mean, p.val, q.val) %>%
  mutate(pathid = substr(KEGGpathway, start = 4, stop = 8))
downreg_selected = downreg %>%
  filter(q.val < 0.05) %>%
  filter(set.size >= 5)
downreg_selected$neglogP_Value = -log10(downreg_selected$p.val)
downreg_selected$neglogQ_value = -log10(downreg_selected$q.val)


both = rbind(downreg_selected, upreg_selected)
both$absolute = abs(both$stat.mean)
#datatable(both)
both2 = both %>% filter(pathwayname != "Metabolic pathways" & pathwayname != "Retrograde endocannabinoid signaling")
both2=both2[-grep("disease",both2$pathway),]
both2[,c(5:7,9:11)]=round(both2[,c(5:7,9:11)],4)
datatable(both2)

#saveRDS(both, file = "both_selected.rds")

# pv.out.list.fat <- sapply(both$pathid,
#                               function(pid) pathview(gene.data =  fc,
#                                                     pathway.id = pid,
#                                                     species = "hsa",
#                                                     out.suffix="human",
#                                                     low = list(gene = "deepskyblue1", cpd = "green"),
#                                                     high = list(gene = "red", cpd = "yellow")))

# library(ggrepel )
# ggplot(both2, aes(x = stat.mean, y = -log10(p.val))) + geom_point( aes(color = set.size), size = 3) + labs(x = "Enrichment score", y = "neglog10p", title = "GSEA for significantly downregulated and upregulated pathways") + 
#   geom_text_repel(aes(label=pathwayname), size=3) +
#   scale_colour_gradient(high = "red", low = "blue") +
#   scale_x_continuous(
#   labels = scales::number_format(accuracy = 0.01))

ggplot(both2,aes(x = reorder(pathwayname, stat.mean),y=stat.mean))+geom_point(aes(color=-log10(p.val),size=-log10(p.val)))+coord_flip()+theme_bw()+ggtitle("GSEA for significantly downregulated and upregulated pathways")+labs(y="Enrichment score")+
  scale_colour_gradient(high = "red", low = "blue")

KEGG_namelist=both2$pathwayname

2.4 Part(1)

2.4.1 Part(1) GO+mito

combine the pathways together to do the gsea analysis
full pathway table

library(clusterProfiler)
library(enrichplot)
performGSEA_mito <- function(gene_list,gene.pathway,pval){
  # gse <- GSEA(gene_list, 
  #           TERM2GENE = gene.pathway,
  #           pAdjustMethod = "none", 
  #           scoreType = "pos",
  #           nPermSimple = 10000
  #           )
  gse <- GSEA(gene_list, TERM2GENE = gene.pathway,nPerm = 1000, minGSSize = 5, maxGSSize = 500, pAdjustMethod = "BH",pvalueCutoff = pval)
  GSEresults.rounded <- gse@result[,1:9]
  GSEresults.rounded[,4:8] <- signif(GSEresults.rounded[,4:8],4)
  GSEresults.rounded[,6:7] <- sapply(GSEresults.rounded[,6:7], format, scientific = TRUE)
  table <- DT::datatable(GSEresults.rounded)
  dot <- clusterProfiler::dotplot(gse, showCategory=10, split=".sign") + facet_grid(.~.sign)
  gse <- enrichplot::pairwise_termsim(gse)
  enrich <- emapplot(gse, showCategory = 10)
  tree <- treeplot(gse) #https://github.com/YuLab-SMU/ggtree/issues/399 #might be a dplyr issue
  return(list(table = table, dot = dot, enrich = enrich, tree = tree,table2=GSEresults.rounded))
}

###
mito_genes <- read_excel("Human.MitoCarta3.0.xls", sheet = 4)
mito_genes <- mito_genes %>% dplyr::select(MitoPathway,Genes) %>% na.omit
gene_sets <- mito_genes$Genes %>% strsplit(", ")
gene.pathway <- data.frame(geneIDs = character(), pathwayIDs = integer())
for(i in 1:length(gene_sets)){
  new_data <- data.frame(geneIDs = gene_sets[[i]], pathwayIDs = i)
  gene.pathway <- rbind(gene.pathway,new_data)
}

gene.pathway$geneIDs <- mapIds(org.Hs.eg.db, keys = gene.pathway$geneIDs, keytype = "SYMBOL", column = "ENTREZID")
pathway.names <- data.frame(pathwayIDs = 1:length(gene_sets), pathwayNames = mito_genes$MitoPathway)
####

gene_list = sort(fc_new, decreasing = TRUE)
names(gene_list)=as.character(names(gene_list))

gene.pathway2=gene.pathway
gene.pathway2$pathwayIDs=pathway.names$pathwayNames[match(gene.pathway2$pathwayIDs,pathway.names$pathwayIDs)]
gene.pathway2=gene.pathway2[,c(2,1)]#need to put term gene in order

##Ben's mito to delete unused previous mito pathways
ben_mito=read_excel("/Users/yzha0247/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/Yunwei202110/IHD-noDM vs donor KEGG mito pathways 27.6.22.xlsx")
ben_mito$Description%in%gene.pathway2$pathwayIDs
deletion_index=which(!gene.pathway2$pathwayIDs%in%ben_mito$Description)
dim(gene.pathway2)
gene.pathway2=gene.pathway2[-deletion_index,]
dim(gene.pathway2)
###

colnames(result_path_dt3)=colnames(gene.pathway2)
colnames(MFresult_path_dt3)=colnames(gene.pathway2)
gene.pathway3=rbind.data.frame(gene.pathway2,result_path_dt3,MFresult_path_dt3)
set.seed(202205)
gseaResults_mito <-performGSEA_mito(gene_list,gene.pathway3,0.1)
gseaResults_mito$table
#gseaResults_mito$dot
#gseaResults_mito$enrich
#gseaResults_mito$tree

#Normalized enrichment scores (NES)
#https://www.biostars.org/p/169648/

pathway plot

#my plot
ggplotdt=gseaResults_mito$table2
ggplotdt$p.adjust=as.numeric(ggplotdt$p.adjust)
ggplotdtup=ggplotdt[ggplotdt$enrichmentScore>0,]
ggplotdtupsub=ggplotdtup[order(-abs(ggplotdtup$enrichmentScore)),]
ggplotdtdown=ggplotdt[ggplotdt$enrichmentScore<0,]
ggplotdtdownsub=ggplotdtdown[order(ggplotdtdown$enrichmentScore),]

ggplotdt=rbind.data.frame(ggplotdtupsub[1:10,],ggplotdtdownsub[1:10,])
ggplot(ggplotdt,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=p.adjust,size=p.adjust))+coord_flip()+theme_bw()+ggtitle("pathway enrichment analysis")+
  scale_colour_gradient(high = "red", low = "blue")

Part(2) for plotting all enriched mito pathways

#plot all the significant mito pathways
ggplotdt2=rbind.data.frame(ggplotdtupsub,ggplotdtdownsub)
ggplotdt2=ggplotdt2[ggplotdt2$Description%in%gene.pathway2$pathwayIDs,]
ggplot(ggplotdt2,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=p.adjust,size=p.adjust))+coord_flip()+theme_bw()+ggtitle("mitochondrail pathway enrichment analysis")+
  scale_colour_gradient(high = "red", low = "blue")+ylim(c(-1,1))

2.4.2 Part(1) KEGG+mito

combine the pathways together to do the gsea analysis
pathway table

library(clusterProfiler)
library(enrichplot)
performGSEA_mito <- function(gene_list,gene.pathway,pval){
  # gse <- GSEA(gene_list, 
  #           TERM2GENE = gene.pathway,
  #           pAdjustMethod = "none", 
  #           scoreType = "pos",
  #           nPermSimple = 10000
  #           )
  gse <- GSEA(gene_list, TERM2GENE = gene.pathway,nPerm = 1000, minGSSize = 5, maxGSSize = 500, pAdjustMethod = "BH",pvalueCutoff = pval)
  GSEresults.rounded <- gse@result[,1:9]
  GSEresults.rounded[,4:8] <- signif(GSEresults.rounded[,4:8],4)
  GSEresults.rounded[,6:7] <- sapply(GSEresults.rounded[,6:7], format, scientific = TRUE)
  table <- DT::datatable(GSEresults.rounded)
  dot <- clusterProfiler::dotplot(gse, showCategory=10, split=".sign") + facet_grid(.~.sign)
  gse <- enrichplot::pairwise_termsim(gse)
  enrich <- emapplot(gse, showCategory = 10)
  tree <- treeplot(gse) #https://github.com/YuLab-SMU/ggtree/issues/399 #might be a dplyr issue
  return(list(table = table, dot = dot, enrich = enrich, tree = tree,table2=GSEresults.rounded))
}

###
mito_genes <- read_excel("Human.MitoCarta3.0.xls", sheet = 4)
mito_genes <- mito_genes %>% dplyr::select(MitoPathway,Genes) %>% na.omit
gene_sets <- mito_genes$Genes %>% strsplit(", ")
gene.pathway <- data.frame(geneIDs = character(), pathwayIDs = integer())
for(i in 1:length(gene_sets)){
  new_data <- data.frame(geneIDs = gene_sets[[i]], pathwayIDs = i)
  gene.pathway <- rbind(gene.pathway,new_data)
}

gene.pathway$geneIDs <- mapIds(org.Hs.eg.db, keys = gene.pathway$geneIDs, keytype = "SYMBOL", column = "ENTREZID")
pathway.names <- data.frame(pathwayIDs = 1:length(gene_sets), pathwayNames = mito_genes$MitoPathway)
####

gene_list = sort(fc_new, decreasing = TRUE)
names(gene_list)=as.character(names(gene_list))

gene.pathway2=gene.pathway
gene.pathway2$pathwayIDs=pathway.names$pathwayNames[match(gene.pathway2$pathwayIDs,pathway.names$pathwayIDs)]
gene.pathway2=gene.pathway2[,c(2,1)]#need to put term gene in order

##Ben's mito to delete unused previous mito pathways
ben_mito=read_excel("/Users/yzha0247/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/Yunwei202110/IHD-noDM vs donor KEGG mito pathways 27.6.22.xlsx")
ben_mito$Description%in%gene.pathway2$pathwayIDs
deletion_index=which(!gene.pathway2$pathwayIDs%in%ben_mito$Description)
dim(gene.pathway2)
gene.pathway2=gene.pathway2[-deletion_index,]
dim(gene.pathway2)
###


colnames(kegg_full_dt)=colnames(gene.pathway2)
gene.pathway3=rbind.data.frame(gene.pathway2,kegg_full_dt)
set.seed(202205)
gseaResults_mito <-performGSEA_mito(gene_list,gene.pathway3,0.1)
gseaResults_mito$table
#gseaResults_mito$dot
#gseaResults_mito$enrich
#gseaResults_mito$tree

#Normalized enrichment scores (NES)
#https://www.biostars.org/p/169648/

pathway plot

#my plot
ggplotdt=gseaResults_mito$table2
ggplotdt$p.adjust=as.numeric(ggplotdt$p.adjust)
ggplotdtup=ggplotdt[ggplotdt$enrichmentScore>0,]
ggplotdtupsub=ggplotdtup[order(-abs(ggplotdtup$enrichmentScore)),]
ggplotdtdown=ggplotdt[ggplotdt$enrichmentScore<0,]
ggplotdtdownsub=ggplotdtdown[order(ggplotdtdown$enrichmentScore),]

ggplotdt=rbind.data.frame(ggplotdtupsub[1:2,],ggplotdtdownsub[1:10,])

ggplot(ggplotdt,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=p.adjust,size=p.adjust))+coord_flip()+theme_bw()+ggtitle("pathway enrichment analysis")+
  scale_colour_gradient(high = "red", low = "blue")


sum(unique(gene.pathway2$pathwayIDs)%in% ggplotdt$Description)/nrow(ggplotdt)

ihdnodm_topnames=ggplotdt$Description
ihdnodm_dt=gseaResults_mito$table2

Part(2) for plotting enriched mito pathways

#plot all the significant mito pathways
ggplotdt2=rbind.data.frame(ggplotdtupsub,ggplotdtdownsub)
ggplotdt2=ggplotdt2[ggplotdt2$Description%in%gene.pathway2$pathwayIDs,]
ggplot(ggplotdt2,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=p.adjust,size=p.adjust))+coord_flip()+theme_bw()+ggtitle("mitochondrail pathway enrichment analysis")+
  scale_colour_gradient(high = "red", low = "blue")+ylim(c(-1,1))

2.4.3 Part(1) KEGG+mito threshold relax for our figure

library(clusterProfiler)
library(enrichplot)
performGSEA_mito <- function(gene_list,gene.pathway,pval){
  # gse <- GSEA(gene_list, 
  #           TERM2GENE = gene.pathway,
  #           pAdjustMethod = "none", 
  #           scoreType = "pos",
  #           nPermSimple = 10000
  #           )
  gse <- GSEA(gene_list, TERM2GENE = gene.pathway,nPerm = 1000, minGSSize = 3, maxGSSize = 500, pAdjustMethod = "none",pvalueCutoff = pval)
  GSEresults.rounded <- gse@result[,1:9]
  GSEresults.rounded[,4:8] <- signif(GSEresults.rounded[,4:8],4)
  GSEresults.rounded[,6:7] <- sapply(GSEresults.rounded[,6:7], format, scientific = TRUE)
  table <- DT::datatable(GSEresults.rounded)
  dot <- clusterProfiler::dotplot(gse, showCategory=10, split=".sign") + facet_grid(.~.sign)
  gse <- enrichplot::pairwise_termsim(gse)
  enrich <- emapplot(gse, showCategory = 10)
  tree <- treeplot(gse) #https://github.com/YuLab-SMU/ggtree/issues/399 #might be a dplyr issue
  return(list(table = table, dot = dot, enrich = enrich, tree = tree,table2=GSEresults.rounded))
}

###
mito_genes <- read_excel("Human.MitoCarta3.0.xls", sheet = 4)
mito_genes <- mito_genes %>% dplyr::select(MitoPathway,Genes) %>% na.omit
gene_sets <- mito_genes$Genes %>% strsplit(", ")
gene.pathway <- data.frame(geneIDs = character(), pathwayIDs = integer())
for(i in 1:length(gene_sets)){
  new_data <- data.frame(geneIDs = gene_sets[[i]], pathwayIDs = i)
  gene.pathway <- rbind(gene.pathway,new_data)
}

gene.pathway$geneIDs <- mapIds(org.Hs.eg.db, keys = gene.pathway$geneIDs, keytype = "SYMBOL", column = "ENTREZID")
pathway.names <- data.frame(pathwayIDs = 1:length(gene_sets), pathwayNames = mito_genes$MitoPathway)
####

gene_list = sort(fc_new, decreasing = TRUE)
names(gene_list)=as.character(names(gene_list))

gene.pathway2=gene.pathway
gene.pathway2$pathwayIDs=pathway.names$pathwayNames[match(gene.pathway2$pathwayIDs,pathway.names$pathwayIDs)]
gene.pathway2=gene.pathway2[,c(2,1)]#need to put term gene in order

##Ben's mito to delete unused previous mito pathways
ben_mito=read_excel("/Users/yzha0247/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/clean_up_folder/analysis/IHD-noDM vs donor KEGG mito pathways 27.6.22.xlsx")
ben_mito$Description%in%gene.pathway2$pathwayIDs

##  [1]  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE
## [13]  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE
## [25]  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE
## [37]  TRUE FALSE FALSE  TRUE FALSE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE
## [49] FALSE  TRUE FALSE  TRUE  TRUE FALSE  TRUE FALSE FALSE  TRUE FALSE  TRUE
## [61]  TRUE FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE FALSE  TRUE FALSE
## [73] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [85]  TRUE FALSE FALSE FALSE FALSE FALSE

deletion_index=which(!gene.pathway2$pathwayIDs%in%ben_mito$Description)
dim(gene.pathway2)

## [1] 3904    2

gene.pathway2=gene.pathway2[-deletion_index,]
dim(gene.pathway2)

## [1] 2416    2

###


colnames(kegg_full_dt)=colnames(gene.pathway2)
gene.pathway3=rbind.data.frame(gene.pathway2,kegg_full_dt)
set.seed(202205)
gseaResults_mito <-performGSEA_mito(gene_list,gene.pathway3,1)

ggplotdt=gseaResults_mito$table2
ggplotdt$p.adjust=as.numeric(ggplotdt$p.adjust)
ggplotdtup=ggplotdt[ggplotdt$enrichmentScore>0,]
ggplotdtupsub=ggplotdtup[order(ggplotdtup$p.adjust),]
ggplotdtdown=ggplotdt[ggplotdt$enrichmentScore<0,]
ggplotdtdownsub=ggplotdtdown[order(ggplotdtdown$p.adjust),]

ggplotdt=rbind.data.frame(ggplotdtupsub[1:10,],ggplotdtdownsub[1:10,])
ggplotdt$pvalue=as.numeric(ggplotdt$pvalue)
gg=ggplot(ggplotdt,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=pvalue,size=pvalue))+coord_flip()+theme_bw()+ggtitle("dcm-no dm vs donor pathway enrichment analysis")+
  scale_colour_gradient(high = "red", low = "blue")+theme(text = element_text(size = 10))

gg

ihdnodm_topnames2=ggplotdt$Description
ihdnodm_dt2=gseaResults_mito$table2
datatable(ihdnodm_dt2%>% mutate_if(is.numeric, ~round(., 3)))

3 Figure3A

dim(ihddm_dt2)

## [1] 336   9

dim(ihdnodm_dt2)

## [1] 285   9

common_name=c(ihddm_dt2$Description,ihdnodm_dt2$Description)
plotmerge=merge(ihddm_dt2,ihdnodm_dt2,by="Description")
plotmerge$pvalue.x=-log10(as.numeric(plotmerge$pvalue.x))
plotmerge$pvalue.y=-log10(as.numeric(plotmerge$pvalue.y))
annotation=read_excel("/Users/yzha0247/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/Yunwei202110/IHD-DM paper figure 3a pathways to annotate.xlsx")
plotmerge$mito=as.character(ifelse(plotmerge$Description%in% ben_mito$Description,1,0))
plotmerge$annotation=as.character(ifelse(plotmerge$Description%in% annotation$`Amino acid metabolism`,1,0))
ggplotly(ggplot(plotmerge,aes(pvalue.x,pvalue.y,label=Description,color=mito))+geom_point()+ggtitle("kegg+mito pathways DCM-dm vs DCM-no dm")+xlab("neg_log10_pval in ind-dm vs donor")+ylab("neg_log10_pval in ind-no dm vs donor")+theme_bw())

library(ggrepel)
#gg=ggplot(plotmerge,aes(pvalue.x,pvalue.y,label=Description,color=mito))+geom_point()+ geom_text_repel(aes(x=pvalue.x,y=pvalue.y,label=Description),subset(plotmerge,((pvalue.x>1&pvalue.y<2)|(pvalue.x<1&pvalue.y>1)|(pvalue.x>2&pvalue.y>2))), size=3)+ggtitle("kegg+mito pathways ihd-dm vs ihd-no dm")+xlab("neg_log10_pval in ind-dm vs donor")+ylab("neg_log10_pval in ind-no dm vs donor")+ scale_color_manual(values=c("black", "seagreen"))+theme_bw()
gg=ggplot(plotmerge,aes(pvalue.x,pvalue.y,label=Description,color=mito))+geom_point()+ geom_text_repel(aes(x=pvalue.x,y=pvalue.y,label=Description),subset(plotmerge,plotmerge$annotation==1), size=3)+ggtitle("kegg+mito pathways DCM-dm vs DCM-no dm")+xlab("neg_log10_pval in ind-dm vs donor")+ylab("neg_log10_pval in ind-no dm vs donor")+ scale_color_manual(values=c("black", "seagreen"))+theme_bw()+theme(aspect.ratio = 1)
gg

#ggsave(filename="s5a.pdf",gg,height = 10,width = 10)

4 Figure3C

selected_mito=read_excel("/Users/yzha0247/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/clean_up_folder/analysis/DCM Figure S5c and d mitochondrial pathways.xlsx")
datatable(ihddm_dt2)

names=selected_mito$Description
ggplotdt=ihddm_dt2[ihddm_dt2$Description%in%names,]
ggplotdt$pvalue=as.numeric(ggplotdt$pvalue)
ggplotdtup=ggplotdt[ggplotdt$enrichmentScore>0,]
ggplotdtupsub=ggplotdtup[order(-abs(ggplotdtup$enrichmentScore)),]
ggplotdtdown=ggplotdt[ggplotdt$enrichmentScore<0,]
ggplotdtdownsub=ggplotdtdown[order(ggplotdtdown$enrichmentScore),]

ggplotdt=rbind.data.frame(ggplotdtupsub,ggplotdtdownsub)

gg=ggplot(ggplotdt,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=pvalue,size=pvalue))+coord_flip()+theme_bw()+ggtitle("DCM-dm vs donor pathway enrichment analysis")+
  scale_colour_gradient(high = "blue", low = "red")+theme(text = element_text(size = 20))+guides(size = guide_legend(reverse=TRUE))+scale_size_continuous(trans = "reverse")

gg

#ggsave("s5c.pdf",plot = gg,height = 10,width = 10)

5 Figure3D

selected_mito=read_excel("/Users/yzha0247/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/clean_up_folder/analysis/DCM Figure S5c and d mitochondrial pathways.xlsx",sheet=2)
datatable(ihdnodm_dt2)

names=selected_mito$Description
ggplotdt=ihdnodm_dt2[ihdnodm_dt2$Description%in%names,]
ggplotdt$pvalue=as.numeric(ggplotdt$pvalue)
ggplotdtup=ggplotdt[ggplotdt$enrichmentScore>0,]
ggplotdtupsub=ggplotdtup[order(-abs(ggplotdtup$enrichmentScore)),]
ggplotdtdown=ggplotdt[ggplotdt$enrichmentScore<0,]
ggplotdtdownsub=ggplotdtdown[order(ggplotdtdown$enrichmentScore),]

ggplotdt=rbind.data.frame(ggplotdtupsub,ggplotdtdownsub)

gg=ggplot(ggplotdt,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=pvalue,size=pvalue))+coord_flip()+theme_bw()+ggtitle("DCM-no dm vs donor pathway enrichment analysis")+
  scale_colour_gradient(high = "blue", low = "red")+theme(text = element_text(size = 20))+guides(size = guide_legend(reverse=TRUE))+scale_size_continuous(trans = "reverse")

gg

ggsave("s5d.pdf",plot = gg,height = 10,width = 10)

6 Figure3e

this plot the scm and ecm pathways for ihd-dm vs donor

figure4av2=figure4a
figure4av2[,4:9]=round(figure4av2[,4:9],3)
datatable(figure4av2)

ggplotdt=figure4a
ggplotdt$pvalue=as.numeric(ggplotdt$pvalue)
ggplotdtup=ggplotdt[ggplotdt$enrichmentScore>0,]
ggplotdtupsub=ggplotdtup[order(-abs(ggplotdtup$enrichmentScore)),]
ggplotdtdown=ggplotdt[ggplotdt$enrichmentScore<0,]
ggplotdtdownsub=ggplotdtdown[order(ggplotdtdown$enrichmentScore),]

ggplotdt=rbind.data.frame(ggplotdtupsub,ggplotdtdownsub)

gg=ggplot(ggplotdt,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=pvalue,size=pvalue))+coord_flip()+theme_bw()+ggtitle("DCM-dm vs donor pathway enrichment analysis")+
  scale_colour_gradient(high = "blue", low = "red")+theme(text = element_text(size = 20))+scale_size_continuous(trans = "reverse")+guides(size = guide_legend(reverse=TRUE))

gg

#ggsave("s5e.pdf",plot = gg,height = 10,width = 10)

7 Figure3f

this plot the scm and ecm pathways for ihd-no dm vs donor

figure4bv2=figure4b
figure4bv2[,4:9]=round(figure4bv2[,4:9],3)
datatable(figure4bv2)

ggplotdt=figure4b
ggplotdt$pvalue=as.numeric(ggplotdt$pvalue)
ggplotdtup=ggplotdt[ggplotdt$enrichmentScore>0,]
ggplotdtupsub=ggplotdtup[order(-abs(ggplotdtup$enrichmentScore)),]
ggplotdtdown=ggplotdt[ggplotdt$enrichmentScore<0,]
ggplotdtdownsub=ggplotdtdown[order(ggplotdtdown$enrichmentScore),]

ggplotdt=rbind.data.frame(ggplotdtupsub,ggplotdtdownsub)

gg=ggplot(ggplotdt,aes(x = reorder(Description, enrichmentScore),y=enrichmentScore))+geom_point(aes(color=pvalue,size=pvalue))+coord_flip()+theme_bw()+ggtitle("DCM-no dm vs donor pathway enrichment analysis")+
  scale_colour_gradient(high = "blue", low = "red")+theme(text = element_text(size = 20))+scale_size_continuous(trans = "reverse")+guides(size = guide_legend(reverse=TRUE))

gg

#ggsave("s5f.pdf",plot = gg,height = 10,width = 10)

new GSEA analysis for supp figureS5

Yunwei Zhang

“Initiated on 20220916, compiled on 2022 Oct 10”

1 DCM-DM VS Donor

1.1 DE analysis and standard pipeline

1.2 check list for ecm & scm gene groups

1.3 continue for DE analysis and standard pipeline

1.4 Part(1)

1.4.1 Part(1) GO+mito

1.4.2 Part(1) KEGG+mito

1.4.3 KEGG+mito relax threshold for the figure

2 DCM-NO DM VS Donor

2.1 DE analysis and standard pipeline

2.2 check list for ecm & scm gene groups

2.3 continue for DE analysis and standard pipeline

2.4 Part(1)

2.4.1 Part(1) GO+mito

2.4.2 Part(1) KEGG+mito

2.4.3 Part(1) KEGG+mito threshold relax for our figure

3 Figure3A

4 Figure3C

5 Figure3D

6 Figure3e

7 Figure3f