#library(GEOquery)  ## go to https://github.com/seandavi/GEOquery for installation details
#library(R.utils)
library(reshape2)
library(ggplot2)
library(limma)
library(dplyr)
library(tidyverse)
#library(DMwR)
library(readxl)
library(MultiAssayExperiment)
library(S4Vectors)
library(SummarizedExperiment)
library(DT)
library(randomForest)
library(e1071)
library("bnstruct")
#library(rminer)
library(ggpubr)
setwd("~/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/clean_up_folder/analysis")
# install.packages("VANData_1.0.0.tgz", repos=NULL,type="source")
# install.packages("VAN_1.0.0.tgz", repos=NULL,type="source") 

library(R.utils)
library(reshape2)
library(ggplot2)
library(limma)
library(dplyr)
library(tidyverse)
#library(DMwR)
library(readxl)
library(magrittr)
library(tidyverse)
library(gage)
library(knitr)
library(DT)
library(janitor)
#library(KEGGREST)
library(MASS)
library("AnnotationDbi")
library("org.Hs.eg.db")
library(dplyr)
library(reshape)
library(ggplot2)
library(plyr)
library(gtable)
library(grid)
library(pheatmap)
library(reshape2)
library(plotly)
library(UpSetR)
library(MatchIt)
require(pathview)
#library(STRINGdb)
## Kegg sets data
require(gage)
require(gageData)
library(clusterProfiler)
library(enrichplot)
kegg = kegg.gsets(species = "hsa", id.type = "kegg")
kegg.sets.hsa = kegg$kg.sets
library(ggrepel)

this file is to runa anlysis and get volcano plots using rna seq data
updated 0928 after the meeting with the decision to go with max abundence except GTF2H2
updated 1116 for the matched pariwise correlation and with three different groups
updated 1205 to output the main figure6 and supp plot

#some notes for pathway analysis
#'count' is the number of genes that belong to a given gene-set, while 'setSize' is the total number of genes in the gene-set.

#NES = enrichment score normalized to mean enrichment of random samples of the same size,The method employs random sampling of gene sets of the same size as the gene set being tested to assess significance and for normalization. The number of samplings is specified as a parameter.

#

# Let is suppose I have a collection of genesets called : HALLMARK Now let is suppose there is a specific geneset there called: E2F_targets
# 
# BgRatio, M/N.
# 
# M = size of the geneset (eg size of the E2F_targets); (is the number of genes within that distribution that are annotated (either directly or indirectly) to the node of interest).
# 
# N = size of all of the unique genes in the collection of genesets (example the HALLMARK collection); (is the total number of genes in the background distribution (universe)
# 
# GeneRatio is k/n.
# 
# k = size of the overlap of 'a vector of gene id' you input with the specific geneset (eg E2F_targets), only unique genes; (the number of genes within that list n, which are annotated to the node.
# 
# n = size of the overlap of 'a vector of gene id' you input with all the members of the collection of genesets (eg the HALLMARK collection),only unique genes; is the size of the list of genes of interest

#adj pval and qval
#https://support.bioconductor.org/p/96329/
#adj pval is the fdr adjusted pval, qval from another package is another adjustment method
#the BH approach is slightly more conservative than qvalue

rnaseq_dt=read_excel("/Users//MQ10004787/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/clean_up_folder/analysis/BH 2022 RNAseq data_3.033 corrected_0s deleted_IHD-DM paper.xlsx",col_names=FALSE)


rnaseq_dt=as.data.frame(rnaseq_dt)
infodt=rnaseq_dt[1:7,]
expressiondt=rnaseq_dt[8:dim(rnaseq_dt)[1],2:dim(rnaseq_dt)[2]]
#as.character(as.vector(rnaseq_dt[8:dim(rnaseq_dt)[1],1]))[which(duplicated(as.character(as.vector(rnaseq_dt[8:dim(rnaseq_dt)[1],1])))==TRUE)]
expressiondt=apply(expressiondt, 2, as.numeric)
abundence=rowSums(expressiondt,na.rm = TRUE)
duplicated_names=as.character(as.vector(rnaseq_dt[8:dim(rnaseq_dt)[1],1]))[which(duplicated(as.character(as.vector(rnaseq_dt[8:dim(rnaseq_dt)[1],1])))==TRUE)]
name_vec=as.character(as.vector(rnaseq_dt[8:dim(rnaseq_dt)[1],1]))

subdt=as.data.frame(expressiondt)
subdt[subdt$name=="GTF2H2",]#need to use the one with a small abundence

##  [1] ...2  ...3  ...4  ...5  ...6  ...7  ...8  ...9  ...10 ...11 ...12 ...13
## [13] ...14 ...15 ...16 ...17 ...18 ...19 ...20 ...21 ...22
## <0 rows> (or 0-length row.names)

GTF2H2_vec=subdt[7283,1:21]
subdt$abundence=abundence
sum(is.na(abundence))

## [1] 0

subdt$name=name_vec
any(is.na(name_vec))

## [1] FALSE

dim(subdt)

## [1] 23883    23

subdt2=subdt%>%group_by(name)%>%slice(which.max(abundence))
dim(subdt2)

## [1] 23787    23

#get the expressiondt
expressiondt=as.data.frame(subdt2[,1:21])
rownames(expressiondt)=subdt2$name
expressiondt[rownames(expressiondt)=="GTF2H2",]=GTF2H2_vec
colnames(expressiondt)=paste(paste(paste(paste(rnaseq_dt[1,2:dim(rnaseq_dt)[2]],rnaseq_dt[2,2:dim(rnaseq_dt)[2]],sep = "_"),rnaseq_dt[3,2:dim(rnaseq_dt)[2]],sep = "_"),rnaseq_dt[4,2:dim(rnaseq_dt)[2]],sep = "_"),rnaseq_dt[5,2:dim(rnaseq_dt)[2]],sep = "_")

#log2 transform
expressiondt=as.data.frame(apply(expressiondt, 2, log2))
rownames(expressiondt)=subdt2$name
dim(expressiondt)

## [1] 23787    21

rnaseq_dt=read_excel("/Users/yzha0247/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/clean_up_folder/analysis/BH 2022 RNAseq data_3.033 corrected_0s deleted_IHD-DM paper.xlsx",col_names=FALSE,sheet = 2)


rnaseq_dt=as.data.frame(rnaseq_dt)
dim(rnaseq_dt)
rnaseq_dt=rnaseq_dt[,-1]
dim(rnaseq_dt)
infodt=rnaseq_dt[1:7,]
expressiondt=rnaseq_dt[8:dim(rnaseq_dt)[1],2:dim(rnaseq_dt)[2]]
which(duplicated(as.character(as.vector(rnaseq_dt[8:dim(rnaseq_dt)[1],1])))==TRUE)
rownames(expressiondt)=make.names(as.character(as.vector(rnaseq_dt[8:dim(rnaseq_dt)[1],1])),unique = TRUE)
colnames(expressiondt)=paste(paste(paste(paste(rnaseq_dt[1,2:dim(rnaseq_dt)[2]],rnaseq_dt[2,2:dim(rnaseq_dt)[2]],sep = "_"),rnaseq_dt[3,2:dim(rnaseq_dt)[2]],sep = "_"),rnaseq_dt[4,2:dim(rnaseq_dt)[2]],sep = "_"),rnaseq_dt[5,2:dim(rnaseq_dt)[2]],sep = "_")
expressiondt=apply(expressiondt, 2, as.numeric)

#log2 transform
expressiondt=apply(expressiondt, 2, log2)
rownames(expressiondt)=make.names(as.character(as.vector(rnaseq_dt[8:dim(rnaseq_dt)[1],1])),unique = TRUE)

1 IHD-DM VS Donor

######################
#get the "lv" "ihd-dm" "ihd-no dm" non-imputed data
experiment_data_noimputation=expressiondt[,union(grep("_IHD_Yes",colnames(expressiondt)),grep("Donor_No",colnames(expressiondt)))]
#colnames(experiment_data_noimputation)
dim(experiment_data_noimputation)

## [1] 23787    14

#filtering out with >25% samples missing
experiment_data_noimputation=experiment_data_noimputation[which(rowMeans(!is.na(experiment_data_noimputation)) > 0.25), ]
dim(experiment_data_noimputation)

## [1] 19212    14

diabetes=grep("Yes",colnames(experiment_data_noimputation))
experiment_data_noimputation1=as.data.frame(t(experiment_data_noimputation))
rownames(experiment_data_noimputation1)=1:dim(experiment_data_noimputation1)[1]
experiment_data_noimputation1$y=ifelse(rownames(experiment_data_noimputation1) %in% diabetes,1,0)


#de analysis-- should be on the non-imputed data

    
    groupname<-as.factor(experiment_data_noimputation1$y)
    design <- model.matrix(~ groupname + 0)
    fit <- lmFit(experiment_data_noimputation, design)
    cont.matrix <- makeContrasts(groupname1-groupname0, levels=design)
    fit2 <- contrasts.fit(fit, cont.matrix)
    fit2 <- eBayes(fit2)
    tT <- topTable(fit2)
    df1 <- topTable(fit2, number=nrow(fit2), genelist=rownames(experiment_data_noimputation))
    df1$name=rownames(df1)
    df1$neglogP_value = -log10(df1$P.Value)
#write.csv(df1,"rna_ihddmvsd.csv")
    # g1=ggplot2::ggplot(df1, aes(logFC,neglogP_value,label=name))+
    #   geom_point(size = 1, alpha=1) +
    #   xlab("log2 fold change") + ylab("-log10 p-value")+ggtitle("IHD-T2DM vs Donor") +
    #   theme_bw(base_size = 16) +
    #   geom_hline(yintercept = 1.3, linetype = "dashed", colour = "brown3")+theme(aspect.ratio = 1)
    
    #add fdr plot
adj_to_p <- function(df) {
  #df <- topTable(fit, coef = comparison, number = nrow(fit))
  if ((df %>%dplyr::filter(adj.P.Val < 0.05) %>% nrow()) == 0){
    r <- 1
  } else {
    r <- df%>%dplyr::mutate(rn = row_number()) %>% dplyr::filter(adj.P.Val <= 0.05) %>% dplyr::pull(rn)%>%max() + 1
  }
  n <- nrow(df) + 1
  lp10 <- -log10(r / n * 0.05)
  lp10
} 

#add samples
colnames(experiment_data_noimputation)

##  [1] "LV_IHD_Yes_M_51"   "LV_IHD_Yes_M_53"   "LV_IHD_Yes_M_55"  
##  [4] "LV_IHD_Yes_M_56"   "LV_IHD_Yes_M_56.1" "LV_IHD_Yes_M_59"  
##  [7] "LV_IHD_Yes_M_65"   "LV_Donor_No_M_54"  "LV_Donor_No_F_55" 
## [10] "LV_Donor_No_M_56"  "LV_Donor_No_M_60"  "LV_Donor_No_M_62" 
## [13] "LV_Donor_No_F_62"  "LV_Donor_No_F_65"

#add fdr plot
adj_to_p(df1)

## [1] 5.584625

highlight_df=df1
highlight_df$colorcode=as.character(ifelse(highlight_df$neglogP_value>adj_to_p(df1)&highlight_df$logFC>0,1,ifelse(highlight_df$neglogP_value>adj_to_p(df1)&highlight_df$logFC<0,-1,0)))
dim(highlight_df)

## [1] 19212    10

highlight_df=na.omit(highlight_df)
dim(highlight_df)

## [1] 19167    10

g1=ggplot2::ggplot(highlight_df, aes(logFC,neglogP_value,label=name))+
      geom_point(aes(x=logFC,y=neglogP_value,color=colorcode)) +
      xlab("log2 fold change") + ylab("-log10 p-value")+ggtitle("IHD-T2DM vs Donor") +
      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),aspect.ratio = 1) +
      geom_hline(yintercept = adj_to_p(df1), linetype = "dashed", colour = "red")+scale_color_manual(values = c("blue","gray","red"))
ggplotly(g1)

for(i in c(2:7,9)){
  df1[,i]=round(df1[,i],4)
}
datatable(df1)

dim(df1[df1$P.Value<0.05,])

## [1] 1541    9

g1=ggplot(highlight_df, aes(logFC,neglogP_value,label=name))+
      geom_point() +
      xlab("log2 fold change") + ylab("-log10 p-value")+ggtitle("IHD-T2DM vs Donor") +
      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),aspect.ratio = 1)+geom_text_repel(aes(x=logFC,y=neglogP_value,label=name),subset(highlight_df,(logFC>4|-logFC>4|neglogP_value>4)))
#ggsave(filename="plot1.pdf",g1,height = 10,width = 8)
g1

set1=df1[which(df1$P.Value<0.05),"name"]
set1_relaxed=df1[which(abs(df1$logFC)>1),"name"]
set1_restricted=df1[which(df1$adj.P.Val<0.05),"name"]

rna_dmvsd=df1

2 IHD-no DM VS Donor

######################
#get the "lv" "ihd-dm" "ihd-no dm" non-imputed data
experiment_data_noimputation=expressiondt[,union(grep("_IHD_No",colnames(expressiondt)),grep("Donor_No",colnames(expressiondt)))]
#colnames(experiment_data_noimputation)
dim(experiment_data_noimputation)

## [1] 23787    14

#filtering out with >25% samples missing
experiment_data_noimputation=experiment_data_noimputation[which(rowMeans(!is.na(experiment_data_noimputation)) > 0.25), ]
dim(experiment_data_noimputation)

## [1] 19099    14

diabetes=grep("IHD",colnames(experiment_data_noimputation))
experiment_data_noimputation1=as.data.frame(t(experiment_data_noimputation))
rownames(experiment_data_noimputation1)=1:dim(experiment_data_noimputation1)[1]
experiment_data_noimputation1$y=ifelse(rownames(experiment_data_noimputation1) %in% diabetes,1,0)


#de analysis-- should be on the non-imputed data

    
    groupname<-as.factor(experiment_data_noimputation1$y)
    design <- model.matrix(~ groupname + 0)
    fit <- lmFit(experiment_data_noimputation, design)
    cont.matrix <- makeContrasts(groupname1-groupname0, levels=design)
    fit2 <- contrasts.fit(fit, cont.matrix)
    fit2 <- eBayes(fit2)
    tT <- topTable(fit2)
    df1 <- topTable(fit2, number=nrow(fit2), genelist=rownames(experiment_data_noimputation))
    df1$name=rownames(df1)
    df1$neglogP_value = -log10(df1$P.Value)
#write.csv(df1,"rna_ihdnodmvsd.csv")
    # g1=ggplot2::ggplot(df1, aes(logFC,neglogP_value,label=name))+
    #   geom_point(size = 1, alpha=1) +
    #   xlab("log2 fold change") + ylab("-log10 p-value")+ggtitle("IHD-T2DM vs Donor") +
    #   theme_bw(base_size = 16) +
    #   geom_hline(yintercept = 1.3, linetype = "dashed", colour = "brown3")+theme(aspect.ratio = 1)
    
    #add fdr plot
adj_to_p <- function(df) {
  #df <- topTable(fit, coef = comparison, number = nrow(fit))
  if ((df %>%dplyr::filter(adj.P.Val < 0.05) %>% nrow()) == 0){
    r <- 1
  } else {
    r <- df%>%dplyr::mutate(rn = row_number()) %>% dplyr::filter(adj.P.Val <= 0.05) %>% dplyr::pull(rn)%>%max() + 1
  }
  n <- nrow(df) + 1
  lp10 <- -log10(r / n * 0.05)
  lp10
} 

#add samples
colnames(experiment_data_noimputation)

##  [1] "LV_IHD_No_M_50"   "LV_IHD_No_M_54"   "LV_IHD_No_M_54.1" "LV_IHD_No_F_62"  
##  [5] "LV_IHD_No_M_62"   "LV_IHD_No_M_62.1" "LV_IHD_No_M_66"   "LV_Donor_No_M_54"
##  [9] "LV_Donor_No_F_55" "LV_Donor_No_M_56" "LV_Donor_No_M_60" "LV_Donor_No_M_62"
## [13] "LV_Donor_No_F_62" "LV_Donor_No_F_65"

#add fdr plot
adj_to_p(df1)

## [1] 5.582063

highlight_df=df1
highlight_df$colorcode=as.character(ifelse(highlight_df$neglogP_value>adj_to_p(df1)&highlight_df$logFC>0,1,ifelse(highlight_df$neglogP_value>adj_to_p(df1)&highlight_df$logFC<0,-1,0)))
dim(highlight_df)

## [1] 19099    10

highlight_df=na.omit(highlight_df)
dim(highlight_df)

## [1] 19069    10

g1=ggplot2::ggplot(highlight_df, aes(logFC,neglogP_value,label=name))+
      geom_point(aes(x=logFC,y=neglogP_value,color=colorcode)) +
      xlab("log2 fold change") + ylab("-log10 p-value")+ggtitle("IHD-no T2DM vs Donor") +
      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),aspect.ratio = 1) +
      geom_hline(yintercept = adj_to_p(df1), linetype = "dashed", colour = "red")+scale_color_manual(values = c("blue","gray","red"))
ggplotly(g1)

for(i in c(2:7,9)){
  df1[,i]=round(df1[,i],4)
}
datatable(df1)

dim(df1[df1$P.Value<0.05,])

## [1] 691   9

g1=ggplot(highlight_df, aes(logFC,neglogP_value,label=name))+
      geom_point() +
      xlab("log2 fold change") + ylab("-log10 p-value")+ggtitle("IHD-T2DM vs Donor") +
      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),aspect.ratio = 1)+geom_text_repel(aes(x=logFC,y=neglogP_value,label=name),subset(highlight_df,(logFC>2.5|-logFC>2.5|neglogP_value>3.5)))
#ggsave(filename="plot1.pdf",g1,height = 10,width = 8)
g1

set2=df1[which(df1$P.Value<0.05),"name"]
set2_relaxed=df1[which(abs(df1$logFC)>1),"name"]
set2_restricted=df1[which(df1$adj.P.Val<0.05),"name"]

rna_nodmvsd=df1

library(RColorBrewer)
myCol <- brewer.pal(3, "Pastel2")[c(1, 2)]
library(VennDiagram)
# Chart
v1=venn.diagram(
        x = list(set1, set2),
        category.names = c("IHD-T2DM" , "IHD-NO T2DM" ),
        cat.cex = 2,
        cat.default.pos = "text",
        filename = NULL,
        output=TRUE,
        
        # Output features
        height = 14 , 
        width = 15 , 
        resolution = 300,
        compression = "lzw",
        
        # Circles
        lwd = 2,
        lty = 'blank',
        fill = myCol,
        
        # Numbers
        cex = 2,
        fontface = "bold",
        fontfamily = "sans"
        
    
)
display_venn <- function(x, ...){
  library(VennDiagram)
  grid.newpage()
  venn_object <- venn.diagram(x, filename = NULL, ...)
  grid.draw(venn_object)
}
display_venn(x = list(set1, set2),
              category.names = c("IHD-T2DM" , "IHD-NO T2DM" ),fill = myCol,lwd = 0,
        lty = 'blank')

# pdf(file="protein_venn.pdf")
# grid.draw(v1)
# dev.off()

set_protein1=readRDS("~/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/clean_up_folder/analysis/set1.rds")
set_protein1_restricted=readRDS("~/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/clean_up_folder/analysis/set1_restricted.rds")
# set_protein2=readRDS("~/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/clean_up_folder/analysis/set4.rds")
# set_protein2_restricted=readRDS("~/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/clean_up_folder/analysis/set4_restricted.rds")


v1=venn.diagram(
        x = list(set1, set_protein1_restricted),
        category.names = c("rna" , "protein" ),
        cat.cex = 2,
        cat.default.pos = "text",
        filename = NULL,
        output=TRUE,
        
        # Output features
        height = 14 , 
        width = 15 , 
        resolution = 300,
        compression = "lzw",
        
        # Circles
        lwd = 2,
        lty = 'blank',
        fill = myCol,
        
        # Numbers
        cex = 2,
        fontface = "bold",
        fontfamily = "sans"
        
    
)
display_venn <- function(x, ...){
  library(VennDiagram)
  grid.newpage()
  venn_object <- venn.diagram(x, filename = NULL, ...)
  grid.draw(venn_object)
}
display_venn(x = list(set1, set_protein1_restricted),
              category.names = c("rna" , "protein" ),fill = myCol,lwd = 0,
        lty = 'blank')

# v1=venn.diagram(
#         x = list(set2,set_protein2_restricted),
#         category.names = c("rna" , "protein" ),
#         cat.cex = 2,
#         cat.default.pos = "text",
#         filename = NULL,
#         output=TRUE,
#         
#         # Output features
#         height = 14 , 
#         width = 15 , 
#         resolution = 300,
#         compression = "lzw",
#         
#         # Circles
#         lwd = 2,
#         lty = 'blank',
#         fill = myCol,
#         
#         # Numbers
#         cex = 2,
#         fontface = "bold",
#         fontfamily = "sans"
#         
#     
# )
# display_venn <- function(x, ...){
#   library(VennDiagram)
#   grid.newpage()
#   venn_object <- venn.diagram(x, filename = NULL, ...)
#   grid.draw(venn_object)
# }
# display_venn(x = list(set2,set_protein2_restricted),
#               category.names = c("rna" , "protein" ),fill = myCol,lwd = 0,
#         lty = 'blank')

3 comparison rna-seq vs proteins

get the protein data

load("all_omics_lv_v6.RData")
protein_expression=assay(all_omics@ExperimentList$protein, "log2")
protein_expression1=as.data.frame(protein_expression)

dim(protein_expression1)

## [1] 9037  109

#changed 20220224: no overall filtering
protein_expression2=protein_expression1
dim(protein_expression2)

## [1] 9037  109

dim(protein_expression2[which(rowMeans(!is.na(protein_expression2)) > 0.25), ])

## [1] 4077  109

protein_expression3=protein_expression2[,c(union(grep("_IHD_Yes",colnames(protein_expression2)),grep("IHD_No",colnames(protein_expression2))),grep("Donor_No",colnames(protein_expression2)))]
protein_expression3=protein_expression3[,-c(32,34,36,39,40,41,43,46,48,49,50,54)]
colnames(protein_expression3)

##  [1] "2_LV_IHD_Yes_45_M"    "3_LV_IHD_Yes_46_F"    "4_LV_IHD_Yes_49_M"   
##  [4] "6_LV_IHD_Yes_51_M"    "5_LV_IHD_Yes_50_M"    "7_LV_IHD_Yes_51_M"   
##  [7] "14_LV_IHD_Yes_65_M"   "9_LV_IHD_Yes_55_M"    "12_LV_IHD_Yes_59_M"  
## [10] "13_LV_IHD_Yes_59_M"   "10_LV_IHD_Yes_56_M"   "11_LV_IHD_Yes_56_M"  
## [13] "8_LV_IHD_Yes_53_M"    "1_LV_IHD_Yes_41_M"    "17_LV_IHD_No_43_F"   
## [16] "26_LV_IHD_No_54_F"    "27_LV_IHD_No_62_F"    "24_LV_IHD_No_54_M"   
## [19] "16_LV_IHD_No_42_F"    "22_LV_IHD_No_50_M"    "21_LV_IHD_No_49_F"   
## [22] "19_LV_IHD_No_47_F"    "20_LV_IHD_No_48_F"    "23_LV_IHD_No_50_M"   
## [25] "28_LV_IHD_No_62_M"    "15_LV_IHD_No_41_M"    "29_LV_IHD_No_62_M"   
## [28] "18_LV_IHD_No_45_M"    "30_LV_IHD_No_66_M"    "25_LV_IHD_No_54_M"   
## [31] "110_LV_Donor_No_65_F" "101_LV_Donor_No_54_M" "94_LV_Donor_No_47_M" 
## [34] "98_LV_Donor_No_51_F"  "105_LV_Donor_No_56_M" "100_LV_Donor_No_53_F"
## [37] "109_LV_Donor_No_65_M" "99_LV_Donor_No_53_M"  "97_LV_Donor_No_49_F" 
## [40] "108_LV_Donor_No_62_F" "102_LV_Donor_No_54_F" "96_LV_Donor_No_48_M" 
## [43] "106_LV_Donor_No_60_M" "95_LV_Donor_No_47_F"  "107_LV_Donor_No_62_M"
## [46] "104_LV_Donor_No_55_F"

get the rna data

rnaseq_dt=read_excel("~/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/clean_up_folder/analysis/BH 2022 RNAseq data_3.033 corrected_0s deleted_IHD-DM paper.xlsx",col_names=FALSE)


rnaseq_dt=as.data.frame(rnaseq_dt)
infodt=rnaseq_dt[1:7,]
expressiondt=rnaseq_dt[8:dim(rnaseq_dt)[1],2:dim(rnaseq_dt)[2]]
#as.character(as.vector(rnaseq_dt[8:dim(rnaseq_dt)[1],1]))[which(duplicated(as.character(as.vector(rnaseq_dt[8:dim(rnaseq_dt)[1],1])))==TRUE)]
expressiondt=apply(expressiondt, 2, as.numeric)
abundence=rowSums(expressiondt,na.rm = TRUE)
duplicated_names=as.character(as.vector(rnaseq_dt[8:dim(rnaseq_dt)[1],1]))[which(duplicated(as.character(as.vector(rnaseq_dt[8:dim(rnaseq_dt)[1],1])))==TRUE)]
name_vec=as.character(as.vector(rnaseq_dt[8:dim(rnaseq_dt)[1],1]))

subdt=as.data.frame(expressiondt)
subdt[subdt$name=="GTF2H2",]#need to use the one with a small abundence

##  [1] ...2  ...3  ...4  ...5  ...6  ...7  ...8  ...9  ...10 ...11 ...12 ...13
## [13] ...14 ...15 ...16 ...17 ...18 ...19 ...20 ...21 ...22
## <0 rows> (or 0-length row.names)

GTF2H2_vec=subdt[7283,1:21]
subdt$abundence=abundence
sum(is.na(abundence))

## [1] 0

subdt$name=name_vec
any(is.na(name_vec))

## [1] FALSE

dim(subdt)

## [1] 23883    23

subdt2=subdt%>%group_by(name)%>%slice(which.max(abundence))
dim(subdt2)

## [1] 23787    23

#get the expressiondt
expressiondt=as.data.frame(subdt2[,1:21])
rownames(expressiondt)=subdt2$name
expressiondt[rownames(expressiondt)=="GTF2H2",]=GTF2H2_vec
colnames(expressiondt)=paste(paste(paste(paste(paste(rnaseq_dt[7,2:dim(rnaseq_dt)[2]],rnaseq_dt[1,2:dim(rnaseq_dt)[2]],sep="_"),rnaseq_dt[2,2:dim(rnaseq_dt)[2]],sep = "_"),rnaseq_dt[3,2:dim(rnaseq_dt)[2]],sep = "_"),rnaseq_dt[5,2:dim(rnaseq_dt)[2]],sep = "_"),rnaseq_dt[4,2:dim(rnaseq_dt)[2]],sep = "_")
#log2 transform
expressiondt=as.data.frame(apply(expressiondt, 2, log2))
rownames(expressiondt)=subdt2$name
dim(expressiondt)

## [1] 23787    21

rna_expression=expressiondt

get all fold changes

expressiondt=rna_expression
######################
#get the "lv" "ihd-dm" "ihd-no dm" non-imputed data
experiment_data_noimputation=expressiondt[,union(grep("_IHD_Yes",colnames(expressiondt)),grep("IHD_No",colnames(expressiondt)))]
#colnames(experiment_data_noimputation)
dim(experiment_data_noimputation)

## [1] 23787    14

#filtering out with >25% samples missing
experiment_data_noimputation=experiment_data_noimputation[which(rowMeans(!is.na(experiment_data_noimputation)) > 0.25), ]
dim(experiment_data_noimputation)

## [1] 19283    14

diabetes=grep("Yes",colnames(experiment_data_noimputation))
experiment_data_noimputation1=as.data.frame(t(experiment_data_noimputation))
rownames(experiment_data_noimputation1)=1:dim(experiment_data_noimputation1)[1]
experiment_data_noimputation1$y=ifelse(rownames(experiment_data_noimputation1) %in% diabetes,1,0)


#de analysis-- should be on the non-imputed data

    
    groupname<-as.factor(experiment_data_noimputation1$y)
    design <- model.matrix(~ groupname + 0)
    fit <- lmFit(experiment_data_noimputation, design)
    cont.matrix <- makeContrasts(groupname1-groupname0, levels=design)
    fit2 <- contrasts.fit(fit, cont.matrix)
    fit2 <- eBayes(fit2)
    tT <- topTable(fit2)
    df1 <- topTable(fit2, number=nrow(fit2), genelist=rownames(experiment_data_noimputation))
    df1$name=rownames(df1)
    df1$neglogP_value = -log10(df1$P.Value)
#write.csv(df1,"rna_ihddm vs nodm.csv")
    
    # g1=ggplot2::ggplot(df1, aes(logFC,neglogP_value,label=name))+
    #   geom_point(size = 1, alpha=1) +
    #   xlab("log2 fold change") + ylab("-log10 p-value")+ggtitle("IHD-T2DM vs Donor") +
    #   theme_bw(base_size = 16) +
    #   geom_hline(yintercept = 1.3, linetype = "dashed", colour = "brown3")+theme(aspect.ratio = 1)
    
    #add fdr plot
adj_to_p <- function(df) {
  #df <- topTable(fit, coef = comparison, number = nrow(fit))
  if ((df %>%dplyr::filter(adj.P.Val < 0.05) %>% nrow()) == 0){
    r <- 1
  } else {
    r <- df%>%dplyr::mutate(rn = row_number()) %>% dplyr::filter(adj.P.Val <= 0.05) %>% dplyr::pull(rn)%>%max() + 1
  }
  n <- nrow(df) + 1
  lp10 <- -log10(r / n * 0.05)
  lp10
} 

#add samples
colnames(experiment_data_noimputation)

##  [1] "7_LV_IHD_Yes_51_M"  "8_LV_IHD_Yes_53_M"  "9_LV_IHD_Yes_55_M" 
##  [4] "10_LV_IHD_Yes_56_M" "11_LV_IHD_Yes_56_M" "13_LV_IHD_Yes_59_M"
##  [7] "14_LV_IHD_Yes_65_M" "22_LV_IHD_No_50_M"  "24_LV_IHD_No_54_M" 
## [10] "25_LV_IHD_No_54_M"  "27_LV_IHD_No_62_F"  "28_LV_IHD_No_62_M" 
## [13] "29_LV_IHD_No_62_M"  "30_LV_IHD_No_66_M"

#add fdr plot
adj_to_p(df1)

## [1] 5.285197

highlight_df=df1
highlight_df$colorcode=as.character(ifelse(highlight_df$neglogP_value>adj_to_p(df1)&highlight_df$logFC>0,1,ifelse(highlight_df$neglogP_value>adj_to_p(df1)&highlight_df$logFC<0,-1,0)))
dim(highlight_df)

## [1] 19283    10

highlight_df=na.omit(highlight_df)
dim(highlight_df)

## [1] 19255    10

g1=ggplot2::ggplot(highlight_df, aes(logFC,neglogP_value,label=name))+
      geom_point(aes(x=logFC,y=neglogP_value,color=colorcode)) +
      xlab("log2 fold change") + ylab("-log10 p-value")+ggtitle("IHD-T2DM vs Donor") +
      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),aspect.ratio = 1) +
      geom_hline(yintercept = adj_to_p(df1), linetype = "dashed", colour = "red")+scale_color_manual(values = c("blue","gray","red"))
ggplotly(g1)

for(i in c(2:7,9)){
  df1[,i]=round(df1[,i],4)
}
datatable(df1)

dim(df1[df1$P.Value<0.05,])

## [1] 1907    9

g1=ggplot(highlight_df, aes(logFC,neglogP_value,label=name))+
      geom_point() +
      xlab("log2 fold change") + ylab("-log10 p-value")+ggtitle("IHD-T2DM vs IHD-NOT2DM") +
      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),aspect.ratio = 1)+geom_text_repel(aes(x=logFC,y=neglogP_value,label=name),subset(highlight_df,(logFC>4|-logFC>4|neglogP_value>4)))
#ggsave(filename="plot1.pdf",g1,height = 10,width = 8)
g1

rna_dmvsnodm=df1

expressiondt=protein_expression3
experiment_data_noimputation=expressiondt[,union(grep("_IHD_Yes",colnames(expressiondt)),grep("Donor_No",colnames(expressiondt)))]
#colnames(experiment_data_noimputation)
dim(experiment_data_noimputation)

## [1] 9037   30

#filtering out with >25% samples missing
experiment_data_noimputation=experiment_data_noimputation[which(rowMeans(!is.na(experiment_data_noimputation)) > 0.25), ]
dim(experiment_data_noimputation)

## [1] 4120   30

colnames(experiment_data_noimputation)

##  [1] "2_LV_IHD_Yes_45_M"    "3_LV_IHD_Yes_46_F"    "4_LV_IHD_Yes_49_M"   
##  [4] "6_LV_IHD_Yes_51_M"    "5_LV_IHD_Yes_50_M"    "7_LV_IHD_Yes_51_M"   
##  [7] "14_LV_IHD_Yes_65_M"   "9_LV_IHD_Yes_55_M"    "12_LV_IHD_Yes_59_M"  
## [10] "13_LV_IHD_Yes_59_M"   "10_LV_IHD_Yes_56_M"   "11_LV_IHD_Yes_56_M"  
## [13] "8_LV_IHD_Yes_53_M"    "1_LV_IHD_Yes_41_M"    "110_LV_Donor_No_65_F"
## [16] "101_LV_Donor_No_54_M" "94_LV_Donor_No_47_M"  "98_LV_Donor_No_51_F" 
## [19] "105_LV_Donor_No_56_M" "100_LV_Donor_No_53_F" "109_LV_Donor_No_65_M"
## [22] "99_LV_Donor_No_53_M"  "97_LV_Donor_No_49_F"  "108_LV_Donor_No_62_F"
## [25] "102_LV_Donor_No_54_F" "96_LV_Donor_No_48_M"  "106_LV_Donor_No_60_M"
## [28] "95_LV_Donor_No_47_F"  "107_LV_Donor_No_62_M" "104_LV_Donor_No_55_F"

diabetes=grep("Yes",colnames(experiment_data_noimputation))
experiment_data_noimputation1=as.data.frame(t(experiment_data_noimputation))
rownames(experiment_data_noimputation1)=1:dim(experiment_data_noimputation1)[1]
experiment_data_noimputation1$y=ifelse(rownames(experiment_data_noimputation1) %in% diabetes,1,0)


#de analysis-- should be on the non-imputed data

    
    groupname<-as.factor(experiment_data_noimputation1$y)
    design <- model.matrix(~ groupname + 0)
    fit <- lmFit(experiment_data_noimputation, design)
    cont.matrix <- makeContrasts(groupname1-groupname0, levels=design)
    fit2 <- contrasts.fit(fit, cont.matrix)
    fit2 <- eBayes(fit2)
    tT <- topTable(fit2)
    df1 <- topTable(fit2, number=nrow(fit2), genelist=rownames(experiment_data_noimputation))
    df1$name=rownames(df1)
    df1$neglogP_value = -log10(df1$P.Value)

    protein_dmvsd=df1

expressiondt=protein_expression3
experiment_data_noimputation=expressiondt[,union(grep("_IHD_No",colnames(expressiondt)),grep("Donor_No",colnames(expressiondt)))]
#colnames(experiment_data_noimputation)
dim(experiment_data_noimputation)

## [1] 9037   32

#filtering out with >25% samples missing
experiment_data_noimputation=experiment_data_noimputation[which(rowMeans(!is.na(experiment_data_noimputation)) > 0.25), ]
dim(experiment_data_noimputation)

## [1] 4052   32

colnames(experiment_data_noimputation)

##  [1] "17_LV_IHD_No_43_F"    "26_LV_IHD_No_54_F"    "27_LV_IHD_No_62_F"   
##  [4] "24_LV_IHD_No_54_M"    "16_LV_IHD_No_42_F"    "22_LV_IHD_No_50_M"   
##  [7] "21_LV_IHD_No_49_F"    "19_LV_IHD_No_47_F"    "20_LV_IHD_No_48_F"   
## [10] "23_LV_IHD_No_50_M"    "28_LV_IHD_No_62_M"    "15_LV_IHD_No_41_M"   
## [13] "29_LV_IHD_No_62_M"    "18_LV_IHD_No_45_M"    "30_LV_IHD_No_66_M"   
## [16] "25_LV_IHD_No_54_M"    "110_LV_Donor_No_65_F" "101_LV_Donor_No_54_M"
## [19] "94_LV_Donor_No_47_M"  "98_LV_Donor_No_51_F"  "105_LV_Donor_No_56_M"
## [22] "100_LV_Donor_No_53_F" "109_LV_Donor_No_65_M" "99_LV_Donor_No_53_M" 
## [25] "97_LV_Donor_No_49_F"  "108_LV_Donor_No_62_F" "102_LV_Donor_No_54_F"
## [28] "96_LV_Donor_No_48_M"  "106_LV_Donor_No_60_M" "95_LV_Donor_No_47_F" 
## [31] "107_LV_Donor_No_62_M" "104_LV_Donor_No_55_F"

diabetes=grep("IHD",colnames(experiment_data_noimputation))
experiment_data_noimputation1=as.data.frame(t(experiment_data_noimputation))
rownames(experiment_data_noimputation1)=1:dim(experiment_data_noimputation1)[1]
experiment_data_noimputation1$y=ifelse(rownames(experiment_data_noimputation1) %in% diabetes,1,0)


#de analysis-- should be on the non-imputed data

    
    groupname<-as.factor(experiment_data_noimputation1$y)
    design <- model.matrix(~ groupname + 0)
    fit <- lmFit(experiment_data_noimputation, design)
    cont.matrix <- makeContrasts(groupname1-groupname0, levels=design)
    fit2 <- contrasts.fit(fit, cont.matrix)
    fit2 <- eBayes(fit2)
    tT <- topTable(fit2)
    df1 <- topTable(fit2, number=nrow(fit2), genelist=rownames(experiment_data_noimputation))
    df1$name=rownames(df1)
    df1$neglogP_value = -log10(df1$P.Value)

    protein_nodmvsd=df1

expressiondt=protein_expression3
experiment_data_noimputation=expressiondt[,union(grep("_IHD_Yes",colnames(expressiondt)),grep("IHD_No",colnames(expressiondt)))]
#colnames(experiment_data_noimputation)
dim(experiment_data_noimputation)

## [1] 9037   30

#filtering out with >25% samples missing
experiment_data_noimputation=experiment_data_noimputation[which(rowMeans(!is.na(experiment_data_noimputation)) > 0.25), ]
dim(experiment_data_noimputation)

## [1] 4075   30

colnames(experiment_data_noimputation)

##  [1] "2_LV_IHD_Yes_45_M"  "3_LV_IHD_Yes_46_F"  "4_LV_IHD_Yes_49_M" 
##  [4] "6_LV_IHD_Yes_51_M"  "5_LV_IHD_Yes_50_M"  "7_LV_IHD_Yes_51_M" 
##  [7] "14_LV_IHD_Yes_65_M" "9_LV_IHD_Yes_55_M"  "12_LV_IHD_Yes_59_M"
## [10] "13_LV_IHD_Yes_59_M" "10_LV_IHD_Yes_56_M" "11_LV_IHD_Yes_56_M"
## [13] "8_LV_IHD_Yes_53_M"  "1_LV_IHD_Yes_41_M"  "17_LV_IHD_No_43_F" 
## [16] "26_LV_IHD_No_54_F"  "27_LV_IHD_No_62_F"  "24_LV_IHD_No_54_M" 
## [19] "16_LV_IHD_No_42_F"  "22_LV_IHD_No_50_M"  "21_LV_IHD_No_49_F" 
## [22] "19_LV_IHD_No_47_F"  "20_LV_IHD_No_48_F"  "23_LV_IHD_No_50_M" 
## [25] "28_LV_IHD_No_62_M"  "15_LV_IHD_No_41_M"  "29_LV_IHD_No_62_M" 
## [28] "18_LV_IHD_No_45_M"  "30_LV_IHD_No_66_M"  "25_LV_IHD_No_54_M"

diabetes=grep("Yes",colnames(experiment_data_noimputation))
experiment_data_noimputation1=as.data.frame(t(experiment_data_noimputation))
rownames(experiment_data_noimputation1)=1:dim(experiment_data_noimputation1)[1]
experiment_data_noimputation1$y=ifelse(rownames(experiment_data_noimputation1) %in% diabetes,1,0)


#de analysis-- should be on the non-imputed data

    
    groupname<-as.factor(experiment_data_noimputation1$y)
    design <- model.matrix(~ groupname + 0)
    fit <- lmFit(experiment_data_noimputation, design)
    cont.matrix <- makeContrasts(groupname1-groupname0, levels=design)
    fit2 <- contrasts.fit(fit, cont.matrix)
    fit2 <- eBayes(fit2)
    tT <- topTable(fit2)
    df1 <- topTable(fit2, number=nrow(fit2), genelist=rownames(experiment_data_noimputation))
    df1$name=rownames(df1)
    df1$neglogP_value = -log10(df1$P.Value)

    protein_dmvnodm=df1

merge the data on genes

protein_dmvsd=protein_dmvsd[,c(1,2,3,4,5)]
protein_nodmvsd=protein_nodmvsd[,c(1,2,3,4,5)]
protein_dmvnodm=protein_dmvnodm[,c(1,2,3,4,5)]
rna_dmvsd=rna_dmvsd[,c(1,2,3,4,5)]
rna_dmvsnodm=rna_dmvsnodm[,c(1,2,3,4,5)]
rna_nodmvsd=rna_nodmvsd[,c(1,2,3,4,5)]

dmvsd=merge(protein_dmvsd,rna_dmvsd,by.x="ID",by.y="ID")
nodmvsd=merge(protein_nodmvsd,rna_nodmvsd,by.x="ID",by.y="ID")
dmvsnodm=merge(protein_dmvnodm,rna_dmvsnodm,by.x="ID",by.y="ID")

the plot1

plotdt=dmvsd[dmvsd$AveExpr.y>=5,]
g1=ggplot(plotdt, aes(x=logFC.x, y=logFC.y)) +
  geom_point( mapping = aes(color=AveExpr.y),size=1)+
  theme(axis.text.x = element_text(angle = 60, hjust = 1),panel.background =element_rect(fill = "white"),
        panel.grid.major = element_line(size = 0.1, linetype = 'solid',colour = "gray"),
        panel.grid.minor = element_line(size = 0.1, linetype = 'solid',
                                        colour = "white"),aspect.ratio = 1)+
  ggtitle("IHD-DM VS Donor")+
  xlab("lfc proteins")+
  ylab("lfc rna-seq")+scale_colour_gradientn(colours=rev(c("#FF0000", "#FFFF00" ,"#00FF00", "#00FFFF","#0000FF")))+geom_abline(slope = 1,intercept = 0)
g1

#ggsave("main_figure6b.pdf",plot=g1)
plotdt=nodmvsd[nodmvsd$AveExpr.y>=5,]
g2=ggplot(plotdt, aes(x=logFC.x, y=logFC.y)) +
  geom_point( mapping = aes(color=AveExpr.y),size=1)+
  theme(axis.text.x = element_text(angle = 60, hjust = 1),panel.background =element_rect(fill = "white"),
        panel.grid.major = element_line(size = 0.1, linetype = 'solid',colour = "gray"),
        panel.grid.minor = element_line(size = 0.1, linetype = 'solid',
                                        colour = "white"),aspect.ratio = 1)+
  ggtitle("IHD-no DM VS Donor")+
  xlab("lfc proteins")+
  ylab("lfc rna-seq")+scale_colour_gradientn(colours=rev(c("#FF0000", "#FFFF00" ,"#00FF00", "#00FFFF","#0000FF")))+geom_abline(slope = 1,intercept = 0)
g2

#ggsave("main_figure6a.pdf",plot=g2)

plotdt=dmvsnodm[dmvsnodm$AveExpr.y>=5,]
g3=ggplot(plotdt, aes(x=logFC.x, y=logFC.y)) +
  geom_point( mapping = aes(color=AveExpr.y),size=1)+
  theme(axis.text.x = element_text(angle = 60, hjust = 1),panel.background =element_rect(fill = "white"),
        panel.grid.major = element_line(size = 0.1, linetype = 'solid',colour = "gray"),
        panel.grid.minor = element_line(size = 0.1, linetype = 'solid',
                                        colour = "white"),aspect.ratio = 1)+
  ggtitle("IHD-DM VS IHD-no DM")+
  xlab("lfc proteins")+
  ylab("lfc rna-seq")+scale_colour_gradientn(colours=rev(c("#FF0000", "#FFFF00" ,"#00FF00", "#00FFFF","#0000FF")))+geom_abline(slope = 1,intercept = 0)
g3

#ggsave("supp_figure_ihdyesno.pdf",plot=g3)
max(dmvsnodm$AveExpr.y)

## [1] 21.82

calculate the correlation for each gene

protein_expression4=protein_expression3[rownames(protein_expression3)%in%dmvsd$ID,]
rna_expression2=rna_expression[rownames(rna_expression)%in%dmvsd$ID,]
dim(protein_expression4)
dim(rna_expression2)

dim(complete.cases(rna_expression2))
protein_expression4$ID=rownames(protein_expression4)
rna_expression2$ID=rownames(rna_expression2)

mergedt=merge(protein_expression4,rna_expression2,by.x="ID",by.y="ID")
dim(mergedt)
colnames(mergedt)
mergedt=mergedt[,c(1,grep(".x",colnames(mergedt)),grep(".y",colnames(mergedt)))]
colnames(mergedt)


for(i in 1:nrow(mergedt)){
mergedt[i,44]=cor(as.vector(as.numeric(mergedt[i,2:22])),as.vector(as.numeric(mergedt[i,23:43])),use = "complete.obs")
mergedt[i,45]=cor(as.vector(as.numeric(mergedt[i,2:22])),as.vector(as.numeric(mergedt[i,23:43])),use = "na.or.complete")
mergedt[i,46]=cor(as.vector(as.numeric(mergedt[i,2:22])),as.vector(as.numeric(mergedt[i,23:43])),use = "pairwise.complete.obs")
  }

cordt=mergedt[,c(1,44,45,46)]

the plot2

plotdt=merge(cordt,dmvsd, by.x="ID",by.y="ID")

# ggplot(plotdt, aes(x=V44, y=AveExpr.y)) +
#   geom_point( mapping = aes(color=AveExpr.y),size=1)+
#   theme(axis.text.x = element_text(angle = 60, hjust = 1),panel.background =element_rect(fill = "white"),
#         panel.grid.major = element_line(size = 0.1, linetype = 'solid',colour = "gray"),
#         panel.grid.minor = element_line(size = 0.1, linetype = 'solid',
#                                         colour = "white"),aspect.ratio = 1)+
#   ggtitle("Correlation vs average expression")+
#   xlab("correlation")+
#   ylab("average in rna")+scale_colour_gradientn(colours=rev(c("#FF0000", "#FFFF00" ,"#00FF00", "#00FFFF","#0000FF")))
# 
# ggplot(plotdt, aes(x=V45, y=AveExpr.y)) +
#   geom_point( mapping = aes(color=AveExpr.y),size=1)+
#   theme(axis.text.x = element_text(angle = 60, hjust = 1),panel.background =element_rect(fill = "white"),
#         panel.grid.major = element_line(size = 0.1, linetype = 'solid',colour = "gray"),
#         panel.grid.minor = element_line(size = 0.1, linetype = 'solid',
#                                         colour = "white"),aspect.ratio = 1)+
#   ggtitle("Correlation vs average expression")+
#   xlab("correlation")+
#   ylab("average in rna")+scale_colour_gradientn(colours=rev(c("#FF0000", "#FFFF00" ,"#00FF00", "#00FFFF","#0000FF")))

#Jean wants V46
ggplot(plotdt, aes(x=AveExpr.y, y=V46)) +
  geom_point( mapping = aes(color=AveExpr.y),size=1)+
  theme(axis.text.x = element_text(angle = 60, hjust = 1),panel.background =element_rect(fill = "white"),
        panel.grid.major = element_line(size = 0.1, linetype = 'solid',colour = "gray"),
        panel.grid.minor = element_line(size = 0.1, linetype = 'solid',
                                        colour = "white"),aspect.ratio = 1)+
  ggtitle("Correlation vs average expression")+
  xlab("average in rna")+
  ylab("correlation")+scale_colour_gradientn(colours=rev(c("#FF0000", "#FFFF00" ,"#00FF00", "#00FFFF","#0000FF")))


filtereddt=plotdt[plotdt$V46>=0.8,c(1,4,10)]
filtereddt=filtereddt[-1,]
colnames(filtereddt)[c(2,3)]=c("correlation","RNA AveExpr")
filtereddt=na.omit(filtereddt)
datatable(na.omit(filtereddt))

20221019 with TF

tf_dt=read.csv("~/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/clean_up_folder/analysis/DatabaseExtract_v_1.01.csv")
tf_dt2=tf_dt[tf_dt$Is.TF.=="Yes",]
#tf_dt3=tf_dt2[tf_dt2$HGNC.symbol%in%filtereddt$ID,]
tf_dt3=merge(filtereddt,tf_dt2,by.x="ID",by.y="HGNC.symbol")
datatable(tf_dt3)
length(unique(tf_dt2$HGNC.symbol))
filtereddt$ID

*20221116 cut the values and get three groups for Ben (manual match those samples, pairwise correlation)

3.1 results

protein_expression4=protein_expression3[rownames(protein_expression3)%in%dmvsd$ID,]
rna_expression2=rna_expression[rownames(rna_expression)%in%dmvsd$ID,]
dim(protein_expression4)

## [1] 3506   46

dim(rna_expression2)

## [1] 3506   21

dim(complete.cases(rna_expression2))

## NULL

protein_expression4$ID=rownames(protein_expression4)
rna_expression2$ID=rownames(rna_expression2)

mergedt=merge(protein_expression4,rna_expression2,by.x="ID",by.y="ID")
dim(mergedt)

## [1] 3506   68

mergedt=mergedt[,c(1,grep(".x",colnames(mergedt)),grep(".y",colnames(mergedt)))]

colnames(mergedt)

##  [1] "ID"                     "7_LV_IHD_Yes_51_M.x"    "14_LV_IHD_Yes_65_M.x"  
##  [4] "9_LV_IHD_Yes_55_M.x"    "13_LV_IHD_Yes_59_M.x"   "10_LV_IHD_Yes_56_M.x"  
##  [7] "11_LV_IHD_Yes_56_M.x"   "8_LV_IHD_Yes_53_M.x"    "27_LV_IHD_No_62_F.x"   
## [10] "24_LV_IHD_No_54_M.x"    "22_LV_IHD_No_50_M.x"    "28_LV_IHD_No_62_M.x"   
## [13] "29_LV_IHD_No_62_M.x"    "30_LV_IHD_No_66_M.x"    "25_LV_IHD_No_54_M.x"   
## [16] "110_LV_Donor_No_65_F.x" "101_LV_Donor_No_54_M.x" "105_LV_Donor_No_56_M.x"
## [19] "108_LV_Donor_No_62_F.x" "106_LV_Donor_No_60_M.x" "107_LV_Donor_No_62_M.x"
## [22] "104_LV_Donor_No_55_F.x" "7_LV_IHD_Yes_51_M.y"    "8_LV_IHD_Yes_53_M.y"   
## [25] "9_LV_IHD_Yes_55_M.y"    "10_LV_IHD_Yes_56_M.y"   "11_LV_IHD_Yes_56_M.y"  
## [28] "13_LV_IHD_Yes_59_M.y"   "14_LV_IHD_Yes_65_M.y"   "22_LV_IHD_No_50_M.y"   
## [31] "24_LV_IHD_No_54_M.y"    "25_LV_IHD_No_54_M.y"    "27_LV_IHD_No_62_F.y"   
## [34] "28_LV_IHD_No_62_M.y"    "29_LV_IHD_No_62_M.y"    "30_LV_IHD_No_66_M.y"   
## [37] "101_LV_Donor_No_54_M.y" "104_LV_Donor_No_55_F.y" "105_LV_Donor_No_56_M.y"
## [40] "106_LV_Donor_No_60_M.y" "107_LV_Donor_No_62_M.y" "108_LV_Donor_No_62_F.y"
## [43] "110_LV_Donor_No_65_F.y"

mergedt_g1=mergedt[,c(1,2,8,4,6,7,5,3,23:29)]
mergedt_g2=mergedt[,c(1,11,10,15,9,12,13,14,30:36)]
mergedt_g3=mergedt[,c(1,17,22,18,20,21,19,16,37:43)]

for(i in 1:nrow(mergedt)){
mergedt[i,44]=cor(as.vector(as.numeric(mergedt[i,c(2,8,4,6,7,5,3,11,10,15,9,12,13,14,17,22,18,20,21,19,16)])),as.vector(as.numeric(mergedt[i,23:43])),use = "pairwise.complete.obs")
  }

cordt=mergedt[,c(1,44)]


plotdt=merge(cordt,dmvsd, by.x="ID",by.y="ID")

plotdt2=plotdt
plotdt2=plotdt2[plotdt2$AveExpr.y>=5,]
gg=ggplot(plotdt2, aes(x=AveExpr.y, y=V44)) +
  geom_point( mapping = aes(color=AveExpr.y),size=1)+
  theme(axis.text.x = element_text(angle = 60, hjust = 1),panel.background =element_rect(fill = "white"),
        panel.grid.major = element_line(size = 0.1, linetype = 'solid',colour = "gray"),
        panel.grid.minor = element_line(size = 0.1, linetype = 'solid',
                                        colour = "white"),aspect.ratio = 1)+
  ggtitle("Correlation vs average expression")+
  xlab("average in rna")+
  ylab("correlation")+scale_colour_gradientn(colours=rev(c("#FF0000", "#FFFF00" ,"#00FF00", "#00FFFF","#0000FF")))+geom_hline(yintercept=0.8)
gg

#ggsave("supp_figure_rna.pdf",plot=gg)

filtereddt=plotdt2[plotdt2$V44>=0.8,which(colnames(plotdt2)%in%c("ID","V44","AveExpr.y"))]
filtereddt=filtereddt[-1,]
colnames(filtereddt)[c(2,3)]=c("correlation","RNA AveExpr")
filtereddt=na.omit(filtereddt)
datatable(na.omit(filtereddt))

tf_dt=read.csv("~/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/clean_up_folder/analysis/DatabaseExtract_v_1.01.csv")
tf_dt2=tf_dt[tf_dt$Is.TF.=="Yes",]
#tf_dt3=tf_dt2[tf_dt2$HGNC.symbol%in%filtereddt$ID,]
tf_dt3=merge(filtereddt,tf_dt2,by.x="ID",by.y="HGNC.symbol")
datatable(tf_dt3)

length(unique(tf_dt2$HGNC.symbol))

## [1] 1639

filtereddt$ID

##  [1] "ANO6"    "B4GALT1" "CTPS2"   "CYBRD1"  "DACT1"   "DLGAP4"  "EIF4G3" 
##  [8] "ETV6"    "GLRX2"   "GPC6"    "GUCY1A1" "H2AX"    "HUWE1"   "LGR6"   
## [15] "MXRA8"   "NES"     "PLA2G4F" "PNPLA6"  "PODN"    "PXMP2"   "RAB23"  
## [22] "RALGAPB" "RBBP9"   "SBF2"    "SCYL2"   "STAT5B"  "TNC"     "UCHL1"  
## [29] "XIRP2"

3.2 three groups

dim(mergedt_g1)

## [1] 3506   15

mergedt_g1$na_sum=rowSums(is.na(mergedt_g1))
mergedt_g1=mergedt_g1[which(mergedt_g1$na_sum<1),]
dim(mergedt_g1)

## [1] 2198   16

mergedt_g1=mergedt_g1[,1:15]

colnames(mergedt_g1)

##  [1] "ID"                   "7_LV_IHD_Yes_51_M.x"  "8_LV_IHD_Yes_53_M.x" 
##  [4] "9_LV_IHD_Yes_55_M.x"  "10_LV_IHD_Yes_56_M.x" "11_LV_IHD_Yes_56_M.x"
##  [7] "13_LV_IHD_Yes_59_M.x" "14_LV_IHD_Yes_65_M.x" "7_LV_IHD_Yes_51_M.y" 
## [10] "8_LV_IHD_Yes_53_M.y"  "9_LV_IHD_Yes_55_M.y"  "10_LV_IHD_Yes_56_M.y"
## [13] "11_LV_IHD_Yes_56_M.y" "13_LV_IHD_Yes_59_M.y" "14_LV_IHD_Yes_65_M.y"

for(i in 1:nrow(mergedt_g1)){
mergedt_g1[i,16]=cor(as.vector(as.numeric(mergedt_g1[i,2:8])),as.vector(as.numeric(mergedt_g1[i,9:15])),use = "pairwise.complete.obs")
}

# nasum_g1=rowSums(is.na(mergedt_g1))
# which(rownames(rna_expression2)=="ALDH1A3")
# nasum_g1[146]#high missing values

cordt=mergedt_g1[,c(1,16)]


plotdt=merge(cordt,dmvsd, by.x="ID",by.y="ID")

plotdt2=plotdt
plotdt2=plotdt2[plotdt2$AveExpr.y>=5,]
ggplot(plotdt2, aes(x=AveExpr.y, y=V16)) +
  geom_point( mapping = aes(color=AveExpr.y),size=1)+
  theme(axis.text.x = element_text(angle = 60, hjust = 1),panel.background =element_rect(fill = "white"),
        panel.grid.major = element_line(size = 0.1, linetype = 'solid',colour = "gray"),
        panel.grid.minor = element_line(size = 0.1, linetype = 'solid',
                                        colour = "white"),aspect.ratio = 1)+
  ggtitle("Correlation vs average expression")+
  xlab("average in rna")+
  ylab("correlation")+scale_colour_gradientn(colours=rev(c("#FF0000", "#FFFF00" ,"#00FF00", "#00FFFF","#0000FF")))+geom_hline(yintercept=0.8)

filtereddt=plotdt2[plotdt2$V16>=0.8,which(colnames(plotdt2)%in%c("ID","V16","AveExpr.y"))]
filtereddt=filtereddt[-1,]
colnames(filtereddt)[c(2,3)]=c("correlation","RNA AveExpr")
filtereddt=na.omit(filtereddt)
dim(filtereddt)

## [1] 210   3

datatable(na.omit(filtereddt))

tf_dt3=merge(filtereddt,tf_dt2,by.x="ID",by.y="HGNC.symbol")
dim(tf_dt3)

## [1]  1 31

datatable(tf_dt3)

length(filtereddt$ID)

## [1] 210

dim(mergedt_g2)

## [1] 3506   15

mergedt_g2$na_sum=rowSums(is.na(mergedt_g2))
mergedt_g2=mergedt_g2[which(mergedt_g2$na_sum<1),]
dim(mergedt_g2)

## [1] 2098   16

mergedt_g2=mergedt_g2[,1:15]


colnames(mergedt_g2)

##  [1] "ID"                  "22_LV_IHD_No_50_M.x" "24_LV_IHD_No_54_M.x"
##  [4] "25_LV_IHD_No_54_M.x" "27_LV_IHD_No_62_F.x" "28_LV_IHD_No_62_M.x"
##  [7] "29_LV_IHD_No_62_M.x" "30_LV_IHD_No_66_M.x" "22_LV_IHD_No_50_M.y"
## [10] "24_LV_IHD_No_54_M.y" "25_LV_IHD_No_54_M.y" "27_LV_IHD_No_62_F.y"
## [13] "28_LV_IHD_No_62_M.y" "29_LV_IHD_No_62_M.y" "30_LV_IHD_No_66_M.y"

for(i in 1:nrow(mergedt_g2)){
mergedt_g2[i,16]=cor(as.vector(as.numeric(mergedt_g2[i,2:8])),as.vector(as.numeric(mergedt_g2[i,9:15])),use = "pairwise.complete.obs")
  }

cordt=mergedt_g2[,c(1,16)]


plotdt=merge(cordt,dmvsd, by.x="ID",by.y="ID")

plotdt2=plotdt
plotdt2=plotdt2[plotdt2$AveExpr.y>=5,]
ggplot(plotdt2, aes(x=AveExpr.y, y=V16)) +
  geom_point( mapping = aes(color=AveExpr.y),size=1)+
  theme(axis.text.x = element_text(angle = 60, hjust = 1),panel.background =element_rect(fill = "white"),
        panel.grid.major = element_line(size = 0.1, linetype = 'solid',colour = "gray"),
        panel.grid.minor = element_line(size = 0.1, linetype = 'solid',
                                        colour = "white"),aspect.ratio = 1)+
  ggtitle("Correlation vs average expression")+
  xlab("average in rna")+
  ylab("correlation")+scale_colour_gradientn(colours=rev(c("#FF0000", "#FFFF00" ,"#00FF00", "#00FFFF","#0000FF")))+geom_hline(yintercept=0.8)

filtereddt=plotdt2[plotdt2$V16>=0.8,which(colnames(plotdt2)%in%c("ID","V16","AveExpr.y"))]
filtereddt=filtereddt[-1,]
colnames(filtereddt)[c(2,3)]=c("correlation","RNA AveExpr")
filtereddt=na.omit(filtereddt)
dim(filtereddt)

## [1] 86  3

datatable(na.omit(filtereddt))

tf_dt3=merge(filtereddt,tf_dt2,by.x="ID",by.y="HGNC.symbol")
dim(tf_dt3)

## [1]  0 31

datatable(tf_dt3)

length(filtereddt$ID)

## [1] 86

dim(mergedt_g3)

## [1] 3506   15

mergedt_g3$na_sum=rowSums(is.na(mergedt_g3))
mergedt_g3=mergedt_g3[which(mergedt_g3$na_sum<1),]
dim(mergedt_g3)

## [1] 2266   16

mergedt_g3=mergedt_g3[,1:15]


colnames(mergedt_g3)

##  [1] "ID"                     "101_LV_Donor_No_54_M.x" "104_LV_Donor_No_55_F.x"
##  [4] "105_LV_Donor_No_56_M.x" "106_LV_Donor_No_60_M.x" "107_LV_Donor_No_62_M.x"
##  [7] "108_LV_Donor_No_62_F.x" "110_LV_Donor_No_65_F.x" "101_LV_Donor_No_54_M.y"
## [10] "104_LV_Donor_No_55_F.y" "105_LV_Donor_No_56_M.y" "106_LV_Donor_No_60_M.y"
## [13] "107_LV_Donor_No_62_M.y" "108_LV_Donor_No_62_F.y" "110_LV_Donor_No_65_F.y"

for(i in 1:nrow(mergedt_g3)){
mergedt_g3[i,16]=cor(as.vector(as.numeric(mergedt_g3[i,2:8])),as.vector(as.numeric(mergedt_g3[i,9:15])),use = "pairwise.complete.obs")
  }

cordt=mergedt_g3[,c(1,16)]


plotdt=merge(cordt,dmvsd, by.x="ID",by.y="ID")

plotdt2=plotdt
plotdt2=plotdt2[plotdt2$AveExpr.y>=5,]
ggplot(plotdt2, aes(x=AveExpr.y, y=V16)) +
  geom_point( mapping = aes(color=AveExpr.y),size=1)+
  theme(axis.text.x = element_text(angle = 60, hjust = 1),panel.background =element_rect(fill = "white"),
        panel.grid.major = element_line(size = 0.1, linetype = 'solid',colour = "gray"),
        panel.grid.minor = element_line(size = 0.1, linetype = 'solid',
                                        colour = "white"),aspect.ratio = 1)+
  ggtitle("Correlation vs average expression")+
  xlab("average in rna")+
  ylab("correlation")+scale_colour_gradientn(colours=rev(c("#FF0000", "#FFFF00" ,"#00FF00", "#00FFFF","#0000FF")))+geom_hline(yintercept=0.8)

filtereddt=plotdt2[plotdt2$V16>=0.8,which(colnames(plotdt2)%in%c("ID","V16","AveExpr.y"))]
filtereddt=filtereddt[-1,]
colnames(filtereddt)[c(2,3)]=c("correlation","RNA AveExpr")
filtereddt=na.omit(filtereddt)
dim(filtereddt)

## [1] 63  3

datatable(na.omit(filtereddt))

tf_dt3=merge(filtereddt,tf_dt2,by.x="ID",by.y="HGNC.symbol")
dim(tf_dt3)

## [1]  0 31

datatable(tf_dt3)

length(filtereddt$ID)

## [1] 63

generate rna seq figures for the paper

Yunwei Zhang

“Initiated on 20220926, compiled on 2023 Jul 31”

1 IHD-DM VS Donor

2 IHD-no DM VS Donor

3 comparison rna-seq vs proteins

3.1 results

3.2 three groups