#library(GEOquery) ## go to https://github.com/seandavi/GEOquery for installation details
#library(R.utils)
library(reshape2)
library(ggplot2)
library(limma)
library(dplyr)
library(tidyverse)
#library(DMwR)
library(readxl)
library(MultiAssayExperiment)
library(S4Vectors)
library(SummarizedExperiment)
library(DT)
library(randomForest)
library(e1071)
library("bnstruct")
#library(rminer)
library(ggpubr)
setwd("~/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/clean_up_folder/analysis")
# install.packages("VANData_1.0.0.tgz", repos=NULL,type="source")
# install.packages("VAN_1.0.0.tgz", repos=NULL,type="source")
library(R.utils)
library(reshape2)
library(ggplot2)
library(limma)
library(dplyr)
library(tidyverse)
#library(DMwR)
library(readxl)
library(magrittr)
library(tidyverse)
library(gage)
library(knitr)
library(DT)
library(janitor)
#library(KEGGREST)
library(MASS)
library("AnnotationDbi")
library("org.Hs.eg.db")
library(dplyr)
library(reshape)
library(ggplot2)
library(plyr)
library(gtable)
library(grid)
library(pheatmap)
library(reshape2)
library(plotly)
library(UpSetR)
library(MatchIt)
require(pathview)
#library(STRINGdb)
## Kegg sets data
require(gage)
require(gageData)
library(clusterProfiler)
library(enrichplot)
kegg = kegg.gsets(species = "hsa", id.type = "kegg")
kegg.sets.hsa = kegg$kg.sets
library(ggrepel)
#some notes for pathway analysis
#'count' is the number of genes that belong to a given gene-set, while 'setSize' is the total number of genes in the gene-set.
#NES = enrichment score normalized to mean enrichment of random samples of the same size,The method employs random sampling of gene sets of the same size as the gene set being tested to assess significance and for normalization. The number of samplings is specified as a parameter.
#
# Let is suppose I have a collection of genesets called : HALLMARK Now let is suppose there is a specific geneset there called: E2F_targets
#
# BgRatio, M/N.
#
# M = size of the geneset (eg size of the E2F_targets); (is the number of genes within that distribution that are annotated (either directly or indirectly) to the node of interest).
#
# N = size of all of the unique genes in the collection of genesets (example the HALLMARK collection); (is the total number of genes in the background distribution (universe)
#
# GeneRatio is k/n.
#
# k = size of the overlap of 'a vector of gene id' you input with the specific geneset (eg E2F_targets), only unique genes; (the number of genes within that list n, which are annotated to the node.
#
# n = size of the overlap of 'a vector of gene id' you input with all the members of the collection of genesets (eg the HALLMARK collection),only unique genes; is the size of the list of genes of interest
#adj pval and qval
#https://support.bioconductor.org/p/96329/
#adj pval is the fdr adjusted pval, qval from another package is another adjustment method
#the BH approach is slightly more conservative than qvalue
rnaseq_dt=read_excel("/Users//MQ10004787/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/clean_up_folder/analysis/BH 2022 RNAseq data_3.033 corrected_0s deleted_IHD-DM paper.xlsx",col_names=FALSE)
rnaseq_dt=as.data.frame(rnaseq_dt)
infodt=rnaseq_dt[1:7,]
expressiondt=rnaseq_dt[8:dim(rnaseq_dt)[1],2:dim(rnaseq_dt)[2]]
#as.character(as.vector(rnaseq_dt[8:dim(rnaseq_dt)[1],1]))[which(duplicated(as.character(as.vector(rnaseq_dt[8:dim(rnaseq_dt)[1],1])))==TRUE)]
expressiondt=apply(expressiondt, 2, as.numeric)
abundence=rowSums(expressiondt,na.rm = TRUE)
duplicated_names=as.character(as.vector(rnaseq_dt[8:dim(rnaseq_dt)[1],1]))[which(duplicated(as.character(as.vector(rnaseq_dt[8:dim(rnaseq_dt)[1],1])))==TRUE)]
name_vec=as.character(as.vector(rnaseq_dt[8:dim(rnaseq_dt)[1],1]))
subdt=as.data.frame(expressiondt)
subdt[subdt$name=="GTF2H2",]#need to use the one with a small abundence
## [1] ...2 ...3 ...4 ...5 ...6 ...7 ...8 ...9 ...10 ...11 ...12 ...13
## [13] ...14 ...15 ...16 ...17 ...18 ...19 ...20 ...21 ...22
## <0 rows> (or 0-length row.names)
## [1] 0
## [1] FALSE
## [1] 23883 23
## [1] 23787 23
#get the expressiondt
expressiondt=as.data.frame(subdt2[,1:21])
rownames(expressiondt)=subdt2$name
expressiondt[rownames(expressiondt)=="GTF2H2",]=GTF2H2_vec
colnames(expressiondt)=paste(paste(paste(paste(rnaseq_dt[1,2:dim(rnaseq_dt)[2]],rnaseq_dt[2,2:dim(rnaseq_dt)[2]],sep = "_"),rnaseq_dt[3,2:dim(rnaseq_dt)[2]],sep = "_"),rnaseq_dt[4,2:dim(rnaseq_dt)[2]],sep = "_"),rnaseq_dt[5,2:dim(rnaseq_dt)[2]],sep = "_")
#log2 transform
expressiondt=as.data.frame(apply(expressiondt, 2, log2))
rownames(expressiondt)=subdt2$name
dim(expressiondt)
## [1] 23787 21
rnaseq_dt=read_excel("/Users/yzha0247/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/clean_up_folder/analysis/BH 2022 RNAseq data_3.033 corrected_0s deleted_IHD-DM paper.xlsx",col_names=FALSE,sheet = 2)
rnaseq_dt=as.data.frame(rnaseq_dt)
dim(rnaseq_dt)
rnaseq_dt=rnaseq_dt[,-1]
dim(rnaseq_dt)
infodt=rnaseq_dt[1:7,]
expressiondt=rnaseq_dt[8:dim(rnaseq_dt)[1],2:dim(rnaseq_dt)[2]]
which(duplicated(as.character(as.vector(rnaseq_dt[8:dim(rnaseq_dt)[1],1])))==TRUE)
rownames(expressiondt)=make.names(as.character(as.vector(rnaseq_dt[8:dim(rnaseq_dt)[1],1])),unique = TRUE)
colnames(expressiondt)=paste(paste(paste(paste(rnaseq_dt[1,2:dim(rnaseq_dt)[2]],rnaseq_dt[2,2:dim(rnaseq_dt)[2]],sep = "_"),rnaseq_dt[3,2:dim(rnaseq_dt)[2]],sep = "_"),rnaseq_dt[4,2:dim(rnaseq_dt)[2]],sep = "_"),rnaseq_dt[5,2:dim(rnaseq_dt)[2]],sep = "_")
expressiondt=apply(expressiondt, 2, as.numeric)
#log2 transform
expressiondt=apply(expressiondt, 2, log2)
rownames(expressiondt)=make.names(as.character(as.vector(rnaseq_dt[8:dim(rnaseq_dt)[1],1])),unique = TRUE)
######################
#get the "lv" "ihd-dm" "ihd-no dm" non-imputed data
experiment_data_noimputation=expressiondt[,union(grep("_IHD_Yes",colnames(expressiondt)),grep("Donor_No",colnames(expressiondt)))]
#colnames(experiment_data_noimputation)
dim(experiment_data_noimputation)
## [1] 23787 14
#filtering out with >25% samples missing
experiment_data_noimputation=experiment_data_noimputation[which(rowMeans(!is.na(experiment_data_noimputation)) > 0.25), ]
dim(experiment_data_noimputation)
## [1] 19212 14
diabetes=grep("Yes",colnames(experiment_data_noimputation))
experiment_data_noimputation1=as.data.frame(t(experiment_data_noimputation))
rownames(experiment_data_noimputation1)=1:dim(experiment_data_noimputation1)[1]
experiment_data_noimputation1$y=ifelse(rownames(experiment_data_noimputation1) %in% diabetes,1,0)
#de analysis-- should be on the non-imputed data
groupname<-as.factor(experiment_data_noimputation1$y)
design <- model.matrix(~ groupname + 0)
fit <- lmFit(experiment_data_noimputation, design)
cont.matrix <- makeContrasts(groupname1-groupname0, levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2)
tT <- topTable(fit2)
df1 <- topTable(fit2, number=nrow(fit2), genelist=rownames(experiment_data_noimputation))
df1$name=rownames(df1)
df1$neglogP_value = -log10(df1$P.Value)
#write.csv(df1,"rna_ihddmvsd.csv")
# g1=ggplot2::ggplot(df1, aes(logFC,neglogP_value,label=name))+
# geom_point(size = 1, alpha=1) +
# xlab("log2 fold change") + ylab("-log10 p-value")+ggtitle("IHD-T2DM vs Donor") +
# theme_bw(base_size = 16) +
# geom_hline(yintercept = 1.3, linetype = "dashed", colour = "brown3")+theme(aspect.ratio = 1)
#add fdr plot
adj_to_p <- function(df) {
#df <- topTable(fit, coef = comparison, number = nrow(fit))
if ((df %>%dplyr::filter(adj.P.Val < 0.05) %>% nrow()) == 0){
r <- 1
} else {
r <- df%>%dplyr::mutate(rn = row_number()) %>% dplyr::filter(adj.P.Val <= 0.05) %>% dplyr::pull(rn)%>%max() + 1
}
n <- nrow(df) + 1
lp10 <- -log10(r / n * 0.05)
lp10
}
#add samples
colnames(experiment_data_noimputation)
## [1] "LV_IHD_Yes_M_51" "LV_IHD_Yes_M_53" "LV_IHD_Yes_M_55"
## [4] "LV_IHD_Yes_M_56" "LV_IHD_Yes_M_56.1" "LV_IHD_Yes_M_59"
## [7] "LV_IHD_Yes_M_65" "LV_Donor_No_M_54" "LV_Donor_No_F_55"
## [10] "LV_Donor_No_M_56" "LV_Donor_No_M_60" "LV_Donor_No_M_62"
## [13] "LV_Donor_No_F_62" "LV_Donor_No_F_65"
## [1] 5.584625
highlight_df=df1
highlight_df$colorcode=as.character(ifelse(highlight_df$neglogP_value>adj_to_p(df1)&highlight_df$logFC>0,1,ifelse(highlight_df$neglogP_value>adj_to_p(df1)&highlight_df$logFC<0,-1,0)))
dim(highlight_df)
## [1] 19212 10
## [1] 19167 10
g1=ggplot2::ggplot(highlight_df, aes(logFC,neglogP_value,label=name))+
geom_point(aes(x=logFC,y=neglogP_value,color=colorcode)) +
xlab("log2 fold change") + ylab("-log10 p-value")+ggtitle("IHD-T2DM vs Donor") +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),aspect.ratio = 1) +
geom_hline(yintercept = adj_to_p(df1), linetype = "dashed", colour = "red")+scale_color_manual(values = c("blue","gray","red"))
ggplotly(g1)
## [1] 1541 9
g1=ggplot(highlight_df, aes(logFC,neglogP_value,label=name))+
geom_point() +
xlab("log2 fold change") + ylab("-log10 p-value")+ggtitle("IHD-T2DM vs Donor") +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),aspect.ratio = 1)+geom_text_repel(aes(x=logFC,y=neglogP_value,label=name),subset(highlight_df,(logFC>4|-logFC>4|neglogP_value>4)))
#ggsave(filename="plot1.pdf",g1,height = 10,width = 8)
g1
######################
#get the "lv" "ihd-dm" "ihd-no dm" non-imputed data
experiment_data_noimputation=expressiondt[,union(grep("_IHD_No",colnames(expressiondt)),grep("Donor_No",colnames(expressiondt)))]
#colnames(experiment_data_noimputation)
dim(experiment_data_noimputation)
## [1] 23787 14
#filtering out with >25% samples missing
experiment_data_noimputation=experiment_data_noimputation[which(rowMeans(!is.na(experiment_data_noimputation)) > 0.25), ]
dim(experiment_data_noimputation)
## [1] 19099 14
diabetes=grep("IHD",colnames(experiment_data_noimputation))
experiment_data_noimputation1=as.data.frame(t(experiment_data_noimputation))
rownames(experiment_data_noimputation1)=1:dim(experiment_data_noimputation1)[1]
experiment_data_noimputation1$y=ifelse(rownames(experiment_data_noimputation1) %in% diabetes,1,0)
#de analysis-- should be on the non-imputed data
groupname<-as.factor(experiment_data_noimputation1$y)
design <- model.matrix(~ groupname + 0)
fit <- lmFit(experiment_data_noimputation, design)
cont.matrix <- makeContrasts(groupname1-groupname0, levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2)
tT <- topTable(fit2)
df1 <- topTable(fit2, number=nrow(fit2), genelist=rownames(experiment_data_noimputation))
df1$name=rownames(df1)
df1$neglogP_value = -log10(df1$P.Value)
#write.csv(df1,"rna_ihdnodmvsd.csv")
# g1=ggplot2::ggplot(df1, aes(logFC,neglogP_value,label=name))+
# geom_point(size = 1, alpha=1) +
# xlab("log2 fold change") + ylab("-log10 p-value")+ggtitle("IHD-T2DM vs Donor") +
# theme_bw(base_size = 16) +
# geom_hline(yintercept = 1.3, linetype = "dashed", colour = "brown3")+theme(aspect.ratio = 1)
#add fdr plot
adj_to_p <- function(df) {
#df <- topTable(fit, coef = comparison, number = nrow(fit))
if ((df %>%dplyr::filter(adj.P.Val < 0.05) %>% nrow()) == 0){
r <- 1
} else {
r <- df%>%dplyr::mutate(rn = row_number()) %>% dplyr::filter(adj.P.Val <= 0.05) %>% dplyr::pull(rn)%>%max() + 1
}
n <- nrow(df) + 1
lp10 <- -log10(r / n * 0.05)
lp10
}
#add samples
colnames(experiment_data_noimputation)
## [1] "LV_IHD_No_M_50" "LV_IHD_No_M_54" "LV_IHD_No_M_54.1" "LV_IHD_No_F_62"
## [5] "LV_IHD_No_M_62" "LV_IHD_No_M_62.1" "LV_IHD_No_M_66" "LV_Donor_No_M_54"
## [9] "LV_Donor_No_F_55" "LV_Donor_No_M_56" "LV_Donor_No_M_60" "LV_Donor_No_M_62"
## [13] "LV_Donor_No_F_62" "LV_Donor_No_F_65"
## [1] 5.582063
highlight_df=df1
highlight_df$colorcode=as.character(ifelse(highlight_df$neglogP_value>adj_to_p(df1)&highlight_df$logFC>0,1,ifelse(highlight_df$neglogP_value>adj_to_p(df1)&highlight_df$logFC<0,-1,0)))
dim(highlight_df)
## [1] 19099 10
## [1] 19069 10
g1=ggplot2::ggplot(highlight_df, aes(logFC,neglogP_value,label=name))+
geom_point(aes(x=logFC,y=neglogP_value,color=colorcode)) +
xlab("log2 fold change") + ylab("-log10 p-value")+ggtitle("IHD-no T2DM vs Donor") +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),aspect.ratio = 1) +
geom_hline(yintercept = adj_to_p(df1), linetype = "dashed", colour = "red")+scale_color_manual(values = c("blue","gray","red"))
ggplotly(g1)
## [1] 691 9
g1=ggplot(highlight_df, aes(logFC,neglogP_value,label=name))+
geom_point() +
xlab("log2 fold change") + ylab("-log10 p-value")+ggtitle("IHD-T2DM vs Donor") +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),aspect.ratio = 1)+geom_text_repel(aes(x=logFC,y=neglogP_value,label=name),subset(highlight_df,(logFC>2.5|-logFC>2.5|neglogP_value>3.5)))
#ggsave(filename="plot1.pdf",g1,height = 10,width = 8)
g1
set2=df1[which(df1$P.Value<0.05),"name"]
set2_relaxed=df1[which(abs(df1$logFC)>1),"name"]
set2_restricted=df1[which(df1$adj.P.Val<0.05),"name"]
rna_nodmvsd=df1
library(RColorBrewer)
myCol <- brewer.pal(3, "Pastel2")[c(1, 2)]
library(VennDiagram)
# Chart
v1=venn.diagram(
x = list(set1, set2),
category.names = c("IHD-T2DM" , "IHD-NO T2DM" ),
cat.cex = 2,
cat.default.pos = "text",
filename = NULL,
output=TRUE,
# Output features
height = 14 ,
width = 15 ,
resolution = 300,
compression = "lzw",
# Circles
lwd = 2,
lty = 'blank',
fill = myCol,
# Numbers
cex = 2,
fontface = "bold",
fontfamily = "sans"
)
display_venn <- function(x, ...){
library(VennDiagram)
grid.newpage()
venn_object <- venn.diagram(x, filename = NULL, ...)
grid.draw(venn_object)
}
display_venn(x = list(set1, set2),
category.names = c("IHD-T2DM" , "IHD-NO T2DM" ),fill = myCol,lwd = 0,
lty = 'blank')
set_protein1=readRDS("~/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/clean_up_folder/analysis/set1.rds")
set_protein1_restricted=readRDS("~/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/clean_up_folder/analysis/set1_restricted.rds")
# set_protein2=readRDS("~/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/clean_up_folder/analysis/set4.rds")
# set_protein2_restricted=readRDS("~/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/clean_up_folder/analysis/set4_restricted.rds")
v1=venn.diagram(
x = list(set1, set_protein1_restricted),
category.names = c("rna" , "protein" ),
cat.cex = 2,
cat.default.pos = "text",
filename = NULL,
output=TRUE,
# Output features
height = 14 ,
width = 15 ,
resolution = 300,
compression = "lzw",
# Circles
lwd = 2,
lty = 'blank',
fill = myCol,
# Numbers
cex = 2,
fontface = "bold",
fontfamily = "sans"
)
display_venn <- function(x, ...){
library(VennDiagram)
grid.newpage()
venn_object <- venn.diagram(x, filename = NULL, ...)
grid.draw(venn_object)
}
display_venn(x = list(set1, set_protein1_restricted),
category.names = c("rna" , "protein" ),fill = myCol,lwd = 0,
lty = 'blank')
# v1=venn.diagram(
# x = list(set2,set_protein2_restricted),
# category.names = c("rna" , "protein" ),
# cat.cex = 2,
# cat.default.pos = "text",
# filename = NULL,
# output=TRUE,
#
# # Output features
# height = 14 ,
# width = 15 ,
# resolution = 300,
# compression = "lzw",
#
# # Circles
# lwd = 2,
# lty = 'blank',
# fill = myCol,
#
# # Numbers
# cex = 2,
# fontface = "bold",
# fontfamily = "sans"
#
#
# )
# display_venn <- function(x, ...){
# library(VennDiagram)
# grid.newpage()
# venn_object <- venn.diagram(x, filename = NULL, ...)
# grid.draw(venn_object)
# }
# display_venn(x = list(set2,set_protein2_restricted),
# category.names = c("rna" , "protein" ),fill = myCol,lwd = 0,
# lty = 'blank')
load("all_omics_lv_v6.RData")
protein_expression=assay(all_omics@ExperimentList$protein, "log2")
protein_expression1=as.data.frame(protein_expression)
dim(protein_expression1)
## [1] 9037 109
#changed 20220224: no overall filtering
protein_expression2=protein_expression1
dim(protein_expression2)
## [1] 9037 109
## [1] 4077 109
protein_expression3=protein_expression2[,c(union(grep("_IHD_Yes",colnames(protein_expression2)),grep("IHD_No",colnames(protein_expression2))),grep("Donor_No",colnames(protein_expression2)))]
protein_expression3=protein_expression3[,-c(32,34,36,39,40,41,43,46,48,49,50,54)]
colnames(protein_expression3)
## [1] "2_LV_IHD_Yes_45_M" "3_LV_IHD_Yes_46_F" "4_LV_IHD_Yes_49_M"
## [4] "6_LV_IHD_Yes_51_M" "5_LV_IHD_Yes_50_M" "7_LV_IHD_Yes_51_M"
## [7] "14_LV_IHD_Yes_65_M" "9_LV_IHD_Yes_55_M" "12_LV_IHD_Yes_59_M"
## [10] "13_LV_IHD_Yes_59_M" "10_LV_IHD_Yes_56_M" "11_LV_IHD_Yes_56_M"
## [13] "8_LV_IHD_Yes_53_M" "1_LV_IHD_Yes_41_M" "17_LV_IHD_No_43_F"
## [16] "26_LV_IHD_No_54_F" "27_LV_IHD_No_62_F" "24_LV_IHD_No_54_M"
## [19] "16_LV_IHD_No_42_F" "22_LV_IHD_No_50_M" "21_LV_IHD_No_49_F"
## [22] "19_LV_IHD_No_47_F" "20_LV_IHD_No_48_F" "23_LV_IHD_No_50_M"
## [25] "28_LV_IHD_No_62_M" "15_LV_IHD_No_41_M" "29_LV_IHD_No_62_M"
## [28] "18_LV_IHD_No_45_M" "30_LV_IHD_No_66_M" "25_LV_IHD_No_54_M"
## [31] "110_LV_Donor_No_65_F" "101_LV_Donor_No_54_M" "94_LV_Donor_No_47_M"
## [34] "98_LV_Donor_No_51_F" "105_LV_Donor_No_56_M" "100_LV_Donor_No_53_F"
## [37] "109_LV_Donor_No_65_M" "99_LV_Donor_No_53_M" "97_LV_Donor_No_49_F"
## [40] "108_LV_Donor_No_62_F" "102_LV_Donor_No_54_F" "96_LV_Donor_No_48_M"
## [43] "106_LV_Donor_No_60_M" "95_LV_Donor_No_47_F" "107_LV_Donor_No_62_M"
## [46] "104_LV_Donor_No_55_F"
rnaseq_dt=read_excel("~/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/clean_up_folder/analysis/BH 2022 RNAseq data_3.033 corrected_0s deleted_IHD-DM paper.xlsx",col_names=FALSE)
rnaseq_dt=as.data.frame(rnaseq_dt)
infodt=rnaseq_dt[1:7,]
expressiondt=rnaseq_dt[8:dim(rnaseq_dt)[1],2:dim(rnaseq_dt)[2]]
#as.character(as.vector(rnaseq_dt[8:dim(rnaseq_dt)[1],1]))[which(duplicated(as.character(as.vector(rnaseq_dt[8:dim(rnaseq_dt)[1],1])))==TRUE)]
expressiondt=apply(expressiondt, 2, as.numeric)
abundence=rowSums(expressiondt,na.rm = TRUE)
duplicated_names=as.character(as.vector(rnaseq_dt[8:dim(rnaseq_dt)[1],1]))[which(duplicated(as.character(as.vector(rnaseq_dt[8:dim(rnaseq_dt)[1],1])))==TRUE)]
name_vec=as.character(as.vector(rnaseq_dt[8:dim(rnaseq_dt)[1],1]))
subdt=as.data.frame(expressiondt)
subdt[subdt$name=="GTF2H2",]#need to use the one with a small abundence
## [1] ...2 ...3 ...4 ...5 ...6 ...7 ...8 ...9 ...10 ...11 ...12 ...13
## [13] ...14 ...15 ...16 ...17 ...18 ...19 ...20 ...21 ...22
## <0 rows> (or 0-length row.names)
## [1] 0
## [1] FALSE
## [1] 23883 23
## [1] 23787 23
#get the expressiondt
expressiondt=as.data.frame(subdt2[,1:21])
rownames(expressiondt)=subdt2$name
expressiondt[rownames(expressiondt)=="GTF2H2",]=GTF2H2_vec
colnames(expressiondt)=paste(paste(paste(paste(paste(rnaseq_dt[7,2:dim(rnaseq_dt)[2]],rnaseq_dt[1,2:dim(rnaseq_dt)[2]],sep="_"),rnaseq_dt[2,2:dim(rnaseq_dt)[2]],sep = "_"),rnaseq_dt[3,2:dim(rnaseq_dt)[2]],sep = "_"),rnaseq_dt[5,2:dim(rnaseq_dt)[2]],sep = "_"),rnaseq_dt[4,2:dim(rnaseq_dt)[2]],sep = "_")
#log2 transform
expressiondt=as.data.frame(apply(expressiondt, 2, log2))
rownames(expressiondt)=subdt2$name
dim(expressiondt)
## [1] 23787 21
expressiondt=rna_expression
######################
#get the "lv" "ihd-dm" "ihd-no dm" non-imputed data
experiment_data_noimputation=expressiondt[,union(grep("_IHD_Yes",colnames(expressiondt)),grep("IHD_No",colnames(expressiondt)))]
#colnames(experiment_data_noimputation)
dim(experiment_data_noimputation)
## [1] 23787 14
#filtering out with >25% samples missing
experiment_data_noimputation=experiment_data_noimputation[which(rowMeans(!is.na(experiment_data_noimputation)) > 0.25), ]
dim(experiment_data_noimputation)
## [1] 19283 14
diabetes=grep("Yes",colnames(experiment_data_noimputation))
experiment_data_noimputation1=as.data.frame(t(experiment_data_noimputation))
rownames(experiment_data_noimputation1)=1:dim(experiment_data_noimputation1)[1]
experiment_data_noimputation1$y=ifelse(rownames(experiment_data_noimputation1) %in% diabetes,1,0)
#de analysis-- should be on the non-imputed data
groupname<-as.factor(experiment_data_noimputation1$y)
design <- model.matrix(~ groupname + 0)
fit <- lmFit(experiment_data_noimputation, design)
cont.matrix <- makeContrasts(groupname1-groupname0, levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2)
tT <- topTable(fit2)
df1 <- topTable(fit2, number=nrow(fit2), genelist=rownames(experiment_data_noimputation))
df1$name=rownames(df1)
df1$neglogP_value = -log10(df1$P.Value)
#write.csv(df1,"rna_ihddm vs nodm.csv")
# g1=ggplot2::ggplot(df1, aes(logFC,neglogP_value,label=name))+
# geom_point(size = 1, alpha=1) +
# xlab("log2 fold change") + ylab("-log10 p-value")+ggtitle("IHD-T2DM vs Donor") +
# theme_bw(base_size = 16) +
# geom_hline(yintercept = 1.3, linetype = "dashed", colour = "brown3")+theme(aspect.ratio = 1)
#add fdr plot
adj_to_p <- function(df) {
#df <- topTable(fit, coef = comparison, number = nrow(fit))
if ((df %>%dplyr::filter(adj.P.Val < 0.05) %>% nrow()) == 0){
r <- 1
} else {
r <- df%>%dplyr::mutate(rn = row_number()) %>% dplyr::filter(adj.P.Val <= 0.05) %>% dplyr::pull(rn)%>%max() + 1
}
n <- nrow(df) + 1
lp10 <- -log10(r / n * 0.05)
lp10
}
#add samples
colnames(experiment_data_noimputation)
## [1] "7_LV_IHD_Yes_51_M" "8_LV_IHD_Yes_53_M" "9_LV_IHD_Yes_55_M"
## [4] "10_LV_IHD_Yes_56_M" "11_LV_IHD_Yes_56_M" "13_LV_IHD_Yes_59_M"
## [7] "14_LV_IHD_Yes_65_M" "22_LV_IHD_No_50_M" "24_LV_IHD_No_54_M"
## [10] "25_LV_IHD_No_54_M" "27_LV_IHD_No_62_F" "28_LV_IHD_No_62_M"
## [13] "29_LV_IHD_No_62_M" "30_LV_IHD_No_66_M"
## [1] 5.285197
highlight_df=df1
highlight_df$colorcode=as.character(ifelse(highlight_df$neglogP_value>adj_to_p(df1)&highlight_df$logFC>0,1,ifelse(highlight_df$neglogP_value>adj_to_p(df1)&highlight_df$logFC<0,-1,0)))
dim(highlight_df)
## [1] 19283 10
## [1] 19255 10
g1=ggplot2::ggplot(highlight_df, aes(logFC,neglogP_value,label=name))+
geom_point(aes(x=logFC,y=neglogP_value,color=colorcode)) +
xlab("log2 fold change") + ylab("-log10 p-value")+ggtitle("IHD-T2DM vs Donor") +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),aspect.ratio = 1) +
geom_hline(yintercept = adj_to_p(df1), linetype = "dashed", colour = "red")+scale_color_manual(values = c("blue","gray","red"))
ggplotly(g1)
## [1] 1907 9
g1=ggplot(highlight_df, aes(logFC,neglogP_value,label=name))+
geom_point() +
xlab("log2 fold change") + ylab("-log10 p-value")+ggtitle("IHD-T2DM vs IHD-NOT2DM") +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),aspect.ratio = 1)+geom_text_repel(aes(x=logFC,y=neglogP_value,label=name),subset(highlight_df,(logFC>4|-logFC>4|neglogP_value>4)))
#ggsave(filename="plot1.pdf",g1,height = 10,width = 8)
g1
expressiondt=protein_expression3
experiment_data_noimputation=expressiondt[,union(grep("_IHD_Yes",colnames(expressiondt)),grep("Donor_No",colnames(expressiondt)))]
#colnames(experiment_data_noimputation)
dim(experiment_data_noimputation)
## [1] 9037 30
#filtering out with >25% samples missing
experiment_data_noimputation=experiment_data_noimputation[which(rowMeans(!is.na(experiment_data_noimputation)) > 0.25), ]
dim(experiment_data_noimputation)
## [1] 4120 30
## [1] "2_LV_IHD_Yes_45_M" "3_LV_IHD_Yes_46_F" "4_LV_IHD_Yes_49_M"
## [4] "6_LV_IHD_Yes_51_M" "5_LV_IHD_Yes_50_M" "7_LV_IHD_Yes_51_M"
## [7] "14_LV_IHD_Yes_65_M" "9_LV_IHD_Yes_55_M" "12_LV_IHD_Yes_59_M"
## [10] "13_LV_IHD_Yes_59_M" "10_LV_IHD_Yes_56_M" "11_LV_IHD_Yes_56_M"
## [13] "8_LV_IHD_Yes_53_M" "1_LV_IHD_Yes_41_M" "110_LV_Donor_No_65_F"
## [16] "101_LV_Donor_No_54_M" "94_LV_Donor_No_47_M" "98_LV_Donor_No_51_F"
## [19] "105_LV_Donor_No_56_M" "100_LV_Donor_No_53_F" "109_LV_Donor_No_65_M"
## [22] "99_LV_Donor_No_53_M" "97_LV_Donor_No_49_F" "108_LV_Donor_No_62_F"
## [25] "102_LV_Donor_No_54_F" "96_LV_Donor_No_48_M" "106_LV_Donor_No_60_M"
## [28] "95_LV_Donor_No_47_F" "107_LV_Donor_No_62_M" "104_LV_Donor_No_55_F"
diabetes=grep("Yes",colnames(experiment_data_noimputation))
experiment_data_noimputation1=as.data.frame(t(experiment_data_noimputation))
rownames(experiment_data_noimputation1)=1:dim(experiment_data_noimputation1)[1]
experiment_data_noimputation1$y=ifelse(rownames(experiment_data_noimputation1) %in% diabetes,1,0)
#de analysis-- should be on the non-imputed data
groupname<-as.factor(experiment_data_noimputation1$y)
design <- model.matrix(~ groupname + 0)
fit <- lmFit(experiment_data_noimputation, design)
cont.matrix <- makeContrasts(groupname1-groupname0, levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2)
tT <- topTable(fit2)
df1 <- topTable(fit2, number=nrow(fit2), genelist=rownames(experiment_data_noimputation))
df1$name=rownames(df1)
df1$neglogP_value = -log10(df1$P.Value)
protein_dmvsd=df1
expressiondt=protein_expression3
experiment_data_noimputation=expressiondt[,union(grep("_IHD_No",colnames(expressiondt)),grep("Donor_No",colnames(expressiondt)))]
#colnames(experiment_data_noimputation)
dim(experiment_data_noimputation)
## [1] 9037 32
#filtering out with >25% samples missing
experiment_data_noimputation=experiment_data_noimputation[which(rowMeans(!is.na(experiment_data_noimputation)) > 0.25), ]
dim(experiment_data_noimputation)
## [1] 4052 32
## [1] "17_LV_IHD_No_43_F" "26_LV_IHD_No_54_F" "27_LV_IHD_No_62_F"
## [4] "24_LV_IHD_No_54_M" "16_LV_IHD_No_42_F" "22_LV_IHD_No_50_M"
## [7] "21_LV_IHD_No_49_F" "19_LV_IHD_No_47_F" "20_LV_IHD_No_48_F"
## [10] "23_LV_IHD_No_50_M" "28_LV_IHD_No_62_M" "15_LV_IHD_No_41_M"
## [13] "29_LV_IHD_No_62_M" "18_LV_IHD_No_45_M" "30_LV_IHD_No_66_M"
## [16] "25_LV_IHD_No_54_M" "110_LV_Donor_No_65_F" "101_LV_Donor_No_54_M"
## [19] "94_LV_Donor_No_47_M" "98_LV_Donor_No_51_F" "105_LV_Donor_No_56_M"
## [22] "100_LV_Donor_No_53_F" "109_LV_Donor_No_65_M" "99_LV_Donor_No_53_M"
## [25] "97_LV_Donor_No_49_F" "108_LV_Donor_No_62_F" "102_LV_Donor_No_54_F"
## [28] "96_LV_Donor_No_48_M" "106_LV_Donor_No_60_M" "95_LV_Donor_No_47_F"
## [31] "107_LV_Donor_No_62_M" "104_LV_Donor_No_55_F"
diabetes=grep("IHD",colnames(experiment_data_noimputation))
experiment_data_noimputation1=as.data.frame(t(experiment_data_noimputation))
rownames(experiment_data_noimputation1)=1:dim(experiment_data_noimputation1)[1]
experiment_data_noimputation1$y=ifelse(rownames(experiment_data_noimputation1) %in% diabetes,1,0)
#de analysis-- should be on the non-imputed data
groupname<-as.factor(experiment_data_noimputation1$y)
design <- model.matrix(~ groupname + 0)
fit <- lmFit(experiment_data_noimputation, design)
cont.matrix <- makeContrasts(groupname1-groupname0, levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2)
tT <- topTable(fit2)
df1 <- topTable(fit2, number=nrow(fit2), genelist=rownames(experiment_data_noimputation))
df1$name=rownames(df1)
df1$neglogP_value = -log10(df1$P.Value)
protein_nodmvsd=df1
expressiondt=protein_expression3
experiment_data_noimputation=expressiondt[,union(grep("_IHD_Yes",colnames(expressiondt)),grep("IHD_No",colnames(expressiondt)))]
#colnames(experiment_data_noimputation)
dim(experiment_data_noimputation)
## [1] 9037 30
#filtering out with >25% samples missing
experiment_data_noimputation=experiment_data_noimputation[which(rowMeans(!is.na(experiment_data_noimputation)) > 0.25), ]
dim(experiment_data_noimputation)
## [1] 4075 30
## [1] "2_LV_IHD_Yes_45_M" "3_LV_IHD_Yes_46_F" "4_LV_IHD_Yes_49_M"
## [4] "6_LV_IHD_Yes_51_M" "5_LV_IHD_Yes_50_M" "7_LV_IHD_Yes_51_M"
## [7] "14_LV_IHD_Yes_65_M" "9_LV_IHD_Yes_55_M" "12_LV_IHD_Yes_59_M"
## [10] "13_LV_IHD_Yes_59_M" "10_LV_IHD_Yes_56_M" "11_LV_IHD_Yes_56_M"
## [13] "8_LV_IHD_Yes_53_M" "1_LV_IHD_Yes_41_M" "17_LV_IHD_No_43_F"
## [16] "26_LV_IHD_No_54_F" "27_LV_IHD_No_62_F" "24_LV_IHD_No_54_M"
## [19] "16_LV_IHD_No_42_F" "22_LV_IHD_No_50_M" "21_LV_IHD_No_49_F"
## [22] "19_LV_IHD_No_47_F" "20_LV_IHD_No_48_F" "23_LV_IHD_No_50_M"
## [25] "28_LV_IHD_No_62_M" "15_LV_IHD_No_41_M" "29_LV_IHD_No_62_M"
## [28] "18_LV_IHD_No_45_M" "30_LV_IHD_No_66_M" "25_LV_IHD_No_54_M"
diabetes=grep("Yes",colnames(experiment_data_noimputation))
experiment_data_noimputation1=as.data.frame(t(experiment_data_noimputation))
rownames(experiment_data_noimputation1)=1:dim(experiment_data_noimputation1)[1]
experiment_data_noimputation1$y=ifelse(rownames(experiment_data_noimputation1) %in% diabetes,1,0)
#de analysis-- should be on the non-imputed data
groupname<-as.factor(experiment_data_noimputation1$y)
design <- model.matrix(~ groupname + 0)
fit <- lmFit(experiment_data_noimputation, design)
cont.matrix <- makeContrasts(groupname1-groupname0, levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2)
tT <- topTable(fit2)
df1 <- topTable(fit2, number=nrow(fit2), genelist=rownames(experiment_data_noimputation))
df1$name=rownames(df1)
df1$neglogP_value = -log10(df1$P.Value)
protein_dmvnodm=df1
protein_dmvsd=protein_dmvsd[,c(1,2,3,4,5)]
protein_nodmvsd=protein_nodmvsd[,c(1,2,3,4,5)]
protein_dmvnodm=protein_dmvnodm[,c(1,2,3,4,5)]
rna_dmvsd=rna_dmvsd[,c(1,2,3,4,5)]
rna_dmvsnodm=rna_dmvsnodm[,c(1,2,3,4,5)]
rna_nodmvsd=rna_nodmvsd[,c(1,2,3,4,5)]
dmvsd=merge(protein_dmvsd,rna_dmvsd,by.x="ID",by.y="ID")
nodmvsd=merge(protein_nodmvsd,rna_nodmvsd,by.x="ID",by.y="ID")
dmvsnodm=merge(protein_dmvnodm,rna_dmvsnodm,by.x="ID",by.y="ID")
plotdt=dmvsd[dmvsd$AveExpr.y>=5,]
g1=ggplot(plotdt, aes(x=logFC.x, y=logFC.y)) +
geom_point( mapping = aes(color=AveExpr.y),size=1)+
theme(axis.text.x = element_text(angle = 60, hjust = 1),panel.background =element_rect(fill = "white"),
panel.grid.major = element_line(size = 0.1, linetype = 'solid',colour = "gray"),
panel.grid.minor = element_line(size = 0.1, linetype = 'solid',
colour = "white"),aspect.ratio = 1)+
ggtitle("IHD-DM VS Donor")+
xlab("lfc proteins")+
ylab("lfc rna-seq")+scale_colour_gradientn(colours=rev(c("#FF0000", "#FFFF00" ,"#00FF00", "#00FFFF","#0000FF")))+geom_abline(slope = 1,intercept = 0)
g1
#ggsave("main_figure6b.pdf",plot=g1)
plotdt=nodmvsd[nodmvsd$AveExpr.y>=5,]
g2=ggplot(plotdt, aes(x=logFC.x, y=logFC.y)) +
geom_point( mapping = aes(color=AveExpr.y),size=1)+
theme(axis.text.x = element_text(angle = 60, hjust = 1),panel.background =element_rect(fill = "white"),
panel.grid.major = element_line(size = 0.1, linetype = 'solid',colour = "gray"),
panel.grid.minor = element_line(size = 0.1, linetype = 'solid',
colour = "white"),aspect.ratio = 1)+
ggtitle("IHD-no DM VS Donor")+
xlab("lfc proteins")+
ylab("lfc rna-seq")+scale_colour_gradientn(colours=rev(c("#FF0000", "#FFFF00" ,"#00FF00", "#00FFFF","#0000FF")))+geom_abline(slope = 1,intercept = 0)
g2
#ggsave("main_figure6a.pdf",plot=g2)
plotdt=dmvsnodm[dmvsnodm$AveExpr.y>=5,]
g3=ggplot(plotdt, aes(x=logFC.x, y=logFC.y)) +
geom_point( mapping = aes(color=AveExpr.y),size=1)+
theme(axis.text.x = element_text(angle = 60, hjust = 1),panel.background =element_rect(fill = "white"),
panel.grid.major = element_line(size = 0.1, linetype = 'solid',colour = "gray"),
panel.grid.minor = element_line(size = 0.1, linetype = 'solid',
colour = "white"),aspect.ratio = 1)+
ggtitle("IHD-DM VS IHD-no DM")+
xlab("lfc proteins")+
ylab("lfc rna-seq")+scale_colour_gradientn(colours=rev(c("#FF0000", "#FFFF00" ,"#00FF00", "#00FFFF","#0000FF")))+geom_abline(slope = 1,intercept = 0)
g3
## [1] 21.82
protein_expression4=protein_expression3[rownames(protein_expression3)%in%dmvsd$ID,]
rna_expression2=rna_expression[rownames(rna_expression)%in%dmvsd$ID,]
dim(protein_expression4)
dim(rna_expression2)
dim(complete.cases(rna_expression2))
protein_expression4$ID=rownames(protein_expression4)
rna_expression2$ID=rownames(rna_expression2)
mergedt=merge(protein_expression4,rna_expression2,by.x="ID",by.y="ID")
dim(mergedt)
colnames(mergedt)
mergedt=mergedt[,c(1,grep(".x",colnames(mergedt)),grep(".y",colnames(mergedt)))]
colnames(mergedt)
for(i in 1:nrow(mergedt)){
mergedt[i,44]=cor(as.vector(as.numeric(mergedt[i,2:22])),as.vector(as.numeric(mergedt[i,23:43])),use = "complete.obs")
mergedt[i,45]=cor(as.vector(as.numeric(mergedt[i,2:22])),as.vector(as.numeric(mergedt[i,23:43])),use = "na.or.complete")
mergedt[i,46]=cor(as.vector(as.numeric(mergedt[i,2:22])),as.vector(as.numeric(mergedt[i,23:43])),use = "pairwise.complete.obs")
}
cordt=mergedt[,c(1,44,45,46)]
plotdt=merge(cordt,dmvsd, by.x="ID",by.y="ID")
# ggplot(plotdt, aes(x=V44, y=AveExpr.y)) +
# geom_point( mapping = aes(color=AveExpr.y),size=1)+
# theme(axis.text.x = element_text(angle = 60, hjust = 1),panel.background =element_rect(fill = "white"),
# panel.grid.major = element_line(size = 0.1, linetype = 'solid',colour = "gray"),
# panel.grid.minor = element_line(size = 0.1, linetype = 'solid',
# colour = "white"),aspect.ratio = 1)+
# ggtitle("Correlation vs average expression")+
# xlab("correlation")+
# ylab("average in rna")+scale_colour_gradientn(colours=rev(c("#FF0000", "#FFFF00" ,"#00FF00", "#00FFFF","#0000FF")))
#
# ggplot(plotdt, aes(x=V45, y=AveExpr.y)) +
# geom_point( mapping = aes(color=AveExpr.y),size=1)+
# theme(axis.text.x = element_text(angle = 60, hjust = 1),panel.background =element_rect(fill = "white"),
# panel.grid.major = element_line(size = 0.1, linetype = 'solid',colour = "gray"),
# panel.grid.minor = element_line(size = 0.1, linetype = 'solid',
# colour = "white"),aspect.ratio = 1)+
# ggtitle("Correlation vs average expression")+
# xlab("correlation")+
# ylab("average in rna")+scale_colour_gradientn(colours=rev(c("#FF0000", "#FFFF00" ,"#00FF00", "#00FFFF","#0000FF")))
#Jean wants V46
ggplot(plotdt, aes(x=AveExpr.y, y=V46)) +
geom_point( mapping = aes(color=AveExpr.y),size=1)+
theme(axis.text.x = element_text(angle = 60, hjust = 1),panel.background =element_rect(fill = "white"),
panel.grid.major = element_line(size = 0.1, linetype = 'solid',colour = "gray"),
panel.grid.minor = element_line(size = 0.1, linetype = 'solid',
colour = "white"),aspect.ratio = 1)+
ggtitle("Correlation vs average expression")+
xlab("average in rna")+
ylab("correlation")+scale_colour_gradientn(colours=rev(c("#FF0000", "#FFFF00" ,"#00FF00", "#00FFFF","#0000FF")))
filtereddt=plotdt[plotdt$V46>=0.8,c(1,4,10)]
filtereddt=filtereddt[-1,]
colnames(filtereddt)[c(2,3)]=c("correlation","RNA AveExpr")
filtereddt=na.omit(filtereddt)
datatable(na.omit(filtereddt))
tf_dt=read.csv("~/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/clean_up_folder/analysis/DatabaseExtract_v_1.01.csv")
tf_dt2=tf_dt[tf_dt$Is.TF.=="Yes",]
#tf_dt3=tf_dt2[tf_dt2$HGNC.symbol%in%filtereddt$ID,]
tf_dt3=merge(filtereddt,tf_dt2,by.x="ID",by.y="HGNC.symbol")
datatable(tf_dt3)
length(unique(tf_dt2$HGNC.symbol))
filtereddt$ID
*20221116 cut the values and get three groups for Ben (manual match those samples, pairwise correlation)
protein_expression4=protein_expression3[rownames(protein_expression3)%in%dmvsd$ID,]
rna_expression2=rna_expression[rownames(rna_expression)%in%dmvsd$ID,]
dim(protein_expression4)
## [1] 3506 46
## [1] 3506 21
## NULL
protein_expression4$ID=rownames(protein_expression4)
rna_expression2$ID=rownames(rna_expression2)
mergedt=merge(protein_expression4,rna_expression2,by.x="ID",by.y="ID")
dim(mergedt)
## [1] 3506 68
## [1] "ID" "7_LV_IHD_Yes_51_M.x" "14_LV_IHD_Yes_65_M.x"
## [4] "9_LV_IHD_Yes_55_M.x" "13_LV_IHD_Yes_59_M.x" "10_LV_IHD_Yes_56_M.x"
## [7] "11_LV_IHD_Yes_56_M.x" "8_LV_IHD_Yes_53_M.x" "27_LV_IHD_No_62_F.x"
## [10] "24_LV_IHD_No_54_M.x" "22_LV_IHD_No_50_M.x" "28_LV_IHD_No_62_M.x"
## [13] "29_LV_IHD_No_62_M.x" "30_LV_IHD_No_66_M.x" "25_LV_IHD_No_54_M.x"
## [16] "110_LV_Donor_No_65_F.x" "101_LV_Donor_No_54_M.x" "105_LV_Donor_No_56_M.x"
## [19] "108_LV_Donor_No_62_F.x" "106_LV_Donor_No_60_M.x" "107_LV_Donor_No_62_M.x"
## [22] "104_LV_Donor_No_55_F.x" "7_LV_IHD_Yes_51_M.y" "8_LV_IHD_Yes_53_M.y"
## [25] "9_LV_IHD_Yes_55_M.y" "10_LV_IHD_Yes_56_M.y" "11_LV_IHD_Yes_56_M.y"
## [28] "13_LV_IHD_Yes_59_M.y" "14_LV_IHD_Yes_65_M.y" "22_LV_IHD_No_50_M.y"
## [31] "24_LV_IHD_No_54_M.y" "25_LV_IHD_No_54_M.y" "27_LV_IHD_No_62_F.y"
## [34] "28_LV_IHD_No_62_M.y" "29_LV_IHD_No_62_M.y" "30_LV_IHD_No_66_M.y"
## [37] "101_LV_Donor_No_54_M.y" "104_LV_Donor_No_55_F.y" "105_LV_Donor_No_56_M.y"
## [40] "106_LV_Donor_No_60_M.y" "107_LV_Donor_No_62_M.y" "108_LV_Donor_No_62_F.y"
## [43] "110_LV_Donor_No_65_F.y"
mergedt_g1=mergedt[,c(1,2,8,4,6,7,5,3,23:29)]
mergedt_g2=mergedt[,c(1,11,10,15,9,12,13,14,30:36)]
mergedt_g3=mergedt[,c(1,17,22,18,20,21,19,16,37:43)]
for(i in 1:nrow(mergedt)){
mergedt[i,44]=cor(as.vector(as.numeric(mergedt[i,c(2,8,4,6,7,5,3,11,10,15,9,12,13,14,17,22,18,20,21,19,16)])),as.vector(as.numeric(mergedt[i,23:43])),use = "pairwise.complete.obs")
}
cordt=mergedt[,c(1,44)]
plotdt=merge(cordt,dmvsd, by.x="ID",by.y="ID")
plotdt2=plotdt
plotdt2=plotdt2[plotdt2$AveExpr.y>=5,]
gg=ggplot(plotdt2, aes(x=AveExpr.y, y=V44)) +
geom_point( mapping = aes(color=AveExpr.y),size=1)+
theme(axis.text.x = element_text(angle = 60, hjust = 1),panel.background =element_rect(fill = "white"),
panel.grid.major = element_line(size = 0.1, linetype = 'solid',colour = "gray"),
panel.grid.minor = element_line(size = 0.1, linetype = 'solid',
colour = "white"),aspect.ratio = 1)+
ggtitle("Correlation vs average expression")+
xlab("average in rna")+
ylab("correlation")+scale_colour_gradientn(colours=rev(c("#FF0000", "#FFFF00" ,"#00FF00", "#00FFFF","#0000FF")))+geom_hline(yintercept=0.8)
gg
#ggsave("supp_figure_rna.pdf",plot=gg)
filtereddt=plotdt2[plotdt2$V44>=0.8,which(colnames(plotdt2)%in%c("ID","V44","AveExpr.y"))]
filtereddt=filtereddt[-1,]
colnames(filtereddt)[c(2,3)]=c("correlation","RNA AveExpr")
filtereddt=na.omit(filtereddt)
datatable(na.omit(filtereddt))
tf_dt=read.csv("~/Dropbox (Sydney Uni)/Diabetic cardiomyopathy/YUNWEI USE THIS/clean_up_folder/analysis/DatabaseExtract_v_1.01.csv")
tf_dt2=tf_dt[tf_dt$Is.TF.=="Yes",]
#tf_dt3=tf_dt2[tf_dt2$HGNC.symbol%in%filtereddt$ID,]
tf_dt3=merge(filtereddt,tf_dt2,by.x="ID",by.y="HGNC.symbol")
datatable(tf_dt3)
## [1] 1639
## [1] "ANO6" "B4GALT1" "CTPS2" "CYBRD1" "DACT1" "DLGAP4" "EIF4G3"
## [8] "ETV6" "GLRX2" "GPC6" "GUCY1A1" "H2AX" "HUWE1" "LGR6"
## [15] "MXRA8" "NES" "PLA2G4F" "PNPLA6" "PODN" "PXMP2" "RAB23"
## [22] "RALGAPB" "RBBP9" "SBF2" "SCYL2" "STAT5B" "TNC" "UCHL1"
## [29] "XIRP2"
## [1] 3506 15
mergedt_g1$na_sum=rowSums(is.na(mergedt_g1))
mergedt_g1=mergedt_g1[which(mergedt_g1$na_sum<1),]
dim(mergedt_g1)
## [1] 2198 16
## [1] "ID" "7_LV_IHD_Yes_51_M.x" "8_LV_IHD_Yes_53_M.x"
## [4] "9_LV_IHD_Yes_55_M.x" "10_LV_IHD_Yes_56_M.x" "11_LV_IHD_Yes_56_M.x"
## [7] "13_LV_IHD_Yes_59_M.x" "14_LV_IHD_Yes_65_M.x" "7_LV_IHD_Yes_51_M.y"
## [10] "8_LV_IHD_Yes_53_M.y" "9_LV_IHD_Yes_55_M.y" "10_LV_IHD_Yes_56_M.y"
## [13] "11_LV_IHD_Yes_56_M.y" "13_LV_IHD_Yes_59_M.y" "14_LV_IHD_Yes_65_M.y"
for(i in 1:nrow(mergedt_g1)){
mergedt_g1[i,16]=cor(as.vector(as.numeric(mergedt_g1[i,2:8])),as.vector(as.numeric(mergedt_g1[i,9:15])),use = "pairwise.complete.obs")
}
# nasum_g1=rowSums(is.na(mergedt_g1))
# which(rownames(rna_expression2)=="ALDH1A3")
# nasum_g1[146]#high missing values
cordt=mergedt_g1[,c(1,16)]
plotdt=merge(cordt,dmvsd, by.x="ID",by.y="ID")
plotdt2=plotdt
plotdt2=plotdt2[plotdt2$AveExpr.y>=5,]
ggplot(plotdt2, aes(x=AveExpr.y, y=V16)) +
geom_point( mapping = aes(color=AveExpr.y),size=1)+
theme(axis.text.x = element_text(angle = 60, hjust = 1),panel.background =element_rect(fill = "white"),
panel.grid.major = element_line(size = 0.1, linetype = 'solid',colour = "gray"),
panel.grid.minor = element_line(size = 0.1, linetype = 'solid',
colour = "white"),aspect.ratio = 1)+
ggtitle("Correlation vs average expression")+
xlab("average in rna")+
ylab("correlation")+scale_colour_gradientn(colours=rev(c("#FF0000", "#FFFF00" ,"#00FF00", "#00FFFF","#0000FF")))+geom_hline(yintercept=0.8)
filtereddt=plotdt2[plotdt2$V16>=0.8,which(colnames(plotdt2)%in%c("ID","V16","AveExpr.y"))]
filtereddt=filtereddt[-1,]
colnames(filtereddt)[c(2,3)]=c("correlation","RNA AveExpr")
filtereddt=na.omit(filtereddt)
dim(filtereddt)
## [1] 210 3
## [1] 1 31
## [1] 210
## [1] 3506 15
mergedt_g2$na_sum=rowSums(is.na(mergedt_g2))
mergedt_g2=mergedt_g2[which(mergedt_g2$na_sum<1),]
dim(mergedt_g2)
## [1] 2098 16
## [1] "ID" "22_LV_IHD_No_50_M.x" "24_LV_IHD_No_54_M.x"
## [4] "25_LV_IHD_No_54_M.x" "27_LV_IHD_No_62_F.x" "28_LV_IHD_No_62_M.x"
## [7] "29_LV_IHD_No_62_M.x" "30_LV_IHD_No_66_M.x" "22_LV_IHD_No_50_M.y"
## [10] "24_LV_IHD_No_54_M.y" "25_LV_IHD_No_54_M.y" "27_LV_IHD_No_62_F.y"
## [13] "28_LV_IHD_No_62_M.y" "29_LV_IHD_No_62_M.y" "30_LV_IHD_No_66_M.y"
for(i in 1:nrow(mergedt_g2)){
mergedt_g2[i,16]=cor(as.vector(as.numeric(mergedt_g2[i,2:8])),as.vector(as.numeric(mergedt_g2[i,9:15])),use = "pairwise.complete.obs")
}
cordt=mergedt_g2[,c(1,16)]
plotdt=merge(cordt,dmvsd, by.x="ID",by.y="ID")
plotdt2=plotdt
plotdt2=plotdt2[plotdt2$AveExpr.y>=5,]
ggplot(plotdt2, aes(x=AveExpr.y, y=V16)) +
geom_point( mapping = aes(color=AveExpr.y),size=1)+
theme(axis.text.x = element_text(angle = 60, hjust = 1),panel.background =element_rect(fill = "white"),
panel.grid.major = element_line(size = 0.1, linetype = 'solid',colour = "gray"),
panel.grid.minor = element_line(size = 0.1, linetype = 'solid',
colour = "white"),aspect.ratio = 1)+
ggtitle("Correlation vs average expression")+
xlab("average in rna")+
ylab("correlation")+scale_colour_gradientn(colours=rev(c("#FF0000", "#FFFF00" ,"#00FF00", "#00FFFF","#0000FF")))+geom_hline(yintercept=0.8)
filtereddt=plotdt2[plotdt2$V16>=0.8,which(colnames(plotdt2)%in%c("ID","V16","AveExpr.y"))]
filtereddt=filtereddt[-1,]
colnames(filtereddt)[c(2,3)]=c("correlation","RNA AveExpr")
filtereddt=na.omit(filtereddt)
dim(filtereddt)
## [1] 86 3
## [1] 0 31
## [1] 86
## [1] 3506 15
mergedt_g3$na_sum=rowSums(is.na(mergedt_g3))
mergedt_g3=mergedt_g3[which(mergedt_g3$na_sum<1),]
dim(mergedt_g3)
## [1] 2266 16
## [1] "ID" "101_LV_Donor_No_54_M.x" "104_LV_Donor_No_55_F.x"
## [4] "105_LV_Donor_No_56_M.x" "106_LV_Donor_No_60_M.x" "107_LV_Donor_No_62_M.x"
## [7] "108_LV_Donor_No_62_F.x" "110_LV_Donor_No_65_F.x" "101_LV_Donor_No_54_M.y"
## [10] "104_LV_Donor_No_55_F.y" "105_LV_Donor_No_56_M.y" "106_LV_Donor_No_60_M.y"
## [13] "107_LV_Donor_No_62_M.y" "108_LV_Donor_No_62_F.y" "110_LV_Donor_No_65_F.y"
for(i in 1:nrow(mergedt_g3)){
mergedt_g3[i,16]=cor(as.vector(as.numeric(mergedt_g3[i,2:8])),as.vector(as.numeric(mergedt_g3[i,9:15])),use = "pairwise.complete.obs")
}
cordt=mergedt_g3[,c(1,16)]
plotdt=merge(cordt,dmvsd, by.x="ID",by.y="ID")
plotdt2=plotdt
plotdt2=plotdt2[plotdt2$AveExpr.y>=5,]
ggplot(plotdt2, aes(x=AveExpr.y, y=V16)) +
geom_point( mapping = aes(color=AveExpr.y),size=1)+
theme(axis.text.x = element_text(angle = 60, hjust = 1),panel.background =element_rect(fill = "white"),
panel.grid.major = element_line(size = 0.1, linetype = 'solid',colour = "gray"),
panel.grid.minor = element_line(size = 0.1, linetype = 'solid',
colour = "white"),aspect.ratio = 1)+
ggtitle("Correlation vs average expression")+
xlab("average in rna")+
ylab("correlation")+scale_colour_gradientn(colours=rev(c("#FF0000", "#FFFF00" ,"#00FF00", "#00FFFF","#0000FF")))+geom_hline(yintercept=0.8)
filtereddt=plotdt2[plotdt2$V16>=0.8,which(colnames(plotdt2)%in%c("ID","V16","AveExpr.y"))]
filtereddt=filtereddt[-1,]
colnames(filtereddt)[c(2,3)]=c("correlation","RNA AveExpr")
filtereddt=na.omit(filtereddt)
dim(filtereddt)
## [1] 63 3
## [1] 0 31
## [1] 63