# R commands to reanalyze the data from Lin et al (2014) starting from the raw fragment counts

#1. load data
datasets = as.data.frame(scan("Stanford_datasets.txt",list(setname="",seqBatch="",species="",tissue=""),sep="\t"))
rawCounts <- as.matrix(read.table('Stanford_datasets_rawCountsMat.txt',header=FALSE,sep='\t'))
colnames(rawCounts) <- datasets$setname
geneDetails <- as.data.frame(scan("ortholog_GC_table.txt",skip=1,list(mouse_name="",mouse_GC = 0.0,human_name = "",human_GC=0.0)))
rownames(rawCounts) <- geneDetails$human_name 
rownames(datasets) <- datasets$setname
rownames(geneDetails) <- geneDetails$human_name 

#2. filter out lower 30% + mitochondrial genes
rowSums = apply(rawCounts,1,function(x) sum(x))
quantile(rowSums,probs = 0.3) # result is 2947.9 
filter <- apply(rawCounts,1,function(x) sum(x)>2947.9 )
mt <- grep("mt-",geneDetails$mouse_name)
filteredNames <- setdiff(rownames(rawCounts[filter,]),rownames(rawCounts[mt,])) 
filteredRawCounts <- rawCounts[filteredNames,]

#3. Use EDASeq to normalize data within lanes, accounting for GC content
library(EDASeq)
GCnormCounts <- filteredRawCounts
GCnormCounts[,1:13] <- withinLaneNormalization(filteredRawCounts[,1:13],geneDetails[filteredNames,"human_GC"],which="loess",round=TRUE)
GCnormCounts[,14:26] <- withinLaneNormalization(filteredRawCounts[,14:26],geneDetails[filteredNames,"mouse_GC"],which="loess",round=TRUE)

#4. depth normalize,using TMM scaling factors - divide by sum, then multiply by mean of sums
library(edgeR)
origColSums <- apply(rawCounts,2,function(x) sum(x))
normFactors <- calcNormFactors(GCnormCounts,method='TMM')
colSums = apply(GCnormCounts,2,function(x) sum(x))
normalizedColSums <- origColSums
i <- 1
while (i<length(colSums)){
  normalizedColSums[i] <- origColSums[i]* normFactors[i]
  i <- i+1
}
meanDepth <- mean(normalizedColSums)
filteredDepthNormCounts <- GCnormCounts
i <- 1
while (i<ncol(filteredDepthNormCounts)){
 filteredDepthNormCounts[,i] <- (GCnormCounts[,i]/normalizedColSums[i])*meanDepth
 i <- i+1
 }

#5. log transformation
logTransformedDepthNormCounts <- log2(filteredDepthNormCounts+1)

#6. use combat on log2 values to remove batch effects 
library(sva)
meta <- data.frame(seqBatch = datasets$seqBatch,tissue=datasets$tissue,species=datasets$species)
design <- model.matrix(~1,data=meta)
combat <- ComBat(dat= logTransformedDepthNormCounts,batch=meta$seqBatch,mod=design,numCovs=NULL,par.prior=TRUE)

#7.plot  heatmaps
library(pheatmap)
pheatmap(cor(combat)) #pearson correlation; complete linkage; euclidean distance
pheatmap(cor(combat),clustering_method="average") #pearson correlation; average linkage
pheatmap(cor(combat),clustering_method="single") #pearson correlation; single linkage
pheatmap(cor(combat),clustering_distance_rows="manhattan",clustering_distance_cols="manhattan") #pearson correlation; manhattan distance, complete linkage
pheatmap(cor(combat),clustering_distance_rows="canberra",clustering_distance_cols="canberra") #pearson correlation; canberra distance, complete linkage
pheatmap(cor(combat,method="spearman")) #spearman correlation; complete linkage
pheatmap(combat, clustering_distance_cols="correlation", show_rownames = FALSE) # clustering of 'clean' log-transformed data, Pearson correlation distance, complete linkage


#8. PCA
transposed_combat <- t(combat)
# perform prcomp on the transposed matrix from which columns (genes) of zero variance have been removed
pca_proc <- prcomp(transposed_combat[,apply(transposed_combat, 2, var, na.rm=TRUE) != 0],scale=TRUE,center=TRUE,retX=TRUE)

summary(pca_proc)
#Importance of components:
#                           PC1     PC2      PC3      PC4      PC5      PC6     PC7      PC8      PC9     PC10     PC11     PC12     PC13     PC14     PC15
#Standard deviation     46.1729 34.5340 29.87140 28.60640 26.55177 25.54213 21.4675 20.27704 18.79279 18.45900 16.80437 16.39394 15.52684 15.21368 14.84465
#Proportion of Variance  0.2068  0.1157  0.08656  0.07938  0.06839  0.06328  0.0447  0.03988  0.03426  0.03305  0.02739  0.02607  0.02339  0.02245  0.02138
#Cumulative Proportion   0.2068  0.3225  0.40904  0.48842  0.55681  0.62009  0.6648  0.70468  0.73894  0.77199  0.79938  0.82546  0.84884  0.87129  0.89267
#                           PC16     PC17     PC18     PC19     PC20     PC21     PC22    PC23    PC24   PC25     PC26
#Standard deviation     13.91730 12.85659 12.61067 11.64782 11.06210 10.50878 10.26773 7.74366 6.08532 4.1913 8.53e-14
#Proportion of Variance  0.01879  0.01603  0.01543  0.01316  0.01187  0.01071  0.01023 0.00582 0.00359 0.0017 0.00e+00
#Cumulative Proportion   0.91146  0.92749  0.94292  0.95608  0.96795  0.97866  0.98889 0.99470 0.99830 1.0000 1.00e+00

plotData = datasets[,c("setname","species","tissue")]
plotData$PC1 <- pca_proc$x[,1]
plotData$PC2 <- pca_proc$x[,2]
plotData$PC3 <- pca_proc$x[,3]
plotData$PC4 <- pca_proc$x[,4]
plotData$PC5 <- pca_proc$x[,5]
plotData$PC6 <- pca_proc$x[,6]

library(ggplot2)
# 2D plots of pairs of principal components
qplot(PC1,PC2,data=plotData,color=species,shape=tissue,xlab="PC1 (21% variability)",ylab="PC2 (12% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))
qplot(PC1,PC3,data=plotData,color=species,shape=tissue,xlab="PC1 (21% variability)",ylab="PC3 (9% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))
qplot(PC2,PC3,data=plotData,color=species,shape=tissue,xlab="PC2 (12% variability)",ylab="PC3 (9% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))
qplot(PC4,PC5,data=plotData,color=species,shape=tissue,xlab="PC4 (8% variability)",ylab="PC5 (7% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))
qplot(PC4,PC6,data=plotData,color=species,shape=tissue,xlab="PC4 (8% variability)",ylab="PC6 (6% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))
qplot(PC5,PC6,data=plotData,color=species,shape=tissue,xlab="PC5 (7% variability)",ylab="PC6 (6% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))

# 2D plots of individual principle components against tissue
qplot(PC1,tissue,data=plotData,color=species,shape=tissue,xlab="PC1 (21% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))
qplot(PC2,tissue,data=plotData,color=species,shape=tissue,xlab="PC2 (12% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))
qplot(PC3,tissue,data=plotData,color=species,shape=tissue,xlab="PC3 (9% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))
qplot(PC4,tissue,data=plotData,color=species,shape=tissue,xlab="PC4 (8% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))
qplot(PC5,tissue,data=plotData,color=species,shape=tissue,xlab="PC5 (7% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))
qplot(PC6,tissue,data=plotData,color=species,shape=tissue,xlab="PC6 (6% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))

# testing for significance of correlations between the matched tissues PC values of human and mouse
cor.test(plotData$PC1[1:13],plotData$PC1[14:26],method="spearman") # rho=0.3076923 ; p-value = 0.3063
cor.test(plotData$PC2[1:13],plotData$PC2[14:26],method="spearman") # rho=0.0.7912088 ; p-value = 0.002082
cor.test(plotData$PC3[1:13],plotData$PC3[14:26],method="spearman") # rho=0.6923077 ; p-value= 0.01115
cor.test(plotData$PC4[1:13],plotData$PC4[14:26],method="spearman") # rho=0.8736264 ; p-value = 6.817e-05
cor.test(plotData$PC5[1:13],plotData$PC5[14:26],method="spearman") # rho=0.7967033 ; p-value=0.001844
cor.test(plotData$PC6[1:13],plotData$PC6[14:26],method="spearman") # rho=0.02197802; p-value=0.9494 

cor.test(plotData$PC1[1:13],plotData$PC1[14:26],method="pearson") # cor=0.2972731  ; p-value = 0.3239
cor.test(plotData$PC2[1:13],plotData$PC2[14:26],method="pearson") # cor=0.8482408 ; p-value = 0.0002478
cor.test(plotData$PC3[1:13],plotData$PC3[14:26],method="pearson") # cor=0.6788079 ; p-value = 0.01074
cor.test(plotData$PC4[1:13],plotData$PC4[14:26],method="pearson") # cor = 0.7004334  ; p-value=0.007668
cor.test(plotData$PC5[1:13],plotData$PC5[14:26],method="pearson") # cor = 0.8382749 ; p-value=0.0003446
cor.test(plotData$PC6[1:13],plotData$PC6[14:26],method="pearson") #cor=-0.02760841 ; p-value = 0.9287

#9. plots of data before batch-correction with ComBat
# heatmap of Pearson correlations
pheatmap(cor(logTransformedDepthNormCounts)) # complete linkage; euclidean distance

#pca 
transposed_logTransformedDepthNormCounts <- t(logTransformedDepthNormCounts)
# perform prcomp on the transposed matrix from which columns (genes) of zero variance have been removed
pca_proc_beforeCombat <- prcomp(transposed_logTransformedDepthNormCounts[,apply(transposed_logTransformedDepthNormCounts, 2, var, na.rm=TRUE) != 0],scale=TRUE,center=TRUE,retX=TRUE)

summary(pca_proc_beforeCombat)
#Importance of components:
#                           PC1     PC2     PC3      PC4      PC5      PC6     PC7      PC8      PC9     PC10     PC11     PC12     PC13     PC14     PC15
#Standard deviation     48.8872 37.0410 32.9175 28.76029 25.87420 23.82773 21.5134 19.10016 17.14287 15.88775 15.04607 14.44407 14.11145 13.42179 12.82748
#Proportion of Variance  0.2318  0.1331  0.1051  0.08024  0.06494  0.05507  0.0449  0.03539  0.02851  0.02449  0.02196  0.02024  0.01932  0.01747  0.01596
#Cumulative Proportion   0.2318  0.3649  0.4700  0.55027  0.61521  0.67028  0.7152  0.75057  0.77907  0.80356  0.82552  0.84576  0.86507  0.88255  0.89851
                           PC16     PC17     PC18     PC19     PC20    PC21    PC22   PC23    PC24    PC25      PC26
#Standard deviation     12.10684 11.67716 11.24912 10.82649 10.29446 9.99497 9.49378 9.2525 8.53607 8.06905 7.199e-14
#Proportion of Variance  0.01422  0.01323  0.01227  0.01137  0.01028 0.00969 0.00874 0.0083 0.00707 0.00632 0.000e+00
#Cumulative Proportion   0.91273  0.92595  0.93823  0.94960  0.95988 0.96957 0.97831 0.9866 0.99368 1.00000 1.000e+00

plotData_beforeCombat = datasets[,c("setname","species","tissue")]
plotData_beforeCombat$PC1 <- pca_proc_beforeCombat$x[,1]
plotData_beforeCombat$PC2 <- pca_proc_beforeCombat$x[,2]
plotData_beforeCombat$PC3 <- pca_proc_beforeCombat$x[,3]
plotData_beforeCombat$PC4 <- pca_proc_beforeCombat$x[,4]
plotData_beforeCombat$PC5 <- pca_proc_beforeCombat$x[,5]
plotData_beforeCombat$PC6 <- pca_proc_beforeCombat$x[,6]

# 2D plots of pairs of principal components
qplot(PC1,PC2,data=plotData_beforeCombat,color=species,shape=tissue,xlab="PC1 (23% variability)",ylab="PC2 (13% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))