# File: alignmentStats.R
# Author: Nikolas Barkas
# Date: September 2016
# Description: Generate alignment statistics for all libraries


# The identifiers here are OX_XXX so they are easy to resolve
alRepB1 <- read.csv('input/stats/alignmentReport_Batch1.txt')
postFiltRepB1 <- read.csv('input/stats/PostFilterReadCount_Batch1.txt', header=F)
colnames(postFiltRepB1) <- c("id","postFilterReads")
alRepB1$id <- substr(x = alRepB1$filename, start=0, stop=6)
b1 <- merge(alRepB1,postFiltRepB1, by="id" )[,c("id","input.count","mapped.count","multiple.count","postFilterReads")]

# The identifiers here are not OX_XXX so we also need to load the samplesheet to resolve them
alRepB2 <- read.csv('input/stats/alignmentReport_Batch2.txt')
alRepB2$id <- substr(alRepB2$filename, start=7, stop=9)
alRepB2$id <- gsub("_","",alRepB2$id)

postFiltRepB2 <- read.csv('input/stats/PostFilterReadCount_Batch2.txt', header = F)
colnames(postFiltRepB2) <- c("id","postFilterReads")
postFiltRepB2$id <- substr(postFiltRepB2$id, start=0,stop=3)
postFiltRepB2$id <- gsub("_","",postFiltRepB2$id)
b2 <- merge(postFiltRepB2,alRepB2, by="id" )[,c("id","input.count","mapped.count","multiple.count","postFilterReads")]

sampleSheetB2 <- read.csv('input/stats/sampleSheet_Batch2.csv')
b2map <- sampleSheetB2[,c("rna.prep.id","sample.id")]

b2 <- merge(b2, b2map, by.x="id", by.y='sample.id')
b2 <- b2[,c("rna.prep.id","input.count","mapped.count","multiple.count","postFilterReads")]
colnames(b2) <- c("id","input.count","mapped.count","multiple.count","postFilterReads")

alignStats <- rbind(b1,b2)
rm(alRepB2,alRepB1, b1,b2,b2map,postFiltRepB2,postFiltRepB1,sampleSheetB2)

alignStats <- merge(alignStats,as.data.frame(colData(se)),by.x="id",by.y="rna.prep.id")

alignStats <- alignStats[,c("id","input.count","mapped.count","multiple.count","postFilterReads","treatment","batch", "genotype","cell.population.summary")]

write.csv(alignStats, "output/alignmentStats/alignmentStats.csv")

library(ggplot2)
ggplot(alignStats, aes(x=id, y=input.count, fill=batch)) + geom_bar(stat="identity") + theme_bw() +
  ggtitle("Input Count by Batch") + scale_y_continuous(name="Number of Input Reads") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
ggsave("output/alignmentStats/input.reads.png",width=16,height = 10)

ggplot(alignStats, aes(x=id, y=multiple.count/input.count*100, fill=batch)) + geom_bar(stat="identity") + 
  theme_bw() + scale_y_continuous(name="% multimap") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
ggsave("output/alignmentStats/pc.multimap.png",width=16,height = 10)

ggplot(alignStats, aes(x=id, y=postFilterReads/input.count*100, fill=batch)) + geom_bar(stat="identity") + 
  theme_bw() + scale_y_continuous(name="% Reads Uniquely Mapped",lim=c(0,100)) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) + scale_x_discrete(name="Library Id")
ggsave("output/alignmentStats/pc.post.filter.png",width=16,height = 10)

rm(alignStats)
