
#QC:
#This script executes a standardised QC procedure on cellranger-processed single cell 10x data. It does the following:
# Filters cells and genes based on QC metrics such as mitochondrial ratio, number of genes, total UMI and confident read alignment rate (i.e. the proportion of reads from the cell that were confidently aligned)



## ----receive input from nextflow----------------------------------------------
#Get parameters: args[1] is the directory that contains bin/, input/ and output/,
#args[2] is either a timepoint label or the path/name of a matrix file
args <- commandArgs(TRUE)
if(length(args) < 2){
  #Fail early with a readable message instead of an obscure error on NA further down
  stop('Expected two arguments: <pipeline path> <timepoint or matrix file>', call. = FALSE)
}
path<-args[1]
file<-args[2]

#Load standard package list
source(paste0(path,'/bin/auxiliary/DirectoryChecker.R'))
source(paste0(path,'/bin/auxiliary/PackageLoader.R'))
source(paste0(path,'/bin/auxiliary/SetPlottingParameters.R'))


#parse input, find timepoint from input
#If the argument contains a '.', it is treated as a file name/path rather than a bare timepoint
ispath<-FALSE
timepoint<-file
if(grepl('.', file, fixed=TRUE)){
  ispath<- TRUE
  #Last path component, e.g. 'E16matrix.mtx.gz'
  chunks<-unlist(str_split(file,'/'))
  filename<-chunks[length(chunks)]
  #Text before the first '.', with the first 'matrix' removed, is the timepoint
  filestring<-unlist(str_split(filename,'\\.'))[1]
  timepoint<-str_remove(filestring,'matrix')
}



#Report the RAM figure returned by get_ram()
RAM <- get_ram()
print(paste(RAM, 'RAM passed to Rscript'))

#Use every detected core as a worker for the future plan
ncor <- detectCores()
print(paste(ncor, 'cores detected'))
plan("multiprocess", workers = ncor)

#Allow up to 15000 MiB of globals to be exported to future workers
options(future.globals.maxSize = 15000 * 1024^2)



## ----Setup---------------------------------------------------------------


#Directory that receives all QC output for this timepoint
qc_dir<-paste0(path,'/output/scrnaseq/',timepoint,'_QC')

#Check if directory exists and make it if it doesn't
if(!dir.exists(qc_dir)){
  #recursive=TRUE so that missing parent directories (e.g. output/scrnaseq) are created as well;
  #otherwise dir.create only warns and the pdf() call below cannot open its file
  dir.create(qc_dir,recursive=TRUE)
}


#Set up plot file (first of four partial PDFs that are merged at the end of the script)
plot_file_name<-paste0(qc_dir,'/',timepoint,'_QC_Plots1.pdf')
pdf(file=plot_file_name,width=14,height=7)

#Set up an empty table, one row per QC stage, to fill with values and plot at the end
summaryTable<-data.frame(Stage=c("raw matrix","Total Genes cutoff","MT cutoff","UMI cutoff","Aligment cutoff","Gene filtering","Contamination filtering"),
                         Cells_Removed=numeric(7),
                         Genes_Removed=numeric(7),
                         Remaining_Cells=numeric(7),
                         Remaining_Genes=numeric(7))



## ----- load files ------------------------------------------------------------------------

#All inputs for a sample share this prefix: <path>/input/developmental_single_cell/scrnaseq/<timepoint>
input_prefix <- paste0(path, '/input/developmental_single_cell/scrnaseq/', timepoint)

#Count matrix (Matrix Market format) with its feature and barcode tables
matrix <- readMM(gzfile(paste0(input_prefix, 'matrix.mtx.gz')))

feat <- read.table(gzfile(paste0(input_prefix, 'features.tsv.gz')), header = FALSE, fill = TRUE)

bar <- read.table(gzfile(paste0(input_prefix, 'barcodes.tsv.gz')), header = FALSE, fill = TRUE)


#Record the starting dimensions (rows are genes, columns are cells) in the summary table
summaryTable$Remaining_Genes[1] <- nrow(matrix)
summaryTable$Remaining_Cells[1] <- ncol(matrix)


#Name rows by the first features column (the Ensembl ID, to preserve uniqueness) and columns by barcode.
#Gene symbols are stored separately as meta features below.
rownames(matrix) <- feat[, 1]
colnames(matrix) <- bar[, 1]
SeuratObject <- CreateSeuratObject(matrix, min.cells = 10, project = 'LRP Neuron Differentiation')
SeuratObject[['orig.ident']] <- timepoint
Idents(SeuratObject) <- 'orig.ident'
print('object created')


#Upper-cased gene symbols (second features column), keyed by the first features column
gsym <- data.frame(toupper(feat$V2))
rownames(gsym) <- feat$V1
colnames(gsym) <- 'gene.symbol'
SeuratObject@assays[['RNA']] <- AddMetaData(SeuratObject@assays[['RNA']], gsym)

#Alignment information is optional; later steps branch on align_exists
alignment_file <- paste0(input_prefix, 'alignment.rds')
align_exists <- file.exists(alignment_file)

if (align_exists) {
  #NOTE(review): the file carries an .rds extension but is read with load(), which expects
  #save() output - confirm how the alignment file is written
  x <- load(alignment_file)
  #load() returns the name of the restored object; fetch the object through that name
  percentages <- get(x)
  #Drop the temporary name holder
  rm(x)

  SeuratObject[['pt.conf.align']] <- data.frame(percentages)
}

#Percentage of counts per cell coming from features whose symbol starts with "MT-"
mt_rows <- grep(pattern = "^MT-", SeuratObject@assays[["RNA"]]@meta.features$gene.symbol)
SeuratObject[['percent.mt']] <- PercentageFeatureSet(SeuratObject, features = mt_rows)





## ----- Filter Cells ------------------------------------------------------------------------

#Visualise alignment distribution
if(align_exists){
  hist(SeuratObject[['pt.conf.align']][,1],main='Histogram of Cells by % of Confidently Aligned Reads')
}else{
  #Nothing else is drawn to the first plot file, so add a placeholder page; a PDF device closed
  #without any page is unlikely to be accepted by pdf_combine at the end of the script
  plot.new()
  title(main='No alignment information available')
}


#Note 2 s.d. arrived at following adjustment where 3 s.d. was deemed too permissive
#Unfiltered copy: used to compute thresholds, for the before/after plots and saved at the end
SeuratObjectOld<-SeuratObject

#Working copy that the cell filters are applied to step by step
temp<-SeuratObject


#Compute thresholds
#UMI counts are log-transformed so that "s.d. from the mean" is assessed on an approx. normal dist.
logUMI<-log1p(SeuratObjectOld[['nCount_RNA']])

#Lower UMI bound: 2 s.d. below the mean on the log scale, mapped back to counts
#NOTE(review): exp() rather than expm1() is used to invert log1p(), so the bound is 1 above the
#exact inverse; left unchanged so that filtering results stay the same
lower.umi<-exp(mean(logUMI[,1]) - (2*sd(logUMI[,1])))

if(align_exists){
  #the alignment rate is already close to normal, so we use as-is
  lower.align<-mean(SeuratObjectOld[['pt.conf.align']][,1]) - (2*sd(SeuratObjectOld[['pt.conf.align']][,1]))
}else{
  #Defined even without alignment data because lower.align is saved alongside the pre-QC object
  lower.align<-NA
}


#Find cells with fewer than 700 genes and drop them
tooFew<-WhichCells(temp,expression=nFeature_RNA<700)
temp<-subset(temp,cells=tooFew,invert=TRUE)

#Update summaryTable (differences are relative to the raw matrix, so genes dropped by
#min.cells at object creation are counted here)
summaryTable$Genes_Removed[2]<-dim(matrix)[1]-dim(temp)[1]
summaryTable$Cells_Removed[2]<-dim(matrix)[2]-dim(temp)[2]
summaryTable$Remaining_Genes[2]<-dim(temp)[1]
summaryTable$Remaining_Cells[2]<-dim(temp)[2]

#Remove cells with >10% mitochondrial content
temp<-subset(temp,cells=WhichCells(temp,expression=percent.mt<=10))

#Update summaryTable
summaryTable$Cells_Removed[3]<-summaryTable$Remaining_Cells[2]-dim(temp)[2]
summaryTable$Remaining_Genes[3]<-summaryTable$Remaining_Genes[2]
summaryTable$Remaining_Cells[3]<-dim(temp)[2]


#Remove cells at or below the UMI bound
temp<-subset(temp,subset =  nCount_RNA>lower.umi)

#Update summaryTable
summaryTable$Cells_Removed[4]<-summaryTable$Remaining_Cells[3]-dim(temp)[2]
summaryTable$Remaining_Genes[4]<-summaryTable$Remaining_Genes[3]
summaryTable$Remaining_Cells[4]<-dim(temp)[2]


if(align_exists){
  #Remove cells at or below the alignment bound
  temp<-subset(temp,subset =  pt.conf.align>lower.align)

  #Update summaryTable
  summaryTable$Cells_Removed[5]<-summaryTable$Remaining_Cells[4]-dim(temp)[2]
  summaryTable$Remaining_Genes[5]<-summaryTable$Remaining_Genes[4]
  summaryTable$Remaining_Cells[5]<-dim(temp)[2]
}else{
  #No alignment filter was applied: nothing removed, counts carry over from the UMI stage
  summaryTable$Cells_Removed[5]<-0
  summaryTable$Remaining_Genes[5]<-summaryTable$Remaining_Genes[4]
  summaryTable$Remaining_Cells[5]<-summaryTable$Remaining_Cells[4]
}



#now filter the actual object down to the cells that survived in the working copy
SeuratObject<-subset(SeuratObject,cells=colnames(temp))


#Label every cell of the unfiltered object as Kept/Removed per filter, for the scatter plots
SeuratObjectOld[['mt_filtering']]<-'Kept'
SeuratObjectOld[['mt_filtering']][SeuratObjectOld[['percent.mt']][,1]>10,]<-'Removed'
SeuratObjectOld[['umi_filtering']]<-'Kept'
SeuratObjectOld[['umi_filtering']][SeuratObjectOld[['nCount_RNA']][,1]<lower.umi,]<-'Removed'
SeuratObjectOld[['align_filtering']]<-'Kept'
if(align_exists){
  #pt.conf.align only exists when alignment data was loaded
  SeuratObjectOld[['align_filtering']][SeuratObjectOld[['pt.conf.align']][,1]<lower.align,]<-'Removed'
}

#One colour per known timepoint; any other timepoint is drawn in black
cols<-hue_pal()(3)
names(cols)<-c('E16','P1','P5')
if(!is.na(cols[timepoint])){
  col<-cols[timepoint]
}else{
  col='black'
}

#Close the first plot file
dev.off()



plot_file_name<-paste0(path,'/output/scrnaseq/',timepoint,'_QC/',timepoint,'_QC_Plots2.pdf')
pdf(file=plot_file_name,width=21,height=7)

#Dashed lines mark the thresholds that were applied
plot1 <- FeatureScatter(SeuratObjectOld, feature1 = "nCount_RNA", feature2 = "percent.mt",group.by='mt_filtering',cols=c(col,'gray'))+ggtitle('Mitochondrial Content Filtering') + geom_hline(yintercept=10,linetype="dashed")
plot2 <- FeatureScatter(SeuratObjectOld, feature1 = "nCount_RNA", feature2 = "nFeature_RNA",group.by='umi_filtering',cols=c(col,'gray'))+ggtitle('UMI Filtering') + geom_vline(xintercept=lower.umi,linetype="dashed")
qc_plots<-list(plot1,plot2)
if(align_exists){
  #The alignment panel needs pt.conf.align and lower.align, so it is only built when they exist
  plot3 <- FeatureScatter(SeuratObjectOld, feature1 = "pt.conf.align", feature2 = "nFeature_RNA",group.by='align_filtering',cols=c(col,'gray'))+ggtitle('Confident Alignment Filtering') + geom_vline(xintercept=lower.align,linetype="dashed")
  qc_plots<-c(qc_plots,list(plot3))
}
print(plot_grid(plotlist=qc_plots,ncol=length(qc_plots)))

dev.off()




## -----Filter Genes ------------------------------------------------------------------------


#Load the mouse gene reference (BMout, used below, is presumably defined by this file)
load(paste0(path,'/input/supplementary/Ref_Genes.Rdata'))

#Gene symbols of the current features, in row order
gene_symbols<-SeuratObject@assays[["RNA"]]@meta.features[["gene.symbol"]]

#Pseudo genes: symbols starting with GM ...
gm_hits<-grep(pattern='^GM',gene_symbols)

#... restricted to symbols longer than 5 characters, so short GM-prefixed symbols are not caught
pseudoGenes<-gm_hits[nchar(as.character(gene_symbols[gm_hits]))>5]


#Ribosomal genes: symbols starting with RPS, RPL or MRP (hits concatenated in that order)
#We may not need to remove these ribosomal genes
riboGenes<-unlist(lapply(c('^RPS','^RPL','^MRP'),grep,x=gene_symbols))


#Row names of the RNA assay, matched against the reference gene ids
assay_genes<-rownames(SeuratObject@assays[["RNA"]])

#want to remove all Y genes
YGenesList<-BMout$ensembl_gene_id[which(BMout$chromosome_name=='Y')]
YGene<-which(assay_genes %in% YGenesList)

#remove mitochondrial genes
#We may not need to remove mitochondrial genes
MTGenesList<-BMout$ensembl_gene_id[which(BMout$chromosome_name=='MT')]
MTGenes<-which(assay_genes %in% MTGenesList)


#Row indices to keep: everything not flagged above
drop_idx<-c(pseudoGenes,YGene,MTGenes,riboGenes)
all_idx<-1:length(SeuratObject@assays[["RNA"]]@data@Dimnames[[1]])
keep<-which(!(all_idx %in% drop_idx))

SeuratObject<-subset(SeuratObject,features=as.vector(keep))


#Update summaryTable
summaryTable$Genes_Removed[6]<-summaryTable$Remaining_Genes[5]-nrow(SeuratObject)
summaryTable$Remaining_Genes[6]<-nrow(SeuratObject)
summaryTable$Remaining_Cells[6]<-ncol(SeuratObject)


SeuratObject<-NormalizeData(SeuratObject)

#Cell Cycle. It is not strictly necessary to adjust for cell cycle, our data should contain only neurons targeted for SST expression and should not have significant progenitor contamination

#Translate a vector of gene symbols into the row names of the object's RNA assay.
#Each symbol is matched as a regular expression anchored at a word boundary and the end of the symbol.
symbols_to_rows<-function(symbols,object){
  patterns<-paste0("\\b",symbols,"$")
  hits<-lapply(patterns,function(pat){
    grep(pat,object@assays[["RNA"]]@meta.features[["gene.symbol"]])
  })
  object@assays[["RNA"]]@data@Dimnames[[1]][unlist(hits)]
}

s.genes<-symbols_to_rows(cc.genes$s.genes,SeuratObject)
g2m.genes<-symbols_to_rows(cc.genes$g2m.genes,SeuratObject)

SeuratObject<-CellCycleScoring(SeuratObject,s.features=s.genes,g2m.features=g2m.genes)

#Per-cell difference between S and G2M scores, stored as metadata and regressed out below
ScoreDiffSeuratObject<-data.frame(SeuratObject@meta.data[["S.Score"]] - SeuratObject@meta.data[["G2M.Score"]])
rownames(ScoreDiffSeuratObject)<-SeuratObject@assays[["RNA"]]@data@Dimnames[[2]]

SeuratObject[['ccDiff']]<-ScoreDiffSeuratObject

#Scale with a linear model, all features in a single block
SeuratObject<-ScaleData(
  SeuratObject,
  vars.to.regress=c("nFeature_RNA",'percent.mt','ccDiff'),
  model.use='linear',
  block.size=nrow(SeuratObject)
)






## ----- Initial Processing  ------------------------------------------------------------------------

#Third partial PDF: embeddings before contamination filtering
plot_file_name <- paste0(path, '/output/scrnaseq/', timepoint, '_QC/', timepoint, '_QC_Plots3.pdf')
pdf(file = plot_file_name, width = 14, height = 14)

#Title shared by the plots of this stage, and the principal components fed to tSNE/UMAP
stage_title <- 'Pre-Contamination Filtering'
pc_dims <- 1:15

SeuratObject <- FindVariableFeatures(SeuratObject)
SeuratObject <- RunPCA(SeuratObject)

ElbowPlot(SeuratObject) + ggtitle(stage_title)

SeuratObject <- RunTSNE(SeuratObject, dims = pc_dims)
SeuratObject <- RunUMAP(SeuratObject, dims = pc_dims)
SeuratObject <- FindNeighbors(SeuratObject)
#Note we use a high resolution as some contamination clusters are very small and we want to make sure they aren't grouped with desired cells.
SeuratObject <- FindClusters(SeuratObject, resolution = 1.5)

DimPlot(SeuratObject, reduction = 'tsne') + ggtitle(stage_title)
DimPlot(SeuratObject, reduction = 'umap') + ggtitle(stage_title)
FeaturePlot(SeuratObject, features = c('nCount_RNA', 'nFeature_RNA', 'percent.mt'))






## ----- Seek Contamination ------------------------------------------------------------------------


#Keep the current feature identifiers (row names of the counts slot) as a meta feature before renaming.
#NOTE(review): the column is called 'entrez', but the row names are presumably the Ensembl IDs - confirm
SeuratObject@assays[["RNA"]]@meta.features[["entrez"]]<-SeuratObject@assays[["RNA"]]@counts@Dimnames[[1]]

#Set genes to symbol for better plotting (data slot first, then counts to match)
#NOTE(review): gene symbols are not guaranteed to be unique; duplicates would collide as row names - verify
SeuratObject@assays[["RNA"]]@data@Dimnames[[1]]<-as.character(SeuratObject@assays[["RNA"]]@meta.features[["gene.symbol"]])

SeuratObject@assays[["RNA"]]@counts@Dimnames[[1]]<-SeuratObject@assays[["RNA"]]@data@Dimnames[[1]]

#Identify excitatory neurons/general contamination. They are clustered together

#Get contamination markers that are present in the data
markers<-c('SLC17A7','SLC17A6','NEUROD6','NEUROD2')
contaminationMarkers<-markers[markers %in% SeuratObject@assays[['RNA']]@data@Dimnames[[1]]]

#Collect every cell with non-zero normalised expression of at least one marker.
#Iterating over the markers themselves means an empty marker set simply yields no cells.
cellList<-vector()
for(marker in contaminationMarkers){
  marker_expr<-SeuratObject@assays[['RNA']]@data[marker,]
  cellList<-c(cellList,names(which(marker_expr>0)))
}
cellList<-unique(cellList)

#Remove contamination clusters
print('about to subset')

if(length(cellList)>0){
  #Identify contaminated clusters: fraction of each cluster's cells that are marker positive
  Contamination<-FetchData(SeuratObject,vars='seurat_clusters',cells=cellList)
  Contamination<-table(Contamination)
  Contamination<-Contamination/table(SeuratObject[['seurat_clusters']])

  #remove clusters with >50% contamination markers
  remove_clusters<-names(which(Contamination>0.5))

  cleanedSeuratObject<-subset(SeuratObject,cells=which(SeuratObject[['seurat_clusters']][,1] %in% remove_clusters ),invert=TRUE)

  #Finally remove all loose marker +ve cells (scattered around)
  cleanedSeuratObject<-subset(cleanedSeuratObject,cells=cellList,invert=TRUE)
}else{
  #No marker positive cells (or none of the markers present): nothing to remove
  print('no contamination marker positive cells found, skipping contamination filtering')
  remove_clusters<-character(0)
  cleanedSeuratObject<-SeuratObject
}


#Update summaryTable
summaryTable$Cells_Removed[7]<-summaryTable$Remaining_Cells[6]-dim(cleanedSeuratObject)[2]
summaryTable$Remaining_Genes[7]<-dim(cleanedSeuratObject)[1]
summaryTable$Remaining_Cells[7]<-dim(cleanedSeuratObject)[2]


#Make sure feature names are plain character vectors and that meta.features is keyed by the same names
cleanedSeuratObject@assays[["RNA"]]@data@Dimnames[[1]]<-as.character(cleanedSeuratObject@assays[["RNA"]]@data@Dimnames[[1]])

cleanedSeuratObject@assays[["RNA"]]@counts@Dimnames[[1]]<-as.character(cleanedSeuratObject@assays[["RNA"]]@counts@Dimnames[[1]])

rownames(cleanedSeuratObject@assays[["RNA"]]@meta.features)<-as.character(cleanedSeuratObject@assays[["RNA"]]@counts@Dimnames[[1]])

print('subset done')

#Re-scale the cleaned object with a negative binomial model; features are split into one block per core
cleanedSeuratObject<-ScaleData(cleanedSeuratObject,vars.to.regress = c("nFeature_RNA",'percent.mt','ccDiff'),model.use='negbinom',block.size=round(dim(cleanedSeuratObject)[1]/ncor))
print('final scaling complete')






## ----- Final Processing ------------------------------------------------------------------------

#Reprocess cleaned object as before (plots still go to the third partial PDF)
cleanedSeuratObject<-FindVariableFeatures(cleanedSeuratObject,nfeatures=2000)
cleanedSeuratObject<-RunPCA(cleanedSeuratObject)
ElbowPlot(cleanedSeuratObject)+ggtitle('Post-Contamination Filtering')
cleanedSeuratObject<-RunUMAP(cleanedSeuratObject,dims=1:15)
cleanedSeuratObject<-RunTSNE(cleanedSeuratObject,dims=1:15)
cleanedSeuratObject<-FindNeighbors(cleanedSeuratObject)

DimPlot(cleanedSeuratObject,reduction='tsne')+ggtitle('Post-Contamination Filtering')
DimPlot(cleanedSeuratObject,reduction='umap')+ggtitle('Post-Contamination Filtering')


#Update object to match the filtered, uncontaminated version
processedSeuratObject<-cleanedSeuratObject
preProcessedSeuratObject<-SeuratObjectOld
print(paste('QC complete on',timepoint))

#Common prefix of every output file for this timepoint
qc_out_prefix<-paste0(path,'/output/scrnaseq/',timepoint,'_QC/')

#lower.align is only computed when alignment data was available; save() fails on an
#undefined name, so fall back to NA in that case
if(!exists('lower.align')){
  lower.align<-NA
}

#Save to pipeline directory for later reference
save(processedSeuratObject,file=paste0(qc_out_prefix,'processed',timepoint,'.Rdata'))
save(preProcessedSeuratObject,lower.umi,lower.align,file=paste0(qc_out_prefix,'preQC',timepoint,'.Rdata'))
#Pass to working directory
save(processedSeuratObject,file=paste0('processed',timepoint,'.Rdata'))

print(paste('saving complete on',timepoint))

#Close the third partial PDF
dev.off()

#The four partial PDFs written by this script, in page order
partial_pdfs<-paste0(qc_out_prefix,timepoint,'_QC_Plots',1:4,'.pdf')

#Fourth partial PDF: the summary table of cells/genes removed at each stage
plot_file_name<-partial_pdfs[4]
pdf(file=plot_file_name,width=14,height=14)

grid.table(summaryTable)
dev.off()



#Plot tidying: merge the partial PDFs into one file, then delete the parts
pdf_combine(partial_pdfs,output=paste0(qc_out_prefix,timepoint,'_QC_Plots.pdf'))

file.remove(partial_pdfs)

