

## ----receive input from nextflow----------------------------------------------
# Positional command-line arguments:
#   1. path    - pipeline root; bin/auxiliary and input/ are resolved below it
#   2. markers - R data file that is restored with load() further down
args <- commandArgs(TRUE)

# Fail early with a readable message instead of building "NA/bin/..." paths.
# Interactive sessions are exempt so that `path` and `markers` can still be
# redefined by hand when running locally.
if (!interactive() && length(args) < 2) {
  stop(
    "Expected two arguments: <pipeline root path> <markers R data file>",
    call. = FALSE
  )
}

# If running locally, redefine these two values
path <- args[1]
markers <- args[2]
print(markers)
print(class(markers))


# Pull in the pipeline's shared helper scripts from <path>/bin/auxiliary.
# They are sourced in this fixed order: directory checks, package loading,
# plotting defaults.
source(paste0(path, "/bin/auxiliary/DirectoryChecker.R"))
source(paste0(path, "/bin/auxiliary/PackageLoader.R"))
source(paste0(path, "/bin/auxiliary/SetPlottingParameters.R"))


# Helper for classifying clusters by DE score against marker lists
# (presumably defines DEScoreClassify(), which is called later in this script).
source(paste0(path, "/bin/auxiliary/DEScoreClassify.R"))

# Use all but one of the detected cores. detectCores() may return NA, and on a
# single-core machine the subtraction gives 0; either value would make plan()
# fail, so fall back to a single worker.
ncor <- max(1, detectCores() - 1, na.rm = TRUE)
print(paste(ncor, 'cores detected'))

# NOTE(review): the "multiprocess" strategy is deprecated in recent releases of
# the future package -- confirm the installed version still accepts it.
plan("multiprocess", workers = ncor)

# Raise the limit on globals exported to workers to 15000 MiB.
options(future.globals.maxSize = 15000 * 1024^2)


# Restore the objects stored in the marker file named on the command line.
# Presumably this is where `typeMarkers` (used further down) comes from, since
# nothing else in this script creates it -- TODO confirm.
load(markers)


# Canonical marker table: column 1 is upper-cased and used as the gene symbol,
# column 2 is searched for the cell-type label.
canonmarkers <- read.csv(paste0(path, "/input/supplementary/CanonicalMarkers.csv"))

# From here on `markers` no longer holds the file path; it becomes a named list
# of canonical gene symbols per cell type. Each entry is selected by a regular
# expression applied to the label column.
canonPatterns <- c(
  "Martinotti" = "^Martinotti",
  "Non-Martinotti" = "Non-Martinotti",
  "LRP" = "projecting",
  "Stressed" = "Stress"
)
markers <- lapply(canonPatterns, function(labelPattern) {
  toupper(canonmarkers[grep(labelPattern, canonmarkers[, 2]), 1])
})



# Expression matrix. The first CSV column holds the row identifiers; it is
# moved into the row names and dropped, then the table is transposed so those
# identifiers end up as column names.
matrix <- read.csv(gzfile(paste0(path, "/input/external_reference/P56Allen10x_matrix.csv.gz")))
rownames(matrix) <- matrix[, 1]
matrix <- t(matrix[, -1])

# Gene annotation table; symbols are upper-cased for matching later on.
feat <- read.csv(paste0(path, "/input/external_reference/P56Allen10x_genedata.csv"))
feat$gene_symbol <- toupper(feat$gene_symbol)

# Per-sample metadata, indexed by sample name.
meta <- read.csv(paste0(path, "/input/external_reference/P56Allen10x_metadata.csv"))
rownames(meta) <- meta$sample_name

# Reference dendrogram (presumably provides `GABAdend`, which is pruned and cut
# later in this script -- TODO confirm).
load(paste0(path, "/input/external_reference/P56Allen10x_dendrogram.Rdata"))


# #We'll also need cell labels
# sampAnnotations <- read.csv(unz(paste(path,'/input/external_reference/',timepoint,"sampannotations.zip",sep=''), "sample_annotations.csv"), header = TRUE, sep = ",")

# #And additional gene info would be helpful too
# geneAnnotations<-read.csv(unz(paste(path,"/mouse_VISp_gene_expression_matrices_2018-06-14.zip",sep=''), "mouse_VISp_2018-06-14_genes-rows.csv"), header = TRUE, sep = ",")

# colnames(matrix)<-bar$V1
# rownames(matrix)<-feat$V1
# 
# Build the Seurat object from the transposed matrix and the sample metadata.
processedP56 <- CreateSeuratObject(matrix, meta.data = meta)
#rm(matrix)

# Attach a chromosome to every feature whose name is found in the gene table;
# features without a match keep NA.
featRow <- match(rownames(processedP56), feat$gene_symbol)
ind <- which(!is.na(featRow))
processedP56@assays[["RNA"]]@meta.features[["chr"]] <- NA
processedP56@assays[["RNA"]]@meta.features[["chr"]][ind] <- feat$chr[featRow[ind]]
  
  #
  
  #set gene metadata
  
  
  # gsym<-data.frame(toupper(feat[,2]))
  # rownames(gsym)<-feat[,1]
  # colnames(gsym)<-'gene.symbol'
  # processedP56@assays[['RNA']]<-AddMetaData(processedP56@assays[['RNA']],gsym)
  
# SSTannotations<-sampAnnotations[match(colnames(matrix),sampAnnotations$sample_name),]


# Tag every cell of this reference dataset with the same origin label.
processedP56$orig.ident <- "P56"

# region<-SSTannotations$region_label
# names(region)<-SSTannotations$sample_name
# processedP56[['regions']]<-region
# 
# layer<-SSTannotations$cortical_layer_label
# names(layer)<-SSTannotations$sample_name
# processedP56[['layer']]<-layer
# 
# GT<-SSTannotations$full_genotype_label
# names(GT)<-SSTannotations$sample_name
# processedP56[['GT']]<-GT
# 
# facs<-SSTannotations$facs_population_plan_order
# names(facs)<-SSTannotations$sample_name
# processedP56[['facs_order']]<-facs


# Percentage of each cell's counts that come from mitochondrial features.
# The feature names of the object are the gene symbols, so the "^MT-" pattern
# is matched against them directly. The previous version searched a
# `gene.symbol` column of meta.features that this script never creates, which
# selected no features at all and left percent.mt at zero for every cell.
processedP56[["percent.mt"]] <- PercentageFeatureSet(processedP56, pattern = "^MT-")


# #get genes list for mouse
# load(paste(path,'/input/supplementary/','Ref_Genes.Rdata',sep=''))
# Gene filtering. All index vectors below are positions into the feature (row)
# names of the object. Symbols are read from rownames() because meta.features
# has no `gene.symbol` column in this script; looking it up there returned NULL
# and silently disabled the pseudogene and ribosomal filters.
geneSymbols <- rownames(processedP56)
geneChr <- processedP56@assays[['RNA']]@meta.features[['chr']]

# Features with no chromosome annotation
noChr <- which(is.na(geneChr))

# Remove pseudo genes: symbols starting with GM ...
pseudoGenes <- grep(pattern = '^GM', geneSymbols)

# ... but only those longer than five characters, so that short symbols that
# merely start with GM are not caught by accident
pseudoGenes <- pseudoGenes[nchar(geneSymbols[pseudoGenes]) > 5]

# Remove ribosomal genes
riboGenes <- c(
  grep(pattern = '^RPS', geneSymbols),
  grep(pattern = '^RPL', geneSymbols),
  grep(pattern = '^MRP', geneSymbols)
)

# Want to remove all Y genes
YGene <- which(geneChr == 'Y')

# Remove mitochondrial genes
MTGenes <- which(geneChr == 'MT')


# Keep every feature position that is in none of the exclusion sets
keep <- setdiff(
  seq_along(geneSymbols),
  c(pseudoGenes, YGene, MTGenes, riboGenes, noChr)
)


# Subset by feature name rather than by position
processedP56 <- subset(processedP56, features = geneSymbols[keep])



# Standard preprocessing: normalise, pick 5000 variable features, scale while
# regressing out feature count and mitochondrial percentage, then run PCA.
processedP56 <- NormalizeData(processedP56)
processedP56 <- FindVariableFeatures(processedP56, nfeatures = 5000)
processedP56 <- ScaleData(
  processedP56,
  vars.to.regress = c("nFeature_RNA", "percent.mt")
)
processedP56 <- RunPCA(processedP56)
ElbowPlot(processedP56)

# Embeddings, neighbour graph and a coarse clustering, all on the same 20 PCs.
pcDims <- 1:20
processedP56 <- RunTSNE(processedP56, dims = pcDims)
processedP56 <- RunUMAP(processedP56, dims = pcDims)
processedP56 <- FindNeighbors(processedP56, dims = pcDims)
processedP56 <- FindClusters(processedP56, resolution = 0.1)

# Remove Meis2 cells: find the cells whose cluster label contains "Meis" and
# keep everything except them.
meisCells <- grep("Meis", processedP56$cluster_label)
processedP56 <- subset(processedP56, cells = meisCells, invert = TRUE)


# #Store entrez id
# processedP56@assays[["RNA"]]@meta.features[["entrez"]]<-processedP56@assays[["RNA"]]@counts@Dimnames[[1]]
# 
# #Set genes to symbol for better plotting
# processedP56@assays[["RNA"]]@data@Dimnames[[1]]<-as.character(processedP56@assays[["RNA"]]@meta.features[["gene.symbol"]])
# 
# processedP56@assays[["RNA"]]@counts@Dimnames[[1]]<-processedP56@assays[["RNA"]]@data@Dimnames[[1]]


# Cut the dendrogram.
# Strip spaces from the cluster labels before comparing them with the
# dendrogram's leaf labels.
processedP56$cluster_label <- gsub(
  pattern = " ",
  replacement = "",
  x = processedP56$cluster_label
)

# Drop every leaf that has no cells left in the object.
dendLabels <- labels(GABAdend)
pruneThese <- dendLabels[!(dendLabels %in% unique(processedP56$cluster_label))]
SSTdend <- prune(GABAdend, pruneThese)

# Two groupings of the pruned tree, cut at heights 2.5 and 1.5. Each cell gets
# the group of its cluster label; labels absent from the tree yield NA.
cut <- cutree(SSTdend, h = 2.5)
processedP56$dendCut <- cut[processedP56$cluster_label]

cut2 <- cutree(SSTdend, h = 1.5)
processedP56$dendCut2 <- cut2[processedP56$cluster_label]



# Reannotate: positive marker genes for three groupings of the cells, all with
# the same thresholds (natural-log fold change of at least log(1.5)).
markersByIdent <- function(object, identColumn) {
  Idents(object) <- identColumn
  FindAllMarkers(
    object,
    logfc.threshold = log(1.5),
    only.pos = TRUE,
    base = exp(1)
  )
}

dendDEGenes <- markersByIdent(processedP56, "dendCut")
dend2DEGenes <- markersByIdent(processedP56, "dendCut2")
allDEGenes <- markersByIdent(processedP56, "cluster_label")

# The helper works on a copy, so set the identity class on the object itself
# to the last grouping used.
Idents(processedP56) <- "cluster_label"


# Try DE-score classification.
# Drop the fourth marker set and continue with the finer dendrogram cut: its
# groups replace `dendCut` and its DE genes become the working table.
typeMarkers <- typeMarkers[-4]
processedP56$dendCut <- processedP56$dendCut2
DEGenes <- dend2DEGenes

# Keep only strongly specific genes to avoid misclassification: expressed in
# over 40 percentage points more cells inside the cluster than outside, and
# with a fold change above log(2).
pctGap <- DEGenes$pct.1 - DEGenes$pct.2
DEGenes <- DEGenes[which(pctGap > 0.4 & DEGenes$avg_logFC > log(2)), ]

# Classify the clusters; the first element of the result is indexed by each
# cell's cut group to obtain its putative identity.
# NOTE(review): `putative_identity` is reset to NA a few lines below, before
# the manual scoring loop -- confirm this assignment is still wanted.
output <- DEScoreClassify(
  processedP56, DEGenes, typeMarkers, "dendCut",
  specWeight = TRUE
)
out <- output[[1]]
processedP56$putative_identity <- out[processedP56$dendCut]


# diff<-abs(DEGenes$pct.1 - DEGenes$pct.2)
# DEGenes<-DEGenes[which(diff>0.3),]

# Start from empty annotation columns; both are rebuilt cluster by cluster
# below (this discards any `putative_identity` assigned earlier).
processedP56$reannotation <- NA
processedP56$putative_identity <- NA

# Gene with the largest average log fold change in a DE table.
topGene <- function(tab) {
  as.character(tab$gene[which.max(tab$avg_logFC)])
}

# Cell types that compete for each cluster.
scoredTypes <- c('Martinotti', 'Non-Martinotti', 'LRP')

# How many clusters have already been named after each gene; this drives the
# numeric suffix of the label and replaces searching the existing labels.
geneUses <- list()

# Quantify how strongly each type's markers appear in a cluster's DE genes,
# then label the cluster as Sst_<gene>_<n> and record the winning type.
for (cluster in unique(processedP56[['dendCut']][, 1])) {

  # Cells without a dendrogram cut have nothing to score.
  if (is.na(cluster)) {
    next
  }

  sub <- DEGenes[which(DEGenes$cluster == cluster), ]

  # A cluster with no DE genes left after filtering cannot be named; leave its
  # cells as NA instead of failing on an empty gene name.
  if (nrow(sub) == 0) {
    warning('No DE genes passed the filters for cluster ', cluster,
            '; leaving it unannotated', call. = FALSE)
    next
  }

  # DE score per type: sum of -log10(adjusted p), each term capped at 20,
  # over the type's markers that are DE in this cluster.
  vals <- vapply(scoredTypes, function(type) {
    m <- sub[sub$gene %in% typeMarkers[[type]], ]
    sum(pmin(-log10(m$p_val_adj), 20))
  }, numeric(1))

  # Compare the scores numerically. Matching the maximum with grep() treated
  # the number as a regular expression and could report false ties.
  best <- which(vals == max(vals))

  if (length(best) != 1) {

    # Tie (or no usable score): no type call, name by the top DE gene
    cellType <- NA
    gene <- topGene(sub)

  } else {

    # Which cell type?
    cellType <- names(vals)[best]
    canonHits <- sub[sub$gene %in% markers[[cellType]], ]
    fullHits <- sub[sub$gene %in% typeMarkers[[cellType]], ]

    # Find gene identity: prefer a canonical marker, then any marker of the
    # winning type, then simply the gene with the highest log fold change.
    if (nrow(canonHits) >= 1) {
      gene <- topGene(canonHits)
    } else if (nrow(fullHits) >= 1) {
      gene <- topGene(fullHits)
    } else {
      gene <- topGene(sub)
    }
  }

  if (length(gene) != 1 || is.na(gene)) {
    warning('Could not pick a naming gene for cluster ', cluster,
            '; leaving it unannotated', call. = FALSE)
    next
  }

  # Has that identity already been used for another cluster? Count uses per
  # exact gene name, so similar symbols do not collide and the suffix keeps
  # working past 9.
  uses <- geneUses[[gene]]
  if (is.null(uses)) {
    uses <- 0L
  }
  geneUses[[gene]] <- uses + 1L

  members <- which(processedP56$dendCut == cluster)
  processedP56$reannotation[members] <- paste0('Sst_', gene, '_', uses + 1L)
  processedP56$putative_identity[members] <- cellType

}


# Visual check of the new labels and of the putative identities.
DimPlot(processedP56, group.by = "reannotation", label = TRUE)
DimPlot(processedP56, group.by = "putative_identity", label = TRUE)

# Keep the filtered DE table under a dataset-specific name.
P56DEGenes <- DEGenes

# Write the object together with its DE genes into the pipeline's reference
# folder, and the object alone into the current working directory.
save(
  processedP56, P56DEGenes,
  file = paste0(path, "/input/external_reference/processedAllenP5610x.Rdata")
)

save(processedP56, file = "processedAllenP5610x.Rdata")
