

# This script annotates clusters by comparing DE genes to a list of cell type markers and computing a DEscore

## ----receive input from nextflow----------------------------------------------
# Read the positional arguments handed over by the calling process.
args <- commandArgs(trailingOnly = TRUE)
path    <- args[1]  # root directory; helper scripts and outputs are resolved against it
file    <- args[2]  # .Rdata file loaded further down (expected to provide `integrated`)
markers <- args[3]  # .Rdata file loaded further down (expected to provide `typeMarkers` and `MET`)

# Echo the root directory to the log and fix the RNG seed.
print(path)
set.seed(123)




## ---- Load standard package list ----------------------------------------------
# Source the helper scripts shipped with the pipeline, in this order.
# The first three are the standard setup scripts; the last one is expected to
# define DEScoreClassify(), the marker-based cluster classifier called below.
auxScripts <- c(
  '/bin/auxiliary/DirectoryChecker.R',
  '/bin/auxiliary/PackageLoader.R',
  '/bin/auxiliary/SetPlottingParameters.R',
  '/bin/auxiliary/DEScoreClassify.R'
)
for (auxScript in auxScripts) {
  source(paste0(path, auxScript))
}


# Report the memory available to this Rscript. get_ram() is not defined in
# this script; it is expected to come from the helper scripts sourced above.
RAM <- get_ram()
print(paste(RAM, 'RAM passed to Rscript'))

# Keep one core free: `ncor` is the detected core count minus one.
ncor <- detectCores() - 1
print(paste(ncor, 'cores detected'))

# The future plan uses one worker fewer than `ncor`. detectCores() may return
# NA and small machines would otherwise end up with zero (or negative) workers,
# so always fall back to at least one worker.
nWorkers <- ncor - 1
if (is.na(nWorkers) || nWorkers < 1) {
  nWorkers <- 1
}
# NOTE(review): the "multiprocess" strategy is deprecated in recent releases of
# the future package; switch to "multisession"/"multicore" when upgrading.
plan("multiprocess", workers = nWorkers)

# Raise the limit on the total size of globals exported to workers (~10 GB).
# The option read by the future package is `future.globals.maxSize`; the
# previously used `future.globals.minSize` is not a recognised option, so the
# default limit stayed in force.
options(future.globals.maxSize = 10000 * 1024^2)



## ---- File setup ----------------------------------------------

# Restore the two saved workspaces passed on the command line. The rest of the
# script uses `integrated`, `typeMarkers` and `MET`, none of which is created
# here, so they are expected to come from these files.
load(file)

# Markers extracted from the MET data
load(markers)

# Sample identifiers ordered with mixedsort(), then joined with '_' into the
# tag that appears in the output paths.
times <- mixedsort(unique(integrated$orig.ident))
timepoints <- paste0(times, collapse = '_')



## ---- Identify which cluster is which type ----------------------------------------------


# Classify every cluster from its DE genes combined with the existing marker
# list. Cells labelled 'FailedClassification' and cells whose putative identity
# is 'Meis2' are left out of the comparison.
keepCells <- WhichCells(
  integrated,
  expression = cluster_label != 'FailedClassification' & putative_identity != 'Meis2'
)
integratedSubset <- subset(integrated, cells = keepCells)

# Markers are computed per cluster label on the RNA assay.
DefaultAssay(integratedSubset) <- 'RNA'
Idents(integratedSubset) <- 'cluster_label'

# Positive markers only, natural-log fold-change threshold of log(1.5),
# then keep genes with an adjusted p-value below 0.01.
DEGenes <- FindAllMarkers(
  integratedSubset,
  only.pos = TRUE,
  base = exp(1),
  logfc.threshold = log(1.5)
)
DEGenes <- DEGenes[which(DEGenes$p_val_adj < 0.01), ]

# Annotate clusters based on which type markers are present among the DE genes.
output <- DEScoreClassify(
  integratedSubset,
  DEGenes,
  typeMarkers,
  'cluster_label',
  specWeight = FALSE
)

# Keep the complete classifier result on disk for reference.
save(
  output,
  file = paste0(path, '/output/scrnaseq/', timepoints, '/cluster_classification_genes.Rdata')
)

# First element of the result; indexed by cluster label further down.
out <- output[[1]]

# Seed the major label with the current cluster label, then overwrite it for
# every cluster that went through classification with the entry of `out`
# indexed by that cluster.
integrated[['major_cluster_label']] <- integrated[['cluster_label']]

classifiedClusters <- unique(integratedSubset[['cluster_label']][, 1])
for (cluster in classifiedClusters) {
  # `cluster` is referenced by name inside the WhichCells expression
  clusterCells <- WhichCells(integrated, expression = cluster_label == cluster)
  integrated$major_cluster_label[clusterCells] <- out[cluster]
}


## ---- Housekeeping for new cluster labels  ----------------------------------------------


# Rename the cluster labels so they match their major label:
# every cluster becomes '<major label>_<k>', with k restarting at 1 per group.
for (group in unique(integrated[['major_cluster_label']][, 1])) {
  groupRows <- which(integrated[['major_cluster_label']][, 1] == group)
  oldLabels <- integrated[['cluster_label']][groupRows, ]

  # k is the position of the old label among the factor levels of the old
  # labels (i.e. their sorted order), not the order of first appearance.
  nLabels <- length(unique(oldLabels))
  labelCodes <- as.integer(as.factor(oldLabels))
  suffix <- as.character(seq_len(nLabels))[labelCodes]

  integrated[['cluster_label']][groupRows, ] <- paste0(group, '_', suffix)
}

# Print the number of cells per new cluster label
table(integrated[['cluster_label']])



# Resolve clusters that the marker-based classification left as 'Uncertain'
# by comparing every cluster against the MET reference types with MetaNeighbor.
if (any(integrated$major_cluster_label == 'Uncertain', na.rm = TRUE)) {

  #do a metaneighbour alignment with P5 vs MET
  # Give both objects a common label column and a study identifier.
  integratedSubset$cluster_label <- integrated$cluster_label
  integratedSubset[['comparison_cluster_label']] <- integratedSubset[['cluster_label']]
  MET[['comparison_cluster_label']] <- MET[['MET.type.Label']]
  integratedSubset[['experiment']] <- 'Marín_scRNAseq'
  MET[['experiment']] <- 'Allen_Patchseq'

  #P5<-subset(integratedSubset, cells=WhichCells(integratedSubset, expression=orig.ident=='P5'))

  # Merge both data sets, drop cells without a comparison label and convert
  # the RNA assay to a SingleCellExperiment for MetaNeighbor.
  mergedData <- merge(integratedSubset, MET)
  mergedData <- subset(mergedData, cells = which(!is.na(mergedData$comparison_cluster_label)))
  mergedDataSCE <- as.SingleCellExperiment(mergedData, assay = 'RNA')

  # Only the marker genes of these three types drive the comparison.
  varGenes <- unlist(typeMarkers[c('Martinotti', 'LRP', 'Non-Martinotti')])

  celltype_NV <- MetaNeighborUS(var_genes = varGenes,
                                dat = mergedDataSCE,
                                study_id = mergedDataSCE$experiment,
                                cell_type = mergedDataSCE$comparison_cluster_label,
                                fast_version = FALSE)

  # Row/column names are split on '|' and only the second part (the cell type
  # label) is kept.
  colnames(celltype_NV) <- unlist(lapply(colnames(celltype_NV), FUN = function(x) {
    str_split(x, '\\|')[[1]][2]
  }))
  rownames(celltype_NV) <- unlist(lapply(rownames(celltype_NV), FUN = function(x) {
    str_split(x, '\\|')[[1]][2]
  }))

  #only consider correlation above threshold to be valid.
  threshMatrix <- 1 * as.matrix(celltype_NV > 0.8)

  #compare to MET only. Only look at MET types with known cell type
  METsub <- subset(MET, cells = WhichCells(MET, expression = cell_type != 'U'))
  METnames <- names(table(METsub$MET.type.Label))

  # drop = FALSE keeps a matrix even when a single MET type remains
  threshMatrix <- threshMatrix[, METnames, drop = FALSE]

  # Cell type recorded for a MET type. The label is matched exactly: a regex
  # match would let e.g. 'X-1' also hit 'X-10'. Returns NA when nothing matches.
  metCellType <- function(label) {
    as.character(MET$cell_type[which(MET$MET.type.Label == label)[1]])
  }

  #for each uncertain group, find the MET types it best matches with
  uncGroups <- rownames(threshMatrix)[grep('Uncertain', rownames(threshMatrix))]

  for (group in uncGroups) {

    type <- NA
    # Restore the column names explicitly: extracting one row of a one-column
    # matrix would otherwise carry the row name instead.
    r <- setNames(as.vector(threshMatrix[group, ]), colnames(threshMatrix))

    if (sum(r, na.rm = TRUE) > 0) {

      # Tally the cell types of all MET types above the threshold and accept
      # the most frequent one only if it is unique (no tie).
      top <- names(r)[which(r == 1)]
      types <- unlist(lapply(top, FUN = metCellType))
      tally <- table(types)

      if (length(tally) > 0) {
        best <- which(tally == max(tally))
        if (length(best) == 1) {
          type <- names(tally)[best]
        }
      }
    }

    if (is.na(type)) {
      # Fall back to the single best matching MET type. Only MET types with a
      # known cell type are candidates; ranking the full row would also rank
      # the query clusters themselves, which have no MET cell type.
      r <- setNames(as.vector(celltype_NV[group, METnames]), METnames)
      top <- names(sort(r, decreasing = TRUE))[1]

      #get celltype
      type <- metCellType(top)
    }

    if (is.na(type)) {
      # Nothing usable was found: keep the group as it is instead of creating
      # an 'NA_<n>' label.
      warning('No MET cell type could be assigned to ', group,
              '; leaving it unchanged', call. = FALSE)
      next
    }

    #reassign label
    # Next free number among labels of exactly this type ('<type>_<n>'). The
    # prefix match keeps e.g. 'Non-Martinotti_1' from counting as 'Martinotti'.
    existing <- unique(as.character(integrated$cluster_label))
    prefix <- paste0(type, '_')
    suffix <- substring(existing[startsWith(existing, prefix)], nchar(prefix) + 1)
    usedNums <- as.integer(suffix[grepl('^[0-9]+$', suffix)])
    newnum <- if (length(usedNums) > 0) max(usedNums) + 1L else 1L
    newlab <- paste0(prefix, newnum)

    integrated$cluster_label[integrated$cluster_label == group] <- newlab
    integrated$major_cluster_label[integrated$cluster_label == newlab] <- type
  }
}


# Build a dendrogram over the clusters that carry a usable major label
# (everything except 'FailedClassification', 'Stressed' and 'Meis2').
keepCells <- WhichCells(
  integrated,
  expression = major_cluster_label != 'FailedClassification' &
    major_cluster_label != 'Stressed' &
    major_cluster_label != 'Meis2'
)
integratedSubset <- subset(integrated, cells = keepCells)
Idents(integratedSubset) <- 'cluster_label'

# Per-cluster average of the 'integrated' assay; its scaled data is the input
# to the clustering.
avg <- AverageExpression(integratedSubset, assays = 'integrated', return.seurat = TRUE)
hInput <- avg@assays[['integrated']]@scale.data

# NOTE(review): dist() is applied to the (1 - correlation) matrix, i.e. it
# takes Euclidean distances between the rows of that matrix rather than using
# it directly as a distance matrix - confirm this is intended.
dis <- dist(1 - cor(hInput))
clu <- hclust(dis, method = 'average')

#Dendrogram
dend <- as.dendrogram(clu)
plot(rank_branches(dend))

# Work on copies so every group is relabelled from the original labels.
newdend <- dend
integratedTemp <- integrated

#Relabel Clusters
# Within each group (the part of the label before the first '_'), the labels
# are reassigned so that, read in dendrogram order, they appear sorted.
dendLabels <- labels(dend)
groupNames <- unique(unlist(lapply(str_split(dendLabels, '_'), FUN = function(x) x[[1]])))

for (group in groupNames) {
  groupPattern <- paste0('^', group)

  # old label (name) -> new label (value)
  labs <- dendLabels[grep(groupPattern, dendLabels)]
  newlabs <- sort(labs)
  names(newlabs) <- labs

  # rename the dendrogram leaves ...
  labels(newdend)[match(labs, dendLabels)] <- newlabs

  # ... and apply the same mapping to the cells of this group
  groupRows <- grep(groupPattern, integrated[['cluster_label']][, 1])
  integratedTemp[['cluster_label']][groupRows, ] <- newlabs[integrated[['cluster_label']][groupRows, ]]
}

integrated <- integratedTemp
dend <- newdend



# Recompute the PCA from the DE genes of the final cluster labels.
# Cells labelled 'FailedClassification' or 'Stressed' do not take part in the
# marker search.
keptCells <- WhichCells(
  integrated,
  expression = major_cluster_label != 'FailedClassification' & major_cluster_label != 'Stressed'
)
integratedSubset <- subset(integrated, cells = keptCells)
Idents(integratedSubset) <- 'cluster_label'
DefaultAssay(integratedSubset) <- 'RNA'

# Positive markers only, natural-log fold-change threshold of log(1.5),
# adjusted p-value below 0.01.
DEgenes <- FindAllMarkers(
  integratedSubset,
  logfc.threshold = log(1.5),
  base = exp(1),
  only.pos = TRUE
)
DEgenes <- DEgenes[which(DEgenes$p_val_adj < 0.01), ]

# Five strongest markers per cluster, ranked by avg_logFC
topDEgenes <- DEgenes %>%
  group_by(cluster) %>%
  top_n(n = 5, avg_logFC)

# The PCA runs on the 'integrated' assay of the full object, restricted to the
# DE genes found above.
DefaultAssay(integrated) <- 'integrated'
integrated <- RunPCA(integrated, features = unique(DEgenes$gene))



# 3D UMAP for later 3D plots: computed on a temporary copy so that only its
# embedding is attached to `integrated`, as the 'ThreeDumap' reduction.
temp <- RunUMAP(integrated, dims = 1:15, n.components = 3)
umap3dEmbedding <- temp@reductions$umap@cell.embeddings
integrated[['ThreeDumap']] <- CreateDimReducObject(
  embeddings = umap3dEmbedding,
  key = 'UMAP_3D'
)

# 2D UMAP, stored on `integrated` itself
integrated <- RunUMAP(integrated, dims = 1:15, n.components = 2, verbose = TRUE)

# Flag every cell as 'Retained' unless its major label is one of the
# discarded groups, in which case it is flagged 'Removed'.
integrated$filter_status <- 'Retained'
removedCells <- WhichCells(
  integrated,
  expression = major_cluster_label %in% c('FailedClassification', 'Stressed', 'Meis2')
)
integrated$filter_status[removedCells] <- 'Removed'



## ---- Output final data ----------------------------------------------

# Reference copy inside the pipeline's output directory for these timepoints
referenceFile <- paste0(path, '/output/scrnaseq/', timepoints, '/integrated_', timepoints, '.Rdata')
save(integrated, dend, file = referenceFile)

# Copy written to the current working directory
classifiedFile <- paste0('integrated_', timepoints, '_classified.Rdata')
save(integrated, dend, file = classifiedFile)

print('saving complete')




