#Classify Cells:
#This script executes the following stages of analysis
#   - Cluster cells iteratively, checking at each stage for DE genes and stopping if clustering criteria fail (see recursive clustering)
#   - Validate that clustering with random forest classification, allowing some cells to be reclassified from one cluster to another if they have characteristic expression of key genes (see scoremembership)
#   - Identify by correlation which clusters are most similar to one another. 
#   - Identify by DE marker genes which clusters represent canonical cell types
#   - Compute a final UMAP embedding using DE genes between clusters




## ----receive input from nextflow----------------------------------------------
# Positional command-line arguments:
#   1. path    - pipeline root; helper scripts are sourced from <path>/bin and
#                results are saved under <path>/output/scrnaseq
#   2. file    - .Rdata file passed to load() below
#   3. markers - .Rdata file passed to load() below
args <- commandArgs(TRUE)

# Fail early with a clear message instead of an obscure source()/load() error
# on an NA path further down.
if (length(args) < 3) {
  stop('Expected 3 arguments: <path> <file> <markers>', call. = FALSE)
}

path <- args[1]
file <- args[2]
markers <- args[3]

print(path)
set.seed(123)

# The timepoint tag is cut out of the input file name: characters 12 through
# (length - 6). NOTE(review): this assumes a fixed-width prefix and suffix on
# the file name - confirm against the upstream process that names the file.
timepoints <- substr(file, 12, nchar(file) - 6)




## ---- Load standard package list ----------------------------------------------
# Helper scripts, sourced in this exact order from the pipeline root.
helperScripts <- c(
  '/bin/DirectoryChecker.R',
  '/bin/PackageLoader.R',
  '/bin/SetPlottingParameters.R',
  # functions that help us to go through nested lists
  '/bin/NestedListsFunctions.R',
  # functions that specifically support this procedure: dendrogram handling, etc.
  '/bin/DendrogramHandlerFunctions.R',
  # the function to perform iterative hierarchical clustering
  '/bin/RecursiveClustering.R',
  # the validation and cell scoring function
  '/bin/ScoreMembership.R',
  # classify clusters by DE score for markers
  '/bin/DEScoreClassify.R'
)

for (helperScript in helperScripts) {
  source(paste0(path, helperScript))
}


#check ram
# get_ram() is not defined in this file; presumably it is provided by one of
# the helper scripts sourced above.
RAM <- get_ram()
print(paste(RAM, 'RAM passed to Rscript'))

# Leave one core free for the main process, but never ask for fewer than one
# worker (detectCores() may report 1, or NA when it cannot tell).
ncor <- max(1, detectCores() - 1, na.rm = TRUE)
print(paste(ncor, 'cores detected'))

# NOTE(review): the "multiprocess" strategy is deprecated in recent releases of
# the future package - confirm the installed version still accepts it.
plan("multiprocess", workers = ncor)

# Raise the cap on the size of globals exported to workers to 15000 MiB.
# The original statement was truncated after `1024^`, which made the whole
# script unparseable.
options(future.globals.maxSize = 15000 * 1024^2)




## ---- File setup ----------------------------------------------

# Load the input workspace. The rest of the script expects it to provide an
# object named `integrated` (presumably a Seurat object - it is used with
# WhichCells/subset/RunPCA below).
load(file)

# Load markers extracted from MET data.
# NOTE(review): `typeMarkers`, used further down, is presumably defined by this
# file - confirm.
load(markers)

#integrated_clean<-subset(integrated,cells=sample(1:8807,4000,replace = F))

# Work on every cell whose putative identity is not 'Meis2'.
nonMeis2Cells <- WhichCells(integrated, expression = putative_identity != 'Meis2')
integrated_clean <- subset(integrated, cells = nonMeis2Cells)

print('starting recursive classification')





## ---- Clustering + Validation ----------------------------------------------

# Iterative clustering (original setting).
dendrogram <- RecursiveClustering(
  integrated_clean,
  minSize = 50,
  DEscore.thresh = 60
)
print('Clustering Complete')
print(table(dendrogram[['Cluster Labels']]))

# Store the first column of the returned cluster labels as numeric metadata
# and make it the active identity.
integrated_clean[['iterative_clust']] <- as.numeric(dendrogram[['Cluster Labels']][, 1])
#membershipScore<-ScoreMembership(integrated_clean,'iterative_clust')

Idents(integrated_clean) <- 'iterative_clust'


# Validation / cell scoring (alternate setting).
membershipScore <- ScoreMembership(
  integrated_clean,
  'iterative_clust',
  ntree = 1000,
  nGene = 20,
  nIteration = 100,
  logFC.thresh = log(1.5)
)

# The first element of the result carries the per-cell certainty and the
# validated classification.
scoreTable <- membershipScore[[1]]
integrated_clean[['validated_clust_certainty']] <- scoreTable$certainty
integrated_clean[['validated_clust']] <- scoreTable$classification

#For debugging
#save(integrated_clean,membershipScore,file=paste0(path,'/output/scrnaseq/',timepoints,'/integrated_',timepoints,'_classified_debug','.Rdata'))


## ---- Group clusters by inter-cluster similarity ----------------------------------------------

# Every cell starts with its putative identity; cells that went through
# clustering get their validated cluster instead.
integrated[['cluster_label']] <- integrated[['putative_identity']]
integrated[['cluster_label']][rownames(integrated_clean[['validated_clust']]), ] <- integrated_clean[['validated_clust']][, 1]

# Same idea for the certainty score: 'Not Assessed' unless the cell was scored.
integrated[['certainty_score']] <- 'Not Assessed'
integrated[['certainty_score']][rownames(integrated_clean[['validated_clust_certainty']]), ] <- as.character(integrated_clean[['validated_clust_certainty']][, 1])

# Cells whose certainty is 'Failure' are moved to their own label.
integrated[['cluster_label']][WhichCells(integrated, expression = certainty_score == 'Failure'), ] <- 'FailedClassification'

#Centroid based cluster grouping
# NOTE(review): this filter compares validated_clust_certainty with
# 'FailedClassification', while the failure value tested just above is
# 'Failure'. If certainty never takes the value 'FailedClassification', failed
# cells are NOT excluded here - confirm which value is intended.
integratedSubset <- subset(integrated, cells = WhichCells(integrated_clean, expression = validated_clust_certainty != 'FailedClassification' & putative_identity != 'Meis2'))
Idents(integratedSubset) <- 'cluster_label'
avg <- AverageExpression(integratedSubset, return.seurat = TRUE, assays = 'integrated')
hInput <- avg@assays[['integrated']]@scale.data

# Average-linkage clustering of the clusters; the distance is computed on the
# rows of the (1 - correlation) matrix.
dis <- dist(1 - cor(hInput))
clu <- hclust(dis, method = 'average')
numGroups <- 4

#Dendrogram
dend <- as.dendrogram(clu)
# Named vector: cluster label -> group number (1..numGroups).
groupCut <- cutree(dend, numGroups)
# Group number of each leaf, in dendrogram leaf order.
leafGroups <- as.numeric(groupCut[match(labels(dend), names(groupCut))])
clusterCols <- hue_pal()(length(unique(leafGroups)))
dend <- dend %>% color_branches(col = clusterCols, clusters = leafGroups)
dend <- color_labels(dend, col = 'black')
#plot(rank_branches(dend))

# Major label = group number of the cell's cluster; cells whose cluster is not
# in the dendrogram keep their cluster label.
integrated[['major_cluster_label']] <- integrated[['cluster_label']]
for (i in seq_along(groupCut)) {
  integrated[['major_cluster_label']][which(integrated[['cluster_label']] == names(groupCut)[i]), ] <- groupCut[i]
}
table(integrated[['major_cluster_label']])

#DimPlot(integrated,group.by='major_cluster_label',label=T)




## ---- Identify which cluster is which type ----------------------------------------------

# Identify each cluster individually by combination of existing marker list
# and DE genes from MET type data.
# NOTE(review): the certainty filter tests for 'FailedClassification', whereas
# failed cells are detected elsewhere in this script via
# certainty_score == 'Failure' - confirm this filter excludes what is intended.
classifiedCells <- WhichCells(
  integrated_clean,
  expression = validated_clust_certainty != 'FailedClassification' & putative_identity != 'Meis2'
)
integratedSubset <- subset(integrated, cells = classifiedCells)

# Per the original author: this function flags a score for stressed markers
# > 10 and labels the cluster stressed instead of anything else, even if
# another type has a higher score.
out <- DEScoreClassify(
  integratedSubset,
  typeMarkers,
  'cluster_label',
  specWeight = FALSE
)

# Reassign cluster name: every cell of a classified cluster receives that
# cluster's entry of `out` as its major label.
for (cluster in unique(integratedSubset[['cluster_label']][, 1])) {
  clusterCells <- WhichCells(integrated, expression = cluster_label == cluster)
  integrated$major_cluster_label[clusterCells] <- out[cluster]
}
#DimPlot(integrated,group.by='major_cluster_label', label=T, reduction='umap')

## ---- Housekeeping for new cluster labels  ----------------------------------------------

# Redo cluster labels to match: within every major label, rename the clusters
# to '<major label>_<index>'.
for (group in unique(integrated[['major_cluster_label']][, 1])) {
  groupRows <- which(integrated[['major_cluster_label']][, 1] == group)
  cells <- integrated[['cluster_label']][groupRows, ]

  distinctLabels <- unique(cells)
  newVals <- as.character(seq_len(length(distinctLabels)))
  names(newVals) <- distinctLabels

  # Indexing with a factor uses the factor's integer codes (position in sorted
  # level order), not the names assigned above. Each distinct cluster still
  # receives its own index; only which index goes to which cluster is decided
  # by level order rather than by first appearance.
  newVec <- newVals[as.factor(cells)]

  integrated[['cluster_label']][groupRows, ] <- paste0(group, '_', newVec)
}
table(integrated[['cluster_label']])

# #Re-do dendrogram for correct labels
# Rebuild the cluster dendrogram from the renamed clusters, leaving out cells
# whose major label is FailedClassification, Stressed or Meis2.
integratedSubset <- subset(integrated, cells = WhichCells(integrated, expression = major_cluster_label != 'FailedClassification' & major_cluster_label != 'Stressed' & major_cluster_label != 'Meis2'))
Idents(integratedSubset) <- 'cluster_label'
avg <- AverageExpression(integratedSubset, return.seurat = TRUE, assays = 'integrated')
# NOTE(review): `m` is not defined anywhere in this script; presumably it is
# brought in by one of the load() calls or sourced helpers. Its first column is
# used to select rows of the scaled average expression - confirm it exists.
hInput <- avg@assays[['integrated']]@scale.data[m[, 1], ]
dis <- dist(1 - cor(hInput))
clu <- hclust(dis, method = 'average')

#Dendrogram
dend <- as.dendrogram(clu)
plot(rank_branches(dend))

newdend <- dend
integratedTemp <- integrated

#Relabel Clusters
# Within each group, hand out the group's existing labels again in sorted
# order along the dendrogram leaves, and apply the same renaming to the cells.
#
# Cluster labels have the form '<group>_<index>', so the group is the label
# with its final '_<index>' removed. Groups are matched exactly: the previous
# regex prefix match ('^<group>') also caught every other group whose name
# merely starts with the same characters, mixing labels across groups and
# writing NA for cells it matched by accident.
dendLabels <- labels(dend)
dendGroups <- sub('_[^_]*$', '', dendLabels)
cellLabels <- as.character(integrated[['cluster_label']][, 1])

for (group in unique(dendGroups)) {
  labs <- dendLabels[dendGroups == group]
  # sort() is a character sort, so '_10' orders before '_2'.
  newlabs <- sort(labs)
  names(newlabs) <- labs

  labels(newdend)[match(labs, dendLabels)] <- newlabs

  # Read from `integrated`, write to the copy, so that earlier renames in this
  # loop cannot be picked up and renamed a second time.
  cellIdx <- which(cellLabels %in% labs)
  if (length(cellIdx) > 0) {
    integratedTemp[['cluster_label']][cellIdx, ] <- newlabs[cellLabels[cellIdx]]
  }
}
integrated <- integratedTemp
dend <- newdend



#Recompute embedding with DE genes
# Marker genes are found on the RNA assay, between the final cluster labels,
# using only cells that are neither failed nor stressed.
retainedCells <- WhichCells(
  integrated,
  expression = major_cluster_label != 'FailedClassification' & major_cluster_label != 'Stressed'
)
integratedSubset <- subset(integrated, cells = retainedCells)
DefaultAssay(integratedSubset) <- 'RNA'
Idents(integratedSubset) <- 'cluster_label'

DEgenes <- FindAllMarkers(
  integratedSubset,
  only.pos = TRUE,
  logfc.threshold = log(1.5),
  base = exp(1)
)
DEgenes <- DEgenes[which(DEgenes$p_val_adj < 0.01), ]

# Top 5 genes per cluster by avg_logFC (not used again in this script).
topDEgenes <- DEgenes %>%
  group_by(cluster) %>%
  top_n(n = 5, avg_logFC)

# PCA is rerun on ALL cells of the integrated assay, restricted to the
# significant DE genes.
DefaultAssay(integrated) <- 'integrated'
integrated <- RunPCA(integrated, features = unique(DEgenes$gene))


#Compute 3D embedding for fancy 3D plots later
# The 3D UMAP is computed on a temporary copy and attached as its own
# reduction, so the 2D run below does not overwrite it.
temp <- RunUMAP(integrated, dims = 1:15, n.components = 3)
integrated[['ThreeDumap']] <- CreateDimReducObject(
  embeddings = temp@reductions$umap@cell.embeddings,
  key = 'UMAP_3D'
)

#Compute 2D embedding
integrated <- RunUMAP(integrated, dims = 1:15, n.components = 2, verbose = TRUE)

#Add metadata to show cleaned cell groups
integrated$filter_status <- 'Retained'
removedCells <- WhichCells(
  integrated,
  expression = major_cluster_label %in% c('FailedClassification', 'Stressed', 'Meis2')
)
integrated$filter_status[removedCells] <- 'Removed'



## ---- Output final data ----------------------------------------------

# Two copies of the same objects (`integrated` and `dend`) are written.

# 1. Reference copy inside the pipeline directory.
pipelineCopy <- paste0(
  path, '/output/scrnaseq/', timepoints,
  '/integrated_', timepoints, '.Rdata'
)
save(integrated, dend, file = pipelineCopy)


# 2. Copy in the current working directory, to be passed on.
workdirCopy <- paste0('integrated_', timepoints, '_classified.Rdata')
save(integrated, dend, file = workdirCopy)


print('saving complete')


