

# This script trains random forest classifiers on the integrated dataset and uses them to assign
# cell-type labels to the MGE-SST cells of the P10 scRNA-seq data from Mayer et al. 2018.
# Model 2 (trained on cluster_label) is the one used for the final labels.
# 
# Mayer, C., Hafemeister, C., Bandler, R. et al. Developmental diversification of cortical inhibitory interneurons. Nature 555, 457–462 (2018). https://doi.org/10.1038/nature25999
# 
# https://www.nature.com/articles/nature25999


# Fix the RNG seed so that fold assignment and forest training are reproducible.
set.seed(123)

## ----receive input from nextflow----------------------------------------------
# Positional command-line arguments:
#   1. path - project root; bin/, input/ and output/ are resolved beneath it
#   2. file - R data file that is passed to load() further down
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 2) {
  # Fail immediately instead of propagating NA paths into source()/load().
  stop("Expected two arguments: <path> <file>", call. = FALSE)
}
path <- args[1]
file <- args[2]
print(path)

# Characters 12 .. (n - 17) of the file argument.
# NOTE(review): not referenced again in the visible part of this script.
timepoints <- substr(file, 12, nchar(file) - 17)
figurePath <- paste0(path, '/output/figures')

# recursive = TRUE so that creation also succeeds when <path>/output itself
# does not exist yet.
if (!dir.exists(figurePath)) {
  dir.create(figurePath, recursive = TRUE)
}


## ----load---------------------------------------------

#Load standard package list, etc.
source(paste0(path, '/bin/auxiliary/DirectoryChecker.R'))
source(paste0(path, '/bin/auxiliary/PackageLoader.R'))


# Report the memory figure returned by get_ram().
# NOTE(review): get_ram() is presumably defined by the sourced helpers - confirm.
RAM <- get_ram()
print(paste(RAM, 'RAM passed to Rscript'))

ncor <- detectCores()
print(paste(ncor, 'cores detected'))
# detectCores() can return NA when the core count cannot be determined;
# fall back to a single worker rather than passing NA to plan().
if (is.na(ncor)) {
  ncor <- 1L
}

# The "multiprocess" strategy is deprecated/defunct in the future package.
# Select explicitly what it used to resolve to: forked workers where forking
# is supported, background R sessions otherwise.
if (future::supportsMulticore()) {
  plan("multicore", workers = ncor)
} else {
  plan("multisession", workers = ncor)
}
# Allow globals of up to 10000 MiB to be exported to workers.
options(future.globals.maxSize = 10000 * 1024^2)


# Restore the objects stored in `file`; the code below relies on one of them
# being named `integrated`.
load(file)

# Keep only the cells whose major_cluster_label is none of
# 'FailedClassification', 'Stressed' or 'Meis2'.
integratedSubset <- subset(
  integrated,
  cells = WhichCells(
    integrated,
    expression = major_cluster_label != 'FailedClassification' &
      major_cluster_label != 'Stressed' &
      major_cluster_label != 'Meis2'
  )
)


# Load the processed P10 reference (Mayer et al. 2018); later code expects
# this file to provide an object named `P10`.
load(paste0(path, '/input/external_reference/processedP10Mayer10x.Rdata'))

# P10[['tsnealigned']] <- CreateDimReducObject(embeddings = emb, key = 'tSNE_')
# 
# P10 <- UpdateSeuratObject(P10)

# rownames(P10[['RNA']]@data) <- toupper(rownames(P10[['RNA']]@data))
# rownames(P10[['RNA']]@counts) <- toupper(rownames(P10[['RNA']]@counts))
# rownames(P10[['RNA']]@scale.data) <- toupper(rownames(P10[['RNA']]@scale.data))

# #load dibella
# dibella <-'/Volumes/lab-luscomben-1/home/users/fisherj/RefData/ExtractSST/output/INs_integrated_E10_E11_E12_E13_E14_E15_E16_E17_E18_S1_E18_S3_P1_S1_P1_S2_P4.Rdata'
# 
# x<-load(dibella)
# DiBellaP4 <- get(x)
# DiBellaP4 <- subset(DiBellaP4, cells = WhichCells(DiBellaP4, expression = orig.ident == 'P4' ))
# DefaultAssay(DiBellaP4) <- 'RNA'

# ---- GORD FISHELL ----
# Restrict the P10 reference to the cells annotated as 'MGE-SST' in celltype1.
sub <- subset(
  P10,
  cells = WhichCells(P10, expression = celltype1 == 'MGE-SST')
)
# sub <- subset(P10,cells=unique( c(colnames(P10)[grep('^Sst', P10$Map1)], colnames(P10)[grep('^Nos', P10$Map1)]) ) )
# sub <- subset(sub, cells = colnames(sub)[grep('^Unassigned', sub$Map1)], invert=T)

# Random forest classifier used for every model below.
library(randomForest)

# Number of folds used by the cross-validation loops.
kfold <- 10

#---- Major Cluster Label ----

#Random Forest

# Candidate features for the major-label model: positive marker genes of each
# major_cluster_label group.
Idents(integratedSubset) <- 'major_cluster_label'
majormark <- FindAllMarkers(
  integratedSubset,
  only.pos = TRUE,
  logfc.threshold = log(1.5)
)

#genes <- intersect(toupper(rownames(P10)), rownames(integratedSubset))
# Keep only markers that also occur in the reference once its gene names are
# upper-cased, so the trained model can later be applied to it.
genes <- unique(majormark$gene)
genes <- intersect(genes, toupper(rownames(sub)))
if (length(genes) == 0) {
  stop("No marker genes are shared with the reference data", call. = FALSE)
}

# Cells x genes matrix of the RNA assay's `data` slot. drop = FALSE keeps the
# gene dimension when a single gene is left, so t() still yields one row per
# cell.
dat <- t(as.matrix(integratedSubset@assays[['RNA']]@data[genes, , drop = FALSE]))

# Measure accuracy with k-fold cross-validation.
# Shuffle the cells once; the folds below are cut from this shuffled order.
# The loop must therefore index `train` (not `dat`), otherwise every fold is
# just a contiguous run of the original cell order.
train <- dat[sample(nrow(dat)), , drop = FALSE]
# Assign the shuffled rows to kfold folds of (near-)equal size.
folds <- cut(seq(1, nrow(train)), breaks = kfold, labels = FALSE)

# Out-of-fold predictions, keyed by cell name; everything starts as NA.
prediction <- integratedSubset$major_cluster_label
prediction[seq_along(prediction)] <- NA

# Perform k-fold cross-validation with per-fold feature selection.
for (i in seq_len(kfold)) {
  # Rows of the shuffled matrix held out in this fold.
  testIndices <- which(folds == i)
  trainData <- train[-testIndices, , drop = FALSE]

  # First pass: fit on all candidate genes to obtain importance scores.
  model <- randomForest(x = trainData, y = as.factor(integratedSubset$major_cluster_label[rownames(trainData)]))

  # Named importance vector, most important gene first.
  imp <- importance(model)
  imp <- imp[order(imp, decreasing = TRUE), ]

  d <- density(imp)

  # Approximate the local maxima of the importance density ...
  peaks <- d$x[which(diff(sign(diff(d$y))) < 0) + 1]
  # ... and use the position of the lowest one as the cut-off.
  thresh <- min(peaks)

  # Keep only the genes whose importance exceeds the cut-off.
  newgenes <- names(which(imp > thresh))

  testData <- train[testIndices, newgenes, drop = FALSE]
  trainData <- train[-testIndices, newgenes, drop = FALSE]

  # Second pass: refit on the selected genes only.
  model <- randomForest(x = trainData, y = as.factor(integratedSubset$major_cluster_label[rownames(trainData)]))

  # Predict the held-out cells and store the labels under their cell names.
  pred <- predict(model, testData)
  prediction[names(pred)] <- as.character(pred)
}

# Fraction of cells whose out-of-fold prediction equals their label.
model1acc <- sum(prediction == integratedSubset$major_cluster_label) / length(prediction)
model1acc
#Train model on full data


#dat<-dat[,colnames(dat)[colnames(dat) %in% rownames(integratedSubset)]]
# Rebuild the cells x genes matrix over all candidate marker genes.
dat <- t(as.matrix(integratedSubset@assays[['RNA']]@data[genes, , drop = FALSE]))

# First pass on every cell: used only to rank the genes by importance.
model1 <- randomForest(x = dat, y = as.factor(integratedSubset$major_cluster_label[rownames(dat)]))

# Named importance vector, most important gene first.
imp <- importance(model1)
imp <- imp[order(imp, decreasing = TRUE), ]

d <- density(imp)

# Approximate the local maxima of the importance density and use the position
# of the lowest one as the cut-off.
peaks <- d$x[which(diff(sign(diff(d$y))) < 0) + 1]
thresh <- min(peaks)

# Keep only the genes whose importance exceeds the cut-off.
newgenes <- names(which(imp > thresh))

dat <- t(as.matrix(integratedSubset@assays[['RNA']]@data[newgenes, , drop = FALSE]))

# Final major-label model, refit on the selected genes.
model1 <- randomForest(x = dat, y = as.factor(integratedSubset$major_cluster_label[rownames(dat)]))

# Build the reference matrix. `newgenes` holds upper-cased names, so the
# reference rows are matched on their upper-cased names instead of being
# indexed by `newgenes` directly (which fails when the reference uses
# mixed-case gene names). Same result shape as the cluster-label section.
test <- sub[['RNA']]@data
test <- test[toupper(rownames(test)) %in% newgenes, , drop = FALSE]
test <- t(as.matrix(test))
colnames(test) <- toupper(colnames(test))
pred <- predict(model1, test)
sub$major_prediction <- pred



#---- Cluster Label ----

# Candidate features for the cluster-label model: positive marker genes of
# each cluster_label group.
Idents(integratedSubset) <- 'cluster_label'
minormark <- FindAllMarkers(
  integratedSubset,
  only.pos = TRUE,
  logfc.threshold = log(1.5)
)

#genes <- intersect(toupper(rownames(P10)), rownames(integratedSubset))
# Keep only markers that also occur in the reference once its gene names are
# upper-cased.
genes <- unique(minormark$gene)
genes <- intersect(genes, toupper(rownames(P10)))
if (length(genes) == 0) {
  stop("No marker genes are shared with the reference data", call. = FALSE)
}

# `sub` (the MGE-SST cells of P10) is deliberately NOT re-created here:
# subsetting P10 again would discard the major_prediction column stored on
# `sub` above, so it would be missing from the saved P10SST object.
# The reference matrix `test` is built right before prediction, once the
# final gene set is known, so no dense copy of it is made at this point.

# Cells x genes matrix of the RNA assay's `data` slot; drop = FALSE keeps the
# gene dimension when a single gene is left.
dat <- t(as.matrix(integratedSubset@assays[['RNA']]@data[genes, , drop = FALSE]))


# Measure accuracy with k-fold cross-validation.
# Shuffle the cells once; the folds below are cut from this shuffled order.
# The loop must therefore index `train` (not `dat`), otherwise every fold is
# just a contiguous run of the original cell order.
train <- dat[sample(nrow(dat)), , drop = FALSE]
# Assign the shuffled rows to kfold folds of (near-)equal size.
folds <- cut(seq(1, nrow(train)), breaks = kfold, labels = FALSE)

# Out-of-fold predictions, keyed by cell name; everything starts as NA.
prediction <- integratedSubset$cluster_label
prediction[seq_along(prediction)] <- NA

# Perform k-fold cross-validation with per-fold feature selection.
for (i in seq_len(kfold)) {
  # Rows of the shuffled matrix held out in this fold.
  testIndices <- which(folds == i)
  trainData <- train[-testIndices, , drop = FALSE]

  # First pass: fit on all candidate genes to obtain importance scores.
  model <- randomForest(x = trainData, y = as.factor(integratedSubset$cluster_label[rownames(trainData)]))

  # Named importance vector, most important gene first.
  imp <- importance(model)
  imp <- imp[order(imp, decreasing = TRUE), ]

  # Alternative cut-off (lowest peak of the importance density), kept for
  # reference:
  # d<-density(imp)
  #
  # #approximate peaks
  # peaks<-d$x[which(diff(sign(diff(d$y)))<0)+1]
  # thresh <- min(peaks)
  thresh <- median(imp)

  # Keep the genes whose importance is above the median.
  newgenes <- names(which(imp > thresh))

  testData <- train[testIndices, newgenes, drop = FALSE]
  trainData <- train[-testIndices, newgenes, drop = FALSE]

  # Second pass: refit on the selected genes only.
  model <- randomForest(x = trainData, y = as.factor(integratedSubset$cluster_label[rownames(trainData)]))

  # Predict the held-out cells and store the labels under their cell names.
  pred <- predict(model, testData)
  prediction[names(pred)] <- as.character(pred)
}

# Fraction of cells whose out-of-fold prediction equals their label.
model2acc <- sum(prediction == integratedSubset$cluster_label) / length(prediction)
# Keep the out-of-fold predictions; they are saved with the models.
model2preds <- prediction

#dat<-dat[,colnames(dat)[colnames(dat) %in% rownames(integratedSubset)]]

# First pass on every cell with all candidate genes: used to rank the genes.
model2 <- randomForest(
  x = dat,
  y = as.factor(integratedSubset$cluster_label[rownames(dat)])
)

# Importance scores as a named vector, largest first.
imp <- importance(model2)
imp <- imp[order(imp, decreasing = TRUE), ]

# The cut-off is the median importance (the density-peak cut-off of the
# major-label model is not used here).
thresh <- median(imp)

# Genes whose importance lies above the cut-off.
newgenes <- names(imp)[which(imp > thresh)]

# Cells x selected-genes matrix of the integrated data.
dat <- t(as.matrix(integratedSubset@assays[['RNA']]@data[newgenes, ]))

# Final cluster-label model, refit on the selected genes.
model2 <- randomForest(
  x = dat,
  y = as.factor(integratedSubset$cluster_label[rownames(dat)])
)

# Reference cells x genes matrix with upper-cased gene names, restricted to
# the selected genes, then classified with the final model.
test <- t(as.matrix(sub[['RNA']]@data))
colnames(test) <- toupper(colnames(test))
test <- test[, is.element(colnames(test), newgenes)]
pred <- predict(model2, newdata = test)
sub$minor_prediction <- pred


#save for reference
# Names under which the final model and the classified reference cells are
# saved.
P10model <- model2
P10SST <- sub

# Make sure the output directory exists before writing, so that a missing
# directory cannot discard the trained models at the very end of the run.
outDir <- paste0(path, '/output/scrnaseq/P10Mayer10x')
if (!dir.exists(outDir)) {
  dir.create(outDir, recursive = TRUE)
}

# Both models with their cross-validated accuracies and the out-of-fold
# cluster-label predictions.
save(model1, model1acc, model2, model2acc, model2preds, file = paste0(outDir, '/RFmodels.Rdata'))
# Full reference object, final model and classified SST subset.
save(P10, P10model, P10SST, file = paste0(outDir, '/classifiedP10.Rdata'))

# Same objects written to the working directory to pass them down the pipe.
save(P10, P10model, P10SST, file = 'classifiedP10.Rdata')

# 
# #plot classification
# p1 <- DimPlot(sub, group.by='minor_prediction', label = T, cols=SubClusterCols[sort(as.character(unique(sub$minor_prediction)))], repel = T) + ggtitle('Sub-type Classification') + theme(legend.position='none')
# p2 <- DimPlot(sub, group.by='major_prediction', label = T, cols=MajorClusterCols[sort(as.character(unique(sub$major_prediction)))]) + ggtitle('Cell Type Classification')+ theme(legend.position='none')
# 
# plot_grid(p1,p2,NULL,NULL,ncol=2,nrow=2)
# 
# #plot proportion classified
# 
# 
# FeaturePlot(sub, c('Gad1','Gad2','Lhx6','Sst','Reln','Hpse','Nr2f2','Erbb4','Chodl'), order=T) & scale_color_gradientn(colors=FeatureCol)
# 


