
#This script computes a diffusion map, using developmental gene modules to construct pseudotime trajectories. 

## ----receive input from nextflow----------------------------------------------
# Get parameters: four positional command-line arguments, in this order:
#   1. path    - project root; helper scripts are sourced from <path>/bin/auxiliary
#                and results are written under <path>/output/scrnaseq
#   2. file    - .Rdata file that is load()ed below (developmental data)
#   3. markers - .Rdata file that is load()ed below
#   4. modules - .Rdata file that is load()ed below
args <- commandArgs(trailingOnly = TRUE)
# Fail early with a readable message: every one of the four values is required
# further down, so a missing one would otherwise only surface as an obscure
# source()/load() error on an NA path.
if (length(args) < 4) {
  stop("Expected 4 arguments (<path> <file> <markers> <modules>) but received ",
       length(args), call. = FALSE)
}
path <- args[1]
file <- args[2]
markers <- args[3]
modules <- args[4]
print(path)
# Fixed seed so that any stochastic steps downstream are reproducible.
set.seed(123)

library(devtools)
library(destiny)
library(kbranches)
library(Seurat)
library(scales)
library(cowplot)
library(tidyverse)
library(gtools)
library(ggthemes)
library(pheatmap)
library(gam)
library(readr) 
library(Hmisc)




# Source the project helper scripts that live in <path>/bin/auxiliary.
# Names used later but not defined in this script (e.g. ApplyDiffusionMap,
# customCol) presumably come from these files - TODO confirm.
for (helperScript in c('DirectoryChecker.R',
                       'SetPlottingParameters.R',
                       'ApplyDiffusionMap.R')) {
  source(paste0(path, '/bin/auxiliary/', helperScript))
}


# Time-point tag extracted from the input file name by fixed character offsets:
# everything from the 12th character up to 17 characters before the end.
# NOTE(review): this presumably assumes `file` is a bare file name of the form
# 'integrated_<timepoints><17-character suffix>' (no directory component) -
# confirm against the calling process.
timepoints <- substr(file, 12, nchar(file) - 17)



# Place where all the results will go
diffPath <- paste0(path, "/output/scrnaseq/", timepoints, "/Pseudotime")
if (!dir.exists(diffPath)) {
  # recursive = TRUE so that missing parent directories are created as well;
  # without it dir.create() only warns and the later save() calls fail.
  dir.create(diffPath, recursive = TRUE)
}




# Load the three .Rdata inputs (developmental data, markers, modules) into the
# global environment, in that order.
# The objects used below without being created here (integrated, dend,
# typeMarkers, modList) are presumably restored by these calls - TODO confirm.
for (rdataFile in c(file, markers, modules)) {
  load(rdataFile)
}


# Set colours
# Fixed palette with one colour per branch.
branchCols <- c('#af86f5', '#2dddd3', '#f056b4')
#branchCols<-c(smoothBlue[6], smoothPink[6], 'gray')

# One interpolated colour per distinct cluster label, named by the sorted labels.
# customCol is not defined in this script; presumably it comes from
# SetPlottingParameters.R - TODO confirm.
distinctClusterLabels <- unique(integrated$cluster_label)
cols <- colorRampPalette(customCol)(length(distinctClusterLabels))
names(cols) <- sort(distinctClusterLabels)
SubClusterCols <- cols


# Drop the 'Stressed' marker set; stressed cells are excluded from the analysis below.
typeMarkers <- typeMarkers[which(names(typeMarkers) != 'Stressed')]

# clean cells to include only groups of interest
cellsOfInterest <- WhichCells(
  integrated,
  expression = major_cluster_label %nin% c('FailedClassification', 'Stressed', 'Meis2')
)
integrated_clean <- subset(integrated, cells = cellsOfInterest)

# Gene set that drives the diffusion map:
#   * every gene of each module that contains GAP43 or SYP, plus
#   * the cell-type marker genes that are present in the cleaned object.
# Modules are selected by exact gene-symbol membership. The previous
# grep('GAP43|SYP', modList) ran a regular expression over the deparsed list
# elements and therefore also picked up modules that merely contain a symbol
# with one of these as a substring (e.g. SYPL1).
anchorGenes <- c('GAP43', 'SYP')
hasAnchorGene <- vapply(
  modList,
  function(moduleGenes) any(anchorGenes %in% unlist(moduleGenes)),
  logical(1)
)
allTypeMarkers <- unlist(typeMarkers)
presentTypeMarkers <- allTypeMarkers[allTypeMarkers %in% rownames(integrated_clean)]
keygenes <- unique(c(unlist(modList[hasAnchorGene]), presentTypeMarkers))

# ApplyDiffusionMap is not defined in this script (presumably sourced from
# bin/auxiliary/ApplyDiffusionMap.R). Its result is unpacked positionally; the
# third element replaces integrated_clean and is expected to carry the `dptval`
# metadata and the 'DC' reduction used further down - TODO confirm.
out <- ApplyDiffusionMap(integrated_clean, keygenes)
dm <- out[[1]]
dpt <- out[[2]]
integrated_clean <- out[[3]]

# Derive a `branch` label per cell from its major cluster label: the three
# types below are renamed to numbered branches, any other label is kept as is.
branchOfType <- c('Martinotti' = 'Branch 1',
                  'Non-Martinotti' = 'Branch 2',
                  'LRP' = 'Branch 3')
branchLabel <- integrated_clean$major_cluster_label
typeIndex <- match(branchLabel, names(branchOfType))
isRenamed <- !is.na(typeIndex)
branchLabel[isRenamed] <- branchOfType[typeIndex[isRenamed]]
integrated_clean$branch <- branchLabel

# Split every branch into `nchunk` equal-width pseudotime intervals and label
# each cell '<branch>_<interval number>'. Cells that belong to no branch keep NA.
nchunk <- 5
cellBranch <- integrated_clean$branch
cellPseudotime <- integrated_clean$dptval
binLabel <- setNames(rep(NA, length(cellBranch)), names(cellBranch))
for (branch in sort(unique(cellBranch))) {

  # cells on this branch
  onBranch <- which(cellBranch == branch)

  # one label per interval, e.g. 'Branch 1_1' ... 'Branch 1_5'
  intervalNames <- paste0(branch, '_', seq_len(nchunk))

  # cut() divides the branch's pseudotime range into nchunk intervals
  binLabel[onBranch] <- as.character(
    cut(cellPseudotime[onBranch], nchunk, labels = intervalNames)
  )

}
integrated_clean$branch_binned <- binLabel


# Copy the pseudotime annotations from the cleaned subset back onto the full object.
# NOTE(review): integrated holds more cells than integrated_clean; this relies on
# the metadata assignment aligning by cell name - confirm for the Seurat version in use.
integrated$dptval <- integrated_clean$dptval
integrated$branch <- integrated_clean$branch
integrated$branch_binned <- integrated_clean$branch_binned

# add embedding
# The ThreeDumap embedding only serves as a template (same cells, three columns):
# it is blanked to NA and then filled with the diffusion components for the
# cells that were kept in integrated_clean.
cleanDC <- integrated_clean@reductions$DC@cell.embeddings
DCemb <- integrated@reductions$ThreeDumap@cell.embeddings
colnames(DCemb) <- c('DC_1', 'DC_2', 'DC_3')
DCemb[] <- NA
DCemb[rownames(cleanDC), ] <- cleanDC

integrated[['DC']] <- CreateDimReducObject(
  embeddings = as.matrix(DCemb),
  key = 'DC_',
  assay = 'RNA'
)

# Persist the results:
#   1. the raw ApplyDiffusionMap output, in the Pseudotime results folder;
#   2. the annotated object plus `dend`, in the current working directory;
#   3. the same two objects again, under the per-timepoint output folder.
# Files 1 and 2 share a file name but hold different objects.
timepointDir <- paste0(path, '/output/scrnaseq/', timepoints)
save(out, file = paste0(diffPath, '/DiffusionMap_output.Rdata'))
save(integrated, dend, file = 'DiffusionMap_output.Rdata')
save(integrated, dend, file = paste0(timepointDir, '/integrated_', timepoints, '.Rdata'))


#-- GAM fitting -----
library(gam)

# Cutoff applied to the per-gene values before they are stored.
gamPvalCutoff <- 0.01

# Fit, for every gene of `obj`, a GAM of expression against pseudotime with a
# loess smoother (z ~ lo(t)) and return one value per gene, named by gene.
#   obj: Seurat object with an 'RNA' assay and a `dptval` metadata column.
# Expression is read from the `data` slot of the RNA assay, for all genes.
# FindVariableFeatures()/ScaleData() used to be run beforehand, but neither
# touches that slot and their results were never read, so they are omitted.
fitPseudotimeGAM <- function(obj) {
  exprs <- obj@assays[['RNA']]@data
  # pseudotime in the same cell order as the expression columns
  pseudotime <- obj$dptval[colnames(exprs)]
  apply(exprs, 1, function(z) {
    d <- data.frame(z = z, t = pseudotime)
    fit <- gam(z ~ lo(t), data = d)
    # Positional extraction from the model summary: row 1, column 5 of its
    # fourth component - presumably the p-value of the lo(t) term; verify
    # against the installed gam version.
    summary(fit)[4][[1]][1, 5]
  })
}

# All cleaned cells together
allGenes <- fitPseudotimeGAM(integrated_clean)
allGenes <- allGenes[which(allGenes < gamPvalCutoff)]

# Martinotti
martCells <- WhichCells(integrated_clean, expression = major_cluster_label == 'Martinotti')
MartGenes <- fitPseudotimeGAM(subset(integrated_clean, cells = martCells))
MartGenes <- MartGenes[which(MartGenes < gamPvalCutoff)]

# NOTE(review): unlike the two sets above, the Non-Martinotti and LRP results
# are stored WITHOUT the cutoff (values for every gene). This mirrors the
# existing behaviour; confirm whether downstream code relies on it before
# making the four sets consistent.

# Non-Martinotti
nonMartCells <- WhichCells(integrated_clean, expression = major_cluster_label == 'Non-Martinotti')
nonMartGenes <- fitPseudotimeGAM(subset(integrated_clean, cells = nonMartCells))

# Long-Range
lrpCells <- WhichCells(integrated_clean, expression = major_cluster_label == 'LRP')
LRPGenes <- fitPseudotimeGAM(subset(integrated_clean, cells = lrpCells))


# Order of the list: all cells, Martinotti, Non-Martinotti, LRP.
GAMres <- list(allGenes, MartGenes, nonMartGenes, LRPGenes)

save(GAMres, file = paste0(diffPath, '/GAM_output.Rdata'))


# 
# #Try GeneSwitches
# library(GeneSwitches)
# library(SingleCellExperiment)
# 
# ## create SingleCellExperiment object with log-normalized single cell data
# 
# sce <- SingleCellExperiment(assays = List(expdata = integrated_clean@assays[['RNA']]@data))
# ## add pseudo-time information
# colData(sce)$Pseudotime <- integrated_clean$dptval
# ## add dimensionality reductions, e.g. PCA, UMAP, tSNE
# reducedDims(sce) <- SimpleList(PCA= integrated_clean@reductions$pca@cell.embeddings,DC = integrated_clean@reductions$DC@cell.embeddings)
# 
# #subsample for testing
# #sce_sub<-sce[,sample(colnames(sce),100)]
# sce_p1 <- binarize_exp(sce, ncores = 8, fix_cutoff = T)
# 
# sce_p1 <- find_switch_logistic_fastglm(sce_p1, show_warning = FALSE)
# 
# 
# ## filter top 15 best fitting switching genes among all the genes
# sg_allgenes <- filter_switchgenes(sce_p1, allgenes = TRUE, topnum = 15)
# ## filter top 15 best fitting switching genes among surface proteins and TFs only
# sg_gtypes <- filter_switchgenes(sce_p1, allgenes = FALSE, topnum = 20,
#                                 genelists = gs_genelists, genetype = c("Surface proteins", "TFs"))
# ## combine switching genes and remove duplicated genes from sg_allgenes
# sg_vis <- rbind(sg_gtypes, sg_allgenes[setdiff(rownames(sg_allgenes), rownames(sg_gtypes)),])
# 
# plot_timeline_ggplot(sg_vis, timedata = sce_p1$Pseudotime, txtsize = 3)
# 
# plot_gene_exp(sce_p1, gene = "PCP4", reduction = "DC", downsample = F)
# 
# 
# 
# 
# 
# 
# ## create SingleCellExperiment object with log-normalized single cell data
# LRP<-subset(integrated_clean, cells=WhichCells(integrated_clean,expression=major_cluster_label=='LRP'))
# sce <- SingleCellExperiment(assays = List(expdata = LRP@assays[['RNA']]@data))
# ## add pseudo-time information
# colData(sce)$Pseudotime <- LRP$dptval
# ## add dimensionality reductions, e.g. PCA, UMAP, tSNE
# reducedDims(sce) <- SimpleList(PCA= LRP@reductions$pca@cell.embeddings,DC = LRP@reductions$DC@cell.embeddings)
# 
# #subsample for testing
# #sce_sub<-sce[,sample(colnames(sce),100)]
# sce_p1 <- binarize_exp(sce, ncores = 5, fix_cutoff = T)
# 
# sce_p1 <- find_switch_logistic_fastglm(sce_p1, show_warning = FALSE)
# 
# 
# ## filter top 15 best fitting switching genes among all the genes
# sg_allgenes <- filter_switchgenes(sce_p1, allgenes = TRUE, topnum = 15)
# ## filter top 15 best fitting switching genes among surface proteins and TFs only
# sg_gtypes <- filter_switchgenes(sce_p1, allgenes = FALSE, topnum = 20,
#                                 genelists = gs_genelists, genetype = c("Surface proteins", "TFs"))
# ## combine switching genes and remove duplicated genes from sg_allgenes
# sg_vis <- rbind(sg_gtypes, sg_allgenes[setdiff(rownames(sg_allgenes), rownames(sg_gtypes)),])
# 
# plot_timeline_ggplot(sg_vis, timedata = sce_p1$Pseudotime, txtsize = 3)
# 
# plot_gene_exp(sce_p1, gene = "PCP4", reduction = "DC", downsample = F)
# 
# 
# 
# ## create SingleCellExperiment object with log-normalized single cell data
# Mart<-subset(integrated_clean, cells=WhichCells(integrated_clean,expression=major_cluster_label=='Martinotti'))
# sce <- SingleCellExperiment(assays = List(expdata = Mart@assays[['RNA']]@data))
# ## add pseudo-time information
# colData(sce)$Pseudotime <- Mart$dptval
# ## add dimensionality reductions, e.g. PCA, UMAP, tSNE
# reducedDims(sce) <- SimpleList(PCA= Mart@reductions$pca@cell.embeddings,DC = Mart@reductions$DC@cell.embeddings)
# 
# #subsample for testing
# #sce_sub<-sce[,sample(colnames(sce),100)]
# sce_p1 <- binarize_exp(sce, ncores = 8, fix_cutoff = T)
# 
# sce_p1 <- find_switch_logistic_fastglm(sce_p1, show_warning = FALSE)
# 
# 
# ## filter top 15 best fitting switching genes among all the genes
# sg_allgenes <- filter_switchgenes(sce_p1, allgenes = TRUE, topnum = 500)
# ## filter top 15 best fitting switching genes among surface proteins and TFs only
# sg_gtypes <- filter_switchgenes(sce_p1, allgenes = FALSE, topnum = 20,
#                                 genelists = gs_genelists, genetype = c("Surface proteins", "TFs"))
# ## combine switching genes and remove duplicated genes from sg_allgenes
# sg_vis <- rbind(sg_gtypes, sg_allgenes[setdiff(rownames(sg_allgenes), rownames(sg_gtypes)),])
# 
# plot_timeline_ggplot(sg_vis, timedata = sce_p1$Pseudotime, txtsize = 3)
# 
# plot_gene_exp(sce_p1, gene = "PCP4", reduction = "DC", downsample = F)
# 
# 
# 
# ## create SingleCellExperiment object with log-normalized single cell data
# nonMart<-subset(integrated_clean, cells=WhichCells(integrated_clean,expression=major_cluster_label=='Non-Martinotti'))
# sce <- SingleCellExperiment(assays = List(expdata = nonMart@assays[['RNA']]@data))
# ## add pseudo-time information
# colData(sce)$Pseudotime <- nonMart$dptval
# ## add dimensionality reductions, e.g. PCA, UMAP, tSNE
# reducedDims(sce) <- SimpleList(PCA= nonMart@reductions$pca@cell.embeddings,DC = nonMart@reductions$DC@cell.embeddings)
# 
# #subsample for testing
# #sce_sub<-sce[,sample(colnames(sce),100)]
# sce_p1 <- binarize_exp(sce, ncores = 8, fix_cutoff = T)
# 
# sce_p1 <- find_switch_logistic_fastglm(sce_p1, show_warning = FALSE)
# 
# 
# ## filter top 15 best fitting switching genes among all the genes
# sg_allgenes <- filter_switchgenes(sce_p1, allgenes = TRUE, topnum = 15)
# ## filter top 15 best fitting switching genes among surface proteins and TFs only
# sg_gtypes <- filter_switchgenes(sce_p1, allgenes = FALSE, topnum = 20,
#                                 genelists = gs_genelists, genetype = c("Surface proteins", "TFs"))
# ## combine switching genes and remove duplicated genes from sg_allgenes
# sg_vis <- rbind(sg_gtypes, sg_allgenes[setdiff(rownames(sg_allgenes), rownames(sg_gtypes)),])
# 
# plot_timeline_ggplot(sg_vis, timedata = sce_p1$Pseudotime, txtsize = 3)
# 
# plot_gene_exp(sce_p1, gene = "PCP4", reduction = "DC", downsample = F)







# 
# # ---- URD ----
# 
# library(URD)
# 
# obj<-integrated_clean
# 
# # Create an URD object, which will filter the data, then normalize and log-transform it.
# urd <- createURD(count.data = as.matrix(obj[['RNA']]@counts), meta = obj@meta.data)
# 
# # urd <- calcPCA(urd, mp.factor = 2)
# # urd <- calcTsne(object = urd)
# 
# #replace tsne with umap
# urd@tsne.y <- as.data.frame(obj@reductions$umap@cell.embeddings)
# 
# urd@var.genes <- colnames(dm@data_env[["data"]])
# 
# #urd <- calcDM(urd, genes.use = keygenes)
# 
# urd <- importDM(urd, dm )
# 
# plotDimArray(urd, reduction.use = "dm", dims.to.plot = c(1,2),label="orig.ident")
# #plotDimArray(urd,  "tsne", dims.to.plot = c(1,2),label="stage")
# 
# 
# # Here we use all cells from the first stage as the root. Get the k nearest neighbours of the root cell and use them as the root group
# rootcell<-out[[4]]
# nn <- get.knn(data=obj@reductions$DC@cell.embeddings, k=50, algorithm="cover_tree")
# nind <- nn[[1]][grep(rootcell, rownames(obj@reductions$DC@cell.embeddings)),]
# rootcells <- c(rootcell,rownames(obj@reductions$DC@cell.embeddings)[nind])
# 
# 
# #We need tips as well. Get from dpt object
# dpt<-out[[2]]
# tips<-names(which(dpt@tips[,1]))
# tips<-tips[-grep(rootcell, tips)]
# 
# #get clusters for each
# tipval <- obj$orig.ident
# tipval[1:length(tipval)] <- NA
# tipind<-1
# for(cell in tips){
#   ind <- nn[[1]][grep(cell, rownames(obj@reductions$DC@cell.embeddings)),]
#   neighbours <- rownames(obj@reductions$DC@cell.embeddings)[ind]
#   set<-c(cell, neighbours)
#   tipval[set] <- as.numeric(tipind)
#   tipind <- tipind + 1
# }
# 
# 
# #add to urd
# urd@group.ids[names(tipval), "tip.clusters"] <- tipval
# 
# # Then we run 'flood' simulations
# urd.floods <- floodPseudotime(urd, root.cells = rootcells, n=50, minimum.cells.flooded = 2, verbose=F)
# 
# # Then we process the simulations into a pseudotime
# urd <- floodPseudotimeProcess(urd, urd.floods, floods.name="pseudotime")
# 
# pseudotimePlotStabilityOverall(urd)
# 
# plotDim(urd, "pseudotime", reduction.use = 'dm')
# urd@pseudotime$pseudotime <- urd@pseudotime$pseudotime
# plotDists(urd, label="pseudotime", category.label = "cluster_label", plot.title="Pseudotime by stage")
# 
# 
# 
# urd.ptlogistic <- pseudotimeDetermineLogistic(urd, "pseudotime", optimal.cells.forward=20, max.cells.back=40, do.plot = T)
# 
# # Bias the transition matrix acording to pseudotime
# urd.biased.tm <- as.matrix(pseudotimeWeightTransitionMatrix(urd, "pseudotime", logistic.params=urd.ptlogistic))
# 
# # Simulate the biased random walks from each tip
# urd.walks <- simulateRandomWalksFromTips(urd, tip.group.id="tip.clusters", root.cells=rootcells, transition.matrix = urd.biased.tm, n.per.tip = 25000, root.visits = 1, max.steps = 5000, verbose = F)
# 
# # Process the biased random walks into visitation frequencies
# urd <- processRandomWalksFromTips(urd, urd.walks, verbose = F)
# 
# 
# # Load the cells used for each tip into the URD object
# urd.tree <- loadTipCells(urd, "tip.clusters")
# 
# # Build the tree
# urd.tree <- buildTree(urd.tree, pseudotime = "pseudotime", tips.use=c(1,2), divergence.method = "preference", cells.per.pseudotime.bin = 25, bins.per.pseudotime.window = 8, save.all.breakpoint.info = T, p.thresh=0.001)
# 
# p1<- plotTree(urd.tree, "orig.ident", title="Sample", legend = F)
# p2<- plotTree(urd.tree, "major_cluster_label", title="Cell Type", legend = F)
# p3<- plotTree(urd.tree, "pseudotime", title="Pseudotime", legend = F,continuous.colors = viridis_pal(option='A')(100))
# p4<- plotTree(urd.tree, "NOS1", continuous.colors = FeatureCol)
# p5<- plotTree(urd.tree, "RELN", continuous.colors = FeatureCol)
# p6<- plotTree(urd.tree, "ERBB4", continuous.colors = FeatureCol)
# 
# plot_grid(p1,p2,p3,p4,p5,p6,ncol=3)
# 
# save(urd, urd.tree, file=paste0(diffPath, '/URD_output.Rdata'))


# #Try PseudotimeDE software
# test<-LRP
# 
# nsample <- 50
# nsize <- 0.8 * dim(test)[2]
# cells <- colnames(test)
# 
# subsamples <- lapply(1:nsample, FUN=function(x){
#   sub <- sample(cells, nsize)
#   t <- subset(test, cells=sub)
# })
# 
# pb <- txtProgressBar(min = 0, max = nsample, style = 3)
# 
# pseudotime_samp <- lapply(subsamples, FUN=function(x){
#   
#   out<-ApplyDiffusionMap(x,keygenes, find.branch=F)
#   obj<-out[[3]]
#   vals<-obj$dptval
#   df<-data.frame(vals)
#   df<-cbind(rownames(df), df)
#   colnames(df) <- c('cell','pseudotime')
#   print('loop')
#   return(df)
#   
# })
# 
# 
# fulldf<-data.frame(test$dptval)
# fulldf<-cbind(rownames(fulldf), fulldf)
# colnames(fulldf) <- c('cell','pseudotime')
# 
# 
# LRPcells <-WhichCells(test,expression=major_cluster_label=='LRP')
# LRP<-subset(test, cells=LRPcells)
# 
# pseudo_tbl <- lapply(pseudotime_samp, as_tibble)
# PseudotimeDE::plotUncertainty(ori.tbl = fulldf, sub.tbl = pseudotime_samp[1:2] )
# 
# LRPsce<-as.SingleCellExperiment(LRP)
# sce <-LPS_sce
# sce@assays@data@listData$counts <- as.matrix(LRP@assays[['RNA']]@counts)
# 
# res <- PseudotimeDE::runPseudotimeDE(gene.vec = c("RELN", "GAP43", 'NOS1'),
#                                                  ori.tbl = as_tibble(fulldf),
#                                                  sub.tbl = pseudo_tbl,
#                                                  sce = sce,
#                                                  model = "auto")
# 
# 
# 
# 
# function (ori.tbl, sub.tbl) 
# {
#   n_subample <- length(sub.tbl)
#   cell <- gene <- ori_pseudotime <- pseudotimes <- pseudotime <- counts <- ..density.. <- NULL
#   Cells_true_time <- as.data.frame(ori.tbl)
#   colnames(Cells_true_time) <- c("cell", "ori_pseudotime")
#   Merge_pseudotimes <- suppressWarnings(Reduce(function(x, 
#                                                         y) merge(x = x, y = y, by = "cell", all = TRUE), sub.tbl))
#   for (i in 2:length(sub.tbl) + 1) {
#     colnames(Merge_pseudotimes)[i] <- paste("Pseudotime", 
#                                             i - 1, sep = " ")
#   }
#   Truetimes_pseudotimes <- merge(Cells_true_time, Merge_pseudotimes, 
#                                  by = "cell")
#   Descending_truetimes_pseudotimes <- dplyr::arrange(Truetimes_pseudotimes, 
#                                                      dplyr::desc(ori_pseudotime))
#   pseudotimes_only <- Descending_truetimes_pseudotimes %>% 
#     dplyr::select(c(-cell, -ori_pseudotime))
#   Descending_pseudotimes_transpose <- purrr::transpose(pseudotimes_only)
#   Descending_unlist_pseudotimes <- data.frame(pseudotimes = unlist(Descending_pseudotimes_transpose, 
#                                                                    use.names = FALSE))
#   cell_truetime_only <- Descending_truetimes_pseudotimes %>% 
#     dplyr::select(cell, ori_pseudotime)
#   multiple_cell_truetime <- cell_truetime_only[rep(seq_len(nrow(cell_truetime_only)), 
#                                                    each = n_subample), ]
#   na_plotdata <- cbind(Descending_unlist_pseudotimes, multiple_cell_truetime)
#   plotdata <- na_plotdata %>% na.omit(pseudotimes)
#   plotdata$cell <- factor(plotdata$cell, levels = unique(plotdata$cell))
#   p <- ggplot(plotdata, aes(pseudotimes, cell)) + stat_density(aes(fill = ..density..), 
#                                                                geom = "raster", position = "identity") + scale_fill_gradient(low = "white", 
#                                                                                                                              high = "black") + labs(x = "pseudotimes of subsamples", 
#                                                                                                                                                     y = "Cells", fill = "Density") + theme_bw() + theme(plot.title = element_text(hjust = 0.5, 
#                                                                                                                                                                                                                                   face = "bold"), axis.text.y = element_blank(), aspect.ratio = 1, 
#                                                                                                                                                                                                         legend.position = "right")
#   p
# }

#extra plotting, not run in pipeline -----
# 
# 
# dimLimX<-range(integrated_clean@reductions$DC@cell.embeddings[,1])
# dimLimY<-range(integrated_clean@reductions$DC@cell.embeddings[,2])
# 
# 
# #3D diffmap
# df<-data.frame(integrated_clean@reductions$DC@cell.embeddings, integrated_clean@meta.data)
# fig<-plot_ly(df, x = ~DC_1, y = ~DC_2, z = ~DC_3, size=1.5,color = ~branch, colors = branchCols)
# fig<-plot_ly(df, x = ~DC_1, y = ~DC_2, z = ~DC_3,size=2, color = ~major_cluster_label, colors = customCol[c(10,6,4)])
# fig<-plot_ly(df, x = ~DC_1, y = ~DC_2, z = ~DC_3, size=1.5,color = ~dptval)
# 
# #Highlight Groups
# temp<-integrated_clean
# temp$justMartinotti<-NA
# temp$justMartinotti[which(temp$major_cluster_label=='Martinotti')]<-temp$cluster_label[which(temp$major_cluster_label=='Martinotti')]
# temp$justNonMartinotti<-NA
# temp$justNonMartinotti[which(temp$major_cluster_label=='Non-Martinotti')]<-temp$cluster_label[which(temp$major_cluster_label=='Non-Martinotti')]
# temp$justLRP<-NA
# temp$justLRP[which(temp$major_cluster_label=='LRP')]<-temp$cluster_label[which(temp$major_cluster_label=='LRP')]
# 
# DimPlot(temp, pt.size=1, group.by='justMartinotti', reduction='DC', label=T, label.size=6, repel=T)+scale_color_manual(values=alpha(SubClusterCols,0.35), na.value=alpha('light grey',0.2))
# DimPlot(temp, pt.size=1, group.by='justNonMartinotti', reduction='DC', label=T, label.size=6, repel=T)+scale_color_manual(values=alpha(SubClusterCols,0.35), na.value=alpha('light grey',0.2))
# DimPlot(temp, pt.size=1, group.by='justLRP', reduction='DC', label=T, label.size=6, repel=T)+scale_color_manual(values=alpha(SubClusterCols,0.35), na.value=alpha('light grey',0.2))
# 
# #cluster pseudotime localisation
# df<-integrated_clean@meta.data
# 
# ggplot( df, aes(x=cluster_label, y=dptval, fill=cluster_label, color=cluster_label)) +
# geom_violin(width=1.2, size=0.2,position=position_dodge(.9)) +
# scale_fill_manual(values=SubClusterCols) +
# scale_color_manual(values=SubClusterCols) +
# theme_ipsum() +
# theme(
#   legend.position="none"
# ) +
# coord_flip() + # This switch X and Y axis and allows to get the horizontal version
# xlab("") +
# ylab("Pseudotime")
# 
# 
# #--- DE Genes ------------------------------------------------------------
# #branch markers
# Idents(integrated_clean)<-'branch'
# DefaultAssay(integrated_clean)<-'RNA'
# mark<-FindAllMarkers(integrated_clean,only.pos=T, logfc.threshold = log(1.5), min.diff.pct = 0.3)
# 
# 
# #Get branch markers for each sample separately
# objList<-list()
# markerSets<-list()
# for(samp in unique(integrated_clean$orig.ident)){
#   obj<-subset(integrated_clean,cells=WhichCells(integrated_clean,expression=orig.ident==samp))
#   objList[[samp]]<-obj
# 
#   Idents(obj)<-'branch'
#   DefaultAssay(obj)<-'RNA'
#   objmark<-FindAllMarkers(obj,only.pos = T, logfc.threshold = log(1.5))
#   markerSets[[samp]]<-objmark
# 
# }
# 
# #get conserved markers for each branch (expressed across all samples)
# branchMarkers<-lapply(unique(integrated_clean$branch), FUN= function(x){
# 
#   E16set<-markerSets[['E16']]$gene[markerSets[['E16']]$cluster == x]
#   P1set<-markerSets[['P1']]$gene[markerSets[['P1']]$cluster == x]
#   P5set<-markerSets[['P5']]$gene[markerSets[['P5']]$cluster == x]
#   return(intersect(intersect(E16set, P1set), P5set))
# })
# 
# #plot
# integrated_clean<-AddModuleScore(integrated_clean,branchMarkers,name='ConservedDEGenes_',assay='RNA')
# dat<-integrated_clean@meta.data
# dat$branch<-factor(dat$branch,levels=c('Undecided', 'Branch 1','Branch 2'))
# ggplot(dat,aes_string(x='orig.ident', y='ConservedDEGenes_2', color='branch')) +
#   theme_minimal()+
#   geom_boxplot(size=0.8, alpha=1) +
#   xlab("Sample") +
#   ylab("Module Score") +
#   scale_color_manual(values=branchCols)+
#   scale_fill_manual(values=branchCols)+
#   ggtitle(paste0('Module ',13 ,' Score'))
# 
# names(branchMarkers)<-unique(integrated_clean$branch)
# 
# 
# 
# 
# # ---- Module Enrichment -----------------------------------
# #check expression
# genes<-c('CHODL','NOS1','RELN','NR2F2','FXYD6','GABRB1')
# DefaultAssay(integrated_clean)<-'RNA'
# FeaturePlot(integrated_clean,reduction='DC',features=genes, order=T) & xlim(dimLimX[1],dimLimX[2]) & ylim(dimLimY[1],dimLimY[2]) & scale_color_gradientn(colours = c('#e6e8e8', '#1fa7bc', '#ff708a'))
# 
# dat<-integrated_clean@meta.data[WhichCells(integrated_clean, expression=branch != 'Undecided'),]
# 
# plotList<-lapply(1:length(modList), FUN= function(i){
# 
#   p<-ggplot(dat,aes_string(x='dptval', y=paste0('ModuleScore_',i), color='branch')) +
#     theme_minimal()+
#     geom_point(size=0.3, alpha=0.15) +
#     geom_smooth(method='gam', se=TRUE,fullrange=FALSE, aes(fill=branch), alpha=0.25)+
#     xlab("pseudotime") +
#     ylab("Module Score") +
#     scale_color_manual(values=branchCols)+
#     scale_fill_manual(values=branchCols)+
#     ggtitle(paste0('Module ',i ,' Score')) +
#     theme(legend.position = 'none')
# 
#   #p<-FeaturePlot(integrated_clean,reduction='umap',features = paste0('ModuleScore_',i)) + scale_color_viridis_c()
# 
#   return(p)
# 
# })
# 
# 
# plot_grid(plotlist=plotList, ncol=5)
# 
# 
# 
# plotList<-list()
# #plot maturity by subtype
# for(type in unique(integrated_clean$major_cluster_label)){
#   dat<-integrated_clean@meta.data[WhichCells(integrated_clean, expression=major_cluster_label==type),]
#   dat$cluster_label<-as.factor(dat$cluster_label)
#   p1<-ggplot(dat,aes_string(x='cluster_label', y=paste0('ModuleScore_',13), color='cluster_label')) +
#     theme_minimal()+
#     geom_boxplot(size=0.8, alpha=1) +
#     xlab("pseudotime") +
#     ylab("Module Score") +
#     scale_color_manual(values=SubClusterCols)+
#     scale_fill_manual(values=SubClusterCols)+
#     ggtitle(paste0('Module ',13 ,' Score')) +
#     theme(legend.position = 'none')
#   plotList[[type]]<-p1
# }
# plot_grid(plotlist=plotList, ncol=1)
# 
# 
# dat<-integrated_clean@meta.data
# ggplot(dat,aes_string(x='dptval', y=paste0('ModuleScore_',13), color='major_cluster_label')) +
#   theme_minimal()+
#   geom_point(size=0.9, alpha=0.25) +
#   geom_smooth(method='gam', se=TRUE,fullrange=FALSE, aes(fill=major_cluster_label), alpha=0.25)+
#   xlab("pseudotime") +
#   ylab("Module Score") +
#   scale_color_manual(values=MajorClusterCols)+
#   scale_fill_manual(values=MajorClusterCols)+
#   ggtitle(paste0('Module ',14 ,' Score')) +
#   theme(legend.position = 'none')
# 
# 
# #Plotsample content of clusters
# ggplot(dat, aes_string(x = 'dptval', y = 'cluster_label',
#                                              colour = 'orig.ident')) +
#   geom_quasirandom(groupOnX = FALSE) +
#   scale_color_manual(values=SampleCol) +
#   theme_ipsum() +
#   theme(
#     legend.position="none"
#   ) +
#   xlab("") + ylab("")
# 
# #Plotsample content of clusters
# ggplot(dat, aes_string(x = 'dptval', y = 'cluster_label',
#                        colour = 'branch')) +
#   geom_quasirandom(groupOnX = FALSE) +
#   scale_color_manual(values=c( branchCols)) +
#   theme_ipsum() +
#   theme(
#     legend.position="none"
#   ) +
#   xlab("") + ylab("")
# 
# 
# 
# p1<-ggplot(integrated_clean@meta.data, aes(x = DC1, y = DC2, colour = orig.ident)) +
#   geom_point(size=1) + scale_color_manual(values=hue_pal()(4)) +
#   xlab("Diffusion component 1") +
#   ylab("Diffusion component 2") +
#   theme(legend.text = element_text( size = 20))+ggtitle('Diffusion Map - Sample')
# 
# p2<-ggplot(integrated_clean@meta.data[!is.na(integrated_clean$major_cluster_label),], aes(x = DC1, y = DC2, colour = major_cluster_label)) +
#   geom_point(size=1) + scale_color_manual(values=hue_pal()(4)) +
#   xlab("Diffusion component 1") +
#   ylab("Diffusion component 2") +
#   theme(legend.text = element_text( size = 20))+ggtitle('Diffusion Map - Identity')
# 
# print(plot_grid(p1,p2))
# 
# #check types along individual branches
# branch0<-subset(integrated_clean,cells=WhichCells(integrated_clean,expression=branch=='Branch 0'))
# branch1<-subset(integrated_clean,cells=WhichCells(integrated_clean,expression=branch=='Branch 1'))
# branch2<-subset(integrated_clean,cells=WhichCells(integrated_clean,expression=branch=='Branch 2'))
# 
# p1<-ggplot(branch1@meta.data,aes(x = dptval, y = cluster_label)) +
#   geom_boxplot(aes(fill = orig.ident, color = orig.ident), alpha = 0.4)+ scale_color_manual(values=hue_pal()(4)) +
#   xlab("Pseudotime") +
#   ylab("Clusters") +
#   theme(legend.text = element_text( size = 20))+ggtitle("Branch 1: Transcriptomic identities")
# 
# p2<-ggplot(branch2@meta.data,aes(x = dptval, y = cluster_label)) +
#   geom_boxplot(aes(fill = orig.ident, color = orig.ident), alpha = 0.4)+ scale_color_manual(values=hue_pal()(4)) +
#   xlab("Pseudotime") +
#   ylab("Clusters") +
#   theme(legend.text = element_text( size = 20))+ggtitle("Branch 2: Transcriptomic identities")
# 
# ggplot(integrated_clean@meta.data[WhichCells(integrated_clean,expression=branch!='Branch 0'),],aes(x = dptval, y = cluster_label)) +
#   geom_boxplot(aes(fill = orig.ident, color = orig.ident), alpha = 0.4)+ scale_color_manual(values=hue_pal()(4)) +
#   xlab("Pseudotime") +
#   ylab("Clusters") +
#   theme(legend.text = element_text( size = 20))+ggtitle("Branch 2: Transcriptomic identities")
# 
# 
# #quantify groups in branch
# df<-melt(branch1@meta.data[,c('orig.ident','branch','major_cluster_label')])
# df %>%
#   dplyr::count(major_cluster_label) %>%
#   mutate(perc = round(n / nrow(df), 3)) -> df2
# #content of whole data
# pie<- ggplot(df2, aes(x="",y=perc, fill=major_cluster_label))+
#   geom_bar(width = 1,stat='identity') +
#   coord_polar("y", start=0) +
#   geom_text_repel(aes(label=perc),size = 6, position=position_stack(vjust=0.5))+
#   scale_fill_manual(values=customCol[c(10,6,4)]) +
#   theme_minimal()
# pie
# 
# 
# 
# df<-melt(branch2@meta.data[,c('orig.ident','branch','major_cluster_label')])
# df %>%
#   dplyr::count(orig.ident) %>%
#   mutate(perc = round(n / nrow(df), 3)) -> df2
# #content of whole data
# bp<- ggplot(df2, aes(x="",y=perc, fill=orig.ident))+
#   geom_bar(width = 1,stat='identity', fill='Assignment')
# pie <- bp + coord_polar("y", start=0) +
#   geom_text(aes(label=perc),size = 6, position=position_stack(vjust=0.5))+
#   theme_minimal()
# pie
# 
# df<-melt(branch0@meta.data[,c('orig.ident','branch','major_cluster_label')])
# df %>%
#   dplyr::count(orig.ident) %>%
#   mutate(perc = round(n / nrow(df), 3)) -> df2
# #content of whole data
# bp<- ggplot(df2, aes(x="",y=perc, fill=orig.ident))+
#   geom_bar(width = 1,stat='identity')
# pie <- bp + coord_polar("y", start=0) +
#   geom_text(aes(label=perc),size = 6, position=position_stack(vjust=0.5))+
#   theme_minimal()
# pie
# 
# 
# df<-melt(integrated_clean@meta.data[,c('orig.ident','Assignment')])
# df %>%
# group_by(orig.ident) %>%
#   dplyr::count(Assignment)
# 
# 
# 
# for(samp in c('E16','P1','P5')){
# 
#   ind<-grep(samp,df2$orig.ident)
#   df2$n[ind]<-df2$n[ind]/sum(df2$n[ind])
# 
# }
# 
# df2$Assignment<-factor(df2$Assignment, levels=c('Undecided','Assigned'))
# #content of whole data
# ggplot(df2, aes(x=orig.ident,y=n, fill=Assignment))+
#   geom_bar(stat='identity', position='dodge')+scale_fill_manual(values=c('light grey',smoothBlue[3])) + theme_minimal()
# 
# 
# #check genes that change over pseudotime
# 
# 
# #shared gene changes over pseudotime
# # Only look at the 1,000 most variable genes when identifying temporally expressesd genes.
# # Identify the variable genes by ranking all genes by their variance.
# sub<-subset(integrated_clean_alt,cells=WhichCells(integrated_clean_alt,expression = branch == 'Branch 1'))
# DefaultAssay(sub)<-'RNA'
# sub<-FindVariableFeatures(sub, nfeatures=1000)
# sub<-ScaleData(sub)
# Y <- sub@assays[['RNA']]@data
# var1K <- names(sort(apply(sub@assays[['RNA']]@data, 1, var),decreasing = TRUE))[1:1000]
# Y <- Y[sub@assays[['RNA']]@var.features, ]  # only counts for variable genes
# 
# 
# # Fit GAM for each gene using pseudotime as independent variable.
# t <- sub$dptval
# gam.pval <- apply(Y, 1, function(z){
#   d <- data.frame(z=z, t=t)
#   tmp <- gam(z ~ lo(t), data=d)
#   p <- summary(tmp)[4][[1]][1,5]
#   p
# })
# 
# 
# allnames<- names(sort(gam.pval, decreasing = FALSE))
# # Identify genes with the most significant time-dependent model fit.
# topgenesBranchAll<- names(sort(gam.pval, decreasing = FALSE))[1:100]
# 
# DefaultAssay(integrated_clean)<-'RNA'
# integrated_clean<-ScaleData(integrated_clean)
# # Prepare and plot a heatmap of the top genes that vary their expression over pseudotime.
# 
# heatdata <- as.matrix(sub[['RNA']]@scale.data[rownames(sub