library(Seurat)
library(STACAS)
library(scIntegrationMetrics)

# Labelling and subsampling
# ##########################
# NOTE(review): `path` is used below but never defined in this script;
# presumably it is set by the sourced file -- confirm.
source(file = "Labelling_datasets.R")

# Subsampling
# Reference study: the healthy hepatocyte condition holding the largest
# number of healthy cells.
GSE192742_healthy <- readRDS(
  file = paste0(path, "GSE192742_healthy_labelled_filtered.rds")
)
table(GSE192742_healthy@meta.data[["labelling_digest"]])
# BEC_Adult Hep_Adult
#       506      8963

# Hence no other study/condition should contribute more than 8963 cells:

# Subsampling of the LPS conditions of GSE186554.
GSE186554 <- readRDS(paste0(path, "GSE186554_labelled_filtered.rds"))
table(GSE186554$labelling_semi_supervised)
# Hep_Adult Hep_Adult_LPS12h  Hep_Adult_LPS3h
#      3153            10705            10412

# Fix the RNG state so the random subsampling below is reproducible.
set.seed(42)

# The two LPS conditions share a budget of 8963 cells (size of the healthy
# hepatocyte reference), split according to their original proportion
# (10705 : 10412, total 21117).
nbLPS12h <- round(8963 * 10705 / 21117)
LPS12h <- subset(GSE186554, subset = orig.ident == "LPS12h")
LPS12h <- subset(LPS12h, cells = sample(Cells(LPS12h), nbLPS12h))

nbLPS3h <- round(8963 * 10412 / 21117)
LPS3h <- subset(GSE186554, subset = orig.ident == "LPS3h")
LPS3h <- subset(LPS3h, cells = sample(Cells(LPS3h), nbLPS3h))

# Control cells are kept in full.
Control <- subset(GSE186554, subset = orig.ident == "Control")
GSE186554 <- merge(Control, y = list(LPS3h, LPS12h))

# Written under `path` because the integration step reads this file back
# from paste0(path, ...); saving to the working directory would leave the
# later readRDS() pointing at a missing (or stale) file.
saveRDS(GSE186554, paste0(path, "GSE186554_labelled_filtered_subsampled.rds"))

# Subsampling of the conditions of GSE157698.
GSE157698 <- readRDS(paste0(path, "GSE157698_labelled_filtered.rds"))
table(GSE157698$orig.ident)
# bulkw2 bulkw3 bulkw4 bulkw6  YFPw2  YFPw3  YFPw4  YFPw6
#   2359   3083   5507   4498   1127   1898   3086   5702

# Fix the RNG state so the random subsampling below is reproducible.
set.seed(42)

# Cell counts per condition as reported by the table above (total 27260).
condition_counts <- c(
  bulkw2 = 2359, bulkw3 = 3083, bulkw4 = 5507, bulkw6 = 4498,
  YFPw2 = 1127, YFPw3 = 1898, YFPw4 = 3086, YFPw6 = 5702
)
# Budget of 8963 cells (size of the healthy hepatocyte reference), split
# between the conditions in proportion to their original size.
condition_targets <- round(8963 * condition_counts / sum(condition_counts))

# Draw the requested number of cell names from each condition.
all_cells <- Cells(GSE157698)
cell_condition <- as.character(GSE157698$orig.ident)
kept_cells <- unlist(
  lapply(names(condition_targets), function(condition) {
    candidates <- all_cells[which(cell_condition == condition)]
    sample(candidates, condition_targets[[condition]])
  }),
  use.names = FALSE
)

# Keep only the sampled cells; a single subset by cell name replaces the
# per-condition objects, which previously were never combined.
GSE157698 <- subset(GSE157698, cells = kept_cells)

# Save the subsampled GSE157698 object (the previous code saved GSE186554
# under this file name). Written under `path` because the integration step
# reads this file back from paste0(path, ...).
saveRDS(GSE157698, paste0(path, "GSE157698_labelled_filtered_subsampled.rds"))



# Integration with STACAS
# ##############################

# Load every labelled/filtered dataset; GSE186554 and GSE157698 are read
# from their subsampled versions.
CRA002445 <- readRDS(paste0(path, "CRA002445_labelled_filtered.rds"))
GSE192742_healthy <- readRDS(paste0(path, "GSE192742_healthy_labelled_filtered.rds"))
GSE192742_NASH <- readRDS(paste0(path, "GSE192742_NASH_labelled_filtered.rds"))
zenodo <- readRDS(paste0(path, "zenodo_labelled_filtered.rds"))
GSE186554 <- readRDS(paste0(path, "GSE186554_labelled_filtered_subsampled.rds"))
GSE171993 <- readRDS(paste0(path, "GSE171993_labelled_filtered.rds"))
GSE136679 <- readRDS(paste0(path, "GSE136679_labelled_filtered.rds"))
GSE158866 <- readRDS(paste0(path, "GSE158866_labelled_filtered.rds"))
GSE125688 <- readRDS(paste0(path, "GSE125688_labelled_filtered.rds"))
GSE151309 <- readRDS(paste0(path, "GSE151309_labelled_filtered.rds"))
GSE157698 <- readRDS(paste0(path, "GSE157698_labelled_filtered_subsampled.rds"))

# List holding all the datasets to integrate (edit to pass the right files).
# list() is used instead of c() so the result is explicitly a list of
# objects rather than depending on how c() treats S4 objects.
fileList <- list(
  CRA002445, GSE192742_healthy, GSE192742_NASH, zenodo, GSE186554,
  GSE171993, GSE136679, GSE158866, GSE125688, GSE151309, GSE157698
)
# Dataset names, in the same order as fileList.
name_file <- c(
  "CRA002445", "GSE192742_healthy", "GSE192742_NASH", "zenodo", "GSE186554",
  "GSE171993", "GSE136679", "GSE158866", "GSE125688", "GSE151309", "GSE157698"
)

# Per-dataset preprocessing: work on the "RNA" assay, log-normalize and
# pick 2000 variable features with the "vst" method.
# lapply() replaces the 1:length() index loop, which would iterate over
# c(1, 0) if fileList were empty.
fileList <- lapply(fileList, function(obj) {
  DefaultAssay(obj) <- "RNA" # put all datasets on the same assay
  obj <- NormalizeData(obj, scale.factor = 10000, normalization.method = "LogNormalize")
  FindVariableFeatures(obj, selection.method = "vst", nfeatures = 2000)
})

# Also set the default assay on the standalone objects. fileList holds
# copies, so these assignments do not affect the list used for integration.
DefaultAssay(zenodo) <- "RNA"
DefaultAssay(CRA002445) <- "RNA"
DefaultAssay(GSE125688) <- "RNA"
DefaultAssay(GSE136679) <- "RNA"
DefaultAssay(GSE151309) <- "RNA"
DefaultAssay(GSE157698) <- "RNA"
DefaultAssay(GSE158866) <- "RNA"
DefaultAssay(GSE171993) <- "RNA"
DefaultAssay(GSE186554) <- "RNA"
DefaultAssay(GSE192742_healthy) <- "RNA"
DefaultAssay(GSE192742_NASH) <- "RNA"

# Genes present in every dataset; these are the features passed to the
# integration step. Reduce() also copes with a one-element list, where the
# former 2:length() loop would index out of bounds.
all.genes_RNA <- Reduce(intersect, lapply(fileList, row.names))

# Anchor finding and integration with STACAS (unsupervised mode).
features <- SelectIntegrationFeatures(object.list = fileList, nfeatures = 500)
Anchors_found <- FindAnchors.STACAS(
  object.list = fileList, dims = 1:30, anchor.features = features,
  alpha = 0.8, anchor.coverage = 0.5, verbose = TRUE
)
Integrated_object <- IntegrateData.STACAS(
  Anchors_found, dims = 1:30, features.to.integrate = all.genes_RNA,
  semisupervised = FALSE
)

# Select the integrated assay for the downstream steps. The previous code
# assigned the result of DefaultAssay("integrated") to Integrated_object,
# which calls the getter on a string and overwrites the integrated object.
DefaultAssay(Integrated_object) <- "integrated"

# Standard downstream workflow: scaling, PCA, UMAP, graph and clustering.
Integrated_object <- ScaleData(Integrated_object)
Integrated_object <- RunPCA(Integrated_object, verbose = FALSE, npcs = 30)
Integrated_object <- RunUMAP(Integrated_object, reduction = "pca", dims = 1:30)
# NOTE(review): no `dims` is passed here, so FindNeighbors uses its default
# while the UMAP uses 1:30 -- confirm this is intended.
Integrated_object <- FindNeighbors(Integrated_object, reduction = "pca")
# `resolution` is spelled out instead of relying on partial matching of `res`.
Integrated_object <- FindClusters(Integrated_object, resolution = 0.8)

# Quality check:
# DimPlot(Integrated_object, group.by = "labelling_digest") # colour cells by this annotation to check whether the data are well integrated

saveRDS(Integrated_object, "Integration_alpha0.8_coverage0.5_unsupervised.rds")
# Re-run FindAnchors.STACAS and IntegrateData.STACAS with other parameters
# and save each result in a separate rds file.

# LISI and ASW scores
#####################################################################
LISI <- list()
Silhouette <- list()
# Metadata column passed as the cell-type label.
meta.label <- "type"
# Metadata column passed as the batch label.
# NOTE(review): earlier in this script `labelling_digest` is tabulated with
# values such as Hep_Adult / BEC_Adult, which look like cell types -- confirm
# that meta.label and meta.batch are not swapped.
meta.batch <- "labelling_digest"
lisi_perplexity <- 30

# `dataset` is a vector holding the names of the rds files produced by the
# different integration runs; results are stored under the file name.
for (rds_file in dataset) {
  Integrated_object <- readRDS(rds_file)
  metrics <- getIntegrationMetrics(
    Integrated_object,
    meta.label = meta.label,
    meta.batch = meta.batch,
    iLISI_perplexity = lisi_perplexity
  )
  LISI[[rds_file]] <- metrics$iLISI
  Silhouette[[rds_file]] <- metrics$celltype_ASW
}

#####################################################################
# Focus: selected clusters of the integration + control hepatocytes
#####################################################################
Integrated_object <- readRDS("Integration_alpha0.8_coverage0.5_unsupervised.rds")

# Keep clusters 2, 8 and 15 (resolution 0.8), plus the cells of dataset
# GSE125688 labelled Hep_Adult.
Integration_subset <- subset(
  Integrated_object,
  subset = integrated_snn_res.0.8 %in% c("2", "8", "15") |
    (dataset == "GSE125688" & labelling_digest == "Hep_Adult")
)
saveRDS(
  Integration_subset,
  "Integration_alpha0.8_coverage0.5_unsupervised_Focus.Progenitors_Chol_HepHCT_HepHealthy.rds"
)



