

#####################################################################
#Remap datasets of interest with alevin
#####################################################################
# Run the helper script that generates the .tx2gene.tsv annotation file and the .sidx index used by salmon-alevin
source(file = "Generation_decoys_index_alevin.R")

############ in bash ######
#for each BEC condition of GSE125688
#salmon alevin -l ISR -i Mus_musculus.annotation.expanded.sidx -1 fastqFile1.fastq  -2 fastqFile2.fastq -o ConditionName -p 10 --tgMap Mus.musculus.annotation.expanded.tx2gene.tsv  --expectCells 1500 --forceCells 1500 --umiLength 6 --barcodeLength 16 --end 5

#for each Hepatocyte condition of GSE125688
#salmon alevin -l ISR -i Mus_musculus.annotation.expanded.sidx -1 fastqFile_1.fastq -2 fastqFile_2.fastq  -o  ConditionName -p 10 --tgMap Mus.musculus.annotation.expanded.tx2gene.tsv  --dropseq --expectCells 1500 --forceCells 1500

#for bam file of GSE157698 (bamtofastq is in v1.4.1)
#bamtofastq --nthreads=8 GSE157698/Bulk_W2.bam.1 fastqOutput/
#salmon alevin -l ISR -i Mus_musculus.annotation.expanded.sidx/ -1 bulk_w6_scRNA_seq_fastqs/Run147_Sample4_MissingLibrary_1_GW190505174th_novaxp/*R1_*.fastq.gz  -2 bulk_w6_scRNA_seq_fastqs/Run147_Sample4_MissingLibrary_1_GW190505174th_novaxp/*R2*.fastq.gz  -o  Bulk_w6_scRNAseq_aligned -p 10 --tgMap Mus.musculus.annotation.expanded.tx2gene.tsv  --chromiumV3 --expectCells 20000 --forceCells 20000

#for each condition of GSE192742 (--expectCells and --forceCells are set to the number of cells reported by GEO for that condition)
#salmon alevin -l ISR -i Mus_musculus.annotation.expanded.sidx/ -1 fastqFile_1.fastq.gz  -2 fastqFile_2.fastq.gz  -o  Results_realignment/GSE192742 -p 10 --tgMap Mus.musculus.annotation.expanded.tx2gene.tsv  --chromium --expectCells 5165 --forceCells 5165

# for each condition of CRA002445
#salmon alevin -l ISR -i Mus.musculus.vM24.annotation.expanded.sidx -1 mouse_f1.fastq.gz  -2 mouse_r2.fastq.gz  -o alevin_out -p 10 --tgMap Mus.musculus.annotation.expanded.tx2gene.tsv  --chromiumV3 --expectCells 20000
###########################


#####################################################################
#Select only cells of interest in remapped datasets
#####################################################################


# Read the feature table handed to tximeta::splitSE() for every GSE125688 sample below.
# Its "intron" column is renamed "unspliced" so the resulting assay carries that name.
cg <- read.delim("~/single_cell_analysis/single_cell_integration/ScRNAseq_mouse_liver_injury_atlas_v3/single_cell_alignment_for_velocity/Mus.musculus.annotation.expanded.features.tsv", header = TRUE, as.is = TRUE)
colnames(cg)[colnames(cg) == "intron"] <- "unspliced"

## BEC_Ctrl1
# Import the alevin quantification of this sample.
Bec_Ctrl1 <- tximeta::tximeta(coldata = data.frame(
  names = "GSE125688",
  files = "Results_realignment/GSE125688/BEC_Ctrl1/alevin/quants_mat.gz",
  stringsAsFactors = FALSE
), type = "alevin", skipMeta = TRUE)
# NOTE(review): cells_GSE125688 is not created in this script before this point and must
# already exist in the session; it is reassigned further down -- confirm the intended object.
cells_BEC_ctrl1 <- subset(cells_GSE125688, subset = orig.ident == "bil_adult1")
# The alevin column names are rebuilt by concatenating the 2nd and 3rd
# "_"-separated fields of each cell name.
tmp <- vapply(strsplit(Cells(cells_BEC_ctrl1), "_"), `[`, 2, FUN.VALUE = character(1)) # 2nd field
tmp2 <- vapply(strsplit(Cells(cells_BEC_ctrl1), "_"), `[`, 3, FUN.VALUE = character(1)) # 3rd field
cell_names_bec_ctrl1 <- paste0(tmp, tmp2)
# Keep only the selected cells, in the same order as Cells(cells_BEC_ctrl1).
Bec_Ctrl1 <- Bec_Ctrl1[, cell_names_bec_ctrl1]
# Split the counts into "spliced" and "unspliced" assays according to cg.
Bec_Ctrl1 <- tximeta::splitSE(Bec_Ctrl1, cg, assayName = "counts")
# "counts" is a copy of the spliced assay.
assays(Bec_Ctrl1) <- list(counts = assay(Bec_Ctrl1, "spliced"), spliced = assay(Bec_Ctrl1, "spliced"), unspliced = assay(Bec_Ctrl1, "unspliced"))
# Restore the full cell names (columns were selected in that same order above).
colnames(Bec_Ctrl1) <- Cells(cells_BEC_ctrl1)
colnames(Bec_Ctrl1)


## BEC_Ctrl2
# Import the alevin quantification of this sample.
Bec_Ctrl2 <- tximeta::tximeta(
  coldata = data.frame(
    names = "GSE125688",
    files = "Results_realignment/GSE125688/BEC_Ctrl2/alevin/quants_mat.gz",
    stringsAsFactors = FALSE
  ),
  type = "alevin",
  skipMeta = TRUE
)
# Cells of this sample in the reference object.
cells_BEC_ctrl2 <- subset(cells_GSE125688, subset = orig.ident == "bil_adult2")
# Rebuild the alevin column names from the 2nd and 3rd "_"-separated fields of each cell name.
tmp <- vapply(strsplit(Cells(cells_BEC_ctrl2), "_"), function(fields) fields[2], character(1))
tmp2 <- vapply(strsplit(Cells(cells_BEC_ctrl2), "_"), function(fields) fields[3], character(1))
cell_names_bec_ctrl2 <- paste0(tmp, tmp2)
# Keep the selected cells, then split the counts into spliced / unspliced assays.
Bec_Ctrl2 <- Bec_Ctrl2[, cell_names_bec_ctrl2]
Bec_Ctrl2 <- tximeta::splitSE(Bec_Ctrl2, cg, assayName = "counts")
# "counts" is a copy of the spliced assay.
assays(Bec_Ctrl2) <- list(
  counts = assay(Bec_Ctrl2, "spliced"),
  spliced = assay(Bec_Ctrl2, "spliced"),
  unspliced = assay(Bec_Ctrl2, "unspliced")
)
# Restore the full cell names.
colnames(Bec_Ctrl2) <- Cells(cells_BEC_ctrl2)
colnames(Bec_Ctrl2)

## BEC_Ctrl3
# Import the alevin quantification of this sample.
Bec_Ctrl3 <- tximeta::tximeta(
  coldata = data.frame(
    names = "GSE125688",
    files = "Results_realignment/GSE125688/BEC_Ctrl3/alevin/quants_mat.gz",
    stringsAsFactors = FALSE
  ),
  type = "alevin",
  skipMeta = TRUE
)
# Cells of this sample in the reference object.
cells_BEC_ctrl3 <- subset(cells_GSE125688, subset = orig.ident == "bil_adult3")
# Rebuild the alevin column names from the 2nd and 3rd "_"-separated fields of each cell name.
tmp <- vapply(strsplit(Cells(cells_BEC_ctrl3), "_"), function(fields) fields[2], character(1))
tmp2 <- vapply(strsplit(Cells(cells_BEC_ctrl3), "_"), function(fields) fields[3], character(1))
cell_names_bec_ctrl3 <- paste0(tmp, tmp2)
# Keep the selected cells, then split the counts into spliced / unspliced assays.
Bec_Ctrl3 <- Bec_Ctrl3[, cell_names_bec_ctrl3]
Bec_Ctrl3 <- tximeta::splitSE(Bec_Ctrl3, cg, assayName = "counts")
# "counts" is a copy of the spliced assay.
assays(Bec_Ctrl3) <- list(
  counts = assay(Bec_Ctrl3, "spliced"),
  spliced = assay(Bec_Ctrl3, "spliced"),
  unspliced = assay(Bec_Ctrl3, "unspliced")
)
# Restore the full cell names.
colnames(Bec_Ctrl3) <- Cells(cells_BEC_ctrl3)
colnames(Bec_Ctrl3)

## BEC_DDC1
# Import the alevin quantification of this sample.
Bec_DDC1 <- tximeta::tximeta(
  coldata = data.frame(
    names = "GSE125688",
    files = "Results_realignment/GSE125688/BEC_DDC/alevin/quants_mat.gz",
    stringsAsFactors = FALSE
  ),
  type = "alevin",
  skipMeta = TRUE
)
# Cells of this sample in the reference object.
cells_BEC_DDC <- subset(cells_GSE125688, subset = orig.ident == "bil_DDC1")
# Rebuild the alevin column names from the 2nd and 3rd "_"-separated fields of each cell name.
tmp <- vapply(strsplit(Cells(cells_BEC_DDC), "_"), function(fields) fields[2], character(1))
tmp2 <- vapply(strsplit(Cells(cells_BEC_DDC), "_"), function(fields) fields[3], character(1))
cell_names_bec_DDC <- paste0(tmp, tmp2)
# Keep the selected cells, then split the counts into spliced / unspliced assays.
Bec_DDC1 <- Bec_DDC1[, cell_names_bec_DDC]
Bec_DDC1 <- tximeta::splitSE(Bec_DDC1, cg, assayName = "counts")
# "counts" is a copy of the spliced assay.
assays(Bec_DDC1) <- list(
  counts = assay(Bec_DDC1, "spliced"),
  spliced = assay(Bec_DDC1, "spliced"),
  unspliced = assay(Bec_DDC1, "unspliced")
)
# Restore the full cell names.
colnames(Bec_DDC1) <- Cells(cells_BEC_DDC)
#colnames(Bec_DDC1)


## Hep_DDC1
# Import the alevin quantification of this sample.
Hep_DDC1 <- tximeta::tximeta(coldata = data.frame(
  names = "GSE125688",
  files = "Results_realignment/GSE125688/Hep_DDC1/alevin/quants_mat.gz",
  stringsAsFactors = FALSE
), type = "alevin", skipMeta = TRUE)
cells_Hep_DDC <- subset(cells_GSE125688, subset = orig.ident == "hep_DDC1")
Cells(cells_Hep_DDC)
# The barcode is the 3rd "_"-separated field of the cell name.
tmp <- vapply(strsplit(Cells(cells_Hep_DDC), "_"), `[`, 3, FUN.VALUE = character(1))
# Some barcodes (e.g. GTGCCCGTCGNN, GAGGACGTCGGN) are absent from the alevin output, so
# indexing with the full vector fails with "index out of bounds" and stops the script.
# Only the barcodes that are actually present are used for the subset.
# (Despite its name, vect_True_False holds barcodes, not logicals.)
vect_True_False <- tmp[tmp %in% colnames(Hep_DDC1)]
if (length(vect_True_False) < length(tmp)) {
  message(
    length(tmp) - length(vect_True_False),
    " hep_DDC1 barcode(s) not found in the alevin output were dropped"
  )
}
Hep_DDC1 <- Hep_DDC1[, vect_True_False]
# Split the counts into "spliced" and "unspliced" assays according to cg.
Hep_DDC1 <- tximeta::splitSE(Hep_DDC1, cg, assayName = "counts")
colnames(Hep_DDC1)
# "counts" is a copy of the spliced assay.
assays(Hep_DDC1) <- list(counts = assay(Hep_DDC1, "spliced"), spliced = assay(Hep_DDC1, "spliced"), unspliced = assay(Hep_DDC1, "unspliced"))
# Prefix the barcodes with the sample name to rebuild the full cell names.
colnames(Hep_DDC1) <- paste0("hep_DDC1_", colnames(Hep_DDC1))
colnames(Hep_DDC1)

## Hep_Ctrl
# Import the alevin quantification of this sample.
Hep_ctrl <- tximeta::tximeta(coldata = data.frame(
  names = "GSE125688",
  files = "Results_realignment/GSE125688/Hep_Ctrl1/alevin/quants_mat.gz",
  stringsAsFactors = FALSE
), type = "alevin", skipMeta = TRUE)
Cells(Hep_ctrl)
cells_Hep_ctrl <- subset(cells_GSE125688, subset = orig.ident == "hep_adult1")
Cells(cells_Hep_ctrl)
# The barcode is the 3rd "_"-separated field of the cell name.
tmp <- vapply(strsplit(Cells(cells_Hep_ctrl), "_"), `[`, 3, FUN.VALUE = character(1))
# Indexing with the full barcode vector fails with "index out of bounds" when some
# barcodes (e.g. GTGCCCGTCGNN, GAGGACGTCGGN) are absent from the alevin output, which
# stops the script. Only the barcodes that are actually present are used for the subset.
# (Despite its name, vect_True_False holds barcodes, not logicals.)
vect_True_False <- tmp[tmp %in% colnames(Hep_ctrl)]
if (length(vect_True_False) < length(tmp)) {
  message(
    length(tmp) - length(vect_True_False),
    " hep_adult1 barcode(s) not found in the alevin output were dropped"
  )
}
Hep_ctrl <- Hep_ctrl[, vect_True_False]
# Split the counts into "spliced" and "unspliced" assays according to cg.
Hep_ctrl <- tximeta::splitSE(Hep_ctrl, cg, assayName = "counts")
colnames(Hep_ctrl)
# "counts" is a copy of the spliced assay.
assays(Hep_ctrl) <- list(counts = assay(Hep_ctrl, "spliced"), spliced = assay(Hep_ctrl, "spliced"), unspliced = assay(Hep_ctrl, "unspliced"))
# Prefix the barcodes with the sample name to rebuild the full cell names.
colnames(Hep_ctrl) <- paste0("hep_adult1_", colnames(Hep_ctrl))
colnames(Hep_ctrl)

# Column-bind the six GSE125688 samples into one object and store it on disk.
GSE125688 <- SingleCellExperiment::cbind(
  Bec_Ctrl1,
  Bec_Ctrl2,
  Bec_Ctrl3,
  Bec_DDC1,
  Hep_DDC1,
  Hep_ctrl
)
saveRDS(
  GSE125688,
  file = "~/single_cell_analysis/single_cell_integration/ScRNAseq_mouse_liver_injury_atlas_v3/single_cell_alignment_for_velocity/GSE125688_with_Hep_ctrl_velocity.rds"
)

# Load every remapped dataset, then keep in the integrated object only the cells
# that were remapped.
CRA002445 <- readRDS(file = "single_cell_alignment_for_velocity/CRA002445_velocity.rds")
GSE125688 <- readRDS(file = "single_cell_alignment_for_velocity/GSE125688_with_Hep_ctrl_velocity.rds")
GSE157698 <- readRDS(file = "single_cell_alignment_for_velocity/GSE157698_velocity.rds")
GSE192742_N <- readRDS(file = "single_cell_alignment_for_velocity/GSE192742_nafld_right_cells.rds")

# Integrated object restricted to the populations of interest.
Integration_focus <- readRDS(file = "Integration_alpha0.8_coverage0.5_unsupervised_Focus.Progenitors_Chol_HepHCT_HepHealthy.rds")

# Cell names of each remapped dataset.
# NOTE: from here on cells_GSE125688 is a vector of cell names.
cells_CRA002445 <- Cells(CRA002445)
cells_GSE125688 <- Cells(GSE125688)
cells_GSE157698 <- Cells(GSE157698)
cells_GSE192742 <- Cells(GSE192742_N)
pooled_cells <- c(
  cells_CRA002445,
  cells_GSE125688,
  cells_GSE157698,
  cells_GSE192742
)

# Subset the integrated object to the pooled cells and save the result.
integrated_focus_only_cells <- subset(Integration_focus, cells = pooled_cells)
saveRDS(
  integrated_focus_only_cells,
  file = "Integration_alpha0.8_cov0.5_Focus.Progenitors_Chol_HepHCT_HepHealthy_matched_cell_velocity.rds"
)

# Build the object that will be exported as AnnData for the velocity analysis.
# Batch-corrected expression values taken from the "integrated" assay.
matrice_batch_corr <- Integration_focus@assays$integrated@data
# Merge the four remapped datasets cell-wise.
Integration_velocity <- as(
  SingleCellExperiment::cbind(CRA002445, GSE125688, GSE157698, GSE192742_N),
  "SingleCellExperiment"
)
#12752 cells
# Copy the object's cell names onto the columns of each stored assay matrix.
for (assay_name in c("counts", "spliced", "unspliced")) {
  colnames(Integration_velocity@assays@data[[assay_name]]) <- colnames(Integration_velocity)
}
## Translate Ensembl gene IDs to MGI gene symbols
library(biomaRt)
mmusculus <- useEnsembl(biomart = "genes", dataset = "mmusculus_gene_ensembl", version = 108)

# Query the symbol of every gene of the velocity object.
gene_id_test_matrix_spliced <- getBM(
  attributes = c("ensembl_gene_id", "mgi_symbol"),
  filters = "ensembl_gene_id",
  values = rownames(Integration_velocity),
  mart = mmusculus
)
# Drop the entries without a symbol. A logical mask is used instead of -which(...):
# when no symbol is empty, which() returns integer(0) and x[-integer(0), ] selects
# zero rows, which would silently discard the whole conversion table.
has_symbol <- !is.na(gene_id_test_matrix_spliced$mgi_symbol) &
  gene_id_test_matrix_spliced$mgi_symbol != ""
gene_conversion_wo_empty_names <- gene_id_test_matrix_spliced[has_symbol, ]
# One symbol per row of the object, in row order; genes without a symbol get NA.
mgi_symbols <- gene_conversion_wo_empty_names$mgi_symbol[match(rownames(Integration_velocity), gene_conversion_wo_empty_names$ensembl_gene_id)]

# Rename the rows of the three stored assay matrices with the symbols.
rownames(Integration_velocity@assays@data$counts) <- mgi_symbols
rownames(Integration_velocity@assays@data$spliced) <- mgi_symbols
rownames(Integration_velocity@assays@data$unspliced) <- mgi_symbols


## Keep only the genes that are present in the batch-corrected ("integrated") assay
count_matrix <- Integration_velocity@assays@data$counts[
  rownames(Integration_velocity@assays@data$counts) %in% rownames(Integration_focus@assays$integrated),
]
spliced_matrix <- Integration_velocity@assays@data$spliced[
  rownames(Integration_velocity@assays@data$spliced) %in% rownames(Integration_focus@assays$integrated),
]
unspliced_matrix <- Integration_velocity@assays@data$unspliced[
  rownames(Integration_velocity@assays@data$unspliced) %in% rownames(Integration_focus@assays$integrated),
]

# Conversely, restrict the batch-corrected matrix to the genes of count_matrix
# and to the cells of the velocity object.
matrice_batch_corr <- matrice_batch_corr[
  rownames(matrice_batch_corr) %in% rownames(count_matrix),
  colnames(matrice_batch_corr) %in% colnames(Integration_velocity)
]
# Several Ensembl IDs can share one symbol; count the duplicated row names.
table(duplicated(rownames(count_matrix)))
# FALSE  TRUE 
# 11972    15

# Duplicated symbols are handled by summing their rows.
count_matrix_aggregation_gene <- as(
  rowsum(count_matrix, group = rownames(count_matrix)),
  "dgCMatrix"
)
spliced_matrix_aggregation_gene <- as(
  rowsum(spliced_matrix, group = rownames(spliced_matrix)),
  "dgCMatrix"
)
unspliced_matrix_aggregation_gene <- as(
  rowsum(unspliced_matrix, group = rownames(unspliced_matrix)),
  "dgCMatrix"
)
# Distribute the batch-corrected expression (slot "integrated") between spliced and
# unspliced according to the spliced fraction measured on the normalised counts.
Snorm <- LogNormalize(spliced_matrix_aggregation_gene)
Unlog_Snorm <- expm1(Snorm)
Unorm <- LogNormalize(unspliced_matrix_aggregation_gene)
Unlog_Unorm <- expm1(Unorm)

# Batch-corrected values back on the linear scale, aligned with the normalised matrices.
Unlog_MbatchCorr <- as(expm1(matrice_batch_corr), "dgCMatrix")
Unlog_MbatchCorr <- Unlog_MbatchCorr[rownames(Unlog_Snorm), colnames(Unlog_Snorm)]

# Spliced share of each gene in each cell; 0 / 0 yields NaN, replaced by 0 below.
spliced_fraction <- Unlog_Snorm / (Unlog_Snorm + Unlog_Unorm)

Sb <- as.matrix(Unlog_MbatchCorr * spliced_fraction)
Sb[is.nan(Sb)] <- 0
Sb <- as(Sb, "dgCMatrix")

Ub <- as.matrix(Unlog_MbatchCorr * (1 - spliced_fraction))
Ub[is.nan(Ub)] <- 0
Ub <- as(Ub, "dgCMatrix")

# Store the corrected matrices in the object; "counts" is a copy of the spliced one.
Integration_velocity@assays@data$counts <- Sb
Integration_velocity@assays@data$spliced <- Sb
Integration_velocity@assays@data$unspliced <- Ub

# Cells-by-genes matrices, log1p-transformed and stored as sparse matrices.
spliced <- as(log1p(t(as.matrix(Sb))), "dgCMatrix")
unspliced <- as(log1p(t(as.matrix(Ub))), "dgCMatrix")

# UMAP and PCA coordinates of the cells that are also in the velocity object.
Umap_coordinate <- Embeddings(Integration_focus, reduction = "umap")
Umap_coordinate <- Umap_coordinate[colnames(Integration_focus) %in% colnames(Integration_velocity), ]
PCA <- Embeddings(Integration_focus, reduction = "pca")
PCA <- PCA[colnames(Integration_focus) %in% colnames(Integration_velocity), ]

# Restrict the integrated object to the same cells, using a metadata column holding
# the cell name as the subsetting key.
Integration_focus[["cell.name"]] <- colnames(Integration_focus)
Integration_focus <- subset(
  Integration_focus,
  subset = cell.name %in% colnames(Integration_velocity)
)

# Put the rows of both matrices in the cell order of the integrated object.
spliced_reordered <- spliced[colnames(Integration_focus), ]
unspliced_reordered <- unspliced[colnames(Integration_focus), ]

# Export the corrected spliced / unspliced matrices, the embeddings and the cell
# metadata as an AnnData .h5ad file through reticulate.
library(reticulate)
use_python("bin/python")
sc <- import("scanpy")
anndata <- import("anndata")
pd <- import("pandas")

# AnnData with cells in rows and genes in columns.
adata <- sc$AnnData(spliced_reordered)
adata$var_names <- colnames(spliced_reordered)
adata$obs_names <- rownames(spliced_reordered)
# The former `adata$obs.index <- rownames(orig_ident)` was removed: it read orig_ident
# before it was created (error in a fresh session) and the observation names are
# already set through obs_names just above.
adata$layers <- dict(spliced = spliced_reordered, unspliced = unspliced_reordered)
adata$obsm <- dict(X_pca = PCA, X_umap = Umap_coordinate)

# Per-cell metadata, one single-column data frame per variable.
orig_ident <- as.data.frame(Integration_focus$orig.ident)
type <- as.data.frame(Integration_focus$type)
labelling_semi_sup <- as.data.frame(Integration_focus$labelling_semi_supervised)
barcode <- as.data.frame(Integration_focus$barcode)
labelling_digest <- as.data.frame(Integration_focus$labelling_digest)
dataset <- as.data.frame(Integration_focus$dataset)

# Quick checks: label distribution, and cell names on both sides.
table(labelling_digest)
tail(rownames(orig_ident))
adata$obs_names

# Attach the metadata columns to the observations.
adata$obs["orig.ident"] <- orig_ident
adata$obs["type"] <- type
adata$obs["labelling_semi_sup"] <- labelling_semi_sup
adata$obs["barcode"] <- barcode
adata$obs["labelling_digest"] <- labelling_digest
adata$obs["dataset"] <- dataset

sc$write("Integration_focus_with_velocity_metadata.h5ad", adata)

# Write the metadata of the cells shared with the velocity object to a CSV file.
Integration_focus[["cell.name"]] <- colnames(Integration_focus)
Integration_focus <- subset(
  Integration_focus,
  subset = cell.name %in% colnames(Integration_velocity)
)
# One row per cell, one named column per metadata variable.
metadata <- data.frame(
  orig.ident = Integration_focus$orig.ident,
  type = Integration_focus$type,
  labelling_semi_sup = Integration_focus$labelling_semi_supervised,
  labelling_digest = Integration_focus$labelling_digest,
  barcode = Integration_focus$barcode,
  dataset = Integration_focus$dataset
)
write.csv(metadata, file = "single_cell_alignment_for_velocity/metadata_for_velocity.csv")


############ Velocity+CellRank #############################
# Run the pseudotime analysis script
source(file = "pseudotime_analysis.R")
######## in bash ##############
#python Integration_velocity_construction.py

###############################
