# Stereo-seq analysis pipeline
# Step one: Convert the stereoseq gene expression data into Seurat to perform the multiple sample integration

""" Run in python """
# Read the Stereoseq data through python and write the gene expression matrix into feather file and gene name into csv 
import stereo as st
import utils
from stereo.core.ms_data import MSData
from stereo.core.ms_pipeline import slice_generator
import pandas as pd
import pyarrow.feather as feather

samples_list = ["B01806B5", "B01806B6", "B01809A3", "B01809A4", "B01809C2", "D02175A4", "D02175A6"]
for sample in samples_list:
    print(sample)
    data_path = sample+ "/GeneExpMatrix/" + sample + ".cellbin.gef"
    st.io.read_gef_info(data_path)
    data1 = st.io.read_gef(file_path=data_path, bin_type='cell_bins')
    df = data1.to_df()
    df = df.T
    feather.write_feather(df, sample + "/matrix.feather")
    gene_list = pd.DataFrame(df.index)
    gene_list.to_csv(sample + "/gene_list.csv")
""""""

# After writing the gene expression and gene label for each individual, use the Seurat in R to read the data
# High memory needed and time consuming (finished on loni)

""" Run in R """
library(Seurat)
library(zellkonverter)
library(Matrix)

# clean the environment
rm(list = ls())
gc()

sample_list = c("B01806B5", "B01806B6", "B01809A3", "B01809A4", "B01809C2", "D02175A4", "D02175A6")
for (sample in sample_list){
    print(sample)
    # import gene expression matrix
    library(arrow)
    data = read_feather(paste0(sample,"/matrix.feather"))
    
    gene_list = read.csv(paste0(sample,"/gene_list.csv"))
    gene_list[,2]
    
    rownames(data) = gene_list[,2]
    
    # Create Seurat data object
    pbmc <- CreateSeuratObject(
      counts = data, 
      project = "pbmc3k", 
      min.cells = 0, 
      min.features = 0
      )

    # Find the mitochondria genes, cells with high mitochondira gene expression are considered as dead cells    
    pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
    
    
    # Add the label for each sample
    pbmc$sample = sample
    pbmc$diagnosis = "Normal"
    
    # save the R RDS file for each sample    
    saveRDS(pbmc, paste0(sample, "/Seurat_gene.RDS"))
    rm(list=ls())
    gc()
}
""""""

# Perform the integration using sctransform for normalization and CCA for integration (working on loni)
""" Run in R """
library(Seurat)
library(tidyr)

# read the RDS file from each sample folder
data_list = c("B01806B5", "B01806B6", "B01809A3", "B01809A4", "B01809C2", "D02175A4", "D02175A6")
combine_data_list = list()

for (i in data_list){
    file = readRDS(paste0(i, "/Seurat_gene.RDS"))
    matrix = file@assays$RNA@counts
    matrix<-matrix[,!grepl("__index_level_0__",colnames(matrix))] # remove the extra columns
    pbmc <- CreateSeuratObject(counts = matrix, project = "pbmc3k", min.cells = 3, min.features = 200)
    pbmc$sample = i
    combine_data_list = c(combine_data_list, pbmc)
}

ifnb.list <- lapply(X = combine_data_list,FUN = SCTransform)


features <- SelectIntegrationFeatures(object.list = ifnb.list)
ifnb.list <- PrepSCTIntegration(object.list = ifnb.list, anchor.features = features)
immune.anchors <- FindIntegrationAnchors(object.list = ifnb.list, normalization.method = "SCT", anchor.features = features)

# this command creates an 'integrated' data assay
immune.combined.sct <- IntegrateData(anchorset = immune.anchors, normalization.method = "SCT")

immune.combined.sct <- RunPCA(immune.combined.sct, verbose = FALSE)
immune.combined.sct <- RunUMAP(immune.combined.sct, reduction = "pca", dims = 1:30)

# save the data as RDS in the Result file
saveRDS(immune.combined.sct, "Result/immune.combined.RDS")
""""""