#Analysis of raw .h5ad files from SevenBridges
#Import needed modules and setup up the single-cell adata object.
#Import modules

import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#Read in the h5ad files, remove the placeholders with your file path
file_path = "/Users/manueltrebo/Documents/GitHub/STAR_scRNA_seq/raw_data/lung_raw.h5ad"

adata = sc.read_h5ad(file_path)

#Perform quality control by removing undetermined cells and filter cells based on Sample Tags.

#(Optional) Setup up the adata object by removing unneeded information for the sake of this tutorial:
adata.obs = adata.obs.iloc[:,[1,2]]

#(Optional) In case of multiplexed samples, only keep cells that were correctly assigned and remove the rest:
adata = adata[(adata.obs["Sample_Name"] == "GG")|(adata.obs["Sample_Name"] == "TU")].copy()

#Identify mitochondrial genes and calculate quality control metrics.x
adata.var["mt"] = adata.var_names.str.startswith("MT-")

sc.pp.calculate_qc_metrics(
    adata, qc_vars=["mt"], log1p=False, inplace=True, percent_top=None
)

#Plot the raw data using violin visualization to identify optimal thresholds.

sc.pl.violin(
    adata,
    ["n_genes_by_counts", "total_counts", "pct_counts_mt"],
    groupby="Sample_Name",
    size = 0.25,
    jitter=0.2,
    multi_panel=True,
)

#Perform filtering according to the selected filtering criteria and the samples individual needs.

sc.pp.filter_cells(adata, min_counts=1000)
sc.pp.filter_cells(adata, max_counts = 60000)
sc.pp.filter_cells(adata, min_genes= 200)
sc.pp.filter_cells(adata, max_genes=8000)

adata = adata[adata.obs["pct_counts_mt"] < 30].copy()

'''
Note: As described in a previous publication (Salcher et al 2024) we choose a less stringent threshold of < 30 % mitochondrial counts 
due to the increased presence of mitochondrial transcripts using the BD Rhapsody system.
'''

'\nNote: As described in a previous publication (Salcher et al 2024) we choose a less stringent threshold of < 30 % mitochondrial counts \ndue to the increased presence of mitochondrial transcripts using the BD Rhapsody system.\n'

#Visualize the filtered data again for comparison to the raw data.

sc.pl.violin(
    adata,
    ["n_genes_by_counts", "total_counts", "pct_counts_mt"],
    groupby="Sample_Name",
    size = 0.25,
    jitter=0.2,
    multi_panel=True,
)

#Perform data scaling and transformation to remove technical biases and stabilize variance. 

#Normalize the data and transform it using the log1p function, but keep the raw counts in a separata adata layer.
adata.layers["counts"] = adata.X #optional step to preserve raw_counts
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)

#Calculate highly variable genes based on Seurat_v3 flavor using raw counts

sc.pp.highly_variable_genes(
    adata,
    layer = "counts",
    n_top_genes=6000,
    subset=False,
    flavor="seurat_v3",
)

'''
Note: The ideally chosen number of selected variable genes again is dependent on your sample, 
please refer to our published papers where you can find the chosen values for our datasets.
'''

'\nNote: The ideally chosen number of selected variable genes again is dependent on your sample, \nplease refer to our published papers where you can find the chosen values for our datasets.\n'

#Reduce the data dimensionality and compute neighbors and umap embeddings for visualization including leiden clustering for cell annotation.

#Calculate principal components for dimensionality reduction
sc.tl.pca(adata, mask_var="highly_variable")

#Computing of the nearest neighbor distance matrix.
sc.pp.neighbors(adata)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.

#Embed the neighborhood graph using Uniform Manifold Approximation and Projection (UMAP)
sc.tl.umap(adata)

#Cluster the cells using the Leiden algorithm.
sc.tl.leiden(adata, resolution=0.5)

/var/folders/gk/nnxhwmls7zvg4ghvyrg2xxb00000gn/T/ipykernel_74226/1036972697.py:2: FutureWarning: In the future, the default backend for leiden will be igraph instead of leidenalg.

 To achieve the future defaults please pass: flavor="igraph" and n_iterations=2.  directed must also be False to work with igraph's implementation.
  sc.tl.leiden(adata, resolution=0.5)

#Visualize results using UMAP

sc.pl.umap(adata, color="leiden")

sc.pl.umap(adata, color="total_counts")