from  distutils.version import StrictVersion
import scanpy as sc
import besca as bc
import re



def print_software_versions():
    scv = sc.__version__
    if(StrictVersion(scv) >= "1.6"):
        sc.logging.print_header()
    else:
        sc.logging.print_versions()
    bcstr = StrictVersion(re.sub("\+.*$", "", bc.__version__))
    print("besca=={}".format(bcstr))
    return None


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from scipy import sparse, io
import seaborn as sns
import os
import time
import IPython

print_software_versions()

import logging
import seaborn as sns
from sinfo import sinfo

# Seed is fixed for reproducible analysis.
import random 
random.seed(1)

# for standard processing, set verbosity to minimum
sc.settings.verbosity = 0  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.set_figure_params(dpi=80)
sns.set_context("paper", font_scale=0.9) #changes font sizes so better readable
plt.rcParams['svg.fonttype'] = 'none' #ensures that any saved svgs have editable fonts
version = '2.9'
start0 = time.time()

%matplotlib inline

scanpy==1.6.1 anndata==0.7.5 umap==0.4.6 numpy==1.20.1 scipy==1.6.0 pandas==1.2.2 scikit-learn==0.24.1 statsmodels==0.12.2 python-igraph==0.8.3 louvain==0.7.0 leidenalg==0.8.3
besca==2.4


# decisions to be made
use_example_dataset = False # If True, the pipeline will run using the example dataset shipped with BESCA

species = 'human'
batch_to_correct = 'None' # must be "None" or any one of the labels in "metadata.tsv", ID, SPECIES, TISSUE, DONOR, TREATMENT; typically "ID" or "DONOR"
analysis_name = 'standard_workflow_besca2'
split_condition='experiment' #'experiment' is generally a good default
citeseq = False #specify if the dataset contains citeseq data
dynrange=['B2m','Actb','Pgk1','Ctcf'] #genes for which to plot dynamic range
example_markers = ['PTPRC', 'CD3E', 'CD8A', 'PDCD1'] # Will be shown in the first umaps
if species=='human': dynrange=[x.upper() for x in dynrange]

#additional labeling
labeling_to_use = 'None' # must be "None" or any one of the labels in "metadata.tsv", ID, SPECIES, TISSUE, DONOR, TREATMENT; typically "ID" or "DONOR"
labeling_name = 'MyAnno' # define name under which the labeling should be exported
labeling_description = 'celltype annotation' #define description which should be saved to labeling_info file
labeling_author = 'MyAnnotAuthor' #define author which should be saved to labeling info file


# define filepath (this is the folder that contains "raw" and "analyzed")
root_path = os.getcwd()


# the standard parameter section
standard_min_genes = 800 # 500
standard_min_cells = 3 # 30
standard_min_counts = 2500 # 1000
standard_n_genes = 6000 # 3000 # this is the most tricky one to set
standard_percent_mito = 0.15 # 0.05
standard_max_counts = 3e6 # 20000 #might be redundant with n_genes


#define standardized filepaths based on above input
results_folder = os.path.join(root_path, 'analyzed', analysis_name)
results_file = os.path.join(results_folder, analysis_name + '.h5ad') # specify a .h5ad file for storing the results
log_file = os.path.join(results_folder, analysis_name + '.standard.log') # specify a log file for keeping a short summary and overview
sc.settings.figdir = os.path.join(results_folder, 'figures')

#setup standard workflow (generates output directories and setsup logging file)
bc.st.setup(results_folder, 
              analysis_name, 
              labeling_name, 
              labeling_to_use, 
              log_file, 
              version,
              root_path, 
              species, 
              batch_to_correct, 
              standard_min_genes,
              standard_min_cells,
              standard_min_counts,
              standard_n_genes,
              standard_percent_mito,
              standard_max_counts)

# read input data
if use_example_dataset:
    print('utilizing example data')
    adata = bc.datasets.Kotliarov2020_raw()
    adata.obs['donor'] = 'donor1'
elif citeseq:
    #generate file strucutre for citeseq data
    results_folder_citeseq = os.path.join(results_folder, 'citeseq')
    results_folder_merged= os.path.join(results_folder, 'citeseq_merged')
    results_file_citeseq = os.path.join(results_folder_citeseq, analysis_name + '.h5ad')
    results_file_merged = os.path.join(results_folder_merged, analysis_name + '.h5ad')
    bc.st.setup_citeseq(results_folder,
                        labeling_name, 
                        labeling_to_use)
    
    print('utilizing citeseq data')
    #read citeseq data and normal data
    logging.info('Reading Gene Expression Data...')
    adata = bc.st.read_matrix(root_path, citeseq = 'gex_only')
    logging.info('Reading Citeseq Data...')
    adata_prot = bc.st.read_matrix(root_path, citeseq = 'citeseq_only')
else:
    #read data
    adata = bc.st.read_matrix(root_path, citeseq = None)

LOG MESSAGE: Standard Pipeline Version 2.9 used
LOG MESSAGE: 2021-07-21
LOG MESSAGE: Analysis 'standard_workflow_besca2' on data located in'/pstore/data/biomics/_pre_portfolio/ID/70171_Immunomics_HBV/HBV_SMARTSeq_NovaSeq_July2021'
LOG MESSAGE: species: human
LOG MESSAGE: Batch effect to correct: None
LOG MESSAGE: Parameters:
LOG MESSAGE: 	standard_min_genes = 800
LOG MESSAGE: 	standard_min_cells = 3
LOG MESSAGE: 	standard_min_counts = 2500
LOG MESSAGE: 	standard_n_genes = 6000
LOG MESSAGE: 	standard_max_counts = 3000000.0
LOG MESSAGE: 	standard_percent_mito = 0.15
LOG MESSAGE: 	Time for creating all output directories and setting up logging: 0.019s

all output directories created successfully
reading matrix.mtx
adding cell barcodes
adding genes

LOG MESSAGE: After input: 1711 cells, 60683 genes
LOG MESSAGE: 	Time for reading data: 0.802s

making var_names unique
adding ENSEMBL gene ids to adata.var
adding annotation


# Exclude VHB27
adata = bc.subset_adata(adata, (adata.obs.donor != 'VHB27'), raw=False)


adata

AnnData object with n_obs × n_vars = 1166 × 60683
    obs: 'CELL', 'ID', 'GROUP', 'CONDITION', 'cell', 'batch', 'NovaSeq_batch', 'donor', 'disease', 'organ', 'experiment', 'sample_description', 'sample_type', 'protocol', 'Sex', 'fresh.frozen', 'Plate.ID', 'Population', 'Index.plate.used', 'Consecutive.sample.numbers', 'Date', 'Fastq_id', 'ORGANISM_BIOKIT', 'TISSUE_BIOKIT'
    var: 'ENSEMBL', 'SYMBOL'


#calculate mitochondrial gene content
bc.pp.fraction_counts(adata=adata, species=species)


patient_data = pd.read_csv(os.path.join(root_path, '/pstore/data/biomics_clin/ID/70171_Immunomics_HBV/patient_data.txt'), sep='\t', index_col='donor', na_values='na')
adata.obs = adata.obs.join(patient_data['HBsAg'], on='donor', how='left')


temp=bc.tl.count_occurrence(adata,split_condition)
sns.barplot(y=temp.index,x=temp.Counts,color='gray',orient='h')

<AxesSubplot:xlabel='Counts'>


fig, ax = plt.subplots(1)
fig.set_figwidth(8)
fig.set_figheight(5)
fig.tight_layout()

bc.pl.transcript_capture_efficiency(adata,ax=ax)
fig.savefig(os.path.join(results_folder, 'figures/transcriptcaptureefficiency.png'), format='png', bbbox_inches = 'tight') #save figure for QC report


#fig = bc.pl.librarysize_overview(adata, bins=100)
#fig.savefig(os.path.join(results_folder, 'figures/librarysize.png'), format='png',bbbox_inches = 'tight') #save figure for QC report


adata_unfiltered = adata.copy()


fig, ((ax1, ax2, ax3), (ax4, ax5, ax6)) = plt.subplots(ncols=3, nrows=2)
fig.set_figwidth(17)
fig.set_figheight(9)
fig.tight_layout(pad=4.5)

bc.pl.kp_genes(adata, min_genes=standard_min_genes, ax = ax1)
bc.pl.kp_counts(adata, min_counts=standard_min_counts, ax = ax2)
bc.pl.kp_cells(adata, min_cells=standard_min_cells, ax = ax3)
bc.pl.max_genes(adata, max_genes=standard_n_genes, ax = ax4)
bc.pl.max_mito(adata, max_mito=standard_percent_mito, annotation_type='SYMBOL', species=species, ax = ax5)
bc.pl.max_counts(adata, max_counts=standard_max_counts, ax=ax6)
fig.savefig(os.path.join(results_folder, 'figures/filtering_thresholds.png'), format='png', bbbox_inches = 'tight') #save figure for QC report


sc.pl.violin(adata, ['n_genes', 'n_counts', 'percent_mito'], jitter=0.2, multi_panel=True, save = '.before_filtering.png')

... storing 'GROUP' as categorical
... storing 'CONDITION' as categorical
... storing 'batch' as categorical
... storing 'donor' as categorical
... storing 'disease' as categorical
... storing 'organ' as categorical
... storing 'experiment' as categorical
... storing 'sample_description' as categorical
... storing 'sample_type' as categorical
... storing 'protocol' as categorical
... storing 'Sex' as categorical
... storing 'fresh.frozen' as categorical
... storing 'Plate.ID' as categorical
... storing 'Population' as categorical
... storing 'Index.plate.used' as categorical
... storing 'Consecutive.sample.numbers' as categorical
... storing 'ORGANISM_BIOKIT' as categorical
... storing 'TISSUE_BIOKIT' as categorical
... storing 'SYMBOL' as categorical


sc.pl.violin(adata, ['percent_mito','n_genes', 'n_counts'], groupby=split_condition,jitter=0.1,rotation=90, save = '.before_filtering.split.png')


%%capture filtering1
adata = bc.st.filtering_cells_genes_min(adata, standard_min_cells, standard_min_genes, standard_min_counts)

LOG MESSAGE: After filtering for minimum number of cells and minimum number of expressed genes: 1131 cells, 29999 genes
LOG MESSAGE: 	Time for filtering: 0.138s


filtering1.show()

started with  1166  total cells and  60683  total genes
removed 35 cells that did not express at least 800  genes
removed 0 cells that did not have at least 2500 counts
removed 30684 genes that were not expressed in at least 3 cells
finished with 1131  total cells and 29999 total genes


%%capture filtering2
adata = bc.st.filtering_mito_genes_max(adata, standard_percent_mito, standard_n_genes, standard_max_counts)

LOG MESSAGE: After filtering for maximum number of expressed genes and max percent mito: 1069 cells, 29999 genes
LOG MESSAGE: 	Time for filtering: 0.151s


filtering2.show()

started with  1131  total cells and  29999  total genes
removed 11 cells that expressed more than 6000 genes
removed 0 cells that had more than 3000000.0  counts
removed  51  cells that expressed  15.0 percent mitochondrial genes or more
finished with 1069  total cells and 29999 total genes


sc.pl.violin(adata, ['n_genes', 'n_counts', 'percent_mito'], jitter=0.2, multi_panel=True, save = '.after_filtering.png')


### check mitochondrial reads per sample 
sc.pl.violin(adata, ['percent_mito','n_genes', 'n_counts'], groupby=split_condition,jitter=0.1,rotation=90, save = '.after_filtering.split.png')


### cell counts per sample
temp=bc.tl.count_occurrence(adata,split_condition)
sns.barplot(y=temp.index,x=temp.Counts,color='gray',orient='h')

<AxesSubplot:xlabel='Counts'>


#display the top 25 genes in the dataset
fig, ax = plt.subplots(ncols=1, nrows=1, figsize = (8, 6))
bc.pl.top_genes_counts(adata=adata, top_n=25, ax = ax )
fig.savefig(os.path.join(results_folder, 'figures/top_genes.png'), format='png', bbbox_inches = 'tight') #save figure for QC report


adata.var


adata = bc.st.per_cell_normalize(adata, results_folder)

LOG MESSAGE: Per cell normalization completed successfully.
LOG MESSAGE: 	Time for per-cell normalization: 0.077s

adata normalized per cell
log1p values saved into adata.raw
writing out matrix.mtx ...

LOG MESSAGE: cp10k values exported to file.
LOG MESSAGE: 	Time for cp10k export: 4.864s

adata.X successfully written to matrix.mtx
genes successfully written out to genes.tsv
cellbarcodes successfully written out to barcodes.tsv
annotation successfully written out to metadata.tsv


# Further QC: dynamic range of ubi/marker genes
fig = plt.figure()
sns.set(font_scale=0.8)
plt.style.use('seaborn-white')
fig = plt.figure(figsize=(len(dynrange)*2.8,2))
fig.subplots_adjust(hspace=0.2, wspace=0.35)
for i in range(1,len(dynrange)+1):
    ax = fig.add_subplot(1, len(dynrange), i)
    myg=dynrange[i-1]
    try:
        g=sns.distplot(adata.raw[:,myg].X.toarray(), norm_hist=True)
        ax.set(xlabel='log.cp10k',ylabel=myg)
        g.set_xlim(-1, 7)
    except:
        print( myg + ' can not be plotted')

<Figure size 432x288 with 0 Axes>


adata = bc.st.highly_variable_genes(adata)

log1p taken of adata

LOG MESSAGE: After feature selection of highly variable genes: 1069 cells, 9120 genes
LOG MESSAGE: 	Time for feature selection: 1.778s


adata

View of AnnData object with n_obs × n_vars = 1069 × 9120
    obs: 'CELL', 'ID', 'GROUP', 'CONDITION', 'cell', 'batch', 'NovaSeq_batch', 'donor', 'disease', 'organ', 'experiment', 'sample_description', 'sample_type', 'protocol', 'Sex', 'fresh.frozen', 'Plate.ID', 'Population', 'Index.plate.used', 'Consecutive.sample.numbers', 'Date', 'Fastq_id', 'ORGANISM_BIOKIT', 'TISSUE_BIOKIT', 'percent_mito', 'HBsAg', 'n_counts', 'n_genes'
    var: 'ENSEMBL', 'SYMBOL', 'n_cells', 'total_counts', 'frac_reads', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'experiment_colors', 'log1p', 'hvg'


# RMK : AS OF FEB 2020 there is a bug in scanpy regress out if scanpy installed with PIP (see https://github.com/theislab/scanpy/issues/707)
# Before the fix is available, one should coopy the data toprevet it. hence the adata = adata.copy()
adata = adata.copy()
adata = bc.st.regress_out(adata, results_folder)

LOG MESSAGE: Regression steps completed. 'n_counts' and 'percent_mito' regressed out. adata was log-normalized and scaled.
LOG MESSAGE: 	Time for regression steps: 26.918s

'n_counts' and 'percent_mito' regressed out
adata scaled with max_value set to 10


if (batch_to_correct != 'None'):
    #save a copy of uncorrected in case we need it for something later
    adata_uncorrected = adata.copy()
    adata.obs['batch'] = adata.obs[batch_to_correct]
    adata = bc.st.pca_neighbors_umap(adata,results_folder, method='bbknn')
else:
    adata = bc.st.pca_neighbors_umap(adata, results_folder)

Using random_state = 0 for all the following calculations
PCA calculated using svd_solver = 'arpack'. PCA multiplied by -1 to match Seurat output.

Nearest neighbors calculated with n_neighbors = 10

LOG MESSAGE: Neighborhood analysis completed, and UMAP generated.
LOG MESSAGE: 	 Time for PCA, nearest neighbor calculation and UMAP generation: 7.599s
LOG MESSAGE: Metadata containing 3 PCAs and UMAP coordinates exported successfully to file.
LOG MESSAGE: Time for export: 0.037s

UMAP coordinates calculated.
results successfully written out to 'analysis_metadata.tsv'


# leiden clustering is the default 
adata = bc.st.clustering(adata, results_folder)

leiden clustering performed with a resolution of 1

LOG MESSAGE: leidenclustering done. Found 10 clusters.
LOG MESSAGE: 	Time for leiden clustering: 0.362s
LOG MESSAGE: Marker gene detection performed on a per-cluster basis using the method wilcoxon.
LOG MESSAGE: 	Time for marker gene detection: 2.261s

rank genes per cluster calculated using method wilcoxon.
mapping of cells to  leiden exported successfully to cell2labels.tsv
average.gct exported successfully to file
fract_pos.gct exported successfully to file
labelinfo.tsv successfully written out
/pstore/data/biomics/_pre_portfolio/ID/70171_Immunomics_HBV/HBV_SMARTSeq_NovaSeq_July2021/analyzed/standard_workflow_besca2/labelings/leiden/WilxRank.gct written out
/pstore/data/biomics/_pre_portfolio/ID/70171_Immunomics_HBV/HBV_SMARTSeq_NovaSeq_July2021/analyzed/standard_workflow_besca2/labelings/leiden/WilxRank.pvalues.gct written out

LOG MESSAGE: Cluster level analysis and marker genes exported to file.
LOG MESSAGE: 	Time for export of cluster level analysis: 2.765s

/pstore/data/biomics/_pre_portfolio/ID/70171_Immunomics_HBV/HBV_SMARTSeq_NovaSeq_July2021/analyzed/standard_workflow_besca2/labelings/leiden/WilxRank.logFC.gct written out


# everything that was done so far goes to the .h5ad file for later use
adata.write(results_file)
print(results_file)

/pstore/data/biomics/_pre_portfolio/ID/70171_Immunomics_HBV/HBV_SMARTSeq_NovaSeq_July2021/analyzed/standard_workflow_besca2/standard_workflow_besca2.h5ad


marker = [x for x in example_markers if x in adata.raw.var_names ] 
if len(marker) >0 :
    sc.pl.umap( adata, color = marker , color_map = 'viridis')


if (labeling_to_use != 'None'):
    adata = bc.st.additional_labeling(adata, labeling_to_use, labeling_name, labeling_description, labeling_author, results_folder)


if citeseq:
    logging.info('Performing analysis of Citeseq Data.')
    sc.settings.figdir = os.path.join(results_folder_citeseq, 'figures') #update results directory
    start_citeseq = time.time()
    
    #apply filtering from gex to citeseq data
    cells = adata.obs_names.to_list()
    adata_prot = adata_prot[[x in cells for x in adata_prot.obs_names.tolist()], :]
    
    #generate a QC plot 
    n_prots = len(adata_prot.var_names)
    percent_top = (int(round(0.01*n_prots, 0)) if int(round(0.01*n_prots, 0)) >= 1 else 1, int(round(0.1*n_prots, 0)), int(round(0.25*n_prots, 0)))
    qc_adata = sc.pp.calculate_qc_metrics(adata_prot, percent_top=percent_top, var_type="antibodies", inplace=False)
    fig = sns.jointplot("log1p_total_counts", "n_antibodies_by_counts", qc_adata[0], kind="hex", norm=mpl.colors.LogNorm())
    fig.savefig(os.path.join(results_folder_citeseq,'figures', 'CITESEQ_QC_plot.png'))
    
    #generate overview of n_counts
    adata_prot.obs['n_counts'] = adata_prot.X.sum(axis=1)  
    
    #save counts into a layer
    adata_prot.layers["counts"] = adata_prot.X.copy()
    
    #perform normalization (this normalization is specific to CITEseq data)
    bc.st.clr_normalize(adata_prot, results_folder_citeseq)
    
    #no selection of highly variable genes since we want to use all measured proteins!
    #question if we should regress out?

    #perform batch correction if desired otherwise just perform clustering
    if (batch_to_correct != 'None'):
        #save a copy of uncorrected in case we need it for something later
        adata_prot_uncorrected = adata_prot.copy()
        adata_prot.obs['batch'] = adata_prot.obs[batch_to_correct]
        
        if n_prots < 50:
            nrpcs = n_prots - 1
        else:
            nrpcs = 50
        
        #important: nrpcs_neigh = 0, uses all input components not the calculated PCs
        adata_prot = bc.st.pca_neighbors_umap(adata_prot, results_folder_citeseq, method='bbknn', nrpcs = nrpcs, nrpcs_neigh=0)
    else:
        
        if n_prots < 50:
            nrpcs = n_prots - 1
        else:
            nrpcs = 50
        
        #important: nrpcs_neigh = 0, uses all input components not the calculated PCs
        adata_prot = bc.st.pca_neighbors_umap(adata_prot, results_folder_citeseq, nrpcs = nrpcs, nrpcs_neigh=0)
        
    #perform clustering
    adata_prot = bc.st.clustering(adata_prot, results_folder_citeseq)
    
    #save to file
    adata_prot.write(results_file_citeseq)
    logging.info('Citeseq analysis written to file: '+ results_file_citeseq)
    
    logging.info('Time for Citeseq analysis: '+str(round(time.time()-start_citeseq, 3))+'s')
    
    #generate a merged file which contains both the citeseq data and the gene expression data
    # (with clustering on gene expression data only)
    logging.info('Generating a merged adata object which contains both gene expression and citeseq data. Clustering and UMAP visualization were calculated using only the gene expression values as input.')
    sc.settings.figdir = os.path.join(results_folder_merged, 'figures') #update results directory
    start_merge = time.time()
    adata_merged = bc.concate_adata(adata, adata_prot)
    
    
    #add calculated values
    adata_merged.obsm['X_umap'] = adata.obsm['X_umap']
    adata_merged.obsm['X_pca'] = adata.obsm['X_pca']
    adata_merged.uns['neighbors'] = adata.uns['neighbors']
    
    #add leiden clustering of protein back in.
    adata_merged.obs['protein_leiden'] = adata_prot.obs["leiden"]
    
    #add protein calculated embedding
    adata_merged.obsm["protein"] = adata_prot.to_df()
    adata_merged.obsm["protein_umap"] = adata_prot.obsm["X_umap"]

    logging.info('Protein based embeddings were also saved into the object and can be plotted using sc.pl.embeddings')
    sc.pl.umap(adata_merged, color=["leiden", "protein_leiden"], size=10, save = '.rna_umap.clustering_results_rna_protein.png')
    sc.pl.embedding(adata_merged, basis="protein_umap", color=["leiden", "protein_leiden"], size=10, save = '.protein_umap.clustering_results_rna_protein.png')
    
    #save to file
    adata_merged.write(results_file_merged)
    logging.info('Merged Citeseq anndata object written to file: '+ results_file_merged)
    logging.info('Time for generation of merged anndata object: '+str(round(time.time()-start_merge, 3))+'s')


logging.info('Entire workflow completed.')
logging.info('\tTime for entire workflow: '+str(round(time.time()-start0, 3))+'s')

LOG MESSAGE: Entire workflow completed.
LOG MESSAGE: 	Time for entire workflow: 64.284s


bc.st.write_qc(adata_unfiltered = adata_unfiltered, 
                 adata_filtered = adata,
                 version = version, 
                 analysis_name = analysis_name, 
                 standard_min_genes = standard_min_genes, 
                 standard_min_cells = standard_min_cells, 
                 standard_min_counts = standard_min_counts, 
                 standard_percent_mito = standard_percent_mito, 
                 standard_max_counts = standard_max_counts,
                 standard_n_genes = standard_n_genes,
                 filtering_output1 = filtering1,
                 filtering_output2 = filtering2,
                 results_folder = results_folder,
                 css_path = os.path.join(os.path.dirname(bc.__file__),'st', 'style.css'))
logging.info('QC Report generated and saved as .html')

LOG MESSAGE: QC Report generated and saved as .html


%%capture session_info
sinfo()


logging.info(session_info)

LOG MESSAGE: -----
anndata     0.7.5
besca       2.4+57.g5ad53b2
matplotlib  3.3.4
numpy       1.20.1
pandas      1.2.2
scanpy      1.6.1
scipy       1.6.0
seaborn     0.11.1
sinfo       0.3.1
-----
IPython             7.20.0
jupyter_client      6.1.11
jupyter_core        4.7.1
notebook            6.2.0
-----
Python 3.7.9 | packaged by conda-forge | (default, Dec  9 2020, 21:08:20) [GCC 9.3.0]
Linux-3.10.0-693.11.6.el7.x86_64-x86_64-with-centos-7.4.1708-Core
24 logical CPU cores, x86_64
-----
Session information updated at 2021-07-21 11:56


# Fix color platte (was leading to an error when loading the h5ad file)
sc.pl.umap(adata, color=['leiden'], frameon=False, legend_loc='on data')


# Fix color platte (was leading to an error when loading the h5ad file)
sc.pl.umap(adata, color=['leiden','experiment'], frameon=False)

LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.


sc.pl.umap(adata, color=['leiden', 'n_genes', 'percent_mito'], color_map='viridis')

LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.


# Fix color platte (was leading to an error when loading the h5ad file)
del adata.uns['leiden_colors']
#del adata.uns['CONDITION_colors']
del adata.uns['experiment_colors']


adata.write(results_file)


adata = sc.read(results_file)


adata

AnnData object with n_obs × n_vars = 1069 × 9120
    obs: 'CELL', 'ID', 'GROUP', 'CONDITION', 'cell', 'batch', 'NovaSeq_batch', 'donor', 'disease', 'organ', 'experiment', 'sample_description', 'sample_type', 'protocol', 'Sex', 'fresh.frozen', 'Plate.ID', 'Population', 'Index.plate.used', 'Consecutive.sample.numbers', 'Date', 'Fastq_id', 'ORGANISM_BIOKIT', 'TISSUE_BIOKIT', 'percent_mito', 'HBsAg', 'n_counts', 'n_genes', 'leiden'
    var: 'ENSEMBL', 'SYMBOL', 'n_cells', 'total_counts', 'frac_reads', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'hvg', 'leiden', 'neighbors', 'pca', 'rank_genes_groups', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'


sc.pl.umap(adata, color=example_markers, frameon=False, color_map='viridis')


sc.pl.umap(adata, color=['leiden', 'donor', 'CONDITION'], color_map='viridis')

LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.


%%javascript
IPython.notebook.kernel.execute('nb_name = "' + IPython.notebook.notebook_name + '"')


nb_name = os.path.join(os.getcwd(), nb_name)


! jupyter nbconvert --to html {nb_name}

[NbConvertApp] Converting notebook /pstore/data/biomics/_pre_portfolio/ID/70171_Immunomics_HBV/HBV_SMARTSeq_NovaSeq_July2021/standard_workflow_besca2.ipynb to html
[NbConvertApp] Writing 6431473 bytes to /pstore/data/biomics/_pre_portfolio/ID/70171_Immunomics_HBV/HBV_SMARTSeq_NovaSeq_July2021/standard_workflow_besca2.html

	ENSEMBL	SYMBOL	n_cells	total_counts	frac_reads
A1BG	ENSG00000121410	A1BG	11	1226.626099	1.152375e-06
A1BG-AS1	ENSG00000268895	A1BG-AS1	17	3207.958008	3.013770e-06
A1CF	ENSG00000148584	A1CF	8	239.138992	2.246632e-07
A2M	ENSG00000175899	A2M	316	33540.484375	3.151018e-05
A2M-AS1	ENSG00000245105	A2M-AS1	91	19303.855469	1.813533e-05
...	...	...	...	...	...
ZXDC	ENSG00000070476	ZXDC	87	17643.845703	1.657581e-05
ZYG11A	ENSG00000203995	ZYG11A	8	58.874001	5.531017e-08
ZYG11B	ENSG00000162378	ZYG11B	52	3417.104736	3.210257e-06
ZYX	ENSG00000159840	ZYX	346	190766.312500	1.792186e-04
ZZEF1	ENSG00000074755	ZZEF1	251	26362.820312	2.476700e-05

Table of Contents

Standard BESCA workflow¶

Import section¶

Setup Standard Wokflow¶

Parameters to be set - on the command line or here¶

Define Input¶

Standard parameters - these should be kept as stable as possible¶

Standard Pipeline¶

!!! Remove low quality samples !!!¶

Read patient HBsAg¶

Visualization of quality control plots and selected filtering parameters¶

Count occurrence¶

Transcript capture efficiency¶

Library size distribution¶

The effect of filtering parameters¶

Scanpy plots of genes, counts, and mitochondria gene counts¶

Mitochondrial genes, genes, and counts by samples grouped by the split condition¶

Filtering¶

Filtering with thresholds of gene and cell counts¶

Filtering with thresholds of proportion of mitochondrial genes and the upper limit of gene counts¶

Visualising QC metrics of the filtered dataset¶

Per-cell normalization and output of the normalized data¶

Feature selection (highly variable genes) for clustering¶

Regression steps, and output of regressed data¶

PCA-based neighborhood analysis and UMAP with optional batch correction¶

Clustering¶

Additional Labeling¶

CiteSeq Standard Wokflow (only executed if applicable)¶

Complete the log-file¶

Write QC Report¶

Session info¶

Additional plots¶

Convert to html¶