The current workflow works with the Scanpy version 1.6.
Changes:
from distutils.version import StrictVersion
import scanpy as sc
import besca as bc
import re
def print_software_versions():
scv = sc.__version__
if(StrictVersion(scv) >= "1.6"):
sc.logging.print_header()
else:
sc.logging.print_versions()
bcstr = StrictVersion(re.sub("\+.*$", "", bc.__version__))
print("besca=={}".format(bcstr))
return None
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from scipy import sparse, io
import seaborn as sns
import os
import time
import IPython
print_software_versions()
import logging
import seaborn as sns
from sinfo import sinfo
# Seed is fixed for reproducible analysis.
import random
random.seed(1)
# for standard processing, set verbosity to minimum
sc.settings.verbosity = 0 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.set_figure_params(dpi=80)
sns.set_context("paper", font_scale=0.9) #changes font sizes so better readable
plt.rcParams['svg.fonttype'] = 'none' #ensures that any saved svgs have editable fonts
version = '2.9'
start0 = time.time()
%matplotlib inline
scanpy==1.6.1 anndata==0.7.5 umap==0.4.6 numpy==1.20.1 scipy==1.6.0 pandas==1.2.2 scikit-learn==0.24.1 statsmodels==0.12.2 python-igraph==0.8.3 louvain==0.7.0 leidenalg==0.8.3 besca==2.4
# decisions to be made
use_example_dataset = False # If True, the pipeline will run using the example dataset shipped with BESCA
species = 'human'
batch_to_correct = 'None' # must be "None" or any one of the labels in "metadata.tsv", ID, SPECIES, TISSUE, DONOR, TREATMENT; typically "ID" or "DONOR"
analysis_name = 'standard_workflow_besca2'
split_condition='experiment' #'experiment' is generally a good default
citeseq = False #specify if the dataset contains citeseq data
dynrange=['B2m','Actb','Pgk1','Ctcf'] #genes for which to plot dynamic range
example_markers = ['PTPRC', 'CD3E', 'CD8A', 'PDCD1'] # Will be shown in the first umaps
if species=='human': dynrange=[x.upper() for x in dynrange]
#additional labeling
labeling_to_use = 'None' # must be "None" or any one of the labels in "metadata.tsv", ID, SPECIES, TISSUE, DONOR, TREATMENT; typically "ID" or "DONOR"
labeling_name = 'MyAnno' # define name under which the labeling should be exported
labeling_description = 'celltype annotation' #define description which should be saved to labeling_info file
labeling_author = 'MyAnnotAuthor' #define author which should be saved to labeling info file
# define filepath (this is the folder that contains "raw" and "analyzed")
root_path = os.getcwd()
# the standard parameter section
standard_min_genes = 800 # 500
standard_min_cells = 3 # 30
standard_min_counts = 2500 # 1000
standard_n_genes = 6000 # 3000 # this is the most tricky one to set
standard_percent_mito = 0.15 # 0.05
standard_max_counts = 3e6 # 20000 #might be redundant with n_genes
Thresholds defined above for filtering should be a good start to treat most samples. In some cases, based on the QC plots shown below, one can decide to change the threshold.
For example, with the pbmc3k dataset, we advise to lower those thresholds (see besca tutorial here: https://bedapub.github.io/besca/tutorials/notebook1_data_processing_pbmc3k.html).
For the test dataset here (Kotliarov2020), one might argue that the max_counts threshold could be lower (based on the distribution in code chunk 12)
(note nothing below this point should be modified!!)
#define standardized filepaths based on above input
results_folder = os.path.join(root_path, 'analyzed', analysis_name)
results_file = os.path.join(results_folder, analysis_name + '.h5ad') # specify a .h5ad file for storing the results
log_file = os.path.join(results_folder, analysis_name + '.standard.log') # specify a log file for keeping a short summary and overview
sc.settings.figdir = os.path.join(results_folder, 'figures')
#setup standard workflow (generates output directories and setsup logging file)
bc.st.setup(results_folder,
analysis_name,
labeling_name,
labeling_to_use,
log_file,
version,
root_path,
species,
batch_to_correct,
standard_min_genes,
standard_min_cells,
standard_min_counts,
standard_n_genes,
standard_percent_mito,
standard_max_counts)
# read input data
if use_example_dataset:
print('utilizing example data')
adata = bc.datasets.Kotliarov2020_raw()
adata.obs['donor'] = 'donor1'
elif citeseq:
#generate file strucutre for citeseq data
results_folder_citeseq = os.path.join(results_folder, 'citeseq')
results_folder_merged= os.path.join(results_folder, 'citeseq_merged')
results_file_citeseq = os.path.join(results_folder_citeseq, analysis_name + '.h5ad')
results_file_merged = os.path.join(results_folder_merged, analysis_name + '.h5ad')
bc.st.setup_citeseq(results_folder,
labeling_name,
labeling_to_use)
print('utilizing citeseq data')
#read citeseq data and normal data
logging.info('Reading Gene Expression Data...')
adata = bc.st.read_matrix(root_path, citeseq = 'gex_only')
logging.info('Reading Citeseq Data...')
adata_prot = bc.st.read_matrix(root_path, citeseq = 'citeseq_only')
else:
#read data
adata = bc.st.read_matrix(root_path, citeseq = None)
LOG MESSAGE: Standard Pipeline Version 2.9 used LOG MESSAGE: 2021-07-21 LOG MESSAGE: Analysis 'standard_workflow_besca2' on data located in'/pstore/data/biomics/_pre_portfolio/ID/70171_Immunomics_HBV/HBV_SMARTSeq_NovaSeq_July2021' LOG MESSAGE: species: human LOG MESSAGE: Batch effect to correct: None LOG MESSAGE: Parameters: LOG MESSAGE: standard_min_genes = 800 LOG MESSAGE: standard_min_cells = 3 LOG MESSAGE: standard_min_counts = 2500 LOG MESSAGE: standard_n_genes = 6000 LOG MESSAGE: standard_max_counts = 3000000.0 LOG MESSAGE: standard_percent_mito = 0.15 LOG MESSAGE: Time for creating all output directories and setting up logging: 0.019s
all output directories created successfully reading matrix.mtx adding cell barcodes adding genes
LOG MESSAGE: After input: 1711 cells, 60683 genes LOG MESSAGE: Time for reading data: 0.802s
making var_names unique adding ENSEMBL gene ids to adata.var adding annotation
# Exclude VHB27
adata = bc.subset_adata(adata, (adata.obs.donor != 'VHB27'), raw=False)
adata
AnnData object with n_obs × n_vars = 1166 × 60683 obs: 'CELL', 'ID', 'GROUP', 'CONDITION', 'cell', 'batch', 'NovaSeq_batch', 'donor', 'disease', 'organ', 'experiment', 'sample_description', 'sample_type', 'protocol', 'Sex', 'fresh.frozen', 'Plate.ID', 'Population', 'Index.plate.used', 'Consecutive.sample.numbers', 'Date', 'Fastq_id', 'ORGANISM_BIOKIT', 'TISSUE_BIOKIT' var: 'ENSEMBL', 'SYMBOL'
#calculate mitochondrial gene content
bc.pp.fraction_counts(adata=adata, species=species)
patient_data = pd.read_csv(os.path.join(root_path, '/pstore/data/biomics_clin/ID/70171_Immunomics_HBV/patient_data.txt'), sep='\t', index_col='donor', na_values='na')
adata.obs = adata.obs.join(patient_data['HBsAg'], on='donor', how='left')
This plot shows cell counts per sample
temp=bc.tl.count_occurrence(adata,split_condition)
sns.barplot(y=temp.index,x=temp.Counts,color='gray',orient='h')
<AxesSubplot:xlabel='Counts'>
This plot gives you an idea about the sequencing depth and if the sequencing has reached saturation or not.
fig, ax = plt.subplots(1)
fig.set_figwidth(8)
fig.set_figheight(5)
fig.tight_layout()
bc.pl.transcript_capture_efficiency(adata,ax=ax)
fig.savefig(os.path.join(results_folder, 'figures/transcriptcaptureefficiency.png'), format='png', bbbox_inches = 'tight') #save figure for QC report
This plot gives you an idea about the library size distribution in your dataset before processing.
#fig = bc.pl.librarysize_overview(adata, bins=100)
#fig.savefig(os.path.join(results_folder, 'figures/librarysize.png'), format='png',bbbox_inches = 'tight') #save figure for QC report
adata_unfiltered = adata.copy()
fig, ((ax1, ax2, ax3), (ax4, ax5, ax6)) = plt.subplots(ncols=3, nrows=2)
fig.set_figwidth(17)
fig.set_figheight(9)
fig.tight_layout(pad=4.5)
bc.pl.kp_genes(adata, min_genes=standard_min_genes, ax = ax1)
bc.pl.kp_counts(adata, min_counts=standard_min_counts, ax = ax2)
bc.pl.kp_cells(adata, min_cells=standard_min_cells, ax = ax3)
bc.pl.max_genes(adata, max_genes=standard_n_genes, ax = ax4)
bc.pl.max_mito(adata, max_mito=standard_percent_mito, annotation_type='SYMBOL', species=species, ax = ax5)
bc.pl.max_counts(adata, max_counts=standard_max_counts, ax=ax6)
fig.savefig(os.path.join(results_folder, 'figures/filtering_thresholds.png'), format='png', bbbox_inches = 'tight') #save figure for QC report
sc.pl.violin(adata, ['n_genes', 'n_counts', 'percent_mito'], jitter=0.2, multi_panel=True, save = '.before_filtering.png')
... storing 'GROUP' as categorical ... storing 'CONDITION' as categorical ... storing 'batch' as categorical ... storing 'donor' as categorical ... storing 'disease' as categorical ... storing 'organ' as categorical ... storing 'experiment' as categorical ... storing 'sample_description' as categorical ... storing 'sample_type' as categorical ... storing 'protocol' as categorical ... storing 'Sex' as categorical ... storing 'fresh.frozen' as categorical ... storing 'Plate.ID' as categorical ... storing 'Population' as categorical ... storing 'Index.plate.used' as categorical ... storing 'Consecutive.sample.numbers' as categorical ... storing 'ORGANISM_BIOKIT' as categorical ... storing 'TISSUE_BIOKIT' as categorical ... storing 'SYMBOL' as categorical
sc.pl.violin(adata, ['percent_mito','n_genes', 'n_counts'], groupby=split_condition,jitter=0.1,rotation=90, save = '.before_filtering.split.png')
%%capture filtering1
adata = bc.st.filtering_cells_genes_min(adata, standard_min_cells, standard_min_genes, standard_min_counts)
LOG MESSAGE: After filtering for minimum number of cells and minimum number of expressed genes: 1131 cells, 29999 genes LOG MESSAGE: Time for filtering: 0.138s
filtering1.show()
started with 1166 total cells and 60683 total genes removed 35 cells that did not express at least 800 genes removed 0 cells that did not have at least 2500 counts removed 30684 genes that were not expressed in at least 3 cells finished with 1131 total cells and 29999 total genes
%%capture filtering2
adata = bc.st.filtering_mito_genes_max(adata, standard_percent_mito, standard_n_genes, standard_max_counts)
LOG MESSAGE: After filtering for maximum number of expressed genes and max percent mito: 1069 cells, 29999 genes LOG MESSAGE: Time for filtering: 0.151s
filtering2.show()
started with 1131 total cells and 29999 total genes removed 11 cells that expressed more than 6000 genes removed 0 cells that had more than 3000000.0 counts removed 51 cells that expressed 15.0 percent mitochondrial genes or more finished with 1069 total cells and 29999 total genes
sc.pl.violin(adata, ['n_genes', 'n_counts', 'percent_mito'], jitter=0.2, multi_panel=True, save = '.after_filtering.png')
### check mitochondrial reads per sample
sc.pl.violin(adata, ['percent_mito','n_genes', 'n_counts'], groupby=split_condition,jitter=0.1,rotation=90, save = '.after_filtering.split.png')
### cell counts per sample
temp=bc.tl.count_occurrence(adata,split_condition)
sns.barplot(y=temp.index,x=temp.Counts,color='gray',orient='h')
<AxesSubplot:xlabel='Counts'>
#display the top 25 genes in the dataset
fig, ax = plt.subplots(ncols=1, nrows=1, figsize = (8, 6))
bc.pl.top_genes_counts(adata=adata, top_n=25, ax = ax )
fig.savefig(os.path.join(results_folder, 'figures/top_genes.png'), format='png', bbbox_inches = 'tight') #save figure for QC report
adata.var
ENSEMBL | SYMBOL | n_cells | total_counts | frac_reads | |
---|---|---|---|---|---|
A1BG | ENSG00000121410 | A1BG | 11 | 1226.626099 | 1.152375e-06 |
A1BG-AS1 | ENSG00000268895 | A1BG-AS1 | 17 | 3207.958008 | 3.013770e-06 |
A1CF | ENSG00000148584 | A1CF | 8 | 239.138992 | 2.246632e-07 |
A2M | ENSG00000175899 | A2M | 316 | 33540.484375 | 3.151018e-05 |
A2M-AS1 | ENSG00000245105 | A2M-AS1 | 91 | 19303.855469 | 1.813533e-05 |
... | ... | ... | ... | ... | ... |
ZXDC | ENSG00000070476 | ZXDC | 87 | 17643.845703 | 1.657581e-05 |
ZYG11A | ENSG00000203995 | ZYG11A | 8 | 58.874001 | 5.531017e-08 |
ZYG11B | ENSG00000162378 | ZYG11B | 52 | 3417.104736 | 3.210257e-06 |
ZYX | ENSG00000159840 | ZYX | 346 | 190766.312500 | 1.792186e-04 |
ZZEF1 | ENSG00000074755 | ZZEF1 | 251 | 26362.820312 | 2.476700e-05 |
29999 rows × 5 columns
adata = bc.st.per_cell_normalize(adata, results_folder)
LOG MESSAGE: Per cell normalization completed successfully. LOG MESSAGE: Time for per-cell normalization: 0.077s
adata normalized per cell log1p values saved into adata.raw writing out matrix.mtx ...
LOG MESSAGE: cp10k values exported to file. LOG MESSAGE: Time for cp10k export: 4.864s
adata.X successfully written to matrix.mtx genes successfully written out to genes.tsv cellbarcodes successfully written out to barcodes.tsv annotation successfully written out to metadata.tsv
We perform an additional QC, which checks the dynamic range of ubiquitously expressed marker genes.
# Further QC: dynamic range of ubi/marker genes
fig = plt.figure()
sns.set(font_scale=0.8)
plt.style.use('seaborn-white')
fig = plt.figure(figsize=(len(dynrange)*2.8,2))
fig.subplots_adjust(hspace=0.2, wspace=0.35)
for i in range(1,len(dynrange)+1):
ax = fig.add_subplot(1, len(dynrange), i)
myg=dynrange[i-1]
try:
g=sns.distplot(adata.raw[:,myg].X.toarray(), norm_hist=True)
ax.set(xlabel='log.cp10k',ylabel=myg)
g.set_xlim(-1, 7)
except:
print( myg + ' can not be plotted')
<Figure size 432x288 with 0 Axes>
adata = bc.st.highly_variable_genes(adata)
log1p taken of adata
LOG MESSAGE: After feature selection of highly variable genes: 1069 cells, 9120 genes LOG MESSAGE: Time for feature selection: 1.778s
adata
View of AnnData object with n_obs × n_vars = 1069 × 9120 obs: 'CELL', 'ID', 'GROUP', 'CONDITION', 'cell', 'batch', 'NovaSeq_batch', 'donor', 'disease', 'organ', 'experiment', 'sample_description', 'sample_type', 'protocol', 'Sex', 'fresh.frozen', 'Plate.ID', 'Population', 'Index.plate.used', 'Consecutive.sample.numbers', 'Date', 'Fastq_id', 'ORGANISM_BIOKIT', 'TISSUE_BIOKIT', 'percent_mito', 'HBsAg', 'n_counts', 'n_genes' var: 'ENSEMBL', 'SYMBOL', 'n_cells', 'total_counts', 'frac_reads', 'highly_variable', 'means', 'dispersions', 'dispersions_norm' uns: 'experiment_colors', 'log1p', 'hvg'
# RMK : AS OF FEB 2020 there is a bug in scanpy regress out if scanpy installed with PIP (see https://github.com/theislab/scanpy/issues/707)
# Before the fix is available, one should coopy the data toprevet it. hence the adata = adata.copy()
adata = adata.copy()
adata = bc.st.regress_out(adata, results_folder)
LOG MESSAGE: Regression steps completed. 'n_counts' and 'percent_mito' regressed out. adata was log-normalized and scaled. LOG MESSAGE: Time for regression steps: 26.918s
'n_counts' and 'percent_mito' regressed out adata scaled with max_value set to 10
We use the Batch Balanced K-Nearest Neighbourhood (bbknn, Teichlab/bbknn) method as the batch correction method.
if (batch_to_correct != 'None'):
#save a copy of uncorrected in case we need it for something later
adata_uncorrected = adata.copy()
adata.obs['batch'] = adata.obs[batch_to_correct]
adata = bc.st.pca_neighbors_umap(adata,results_folder, method='bbknn')
else:
adata = bc.st.pca_neighbors_umap(adata, results_folder)
Using random_state = 0 for all the following calculations PCA calculated using svd_solver = 'arpack'. PCA multiplied by -1 to match Seurat output.
Nearest neighbors calculated with n_neighbors = 10
LOG MESSAGE: Neighborhood analysis completed, and UMAP generated. LOG MESSAGE: Time for PCA, nearest neighbor calculation and UMAP generation: 7.599s LOG MESSAGE: Metadata containing 3 PCAs and UMAP coordinates exported successfully to file. LOG MESSAGE: Time for export: 0.037s
UMAP coordinates calculated. results successfully written out to 'analysis_metadata.tsv'
# leiden clustering is the default
adata = bc.st.clustering(adata, results_folder)
leiden clustering performed with a resolution of 1
LOG MESSAGE: leidenclustering done. Found 10 clusters. LOG MESSAGE: Time for leiden clustering: 0.362s LOG MESSAGE: Marker gene detection performed on a per-cluster basis using the method wilcoxon. LOG MESSAGE: Time for marker gene detection: 2.261s
rank genes per cluster calculated using method wilcoxon. mapping of cells to leiden exported successfully to cell2labels.tsv average.gct exported successfully to file fract_pos.gct exported successfully to file labelinfo.tsv successfully written out /pstore/data/biomics/_pre_portfolio/ID/70171_Immunomics_HBV/HBV_SMARTSeq_NovaSeq_July2021/analyzed/standard_workflow_besca2/labelings/leiden/WilxRank.gct written out /pstore/data/biomics/_pre_portfolio/ID/70171_Immunomics_HBV/HBV_SMARTSeq_NovaSeq_July2021/analyzed/standard_workflow_besca2/labelings/leiden/WilxRank.pvalues.gct written out
LOG MESSAGE: Cluster level analysis and marker genes exported to file. LOG MESSAGE: Time for export of cluster level analysis: 2.765s
/pstore/data/biomics/_pre_portfolio/ID/70171_Immunomics_HBV/HBV_SMARTSeq_NovaSeq_July2021/analyzed/standard_workflow_besca2/labelings/leiden/WilxRank.logFC.gct written out
# everything that was done so far goes to the .h5ad file for later use
adata.write(results_file)
print(results_file)
/pstore/data/biomics/_pre_portfolio/ID/70171_Immunomics_HBV/HBV_SMARTSeq_NovaSeq_July2021/analyzed/standard_workflow_besca2/standard_workflow_besca2.h5ad
marker = [x for x in example_markers if x in adata.raw.var_names ]
if len(marker) >0 :
sc.pl.umap( adata, color = marker , color_map = 'viridis')
If labeling_to_use
is specified, additional labels are taken from annotations in "metadata.tsv", and the data associated with additional labelling will be exported to files. And the fract_pos.gct and average.gct files are generated.
if (labeling_to_use != 'None'):
adata = bc.st.additional_labeling(adata, labeling_to_use, labeling_name, labeling_description, labeling_author, results_folder)
if citeseq:
logging.info('Performing analysis of Citeseq Data.')
sc.settings.figdir = os.path.join(results_folder_citeseq, 'figures') #update results directory
start_citeseq = time.time()
#apply filtering from gex to citeseq data
cells = adata.obs_names.to_list()
adata_prot = adata_prot[[x in cells for x in adata_prot.obs_names.tolist()], :]
#generate a QC plot
n_prots = len(adata_prot.var_names)
percent_top = (int(round(0.01*n_prots, 0)) if int(round(0.01*n_prots, 0)) >= 1 else 1, int(round(0.1*n_prots, 0)), int(round(0.25*n_prots, 0)))
qc_adata = sc.pp.calculate_qc_metrics(adata_prot, percent_top=percent_top, var_type="antibodies", inplace=False)
fig = sns.jointplot("log1p_total_counts", "n_antibodies_by_counts", qc_adata[0], kind="hex", norm=mpl.colors.LogNorm())
fig.savefig(os.path.join(results_folder_citeseq,'figures', 'CITESEQ_QC_plot.png'))
#generate overview of n_counts
adata_prot.obs['n_counts'] = adata_prot.X.sum(axis=1)
#save counts into a layer
adata_prot.layers["counts"] = adata_prot.X.copy()
#perform normalization (this normalization is specific to CITEseq data)
bc.st.clr_normalize(adata_prot, results_folder_citeseq)
#no selection of highly variable genes since we want to use all measured proteins!
#question if we should regress out?
#perform batch correction if desired otherwise just perform clustering
if (batch_to_correct != 'None'):
#save a copy of uncorrected in case we need it for something later
adata_prot_uncorrected = adata_prot.copy()
adata_prot.obs['batch'] = adata_prot.obs[batch_to_correct]
if n_prots < 50:
nrpcs = n_prots - 1
else:
nrpcs = 50
#important: nrpcs_neigh = 0, uses all input components not the calculated PCs
adata_prot = bc.st.pca_neighbors_umap(adata_prot, results_folder_citeseq, method='bbknn', nrpcs = nrpcs, nrpcs_neigh=0)
else:
if n_prots < 50:
nrpcs = n_prots - 1
else:
nrpcs = 50
#important: nrpcs_neigh = 0, uses all input components not the calculated PCs
adata_prot = bc.st.pca_neighbors_umap(adata_prot, results_folder_citeseq, nrpcs = nrpcs, nrpcs_neigh=0)
#perform clustering
adata_prot = bc.st.clustering(adata_prot, results_folder_citeseq)
#save to file
adata_prot.write(results_file_citeseq)
logging.info('Citeseq analysis written to file: '+ results_file_citeseq)
logging.info('Time for Citeseq analysis: '+str(round(time.time()-start_citeseq, 3))+'s')
#generate a merged file which contains both the citeseq data and the gene expression data
# (with clustering on gene expression data only)
logging.info('Generating a merged adata object which contains both gene expression and citeseq data. Clustering and UMAP visualization were calculated using only the gene expression values as input.')
sc.settings.figdir = os.path.join(results_folder_merged, 'figures') #update results directory
start_merge = time.time()
adata_merged = bc.concate_adata(adata, adata_prot)
#add calculated values
adata_merged.obsm['X_umap'] = adata.obsm['X_umap']
adata_merged.obsm['X_pca'] = adata.obsm['X_pca']
adata_merged.uns['neighbors'] = adata.uns['neighbors']
#add leiden clustering of protein back in.
adata_merged.obs['protein_leiden'] = adata_prot.obs["leiden"]
#add protein calculated embedding
adata_merged.obsm["protein"] = adata_prot.to_df()
adata_merged.obsm["protein_umap"] = adata_prot.obsm["X_umap"]
logging.info('Protein based embeddings were also saved into the object and can be plotted using sc.pl.embeddings')
sc.pl.umap(adata_merged, color=["leiden", "protein_leiden"], size=10, save = '.rna_umap.clustering_results_rna_protein.png')
sc.pl.embedding(adata_merged, basis="protein_umap", color=["leiden", "protein_leiden"], size=10, save = '.protein_umap.clustering_results_rna_protein.png')
#save to file
adata_merged.write(results_file_merged)
logging.info('Merged Citeseq anndata object written to file: '+ results_file_merged)
logging.info('Time for generation of merged anndata object: '+str(round(time.time()-start_merge, 3))+'s')
logging.info('Entire workflow completed.')
logging.info('\tTime for entire workflow: '+str(round(time.time()-start0, 3))+'s')
LOG MESSAGE: Entire workflow completed. LOG MESSAGE: Time for entire workflow: 64.284s
bc.st.write_qc(adata_unfiltered = adata_unfiltered,
adata_filtered = adata,
version = version,
analysis_name = analysis_name,
standard_min_genes = standard_min_genes,
standard_min_cells = standard_min_cells,
standard_min_counts = standard_min_counts,
standard_percent_mito = standard_percent_mito,
standard_max_counts = standard_max_counts,
standard_n_genes = standard_n_genes,
filtering_output1 = filtering1,
filtering_output2 = filtering2,
results_folder = results_folder,
css_path = os.path.join(os.path.dirname(bc.__file__),'st', 'style.css'))
logging.info('QC Report generated and saved as .html')
LOG MESSAGE: QC Report generated and saved as .html
%%capture session_info
sinfo()
logging.info(session_info)
LOG MESSAGE: ----- anndata 0.7.5 besca 2.4+57.g5ad53b2 matplotlib 3.3.4 numpy 1.20.1 pandas 1.2.2 scanpy 1.6.1 scipy 1.6.0 seaborn 0.11.1 sinfo 0.3.1 ----- IPython 7.20.0 jupyter_client 6.1.11 jupyter_core 4.7.1 notebook 6.2.0 ----- Python 3.7.9 | packaged by conda-forge | (default, Dec 9 2020, 21:08:20) [GCC 9.3.0] Linux-3.10.0-693.11.6.el7.x86_64-x86_64-with-centos-7.4.1708-Core 24 logical CPU cores, x86_64 ----- Session information updated at 2021-07-21 11:56
# Fix color platte (was leading to an error when loading the h5ad file)
sc.pl.umap(adata, color=['leiden'], frameon=False, legend_loc='on data')
# Fix color platte (was leading to an error when loading the h5ad file)
sc.pl.umap(adata, color=['leiden','experiment'], frameon=False)
LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points. LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points. LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points. LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points. LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points. LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points. LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points. LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points. LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points. LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
sc.pl.umap(adata, color=['leiden', 'n_genes', 'percent_mito'], color_map='viridis')
LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points. LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points. LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points. LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points. LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points. LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points. LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points. LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points. LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points. LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
# Fix color platte (was leading to an error when loading the h5ad file)
del adata.uns['leiden_colors']
#del adata.uns['CONDITION_colors']
del adata.uns['experiment_colors']
adata.write(results_file)
adata = sc.read(results_file)
adata
AnnData object with n_obs × n_vars = 1069 × 9120 obs: 'CELL', 'ID', 'GROUP', 'CONDITION', 'cell', 'batch', 'NovaSeq_batch', 'donor', 'disease', 'organ', 'experiment', 'sample_description', 'sample_type', 'protocol', 'Sex', 'fresh.frozen', 'Plate.ID', 'Population', 'Index.plate.used', 'Consecutive.sample.numbers', 'Date', 'Fastq_id', 'ORGANISM_BIOKIT', 'TISSUE_BIOKIT', 'percent_mito', 'HBsAg', 'n_counts', 'n_genes', 'leiden' var: 'ENSEMBL', 'SYMBOL', 'n_cells', 'total_counts', 'frac_reads', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std' uns: 'hvg', 'leiden', 'neighbors', 'pca', 'rank_genes_groups', 'umap' obsm: 'X_pca', 'X_umap' varm: 'PCs' obsp: 'connectivities', 'distances'
sc.pl.umap(adata, color=example_markers, frameon=False, color_map='viridis')
sc.pl.umap(adata, color=['leiden', 'donor', 'CONDITION'], color_map='viridis')
LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points. LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points. LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points. LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points. LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points. LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points. LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points. LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points. LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points. LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points. LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points. LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points. LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points. LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points. LOG MESSAGE: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
%%javascript
IPython.notebook.kernel.execute('nb_name = "' + IPython.notebook.notebook_name + '"')
nb_name = os.path.join(os.getcwd(), nb_name)
! jupyter nbconvert --to html {nb_name}
[NbConvertApp] Converting notebook /pstore/data/biomics/_pre_portfolio/ID/70171_Immunomics_HBV/HBV_SMARTSeq_NovaSeq_July2021/standard_workflow_besca2.ipynb to html [NbConvertApp] Writing 6431473 bytes to /pstore/data/biomics/_pre_portfolio/ID/70171_Immunomics_HBV/HBV_SMARTSeq_NovaSeq_July2021/standard_workflow_besca2.html