import pandas as pd
import anndata
import numpy as np
import scipy.sparse
# Seed NumPy's global random number generator as a side effect of importing
# this module, so any later np.random draws in the process are reproducible.
np.random.seed(1234)
# [docs] -- documentation-link residue from the rendered source page; not code.
def generate_bulk_AnnData(bulk_df, var_key='cCREs', obs_key='cell_types'):
    '''
    Generate a dense AnnData object from a DataFrame.

    The DataFrame is transposed, so its columns become the observations
    (rows of X) and its rows become the variables (columns of X). Values are
    stored as float32.

    Parameters:
    - bulk_df (DataFrame): Dataframe of prototype data:
        - columns: cell types / samples
        - rows: features (cCREs)
    - var_key (str, optional): Name given to both the column and the index of
      `.var`, which hold the row labels of bulk_df. Default is 'cCREs'.
    - obs_key (str, optional): Name given to both the column and the index of
      `.obs`, which hold the column labels of bulk_df. Default is 'cell_types'.

    Returns:
    - AnnData of prototypes.
    '''
    # Feature annotation: one column with the row labels, which also serves
    # as the index.
    features = pd.DataFrame(bulk_df.index)
    features.columns = [var_key]
    features.index = features[var_key]

    # Observation annotation: same layout, built from the column labels.
    cell_types = pd.DataFrame(bulk_df.columns)
    cell_types.columns = [obs_key]
    cell_types.index = cell_types[obs_key]

    # np.array (rather than a view of the frame's values) so the matrix
    # handed to AnnData is a separate float32 copy of the input data.
    counts = np.array(bulk_df.T, dtype=np.float32)
    return anndata.AnnData(counts, var=features, obs=cell_types)
# [docs] -- documentation-link residue from the rendered source page; not code.
def generate_bulk_sparse_AnnData(bulk_df, var_key='cCREs', obs_key='cell_types'):
    '''
    Generate AnnData object from DataFrame. The count matrix is sparse.

    The DataFrame is transposed, so its columns become the observations
    (rows of X) and its rows become the variables (columns of X). Values are
    stored as float32 in a SciPy CSR matrix.

    Parameters:
    - bulk_df (DataFrame): Dataframe of prototype data:
        - columns: cell types / samples
        - rows: features (cCREs)
    - var_key (str, optional): Name given to both the column and the index of
      `.var`, which hold the row labels of bulk_df. Default is 'cCREs'.
    - obs_key (str, optional): Name given to both the column and the index of
      `.obs`, which hold the column labels of bulk_df. Default is 'cell_types'.

    Returns:
    - AnnData of prototypes.
    '''
    # Feature annotation: one column with the row labels, which also serves
    # as the index.
    features = pd.DataFrame(bulk_df.index)
    features.columns = [var_key]
    features.index = features[var_key]

    # Observation annotation: same layout, built from the column labels.
    cell_types = pd.DataFrame(bulk_df.columns)
    cell_types.columns = [obs_key]
    cell_types.index = cell_types[obs_key]

    # A plain 2-D ndarray is all csr_matrix needs; np.matrix is no longer
    # recommended by NumPy and added nothing here.
    dense_counts = np.asarray(bulk_df.T.values, dtype=np.float32)
    bulk_csr = scipy.sparse.csr_matrix(dense_counts)
    return anndata.AnnData(bulk_csr, var=features, obs=cell_types)
# [docs] -- documentation-link residue from the rendered source page; not code.
def preprocess_bulk_adata(bulk_adata, remove_chrY=True, var_key='cCREs', copy=False):
    '''
    Preprocess a prototype count matrix in AnnData format.

    Two filters are applied to the features (columns of X):
    1. Features whose total count over all observations is not positive are
       dropped.
    2. If remove_chrY is true, features whose name starts with "chrY" are
       dropped. In this case the remaining features are also reordered into
       sorted (lexicographic) name order; without chrY removal the original
       feature order is kept.

    The object passed in by the caller is never modified: the function only
    subsets it and returns the result.
    If copy is False, the result is the subset produced by indexing the input
    (in anndata this is a view onto the input, so no data is duplicated).
    If copy is True, the result is a standalone copy of that subset.

    Parameters:
    - bulk_adata (AnnData): An AnnData object containing the prototype count matrix.
    - remove_chrY (bool, optional): Whether to remove features associated with chromosome Y. Default is True.
    - var_key (str, optional): Currently unused; feature names are read from
      `bulk_adata.var_names`. Kept so existing callers keep working. Default is 'cCREs'.
    - copy (bool, optional): If True, return an independent copy of the filtered data;
      if False, return the filtered subset without copying. Default is False.

    Returns:
    - AnnData: The preprocessed AnnData object.
    '''
    # X.sum(0) is 1-D for a dense array but a 1 x n np.matrix for a SciPy
    # sparse matrix; flatten it so the boolean mask is always 1-D.
    feature_coverage = np.asarray(bulk_adata.X.sum(0)).ravel()
    processed = bulk_adata[:, feature_coverage > 0]

    if remove_chrY:
        kept_names = [
            name for name in processed.var_names if not name.startswith("chrY")
        ]
        # Selecting by the sorted names also reorders the features; this
        # ordering is existing behavior that callers may rely on.
        processed = processed[:, sorted(kept_names)]

    if copy:
        # Copy last so the returned object is standalone rather than a
        # subset that still refers to an intermediate copy.
        processed = processed.copy()
    return processed