In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from datetime import datetime
from matplotlib.colors import Normalize, ListedColormap
from scipy.stats import linregress, pearsonr
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests
import json
from kneed import KneeLocator
In [2]:
pd.options.display.float_format = '{:.3f}'.format
pd.set_option('display.max_columns', None)
plt.rcParams['figure.dpi']=170
In [3]:
from list_vars import LIST_PROFILERS, DIR_FIGURES, RESULTS_DIR, POOLS, CONTROLS
Biological sample analysis¶
In this notebook we are going to do an analysis on the biological samples (POOL samples + controls).
There are two main variables that we are going to consider:
- The importance of including the biological control samples to ensure that false positives are not considered.
- The importance of normalizing the reads considering the biooogical samples.
These two concepts are intertwined, so what we are going to do is the following:
- Load all pool + control tables.
- Get the species that are selected using the flags. With that we are going to generate a cut-off table with the species. The way to merge the table is "outer", that is, we are going to include any species that appears in any sample. We can later discard them.
- We are going to discard the species that have less than X times more expression in control samples than in pools. X is determined dynamically using the kneed method.
- Then we are going to compare which species have been discarded using the | normalization (normalizing controls and pools separately) and the + normalization (normalizng controls and pools jointly).
In [4]:
# Loading the tables
def process_samples(pass_num, mode, S, NORM, samples, verbose, min_sample_flag='dynamic'):
# Initialize empty DataFrames for counts and flags
joined_counts = pd.DataFrame()
taxid_list = []
for sample in samples:
# Define file paths for counts and flags
counts_file = f'{RESULTS_DIR}/summary/{sample}_pass{pass_num}_mode{mode}_taxgenus_S{S}_{NORM}.diversity.tsv'
flags_file = f'{RESULTS_DIR}/summary/{sample}_pass{pass_num}_mode{mode}_taxgenus_S{S}_{NORM}.flags.tsv'
# Load the data
df_counts = pd.read_csv(counts_file, sep='\t').set_index('taxonomy_id')[['name', 'lineage', 'mean_norm']]
df_flags = pd.read_csv(flags_file, sep='\t').set_index('taxonomy_id')[['name', 'lineage', 'mean_norm']]
# Ensure name and lineage columns are retained correctly
if joined_counts.empty:
joined_counts = df_counts.rename(columns={'mean_norm': sample})
else:
df_counts = df_counts.rename(columns={'mean_norm': sample})
joined_counts = joined_counts.join(df_counts, how='outer', rsuffix=f'_{sample}')
# Add tax_ids where mean_norm is False in flags dataframe
taxid_list += df_flags[df_flags['mean_norm'] == False].index.tolist()
# Consolidate name and lineage columns to avoid suffix issues
if not joined_counts.empty:
joined_counts['name'] = joined_counts.filter(like='name').bfill(axis=1).iloc[:, 0]
joined_counts['lineage'] = joined_counts.filter(like='lineage').bfill(axis=1).iloc[:, 0]
joined_counts = joined_counts.drop(columns=joined_counts.filter(like='name_').columns)
joined_counts = joined_counts.drop(columns=joined_counts.filter(like='lineage_').columns)
# Rename pool columns based on their ranges
rename_mapping = {
'POOL1': 'RR1', 'POOL2': 'RR2', 'POOL3': 'RR3', 'POOL4': 'RR4',
'POOL5': 'SP1', 'POOL6': 'SP2', 'POOL7': 'SP3', 'POOL8': 'SP4',
'POOL9': 'HC1', 'POOL10': 'HC2', 'POOL11': 'HC3', 'POOL12': 'HC4'
}
joined_counts = joined_counts.rename(columns=rename_mapping)
# Create a "cut" DataFrame containing only the tax_ids in the taxid_list
taxidvalues, samplecounts = np.unique(taxid_list, return_counts=True)
joined_counts['n_samples_flag'] = 0
joined_counts.loc[taxidvalues, 'n_samples_flag'] = samplecounts
n_samples, counts_ntaxids = np.unique(joined_counts['n_samples_flag'].values, return_counts=True)
if min_sample_flag == 'dynamic':
kneedle = KneeLocator(n_samples, np.cumsum(counts_ntaxids), curve='concave', direction='increasing', S=0)
min_sample_flag = kneedle.knee
joined_counts['selected_flag'] = joined_counts['n_samples_flag'] >= min_sample_flag
cut_df = joined_counts[joined_counts['selected_flag'] == True]
if verbose:
print('TaxIDs species count:', n_samples, counts_ntaxids)
print(f'Flag threshold: {min_sample_flag} | Number of species: {len(joined_counts)} | Species selected: {len(cut_df)} ({100 * len(cut_df) / len(joined_counts):.2f}%)')
# Reset index and sort by mean counts (descending order)
joined_counts = joined_counts.reset_index().sort_values(by=list(rename_mapping.values()), ascending=False)
cut_df = cut_df.reset_index().sort_values(by=list(rename_mapping.values()), ascending=False)
return joined_counts, taxid_list, cut_df
# Example usage
samples = [
'POOL1', 'POOL2', 'POOL3', 'POOL4', 'POOL5', 'POOL6',
'POOL7', 'POOL8', 'POOL9', 'POOL10', 'POOL11', 'POOL12',
'ACIDOLA', 'BLACTIS'
]
In [5]:
def filter_by_nan_percentage(df, per_cutoff=0.35):
# Identify sample columns (excluding taxonomy_id, name, lineage, and controls)
control_cols = ['ACIDOLA', 'BLACTIS']
sample_cols = [col for col in df.columns if col not in ['taxonomy_id', 'name', 'lineage'] + control_cols]
# Calculate the percentage of NaNs in sample columns
nan_percentage = df[sample_cols].isna().mean(axis=1)
# Retain species with less than 35% NaNs
filtered_df = df[nan_percentage < per_cutoff]
return filtered_df
In [6]:
def calculate_retained_discarded(df, threshold, verbose):
df = df.copy()
# Separate ACIDOLA and BLACTIS columns
control_cols = ['ACIDOLA', 'BLACTIS']
sample_cols = [col for col in df.columns if col not in ['taxonomy_id', 'name', 'lineage'] + control_cols]
# Calculate mean across samples
df['median_across_samples'] = df[sample_cols].median(axis=1, skipna=True)
# Calculate max of ACIDOLA and BLACTIS
df['max_control'] = df[control_cols].max(axis=1, skipna=True).fillna(0)
if threshold == 'dynamic':
list_len_discarded = []
for threshold in range(1, 500): # In theory the dataframe len is not related but it is just a number to add, which should be bigger the bigger the dataframe
discarded = df[~((df['median_across_samples'] > (df['max_control'] * threshold)))]
list_len_discarded.append(len(discarded) / len(df))
kneedle = KneeLocator(np.arange(1,500), list_len_discarded, curve='concave', direction='increasing', S=0)
threshold = kneedle.knee
# Define retention logic
retained = df[(df['median_across_samples'] > (df['max_control'] * threshold)) | (df['max_control'].isna())]
discarded = df[~((df['median_across_samples'] > (df['max_control'] * threshold)) | (df['max_control'].isna()))]
if verbose:
print(f'Threshold: {threshold} | Number of species: {len(df)} | Species discarded: {len(discarded)} ({100 * len(discarded) / len(df):.2f}%)')
return retained, discarded
def filter_species_ids(joined_counts_norm_plus, joined_counts_norm_pipe, threshold='dynamic', verbose=True):
retained_norm_plus, discarded_norm_plus = calculate_retained_discarded(joined_counts_norm_plus, threshold, verbose)
retained_norm_pipe, discarded_norm_pipe = calculate_retained_discarded(joined_counts_norm_pipe, threshold, verbose)
# Extract taxonomy IDs
retained_ids_norm_plus = retained_norm_plus['taxonomy_id'].tolist()
discarded_ids_norm_plus = discarded_norm_plus['taxonomy_id'].tolist()
retained_ids_norm_pipe = retained_norm_pipe['taxonomy_id'].tolist()
discarded_ids_norm_pipe = discarded_norm_pipe['taxonomy_id'].tolist()
discarded_common = np.intersect1d(discarded_ids_norm_plus, discarded_ids_norm_pipe).tolist()
discarded_exclusive_norm_plus = [i for i in discarded_ids_norm_plus if i not in discarded_ids_norm_pipe]
discarded_exclusive_norm_pipe = [i for i in discarded_ids_norm_pipe if i not in discarded_ids_norm_plus]
return {
'discarded_common': discarded_common,
'discarded_exclusive_norm_plus': discarded_exclusive_norm_plus,
'discarded_exclusive_norm_pipe': discarded_exclusive_norm_pipe,
'retained_norm_plus': retained_ids_norm_plus,
'retained_norm_pipe': retained_ids_norm_pipe
}
In [7]:
def differential_abundance_analysis(df, condition_cols, reference_cols):
"""
Perform differential abundance analysis between condition and reference groups.
Parameters:
df (pd.DataFrame): Input dataframe containing species counts and metadata.
condition_cols (list): Column names for the condition group.
reference_cols (list): Column names for the reference group.
output_file (str, optional): Path to save results to an Excel file. Default is None.
sheet_name (str): Sheet name for Excel output. Default is 'Results'.
Returns:
pd.DataFrame: Dataframe containing p-values, log2 fold changes, and sorted results.
"""
list_pvals_mannwhitney = []
L2FC = []
for row in range(len(df)):
# Extract condition and reference values
condition_vals = df.iloc[row][condition_cols].astype(float).dropna().values
reference_vals = df.iloc[row][reference_cols].astype(float).dropna().values
condition_vals, reference_vals = condition_vals, reference_vals
# Mann-Whitney U test
res_mw = mannwhitneyu(condition_vals, reference_vals, alternative='two-sided')
list_pvals_mannwhitney.append(res_mw.pvalue)
# Log2 fold change
L2FC.append(np.log2(np.nanmedian(condition_vals) / np.nanmedian(reference_vals)))
# Compile results
df_pval = df.copy()
df_pval['log2FC'] = L2FC
df_pval['pval_MW'] = list_pvals_mannwhitney
# Add the corrected p-values to the dataset
_, pvals_corrected, _, _ = multipletests(df_pval['pval_MW'].values, alpha=0.05, method='fdr_bh')
df_pval['pval_MW_corrected'] = pvals_corrected
# Sort by p-values
df_pval = df_pval.sort_values(by=['pval_MW'])
return df_pval
In [8]:
os.makedirs(f'{RESULTS_DIR}/merged_counts', exist_ok=True)
os.makedirs(f'{RESULTS_DIR}/differential_abundance', exist_ok=True)
In [9]:
for mode in [3, 5, 7]:
for S in [0, 1, 2, 3, 4, 5, 6, 7, 10, 15]:
print(f'MODE: {mode} | S {S}')
df_all_normplus, taxid_list, df_cut_normplus = process_samples(pass_num=2, mode=mode, S=S, NORM='NORM+', samples=samples, verbose=True, min_sample_flag=2)
df_cut_nan_percentage_normplus = filter_by_nan_percentage(df_cut_normplus, per_cutoff=0.35)
df_all_normpipe, taxid_list, df_cut_normpipe = process_samples(pass_num=2, mode=mode, S=S, NORM='NORMx', samples=samples, verbose=True, min_sample_flag=2)
df_cut_nan_percentage_normpipe = filter_by_nan_percentage(df_cut_normpipe, per_cutoff=0.35)
dict_filternorm_cut = filter_species_ids(df_cut_nan_percentage_normplus, df_cut_nan_percentage_normpipe)
with open(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_dict_norm+|_species.tsv', "w") as file:
json.dump(dict_filternorm_cut, file)
print([(i, len(dict_filternorm_cut[i])) for i in dict_filternorm_cut.keys()])
df_cut_nan_percentage_normplus_discarded_common = df_cut_nan_percentage_normplus[df_cut_nan_percentage_normplus['taxonomy_id'].isin(dict_filternorm_cut['discarded_common'])]
df_cut_nan_percentage_normplus_discarded_common.to_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORM+_discarded_common.tsv', sep='\t', index=None)
df_cut_nan_percentage_normplus_discarded_normplus = df_cut_nan_percentage_normplus[df_cut_nan_percentage_normplus['taxonomy_id'].isin(dict_filternorm_cut['discarded_exclusive_norm_plus'])]
df_cut_nan_percentage_normplus_discarded_normplus.to_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORM+_discarded_norm+.tsv', sep='\t', index=None)
df_cut_nan_percentage_normplus_retained = df_cut_nan_percentage_normplus[df_cut_nan_percentage_normplus['taxonomy_id'].isin(dict_filternorm_cut['retained_norm_plus'])]
df_cut_nan_percentage_normplus_retained.to_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORM+_retained.tsv', sep='\t', index=None)
df_cut_nan_percentage_normpipe_discarded_common = df_cut_nan_percentage_normpipe[df_cut_nan_percentage_normpipe['taxonomy_id'].isin(dict_filternorm_cut['discarded_common'])]
df_cut_nan_percentage_normpipe_discarded_common.to_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORMx_discarded_common.tsv', sep='\t', index=None)
df_cut_nan_percentage_normpipe_discarded_normplus = df_cut_nan_percentage_normpipe[df_cut_nan_percentage_normpipe['taxonomy_id'].isin(dict_filternorm_cut['discarded_exclusive_norm_plus'])]
df_cut_nan_percentage_normpipe_discarded_normplus.to_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORMx_discarded_norm+.tsv', sep='\t', index=None)
df_cut_nan_percentage_normpipe_retained = df_cut_nan_percentage_normpipe[df_cut_nan_percentage_normpipe['taxonomy_id'].isin(dict_filternorm_cut['retained_norm_pipe'])]
df_cut_nan_percentage_normpipe_retained.to_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORMx_retained.tsv', sep='\t', index=None)
df_pval_HCvsRR = differential_abundance_analysis(df_cut_nan_percentage_normplus_retained, ['HC1', 'HC2', 'HC3', 'HC4'], ['RR1', 'RR2', 'RR3', 'RR4'])
df_pval_HCvsRR.to_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_HCvsRR.tsv', sep='\t', index=None)
df_pval_HCvsSP = differential_abundance_analysis(df_cut_nan_percentage_normplus_retained, ['HC1', 'HC2', 'HC3', 'HC4'], ['SP1', 'SP2', 'SP3', 'SP4'])
df_pval_HCvsSP.to_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_HCvsSP.tsv', sep='\t', index=None)
df_pval_RRvsSP = differential_abundance_analysis(df_cut_nan_percentage_normplus_retained, ['RR1', 'RR2', 'RR3', 'RR4'], ['SP1', 'SP2', 'SP3', 'SP4'])
df_pval_RRvsSP.to_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_RRvsSP.tsv', sep='\t', index=None)
df_pval_sex = differential_abundance_analysis(df_cut_nan_percentage_normplus_retained, ['HC1', 'HC2', 'RR1', 'RR2', 'SP1', 'SP2'], ['HC3', 'HC4', 'RR3', 'RR4', 'SP3', 'SP4'])
df_pval_sex.to_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_sex.tsv', sep='\t', index=None)
print('\n\n')
MODE: 3 | S 0
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 10] [2535 39 12 11 5 5 3 3 5 2]
Flag threshold: 2 | Number of species: 2620 | Species selected: 46 (1.76%)
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 10] [2535 39 12 11 5 5 3 3 5 2]
Flag threshold: 2 | Number of species: 2620 | Species selected: 46 (1.76%)
Threshold: 2 | Number of species: 46 | Species discarded: 16 (34.78%)
Threshold: 1 | Number of species: 46 | Species discarded: 2 (4.35%)
[('discarded_common', 2), ('discarded_exclusive_norm_plus', 14), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 30), ('retained_norm_pipe', 44)]
MODE: 3 | S 1
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11] [2485 54 18 15 12 5 7 2 2 4 14 2]
Flag threshold: 2 | Number of species: 2620 | Species selected: 81 (3.09%)
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11] [2485 54 18 15 12 5 7 2 2 4 14 2]
Flag threshold: 2 | Number of species: 2620 | Species selected: 81 (3.09%)
Threshold: 8 | Number of species: 81 | Species discarded: 45 (55.56%)
Threshold: 1 | Number of species: 81 | Species discarded: 5 (6.17%)
[('discarded_common', 5), ('discarded_exclusive_norm_plus', 40), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 36), ('retained_norm_pipe', 76)]
MODE: 3 | S 2
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2429 75 32 16 15 9 7 8 4 5 14 5 1]
Flag threshold: 2 | Number of species: 2620 | Species selected: 116 (4.43%)
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2429 75 32 16 15 9 7 8 4 5 14 5 1]
Flag threshold: 2 | Number of species: 2620 | Species selected: 116 (4.43%)
Threshold: 8 | Number of species: 116 | Species discarded: 58 (50.00%)
Threshold: 1 | Number of species: 116 | Species discarded: 6 (5.17%)
[('discarded_common', 6), ('discarded_exclusive_norm_plus', 52), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 58), ('retained_norm_pipe', 110)]
MODE: 3 | S 3
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2406 93 30 16 14 12 8 4 8 6 13 8 2]
Flag threshold: 2 | Number of species: 2620 | Species selected: 121 (4.62%)
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2406 93 30 16 14 12 8 4 8 6 13 8 2]
Flag threshold: 2 | Number of species: 2620 | Species selected: 121 (4.62%)
Threshold: 8 | Number of species: 121 | Species discarded: 59 (48.76%)
Threshold: 1 | Number of species: 121 | Species discarded: 6 (4.96%)
[('discarded_common', 6), ('discarded_exclusive_norm_plus', 53), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 62), ('retained_norm_pipe', 115)]
MODE: 3 | S 4
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2404 94 29 15 13 15 6 6 6 4 18 8 2]
Flag threshold: 2 | Number of species: 2620 | Species selected: 122 (4.66%)
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2404 94 29 15 13 15 6 6 6 4 18 8 2]
Flag threshold: 2 | Number of species: 2620 | Species selected: 122 (4.66%)
Threshold: 8 | Number of species: 122 | Species discarded: 60 (49.18%)
Threshold: 1 | Number of species: 122 | Species discarded: 6 (4.92%)
[('discarded_common', 6), ('discarded_exclusive_norm_plus', 54), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 62), ('retained_norm_pipe', 116)]
MODE: 3 | S 5
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2392 92 34 15 16 15 11 6 5 4 17 9 4]
Flag threshold: 2 | Number of species: 2620 | Species selected: 136 (5.19%)
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2392 92 34 15 16 15 11 6 5 4 17 9 4]
Flag threshold: 2 | Number of species: 2620 | Species selected: 136 (5.19%)
Threshold: 9 | Number of species: 136 | Species discarded: 62 (45.59%)
Threshold: 1 | Number of species: 136 | Species discarded: 6 (4.41%)
[('discarded_common', 6), ('discarded_exclusive_norm_plus', 56), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 74), ('retained_norm_pipe', 130)]
MODE: 3 | S 6
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2366 105 35 23 12 17 7 8 8 7 18 10 4]
Flag threshold: 2 | Number of species: 2620 | Species selected: 149 (5.69%)
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2366 105 35 23 12 17 7 8 8 7 18 10 4]
Flag threshold: 2 | Number of species: 2620 | Species selected: 149 (5.69%)
Threshold: 9 | Number of species: 149 | Species discarded: 65 (43.62%)
Threshold: 1 | Number of species: 149 | Species discarded: 6 (4.03%)
[('discarded_common', 6), ('discarded_exclusive_norm_plus', 59), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 84), ('retained_norm_pipe', 143)]
MODE: 3 | S 7
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2361 108 34 22 14 18 8 7 8 7 19 10 4]
Flag threshold: 2 | Number of species: 2620 | Species selected: 151 (5.76%)
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2361 108 34 22 14 18 8 7 8 7 19 10 4]
Flag threshold: 2 | Number of species: 2620 | Species selected: 151 (5.76%)
Threshold: 9 | Number of species: 151 | Species discarded: 67 (44.37%)
Threshold: 1 | Number of species: 151 | Species discarded: 6 (3.97%)
[('discarded_common', 6), ('discarded_exclusive_norm_plus', 61), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 84), ('retained_norm_pipe', 145)]
MODE: 3 | S 10
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2340 118 38 22 15 16 13 6 7 7 21 11 6]
Flag threshold: 2 | Number of species: 2620 | Species selected: 162 (6.18%)
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2340 118 38 22 15 16 13 6 7 7 21 11 6]
Flag threshold: 2 | Number of species: 2620 | Species selected: 162 (6.18%)
Threshold: 9 | Number of species: 162 | Species discarded: 68 (41.98%)
Threshold: 1 | Number of species: 162 | Species discarded: 6 (3.70%)
[('discarded_common', 6), ('discarded_exclusive_norm_plus', 62), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 94), ('retained_norm_pipe', 156)]
MODE: 3 | S 15
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2322 128 44 16 17 17 15 7 6 8 20 13 7]
Flag threshold: 2 | Number of species: 2620 | Species selected: 170 (6.49%)
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2322 128 44 16 17 17 15 7 6 8 20 13 7]
Flag threshold: 2 | Number of species: 2620 | Species selected: 170 (6.49%)
Threshold: 9 | Number of species: 170 | Species discarded: 72 (42.35%)
Threshold: 1 | Number of species: 170 | Species discarded: 7 (4.12%)
[('discarded_common', 7), ('discarded_exclusive_norm_plus', 65), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 98), ('retained_norm_pipe', 163)]
MODE: 5 | S 0
TaxIDs species count: [ 0 1 2 3 4 5 6 8 9 10] [2571 34 14 7 6 5 2 3 3 3]
Flag threshold: 2 | Number of species: 2648 | Species selected: 43 (1.62%)
TaxIDs species count: [ 0 1 2 3 4 5 6 8 9 10] [2571 34 14 7 6 5 2 3 3 3]
Flag threshold: 2 | Number of species: 2648 | Species selected: 43 (1.62%)
Threshold: 3 | Number of species: 43 | Species discarded: 17 (39.53%)
Threshold: 1 | Number of species: 43 | Species discarded: 3 (6.98%)
[('discarded_common', 3), ('discarded_exclusive_norm_plus', 14), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 26), ('retained_norm_pipe', 40)]
MODE: 5 | S 1
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11] [2489 69 18 18 13 5 7 6 3 4 13 3]
Flag threshold: 2 | Number of species: 2648 | Species selected: 90 (3.40%)
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11] [2489 69 18 18 13 5 7 6 3 4 13 3]
Flag threshold: 2 | Number of species: 2648 | Species selected: 90 (3.40%)
Threshold: 9 | Number of species: 90 | Species discarded: 55 (61.11%)
Threshold: 11 | Number of species: 90 | Species discarded: 22 (24.44%)
[('discarded_common', 22), ('discarded_exclusive_norm_plus', 33), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 35), ('retained_norm_pipe', 68)]
MODE: 5 | S 2
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2473 80 19 14 11 11 7 5 4 5 12 5 2]
Flag threshold: 2 | Number of species: 2648 | Species selected: 95 (3.59%)
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2473 80 19 14 11 11 7 5 4 5 12 5 2]
Flag threshold: 2 | Number of species: 2648 | Species selected: 95 (3.59%)
Threshold: 9 | Number of species: 95 | Species discarded: 55 (57.89%)
Threshold: 11 | Number of species: 95 | Species discarded: 22 (23.16%)
[('discarded_common', 22), ('discarded_exclusive_norm_plus', 33), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 40), ('retained_norm_pipe', 73)]
MODE: 5 | S 3
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2443 84 28 17 12 10 10 7 8 4 12 11 2]
Flag threshold: 2 | Number of species: 2648 | Species selected: 121 (4.57%)
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2443 84 28 17 12 10 10 7 8 4 12 11 2]
Flag threshold: 2 | Number of species: 2648 | Species selected: 121 (4.57%)
Threshold: 9 | Number of species: 121 | Species discarded: 62 (51.24%)
Threshold: 11 | Number of species: 121 | Species discarded: 25 (20.66%)
[('discarded_common', 25), ('discarded_exclusive_norm_plus', 37), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 59), ('retained_norm_pipe', 96)]
MODE: 5 | S 4
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2427 92 31 20 14 7 8 8 8 5 14 12 2]
Flag threshold: 2 | Number of species: 2648 | Species selected: 129 (4.87%)
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2427 92 31 20 14 7 8 8 8 5 14 12 2]
Flag threshold: 2 | Number of species: 2648 | Species selected: 129 (4.87%)
Threshold: 9 | Number of species: 129 | Species discarded: 63 (48.84%)
Threshold: 11 | Number of species: 129 | Species discarded: 25 (19.38%)
[('discarded_common', 25), ('discarded_exclusive_norm_plus', 38), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 66), ('retained_norm_pipe', 104)]
MODE: 5 | S 5
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2408 98 30 18 21 13 6 6 9 11 14 12 2]
Flag threshold: 2 | Number of species: 2648 | Species selected: 142 (5.36%)
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2408 98 30 18 21 13 6 6 9 11 14 12 2]
Flag threshold: 2 | Number of species: 2648 | Species selected: 142 (5.36%)
Threshold: 9 | Number of species: 142 | Species discarded: 68 (47.89%)
Threshold: 11 | Number of species: 142 | Species discarded: 25 (17.61%)
[('discarded_common', 25), ('discarded_exclusive_norm_plus', 43), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 74), ('retained_norm_pipe', 117)]
MODE: 5 | S 6
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2408 98 30 18 21 13 6 6 9 11 14 12 2]
Flag threshold: 2 | Number of species: 2648 | Species selected: 142 (5.36%)
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2408 98 30 18 21 13 6 6 9 11 14 12 2]
Flag threshold: 2 | Number of species: 2648 | Species selected: 142 (5.36%)
Threshold: 9 | Number of species: 142 | Species discarded: 68 (47.89%)
Threshold: 11 | Number of species: 142 | Species discarded: 25 (17.61%)
[('discarded_common', 25), ('discarded_exclusive_norm_plus', 43), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 74), ('retained_norm_pipe', 117)]
MODE: 5 | S 7
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2387 108 39 17 21 15 7 5 8 11 13 14 3]
Flag threshold: 2 | Number of species: 2648 | Species selected: 153 (5.78%)
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2387 108 39 17 21 15 7 5 8 11 13 14 3]
Flag threshold: 2 | Number of species: 2648 | Species selected: 153 (5.78%)
Threshold: 9 | Number of species: 153 | Species discarded: 71 (46.41%)
Threshold: 11 | Number of species: 153 | Species discarded: 25 (16.34%)
[('discarded_common', 25), ('discarded_exclusive_norm_plus', 46), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 82), ('retained_norm_pipe', 128)]
MODE: 5 | S 10
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2365 117 47 17 19 17 10 5 8 12 12 16 3]
Flag threshold: 2 | Number of species: 2648 | Species selected: 166 (6.27%)
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2365 117 47 17 19 17 10 5 8 12 12 16 3]
Flag threshold: 2 | Number of species: 2648 | Species selected: 166 (6.27%)
Threshold: 9 | Number of species: 166 | Species discarded: 76 (45.78%)
Threshold: 11 | Number of species: 166 | Species discarded: 27 (16.27%)
[('discarded_common', 27), ('discarded_exclusive_norm_plus', 49), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 90), ('retained_norm_pipe', 139)]
MODE: 5 | S 15
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2360 110 52 20 15 22 7 7 9 9 17 17 3]
Flag threshold: 2 | Number of species: 2648 | Species selected: 178 (6.72%)
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2360 110 52 20 15 22 7 7 9 9 17 17 3]
Flag threshold: 2 | Number of species: 2648 | Species selected: 178 (6.72%)
Threshold: 10 | Number of species: 178 | Species discarded: 82 (46.07%)
Threshold: 11 | Number of species: 178 | Species discarded: 28 (15.73%)
[('discarded_common', 28), ('discarded_exclusive_norm_plus', 54), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 96), ('retained_norm_pipe', 150)]
MODE: 7 | S 0
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10] [2597 49 20 7 8 4 1 3 1 2 5]
Flag threshold: 2 | Number of species: 2697 | Species selected: 51 (1.89%)
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10] [2597 49 20 7 8 4 1 3 1 2 5]
Flag threshold: 2 | Number of species: 2697 | Species selected: 51 (1.89%)
Threshold: 8 | Number of species: 51 | Species discarded: 32 (62.75%)
Threshold: 3 | Number of species: 51 | Species discarded: 6 (11.76%)
[('discarded_common', 6), ('discarded_exclusive_norm_plus', 26), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 19), ('retained_norm_pipe', 45)]
MODE: 7 | S 1
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2545 66 18 15 13 7 6 4 2 6 12 2 1]
Flag threshold: 2 | Number of species: 2697 | Species selected: 86 (3.19%)
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2545 66 18 15 13 7 6 4 2 6 12 2 1]
Flag threshold: 2 | Number of species: 2697 | Species selected: 86 (3.19%)
Threshold: 9 | Number of species: 86 | Species discarded: 53 (61.63%)
Threshold: 3 | Number of species: 86 | Species discarded: 7 (8.14%)
[('discarded_common', 7), ('discarded_exclusive_norm_plus', 46), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 33), ('retained_norm_pipe', 79)]
MODE: 7 | S 2
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2500 81 34 16 9 10 9 5 6 6 13 7 1]
Flag threshold: 2 | Number of species: 2697 | Species selected: 116 (4.30%)
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2500 81 34 16 9 10 9 5 6 6 13 7 1]
Flag threshold: 2 | Number of species: 2697 | Species selected: 116 (4.30%)
Threshold: 9 | Number of species: 116 | Species discarded: 66 (56.90%)
Threshold: 3 | Number of species: 116 | Species discarded: 7 (6.03%)
[('discarded_common', 7), ('discarded_exclusive_norm_plus', 59), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 50), ('retained_norm_pipe', 109)]
MODE: 7 | S 3
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2494 81 29 23 9 10 10 4 7 6 15 6 3]
Flag threshold: 2 | Number of species: 2697 | Species selected: 122 (4.52%)
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2494 81 29 23 9 10 10 4 7 6 15 6 3]
Flag threshold: 2 | Number of species: 2697 | Species selected: 122 (4.52%)
Threshold: 9 | Number of species: 122 | Species discarded: 70 (57.38%)
Threshold: 3 | Number of species: 122 | Species discarded: 8 (6.56%)
[('discarded_common', 8), ('discarded_exclusive_norm_plus', 62), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 52), ('retained_norm_pipe', 114)]
MODE: 7 | S 4
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2459 100 34 23 11 12 8 7 7 5 15 9 7]
Flag threshold: 2 | Number of species: 2697 | Species selected: 138 (5.12%)
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2459 100 34 23 11 12 8 7 7 5 15 9 7]
Flag threshold: 2 | Number of species: 2697 | Species selected: 138 (5.12%)
Threshold: 9 | Number of species: 138 | Species discarded: 75 (54.35%)
Threshold: 3 | Number of species: 138 | Species discarded: 9 (6.52%)
[('discarded_common', 9), ('discarded_exclusive_norm_plus', 66), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 63), ('retained_norm_pipe', 129)]
MODE: 7 | S 5
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2433 116 34 22 17 11 8 8 9 5 17 10 7]
Flag threshold: 2 | Number of species: 2697 | Species selected: 148 (5.49%)
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2433 116 34 22 17 11 8 8 9 5 17 10 7]
Flag threshold: 2 | Number of species: 2697 | Species selected: 148 (5.49%)
Threshold: 9 | Number of species: 148 | Species discarded: 78 (52.70%)
Threshold: 3 | Number of species: 148 | Species discarded: 9 (6.08%)
[('discarded_common', 9), ('discarded_exclusive_norm_plus', 69), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 70), ('retained_norm_pipe', 139)]
MODE: 7 | S 6
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2430 112 35 22 16 15 6 10 8 9 16 11 7]
Flag threshold: 2 | Number of species: 2697 | Species selected: 155 (5.75%)
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2430 112 35 22 16 15 6 10 8 9 16 11 7]
Flag threshold: 2 | Number of species: 2697 | Species selected: 155 (5.75%)
Threshold: 9 | Number of species: 155 | Species discarded: 80 (51.61%)
Threshold: 3 | Number of species: 155 | Species discarded: 9 (5.81%)
[('discarded_common', 9), ('discarded_exclusive_norm_plus', 71), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 75), ('retained_norm_pipe', 146)]
MODE: 7 | S 7
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2430 112 35 22 16 15 6 10 8 9 16 11 7]
Flag threshold: 2 | Number of species: 2697 | Species selected: 155 (5.75%)
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2430 112 35 22 16 15 6 10 8 9 16 11 7]
Flag threshold: 2 | Number of species: 2697 | Species selected: 155 (5.75%)
Threshold: 9 | Number of species: 155 | Species discarded: 80 (51.61%)
Threshold: 3 | Number of species: 155 | Species discarded: 9 (5.81%)
[('discarded_common', 9), ('discarded_exclusive_norm_plus', 71), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 75), ('retained_norm_pipe', 146)]
MODE: 7 | S 10
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2425 112 39 23 14 16 7 8 10 9 16 11 7]
Flag threshold: 2 | Number of species: 2697 | Species selected: 160 (5.93%)
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2425 112 39 23 14 16 7 8 10 9 16 11 7]
Flag threshold: 2 | Number of species: 2697 | Species selected: 160 (5.93%)
Threshold: 9 | Number of species: 160 | Species discarded: 81 (50.62%)
Threshold: 3 | Number of species: 160 | Species discarded: 9 (5.62%)
[('discarded_common', 9), ('discarded_exclusive_norm_plus', 72), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 79), ('retained_norm_pipe', 151)]
MODE: 7 | S 15
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2407 116 46 21 16 21 8 7 9 11 17 11 7]
Flag threshold: 2 | Number of species: 2697 | Species selected: 174 (6.45%)
TaxIDs species count: [ 0 1 2 3 4 5 6 7 8 9 10 11 12] [2407 116 46 21 16 21 8 7 9 11 17 11 7]
Flag threshold: 2 | Number of species: 2697 | Species selected: 174 (6.45%)
Threshold: 9 | Number of species: 174 | Species discarded: 87 (50.00%)
Threshold: 3 | Number of species: 174 | Species discarded: 9 (5.17%)
[('discarded_common', 9), ('discarded_exclusive_norm_plus', 78), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 87), ('retained_norm_pipe', 165)]
In [ ]: