In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from datetime import datetime
from matplotlib.colors import Normalize, ListedColormap
from scipy.stats import linregress, pearsonr
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests
import json
from kneed import KneeLocator
In [2]:
pd.options.display.float_format = '{:.3f}'.format
pd.set_option('display.max_columns', None)

plt.rcParams['figure.dpi']=170
In [3]:
from list_vars import LIST_PROFILERS, DIR_FIGURES, RESULTS_DIR, POOLS, CONTROLS

Biological sample analysis¶

In this notebook we are going to do an analysis on the biological samples (POOL samples + controls).

There are two main variables that we are going to consider:

  • The importance of including the biological control samples to ensure that false positives are not considered.
  • The importance of normalizing the reads considering the biooogical samples.

These two concepts are intertwined, so what we are going to do is the following:

  • Load all pool + control tables.
  • Get the species that are selected using the flags. With that we are going to generate a cut-off table with the species. The way to merge the table is "outer", that is, we are going to include any species that appears in any sample. We can later discard them.
  • We are going to discard the species that have less than X times more expression in control samples than in pools. X is determined dynamically using the kneed method.
  • Then we are going to compare which species have been discarded using the | normalization (normalizing controls and pools separately) and the + normalization (normalizng controls and pools jointly).
In [4]:
# Loading the tables
def process_samples(pass_num, mode, S, NORM, samples, verbose, min_sample_flag='dynamic'):
    # Initialize empty DataFrames for counts and flags
    joined_counts = pd.DataFrame()
    taxid_list = []

    for sample in samples:
        # Define file paths for counts and flags
        counts_file = f'{RESULTS_DIR}/summary/{sample}_pass{pass_num}_mode{mode}_taxgenus_S{S}_{NORM}.diversity.tsv'
        flags_file =  f'{RESULTS_DIR}/summary/{sample}_pass{pass_num}_mode{mode}_taxgenus_S{S}_{NORM}.flags.tsv'

        # Load the data
        df_counts = pd.read_csv(counts_file, sep='\t').set_index('taxonomy_id')[['name', 'lineage', 'mean_norm']]
        df_flags = pd.read_csv(flags_file, sep='\t').set_index('taxonomy_id')[['name', 'lineage', 'mean_norm']]

        # Ensure name and lineage columns are retained correctly
        if joined_counts.empty:
            joined_counts = df_counts.rename(columns={'mean_norm': sample})
        else:
            df_counts = df_counts.rename(columns={'mean_norm': sample})
            joined_counts = joined_counts.join(df_counts, how='outer', rsuffix=f'_{sample}')

        # Add tax_ids where mean_norm is False in flags dataframe
        taxid_list += df_flags[df_flags['mean_norm'] == False].index.tolist()

    # Consolidate name and lineage columns to avoid suffix issues
    if not joined_counts.empty:
        joined_counts['name'] = joined_counts.filter(like='name').bfill(axis=1).iloc[:, 0]
        joined_counts['lineage'] = joined_counts.filter(like='lineage').bfill(axis=1).iloc[:, 0]
        joined_counts = joined_counts.drop(columns=joined_counts.filter(like='name_').columns)
        joined_counts = joined_counts.drop(columns=joined_counts.filter(like='lineage_').columns)

    # Rename pool columns based on their ranges
    rename_mapping = {
        'POOL1': 'RR1', 'POOL2': 'RR2', 'POOL3': 'RR3', 'POOL4': 'RR4',
        'POOL5': 'SP1', 'POOL6': 'SP2', 'POOL7': 'SP3', 'POOL8': 'SP4',
        'POOL9': 'HC1', 'POOL10': 'HC2', 'POOL11': 'HC3', 'POOL12': 'HC4'
    }
    joined_counts = joined_counts.rename(columns=rename_mapping)

    # Create a "cut" DataFrame containing only the tax_ids in the taxid_list
    taxidvalues, samplecounts = np.unique(taxid_list, return_counts=True)
    joined_counts['n_samples_flag'] = 0
    joined_counts.loc[taxidvalues, 'n_samples_flag'] = samplecounts


    n_samples, counts_ntaxids = np.unique(joined_counts['n_samples_flag'].values, return_counts=True)
            
    if min_sample_flag == 'dynamic':
        kneedle = KneeLocator(n_samples, np.cumsum(counts_ntaxids), curve='concave', direction='increasing', S=0)
        min_sample_flag = kneedle.knee
    

    joined_counts['selected_flag'] = joined_counts['n_samples_flag'] >= min_sample_flag
    cut_df = joined_counts[joined_counts['selected_flag'] == True]

    if verbose: 
        print('TaxIDs species count:', n_samples, counts_ntaxids)
        print(f'Flag threshold: {min_sample_flag} | Number of species: {len(joined_counts)} | Species selected: {len(cut_df)} ({100 * len(cut_df) / len(joined_counts):.2f}%)')

    

    # Reset index and sort by mean counts (descending order)
    joined_counts = joined_counts.reset_index().sort_values(by=list(rename_mapping.values()), ascending=False)
    cut_df = cut_df.reset_index().sort_values(by=list(rename_mapping.values()), ascending=False)

    return joined_counts, taxid_list, cut_df
# Example usage
samples = [
    'POOL1', 'POOL2', 'POOL3', 'POOL4', 'POOL5', 'POOL6',
    'POOL7', 'POOL8', 'POOL9', 'POOL10', 'POOL11', 'POOL12',
    'ACIDOLA', 'BLACTIS'
]
In [5]:
def filter_by_nan_percentage(df, per_cutoff=0.35):
    # Identify sample columns (excluding taxonomy_id, name, lineage, and controls)
    control_cols = ['ACIDOLA', 'BLACTIS']
    sample_cols = [col for col in df.columns if col not in ['taxonomy_id', 'name', 'lineage'] + control_cols]

    # Calculate the percentage of NaNs in sample columns
    nan_percentage = df[sample_cols].isna().mean(axis=1)

    # Retain species with less than 35% NaNs
    filtered_df = df[nan_percentage < per_cutoff]

    return filtered_df
In [6]:
def calculate_retained_discarded(df, threshold, verbose):
        df = df.copy()
        
        # Separate ACIDOLA and BLACTIS columns
        control_cols = ['ACIDOLA', 'BLACTIS']
        sample_cols = [col for col in df.columns if col not in ['taxonomy_id', 'name', 'lineage'] + control_cols]

        # Calculate mean across samples
        df['median_across_samples'] = df[sample_cols].median(axis=1, skipna=True)

        # Calculate max of ACIDOLA and BLACTIS
        df['max_control'] = df[control_cols].max(axis=1, skipna=True).fillna(0)

        if threshold == 'dynamic':
            list_len_discarded = []

            for threshold in range(1, 500): # In theory the dataframe len is not related but it is just a number to add, which should be bigger the bigger the dataframe 
                discarded = df[~((df['median_across_samples'] > (df['max_control'] * threshold)))]
                list_len_discarded.append(len(discarded) / len(df))

            kneedle = KneeLocator(np.arange(1,500), list_len_discarded, curve='concave', direction='increasing', S=0)
            threshold = kneedle.knee


        # Define retention logic
        retained = df[(df['median_across_samples'] > (df['max_control'] * threshold)) | (df['max_control'].isna())]
        discarded = df[~((df['median_across_samples'] > (df['max_control'] * threshold)) | (df['max_control'].isna()))]

        if verbose: 
                print(f'Threshold: {threshold} | Number of species: {len(df)} | Species discarded: {len(discarded)} ({100 * len(discarded) / len(df):.2f}%)')

        return retained, discarded

def filter_species_ids(joined_counts_norm_plus, joined_counts_norm_pipe, threshold='dynamic', verbose=True):
    retained_norm_plus, discarded_norm_plus = calculate_retained_discarded(joined_counts_norm_plus, threshold, verbose)
    retained_norm_pipe, discarded_norm_pipe = calculate_retained_discarded(joined_counts_norm_pipe, threshold, verbose)

    # Extract taxonomy IDs
    retained_ids_norm_plus = retained_norm_plus['taxonomy_id'].tolist()
    discarded_ids_norm_plus = discarded_norm_plus['taxonomy_id'].tolist()
    retained_ids_norm_pipe = retained_norm_pipe['taxonomy_id'].tolist()
    discarded_ids_norm_pipe = discarded_norm_pipe['taxonomy_id'].tolist()



    discarded_common = np.intersect1d(discarded_ids_norm_plus, discarded_ids_norm_pipe).tolist()
    discarded_exclusive_norm_plus = [i for i in discarded_ids_norm_plus if i not in discarded_ids_norm_pipe]
    discarded_exclusive_norm_pipe = [i for i in discarded_ids_norm_pipe if i not in discarded_ids_norm_plus]    

    return {
        'discarded_common': discarded_common,
        'discarded_exclusive_norm_plus': discarded_exclusive_norm_plus,
        'discarded_exclusive_norm_pipe': discarded_exclusive_norm_pipe,
        'retained_norm_plus': retained_ids_norm_plus,
        'retained_norm_pipe': retained_ids_norm_pipe
    }
In [7]:
def differential_abundance_analysis(df, condition_cols, reference_cols):
    """
    Perform differential abundance analysis between condition and reference groups.

    Parameters:
        df (pd.DataFrame): Input dataframe containing species counts and metadata.
        condition_cols (list): Column names for the condition group.
        reference_cols (list): Column names for the reference group.
        output_file (str, optional): Path to save results to an Excel file. Default is None.
        sheet_name (str): Sheet name for Excel output. Default is 'Results'.

    Returns:
        pd.DataFrame: Dataframe containing p-values, log2 fold changes, and sorted results.
    """
    list_pvals_mannwhitney = []
    L2FC = []

    for row in range(len(df)):
        # Extract condition and reference values
        condition_vals = df.iloc[row][condition_cols].astype(float).dropna().values
        reference_vals = df.iloc[row][reference_cols].astype(float).dropna().values

        condition_vals, reference_vals = condition_vals, reference_vals

        # Mann-Whitney U test
        res_mw = mannwhitneyu(condition_vals, reference_vals, alternative='two-sided')
        list_pvals_mannwhitney.append(res_mw.pvalue)

        # Log2 fold change
        L2FC.append(np.log2(np.nanmedian(condition_vals) / np.nanmedian(reference_vals)))

    # Compile results
    df_pval = df.copy()
    df_pval['log2FC'] = L2FC
    df_pval['pval_MW'] = list_pvals_mannwhitney

    # Add the corrected p-values to the dataset
    _, pvals_corrected, _, _ = multipletests(df_pval['pval_MW'].values, alpha=0.05, method='fdr_bh')
    df_pval['pval_MW_corrected'] = pvals_corrected


    # Sort by p-values
    df_pval = df_pval.sort_values(by=['pval_MW'])

    return df_pval
In [8]:
os.makedirs(f'{RESULTS_DIR}/merged_counts', exist_ok=True)
os.makedirs(f'{RESULTS_DIR}/differential_abundance', exist_ok=True)
In [9]:
for mode in [3, 5, 7]:
    for S in [0, 1, 2, 3, 4, 5, 6, 7, 10, 15]:
        print(f'MODE: {mode} | S {S}')
        df_all_normplus, taxid_list, df_cut_normplus = process_samples(pass_num=2, mode=mode, S=S, NORM='NORM+', samples=samples, verbose=True, min_sample_flag=2)
        df_cut_nan_percentage_normplus = filter_by_nan_percentage(df_cut_normplus, per_cutoff=0.35)

        df_all_normpipe, taxid_list, df_cut_normpipe = process_samples(pass_num=2, mode=mode, S=S, NORM='NORMx', samples=samples, verbose=True, min_sample_flag=2)
        df_cut_nan_percentage_normpipe = filter_by_nan_percentage(df_cut_normpipe, per_cutoff=0.35)

        dict_filternorm_cut = filter_species_ids(df_cut_nan_percentage_normplus, df_cut_nan_percentage_normpipe)
        with open(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_dict_norm+|_species.tsv', "w") as file:
            json.dump(dict_filternorm_cut, file)
        
        print([(i, len(dict_filternorm_cut[i])) for i in dict_filternorm_cut.keys()])

        df_cut_nan_percentage_normplus_discarded_common = df_cut_nan_percentage_normplus[df_cut_nan_percentage_normplus['taxonomy_id'].isin(dict_filternorm_cut['discarded_common'])]
        df_cut_nan_percentage_normplus_discarded_common.to_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORM+_discarded_common.tsv', sep='\t', index=None)

        df_cut_nan_percentage_normplus_discarded_normplus = df_cut_nan_percentage_normplus[df_cut_nan_percentage_normplus['taxonomy_id'].isin(dict_filternorm_cut['discarded_exclusive_norm_plus'])]
        df_cut_nan_percentage_normplus_discarded_normplus.to_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORM+_discarded_norm+.tsv', sep='\t', index=None)
        
        df_cut_nan_percentage_normplus_retained = df_cut_nan_percentage_normplus[df_cut_nan_percentage_normplus['taxonomy_id'].isin(dict_filternorm_cut['retained_norm_plus'])]
        df_cut_nan_percentage_normplus_retained.to_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORM+_retained.tsv', sep='\t', index=None)

        df_cut_nan_percentage_normpipe_discarded_common = df_cut_nan_percentage_normpipe[df_cut_nan_percentage_normpipe['taxonomy_id'].isin(dict_filternorm_cut['discarded_common'])]
        df_cut_nan_percentage_normpipe_discarded_common.to_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORMx_discarded_common.tsv', sep='\t', index=None)

        df_cut_nan_percentage_normpipe_discarded_normplus = df_cut_nan_percentage_normpipe[df_cut_nan_percentage_normpipe['taxonomy_id'].isin(dict_filternorm_cut['discarded_exclusive_norm_plus'])]
        df_cut_nan_percentage_normpipe_discarded_normplus.to_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORMx_discarded_norm+.tsv', sep='\t', index=None)

        df_cut_nan_percentage_normpipe_retained = df_cut_nan_percentage_normpipe[df_cut_nan_percentage_normpipe['taxonomy_id'].isin(dict_filternorm_cut['retained_norm_pipe'])]
        df_cut_nan_percentage_normpipe_retained.to_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORMx_retained.tsv', sep='\t', index=None)



        df_pval_HCvsRR = differential_abundance_analysis(df_cut_nan_percentage_normplus_retained, ['HC1', 'HC2', 'HC3', 'HC4'], ['RR1', 'RR2', 'RR3', 'RR4'])
        df_pval_HCvsRR.to_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_HCvsRR.tsv', sep='\t', index=None)

        df_pval_HCvsSP = differential_abundance_analysis(df_cut_nan_percentage_normplus_retained, ['HC1', 'HC2', 'HC3', 'HC4'], ['SP1', 'SP2', 'SP3', 'SP4'])
        df_pval_HCvsSP.to_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_HCvsSP.tsv', sep='\t', index=None)

        df_pval_RRvsSP = differential_abundance_analysis(df_cut_nan_percentage_normplus_retained, ['RR1', 'RR2', 'RR3', 'RR4'], ['SP1', 'SP2', 'SP3', 'SP4'])
        df_pval_RRvsSP.to_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_RRvsSP.tsv', sep='\t', index=None)

        df_pval_sex = differential_abundance_analysis(df_cut_nan_percentage_normplus_retained, ['HC1', 'HC2', 'RR1', 'RR2', 'SP1', 'SP2'], ['HC3', 'HC4', 'RR3', 'RR4', 'SP3', 'SP4'])
        df_pval_sex.to_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_sex.tsv', sep='\t', index=None)

        print('\n\n')
MODE: 3 | S 0
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8 10] [2535   39   12   11    5    5    3    3    5    2]
Flag threshold: 2 | Number of species: 2620 | Species selected: 46 (1.76%)
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8 10] [2535   39   12   11    5    5    3    3    5    2]
Flag threshold: 2 | Number of species: 2620 | Species selected: 46 (1.76%)
Threshold: 2 | Number of species: 46 | Species discarded: 16 (34.78%)
Threshold: 1 | Number of species: 46 | Species discarded: 2 (4.35%)
[('discarded_common', 2), ('discarded_exclusive_norm_plus', 14), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 30), ('retained_norm_pipe', 44)]



MODE: 3 | S 1
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11] [2485   54   18   15   12    5    7    2    2    4   14    2]
Flag threshold: 2 | Number of species: 2620 | Species selected: 81 (3.09%)
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11] [2485   54   18   15   12    5    7    2    2    4   14    2]
Flag threshold: 2 | Number of species: 2620 | Species selected: 81 (3.09%)
Threshold: 8 | Number of species: 81 | Species discarded: 45 (55.56%)
Threshold: 1 | Number of species: 81 | Species discarded: 5 (6.17%)
[('discarded_common', 5), ('discarded_exclusive_norm_plus', 40), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 36), ('retained_norm_pipe', 76)]



MODE: 3 | S 2
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2429   75   32   16   15    9    7    8    4    5   14    5    1]
Flag threshold: 2 | Number of species: 2620 | Species selected: 116 (4.43%)
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2429   75   32   16   15    9    7    8    4    5   14    5    1]
Flag threshold: 2 | Number of species: 2620 | Species selected: 116 (4.43%)
Threshold: 8 | Number of species: 116 | Species discarded: 58 (50.00%)
Threshold: 1 | Number of species: 116 | Species discarded: 6 (5.17%)
[('discarded_common', 6), ('discarded_exclusive_norm_plus', 52), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 58), ('retained_norm_pipe', 110)]



MODE: 3 | S 3
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2406   93   30   16   14   12    8    4    8    6   13    8    2]
Flag threshold: 2 | Number of species: 2620 | Species selected: 121 (4.62%)
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2406   93   30   16   14   12    8    4    8    6   13    8    2]
Flag threshold: 2 | Number of species: 2620 | Species selected: 121 (4.62%)
Threshold: 8 | Number of species: 121 | Species discarded: 59 (48.76%)
Threshold: 1 | Number of species: 121 | Species discarded: 6 (4.96%)
[('discarded_common', 6), ('discarded_exclusive_norm_plus', 53), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 62), ('retained_norm_pipe', 115)]



MODE: 3 | S 4
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2404   94   29   15   13   15    6    6    6    4   18    8    2]
Flag threshold: 2 | Number of species: 2620 | Species selected: 122 (4.66%)
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2404   94   29   15   13   15    6    6    6    4   18    8    2]
Flag threshold: 2 | Number of species: 2620 | Species selected: 122 (4.66%)
Threshold: 8 | Number of species: 122 | Species discarded: 60 (49.18%)
Threshold: 1 | Number of species: 122 | Species discarded: 6 (4.92%)
[('discarded_common', 6), ('discarded_exclusive_norm_plus', 54), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 62), ('retained_norm_pipe', 116)]



MODE: 3 | S 5
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2392   92   34   15   16   15   11    6    5    4   17    9    4]
Flag threshold: 2 | Number of species: 2620 | Species selected: 136 (5.19%)
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2392   92   34   15   16   15   11    6    5    4   17    9    4]
Flag threshold: 2 | Number of species: 2620 | Species selected: 136 (5.19%)
Threshold: 9 | Number of species: 136 | Species discarded: 62 (45.59%)
Threshold: 1 | Number of species: 136 | Species discarded: 6 (4.41%)
[('discarded_common', 6), ('discarded_exclusive_norm_plus', 56), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 74), ('retained_norm_pipe', 130)]



MODE: 3 | S 6
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2366  105   35   23   12   17    7    8    8    7   18   10    4]
Flag threshold: 2 | Number of species: 2620 | Species selected: 149 (5.69%)
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2366  105   35   23   12   17    7    8    8    7   18   10    4]
Flag threshold: 2 | Number of species: 2620 | Species selected: 149 (5.69%)
Threshold: 9 | Number of species: 149 | Species discarded: 65 (43.62%)
Threshold: 1 | Number of species: 149 | Species discarded: 6 (4.03%)
[('discarded_common', 6), ('discarded_exclusive_norm_plus', 59), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 84), ('retained_norm_pipe', 143)]



MODE: 3 | S 7
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2361  108   34   22   14   18    8    7    8    7   19   10    4]
Flag threshold: 2 | Number of species: 2620 | Species selected: 151 (5.76%)
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2361  108   34   22   14   18    8    7    8    7   19   10    4]
Flag threshold: 2 | Number of species: 2620 | Species selected: 151 (5.76%)
Threshold: 9 | Number of species: 151 | Species discarded: 67 (44.37%)
Threshold: 1 | Number of species: 151 | Species discarded: 6 (3.97%)
[('discarded_common', 6), ('discarded_exclusive_norm_plus', 61), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 84), ('retained_norm_pipe', 145)]



MODE: 3 | S 10
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2340  118   38   22   15   16   13    6    7    7   21   11    6]
Flag threshold: 2 | Number of species: 2620 | Species selected: 162 (6.18%)
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2340  118   38   22   15   16   13    6    7    7   21   11    6]
Flag threshold: 2 | Number of species: 2620 | Species selected: 162 (6.18%)
Threshold: 9 | Number of species: 162 | Species discarded: 68 (41.98%)
Threshold: 1 | Number of species: 162 | Species discarded: 6 (3.70%)
[('discarded_common', 6), ('discarded_exclusive_norm_plus', 62), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 94), ('retained_norm_pipe', 156)]



MODE: 3 | S 15
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2322  128   44   16   17   17   15    7    6    8   20   13    7]
Flag threshold: 2 | Number of species: 2620 | Species selected: 170 (6.49%)
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2322  128   44   16   17   17   15    7    6    8   20   13    7]
Flag threshold: 2 | Number of species: 2620 | Species selected: 170 (6.49%)
Threshold: 9 | Number of species: 170 | Species discarded: 72 (42.35%)
Threshold: 1 | Number of species: 170 | Species discarded: 7 (4.12%)
[('discarded_common', 7), ('discarded_exclusive_norm_plus', 65), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 98), ('retained_norm_pipe', 163)]



MODE: 5 | S 0
TaxIDs species count: [ 0  1  2  3  4  5  6  8  9 10] [2571   34   14    7    6    5    2    3    3    3]
Flag threshold: 2 | Number of species: 2648 | Species selected: 43 (1.62%)
TaxIDs species count: [ 0  1  2  3  4  5  6  8  9 10] [2571   34   14    7    6    5    2    3    3    3]
Flag threshold: 2 | Number of species: 2648 | Species selected: 43 (1.62%)
Threshold: 3 | Number of species: 43 | Species discarded: 17 (39.53%)
Threshold: 1 | Number of species: 43 | Species discarded: 3 (6.98%)
[('discarded_common', 3), ('discarded_exclusive_norm_plus', 14), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 26), ('retained_norm_pipe', 40)]



MODE: 5 | S 1
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11] [2489   69   18   18   13    5    7    6    3    4   13    3]
Flag threshold: 2 | Number of species: 2648 | Species selected: 90 (3.40%)
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11] [2489   69   18   18   13    5    7    6    3    4   13    3]
Flag threshold: 2 | Number of species: 2648 | Species selected: 90 (3.40%)
Threshold: 9 | Number of species: 90 | Species discarded: 55 (61.11%)
Threshold: 11 | Number of species: 90 | Species discarded: 22 (24.44%)
[('discarded_common', 22), ('discarded_exclusive_norm_plus', 33), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 35), ('retained_norm_pipe', 68)]



MODE: 5 | S 2
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2473   80   19   14   11   11    7    5    4    5   12    5    2]
Flag threshold: 2 | Number of species: 2648 | Species selected: 95 (3.59%)
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2473   80   19   14   11   11    7    5    4    5   12    5    2]
Flag threshold: 2 | Number of species: 2648 | Species selected: 95 (3.59%)
Threshold: 9 | Number of species: 95 | Species discarded: 55 (57.89%)
Threshold: 11 | Number of species: 95 | Species discarded: 22 (23.16%)
[('discarded_common', 22), ('discarded_exclusive_norm_plus', 33), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 40), ('retained_norm_pipe', 73)]



MODE: 5 | S 3
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2443   84   28   17   12   10   10    7    8    4   12   11    2]
Flag threshold: 2 | Number of species: 2648 | Species selected: 121 (4.57%)
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2443   84   28   17   12   10   10    7    8    4   12   11    2]
Flag threshold: 2 | Number of species: 2648 | Species selected: 121 (4.57%)
Threshold: 9 | Number of species: 121 | Species discarded: 62 (51.24%)
Threshold: 11 | Number of species: 121 | Species discarded: 25 (20.66%)
[('discarded_common', 25), ('discarded_exclusive_norm_plus', 37), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 59), ('retained_norm_pipe', 96)]



MODE: 5 | S 4
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2427   92   31   20   14    7    8    8    8    5   14   12    2]
Flag threshold: 2 | Number of species: 2648 | Species selected: 129 (4.87%)
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2427   92   31   20   14    7    8    8    8    5   14   12    2]
Flag threshold: 2 | Number of species: 2648 | Species selected: 129 (4.87%)
Threshold: 9 | Number of species: 129 | Species discarded: 63 (48.84%)
Threshold: 11 | Number of species: 129 | Species discarded: 25 (19.38%)
[('discarded_common', 25), ('discarded_exclusive_norm_plus', 38), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 66), ('retained_norm_pipe', 104)]



MODE: 5 | S 5
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2408   98   30   18   21   13    6    6    9   11   14   12    2]
Flag threshold: 2 | Number of species: 2648 | Species selected: 142 (5.36%)
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2408   98   30   18   21   13    6    6    9   11   14   12    2]
Flag threshold: 2 | Number of species: 2648 | Species selected: 142 (5.36%)
Threshold: 9 | Number of species: 142 | Species discarded: 68 (47.89%)
Threshold: 11 | Number of species: 142 | Species discarded: 25 (17.61%)
[('discarded_common', 25), ('discarded_exclusive_norm_plus', 43), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 74), ('retained_norm_pipe', 117)]



MODE: 5 | S 6
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2408   98   30   18   21   13    6    6    9   11   14   12    2]
Flag threshold: 2 | Number of species: 2648 | Species selected: 142 (5.36%)
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2408   98   30   18   21   13    6    6    9   11   14   12    2]
Flag threshold: 2 | Number of species: 2648 | Species selected: 142 (5.36%)
Threshold: 9 | Number of species: 142 | Species discarded: 68 (47.89%)
Threshold: 11 | Number of species: 142 | Species discarded: 25 (17.61%)
[('discarded_common', 25), ('discarded_exclusive_norm_plus', 43), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 74), ('retained_norm_pipe', 117)]



MODE: 5 | S 7
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2387  108   39   17   21   15    7    5    8   11   13   14    3]
Flag threshold: 2 | Number of species: 2648 | Species selected: 153 (5.78%)
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2387  108   39   17   21   15    7    5    8   11   13   14    3]
Flag threshold: 2 | Number of species: 2648 | Species selected: 153 (5.78%)
Threshold: 9 | Number of species: 153 | Species discarded: 71 (46.41%)
Threshold: 11 | Number of species: 153 | Species discarded: 25 (16.34%)
[('discarded_common', 25), ('discarded_exclusive_norm_plus', 46), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 82), ('retained_norm_pipe', 128)]



MODE: 5 | S 10
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2365  117   47   17   19   17   10    5    8   12   12   16    3]
Flag threshold: 2 | Number of species: 2648 | Species selected: 166 (6.27%)
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2365  117   47   17   19   17   10    5    8   12   12   16    3]
Flag threshold: 2 | Number of species: 2648 | Species selected: 166 (6.27%)
Threshold: 9 | Number of species: 166 | Species discarded: 76 (45.78%)
Threshold: 11 | Number of species: 166 | Species discarded: 27 (16.27%)
[('discarded_common', 27), ('discarded_exclusive_norm_plus', 49), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 90), ('retained_norm_pipe', 139)]



MODE: 5 | S 15
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2360  110   52   20   15   22    7    7    9    9   17   17    3]
Flag threshold: 2 | Number of species: 2648 | Species selected: 178 (6.72%)
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2360  110   52   20   15   22    7    7    9    9   17   17    3]
Flag threshold: 2 | Number of species: 2648 | Species selected: 178 (6.72%)
Threshold: 10 | Number of species: 178 | Species discarded: 82 (46.07%)
Threshold: 11 | Number of species: 178 | Species discarded: 28 (15.73%)
[('discarded_common', 28), ('discarded_exclusive_norm_plus', 54), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 96), ('retained_norm_pipe', 150)]



MODE: 7 | S 0
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10] [2597   49   20    7    8    4    1    3    1    2    5]
Flag threshold: 2 | Number of species: 2697 | Species selected: 51 (1.89%)
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10] [2597   49   20    7    8    4    1    3    1    2    5]
Flag threshold: 2 | Number of species: 2697 | Species selected: 51 (1.89%)
Threshold: 8 | Number of species: 51 | Species discarded: 32 (62.75%)
Threshold: 3 | Number of species: 51 | Species discarded: 6 (11.76%)
[('discarded_common', 6), ('discarded_exclusive_norm_plus', 26), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 19), ('retained_norm_pipe', 45)]



MODE: 7 | S 1
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2545   66   18   15   13    7    6    4    2    6   12    2    1]
Flag threshold: 2 | Number of species: 2697 | Species selected: 86 (3.19%)
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2545   66   18   15   13    7    6    4    2    6   12    2    1]
Flag threshold: 2 | Number of species: 2697 | Species selected: 86 (3.19%)
Threshold: 9 | Number of species: 86 | Species discarded: 53 (61.63%)
Threshold: 3 | Number of species: 86 | Species discarded: 7 (8.14%)
[('discarded_common', 7), ('discarded_exclusive_norm_plus', 46), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 33), ('retained_norm_pipe', 79)]



MODE: 7 | S 2
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2500   81   34   16    9   10    9    5    6    6   13    7    1]
Flag threshold: 2 | Number of species: 2697 | Species selected: 116 (4.30%)
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2500   81   34   16    9   10    9    5    6    6   13    7    1]
Flag threshold: 2 | Number of species: 2697 | Species selected: 116 (4.30%)
Threshold: 9 | Number of species: 116 | Species discarded: 66 (56.90%)
Threshold: 3 | Number of species: 116 | Species discarded: 7 (6.03%)
[('discarded_common', 7), ('discarded_exclusive_norm_plus', 59), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 50), ('retained_norm_pipe', 109)]



MODE: 7 | S 3
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2494   81   29   23    9   10   10    4    7    6   15    6    3]
Flag threshold: 2 | Number of species: 2697 | Species selected: 122 (4.52%)
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2494   81   29   23    9   10   10    4    7    6   15    6    3]
Flag threshold: 2 | Number of species: 2697 | Species selected: 122 (4.52%)
Threshold: 9 | Number of species: 122 | Species discarded: 70 (57.38%)
Threshold: 3 | Number of species: 122 | Species discarded: 8 (6.56%)
[('discarded_common', 8), ('discarded_exclusive_norm_plus', 62), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 52), ('retained_norm_pipe', 114)]



MODE: 7 | S 4
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2459  100   34   23   11   12    8    7    7    5   15    9    7]
Flag threshold: 2 | Number of species: 2697 | Species selected: 138 (5.12%)
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2459  100   34   23   11   12    8    7    7    5   15    9    7]
Flag threshold: 2 | Number of species: 2697 | Species selected: 138 (5.12%)
Threshold: 9 | Number of species: 138 | Species discarded: 75 (54.35%)
Threshold: 3 | Number of species: 138 | Species discarded: 9 (6.52%)
[('discarded_common', 9), ('discarded_exclusive_norm_plus', 66), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 63), ('retained_norm_pipe', 129)]



MODE: 7 | S 5
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2433  116   34   22   17   11    8    8    9    5   17   10    7]
Flag threshold: 2 | Number of species: 2697 | Species selected: 148 (5.49%)
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2433  116   34   22   17   11    8    8    9    5   17   10    7]
Flag threshold: 2 | Number of species: 2697 | Species selected: 148 (5.49%)
Threshold: 9 | Number of species: 148 | Species discarded: 78 (52.70%)
Threshold: 3 | Number of species: 148 | Species discarded: 9 (6.08%)
[('discarded_common', 9), ('discarded_exclusive_norm_plus', 69), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 70), ('retained_norm_pipe', 139)]



MODE: 7 | S 6
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2430  112   35   22   16   15    6   10    8    9   16   11    7]
Flag threshold: 2 | Number of species: 2697 | Species selected: 155 (5.75%)
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2430  112   35   22   16   15    6   10    8    9   16   11    7]
Flag threshold: 2 | Number of species: 2697 | Species selected: 155 (5.75%)
Threshold: 9 | Number of species: 155 | Species discarded: 80 (51.61%)
Threshold: 3 | Number of species: 155 | Species discarded: 9 (5.81%)
[('discarded_common', 9), ('discarded_exclusive_norm_plus', 71), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 75), ('retained_norm_pipe', 146)]



MODE: 7 | S 7
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2430  112   35   22   16   15    6   10    8    9   16   11    7]
Flag threshold: 2 | Number of species: 2697 | Species selected: 155 (5.75%)
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2430  112   35   22   16   15    6   10    8    9   16   11    7]
Flag threshold: 2 | Number of species: 2697 | Species selected: 155 (5.75%)
Threshold: 9 | Number of species: 155 | Species discarded: 80 (51.61%)
Threshold: 3 | Number of species: 155 | Species discarded: 9 (5.81%)
[('discarded_common', 9), ('discarded_exclusive_norm_plus', 71), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 75), ('retained_norm_pipe', 146)]



MODE: 7 | S 10
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2425  112   39   23   14   16    7    8   10    9   16   11    7]
Flag threshold: 2 | Number of species: 2697 | Species selected: 160 (5.93%)
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2425  112   39   23   14   16    7    8   10    9   16   11    7]
Flag threshold: 2 | Number of species: 2697 | Species selected: 160 (5.93%)
Threshold: 9 | Number of species: 160 | Species discarded: 81 (50.62%)
Threshold: 3 | Number of species: 160 | Species discarded: 9 (5.62%)
[('discarded_common', 9), ('discarded_exclusive_norm_plus', 72), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 79), ('retained_norm_pipe', 151)]



MODE: 7 | S 15
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2407  116   46   21   16   21    8    7    9   11   17   11    7]
Flag threshold: 2 | Number of species: 2697 | Species selected: 174 (6.45%)
TaxIDs species count: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [2407  116   46   21   16   21    8    7    9   11   17   11    7]
Flag threshold: 2 | Number of species: 2697 | Species selected: 174 (6.45%)
Threshold: 9 | Number of species: 174 | Species discarded: 87 (50.00%)
Threshold: 3 | Number of species: 174 | Species discarded: 9 (5.17%)
[('discarded_common', 9), ('discarded_exclusive_norm_plus', 78), ('discarded_exclusive_norm_pipe', 0), ('retained_norm_plus', 87), ('retained_norm_pipe', 165)]



In [ ]: