In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import os
from datetime import datetime
from matplotlib.colors import Normalize, ListedColormap
from scipy.stats import linregress, pearsonr
from scipy.stats import mannwhitneyu, wilcoxon
from statsmodels.stats.multitest import multipletests

from statannotations.Annotator import Annotator
In [69]:
# Notebook-wide display and plotting configuration.
pd.options.display.float_format = '{:.3f}'.format
pd.set_option('display.max_columns', None)

# Inline (on-screen) figure resolution; saved figures use DPI below.
plt.rcParams['figure.dpi']=170

plt.rc('axes', linewidth=0.65)  # Adjust the line width of plot frames
plt.rc('xtick.major', width=0.0)
plt.rc('ytick.major', width=0.0)


# Resolution used when saving figures to disk (see plt.savefig calls).
DPI=250
In [70]:
from list_vars import LIST_PROFILERS, DIR_FIGURES, RESULTS_DIR

In silico sample analysis¶

In this notebook we are going to do an analysis on the in silico samples, where we are going to study several variables.


How many reads are incorrectly mapped if we do not perform a host mapping step?¶

It has been reported that not mapping to human databases before profiling increases the number of reads assigned to other organisms.

In this case, we are going to do 3 checks with the in silico dataset using pass2 (profiling after 2-time host mapping) and pass0 (direct profiling without host mapping), and we are going to check the influence on parameter sensitivity:

  • We are going to see what is the total number of reads mapped to the human dataset, and what is the offset left unmapped which should have been mapped to human.
    • We are also going to do the same with the microbial reads, and see if more microbial reads have been assigned to the pass0 dataset.

Later in the analysis we are going to do two additional analyses:

  • We are going to see the number of species present in total between pass0 and pass2, and their jaccard index.
  • We are going to calculate the ratio between the number of reads in pass0 and pass2.
In [71]:
# Host-mapping read counts per sample (columns include '1st_mapped' and '2nd_mapped'), indexed by sample name.
df_host_map_info = pd.read_csv(f'{RESULTS_DIR}/counts/mapping_counts.txt', sep='\t').set_index('SAMPLE')
In [72]:
# Ground-truth composition of the in silico sample: species, taxid and read counts.
artificial_taxid_counts = pd.read_csv('table_artificial_taxid.csv', sep=';', names=['species', 'taxid', 'reads'])
# NOTE(review): halving 'reads' presumably converts mate counts to read pairs
# (paired-end data) — confirm against how the table was generated.
artificial_taxid_counts['reads_true'] = (artificial_taxid_counts['reads'] / 2).astype(int)

# NOTE(review): assumes the first row of the table is the human entry — verify ordering.
n_true_human_reads = int(artificial_taxid_counts['reads_true'].iloc[0])
n_true_human_reads
Out[72]:
20000000
In [73]:
# Total reads removed by the two host-mapping passes for the ARTIFICIAL sample.
n_mapped_reads_1and2_maps = df_host_map_info.loc['ARTIFICIAL', '1st_mapped'] + df_host_map_info.loc['ARTIFICIAL', '2nd_mapped']

# NOTE(review): the printed output reports ~191% mapped and a negative
# remainder, which suggests '1st_mapped'/'2nd_mapped' count individual mates
# while n_true_human_reads counts pairs (or vice versa) — verify the units
# before interpreting these percentages.
print(f'There is a total of {n_mapped_reads_1and2_maps} reads mapped to human during the 1st and 2nd map, which represents around {100 * n_mapped_reads_1and2_maps/n_true_human_reads} % of the total number of reads ({n_true_human_reads}).')
print(f'There is a total of {n_true_human_reads - n_mapped_reads_1and2_maps} reads remaining to be mapped.')
There is a total of 38219883 reads mapped to human during the 1st and 2nd map, which represents around 191.099415 % of the total number of reads (20000000).
There is a total of -18219883 reads remaining to be mapped.
In [74]:
# Profiling read counts (human / other / unmapped) per profiler, mode and pass.
df_host_profile_info = pd.read_csv(f'{RESULTS_DIR}/counts/profiling_counts_ARTIFICIAL.txt', sep='\t')

# Take an explicit copy: assigning new columns to a boolean-filtered view
# triggers SettingWithCopyWarning and may silently fail to write through.
df_host_profile_info_artificial = df_host_profile_info[df_host_profile_info['SAMPLE'] == 'ARTIFICIAL'].copy()

# Reads removed by the 1st/2nd host-mapping steps only apply to pass 2;
# pass 0 profiles the raw reads directly (no host mapping).
df_host_profile_info_artificial['mapped_human_1_2_maps'] = 0
df_host_profile_info_artificial.loc[df_host_profile_info_artificial['pass'] == 2, 'mapped_human_1_2_maps'] = n_mapped_reads_1and2_maps

# Totals: human reads (profiler-assigned + host-mapping) and all reads.
df_host_profile_info_artificial['mapped_human_total'] = df_host_profile_info_artificial['mapped_human_1_2_maps'] + df_host_profile_info_artificial['mapped_human']
df_host_profile_info_artificial['total_reads'] = df_host_profile_info_artificial['mapped_human_total'] + df_host_profile_info_artificial['mapped_others'] + df_host_profile_info_artificial['unmapped']

# Observed proportion of each read category.
df_host_profile_info_artificial['observed_human_prop'] = df_host_profile_info_artificial['mapped_human_total'] / df_host_profile_info_artificial['total_reads']
df_host_profile_info_artificial['observed_others_prop'] = df_host_profile_info_artificial['mapped_others'] / df_host_profile_info_artificial['total_reads']
df_host_profile_info_artificial['observed_unmapped_prop'] = df_host_profile_info_artificial['unmapped'] / df_host_profile_info_artificial['total_reads']

# Expected proportions from the ground-truth composition.
df_host_profile_info_artificial['expected_human_prop'] = n_true_human_reads / artificial_taxid_counts['reads_true'].sum() # 0.8
df_host_profile_info_artificial['expected_others_prop'] = 1 - n_true_human_reads / artificial_taxid_counts['reads_true'].sum() # 0.2 (comment fixed: previously mislabelled 0.8)

# Shortfall between expected and observed proportions per category.
df_host_profile_info_artificial['calculated_unmapped_human_prop'] = df_host_profile_info_artificial['expected_human_prop'] - df_host_profile_info_artificial['observed_human_prop']
df_host_profile_info_artificial['calculated_unmapped_others_prop'] = df_host_profile_info_artificial['expected_others_prop'] - df_host_profile_info_artificial['observed_others_prop']

# Fraction of the expected non-human reads that were actually assigned.
df_host_profile_info_artificial['proportion_mapped_other_reads'] = df_host_profile_info_artificial['observed_others_prop'] / df_host_profile_info_artificial['expected_others_prop']


for profiler in LIST_PROFILERS:
    display(profiler)
    display(df_host_profile_info_artificial[df_host_profile_info_artificial['profiler'] == profiler])
'centrifuge'
SAMPLE pass mode profiler mapped_human mapped_others unmapped mapped_human_1_2_maps mapped_human_total total_reads observed_human_prop observed_others_prop observed_unmapped_prop expected_human_prop expected_others_prop calculated_unmapped_human_prop calculated_unmapped_others_prop proportion_mapped_other_reads
0 ARTIFICIAL 0 1 centrifuge 38740565 7929463 3326083 0 38740565 49996111 0.775 0.159 0.067 0.800 0.200 0.025 0.041 0.793
1 ARTIFICIAL 0 2 centrifuge 38831835 7963709 3200567 0 38831835 49996111 0.777 0.159 0.064 0.800 0.200 0.023 0.041 0.796
2 ARTIFICIAL 0 3 centrifuge 38909846 7998877 3087388 0 38909846 49996111 0.778 0.160 0.062 0.800 0.200 0.022 0.040 0.800
3 ARTIFICIAL 0 4 centrifuge 38976065 8034549 2985497 0 38976065 49996111 0.780 0.161 0.060 0.800 0.200 0.020 0.039 0.804
4 ARTIFICIAL 0 5 centrifuge 39023379 8070059 2902673 0 39023379 49996111 0.781 0.161 0.058 0.800 0.200 0.019 0.039 0.807
5 ARTIFICIAL 0 6 centrifuge 39064849 8106808 2824454 0 39064849 49996111 0.781 0.162 0.056 0.800 0.200 0.019 0.038 0.811
6 ARTIFICIAL 0 7 centrifuge 39095674 8148852 2751585 0 39095674 49996111 0.782 0.163 0.055 0.800 0.200 0.018 0.037 0.815
7 ARTIFICIAL 0 8 centrifuge 39120561 8209851 2665699 0 39120561 49996111 0.782 0.164 0.053 0.800 0.200 0.018 0.036 0.821
8 ARTIFICIAL 0 9 centrifuge 39149190 8425120 2421801 0 39149190 49996111 0.783 0.169 0.048 0.800 0.200 0.017 0.031 0.843
9 ARTIFICIAL 2 1 centrifuge 1046644 7929421 2800163 38219883 39266527 49996111 0.785 0.159 0.056 0.800 0.200 0.015 0.041 0.793
10 ARTIFICIAL 2 2 centrifuge 1052656 7964098 2759474 38219883 39272539 49996111 0.786 0.159 0.055 0.800 0.200 0.014 0.041 0.796
11 ARTIFICIAL 2 3 centrifuge 1058357 7998912 2718959 38219883 39278240 49996111 0.786 0.160 0.054 0.800 0.200 0.014 0.040 0.800
12 ARTIFICIAL 2 4 centrifuge 1063583 8034577 2678068 38219883 39283466 49996111 0.786 0.161 0.054 0.800 0.200 0.014 0.039 0.804
13 ARTIFICIAL 2 5 centrifuge 1066865 8069484 2639879 38219883 39286748 49996111 0.786 0.161 0.053 0.800 0.200 0.014 0.039 0.807
14 ARTIFICIAL 2 6 centrifuge 1069906 8105966 2600356 38219883 39289789 49996111 0.786 0.162 0.052 0.800 0.200 0.014 0.038 0.811
15 ARTIFICIAL 2 7 centrifuge 1071551 8147452 2557225 38219883 39291434 49996111 0.786 0.163 0.051 0.800 0.200 0.014 0.037 0.815
16 ARTIFICIAL 2 8 centrifuge 1073535 8203982 2498711 38219883 39293418 49996111 0.786 0.164 0.050 0.800 0.200 0.014 0.036 0.820
17 ARTIFICIAL 2 9 centrifuge 1082232 8384059 2309937 38219883 39302115 49996111 0.786 0.168 0.046 0.800 0.200 0.014 0.032 0.838
'ganon'
SAMPLE pass mode profiler mapped_human mapped_others unmapped mapped_human_1_2_maps mapped_human_total total_reads observed_human_prop observed_others_prop observed_unmapped_prop expected_human_prop expected_others_prop calculated_unmapped_human_prop calculated_unmapped_others_prop proportion_mapped_other_reads
18 ARTIFICIAL 0 1 ganon 35534622 7058731 7402758 0 35534622 49996111 0.711 0.141 0.148 0.800 0.200 0.089 0.059 0.706
19 ARTIFICIAL 0 2 ganon 35534622 7058731 7402758 0 35534622 49996111 0.711 0.141 0.148 0.800 0.200 0.089 0.059 0.706
20 ARTIFICIAL 0 3 ganon 38667598 8417056 2911457 0 38667598 49996111 0.773 0.168 0.058 0.800 0.200 0.027 0.032 0.842
21 ARTIFICIAL 0 4 ganon 38665847 8418807 2911457 0 38665847 49996111 0.773 0.168 0.058 0.800 0.200 0.027 0.032 0.842
22 ARTIFICIAL 0 5 ganon 39527606 8895428 1573077 0 39527606 49996111 0.791 0.178 0.031 0.800 0.200 0.009 0.022 0.890
23 ARTIFICIAL 0 6 ganon 39526084 8896950 1573077 0 39526084 49996111 0.791 0.178 0.031 0.800 0.200 0.009 0.022 0.890
24 ARTIFICIAL 0 7 ganon 39779348 9122179 1094584 0 39779348 49996111 0.796 0.182 0.022 0.800 0.200 0.004 0.018 0.912
25 ARTIFICIAL 0 8 ganon 39773555 9127972 1094584 0 39773555 49996111 0.796 0.183 0.022 0.800 0.200 0.004 0.017 0.913
26 ARTIFICIAL 0 9 ganon 39861349 9281351 853411 0 39861349 49996111 0.797 0.186 0.017 0.800 0.200 0.003 0.014 0.928
27 ARTIFICIAL 2 1 ganon 1080116 7658050 3038062 38219883 39299999 49996111 0.786 0.153 0.061 0.800 0.200 0.014 0.047 0.766
28 ARTIFICIAL 2 2 ganon 1080116 7658050 3038062 38219883 39299999 49996111 0.786 0.153 0.061 0.800 0.200 0.014 0.047 0.766
29 ARTIFICIAL 2 3 ganon 1176914 9041188 1558126 38219883 39396797 49996111 0.788 0.181 0.031 0.800 0.200 0.012 0.019 0.904
30 ARTIFICIAL 2 4 ganon 1176339 9041763 1558126 38219883 39396222 49996111 0.788 0.181 0.031 0.800 0.200 0.012 0.019 0.904
31 ARTIFICIAL 2 5 ganon 1214141 9519657 1042430 38219883 39434024 49996111 0.789 0.190 0.021 0.800 0.200 0.011 0.010 0.952
32 ARTIFICIAL 2 6 ganon 1213866 9519932 1042430 38219883 39433749 49996111 0.789 0.190 0.021 0.800 0.200 0.011 0.010 0.952
33 ARTIFICIAL 2 7 ganon 1228567 9741454 806207 38219883 39448450 49996111 0.789 0.195 0.016 0.800 0.200 0.011 0.005 0.974
34 ARTIFICIAL 2 8 ganon 1227189 9742832 806207 38219883 39447072 49996111 0.789 0.195 0.016 0.800 0.200 0.011 0.005 0.974
35 ARTIFICIAL 2 9 ganon 1231920 9884612 659696 38219883 39451803 49996111 0.789 0.198 0.013 0.800 0.200 0.011 0.002 0.989
'kaiju'
SAMPLE pass mode profiler mapped_human mapped_others unmapped mapped_human_1_2_maps mapped_human_total total_reads observed_human_prop observed_others_prop observed_unmapped_prop expected_human_prop expected_others_prop calculated_unmapped_human_prop calculated_unmapped_others_prop proportion_mapped_other_reads
36 ARTIFICIAL 0 1 kaiju 814678 5097740 44083693 0 814678 49996111 0.016 0.102 0.882 0.800 0.200 0.784 0.098 0.510
37 ARTIFICIAL 0 2 kaiju 1065560 5349844 43580707 0 1065560 49996111 0.021 0.107 0.872 0.800 0.200 0.779 0.093 0.535
38 ARTIFICIAL 0 3 kaiju 1065560 5349844 43580707 0 1065560 49996111 0.021 0.107 0.872 0.800 0.200 0.779 0.093 0.535
39 ARTIFICIAL 0 4 kaiju 1519215 5586056 42890840 0 1519215 49996111 0.030 0.112 0.858 0.800 0.200 0.770 0.088 0.559
40 ARTIFICIAL 0 5 kaiju 3099887 5936414 40959810 0 3099887 49996111 0.062 0.119 0.819 0.800 0.200 0.738 0.081 0.594
41 ARTIFICIAL 0 6 kaiju 3100631 5936491 40958989 0 3100631 49996111 0.062 0.119 0.819 0.800 0.200 0.738 0.081 0.594
42 ARTIFICIAL 0 7 kaiju 4812786 6324993 38858332 0 4812786 49996111 0.096 0.127 0.777 0.800 0.200 0.704 0.073 0.633
43 ARTIFICIAL 0 8 kaiju 4862779 6330261 38803071 0 4862779 49996111 0.097 0.127 0.776 0.800 0.200 0.703 0.073 0.633
44 ARTIFICIAL 0 9 kaiju 5825922 6650125 37520064 0 5825922 49996111 0.117 0.133 0.750 0.800 0.200 0.683 0.067 0.665
45 ARTIFICIAL 2 1 kaiju 56877 4941549 6777802 38219883 38276760 49996111 0.766 0.099 0.136 0.800 0.200 0.034 0.101 0.494
46 ARTIFICIAL 2 2 kaiju 68592 5116745 6590891 38219883 38288475 49996111 0.766 0.102 0.132 0.800 0.200 0.034 0.098 0.512
47 ARTIFICIAL 2 3 kaiju 68592 5116745 6590891 38219883 38288475 49996111 0.766 0.102 0.132 0.800 0.200 0.034 0.098 0.512
48 ARTIFICIAL 2 4 kaiju 82822 5262441 6430965 38219883 38302705 49996111 0.766 0.105 0.129 0.800 0.200 0.034 0.095 0.526
49 ARTIFICIAL 2 5 kaiju 118607 5439087 6218534 38219883 38338490 49996111 0.767 0.109 0.124 0.800 0.200 0.033 0.091 0.544
50 ARTIFICIAL 2 6 kaiju 118607 5439147 6218474 38219883 38338490 49996111 0.767 0.109 0.124 0.800 0.200 0.033 0.091 0.544
51 ARTIFICIAL 2 7 kaiju 169913 5603398 6002917 38219883 38389796 49996111 0.768 0.112 0.120 0.800 0.200 0.032 0.088 0.560
52 ARTIFICIAL 2 8 kaiju 170510 5604969 6000749 38219883 38390393 49996111 0.768 0.112 0.120 0.800 0.200 0.032 0.088 0.561
53 ARTIFICIAL 2 9 kaiju 194987 5718712 5862529 38219883 38414870 49996111 0.768 0.114 0.117 0.800 0.200 0.032 0.086 0.572
'kraken2'
SAMPLE pass mode profiler mapped_human mapped_others unmapped mapped_human_1_2_maps mapped_human_total total_reads observed_human_prop observed_others_prop observed_unmapped_prop expected_human_prop expected_others_prop calculated_unmapped_human_prop calculated_unmapped_others_prop proportion_mapped_other_reads
54 ARTIFICIAL 0 1 kraken2 14455036 2965393 32575682 0 14455036 49996111 0.289 0.059 0.652 0.800 0.200 0.511 0.141 0.297
55 ARTIFICIAL 0 2 kraken2 19419056 3736988 26840067 0 19419056 49996111 0.388 0.075 0.537 0.800 0.200 0.412 0.125 0.374
56 ARTIFICIAL 0 3 kraken2 23299865 4319590 22376656 0 23299865 49996111 0.466 0.086 0.448 0.800 0.200 0.334 0.114 0.432
57 ARTIFICIAL 0 4 kraken2 26673093 4785348 18537670 0 26673093 49996111 0.534 0.096 0.371 0.800 0.200 0.266 0.104 0.479
58 ARTIFICIAL 0 5 kraken2 29088107 5118672 15789332 0 29088107 49996111 0.582 0.102 0.316 0.800 0.200 0.218 0.098 0.512
59 ARTIFICIAL 0 6 kraken2 31245840 5417465 13332806 0 31245840 49996111 0.625 0.108 0.267 0.800 0.200 0.175 0.092 0.542
60 ARTIFICIAL 0 7 kraken2 32777703 5644156 11574252 0 32777703 49996111 0.656 0.113 0.232 0.800 0.200 0.144 0.087 0.564
61 ARTIFICIAL 0 8 kraken2 34127026 5848584 10020501 0 34127026 49996111 0.683 0.117 0.200 0.800 0.200 0.117 0.083 0.585
62 ARTIFICIAL 0 9 kraken2 35488630 6041475 8466006 0 35488630 49996111 0.710 0.121 0.169 0.800 0.200 0.090 0.079 0.604
63 ARTIFICIAL 2 1 kraken2 381464 2975038 8419726 38219883 38601347 49996111 0.772 0.060 0.168 0.800 0.200 0.028 0.140 0.298
64 ARTIFICIAL 2 2 kraken2 503239 3755612 7517377 38219883 38723122 49996111 0.775 0.075 0.150 0.800 0.200 0.025 0.125 0.376
65 ARTIFICIAL 2 3 kraken2 591284 4327476 6857468 38219883 38811167 49996111 0.776 0.087 0.137 0.800 0.200 0.024 0.113 0.433
66 ARTIFICIAL 2 4 kraken2 664379 4770367 6341482 38219883 38884262 49996111 0.778 0.095 0.127 0.800 0.200 0.022 0.105 0.477
67 ARTIFICIAL 2 5 kraken2 723279 5127558 5925391 38219883 38943162 49996111 0.779 0.103 0.119 0.800 0.200 0.021 0.097 0.513
68 ARTIFICIAL 2 6 kraken2 772466 5409228 5594534 38219883 38992349 49996111 0.780 0.108 0.112 0.800 0.200 0.020 0.092 0.541
69 ARTIFICIAL 2 7 kraken2 813970 5651243 5311015 38219883 39033853 49996111 0.781 0.113 0.106 0.800 0.200 0.019 0.087 0.565
70 ARTIFICIAL 2 8 kraken2 848074 5843864 5084290 38219883 39067957 49996111 0.781 0.117 0.102 0.800 0.200 0.019 0.083 0.584
71 ARTIFICIAL 2 9 kraken2 895442 6031308 4849478 38219883 39115325 49996111 0.782 0.121 0.097 0.800 0.200 0.018 0.079 0.603
'krakenuniq'
SAMPLE pass mode profiler mapped_human mapped_others unmapped mapped_human_1_2_maps mapped_human_total total_reads observed_human_prop observed_others_prop observed_unmapped_prop expected_human_prop expected_others_prop calculated_unmapped_human_prop calculated_unmapped_others_prop proportion_mapped_other_reads
72 ARTIFICIAL 0 1 krakenuniq 39192382 8073019 2730710 0 39192382 49996111 0.784 0.161 0.055 0.800 0.200 0.016 0.039 0.807
73 ARTIFICIAL 0 2 krakenuniq 39192382 8073019 2730710 0 39192382 49996111 0.784 0.161 0.055 0.800 0.200 0.016 0.039 0.807
74 ARTIFICIAL 0 3 krakenuniq 39192382 8073019 2730710 0 39192382 49996111 0.784 0.161 0.055 0.800 0.200 0.016 0.039 0.807
75 ARTIFICIAL 0 4 krakenuniq 39192382 8073019 2730710 0 39192382 49996111 0.784 0.161 0.055 0.800 0.200 0.016 0.039 0.807
76 ARTIFICIAL 0 5 krakenuniq 39192382 8073019 2730710 0 39192382 49996111 0.784 0.161 0.055 0.800 0.200 0.016 0.039 0.807
77 ARTIFICIAL 0 6 krakenuniq 39192382 8073019 2730710 0 39192382 49996111 0.784 0.161 0.055 0.800 0.200 0.016 0.039 0.807
78 ARTIFICIAL 0 7 krakenuniq 39192382 8073019 2730710 0 39192382 49996111 0.784 0.161 0.055 0.800 0.200 0.016 0.039 0.807
79 ARTIFICIAL 0 8 krakenuniq 39192382 8073019 2730710 0 39192382 49996111 0.784 0.161 0.055 0.800 0.200 0.016 0.039 0.807
80 ARTIFICIAL 0 9 krakenuniq 39192382 8073019 2730710 0 39192382 49996111 0.784 0.161 0.055 0.800 0.200 0.016 0.039 0.807
81 ARTIFICIAL 2 1 krakenuniq 1094558 8062716 2618954 38219883 39314441 49996111 0.786 0.161 0.052 0.800 0.200 0.014 0.039 0.806
82 ARTIFICIAL 2 2 krakenuniq 1094558 8062716 2618954 38219883 39314441 49996111 0.786 0.161 0.052 0.800 0.200 0.014 0.039 0.806
83 ARTIFICIAL 2 3 krakenuniq 1094558 8062716 2618954 38219883 39314441 49996111 0.786 0.161 0.052 0.800 0.200 0.014 0.039 0.806
84 ARTIFICIAL 2 4 krakenuniq 1094558 8062716 2618954 38219883 39314441 49996111 0.786 0.161 0.052 0.800 0.200 0.014 0.039 0.806
85 ARTIFICIAL 2 5 krakenuniq 1094558 8062716 2618954 38219883 39314441 49996111 0.786 0.161 0.052 0.800 0.200 0.014 0.039 0.806
86 ARTIFICIAL 2 6 krakenuniq 1094558 8062716 2618954 38219883 39314441 49996111 0.786 0.161 0.052 0.800 0.200 0.014 0.039 0.806
87 ARTIFICIAL 2 7 krakenuniq 1094558 8062716 2618954 38219883 39314441 49996111 0.786 0.161 0.052 0.800 0.200 0.014 0.039 0.806
88 ARTIFICIAL 2 8 krakenuniq 1094558 8062716 2618954 38219883 39314441 49996111 0.786 0.161 0.052 0.800 0.200 0.014 0.039 0.806
89 ARTIFICIAL 2 9 krakenuniq 1094558 8062716 2618954 38219883 39314441 49996111 0.786 0.161 0.052 0.800 0.200 0.014 0.039 0.806
In [75]:
# Categorical palette for per-profiler lines. Contains 6 colors, one more
# than the 5 profilers plotted below (hence the seaborn palette warning).
custom_palette = ["#648FFF", "#785EF0", "#DC267F", "#FE6100", "#FFB000", "#848484"]
In [76]:
def _plot_pass_diff(ax, value_col, title, show_legend):
    """Plot the pass2 - pass0 difference (in %) of `value_col` per profiler and mode.

    Pivots df_host_profile_info_artificial on 'pass' (columns become the raw
    pass values 0 and 2) and draws one line per profiler on `ax`.
    """
    pivoted = (
        df_host_profile_info_artificial.pivot_table(
            index=["profiler", "mode"], columns="pass", values=value_col
        )
        .reset_index()
    )
    pivoted.columns.name = None  # drop the 'pass' columns name from pivot_table

    # Difference in percentage points between profiling with (pass 2) and
    # without (pass 0) host mapping.
    pivoted["difference"] = 100 * (pivoted[2] - pivoted[0])

    # Slice the palette to the number of profilers actually plotted; passing
    # the full 6-color list for 5 hues raised a seaborn UserWarning.
    n_hues = pivoted["profiler"].nunique()
    sns.lineplot(
        data=pivoted,
        x="mode",
        y="difference",
        hue="profiler",
        marker="o",
        palette=custom_palette[:n_hues],
        ax=ax,
        legend=show_legend,
    )
    ax.axhline(0, color="#848484", linestyle="--", linewidth=0.8)
    ax.set_title(title, fontsize=12)
    ax.set_xlabel("Mode", fontsize=12)


fig, axs = plt.subplots(1, 2, figsize=(8, 3))

# 1A) Differences in human read assignment between pass 2 and pass 0.
_plot_pass_diff(axs[0], "observed_human_prop", "Human assignment", show_legend=False)
axs[0].set_ylabel("Diff pass2 - pass0 (%)", fontsize=12)

# 1B) Differences in non-human read assignment.
_plot_pass_diff(axs[1], "observed_others_prop", "Non-human assignment", show_legend=True)
axs[1].set_ylabel("", fontsize=12)

sns.despine(top=True, right=True)

for ax in axs:
    ax.set_xticks(range(1, 10))

axs[1].legend(title="Profiler", frameon=False, bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
plt.tight_layout()
# `fmt` instead of `format` to avoid shadowing the builtin.
for fmt in ['png', 'tiff']:
    plt.savefig(f'{RESULTS_DIR}/figures/paper/figA.{fmt}', dpi=DPI)
/tmp/ipykernel_2918775/492332427.py:24: UserWarning: The palette list has more values (6) than needed (5), which may not be intended.
  g = sns.lineplot(
/tmp/ipykernel_2918775/492332427.py:57: UserWarning: The palette list has more values (6) than needed (5), which may not be intended.
  h = sns.lineplot(
No description has been provided for this image

What do we see here?

  • The number of reads assigned to humans without host mapping is very variable depending on the profiler. Centrifuge, krakenuniq and ganon map human reads correctly, whereas kaiju and kraken2 fail to map the reads to human. The differences tend to decrease with the sensitivity mode, that is, paradoxically, a more strict read assignment leads to an improved number of human-mapped reads. However, this makes sense because more reads are assigned in general, and thus both human and non-human reads are mapped.
  • However, this difference does not occur in non-human species. In general, non-human species are assigned equally with or without host mapping. This is interesting because we would expect a higher amount of reads assigned to non-human species originating from a false positive assignment of human reads, but this seems not to be the case, even in profilers that have a high amount of unmapped human reads.
    • Still, we have to take into account that the profiler databases include a host mapping step.
In [77]:
# Ground-truth microbial composition (human, taxid 9606, excluded) with
# relative abundances expressed in percent.
table_artificial_taxcounts = pd.read_csv(
    '../../src/version_2/table_artificial_taxid.csv',
    sep=';',
    names=['species', 'taxid', 'count'],
)
non_human = table_artificial_taxcounts['taxid'] != 9606
table_artificial_taxcounts = table_artificial_taxcounts.loc[non_human]
total_count = table_artificial_taxcounts['count'].sum()
table_artificial_taxcounts['abundance'] = 100 * table_artificial_taxcounts['count'] / total_count
table_artificial_taxcounts
Out[77]:
species taxid count abundance
1 Cutibacterium acnes 1747 312500 3.125
2 Lactobacillus acidophilus 1579 312500 3.125
3 Bifidobacterium bifidum 1681 312500 3.125
4 Akkermansia muciniphila 239935 312500 3.125
5 Blautia coccoides 1532 312500 3.125
6 Blautia luti 89014 312500 3.125
7 Bacteroides ovatus 28116 312500 3.125
8 Bacteroides intestinalis 329854 312500 3.125
9 Bacteroides fragilis 817 312500 3.125
10 Escherichia coli 511145 312500 3.125
11 Dietzia lutea 546160 250000 2.500
12 Ruthenibacterium lactatiformans 1550024 187500 1.875
13 Faecalibacterium prausnitzii 853 187500 1.875
14 Parabacteroides distasonis 823 187500 1.875
15 Parabacteroides merdae 46503 187500 1.875
16 Fusicatenibacter saccharivorans 1150298 187500 1.875
17 Erysipelatoclostridium ramosum 1547 125000 1.250
18 Streptococcus salivarius 1304 125000 1.250
19 Hungatella hathewayi 154046 62500 0.625
20 Eisenbergiella porci 2652274 62500 0.625
21 Butyricimonas faecalis 2093856 62500 0.625
22 Alistipes indistinctus 626932 62500 0.625
23 Alistipes finegoldii 214856 62500 0.625
24 Eubacterium callanderi 53442 62500 0.625
25 Acidaminococcus intestini 187327 62500 0.625
26 Aspergillus chevalieri 182096 200000 2.000
27 Aspergillus flavus 5059 200000 2.000
28 Saccharomyces cerevisiae 4932 200000 2.000
29 Saccharomyces kudriavzevii 114524 200000 2.000
30 Saccharomyces mikatae 114525 200000 2.000
31 Candida albicans 5476 200000 2.000
32 Candida dubliniensis 42374 200000 2.000
33 Candida orthopsilosis 273371 200000 2.000
34 Malassezia restricta 76775 150000 1.500
35 Alternaria dauci 48095 150000 1.500
36 Kazachstania africana 432096 100000 1.000
37 Penicillium digitatum 36651 100000 1.000
38 Pichia kudriavzevii 4909 50000 0.500
39 Trichoderma asperellum 101201 50000 0.500
40 Akanthomyces muscarius 2231603 50000 0.500
41 Fusarium falciforme 195108 50000 0.500
42 Eremothecium sinecaudum 45286 50000 0.500
43 Cryptococcus decagattii 1859122 50000 0.500
44 Kwoniella shandongensis 1734106 50000 0.500
45 Puccinia triticina 208348 50000 0.500
46 Tobacco mosaic virus 12242 250000 2.500
47 Rotavirus A 28875 250000 2.500
48 Rotavirus B 28876 250000 2.500
49 Rotavirus C 36427 250000 2.500
50 Bacteriophage P2 10679 200000 2.000
51 Escherichia phage T4 10665 200000 2.000
52 Human immunodeficiency virus 1 11676 200000 2.000
53 Human adenovirus 7 108098 150000 1.500
54 Hepatitis C virus 3052230 150000 1.500
55 Bovine alphaherpesvirus 2 3050244 150000 1.500
56 Human herpesvirus 4 type 2 3050299 150000 1.500
57 Mimivirus terra2 1128151 100000 1.000
58 Dengue virus 3052464 100000 1.000
59 Norovirus GI 11983 50000 0.500
60 Zaire ebolavirus 3052462 50000 0.500

Computing detection stats to answer the questions¶

One of the parameters used during profiling is the mode of the profilers. Each profiler has a different set of parameters to include reads as valid or not. This may result in the detection of false positives and negatives.

Here, we are going to study this effect in in silico samples to see if there are major changes. We can measure the effectiveness of several variables:

  • Categorical values: we can use each of the columns in the flag system to check how well species were assigned. We can use the precision (TP/(TP+FP)), recall (TP/(TP+FN)), F1-score (2·precision·recall/(precision+recall)) and Cohen's kappa. $$\kappa = \frac{p_0-p_e}{1-p_e} \quad p_0 = \frac{TP + TN}{TP + FP + FN + TN} \quad p_e=\frac{TP + FP}{TP + FP + FN + TN}\cdot\frac{TP + FN }{TP + FP + FN + TN} + \frac{TN + FP}{TP + FP + FN + TN}\cdot\frac{TN + FN}{TP + FP + FN + TN}$$

  • Numerical values: we can use the normalized value and the abundance to see how well are reads classified. For that we can use the expected number of reads and abundance. With that we will calculate the (1) difference between observed and expected categories and (2) the mean error and the (3) mean absolute error: $$(1) \qquad DIFF_i = 100\cdot\frac{x_{obs,i} - x_{exp,i}}{x_{exp,i}}$$ $$(2) \qquad ME = E[DIFF_i] = \frac{100}{N}\sum\frac{x_{obs,i} - x_{exp,i}}{x_{exp,i}}$$ $$(3A) \qquad MAE = E[|DIFF_i|] = \frac{100}{N}\sum\frac{|x_{obs,i} - x_{exp,i}|}{x_{exp,i}}$$ $$(3B) \qquad MAED = \sigma[|DIFF_i|]$$

  • For numerical values we are also going to calculate the pearson correlation between the observed and expected values, using a log10(1+x) transform

Categorical values¶

In [78]:
def calculate_nominal_metrics(df_tax_ground_truth, df_flags_observed, column):
    """Compute confusion-matrix metrics for one boolean flag column.

    A taxid is considered "detected" when its flag in `column` is False
    (flag == True means the taxon was filtered out).

    Parameters
    ----------
    df_tax_ground_truth : pd.DataFrame
        Expected taxa; must contain a 'taxid' column.
    df_flags_observed : pd.DataFrame
        Observed taxa; must contain 'taxonomy_id' and the boolean `column`.
    column : str
        Name of the flag column to evaluate.

    Returns
    -------
    tuple
        (precision, recall, f1, kappa, TP, FN, FP, TN). Ratios with an empty
        denominator return 0.0 instead of raising.
    """
    # Sets give O(1) membership tests (the previous list comprehensions were
    # O(n*m)) and match the uniqueness assumption of the sanity check below.
    expected = set(df_tax_ground_truth['taxid'].astype(int).values)
    observed_true = set(df_flags_observed.loc[df_flags_observed[column] == False, 'taxonomy_id'].astype(int).values)
    observed_false = set(df_flags_observed.loc[df_flags_observed[column] == True, 'taxonomy_id'].astype(int).values)

    TP = len(expected & observed_true)       # expected and detected
    FN = len(expected - observed_true)       # expected but not detected
    FP = len(observed_true - expected)       # detected but not expected
    TN = len(observed_false - expected)      # flagged out and not expected

    # Sanity check: every taxid is counted exactly once across the four cells.
    assert len(expected | observed_true | observed_false) == TP + FN + FP + TN

    # Guard empty denominators explicitly (the previous bare `except:` only
    # covered f1, and precision/recall could raise ZeroDivisionError).
    precision = TP / (TP + FP) if (TP + FP) else 0.0
    recall = TP / (TP + FN) if (TP + FN) else 0.0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0.0

    # Cohen's kappa: observed agreement corrected for chance agreement.
    ALL = TP + FN + FP + TN
    p0 = (TP + TN) / (ALL)
    pe = (TP + FP)/ALL * (TP + FN)/ALL + (TN + FP)/ALL * (TN + FN)/ALL
    kappa = (p0 - pe) / (1 - pe) if pe != 1 else 0.0

    return precision, recall, f1, kappa, TP, FN, FP, TN
In [79]:
# Flag columns to evaluate: per-profiler normalized counts / relative
# abundances plus the consensus mean/CV columns.
columns_selected = ['centrifuge_norm', 'ganon_norm', 'kaiju_norm', 'kmcp_norm', 'kraken2_norm', 'krakenuniq_norm',
                    'centrifuge_relab', 'ganon_relab', 'kaiju_relab', 'kmcp_relab', 'kraken2_relab', 'krakenuniq_relab',
                    'mean_norm', 'CV_norm', 'mean_relab', 'CV_relab']

# Accumulator (dict of lists) converted to a DataFrame at the end.
df_nominal_stats = {'pass': [], 'mode': [], 'S': [], 'column': [], 'precision': [], 'recall': [], 'f1': [], 
                    'kappa': [], 'TP|FN|FP|TN': []}

for passn in [0, 2]:
    for mode in range(1, 10):
        for S in [0, 1, 2, 3, 4, 5, 6, 7, 10, 15]:
            # Read the flags table once per (pass, mode, S): it does not
            # depend on `column`, so reading it inside the column loop
            # re-parsed the same file 16 times per combination.
            summary_table_flags = pd.read_csv(f'{RESULTS_DIR}/summary/ARTIFICIAL_pass{passn}_mode{mode}_taxspecies_S{S}.flags.tsv', sep='\t')
            for column in columns_selected:
                try:
                    precision, recall, f1, kappa, TP, FN, FP, TN = calculate_nominal_metrics(table_artificial_taxcounts, summary_table_flags, column)
                except KeyError:
                    # Column absent from this flags table (e.g. kmcp): skip.
                    continue

                df_nominal_stats['pass'].append(passn)
                df_nominal_stats['mode'].append(mode)
                df_nominal_stats['S'].append(S)
                df_nominal_stats['column'].append(column)

                df_nominal_stats['precision'].append(precision)
                df_nominal_stats['recall'].append(recall)
                df_nominal_stats['f1'].append(f1)
                df_nominal_stats['kappa'].append(kappa)
                df_nominal_stats['TP|FN|FP|TN'].append((TP, FN, FP, TN))

df_nominal_stats = pd.DataFrame(df_nominal_stats)
In [80]:
df_nominal_stats
Out[80]:
pass mode S column precision recall f1 kappa TP|FN|FP|TN
0 0 1 0 centrifuge_norm 1.000 0.050 0.095 0.094 (3, 57, 0, 3637)
1 0 1 0 ganon_norm 0.833 0.083 0.152 0.149 (5, 55, 1, 3636)
2 0 1 0 kaiju_norm 1.000 0.100 0.182 0.179 (6, 54, 0, 3637)
3 0 1 0 kraken2_norm 1.000 0.217 0.356 0.352 (13, 47, 0, 3637)
4 0 1 0 krakenuniq_norm 1.000 0.100 0.182 0.179 (6, 54, 0, 3637)
... ... ... ... ... ... ... ... ... ...
2515 2 9 15 krakenuniq_relab 0.400 0.967 0.566 0.563 (58, 2, 87, 12282)
2516 2 9 15 mean_norm 0.310 0.967 0.470 0.466 (58, 2, 129, 12240)
2517 2 9 15 CV_norm 0.005 0.900 0.009 -0.000 (54, 6, 11726, 643)
2518 2 9 15 mean_relab 0.258 0.967 0.407 0.402 (58, 2, 167, 12202)
2519 2 9 15 CV_relab 0.005 0.933 0.009 -0.000 (56, 4, 11731, 638)

2520 rows × 9 columns

Numerical values¶

In [81]:
def compute_mad(values):
    """Median absolute deviation of `values` (no consistency scaling applied)."""
    deviations = np.abs(values - np.median(values))
    return np.median(deviations)

def calculate_numerical_metrics(df_tax_ground_truth, df_counts_observed, profiler, suffix):
    """
    Compare observed profiler values against the ground truth for the taxa
    present in both tables.

    Parameters
    ----------
    df_tax_ground_truth : pd.DataFrame
        Ground-truth table with columns 'taxid', 'species' and the expected
        values ('count' and 'abundance').
    df_counts_observed : pd.DataFrame
        Observed table with a 'taxonomy_id' column and one
        '{profiler}_{suffix}' value column per profiler.
    profiler : str
        Profiler name used to build the observed column name.
    suffix : str
        'norm' compares against the expected 'count' column; any other value
        compares against 'abundance'.

    Returns
    -------
    tuple
        (per-taxon % difference, MRE, MRED, MAE, MAED, MACV, regression slope,
        R^2, shared taxids, species names, expected values, observed values).
        Slope and R^2 are NaN when 35 or fewer taxa are shared.
    """
    truth = df_tax_ground_truth.copy().set_index('taxid')
    observed = df_counts_observed.copy().set_index('taxonomy_id')

    # Restrict the comparison to taxa reported in both tables.
    shared_taxids = np.intersect1d(truth.index.astype(int).values,
                                   observed.index.astype(int).values)

    species = truth.loc[shared_taxids, 'species'].values
    observed_counts = observed.loc[shared_taxids, f'{profiler}_{suffix}'].fillna(0).values

    # 'norm' means raw read counts; anything else means relative abundance.
    if suffix == 'norm':
        expected_counts = truth.loc[shared_taxids, 'count'].values
    else:
        expected_counts = truth.loc[shared_taxids, 'abundance'].values

    # Signed relative error per taxon, in percent.
    diff_counts = 100 * (observed_counts - expected_counts) / expected_counts
    abs_diff = np.abs(diff_counts)

    MRE_counts = np.mean(diff_counts)
    MRED_counts = np.std(diff_counts)
    MAE_counts = np.mean(abs_diff)
    MAED_counts = np.std(abs_diff)
    MACV_counts = MAED_counts / MAE_counts

    # Only fit a regression when enough taxa are shared for it to be meaningful.
    if len(shared_taxids) > 35:
        slope, _, r_value, _, _ = linregress(expected_counts, observed_counts)
        r2 = r_value ** 2
    else:
        slope, r2 = np.nan, np.nan

    return (diff_counts, MRE_counts, MRED_counts, MAE_counts, MAED_counts,
            MACV_counts, slope, r2, shared_taxids, species,
            expected_counts, observed_counts)
In [82]:
# NOTE(review): this cell previously relied on `S` leaking from the loop in an
# earlier cell (where its last value is 15). Set it explicitly so the cell is
# self-contained and survives Restart Kernel -> Run All.
S = 15

# Single (pass, mode, profiler) combination inspected in detail below.
for passn in [2]:
    for mode in range(5, 6):
        for profiler in ['mean']:
            summary_table_flags = pd.read_csv(f'{RESULTS_DIR}/summary/ARTIFICIAL_pass{passn}_mode{mode}_taxspecies_S{S}.flags.tsv', sep='\t')
            summary_table_counts = pd.read_csv(f'{RESULTS_DIR}/summary/ARTIFICIAL_pass{passn}_mode{mode}_taxspecies_S{S}.diversity.tsv', sep='\t')
            # (The original wrapped these calls in `try/except KeyError: raise`,
            # which is equivalent to no handler at all, so it was removed.)
            diff_counts, MRE_counts, MRED_counts, MAE_counts, MAED_counts, MACV_counts, corr_counts, rmse_counts, taxids_counts, species_counts, expected_counts, observed_counts = \
                calculate_numerical_metrics(table_artificial_taxcounts, summary_table_counts, profiler, suffix='norm')
            diff_abundance, MRE_abundance, MRED_abundance, MAE_abundance, MAED_abundance, MACV_abundance, corr_abundance, rmse_abundance, taxids_abundance, species_abundance, expected_abundance, observed_abundance = \
                calculate_numerical_metrics(table_artificial_taxcounts, summary_table_counts, profiler, suffix='relab')

# Per-species comparison table for the combination selected above.
# NOTE(review): the column is labelled 'diff_abundance' but holds `diff_counts`
# (the count-based % differences) — confirm which was intended.
df = pd.DataFrame({'species': species_counts, 'expected_counts': expected_counts, 'observed_counts': observed_counts, 'diff_abundance': diff_counts})
df
Out[82]:
species expected_counts observed_counts diff_abundance
0 Bacteroides fragilis 312500 240040.116 -23.187
1 Parabacteroides distasonis 187500 89289.167 -52.379
2 Faecalibacterium prausnitzii 187500 54476.741 -70.946
3 Streptococcus salivarius 125000 58117.095 -53.506
4 Blautia coccoides 312500 173325.771 -44.536
5 Erysipelatoclostridium ramosum 125000 67123.702 -46.301
6 Lactobacillus acidophilus 312500 239520.544 -23.353
7 Bifidobacterium bifidum 312500 184723.065 -40.889
8 Cutibacterium acnes 312500 254046.803 -18.705
9 Pichia kudriavzevii 50000 48697.541 -2.605
10 Saccharomyces cerevisiae 200000 195902.958 -2.049
11 Aspergillus flavus 200000 148008.703 -25.996
12 Candida albicans 200000 184824.390 -7.588
13 Escherichia phage T4 200000 164474.441 -17.763
14 Bacteriophage P2 200000 45220.596 -77.390
15 Human immunodeficiency virus 1 200000 201026.902 0.513
16 Norovirus GI 50000 50524.777 1.050
17 Tobacco mosaic virus 250000 243272.405 -2.691
18 Bacteroides ovatus 312500 111631.939 -64.278
19 Rotavirus A 250000 251608.261 0.643
20 Rotavirus B 250000 253634.055 1.454
21 Rotavirus C 250000 254565.176 1.826
22 Penicillium digitatum 100000 95742.552 -4.257
23 Candida dubliniensis 200000 184501.730 -7.749
24 Eremothecium sinecaudum 50000 49425.836 -1.148
25 Parabacteroides merdae 187500 125862.826 -32.873
26 Alternaria dauci 150000 141107.893 -5.928
27 Eubacterium callanderi 62500 30952.294 -50.476
28 Malassezia restricta 150000 149869.196 -0.087
29 Blautia luti 312500 53234.639 -82.965
30 Trichoderma asperellum 50000 46584.994 -6.830
31 Human adenovirus 7 150000 147511.825 -1.659
32 Saccharomyces kudriavzevii 200000 190980.300 -4.510
33 Saccharomyces mikatae 200000 189682.529 -5.159
34 Hungatella hathewayi 62500 40802.328 -34.716
35 Aspergillus chevalieri 200000 187004.613 -6.498
36 Acidaminococcus intestini 62500 57048.401 -8.723
37 Fusarium falciforme 50000 46336.938 -7.326
38 Puccinia triticina 50000 49269.888 -1.460
39 Alistipes finegoldii 62500 51804.368 -17.113
40 Akkermansia muciniphila 312500 196080.997 -37.254
41 Candida orthopsilosis 200000 189190.931 -5.405
42 Bacteroides intestinalis 312500 163291.164 -47.747
43 Kazachstania africana 100000 97363.929 -2.636
44 Dietzia lutea 250000 232472.093 -7.011
45 Alistipes indistinctus 62500 55391.499 -11.374
46 Mimivirus terra2 100000 84794.462 -15.206
47 Ruthenibacterium lactatiformans 187500 135305.249 -27.837
48 Kwoniella shandongensis 50000 48329.668 -3.341
49 Cryptococcus decagattii 50000 47962.656 -4.075
50 Butyricimonas faecalis 62500 49405.437 -20.951
51 Akanthomyces muscarius 50000 47989.514 -4.021
52 Eisenbergiella porci 62500 57135.945 -8.582
53 Bovine alphaherpesvirus 2 150000 148849.267 -0.767
54 Human herpesvirus 4 type 2 150000 139697.441 -6.868
55 Hepatitis C virus 150000 151377.085 0.918
56 Zaire ebolavirus 50000 49744.206 -0.512
57 Dengue virus 100000 101890.120 1.890
In [83]:
# NOTE(review): `S` previously leaked from an earlier cell (its last loop value,
# 15). Set it explicitly so this cell survives Restart Kernel -> Run All.
S = 15

# Accumulator: one entry per (pass, mode, profiler) combination, holding the
# count-based ('counts') and abundance-based ('abundance') metrics side by side.
df_numerical_stats = {'pass': [], 'mode': [], 'profiler': [], 
                    'diff_counts': [], 'MRE_counts': [],'MRED_counts': [], 'MAE_counts': [], 'MAED_counts': [], 'MACV_counts': [], 
                    'corr_counts': [], 'R2_counts': [], 'taxid_counts': [], 
                    'species_counts': [], 'expected_counts': [], 'observed_counts': [], 
                    'diff_abundance': [], 'MRE_abundance': [],'MRED_abundance': [], 'MAE_abundance': [], 'MAED_abundance': [], 'MACV_abundance': [], 
                    'corr_abundance': [], 'R2_abundance': [], 'taxid_abundance': [], 
                    'species_abundance': [], 'expected_abundance': [], 'observed_abundance': [],  }

for passn in [0, 2]:
    for mode in range(1, 10):
        for profiler in LIST_PROFILERS + ['mean']:
            summary_table_flags = pd.read_csv(f'{RESULTS_DIR}/summary/ARTIFICIAL_pass{passn}_mode{mode}_taxspecies_S{S}.flags.tsv', sep='\t')
            summary_table_counts = pd.read_csv(f'{RESULTS_DIR}/summary/ARTIFICIAL_pass{passn}_mode{mode}_taxspecies_S{S}.diversity.tsv', sep='\t')
            try:
                # Locals renamed from `rmse_*` to `r2_*`: the function returns the
                # regression R^2 (stored below under 'R2_*'), not an RMSE.
                diff_counts, MRE_counts, MRED_counts, MAE_counts, MAED_counts, MACV_counts, corr_counts, r2_counts, taxids_counts, species_counts, expected_counts, observed_counts = \
                    calculate_numerical_metrics(table_artificial_taxcounts, summary_table_counts, profiler, suffix='norm')
                diff_abundance, MRE_abundance, MRED_abundance, MAE_abundance, MAED_abundance, MACV_abundance, corr_abundance, r2_abundance, taxids_abundance, species_abundance, expected_abundance, observed_abundance = \
                    calculate_numerical_metrics(table_artificial_taxcounts, summary_table_counts, profiler, suffix='relab')
            except KeyError:
                # FIX: the original had an unreachable `continue` after this
                # `raise`. Fail loudly — a missing profiler column means the
                # summary table is malformed, not an expected gap.
                raise

            df_numerical_stats['pass'].append(passn)
            df_numerical_stats['mode'].append(mode)
            df_numerical_stats['profiler'].append(profiler)

            df_numerical_stats['diff_counts'].append(diff_counts)
            df_numerical_stats['MRE_counts'].append(MRE_counts)
            df_numerical_stats['MRED_counts'].append(MRED_counts)
            df_numerical_stats['MAE_counts'].append(MAE_counts)
            df_numerical_stats['MAED_counts'].append(MAED_counts)
            df_numerical_stats['MACV_counts'].append(MACV_counts)
            df_numerical_stats['corr_counts'].append(corr_counts)
            df_numerical_stats['R2_counts'].append(r2_counts)
            df_numerical_stats['taxid_counts'].append(taxids_counts)
            df_numerical_stats['species_counts'].append(species_counts)
            df_numerical_stats['expected_counts'].append(expected_counts)
            df_numerical_stats['observed_counts'].append(observed_counts)

            df_numerical_stats['diff_abundance'].append(diff_abundance)
            df_numerical_stats['MRE_abundance'].append(MRE_abundance)
            df_numerical_stats['MRED_abundance'].append(MRED_abundance)
            df_numerical_stats['MAE_abundance'].append(MAE_abundance)
            df_numerical_stats['MAED_abundance'].append(MAED_abundance)
            df_numerical_stats['MACV_abundance'].append(MACV_abundance)
            df_numerical_stats['corr_abundance'].append(corr_abundance)
            df_numerical_stats['R2_abundance'].append(r2_abundance)
            df_numerical_stats['taxid_abundance'].append(taxids_abundance)
            df_numerical_stats['species_abundance'].append(species_abundance)
            df_numerical_stats['expected_abundance'].append(expected_abundance)
            df_numerical_stats['observed_abundance'].append(observed_abundance)

df_numerical_stats = pd.DataFrame(df_numerical_stats)
In [84]:
df_numerical_stats
Out[84]:
pass mode profiler diff_counts MRE_counts MRED_counts MAE_counts MAED_counts MACV_counts corr_counts R2_counts taxid_counts species_counts expected_counts observed_counts diff_abundance MRE_abundance MRED_abundance MAE_abundance MAED_abundance MACV_abundance corr_abundance R2_abundance taxid_abundance species_abundance expected_abundance observed_abundance
0 0 1 centrifuge [-21.76736, -50.1056, -74.39946666666667, -58.... -16.939 23.674 17.173 23.505 1.369 0.635 0.576 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [244477.0, 93552.0, 48001.0, 51790.0, 189991.0... [-1.33929624238111, -37.07720182312471, -67.71... 4.750 29.856 26.450 14.640 0.554 0.801 0.576 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.0831469924255903, 1.1798024658164117, 0.605...
1 0 1 ganon [-23.91328, -55.785066666666665, -83.386666666... -28.729 21.022 28.729 21.022 0.732 0.558 0.590 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [237771.0, 82903.0, 31150.0, 42979.0, 159924.0... [7.790932959479549, -37.36135668956172, -76.46... 0.969 29.782 25.411 15.563 0.612 0.790 0.590 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.368466654983736, 1.1744745620707178, 0.4412...
2 0 1 kaiju [-53.25664, -79.21386666666666, -91.4437333333... -49.543 28.481 49.543 28.481 0.575 0.389 0.313 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [146073.0, 38974.0, 16043.0, 14537.0, 54507.0,... [-8.305719789553791, -59.224806809815064, -83.... -1.021 55.870 47.167 29.963 0.635 0.764 0.313 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [2.865446256576444, 0.7645348723159675, 0.3147...
3 0 1 kraken2 [-76.07584, -94.63306666666666, -99.4, -99.609... -65.147 24.902 65.147 24.902 0.382 0.176 0.122 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [74763.0, 10063.0, 1125.0, 488.0, 11530.0, 288... [-19.322126949109276, -81.90144330504141, -97.... 17.532 83.974 75.284 41.126 0.546 0.594 0.122 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [2.521183532840335, 0.3393479380304735, 0.0379...
4 0 1 krakenuniq [-19.62944, -48.239466666666665, -68.408533333... -16.729 23.969 17.018 23.765 1.396 0.642 0.572 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [251158.0, 97051.0, 59234.0, 56908.0, 183826.0... [-0.4454715144359227, -35.88453918746712, -60.... 3.147 29.691 25.571 15.414 0.603 0.795 0.572 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.1110790151738774, 1.2021648902349915, 0.733...
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
103 2 9 ganon [3.88512, -35.4976, -56.00053333333334, -32.8,... -1.882 23.489 18.223 14.940 0.820 0.751 0.659 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [324641.0, 120942.0, 82499.0, 84000.0, 215092.... [5.0978227572311425, -34.74463135224731, -55.4... -0.736 23.763 18.879 14.451 0.765 0.760 0.659 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.284306961163473, 1.223538162145363, 0.83462...
104 2 9 kaiju [-48.45856, -76.704, -89.0208, -85.9616, -80.8... -43.474 29.718 43.474 29.718 0.684 0.422 0.324 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [161067.0, 43680.0, 20586.0, 17548.0, 59848.0,... [-9.872292921902698, -59.26355445072247, -80.8... -1.156 51.967 43.407 28.597 0.659 0.738 0.324 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [2.8164908461905407, 0.7638083540489538, 0.359...
105 2 9 kraken2 [-39.60768, -84.74186666666667, -95.9482666666... -32.087 35.514 32.087 35.514 1.107 0.419 0.235 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [188726.0, 28609.0, 7597.0, 4884.0, 33042.0, 6... [0.13138111998259205, -74.70178386954647, -93.... 12.601 58.882 53.987 26.671 0.494 0.695 0.235 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.129105659999456, 0.4743415524460034, 0.1259...
106 2 9 krakenuniq [-19.67168, -48.3264, -68.4816, -54.56, -41.30... -16.778 23.985 17.064 23.782 1.394 0.641 0.572 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [251026.0, 96888.0, 59097.0, 56800.0, 183437.0... [-0.37064433374560224, -35.91043018258363, -60... 3.219 29.748 25.644 15.417 0.601 0.796 0.572 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.11341736457045, 1.2016794340765569, 0.73296...
107 2 9 mean [-20.390584802138655, -49.665829444940506, -66... -16.259 21.692 16.933 21.171 1.250 0.638 0.622 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [248779.4224933167, 94376.56979073654, 62310.1... [-1.6718373579364112, -43.06956794651194, -63.... 8.287 32.919 29.573 16.667 0.564 0.799 0.536 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.072755082564487, 1.0674456010029012, 0.6795...

108 rows × 27 columns

Analysis of kappa/F1¶

F1-score and $\kappa$ are quite different measures but we observe that they are correlated in this data.

In [85]:
# Pearson correlation between the two classification metrics on the real data.
f1_kappa_corr = df_nominal_stats['f1'].corr(df_nominal_stats['kappa'])

print("Pearson correlation between F1 and Kappa:", f1_kappa_corr)

# Scatter of F1 vs kappa with the identity line as a visual reference.
fig, ax = plt.subplots(figsize=(4, 4))

# Identity line (y = x), drawn first so points render on top.
ax.plot([0, 1], [0, 1], color='red', linestyle='--', alpha=0.5)

sns.scatterplot(x='f1', y='kappa', data=df_nominal_stats, s=20, ax=ax)

ax.text(0.01, 0.95, f'R$^2$: {f1_kappa_corr**2:.5f}')

sns.despine(ax=ax, top=True, right=True)

# Title and axis labels so the figure stands alone.
ax.set_title('F1-score vs Kappa')
ax.set_xlabel('F1-score')
ax.set_ylabel('Kappa')
fig.tight_layout()

for fig_format in ('png', 'tiff'):
    fig.savefig(f'{RESULTS_DIR}/figures/paper/fig0.kappa-f1.{fig_format}', dpi=DPI, bbox_inches='tight')

plt.show()
Pearson correlation between F1 and Kappa: 0.9997826111567186
No description has been provided for this image

In that sense, we can then use one of the measures to explain the results and don't need the second one. We are going to select the F1 score because it has a clearer interpretability and it is related to precision and recall, which are already being used.

In [86]:
import random

def cohen_kappa(tp, fp, fn, tn):
    """
    Compute Cohen's kappa from binary confusion-matrix counts.

    Parameters
    ----------
    tp, fp, fn, tn : int
        True positives, false positives, false negatives, true negatives.

    Returns
    -------
    float
        Cohen's kappa, or NaN when the expected agreement is (numerically) 1,
        which would make the denominator 1 - pe zero.
    """
    N = tp + fp + fn + tn  # total observations

    # Observed agreement: fraction of cases where prediction matches truth.
    po = (tp + tn) / N

    # Chance (expected) agreement from the marginal distributions.
    # FIX: the original also computed an unused `p_n = (fp + tn) / N` with a
    # confusing half-finished comment; it has been removed.
    p_pos_act = (tp + fn) / N   # actual positives
    p_pos_pred = (tp + fp) / N  # predicted positives
    p_neg_act = (fp + tn) / N   # actual negatives
    p_neg_pred = (fn + tn) / N  # predicted negatives

    pe = p_pos_act * p_pos_pred + p_neg_act * p_neg_pred

    if np.isclose(pe, 1.0):
        # Degenerate marginals: kappa is undefined (division by ~0).
        return np.nan

    kappa = (po - pe) / (1 - pe)
    return kappa

def f1_score_simple(tp, fp, fn):
    """
    F1 score from confusion-matrix counts: 2*TP / (2*TP + FP + FN).

    Returns NaN when the denominator is zero (no positives predicted or real).
    """
    denominator = 2 * tp + fp + fn
    return np.nan if denominator == 0 else 2 * tp / denominator

# --- Simulation parameters ---
N = 3000         # total samples per confusion matrix
n_sims = 100000  # number of random simulations

# FIX: the Monte Carlo run previously had no seed, so the printed correlation
# and plot were not reproducible across kernel restarts.
random.seed(42)

f1_values = []
kappa_values = []
table_values = []

for _ in range(n_sims):
    # Draw TP, FP, FN >= 0 sequentially and let TN absorb the remainder,
    # guaranteeing TP + FP + FN + TN == N.
    tp = random.randint(0, N)
    fp = random.randint(0, N - tp)
    fn = random.randint(0, N - tp - fp)
    tn = N - tp - fp - fn

    f1 = f1_score_simple(tp, fp, fn)
    kappa = cohen_kappa(tp, fp, fn, tn)

    # Keep only pairs where both metrics are defined.
    if not np.isnan(f1) and not np.isnan(kappa):
        f1_values.append(f1)
        kappa_values.append(kappa)
        table_values.append((tp, fp, fn, tn))

# Convert to numpy arrays for the correlation computation.
f1_values = np.array(f1_values)
kappa_values = np.array(kappa_values)

# Pearson correlation across all simulated confusion matrices.
corr, p_value = pearsonr(f1_values, kappa_values)

print(f"Number of valid simulations: {len(f1_values)} / {n_sims}")
print(f"Pearson correlation between F1 and Kappa: {corr:.3f} (p = {p_value:.3e})")

# Scatter of F1 vs Kappa with the identity line for reference.
plt.figure(figsize=(7, 5))
plt.scatter(f1_values, kappa_values, alpha=0.2, s=10)
plt.title("Monte Carlo Simulation (TP, FP, FN, TN random)")
plt.xlabel("F1 Score")
plt.ylabel("Cohen's Kappa")

plt.plot([0, 1], [0, 1])

plt.grid(True)
plt.show()
Number of valid simulations: 99965 / 100000
Pearson correlation between F1 and Kappa: 0.681 (p = 0.000e+00)
No description has been provided for this image
In [87]:
N = 0.8

df = pd.DataFrame({'f1': f1_values, 'kappa': kappa_values, 'table': table_values})

# Simulated tables where both F1 and kappa fall strictly within +/-0.03 of N.
lower, upper = N - 0.03, N + 0.03
near_N = (df['f1'] > lower) & (df['f1'] < upper) & (df['kappa'] > lower) & (df['kappa'] < upper)
df[near_N]
Out[87]:
f1 kappa table
1440 0.821 0.778 (484, 149, 62, 2305)
38383 0.829 0.772 (627, 206, 53, 2114)
39301 0.810 0.778 (351, 126, 39, 2484)
74870 0.811 0.792 (232, 108, 0, 2660)
78198 0.830 0.780 (562, 94, 137, 2207)
93867 0.811 0.791 (236, 80, 30, 2654)
In [88]:
N = 1

# Same strict +/-0.03 window, now around N = 1 (near-perfect classifiers).
lower, upper = N - 0.03, N + 0.03
df[(df['f1'] > lower) & (df['f1'] < upper) & (df['kappa'] > lower) & (df['kappa'] < upper)]
Out[88]:
f1 kappa table
361 0.992 0.972 (2118, 1, 33, 848)
911 1.000 1.000 (2999, 0, 0, 1)
3050 0.999 0.974 (2879, 5, 1, 115)
4738 1.000 1.000 (2943, 0, 0, 57)
6415 1.000 1.000 (2986, 0, 0, 14)
... ... ... ...
96296 1.000 1.000 (2993, 0, 0, 7)
96612 1.000 1.000 (2999, 0, 0, 1)
97751 0.992 0.983 (1564, 1, 25, 1410)
97820 0.999 0.977 (2883, 1, 4, 112)
98444 1.000 0.985 (2965, 1, 0, 34)

90 rows × 3 columns

In [89]:
N = 0

# Same strict +/-0.03 window, now around N = 0 (chance-level classifiers).
lower, upper = N - 0.03, N + 0.03
df[(df['f1'] > lower) & (df['f1'] < upper) & (df['kappa'] > lower) & (df['kappa'] < upper)]
Out[89]:
f1 kappa table
410 0.019 -0.002 (27, 2856, 4, 113)
1178 0.014 -0.020 (18, 2561, 32, 389)
2550 0.018 -0.017 (26, 2769, 27, 178)
5017 0.004 -0.004 (5, 2715, 7, 273)
5733 0.001 -0.013 (1, 1611, 20, 1368)
... ... ... ...
96808 0.010 -0.015 (15, 2958, 22, 5)
97000 0.007 -0.003 (11, 2969, 4, 16)
97339 0.020 -0.021 (28, 2667, 34, 271)
97542 0.015 -0.028 (20, 2583, 45, 352)
99850 0.005 -0.012 (8, 2949, 18, 25)

107 rows × 3 columns

How does the S parameter used during curve fitting affect the results?¶

The S parameter is useful to tweak the detection results, so that we can include more or fewer species during the flagging step. Since it is a structural parameter, we want to fit it first so that we can answer several other comparisons.

To do this we are going to use the nominal variables and their derived statistics.

Checking recall/precision/F1-score for inclusion/exclusion of species¶

In [90]:
cols = [f'{i}_norm' for i in LIST_PROFILERS] + ['mean_norm']
modes = range(2, 9)
passn = [2]
S_values = df_nominal_stats['S'].unique()
In [91]:
subset_df = df_nominal_stats[(df_nominal_stats['pass'].isin(passn)) & \
                             (df_nominal_stats['column'].isin(cols)) & \
                              (df_nominal_stats['mode'].isin(modes))]
In [92]:
melted_df = pd.melt(
    subset_df,
    id_vars=['mode', 'S', 'column'],
    value_vars=['recall', 'precision', 'f1'],
    var_name='metric',
    value_name='score'
)

# Create a colormap for 'mode'
norm = Normalize(vmin=melted_df['mode'].min(), vmax=melted_df['mode'].max())
cmap = plt.cm.viridis  # Choose a colormap (e.g., 'viridis', 'plasma', 'cividis')

# Create a FacetGrid: 6x3 grid (row for each profiler, column for each metric)
g = sns.FacetGrid(
    melted_df, 
    col='column', 
    row='metric', 
    height=2, 
    sharey=True, 
    sharex=True
)

# Map the lineplot to the grid
def lineplot_with_cmap(data, **kwargs):
    for mode in sorted(data['mode'].unique()):
        subset = data[data['mode'] == mode]
        plt.plot(subset['S'], subset['score'], label=f"Mode {mode}",
                 color=cmap(norm(mode)), marker='o')

g.map_dataframe(lineplot_with_cmap)

# Create a legend for the discrete modes
handles = [
    plt.Line2D([0], [0], color=cmap(norm(mode)), marker='o', linestyle='', label=f"Mode {mode}")
    for mode in sorted(melted_df['mode'].unique())
]
plt.legend(
    handles=handles, 
    title="", 
    bbox_to_anchor=(1.05, 3), 
    loc='center left', 
    frameon=False
)

# Set x-axis ticks (if you have specific S values)
g.set(xticks=subset_df['S'].unique())

for ax in g.axes.ravel():
    ax.set_title('')

# Add axis labels and titles
for ax, profiler in zip(g.axes[0, :], melted_df['column'].unique()):
    ax.set_title(profiler.replace('_norm', ''))

for ax, score in zip(g.axes[:, 0], ['recall', 'precision', 'F1-score']):
    ax.set_ylabel(score)

plt.tight_layout()

for format in ['png', 'tiff']: 
    plt.savefig(f'{RESULTS_DIR}/figures/paper/figE.{format}', dpi=DPI, bbox_inches='tight')


plt.show()
/tmp/ipykernel_2918775/2133720995.py:58: UserWarning: Tight layout not applied. tight_layout cannot make Axes height small enough to accommodate all Axes decorations.
  plt.tight_layout()
No description has been provided for this image
In [93]:
g = sns.FacetGrid(
    melted_df, 
    col='column', 
    row='metric', 
    height=3, 
    sharey=True, 
    sharex=True
)

g.map(sns.boxplot, 'S', 'score')

# Set x-axis ticks (if you have specific S values)
for ax in g.axes.ravel():
    ax.set_title('')

# Add axis labels and titles
for ax, profiler in zip(g.axes[0, :], melted_df['column'].unique()):
    ax.set_title(profiler.replace('_norm', ''))

for ax, score in zip(g.axes[:, 0], ['recall', 'precision', 'F1-score']):
    ax.set_ylabel(score)

plt.subplots_adjust(top=0.9)
g.fig.suptitle("Metrics by Profiler and Mode", fontsize=16)

plt.show()
/home/neuroimm/.local/share/mamba/envs/EVs/lib/python3.13/site-packages/seaborn/axisgrid.py:718: UserWarning: Using the boxplot function without specifying `order` is likely to produce an incorrect plot.
  warnings.warn(warning)
No description has been provided for this image

The aim of this part of the analysis was to select the "optimal" S to then make other comparisons and extract proper conclusions. If we look at individual profilers, the aim is not to select the S with the best F1 score, but to select the smallest S that provides a sufficiently high recall, ensuring that we don't lose TP species. This threshold depends on the profiler. For CEN it is 6-7, GAN is 7-10, KAI is 1-2, KR2 is 4-5, KRU is 5-6. We see that at these values the precision drops (expectedly), but it remains stable afterwards for most profilers. Therefore, a value of S=2 should be sufficient to ensure that the results are correct.

IMPORTANT: conceptually, if we are choosing the number of species based on the mean number of reads, using a low S is still good, because the number of reads reported by profilers that are not flagged is still considered. That is, the # of reads assigned to one profiler is the same regardless of S.

The advantage of using the mean value instead of the individual profilers is that it tends to provide better stability of the precision throughout the S values and modes. In fact, each profiler has an individual dynamic, and the mean value averages them all. Therefore, using the averaged value for S allows us to choose the parameter with better predictability, ensuring that we don't choose a very high value, while at the same time keeping the correct number of reads.

Therefore, we are going to choose S=2 and S=7 for comparisons of robustness with biological samples.

Does pass0/pass2 (no host pre-mapping vs host pre-mapping) affect the detection of the species?¶

For this part we are going to run three analyses:

  • Retrieve the raw detection of species with the passes, and calculate their jaccard index.
  • Calculate $\chi^2$ for each case and see if there are significant differences.
  • Calculate the Pearson correlation + RMSE for several mode values.

Total number of species and Jaccard index¶

In [94]:
# Plot Jaccard index between the different species

"""
For this part we are going to read all the .standardised.species reports and simply read the number of species and compute a table with the total number of species and the jaccard index
"""

def get_n_species(sample, mode, profiler, S=15):
    """
    Count the species detected in pass0 and pass2 for one sample/mode/profiler
    and compute the Jaccard index between the two taxid sets.

    Parameters
    ----------
    sample : str
        Sample name used to build the summary file paths.
    mode : int
        Profiling mode.
    profiler : str
        Column name in the diversity tables (e.g. 'kraken2_norm').
    S : int, optional
        S parameter of the summary files (default 15).

    Returns
    -------
    tuple
        (# pass0 species, # pass2 species, Jaccard index of their taxid sets).
    """
    taxids_by_pass = {}
    for passn in (0, 2):
        diversity = pd.read_csv(
            f'{RESULTS_DIR}/summary/{sample}_pass{passn}_mode{mode}_taxspecies_S{S}.diversity.tsv',
            sep='\t'
        ).set_index('taxonomy_id')
        # Taxa for which this profiler reported a (non-NaN) value.
        taxids_by_pass[passn] = diversity[profiler].dropna().index.values

    shared = len(np.intersect1d(taxids_by_pass[0], taxids_by_pass[2]))
    union = len(np.union1d(taxids_by_pass[0], taxids_by_pass[2]))
    jaccard_index = shared / union

    return len(taxids_by_pass[0]), len(taxids_by_pass[2]), jaccard_index


# One row per (profiler, mode) with the species counts of each pass and their
# Jaccard index.
dict_n_species = {'profiler': [],
                  'mode': [],
                  'pass 0 species': [],
                  'pass 2 species': [],
                  'jaccard': []}

for profiler in LIST_PROFILERS + ['mean']:
    for mode in range(1, 10):
        try:
            pass0_n_species, pass2_n_species, jaccard_index = get_n_species('ARTIFICIAL', mode, profiler + '_norm')
        except Exception as err:
            # FIX: the original bare `except:` also swallowed KeyboardInterrupt/
            # SystemExit and hid the reason for the failure; report it instead.
            print(f'No entry added for profiler {profiler} and mode {mode}: {err}')
            continue
        # Appends happen outside the try so a failure cannot leave the
        # accumulator lists with inconsistent lengths.
        dict_n_species['profiler'].append(profiler)
        dict_n_species['mode'].append(mode)
        dict_n_species['pass 0 species'].append(pass0_n_species)
        dict_n_species['pass 2 species'].append(pass2_n_species)
        dict_n_species['jaccard'].append(jaccard_index)

df_n_species = pd.DataFrame(dict_n_species)
In [95]:
plt.rc('axes', linewidth=0.65)

# 2x3 grid: one panel per profiler (plus the 'mean' pseudo-profiler).
fig, axes = plt.subplots(2, 3, figsize=(11, 6))
axes = axes.flatten()

# Seaborn theme applied once. FIX: the original re-called
# sns.set_theme(style="white") inside the loop with identical arguments —
# it only mutates global rcParams, so the repeats were redundant.
sns.set_theme(style="white")

# Custom handles for a single shared legend.
legend_elements = [
    plt.Line2D([0], [0], marker='o', color='#648FFF', label='Pass 0', markersize=8, linestyle='None'),
    plt.Line2D([0], [0], marker='o', color='#785EF0', label='Pass 2', markersize=8, linestyle='None'),
    plt.Line2D([0], [0], marker='o', color='#DC267F', label='Jaccard Index', markersize=8, linestyle='-')
]

# One panel per profiler: species counts (left axis) + Jaccard index (right axis).
for i, profiler in enumerate(LIST_PROFILERS + ['mean']):
    df_profiler = df_n_species[df_n_species['profiler'] == profiler]
    ax1 = axes[i]

    # Species counts for each pass on the primary y-axis.
    sns.scatterplot(
        data=df_profiler, x='mode', y='pass 0 species', ax=ax1, color='#648FFF', s=50
    )
    sns.scatterplot(
        data=df_profiler, x='mode', y='pass 2 species', ax=ax1, color='#785EF0', s=50
    )

    ax1.set_xticks(range(1, 10))
    ax1.set_xlabel('Mode')
    ax1.set_ylabel('Species Count', color='black')
    #ax1.set_yscale('log')  # Optional: Log scale
    ax1.set_title(f'{profiler.capitalize()}')

    # Jaccard index on a secondary y-axis.
    ax2 = ax1.twinx()
    sns.lineplot(
        data=df_profiler, x='mode', y='jaccard', ax=ax2, color='#DC267F', marker='o'
    )
    ax2.set_ylabel('Jaccard Index', color='#DC267F')

# Despine once at the end: sns.despine() acts on every axes of the current
# figure (including the twins), so the original per-iteration calls were
# idempotent repeats with the same final result.
sns.despine(top=True, right=True)

# Hide unused panels (a no-op when there are exactly 5 profilers + 'mean').
for j in range(len(LIST_PROFILERS) + 1, len(axes)):
    axes[j].axis('off')

# Single shared legend outside the grid.
fig.legend(
    handles=legend_elements, loc='center right', frameon=False, title="", bbox_to_anchor=(1.15, 0.5)
)

plt.tight_layout()

for fig_format in ('png', 'tiff'):
    plt.savefig(f'{RESULTS_DIR}/figures/paper/figB.{fig_format}', dpi=DPI, bbox_inches='tight')
No description has been provided for this image
In [96]:
# Jaccard index per mode, one line per profiler.
# NOTE(review): `custom_palette` is not defined anywhere in this notebook — the
# cell only works via hidden kernel state and will raise NameError on
# Restart & Run All. Define `custom_palette` in the configuration cell.
sns.lineplot(df_n_species, x='mode', y='jaccard', hue='profiler', marker='o', palette=custom_palette)
plt.legend(
    loc='center right', frameon=False, bbox_to_anchor=(1.35, 0.5),
)
Out[96]:
<matplotlib.legend.Legend at 0x76fde1836ad0>
No description has been provided for this image

The number of detected species tends to go up with the mode, which is expected (with the exception of krakenuniq). For most profilers the increase is linear, but for centrifuge the increase is exponential, although it may reach the top of detected species at some point.

We see that in general Jaccard indexes are very high. We should consider that this plot is not "strictly" relevant because many, many species are false positives, so we don't really care about the number of falsely assigned species. Still, it is important to note that the pass does not have, grosso modo, a relevant impact. It is interesting to find that for kaiju and centrifuge the mode has an impact. For Kaiju it decreases; this might be because the number of spurious species has increased; while for centrifuge the Jaccard index may increase because its number of species is so high (12000!!) that we may be reaching the maximum discoverable number of species, and therefore it is expected that the Jaccard index will increase in that case. For the rest of the profilers the mode does not seem to affect the Jaccard index, regardless of the total number of species detected.

Checking statistical differences in the truth table.¶

We are going to use the truth tables to compute statistically differential capture of species. We are going to use S=2 and S=7 throughout the different modes and profilers.

In [97]:
from scipy.stats import chi2_contingency
In [98]:
SMALL_VAL = 0.1  # pseudo-count added to every cell so chi2 is defined when a cell is 0

# Restrict to the normalized per-profiler columns (plus their mean) and the
# S thresholds of interest.
df_sub = df_nominal_stats[(df_nominal_stats['column'].isin([f'{i}_norm' for i in LIST_PROFILERS] + ['mean_norm'])) & 
                          (df_nominal_stats['S'].isin([0, 1, 2, 3, 4, 5, 6, 7, 10, 15]))].copy()

# Initialize a list to store chi-squared results
chi2_results = []

# Iterate over unique combinations of mode, S, and column, testing whether the
# pass0 vs pass2 confusion tables differ significantly.
for (mode, S, column), group in df_sub.groupby(['mode', 'S', 'column']):
    try:
        # Contingency tables: full (TP, FN, FP, TN) and partial (TP, FP, FN),
        # one row per pass.
        full_contingency_table = []
        partial_contingency_table = []

        # Per-group metric holders, reset every iteration. The previous version
        # checked `'precision_0' in locals()`, which stays True once any group
        # has set it, so stale values from an earlier (mode, S, column) could
        # leak into groups with a missing pass. Explicit per-iteration
        # initialization removes that hazard.
        pass_metrics = {0: None, 2: None}  # pass -> (precision, recall, f1)
        precision_diff, recall_diff, f1_diff = None, None, None

        for p in (0, 2):
            data = group[group['pass'] == p]
            if len(data):
                row = data.iloc[0]
                tp, fn, fp, tn = row['TP|FN|FP|TN']
                full_contingency_table.append([tp, fn, fp, tn])  # Full table
                partial_contingency_table.append([tp, fp, fn])   # Partial table
                pass_metrics[p] = (row['precision'], row['recall'], row['f1'])

        # Compute pass2 - pass0 differences only when both passes are present.
        if pass_metrics[0] is not None and pass_metrics[2] is not None:
            precision_diff = pass_metrics[2][0] - pass_metrics[0][0]
            recall_diff = pass_metrics[2][1] - pass_metrics[0][1]
            f1_diff = pass_metrics[2][2] - pass_metrics[0][2]

        # Chi-squared tests; None when a pass is missing (also reset per
        # iteration so nothing can carry over between groups).
        chi2_full = p_value_full = chi2_partial = p_value_partial = None
        if len(full_contingency_table) == 2:
            chi2_full, p_value_full, _, _ = chi2_contingency(np.array(full_contingency_table) + SMALL_VAL)
        if len(partial_contingency_table) == 2:
            chi2_partial, p_value_partial, _, _ = chi2_contingency(np.array(partial_contingency_table) + SMALL_VAL)

        # Store results. If only one pass is present, indexing
        # full_contingency_table[1] raises IndexError, which is caught below
        # and the group skipped — same behavior as before.
        chi2_results.append({
            'mode': mode,
            'S': S,
            'column': column,
            'chi2_full': chi2_full,
            'p_value_full': p_value_full,
            'chi2_partial': chi2_partial,
            'p_value_partial': p_value_partial,
            'precision_diff': precision_diff,
            'recall_diff': recall_diff,
            'f1_diff': f1_diff,
            'stats_pass0': full_contingency_table[0],
            'stats_pass2': full_contingency_table[1],
        })
    except Exception as e:
        print(f"Error processing {mode}, {S}, {column}: {e}")

# Convert results to a DataFrame
df_pass_chi2_stats = pd.DataFrame(chi2_results)
In [99]:
# Classify every (mode, S, column) test by which chi-squared variant (full 2x4
# table vs partial 2x3 table) reaches significance, then visualize how the two
# statistics / p-values agree.

# Add significance categories based on p-value thresholds
alpha = 0.1 

# NOTE: NaN p-values (groups where a table was incomplete) compare False
# against alpha, so those rows fall through to 'Neither Significant'.
df_pass_chi2_stats['significance'] = df_pass_chi2_stats.apply(
    lambda row: (
        'Both Significant' if row['p_value_full'] < alpha and row['p_value_partial'] < alpha else
        'Only Full Significant' if row['p_value_full'] < alpha else
        'Only Partial Significant' if row['p_value_partial'] < alpha else
        'Neither Significant'
    ),
    axis=1
)

# Count points in each quadrant of the p-value/p-value plane
quadrant_counts = {
    'Both Significant': len(df_pass_chi2_stats[(df_pass_chi2_stats['p_value_full'] < alpha) & (df_pass_chi2_stats['p_value_partial'] < alpha)]),
    'Only Full Significant': len(df_pass_chi2_stats[(df_pass_chi2_stats['p_value_full'] < alpha) & (df_pass_chi2_stats['p_value_partial'] >= alpha)]),
    'Only Partial Significant': len(df_pass_chi2_stats[(df_pass_chi2_stats['p_value_full'] >= alpha) & (df_pass_chi2_stats['p_value_partial'] < alpha)]),
    'Neither Significant': len(df_pass_chi2_stats[(df_pass_chi2_stats['p_value_full'] >= alpha) & (df_pass_chi2_stats['p_value_partial'] >= alpha)])
}

# Set up the plots
fig, axs = plt.subplots(1, 2, figsize=(12, 6))

# First plot: chi2_full vs chi2_partial (statistics should roughly agree)
sns.scatterplot(data=df_pass_chi2_stats, x='chi2_full', y='chi2_partial', ax=axs[0])
axs[0].set_title('Chi2 Full vs Partial')
axs[0].set_xlabel('Chi2 Full')
axs[0].set_ylabel('Chi2 Partial')

# Second plot: p_value_full vs p_value_partial with significance categories
sns.scatterplot(
    data=df_pass_chi2_stats,
    x='p_value_full',
    y='p_value_partial',
    hue='significance',
    palette={
        'Both Significant': '#bc0000',
        'Only Full Significant': '#0000bc',
        'Only Partial Significant': '#00bc00',
        'Neither Significant': '#aaaaaa'
    },
    ax=axs[1]
)

# Add thresholds (dashed lines at the alpha cutoff on both axes)
axs[1].axhline(alpha, color='black', linestyle='--', linewidth=1)
axs[1].axvline(alpha, color='black', linestyle='--', linewidth=1)

# Annotate quadrant counts (positions are in data coordinates on the unit
# p-value square, one label per quadrant)
axs[1].text(0.25, 0.75, f"Both Significant\n{quadrant_counts['Both Significant']}", 
            ha='center', color='#bc0000', fontsize=12, bbox=dict(facecolor='white', edgecolor='none'))
axs[1].text(0.75, 0.75, f"Only Full Significant\n{quadrant_counts['Only Full Significant']}", 
            ha='center', color='#0000bc', fontsize=12, bbox=dict(facecolor='white', edgecolor='none'))
axs[1].text(0.25, 0.25, f"Only Partial Significant\n{quadrant_counts['Only Partial Significant']}", 
            ha='center', color='#00bc00', fontsize=12, bbox=dict(facecolor='white', edgecolor='none'))
axs[1].text(0.75, 0.25, f"Neither Significant\n{quadrant_counts['Neither Significant']}", 
            ha='center', color='#aaaaaa', fontsize=12, bbox=dict(facecolor='white', edgecolor='none'))

# Customize the plot
axs[1].set_title('P-Value Comparison with Significance')
axs[1].set_xlabel('P-Value Full')
axs[1].set_ylabel('P-Value Partial')
axs[1].set_xlim(0, 1)
axs[1].set_ylim(0, 1)

plt.legend(loc='center right', frameon=False, bbox_to_anchor=(1.65, 0.5))
# Adjust layout
plt.tight_layout()

# Show the plots
plt.show()
No description has been provided for this image
In [100]:
df_pass_chi2_stats[df_pass_chi2_stats['significance'] != 'Neither Significant'].sort_values(by=['mode', 'S'])
Out[100]:
mode S column chi2_full p_value_full chi2_partial p_value_partial precision_diff recall_diff f1_diff stats_pass0 stats_pass2 significance
5 1 0 mean_norm 59.705 0.000 59.578 0.000 -0.043 0.683 0.716 [4, 56, 0, 3637] [45, 15, 2, 3520] Both Significant
13 1 2 ganon_norm 30.477 0.000 30.438 0.000 0.060 0.500 0.498 [11, 49, 1, 3636] [41, 19, 1, 3521] Both Significant
18 1 3 centrifuge_norm 5.064 0.167 5.032 0.081 0.000 0.183 0.227 [11, 49, 0, 3637] [22, 38, 0, 3522] Only Partial Significant
43 1 7 ganon_norm 37.082 0.000 33.421 0.000 -0.269 0.267 0.001 [42, 18, 1, 3636] [58, 2, 24, 3498] Both Significant
62 2 0 kaiju_norm 8.014 0.046 7.984 0.018 0.000 0.167 0.269 [2, 58, 0, 3724] [12, 48, 0, 3611] Both Significant
65 2 0 mean_norm 10.970 0.012 10.962 0.004 0.050 -0.233 -0.321 [19, 41, 1, 3723] [5, 55, 0, 3611] Both Significant
73 2 2 ganon_norm 30.473 0.000 30.438 0.000 0.060 0.500 0.498 [11, 49, 1, 3723] [41, 19, 1, 3610] Both Significant
78 2 3 centrifuge_norm 5.062 0.167 5.032 0.081 0.000 0.183 0.227 [11, 49, 0, 3724] [22, 38, 0, 3611] Only Partial Significant
96 2 6 centrifuge_norm 46.620 0.000 41.727 0.000 0.376 -0.233 0.088 [58, 2, 35, 3689] [44, 16, 0, 3611] Both Significant
102 2 7 centrifuge_norm 46.620 0.000 41.727 0.000 0.376 -0.233 0.088 [58, 2, 35, 3689] [44, 16, 0, 3611] Both Significant
103 2 7 ganon_norm 37.046 0.000 33.421 0.000 -0.269 0.267 0.001 [42, 18, 1, 3723] [58, 2, 24, 3587] Both Significant
122 3 0 kaiju_norm 8.013 0.046 7.984 0.018 0.000 0.167 0.269 [2, 58, 0, 3761] [12, 48, 0, 3648] Both Significant
125 3 0 mean_norm 26.453 0.000 26.381 0.000 0.013 0.467 0.405 [17, 43, 1, 3760] [45, 15, 2, 3646] Both Significant
138 3 3 centrifuge_norm 5.061 0.167 5.032 0.081 0.000 0.183 0.227 [11, 49, 0, 3761] [22, 38, 0, 3648] Only Partial Significant
139 3 3 ganon_norm 36.467 0.000 36.431 0.000 0.061 0.550 0.533 [11, 49, 1, 3760] [44, 16, 1, 3647] Both Significant
182 4 0 kaiju_norm 19.754 0.000 19.750 0.000 0.081 -0.367 -0.368 [34, 26, 3, 3845] [12, 48, 0, 3733] Both Significant
185 4 0 mean_norm 44.783 0.000 44.789 0.000 0.004 -0.600 -0.516 [49, 11, 4, 3844] [13, 47, 1, 3732] Both Significant
198 4 3 centrifuge_norm 38.612 0.000 38.576 0.000 0.000 0.567 0.547 [11, 49, 0, 3848] [45, 15, 0, 3733] Both Significant
199 4 3 ganon_norm 36.467 0.000 36.431 0.000 0.061 0.550 0.533 [11, 49, 1, 3847] [44, 16, 1, 3732] Both Significant
245 5 0 mean_norm 17.691 0.001 17.077 0.000 0.154 -0.183 -0.016 [58, 2, 14, 4048] [47, 13, 2, 3940] Both Significant
259 5 3 ganon_norm 40.835 0.000 40.799 0.000 0.062 0.583 0.554 [11, 49, 1, 4061] [46, 14, 1, 3941] Both Significant
263 5 3 mean_norm 6.513 0.089 2.956 0.228 -0.114 0.000 -0.097 [58, 2, 43, 4019] [58, 2, 68, 3874] Only Full Significant
305 6 0 mean_norm 43.871 0.000 40.577 0.000 0.300 -0.317 -0.006 [58, 2, 31, 4162] [39, 21, 2, 4066] Both Significant
319 6 3 ganon_norm 40.836 0.000 40.799 0.000 0.062 0.583 0.554 [11, 49, 1, 4192] [46, 14, 1, 4067] Both Significant
354 6 15 centrifuge_norm 8.454 0.037 3.712 0.156 -0.125 0.000 -0.109 [58, 2, 46, 4147] [58, 2, 76, 3992] Only Full Significant
365 7 0 mean_norm 46.384 0.000 46.336 0.000 0.084 0.583 0.422 [23, 37, 4, 4836] [58, 2, 4, 4678] Both Significant
379 7 3 ganon_norm 40.843 0.000 40.799 0.000 0.062 0.583 0.554 [11, 49, 1, 4839] [46, 14, 1, 4681] Both Significant
402 7 7 centrifuge_norm 12.768 0.005 5.586 0.061 -0.152 0.000 -0.135 [58, 2, 45, 4795] [58, 2, 83, 4599] Both Significant
407 7 7 mean_norm 6.531 0.088 3.144 0.208 0.102 0.000 0.101 [58, 2, 104, 4736] [58, 2, 68, 4614] Only Full Significant
413 7 10 mean_norm 6.531 0.088 3.144 0.208 0.102 0.000 0.101 [58, 2, 104, 4736] [58, 2, 68, 4614] Only Full Significant
437 8 2 mean_norm 9.109 0.028 5.037 0.081 0.147 0.000 0.127 [58, 2, 76, 7660] [58, 2, 42, 7520] Both Significant
439 8 3 ganon_norm 40.820 0.000 40.799 0.000 0.062 0.583 0.554 [11, 49, 1, 7735] [46, 14, 1, 7561] Both Significant
443 8 3 mean_norm 9.109 0.028 5.037 0.081 0.147 0.000 0.127 [58, 2, 76, 7660] [58, 2, 42, 7520] Both Significant
449 8 4 mean_norm 8.979 0.030 3.999 0.135 0.113 0.000 0.114 [58, 2, 111, 7625] [58, 2, 69, 7493] Only Full Significant
482 9 0 kaiju_norm 997.648 0.000 888.595 0.000 -0.437 0.933 0.085 [1, 59, 1, 12540] [57, 3, 854, 11515] Both Significant
485 9 0 mean_norm 24.758 0.000 24.539 0.000 0.101 -0.383 -0.228 [51, 9, 8, 12533] [28, 32, 1, 12368] Both Significant
499 9 3 ganon_norm 38.413 0.000 38.405 0.000 0.051 0.567 0.519 [13, 47, 1, 12540] [47, 13, 1, 12368] Both Significant
507 9 4 kraken2_norm 32.370 0.000 31.976 0.000 -0.109 0.400 0.210 [33, 27, 0, 12541] [57, 3, 7, 12362] Both Significant
509 9 4 mean_norm 17.490 0.001 7.116 0.028 0.147 0.000 0.151 [58, 2, 129, 12412] [58, 2, 69, 12300] Both Significant
529 9 10 ganon_norm 10.471 0.015 5.917 0.052 0.164 0.000 0.136 [58, 2, 70, 12471] [58, 2, 36, 12333] Both Significant
535 9 15 ganon_norm 10.471 0.015 5.917 0.052 0.164 0.000 0.136 [58, 2, 70, 12471] [58, 2, 36, 12333] Both Significant

Only using the full table (for publication)¶

In [101]:
plt.rc('axes', linewidth=0.65)

# Flag the tests whose full-table chi-squared p-value is below alpha.
# Vectorized comparison replaces the list comprehension; NaN p-values compare
# False and land in the non-significant group, exactly as before.
df_pass_chi2_stats['are_diff'] = df_pass_chi2_stats['p_value_full'] < alpha

# Report how many tests are / are not significantly different.
print((df_pass_chi2_stats['are_diff'] == True).sum(), (df_pass_chi2_stats['are_diff'] == False).sum())

display(df_pass_chi2_stats)
display(df_pass_chi2_stats[df_pass_chi2_stats['are_diff'] == True])


# Create a grid of three axes for the plots
fig, axs = plt.subplots(1, 3, figsize=(7.5, 2.7), sharey=True)

# Define the metrics to plot
metrics = ['f1_diff', 'precision_diff', 'recall_diff']
titles = ['F1 Score', 'Precision', 'Recall']


# Iterate through the metrics and create a boxplot for each
for ax, metric, title in zip(axs, metrics, titles):
    # Assigning hue=x with legend=False avoids the seaborn FutureWarning about
    # passing `palette` without `hue` (deprecated, removal in v0.14) while
    # producing the same two-color plot.
    sns.boxplot(
        data=df_pass_chi2_stats, 
        x="are_diff", 
        y=metric, 
        hue="are_diff",
        legend=False,
        palette=sns.color_palette(['#4E79A7', '#A0CBE8']),
        ax=ax
    )
    sns.despine(top=True, right=True, ax=ax)

    ax.set_title(title, fontsize=14, pad=40)
    ax.set_xlabel("Group", fontsize=12)
    if metric == 'f1_diff':  # Label y-axis only for the first plot
        ax.set_ylabel("Difference", fontsize=12)
    else:
        ax.set_ylabel("")

    # Highlight medians
    medians = df_pass_chi2_stats.groupby("are_diff")[metric].median()
    for i, median in enumerate(medians):
        ax.text(i, median, f"{median:.2f}", ha='center', va='bottom', fontsize=10)

    # Annotate the False-vs-True Mann-Whitney test above each panel.
    pairs = [(False, True)]

    annotator = Annotator(ax, pairs, data=df_pass_chi2_stats, x='are_diff', y=metric)
    annotator.configure(test='Mann-Whitney', text_format='simple', loc='outside', line_width=1)
    annotator.apply_and_annotate()


# Adjust layout and save the plot
plt.tight_layout()

# `fmt` instead of `format` avoids shadowing the Python builtin.
for fmt in ['png', 'tiff']: 
    plt.savefig(f'{RESULTS_DIR}/figures/paper/figC.{fmt}', dpi=DPI)

plt.show()
38 502
mode S column chi2_full p_value_full chi2_partial p_value_partial precision_diff recall_diff f1_diff stats_pass0 stats_pass2 significance are_diff
0 1 0 centrifuge_norm 0.031 0.999 0.000 1.000 0.000 0.000 0.000 [3, 57, 0, 3637] [3, 57, 0, 3522] Neither Significant False
1 1 0 ganon_norm 1.356 0.716 1.349 0.509 0.167 -0.033 -0.056 [5, 55, 1, 3636] [3, 57, 0, 3522] Neither Significant False
2 1 0 kaiju_norm 0.031 0.999 0.000 1.000 0.000 0.000 0.000 [6, 54, 0, 3637] [6, 54, 0, 3522] Neither Significant False
3 1 0 kraken2_norm 0.031 0.999 0.000 1.000 0.000 0.000 0.000 [13, 47, 0, 3637] [13, 47, 0, 3522] Neither Significant False
4 1 0 krakenuniq_norm 0.459 0.928 0.428 0.807 0.000 -0.033 -0.057 [6, 54, 0, 3637] [4, 56, 0, 3522] Neither Significant False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
535 9 15 ganon_norm 10.471 0.015 5.917 0.052 0.164 0.000 0.136 [58, 2, 70, 12471] [58, 2, 36, 12333] Both Significant True
536 9 15 kaiju_norm 0.180 0.981 0.127 0.939 0.024 0.000 0.020 [57, 3, 55, 12486] [57, 3, 50, 12319] Neither Significant False
537 9 15 kraken2_norm 0.006 1.000 0.000 1.000 0.000 0.000 0.000 [57, 3, 7, 12534] [57, 3, 7, 12362] Neither Significant False
538 9 15 krakenuniq_norm 0.014 1.000 0.000 1.000 0.000 0.000 0.000 [58, 2, 87, 12454] [58, 2, 87, 12282] Neither Significant False
539 9 15 mean_norm 0.018 0.999 0.000 1.000 0.000 0.000 0.000 [58, 2, 129, 12412] [58, 2, 129, 12240] Neither Significant False

540 rows × 14 columns

mode S column chi2_full p_value_full chi2_partial p_value_partial precision_diff recall_diff f1_diff stats_pass0 stats_pass2 significance are_diff
5 1 0 mean_norm 59.705 0.000 59.578 0.000 -0.043 0.683 0.716 [4, 56, 0, 3637] [45, 15, 2, 3520] Both Significant True
13 1 2 ganon_norm 30.477 0.000 30.438 0.000 0.060 0.500 0.498 [11, 49, 1, 3636] [41, 19, 1, 3521] Both Significant True
43 1 7 ganon_norm 37.082 0.000 33.421 0.000 -0.269 0.267 0.001 [42, 18, 1, 3636] [58, 2, 24, 3498] Both Significant True
62 2 0 kaiju_norm 8.014 0.046 7.984 0.018 0.000 0.167 0.269 [2, 58, 0, 3724] [12, 48, 0, 3611] Both Significant True
65 2 0 mean_norm 10.970 0.012 10.962 0.004 0.050 -0.233 -0.321 [19, 41, 1, 3723] [5, 55, 0, 3611] Both Significant True
73 2 2 ganon_norm 30.473 0.000 30.438 0.000 0.060 0.500 0.498 [11, 49, 1, 3723] [41, 19, 1, 3610] Both Significant True
96 2 6 centrifuge_norm 46.620 0.000 41.727 0.000 0.376 -0.233 0.088 [58, 2, 35, 3689] [44, 16, 0, 3611] Both Significant True
102 2 7 centrifuge_norm 46.620 0.000 41.727 0.000 0.376 -0.233 0.088 [58, 2, 35, 3689] [44, 16, 0, 3611] Both Significant True
103 2 7 ganon_norm 37.046 0.000 33.421 0.000 -0.269 0.267 0.001 [42, 18, 1, 3723] [58, 2, 24, 3587] Both Significant True
122 3 0 kaiju_norm 8.013 0.046 7.984 0.018 0.000 0.167 0.269 [2, 58, 0, 3761] [12, 48, 0, 3648] Both Significant True
125 3 0 mean_norm 26.453 0.000 26.381 0.000 0.013 0.467 0.405 [17, 43, 1, 3760] [45, 15, 2, 3646] Both Significant True
139 3 3 ganon_norm 36.467 0.000 36.431 0.000 0.061 0.550 0.533 [11, 49, 1, 3760] [44, 16, 1, 3647] Both Significant True
182 4 0 kaiju_norm 19.754 0.000 19.750 0.000 0.081 -0.367 -0.368 [34, 26, 3, 3845] [12, 48, 0, 3733] Both Significant True
185 4 0 mean_norm 44.783 0.000 44.789 0.000 0.004 -0.600 -0.516 [49, 11, 4, 3844] [13, 47, 1, 3732] Both Significant True
198 4 3 centrifuge_norm 38.612 0.000 38.576 0.000 0.000 0.567 0.547 [11, 49, 0, 3848] [45, 15, 0, 3733] Both Significant True
199 4 3 ganon_norm 36.467 0.000 36.431 0.000 0.061 0.550 0.533 [11, 49, 1, 3847] [44, 16, 1, 3732] Both Significant True
245 5 0 mean_norm 17.691 0.001 17.077 0.000 0.154 -0.183 -0.016 [58, 2, 14, 4048] [47, 13, 2, 3940] Both Significant True
259 5 3 ganon_norm 40.835 0.000 40.799 0.000 0.062 0.583 0.554 [11, 49, 1, 4061] [46, 14, 1, 3941] Both Significant True
263 5 3 mean_norm 6.513 0.089 2.956 0.228 -0.114 0.000 -0.097 [58, 2, 43, 4019] [58, 2, 68, 3874] Only Full Significant True
305 6 0 mean_norm 43.871 0.000 40.577 0.000 0.300 -0.317 -0.006 [58, 2, 31, 4162] [39, 21, 2, 4066] Both Significant True
319 6 3 ganon_norm 40.836 0.000 40.799 0.000 0.062 0.583 0.554 [11, 49, 1, 4192] [46, 14, 1, 4067] Both Significant True
354 6 15 centrifuge_norm 8.454 0.037 3.712 0.156 -0.125 0.000 -0.109 [58, 2, 46, 4147] [58, 2, 76, 3992] Only Full Significant True
365 7 0 mean_norm 46.384 0.000 46.336 0.000 0.084 0.583 0.422 [23, 37, 4, 4836] [58, 2, 4, 4678] Both Significant True
379 7 3 ganon_norm 40.843 0.000 40.799 0.000 0.062 0.583 0.554 [11, 49, 1, 4839] [46, 14, 1, 4681] Both Significant True
402 7 7 centrifuge_norm 12.768 0.005 5.586 0.061 -0.152 0.000 -0.135 [58, 2, 45, 4795] [58, 2, 83, 4599] Both Significant True
407 7 7 mean_norm 6.531 0.088 3.144 0.208 0.102 0.000 0.101 [58, 2, 104, 4736] [58, 2, 68, 4614] Only Full Significant True
413 7 10 mean_norm 6.531 0.088 3.144 0.208 0.102 0.000 0.101 [58, 2, 104, 4736] [58, 2, 68, 4614] Only Full Significant True
437 8 2 mean_norm 9.109 0.028 5.037 0.081 0.147 0.000 0.127 [58, 2, 76, 7660] [58, 2, 42, 7520] Both Significant True
439 8 3 ganon_norm 40.820 0.000 40.799 0.000 0.062 0.583 0.554 [11, 49, 1, 7735] [46, 14, 1, 7561] Both Significant True
443 8 3 mean_norm 9.109 0.028 5.037 0.081 0.147 0.000 0.127 [58, 2, 76, 7660] [58, 2, 42, 7520] Both Significant True
449 8 4 mean_norm 8.979 0.030 3.999 0.135 0.113 0.000 0.114 [58, 2, 111, 7625] [58, 2, 69, 7493] Only Full Significant True
482 9 0 kaiju_norm 997.648 0.000 888.595 0.000 -0.437 0.933 0.085 [1, 59, 1, 12540] [57, 3, 854, 11515] Both Significant True
485 9 0 mean_norm 24.758 0.000 24.539 0.000 0.101 -0.383 -0.228 [51, 9, 8, 12533] [28, 32, 1, 12368] Both Significant True
499 9 3 ganon_norm 38.413 0.000 38.405 0.000 0.051 0.567 0.519 [13, 47, 1, 12540] [47, 13, 1, 12368] Both Significant True
507 9 4 kraken2_norm 32.370 0.000 31.976 0.000 -0.109 0.400 0.210 [33, 27, 0, 12541] [57, 3, 7, 12362] Both Significant True
509 9 4 mean_norm 17.490 0.001 7.116 0.028 0.147 0.000 0.151 [58, 2, 129, 12412] [58, 2, 69, 12300] Both Significant True
529 9 10 ganon_norm 10.471 0.015 5.917 0.052 0.164 0.000 0.136 [58, 2, 70, 12471] [58, 2, 36, 12333] Both Significant True
535 9 15 ganon_norm 10.471 0.015 5.917 0.052 0.164 0.000 0.136 [58, 2, 70, 12471] [58, 2, 36, 12333] Both Significant True
False vs. True: Mann-Whitney-Wilcoxon test two-sided, P_val:1.167e-07 U_stat=4.738e+03
False vs. True: Mann-Whitney-Wilcoxon test two-sided, P_val:1.681e-04 U_stat=6.271e+03
False vs. True: Mann-Whitney-Wilcoxon test two-sided, P_val:6.430e-06 U_stat=6.282e+03
/tmp/ipykernel_2918775/3028195495.py:24: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(
/tmp/ipykernel_2918775/3028195495.py:24: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(
/tmp/ipykernel_2918775/3028195495.py:24: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(
No description has been provided for this image
In [102]:
df_pass_chi2_stats[df_pass_chi2_stats['are_diff'] == True].groupby('S').count()
Out[102]:
mode column chi2_full p_value_full chi2_partial p_value_partial precision_diff recall_diff f1_diff stats_pass0 stats_pass2 significance are_diff
S
0 12 12 12 12 12 12 12 12 12 12 12 12 12
2 3 3 3 3 3 3 3 3 3 3 3 3 3
3 10 10 10 10 10 10 10 10 10 10 10 10 10
4 3 3 3 3 3 3 3 3 3 3 3 3 3
6 1 1 1 1 1 1 1 1 1 1 1 1 1
7 5 5 5 5 5 5 5 5 5 5 5 5 5
10 2 2 2 2 2 2 2 2 2 2 2 2 2
15 2 2 2 2 2 2 2 2 2 2 2 2 2
In [103]:
df_pass_chi2_stats[(df_pass_chi2_stats['are_diff'] == True) & (df_pass_chi2_stats['S'] == 0)][['precision_diff',	'recall_diff',	'f1_diff']].median()
Out[103]:
precision_diff    0.032
recall_diff      -0.008
f1_diff           0.039
dtype: float64
In [104]:
df_pass_chi2_stats[(df_pass_chi2_stats['are_diff'] == True) & (df_pass_chi2_stats['S'] == 3)][['precision_diff',	'recall_diff',	'f1_diff']].median()
Out[104]:
precision_diff   0.062
recall_diff      0.567
f1_diff          0.540
dtype: float64
In [105]:
df_pass_chi2_stats[df_pass_chi2_stats['are_diff'] == True].groupby('column').count()
Out[105]:
mode S chi2_full p_value_full chi2_partial p_value_partial precision_diff recall_diff f1_diff stats_pass0 stats_pass2 significance are_diff
column
centrifuge_norm 5 5 5 5 5 5 5 5 5 5 5 5 5
ganon_norm 13 13 13 13 13 13 13 13 13 13 13 13 13
kaiju_norm 4 4 4 4 4 4 4 4 4 4 4 4 4
kraken2_norm 1 1 1 1 1 1 1 1 1 1 1 1 1
mean_norm 15 15 15 15 15 15 15 15 15 15 15 15 15
In [106]:
df_pass_chi2_stats[(df_pass_chi2_stats['are_diff'] == True) & (df_pass_chi2_stats['column'] == 'ganon_norm')][['precision_diff',	'recall_diff',	'f1_diff']].median()
Out[106]:
precision_diff   0.061
recall_diff      0.550
f1_diff          0.519
dtype: float64
In [107]:
df_pass_chi2_stats[(df_pass_chi2_stats['are_diff'] == True) & (df_pass_chi2_stats['column'] == 'mean_norm')][['precision_diff',	'recall_diff',	'f1_diff']].median()
Out[107]:
precision_diff   0.102
recall_diff      0.000
f1_diff          0.101
dtype: float64
In [108]:
df_pass_chi2_stats[df_pass_chi2_stats['are_diff'] == True].groupby('mode').count()
Out[108]:
S column chi2_full p_value_full chi2_partial p_value_partial precision_diff recall_diff f1_diff stats_pass0 stats_pass2 significance are_diff
mode
1 3 3 3 3 3 3 3 3 3 3 3 3 3
2 6 6 6 6 6 6 6 6 6 6 6 6 6
3 3 3 3 3 3 3 3 3 3 3 3 3 3
4 4 4 4 4 4 4 4 4 4 4 4 4 4
5 3 3 3 3 3 3 3 3 3 3 3 3 3
6 3 3 3 3 3 3 3 3 3 3 3 3 3
7 5 5 5 5 5 5 5 5 5 5 5 5 5
8 4 4 4 4 4 4 4 4 4 4 4 4 4
9 7 7 7 7 7 7 7 7 7 7 7 7 7

As we see, using a second pass generally does not significantly change the assignment of TP/FP/FN | TN.

In the cases where it changes, the f1-score generally improves, either by increasing the number of TP (by reducing the amount of FN) or reduces the amount of false positives.

Therefore, looking at nominal info choosing pass2 is the best option.

Checking at the correlation between the read counts¶

Now we are going to check the proportion of reads that are correctly assigned to gold-truth species. For that, we are going to show:

  • A correlation plot with pass0 and pass2 reads. We are also going to calculate the correlation between pass0 / pass2 and the expected counts.
In [109]:
df_numerical_stats
Out[109]:
pass mode profiler diff_counts MRE_counts MRED_counts MAE_counts MAED_counts MACV_counts corr_counts R2_counts taxid_counts species_counts expected_counts observed_counts diff_abundance MRE_abundance MRED_abundance MAE_abundance MAED_abundance MACV_abundance corr_abundance R2_abundance taxid_abundance species_abundance expected_abundance observed_abundance
0 0 1 centrifuge [-21.76736, -50.1056, -74.39946666666667, -58.... -16.939 23.674 17.173 23.505 1.369 0.635 0.576 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [244477.0, 93552.0, 48001.0, 51790.0, 189991.0... [-1.33929624238111, -37.07720182312471, -67.71... 4.750 29.856 26.450 14.640 0.554 0.801 0.576 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.0831469924255903, 1.1798024658164117, 0.605...
1 0 1 ganon [-23.91328, -55.785066666666665, -83.386666666... -28.729 21.022 28.729 21.022 0.732 0.558 0.590 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [237771.0, 82903.0, 31150.0, 42979.0, 159924.0... [7.790932959479549, -37.36135668956172, -76.46... 0.969 29.782 25.411 15.563 0.612 0.790 0.590 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.368466654983736, 1.1744745620707178, 0.4412...
2 0 1 kaiju [-53.25664, -79.21386666666666, -91.4437333333... -49.543 28.481 49.543 28.481 0.575 0.389 0.313 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [146073.0, 38974.0, 16043.0, 14537.0, 54507.0,... [-8.305719789553791, -59.224806809815064, -83.... -1.021 55.870 47.167 29.963 0.635 0.764 0.313 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [2.865446256576444, 0.7645348723159675, 0.3147...
3 0 1 kraken2 [-76.07584, -94.63306666666666, -99.4, -99.609... -65.147 24.902 65.147 24.902 0.382 0.176 0.122 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [74763.0, 10063.0, 1125.0, 488.0, 11530.0, 288... [-19.322126949109276, -81.90144330504141, -97.... 17.532 83.974 75.284 41.126 0.546 0.594 0.122 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [2.521183532840335, 0.3393479380304735, 0.0379...
4 0 1 krakenuniq [-19.62944, -48.239466666666665, -68.408533333... -16.729 23.969 17.018 23.765 1.396 0.642 0.572 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [251158.0, 97051.0, 59234.0, 56908.0, 183826.0... [-0.4454715144359227, -35.88453918746712, -60.... 3.147 29.691 25.571 15.414 0.603 0.795 0.572 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.1110790151738774, 1.2021648902349915, 0.733...
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
103 2 9 ganon [3.88512, -35.4976, -56.00053333333334, -32.8,... -1.882 23.489 18.223 14.940 0.820 0.751 0.659 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [324641.0, 120942.0, 82499.0, 84000.0, 215092.... [5.0978227572311425, -34.74463135224731, -55.4... -0.736 23.763 18.879 14.451 0.765 0.760 0.659 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.284306961163473, 1.223538162145363, 0.83462...
104 2 9 kaiju [-48.45856, -76.704, -89.0208, -85.9616, -80.8... -43.474 29.718 43.474 29.718 0.684 0.422 0.324 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [161067.0, 43680.0, 20586.0, 17548.0, 59848.0,... [-9.872292921902698, -59.26355445072247, -80.8... -1.156 51.967 43.407 28.597 0.659 0.738 0.324 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [2.8164908461905407, 0.7638083540489538, 0.359...
105 2 9 kraken2 [-39.60768, -84.74186666666667, -95.9482666666... -32.087 35.514 32.087 35.514 1.107 0.419 0.235 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [188726.0, 28609.0, 7597.0, 4884.0, 33042.0, 6... [0.13138111998259205, -74.70178386954647, -93.... 12.601 58.882 53.987 26.671 0.494 0.695 0.235 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.129105659999456, 0.4743415524460034, 0.1259...
106 2 9 krakenuniq [-19.67168, -48.3264, -68.4816, -54.56, -41.30... -16.778 23.985 17.064 23.782 1.394 0.641 0.572 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [251026.0, 96888.0, 59097.0, 56800.0, 183437.0... [-0.37064433374560224, -35.91043018258363, -60... 3.219 29.748 25.644 15.417 0.601 0.796 0.572 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.11341736457045, 1.2016794340765569, 0.73296...
107 2 9 mean [-20.390584802138655, -49.665829444940506, -66... -16.259 21.692 16.933 21.171 1.250 0.638 0.622 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [248779.4224933167, 94376.56979073654, 62310.1... [-1.6718373579364112, -43.06956794651194, -63.... 8.287 32.919 29.573 16.667 0.564 0.799 0.536 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.072755082564487, 1.0674456010029012, 0.6795...

108 rows × 27 columns

In [110]:
# Split the numerical stats by host-mapping pass and index by (mode, profiler)
# so the two passes can be matched row-for-row.
df_pass_0 = df_numerical_stats[df_numerical_stats['pass'] == 0].set_index(['mode', 'profiler'])
df_pass_2 = df_numerical_stats[df_numerical_stats['pass'] == 2].set_index(['mode', 'profiler'])

# Side-by-side frame with a top-level 'pass0' / 'pass2' column index.
aligned_df = pd.concat([df_pass_0, df_pass_2], axis=1, keys=['pass0', 'pass2'])


def _build_comparison_row(row):
    """Collapse one aligned (pass0, pass2) row into a flat comparison record."""
    mode, profiler = row.name
    return pd.Series({
        'passn': 'comparison',  # Mark as comparison for clarity
        'mode': mode,
        'profiler': profiler,
        'taxid_counts': row['pass0']['taxid_counts'],  # Metadata is identical across passes
        'expected_counts': row['pass0']['expected_counts'],
        'pass0_counts': row['pass0']['observed_counts'],
        'pass2_counts': row['pass2']['observed_counts'],
        'pass0_MAE': row['pass0']['MAE_counts'],
        'pass0_diff': row['pass0']['diff_counts'],
        'pass2_MAE': row['pass2']['MAE_counts'],
        'pass2_diff': row['pass2']['diff_counts'],
    })


# Build the per-(mode, profiler) comparison table and drop the MultiIndex.
comparison_df = aligned_df[['pass0', 'pass2']].apply(_build_comparison_row, axis=1).reset_index(drop=True)
In [111]:
plt.rc('axes', linewidth=0.65)

# Filter the DataFrame for the specific mode (5 in this case)
# NOTE(review): the mode is hardcoded — consider lifting it to a config constant.
mode_df = comparison_df[comparison_df['mode'] == 5]

# Get unique profilers for layout (one column of subplots per profiler)
profilers = mode_df['profiler'].unique()
num_profilers = len(profilers)

# Set up the figure with GridSpec for custom layout: 3 rows of comparisons,
# sharing both axes across panels.
fig, axs = plt.subplots(3, num_profilers, figsize=(2 * num_profilers, 6), sharex=True, sharey=True)

# Define plotting functions with regression line
def plot_pass0_vs_pass2(ax, data, profiler, i):
    x = data['pass0_counts'].values[0]
    y = data['pass2_counts'].values[0]
    
    # Scatter plot
    ax.scatter(x, y, alpha=0.7)
    
    # Plot y=x line
    min_val, max_val = min(x.min(), y.min()), max(x.max(), y.max())
    ax.plot([0, 350000], [0, 350000], '--', color='#DC267F')
    
    # Pearson correlation and R^2
    slope, intercept, r_value, _, _ = linregress(x, y)
    r2 = r_value ** 2
    ax.annotate(f"Slope: {slope:.2f}\nR$^2$: {r2:.2f}", xy=(0.05, 0.95),
                xycoords='axes fraction', fontsize=12, verticalalignment='top')
    if i == 0:
        ax.set_ylabel("Pass0 vs Pass2", fontsize=12)
    # Set profiler title on top
    ax.set_title(profiler, fontsize=14)

    sns.despine(top=True, right=True, ax=ax)

    
def plot_with_regression(ax, x, y, xlabel, i):
    # Scatter plot
    ax.scatter(x, y, alpha=0.7)

    # Regression line
    slope, intercept, r_value, _, _ = linregress(x, y)
    ax.plot(x, slope * x + intercept, '-', color='#DC267F')

    # Pearson correlation and R^2
    r2 = r_value ** 2
    ax.annotate(f"Slope: {slope:.2f}\nR$^2$: {r2:.2f}", xy=(0.05, 0.95),
                xycoords='axes fraction', fontsize=12, verticalalignment='top')
    if i == 0:
        ax.set_ylabel(xlabel, fontsize=12)
    sns.despine(top=True, right=True, ax=ax)

# Iterate through profilers and create subplots for each
for i, profiler in enumerate(profilers):
    profiler_data = mode_df[mode_df['profiler'] == profiler]
    x_pass0 = profiler_data['pass0_counts'].values[0]
    x_expected = profiler_data['expected_counts'].values[0]
    y_pass2 = profiler_data['pass2_counts'].values[0]
    
    # Pass0 vs Pass2 (Row 1)
    ax = fig.add_subplot(axs[0, i])
    plot_pass0_vs_pass2(ax, profiler_data, profiler, i)
    
    # Pass2 vs Expected (Row 2)
    ax = fig.add_subplot(axs[1, i])
    plot_with_regression(ax, x_expected, y_pass2, "Pass2 vs Expected", i)
    
    # Pass0 vs Expected (Row 3)
    ax = fig.add_subplot(axs[2, i])
    plot_with_regression(ax, x_expected, x_pass0, "Pass0 vs Expected", i)

    ax.set_xticks([0, 300000])
    ax.set_yticks([0, 300000])



# Aesthetic adjustments
for ax in axs[-1, :]:
    ax.set_xlabel("Counts", fontsize=12)  # Only bottom row has x-labels

plt.tight_layout()

for format in ['png', 'tiff']: 
    plt.savefig(f'{RESULTS_DIR}/figures/paper/figD.{format}', dpi=DPI)


plt.show()
No description has been provided for this image

We see that the correlation in number of counts for the ground truth species is almost one. Additionally, if there are differences, they improve the correlation to the expected amount of counts, so it has a positive impact.

Checking MAE values¶

We are going to see if the MAE values are equal or higher in pass2 vs pass0.

In [112]:
# NOTE: seaborn and matplotlib are already imported at the top of the notebook;
# the redundant in-cell re-imports were removed.

# Long format: one row per (mode, profiler, pass_type) with the list of diffs
comparison_df_exploded = comparison_df.melt(
    id_vars=['mode', 'profiler'],
    value_vars=['pass0_diff', 'pass2_diff'],
    var_name='pass_type',
    value_name='diff_value'
)

# Convert `diff_value` from a list to separate rows
comparison_df_exploded = comparison_df_exploded.explode('diff_value')

# Ensure `diff_value` is numeric after exploding
comparison_df_exploded['diff_value'] = pd.to_numeric(comparison_df_exploded['diff_value'])


# Get unique profilers
profilers = comparison_df_exploded['profiler'].unique()
num_profilers = len(profilers)

# One boxplot panel per profiler, sharing the y axis
fig, axs = plt.subplots(1, num_profilers, figsize=(6 * num_profilers, 6), sharey=True)

# Iterate through profilers and plot the data
for i, profiler in enumerate(profilers):
    ax = axs[i]
    profiler_data = comparison_df_exploded[comparison_df_exploded['profiler'] == profiler]

    # Pass0 vs pass2 diffs side by side per mode
    sns.boxplot(
        data=profiler_data,
        x='mode',
        y='diff_value',
        hue='pass_type',
        ax=ax,
        palette='viridis'
    )

    # Set plot title and labels
    ax.set_title(f'Profiler: {profiler}', fontsize=14)
    ax.set_xlabel('Mode', fontsize=12)
    if i == 0:
        ax.set_ylabel('Diff Values', fontsize=12)
    else:
        ax.set_ylabel('')

# Adjust layout and display
plt.tight_layout()
plt.show()
No description has been provided for this image

We see that, again, the differences in MAE are not due to the pass, with the exception of ganon (and, by extension, the mean), where running the data with pass2 improves the error rate.

Thus, both based on nominal and numerical criteria, running with pass2 is the best option.

Which mode is best for the data?¶

We have used several modes to study how it affects the detection of species, and the number of reads it detects. We are going to use the same methods as before to check for the answer.

  • We are going to check which mode (using S=2 and S=7) retains the best F1 scores.
  • We are going to check which mode keeps the best correlation / MAE with expected counts.

Checking F1-scores¶

In [113]:
# Keep pass 2 rows, the S values of interest, and only per-profiler (+ mean) columns
profiler_cols = [f'{p}_norm' for p in LIST_PROFILERS + ['mean']]
nominal_mask = (
    (df_nominal_stats['pass'] == 2)
    & df_nominal_stats['S'].isin([0, 2, 7, 15])
    & df_nominal_stats['column'].isin(profiler_cols)
)
df_nominal_stats_sub = df_nominal_stats[nominal_mask]
In [114]:
# Preview the filtered nominal-stats table (rich display as the cell output)
df_nominal_stats_sub
Out[114]:
pass mode S column precision recall f1 kappa TP|FN|FP|TN
1260 2 1 0 centrifuge_norm 1.000 0.050 0.095 0.094 (3, 57, 0, 3522)
1261 2 1 0 ganon_norm 1.000 0.050 0.095 0.094 (3, 57, 0, 3522)
1262 2 1 0 kaiju_norm 1.000 0.100 0.182 0.179 (6, 54, 0, 3522)
1263 2 1 0 kraken2_norm 1.000 0.217 0.356 0.352 (13, 47, 0, 3522)
1264 2 1 0 krakenuniq_norm 1.000 0.067 0.125 0.123 (4, 56, 0, 3522)
... ... ... ... ... ... ... ... ... ...
2507 2 9 15 ganon_norm 0.617 0.967 0.753 0.752 (58, 2, 36, 12333)
2508 2 9 15 kaiju_norm 0.533 0.950 0.683 0.681 (57, 3, 50, 12319)
2509 2 9 15 kraken2_norm 0.891 0.950 0.919 0.919 (57, 3, 7, 12362)
2510 2 9 15 krakenuniq_norm 0.400 0.967 0.566 0.563 (58, 2, 87, 12282)
2516 2 9 15 mean_norm 0.310 0.967 0.470 0.466 (58, 2, 129, 12240)

216 rows × 9 columns

In [115]:
plt.rc('axes', linewidth=0.65)

# Long format: one row per (mode, S, column, metric)
melted_df = pd.melt(
    df_nominal_stats_sub,
    id_vars=['mode', 'S', 'column'],
    value_vars=['recall', 'precision', 'f1'],
    var_name='metric',
    value_name='score'
)

# BUGFIX: the colormap is applied to S values, so normalize over the S range
# (the original normalized over 'mode', which only worked via clipping).
norm = Normalize(vmin=melted_df['S'].min(), vmax=melted_df['S'].max())
cmap = plt.cm.viridis  # Choose a colormap (e.g., 'viridis', 'plasma', 'cividis')

# FacetGrid: one row per metric, one column per profiler
g = sns.FacetGrid(
    melted_df, 
    col='column', 
    row='metric', 
    height=2, 
    sharey=True, 
    sharex=True
)

# One line per S value, coloured by S through the colormap
def lineplot_with_cmap(data, **kwargs):
    for S in sorted(data['S'].unique()):
        subset = data[data['S'] == S]
        # BUGFIX: label referenced a leaked global `mode`; label by the S drawn
        plt.plot(subset['mode'], subset['score'], label=f"S {S}",
                 color=cmap(norm(S)), marker='o')

g.map_dataframe(lineplot_with_cmap)

# Create a legend with one entry per discrete S value
handles = [
    plt.Line2D([0], [0], color=cmap(norm(s_val)), marker='o', linestyle='', label=f"S {s_val}")
    for s_val in sorted(melted_df['S'].unique())
]
plt.legend(
    handles=handles, 
    title="", 
    bbox_to_anchor=(1.05, 3), 
    loc='center left', 
    frameon=False
)

# Set x-axis ticks to the modes actually present
g.set(xticks=df_nominal_stats_sub['mode'].unique())

for ax in g.axes.ravel():
    ax.set_title('')

# Column titles: profiler names (strip the '_norm' suffix)
for ax, profiler in zip(g.axes[0, :], melted_df['column'].unique()):
    ax.set_title(profiler.replace('_norm', ''))

# Row labels: one per metric
for ax, score in zip(g.axes[:, 0], ['recall', 'precision', 'F1-score']):
    ax.set_ylabel(score)

plt.tight_layout()

for format in ['png', 'tiff']: 
    plt.savefig(f'{RESULTS_DIR}/figures/paper/figF.{format}', dpi=DPI, bbox_inches='tight')

plt.show()
/tmp/ipykernel_2918775/1058285318.py:60: UserWarning: Tight layout not applied. tight_layout cannot make Axes height small enough to accommodate all Axes decorations.
  plt.tight_layout()
No description has been provided for this image

Based on the ground-truth table, the precision, recall and F1-score are more S-dependent than mode-dependent, and their patterns are more profiler-dependent than anything else. Therefore, the mode is not especially relevant for proper species detection; moreover, since higher mode values lead to lower precision (more FPs), we could consider using a lower mode if we want to minimize the number of FPs.

Checking correlation and MAE values¶

In [116]:
# Keep pass 2 rows for the odd modes under study
pass2_mask = df_numerical_stats['pass'] == 2
mode_mask = df_numerical_stats['mode'].isin([1, 3, 5, 7, 9])
df_numerical_stats_sub = df_numerical_stats[pass2_mask & mode_mask]
In [117]:
# Get unique profilers for layout
profilers = df_numerical_stats_sub['profiler'].unique()
modes = df_numerical_stats_sub['mode'].unique()


# Grid: one row per profiler, one column per mode
fig, axs = plt.subplots(len(profilers), len(modes), figsize=(2 * len(modes), 2 * len(profilers)), sharex=True, sharey=True)


def plot_regression(ax, data, profiler, i):
    """Scatter expected vs observed counts with a y=x line and slope/R^2."""
    x = data['expected_counts'].values[0]
    y = data['observed_counts'].values[0]

    # Scatter plot
    ax.scatter(x, y, alpha=0.7)

    # y = x reference line; fixed range keeps the panels comparable
    ax.plot([0, 350000], [0, 350000], 'r--')

    # Linear fit: slope and R^2 annotation
    slope, intercept, r_value, _, _ = linregress(x, y)
    r2 = r_value ** 2
    ax.annotate(f"Slope: {slope:.2f}\nR2: {r2:.2f}", xy=(0.05, 0.95),
                xycoords='axes fraction', fontsize=10, verticalalignment='top')


# One panel per (profiler, mode) pair
for i, profiler in enumerate(profilers):
    for j, mode in enumerate(modes):
        profiler_data = df_numerical_stats_sub[(df_numerical_stats_sub['profiler'] == profiler) & (df_numerical_stats_sub['mode'] == mode)]

        # BUGFIX: use the grid axes directly; passing an existing Axes to
        # fig.add_subplot() is deprecated and removed in recent matplotlib.
        ax = axs[i, j]
        plot_regression(ax, profiler_data, profiler, i)

        if j == 0:
            ax.set_ylabel(profiler, fontsize=12)
            # Set profiler title on top
        if i == 0:
            ax.set_title(f'Mode {mode}', fontsize=14)

# Aesthetic adjustments
for ax in axs[-1, :]:
    ax.set_xlabel("Counts", fontsize=12)  # Only bottom row has x-labels

plt.tight_layout()
plt.show()
No description has been provided for this image
In [118]:
# Prepare the data for plotting: expand per-row diff_counts lists to long format
expanded_data = []
for _, row in df_numerical_stats_sub.iterrows():
    profiler = row['profiler']
    mode = row['mode']
    for value in row['diff_counts']:
        expanded_data.append({'profiler': profiler, 'mode': mode, 'diff_counts': value})

# Convert to a new DataFrame
plot_data = pd.DataFrame(expanded_data)

# Initialize the grid of plots with seaborn
g = sns.FacetGrid(plot_data, col="profiler", height=4, aspect=1)

# BUGFIX: assign `hue` explicitly — passing `palette` without `hue` is
# deprecated and will be removed in seaborn v0.14 (was emitting FutureWarnings).
g.map_dataframe(
    sns.boxplot,
    x="mode",
    y="diff_counts",
    hue="mode",
    palette="viridis",
    legend=False
)

# Adjust the legend and titles
g.set_axis_labels("Mode", "Difference Counts")
g.set_titles(col_template="{col_name} Profiler")

plt.tight_layout()
plt.show()
/home/neuroimm/.local/share/mamba/envs/EVs/lib/python3.13/site-packages/seaborn/axisgrid.py:854: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  func(*plot_args, **plot_kwargs)
/home/neuroimm/.local/share/mamba/envs/EVs/lib/python3.13/site-packages/seaborn/axisgrid.py:854: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  func(*plot_args, **plot_kwargs)
/home/neuroimm/.local/share/mamba/envs/EVs/lib/python3.13/site-packages/seaborn/axisgrid.py:854: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  func(*plot_args, **plot_kwargs)
/home/neuroimm/.local/share/mamba/envs/EVs/lib/python3.13/site-packages/seaborn/axisgrid.py:854: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  func(*plot_args, **plot_kwargs)
/home/neuroimm/.local/share/mamba/envs/EVs/lib/python3.13/site-packages/seaborn/axisgrid.py:854: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  func(*plot_args, **plot_kwargs)
/home/neuroimm/.local/share/mamba/envs/EVs/lib/python3.13/site-packages/seaborn/axisgrid.py:854: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  func(*plot_args, **plot_kwargs)
No description has been provided for this image
In [119]:
from matplotlib.ticker import MaxNLocator

plt.rc('axes', linewidth=0.65)

# One scatter panel per profiler: correlation vs R^2, coloured by mode
grid = sns.FacetGrid(df_numerical_stats_sub, col="profiler", height=2.3, aspect=0.85, sharex=False, sharey=False)

grid.map_dataframe(
    sns.scatterplot,
    x="corr_counts",
    y="R2_counts",
    hue="mode",
    palette="viridis"
)

# Legend outside the grid, titled by mode
grid.add_legend(title="Mode", bbox_to_anchor=(1.05, 0.5))

grid.set_axis_labels("Correlation Counts", "R² Counts")
grid.set_titles(col_template="{col_name}")

# Sparse x ticks so the narrow panels stay readable
for panel_ax in grid.axes.flat:
    panel_ax.xaxis.set_major_locator(MaxNLocator(nbins=2))


plt.tight_layout()

for format in ['png', 'tiff']:
    plt.savefig(f'{RESULTS_DIR}/figures/paper/figGA.{format}', dpi=DPI, bbox_inches='tight')

plt.show()
No description has been provided for this image
In [120]:
# One scatter panel per profiler: MAE vs MAED, coloured by mode
grid = sns.FacetGrid(df_numerical_stats_sub, col="profiler", height=2.3, aspect=0.85, sharex=False, sharey=False)

grid.map_dataframe(
    sns.scatterplot,
    x="MAE_counts",
    y="MAED_counts",
    hue="mode",
    palette="viridis"
)

# Legend outside the grid, titled by mode
grid.add_legend(title="Mode", bbox_to_anchor=(1.05, 0.5))

grid.set_axis_labels("MAE", "MAED")
grid.set_titles(col_template="{col_name}")

plt.tight_layout()

for format in ['png', 'tiff']:
    plt.savefig(f'{RESULTS_DIR}/figures/paper/figGB.{format}', dpi=DPI, bbox_inches='tight')


plt.show()
No description has been provided for this image
In [121]:
# One scatter panel per profiler: MRE vs MRED, coloured by mode
grid = sns.FacetGrid(df_numerical_stats_sub, col="profiler", height=2.3, aspect=0.85, sharex=False, sharey=False)

grid.map_dataframe(
    sns.scatterplot,
    x="MRE_counts",
    y="MRED_counts",
    hue="mode",
    palette="viridis"
)

# Legend outside the grid, titled by mode
grid.add_legend(title="Mode", bbox_to_anchor=(1.05, 0.5))

grid.set_axis_labels("MRE", "MRED")
grid.set_titles(col_template="{col_name}")

plt.tight_layout()

for format in ['png', 'tiff']:
    plt.savefig(f'{RESULTS_DIR}/figures/paper/figGC.{format}', dpi=DPI, bbox_inches='tight')


plt.show()
No description has been provided for this image
In [122]:
# Work on pass == 2 rows only
df = df_numerical_stats[df_numerical_stats['pass'] == 2].copy()


# Per-mode MRED of the 'mean' profiler, used as the reference value
df_mean = df[df['profiler'] == 'mean'][['mode', 'MRED_counts']].rename(columns={'MRED_counts': 'mean_MRED'})
df = df.merge(df_mean, on='mode')
df['difference'] = df['MRED_counts'] - df['mean_MRED']


# Filter out the mean profiler itself
df_filtered = df[df['profiler'] != 'mean'].sort_values('profiler')


# Paired one-sided Wilcoxon signed-rank tests (the original comment said
# "Mann-Whitney U", but the code correctly uses the paired Wilcoxon test):
# is each profiler's MRED greater than the mean profiler's MRED across modes?
profiler_groups = df_filtered['profiler'].unique()
p_values = []
results = []


for profiler in profiler_groups:
    group_data = df_filtered[df_filtered['profiler'] == profiler]
    mean_data = group_data['MRED_counts'].values
    prof_data = group_data['mean_MRED'].values
    stat, p = wilcoxon(mean_data, prof_data, alternative='greater')
    p_values.append(p)
    results.append({'profiler': profiler, 'statistic': stat, 'p_value': p})

# Correct p-values for multiple testing (Benjamini-Hochberg FDR)
p_values_corrected = multipletests(p_values, method='fdr_bh')[1]

for i, result in enumerate(results):
    result['p_value_corrected'] = p_values_corrected[i]


# Convert results to DataFrame
results_df = pd.DataFrame(results)

display(results_df)

# Plot the differences
fig, ax = plt.subplots(1, 1, figsize=(5, 3))

# BUGFIX: assign `hue` explicitly — passing `palette` without `hue` is
# deprecated and will be removed in seaborn v0.14 (was emitting FutureWarnings).
sns.boxplot(data=df_filtered, x='profiler', y='difference', hue='profiler',
            palette=custom_palette, legend=False, ax=ax)
# sns.stripplot(data=df_filtered, x='profiler', y='difference', color='black', alpha=0.5, jitter=True)
medians = df_filtered.groupby("profiler")['difference'].median()
for i, median in enumerate(medians):
        ax.text(i, median , f"{median:.2f}", ha='center', va='bottom', fontsize=8)


plt.axhline(0, linestyle='--', color='#848484', linewidth=0.5)
plt.ylabel('MRED difference from mean')
plt.xlabel('')

sns.despine(top=True, right=True)

plt.tight_layout()

# NOTE(review): the MAED cell below also saves to figJ and overwrites this
# figure — one of the two filenames is probably wrong; confirm intended names.
for format in ['png', 'tiff']: 
    plt.savefig(f'{RESULTS_DIR}/figures/paper/figJ.{format}', dpi=DPI, bbox_inches='tight')


plt.show()
profiler statistic p_value p_value_corrected
0 centrifuge 45.000 0.002 0.002
1 ganon 45.000 0.002 0.002
2 kaiju 45.000 0.002 0.002
3 kraken2 45.000 0.002 0.002
4 krakenuniq 45.000 0.002 0.002
/tmp/ipykernel_2918775/2421930555.py:45: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=df_filtered, x='profiler', y='difference', palette=custom_palette, ax=ax)
/tmp/ipykernel_2918775/2421930555.py:45: UserWarning: The palette list has more values (6) than needed (5), which may not be intended.
  sns.boxplot(data=df_filtered, x='profiler', y='difference', palette=custom_palette, ax=ax)
No description has been provided for this image
In [123]:
# Work on pass == 2 rows only
df = df_numerical_stats[df_numerical_stats['pass'] == 2].copy()


# Per-mode MAED of the 'mean' profiler, used as the reference value
df_mean = df[df['profiler'] == 'mean'][['mode', 'MAED_counts']].rename(columns={'MAED_counts': 'mean_MAED'})
df = df.merge(df_mean, on='mode')
df['difference'] = df['MAED_counts'] - df['mean_MAED']


# Filter out the mean profiler itself
df_filtered = df[df['profiler'] != 'mean'].sort_values('profiler')


# Paired one-sided Wilcoxon signed-rank tests (the original comment said
# "Mann-Whitney U", but the code correctly uses the paired Wilcoxon test):
# is each profiler's MAED greater than the mean profiler's MAED across modes?
profiler_groups = df_filtered['profiler'].unique()
p_values = []
results = []


for profiler in profiler_groups:
    group_data = df_filtered[df_filtered['profiler'] == profiler]
    mean_data = group_data['MAED_counts'].values
    prof_data = group_data['mean_MAED'].values
    stat, p = wilcoxon(mean_data, prof_data, alternative='greater')
    p_values.append(p)
    results.append({'profiler': profiler, 'statistic': stat, 'p_value': p})

# Correct p-values for multiple testing (Benjamini-Hochberg FDR)
p_values_corrected = multipletests(p_values, method='fdr_bh')[1]

for i, result in enumerate(results):
    result['p_value_corrected'] = p_values_corrected[i]


# Convert results to DataFrame
results_df = pd.DataFrame(results)

display(results_df)

# Plot the differences
fig, ax = plt.subplots(1, 1, figsize=(5, 3))

# BUGFIX: assign `hue` explicitly — passing `palette` without `hue` is
# deprecated and will be removed in seaborn v0.14 (was emitting FutureWarnings).
sns.boxplot(data=df_filtered, x='profiler', y='difference', hue='profiler',
            palette=custom_palette, legend=False, ax=ax)
# sns.stripplot(data=df_filtered, x='profiler', y='difference', color='black', alpha=0.5, jitter=True)
medians = df_filtered.groupby("profiler")['difference'].median()
for i, median in enumerate(medians):
        ax.text(i, median , f"{median:.2f}", ha='center', va='bottom', fontsize=8)



plt.axhline(0, linestyle='--', color='#848484', linewidth=0.5)
plt.ylabel('MAED difference from mean')
plt.xlabel('')

sns.despine(top=True, right=True)

plt.tight_layout()

# NOTE(review): this overwrites the figJ files saved by the MRED cell above —
# one of the two filenames is probably wrong; confirm intended names.
for format in ['png', 'tiff']: 
    plt.savefig(f'{RESULTS_DIR}/figures/paper/figJ.{format}', dpi=DPI, bbox_inches='tight')


plt.show()
profiler statistic p_value p_value_corrected
0 centrifuge 45.000 0.002 0.002
1 ganon 7.000 0.973 0.973
2 kaiju 45.000 0.002 0.002
3 kraken2 45.000 0.002 0.002
4 krakenuniq 45.000 0.002 0.002
/tmp/ipykernel_2918775/683032758.py:45: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=df_filtered, x='profiler', y='difference', palette=custom_palette, ax=ax)
/tmp/ipykernel_2918775/683032758.py:45: UserWarning: The palette list has more values (6) than needed (5), which may not be intended.
  sns.boxplot(data=df_filtered, x='profiler', y='difference', palette=custom_palette, ax=ax)
No description has been provided for this image
In [124]:
# Inspect the pass-2 numerical stats for modes 3 and 7 (rich display as output)
df_numerical_stats_sub[df_numerical_stats_sub['mode'].isin([3, 7])]
Out[124]:
pass mode profiler diff_counts MRE_counts MRED_counts MAE_counts MAED_counts MACV_counts corr_counts R2_counts taxid_counts species_counts expected_counts observed_counts diff_abundance MRE_abundance MRED_abundance MAE_abundance MAED_abundance MACV_abundance corr_abundance R2_abundance taxid_abundance species_abundance expected_abundance observed_abundance
66 2 3 centrifuge [-21.29664, -49.24053333333333, -72.7157333333... -16.551 23.325 16.808 23.141 1.377 0.642 0.586 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [245948.0, 95174.0, 51158.0, 53926.0, 192262.0... [-1.6074186089308142, -36.542036383614835, -65... 4.325 29.161 25.668 14.498 0.565 0.803 0.586 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.074768168470912, 1.1898368178072218, 0.6395...
67 2 3 ganon [-6.65408, -46.7088, -73.0704, -50.604, -40.65... -8.143 26.077 19.274 19.361 1.005 0.683 0.580 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [291706.0, 99921.0, 50493.0, 61745.0, 185450.0... [3.2451929989731525, -41.05730353135007, -70.2... 1.598 28.843 24.447 15.387 0.629 0.755 0.580 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.226412281217911, 1.105175558787186, 0.55847...
68 2 3 kaiju [-52.1184, -78.6064, -90.92586666666666, -87.8... -48.224 28.792 48.224 28.792 0.597 0.398 0.316 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [149630.0, 40113.0, 17014.0, 15201.0, 55766.0,... [-6.421758363959896, -58.18904401137832, -82.2... 1.190 56.270 47.277 30.539 0.646 0.779 0.316 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [2.9243200511262533, 0.7839554247866565, 0.332...
69 2 3 kraken2 [-61.60288, -91.8592, -98.78133333333334, -99.... -50.081 32.137 50.081 32.137 0.642 0.278 0.161 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [119991.0, 15264.0, 2285.0, 1020.0, 17562.0, 4... [-11.271327674607562, -81.18811057530996, -97.... 15.354 74.263 67.854 33.862 0.499 0.642 0.161 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [2.7727710101685137, 0.3527229267129384, 0.052...
70 2 3 krakenuniq [-19.67168, -48.3264, -68.4816, -54.56, -41.30... -16.778 23.985 17.064 23.782 1.394 0.641 0.572 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [251026.0, 96888.0, 59097.0, 56800.0, 183437.0... [-0.37064433374560224, -35.91043018258363, -60... 3.219 29.748 25.644 15.417 0.601 0.796 0.572 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.11341736457045, 1.2016794340765569, 0.73296...
71 2 3 mean [-25.802934689620443, -53.521229040380774, -73... -20.388 21.741 20.388 21.741 1.066 0.601 0.601 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [231865.8290949361, 87147.69554928604, 49730.5... [-3.0213808182247845, -44.51661135451871, -68.... 12.824 38.048 34.447 20.629 0.599 0.804 0.485 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.0305818494304755, 1.040313537102774, 0.5921...
90 2 7 centrifuge [-20.50976, -47.4128, -69.89973333333333, -54.... -15.904 22.759 16.191 22.556 1.393 0.653 0.603 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [248407.0, 98601.0, 56438.0, 56618.0, 196085.0... [-2.4354608041876133, -35.45564920173818, -63.... 3.218 27.934 24.137 14.425 0.598 0.801 0.603 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.048891849869137, 1.2102065774674091, 0.6927...
91 2 7 ganon [2.87936, -39.4528, -60.080533333333335, -36.5... -2.179 24.830 19.200 15.894 0.828 0.734 0.629 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [321498.0, 113526.0, 74849.0, 79318.0, 200909.... [5.609860704572441, -37.84582876437132, -59.02... 0.418 25.489 20.717 14.855 0.717 0.754 0.629 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.3003081470178888, 1.1653907106680377, 0.768...
92 2 7 kaiju [-49.07328, -77.00266666666667, -89.2613333333... -44.465 29.503 44.465 29.503 0.664 0.419 0.323 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [159146.0, 43120.0, 20135.0, 17133.0, 58980.0,... [-9.114576548016046, -58.958236888878254, -80.... -0.890 52.653 43.948 29.012 0.660 0.747 0.323 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [2.8401694828744986, 0.7695330583335327, 0.359...
93 2 7 kraken2 [-44.83456, -87.64213333333333, -97.136, -97.6... -36.009 35.648 36.009 35.648 0.990 0.386 0.212 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [172392.0, 23171.0, 5370.0, 2881.0, 26475.0, 6... [-2.383528721026508, -78.1324804708156, -94.93... 13.233 63.080 58.103 27.897 0.480 0.682 0.212 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.0505147274679216, 0.4100159911722076, 0.095...
94 2 7 krakenuniq [-19.67168, -48.3264, -68.4816, -54.56, -41.30... -16.778 23.985 17.064 23.782 1.394 0.641 0.572 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [251026.0, 96888.0, 59097.0, 56800.0, 183437.0... [-0.37064433374560224, -35.91043018258363, -60... 3.219 29.748 25.644 15.417 0.601 0.796 0.572 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.11341736457045, 1.2016794340765569, 0.73296...
95 2 7 mean [-21.385476139592473, -51.124221746876785, -68... -16.856 21.928 17.401 21.498 1.235 0.629 0.614 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [245670.38706377352, 91642.08422460603, 58892.... [-1.5125795343509907, -43.63677614078014, -64.... 9.779 34.128 30.912 17.460 0.565 0.803 0.524 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.0777318895515315, 1.0568104473603723, 0.659...
In [125]:
def plot_prof_couns(df_combo, fig_mode=None):
    """Strip-plot of per-species count differences (PRE) for each profiler.

    For every profiler, overlays the median (pink tick) and a median ± std
    interval of ``diff_counts`` on top of the raw per-species points, then
    saves the figure as figH{fig_mode}.{png,tiff}.

    Parameters
    ----------
    df_combo : pd.DataFrame
        Must contain a 'profiler' column and a 'diff_counts' column whose
        cells are list-like per-species differences.
    fig_mode : optional
        Value embedded in the output file name. Defaults to the
        notebook-global ``mode`` to stay backward compatible with the
        original implicit global lookup.
    """
    if fig_mode is None:
        fig_mode = mode  # NOTE(review): falls back to the notebook-global `mode`

    # One row per (profiler, species) difference so seaborn can jitter them.
    df_expanded = df_combo.explode('diff_counts')
    df_expanded['diff_counts'] = pd.to_numeric(df_expanded['diff_counts'])

    plt.figure(figsize=(10, 2.5))

    # Median marker plus a median ± std spread bar per profiler.
    for profiler, countsdiff in zip(df_combo['profiler'], df_combo['diff_counts']):
        center = np.median(countsdiff)
        spread = np.std(countsdiff)
        plt.scatter(center, profiler, color='#DC267F', label='_nolegend_',
                    s=100, zorder=3, marker='|')
        plt.plot([center - spread, center + spread], [profiler, profiler],
                 color='#DC267F', label='_nolegend_')

    sns.stripplot(
        data=df_expanded,
        x='diff_counts',
        y='profiler',
        jitter=True,  # jitter for better visibility of overlapping points
        size=5,
        alpha=0.7,
        color="#648FFF"
    )

    # Reference line at zero difference (perfect agreement with expectation).
    plt.axvline(0, color='#848484', linestyle='--', linewidth=1)

    sns.despine(top=True, right=True)

    plt.xlabel('PRE', fontsize=12)
    plt.ylabel('', fontsize=12)
    plt.grid(True, axis='x', linestyle='--', alpha=0.6)
    plt.tight_layout()

    # `fmt` avoids shadowing the built-in `format`.
    for fmt in ['png', 'tiff']:
        plt.savefig(f'{RESULTS_DIR}/figures/paper/figH{fig_mode}.{fmt}',
                    dpi=DPI, bbox_inches='tight')

    plt.show()
In [126]:
# Re-plot the per-profiler PRE distributions for each candidate mode value.
for mode in [3, 5, 7]:
    print(mode)
    # Pass-2 results only, restricted to the current mode.
    # NOTE(review): the original comment mentioned choosing a large "S"
    # value, but no S filter is applied here — confirm whether one is needed.
    is_selected = (df_numerical_stats['pass'] == 2) & (df_numerical_stats['mode'] == mode)
    df_combo = df_numerical_stats[is_selected]

    plot_prof_couns(df_combo)
3
No description has been provided for this image
5
No description has been provided for this image
7
No description has been provided for this image

Based on these results, we see that a higher mode increases the correlation (slope) and R2; and decreases the MAE (although it increases the MAED). However, again, we see that this effect is very profiler-dependent. It is interesting to note, however, that there are some species that have a very bad detection rate throughout the profiler/mode/S values.

Based on that, considering that higher modes (1) lower the MAE and increase the correlation but (2) decrease the precision — and considering that effect (1) is more pronounced than effect (2) — we are going to choose a high but not extreme mode value. For instance, mode = 7.

Which species have the worst assignment?¶

In [127]:
# Lookup table mapping each in silico sample species to its kingdom,
# used to stratify the PRE analysis by Bacteria / Fungi / Virus below.
species_to_kingdom = {
    # Bacteria
    "Cutibacterium acnes": "Bacteria",
    "Lactobacillus acidophilus": "Bacteria",
    "Bifidobacterium bifidum": "Bacteria",
    "Akkermansia muciniphila": "Bacteria",
    "Blautia coccoides": "Bacteria",
    "Blautia luti": "Bacteria",
    "Bacteroides ovatus": "Bacteria",
    "Bacteroides intestinalis": "Bacteria",
    "Bacteroides fragilis": "Bacteria",
    "Escherichia coli": "Bacteria",
    "Dietzia lutea": "Bacteria",
    "Ruthenibacterium lactatiformans": "Bacteria",
    "Faecalibacterium prausnitzii": "Bacteria",
    "Parabacteroides distasonis": "Bacteria",
    "Parabacteroides merdae": "Bacteria",
    "Fusicatenibacter saccharivorans": "Bacteria",
    "Erysipelatoclostridium ramosum": "Bacteria",
    "Streptococcus salivarius": "Bacteria",
    "Hungatella hathewayi": "Bacteria",
    "Eisenbergiella porci": "Bacteria",
    "Butyricimonas faecalis": "Bacteria",
    "Alistipes indistinctus": "Bacteria",
    "Alistipes finegoldii": "Bacteria",
    "Eubacterium callanderi": "Bacteria",
    "Acidaminococcus intestini": "Bacteria",
    
    # Fungi
    "Aspergillus chevalieri": "Fungi",
    "Aspergillus flavus": "Fungi",
    "Saccharomyces cerevisiae": "Fungi",
    "Saccharomyces kudriavzevii": "Fungi",
    "Saccharomyces mikatae": "Fungi",
    "Candida albicans": "Fungi",
    "Candida dubliniensis": "Fungi",
    "Candida orthopsilosis": "Fungi",
    "Malassezia restricta": "Fungi",
    "Alternaria dauci": "Fungi",
    "Kazachstania africana": "Fungi",
    "Penicillium digitatum": "Fungi",
    "Pichia kudriavzevii": "Fungi",
    "Trichoderma asperellum": "Fungi",
    "Akanthomyces muscarius": "Fungi",
    "Fusarium falciforme": "Fungi",
    "Eremothecium sinecaudum": "Fungi",
    "Cryptococcus decagattii": "Fungi",
    "Kwoniella shandongensis": "Fungi",
    "Puccinia triticina": "Fungi",
    
    # Virus
    "Tobacco mosaic virus": "Virus",
    "Rotavirus A": "Virus",
    "Rotavirus B": "Virus",
    "Rotavirus C": "Virus",
    "Bacteriophage P2": "Virus",
    "Escherichia phage T4": "Virus",
    "Human immunodeficiency virus 1": "Virus",
    "Human adenovirus 7": "Virus",
    "Hepatitis C virus": "Virus",
    "Bovine alphaherpesvirus 2": "Virus",
    "Human herpesvirus 4 type 2": "Virus",
    "Mimivirus terra2": "Virus",
    "Dengue virus": "Virus",
    "Norovirus GI": "Virus",
    "Zaire ebolavirus": "Virus"
}
In [128]:
mode = 5
In [129]:
# Keep only pass-2 results at the selected mode, indexed by profiler
# so individual profiler rows can be pulled with .loc below.
is_pass2 = df_numerical_stats['pass'] == 2
is_mode = df_numerical_stats['mode'] == mode
df_combo = df_numerical_stats[is_pass2 & is_mode].set_index('profiler')
In [130]:
df_combo
Out[130]:
pass mode diff_counts MRE_counts MRED_counts MAE_counts MAED_counts MACV_counts corr_counts R2_counts taxid_counts species_counts expected_counts observed_counts diff_abundance MRE_abundance MRED_abundance MAE_abundance MAED_abundance MACV_abundance corr_abundance R2_abundance taxid_abundance species_abundance expected_abundance observed_abundance
profiler
centrifuge 2 5 [-20.92736, -48.3728, -71.28853333333333, -55.... -16.211 23.038 16.486 22.842 1.386 0.648 0.595 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [247102.0, 96801.0, 53834.0, 55355.0, 194199.0... [-2.0102896294236245, -36.0216836665145, -64.4... 3.835 28.550 24.922 14.446 0.580 0.803 0.595 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.0621784490805117, 1.1995934312528531, 0.667...
ganon 2 5 [-0.50112, -43.06133333333333, -65.74453333333... -3.843 25.604 19.386 17.162 0.885 0.717 0.606 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [310934.0, 106760.0, 64229.0, 72144.0, 194362.... [4.519396024457606, -40.188321210872765, -64.0... 1.009 26.896 22.260 15.131 0.680 0.753 0.606 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.2662311257643, 1.1214689772961357, 0.674698...
kaiju 2 5 [-49.99712, -77.44426666666666, -89.7125333333... -45.742 29.240 45.742 29.240 0.639 0.413 0.322 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [156259.0, 42292.0, 19289.0, 16479.0, 57971.0,... [-8.0675120659037, -58.53029500478051, -81.086... -0.245 53.759 44.984 29.436 0.654 0.759 0.322 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [2.8728902479405094, 0.7775569686603653, 0.354...
kraken2 2 5 [-51.72896, -89.8096, -98.04693333333333, -98.... -41.501 34.745 41.501 34.745 0.837 0.341 0.189 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [150847.0, 19107.0, 3662.0, 1740.0, 21850.0, 5... [-5.859592421967733, -80.12621212670827, -96.1... 14.088 67.761 62.520 29.686 0.475 0.665 0.189 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [2.9418877368135083, 0.3726335226242199, 0.071...
krakenuniq 2 5 [-19.67168, -48.3264, -68.4816, -54.56, -41.30... -16.778 23.985 17.064 23.782 1.394 0.641 0.572 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [251026.0, 96888.0, 59097.0, 56800.0, 183437.0... [-0.37064433374560224, -35.91043018258363, -60... 3.219 29.748 25.644 15.417 0.601 0.796 0.572 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.11341736457045, 1.2016794340765569, 0.73296...
mean 2 5 [-23.187162957260917, -52.379110996145705, -70... -18.085 21.968 18.371 21.729 1.183 0.618 0.607 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [240040.11575855964, 89289.1668822268, 54476.7... [-2.1606238158283873, -44.255403818539435, -66... 11.009 35.605 32.294 18.601 0.576 0.803 0.509 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.057480505755363, 1.0452111784023856, 0.6271...
In [131]:
# RUN FOR MEAN IN MODE 1 3 5 7 9
# RUN FOR MODE 5 AND THE REST OF PROFILERS
In [132]:
prof = 'mean'

# Per-species table for the selected profiler: expected vs. observed counts
# and their relative difference.
prof_row = df_combo.loc[prof]
df_diffab = pd.DataFrame({
    'species': prof_row['species_counts'],
    'expected_counts': prof_row['expected_counts'],
    'observed_counts': prof_row['observed_counts'],
    # NOTE(review): this column is named diff_abundance but is filled from
    # diff_counts — downstream cells rely on the name; confirm intent.
    'diff_abundance': prof_row['diff_counts'],
})
# KeyError-preserving lookup: an unknown species should fail loudly.
df_diffab['kingdom'] = df_diffab['species'].map(lambda s: species_to_kingdom[s])

# Species sharing a genus with another species in the mock community.
family_species = ['Blautia coccoides', 'Blautia luti', 'Bacteroides ovatus', 'Bacteroides intestinalis','Bacteroides fragilis', 
                  'Parabacteroides distasonis', 'Parabacteroides merdae', 'Alistipes indistinctus', 'Alistipes finegoldii', 'Aspergillus chevalieri', 
                  'Aspergillus flavus', 'Saccharomyces cerevisiae', 'Saccharomyces kudriavzevii', 'Saccharomyces mikatae', 'Candida albicans', 'Candida dubliniensis', 
                  'Candida orthopsilosis', 'Rotavirus A', 'Rotavirus B', 'Rotavirus C']
df_diffab['isfamily'] = df_diffab['species'].isin(family_species)

df_diffab.sort_values(by=['expected_counts', 'diff_abundance'], ascending=False)
Out[132]:
species expected_counts observed_counts diff_abundance kingdom isfamily
8 Cutibacterium acnes 312500 254046.803 -18.705 Bacteria False
0 Bacteroides fragilis 312500 240040.116 -23.187 Bacteria True
6 Lactobacillus acidophilus 312500 239520.544 -23.353 Bacteria False
40 Akkermansia muciniphila 312500 196080.997 -37.254 Bacteria False
7 Bifidobacterium bifidum 312500 184723.065 -40.889 Bacteria False
4 Blautia coccoides 312500 173325.771 -44.536 Bacteria True
42 Bacteroides intestinalis 312500 163291.164 -47.747 Bacteria True
18 Bacteroides ovatus 312500 111631.939 -64.278 Bacteria True
29 Blautia luti 312500 53234.639 -82.965 Bacteria True
21 Rotavirus C 250000 254565.176 1.826 Virus True
20 Rotavirus B 250000 253634.055 1.454 Virus True
19 Rotavirus A 250000 251608.261 0.643 Virus True
17 Tobacco mosaic virus 250000 243272.405 -2.691 Virus False
44 Dietzia lutea 250000 232472.093 -7.011 Bacteria False
15 Human immunodeficiency virus 1 200000 201026.902 0.513 Virus False
10 Saccharomyces cerevisiae 200000 195902.958 -2.049 Fungi True
32 Saccharomyces kudriavzevii 200000 190980.300 -4.510 Fungi True
33 Saccharomyces mikatae 200000 189682.529 -5.159 Fungi True
41 Candida orthopsilosis 200000 189190.931 -5.405 Fungi True
35 Aspergillus chevalieri 200000 187004.613 -6.498 Fungi True
12 Candida albicans 200000 184824.390 -7.588 Fungi True
23 Candida dubliniensis 200000 184501.730 -7.749 Fungi True
13 Escherichia phage T4 200000 164474.441 -17.763 Virus False
11 Aspergillus flavus 200000 148008.703 -25.996 Fungi True
14 Bacteriophage P2 200000 45220.596 -77.390 Virus False
47 Ruthenibacterium lactatiformans 187500 135305.249 -27.837 Bacteria False
25 Parabacteroides merdae 187500 125862.826 -32.873 Bacteria True
1 Parabacteroides distasonis 187500 89289.167 -52.379 Bacteria True
2 Faecalibacterium prausnitzii 187500 54476.741 -70.946 Bacteria False
55 Hepatitis C virus 150000 151377.085 0.918 Virus False
28 Malassezia restricta 150000 149869.196 -0.087 Fungi False
53 Bovine alphaherpesvirus 2 150000 148849.267 -0.767 Virus False
31 Human adenovirus 7 150000 147511.825 -1.659 Virus False
26 Alternaria dauci 150000 141107.893 -5.928 Fungi False
54 Human herpesvirus 4 type 2 150000 139697.441 -6.868 Virus False
5 Erysipelatoclostridium ramosum 125000 67123.702 -46.301 Bacteria False
3 Streptococcus salivarius 125000 58117.095 -53.506 Bacteria False
57 Dengue virus 100000 101890.120 1.890 Virus False
43 Kazachstania africana 100000 97363.929 -2.636 Fungi False
22 Penicillium digitatum 100000 95742.552 -4.257 Fungi False
46 Mimivirus terra2 100000 84794.462 -15.206 Virus False
52 Eisenbergiella porci 62500 57135.945 -8.582 Bacteria False
36 Acidaminococcus intestini 62500 57048.401 -8.723 Bacteria False
45 Alistipes indistinctus 62500 55391.499 -11.374 Bacteria True
39 Alistipes finegoldii 62500 51804.368 -17.113 Bacteria True
50 Butyricimonas faecalis 62500 49405.437 -20.951 Bacteria False
34 Hungatella hathewayi 62500 40802.328 -34.716 Bacteria False
27 Eubacterium callanderi 62500 30952.294 -50.476 Bacteria False
16 Norovirus GI 50000 50524.777 1.050 Virus False
56 Zaire ebolavirus 50000 49744.206 -0.512 Virus False
24 Eremothecium sinecaudum 50000 49425.836 -1.148 Fungi False
38 Puccinia triticina 50000 49269.888 -1.460 Fungi False
9 Pichia kudriavzevii 50000 48697.541 -2.605 Fungi False
48 Kwoniella shandongensis 50000 48329.668 -3.341 Fungi False
51 Akanthomyces muscarius 50000 47989.514 -4.021 Fungi False
49 Cryptococcus decagattii 50000 47962.656 -4.075 Fungi False
30 Trichoderma asperellum 50000 46584.994 -6.830 Fungi False
37 Fusarium falciforme 50000 46336.938 -7.326 Fungi False
In [133]:
# Three-panel comparison of PRE (diff_abundance) distributions:
# (1) shared-genus vs. unique species, (2) by kingdom, and
# (3) shared-genus within bacteria only.
fig, axs = plt.subplots(1, 3, figsize=(9, 3.5), sharey=True)


def _annotated_box(data, x, pairs, ax, palette, xlabel=None):
    """Boxplot of PRE by `x` with Mann-Whitney annotations and median labels.

    Passing `hue=x` with `legend=False` replaces the deprecated
    palette-without-hue usage (seaborn FutureWarning, removal in v0.14).
    """
    sns.boxplot(data, x=x, y='diff_abundance', ax=ax, hue=x,
                palette=sns.color_palette(palette), legend=False)
    annotator = Annotator(ax, pairs, data=data, x=x, y='diff_abundance')
    annotator.configure(test='Mann-Whitney', text_format='simple',
                        loc='outside', line_width=1)
    annotator.apply_and_annotate()
    # Print each group's median just below its median line.
    for i, median in enumerate(data.groupby(x)['diff_abundance'].median()):
        ax.text(i, median - 8, f"{median:.2f}", ha='center', va='bottom', fontsize=8)
    if xlabel is not None:
        ax.set_xlabel(xlabel, fontsize=12)


_annotated_box(df_diffab, 'isfamily', [(True, False)], axs[0],
               ['#4E79A7', '#A0CBE8'],
               xlabel="Species belongs to\na shared genus")
axs[0].set_ylabel("PRE", fontsize=12)

_annotated_box(df_diffab, 'kingdom',
               [("Virus", 'Bacteria'), ('Fungi', 'Bacteria'), ('Fungi', 'Virus')],
               axs[1], ['#4E79A7', '#77a2c8', '#A0CBE8'])

df_bac = df_diffab[df_diffab['kingdom'] == 'Bacteria']
_annotated_box(df_bac, 'isfamily', [(True, False)], axs[2],
               ['#4E79A7', '#A0CBE8'],
               xlabel="Bacterial species belongs\nto a shared genus")

for ax in axs:
    sns.despine(top=True, right=True, ax=ax)

plt.tight_layout()

# `fmt` avoids shadowing the built-in `format`.
for fmt in ['png', 'tiff']:
    plt.savefig(f'{RESULTS_DIR}/figures/paper/figI-{mode}-{prof}.{fmt}',
                dpi=DPI, bbox_inches='tight')

plt.show()
/tmp/ipykernel_2918775/1519887836.py:7: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax1 = sns.boxplot(df_diffab, x='isfamily', y='diff_abundance', ax=axs[0], palette=sns.color_palette(['#4E79A7', '#A0CBE8']))
/tmp/ipykernel_2918775/1519887836.py:20: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax2 = sns.boxplot(df_diffab, x='kingdom', y='diff_abundance', ax=axs[1], palette=sns.color_palette(['#4E79A7', '#77a2c8', '#A0CBE8']))
/tmp/ipykernel_2918775/1519887836.py:34: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax3 = sns.boxplot(df_bac, x='isfamily', y='diff_abundance', ax=axs[2], palette=sns.color_palette(['#4E79A7', '#A0CBE8']))
False vs. True: Mann-Whitney-Wilcoxon test two-sided, P_val:3.304e-01 U_stat=4.400e+02
Bacteria vs. Fungi: Mann-Whitney-Wilcoxon test two-sided, P_val:1.184e-07 U_stat=1.200e+01
Fungi vs. Virus: Mann-Whitney-Wilcoxon test two-sided, P_val:3.156e-02 U_stat=8.500e+01
Bacteria vs. Virus: Mann-Whitney-Wilcoxon test two-sided, P_val:2.544e-05 U_stat=3.100e+01
False vs. True: Mann-Whitney-Wilcoxon test two-sided, P_val:3.610e-01 U_stat=7.800e+01
No description has been provided for this image