import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import os
from datetime import datetime
from matplotlib.colors import Normalize, ListedColormap
from scipy.stats import linregress, pearsonr
from scipy.stats import mannwhitneyu, wilcoxon
from statsmodels.stats.multitest import multipletests
from statannotations.Annotator import Annotator
pd.options.display.float_format = '{:.3f}'.format
pd.set_option('display.max_columns', None)
plt.rcParams['figure.dpi']=170
plt.rc('axes', linewidth=0.65) # Adjust the line width of plot frames
plt.rc('xtick.major', width=0.0)
plt.rc('ytick.major', width=0.0)
DPI=250
from list_vars import LIST_PROFILERS, DIR_FIGURES, RESULTS_DIR
In silico sample analysis¶
In this notebook we are going to do an analysis on the in silico samples, where we are going to study several variables.
How many reads are incorrectly mapped if we do not perfom a host mapping step?¶
It has been reported that not mapping to human databases before profiling increases the number of reads assigned to other organisms.
In this case, we are going to do 3 checks with the in silico dataset using pass2 (profiling after 2-time host mapping) and pass0 (direct profiling withou host mapping), and we are going to check the influence in parameter sensitivity:
- We are going to see what is the total number of reads mapped to the human dataset, and what is the offset left unmapped which should have been mapped to human.
- We are also going to do the same with the microbial reads, and see if more microbial reads have been assigned to the pass0 dataset.
Later in the analysis we are going to do two additional analyses:
- We are going to see the number of species present in total between pass0 and pass2, and their jaccard index.
- We are going to calculate the ratio between the number of reads in pass0 and pass2.
df_host_map_info = pd.read_csv(f'{RESULTS_DIR}/counts/mapping_counts.txt', sep='\t').set_index('SAMPLE')
artificial_taxid_counts = pd.read_csv('table_artificial_taxid.csv', sep=';', names=['species', 'taxid', 'reads'])
artificial_taxid_counts['reads_true'] = (artificial_taxid_counts['reads'] / 2).astype(int)
n_true_human_reads = int(artificial_taxid_counts['reads_true'].iloc[0])
n_true_human_reads
20000000
n_mapped_reads_1and2_maps = df_host_map_info.loc['ARTIFICIAL', '1st_mapped'] + df_host_map_info.loc['ARTIFICIAL', '2nd_mapped']
print(f'There is a total of {n_mapped_reads_1and2_maps} reads mapped to human during the 1st and 2nd map, which represents around {100 * n_mapped_reads_1and2_maps/n_true_human_reads} % of the total number of reads ({n_true_human_reads}).')
print(f'There is a total of {n_true_human_reads - n_mapped_reads_1and2_maps} reads remaining to be mapped.')
There is a total of 38219883 reads mapped to human during the 1st and 2nd map, which represents around 191.099415 % of the total number of reads (20000000). There is a total of -18219883 reads remaining to be mapped.
df_host_profile_info = pd.read_csv(f'{RESULTS_DIR}/counts/profiling_counts_ARTIFICIAL.txt', sep='\t')
df_host_profile_info_artificial = df_host_profile_info[df_host_profile_info['SAMPLE'] == 'ARTIFICIAL']
df_host_profile_info_artificial['mapped_human_1_2_maps'] = 0
df_host_profile_info_artificial.loc[df_host_profile_info_artificial['pass'] == 2, 'mapped_human_1_2_maps'] = n_mapped_reads_1and2_maps
df_host_profile_info_artificial['mapped_human_total'] = df_host_profile_info_artificial['mapped_human_1_2_maps'] + df_host_profile_info_artificial['mapped_human']
df_host_profile_info_artificial['total_reads'] = df_host_profile_info_artificial['mapped_human_total'] + df_host_profile_info_artificial['mapped_others'] + df_host_profile_info_artificial['unmapped']
df_host_profile_info_artificial['observed_human_prop'] = df_host_profile_info_artificial['mapped_human_total'] / df_host_profile_info_artificial['total_reads']
df_host_profile_info_artificial['observed_others_prop'] = df_host_profile_info_artificial['mapped_others'] / df_host_profile_info_artificial['total_reads']
df_host_profile_info_artificial['observed_unmapped_prop'] = df_host_profile_info_artificial['unmapped'] / df_host_profile_info_artificial['total_reads']
df_host_profile_info_artificial['expected_human_prop'] = n_true_human_reads / artificial_taxid_counts['reads_true'].sum() # 0.8
df_host_profile_info_artificial['expected_others_prop'] = 1 - n_true_human_reads / artificial_taxid_counts['reads_true'].sum() # 0.8
df_host_profile_info_artificial['calculated_unmapped_human_prop'] = df_host_profile_info_artificial['expected_human_prop'] - df_host_profile_info_artificial['observed_human_prop']
df_host_profile_info_artificial['calculated_unmapped_others_prop'] = df_host_profile_info_artificial['expected_others_prop'] - df_host_profile_info_artificial['observed_others_prop']
df_host_profile_info_artificial['proportion_mapped_other_reads'] = df_host_profile_info_artificial['observed_others_prop'] / df_host_profile_info_artificial['expected_others_prop']
for profiler in LIST_PROFILERS:
display(profiler)
display(df_host_profile_info_artificial[df_host_profile_info_artificial['profiler'] == profiler])
'centrifuge'
| SAMPLE | pass | mode | profiler | mapped_human | mapped_others | unmapped | mapped_human_1_2_maps | mapped_human_total | total_reads | observed_human_prop | observed_others_prop | observed_unmapped_prop | expected_human_prop | expected_others_prop | calculated_unmapped_human_prop | calculated_unmapped_others_prop | proportion_mapped_other_reads | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ARTIFICIAL | 0 | 1 | centrifuge | 38740565 | 7929463 | 3326083 | 0 | 38740565 | 49996111 | 0.775 | 0.159 | 0.067 | 0.800 | 0.200 | 0.025 | 0.041 | 0.793 |
| 1 | ARTIFICIAL | 0 | 2 | centrifuge | 38831835 | 7963709 | 3200567 | 0 | 38831835 | 49996111 | 0.777 | 0.159 | 0.064 | 0.800 | 0.200 | 0.023 | 0.041 | 0.796 |
| 2 | ARTIFICIAL | 0 | 3 | centrifuge | 38909846 | 7998877 | 3087388 | 0 | 38909846 | 49996111 | 0.778 | 0.160 | 0.062 | 0.800 | 0.200 | 0.022 | 0.040 | 0.800 |
| 3 | ARTIFICIAL | 0 | 4 | centrifuge | 38976065 | 8034549 | 2985497 | 0 | 38976065 | 49996111 | 0.780 | 0.161 | 0.060 | 0.800 | 0.200 | 0.020 | 0.039 | 0.804 |
| 4 | ARTIFICIAL | 0 | 5 | centrifuge | 39023379 | 8070059 | 2902673 | 0 | 39023379 | 49996111 | 0.781 | 0.161 | 0.058 | 0.800 | 0.200 | 0.019 | 0.039 | 0.807 |
| 5 | ARTIFICIAL | 0 | 6 | centrifuge | 39064849 | 8106808 | 2824454 | 0 | 39064849 | 49996111 | 0.781 | 0.162 | 0.056 | 0.800 | 0.200 | 0.019 | 0.038 | 0.811 |
| 6 | ARTIFICIAL | 0 | 7 | centrifuge | 39095674 | 8148852 | 2751585 | 0 | 39095674 | 49996111 | 0.782 | 0.163 | 0.055 | 0.800 | 0.200 | 0.018 | 0.037 | 0.815 |
| 7 | ARTIFICIAL | 0 | 8 | centrifuge | 39120561 | 8209851 | 2665699 | 0 | 39120561 | 49996111 | 0.782 | 0.164 | 0.053 | 0.800 | 0.200 | 0.018 | 0.036 | 0.821 |
| 8 | ARTIFICIAL | 0 | 9 | centrifuge | 39149190 | 8425120 | 2421801 | 0 | 39149190 | 49996111 | 0.783 | 0.169 | 0.048 | 0.800 | 0.200 | 0.017 | 0.031 | 0.843 |
| 9 | ARTIFICIAL | 2 | 1 | centrifuge | 1046644 | 7929421 | 2800163 | 38219883 | 39266527 | 49996111 | 0.785 | 0.159 | 0.056 | 0.800 | 0.200 | 0.015 | 0.041 | 0.793 |
| 10 | ARTIFICIAL | 2 | 2 | centrifuge | 1052656 | 7964098 | 2759474 | 38219883 | 39272539 | 49996111 | 0.786 | 0.159 | 0.055 | 0.800 | 0.200 | 0.014 | 0.041 | 0.796 |
| 11 | ARTIFICIAL | 2 | 3 | centrifuge | 1058357 | 7998912 | 2718959 | 38219883 | 39278240 | 49996111 | 0.786 | 0.160 | 0.054 | 0.800 | 0.200 | 0.014 | 0.040 | 0.800 |
| 12 | ARTIFICIAL | 2 | 4 | centrifuge | 1063583 | 8034577 | 2678068 | 38219883 | 39283466 | 49996111 | 0.786 | 0.161 | 0.054 | 0.800 | 0.200 | 0.014 | 0.039 | 0.804 |
| 13 | ARTIFICIAL | 2 | 5 | centrifuge | 1066865 | 8069484 | 2639879 | 38219883 | 39286748 | 49996111 | 0.786 | 0.161 | 0.053 | 0.800 | 0.200 | 0.014 | 0.039 | 0.807 |
| 14 | ARTIFICIAL | 2 | 6 | centrifuge | 1069906 | 8105966 | 2600356 | 38219883 | 39289789 | 49996111 | 0.786 | 0.162 | 0.052 | 0.800 | 0.200 | 0.014 | 0.038 | 0.811 |
| 15 | ARTIFICIAL | 2 | 7 | centrifuge | 1071551 | 8147452 | 2557225 | 38219883 | 39291434 | 49996111 | 0.786 | 0.163 | 0.051 | 0.800 | 0.200 | 0.014 | 0.037 | 0.815 |
| 16 | ARTIFICIAL | 2 | 8 | centrifuge | 1073535 | 8203982 | 2498711 | 38219883 | 39293418 | 49996111 | 0.786 | 0.164 | 0.050 | 0.800 | 0.200 | 0.014 | 0.036 | 0.820 |
| 17 | ARTIFICIAL | 2 | 9 | centrifuge | 1082232 | 8384059 | 2309937 | 38219883 | 39302115 | 49996111 | 0.786 | 0.168 | 0.046 | 0.800 | 0.200 | 0.014 | 0.032 | 0.838 |
'ganon'
| SAMPLE | pass | mode | profiler | mapped_human | mapped_others | unmapped | mapped_human_1_2_maps | mapped_human_total | total_reads | observed_human_prop | observed_others_prop | observed_unmapped_prop | expected_human_prop | expected_others_prop | calculated_unmapped_human_prop | calculated_unmapped_others_prop | proportion_mapped_other_reads | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 18 | ARTIFICIAL | 0 | 1 | ganon | 35534622 | 7058731 | 7402758 | 0 | 35534622 | 49996111 | 0.711 | 0.141 | 0.148 | 0.800 | 0.200 | 0.089 | 0.059 | 0.706 |
| 19 | ARTIFICIAL | 0 | 2 | ganon | 35534622 | 7058731 | 7402758 | 0 | 35534622 | 49996111 | 0.711 | 0.141 | 0.148 | 0.800 | 0.200 | 0.089 | 0.059 | 0.706 |
| 20 | ARTIFICIAL | 0 | 3 | ganon | 38667598 | 8417056 | 2911457 | 0 | 38667598 | 49996111 | 0.773 | 0.168 | 0.058 | 0.800 | 0.200 | 0.027 | 0.032 | 0.842 |
| 21 | ARTIFICIAL | 0 | 4 | ganon | 38665847 | 8418807 | 2911457 | 0 | 38665847 | 49996111 | 0.773 | 0.168 | 0.058 | 0.800 | 0.200 | 0.027 | 0.032 | 0.842 |
| 22 | ARTIFICIAL | 0 | 5 | ganon | 39527606 | 8895428 | 1573077 | 0 | 39527606 | 49996111 | 0.791 | 0.178 | 0.031 | 0.800 | 0.200 | 0.009 | 0.022 | 0.890 |
| 23 | ARTIFICIAL | 0 | 6 | ganon | 39526084 | 8896950 | 1573077 | 0 | 39526084 | 49996111 | 0.791 | 0.178 | 0.031 | 0.800 | 0.200 | 0.009 | 0.022 | 0.890 |
| 24 | ARTIFICIAL | 0 | 7 | ganon | 39779348 | 9122179 | 1094584 | 0 | 39779348 | 49996111 | 0.796 | 0.182 | 0.022 | 0.800 | 0.200 | 0.004 | 0.018 | 0.912 |
| 25 | ARTIFICIAL | 0 | 8 | ganon | 39773555 | 9127972 | 1094584 | 0 | 39773555 | 49996111 | 0.796 | 0.183 | 0.022 | 0.800 | 0.200 | 0.004 | 0.017 | 0.913 |
| 26 | ARTIFICIAL | 0 | 9 | ganon | 39861349 | 9281351 | 853411 | 0 | 39861349 | 49996111 | 0.797 | 0.186 | 0.017 | 0.800 | 0.200 | 0.003 | 0.014 | 0.928 |
| 27 | ARTIFICIAL | 2 | 1 | ganon | 1080116 | 7658050 | 3038062 | 38219883 | 39299999 | 49996111 | 0.786 | 0.153 | 0.061 | 0.800 | 0.200 | 0.014 | 0.047 | 0.766 |
| 28 | ARTIFICIAL | 2 | 2 | ganon | 1080116 | 7658050 | 3038062 | 38219883 | 39299999 | 49996111 | 0.786 | 0.153 | 0.061 | 0.800 | 0.200 | 0.014 | 0.047 | 0.766 |
| 29 | ARTIFICIAL | 2 | 3 | ganon | 1176914 | 9041188 | 1558126 | 38219883 | 39396797 | 49996111 | 0.788 | 0.181 | 0.031 | 0.800 | 0.200 | 0.012 | 0.019 | 0.904 |
| 30 | ARTIFICIAL | 2 | 4 | ganon | 1176339 | 9041763 | 1558126 | 38219883 | 39396222 | 49996111 | 0.788 | 0.181 | 0.031 | 0.800 | 0.200 | 0.012 | 0.019 | 0.904 |
| 31 | ARTIFICIAL | 2 | 5 | ganon | 1214141 | 9519657 | 1042430 | 38219883 | 39434024 | 49996111 | 0.789 | 0.190 | 0.021 | 0.800 | 0.200 | 0.011 | 0.010 | 0.952 |
| 32 | ARTIFICIAL | 2 | 6 | ganon | 1213866 | 9519932 | 1042430 | 38219883 | 39433749 | 49996111 | 0.789 | 0.190 | 0.021 | 0.800 | 0.200 | 0.011 | 0.010 | 0.952 |
| 33 | ARTIFICIAL | 2 | 7 | ganon | 1228567 | 9741454 | 806207 | 38219883 | 39448450 | 49996111 | 0.789 | 0.195 | 0.016 | 0.800 | 0.200 | 0.011 | 0.005 | 0.974 |
| 34 | ARTIFICIAL | 2 | 8 | ganon | 1227189 | 9742832 | 806207 | 38219883 | 39447072 | 49996111 | 0.789 | 0.195 | 0.016 | 0.800 | 0.200 | 0.011 | 0.005 | 0.974 |
| 35 | ARTIFICIAL | 2 | 9 | ganon | 1231920 | 9884612 | 659696 | 38219883 | 39451803 | 49996111 | 0.789 | 0.198 | 0.013 | 0.800 | 0.200 | 0.011 | 0.002 | 0.989 |
'kaiju'
| SAMPLE | pass | mode | profiler | mapped_human | mapped_others | unmapped | mapped_human_1_2_maps | mapped_human_total | total_reads | observed_human_prop | observed_others_prop | observed_unmapped_prop | expected_human_prop | expected_others_prop | calculated_unmapped_human_prop | calculated_unmapped_others_prop | proportion_mapped_other_reads | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 36 | ARTIFICIAL | 0 | 1 | kaiju | 814678 | 5097740 | 44083693 | 0 | 814678 | 49996111 | 0.016 | 0.102 | 0.882 | 0.800 | 0.200 | 0.784 | 0.098 | 0.510 |
| 37 | ARTIFICIAL | 0 | 2 | kaiju | 1065560 | 5349844 | 43580707 | 0 | 1065560 | 49996111 | 0.021 | 0.107 | 0.872 | 0.800 | 0.200 | 0.779 | 0.093 | 0.535 |
| 38 | ARTIFICIAL | 0 | 3 | kaiju | 1065560 | 5349844 | 43580707 | 0 | 1065560 | 49996111 | 0.021 | 0.107 | 0.872 | 0.800 | 0.200 | 0.779 | 0.093 | 0.535 |
| 39 | ARTIFICIAL | 0 | 4 | kaiju | 1519215 | 5586056 | 42890840 | 0 | 1519215 | 49996111 | 0.030 | 0.112 | 0.858 | 0.800 | 0.200 | 0.770 | 0.088 | 0.559 |
| 40 | ARTIFICIAL | 0 | 5 | kaiju | 3099887 | 5936414 | 40959810 | 0 | 3099887 | 49996111 | 0.062 | 0.119 | 0.819 | 0.800 | 0.200 | 0.738 | 0.081 | 0.594 |
| 41 | ARTIFICIAL | 0 | 6 | kaiju | 3100631 | 5936491 | 40958989 | 0 | 3100631 | 49996111 | 0.062 | 0.119 | 0.819 | 0.800 | 0.200 | 0.738 | 0.081 | 0.594 |
| 42 | ARTIFICIAL | 0 | 7 | kaiju | 4812786 | 6324993 | 38858332 | 0 | 4812786 | 49996111 | 0.096 | 0.127 | 0.777 | 0.800 | 0.200 | 0.704 | 0.073 | 0.633 |
| 43 | ARTIFICIAL | 0 | 8 | kaiju | 4862779 | 6330261 | 38803071 | 0 | 4862779 | 49996111 | 0.097 | 0.127 | 0.776 | 0.800 | 0.200 | 0.703 | 0.073 | 0.633 |
| 44 | ARTIFICIAL | 0 | 9 | kaiju | 5825922 | 6650125 | 37520064 | 0 | 5825922 | 49996111 | 0.117 | 0.133 | 0.750 | 0.800 | 0.200 | 0.683 | 0.067 | 0.665 |
| 45 | ARTIFICIAL | 2 | 1 | kaiju | 56877 | 4941549 | 6777802 | 38219883 | 38276760 | 49996111 | 0.766 | 0.099 | 0.136 | 0.800 | 0.200 | 0.034 | 0.101 | 0.494 |
| 46 | ARTIFICIAL | 2 | 2 | kaiju | 68592 | 5116745 | 6590891 | 38219883 | 38288475 | 49996111 | 0.766 | 0.102 | 0.132 | 0.800 | 0.200 | 0.034 | 0.098 | 0.512 |
| 47 | ARTIFICIAL | 2 | 3 | kaiju | 68592 | 5116745 | 6590891 | 38219883 | 38288475 | 49996111 | 0.766 | 0.102 | 0.132 | 0.800 | 0.200 | 0.034 | 0.098 | 0.512 |
| 48 | ARTIFICIAL | 2 | 4 | kaiju | 82822 | 5262441 | 6430965 | 38219883 | 38302705 | 49996111 | 0.766 | 0.105 | 0.129 | 0.800 | 0.200 | 0.034 | 0.095 | 0.526 |
| 49 | ARTIFICIAL | 2 | 5 | kaiju | 118607 | 5439087 | 6218534 | 38219883 | 38338490 | 49996111 | 0.767 | 0.109 | 0.124 | 0.800 | 0.200 | 0.033 | 0.091 | 0.544 |
| 50 | ARTIFICIAL | 2 | 6 | kaiju | 118607 | 5439147 | 6218474 | 38219883 | 38338490 | 49996111 | 0.767 | 0.109 | 0.124 | 0.800 | 0.200 | 0.033 | 0.091 | 0.544 |
| 51 | ARTIFICIAL | 2 | 7 | kaiju | 169913 | 5603398 | 6002917 | 38219883 | 38389796 | 49996111 | 0.768 | 0.112 | 0.120 | 0.800 | 0.200 | 0.032 | 0.088 | 0.560 |
| 52 | ARTIFICIAL | 2 | 8 | kaiju | 170510 | 5604969 | 6000749 | 38219883 | 38390393 | 49996111 | 0.768 | 0.112 | 0.120 | 0.800 | 0.200 | 0.032 | 0.088 | 0.561 |
| 53 | ARTIFICIAL | 2 | 9 | kaiju | 194987 | 5718712 | 5862529 | 38219883 | 38414870 | 49996111 | 0.768 | 0.114 | 0.117 | 0.800 | 0.200 | 0.032 | 0.086 | 0.572 |
'kraken2'
| SAMPLE | pass | mode | profiler | mapped_human | mapped_others | unmapped | mapped_human_1_2_maps | mapped_human_total | total_reads | observed_human_prop | observed_others_prop | observed_unmapped_prop | expected_human_prop | expected_others_prop | calculated_unmapped_human_prop | calculated_unmapped_others_prop | proportion_mapped_other_reads | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 54 | ARTIFICIAL | 0 | 1 | kraken2 | 14455036 | 2965393 | 32575682 | 0 | 14455036 | 49996111 | 0.289 | 0.059 | 0.652 | 0.800 | 0.200 | 0.511 | 0.141 | 0.297 |
| 55 | ARTIFICIAL | 0 | 2 | kraken2 | 19419056 | 3736988 | 26840067 | 0 | 19419056 | 49996111 | 0.388 | 0.075 | 0.537 | 0.800 | 0.200 | 0.412 | 0.125 | 0.374 |
| 56 | ARTIFICIAL | 0 | 3 | kraken2 | 23299865 | 4319590 | 22376656 | 0 | 23299865 | 49996111 | 0.466 | 0.086 | 0.448 | 0.800 | 0.200 | 0.334 | 0.114 | 0.432 |
| 57 | ARTIFICIAL | 0 | 4 | kraken2 | 26673093 | 4785348 | 18537670 | 0 | 26673093 | 49996111 | 0.534 | 0.096 | 0.371 | 0.800 | 0.200 | 0.266 | 0.104 | 0.479 |
| 58 | ARTIFICIAL | 0 | 5 | kraken2 | 29088107 | 5118672 | 15789332 | 0 | 29088107 | 49996111 | 0.582 | 0.102 | 0.316 | 0.800 | 0.200 | 0.218 | 0.098 | 0.512 |
| 59 | ARTIFICIAL | 0 | 6 | kraken2 | 31245840 | 5417465 | 13332806 | 0 | 31245840 | 49996111 | 0.625 | 0.108 | 0.267 | 0.800 | 0.200 | 0.175 | 0.092 | 0.542 |
| 60 | ARTIFICIAL | 0 | 7 | kraken2 | 32777703 | 5644156 | 11574252 | 0 | 32777703 | 49996111 | 0.656 | 0.113 | 0.232 | 0.800 | 0.200 | 0.144 | 0.087 | 0.564 |
| 61 | ARTIFICIAL | 0 | 8 | kraken2 | 34127026 | 5848584 | 10020501 | 0 | 34127026 | 49996111 | 0.683 | 0.117 | 0.200 | 0.800 | 0.200 | 0.117 | 0.083 | 0.585 |
| 62 | ARTIFICIAL | 0 | 9 | kraken2 | 35488630 | 6041475 | 8466006 | 0 | 35488630 | 49996111 | 0.710 | 0.121 | 0.169 | 0.800 | 0.200 | 0.090 | 0.079 | 0.604 |
| 63 | ARTIFICIAL | 2 | 1 | kraken2 | 381464 | 2975038 | 8419726 | 38219883 | 38601347 | 49996111 | 0.772 | 0.060 | 0.168 | 0.800 | 0.200 | 0.028 | 0.140 | 0.298 |
| 64 | ARTIFICIAL | 2 | 2 | kraken2 | 503239 | 3755612 | 7517377 | 38219883 | 38723122 | 49996111 | 0.775 | 0.075 | 0.150 | 0.800 | 0.200 | 0.025 | 0.125 | 0.376 |
| 65 | ARTIFICIAL | 2 | 3 | kraken2 | 591284 | 4327476 | 6857468 | 38219883 | 38811167 | 49996111 | 0.776 | 0.087 | 0.137 | 0.800 | 0.200 | 0.024 | 0.113 | 0.433 |
| 66 | ARTIFICIAL | 2 | 4 | kraken2 | 664379 | 4770367 | 6341482 | 38219883 | 38884262 | 49996111 | 0.778 | 0.095 | 0.127 | 0.800 | 0.200 | 0.022 | 0.105 | 0.477 |
| 67 | ARTIFICIAL | 2 | 5 | kraken2 | 723279 | 5127558 | 5925391 | 38219883 | 38943162 | 49996111 | 0.779 | 0.103 | 0.119 | 0.800 | 0.200 | 0.021 | 0.097 | 0.513 |
| 68 | ARTIFICIAL | 2 | 6 | kraken2 | 772466 | 5409228 | 5594534 | 38219883 | 38992349 | 49996111 | 0.780 | 0.108 | 0.112 | 0.800 | 0.200 | 0.020 | 0.092 | 0.541 |
| 69 | ARTIFICIAL | 2 | 7 | kraken2 | 813970 | 5651243 | 5311015 | 38219883 | 39033853 | 49996111 | 0.781 | 0.113 | 0.106 | 0.800 | 0.200 | 0.019 | 0.087 | 0.565 |
| 70 | ARTIFICIAL | 2 | 8 | kraken2 | 848074 | 5843864 | 5084290 | 38219883 | 39067957 | 49996111 | 0.781 | 0.117 | 0.102 | 0.800 | 0.200 | 0.019 | 0.083 | 0.584 |
| 71 | ARTIFICIAL | 2 | 9 | kraken2 | 895442 | 6031308 | 4849478 | 38219883 | 39115325 | 49996111 | 0.782 | 0.121 | 0.097 | 0.800 | 0.200 | 0.018 | 0.079 | 0.603 |
'krakenuniq'
| SAMPLE | pass | mode | profiler | mapped_human | mapped_others | unmapped | mapped_human_1_2_maps | mapped_human_total | total_reads | observed_human_prop | observed_others_prop | observed_unmapped_prop | expected_human_prop | expected_others_prop | calculated_unmapped_human_prop | calculated_unmapped_others_prop | proportion_mapped_other_reads | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 72 | ARTIFICIAL | 0 | 1 | krakenuniq | 39192382 | 8073019 | 2730710 | 0 | 39192382 | 49996111 | 0.784 | 0.161 | 0.055 | 0.800 | 0.200 | 0.016 | 0.039 | 0.807 |
| 73 | ARTIFICIAL | 0 | 2 | krakenuniq | 39192382 | 8073019 | 2730710 | 0 | 39192382 | 49996111 | 0.784 | 0.161 | 0.055 | 0.800 | 0.200 | 0.016 | 0.039 | 0.807 |
| 74 | ARTIFICIAL | 0 | 3 | krakenuniq | 39192382 | 8073019 | 2730710 | 0 | 39192382 | 49996111 | 0.784 | 0.161 | 0.055 | 0.800 | 0.200 | 0.016 | 0.039 | 0.807 |
| 75 | ARTIFICIAL | 0 | 4 | krakenuniq | 39192382 | 8073019 | 2730710 | 0 | 39192382 | 49996111 | 0.784 | 0.161 | 0.055 | 0.800 | 0.200 | 0.016 | 0.039 | 0.807 |
| 76 | ARTIFICIAL | 0 | 5 | krakenuniq | 39192382 | 8073019 | 2730710 | 0 | 39192382 | 49996111 | 0.784 | 0.161 | 0.055 | 0.800 | 0.200 | 0.016 | 0.039 | 0.807 |
| 77 | ARTIFICIAL | 0 | 6 | krakenuniq | 39192382 | 8073019 | 2730710 | 0 | 39192382 | 49996111 | 0.784 | 0.161 | 0.055 | 0.800 | 0.200 | 0.016 | 0.039 | 0.807 |
| 78 | ARTIFICIAL | 0 | 7 | krakenuniq | 39192382 | 8073019 | 2730710 | 0 | 39192382 | 49996111 | 0.784 | 0.161 | 0.055 | 0.800 | 0.200 | 0.016 | 0.039 | 0.807 |
| 79 | ARTIFICIAL | 0 | 8 | krakenuniq | 39192382 | 8073019 | 2730710 | 0 | 39192382 | 49996111 | 0.784 | 0.161 | 0.055 | 0.800 | 0.200 | 0.016 | 0.039 | 0.807 |
| 80 | ARTIFICIAL | 0 | 9 | krakenuniq | 39192382 | 8073019 | 2730710 | 0 | 39192382 | 49996111 | 0.784 | 0.161 | 0.055 | 0.800 | 0.200 | 0.016 | 0.039 | 0.807 |
| 81 | ARTIFICIAL | 2 | 1 | krakenuniq | 1094558 | 8062716 | 2618954 | 38219883 | 39314441 | 49996111 | 0.786 | 0.161 | 0.052 | 0.800 | 0.200 | 0.014 | 0.039 | 0.806 |
| 82 | ARTIFICIAL | 2 | 2 | krakenuniq | 1094558 | 8062716 | 2618954 | 38219883 | 39314441 | 49996111 | 0.786 | 0.161 | 0.052 | 0.800 | 0.200 | 0.014 | 0.039 | 0.806 |
| 83 | ARTIFICIAL | 2 | 3 | krakenuniq | 1094558 | 8062716 | 2618954 | 38219883 | 39314441 | 49996111 | 0.786 | 0.161 | 0.052 | 0.800 | 0.200 | 0.014 | 0.039 | 0.806 |
| 84 | ARTIFICIAL | 2 | 4 | krakenuniq | 1094558 | 8062716 | 2618954 | 38219883 | 39314441 | 49996111 | 0.786 | 0.161 | 0.052 | 0.800 | 0.200 | 0.014 | 0.039 | 0.806 |
| 85 | ARTIFICIAL | 2 | 5 | krakenuniq | 1094558 | 8062716 | 2618954 | 38219883 | 39314441 | 49996111 | 0.786 | 0.161 | 0.052 | 0.800 | 0.200 | 0.014 | 0.039 | 0.806 |
| 86 | ARTIFICIAL | 2 | 6 | krakenuniq | 1094558 | 8062716 | 2618954 | 38219883 | 39314441 | 49996111 | 0.786 | 0.161 | 0.052 | 0.800 | 0.200 | 0.014 | 0.039 | 0.806 |
| 87 | ARTIFICIAL | 2 | 7 | krakenuniq | 1094558 | 8062716 | 2618954 | 38219883 | 39314441 | 49996111 | 0.786 | 0.161 | 0.052 | 0.800 | 0.200 | 0.014 | 0.039 | 0.806 |
| 88 | ARTIFICIAL | 2 | 8 | krakenuniq | 1094558 | 8062716 | 2618954 | 38219883 | 39314441 | 49996111 | 0.786 | 0.161 | 0.052 | 0.800 | 0.200 | 0.014 | 0.039 | 0.806 |
| 89 | ARTIFICIAL | 2 | 9 | krakenuniq | 1094558 | 8062716 | 2618954 | 38219883 | 39314441 | 49996111 | 0.786 | 0.161 | 0.052 | 0.800 | 0.200 | 0.014 | 0.039 | 0.806 |
custom_palette = ["#648FFF", "#785EF0", "#DC267F", "#FE6100", "#FFB000", "#848484"]
fig, axs = plt.subplots(1, 2, figsize=(8, 3))
# 1A) Check if there are differences in human read assignment.
# Step 1: Calculate the differences between pass 2 and pass 0 for each profiler and mode
pass_diff = (
df_host_profile_info_artificial.pivot_table(
index=["profiler", "mode"], columns="pass", values="observed_human_prop"
)
.reset_index()
)
# Ensure column names are integers
pass_diff.columns.name = None # Remove the columns' name from pivot_table
pass_diff.columns = ['profiler', 'mode', 0, 2] # Explicitly rename columns
# Calculate the difference
pass_diff["difference"] = 100 * (pass_diff[2] - pass_diff[0])
# Step 2: Plot the differences using a lineplot
g = sns.lineplot(
data=pass_diff,
x="mode",
y="difference",
hue="profiler",
marker="o",
palette=custom_palette,
ax=axs[0],
legend=False
)
axs[0].axhline(0, color="#848484", linestyle="--", linewidth=0.8)
axs[0].set_title("Human assignment", fontsize=14)
axs[0].set_xlabel("Mode", fontsize=12)
axs[0].set_ylabel("Diff pass2 - pass0 (%)", fontsize=12)
# 1B) Check if there are differences in non-human read assignment.
pass_diff = (
df_host_profile_info_artificial.pivot_table(
index=["profiler", "mode"], columns="pass", values="observed_others_prop"
)
.reset_index()
)
pass_diff["difference"] = 100 * (pass_diff[2] - pass_diff[0])
# Step 2: Plot the differences using a lineplot
h = sns.lineplot(
data=pass_diff,
x="mode",
y="difference",
hue="profiler",
marker="o",
palette=custom_palette,
ax=axs[1],
)
sns.despine(top=True, right=True)
axs[1].axhline(0, color="gray", linestyle="--", linewidth=0.8)
axs[1].set_title("Non-human assignment", fontsize=12)
axs[1].set_xlabel("Mode", fontsize=12)
axs[1].set_ylabel("", fontsize=12)
for ax in axs:
ax.set_xticks(range(1, 10))
axs[1].legend(title="Profiler", frameon=False, bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
plt.tight_layout()
plt.rc('axes', linewidth=0.65)
for format in ['png', 'tiff']:
plt.savefig(f'{RESULTS_DIR}/figures/paper/figA.{format}', dpi=DPI, )
/tmp/ipykernel_2918775/492332427.py:24: UserWarning: The palette list has more values (6) than needed (5), which may not be intended. g = sns.lineplot( /tmp/ipykernel_2918775/492332427.py:57: UserWarning: The palette list has more values (6) than needed (5), which may not be intended. h = sns.lineplot(
What do we see here?
- The number of reads assigned to humans without host mapping is very variable depending on the profiler. Centrifuge, krakenuniq and ganon and map human reads correctly, whereas kaiju, kraken2 fail to map the reads to human. The differences tend to decrease with the sensitivity mode, that is, paradoxically, a more strict read assignment leads to an improved number of human-mapped reads. However, this makes sense because more reads are assigned in general, and thus both human and non-human reads are mapped.
- However, this difference does not occur in non-human species. In general, non-human species are assigned equally with or without host mapping. This is interesting because we would expect a higher amount of reads assigned to non-human species originating from a false positive assignment of human reads, but seems not to be the case, even in profilers that have a high ammount of unmapped human reads.
- Still, we have to take into acount that the profiler databases include a host mapping step.
table_artificial_taxcounts = pd.read_csv('../../src/version_2/table_artificial_taxid.csv', sep=';', names=['species', 'taxid', 'count'])
table_artificial_taxcounts = table_artificial_taxcounts[table_artificial_taxcounts['taxid'] != 9606]
table_artificial_taxcounts['abundance'] = 100 * table_artificial_taxcounts['count'] / table_artificial_taxcounts['count'].sum()
table_artificial_taxcounts
| species | taxid | count | abundance | |
|---|---|---|---|---|
| 1 | Cutibacterium acnes | 1747 | 312500 | 3.125 |
| 2 | Lactobacillus acidophilus | 1579 | 312500 | 3.125 |
| 3 | Bifidobacterium bifidum | 1681 | 312500 | 3.125 |
| 4 | Akkermansia muciniphila | 239935 | 312500 | 3.125 |
| 5 | Blautia coccoides | 1532 | 312500 | 3.125 |
| 6 | Blautia luti | 89014 | 312500 | 3.125 |
| 7 | Bacteroides ovatus | 28116 | 312500 | 3.125 |
| 8 | Bacteroides intestinalis | 329854 | 312500 | 3.125 |
| 9 | Bacteroides fragilis | 817 | 312500 | 3.125 |
| 10 | Escherichia coli | 511145 | 312500 | 3.125 |
| 11 | Dietzia lutea | 546160 | 250000 | 2.500 |
| 12 | Ruthenibacterium lactatiformans | 1550024 | 187500 | 1.875 |
| 13 | Faecalibacterium prausnitzii | 853 | 187500 | 1.875 |
| 14 | Parabacteroides distasonis | 823 | 187500 | 1.875 |
| 15 | Parabacteroides merdae | 46503 | 187500 | 1.875 |
| 16 | Fusicatenibacter saccharivorans | 1150298 | 187500 | 1.875 |
| 17 | Erysipelatoclostridium ramosum | 1547 | 125000 | 1.250 |
| 18 | Streptococcus salivarius | 1304 | 125000 | 1.250 |
| 19 | Hungatella hathewayi | 154046 | 62500 | 0.625 |
| 20 | Eisenbergiella porci | 2652274 | 62500 | 0.625 |
| 21 | Butyricimonas faecalis | 2093856 | 62500 | 0.625 |
| 22 | Alistipes indistinctus | 626932 | 62500 | 0.625 |
| 23 | Alistipes finegoldii | 214856 | 62500 | 0.625 |
| 24 | Eubacterium callanderi | 53442 | 62500 | 0.625 |
| 25 | Acidaminococcus intestini | 187327 | 62500 | 0.625 |
| 26 | Aspergillus chevalieri | 182096 | 200000 | 2.000 |
| 27 | Aspergillus flavus | 5059 | 200000 | 2.000 |
| 28 | Saccharomyces cerevisiae | 4932 | 200000 | 2.000 |
| 29 | Saccharomyces kudriavzevii | 114524 | 200000 | 2.000 |
| 30 | Saccharomyces mikatae | 114525 | 200000 | 2.000 |
| 31 | Candida albicans | 5476 | 200000 | 2.000 |
| 32 | Candida dubliniensis | 42374 | 200000 | 2.000 |
| 33 | Candida orthopsilosis | 273371 | 200000 | 2.000 |
| 34 | Malassezia restricta | 76775 | 150000 | 1.500 |
| 35 | Alternaria dauci | 48095 | 150000 | 1.500 |
| 36 | Kazachstania africana | 432096 | 100000 | 1.000 |
| 37 | Penicillium digitatum | 36651 | 100000 | 1.000 |
| 38 | Pichia kudriavzevii | 4909 | 50000 | 0.500 |
| 39 | Trichoderma asperellum | 101201 | 50000 | 0.500 |
| 40 | Akanthomyces muscarius | 2231603 | 50000 | 0.500 |
| 41 | Fusarium falciforme | 195108 | 50000 | 0.500 |
| 42 | Eremothecium sinecaudum | 45286 | 50000 | 0.500 |
| 43 | Cryptococcus decagattii | 1859122 | 50000 | 0.500 |
| 44 | Kwoniella shandongensis | 1734106 | 50000 | 0.500 |
| 45 | Puccinia triticina | 208348 | 50000 | 0.500 |
| 46 | Tobacco mosaic virus | 12242 | 250000 | 2.500 |
| 47 | Rotavirus A | 28875 | 250000 | 2.500 |
| 48 | Rotavirus B | 28876 | 250000 | 2.500 |
| 49 | Rotavirus C | 36427 | 250000 | 2.500 |
| 50 | Bacteriophage P2 | 10679 | 200000 | 2.000 |
| 51 | Escherichia phage T4 | 10665 | 200000 | 2.000 |
| 52 | Human immunodeficiency virus 1 | 11676 | 200000 | 2.000 |
| 53 | Human adenovirus 7 | 108098 | 150000 | 1.500 |
| 54 | Hepatitis C virus | 3052230 | 150000 | 1.500 |
| 55 | Bovine alphaherpesvirus 2 | 3050244 | 150000 | 1.500 |
| 56 | Human herpesvirus 4 type 2 | 3050299 | 150000 | 1.500 |
| 57 | Mimivirus terra2 | 1128151 | 100000 | 1.000 |
| 58 | Dengue virus | 3052464 | 100000 | 1.000 |
| 59 | Norovirus GI | 11983 | 50000 | 0.500 |
| 60 | Zaire ebolavirus | 3052462 | 50000 | 0.500 |
Computing detection stats to aswer the questions¶
One of the parameters used during profiling is the mode of the profilers. Each profiler has a different set of parametters to include reads as valid or not. This may results in the detection of false positives and negatives.
Here, we are going to study this effect in in silico samples to see if there are major changes. We are can measure the effectivity of several variables:
Categorical values: we can use each of the columns in the flag system to check how well were species assigned. We can use the precision (TP/TP + FP), recall (TP/TP+FN) and F1-score (2 x precision x recall / precision + recall) and Cohen's kappa. $$\kappa = \frac{p_0-p_e}{1-p_e} \quad p_0 = \frac{TP + TN}{TP + FP + FN + TN} \quad p_e=\frac{TP + FP}{TP + FP + FN + TN}\cdot\frac{TP + FN }{TP + FP + FN + TN} + \frac{TN + FP}{TP + FP + FN + TN}\cdot\frac{TN + FN}{TP + FP + FN + TN}$$
Numerical values: we can use the normalized value and the abundance to see how well are reads classified. For that we can use the expected number of reads and abundance. With that we will calculate the (1) difference between observed and expected categories and (2) the mean error and the (3) mean absolute error: $$(1) \qquad DIFF_i = 100\cdot\frac{x_{obs,i} - x_{exp,i}}{x_{exp,i}}$$ $$(2) \qquad ME = E[DIFF_i] = \frac{100}{N}\sum\frac{x_{obs,i} - x_{exp,i}}{x_{exp,i}}$$ $$(3A) \qquad MAE = E[|DIFF_i|] = \frac{100}{N}\sum\frac{|x_{obs,i} - x_{exp,i}|}{x_{exp,i}}$$ $$(3B) \qquad MAED = \sigma[|DIFF_i|]$$
For numerical values we are also going to calculate the pearson correlation between the observed and expected values, using a log10(1+x) transform
Categorical values¶
def calculate_nominal_metrics(df_tax_ground_truth, df_flags_observed, column):
list_expected_taxids = list(df_tax_ground_truth['taxid'].astype(int).values)
list_observed_true_taxids = list(df_flags_observed.loc[df_flags_observed[column] == False, 'taxonomy_id'].astype(int).values)
list_observed_false_taxids = list(df_flags_observed.loc[df_flags_observed[column] == True, 'taxonomy_id'].astype(int).values)
TP = len([i for i in list_expected_taxids if i in list_observed_true_taxids])
FN = len([i for i in list_expected_taxids if i not in list_observed_true_taxids])
FP = len([i for i in list_observed_true_taxids if i not in list_expected_taxids])
TN = len([i for i in list_observed_false_taxids if i not in list_expected_taxids])
assert len(set(list_expected_taxids + list_observed_true_taxids + list_observed_false_taxids)) == TP + FN + FP + TN
precision = TP / (TP + FP)
recall = TP / (TP + FN)
try:
f1 = (2 * precision * recall) / (precision + recall)
except:
f1 = 0
# Create kappa measures
ALL = TP + FN + FP + TN
p0 = (TP + TN) / (ALL)
pe = (TP + FP)/ALL * (TP + FN)/ALL + (TN + FP)/ALL * (TN + FN)/ALL
kappa = (p0 - pe) / (1 - pe)
return precision, recall, f1, kappa, TP, FN, FP, TN
columns_selected = ['centrifuge_norm', 'ganon_norm', 'kaiju_norm', 'kmcp_norm', 'kraken2_norm', 'krakenuniq_norm',
'centrifuge_relab', 'ganon_relab', 'kaiju_relab', 'kmcp_relab', 'kraken2_relab', 'krakenuniq_relab',
'mean_norm', 'CV_norm', 'mean_relab', 'CV_relab']
df_nominal_stats = {'pass': [], 'mode': [], 'S': [], 'column': [], 'precision': [], 'recall': [], 'f1': [],
'kappa': [], 'TP|FN|FP|TN': []}
for passn in [0, 2]:
for mode in range(1, 10):
for S in [0, 1, 2, 3, 4, 5, 6, 7, 10, 15]:
for column in columns_selected:
summary_table_flags = pd.read_csv(f'{RESULTS_DIR}/summary/ARTIFICIAL_pass{passn}_mode{mode}_taxspecies_S{S}.flags.tsv', sep='\t')
try:
precision, recall, f1, kappa, TP, FN, FP, TN = calculate_nominal_metrics(table_artificial_taxcounts, summary_table_flags, column)
except KeyError:
continue
df_nominal_stats['pass'].append(passn)
df_nominal_stats['mode'].append(mode)
df_nominal_stats['S'].append(S)
df_nominal_stats['column'].append(column)
df_nominal_stats['precision'].append(precision)
df_nominal_stats['recall'].append(recall)
df_nominal_stats['f1'].append(f1)
df_nominal_stats['kappa'].append(kappa)
df_nominal_stats['TP|FN|FP|TN'].append((TP, FN, FP, TN))
df_nominal_stats = pd.DataFrame(df_nominal_stats)
df_nominal_stats
| pass | mode | S | column | precision | recall | f1 | kappa | TP|FN|FP|TN | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 0 | centrifuge_norm | 1.000 | 0.050 | 0.095 | 0.094 | (3, 57, 0, 3637) |
| 1 | 0 | 1 | 0 | ganon_norm | 0.833 | 0.083 | 0.152 | 0.149 | (5, 55, 1, 3636) |
| 2 | 0 | 1 | 0 | kaiju_norm | 1.000 | 0.100 | 0.182 | 0.179 | (6, 54, 0, 3637) |
| 3 | 0 | 1 | 0 | kraken2_norm | 1.000 | 0.217 | 0.356 | 0.352 | (13, 47, 0, 3637) |
| 4 | 0 | 1 | 0 | krakenuniq_norm | 1.000 | 0.100 | 0.182 | 0.179 | (6, 54, 0, 3637) |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2515 | 2 | 9 | 15 | krakenuniq_relab | 0.400 | 0.967 | 0.566 | 0.563 | (58, 2, 87, 12282) |
| 2516 | 2 | 9 | 15 | mean_norm | 0.310 | 0.967 | 0.470 | 0.466 | (58, 2, 129, 12240) |
| 2517 | 2 | 9 | 15 | CV_norm | 0.005 | 0.900 | 0.009 | -0.000 | (54, 6, 11726, 643) |
| 2518 | 2 | 9 | 15 | mean_relab | 0.258 | 0.967 | 0.407 | 0.402 | (58, 2, 167, 12202) |
| 2519 | 2 | 9 | 15 | CV_relab | 0.005 | 0.933 | 0.009 | -0.000 | (56, 4, 11731, 638) |
2520 rows × 9 columns
Numerical values¶
def compute_mad(values):
median = np.median(values)
mad = np.median(np.abs(values - median))
return mad
def calculate_numerical_metrics(df_tax_ground_truth, df_counts_observed, profiler, suffix):
df_tax_ground_truth = df_tax_ground_truth.copy().set_index('taxid')
df_counts_observed = df_counts_observed.copy().set_index('taxonomy_id')
list_expected_taxids = df_tax_ground_truth.index.astype(int).values
list_observed_true_taxids = df_counts_observed.index.astype(int).values
combined_taxid = np.intersect1d(list_expected_taxids, list_observed_true_taxids)
species = df_tax_ground_truth.loc[combined_taxid, 'species'].values
observed_counts = df_counts_observed.loc[combined_taxid, f'{profiler}_{suffix}'].fillna(0).values
expected_col = 'count' if suffix == 'norm' else 'abundance'
expected_counts = df_tax_ground_truth.loc[combined_taxid, expected_col].values
diff_counts = 100 * (observed_counts - expected_counts) / expected_counts
MRE_counts = np.mean(diff_counts)
MRED_counts = np.std(diff_counts)
MAE_counts = np.mean(np.abs(diff_counts))
MAED_counts = np.std(np.abs(diff_counts))
MACV_counts = MAED_counts / MAE_counts
if len(combined_taxid) > 35:
x,y = expected_counts, observed_counts
slope, _, r_value, _, _ = linregress(x, y)
r2 = r_value ** 2
else:
slope, r2 = np.nan, np.nan
return diff_counts, MRE_counts, MRED_counts, MAE_counts, MAED_counts, MACV_counts, slope, r2, combined_taxid, species, expected_counts, observed_counts
for passn in [2]:
for mode in range(5, 6):
for profiler in ['mean']:
try:
summary_table_flags = pd.read_csv(f'{RESULTS_DIR}/summary/ARTIFICIAL_pass{passn}_mode{mode}_taxspecies_S{S}.flags.tsv', sep='\t')
summary_table_counts = pd.read_csv(f'{RESULTS_DIR}/summary/ARTIFICIAL_pass{passn}_mode{mode}_taxspecies_S{S}.diversity.tsv', sep='\t')
diff_counts, MRE_counts, MRED_counts, MAE_counts, MAED_counts, MACV_counts, corr_counts, rmse_counts, taxids_counts, species_counts, expected_counts, observed_counts = \
calculate_numerical_metrics(table_artificial_taxcounts, summary_table_counts, \
profiler, suffix='norm')
diff_abundance, MRE_abundance, MRED_abundance, MAE_abundance, MAED_abundance, MACV_abundance, corr_abundance, rmse_abundance, taxids_abundance, species_abundance, expected_abundance, observed_abundance = \
calculate_numerical_metrics(table_artificial_taxcounts, summary_table_counts, \
profiler, suffix='relab')
except KeyError:
raise
df = pd.DataFrame({'species': species_counts, 'expected_counts': expected_counts, 'observed_counts': observed_counts, 'diff_abundance': diff_counts})
df
| species | expected_counts | observed_counts | diff_abundance | |
|---|---|---|---|---|
| 0 | Bacteroides fragilis | 312500 | 240040.116 | -23.187 |
| 1 | Parabacteroides distasonis | 187500 | 89289.167 | -52.379 |
| 2 | Faecalibacterium prausnitzii | 187500 | 54476.741 | -70.946 |
| 3 | Streptococcus salivarius | 125000 | 58117.095 | -53.506 |
| 4 | Blautia coccoides | 312500 | 173325.771 | -44.536 |
| 5 | Erysipelatoclostridium ramosum | 125000 | 67123.702 | -46.301 |
| 6 | Lactobacillus acidophilus | 312500 | 239520.544 | -23.353 |
| 7 | Bifidobacterium bifidum | 312500 | 184723.065 | -40.889 |
| 8 | Cutibacterium acnes | 312500 | 254046.803 | -18.705 |
| 9 | Pichia kudriavzevii | 50000 | 48697.541 | -2.605 |
| 10 | Saccharomyces cerevisiae | 200000 | 195902.958 | -2.049 |
| 11 | Aspergillus flavus | 200000 | 148008.703 | -25.996 |
| 12 | Candida albicans | 200000 | 184824.390 | -7.588 |
| 13 | Escherichia phage T4 | 200000 | 164474.441 | -17.763 |
| 14 | Bacteriophage P2 | 200000 | 45220.596 | -77.390 |
| 15 | Human immunodeficiency virus 1 | 200000 | 201026.902 | 0.513 |
| 16 | Norovirus GI | 50000 | 50524.777 | 1.050 |
| 17 | Tobacco mosaic virus | 250000 | 243272.405 | -2.691 |
| 18 | Bacteroides ovatus | 312500 | 111631.939 | -64.278 |
| 19 | Rotavirus A | 250000 | 251608.261 | 0.643 |
| 20 | Rotavirus B | 250000 | 253634.055 | 1.454 |
| 21 | Rotavirus C | 250000 | 254565.176 | 1.826 |
| 22 | Penicillium digitatum | 100000 | 95742.552 | -4.257 |
| 23 | Candida dubliniensis | 200000 | 184501.730 | -7.749 |
| 24 | Eremothecium sinecaudum | 50000 | 49425.836 | -1.148 |
| 25 | Parabacteroides merdae | 187500 | 125862.826 | -32.873 |
| 26 | Alternaria dauci | 150000 | 141107.893 | -5.928 |
| 27 | Eubacterium callanderi | 62500 | 30952.294 | -50.476 |
| 28 | Malassezia restricta | 150000 | 149869.196 | -0.087 |
| 29 | Blautia luti | 312500 | 53234.639 | -82.965 |
| 30 | Trichoderma asperellum | 50000 | 46584.994 | -6.830 |
| 31 | Human adenovirus 7 | 150000 | 147511.825 | -1.659 |
| 32 | Saccharomyces kudriavzevii | 200000 | 190980.300 | -4.510 |
| 33 | Saccharomyces mikatae | 200000 | 189682.529 | -5.159 |
| 34 | Hungatella hathewayi | 62500 | 40802.328 | -34.716 |
| 35 | Aspergillus chevalieri | 200000 | 187004.613 | -6.498 |
| 36 | Acidaminococcus intestini | 62500 | 57048.401 | -8.723 |
| 37 | Fusarium falciforme | 50000 | 46336.938 | -7.326 |
| 38 | Puccinia triticina | 50000 | 49269.888 | -1.460 |
| 39 | Alistipes finegoldii | 62500 | 51804.368 | -17.113 |
| 40 | Akkermansia muciniphila | 312500 | 196080.997 | -37.254 |
| 41 | Candida orthopsilosis | 200000 | 189190.931 | -5.405 |
| 42 | Bacteroides intestinalis | 312500 | 163291.164 | -47.747 |
| 43 | Kazachstania africana | 100000 | 97363.929 | -2.636 |
| 44 | Dietzia lutea | 250000 | 232472.093 | -7.011 |
| 45 | Alistipes indistinctus | 62500 | 55391.499 | -11.374 |
| 46 | Mimivirus terra2 | 100000 | 84794.462 | -15.206 |
| 47 | Ruthenibacterium lactatiformans | 187500 | 135305.249 | -27.837 |
| 48 | Kwoniella shandongensis | 50000 | 48329.668 | -3.341 |
| 49 | Cryptococcus decagattii | 50000 | 47962.656 | -4.075 |
| 50 | Butyricimonas faecalis | 62500 | 49405.437 | -20.951 |
| 51 | Akanthomyces muscarius | 50000 | 47989.514 | -4.021 |
| 52 | Eisenbergiella porci | 62500 | 57135.945 | -8.582 |
| 53 | Bovine alphaherpesvirus 2 | 150000 | 148849.267 | -0.767 |
| 54 | Human herpesvirus 4 type 2 | 150000 | 139697.441 | -6.868 |
| 55 | Hepatitis C virus | 150000 | 151377.085 | 0.918 |
| 56 | Zaire ebolavirus | 50000 | 49744.206 | -0.512 |
| 57 | Dengue virus | 100000 | 101890.120 | 1.890 |
df_numerical_stats = {'pass': [], 'mode': [], 'profiler': [],
'diff_counts': [], 'MRE_counts': [],'MRED_counts': [], 'MAE_counts': [], 'MAED_counts': [], 'MACV_counts': [],
'corr_counts': [], 'R2_counts': [], 'taxid_counts': [],
'species_counts': [], 'expected_counts': [], 'observed_counts': [],
'diff_abundance': [], 'MRE_abundance': [],'MRED_abundance': [], 'MAE_abundance': [], 'MAED_abundance': [], 'MACV_abundance': [],
'corr_abundance': [], 'R2_abundance': [], 'taxid_abundance': [],
'species_abundance': [], 'expected_abundance': [], 'observed_abundance': [], }
for passn in [0, 2]:
for mode in range(1, 10):
for profiler in LIST_PROFILERS + ['mean']:
summary_table_flags = pd.read_csv(f'{RESULTS_DIR}/summary/ARTIFICIAL_pass{passn}_mode{mode}_taxspecies_S{S}.flags.tsv', sep='\t')
summary_table_counts = pd.read_csv(f'{RESULTS_DIR}/summary/ARTIFICIAL_pass{passn}_mode{mode}_taxspecies_S{S}.diversity.tsv', sep='\t')
try:
diff_counts, MRE_counts, MRED_counts, MAE_counts, MAED_counts, MACV_counts, corr_counts, rmse_counts, taxids_counts, species_counts, expected_counts, observed_counts = \
calculate_numerical_metrics(table_artificial_taxcounts, summary_table_counts, \
profiler, suffix='norm')
diff_abundance, MRE_abundance, MRED_abundance, MAE_abundance, MAED_abundance, MACV_abundance, corr_abundance, rmse_abundance, taxids_abundance, species_abundance, expected_abundance, observed_abundance = \
calculate_numerical_metrics(table_artificial_taxcounts, summary_table_counts, \
profiler, suffix='relab')
except KeyError:
raise
continue
df_numerical_stats['pass'].append(passn)
df_numerical_stats['mode'].append(mode)
df_numerical_stats['profiler'].append(profiler)
df_numerical_stats['diff_counts'].append(diff_counts)
df_numerical_stats['MRE_counts'].append(MRE_counts)
df_numerical_stats['MRED_counts'].append(MRED_counts)
df_numerical_stats['MAE_counts'].append(MAE_counts)
df_numerical_stats['MAED_counts'].append(MAED_counts)
df_numerical_stats['MACV_counts'].append(MACV_counts)
df_numerical_stats['corr_counts'].append(corr_counts)
df_numerical_stats['R2_counts'].append(rmse_counts)
df_numerical_stats['taxid_counts'].append(taxids_counts)
df_numerical_stats['species_counts'].append(species_counts)
df_numerical_stats['expected_counts'].append(expected_counts)
df_numerical_stats['observed_counts'].append(observed_counts)
df_numerical_stats['diff_abundance'].append(diff_abundance)
df_numerical_stats['MRE_abundance'].append(MRE_abundance)
df_numerical_stats['MRED_abundance'].append(MRED_abundance)
df_numerical_stats['MAE_abundance'].append(MAE_abundance)
df_numerical_stats['MAED_abundance'].append(MAED_abundance)
df_numerical_stats['MACV_abundance'].append(MACV_abundance)
df_numerical_stats['corr_abundance'].append(corr_abundance)
df_numerical_stats['R2_abundance'].append(rmse_abundance)
df_numerical_stats['taxid_abundance'].append(taxids_abundance)
df_numerical_stats['species_abundance'].append(species_abundance)
df_numerical_stats['expected_abundance'].append(expected_abundance)
df_numerical_stats['observed_abundance'].append(observed_abundance)
df_numerical_stats = pd.DataFrame(df_numerical_stats)
df_numerical_stats
| pass | mode | profiler | diff_counts | MRE_counts | MRED_counts | MAE_counts | MAED_counts | MACV_counts | corr_counts | R2_counts | taxid_counts | species_counts | expected_counts | observed_counts | diff_abundance | MRE_abundance | MRED_abundance | MAE_abundance | MAED_abundance | MACV_abundance | corr_abundance | R2_abundance | taxid_abundance | species_abundance | expected_abundance | observed_abundance | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | centrifuge | [-21.76736, -50.1056, -74.39946666666667, -58.... | -16.939 | 23.674 | 17.173 | 23.505 | 1.369 | 0.635 | 0.576 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [244477.0, 93552.0, 48001.0, 51790.0, 189991.0... | [-1.33929624238111, -37.07720182312471, -67.71... | 4.750 | 29.856 | 26.450 | 14.640 | 0.554 | 0.801 | 0.576 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [3.0831469924255903, 1.1798024658164117, 0.605... |
| 1 | 0 | 1 | ganon | [-23.91328, -55.785066666666665, -83.386666666... | -28.729 | 21.022 | 28.729 | 21.022 | 0.732 | 0.558 | 0.590 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [237771.0, 82903.0, 31150.0, 42979.0, 159924.0... | [7.790932959479549, -37.36135668956172, -76.46... | 0.969 | 29.782 | 25.411 | 15.563 | 0.612 | 0.790 | 0.590 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [3.368466654983736, 1.1744745620707178, 0.4412... |
| 2 | 0 | 1 | kaiju | [-53.25664, -79.21386666666666, -91.4437333333... | -49.543 | 28.481 | 49.543 | 28.481 | 0.575 | 0.389 | 0.313 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [146073.0, 38974.0, 16043.0, 14537.0, 54507.0,... | [-8.305719789553791, -59.224806809815064, -83.... | -1.021 | 55.870 | 47.167 | 29.963 | 0.635 | 0.764 | 0.313 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [2.865446256576444, 0.7645348723159675, 0.3147... |
| 3 | 0 | 1 | kraken2 | [-76.07584, -94.63306666666666, -99.4, -99.609... | -65.147 | 24.902 | 65.147 | 24.902 | 0.382 | 0.176 | 0.122 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [74763.0, 10063.0, 1125.0, 488.0, 11530.0, 288... | [-19.322126949109276, -81.90144330504141, -97.... | 17.532 | 83.974 | 75.284 | 41.126 | 0.546 | 0.594 | 0.122 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [2.521183532840335, 0.3393479380304735, 0.0379... |
| 4 | 0 | 1 | krakenuniq | [-19.62944, -48.239466666666665, -68.408533333... | -16.729 | 23.969 | 17.018 | 23.765 | 1.396 | 0.642 | 0.572 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [251158.0, 97051.0, 59234.0, 56908.0, 183826.0... | [-0.4454715144359227, -35.88453918746712, -60.... | 3.147 | 29.691 | 25.571 | 15.414 | 0.603 | 0.795 | 0.572 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [3.1110790151738774, 1.2021648902349915, 0.733... |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 103 | 2 | 9 | ganon | [3.88512, -35.4976, -56.00053333333334, -32.8,... | -1.882 | 23.489 | 18.223 | 14.940 | 0.820 | 0.751 | 0.659 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [324641.0, 120942.0, 82499.0, 84000.0, 215092.... | [5.0978227572311425, -34.74463135224731, -55.4... | -0.736 | 23.763 | 18.879 | 14.451 | 0.765 | 0.760 | 0.659 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [3.284306961163473, 1.223538162145363, 0.83462... |
| 104 | 2 | 9 | kaiju | [-48.45856, -76.704, -89.0208, -85.9616, -80.8... | -43.474 | 29.718 | 43.474 | 29.718 | 0.684 | 0.422 | 0.324 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [161067.0, 43680.0, 20586.0, 17548.0, 59848.0,... | [-9.872292921902698, -59.26355445072247, -80.8... | -1.156 | 51.967 | 43.407 | 28.597 | 0.659 | 0.738 | 0.324 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [2.8164908461905407, 0.7638083540489538, 0.359... |
| 105 | 2 | 9 | kraken2 | [-39.60768, -84.74186666666667, -95.9482666666... | -32.087 | 35.514 | 32.087 | 35.514 | 1.107 | 0.419 | 0.235 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [188726.0, 28609.0, 7597.0, 4884.0, 33042.0, 6... | [0.13138111998259205, -74.70178386954647, -93.... | 12.601 | 58.882 | 53.987 | 26.671 | 0.494 | 0.695 | 0.235 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [3.129105659999456, 0.4743415524460034, 0.1259... |
| 106 | 2 | 9 | krakenuniq | [-19.67168, -48.3264, -68.4816, -54.56, -41.30... | -16.778 | 23.985 | 17.064 | 23.782 | 1.394 | 0.641 | 0.572 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [251026.0, 96888.0, 59097.0, 56800.0, 183437.0... | [-0.37064433374560224, -35.91043018258363, -60... | 3.219 | 29.748 | 25.644 | 15.417 | 0.601 | 0.796 | 0.572 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [3.11341736457045, 1.2016794340765569, 0.73296... |
| 107 | 2 | 9 | mean | [-20.390584802138655, -49.665829444940506, -66... | -16.259 | 21.692 | 16.933 | 21.171 | 1.250 | 0.638 | 0.622 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [248779.4224933167, 94376.56979073654, 62310.1... | [-1.6718373579364112, -43.06956794651194, -63.... | 8.287 | 32.919 | 29.573 | 16.667 | 0.564 | 0.799 | 0.536 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [3.072755082564487, 1.0674456010029012, 0.6795... |
108 rows × 27 columns
Analysis of kappa/F1¶
F1-score and $\kappa$ are quite different measures but we observe that they are correlated in this data.
# Compute and print correlation (Pearson by default)
corr = df_nominal_stats['f1'].corr(df_nominal_stats['kappa'])
print("Pearson correlation between F1 and Kappa:", corr)
# Scatter plot with the identity line
plt.figure(figsize=(4, 4))
# Add the identity line (y = x)
plt.plot([0, 1], [0, 1], color='red', linestyle='--', alpha=0.5)
sns.scatterplot(x='f1', y='kappa', data=df_nominal_stats, s=20)
plt.text(0.01, 0.95, f'R$^2$: {corr**2:.5f}')
sns.despine(top=True, right=True)
# Add title and labels
plt.title('F1-score vs Kappa')
plt.xlabel('F1-score')
plt.ylabel('Kappa')
plt.tight_layout()
for format in ['png', 'tiff']:
plt.savefig(f'{RESULTS_DIR}/figures/paper/fig0.kappa-f1.{format}', dpi=DPI, bbox_inches='tight' )
plt.show()
Pearson correlation between F1 and Kappa: 0.9997826111567186
In that sense, we can then use one of the measures to explain the results and don't need the second one. We are going to select the F1 score because it has a more clear interpretability and it is related to precision and recall, which are alrady being used.
import random
def cohen_kappa(tp, fp, fn, tn):
"""
Compute Cohen's kappa given the confusion matrix counts:
TP = true positives
FP = false positives
FN = false negatives
TN = true negatives
"""
N = tp + fp + fn + tn # total
# Observed agreement
po = (tp + tn) / N
# Expected agreement
# (actual positives * predicted positives) + (actual negatives * predicted negatives)
p_a = (tp + fn) / N # actual positives
p_p = (tp + fp) / N # predicted positives
p_n = (fp + tn) / N # actual negatives? Actually "actual positives" vs "predicted negatives" can be spelled out:
p_n_act = (fp + tn) / N # actual negatives
p_n_pred = (fn + tn) / N # predicted negatives
pe = p_a * p_p + p_n_act * p_n_pred
if np.isclose(pe, 1.0):
# If pe is 1, the denominator (1 - pe) -> 0, can cause issues
return np.nan
kappa = (po - pe) / (1 - pe)
return kappa
def f1_score_simple(tp, fp, fn):
"""
Compute the F1 score from TP, FP, FN:
F1 = 2 * TP / (2*TP + FP + FN)
"""
denom = (2 * tp + fp + fn)
if denom == 0:
return np.nan
return 2 * tp / denom
# --- Simulation parameters ---
N = 3000 # total samples per confusion matrix
n_sims = 100000 # number of random simulations
f1_values = []
kappa_values = []
table_values = []
for _ in range(n_sims):
# We need TP, FP, FN, TN >= 0 and sum = N.
# One way: sample three random integers and let the fourth be what's left.
# This ensures the sum is N.
tp = random.randint(0, N)
fp = random.randint(0, N - tp)
fn = random.randint(0, N - tp - fp)
tn = N - tp - fp - fn
# Compute F1
f1 = f1_score_simple(tp, fp, fn)
# Compute Kappa
kappa = cohen_kappa(tp, fp, fn, tn)
# We only keep valid calculations (not NaN)
if not np.isnan(f1) and not np.isnan(kappa):
f1_values.append(f1)
kappa_values.append(kappa)
table_values.append((tp, fp, fn, tn))
# Convert to numpy arrays for correlation
f1_values = np.array(f1_values)
kappa_values = np.array(kappa_values)
# Compute Pearson correlation
corr, p_value = pearsonr(f1_values, kappa_values)
print(f"Number of valid simulations: {len(f1_values)} / {n_sims}")
print(f"Pearson correlation between F1 and Kappa: {corr:.3f} (p = {p_value:.3e})")
# Plot the scatter of F1 vs Kappa
plt.figure(figsize=(7, 5))
plt.scatter(f1_values, kappa_values, alpha=0.2, s=10)
plt.title("Monte Carlo Simulation (TP, FP, FN, TN random)")
plt.xlabel("F1 Score")
plt.ylabel("Cohen's Kappa")
plt.plot([0, 1], [0, 1])
plt.grid(True)
plt.show()
Number of valid simulations: 99965 / 100000 Pearson correlation between F1 and Kappa: 0.681 (p = 0.000e+00)
N = 0.8
df = pd.DataFrame({'f1': f1_values, 'kappa': kappa_values, 'table': table_values})
df[(df['f1'] > N - 0.03) & (df['f1'] < N + 0.03) & (df['kappa'] < N + 0.03) & (df['kappa'] > N- 0.03)]
| f1 | kappa | table | |
|---|---|---|---|
| 1440 | 0.821 | 0.778 | (484, 149, 62, 2305) |
| 38383 | 0.829 | 0.772 | (627, 206, 53, 2114) |
| 39301 | 0.810 | 0.778 | (351, 126, 39, 2484) |
| 74870 | 0.811 | 0.792 | (232, 108, 0, 2660) |
| 78198 | 0.830 | 0.780 | (562, 94, 137, 2207) |
| 93867 | 0.811 | 0.791 | (236, 80, 30, 2654) |
N = 1
df[(df['f1'] > N - 0.03) & (df['f1'] < N + 0.03) & (df['kappa'] < N + 0.03) & (df['kappa'] > N- 0.03)]
| f1 | kappa | table | |
|---|---|---|---|
| 361 | 0.992 | 0.972 | (2118, 1, 33, 848) |
| 911 | 1.000 | 1.000 | (2999, 0, 0, 1) |
| 3050 | 0.999 | 0.974 | (2879, 5, 1, 115) |
| 4738 | 1.000 | 1.000 | (2943, 0, 0, 57) |
| 6415 | 1.000 | 1.000 | (2986, 0, 0, 14) |
| ... | ... | ... | ... |
| 96296 | 1.000 | 1.000 | (2993, 0, 0, 7) |
| 96612 | 1.000 | 1.000 | (2999, 0, 0, 1) |
| 97751 | 0.992 | 0.983 | (1564, 1, 25, 1410) |
| 97820 | 0.999 | 0.977 | (2883, 1, 4, 112) |
| 98444 | 1.000 | 0.985 | (2965, 1, 0, 34) |
90 rows × 3 columns
N = 0
df[(df['f1'] > N - 0.03) & (df['f1'] < N + 0.03) & (df['kappa'] < N + 0.03) & (df['kappa'] > N- 0.03)]
| f1 | kappa | table | |
|---|---|---|---|
| 410 | 0.019 | -0.002 | (27, 2856, 4, 113) |
| 1178 | 0.014 | -0.020 | (18, 2561, 32, 389) |
| 2550 | 0.018 | -0.017 | (26, 2769, 27, 178) |
| 5017 | 0.004 | -0.004 | (5, 2715, 7, 273) |
| 5733 | 0.001 | -0.013 | (1, 1611, 20, 1368) |
| ... | ... | ... | ... |
| 96808 | 0.010 | -0.015 | (15, 2958, 22, 5) |
| 97000 | 0.007 | -0.003 | (11, 2969, 4, 16) |
| 97339 | 0.020 | -0.021 | (28, 2667, 34, 271) |
| 97542 | 0.015 | -0.028 | (20, 2583, 45, 352) |
| 99850 | 0.005 | -0.012 | (8, 2949, 18, 25) |
107 rows × 3 columns
How does the S parametter used during curve fitting affect?¶
The S parametter is useful to tweak the detection results, so that we can include more or less species during the flagging step. Since it is a structural parametter, we want to fit it first so that we can answer several other comparisons.
To do this we are going to use the nominal variables and their derived statistics.
Checking recall/precision/F1-score for inclusion/exclusion of species¶
cols = [f'{i}_norm' for i in LIST_PROFILERS] + ['mean_norm']
modes = range(2, 9)
passn = [2]
S_values = df_nominal_stats['S'].unique()
subset_df = df_nominal_stats[(df_nominal_stats['pass'].isin(passn)) & \
(df_nominal_stats['column'].isin(cols)) & \
(df_nominal_stats['mode'].isin(modes))]
melted_df = pd.melt(
subset_df,
id_vars=['mode', 'S', 'column'],
value_vars=['recall', 'precision', 'f1'],
var_name='metric',
value_name='score'
)
# Create a colormap for 'mode'
norm = Normalize(vmin=melted_df['mode'].min(), vmax=melted_df['mode'].max())
cmap = plt.cm.viridis # Choose a colormap (e.g., 'viridis', 'plasma', 'cividis')
# Create a FacetGrid: 6x3 grid (row for each profiler, column for each metric)
g = sns.FacetGrid(
melted_df,
col='column',
row='metric',
height=2,
sharey=True,
sharex=True
)
# Map the lineplot to the grid
def lineplot_with_cmap(data, **kwargs):
for mode in sorted(data['mode'].unique()):
subset = data[data['mode'] == mode]
plt.plot(subset['S'], subset['score'], label=f"Mode {mode}",
color=cmap(norm(mode)), marker='o')
g.map_dataframe(lineplot_with_cmap)
# Create a legend for the discrete modes
handles = [
plt.Line2D([0], [0], color=cmap(norm(mode)), marker='o', linestyle='', label=f"Mode {mode}")
for mode in sorted(melted_df['mode'].unique())
]
plt.legend(
handles=handles,
title="",
bbox_to_anchor=(1.05, 3),
loc='center left',
frameon=False
)
# Set x-axis ticks (if you have specific S values)
g.set(xticks=subset_df['S'].unique())
for ax in g.axes.ravel():
ax.set_title('')
# Add axis labels and titles
for ax, profiler in zip(g.axes[0, :], melted_df['column'].unique()):
ax.set_title(profiler.replace('_norm', ''))
for ax, score in zip(g.axes[:, 0], ['recall', 'precision', 'F1-score']):
ax.set_ylabel(score)
plt.tight_layout()
for format in ['png', 'tiff']:
plt.savefig(f'{RESULTS_DIR}/figures/paper/figE.{format}', dpi=DPI, bbox_inches='tight')
plt.show()
/tmp/ipykernel_2918775/2133720995.py:58: UserWarning: Tight layout not applied. tight_layout cannot make Axes height small enough to accommodate all Axes decorations. plt.tight_layout()
g = sns.FacetGrid(
melted_df,
col='column',
row='metric',
height=3,
sharey=True,
sharex=True
)
g.map(sns.boxplot, 'S', 'score')
# Set x-axis ticks (if you have specific S values)
for ax in g.axes.ravel():
ax.set_title('')
# Add axis labels and titles
for ax, profiler in zip(g.axes[0, :], melted_df['column'].unique()):
ax.set_title(profiler.replace('_norm', ''))
for ax, score in zip(g.axes[:, 0], ['recall', 'precision', 'F1-score']):
ax.set_ylabel(score)
plt.subplots_adjust(top=0.9)
g.fig.suptitle("Metrics by Profiler and Mode", fontsize=16)
plt.show()
/home/neuroimm/.local/share/mamba/envs/EVs/lib/python3.13/site-packages/seaborn/axisgrid.py:718: UserWarning: Using the boxplot function without specifying `order` is likely to produce an incorrect plot. warnings.warn(warning)
The aim of this part of the analysis was to select the "optimal" S to then make other comparisons and extract proper conclusions.
If we look at individual profilers, the aim is not the select the S with best F1 score, but to select the smallest S that provides a sufficiently high recall, ensuring that we don't lose TP species. This threshold depends on the profiler. For CEN it is 6-7, GAN is 7-10, KAI is 1-2, KR2 is 4-5, KRU is 5-6. We see that at these values the precision drops (expectedly), but it remains stable afterwards for most profilers. Therefore, a value of S=2 should be sufficient to ensure that the results are correct.
IMPORTANT: conceptually, if we are choosng the number of the species based on the mean number of reads, using a low S is still good, because the number of reads reported by profilers that are not flagged are still considered. That is, the # of reads assigned to one profiler is the same regardless of S.
The advantage of using the mean value instead of the individual profilers is that it tends to retrieve a better stability on the precision throughout the S values and modes. In fact, each profiler has an individual dinamic, and the mean value averages them all. Therefore, using the averaged value for S allows us to choose the parameter with better predictability, ensuring that we don't choose a very high value, while at the same time keeping the correct number of reads.
Therefore, we are going to choose S=2 and S=7 for comparisons of robustness with biological samples.
Does pass0/pass2 (no host pre-mapping vs host pre-mapping) affect the detection of the species?¶
For this part we are going to run run analyses:
- Retrieve the raw detection of species with the passes, and calculate their jaccard index.
- Calculate $\chi^2$ for each case and see if there are significative differences.
- Calculate the Pearson correlation + RMSE for several mode values.
Total number of species and Jaccard index¶
# Plot Jaccard index between the different species
"""
For this part we are going to read all the .standardised.species reports and simply read the number of species and compute a table with the total number of species and the jaccard index
"""
def get_n_species(sample, mode, profiler, S=15):
pass0_df = pd.read_csv(f'{RESULTS_DIR}/summary/{sample}_pass0_mode{mode}_taxspecies_S{S}.diversity.tsv', sep='\t').set_index('taxonomy_id')
pass2_df = pd.read_csv(f'{RESULTS_DIR}/summary/{sample}_pass2_mode{mode}_taxspecies_S{S}.diversity.tsv', sep='\t').set_index('taxonomy_id')
pass0_profiler = pass0_df[profiler].dropna()
pass2_profiler = pass2_df[profiler].dropna()
pass0_taxids = pass0_profiler.index.values
pass2_taxids = pass2_profiler.index.values
jaccard_index = len(np.intersect1d(pass0_taxids, pass2_taxids)) / len(np.union1d(pass0_taxids, pass2_taxids))
return len(pass0_taxids), len(pass2_taxids), jaccard_index
dict_n_species = {'profiler': [],
'mode': [],
'pass 0 species': [],
'pass 2 species': [],
'jaccard': []}
for profiler in LIST_PROFILERS + ['mean']:
for mode in range(1,10):
try:
pass0_n_species, pass2_n_species, jaccard_index = get_n_species('ARTIFICIAL', mode, profiler + '_norm')
dict_n_species['profiler'].append(profiler)
dict_n_species['mode'].append(mode)
dict_n_species['pass 0 species'].append(pass0_n_species)
dict_n_species['pass 2 species'].append(pass2_n_species)
dict_n_species['jaccard'].append(jaccard_index)
except:
print(f'No entry added for profiler {profiler} and mode {mode}')
df_n_species = pd.DataFrame(dict_n_species)
plt.rc('axes', linewidth=0.65)
# Initialize a 3x2 grid for subplots
fig, axes = plt.subplots(2, 3, figsize=(11, 6))
axes = axes.flatten()
# Set Seaborn style
sns.set_theme(style="white")
# Define custom labels for the shared legend
legend_elements = [
plt.Line2D([0], [0], marker='o', color='#648FFF', label='Pass 0', markersize=8, linestyle='None'),
plt.Line2D([0], [0], marker='o', color='#785EF0', label='Pass 2', markersize=8, linestyle='None'),
plt.Line2D([0], [0], marker='o', color='#DC267F', label='Jaccard Index', markersize=8, linestyle='-')
]
# Plot for each profiler
for i, profiler in enumerate(LIST_PROFILERS + ['mean']):
df_profiler = df_n_species[df_n_species['profiler'] == profiler]
ax1 = axes[i]
# Scatter plots for `pass 0 species` and `pass 2 species`
sns.scatterplot(
data=df_profiler, x='mode', y='pass 0 species', ax=ax1, color='#648FFF', s=50
)
sns.scatterplot(
data=df_profiler, x='mode', y='pass 2 species', ax=ax1, color='#785EF0', s=50
)
sns.despine(top=True, right=True)
sns.set_theme(style="white")
# Configure the primary y-axis
ax1.set_xticks(range(1, 10))
ax1.set_xlabel('Mode')
ax1.set_ylabel('Species Count', color='black')
#ax1.set_yscale('log') # Optional: Log scale
ax1.set_title(f'{profiler.capitalize()}')
# Create a secondary y-axis for Jaccard index
ax2 = ax1.twinx()
sns.lineplot(
data=df_profiler, x='mode', y='jaccard', ax=ax2, color='#DC267F', marker='o'
)
ax2.set_ylabel('Jaccard Index', color='#DC267F')
sns.despine(top=True, right=True)
# Hide unused axes
for j in range(len(LIST_PROFILERS) + 1, len(axes)):
axes[j].axis('off')
# Add a shared legend
fig.legend(
handles=legend_elements, loc='center right', frameon=False, title="", bbox_to_anchor=(1.15, 0.5)
)
plt.tight_layout()
for format in ['png', 'tiff']:
plt.savefig(f'{RESULTS_DIR}/figures/paper/figB.{format}', dpi=DPI, bbox_inches='tight' )
sns.lineplot(df_n_species, x='mode', y='jaccard', hue='profiler', marker='o', palette=custom_palette)
plt.legend(
loc='center right', frameon=False, bbox_to_anchor=(1.35, 0.5),
)
<matplotlib.legend.Legend at 0x76fde1836ad0>
The number of detected species tends to go up with the mode, which is expected (with the exception of krakenuniq). For most profilers the increase is linear, but for centrifuge the increase is exponential, although it may reach the top of detected species at some point.
We see that in general Jaccard indexes are very high. We should consider that this plot is not "strictly" relevant because many, many species are false positive, so we don't really care about the number of falsely assigned species. Still, it is important to note that pass does not have, grosso modo, a relevant impact. It is interesting to find that for kaiju and centrifuge the mode has an impact. For Kaiju it decreases; this it might be because the number of spurious species has increased; while for centrifuge the jaccard index may increase because their number of species is so high (12000!!) that we may be reaching the maximum of discoverable amount of species, and therefore it is obvious that the Jaccard index will increase in that case. For the rest of profilers the mode does not seem to affect the Jaccard index, regardless of the total number of species detected.
Checking statistical differences in the truth table.¶
We are going to use the truth tables to compute statistically differential capture of species. We are going to use S=2 and S=7 throughout the different modes and profilers.
from scipy.stats import chi2_contingency
SMALL_VAL = 0.1
df_sub = df_nominal_stats[(df_nominal_stats['column'].isin([f'{i}_norm' for i in LIST_PROFILERS] + ['mean_norm'])) &
(df_nominal_stats['S'].isin([0, 1, 2, 3, 4, 5, 6, 7, 10, 15]))].copy()
# Initialize a list to store chi-squared results
chi2_results = []
# Iterate over unique combinations of mode, S, and column
for (mode, S, column), group in df_sub.groupby(['mode', 'S', 'column']):
try:
# Extract contingency tables for full (TP, FN, FP, TN) and partial (TP, FP, FN)
full_contingency_table = []
partial_contingency_table = []
precision_diff, recall_diff, f1_diff = None, None, None # Initialize differences
for p in [0, 2]:
data = group[group['pass'] == p]
if len(data):
(tp, fn, fp, tn) = data.iloc[0]['TP|FN|FP|TN']
full_contingency_table.append([tp, fn, fp, tn]) # Full table
partial_contingency_table.append([tp, fp, fn]) # Partial table
# Calculate differences for precision, recall, and f1
if p == 0:
precision_0 = data.iloc[0]['precision']
recall_0 = data.iloc[0]['recall']
f1_0 = data.iloc[0]['f1']
elif p == 2:
precision_2 = data.iloc[0]['precision']
recall_2 = data.iloc[0]['recall']
f1_2 = data.iloc[0]['f1']
# Compute differences
if 'precision_0' in locals() and 'precision_2' in locals():
precision_diff = precision_2 - precision_0
recall_diff = recall_2 - recall_0
f1_diff = f1_2 - f1_0
# Perform chi-squared test for full contingency table
if len(full_contingency_table) == 2:
chi2_full, p_value_full, _, _ = chi2_contingency(np.array(full_contingency_table) + SMALL_VAL)
# Perform chi-squared test for partial contingency table
if len(partial_contingency_table) == 2:
chi2_partial, p_value_partial, _, _ = chi2_contingency(np.array(partial_contingency_table) + SMALL_VAL)
# Store results
chi2_results.append({
'mode': mode,
'S': S,
'column': column,
'chi2_full': chi2_full if len(full_contingency_table) == 2 else None,
'p_value_full': p_value_full if len(full_contingency_table) == 2 else None,
'chi2_partial': chi2_partial if len(partial_contingency_table) == 2 else None,
'p_value_partial': p_value_partial if len(partial_contingency_table) == 2 else None,
'precision_diff': precision_diff,
'recall_diff': recall_diff,
'f1_diff': f1_diff,
'stats_pass0': full_contingency_table[0],
'stats_pass2': full_contingency_table[1],
})
except Exception as e:
print(f"Error processing {mode}, {S}, {column}: {e}")
# Convert results to a DataFrame
df_pass_chi2_stats = pd.DataFrame(chi2_results)
# Add significance categories based on p-value thresholds
alpha = 0.1
df_pass_chi2_stats['significance'] = df_pass_chi2_stats.apply(
lambda row: (
'Both Significant' if row['p_value_full'] < alpha and row['p_value_partial'] < alpha else
'Only Full Significant' if row['p_value_full'] < alpha else
'Only Partial Significant' if row['p_value_partial'] < alpha else
'Neither Significant'
),
axis=1
)
# Count points in each quadrant
quadrant_counts = {
'Both Significant': len(df_pass_chi2_stats[(df_pass_chi2_stats['p_value_full'] < alpha) & (df_pass_chi2_stats['p_value_partial'] < alpha)]),
'Only Full Significant': len(df_pass_chi2_stats[(df_pass_chi2_stats['p_value_full'] < alpha) & (df_pass_chi2_stats['p_value_partial'] >= alpha)]),
'Only Partial Significant': len(df_pass_chi2_stats[(df_pass_chi2_stats['p_value_full'] >= alpha) & (df_pass_chi2_stats['p_value_partial'] < alpha)]),
'Neither Significant': len(df_pass_chi2_stats[(df_pass_chi2_stats['p_value_full'] >= alpha) & (df_pass_chi2_stats['p_value_partial'] >= alpha)])
}
# Set up the plots
fig, axs = plt.subplots(1, 2, figsize=(12, 6))
# First plot: chi2_full vs chi2_partial
sns.scatterplot(data=df_pass_chi2_stats, x='chi2_full', y='chi2_partial', ax=axs[0])
axs[0].set_title('Chi2 Full vs Partial')
axs[0].set_xlabel('Chi2 Full')
axs[0].set_ylabel('Chi2 Partial')
# Second plot: p_value_full vs p_value_partial with significance categories
sns.scatterplot(
data=df_pass_chi2_stats,
x='p_value_full',
y='p_value_partial',
hue='significance',
palette={
'Both Significant': '#bc0000',
'Only Full Significant': '#0000bc',
'Only Partial Significant': '#00bc00',
'Neither Significant': '#aaaaaa'
},
ax=axs[1]
)
# Add thresholds
axs[1].axhline(alpha, color='black', linestyle='--', linewidth=1)
axs[1].axvline(alpha, color='black', linestyle='--', linewidth=1)
# Annotate quadrant counts
axs[1].text(0.25, 0.75, f"Both Significant\n{quadrant_counts['Both Significant']}",
ha='center', color='#bc0000', fontsize=12, bbox=dict(facecolor='white', edgecolor='none'))
axs[1].text(0.75, 0.75, f"Only Full Significant\n{quadrant_counts['Only Full Significant']}",
ha='center', color='#0000bc', fontsize=12, bbox=dict(facecolor='white', edgecolor='none'))
axs[1].text(0.25, 0.25, f"Only Partial Significant\n{quadrant_counts['Only Partial Significant']}",
ha='center', color='#00bc00', fontsize=12, bbox=dict(facecolor='white', edgecolor='none'))
axs[1].text(0.75, 0.25, f"Neither Significant\n{quadrant_counts['Neither Significant']}",
ha='center', color='#aaaaaa', fontsize=12, bbox=dict(facecolor='white', edgecolor='none'))
# Customize the plot
axs[1].set_title('P-Value Comparison with Significance')
axs[1].set_xlabel('P-Value Full')
axs[1].set_ylabel('P-Value Partial')
axs[1].set_xlim(0, 1)
axs[1].set_ylim(0, 1)
plt.legend(loc='center right', frameon=False, bbox_to_anchor=(1.65, 0.5))
# Adjust layout
plt.tight_layout()
# Show the plots
plt.show()
df_pass_chi2_stats[df_pass_chi2_stats['significance'] != 'Neither Significant'].sort_values(by=['mode', 'S'])
| mode | S | column | chi2_full | p_value_full | chi2_partial | p_value_partial | precision_diff | recall_diff | f1_diff | stats_pass0 | stats_pass2 | significance | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5 | 1 | 0 | mean_norm | 59.705 | 0.000 | 59.578 | 0.000 | -0.043 | 0.683 | 0.716 | [4, 56, 0, 3637] | [45, 15, 2, 3520] | Both Significant |
| 13 | 1 | 2 | ganon_norm | 30.477 | 0.000 | 30.438 | 0.000 | 0.060 | 0.500 | 0.498 | [11, 49, 1, 3636] | [41, 19, 1, 3521] | Both Significant |
| 18 | 1 | 3 | centrifuge_norm | 5.064 | 0.167 | 5.032 | 0.081 | 0.000 | 0.183 | 0.227 | [11, 49, 0, 3637] | [22, 38, 0, 3522] | Only Partial Significant |
| 43 | 1 | 7 | ganon_norm | 37.082 | 0.000 | 33.421 | 0.000 | -0.269 | 0.267 | 0.001 | [42, 18, 1, 3636] | [58, 2, 24, 3498] | Both Significant |
| 62 | 2 | 0 | kaiju_norm | 8.014 | 0.046 | 7.984 | 0.018 | 0.000 | 0.167 | 0.269 | [2, 58, 0, 3724] | [12, 48, 0, 3611] | Both Significant |
| 65 | 2 | 0 | mean_norm | 10.970 | 0.012 | 10.962 | 0.004 | 0.050 | -0.233 | -0.321 | [19, 41, 1, 3723] | [5, 55, 0, 3611] | Both Significant |
| 73 | 2 | 2 | ganon_norm | 30.473 | 0.000 | 30.438 | 0.000 | 0.060 | 0.500 | 0.498 | [11, 49, 1, 3723] | [41, 19, 1, 3610] | Both Significant |
| 78 | 2 | 3 | centrifuge_norm | 5.062 | 0.167 | 5.032 | 0.081 | 0.000 | 0.183 | 0.227 | [11, 49, 0, 3724] | [22, 38, 0, 3611] | Only Partial Significant |
| 96 | 2 | 6 | centrifuge_norm | 46.620 | 0.000 | 41.727 | 0.000 | 0.376 | -0.233 | 0.088 | [58, 2, 35, 3689] | [44, 16, 0, 3611] | Both Significant |
| 102 | 2 | 7 | centrifuge_norm | 46.620 | 0.000 | 41.727 | 0.000 | 0.376 | -0.233 | 0.088 | [58, 2, 35, 3689] | [44, 16, 0, 3611] | Both Significant |
| 103 | 2 | 7 | ganon_norm | 37.046 | 0.000 | 33.421 | 0.000 | -0.269 | 0.267 | 0.001 | [42, 18, 1, 3723] | [58, 2, 24, 3587] | Both Significant |
| 122 | 3 | 0 | kaiju_norm | 8.013 | 0.046 | 7.984 | 0.018 | 0.000 | 0.167 | 0.269 | [2, 58, 0, 3761] | [12, 48, 0, 3648] | Both Significant |
| 125 | 3 | 0 | mean_norm | 26.453 | 0.000 | 26.381 | 0.000 | 0.013 | 0.467 | 0.405 | [17, 43, 1, 3760] | [45, 15, 2, 3646] | Both Significant |
| 138 | 3 | 3 | centrifuge_norm | 5.061 | 0.167 | 5.032 | 0.081 | 0.000 | 0.183 | 0.227 | [11, 49, 0, 3761] | [22, 38, 0, 3648] | Only Partial Significant |
| 139 | 3 | 3 | ganon_norm | 36.467 | 0.000 | 36.431 | 0.000 | 0.061 | 0.550 | 0.533 | [11, 49, 1, 3760] | [44, 16, 1, 3647] | Both Significant |
| 182 | 4 | 0 | kaiju_norm | 19.754 | 0.000 | 19.750 | 0.000 | 0.081 | -0.367 | -0.368 | [34, 26, 3, 3845] | [12, 48, 0, 3733] | Both Significant |
| 185 | 4 | 0 | mean_norm | 44.783 | 0.000 | 44.789 | 0.000 | 0.004 | -0.600 | -0.516 | [49, 11, 4, 3844] | [13, 47, 1, 3732] | Both Significant |
| 198 | 4 | 3 | centrifuge_norm | 38.612 | 0.000 | 38.576 | 0.000 | 0.000 | 0.567 | 0.547 | [11, 49, 0, 3848] | [45, 15, 0, 3733] | Both Significant |
| 199 | 4 | 3 | ganon_norm | 36.467 | 0.000 | 36.431 | 0.000 | 0.061 | 0.550 | 0.533 | [11, 49, 1, 3847] | [44, 16, 1, 3732] | Both Significant |
| 245 | 5 | 0 | mean_norm | 17.691 | 0.001 | 17.077 | 0.000 | 0.154 | -0.183 | -0.016 | [58, 2, 14, 4048] | [47, 13, 2, 3940] | Both Significant |
| 259 | 5 | 3 | ganon_norm | 40.835 | 0.000 | 40.799 | 0.000 | 0.062 | 0.583 | 0.554 | [11, 49, 1, 4061] | [46, 14, 1, 3941] | Both Significant |
| 263 | 5 | 3 | mean_norm | 6.513 | 0.089 | 2.956 | 0.228 | -0.114 | 0.000 | -0.097 | [58, 2, 43, 4019] | [58, 2, 68, 3874] | Only Full Significant |
| 305 | 6 | 0 | mean_norm | 43.871 | 0.000 | 40.577 | 0.000 | 0.300 | -0.317 | -0.006 | [58, 2, 31, 4162] | [39, 21, 2, 4066] | Both Significant |
| 319 | 6 | 3 | ganon_norm | 40.836 | 0.000 | 40.799 | 0.000 | 0.062 | 0.583 | 0.554 | [11, 49, 1, 4192] | [46, 14, 1, 4067] | Both Significant |
| 354 | 6 | 15 | centrifuge_norm | 8.454 | 0.037 | 3.712 | 0.156 | -0.125 | 0.000 | -0.109 | [58, 2, 46, 4147] | [58, 2, 76, 3992] | Only Full Significant |
| 365 | 7 | 0 | mean_norm | 46.384 | 0.000 | 46.336 | 0.000 | 0.084 | 0.583 | 0.422 | [23, 37, 4, 4836] | [58, 2, 4, 4678] | Both Significant |
| 379 | 7 | 3 | ganon_norm | 40.843 | 0.000 | 40.799 | 0.000 | 0.062 | 0.583 | 0.554 | [11, 49, 1, 4839] | [46, 14, 1, 4681] | Both Significant |
| 402 | 7 | 7 | centrifuge_norm | 12.768 | 0.005 | 5.586 | 0.061 | -0.152 | 0.000 | -0.135 | [58, 2, 45, 4795] | [58, 2, 83, 4599] | Both Significant |
| 407 | 7 | 7 | mean_norm | 6.531 | 0.088 | 3.144 | 0.208 | 0.102 | 0.000 | 0.101 | [58, 2, 104, 4736] | [58, 2, 68, 4614] | Only Full Significant |
| 413 | 7 | 10 | mean_norm | 6.531 | 0.088 | 3.144 | 0.208 | 0.102 | 0.000 | 0.101 | [58, 2, 104, 4736] | [58, 2, 68, 4614] | Only Full Significant |
| 437 | 8 | 2 | mean_norm | 9.109 | 0.028 | 5.037 | 0.081 | 0.147 | 0.000 | 0.127 | [58, 2, 76, 7660] | [58, 2, 42, 7520] | Both Significant |
| 439 | 8 | 3 | ganon_norm | 40.820 | 0.000 | 40.799 | 0.000 | 0.062 | 0.583 | 0.554 | [11, 49, 1, 7735] | [46, 14, 1, 7561] | Both Significant |
| 443 | 8 | 3 | mean_norm | 9.109 | 0.028 | 5.037 | 0.081 | 0.147 | 0.000 | 0.127 | [58, 2, 76, 7660] | [58, 2, 42, 7520] | Both Significant |
| 449 | 8 | 4 | mean_norm | 8.979 | 0.030 | 3.999 | 0.135 | 0.113 | 0.000 | 0.114 | [58, 2, 111, 7625] | [58, 2, 69, 7493] | Only Full Significant |
| 482 | 9 | 0 | kaiju_norm | 997.648 | 0.000 | 888.595 | 0.000 | -0.437 | 0.933 | 0.085 | [1, 59, 1, 12540] | [57, 3, 854, 11515] | Both Significant |
| 485 | 9 | 0 | mean_norm | 24.758 | 0.000 | 24.539 | 0.000 | 0.101 | -0.383 | -0.228 | [51, 9, 8, 12533] | [28, 32, 1, 12368] | Both Significant |
| 499 | 9 | 3 | ganon_norm | 38.413 | 0.000 | 38.405 | 0.000 | 0.051 | 0.567 | 0.519 | [13, 47, 1, 12540] | [47, 13, 1, 12368] | Both Significant |
| 507 | 9 | 4 | kraken2_norm | 32.370 | 0.000 | 31.976 | 0.000 | -0.109 | 0.400 | 0.210 | [33, 27, 0, 12541] | [57, 3, 7, 12362] | Both Significant |
| 509 | 9 | 4 | mean_norm | 17.490 | 0.001 | 7.116 | 0.028 | 0.147 | 0.000 | 0.151 | [58, 2, 129, 12412] | [58, 2, 69, 12300] | Both Significant |
| 529 | 9 | 10 | ganon_norm | 10.471 | 0.015 | 5.917 | 0.052 | 0.164 | 0.000 | 0.136 | [58, 2, 70, 12471] | [58, 2, 36, 12333] | Both Significant |
| 535 | 9 | 15 | ganon_norm | 10.471 | 0.015 | 5.917 | 0.052 | 0.164 | 0.000 | 0.136 | [58, 2, 70, 12471] | [58, 2, 36, 12333] | Both Significant |
Only using the full table (for publication)¶
plt.rc('axes', linewidth=0.65)
df_pass_chi2_stats['are_diff'] = [True if i < alpha else False for i in df_pass_chi2_stats['p_value_full']]
print((df_pass_chi2_stats['are_diff'] == True).sum(), (df_pass_chi2_stats['are_diff'] == False).sum())
display(df_pass_chi2_stats)
display(df_pass_chi2_stats[df_pass_chi2_stats['are_diff'] == True])
# Create a grid of three axes for the plots
fig, axs = plt.subplots(1, 3, figsize=(7.5, 2.7), sharey=True)
# Define the metrics to plot
metrics = ['f1_diff', 'precision_diff', 'recall_diff']
titles = ['F1 Score', 'Precision', 'Recall']
# Iterate through the metrics and create a boxplot for each
for ax, metric, title in zip(axs, metrics, titles):
sns.boxplot(
data=df_pass_chi2_stats,
x="are_diff",
y=metric,
palette=sns.color_palette(['#4E79A7', '#A0CBE8']),
ax=ax
)
sns.despine(top=True, right=True, ax=ax)
ax.set_title(title, fontsize=14, pad=40)
ax.set_xlabel("Group", fontsize=12)
if metric == 'f1_diff': # Label y-axis only for the first plot
ax.set_ylabel("Difference", fontsize=12)
else:
ax.set_ylabel("")
# Highlight medians
medians = df_pass_chi2_stats.groupby("are_diff")[metric].median()
for i, median in enumerate(medians):
ax.text(i, median, f"{median:.2f}", ha='center', va='bottom', fontsize=10)
pairs=[(False, True)]
annotator = Annotator(ax, pairs, data=df_pass_chi2_stats, x='are_diff', y=metric)
annotator.configure(test='Mann-Whitney', text_format='simple', loc='outside', line_width=1)
annotator.apply_and_annotate()
# Adjust layout and save the plot
plt.tight_layout()
for format in ['png', 'tiff']:
plt.savefig(f'{RESULTS_DIR}/figures/paper/figC.{format}', dpi=DPI)
plt.show()
38 502
| mode | S | column | chi2_full | p_value_full | chi2_partial | p_value_partial | precision_diff | recall_diff | f1_diff | stats_pass0 | stats_pass2 | significance | are_diff | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | centrifuge_norm | 0.031 | 0.999 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | [3, 57, 0, 3637] | [3, 57, 0, 3522] | Neither Significant | False |
| 1 | 1 | 0 | ganon_norm | 1.356 | 0.716 | 1.349 | 0.509 | 0.167 | -0.033 | -0.056 | [5, 55, 1, 3636] | [3, 57, 0, 3522] | Neither Significant | False |
| 2 | 1 | 0 | kaiju_norm | 0.031 | 0.999 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | [6, 54, 0, 3637] | [6, 54, 0, 3522] | Neither Significant | False |
| 3 | 1 | 0 | kraken2_norm | 0.031 | 0.999 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | [13, 47, 0, 3637] | [13, 47, 0, 3522] | Neither Significant | False |
| 4 | 1 | 0 | krakenuniq_norm | 0.459 | 0.928 | 0.428 | 0.807 | 0.000 | -0.033 | -0.057 | [6, 54, 0, 3637] | [4, 56, 0, 3522] | Neither Significant | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 535 | 9 | 15 | ganon_norm | 10.471 | 0.015 | 5.917 | 0.052 | 0.164 | 0.000 | 0.136 | [58, 2, 70, 12471] | [58, 2, 36, 12333] | Both Significant | True |
| 536 | 9 | 15 | kaiju_norm | 0.180 | 0.981 | 0.127 | 0.939 | 0.024 | 0.000 | 0.020 | [57, 3, 55, 12486] | [57, 3, 50, 12319] | Neither Significant | False |
| 537 | 9 | 15 | kraken2_norm | 0.006 | 1.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | [57, 3, 7, 12534] | [57, 3, 7, 12362] | Neither Significant | False |
| 538 | 9 | 15 | krakenuniq_norm | 0.014 | 1.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | [58, 2, 87, 12454] | [58, 2, 87, 12282] | Neither Significant | False |
| 539 | 9 | 15 | mean_norm | 0.018 | 0.999 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | [58, 2, 129, 12412] | [58, 2, 129, 12240] | Neither Significant | False |
540 rows × 14 columns
| mode | S | column | chi2_full | p_value_full | chi2_partial | p_value_partial | precision_diff | recall_diff | f1_diff | stats_pass0 | stats_pass2 | significance | are_diff | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5 | 1 | 0 | mean_norm | 59.705 | 0.000 | 59.578 | 0.000 | -0.043 | 0.683 | 0.716 | [4, 56, 0, 3637] | [45, 15, 2, 3520] | Both Significant | True |
| 13 | 1 | 2 | ganon_norm | 30.477 | 0.000 | 30.438 | 0.000 | 0.060 | 0.500 | 0.498 | [11, 49, 1, 3636] | [41, 19, 1, 3521] | Both Significant | True |
| 43 | 1 | 7 | ganon_norm | 37.082 | 0.000 | 33.421 | 0.000 | -0.269 | 0.267 | 0.001 | [42, 18, 1, 3636] | [58, 2, 24, 3498] | Both Significant | True |
| 62 | 2 | 0 | kaiju_norm | 8.014 | 0.046 | 7.984 | 0.018 | 0.000 | 0.167 | 0.269 | [2, 58, 0, 3724] | [12, 48, 0, 3611] | Both Significant | True |
| 65 | 2 | 0 | mean_norm | 10.970 | 0.012 | 10.962 | 0.004 | 0.050 | -0.233 | -0.321 | [19, 41, 1, 3723] | [5, 55, 0, 3611] | Both Significant | True |
| 73 | 2 | 2 | ganon_norm | 30.473 | 0.000 | 30.438 | 0.000 | 0.060 | 0.500 | 0.498 | [11, 49, 1, 3723] | [41, 19, 1, 3610] | Both Significant | True |
| 96 | 2 | 6 | centrifuge_norm | 46.620 | 0.000 | 41.727 | 0.000 | 0.376 | -0.233 | 0.088 | [58, 2, 35, 3689] | [44, 16, 0, 3611] | Both Significant | True |
| 102 | 2 | 7 | centrifuge_norm | 46.620 | 0.000 | 41.727 | 0.000 | 0.376 | -0.233 | 0.088 | [58, 2, 35, 3689] | [44, 16, 0, 3611] | Both Significant | True |
| 103 | 2 | 7 | ganon_norm | 37.046 | 0.000 | 33.421 | 0.000 | -0.269 | 0.267 | 0.001 | [42, 18, 1, 3723] | [58, 2, 24, 3587] | Both Significant | True |
| 122 | 3 | 0 | kaiju_norm | 8.013 | 0.046 | 7.984 | 0.018 | 0.000 | 0.167 | 0.269 | [2, 58, 0, 3761] | [12, 48, 0, 3648] | Both Significant | True |
| 125 | 3 | 0 | mean_norm | 26.453 | 0.000 | 26.381 | 0.000 | 0.013 | 0.467 | 0.405 | [17, 43, 1, 3760] | [45, 15, 2, 3646] | Both Significant | True |
| 139 | 3 | 3 | ganon_norm | 36.467 | 0.000 | 36.431 | 0.000 | 0.061 | 0.550 | 0.533 | [11, 49, 1, 3760] | [44, 16, 1, 3647] | Both Significant | True |
| 182 | 4 | 0 | kaiju_norm | 19.754 | 0.000 | 19.750 | 0.000 | 0.081 | -0.367 | -0.368 | [34, 26, 3, 3845] | [12, 48, 0, 3733] | Both Significant | True |
| 185 | 4 | 0 | mean_norm | 44.783 | 0.000 | 44.789 | 0.000 | 0.004 | -0.600 | -0.516 | [49, 11, 4, 3844] | [13, 47, 1, 3732] | Both Significant | True |
| 198 | 4 | 3 | centrifuge_norm | 38.612 | 0.000 | 38.576 | 0.000 | 0.000 | 0.567 | 0.547 | [11, 49, 0, 3848] | [45, 15, 0, 3733] | Both Significant | True |
| 199 | 4 | 3 | ganon_norm | 36.467 | 0.000 | 36.431 | 0.000 | 0.061 | 0.550 | 0.533 | [11, 49, 1, 3847] | [44, 16, 1, 3732] | Both Significant | True |
| 245 | 5 | 0 | mean_norm | 17.691 | 0.001 | 17.077 | 0.000 | 0.154 | -0.183 | -0.016 | [58, 2, 14, 4048] | [47, 13, 2, 3940] | Both Significant | True |
| 259 | 5 | 3 | ganon_norm | 40.835 | 0.000 | 40.799 | 0.000 | 0.062 | 0.583 | 0.554 | [11, 49, 1, 4061] | [46, 14, 1, 3941] | Both Significant | True |
| 263 | 5 | 3 | mean_norm | 6.513 | 0.089 | 2.956 | 0.228 | -0.114 | 0.000 | -0.097 | [58, 2, 43, 4019] | [58, 2, 68, 3874] | Only Full Significant | True |
| 305 | 6 | 0 | mean_norm | 43.871 | 0.000 | 40.577 | 0.000 | 0.300 | -0.317 | -0.006 | [58, 2, 31, 4162] | [39, 21, 2, 4066] | Both Significant | True |
| 319 | 6 | 3 | ganon_norm | 40.836 | 0.000 | 40.799 | 0.000 | 0.062 | 0.583 | 0.554 | [11, 49, 1, 4192] | [46, 14, 1, 4067] | Both Significant | True |
| 354 | 6 | 15 | centrifuge_norm | 8.454 | 0.037 | 3.712 | 0.156 | -0.125 | 0.000 | -0.109 | [58, 2, 46, 4147] | [58, 2, 76, 3992] | Only Full Significant | True |
| 365 | 7 | 0 | mean_norm | 46.384 | 0.000 | 46.336 | 0.000 | 0.084 | 0.583 | 0.422 | [23, 37, 4, 4836] | [58, 2, 4, 4678] | Both Significant | True |
| 379 | 7 | 3 | ganon_norm | 40.843 | 0.000 | 40.799 | 0.000 | 0.062 | 0.583 | 0.554 | [11, 49, 1, 4839] | [46, 14, 1, 4681] | Both Significant | True |
| 402 | 7 | 7 | centrifuge_norm | 12.768 | 0.005 | 5.586 | 0.061 | -0.152 | 0.000 | -0.135 | [58, 2, 45, 4795] | [58, 2, 83, 4599] | Both Significant | True |
| 407 | 7 | 7 | mean_norm | 6.531 | 0.088 | 3.144 | 0.208 | 0.102 | 0.000 | 0.101 | [58, 2, 104, 4736] | [58, 2, 68, 4614] | Only Full Significant | True |
| 413 | 7 | 10 | mean_norm | 6.531 | 0.088 | 3.144 | 0.208 | 0.102 | 0.000 | 0.101 | [58, 2, 104, 4736] | [58, 2, 68, 4614] | Only Full Significant | True |
| 437 | 8 | 2 | mean_norm | 9.109 | 0.028 | 5.037 | 0.081 | 0.147 | 0.000 | 0.127 | [58, 2, 76, 7660] | [58, 2, 42, 7520] | Both Significant | True |
| 439 | 8 | 3 | ganon_norm | 40.820 | 0.000 | 40.799 | 0.000 | 0.062 | 0.583 | 0.554 | [11, 49, 1, 7735] | [46, 14, 1, 7561] | Both Significant | True |
| 443 | 8 | 3 | mean_norm | 9.109 | 0.028 | 5.037 | 0.081 | 0.147 | 0.000 | 0.127 | [58, 2, 76, 7660] | [58, 2, 42, 7520] | Both Significant | True |
| 449 | 8 | 4 | mean_norm | 8.979 | 0.030 | 3.999 | 0.135 | 0.113 | 0.000 | 0.114 | [58, 2, 111, 7625] | [58, 2, 69, 7493] | Only Full Significant | True |
| 482 | 9 | 0 | kaiju_norm | 997.648 | 0.000 | 888.595 | 0.000 | -0.437 | 0.933 | 0.085 | [1, 59, 1, 12540] | [57, 3, 854, 11515] | Both Significant | True |
| 485 | 9 | 0 | mean_norm | 24.758 | 0.000 | 24.539 | 0.000 | 0.101 | -0.383 | -0.228 | [51, 9, 8, 12533] | [28, 32, 1, 12368] | Both Significant | True |
| 499 | 9 | 3 | ganon_norm | 38.413 | 0.000 | 38.405 | 0.000 | 0.051 | 0.567 | 0.519 | [13, 47, 1, 12540] | [47, 13, 1, 12368] | Both Significant | True |
| 507 | 9 | 4 | kraken2_norm | 32.370 | 0.000 | 31.976 | 0.000 | -0.109 | 0.400 | 0.210 | [33, 27, 0, 12541] | [57, 3, 7, 12362] | Both Significant | True |
| 509 | 9 | 4 | mean_norm | 17.490 | 0.001 | 7.116 | 0.028 | 0.147 | 0.000 | 0.151 | [58, 2, 129, 12412] | [58, 2, 69, 12300] | Both Significant | True |
| 529 | 9 | 10 | ganon_norm | 10.471 | 0.015 | 5.917 | 0.052 | 0.164 | 0.000 | 0.136 | [58, 2, 70, 12471] | [58, 2, 36, 12333] | Both Significant | True |
| 535 | 9 | 15 | ganon_norm | 10.471 | 0.015 | 5.917 | 0.052 | 0.164 | 0.000 | 0.136 | [58, 2, 70, 12471] | [58, 2, 36, 12333] | Both Significant | True |
False vs. True: Mann-Whitney-Wilcoxon test two-sided, P_val:1.167e-07 U_stat=4.738e+03 False vs. True: Mann-Whitney-Wilcoxon test two-sided, P_val:1.681e-04 U_stat=6.271e+03 False vs. True: Mann-Whitney-Wilcoxon test two-sided, P_val:6.430e-06 U_stat=6.282e+03
/tmp/ipykernel_2918775/3028195495.py:24: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot( /tmp/ipykernel_2918775/3028195495.py:24: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot( /tmp/ipykernel_2918775/3028195495.py:24: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(
df_pass_chi2_stats[df_pass_chi2_stats['are_diff'] == True].groupby('S').count()
| mode | column | chi2_full | p_value_full | chi2_partial | p_value_partial | precision_diff | recall_diff | f1_diff | stats_pass0 | stats_pass2 | significance | are_diff | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| S | |||||||||||||
| 0 | 12 | 12 | 12 | 12 | 12 | 12 | 12 | 12 | 12 | 12 | 12 | 12 | 12 |
| 2 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 |
| 3 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 |
| 4 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 |
| 6 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| 7 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 |
| 10 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |
| 15 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |
df_pass_chi2_stats[(df_pass_chi2_stats['are_diff'] == True) & (df_pass_chi2_stats['S'] == 0)][['precision_diff', 'recall_diff', 'f1_diff']].median()
precision_diff 0.032 recall_diff -0.008 f1_diff 0.039 dtype: float64
df_pass_chi2_stats[(df_pass_chi2_stats['are_diff'] == True) & (df_pass_chi2_stats['S'] == 3)][['precision_diff', 'recall_diff', 'f1_diff']].median()
precision_diff 0.062 recall_diff 0.567 f1_diff 0.540 dtype: float64
df_pass_chi2_stats[df_pass_chi2_stats['are_diff'] == True].groupby('column').count()
| mode | S | chi2_full | p_value_full | chi2_partial | p_value_partial | precision_diff | recall_diff | f1_diff | stats_pass0 | stats_pass2 | significance | are_diff | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| column | |||||||||||||
| centrifuge_norm | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 |
| ganon_norm | 13 | 13 | 13 | 13 | 13 | 13 | 13 | 13 | 13 | 13 | 13 | 13 | 13 |
| kaiju_norm | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 |
| kraken2_norm | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| mean_norm | 15 | 15 | 15 | 15 | 15 | 15 | 15 | 15 | 15 | 15 | 15 | 15 | 15 |
df_pass_chi2_stats[(df_pass_chi2_stats['are_diff'] == True) & (df_pass_chi2_stats['column'] == 'ganon_norm')][['precision_diff', 'recall_diff', 'f1_diff']].median()
precision_diff 0.061 recall_diff 0.550 f1_diff 0.519 dtype: float64
df_pass_chi2_stats[(df_pass_chi2_stats['are_diff'] == True) & (df_pass_chi2_stats['column'] == 'mean_norm')][['precision_diff', 'recall_diff', 'f1_diff']].median()
precision_diff 0.102 recall_diff 0.000 f1_diff 0.101 dtype: float64
df_pass_chi2_stats[df_pass_chi2_stats['are_diff'] == True].groupby('mode').count()
| S | column | chi2_full | p_value_full | chi2_partial | p_value_partial | precision_diff | recall_diff | f1_diff | stats_pass0 | stats_pass2 | significance | are_diff | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| mode | |||||||||||||
| 1 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 |
| 2 | 6 | 6 | 6 | 6 | 6 | 6 | 6 | 6 | 6 | 6 | 6 | 6 | 6 |
| 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 |
| 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 |
| 5 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 |
| 6 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 |
| 7 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 |
| 8 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 |
| 9 | 7 | 7 | 7 | 7 | 7 | 7 | 7 | 7 | 7 | 7 | 7 | 7 | 7 |
As we see, using a second pass generally does not significantly change the assignment of TP/FP/FN | TN.
In the cases where it changes, the f1-score generally improves, either by increasing the number of TP (by reducing the amount of FN) or reduces the amount of false positives.
Therefore, looking at nominal info choosing pass2 is the best option.
Checking at the correlation between the read counts¶
Now we are going to check the proportion of reads that are correctly assigned to gold truth species. For that we are goin two show:
- A correlation plot with pass0 and pass2 reads. We are also going to calculate the correlation between pass0 / pass2 and the expected counts.
df_numerical_stats
| pass | mode | profiler | diff_counts | MRE_counts | MRED_counts | MAE_counts | MAED_counts | MACV_counts | corr_counts | R2_counts | taxid_counts | species_counts | expected_counts | observed_counts | diff_abundance | MRE_abundance | MRED_abundance | MAE_abundance | MAED_abundance | MACV_abundance | corr_abundance | R2_abundance | taxid_abundance | species_abundance | expected_abundance | observed_abundance | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | centrifuge | [-21.76736, -50.1056, -74.39946666666667, -58.... | -16.939 | 23.674 | 17.173 | 23.505 | 1.369 | 0.635 | 0.576 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [244477.0, 93552.0, 48001.0, 51790.0, 189991.0... | [-1.33929624238111, -37.07720182312471, -67.71... | 4.750 | 29.856 | 26.450 | 14.640 | 0.554 | 0.801 | 0.576 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [3.0831469924255903, 1.1798024658164117, 0.605... |
| 1 | 0 | 1 | ganon | [-23.91328, -55.785066666666665, -83.386666666... | -28.729 | 21.022 | 28.729 | 21.022 | 0.732 | 0.558 | 0.590 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [237771.0, 82903.0, 31150.0, 42979.0, 159924.0... | [7.790932959479549, -37.36135668956172, -76.46... | 0.969 | 29.782 | 25.411 | 15.563 | 0.612 | 0.790 | 0.590 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [3.368466654983736, 1.1744745620707178, 0.4412... |
| 2 | 0 | 1 | kaiju | [-53.25664, -79.21386666666666, -91.4437333333... | -49.543 | 28.481 | 49.543 | 28.481 | 0.575 | 0.389 | 0.313 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [146073.0, 38974.0, 16043.0, 14537.0, 54507.0,... | [-8.305719789553791, -59.224806809815064, -83.... | -1.021 | 55.870 | 47.167 | 29.963 | 0.635 | 0.764 | 0.313 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [2.865446256576444, 0.7645348723159675, 0.3147... |
| 3 | 0 | 1 | kraken2 | [-76.07584, -94.63306666666666, -99.4, -99.609... | -65.147 | 24.902 | 65.147 | 24.902 | 0.382 | 0.176 | 0.122 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [74763.0, 10063.0, 1125.0, 488.0, 11530.0, 288... | [-19.322126949109276, -81.90144330504141, -97.... | 17.532 | 83.974 | 75.284 | 41.126 | 0.546 | 0.594 | 0.122 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [2.521183532840335, 0.3393479380304735, 0.0379... |
| 4 | 0 | 1 | krakenuniq | [-19.62944, -48.239466666666665, -68.408533333... | -16.729 | 23.969 | 17.018 | 23.765 | 1.396 | 0.642 | 0.572 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [251158.0, 97051.0, 59234.0, 56908.0, 183826.0... | [-0.4454715144359227, -35.88453918746712, -60.... | 3.147 | 29.691 | 25.571 | 15.414 | 0.603 | 0.795 | 0.572 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [3.1110790151738774, 1.2021648902349915, 0.733... |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 103 | 2 | 9 | ganon | [3.88512, -35.4976, -56.00053333333334, -32.8,... | -1.882 | 23.489 | 18.223 | 14.940 | 0.820 | 0.751 | 0.659 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [324641.0, 120942.0, 82499.0, 84000.0, 215092.... | [5.0978227572311425, -34.74463135224731, -55.4... | -0.736 | 23.763 | 18.879 | 14.451 | 0.765 | 0.760 | 0.659 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [3.284306961163473, 1.223538162145363, 0.83462... |
| 104 | 2 | 9 | kaiju | [-48.45856, -76.704, -89.0208, -85.9616, -80.8... | -43.474 | 29.718 | 43.474 | 29.718 | 0.684 | 0.422 | 0.324 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [161067.0, 43680.0, 20586.0, 17548.0, 59848.0,... | [-9.872292921902698, -59.26355445072247, -80.8... | -1.156 | 51.967 | 43.407 | 28.597 | 0.659 | 0.738 | 0.324 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [2.8164908461905407, 0.7638083540489538, 0.359... |
| 105 | 2 | 9 | kraken2 | [-39.60768, -84.74186666666667, -95.9482666666... | -32.087 | 35.514 | 32.087 | 35.514 | 1.107 | 0.419 | 0.235 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [188726.0, 28609.0, 7597.0, 4884.0, 33042.0, 6... | [0.13138111998259205, -74.70178386954647, -93.... | 12.601 | 58.882 | 53.987 | 26.671 | 0.494 | 0.695 | 0.235 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [3.129105659999456, 0.4743415524460034, 0.1259... |
| 106 | 2 | 9 | krakenuniq | [-19.67168, -48.3264, -68.4816, -54.56, -41.30... | -16.778 | 23.985 | 17.064 | 23.782 | 1.394 | 0.641 | 0.572 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [251026.0, 96888.0, 59097.0, 56800.0, 183437.0... | [-0.37064433374560224, -35.91043018258363, -60... | 3.219 | 29.748 | 25.644 | 15.417 | 0.601 | 0.796 | 0.572 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [3.11341736457045, 1.2016794340765569, 0.73296... |
| 107 | 2 | 9 | mean | [-20.390584802138655, -49.665829444940506, -66... | -16.259 | 21.692 | 16.933 | 21.171 | 1.250 | 0.638 | 0.622 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [248779.4224933167, 94376.56979073654, 62310.1... | [-1.6718373579364112, -43.06956794651194, -63.... | 8.287 | 32.919 | 29.573 | 16.667 | 0.564 | 0.799 | 0.536 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [3.072755082564487, 1.0674456010029012, 0.6795... |
108 rows × 27 columns
# Filter for pass=0 and pass=2
df_pass_0 = df_numerical_stats[df_numerical_stats['pass'] == 0].set_index(['mode', 'profiler'])
df_pass_2 = df_numerical_stats[df_numerical_stats['pass'] == 2].set_index(['mode', 'profiler'])
# Align the two DataFrames to ensure consistent indexing
aligned_df = pd.concat([df_pass_0, df_pass_2], axis=1, keys=['pass0', 'pass2'])
# Extract relevant columns and observed counts for pass=0 and pass=2
comparison_df = aligned_df[['pass0', 'pass2']].apply(
lambda x: pd.Series({
'passn': 'comparison', # Mark as comparison for clarity
'mode': x.name[0],
'profiler': x.name[1],
'taxid_counts': x['pass0']['taxid_counts'], # Keep consistent metadata
'expected_counts': x['pass0']['expected_counts'],
'pass0_counts': x['pass0']['observed_counts'],
'pass2_counts': x['pass2']['observed_counts'],
'pass0_MAE': x['pass0']['MAE_counts'],
'pass0_diff': x['pass0']['diff_counts'],
'pass2_MAE': x['pass2']['MAE_counts'],
'pass2_diff': x['pass2']['diff_counts'],
}),
axis=1
)
# Reset index for the final DataFrame
comparison_df = comparison_df.reset_index(drop=True)
plt.rc('axes', linewidth=0.65)
# Filter the DataFrame for the specific mode (5 in this case)
mode_df = comparison_df[comparison_df['mode'] == 5]
# Get unique profilers for layout
profilers = mode_df['profiler'].unique()
num_profilers = len(profilers)
# Set up the figure with GridSpec for custom layout
fig, axs = plt.subplots(3, num_profilers, figsize=(2 * num_profilers, 6), sharex=True, sharey=True)
# Define plotting functions with regression line
def plot_pass0_vs_pass2(ax, data, profiler, i):
x = data['pass0_counts'].values[0]
y = data['pass2_counts'].values[0]
# Scatter plot
ax.scatter(x, y, alpha=0.7)
# Plot y=x line
min_val, max_val = min(x.min(), y.min()), max(x.max(), y.max())
ax.plot([0, 350000], [0, 350000], '--', color='#DC267F')
# Pearson correlation and R^2
slope, intercept, r_value, _, _ = linregress(x, y)
r2 = r_value ** 2
ax.annotate(f"Slope: {slope:.2f}\nR$^2$: {r2:.2f}", xy=(0.05, 0.95),
xycoords='axes fraction', fontsize=12, verticalalignment='top')
if i == 0:
ax.set_ylabel("Pass0 vs Pass2", fontsize=12)
# Set profiler title on top
ax.set_title(profiler, fontsize=14)
sns.despine(top=True, right=True, ax=ax)
def plot_with_regression(ax, x, y, xlabel, i):
# Scatter plot
ax.scatter(x, y, alpha=0.7)
# Regression line
slope, intercept, r_value, _, _ = linregress(x, y)
ax.plot(x, slope * x + intercept, '-', color='#DC267F')
# Pearson correlation and R^2
r2 = r_value ** 2
ax.annotate(f"Slope: {slope:.2f}\nR$^2$: {r2:.2f}", xy=(0.05, 0.95),
xycoords='axes fraction', fontsize=12, verticalalignment='top')
if i == 0:
ax.set_ylabel(xlabel, fontsize=12)
sns.despine(top=True, right=True, ax=ax)
# Iterate through profilers and create subplots for each
for i, profiler in enumerate(profilers):
profiler_data = mode_df[mode_df['profiler'] == profiler]
x_pass0 = profiler_data['pass0_counts'].values[0]
x_expected = profiler_data['expected_counts'].values[0]
y_pass2 = profiler_data['pass2_counts'].values[0]
# Pass0 vs Pass2 (Row 1)
ax = fig.add_subplot(axs[0, i])
plot_pass0_vs_pass2(ax, profiler_data, profiler, i)
# Pass2 vs Expected (Row 2)
ax = fig.add_subplot(axs[1, i])
plot_with_regression(ax, x_expected, y_pass2, "Pass2 vs Expected", i)
# Pass0 vs Expected (Row 3)
ax = fig.add_subplot(axs[2, i])
plot_with_regression(ax, x_expected, x_pass0, "Pass0 vs Expected", i)
ax.set_xticks([0, 300000])
ax.set_yticks([0, 300000])
# Aesthetic adjustments
for ax in axs[-1, :]:
ax.set_xlabel("Counts", fontsize=12) # Only bottom row has x-labels
plt.tight_layout()
for format in ['png', 'tiff']:
plt.savefig(f'{RESULTS_DIR}/figures/paper/figD.{format}', dpi=DPI)
plt.show()
We see that the correlation in number of counts for the ground truth species is almost one. Additionally, if there are differences, they improve the correlation to the expected amount of counts, so it has a positive impact.
Checking at MAE values¶
We are going to see if the MAE values are equal or higher in pass2 vs pass0.
import seaborn as sns
import matplotlib.pyplot as plt
comparison_df_exploded = comparison_df.melt(
id_vars=['mode', 'profiler'],
value_vars=['pass0_diff', 'pass2_diff'],
var_name='pass_type',
value_name='diff_value'
)
# Convert `diff_value` from a list to separate rows
comparison_df_exploded = comparison_df_exploded.explode('diff_value')
# Ensure `diff_value` is numeric after exploding
comparison_df_exploded['diff_value'] = pd.to_numeric(comparison_df_exploded['diff_value'])
# Get unique profilers
profilers = comparison_df_exploded['profiler'].unique()
num_profilers = len(profilers)
# Set up the figure
fig, axs = plt.subplots(1, num_profilers, figsize=(6 * num_profilers, 6), sharey=True)
# Iterate through profilers and plot the data
for i, profiler in enumerate(profilers):
ax = axs[i]
profiler_data = comparison_df_exploded[comparison_df_exploded['profiler'] == profiler]
# Create the plot
sns.boxplot(
data=profiler_data,
x='mode',
y='diff_value',
hue='pass_type',
ax=ax,
palette='viridis'
)
# Set plot title and labels
ax.set_title(f'Profiler: {profiler}', fontsize=14)
ax.set_xlabel('Mode', fontsize=12)
if i == 0:
ax.set_ylabel('Diff Values', fontsize=12)
else:
ax.set_ylabel('')
# Adjust layout and display
plt.tight_layout()
plt.show()
We see that, again, the differences in MAE are not due to the pass, with the exception of ganon (and mean by extension), where running the data with pass2 improves the error rate.
Thus, both based on nominal and numerical criteria, running with pass2 is the best option.
Which mode is best for the data?¶
We have used several modes to study how it affects the detection of species, and the number of reads it detects. We are going to use the same methods as before to check for the answer.
- We are going to check which mode (using S=2 and S=7) retains the best F1 scores.
- We are going to check which mode keeps the best correlation / MAE with expected counts.
Checking F1-scores¶
df_nominal_stats_sub = df_nominal_stats[(df_nominal_stats['pass'] == 2) & (df_nominal_stats['S'].isin([0, 2, 7, 15])) & (df_nominal_stats['column'].isin([f'{i}_norm' for i in LIST_PROFILERS + ['mean']]))]
df_nominal_stats_sub
| pass | mode | S | column | precision | recall | f1 | kappa | TP|FN|FP|TN | |
|---|---|---|---|---|---|---|---|---|---|
| 1260 | 2 | 1 | 0 | centrifuge_norm | 1.000 | 0.050 | 0.095 | 0.094 | (3, 57, 0, 3522) |
| 1261 | 2 | 1 | 0 | ganon_norm | 1.000 | 0.050 | 0.095 | 0.094 | (3, 57, 0, 3522) |
| 1262 | 2 | 1 | 0 | kaiju_norm | 1.000 | 0.100 | 0.182 | 0.179 | (6, 54, 0, 3522) |
| 1263 | 2 | 1 | 0 | kraken2_norm | 1.000 | 0.217 | 0.356 | 0.352 | (13, 47, 0, 3522) |
| 1264 | 2 | 1 | 0 | krakenuniq_norm | 1.000 | 0.067 | 0.125 | 0.123 | (4, 56, 0, 3522) |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2507 | 2 | 9 | 15 | ganon_norm | 0.617 | 0.967 | 0.753 | 0.752 | (58, 2, 36, 12333) |
| 2508 | 2 | 9 | 15 | kaiju_norm | 0.533 | 0.950 | 0.683 | 0.681 | (57, 3, 50, 12319) |
| 2509 | 2 | 9 | 15 | kraken2_norm | 0.891 | 0.950 | 0.919 | 0.919 | (57, 3, 7, 12362) |
| 2510 | 2 | 9 | 15 | krakenuniq_norm | 0.400 | 0.967 | 0.566 | 0.563 | (58, 2, 87, 12282) |
| 2516 | 2 | 9 | 15 | mean_norm | 0.310 | 0.967 | 0.470 | 0.466 | (58, 2, 129, 12240) |
216 rows × 9 columns
plt.rc('axes', linewidth=0.65)
melted_df = pd.melt(
df_nominal_stats_sub,
id_vars=['mode', 'S', 'column'],
value_vars=['recall', 'precision', 'f1'],
var_name='metric',
value_name='score'
)
# Create a colormap for 'mode'
norm = Normalize(vmin=melted_df['mode'].min(), vmax=melted_df['mode'].max())
cmap = plt.cm.viridis # Choose a colormap (e.g., 'viridis', 'plasma', 'cividis')
# Create a FacetGrid: 6x3 grid (row for each profiler, column for each metric)
g = sns.FacetGrid(
melted_df,
col='column',
row='metric',
height=2,
sharey=True,
sharex=True
)
# Map the lineplot to the grid
def lineplot_with_cmap(data, **kwargs):
for S in sorted(data['S'].unique()):
subset = data[data['S'] == S]
plt.plot(subset['mode'], subset['score'], label=f"Mode {mode}",
color=cmap(norm(S)), marker='o')
g.map_dataframe(lineplot_with_cmap)
# Create a legend for the discrete modes
handles = [
plt.Line2D([0], [0], color=cmap(norm(mode)), marker='o', linestyle='', label=f"S {mode}")
for mode in sorted(melted_df['S'].unique())
]
plt.legend(
handles=handles,
title="",
bbox_to_anchor=(1.05, 3),
loc='center left',
frameon=False
)
# Set x-axis ticks (if you have specific S values)
g.set(xticks=df_nominal_stats_sub['mode'].unique())
for ax in g.axes.ravel():
ax.set_title('')
# Add axis labels and titles
for ax, profiler in zip(g.axes[0, :], melted_df['column'].unique()):
ax.set_title(profiler.replace('_norm', ''))
for ax, score in zip(g.axes[:, 0], ['recall', 'precision', 'F1-score']):
ax.set_ylabel(score)
plt.tight_layout()
for format in ['png', 'tiff']:
plt.savefig(f'{RESULTS_DIR}/figures/paper/figF.{format}', dpi=DPI, bbox_inches='tight')
plt.show()
/tmp/ipykernel_2918775/1058285318.py:60: UserWarning: Tight layout not applied. tight_layout cannot make Axes height small enough to accommodate all Axes decorations. plt.tight_layout()
Based on the table of truth, the precision, recall and f1-score are more S-dependent than mode-dependent; and their patterns are more profiler dependent than anything else. Therefore, the mode is not completely relevant for a proper species detection and, if so, since higher mode values lead to a lower precision (more FP), we could consider using a lower mode if we want to minimize the FPs values.
Checking correlation and MAE values¶
df_numerical_stats_sub = df_numerical_stats[(df_numerical_stats['pass'] == 2) & (df_numerical_stats['mode'].isin([1, 3, 5, 7, 9]))]
# Get unique profilers for layout
profilers = df_numerical_stats_sub['profiler'].unique()
modes = df_numerical_stats_sub['mode'].unique()
# Set up the figure with GridSpec for custom layout
fig, axs = plt.subplots(len(profilers), len(modes), figsize=(2 * len(modes), 2 * len(profilers)), sharex=True, sharey=True)
# Define plotting functions with regression line
def plot_regression(ax, data, profiler, i):
x = data['expected_counts'].values[0]
y = data['observed_counts'].values[0]
# Scatter plot
ax.scatter(x, y, alpha=0.7)
# Plot y=x line
min_val, max_val = min(x.min(), y.min()), max(x.max(), y.max())
ax.plot([0, 350000], [0, 350000], 'r--')
# Pearson correlation and R^2
slope, intercept, r_value, _, _ = linregress(x, y)
r2 = r_value ** 2
ax.annotate(f"Slope: {slope:.2f}\nR2: {r2:.2f}", xy=(0.05, 0.95),
xycoords='axes fraction', fontsize=10, verticalalignment='top')
# Iterate through profilers and create subplots for each
for i, profiler in enumerate(profilers):
for j, mode in enumerate(modes):
profiler_data = df_numerical_stats_sub[(df_numerical_stats_sub['profiler'] == profiler) & (df_numerical_stats_sub['mode'] == mode)]
# Pass0 vs Pass2 (Row 1)
ax = fig.add_subplot(axs[i, j])
plot_regression(ax, profiler_data, profiler, i)
if j == 0:
ax.set_ylabel(profiler, fontsize=12)
# Set profiler title on top
if i == 0:
ax.set_title(f'Mode {mode}', fontsize=14)
# Aesthetic adjustments
for ax in axs[-1, :]:
ax.set_xlabel("Counts", fontsize=12) # Only bottom row has x-labels
plt.tight_layout()
plt.show()
# Prepare the data for plotting
expanded_data = []
for _, row in df_numerical_stats_sub.iterrows():
profiler = row['profiler']
mode = row['mode']
for value in row['diff_counts']:
expanded_data.append({'profiler': profiler, 'mode': mode, 'diff_counts': value})
# Convert to a new DataFrame
plot_data = pd.DataFrame(expanded_data)
# Initialize the grid of plots with seaborn
g = sns.FacetGrid(plot_data, col="profiler", height=4, aspect=1)
# Plot the data on each facet
g.map_dataframe(
sns.boxplot,
x="mode",
y="diff_counts",
palette="viridis"
)
# Adjust the legend and titles
g.set_axis_labels("Mode", "Difference Counts")
g.set_titles(col_template="{col_name} Profiler")
plt.tight_layout()
plt.show()
/home/neuroimm/.local/share/mamba/envs/EVs/lib/python3.13/site-packages/seaborn/axisgrid.py:854: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. func(*plot_args, **plot_kwargs) /home/neuroimm/.local/share/mamba/envs/EVs/lib/python3.13/site-packages/seaborn/axisgrid.py:854: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. func(*plot_args, **plot_kwargs) /home/neuroimm/.local/share/mamba/envs/EVs/lib/python3.13/site-packages/seaborn/axisgrid.py:854: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. func(*plot_args, **plot_kwargs) /home/neuroimm/.local/share/mamba/envs/EVs/lib/python3.13/site-packages/seaborn/axisgrid.py:854: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. func(*plot_args, **plot_kwargs) /home/neuroimm/.local/share/mamba/envs/EVs/lib/python3.13/site-packages/seaborn/axisgrid.py:854: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. func(*plot_args, **plot_kwargs) /home/neuroimm/.local/share/mamba/envs/EVs/lib/python3.13/site-packages/seaborn/axisgrid.py:854: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. func(*plot_args, **plot_kwargs)
from matplotlib.ticker import MaxNLocator
plt.rc('axes', linewidth=0.65)
# Initialize the grid of plots with seaborn
g = sns.FacetGrid(df_numerical_stats_sub, col="profiler", height=2.3, aspect=0.85, sharex=False, sharey=False)
# Plot the data on each facet
g.map_dataframe(
sns.scatterplot,
x="corr_counts",
y="R2_counts",
hue="mode",
palette="viridis"
)
# Adjust the legend and titles
g.add_legend(title="Mode", bbox_to_anchor=(1.05, 0.5))
g.set_axis_labels("Correlation Counts", "R² Counts")
g.set_titles(col_template="{col_name}")
for ax in g.axes.flat:
ax.xaxis.set_major_locator(MaxNLocator(nbins=2))
plt.tight_layout()
for format in ['png', 'tiff']:
plt.savefig(f'{RESULTS_DIR}/figures/paper/figGA.{format}', dpi=DPI, bbox_inches='tight')
plt.show()
g = sns.FacetGrid(df_numerical_stats_sub, col="profiler", height=2.3, aspect=0.85, sharex=False, sharey=False)
# Plot the data on each facet
g.map_dataframe(
sns.scatterplot,
x="MAE_counts",
y="MAED_counts",
hue="mode",
palette="viridis"
)
# Adjust the legend and titles
g.add_legend(title="Mode", bbox_to_anchor=(1.05, 0.5))
g.set_axis_labels("MAE", "MAED")
g.set_titles(col_template="{col_name}")
plt.tight_layout()
for format in ['png', 'tiff']:
plt.savefig(f'{RESULTS_DIR}/figures/paper/figGB.{format}', dpi=DPI, bbox_inches='tight')
plt.show()
g = sns.FacetGrid(df_numerical_stats_sub, col="profiler", height=2.3, aspect=0.85, sharex=False, sharey=False)
# Plot the data on each facet
g.map_dataframe(
sns.scatterplot,
x="MRE_counts",
y="MRED_counts",
hue="mode",
palette="viridis"
)
# Adjust the legend and titles
g.add_legend(title="Mode", bbox_to_anchor=(1.05, 0.5))
g.set_axis_labels("MRE", "MRED")
g.set_titles(col_template="{col_name}")
plt.tight_layout()
for format in ['png', 'tiff']:
plt.savefig(f'{RESULTS_DIR}/figures/paper/figGC.{format}', dpi=DPI, bbox_inches='tight')
plt.show()
# Create the DataFrame
df = df_numerical_stats[df_numerical_stats['pass'] == 2].copy()
# Calculate the differences from the mean for each mode
df_mean = df[df['profiler'] == 'mean'][['mode', 'MRED_counts']].rename(columns={'MRED_counts': 'mean_MRED'})
df = df.merge(df_mean, on='mode')
df['difference'] = df['MRED_counts'] - df['mean_MRED']
# Filter out the mean profiler itself
df_filtered = df[df['profiler'] != 'mean'].sort_values('profiler')
# Perform Mann-Whitney U tests
profiler_groups = df_filtered['profiler'].unique()
p_values = []
results = []
for profiler in profiler_groups:
group_data = df_filtered[df_filtered['profiler'] == profiler]
mean_data = group_data['MRED_counts'].values
prof_data = group_data['mean_MRED'].values
stat, p = wilcoxon(mean_data, prof_data, alternative='greater')
p_values.append(p)
results.append({'profiler': profiler, 'statistic': stat, 'p_value': p})
# Correct p-values for multiple testing
p_values_corrected = multipletests(p_values, method='fdr_bh')[1]
for i, result in enumerate(results):
result['p_value_corrected'] = p_values_corrected[i]
# Convert results to DataFrame
results_df = pd.DataFrame(results)
display(results_df)
# Plot the differences
fig, ax = plt.subplots(1, 1, figsize=(5, 3))
sns.boxplot(data=df_filtered, x='profiler', y='difference', palette=custom_palette, ax=ax)
# sns.stripplot(data=df_filtered, x='profiler', y='difference', color='black', alpha=0.5, jitter=True)
medians = df_filtered.groupby("profiler")['difference'].median()
for i, median in enumerate(medians):
ax.text(i, median , f"{median:.2f}", ha='center', va='bottom', fontsize=8)
plt.axhline(0, linestyle='--', color='#848484', linewidth=0.5)
plt.ylabel('MRED difference from mean')
plt.xlabel('')
sns.despine(top=True, right=True)
plt.tight_layout()
for format in ['png', 'tiff']:
plt.savefig(f'{RESULTS_DIR}/figures/paper/figJ.{format}', dpi=DPI, bbox_inches='tight')
plt.show()
| profiler | statistic | p_value | p_value_corrected | |
|---|---|---|---|---|
| 0 | centrifuge | 45.000 | 0.002 | 0.002 |
| 1 | ganon | 45.000 | 0.002 | 0.002 |
| 2 | kaiju | 45.000 | 0.002 | 0.002 |
| 3 | kraken2 | 45.000 | 0.002 | 0.002 |
| 4 | krakenuniq | 45.000 | 0.002 | 0.002 |
/tmp/ipykernel_2918775/2421930555.py:45: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(data=df_filtered, x='profiler', y='difference', palette=custom_palette, ax=ax) /tmp/ipykernel_2918775/2421930555.py:45: UserWarning: The palette list has more values (6) than needed (5), which may not be intended. sns.boxplot(data=df_filtered, x='profiler', y='difference', palette=custom_palette, ax=ax)
# Create the DataFrame
df = df_numerical_stats[df_numerical_stats['pass'] == 2].copy()
# Calculate the differences from the mean for each mode
df_mean = df[df['profiler'] == 'mean'][['mode', 'MAED_counts']].rename(columns={'MAED_counts': 'mean_MAED'})
df = df.merge(df_mean, on='mode')
df['difference'] = df['MAED_counts'] - df['mean_MAED']
# Filter out the mean profiler itself
df_filtered = df[df['profiler'] != 'mean'].sort_values('profiler')
# Perform Mann-Whitney U tests
profiler_groups = df_filtered['profiler'].unique()
p_values = []
results = []
for profiler in profiler_groups:
group_data = df_filtered[df_filtered['profiler'] == profiler]
mean_data = group_data['MAED_counts'].values
prof_data = group_data['mean_MAED'].values
stat, p = wilcoxon(mean_data, prof_data, alternative='greater')
p_values.append(p)
results.append({'profiler': profiler, 'statistic': stat, 'p_value': p})
# Correct p-values for multiple testing
p_values_corrected = multipletests(p_values, method='fdr_bh')[1]
for i, result in enumerate(results):
result['p_value_corrected'] = p_values_corrected[i]
# Convert results to DataFrame
results_df = pd.DataFrame(results)
display(results_df)
# Plot the differences
fig, ax = plt.subplots(1, 1, figsize=(5, 3))
sns.boxplot(data=df_filtered, x='profiler', y='difference', palette=custom_palette, ax=ax)
# sns.stripplot(data=df_filtered, x='profiler', y='difference', color='black', alpha=0.5, jitter=True)
medians = df_filtered.groupby("profiler")['difference'].median()
for i, median in enumerate(medians):
ax.text(i, median , f"{median:.2f}", ha='center', va='bottom', fontsize=8)
plt.axhline(0, linestyle='--', color='#848484', linewidth=0.5)
plt.ylabel('MAED difference from mean')
plt.xlabel('')
sns.despine(top=True, right=True)
plt.tight_layout()
for format in ['png', 'tiff']:
plt.savefig(f'{RESULTS_DIR}/figures/paper/figJ.{format}', dpi=DPI, bbox_inches='tight')
plt.show()
| profiler | statistic | p_value | p_value_corrected | |
|---|---|---|---|---|
| 0 | centrifuge | 45.000 | 0.002 | 0.002 |
| 1 | ganon | 7.000 | 0.973 | 0.973 |
| 2 | kaiju | 45.000 | 0.002 | 0.002 |
| 3 | kraken2 | 45.000 | 0.002 | 0.002 |
| 4 | krakenuniq | 45.000 | 0.002 | 0.002 |
/tmp/ipykernel_2918775/683032758.py:45: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(data=df_filtered, x='profiler', y='difference', palette=custom_palette, ax=ax) /tmp/ipykernel_2918775/683032758.py:45: UserWarning: The palette list has more values (6) than needed (5), which may not be intended. sns.boxplot(data=df_filtered, x='profiler', y='difference', palette=custom_palette, ax=ax)
df_numerical_stats_sub[df_numerical_stats_sub['mode'].isin([3, 7])]
| pass | mode | profiler | diff_counts | MRE_counts | MRED_counts | MAE_counts | MAED_counts | MACV_counts | corr_counts | R2_counts | taxid_counts | species_counts | expected_counts | observed_counts | diff_abundance | MRE_abundance | MRED_abundance | MAE_abundance | MAED_abundance | MACV_abundance | corr_abundance | R2_abundance | taxid_abundance | species_abundance | expected_abundance | observed_abundance | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 66 | 2 | 3 | centrifuge | [-21.29664, -49.24053333333333, -72.7157333333... | -16.551 | 23.325 | 16.808 | 23.141 | 1.377 | 0.642 | 0.586 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [245948.0, 95174.0, 51158.0, 53926.0, 192262.0... | [-1.6074186089308142, -36.542036383614835, -65... | 4.325 | 29.161 | 25.668 | 14.498 | 0.565 | 0.803 | 0.586 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [3.074768168470912, 1.1898368178072218, 0.6395... |
| 67 | 2 | 3 | ganon | [-6.65408, -46.7088, -73.0704, -50.604, -40.65... | -8.143 | 26.077 | 19.274 | 19.361 | 1.005 | 0.683 | 0.580 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [291706.0, 99921.0, 50493.0, 61745.0, 185450.0... | [3.2451929989731525, -41.05730353135007, -70.2... | 1.598 | 28.843 | 24.447 | 15.387 | 0.629 | 0.755 | 0.580 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [3.226412281217911, 1.105175558787186, 0.55847... |
| 68 | 2 | 3 | kaiju | [-52.1184, -78.6064, -90.92586666666666, -87.8... | -48.224 | 28.792 | 48.224 | 28.792 | 0.597 | 0.398 | 0.316 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [149630.0, 40113.0, 17014.0, 15201.0, 55766.0,... | [-6.421758363959896, -58.18904401137832, -82.2... | 1.190 | 56.270 | 47.277 | 30.539 | 0.646 | 0.779 | 0.316 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [2.9243200511262533, 0.7839554247866565, 0.332... |
| 69 | 2 | 3 | kraken2 | [-61.60288, -91.8592, -98.78133333333334, -99.... | -50.081 | 32.137 | 50.081 | 32.137 | 0.642 | 0.278 | 0.161 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [119991.0, 15264.0, 2285.0, 1020.0, 17562.0, 4... | [-11.271327674607562, -81.18811057530996, -97.... | 15.354 | 74.263 | 67.854 | 33.862 | 0.499 | 0.642 | 0.161 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [2.7727710101685137, 0.3527229267129384, 0.052... |
| 70 | 2 | 3 | krakenuniq | [-19.67168, -48.3264, -68.4816, -54.56, -41.30... | -16.778 | 23.985 | 17.064 | 23.782 | 1.394 | 0.641 | 0.572 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [251026.0, 96888.0, 59097.0, 56800.0, 183437.0... | [-0.37064433374560224, -35.91043018258363, -60... | 3.219 | 29.748 | 25.644 | 15.417 | 0.601 | 0.796 | 0.572 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [3.11341736457045, 1.2016794340765569, 0.73296... |
| 71 | 2 | 3 | mean | [-25.802934689620443, -53.521229040380774, -73... | -20.388 | 21.741 | 20.388 | 21.741 | 1.066 | 0.601 | 0.601 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [231865.8290949361, 87147.69554928604, 49730.5... | [-3.0213808182247845, -44.51661135451871, -68.... | 12.824 | 38.048 | 34.447 | 20.629 | 0.599 | 0.804 | 0.485 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [3.0305818494304755, 1.040313537102774, 0.5921... |
| 90 | 2 | 7 | centrifuge | [-20.50976, -47.4128, -69.89973333333333, -54.... | -15.904 | 22.759 | 16.191 | 22.556 | 1.393 | 0.653 | 0.603 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [248407.0, 98601.0, 56438.0, 56618.0, 196085.0... | [-2.4354608041876133, -35.45564920173818, -63.... | 3.218 | 27.934 | 24.137 | 14.425 | 0.598 | 0.801 | 0.603 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [3.048891849869137, 1.2102065774674091, 0.6927... |
| 91 | 2 | 7 | ganon | [2.87936, -39.4528, -60.080533333333335, -36.5... | -2.179 | 24.830 | 19.200 | 15.894 | 0.828 | 0.734 | 0.629 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [321498.0, 113526.0, 74849.0, 79318.0, 200909.... | [5.609860704572441, -37.84582876437132, -59.02... | 0.418 | 25.489 | 20.717 | 14.855 | 0.717 | 0.754 | 0.629 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [3.3003081470178888, 1.1653907106680377, 0.768... |
| 92 | 2 | 7 | kaiju | [-49.07328, -77.00266666666667, -89.2613333333... | -44.465 | 29.503 | 44.465 | 29.503 | 0.664 | 0.419 | 0.323 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [159146.0, 43120.0, 20135.0, 17133.0, 58980.0,... | [-9.114576548016046, -58.958236888878254, -80.... | -0.890 | 52.653 | 43.948 | 29.012 | 0.660 | 0.747 | 0.323 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [2.8401694828744986, 0.7695330583335327, 0.359... |
| 93 | 2 | 7 | kraken2 | [-44.83456, -87.64213333333333, -97.136, -97.6... | -36.009 | 35.648 | 36.009 | 35.648 | 0.990 | 0.386 | 0.212 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [172392.0, 23171.0, 5370.0, 2881.0, 26475.0, 6... | [-2.383528721026508, -78.1324804708156, -94.93... | 13.233 | 63.080 | 58.103 | 27.897 | 0.480 | 0.682 | 0.212 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [3.0505147274679216, 0.4100159911722076, 0.095... |
| 94 | 2 | 7 | krakenuniq | [-19.67168, -48.3264, -68.4816, -54.56, -41.30... | -16.778 | 23.985 | 17.064 | 23.782 | 1.394 | 0.641 | 0.572 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [251026.0, 96888.0, 59097.0, 56800.0, 183437.0... | [-0.37064433374560224, -35.91043018258363, -60... | 3.219 | 29.748 | 25.644 | 15.417 | 0.601 | 0.796 | 0.572 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [3.11341736457045, 1.2016794340765569, 0.73296... |
| 95 | 2 | 7 | mean | [-21.385476139592473, -51.124221746876785, -68... | -16.856 | 21.928 | 17.401 | 21.498 | 1.235 | 0.629 | 0.614 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [245670.38706377352, 91642.08422460603, 58892.... | [-1.5125795343509907, -43.63677614078014, -64.... | 9.779 | 34.128 | 30.912 | 17.460 | 0.565 | 0.803 | 0.524 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [3.0777318895515315, 1.0568104473603723, 0.659... |
def plot_prof_couns(df_combo):
# Assuming your data is in a DataFrame called `df`
# Expand the diff_counts column into individual rows for plotting
df_expanded = df_combo.explode('diff_counts')
df_expanded['diff_counts'] = pd.to_numeric(df_expanded['diff_counts'])
# Create the plot
plt.figure(figsize=(10, 2.5))
for profiler, countsdiff in zip(df_combo['profiler'], df_combo['diff_counts']):
me = np.median(countsdiff)
med = np.std(countsdiff)
plt.scatter(me, profiler, color='#DC267F', label='_nolegend_', s=100, zorder=3, marker = '|')
plt.plot([me-med, me+med], [profiler, profiler], color='#DC267F', label='_nolegend_',)
sns.stripplot(
data=df_expanded,
x='diff_counts',
y='profiler',
jitter=True, # Adds jitter for better visibility of points
size=5, # Adjust point size
alpha=0.7, # Slight transparency
color="#648FFF"
)
# Add a vertical line at x=0
plt.axvline(0, color='#848484', linestyle='--', linewidth=1)
sns.despine(top=True, right=True)
# Add labels and title
# plt.title('Diff Counts Across Profilers', fontsize=14)
plt.xlabel('PRE', fontsize=12)
plt.ylabel('', fontsize=12)
plt.grid(True, axis='x', linestyle='--', alpha=0.6)
plt.tight_layout()
for format in ['png', 'tiff']:
plt.savefig(f'{RESULTS_DIR}/figures/paper/figH{mode}.{format}', dpi=DPI, bbox_inches='tight')
plt.show()
for mode in [3,5,7]:
print(mode)
df_combo = df_numerical_stats[(df_numerical_stats['pass'] == 2) & \
(df_numerical_stats['mode'] == mode)] # We choose a large number because S in not relevant here (but with small S we may select few datasets)
plot_prof_couns(df_combo)
3
5
7
Based on these results, we see that a higher mode increases the correlation (slope) and R2; and decreases the MAE (although it increases the MAED). However, again, we see that this effect is very profiler-dependent. It is interesting to note, however, that there are some species that have a very bad detection rate throughout the profiler/mode/S values.
Based on that, considering that higher modes (1) lower the MAE / increase the correlation but (2) decrease the precision; but considering that the effect on (1) is more pronounced that in (2), we are going to choose a high mode value, but not extreme. For instance, mode = 7
Which species have the worst assignment?¶
species_to_kingdom = {
# Bacteria
"Cutibacterium acnes": "Bacteria",
"Lactobacillus acidophilus": "Bacteria",
"Bifidobacterium bifidum": "Bacteria",
"Akkermansia muciniphila": "Bacteria",
"Blautia coccoides": "Bacteria",
"Blautia luti": "Bacteria",
"Bacteroides ovatus": "Bacteria",
"Bacteroides intestinalis": "Bacteria",
"Bacteroides fragilis": "Bacteria",
"Escherichia coli": "Bacteria",
"Dietzia lutea": "Bacteria",
"Ruthenibacterium lactatiformans": "Bacteria",
"Faecalibacterium prausnitzii": "Bacteria",
"Parabacteroides distasonis": "Bacteria",
"Parabacteroides merdae": "Bacteria",
"Fusicatenibacter saccharivorans": "Bacteria",
"Erysipelatoclostridium ramosum": "Bacteria",
"Streptococcus salivarius": "Bacteria",
"Hungatella hathewayi": "Bacteria",
"Eisenbergiella porci": "Bacteria",
"Butyricimonas faecalis": "Bacteria",
"Alistipes indistinctus": "Bacteria",
"Alistipes finegoldii": "Bacteria",
"Eubacterium callanderi": "Bacteria",
"Acidaminococcus intestini": "Bacteria",
# Fungi
"Aspergillus chevalieri": "Fungi",
"Aspergillus flavus": "Fungi",
"Saccharomyces cerevisiae": "Fungi",
"Saccharomyces kudriavzevii": "Fungi",
"Saccharomyces mikatae": "Fungi",
"Candida albicans": "Fungi",
"Candida dubliniensis": "Fungi",
"Candida orthopsilosis": "Fungi",
"Malassezia restricta": "Fungi",
"Alternaria dauci": "Fungi",
"Kazachstania africana": "Fungi",
"Penicillium digitatum": "Fungi",
"Pichia kudriavzevii": "Fungi",
"Trichoderma asperellum": "Fungi",
"Akanthomyces muscarius": "Fungi",
"Fusarium falciforme": "Fungi",
"Eremothecium sinecaudum": "Fungi",
"Cryptococcus decagattii": "Fungi",
"Kwoniella shandongensis": "Fungi",
"Puccinia triticina": "Fungi",
# Virus
"Tobacco mosaic virus": "Virus",
"Rotavirus A": "Virus",
"Rotavirus B": "Virus",
"Rotavirus C": "Virus",
"Bacteriophage P2": "Virus",
"Escherichia phage T4": "Virus",
"Human immunodeficiency virus 1": "Virus",
"Human adenovirus 7": "Virus",
"Hepatitis C virus": "Virus",
"Bovine alphaherpesvirus 2": "Virus",
"Human herpesvirus 4 type 2": "Virus",
"Mimivirus terra2": "Virus",
"Dengue virus": "Virus",
"Norovirus GI": "Virus",
"Zaire ebolavirus": "Virus"
}
mode = 5
df_combo = df_numerical_stats[(df_numerical_stats['pass'] == 2) & \
(df_numerical_stats['mode'] == mode)].set_index('profiler')
df_combo
| pass | mode | diff_counts | MRE_counts | MRED_counts | MAE_counts | MAED_counts | MACV_counts | corr_counts | R2_counts | taxid_counts | species_counts | expected_counts | observed_counts | diff_abundance | MRE_abundance | MRED_abundance | MAE_abundance | MAED_abundance | MACV_abundance | corr_abundance | R2_abundance | taxid_abundance | species_abundance | expected_abundance | observed_abundance | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| profiler | ||||||||||||||||||||||||||
| centrifuge | 2 | 5 | [-20.92736, -48.3728, -71.28853333333333, -55.... | -16.211 | 23.038 | 16.486 | 22.842 | 1.386 | 0.648 | 0.595 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [247102.0, 96801.0, 53834.0, 55355.0, 194199.0... | [-2.0102896294236245, -36.0216836665145, -64.4... | 3.835 | 28.550 | 24.922 | 14.446 | 0.580 | 0.803 | 0.595 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [3.0621784490805117, 1.1995934312528531, 0.667... |
| ganon | 2 | 5 | [-0.50112, -43.06133333333333, -65.74453333333... | -3.843 | 25.604 | 19.386 | 17.162 | 0.885 | 0.717 | 0.606 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [310934.0, 106760.0, 64229.0, 72144.0, 194362.... | [4.519396024457606, -40.188321210872765, -64.0... | 1.009 | 26.896 | 22.260 | 15.131 | 0.680 | 0.753 | 0.606 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [3.2662311257643, 1.1214689772961357, 0.674698... |
| kaiju | 2 | 5 | [-49.99712, -77.44426666666666, -89.7125333333... | -45.742 | 29.240 | 45.742 | 29.240 | 0.639 | 0.413 | 0.322 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [156259.0, 42292.0, 19289.0, 16479.0, 57971.0,... | [-8.0675120659037, -58.53029500478051, -81.086... | -0.245 | 53.759 | 44.984 | 29.436 | 0.654 | 0.759 | 0.322 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [2.8728902479405094, 0.7775569686603653, 0.354... |
| kraken2 | 2 | 5 | [-51.72896, -89.8096, -98.04693333333333, -98.... | -41.501 | 34.745 | 41.501 | 34.745 | 0.837 | 0.341 | 0.189 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [150847.0, 19107.0, 3662.0, 1740.0, 21850.0, 5... | [-5.859592421967733, -80.12621212670827, -96.1... | 14.088 | 67.761 | 62.520 | 29.686 | 0.475 | 0.665 | 0.189 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [2.9418877368135083, 0.3726335226242199, 0.071... |
| krakenuniq | 2 | 5 | [-19.67168, -48.3264, -68.4816, -54.56, -41.30... | -16.778 | 23.985 | 17.064 | 23.782 | 1.394 | 0.641 | 0.572 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [251026.0, 96888.0, 59097.0, 56800.0, 183437.0... | [-0.37064433374560224, -35.91043018258363, -60... | 3.219 | 29.748 | 25.644 | 15.417 | 0.601 | 0.796 | 0.572 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [3.11341736457045, 1.2016794340765569, 0.73296... |
| mean | 2 | 5 | [-23.187162957260917, -52.379110996145705, -70... | -18.085 | 21.968 | 18.371 | 21.729 | 1.183 | 0.618 | 0.607 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [312500, 187500, 187500, 125000, 312500, 12500... | [240040.11575855964, 89289.1668822268, 54476.7... | [-2.1606238158283873, -44.255403818539435, -66... | 11.009 | 35.605 | 32.294 | 18.601 | 0.576 | 0.803 | 0.509 | [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... | [Bacteroides fragilis, Parabacteroides distaso... | [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... | [3.057480505755363, 1.0452111784023856, 0.6271... |
# RUN FOR MEAN IN MODE 1 3 5 7 9
# RUN FOR MODE 5 AND THE REST OF PROFILERS
prof = 'mean'
df_diffab = pd.DataFrame({'species': df_combo.loc[prof, 'species_counts'],
'expected_counts': df_combo.loc[prof, 'expected_counts'], \
'observed_counts': df_combo.loc[prof, 'observed_counts'], \
'diff_abundance': df_combo.loc[prof, 'diff_counts']})
df_diffab['kingdom'] = [species_to_kingdom[i] for i in df_diffab['species']]
df_diffab['isfamily'] = False
family_species = ['Blautia coccoides', 'Blautia luti', 'Bacteroides ovatus', 'Bacteroides intestinalis','Bacteroides fragilis',
'Parabacteroides distasonis', 'Parabacteroides merdae', 'Alistipes indistinctus', 'Alistipes finegoldii', 'Aspergillus chevalieri',
'Aspergillus flavus', 'Saccharomyces cerevisiae', 'Saccharomyces kudriavzevii', 'Saccharomyces mikatae', 'Candida albicans', 'Candida dubliniensis',
'Candida orthopsilosis', 'Rotavirus A', 'Rotavirus B', 'Rotavirus C']
df_diffab.loc[df_diffab['species'].isin(family_species), 'isfamily'] = True
df_diffab.sort_values(by=['expected_counts', 'diff_abundance'], ascending=False)
| species | expected_counts | observed_counts | diff_abundance | kingdom | isfamily | |
|---|---|---|---|---|---|---|
| 8 | Cutibacterium acnes | 312500 | 254046.803 | -18.705 | Bacteria | False |
| 0 | Bacteroides fragilis | 312500 | 240040.116 | -23.187 | Bacteria | True |
| 6 | Lactobacillus acidophilus | 312500 | 239520.544 | -23.353 | Bacteria | False |
| 40 | Akkermansia muciniphila | 312500 | 196080.997 | -37.254 | Bacteria | False |
| 7 | Bifidobacterium bifidum | 312500 | 184723.065 | -40.889 | Bacteria | False |
| 4 | Blautia coccoides | 312500 | 173325.771 | -44.536 | Bacteria | True |
| 42 | Bacteroides intestinalis | 312500 | 163291.164 | -47.747 | Bacteria | True |
| 18 | Bacteroides ovatus | 312500 | 111631.939 | -64.278 | Bacteria | True |
| 29 | Blautia luti | 312500 | 53234.639 | -82.965 | Bacteria | True |
| 21 | Rotavirus C | 250000 | 254565.176 | 1.826 | Virus | True |
| 20 | Rotavirus B | 250000 | 253634.055 | 1.454 | Virus | True |
| 19 | Rotavirus A | 250000 | 251608.261 | 0.643 | Virus | True |
| 17 | Tobacco mosaic virus | 250000 | 243272.405 | -2.691 | Virus | False |
| 44 | Dietzia lutea | 250000 | 232472.093 | -7.011 | Bacteria | False |
| 15 | Human immunodeficiency virus 1 | 200000 | 201026.902 | 0.513 | Virus | False |
| 10 | Saccharomyces cerevisiae | 200000 | 195902.958 | -2.049 | Fungi | True |
| 32 | Saccharomyces kudriavzevii | 200000 | 190980.300 | -4.510 | Fungi | True |
| 33 | Saccharomyces mikatae | 200000 | 189682.529 | -5.159 | Fungi | True |
| 41 | Candida orthopsilosis | 200000 | 189190.931 | -5.405 | Fungi | True |
| 35 | Aspergillus chevalieri | 200000 | 187004.613 | -6.498 | Fungi | True |
| 12 | Candida albicans | 200000 | 184824.390 | -7.588 | Fungi | True |
| 23 | Candida dubliniensis | 200000 | 184501.730 | -7.749 | Fungi | True |
| 13 | Escherichia phage T4 | 200000 | 164474.441 | -17.763 | Virus | False |
| 11 | Aspergillus flavus | 200000 | 148008.703 | -25.996 | Fungi | True |
| 14 | Bacteriophage P2 | 200000 | 45220.596 | -77.390 | Virus | False |
| 47 | Ruthenibacterium lactatiformans | 187500 | 135305.249 | -27.837 | Bacteria | False |
| 25 | Parabacteroides merdae | 187500 | 125862.826 | -32.873 | Bacteria | True |
| 1 | Parabacteroides distasonis | 187500 | 89289.167 | -52.379 | Bacteria | True |
| 2 | Faecalibacterium prausnitzii | 187500 | 54476.741 | -70.946 | Bacteria | False |
| 55 | Hepatitis C virus | 150000 | 151377.085 | 0.918 | Virus | False |
| 28 | Malassezia restricta | 150000 | 149869.196 | -0.087 | Fungi | False |
| 53 | Bovine alphaherpesvirus 2 | 150000 | 148849.267 | -0.767 | Virus | False |
| 31 | Human adenovirus 7 | 150000 | 147511.825 | -1.659 | Virus | False |
| 26 | Alternaria dauci | 150000 | 141107.893 | -5.928 | Fungi | False |
| 54 | Human herpesvirus 4 type 2 | 150000 | 139697.441 | -6.868 | Virus | False |
| 5 | Erysipelatoclostridium ramosum | 125000 | 67123.702 | -46.301 | Bacteria | False |
| 3 | Streptococcus salivarius | 125000 | 58117.095 | -53.506 | Bacteria | False |
| 57 | Dengue virus | 100000 | 101890.120 | 1.890 | Virus | False |
| 43 | Kazachstania africana | 100000 | 97363.929 | -2.636 | Fungi | False |
| 22 | Penicillium digitatum | 100000 | 95742.552 | -4.257 | Fungi | False |
| 46 | Mimivirus terra2 | 100000 | 84794.462 | -15.206 | Virus | False |
| 52 | Eisenbergiella porci | 62500 | 57135.945 | -8.582 | Bacteria | False |
| 36 | Acidaminococcus intestini | 62500 | 57048.401 | -8.723 | Bacteria | False |
| 45 | Alistipes indistinctus | 62500 | 55391.499 | -11.374 | Bacteria | True |
| 39 | Alistipes finegoldii | 62500 | 51804.368 | -17.113 | Bacteria | True |
| 50 | Butyricimonas faecalis | 62500 | 49405.437 | -20.951 | Bacteria | False |
| 34 | Hungatella hathewayi | 62500 | 40802.328 | -34.716 | Bacteria | False |
| 27 | Eubacterium callanderi | 62500 | 30952.294 | -50.476 | Bacteria | False |
| 16 | Norovirus GI | 50000 | 50524.777 | 1.050 | Virus | False |
| 56 | Zaire ebolavirus | 50000 | 49744.206 | -0.512 | Virus | False |
| 24 | Eremothecium sinecaudum | 50000 | 49425.836 | -1.148 | Fungi | False |
| 38 | Puccinia triticina | 50000 | 49269.888 | -1.460 | Fungi | False |
| 9 | Pichia kudriavzevii | 50000 | 48697.541 | -2.605 | Fungi | False |
| 48 | Kwoniella shandongensis | 50000 | 48329.668 | -3.341 | Fungi | False |
| 51 | Akanthomyces muscarius | 50000 | 47989.514 | -4.021 | Fungi | False |
| 49 | Cryptococcus decagattii | 50000 | 47962.656 | -4.075 | Fungi | False |
| 30 | Trichoderma asperellum | 50000 | 46584.994 | -6.830 | Fungi | False |
| 37 | Fusarium falciforme | 50000 | 46336.938 | -7.326 | Fungi | False |
# Create a grid of three axes for the plots
fig, axs = plt.subplots(1, 3, figsize=(9, 3.5), sharey=True)
pairs=[(True, False)]
ax1 = sns.boxplot(df_diffab, x='isfamily', y='diff_abundance', ax=axs[0], palette=sns.color_palette(['#4E79A7', '#A0CBE8']))
medians = df_diffab.groupby("isfamily")['diff_abundance'].median()
for i, median in enumerate(medians):
ax1.text(i, median - 8, f"{median:.2f}", ha='center', va='bottom', fontsize=8)
axs[0].set_ylabel("PRE", fontsize=12)
annotator = Annotator(ax1, pairs, data=df_diffab, x='isfamily', y='diff_abundance')
annotator.configure(test='Mann-Whitney', text_format='simple', loc='outside', line_width=1)
annotator.apply_and_annotate()
axs[0].set_xlabel("Species belongs to\na shared genus", fontsize=12)
ax2 = sns.boxplot(df_diffab, x='kingdom', y='diff_abundance', ax=axs[1], palette=sns.color_palette(['#4E79A7', '#77a2c8', '#A0CBE8']))
pairs=[("Virus", 'Bacteria'), ('Fungi', 'Bacteria'), ('Fungi', 'Virus')]
annotator = Annotator(ax2, pairs, data=df_diffab, x='kingdom', y='diff_abundance')
annotator.configure(test='Mann-Whitney', text_format='simple', loc='outside', line_width=1)
annotator.apply_and_annotate()
medians = df_diffab.groupby("kingdom")['diff_abundance'].median()
for i, median in enumerate(medians):
ax2.text(i, median - 8, f"{median:.2f}", ha='center', va='bottom', fontsize=8)
df_bac = df_diffab[df_diffab['kingdom'] == 'Bacteria']
ax3 = sns.boxplot(df_bac, x='isfamily', y='diff_abundance', ax=axs[2], palette=sns.color_palette(['#4E79A7', '#A0CBE8']))
pairs=[(True, False)]
annotator = Annotator(ax3, pairs, data=df_bac, x='isfamily', y='diff_abundance')
annotator.configure(test='Mann-Whitney', text_format='simple', loc='outside', line_width=1)
annotator.apply_and_annotate()
axs[2].set_xlabel("Bacterial species belongs\nto a shared genus", fontsize=12)
medians = df_bac.groupby("isfamily")['diff_abundance'].median()
for i, median in enumerate(medians):
ax3.text(i, median - 8, f"{median:.2f}", ha='center', va='bottom', fontsize=8)
for ax in axs:
sns.despine(top=True, right=True, ax=ax)
plt.tight_layout()
for format in ['png', 'tiff']:
plt.savefig(f'{RESULTS_DIR}/figures/paper/figI-{mode}-{prof}.{format}', dpi=DPI, bbox_inches='tight')
plt.show()
/tmp/ipykernel_2918775/1519887836.py:7: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. ax1 = sns.boxplot(df_diffab, x='isfamily', y='diff_abundance', ax=axs[0], palette=sns.color_palette(['#4E79A7', '#A0CBE8'])) /tmp/ipykernel_2918775/1519887836.py:20: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. ax2 = sns.boxplot(df_diffab, x='kingdom', y='diff_abundance', ax=axs[1], palette=sns.color_palette(['#4E79A7', '#77a2c8', '#A0CBE8'])) /tmp/ipykernel_2918775/1519887836.py:34: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. ax3 = sns.boxplot(df_bac, x='isfamily', y='diff_abundance', ax=axs[2], palette=sns.color_palette(['#4E79A7', '#A0CBE8']))
False vs. True: Mann-Whitney-Wilcoxon test two-sided, P_val:3.304e-01 U_stat=4.400e+02 Bacteria vs. Fungi: Mann-Whitney-Wilcoxon test two-sided, P_val:1.184e-07 U_stat=1.200e+01 Fungi vs. Virus: Mann-Whitney-Wilcoxon test two-sided, P_val:3.156e-02 U_stat=8.500e+01 Bacteria vs. Virus: Mann-Whitney-Wilcoxon test two-sided, P_val:2.544e-05 U_stat=3.100e+01 False vs. True: Mann-Whitney-Wilcoxon test two-sided, P_val:3.610e-01 U_stat=7.800e+01