In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import os
from datetime import datetime
from matplotlib.colors import Normalize, ListedColormap
from scipy.stats import linregress, pearsonr
from scipy.stats import mannwhitneyu, wilcoxon
from statsmodels.stats.multitest import multipletests

from statannotations.Annotator import Annotator
In [69]:
# Notebook-wide display and plotting configuration.
pd.options.display.float_format = '{:.3f}'.format
pd.set_option('display.max_columns', None)

# Inline (on-screen) figure resolution; saved figures use DPI below.
plt.rcParams['figure.dpi']=170

plt.rc('axes', linewidth=0.65)  # Adjust the line width of plot frames
plt.rc('xtick.major', width=0.0)
plt.rc('ytick.major', width=0.0)


# Resolution used when saving figures to disk (see plt.savefig calls).
DPI=250
In [70]:
from list_vars import LIST_PROFILERS, DIR_FIGURES, RESULTS_DIR

In silico sample analysis¶

In this notebook we are going to do an analysis on the in silico samples, where we are going to study several variables.


How many reads are incorrectly mapped if we do not perform a host mapping step?¶

It has been reported that not mapping to human databases before profiling increases the number of reads assigned to other organisms.

In this case, we are going to do 3 checks with the in silico dataset using pass2 (profiling after 2-time host mapping) and pass0 (direct profiling without host mapping), and we are going to check the influence on parameter sensitivity:

  • We are going to see what is the total number of reads mapped to the human dataset, and what is the offset left unmapped which should have been mapped to human.
    • We are also going to do the same with the microbial reads, and see if more microbial reads have been assigned to the pass0 dataset.

Later in the analysis we are going to do two additional analyses:

  • We are going to see the number of species present in total between pass0 and pass2, and their jaccard index.
  • We are going to calculate the ratio between the number of reads in pass0 and pass2.
In [71]:
# Host-mapping read counts per sample (columns include '1st_mapped' and '2nd_mapped'), indexed by sample name.
df_host_map_info = pd.read_csv(f'{RESULTS_DIR}/counts/mapping_counts.txt', sep='\t').set_index('SAMPLE')
In [72]:
# Ground-truth composition of the in silico sample: species, taxid and read counts.
artificial_taxid_counts = pd.read_csv('table_artificial_taxid.csv', sep=';', names=['species', 'taxid', 'reads'])
# NOTE(review): halving 'reads' presumably converts mate counts to read pairs
# (paired-end data) — confirm against how the table was generated.
artificial_taxid_counts['reads_true'] = (artificial_taxid_counts['reads'] / 2).astype(int)

# NOTE(review): assumes the first row of the table is the human entry — verify ordering.
n_true_human_reads = int(artificial_taxid_counts['reads_true'].iloc[0])
n_true_human_reads
Out[72]:
20000000
In [73]:
# Total reads removed by the two host-mapping passes for the ARTIFICIAL sample.
n_mapped_reads_1and2_maps = df_host_map_info.loc['ARTIFICIAL', '1st_mapped'] + df_host_map_info.loc['ARTIFICIAL', '2nd_mapped']

# NOTE(review): the printed output reports ~191% mapped and a negative
# remainder, which suggests '1st_mapped'/'2nd_mapped' count individual mates
# while n_true_human_reads counts pairs (or vice versa) — verify the units
# before interpreting these percentages.
print(f'There is a total of {n_mapped_reads_1and2_maps} reads mapped to human during the 1st and 2nd map, which represents around {100 * n_mapped_reads_1and2_maps/n_true_human_reads} % of the total number of reads ({n_true_human_reads}).')
print(f'There is a total of {n_true_human_reads - n_mapped_reads_1and2_maps} reads remaining to be mapped.')
There is a total of 38219883 reads mapped to human during the 1st and 2nd map, which represents around 191.099415 % of the total number of reads (20000000).
There is a total of -18219883 reads remaining to be mapped.
In [74]:
# Profiling read counts (human / other / unmapped) per profiler, mode and pass.
df_host_profile_info = pd.read_csv(f'{RESULTS_DIR}/counts/profiling_counts_ARTIFICIAL.txt', sep='\t')

# Take an explicit copy: assigning new columns to a boolean-filtered view
# triggers SettingWithCopyWarning and may silently fail to write through.
df_host_profile_info_artificial = df_host_profile_info[df_host_profile_info['SAMPLE'] == 'ARTIFICIAL'].copy()

# Reads removed by the 1st/2nd host-mapping steps only apply to pass 2;
# pass 0 profiles the raw reads directly (no host mapping).
df_host_profile_info_artificial['mapped_human_1_2_maps'] = 0
df_host_profile_info_artificial.loc[df_host_profile_info_artificial['pass'] == 2, 'mapped_human_1_2_maps'] = n_mapped_reads_1and2_maps

# Totals: human reads (profiler-assigned + host-mapping) and all reads.
df_host_profile_info_artificial['mapped_human_total'] = df_host_profile_info_artificial['mapped_human_1_2_maps'] + df_host_profile_info_artificial['mapped_human']
df_host_profile_info_artificial['total_reads'] = df_host_profile_info_artificial['mapped_human_total'] + df_host_profile_info_artificial['mapped_others'] + df_host_profile_info_artificial['unmapped']

# Observed proportion of each read category.
df_host_profile_info_artificial['observed_human_prop'] = df_host_profile_info_artificial['mapped_human_total'] / df_host_profile_info_artificial['total_reads']
df_host_profile_info_artificial['observed_others_prop'] = df_host_profile_info_artificial['mapped_others'] / df_host_profile_info_artificial['total_reads']
df_host_profile_info_artificial['observed_unmapped_prop'] = df_host_profile_info_artificial['unmapped'] / df_host_profile_info_artificial['total_reads']

# Expected proportions from the ground-truth composition.
df_host_profile_info_artificial['expected_human_prop'] = n_true_human_reads / artificial_taxid_counts['reads_true'].sum() # 0.8
df_host_profile_info_artificial['expected_others_prop'] = 1 - n_true_human_reads / artificial_taxid_counts['reads_true'].sum() # 0.2 (comment fixed: previously mislabelled 0.8)

# Shortfall between expected and observed proportions per category.
df_host_profile_info_artificial['calculated_unmapped_human_prop'] = df_host_profile_info_artificial['expected_human_prop'] - df_host_profile_info_artificial['observed_human_prop']
df_host_profile_info_artificial['calculated_unmapped_others_prop'] = df_host_profile_info_artificial['expected_others_prop'] - df_host_profile_info_artificial['observed_others_prop']

# Fraction of the expected non-human reads that were actually assigned.
df_host_profile_info_artificial['proportion_mapped_other_reads'] = df_host_profile_info_artificial['observed_others_prop'] / df_host_profile_info_artificial['expected_others_prop']


for profiler in LIST_PROFILERS:
    display(profiler)
    display(df_host_profile_info_artificial[df_host_profile_info_artificial['profiler'] == profiler])
'centrifuge'
SAMPLE pass mode profiler mapped_human mapped_others unmapped mapped_human_1_2_maps mapped_human_total total_reads observed_human_prop observed_others_prop observed_unmapped_prop expected_human_prop expected_others_prop calculated_unmapped_human_prop calculated_unmapped_others_prop proportion_mapped_other_reads
0 ARTIFICIAL 0 1 centrifuge 38740565 7929463 3326083 0 38740565 49996111 0.775 0.159 0.067 0.800 0.200 0.025 0.041 0.793
1 ARTIFICIAL 0 2 centrifuge 38831835 7963709 3200567 0 38831835 49996111 0.777 0.159 0.064 0.800 0.200 0.023 0.041 0.796
2 ARTIFICIAL 0 3 centrifuge 38909846 7998877 3087388 0 38909846 49996111 0.778 0.160 0.062 0.800 0.200 0.022 0.040 0.800
3 ARTIFICIAL 0 4 centrifuge 38976065 8034549 2985497 0 38976065 49996111 0.780 0.161 0.060 0.800 0.200 0.020 0.039 0.804
4 ARTIFICIAL 0 5 centrifuge 39023379 8070059 2902673 0 39023379 49996111 0.781 0.161 0.058 0.800 0.200 0.019 0.039 0.807
5 ARTIFICIAL 0 6 centrifuge 39064849 8106808 2824454 0 39064849 49996111 0.781 0.162 0.056 0.800 0.200 0.019 0.038 0.811
6 ARTIFICIAL 0 7 centrifuge 39095674 8148852 2751585 0 39095674 49996111 0.782 0.163 0.055 0.800 0.200 0.018 0.037 0.815
7 ARTIFICIAL 0 8 centrifuge 39120561 8209851 2665699 0 39120561 49996111 0.782 0.164 0.053 0.800 0.200 0.018 0.036 0.821
8 ARTIFICIAL 0 9 centrifuge 39149190 8425120 2421801 0 39149190 49996111 0.783 0.169 0.048 0.800 0.200 0.017 0.031 0.843
9 ARTIFICIAL 2 1 centrifuge 1046644 7929421 2800163 38219883 39266527 49996111 0.785 0.159 0.056 0.800 0.200 0.015 0.041 0.793
10 ARTIFICIAL 2 2 centrifuge 1052656 7964098 2759474 38219883 39272539 49996111 0.786 0.159 0.055 0.800 0.200 0.014 0.041 0.796
11 ARTIFICIAL 2 3 centrifuge 1058357 7998912 2718959 38219883 39278240 49996111 0.786 0.160 0.054 0.800 0.200 0.014 0.040 0.800
12 ARTIFICIAL 2 4 centrifuge 1063583 8034577 2678068 38219883 39283466 49996111 0.786 0.161 0.054 0.800 0.200 0.014 0.039 0.804
13 ARTIFICIAL 2 5 centrifuge 1066865 8069484 2639879 38219883 39286748 49996111 0.786 0.161 0.053 0.800 0.200 0.014 0.039 0.807
14 ARTIFICIAL 2 6 centrifuge 1069906 8105966 2600356 38219883 39289789 49996111 0.786 0.162 0.052 0.800 0.200 0.014 0.038 0.811
15 ARTIFICIAL 2 7 centrifuge 1071551 8147452 2557225 38219883 39291434 49996111 0.786 0.163 0.051 0.800 0.200 0.014 0.037 0.815
16 ARTIFICIAL 2 8 centrifuge 1073535 8203982 2498711 38219883 39293418 49996111 0.786 0.164 0.050 0.800 0.200 0.014 0.036 0.820
17 ARTIFICIAL 2 9 centrifuge 1082232 8384059 2309937 38219883 39302115 49996111 0.786 0.168 0.046 0.800 0.200 0.014 0.032 0.838
'ganon'
SAMPLE pass mode profiler mapped_human mapped_others unmapped mapped_human_1_2_maps mapped_human_total total_reads observed_human_prop observed_others_prop observed_unmapped_prop expected_human_prop expected_others_prop calculated_unmapped_human_prop calculated_unmapped_others_prop proportion_mapped_other_reads
18 ARTIFICIAL 0 1 ganon 35534622 7058731 7402758 0 35534622 49996111 0.711 0.141 0.148 0.800 0.200 0.089 0.059 0.706
19 ARTIFICIAL 0 2 ganon 35534622 7058731 7402758 0 35534622 49996111 0.711 0.141 0.148 0.800 0.200 0.089 0.059 0.706
20 ARTIFICIAL 0 3 ganon 38667598 8417056 2911457 0 38667598 49996111 0.773 0.168 0.058 0.800 0.200 0.027 0.032 0.842
21 ARTIFICIAL 0 4 ganon 38665847 8418807 2911457 0 38665847 49996111 0.773 0.168 0.058 0.800 0.200 0.027 0.032 0.842
22 ARTIFICIAL 0 5 ganon 39527606 8895428 1573077 0 39527606 49996111 0.791 0.178 0.031 0.800 0.200 0.009 0.022 0.890
23 ARTIFICIAL 0 6 ganon 39526084 8896950 1573077 0 39526084 49996111 0.791 0.178 0.031 0.800 0.200 0.009 0.022 0.890
24 ARTIFICIAL 0 7 ganon 39779348 9122179 1094584 0 39779348 49996111 0.796 0.182 0.022 0.800 0.200 0.004 0.018 0.912
25 ARTIFICIAL 0 8 ganon 39773555 9127972 1094584 0 39773555 49996111 0.796 0.183 0.022 0.800 0.200 0.004 0.017 0.913
26 ARTIFICIAL 0 9 ganon 39861349 9281351 853411 0 39861349 49996111 0.797 0.186 0.017 0.800 0.200 0.003 0.014 0.928
27 ARTIFICIAL 2 1 ganon 1080116 7658050 3038062 38219883 39299999 49996111 0.786 0.153 0.061 0.800 0.200 0.014 0.047 0.766
28 ARTIFICIAL 2 2 ganon 1080116 7658050 3038062 38219883 39299999 49996111 0.786 0.153 0.061 0.800 0.200 0.014 0.047 0.766
29 ARTIFICIAL 2 3 ganon 1176914 9041188 1558126 38219883 39396797 49996111 0.788 0.181 0.031 0.800 0.200 0.012 0.019 0.904
30 ARTIFICIAL 2 4 ganon 1176339 9041763 1558126 38219883 39396222 49996111 0.788 0.181 0.031 0.800 0.200 0.012 0.019 0.904
31 ARTIFICIAL 2 5 ganon 1214141 9519657 1042430 38219883 39434024 49996111 0.789 0.190 0.021 0.800 0.200 0.011 0.010 0.952
32 ARTIFICIAL 2 6 ganon 1213866 9519932 1042430 38219883 39433749 49996111 0.789 0.190 0.021 0.800 0.200 0.011 0.010 0.952
33 ARTIFICIAL 2 7 ganon 1228567 9741454 806207 38219883 39448450 49996111 0.789 0.195 0.016 0.800 0.200 0.011 0.005 0.974
34 ARTIFICIAL 2 8 ganon 1227189 9742832 806207 38219883 39447072 49996111 0.789 0.195 0.016 0.800 0.200 0.011 0.005 0.974
35 ARTIFICIAL 2 9 ganon 1231920 9884612 659696 38219883 39451803 49996111 0.789 0.198 0.013 0.800 0.200 0.011 0.002 0.989
'kaiju'
SAMPLE pass mode profiler mapped_human mapped_others unmapped mapped_human_1_2_maps mapped_human_total total_reads observed_human_prop observed_others_prop observed_unmapped_prop expected_human_prop expected_others_prop calculated_unmapped_human_prop calculated_unmapped_others_prop proportion_mapped_other_reads
36 ARTIFICIAL 0 1 kaiju 814678 5097740 44083693 0 814678 49996111 0.016 0.102 0.882 0.800 0.200 0.784 0.098 0.510
37 ARTIFICIAL 0 2 kaiju 1065560 5349844 43580707 0 1065560 49996111 0.021 0.107 0.872 0.800 0.200 0.779 0.093 0.535
38 ARTIFICIAL 0 3 kaiju 1065560 5349844 43580707 0 1065560 49996111 0.021 0.107 0.872 0.800 0.200 0.779 0.093 0.535
39 ARTIFICIAL 0 4 kaiju 1519215 5586056 42890840 0 1519215 49996111 0.030 0.112 0.858 0.800 0.200 0.770 0.088 0.559
40 ARTIFICIAL 0 5 kaiju 3099887 5936414 40959810 0 3099887 49996111 0.062 0.119 0.819 0.800 0.200 0.738 0.081 0.594
41 ARTIFICIAL 0 6 kaiju 3100631 5936491 40958989 0 3100631 49996111 0.062 0.119 0.819 0.800 0.200 0.738 0.081 0.594
42 ARTIFICIAL 0 7 kaiju 4812786 6324993 38858332 0 4812786 49996111 0.096 0.127 0.777 0.800 0.200 0.704 0.073 0.633
43 ARTIFICIAL 0 8 kaiju 4862779 6330261 38803071 0 4862779 49996111 0.097 0.127 0.776 0.800 0.200 0.703 0.073 0.633
44 ARTIFICIAL 0 9 kaiju 5825922 6650125 37520064 0 5825922 49996111 0.117 0.133 0.750 0.800 0.200 0.683 0.067 0.665
45 ARTIFICIAL 2 1 kaiju 56877 4941549 6777802 38219883 38276760 49996111 0.766 0.099 0.136 0.800 0.200 0.034 0.101 0.494
46 ARTIFICIAL 2 2 kaiju 68592 5116745 6590891 38219883 38288475 49996111 0.766 0.102 0.132 0.800 0.200 0.034 0.098 0.512
47 ARTIFICIAL 2 3 kaiju 68592 5116745 6590891 38219883 38288475 49996111 0.766 0.102 0.132 0.800 0.200 0.034 0.098 0.512
48 ARTIFICIAL 2 4 kaiju 82822 5262441 6430965 38219883 38302705 49996111 0.766 0.105 0.129 0.800 0.200 0.034 0.095 0.526
49 ARTIFICIAL 2 5 kaiju 118607 5439087 6218534 38219883 38338490 49996111 0.767 0.109 0.124 0.800 0.200 0.033 0.091 0.544
50 ARTIFICIAL 2 6 kaiju 118607 5439147 6218474 38219883 38338490 49996111 0.767 0.109 0.124 0.800 0.200 0.033 0.091 0.544
51 ARTIFICIAL 2 7 kaiju 169913 5603398 6002917 38219883 38389796 49996111 0.768 0.112 0.120 0.800 0.200 0.032 0.088 0.560
52 ARTIFICIAL 2 8 kaiju 170510 5604969 6000749 38219883 38390393 49996111 0.768 0.112 0.120 0.800 0.200 0.032 0.088 0.561
53 ARTIFICIAL 2 9 kaiju 194987 5718712 5862529 38219883 38414870 49996111 0.768 0.114 0.117 0.800 0.200 0.032 0.086 0.572
'kraken2'
SAMPLE pass mode profiler mapped_human mapped_others unmapped mapped_human_1_2_maps mapped_human_total total_reads observed_human_prop observed_others_prop observed_unmapped_prop expected_human_prop expected_others_prop calculated_unmapped_human_prop calculated_unmapped_others_prop proportion_mapped_other_reads
54 ARTIFICIAL 0 1 kraken2 14455036 2965393 32575682 0 14455036 49996111 0.289 0.059 0.652 0.800 0.200 0.511 0.141 0.297
55 ARTIFICIAL 0 2 kraken2 19419056 3736988 26840067 0 19419056 49996111 0.388 0.075 0.537 0.800 0.200 0.412 0.125 0.374
56 ARTIFICIAL 0 3 kraken2 23299865 4319590 22376656 0 23299865 49996111 0.466 0.086 0.448 0.800 0.200 0.334 0.114 0.432
57 ARTIFICIAL 0 4 kraken2 26673093 4785348 18537670 0 26673093 49996111 0.534 0.096 0.371 0.800 0.200 0.266 0.104 0.479
58 ARTIFICIAL 0 5 kraken2 29088107 5118672 15789332 0 29088107 49996111 0.582 0.102 0.316 0.800 0.200 0.218 0.098 0.512
59 ARTIFICIAL 0 6 kraken2 31245840 5417465 13332806 0 31245840 49996111 0.625 0.108 0.267 0.800 0.200 0.175 0.092 0.542
60 ARTIFICIAL 0 7 kraken2 32777703 5644156 11574252 0 32777703 49996111 0.656 0.113 0.232 0.800 0.200 0.144 0.087 0.564
61 ARTIFICIAL 0 8 kraken2 34127026 5848584 10020501 0 34127026 49996111 0.683 0.117 0.200 0.800 0.200 0.117 0.083 0.585
62 ARTIFICIAL 0 9 kraken2 35488630 6041475 8466006 0 35488630 49996111 0.710 0.121 0.169 0.800 0.200 0.090 0.079 0.604
63 ARTIFICIAL 2 1 kraken2 381464 2975038 8419726 38219883 38601347 49996111 0.772 0.060 0.168 0.800 0.200 0.028 0.140 0.298
64 ARTIFICIAL 2 2 kraken2 503239 3755612 7517377 38219883 38723122 49996111 0.775 0.075 0.150 0.800 0.200 0.025 0.125 0.376
65 ARTIFICIAL 2 3 kraken2 591284 4327476 6857468 38219883 38811167 49996111 0.776 0.087 0.137 0.800 0.200 0.024 0.113 0.433
66 ARTIFICIAL 2 4 kraken2 664379 4770367 6341482 38219883 38884262 49996111 0.778 0.095 0.127 0.800 0.200 0.022 0.105 0.477
67 ARTIFICIAL 2 5 kraken2 723279 5127558 5925391 38219883 38943162 49996111 0.779 0.103 0.119 0.800 0.200 0.021 0.097 0.513
68 ARTIFICIAL 2 6 kraken2 772466 5409228 5594534 38219883 38992349 49996111 0.780 0.108 0.112 0.800 0.200 0.020 0.092 0.541
69 ARTIFICIAL 2 7 kraken2 813970 5651243 5311015 38219883 39033853 49996111 0.781 0.113 0.106 0.800 0.200 0.019 0.087 0.565
70 ARTIFICIAL 2 8 kraken2 848074 5843864 5084290 38219883 39067957 49996111 0.781 0.117 0.102 0.800 0.200 0.019 0.083 0.584
71 ARTIFICIAL 2 9 kraken2 895442 6031308 4849478 38219883 39115325 49996111 0.782 0.121 0.097 0.800 0.200 0.018 0.079 0.603
'krakenuniq'
SAMPLE pass mode profiler mapped_human mapped_others unmapped mapped_human_1_2_maps mapped_human_total total_reads observed_human_prop observed_others_prop observed_unmapped_prop expected_human_prop expected_others_prop calculated_unmapped_human_prop calculated_unmapped_others_prop proportion_mapped_other_reads
72 ARTIFICIAL 0 1 krakenuniq 39192382 8073019 2730710 0 39192382 49996111 0.784 0.161 0.055 0.800 0.200 0.016 0.039 0.807
73 ARTIFICIAL 0 2 krakenuniq 39192382 8073019 2730710 0 39192382 49996111 0.784 0.161 0.055 0.800 0.200 0.016 0.039 0.807
74 ARTIFICIAL 0 3 krakenuniq 39192382 8073019 2730710 0 39192382 49996111 0.784 0.161 0.055 0.800 0.200 0.016 0.039 0.807
75 ARTIFICIAL 0 4 krakenuniq 39192382 8073019 2730710 0 39192382 49996111 0.784 0.161 0.055 0.800 0.200 0.016 0.039 0.807
76 ARTIFICIAL 0 5 krakenuniq 39192382 8073019 2730710 0 39192382 49996111 0.784 0.161 0.055 0.800 0.200 0.016 0.039 0.807
77 ARTIFICIAL 0 6 krakenuniq 39192382 8073019 2730710 0 39192382 49996111 0.784 0.161 0.055 0.800 0.200 0.016 0.039 0.807
78 ARTIFICIAL 0 7 krakenuniq 39192382 8073019 2730710 0 39192382 49996111 0.784 0.161 0.055 0.800 0.200 0.016 0.039 0.807
79 ARTIFICIAL 0 8 krakenuniq 39192382 8073019 2730710 0 39192382 49996111 0.784 0.161 0.055 0.800 0.200 0.016 0.039 0.807
80 ARTIFICIAL 0 9 krakenuniq 39192382 8073019 2730710 0 39192382 49996111 0.784 0.161 0.055 0.800 0.200 0.016 0.039 0.807
81 ARTIFICIAL 2 1 krakenuniq 1094558 8062716 2618954 38219883 39314441 49996111 0.786 0.161 0.052 0.800 0.200 0.014 0.039 0.806
82 ARTIFICIAL 2 2 krakenuniq 1094558 8062716 2618954 38219883 39314441 49996111 0.786 0.161 0.052 0.800 0.200 0.014 0.039 0.806
83 ARTIFICIAL 2 3 krakenuniq 1094558 8062716 2618954 38219883 39314441 49996111 0.786 0.161 0.052 0.800 0.200 0.014 0.039 0.806
84 ARTIFICIAL 2 4 krakenuniq 1094558 8062716 2618954 38219883 39314441 49996111 0.786 0.161 0.052 0.800 0.200 0.014 0.039 0.806
85 ARTIFICIAL 2 5 krakenuniq 1094558 8062716 2618954 38219883 39314441 49996111 0.786 0.161 0.052 0.800 0.200 0.014 0.039 0.806
86 ARTIFICIAL 2 6 krakenuniq 1094558 8062716 2618954 38219883 39314441 49996111 0.786 0.161 0.052 0.800 0.200 0.014 0.039 0.806
87 ARTIFICIAL 2 7 krakenuniq 1094558 8062716 2618954 38219883 39314441 49996111 0.786 0.161 0.052 0.800 0.200 0.014 0.039 0.806
88 ARTIFICIAL 2 8 krakenuniq 1094558 8062716 2618954 38219883 39314441 49996111 0.786 0.161 0.052 0.800 0.200 0.014 0.039 0.806
89 ARTIFICIAL 2 9 krakenuniq 1094558 8062716 2618954 38219883 39314441 49996111 0.786 0.161 0.052 0.800 0.200 0.014 0.039 0.806
In [75]:
# Categorical palette for per-profiler lines. Contains 6 colors, one more
# than the 5 profilers plotted below (hence the seaborn palette warning).
custom_palette = ["#648FFF", "#785EF0", "#DC267F", "#FE6100", "#FFB000", "#848484"]
In [76]:
def _plot_pass_diff(ax, value_col, title, show_legend):
    """Plot the pass2 - pass0 difference (in %) of `value_col` per profiler and mode.

    Pivots df_host_profile_info_artificial on 'pass' (columns become the raw
    pass values 0 and 2) and draws one line per profiler on `ax`.
    """
    pivoted = (
        df_host_profile_info_artificial.pivot_table(
            index=["profiler", "mode"], columns="pass", values=value_col
        )
        .reset_index()
    )
    pivoted.columns.name = None  # drop the 'pass' columns name from pivot_table

    # Difference in percentage points between profiling with (pass 2) and
    # without (pass 0) host mapping.
    pivoted["difference"] = 100 * (pivoted[2] - pivoted[0])

    # Slice the palette to the number of profilers actually plotted; passing
    # the full 6-color list for 5 hues raised a seaborn UserWarning.
    n_hues = pivoted["profiler"].nunique()
    sns.lineplot(
        data=pivoted,
        x="mode",
        y="difference",
        hue="profiler",
        marker="o",
        palette=custom_palette[:n_hues],
        ax=ax,
        legend=show_legend,
    )
    ax.axhline(0, color="#848484", linestyle="--", linewidth=0.8)
    ax.set_title(title, fontsize=12)
    ax.set_xlabel("Mode", fontsize=12)


fig, axs = plt.subplots(1, 2, figsize=(8, 3))

# 1A) Differences in human read assignment between pass 2 and pass 0.
_plot_pass_diff(axs[0], "observed_human_prop", "Human assignment", show_legend=False)
axs[0].set_ylabel("Diff pass2 - pass0 (%)", fontsize=12)

# 1B) Differences in non-human read assignment.
_plot_pass_diff(axs[1], "observed_others_prop", "Non-human assignment", show_legend=True)
axs[1].set_ylabel("", fontsize=12)

sns.despine(top=True, right=True)

for ax in axs:
    ax.set_xticks(range(1, 10))

axs[1].legend(title="Profiler", frameon=False, bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
plt.tight_layout()
# `fmt` instead of `format` to avoid shadowing the builtin.
for fmt in ['png', 'tiff']:
    plt.savefig(f'{RESULTS_DIR}/figures/paper/figA.{fmt}', dpi=DPI)
/tmp/ipykernel_2918775/492332427.py:24: UserWarning: The palette list has more values (6) than needed (5), which may not be intended.
  g = sns.lineplot(
/tmp/ipykernel_2918775/492332427.py:57: UserWarning: The palette list has more values (6) than needed (5), which may not be intended.
  h = sns.lineplot(
No description has been provided for this image

What do we see here?

  • The number of reads assigned to humans without host mapping is very variable depending on the profiler. Centrifuge, krakenuniq and ganon map human reads correctly, whereas kaiju and kraken2 fail to map the reads to human. The differences tend to decrease with the sensitivity mode, that is, paradoxically, a more strict read assignment leads to an improved number of human-mapped reads. However, this makes sense because more reads are assigned in general, and thus both human and non-human reads are mapped.
  • However, this difference does not occur in non-human species. In general, non-human species are assigned equally with or without host mapping. This is interesting because we would expect a higher amount of reads assigned to non-human species originating from a false positive assignment of human reads, but this seems not to be the case, even in profilers that have a high amount of unmapped human reads.
    • Still, we have to take into account that the profiler databases include a host mapping step.
In [77]:
# Ground-truth microbial composition (human, taxid 9606, excluded) with
# relative abundances expressed in percent.
table_artificial_taxcounts = pd.read_csv(
    '../../src/version_2/table_artificial_taxid.csv',
    sep=';',
    names=['species', 'taxid', 'count'],
)
non_human = table_artificial_taxcounts['taxid'] != 9606
table_artificial_taxcounts = table_artificial_taxcounts.loc[non_human]
total_count = table_artificial_taxcounts['count'].sum()
table_artificial_taxcounts['abundance'] = 100 * table_artificial_taxcounts['count'] / total_count
table_artificial_taxcounts
Out[77]:
species taxid count abundance
1 Cutibacterium acnes 1747 312500 3.125
2 Lactobacillus acidophilus 1579 312500 3.125
3 Bifidobacterium bifidum 1681 312500 3.125
4 Akkermansia muciniphila 239935 312500 3.125
5 Blautia coccoides 1532 312500 3.125
6 Blautia luti 89014 312500 3.125
7 Bacteroides ovatus 28116 312500 3.125
8 Bacteroides intestinalis 329854 312500 3.125
9 Bacteroides fragilis 817 312500 3.125
10 Escherichia coli 511145 312500 3.125
11 Dietzia lutea 546160 250000 2.500
12 Ruthenibacterium lactatiformans 1550024 187500 1.875
13 Faecalibacterium prausnitzii 853 187500 1.875
14 Parabacteroides distasonis 823 187500 1.875
15 Parabacteroides merdae 46503 187500 1.875
16 Fusicatenibacter saccharivorans 1150298 187500 1.875
17 Erysipelatoclostridium ramosum 1547 125000 1.250
18 Streptococcus salivarius 1304 125000 1.250
19 Hungatella hathewayi 154046 62500 0.625
20 Eisenbergiella porci 2652274 62500 0.625
21 Butyricimonas faecalis 2093856 62500 0.625
22 Alistipes indistinctus 626932 62500 0.625
23 Alistipes finegoldii 214856 62500 0.625
24 Eubacterium callanderi 53442 62500 0.625
25 Acidaminococcus intestini 187327 62500 0.625
26 Aspergillus chevalieri 182096 200000 2.000
27 Aspergillus flavus 5059 200000 2.000
28 Saccharomyces cerevisiae 4932 200000 2.000
29 Saccharomyces kudriavzevii 114524 200000 2.000
30 Saccharomyces mikatae 114525 200000 2.000
31 Candida albicans 5476 200000 2.000
32 Candida dubliniensis 42374 200000 2.000
33 Candida orthopsilosis 273371 200000 2.000
34 Malassezia restricta 76775 150000 1.500
35 Alternaria dauci 48095 150000 1.500
36 Kazachstania africana 432096 100000 1.000
37 Penicillium digitatum 36651 100000 1.000
38 Pichia kudriavzevii 4909 50000 0.500
39 Trichoderma asperellum 101201 50000 0.500
40 Akanthomyces muscarius 2231603 50000 0.500
41 Fusarium falciforme 195108 50000 0.500
42 Eremothecium sinecaudum 45286 50000 0.500
43 Cryptococcus decagattii 1859122 50000 0.500
44 Kwoniella shandongensis 1734106 50000 0.500
45 Puccinia triticina 208348 50000 0.500
46 Tobacco mosaic virus 12242 250000 2.500
47 Rotavirus A 28875 250000 2.500
48 Rotavirus B 28876 250000 2.500
49 Rotavirus C 36427 250000 2.500
50 Bacteriophage P2 10679 200000 2.000
51 Escherichia phage T4 10665 200000 2.000
52 Human immunodeficiency virus 1 11676 200000 2.000
53 Human adenovirus 7 108098 150000 1.500
54 Hepatitis C virus 3052230 150000 1.500
55 Bovine alphaherpesvirus 2 3050244 150000 1.500
56 Human herpesvirus 4 type 2 3050299 150000 1.500
57 Mimivirus terra2 1128151 100000 1.000
58 Dengue virus 3052464 100000 1.000
59 Norovirus GI 11983 50000 0.500
60 Zaire ebolavirus 3052462 50000 0.500

Computing detection stats to answer the questions¶

One of the parameters used during profiling is the mode of the profilers. Each profiler has a different set of parameters to include reads as valid or not. This may result in the detection of false positives and negatives.

Here, we are going to study this effect in in silico samples to see if there are major changes. We can measure the effectiveness of several variables:

  • Categorical values: we can use each of the columns in the flag system to check how well species were assigned. We can use the precision (TP/(TP+FP)), recall (TP/(TP+FN)), F1-score (2·precision·recall/(precision+recall)) and Cohen's kappa. $$\kappa = \frac{p_0-p_e}{1-p_e} \quad p_0 = \frac{TP + TN}{TP + FP + FN + TN} \quad p_e=\frac{TP + FP}{TP + FP + FN + TN}\cdot\frac{TP + FN }{TP + FP + FN + TN} + \frac{TN + FP}{TP + FP + FN + TN}\cdot\frac{TN + FN}{TP + FP + FN + TN}$$

  • Numerical values: we can use the normalized value and the abundance to see how well are reads classified. For that we can use the expected number of reads and abundance. With that we will calculate the (1) difference between observed and expected categories and (2) the mean error and the (3) mean absolute error: $$(1) \qquad DIFF_i = 100\cdot\frac{x_{obs,i} - x_{exp,i}}{x_{exp,i}}$$ $$(2) \qquad ME = E[DIFF_i] = \frac{100}{N}\sum\frac{x_{obs,i} - x_{exp,i}}{x_{exp,i}}$$ $$(3A) \qquad MAE = E[|DIFF_i|] = \frac{100}{N}\sum\frac{|x_{obs,i} - x_{exp,i}|}{x_{exp,i}}$$ $$(3B) \qquad MAED = \sigma[|DIFF_i|]$$

  • For numerical values we are also going to calculate the pearson correlation between the observed and expected values, using a log10(1+x) transform

Categorical values¶

In [78]:
def calculate_nominal_metrics(df_tax_ground_truth, df_flags_observed, column):
    """Compute confusion-matrix metrics for one boolean flag column.

    A taxid is considered "detected" when its flag in `column` is False
    (flag == True means the taxon was filtered out).

    Parameters
    ----------
    df_tax_ground_truth : pd.DataFrame
        Expected taxa; must contain a 'taxid' column.
    df_flags_observed : pd.DataFrame
        Observed taxa; must contain 'taxonomy_id' and the boolean `column`.
    column : str
        Name of the flag column to evaluate.

    Returns
    -------
    tuple
        (precision, recall, f1, kappa, TP, FN, FP, TN). Ratios with an empty
        denominator return 0.0 instead of raising.
    """
    # Sets give O(1) membership tests (the previous list comprehensions were
    # O(n*m)) and match the uniqueness assumption of the sanity check below.
    expected = set(df_tax_ground_truth['taxid'].astype(int).values)
    observed_true = set(df_flags_observed.loc[df_flags_observed[column] == False, 'taxonomy_id'].astype(int).values)
    observed_false = set(df_flags_observed.loc[df_flags_observed[column] == True, 'taxonomy_id'].astype(int).values)

    TP = len(expected & observed_true)       # expected and detected
    FN = len(expected - observed_true)       # expected but not detected
    FP = len(observed_true - expected)       # detected but not expected
    TN = len(observed_false - expected)      # flagged out and not expected

    # Sanity check: every taxid is counted exactly once across the four cells.
    assert len(expected | observed_true | observed_false) == TP + FN + FP + TN

    # Guard empty denominators explicitly (the previous bare `except:` only
    # covered f1, and precision/recall could raise ZeroDivisionError).
    precision = TP / (TP + FP) if (TP + FP) else 0.0
    recall = TP / (TP + FN) if (TP + FN) else 0.0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0.0

    # Cohen's kappa: observed agreement corrected for chance agreement.
    ALL = TP + FN + FP + TN
    p0 = (TP + TN) / (ALL)
    pe = (TP + FP)/ALL * (TP + FN)/ALL + (TN + FP)/ALL * (TN + FN)/ALL
    kappa = (p0 - pe) / (1 - pe) if pe != 1 else 0.0

    return precision, recall, f1, kappa, TP, FN, FP, TN
In [79]:
# Flag columns to evaluate: per-profiler normalized counts / relative
# abundances plus the consensus mean/CV columns.
columns_selected = ['centrifuge_norm', 'ganon_norm', 'kaiju_norm', 'kmcp_norm', 'kraken2_norm', 'krakenuniq_norm',
                    'centrifuge_relab', 'ganon_relab', 'kaiju_relab', 'kmcp_relab', 'kraken2_relab', 'krakenuniq_relab',
                    'mean_norm', 'CV_norm', 'mean_relab', 'CV_relab']

# Accumulator (dict of lists) converted to a DataFrame at the end.
df_nominal_stats = {'pass': [], 'mode': [], 'S': [], 'column': [], 'precision': [], 'recall': [], 'f1': [], 
                    'kappa': [], 'TP|FN|FP|TN': []}

for passn in [0, 2]:
    for mode in range(1, 10):
        for S in [0, 1, 2, 3, 4, 5, 6, 7, 10, 15]:
            # Read the flags table once per (pass, mode, S): it does not
            # depend on `column`, so reading it inside the column loop
            # re-parsed the same file 16 times per combination.
            summary_table_flags = pd.read_csv(f'{RESULTS_DIR}/summary/ARTIFICIAL_pass{passn}_mode{mode}_taxspecies_S{S}.flags.tsv', sep='\t')
            for column in columns_selected:
                try:
                    precision, recall, f1, kappa, TP, FN, FP, TN = calculate_nominal_metrics(table_artificial_taxcounts, summary_table_flags, column)
                except KeyError:
                    # Column absent from this flags table (e.g. kmcp): skip.
                    continue

                df_nominal_stats['pass'].append(passn)
                df_nominal_stats['mode'].append(mode)
                df_nominal_stats['S'].append(S)
                df_nominal_stats['column'].append(column)

                df_nominal_stats['precision'].append(precision)
                df_nominal_stats['recall'].append(recall)
                df_nominal_stats['f1'].append(f1)
                df_nominal_stats['kappa'].append(kappa)
                df_nominal_stats['TP|FN|FP|TN'].append((TP, FN, FP, TN))

df_nominal_stats = pd.DataFrame(df_nominal_stats)
In [80]:
df_nominal_stats
Out[80]:
pass mode S column precision recall f1 kappa TP|FN|FP|TN
0 0 1 0 centrifuge_norm 1.000 0.050 0.095 0.094 (3, 57, 0, 3637)
1 0 1 0 ganon_norm 0.833 0.083 0.152 0.149 (5, 55, 1, 3636)
2 0 1 0 kaiju_norm 1.000 0.100 0.182 0.179 (6, 54, 0, 3637)
3 0 1 0 kraken2_norm 1.000 0.217 0.356 0.352 (13, 47, 0, 3637)
4 0 1 0 krakenuniq_norm 1.000 0.100 0.182 0.179 (6, 54, 0, 3637)
... ... ... ... ... ... ... ... ... ...
2515 2 9 15 krakenuniq_relab 0.400 0.967 0.566 0.563 (58, 2, 87, 12282)
2516 2 9 15 mean_norm 0.310 0.967 0.470 0.466 (58, 2, 129, 12240)
2517 2 9 15 CV_norm 0.005 0.900 0.009 -0.000 (54, 6, 11726, 643)
2518 2 9 15 mean_relab 0.258 0.967 0.407 0.402 (58, 2, 167, 12202)
2519 2 9 15 CV_relab 0.005 0.933 0.009 -0.000 (56, 4, 11731, 638)

2520 rows × 9 columns

Numerical values¶

In [81]:
def compute_mad(values):
    """Median absolute deviation of `values` (no consistency scaling applied)."""
    deviations = np.abs(values - np.median(values))
    return np.median(deviations)

def calculate_numerical_metrics(df_tax_ground_truth, df_counts_observed, profiler, suffix):
    """
    Compare observed profiler values against the ground truth for the taxa
    present in both tables.

    Parameters
    ----------
    df_tax_ground_truth : pd.DataFrame
        Ground-truth table with columns 'taxid', 'species' and the expected
        values ('count' and 'abundance').
    df_counts_observed : pd.DataFrame
        Observed table with a 'taxonomy_id' column and one
        '{profiler}_{suffix}' value column per profiler.
    profiler : str
        Profiler name used to build the observed column name.
    suffix : str
        'norm' compares against the expected 'count' column; any other value
        compares against 'abundance'.

    Returns
    -------
    tuple
        (per-taxon % difference, MRE, MRED, MAE, MAED, MACV, regression slope,
        R^2, shared taxids, species names, expected values, observed values).
        Slope and R^2 are NaN when 35 or fewer taxa are shared.
    """
    truth = df_tax_ground_truth.copy().set_index('taxid')
    observed = df_counts_observed.copy().set_index('taxonomy_id')

    # Restrict the comparison to taxa reported in both tables.
    shared_taxids = np.intersect1d(truth.index.astype(int).values,
                                   observed.index.astype(int).values)

    species = truth.loc[shared_taxids, 'species'].values
    observed_counts = observed.loc[shared_taxids, f'{profiler}_{suffix}'].fillna(0).values

    # 'norm' means raw read counts; anything else means relative abundance.
    if suffix == 'norm':
        expected_counts = truth.loc[shared_taxids, 'count'].values
    else:
        expected_counts = truth.loc[shared_taxids, 'abundance'].values

    # Signed relative error per taxon, in percent.
    diff_counts = 100 * (observed_counts - expected_counts) / expected_counts
    abs_diff = np.abs(diff_counts)

    MRE_counts = np.mean(diff_counts)
    MRED_counts = np.std(diff_counts)
    MAE_counts = np.mean(abs_diff)
    MAED_counts = np.std(abs_diff)
    MACV_counts = MAED_counts / MAE_counts

    # Only fit a regression when enough taxa are shared for it to be meaningful.
    if len(shared_taxids) > 35:
        slope, _, r_value, _, _ = linregress(expected_counts, observed_counts)
        r2 = r_value ** 2
    else:
        slope, r2 = np.nan, np.nan

    return (diff_counts, MRE_counts, MRED_counts, MAE_counts, MAED_counts,
            MACV_counts, slope, r2, shared_taxids, species,
            expected_counts, observed_counts)
In [82]:
# NOTE(review): this cell previously relied on `S` leaking from the loop in an
# earlier cell (where its last value is 15). Set it explicitly so the cell is
# self-contained and survives Restart Kernel -> Run All.
S = 15

# Single (pass, mode, profiler) combination inspected in detail below.
for passn in [2]:
    for mode in range(5, 6):
        for profiler in ['mean']:
            summary_table_flags = pd.read_csv(f'{RESULTS_DIR}/summary/ARTIFICIAL_pass{passn}_mode{mode}_taxspecies_S{S}.flags.tsv', sep='\t')
            summary_table_counts = pd.read_csv(f'{RESULTS_DIR}/summary/ARTIFICIAL_pass{passn}_mode{mode}_taxspecies_S{S}.diversity.tsv', sep='\t')
            # (The original wrapped these calls in `try/except KeyError: raise`,
            # which is equivalent to no handler at all, so it was removed.)
            diff_counts, MRE_counts, MRED_counts, MAE_counts, MAED_counts, MACV_counts, corr_counts, rmse_counts, taxids_counts, species_counts, expected_counts, observed_counts = \
                calculate_numerical_metrics(table_artificial_taxcounts, summary_table_counts, profiler, suffix='norm')
            diff_abundance, MRE_abundance, MRED_abundance, MAE_abundance, MAED_abundance, MACV_abundance, corr_abundance, rmse_abundance, taxids_abundance, species_abundance, expected_abundance, observed_abundance = \
                calculate_numerical_metrics(table_artificial_taxcounts, summary_table_counts, profiler, suffix='relab')

# Per-species comparison table for the combination selected above.
# NOTE(review): the column is labelled 'diff_abundance' but holds `diff_counts`
# (the count-based % differences) — confirm which was intended.
df = pd.DataFrame({'species': species_counts, 'expected_counts': expected_counts, 'observed_counts': observed_counts, 'diff_abundance': diff_counts})
df
Out[82]:
species expected_counts observed_counts diff_abundance
0 Bacteroides fragilis 312500 240040.116 -23.187
1 Parabacteroides distasonis 187500 89289.167 -52.379
2 Faecalibacterium prausnitzii 187500 54476.741 -70.946
3 Streptococcus salivarius 125000 58117.095 -53.506
4 Blautia coccoides 312500 173325.771 -44.536
5 Erysipelatoclostridium ramosum 125000 67123.702 -46.301
6 Lactobacillus acidophilus 312500 239520.544 -23.353
7 Bifidobacterium bifidum 312500 184723.065 -40.889
8 Cutibacterium acnes 312500 254046.803 -18.705
9 Pichia kudriavzevii 50000 48697.541 -2.605
10 Saccharomyces cerevisiae 200000 195902.958 -2.049
11 Aspergillus flavus 200000 148008.703 -25.996
12 Candida albicans 200000 184824.390 -7.588
13 Escherichia phage T4 200000 164474.441 -17.763
14 Bacteriophage P2 200000 45220.596 -77.390
15 Human immunodeficiency virus 1 200000 201026.902 0.513
16 Norovirus GI 50000 50524.777 1.050
17 Tobacco mosaic virus 250000 243272.405 -2.691
18 Bacteroides ovatus 312500 111631.939 -64.278
19 Rotavirus A 250000 251608.261 0.643
20 Rotavirus B 250000 253634.055 1.454
21 Rotavirus C 250000 254565.176 1.826
22 Penicillium digitatum 100000 95742.552 -4.257
23 Candida dubliniensis 200000 184501.730 -7.749
24 Eremothecium sinecaudum 50000 49425.836 -1.148
25 Parabacteroides merdae 187500 125862.826 -32.873
26 Alternaria dauci 150000 141107.893 -5.928
27 Eubacterium callanderi 62500 30952.294 -50.476
28 Malassezia restricta 150000 149869.196 -0.087
29 Blautia luti 312500 53234.639 -82.965
30 Trichoderma asperellum 50000 46584.994 -6.830
31 Human adenovirus 7 150000 147511.825 -1.659
32 Saccharomyces kudriavzevii 200000 190980.300 -4.510
33 Saccharomyces mikatae 200000 189682.529 -5.159
34 Hungatella hathewayi 62500 40802.328 -34.716
35 Aspergillus chevalieri 200000 187004.613 -6.498
36 Acidaminococcus intestini 62500 57048.401 -8.723
37 Fusarium falciforme 50000 46336.938 -7.326
38 Puccinia triticina 50000 49269.888 -1.460
39 Alistipes finegoldii 62500 51804.368 -17.113
40 Akkermansia muciniphila 312500 196080.997 -37.254
41 Candida orthopsilosis 200000 189190.931 -5.405
42 Bacteroides intestinalis 312500 163291.164 -47.747
43 Kazachstania africana 100000 97363.929 -2.636
44 Dietzia lutea 250000 232472.093 -7.011
45 Alistipes indistinctus 62500 55391.499 -11.374
46 Mimivirus terra2 100000 84794.462 -15.206
47 Ruthenibacterium lactatiformans 187500 135305.249 -27.837
48 Kwoniella shandongensis 50000 48329.668 -3.341
49 Cryptococcus decagattii 50000 47962.656 -4.075
50 Butyricimonas faecalis 62500 49405.437 -20.951
51 Akanthomyces muscarius 50000 47989.514 -4.021
52 Eisenbergiella porci 62500 57135.945 -8.582
53 Bovine alphaherpesvirus 2 150000 148849.267 -0.767
54 Human herpesvirus 4 type 2 150000 139697.441 -6.868
55 Hepatitis C virus 150000 151377.085 0.918
56 Zaire ebolavirus 50000 49744.206 -0.512
57 Dengue virus 100000 101890.120 1.890
In [83]:
# NOTE(review): `S` previously leaked from an earlier cell (its last loop value,
# 15). Set it explicitly so this cell survives Restart Kernel -> Run All.
S = 15

# Accumulator: one entry per (pass, mode, profiler) combination, holding the
# count-based ('counts') and abundance-based ('abundance') metrics side by side.
df_numerical_stats = {'pass': [], 'mode': [], 'profiler': [], 
                    'diff_counts': [], 'MRE_counts': [],'MRED_counts': [], 'MAE_counts': [], 'MAED_counts': [], 'MACV_counts': [], 
                    'corr_counts': [], 'R2_counts': [], 'taxid_counts': [], 
                    'species_counts': [], 'expected_counts': [], 'observed_counts': [], 
                    'diff_abundance': [], 'MRE_abundance': [],'MRED_abundance': [], 'MAE_abundance': [], 'MAED_abundance': [], 'MACV_abundance': [], 
                    'corr_abundance': [], 'R2_abundance': [], 'taxid_abundance': [], 
                    'species_abundance': [], 'expected_abundance': [], 'observed_abundance': [],  }

for passn in [0, 2]:
    for mode in range(1, 10):
        for profiler in LIST_PROFILERS + ['mean']:
            summary_table_flags = pd.read_csv(f'{RESULTS_DIR}/summary/ARTIFICIAL_pass{passn}_mode{mode}_taxspecies_S{S}.flags.tsv', sep='\t')
            summary_table_counts = pd.read_csv(f'{RESULTS_DIR}/summary/ARTIFICIAL_pass{passn}_mode{mode}_taxspecies_S{S}.diversity.tsv', sep='\t')
            try:
                # Locals renamed from `rmse_*` to `r2_*`: the function returns the
                # regression R^2 (stored below under 'R2_*'), not an RMSE.
                diff_counts, MRE_counts, MRED_counts, MAE_counts, MAED_counts, MACV_counts, corr_counts, r2_counts, taxids_counts, species_counts, expected_counts, observed_counts = \
                    calculate_numerical_metrics(table_artificial_taxcounts, summary_table_counts, profiler, suffix='norm')
                diff_abundance, MRE_abundance, MRED_abundance, MAE_abundance, MAED_abundance, MACV_abundance, corr_abundance, r2_abundance, taxids_abundance, species_abundance, expected_abundance, observed_abundance = \
                    calculate_numerical_metrics(table_artificial_taxcounts, summary_table_counts, profiler, suffix='relab')
            except KeyError:
                # FIX: the original had an unreachable `continue` after this
                # `raise`. Fail loudly — a missing profiler column means the
                # summary table is malformed, not an expected gap.
                raise

            df_numerical_stats['pass'].append(passn)
            df_numerical_stats['mode'].append(mode)
            df_numerical_stats['profiler'].append(profiler)

            df_numerical_stats['diff_counts'].append(diff_counts)
            df_numerical_stats['MRE_counts'].append(MRE_counts)
            df_numerical_stats['MRED_counts'].append(MRED_counts)
            df_numerical_stats['MAE_counts'].append(MAE_counts)
            df_numerical_stats['MAED_counts'].append(MAED_counts)
            df_numerical_stats['MACV_counts'].append(MACV_counts)
            df_numerical_stats['corr_counts'].append(corr_counts)
            df_numerical_stats['R2_counts'].append(r2_counts)
            df_numerical_stats['taxid_counts'].append(taxids_counts)
            df_numerical_stats['species_counts'].append(species_counts)
            df_numerical_stats['expected_counts'].append(expected_counts)
            df_numerical_stats['observed_counts'].append(observed_counts)

            df_numerical_stats['diff_abundance'].append(diff_abundance)
            df_numerical_stats['MRE_abundance'].append(MRE_abundance)
            df_numerical_stats['MRED_abundance'].append(MRED_abundance)
            df_numerical_stats['MAE_abundance'].append(MAE_abundance)
            df_numerical_stats['MAED_abundance'].append(MAED_abundance)
            df_numerical_stats['MACV_abundance'].append(MACV_abundance)
            df_numerical_stats['corr_abundance'].append(corr_abundance)
            df_numerical_stats['R2_abundance'].append(r2_abundance)
            df_numerical_stats['taxid_abundance'].append(taxids_abundance)
            df_numerical_stats['species_abundance'].append(species_abundance)
            df_numerical_stats['expected_abundance'].append(expected_abundance)
            df_numerical_stats['observed_abundance'].append(observed_abundance)

df_numerical_stats = pd.DataFrame(df_numerical_stats)
In [84]:
df_numerical_stats
Out[84]:
pass mode profiler diff_counts MRE_counts MRED_counts MAE_counts MAED_counts MACV_counts corr_counts R2_counts taxid_counts species_counts expected_counts observed_counts diff_abundance MRE_abundance MRED_abundance MAE_abundance MAED_abundance MACV_abundance corr_abundance R2_abundance taxid_abundance species_abundance expected_abundance observed_abundance
0 0 1 centrifuge [-21.76736, -50.1056, -74.39946666666667, -58.... -16.939 23.674 17.173 23.505 1.369 0.635 0.576 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [244477.0, 93552.0, 48001.0, 51790.0, 189991.0... [-1.33929624238111, -37.07720182312471, -67.71... 4.750 29.856 26.450 14.640 0.554 0.801 0.576 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.0831469924255903, 1.1798024658164117, 0.605...
1 0 1 ganon [-23.91328, -55.785066666666665, -83.386666666... -28.729 21.022 28.729 21.022 0.732 0.558 0.590 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [237771.0, 82903.0, 31150.0, 42979.0, 159924.0... [7.790932959479549, -37.36135668956172, -76.46... 0.969 29.782 25.411 15.563 0.612 0.790 0.590 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.368466654983736, 1.1744745620707178, 0.4412...
2 0 1 kaiju [-53.25664, -79.21386666666666, -91.4437333333... -49.543 28.481 49.543 28.481 0.575 0.389 0.313 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [146073.0, 38974.0, 16043.0, 14537.0, 54507.0,... [-8.305719789553791, -59.224806809815064, -83.... -1.021 55.870 47.167 29.963 0.635 0.764 0.313 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [2.865446256576444, 0.7645348723159675, 0.3147...
3 0 1 kraken2 [-76.07584, -94.63306666666666, -99.4, -99.609... -65.147 24.902 65.147 24.902 0.382 0.176 0.122 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [74763.0, 10063.0, 1125.0, 488.0, 11530.0, 288... [-19.322126949109276, -81.90144330504141, -97.... 17.532 83.974 75.284 41.126 0.546 0.594 0.122 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [2.521183532840335, 0.3393479380304735, 0.0379...
4 0 1 krakenuniq [-19.62944, -48.239466666666665, -68.408533333... -16.729 23.969 17.018 23.765 1.396 0.642 0.572 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [251158.0, 97051.0, 59234.0, 56908.0, 183826.0... [-0.4454715144359227, -35.88453918746712, -60.... 3.147 29.691 25.571 15.414 0.603 0.795 0.572 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.1110790151738774, 1.2021648902349915, 0.733...
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
103 2 9 ganon [3.88512, -35.4976, -56.00053333333334, -32.8,... -1.882 23.489 18.223 14.940 0.820 0.751 0.659 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [324641.0, 120942.0, 82499.0, 84000.0, 215092.... [5.0978227572311425, -34.74463135224731, -55.4... -0.736 23.763 18.879 14.451 0.765 0.760 0.659 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.284306961163473, 1.223538162145363, 0.83462...
104 2 9 kaiju [-48.45856, -76.704, -89.0208, -85.9616, -80.8... -43.474 29.718 43.474 29.718 0.684 0.422 0.324 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [161067.0, 43680.0, 20586.0, 17548.0, 59848.0,... [-9.872292921902698, -59.26355445072247, -80.8... -1.156 51.967 43.407 28.597 0.659 0.738 0.324 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [2.8164908461905407, 0.7638083540489538, 0.359...
105 2 9 kraken2 [-39.60768, -84.74186666666667, -95.9482666666... -32.087 35.514 32.087 35.514 1.107 0.419 0.235 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [188726.0, 28609.0, 7597.0, 4884.0, 33042.0, 6... [0.13138111998259205, -74.70178386954647, -93.... 12.601 58.882 53.987 26.671 0.494 0.695 0.235 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.129105659999456, 0.4743415524460034, 0.1259...
106 2 9 krakenuniq [-19.67168, -48.3264, -68.4816, -54.56, -41.30... -16.778 23.985 17.064 23.782 1.394 0.641 0.572 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [251026.0, 96888.0, 59097.0, 56800.0, 183437.0... [-0.37064433374560224, -35.91043018258363, -60... 3.219 29.748 25.644 15.417 0.601 0.796 0.572 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.11341736457045, 1.2016794340765569, 0.73296...
107 2 9 mean [-20.390584802138655, -49.665829444940506, -66... -16.259 21.692 16.933 21.171 1.250 0.638 0.622 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [248779.4224933167, 94376.56979073654, 62310.1... [-1.6718373579364112, -43.06956794651194, -63.... 8.287 32.919 29.573 16.667 0.564 0.799 0.536 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.072755082564487, 1.0674456010029012, 0.6795...

108 rows × 27 columns

Analysis of kappa/F1¶

F1-score and $\kappa$ are quite different measures but we observe that they are correlated in this data.

In [85]:
# Pearson correlation between the two classification metrics on the real data.
f1_kappa_corr = df_nominal_stats['f1'].corr(df_nominal_stats['kappa'])

print("Pearson correlation between F1 and Kappa:", f1_kappa_corr)

# Scatter of F1 vs kappa with the identity line as a visual reference.
fig, ax = plt.subplots(figsize=(4, 4))

# Identity line (y = x), drawn first so points render on top.
ax.plot([0, 1], [0, 1], color='red', linestyle='--', alpha=0.5)

sns.scatterplot(x='f1', y='kappa', data=df_nominal_stats, s=20, ax=ax)

ax.text(0.01, 0.95, f'R$^2$: {f1_kappa_corr**2:.5f}')

sns.despine(ax=ax, top=True, right=True)

# Title and axis labels so the figure stands alone.
ax.set_title('F1-score vs Kappa')
ax.set_xlabel('F1-score')
ax.set_ylabel('Kappa')
fig.tight_layout()

for fig_format in ('png', 'tiff'):
    fig.savefig(f'{RESULTS_DIR}/figures/paper/fig0.kappa-f1.{fig_format}', dpi=DPI, bbox_inches='tight')

plt.show()
Pearson correlation between F1 and Kappa: 0.9997826111567186
No description has been provided for this image

In that sense, we can then use one of the measures to explain the results and don't need the second one. We are going to select the F1 score because it has a clearer interpretability and it is related to precision and recall, which are already being used.

In [86]:
import random

def cohen_kappa(tp, fp, fn, tn):
    """
    Compute Cohen's kappa from binary confusion-matrix counts.

    Parameters
    ----------
    tp, fp, fn, tn : int
        True positives, false positives, false negatives, true negatives.

    Returns
    -------
    float
        Cohen's kappa, or NaN when the expected agreement is (numerically) 1,
        which would make the denominator 1 - pe zero.
    """
    N = tp + fp + fn + tn  # total observations

    # Observed agreement: fraction of cases where prediction matches truth.
    po = (tp + tn) / N

    # Chance (expected) agreement from the marginal distributions.
    # FIX: the original also computed an unused `p_n = (fp + tn) / N` with a
    # confusing half-finished comment; it has been removed.
    p_pos_act = (tp + fn) / N   # actual positives
    p_pos_pred = (tp + fp) / N  # predicted positives
    p_neg_act = (fp + tn) / N   # actual negatives
    p_neg_pred = (fn + tn) / N  # predicted negatives

    pe = p_pos_act * p_pos_pred + p_neg_act * p_neg_pred

    if np.isclose(pe, 1.0):
        # Degenerate marginals: kappa is undefined (division by ~0).
        return np.nan

    kappa = (po - pe) / (1 - pe)
    return kappa

def f1_score_simple(tp, fp, fn):
    """
    F1 score from confusion-matrix counts: 2*TP / (2*TP + FP + FN).

    Returns NaN when the denominator is zero (no positives predicted or real).
    """
    denominator = 2 * tp + fp + fn
    return np.nan if denominator == 0 else 2 * tp / denominator

# --- Simulation parameters ---
N = 3000         # total samples per confusion matrix
n_sims = 100000  # number of random simulations

# FIX: the Monte Carlo run previously had no seed, so the printed correlation
# and plot were not reproducible across kernel restarts.
random.seed(42)

f1_values = []
kappa_values = []
table_values = []

for _ in range(n_sims):
    # Draw TP, FP, FN >= 0 sequentially and let TN absorb the remainder,
    # guaranteeing TP + FP + FN + TN == N.
    tp = random.randint(0, N)
    fp = random.randint(0, N - tp)
    fn = random.randint(0, N - tp - fp)
    tn = N - tp - fp - fn

    f1 = f1_score_simple(tp, fp, fn)
    kappa = cohen_kappa(tp, fp, fn, tn)

    # Keep only pairs where both metrics are defined.
    if not np.isnan(f1) and not np.isnan(kappa):
        f1_values.append(f1)
        kappa_values.append(kappa)
        table_values.append((tp, fp, fn, tn))

# Convert to numpy arrays for the correlation computation.
f1_values = np.array(f1_values)
kappa_values = np.array(kappa_values)

# Pearson correlation across all simulated confusion matrices.
corr, p_value = pearsonr(f1_values, kappa_values)

print(f"Number of valid simulations: {len(f1_values)} / {n_sims}")
print(f"Pearson correlation between F1 and Kappa: {corr:.3f} (p = {p_value:.3e})")

# Scatter of F1 vs Kappa with the identity line for reference.
plt.figure(figsize=(7, 5))
plt.scatter(f1_values, kappa_values, alpha=0.2, s=10)
plt.title("Monte Carlo Simulation (TP, FP, FN, TN random)")
plt.xlabel("F1 Score")
plt.ylabel("Cohen's Kappa")

plt.plot([0, 1], [0, 1])

plt.grid(True)
plt.show()
Number of valid simulations: 99965 / 100000
Pearson correlation between F1 and Kappa: 0.681 (p = 0.000e+00)
No description has been provided for this image
In [87]:
N = 0.8

df = pd.DataFrame({'f1': f1_values, 'kappa': kappa_values, 'table': table_values})

# Simulated tables where both F1 and kappa fall strictly within +/-0.03 of N.
lower, upper = N - 0.03, N + 0.03
near_N = (df['f1'] > lower) & (df['f1'] < upper) & (df['kappa'] > lower) & (df['kappa'] < upper)
df[near_N]
Out[87]:
f1 kappa table
1440 0.821 0.778 (484, 149, 62, 2305)
38383 0.829 0.772 (627, 206, 53, 2114)
39301 0.810 0.778 (351, 126, 39, 2484)
74870 0.811 0.792 (232, 108, 0, 2660)
78198 0.830 0.780 (562, 94, 137, 2207)
93867 0.811 0.791 (236, 80, 30, 2654)
In [88]:
N = 1

# Same strict +/-0.03 window, now around N = 1 (near-perfect classifiers).
lower, upper = N - 0.03, N + 0.03
df[(df['f1'] > lower) & (df['f1'] < upper) & (df['kappa'] > lower) & (df['kappa'] < upper)]
Out[88]:
f1 kappa table
361 0.992 0.972 (2118, 1, 33, 848)
911 1.000 1.000 (2999, 0, 0, 1)
3050 0.999 0.974 (2879, 5, 1, 115)
4738 1.000 1.000 (2943, 0, 0, 57)
6415 1.000 1.000 (2986, 0, 0, 14)
... ... ... ...
96296 1.000 1.000 (2993, 0, 0, 7)
96612 1.000 1.000 (2999, 0, 0, 1)
97751 0.992 0.983 (1564, 1, 25, 1410)
97820 0.999 0.977 (2883, 1, 4, 112)
98444 1.000 0.985 (2965, 1, 0, 34)

90 rows × 3 columns

In [89]:
N = 0

# Same strict +/-0.03 window, now around N = 0 (chance-level classifiers).
lower, upper = N - 0.03, N + 0.03
df[(df['f1'] > lower) & (df['f1'] < upper) & (df['kappa'] > lower) & (df['kappa'] < upper)]
Out[89]:
f1 kappa table
410 0.019 -0.002 (27, 2856, 4, 113)
1178 0.014 -0.020 (18, 2561, 32, 389)
2550 0.018 -0.017 (26, 2769, 27, 178)
5017 0.004 -0.004 (5, 2715, 7, 273)
5733 0.001 -0.013 (1, 1611, 20, 1368)
... ... ... ...
96808 0.010 -0.015 (15, 2958, 22, 5)
97000 0.007 -0.003 (11, 2969, 4, 16)
97339 0.020 -0.021 (28, 2667, 34, 271)
97542 0.015 -0.028 (20, 2583, 45, 352)
99850 0.005 -0.012 (8, 2949, 18, 25)

107 rows × 3 columns

How does the S parameter used during curve fitting affect the results?¶

The S parameter is useful to tweak the detection results, so that we can include more or fewer species during the flagging step. Since it is a structural parameter, we want to fit it first so that we can answer several other comparisons.

To do this we are going to use the nominal variables and their derived statistics.

Checking recall/precision/F1-score for inclusion/exclusion of species¶

In [90]:
cols = [f'{i}_norm' for i in LIST_PROFILERS] + ['mean_norm']
modes = range(2, 9)
passn = [2]
S_values = df_nominal_stats['S'].unique()
In [91]:
subset_df = df_nominal_stats[(df_nominal_stats['pass'].isin(passn)) & \
                             (df_nominal_stats['column'].isin(cols)) & \
                              (df_nominal_stats['mode'].isin(modes))]
In [92]:
melted_df = pd.melt(
    subset_df,
    id_vars=['mode', 'S', 'column'],
    value_vars=['recall', 'precision', 'f1'],
    var_name='metric',
    value_name='score'
)

# Create a colormap for 'mode'
norm = Normalize(vmin=melted_df['mode'].min(), vmax=melted_df['mode'].max())
cmap = plt.cm.viridis  # Choose a colormap (e.g., 'viridis', 'plasma', 'cividis')

# Create a FacetGrid: 6x3 grid (row for each profiler, column for each metric)
g = sns.FacetGrid(
    melted_df, 
    col='column', 
    row='metric', 
    height=2, 
    sharey=True, 
    sharex=True
)

# Map the lineplot to the grid
def lineplot_with_cmap(data, **kwargs):
    for mode in sorted(data['mode'].unique()):
        subset = data[data['mode'] == mode]
        plt.plot(subset['S'], subset['score'], label=f"Mode {mode}",
                 color=cmap(norm(mode)), marker='o')

g.map_dataframe(lineplot_with_cmap)

# Create a legend for the discrete modes
handles = [
    plt.Line2D([0], [0], color=cmap(norm(mode)), marker='o', linestyle='', label=f"Mode {mode}")
    for mode in sorted(melted_df['mode'].unique())
]
plt.legend(
    handles=handles, 
    title="", 
    bbox_to_anchor=(1.05, 3), 
    loc='center left', 
    frameon=False
)

# Set x-axis ticks (if you have specific S values)
g.set(xticks=subset_df['S'].unique())

for ax in g.axes.ravel():
    ax.set_title('')

# Add axis labels and titles
for ax, profiler in zip(g.axes[0, :], melted_df['column'].unique()):
    ax.set_title(profiler.replace('_norm', ''))

for ax, score in zip(g.axes[:, 0], ['recall', 'precision', 'F1-score']):
    ax.set_ylabel(score)

plt.tight_layout()

for format in ['png', 'tiff']: 
    plt.savefig(f'{RESULTS_DIR}/figures/paper/figE.{format}', dpi=DPI, bbox_inches='tight')


plt.show()
/tmp/ipykernel_2918775/2133720995.py:58: UserWarning: Tight layout not applied. tight_layout cannot make Axes height small enough to accommodate all Axes decorations.
  plt.tight_layout()
No description has been provided for this image
In [93]:
g = sns.FacetGrid(
    melted_df, 
    col='column', 
    row='metric', 
    height=3, 
    sharey=True, 
    sharex=True
)

g.map(sns.boxplot, 'S', 'score')

# Set x-axis ticks (if you have specific S values)
for ax in g.axes.ravel():
    ax.set_title('')

# Add axis labels and titles
for ax, profiler in zip(g.axes[0, :], melted_df['column'].unique()):
    ax.set_title(profiler.replace('_norm', ''))

for ax, score in zip(g.axes[:, 0], ['recall', 'precision', 'F1-score']):
    ax.set_ylabel(score)

plt.subplots_adjust(top=0.9)
g.fig.suptitle("Metrics by Profiler and Mode", fontsize=16)

plt.show()
/home/neuroimm/.local/share/mamba/envs/EVs/lib/python3.13/site-packages/seaborn/axisgrid.py:718: UserWarning: Using the boxplot function without specifying `order` is likely to produce an incorrect plot.
  warnings.warn(warning)
No description has been provided for this image

The aim of this part of the analysis was to select the "optimal" S to then make other comparisons and extract proper conclusions. If we look at individual profilers, the aim is not to select the S with the best F1 score, but to select the smallest S that provides a sufficiently high recall, ensuring that we don't lose TP species. This threshold depends on the profiler. For CEN it is 6-7, GAN is 7-10, KAI is 1-2, KR2 is 4-5, KRU is 5-6. We see that at these values the precision drops (expectedly), but it remains stable afterwards for most profilers. Therefore, a value of S=2 should be sufficient to ensure that the results are correct.

IMPORTANT: conceptually, if we are choosing the number of species based on the mean number of reads, using a low S is still good, because the number of reads reported by profilers that are not flagged is still considered. That is, the # of reads assigned to one profiler is the same regardless of S.

The advantage of using the mean value instead of the individual profilers is that it tends to provide better stability of the precision throughout the S values and modes. In fact, each profiler has an individual dynamic, and the mean value averages them all. Therefore, using the averaged value for S allows us to choose the parameter with better predictability, ensuring that we don't choose a very high value, while at the same time keeping the correct number of reads.

Therefore, we are going to choose S=2 and S=7 for comparisons of robustness with biological samples.

Does pass0/pass2 (no host pre-mapping vs host pre-mapping) affect the detection of the species?¶

For this part we are going to run three analyses:

  • Retrieve the raw detection of species with the passes, and calculate their jaccard index.
  • Calculate $\chi^2$ for each case and see if there are significant differences.
  • Calculate the Pearson correlation + RMSE for several mode values.

Total number of species and Jaccard index¶

In [94]:
# Plot Jaccard index between the different species

"""
For this part we are going to read all the .standardised.species reports and simply read the number of species and compute a table with the total number of species and the jaccard index
"""

def get_n_species(sample, mode, profiler, S=15):
    """
    Count the species detected in pass0 and pass2 for one sample/mode/profiler
    and compute the Jaccard index between the two taxid sets.

    Parameters
    ----------
    sample : str
        Sample name used to build the summary file paths.
    mode : int
        Profiling mode.
    profiler : str
        Column name in the diversity tables (e.g. 'kraken2_norm').
    S : int, optional
        S parameter of the summary files (default 15).

    Returns
    -------
    tuple
        (# pass0 species, # pass2 species, Jaccard index of their taxid sets).
    """
    taxids_by_pass = {}
    for passn in (0, 2):
        diversity = pd.read_csv(
            f'{RESULTS_DIR}/summary/{sample}_pass{passn}_mode{mode}_taxspecies_S{S}.diversity.tsv',
            sep='\t'
        ).set_index('taxonomy_id')
        # Taxa for which this profiler reported a (non-NaN) value.
        taxids_by_pass[passn] = diversity[profiler].dropna().index.values

    shared = len(np.intersect1d(taxids_by_pass[0], taxids_by_pass[2]))
    union = len(np.union1d(taxids_by_pass[0], taxids_by_pass[2]))
    jaccard_index = shared / union

    return len(taxids_by_pass[0]), len(taxids_by_pass[2]), jaccard_index


# One row per (profiler, mode) with the species counts of each pass and their
# Jaccard index.
dict_n_species = {'profiler': [],
                  'mode': [],
                  'pass 0 species': [],
                  'pass 2 species': [],
                  'jaccard': []}

for profiler in LIST_PROFILERS + ['mean']:
    for mode in range(1, 10):
        try:
            pass0_n_species, pass2_n_species, jaccard_index = get_n_species('ARTIFICIAL', mode, profiler + '_norm')
        except Exception as err:
            # FIX: the original bare `except:` also swallowed KeyboardInterrupt/
            # SystemExit and hid the reason for the failure; report it instead.
            print(f'No entry added for profiler {profiler} and mode {mode}: {err}')
            continue
        # Appends happen outside the try so a failure cannot leave the
        # accumulator lists with inconsistent lengths.
        dict_n_species['profiler'].append(profiler)
        dict_n_species['mode'].append(mode)
        dict_n_species['pass 0 species'].append(pass0_n_species)
        dict_n_species['pass 2 species'].append(pass2_n_species)
        dict_n_species['jaccard'].append(jaccard_index)

df_n_species = pd.DataFrame(dict_n_species)
In [95]:
plt.rc('axes', linewidth=0.65)

# 2x3 grid: one panel per profiler (plus the 'mean' pseudo-profiler).
fig, axes = plt.subplots(2, 3, figsize=(11, 6))
axes = axes.flatten()

# Seaborn theme applied once. FIX: the original re-called
# sns.set_theme(style="white") inside the loop with identical arguments —
# it only mutates global rcParams, so the repeats were redundant.
sns.set_theme(style="white")

# Custom handles for a single shared legend.
legend_elements = [
    plt.Line2D([0], [0], marker='o', color='#648FFF', label='Pass 0', markersize=8, linestyle='None'),
    plt.Line2D([0], [0], marker='o', color='#785EF0', label='Pass 2', markersize=8, linestyle='None'),
    plt.Line2D([0], [0], marker='o', color='#DC267F', label='Jaccard Index', markersize=8, linestyle='-')
]

# One panel per profiler: species counts (left axis) + Jaccard index (right axis).
for i, profiler in enumerate(LIST_PROFILERS + ['mean']):
    df_profiler = df_n_species[df_n_species['profiler'] == profiler]
    ax1 = axes[i]

    # Species counts for each pass on the primary y-axis.
    sns.scatterplot(
        data=df_profiler, x='mode', y='pass 0 species', ax=ax1, color='#648FFF', s=50
    )
    sns.scatterplot(
        data=df_profiler, x='mode', y='pass 2 species', ax=ax1, color='#785EF0', s=50
    )

    ax1.set_xticks(range(1, 10))
    ax1.set_xlabel('Mode')
    ax1.set_ylabel('Species Count', color='black')
    #ax1.set_yscale('log')  # Optional: Log scale
    ax1.set_title(f'{profiler.capitalize()}')

    # Jaccard index on a secondary y-axis.
    ax2 = ax1.twinx()
    sns.lineplot(
        data=df_profiler, x='mode', y='jaccard', ax=ax2, color='#DC267F', marker='o'
    )
    ax2.set_ylabel('Jaccard Index', color='#DC267F')

# Despine once at the end: sns.despine() acts on every axes of the current
# figure (including the twins), so the original per-iteration calls were
# idempotent repeats with the same final result.
sns.despine(top=True, right=True)

# Hide unused panels (a no-op when there are exactly 5 profilers + 'mean').
for j in range(len(LIST_PROFILERS) + 1, len(axes)):
    axes[j].axis('off')

# Single shared legend outside the grid.
fig.legend(
    handles=legend_elements, loc='center right', frameon=False, title="", bbox_to_anchor=(1.15, 0.5)
)

plt.tight_layout()

for fig_format in ('png', 'tiff'):
    plt.savefig(f'{RESULTS_DIR}/figures/paper/figB.{fig_format}', dpi=DPI, bbox_inches='tight')
No description has been provided for this image
In [96]:
# Jaccard index per mode, one line per profiler.
# NOTE(review): `custom_palette` is not defined anywhere in this notebook — the
# cell only works via hidden kernel state and will raise NameError on
# Restart & Run All. Define `custom_palette` in the configuration cell.
sns.lineplot(df_n_species, x='mode', y='jaccard', hue='profiler', marker='o', palette=custom_palette)
plt.legend(
    loc='center right', frameon=False, bbox_to_anchor=(1.35, 0.5),
)
Out[96]:
<matplotlib.legend.Legend at 0x76fde1836ad0>
No description has been provided for this image

The number of detected species tends to go up with the mode, which is expected (with the exception of krakenuniq). For most profilers the increase is linear, but for centrifuge the increase is exponential, although it may reach the top of detected species at some point.

We see that in general Jaccard indexes are very high. We should consider that this plot is not "strictly" relevant because many, many species are false positives, so we don't really care about the number of falsely assigned species. Still, it is important to note that the pass does not have, grosso modo, a relevant impact. It is interesting to find that for kaiju and centrifuge the mode has an impact. For Kaiju it decreases; this might be because the number of spurious species has increased; while for centrifuge the Jaccard index may increase because its number of species is so high (12000!!) that we may be reaching the maximum discoverable number of species, and therefore it is expected that the Jaccard index will increase in that case. For the rest of the profilers the mode does not seem to affect the Jaccard index, regardless of the total number of species detected.

Checking statistical differences in the truth table.¶

We are going to use the truth tables to compute statistically differential capture of species. We are going to use S=2 and S=7 throughout the different modes and profilers.

In [97]:
from scipy.stats import chi2_contingency
In [98]:
SMALL_VAL = 0.1  # pseudo-count added to every cell so chi2 is defined when a cell is 0

# Restrict to the normalized per-profiler columns (plus their mean) and the
# S thresholds of interest.
df_sub = df_nominal_stats[(df_nominal_stats['column'].isin([f'{i}_norm' for i in LIST_PROFILERS] + ['mean_norm'])) & 
                          (df_nominal_stats['S'].isin([0, 1, 2, 3, 4, 5, 6, 7, 10, 15]))].copy()

# Initialize a list to store chi-squared results
chi2_results = []

# Iterate over unique combinations of mode, S, and column, testing whether the
# pass0 vs pass2 confusion tables differ significantly.
for (mode, S, column), group in df_sub.groupby(['mode', 'S', 'column']):
    try:
        # Contingency tables: full (TP, FN, FP, TN) and partial (TP, FP, FN),
        # one row per pass.
        full_contingency_table = []
        partial_contingency_table = []

        # Per-group metric holders, reset every iteration. The previous version
        # checked `'precision_0' in locals()`, which stays True once any group
        # has set it, so stale values from an earlier (mode, S, column) could
        # leak into groups with a missing pass. Explicit per-iteration
        # initialization removes that hazard.
        pass_metrics = {0: None, 2: None}  # pass -> (precision, recall, f1)
        precision_diff, recall_diff, f1_diff = None, None, None

        for p in (0, 2):
            data = group[group['pass'] == p]
            if len(data):
                row = data.iloc[0]
                tp, fn, fp, tn = row['TP|FN|FP|TN']
                full_contingency_table.append([tp, fn, fp, tn])  # Full table
                partial_contingency_table.append([tp, fp, fn])   # Partial table
                pass_metrics[p] = (row['precision'], row['recall'], row['f1'])

        # Compute pass2 - pass0 differences only when both passes are present.
        if pass_metrics[0] is not None and pass_metrics[2] is not None:
            precision_diff = pass_metrics[2][0] - pass_metrics[0][0]
            recall_diff = pass_metrics[2][1] - pass_metrics[0][1]
            f1_diff = pass_metrics[2][2] - pass_metrics[0][2]

        # Chi-squared tests; None when a pass is missing (also reset per
        # iteration so nothing can carry over between groups).
        chi2_full = p_value_full = chi2_partial = p_value_partial = None
        if len(full_contingency_table) == 2:
            chi2_full, p_value_full, _, _ = chi2_contingency(np.array(full_contingency_table) + SMALL_VAL)
        if len(partial_contingency_table) == 2:
            chi2_partial, p_value_partial, _, _ = chi2_contingency(np.array(partial_contingency_table) + SMALL_VAL)

        # Store results. If only one pass is present, indexing
        # full_contingency_table[1] raises IndexError, which is caught below
        # and the group skipped — same behavior as before.
        chi2_results.append({
            'mode': mode,
            'S': S,
            'column': column,
            'chi2_full': chi2_full,
            'p_value_full': p_value_full,
            'chi2_partial': chi2_partial,
            'p_value_partial': p_value_partial,
            'precision_diff': precision_diff,
            'recall_diff': recall_diff,
            'f1_diff': f1_diff,
            'stats_pass0': full_contingency_table[0],
            'stats_pass2': full_contingency_table[1],
        })
    except Exception as e:
        print(f"Error processing {mode}, {S}, {column}: {e}")

# Convert results to a DataFrame
df_pass_chi2_stats = pd.DataFrame(chi2_results)
In [99]:
# Classify every (mode, S, column) test by which chi-squared variant (full 2x4
# table vs partial 2x3 table) reaches significance, then visualize how the two
# statistics / p-values agree.

# Add significance categories based on p-value thresholds
alpha = 0.1 

# NOTE: NaN p-values (groups where a table was incomplete) compare False
# against alpha, so those rows fall through to 'Neither Significant'.
df_pass_chi2_stats['significance'] = df_pass_chi2_stats.apply(
    lambda row: (
        'Both Significant' if row['p_value_full'] < alpha and row['p_value_partial'] < alpha else
        'Only Full Significant' if row['p_value_full'] < alpha else
        'Only Partial Significant' if row['p_value_partial'] < alpha else
        'Neither Significant'
    ),
    axis=1
)

# Count points in each quadrant of the p-value/p-value plane
quadrant_counts = {
    'Both Significant': len(df_pass_chi2_stats[(df_pass_chi2_stats['p_value_full'] < alpha) & (df_pass_chi2_stats['p_value_partial'] < alpha)]),
    'Only Full Significant': len(df_pass_chi2_stats[(df_pass_chi2_stats['p_value_full'] < alpha) & (df_pass_chi2_stats['p_value_partial'] >= alpha)]),
    'Only Partial Significant': len(df_pass_chi2_stats[(df_pass_chi2_stats['p_value_full'] >= alpha) & (df_pass_chi2_stats['p_value_partial'] < alpha)]),
    'Neither Significant': len(df_pass_chi2_stats[(df_pass_chi2_stats['p_value_full'] >= alpha) & (df_pass_chi2_stats['p_value_partial'] >= alpha)])
}

# Set up the plots
fig, axs = plt.subplots(1, 2, figsize=(12, 6))

# First plot: chi2_full vs chi2_partial (statistics should roughly agree)
sns.scatterplot(data=df_pass_chi2_stats, x='chi2_full', y='chi2_partial', ax=axs[0])
axs[0].set_title('Chi2 Full vs Partial')
axs[0].set_xlabel('Chi2 Full')
axs[0].set_ylabel('Chi2 Partial')

# Second plot: p_value_full vs p_value_partial with significance categories
sns.scatterplot(
    data=df_pass_chi2_stats,
    x='p_value_full',
    y='p_value_partial',
    hue='significance',
    palette={
        'Both Significant': '#bc0000',
        'Only Full Significant': '#0000bc',
        'Only Partial Significant': '#00bc00',
        'Neither Significant': '#aaaaaa'
    },
    ax=axs[1]
)

# Add thresholds (dashed lines at the alpha cutoff on both axes)
axs[1].axhline(alpha, color='black', linestyle='--', linewidth=1)
axs[1].axvline(alpha, color='black', linestyle='--', linewidth=1)

# Annotate quadrant counts (positions are in data coordinates on the unit
# p-value square, one label per quadrant)
axs[1].text(0.25, 0.75, f"Both Significant\n{quadrant_counts['Both Significant']}", 
            ha='center', color='#bc0000', fontsize=12, bbox=dict(facecolor='white', edgecolor='none'))
axs[1].text(0.75, 0.75, f"Only Full Significant\n{quadrant_counts['Only Full Significant']}", 
            ha='center', color='#0000bc', fontsize=12, bbox=dict(facecolor='white', edgecolor='none'))
axs[1].text(0.25, 0.25, f"Only Partial Significant\n{quadrant_counts['Only Partial Significant']}", 
            ha='center', color='#00bc00', fontsize=12, bbox=dict(facecolor='white', edgecolor='none'))
axs[1].text(0.75, 0.25, f"Neither Significant\n{quadrant_counts['Neither Significant']}", 
            ha='center', color='#aaaaaa', fontsize=12, bbox=dict(facecolor='white', edgecolor='none'))

# Customize the plot
axs[1].set_title('P-Value Comparison with Significance')
axs[1].set_xlabel('P-Value Full')
axs[1].set_ylabel('P-Value Partial')
axs[1].set_xlim(0, 1)
axs[1].set_ylim(0, 1)

plt.legend(loc='center right', frameon=False, bbox_to_anchor=(1.65, 0.5))
# Adjust layout
plt.tight_layout()

# Show the plots
plt.show()
No description has been provided for this image
In [100]:
df_pass_chi2_stats[df_pass_chi2_stats['significance'] != 'Neither Significant'].sort_values(by=['mode', 'S'])
Out[100]:
mode S column chi2_full p_value_full chi2_partial p_value_partial precision_diff recall_diff f1_diff stats_pass0 stats_pass2 significance
5 1 0 mean_norm 59.705 0.000 59.578 0.000 -0.043 0.683 0.716 [4, 56, 0, 3637] [45, 15, 2, 3520] Both Significant
13 1 2 ganon_norm 30.477 0.000 30.438 0.000 0.060 0.500 0.498 [11, 49, 1, 3636] [41, 19, 1, 3521] Both Significant
18 1 3 centrifuge_norm 5.064 0.167 5.032 0.081 0.000 0.183 0.227 [11, 49, 0, 3637] [22, 38, 0, 3522] Only Partial Significant
43 1 7 ganon_norm 37.082 0.000 33.421 0.000 -0.269 0.267 0.001 [42, 18, 1, 3636] [58, 2, 24, 3498] Both Significant
62 2 0 kaiju_norm 8.014 0.046 7.984 0.018 0.000 0.167 0.269 [2, 58, 0, 3724] [12, 48, 0, 3611] Both Significant
65 2 0 mean_norm 10.970 0.012 10.962 0.004 0.050 -0.233 -0.321 [19, 41, 1, 3723] [5, 55, 0, 3611] Both Significant
73 2 2 ganon_norm 30.473 0.000 30.438 0.000 0.060 0.500 0.498 [11, 49, 1, 3723] [41, 19, 1, 3610] Both Significant
78 2 3 centrifuge_norm 5.062 0.167 5.032 0.081 0.000 0.183 0.227 [11, 49, 0, 3724] [22, 38, 0, 3611] Only Partial Significant
96 2 6 centrifuge_norm 46.620 0.000 41.727 0.000 0.376 -0.233 0.088 [58, 2, 35, 3689] [44, 16, 0, 3611] Both Significant
102 2 7 centrifuge_norm 46.620 0.000 41.727 0.000 0.376 -0.233 0.088 [58, 2, 35, 3689] [44, 16, 0, 3611] Both Significant
103 2 7 ganon_norm 37.046 0.000 33.421 0.000 -0.269 0.267 0.001 [42, 18, 1, 3723] [58, 2, 24, 3587] Both Significant
122 3 0 kaiju_norm 8.013 0.046 7.984 0.018 0.000 0.167 0.269 [2, 58, 0, 3761] [12, 48, 0, 3648] Both Significant
125 3 0 mean_norm 26.453 0.000 26.381 0.000 0.013 0.467 0.405 [17, 43, 1, 3760] [45, 15, 2, 3646] Both Significant
138 3 3 centrifuge_norm 5.061 0.167 5.032 0.081 0.000 0.183 0.227 [11, 49, 0, 3761] [22, 38, 0, 3648] Only Partial Significant
139 3 3 ganon_norm 36.467 0.000 36.431 0.000 0.061 0.550 0.533 [11, 49, 1, 3760] [44, 16, 1, 3647] Both Significant
182 4 0 kaiju_norm 19.754 0.000 19.750 0.000 0.081 -0.367 -0.368 [34, 26, 3, 3845] [12, 48, 0, 3733] Both Significant
185 4 0 mean_norm 44.783 0.000 44.789 0.000 0.004 -0.600 -0.516 [49, 11, 4, 3844] [13, 47, 1, 3732] Both Significant
198 4 3 centrifuge_norm 38.612 0.000 38.576 0.000 0.000 0.567 0.547 [11, 49, 0, 3848] [45, 15, 0, 3733] Both Significant
199 4 3 ganon_norm 36.467 0.000 36.431 0.000 0.061 0.550 0.533 [11, 49, 1, 3847] [44, 16, 1, 3732] Both Significant
245 5 0 mean_norm 17.691 0.001 17.077 0.000 0.154 -0.183 -0.016 [58, 2, 14, 4048] [47, 13, 2, 3940] Both Significant
259 5 3 ganon_norm 40.835 0.000 40.799 0.000 0.062 0.583 0.554 [11, 49, 1, 4061] [46, 14, 1, 3941] Both Significant
263 5 3 mean_norm 6.513 0.089 2.956 0.228 -0.114 0.000 -0.097 [58, 2, 43, 4019] [58, 2, 68, 3874] Only Full Significant
305 6 0 mean_norm 43.871 0.000 40.577 0.000 0.300 -0.317 -0.006 [58, 2, 31, 4162] [39, 21, 2, 4066] Both Significant
319 6 3 ganon_norm 40.836 0.000 40.799 0.000 0.062 0.583 0.554 [11, 49, 1, 4192] [46, 14, 1, 4067] Both Significant
354 6 15 centrifuge_norm 8.454 0.037 3.712 0.156 -0.125 0.000 -0.109 [58, 2, 46, 4147] [58, 2, 76, 3992] Only Full Significant
365 7 0 mean_norm 46.384 0.000 46.336 0.000 0.084 0.583 0.422 [23, 37, 4, 4836] [58, 2, 4, 4678] Both Significant
379 7 3 ganon_norm 40.843 0.000 40.799 0.000 0.062 0.583 0.554 [11, 49, 1, 4839] [46, 14, 1, 4681] Both Significant
402 7 7 centrifuge_norm 12.768 0.005 5.586 0.061 -0.152 0.000 -0.135 [58, 2, 45, 4795] [58, 2, 83, 4599] Both Significant
407 7 7 mean_norm 6.531 0.088 3.144 0.208 0.102 0.000 0.101 [58, 2, 104, 4736] [58, 2, 68, 4614] Only Full Significant
413 7 10 mean_norm 6.531 0.088 3.144 0.208 0.102 0.000 0.101 [58, 2, 104, 4736] [58, 2, 68, 4614] Only Full Significant
437 8 2 mean_norm 9.109 0.028 5.037 0.081 0.147 0.000 0.127 [58, 2, 76, 7660] [58, 2, 42, 7520] Both Significant
439 8 3 ganon_norm 40.820 0.000 40.799 0.000 0.062 0.583 0.554 [11, 49, 1, 7735] [46, 14, 1, 7561] Both Significant
443 8 3 mean_norm 9.109 0.028 5.037 0.081 0.147 0.000 0.127 [58, 2, 76, 7660] [58, 2, 42, 7520] Both Significant
449 8 4 mean_norm 8.979 0.030 3.999 0.135 0.113 0.000 0.114 [58, 2, 111, 7625] [58, 2, 69, 7493] Only Full Significant
482 9 0 kaiju_norm 997.648 0.000 888.595 0.000 -0.437 0.933 0.085 [1, 59, 1, 12540] [57, 3, 854, 11515] Both Significant
485 9 0 mean_norm 24.758 0.000 24.539 0.000 0.101 -0.383 -0.228 [51, 9, 8, 12533] [28, 32, 1, 12368] Both Significant
499 9 3 ganon_norm 38.413 0.000 38.405 0.000 0.051 0.567 0.519 [13, 47, 1, 12540] [47, 13, 1, 12368] Both Significant
507 9 4 kraken2_norm 32.370 0.000 31.976 0.000 -0.109 0.400 0.210 [33, 27, 0, 12541] [57, 3, 7, 12362] Both Significant
509 9 4 mean_norm 17.490 0.001 7.116 0.028 0.147 0.000 0.151 [58, 2, 129, 12412] [58, 2, 69, 12300] Both Significant
529 9 10 ganon_norm 10.471 0.015 5.917 0.052 0.164 0.000 0.136 [58, 2, 70, 12471] [58, 2, 36, 12333] Both Significant
535 9 15 ganon_norm 10.471 0.015 5.917 0.052 0.164 0.000 0.136 [58, 2, 70, 12471] [58, 2, 36, 12333] Both Significant

Only using the full table (for publication)¶

In [101]:
plt.rc('axes', linewidth=0.65)

# Flag the tests whose full-table chi-squared p-value is below alpha.
# Vectorized comparison replaces the list comprehension; NaN p-values compare
# False and land in the non-significant group, exactly as before.
df_pass_chi2_stats['are_diff'] = df_pass_chi2_stats['p_value_full'] < alpha

# Report how many tests are / are not significantly different.
print((df_pass_chi2_stats['are_diff'] == True).sum(), (df_pass_chi2_stats['are_diff'] == False).sum())

display(df_pass_chi2_stats)
display(df_pass_chi2_stats[df_pass_chi2_stats['are_diff'] == True])


# Create a grid of three axes for the plots
fig, axs = plt.subplots(1, 3, figsize=(7.5, 2.7), sharey=True)

# Define the metrics to plot
metrics = ['f1_diff', 'precision_diff', 'recall_diff']
titles = ['F1 Score', 'Precision', 'Recall']


# Iterate through the metrics and create a boxplot for each
for ax, metric, title in zip(axs, metrics, titles):
    # Assigning hue=x with legend=False avoids the seaborn FutureWarning about
    # passing `palette` without `hue` (deprecated, removal in v0.14) while
    # producing the same two-color plot.
    sns.boxplot(
        data=df_pass_chi2_stats, 
        x="are_diff", 
        y=metric, 
        hue="are_diff",
        legend=False,
        palette=sns.color_palette(['#4E79A7', '#A0CBE8']),
        ax=ax
    )
    sns.despine(top=True, right=True, ax=ax)

    ax.set_title(title, fontsize=14, pad=40)
    ax.set_xlabel("Group", fontsize=12)
    if metric == 'f1_diff':  # Label y-axis only for the first plot
        ax.set_ylabel("Difference", fontsize=12)
    else:
        ax.set_ylabel("")

    # Highlight medians
    medians = df_pass_chi2_stats.groupby("are_diff")[metric].median()
    for i, median in enumerate(medians):
        ax.text(i, median, f"{median:.2f}", ha='center', va='bottom', fontsize=10)

    # Annotate the False-vs-True Mann-Whitney test above each panel.
    pairs = [(False, True)]

    annotator = Annotator(ax, pairs, data=df_pass_chi2_stats, x='are_diff', y=metric)
    annotator.configure(test='Mann-Whitney', text_format='simple', loc='outside', line_width=1)
    annotator.apply_and_annotate()


# Adjust layout and save the plot
plt.tight_layout()

# `fmt` instead of `format` avoids shadowing the Python builtin.
for fmt in ['png', 'tiff']: 
    plt.savefig(f'{RESULTS_DIR}/figures/paper/figC.{fmt}', dpi=DPI)

plt.show()
38 502
mode S column chi2_full p_value_full chi2_partial p_value_partial precision_diff recall_diff f1_diff stats_pass0 stats_pass2 significance are_diff
0 1 0 centrifuge_norm 0.031 0.999 0.000 1.000 0.000 0.000 0.000 [3, 57, 0, 3637] [3, 57, 0, 3522] Neither Significant False
1 1 0 ganon_norm 1.356 0.716 1.349 0.509 0.167 -0.033 -0.056 [5, 55, 1, 3636] [3, 57, 0, 3522] Neither Significant False
2 1 0 kaiju_norm 0.031 0.999 0.000 1.000 0.000 0.000 0.000 [6, 54, 0, 3637] [6, 54, 0, 3522] Neither Significant False
3 1 0 kraken2_norm 0.031 0.999 0.000 1.000 0.000 0.000 0.000 [13, 47, 0, 3637] [13, 47, 0, 3522] Neither Significant False
4 1 0 krakenuniq_norm 0.459 0.928 0.428 0.807 0.000 -0.033 -0.057 [6, 54, 0, 3637] [4, 56, 0, 3522] Neither Significant False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
535 9 15 ganon_norm 10.471 0.015 5.917 0.052 0.164 0.000 0.136 [58, 2, 70, 12471] [58, 2, 36, 12333] Both Significant True
536 9 15 kaiju_norm 0.180 0.981 0.127 0.939 0.024 0.000 0.020 [57, 3, 55, 12486] [57, 3, 50, 12319] Neither Significant False
537 9 15 kraken2_norm 0.006 1.000 0.000 1.000 0.000 0.000 0.000 [57, 3, 7, 12534] [57, 3, 7, 12362] Neither Significant False
538 9 15 krakenuniq_norm 0.014 1.000 0.000 1.000 0.000 0.000 0.000 [58, 2, 87, 12454] [58, 2, 87, 12282] Neither Significant False
539 9 15 mean_norm 0.018 0.999 0.000 1.000 0.000 0.000 0.000 [58, 2, 129, 12412] [58, 2, 129, 12240] Neither Significant False

540 rows × 14 columns

mode S column chi2_full p_value_full chi2_partial p_value_partial precision_diff recall_diff f1_diff stats_pass0 stats_pass2 significance are_diff
5 1 0 mean_norm 59.705 0.000 59.578 0.000 -0.043 0.683 0.716 [4, 56, 0, 3637] [45, 15, 2, 3520] Both Significant True
13 1 2 ganon_norm 30.477 0.000 30.438 0.000 0.060 0.500 0.498 [11, 49, 1, 3636] [41, 19, 1, 3521] Both Significant True
43 1 7 ganon_norm 37.082 0.000 33.421 0.000 -0.269 0.267 0.001 [42, 18, 1, 3636] [58, 2, 24, 3498] Both Significant True
62 2 0 kaiju_norm 8.014 0.046 7.984 0.018 0.000 0.167 0.269 [2, 58, 0, 3724] [12, 48, 0, 3611] Both Significant True
65 2 0 mean_norm 10.970 0.012 10.962 0.004 0.050 -0.233 -0.321 [19, 41, 1, 3723] [5, 55, 0, 3611] Both Significant True
73 2 2 ganon_norm 30.473 0.000 30.438 0.000 0.060 0.500 0.498 [11, 49, 1, 3723] [41, 19, 1, 3610] Both Significant True
96 2 6 centrifuge_norm 46.620 0.000 41.727 0.000 0.376 -0.233 0.088 [58, 2, 35, 3689] [44, 16, 0, 3611] Both Significant True
102 2 7 centrifuge_norm 46.620 0.000 41.727 0.000 0.376 -0.233 0.088 [58, 2, 35, 3689] [44, 16, 0, 3611] Both Significant True
103 2 7 ganon_norm 37.046 0.000 33.421 0.000 -0.269 0.267 0.001 [42, 18, 1, 3723] [58, 2, 24, 3587] Both Significant True
122 3 0 kaiju_norm 8.013 0.046 7.984 0.018 0.000 0.167 0.269 [2, 58, 0, 3761] [12, 48, 0, 3648] Both Significant True
125 3 0 mean_norm 26.453 0.000 26.381 0.000 0.013 0.467 0.405 [17, 43, 1, 3760] [45, 15, 2, 3646] Both Significant True
139 3 3 ganon_norm 36.467 0.000 36.431 0.000 0.061 0.550 0.533 [11, 49, 1, 3760] [44, 16, 1, 3647] Both Significant True
182 4 0 kaiju_norm 19.754 0.000 19.750 0.000 0.081 -0.367 -0.368 [34, 26, 3, 3845] [12, 48, 0, 3733] Both Significant True
185 4 0 mean_norm 44.783 0.000 44.789 0.000 0.004 -0.600 -0.516 [49, 11, 4, 3844] [13, 47, 1, 3732] Both Significant True
198 4 3 centrifuge_norm 38.612 0.000 38.576 0.000 0.000 0.567 0.547 [11, 49, 0, 3848] [45, 15, 0, 3733] Both Significant True
199 4 3 ganon_norm 36.467 0.000 36.431 0.000 0.061 0.550 0.533 [11, 49, 1, 3847] [44, 16, 1, 3732] Both Significant True
245 5 0 mean_norm 17.691 0.001 17.077 0.000 0.154 -0.183 -0.016 [58, 2, 14, 4048] [47, 13, 2, 3940] Both Significant True
259 5 3 ganon_norm 40.835 0.000 40.799 0.000 0.062 0.583 0.554 [11, 49, 1, 4061] [46, 14, 1, 3941] Both Significant True
263 5 3 mean_norm 6.513 0.089 2.956 0.228 -0.114 0.000 -0.097 [58, 2, 43, 4019] [58, 2, 68, 3874] Only Full Significant True
305 6 0 mean_norm 43.871 0.000 40.577 0.000 0.300 -0.317 -0.006 [58, 2, 31, 4162] [39, 21, 2, 4066] Both Significant True
319 6 3 ganon_norm 40.836 0.000 40.799 0.000 0.062 0.583 0.554 [11, 49, 1, 4192] [46, 14, 1, 4067] Both Significant True
354 6 15 centrifuge_norm 8.454 0.037 3.712 0.156 -0.125 0.000 -0.109 [58, 2, 46, 4147] [58, 2, 76, 3992] Only Full Significant True
365 7 0 mean_norm 46.384 0.000 46.336 0.000 0.084 0.583 0.422 [23, 37, 4, 4836] [58, 2, 4, 4678] Both Significant True
379 7 3 ganon_norm 40.843 0.000 40.799 0.000 0.062 0.583 0.554 [11, 49, 1, 4839] [46, 14, 1, 4681] Both Significant True
402 7 7 centrifuge_norm 12.768 0.005 5.586 0.061 -0.152 0.000 -0.135 [58, 2, 45, 4795] [58, 2, 83, 4599] Both Significant True
407 7 7 mean_norm 6.531 0.088 3.144 0.208 0.102 0.000 0.101 [58, 2, 104, 4736] [58, 2, 68, 4614] Only Full Significant True
413 7 10 mean_norm 6.531 0.088 3.144 0.208 0.102 0.000 0.101 [58, 2, 104, 4736] [58, 2, 68, 4614] Only Full Significant True
437 8 2 mean_norm 9.109 0.028 5.037 0.081 0.147 0.000 0.127 [58, 2, 76, 7660] [58, 2, 42, 7520] Both Significant True
439 8 3 ganon_norm 40.820 0.000 40.799 0.000 0.062 0.583 0.554 [11, 49, 1, 7735] [46, 14, 1, 7561] Both Significant True
443 8 3 mean_norm 9.109 0.028 5.037 0.081 0.147 0.000 0.127 [58, 2, 76, 7660] [58, 2, 42, 7520] Both Significant True
449 8 4 mean_norm 8.979 0.030 3.999 0.135 0.113 0.000 0.114 [58, 2, 111, 7625] [58, 2, 69, 7493] Only Full Significant True
482 9 0 kaiju_norm 997.648 0.000 888.595 0.000 -0.437 0.933 0.085 [1, 59, 1, 12540] [57, 3, 854, 11515] Both Significant True
485 9 0 mean_norm 24.758 0.000 24.539 0.000 0.101 -0.383 -0.228 [51, 9, 8, 12533] [28, 32, 1, 12368] Both Significant True
499 9 3 ganon_norm 38.413 0.000 38.405 0.000 0.051 0.567 0.519 [13, 47, 1, 12540] [47, 13, 1, 12368] Both Significant True
507 9 4 kraken2_norm 32.370 0.000 31.976 0.000 -0.109 0.400 0.210 [33, 27, 0, 12541] [57, 3, 7, 12362] Both Significant True
509 9 4 mean_norm 17.490 0.001 7.116 0.028 0.147 0.000 0.151 [58, 2, 129, 12412] [58, 2, 69, 12300] Both Significant True
529 9 10 ganon_norm 10.471 0.015 5.917 0.052 0.164 0.000 0.136 [58, 2, 70, 12471] [58, 2, 36, 12333] Both Significant True
535 9 15 ganon_norm 10.471 0.015 5.917 0.052 0.164 0.000 0.136 [58, 2, 70, 12471] [58, 2, 36, 12333] Both Significant True
False vs. True: Mann-Whitney-Wilcoxon test two-sided, P_val:1.167e-07 U_stat=4.738e+03
False vs. True: Mann-Whitney-Wilcoxon test two-sided, P_val:1.681e-04 U_stat=6.271e+03
False vs. True: Mann-Whitney-Wilcoxon test two-sided, P_val:6.430e-06 U_stat=6.282e+03
/tmp/ipykernel_2918775/3028195495.py:24: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(
/tmp/ipykernel_2918775/3028195495.py:24: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(
/tmp/ipykernel_2918775/3028195495.py:24: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(
No description has been provided for this image
In [102]:
df_pass_chi2_stats[df_pass_chi2_stats['are_diff'] == True].groupby('S').count()
Out[102]:
mode column chi2_full p_value_full chi2_partial p_value_partial precision_diff recall_diff f1_diff stats_pass0 stats_pass2 significance are_diff
S
0 12 12 12 12 12 12 12 12 12 12 12 12 12
2 3 3 3 3 3 3 3 3 3 3 3 3 3
3 10 10 10 10 10 10 10 10 10 10 10 10 10
4 3 3 3 3 3 3 3 3 3 3 3 3 3
6 1 1 1 1 1 1 1 1 1 1 1 1 1
7 5 5 5 5 5 5 5 5 5 5 5 5 5
10 2 2 2 2 2 2 2 2 2 2 2 2 2
15 2 2 2 2 2 2 2 2 2 2 2 2 2
In [103]:
df_pass_chi2_stats[(df_pass_chi2_stats['are_diff'] == True) & (df_pass_chi2_stats['S'] == 0)][['precision_diff',	'recall_diff',	'f1_diff']].median()
Out[103]:
precision_diff    0.032
recall_diff      -0.008
f1_diff           0.039
dtype: float64
In [104]:
df_pass_chi2_stats[(df_pass_chi2_stats['are_diff'] == True) & (df_pass_chi2_stats['S'] == 3)][['precision_diff',	'recall_diff',	'f1_diff']].median()
Out[104]:
precision_diff   0.062
recall_diff      0.567
f1_diff          0.540
dtype: float64
In [105]:
df_pass_chi2_stats[df_pass_chi2_stats['are_diff'] == True].groupby('column').count()
Out[105]:
mode S chi2_full p_value_full chi2_partial p_value_partial precision_diff recall_diff f1_diff stats_pass0 stats_pass2 significance are_diff
column
centrifuge_norm 5 5 5 5 5 5 5 5 5 5 5 5 5
ganon_norm 13 13 13 13 13 13 13 13 13 13 13 13 13
kaiju_norm 4 4 4 4 4 4 4 4 4 4 4 4 4
kraken2_norm 1 1 1 1 1 1 1 1 1 1 1 1 1
mean_norm 15 15 15 15 15 15 15 15 15 15 15 15 15
In [106]:
df_pass_chi2_stats[(df_pass_chi2_stats['are_diff'] == True) & (df_pass_chi2_stats['column'] == 'ganon_norm')][['precision_diff',	'recall_diff',	'f1_diff']].median()
Out[106]:
precision_diff   0.061
recall_diff      0.550
f1_diff          0.519
dtype: float64
In [107]:
df_pass_chi2_stats[(df_pass_chi2_stats['are_diff'] == True) & (df_pass_chi2_stats['column'] == 'mean_norm')][['precision_diff',	'recall_diff',	'f1_diff']].median()
Out[107]:
precision_diff   0.102
recall_diff      0.000
f1_diff          0.101
dtype: float64
In [108]:
df_pass_chi2_stats[df_pass_chi2_stats['are_diff'] == True].groupby('mode').count()
Out[108]:
S column chi2_full p_value_full chi2_partial p_value_partial precision_diff recall_diff f1_diff stats_pass0 stats_pass2 significance are_diff
mode
1 3 3 3 3 3 3 3 3 3 3 3 3 3
2 6 6 6 6 6 6 6 6 6 6 6 6 6
3 3 3 3 3 3 3 3 3 3 3 3 3 3
4 4 4 4 4 4 4 4 4 4 4 4 4 4
5 3 3 3 3 3 3 3 3 3 3 3 3 3
6 3 3 3 3 3 3 3 3 3 3 3 3 3
7 5 5 5 5 5 5 5 5 5 5 5 5 5
8 4 4 4 4 4 4 4 4 4 4 4 4 4
9 7 7 7 7 7 7 7 7 7 7 7 7 7

As we see, using a second pass generally does not significantly change the assignment of TP/FP/FN | TN.

In the cases where it changes, the f1-score generally improves, either by increasing the number of TP (by reducing the amount of FN) or reduces the amount of false positives.

Therefore, looking at nominal info choosing pass2 is the best option.

Checking at the correlation between the read counts¶

Now we are going to check the proportion of reads that are correctly assigned to gold-truth species. For that, we are going to show:

  • A correlation plot with pass0 and pass2 reads. We are also going to calculate the correlation between pass0 / pass2 and the expected counts.
In [109]:
df_numerical_stats
Out[109]:
pass mode profiler diff_counts MRE_counts MRED_counts MAE_counts MAED_counts MACV_counts corr_counts R2_counts taxid_counts species_counts expected_counts observed_counts diff_abundance MRE_abundance MRED_abundance MAE_abundance MAED_abundance MACV_abundance corr_abundance R2_abundance taxid_abundance species_abundance expected_abundance observed_abundance
0 0 1 centrifuge [-21.76736, -50.1056, -74.39946666666667, -58.... -16.939 23.674 17.173 23.505 1.369 0.635 0.576 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [244477.0, 93552.0, 48001.0, 51790.0, 189991.0... [-1.33929624238111, -37.07720182312471, -67.71... 4.750 29.856 26.450 14.640 0.554 0.801 0.576 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.0831469924255903, 1.1798024658164117, 0.605...
1 0 1 ganon [-23.91328, -55.785066666666665, -83.386666666... -28.729 21.022 28.729 21.022 0.732 0.558 0.590 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [237771.0, 82903.0, 31150.0, 42979.0, 159924.0... [7.790932959479549, -37.36135668956172, -76.46... 0.969 29.782 25.411 15.563 0.612 0.790 0.590 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.368466654983736, 1.1744745620707178, 0.4412...
2 0 1 kaiju [-53.25664, -79.21386666666666, -91.4437333333... -49.543 28.481 49.543 28.481 0.575 0.389 0.313 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [146073.0, 38974.0, 16043.0, 14537.0, 54507.0,... [-8.305719789553791, -59.224806809815064, -83.... -1.021 55.870 47.167 29.963 0.635 0.764 0.313 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [2.865446256576444, 0.7645348723159675, 0.3147...
3 0 1 kraken2 [-76.07584, -94.63306666666666, -99.4, -99.609... -65.147 24.902 65.147 24.902 0.382 0.176 0.122 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [74763.0, 10063.0, 1125.0, 488.0, 11530.0, 288... [-19.322126949109276, -81.90144330504141, -97.... 17.532 83.974 75.284 41.126 0.546 0.594 0.122 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [2.521183532840335, 0.3393479380304735, 0.0379...
4 0 1 krakenuniq [-19.62944, -48.239466666666665, -68.408533333... -16.729 23.969 17.018 23.765 1.396 0.642 0.572 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [251158.0, 97051.0, 59234.0, 56908.0, 183826.0... [-0.4454715144359227, -35.88453918746712, -60.... 3.147 29.691 25.571 15.414 0.603 0.795 0.572 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.1110790151738774, 1.2021648902349915, 0.733...
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
103 2 9 ganon [3.88512, -35.4976, -56.00053333333334, -32.8,... -1.882 23.489 18.223 14.940 0.820 0.751 0.659 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [324641.0, 120942.0, 82499.0, 84000.0, 215092.... [5.0978227572311425, -34.74463135224731, -55.4... -0.736 23.763 18.879 14.451 0.765 0.760 0.659 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.284306961163473, 1.223538162145363, 0.83462...
104 2 9 kaiju [-48.45856, -76.704, -89.0208, -85.9616, -80.8... -43.474 29.718 43.474 29.718 0.684 0.422 0.324 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [161067.0, 43680.0, 20586.0, 17548.0, 59848.0,... [-9.872292921902698, -59.26355445072247, -80.8... -1.156 51.967 43.407 28.597 0.659 0.738 0.324 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [2.8164908461905407, 0.7638083540489538, 0.359...
105 2 9 kraken2 [-39.60768, -84.74186666666667, -95.9482666666... -32.087 35.514 32.087 35.514 1.107 0.419 0.235 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [188726.0, 28609.0, 7597.0, 4884.0, 33042.0, 6... [0.13138111998259205, -74.70178386954647, -93.... 12.601 58.882 53.987 26.671 0.494 0.695 0.235 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.129105659999456, 0.4743415524460034, 0.1259...
106 2 9 krakenuniq [-19.67168, -48.3264, -68.4816, -54.56, -41.30... -16.778 23.985 17.064 23.782 1.394 0.641 0.572 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [251026.0, 96888.0, 59097.0, 56800.0, 183437.0... [-0.37064433374560224, -35.91043018258363, -60... 3.219 29.748 25.644 15.417 0.601 0.796 0.572 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.11341736457045, 1.2016794340765569, 0.73296...
107 2 9 mean [-20.390584802138655, -49.665829444940506, -66... -16.259 21.692 16.933 21.171 1.250 0.638 0.622 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [248779.4224933167, 94376.56979073654, 62310.1... [-1.6718373579364112, -43.06956794651194, -63.... 8.287 32.919 29.573 16.667 0.564 0.799 0.536 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.072755082564487, 1.0674456010029012, 0.6795...

108 rows × 27 columns

In [110]:
# Split the numerical stats by host-mapping pass and index by (mode, profiler)
# so the two passes can be matched row-for-row.
df_pass_0 = df_numerical_stats[df_numerical_stats['pass'] == 0].set_index(['mode', 'profiler'])
df_pass_2 = df_numerical_stats[df_numerical_stats['pass'] == 2].set_index(['mode', 'profiler'])

# Side-by-side frame with a top-level 'pass0' / 'pass2' column index.
aligned_df = pd.concat([df_pass_0, df_pass_2], axis=1, keys=['pass0', 'pass2'])


def _build_comparison_row(row):
    """Collapse one aligned (pass0, pass2) row into a flat comparison record."""
    mode, profiler = row.name
    return pd.Series({
        'passn': 'comparison',  # Mark as comparison for clarity
        'mode': mode,
        'profiler': profiler,
        'taxid_counts': row['pass0']['taxid_counts'],  # Metadata is identical across passes
        'expected_counts': row['pass0']['expected_counts'],
        'pass0_counts': row['pass0']['observed_counts'],
        'pass2_counts': row['pass2']['observed_counts'],
        'pass0_MAE': row['pass0']['MAE_counts'],
        'pass0_diff': row['pass0']['diff_counts'],
        'pass2_MAE': row['pass2']['MAE_counts'],
        'pass2_diff': row['pass2']['diff_counts'],
    })


# Build the per-(mode, profiler) comparison table and drop the MultiIndex.
comparison_df = aligned_df[['pass0', 'pass2']].apply(_build_comparison_row, axis=1).reset_index(drop=True)
In [111]:
plt.rc('axes', linewidth=0.65)

# Filter the DataFrame for the specific mode (5 in this case)
# NOTE(review): the mode is hardcoded — consider lifting it to a config constant.
mode_df = comparison_df[comparison_df['mode'] == 5]

# Get unique profilers for layout (one column of subplots per profiler)
profilers = mode_df['profiler'].unique()
num_profilers = len(profilers)

# Set up the figure with GridSpec for custom layout: 3 rows of comparisons,
# sharing both axes across panels.
fig, axs = plt.subplots(3, num_profilers, figsize=(2 * num_profilers, 6), sharex=True, sharey=True)

# Define plotting functions with regression line
def plot_pass0_vs_pass2(ax, data, profiler, i):
    x = data['pass0_counts'].values[0]
    y = data['pass2_counts'].values[0]
    
    # Scatter plot
    ax.scatter(x, y, alpha=0.7)
    
    # Plot y=x line
    min_val, max_val = min(x.min(), y.min()), max(x.max(), y.max())
    ax.plot([0, 350000], [0, 350000], '--', color='#DC267F')
    
    # Pearson correlation and R^2
    slope, intercept, r_value, _, _ = linregress(x, y)
    r2 = r_value ** 2
    ax.annotate(f"Slope: {slope:.2f}\nR$^2$: {r2:.2f}", xy=(0.05, 0.95),
                xycoords='axes fraction', fontsize=12, verticalalignment='top')
    if i == 0:
        ax.set_ylabel("Pass0 vs Pass2", fontsize=12)
    # Set profiler title on top
    ax.set_title(profiler, fontsize=14)

    sns.despine(top=True, right=True, ax=ax)

    
def plot_with_regression(ax, x, y, xlabel, i):
    # Scatter plot
    ax.scatter(x, y, alpha=0.7)

    # Regression line
    slope, intercept, r_value, _, _ = linregress(x, y)
    ax.plot(x, slope * x + intercept, '-', color='#DC267F')

    # Pearson correlation and R^2
    r2 = r_value ** 2
    ax.annotate(f"Slope: {slope:.2f}\nR$^2$: {r2:.2f}", xy=(0.05, 0.95),
                xycoords='axes fraction', fontsize=12, verticalalignment='top')
    if i == 0:
        ax.set_ylabel(xlabel, fontsize=12)
    sns.despine(top=True, right=True, ax=ax)

# Iterate through profilers and create subplots for each
for i, profiler in enumerate(profilers):
    profiler_data = mode_df[mode_df['profiler'] == profiler]
    x_pass0 = profiler_data['pass0_counts'].values[0]
    x_expected = profiler_data['expected_counts'].values[0]
    y_pass2 = profiler_data['pass2_counts'].values[0]
    
    # Pass0 vs Pass2 (Row 1)
    ax = fig.add_subplot(axs[0, i])
    plot_pass0_vs_pass2(ax, profiler_data, profiler, i)
    
    # Pass2 vs Expected (Row 2)
    ax = fig.add_subplot(axs[1, i])
    plot_with_regression(ax, x_expected, y_pass2, "Pass2 vs Expected", i)
    
    # Pass0 vs Expected (Row 3)
    ax = fig.add_subplot(axs[2, i])
    plot_with_regression(ax, x_expected, x_pass0, "Pass0 vs Expected", i)

    ax.set_xticks([0, 300000])
    ax.set_yticks([0, 300000])



# Aesthetic adjustments
for ax in axs[-1, :]:
    ax.set_xlabel("Counts", fontsize=12)  # Only bottom row has x-labels

plt.tight_layout()

for format in ['png', 'tiff']: 
    plt.savefig(f'{RESULTS_DIR}/figures/paper/figD.{format}', dpi=DPI)


plt.show()
No description has been provided for this image

We see that the correlation in number of counts for the ground truth species is almost one. Additionally, if there are differences, they improve the correlation to the expected amount of counts, so it has a positive impact.

Checking MAE values¶

We are going to see if the MAE values are equal or higher in pass2 vs pass0.

In [112]:
# NOTE: seaborn and matplotlib are already imported at the top of the notebook;
# the redundant in-cell re-imports were removed.

# Long format: one row per (mode, profiler, pass_type) with the list of diffs
comparison_df_exploded = comparison_df.melt(
    id_vars=['mode', 'profiler'],
    value_vars=['pass0_diff', 'pass2_diff'],
    var_name='pass_type',
    value_name='diff_value'
)

# Convert `diff_value` from a list to separate rows
comparison_df_exploded = comparison_df_exploded.explode('diff_value')

# Ensure `diff_value` is numeric after exploding
comparison_df_exploded['diff_value'] = pd.to_numeric(comparison_df_exploded['diff_value'])


# Get unique profilers
profilers = comparison_df_exploded['profiler'].unique()
num_profilers = len(profilers)

# One boxplot panel per profiler, sharing the y axis
fig, axs = plt.subplots(1, num_profilers, figsize=(6 * num_profilers, 6), sharey=True)

# Iterate through profilers and plot the data
for i, profiler in enumerate(profilers):
    ax = axs[i]
    profiler_data = comparison_df_exploded[comparison_df_exploded['profiler'] == profiler]

    # Pass0 vs pass2 diffs side by side per mode
    sns.boxplot(
        data=profiler_data,
        x='mode',
        y='diff_value',
        hue='pass_type',
        ax=ax,
        palette='viridis'
    )

    # Set plot title and labels
    ax.set_title(f'Profiler: {profiler}', fontsize=14)
    ax.set_xlabel('Mode', fontsize=12)
    if i == 0:
        ax.set_ylabel('Diff Values', fontsize=12)
    else:
        ax.set_ylabel('')

# Adjust layout and display
plt.tight_layout()
plt.show()
No description has been provided for this image

We see that, again, the differences in MAE are not due to the pass, with the exception of ganon (and, by extension, the mean), where running the data with pass2 improves the error rate.

Thus, both based on nominal and numerical criteria, running with pass2 is the best option.

Which mode is best for the data?¶

We have used several modes to study how it affects the detection of species, and the number of reads it detects. We are going to use the same methods as before to check for the answer.

  • We are going to check which mode (using S=2 and S=7) retains the best F1 scores.
  • We are going to check which mode keeps the best correlation / MAE with expected counts.

Checking F1-scores¶

In [113]:
# Keep pass 2 rows, the S values of interest, and only per-profiler (+ mean) columns
profiler_cols = [f'{p}_norm' for p in LIST_PROFILERS + ['mean']]
nominal_mask = (
    (df_nominal_stats['pass'] == 2)
    & df_nominal_stats['S'].isin([0, 2, 7, 15])
    & df_nominal_stats['column'].isin(profiler_cols)
)
df_nominal_stats_sub = df_nominal_stats[nominal_mask]
In [114]:
# Preview the filtered nominal-stats table (rich display as the cell output)
df_nominal_stats_sub
Out[114]:
pass mode S column precision recall f1 kappa TP|FN|FP|TN
1260 2 1 0 centrifuge_norm 1.000 0.050 0.095 0.094 (3, 57, 0, 3522)
1261 2 1 0 ganon_norm 1.000 0.050 0.095 0.094 (3, 57, 0, 3522)
1262 2 1 0 kaiju_norm 1.000 0.100 0.182 0.179 (6, 54, 0, 3522)
1263 2 1 0 kraken2_norm 1.000 0.217 0.356 0.352 (13, 47, 0, 3522)
1264 2 1 0 krakenuniq_norm 1.000 0.067 0.125 0.123 (4, 56, 0, 3522)
... ... ... ... ... ... ... ... ... ...
2507 2 9 15 ganon_norm 0.617 0.967 0.753 0.752 (58, 2, 36, 12333)
2508 2 9 15 kaiju_norm 0.533 0.950 0.683 0.681 (57, 3, 50, 12319)
2509 2 9 15 kraken2_norm 0.891 0.950 0.919 0.919 (57, 3, 7, 12362)
2510 2 9 15 krakenuniq_norm 0.400 0.967 0.566 0.563 (58, 2, 87, 12282)
2516 2 9 15 mean_norm 0.310 0.967 0.470 0.466 (58, 2, 129, 12240)

216 rows × 9 columns

In [115]:
plt.rc('axes', linewidth=0.65)

# Long format: one row per (mode, S, column, metric)
melted_df = pd.melt(
    df_nominal_stats_sub,
    id_vars=['mode', 'S', 'column'],
    value_vars=['recall', 'precision', 'f1'],
    var_name='metric',
    value_name='score'
)

# BUGFIX: the colormap is applied to S values, so normalize over the S range
# (the original normalized over 'mode', which only worked via clipping).
norm = Normalize(vmin=melted_df['S'].min(), vmax=melted_df['S'].max())
cmap = plt.cm.viridis  # Choose a colormap (e.g., 'viridis', 'plasma', 'cividis')

# FacetGrid: one row per metric, one column per profiler
g = sns.FacetGrid(
    melted_df, 
    col='column', 
    row='metric', 
    height=2, 
    sharey=True, 
    sharex=True
)

# One line per S value, coloured by S through the colormap
def lineplot_with_cmap(data, **kwargs):
    for S in sorted(data['S'].unique()):
        subset = data[data['S'] == S]
        # BUGFIX: label referenced a leaked global `mode`; label by the S drawn
        plt.plot(subset['mode'], subset['score'], label=f"S {S}",
                 color=cmap(norm(S)), marker='o')

g.map_dataframe(lineplot_with_cmap)

# Create a legend with one entry per discrete S value
handles = [
    plt.Line2D([0], [0], color=cmap(norm(s_val)), marker='o', linestyle='', label=f"S {s_val}")
    for s_val in sorted(melted_df['S'].unique())
]
plt.legend(
    handles=handles, 
    title="", 
    bbox_to_anchor=(1.05, 3), 
    loc='center left', 
    frameon=False
)

# Set x-axis ticks to the modes actually present
g.set(xticks=df_nominal_stats_sub['mode'].unique())

for ax in g.axes.ravel():
    ax.set_title('')

# Column titles: profiler names (strip the '_norm' suffix)
for ax, profiler in zip(g.axes[0, :], melted_df['column'].unique()):
    ax.set_title(profiler.replace('_norm', ''))

# Row labels: one per metric
for ax, score in zip(g.axes[:, 0], ['recall', 'precision', 'F1-score']):
    ax.set_ylabel(score)

plt.tight_layout()

for format in ['png', 'tiff']: 
    plt.savefig(f'{RESULTS_DIR}/figures/paper/figF.{format}', dpi=DPI, bbox_inches='tight')

plt.show()
/tmp/ipykernel_2918775/1058285318.py:60: UserWarning: Tight layout not applied. tight_layout cannot make Axes height small enough to accommodate all Axes decorations.
  plt.tight_layout()
No description has been provided for this image

Based on the ground-truth table, the precision, recall and F1-score are more S-dependent than mode-dependent, and their patterns are more profiler-dependent than anything else. Therefore, the mode is not especially relevant for proper species detection; moreover, since higher mode values lead to lower precision (more FPs), we could consider using a lower mode if we want to minimize the number of FPs.

Checking correlation and MAE values¶

In [116]:
# Keep pass 2 rows for the odd modes under study
pass2_mask = df_numerical_stats['pass'] == 2
mode_mask = df_numerical_stats['mode'].isin([1, 3, 5, 7, 9])
df_numerical_stats_sub = df_numerical_stats[pass2_mask & mode_mask]
In [117]:
# Get unique profilers for layout
profilers = df_numerical_stats_sub['profiler'].unique()
modes = df_numerical_stats_sub['mode'].unique()


# Grid: one row per profiler, one column per mode
fig, axs = plt.subplots(len(profilers), len(modes), figsize=(2 * len(modes), 2 * len(profilers)), sharex=True, sharey=True)


def plot_regression(ax, data, profiler, i):
    """Scatter expected vs observed counts with a y=x line and slope/R^2."""
    x = data['expected_counts'].values[0]
    y = data['observed_counts'].values[0]

    # Scatter plot
    ax.scatter(x, y, alpha=0.7)

    # y = x reference line; fixed range keeps the panels comparable
    ax.plot([0, 350000], [0, 350000], 'r--')

    # Linear fit: slope and R^2 annotation
    slope, intercept, r_value, _, _ = linregress(x, y)
    r2 = r_value ** 2
    ax.annotate(f"Slope: {slope:.2f}\nR2: {r2:.2f}", xy=(0.05, 0.95),
                xycoords='axes fraction', fontsize=10, verticalalignment='top')


# One panel per (profiler, mode) pair
for i, profiler in enumerate(profilers):
    for j, mode in enumerate(modes):
        profiler_data = df_numerical_stats_sub[(df_numerical_stats_sub['profiler'] == profiler) & (df_numerical_stats_sub['mode'] == mode)]

        # BUGFIX: use the grid axes directly; passing an existing Axes to
        # fig.add_subplot() is deprecated and removed in recent matplotlib.
        ax = axs[i, j]
        plot_regression(ax, profiler_data, profiler, i)

        if j == 0:
            ax.set_ylabel(profiler, fontsize=12)
            # Set profiler title on top
        if i == 0:
            ax.set_title(f'Mode {mode}', fontsize=14)

# Aesthetic adjustments
for ax in axs[-1, :]:
    ax.set_xlabel("Counts", fontsize=12)  # Only bottom row has x-labels

plt.tight_layout()
plt.show()
No description has been provided for this image
In [118]:
# Prepare the data for plotting: expand per-row diff_counts lists to long format
expanded_data = []
for _, row in df_numerical_stats_sub.iterrows():
    profiler = row['profiler']
    mode = row['mode']
    for value in row['diff_counts']:
        expanded_data.append({'profiler': profiler, 'mode': mode, 'diff_counts': value})

# Convert to a new DataFrame
plot_data = pd.DataFrame(expanded_data)

# Initialize the grid of plots with seaborn
g = sns.FacetGrid(plot_data, col="profiler", height=4, aspect=1)

# BUGFIX: assign `hue` explicitly — passing `palette` without `hue` is
# deprecated and will be removed in seaborn v0.14 (was emitting FutureWarnings).
g.map_dataframe(
    sns.boxplot,
    x="mode",
    y="diff_counts",
    hue="mode",
    palette="viridis",
    legend=False
)

# Adjust the legend and titles
g.set_axis_labels("Mode", "Difference Counts")
g.set_titles(col_template="{col_name} Profiler")

plt.tight_layout()
plt.show()
/home/neuroimm/.local/share/mamba/envs/EVs/lib/python3.13/site-packages/seaborn/axisgrid.py:854: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  func(*plot_args, **plot_kwargs)
/home/neuroimm/.local/share/mamba/envs/EVs/lib/python3.13/site-packages/seaborn/axisgrid.py:854: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  func(*plot_args, **plot_kwargs)
/home/neuroimm/.local/share/mamba/envs/EVs/lib/python3.13/site-packages/seaborn/axisgrid.py:854: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  func(*plot_args, **plot_kwargs)
/home/neuroimm/.local/share/mamba/envs/EVs/lib/python3.13/site-packages/seaborn/axisgrid.py:854: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  func(*plot_args, **plot_kwargs)
/home/neuroimm/.local/share/mamba/envs/EVs/lib/python3.13/site-packages/seaborn/axisgrid.py:854: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  func(*plot_args, **plot_kwargs)
/home/neuroimm/.local/share/mamba/envs/EVs/lib/python3.13/site-packages/seaborn/axisgrid.py:854: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  func(*plot_args, **plot_kwargs)
No description has been provided for this image
In [119]:
from matplotlib.ticker import MaxNLocator

plt.rc('axes', linewidth=0.65)

# One scatter panel per profiler: correlation vs R^2, coloured by mode
grid = sns.FacetGrid(df_numerical_stats_sub, col="profiler", height=2.3, aspect=0.85, sharex=False, sharey=False)

grid.map_dataframe(
    sns.scatterplot,
    x="corr_counts",
    y="R2_counts",
    hue="mode",
    palette="viridis"
)

# Legend outside the grid, titled by mode
grid.add_legend(title="Mode", bbox_to_anchor=(1.05, 0.5))

grid.set_axis_labels("Correlation Counts", "R² Counts")
grid.set_titles(col_template="{col_name}")

# Sparse x ticks so the narrow panels stay readable
for panel_ax in grid.axes.flat:
    panel_ax.xaxis.set_major_locator(MaxNLocator(nbins=2))


plt.tight_layout()

for format in ['png', 'tiff']:
    plt.savefig(f'{RESULTS_DIR}/figures/paper/figGA.{format}', dpi=DPI, bbox_inches='tight')

plt.show()
No description has been provided for this image
In [120]:
# One scatter panel per profiler: MAE vs MAED, coloured by mode
grid = sns.FacetGrid(df_numerical_stats_sub, col="profiler", height=2.3, aspect=0.85, sharex=False, sharey=False)

grid.map_dataframe(
    sns.scatterplot,
    x="MAE_counts",
    y="MAED_counts",
    hue="mode",
    palette="viridis"
)

# Legend outside the grid, titled by mode
grid.add_legend(title="Mode", bbox_to_anchor=(1.05, 0.5))

grid.set_axis_labels("MAE", "MAED")
grid.set_titles(col_template="{col_name}")

plt.tight_layout()

for format in ['png', 'tiff']:
    plt.savefig(f'{RESULTS_DIR}/figures/paper/figGB.{format}', dpi=DPI, bbox_inches='tight')


plt.show()
No description has been provided for this image
In [121]:
# One scatter panel per profiler: MRE vs MRED, coloured by mode
grid = sns.FacetGrid(df_numerical_stats_sub, col="profiler", height=2.3, aspect=0.85, sharex=False, sharey=False)

grid.map_dataframe(
    sns.scatterplot,
    x="MRE_counts",
    y="MRED_counts",
    hue="mode",
    palette="viridis"
)

# Legend outside the grid, titled by mode
grid.add_legend(title="Mode", bbox_to_anchor=(1.05, 0.5))

grid.set_axis_labels("MRE", "MRED")
grid.set_titles(col_template="{col_name}")

plt.tight_layout()

for format in ['png', 'tiff']:
    plt.savefig(f'{RESULTS_DIR}/figures/paper/figGC.{format}', dpi=DPI, bbox_inches='tight')


plt.show()
No description has been provided for this image
In [122]:
# Work on pass == 2 rows only
df = df_numerical_stats[df_numerical_stats['pass'] == 2].copy()


# Per-mode MRED of the 'mean' profiler, used as the reference value
df_mean = df[df['profiler'] == 'mean'][['mode', 'MRED_counts']].rename(columns={'MRED_counts': 'mean_MRED'})
df = df.merge(df_mean, on='mode')
df['difference'] = df['MRED_counts'] - df['mean_MRED']


# Filter out the mean profiler itself
df_filtered = df[df['profiler'] != 'mean'].sort_values('profiler')


# Paired one-sided Wilcoxon signed-rank tests (the original comment said
# "Mann-Whitney U", but the code correctly uses the paired Wilcoxon test):
# is each profiler's MRED greater than the mean profiler's MRED across modes?
profiler_groups = df_filtered['profiler'].unique()
p_values = []
results = []


for profiler in profiler_groups:
    group_data = df_filtered[df_filtered['profiler'] == profiler]
    mean_data = group_data['MRED_counts'].values
    prof_data = group_data['mean_MRED'].values
    stat, p = wilcoxon(mean_data, prof_data, alternative='greater')
    p_values.append(p)
    results.append({'profiler': profiler, 'statistic': stat, 'p_value': p})

# Correct p-values for multiple testing (Benjamini-Hochberg FDR)
p_values_corrected = multipletests(p_values, method='fdr_bh')[1]

for i, result in enumerate(results):
    result['p_value_corrected'] = p_values_corrected[i]


# Convert results to DataFrame
results_df = pd.DataFrame(results)

display(results_df)

# Plot the differences
fig, ax = plt.subplots(1, 1, figsize=(5, 3))

# BUGFIX: assign `hue` explicitly — passing `palette` without `hue` is
# deprecated and will be removed in seaborn v0.14 (was emitting FutureWarnings).
sns.boxplot(data=df_filtered, x='profiler', y='difference', hue='profiler',
            palette=custom_palette, legend=False, ax=ax)
# sns.stripplot(data=df_filtered, x='profiler', y='difference', color='black', alpha=0.5, jitter=True)
medians = df_filtered.groupby("profiler")['difference'].median()
for i, median in enumerate(medians):
        ax.text(i, median , f"{median:.2f}", ha='center', va='bottom', fontsize=8)


plt.axhline(0, linestyle='--', color='#848484', linewidth=0.5)
plt.ylabel('MRED difference from mean')
plt.xlabel('')

sns.despine(top=True, right=True)

plt.tight_layout()

# NOTE(review): the MAED cell below also saves to figJ and overwrites this
# figure — one of the two filenames is probably wrong; confirm intended names.
for format in ['png', 'tiff']: 
    plt.savefig(f'{RESULTS_DIR}/figures/paper/figJ.{format}', dpi=DPI, bbox_inches='tight')


plt.show()
profiler statistic p_value p_value_corrected
0 centrifuge 45.000 0.002 0.002
1 ganon 45.000 0.002 0.002
2 kaiju 45.000 0.002 0.002
3 kraken2 45.000 0.002 0.002
4 krakenuniq 45.000 0.002 0.002
/tmp/ipykernel_2918775/2421930555.py:45: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=df_filtered, x='profiler', y='difference', palette=custom_palette, ax=ax)
/tmp/ipykernel_2918775/2421930555.py:45: UserWarning: The palette list has more values (6) than needed (5), which may not be intended.
  sns.boxplot(data=df_filtered, x='profiler', y='difference', palette=custom_palette, ax=ax)
No description has been provided for this image
In [123]:
# Work on pass == 2 rows only
df = df_numerical_stats[df_numerical_stats['pass'] == 2].copy()


# Per-mode MAED of the 'mean' profiler, used as the reference value
df_mean = df[df['profiler'] == 'mean'][['mode', 'MAED_counts']].rename(columns={'MAED_counts': 'mean_MAED'})
df = df.merge(df_mean, on='mode')
df['difference'] = df['MAED_counts'] - df['mean_MAED']


# Filter out the mean profiler itself
df_filtered = df[df['profiler'] != 'mean'].sort_values('profiler')


# Paired one-sided Wilcoxon signed-rank tests (the original comment said
# "Mann-Whitney U", but the code correctly uses the paired Wilcoxon test):
# is each profiler's MAED greater than the mean profiler's MAED across modes?
profiler_groups = df_filtered['profiler'].unique()
p_values = []
results = []


for profiler in profiler_groups:
    group_data = df_filtered[df_filtered['profiler'] == profiler]
    mean_data = group_data['MAED_counts'].values
    prof_data = group_data['mean_MAED'].values
    stat, p = wilcoxon(mean_data, prof_data, alternative='greater')
    p_values.append(p)
    results.append({'profiler': profiler, 'statistic': stat, 'p_value': p})

# Correct p-values for multiple testing (Benjamini-Hochberg FDR)
p_values_corrected = multipletests(p_values, method='fdr_bh')[1]

for i, result in enumerate(results):
    result['p_value_corrected'] = p_values_corrected[i]


# Convert results to DataFrame
results_df = pd.DataFrame(results)

display(results_df)

# Plot the differences
fig, ax = plt.subplots(1, 1, figsize=(5, 3))

# BUGFIX: assign `hue` explicitly — passing `palette` without `hue` is
# deprecated and will be removed in seaborn v0.14 (was emitting FutureWarnings).
sns.boxplot(data=df_filtered, x='profiler', y='difference', hue='profiler',
            palette=custom_palette, legend=False, ax=ax)
# sns.stripplot(data=df_filtered, x='profiler', y='difference', color='black', alpha=0.5, jitter=True)
medians = df_filtered.groupby("profiler")['difference'].median()
for i, median in enumerate(medians):
        ax.text(i, median , f"{median:.2f}", ha='center', va='bottom', fontsize=8)



plt.axhline(0, linestyle='--', color='#848484', linewidth=0.5)
plt.ylabel('MAED difference from mean')
plt.xlabel('')

sns.despine(top=True, right=True)

plt.tight_layout()

# NOTE(review): this overwrites the figJ files saved by the MRED cell above —
# one of the two filenames is probably wrong; confirm intended names.
for format in ['png', 'tiff']: 
    plt.savefig(f'{RESULTS_DIR}/figures/paper/figJ.{format}', dpi=DPI, bbox_inches='tight')


plt.show()
profiler statistic p_value p_value_corrected
0 centrifuge 45.000 0.002 0.002
1 ganon 7.000 0.973 0.973
2 kaiju 45.000 0.002 0.002
3 kraken2 45.000 0.002 0.002
4 krakenuniq 45.000 0.002 0.002
/tmp/ipykernel_2918775/683032758.py:45: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=df_filtered, x='profiler', y='difference', palette=custom_palette, ax=ax)
/tmp/ipykernel_2918775/683032758.py:45: UserWarning: The palette list has more values (6) than needed (5), which may not be intended.
  sns.boxplot(data=df_filtered, x='profiler', y='difference', palette=custom_palette, ax=ax)
No description has been provided for this image
In [124]:
# Inspect the pass-2 numerical stats for modes 3 and 7 (rich display as output)
df_numerical_stats_sub[df_numerical_stats_sub['mode'].isin([3, 7])]
Out[124]:
pass mode profiler diff_counts MRE_counts MRED_counts MAE_counts MAED_counts MACV_counts corr_counts R2_counts taxid_counts species_counts expected_counts observed_counts diff_abundance MRE_abundance MRED_abundance MAE_abundance MAED_abundance MACV_abundance corr_abundance R2_abundance taxid_abundance species_abundance expected_abundance observed_abundance
66 2 3 centrifuge [-21.29664, -49.24053333333333, -72.7157333333... -16.551 23.325 16.808 23.141 1.377 0.642 0.586 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [245948.0, 95174.0, 51158.0, 53926.0, 192262.0... [-1.6074186089308142, -36.542036383614835, -65... 4.325 29.161 25.668 14.498 0.565 0.803 0.586 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.074768168470912, 1.1898368178072218, 0.6395...
67 2 3 ganon [-6.65408, -46.7088, -73.0704, -50.604, -40.65... -8.143 26.077 19.274 19.361 1.005 0.683 0.580 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [291706.0, 99921.0, 50493.0, 61745.0, 185450.0... [3.2451929989731525, -41.05730353135007, -70.2... 1.598 28.843 24.447 15.387 0.629 0.755 0.580 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.226412281217911, 1.105175558787186, 0.55847...
68 2 3 kaiju [-52.1184, -78.6064, -90.92586666666666, -87.8... -48.224 28.792 48.224 28.792 0.597 0.398 0.316 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [149630.0, 40113.0, 17014.0, 15201.0, 55766.0,... [-6.421758363959896, -58.18904401137832, -82.2... 1.190 56.270 47.277 30.539 0.646 0.779 0.316 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [2.9243200511262533, 0.7839554247866565, 0.332...
69 2 3 kraken2 [-61.60288, -91.8592, -98.78133333333334, -99.... -50.081 32.137 50.081 32.137 0.642 0.278 0.161 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [119991.0, 15264.0, 2285.0, 1020.0, 17562.0, 4... [-11.271327674607562, -81.18811057530996, -97.... 15.354 74.263 67.854 33.862 0.499 0.642 0.161 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [2.7727710101685137, 0.3527229267129384, 0.052...
70 2 3 krakenuniq [-19.67168, -48.3264, -68.4816, -54.56, -41.30... -16.778 23.985 17.064 23.782 1.394 0.641 0.572 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [251026.0, 96888.0, 59097.0, 56800.0, 183437.0... [-0.37064433374560224, -35.91043018258363, -60... 3.219 29.748 25.644 15.417 0.601 0.796 0.572 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.11341736457045, 1.2016794340765569, 0.73296...
71 2 3 mean [-25.802934689620443, -53.521229040380774, -73... -20.388 21.741 20.388 21.741 1.066 0.601 0.601 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [231865.8290949361, 87147.69554928604, 49730.5... [-3.0213808182247845, -44.51661135451871, -68.... 12.824 38.048 34.447 20.629 0.599 0.804 0.485 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.0305818494304755, 1.040313537102774, 0.5921...
90 2 7 centrifuge [-20.50976, -47.4128, -69.89973333333333, -54.... -15.904 22.759 16.191 22.556 1.393 0.653 0.603 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [248407.0, 98601.0, 56438.0, 56618.0, 196085.0... [-2.4354608041876133, -35.45564920173818, -63.... 3.218 27.934 24.137 14.425 0.598 0.801 0.603 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.048891849869137, 1.2102065774674091, 0.6927...
91 2 7 ganon [2.87936, -39.4528, -60.080533333333335, -36.5... -2.179 24.830 19.200 15.894 0.828 0.734 0.629 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [321498.0, 113526.0, 74849.0, 79318.0, 200909.... [5.609860704572441, -37.84582876437132, -59.02... 0.418 25.489 20.717 14.855 0.717 0.754 0.629 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.3003081470178888, 1.1653907106680377, 0.768...
92 2 7 kaiju [-49.07328, -77.00266666666667, -89.2613333333... -44.465 29.503 44.465 29.503 0.664 0.419 0.323 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [159146.0, 43120.0, 20135.0, 17133.0, 58980.0,... [-9.114576548016046, -58.958236888878254, -80.... -0.890 52.653 43.948 29.012 0.660 0.747 0.323 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [2.8401694828744986, 0.7695330583335327, 0.359...
93 2 7 kraken2 [-44.83456, -87.64213333333333, -97.136, -97.6... -36.009 35.648 36.009 35.648 0.990 0.386 0.212 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [172392.0, 23171.0, 5370.0, 2881.0, 26475.0, 6... [-2.383528721026508, -78.1324804708156, -94.93... 13.233 63.080 58.103 27.897 0.480 0.682 0.212 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.0505147274679216, 0.4100159911722076, 0.095...
94 2 7 krakenuniq [-19.67168, -48.3264, -68.4816, -54.56, -41.30... -16.778 23.985 17.064 23.782 1.394 0.641 0.572 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [251026.0, 96888.0, 59097.0, 56800.0, 183437.0... [-0.37064433374560224, -35.91043018258363, -60... 3.219 29.748 25.644 15.417 0.601 0.796 0.572 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.11341736457045, 1.2016794340765569, 0.73296...
95 2 7 mean [-21.385476139592473, -51.124221746876785, -68... -16.856 21.928 17.401 21.498 1.235 0.629 0.614 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [245670.38706377352, 91642.08422460603, 58892.... [-1.5125795343509907, -43.63677614078014, -64.... 9.779 34.128 30.912 17.460 0.565 0.803 0.524 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.0777318895515315, 1.0568104473603723, 0.659...
In [125]:
def plot_prof_couns(df_combo, fig_mode=None):
    """Strip-plot of per-species count differences (PRE) for each profiler.

    For every profiler, overlays the median (pink tick) and a median ± std
    interval of ``diff_counts`` on top of the raw per-species points, then
    saves the figure as figH{fig_mode}.{png,tiff}.

    Parameters
    ----------
    df_combo : pd.DataFrame
        Must contain a 'profiler' column and a 'diff_counts' column whose
        cells are list-like per-species differences.
    fig_mode : optional
        Value embedded in the output file name. Defaults to the
        notebook-global ``mode`` to stay backward compatible with the
        original implicit global lookup.
    """
    if fig_mode is None:
        fig_mode = mode  # NOTE(review): falls back to the notebook-global `mode`

    # One row per (profiler, species) difference so seaborn can jitter them.
    df_expanded = df_combo.explode('diff_counts')
    df_expanded['diff_counts'] = pd.to_numeric(df_expanded['diff_counts'])

    plt.figure(figsize=(10, 2.5))

    # Median marker plus a median ± std spread bar per profiler.
    for profiler, countsdiff in zip(df_combo['profiler'], df_combo['diff_counts']):
        center = np.median(countsdiff)
        spread = np.std(countsdiff)
        plt.scatter(center, profiler, color='#DC267F', label='_nolegend_',
                    s=100, zorder=3, marker='|')
        plt.plot([center - spread, center + spread], [profiler, profiler],
                 color='#DC267F', label='_nolegend_')

    sns.stripplot(
        data=df_expanded,
        x='diff_counts',
        y='profiler',
        jitter=True,  # jitter for better visibility of overlapping points
        size=5,
        alpha=0.7,
        color="#648FFF"
    )

    # Reference line at zero difference (perfect agreement with expectation).
    plt.axvline(0, color='#848484', linestyle='--', linewidth=1)

    sns.despine(top=True, right=True)

    plt.xlabel('PRE', fontsize=12)
    plt.ylabel('', fontsize=12)
    plt.grid(True, axis='x', linestyle='--', alpha=0.6)
    plt.tight_layout()

    # `fmt` avoids shadowing the built-in `format`.
    for fmt in ['png', 'tiff']:
        plt.savefig(f'{RESULTS_DIR}/figures/paper/figH{fig_mode}.{fmt}',
                    dpi=DPI, bbox_inches='tight')

    plt.show()
In [126]:
# Re-plot the per-profiler PRE distributions for each candidate mode value.
for mode in [3, 5, 7]:
    print(mode)
    # Pass-2 results only, restricted to the current mode.
    # NOTE(review): the original comment mentioned choosing a large "S"
    # value, but no S filter is applied here — confirm whether one is needed.
    is_selected = (df_numerical_stats['pass'] == 2) & (df_numerical_stats['mode'] == mode)
    df_combo = df_numerical_stats[is_selected]

    plot_prof_couns(df_combo)
3
No description has been provided for this image
5
No description has been provided for this image
7
No description has been provided for this image

Based on these results, we see that a higher mode increases the correlation (slope) and R2; and decreases the MAE (although it increases the MAED). However, again, we see that this effect is very profiler-dependent. It is interesting to note, however, that there are some species that have a very bad detection rate throughout the profiler/mode/S values.

Based on that, considering that higher modes (1) lower the MAE and increase the correlation but (2) decrease the precision — and considering that effect (1) is more pronounced than effect (2) — we are going to choose a high but not extreme mode value. For instance, mode = 7.

Which species have the worst assignment?¶

In [127]:
# Lookup table mapping each in silico sample species to its kingdom,
# used to stratify the PRE analysis by Bacteria / Fungi / Virus below.
species_to_kingdom = {
    # Bacteria
    "Cutibacterium acnes": "Bacteria",
    "Lactobacillus acidophilus": "Bacteria",
    "Bifidobacterium bifidum": "Bacteria",
    "Akkermansia muciniphila": "Bacteria",
    "Blautia coccoides": "Bacteria",
    "Blautia luti": "Bacteria",
    "Bacteroides ovatus": "Bacteria",
    "Bacteroides intestinalis": "Bacteria",
    "Bacteroides fragilis": "Bacteria",
    "Escherichia coli": "Bacteria",
    "Dietzia lutea": "Bacteria",
    "Ruthenibacterium lactatiformans": "Bacteria",
    "Faecalibacterium prausnitzii": "Bacteria",
    "Parabacteroides distasonis": "Bacteria",
    "Parabacteroides merdae": "Bacteria",
    "Fusicatenibacter saccharivorans": "Bacteria",
    "Erysipelatoclostridium ramosum": "Bacteria",
    "Streptococcus salivarius": "Bacteria",
    "Hungatella hathewayi": "Bacteria",
    "Eisenbergiella porci": "Bacteria",
    "Butyricimonas faecalis": "Bacteria",
    "Alistipes indistinctus": "Bacteria",
    "Alistipes finegoldii": "Bacteria",
    "Eubacterium callanderi": "Bacteria",
    "Acidaminococcus intestini": "Bacteria",
    
    # Fungi
    "Aspergillus chevalieri": "Fungi",
    "Aspergillus flavus": "Fungi",
    "Saccharomyces cerevisiae": "Fungi",
    "Saccharomyces kudriavzevii": "Fungi",
    "Saccharomyces mikatae": "Fungi",
    "Candida albicans": "Fungi",
    "Candida dubliniensis": "Fungi",
    "Candida orthopsilosis": "Fungi",
    "Malassezia restricta": "Fungi",
    "Alternaria dauci": "Fungi",
    "Kazachstania africana": "Fungi",
    "Penicillium digitatum": "Fungi",
    "Pichia kudriavzevii": "Fungi",
    "Trichoderma asperellum": "Fungi",
    "Akanthomyces muscarius": "Fungi",
    "Fusarium falciforme": "Fungi",
    "Eremothecium sinecaudum": "Fungi",
    "Cryptococcus decagattii": "Fungi",
    "Kwoniella shandongensis": "Fungi",
    "Puccinia triticina": "Fungi",
    
    # Virus
    "Tobacco mosaic virus": "Virus",
    "Rotavirus A": "Virus",
    "Rotavirus B": "Virus",
    "Rotavirus C": "Virus",
    "Bacteriophage P2": "Virus",
    "Escherichia phage T4": "Virus",
    "Human immunodeficiency virus 1": "Virus",
    "Human adenovirus 7": "Virus",
    "Hepatitis C virus": "Virus",
    "Bovine alphaherpesvirus 2": "Virus",
    "Human herpesvirus 4 type 2": "Virus",
    "Mimivirus terra2": "Virus",
    "Dengue virus": "Virus",
    "Norovirus GI": "Virus",
    "Zaire ebolavirus": "Virus"
}
In [128]:
mode = 5
In [129]:
# Keep only pass-2 results at the selected mode, indexed by profiler
# so individual profiler rows can be pulled with .loc below.
is_pass2 = df_numerical_stats['pass'] == 2
is_mode = df_numerical_stats['mode'] == mode
df_combo = df_numerical_stats[is_pass2 & is_mode].set_index('profiler')
In [130]:
df_combo
Out[130]:
pass mode diff_counts MRE_counts MRED_counts MAE_counts MAED_counts MACV_counts corr_counts R2_counts taxid_counts species_counts expected_counts observed_counts diff_abundance MRE_abundance MRED_abundance MAE_abundance MAED_abundance MACV_abundance corr_abundance R2_abundance taxid_abundance species_abundance expected_abundance observed_abundance
profiler
centrifuge 2 5 [-20.92736, -48.3728, -71.28853333333333, -55.... -16.211 23.038 16.486 22.842 1.386 0.648 0.595 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [247102.0, 96801.0, 53834.0, 55355.0, 194199.0... [-2.0102896294236245, -36.0216836665145, -64.4... 3.835 28.550 24.922 14.446 0.580 0.803 0.595 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.0621784490805117, 1.1995934312528531, 0.667...
ganon 2 5 [-0.50112, -43.06133333333333, -65.74453333333... -3.843 25.604 19.386 17.162 0.885 0.717 0.606 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [310934.0, 106760.0, 64229.0, 72144.0, 194362.... [4.519396024457606, -40.188321210872765, -64.0... 1.009 26.896 22.260 15.131 0.680 0.753 0.606 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.2662311257643, 1.1214689772961357, 0.674698...
kaiju 2 5 [-49.99712, -77.44426666666666, -89.7125333333... -45.742 29.240 45.742 29.240 0.639 0.413 0.322 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [156259.0, 42292.0, 19289.0, 16479.0, 57971.0,... [-8.0675120659037, -58.53029500478051, -81.086... -0.245 53.759 44.984 29.436 0.654 0.759 0.322 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [2.8728902479405094, 0.7775569686603653, 0.354...
kraken2 2 5 [-51.72896, -89.8096, -98.04693333333333, -98.... -41.501 34.745 41.501 34.745 0.837 0.341 0.189 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [150847.0, 19107.0, 3662.0, 1740.0, 21850.0, 5... [-5.859592421967733, -80.12621212670827, -96.1... 14.088 67.761 62.520 29.686 0.475 0.665 0.189 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [2.9418877368135083, 0.3726335226242199, 0.071...
krakenuniq 2 5 [-19.67168, -48.3264, -68.4816, -54.56, -41.30... -16.778 23.985 17.064 23.782 1.394 0.641 0.572 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [251026.0, 96888.0, 59097.0, 56800.0, 183437.0... [-0.37064433374560224, -35.91043018258363, -60... 3.219 29.748 25.644 15.417 0.601 0.796 0.572 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.11341736457045, 1.2016794340765569, 0.73296...
mean 2 5 [-23.187162957260917, -52.379110996145705, -70... -18.085 21.968 18.371 21.729 1.183 0.618 0.607 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [312500, 187500, 187500, 125000, 312500, 12500... [240040.11575855964, 89289.1668822268, 54476.7... [-2.1606238158283873, -44.255403818539435, -66... 11.009 35.605 32.294 18.601 0.576 0.803 0.509 [817, 823, 853, 1304, 1532, 1547, 1579, 1681, ... [Bacteroides fragilis, Parabacteroides distaso... [3.125, 1.875, 1.875, 1.25, 3.125, 1.25, 3.125... [3.057480505755363, 1.0452111784023856, 0.6271...
In [131]:
# RUN FOR MEAN IN MODE 1 3 5 7 9
# RUN FOR MODE 5 AND THE REST OF PROFILERS
In [132]:
prof = 'mean'

# Per-species table for the selected profiler: expected vs. observed counts
# and their relative difference.
prof_row = df_combo.loc[prof]
df_diffab = pd.DataFrame({
    'species': prof_row['species_counts'],
    'expected_counts': prof_row['expected_counts'],
    'observed_counts': prof_row['observed_counts'],
    # NOTE(review): this column is named diff_abundance but is filled from
    # diff_counts — downstream cells rely on the name; confirm intent.
    'diff_abundance': prof_row['diff_counts'],
})
# KeyError-preserving lookup: an unknown species should fail loudly.
df_diffab['kingdom'] = df_diffab['species'].map(lambda s: species_to_kingdom[s])

# Species sharing a genus with another species in the mock community.
family_species = ['Blautia coccoides', 'Blautia luti', 'Bacteroides ovatus', 'Bacteroides intestinalis','Bacteroides fragilis', 
                  'Parabacteroides distasonis', 'Parabacteroides merdae', 'Alistipes indistinctus', 'Alistipes finegoldii', 'Aspergillus chevalieri', 
                  'Aspergillus flavus', 'Saccharomyces cerevisiae', 'Saccharomyces kudriavzevii', 'Saccharomyces mikatae', 'Candida albicans', 'Candida dubliniensis', 
                  'Candida orthopsilosis', 'Rotavirus A', 'Rotavirus B', 'Rotavirus C']
df_diffab['isfamily'] = df_diffab['species'].isin(family_species)

df_diffab.sort_values(by=['expected_counts', 'diff_abundance'], ascending=False)
Out[132]:
species expected_counts observed_counts diff_abundance kingdom isfamily
8 Cutibacterium acnes 312500 254046.803 -18.705 Bacteria False
0 Bacteroides fragilis 312500 240040.116 -23.187 Bacteria True
6 Lactobacillus acidophilus 312500 239520.544 -23.353 Bacteria False
40 Akkermansia muciniphila 312500 196080.997 -37.254 Bacteria False
7 Bifidobacterium bifidum 312500 184723.065 -40.889 Bacteria False
4 Blautia coccoides 312500 173325.771 -44.536 Bacteria True
42 Bacteroides intestinalis 312500 163291.164 -47.747 Bacteria True
18 Bacteroides ovatus 312500 111631.939 -64.278 Bacteria True
29 Blautia luti 312500 53234.639 -82.965 Bacteria True
21 Rotavirus C 250000 254565.176 1.826 Virus True
20 Rotavirus B 250000 253634.055 1.454 Virus True
19 Rotavirus A 250000 251608.261 0.643 Virus True
17 Tobacco mosaic virus 250000 243272.405 -2.691 Virus False
44 Dietzia lutea 250000 232472.093 -7.011 Bacteria False
15 Human immunodeficiency virus 1 200000 201026.902 0.513 Virus False
10 Saccharomyces cerevisiae 200000 195902.958 -2.049 Fungi True
32 Saccharomyces kudriavzevii 200000 190980.300 -4.510 Fungi True
33 Saccharomyces mikatae 200000 189682.529 -5.159 Fungi True
41 Candida orthopsilosis 200000 189190.931 -5.405 Fungi True
35 Aspergillus chevalieri 200000 187004.613 -6.498 Fungi True
12 Candida albicans 200000 184824.390 -7.588 Fungi True
23 Candida dubliniensis 200000 184501.730 -7.749 Fungi True
13 Escherichia phage T4 200000 164474.441 -17.763 Virus False
11 Aspergillus flavus 200000 148008.703 -25.996 Fungi True
14 Bacteriophage P2 200000 45220.596 -77.390 Virus False
47 Ruthenibacterium lactatiformans 187500 135305.249 -27.837 Bacteria False
25 Parabacteroides merdae 187500 125862.826 -32.873 Bacteria True
1 Parabacteroides distasonis 187500 89289.167 -52.379 Bacteria True
2 Faecalibacterium prausnitzii 187500 54476.741 -70.946 Bacteria False
55 Hepatitis C virus 150000 151377.085 0.918 Virus False
28 Malassezia restricta 150000 149869.196 -0.087 Fungi False
53 Bovine alphaherpesvirus 2 150000 148849.267 -0.767 Virus False
31 Human adenovirus 7 150000 147511.825 -1.659 Virus False
26 Alternaria dauci 150000 141107.893 -5.928 Fungi False
54 Human herpesvirus 4 type 2 150000 139697.441 -6.868 Virus False
5 Erysipelatoclostridium ramosum 125000 67123.702 -46.301 Bacteria False
3 Streptococcus salivarius 125000 58117.095 -53.506 Bacteria False
57 Dengue virus 100000 101890.120 1.890 Virus False
43 Kazachstania africana 100000 97363.929 -2.636 Fungi False
22 Penicillium digitatum 100000 95742.552 -4.257 Fungi False
46 Mimivirus terra2 100000 84794.462 -15.206 Virus False
52 Eisenbergiella porci 62500 57135.945 -8.582 Bacteria False
36 Acidaminococcus intestini 62500 57048.401 -8.723 Bacteria False
45 Alistipes indistinctus 62500 55391.499 -11.374 Bacteria True
39 Alistipes finegoldii 62500 51804.368 -17.113 Bacteria True
50 Butyricimonas faecalis 62500 49405.437 -20.951 Bacteria False
34 Hungatella hathewayi 62500 40802.328 -34.716 Bacteria False
27 Eubacterium callanderi 62500 30952.294 -50.476 Bacteria False
16 Norovirus GI 50000 50524.777 1.050 Virus False
56 Zaire ebolavirus 50000 49744.206 -0.512 Virus False
24 Eremothecium sinecaudum 50000 49425.836 -1.148 Fungi False
38 Puccinia triticina 50000 49269.888 -1.460 Fungi False
9 Pichia kudriavzevii 50000 48697.541 -2.605 Fungi False
48 Kwoniella shandongensis 50000 48329.668 -3.341 Fungi False
51 Akanthomyces muscarius 50000 47989.514 -4.021 Fungi False
49 Cryptococcus decagattii 50000 47962.656 -4.075 Fungi False
30 Trichoderma asperellum 50000 46584.994 -6.830 Fungi False
37 Fusarium falciforme 50000 46336.938 -7.326 Fungi False
In [133]:
# Three-panel comparison of PRE (diff_abundance) distributions:
# (1) shared-genus vs. unique species, (2) by kingdom, and
# (3) shared-genus within bacteria only.
fig, axs = plt.subplots(1, 3, figsize=(9, 3.5), sharey=True)


def _annotated_box(data, x, pairs, ax, palette, xlabel=None):
    """Boxplot of PRE by `x` with Mann-Whitney annotations and median labels.

    Passing `hue=x` with `legend=False` replaces the deprecated
    palette-without-hue usage (seaborn FutureWarning, removal in v0.14).
    """
    sns.boxplot(data, x=x, y='diff_abundance', ax=ax, hue=x,
                palette=sns.color_palette(palette), legend=False)
    annotator = Annotator(ax, pairs, data=data, x=x, y='diff_abundance')
    annotator.configure(test='Mann-Whitney', text_format='simple',
                        loc='outside', line_width=1)
    annotator.apply_and_annotate()
    # Print each group's median just below its median line.
    for i, median in enumerate(data.groupby(x)['diff_abundance'].median()):
        ax.text(i, median - 8, f"{median:.2f}", ha='center', va='bottom', fontsize=8)
    if xlabel is not None:
        ax.set_xlabel(xlabel, fontsize=12)


_annotated_box(df_diffab, 'isfamily', [(True, False)], axs[0],
               ['#4E79A7', '#A0CBE8'],
               xlabel="Species belongs to\na shared genus")
axs[0].set_ylabel("PRE", fontsize=12)

_annotated_box(df_diffab, 'kingdom',
               [("Virus", 'Bacteria'), ('Fungi', 'Bacteria'), ('Fungi', 'Virus')],
               axs[1], ['#4E79A7', '#77a2c8', '#A0CBE8'])

df_bac = df_diffab[df_diffab['kingdom'] == 'Bacteria']
_annotated_box(df_bac, 'isfamily', [(True, False)], axs[2],
               ['#4E79A7', '#A0CBE8'],
               xlabel="Bacterial species belongs\nto a shared genus")

for ax in axs:
    sns.despine(top=True, right=True, ax=ax)

plt.tight_layout()

# `fmt` avoids shadowing the built-in `format`.
for fmt in ['png', 'tiff']:
    plt.savefig(f'{RESULTS_DIR}/figures/paper/figI-{mode}-{prof}.{fmt}',
                dpi=DPI, bbox_inches='tight')

plt.show()
/tmp/ipykernel_2918775/1519887836.py:7: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax1 = sns.boxplot(df_diffab, x='isfamily', y='diff_abundance', ax=axs[0], palette=sns.color_palette(['#4E79A7', '#A0CBE8']))
/tmp/ipykernel_2918775/1519887836.py:20: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax2 = sns.boxplot(df_diffab, x='kingdom', y='diff_abundance', ax=axs[1], palette=sns.color_palette(['#4E79A7', '#77a2c8', '#A0CBE8']))
/tmp/ipykernel_2918775/1519887836.py:34: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax3 = sns.boxplot(df_bac, x='isfamily', y='diff_abundance', ax=axs[2], palette=sns.color_palette(['#4E79A7', '#A0CBE8']))
False vs. True: Mann-Whitney-Wilcoxon test two-sided, P_val:3.304e-01 U_stat=4.400e+02
Bacteria vs. Fungi: Mann-Whitney-Wilcoxon test two-sided, P_val:1.184e-07 U_stat=1.200e+01
Fungi vs. Virus: Mann-Whitney-Wilcoxon test two-sided, P_val:3.156e-02 U_stat=8.500e+01
Bacteria vs. Virus: Mann-Whitney-Wilcoxon test two-sided, P_val:2.544e-05 U_stat=3.100e+01
False vs. True: Mann-Whitney-Wilcoxon test two-sided, P_val:3.610e-01 U_stat=7.800e+01
No description has been provided for this image