InĀ [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy.stats import mannwhitneyu, wilcoxon
from statsmodels.stats.multitest import multipletests

from statannotations.Annotator import Annotator
InĀ [2]:
pd.options.display.float_format = '{:.3f}'.format
pd.set_option('display.max_columns', None)

plt.rcParams['figure.dpi']=170

DPI=250
InĀ [3]:
from list_vars import LIST_PROFILERS, DIR_FIGURES, RESULTS_DIR, POOLS, CONTROLS

Biological sample analysis (part 2)¶

In this notebook we are going to do the analysis on the biological samples (POOL samples + controls).

One thing we want to check is how S and mode affect the robustness of our results. So, to do that we are going to compare the results using a comparative plot. The comparativ plot shows how many new variable are shown compared to the previous case. So, we can do a comparative increase on mode by fixing S, or on S by fixing the mode.

This analysis can be performed with many variables, and we are going to choose the following:

  1. The number of detected species, across all samples and per individual sample (after NORM+ and cutting of species with more than 65% NaNs).
  2. The number of detected differentially abundant species across the 4 comparisons.

With this in mind, we can later select one S and one mode and do the following analyses.

  1. The importance of including the biological control samples to ensure that false positives are not considered.
  2. The importance of normalizing the reads considering the biogical samples.
  3. Plot the significantly differentially abundant species.

Observing the effect of $S$ on species retention for biological controls¶

InĀ [4]:
rename_mapping = {
        'POOL1': 'RR1', 'POOL2': 'RR2', 'POOL3': 'RR3', 'POOL4': 'RR4',
        'POOL5': 'SP1', 'POOL6': 'SP2', 'POOL7': 'SP3', 'POOL8': 'SP4',
        'POOL9': 'HC1', 'POOL10': 'HC2', 'POOL11': 'HC3', 'POOL12': 'HC4',
    }
InĀ [5]:
dict_retained_discarded_species = {'mode': [], 'S': [], 'sample': [], 'retained': [], 'discarded': []}

for mode in [3, 5, 7]:
    for S in [0, 1, 2, 3, 4, 5, 6, 7, 10, 15]:
        for sample in [f'POOL{i}' for i in range(1, 13)] + ['ACIDOLA', 'BLACTIS']:
            flags_file =  f'{RESULTS_DIR}/summary/{sample}_pass2_mode{mode}_taxgenus_S{S}_NORM+.flags.tsv'
            df_flags = pd.read_csv(flags_file, sep='\t').set_index('taxonomy_id')[['name', 'lineage', 'mean_norm']]
            
            dict_retained_discarded_species['mode'].append(mode)
            dict_retained_discarded_species['S'].append(S)
            dict_retained_discarded_species['sample'].append(rename_mapping[sample] if 'POOL' in sample else sample)
            dict_retained_discarded_species['retained'].append((df_flags['mean_norm'] == False).sum())
            dict_retained_discarded_species['discarded'].append((df_flags['mean_norm'] == True).sum())          

df_retained_discarded_species = pd.DataFrame(dict_retained_discarded_species)
df_retained_discarded_species['total'] = df_retained_discarded_species['retained'] + df_retained_discarded_species['discarded']

df_retained_discarded_species['percentage'] = 100 * df_retained_discarded_species['retained'] / df_retained_discarded_species['total']
InĀ [6]:
df_retained_discarded_species
Out[6]:
mode S sample retained discarded total percentage
0 3 0 RR1 16 1433 1449 1.104
1 3 0 RR2 30 1778 1808 1.659
2 3 0 RR3 28 1927 1955 1.432
3 3 0 RR4 28 1923 1951 1.435
4 3 0 SP1 6 1913 1919 0.313
... ... ... ... ... ... ... ...
415 7 15 HC2 67 2072 2139 3.132
416 7 15 HC3 110 1978 2088 5.268
417 7 15 HC4 77 2183 2260 3.407
418 7 15 ACIDOLA 3 665 668 0.449
419 7 15 BLACTIS 2 251 253 0.791

420 rows Ɨ 7 columns

InĀ [Ā ]:
 
InĀ [7]:
df_total_counts = df_retained_discarded_species.groupby(['mode', 'sample'])['total'].max().unstack(level=0)
df_total_counts['increment_3_5'] = df_total_counts[5] / df_total_counts[3]
df_total_counts['increment_3_7'] = df_total_counts[7] / df_total_counts[3]

df_total_counts
Out[7]:
mode 3 5 7 increment_3_5 increment_3_7
sample
ACIDOLA 563 603 668 1.071 1.187
BLACTIS 163 194 253 1.190 1.552
HC1 2087 2129 2186 1.020 1.047
HC2 1995 2054 2139 1.030 1.072
HC3 1956 1996 2088 1.020 1.067
HC4 2160 2204 2260 1.020 1.046
RR1 1449 1527 1622 1.054 1.119
RR2 1808 1871 1955 1.035 1.081
RR3 1955 2000 2081 1.023 1.064
RR4 1951 1999 2076 1.025 1.064
SP1 1919 1982 2038 1.033 1.062
SP2 2235 2282 2344 1.021 1.049
SP3 1934 1982 2061 1.025 1.066
SP4 2314 2354 2408 1.017 1.041
InĀ [8]:
df_retained_discarded_species
Out[8]:
mode S sample retained discarded total percentage
0 3 0 RR1 16 1433 1449 1.104
1 3 0 RR2 30 1778 1808 1.659
2 3 0 RR3 28 1927 1955 1.432
3 3 0 RR4 28 1923 1951 1.435
4 3 0 SP1 6 1913 1919 0.313
... ... ... ... ... ... ... ...
415 7 15 HC2 67 2072 2139 3.132
416 7 15 HC3 110 1978 2088 5.268
417 7 15 HC4 77 2183 2260 3.407
418 7 15 ACIDOLA 3 665 668 0.449
419 7 15 BLACTIS 2 251 253 0.791

420 rows Ɨ 7 columns

InĀ [9]:
for column, name in zip(['retained', 'percentage'], ['Retained genera',  'Retained percentage']):
    # Set up the grid for 14 samples in a 4x4 layout
    samples = df_retained_discarded_species['sample'].unique()  # Unique samples
    n_cols, n_rows = 7, 2  # Define the grid layout

    # Initialize the grid layout
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(2*n_cols, 2*n_rows), sharex=True)

    # Placeholder to collect handles and labels for the legend
    handles, labels = None, None


    # Loop through each sample and create its subplot
    for i, sample in enumerate(samples):
        row, col = divmod(i, n_cols)  # Get the row and column index for the subplot
        ax = axes[row, col]

        # Filter the DataFrame for the current sample
        sample_data = df_retained_discarded_species[df_retained_discarded_species['sample'] == sample]

        # Plot retained species as a function of S with mode as hue
        lineplot = sns.lineplot(
            data=sample_data,
            x='S',
            y=column,
            hue='mode',
            marker='o',
            ax=ax,
            legend=(i == 6)  # Enable legend only for the first subplot
        )
        ax.set_title(sample)  # Title of the subplot is the sample name
        ax.set_xlabel("S")
        if col == 0:
            ax.set_ylabel(name)
        else:
            ax.set_ylabel("")

        ax.set_xticks([0,  2,  4,  6,  10, 15])

        if i==6:
            ax.legend(frameon=False, bbox_to_anchor=(1.05, 1), title='Mode')

        # Set a specific Y-axis limit for the last two samples
        if column == 'retained':
            if i in [len(samples) - 2, len(samples) - 1]:  # Last two samples
                ax.set_ylim(0, 4)

    # Remove extra subplots (for grid cells not needed)
    for j in range(i + 1, n_rows * n_cols):
        fig.delaxes(axes[j // n_cols, j % n_cols])


    # Adjust the main plot area to leave space for the legend
    plt.tight_layout()

    for format in ['png', 'tiff']: 
        plt.savefig(f'{RESULTS_DIR}/figures/paper/s_effect_{column}.{format}', dpi=DPI)

    plt.show()
No description has been provided for this image
No description has been provided for this image

Plotting sample detection and statistacally differential species across modes and S¶

InĀ [10]:
# Initialize an empty list to collect data rows
data_rows = []

# Iterate through modes and S values
for mode in [3, 5, 7]:
    for S in [0, 1, 2, 3, 4, 5, 6, 7, 10, 15]:
        # Read the data
        df_normpipe_retained_species = pd.read_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORM+_retained.tsv', sep='\t')
        df_normpipe_discarded_normplus = pd.read_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORM+_discarded_norm+.tsv', sep='\t')
        df_normpipe_discarded_common = pd.read_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORM+_discarded_common.tsv', sep='\t')

        # Get the list of samples (assuming they are column names)
        samples = df_normpipe_retained_species.columns[3:]

        data_rows.append({
                'mode': mode,
                'S': S,
                'sample': 'ALL',
                'count_retained': len(df_normpipe_retained_species),
                'count_discarded_norm': len(df_normpipe_discarded_normplus),
                'count_discarded_all': len(df_normpipe_discarded_common)
            })
        
        for sample in samples:
            # Count non-NaN species for each dataframe and sample
            count_retained = df_normpipe_retained_species[sample].notna().sum()
            count_discarded_norm = df_normpipe_discarded_normplus[sample].notna().sum()
            count_discarded_all = df_normpipe_discarded_common[sample].notna().sum()

            # Collect the data as a dictionary
            data_rows.append({
                'mode': mode,
                'S': S,
                'sample': sample,
                'count_retained': count_retained,
                'count_discarded_norm': count_discarded_norm,
                'count_discarded_all': count_discarded_all
            })

# Convert the collected rows into a dataframe
df_stats_species_count = pd.DataFrame(data_rows)
df_stats_species_count['count_total'] = df_stats_species_count['count_retained'] + df_stats_species_count['count_discarded_norm'] + df_stats_species_count['count_discarded_all']
df_stats_species_count
Out[10]:
mode S sample count_retained count_discarded_norm count_discarded_all count_total
0 3 0 ALL 30 14 2 46
1 3 0 RR1 30 14 2 46
2 3 0 RR2 30 14 2 46
3 3 0 RR3 30 14 2 46
4 3 0 RR4 30 14 2 46
... ... ... ... ... ... ... ...
505 7 15 HC4 87 78 9 174
506 7 15 ACIDOLA 40 77 9 126
507 7 15 BLACTIS 17 40 9 66
508 7 15 n_samples_flag 87 78 9 174
509 7 15 selected_flag 87 78 9 174

510 rows Ɨ 7 columns

InĀ [11]:
df_stats_species_count[df_stats_species_count['sample'] == 'ALL']
Out[11]:
mode S sample count_retained count_discarded_norm count_discarded_all count_total
0 3 0 ALL 30 14 2 46
17 3 1 ALL 36 40 5 81
34 3 2 ALL 58 52 6 116
51 3 3 ALL 62 53 6 121
68 3 4 ALL 62 54 6 122
85 3 5 ALL 74 56 6 136
102 3 6 ALL 84 59 6 149
119 3 7 ALL 84 61 6 151
136 3 10 ALL 94 62 6 162
153 3 15 ALL 98 65 7 170
170 5 0 ALL 26 14 3 43
187 5 1 ALL 35 33 22 90
204 5 2 ALL 40 33 22 95
221 5 3 ALL 59 37 25 121
238 5 4 ALL 66 38 25 129
255 5 5 ALL 74 43 25 142
272 5 6 ALL 74 43 25 142
289 5 7 ALL 82 46 25 153
306 5 10 ALL 90 49 27 166
323 5 15 ALL 96 54 28 178
340 7 0 ALL 19 26 6 51
357 7 1 ALL 33 46 7 86
374 7 2 ALL 50 59 7 116
391 7 3 ALL 52 62 8 122
408 7 4 ALL 63 66 9 138
425 7 5 ALL 70 69 9 148
442 7 6 ALL 75 71 9 155
459 7 7 ALL 75 71 9 155
476 7 10 ALL 79 72 9 160
493 7 15 ALL 87 78 9 174
InĀ [12]:
# Initialize an empty list to collect data rows
data_rows = []

# Iterate through modes and S values
for mode in [3, 5, 7]:
    for S in [0, 1, 2, 3, 4, 5, 6, 7, 10, 15]:
        # Read the data
        df_pval_HCvsRR = pd.read_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_HCvsRR.tsv', sep='\t')
        df_pval_HCvsSP = pd.read_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_HCvsSP.tsv', sep='\t')
        df_pval_RRvsSP = pd.read_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_RRvsSP.tsv', sep='\t')
        df_pval_sex = pd.read_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_sex.tsv', sep='\t')

        # Get the list of samples (assuming they are column names)
        samples = df_normpipe_retained_species.columns[3:]

        data_rows.append({
                'mode': mode,
                'S': S,
                'count_HCvsRR': len(df_pval_HCvsRR[df_pval_HCvsRR['pval_MW'] < 0.05]),
                'count_HCvsSP': len(df_pval_HCvsSP[df_pval_HCvsSP['pval_MW'] < 0.05]),
                'count_RRvsSP': len(df_pval_RRvsSP[df_pval_RRvsSP['pval_MW'] < 0.05]),
                'count_sex': len(df_pval_sex[df_pval_sex['pval_MW'] < 0.05]),
                'species_HCvsRR': df_pval_HCvsRR[df_pval_HCvsRR['pval_MW'] < 0.05]['name'].values.tolist(),
                'species_HCvsSP': df_pval_HCvsSP[df_pval_HCvsSP['pval_MW'] < 0.05]['name'].values.tolist(),
                'species_RRvsSP': df_pval_RRvsSP[df_pval_RRvsSP['pval_MW'] < 0.05]['name'].values.tolist(),
                'species_sex': df_pval_sex[df_pval_sex['pval_MW'] < 0.05]['name'].values.tolist()
            })
        
      

# Convert the collected rows into a dataframe
df_stats_species_diffabundance = pd.DataFrame(data_rows)
df_stats_species_diffabundance['count_total'] = df_stats_species_diffabundance['count_HCvsRR'] + df_stats_species_diffabundance['count_HCvsSP'] + \
                                                df_stats_species_diffabundance['count_RRvsSP'] + df_stats_species_diffabundance['count_sex']
df_stats_species_diffabundance
Out[12]:
mode S count_HCvsRR count_HCvsSP count_RRvsSP count_sex species_HCvsRR species_HCvsSP species_RRvsSP species_sex count_total
0 3 0 0 1 2 0 [] [Colletotrichum] [Dictyostelium, Tepidimonas] [] 3
1 3 1 0 1 3 0 [] [Colletotrichum] [Dictyostelium, Tepidimonas, Brevundimonas] [] 4
2 3 2 0 1 5 0 [] [Colletotrichum] [Dictyostelium, Tepidimonas, Brevundimonas, Ro... [] 6
3 3 3 0 1 5 0 [] [Colletotrichum] [Dictyostelium, Tepidimonas, Brevundimonas, Ro... [] 6
4 3 4 0 1 5 0 [] [Colletotrichum] [Dictyostelium, Tepidimonas, Brevundimonas, Ro... [] 6
5 3 5 0 2 6 0 [] [Colletotrichum, Xanthomonas] [Dictyostelium, Tepidimonas, Brevundimonas, Xa... [] 8
6 3 6 0 2 7 0 [] [Colletotrichum, Xanthomonas] [Dictyostelium, Tepidimonas, Brevundimonas, Fl... [] 9
7 3 7 0 2 7 0 [] [Colletotrichum, Xanthomonas] [Dictyostelium, Tepidimonas, Brevundimonas, Fl... [] 9
8 3 10 0 2 7 0 [] [Colletotrichum, Xanthomonas] [Dictyostelium, Tepidimonas, Brevundimonas, Fl... [] 9
9 3 15 0 2 8 0 [] [Colletotrichum, Xanthomonas] [Dictyostelium, Tepidimonas, Brevundimonas, Ca... [] 10
10 5 0 0 0 1 0 [] [] [Sphingobium] [] 1
11 5 1 0 0 1 0 [] [] [Dictyostelium] [] 1
12 5 2 0 0 2 0 [] [] [Dictyostelium, Microlunatus] [] 2
13 5 3 0 0 5 0 [] [] [Dictyostelium, Microlunatus, Aquirhabdus, Ros... [] 5
14 5 4 0 1 7 0 [] [Xanthomonas] [Dictyostelium, Aquirhabdus, Flavobacterium, L... [] 8
15 5 5 0 1 8 0 [] [Xanthomonas] [Dictyostelium, Aquirhabdus, Rhizopus, Lysobac... [] 9
16 5 6 0 1 8 0 [] [Xanthomonas] [Dictyostelium, Aquirhabdus, Rhizopus, Lysobac... [] 9
17 5 7 0 1 8 0 [] [Xanthomonas] [Dictyostelium, Aquirhabdus, Rhizopus, Lysobac... [] 9
18 5 10 0 1 9 0 [] [Xanthomonas] [Dictyostelium, Candida, Rhizopus, Lysobacter,... [] 10
19 5 15 0 1 9 0 [] [Xanthomonas] [Dictyostelium, Candida, Rhizopus, Roseomonas,... [] 10
20 7 0 0 0 2 0 [] [] [Dictyostelium, Parastagonospora] [] 2
21 7 1 0 0 3 0 [] [] [Dictyostelium, Comamonas, Parastagonospora] [] 3
22 7 2 0 0 7 0 [] [] [Dictyostelium, Parastagonospora, Roseomonas, ... [] 7
23 7 3 0 0 7 0 [] [] [Dictyostelium, Parastagonospora, Roseomonas, ... [] 7
24 7 4 0 0 7 0 [] [] [Dictyostelium, Parastagonospora, Roseomonas, ... [] 7
25 7 5 0 0 7 0 [] [] [Dictyostelium, Parastagonospora, Roseomonas, ... [] 7
26 7 6 0 0 7 0 [] [] [Dictyostelium, Parastagonospora, Roseomonas, ... [] 7
27 7 7 0 0 7 0 [] [] [Dictyostelium, Parastagonospora, Roseomonas, ... [] 7
28 7 10 1 0 8 0 [Leptolyngbya] [] [Dictyostelium, Parastagonospora, Roseomonas, ... [] 9
29 7 15 1 0 11 0 [Leptolyngbya] [] [Dictyostelium, Chloroflexus, Comamonas, Rhizo... [] 12

Why normalization with biological controls is relevant¶

InĀ [13]:
mode = 3
S = 7
InĀ [14]:
df_normpipe_retained_species = pd.read_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORM+_retained.tsv', sep='\t')
df_normpipe_discarded_normplus = pd.read_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORM+_discarded_norm+.tsv', sep='\t')
df_normpipe_discarded_common = pd.read_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORM+_discarded_common.tsv', sep='\t')
InĀ [15]:
print(len(df_normpipe_retained_species), len(df_normpipe_discarded_normplus), len(df_normpipe_discarded_common))
84 61 6
InĀ [16]:
species = 'Janibacter'

display(df_normpipe_retained_species[df_normpipe_retained_species['name'] == species])
display(df_normpipe_discarded_normplus[df_normpipe_discarded_normplus['name'] == species])
display(df_normpipe_discarded_common[df_normpipe_discarded_common['name'] == species])
taxonomy_id name lineage RR1 RR2 RR3 RR4 SP1 SP2 SP3 SP4 HC1 HC2 HC3 HC4 ACIDOLA BLACTIS n_samples_flag selected_flag
taxonomy_id name lineage RR1 RR2 RR3 RR4 SP1 SP2 SP3 SP4 HC1 HC2 HC3 HC4 ACIDOLA BLACTIS n_samples_flag selected_flag
35 53457 Janibacter Janibacter;Intrasporangiaceae;Micrococcales;Ac... 390.927 349.060 1392.642 672.706 7238.942 4715.237 1088.757 881.804 893.698 1339.119 1358.470 670.537 2361.363 NaN 4 True
taxonomy_id name lineage RR1 RR2 RR3 RR4 SP1 SP2 SP3 SP4 HC1 HC2 HC3 HC4 ACIDOLA BLACTIS n_samples_flag selected_flag

Plotting discarded species¶

InĀ [17]:
def annotation_pval(df, ax, groups, x_col, y_col):
    if len(groups) == 2:
        pairs = [(groups[0], groups[1])]

    annotator = Annotator(ax, pairs, data=df, x=x_col, y=y_col)
    annotator.configure(test='Mann-Whitney', text_format='simple', loc='outside', line_width=0.65, fontsize=8)
    annotator.apply_and_annotate()

def add_medians(df, ax, x_col, y_col, ysync=0.1, xsync=0.1):
    groups = df[x_col].unique()
    vals = [df[df[x_col] == group][y_col].values for group in groups]

    medians = [np.nanmedian(val) for val in vals]
    medianslog =  [np.log10(np.nanmedian(val) + 1) for val in vals]
    

    for i, median in enumerate(medians):
            ax.text(i + xsync, medianslog[i] + ysync, f"{medians[i]:.0f}", ha='left', va='bottom', fontsize=7)

    log2fc = np.log2(medians[1] / medians[0])

    if log2fc > 0:
        ax.text(-0.4, 5, f"log$_2$FC: {log2fc:.2f}", ha='left', va='bottom', fontsize=7)
    else:
        ax.text(1.5, 5, f"log$_2$FC: {log2fc:.2f}", ha='right', va='bottom', fontsize=7)
InĀ [18]:
def plot_ax_2(ax, y1, y2, color1, color2, group1, group2, species):
    ax.scatter([0] * len(y1), y1, color=color1, label=group1, alpha=0.8)
    ax.scatter([1] * len(y2), y2, color=color2, label=group2, alpha=0.8)

    
    # Add the means as horizontal lines
    ax.plot([0 - 0.2, 0 + 0.2], [np.log10(np.nanmedian(10 ** y1 - 1) + 1), np.log10(np.nanmedian(10 ** y1 - 1) + 1)], color=color1, lw=2)
    ax.plot([1 - 0.2, 1 + 0.2], [np.log10(np.nanmedian(10 ** y2 - 1) + 1), np.log10(np.nanmedian(10 ** y2 - 1) + 1)], color=color2, lw=2)

    # Add horizontal gridlines at 0.5 intervals
    for y in np.arange(0, max(6, max(max(y1) + 0.5, max(y2) + 0.5)), 1):  # Adjust range to match y-axis limits
        ax.axhline(y, color='lightgray', linestyle='--', linewidth=0.5, zorder=0)

    # Customize the x-axis
    ax.set_xticks([0, 1])
    ax.set_xticklabels([group1, group2])
    ax.set_xlim(-0.5, 1.5)

    # Set axis limits and labels
    ax.set_ylim(0, max(6, max(max(y1) + 0.5, max(y2) + 0.5)))
    ax.set_ylabel('log$_{10}$ counts')

    # Remove the x and y axis lines
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    # Remove ticks on the y-axis
    ax.yaxis.set_ticks_position('none')

    ax.set_title(species, fontsize=10, pad=20)

    # Set lighter grid aesthetics
    ax.grid(False)


def plot_ax_4(ax, yHC, ySP, yRR, yCTRL, species):
    ax.scatter([0] * len(yHC), yHC, color='#648FFF', label='HC', alpha=0.8)
    ax.scatter([1] * len(ySP), ySP, color='#DC267F', label='SP', alpha=0.8)
    ax.scatter([2] * len(yRR), yRR, color='#FE6100', label='RR', alpha=0.8)
    ax.scatter([3] * len(yCTRL), yCTRL, color='#848484', label='CTRL', alpha=0.8)

    # Add the means as horizontal lines
    ax.plot([0 - 0.2, 0 + 0.2], [np.nanmean(yHC), np.nanmean(yHC)], color='#648FFF', lw=2)
    ax.plot([1 - 0.2, 1 + 0.2], [np.nanmean(ySP), np.nanmean(ySP)], color='#DC267F', lw=2)
    ax.plot([2 - 0.2, 2 + 0.2], [np.nanmean(yRR), np.nanmean(yRR)], color='#FE6100', lw=2)
    ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)

    # Add horizontal gridlines at 0.5 intervals
    for y in np.arange(0, max(5, max(max(yHC) + 0.5, max(ySP) + 0.5, max(yRR) + 0.5, max(yCTRL) + 0.5)), 1):  # Adjust range to match y-axis limits
        ax.axhline(y, color='lightgray', linestyle='--', linewidth=0.5, zorder=0)

    # Customize the x-axis
    ax.set_xticks([0, 1, 2, 3])
    ax.set_xticklabels(['HC', 'SP', 'RR', 'CTRL'])
    ax.set_xlim(-0.5, 3.5)

    # Set axis limits and labels
    ax.set_ylim(0, max(5, max(max(yHC) + 0.5, max(ySP) + 0.5, max(yRR) + 0.5, max(yCTRL) + 0.5)))
    ax.set_ylabel('log$_{10}$ counts')

    # Remove the x and y axis lines
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    # Remove ticks on the y-axis
    ax.yaxis.set_ticks_position('none')

    ax.set_title(species, fontsize=10)

    # Set lighter grid aesthetics
    ax.grid(False)
InĀ [19]:
df_normpipe_discarded_common
Out[19]:
taxonomy_id name lineage RR1 RR2 RR3 RR4 SP1 SP2 SP3 SP4 HC1 HC2 HC3 HC4 ACIDOLA BLACTIS n_samples_flag selected_flag
0 561 Escherichia Escherichia;Enterobacteriaceae;Enterobacterale... 1115.072 7312.573 1574.702 1431.427 2693.906 5329.932 2680.127 7308.049 10986.526 2538.369 949.367 3667.465 94128.188 2927.082 11 True
1 570 Klebsiella Klebsiella;Klebsiella/Raoultella group;Enterob... 1049.170 1451.155 3828.523 2841.372 2759.812 3712.944 2972.702 6579.477 8212.239 4684.084 1298.011 3249.274 62221.478 7892.033 9 True
2 1678 Bifidobacterium Bifidobacterium;Bifidobacteriaceae;Bifidobacte... 840.215 17712.591 18087.868 403.355 1531.230 683507.244 14725.521 13617.540 10692.445 204.798 421.165 902.409 2358.385 59416136.257 7 True
3 1578 Lactobacillus Lactobacillus;Lactobacillaceae;Lactobacillales... 723.488 4012.419 525.605 1601.527 3410.966 3384.103 1667.824 2040.589 4594.691 1390.181 23001.370 3768.125 56383341.619 2445.995 9 True
4 613 Serratia Serratia;Yersiniaceae;Enterobacterales;Gammapr... 108.323 265.833 283.792 1006.155 1887.851 7829.756 458.064 16502.561 622.139 5766.380 281.936 1006.653 39416.996 1005.002 3 True
5 2048137 Agathobaculum Agathobaculum;Butyricicoccaceae;Eubacteriales;... 16.241 2.176 3777.769 16.844 11.008 92358.913 2486.322 999.148 804.799 23.261 14.330 150.199 7810.561 130.633 3 True
InĀ [20]:
def plot_species(df, ncols_max=6):
    nplots = len(df)
    nrows = nplots//ncols_max 
    nrows += (nplots % ncols_max != 0)

    if nplots:
        # Create the figure and axis
        fig, axs = plt.subplots(nrows, ncols_max, figsize=(2 * ncols_max, 2 * nrows))

        for i, species in enumerate(df.index):
            ax = axs.ravel()[i] if nplots > 1 else axs
            yHC = np.log10(df.loc[species][['HC1', 'HC2', 'HC3', 'HC4']].astype(float).values + 1)
            ySP = np.log10(df.loc[species][['SP1', 'SP2', 'SP3', 'SP4']].astype(float).values + 1)
            yRR = np.log10(df.loc[species][['RR1', 'RR2', 'RR3', 'RR4']].astype(float).values + 1)
            yCTRL = np.log10(df.loc[species][['ACIDOLA', 'BLACTIS']].astype(float).values + 1)
            
            plot_ax_4(ax, yHC, ySP, yRR, yCTRL, species)
            df_annot = pd.DataFrame({'group': ['HC'] * 4 + ['SP'] * 4 + ['RR'] * 4 + ['CTRL'] * 2, 
                                     'exp': df.loc[species, ['HC1', 'HC2', 'HC3', 'HC4', 'SP1', 'SP2', 'SP3', 'SP4', 'RR1', 'RR2', 'RR3', 'RR4', 'ACIDOLA', 'BLACTIS']].astype(float).values})
            # add_medians(df_annot, ax, x_col='group', y_col='exp', xsync=0.15, ysync=0)

        if nplots % ncols_max:
            for i in range(nplots, nrows * ncols_max ):
                axs.ravel()[i].axis('off')
        plt.tight_layout()
InĀ [21]:
plot_species(df_normpipe_discarded_common.copy().set_index('name'), ncols_max=6)
for format in ['png', 'tiff']: 
    plt.savefig(f'{RESULTS_DIR}/figures/paper/l2fc_NORM+x_discarded.{format}', dpi=DPI)

plot_species(df_normpipe_discarded_normplus.copy().set_index('name'), ncols_max=7)
for format in ['png', 'tiff']: 
    plt.savefig(f'{RESULTS_DIR}/figures/paper/l2fc_NORMx_discarded.{format}', dpi=DPI)

plot_species(df_normpipe_retained_species.copy().set_index('name'), ncols_max=8)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Plotting differential species¶

InĀ [22]:
def plot_diff_sopecies(df_pval, cols_A, cols_B, color1, color2, nrows=2, other_labels = None):
    ncols = len(df_pval) // nrows

    if other_labels is None:
        g1, g2 = cols_A[0][:-1], cols_B[0][:-1]

    if ncols:
        # Create the figure and axis
        fig, axs = plt.subplots(nrows, ncols, figsize=(2 * ncols, nrows * 2))

        for i in range(len(df_pval)):
            df_i = df_pval.iloc[i]
            ax = axs.ravel()[i] if ncols > 1 else axs
            y1 = np.log10(df_i[cols_A].astype(float).values + 1)
            y2 = np.log10(df_i[cols_B].astype(float).values + 1)
            
            species = df_i['name']

            df_annot = pd.DataFrame({'group': [i[:-1] for i in cols_A + cols_B], 'exp': df_i[cols_A + cols_B].astype(float).values})

            plot_ax_2(ax, y1=y1, y2=y2, color1=color1, color2=color2, group1=g1, group2=g2, species=species)
            annotation_pval(df_annot, ax, [g1, g2], x_col='group', y_col='exp')
            add_medians(df_annot, ax, x_col='group', y_col='exp', xsync=0.15, ysync=0)

        plt.tight_layout()
InĀ [23]:
# Plot HC vs RR
df_pval_HCvsRR = pd.read_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_HCvsRR.tsv', sep='\t')
df_pval_HCvsRR_up = df_pval_HCvsRR[(df_pval_HCvsRR['pval_MW'] < 0.15) & (df_pval_HCvsRR['log2FC'] > 0)].sort_values(by='log2FC').head()
df_pval_HCvsRR_down = df_pval_HCvsRR[(df_pval_HCvsRR['pval_MW'] < 0.15) & (df_pval_HCvsRR['log2FC'] <= 0)].sort_values(by='log2FC').tail()

df_pval_HCvsRR_updown = pd.concat([df_pval_HCvsRR_up, df_pval_HCvsRR_down])
df_pval_HCvsRR_updown = df_pval_HCvsRR_updown.iloc[:len(df_pval_HCvsRR_updown) - len(df_pval_HCvsRR_updown) % 2]


ncols = len(df_pval_HCvsRR_updown) 

plot_diff_sopecies(df_pval_HCvsRR_updown,  ['HC1', 'HC2', 'HC3', 'HC4'], ['RR1', 'RR2', 'RR3', 'RR4'], "#648FFF", "#FE6100", nrows=1)

for format in ['png', 'tiff']: 
    plt.savefig(f'{RESULTS_DIR}/figures/paper/l2fc_HCvRR.{format}', dpi=DPI)




df_pval_HCvsRR_up = df_pval_HCvsRR[df_pval_HCvsRR['pval_MW'] < 0.05].sort_values(by='log2FC').head()
df_pval_HCvsRR_down = df_pval_HCvsRR[df_pval_HCvsRR['pval_MW'] < 0.05].sort_values(by='log2FC').tail()

df_pval_HCvsRR_updown = pd.concat([df_pval_HCvsRR_up, df_pval_HCvsRR_down])

ncols = len(df_pval_HCvsRR_updown) 

plot_diff_sopecies(df_pval_HCvsRR_updown,  ['HC1', 'HC2', 'HC3', 'HC4'], ['RR1', 'RR2', 'RR3', 'RR4'], "#648FFF", "#FE6100", nrows=1)

for format in ['png', 'tiff']: 
    plt.savefig(f'{RESULTS_DIR}/figures/paper/pval_HCvRR.{format}', dpi=DPI)
HC vs. RR: Mann-Whitney-Wilcoxon test two-sided, P_val:5.714e-02 U_stat=1.500e+01
HC vs. RR: Mann-Whitney-Wilcoxon test two-sided, P_val:1.143e-01 U_stat=1.400e+01
HC vs. RR: Mann-Whitney-Wilcoxon test two-sided, P_val:1.143e-01 U_stat=1.400e+01
HC vs. RR: Mann-Whitney-Wilcoxon test two-sided, P_val:1.143e-01 U_stat=1.400e+01
No description has been provided for this image
InĀ [24]:
# Plot HC vs SP

df_pval_HCvsSP = pd.read_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_HCvsSP.tsv', sep='\t')

df_pval_HCvsSP_up = df_pval_HCvsSP[(df_pval_HCvsSP['pval_MW'] < 0.15) & (df_pval_HCvsSP['log2FC'] > 0)].sort_values(by='log2FC').head()
df_pval_HCvsSP_down = df_pval_HCvsSP[(df_pval_HCvsSP['pval_MW'] < 0.15) & (df_pval_HCvsSP['log2FC'] <= 0)].sort_values(by='log2FC').tail()

df_pval_HCvsSP_updown = pd.concat([df_pval_HCvsSP_up, df_pval_HCvsSP_down])
df_pval_HCvsSP_updown = df_pval_HCvsSP_updown.iloc[:len(df_pval_HCvsSP_updown) - len(df_pval_HCvsSP_updown) % 2]

ncols = len(df_pval_HCvsSP_updown)

plot_diff_sopecies(df_pval_HCvsSP_updown,  ['HC1', 'HC2', 'HC3', 'HC4'], ['SP1', 'SP2', 'SP3', 'SP4'], "#648FFF", "#DC267F", nrows=1)

for format in ['png', 'tiff']: 
    plt.savefig(f'{RESULTS_DIR}/figures/paper/l2fc_HCvSP.{format}', dpi=DPI)




df_pval_HCvsSP_up = df_pval_HCvsSP[df_pval_HCvsSP['pval_MW'] < 0.05].sort_values(by='log2FC').head()

ncols = len(df_pval_HCvsSP_up) 

plot_diff_sopecies(df_pval_HCvsSP_up,  ['HC1', 'HC2', 'HC3', 'HC4'], ['SP1', 'SP2', 'SP3', 'SP4'], "#648FFF", "#DC267F", nrows=1)                                        

for format in ['png', 'tiff']: 
    plt.savefig(f'{RESULTS_DIR}/figures/paper/pval_HCvsSP.{format}', dpi=DPI)
HC vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:2.857e-02 U_stat=1.600e+01
HC vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:1.143e-01 U_stat=1.400e+01
HC vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:1.143e-01 U_stat=2.000e+00
HC vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:1.143e-01 U_stat=2.000e+00
HC vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:5.714e-02 U_stat=1.000e+00
HC vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:2.857e-02 U_stat=0.000e+00
HC vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:2.857e-02 U_stat=0.000e+00
HC vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:2.857e-02 U_stat=1.600e+01
No description has been provided for this image
No description has been provided for this image
InĀ [25]:
# # Plot RR vs SP
# df_pval_RRvsSP = pd.read_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_RRvsSP.tsv', sep='\t')
# df_pval_RRvsSP_up = df_pval_RRvsSP[(df_pval_RRvsSP['pval_MW'] < 0.15) & (df_pval_RRvsSP['log2FC'] > 0)].sort_values(by='log2FC').head(14)
# df_pval_RRvsSP_down = df_pval_RRvsSP[(df_pval_RRvsSP['pval_MW'] < 0.15) & (df_pval_RRvsSP['log2FC'] <= 0)].sort_values(by='log2FC').head(14)

# df_pval_RRvsSP_updown = pd.concat([df_pval_RRvsSP_up, df_pval_RRvsSP_down])
# df_pval_RRvsSP_updown = df_pval_RRvsSP_updown.iloc[:len(df_pval_RRvsSP_updown) - len(df_pval_RRvsSP_updown) % 2]

# ncols = len(df_pval_RRvsSP_updown) // 3

# plot_diff_sopecies(df_pval_RRvsSP_updown,  ['RR1', 'RR2', 'RR3', 'RR4'], ['SP1', 'SP2', 'SP3', 'SP4'],  "#FE6100", "#DC267F", nrows=3)

# for format in ['png', 'tiff']: 
#     plt.savefig(f'{RESULTS_DIR}/figures/paper/l2fc_RRvsSP.{format}', dpi=DPI)


df_pval_RRvsSP_up = df_pval_RRvsSP[df_pval_RRvsSP['pval_MW'] < 0.05].sort_values(by='log2FC').head(16)

ncols = len(df_pval_RRvsSP_up)

plot_diff_sopecies(df_pval_RRvsSP_up,  ['RR1', 'RR2', 'RR3', 'RR4'], ['SP1', 'SP2', 'SP3', 'SP4'],  "#FE6100", "#DC267F", nrows=1) 

for format in ['png', 'tiff']: 
    plt.savefig(f'{RESULTS_DIR}/figures/paper/pval_RRvsSP.{format}', dpi=DPI)
RR vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:2.857e-02 U_stat=0.000e+00
RR vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:2.857e-02 U_stat=0.000e+00
RR vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:2.857e-02 U_stat=0.000e+00
RR vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:2.857e-02 U_stat=0.000e+00
RR vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:2.857e-02 U_stat=0.000e+00
RR vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:2.857e-02 U_stat=0.000e+00
RR vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:2.857e-02 U_stat=0.000e+00
RR vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:2.857e-02 U_stat=1.600e+01
RR vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:2.857e-02 U_stat=1.600e+01
RR vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:2.857e-02 U_stat=1.600e+01
RR vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:2.857e-02 U_stat=1.600e+01
No description has been provided for this image
InĀ [26]:
# Plot sex

df_pval_sex = pd.read_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_sex.tsv', sep='\t')
df_pval_sex = df_pval_sex.rename(columns={'HC1':'Male1', 'HC2':'Male2', 'RR1':'Male3', 'RR2':'Male4', 'SP1':'Male5', 'SP2':'Male6', 'HC3': 'Female1', 'HC4': 'Female2', 'RR3': 'Female3', 'RR4': 'Female4', 'SP3': 'Female5', 'SP4': 'Female6'})

df_pval_sex_up = df_pval_sex[(df_pval_sex['pval_MW'] < 0.15) & (df_pval_sex['log2FC'] > 0)].sort_values(by='log2FC').head(10)
df_pval_sex_down = df_pval_sex[(df_pval_sex['pval_MW'] < 0.15) & (df_pval_sex['log2FC'] <= 0)].sort_values(by='log2FC').tail(10)

df_pval_sex_updown = pd.concat([df_pval_sex_up, df_pval_sex_down])
df_pval_sex_updown = df_pval_sex_updown.iloc[:len(df_pval_sex_updown) - len(df_pval_sex_updown) % 2]

ncols = len(df_pval_sex_updown) 

plot_diff_sopecies(df_pval_sex_updown, [f'Female{i}' for i in range(1,7)], [f'Male{i}' for i in range(1,7)], "#785EF0", "#FFB000", nrows=1)

for format in ['png', 'tiff']: 
    plt.savefig(f'{RESULTS_DIR}/figures/paper/l2fc_sex.{format}', dpi=DPI)


df_pval_sex_up = df_pval_sex[df_pval_sex['pval_MW'] < 0.05].sort_values(by='log2FC').head()

ncols = len(df_pval_sex_up) 

plot_diff_sopecies(df_pval_sex_up,  [f'Male{i}' for i in range(1,7)], [f'Female{i}' for i in range(1,7)], "#785EF0", "#FFB000", nrows=1) 


for format in ['png', 'tiff']: 
    plt.savefig(f'{RESULTS_DIR}/figures/paper/pval_sex.{format}', dpi=DPI)
Female vs. Male: Mann-Whitney-Wilcoxon test two-sided, P_val:9.307e-02 U_stat=2.900e+01
Female vs. Male: Mann-Whitney-Wilcoxon test two-sided, P_val:1.320e-01 U_stat=2.800e+01
No description has been provided for this image
InĀ [27]:
species = 'Sutterella'

df_all = pd.concat([df_normpipe_retained_species, df_normpipe_discarded_normplus, df_normpipe_discarded_common]).set_index('name')

yHC = np.log10(df_all.loc[species][['HC1', 'HC2', 'HC3', 'HC4']].astype(float).values + 1)
ySP = np.log10(df_all.loc[species][['SP1', 'SP2', 'SP3', 'SP4']].astype(float).values + 1)
yRR = np.log10(df_all.loc[species][['RR1', 'RR2', 'RR3', 'RR4']].astype(float).values + 1)
yCTRL = np.log10(df_all.loc[species][['ACIDOLA', 'BLACTIS']].astype(float).values + 1)

fig, ax = plt.subplots(1, 1)
plot_ax_4(ax, yHC, ySP, yRR, yCTRL, species)


display(df_normpipe_discarded_common[df_normpipe_discarded_common['name'] == species])
display(df_normpipe_discarded_normplus[df_normpipe_discarded_normplus['name'] == species])
display(df_normpipe_retained_species[df_normpipe_retained_species['name'] == species])


display(df_pval_HCvsRR[df_pval_HCvsRR['name'] == species])
display(df_pval_HCvsSP[df_pval_HCvsSP['name'] == species])
display(df_pval_RRvsSP[df_pval_RRvsSP['name'] == species])
display(df_pval_sex[df_pval_sex['name'] == species])
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice
  ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
taxonomy_id name lineage RR1 RR2 RR3 RR4 SP1 SP2 SP3 SP4 HC1 HC2 HC3 HC4 ACIDOLA BLACTIS n_samples_flag selected_flag
taxonomy_id name lineage RR1 RR2 RR3 RR4 SP1 SP2 SP3 SP4 HC1 HC2 HC3 HC4 ACIDOLA BLACTIS n_samples_flag selected_flag
taxonomy_id name lineage RR1 RR2 RR3 RR4 SP1 SP2 SP3 SP4 HC1 HC2 HC3 HC4 ACIDOLA BLACTIS n_samples_flag selected_flag
67 40544 Sutterella Sutterella;Sutterellaceae;Burkholderiales;Beta... 25.817 18.048 135.433 33.415 39.716 63923.175 151.334 146838.149 196.681 32.478 25.354 249.184 NaN NaN 2 True
taxonomy_id name lineage RR1 RR2 RR3 RR4 SP1 SP2 SP3 SP4 HC1 HC2 HC3 HC4 ACIDOLA BLACTIS n_samples_flag selected_flag log2FC pval_MW pval_MW_corrected
34 40544 Sutterella Sutterella;Sutterellaceae;Burkholderiales;Beta... 25.817 18.048 135.433 33.415 39.716 63923.175 151.334 146838.149 196.681 32.478 25.354 249.184 NaN NaN 2 True 1.952 0.486 0.954
taxonomy_id name lineage RR1 RR2 RR3 RR4 SP1 SP2 SP3 SP4 HC1 HC2 HC3 HC4 ACIDOLA BLACTIS n_samples_flag selected_flag log2FC pval_MW pval_MW_corrected
36 40544 Sutterella Sutterella;Sutterellaceae;Burkholderiales;Beta... 25.817 18.048 135.433 33.415 39.716 63923.175 151.334 146838.149 196.681 32.478 25.354 249.184 NaN NaN 2 True -8.127 0.343 0.720
taxonomy_id name lineage RR1 RR2 RR3 RR4 SP1 SP2 SP3 SP4 HC1 HC2 HC3 HC4 ACIDOLA BLACTIS n_samples_flag selected_flag log2FC pval_MW pval_MW_corrected
16 40544 Sutterella Sutterella;Sutterellaceae;Burkholderiales;Beta... 25.483 17.673 136.252 34.105 42.397 85481.707 163.082 182976.731 190.430 35.876 35.654 236.958 NaN NaN 2 True -10.489 0.057 0.276
taxonomy_id name lineage Male3 Male4 Female3 Female4 Male5 Male6 Female5 Female6 Male1 Male2 Female1 Female2 ACIDOLA BLACTIS n_samples_flag selected_flag log2FC pval_MW pval_MW_corrected
12 40544 Sutterella Sutterella;Sutterellaceae;Burkholderiales;Beta... 25.817 18.048 135.433 33.415 39.716 63923.175 151.334 146838.149 196.681 32.478 25.354 249.184 NaN NaN 2 True -1.990 0.485 1.000
No description has been provided for this image