InĀ [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy.stats import mannwhitneyu, wilcoxon
from statsmodels.stats.multitest import multipletests
from statannotations.Annotator import Annotator
InĀ [2]:
pd.options.display.float_format = '{:.3f}'.format
pd.set_option('display.max_columns', None)
plt.rcParams['figure.dpi']=170
DPI=250
InĀ [3]:
from list_vars import LIST_PROFILERS, DIR_FIGURES, RESULTS_DIR, POOLS, CONTROLS
Biological sample analysis (part 2)¶
In this notebook we are going to do the analysis on the biological samples (POOL samples + controls).
One thing we want to check is how S and mode affect the robustness of our results. So, to do that we are going to compare the results using a comparative plot. The comparativ plot shows how many new variable are shown compared to the previous case. So, we can do a comparative increase on mode by fixing S, or on S by fixing the mode.
This analysis can be performed with many variables, and we are going to choose the following:
- The number of detected species, across all samples and per individual sample (after NORM+ and cutting of species with more than 65% NaNs).
- The number of detected differentially abundant species across the 4 comparisons.
With this in mind, we can later select one S and one mode and do the following analyses.
- The importance of including the biological control samples to ensure that false positives are not considered.
- The importance of normalizing the reads considering the biogical samples.
- Plot the significantly differentially abundant species.
Observing the effect of $S$ on species retention for biological controls¶
InĀ [4]:
rename_mapping = {
'POOL1': 'RR1', 'POOL2': 'RR2', 'POOL3': 'RR3', 'POOL4': 'RR4',
'POOL5': 'SP1', 'POOL6': 'SP2', 'POOL7': 'SP3', 'POOL8': 'SP4',
'POOL9': 'HC1', 'POOL10': 'HC2', 'POOL11': 'HC3', 'POOL12': 'HC4',
}
InĀ [5]:
dict_retained_discarded_species = {'mode': [], 'S': [], 'sample': [], 'retained': [], 'discarded': []}
for mode in [3, 5, 7]:
for S in [0, 1, 2, 3, 4, 5, 6, 7, 10, 15]:
for sample in [f'POOL{i}' for i in range(1, 13)] + ['ACIDOLA', 'BLACTIS']:
flags_file = f'{RESULTS_DIR}/summary/{sample}_pass2_mode{mode}_taxgenus_S{S}_NORM+.flags.tsv'
df_flags = pd.read_csv(flags_file, sep='\t').set_index('taxonomy_id')[['name', 'lineage', 'mean_norm']]
dict_retained_discarded_species['mode'].append(mode)
dict_retained_discarded_species['S'].append(S)
dict_retained_discarded_species['sample'].append(rename_mapping[sample] if 'POOL' in sample else sample)
dict_retained_discarded_species['retained'].append((df_flags['mean_norm'] == False).sum())
dict_retained_discarded_species['discarded'].append((df_flags['mean_norm'] == True).sum())
df_retained_discarded_species = pd.DataFrame(dict_retained_discarded_species)
df_retained_discarded_species['total'] = df_retained_discarded_species['retained'] + df_retained_discarded_species['discarded']
df_retained_discarded_species['percentage'] = 100 * df_retained_discarded_species['retained'] / df_retained_discarded_species['total']
InĀ [6]:
df_retained_discarded_species
Out[6]:
| mode | S | sample | retained | discarded | total | percentage | |
|---|---|---|---|---|---|---|---|
| 0 | 3 | 0 | RR1 | 16 | 1433 | 1449 | 1.104 |
| 1 | 3 | 0 | RR2 | 30 | 1778 | 1808 | 1.659 |
| 2 | 3 | 0 | RR3 | 28 | 1927 | 1955 | 1.432 |
| 3 | 3 | 0 | RR4 | 28 | 1923 | 1951 | 1.435 |
| 4 | 3 | 0 | SP1 | 6 | 1913 | 1919 | 0.313 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 415 | 7 | 15 | HC2 | 67 | 2072 | 2139 | 3.132 |
| 416 | 7 | 15 | HC3 | 110 | 1978 | 2088 | 5.268 |
| 417 | 7 | 15 | HC4 | 77 | 2183 | 2260 | 3.407 |
| 418 | 7 | 15 | ACIDOLA | 3 | 665 | 668 | 0.449 |
| 419 | 7 | 15 | BLACTIS | 2 | 251 | 253 | 0.791 |
420 rows Ć 7 columns
InĀ [Ā ]:
InĀ [7]:
df_total_counts = df_retained_discarded_species.groupby(['mode', 'sample'])['total'].max().unstack(level=0)
df_total_counts['increment_3_5'] = df_total_counts[5] / df_total_counts[3]
df_total_counts['increment_3_7'] = df_total_counts[7] / df_total_counts[3]
df_total_counts
Out[7]:
| mode | 3 | 5 | 7 | increment_3_5 | increment_3_7 |
|---|---|---|---|---|---|
| sample | |||||
| ACIDOLA | 563 | 603 | 668 | 1.071 | 1.187 |
| BLACTIS | 163 | 194 | 253 | 1.190 | 1.552 |
| HC1 | 2087 | 2129 | 2186 | 1.020 | 1.047 |
| HC2 | 1995 | 2054 | 2139 | 1.030 | 1.072 |
| HC3 | 1956 | 1996 | 2088 | 1.020 | 1.067 |
| HC4 | 2160 | 2204 | 2260 | 1.020 | 1.046 |
| RR1 | 1449 | 1527 | 1622 | 1.054 | 1.119 |
| RR2 | 1808 | 1871 | 1955 | 1.035 | 1.081 |
| RR3 | 1955 | 2000 | 2081 | 1.023 | 1.064 |
| RR4 | 1951 | 1999 | 2076 | 1.025 | 1.064 |
| SP1 | 1919 | 1982 | 2038 | 1.033 | 1.062 |
| SP2 | 2235 | 2282 | 2344 | 1.021 | 1.049 |
| SP3 | 1934 | 1982 | 2061 | 1.025 | 1.066 |
| SP4 | 2314 | 2354 | 2408 | 1.017 | 1.041 |
InĀ [8]:
df_retained_discarded_species
Out[8]:
| mode | S | sample | retained | discarded | total | percentage | |
|---|---|---|---|---|---|---|---|
| 0 | 3 | 0 | RR1 | 16 | 1433 | 1449 | 1.104 |
| 1 | 3 | 0 | RR2 | 30 | 1778 | 1808 | 1.659 |
| 2 | 3 | 0 | RR3 | 28 | 1927 | 1955 | 1.432 |
| 3 | 3 | 0 | RR4 | 28 | 1923 | 1951 | 1.435 |
| 4 | 3 | 0 | SP1 | 6 | 1913 | 1919 | 0.313 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 415 | 7 | 15 | HC2 | 67 | 2072 | 2139 | 3.132 |
| 416 | 7 | 15 | HC3 | 110 | 1978 | 2088 | 5.268 |
| 417 | 7 | 15 | HC4 | 77 | 2183 | 2260 | 3.407 |
| 418 | 7 | 15 | ACIDOLA | 3 | 665 | 668 | 0.449 |
| 419 | 7 | 15 | BLACTIS | 2 | 251 | 253 | 0.791 |
420 rows Ć 7 columns
InĀ [9]:
for column, name in zip(['retained', 'percentage'], ['Retained genera', 'Retained percentage']):
# Set up the grid for 14 samples in a 4x4 layout
samples = df_retained_discarded_species['sample'].unique() # Unique samples
n_cols, n_rows = 7, 2 # Define the grid layout
# Initialize the grid layout
fig, axes = plt.subplots(n_rows, n_cols, figsize=(2*n_cols, 2*n_rows), sharex=True)
# Placeholder to collect handles and labels for the legend
handles, labels = None, None
# Loop through each sample and create its subplot
for i, sample in enumerate(samples):
row, col = divmod(i, n_cols) # Get the row and column index for the subplot
ax = axes[row, col]
# Filter the DataFrame for the current sample
sample_data = df_retained_discarded_species[df_retained_discarded_species['sample'] == sample]
# Plot retained species as a function of S with mode as hue
lineplot = sns.lineplot(
data=sample_data,
x='S',
y=column,
hue='mode',
marker='o',
ax=ax,
legend=(i == 6) # Enable legend only for the first subplot
)
ax.set_title(sample) # Title of the subplot is the sample name
ax.set_xlabel("S")
if col == 0:
ax.set_ylabel(name)
else:
ax.set_ylabel("")
ax.set_xticks([0, 2, 4, 6, 10, 15])
if i==6:
ax.legend(frameon=False, bbox_to_anchor=(1.05, 1), title='Mode')
# Set a specific Y-axis limit for the last two samples
if column == 'retained':
if i in [len(samples) - 2, len(samples) - 1]: # Last two samples
ax.set_ylim(0, 4)
# Remove extra subplots (for grid cells not needed)
for j in range(i + 1, n_rows * n_cols):
fig.delaxes(axes[j // n_cols, j % n_cols])
# Adjust the main plot area to leave space for the legend
plt.tight_layout()
for format in ['png', 'tiff']:
plt.savefig(f'{RESULTS_DIR}/figures/paper/s_effect_{column}.{format}', dpi=DPI)
plt.show()
Plotting sample detection and statistacally differential species across modes and S¶
InĀ [10]:
# Initialize an empty list to collect data rows
data_rows = []
# Iterate through modes and S values
for mode in [3, 5, 7]:
for S in [0, 1, 2, 3, 4, 5, 6, 7, 10, 15]:
# Read the data
df_normpipe_retained_species = pd.read_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORM+_retained.tsv', sep='\t')
df_normpipe_discarded_normplus = pd.read_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORM+_discarded_norm+.tsv', sep='\t')
df_normpipe_discarded_common = pd.read_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORM+_discarded_common.tsv', sep='\t')
# Get the list of samples (assuming they are column names)
samples = df_normpipe_retained_species.columns[3:]
data_rows.append({
'mode': mode,
'S': S,
'sample': 'ALL',
'count_retained': len(df_normpipe_retained_species),
'count_discarded_norm': len(df_normpipe_discarded_normplus),
'count_discarded_all': len(df_normpipe_discarded_common)
})
for sample in samples:
# Count non-NaN species for each dataframe and sample
count_retained = df_normpipe_retained_species[sample].notna().sum()
count_discarded_norm = df_normpipe_discarded_normplus[sample].notna().sum()
count_discarded_all = df_normpipe_discarded_common[sample].notna().sum()
# Collect the data as a dictionary
data_rows.append({
'mode': mode,
'S': S,
'sample': sample,
'count_retained': count_retained,
'count_discarded_norm': count_discarded_norm,
'count_discarded_all': count_discarded_all
})
# Convert the collected rows into a dataframe
df_stats_species_count = pd.DataFrame(data_rows)
df_stats_species_count['count_total'] = df_stats_species_count['count_retained'] + df_stats_species_count['count_discarded_norm'] + df_stats_species_count['count_discarded_all']
df_stats_species_count
Out[10]:
| mode | S | sample | count_retained | count_discarded_norm | count_discarded_all | count_total | |
|---|---|---|---|---|---|---|---|
| 0 | 3 | 0 | ALL | 30 | 14 | 2 | 46 |
| 1 | 3 | 0 | RR1 | 30 | 14 | 2 | 46 |
| 2 | 3 | 0 | RR2 | 30 | 14 | 2 | 46 |
| 3 | 3 | 0 | RR3 | 30 | 14 | 2 | 46 |
| 4 | 3 | 0 | RR4 | 30 | 14 | 2 | 46 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 505 | 7 | 15 | HC4 | 87 | 78 | 9 | 174 |
| 506 | 7 | 15 | ACIDOLA | 40 | 77 | 9 | 126 |
| 507 | 7 | 15 | BLACTIS | 17 | 40 | 9 | 66 |
| 508 | 7 | 15 | n_samples_flag | 87 | 78 | 9 | 174 |
| 509 | 7 | 15 | selected_flag | 87 | 78 | 9 | 174 |
510 rows Ć 7 columns
InĀ [11]:
df_stats_species_count[df_stats_species_count['sample'] == 'ALL']
Out[11]:
| mode | S | sample | count_retained | count_discarded_norm | count_discarded_all | count_total | |
|---|---|---|---|---|---|---|---|
| 0 | 3 | 0 | ALL | 30 | 14 | 2 | 46 |
| 17 | 3 | 1 | ALL | 36 | 40 | 5 | 81 |
| 34 | 3 | 2 | ALL | 58 | 52 | 6 | 116 |
| 51 | 3 | 3 | ALL | 62 | 53 | 6 | 121 |
| 68 | 3 | 4 | ALL | 62 | 54 | 6 | 122 |
| 85 | 3 | 5 | ALL | 74 | 56 | 6 | 136 |
| 102 | 3 | 6 | ALL | 84 | 59 | 6 | 149 |
| 119 | 3 | 7 | ALL | 84 | 61 | 6 | 151 |
| 136 | 3 | 10 | ALL | 94 | 62 | 6 | 162 |
| 153 | 3 | 15 | ALL | 98 | 65 | 7 | 170 |
| 170 | 5 | 0 | ALL | 26 | 14 | 3 | 43 |
| 187 | 5 | 1 | ALL | 35 | 33 | 22 | 90 |
| 204 | 5 | 2 | ALL | 40 | 33 | 22 | 95 |
| 221 | 5 | 3 | ALL | 59 | 37 | 25 | 121 |
| 238 | 5 | 4 | ALL | 66 | 38 | 25 | 129 |
| 255 | 5 | 5 | ALL | 74 | 43 | 25 | 142 |
| 272 | 5 | 6 | ALL | 74 | 43 | 25 | 142 |
| 289 | 5 | 7 | ALL | 82 | 46 | 25 | 153 |
| 306 | 5 | 10 | ALL | 90 | 49 | 27 | 166 |
| 323 | 5 | 15 | ALL | 96 | 54 | 28 | 178 |
| 340 | 7 | 0 | ALL | 19 | 26 | 6 | 51 |
| 357 | 7 | 1 | ALL | 33 | 46 | 7 | 86 |
| 374 | 7 | 2 | ALL | 50 | 59 | 7 | 116 |
| 391 | 7 | 3 | ALL | 52 | 62 | 8 | 122 |
| 408 | 7 | 4 | ALL | 63 | 66 | 9 | 138 |
| 425 | 7 | 5 | ALL | 70 | 69 | 9 | 148 |
| 442 | 7 | 6 | ALL | 75 | 71 | 9 | 155 |
| 459 | 7 | 7 | ALL | 75 | 71 | 9 | 155 |
| 476 | 7 | 10 | ALL | 79 | 72 | 9 | 160 |
| 493 | 7 | 15 | ALL | 87 | 78 | 9 | 174 |
InĀ [12]:
# Initialize an empty list to collect data rows
data_rows = []
# Iterate through modes and S values
for mode in [3, 5, 7]:
for S in [0, 1, 2, 3, 4, 5, 6, 7, 10, 15]:
# Read the data
df_pval_HCvsRR = pd.read_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_HCvsRR.tsv', sep='\t')
df_pval_HCvsSP = pd.read_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_HCvsSP.tsv', sep='\t')
df_pval_RRvsSP = pd.read_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_RRvsSP.tsv', sep='\t')
df_pval_sex = pd.read_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_sex.tsv', sep='\t')
# Get the list of samples (assuming they are column names)
samples = df_normpipe_retained_species.columns[3:]
data_rows.append({
'mode': mode,
'S': S,
'count_HCvsRR': len(df_pval_HCvsRR[df_pval_HCvsRR['pval_MW'] < 0.05]),
'count_HCvsSP': len(df_pval_HCvsSP[df_pval_HCvsSP['pval_MW'] < 0.05]),
'count_RRvsSP': len(df_pval_RRvsSP[df_pval_RRvsSP['pval_MW'] < 0.05]),
'count_sex': len(df_pval_sex[df_pval_sex['pval_MW'] < 0.05]),
'species_HCvsRR': df_pval_HCvsRR[df_pval_HCvsRR['pval_MW'] < 0.05]['name'].values.tolist(),
'species_HCvsSP': df_pval_HCvsSP[df_pval_HCvsSP['pval_MW'] < 0.05]['name'].values.tolist(),
'species_RRvsSP': df_pval_RRvsSP[df_pval_RRvsSP['pval_MW'] < 0.05]['name'].values.tolist(),
'species_sex': df_pval_sex[df_pval_sex['pval_MW'] < 0.05]['name'].values.tolist()
})
# Convert the collected rows into a dataframe
df_stats_species_diffabundance = pd.DataFrame(data_rows)
df_stats_species_diffabundance['count_total'] = df_stats_species_diffabundance['count_HCvsRR'] + df_stats_species_diffabundance['count_HCvsSP'] + \
df_stats_species_diffabundance['count_RRvsSP'] + df_stats_species_diffabundance['count_sex']
df_stats_species_diffabundance
Out[12]:
| mode | S | count_HCvsRR | count_HCvsSP | count_RRvsSP | count_sex | species_HCvsRR | species_HCvsSP | species_RRvsSP | species_sex | count_total | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3 | 0 | 0 | 1 | 2 | 0 | [] | [Colletotrichum] | [Dictyostelium, Tepidimonas] | [] | 3 |
| 1 | 3 | 1 | 0 | 1 | 3 | 0 | [] | [Colletotrichum] | [Dictyostelium, Tepidimonas, Brevundimonas] | [] | 4 |
| 2 | 3 | 2 | 0 | 1 | 5 | 0 | [] | [Colletotrichum] | [Dictyostelium, Tepidimonas, Brevundimonas, Ro... | [] | 6 |
| 3 | 3 | 3 | 0 | 1 | 5 | 0 | [] | [Colletotrichum] | [Dictyostelium, Tepidimonas, Brevundimonas, Ro... | [] | 6 |
| 4 | 3 | 4 | 0 | 1 | 5 | 0 | [] | [Colletotrichum] | [Dictyostelium, Tepidimonas, Brevundimonas, Ro... | [] | 6 |
| 5 | 3 | 5 | 0 | 2 | 6 | 0 | [] | [Colletotrichum, Xanthomonas] | [Dictyostelium, Tepidimonas, Brevundimonas, Xa... | [] | 8 |
| 6 | 3 | 6 | 0 | 2 | 7 | 0 | [] | [Colletotrichum, Xanthomonas] | [Dictyostelium, Tepidimonas, Brevundimonas, Fl... | [] | 9 |
| 7 | 3 | 7 | 0 | 2 | 7 | 0 | [] | [Colletotrichum, Xanthomonas] | [Dictyostelium, Tepidimonas, Brevundimonas, Fl... | [] | 9 |
| 8 | 3 | 10 | 0 | 2 | 7 | 0 | [] | [Colletotrichum, Xanthomonas] | [Dictyostelium, Tepidimonas, Brevundimonas, Fl... | [] | 9 |
| 9 | 3 | 15 | 0 | 2 | 8 | 0 | [] | [Colletotrichum, Xanthomonas] | [Dictyostelium, Tepidimonas, Brevundimonas, Ca... | [] | 10 |
| 10 | 5 | 0 | 0 | 0 | 1 | 0 | [] | [] | [Sphingobium] | [] | 1 |
| 11 | 5 | 1 | 0 | 0 | 1 | 0 | [] | [] | [Dictyostelium] | [] | 1 |
| 12 | 5 | 2 | 0 | 0 | 2 | 0 | [] | [] | [Dictyostelium, Microlunatus] | [] | 2 |
| 13 | 5 | 3 | 0 | 0 | 5 | 0 | [] | [] | [Dictyostelium, Microlunatus, Aquirhabdus, Ros... | [] | 5 |
| 14 | 5 | 4 | 0 | 1 | 7 | 0 | [] | [Xanthomonas] | [Dictyostelium, Aquirhabdus, Flavobacterium, L... | [] | 8 |
| 15 | 5 | 5 | 0 | 1 | 8 | 0 | [] | [Xanthomonas] | [Dictyostelium, Aquirhabdus, Rhizopus, Lysobac... | [] | 9 |
| 16 | 5 | 6 | 0 | 1 | 8 | 0 | [] | [Xanthomonas] | [Dictyostelium, Aquirhabdus, Rhizopus, Lysobac... | [] | 9 |
| 17 | 5 | 7 | 0 | 1 | 8 | 0 | [] | [Xanthomonas] | [Dictyostelium, Aquirhabdus, Rhizopus, Lysobac... | [] | 9 |
| 18 | 5 | 10 | 0 | 1 | 9 | 0 | [] | [Xanthomonas] | [Dictyostelium, Candida, Rhizopus, Lysobacter,... | [] | 10 |
| 19 | 5 | 15 | 0 | 1 | 9 | 0 | [] | [Xanthomonas] | [Dictyostelium, Candida, Rhizopus, Roseomonas,... | [] | 10 |
| 20 | 7 | 0 | 0 | 0 | 2 | 0 | [] | [] | [Dictyostelium, Parastagonospora] | [] | 2 |
| 21 | 7 | 1 | 0 | 0 | 3 | 0 | [] | [] | [Dictyostelium, Comamonas, Parastagonospora] | [] | 3 |
| 22 | 7 | 2 | 0 | 0 | 7 | 0 | [] | [] | [Dictyostelium, Parastagonospora, Roseomonas, ... | [] | 7 |
| 23 | 7 | 3 | 0 | 0 | 7 | 0 | [] | [] | [Dictyostelium, Parastagonospora, Roseomonas, ... | [] | 7 |
| 24 | 7 | 4 | 0 | 0 | 7 | 0 | [] | [] | [Dictyostelium, Parastagonospora, Roseomonas, ... | [] | 7 |
| 25 | 7 | 5 | 0 | 0 | 7 | 0 | [] | [] | [Dictyostelium, Parastagonospora, Roseomonas, ... | [] | 7 |
| 26 | 7 | 6 | 0 | 0 | 7 | 0 | [] | [] | [Dictyostelium, Parastagonospora, Roseomonas, ... | [] | 7 |
| 27 | 7 | 7 | 0 | 0 | 7 | 0 | [] | [] | [Dictyostelium, Parastagonospora, Roseomonas, ... | [] | 7 |
| 28 | 7 | 10 | 1 | 0 | 8 | 0 | [Leptolyngbya] | [] | [Dictyostelium, Parastagonospora, Roseomonas, ... | [] | 9 |
| 29 | 7 | 15 | 1 | 0 | 11 | 0 | [Leptolyngbya] | [] | [Dictyostelium, Chloroflexus, Comamonas, Rhizo... | [] | 12 |
Why normalization with biological controls is relevant¶
InĀ [13]:
mode = 3
S = 7
InĀ [14]:
df_normpipe_retained_species = pd.read_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORM+_retained.tsv', sep='\t')
df_normpipe_discarded_normplus = pd.read_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORM+_discarded_norm+.tsv', sep='\t')
df_normpipe_discarded_common = pd.read_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORM+_discarded_common.tsv', sep='\t')
InĀ [15]:
print(len(df_normpipe_retained_species), len(df_normpipe_discarded_normplus), len(df_normpipe_discarded_common))
84 61 6
InĀ [16]:
species = 'Janibacter'
display(df_normpipe_retained_species[df_normpipe_retained_species['name'] == species])
display(df_normpipe_discarded_normplus[df_normpipe_discarded_normplus['name'] == species])
display(df_normpipe_discarded_common[df_normpipe_discarded_common['name'] == species])
| taxonomy_id | name | lineage | RR1 | RR2 | RR3 | RR4 | SP1 | SP2 | SP3 | SP4 | HC1 | HC2 | HC3 | HC4 | ACIDOLA | BLACTIS | n_samples_flag | selected_flag |
|---|
| taxonomy_id | name | lineage | RR1 | RR2 | RR3 | RR4 | SP1 | SP2 | SP3 | SP4 | HC1 | HC2 | HC3 | HC4 | ACIDOLA | BLACTIS | n_samples_flag | selected_flag | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 35 | 53457 | Janibacter | Janibacter;Intrasporangiaceae;Micrococcales;Ac... | 390.927 | 349.060 | 1392.642 | 672.706 | 7238.942 | 4715.237 | 1088.757 | 881.804 | 893.698 | 1339.119 | 1358.470 | 670.537 | 2361.363 | NaN | 4 | True |
| taxonomy_id | name | lineage | RR1 | RR2 | RR3 | RR4 | SP1 | SP2 | SP3 | SP4 | HC1 | HC2 | HC3 | HC4 | ACIDOLA | BLACTIS | n_samples_flag | selected_flag |
|---|
Plotting discarded species¶
InĀ [17]:
def annotation_pval(df, ax, groups, x_col, y_col):
if len(groups) == 2:
pairs = [(groups[0], groups[1])]
annotator = Annotator(ax, pairs, data=df, x=x_col, y=y_col)
annotator.configure(test='Mann-Whitney', text_format='simple', loc='outside', line_width=0.65, fontsize=8)
annotator.apply_and_annotate()
def add_medians(df, ax, x_col, y_col, ysync=0.1, xsync=0.1):
groups = df[x_col].unique()
vals = [df[df[x_col] == group][y_col].values for group in groups]
medians = [np.nanmedian(val) for val in vals]
medianslog = [np.log10(np.nanmedian(val) + 1) for val in vals]
for i, median in enumerate(medians):
ax.text(i + xsync, medianslog[i] + ysync, f"{medians[i]:.0f}", ha='left', va='bottom', fontsize=7)
log2fc = np.log2(medians[1] / medians[0])
if log2fc > 0:
ax.text(-0.4, 5, f"log$_2$FC: {log2fc:.2f}", ha='left', va='bottom', fontsize=7)
else:
ax.text(1.5, 5, f"log$_2$FC: {log2fc:.2f}", ha='right', va='bottom', fontsize=7)
InĀ [18]:
def plot_ax_2(ax, y1, y2, color1, color2, group1, group2, species):
ax.scatter([0] * len(y1), y1, color=color1, label=group1, alpha=0.8)
ax.scatter([1] * len(y2), y2, color=color2, label=group2, alpha=0.8)
# Add the means as horizontal lines
ax.plot([0 - 0.2, 0 + 0.2], [np.log10(np.nanmedian(10 ** y1 - 1) + 1), np.log10(np.nanmedian(10 ** y1 - 1) + 1)], color=color1, lw=2)
ax.plot([1 - 0.2, 1 + 0.2], [np.log10(np.nanmedian(10 ** y2 - 1) + 1), np.log10(np.nanmedian(10 ** y2 - 1) + 1)], color=color2, lw=2)
# Add horizontal gridlines at 0.5 intervals
for y in np.arange(0, max(6, max(max(y1) + 0.5, max(y2) + 0.5)), 1): # Adjust range to match y-axis limits
ax.axhline(y, color='lightgray', linestyle='--', linewidth=0.5, zorder=0)
# Customize the x-axis
ax.set_xticks([0, 1])
ax.set_xticklabels([group1, group2])
ax.set_xlim(-0.5, 1.5)
# Set axis limits and labels
ax.set_ylim(0, max(6, max(max(y1) + 0.5, max(y2) + 0.5)))
ax.set_ylabel('log$_{10}$ counts')
# Remove the x and y axis lines
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
# Remove ticks on the y-axis
ax.yaxis.set_ticks_position('none')
ax.set_title(species, fontsize=10, pad=20)
# Set lighter grid aesthetics
ax.grid(False)
def plot_ax_4(ax, yHC, ySP, yRR, yCTRL, species):
ax.scatter([0] * len(yHC), yHC, color='#648FFF', label='HC', alpha=0.8)
ax.scatter([1] * len(ySP), ySP, color='#DC267F', label='SP', alpha=0.8)
ax.scatter([2] * len(yRR), yRR, color='#FE6100', label='RR', alpha=0.8)
ax.scatter([3] * len(yCTRL), yCTRL, color='#848484', label='CTRL', alpha=0.8)
# Add the means as horizontal lines
ax.plot([0 - 0.2, 0 + 0.2], [np.nanmean(yHC), np.nanmean(yHC)], color='#648FFF', lw=2)
ax.plot([1 - 0.2, 1 + 0.2], [np.nanmean(ySP), np.nanmean(ySP)], color='#DC267F', lw=2)
ax.plot([2 - 0.2, 2 + 0.2], [np.nanmean(yRR), np.nanmean(yRR)], color='#FE6100', lw=2)
ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
# Add horizontal gridlines at 0.5 intervals
for y in np.arange(0, max(5, max(max(yHC) + 0.5, max(ySP) + 0.5, max(yRR) + 0.5, max(yCTRL) + 0.5)), 1): # Adjust range to match y-axis limits
ax.axhline(y, color='lightgray', linestyle='--', linewidth=0.5, zorder=0)
# Customize the x-axis
ax.set_xticks([0, 1, 2, 3])
ax.set_xticklabels(['HC', 'SP', 'RR', 'CTRL'])
ax.set_xlim(-0.5, 3.5)
# Set axis limits and labels
ax.set_ylim(0, max(5, max(max(yHC) + 0.5, max(ySP) + 0.5, max(yRR) + 0.5, max(yCTRL) + 0.5)))
ax.set_ylabel('log$_{10}$ counts')
# Remove the x and y axis lines
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
# Remove ticks on the y-axis
ax.yaxis.set_ticks_position('none')
ax.set_title(species, fontsize=10)
# Set lighter grid aesthetics
ax.grid(False)
InĀ [19]:
df_normpipe_discarded_common
Out[19]:
| taxonomy_id | name | lineage | RR1 | RR2 | RR3 | RR4 | SP1 | SP2 | SP3 | SP4 | HC1 | HC2 | HC3 | HC4 | ACIDOLA | BLACTIS | n_samples_flag | selected_flag | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 561 | Escherichia | Escherichia;Enterobacteriaceae;Enterobacterale... | 1115.072 | 7312.573 | 1574.702 | 1431.427 | 2693.906 | 5329.932 | 2680.127 | 7308.049 | 10986.526 | 2538.369 | 949.367 | 3667.465 | 94128.188 | 2927.082 | 11 | True |
| 1 | 570 | Klebsiella | Klebsiella;Klebsiella/Raoultella group;Enterob... | 1049.170 | 1451.155 | 3828.523 | 2841.372 | 2759.812 | 3712.944 | 2972.702 | 6579.477 | 8212.239 | 4684.084 | 1298.011 | 3249.274 | 62221.478 | 7892.033 | 9 | True |
| 2 | 1678 | Bifidobacterium | Bifidobacterium;Bifidobacteriaceae;Bifidobacte... | 840.215 | 17712.591 | 18087.868 | 403.355 | 1531.230 | 683507.244 | 14725.521 | 13617.540 | 10692.445 | 204.798 | 421.165 | 902.409 | 2358.385 | 59416136.257 | 7 | True |
| 3 | 1578 | Lactobacillus | Lactobacillus;Lactobacillaceae;Lactobacillales... | 723.488 | 4012.419 | 525.605 | 1601.527 | 3410.966 | 3384.103 | 1667.824 | 2040.589 | 4594.691 | 1390.181 | 23001.370 | 3768.125 | 56383341.619 | 2445.995 | 9 | True |
| 4 | 613 | Serratia | Serratia;Yersiniaceae;Enterobacterales;Gammapr... | 108.323 | 265.833 | 283.792 | 1006.155 | 1887.851 | 7829.756 | 458.064 | 16502.561 | 622.139 | 5766.380 | 281.936 | 1006.653 | 39416.996 | 1005.002 | 3 | True |
| 5 | 2048137 | Agathobaculum | Agathobaculum;Butyricicoccaceae;Eubacteriales;... | 16.241 | 2.176 | 3777.769 | 16.844 | 11.008 | 92358.913 | 2486.322 | 999.148 | 804.799 | 23.261 | 14.330 | 150.199 | 7810.561 | 130.633 | 3 | True |
InĀ [20]:
def plot_species(df, ncols_max=6):
nplots = len(df)
nrows = nplots//ncols_max
nrows += (nplots % ncols_max != 0)
if nplots:
# Create the figure and axis
fig, axs = plt.subplots(nrows, ncols_max, figsize=(2 * ncols_max, 2 * nrows))
for i, species in enumerate(df.index):
ax = axs.ravel()[i] if nplots > 1 else axs
yHC = np.log10(df.loc[species][['HC1', 'HC2', 'HC3', 'HC4']].astype(float).values + 1)
ySP = np.log10(df.loc[species][['SP1', 'SP2', 'SP3', 'SP4']].astype(float).values + 1)
yRR = np.log10(df.loc[species][['RR1', 'RR2', 'RR3', 'RR4']].astype(float).values + 1)
yCTRL = np.log10(df.loc[species][['ACIDOLA', 'BLACTIS']].astype(float).values + 1)
plot_ax_4(ax, yHC, ySP, yRR, yCTRL, species)
df_annot = pd.DataFrame({'group': ['HC'] * 4 + ['SP'] * 4 + ['RR'] * 4 + ['CTRL'] * 2,
'exp': df.loc[species, ['HC1', 'HC2', 'HC3', 'HC4', 'SP1', 'SP2', 'SP3', 'SP4', 'RR1', 'RR2', 'RR3', 'RR4', 'ACIDOLA', 'BLACTIS']].astype(float).values})
# add_medians(df_annot, ax, x_col='group', y_col='exp', xsync=0.15, ysync=0)
if nplots % ncols_max:
for i in range(nplots, nrows * ncols_max ):
axs.ravel()[i].axis('off')
plt.tight_layout()
InĀ [21]:
plot_species(df_normpipe_discarded_common.copy().set_index('name'), ncols_max=6)
for format in ['png', 'tiff']:
plt.savefig(f'{RESULTS_DIR}/figures/paper/l2fc_NORM+x_discarded.{format}', dpi=DPI)
plot_species(df_normpipe_discarded_normplus.copy().set_index('name'), ncols_max=7)
for format in ['png', 'tiff']:
plt.savefig(f'{RESULTS_DIR}/figures/paper/l2fc_NORMx_discarded.{format}', dpi=DPI)
plot_species(df_normpipe_retained_species.copy().set_index('name'), ncols_max=8)
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2) /tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
Plotting differential species¶
InĀ [22]:
def plot_diff_sopecies(df_pval, cols_A, cols_B, color1, color2, nrows=2, other_labels = None):
ncols = len(df_pval) // nrows
if other_labels is None:
g1, g2 = cols_A[0][:-1], cols_B[0][:-1]
if ncols:
# Create the figure and axis
fig, axs = plt.subplots(nrows, ncols, figsize=(2 * ncols, nrows * 2))
for i in range(len(df_pval)):
df_i = df_pval.iloc[i]
ax = axs.ravel()[i] if ncols > 1 else axs
y1 = np.log10(df_i[cols_A].astype(float).values + 1)
y2 = np.log10(df_i[cols_B].astype(float).values + 1)
species = df_i['name']
df_annot = pd.DataFrame({'group': [i[:-1] for i in cols_A + cols_B], 'exp': df_i[cols_A + cols_B].astype(float).values})
plot_ax_2(ax, y1=y1, y2=y2, color1=color1, color2=color2, group1=g1, group2=g2, species=species)
annotation_pval(df_annot, ax, [g1, g2], x_col='group', y_col='exp')
add_medians(df_annot, ax, x_col='group', y_col='exp', xsync=0.15, ysync=0)
plt.tight_layout()
InĀ [23]:
# Plot HC vs RR
df_pval_HCvsRR = pd.read_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_HCvsRR.tsv', sep='\t')
df_pval_HCvsRR_up = df_pval_HCvsRR[(df_pval_HCvsRR['pval_MW'] < 0.15) & (df_pval_HCvsRR['log2FC'] > 0)].sort_values(by='log2FC').head()
df_pval_HCvsRR_down = df_pval_HCvsRR[(df_pval_HCvsRR['pval_MW'] < 0.15) & (df_pval_HCvsRR['log2FC'] <= 0)].sort_values(by='log2FC').tail()
df_pval_HCvsRR_updown = pd.concat([df_pval_HCvsRR_up, df_pval_HCvsRR_down])
df_pval_HCvsRR_updown = df_pval_HCvsRR_updown.iloc[:len(df_pval_HCvsRR_updown) - len(df_pval_HCvsRR_updown) % 2]
ncols = len(df_pval_HCvsRR_updown)
plot_diff_sopecies(df_pval_HCvsRR_updown, ['HC1', 'HC2', 'HC3', 'HC4'], ['RR1', 'RR2', 'RR3', 'RR4'], "#648FFF", "#FE6100", nrows=1)
for format in ['png', 'tiff']:
plt.savefig(f'{RESULTS_DIR}/figures/paper/l2fc_HCvRR.{format}', dpi=DPI)
df_pval_HCvsRR_up = df_pval_HCvsRR[df_pval_HCvsRR['pval_MW'] < 0.05].sort_values(by='log2FC').head()
df_pval_HCvsRR_down = df_pval_HCvsRR[df_pval_HCvsRR['pval_MW'] < 0.05].sort_values(by='log2FC').tail()
df_pval_HCvsRR_updown = pd.concat([df_pval_HCvsRR_up, df_pval_HCvsRR_down])
ncols = len(df_pval_HCvsRR_updown)
plot_diff_sopecies(df_pval_HCvsRR_updown, ['HC1', 'HC2', 'HC3', 'HC4'], ['RR1', 'RR2', 'RR3', 'RR4'], "#648FFF", "#FE6100", nrows=1)
for format in ['png', 'tiff']:
plt.savefig(f'{RESULTS_DIR}/figures/paper/pval_HCvRR.{format}', dpi=DPI)
HC vs. RR: Mann-Whitney-Wilcoxon test two-sided, P_val:5.714e-02 U_stat=1.500e+01 HC vs. RR: Mann-Whitney-Wilcoxon test two-sided, P_val:1.143e-01 U_stat=1.400e+01 HC vs. RR: Mann-Whitney-Wilcoxon test two-sided, P_val:1.143e-01 U_stat=1.400e+01 HC vs. RR: Mann-Whitney-Wilcoxon test two-sided, P_val:1.143e-01 U_stat=1.400e+01
InĀ [24]:
# Plot HC vs SP
df_pval_HCvsSP = pd.read_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_HCvsSP.tsv', sep='\t')
df_pval_HCvsSP_up = df_pval_HCvsSP[(df_pval_HCvsSP['pval_MW'] < 0.15) & (df_pval_HCvsSP['log2FC'] > 0)].sort_values(by='log2FC').head()
df_pval_HCvsSP_down = df_pval_HCvsSP[(df_pval_HCvsSP['pval_MW'] < 0.15) & (df_pval_HCvsSP['log2FC'] <= 0)].sort_values(by='log2FC').tail()
df_pval_HCvsSP_updown = pd.concat([df_pval_HCvsSP_up, df_pval_HCvsSP_down])
df_pval_HCvsSP_updown = df_pval_HCvsSP_updown.iloc[:len(df_pval_HCvsSP_updown) - len(df_pval_HCvsSP_updown) % 2]
ncols = len(df_pval_HCvsSP_updown)
plot_diff_sopecies(df_pval_HCvsSP_updown, ['HC1', 'HC2', 'HC3', 'HC4'], ['SP1', 'SP2', 'SP3', 'SP4'], "#648FFF", "#DC267F", nrows=1)
for format in ['png', 'tiff']:
plt.savefig(f'{RESULTS_DIR}/figures/paper/l2fc_HCvSP.{format}', dpi=DPI)
df_pval_HCvsSP_up = df_pval_HCvsSP[df_pval_HCvsSP['pval_MW'] < 0.05].sort_values(by='log2FC').head()
ncols = len(df_pval_HCvsSP_up)
plot_diff_sopecies(df_pval_HCvsSP_up, ['HC1', 'HC2', 'HC3', 'HC4'], ['SP1', 'SP2', 'SP3', 'SP4'], "#648FFF", "#DC267F", nrows=1)
for format in ['png', 'tiff']:
plt.savefig(f'{RESULTS_DIR}/figures/paper/pval_HCvsSP.{format}', dpi=DPI)
HC vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:2.857e-02 U_stat=1.600e+01 HC vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:1.143e-01 U_stat=1.400e+01 HC vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:1.143e-01 U_stat=2.000e+00 HC vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:1.143e-01 U_stat=2.000e+00 HC vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:5.714e-02 U_stat=1.000e+00 HC vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:2.857e-02 U_stat=0.000e+00 HC vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:2.857e-02 U_stat=0.000e+00 HC vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:2.857e-02 U_stat=1.600e+01
InĀ [25]:
# # Plot RR vs SP
# df_pval_RRvsSP = pd.read_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_RRvsSP.tsv', sep='\t')
# df_pval_RRvsSP_up = df_pval_RRvsSP[(df_pval_RRvsSP['pval_MW'] < 0.15) & (df_pval_RRvsSP['log2FC'] > 0)].sort_values(by='log2FC').head(14)
# df_pval_RRvsSP_down = df_pval_RRvsSP[(df_pval_RRvsSP['pval_MW'] < 0.15) & (df_pval_RRvsSP['log2FC'] <= 0)].sort_values(by='log2FC').head(14)
# df_pval_RRvsSP_updown = pd.concat([df_pval_RRvsSP_up, df_pval_RRvsSP_down])
# df_pval_RRvsSP_updown = df_pval_RRvsSP_updown.iloc[:len(df_pval_RRvsSP_updown) - len(df_pval_RRvsSP_updown) % 2]
# ncols = len(df_pval_RRvsSP_updown) // 3
# plot_diff_sopecies(df_pval_RRvsSP_updown, ['RR1', 'RR2', 'RR3', 'RR4'], ['SP1', 'SP2', 'SP3', 'SP4'], "#FE6100", "#DC267F", nrows=3)
# for format in ['png', 'tiff']:
# plt.savefig(f'{RESULTS_DIR}/figures/paper/l2fc_RRvsSP.{format}', dpi=DPI)
df_pval_RRvsSP_up = df_pval_RRvsSP[df_pval_RRvsSP['pval_MW'] < 0.05].sort_values(by='log2FC').head(16)
ncols = len(df_pval_RRvsSP_up)
plot_diff_sopecies(df_pval_RRvsSP_up, ['RR1', 'RR2', 'RR3', 'RR4'], ['SP1', 'SP2', 'SP3', 'SP4'], "#FE6100", "#DC267F", nrows=1)
for format in ['png', 'tiff']:
plt.savefig(f'{RESULTS_DIR}/figures/paper/pval_RRvsSP.{format}', dpi=DPI)
RR vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:2.857e-02 U_stat=0.000e+00 RR vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:2.857e-02 U_stat=0.000e+00 RR vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:2.857e-02 U_stat=0.000e+00 RR vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:2.857e-02 U_stat=0.000e+00 RR vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:2.857e-02 U_stat=0.000e+00 RR vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:2.857e-02 U_stat=0.000e+00 RR vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:2.857e-02 U_stat=0.000e+00 RR vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:2.857e-02 U_stat=1.600e+01 RR vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:2.857e-02 U_stat=1.600e+01 RR vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:2.857e-02 U_stat=1.600e+01 RR vs. SP: Mann-Whitney-Wilcoxon test two-sided, P_val:2.857e-02 U_stat=1.600e+01
InĀ [26]:
# Plot sex
df_pval_sex = pd.read_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_sex.tsv', sep='\t')
df_pval_sex = df_pval_sex.rename(columns={'HC1':'Male1', 'HC2':'Male2', 'RR1':'Male3', 'RR2':'Male4', 'SP1':'Male5', 'SP2':'Male6', 'HC3': 'Female1', 'HC4': 'Female2', 'RR3': 'Female3', 'RR4': 'Female4', 'SP3': 'Female5', 'SP4': 'Female6'})
df_pval_sex_up = df_pval_sex[(df_pval_sex['pval_MW'] < 0.15) & (df_pval_sex['log2FC'] > 0)].sort_values(by='log2FC').head(10)
df_pval_sex_down = df_pval_sex[(df_pval_sex['pval_MW'] < 0.15) & (df_pval_sex['log2FC'] <= 0)].sort_values(by='log2FC').tail(10)
df_pval_sex_updown = pd.concat([df_pval_sex_up, df_pval_sex_down])
df_pval_sex_updown = df_pval_sex_updown.iloc[:len(df_pval_sex_updown) - len(df_pval_sex_updown) % 2]
ncols = len(df_pval_sex_updown)
plot_diff_sopecies(df_pval_sex_updown, [f'Female{i}' for i in range(1,7)], [f'Male{i}' for i in range(1,7)], "#785EF0", "#FFB000", nrows=1)
for format in ['png', 'tiff']:
plt.savefig(f'{RESULTS_DIR}/figures/paper/l2fc_sex.{format}', dpi=DPI)
df_pval_sex_up = df_pval_sex[df_pval_sex['pval_MW'] < 0.05].sort_values(by='log2FC').head()
ncols = len(df_pval_sex_up)
plot_diff_sopecies(df_pval_sex_up, [f'Male{i}' for i in range(1,7)], [f'Female{i}' for i in range(1,7)], "#785EF0", "#FFB000", nrows=1)
for format in ['png', 'tiff']:
plt.savefig(f'{RESULTS_DIR}/figures/paper/pval_sex.{format}', dpi=DPI)
Female vs. Male: Mann-Whitney-Wilcoxon test two-sided, P_val:9.307e-02 U_stat=2.900e+01 Female vs. Male: Mann-Whitney-Wilcoxon test two-sided, P_val:1.320e-01 U_stat=2.800e+01
InĀ [27]:
species = 'Sutterella'
df_all = pd.concat([df_normpipe_retained_species, df_normpipe_discarded_normplus, df_normpipe_discarded_common]).set_index('name')
yHC = np.log10(df_all.loc[species][['HC1', 'HC2', 'HC3', 'HC4']].astype(float).values + 1)
ySP = np.log10(df_all.loc[species][['SP1', 'SP2', 'SP3', 'SP4']].astype(float).values + 1)
yRR = np.log10(df_all.loc[species][['RR1', 'RR2', 'RR3', 'RR4']].astype(float).values + 1)
yCTRL = np.log10(df_all.loc[species][['ACIDOLA', 'BLACTIS']].astype(float).values + 1)
fig, ax = plt.subplots(1, 1)
plot_ax_4(ax, yHC, ySP, yRR, yCTRL, species)
display(df_normpipe_discarded_common[df_normpipe_discarded_common['name'] == species])
display(df_normpipe_discarded_normplus[df_normpipe_discarded_normplus['name'] == species])
display(df_normpipe_retained_species[df_normpipe_retained_species['name'] == species])
display(df_pval_HCvsRR[df_pval_HCvsRR['name'] == species])
display(df_pval_HCvsSP[df_pval_HCvsSP['name'] == species])
display(df_pval_RRvsSP[df_pval_RRvsSP['name'] == species])
display(df_pval_sex[df_pval_sex['name'] == species])
/tmp/ipykernel_176357/464872998.py:46: RuntimeWarning: Mean of empty slice ax.plot([3 - 0.2, 3 + 0.2], [np.nanmean(yCTRL), np.nanmean(yCTRL)], color='#848484', lw=2)
| taxonomy_id | name | lineage | RR1 | RR2 | RR3 | RR4 | SP1 | SP2 | SP3 | SP4 | HC1 | HC2 | HC3 | HC4 | ACIDOLA | BLACTIS | n_samples_flag | selected_flag |
|---|
| taxonomy_id | name | lineage | RR1 | RR2 | RR3 | RR4 | SP1 | SP2 | SP3 | SP4 | HC1 | HC2 | HC3 | HC4 | ACIDOLA | BLACTIS | n_samples_flag | selected_flag |
|---|
| taxonomy_id | name | lineage | RR1 | RR2 | RR3 | RR4 | SP1 | SP2 | SP3 | SP4 | HC1 | HC2 | HC3 | HC4 | ACIDOLA | BLACTIS | n_samples_flag | selected_flag | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 67 | 40544 | Sutterella | Sutterella;Sutterellaceae;Burkholderiales;Beta... | 25.817 | 18.048 | 135.433 | 33.415 | 39.716 | 63923.175 | 151.334 | 146838.149 | 196.681 | 32.478 | 25.354 | 249.184 | NaN | NaN | 2 | True |
| taxonomy_id | name | lineage | RR1 | RR2 | RR3 | RR4 | SP1 | SP2 | SP3 | SP4 | HC1 | HC2 | HC3 | HC4 | ACIDOLA | BLACTIS | n_samples_flag | selected_flag | log2FC | pval_MW | pval_MW_corrected | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 34 | 40544 | Sutterella | Sutterella;Sutterellaceae;Burkholderiales;Beta... | 25.817 | 18.048 | 135.433 | 33.415 | 39.716 | 63923.175 | 151.334 | 146838.149 | 196.681 | 32.478 | 25.354 | 249.184 | NaN | NaN | 2 | True | 1.952 | 0.486 | 0.954 |
| taxonomy_id | name | lineage | RR1 | RR2 | RR3 | RR4 | SP1 | SP2 | SP3 | SP4 | HC1 | HC2 | HC3 | HC4 | ACIDOLA | BLACTIS | n_samples_flag | selected_flag | log2FC | pval_MW | pval_MW_corrected | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 36 | 40544 | Sutterella | Sutterella;Sutterellaceae;Burkholderiales;Beta... | 25.817 | 18.048 | 135.433 | 33.415 | 39.716 | 63923.175 | 151.334 | 146838.149 | 196.681 | 32.478 | 25.354 | 249.184 | NaN | NaN | 2 | True | -8.127 | 0.343 | 0.720 |
| taxonomy_id | name | lineage | RR1 | RR2 | RR3 | RR4 | SP1 | SP2 | SP3 | SP4 | HC1 | HC2 | HC3 | HC4 | ACIDOLA | BLACTIS | n_samples_flag | selected_flag | log2FC | pval_MW | pval_MW_corrected | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 16 | 40544 | Sutterella | Sutterella;Sutterellaceae;Burkholderiales;Beta... | 25.483 | 17.673 | 136.252 | 34.105 | 42.397 | 85481.707 | 163.082 | 182976.731 | 190.430 | 35.876 | 35.654 | 236.958 | NaN | NaN | 2 | True | -10.489 | 0.057 | 0.276 |
| taxonomy_id | name | lineage | Male3 | Male4 | Female3 | Female4 | Male5 | Male6 | Female5 | Female6 | Male1 | Male2 | Female1 | Female2 | ACIDOLA | BLACTIS | n_samples_flag | selected_flag | log2FC | pval_MW | pval_MW_corrected | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 12 | 40544 | Sutterella | Sutterella;Sutterellaceae;Burkholderiales;Beta... | 25.817 | 18.048 | 135.433 | 33.415 | 39.716 | 63923.175 | 151.334 | 146838.149 | 196.681 | 32.478 | 25.354 | 249.184 | NaN | NaN | 2 | True | -1.990 | 0.485 | 1.000 |