In this notebook, we reduce the sample size so that the analysis can be run on average (commodity) infrastructure.
from __future__ import print_function
import pandas as pd
import seaborn as sns
from matplotlib import pyplot
import matplotlib
import warnings
# Ignore every warning raised for the rest of the session.
warnings.filterwarnings('ignore')
# Global seaborn theme configuration; the three calls below all mutate the
# shared matplotlib rcParams, so their order matters.
sns.set_style("darkgrid")
sns.set(font_scale=1.5)
# NOTE(review): seaborn documents `set_context(context, font_scale=1, ...)`,
# so this call presumably resets the font_scale=1.5 requested on the previous
# line -- confirm which font scaling is actually intended.
sns.set_context('paper')
# %matplotlib notebook
import pandas as pd
# Relative path prefix that is prepended to the raw input file names.
data_location = "../../data/raw/"
# For a cohort of 2749 samples (TB-portals + HTBC)
# ROWS - inclusion of INDELs leads to 96633 more data points
# cat cohort.bqsr.filter.snps.tsv | wc -l => 356154
# cat cohort.bqsr.filter.snps.indels.tsv | wc -l => 452787
# In the reduced dataset 1726 samples
# cat final.cohort.bqsr.filter.snps.tsv | wc -l => 142327
# In the final filtered dataset for 1726 samples
# head -1 final.binarized_final_monolabel_df.tsv | sed 's/[^\t]//g'| wc -c => 52684
# libraries
import numpy as np
import matplotlib.pyplot as plt
# Grouped bar chart: number of drug-resistant and drug-sensitive samples,
# with one bar per cohort (TBP, HTBC) inside each group.
plt.figure(figsize=(10, 10), dpi=100)

# Width of a single bar; the two cohorts sit side by side within a group.
barWidth = 0.25

# Bar heights, ordered as [drug resistant, drug sensitive].
bars1 = [800, 423]  # drawn with the 'TBP' label
bars2 = [338, 163]  # drawn with the 'HTBC' label

# x positions of the bar centres: the second series is shifted right by
# exactly one bar width so the bars touch without overlapping.
r1 = np.arange(len(bars1))
r2 = r1 + barWidth

plt.bar(r1, bars1, width=barWidth, edgecolor='white', label='TBP')
plt.bar(r2, bars2, width=barWidth, edgecolor='white', label='HTBC')

# Place each tick in the middle of its group, i.e. halfway between the two
# bar centres (r and r + barWidth), instead of a hand-tuned offset.
plt.xticks(r1 + barWidth / 2,
           ['Drug Resistant Samples', 'Drug Sensitive Samples'])
plt.ylabel('Number of samples')

# Create legend & show graphic.
plt.legend()
plt.show()
# Bar chart of the sizes of the four variant tables, using the file names as
# the x-axis categories.
a4_dims = (15, 10)
fig, ax = pyplot.subplots(figsize=a4_dims)
table_sizes = [
    ('cohort.bqsr.filter.snps.indels.tsv', 452787),
    ('cohort.bqsr.filter.snps.tsv', 356154),
    ('final.cohort.bqsr.filter.snps.tsv', 142327),
    ('final.binarized_final_monolabel_df.tsv', 52684),
]
table_names, table_counts = zip(*table_sizes)
sns.barplot(ax=ax, x=list(table_names), y=list(table_counts))
<AxesSubplot:>
# libraries
import numpy as np
import matplotlib.pyplot as plt
# Bar chart comparing the raw and the processed mutation datasets.
# NOTE(review): recent matplotlib releases renamed the bundled seaborn styles
# to 'seaborn-v0_8-*' -- confirm this name against the installed version.
plt.style.use('seaborn-darkgrid')
plt.figure(figsize=(10, 10), dpi=100)

barWidth = 0.25

# Heights of the two bars that are drawn.
bars1 = [142327, 52684]
# Assigned but not drawn in this figure.
bars2 = [338, 163]

# One bar per entry of bars1, centred on 0, 1, ...
r1 = np.arange(len(bars1))
plt.bar(r1, bars1, width=barWidth, edgecolor='white')

# Shared font size for tick labels and the axis label.
label_size = 14
plt.xticks(
    [0.05, 1],
    ['Raw Mutation Dataset', 'Processed Mutation Dataset'],
    fontsize=label_size,
)
plt.yticks(fontsize=label_size)
plt.ylabel('Number of columns', fontsize=label_size)

# No legend: only a single, unlabeled series is drawn.
plt.show()
# Bar chart of the sizes of the two final tables, using the file names as the
# x-axis categories.
a4_dims = (10, 10)
fig, ax = pyplot.subplots(figsize=a4_dims)
final_table_sizes = [
    ('final.cohort.bqsr.filter.snps.tsv', 142327),
    ('final.binarized_final_monolabel_df.tsv', 52684),
]
final_table_names, final_table_counts = zip(*final_table_sizes)
sns.barplot(ax=ax, x=list(final_table_names), y=list(final_table_counts))
<AxesSubplot:>
# Load the cohort TB-Profiler results for ALL TB-portals + HTBC genomes and
# transpose the table so that the SampleID becomes the row index.
tbprofiler_json_path = data_location + "cohort.tbprofiler.json"
tbprofiler_df = pd.read_json(tbprofiler_json_path, encoding="UTF-8").T
tbprofiler_df.head()
MDR | XDR | amikacin | aminoglycosides | bedaquiline | capreomycin | ciprofloxacin | clofazimine | cycloserine | delamanid | ... | levofloxacin | linezolid | main_lin | moxifloxacin | ofloxacin | para-aminosalicylic_acid | pyrazinamide | rifampicin | streptomycin | sublin | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ERR027458 | R | - | - | - | - | - | - | - | - | ... | - | - | lineage2 | - | - | - | - | rpoB_p.Ser450Leu, rpoB_p.Glu761Asp | rrs_r.517c>t | lineage2.2.1 | |
ERR027459 | - | - | - | - | - | - | - | - | ... | - | - | lineage2 | - | - | - | - | rpoB_p.Leu430Pro | - | lineage2.2.1 | ||
ERR027460 | - | - | - | - | - | - | - | - | ... | - | - | - | - | - | - | - | - | ||||
ERR027461 | - | - | - | - | - | - | - | - | ... | - | - | lineage2 | - | - | - | - | - | - | lineage2.2.1 | ||
ERR027462 | - | - | - | - | - | - | - | - | ... | - | - | lineage2 | - | - | - | - | - | - | lineage2.2.1 |
5 rows × 26 columns
# Dimensions of the TB-Profiler table as (rows, columns).
tbprofiler_df.shape
(2654, 26)
# List every column name of the TB-Profiler table.
tbprofiler_df.columns.tolist()
['MDR', 'XDR', 'amikacin', 'aminoglycosides', 'bedaquiline', 'capreomycin', 'ciprofloxacin', 'clofazimine', 'cycloserine', 'delamanid', 'drtype', 'ethambutol', 'ethionamide', 'fluoroquinolones', 'isoniazid', 'kanamycin', 'levofloxacin', 'linezolid', 'main_lin', 'moxifloxacin', 'ofloxacin', 'para-aminosalicylic_acid', 'pyrazinamide', 'rifampicin', 'streptomycin', 'sublin']
# Derive a binary `Resistance_Status` label for binary classification:
# a sample whose `drtype` is 'Sensitive' stays 'Sensitive'; every other
# `drtype` (MDR/XDR/TDR are treated as the same) becomes 'Resistant'.
# NOTE: this is an alias, not a copy -- the new column is therefore also
# added to `tbprofiler_df`.
resistance_status_df = tbprofiler_df
# Vectorised comparison over the whole column instead of one Python-level
# lambda call per row.
resistance_status_df['Resistance_Status'] = np.where(
    resistance_status_df['drtype'] == 'Sensitive', 'Sensitive', 'Resistant')
# resistance_status_df.head()
# Names of the per-drug columns of the TB-Profiler table.
drugs_column_names = [
    'rifampicin', 'isoniazid', 'pyrazinamide', 'ethambutol', 'streptomycin',
    'fluoroquinolones', 'moxifloxacin', 'ofloxacin', 'levofloxacin',
    'ciprofloxacin', 'aminoglycosides', 'amikacin', 'kanamycin',
    'capreomycin', 'ethionamide', 'para-aminosalicylic_acid', 'cycloserine',
    'linezolid', 'bedaquiline', 'clofazimine', 'delamanid',
]

# Lineage columns and resistance-summary columns.
lineage_column_names = ['main_lin', 'sublin']
resistance_status_column_names = ['drtype', 'MDR', 'XDR', 'Resistance_Status']

# Explicit "<drug>_resistance" column names, in the same order as
# `drugs_column_names`. They are derived from the drug list so that the list
# and the rename mapping below cannot drift apart.
renamed_drug_columns_names = [
    drug + '_resistance' for drug in drugs_column_names
]

# Mapping {original drug column -> "<drug>_resistance"} used for renaming.
renamed_drug_columns_names_dict = dict(
    zip(drugs_column_names, renamed_drug_columns_names))
# Rename the drug columns to their explicit "<drug>_resistance" names.
# The rename is done in place, so it modifies the existing DataFrame object
# rather than returning a new one.
resistance_status_df.rename(columns=renamed_drug_columns_names_dict,
inplace=True)
# Preview the first rows with the renamed columns.
resistance_status_df.head()
MDR | XDR | amikacin_resistance | aminoglycosides_resistance | bedaquiline_resistance | capreomycin_resistance | ciprofloxacin_resistance | clofazimine_resistance | cycloserine_resistance | delamanid_resistance | ... | linezolid_resistance | main_lin | moxifloxacin_resistance | ofloxacin_resistance | para-aminosalicylic_acid_resistance | pyrazinamide_resistance | rifampicin_resistance | streptomycin_resistance | sublin | Resistance_Status | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ERR027458 | R | - | - | - | - | - | - | - | - | ... | - | lineage2 | - | - | - | - | rpoB_p.Ser450Leu, rpoB_p.Glu761Asp | rrs_r.517c>t | lineage2.2.1 | Resistant | |
ERR027459 | - | - | - | - | - | - | - | - | ... | - | lineage2 | - | - | - | - | rpoB_p.Leu430Pro | - | lineage2.2.1 | Resistant | ||
ERR027460 | - | - | - | - | - | - | - | - | ... | - | - | - | - | - | - | - | Sensitive | ||||
ERR027461 | - | - | - | - | - | - | - | - | ... | - | lineage2 | - | - | - | - | - | - | lineage2.2.1 | Sensitive | ||
ERR027462 | - | - | - | - | - | - | - | - | ... | - | lineage2 | - | - | - | - | - | - | lineage2.2.1 | Sensitive |
5 rows × 27 columns
# Binarize the drug columns of the dataframe to focus only on the
# presence/absence of a mutation: "-" means no mutation and becomes 0, any
# other value becomes 1.
# NOTE: this is an alias, not a copy -- the drug columns of
# `resistance_status_df` are overwritten as well.
binarized_resistance_status_df = resistance_status_df
for col_name in renamed_drug_columns_names:
    # Compare by value (`==`). `is '-'` tests object identity, which only
    # matches when the interpreter happens to reuse the same string object,
    # and triggers a SyntaxWarning on Python 3.8+.
    binarized_resistance_status_df[col_name] = resistance_status_df[col_name].apply(
        lambda resistance: 0 if resistance == '-' else 1)
binarized_resistance_status_df.head()
MDR | XDR | amikacin_resistance | aminoglycosides_resistance | bedaquiline_resistance | capreomycin_resistance | ciprofloxacin_resistance | clofazimine_resistance | cycloserine_resistance | delamanid_resistance | ... | linezolid_resistance | main_lin | moxifloxacin_resistance | ofloxacin_resistance | para-aminosalicylic_acid_resistance | pyrazinamide_resistance | rifampicin_resistance | streptomycin_resistance | sublin | Resistance_Status | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ERR027458 | R | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | lineage2 | 0 | 0 | 0 | 0 | 1 | 1 | lineage2.2.1 | Resistant | |
ERR027459 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | lineage2 | 0 | 0 | 0 | 0 | 1 | 0 | lineage2.2.1 | Resistant | ||
ERR027460 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Sensitive | ||||
ERR027461 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | lineage2 | 0 | 0 | 0 | 0 | 0 | 0 | lineage2.2.1 | Sensitive | ||
ERR027462 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | lineage2 | 0 | 0 | 0 | 0 | 0 | 0 | lineage2.2.1 | Sensitive |
5 rows × 27 columns
# Dimensions of the binarized table as (rows, columns).
binarized_resistance_status_df.shape
(2654, 27)
# Preview the first rows of the `Resistance_Status` label column.
binarized_resistance_status_df[['Resistance_Status']].head()
Resistance_Status | |
---|---|
ERR027458 | Resistant |
ERR027459 | Resistant |
ERR027460 | Sensitive |
ERR027461 | Sensitive |
ERR027462 | Sensitive |
# Display the `Resistance_Status` label column for all samples.
binarized_resistance_status_df[['Resistance_Status']]
Resistance_Status | |
---|---|
ERR027458 | Resistant |
ERR027459 | Resistant |
ERR027460 | Sensitive |
ERR027461 | Sensitive |
ERR027462 | Sensitive |
ERR027463 | Resistant |
ERR027464 | Sensitive |
ERR027465 | Resistant |
ERR027466 | Sensitive |
ERR027467 | Resistant |
ERR027468 | Resistant |
ERR027469 | Sensitive |
ERR047880 | Resistant |
ERR047881 | Resistant |
ERR047882 | Resistant |
ERR047883 | Resistant |
ERR047884 | Resistant |
ERR047885 | Resistant |
ERR047886 | Resistant |
ERR047887 | Resistant |
ERR047888 | Resistant |
ERR047889 | Resistant |
ERR047890 | Resistant |
ERR047891 | Resistant |
ERR067576 | Resistant |
ERR067577 | Resistant |
ERR067578 | Resistant |
ERR067579 | Sensitive |
ERR067580 | Sensitive |
ERR067581 | Resistant |
... | ... |
SRR9738501 | Sensitive |
SRR9738503 | Resistant |
SRR9738504 | Resistant |
SRR9738505 | Resistant |
SRR9738506 | Resistant |
SRR9738508 | Resistant |
SRR9738510 | Resistant |
SRR9738512 | Resistant |
SRR9738513 | Resistant |
SRR9738514 | Resistant |
SRR9738518 | Resistant |
SRR9738519 | Resistant |
SRR9738521 | Resistant |
SRR9738523 | Resistant |
SRR9738526 | Resistant |
SRR9738527 | Resistant |
SRR9738530 | Resistant |
SRR9738535 | Resistant |
SRR9738538 | Resistant |
SRR9738541 | Resistant |
SRR9738545 | Resistant |
SRR9738546 | Resistant |
SRR9738547 | Resistant |
SRR9738549 | Resistant |
SRR9738552 | Resistant |
SRR9738553 | Resistant |
SRR9738554 | Resistant |
SRR9738556 | Resistant |
SRR9738557 | Resistant |
SRR9738558 | Resistant |
2654 rows × 1 columns
# Column-wise sum of the drug columns, returned as a {column name: total} dict.
binarized_resistance_status_df[renamed_drug_columns_names].sum().to_dict()
{'rifampicin_resistance': 1727, 'isoniazid_resistance': 1813, 'pyrazinamide_resistance': 1110, 'ethambutol_resistance': 1549, 'streptomycin_resistance': 1642, 'fluoroquinolones_resistance': 883, 'moxifloxacin_resistance': 883, 'ofloxacin_resistance': 883, 'levofloxacin_resistance': 883, 'ciprofloxacin_resistance': 883, 'aminoglycosides_resistance': 401, 'amikacin_resistance': 401, 'kanamycin_resistance': 1049, 'capreomycin_resistance': 460, 'ethionamide_resistance': 1016, 'para-aminosalicylic_acid_resistance': 163, 'cycloserine_resistance': 94, 'linezolid_resistance': 5, 'bedaquiline_resistance': 1, 'clofazimine_resistance': 1, 'delamanid_resistance': 14}
# Keep the per-drug column totals in a {column name: total} dict.
a_dict = binarized_resistance_status_df[renamed_drug_columns_names].sum().to_dict()
# sns.barplot(data=a_dict, y=a_dict.values())
# Display the dict.
a_dict
{'rifampicin_resistance': 1727, 'isoniazid_resistance': 1813, 'pyrazinamide_resistance': 1110, 'ethambutol_resistance': 1549, 'streptomycin_resistance': 1642, 'fluoroquinolones_resistance': 883, 'moxifloxacin_resistance': 883, 'ofloxacin_resistance': 883, 'levofloxacin_resistance': 883, 'ciprofloxacin_resistance': 883, 'aminoglycosides_resistance': 401, 'amikacin_resistance': 401, 'kanamycin_resistance': 1049, 'capreomycin_resistance': 460, 'ethionamide_resistance': 1016, 'para-aminosalicylic_acid_resistance': 163, 'cycloserine_resistance': 94, 'linezolid_resistance': 5, 'bedaquiline_resistance': 1, 'clofazimine_resistance': 1, 'delamanid_resistance': 14}
# Histogram of the `Resistance_Status` labels (Resistant vs Sensitive genomes).
binarized_resistance_status_df['Resistance_Status'].hist()
<AxesSubplot:>
# Collect the index labels (sample IDs) of every genome labelled "Resistant".
is_resistant = binarized_resistance_status_df['Resistance_Status'] == "Resistant"
resistant_genomes = binarized_resistance_status_df.index[is_resistant.values].tolist()
resistant_genomes
['ERR027458', 'ERR027459', 'ERR027463', 'ERR027465', 'ERR027467', 'ERR027468', 'ERR047880', 'ERR047881', 'ERR047882', 'ERR047883', 'ERR047884', 'ERR047885', 'ERR047886', 'ERR047887', 'ERR047888', 'ERR047889', 'ERR047890', 'ERR047891', 'ERR067576', 'ERR067577', 'ERR067578', 'ERR067581', 'ERR067582', 'ERR067583', 'ERR067584', 'ERR067586', 'ERR067587', 'ERR067588', 'ERR067589', 'ERR067590', 'ERR067592', 'ERR067593', 'ERR067596', 'ERR067598', 'ERR067599', 'ERR067600', 'ERR067602', 'ERR067603', 'ERR067605', 'ERR067608', 'ERR067609', 'ERR067610', 'ERR067611', 'ERR067612', 'ERR067613', 'ERR067615', 'ERR067618', 'ERR067619', 'ERR067620', 'ERR067621', 'ERR067622', 'ERR067623', 'ERR067625', 'ERR067626', 'ERR067627', 'ERR067628', 'ERR067629', 'ERR067630', 'ERR067631', 'ERR067635', 'ERR067638', 'ERR067639', 'ERR067641', 'ERR067642', 'ERR067643', 'ERR067644', 'ERR067645', 'ERR067647', 'ERR067649', 'ERR067650', 'ERR067651', 'ERR067652', 'ERR067653', 'ERR067654', 'ERR067656', 'ERR067657', 'ERR067658', 'ERR067659', 'ERR067660', 'ERR067661', 'ERR067662', 'ERR067663', 'ERR067665', 'ERR067666', 'ERR067667', 'ERR067668', 'ERR067669', 'ERR067670', 'ERR067671', 'ERR067672', 'ERR067673', 'ERR067674', 'ERR067675', 'ERR067677', 'ERR067678', 'ERR067679', 'ERR067683', 'ERR067685', 'ERR067687', 'ERR067691', 'ERR067692', 'ERR067694', 'ERR067695', 'ERR067696', 'ERR067697', 'ERR067698', 'ERR067701', 'ERR067703', 'ERR067704', 'ERR067705', 'ERR067707', 'ERR067709', 'ERR067710', 'ERR067711', 'ERR067712', 'ERR067714', 'ERR067715', 'ERR067717', 'ERR067718', 'ERR067722', 'ERR067723', 'ERR067735', 'ERR067736', 'ERR067738', 'ERR067739', 'ERR067748', 'ERR067749', 'ERR067751', 'ERR067754', 'ERR067755', 'ERR067760', 'ERR067762', 'ERR067765', 'ERR108421', 'ERR108422', 'ERR3335723', 'ERR3335724', 'ERR3335725', 'ERR3335726', 'ERR3335727', 'ERR3335728', 'ERR3335729', 'ERR3335730', 'ERR3335731', 'ERR3335732', 'ERR3335733', 'ERR3335734', 'ERR3335735', 'ERR3335736', 'ERR3335737', 'ERR3335738', 'ERR3335739', 
'ERR3335740', 'ERR3335741', 'ERR3335742', 'ERR3335743', 'ERR3335744', 'ERR3335745', 'ERR3335746', 'ERR3335747', 'ERR3335748', 'ERR3335749', 'ERR3335750', 'ERR3335751', 'ERR3335752', 'ERR3335753', 'ERR3335754', 'ERR3335755', 'ERR3335756', 'ERR3335757', 'ERR3335758', 'ERR3335759', 'ERR3335761', 'ERR3335762', 'ERR3335763', 'ERR3335764', 'ERR3335765', 'ERR3335766', 'ERR3335767', 'ERR3335768', 'ERR3335769', 'ERR3335770', 'ERR3335771', 'ERR3335772', 'ERR3335773', 'ERR3335774', 'ERR3335775', 'ERR3335776', 'ERR3335777', 'ERR3335778', 'ERR3335779', 'ERR3335780', 'ERR3335781', 'ERR3335782', 'ERR3335784', 'ERR3335785', 'ERR3335786', 'ERR3335787', 'ERR3335788', 'ERR3335789', 'ERR3335790', 'ERR3335791', 'ERR3335792', 'ERR3335793', 'ERR3335794', 'ERR3335795', 'ERR3335796', 'ERR3335797', 'ERR3335798', 'ERR3335799', 'ERR3335800', 'ERR3335801', 'ERR3335802', 'ERR3335803', 'ERR688013', 'ERR688014', 'ERR688015', 'ERR688016', 'ERR688017', 'ERR688018', 'ERR688019', 'ERR688020', 'ERR688021', 'ERR688022', 'ERR688023', 'ERR688024', 'ERR688025', 'ERR688026', 'ERR688027', 'ERR688028', 'ERR688029', 'ERR688030', 'ERR688031', 'ERR688032', 'ERR688033', 'ERR688034', 'ERR688035', 'ERR688036', 'ERR688037', 'ERR688038', 'ERR688039', 'ERR688040', 'ERR688041', 'ERR688042', 'ERR688043', 'ERR688044', 'ERR688045', 'ERR688046', 'ERR688047', 'ERR688048', 'ERR688049', 'SRR10379876', 'SRR10379877', 'SRR10379879', 'SRR10379882', 'SRR10379883', 'SRR10379886', 'SRR10379887', 'SRR10379889', 'SRR10379890', 'SRR10379894', 'SRR10379895', 'SRR10379896', 'SRR10379898', 'SRR10379899', 'SRR10379903', 'SRR10379905', 'SRR10379906', 'SRR10379908', 'SRR10379909', 'SRR10379910', 'SRR10379912', 'SRR10379913', 'SRR10379914', 'SRR10379916', 'SRR10379917', 'SRR10379918', 'SRR10379919', 'SRR10379921', 'SRR10379922', 'SRR10379927', 'SRR10379928', 'SRR10379929', 'SRR10379931', 'SRR10379932', 'SRR10379933', 'SRR10379934', 'SRR10379936', 'SRR10379939', 'SRR10379940', 'SRR10379941', 'SRR10379942', 'SRR10379943', 'SRR10379944', 
'SRR10379945', 'SRR10379948', 'SRR10379949', 'SRR10379950', 'SRR10379953', 'SRR10379955', 'SRR10379956', 'SRR10379957', 'SRR10379963', 'SRR10379964', 'SRR10379965', 'SRR10379966', 'SRR10379967', 'SRR10379969', 'SRR10379971', 'SRR10379972', 'SRR10379973', 'SRR10379974', 'SRR10379976', 'SRR10379977', 'SRR10379978', 'SRR10379979', 'SRR10379982', 'SRR10379983', 'SRR10379984', 'SRR10379985', 'SRR10379986', 'SRR10379987', 'SRR10379989', 'SRR10379990', 'SRR10379995', 'SRR10379997', 'SRR10379998', 'SRR10380002', 'SRR10380003', 'SRR10380004', 'SRR10380005', 'SRR10380007', 'SRR10380008', 'SRR10380009', 'SRR10380010', 'SRR10380011', 'SRR10380012', 'SRR10380013', 'SRR10380014', 'SRR10380015', 'SRR10380017', 'SRR10380018', 'SRR10380019', 'SRR10380020', 'SRR10380021', 'SRR10380022', 'SRR10380023', 'SRR10380024', 'SRR10380029', 'SRR10380031', 'SRR10380032', 'SRR10380033', 'SRR10380034', 'SRR10380035', 'SRR10380036', 'SRR10380037', 'SRR10380038', 'SRR10380039', 'SRR10380040', 'SRR10380041', 'SRR10380042', 'SRR10380043', 'SRR10380045', 'SRR10380046', 'SRR10380047', 'SRR10380048', 'SRR10380050', 'SRR10380051', 'SRR10380052', 'SRR10380053', 'SRR10380054', 'SRR10380055', 'SRR10380056', 'SRR10380057', 'SRR10380060', 'SRR10380061', 'SRR10380062', 'SRR10380063', 'SRR10380064', 'SRR10380066', 'SRR10380068', 'SRR10380070', 'SRR10380071', 'SRR10380072', 'SRR10380073', 'SRR10380074', 'SRR10380076', 'SRR10380077', 'SRR10380078', 'SRR10380079', 'SRR10380080', 'SRR10380081', 'SRR10380082', 'SRR10380083', 'SRR10380084', 'SRR10380085', 'SRR10380086', 'SRR10380087', 'SRR10380089', 'SRR10380092', 'SRR10380093', 'SRR10380094', 'SRR10380095', 'SRR10380096', 'SRR10380097', 'SRR10380098', 'SRR10380099', 'SRR10380101', 'SRR10380102', 'SRR10380103', 'SRR10380104', 'SRR10380105', 'SRR10380106', 'SRR10380107', 'SRR10380109', 'SRR10380110', 'SRR10380111', 'SRR10380113', 'SRR10380117', 'SRR10380118', 'SRR10380120', 'SRR10380121', 'SRR10380122', 'SRR10380123', 'SRR10380124', 'SRR10380125', 'SRR10380129', 
'SRR10380130', 'SRR10380131', 'SRR10380132', 'SRR10380135', 'SRR10380136', 'SRR10380137', 'SRR10380138', 'SRR10380139', 'SRR10380142', 'SRR10380143', 'SRR10380144', 'SRR10380145', 'SRR10380146', 'SRR10380147', 'SRR10380148', 'SRR10380149', 'SRR10380150', 'SRR10380155', 'SRR10380159', 'SRR10380160', 'SRR10380161', 'SRR10380164', 'SRR10380165', 'SRR10380166', 'SRR10380167', 'SRR10380168', 'SRR10380170', 'SRR10380171', 'SRR10380173', 'SRR10380176', 'SRR10380178', 'SRR10380184', 'SRR10380186', 'SRR10380187', 'SRR10380189', 'SRR10380194', 'SRR10380196', 'SRR10380197', 'SRR10380199', 'SRR10380201', 'SRR10380204', 'SRR10380206', 'SRR10380209', 'SRR10380210', 'SRR10380213', 'SRR10380214', 'SRR10380215', 'SRR10380216', 'SRR10380217', 'SRR10380218', 'SRR10380222', 'SRR10380226', 'SRR10380231', 'SRR10380234', 'SRR10380238', 'SRR10380240', 'SRR10380241', 'SRR10380242', 'SRR10380243', 'SRR10380245', 'SRR10380246', 'SRR10380247', 'SRR10380248', 'SRR10380249', 'SRR10380250', 'SRR10380253', 'SRR10397092', 'SRR10397093', 'SRR10397094', 'SRR10397095', 'SRR10397096', 'SRR10397097', 'SRR10397098', 'SRR10397099', 'SRR10397100', 'SRR10397101', 'SRR10397102', 'SRR10397103', 'SRR10397104', 'SRR10397105', 'SRR10397106', 'SRR10397107', 'SRR10397108', 'SRR10397109', 'SRR10397110', 'SRR10397111', 'SRR10397112', 'SRR10397114', 'SRR10397115', 'SRR10397116', 'SRR10397117', 'SRR10397118', 'SRR10397119', 'SRR10397120', 'SRR10397121', 'SRR10397122', 'SRR10397123', 'SRR10397124', 'SRR10397125', 'SRR10397126', 'SRR10397128', 'SRR10397129', 'SRR10397130', 'SRR10397131', 'SRR10397132', 'SRR10397133', 'SRR10397134', 'SRR10397135', 'SRR10397136', 'SRR10397137', 'SRR10397138', 'SRR10397139', 'SRR10397140', 'SRR10397141', 'SRR10397142', 'SRR10397143', 'SRR10397144', 'SRR10397145', 'SRR10397149', 'SRR10397150', 'SRR10397151', 'SRR10397152', 'SRR10397153', 'SRR10397154', 'SRR10397155', 'SRR10397156', 'SRR10397157', 'SRR10397158', 'SRR10397159', 'SRR10397160', 'SRR10397161', 'SRR10397162', 'SRR10397164', 
'SRR10397165', 'SRR10397166', 'SRR10397167', 'SRR10397168', 'SRR10397170', 'SRR10397173', 'SRR10397174', 'SRR10397175', 'SRR10397176', 'SRR10397177', 'SRR10397178', 'SRR10397179', 'SRR10397180', 'SRR10397181', 'SRR10397182', 'SRR10397183', 'SRR10397185', 'SRR10397186', 'SRR10397187', 'SRR10397188', 'SRR10397189', 'SRR10397190', 'SRR10397192', 'SRR10397193', 'SRR10397194', 'SRR10397197', 'SRR10397198', 'SRR10397199', 'SRR10397200', 'SRR10397204', 'SRR10397209', 'SRR10397211', 'SRR10397212', 'SRR10397213', 'SRR10397214', 'SRR10397215', 'SRR10397217', 'SRR10397218', 'SRR10397219', 'SRR10397220', 'SRR10397221', 'SRR10397222', 'SRR10397223', 'SRR10397224', 'SRR10397225', 'SRR10397226', 'SRR10397227', 'SRR10397228', 'SRR10397230', 'SRR10397231', 'SRR10397232', 'SRR10397233', 'SRR10397234', 'SRR10397235', 'SRR10397236', 'SRR10397237', 'SRR10397239', 'SRR10397240', 'SRR10397241', 'SRR10397242', 'SRR10397243', 'SRR10397244', 'SRR10397245', 'SRR10397249', 'SRR10397252', 'SRR10397253', 'SRR10397256', 'SRR10397258', 'SRR10397260', 'SRR10397262', 'SRR10397264', 'SRR10397265', 'SRR10397266', 'SRR10397267', 'SRR10397268', 'SRR10397270', 'SRR10397273', 'SRR10525317', 'SRR10525318', 'SRR10525319', 'SRR10525320', 'SRR10525321', 'SRR10525322', 'SRR10525324', 'SRR10525325', 'SRR10525326', 'SRR10525327', 'SRR10525328', 'SRR10525329', 'SRR10525330', 'SRR10525331', 'SRR10525332', 'SRR10525333', 'SRR10525334', 'SRR10525335', 'SRR10525336', 'SRR10525337', 'SRR10525338', 'SRR10525339', 'SRR10525340', 'SRR10525341', 'SRR10525342', 'SRR10525348', 'SRR10525349', 'SRR10525350', 'SRR10525351', 'SRR10525352', 'SRR10525353', 'SRR10525354', 'SRR10525355', 'SRR10525356', 'SRR10525357', 'SRR10525358', 'SRR10525359', 'SRR10525360', 'SRR10525361', 'SRR10525362', 'SRR10525363', 'SRR10525364', 'SRR10525365', 'SRR10525366', 'SRR10525367', 'SRR10525368', 'SRR10525369', 'SRR10525370', 'SRR10525371', 'SRR10525372', 'SRR10525373', 'SRR10525374', 'SRR10525375', 'SRR11033592', 'SRR11033593', 'SRR11033597', 
'SRR11033600', 'SRR11033609', 'SRR11033611', 'SRR11033620', 'SRR11033625', 'SRR11033634', 'SRR11033637', 'SRR11033646', 'SRR11033649', 'SRR11033658', 'SRR11033668', 'SRR11033679', 'SRR11033682', 'SRR11033684', 'SRR11033685', 'SRR11033689', 'SRR11033690', 'SRR11033691', 'SRR11033700', 'SRR11033702', 'SRR11033712', 'SRR11033714', 'SRR11033715', 'SRR11033716', 'SRR11033717', 'SRR11033718', 'SRR11033719', 'SRR11033721', 'SRR11033724', 'SRR11033726', 'SRR11033728', 'SRR11033729', 'SRR11033730', 'SRR11033734', 'SRR11033736', 'SRR11033738', 'SRR11033740', 'SRR11033741', 'SRR11033743', 'SRR11033751', 'SRR11033756', 'SRR11033757', 'SRR11033760', 'SRR11033761', 'SRR11033762', 'SRR11033767', 'SRR1158874', 'SRR1158890', 'SRR1158898', 'SRR1158907', 'SRR1158923', 'SRR1158929', 'SRR1158943', 'SRR1158946', 'SRR1158950', 'SRR1158990', 'SRR1159002', 'SRR1159003', 'SRR1159005', 'SRR1159006', 'SRR1159029', 'SRR1159034', 'SRR1159044', 'SRR1159053', 'SRR1159075', 'SRR1159076', 'SRR1159083', 'SRR1159108', 'SRR1159121', 'SRR1159122', 'SRR1159126', 'SRR1159129', 'SRR1159150', 'SRR1159154', 'SRR1159167', 'SRR1159175', 'SRR1159180', 'SRR1159204', 'SRR1159245', 'SRR1159262', 'SRR1159279', 'SRR1159283', 'SRR1159290', 'SRR1159293', 'SRR1159298', 'SRR1159299', 'SRR1159303', 'SRR1159308', 'SRR1159309', 'SRR1159338', 'SRR1159349', 'SRR1159350', 'SRR1159351', 'SRR1159352', 'SRR1159362', 'SRR1159369', 'SRR1159370', 'SRR1159377', 'SRR1159389', 'SRR1159393', 'SRR1159521', 'SRR1159526', 'SRR1159661', 'SRR1159700', 'SRR1159729', 'SRR1159738', 'SRR1159747', 'SRR1159760', 'SRR1159790', 'SRR1159810', 'SRR1159818', 'SRR1159861', 'SRR1159954', 'SRR1159959', 'SRR1159977', 'SRR1159986', 'SRR1162483', 'SRR1162485', 'SRR1162491', 'SRR1162494', 'SRR1162498', 'SRR1162502', 'SRR1162504', 'SRR1162505', 'SRR1162509', 'SRR1162518', 'SRR1162529', 'SRR1162531', 'SRR1162532', 'SRR1162533', 'SRR1162534', 'SRR1162535', 'SRR1162537', 'SRR1162539', 'SRR1162541', 'SRR1162542', 'SRR1162947', 'SRR1162952', 'SRR1162953', 
'SRR1162961', 'SRR1162962', 'SRR1162966', 'SRR1162971', 'SRR1162972', 'SRR1162980', 'SRR1162995', 'SRR1162996', 'SRR1162997', 'SRR1163001', 'SRR1163002', 'SRR1163021', 'SRR1163022', 'SRR1163023', 'SRR1163025', 'SRR1163037', 'SRR1163050', 'SRR1163073', 'SRR1163079', 'SRR1163080', 'SRR1163081', 'SRR1163101', 'SRR1163103', 'SRR1163104', 'SRR1163106', 'SRR1163114', 'SRR1163115', 'SRR1163121', 'SRR1163126', 'SRR1163135', 'SRR1163139', 'SRR1163140', 'SRR1163145', 'SRR1163166', 'SRR1163172', 'SRR1163173', 'SRR1163177', 'SRR1163178', 'SRR1163185', 'SRR1163189', 'SRR1163194', 'SRR1163198', 'SRR1163199', 'SRR1163202', 'SRR1163203', 'SRR1163204', 'SRR1163211', 'SRR1163286', 'SRR1163290', 'SRR1163291', 'SRR1163296', 'SRR1163298', 'SRR1163299', 'SRR1163303', 'SRR1163309', 'SRR1163310', 'SRR1163313', 'SRR1163314', 'SRR1163315', 'SRR1163317', 'SRR1163318', 'SRR1163319', 'SRR1163325', 'SRR1163326', 'SRR1163329', 'SRR1163330', 'SRR1163332', 'SRR1163336', 'SRR1163338', 'SRR1163341', 'SRR1163348', 'SRR1163349', 'SRR1163354', 'SRR1163363', 'SRR1163369', 'SRR1163373', 'SRR1163374', 'SRR1163376', 'SRR1163380', 'SRR1163385', 'SRR1163387', 'SRR1163392', 'SRR1163397', 'SRR1163399', 'SRR1163404', 'SRR1163405', 'SRR1163406', 'SRR1163412', 'SRR1163414', 'SRR1163415', 'SRR1163416', 'SRR1163423', 'SRR1163429', 'SRR1163431', 'SRR1163432', 'SRR1163447', 'SRR11638839', 'SRR11648203', 'SRR1166908', 'SRR1166910', 'SRR1166916', 'SRR1166917', 'SRR1166920', 'SRR1168993', 'SRR1172412', 'SRR3510602', 'SRR3544716', 'SRR3544717', 'SRR3544718', 'SRR3544724', 'SRR3544725', 'SRR3544726', 'SRR3544727', 'SRR3544728', 'SRR3544729', 'SRR3544730', 'SRR3544731', 'SRR3544732', 'SRR3544733', 'SRR3544734', 'SRR3544735', 'SRR3544737', 'SRR3544739', 'SRR3544740', 'SRR3544741', 'SRR3544743', 'SRR3544747', 'SRR3544748', 'SRR3544751', 'SRR3743200', 'SRR3743202', 'SRR3743203', 'SRR3743368', 'SRR3743370', 'SRR3743373', 'SRR3743375', 'SRR3743376', 'SRR3743377', 'SRR3743378', 'SRR3743379', 'SRR3743380', 'SRR3743381', 
'SRR3743382', 'SRR3743383', 'SRR3743384', 'SRR3743385', 'SRR3743386', 'SRR3743388', 'SRR3743389', 'SRR3743390', 'SRR3743391', 'SRR3743392', 'SRR3743393', 'SRR3743394', 'SRR3743396', 'SRR3743397', 'SRR3743398', 'SRR3743399', 'SRR3743400', 'SRR3743402', 'SRR3743403', 'SRR3743404', 'SRR3743405', 'SRR3743406', 'SRR3743407', 'SRR3743408', 'SRR3743409', 'SRR3743410', ...]
# Collect the index labels (sample IDs) of every genome labelled "Sensitive".
is_sensitive = binarized_resistance_status_df['Resistance_Status'] == "Sensitive"
sensitive_genomes = binarized_resistance_status_df.index[is_sensitive.values].tolist()
sensitive_genomes
['ERR027460', 'ERR027461', 'ERR027462', 'ERR027464', 'ERR027466', 'ERR027469', 'ERR067579', 'ERR067580', 'ERR067585', 'ERR067591', 'ERR067594', 'ERR067595', 'ERR067597', 'ERR067601', 'ERR067604', 'ERR067606', 'ERR067607', 'ERR067614', 'ERR067616', 'ERR067617', 'ERR067624', 'ERR067632', 'ERR067633', 'ERR067634', 'ERR067636', 'ERR067637', 'ERR067640', 'ERR067646', 'ERR067648', 'ERR067655', 'ERR067664', 'ERR067676', 'ERR067680', 'ERR067682', 'ERR067684', 'ERR067686', 'ERR067688', 'ERR067689', 'ERR067690', 'ERR067693', 'ERR067699', 'ERR067700', 'ERR067702', 'ERR067706', 'ERR067708', 'ERR067713', 'ERR067716', 'ERR067719', 'ERR067726', 'ERR067730', 'ERR067737', 'ERR067743', 'ERR067753', 'ERR067757', 'ERR3335783', 'ERR688008', 'ERR688009', 'ERR688010', 'ERR688011', 'ERR688012', 'SRR10379878', 'SRR10379880', 'SRR10379881', 'SRR10379884', 'SRR10379885', 'SRR10379888', 'SRR10379891', 'SRR10379892', 'SRR10379893', 'SRR10379897', 'SRR10379900', 'SRR10379901', 'SRR10379902', 'SRR10379904', 'SRR10379907', 'SRR10379911', 'SRR10379915', 'SRR10379920', 'SRR10379923', 'SRR10379924', 'SRR10379925', 'SRR10379926', 'SRR10379930', 'SRR10379935', 'SRR10379937', 'SRR10379938', 'SRR10379946', 'SRR10379947', 'SRR10379951', 'SRR10379952', 'SRR10379954', 'SRR10379958', 'SRR10379959', 'SRR10379960', 'SRR10379961', 'SRR10379962', 'SRR10379968', 'SRR10379970', 'SRR10379975', 'SRR10379980', 'SRR10379981', 'SRR10379988', 'SRR10379991', 'SRR10379992', 'SRR10379993', 'SRR10379994', 'SRR10379996', 'SRR10379999', 'SRR10380000', 'SRR10380001', 'SRR10380006', 'SRR10380016', 'SRR10380025', 'SRR10380026', 'SRR10380027', 'SRR10380028', 'SRR10380030', 'SRR10380044', 'SRR10380049', 'SRR10380058', 'SRR10380059', 'SRR10380065', 'SRR10380067', 'SRR10380069', 'SRR10380075', 'SRR10380090', 'SRR10380091', 'SRR10380100', 'SRR10380108', 'SRR10380112', 'SRR10380114', 'SRR10380115', 'SRR10380116', 'SRR10380119', 'SRR10380126', 'SRR10380127', 'SRR10380128', 'SRR10380133', 'SRR10380134', 'SRR10380140', 'SRR10380141', 
'SRR10380151', 'SRR10380152', 'SRR10380153', 'SRR10380154', 'SRR10380156', 'SRR10380157', 'SRR10380158', 'SRR10380162', 'SRR10380163', 'SRR10380169', 'SRR10380172', 'SRR10380174', 'SRR10380175', 'SRR10380177', 'SRR10380179', 'SRR10380180', 'SRR10380181', 'SRR10380182', 'SRR10380183', 'SRR10380185', 'SRR10380188', 'SRR10380190', 'SRR10380191', 'SRR10380192', 'SRR10380193', 'SRR10380195', 'SRR10380198', 'SRR10380200', 'SRR10380202', 'SRR10380203', 'SRR10380205', 'SRR10380207', 'SRR10380208', 'SRR10380211', 'SRR10380212', 'SRR10380219', 'SRR10380220', 'SRR10380221', 'SRR10380223', 'SRR10380224', 'SRR10380225', 'SRR10380227', 'SRR10380228', 'SRR10380229', 'SRR10380230', 'SRR10380232', 'SRR10380233', 'SRR10380235', 'SRR10380236', 'SRR10380237', 'SRR10380239', 'SRR10380244', 'SRR10380251', 'SRR10380252', 'SRR10397127', 'SRR10397146', 'SRR10397147', 'SRR10397148', 'SRR10397163', 'SRR10397169', 'SRR10397171', 'SRR10397172', 'SRR10397191', 'SRR10397195', 'SRR10397196', 'SRR10397202', 'SRR10397205', 'SRR10397208', 'SRR10397216', 'SRR10397229', 'SRR10397246', 'SRR10397247', 'SRR10397248', 'SRR10397250', 'SRR10397251', 'SRR10397254', 'SRR10397255', 'SRR10397257', 'SRR10397259', 'SRR10397261', 'SRR10397263', 'SRR10397269', 'SRR10397271', 'SRR10525323', 'SRR10525343', 'SRR10525344', 'SRR10525345', 'SRR10525346', 'SRR10525347', 'SRR11033589', 'SRR11033590', 'SRR11033594', 'SRR11033595', 'SRR11033596', 'SRR11033599', 'SRR11033601', 'SRR11033602', 'SRR11033603', 'SRR11033604', 'SRR11033605', 'SRR11033606', 'SRR11033607', 'SRR11033608', 'SRR11033610', 'SRR11033612', 'SRR11033613', 'SRR11033615', 'SRR11033616', 'SRR11033617', 'SRR11033618', 'SRR11033619', 'SRR11033621', 'SRR11033622', 'SRR11033623', 'SRR11033624', 'SRR11033626', 'SRR11033627', 'SRR11033628', 'SRR11033629', 'SRR11033630', 'SRR11033631', 'SRR11033632', 'SRR11033633', 'SRR11033635', 'SRR11033636', 'SRR11033638', 'SRR11033640', 'SRR11033642', 'SRR11033643', 'SRR11033645', 'SRR11033647', 'SRR11033648', 'SRR11033650', 
'SRR11033652', 'SRR11033653', 'SRR11033654', 'SRR11033656', 'SRR11033657', 'SRR11033659', 'SRR11033660', 'SRR11033661', 'SRR11033664', 'SRR11033665', 'SRR11033666', 'SRR11033667', 'SRR11033669', 'SRR11033670', 'SRR11033671', 'SRR11033672', 'SRR11033675', 'SRR11033676', 'SRR11033680', 'SRR11033681', 'SRR11033683', 'SRR11033687', 'SRR11033688', 'SRR11033694', 'SRR11033696', 'SRR11033697', 'SRR11033698', 'SRR11033699', 'SRR11033701', 'SRR11033703', 'SRR11033705', 'SRR11033706', 'SRR11033707', 'SRR11033708', 'SRR11033709', 'SRR11033710', 'SRR11033711', 'SRR11033713', 'SRR11033720', 'SRR11033722', 'SRR11033723', 'SRR11033725', 'SRR11033733', 'SRR11033735', 'SRR11033744', 'SRR11033746', 'SRR11033747', 'SRR11033748', 'SRR11033749', 'SRR11033750', 'SRR11033752', 'SRR11033753', 'SRR11033754', 'SRR11033755', 'SRR11033758', 'SRR11033759', 'SRR11033763', 'SRR11033764', 'SRR11033765', 'SRR11033766', 'SRR11033768', 'SRR11033769', 'SRR11033770', 'SRR11033771', 'SRR11033772', 'SRR11033773', 'SRR11033774', 'SRR11033776', 'SRR11033777', 'SRR11033778', 'SRR11033779', 'SRR11033780', 'SRR1158931', 'SRR1158939', 'SRR1158998', 'SRR1159038', 'SRR1159052', 'SRR1159171', 'SRR1159237', 'SRR1159261', 'SRR1159301', 'SRR1159310', 'SRR1159360', 'SRR1159366', 'SRR1159682', 'SRR1159687', 'SRR1159713', 'SRR1159723', 'SRR1159895', 'SRR1159904', 'SRR1159935', 'SRR1159952', 'SRR1162495', 'SRR1162513', 'SRR1162521', 'SRR1162536', 'SRR1162957', 'SRR1162967', 'SRR1163016', 'SRR1163029', 'SRR1163038', 'SRR1163096', 'SRR1163097', 'SRR1163102', 'SRR1163105', 'SRR1163127', 'SRR1163167', 'SRR1163171', 'SRR1163184', 'SRR1163195', 'SRR1163287', 'SRR1163302', 'SRR1163304', 'SRR1163347', 'SRR1163353', 'SRR1163365', 'SRR1163366', 'SRR1163368', 'SRR1163371', 'SRR1163372', 'SRR1163386', 'SRR1163393', 'SRR1163398', 'SRR1163411', 'SRR1163421', 'SRR1163430', 'SRR1166911', 'SRR1166921', 'SRR1169019', 'SRR1169065', 'SRR1172273', 'SRR1172286', 'SRR3544719', 'SRR3544720', 'SRR3544721', 'SRR3544722', 'SRR3544723', 
'SRR3544736', 'SRR3544738', 'SRR3544742', 'SRR3544744', 'SRR3544745', 'SRR3544746', 'SRR3544749', 'SRR3544750', 'SRR3544752', 'SRR3743189', 'SRR3743199', 'SRR3743201', 'SRR3743371', 'SRR3743374', 'SRR3743387', 'SRR3743395', 'SRR3743411', 'SRR3743413', 'SRR3743414', 'SRR3743436', 'SRR3743457', 'SRR3743458', 'SRR3743463', 'SRR3743478', 'SRR3743482', 'SRR3743483', 'SRR3743485', 'SRR3743493', 'SRR3743495', 'SRR5065201', 'SRR5065202', 'SRR5065205', 'SRR5065207', 'SRR5065208', 'SRR5065210', 'SRR5065211', 'SRR5065212', 'SRR5065220', 'SRR5065223', 'SRR5065224', 'SRR5065225', 'SRR5065226', 'SRR5065227', 'SRR5065229', 'SRR5065232', 'SRR5065233', 'SRR5065237', 'SRR5065239', 'SRR5065242', 'SRR5065243', 'SRR5065244', 'SRR5065246', 'SRR5065251', 'SRR5065252', 'SRR5065254', 'SRR5065255', 'SRR5065256', 'SRR5065262', 'SRR5065264', 'SRR5065266', 'SRR5065272', 'SRR5065273', 'SRR5065275', 'SRR5065278', 'SRR5065283', 'SRR5065284', 'SRR5065285', 'SRR5065286', 'SRR5065290', 'SRR5065294', 'SRR5065295', 'SRR5065296', 'SRR5065297', 'SRR5065298', 'SRR5065301', 'SRR5065302', 'SRR5065304', 'SRR5065305', 'SRR5065306', 'SRR5065307', 'SRR5065309', 'SRR5065310', 'SRR5065312', 'SRR5065317', 'SRR5065318', 'SRR5065319', 'SRR5065320', 'SRR5065321', 'SRR5065323', 'SRR5065327', 'SRR5065330', 'SRR5065331', 'SRR5065333', 'SRR5065334', 'SRR5065337', 'SRR5065338', 'SRR5065340', 'SRR5065344', 'SRR5065346', 'SRR5065349', 'SRR5065350', 'SRR5065352', 'SRR5065353', 'SRR5065354', 'SRR5065355', 'SRR5065359', 'SRR5065360', 'SRR5065363', 'SRR5065364', 'SRR5065365', 'SRR5065367', 'SRR5065368', 'SRR5065369', 'SRR5065372', 'SRR5065374', 'SRR5065375', 'SRR5065378', 'SRR5065379', 'SRR5065382', 'SRR5065383', 'SRR5065384', 'SRR5065387', 'SRR5065388', 'SRR5065395', 'SRR5065396', 'SRR5065398', 'SRR5065399', 'SRR5065401', 'SRR5065402', 'SRR5152915', 'SRR5152928', 'SRR5152929', 'SRR5152938', 'SRR5153032', 'SRR5153073', 'SRR5153076', 'SRR5153083', 'SRR5153084', 'SRR5153090', 'SRR5153131', 'SRR5153132', 'SRR5153266', 
'SRR5153310', 'SRR5153596', 'SRR5153818', 'SRR5153822', 'SRR5153836', 'SRR5153843', 'SRR5153845', 'SRR5153847', 'SRR5153851', 'SRR5153858', 'SRR5153861', 'SRR5153862', 'SRR5153866', 'SRR5153870', 'SRR5153880', 'SRR5153881', 'SRR5153884', 'SRR5153900', 'SRR5153909', 'SRR5153913', 'SRR5153914', 'SRR5153918', 'SRR5153922', 'SRR5153927', 'SRR5486871', 'SRR5486889', 'SRR6356934', 'SRR6356950', 'SRR6356958', 'SRR6356964', 'SRR6356971', 'SRR6356973', 'SRR6356975', 'SRR6356976', 'SRR6356978', 'SRR6356982', 'SRR6356983', 'SRR6356986', 'SRR6356988', 'SRR6356989', 'SRR6356995', 'SRR6458398', 'SRR6458399', 'SRR6458400', 'SRR6458402', 'SRR6458404', 'SRR6458405', 'SRR6458408', 'SRR6458409', 'SRR6458457', 'SRR6458458', 'SRR6458459', 'SRR6458461', 'SRR6458464', 'SRR6807669', 'SRR6807670', 'SRR6807671', 'SRR6807678', 'SRR6807680', 'SRR6807683', 'SRR6807697', 'SRR6807722', 'SRR6807726', 'SRR6807727', 'SRR6807728', 'SRR6807730', 'SRR6807734', 'SRR6807738', 'SRR6807743', 'SRR6807744', 'SRR6807745', 'SRR6807752', 'SRR6807761', 'SRR7516306', 'SRR7516315', 'SRR7516342', 'SRR7516345', 'SRR7516348', 'SRR7516349', 'SRR7516350', 'SRR7516351', 'SRR7516354', 'SRR7516356', 'SRR7516357', 'SRR7516360', 'SRR7516361', 'SRR7516362', 'SRR7516363', 'SRR7516364', 'SRR7516365', 'SRR7516366', 'SRR7516367', 'SRR7516379', 'SRR7516381', 'SRR7516382', 'SRR7516390', 'SRR7516397', 'SRR7516408', 'SRR7516409', 'SRR7516411', 'SRR7516412', 'SRR7516415', 'SRR7516417', 'SRR7516420', 'SRR7516421', 'SRR7516430', 'SRR7516431', 'SRR7516432', 'SRR7516433', 'SRR7516436', 'SRR7516440', 'SRR7516444', 'SRR7516445', 'SRR7516446', 'SRR7516447', 'SRR7516449', 'SRR7516453', 'SRR7516456', 'SRR7516457', 'SRR7516460', 'SRR7654034', 'SRR7654036', 'SRR7655467', 'SRR7655613', 'SRR7655615', 'SRR7655617', 'SRR7655618', 'SRR8552931', 'SRR8552932', 'SRR8552933', 'SRR9738475', 'SRR9738478', 'SRR9738501']
# Load the genome IDs of every cohort used in the analysis:
# - TB-portals
# - HTBC
# The JSON file maps a cohort key (e.g. 'tb_portals') to a list of
# sequencing-run accession IDs.
import json

# Pin the encoding: without it open() falls back to the platform locale,
# which is not guaranteed to be UTF-8 (the encoding JSON files use).
with open(data_location + "test_train_genome_ids.json", encoding="utf-8") as f:
    all_genomes_ids_dict = json.load(f)

# Bare expression so the notebook displays the loaded mapping.
all_genomes_ids_dict
{'tb_portals': ['SRR10379876', 'SRR10379877', 'SRR10379878', 'SRR10379879', 'SRR10379880', 'SRR10379881', 'SRR10379882', 'SRR10379883', 'SRR10379884', 'SRR10379885', 'SRR10379886', 'SRR10379887', 'SRR10379888', 'SRR10379889', 'SRR10379890', 'SRR10379891', 'SRR10379892', 'SRR10379893', 'SRR10379894', 'SRR10379895', 'SRR10379896', 'SRR10379897', 'SRR10379898', 'SRR10379899', 'SRR10379900', 'SRR10379901', 'SRR10379902', 'SRR10379903', 'SRR10379904', 'SRR10379905', 'SRR10379906', 'SRR10379907', 'SRR10379908', 'SRR10379909', 'SRR10379910', 'SRR10379911', 'SRR10379912', 'SRR10379913', 'SRR10379914', 'SRR10379915', 'SRR10379916', 'SRR10379917', 'SRR10379918', 'SRR10379919', 'SRR10379920', 'SRR10379921', 'SRR10379922', 'SRR10379923', 'SRR10379924', 'SRR10379925', 'SRR10379926', 'SRR10379927', 'SRR10379928', 'SRR10379929', 'SRR10379930', 'SRR10379931', 'SRR10379932', 'SRR10379933', 'SRR10379934', 'SRR10379935', 'SRR10379936', 'SRR10379937', 'SRR10379938', 'SRR10379939', 'SRR10379940', 'SRR10379941', 'SRR10379942', 'SRR10379943', 'SRR10379944', 'SRR10379945', 'SRR10379946', 'SRR10379947', 'SRR10379948', 'SRR10379949', 'SRR10379950', 'SRR10379951', 'SRR10379952', 'SRR10379953', 'SRR10379954', 'SRR10379955', 'SRR10379956', 'SRR10379957', 'SRR10379958', 'SRR10379959', 'SRR10379960', 'SRR10379961', 'SRR10379962', 'SRR10379963', 'SRR10379964', 'SRR10379965', 'SRR10379966', 'SRR10379967', 'SRR10379968', 'SRR10379969', 'SRR10379970', 'SRR10379971', 'SRR10379972', 'SRR10379973', 'SRR10379974', 'SRR10379975', 'SRR10379976', 'SRR10379977', 'SRR10379978', 'SRR10379979', 'SRR10379980', 'SRR10379981', 'SRR10379982', 'SRR10379983', 'SRR10379984', 'SRR10379985', 'SRR10379986', 'SRR10379987', 'SRR10379988', 'SRR10379989', 'SRR10379990', 'SRR10379991', 'SRR10379992', 'SRR10379993', 'SRR10379994', 'SRR10379995', 'SRR10379996', 'SRR10379997', 'SRR10379998', 'SRR10379999', 'SRR10380000', 'SRR10380001', 'SRR10380002', 'SRR10380003', 'SRR10380004', 'SRR10380005', 'SRR10380006', 'SRR10380007', 
'SRR10380008', 'SRR10380009', 'SRR10380010', 'SRR10380011', 'SRR10380012', 'SRR10380013', 'SRR10380014', 'SRR10380015', 'SRR10380016', 'SRR10380017', 'SRR10380018', 'SRR10380019', 'SRR10380020', 'SRR10380021', 'SRR10380022', 'SRR10380023', 'SRR10380024', 'SRR10380025', 'SRR10380026', 'SRR10380027', 'SRR10380028', 'SRR10380029', 'SRR10380030', 'SRR10380031', 'SRR10380032', 'SRR10380033', 'SRR10380034', 'SRR10380035', 'SRR10380036', 'SRR10380037', 'SRR10380038', 'SRR10380039', 'SRR10380040', 'SRR10380041', 'SRR10380042', 'SRR10380043', 'SRR10380044', 'SRR10380045', 'SRR10380046', 'SRR10380047', 'SRR10380048', 'SRR10380049', 'SRR10380050', 'SRR10380051', 'SRR10380052', 'SRR10380053', 'SRR10380054', 'SRR10380055', 'SRR10380056', 'SRR10380057', 'SRR10380058', 'SRR10380059', 'SRR10380060', 'SRR10380061', 'SRR10380062', 'SRR10380063', 'SRR10380064', 'SRR10380065', 'SRR10380066', 'SRR10380067', 'SRR10380068', 'SRR10380069', 'SRR10380070', 'SRR10380071', 'SRR10380072', 'SRR10380073', 'SRR10380074', 'SRR10380075', 'SRR10380076', 'SRR10380077', 'SRR10380078', 'SRR10380079', 'SRR10380080', 'SRR10380081', 'SRR10380082', 'SRR10380083', 'SRR10380084', 'SRR10380085', 'SRR10380086', 'SRR10380087', 'SRR10380089', 'SRR10380090', 'SRR10380091', 'SRR10380092', 'SRR10380093', 'SRR10380094', 'SRR10380095', 'SRR10380096', 'SRR10380097', 'SRR10380098', 'SRR10380099', 'SRR10380100', 'SRR10380101', 'SRR10380102', 'SRR10380103', 'SRR10380104', 'SRR10380105', 'SRR10380106', 'SRR10380107', 'SRR10380108', 'SRR10380109', 'SRR10380110', 'SRR10380111', 'SRR10380112', 'SRR10380113', 'SRR10380114', 'SRR10380115', 'SRR10380116', 'SRR10380117', 'SRR10380118', 'SRR10380119', 'SRR10380120', 'SRR10380121', 'SRR10380122', 'SRR10380123', 'SRR10380124', 'SRR10380125', 'SRR10380126', 'SRR10380127', 'SRR10380128', 'SRR10380129', 'SRR10380130', 'SRR10380131', 'SRR10380132', 'SRR10380133', 'SRR10380134', 'SRR10380135', 'SRR10380136', 'SRR10380137', 'SRR10380138', 'SRR10380139', 'SRR10380140', 'SRR10380141', 
'SRR10380142', 'SRR10380143', 'SRR10380144', 'SRR10380145', 'SRR10380146', 'SRR10380147', 'SRR10380148', 'SRR10380149', 'SRR10380150', 'SRR10380151', 'SRR10380152', 'SRR10380153', 'SRR10380154', 'SRR10380155', 'SRR10380156', 'SRR10380157', 'SRR10380158', 'SRR10380159', 'SRR10380160', 'SRR10380161', 'SRR10380162', 'SRR10380163', 'SRR10380164', 'SRR10380165', 'SRR10380166', 'SRR10380167', 'SRR10380168', 'SRR10380169', 'SRR10380170', 'SRR10380171', 'SRR10380172', 'SRR10380173', 'SRR10380174', 'SRR10380175', 'SRR10380176', 'SRR10380177', 'SRR10380178', 'SRR10380179', 'SRR10380180', 'SRR10380181', 'SRR10380182', 'SRR10380183', 'SRR10380184', 'SRR10380185', 'SRR10380186', 'SRR10380187', 'SRR10380188', 'SRR10380189', 'SRR10380190', 'SRR10380191', 'SRR10380192', 'SRR10380193', 'SRR10380194', 'SRR10380195', 'SRR10380196', 'SRR10380197', 'SRR10380198', 'SRR10380199', 'SRR10380200', 'SRR10380201', 'SRR10380202', 'SRR10380203', 'SRR10380204', 'SRR10380205', 'SRR10380206', 'SRR10380207', 'SRR10380208', 'SRR10380209', 'SRR10380210', 'SRR10380211', 'SRR10380212', 'SRR10380213', 'SRR10380214', 'SRR10380215', 'SRR10380216', 'SRR10380217', 'SRR10380218', 'SRR10380219', 'SRR10380220', 'SRR10380221', 'SRR10380222', 'SRR10380223', 'SRR10380224', 'SRR10380225', 'SRR10380226', 'SRR10380227', 'SRR10380228', 'SRR10380229', 'SRR10380230', 'SRR10380231', 'SRR10380232', 'SRR10380233', 'SRR10380234', 'SRR10380235', 'SRR10380236', 'SRR10380237', 'SRR10380238', 'SRR10380239', 'SRR10380240', 'SRR10380241', 'SRR10380242', 'SRR10380243', 'SRR10380244', 'SRR10380245', 'SRR10380246', 'SRR10380247', 'SRR10380248', 'SRR10380249', 'SRR10380250', 'SRR10380251', 'SRR10380252', 'SRR10380253', 'SRR10397092', 'SRR10397093', 'SRR10397094', 'SRR10397095', 'SRR10397096', 'SRR10397097', 'SRR10397098', 'SRR10397099', 'SRR10397100', 'SRR10397101', 'SRR10397102', 'SRR10397103', 'SRR10397104', 'SRR10397105', 'SRR10397106', 'SRR10397107', 'SRR10397108', 'SRR10397109', 'SRR10397110', 'SRR10397111', 'SRR10397112', 
'SRR10397114', 'SRR10397115', 'SRR10397116', 'SRR10397117', 'SRR10397118', 'SRR10397119', 'SRR10397120', 'SRR10397121', 'SRR10397122', 'SRR10397123', 'SRR10397124', 'SRR10397125', 'SRR10397126', 'SRR10397127', 'SRR10397128', 'SRR10397129', 'SRR10397130', 'SRR10397131', 'SRR10397132', 'SRR10397133', 'SRR10397134', 'SRR10397135', 'SRR10397136', 'SRR10397137', 'SRR10397138', 'SRR10397139', 'SRR10397140', 'SRR10397141', 'SRR10397142', 'SRR10397143', 'SRR10397144', 'SRR10397145', 'SRR10397146', 'SRR10397147', 'SRR10397148', 'SRR10397149', 'SRR10397150', 'SRR10397151', 'SRR10397152', 'SRR10397153', 'SRR10397154', 'SRR10397155', 'SRR10397156', 'SRR10397157', 'SRR10397158', 'SRR10397159', 'SRR10397160', 'SRR10397161', 'SRR10397162', 'SRR10397163', 'SRR10397164', 'SRR10397165', 'SRR10397166', 'SRR10397167', 'SRR10397168', 'SRR10397169', 'SRR10397170', 'SRR10397171', 'SRR10397172', 'SRR10397173', 'SRR10397174', 'SRR10397175', 'SRR10397176', 'SRR10397177', 'SRR10397178', 'SRR10397179', 'SRR10397180', 'SRR10397181', 'SRR10397182', 'SRR10397183', 'SRR10397185', 'SRR10397186', 'SRR10397187', 'SRR10397188', 'SRR10397189', 'SRR10397190', 'SRR10397191', 'SRR10397192', 'SRR10397193', 'SRR10397194', 'SRR10397195', 'SRR10397196', 'SRR10397197', 'SRR10397198', 'SRR10397199', 'SRR10397200', 'SRR10397202', 'SRR10397204', 'SRR10397205', 'SRR10397208', 'SRR10397209', 'SRR10397211', 'SRR10397212', 'SRR10397213', 'SRR10397214', 'SRR10397215', 'SRR10397216', 'SRR10397217', 'SRR10397218', 'SRR10397219', 'SRR10397220', 'SRR10397221', 'SRR10397222', 'SRR10397223', 'SRR10397224', 'SRR10397225', 'SRR10397226', 'SRR10397227', 'SRR10397228', 'SRR10397229', 'SRR10397230', 'SRR10397231', 'SRR10397232', 'SRR10397233', 'SRR10397234', 'SRR10397235', 'SRR10397236', 'SRR10397237', 'SRR10397239', 'SRR10397240', 'SRR10397241', 'SRR10397242', 'SRR10397243', 'SRR10397244', 'SRR10397245', 'SRR10397246', 'SRR10397247', 'SRR10397248', 'SRR10397249', 'SRR10397250', 'SRR10397251', 'SRR10397252', 'SRR10397253', 
'SRR10397254', 'SRR10397255', 'SRR10397256', 'SRR10397257', 'SRR10397258', 'SRR10397259', 'SRR10397260', 'SRR10397261', 'SRR10397262', 'SRR10397263', 'SRR10397264', 'SRR10397265', 'SRR10397266', 'SRR10397267', 'SRR10397268', 'SRR10397269', 'SRR10397270', 'SRR10397271', 'SRR10397273', 'SRR10525317', 'SRR10525318', 'SRR10525319', 'SRR10525320', 'SRR10525321', 'SRR10525322', 'SRR10525323', 'SRR10525324', 'SRR10525325', 'SRR10525326', 'SRR10525327', 'SRR10525328', 'SRR10525329', 'SRR10525330', 'SRR10525331', 'SRR10525332', 'SRR10525333', 'SRR10525334', 'SRR10525335', 'SRR10525336', 'SRR10525337', 'SRR10525338', 'SRR10525339', 'SRR10525340', 'SRR10525341', 'SRR10525342', 'SRR10525343', 'SRR10525344', 'SRR10525345', 'SRR10525346', 'SRR10525347', 'SRR10525348', 'SRR10525349', 'SRR10525350', 'SRR10525351', 'SRR10525352', 'SRR10525353', 'SRR10525354', 'SRR10525355', 'SRR10525356', 'SRR10525357', 'SRR10525358', 'SRR10525359', 'SRR10525360', 'SRR10525361', 'SRR10525362', 'SRR10525363', 'SRR10525364', 'SRR10525365', 'SRR10525366', 'SRR10525367', 'SRR10525368', 'SRR10525369', 'SRR10525370', 'SRR10525371', 'SRR10525372', 'SRR10525373', 'SRR10525374', 'SRR10525375', 'SRR11033589', 'SRR11033590', 'SRR11033592', 'SRR11033593', 'SRR11033594', 'SRR11033595', 'SRR11033596', 'SRR11033597', 'SRR11033599', 'SRR11033600', 'SRR11033601', 'SRR11033602', 'SRR11033603', 'SRR11033604', 'SRR11033605', 'SRR11033606', 'SRR11033607', 'SRR11033608', 'SRR11033609', 'SRR11033610', 'SRR11033611', 'SRR11033612', 'SRR11033613', 'SRR11033615', 'SRR11033616', 'SRR11033617', 'SRR11033618', 'SRR11033619', 'SRR11033620', 'SRR11033621', 'SRR11033622', 'SRR11033623', 'SRR11033624', 'SRR11033625', 'SRR11033626', 'SRR11033627', 'SRR11033628', 'SRR11033629', 'SRR11033630', 'SRR11033631', 'SRR11033632', 'SRR11033633', 'SRR11033634', 'SRR11033635', 'SRR11033636', 'SRR11033637', 'SRR11033638', 'SRR11033640', 'SRR11033642', 'SRR11033643', 'SRR11033645', 'SRR11033646', 'SRR11033647', 'SRR11033648', 'SRR11033649', 
'SRR11033650', 'SRR11033652', 'SRR11033653', 'SRR11033654', 'SRR11033656', 'SRR11033657', 'SRR11033658', 'SRR11033659', 'SRR11033660', 'SRR11033661', 'SRR11033664', 'SRR11033665', 'SRR11033666', 'SRR11033667', 'SRR11033668', 'SRR11033669', 'SRR11033670', 'SRR11033671', 'SRR11033672', 'SRR11033675', 'SRR11033676', 'SRR11033679', 'SRR11033680', 'SRR11033681', 'SRR11033682', 'SRR11033683', 'SRR11033684', 'SRR11033685', 'SRR11033687', 'SRR11033688', 'SRR11033689', 'SRR11033690', 'SRR11033691', 'SRR11033694', 'SRR11033696', 'SRR11033697', 'SRR11033698', 'SRR11033699', 'SRR11033700', 'SRR11033701', 'SRR11033702', 'SRR11033703', 'SRR11033705', 'SRR11033706', 'SRR11033707', 'SRR11033708', 'SRR11033709', 'SRR11033710', 'SRR11033711', 'SRR11033712', 'SRR11033713', 'SRR11033714', 'SRR11033715', 'SRR11033716', 'SRR11033717', 'SRR11033718', 'SRR11033719', 'SRR11033720', 'SRR11033721', 'SRR11033722', 'SRR11033723', 'SRR11033724', 'SRR11033725', 'SRR11033726', 'SRR11033728', 'SRR11033729', 'SRR11033730', 'SRR11033733', 'SRR11033734', 'SRR11033735', 'SRR11033736', 'SRR11033738', 'SRR11033740', 'SRR11033741', 'SRR11033743', 'SRR11033744', 'SRR11033746', 'SRR11033747', 'SRR11033748', 'SRR11033749', 'SRR11033750', 'SRR11033751', 'SRR11033752', 'SRR11033753', 'SRR11033754', 'SRR11033755', 'SRR11033756', 'SRR11033757', 'SRR11033758', 'SRR11033759', 'SRR11033760', 'SRR11033761', 'SRR11033762', 'SRR11033763', 'SRR11033764', 'SRR11033765', 'SRR11033766', 'SRR11033767', 'SRR11033768', 'SRR11033769', 'SRR11033770', 'SRR11033771', 'SRR11033772', 'SRR11033773', 'SRR11033774', 'SRR11033776', 'SRR11033777', 'SRR11033778', 'SRR11033779', 'SRR11033780', 'SRR1158874', 'SRR1158890', 'SRR1158898', 'SRR1158907', 'SRR1158923', 'SRR1158929', 'SRR1158931', 'SRR1158939', 'SRR1158943', 'SRR1158946', 'SRR1158950', 'SRR1158990', 'SRR1158998', 'SRR1159002', 'SRR1159003', 'SRR1159005', 'SRR1159006', 'SRR1159029', 'SRR1159034', 'SRR1159038', 'SRR1159044', 'SRR1159052', 'SRR1159053', 'SRR1159075', 'SRR1159076', 
'SRR1159083', 'SRR1159108', 'SRR1159121', 'SRR1159122', 'SRR1159126', 'SRR1159129', 'SRR1159150', 'SRR1159154', 'SRR1159167', 'SRR1159171', 'SRR1159175', 'SRR1159180', 'SRR1159204', 'SRR1159237', 'SRR1159245', 'SRR1159261', 'SRR1159262', 'SRR1159279', 'SRR1159283', 'SRR1159290', 'SRR1159293', 'SRR1159298', 'SRR1159299', 'SRR1159301', 'SRR1159303', 'SRR1159308', 'SRR1159309', 'SRR1159310', 'SRR1159338', 'SRR1159349', 'SRR1159350', 'SRR1159351', 'SRR1159352', 'SRR1159360', 'SRR1159362', 'SRR1159366', 'SRR1159369', 'SRR1159370', 'SRR1159377', 'SRR1159389', 'SRR1159393', 'SRR1159521', 'SRR1159526', 'SRR1159661', 'SRR1159682', 'SRR1159687', 'SRR1159700', 'SRR1159713', 'SRR1159723', 'SRR1159729', 'SRR1159738', 'SRR1159747', 'SRR1159760', 'SRR1159790', 'SRR1159810', 'SRR1159818', 'SRR1159861', 'SRR1159895', 'SRR1159904', 'SRR1159935', 'SRR1159952', 'SRR1159954', 'SRR1159959', 'SRR1159977', 'SRR1159986', 'SRR1162483', 'SRR1162485', 'SRR1162491', 'SRR1162494', 'SRR1162495', 'SRR1162498', 'SRR1162502', 'SRR1162504', 'SRR1162505', 'SRR1162509', 'SRR1162513', 'SRR1162518', 'SRR1162521', 'SRR1162529', 'SRR1162531', 'SRR1162532', 'SRR1162533', 'SRR1162534', 'SRR1162535', 'SRR1162536', 'SRR1162537', 'SRR1162539', 'SRR1162541', 'SRR1162542', 'SRR1162947', 'SRR1162952', 'SRR1162953', 'SRR1162957', 'SRR1162961', 'SRR1162962', 'SRR1162966', 'SRR1162967', 'SRR1162971', 'SRR1162972', 'SRR1162980', 'SRR1162995', 'SRR1162996', 'SRR1162997', 'SRR1163001', 'SRR1163002', 'SRR1163016', 'SRR1163021', 'SRR1163022', 'SRR1163023', 'SRR1163025', 'SRR1163029', 'SRR1163037', 'SRR1163038', 'SRR1163050', 'SRR1163073', 'SRR1163079', 'SRR1163080', 'SRR1163081', 'SRR1163096', 'SRR1163097', 'SRR1163101', 'SRR1163102', 'SRR1163103', 'SRR1163104', 'SRR1163105', 'SRR1163106', 'SRR1163114', 'SRR1163115', 'SRR1163121', 'SRR1163126', 'SRR1163127', 'SRR1163135', 'SRR1163139', 'SRR1163140', 'SRR1163145', 'SRR1163166', 'SRR1163167', 'SRR1163171', 'SRR1163172', 'SRR1163173', 'SRR1163177', 'SRR1163178', 
'SRR1163184', 'SRR1163185', 'SRR1163189', 'SRR1163194', 'SRR1163195', 'SRR1163198', 'SRR1163199', 'SRR1163202', 'SRR1163203', 'SRR1163204', 'SRR1163211', 'SRR1163286', 'SRR1163287', 'SRR1163290', 'SRR1163291', 'SRR1163296', 'SRR1163298', 'SRR1163299', 'SRR1163302', 'SRR1163303', 'SRR1163304', 'SRR1163309', 'SRR1163310', 'SRR1163313', 'SRR1163314', 'SRR1163315', 'SRR1163317', 'SRR1163318', 'SRR1163319', 'SRR1163325', 'SRR1163326', 'SRR1163329', 'SRR1163330', 'SRR1163332', 'SRR1163336', 'SRR1163338', 'SRR1163341', 'SRR1163347', 'SRR1163348', 'SRR1163349', 'SRR1163353', 'SRR1163354', 'SRR1163363', 'SRR1163365', 'SRR1163366', 'SRR1163368', 'SRR1163369', 'SRR1163371', 'SRR1163372', 'SRR1163373', 'SRR1163374', 'SRR1163376', 'SRR1163380', 'SRR1163385', 'SRR1163386', 'SRR1163387', 'SRR1163392', 'SRR1163393', 'SRR1163397', ...], 'htbc': ['SRR5065266', 'ERR3335748', 'SRR5065310', 'SRR5065230', 'ERR067722', 'ERR067597', 'SRR5065319', 'SRR5065360', 'ERR688042', 'SRR5065347', 'SRR5065376', 'SRR5065339', 'SRR5065410', 'SRR5065285', 'ERR3335784', 'ERR067649', 'ERR3335786', 'ERR688025', 'ERR3335797', 'SRR8552930', 'ERR067705', 'SRR5065235', 'SRR5065359', 'ERR067749', 'ERR067717', 'ERR3335802', 'SRR5065232', 'ERR067735', 'ERR3335727', 'ERR3335730', 'ERR3335735', 'ERR047884', 'ERR067589', 'SRR5065251', 'SRR5065214', 'ERR067660', 'SRR5065361', 'ERR067673', 'SRR5065304', 'ERR047881', 'SRR5065323', 'ERR067707', 'ERR067576', 'ERR067613', 'ERR067739', 'ERR067582', 'ERR067631', 'SRR5065288', 'SRR5065406', 'SRR5065368', 'ERR688040', 'SRR5065302', 'ERR067679', 'SRR5065270', 'ERR067695', 'ERR3335742', 'ERR067686', 'SRR5065330', 'SRR8552934', 'ERR3335750', 'ERR067593', 'ERR3335729', 'ERR067602', 'SRR5065336', 'ERR067704', 'ERR067700', 'SRR5065355', 'SRR5065396', 'ERR3335725', 'SRR5065248', 'ERR067689', 'ERR067640', 'SRR5065239', 'SRR5065400', 'ERR3335728', 'SRR11638839', 'ERR067669', 'ERR067591', 'ERR067712', 'ERR3335761', 'ERR067653', 'ERR067710', 'ERR067664', 'SRR5065399', 'ERR067635', 
'SRR5065403', 'ERR047883', 'ERR067620', 'SRR5065295', 'ERR067652', 'ERR027460', 'SRR5065209', 'ERR067621', 'ERR3335770', 'SRR5065263', 'ERR3335801', 'ERR067713', 'ERR3335740', 'ERR067611', 'ERR067723', 'ERR067634', 'SRR5065234', 'SRR5065331', 'ERR108422', 'ERR067678', 'SRR5065316', 'SRR5065205', 'ERR067714', 'ERR3335773', 'ERR3335732', 'ERR3335763', 'ERR3335788', 'SRR5065395', 'SRR8552927', 'ERR3335799', 'ERR067751', 'SRR5065289', 'ERR067618', 'ERR067715', 'ERR067577', 'ERR3335753', 'SRR5065318', 'SRR8552935', 'ERR3335745', 'SRR5065408', 'ERR3335777', 'ERR067609', 'ERR067699', 'ERR067581', 'ERR3335769', 'ERR067583', 'ERR067691', 'ERR027467', 'ERR067650', 'SRR5065268', 'ERR3335746', 'ERR067711', 'SRR5065322', 'SRR5065256', 'ERR3335768', 'SRR5065274', 'SRR5065402', 'SRR5065226', 'SRR5065397', 'SRR5065312', 'SRR5065328', 'ERR067651', 'ERR688036', 'ERR067654', 'SRR5065338', 'ERR067738', 'ERR067726', 'ERR3335734', 'ERR688049', 'ERR067718', 'ERR027458', 'ERR3335736', 'ERR067698', 'ERR3335776', 'ERR067709', 'ERR067600', 'ERR067666', 'ERR067627', 'SRR5065278', 'SRR5065249', 'SRR5065293', 'SRR5065211', 'SRR5065233', 'ERR688023', 'ERR067737', 'ERR688046', 'SRR5065254', 'ERR3335739', 'SRR5065271', 'ERR067612', 'ERR688016', 'ERR067585', 'SRR5065321', 'ERR067662', 'ERR067675', 'SRR5065388', 'ERR047891', 'ERR027462', 'ERR067624', 'ERR688026', 'SRR5065387', 'ERR067594', 'SRR5065379', 'SRR5065296', 'ERR067655', 'SRR5065255', 'SRR5065279', 'ERR3335775', 'SRR5065378', 'SRR5065294', 'ERR3335752', 'SRR5065236', 'ERR3335779', 'SRR5065313', 'ERR027465', 'SRR8552932', 'ERR067646', 'SRR5065357', 'SRR5065286', 'ERR067694', 'ERR688020', 'ERR3335743', 'ERR027466', 'ERR067607', 'ERR067584', 'SRR5065374', 'SRR5065325', 'ERR067616', 'ERR067648', 'SRR5065210', 'ERR3335782', 'ERR067736', 'SRR5065375', 'SRR5065314', 'ERR067667', 'ERR688024', 'SRR5065203', 'ERR688037', 'ERR067630', 'SRR5065262', 'ERR027464', 'ERR067639', 'ERR067599', 'SRR5065292', 'SRR5065206', 'SRR5065208', 'SRR5065277', 
'SRR5065238', 'ERR067703', 'ERR067636', 'ERR047890', 'ERR3335765', 'ERR067625', 'SRR5065243', 'SRR5065267', 'ERR067647', 'ERR688030', 'ERR067578', 'SRR5065202', 'SRR5065305', 'ERR067637', 'ERR067604', 'ERR688009', 'SRR5065257', 'ERR3335749', 'SRR5065231', 'ERR3335764', 'SRR5065383', 'ERR067615', 'SRR5065281', 'ERR067580', 'SRR5065227', 'SRR5065229', 'ERR3335738', 'ERR3335793', 'SRR5065401', 'SRR8552929', 'ERR067606', 'SRR5065261', 'SRR5065353', 'ERR3335792', 'ERR688048', 'ERR688008', 'ERR3335766', 'ERR067643', 'ERR027461', 'ERR027468', 'SRR5065363', 'ERR067614', 'SRR5065380', 'ERR688035', 'SRR5065381', 'SRR5065253', 'ERR688043', 'SRR5065404', 'ERR067677', 'SRR5065354', 'SRR5065259', 'SRR5065350', 'ERR067645', 'SRR5065348', 'SRR5065366', 'SRR5065213', 'SRR5065212', 'SRR5065352', 'ERR3335758', 'ERR047885', 'SRR5065260', 'ERR688044', 'ERR067661', 'SRR5065333', 'SRR5065282', 'SRR5065334', 'SRR5065269', 'SRR5065405', 'SRR5065358', 'ERR3335771', 'ERR3335781', 'SRR5065207', 'ERR047882', 'ERR067690', 'SRR5065219', 'SRR5065265', 'ERR3335733', 'ERR067688', 'ERR688045', 'SRR5065382', 'ERR688013', 'SRR5065344', 'SRR5065284', 'ERR3335800', 'ERR067657', 'ERR688039', 'ERR067684', 'ERR3335795', 'ERR3335772', 'ERR688034', 'ERR067693', 'SRR5065364', 'ERR047889', 'ERR067730', 'SRR5065370', 'ERR067765', 'ERR067610', 'ERR3335723', 'ERR067587', 'SRR5065320', 'ERR067588', 'SRR5065301', 'ERR067748', 'ERR688038', 'ERR027463', 'ERR3335790', 'SRR5065290', 'SRR5065225', 'ERR067622', 'SRR5065309', 'SRR5065356', 'ERR3335724', 'ERR3335796', 'ERR067656', 'ERR688011', 'SRR5065349', 'ERR3335757', 'ERR067674', 'ERR067586', 'ERR067696', 'ERR067644', 'SRR5065342', 'ERR067619', 'SRR5065326', 'SRR5065346', 'ERR067755', 'ERR067719', 'ERR3335783', 'ERR688029', 'ERR047887', 'ERR067680', 'ERR3335744', 'ERR067598', 'SRR5065241', 'SRR5065201', 'ERR3335794', 'ERR067670', 'SRR5065280', 'ERR067605', 'SRR5065317', 'SRR5065264', 'ERR688021', 'ERR3335747', 'ERR688017', 'ERR688018', 'SRR5065298', 'ERR067702', 
'ERR3335754', 'SRR8552933', 'ERR047880', 'ERR067665', 'ERR3335798', 'ERR3335737', 'SRR11648203', 'ERR688010', 'ERR688041', 'ERR067676', 'ERR067682', 'ERR688022', 'SRR5065275', 'ERR3335787', 'ERR067638', 'SRR5065327', 'ERR067601', 'ERR067671', 'ERR3335780', 'ERR688032', 'ERR067592', 'SRR5065384', 'ERR3335726', 'SRR5065237', 'ERR688015', 'ERR067629', 'ERR688012', 'SRR5065343', 'ERR688027', 'ERR067687', 'ERR067706', 'ERR067595', 'ERR067683', 'ERR3335803', 'SRR5065306', 'ERR067579', 'ERR067716', 'ERR3335767', 'ERR3335755', 'SRR5065291', 'ERR027459', 'ERR067658', 'SRR5065258', 'SRR5065371', 'ERR067633', 'SRR5065409', 'ERR047886', 'ERR067757', 'SRR8552931', 'SRR5065337', 'ERR3335759', 'ERR067663', 'SRR5065287', 'SRR5065250', 'ERR688033', 'ERR3335785', 'ERR067628', 'ERR3335774', 'SRR5065242', 'SRR5065411', 'SRR5065340', 'ERR067672', 'SRR5065369', 'SRR5065223', 'ERR067617', 'SRR5065398', 'ERR067590', 'SRR5065244', 'SRR5065245', 'ERR067701', 'ERR067641', 'SRR5065273', 'ERR067760', 'ERR3335751', 'ERR067668', 'ERR688014', 'ERR3335756', 'ERR067708', 'SRR5065385', 'ERR067697', 'SRR5065272', 'SRR5065252', 'SRR5065377', 'ERR3335741', 'ERR067762', 'ERR688028', 'SRR5065276', 'ERR067623', 'ERR067603', 'ERR3335762', 'ERR067608', 'ERR688019', 'SRR5065218', 'ERR067754', 'SRR5065224', 'ERR067743', 'ERR067626', 'ERR067685', 'SRR5065365', 'ERR067659', 'ERR067642', 'ERR067632', 'ERR688031', 'ERR3335791', 'SRR5065247', 'ERR067753', 'SRR5065220', 'SRR5065372', 'ERR108421', 'ERR3335789', 'ERR3335778', 'ERR027469', 'SRR5065246', 'SRR5065307', 'ERR688047', 'ERR3335731', 'SRR5065283', 'ERR067692', 'ERR047888', 'ERR067596', 'SRR5065367', 'SRR5065297']}
# Genome IDs from the tb-portals dataset
# (the list stored under the 'tb_portals' key of the loaded JSON mapping)
tbportals_genomes = all_genomes_ids_dict['tb_portals']
# Bare expression so the notebook echoes how many IDs the list holds
# (duplicates, if any, are counted at this point).
len(tbportals_genomes)
2153
# Convert the tb-portals ID list into a set: repeated IDs collapse to one
# entry and the original list order is not preserved.
tbportals_genomes_set = set(tbportals_genomes)
# Bare expression so the notebook echoes the resulting set.
tbportals_genomes_set
{'SRR11033621', 'SRR5153826', 'SRR5153609', 'SRR10397123', 'SRR10397226', 'SRR11033694', 'SRR6807744', 'SRR5153929', 'SRR7592352', 'SRR1159986', 'SRR11033741', 'SRR10380225', 'SRR6384962', 'SRR7592389', 'SRR10379973', 'SRR10397114', 'SRR5153321', 'SRR7657753', 'SRR1163173', 'SRR5153842', 'SRR10380168', 'SRR7592382', 'SRR5153074', 'SRR3743386', 'SRR7516288', 'SRR7592373', 'SRR9738541', 'SRR6807739', 'SRR11033729', 'SRR10380203', 'SRR11033665', 'SRR1163365', 'SRR6458443', 'SRR6807705', 'SRR6356994', 'SRR10379986', 'SRR5153614', 'SRR10525332', 'SRR10379932', 'SRR5486889', 'SRR10397105', 'SRR1159310', 'SRR6356975', 'SRR3544741', 'SRR7592348', 'SRR7516377', 'SRR10380082', 'SRR6458449', 'SRR11033779', 'SRR1163392', 'SRR10379880', 'SRR11033762', 'SRR10380061', 'SRR5153076', 'SRR7516430', 'SRR6807715', 'SRR10380019', 'SRR5153253', 'SRR3743483', 'SRR10379897', 'SRR10397270', 'SRR1159283', 'SRR11033689', 'SRR10379957', 'SRR5153918', 'SRR5153827', 'SRR10380005', 'SRR11033712', 'SRR10379913', 'SRR10380178', 'SRR7592374', 'SRR10380104', 'SRR10379909', 'SRR6458451', 'SRR10380190', 'SRR7652980', 'SRR10380033', 'SRR5153915', 'SRR5486881', 'SRR6356959', 'SRR6458441', 'SRR7657759', 'SRR5152906', 'SRR7516444', 'SRR5486873', 'SRR6807698', 'SRR7592366', 'SRR1159279', 'SRR5152928', 'SRR5486898', 'SRR10380176', 'SRR9738478', 'SRR1159076', 'SRR6356924', 'SRR10380006', 'SRR10379954', 'SRR6458393', 'SRR3544734', 'SRR3743402', 'SRR7592369', 'SRR11033657', 'SRR10379938', 'SRR5153913', 'SRR10380055', 'SRR9738488', 'SRR10380016', 'SRR10380177', 'SRR6807751', 'SRR1162967', 'SRR10380125', 'SRR6356972', 'SRR1159362', 'SRR6458427', 'SRR5152921', 'SRR7654039', 'SRR10380120', 'SRR10380252', 'SRR1163037', 'SRR10379891', 'SRR10380220', 'SRR10525338', 'SRR7516319', 'SRR10397157', 'SRR10380222', 'SRR3743400', 'SRR7655471', 'SRR10525320', 'SRR11033608', 'SRR3743385', 'SRR5152920', 'SRR5152950', 'SRR5153900', 'SRR11033610', 'SRR10380119', 'SRR10379906', 'SRR5152940', 'SRR5152948', 'SRR1163369', 
'SRR11033713', 'SRR11033603', 'SRR10525340', 'SRR7516423', 'SRR3743485', 'SRR7655790', 'SRR6458460', 'SRR5153213', 'SRR10397188', 'SRR10380133', 'SRR1163354', 'SRR3743413', 'SRR6807712', 'SRR10397095', 'SRR7655978', 'SRR10379933', 'SRR7516428', 'SRR6807726', 'SRR10397213', 'SRR7657761', 'SRR1162542', 'SRR11033733', 'SRR1163387', 'SRR10380223', 'SRR1163302', 'SRR10380071', 'SRR10379945', 'SRR10379903', 'SRR11033625', 'SRR7516323', 'SRR5153848', 'SRR10397204', 'SRR1162952', 'SRR10525347', 'SRR10380150', 'SRR11033682', 'SRR3544743', 'SRR3544748', 'SRR1163198', 'SRR10380140', 'SRR5152908', 'SRR1163121', 'SRR7653080', 'SRR7592337', 'SRR10380110', 'SRR6357007', 'SRR10380044', 'SRR10379890', 'SRR7516322', 'SRR10397094', 'SRR11033630', 'SRR7592384', 'SRR1159044', 'SRR6458409', 'SRR10380197', 'SRR5153279', 'SRR5153883', 'SRR1159904', 'SRR6356945', 'SRR1162502', 'SRR3544731', 'SRR10379962', 'SRR5153262', 'SRR10397160', 'SRR5153316', 'SRR10380162', 'SRR5153928', 'SRR9738553', 'SRR5153271', 'SRR6356927', 'SRR6384970', 'SRR7592340', 'SRR11033589', 'SRR1158898', 'SRR6458445', 'SRR6807669', 'SRR6807733', 'SRR5486878', 'SRR3743438', 'SRR6384965', 'SRR7516302', 'SRR11033652', 'SRR6458412', 'SRR1163421', 'SRR3743498', 'SRR9738519', 'SRR10379978', 'SRR6356933', 'SRR7516345', 'SRR10380077', 'SRR3743481', 'SRR10397178', 'SRR11033738', 'SRR6356990', 'SRR7516351', 'SRR10379923', 'SRR5153606', 'SRR1159700', 'SRR1159303', 'SRR10380028', 'SRR5153237', 'SRR6357005', 'SRR10380065', 'SRR10380242', 'SRR7516333', 'SRR10397180', 'SRR1159309', 'SRR5486892', 'SRR5152951', 'SRR5153835', 'SRR10380155', 'SRR7653082', 'SRR7655466', 'SRR3544742', 'SRR10379992', 'SRR10380198', 'SRR11033755', 'SRR5153291', 'SRR11033730', 'SRR3743459', 'SRR7516289', 'SRR10379922', 'SRR1159167', 'SRR11033634', 'SRR10380078', 'SRR10397266', 'SRR11033645', 'SRR5486880', 'SRR7516438', 'SRR10379947', 'SRR10525343', 'SRR10379893', 'SRR6807688', 'SRR5153821', 'SRR5153811', 'SRR10379989', 'SRR6357002', 'SRR10379882', 'SRR7655615', 
'SRR6356976', 'SRR10397208', 'SRR10397257', 'SRR7516342', 'SRR10379883', 'SRR11033777', 'SRR7516427', 'SRR7655469', 'SRR10397179', 'SRR10397191', 'SRR10380026', 'SRR11033714', 'SRR10380091', 'SRR10380073', 'SRR10380205', 'SRR3544751', 'SRR9738508', 'SRR10397273', 'SRR5153226', 'SRR7516314', 'SRR5153245', 'SRR10380050', 'SRR3544744', 'SRR3544750', 'SRR7516366', 'SRR7654034', 'SRR7655981', 'SRR10379939', 'SRR7516329', 'SRR10379943', 'SRR11033613', 'SRR3743482', 'SRR6356985', 'SRR9738547', 'SRR11033659', 'SRR11033617', 'SRR7592343', 'SRR7653085', 'SRR10380045', 'SRR10379940', 'SRR1162509', 'SRR10397175', 'SRR1163319', 'SRR1159298', 'SRR5153086', 'SRR7657755', 'SRR1162532', 'SRR5153831', 'SRR5486895', 'SRR10379900', 'SRR3743494', 'SRR10379878', 'SRR9738487', 'SRR7516352', 'SRR10397107', 'SRR10397122', 'SRR5153095', 'SRR10380011', 'SRR11033754', 'SRR7592335', 'SRR5153849', 'SRR11033596', 'SRR7516450', 'SRR7516313', 'SRR6356983', 'SRR11033600', 'SRR6356965', 'SRR5153904', 'SRR5153206', 'SRR9738510', 'SRR1159682', 'SRR5153717', 'SRR1162972', 'SRR10380209', 'SRR1159308', 'SRR7655786', 'SRR1159154', 'SRR5486888', 'SRR6458463', 'SRR7516332', 'SRR7516355', 'SRR10397169', 'SRR6807696', 'SRR5153234', 'SRR5486891', 'SRR5153868', 'SRR1163330', 'SRR9738485', 'SRR5153711', 'SRR6458442', 'SRR10380235', 'SRR6807692', 'SRR5153232', 'SRR6807734', 'SRR7516380', 'SRR5153085', 'SRR10397247', 'SRR10380146', 'SRR1163411', 'SRR5152916', 'SRR11033624', 'SRR10379904', 'SRR1159687', 'SRR6356934', 'SRR10397148', 'SRR6356926', 'SRR6356958', 'SRR5153240', 'SRR7516363', 'SRR3743449', 'SRR10380251', 'SRR5153854', 'SRR7592363', 'SRR10379928', 'SRR3743368', 'SRR3544735', 'SRR5486904', 'SRR5153824', 'SRR1163366', 'SRR10379972', 'SRR7655789', 'SRR6356977', 'SRR7516416', 'SRR6458399', 'SRR6807713', 'SRR7657748', 'SRR11033757', 'SRR5153852', 'SRR5153710', 'SRR11033696', 'SRR10380067', 'SRR11033718', 'SRR10380013', 'SRR11033656', 'SRR10380070', 'SRR10397133', 'SRR10380147', 'SRR10525359', 'SRR7516334', 
'SRR5153921', 'SRR10397240', 'SRR1163414', 'SRR10380041', 'SRR1166910', 'SRR6807755', 'SRR10380141', 'SRR5486887', 'SRR11033672', 'SRR5153830', 'SRR1159293', 'SRR7654040', 'SRR10397256', 'SRR1162537', 'SRR11033650', 'SRR6356973', 'SRR3544745', 'SRR7516376', 'SRR5152896', 'SRR9738552', 'SRR10380210', 'SRR11033636', 'SRR7516454', 'SRR9738545', 'SRR10379950', 'SRR6807681', 'SRR10525345', 'SRR5153601', 'SRR10379949', 'SRR6807725', 'SRR6458452', 'SRR7516346', 'SRR10379990', 'SRR7516350', 'SRR5153078', 'SRR3743409', 'SRR10380037', 'SRR7516371', 'SRR5153712', 'SRR10380051', 'SRR10397096', 'SRR6807738', 'SRR3544723', 'SRR1159006', 'SRR5153235', 'SRR5152945', 'SRR7653024', 'SRR10380115', 'SRR10397263', 'SRR7592365', 'SRR7652970', 'SRR10397252', 'SRR10397131', 'SRR10380214', 'SRR7655953', 'SRR10397235', 'SRR7516434', 'SRR7516317', 'SRR3743458', 'SRR5152930', 'SRR10380057', 'SRR9738486', 'SRR10380034', 'SRR10525339', 'SRR11033679', 'SRR5153214', 'SRR11033611', 'SRR5153083', 'SRR7516312', 'SRR1162953', 'SRR10380022', 'SRR5153275', 'SRR7653017', 'SRR1159351', 'SRR10379997', 'SRR9738476', 'SRR5153327', 'SRR5153885', 'SRR7516298', 'SRR5152912', 'SRR10380186', 'SRR5486883', 'SRR7655613', 'SRR5153266', 'SRR6807717', 'SRR1163115', 'SRR10380226', 'SRR5153082', 'SRR1163029', 'SRR10379889', 'SRR11033597', 'SRR10379971', 'SRR6807736', 'SRR1163397', 'SRR6458383', 'SRR10397219', 'SRR1159738', 'SRR5153194', 'SRR1163341', 'SRR7516402', 'SRR1162539', 'SRR10380208', 'SRR11033753', 'SRR1162513', 'SRR7516303', 'SRR5153325', 'SRR10397246', 'SRR6807697', 'SRR10525362', 'SRR10397116', 'SRR11033702', 'SRR6356935', 'SRR7516307', 'SRR7592380', 'SRR11033631', 'SRR5153619', 'SRR10525328', 'SRR10397239', 'SRR1163022', 'SRR9738484', 'SRR5153307', 'SRR6458384', 'SRR10525369', 'SRR6356940', 'SRR10379958', 'SRR6807743', 'SRR7516335', 'SRR10525342', 'SRR7516426', 'SRR9738481', 'SRR7516330', 'SRR7516408', 'SRR10380144', 'SRR7516386', 'SRR1163135', 'SRR7516316', 'SRR7516336', 'SRR7653083', 'SRR11033622', 
'SRR6458457', 'SRR6458455', 'SRR10380007', 'SRR1163199', 'SRR7655470', 'SRR1163171', 'SRR10379955', 'SRR1158923', 'SRR3743473', 'SRR1159262', 'SRR1159526', 'SRR5153093', 'SRR5153809', 'SRR10379993', 'SRR6458398', 'SRR11033676', 'SRR5153317', 'SRR3743408', 'SRR6356944', 'SRR9738503', 'SRR9738514', 'SRR1163166', 'SRR1166921', 'SRR7516412', 'SRR11033640', 'SRR11033646', 'SRR1159521', 'SRR10380093', 'SRR6807718', 'SRR6807716', 'SRR6356939', 'SRR1159747', 'SRR6458456', 'SRR6356953', 'SRR6807678', 'SRR10397185', 'SRR11033628', 'SRR11033647', 'SRR10379929', 'SRR7653027', 'SRR5153423', 'SRR3743201', 'SRR5152973', 'SRR10380243', 'SRR5153326', 'SRR6807672', 'SRR7516365', 'SRR10380169', 'SRR10397205', 'SRR10379899', 'SRR1159954', 'SRR11033697', 'SRR11033765', 'SRR5153324', 'SRR6356957', 'SRR11033715', 'SRR6356928', 'SRR5152910', 'SRR6458400', 'SRR6458406', 'SRR1159005', 'SRR5153615', 'SRR11033623', 'SRR5152941', 'SRR6807690', 'SRR10397232', 'SRR1159729', 'SRR7516344', 'SRR10380049', 'SRR5486900', 'SRR11033635', 'SRR9738558', 'SRR1163393', 'SRR10397117', 'SRR6357004', 'SRR10379996', 'SRR10379907', 'SRR10380018', 'SRR5153608', 'SRR10380193', 'SRR10397217', 'SRR10380215', 'SRR1159369', 'SRR7652971', 'SRR10380249', 'SRR6356952', 'SRR7516429', 'SRR7655979', 'SRR6807756', 'SRR5153602', 'SRR1163204', 'SRR6807735', 'SRR3743392', 'SRR10379901', 'SRR5153328', 'SRR1162957', 'SRR10380023', 'SRR7516409', 'SRR10380009', 'SRR6807746', 'SRR6807741', 'SRR5153851', 'SRR5153600', 'SRR5152958', 'SRR11033723', 'SRR10379976', 'SRR5153836', 'SRR10380109', 'SRR1163291', 'SRR6356923', 'SRR5153081', 'SRR10525333', 'SRR5153611', 'SRR7516343', 'SRR10380117', 'SRR5152944', 'SRR11033618', 'SRR6807731', 'SRR7516418', 'SRR10380066', 'SRR1159034', 'SRR6458410', 'SRR5153919', 'SRR10379936', 'SRR7516354', 'SRR7516458', 'SRR10397172', 'SRR10380207', 'SRR5153841', 'SRR5486869', 'SRR11033680', 'SRR5152917', 'SRR11033743', 'SRR6458392', 'SRR5152919', 'SRR11033769', 'SRR5152946', 'SRR6458414', 'SRR5153332', 
'SRR6807719', 'SRR5486866', 'SRR5152959', 'SRR1159204', 'SRR10379942', 'SRR6458389', 'SRR5153867', 'SRR6458407', 'SRR10380121', 'SRR6356963', 'SRR7516417', 'SRR10379999', 'SRR10397186', 'SRR3743383', 'SRR7653025', 'SRR3743407', 'SRR10380101', 'SRR10380184', 'SRR6807747', 'SRR11033710', 'SRR5153265', 'SRR10380054', 'SRR7516293', 'SRR3743411', 'SRR6356955', 'SRR6356938', 'SRR6356997', 'SRR10380236', 'SRR1163430', 'SRR11033711', 'SRR3743474', 'SRR1158943', 'SRR6356950', 'SRR5153073', 'SRR7516383', 'SRR6807704', 'SRR10397181', 'SRR7516373', 'SRR7592388', 'SRR11033616', 'SRR3743403', 'SRR7657757', 'SRR7516328', 'SRR10380064', 'SRR10379960', 'SRR7652968', 'SRR7653011', 'SRR1162534', 'SRR5153840', 'SRR7516361', 'SRR5153090', 'SRR1159180', 'SRR6458418', 'SRR10379963', 'SRR7657749', 'SRR3743415', 'SRR10525361', 'SRR7652977', 'SRR7516440', 'SRR6458415', 'SRR7516449', 'SRR1158929', 'SRR1159038', 'SRR10379983', 'SRR5153855', 'SRR11033752', 'SRR10379967', 'SRR6458397', 'SRR5152953', 'SRR5486876', 'SRR7516398', 'SRR10380046', 'SRR10379924', 'SRR3743450', 'SRR10379931', 'SRR11033615', 'SRR3743377', 'SRR11033594', 'SRR6357008', 'SRR9738482', 'SRR7516459', 'SRR1159349', 'SRR1162531', 'SRR10380183', 'SRR5486885', 'SRR3544732', 'SRR1159818', 'SRR7516309', 'SRR10397248', 'SRR7592332', 'SRR3743500', 'SRR5153708', 'SRR10525324', 'SRR7516297', 'SRR3743486', 'SRR6807737', 'SRR10380068', 'SRR11033774', 'SRR7516437', 'SRR10379884', 'SRR7516385', 'SRR3743412', 'SRR10380039', 'SRR11033607', 'SRR6807695', 'SRR3544747', 'SRR7592357', 'SRR5153920', 'SRR11033761', 'SRR7592342', 'SRR1162535', 'SRR10379953', 'SRR6807711', 'SRR7652975', 'SRR7516315', 'SRR7516379', 'SRR9738546', 'SRR10397119', 'SRR7516320', 'SRR1163406', 'SRR5486868', 'SRR7516433', 'SRR10397144', 'SRR10380111', 'SRR7655612', 'SRR10397227', 'SRR1159150', 'SRR1166916', 'SRR5153278', 'SRR5153917', 'SRR7516452', 'SRR7652996', 'SRR10397196', 'SRR7657746', 'SRR7655614', 'SRR10380084', 'SRR5153597', 'SRR7592330', 'SRR5153906', 'SRR1163021', 
'SRR10397153', 'SRR6356999', 'SRR10397195', 'SRR5486872', 'SRR6458434', 'SRR7516299', 'SRR7592371', 'SRR10380030', 'SRR7592368', 'SRR5153814', 'SRR10379925', 'SRR10525317', 'SRR7592327', 'SRR7516310', 'SRR7516396', 'SRR10379946', 'SRR10380212', 'SRR1172286', 'SRR10380196', 'SRR9738530', 'SRR5153924', 'SRR1163314', 'SRR10397132', 'SRR1163429', 'SRR5486902', 'SRR5152905', 'SRR7516419', 'SRR10380123', 'SRR10380010', 'SRR5153861', 'SRR3743200', 'SRR11033595', 'SRR5153722', 'SRR3743463', 'SRR7516403', 'SRR5153903', 'SRR10379908', 'SRR1162996', 'SRR6458448', 'SRR5153922', 'SRR3743416', 'SRR7655980', 'SRR9738518', 'SRR10380227', 'SRR10379921', 'SRR11033667', 'SRR3544727', 'SRR5486884', 'SRR10379927', 'SRR11033654', 'SRR1162533', 'SRR10397222', 'SRR1163080', 'SRR1169019', 'SRR7516448', 'SRR11033658', 'SRR10379934', 'SRR5153878', 'SRR5153866', 'SRR10380127', 'SRR10397110', 'SRR6807748', 'SRR7592346', 'SRR7516325', 'SRR10397124', 'SRR5153844', 'SRR10380163', 'SRR10380148', 'SRR10397176', 'SRR6356980', 'SRR7516305', 'SRR10397229', 'SRR11033764', 'SRR1158907', 'SRR1163363', 'SRR1163338', 'SRR5153622', 'SRR5153075', 'SRR5153255', 'SRR6458454', 'SRR5153879', 'SRR10397214', 'SRR6356960', 'SRR7516399', 'SRR7592325', 'SRR10397129', 'SRR3743475', 'SRR1159350', 'SRR5153509', 'SRR5153077', 'SRR5153308', 'SRR6458464', 'SRR11033778', 'SRR5153270', 'SRR10397139', 'SRR5152902', 'SRR1163298', 'SRR11033612', 'SRR10379915', 'SRR10380002', 'SRR10379984', 'SRR3743202', 'SRR1163177', 'SRR5153222', 'SRR10379886', 'SRR10397194', 'SRR10379937', 'SRR3743487', 'SRR6807722', 'SRR9738512', 'SRR1159237', 'SRR5153882', 'SRR7654035', 'SRR3743398', 'SRR6356981', 'SRR10397130', 'SRR11033660', 'SRR5153319', 'SRR10525348', 'SRR10379974', 'SRR1159713', 'SRR1163106', 'SRR5153272', 'SRR5152942', 'SRR1159352', 'SRR6458416', 'SRR6356989', 'SRR7657750', 'SRR7653022', 'SRR5152937', 'SRR1163368', 'SRR7516372', 'SRR10380069', 'SRR11033670', 'SRR6458450', 'SRR11033666', 'SRR7516404', 'SRR1163185', 'SRR5153838', 
'SRR1158890', 'SRR11033763', 'SRR10525375', 'SRR10397173', 'SRR3743391', 'SRR7516424', 'SRR11033734', 'SRR1163103', 'SRR10380031', 'SRR10379969', 'SRR11033758', 'SRR5153599', 'SRR11033720', 'SRR6458444', 'SRR6807714', 'SRR6807724', 'SRR5153596', 'SRR3544733', 'SRR7592349', 'SRR5152938', 'SRR10379935', 'SRR7655465', 'SRR10380100', 'SRR10380172', 'SRR10397166', 'SRR7516420', 'SRR6458396', ...}
# Genome IDs from the htbc dataset
htbc_genomes = set(all_genomes_ids_dict['htbc'])
len(htbc_genomes)
501
# htbc_genomes is already a set, so reuse it instead of building a second copy.
htbc_genomes_set = htbc_genomes
# Find HTBC genomes, which are resistant.
# sorted() rather than list(): iteration order of a set of strings depends on
# the per-process hash seed, so list(set) would hand back the IDs in a
# different order on every kernel restart. Sorting keeps the ID list that is
# later written to JSON (and the row order of the test table) reproducible.
# Assumes the genome IDs are mutually comparable (plain strings) -- TODO confirm.
htbc_resistant_genomes = sorted(htbc_genomes_set.intersection(resistant_genomes))
len(htbc_resistant_genomes)
338
# Find HTBC genomes, which are sensitive (sorted for the same reason as above).
htbc_sensitive_genomes = sorted(htbc_genomes_set.intersection(sensitive_genomes))
len(htbc_sensitive_genomes)
163
import random

# The Tb-portals training cohort is reduced by skipping a fixed number of IDs
# from the front of each class list. The offsets are named once here so the
# count preview and the final selection below cannot drift apart.
TBPORTALS_RESISTANT_SKIP = 830
TBPORTALS_SENSITIVE_SKIP = 100
# Fixed seed so the retained subset is identical on every run.
SUBSAMPLE_SEED = 0

# Find Tb-portals genomes, which are resistant.
# A set has no stable order (string hashes are randomized per process), so
# slicing list(set) would keep a different arbitrary subset on each run.
# Sort first to get a canonical order, then shuffle with a fixed seed so the
# retained subset stays pseudo-random but reproducible instead of being biased
# towards high accession numbers.
# Assumes the genome IDs are mutually comparable (plain strings) -- TODO confirm.
tbportals_resistant_genomes = sorted(tbportals_genomes_set.intersection(resistant_genomes))
random.Random(SUBSAMPLE_SEED).shuffle(tbportals_resistant_genomes)
len(tbportals_resistant_genomes[TBPORTALS_RESISTANT_SKIP:])
800
# Find Tb-portals genomes, which are sensitive (same sort + seeded shuffle).
tbportals_sensitive_genomes = sorted(tbportals_genomes_set.intersection(sensitive_genomes))
random.Random(SUBSAMPLE_SEED).shuffle(tbportals_sensitive_genomes)
len(tbportals_sensitive_genomes)
523
# Create a list of HTBC genomes to be used for the final analysis.
# All HTBC genomes are kept: resistant IDs first, then sensitive IDs.
final_htbc_test_genome_ids = htbc_resistant_genomes + htbc_sensitive_genomes
len(final_htbc_test_genome_ids)
501
# Create a list of Tb-portals genomes to be used for the final analysis.
# Only the IDs after the skip offsets are kept: resistant first, then sensitive.
final_tbportals_train_genome_ids = (
    tbportals_resistant_genomes[TBPORTALS_RESISTANT_SKIP:]
    + tbportals_sensitive_genomes[TBPORTALS_SENSITIVE_SKIP:]
)
len(final_tbportals_train_genome_ids)
1223
# Register the final Tb-portals training genome IDs in the ID dictionary
# (the dictionary itself is serialized to JSON at the end of this section).
all_genomes_ids_dict['final_tbportals_train_genomes'] = final_tbportals_train_genome_ids
all_genomes_ids_dict.keys()
dict_keys(['tb_portals', 'htbc', 'final_tbportals_train_genomes'])
# Register the final HTBC test genome IDs in the same dictionary.
all_genomes_ids_dict['final_htbc_test_genomes'] = final_htbc_test_genome_ids
all_genomes_ids_dict.keys()
dict_keys(['tb_portals', 'htbc', 'final_tbportals_train_genomes', 'final_htbc_test_genomes'])
# Write the final JSON file.
# The output lands in data_location; the dump call must sit inside the
# with-block so the file handle is still open when it runs and is closed
# right after.
import json
with open(data_location + "test_train_genome_ids.json", "w") as f:
    json.dump(all_genomes_ids_dict, f, indent=4)
# Load the multilabel table and index it by sample ID.
# NOTE(review): this path is rooted at "../data/" while data_location is
# "../../data/raw/" -- confirm both resolve inside the same data directory.
binarized_final_df = pd.read_csv("../data/processed/final.binarized_final_multilabel_df.csv").set_index('SampleID')
binarized_final_df.head()
binarized_final_df.columns
binarized_final_df.index
# Drop the per-drug, lineage and drug-resistance-type columns so that
# Resistance_Status is the only label column left among those listed here.
# `columns=` already selects the axis, so no separate `axis` argument is needed.
binarized_final_df = binarized_final_df.drop(
    columns=[*renamed_drug_columns_names, *lineage_column_names, 'drtype', 'MDR', 'XDR']
)
binarized_final_df.head()
binarized_final_df.shape
# Binarize the label: 'Sensitive' -> 0.0, every other value -> 1.0.
binarized_final_df['Resistance_Status'] = binarized_final_df['Resistance_Status'].apply(
    lambda resistance: 0.0 if resistance == 'Sensitive' else 1.0
)
binarized_final_df.head()
# Pass the separator by keyword: positional arguments after the path are
# deprecated in pandas 2.1+ and rejected from pandas 3.0 on.
binarized_final_df.to_csv("../data/processed/final.binarized_final_monolabel_df.tsv", sep="\t")
binarized_final_df.head()
# Training table: rows of the mono-label table selected by the final
# Tb-portals genome IDs (label-based lookup on the table's index).
train = binarized_final_df.loc[final_tbportals_train_genome_ids]
train.shape
# Separator passed by keyword: positional arguments after the path are
# deprecated in pandas 2.1+ and rejected from pandas 3.0 on.
train.to_csv("../data/processed/final.train.tsv", sep="\t")
train.head()
# Test table: rows selected by the final HTBC genome IDs.
test = binarized_final_df.loc[final_htbc_test_genome_ids]
test.shape
test.to_csv("../data/processed/final.test.tsv", sep="\t")
test.shape