# Version tag embedded in every output artefact filename written below.
VERSION = "20221001"
import pickle
from math import ceil
import pandas as pd
import seaborn as sb
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rc_file_defaults
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import ConfusionMatrixDisplay
# Load the logicle-transformed FCS event table (a pandas DataFrame).
# NOTE(review): pickle.load is only safe on trusted files — this one is local.
with open("fcs_events_logicle_20220930.pkl", 'rb') as fh:
    fcs_events = pickle.load(fh)
# Marker channels are every column except the metadata columns.
markers = [c for c in fcs_events.columns if c not in ['sample', 'peptide', 'source']]
markers  # notebook display of the marker list
['CD16', 'KLRG1', 'CD8', 'CD28', 'CD27', 'CD244', 'CD161', 'CD127', 'CCR5', 'CCR7', 'CD95', 'TIGIT', 'CXCR3', 'PD1', 'CD57', 'CD103', 'Perforin', 'CD3', 'ICOS', 'HLADR', 'CD45RO', 'GranzymeB', 'CD56', 'CD45RA', 'CD101', 'CD38', 'CX3CR1', 'CD71', 'CD39']
Note that the CD3 and CD8 channels are included among the markers.
# Number of distinct samples (donor measurements) in the event table.
print("Number of samples", len(fcs_events['sample'].unique()))
Number of samples 162
# Box plots of the raw (pre-normalisation) distribution of every marker,
# laid out in a grid three plots wide.
num_cols = 3
num_rows = ceil(len(markers)/num_cols)
fig, axes = plt.subplots(
    num_rows, num_cols, figsize=(8*num_cols, 6*num_rows))
for ax, ch in zip(axes.ravel(), markers):
    sb.boxplot(x=fcs_events[ch], ax=ax)#, saturation=0.5, width=0.4, boxprops={'zorder': 2}, palette="Set2")
plt.show()
# Standardise every marker channel in place to zero mean / unit variance.
for ch in markers:
    fcs_events[ch] = zscore(fcs_events[ch].values)
# Same box-plot grid as above, after z-score normalisation, to verify
# the channels are now on a comparable scale.
num_cols = 3
num_rows = ceil(len(markers)/num_cols)
fig, axes = plt.subplots(
    num_rows, num_cols, figsize=(8*num_cols, 6*num_rows))
for ax, ch in zip(axes.ravel(), markers):
    sb.boxplot(x=fcs_events[ch], ax=ax)#, saturation=0.5, width=0.4, boxprops={'zorder': 2}, palette="Set2")
plt.show()
def sample_events(fcs_events, cells_per_source=1000):
    """Subsample `fcs_events` to at most `cells_per_source` cells per source.

    Cells are drawn evenly from the samples (donors) that actually have
    events for a given source: each contributing sample provides up to
    ``cells_per_source / number_of_contributing_samples`` cells, or fewer
    when a sample has fewer hits than that.

    Parameters
    ----------
    fcs_events : pd.DataFrame
        Event table with at least 'sample' and 'source' columns.
    cells_per_source : int
        Upper bound on the number of cells kept per source.

    Returns
    -------
    pd.DataFrame
        The subsampled events, grouped by source and sample.
    """
    # Collect the pieces and concatenate once at the end: repeated
    # pd.concat inside the loop is quadratic in the number of pieces.
    chunks = []
    for source in fcs_events['source'].unique():
        source_mask = fcs_events['source'] == source
        # value_counts lists only the samples with >= 1 hit for this source,
        # so n is divided among the samples that can actually contribute.
        counts = fcs_events['sample'][source_mask].value_counts()
        n = int(cells_per_source / len(counts))
        for sample in counts.index:
            mask = (fcs_events['sample'] == sample) & source_mask
            # A sample may have fewer than n hits; take what is available.
            chunks.append(fcs_events[mask].sample(min(sum(mask), n)))
    return pd.concat(chunks) if chunks else pd.DataFrame()
# Hyper-parameters for the regularised logistic regression.
solver = "liblinear"        # liblinear supports the l1 penalty
penalty = "l1"              # l1 drives coefficients to zero -> feature selection
cv = 6                      # inner CV folds used by LogisticRegressionCV
class_weight = "balanced"   # compensate for class-size imbalance between sources
outerfolds = 100            # outer resampling iterations

# Metadata columns ('sample', 'peptide', 'source') carried along with the labels.
# Computed from `fcs_events` rather than `fcs_events_sampled` (which is only
# assigned inside the loop below), so a fresh top-to-bottom run does not hit a
# NameError; both frames have identical columns.
non_markers = list(set(fcs_events.columns) - set(markers))
best_f1 = -1
for i in range(outerfolds):
    print(f"Starting iteration #{i}")
    # Fresh subsample each outer iteration to reduce sampling bias.
    fcs_events_sampled = sample_events(fcs_events)
    y_ext = fcs_events_sampled[non_markers]  # labels plus metadata
    X = fcs_events_sampled[markers]
    # train_test_split also shuffles; no fixed random_state, so every
    # iteration gets a different split.
    X_train, X_test, y_train_ext, y_test_ext = train_test_split(X, y_ext)
    y_train = y_train_ext['source']
    y_test = y_test_ext['source']
    lg = LogisticRegressionCV(n_jobs=2, cv=cv,
                              solver=solver, penalty=penalty, class_weight=class_weight)
    lg.fit(X_train, y_train)
    y_pred = lg.predict(X_test)
    # Weighted averages account for unequal class support in the test set.
    f1 = f1_score(y_test, y_pred, average="weighted")
    precision = precision_score(y_test, y_pred, average="weighted")
    recall = recall_score(y_test, y_pred, average="weighted")
    # Persist the model, its scores and the exact data split, in that order.
    with open(f"targetscape-model-{VERSION}_results_{i}.pkl", 'wb') as fh:
        pickle.dump(lg, fh)
        pickle.dump([f1, precision, recall], fh)
        pickle.dump([X_train, X_test, y_train_ext, y_test_ext], fh)
    if f1 > best_f1:
        best_f1 = f1
        print(f"New best F1={f1:.3f} in iteration #{i}")
Starting iteration #0 New best F1=0.702 in iteration #0 Starting iteration #1 Starting iteration #2 Starting iteration #3 New best F1=0.702 in iteration #3 Starting iteration #4 Starting iteration #5 Starting iteration #6 Starting iteration #7 Starting iteration #8 Starting iteration #9 Starting iteration #10 Starting iteration #11 Starting iteration #12 Starting iteration #13 New best F1=0.723 in iteration #13 Starting iteration #14 Starting iteration #15 Starting iteration #16 Starting iteration #17 Starting iteration #18 Starting iteration #19 Starting iteration #20 Starting iteration #21 Starting iteration #22 Starting iteration #23 Starting iteration #24 Starting iteration #25 Starting iteration #26 Starting iteration #27 Starting iteration #28 Starting iteration #29 Starting iteration #30 Starting iteration #31 Starting iteration #32 Starting iteration #33 Starting iteration #34 Starting iteration #35 Starting iteration #36 Starting iteration #37 Starting iteration #38 Starting iteration #39 Starting iteration #40 Starting iteration #41 Starting iteration #42 Starting iteration #43 Starting iteration #44 Starting iteration #45 Starting iteration #46 Starting iteration #47 Starting iteration #48 Starting iteration #49 Starting iteration #50 Starting iteration #51 Starting iteration #52 Starting iteration #53 Starting iteration #54 Starting iteration #55 Starting iteration #56 Starting iteration #57 Starting iteration #58 Starting iteration #59 Starting iteration #60 Starting iteration #61 Starting iteration #62 Starting iteration #63 Starting iteration #64 Starting iteration #65 Starting iteration #66 Starting iteration #67 Starting iteration #68 Starting iteration #69 Starting iteration #70 Starting iteration #71 Starting iteration #72 Starting iteration #73 Starting iteration #74 Starting iteration #75 Starting iteration #76 Starting iteration #77 Starting iteration #78 Starting iteration #79 Starting iteration #80 Starting iteration #81 Starting iteration 
#82 Starting iteration #83 Starting iteration #84 Starting iteration #85 Starting iteration #86 Starting iteration #87 Starting iteration #88 Starting iteration #89 Starting iteration #90 Starting iteration #91 Starting iteration #92 Starting iteration #93 Starting iteration #94 Starting iteration #95 Starting iteration #96 Starting iteration #97 Starting iteration #98 Starting iteration #99
# Reload the artefacts of the best outer iteration (highest test F1,
# round 13 according to the log above).
best_round = 13
with open(f"targetscape-model-{VERSION}_results_{best_round}.pkl", 'rb') as fh:
    # Dumps were written in this order: model, scores, data split.
    best_lg = pickle.load(fh)
    best_f1, best_precision, best_recall = pickle.load(fh)
    best_X_train, best_X_test, best_y_train_ext, best_y_test_ext = pickle.load(fh)
best_f1, best_precision, best_recall  # notebook display
(0.7228558519536106, 0.7249593965409483, 0.7262931034482759)
# Sanity check: re-predicting on the stored test split reproduces the stored F1.
best_y_pred = best_lg.predict(best_X_test)
f1 = f1_score(best_y_test_ext['source'], best_y_pred, average="weighted")
f1  # notebook display
0.7228558519536106
# Evaluate the single best model on every iteration's held-out test set to
# obtain a distribution of scores.
# NOTE(review): other iterations' test sets were drawn from the same event
# pool the best model was trained on, so these scores may be optimistic —
# confirm the overlap is acceptable.
all_f1 = []
all_precision = []
all_recall = []
for i in range(outerfolds):
    with open(f"targetscape-model-{VERSION}_results_{i}.pkl", 'rb') as fh:
        _lg = pickle.load(fh)
        _f1, _precision, _recall = pickle.load(fh)
        _X_train, X_test, y_train_ext, y_test_ext = pickle.load(fh)
    # predict with best model on this test set and record scores
    y_pred = best_lg.predict(X_test)
    all_f1.append(
        f1_score(y_test_ext['source'], y_pred, average="weighted"))
    all_precision.append(
        precision_score(y_test_ext['source'], y_pred, average="weighted"))
    all_recall.append(
        recall_score(y_test_ext['source'], y_pred, average="weighted"))
# Long-format frame (one row per iteration x metric) for seaborn plotting.
df1 = pd.DataFrame(all_f1, columns=["Score"])
df1['Category'] = 'F1'
df2 = pd.DataFrame(all_precision, columns=["Score"])
df2['Category'] = 'Precision'
df3 = pd.DataFrame(all_recall, columns=["Score"])
df3['Category'] = 'Recall'
cdf = pd.concat([df1, df2, df3])
# Persist the score distribution alongside the models.
with open(f"targetscape-model-{VERSION}_scores.pkl", 'wb') as fh:
    pickle.dump(cdf, fh)
cdf.to_csv(f"targetscape-model-{VERSION}_scores.csv")
# Strip plot of individual scores layered over a box plot, one group per metric.
# https://stackoverflow.com/questions/69458775/seaborn-boxplot-and-stripplot-points-arent-aligned-over-the-x-axis-by-hue
plt.figure(figsize=(8,10))
#sb.set_context("notebook", font_scale=3)
#sb.set(style="darkgrid")
sb.set_theme()
#sb.set_context("talk")#paper")
sb.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})
#sb.despine(left=True)
ax = sb.stripplot(x="Category", y="Score", hue="Category",
    data=cdf, alpha=0.3, linewidth=1, jitter=True, dodge=True,)
sb.boxplot(x="Category", y="Score", hue="Category",
    data=cdf, fliersize=0, ax=ax)
# Get the handles and labels
handles, labels = ax.get_legend_handles_labels()
# Keep only the first three legend entries (one per metric); the two plot
# layers would otherwise produce duplicate entries.
l = plt.legend(handles[0:3], labels[0:3], bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.ylim(0.625, 0.75)
plt.show()
For reference, see the previous incarnations, which are all broken (precision is always one).
# Pool the best model's predictions over all 100 held-out test sets into one
# large per-event prediction table.
labels = []
predictions = []
samples = []
for i in range(outerfolds):
    with open(f"targetscape-model-{VERSION}_results_{i}.pkl", 'rb') as fh:
        _lg = pickle.load(fh)
        _f1, _precision, _recall = pickle.load(fh)
        _X_train, X_test, _y_train_ext, y_test_ext = pickle.load(fh)
    y_pred = best_lg.predict(X_test)
    labels.extend(y_test_ext['source'])
    samples.extend(y_test_ext['sample'])
    predictions.extend(y_pred)
    #print(f1_score(y_test, y_pred, average="weighted"))
len(labels), len(predictions), len(samples)  # notebook display: sizes must match
(92800, 92800, 92800)
# Weighted F1 over the pooled predictions (notebook display).
f1_score(labels, predictions, average="weighted")
0.6897079284414659
# One row per pooled event: predicted source, true source, sample id.
df = pd.DataFrame(predictions, columns=['Prediction'])
df ['Label'] = labels
df ['Sample'] = samples
df  # notebook display
Prediction | Label | Sample | |
---|---|---|---|
0 | CMV | CMV | SDBB096 |
1 | Flu | Flu | SDBB048 |
2 | CD8 | CD8 | DLS020 |
3 | EBV | CMV | SDBB128 |
4 | CD8 | CD8 | SDBB109 |
... | ... | ... | ... |
92795 | CD8 | CMV | SDBB093 |
92796 | EBV | EBV | DLS046 |
92797 | Flu | Flu | SDBB089 |
92798 | CD8 | CD8 | SDBB021 |
92799 | EBV | EBV | SDBB018 |
92800 rows × 3 columns
# Save the per-event predictions for downstream analysis.
df.to_csv(f"targetscape-model-{VERSION}_predictions.csv")
# Coefficient matrix of the best model: one row per source (class),
# one column per marker.
coef_df = pd.DataFrame(best_lg.coef_)
coef_df = coef_df.rename(dict(enumerate(X.columns)), axis='columns')
coef_df['source'] = best_lg.classes_
coef_df = coef_df.set_index("source")
coef_df  # notebook display
CD16 | KLRG1 | CD8 | CD28 | CD27 | CD244 | CD161 | CD127 | CCR5 | CCR7 | ... | HLADR | CD45RO | GranzymeB | CD56 | CD45RA | CD101 | CD38 | CX3CR1 | CD71 | CD39 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
source | |||||||||||||||||||||
CD8 | -0.122736 | 0.122313 | -0.888155 | 0.104413 | 0.058758 | -0.045696 | 0.326850 | -0.518837 | -0.398624 | 0.000000 | ... | 0.000000 | -0.384487 | -0.247468 | 0.169034 | 0.212548 | -0.124061 | -0.016366 | -0.343833 | -0.030653 | 0.051271 |
CMV | -0.068701 | 0.224346 | 0.676842 | -0.053722 | -0.364873 | 0.372978 | -0.025279 | 0.000000 | -0.033722 | -0.102487 | ... | 0.052049 | -0.073462 | 0.461349 | -0.253484 | 0.149088 | 0.000000 | -0.022462 | 0.432681 | 0.026664 | -0.080587 |
EBV | 0.027371 | -0.037237 | -0.445967 | 0.132548 | 0.407655 | 0.103396 | -0.541379 | -0.143046 | -0.050989 | 0.532898 | ... | 0.198746 | 1.658728 | 0.016827 | 0.251512 | -0.451361 | -0.209694 | 0.132480 | -0.134069 | 0.000000 | -0.001118 |
Flu | 0.394121 | -0.436465 | 0.553383 | -0.259573 | 0.071487 | -0.493775 | 0.047624 | 1.026198 | 0.524661 | -0.442971 | ... | -0.370499 | -0.649010 | -0.177127 | 0.033219 | -0.171614 | 0.351369 | -0.097041 | -0.163171 | 0.033346 | 0.058603 |
4 rows × 29 columns
coef_df.to_csv(f"targetscape-model-{VERSION}_best_model_feature_importance.csv")
# For each source, print the five markers with the smallest absolute
# coefficient, i.e. the least informative features for that class.
for source in set(fcs_events['source']):
    # NOTE(review): this first sort_values() is redundant — f is re-sorted
    # by absolute value on the next line.
    f = coef_df.T[source].sort_values()
    print(source, f.abs().sort_values()[:5])
    print()
CMV ICOS 0.000000 CD101 0.000000 CD127 0.000000 CD38 0.022462 CD161 0.025279 Name: CMV, dtype: float64 CD8 CCR7 0.000000 HLADR 0.000000 CD38 0.016366 ICOS 0.027913 CD71 0.030653 Name: CD8, dtype: float64 Flu CD57 0.007231 CD56 0.033219 CD71 0.033346 CD161 0.047624 CD39 0.058603 Name: Flu, dtype: float64 EBV CD71 0.000000 CD95 0.000000 CD103 0.000000 CD39 0.001118 CD57 0.009855 Name: EBV, dtype: float64
# Re-apply the seaborn theme for the coefficient bar charts below.
sb.set_theme()
#plt.rcParams["figure.figsize"] = (5, 7)
def bar_color(df, color1, color2):
    """Map each value of `df` to `color1` if it is positive, else `color2`."""
    is_positive = df.values > 0
    return np.where(is_positive, color1, color2).T
# Horizontal bar chart of the sorted coefficients per source:
# green bars for positive coefficients, red for negative.
sources = set(fcs_events['source'])
num_cols = 1
num_rows = ceil(len(sources)/num_cols)
fig, axes = plt.subplots(
    num_rows, num_cols, figsize=(12*num_cols, 9*num_rows))
for ax, source in zip(axes.ravel(), sources):
    f = coef_df.T[source].sort_values()
    ax.barh(range(len(f)), f.values, color=bar_color(f, 'g', 'r'))#, width=0.8, bottom=None, *, align='center', data=None, **kwargs)
    ax.set_yticks(range(len(f)), f.index)#, rotation='vertical')
    #plt.xticks(x, labels, rotation='vertical')
    #set parameters for tick labels
    ax.tick_params(axis='y', which='major', labelsize=10)
    ax.set_title(f"{source}", fontsize=18)
plt.tight_layout()
#plt.rcParams["figure.figsize"] = plt.rcParamsDefault["figure.figsize"]
#plt.savefig("paper-2022-04-22_best_model_feature_importance.png")
#plt.savefig("paper-2022-04-22_best_model_feature_importance.pdf")
# Histogram of the maximum class probability per event — a rough view of
# how confident the model's predictions are.
proba = best_lg.predict_proba(fcs_events[markers])
_ = plt.hist([max(x) for x in proba], range=(0, 1), bins=50)
plt.title("Probability estimates (max)")
Text(0.5, 1.0, 'Probability estimates (max)')
def plot_wrapper(y_test, y_pred, solver, penalty, f1, class_weight, cv=None):
    """Plot row-normalised and raw confusion matrices side by side.

    The figure title records the model configuration (solver, penalty,
    class_weight, cv) together with the achieved F1 score.
    """
    fig, (ax_norm, ax_raw) = plt.subplots(1, 2, figsize=(16, 6))
    # Left: normalised over the true labels; right: absolute counts.
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax_norm, normalize='true')
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax_raw, normalize=None)
    fig.suptitle(f'LogisticRegression {solver} {penalty} F1={f1:.3f} class_weight={class_weight} cv={cv}', fontsize=16)
# Score the best model on the FULL event table.
# NOTE(review): this includes events the model was trained on, so the
# resulting F1 is optimistic — treat as a descriptive summary only.
y_pred = best_lg.predict(fcs_events[markers])
f1 = f1_score(fcs_events['source'], y_pred, average="weighted")
f1  # notebook display
0.7000523619813446
# Reset matplotlib to its defaults (undo the seaborn styling) before
# drawing the confusion matrices.
rc_file_defaults()
plot_wrapper(fcs_events['source'], y_pred, solver, penalty, f1, class_weight, cv=cv)