In [3]:
VERSION = "20221001"  # version tag embedded in every output filename (models, scores, CSVs)
In [4]:
import pickle
from math import ceil
In [5]:
import pandas as pd
import seaborn as sb
import numpy as np

import matplotlib.pyplot as plt
from matplotlib import rc_file_defaults

from scipy.stats import zscore

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import ConfusionMatrixDisplay
In [6]:
# Load the logicle-transformed FCS event table (one row per cell/event).
# NOTE(review): pickle.load can execute arbitrary code — only open trusted files.
with open("fcs_events_logicle_20220930.pkl", 'rb') as fh:
    fcs_events = pickle.load(fh)
    
In [7]:
# Marker (feature) columns = every column except the metadata columns.
markers = [c for c in fcs_events.columns if c not in ['sample', 'peptide', 'source']]
markers
Out[7]:
['CD16',
 'KLRG1',
 'CD8',
 'CD28',
 'CD27',
 'CD244',
 'CD161',
 'CD127',
 'CCR5',
 'CCR7',
 'CD95',
 'TIGIT',
 'CXCR3',
 'PD1',
 'CD57',
 'CD103',
 'Perforin',
 'CD3',
 'ICOS',
 'HLADR',
 'CD45RO',
 'GranzymeB',
 'CD56',
 'CD45RA',
 'CD101',
 'CD38',
 'CX3CR1',
 'CD71',
 'CD39']

Note that the CD3 and CD8 markers are included in the feature list.

In [8]:
# One sample corresponds to one donor measurement.
print("Number of samples", len(fcs_events['sample'].unique()))
Number of samples 162
In [9]:
# Per-marker boxplots BEFORE normalization, one panel per marker,
# arranged in a num_rows x num_cols grid.
num_cols = 3
num_rows = ceil(len(markers)/num_cols)

fig, axes = plt.subplots(
    num_rows, num_cols, figsize=(8*num_cols, 6*num_rows))

for ax, ch in zip(axes.ravel(), markers):
    sb.boxplot(x=fcs_events[ch], ax=ax)#, saturation=0.5, width=0.4, boxprops={'zorder': 2}, palette="Set2")
plt.show()
In [10]:
# Standardize every marker to zero mean / unit variance.
# NOTE(review): this mutates `fcs_events` in place — re-running this cell
# without reloading the data would z-score the values twice.
for ch in markers:
    fcs_events[ch] = zscore(fcs_events[ch].values)
    
In [11]:
# Same per-marker boxplots as above, now AFTER z-scoring (sanity check that
# all markers are on a comparable scale).
num_cols = 3
num_rows = ceil(len(markers)/num_cols)

fig, axes = plt.subplots(
    num_rows, num_cols, figsize=(8*num_cols, 6*num_rows))

for ax, ch in zip(axes.ravel(), markers):
    sb.boxplot(x=fcs_events[ch], ax=ax)#, saturation=0.5, width=0.4, boxprops={'zorder': 2}, palette="Set2")
plt.show()

Sampling¶

In [12]:
def sample_events(fcs_events, cells_per_source=1000):
    """Subsample `fcs_events` to at most `cells_per_source` cells per source.

    For each source, the per-source budget is split evenly across the
    samples (donors) that actually have events for that source; a sample
    contributes at most the number of events it has.

    Parameters
    ----------
    fcs_events : pd.DataFrame
        Event table with at least 'sample' and 'source' columns.
    cells_per_source : int, default 1000
        Upper bound on the number of cells drawn per source.

    Returns
    -------
    pd.DataFrame
        Concatenation of the sampled events (original index preserved).
        NOTE: no random seed is set, so the draw differs between calls.
    """
    sampled_parts = []
    for source in fcs_events['source'].unique():
        # sample only from donors that actually have hits for this source
        source_mask = fcs_events['source'] == source
        counts = fcs_events['sample'][source_mask].value_counts()  # contains only non-zero counts
        n = int(cells_per_source / len(counts))
        for sample in counts.index:
            mask = (fcs_events['sample'] == sample) & source_mask
            # draw at most n events; fewer if this sample has fewer hits
            sampled_parts.append(fcs_events[mask].sample(min(sum(mask), n)))
    # collect parts and concatenate once: concatenating inside the loop is
    # quadratic in the number of (sample, source) groups
    return pd.concat(sampled_parts) if sampled_parts else pd.DataFrame()

Regression¶

In [20]:
# Hyperparameters for the per-event logistic regression.
solver = "liblinear"
penalty = "l1"             # L1 keeps the coefficient matrix sparse/interpretable
cv = 6                     # inner CV folds used by LogisticRegressionCV
class_weight = "balanced"
outerfolds = 100           # outer resampling rounds

# BUG FIX: compute the non-marker (metadata) columns from `fcs_events`,
# which is defined at this point. The previous version referenced
# `fcs_events_sampled` before it was first assigned (inside the loop below),
# so this cell raised NameError on a fresh kernel (Restart & Run All).
non_markers = [c for c in fcs_events.columns if c not in markers]
best_f1 = -1
for i in range(outerfolds):
    print(f"Starting iteration #{i}")

    # fresh subsample each round so every model sees a different event set
    fcs_events_sampled = sample_events(fcs_events)
    y_ext = fcs_events_sampled[non_markers]   # label ('source') plus metadata
    X = fcs_events_sampled[markers]
    # NOTE(review): no random_state, so splits differ between notebook runs
    X_train, X_test, y_train_ext, y_test_ext = train_test_split(X, y_ext)# also shuffles

    y_train = y_train_ext['source']
    y_test = y_test_ext['source']
    lg = LogisticRegressionCV(n_jobs=2, cv=cv,
        solver=solver, penalty=penalty, class_weight=class_weight)
    lg.fit(X_train, y_train)
    y_pred = lg.predict(X_test)

    f1 = f1_score(y_test, y_pred, average="weighted")
    precision = precision_score(y_test, y_pred, average="weighted")
    recall = recall_score(y_test, y_pred, average="weighted")

    # persist model, scores and the exact split so every round can be
    # re-examined later without retraining
    with open(f"targetscape-model-{VERSION}_results_{i}.pkl", 'wb') as fh:
        pickle.dump(lg, fh)
        pickle.dump([f1, precision, recall], fh)
        pickle.dump([X_train, X_test, y_train_ext, y_test_ext], fh)

    if f1 > best_f1:
        best_f1 = f1
        print(f"New best F1={f1:.3f} in iteration #{i}")
    
Starting iteration #0
New best F1=0.702 in iteration #0
Starting iteration #1
Starting iteration #2
Starting iteration #3
New best F1=0.702 in iteration #3
Starting iteration #4
Starting iteration #5
Starting iteration #6
Starting iteration #7
Starting iteration #8
Starting iteration #9
Starting iteration #10
Starting iteration #11
Starting iteration #12
Starting iteration #13
New best F1=0.723 in iteration #13
Starting iteration #14
Starting iteration #15
Starting iteration #16
Starting iteration #17
Starting iteration #18
Starting iteration #19
Starting iteration #20
Starting iteration #21
Starting iteration #22
Starting iteration #23
Starting iteration #24
Starting iteration #25
Starting iteration #26
Starting iteration #27
Starting iteration #28
Starting iteration #29
Starting iteration #30
Starting iteration #31
Starting iteration #32
Starting iteration #33
Starting iteration #34
Starting iteration #35
Starting iteration #36
Starting iteration #37
Starting iteration #38
Starting iteration #39
Starting iteration #40
Starting iteration #41
Starting iteration #42
Starting iteration #43
Starting iteration #44
Starting iteration #45
Starting iteration #46
Starting iteration #47
Starting iteration #48
Starting iteration #49
Starting iteration #50
Starting iteration #51
Starting iteration #52
Starting iteration #53
Starting iteration #54
Starting iteration #55
Starting iteration #56
Starting iteration #57
Starting iteration #58
Starting iteration #59
Starting iteration #60
Starting iteration #61
Starting iteration #62
Starting iteration #63
Starting iteration #64
Starting iteration #65
Starting iteration #66
Starting iteration #67
Starting iteration #68
Starting iteration #69
Starting iteration #70
Starting iteration #71
Starting iteration #72
Starting iteration #73
Starting iteration #74
Starting iteration #75
Starting iteration #76
Starting iteration #77
Starting iteration #78
Starting iteration #79
Starting iteration #80
Starting iteration #81
Starting iteration #82
Starting iteration #83
Starting iteration #84
Starting iteration #85
Starting iteration #86
Starting iteration #87
Starting iteration #88
Starting iteration #89
Starting iteration #90
Starting iteration #91
Starting iteration #92
Starting iteration #93
Starting iteration #94
Starting iteration #95
Starting iteration #96
Starting iteration #97
Starting iteration #98
Starting iteration #99

Load best model and confirm results¶

In [27]:
# Reload the best round's model, its scores, and its exact train/test split.
best_round = 13  # chosen from the training log above (New best F1=0.723 in iteration #13)
with open(f"targetscape-model-{VERSION}_results_{best_round}.pkl", 'rb') as fh:
    best_lg = pickle.load(fh)
    best_f1, best_precision, best_recall = pickle.load(fh)
    best_X_train, best_X_test, best_y_train_ext, best_y_test_ext = pickle.load(fh)
In [28]:
best_f1, best_precision, best_recall
Out[28]:
(0.7228558519536106, 0.7249593965409483, 0.7262931034482759)
In [29]:
# Confirm the stored F1 by re-predicting with the reloaded model on its own
# held-out test set (should match the pickled score exactly).
best_y_pred = best_lg.predict(best_X_test)
f1 = f1_score(best_y_test_ext['source'], best_y_pred, average="weighted")
f1
Out[29]:
0.7228558519536106
In [30]:
# Evaluate the single best model on EVERY round's held-out test set, to get a
# distribution of scores rather than just the score on its own split.
all_f1 = []
all_precision = []
all_recall = []

for i in range(outerfolds):
    with open(f"targetscape-model-{VERSION}_results_{i}.pkl", 'rb') as fh:
        _lg = pickle.load(fh)                       # per-round model (unused here)
        _f1, _precision, _recall = pickle.load(fh)  # per-round scores (unused here)
        # Only the test split is needed. FIX: prefix the unused train labels
        # with '_' like the other unused loads — the old name `y_train_ext`
        # silently clobbered the training cell's global of the same name.
        _X_train, X_test, _y_train_ext, y_test_ext = pickle.load(fh)

    # predict with best model on this test set and record scores
    y_pred = best_lg.predict(X_test)
    all_f1.append(
        f1_score(y_test_ext['source'], y_pred, average="weighted"))
    all_precision.append(
        precision_score(y_test_ext['source'], y_pred, average="weighted"))
    all_recall.append(
        recall_score(y_test_ext['source'], y_pred, average="weighted"))
    
In [31]:
# Long-format score table: one row per (outer fold, metric), so seaborn can
# plot the three metrics side by side.
score_frames = []
for scores, label in ((all_f1, 'F1'),
                      (all_precision, 'Precision'),
                      (all_recall, 'Recall')):
    frame = pd.DataFrame(scores, columns=["Score"])
    frame['Category'] = label
    score_frames.append(frame)
cdf = pd.concat(score_frames)
In [32]:
# Persist the score table for later aggregation/plotting.
with open(f"targetscape-model-{VERSION}_scores.pkl", 'wb') as fh:
    pickle.dump(cdf, fh)
In [33]:
# CSV copy of the scores for use outside Python.
cdf.to_csv(f"targetscape-model-{VERSION}_scores.csv")
In [34]:
# https://stackoverflow.com/questions/69458775/seaborn-boxplot-and-stripplot-points-arent-aligned-over-the-x-axis-by-hue
# Strip + box plot of the best model's score distribution across all 100
# outer-fold test sets (one point per fold, per metric).
plt.figure(figsize=(8,10))
#sb.set_context("notebook", font_scale=3)

#sb.set(style="darkgrid")
sb.set_theme()
#sb.set_context("talk")#paper")
sb.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})
#sb.despine(left=True)

# individual fold scores as jittered points, boxplot overlaid on the same axes
ax = sb.stripplot(x="Category", y="Score", hue="Category",
                  data=cdf, alpha=0.3, linewidth=1, jitter=True, dodge=True,)
sb.boxplot(x="Category", y="Score", hue="Category",
           data=cdf, fliersize=0, ax=ax)

# Get the handles and labels
handles, labels = ax.get_legend_handles_labels()
# When creating the legend, only use the first three elements (one per metric;
# strip and box plots otherwise produce duplicate legend entries)
l = plt.legend(handles[0:3], labels[0:3], bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

plt.ylim(0.625, 0.75) 
plt.show()

Single prediction with class and sample¶

In [35]:
For reference, see the previous incarnations, which are all broken (precision is always one).
In [44]:
# Collect (label, prediction, sample) triples for the best model across all
# outer-fold test sets, so results can later be grouped per sample/class.
labels = []
predictions = []
samples = []

for i in range(outerfolds):
    with open(f"targetscape-model-{VERSION}_results_{i}.pkl", 'rb') as fh:
        # per-round model and scores are loaded but not used here
        _lg = pickle.load(fh)
        _f1, _precision, _recall = pickle.load(fh)
        _X_train, X_test, _y_train_ext, y_test_ext = pickle.load(fh)
        y_pred = best_lg.predict(X_test)
        labels.extend(y_test_ext['source'])
        samples.extend(y_test_ext['sample'])
        predictions.extend(y_pred)
        #print(f1_score(y_test, y_pred, average="weighted"))
In [46]:
len(labels), len(predictions), len(samples)
Out[46]:
(92800, 92800, 92800)
In [47]:
f1_score(labels, predictions, average="weighted")
Out[47]:
0.6897079284414659
In [48]:
# Assemble one long-format table of per-event predictions vs. truth,
# keyed by the donor sample the event came from.
df = pd.DataFrame(
    {'Prediction': predictions, 'Label': labels, 'Sample': samples})
df
Out[48]:
Prediction Label Sample
0 CMV CMV SDBB096
1 Flu Flu SDBB048
2 CD8 CD8 DLS020
3 EBV CMV SDBB128
4 CD8 CD8 SDBB109
... ... ... ...
92795 CD8 CMV SDBB093
92796 EBV EBV DLS046
92797 Flu Flu SDBB089
92798 CD8 CD8 SDBB021
92799 EBV EBV SDBB018

92800 rows × 3 columns

In [49]:
# Persist per-event predictions for downstream per-sample analysis.
df.to_csv(f"targetscape-model-{VERSION}_predictions.csv")

Feature importance¶

In [51]:
# Coefficient matrix of the best model: one row per class, one column per
# marker. FIX: name the columns from `markers` instead of the loop-local `X`
# left over from the training cell — `X.columns` equals `markers` (the
# training cell sets X = fcs_events_sampled[markers]) but only exists by
# kernel-state accident and breaks on a fresh run of this section.
coef_df = pd.DataFrame(best_lg.coef_)
coef_df = coef_df.rename(dict(enumerate(markers)), axis='columns')
coef_df['source'] = best_lg.classes_
coef_df = coef_df.set_index("source")
coef_df
Out[51]:
CD16 KLRG1 CD8 CD28 CD27 CD244 CD161 CD127 CCR5 CCR7 ... HLADR CD45RO GranzymeB CD56 CD45RA CD101 CD38 CX3CR1 CD71 CD39
source
CD8 -0.122736 0.122313 -0.888155 0.104413 0.058758 -0.045696 0.326850 -0.518837 -0.398624 0.000000 ... 0.000000 -0.384487 -0.247468 0.169034 0.212548 -0.124061 -0.016366 -0.343833 -0.030653 0.051271
CMV -0.068701 0.224346 0.676842 -0.053722 -0.364873 0.372978 -0.025279 0.000000 -0.033722 -0.102487 ... 0.052049 -0.073462 0.461349 -0.253484 0.149088 0.000000 -0.022462 0.432681 0.026664 -0.080587
EBV 0.027371 -0.037237 -0.445967 0.132548 0.407655 0.103396 -0.541379 -0.143046 -0.050989 0.532898 ... 0.198746 1.658728 0.016827 0.251512 -0.451361 -0.209694 0.132480 -0.134069 0.000000 -0.001118
Flu 0.394121 -0.436465 0.553383 -0.259573 0.071487 -0.493775 0.047624 1.026198 0.524661 -0.442971 ... -0.370499 -0.649010 -0.177127 0.033219 -0.171614 0.351369 -0.097041 -0.163171 0.033346 0.058603

4 rows × 29 columns

In [52]:
# Save the best model's coefficient matrix (per-class feature importance).
coef_df.to_csv(f"targetscape-model-{VERSION}_best_model_feature_importance.csv")
In [53]:
# For each source, print the five markers with the SMALLEST absolute
# coefficient — i.e. the least informative (often L1-zeroed) features.
for source in set(fcs_events['source']):
    f = coef_df.T[source].sort_values()
    print(source, f.abs().sort_values()[:5])
    print()
CMV ICOS     0.000000
CD101    0.000000
CD127    0.000000
CD38     0.022462
CD161    0.025279
Name: CMV, dtype: float64

CD8 CCR7     0.000000
HLADR    0.000000
CD38     0.016366
ICOS     0.027913
CD71     0.030653
Name: CD8, dtype: float64

Flu CD57     0.007231
CD56     0.033219
CD71     0.033346
CD161    0.047624
CD39     0.058603
Name: Flu, dtype: float64

EBV CD71     0.000000
CD95     0.000000
CD103    0.000000
CD39     0.001118
CD57     0.009855
Name: EBV, dtype: float64

In [54]:
# Restore the seaborn theme for the coefficient bar charts below.
sb.set_theme()
#plt.rcParams["figure.figsize"] = (5, 7)

def bar_color(df, color1, color2):
    """Per-element bar colors: `color1` where values are positive, else `color2`."""
    is_positive = df.values > 0
    return np.where(is_positive, color1, color2).T

# One horizontal bar chart per source: green bars = positive coefficient
# (marker up-weights this class), red = negative; markers sorted by value.
sources = set(fcs_events['source'])
num_cols = 1
num_rows = ceil(len(sources)/num_cols)

fig, axes = plt.subplots(
    num_rows, num_cols, figsize=(12*num_cols, 9*num_rows))
for ax, source in zip(axes.ravel(), sources):
    f = coef_df.T[source].sort_values()
    ax.barh(range(len(f)), f.values,  color=bar_color(f, 'g', 'r'))#, width=0.8, bottom=None, *, align='center', data=None, **kwargs)
    ax.set_yticks(range(len(f)), f.index)#, rotation='vertical')
    #plt.xticks(x, labels, rotation='vertical')
    #set parameters for tick labels
    ax.tick_params(axis='y', which='major', labelsize=10)
    ax.set_title(f"{source}", fontsize=18)

plt.tight_layout()
#plt.rcParams["figure.figsize"] = plt.rcParamsDefault["figure.figsize"]
#plt.savefig("paper-2022-04-22_best_model_feature_importance.png")
#plt.savefig("paper-2022-04-22_best_model_feature_importance.pdf")

Prob estimates¶

In [55]:
# Distribution of the model's confidence (maximum class probability) across
# ALL events in the table.
proba = best_lg.predict_proba(fcs_events[markers])
_ = plt.hist([max(x) for x in proba], range=(0, 1), bins=50)
plt.title("Probability estimates (max)")
Out[55]:
Text(0.5, 1.0, 'Probability estimates (max)')
In [56]:
def plot_wrapper(y_test, y_pred, solver, penalty, f1, class_weight, cv=None):
    """Show row-normalized and raw confusion matrices side by side.

    The suptitle records the model hyperparameters and F1 so the figure
    stands alone when skimmed.
    """
    fig, (ax_norm, ax_raw) = plt.subplots(1, 2, figsize=(8*2, 6*1))
    # left panel: normalized over true labels; right panel: raw counts
    for ax, normalize in ((ax_norm, 'true'), (ax_raw, None)):
        ConfusionMatrixDisplay.from_predictions(
            y_test, y_pred, ax=ax, normalize=normalize)
    fig.suptitle(f'LogisticRegression {solver} {penalty} F1={f1:.3f} class_weight={class_weight} cv={cv}', fontsize=16)

    
In [57]:
# F1 of the best model over the FULL (z-scored) event table.
# NOTE(review): this includes events the model was trained on, so it is an
# optimistic estimate compared to the held-out scores above.
y_pred = best_lg.predict(fcs_events[markers])
f1 = f1_score(fcs_events['source'], y_pred, average="weighted")
f1
Out[57]:
0.7000523619813446
In [58]:
# Reset matplotlib rc params so the confusion matrices use default styling
# (undoes the seaborn theme set earlier).
rc_file_defaults()

plot_wrapper(fcs_events['source'], y_pred, solver, penalty, f1, class_weight, cv=cv)