In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data
import gzip
import h5py
import numpy as np
import argparse
import os
import torch.optim as optim
from model2 import MolecularICVAE
from utils import decode_smiles_from_indexes, load_dataset

###  load data and create conditional input from labels



(1) **y_train_ic (batch_size, 1,33)** is the condition input which is concatenated with the SMILES one-hot encoding vector **X_train (batch_size, 120,33)** as the input **(batch_size, 121,33)** of ICVAE.

(2) **y_train_l (batch_size, 128)** is the condition input which is used to constrain the latent vector **(batch_size, 128)** to the molecular properties.

Note: only the first two dimensions of the latent vector are used as conditions; all remaining entries of y_train_l are set to zero.

In [2]:
# Width of the latent-side condition vector built below.
lat_dim = 128

# Train/test splits plus the character set returned by the project loader.
X_train, X_test, charset = load_dataset('./data/processed.h5')

# Property labels for each split (file names suggest they are normalized).
y_train = np.load("./prop_np/weight/y_train_norm.npy")
y_test = np.load("./prop_np/weight/y_test_norm.npy")

# Input-side condition: every label is tiled 33 times along the last axis.
y_train_ic = np.repeat(y_train[:, None], repeats=33, axis=-1)
y_test_ic = np.repeat(y_test[:, None], repeats=33, axis=-1)

# Latent-side condition: every label is tiled lat_dim times along axis 1 ...
y_train_l = np.repeat(y_train[:, None], repeats=lat_dim, axis=1)
y_test_l = np.repeat(y_test[:, None], repeats=lat_dim, axis=1)

# ... and only the first two positions keep the label; the rest are zeroed.
y_train_l[:, 2:] = 0.0
y_test_l[:, 2:] = 0.0

In [3]:
# Wrap every NumPy array as a float32 tensor.
torch_X_train = torch.from_numpy(X_train).float()
torch_X_test = torch.from_numpy(X_test).float()

# Input-side condition tensors.
torch_ic_train = torch.from_numpy(y_train_ic).float()
torch_ic_test = torch.from_numpy(y_test_ic).float()

# Latent-side condition tensors (used as the KL target).
torch_l_train = torch.from_numpy(y_train_l).float()
torch_l_test = torch.from_numpy(y_test_l).float()

# Raw per-sample labels, passed to the model as its third argument.
torch_lc_train = torch.from_numpy(y_train).float()
torch_lc_test = torch.from_numpy(y_test).float()

# NOTE: `train` and `test` are rebound to functions in later cells, so the
# data loaders must be created from these datasets before those cells run.
train = torch.utils.data.TensorDataset(
    torch_X_train, torch_ic_train, torch_l_train, torch_lc_train
)
test = torch.utils.data.TensorDataset(
    torch_X_test, torch_ic_test, torch_l_test, torch_lc_test
)

In [4]:
# Mini-batch iterators over the two datasets; both reshuffle on every pass.
train_loader = torch.utils.data.DataLoader(
    dataset=train,
    batch_size=250,
    shuffle=True,
)
test_loader = torch.utils.data.DataLoader(
    dataset=test,
    batch_size=250,
    shuffle=True,
)

### correlate the latent vector with molecular properties

The only difference between our model and other VAE-based models is that the KL term uses **(z_mean - y_arg)** to force the mean of the latent vector toward the molecular property value, whereas other VAE-based models use **z_mean** alone.

**note: y_arg is the y_train_l**.

In [5]:
def vae_loss(x_decoded_mean, x, z_mean, z_logvar, y_arg):
    """Compute the conditional VAE loss for one batch.

    Args:
        x_decoded_mean: Decoder output with values in [0, 1], same shape as ``x``.
        x: Reconstruction target, same shape as ``x_decoded_mean``.
        z_mean: Mean of the approximate posterior.
        z_logvar: Log-variance of the approximate posterior.
        y_arg: Target for the latent mean; must broadcast against ``z_mean``.
            The KL term penalises ``z_mean - y_arg`` instead of ``z_mean``.

    Returns:
        A ``(total_loss, kl_loss)`` tuple of scalar tensors, where
        ``total_loss = 0.5 * reconstruction_loss + kl_loss`` and both terms
        are summed (not averaged) over the batch.
    """
    # `reduction='sum'` replaces the deprecated `size_average=False`;
    # the computed value is the same.
    xent_loss = F.binary_cross_entropy(x_decoded_mean, x, reduction='sum')
    kl_loss = -0.5 * torch.sum(
        1 + z_logvar - (z_mean - y_arg).pow(2) - z_logvar.exp()
    )
    return 0.5 * xent_loss + kl_loss, kl_loss

In [6]:
# Fix the RNG seed before the model is built.
torch.manual_seed(42)
epochs = 100

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

model = MolecularICVAE()
model = model.to(device)
# Adam with its default hyper-parameters.
optimizer = optim.Adam(model.parameters())

In [7]:
def train(epoch):
    """Run one optimisation pass over ``train_loader``.

    Uses the notebook-level ``model``, ``optimizer``, ``device`` and
    ``vae_loss``. Prints the per-sample non-KL part of the loss ("CL") and
    the per-sample KL term.

    Args:
        epoch: Epoch number supplied by the caller; not used in the body.

    Returns:
        A tuple ``(mean_loss, latent_arr, label_arr)``: the summed loss
        divided by the dataset size, the list of per-batch latent arrays,
        and the list of per-batch latent-condition arrays truncated to
        their first 7 columns.
    """
    model.train()
    n_samples = len(train_loader.dataset)
    total_loss = 0
    total_kl = 0
    latent_arr, label_arr = [], []

    for oh, ic, l, lc in train_loader:
        # Add a singleton axis at position 1 before feeding the model.
        oh = oh.unsqueeze(1).to(device)
        label_ic = ic.to(device)
        label_l = l.to(device)
        label_lc = lc.to(device)

        optimizer.zero_grad()
        output, mean, logvar, latent = model(oh, label_ic, label_lc)
        loss, kl_loss = vae_loss(output, oh.squeeze(1), mean, logvar, label_l)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_kl += kl_loss.item()

        # Keep NumPy copies of the latent codes and their targets.
        latent_arr.append(latent.detach().cpu().numpy())
        label_arr.append(label_l.detach().cpu().numpy()[:, :7])

    cl_per_sample = (total_loss - total_kl) / n_samples
    kl_per_sample = total_kl / n_samples
    print('train CL: ' + str(cl_per_sample) + '  train KL: ' + str(kl_per_sample))

    return total_loss / n_samples, latent_arr, label_arr

In [8]:
def test(epoch):
    """Evaluate the model on ``test_loader`` without updating it.

    Uses the notebook-level ``model``, ``device`` and ``vae_loss``. Prints
    the per-sample non-KL part of the loss ("CL") and the per-sample KL term.

    Args:
        epoch: Epoch number supplied by the caller; not used in the body.

    Returns:
        The summed loss over the test set divided by the dataset size.
    """
    model.eval()
    test_loss = 0
    KL_loss = 0
    # No gradients are needed for evaluation: skipping autograd bookkeeping
    # saves memory and time and leaves the loss values unchanged.
    with torch.no_grad():
        for oh, ic, l, lc in test_loader:
            oh = oh.unsqueeze(1).to(device)
            label_ic = ic.to(device)
            label_l = l.to(device)
            label_lc = lc.to(device)
            output, mean, logvar, latent = model(oh, label_ic, label_lc)
            loss, kl_loss = vae_loss(output, oh.squeeze(1), mean, logvar, label_l)
            KL_loss += kl_loss.item()
            test_loss += loss.item()
    print('test CL: '+str((test_loss-KL_loss) / len(test_loader.dataset)) + '  test KL: ' + str(KL_loss / len(test_loader.dataset)))

    return test_loss / len(test_loader.dataset)

In [9]:
# Run the full schedule: one training pass followed by one evaluation pass
# per epoch. The result variables are overwritten on every iteration, so
# after the loop `latent_arr` / `label_arr` hold the last epoch's batches.
for epoch in range(1, epochs + 1):
    train_loss,latent_arr,label_arr = train(epoch)
    test_loss = test(epoch)

RuntimeError: shape '[250, 1, 2, 34]' is invalid for input of size 8250

### save latent vector and model

Save the latent vectors for drawing the latent-space image,

and save the model for sampling molecules.

In [None]:
# Create the output directories up front so a missing folder cannot throw
# away the results of a finished training run.
os.makedirs("./result/latent", exist_ok=True)
os.makedirs("./result/model", exist_ok=True)

# Join the per-batch arrays along the sample axis and keep the first two
# columns. Unlike np.array(list_of_batches), concatenate also works when the
# final batch is smaller than the others; for equally sized batches the
# result is the same (n_samples, 2) array.
latent_np = np.concatenate(latent_arr, axis=0)[:, :2]
label_np = np.concatenate(label_arr, axis=0)[:, :2]
np.save("./result/latent/MW_latent3.npy", latent_np)
np.save("./result/latent/MW_label3.npy", label_np)

# Persist the trained weights so the model can be reloaded later.
torch.save(model.state_dict(), "./result/model/MW_model3.pth")