In [None]:
# Constants

# Input data sets (copied from Google Drive into the working directory).
TRAINING_SET_PATH = './training_set.json'
TEST_SET_PATH = './test_set.json'
DATA_PATH = './data.json'

# SciBERT artefacts: sparse representation files, pickled vectorizer, model directory.
SCIBERT_TRAINING_REPRESENTATION = './scibert_training_representations.npz'
SCIBERT_TEST_REPRESENTATION = './scibert_test_representations.npz'
VECTORIZER_PATH = './scibertvectorizer.pkl'
BERT_PATH = './scibert_scivocab_uncased'
# Passed to the tokenizer as max_length (texts are padded/truncated to this length).
MAX_SEQUENCE_LENGTH = 512

# Clustering configuration.
CLUSTERING_METHOD = 'agglomerative' # choices ['kmeans', 'agglomerative']
# Cluster counts to train and evaluate: 200, 250, ..., 2050.
N_CLUSTERS = range(200, 2100, 50)

# Outputs. MODEL_TEMPLATE_PATH keeps one '{}' placeholder that is later filled
# with the number of clusters via MODEL_TEMPLATE_PATH.format(n).
RESULTS_PATH = './scibert_{}_results.json'.format(CLUSTERING_METHOD)
MODEL_TEMPLATE_PATH = 'scibert_{}_{}.pkl'.format(CLUSTERING_METHOD, '{}')
# Google Cloud Storage bucket the trained models are uploaded to (gs://BUCKET).
BUCKET = 'YOUR_BUCKET_ID'

In [None]:
import json
import pickle

def read_json(input_path):
    """Parse the UTF-8 encoded JSON file at ``input_path`` and return its content."""
    with open(input_path, encoding='utf-8') as json_file:
        return json.load(json_file)

def read_pickle(input_path):
    """Unpickle and return the object stored in the file at ``input_path``.

    NOTE: unpickling can execute arbitrary code, so only load files you trust.
    """
    with open(input_path, 'rb') as pickle_file:
        return pickle.load(pickle_file)

def write_json(json_data, output_path):
    """Serialize ``json_data`` as JSON (4-space indentation) into ``output_path``.

    The file is opened explicitly as UTF-8 so that it does not depend on the
    platform's default encoding and matches the encoding ``read_json`` uses.
    """
    with open(output_path, 'w', encoding='utf-8') as json_file:
        json.dump(json_data, json_file, indent=4)

def write_pickle(data, output_path):
    """Pickle ``data`` into the file at ``output_path`` (overwriting it)."""
    with open(output_path, 'wb') as pickle_file:
        pickle.dump(data, pickle_file)

In [None]:
## Mount Drive into Colab
from google.colab import drive
drive.mount('/content/drive')

# Copy the input files from Google Drive into the working directory.
# IPython replaces $NAME with the value of the Python variable NAME before the
# shell command runs, so the destinations are the paths from the constants cell.
# Replace '/path_to_your_directory_on_google_drive' with your own Drive folder.
!cp '/path_to_your_directory_on_google_drive/training_set.json' $TRAINING_SET_PATH
!cp '/path_to_your_directory_on_google_drive/test_set.json' $TEST_SET_PATH
!cp '/path_to_your_directory_on_google_drive/data.json' $DATA_PATH
# For the representation files the variable is also appended to the Drive folder,
# i.e. the source has the same file name as the local destination.
!cp '/path_to_your_directory_on_google_drive/'$SCIBERT_TRAINING_REPRESENTATION $SCIBERT_TRAINING_REPRESENTATION
!cp '/path_to_your_directory_on_google_drive/'$SCIBERT_TEST_REPRESENTATION $SCIBERT_TEST_REPRESENTATION

# Training

In [None]:
# Import and process the training data
import pandas as pd
from scipy import sparse

# Training instances as a flat DataFrame, plus their SciBERT representations
# loaded from the sparse .npz file.
train_json = read_json(TRAINING_SET_PATH)
train_df = pd.json_normalize(train_json['instances'])
train_df_transformed = sparse.load_npz(SCIBERT_TRAINING_REPRESENTATION)

# Agglomerative clustering is built on the complete dataset (training + test),
# since "predicting" with a hierarchical clustering would require re-building the clusters.
# The test rows are stacked AFTER the training rows, so the row of test instance i
# is train_df.shape[0] + i. Note that train_df_transformed then also holds test rows.
if CLUSTERING_METHOD == 'agglomerative':
  test_df_transformed = sparse.load_npz(SCIBERT_TEST_REPRESENTATION)
  train_df_transformed = sparse.vstack((train_df_transformed, test_df_transformed))

# Displayed as the cell output: (number of rows, representation size).
train_df_transformed.shape

In [None]:
from sklearn.cluster import KMeans 
from sklearn.cluster import AgglomerativeClustering
from time import time
from google.colab import auth
auth.authenticate_user()

for n in N_CLUSTERS:
    t0 = time()
    MODEL_PATH = MODEL_TEMPLATE_PATH.format(n)
    print(MODEL_PATH)

    if CLUSTERING_METHOD == 'kmeans':
      clustering_model = KMeans(n_clusters=n, random_state=212)
      clustering_model = clustering_model.fit(train_df_transformed)
    elif CLUSTERING_METHOD == 'agglomerative':
      clustering_model = AgglomerativeClustering(n_clusters=n, linkage='ward')
      clustering_model = clustering_model.fit(train_df_transformed.toarray())
    
    print('{0:2f}'.format(time() - t0))
    write_pickle(clustering_model, MODEL_PATH)
    
    # Upload model to bucket
    !gsutil cp {MODEL_PATH} gs://{BUCKET}

# Evaluation

## Vectorizer

In [None]:
# Install the Hugging Face transformers package (no version is pinned here).
!pip install transformers
# Download the SciBERT (scivocab, uncased) archive and unpack it while streaming.
# NOTE(review): the --transform expression renames paths starting with
# 'dbt2-0.37.50.3', which looks unrelated to this archive and is presumably a
# no-op here - confirm and consider dropping it.
!wget -qO- https://s3-us-west-2.amazonaws.com/ai2-s2-research/scibert/pytorch_models/scibert_scivocab_uncased.tar | tar --transform 's/^dbt2-0.37.50.3/dbt2/' -xv
# Unpack the model weights next to the other model files.
!tar -xzf ./scibert_scivocab_uncased/weights.tar.gz -C ./scibert_scivocab_uncased/
# Rename the config file to config.json
# (presumably the file name transformers' from_pretrained looks for - confirm).
!mv ./scibert_scivocab_uncased/bert_config.json ./scibert_scivocab_uncased/config.json

In [None]:
import torch
from transformers import BertModel, BertTokenizer
from torch.utils.data import Dataset

class ClusteringDataset(Dataset):
    """Torch dataset that serves the ``text`` column of a DataFrame row by row."""

    def __init__(self, df):
        # DataFrame holding (at least) a 'text' column.
        self.data = df

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return the text stored at positional row ``idx``."""
        text_column = self.data['text']
        return text_column.iloc[idx]

class SciBERT:
    """Factory helpers for loading the SciBERT model and its tokenizer."""

    # Identifier used when no explicit path is given.
    DEFAULT_PATH = 'allenai/scibert_scivocab_uncased'

    @staticmethod
    def model(path=DEFAULT_PATH):
        """Return ``BertModel.from_pretrained(path)``."""
        return BertModel.from_pretrained(path)

    @staticmethod
    def tokenizer(path=DEFAULT_PATH):
        """Return ``BertTokenizer.from_pretrained(path)``."""
        return BertTokenizer.from_pretrained(path)

class SciBERTVectorizer:
  """Turns a text into a fixed-size vector by mean-pooling BERT hidden states.

  The interface mimics a fit/transform vectorizer: ``fit`` only stores the
  tokenizer and the model, ``transform`` does the actual embedding.
  """

  def __init__(self):
    pass

  def fit(self, tokenizer, model):
    """Store ``tokenizer`` and ``model`` on the instance and return ``self``.

    No training takes place; both objects are used as they are.
    """
    self.model = model
    self.tokenizer = tokenizer

    return self

  def transform(self, text_batch):
    """Embed a single text and return its mean-pooled last hidden state.

    Parameters:
      text_batch: list containing ONE text. ``squeeze(0)`` only removes the
        batch dimension when it has size 1; for larger batches ``mean(0)``
        would average across the texts instead of across the tokens.

    Returns:
      NumPy array with the mean of the last hidden state over all
      MAX_SEQUENCE_LENGTH token positions.

    NOTE(review): no attention mask is passed to the model and the mean runs
    over every position, so padding positions contribute to the vector. This is
    kept as is because changing the pooling would change the produced vectors -
    presumably the stored training representations were built the same way;
    confirm before changing it.
    """
    text_tokenized = self.tokenizer(text=text_batch, padding='max_length', max_length=MAX_SEQUENCE_LENGTH, truncation=True, return_tensors='pt')

    # Pure inference: disable autograd so no computation graph is built and kept
    # in memory for every transformed text. The returned values are unchanged.
    with torch.no_grad():
      outputs = self.model(text_tokenized['input_ids'])

    return outputs['last_hidden_state'].squeeze(0).mean(0).detach().numpy()

In [None]:
# Load tokenizer and model from the local directory BERT_PATH
# (instead of the default identifier defined on the SciBERT class).
tokenizer = SciBERT.tokenizer(BERT_PATH)
model = SciBERT.model(BERT_PATH)

# fit() only stores tokenizer and model on the vectorizer, so the pickle written
# below contains both objects and can be reloaded later with read_pickle.
vectorizer = SciBERTVectorizer().fit(tokenizer, model)
write_pickle(vectorizer, VECTORIZER_PATH)

## Evaluation Functions

In [None]:
import numpy as np

def predict_comparisons(clustering_model, test_element, test_element_index, train_df):
    """Return the comparison ids of the training instances sharing the test instance's cluster.

    Parameters:
        clustering_model: fitted clustering model exposing ``labels_`` (and
            ``predict`` for k-means).
        test_element: representation of the test instance; only used by the
            k-means branch, where it is passed to ``clustering_model.predict``.
        test_element_index: position of the test instance in the test set; only
            used by the agglomerative branch, where the test rows were clustered
            together with the training rows and follow them in ``labels_``.
        train_df: training instances with a ``comparison_id`` column, in the
            same row order as the clustered training rows.

    Returns:
        Array of the unique ``comparison_id`` values found in the cluster.

    Raises:
        ValueError: if CLUSTERING_METHOD is neither 'kmeans' nor 'agglomerative'.
    """
    n_train = train_df.shape[0]

    if CLUSTERING_METHOD == 'kmeans':
        cluster_label = clustering_model.predict(test_element)
        train_labels = clustering_model.labels_
    elif CLUSTERING_METHOD == 'agglomerative':
        # No predict() here: the label of the test instance is looked up at its
        # row, which sits after the n_train training rows.
        cluster_label = clustering_model.labels_[n_train + test_element_index]
        train_labels = clustering_model.labels_[:n_train]
    else:
        # Previously this fell through to an UnboundLocalError further down.
        raise ValueError('Unsupported CLUSTERING_METHOD: {!r}'.format(CLUSTERING_METHOD))

    # Positions of the training rows that carry the same cluster label.
    cluster_instances_indices = np.argwhere(train_labels == cluster_label).squeeze(1)

    cluster_instances = train_df.iloc[cluster_instances_indices]
    comparison_ids = cluster_instances['comparison_id'].unique()
    return comparison_ids


def map_to_predicates(data, comparison_ids):
    """Collect the predicate ids of the given comparisons.

    Parameters:
        data: dict with a ``'comparisons'`` list; every comparison has an ``'id'``
            and a ``'predicates'`` list whose items have an ``'id'``.
        comparison_ids: iterable (list, NumPy array, ...) of comparison ids to look up.

    Returns:
        List of predicate ids without duplicates, in order of first appearance.
    """
    # Sets give constant-time membership tests; the previous list lookups made
    # the de-duplication quadratic in the number of predicates.
    wanted_comparisons = set(comparison_ids)
    seen_predicates = set()
    predicate_ids = []

    for comparison in data['comparisons']:
        if comparison['id'] not in wanted_comparisons:
            continue

        for predicate in comparison['predicates']:
            predicate_id = predicate['id']
            if predicate_id in seen_predicates:
                continue

            seen_predicates.add(predicate_id)
            predicate_ids.append(predicate_id)

    return predicate_ids

def evaluate_macro(expected, predicted):
    """Return [precision, recall, f_measure] for a single expected/predicted pair."""
    confusion_counts = evaluate_micro(expected, predicted)
    return compute_metrics(confusion_counts)

def evaluate_micro(expected, predicted):
    """
    Count the confusion values for one expected/predicted pair of property collections.

    tp: correctly predicted properties --> found in both the expected and the predicted set
    fp: incorrectly predicted properties --> found only in the predicted set
    fn: missed properties --> found only in the expected set

    Returns np.array([tp, fp, fn]); duplicates within a collection count once.
    """
    expected_set = set(expected)
    predicted_set = set(predicted)

    true_positives = expected_set & predicted_set
    false_positives = predicted_set - expected_set
    false_negatives = expected_set - predicted_set

    return np.array([len(true_positives), len(false_positives), len(false_negatives)])


def compute_metrics(confusion_results):
    """Derive precision, recall and F-measure from ``(tp, fp, fn)`` counts.

    Returns np.array([precision, recall, f_measure]). Zero denominators are not
    handled specially here; the evaluation loop averages the per-instance
    results with np.nanmean.
    """
    true_pos, false_pos, false_neg = confusion_results

    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    # F-measure = harmonic mean of precision and recall.
    harmonic_ratio = (precision * recall) / (precision + recall)

    return np.array([precision, recall, 2 * harmonic_ratio])

In [None]:
# Import and process the test data
import pandas as pd 

# Comparison data: map_to_predicates reads data['comparisons'] and the
# 'predicates' of each comparison from it.
data = read_json(DATA_PATH)

# Training instances; predict_comparisons uses their 'comparison_id' column.
train_json = read_json(TRAINING_SET_PATH)
train_df = pd.json_normalize(train_json['instances'])

# Test instances; the evaluation loop reads 'comparison_id' and 'text' of each row.
test_json = read_json(TEST_SET_PATH)
test_df = pd.json_normalize(test_json['instances'])

## Evaluation Loop

In [None]:
import os
from scipy import sparse
from google.colab import auth
auth.authenticate_user()

vectorizer = read_pickle(VECTORIZER_PATH)

results = {}
vectorized_texts = np.empty((0, 768), dtype=np.float32)
for i, k in enumerate(N_CLUSTERS):
    MODEL_PATH = MODEL_TEMPLATE_PATH.format(k)
    print('evaluating model: {}'.format(MODEL_PATH))

    if not os.path.exists(MODEL_PATH):
        !gsutil cp gs://$BUCKET/$MODEL_PATH $MODEL_PATH
    clustering_model = read_pickle(MODEL_PATH)

    macro_measures = np.empty((0,3), dtype=np.float32)
    micro_measures = np.zeros(3)
    for test_instance_index, test_instance in test_df.iterrows():
        expected_comparison_id, text = test_instance['comparison_id'], test_instance['text']
        expected = map_to_predicates(data, [expected_comparison_id])

        # transform the texts only once. First iteration takes ~15 minutes
        if i == 0:
            vectorized_text = vectorizer.transform([text])
            vectorized_texts = np.vstack((vectorized_texts, vectorized_text))
        else:
          vectorized_text = vectorized_texts[test_instance_index]

        predicted_comparison_ids = predict_comparisons(clustering_model, vectorized_text.reshape(1, -1), test_instance_index, train_df)
        predicted = map_to_predicates(data, predicted_comparison_ids)
        macro_measures = np.vstack((macro_measures, evaluate_macro(expected, predicted)))
        micro_measures += evaluate_micro(expected, predicted)

    macro_measures = np.nanmean(macro_measures, axis=0)
    micro_measures = compute_metrics(micro_measures)
    results[str(k)] = {
        'k': k,
        'macro': {
            'precision': macro_measures[0],
            'recall': macro_measures[1],
            'f_measure': macro_measures[2]
        },
        'micro': {
            'precision': micro_measures[0],
            'recall': micro_measures[1],
            'f_measure': micro_measures[2]
        }
    }
    write_json(results, RESULTS_PATH)
    !cp $RESULTS_PATH '/path_to_your_directory_on_google_drive/'$RESULTS_PATH

In [None]:
# Copy the stored results back from Google Drive
# ($RESULTS_PATH is expanded by IPython to the value of the Python variable).
!cp '/path_to_your_directory_on_google_drive/'$RESULTS_PATH $RESULTS_PATH

# One entry per evaluated cluster count; json_normalize flattens the nested
# 'macro'/'micro' dictionaries into columns.
results = read_json(RESULTS_PATH)
results_df = pd.json_normalize(results.values())
# Displayed as the cell output.
results_df