In [None]:
# Constants
# Clustering algorithm used throughout the notebook; choices: ['kmeans', 'agglomerative']
CLUSTERING_METHOD = 'agglomerative'

# Local input files (copied from Google Drive further below)
TRAINING_SET_PATH = './training_set.json'
TEST_SET_PATH = './test_set.json'
DATA_PATH = './data.json'

# Local artefacts written by the notebook; file names embed the clustering method
VECTORIZER_PATH = './tfidfvectorizer.pkl'
RESULTS_PATH = f'./tfidf_{CLUSTERING_METHOD}_results.json'
EXPECTED_PREDICTED_PATH = f'./tfidf_{CLUSTERING_METHOD}_expected_predicted.json'

# Cloud storage bucket that receives the fitted models (placeholder value)
BUCKET = 'YOUR_BUCKET_ID'

# Template with one remaining '{}' slot that is later filled with the number of clusters
MODEL_TEMPLATE_PATH = f'tfidf_{CLUSTERING_METHOD}_{{}}.pkl'

# Candidate numbers of clusters: 200, 250, ..., 2050
N_CLUSTERS = range(200, 2100, 50)

In [None]:
"""removes punctuation, stopwords, and returns a list of the remaining words, or tokens"""
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources used by text_process: the stopword lists and the
# WordNet data backing WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# Cleaning the text
import string
import json
import pickle

def text_process(text):
    '''
    Normalise a raw text into a cleaned, space-separated string of lemmas.

    Takes in a string of text, then performs the following:
    1. Remove all punctuation characters (string.punctuation) and all digits
    2. Lower-case the remaining whitespace-separated words
    3. Remove all English stopwords
    4. Lemmatize every remaining word with WordNetLemmatizer

    Returns the cleaned words joined by single spaces (a string, not a list).
    '''
    lemmatizer = WordNetLemmatizer()
    # stopwords.words() returns a list; build a set once per call so the membership
    # test below is not a linear scan repeated for every single word.
    english_stopwords = set(stopwords.words('english'))

    kept_chars = [char for char in text if char not in string.punctuation and not char.isdigit()]
    # Lower-case BEFORE the stopword test: the stopword list is lower-case, so
    # capitalised stopwords (e.g. a sentence-initial "The") would otherwise survive.
    words = [word.lower() for word in ''.join(kept_chars).split()]
    return ' '.join(lemmatizer.lemmatize(word) for word in words if word not in english_stopwords)

def read_json(input_path):
    """Parse the UTF-8 encoded JSON file at ``input_path`` and return its content."""
    with open(input_path, encoding='utf-8') as source:
        return json.load(source)

def read_pickle(input_path):
    """Unpickle and return the object stored in the binary file at ``input_path``."""
    with open(input_path, 'rb') as source:
        return pickle.load(source)

def write_json(json_data, output_path):
    """Write ``json_data`` to ``output_path`` as JSON indented by four spaces (overwrites the file)."""
    with open(output_path, mode='w') as target:
        json.dump(json_data, target, indent=4)

def write_pickle(data, output_path):
    """Pickle ``data`` into the binary file at ``output_path`` (overwrites the file)."""
    with open(output_path, mode='wb') as target:
        pickle.dump(data, target)

In [None]:
## Mount Drive into Colab
from google.colab import drive
# Mount Google Drive under /content/drive.
drive.mount('/content/drive')

# Copy the three input files to the local paths defined in the constants cell.
# NOTE: '/path_to_your_directory_on_google_drive/' is a placeholder and must be
# replaced with the actual directory before running this cell.
!cp '/path_to_your_directory_on_google_drive/training_set.json' $TRAINING_SET_PATH
!cp '/path_to_your_directory_on_google_drive/test_set.json' $TEST_SET_PATH
!cp '/path_to_your_directory_on_google_drive/data.json' $DATA_PATH

# Training

In [None]:
# Import and process the training data
import pandas as pd

# Flatten the training instances into a frame and clean their 'text' column.
train_json = read_json(TRAINING_SET_PATH)
train_df = pd.json_normalize(train_json['instances']).assign(
    text=lambda frame: frame['text'].apply(text_process)
)

# The agglomerative variant also needs the cleaned test instances at training time.
if CLUSTERING_METHOD == 'agglomerative':
  test_json = read_json(TEST_SET_PATH)
  test_df = pd.json_normalize(test_json['instances']).assign(
      text=lambda frame: frame['text'].apply(text_process)
  )

In [None]:
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF model over unigrams and bigrams of the training texts and persist it.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
vectorizer.fit(train_df['text'])
write_pickle(vectorizer, VECTORIZER_PATH)

train_df_transformed = vectorizer.transform(train_df['text'])

# Hierarchical clustering has no separate "prediction" step: assigning new items means
# re-building the clusters, so the test rows are stacked below the training rows.
if CLUSTERING_METHOD == 'agglomerative':
  test_df_transformed = vectorizer.transform(test_df['text'])
  train_df_transformed = sparse.vstack([train_df_transformed, test_df_transformed])

train_df_transformed.shape

In [None]:
#checking for optimal number of clusters
from sklearn.cluster import KMeans 
from sklearn.cluster import AgglomerativeClustering
from time import time
from google.colab import auth
auth.authenticate_user()

for n in N_CLUSTERS:
    t0 = time()
    MODEL_PATH = MODEL_TEMPLATE_PATH.format(n)
    print(MODEL_PATH)

    if CLUSTERING_METHOD == 'kmeans':
      clustering_model = KMeans(n_clusters=n, random_state=212)
      clustering_model = clustering_model.fit(train_df_transformed)
    elif CLUSTERING_METHOD == 'agglomerative':
      clustering_model = AgglomerativeClustering(n_clusters=n, linkage='ward')
      clustering_model = clustering_model.fit(train_df_transformed.toarray())

    print('{0:2f}'.format(time() - t0))
    write_pickle(clustering_model, MODEL_PATH)

    # Upload model to bucket
    !gsutil cp {MODEL_PATH} gs://{BUCKET}

# Evaluation

In [None]:
import numpy as np

def predict_comparisons(clustering_model, test_element, test_element_index, train_df):
    """
    Return the unique comparison ids of the training instances that share a cluster
    with the given test instance.

    Parameters:
        clustering_model: fitted model exposing ``labels_`` (and ``predict`` for k-means).
        test_element: vectorised test instance; only used for 'kmeans'.
        test_element_index: position of the test instance within the test set; only used
            for 'agglomerative', where ``labels_`` is indexed as training rows followed
            by test rows.
        train_df: frame of training instances with a 'comparison_id' column.

    Raises:
        ValueError: if CLUSTERING_METHOD is neither 'kmeans' nor 'agglomerative'.
    """
    n_train = train_df.shape[0]

    if CLUSTERING_METHOD == 'kmeans':
        cluster_label = clustering_model.predict(test_element)
        train_labels = clustering_model.labels_
    elif CLUSTERING_METHOD == 'agglomerative':
        # The test instance was clustered together with the training data; look its label up.
        cluster_label = clustering_model.labels_[n_train + test_element_index]
        train_labels = clustering_model.labels_[:n_train]
    else:
        # Previously this fell through to an UnboundLocalError on cluster_instances_indices.
        raise ValueError("Unknown CLUSTERING_METHOD: {!r}; expected 'kmeans' or 'agglomerative'".format(CLUSTERING_METHOD))

    # Positions of the training instances assigned to the same cluster.
    cluster_instances_indices = np.flatnonzero(train_labels == cluster_label)
    cluster_instances = train_df.iloc[cluster_instances_indices]
    return cluster_instances['comparison_id'].unique()


def map_to_predicates(data, comparison_ids):
    """
    Collect the ids of all predicates belonging to the given comparisons.

    Parameters:
        data: dict with a 'comparisons' list; every comparison has an 'id' and a
            'predicates' list whose items have an 'id'.
        comparison_ids: iterable (list, numpy array, ...) of comparison ids to select.
            Ids are assumed to be hashable (e.g. strings or integers).

    Returns:
        list of distinct predicate ids, in order of first appearance in ``data``.
    """
    # Sets give constant-time lookups; this function runs once per test instance and
    # model, so list-based membership tests were needlessly quadratic.
    wanted_comparisons = set(comparison_ids)
    seen = set()
    predicate_ids = []

    for comparison in data['comparisons']:
        if comparison['id'] not in wanted_comparisons:
            continue

        for predicate in comparison['predicates']:
            predicate_id = predicate['id']
            if predicate_id not in seen:
                seen.add(predicate_id)
                predicate_ids.append(predicate_id)

    return predicate_ids

def evaluate_macro(expected, predicted):
    """Return [precision, recall, f_measure] for a single expected/predicted pair."""
    confusion_counts = evaluate_micro(expected, predicted)
    return compute_metrics(confusion_counts)

def evaluate_micro(expected, predicted):
    """
    Count the confusion entries for one expected/predicted pair of property collections.

    tp: properties present in both the expected and the predicted set
    fp: properties present only in the predicted set
    fn: properties present only in the expected set

    Returns a numpy array [tp, fp, fn]; duplicates in the inputs are counted once.
    """
    expected_set, predicted_set = set(expected), set(predicted)

    true_positives = len(expected_set & predicted_set)
    false_positives = len(predicted_set - expected_set)
    false_negatives = len(expected_set - predicted_set)

    return np.array([true_positives, false_positives, false_negatives])


def compute_metrics(confusion_results):
    """
    Turn confusion counts into precision, recall and F-measure.

    Parameters:
        confusion_results: sequence or array of three numbers [tp, fp, fn].

    Returns:
        numpy array [precision, recall, f_measure]. An entry is NaN only where its
        denominator is zero: precision when tp + fp == 0, recall when tp + fn == 0,
        f_measure when tp + fp + fn == 0.
    """
    # Floats make the division well defined for plain Python ints as well as numpy ints.
    tp, fp, fn = np.asarray(confusion_results, dtype=float)

    # Zero denominators deliberately yield NaN ("undefined"); silence the warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        # Count-based form of 2PR / (P + R). The ratio form turned into NaN whenever
        # tp == 0, so instances with no correct prediction dropped out of NaN-aware
        # averages instead of contributing an F-measure of 0.
        f_measure = 2 * tp / (2 * tp + fp + fn)

    return np.array([precision, recall, f_measure])

In [None]:
# Import and process the test data
import pandas as pd

# Training instances are only needed for their 'comparison_id', so their text stays raw.
train_json = read_json(TRAINING_SET_PATH)
train_df = pd.json_normalize(train_json['instances'])

# Test instances get the same text cleaning that was applied before training.
test_json = read_json(TEST_SET_PATH)
test_df = pd.json_normalize(test_json['instances']).assign(
    text=lambda frame: frame['text'].apply(text_process)
)

In [None]:
import os
from google.colab import auth
auth.authenticate_user()

data = read_json(DATA_PATH)
vectorizer = read_pickle(VECTORIZER_PATH)

results = {}
for n in N_CLUSTERS:
    MODEL_PATH = MODEL_TEMPLATE_PATH.format(n)
    print('evaluating model: {}'.format(MODEL_PATH))

    if not os.path.exists(MODEL_PATH):
        !gsutil cp gs://$BUCKET/$MODEL_PATH $MODEL_PATH
    clustering_model = read_pickle(MODEL_PATH)

    macro_measures = np.empty((0,3), float)
    micro_measures = np.zeros(3)
    for test_instance_index, test_instance in test_df.iterrows():
        expected_comparison_id, text = test_instance['comparison_id'], test_instance['text']
        expected = map_to_predicates(data, [expected_comparison_id])
        vectorized_text = vectorizer.transform([text])

        predicted_comparison_ids = predict_comparisons(clustering_model, vectorized_text, test_instance_index, train_df)
        predicted = map_to_predicates(data, predicted_comparison_ids)
        macro_measures = np.vstack((macro_measures, evaluate_macro(expected, predicted)))
        micro_measures += evaluate_micro(expected, predicted)
    
    macro_measures = np.nanmean(macro_measures, axis=0)
    micro_measures = compute_metrics(micro_measures)
    results[str(n)] = {
        'k': n,
        'macro': {
            'precision': macro_measures[0],
            'recall': macro_measures[1],
            'f_measure': macro_measures[2]
        },
        'micro': {
            'precision': micro_measures[0],
            'recall': micro_measures[1],
            'f_measure': micro_measures[2]
        }
    }
    write_json(results, RESULTS_PATH)
    !cp $RESULTS_PATH '/path_to_your_directory_on_google_drive/'$RESULTS_PATH

In [None]:
# Copy the stored results file from Google Drive to the local results path.
# NOTE: '/path_to_your_directory_on_google_drive/' is a placeholder to be replaced.
!cp '/path_to_your_directory_on_google_drive/'$RESULTS_PATH $RESULTS_PATH


# Flatten the per-model entries into a table; the nested 'macro'/'micro' dicts are
# flattened by json_normalize. The trailing expression displays the table.
results = read_json(RESULTS_PATH)
results_df = pd.json_normalize(results.values())
results_df

# Cluster Analysis

In [None]:
from google.colab import auth
auth.authenticate_user()

# NOTE: placeholder name; set it to the file name of a model stored in the bucket.
model_path = 'choose_any_model_from_your_bucket.pkl'
# Download that model from the bucket into the working directory.
!gsutil cp gs://$BUCKET/$model_path $model_path

In [None]:
# NOTE: placeholder name; set it to the file name of the model to analyse.
model_path = 'choose_any_model_from_your_bucket.pkl'
# read_pickle unpickles the file. Only load models you created yourself:
# unpickling untrusted data can execute arbitrary code.
model = read_pickle(model_path)

In [None]:
# min, max, avg papers per cluster
import numpy as np 
import pandas as pd

# counts[i] is the number of clustered instances carrying the label unique[i]
unique, counts = np.unique(model.labels_, return_counts=True)
for statistic in (np.min, np.max, np.average):
  print(statistic(counts))

In [None]:
# min, max, avg comparisons per cluster
train_json = read_json(TRAINING_SET_PATH)
train_df = pd.json_normalize(train_json['instances'])

# Agglomerative models were fitted on training rows followed by test rows,
# so the frame has to be extended in the same order to line up with model.labels_.
if CLUSTERING_METHOD == 'agglomerative':
  test_json = read_json(TEST_SET_PATH)
  test_df = pd.json_normalize(test_json['instances'])
  train_df = pd.concat([train_df, test_df])

# Fail with a meaningful message when the loaded model does not match the data.
# The former bare `except` reported 'already inserted!' for this case and the next
# statement then failed with a KeyError on 'cluster_id'.
if len(model.labels_) != len(train_df):
  raise ValueError(
      'model provides {} labels but {} instances were loaded; '
      'check that the model matches CLUSTERING_METHOD={!r}'.format(len(model.labels_), len(train_df), CLUSTERING_METHOD)
  )

# train_df was rebuilt above, so the column is inserted exactly once per run of this cell.
train_df.insert(1, 'cluster_id', model.labels_)

# One row per distinct (cluster, comparison) pair; counting rows per cluster then
# gives the number of different comparisons found in that cluster.
clusters_comparisons = train_df[['cluster_id', 'comparison_id']].drop_duplicates()
unique, counts = np.unique(clusters_comparisons['cluster_id'], return_counts=True)
print(np.min(counts))
print(np.max(counts))
print(np.average(counts))

In [None]:
# Find out how each comparison is spread over the clusters and how pure those clusters are.
# A cluster counts as "pure" when its training instances belong to exactly one comparison.
train_json = read_json(TRAINING_SET_PATH)
train_df = pd.json_normalize(train_json['instances'])

# Only the labels of the leading training rows are inspected inside the loop.
train_labels = model.labels_[:train_df.shape[0]]

puriteis = []
number_of_clusters = []
for comparison_id, number_of_papers in train_df['comparison_id'].value_counts().items():
  paper_indices = train_df[train_df['comparison_id'] == comparison_id].index
  clusters_labels = model.labels_[paper_indices]
  distinct_labels = np.unique(clusters_labels)
  clusters_comparisons = []
  pure_clusters = 0

  for cluster_label in distinct_labels:
    cluster_instances_indices = np.flatnonzero(train_labels == cluster_label)
    cluster_instances = train_df.iloc[cluster_instances_indices]
    # TODO: remove the next line if you want to ignore the fact that "comparisons can share papers".
    cluster_instances = cluster_instances.drop_duplicates(subset='paper_id')
    cluster_comparisons = cluster_instances['comparison_id'].unique()
    clusters_comparisons.extend(cluster_comparisons)
    pure_clusters += 1 if len(cluster_comparisons) == 1 else 0

  cluster_count = len(distinct_labels)
  purity = pure_clusters / cluster_count
  puriteis.append(purity)
  number_of_clusters.append(cluster_count)
  print('comparison {} with {} papers is distributed over {} clusters containing {} comparisons, where {} clusters are pure. - Purity={}'.format(comparison_id, number_of_papers, cluster_count, len(set(clusters_comparisons)), pure_clusters, purity))
  print('comparisons: {}'.format(set(clusters_comparisons)))

print('Average purity: {:.3f}'.format(np.average(puriteis)))
for caption, statistic in (('min clusters/comparison', np.min), ('max clusters/comparison', np.max), ('avg clusters/comparison', np.average)):
  print(caption, statistic(number_of_clusters))