#%%
"""Script to perform cluster analysis based on titles and
   abstracts in the RP bibliography.
"""

from collections import defaultdict
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from operator import itemgetter
from progress.bar import ChargingBar
from pybtex.database.input import bibtex
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score, silhouette_samples, adjusted_rand_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
import nltk
import numpy as np
import datetime

# One-time setup: uncomment to download the tokenizer data needed by
# word_tokenize (WordNetLemmatizer presumably also needs the 'wordnet'
# corpus -- TODO confirm on a fresh installation).
#nltk.download('punkt')

# seed handed to every KMeans instance (random_state=...) so that repeated
# runs of the script produce the same clustering
random_state = 13


# Tokens removed after lemmatisation (they include some abbreviations which
# can have an impact on the clustering).  Kept at module level as a frozenset
# so that it is built only once and membership tests are O(1).
_REMOVE_WORDS = frozenset([
    'new', 'ha', 'rp', 'br', 'used', 'approach', 'proposed', 'study',
    'method', 'based', 'result', 'non', 'using', 'wa',
])


def lemmatize_text(text):
    """Lower-case, tokenise and lemmatise *text* and drop unwanted tokens.

    Args:
        text: Raw text of one document.

    Returns:
        The lemmatised tokens that are not listed in ``_REMOVE_WORDS``,
        joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = (lemmatizer.lemmatize(token) for token in word_tokenize(text.lower()))
    return ' '.join(lemma for lemma in lemmas if lemma not in _REMOVE_WORDS)
    

# load Bibtex file
parser = bibtex.Parser()
bibdata = parser.parse_file("../rp.bib")
labels = sorted(bibdata.entries.keys())


# extract titles, abstracts and keywords of all papers published in
# [minYear, maxYear]; labelsSelected holds the matching BibTeX keys in the
# same order as titles_abstracts
titles_abstracts = []
labelsSelected = []
currentYear = datetime.datetime.now().year
minYear = 1987
maxYear = currentYear
for bib_id in labels:

    entry = bibdata.entries[bib_id]

    # ignore papers annotated as "Software" or "Related"
    # (.get() keeps entries without an 'annote' field instead of raising)
    if entry.fields.get('annote', '') in ('Software', 'Related'):
        continue

    year = entry.fields.get('year', '')

    # papers in press are counted in the current year; the entry itself is
    # updated as well because later steps read the year from bibdata again
    if year == "in press":
        # continue # if papers in press should be ignored
        entry.fields['year'] = maxYear
        year = maxYear

    # a missing or non-numeric year cannot be placed in the time interval
    try:
        year = int(year)
    except (TypeError, ValueError):
        print(f"Skipping {bib_id}: cannot interpret year {year!r}")
        continue

    # consider only papers in a certain time interval
    if minYear <= year <= maxYear:
        title = entry.fields.get('title', '')
        abstract = entry.fields.get('abstract', '')
        keywords = entry.fields.get('keywords', '')
        titles_abstracts.append(title + ' ' + abstract + ' ' + keywords)
        labelsSelected.append(bib_id)

# lemmatise text
titles_abstracts_lemmatized = [lemmatize_text(text) for text in titles_abstracts]


# find optimal cluster number
#
# The TF-IDF representation does not depend on the number of clusters, so it
# is computed once and reused for every K-Means run and every quality score
# (same vectorizer settings as the final model).
max_clusters = 50
tfidf = TfidfVectorizer(stop_words='english', max_df=.33, sublinear_tf=True).fit_transform(titles_abstracts_lemmatized)
tfidf_dense = tfidf.toarray()  # the quality scores are evaluated on a dense array

# one row per cluster number (row k-1 belongs to k clusters); columns:
#   0: Davies-Bouldin index
#   1: Calinski-Harabasz index
#   2: mean silhouette score
#   3: fraction of samples with a negative silhouette value
qualityScore = np.zeros((max_clusters, 4))
bar = ChargingBar('Find optimal cluster number', max=max_clusters, suffix='%(percent).0f%% - %(eta)ds')
for num_clusters in range(1, max_clusters + 1):

    bar.next()

    # K-Means on the precomputed TF-IDF matrix
    kmeans = KMeans(n_clusters=num_clusters, n_init=15, tol=1e-5, random_state=random_state)
    clusterLabels = kmeans.fit_predict(tfidf)

    # the scores are only defined for more than one unique cluster label
    if len(set(clusterLabels)) > 1:
        qualityScore[num_clusters-1, 0] = davies_bouldin_score(tfidf_dense, clusterLabels)
        qualityScore[num_clusters-1, 1] = calinski_harabasz_score(tfidf_dense, clusterLabels)

        # silhouette_score is the mean of silhouette_samples, so the
        # per-sample values are computed once and used for both columns
        s = silhouette_samples(tfidf_dense, clusterLabels)
        qualityScore[num_clusters-1, 2] = np.mean(s)
        qualityScore[num_clusters-1, 3] = np.sum(s < 0.0)/len(s)

    else:
        qualityScore[num_clusters-1, :] = np.nan

bar.finish()

# plot Davies-Bouldin and Calinski-Harabasz index over the number of clusters
plt.clf()
plt.plot(range(1, max_clusters + 1), qualityScore[:,0], label='Davies-Bouldin index')
plt.plot(range(1, max_clusters + 1), qualityScore[:,1], label='Calinski-Harabasz index')
plt.xlabel('Number of clusters')
plt.legend()
plt.grid(True)

# store the quality scores (despite its name the file holds all four columns,
# not only the Davies-Bouldin index)
output_file = "../Data/cluster_DBindex.txt"
np.savetxt(output_file, qualityScore, fmt='%f')





# define the number of clusters (you may adjust this)
num_clusters = 22

# create a pipeline with TF-IDF and K-Means
model = make_pipeline(TfidfVectorizer(stop_words='english', max_df=.33, sublinear_tf=True), KMeans(n_clusters=num_clusters, n_init=15, tol=1e-5, random_state=random_state))

# fit the model
clusterLabels = model.fit_predict(titles_abstracts_lemmatized)

# get cluster assignments
cluster_assignments = model.predict(titles_abstracts_lemmatized)

# get the cluster sizes
cluster_sizes = np.bincount(cluster_assignments)

# sort the clusters based on their sizes
sorted_clusters = np.argsort(cluster_sizes)[::-1]  # sort in descending order of size

# The reports number the clusters by size rank (position in sorted_clusters,
# starting at 1), not by their K-Means index.  The dendrogram uses the same
# numbering so that "Cluster k" denotes the same cluster everywhere.
size_rank = {int(cluster_id): rank for rank, cluster_id in enumerate(sorted_clusters, start=1)}
dendrogram_labels = [
    # a centroid without any assigned paper has no size rank
    f"Cluster {size_rank[i]}" if i in size_rank else f"Cluster {i+1} (empty)"
    for i in range(num_clusters)
]

# calculate linkage matrix for hierarchical clustering of the centroids
linkage_matrix = linkage(model.named_steps['kmeans'].cluster_centers_, method='ward')

# plot dendrogram
plt.figure(figsize=(10, 6))
dendrogram(linkage_matrix, labels=dendrogram_labels, leaf_rotation=90)
plt.title('Hierarchical dendrogram of clustering')
plt.xlabel('Cluster')
plt.ylabel('Distance')
plt.show()

# print papers in each cluster and top features
output_file = "../Data/cluster_allyears.txt"

# number of highest-weighted centroid features reported per cluster
num_top_features = 15

# feature names and centroids are the same for every cluster, so they are
# looked up once (use 'minibatchkmeans' if the pipeline is built with
# MiniBatchKMeans instead of KMeans)
vectorizer = model.named_steps['tfidfvectorizer']
feature_names = np.array(vectorizer.get_feature_names_out())
centroids = model.named_steps['kmeans'].cluster_centers_

# open the file in write mode
with open(output_file, "w", encoding="utf-8") as f:

    # idxCluster is the size rank (0 = largest cluster), cluster_id the
    # K-Means index of that cluster
    for idxCluster, cluster_id in enumerate(sorted_clusters):

        cluster_papers = [label for label, assigned in zip(labelsSelected, cluster_assignments) if assigned == cluster_id]

        cluster_years = [int(bibdata.entries[label].fields.get('year', '')) for label in cluster_papers]
        # an empty cluster has no first year; avoid min() raising on it
        first_year = min(cluster_years, default='n/a')

        # console output is formatted as rows of a LaTeX table
        print(f"{idxCluster + 1}    &   &{len(cluster_papers)}")
        f.write(f"Cluster: {idxCluster + 1} ({len(cluster_papers)} papers, {first_year} started)\n")
        f.write(f"Papers: {', '.join(cluster_papers)}\n")

        # features with the largest centroid weights, in descending order
        top_n_features_indices = np.argsort(centroids[cluster_id])[::-1][:num_top_features]
        top_n_features = feature_names[top_n_features_indices]

        print(f"    &{', '.join(top_n_features)}\\\\\n")
        f.write(f"Top Features: {', '.join(top_n_features)}\n\n")


# write cluster top words to file (one line per cluster, largest cluster first)

# number of highest-weighted centroid features reported per cluster
num_top_features = 15

# feature names and centroids are the same for every cluster, so they are
# looked up once (use 'minibatchkmeans' if the pipeline is built with
# MiniBatchKMeans instead of KMeans)
vectorizer = model.named_steps['tfidfvectorizer']
feature_names = np.array(vectorizer.get_feature_names_out())
centroids = model.named_steps['kmeans'].cluster_centers_

with open('../Data/cluster_topWords.txt', "w", encoding="utf-8") as f:
    for cluster_id in sorted_clusters:

        # features with the largest centroid weights, in descending order
        top_n_features_indices = np.argsort(centroids[cluster_id])[::-1][:num_top_features]
        top_n_features = feature_names[top_n_features_indices]

        f.write(f"{', '.join(top_n_features)}\n\n")







# clustering for extending years: for every year Y in [minYear, maxYear] the
# papers published in minYear..Y are assigned to the clusters of the general
# model, and the share of each cluster is recorded
yearsList = range(minYear, maxYear + 1)
# rows: clusters in size-rank order, columns: years of yearsList
clusterYears = np.zeros((num_clusters, len(yearsList)))

# The assignment of a paper does not depend on the year range under
# consideration, so every paper is lemmatised and assigned exactly once and
# the per-year results are obtained by filtering on the publication year.
paper_ids = []
paper_years = []
paper_texts = []
for bib_id in labels:
    entry = bibdata.entries[bib_id]

    # ignore papers annotated as "Software" or "Related"
    # (.get() keeps entries without an 'annote' field instead of raising)
    if entry.fields.get('annote', '') in ('Software', 'Related'):
        continue

    # entries without a numeric year cannot be placed in any year range
    try:
        year = float(entry.fields.get('year', ''))
    except (TypeError, ValueError):
        continue
    if year < minYear:
        continue

    title = entry.fields.get('title', '')
    abstract = entry.fields.get('abstract', '')
    # NOTE(review): the general model was fitted on title + abstract +
    # keywords, whereas only title + abstract are used here -- confirm
    # whether the keywords should be included as well.
    paper_ids.append(bib_id)
    paper_years.append(year)
    paper_texts.append(title + ' ' + abstract)

paper_years = np.asarray(paper_years, dtype=float)
if paper_texts:
    # get cluster assignments using general model
    paper_clusters = np.asarray(model.predict([lemmatize_text(text) for text in paper_texts]))
else:
    # predicting on an empty document list would raise
    paper_clusters = np.empty(0, dtype=int)

output_file = "../Data/cluster_individualyears.txt"

# open the file in write mode
with open(output_file, "w", encoding="utf-8") as f:

    bar = ChargingBar('Clusters for each year', max=len(yearsList), suffix='%(percent).0f%% - %(eta)ds')
    for idx, year_limit in enumerate(yearsList):

        bar.next()

        f.write(f"\nYear: {minYear}-{year_limit} ")

        # papers published up to (and including) the current year
        in_range = paper_years <= year_limit
        num_papers = int(np.count_nonzero(in_range))

        # number of found/ assigned clusters in current year range
        f.write(f"(num clusters: {len(np.unique(paper_clusters[in_range]))})\n")

        # list the papers of each non-empty cluster, largest cluster first
        for idxCluster, cluster_id in enumerate(sorted_clusters):
            cluster_papers = [
                bib_id
                for bib_id, assigned, selected in zip(paper_ids, paper_clusters, in_range)
                if selected and assigned == cluster_id
            ]

            if not cluster_papers:
                continue

            # share of the cluster among all papers up to the current year
            clusterYears[idxCluster, idx] = len(cluster_papers) / num_papers

            f.write(f"Cluster: {idxCluster + 1} ({len(cluster_papers)} papers)\n")
            f.write(f"Papers: {', '.join(cluster_papers)}\n")

    bar.finish()


# store cluster statistics
output_file = "../Data/cluster_stat.txt"
np.savetxt(output_file, clusterYears, fmt='%f')
