import json
from collections import defaultdict
from text_prepro_tools import clean_up_content

# Characters stripped from diff text after the shared clean-up step.
# The original list also contained '...', which is already covered by '.'.
_DIFF_PUNCTUATION = "!:.%$'\";ˆˇ()"
_DIFF_PUNCT_TABLE = str.maketrans('', '', _DIFF_PUNCTUATION)


def clean_up_diff(text_content):
    """Clean one diff string and strip a fixed set of punctuation characters.

    The text is first passed through ``clean_up_content`` with
    ``rem_stop_words=True`` (see ``text_prepro_tools`` for what that does),
    then every character in ``_DIFF_PUNCTUATION`` is deleted.

    Args:
        text_content: Raw diff text.

    Returns:
        The cleaned text with the listed punctuation characters removed.
    """
    text_content = clean_up_content(text_content, rem_stop_words=True)
    # A single translate pass replaces the former chain of str.replace calls.
    return text_content.translate(_DIFF_PUNCT_TABLE)

def prepro_diff(diff_contents):
    """Clean the diff text of every article, version by version.

    Each article is a mapping with a ``'diff_cont'`` entry (version name ->
    iterable of text fragments) and a ``'doi'`` entry. The fragments of each
    version are joined with single spaces and run through ``clean_up_diff``.

    Args:
        diff_contents: Iterable of article mappings as described above.

    Returns:
        A list with one dict per article, mapping each version name to its
        cleaned text, plus the article's ``'doi'``.
    """
    results = []
    for entry in diff_contents:
        cleaned = {
            version: clean_up_diff(' '.join(fragments))
            for version, fragments in entry['diff_cont'].items()
        }
        # Carry the identifier along so results can be matched to articles.
        cleaned['doi'] = entry['doi']
        results.append(cleaned)
    return results

def make_corpora_for_lda(texts, *, freq_threshold=50, len_threshold=2):
    """Filter tokenised documents down to frequent, non-trivial tokens.

    Token frequencies are counted across the whole collection. A token is
    kept only if it occurs strictly more than ``freq_threshold`` times in
    total and is strictly longer than ``len_threshold`` characters.

    Args:
        texts: Sequence of documents, each a sequence of string tokens. It is
            iterated twice, so it must not be a one-shot iterator.
        freq_threshold: Tokens occurring this many times or fewer across all
            documents are dropped (default 50).
        len_threshold: Tokens of this length or shorter are dropped
            (default 2, i.e. one- and two-character tokens are removed).

    Returns:
        A list of token lists, one per input document, in the original order.
        Documents that lose all their tokens are kept as empty lists.
    """
    # Count occurrences of every token over the whole corpus.
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1

    # Apply both filters in a single pass over each document.
    return [
        [
            token for token in text
            if frequency[token] > freq_threshold and len(token) > len_threshold
        ]
        for text in texts
    ]

# Read the raw per-article diff contents. JSON is read explicitly as UTF-8
# so the result does not depend on the platform's default encoding.
with open('LDA/diff_content.json', 'r', encoding='utf-8') as f:
    diff_content = json.load(f)
processed_diff_content = prepro_diff(diff_content)

# Tokenise the 'version-0-2' diff of each article on single spaces, then
# drop low-frequency and very short tokens.
texts = [t['version-0-2'].split(' ') for t in processed_diff_content]
texts = make_corpora_for_lda(texts)
# Save the filtered token lists as JSON.
with open('LDA/lda_text_diff.txt', 'w', encoding='utf-8') as f:
    json.dump(texts, f)