from gensim.models.phrases import Phrases
import os
from datetime import datetime
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

class NGramGeneration:
    """Static helpers for phrase (n-gram) detection and word-frequency export."""

    @staticmethod
    def generate_bi_trigrams(input_data, min_count=1):
        """Add bigram to five-gram columns to ``input_data`` and return it.

        Four gensim ``Phrases`` models are trained in a chain on the
        ``input_tokens`` column: the bigram model on the raw tokens, and each
        higher-order model on the corpus transformed by all lower-order
        models. Every row is then pushed through the chain, and after each
        stage the tokens containing exactly 1, 2, 3 or 4 ``'_'`` delimiters
        are collected as that row's bigrams, trigrams, fourgrams and
        fivegrams respectively.

        Args:
            input_data: DataFrame with an ``input_tokens`` column holding one
                list of string tokens per row. It is modified in place.
            min_count: Value forwarded as ``min_count`` to every ``Phrases``
                model. Defaults to 1, the value this function has always
                used.

        Returns:
            The same ``input_data`` object with eight added (or overwritten)
            columns: ``<prefix>_tokens`` (list of n-grams) and
            ``<prefix>_text`` (the same n-grams joined with ``','``) for the
            prefixes ``bigrams``, ``trigrams``, ``fourgrams``, ``fivegrams``.
        """
        token_lists = input_data['input_tokens']

        # Each model learns from the corpus as rewritten by the models before
        # it, so that already-joined phrases can be extended by one more step.
        bigram = Phrases(token_lists, min_count=min_count, delimiter='_')
        trigram = Phrases(bigram[token_lists], min_count=min_count, delimiter='_')
        fourgram = Phrases(trigram[bigram[token_lists]], min_count=min_count, delimiter='_')
        fivegram = Phrases(
            fourgram[trigram[bigram[token_lists]]], min_count=min_count, delimiter='_'
        )

        prefixes = ('bigrams', 'trigrams', 'fourgrams', 'fivegrams')
        models = (bigram, trigram, fourgram, fivegram)
        grams_by_prefix = {prefix: [] for prefix in prefixes}

        for tokens in token_lists:
            # Feed each stage's output into the next one instead of
            # re-running the whole chain from the raw tokens for every order.
            transformed = tokens
            for delimiter_count, (prefix, model) in enumerate(zip(prefixes, models), start=1):
                transformed = model[transformed]
                # NOTE(review): an input token that already contains '_' is
                # counted here as well -- confirm tokens are '_'-free upstream.
                grams_by_prefix[prefix].append(
                    [tok for tok in transformed if tok.count('_') == delimiter_count]
                )

        # Assign whole columns positionally; per-cell ``.at`` writes depend on
        # the index labels being unique.
        for prefix in prefixes:
            grams = grams_by_prefix[prefix]
            input_data[prefix + '_tokens'] = pd.Series(
                grams, index=input_data.index, dtype=object
            )
            input_data[prefix + '_text'] = pd.Series(
                [','.join(row_grams) for row_grams in grams],
                index=input_data.index,
                dtype=object,
            )

        return input_data

    @staticmethod
    def generate_word_frequency(corpus, file_name, top_n=None):
        """Count word occurrences in ``corpus``, dump them to CSV, return the top ones.

        The full frequency table is written to
        ``wordfreq_<file_name>_<timestamp>.csv`` (a path relative to the
        current working directory), where ``<timestamp>`` is the current time
        in milliseconds. The CSV has the columns ``word`` and ``count``.

        Args:
            corpus: Iterable of text documents passed to a default-configured
                ``CountVectorizer``.
            file_name: Label embedded in the output CSV file name.
            top_n: Number of leading entries to return; ``None`` returns all.

        Returns:
            List of ``(word, count)`` tuples sorted by descending count,
            truncated to ``top_n`` entries.
        """
        now = int(round(datetime.now().timestamp() * 1000))

        # fit_transform learns the vocabulary and counts in a single pass, so
        # the corpus is tokenized once and a one-shot iterable is not consumed
        # before the counting step.
        vec = CountVectorizer()
        bag_of_words = vec.fit_transform(corpus)
        sum_words = bag_of_words.sum(axis=0)  # per-vocabulary-column totals

        words_freq = [(word, sum_words[0, idx])
                      for word, idx in vec.vocabulary_.items()]
        words_freq = sorted(words_freq, key=lambda pair: pair[1], reverse=True)

        df = pd.DataFrame(words_freq, columns=["word", "count"])
        df.to_csv("wordfreq_%s_%s.csv" % (file_name, now), index=False)

        return words_freq[:top_n]