import random

import constants
from utils import file_utils, word_utils


def write_data_to_files(prefix,
                        train_source_seqs,
                        train_target_seqs,
                        eval_source_seqs,
                        eval_target_seqs,
                        is_append=False):
    """Write the four train/eval source/target payloads next to ``prefix``.

    Each payload is handed unchanged to ``file_utils.write_string_to_file``
    with the path ``prefix + <suffix>``, where the suffix is the matching
    ``constants.SUFFIX_*`` value. Files are written in the order
    train-source, train-target, eval-source, eval-target.

    Args:
        prefix: Path prefix shared by all four output files.
        train_source_seqs: Payload for the train/source file.
        train_target_seqs: Payload for the train/target file.
        eval_source_seqs: Payload for the eval/source file.
        eval_target_seqs: Payload for the eval/target file.
        is_append: Forwarded as the third argument of
            ``write_string_to_file`` (presumably selects append instead of
            overwrite -- confirm in ``file_utils``).
    """

    def _dump(suffix, payload):
        # One place for the path construction and the shared append flag.
        target_path = prefix + suffix
        file_utils.write_string_to_file(target_path, payload, is_append)

    _dump(constants.SUFFIX_TRAIN_SOURCE, train_source_seqs)
    _dump(constants.SUFFIX_TRAIN_TARGET, train_target_seqs)
    _dump(constants.SUFFIX_EVAL_SOURCE, eval_source_seqs)
    _dump(constants.SUFFIX_EVAL_TARGET, eval_target_seqs)


def verb_filter():
    """Filter the length-limited commit corpus and re-split it into train/eval.

    Reads the four train/eval source/target files that share the prefix
    ``static/data/text_limit_len_commits/text_limit_len_commits``, keeps only
    the (source, target) line pairs that pass every check below, assigns each
    kept pair to the eval set when ``random.random() < constants.EVAL_RATE``
    and to the train set otherwise, and writes the four resulting files under
    ``static/data/text_verb_commits/text_verb_commits``.

    A pair is kept when, after splitting both lines on single spaces and
    discarding empty tokens:

    * both token lists are non-empty,
    * ``word_utils.starts_with_verb(target_words)`` is truthy,
    * ``word_utils.overlap_two_seq(source_words, target_words)`` is truthy,
    * the token ``'revert'`` does not occur in the target.

    Kept lines are re-joined with single spaces, so runs of spaces in the
    input are collapsed in the output.

    Source and target lines are paired *within* each input split. Pairing by
    a single index over the concatenated train+eval lists would shift every
    eval pair whenever the train source and train target files split into a
    different number of lines (for example when only one of them ends with a
    trailing newline). Lines without a partner in their own split are
    dropped, and a warning is printed for that split.
    """
    input_prefix = 'static/data/text_limit_len_commits/text_limit_len_commits'

    def read_lines(suffix):
        # Load one corpus file and split it into lines.
        return file_utils.read_file_to_string(input_prefix + suffix).split('\n')

    eval_source_seqs = read_lines(constants.SUFFIX_EVAL_SOURCE)
    train_source_seqs = read_lines(constants.SUFFIX_TRAIN_SOURCE)
    eval_target_seqs = read_lines(constants.SUFFIX_EVAL_TARGET)
    train_target_seqs = read_lines(constants.SUFFIX_TRAIN_TARGET)

    # Train pairs are processed before eval pairs, as before.
    input_splits = (
        ('train', train_source_seqs, train_target_seqs),
        ('eval', eval_source_seqs, eval_target_seqs),
    )

    kept_train_sources = []
    kept_train_targets = []
    kept_eval_sources = []
    kept_eval_targets = []

    for split_name, split_sources, split_targets in input_splits:
        if len(split_sources) != len(split_targets):
            # zip() below stops at the shorter side; make that visible.
            print('warning: %s split has %d source lines but %d target lines; '
                  'unpaired lines are dropped'
                  % (split_name, len(split_sources), len(split_targets)))

        for source_seq, target_seq in zip(split_sources, split_targets):
            source_words = [word for word in source_seq.split(' ') if word != '']
            target_words = [word for word in target_seq.split(' ') if word != '']

            # Cheap emptiness checks first, then the word_utils predicates in
            # their original order, then the 'revert' exclusion.
            if not (source_words
                    and target_words
                    and word_utils.starts_with_verb(target_words)
                    and word_utils.overlap_two_seq(source_words, target_words)
                    and 'revert' not in target_words):
                continue

            source_line = ' '.join(source_words)
            target_line = ' '.join(target_words)
            if random.random() < constants.EVAL_RATE:
                kept_eval_sources.append(source_line)
                kept_eval_targets.append(target_line)
            else:
                kept_train_sources.append(source_line)
                kept_train_targets.append(target_line)

    print('len(train_limit_len_source_seqs)', len(kept_train_sources))

    output_prefix = 'static/data/text_verb_commits/text_verb_commits'
    write_data_to_files(output_prefix,
                        '\n'.join(kept_train_sources),
                        '\n'.join(kept_train_targets),
                        '\n'.join(kept_eval_sources),
                        '\n'.join(kept_eval_targets))


# Script entry point: run the filter only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    verb_filter()
