import os

import constants
from utils import file_utils, word_utils
from preprocess.diff import get_diff_from_raw_diff


def get_text_diff_seq(dir_path):
    """Build a single diff sequence string from the ``patch`` file in *dir_path*.

    The file ``<dir_path>/patch`` is read as a string and handed to
    ``get_diff_from_raw_diff`` (with ``tokenize=True`` and ``skip_lines=0``).
    The ``'patch'`` value of every returned entry is then joined into one
    string, with ``constants.SPECIAL_WORD_BLOCK`` (padded by a single space on
    each side) placed between consecutive entries.

    :param dir_path: directory that contains a file named ``patch``.
    :return: the joined diff sequence string.
    """
    raw_diff = file_utils.read_abs_file_to_string(os.path.join(dir_path, 'patch'))
    parsed_blocks = get_diff_from_raw_diff(raw_diff, tokenize=True, skip_lines=0)

    # The separator is the block marker surrounded by single spaces.
    block_separator = ' %s ' % constants.SPECIAL_WORD_BLOCK
    patch_texts = [block['patch'] for block in parsed_blocks]

    return block_separator.join(patch_texts)


def _get_commit_diff_seqs_without_java(commit_dir, dirs):
    """Return one diff sequence string per child directory of *commit_dir*.

    For every name in *dirs*, the name is tokenized with
    ``word_utils.word_tokenizer`` and re-joined with spaces, the diff sequence
    of ``<commit_dir>/<name>`` is obtained via ``get_text_diff_seq``, and the
    two are combined as::

        <SPECIAL_WORD_FILE_START> <tokenized name> <SPECIAL_WORD_FILE_END> <diff>

    :param commit_dir: path of the commit directory.
    :param dirs: iterable of child directory names inside *commit_dir*.
    :return: list of formatted strings, in the iteration order of *dirs*.
    """

    def _format_entry(entry_name):
        # Resolve the path and tokenize the name before reading the patch.
        entry_path = os.path.join(commit_dir, entry_name)
        tokenized_name = ' '.join(word_utils.word_tokenizer(entry_name))
        entry_diff = get_text_diff_seq(entry_path)

        return '%s %s %s %s' % (
            constants.SPECIAL_WORD_FILE_START,
            tokenized_name,
            constants.SPECIAL_WORD_FILE_END,
            entry_diff,
        )

    return [_format_entry(entry_name) for entry_name in dirs]


def get_commit_diff_seq(commit_dir):
    """Build the full diff sequence string for the commit stored in *commit_dir*.

    The directory is listed through ``file_utils.get_files_dirs_in_dir``. If
    the returned files contain an entry equal to ``'error'``, the commit is
    not processed and ``None`` is returned. Otherwise the per-directory diff
    sequences are joined with single spaces.

    :param commit_dir: path of the commit directory.
    :return: the space-joined diff sequence string, or ``None`` when
        ``'error'`` is among the files.
    """
    file_names, child_dirs = file_utils.get_files_dirs_in_dir(commit_dir)

    if 'error' not in file_names:
        per_dir_seqs = _get_commit_diff_seqs_without_java(commit_dir, child_dirs)
        return ' '.join(per_dir_seqs)

    # An 'error' entry among the files means no sequence is produced.
    return None
