import os
import pandas as pd
import re
import argparse
import datetime

"""Program for converting a directory of webanno tsv files to conll files. Modified for annotation campaign Canaréno and Normenanalyse 2 project.
It will create the short span version with ... (add afterwards with ellipsen, prefix, suffix etc.).
"""

def webanno_tsv_to_conll(path, output_path):
    """Convert every WebAnno TSV file found below ``path`` to a CoNLL file.

    The input tree is traversed with ``os.walk``. Each TSV file is converted
    and written to ``<output_path>/Documents_simple/<folder>.conll``, where
    ``<folder>`` is the name of the directory that directly contains the TSV
    file. Files named ``INITIAL_CAS.tsv`` and ``.DS_Store`` are skipped.

    Note: several TSV files inside the same folder map to the same output
    file, so the file converted last overwrites the earlier ones.

    Args:
        path: root directory of the annotation export.
        output_path: directory in which ``Documents_simple`` is created
            (with or without a trailing path separator).
    """
    # INITIAL_CAS.tsv is not converted; .DS_Store is a macOS directory file
    skipped_files = {"INITIAL_CAS.tsv", ".DS_Store"}
    annotators = {}  # file name -> number of converted files with that name

    # os.path.join keeps the path correct whether or not output_path ends
    # with a separator; exist_ok avoids the separate existence check.
    output_dir = os.path.join(output_path, "Documents_simple")
    os.makedirs(output_dir, exist_ok=True)

    for dirpath, dirnames, filenames in os.walk(path):
        if dirnames:
            print("Got following directory names: ")
            print(dirnames)

        # Name the output after the folder that actually holds the file
        # instead of relying on the order in which os.walk visits folders.
        # normpath strips a trailing separator so basename is never empty.
        document_name = os.path.basename(os.path.normpath(dirpath))

        for filename in filenames:
            if filename in skipped_files:
                continue
            annotators[filename] = annotators.get(filename, 0) + 1
            df = webanno_tsv_to_df(os.path.join(dirpath, filename))
            df_to_conll(df, os.path.join(output_dir, document_name + ".conll"))

    print(annotators)
    print("Finished")

def webanno_tsv_to_df(path, verbose = False):
    """Read the WebAnno TSV file at ``path`` into a pandas DataFrame.

    Only lines that split into exactly seven tab-separated cells are kept as
    data rows. Every empty line is turned into a marker row whose cells are
    all ``"\\n"`` (used as end-of-sentence marker); all other lines (for
    example ``#Text=`` lines) are dropped. Rows whose token consists only of
    non-breaking spaces (in the variants listed below) are removed afterwards.
    The index is not reset, so removed rows leave gaps in it.

    Args:
        path: location of the TSV file (read as UTF-8).
        verbose: accepted for backward compatibility; currently has no effect.

    Returns:
        DataFrame with the columns 'tsv-index', 'range', 'token', 'FIM-Tag',
        'Prefix', 'Suffix' and 'Newline'.
    """
    columns = ['tsv-index', 'range', 'token', 'FIM-Tag', 'Prefix', 'Suffix', 'Newline']
    # tokens made of non-breaking spaces (several variants found during validation)
    nbsp_only_tokens = [
        '\u00a0',
        '\u00a0\u00a0',
        '\u00a0\u00a0\u00a0',
        '\u00a0\u00a0\u00a0\u00a0',
        '\u00a0\u00a0\u00a0\u00a0\u00a0',
        '\u00a0\u0020\u00a0',
        '\u00a0\u00a0\u0020\u00a0',
    ]

    # the with-statement closes the file; no explicit close() is needed
    with open(path, 'r', encoding='utf8') as f:
        data = f.read()

    data_split = []
    for row in data.split('\n'):
        cells = row.split("\t")
        if len(cells) == len(columns):
            # actual data
            data_split.append(cells)
        elif not row:
            # end of sentence
            data_split.append(["\n"] * len(columns))
    df = pd.DataFrame(data_split, columns=columns)

    # one boolean mask instead of seven chained filters
    rows_before_space_fixup = len(df)
    df = df[~df['token'].isin(nbsp_only_tokens)]
    removed_rows_count = rows_before_space_fixup - len(df)

    # PRINT_REMOVED_ROWS is an optional module-level switch; looking it up via
    # globals() keeps the function usable when the switch is not defined
    # (for example when the module is imported instead of run as a script).
    if removed_rows_count != 0 and globals().get("PRINT_REMOVED_ROWS", False):
        log(f"removed {removed_rows_count} rows (tokens) containing only non-breaking space (possibly combined with space)")

    return df


def log(log):
    """Append a timestamped line to ``conversion.log`` in the working directory.

    Args:
        log: message text; it is written after a ``YYYY-MM-DD HH:MM:SS``
            timestamp and followed by a newline. (The parameter shares its
            name with the function; it is kept for backward compatibility.)
    """
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # Messages contain tokens read from UTF-8 files, so the log is written as
    # UTF-8 too instead of the platform default encoding. The with-statement
    # closes the file.
    with open('conversion.log', 'a', encoding='utf8') as f:
        f.write(timestamp + " " + log + "\n")


def xfix_validation(suffix):
    """Clean up a prefix or suffix string and return the cleaned value.

    A value containing a space is replaced by the empty string. A value
    containing "[" gets a trailing "[<digits>]" marker stripped. Both cases
    are written to the log; the log messages always say "Suffix".
    """
    cleaned = suffix

    # edge case during annotation that is not wanted: discard the whole value
    if " " in cleaned:
        log(f"Suffix \"{cleaned}\" has a space in the suffix, setting it to empty string")
        cleaned = ""

    # nothing bracket-like left to repair
    if "[" not in cleaned:
        return cleaned

    # values can carry a trailing [1] etc.; only a marker at the very end is removed
    without_marker = re.sub(r'\[\d+\]$', '', cleaned)
    log(f"Suffix \"{cleaned}\" has a [x] in the suffix, removing it -> \"{without_marker}\"")
    return without_marker

def add_prefix_and_suffix_to_token(df, index):
    """Return the token of row ``index`` with its prefix and suffix attached.

    The 'Prefix' and 'Suffix' cells are used unless they are "_" or start
    with "*". When a suffix is present, a trailing "-" of the token is
    dropped before the parts are joined. Prefix and suffix are cleaned with
    ``xfix_validation`` before joining.

    Args:
        df: DataFrame with the columns 'token', 'Prefix' and 'Suffix'.
        index: index label of the row to process.

    Returns:
        The restored full token, or -1 if the token is invalid, i.e. the
        joined result is identical to the cleaned prefix or suffix.
    """
    token = df["token"][index]
    raw_prefix = df["Prefix"][index]
    raw_suffix = df["Suffix"][index]

    # "_" and values starting with "*" are not treated as a usable prefix/suffix
    prefix = "" if raw_prefix == "_" or raw_prefix.startswith("*") else raw_prefix
    suffix = "" if raw_suffix == "_" or raw_suffix.startswith("*") else raw_suffix

    # if a suffix is present, the token often ends with a dash: remove it.
    # endswith() is also safe for an empty token (token[-1] would raise).
    if suffix and token.endswith("-"):
        token = token[:-1]

    # suffix first, then prefix, so log entries keep their order
    suffix = xfix_validation(suffix)
    prefix = xfix_validation(prefix)

    complete_token = prefix + token + suffix

    # remaining special cases that only concern the full token
    if (complete_token == suffix) or (complete_token == prefix):
        # this does not remove empty tokens used for line separation (handled in df_to_conll)
        log(f"Complete token \"{complete_token}\" is equal to the prefix or suffix (\"{prefix}\", \"{suffix}\"), skipping it (likely no valid word or redundant)")
        return -1

    # The debug switches are optional module-level flags; globals().get keeps
    # the function usable when they are not defined (module imported instead
    # of run as a script).
    print_debug = globals().get("PRINT_SUFFIX_PREFIX_DEBUG", False)
    if complete_token != token and print_debug:
        # detailed information in case this function made any changes
        print(f"{prefix:<20} + {token:<20} + {suffix:<20} = {complete_token}")
        if globals().get("PRINT_UNICODE_OF_TOKEN", False):
            # build the code point dump only when it is actually printed
            uni_string = "".join(f"\\u{ord(char):04x}" for char in token)
            print(f"Unicode character of token: {uni_string}")
    return complete_token

# returns -1 if the token should be skipped
def wrapper_token_and_label_extraction(df, i, string, tag_before):
    """Build the CoNLL line for an annotated row and substitute old tag names.

    Args:
        df: DataFrame holding the converted TSV rows.
        i: index label of the row to process.
        string: ignored; the line is always rebuilt from scratch.
        tag_before: the raw first annotation (including its "[n]" number, if
            any) of the previously processed row.

    Returns:
        -1 if ``add_prefix_and_suffix_to_token`` rejects the token, otherwise
        a tuple ``(line, raw_annotation)`` where ``line`` is
        "<token> <B-|I-><tag>\\n" and ``raw_annotation`` is the first
        "|"-separated entry of the 'FIM-Tag' cell, number included.
    """
    # old tag names and the names they are replaced with
    renamed_tags = {
        "Ressource": "Dokument",
        "Entscheidungsfrist": "Frist",
        "Abgabefrist": "Frist",
    }

    # token is the word in the sentence
    token = add_prefix_and_suffix_to_token(df, i)
    if token == -1:
        return -1

    # df["FIM-Tag"][i] is the tag plus "[2]" or similar; only the first of
    # several "|"-separated annotations is used
    only_bigger_annotation = df["FIM-Tag"][i].split("|")[0]
    tag = re.sub(r'\[[0-9]*\]', '', only_bigger_annotation)

    # The raw value (with number) is compared to decide whether this row
    # continues the previous annotation.
    # NOTE(review): two adjacent annotations of the same type without a
    # number would also compare equal here — confirm this is intended.
    iob_marker = "I-" if tag_before == only_bigger_annotation else "B-"

    # string is the complete conll line
    string = token + " " + iob_marker + renamed_tags.get(tag, tag) + "\n"

    # the caller needs the raw annotation to detect a new annotation
    return string, only_bigger_annotation


def df_to_conll(df, filename_simple):
    """Write the rows of ``df`` as CoNLL lines to ``filename_simple``.

    Rows are processed in index order:
    - marker rows ('FIM-Tag' equal to "\\n") produce a blank line, or an
      empty string if nothing has been collected yet;
    - rows whose token is a single space are skipped;
    - annotated rows ('FIM-Tag' not "_") are formatted by
      ``wrapper_token_and_label_extraction``;
    - all other rows become "<token> O".
    Rows rejected by the helper functions (return value -1) are skipped.

    Args:
        df: DataFrame as produced by ``webanno_tsv_to_df``.
        filename_simple: path of the output file (written as UTF-8).
    """
    conll_lines = []
    tag_before = ""

    for i in df.index:
        fim_tag = df["FIM-Tag"][i]

        if fim_tag == "\n":
            # NOTE(review): this inspects the FIRST collected line, not the
            # last one, so consecutive blank lines are not collapsed —
            # confirm whether [-1] was intended.
            if conll_lines and conll_lines[0] != "\n":
                line = "\n"
            else:
                line = ""
        elif df["token"][i] == " ":
            continue
        elif fim_tag != "_":
            extracted = wrapper_token_and_label_extraction(df, i, "", tag_before)
            if extracted == -1:
                # skip empty tokens detected by wrapper_token_and_label_extraction
                continue
            line, tag_before = extracted
        else:
            full_token = add_prefix_and_suffix_to_token(df, i)
            if full_token == -1:
                # skip empty tokens detected by add_prefix_and_suffix_to_token
                continue
            line = full_token + " O\n"
            tag_before = ""

        if line == " O\n":
            # empty token
            continue
        conll_lines.append(line)

    # writing data to the file
    with open(filename_simple, "w", encoding='utf8') as out_file:
        out_file.writelines(conll_lines)

# Debug/logging switches read as globals by the conversion functions.
# They live at module level (not inside the __main__ guard) so the functions
# also work when this file is imported as a module.
PRINT_REMOVED_ROWS = True          # log how many non-breaking-space rows were removed
PRINT_SUFFIX_PREFIX_DEBUG = True   # print every token changed by a prefix/suffix
PRINT_UNICODE_OF_TOKEN = False     # additionally print the token's code points

if __name__ == '__main__':
    # command line entry point: convert the given input directory
    parser = argparse.ArgumentParser(
                    prog='convert_tsv_to_conll_NAP_2.py',
                    description='After downloading an annotation project from the INCEpTION platform, we use the annotation directory to extract the annotation from the tsv files and convert them to short span IOB2 annotations. This program was specifically directed at the conversion of Normenanalyse 2 project and its annotation scheme.',
                    epilog='May the Force be with you!')
    parser.add_argument('-i', '--input_dir', required=True, help="input directory location")
    parser.add_argument('-o', '--output_dir', required=True, help="path for output directory")
    args = parser.parse_args()
    print(args)

    # example values:
    #   input_dir  "~/Documents/Corpus_data/Data_Project_2_31_1_24/annotation"
    #   output_dir "~/Documents/Corpus_data/converted_data"
    path_source = args.input_dir
    path_dest = args.output_dir
    webanno_tsv_to_conll(path_source, path_dest)
