import os
import pandas as pd
import argparse

"""Program for converting a directory of webanno tsv files to conll files.
Modified for the annotation campaign Canaréno and the Normenanalyse 2 project.
It creates the short-span version of the annotations (IOB2 tags).
"""


def webanno_tsv_to_conll(path, output_path, verbose=False):
    """Convert every WebAnno TSV file found below ``path`` to a CoNLL file.

    The tree rooted at ``path`` is walked with ``os.walk``. Every file except
    ``INITIAL_CAS.tsv`` and ``.DS_Store`` is parsed with
    ``webanno_tsv_to_df`` and written with ``df_to_conll`` to
    ``<output_path>/<name of the directory holding the file>.conll``.
    Several converted files in the same directory therefore overwrite each
    other's output.

    Args:
        path: root directory that is searched recursively for input files.
        output_path: directory receiving the ``.conll`` files; it is created
            if it does not exist.
        verbose: forwarded to ``webanno_tsv_to_df``.
    """
    skipped_names = ("INITIAL_CAS.tsv", ".DS_Store")
    annotators = {}  # file name -> number of converted files with that name

    for dirpath, dirnames, filenames in os.walk(path):
        if dirnames:
            print("Got following directory names: ")
            print(dirnames)

        # Created inside the loop (not before the walk) so that an output
        # directory located below ``path`` is not picked up by the listing
        # of the directory that is currently being visited.
        os.makedirs(output_path, exist_ok=True)

        # Derive the output name from the directory itself instead of from a
        # running index into a previously seen ``dirnames`` list; this also
        # works for files directly in ``path`` and for nested directories.
        doc_name = os.path.basename(os.path.abspath(dirpath))

        for filename in filenames:
            if filename in skipped_names:
                print(f"skipped '{filename}' file")
                continue

            filepath = os.path.join(dirpath, filename)
            annotators[filename] = annotators.get(filename, 0) + 1

            df = webanno_tsv_to_df(filepath, verbose=verbose)
            save_path = os.path.join(output_path, doc_name + ".conll")
            df_to_conll(df, save_path)
            print(f"Added file {filepath}")

    print(annotators)
    print("Finished")


def webanno_tsv_to_df(path, verbose=False):
    """Read one WebAnno TSV file into a DataFrame of tokens and annotations.

    A line is kept as a token row when its first tab-separated field contains
    a ``-`` and the line does not start with ``#Text``, ``#T_SP``,
    ``#FORMAT`` or ``#T_RL``. Every empty line is kept as a marker row whose
    only field is ``"\\n"``; all other columns of such a row are missing,
    including ``Text``.

    The token text is taken from column 2. For each known annotation class
    the first other column containing the class name is taken as that
    class's column; classes that occur nowhere get no column. Rows whose
    text consists only of (wrongly tokenized) non-breaking spaces are dropped.

    Args:
        path: path of the TSV file (read as UTF-8).
        verbose: if true, print the detected class-to-column mapping.

    Returns:
        DataFrame with the column ``Text`` followed by one column per
        detected class, in the order of the class list below.

    Raises:
        ValueError: if one column was detected for two or more classes.
    """
    classes = ['Aktion', 'Bedingung', 'Dokument',
               'Ergebnisempfänger', 'Frist', 'Handlungsgrundlage',
               'Hauptakteur', 'Mitwirkender', 'Signalwort']
    header_prefixes = ("#Text", "#T_SP", "#FORMAT", "#T_RL")
    text_col = 2  # position of the token text in a token row

    with open(path, 'r', encoding='utf8') as f:
        data = f.read()

    data_split = []
    for row in data.split('\n'):
        fields = row.split("\t")
        if "-" in fields[0]:
            # only rows with info (tokens) should come through here
            if not row.startswith(header_prefixes):
                data_split.append(fields)
        elif not row:
            # an empty line; kept as a marker row
            data_split.append(["\n"])

    df = pd.DataFrame(data_split)

    # Identify which column holds which class. Some classes may not occur
    # in a file, in which case there is no column for them.
    class_cols = {"Text": text_col}
    for cls in classes:
        for i in range(len(df.columns)):
            if i == text_col:
                # a token may literally be a class name; never match on text
                continue
            # literal match; missing cells count as "no match"
            if df[i].str.contains(cls, regex=False, na=False).any():
                class_cols[cls] = i
                break

    assigned_cols = [col for name, col in class_cols.items() if name != "Text"]
    if len(assigned_cols) != len(set(assigned_cols)):
        # one column was assigned to 2 or more classes
        raise ValueError(
            f"column-class mapping ambiguous in {path}: {class_cols}")

    if verbose:
        print("Working path: ", path)
        print("Identified the following class-column mapping:")
        print(class_cols)

    # dicts keep insertion order, so names and positions stay aligned
    name_list = list(class_cols.keys())
    keep_cols = list(class_cols.values())
    if verbose:
        print("Keeping columns at positions: ", keep_cols)
        print("Setting column names: ", name_list)

    # keep only columns that correspond to classes, assign proper names
    df = df[keep_cols]
    df.columns = name_list

    # remove essentially empty rows that contain only non-breaking spaces
    # (optionally with one regular space), a tokenization artefact
    nbsp_only_tokens = ['\u00a0' * n for n in range(1, 6)]
    nbsp_only_tokens += ['\u00a0\u0020\u00a0', '\u00a0\u00a0\u0020\u00a0']
    df = df[~df['Text'].isin(nbsp_only_tokens)]

    return df


def check_annotation_info_present(row):
    """Return the annotation values present in one token row.

    For each known class the cell ``row[<class>]`` is looked up. A cell
    counts as an annotation when it is a string that, after stripping
    whitespace, is neither empty nor one of the placeholders ``_`` and ``*``.

    Args:
        row: mapping-like object indexed by class name (e.g. a DataFrame
            row); classes without a column may simply be absent.

    Returns:
        List of the stripped annotation strings, in the order of the class
        list below.
    """
    tags = ('Aktion', 'Bedingung', 'Dokument', 'Ergebnisempfänger', 'Frist',
            'Handlungsgrundlage', 'Hauptakteur', 'Mitwirkender', 'Signalwort')
    placeholders = ("", "_", "*")

    tags_present = []
    for tag in tags:
        try:
            content = row[tag]
        except KeyError:
            # that column does not exist in this dataframe
            # because it was not present in annotated data
            continue
        if not isinstance(content, str):
            # missing cell (None or NaN); NaN is truthy and has no strip()
            continue
        content = content.strip()
        if content not in placeholders:
            tags_present.append(content)

    # VERBOSE is only defined when the file runs as a script; default to
    # quiet so the function also works when the module is imported.
    if globals().get("VERBOSE", False):
        print("found tags in row:", tags_present)
    return tags_present


def df_to_conll(df, output_path):
    """Write the token DataFrame ``df`` to ``output_path`` as tagged lines.

    Each token row becomes one line ``<text> <tag>`` where the tag is ``O``,
    ``B-<class>`` or ``I-<class>``. A row whose ``Text`` is missing marks a
    sentence boundary and becomes one empty line; leading and repeated
    boundaries are collapsed.

    Annotation values are obtained with ``check_annotation_info_present``.
    The class is the part of a value before the first ``-``; the substrings
    ``Start``, ``Core`` and ``End`` drive a small state machine:

    * outside a span, ``Start`` opens a span (``B-``), ``Core`` without
      ``Start`` tags a single token (``B-``, or ``I-`` when it directly
      continues a ``Handlungsgrundlage`` token);
    * inside a span every token is tagged ``I-<open class>``; the span is
      closed by an ``End`` of the same class unless the next row also
      carries an ``End`` of that class.

    Args:
        df: DataFrame with a ``Text`` column and optional class columns.
        output_path: path of the file to write (UTF-8).
    """
    # Both flags are only defined when the file runs as a script; fall back
    # to the script's values so the function also works after an import.
    verbose = globals().get("VERBOSE", False)
    fail_silently = globals().get("FAIL_SILENTLY", True)

    lines = []

    within_annotation = False      # True while a Start..End span is open
    cw_class = ""                  # class of the currently open span
    previous_token_tag = "O"       # class of the last tagged token, or "O"
    last_token_was_newline = True  # suppresses leading/double empty lines

    for i in range(0, len(df)):
        error_printed_txt = False

        if verbose:
            print("=====================================")
            print("at row: ", i)
            print("currently_in_annotation: ", within_annotation)
            print("cw_class: ", cw_class)

        curr_row = df.iloc[i]
        token_text = curr_row["Text"]

        if pd.isna(token_text):
            # A missing text is how empty TSV lines appear in the frame, i.e.
            # the end of a sentence. Covers both None and NaN.
            if not last_token_was_newline:
                lines.append("\n")
                last_token_was_newline = True
            continue

        # after any non-newline token, a newline is allowed by default
        last_token_was_newline = False

        extracted_tags = check_annotation_info_present(curr_row)

        extracted_tag = False  # default, if no tag is present

        if len(extracted_tags) > 1:
            # several annotations on one token: decide which one to keep
            if within_annotation:
                # keep the one that belongs to the open span, if any
                for el in extracted_tags:
                    if el.split("-")[0] == cw_class:
                        extracted_tag = el
                        break
            else:
                # prefer Start over Core: take the first Core, then let the
                # first Start overwrite it
                # note: this breaks up Handlungsgrundlagen Core sequences
                for el in extracted_tags:
                    if "Core" in el:
                        extracted_tag = el
                        break
                for el in extracted_tags:
                    if "Start" in el:
                        extracted_tag = el
                        break
                for el in extracted_tags:
                    if "End" in el:
                        print("Special unhandled error (surprise)")
        elif len(extracted_tags) == 1:
            extracted_tag = extracted_tags[0]

        if verbose:
            print("decided on tag:", extracted_tag)

        if within_annotation:
            # Every token inside an open span is tagged I-<class>; the
            # branches below only decide whether the span closes here.
            tag = "I-" + cw_class
            if extracted_tag and "End" in extracted_tag:
                if extracted_tag.split("-")[0] != cw_class:
                    # End of a different class: ignore it and stay in the
                    # span of the current class
                    if verbose:
                        print("i, curr_row, cw_class, extracted_tag:")
                        print(i, curr_row, cw_class, extracted_tag)
                else:
                    # If the next row also has an End of this class the span
                    # is kept open (fixes an annotation issue).
                    next_token_same_class_and_end = False
                    if i < len(df) - 1:
                        nxt_tags = check_annotation_info_present(df.iloc[i + 1])
                        for el in nxt_tags:
                            if "End" in el and el.split("-")[0] == cw_class:
                                next_token_same_class_and_end = True
                                break

                    if not next_token_same_class_and_end:
                        within_annotation = False
                        cw_class = ""
            elif extracted_tag:
                # some tag exists but is not an End: report it and keep the
                # outer annotation
                error_msg = "| Error: currently in annotation mode"
                error_msg += f", but non-matching tag: {extracted_tag} found"
                error_msg += f", expected END tag for: {cw_class}. handling this by keep outer annotation."
                print(error_msg)
            # else: plain token inside the span, nothing more to do
        else:
            if extracted_tag:
                if "Core" in extracted_tag:
                    cw_class = extracted_tag.split("-")[0]
                    tag = "B-" + cw_class
                    if "Start" in extracted_tag:
                        # Start together with Core: opens a span that runs
                        # until an explicit End
                        within_annotation = True
                    else:
                        # Core without Start -> single token annotation
                        if previous_token_tag == "Handlungsgrundlage" and cw_class == "Handlungsgrundlage":
                            tag = "I-" + cw_class

                        within_annotation = False
                        cw_class = ""
                elif "Start" in extracted_tag:
                    # Start without Core: opens a span that runs until an
                    # explicit End
                    cw_class = extracted_tag.split("-")[0]
                    tag = "B-" + cw_class
                    within_annotation = True
                elif "End" in extracted_tag:
                    if fail_silently:
                        tag = "O"
                    else:
                        error_msg = "| Error: Encountered END token, this should not happen when not in annotation mode (cannot end it)"
                        lines.append(token_text + " " + "XXX " + error_msg + "\n")
                        error_printed_txt = True
                else:
                    if fail_silently:
                        tag = "O"
                    else:
                        error_msg = f"| Error: unknown situation, found: {extracted_tag}"
                        lines.append(token_text + " " + "XXX " + error_msg + "\n")
                        error_printed_txt = True
            else:
                # not in annotation mode and no new annotation starts
                tag = "O"
                previous_token_tag = "O"

        if not error_printed_txt:
            lines.append(token_text + " " + tag + "\n")
            # remember the class assigned to this token for the
            # Handlungsgrundlage special case
            if tag != "O":
                previous_token_tag = tag.split("-")[1]

    with open(output_path, "w", encoding='utf8') as file:
        file.writelines(lines)


if __name__ == '__main__':
    # Conversion flags; the conversion functions read these as module globals.
    VERBOSE = False
    FAIL_SILENTLY = True

    # Command line: one required input and one required output directory.
    cli = argparse.ArgumentParser(
        prog='convert_tsv_to_conll_NAP_0.py',
        description='After downloading an annotation project from the INCEpTION platform, we use the annotation directory to extract the annotation from the tsv files and convert them to short span IOB2 annotations. This program was specifically directed at the conversion of Normenanalyse 0.1 project and its annotation scheme.',
        epilog='May the Force be with you!',
    )
    cli.add_argument('-i', '--input_dir', required=True,
                     help="input directory location")
    cli.add_argument('-o', '--output_dir', required=True,
                     help="path for output directory")
    options = cli.parse_args()
    print(options)

    webanno_tsv_to_conll(options.input_dir, options.output_dir, VERBOSE)

    # Example usage:
    #  python3 convert_tsv_to_conll_NAP_0.py -i ./project_0_curation -o ./project_0_out
