import os
import pandas as pd
import re
import argparse
import sys

"""Program for converting a directory of webanno tsv files to conll files. Modified for annotation campaign Canaréno and Normenanalyse 2 project.
It creates the short-span version with ... (TODO: describe afterwards, e.g. the handling of ellipses, prefixes, suffixes etc.).
"""

def webanno_tsv_to_conll(path, output_path):
    """Convert every WebAnno TSV file below ``path`` into a CoNLL file.

    The directory tree under ``path`` is walked with ``os.walk``. Each file
    that is not ``INITIAL_CAS.tsv`` or ``.DS_Store`` is parsed with
    ``webanno_tsv_to_df`` and written with ``df_to_conll`` to
    ``<output_path>/Documents_simple/<directory name>.conll``, where
    ``<directory name>`` is the name of the directory containing the file.

    Args:
        path: Root directory of the exported annotation files.
        output_path: Directory in which ``Documents_simple`` is created.
    """
    skipped_files = ("INITIAL_CAS.tsv", ".DS_Store")
    # Join with a real path separator so a missing trailing slash in
    # ``output_path`` does not produce e.g. "outDocuments_simple".
    output_dir = os.path.join(output_path, "Documents_simple")
    os.makedirs(output_dir, exist_ok=True)

    annotators = {}  # file name -> number of times that file name was seen
    for dirpath, dirnames, filenames in os.walk(path):
        if dirnames:
            print("Got following directory names: ")
            print(dirnames)
        # Derive the output name from the directory itself instead of a
        # running index into the last seen ``dirnames`` list, which goes
        # wrong for files in the root directory or for nested directories.
        document_name = os.path.basename(os.path.abspath(dirpath))
        simple_name = os.path.join(output_dir, document_name + ".conll")
        # NOTE(review): every file of one directory is written to the same
        # output file, so with several files per directory the last one
        # processed wins -- confirm this is intended.
        for filename in filenames:
            if filename in skipped_files:
                print(f"skipped '{filename}' file")
                continue
            filepath = os.path.join(dirpath, filename)
            annotators[filename] = annotators.get(filename, 0) + 1
            df = webanno_tsv_to_df(filepath)
            df_to_conll(df, simple_name)
            print("Added file "+ filepath)
    print(annotators)
    print("Finished")

def webanno_tsv_to_df(path, verbose = False):
    """Read a WebAnno TSV file into a pandas DataFrame.

    Lines with exactly seven tab-separated fields become token rows. Empty
    lines become a marker row whose seven cells each hold a single newline
    character (end of sentence). All other lines are dropped; with
    ``verbose`` the ones starting with ``#Text=`` are printed.

    Rows whose token consists only of non-breaking spaces (in the listed
    combinations, possibly mixed with a regular space) are removed. The
    original row labels are kept, so the index may contain gaps.

    Args:
        path: Path of the TSV file, read as UTF-8.
        verbose: Print the ``#Text=`` lines while parsing.

    Returns:
        DataFrame with the columns 'tsv-index', 'range', 'token', 'FIM-Tag',
        'Prefix', 'Suffix' and 'Newline'.
    """
    columns = ['tsv-index', 'range', 'token', 'FIM-Tag', 'Prefix', 'Suffix', 'Newline']
    data_split = []
    with open(path, 'r', encoding='utf8') as f:
        for row in f.read().split('\n'):
            fields = row.split("\t")
            if len(fields) == len(columns):
                # actual data
                data_split.append(fields)
            elif not row:
                # end of sentence
                data_split.append(["\n"] * len(columns))
            elif verbose and row.startswith("#Text="):
                print(row)
    df = pd.DataFrame(data_split, columns=columns)

    # Tokens that only consist of non-breaking space (optionally with a space).
    nbsp_only_tokens = [
        '\u00a0',
        '\u00a0\u00a0',
        '\u00a0\u00a0\u00a0',
        '\u00a0\u00a0\u00a0\u00a0',
        '\u00a0\u00a0\u00a0\u00a0\u00a0',
        '\u00a0\u0020\u00a0',
        '\u00a0\u00a0\u0020\u00a0',
    ]
    rows_before_space_fixup = len(df)
    df = df[~df['token'].isin(nbsp_only_tokens)]
    removed_rows_count = rows_before_space_fixup - len(df)

    # PRINT_REMOVED_ROWS is only assigned when the file runs as a script;
    # look it up defensively so importing this module cannot raise NameError.
    if removed_rows_count != 0 and globals().get("PRINT_REMOVED_ROWS", False):
        print(f"removed {removed_rows_count} rows (tokens) containing only non-breaking space (possibly combined with space)")

    return df

def add_prefix_and_suffix_to_token(df, index):
    """Return the token of row ``index`` with its prefix and suffix attached.

    A 'Prefix' or 'Suffix' cell is ignored when it is "_" or starts with "*".
    When a suffix is attached, one trailing "-" of the token is dropped first.

    Args:
        df: DataFrame with the columns 'token', 'Prefix' and 'Suffix'.
        index: Row label to look up.

    Returns:
        The concatenation prefix + token + suffix.
    """
    token = df["token"][index]
    prefix = df["Prefix"][index]
    suffix = df["Suffix"][index]
    if prefix == "_" or prefix.startswith("*"):
        prefix = ""
    if suffix == "_" or suffix.startswith("*"):
        suffix = ""
    # endswith() instead of token[-1] so an empty token cannot raise IndexError.
    if suffix and token.endswith("-"):
        token = token[:-1]
    return prefix + token + suffix

def wrapper_token_and_label_extraction(df, i, string, tag_before):
    """Build the CoNLL line for an annotated token and report its annotation.

    Only the first of several "|"-separated annotations in 'FIM-Tag' is used.
    Its bracketed number is stripped to get the label, and some old label
    names are replaced by their current ones. The line gets the "I-" marker
    when the raw annotation equals ``tag_before``, otherwise "B-".

    Args:
        df: DataFrame holding the token rows.
        i: Row label of the token.
        string: Ignored; kept for the callers' sake.
        tag_before: Raw annotation returned for the preceding token.

    Returns:
        Tuple of the line "<token> <B-|I-><label>" ending in a newline and
        the raw first annotation (still carrying its bracketed number).
    """
    # old label name -> label name written to the CoNLL file
    renamed_labels = {
        "Ressource": "Dokument",
        "Entscheidungsfrist": "Frist",
        "Abgabefrist": "Frist",
    }
    word = add_prefix_and_suffix_to_token(df, i)
    outer_annotation = df["FIM-Tag"][i].split("|")[0]
    label = re.sub(r'\[[0-9]*\]', '', outer_annotation)
    label = renamed_labels.get(label, label)
    # The comparison uses the raw annotation including its number.
    # NOTE(review): annotations without a number compare equal as well, so two
    # adjacent tokens carrying the same bare label are joined -- confirm.
    marker = "I-" if tag_before == outer_annotation else "B-"
    return word + " " + marker + label + "\n", outer_annotation


def df_to_conll(df, filename_simple):
    """Write the token rows of ``df`` as a CoNLL file.

    Every token becomes one line "<token> <label>"; tokens whose 'FIM-Tag' is
    "_" get the label "O". Sentence marker rows ('FIM-Tag' is a newline
    character) become a single blank line. Rows whose token is a single space
    and tokens that end up empty are skipped.

    Args:
        df: DataFrame as produced by ``webanno_tsv_to_df``.
        filename_simple: Path of the output file, written as UTF-8.
    """
    lines = []
    tag_before = ""
    for i in df.index:
        fim_tag = df["FIM-Tag"][i]
        if fim_tag == "\n":
            # A new sentence must not continue the previous annotation,
            # otherwise it could start with an "I-" label.
            tag_before = ""
            # Compare with the LAST written line so that neither leading nor
            # repeated blank lines end up in the file.
            if lines and lines[-1] != "\n":
                lines.append("\n")
            continue
        if df["token"][i] == " ":
            continue
        if fim_tag != "_":
            line, tag_before = wrapper_token_and_label_extraction(df, i, "", tag_before)
        else:
            line = add_prefix_and_suffix_to_token(df, i) + " O\n"
            tag_before = ""
        if line != " O\n":  # empty token
            lines.append(line)
    with open(filename_simple, "w", encoding='utf8') as file1:
        file1.writelines(lines)

if __name__ == '__main__':
    # Module-level switch read by webanno_tsv_to_df: report how many rows
    # were dropped during the non-breaking-space clean-up.
    PRINT_REMOVED_ROWS = True

    parser = argparse.ArgumentParser(
        prog='convert_tsv_to_conll_NAP_2.py',
        description='After downloading an annotation project from the INCEpTION platform, we use the annotation directory to extract the annotation from the tsv files and convert them to short span IOB2 annotations. This program was specifically directed at the conversion of Normenanalyse 2 project and its annotation scheme.',
        epilog='May the Force be with you!',
    )
    # Both directories are mandatory.
    parser.add_argument(
        '-i', '--input_dir',
        required=True,
        help="input directory location",
    )
    parser.add_argument(
        '-o', '--output_dir',
        required=True,
        help="path for output directory",
    )
    args = parser.parse_args()
    print(args)

    # e.g. "~/Documents/Corpus_data/Data_Project_2_31_1_24/annotation"
    path_source = args.input_dir
    # e.g. "~/Documents/Corpus_data/converted_data"
    path_dest = args.output_dir
    webanno_tsv_to_conll(path_source, path_dest)
