import os
import pandas as pd 
import re
import sys

"""short program for converting a directory of webanno tsv files to conll files. Modified for annotation campaign Canaréno. 
It will create a long and a short version.
"""

def webanno_tsv_to_conll(path, output_path):
    """Convert every file found below ``path`` into two CoNLL files.

    The directory tree rooted at ``path`` is walked with ``os.walk``. Each
    file is parsed with ``webanno_tsv_to_df`` and written twice by
    ``df_to_conll``:

    * ``<output_path>/Documents_simple/<dir><filename>_short.conll``
    * ``<output_path>/Documents_enriched/<dir><filename>_long.conll``

    ``<dir>`` is the name of the directory that directly contains the file;
    it is empty for files located directly in ``path``.

    Args:
        path: Root directory containing the WebAnno TSV export.
        output_path: Directory in which the two output folders are created.

    Returns:
        None. Progress messages are printed to stdout.
    """
    # Join with os.path.join so that ``output_path`` works with or without a
    # trailing path separator.
    simple_dir = os.path.join(output_path, "Documents_simple")
    enriched_dir = os.path.join(output_path, "Documents_enriched")
    # Create the output folders up front; they are needed even when the
    # input directory has no sub-directories.
    os.makedirs(simple_dir, exist_ok=True)
    os.makedirs(enriched_dir, exist_ok=True)

    for dirpath, dirnames, filenames in os.walk(path):
        if dirnames:
            print("Got following directory names: ")
            print(dirnames)

        # Derive the file name prefix from the directory currently being
        # visited instead of from a running counter, so it stays correct for
        # root-level files, nested directories and empty directory lists.
        relative_dir = os.path.relpath(dirpath, path)
        if relative_dir == os.curdir:
            dir_prefix = ""
        else:
            dir_prefix = os.path.basename(relative_dir)

        for filename in filenames:
            filepath = os.path.join(dirpath, filename)
            df = webanno_tsv_to_df(filepath)

            if df is None:
                print("Could not convert file: ", filepath)
                continue

            simple_name = os.path.join(simple_dir, dir_prefix + filename + "_short.conll")
            enriched_name = os.path.join(enriched_dir, dir_prefix + filename + "_long.conll")
            df_to_conll(df, simple_name, enriched_name)
            print("Added file "+ filepath+ " to simple and enriched corpus")
    print("Finished")

def webanno_tsv_to_df(path, verbose = False):
    """Read one WebAnno TSV file into a pandas DataFrame.

    Only lines that split into 8 or 9 tab-separated fields are treated as
    token rows. Every empty line becomes a separator row whose cells all
    contain ``"\\n"`` (used downstream as the end-of-sentence marker). All
    other lines (e.g. header and ``#Text=`` lines) are dropped.

    Args:
        path: Path of the TSV file to read (decoded as UTF-8).
        verbose: If true, print every ``#Text=`` line that is encountered.

    Returns:
        A DataFrame with the columns 'tsv-index', 'range', 'token',
        'FIM-Tag', 'Prefix', 'span-type', 'Suffix', 'Relation1' and
        'Relation2', or ``None`` if the file could not be opened or decoded.
    """
    columns = ['tsv-index', 'range', 'token', 'FIM-Tag', 'Prefix', 'span-type', 'Suffix', 'Relation1', 'Relation2' ]
    # Opening the file is part of the guarded section so that a missing or
    # unreadable file yields None (which the caller handles) instead of a crash.
    try:
        with open(path, 'r', encoding='utf8') as f:
            data = f.read()
    except (OSError, UnicodeDecodeError):
        print("Could not read file: ", path)
        return None

    data_split = []
    for row in data.split('\n'):
        fields = row.split("\t")
        if len(fields) in (8, 9):
            # actual data; pad 8-field rows so every row has one cell per
            # column (otherwise a file consisting only of 8-field rows makes
            # the DataFrame constructor fail)
            data_split.append(fields + [None] * (len(columns) - len(fields)))
        elif not row:
            # end of sentence
            data_split.append(["\n"] * len(columns))
        elif verbose and row.startswith("#Text="):
            print(row)
    return pd.DataFrame(data_split, columns=columns)

def add_prefix_and_suffix_to_token(df, index):
    """Return the token of row ``index`` with its annotated prefix/suffix attached.

    A 'Prefix' or 'Suffix' cell is ignored when it equals ``"_"`` or starts
    with ``"*"`` (presumably the TSV placeholders for "no value" - confirm
    against the export format). When a suffix is attached and the token ends
    with a hyphen, that trailing hyphen is removed first.

    Args:
        df: Table providing the columns 'token', 'Prefix' and 'Suffix'.
        index: Row label of the token.

    Returns:
        The concatenation prefix + token + suffix as a string.
    """
    token = df["token"][index]
    prefix = df["Prefix"][index]
    suffix = df["Suffix"][index]
    if prefix == "_" or prefix.startswith("*"):
        prefix = ""
    if suffix == "_" or suffix.startswith("*"):
        suffix = ""
    # endswith() instead of token[-1] so that an empty token with a suffix
    # does not raise an IndexError.
    if suffix and token.endswith("-"):
        token = token[:-1]
    return prefix + token + suffix

def wrapper_token_and_label_extraction(df, i, string, tag_before):
    """Build the CoNLL line for the tagged token in row ``i``.

    The label is the 'FIM-Tag' cell with every ``[<digits>]`` part removed;
    the old label names "Ressource", "Entscheidungsfrist" and "Abgabefrist"
    are replaced by "Dokument", "Frist" and "Frist". The line gets the "I-"
    marker when the raw 'FIM-Tag' cell equals ``tag_before`` and "B-" otherwise.

    Args:
        df: Table providing 'token', 'Prefix', 'Suffix' and 'FIM-Tag'.
        i: Row label of the token.
        string: Ignored; the returned line is always built from scratch.
        tag_before: Raw 'FIM-Tag' value of the previously tagged token.

    Returns:
        A tuple ``(line, raw_tag)`` with the line ``"<token> <B-|I-><label>\\n"``
        and the unmodified 'FIM-Tag' cell of row ``i``.
    """
    renamed_labels = {
        "Ressource": "Dokument",
        "Entscheidungsfrist": "Frist",
        "Abgabefrist": "Frist",
    }
    full_token = add_prefix_and_suffix_to_token(df, i)
    raw_tag = df["FIM-Tag"][i]
    label = re.sub(r'\[[0-9]*\]', '', raw_tag)
    label = renamed_labels.get(label, label)
    ### TODO: make sure that tag_before differentiates between long and short span annotations
    # The raw cell (including its "[n]" part) is compared, so only a token
    # carrying exactly the same raw tag as its predecessor continues a span.
    # NOTE(review): two adjacent tokens with identical tags that have no
    # "[n]" part are therefore also joined into one span - confirm intended.
    bio_marker = "I-" if tag_before == raw_tag else "B-"
    return full_token + " " + bio_marker + label + "\n", raw_tag


def df_to_conll(df, filename_simple, filename_long):
    """Write the short and the long CoNLL file for one parsed TSV table.

    The rows of ``df`` are processed in order:

    * A row whose 'FIM-Tag' is ``"\\n"`` marks a sentence boundary and yields
      one empty line. No empty line is written at the start of a file or
      directly after another empty line.
    * Rows whose token is a single blank character are skipped.
    * Tagged rows whose 'span-type' starts with "Short" get their B-/I- label
      in both files.
    * Tagged rows whose 'span-type' starts with "Long" get their label only in
      the long file; the short file receives the token with label "O".
    * All remaining rows are written with label "O" to both files.

    Args:
        df: Table as returned by ``webanno_tsv_to_df``.
        filename_simple: Path of the short-span output file.
        filename_long: Path of the long-span output file.

    Returns:
        None. Both files are (over)written, encoded as UTF-8.
    """
    lines_short = []
    lines_long = []
    tag_before_short = ""
    tag_before_long = ""
    for i in df.index:
        string = ""
        string_long = ""
        if df["FIM-Tag"][i] == "\n":
            # Sentence boundary: look at the LAST line written so far so that
            # neither leading nor consecutive empty lines are produced.
            if lines_short and lines_short[-1] != "\n":
                lines_short.append("\n")
            if lines_long and lines_long[-1] != "\n":
                lines_long.append("\n")
            # A span never continues into the next sentence, so the first
            # tagged token of the next sentence must start with "B-".
            tag_before_short = tag_before_long = ""
            continue
        elif df["token"][i] == " " or df["token"][i] == " ":
            # NOTE(review): the two literals look alike; the second one is
            # presumably a different blank character - verify in the source.
            continue
        elif df["FIM-Tag"][i] != "_" and df["span-type"][i].startswith("Short"):
            string, tag_before_short = wrapper_token_and_label_extraction(df, i, string, tag_before_short)
            #### TODO change for long-span corpus if necessary
            #### meaning that we would have to check the 'Relation1', 'Relation2' tabs and consequently change B- to I- tags. Example: 99110003001000_c.txt, lines 956-959
            string_long = string
            # A short span interrupts any long span; without this reset a later
            # long-span token could wrongly continue a stale span with "I-".
            tag_before_long = ""
        elif df["FIM-Tag"][i] != "_" and df["span-type"][i].startswith("Long"):
            string_long, tag_before_long = wrapper_token_and_label_extraction(df, i, string, tag_before_long)
            #### TODO change for long-span corpus if necessary
            string = add_prefix_and_suffix_to_token(df, i) + " O\n"
            tag_before_short = ""
        else:
            string = string_long = add_prefix_and_suffix_to_token(df, i) + " O\n"
            tag_before_short = tag_before_long = ""
        if string != " O\n": #empty token
            lines_short.append(string)
        if string_long != " O\n": #empty token
            lines_long.append(string_long)
    with open(filename_simple, "w", encoding='utf8') as file1:         # Writing data to a file
        file1.writelines(lines_short)
    with open(filename_long, "w", encoding='utf8') as file2:         # Writing data to a file
        file2.writelines(lines_long)

if __name__ == '__main__':
    # Command line usage: <script> <source directory> <output directory>
    # Any arguments after the first two are ignored.
    if not len(sys.argv) >= 3:
        raise Exception("invalid number of arguments given; expected directory location and output directory location")
    path_source, path_dest = sys.argv[1:3]
    webanno_tsv_to_conll(path_source, path_dest)
