#!/usr/bin/env python3
'''Pos tagging test data'''

from sys import argv
import json

import os, sys, re
from corpustools import argparse_version, ccat, corpusxmlfile, modes, util


def analyses(modename, lang, input):
    """Run ``input`` through a corpustools pipeline and return its output.

    Args:
        modename: Mode name handed to ``modes.Pipeline``.
        lang: Language code handed to ``modes.Pipeline``.
        input: Text to analyse; it is UTF-8 encoded before being passed on.

    Returns:
        Whatever ``Pipeline.run`` returns for the encoded text.
    """
    analyser = modes.Pipeline(modename, lang)
    # Let the pipeline check itself before any data is fed to it.
    analyser.sanity_check()
    encoded_text = input.encode('utf8')
    return analyser.run(encoded_text)


# Tags matching this pattern are skipped when looking for a token's POS tag.
# Raw string: the original non-raw literal relied on the invalid escape "\/".
_SKIPPED_TAG = re.compile(r"Der\/*|Sem\/*|Ex\/*|Gram\/*|Err\/*|TV|IV")


def _first_pos_tag(chunk):
    """Return the ``pos`` list for one token chunk of the analyser output.

    The chunk is split on tabs; the second field is taken as the analysis,
    and everything after the first ``'" '`` in it is treated as a
    space-separated tag list.

    Returns:
        ``[tag]`` for the first tag not matching ``_SKIPPED_TAG``, ``[]`` if
        every tag is skipped, or ``[""]`` if the chunk has no parsable
        analysis (missing tab or missing ``'" '`` separator).
    """
    try:
        analysis = chunk.split("\t")[1]
        tags = analysis.split('" ')[1].split(" ")
    except IndexError:
        # Only the two [1] lookups above can fail; anything else should
        # propagate instead of being silently swallowed.
        return [""]
    for tag in tags:
        if not _SKIPPED_TAG.search(tag):
            return [tag]
    return []


def pos_tagging(data_file):
    """Tag the tokens of ``data_file`` and dump them to ``data_file + ".json"``.

    Every line of ``data_file`` is sent through ``analyses("hfst", "sme", ...)``.
    The returned text is split into per-token chunks on ``'"<'``; the token is
    the part of a chunk before ``'>"'``. For each non-empty token a dict
    ``{"correct": token, "pos": [...]}`` is collected (see ``_first_pos_tag``
    for the ``pos`` value), and the full list is written as indented JSON.

    Both files are handled as UTF-8, matching the UTF-8 encoding used when
    feeding the pipeline and the ``ensure_ascii=False`` JSON output.

    Args:
        data_file: Path of the input text file, one sentence per line.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
    """
    # Both files are opened up front so an unwritable output path fails
    # before any analysis is run; ``with`` guarantees they are closed even
    # if the pipeline raises midway.
    with open(data_file, encoding="utf-8") as infile, open(
        data_file + ".json", "w", encoding="utf-8"
    ) as outfile:
        print("Processing file: ", data_file)
        print("...")
        tot_data = []
        for sentence in infile:
            out_fst = analyses("hfst", "sme", sentence)
            for chunk in out_fst.split('"<'):
                token = chunk.split('>"')[0]
                if token:
                    tot_data.append(
                        {"correct": token, "pos": _first_pos_tag(chunk)}
                    )

        json.dump(tot_data, outfile, ensure_ascii=False, indent=4)
        print("Done processing file: ", data_file)


# Default location of the test corpus below the user's home directory.
# Modify test_file according to your own path to the testing data.
home = os.getenv("HOME")
if home is None:
    # HOME may be unset in some environments; fall back to the platform's
    # home-directory lookup instead of failing with a TypeError on the
    # string concatenation below.
    home = os.path.expanduser("~")
test_file = home + "/compound-errors/data/restricted/src-boundcorpus-errcmp.txt"

pos_tagging(test_file)
