#!/usr/bin/env python3
'''
Load split data and add POS tags

Usage:
python3 write_json.py <FILE_NAME>
'''

from sys import argv
import json
from subprocess import Popen, PIPE
import re, os


# expanduser falls back to the password database when $HOME is unset,
# where os.getenv("HOME") would return None and break the concatenation below.
home = os.path.expanduser("~")
# Modify path_to_lang according to your path to lang-sme.
# NOTE: the path is interpolated into a shell command line unquoted, so it
# must not contain spaces or shell metacharacters.
path_to_lang = home + "/giellalt/giellalt/lang-sme/"

# Shell pipeline that analyses the text it receives on stdin.
cmd = " | ".join([
    "hfst-tokenise --print-all --giella-cg --no-weights --unique "
    + path_to_lang + "tools/tokenisers/tokeniser-disamb-gt-desc.pmhfst",
    "vislcg3 --grammar " + path_to_lang + "tools/tokenisers/mwe-dis.bin",
    "cg-mwesplit",
    "vislcg3 --grammar " + path_to_lang + "src/cg3/disambiguator.bin",
    "vislcg3 --grammar " + path_to_lang + "src/cg3/korp.bin",
    "vislcg3 --grammar " + path_to_lang + "src/cg3/dependency.bin",
])

# Tags containing any of these markers are skipped when looking for the POS.
# The "/*" makes the slash optional, so this is a substring match on
# "Der", "Sem", "Ex" or "Gram" (same pattern as before, now a raw string).
SKIPPED_TAG = re.compile(r"Der\/*|Sem\/*|Ex\/*|Gram\/*")


def analyse(text):
    """Run *text* through the analysis pipeline and return its stdout.

    The text is written to the pipeline's stdin (followed by a newline)
    rather than spliced into the command line, so quotes and other shell
    metacharacters in the data cannot break or alter the command.
    Anything the pipeline writes to stderr is captured and discarded.
    """
    process = Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE)
    out, _ = process.communicate((text + "\n").encode("utf-8"))
    return out.decode()


def extract_pos(analysis_output):
    """Return the first non-skipped tag of the first analysis in the output.

    The analysis is the text between the first and second tab; the tags are
    what follows the first '" ' in it (presumably the end of the quoted
    lemma -- confirm against the CG stream format). Tags are split on any
    whitespace so a tag at the end of a line does not carry the newline.

    Returns "" when the output has no such analysis or when every tag is
    filtered out by SKIPPED_TAG.
    """
    try:
        analysis = analysis_output.split("\t")[1]
        tags = analysis.split('" ')[1].split()
    except IndexError:
        # No tab-indented reading or no lemma/tag separator.
        return ""
    for tag in tags:
        if not SKIPPED_TAG.search(tag):
            return tag
    return ""


def add_pos(data):
    """Add a "pos" list to every item of *data*, in place, and return *data*.

    Each item must have an "error" list of strings. One entry is appended to
    "pos" per non-empty error string; empty strings are skipped without a
    placeholder.
    """
    for pair in data:
        pair["pos"] = [
            extract_pos(analyse(error)) for error in pair["error"] if error
        ]
    return data


def main(data_file):
    """Load *data_file* (JSON), add the "pos" lists and overwrite the file."""
    # JSON is UTF-8; do not depend on the locale, since the output is
    # written with ensure_ascii=False.
    with open(data_file, encoding="utf-8") as f:
        data = json.load(f)

    add_pos(data)

    with open(data_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


if __name__ == "__main__":
    if len(argv) != 2:
        raise SystemExit("Usage: python3 " + argv[0] + " <FILE_NAME>")
    main(argv[1])
