from uralicNLP import uralicApi
from uralicNLP.ud_tools import UD_collection
from glob import glob
from mikatools import *

def _clean_morphology(m):
	m = m.replace("+Cmp","")
	m = m.replace("/Sg", "+Sg+")
	return m

def _cutup_morph(parts, original_word):
	if len(parts) != 2:
		print(original_word, parts)
		return parts
	new_parts = [original_word[0:len(original_word)- len(parts[1])], parts[1]]
	return new_parts


def generate(morph, original_word):
	"""Turn a "#"-separated analysis into a list of word parts.

	Each segment of ``morph`` is cleaned with ``_clean_morphology`` and
	passed to ``uralicApi.generate`` with the "sme" language code. When the
	generator returns something, the first item of its first result is used;
	otherwise the text before the first "+" of the cleaned segment is used
	as a fallback.

	If the most recent fallback happened on a non-final segment, the parts
	are post-processed by ``_cutup_morph``. A fallback on the final segment
	cancels that step and prints ``morph`` and ``original_word`` instead.

	:param morph: analysis string with segments separated by "#"
	:param original_word: the word the analysis belongs to
	:return: list of word parts
	"""
	segments = morph.split("#")
	final_index = len(segments) - 1
	pieces = []
	needs_cutup = False
	for index, segment in enumerate(segments):
		cleaned = _clean_morphology(segment)
		generated = uralicApi.generate(cleaned, "sme")
		if len(generated) > 0:
			pieces.append(generated[0][0])
			continue
		# Nothing was generated: fall back to the text before the first tag.
		pieces.append(cleaned.split("+")[0])
		# A failure on the last segment resets the flag, whatever came before.
		needs_cutup = index != final_index
		if not needs_cutup:
			print(morph, original_word)
	return _cutup_morph(pieces, original_word) if needs_cutup else pieces

def split(word):
	"""Split ``word`` into parts using its first usable analysis.

	The word is analysed with ``uralicApi.analyze`` using the "sme" language
	code. The first analysis whose first item contains "#" and does not
	contain "Err" is handed to ``generate``. If no analysis qualifies, the
	word is returned unsplit.

	:param word: the word form to split
	:return: list of word parts; ``[word]`` when no analysis qualifies
	"""
	for candidate in uralicApi.analyze(word, "sme"):
		reading = candidate[0]
		if "#" not in reading or "Err" in reading:
			continue
		return generate(reading, word)
	return [word]

# Build the dataset: for every word of every sentence in data/*.conllu, store
# the original form ("correct"), the output of split() ("error") and the
# word's xpostag ("pos"), then dump all sentences into one JSON file.
results = []
# sorted() keeps the sentence order reproducible; glob() itself returns the
# files in arbitrary order.
for file in sorted(glob("data/*.conllu")):
	# Keep the handle inside a with-block so it is closed after each file
	# instead of leaking one open handle per treebank file. The handle stays
	# open while the sentences are iterated, so this is safe whether
	# UD_collection reads the file up front or on demand.
	with open_read(file) as handle:
		ud = UD_collection(handle)
		for sentence in ud:
			res_sent = []
			for word in sentence:
				parts = split(word.form)
				res_sent.append({"correct" : word.form, "error": parts, "pos": word.xpostag})
			results.append(res_sent)
json_dump(results, "data/ud_generated.json")