Source code for annotation.pos_tagging
import pickle
import sys

import flair.datasets
from flair.data import Sentence
from flair.models import SequenceTagger

# Database helpers such as build_id_dicts_per_language, tag_item_type_table and
# tag_survey_item are expected to come from these project modules (wildcard imports).
from populate_tables import *
from retrieve_from_tables import *
def select_pos_model(language):
    """
    Selects the appropriate POS tagging model based on the language.

    ENG uses the pretrained English model provided by Flair.
    NOR, SPA, GER, CZE, and FRE use the multilingual pretrained model provided by Flair.
    CAT, RUS, and POR use models trained by me.

    Args:
        param1 language (string): three-letter language ISO code.

    Returns:
        part-of-speech tagging model (PyTorch object).
    """
    if 'ENG' in language:
        return SequenceTagger.load('upos')
    elif 'CAT' in language:
        return SequenceTagger.load('/home/danielly/workspace/trained_models/cat_150ep.pt')
    elif 'RUS' in language:
        return SequenceTagger.load('/home/danielly/workspace/trained_models/rus_150ep.pt')
    elif 'POR' in language:
        return SequenceTagger.load('/home/danielly/workspace/trained_models/por_150ep.pt')
    else:
        return SequenceTagger.load('pos-multi')
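# Illustrative usage sketch (the language codes are hypothetical inputs; the Flair
# identifiers 'upos' and 'pos-multi' are the ones loaded above, and the local .pt
# paths are project-specific):
#
#     tagger = select_pos_model('ENG')   # loads Flair's 'upos' English model
#     tagger = select_pos_model('SPA')   # falls through to the 'pos-multi' model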
def output_language_specific_dictionaries(languages):
    """
    Gets each text segment and its ID from the database, building four dictionaries
    (request, response, instruction, introduction) per language.
    Afterwards, saves the dictionaries as a pickle dump.

    Args:
        param1 languages (list of strings): list of three-letter language ISO codes.
    """
    for l in languages:
        request, response, instruction, introduction = build_id_dicts_per_language(l)
        dicts = {'request': request, 'response': response, 'instruction': instruction, 'introduction': introduction}
        save_dictionaries('/home/danielly/workspace/trained_models/'+l+'_dicts.pickle', dicts)
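# For example, output_language_specific_dictionaries(['POR']) queries the database
# for the Portuguese segments and writes
# /home/danielly/workspace/trained_models/POR_dicts.pickle, holding the four
# {segment_id: text} dictionaries (the shape assumed by tag_dictionary below).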
def from_tagged_dict_to_table(dicts):
    """
    Uses the tagged dictionaries containing the tagged data from each table to
    update the respective pos_tagged_text columns in the database.
    The survey_item table is also updated using the same IDs.

    Args:
        param1 dicts (a dictionary of dictionaries): each key is the name of a table,
        and its value is a dictionary mapping IDs to tagged text.
    """
    for k, v in list(dicts.items()):
        if k == 'request':
            table_name = 'request'
            table_id_name = 'requestid'
        elif k == 'response':
            table_name = 'response'
            table_id_name = 'responseid'
        elif k == 'instruction':
            table_name = 'instruction'
            table_id_name = 'instructionid'
        elif k == 'introduction':
            table_name = 'introduction'
            table_id_name = 'introductionid'
        tag_item_type_table(v, table_name, table_id_name)
        tag_survey_item(v, table_id_name)
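# Illustrative input shape (the IDs and tagged strings below are made up):
#
#     dicts = {
#         'request':      {1: 'How <ADV> often <ADV> ...'},
#         'response':     {7: 'Never <ADV>'},
#         'instruction':  {},
#         'introduction': {},
#     }
#     from_tagged_dict_to_table(dicts)
#
# tag_item_type_table() then writes each tagged string to that table's
# pos_tagged_text column, and tag_survey_item() does the same for survey_item.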
def load_dict(path):
    """
    Loads a dictionary stored as a pickle object.

    Args:
        param1 path (string): the path to the dictionary.

    Returns:
        the loaded dictionary.
    """
    with open(path, 'rb') as handle:
        dicts = pickle.load(handle)
    return dicts
def save_dictionaries(path, dicts):
    """
    Saves a dictionary as a pickle object.

    Args:
        param1 path (string): the path to the dictionary.
        param2 dicts (a dictionary of dictionaries): each key is the name of a table,
        and its value is a dictionary mapping IDs to tagged text.
    """
    with open(path, 'wb') as handle:
        pickle.dump(dicts, handle, protocol=pickle.HIGHEST_PROTOCOL)
def tag_dictionary(tagger, dictionary):
    """
    Tags each sentence of the untagged dictionary and replaces its value with the tagged sentence.

    Args:
        param1 tagger (PyTorch object): language-specific (or multilingual) POS tagging model.
        param2 dictionary (dictionary): has the IDs and the untagged text.

    Returns:
        A dictionary with the text segment IDs and the tagged text.
    """
    for k, v in list(dictionary.items()):
        sentence = Sentence(v)
        tagger.predict(sentence)
        tagged_sentence = sentence.to_tagged_string()
        dictionary[k] = tagged_sentence
    return dictionary
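# Rough sketch of what a single tagging step produces (exact tags depend on the
# model; Sentence, predict and to_tagged_string are the Flair calls used above):
#
#     sentence = Sentence('Never')
#     tagger.predict(sentence)
#     sentence.to_tagged_string()   # e.g. 'Never <ADV>'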
def save_tagged_dictionary(tagger, language):
    """
    Loads a given untagged dictionary, calls the tagging method and saves the tagged dictionary.

    Args:
        param1 tagger (PyTorch object): language-specific (or multilingual) POS tagging model.
        param2 language (string): three-letter language ISO code.
    """
    dicts = load_dict('/home/danielly/workspace/trained_models/'+language+'_dicts.pickle')
    for k, v in list(dicts.items()):
        dicts[k] = tag_dictionary(tagger, v)
    save_dictionaries('/home/danielly/workspace/trained_models/'+language+'_dicts_tagged.pickle', dicts)
def main():
    languages = ['POR', 'NOR', 'SPA', 'CAT', 'GER', 'CZE', 'FRE', 'ENG', 'RUS']
    output_language_specific_dictionaries(languages)
    for l in languages:
        tagger = select_pos_model(l)
        save_tagged_dictionary(tagger, l)
        dicts = load_dict('/home/danielly/workspace/trained_models/'+l+'_dicts_tagged.pickle')
        from_tagged_dict_to_table(dicts)

if __name__ == "__main__":
    main()
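# Running the module end to end (assuming the database and the model/pickle paths
# above are available on the local machine; the file name is inferred from the
# module name annotation.pos_tagging):
#
#     python pos_tagging.py
#
# Per main(), this builds the per-language {id: text} pickles, POS-tags each one,
# and writes the tagged text back to the database tables.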