Source code for annotation.ner_annotation_cze_rus

from deeppavlov import build_model
import os
import sys
import pandas as pd

[docs]def ner_annotation(df, ner): """ Iterates through the preprocessed and POS tag annotated RUS and CZE spreadsheets, adding the NER annotation. POS tag is done in the mcsq_annotation script. CZE and RUS languages use multilingual pretrained model provided by Deeppavlov. The Slavic-BERT-NER from Deeppavlov uses lib versions that are imcompatible with the ones from the mcsq_annotation script, therefore this script should be run using a separate virtual environment. Args: param1 df (pandas dataframe): the dataframe that holds the preprocessed and POS tag annotated questionnaire. param2 ner (BERT model): pretrained NER model provided by Deeppavlov. Returns: df_tagged (pandas dataframe), the questionnaire with added NER annotations. """ df_tagged = pd.DataFrame(columns=['survey_item_ID', 'Study', 'module', 'item_type', 'item_name', 'item_value', 'text', 'ner_tagged_text', 'pos_tagged_text']) for i, row in df.iterrows(): tagged = ner([row['text']]) flat_list = [item for sublist in tagged for item in sublist] entities = [] for token, tag in zip(flat_list[0], flat_list[1]): if tag != 'O': entities.append(token+'<'+tag+'>') if ''.join(entities) != '': data = {'survey_item_ID': row['survey_item_ID'], 'Study': row['Study'], 'module': row['module'], 'item_type': row['item_type'], 'item_name': row['item_name'], 'item_value': row['item_value'], 'text': row['text'], 'ner_tagged_text': ' '.join(entities), 'pos_tagged_text': row['pos_tagged_text']} else: data = {'survey_item_ID': row['survey_item_ID'], 'Study': row['Study'], 'module': row['module'], 'item_type': row['item_type'], 'item_name': row['item_name'], 'item_value': row['item_value'], 'text': row['text'], 'ner_tagged_text': None, 'pos_tagged_text': row['pos_tagged_text']} df_tagged = df_tagged.append(data, ignore_index = True) return df_tagged
def main(folder_path): path = os.chdir(folder_path) files = os.listdir(path) ner_model = build_model("/home/danielly/workspace/Slavic-BERT-NER/ner_bert_slav.json", download=True) for index, file in enumerate(files): if file.endswith(".csv"): if 'CZE' in file or 'RUS' in file: print(file) df = pd.read_csv(file, dtype=str, sep='\t') df_tagged = ner_annotation(df, ner_model) df_tagged.to_csv(file, encoding='utf-8', sep='\t', index=False) if __name__ == "__main__": folder_path = str(sys.argv[1]) main(folder_path)