Source code for preprocessing.utils
"""
Python3 script with utility functions for preprocessing
Author: Danielly Sorato
Author contact: danielly.sorato@gmail.com
"""
import re
from itertools import groupby
import pandas as pd
import nltk
initial_sufix = 0

def reset_initial_sufix():
    global initial_sufix
    initial_sufix = 0

def update_survey_item_id(prefix):
    global initial_sufix
    initial_sufix = initial_sufix + 1
    survey_item_id = prefix+str(initial_sufix)
    return survey_item_id

def get_survey_item_id(prefix):
    global initial_sufix
    survey_item_id = prefix+str(initial_sufix)
    return survey_item_id

def decide_on_survey_item_id(prefix, old_item_name, new_item_name):
    if old_item_name == new_item_name:
        survey_item_id = get_survey_item_id(prefix)
    else:
        survey_item_id = update_survey_item_id(prefix)
    return survey_item_id
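
# Illustrative usage sketch (not part of the original module); the 'EVS_ENG_GB_'
# prefix is a hypothetical example value. The helpers above keep a running counter
# so that consecutive rows belonging to the same questionnaire item share one id.
def _example_survey_item_ids():
    reset_initial_sufix()
    first = update_survey_item_id('EVS_ENG_GB_')                    # 'EVS_ENG_GB_1'
    reused = decide_on_survey_item_id('EVS_ENG_GB_', 'Q1', 'Q1')    # same item name, id reused: 'EVS_ENG_GB_1'
    bumped = decide_on_survey_item_id('EVS_ENG_GB_', 'Q1', 'Q2')    # item name changed, counter incremented: 'EVS_ENG_GB_2'
    return first, reused, bumped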
def recognize_standard_response_scales(filename, text):
"""
Recognizes special answer categories from EVS by testing the answer segment against the
language dependent pattern definitions for the special categories.
Args:
param1 filename (string): input file name.
param2 text (string): answer text segment.
Returns:
If a pattern was found, returns a string informing the special category,
otherwise returns None.
"""
    if 'CZE' in filename:
        dk_pattern = re.compile("(Neví)", re.IGNORECASE)
        refusal_pattern = re.compile("(Neodpověděl|neodpověděl(a)|BEZ ODPOVĚDI)", re.IGNORECASE)
        dontapply_pattern = re.compile("(Nehodí se)", re.IGNORECASE)
        if dk_pattern.match(text):
            return 'dk'
        elif refusal_pattern.match(text):
            return 'refusal'
        elif dontapply_pattern.match(text):
            return 'dontapply'
        else:
            return None
    if 'ENG' in filename:
        dk_pattern = re.compile("(don't know)", re.IGNORECASE)
        refusal_pattern = re.compile("(refusal|no answer)", re.IGNORECASE)
        dontapply_pattern = re.compile("(not applicable)", re.IGNORECASE)
        if dk_pattern.match(text):
            return 'dk'
        elif refusal_pattern.match(text):
            return 'refusal'
        elif dontapply_pattern.match(text):
            return 'dontapply'
        else:
            return None
    if 'FRE' in filename:
        dk_pattern = re.compile("(ne sait pas)", re.IGNORECASE)
        refusal_pattern = re.compile("(pas de réponse|refus|sans réponse|sans reponse)", re.IGNORECASE)
        dontapply_pattern = re.compile("(ne s'applique pas|Non applicable|Pas d'application|Non concerné)", re.IGNORECASE)
        if dk_pattern.match(text):
            return 'dk'
        elif refusal_pattern.match(text):
            return 'refusal'
        elif dontapply_pattern.match(text):
            return 'dontapply'
        else:
            return None
    if 'GER' in filename:
        dk_pattern = re.compile("(weiß nicht|weiss nicht)", re.IGNORECASE)
        refusal_pattern = re.compile("(verweigert|keine Antwort|Keine Antwort)", re.IGNORECASE)
        dontapply_pattern = re.compile("(trifft nicht zu|nicht zutreffend)", re.IGNORECASE)
        if dk_pattern.match(text):
            return 'dk'
        elif refusal_pattern.match(text):
            return 'refusal'
        elif dontapply_pattern.match(text):
            return 'dontapply'
        else:
            return None
    if 'NOR' in filename:
        dk_pattern = re.compile("(Vet ikke)", re.IGNORECASE)
        refusal_pattern = re.compile("(Ikke svar)", re.IGNORECASE)
        dontapply_pattern = re.compile("(NA)")
        if dk_pattern.match(text):
            return 'dk'
        elif refusal_pattern.match(text):
            return 'refusal'
        elif dontapply_pattern.match(text):
            return 'dontapply'
        else:
            return None
    if 'POR' in filename:
        dk_pattern = re.compile("(não sabe)", re.IGNORECASE)
        refusal_pattern = re.compile("(não responde)", re.IGNORECASE)
        dontapply_pattern = re.compile("(não se aplica)", re.IGNORECASE)
        if dk_pattern.match(text):
            return 'dk'
        elif refusal_pattern.match(text):
            return 'refusal'
        elif dontapply_pattern.match(text):
            return 'dontapply'
        else:
            return None
    if 'RUS' in filename:
        if 'LV' in filename:
            dk_pattern = re.compile("(Затрудняюсь ответить|Не знает)", re.IGNORECASE)
            refusal_pattern = re.compile("(Отказ от ответа|Нет ответа)", re.IGNORECASE)
            dontapply_pattern = re.compile("(не соответствующий|не применимо)", re.IGNORECASE)
            if dk_pattern.match(text):
                return 'dk'
            elif refusal_pattern.match(text):
                return 'refusal'
            elif dontapply_pattern.match(text):
                return 'dontapply'
            else:
                return None
        else:
            dk_pattern = re.compile("(Затрудняюсь ответить|Не знаю)", re.IGNORECASE)
            refusal_pattern = re.compile("(Отказ от ответа|Нет ответа)", re.IGNORECASE)
            if 'BY' in filename:
                dontapply_pattern = re.compile("(вопрос не применим|не применимо)", re.IGNORECASE)
            else:
                dontapply_pattern = re.compile("(Не подходит|НЕ ПРИМЕНИМО)", re.IGNORECASE)
            if dk_pattern.match(text):
                return 'dk'
            elif refusal_pattern.match(text):
                return 'refusal'
            elif dontapply_pattern.match(text):
                return 'dontapply'
            else:
                return None
    if 'SPA' in filename:
        dk_pattern = re.compile("(No sabe)", re.IGNORECASE)
        refusal_pattern = re.compile("(no contesta)", re.IGNORECASE)
        dontapply_pattern = re.compile("(no aplicable)", re.IGNORECASE)
        if dk_pattern.match(text):
            return 'dk'
        elif refusal_pattern.match(text):
            return 'refusal'
        elif dontapply_pattern.match(text):
            return 'dontapply'
        else:
            return None
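
# Illustrative usage sketch (not part of the original module); the filenames are
# hypothetical examples and only need to contain the language code tested above.
def _example_recognize_standard_response_scales():
    dk = recognize_standard_response_scales('EVS_ENG_GB_1999.csv', "Don't know")          # 'dk'
    refusal = recognize_standard_response_scales('EVS_GER_DE_1999.csv', 'Keine Antwort')  # 'refusal'
    return dk, refusal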
def determine_country(filename):
"""
Determines the full name of the country, based on ISO code for country
that is embedded in the file name.
Args:
param1 filename (string): input file name.
Returns:
full name of the country (string).
"""
    if '_AT' in filename:
        country = 'Austria'
    if '_AZ' in filename:
        country = 'Azerbaijan'
    if '_BE' in filename:
        country = 'Belgium'
    if '_BY' in filename:
        country = 'Belarus'
    if '_BG' in filename:
        country = 'Bulgaria'
    if '_CH' in filename:
        country = 'Switzerland'
    if '_CY' in filename:
        if 'TUR' in filename:
            country = 'Northern Cyprus'
        else:
            country = 'Cyprus'
    if '_CZ' in filename:
        country = 'Czech Republic'
    if '_DE' in filename:
        country = 'Germany'
    if '_DK' in filename:
        country = 'Denmark'
    if '_EE' in filename:
        country = 'Estonia'
    if '_ES' in filename:
        country = 'Spain'
    if '_FI' in filename:
        country = 'Finland'
    if '_FR' in filename:
        country = 'France'
    if '_GE' in filename:
        country = 'Georgia'
    if '_GB' in filename or 'SOURCE' in filename:
        country = 'Great Britain'
    if '_GR' in filename:
        country = 'Greece'
    if '_HV' in filename:
        country = 'Bosnia and Herzegovina'
    if '_HU' in filename:
        country = 'Hungary'
    if '_IE' in filename:
        country = 'Ireland'
    if '_IT' in filename:
        country = 'Italy'
    if '_IS' in filename:
        country = 'Iceland'
    if '_NIR' in filename:
        country = 'Northern Ireland'
    if '_LU' in filename:
        country = 'Luxembourg'
    if '_LV' in filename:
        country = 'Latvia'
    if '_LT' in filename:
        country = 'Lithuania'
    if '_MD' in filename:
        country = 'Moldova'
    if '_ME' in filename:
        country = 'Montenegro'
    if '_MK' in filename:
        country = 'Macedonia'
    if '_MT' in filename:
        country = 'Malta'
    if '_NO' in filename:
        country = 'Norway'
    if '_NL' in filename:
        country = 'Netherlands'
    if '_PT' in filename:
        country = 'Portugal'
    if '_PL' in filename:
        country = 'Poland'
    if '_RU' in filename:
        country = 'Russian Federation'
    if '_SE' in filename:
        country = 'Sweden'
    if '_SI' in filename:
        country = 'Slovenia'
    if '_SK' in filename:
        country = 'Slovakia'
    if '_TR' in filename:
        country = 'Turkey'
    if '_UA' in filename:
        country = 'Ukraine'
    return country
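
# Illustrative usage sketch (not part of the original module); the filenames are
# hypothetical. Note that the function expects one of the listed ISO codes (or
# 'SOURCE') to be present in the filename, otherwise `country` is never assigned.
def _example_determine_country():
    france = determine_country('EVS_FRE_FR_1999.csv')        # 'France'
    source = determine_country('EVS_ENG_SOURCE_1999.csv')    # source questionnaires map to 'Great Britain'
    return france, source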
def determine_sentence_tokenizer(filename):
"""
Provide the sentence splitter suffix to instantiate it in accordance
to the target language (information emebedded on filename).
Args:
param1 filename (string): input file name.
Returns:
a sentence splitter suffix (string) according to the target language.
"""
    # there is no sentence segmentation model for Bulgarian in NLTK
    if 'BUL_' in filename:
        sentence_splitter_suffix = 'turkish.pickle'
    if 'CZE_' in filename:
        sentence_splitter_suffix = 'czech.pickle'
    if 'DAN_' in filename:
        sentence_splitter_suffix = 'danish.pickle'
    if 'DUT_' in filename:
        sentence_splitter_suffix = 'dutch.pickle'
    if 'ENG_' in filename:
        sentence_splitter_suffix = 'english.pickle'
    if 'EST_' in filename:
        sentence_splitter_suffix = 'estonian.pickle'
    if 'FRE_' in filename:
        sentence_splitter_suffix = 'french.pickle'
    if 'FIN_' in filename:
        sentence_splitter_suffix = 'finnish.pickle'
    if 'GER_' in filename:
        sentence_splitter_suffix = 'german.pickle'
    if 'GRE_' in filename:
        sentence_splitter_suffix = 'greek.pickle'
    if 'ITA_' in filename:
        sentence_splitter_suffix = 'italian.pickle'
    if 'NOR_' in filename:
        sentence_splitter_suffix = 'norwegian.pickle'
    if 'POL_' in filename:
        sentence_splitter_suffix = 'polish.pickle'
    if 'POR_' in filename:
        sentence_splitter_suffix = 'portuguese.pickle'
    if 'RUS_' in filename:
        sentence_splitter_suffix = 'russian.pickle'
    if 'SPA_' in filename or 'CAT_' in filename:
        sentence_splitter_suffix = 'spanish.pickle'
    if 'SLV_' in filename:
        sentence_splitter_suffix = 'slovene.pickle'
    if 'SWE_' in filename:
        sentence_splitter_suffix = 'swedish.pickle'
    if 'TUR_' in filename:
        sentence_splitter_suffix = 'turkish.pickle'
    # there is no sentence segmentation model for Ukrainian in NLTK
    if 'UKR_' in filename:
        sentence_splitter_suffix = 'russian.pickle'
    # there is no sentence segmentation model for Luxembourgish in NLTK
    if 'LTZ_' in filename:
        sentence_splitter_suffix = 'german.pickle'
    # there is no sentence segmentation model for Maltese in NLTK
    if 'MLT_' in filename:
        sentence_splitter_suffix = 'turkish.pickle'
    # there is no sentence segmentation model for Croatian in NLTK
    if 'HRV_' in filename:
        sentence_splitter_suffix = 'turkish.pickle'
    return sentence_splitter_suffix
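
# Illustrative usage sketch (not part of the original module); the filenames are
# hypothetical examples.
def _example_determine_sentence_tokenizer():
    german = determine_sentence_tokenizer('GER_DE_1999.csv')    # 'german.pickle'
    maltese = determine_sentence_tokenizer('MLT_MT_2008.csv')   # no Maltese model in NLTK, falls back to 'turkish.pickle'
    return german, maltese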
def get_sentence_splitter(filename):
"""
Decide what Instantiate Punkt Sentence Tokenizer from NLTK should be instantiated, according to
the information embedded in the filename.
Args:
param1 filename (string): input file name.
Returns:
a sentence splitter (NLTK object) instantiated according to the target language.
"""
    sentence_splitter_prefix = 'tokenizers/punkt/'
    sentence_splitter_suffix = determine_sentence_tokenizer(filename)
    sentence_splitter_path = sentence_splitter_prefix+sentence_splitter_suffix
    sentence_splitter = nltk.data.load(sentence_splitter_path)
    return sentence_splitter
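
# Illustrative usage sketch (not part of the original module). Assumes the NLTK
# punkt models are already installed (e.g. via nltk.download('punkt')); the
# filename and text are hypothetical examples.
def _example_get_sentence_splitter():
    splitter = get_sentence_splitter('ENG_GB_1999.csv')
    sentences = splitter.tokenize('How satisfied are you with your life? Please use this card.')
    # ['How satisfied are you with your life?', 'Please use this card.']
    return sentences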