"""
Python3 script to transform XML ESS data into spreadsheet format used as input for MCSQ
Author: Danielly Sorato
Author contact: danielly.sorato@gmail.com
"""
import xml.etree.ElementTree as ET
import pandas as pd
import nltk.data
import sys
import re
import string
import utils as ut
from preprocessing_ess_utils import *
# Item names that must not be extracted as questions: names starting with
# F1-F3, an underscore and a two-digit code whose first digit is 0 or 1 and
# whose second digit is 2-9 (e.g. F1_02, F2_13), optionally followed by an
# underscore-separated suffix. Used with .match(), so anchored at the start only.
ignore = re.compile(r'F[1-3]\_[0-1][2-9](\_.*)?')
# Item names starting with F3_01 (optionally with an underscore suffix).
# Presumably the F3 "don't know"/"refusal" entries, judging by the
# variable name -- TODO confirm against the ESS source files.
ignore_f3_dk_rf = re.compile(r'F3\_01(\_.*)?')
def clean_answer_category(text):
    """
    Clean an answer segment by standardizing the text and removing undesired elements.

    Args:
        text (string): answer segment currently being analyzed.

    Returns:
        standardized answer text (string). An empty string is returned when
        `text` is not a string.
    """
    if not isinstance(text, str):
        return ''

    # Applied strictly in this order: the end-anchored patterns strip scale
    # values ("Agree<br> 5", "Not at all 0", "... +3") and must run while the
    # value is still the last thing in the string.
    regex_steps = (
        (r"<br>\s?\s?\d+$", ""),
        (r"<br />\s?\s?\d+$", ""),
        (r"<br>\s?\s?\+\d+$", ""),
        (r"<br />\s?\s?\-\d+$", ""),
        (r"<br>\s?\s?\-\d+$", ""),
        ("…", "..."),
        ("’", "'"),
        (" :", ":"),
        (r"(\s0\s?$)", ""),
        (r"(\s10\s?$)", ""),
        (r"(\s\+\d\s?$)", ""),
        (r"(\s\-\d\s?$)", ""),
        (r"\s+\?", "?"),
        (r"[.]{4,}", ""),
        (r"[!]{2,}", "!"),
    )
    for pattern, replacement in regex_steps:
        text = re.sub(pattern, replacement, text)

    # Known markup has to go before the catch-all '<' / '>' replacement
    # below; once the angle brackets are gone the tags (and the '------->'
    # arrow) can no longer be recognised.
    for tag in ('</strong>', '<strong>', '</p>', '<p>', '</em>', '<em>',
                '<i>', '</i>', '------->'):
        text = text.replace(tag, '')
    for tag in ('</br>', '<br />', '<br>', '<b><b>', '</b></b>', '<b>',
                '</b>', '</u>', '<u>'):
        text = text.replace(tag, ' ')

    # Single-character clean-up: stray angle brackets and non-breaking
    # spaces become plain spaces; underscores, semicolons, newlines,
    # parentheses and square brackets are dropped.
    text = text.translate(str.maketrans({
        '<': ' ', '>': ' ', '\xa0': ' ',
        '_': None, ';': None, '\n': None,
        '(': None, ')': None, '[': None, ']': None,
    }))

    # '[^footnote]' has already lost its brackets at this point.
    text = text.replace('^footnote', '')
    return text.strip()
def clean(text):
    """
    Clean a question or instruction segment by standardizing the text and removing undesired elements.

    Args:
        text (string): question or instruction segment currently being analyzed.

    Returns:
        standardized question or instruction text (string). An empty string
        is returned when `text` is not a string.
    """
    if not isinstance(text, str):
        return ''

    for old, new in (('\n', ''), ('\r', ''), (' ?', '?'), ('‐', '-')):
        text = text.replace(old, new)

    regex_steps = (
        ("»", ""),
        (r"^\s?\.\s?", ""),
        ("«", ""),
        ("…", "..."),
        ("’", "'"),
        (" :", ":"),
    )
    for pattern, replacement in regex_steps:
        text = re.sub(pattern, replacement, text)

    # Expand interviewer abbreviations. The dot is matched literally and the
    # abbreviation must start a word, so that e.g. "points:" is left alone.
    text = re.sub(r"\bEnq\.:", "Enquêteur:", text, flags=re.I)
    text = re.sub(r"\bINT\.:", "INTERVIEWER:", text, flags=re.I)

    text = re.sub(r"[.]{4,}", "", text)
    text = re.sub(r"[!]{2,}", "!", text)

    # Known markup has to be removed before the catch-all '<' / '>'
    # replacement further down, otherwise the tags are no longer recognisable.
    for tag in ('<P STYLE="text-align: left">', '<P STYLE="text-align: right">',
                '</strong>', '<strong>', '</p>', '<p>'):
        text = text.replace(tag, '')
    text = text.translate(str.maketrans({'~': ' ', '_': None}))
    for tag in ('<b><b>', '</b></b>', '<b>', '</b>'):
        text = text.replace(tag, ' ')
    for tag in ('</em>', '<em>'):
        text = text.replace(tag, '')
    for tag in ('</br>', '<br />', '<br>', '</u>', '<u>'):
        text = text.replace(tag, ' ')

    # Stray angle brackets and non-breaking spaces become plain spaces.
    text = text.translate(str.maketrans({'<': ' ', '>': ' ', '\xa0': ' '}))

    # HTML entities are handled before ';' is dropped so that their
    # terminating semicolon disappears as well.
    text = text.replace('&hellip', ' ')
    text = text.replace('&rsquo', "'")
    # Numeric footnote markers such as "[1]".
    text = re.sub(r'\[\d+\]', "", text)
    text = text.replace(';', '')

    for old, new in (('einfg.', 'einfügen'), ('[^footnote]', ''),
                     ('e.g', ''), ('i.e.', 'ie')):
        text = text.replace(old, new)

    return text.strip()
def identify_showcard_instruction(text, country_language):
    """
    Decide whether a text segment is a showcard instruction.

    Each supported language has its own (upper-case) word for 'card' in the
    ESS files. If the segment contains that word it is a showcard instruction.

    Args:
        text (string): text segment being analyzed.
        country_language (string): country and language metadata, embedded in the name of the input file.

    Returns:
        item_type (string): 'INSTRUCTION' when the segment contains the
        showcard word of the language, 'REQUEST' otherwise (including when
        no showcard word is defined for `country_language`).
    """
    showcard = None

    # Independent checks (not elif): a later match overrides an earlier one,
    # which is what lets 'RUS_LV' refine the generic 'RUS' choice.
    if 'FRE' in country_language:
        if 'CH' in country_language or 'BE' in country_language:
            showcard = 'CARTE'
        else:
            showcard = 'LISTE'
    if 'GER' in country_language:
        if 'CH' in country_language or 'AT' in country_language:
            showcard = 'KARTE'
        else:
            showcard = 'LISTE'
    if 'NOR' in country_language:
        showcard = 'KORT'
    if 'RUS' in country_language:
        showcard = 'КАРТОЧКУ'
    if 'RUS_LV' in country_language:
        showcard = 'КАРТОЧКА'
    if 'ENG' in country_language:
        showcard = 'CARD'

    # The comparison is case-sensitive on purpose: only the capitalised
    # showcard word is treated as a marker.
    if showcard is not None and showcard in text:
        return 'INSTRUCTION'
    return 'REQUEST'
def get_answer_id(node, parent_map):
    """
    Get the answer id attached to the great-grandparent of a node.

    Args:
        node: current xml tree node being analyzed in outer loop.
        parent_map (dictionary): a dictionary containing information about parent-child relationships in XML tree.

    Returns:
        answer_id (string): the value of the 'answer_id' attribute found
        three levels above `node`. When the attribute is missing the
        literal string 'None' is returned (not the None object), because
        the value is always passed through str().
    """
    parent = parent_map[node]
    parent_of_parent = parent_map[parent]
    great_grandparent = parent_map[parent_of_parent]
    return str(great_grandparent.attrib.get('answer_id'))
def segment_question_instruction(df_question_instruction, parent_map, node, item_name, item_type, splitter, country_language):
    """
    Extract the question/instruction text segments from a node, if the node text exists.

    The node text is cleaned, split into sentences and every sentence is
    added as one row. Nodes whose item name is rejected by adjust_item_name
    contribute nothing.

    Args:
        df_question_instruction (pandas dataframe): a dataframe to store processed question and instruction segments.
        parent_map (dictionary): a dictionary containing information about parent-child relationships in XML tree.
        node (tree object): XML node being analyzed.
        item_name (string): item name metadata extracted from node.attrib['name'].
        item_type (string): item type metadata inferred from parent_map[node].attrib['type_name'].
        splitter (NLTK object): sentence segmenter object from NLTK.
        country_language (string): country and language metadata, extracted from the input file name.

    Returns:
        Updated df_question_instruction dataframe, with new question and instruction segments.
    """
    raw_text = node.text
    if not isinstance(raw_text, str) or raw_text == '':
        return df_question_instruction

    item_name, modified_item_type = adjust_item_name(item_name)
    if item_name == 'REJECT QUESTION':
        return df_question_instruction
    if modified_item_type is not None:
        item_type = modified_item_type

    sentences = splitter.tokenize(clean(raw_text))
    if not sentences:
        return df_question_instruction

    if item_type == 'REQUEST':
        # Requests keep the link to their answer block; each sentence may
        # still turn out to be a showcard instruction.
        answer_id = get_answer_id(node, parent_map)
        rows = [
            {'answer_id': answer_id, 'item_name': item_name,
             'item_type': identify_showcard_instruction(sentence, country_language),
             'text': sentence}
            for sentence in sentences
        ]
    else:
        rows = [
            {'answer_id': None, 'item_name': item_name,
             'item_type': item_type, 'text': sentence}
            for sentence in sentences
        ]

    # DataFrame.append no longer exists in pandas 2.x; concatenate all new
    # rows in one go instead.
    return pd.concat([df_question_instruction, pd.DataFrame(rows)], ignore_index=True)
def adjust_item_name(item_name):
    """
    Adjust item_name inconsistencies (and item_type in some cases) present in source XML file.

    Args:
        item_name (string): item_name metadata, extracted from input file.

    Returns:
        A tuple (item_name, item_type). item_name is the adjusted name, or
        'REJECT QUESTION' when the item must be skipped. item_type is
        'INTRODUCTION' or 'INSTRUCTION' when the name implies one, and None
        when the caller's item type should be kept.
    """
    rejected_names = ('Labels_in_F1_F4', 'Instruction', 'I', 'I_below_C33_C35')
    if re.match(r'F([2-3]|N)_0[2-9]', item_name) or item_name in rejected_names:
        return 'REJECT QUESTION', None

    # Names that are re-attached to a specific question.
    if 'INTRO_G_2' in item_name:
        return 'G8a', 'INTRODUCTION'
    if 'Intro_to_C33_35' in item_name:
        return 'C33', 'INTRODUCTION'
    if 'I_after_D30_32' in item_name:
        return 'D32', 'INSTRUCTION'

    # Keep only what follows the first lower-case 'in'. Note this is a plain
    # substring split, so 'Administration Note' becomes 'istration Note'.
    if 'in' in item_name and 'minutes' not in item_name:
        item_name = item_name.split('in')[1]

    # "<something> above/after <question>": the question name is the last
    # token, separated by '_' when the name contains one, by a space otherwise.
    if 'above' in item_name or 'after' in item_name:
        separator = '_' if '_' in item_name else ' '
        return item_name.split(separator)[-1], 'INTRODUCTION'

    if ' below ' in item_name:
        return item_name.split(' below ')[-1], 'INTRODUCTION'

    # Drop sub-item suffixes ('B13_1' -> 'B13') and fold dotted names
    # ('F1.A' -> 'F1a').
    if '_' in item_name:
        item_name = item_name.split('_')[0]
    if '.' in item_name:
        parts = item_name.split('.')
        item_name = parts[0] + parts[1].lower()

    return item_name, None
def process_question_instruction_node(ess_questions_instructions, df_question_instruction, parent_map, splitter, country_language, extract_source):
    """
    Iterate through question nodes to extract questions and instructions (introduction is not present in metadata).

    Args:
        ess_questions_instructions: question and instruction nodes.
        df_question_instruction (pandas dataframe): a dataframe to store processed question and instruction segments.
        parent_map (dictionary): a dictionary containing information about parent-child relationships in XML tree.
        splitter (NLTK object): sentence segmentation from NLTK library.
        country_language (string): country and language metadata, extracted from the input file name.
        extract_source (int): flag that indicates if the script should extract the ENG_SOURCE data (1) or the target language.

    Returns:
        Updated df_question_instruction dataframe, with new question and instruction segments.
    """
    want_source = extract_source == 1
    # Name of the most recent <question> node; the <text> nodes that follow
    # belong to it.
    item_name = None

    for question in ess_questions_instructions:
        # Element.getiterator() was removed in Python 3.9; iter() is the
        # equivalent depth-first traversal.
        for node in question.iter():
            if node.tag == 'question' and 'name' in node.attrib and 'tmt_id' in node.attrib:
                item_name = node.attrib['name']

            if node.tag != 'text' or 'translation_id' not in node.attrib:
                continue
            # translation_id == 1 is the english (source) version.
            if (node.attrib['translation_id'] == '1') != want_source:
                continue
            if item_name is None:
                # Text that precedes every named question cannot be attributed.
                continue

            type_name = parent_map[node].attrib.get('type_name')
            ignored = (ignore.match(item_name) is not None
                       or ignore_f3_dk_rf.match(item_name) is not None)

            if type_name == 'QText':
                if not ignored:
                    df_question_instruction = segment_question_instruction(
                        df_question_instruction, parent_map, node, item_name,
                        'REQUEST', splitter, country_language)
            elif type_name == 'QInstruction':
                # NOTE(review): instructions of ignored items are skipped for
                # target languages but kept for the English source; this
                # asymmetry is carried over unchanged -- confirm it is intended.
                if want_source or not ignored:
                    df_question_instruction = segment_question_instruction(
                        df_question_instruction, parent_map, node, item_name,
                        'INSTRUCTION', splitter, country_language)

    return df_question_instruction
def process_answer_node(ess_answers, df_answers, parent_map, ess_special_answer_categories, extract_source):
    """
    Iterate through answer nodes to extract answer segments.

    Args:
        ess_answers: answer nodes.
        df_answers (pandas dataframe): a dataframe to store processed answer segments.
        parent_map (dictionary): a dictionary containing information about parent-child relationships in XML tree.
        ess_special_answer_categories (Python object): instance of SpecialAnswerCategories object, in accordance to the country_language.
        extract_source (int): flag that indicates if the script should extract the ENG_SOURCE data (1) or the target language.

    Returns:
        Updated df_answers dataframe, with new answer segments.
    """
    want_source = extract_source == 1
    new_rows = []
    # The text of the most recently stored answer is needed to skip
    # immediate repetitions; seed it from the incoming dataframe.
    has_rows = not df_answers.empty
    last_text = df_answers.iloc[-1]['text'] if has_rows else None
    # Name of the most recent <answer> node; following <text> nodes belong to it.
    item_name = None

    for answer in ess_answers:
        # Element.getiterator() was removed in Python 3.9; iter() is the
        # equivalent depth-first traversal.
        for node in answer.iter():
            if node.tag == 'answer' and 'name' in node.attrib and 'tmt_id' in node.attrib:
                item_name = node.attrib['name']
                if '_' in item_name:
                    item_name = item_name.split('_')[1]

            if node.tag != 'text' or 'translation_id' not in node.attrib:
                continue
            # translation_id == 1 is the english (source) version.
            if (node.attrib['translation_id'] == '1') != want_source:
                continue

            text = node.text
            if not isinstance(text, str) or text == '' or 'does not exist in' in text:
                continue
            if item_name is None:
                # Text that precedes every named answer cannot be attributed.
                continue

            parent = parent_map[node]
            # The answer block id lives three levels above the text node.
            answer_id = parent_map[parent_map[parent]].attrib['tmt_id']
            if 'labelvalue' in parent.attrib:
                item_value = parent.attrib['labelvalue']
            else:
                item_value = parent.attrib['order']

            text = clean_answer_category(text)
            # Skip a category that merely repeats the previously stored one.
            if has_rows and text == last_text:
                continue

            text, item_value = check_if_answer_is_special_category(text, item_value, ess_special_answer_categories)
            # A category whose text is nothing but its own label value
            # carries no information.
            if 'labelvalue' in parent.attrib and str(parent.attrib['labelvalue']) == str(text):
                continue

            new_rows.append({'answer_id': answer_id, 'item_name': item_name, 'item_type': 'RESPONSE',
                             'text': text, 'item_value': str(item_value)})
            has_rows = True
            last_text = text

    if new_rows:
        # DataFrame.append no longer exists in pandas 2.x; concatenate all
        # new rows in one go instead.
        df_answers = pd.concat([df_answers, pd.DataFrame(new_rows)], ignore_index=True)
    return df_answers
def set_initial_structures(filename, extract_source):
    """
    Set initial structures that are necessary for the extraction of each questionnaire.

    Args:
        filename (string): name of the input file.
        extract_source (int): flag that indicates if the script should extract the ENG_SOURCE data (1) or the target language.

    Returns:
        df_questionnaire to store questionnaire data (pandas dataframe),
        survey_item_prefix, which is the prefix of survey_item_ID (string),
        study/country_language, which are metadata parameters embedded in the file name (string and string)
        and sentence splitter to segment request/instruction segments when necessary (NLTK object).
    """
    # A pandas dataframe to store questionnaire data.
    df_questionnaire = pd.DataFrame(columns=['survey_item_ID', 'Study', 'module', 'item_type', 'item_name', 'item_value', 'text'])

    # When the English source is being extracted, the metadata comes from a
    # fixed per-round name instead of the input file name.
    source_name = None
    if extract_source == 1:
        if 'ESS_R09' in filename:
            source_name = 'ESS_R09_2018_ENG_SOURCE'
        elif 'ESS_R08' in filename:
            source_name = 'ESS_R08_2016_ENG_SOURCE'

    # The survey item prefix is the metadata name followed by '_'.
    if source_name is not None:
        survey_item_prefix = source_name + '_'
    else:
        survey_item_prefix = re.sub(r'\.xml', '', filename) + '_'

    # Reset the initial survey_id suffix, because main is called iteratively
    # for every file in folder.
    ut.reset_initial_sufix()

    # Retrieve study and country_language information from the metadata name.
    study, country_language = get_country_language_and_study_info(
        source_name if source_name is not None else filename)

    # Instantiate a NLTK sentence splitter based on file input language.
    # Every source extraction uses the same English source name here.
    if extract_source == 1:
        splitter = ut.get_sentence_splitter('ESS_R09_2018_ENG_SOURCE')
    else:
        splitter = ut.get_sentence_splitter(filename)

    return df_questionnaire, survey_item_prefix, study, country_language, splitter
def main(filename):
    """
    Extract one ESS XML questionnaire and write it as a tab-separated spreadsheet.

    The output file is named after the survey item prefix (without its
    trailing underscore) plus '.csv'.

    Args:
        filename (string): name of the input XML file.
    """
    extract_source = 0

    # Parse the input XML file by filename.
    tree = ET.parse(str(filename))
    root = tree.getroot()

    # Create a dictionary containing parent-child relations of the parsed
    # tree (Element.getiterator() was removed in Python 3.9; use iter()).
    parent_map = {child: parent for parent in tree.iter() for child in parent}

    ess_questions_instructions = root.findall('.//questionnaire/questions')
    ess_answers = root.findall('.//questionnaire/answers')

    df_questionnaire, survey_item_prefix, study, country_language, splitter = set_initial_structures(filename, extract_source)

    # Country-specific wording of the refusal / don't know categories.
    ess_special_answer_categories = instantiate_special_answer_category_object(country_language)
    if 'GER_AT' in filename:
        ess_special_answer_categories.refuse[0] = 'Verweigert'
        ess_special_answer_categories.dont_know[0] = 'Weiß nicht'
    if 'GER_CH' in filename:
        ess_special_answer_categories.refuse[0] = 'Antwort verweigert'
        ess_special_answer_categories.dont_know[0] = 'Weiss nicht'
    if 'GER_DE' in filename:
        ess_special_answer_categories.refuse[0] = 'Antwort verweigert'
    if 'NOR_NO' in filename:
        ess_special_answer_categories.refuse[0] = 'Nekter'

    df_question_instruction = pd.DataFrame(columns=['answer_id', 'item_name', 'item_type', 'text'])
    df_answers = pd.DataFrame(columns=['answer_id', 'item_name', 'text', 'item_value'])

    df_question_instruction = process_question_instruction_node(ess_questions_instructions, df_question_instruction, parent_map,
        splitter, country_language, extract_source)
    df_answers = process_answer_node(ess_answers, df_answers, parent_map, ess_special_answer_categories, extract_source)

    # Rows are collected in a list and turned into a dataframe once at the
    # end; DataFrame.append no longer exists in pandas 2.x.
    questionnaire_rows = []

    for unique_item_name in df_question_instruction.item_name.unique():
        wanted = unique_item_name.lower()
        df_question_instruction_by_item_name = df_question_instruction[df_question_instruction['item_name'].str.lower() == wanted]
        df_answers_by_item_name = df_answers[df_answers['item_name'].str.lower() == wanted]

        module = retrieve_item_module(unique_item_name, study)
        last_item_name = ''

        for _, row in df_question_instruction_by_item_name.iterrows():
            item_name = row['item_name']
            if item_name == 'Instruction' or item_name == 'Intro':
                item_name = last_item_name

            # Administrative entries are not part of the questionnaire.
            # 'istration Note' is what adjust_item_name leaves of
            # 'Administration Note'.
            is_administrative = ('Row ' in item_name or item_name == 'CI' or item_name == 'outro'
                                 or 'istration Note' in item_name or item_name == 'box')
            if is_administrative:
                continue

            # The very first row starts the id sequence, every later row advances it.
            if not questionnaire_rows:
                survey_item_id = ut.get_survey_item_id(survey_item_prefix)
            else:
                survey_item_id = ut.update_survey_item_id(survey_item_prefix)

            if check_if_segment_is_instruction(row['text'], country_language):
                item_type = 'INSTRUCTION'
            else:
                item_type = row['item_type']

            questionnaire_rows.append({'survey_item_ID': survey_item_id, 'Study': study,
                'module': module, 'item_type': item_type,
                'item_name': item_name, 'item_value': None, 'text': row['text']})
            last_item_name = item_name

        # Answers follow the question segments of their item and reuse the
        # item name of the last processed question segment.
        for _, row in df_answers_by_item_name.iterrows():
            # process_answer_node stores values through str(), so a missing
            # value arrives as the string 'None'.
            item_value = None if row['item_value'] == 'None' else row['item_value']
            questionnaire_rows.append({'survey_item_ID': ut.update_survey_item_id(survey_item_prefix), 'Study': study,
                'module': module, 'item_type': row['item_type'],
                'item_name': item_name, 'item_value': item_value, 'text': row['text']})

    df_questionnaire = pd.DataFrame(questionnaire_rows, columns=list(df_questionnaire.columns))
    df_questionnaire.to_csv(survey_item_prefix[:-1]+'.csv', encoding='utf-8-sig', sep='\t', index=False)
if __name__ == "__main__":
    # The input XML file is the only (mandatory) command line argument.
    if len(sys.argv) < 2:
        sys.exit("Usage: python {} <ESS XML file>".format(sys.argv[0]))
    filename = str(sys.argv[1])
    print("Executing data cleaning/extraction script for ESS (xml files)")
    main(filename)