Source code for preprocessing.share_covid_data_extraction

import sys
import utils as ut
import nltk
import pandas as pd
import os
import re


def get_language_country_iso_codes(language_country):
    """
    Returns the ISO codes for language and country based on the values retrieved from the input file.
    Only for target languages of MCSQ.

    Args:
        param1 language_country (string): language and country information retrieved from the input file.

    Returns:
        language_country (string). Variable representing the language and country metadata in ISO codes.
    """
    if language_country in ['seSE', 'nlNL', 'dkDK', 'grGR', 'nlBE', 'heIL', 'arIL', 'siSI', 'hrHR', 'fiFI', 'bgBG', 'seFI', 'eeEE', 'plPL', 'roRO', 'skSK', 'mtMT', 'huHU', 'lvLV', 'ltLT', 'grCY']:
        return None

    iso_codes = {'en': 'ENG_SOURCE', 'enMT': 'ENG_MT', 'deAT': 'GER_AT', 'deCH': 'GER_CH',
        'deDE': 'GER_DE', 'deLU': 'GER_LU', 'czCZ': 'CZE_CZ', 'frFR': 'FRE_FR', 'frCH': 'FRE_CH',
        'frBE': 'FRE_BE', 'frLU': 'FRE_LU', 'itIT': 'ITA_IT', 'itCH': 'ITA_CH', 'esES': 'SPA_ES',
        'ptPT': 'POR_PT', 'ruEE': 'RUS_EE', 'ruLV': 'RUS_LV', 'ruIL': 'RUS_IL'}

    return iso_codes.get(language_country)

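# A minimal sanity check of the mapping above (not part of the original module; the calls
# below only restate behavior already encoded in get_language_country_iso_codes):
#
#   assert get_language_country_iso_codes('deAT') == 'GER_AT'
#   assert get_language_country_iso_codes('en') == 'ENG_SOURCE'
#   assert get_language_country_iso_codes('seSE') is None   # non-target pair, skipped in main()
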
def retrieve_module_from_item_name(item_name):
    """
    Returns the module of the question based on the item_name variable. This information comes from
    http://www.share-project.org/special-data-sets/share-covid-19-questionnaire.html

    Args:
        param1 item_name (string): item_name information retrieved from the input file.

    Returns:
        module (string). Module of the question.
    """
    if 'CAA' in item_name or 'CADN' in item_name:
        return 'A - Intro and basic demographics'
    elif ('CAPH' in item_name or 'CAH' in item_name or 'CAMH' in item_name) and item_name != 'CAHH017_':
        return 'H - Health (physical and mental) and health behavior'
    elif 'CAC' in item_name and item_name != 'CACO007_':
        return 'C - Corona-related infection'
    elif 'CAQ' in item_name:
        return 'Q - Quality of healthcare'
    elif 'CAW' in item_name or 'CAEP' in item_name:
        return 'W - Work'
    elif 'CAE' in item_name or item_name == 'CACO007_' or item_name == 'CAHH017_':
        return 'E - Economic situation'
    elif 'CAS' in item_name:
        return 'S - Social Networks'
    elif 'CAF' in item_name:
        return 'F - Finale'

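# A hedged illustration of the routing above (the item names are used only as examples of the
# prefixes they contain; 'CAHH017_' is the one item explicitly sent to module E even though
# it also contains 'CAH'):
#
#   assert retrieve_module_from_item_name('CADN002_') == 'A - Intro and basic demographics'
#   assert retrieve_module_from_item_name('CAHH017_') == 'E - Economic situation'
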
def set_initial_structures(language_country):
    """
    Sets the initial structures that are necessary for the extraction of each questionnaire.

    Args:
        param1 language_country (string): language and country of the subdataframe being analyzed.

    Returns:
        df_questionnaire to store questionnaire data (pandas dataframe), survey_item_prefix,
        which is the prefix of survey_item_ID (string), and the sentence splitter to segment
        request/introduction/instruction segments when necessary (NLTK object).
    """
    """
    A pandas dataframe to store questionnaire data.
    """
    df_questionnaire = pd.DataFrame(columns=['survey_item_ID', 'Study', 'module', 'item_type', 'item_name', 'item_value', 'text'])

    """
    The prefix of a SHARE COVID-19 survey item is study+'_'+language+'_'+country+'_'
    """
    survey_item_prefix = 'SHA_COVID_2020_'+language_country+'_'

    """
    Reset the initial survey_id suffix, because main is called iteratively for every file in the folder.
    """
    ut.reset_initial_sufix()

    """
    Instantiate an NLTK sentence splitter based on the input file language.
    """
    splitter = ut.get_sentence_splitter(language_country)

    return df_questionnaire, survey_item_prefix, splitter

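# Example of the returned structures, assuming the utils helpers behave as they are used in
# this module (the concrete survey_item_ID counter lives in utils):
#
#   df_q, prefix, splitter = set_initial_structures('GER_DE')
#   # prefix == 'SHA_COVID_2020_GER_DE_'; df_q is an empty dataframe with the seven columns above.
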
def replace_abbreviations_and_fills(sentence):
    """
    Replaces abbreviations and fill placeholders in the text of the input file.

    Args:
        param1 sentence (string): text segment from the input file.

    Returns:
        sentence (string). Text segment without abbreviations and fill placeholders.
    """
    if ' Ud.' in sentence:
        sentence = sentence.replace(' Ud.', ' usted')
    if ' R ' in sentence:
        sentence = sentence.replace(' R ', ' respondent ')
    if ' R.' in sentence:
        sentence = sentence.replace(' R.', ' respondent.')
    if " R's" in sentence:
        sentence = sentence.replace(" R's", " respondent's")
    if 'IWER' in sentence:
        sentence = sentence.replace('IWER', 'Interviewer')
    if '[FILL in name of CTL institution]' in sentence:
        sentence = sentence.replace('[FILL in name of CTL institution]', '[Institution]')
    if '[FILL in name of Survey Agency]' in sentence:
        sentence = sentence.replace('[FILL in name of Survey Agency]', '[Survey Agency]')
    if '[FILL in telephone number of survey agency]' in sentence:
        sentence = sentence.replace('[FILL in telephone number of survey agency]', '[Telephone of Survey Agency]')

    sentence = re.sub(" :", ":", sentence)
    sentence = re.sub("’", "'", sentence)
    sentence = re.sub("…", "...", sentence)
    sentence = sentence.replace(" ", " ")
    sentence = sentence.replace(" ?", "?")

    return sentence

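# A small illustration of the normalization above (the input sentence is invented):
#
#   replace_abbreviations_and_fills("IWER: Ask R about the [FILL in name of Survey Agency] letter ?")
#   # -> 'Interviewer: Ask respondent about the [Survey Agency] letter?'
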
def preprocess_answer_segment(row, df_questionnaire, survey_item_prefix, splitter):
    """
    Extracts and processes the answer segments from the input file.

    Args:
        param1 row (pandas dataframe object): dataframe row being currently analyzed.
        param2 df_questionnaire (pandas dataframe): pandas dataframe to store questionnaire data.
        param3 survey_item_prefix (string): prefix of survey_item_ID.
        param4 splitter (NLTK object): NLTK object for sentence segmentation, instantiated in accordance with the language.

    Returns:
        updated df_questionnaire with new valid answer segments.
    """
    sentence = replace_abbreviations_and_fills(row['text'])

    if df_questionnaire.empty:
        survey_item_id = ut.get_survey_item_id(survey_item_prefix)
    else:
        survey_item_id = ut.update_survey_item_id(survey_item_prefix)

    data = {"survey_item_ID": survey_item_id, 'Study': survey_item_prefix[:-1],
        'module': retrieve_module_from_item_name(row['name']), 'item_type': 'RESPONSE',
        'item_name': row['name'], 'item_value': row['item_order'], 'text': sentence}
    df_questionnaire = df_questionnaire.append(data, ignore_index=True)

    return df_questionnaire

def preprocess_question_segment(row, df_questionnaire, survey_item_prefix, splitter):
    """
    Extracts and processes the question segments from the input file.

    Args:
        param1 row (pandas dataframe object): dataframe row being currently analyzed.
        param2 df_questionnaire (pandas dataframe): pandas dataframe to store questionnaire data.
        param3 survey_item_prefix (string): prefix of survey_item_ID.
        param4 splitter (NLTK object): NLTK object for sentence segmentation, instantiated in accordance with the language.

    Returns:
        updated df_questionnaire with new valid question segments.
    """
    raw_item = replace_abbreviations_and_fills(row['text'])
    sentences = splitter.tokenize(raw_item)

    for sentence in sentences:
        if df_questionnaire.empty:
            survey_item_id = ut.get_survey_item_id(survey_item_prefix)
        else:
            survey_item_id = ut.update_survey_item_id(survey_item_prefix)

        data = {"survey_item_ID": survey_item_id, 'Study': survey_item_prefix[:-1],
            'module': retrieve_module_from_item_name(row['name']), 'item_type': 'REQUEST',
            'item_name': row['name'], 'item_value': None, 'text': sentence}
        df_questionnaire = df_questionnaire.append(data, ignore_index=True)

    return df_questionnaire

def preprocess_instruction_segment(row, df_questionnaire, survey_item_prefix, splitter):
    """
    Extracts and processes the instruction segments from the input file.

    Args:
        param1 row (pandas dataframe object): dataframe row being currently analyzed.
        param2 df_questionnaire (pandas dataframe): pandas dataframe to store questionnaire data.
        param3 survey_item_prefix (string): prefix of survey_item_ID.
        param4 splitter (NLTK object): NLTK object for sentence segmentation, instantiated in accordance with the language.

    Returns:
        updated df_questionnaire with new valid instruction segments.
    """
    raw_item = replace_abbreviations_and_fills(row['text'])
    sentences = splitter.tokenize(raw_item)

    for sentence in sentences:
        if df_questionnaire.empty:
            survey_item_id = ut.get_survey_item_id(survey_item_prefix)
        else:
            survey_item_id = ut.update_survey_item_id(survey_item_prefix)

        data = {"survey_item_ID": survey_item_id, 'Study': survey_item_prefix[:-1],
            'module': retrieve_module_from_item_name(row['name']), 'item_type': 'INSTRUCTION',
            'item_name': row['name'], 'item_value': None, 'text': sentence}
        df_questionnaire = df_questionnaire.append(data, ignore_index=True)

    return df_questionnaire

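# Note on the three preprocess_* helpers above: 'Response option' rows are stored as single
# RESPONSE records whose item_value is the row's item_order, while question and instruction
# texts are sentence-split with the NLTK splitter and stored as REQUEST/INSTRUCTION records
# with item_value None. The exact survey_item_ID values depend on the counter kept in utils.
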
def main(folder_path):
    os.chdir(folder_path)
    files = os.listdir()

    for file in files:
        if file.endswith(".csv"):
            questionnaire = pd.read_csv(file)
            unique_country_language_pairs = questionnaire['lang'].unique()

            for pair in unique_country_language_pairs:
                language_country = get_language_country_iso_codes(pair)
                if language_country:
                    df_questionnaire, survey_item_prefix, splitter = set_initial_structures(language_country)
                    filter_df_by_lang = questionnaire['lang'] == pair
                    df = questionnaire[filter_df_by_lang]

                    for i, row in df.iterrows():
                        if row['item_type'] == 'QText':
                            df_questionnaire = preprocess_question_segment(row, df_questionnaire, survey_item_prefix, splitter)
                        elif row['item_type'] == 'Response option':
                            df_questionnaire = preprocess_answer_segment(row, df_questionnaire, survey_item_prefix, splitter)
                        elif row['item_type'] == 'QInstruction':
                            df_questionnaire = preprocess_instruction_segment(row, df_questionnaire, survey_item_prefix, splitter)

                    df_questionnaire.to_csv(survey_item_prefix[:-1]+'.csv', encoding='utf-8-sig', sep='\t', index=False)


if __name__ == "__main__":
    folder_path = str(sys.argv[1])
    print("Executing data extraction script for SHARE COVID-19 questionnaires")
    main(folder_path)
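# Usage sketch (the folder path is hypothetical): the script expects a folder of SHARE COVID-19
# questionnaire CSV exports with 'lang', 'name', 'item_type', 'item_order' and 'text' columns,
# and writes one tab-separated file per target language/country, e.g. SHA_COVID_2020_GER_DE.csv.
#
#   python share_covid_data_extraction.py /path/to/share_covid_csv_files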