import pandas as pd
import nltk
import sys
import os
import re
from countryspecificrequest import *
def check(n, l):
    """
    Checks whether the list contains n adjacent elements that, once sorted, form a run of consecutive integers.

    Args:
        param1 n (int): size of the window, i.e., number of adjacent elements of l inspected at a time.
        param2 l (list): list of integers.

    Returns:
        bool. True if at least one slice of n adjacent elements of l is (after sorting) a sequence of
        consecutive integers. False otherwise, including when n is not positive or l has fewer than n elements.
    """
    if n <= 0:
        return False
    # Only full-size windows are considered, slices shorter than n at the end of the list are ignored.
    windows = [l[start:start + n] for start in range(len(l) - n + 1)]
    # The former test was `sorted(sub) in range(min(l), max(l)+1)`: a list is never a member of a
    # range of integers, so the function always returned False. Compare against the expected run instead.
    return any(
        sorted(window) == list(range(min(window), min(window) + n))
        for window in windows
    )
def identify_showc_segment(list_source, list_target, item_type):
    """
    Looks in list_source and list_target for instruction segments that seem to be show card segments
    and that therefore should be aligned together.
    This is an additional strategy to correctly align instruction segments.

    A segment is a candidate when its text (last element of the segment) contains a whitespace followed by
    a number, a single capital letter, or a capital letter followed by a number (e.g., " 12", " A", " B3").
    Only the first candidate of each list is considered.

    Args:
        param1 list_source (list): list of source segments (contains segments of same item_name and item_type)
        param2 list_target (list): list of target segments (contains segments of same item_name and item_type)
        param3 item_type (string): item_type metadata being analyzed. Only 'INSTRUCTION' segments are inspected.

    Returns:
        tuple (target index, source index) of the first candidates when both lists have one.
        Otherwise (or when item_type is not 'INSTRUCTION'), the string 'No showc segment identified'.
    """
    not_found = 'No showc segment identified'
    if item_type != 'INSTRUCTION':
        return not_found

    card_reference = re.compile(r'\s\d+\b|\s[A-Z]\b|\s[A-Z]\d+\b')

    def first_candidate(segments):
        # Position of the first segment whose text looks like a show card reference, None if there is none.
        for position, segment in enumerate(segments):
            if card_reference.search(segment[-1]):
                return position
        return None

    source_position = first_candidate(list_source)
    target_position = first_candidate(list_target)

    if source_position is None or target_position is None:
        return not_found
    return (target_position, source_position)
def find_best_match(list_source, list_target, item_type):
    """
    Finds the best match for source and target segments (same item_type).
    If identify_showc_segment() finds a pair of show card instructions, that pair is returned directly.
    Otherwise the pair whose text lengths have the ratio (target length / source length) closest to 1 is chosen.

    Args:
        param1 list_source (list): list of source segments (contains segments of same item_name and item_type)
        param2 list_target (list): list of target segments (contains segments of same item_name and item_type)
        param3 item_type (string): item_type metadata being analyzed, forwarded to identify_showc_segment().

    Returns:
        alignment (tuple). Alignment pair represented by the index of target and source segments being aligned
        (index 0 = target, index 1 = source).

    Raises:
        ValueError: if no show card pair is found and one of the lists is empty (there is no candidate to choose from).
    """
    showc = identify_showc_segment(list_source, list_target, item_type)
    if showc != 'No showc segment identified':
        return showc

    # The text of a segment is its last element.
    source_lengths = [len(item[-1]) for item in list_source]
    target_lengths = [len(item[-1]) for item in list_target]

    alignment_candidates = dict()
    for target_k, target_v in enumerate(target_lengths):
        for source_k, source_v in enumerate(source_lengths):
            if source_v:
                ratio = target_v / source_v
            elif target_v:
                # An empty source text used to raise ZeroDivisionError. It is the worst possible match
                # for a non-empty target text.
                ratio = float('inf')
            else:
                # Both texts are empty: the lengths are identical.
                ratio = 1.0
            alignment_candidates[target_k, source_k] = ratio

    best_candidate = min(alignment_candidates.values(), key=lambda x: abs(x - 1))

    # When several pairs share the best ratio, the last one (in target-major order) is kept.
    alignment = None
    for k, v in alignment_candidates.items():
        if v == best_candidate:
            alignment = k
    return alignment
def get_original_index(list_source, list_target, source_segment_index, target_segment_index, aux_source, aux_target):
    """
    Maps the indexes of an aligned pair in the auxiliary lists back to the indexes in the original lists.
    The auxiliary lists shrink while the alignment is being computed, so their indexes
    do not correspond to the original ones.

    Args:
        param1 list_source (list): list of source segments (contains segments of same item_name and item_type).
        param2 list_target (list): list of target segments (contains segments of same item_name and item_type).
        param3 source_segment_index (int): index of the aligned source segment in aux_source.
        param4 target_segment_index (int): index of the aligned target segment in aux_target.
        param5 aux_source (list): auxiliary list of source segments being modified in outer loop.
        param6 aux_target (list): auxiliary list of target segments being modified in outer loop.

    Returns:
        original_index_target (int), original_index_source (int). Indexes (in list_target, list_source) of the
        first segment equal to the aligned target/source segment.
    """
    # list.index() returns the position of the first equal element.
    original_index_target = list_target.index(aux_target[target_segment_index])
    original_index_source = list_source.index(aux_source[source_segment_index])
    return original_index_target, original_index_source
def only_one_segment_in_source_align(alignment, source_segment, target_segment, list_target, aux_target, df):
    """
    Fills the dataframe with the aligned source/target pair and with the remaining target segments,
    which do not have source correspondences.
    This method is used when there is one source segment for two or more target segments.
    The alignment pair is defined in the find_best_match() method and the remaining target segments
    are included in this method, respecting the structure order.

    Segments are indexed as: 0 = survey_itemID, 1 = Study, 2 = module, 3 = item_type, 4 = item_name,
    last element (index 6 for the pairless segments) = text.

    Args:
        param1 alignment (list): Alignment pair represented by the index of target and source segments being aligned
        (index 0 = target, index 1 = source).
        param2 source_segment (list): source segment that has a match according to find_best_match().
        param3 target_segment (list): target segment that has a match according to find_best_match().
        param4 list_target (list): list of target segments (contains segments of same item_name and item_type)
        param5 aux_target (list): list of target segments, excluding the target_segment.
        param6 df (pandas dataframe): dataframe to store the questionnaire alignment

    Returns:
        df (pandas dataframe) with newly aligned survey item segments. A new dataframe is returned,
        the one received as argument is not modified.
    """
    def paired_row():
        return {'source_survey_itemID': source_segment[0], 'target_survey_itemID': target_segment[0],
                'Study': target_segment[1], 'module': target_segment[2], 'item_type': target_segment[3],
                'item_name': target_segment[4], 'item_value': None, 'source_text': source_segment[-1],
                'target_text': target_segment[-1]}

    def target_only_row(item):
        return {'source_survey_itemID': None, 'target_survey_itemID': item[0], 'Study': item[1],
                'module': item[2], 'item_type': item[3], 'item_name': item[4], 'item_value': None,
                'source_text': None, 'target_text': item[6]}

    if alignment[0] == 0:
        # The aligned target segment is the first one, all other segments go after it.
        rows = [paired_row()] + [target_only_row(item) for item in aux_target]
    elif alignment[0] == len(list_target) - 1:
        # The aligned target segment is the last one, it goes after all other segments.
        rows = [target_only_row(item) for item in aux_target] + [paired_row()]
    else:
        # The aligned target segment is neither the first nor the last, its place is found using the index.
        rows = [paired_row() if i == alignment[0] else target_only_row(item)
                for i, item in enumerate(list_target)]

    # DataFrame.append() was removed in pandas 2.0, all rows are added with a single concat instead.
    return pd.concat([df, pd.DataFrame(rows)], ignore_index=True)
def only_one_segment_in_target_align(alignment, source_segment, target_segment, list_source, aux_source, df):
    """
    Fills the dataframe with the aligned source/target pair and with the remaining source segments,
    which do not have target correspondences.
    This method is used when there is one target segment for two or more source segments.
    The alignment pair is defined in the find_best_match() method and the remaining source segments
    are included in this method, respecting the structure order.

    Segments are indexed as: 0 = survey_itemID, 1 = Study, 2 = module, 3 = item_type, 4 = item_name,
    last element (index 6 for the pairless segments) = text.

    Args:
        param1 alignment (list): Alignment pair represented by the index of target and source segments being aligned
        (index 0 = target, index 1 = source).
        param2 source_segment (list): source segment that has a match according to find_best_match().
        param3 target_segment (list): target segment that has a match according to find_best_match().
        param4 list_source (list): list of source segments (contains segments of same item_name and item_type)
        param5 aux_source (list): list of source segments, excluding the source_segment.
        param6 df (pandas dataframe): dataframe to store the questionnaire alignment

    Returns:
        df (pandas dataframe) with newly aligned survey item segments. A new dataframe is returned,
        the one received as argument is not modified.
    """
    def paired_row():
        return {'source_survey_itemID': source_segment[0], 'target_survey_itemID': target_segment[0],
                'Study': target_segment[1], 'module': target_segment[2], 'item_type': target_segment[3],
                'item_name': target_segment[4], 'item_value': None, 'source_text': source_segment[-1],
                'target_text': target_segment[-1]}

    def source_only_row(item):
        return {'source_survey_itemID': item[0], 'target_survey_itemID': None, 'Study': item[1],
                'module': item[2], 'item_type': item[3], 'item_name': item[4], 'item_value': None,
                'source_text': item[6], 'target_text': None}

    if alignment[1] == 0:
        # The aligned source segment is the first one, all other segments go after it.
        rows = [paired_row()] + [source_only_row(item) for item in aux_source]
    elif alignment[1] == len(list_source) - 1:
        # The aligned source segment is the last one, it goes after all other segments.
        rows = [source_only_row(item) for item in aux_source] + [paired_row()]
    else:
        # The aligned source segment is neither the first nor the last, its place is found using the index.
        rows = [paired_row() if i == alignment[1] else source_only_row(item)
                for i, item in enumerate(list_source)]

    # DataFrame.append() was removed in pandas 2.0, all rows are added with a single concat instead.
    return pd.concat([df, pd.DataFrame(rows)], ignore_index=True)
def treat_a_single_pairless_target(list_source, list_target, sorted_aligments, target_segments_without_pair, df):
    """
    Align source and target segments, in case where there is only one target segment without a pair.
    The pairless target segment is placed first, last, or right before the aligned pair whose target
    index immediately follows it, so the order of the target segments is respected.

    Args:
        param1 list_source (list): list of source segments (contains segments of same item_name and item_type)
        param2 list_target (list): list of target segments (contains segments of same item_name and item_type)
        param3 sorted_aligments (list): sorted list of aligned pairs (index 0 = target, index 1 = source).
        param4 target_segments_without_pair (int): index of pairless target segment
        param5 df (pandas dataframe): dataframe to store the questionnaire alignment

    Returns:
        df (pandas dataframe) with newly aligned survey item segments. A new dataframe is returned,
        the one received as argument is not modified.
    """
    def paired_row(alignment):
        source = list_source[alignment[1]]
        target = list_target[alignment[0]]
        return {'source_survey_itemID': source[0], 'target_survey_itemID': target[0],
                'Study': target[1], 'module': target[2], 'item_type': target[3], 'item_name': target[4],
                'item_value': None, 'source_text': source[-1], 'target_text': target[-1]}

    def pairless_row():
        target = list_target[target_segments_without_pair]
        return {'source_survey_itemID': None, 'target_survey_itemID': target[0],
                'Study': target[1], 'module': target[2], 'item_type': target[3], 'item_name': target[4],
                'item_value': None, 'source_text': None, 'target_text': target[-1]}

    if target_segments_without_pair == 0:
        # The pairless segment is the first target segment, the aligned pairs go after it.
        rows = [pairless_row()] + [paired_row(alignment) for alignment in sorted_aligments]
    elif target_segments_without_pair == len(list_target) - 1:
        # The pairless segment is the last target segment, it goes after the aligned pairs.
        rows = [paired_row(alignment) for alignment in sorted_aligments] + [pairless_row()]
    else:
        rows = []
        for alignment in sorted_aligments:
            # The pairless segment goes right before the pair whose target index follows it.
            if target_segments_without_pair == alignment[0] - 1:
                rows.append(pairless_row())
            rows.append(paired_row(alignment))

    if not rows:
        return df
    # DataFrame.append() was removed in pandas 2.0, all rows are added with a single concat instead.
    return pd.concat([df, pd.DataFrame(rows)], ignore_index=True)
def treat_a_single_pairless_source(list_source, list_target, sorted_aligments, source_segments_without_pair, df):
    """
    Align source and target segments, in case where there is only one source segment without a pair.
    The pairless source segment is placed first, last, or right before the aligned pair whose source
    index immediately follows it, so the order of the source segments is respected.

    Args:
        param1 list_source (list): list of source segments (contains segments of same item_name and item_type)
        param2 list_target (list): list of target segments (contains segments of same item_name and item_type)
        param3 sorted_aligments (list): sorted list of aligned pairs (index 0 = source, index 1 = target).
        param4 source_segments_without_pair (int): index of pairless source segment
        param5 df (pandas dataframe): dataframe to store the questionnaire alignment

    Returns:
        df (pandas dataframe) with newly aligned survey item segments. A new dataframe is returned,
        the one received as argument is not modified.
    """
    def paired_row(alignment):
        source = list_source[alignment[0]]
        target = list_target[alignment[1]]
        # In this method the metadata of a pair comes from the source segment.
        return {'source_survey_itemID': source[0], 'target_survey_itemID': target[0],
                'Study': source[1], 'module': source[2], 'item_type': source[3], 'item_name': source[4],
                'item_value': None, 'source_text': source[-1], 'target_text': target[-1]}

    def pairless_row():
        source = list_source[source_segments_without_pair]
        return {'source_survey_itemID': source[0], 'target_survey_itemID': None,
                'Study': source[1], 'module': source[2], 'item_type': source[3], 'item_name': source[4],
                'item_value': None, 'source_text': source[-1], 'target_text': None}

    if source_segments_without_pair == 0:
        # The pairless segment is the first source segment, the aligned pairs go after it.
        rows = [pairless_row()] + [paired_row(alignment) for alignment in sorted_aligments]
    elif source_segments_without_pair == len(list_source) - 1:
        # The pairless segment is the last source segment, it goes after the aligned pairs.
        rows = [paired_row(alignment) for alignment in sorted_aligments] + [pairless_row()]
    else:
        rows = []
        for alignment in sorted_aligments:
            # The pairless segment goes right before the pair whose source index follows it.
            if source_segments_without_pair == alignment[0] - 1:
                rows.append(pairless_row())
            rows.append(paired_row(alignment))

    if not rows:
        return df
    # DataFrame.append() was removed in pandas 2.0, all rows are added with a single concat instead.
    return pd.concat([df, pd.DataFrame(rows)], ignore_index=True)
def treat_multiple_pairless_source_segments(list_source, list_target, sorted_aligments, source_segments_without_pair, df):
    """
    Align source and target segments, in case where there are multiple source segments without a pair.

    If the pairless indexes are contiguous and the before/after checks below succeed, the pairless
    segments are written as one run before or after the aligned pairs. In every other case the pairless
    segments are interleaved with the aligned pairs following the source index order.

    Args:
        param1 list_source (list): list of source segments (contains segments of same item_name and item_type)
        param2 list_target (list): list of target segments (contains segments of same item_name and item_type)
        param3 sorted_aligments (list): sorted list of aligned pairs (index 0 = source, index 1 = target).
        param4 source_segments_without_pair (list): indexes of pairless source segments. Not modified by this method.
        param5 df (pandas dataframe): dataframe to store the questionnaire alignment

    Returns:
        df (pandas dataframe) with newly aligned survey item segments. A new dataframe is returned,
        the one received as argument is not modified.
    """
    def paired_row(alignment):
        source = list_source[alignment[0]]
        target = list_target[alignment[1]]
        # In this method the metadata of a pair comes from the source segment.
        return {'source_survey_itemID': source[0], 'target_survey_itemID': target[0],
                'Study': source[1], 'module': source[2], 'item_type': source[3], 'item_name': source[4],
                'item_value': None, 'source_text': source[-1], 'target_text': target[-1]}

    def pairless_row(index):
        source = list_source[index]
        return {'source_survey_itemID': source[0], 'target_survey_itemID': None,
                'Study': source[1], 'module': source[2], 'item_type': source[3], 'item_name': source[4],
                'item_value': None, 'source_text': source[-1], 'target_text': None}

    # Work on a copy so the list of the caller is left untouched.
    pairless = list(source_segments_without_pair)
    rows = None

    if pairless and sorted_aligments and sorted(pairless) == list(range(min(pairless), max(pairless) + 1)):
        # NOTE(review): the "after" check compares a source index against the last element (target index)
        # of the first alignment. Kept as in the original implementation, confirm if this is intended.
        if pairless[0] > sorted_aligments[0][-1]:
            rows = [paired_row(alignment) for alignment in sorted_aligments]
            rows += [pairless_row(index) for index in pairless]
        elif pairless[0] < sorted_aligments[0][0]:
            rows = [pairless_row(index) for index in pairless]
            rows += [paired_row(alignment) for alignment in sorted_aligments]

    if rows is None:
        # General case. Previously: (1) a contiguous run matching none of the checks above produced no
        # rows at all, (2) the pairless list was modified while being iterated, which skipped elements,
        # and (3) only the first pairless segment left after the last pair was written.
        remaining = sorted(pairless)
        rows = []
        for alignment in sorted_aligments:
            while remaining and remaining[0] < alignment[0]:
                rows.append(pairless_row(remaining.pop(0)))
            rows.append(paired_row(alignment))
        rows.extend(pairless_row(index) for index in remaining)

    if not rows:
        return df
    # DataFrame.append() was removed in pandas 2.0, all rows are added with a single concat instead.
    return pd.concat([df, pd.DataFrame(rows)], ignore_index=True)
def treat_multiple_pairless_target_segments(list_source, list_target, sorted_aligments, target_segments_without_pair, df):
    """
    Align source and target segments, in case where there are multiple target segments without a pair.

    If the pairless indexes are contiguous and the before/after checks below succeed, the pairless
    segments are written as one run before or after the aligned pairs. In every other case the pairless
    segments are interleaved with the aligned pairs following the target index order.

    Args:
        param1 list_source (list): list of source segments (contains segments of same item_name and item_type)
        param2 list_target (list): list of target segments (contains segments of same item_name and item_type)
        param3 sorted_aligments (list): sorted list of aligned pairs (index 0 = target, index 1 = source).
        param4 target_segments_without_pair (list): indexes of pairless target segments. Not modified by this method.
        param5 df (pandas dataframe): dataframe to store the questionnaire alignment

    Returns:
        df (pandas dataframe) with newly aligned survey item segments. A new dataframe is returned,
        the one received as argument is not modified.
    """
    def paired_row(alignment):
        source = list_source[alignment[1]]
        target = list_target[alignment[0]]
        return {'source_survey_itemID': source[0], 'target_survey_itemID': target[0],
                'Study': target[1], 'module': target[2], 'item_type': target[3], 'item_name': target[4],
                'item_value': None, 'source_text': source[-1], 'target_text': target[-1]}

    def pairless_row(index):
        target = list_target[index]
        return {'source_survey_itemID': None, 'target_survey_itemID': target[0],
                'Study': target[1], 'module': target[2], 'item_type': target[3], 'item_name': target[4],
                'item_value': None, 'source_text': None, 'target_text': target[-1]}

    # Work on a copy so the list of the caller is left untouched.
    pairless = list(target_segments_without_pair)
    rows = None

    if pairless and sorted_aligments and sorted(pairless) == list(range(min(pairless), max(pairless) + 1)):
        # NOTE(review): the "after" check compares a target index against the last element (source index)
        # of the first alignment. Kept as in the original implementation, confirm if this is intended.
        if pairless[0] > sorted_aligments[0][-1]:
            rows = [paired_row(alignment) for alignment in sorted_aligments]
            rows += [pairless_row(index) for index in pairless]
        elif pairless[0] < sorted_aligments[0][0]:
            rows = [pairless_row(index) for index in pairless]
            rows += [paired_row(alignment) for alignment in sorted_aligments]

    if rows is None:
        # General case. Previously: (1) a contiguous run matching none of the checks above produced no
        # rows at all, (2) the pairless list was modified while being iterated, which skipped elements,
        # and (3) only the first pairless segment left after the last pair was written.
        remaining = sorted(pairless)
        rows = []
        for alignment in sorted_aligments:
            while remaining and remaining[0] < alignment[0]:
                rows.append(pairless_row(remaining.pop(0)))
            rows.append(paired_row(alignment))
        rows.extend(pairless_row(index) for index in remaining)

    if not rows:
        return df
    # DataFrame.append() was removed in pandas 2.0, all rows are added with a single concat instead.
    return pd.concat([df, pd.DataFrame(rows)], ignore_index=True)
def align_more_segments_in_target(list_source, list_target, sorted_aligments, target_segments_without_pair, df):
    """
    Dispatches the alignment with more segments in target to the appropriate method, according to the
    number of pairless target segments.
    If there is exactly one pairless target segment, treat_a_single_pairless_target() is called with its index,
    otherwise treat_multiple_pairless_target_segments() is called with the whole list of indexes.

    Args:
        param1 list_source (list): list of source segments (contains segments of same item_name and item_type)
        param2 list_target (list): list of target segments (contains segments of same item_name and item_type)
        param3 sorted_aligments (list): sorted list segments aligned via best match strategy.
        param4 target_segments_without_pair (list): indexes of pairless target segments
        param5 df (pandas dataframe): dataframe to store the questionnaire alignment

    Returns:
        df (pandas dataframe) with newly aligned survey item segments.
    """
    if len(target_segments_without_pair) != 1:
        return treat_multiple_pairless_target_segments(list_source, list_target, sorted_aligments,
                                                       target_segments_without_pair, df)

    only_pairless_index = target_segments_without_pair[0]
    return treat_a_single_pairless_target(list_source, list_target, sorted_aligments, only_pairless_index, df)
def align_more_segments_in_source(list_source, list_target, sorted_aligments, source_segments_without_pair, df):
    """
    Dispatches the alignment with more segments in source to the appropriate method, according to the
    number of pairless source segments.
    If there is exactly one pairless source segment, treat_a_single_pairless_source() is called with its index,
    otherwise treat_multiple_pairless_source_segments() is called with the whole list of indexes.
    This method is called from prepare_alignment_with_more_segments_in_source().

    Args:
        param1 list_source (list): list of source segments (contains segments of same item_name and item_type)
        param2 list_target (list): list of target segments (contains segments of same item_name and item_type)
        param3 sorted_aligments (list): sorted list segments aligned via best match strategy.
        param4 source_segments_without_pair (list): indexes of pairless source segments
        param5 df (pandas dataframe): dataframe to store the questionnaire alignment

    Returns:
        df (pandas dataframe) with newly aligned survey item segments.
    """
    if len(source_segments_without_pair) != 1:
        return treat_multiple_pairless_source_segments(list_source, list_target, sorted_aligments,
                                                       source_segments_without_pair, df)

    only_pairless_index = source_segments_without_pair[0]
    return treat_a_single_pairless_source(list_source, list_target, sorted_aligments, only_pairless_index, df)
def prepare_alignment_with_more_segments_in_source(df, list_source, list_target, item_type):
    """
    Computes the alignment pairs when there are more segments in source than in target and calls the
    appropriate method to fill the dataframe, concerning the number of pairless source segments.

    The best match is repeatedly selected with find_best_match() and removed from auxiliary copies of
    the lists, until no target segment is left. The source segments never selected are the pairless ones.

    Args:
        param1 df (pandas dataframe): dataframe to store the questionnaire alignment
        param2 list_source (list): list of source segments (contains segments of same item_name and item_type)
        param3 list_target (list): list of target segments (contains segments of same item_name and item_type)
        param4 item_type (string): item_type metadata, can be REQUEST, INTRODUCTION or INSTRUCTION

    Returns:
        df (pandas dataframe) with newly aligned survey item segments.
    """
    # In the pair returned by find_best_match(): index 0 = target, index 1 = source.
    first_alignment = find_best_match(list_source, list_target, item_type)
    target_segment_index = first_alignment[0]
    source_segment_index = first_alignment[1]

    # The auxiliary lists hold the segments that were not aligned yet.
    aux_source = list_source.copy()
    del aux_source[source_segment_index]
    aux_target = list_target.copy()
    del aux_target[target_segment_index]

    # This is the case where there is only one target segment for two or more source segments.
    if not aux_target:
        return only_one_segment_in_target_align(first_alignment, list_source[source_segment_index],
                                                list_target[target_segment_index], list_source, aux_source, df)

    # There are still other target segments, call best match method again until all of them are paired.
    # From here on the pairs are stored as [source index, target index].
    alignments = [[source_segment_index, target_segment_index]]
    source_segments_paired = [source_segment_index]
    while aux_target:
        alignment = find_best_match(aux_source, aux_target, item_type)
        target_segment_index = alignment[0]
        source_segment_index = alignment[1]
        # Indexes in the auxiliary lists are translated back to indexes in the original lists.
        original_index_target, original_index_source = get_original_index(
            list_source, list_target, source_segment_index, target_segment_index, aux_source, aux_target)
        alignments.append([original_index_source, original_index_target])
        source_segments_paired.append(original_index_source)
        del aux_source[source_segment_index]
        del aux_target[target_segment_index]

    sorted_aligments = sorted(alignments)
    # The iteration order of a set is not guaranteed and the methods called next read the first element
    # as the smallest pairless index, so the indexes are sorted explicitly.
    source_segments_without_pair = sorted(set(range(len(list_source))) - set(source_segments_paired))
    return align_more_segments_in_source(list_source, list_target, sorted_aligments, source_segments_without_pair, df)
def prepare_alignment_with_more_segments_in_target(df, list_source, list_target, item_type):
    """
    Pairs source and target segments via repeated best-match searches and then calls the
    appropriate alignment method for the case where the target has more segments than the source.

    Args:
        param1 df (pandas dataframe): dataframe to store the questionnaire alignment
        param2 list_source (list): list of source segments (contains segments of same item_name and item_type)
        param3 list_target (list): list of target segments (contains segments of same item_name and item_type)
        param4 item_type (string): item_type metadata, can be REQUEST, INTRODUCTION or INSTRUCTION

    Returns:
        df (pandas dataframe) with newly aligned survey item segments.
    """
    # Best-match results are indexed as: index 0 = target, index 1 = source.
    first_alignment = find_best_match(list_source, list_target, item_type)
    target_segment_index = first_alignment[0]
    source_segment_index = first_alignment[1]

    # The auxiliary lists hold the segments that are still unpaired; the originals stay untouched.
    aux_source = list_source.copy()
    del aux_source[source_segment_index]
    aux_target = list_target.copy()
    del aux_target[target_segment_index]

    # This is the case where there is only one source segment for two or more target segments.
    if not aux_source:
        return only_one_segment_in_source_align(first_alignment, list_source[source_segment_index],
                                                list_target[target_segment_index], list_target, aux_target, df)

    # There are still other source segments: call the best match method until all of them are paired.
    alignments = [[target_segment_index, source_segment_index]]
    target_segments_paired = [target_segment_index]
    while aux_source:
        alignment = find_best_match(aux_source, aux_target, item_type)
        target_segment_index = alignment[0]
        source_segment_index = alignment[1]
        # Indexes returned above refer to the shrinking auxiliary lists; map them back to the originals.
        original_index_target, original_index_source = get_original_index(
            list_source, list_target, source_segment_index, target_segment_index, aux_source, aux_target)
        alignments.append([original_index_target, original_index_source])
        target_segments_paired.append(original_index_target)
        del aux_source[source_segment_index]
        del aux_target[target_segment_index]

    sorted_aligments = sorted(alignments)
    # Sets are unordered, so sort explicitly to hand the pairless indexes over in a deterministic order.
    target_segments_without_pair = sorted(set(range(len(list_target))) - set(target_segments_paired))
    return align_more_segments_in_target(list_source, list_target, sorted_aligments,
                                         target_segments_without_pair, df)
def same_source_target_index_card_instructions(source_index, target_index, aux_source, aux_target, list_source,
                                               list_target, item_type, df):
    """
    Align source and target instruction segments identified as showcard segments.

    The showcard pair (list_source[source_index], list_target[target_index]) is aligned together and
    the remaining segments (aux_source, aux_target) are aligned position by position. The showcard row
    is emitted at position source_index among the remaining rows, so it comes first when the index is 0
    and last when the index is the last one. Any other index is handled the same way instead of
    returning None.

    Args:
        param1 source_index (int): source showcard segment.
        param2 target_index (int): target showcard segment.
        param3 aux_source (list): auxiliary list of source segments without the showcard segment (contains segments of same item_name and item_type).
        param4 aux_target (list): auxiliary list of target segments without the showcard segment (contains segments of same item_name and item_type).
        param5 list_source (list): list of source segments (contains segments of same item_name and item_type).
        param6 list_target (list): list of target segments (contains segments of same item_name and item_type).
        param7 item_type (string): item_type metadata, can be REQUEST, INTRODUCTION or INSTRUCTION
        param8 df (pandas dataframe): dataframe to store the questionnaire alignment

    Returns:
        df (pandas dataframe) with newly aligned survey item segments.

    Raises:
        IndexError: if aux_target has fewer segments than aux_source.
    """
    def build_row(source_segment, target_segment):
        # Segments are positional records: 0 = survey item ID, 1 = Study, 2 = module, 4 = item_name, 6 = text.
        return {'source_survey_itemID': source_segment[0], 'target_survey_itemID': target_segment[0],
                'Study': source_segment[1], 'module': source_segment[2], 'item_type': item_type,
                'item_name': source_segment[4], 'item_value': None,
                'source_text': source_segment[6], 'target_text': target_segment[6]}

    # Index aux_target explicitly (rather than zip) so a length mismatch fails loudly instead of
    # silently dropping segments.
    rows = [build_row(item, aux_target[i]) for i, item in enumerate(aux_source)]
    rows.insert(source_index, build_row(list_source[source_index], list_target[target_index]))

    # DataFrame.append was removed in pandas 2.0; add all rows with a single concat instead.
    return pd.concat([df, pd.DataFrame(rows)], ignore_index=True)
def different_source_target_index_card_instructions(source_index, target_index, aux_source, aux_target, list_source,
                                                    list_target, item_type, df):
    """
    Align source and target instruction segments identified as showcard segments when the showcard
    segment sits at different positions in the source and target lists.

    For the sake of simplicity, each showcard segment is swapped with the first element of its list
    (list_source and list_target are modified in place) and the alignment is then delegated to
    same_source_target_index_card_instructions with both indexes set to 0.

    Args:
        param1 source_index (int): source showcard segment.
        param2 target_index (int): target showcard segment.
        param3 aux_source (list): auxiliary list of source segments being modified in outer loop (contains segments of same item_name and item_type).
        param4 aux_target (list): auxiliary list of target segments being modified in outer loop (contains segments of same item_name and item_type).
        param5 list_source (list): list of source segments (contains segments of same item_name and item_type).
        param6 list_target (list): list of target segments (contains segments of same item_name and item_type).
        param7 item_type (string): item_type metadata, can be REQUEST, INTRODUCTION or INSTRUCTION
        param8 df (pandas dataframe): dataframe to store the questionnaire alignment

    Returns:
        df (pandas dataframe) with newly aligned survey item segments.
    """
    # Move the source showcard segment to the front; the former first segment takes its place.
    source_card = list_source[source_index]
    list_source[source_index] = list_source[0]
    list_source[0] = source_card

    # Same swap for the target list.
    target_card = list_target[target_index]
    list_target[target_index] = list_target[0]
    list_target[0] = target_card

    return same_source_target_index_card_instructions(0, 0, aux_source, aux_target, list_source,
                                                      list_target, item_type, df)
def align_introduction_instruction_request(df, df_source, df_target, item_type):
    """
    Aligns introduction, instruction and requests segments. Differently from response segments, these
    other item types can't be merged. There are five distinct cases to consider:
    1) only target segments (df_source is empty), 2) only source segments (df_target is empty),
    3) df_source has more segments than df_target, 4) df_target has more segments than df_source and
    5) df_source and df_target have the same number of segments.

    Args:
        param1 df (pandas dataframe): dataframe to store the questionnaire alignment.
        param2 df_source (pandas dataframe): dataframe containing the data of the source questionnaire (always English).
        param3 df_target (pandas dataframe): dataframe containing the data of the target questionnaire
        param4 item_type (string): metadata that indicates if the dataframes contain introductions, instructions or requests.

    Returns:
        df (pandas dataframe) with newly aligned survey item segments.
    """
    def append_rows(frame, rows):
        # DataFrame.append was removed in pandas 2.0; add all rows with a single concat instead.
        if not rows:
            return frame
        return pd.concat([frame, pd.DataFrame(rows)], ignore_index=True)

    df_source = df_source[df_source['item_type'] == item_type]
    df_target = df_target[df_target['item_type'] == item_type]

    # Case 1: nothing on the source side, target segments are kept without a source counterpart.
    if df_source.empty:
        return append_rows(df, [
            {'source_survey_itemID': None, 'target_survey_itemID': row['survey_item_ID'], 'Study': row['Study'],
             'module': row['module'], 'item_type': item_type, 'item_name': row['item_name'], 'item_value': None,
             'source_text': None, 'target_text': row['text']}
            for _, row in df_target.iterrows()
        ])

    # Case 2: nothing on the target side, source segments are kept without a target counterpart.
    if df_target.empty:
        return append_rows(df, [
            {'source_survey_itemID': row['survey_item_ID'], 'target_survey_itemID': None, 'Study': row['Study'],
             'module': row['module'], 'item_type': item_type, 'item_name': row['item_name'], 'item_value': None,
             'source_text': row['text'], 'target_text': None}
            for _, row in df_source.iterrows()
        ])

    # From here on segments are handled as positional records.
    # NOTE(review): positions 0, 1, 2, 4 and 6 are used as survey item ID, Study, module, item_name
    # and text; this assumes that column order in the input dataframes - confirm against the CSVs.
    list_target = df_target.values.tolist()
    list_source = df_source.values.tolist()

    # Cases 3 and 4: unequal number of segments.
    if len(list_source) > len(list_target):
        return prepare_alignment_with_more_segments_in_source(df, list_source, list_target, item_type)
    if len(list_target) > len(list_source):
        return prepare_alignment_with_more_segments_in_target(df, list_source, list_target, item_type)

    # Case 5: same number of segments. Showcard detection only applies to instruction segments.
    if item_type == 'INSTRUCTION':
        showc = identify_showc_segment(list_source, list_target, item_type)
        if showc != 'No showc segment identified':
            target_index = showc[0]
            source_index = showc[1]
            aux_source = list_source.copy()
            del aux_source[source_index]
            aux_target = list_target.copy()
            del aux_target[target_index]
            if target_index == source_index:
                return same_source_target_index_card_instructions(source_index, target_index, aux_source,
                                                                  aux_target, list_source, list_target,
                                                                  item_type, df)
            return different_source_target_index_card_instructions(source_index, target_index, aux_source,
                                                                   aux_target, list_source, list_target,
                                                                   item_type, df)

    # No showcard segments involved: align position by position.
    return append_rows(df, [
        {'source_survey_itemID': item[0], 'target_survey_itemID': list_target[i][0], 'Study': item[1],
         'module': item[2], 'item_type': item_type, 'item_name': item[4], 'item_value': None,
         'source_text': item[6], 'target_text': list_target[i][6]}
        for i, item in enumerate(list_source)
    ])
def align_responses(df, df_source, df_target):
    """
    Aligns response segments by merging them on item_value metadata.

    Only rows whose item_type is RESPONSE are considered. Source and target rows are joined with the
    default (inner) pandas merge, so a response is aligned only when its item_value appears on both sides.

    Args:
        param1 df (pandas dataframe): dataframe to store the questionnaire alignment
        param2 df_source (pandas dataframe): dataframe containing the data of the source questionnaire (always English).
        param3 df_target (pandas dataframe): dataframe containing the data of the target questionnaire

    Returns:
        df (pandas dataframe) with newly aligned response segments.
    """
    df_source = df_source[df_source['item_type'] == 'RESPONSE']
    df_target = df_target[df_target['item_type'] == 'RESPONSE']

    # Columns present on both sides get the default merge suffixes: _x for source, _y for target.
    df_merge = pd.merge(df_source, df_target, on='item_value')

    rows = [
        {'source_survey_itemID': row['survey_item_ID_x'], 'target_survey_itemID': row['survey_item_ID_y'],
         'Study': row['Study_x'], 'module': row['module_x'], 'item_type': 'RESPONSE',
         'item_name': row['item_name_x'], 'item_value': row['item_value'],
         'source_text': row['text_x'], 'target_text': row['text_y']}
        for _, row in df_merge.iterrows()
    ]
    if not rows:
        return df

    # DataFrame.append was removed in pandas 2.0; add all rows with a single concat instead.
    return pd.concat([df, pd.DataFrame(rows)], ignore_index=True)
def filter_by_module(df_source, df_target, module):
    """
    Restricts the source and target dataframes to the rows of a single questionnaire module.

    Args:
        param1 df_source (pandas dataframe): dataframe containing the data of the source questionnaire (always English).
        param2 df_target (pandas dataframe): dataframe containing the data of the target questionnaire
        param3 module (string): questionnaire module being currently analyzed in outer loop.

    Returns:
        df_source (pandas dataframe) and df_target (pandas dataframe), each keeping only the rows whose
        'module' column equals the module specified by parameter.
    """
    source_in_module = df_source.loc[df_source['module'] == module]
    target_in_module = df_target.loc[df_target['module'] == module]
    return source_in_module, target_in_module
def instantiate_country_specific_request_object(study):
    """
    Instantiates the appropriate set of country-specific requests according to the study.
    Country-specific requests are deleted from alignment by design because the answer categories
    frequently change from country to country.

    Args:
        param1 study (string): study metadata, embedded in filenames.

    Returns:
        country_specific_requests (Python object). Instance of python object that encapsulates the item names of
        the country specific questions.

    Raises:
        ValueError: if the study does not contain any of the supported study/round identifiers
        (ESS_R01 to ESS_R06, EVS_R03, EVS_R04).
    """
    if 'ESS_R01' in study:
        return ESSCountrySpecificR01()
    if 'ESS_R02' in study:
        return ESSCountrySpecificR02()
    if 'ESS_R03' in study:
        return ESSCountrySpecificR03()
    if 'ESS_R04' in study:
        return ESSCountrySpecificR04()
    if 'ESS_R05' in study:
        return ESSCountrySpecificR05()
    if 'ESS_R06' in study:
        return ESSCountrySpecificR06()
    if 'EVS_R03' in study:
        return EVSCountrySpecificR03()
    if 'EVS_R04' in study:
        return EVSCountrySpecificR04()
    # Fail with a clear message instead of an UnboundLocalError when no round matches.
    raise ValueError('No country-specific requests are defined for study: ' + str(study))
def main(folder_path, filename_source, filename_target):
    """
    Aligns a source and a target questionnaire and writes the alignment to a CSV file.

    The working directory is changed to folder_path, both questionnaires are read from there, and the
    result is written there as <source_language_country>_<target_language_country>_<study>.csv.
    Only modules present in both files, and item names present in both files, are aligned.

    Args:
        param1 folder_path (string): directory that contains the input files and receives the output file.
        param2 filename_source (string): name of the CSV file with the source questionnaire.
        param3 filename_target (string): name of the CSV file with the target questionnaire.
    """
    # Input and output filenames below are resolved relative to folder_path.
    os.chdir(folder_path)
    df_source = pd.read_csv(filename_source, dtype=str)
    df_target = pd.read_csv(filename_target, dtype=str)
    df = pd.DataFrame(columns=['source_survey_itemID', 'target_survey_itemID', 'Study', 'module', 'item_type',
                               'item_name', 'item_value', 'source_text', 'target_text'])

    study = get_study_metadata(filename_source)
    target_language_country = get_target_language_country_metadata(filename_target)

    if 'EVS' in study:
        source_language_country = 'ENG_GB'
    else:
        source_language_country = 'ENG_SOURCE'

    # Country-specific requests are only defined for EVS and ESS studies.
    has_country_specific_requests = 'EVS' in study or 'ESS' in study
    if has_country_specific_requests:
        country_specific_requests = instantiate_country_specific_request_object(study)

    # Computes the intersection between the modules of source and target questionnaires.
    # We are only interested in aligning modules that are present in both files.
    intersection_modules = set(df_source.module.unique()).intersection(set(df_target.module.unique()))

    for module in sorted(intersection_modules):
        df_source_filtered, df_target_filtered = filter_by_module(df_source, df_target, module)
        # Lower-cased item names are loop invariant for the module, so compute them once.
        source_item_names_lower = df_source_filtered['item_name'].str.lower()
        target_item_names_lower = df_target_filtered['item_name'].str.lower()

        # NOTE(review): source item names that differ only by case are each matched
        # case-insensitively below, so they would be aligned more than once - confirm whether
        # that can occur in the input data.
        for unique_item_name in df_source_filtered.item_name.unique():
            item_name_lower = unique_item_name.lower()

            # Responses of country-specific requests are excluded from the alignment.
            process_responses = True
            if has_country_specific_requests and item_name_lower in country_specific_requests.item_names:
                process_responses = False

            # We are only interested in aligning questions that are present in both files.
            df_source_by_item_name = df_source_filtered[source_item_names_lower == item_name_lower]
            df_target_by_item_name = df_target_filtered[target_item_names_lower == item_name_lower]
            if not df_target_by_item_name.empty and not df_source_by_item_name.empty:
                df = align_on_metadata(df, df_source_by_item_name, df_target_by_item_name, process_responses)

    df.to_csv(source_language_country+'_'+target_language_country+'_'+study+'.csv', encoding='utf-8', index=False)
if __name__ == "__main__":
    # Expected invocation: <script> <folder_path> <filename_source> <filename_target>
    if len(sys.argv) < 4:
        # Exit with a usage message (written to stderr, exit status 1) instead of an IndexError.
        sys.exit('Usage: ' + sys.argv[0] + ' <folder_path> <filename_source> <filename_target>')
    folder_path = sys.argv[1]
    filename_source = sys.argv[2]
    filename_target = sys.argv[3]
    main(folder_path, filename_source, filename_target)