bench_generator.mappings
This module holds the Mappings class which scales the dataset size by the number of members in a dataset such as number of rows for tabular data.
#!/usr/bin/env python3

"""
This module holds the Mappings class which scales the dataset size
by the number of members in a dataset such as number of rows for tabular data.
"""

import os
import string
from pandas import DataFrame
from rdflib.namespace import RDF
from rdflib import Graph, URIRef, BNode, Literal, Namespace
from bench_generator.scenario import Scenario
from bench_generator.logger import Logger

# File names written below <instance path>/data/shared.
DATA_FILE = 'data.csv'
CSV_MAPPING_FILE = 'mapping.rml.ttl'
RDB_MAPPING_FILE = 'mapping.r2rml.ttl'

# Vocabulary namespaces bound as prefixes on the generated mapping graph.
R2RML = Namespace('http://www.w3.org/ns/r2rml#')
QL = Namespace('http://semweb.mmlab.be/ns/ql#')
EX = Namespace('http://example.com/')


class Mappings(Scenario):
    """Scenario scaling the number of Triples Maps and Predicate Object Maps.

    An instance consists of a generated CSV data file and an [R2]RML mapping
    with ``number_of_tms`` Triples Maps that each carry ``number_of_poms``
    Predicate Object Maps, plus the metadata produced by
    ``_generate_scenario``.
    """

    def __init__(self, main_directory: str, verbose: bool, number_of_tms: int,
                 number_of_poms: int, number_of_members: int,
                 number_of_properties: int, value_size: int, data_format: str,
                 engine: str):
        """Initialize a Mappings scenario.

        Parameters
        ----------
        main_directory : str
            Root directory for generating instances of Mappings.
        verbose : bool
            Verbose logging enabled or not.
        number_of_tms : int
            Number of Triples Maps to generate in the mapping.
        number_of_poms : int
            Number of Predicate Object Maps to generate per Triples Map.
        number_of_members : int
            Number of members to generate, for example 5000 for 5K rows in a
            tabular data structure.
        number_of_properties : int
            Number of properties per member to generate, for example 20 for
            20 columns in a tabular data structure.
        value_size : int
            Number of characters to append to every generated value, for
            example: 256 appends ``_`` plus 256 ASCII letters to each value.
        data_format : str
            Data format to use for generating the data set. Only "csv" is
            currently accepted by this scenario.
        engine : str
            Engine to use for execution of the generated scenario's instance,
            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
            or "OntopMaterialize"

        Raises
        ------
        NotImplementedError
            If ``data_format`` is anything other than "csv".
        """
        self._number_of_tms: int = number_of_tms
        self._number_of_poms: int = number_of_poms
        self._number_of_members: int = number_of_members
        self._number_of_properties: int = number_of_properties
        self._value_size: int = value_size

        # NOTE(review): _data_format, _engine, _main_directory and _verbose
        # are presumably set by Scenario.__init__ - confirm against the base.
        super().__init__(data_format, engine, main_directory, verbose)
        if self._data_format != 'csv':
            raise NotImplementedError(f'Data format {self._data_format} '
                                      f'is not implemented by {__name__}')

        self._logger = Logger(__name__, self._main_directory, self._verbose)

    def generate(self) -> bool:
        """Generate the instance using the Mappings scenario.

        Only CSV files are currently implemented! The PostgreSQL branch is
        kept for future use; the constructor rejects that format today.

        Returns
        -------
        success : bool
            Result of the format specific generator.

        Raises
        ------
        NotImplementedError
            If the data format is neither "csv" nor "postgresql".
        """
        if self._data_format == 'csv':
            return self._generate_csv()
        elif self._data_format == 'postgresql':
            return self._generate_postgresql()
        else:
            raise NotImplementedError(f'Data format {self._data_format} '
                                      f'is not implemented by {__name__}')

    def path(self) -> str:
        """Builds the file path for the instance of a Mappings scenario.

        The directory is created when it does not exist yet and the path is
        logged at debug level on every call.

        Returns
        -------
        path : str
            File path for the Mappings instance.
        """
        key = f'mappings_{self._number_of_tms}_' \
              f'{self._number_of_poms}'
        path = os.path.join(self._main_directory, self._engine,
                            self._data_format, key)
        self._logger.debug(f'Generating to {path}')
        os.makedirs(path, exist_ok=True)
        return path

    def _generate_dataframe(self, member_offset: int = 1,
                            property_offset: int = 1) -> DataFrame:
        """Generate the tabular data for this instance.

        The frame has an ``id`` column followed by one column ``p{j}`` per
        property. Every cell holds ``V_{j}-{i}`` where ``j`` is the property
        number and ``i`` the value index of the row, optionally followed by
        ``_`` and ``value_size`` ASCII letters.

        Parameters
        ----------
        member_offset : int
            First value of the ``id`` column. Default 1 (no offset).
        property_offset : int
            First value index ``i`` used inside the generated values.
            Default 1 (no offset).

        Returns
        -------
        dataframe : DataFrame
            Pandas DataFrame with one row per member.
        """
        member_ids = range(member_offset,
                           self._number_of_members + member_offset)
        value_ids = range(property_offset,
                          self._number_of_members + property_offset)

        # The padding is the same for every cell, so build it once. The
        # modulo wraps around the alphabet when value_size exceeds its length.
        suffix = ''
        if self._value_size > 0:
            letters = string.ascii_letters
            suffix = '_' + ''.join(letters[n % len(letters)]
                                   for n in range(self._value_size))

        data: dict = {'id': member_ids}
        for j in range(1, self._number_of_properties + 1):
            data[f'p{j}'] = [f'V_{j}-{i}{suffix}' for i in value_ids]

        return DataFrame(data)

    def _generate_mapping(self) -> Graph:
        """Generate a [R2]RML mapping for a Mappings instance.

        Triples Map ``i`` uses the subject template
        ``http://ex.com/table/{p{i}}`` and gets one Predicate Object Map per
        ``p{j}`` for ``j`` in ``1..number_of_poms``.

        Returns
        -------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.

        Raises
        ------
        NotImplementedError
            If a Triples Map is requested for an unsupported data format.
        """
        # NOTE(review): the templates reference columns p1..p{number_of_tms}
        # and p1..p{number_of_poms}; the data only has number_of_properties
        # columns - confirm callers keep these counts compatible.
        mapping: Graph = Graph(base='http://ex.com/')
        mapping.bind('rr', R2RML)
        mapping.bind('ql', QL)
        mapping.bind('ex', EX)

        for i in range(1, self._number_of_tms + 1):
            subject_template = Literal(f'http://ex.com/table/{{p{i}}}')
            # The _add_* helpers are not defined in this class; presumably
            # they are inherited from Scenario.
            if self._data_format == 'postgresql':
                triples_map_iri = self._add_triples_map_table(mapping,
                                                              subject_template,
                                                              Literal('data'),
                                                              number=i)
            elif self._data_format == 'csv':
                csv_path = Literal(f'/data/shared/{DATA_FILE}')
                triples_map_iri = \
                    self._add_triples_map_source(mapping, subject_template,
                                                 csv_path, number=i)
            else:
                msg = f'{self._data_format} not implemented'
                raise NotImplementedError(msg)

            for j in range(1, self._number_of_poms + 1):
                self._add_predicate_object_map(mapping, triples_map_iri,
                                               EX[f'p{j}'], Literal(f'p{j}'))

        return mapping

    def _write_data_and_mapping(self, mapping_file: str) -> bool:
        """Write the data file, the mapping and the scenario metadata.

        Parameters
        ----------
        mapping_file : str
            File name of the mapping inside the shared data directory.

        Returns
        -------
        success : bool
            Always True, as before.
        """
        # Resolve the instance directory once: every path() call logs and
        # re-creates the directory.
        shared_dir = os.path.join(self.path(), 'data', 'shared')
        os.makedirs(shared_dir, exist_ok=True)

        data_path = os.path.join(shared_dir, DATA_FILE)
        self._generate_dataframe().to_csv(data_path, index=False)

        mapping: Graph = self._generate_mapping()
        mapping.serialize(destination=os.path.join(shared_dir, mapping_file),
                          format='turtle')

        # NOTE(review): the result of _generate_scenario() is ignored here,
        # so a False from it is never reported - confirm whether it should be
        # propagated instead of returning True unconditionally.
        self._generate_scenario()

        return True

    def _generate_csv(self) -> bool:
        """Generate the instance as CSV files.

        Returns
        -------
        success : bool
            True if successful, false otherwise
        """
        return self._write_data_and_mapping(CSV_MAPPING_FILE)

    def _generate_postgresql(self) -> bool:
        """Generate the instance as PostgreSQL with CSV files to load.

        Returns
        -------
        success : bool
            True if successful, false otherwise
        """
        return self._write_data_and_mapping(RDB_MAPPING_FILE)

    def _generate_scenario(self) -> bool:
        """Generate the metadata for this scenario.

        Configures the execution pipeline automatically.

        Returns
        -------
        success : bool
            Result of ``_generate_metadata``.

        Raises
        ------
        NotImplementedError
            If the data format is neither "postgresql" nor "csv".
        """
        name: str = f'mappings_{self._number_of_tms}_{self._number_of_poms}'
        description: str = f'Mappings {self._number_of_tms}TM + ' + \
                           f'{self._number_of_poms}POMs'
        iri: str = f'http://example.org/mappings/{self._number_of_tms}/' + \
                   f'{self._number_of_poms}'

        if self._data_format == 'postgresql':
            mapping_file = RDB_MAPPING_FILE
        elif self._data_format == 'csv':
            mapping_file = CSV_MAPPING_FILE
        else:
            raise NotImplementedError(f'{self._data_format} not implemented')

        return self._generate_metadata(iri, name, description, mapping_file)
# File names written below <instance path>/data/shared.
DATA_FILE = 'data.csv'
CSV_MAPPING_FILE = 'mapping.rml.ttl'
RDB_MAPPING_FILE = 'mapping.r2rml.ttl'
# Vocabulary namespaces bound as prefixes on the generated mapping graph.
R2RML = Namespace('http://www.w3.org/ns/r2rml#')
QL = Namespace('http://semweb.mmlab.be/ns/ql#')
EX = Namespace('http://example.com/')
class Mappings(Scenario):
    """Scenario scaling the number of Triples Maps and Predicate Object Maps.

    An instance consists of a generated CSV data file and an [R2]RML mapping
    with ``number_of_tms`` Triples Maps that each carry ``number_of_poms``
    Predicate Object Maps, plus the metadata produced by
    ``_generate_scenario``.
    """

    def __init__(self, main_directory: str, verbose: bool, number_of_tms: int,
                 number_of_poms: int, number_of_members: int,
                 number_of_properties: int, value_size: int, data_format: str,
                 engine: str):
        """Initialize a Mappings scenario.

        Parameters
        ----------
        main_directory : str
            Root directory for generating instances of Mappings.
        verbose : bool
            Verbose logging enabled or not.
        number_of_tms : int
            Number of Triples Maps to generate in the mapping.
        number_of_poms : int
            Number of Predicate Object Maps to generate per Triples Map.
        number_of_members : int
            Number of members to generate, for example 5000 for 5K rows in a
            tabular data structure.
        number_of_properties : int
            Number of properties per member to generate, for example 20 for
            20 columns in a tabular data structure.
        value_size : int
            Number of characters to append to every generated value, for
            example: 256 appends ``_`` plus 256 ASCII letters to each value.
        data_format : str
            Data format to use for generating the data set. Only "csv" is
            currently accepted by this scenario.
        engine : str
            Engine to use for execution of the generated scenario's instance,
            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
            or "OntopMaterialize"

        Raises
        ------
        NotImplementedError
            If ``data_format`` is anything other than "csv".
        """
        self._number_of_tms: int = number_of_tms
        self._number_of_poms: int = number_of_poms
        self._number_of_members: int = number_of_members
        self._number_of_properties: int = number_of_properties
        self._value_size: int = value_size

        # NOTE(review): _data_format, _engine, _main_directory and _verbose
        # are presumably set by Scenario.__init__ - confirm against the base.
        super().__init__(data_format, engine, main_directory, verbose)
        if self._data_format != 'csv':
            raise NotImplementedError(f'Data format {self._data_format} '
                                      f'is not implemented by {__name__}')

        self._logger = Logger(__name__, self._main_directory, self._verbose)

    def generate(self) -> bool:
        """Generate the instance using the Mappings scenario.

        Only CSV files are currently implemented! The PostgreSQL branch is
        kept for future use; the constructor rejects that format today.

        Returns
        -------
        success : bool
            Result of the format specific generator.

        Raises
        ------
        NotImplementedError
            If the data format is neither "csv" nor "postgresql".
        """
        if self._data_format == 'csv':
            return self._generate_csv()
        elif self._data_format == 'postgresql':
            return self._generate_postgresql()
        else:
            raise NotImplementedError(f'Data format {self._data_format} '
                                      f'is not implemented by {__name__}')

    def path(self) -> str:
        """Builds the file path for the instance of a Mappings scenario.

        The directory is created when it does not exist yet and the path is
        logged at debug level on every call.

        Returns
        -------
        path : str
            File path for the Mappings instance.
        """
        key = f'mappings_{self._number_of_tms}_' \
              f'{self._number_of_poms}'
        path = os.path.join(self._main_directory, self._engine,
                            self._data_format, key)
        self._logger.debug(f'Generating to {path}')
        os.makedirs(path, exist_ok=True)
        return path

    def _generate_dataframe(self, member_offset: int = 1,
                            property_offset: int = 1) -> DataFrame:
        """Generate the tabular data for this instance.

        The frame has an ``id`` column followed by one column ``p{j}`` per
        property. Every cell holds ``V_{j}-{i}`` where ``j`` is the property
        number and ``i`` the value index of the row, optionally followed by
        ``_`` and ``value_size`` ASCII letters.

        Parameters
        ----------
        member_offset : int
            First value of the ``id`` column. Default 1 (no offset).
        property_offset : int
            First value index ``i`` used inside the generated values.
            Default 1 (no offset).

        Returns
        -------
        dataframe : DataFrame
            Pandas DataFrame with one row per member.
        """
        member_ids = range(member_offset,
                           self._number_of_members + member_offset)
        value_ids = range(property_offset,
                          self._number_of_members + property_offset)

        # The padding is the same for every cell, so build it once. The
        # modulo wraps around the alphabet when value_size exceeds its length.
        suffix = ''
        if self._value_size > 0:
            letters = string.ascii_letters
            suffix = '_' + ''.join(letters[n % len(letters)]
                                   for n in range(self._value_size))

        data: dict = {'id': member_ids}
        for j in range(1, self._number_of_properties + 1):
            data[f'p{j}'] = [f'V_{j}-{i}{suffix}' for i in value_ids]

        return DataFrame(data)

    def _generate_mapping(self) -> Graph:
        """Generate a [R2]RML mapping for a Mappings instance.

        Triples Map ``i`` uses the subject template
        ``http://ex.com/table/{p{i}}`` and gets one Predicate Object Map per
        ``p{j}`` for ``j`` in ``1..number_of_poms``.

        Returns
        -------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.

        Raises
        ------
        NotImplementedError
            If a Triples Map is requested for an unsupported data format.
        """
        # NOTE(review): the templates reference columns p1..p{number_of_tms}
        # and p1..p{number_of_poms}; the data only has number_of_properties
        # columns - confirm callers keep these counts compatible.
        mapping: Graph = Graph(base='http://ex.com/')
        mapping.bind('rr', R2RML)
        mapping.bind('ql', QL)
        mapping.bind('ex', EX)

        for i in range(1, self._number_of_tms + 1):
            subject_template = Literal(f'http://ex.com/table/{{p{i}}}')
            # The _add_* helpers are not defined in this class; presumably
            # they are inherited from Scenario.
            if self._data_format == 'postgresql':
                triples_map_iri = self._add_triples_map_table(mapping,
                                                              subject_template,
                                                              Literal('data'),
                                                              number=i)
            elif self._data_format == 'csv':
                csv_path = Literal(f'/data/shared/{DATA_FILE}')
                triples_map_iri = \
                    self._add_triples_map_source(mapping, subject_template,
                                                 csv_path, number=i)
            else:
                msg = f'{self._data_format} not implemented'
                raise NotImplementedError(msg)

            for j in range(1, self._number_of_poms + 1):
                self._add_predicate_object_map(mapping, triples_map_iri,
                                               EX[f'p{j}'], Literal(f'p{j}'))

        return mapping

    def _write_data_and_mapping(self, mapping_file: str) -> bool:
        """Write the data file, the mapping and the scenario metadata.

        Parameters
        ----------
        mapping_file : str
            File name of the mapping inside the shared data directory.

        Returns
        -------
        success : bool
            Always True, as before.
        """
        # Resolve the instance directory once: every path() call logs and
        # re-creates the directory.
        shared_dir = os.path.join(self.path(), 'data', 'shared')
        os.makedirs(shared_dir, exist_ok=True)

        data_path = os.path.join(shared_dir, DATA_FILE)
        self._generate_dataframe().to_csv(data_path, index=False)

        mapping: Graph = self._generate_mapping()
        mapping.serialize(destination=os.path.join(shared_dir, mapping_file),
                          format='turtle')

        # NOTE(review): the result of _generate_scenario() is ignored here,
        # so a False from it is never reported - confirm whether it should be
        # propagated instead of returning True unconditionally.
        self._generate_scenario()

        return True

    def _generate_csv(self) -> bool:
        """Generate the instance as CSV files.

        Returns
        -------
        success : bool
            True if successful, false otherwise
        """
        return self._write_data_and_mapping(CSV_MAPPING_FILE)

    def _generate_postgresql(self) -> bool:
        """Generate the instance as PostgreSQL with CSV files to load.

        Returns
        -------
        success : bool
            True if successful, false otherwise
        """
        return self._write_data_and_mapping(RDB_MAPPING_FILE)

    def _generate_scenario(self) -> bool:
        """Generate the metadata for this scenario.

        Configures the execution pipeline automatically.

        Returns
        -------
        success : bool
            Result of ``_generate_metadata``.

        Raises
        ------
        NotImplementedError
            If the data format is neither "postgresql" nor "csv".
        """
        name: str = f'mappings_{self._number_of_tms}_{self._number_of_poms}'
        description: str = f'Mappings {self._number_of_tms}TM + ' + \
                           f'{self._number_of_poms}POMs'
        iri: str = f'http://example.org/mappings/{self._number_of_tms}/' + \
                   f'{self._number_of_poms}'

        if self._data_format == 'postgresql':
            mapping_file = RDB_MAPPING_FILE
        elif self._data_format == 'csv':
            mapping_file = CSV_MAPPING_FILE
        else:
            raise NotImplementedError(f'{self._data_format} not implemented')

        return self._generate_metadata(iri, name, description, mapping_file)
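Mappings scenario: generates a tabular data set together with an [R2]RML mapping containing a configurable number of Triples Maps and Predicate Object Maps. (The class defines no description of its own; the inherited base-class text reads: "Helper class that provides a standard way to create an ABC using inheritance.")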
Mappings( main_directory: str, verbose: bool, number_of_tms: int, number_of_poms: int, number_of_members: int, number_of_properties: int, value_size: int, data_format: str, engine: str)
def __init__(self, main_directory: str, verbose: bool, number_of_tms: int,
             number_of_poms: int, number_of_members: int,
             number_of_properties: int, value_size: int, data_format: str,
             engine: str):
    """Set up a Mappings scenario instance.

    Parameters
    ----------
    main_directory : str
        Root directory below which Mappings instances are generated.
    verbose : bool
        Whether verbose logging is enabled.
    number_of_tms : int
        Number of Triples Maps to generate in the mapping.
    number_of_poms : int
        Number of Predicate Object Maps to generate per Triples Map.
    number_of_members : int
        Number of members to generate, e.g. 5000 for 5K rows of tabular
        data.
    number_of_properties : int
        Number of properties per member, e.g. 20 for 20 columns of
        tabular data.
    value_size : int
        Number of characters appended to every generated value.
    data_format : str
        Data format of the generated data set; only "csv" is accepted.
    engine : str
        Engine that will execute the generated instance, for example:
        "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC", or
        "OntopMaterialize"

    Raises
    ------
    NotImplementedError
        If ``data_format`` is anything other than "csv".
    """
    # Scenario specific sizes are stored before the base class runs.
    self._value_size: int = value_size
    self._number_of_properties: int = number_of_properties
    self._number_of_members: int = number_of_members
    self._number_of_poms: int = number_of_poms
    self._number_of_tms: int = number_of_tms

    super().__init__(data_format, engine, main_directory, verbose)

    # Reject every format except CSV before creating the logger.
    chosen_format = self._data_format
    if chosen_format != 'csv':
        message = f'Data format {chosen_format} ' \
                  f'is not implemented by {__name__}'
        raise NotImplementedError(message)

    self._logger = Logger(__name__, self._main_directory, self._verbose)
Initialize a Mappings scenario.
Parameters
- main_directory (str): Root directory for generating instances of Mappings.
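- verbose (bool): Verbose logging enabled or not.
- number_of_tms (int): Number of Triples Maps to generate in the mapping.
- number_of_poms (int): Number of Predicate Object Maps to generate per Triples Map.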
- number_of_members (int): Number of members to generate, for example 5000 for 5K rows in a tabular data structure.
- number_of_properties (int): Number of properties per member to generate, for example 20 for 20 columns in a tabular data structure.
- value_size (int): Number of characters to append to every generated value, for example: 256 appends an underscore followed by 256 ASCII letters to each value; 0 leaves the values unpadded.
- data_format (str): Data format to use for generating the data set, for example: "csv", "json", "xml", "postgresql", "mysql". This scenario currently accepts only "csv"; any other value raises NotImplementedError.
- engine (str): Engine to use for execution of the generated scenario's instance, for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC", or "OntopMaterialize"
def
generate(self) -> bool:
def generate(self) -> bool:
    """Produce the instance for the configured data format.

    Only CSV files are currently implemented!

    Returns
    -------
    success : bool
        Result of the format specific generator.

    Raises
    ------
    NotImplementedError
        If the data format is neither "csv" nor "postgresql".
    """
    chosen_format = self._data_format

    # Guard first: anything outside the two known formats is rejected.
    if chosen_format not in ('csv', 'postgresql'):
        message = f'Data format {chosen_format} ' \
                  f'is not implemented by {__name__}'
        raise NotImplementedError(message)

    if chosen_format == 'csv':
        return self._generate_csv()

    return self._generate_postgresql()
Generate the instance using the Mappings scenario.
Only CSV files are currently implemented!
def
path(self) -> str:
def path(self) -> str:
    """Return the directory of this Mappings instance, creating it if needed.

    The location is
    ``<main_directory>/<engine>/<data_format>/mappings_<tms>_<poms>`` and
    is logged at debug level before the directory is created.

    Returns
    -------
    path : str
        File path for the Mappings instance.
    """
    instance_name = f'mappings_{self._number_of_tms}_{self._number_of_poms}'
    segments = (self._main_directory, self._engine, self._data_format,
                instance_name)
    target = os.path.join(*segments)

    self._logger.debug(f'Generating to {target}')
    os.makedirs(target, exist_ok=True)

    return target
Builds the file path for the instance of a Mappings scenario and creates that directory if it does not exist yet.
Returns
- path (str): File path for the Mappings instance.