bench_generator.named_graph
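This module holds the NamedGraph class, which generates a tabular dataset together with [R2]RML mappings that assign the generated triples to a configurable number of named graphs. The dataset size scales with the number of members in the dataset, such as the number of rows for tabular data.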
#!/usr/bin/env python3

"""
This module holds the NamedGraph class which generates a tabular data set
and [R2]RML mappings that assign the generated triples to a configurable
number of named graphs. The data set size scales with the number of members,
such as the number of rows for tabular data.
"""

import os
import string
from pandas import DataFrame
from rdflib.namespace import RDF
from rdflib import Graph, URIRef, BNode, Literal, Namespace
from bench_generator.scenario import Scenario
from bench_generator.logger import Logger

DATA_FILE = 'data.csv'
CSV_MAPPING_FILE = 'mapping.rml.ttl'
RDB_MAPPING_FILE = 'mapping.r2rml.ttl'
R2RML = Namespace('http://www.w3.org/ns/r2rml#')
RML = Namespace('http://semweb.mmlab.be/ns/rml#')
QL = Namespace('http://semweb.mmlab.be/ns/ql#')
EX = Namespace('http://example.com/')


class NamedGraph(Scenario):
    """Scenario generating data and [R2]RML mappings with named graphs."""

    def __init__(self, main_directory: str, verbose: bool,
                 number_of_ng_pom: int, number_of_ng_s: int, static: bool,
                 number_of_tms: int, number_of_poms: int,
                 number_of_members: int, number_of_properties: int,
                 value_size: int, data_format: str, engine: str):
        """Initialize a NamedGraph scenario.

        Parameters
        ----------
        main_directory : str
            Root directory for generating instances of NamedGraph.
        verbose : bool
            Verbose logging enabled or not.
        number_of_ng_pom : int
            Number of named graphs per Predicate Object Map.
        number_of_ng_s : int
            Number of named graphs for Subject Map.
        static : bool
            True to attach named graphs as constant IRIs (rr:graph), False
            to attach them as rr:graphMap templates over the columns
            p1, p2, ... of the generated data.
        number_of_tms : int
            Number of Triples Maps to generate in the mapping.
        number_of_poms : int
            Number of Predicate Object Maps to generate per Triples Map.
        number_of_members : int
            Number of members to generate, for example 5000 for 5K rows in a
            tabular data structure.
        number_of_properties : int
            Number of properties per member to generate, for example 20 for
            20 columns in a tabular data structure.
        value_size : int
            Number of ASCII letters appended, after a '_' separator, to each
            generated value. 0 leaves the default values untouched.
        data_format : str
            Data format to use for generating the data set. This scenario
            implements "csv" and "postgresql".
        engine : str
            Engine to use for execution of the generated scenario's instance,
            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
            or "OntopMaterialize"
        """
        # NOTE(review): the scenario parameters are stored before calling the
        # base initializer, presumably because it may already rely on them;
        # keep this order unless Scenario.__init__ is confirmed not to.
        self._number_of_ng_pom: int = number_of_ng_pom
        self._number_of_ng_s: int = number_of_ng_s
        self._static: bool = static
        self._number_of_tms: int = number_of_tms
        self._number_of_poms: int = number_of_poms
        self._number_of_members: int = number_of_members
        self._number_of_properties: int = number_of_properties
        self._value_size: int = value_size

        super().__init__(data_format, engine, main_directory, verbose)
        self._logger = Logger(__name__, self._main_directory, self._verbose)

    def generate(self) -> bool:
        """Generate the instance using the NamedGraph scenario.

        The 'csv' and 'postgresql' data formats are implemented.

        Returns
        -------
        success : bool
            Result of the format specific generator.

        Raises
        ------
        NotImplementedError
            If the data format is neither 'csv' nor 'postgresql'.
        """
        if self._data_format == 'csv':
            return self._generate_csv()
        if self._data_format == 'postgresql':
            return self._generate_postgresql()

        raise NotImplementedError(f'Data format {self._data_format} '
                                  f'is not implemented by {__name__}')

    def path(self) -> str:
        """Builds the file path for the instance of a NamedGraph scenario.

        The directory is created when it does not exist yet.

        Returns
        -------
        path : str
            File path for the NamedGraph's instance.
        """
        key = f'namedgraph_{self._number_of_ng_s}SM-NG_' \
              f'{self._number_of_ng_pom}POM-NG_{self._number_of_tms}TM_' \
              f'{self._number_of_poms}POM_{self._static}'
        path = os.path.join(self._main_directory, self._engine,
                            self._data_format, key)
        self._logger.debug(f'Generating to {path}')
        os.makedirs(path, exist_ok=True)
        return path

    def _generate_dataframe(self, member_offset: int = 1,
                            property_offset: int = 1) -> DataFrame:
        """Generate the data set.

        The result has an 'id' column and one column 'p{j}' per property,
        holding values of the form 'V_{j}-{i}' optionally followed by '_' and
        `value_size` ASCII letters.

        Parameters
        ----------
        member_offset : int
            Offset to start member ID generation from. Default 1 (no offset).
        property_offset : int
            First number used for the '{i}' part of the generated values.
            Default 1 (no offset).

        Returns
        -------
        dataframe : DataFrame
            Panda's DataFrame with the generated data.
        """
        subject_id = range(member_offset,
                           self._number_of_members + member_offset)
        value_id = range(property_offset,
                         self._number_of_members + property_offset)
        data: dict = {'id': subject_id}

        # The suffix is identical for every value, build it once. Modulo
        # keeps the index inside the ASCII letters table.
        append_value = ''
        if self._value_size > 0:
            n_ascii = len(string.ascii_letters)
            append_value = '_' + ''.join(string.ascii_letters[n % n_ascii]
                                         for n in range(self._value_size))

        for j in range(1, self._number_of_properties + 1):
            # Generate value V_{property}-{member} honoring the value size
            data[f'p{j}'] = [f'V_{j}-{i}{append_value}' for i in value_id]

        return DataFrame(data)

    def _add_predicate_object_map(self, mapping: Graph, triplesmap_iri: URIRef,
                                  predicate_value: URIRef,
                                  object_value: Literal, named_graphs: int = 0,
                                  static: bool = True) -> BNode:
        """Insert a PredicateObjectMap into a [R2]RML mapping

        Parameters
        ----------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.
        triplesmap_iri : URIRef
            IRI of the Triples Map to insert the PredicateObjectMap in.
        predicate_value : URIRef
            Predicate IRI value for PredicateObjectMap.
        object_value : Literal
            Object value for PredicateObjectMap, used as rr:column for
            'postgresql' and as rml:reference otherwise.
        named_graphs : int
            Number of named graphs to attach, default 0.
        static : bool
            True to attach constant graph IRIs (rr:graph), False to attach
            rr:graphMap templates referencing column p{i} for graph i.
            Default True.

        Returns
        -------
        predicate_object_map_iri : BNode
            Predicate Object Map blank node ID.
        """
        predicate_object_map_iri = BNode()
        predicate_map_iri = BNode()
        object_map_iri = BNode()

        mapping.add((predicate_map_iri, R2RML.constant, predicate_value))
        mapping.add((predicate_map_iri, RDF.type, R2RML.PredicateMap))
        if self._data_format == 'postgresql':
            mapping.add((object_map_iri, R2RML.column, object_value))
        else:
            mapping.add((object_map_iri, RML.reference, object_value))
        mapping.add((object_map_iri, RDF.type, R2RML.ObjectMap))
        mapping.add((predicate_object_map_iri, R2RML.predicateMap,
                     predicate_map_iri))
        mapping.add((predicate_object_map_iri, R2RML.objectMap,
                     object_map_iri))
        mapping.add((predicate_object_map_iri, RDF.type,
                     R2RML.PredicateObjectMap))
        mapping.add((triplesmap_iri, R2RML.predicateObjectMap,
                     predicate_object_map_iri))

        # When the Subject Map has named graphs as well, the graphs of the
        # Predicate Object Map get their own 'pom/' IRI prefix.
        if self._number_of_ng_s == 0:
            graph_base = 'http://example.org/'
        else:
            graph_base = 'http://example.org/pom/'

        for i in range(1, named_graphs + 1):
            if static:
                mapping.add((predicate_object_map_iri, R2RML.graph,
                             URIRef(f'{graph_base}graph{i}')))
            else:
                graph_map_iri = BNode()
                mapping.add((predicate_object_map_iri, R2RML.graphMap,
                             graph_map_iri))
                mapping.add((graph_map_iri, R2RML.template,
                             Literal(f'{graph_base}graph{{p{i}}}')))

        return predicate_object_map_iri

    def _add_subject_map_graphs(self, mapping: Graph, subject_map_iri: BNode,
                                named_graphs: int, static: bool) -> None:
        """Attach named graphs to a Subject Map.

        Parameters
        ----------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.
        subject_map_iri : BNode
            Blank node of the Subject Map to attach the graphs to.
        named_graphs : int
            Number of named graphs to attach.
        static : bool
            True to attach constant graph IRIs (rr:graph), False to attach
            rr:graphMap templates referencing column p{i} for graph i.
        """
        for i in range(1, named_graphs + 1):
            if static:
                mapping.add((subject_map_iri, R2RML.graph,
                             URIRef(f'http://example.org/graph{i}')))
            else:
                graph_map_iri = BNode()
                mapping.add((subject_map_iri, R2RML.graphMap, graph_map_iri))
                mapping.add((graph_map_iri, R2RML.template,
                             Literal(f'http://example.org/graph{{p{i}}}')))

    def _add_triples_map_source(self, mapping: Graph, subject_value: Literal,
                                source_path: Literal, number: int = 1,
                                named_graphs: int = 0,
                                static: bool = True) -> URIRef:
        """Insert a TriplesMap into a RML mapping with a Logical Source

        Parameters
        ----------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.
        subject_value : Literal
            Subject IRI template value.
        source_path : Literal
            Path to source file.
        number : int
            Triples Map number, default 1.
        named_graphs : int
            Number of named graphs, default 0.
        static : bool
            True for constant graph IRIs, False for graph templates.
            Default True.

        Returns
        -------
        triples_map_iri : URIRef
            IRI of the Triples Map inserted into the mapping.
        """
        triples_map_iri = URIRef(f'{mapping.base}#TriplesMap{number}')
        subject_map_iri = BNode()
        logical_source_iri = BNode()

        mapping.add((logical_source_iri, RML.source, source_path))
        mapping.add((logical_source_iri, RML.referenceFormulation, QL.CSV))
        mapping.add((logical_source_iri, RDF.type, RML.LogicalSource))
        mapping.add((triples_map_iri, RML.logicalSource, logical_source_iri))
        mapping.add((triples_map_iri, R2RML.subjectMap, subject_map_iri))
        mapping.add((triples_map_iri, RDF.type, R2RML.TriplesMap))
        mapping.add((subject_map_iri, R2RML.template, subject_value))
        self._add_subject_map_graphs(mapping, subject_map_iri, named_graphs,
                                     static)

        return triples_map_iri

    def _add_triples_map_table(self, mapping: Graph, subject_value: Literal,
                               table_name: Literal, number: int = 1,
                               named_graphs: int = 0,
                               static: bool = True) -> URIRef:
        """Insert a TriplesMap into a [R2]RML mapping with a Logical Table

        Parameters
        ----------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.
        subject_value : Literal
            Subject IRI template value.
        table_name : Literal
            SQL table name to add.
        number : int
            Triples Map number, default 1.
        named_graphs : int
            Number of named graphs, default 0.
        static : bool
            True for constant graph IRIs, False for graph templates.
            Default True.

        Returns
        -------
        triples_map_iri : URIRef
            IRI of the Triples Map inserted into the mapping.
        """
        triples_map_iri = URIRef(f'{mapping.base}#TriplesMap{number}')
        subject_map_iri = BNode()
        logical_table_iri = BNode()

        mapping.add((logical_table_iri, R2RML.tableName, table_name))
        mapping.add((logical_table_iri, RDF.type, R2RML.LogicalTable))
        mapping.add((triples_map_iri, R2RML.logicalTable, logical_table_iri))
        mapping.add((triples_map_iri, R2RML.subjectMap, subject_map_iri))
        mapping.add((triples_map_iri, RDF.type, R2RML.TriplesMap))
        mapping.add((subject_map_iri, R2RML.template, subject_value))
        self._add_subject_map_graphs(mapping, subject_map_iri, named_graphs,
                                     static)

        return triples_map_iri

    def _generate_mapping(self) -> Graph:
        """Generate a [R2]RML mapping for a NamedGraph instance.

        Triples Map i uses column p{i} in its subject template, and dynamic
        named graph i uses column p{i} in its graph template. The generated
        data only has the columns p1 up to p{number_of_properties}.

        Returns
        -------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.

        Raises
        ------
        NotImplementedError
            If a Triples Map is requested for a data format other than
            'csv' or 'postgresql'.
        """
        mapping: Graph = Graph(base='http://ex.com/')
        mapping.bind('rr', R2RML)
        # Bind RML too, otherwise the serializer invents a prefix for it.
        mapping.bind('rml', RML)
        mapping.bind('ql', QL)
        mapping.bind('ex', EX)

        for i in range(1, self._number_of_tms + 1):
            subject_template = Literal(f'http://ex.com/table/{{p{i}}}')
            if self._data_format == 'postgresql':
                triples_map_iri = self._add_triples_map_table(
                    mapping, subject_template, Literal('data'), number=i,
                    named_graphs=self._number_of_ng_s, static=self._static)
            elif self._data_format == 'csv':
                csv_path = Literal('/data/shared/data.csv')
                triples_map_iri = self._add_triples_map_source(
                    mapping, subject_template, csv_path, number=i,
                    named_graphs=self._number_of_ng_s, static=self._static)
            else:
                msg = f'{self._data_format} not implemented'
                raise NotImplementedError(msg)

            for j in range(1, self._number_of_poms + 1):
                self._add_predicate_object_map(
                    mapping, triples_map_iri, EX[f'p{j}'], Literal(f'p{j}'),
                    named_graphs=self._number_of_ng_pom, static=self._static)

        return mapping

    def _generate_files(self, mapping_file: str) -> bool:
        """Write the data, the mapping and the metadata of the instance.

        Parameters
        ----------
        mapping_file : str
            File name to store the serialized mapping under.

        Returns
        -------
        success : bool
            Always True, failures surface as exceptions.
        """
        # Resolve the instance directory once, path() logs and creates it.
        shared_path = os.path.join(self.path(), 'data', 'shared')
        os.makedirs(shared_path, exist_ok=True)

        data_path = os.path.join(shared_path, DATA_FILE)
        self._generate_dataframe().to_csv(data_path, index=False)

        mapping_path = os.path.join(shared_path, mapping_file)
        mapping: Graph = self._generate_mapping()
        mapping.serialize(destination=mapping_path, format='turtle')
        # NOTE(review): the result of _generate_scenario() is not propagated,
        # confirm what Scenario._generate_metadata returns before doing so.
        self._generate_scenario()

        return True

    def _generate_csv(self) -> bool:
        """Generate the instance as CSV files.

        Returns
        -------
        success : bool
            Always True, failures surface as exceptions.
        """
        return self._generate_files(CSV_MAPPING_FILE)

    def _generate_postgresql(self) -> bool:
        """Generate the instance as PostgreSQL with CSV files to load.

        Returns
        -------
        success : bool
            Always True, failures surface as exceptions.
        """
        return self._generate_files(RDB_MAPPING_FILE)

    def _generate_scenario(self) -> bool:
        """Generate the metadata for this scenario.

        Configures the execution pipeline automatically.

        Returns
        -------
        success : bool
            Result of the metadata generation.

        Raises
        ------
        NotImplementedError
            If the data format is neither 'csv' nor 'postgresql'.
        """
        name: str = f'namedgraph_{self._number_of_ng_s}_' \
                    f'{self._number_of_ng_pom}_{self._number_of_tms}_' \
                    f'{self._number_of_poms}_{self._static}'
        description: str = f'NamedGraph {self._number_of_tms}TM + ' + \
                           f'{self._number_of_poms}POMs'
        iri: str = f'http://example.org/mappings/{self._number_of_tms}/' + \
                   f'{self._number_of_poms}/{self._number_of_ng_s}/' + \
                   f'{self._number_of_ng_pom}/{self._static}'

        if self._data_format == 'postgresql':
            mapping_file = RDB_MAPPING_FILE
        elif self._data_format == 'csv':
            mapping_file = CSV_MAPPING_FILE
        else:
            raise NotImplementedError(f'{self._data_format} not implemented')

        return self._generate_metadata(iri, name, description, mapping_file,
                                       serialization='nquads')
# File name of the generated CSV data set.
DATA_FILE =
'data.csv'
# File name of the RML mapping written for the 'csv' data format.
CSV_MAPPING_FILE =
'mapping.rml.ttl'
# File name of the R2RML mapping written for the 'postgresql' data format.
RDB_MAPPING_FILE =
'mapping.r2rml.ttl'
# Namespace of the R2RML vocabulary (bound to the 'rr' prefix).
R2RML =
Namespace('http://www.w3.org/ns/r2rml#')
# Namespace of the RML vocabulary (logical sources and references).
RML =
Namespace('http://semweb.mmlab.be/ns/rml#')
# Namespace of the query language vocabulary, used for QL.CSV.
QL =
Namespace('http://semweb.mmlab.be/ns/ql#')
# Namespace of the generated predicates (EX['p1'], EX['p2'], ...).
EX =
Namespace('http://example.com/')
class NamedGraph(Scenario):
    """Scenario generating data and [R2]RML mappings with named graphs."""

    def __init__(self, main_directory: str, verbose: bool,
                 number_of_ng_pom: int, number_of_ng_s: int, static: bool,
                 number_of_tms: int, number_of_poms: int,
                 number_of_members: int, number_of_properties: int,
                 value_size: int, data_format: str, engine: str):
        """Initialize a NamedGraph scenario.

        Parameters
        ----------
        main_directory : str
            Root directory for generating instances of NamedGraph.
        verbose : bool
            Verbose logging enabled or not.
        number_of_ng_pom : int
            Number of named graphs per Predicate Object Map.
        number_of_ng_s : int
            Number of named graphs for Subject Map.
        static : bool
            True to attach named graphs as constant IRIs (rr:graph), False
            to attach them as rr:graphMap templates over the columns
            p1, p2, ... of the generated data.
        number_of_tms : int
            Number of Triples Maps to generate in the mapping.
        number_of_poms : int
            Number of Predicate Object Maps to generate per Triples Map.
        number_of_members : int
            Number of members to generate, for example 5000 for 5K rows in a
            tabular data structure.
        number_of_properties : int
            Number of properties per member to generate, for example 20 for
            20 columns in a tabular data structure.
        value_size : int
            Number of ASCII letters appended, after a '_' separator, to each
            generated value. 0 leaves the default values untouched.
        data_format : str
            Data format to use for generating the data set. This scenario
            implements "csv" and "postgresql".
        engine : str
            Engine to use for execution of the generated scenario's instance,
            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
            or "OntopMaterialize"
        """
        # NOTE(review): the scenario parameters are stored before calling the
        # base initializer, presumably because it may already rely on them;
        # keep this order unless Scenario.__init__ is confirmed not to.
        self._number_of_ng_pom: int = number_of_ng_pom
        self._number_of_ng_s: int = number_of_ng_s
        self._static: bool = static
        self._number_of_tms: int = number_of_tms
        self._number_of_poms: int = number_of_poms
        self._number_of_members: int = number_of_members
        self._number_of_properties: int = number_of_properties
        self._value_size: int = value_size

        super().__init__(data_format, engine, main_directory, verbose)
        self._logger = Logger(__name__, self._main_directory, self._verbose)

    def generate(self) -> bool:
        """Generate the instance using the NamedGraph scenario.

        The 'csv' and 'postgresql' data formats are implemented.

        Returns
        -------
        success : bool
            Result of the format specific generator.

        Raises
        ------
        NotImplementedError
            If the data format is neither 'csv' nor 'postgresql'.
        """
        if self._data_format == 'csv':
            return self._generate_csv()
        if self._data_format == 'postgresql':
            return self._generate_postgresql()

        raise NotImplementedError(f'Data format {self._data_format} '
                                  f'is not implemented by {__name__}')

    def path(self) -> str:
        """Builds the file path for the instance of a NamedGraph scenario.

        The directory is created when it does not exist yet.

        Returns
        -------
        path : str
            File path for the NamedGraph's instance.
        """
        key = f'namedgraph_{self._number_of_ng_s}SM-NG_' \
              f'{self._number_of_ng_pom}POM-NG_{self._number_of_tms}TM_' \
              f'{self._number_of_poms}POM_{self._static}'
        path = os.path.join(self._main_directory, self._engine,
                            self._data_format, key)
        self._logger.debug(f'Generating to {path}')
        os.makedirs(path, exist_ok=True)
        return path

    def _generate_dataframe(self, member_offset: int = 1,
                            property_offset: int = 1) -> DataFrame:
        """Generate the data set.

        The result has an 'id' column and one column 'p{j}' per property,
        holding values of the form 'V_{j}-{i}' optionally followed by '_' and
        `value_size` ASCII letters.

        Parameters
        ----------
        member_offset : int
            Offset to start member ID generation from. Default 1 (no offset).
        property_offset : int
            First number used for the '{i}' part of the generated values.
            Default 1 (no offset).

        Returns
        -------
        dataframe : DataFrame
            Panda's DataFrame with the generated data.
        """
        subject_id = range(member_offset,
                           self._number_of_members + member_offset)
        value_id = range(property_offset,
                         self._number_of_members + property_offset)
        data: dict = {'id': subject_id}

        # The suffix is identical for every value, build it once. Modulo
        # keeps the index inside the ASCII letters table.
        append_value = ''
        if self._value_size > 0:
            n_ascii = len(string.ascii_letters)
            append_value = '_' + ''.join(string.ascii_letters[n % n_ascii]
                                         for n in range(self._value_size))

        for j in range(1, self._number_of_properties + 1):
            # Generate value V_{property}-{member} honoring the value size
            data[f'p{j}'] = [f'V_{j}-{i}{append_value}' for i in value_id]

        return DataFrame(data)

    def _add_predicate_object_map(self, mapping: Graph, triplesmap_iri: URIRef,
                                  predicate_value: URIRef,
                                  object_value: Literal, named_graphs: int = 0,
                                  static: bool = True) -> BNode:
        """Insert a PredicateObjectMap into a [R2]RML mapping

        Parameters
        ----------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.
        triplesmap_iri : URIRef
            IRI of the Triples Map to insert the PredicateObjectMap in.
        predicate_value : URIRef
            Predicate IRI value for PredicateObjectMap.
        object_value : Literal
            Object value for PredicateObjectMap, used as rr:column for
            'postgresql' and as rml:reference otherwise.
        named_graphs : int
            Number of named graphs to attach, default 0.
        static : bool
            True to attach constant graph IRIs (rr:graph), False to attach
            rr:graphMap templates referencing column p{i} for graph i.
            Default True.

        Returns
        -------
        predicate_object_map_iri : BNode
            Predicate Object Map blank node ID.
        """
        predicate_object_map_iri = BNode()
        predicate_map_iri = BNode()
        object_map_iri = BNode()

        mapping.add((predicate_map_iri, R2RML.constant, predicate_value))
        mapping.add((predicate_map_iri, RDF.type, R2RML.PredicateMap))
        if self._data_format == 'postgresql':
            mapping.add((object_map_iri, R2RML.column, object_value))
        else:
            mapping.add((object_map_iri, RML.reference, object_value))
        mapping.add((object_map_iri, RDF.type, R2RML.ObjectMap))
        mapping.add((predicate_object_map_iri, R2RML.predicateMap,
                     predicate_map_iri))
        mapping.add((predicate_object_map_iri, R2RML.objectMap,
                     object_map_iri))
        mapping.add((predicate_object_map_iri, RDF.type,
                     R2RML.PredicateObjectMap))
        mapping.add((triplesmap_iri, R2RML.predicateObjectMap,
                     predicate_object_map_iri))

        # When the Subject Map has named graphs as well, the graphs of the
        # Predicate Object Map get their own 'pom/' IRI prefix.
        if self._number_of_ng_s == 0:
            graph_base = 'http://example.org/'
        else:
            graph_base = 'http://example.org/pom/'

        for i in range(1, named_graphs + 1):
            if static:
                mapping.add((predicate_object_map_iri, R2RML.graph,
                             URIRef(f'{graph_base}graph{i}')))
            else:
                graph_map_iri = BNode()
                mapping.add((predicate_object_map_iri, R2RML.graphMap,
                             graph_map_iri))
                mapping.add((graph_map_iri, R2RML.template,
                             Literal(f'{graph_base}graph{{p{i}}}')))

        return predicate_object_map_iri

    def _add_subject_map_graphs(self, mapping: Graph, subject_map_iri: BNode,
                                named_graphs: int, static: bool) -> None:
        """Attach named graphs to a Subject Map.

        Parameters
        ----------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.
        subject_map_iri : BNode
            Blank node of the Subject Map to attach the graphs to.
        named_graphs : int
            Number of named graphs to attach.
        static : bool
            True to attach constant graph IRIs (rr:graph), False to attach
            rr:graphMap templates referencing column p{i} for graph i.
        """
        for i in range(1, named_graphs + 1):
            if static:
                mapping.add((subject_map_iri, R2RML.graph,
                             URIRef(f'http://example.org/graph{i}')))
            else:
                graph_map_iri = BNode()
                mapping.add((subject_map_iri, R2RML.graphMap, graph_map_iri))
                mapping.add((graph_map_iri, R2RML.template,
                             Literal(f'http://example.org/graph{{p{i}}}')))

    def _add_triples_map_source(self, mapping: Graph, subject_value: Literal,
                                source_path: Literal, number: int = 1,
                                named_graphs: int = 0,
                                static: bool = True) -> URIRef:
        """Insert a TriplesMap into a RML mapping with a Logical Source

        Parameters
        ----------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.
        subject_value : Literal
            Subject IRI template value.
        source_path : Literal
            Path to source file.
        number : int
            Triples Map number, default 1.
        named_graphs : int
            Number of named graphs, default 0.
        static : bool
            True for constant graph IRIs, False for graph templates.
            Default True.

        Returns
        -------
        triples_map_iri : URIRef
            IRI of the Triples Map inserted into the mapping.
        """
        triples_map_iri = URIRef(f'{mapping.base}#TriplesMap{number}')
        subject_map_iri = BNode()
        logical_source_iri = BNode()

        mapping.add((logical_source_iri, RML.source, source_path))
        mapping.add((logical_source_iri, RML.referenceFormulation, QL.CSV))
        mapping.add((logical_source_iri, RDF.type, RML.LogicalSource))
        mapping.add((triples_map_iri, RML.logicalSource, logical_source_iri))
        mapping.add((triples_map_iri, R2RML.subjectMap, subject_map_iri))
        mapping.add((triples_map_iri, RDF.type, R2RML.TriplesMap))
        mapping.add((subject_map_iri, R2RML.template, subject_value))
        self._add_subject_map_graphs(mapping, subject_map_iri, named_graphs,
                                     static)

        return triples_map_iri

    def _add_triples_map_table(self, mapping: Graph, subject_value: Literal,
                               table_name: Literal, number: int = 1,
                               named_graphs: int = 0,
                               static: bool = True) -> URIRef:
        """Insert a TriplesMap into a [R2]RML mapping with a Logical Table

        Parameters
        ----------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.
        subject_value : Literal
            Subject IRI template value.
        table_name : Literal
            SQL table name to add.
        number : int
            Triples Map number, default 1.
        named_graphs : int
            Number of named graphs, default 0.
        static : bool
            True for constant graph IRIs, False for graph templates.
            Default True.

        Returns
        -------
        triples_map_iri : URIRef
            IRI of the Triples Map inserted into the mapping.
        """
        triples_map_iri = URIRef(f'{mapping.base}#TriplesMap{number}')
        subject_map_iri = BNode()
        logical_table_iri = BNode()

        mapping.add((logical_table_iri, R2RML.tableName, table_name))
        mapping.add((logical_table_iri, RDF.type, R2RML.LogicalTable))
        mapping.add((triples_map_iri, R2RML.logicalTable, logical_table_iri))
        mapping.add((triples_map_iri, R2RML.subjectMap, subject_map_iri))
        mapping.add((triples_map_iri, RDF.type, R2RML.TriplesMap))
        mapping.add((subject_map_iri, R2RML.template, subject_value))
        self._add_subject_map_graphs(mapping, subject_map_iri, named_graphs,
                                     static)

        return triples_map_iri

    def _generate_mapping(self) -> Graph:
        """Generate a [R2]RML mapping for a NamedGraph instance.

        Triples Map i uses column p{i} in its subject template, and dynamic
        named graph i uses column p{i} in its graph template. The generated
        data only has the columns p1 up to p{number_of_properties}.

        Returns
        -------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.

        Raises
        ------
        NotImplementedError
            If a Triples Map is requested for a data format other than
            'csv' or 'postgresql'.
        """
        mapping: Graph = Graph(base='http://ex.com/')
        mapping.bind('rr', R2RML)
        # Bind RML too, otherwise the serializer invents a prefix for it.
        mapping.bind('rml', RML)
        mapping.bind('ql', QL)
        mapping.bind('ex', EX)

        for i in range(1, self._number_of_tms + 1):
            subject_template = Literal(f'http://ex.com/table/{{p{i}}}')
            if self._data_format == 'postgresql':
                triples_map_iri = self._add_triples_map_table(
                    mapping, subject_template, Literal('data'), number=i,
                    named_graphs=self._number_of_ng_s, static=self._static)
            elif self._data_format == 'csv':
                csv_path = Literal('/data/shared/data.csv')
                triples_map_iri = self._add_triples_map_source(
                    mapping, subject_template, csv_path, number=i,
                    named_graphs=self._number_of_ng_s, static=self._static)
            else:
                msg = f'{self._data_format} not implemented'
                raise NotImplementedError(msg)

            for j in range(1, self._number_of_poms + 1):
                self._add_predicate_object_map(
                    mapping, triples_map_iri, EX[f'p{j}'], Literal(f'p{j}'),
                    named_graphs=self._number_of_ng_pom, static=self._static)

        return mapping

    def _generate_files(self, mapping_file: str) -> bool:
        """Write the data, the mapping and the metadata of the instance.

        Parameters
        ----------
        mapping_file : str
            File name to store the serialized mapping under.

        Returns
        -------
        success : bool
            Always True, failures surface as exceptions.
        """
        # Resolve the instance directory once, path() logs and creates it.
        shared_path = os.path.join(self.path(), 'data', 'shared')
        os.makedirs(shared_path, exist_ok=True)

        data_path = os.path.join(shared_path, DATA_FILE)
        self._generate_dataframe().to_csv(data_path, index=False)

        mapping_path = os.path.join(shared_path, mapping_file)
        mapping: Graph = self._generate_mapping()
        mapping.serialize(destination=mapping_path, format='turtle')
        # NOTE(review): the result of _generate_scenario() is not propagated,
        # confirm what Scenario._generate_metadata returns before doing so.
        self._generate_scenario()

        return True

    def _generate_csv(self) -> bool:
        """Generate the instance as CSV files.

        Returns
        -------
        success : bool
            Always True, failures surface as exceptions.
        """
        return self._generate_files(CSV_MAPPING_FILE)

    def _generate_postgresql(self) -> bool:
        """Generate the instance as PostgreSQL with CSV files to load.

        Returns
        -------
        success : bool
            Always True, failures surface as exceptions.
        """
        return self._generate_files(RDB_MAPPING_FILE)

    def _generate_scenario(self) -> bool:
        """Generate the metadata for this scenario.

        Configures the execution pipeline automatically.

        Returns
        -------
        success : bool
            Result of the metadata generation.

        Raises
        ------
        NotImplementedError
            If the data format is neither 'csv' nor 'postgresql'.
        """
        name: str = f'namedgraph_{self._number_of_ng_s}_' \
                    f'{self._number_of_ng_pom}_{self._number_of_tms}_' \
                    f'{self._number_of_poms}_{self._static}'
        description: str = f'NamedGraph {self._number_of_tms}TM + ' + \
                           f'{self._number_of_poms}POMs'
        iri: str = f'http://example.org/mappings/{self._number_of_tms}/' + \
                   f'{self._number_of_poms}/{self._number_of_ng_s}/' + \
                   f'{self._number_of_ng_pom}/{self._static}'

        if self._data_format == 'postgresql':
            mapping_file = RDB_MAPPING_FILE
        elif self._data_format == 'csv':
            mapping_file = CSV_MAPPING_FILE
        else:
            raise NotImplementedError(f'{self._data_format} not implemented')

        return self._generate_metadata(iri, name, description, mapping_file,
                                       serialization='nquads')
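Scenario that generates a tabular data set and [R2]RML mappings whose Subject Maps and Predicate Object Maps target a configurable number of named graphs. (The class defines no docstring of its own; the text previously shown here, "Helper class that provides a standard way to create an ABC using inheritance.", is the docstring inherited from abc.ABC.)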
NamedGraph( main_directory: str, verbose: bool, number_of_ng_pom: int, number_of_ng_s: int, static: bool, number_of_tms: int, number_of_poms: int, number_of_members: int, number_of_properties: int, value_size: int, data_format: str, engine: str)
27 def __init__(self, main_directory: str, verbose: bool, 28 number_of_ng_pom: int, number_of_ng_s: int, static: bool, 29 number_of_tms: int, number_of_poms: int, 30 number_of_members: int, number_of_properties: int, 31 value_size: int, data_format: str, engine: str): 32 """Initialize a NamedGraph scenario. 33 34 Parameters 35 ---------- 36 main_directory : str 37 Root directory for generating instances of NamedGraph. 38 verbose : bool 39 Verbose logging enabled or not. 40 number_of_ng_pom : int 41 Number of named graphs per Predicate Object Map. 42 number_of_ng_s : int 43 Number of named graphs for Subject Map. 44 number_of_members : int 45 Number of members to generate, for example 5000 for 5K rows in a 46 tabular data structure. 47 number_of_properties : int 48 Number of properties per member to generate, for example 20 for 49 20 columns in a tabular data structure. 50 value_size : int 51 Number of characters to add to default value generation, 52 for example: 256 will expand all values to 256 characters. 53 data_format : str 54 Data format to use for generating the data set, for example: 55 "csv", "json", "xml", "postgresql", "mysql" 56 engine : str 57 Engine to use for execution of the generated scenario's instance, 58 for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC", 59 or "OntopMaterialize" 60 """ 61 self._number_of_ng_pom: int = number_of_ng_pom 62 self._number_of_ng_s: int = number_of_ng_s 63 self._static: bool = static 64 self._number_of_tms: int = number_of_tms 65 self._number_of_poms: int = number_of_poms 66 self._number_of_members: int = number_of_members 67 self._number_of_properties: int = number_of_properties 68 self._value_size: int = value_size 69 70 super().__init__(data_format, engine, main_directory, verbose) 71 self._logger = Logger(__name__, self._main_directory, self._verbose)
Initialize a NamedGraph scenario.
Parameters
- main_directory (str): Root directory for generating instances of NamedGraph.
- verbose (bool): Verbose logging enabled or not.
- number_of_ng_pom (int): Number of named graphs per Predicate Object Map.
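- number_of_ng_s (int): Number of named graphs for Subject Map.
- static (bool): True to attach named graphs as constant IRIs (rr:graph), False to attach them as rr:graphMap templates over the generated columns.
- number_of_tms (int): Number of Triples Maps to generate in the mapping.
- number_of_poms (int): Number of Predicate Object Maps to generate per Triples Map.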
- number_of_members (int): Number of members to generate, for example 5000 for 5K rows in a tabular data structure.
- number_of_properties (int): Number of properties per member to generate, for example 20 for 20 columns in a tabular data structure.
- value_size (int): Number of characters to add to default value generation, for example: 256 appends an underscore followed by 256 ASCII letters to every default value (the resulting value is therefore longer than 256 characters).
- data_format (str): Data format to use for generating the data set, for example: "csv", "json", "xml", "postgresql", "mysql"
- engine (str): Engine to use for execution of the generated scenario's instance, for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC", or "OntopMaterialize"
def
generate(self) -> bool:
73 def generate(self) -> bool: 74 """Generate the instance using the NamedGraph scenario. 75 76 Only CSV files are currently implemented! 77 """ 78 if self._data_format == 'csv': 79 return self._generate_csv() 80 elif self._data_format == 'postgresql': 81 return self._generate_postgresql() 82 else: 83 raise NotImplementedError(f'Data format {self._data_format} ' 84 f'is not implemented by {__name__}')
Generate the instance using the NamedGraph scenario.
The CSV and PostgreSQL data formats are currently implemented; any other data format raises NotImplementedError.
def
path(self) -> str:
86 def path(self) -> str: 87 """Builds the file path for the instance of a NamedGraph scenario. 88 89 Returns 90 ------- 91 path : str 92 File path for the NamedGraph's instance. 93 """ 94 key = f'namedgraph_{self._number_of_ng_s}SM-NG_' \ 95 f'{self._number_of_ng_pom}POM-NG_{self._number_of_tms}TM_' \ 96 f'{self._number_of_poms}POM_{self._static}' 97 path = os.path.join(self._main_directory, self._engine, 98 self._data_format, key) 99 self._logger.debug(f'Generating to {path}') 100 os.makedirs(path, exist_ok=True) 101 return path
Builds the file path for the instance of a NamedGraph scenario.
Returns
- path (str): File path for the NamedGraph's instance.