bench_generator.joins
This module holds the Joins class, which scales the dataset size by the number of members in a dataset, such as the number of rows for tabular data.
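A minimal usage sketch, assuming the package is installed; the directory, engine, and parameter values below are illustrative rather than prescribed by the module. It constructs a 1-1 Joins scenario in which half of the members join, generates the instance, and prints where the files were written. The full module source follows the sketch.

from bench_generator.joins import Joins

# 1000 members with 10 properties each, plain CSV, targeted at the RMLMapper
# engine, 50% of the members participating in a 1-1 join.
scenario = Joins(main_directory='/tmp/bench', verbose=False, percentage=50.0,
                 number_of_members=1000, number_of_properties=10,
                 value_size=0, data_format='csv', engine='RMLMapper',
                 seed=0, join_n=1, join_m=1)
scenario.generate()     # writes data1.csv, data2.csv and mapping.r2rml.ttl
print(scenario.path())  # /tmp/bench/RMLMapper/csv/joins_1-1_50.0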
#!/usr/bin/env python3

"""
This module holds the Joins class, which scales the dataset size
by the number of members in a dataset, such as the number of rows
for tabular data.
"""

import os
import string
import random
from pandas import DataFrame
from rdflib.namespace import RDF
from rdflib import Graph, URIRef, BNode, Literal, Namespace
from bench_generator.scenario import Scenario
from bench_generator.logger import Logger

DATA_FILE1 = 'data1.csv'
DATA_FILE2 = 'data2.csv'
MAPPING_FILE = 'mapping.r2rml.ttl'
R2RML = Namespace('http://www.w3.org/ns/r2rml#')
QL = Namespace('http://semweb.mmlab.be/ns/ql#')
EX = Namespace('http://example.com/')


class Joins(Scenario):
    def __init__(self, main_directory: str, verbose: bool, percentage: float,
                 number_of_members: int, number_of_properties: int,
                 value_size: int, data_format: str, engine: str,
                 seed: int = 0, join_n: int = 1, join_m: int = 1):
        """Initialize a Joins scenario.

        Parameters
        ----------
        main_directory : str
            Root directory for generating instances of the Joins scenario.
        verbose : bool
            Verbose logging enabled or not.
        percentage : float
            Percentage of members that should result in a join.
        number_of_members : int
            Number of members to generate, for example 5000 for 5K rows in a
            tabular data structure.
        number_of_properties : int
            Number of properties per member to generate, for example 20 for
            20 columns in a tabular data structure.
        value_size : int
            Number of characters to add to default value generation,
            for example: 256 will expand all values to 256 characters.
        data_format : str
            Data format to use for generating the data set, for example:
            "csv", "json", "xml", "postgresql", "mysql"
        engine : str
            Engine to use for execution of the generated scenario's instance,
            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
            or "OntopMaterialize"
        seed : int
            Random seed to use, default 0.
        join_n : int
            Join N-M relationship value N, default 1.
        join_m : int
            Join N-M relationship value M, default 1.
        """
        self._percentage = percentage
        self._number_of_members: int = number_of_members
        self._number_of_properties: int = number_of_properties
        self._value_size: int = value_size
        self._data_format: str = data_format
        self._engine: str = engine
        self._join_n: int = join_n
        self._join_m: int = join_m
        random.seed(seed)

        if self._data_format != 'csv':
            raise NotImplementedError(f'Data format {self._data_format} '
                                      f'is not implemented by {__name__}')

        super().__init__(main_directory, verbose)
        self._logger = Logger(__name__, self._main_directory, self._verbose)
        self._logger.debug(f'Generating join {self._join_n}-{self._join_m}'
                           f' with {self._percentage}%')

    def generate(self) -> bool:
        """Generate the instance using the Joins scenario.

        Only CSV files are currently implemented!
        """
        if self._data_format == 'csv':
            return self._generate_csv()
        elif self._data_format == 'postgresql':
            return self._generate_postgresql()
        else:
            raise NotImplementedError(f'Data format {self._data_format} '
                                      f'is not implemented by {__name__}')

    def path(self) -> str:
        """Builds the file path for the instance of a Joins scenario.

        Returns
        -------
        path : str
            File path for the Joins scenario's instance.
        """
        key = f'joins_{self._join_n}-{self._join_m}_{self._percentage}'
        path = os.path.join(self._main_directory, self._engine,
                            self._data_format, key)
        self._logger.debug(f'Generating to {path}')
        os.makedirs(path, exist_ok=True)
        return path

    def _generate_dataframe(self, member_offset: int = 1,
                            property_offset: int = 1) -> DataFrame:
        """Generate a DataFrame of members and their property values.

        Parameters
        ----------
        member_offset : int
            Offset to start member ID generation from. Default 1 (no offset).
        property_offset : int
            Offset to start property ID generation from. Default 1 (no offset).

        Returns
        -------
        dataframe : DataFrame
            Pandas DataFrame with the generated members.
        """
        subject_id = range(member_offset,
                           self._number_of_members + member_offset)
        value_id = range(property_offset,
                         self._number_of_members + property_offset)
        data: dict = {'id': subject_id}
        n_ascii = len(string.ascii_letters)

        for j in range(1, self._number_of_properties + 1):
            # Append ASCII characters if necessary, use modulo to avoid out of
            # range in ASCII table
            append_value = ''
            if self._value_size > 0:
                append_value = '_'
                for n in range(self._value_size):
                    append_value += string.ascii_letters[n % n_ascii]

            # Generate value V_{property}_{member} honoring the value size
            value = [f'V_{j}-{i}{append_value}' for i in value_id]
            data[f'p{j}'] = value

        return DataFrame(data)

    def _update_one_on_one(self, dataframe1: DataFrame,
                           dataframe2: DataFrame) -> tuple:
        # 0% percentage results in zero matches for the join condition,
        # don't even bother to try to match the dataframes
        if self._percentage == 0.0:
            return dataframe1, dataframe2

        # Sample both dataframes
        percentage_members = int(self._number_of_members *
                                 (self._percentage / 100.0))
        dataframe1_sample = dataframe1 \
            .loc[random.sample(list(dataframe1.index), percentage_members)]
        dataframe1_sample.reset_index(drop=True)
        number_of_members_n = self._number_of_members * percentage_members
        number_of_members_to_join_n = number_of_members_n / self._join_n
        number_of_members_m = self._number_of_members * percentage_members
        number_of_members_to_join_m = number_of_members_m / self._join_m

        members_sample_size = max(int(number_of_members_to_join_n + 0.5),
                                  int(number_of_members_to_join_m + 0.5))
        members_sample = dataframe1.iloc[random.sample(list(dataframe1.index),
                                                       members_sample_size)]

        # Extract unique values of p1 from dataframe 1, only those sampled for
        # percentage to dataframe 2
        members_value = \
            list(set([row[1]['p1'] for row in members_sample.iterrows()]))\
            [:int(number_of_members_to_join_m + 0.5)]
        # Repeat the values M times to honor the relation size
        members_value = members_value * self._join_m

        # Limit number of values because we may have more values than members
        if len(members_value) > self._number_of_members:
            members_value = members_value[:self._number_of_members]

        dataframe2_sample = dataframe2 \
            .loc[random.sample(list(dataframe2.index), percentage_members)]

        # Update dataframe2 to match with dataframe1
        for i, j in zip(members_value, list(dataframe1_sample.index)):
            dataframe2.loc[j, 'id'] = i

        # Extract unique values of p1 from dataframe 2, only those sampled for
        # percentage to dataframe 1
        members_value = \
            list(set([row[1]['p1'] for row in members_sample.iterrows()]))\
            [:int(number_of_members_to_join_n + 0.5)]
        # Repeat the values N times to honor the relation size
        members_value = members_value * self._join_n

        if len(members_value) > self._number_of_members:
            members_value = members_value[:self._number_of_members]

        dataframe1_sample = dataframe1 \
            .loc[random.sample(list(dataframe1.index), percentage_members)]

        # Update dataframe1 to match with dataframe2
        for i, j in zip(members_value, list(dataframe2_sample.index)):
            dataframe1.loc[j, 'id'] = i

        return dataframe1, dataframe2

    def _add_predicate_object_map(self, mapping: Graph, triplesmap_iri: URIRef,
                                  predicate_value: URIRef,
                                  object_value: Literal) -> BNode:
        """Insert a PredicateObjectMap into a [R2]RML mapping.

        Parameters
        ----------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.
        triplesmap_iri : URIRef
            IRI of the Triples Map to insert the PredicateObjectMap in.
        predicate_value : URIRef
            Predicate IRI value for PredicateObjectMap.
        object_value : Literal
            Object value for PredicateObjectMap.

        Returns
        -------
        predicate_object_map_iri : BNode
            Predicate Object Map blank node ID.
        """
        predicate_object_map_iri = BNode()
        predicate_map_iri = BNode()
        object_map_iri = BNode()

        mapping.add((predicate_map_iri, R2RML.constant, predicate_value))
        mapping.add((predicate_map_iri, RDF.type, R2RML.PredicateMap))
        mapping.add((object_map_iri, R2RML.column, object_value))
        mapping.add((object_map_iri, RDF.type, R2RML.ObjectMap))
        mapping.add((predicate_object_map_iri, R2RML.predicateMap,
                     predicate_map_iri))
        mapping.add((predicate_object_map_iri, R2RML.objectMap,
                     object_map_iri))
        mapping.add((predicate_object_map_iri, RDF.type,
                     R2RML.PredicateObjectMap))
        mapping.add((triplesmap_iri, R2RML.predicateObjectMap,
                     predicate_object_map_iri))

        return predicate_object_map_iri

    def _add_join_predicate_object_map(self, mapping: Graph,
                                       triplesmap_iri: URIRef,
                                       predicate_value: URIRef,
                                       object_value: Literal,
                                       parent_triplesmap_iri: URIRef,
                                       child_value: Literal,
                                       parent_value: Literal) -> BNode:
        """Insert a join with join condition into a [R2]RML mapping.

        Parameters
        ----------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.
        triplesmap_iri : URIRef
            IRI of the Triples Map to insert the PredicateObjectMap in.
        predicate_value : URIRef
            Predicate IRI value for PredicateObjectMap.
        object_value : Literal
            Object value for PredicateObjectMap.

        Returns
        -------
        predicate_object_map_with_join_iri : BNode
            Predicate Object Map with join blank node ID.
        """
        predicate_object_map_iri = BNode()
        predicate_map_iri = BNode()
        object_map_iri = BNode()
        join_condition_iri = BNode()

        mapping.add((join_condition_iri, R2RML.child, child_value))
        mapping.add((join_condition_iri, R2RML.parent, parent_value))
        mapping.add((join_condition_iri, RDF.type, R2RML.JoinCondition))
        mapping.add((predicate_map_iri, R2RML.constant, predicate_value))
        mapping.add((predicate_map_iri, RDF.type, R2RML.PredicateMap))
        mapping.add((object_map_iri, RDF.type, R2RML.ReferenceObjectMap))
        mapping.add((object_map_iri, R2RML.parentTriplesMap,
                     parent_triplesmap_iri))
        mapping.add((object_map_iri, R2RML.joinCondition, join_condition_iri))
        mapping.add((predicate_object_map_iri, R2RML.predicateMap,
                     predicate_map_iri))
        mapping.add((predicate_object_map_iri, R2RML.objectMap,
                     object_map_iri))
        mapping.add((predicate_object_map_iri, RDF.type,
                     R2RML.PredicateObjectMap))
        mapping.add((triplesmap_iri, R2RML.predicateObjectMap,
                     predicate_object_map_iri))

        return join_condition_iri

    def _add_triples_map(self, mapping: Graph, subject_value: Literal,
                         table_name: Literal, number: int = 1) -> URIRef:
        """Insert a TriplesMap into a [R2]RML mapping.

        Parameters
        ----------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.
        subject_value : Literal
            Subject IRI template value.
        table_name : Literal
            SQL table name to add.
        number : int
            Triples Map number, default 1.

        Returns
        -------
        triples_map_iri : URIRef
            IRI of the Triples Map inserted into the mapping.
        """
        triples_map_iri = URIRef(f'{mapping.base}#TriplesMap{number}')
        subject_map_iri = BNode()
        logical_table_iri = BNode()

        mapping.add((logical_table_iri, R2RML.tableName, table_name))
        mapping.add((triples_map_iri, R2RML.logicalTable, logical_table_iri))
        mapping.add((triples_map_iri, R2RML.subjectMap, subject_map_iri))
        mapping.add((triples_map_iri, RDF.type, R2RML.TriplesMap))
        mapping.add((subject_map_iri, R2RML.template, subject_value))

        return triples_map_iri

    def _generate_mapping(self) -> Graph:
        """Generate a [R2]RML mapping for a Joins instance.

        Returns
        -------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.
        """
        mapping: Graph = Graph(base='http://ex.com/')
        mapping.bind('rr', R2RML)
        mapping.bind('ql', QL)
        mapping.bind('ex', EX)
        subject1_template = Literal('http://ex.com/table1/{id}')
        subject2_template = Literal('http://ex.com/table2/{id}')
        triples_map1_iri = self._add_triples_map(mapping, subject1_template,
                                                 Literal('data'), number=1)
        triples_map2_iri = self._add_triples_map(mapping, subject2_template,
                                                 Literal('data'), number=2)

        self._add_join_predicate_object_map(mapping, triples_map1_iri,
                                            EX['j1'], Literal('p1'),
                                            triples_map2_iri, Literal('id'),
                                            Literal('id'))

        return mapping

    def _generate_csv(self) -> bool:
        """Generate the instance as CSV files.

        Returns
        -------
        success : bool
            True if successful, False otherwise.
        """
        data1_path = os.path.join(self.path(), DATA_FILE1)
        dataframe1 = self._generate_dataframe()
        data2_path = os.path.join(self.path(), DATA_FILE2)
        dataframe2 = self._generate_dataframe(self._number_of_members + 1,
                                              self._number_of_properties + 1)
        dataframe1, dataframe2 = self._update_one_on_one(dataframe1,
                                                         dataframe2)
        dataframe1.to_csv(data1_path, index=False)
        dataframe2.to_csv(data2_path, index=False)

        mapping_path = os.path.join(self.path(), MAPPING_FILE)
        mapping: Graph = self._generate_mapping()
        mapping.serialize(destination=mapping_path, format='turtle')

        return True

    def _generate_postgresql(self) -> bool:
        """Generate the instance as PostgreSQL with CSV files to load.

        Returns
        -------
        success : bool
            True if successful, False otherwise.
        """
        data1_path = os.path.join(self.path(), DATA_FILE1)
        self._generate_dataframe().to_csv(data1_path, index=False)
        data2_path = os.path.join(self.path(), DATA_FILE2)
        self._generate_dataframe().to_csv(data2_path, index=False)

        mapping_path = os.path.join(self.path(), MAPPING_FILE)
        mapping: Graph = self._generate_mapping()
        mapping.serialize(destination=mapping_path, format='turtle')

        return True
DATA_FILE1 = 'data1.csv'
DATA_FILE2 = 'data2.csv'
MAPPING_FILE = 'mapping.r2rml.ttl'
R2RML = Namespace('http://www.w3.org/ns/r2rml#')
QL = Namespace('http://semweb.mmlab.be/ns/ql#')
EX = Namespace('http://example.com/')
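These constants name the files written into every instance directory and the RDF namespaces used when building the mapping. A small sketch, assuming an instance has already been generated under the illustrative path used above, showing how the join condition in mapping.r2rml.ttl can be inspected with standard rdflib calls:

from rdflib import Graph, Namespace
from rdflib.namespace import RDF

R2RML = Namespace('http://www.w3.org/ns/r2rml#')

graph = Graph()
graph.parse('/tmp/bench/RMLMapper/csv/joins_1-1_50.0/mapping.r2rml.ttl',
            format='turtle')

# Each rr:JoinCondition carries one rr:child and one rr:parent column name.
for join_condition in graph.subjects(RDF.type, R2RML.JoinCondition):
    child = graph.value(join_condition, R2RML.child)
    parent = graph.value(join_condition, R2RML.parent)
    print(f'join on child={child}, parent={parent}')  # both are "id" here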
Joins scenario: generates two tabular datasets whose members join according to the configured percentage and N-M relation, together with an [R2]RML mapping that contains the corresponding join condition.
Joins(main_directory: str, verbose: bool, percentage: float, number_of_members: int, number_of_properties: int, value_size: int, data_format: str, engine: str, seed: int = 0, join_n: int = 1, join_m: int = 1)
Initialize a Joins scenario (a construction example follows the parameter list below).
Parameters
- main_directory (str): Root directory for generating instances of the Joins scenario.
- verbose (bool): Verbose logging enabled or not.
- percentage (float): Percentage of members that should result in a join.
- number_of_members (int): Number of members to generate, for example 5000 for 5K rows in a tabular data structure.
- number_of_properties (int): Number of properties per member to generate, for example 20 for 20 columns in a tabular data structure.
- value_size (int): Number of characters to add to default value generation, for example: 256 will expand all values to 256 characters.
- data_format (str): Data format to use for generating the data set, for example: "csv", "json", "xml", "postgresql", "mysql"
- engine (str): Engine to use for execution of the generated scenario's instance, for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC", or "OntopMaterialize"
- seed (int): Random seed to use, default 0.
- join_n (int): Join N-M relationship value N, default 1.
- join_m (int): Join N-M relationship value M, default 1.
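A hedged sketch of constructing the scenario with explicit parameters; the directory and engine name are illustrative. Only the "csv" data format passes the constructor's check, so any other format raises NotImplementedError right away:

from bench_generator.joins import Joins

# 3-5 join relation, 25% of the members joining, reproducible via the seed.
joins_3_5 = Joins('/tmp/bench', False, 25.0, 1000, 10, 0, 'csv', 'RMLMapper',
                  seed=42, join_n=3, join_m=5)

try:
    Joins('/tmp/bench', False, 25.0, 1000, 10, 0, 'json', 'RMLMapper')
except NotImplementedError as error:
    print(error)  # Data format json is not implemented by bench_generator.joins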
def generate(self) -> bool:
Generate the instance using the Joins scenario.
Only CSV files are currently implemented!
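A short sketch of generating a CSV instance; the parameter values and paths are illustrative. On success generate() returns True and the instance directory holds the two data files and the [R2]RML mapping:

import os
from bench_generator.joins import Joins

scenario = Joins('/tmp/bench', False, 50.0, 1000, 10, 0, 'csv', 'RMLMapper')
assert scenario.generate() is True
print(sorted(os.listdir(scenario.path())))
# ['data1.csv', 'data2.csv', 'mapping.r2rml.ttl']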
def path(self) -> str:
Builds the file path for the instance of a Joins scenario (see the example below).
Returns
- path (str): File path for the Joins scenario's instance.
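The returned path follows the layout main_directory/engine/data_format/joins_{N}-{M}_{percentage} and the directory is created on demand with os.makedirs. A small sketch with the same illustrative values as above:

from bench_generator.joins import Joins

scenario = Joins('/tmp/bench', False, 50.0, 1000, 10, 0, 'csv', 'RMLMapper',
                 join_n=1, join_m=1)
print(scenario.path())  # /tmp/bench/RMLMapper/csv/joins_1-1_50.0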