bench_generator.joins_multiple
This module holds the JoinsMultiple class, which generates two tabular datasets and an [R2]RML mapping that joins them on multiple join conditions.
#!/usr/bin/env python3

"""
This module holds the JoinsMultiple class which generates two tabular data
sets and a [R2]RML mapping joining them on multiple join conditions.
"""

import os
import string
import random
from typing import Tuple
from pandas import DataFrame
from rdflib.namespace import RDF
from rdflib import Graph, URIRef, BNode, Literal, Namespace
from bench_generator.scenario import Scenario
from bench_generator.logger import Logger

DATA_FILE1 = 'data1.csv'
DATA_FILE2 = 'data2.csv'
RDB_MAPPING_FILE = 'mapping.r2rml.ttl'
CSV_MAPPING_FILE = 'mapping.rml.ttl'
R2RML = Namespace('http://www.w3.org/ns/r2rml#')
QL = Namespace('http://semweb.mmlab.be/ns/ql#')
EX = Namespace('http://example.com/')
MEMBERS_PERCENTAGE = 50.0


class JoinsMultiple(Scenario):
    """Scenario generating two tables joined on multiple join conditions.

    Two data sets are generated together with a mapping in which the first
    triples map refers to the second one through an object map carrying
    ``jc`` join conditions (``p1`` ... ``p{jc}`` on both sides).
    """

    def __init__(self, main_directory: str, verbose: bool, percentage: float,
                 n: int, m: int, jc: int, number_of_members: int,
                 number_of_properties: int, value_size: int, data_format: str,
                 engine: str, seed: int = 0):
        """Initialize a Joins Multiple scenario.

        Parameters
        ----------
        main_directory : str
            Root directory for generating instances of Joins Multiple.
        verbose : bool
            Verbose logging enabled or not.
        percentage : float
            Percentage of relations which should result in a join.
        n : int
            Relation size N.
        m : int
            Relation size M.
        jc : int
            Number of Join Conditions.
        number_of_members : int
            Number of members to generate, for example 5000 for 5K rows in a
            tabular data structure.
        number_of_properties : int
            Number of properties per member to generate, for example 20 for
            20 columns in a tabular data structure.
        value_size : int
            Number of characters to add to default value generation,
            for example: 256 will expand all values to 256 characters.
        data_format : str
            Data format to use for generating the data set. Only "csv" is
            accepted by this scenario.
        engine : str
            Engine to use for execution of the generated scenario's instance,
            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
            or "OntopMaterialize"
        seed : int
            Random seed to use, default 0.

        Raises
        ------
        NotImplementedError
            If the data format is anything else than "csv".
        """
        self._percentage = percentage
        self._n = n
        self._m = m
        self._jc = jc
        self._number_of_members: int = number_of_members
        self._number_of_properties: int = number_of_properties
        self._value_size: int = value_size
        # Seeds the module-level generator used by all sampling below
        random.seed(seed)

        super().__init__(data_format, engine, main_directory, verbose)

        if self._data_format != 'csv':
            raise NotImplementedError(f'Data format {self._data_format} '
                                      f'is not implemented by {__name__}')

        self._logger = Logger(__name__, self._main_directory, self._verbose)
        self._logger.debug(f'Generating join relations {self._n}-{self._m}'
                           f' with {self._percentage}% of relations,')

    def generate(self) -> bool:
        """Generate the instance using the Joins Multiple scenario.

        Only CSV files are currently implemented!

        Returns
        -------
        success : bool
            Result of the format specific generator.

        Raises
        ------
        NotImplementedError
            If the data format is neither "csv" nor "postgresql".
        """
        if self._data_format == 'csv':
            return self._generate_csv()
        elif self._data_format == 'postgresql':
            return self._generate_postgresql()
        else:
            raise NotImplementedError(f'Data format {self._data_format} '
                                      f'is not implemented by {__name__}')

    def path(self) -> str:
        """Builds the file path for the instance of a Joins Multiple scenario.

        The directory is created when it does not exist yet.

        Returns
        -------
        path : str
            File path for the Joins Multiple's instance.
        """
        # NOTE(review): 'mutiple' is misspelled, but renaming the key would
        # move the output directory, so the spelling is left untouched.
        key = f'joins_mutiple_{self._n}-{self._m}_{self._jc}jc' + \
              f'_{self._percentage}'
        path = os.path.join(self._main_directory, self._engine,
                            self._data_format, key)
        self._logger.debug(f'Generating to {path}')
        os.makedirs(path, exist_ok=True)
        return path

    def _generate_dataframe(self, member_offset: int = 1,
                            property_offset: int = 1) -> DataFrame:
        """Generate a table of members with their property values.

        Parameters
        ----------
        member_offset : int
            Offset to start member ID generation from. Default 1 (no offset).
        property_offset : int
            Offset to start the numbering of the generated values from.
            Default 1 (no offset).

        Returns
        -------
        dataframe : DataFrame
            Panda's DataFrame with an 'id' column and one 'p{j}' column per
            property.
        """
        subject_id = range(member_offset,
                           self._number_of_members + member_offset)
        value_id = range(property_offset,
                         self._number_of_members + property_offset)
        data: dict = {'id': subject_id}

        # The padding is the same for every property, build it only once.
        # Use modulo to avoid out of range in the ASCII letters table.
        append_value = ''
        if self._value_size > 0:
            n_ascii = len(string.ascii_letters)
            append_value = '_' + ''.join(string.ascii_letters[n % n_ascii]
                                         for n in range(self._value_size))

        for j in range(1, self._number_of_properties + 1):
            # Generate value V_{property}-{member} honoring the value size
            data[f'p{j}'] = [f'V_{j}-{i}{append_value}' for i in value_id]

        return DataFrame(data)

    def _assign_join_values(self, dataframe: DataFrame, keys: list,
                            copies: int) -> None:
        """Write each join key `copies` times into randomly picked rows.

        The value is written in all join columns 'p1' ... 'p{jc}' of a picked
        row. The dataframe is modified in place.
        """
        # Never write more values than there are rows
        values = (keys * copies)[:self._number_of_members]
        rows = random.sample(list(dataframe.index), len(values))
        for jc in range(1, self._jc + 1):
            for value, row in zip(values, rows):
                dataframe.loc[row, f'p{jc}'] = value

    def _update_many_on_many(self,
                             dataframe1: DataFrame,
                             dataframe2: DataFrame) -> Tuple[DataFrame,
                                                             DataFrame]:
        """Make the join columns of both dataframes share values.

        A pool of rows is sampled from `dataframe1` according to the
        percentage. 'p1' values of rows picked from that pool are used as
        join keys: each key is written M times into random rows of
        `dataframe2` and N times into random rows of `dataframe1`.
        Both dataframes are modified in place.

        Parameters
        ----------
        dataframe1 : DataFrame
            First table, expected to have the default integer index.
        dataframe2 : DataFrame
            Second table, expected to have the default integer index.

        Returns
        -------
        dataframes : Tuple[DataFrame, DataFrame]
            The two updated dataframes.

        Raises
        ------
        ValueError
            If relation size N or M is smaller than 1.
        """
        # 0% percentage results in zero matches for the join condition,
        # don't even bother to try to match the dataframes
        if self._percentage == 0.0:
            return dataframe1, dataframe2

        if self._n < 1 or self._m < 1:
            raise ValueError('Relation sizes N and M must be at least 1, '
                             f'got {self._n}-{self._m}')

        percentaged_members = \
            self._number_of_members * (self._percentage / 100.0)

        # Rows of the first table taking part in the join
        pool = random.sample(list(dataframe1.index),
                             int(percentaged_members))

        # Number of distinct join keys per side, rounded half up
        keys_n = int(percentaged_members / self._n + 0.5)
        keys_m = int(percentaged_members / self._m + 0.5)

        # Rounding up may ask for more rows than the pool holds
        k = min(max(keys_n, keys_m), len(pool))
        key_rows = random.sample(pool, k)

        # Read the keys before any row is overwritten. Deduplicate while
        # preserving order: iterating a set of strings depends on hash
        # randomization which would make the output differ between runs
        # even when the same seed is used.
        keys = list(dict.fromkeys(dataframe1.loc[key_rows, 'p1']))

        self._assign_join_values(dataframe2, keys[:keys_m], self._m)
        self._assign_join_values(dataframe1, keys[:keys_n], self._n)

        return dataframe1, dataframe2

    def _add_join_multiple_predicate_object_map(self, mapping: Graph,
                                                triplesmap_iri: URIRef,
                                                predicate_value: URIRef,
                                                object_value: Literal,
                                                parent_triplesmap_iri: URIRef,
                                                jc_values: list) -> Graph:
        """Add a predicate object map joining on multiple conditions.

        Parameters
        ----------
        mapping : Graph
            Mapping to add the triples to.
        triplesmap_iri : URIRef
            Triples map which receives the predicate object map.
        predicate_value : URIRef
            Constant predicate of the generated triples.
        object_value : Literal
            Not used, kept for interface compatibility.
        parent_triplesmap_iri : URIRef
            Triples map to join with.
        jc_values : list
            Join conditions as dicts with a 'child' and a 'parent' entry.

        Returns
        -------
        mapping : Graph
            The mapping which was passed in, with the triples added.
        """
        predicate_object_map_iri = BNode()
        predicate_map_iri = BNode()
        object_map_iri = BNode()

        # One join condition node per child/parent pair, all on the same
        # object map
        for jc in jc_values:
            join_condition_iri = BNode()
            mapping.add((join_condition_iri, R2RML.child, jc['child']))
            mapping.add((join_condition_iri, R2RML.parent, jc['parent']))
            mapping.add((join_condition_iri, RDF.type, R2RML.JoinCondition))
            mapping.add((object_map_iri, R2RML.joinCondition,
                         join_condition_iri))

        mapping.add((predicate_map_iri, R2RML.constant, predicate_value))
        mapping.add((predicate_map_iri, RDF.type, R2RML.PredicateMap))
        mapping.add((object_map_iri, RDF.type, R2RML.ReferenceObjectMap))
        mapping.add((object_map_iri, R2RML.parentTriplesMap,
                     parent_triplesmap_iri))
        mapping.add((predicate_object_map_iri, R2RML.predicateMap,
                     predicate_map_iri))
        mapping.add((predicate_object_map_iri, R2RML.objectMap,
                     object_map_iri))
        mapping.add((predicate_object_map_iri, RDF.type,
                     R2RML.PredicateObjectMap))
        mapping.add((triplesmap_iri, R2RML.predicateObjectMap,
                     predicate_object_map_iri))

        return mapping

    def _generate_mapping(self) -> Graph:
        """Generate a [R2]RML mapping for a Joins Multiple instance.

        Returns
        -------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.

        Raises
        ------
        NotImplementedError
            If the data format is neither "csv" nor "postgresql".
        """
        mapping: Graph = Graph(base='http://ex.com/')
        mapping.bind('rr', R2RML)
        mapping.bind('ql', QL)
        mapping.bind('ex', EX)
        subject1_template = Literal('http://ex.com/table1/{id}')
        subject2_template = Literal('http://ex.com/table2/{id}')
        if self._data_format == 'postgresql':
            triples_map1_iri = self._add_triples_map(mapping,
                                                     subject1_template,
                                                     Literal('data'), number=1)
            triples_map2_iri = self._add_triples_map(mapping,
                                                     subject2_template,
                                                     Literal('data'), number=2)
        elif self._data_format == 'csv':
            triples_map1_iri = \
                self._add_triples_map_source(mapping, subject1_template,
                                             Literal('/data/shared/data1.csv'),
                                             number=1)
            # The second table has its own subject template, otherwise both
            # tables would generate the same subject IRIs
            triples_map2_iri = \
                self._add_triples_map_source(mapping, subject2_template,
                                             Literal('/data/shared/data2.csv'),
                                             number=2)
        else:
            raise NotImplementedError(f'{self._data_format} not implemented')

        # Join on the same column name at both sides: p1 ... p{jc}
        jc_values = []
        for i in range(1, self._jc + 1):
            jc_values.append({
                'child': Literal(f'p{i}'),
                'parent': Literal(f'p{i}')
            })

        self._add_join_multiple_predicate_object_map(mapping, triples_map1_iri,
                                                     EX['j1'], Literal('p1'),
                                                     triples_map2_iri,
                                                     jc_values)

        return mapping

    def _generate_csv(self) -> bool:
        """Generate the instance as CSV files.

        Returns
        -------
        success : bool
            True if successful, false otherwise
        """
        # Resolve the instance directory once, path() logs and creates it
        shared_path = os.path.join(self.path(), 'data', 'shared')
        os.makedirs(shared_path, exist_ok=True)

        dataframe1 = self._generate_dataframe()
        # NOTE(review): the value numbering of the second table starts at
        # number_of_properties + 1, which may overlap with the values of the
        # first table -- confirm this is intended.
        dataframe2 = self._generate_dataframe(self._number_of_members + 1,
                                              self._number_of_properties + 1)
        dataframe1, dataframe2 = self._update_many_on_many(dataframe1,
                                                           dataframe2)
        dataframe1.to_csv(os.path.join(shared_path, DATA_FILE1), index=False)
        dataframe2.to_csv(os.path.join(shared_path, DATA_FILE2), index=False)

        mapping_path = os.path.join(shared_path, CSV_MAPPING_FILE)
        mapping: Graph = self._generate_mapping()
        mapping.serialize(destination=mapping_path, format='turtle')
        self._generate_scenario()

        return True

    def _generate_postgresql(self) -> bool:
        """Generate the instance as PostgreSQL with CSV files to load.

        Returns
        -------
        success : bool
            True if successful, false otherwise
        """
        # Resolve the instance directory once, path() logs and creates it
        shared_path = os.path.join(self.path(), 'data', 'shared')
        os.makedirs(shared_path, exist_ok=True)

        # NOTE(review): unlike the CSV variant, both tables are generated
        # with the same offsets and no join values are applied -- confirm
        # before enabling this data format in __init__.
        data1_path = os.path.join(shared_path, DATA_FILE1)
        self._generate_dataframe().to_csv(data1_path, index=False)
        data2_path = os.path.join(shared_path, DATA_FILE2)
        self._generate_dataframe().to_csv(data2_path, index=False)

        mapping_path = os.path.join(shared_path, RDB_MAPPING_FILE)
        mapping: Graph = self._generate_mapping()
        mapping.serialize(destination=mapping_path, format='turtle')
        self._generate_scenario()

        return True

    def _generate_scenario(self) -> bool:
        """Generate the metadata for this scenario.

        Configures the execution pipeline automatically.

        Returns
        -------
        success : bool
            Result of the metadata generation.

        Raises
        ------
        NotImplementedError
            If the data format is neither "csv" nor "postgresql".
        """
        name: str = f'join_multiple_{self._n}-{self._m}_{self._jc}' + \
                    f'_{self._percentage}'
        description: str = f'Join Multiple {self._n}-{self._m} ' + \
                           f'{self._jc}JC {self._percentage}% '
        iri: str = f'http://example.org/join-percentage/{self._percentage}/'

        if self._data_format == 'postgresql':
            return self._generate_metadata(iri, name, description,
                                           RDB_MAPPING_FILE)
        elif self._data_format == 'csv':
            return self._generate_metadata(iri, name, description,
                                           CSV_MAPPING_FILE)
        else:
            raise NotImplementedError(f'{self._data_format} not implemented')
# File names of the two generated data sets
DATA_FILE1 = 'data1.csv'
DATA_FILE2 = 'data2.csv'
# File names of the generated mapping, per kind of data format
RDB_MAPPING_FILE = 'mapping.r2rml.ttl'
CSV_MAPPING_FILE = 'mapping.rml.ttl'
# RDF namespaces used while building the mapping
R2RML = Namespace('http://www.w3.org/ns/r2rml#')
QL = Namespace('http://semweb.mmlab.be/ns/ql#')
EX = Namespace('http://example.com/')
# NOTE(review): no use of this constant is visible in this module -- confirm
# whether it is still needed.
MEMBERS_PERCENTAGE = 50.0
class JoinsMultiple(Scenario):
    """Scenario generating two tables joined on multiple join conditions.

    Two data sets are generated together with a mapping in which the first
    triples map refers to the second one through an object map carrying
    ``jc`` join conditions (``p1`` ... ``p{jc}`` on both sides).
    """

    def __init__(self, main_directory: str, verbose: bool, percentage: float,
                 n: int, m: int, jc: int, number_of_members: int,
                 number_of_properties: int, value_size: int, data_format: str,
                 engine: str, seed: int = 0):
        """Initialize a Joins Multiple scenario.

        Parameters
        ----------
        main_directory : str
            Root directory for generating instances of Joins Multiple.
        verbose : bool
            Verbose logging enabled or not.
        percentage : float
            Percentage of relations which should result in a join.
        n : int
            Relation size N.
        m : int
            Relation size M.
        jc : int
            Number of Join Conditions.
        number_of_members : int
            Number of members to generate, for example 5000 for 5K rows in a
            tabular data structure.
        number_of_properties : int
            Number of properties per member to generate, for example 20 for
            20 columns in a tabular data structure.
        value_size : int
            Number of characters to add to default value generation,
            for example: 256 will expand all values to 256 characters.
        data_format : str
            Data format to use for generating the data set. Only "csv" is
            accepted by this scenario.
        engine : str
            Engine to use for execution of the generated scenario's instance,
            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
            or "OntopMaterialize"
        seed : int
            Random seed to use, default 0.

        Raises
        ------
        NotImplementedError
            If the data format is anything else than "csv".
        """
        self._percentage = percentage
        self._n = n
        self._m = m
        self._jc = jc
        self._number_of_members: int = number_of_members
        self._number_of_properties: int = number_of_properties
        self._value_size: int = value_size
        # Seeds the module-level generator used by all sampling below
        random.seed(seed)

        super().__init__(data_format, engine, main_directory, verbose)

        if self._data_format != 'csv':
            raise NotImplementedError(f'Data format {self._data_format} '
                                      f'is not implemented by {__name__}')

        self._logger = Logger(__name__, self._main_directory, self._verbose)
        self._logger.debug(f'Generating join relations {self._n}-{self._m}'
                           f' with {self._percentage}% of relations,')

    def generate(self) -> bool:
        """Generate the instance using the Joins Multiple scenario.

        Only CSV files are currently implemented!

        Returns
        -------
        success : bool
            Result of the format specific generator.

        Raises
        ------
        NotImplementedError
            If the data format is neither "csv" nor "postgresql".
        """
        if self._data_format == 'csv':
            return self._generate_csv()
        elif self._data_format == 'postgresql':
            return self._generate_postgresql()
        else:
            raise NotImplementedError(f'Data format {self._data_format} '
                                      f'is not implemented by {__name__}')

    def path(self) -> str:
        """Builds the file path for the instance of a Joins Multiple scenario.

        The directory is created when it does not exist yet.

        Returns
        -------
        path : str
            File path for the Joins Multiple's instance.
        """
        # NOTE(review): 'mutiple' is misspelled, but renaming the key would
        # move the output directory, so the spelling is left untouched.
        key = f'joins_mutiple_{self._n}-{self._m}_{self._jc}jc' + \
              f'_{self._percentage}'
        path = os.path.join(self._main_directory, self._engine,
                            self._data_format, key)
        self._logger.debug(f'Generating to {path}')
        os.makedirs(path, exist_ok=True)
        return path

    def _generate_dataframe(self, member_offset: int = 1,
                            property_offset: int = 1) -> DataFrame:
        """Generate a table of members with their property values.

        Parameters
        ----------
        member_offset : int
            Offset to start member ID generation from. Default 1 (no offset).
        property_offset : int
            Offset to start the numbering of the generated values from.
            Default 1 (no offset).

        Returns
        -------
        dataframe : DataFrame
            Panda's DataFrame with an 'id' column and one 'p{j}' column per
            property.
        """
        subject_id = range(member_offset,
                           self._number_of_members + member_offset)
        value_id = range(property_offset,
                         self._number_of_members + property_offset)
        data: dict = {'id': subject_id}

        # The padding is the same for every property, build it only once.
        # Use modulo to avoid out of range in the ASCII letters table.
        append_value = ''
        if self._value_size > 0:
            n_ascii = len(string.ascii_letters)
            append_value = '_' + ''.join(string.ascii_letters[n % n_ascii]
                                         for n in range(self._value_size))

        for j in range(1, self._number_of_properties + 1):
            # Generate value V_{property}-{member} honoring the value size
            data[f'p{j}'] = [f'V_{j}-{i}{append_value}' for i in value_id]

        return DataFrame(data)

    def _assign_join_values(self, dataframe: DataFrame, keys: list,
                            copies: int) -> None:
        """Write each join key `copies` times into randomly picked rows.

        The value is written in all join columns 'p1' ... 'p{jc}' of a picked
        row. The dataframe is modified in place.
        """
        # Never write more values than there are rows
        values = (keys * copies)[:self._number_of_members]
        rows = random.sample(list(dataframe.index), len(values))
        for jc in range(1, self._jc + 1):
            for value, row in zip(values, rows):
                dataframe.loc[row, f'p{jc}'] = value

    def _update_many_on_many(self,
                             dataframe1: DataFrame,
                             dataframe2: DataFrame) -> Tuple[DataFrame,
                                                             DataFrame]:
        """Make the join columns of both dataframes share values.

        A pool of rows is sampled from `dataframe1` according to the
        percentage. 'p1' values of rows picked from that pool are used as
        join keys: each key is written M times into random rows of
        `dataframe2` and N times into random rows of `dataframe1`.
        Both dataframes are modified in place.

        Parameters
        ----------
        dataframe1 : DataFrame
            First table, expected to have the default integer index.
        dataframe2 : DataFrame
            Second table, expected to have the default integer index.

        Returns
        -------
        dataframes : Tuple[DataFrame, DataFrame]
            The two updated dataframes.

        Raises
        ------
        ValueError
            If relation size N or M is smaller than 1.
        """
        # 0% percentage results in zero matches for the join condition,
        # don't even bother to try to match the dataframes
        if self._percentage == 0.0:
            return dataframe1, dataframe2

        if self._n < 1 or self._m < 1:
            raise ValueError('Relation sizes N and M must be at least 1, '
                             f'got {self._n}-{self._m}')

        percentaged_members = \
            self._number_of_members * (self._percentage / 100.0)

        # Rows of the first table taking part in the join
        pool = random.sample(list(dataframe1.index),
                             int(percentaged_members))

        # Number of distinct join keys per side, rounded half up
        keys_n = int(percentaged_members / self._n + 0.5)
        keys_m = int(percentaged_members / self._m + 0.5)

        # Rounding up may ask for more rows than the pool holds
        k = min(max(keys_n, keys_m), len(pool))
        key_rows = random.sample(pool, k)

        # Read the keys before any row is overwritten. Deduplicate while
        # preserving order: iterating a set of strings depends on hash
        # randomization which would make the output differ between runs
        # even when the same seed is used.
        keys = list(dict.fromkeys(dataframe1.loc[key_rows, 'p1']))

        self._assign_join_values(dataframe2, keys[:keys_m], self._m)
        self._assign_join_values(dataframe1, keys[:keys_n], self._n)

        return dataframe1, dataframe2

    def _add_join_multiple_predicate_object_map(self, mapping: Graph,
                                                triplesmap_iri: URIRef,
                                                predicate_value: URIRef,
                                                object_value: Literal,
                                                parent_triplesmap_iri: URIRef,
                                                jc_values: list) -> Graph:
        """Add a predicate object map joining on multiple conditions.

        Parameters
        ----------
        mapping : Graph
            Mapping to add the triples to.
        triplesmap_iri : URIRef
            Triples map which receives the predicate object map.
        predicate_value : URIRef
            Constant predicate of the generated triples.
        object_value : Literal
            Not used, kept for interface compatibility.
        parent_triplesmap_iri : URIRef
            Triples map to join with.
        jc_values : list
            Join conditions as dicts with a 'child' and a 'parent' entry.

        Returns
        -------
        mapping : Graph
            The mapping which was passed in, with the triples added.
        """
        predicate_object_map_iri = BNode()
        predicate_map_iri = BNode()
        object_map_iri = BNode()

        # One join condition node per child/parent pair, all on the same
        # object map
        for jc in jc_values:
            join_condition_iri = BNode()
            mapping.add((join_condition_iri, R2RML.child, jc['child']))
            mapping.add((join_condition_iri, R2RML.parent, jc['parent']))
            mapping.add((join_condition_iri, RDF.type, R2RML.JoinCondition))
            mapping.add((object_map_iri, R2RML.joinCondition,
                         join_condition_iri))

        mapping.add((predicate_map_iri, R2RML.constant, predicate_value))
        mapping.add((predicate_map_iri, RDF.type, R2RML.PredicateMap))
        mapping.add((object_map_iri, RDF.type, R2RML.ReferenceObjectMap))
        mapping.add((object_map_iri, R2RML.parentTriplesMap,
                     parent_triplesmap_iri))
        mapping.add((predicate_object_map_iri, R2RML.predicateMap,
                     predicate_map_iri))
        mapping.add((predicate_object_map_iri, R2RML.objectMap,
                     object_map_iri))
        mapping.add((predicate_object_map_iri, RDF.type,
                     R2RML.PredicateObjectMap))
        mapping.add((triplesmap_iri, R2RML.predicateObjectMap,
                     predicate_object_map_iri))

        return mapping

    def _generate_mapping(self) -> Graph:
        """Generate a [R2]RML mapping for a Joins Multiple instance.

        Returns
        -------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.

        Raises
        ------
        NotImplementedError
            If the data format is neither "csv" nor "postgresql".
        """
        mapping: Graph = Graph(base='http://ex.com/')
        mapping.bind('rr', R2RML)
        mapping.bind('ql', QL)
        mapping.bind('ex', EX)
        subject1_template = Literal('http://ex.com/table1/{id}')
        subject2_template = Literal('http://ex.com/table2/{id}')
        if self._data_format == 'postgresql':
            triples_map1_iri = self._add_triples_map(mapping,
                                                     subject1_template,
                                                     Literal('data'), number=1)
            triples_map2_iri = self._add_triples_map(mapping,
                                                     subject2_template,
                                                     Literal('data'), number=2)
        elif self._data_format == 'csv':
            triples_map1_iri = \
                self._add_triples_map_source(mapping, subject1_template,
                                             Literal('/data/shared/data1.csv'),
                                             number=1)
            # The second table has its own subject template, otherwise both
            # tables would generate the same subject IRIs
            triples_map2_iri = \
                self._add_triples_map_source(mapping, subject2_template,
                                             Literal('/data/shared/data2.csv'),
                                             number=2)
        else:
            raise NotImplementedError(f'{self._data_format} not implemented')

        # Join on the same column name at both sides: p1 ... p{jc}
        jc_values = []
        for i in range(1, self._jc + 1):
            jc_values.append({
                'child': Literal(f'p{i}'),
                'parent': Literal(f'p{i}')
            })

        self._add_join_multiple_predicate_object_map(mapping, triples_map1_iri,
                                                     EX['j1'], Literal('p1'),
                                                     triples_map2_iri,
                                                     jc_values)

        return mapping

    def _generate_csv(self) -> bool:
        """Generate the instance as CSV files.

        Returns
        -------
        success : bool
            True if successful, false otherwise
        """
        # Resolve the instance directory once, path() logs and creates it
        shared_path = os.path.join(self.path(), 'data', 'shared')
        os.makedirs(shared_path, exist_ok=True)

        dataframe1 = self._generate_dataframe()
        # NOTE(review): the value numbering of the second table starts at
        # number_of_properties + 1, which may overlap with the values of the
        # first table -- confirm this is intended.
        dataframe2 = self._generate_dataframe(self._number_of_members + 1,
                                              self._number_of_properties + 1)
        dataframe1, dataframe2 = self._update_many_on_many(dataframe1,
                                                           dataframe2)
        dataframe1.to_csv(os.path.join(shared_path, DATA_FILE1), index=False)
        dataframe2.to_csv(os.path.join(shared_path, DATA_FILE2), index=False)

        mapping_path = os.path.join(shared_path, CSV_MAPPING_FILE)
        mapping: Graph = self._generate_mapping()
        mapping.serialize(destination=mapping_path, format='turtle')
        self._generate_scenario()

        return True

    def _generate_postgresql(self) -> bool:
        """Generate the instance as PostgreSQL with CSV files to load.

        Returns
        -------
        success : bool
            True if successful, false otherwise
        """
        # Resolve the instance directory once, path() logs and creates it
        shared_path = os.path.join(self.path(), 'data', 'shared')
        os.makedirs(shared_path, exist_ok=True)

        # NOTE(review): unlike the CSV variant, both tables are generated
        # with the same offsets and no join values are applied -- confirm
        # before enabling this data format in __init__.
        data1_path = os.path.join(shared_path, DATA_FILE1)
        self._generate_dataframe().to_csv(data1_path, index=False)
        data2_path = os.path.join(shared_path, DATA_FILE2)
        self._generate_dataframe().to_csv(data2_path, index=False)

        mapping_path = os.path.join(shared_path, RDB_MAPPING_FILE)
        mapping: Graph = self._generate_mapping()
        mapping.serialize(destination=mapping_path, format='turtle')
        self._generate_scenario()

        return True

    def _generate_scenario(self) -> bool:
        """Generate the metadata for this scenario.

        Configures the execution pipeline automatically.

        Returns
        -------
        success : bool
            Result of the metadata generation.

        Raises
        ------
        NotImplementedError
            If the data format is neither "csv" nor "postgresql".
        """
        name: str = f'join_multiple_{self._n}-{self._m}_{self._jc}' + \
                    f'_{self._percentage}'
        description: str = f'Join Multiple {self._n}-{self._m} ' + \
                           f'{self._jc}JC {self._percentage}% '
        iri: str = f'http://example.org/join-percentage/{self._percentage}/'

        if self._data_format == 'postgresql':
            return self._generate_metadata(iri, name, description,
                                           RDB_MAPPING_FILE)
        elif self._data_format == 'csv':
            return self._generate_metadata(iri, name, description,
                                           CSV_MAPPING_FILE)
        else:
            raise NotImplementedError(f'{self._data_format} not implemented')
Helper class that provides a standard way to create an ABC using inheritance.
JoinsMultiple( main_directory: str, verbose: bool, percentage: float, n: int, m: int, jc: int, number_of_members: int, number_of_properties: int, value_size: int, data_format: str, engine: str, seed: int = 0)
def __init__(self, main_directory: str, verbose: bool, percentage: float,
             n: int, m: int, jc: int, number_of_members: int,
             number_of_properties: int, value_size: int, data_format: str,
             engine: str, seed: int = 0):
    """Set up a Joins Multiple scenario.

    Parameters
    ----------
    main_directory : str
        Root directory under which instances are generated.
    verbose : bool
        Whether verbose logging is enabled.
    percentage : float
        Percentage of relations which should result in a join.
    n : int
        Relation size N.
    m : int
        Relation size M.
    jc : int
        Number of join conditions.
    number_of_members : int
        Number of members (rows) to generate.
    number_of_properties : int
        Number of properties (columns) per member to generate.
    value_size : int
        Number of characters to pad every generated value with.
    data_format : str
        Data format of the data set, anything but "csv" is rejected.
    engine : str
        Engine which will execute the generated instance.
    seed : int
        Seed for the random generator, default 0.

    Raises
    ------
    NotImplementedError
        When the data format is not "csv".
    """
    # Join parameters
    self._percentage, self._n, self._m, self._jc = percentage, n, m, jc

    # Size of the generated tables
    self._number_of_members = number_of_members
    self._number_of_properties = number_of_properties
    self._value_size = value_size

    random.seed(seed)

    super().__init__(data_format, engine, main_directory, verbose)

    supported = self._data_format == 'csv'
    if not supported:
        raise NotImplementedError(f'Data format {self._data_format} '
                                  f'is not implemented by {__name__}')

    self._logger = Logger(__name__, self._main_directory, self._verbose)
    message = (f'Generating join relations {self._n}-{self._m}'
               f' with {self._percentage}% of relations,')
    self._logger.debug(message)
Initialize a Joins Multiple scenario.
Member's percentage is always set to 50%.
Parameters
- main_directory (str): Root directory for generating instances of Joins Multiple.
- verbose (bool): Verbose logging enabled or not.
- percentage (float): Percentage of relations which should result in a join.
- n (int): Relation size N.
- m (int): Relation size M.
- jc (int): Number of Join Conditions.
- number_of_members (int): Number of members to generate, for example 5000 for 5K rows in a tabular data structure.
- number_of_properties (int): Number of properties per member to generate, for example 20 for 20 columns in a tabular data structure.
- value_size (int): Number of characters to add to default value generation, for example: 256 will expand all values to 256 characters.
- data_format (str): Data format to use for generating the data set, for example: "csv", "json", "xml", "postgresql", "mysql"
- engine (str): Engine to use for execution of the generated scenario's instance, for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC", or "OntopMaterialize"
- seed (int): Random seed to use, default 0.
def
generate(self) -> bool:
def generate(self) -> bool:
    """Run the generator matching the configured data format.

    Dispatches to the CSV or the PostgreSQL generator and hands back its
    result. Any other data format is refused.

    Raises
    ------
    NotImplementedError
        When the data format is neither "csv" nor "postgresql".
    """
    data_format = self._data_format

    # Guard: refuse anything without a generator
    if data_format not in ('csv', 'postgresql'):
        raise NotImplementedError(f'Data format {self._data_format} '
                                  f'is not implemented by {__name__}')

    if data_format == 'postgresql':
        return self._generate_postgresql()

    return self._generate_csv()
Generate the instance using the Joins Multiple scenario.
Only CSV files are currently implemented!
def
path(self) -> str:
def path(self) -> str:
    """Return the directory of this Joins Multiple instance, creating it.

    The directory is located below the main directory, the engine and the
    data format. Its name is derived from the relation sizes, the number
    of join conditions and the percentage.

    Returns
    -------
    path : str
        Directory of the Joins Multiple's instance.
    """
    relation = f'joins_mutiple_{self._n}-{self._m}_{self._jc}jc'
    instance_name = relation + f'_{self._percentage}'
    parents = (self._main_directory, self._engine, self._data_format)
    instance_directory = os.path.join(*parents, instance_name)

    self._logger.debug(f'Generating to {instance_directory}')
    os.makedirs(instance_directory, exist_ok=True)

    return instance_directory
Builds the file path for the instance of a Joins Multiple scenario.
Returns
- path (str): File path for the Joins Multiple's instance.