bench_generator.joins_percentage
This module holds the JoinsPercentage class, which generates two tabular data sets (for example CSV files) in which a configurable percentage of the members, such as rows, match the join condition between both data sets.
#!/usr/bin/env python3

"""
This module holds the JoinsPercentage class which generates two data sets
of which a configurable percentage of the members match the join condition.
"""

import os
import string
import random
from typing import Tuple
from pandas import DataFrame
from rdflib.namespace import RDF
from rdflib import Graph, URIRef, BNode, Literal, Namespace
from bench_generator.scenario import Scenario
from bench_generator.logger import Logger

DATA_FILE1 = 'data1.csv'
DATA_FILE2 = 'data2.csv'
RDB_MAPPING_FILE = 'mapping.r2rml.ttl'
CSV_MAPPING_FILE = 'mapping.rml.ttl'
R2RML = Namespace('http://www.w3.org/ns/r2rml#')
QL = Namespace('http://semweb.mmlab.be/ns/ql#')
EX = Namespace('http://example.com/')


class JoinsPercentage(Scenario):
    """Scenario with two data sets joined one-on-one on their 'id' column.

    A configurable percentage of the members of the second data set gets an
    'id' which also occurs in the first data set, only these members match
    the join condition of the generated mapping.
    """

    def __init__(self, main_directory: str, verbose: bool, percentage: float,
                 number_of_members: int, number_of_properties: int,
                 value_size: int, data_format: str, engine: str,
                 seed: int = 0):
        """Initialize a Joins Percentage scenario.

        Parameters
        ----------
        main_directory : str
            Root directory for generating instances of Joins Percentage.
        verbose : bool
            Verbose logging enabled or not.
        percentage : float
            Percentage of members which should result in a join.
        number_of_members : int
            Number of members to generate, for example 5000 for 5K rows in a
            tabular data structure.
        number_of_properties : int
            Number of properties per member to generate, for example 20 for
            20 columns in a tabular data structure.
        value_size : int
            Number of characters to append to each generated value,
            for example: 256 appends '_' followed by 256 ASCII letters.
        data_format : str
            Data format to use for generating the data set. Only "csv" is
            accepted at the moment, see Raises.
        engine : str
            Engine to use for execution of the generated scenario's instance,
            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
            or "OntopMaterialize"
        seed : int
            Random seed to use, default 0.

        Raises
        ------
        NotImplementedError
            If the data format is not "csv".
        """
        self._percentage = percentage
        self._number_of_members: int = number_of_members
        self._number_of_properties: int = number_of_properties
        self._value_size: int = value_size
        # Seeds the module-level generator of 'random', the members picked
        # for the join are therefore reproducible for a given seed
        random.seed(seed)

        # NOTE(review): self._data_format, self._engine, self._main_directory
        # and self._verbose are presumably set by Scenario.__init__ - confirm
        super().__init__(data_format, engine, main_directory, verbose)

        if self._data_format != 'csv':
            raise NotImplementedError(f'Data format {self._data_format} '
                                      f'is not implemented by {__name__}')

        self._logger = Logger(__name__, self._main_directory, self._verbose)
        self._logger.debug(f'Generating join percentage'
                           f' with {self._percentage}% of members,')

    def generate(self) -> bool:
        """Generate the instance using the Joins Percentage scenario.

        Dispatches on the data format. The constructor only accepts "csv",
        the PostgreSQL branch is kept for when that restriction is lifted.

        Returns
        -------
        success : bool
            Result of the format specific generator.

        Raises
        ------
        NotImplementedError
            If the data format is neither "csv" nor "postgresql".
        """
        if self._data_format == 'csv':
            return self._generate_csv()
        elif self._data_format == 'postgresql':
            return self._generate_postgresql()
        else:
            raise NotImplementedError(f'Data format {self._data_format} '
                                      f'is not implemented by {__name__}')

    def path(self) -> str:
        """Builds the file path for the instance of a Joins Percentage scenario.

        The directory is created when it does not exist yet.

        Returns
        -------
        path : str
            File path for the Joins Percentage's instance.
        """
        key = f'joins_perc_1-1_{self._percentage}'
        path = os.path.join(self._main_directory, self._engine,
                            self._data_format, key)
        self._logger.debug(f'Generating to {path}')
        os.makedirs(path, exist_ok=True)
        return path

    def _generate_dataframe(self, member_offset: int = 1,
                            property_offset: int = 1) -> DataFrame:
        """Generate a data set with an 'id' column and p1..pN value columns.

        Parameters
        ----------
        member_offset : int
            First value of the 'id' column. Default 1 (no offset).
        property_offset : int
            First value of the counter which is embedded in the generated
            values. Default 1 (no offset).

        Returns
        -------
        dataframe : DataFrame
            pandas DataFrame with one row per member.
        """
        subject_id = range(member_offset,
                           self._number_of_members + member_offset)
        value_id = range(property_offset,
                         self._number_of_members + property_offset)
        data: dict = {'id': subject_id}

        # The suffix is the same for every value: build it once. The modulo
        # wraps around when more characters than ASCII letters are requested
        append_value = ''
        if self._value_size > 0:
            n_ascii = len(string.ascii_letters)
            append_value = '_' + ''.join(string.ascii_letters[n % n_ascii]
                                         for n in range(self._value_size))

        # Generate value V_{property}-{counter} honoring the value size
        for j in range(1, self._number_of_properties + 1):
            data[f'p{j}'] = [f'V_{j}-{i}{append_value}' for i in value_id]

        return DataFrame(data)

    def _update_one_on_one(self, dataframe1: DataFrame, dataframe2: DataFrame)\
            -> Tuple[DataFrame, DataFrame]:
        """Let a percentage of the members of both data sets join one-on-one.

        Randomly picks the same number of rows from both data sets and copies
        the 'id' of each picked row of the first data set to a picked row of
        the second data set. The second data set is modified in place.

        Parameters
        ----------
        dataframe1 : DataFrame
            Data set providing the 'id' values.
        dataframe2 : DataFrame
            Data set of which the 'id' values are overwritten.

        Returns
        -------
        dataframes : Tuple[DataFrame, DataFrame]
            Both data sets, in the order they were passed.

        Raises
        ------
        ValueError
            If the percentage results in a negative number of members or in
            more members than the data sets hold.
        """
        # 0% percentage results in zero matches for the join condition,
        # don't even bother to try to match the dataframes
        if self._percentage == 0.0:
            return dataframe1, dataframe2

        number_of_joins = \
            int(self._number_of_members * (self._percentage / 100.0))
        available = min(len(dataframe1), len(dataframe2))
        # random.sample raises a ValueError without context for these cases,
        # fail with the same exception type but with an explanation
        if number_of_joins < 0 or number_of_joins > available:
            raise ValueError(f'Percentage {self._percentage} results in '
                             f'{number_of_joins} members to join, but only '
                             f'{available} members are available')

        rows1 = random.sample(list(dataframe1.index), number_of_joins)
        rows2 = random.sample(list(dataframe2.index), number_of_joins)

        for i, j in zip(rows1, rows2):
            dataframe2.loc[j, 'id'] = dataframe1.loc[i, 'id']

        return dataframe1, dataframe2

    def _add_join_predicate_object_map(self, mapping: Graph,
                                       triplesmap_iri: URIRef,
                                       predicate_value: URIRef,
                                       object_value: Literal,
                                       parent_triplesmap_iri: URIRef,
                                       child_value: Literal,
                                       parent_value: Literal) -> BNode:
        """Insert a join with join condition into a [R2]RML mapping

        Parameters
        ----------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.
        triplesmap_iri : URIRef
            IRI of the Triples Map to insert the PredicateObjectMap in.
        predicate_value : URIRef
            Predicate IRI value for PredicateObjectMap.
        object_value : Literal
            Not used, the object is generated by the parent Triples Map.
            Kept to leave the signature unchanged.
        parent_triplesmap_iri : URIRef
            IRI of the Triples Map to join with.
        child_value : Literal
            Reference in the child (this Triples Map) to join on.
        parent_value : Literal
            Reference in the parent Triples Map to join on.

        Returns
        -------
        join_condition_iri : BNode
            Blank node ID of the inserted join condition.
        """
        predicate_object_map_iri = BNode()
        predicate_map_iri = BNode()
        object_map_iri = BNode()
        join_condition_iri = BNode()

        # R2RML names these classes rr:Join and rr:RefObjectMap,
        # rr:JoinCondition and rr:ReferenceObjectMap do not exist
        mapping.add((join_condition_iri, R2RML.child, child_value))
        mapping.add((join_condition_iri, R2RML.parent, parent_value))
        mapping.add((join_condition_iri, RDF.type, R2RML.Join))
        mapping.add((predicate_map_iri, R2RML.constant, predicate_value))
        mapping.add((predicate_map_iri, RDF.type, R2RML.PredicateMap))
        mapping.add((object_map_iri, RDF.type, R2RML.RefObjectMap))
        mapping.add((object_map_iri, R2RML.parentTriplesMap,
                     parent_triplesmap_iri))
        mapping.add((object_map_iri, R2RML.joinCondition, join_condition_iri))
        mapping.add((predicate_object_map_iri, R2RML.predicateMap,
                     predicate_map_iri))
        mapping.add((predicate_object_map_iri, R2RML.objectMap,
                     object_map_iri))
        mapping.add((predicate_object_map_iri, RDF.type,
                     R2RML.PredicateObjectMap))
        mapping.add((triplesmap_iri, R2RML.predicateObjectMap,
                     predicate_object_map_iri))

        return join_condition_iri

    def _generate_mapping(self) -> Graph:
        """Generate a [R2]RML mapping for a Joins Percentage instance.

        Returns
        -------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.

        Raises
        ------
        NotImplementedError
            If the data format is neither "csv" nor "postgresql".
        """
        mapping: Graph = Graph(base='http://ex.com/')
        mapping.bind('rr', R2RML)
        mapping.bind('ql', QL)
        mapping.bind('ex', EX)
        subject1_template = Literal('http://ex.com/table1/{id}')
        subject2_template = Literal('http://ex.com/table2/{id}')
        if self._data_format == 'postgresql':
            # NOTE(review): both Triples Maps use the same name 'data',
            # confirm this is intended for the two data sets
            triples_map1_iri = self._add_triples_map(mapping,
                                                     subject1_template,
                                                     Literal('data'), number=1)
            triples_map2_iri = self._add_triples_map(mapping,
                                                     subject2_template,
                                                     Literal('data'), number=2)
        elif self._data_format == 'csv':
            triples_map1_iri = \
                self._add_triples_map_source(mapping, subject1_template,
                                             Literal('/data/shared/data1.csv'),
                                             number=1)
            triples_map2_iri = \
                self._add_triples_map_source(mapping, subject2_template,
                                             Literal('/data/shared/data2.csv'),
                                             number=2)
        else:
            raise NotImplementedError(f'{self._data_format} not implemented')

        # Join the first Triples Map with the second one on the 'id' column
        self._add_join_predicate_object_map(mapping, triples_map1_iri,
                                            EX['j1'], Literal('p1'),
                                            triples_map2_iri, Literal('id'),
                                            Literal('id'))

        return mapping

    def _write_data_files(self, shared_directory: str) -> None:
        """Generate both data sets and write them as CSV files.

        Parameters
        ----------
        shared_directory : str
            Existing directory to write the CSV files to.
        """
        dataframe1 = self._generate_dataframe()
        # Offset the second data set to avoid accidental matches, only the
        # members picked by _update_one_on_one will join
        dataframe2 = self._generate_dataframe(self._number_of_members + 1,
                                              self._number_of_properties + 1)
        dataframe1, dataframe2 = self._update_one_on_one(dataframe1,
                                                         dataframe2)
        dataframe1.to_csv(os.path.join(shared_directory, DATA_FILE1),
                          index=False)
        dataframe2.to_csv(os.path.join(shared_directory, DATA_FILE2),
                          index=False)

    def _generate_csv(self) -> bool:
        """Generate the instance as CSV files.

        Returns
        -------
        success : bool
            Always True, failures are reported through exceptions.
        """
        # Resolve the instance path once, each call logs and creates it
        shared_directory = os.path.join(self.path(), 'data', 'shared')
        os.makedirs(shared_directory, exist_ok=True)
        self._write_data_files(shared_directory)

        mapping_path = os.path.join(shared_directory, CSV_MAPPING_FILE)
        mapping: Graph = self._generate_mapping()
        mapping.serialize(destination=mapping_path, format='turtle')
        self._generate_scenario()

        return True

    def _generate_postgresql(self) -> bool:
        """Generate the instance as PostgreSQL with CSV files to load.

        The data sets are generated in the same way as for CSV, the join
        percentage is honored for this data format as well.

        Returns
        -------
        success : bool
            Always True, failures are reported through exceptions.
        """
        shared_directory = os.path.join(self.path(), 'data', 'shared')
        os.makedirs(shared_directory, exist_ok=True)
        self._write_data_files(shared_directory)

        mapping_path = os.path.join(shared_directory, RDB_MAPPING_FILE)
        mapping: Graph = self._generate_mapping()
        mapping.serialize(destination=mapping_path, format='turtle')
        self._generate_scenario()

        return True

    def _generate_scenario(self) -> bool:
        """Generate the metadata for this scenario.

        Configures the execution pipeline automatically.

        Returns
        -------
        success : bool
            Result of the metadata generation of the Scenario base class.

        Raises
        ------
        NotImplementedError
            If the data format is neither "csv" nor "postgresql".
        """
        name: str = f'join_percentage_{self._percentage}'
        description: str = f'Join Percentage {self._percentage}% '
        iri: str = f'http://example.org/join-percentage/{self._percentage}/'

        if self._data_format == 'postgresql':
            return self._generate_metadata(iri, name, description,
                                           RDB_MAPPING_FILE)
        elif self._data_format == 'csv':
            return self._generate_metadata(iri, name, description,
                                           CSV_MAPPING_FILE)
        else:
            raise NotImplementedError(f'{self._data_format} not implemented')
# File names of the two generated data sets
DATA_FILE1 = 'data1.csv'
DATA_FILE2 = 'data2.csv'
# File names of the generated mapping, depending on the data format
RDB_MAPPING_FILE = 'mapping.r2rml.ttl'
CSV_MAPPING_FILE = 'mapping.rml.ttl'
# RDF namespaces used in the generated mapping
R2RML = Namespace('http://www.w3.org/ns/r2rml#')
QL = Namespace('http://semweb.mmlab.be/ns/ql#')
EX = Namespace('http://example.com/')
class JoinsPercentage(Scenario):
    """Scenario with two data sets joined one-on-one on their 'id' column.

    A configurable percentage of the members of the second data set gets an
    'id' which also occurs in the first data set, only these members match
    the join condition of the generated mapping.
    """

    def __init__(self, main_directory: str, verbose: bool, percentage: float,
                 number_of_members: int, number_of_properties: int,
                 value_size: int, data_format: str, engine: str,
                 seed: int = 0):
        """Initialize a Joins Percentage scenario.

        Parameters
        ----------
        main_directory : str
            Root directory for generating instances of Joins Percentage.
        verbose : bool
            Verbose logging enabled or not.
        percentage : float
            Percentage of members which should result in a join.
        number_of_members : int
            Number of members to generate, for example 5000 for 5K rows in a
            tabular data structure.
        number_of_properties : int
            Number of properties per member to generate, for example 20 for
            20 columns in a tabular data structure.
        value_size : int
            Number of characters to append to each generated value,
            for example: 256 appends '_' followed by 256 ASCII letters.
        data_format : str
            Data format to use for generating the data set. Only "csv" is
            accepted at the moment, see Raises.
        engine : str
            Engine to use for execution of the generated scenario's instance,
            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
            or "OntopMaterialize"
        seed : int
            Random seed to use, default 0.

        Raises
        ------
        NotImplementedError
            If the data format is not "csv".
        """
        self._percentage = percentage
        self._number_of_members: int = number_of_members
        self._number_of_properties: int = number_of_properties
        self._value_size: int = value_size
        # Seeds the module-level generator of 'random', the members picked
        # for the join are therefore reproducible for a given seed
        random.seed(seed)

        # NOTE(review): self._data_format, self._engine, self._main_directory
        # and self._verbose are presumably set by Scenario.__init__ - confirm
        super().__init__(data_format, engine, main_directory, verbose)

        if self._data_format != 'csv':
            raise NotImplementedError(f'Data format {self._data_format} '
                                      f'is not implemented by {__name__}')

        self._logger = Logger(__name__, self._main_directory, self._verbose)
        self._logger.debug(f'Generating join percentage'
                           f' with {self._percentage}% of members,')

    def generate(self) -> bool:
        """Generate the instance using the Joins Percentage scenario.

        Dispatches on the data format. The constructor only accepts "csv",
        the PostgreSQL branch is kept for when that restriction is lifted.

        Returns
        -------
        success : bool
            Result of the format specific generator.

        Raises
        ------
        NotImplementedError
            If the data format is neither "csv" nor "postgresql".
        """
        if self._data_format == 'csv':
            return self._generate_csv()
        elif self._data_format == 'postgresql':
            return self._generate_postgresql()
        else:
            raise NotImplementedError(f'Data format {self._data_format} '
                                      f'is not implemented by {__name__}')

    def path(self) -> str:
        """Builds the file path for the instance of a Joins Percentage scenario.

        The directory is created when it does not exist yet.

        Returns
        -------
        path : str
            File path for the Joins Percentage's instance.
        """
        key = f'joins_perc_1-1_{self._percentage}'
        path = os.path.join(self._main_directory, self._engine,
                            self._data_format, key)
        self._logger.debug(f'Generating to {path}')
        os.makedirs(path, exist_ok=True)
        return path

    def _generate_dataframe(self, member_offset: int = 1,
                            property_offset: int = 1) -> DataFrame:
        """Generate a data set with an 'id' column and p1..pN value columns.

        Parameters
        ----------
        member_offset : int
            First value of the 'id' column. Default 1 (no offset).
        property_offset : int
            First value of the counter which is embedded in the generated
            values. Default 1 (no offset).

        Returns
        -------
        dataframe : DataFrame
            pandas DataFrame with one row per member.
        """
        subject_id = range(member_offset,
                           self._number_of_members + member_offset)
        value_id = range(property_offset,
                         self._number_of_members + property_offset)
        data: dict = {'id': subject_id}

        # The suffix is the same for every value: build it once. The modulo
        # wraps around when more characters than ASCII letters are requested
        append_value = ''
        if self._value_size > 0:
            n_ascii = len(string.ascii_letters)
            append_value = '_' + ''.join(string.ascii_letters[n % n_ascii]
                                         for n in range(self._value_size))

        # Generate value V_{property}-{counter} honoring the value size
        for j in range(1, self._number_of_properties + 1):
            data[f'p{j}'] = [f'V_{j}-{i}{append_value}' for i in value_id]

        return DataFrame(data)

    def _update_one_on_one(self, dataframe1: DataFrame, dataframe2: DataFrame)\
            -> Tuple[DataFrame, DataFrame]:
        """Let a percentage of the members of both data sets join one-on-one.

        Randomly picks the same number of rows from both data sets and copies
        the 'id' of each picked row of the first data set to a picked row of
        the second data set. The second data set is modified in place.

        Parameters
        ----------
        dataframe1 : DataFrame
            Data set providing the 'id' values.
        dataframe2 : DataFrame
            Data set of which the 'id' values are overwritten.

        Returns
        -------
        dataframes : Tuple[DataFrame, DataFrame]
            Both data sets, in the order they were passed.

        Raises
        ------
        ValueError
            If the percentage results in a negative number of members or in
            more members than the data sets hold.
        """
        # 0% percentage results in zero matches for the join condition,
        # don't even bother to try to match the dataframes
        if self._percentage == 0.0:
            return dataframe1, dataframe2

        number_of_joins = \
            int(self._number_of_members * (self._percentage / 100.0))
        available = min(len(dataframe1), len(dataframe2))
        # random.sample raises a ValueError without context for these cases,
        # fail with the same exception type but with an explanation
        if number_of_joins < 0 or number_of_joins > available:
            raise ValueError(f'Percentage {self._percentage} results in '
                             f'{number_of_joins} members to join, but only '
                             f'{available} members are available')

        rows1 = random.sample(list(dataframe1.index), number_of_joins)
        rows2 = random.sample(list(dataframe2.index), number_of_joins)

        for i, j in zip(rows1, rows2):
            dataframe2.loc[j, 'id'] = dataframe1.loc[i, 'id']

        return dataframe1, dataframe2

    def _add_join_predicate_object_map(self, mapping: Graph,
                                       triplesmap_iri: URIRef,
                                       predicate_value: URIRef,
                                       object_value: Literal,
                                       parent_triplesmap_iri: URIRef,
                                       child_value: Literal,
                                       parent_value: Literal) -> BNode:
        """Insert a join with join condition into a [R2]RML mapping

        Parameters
        ----------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.
        triplesmap_iri : URIRef
            IRI of the Triples Map to insert the PredicateObjectMap in.
        predicate_value : URIRef
            Predicate IRI value for PredicateObjectMap.
        object_value : Literal
            Not used, the object is generated by the parent Triples Map.
            Kept to leave the signature unchanged.
        parent_triplesmap_iri : URIRef
            IRI of the Triples Map to join with.
        child_value : Literal
            Reference in the child (this Triples Map) to join on.
        parent_value : Literal
            Reference in the parent Triples Map to join on.

        Returns
        -------
        join_condition_iri : BNode
            Blank node ID of the inserted join condition.
        """
        predicate_object_map_iri = BNode()
        predicate_map_iri = BNode()
        object_map_iri = BNode()
        join_condition_iri = BNode()

        # R2RML names these classes rr:Join and rr:RefObjectMap,
        # rr:JoinCondition and rr:ReferenceObjectMap do not exist
        mapping.add((join_condition_iri, R2RML.child, child_value))
        mapping.add((join_condition_iri, R2RML.parent, parent_value))
        mapping.add((join_condition_iri, RDF.type, R2RML.Join))
        mapping.add((predicate_map_iri, R2RML.constant, predicate_value))
        mapping.add((predicate_map_iri, RDF.type, R2RML.PredicateMap))
        mapping.add((object_map_iri, RDF.type, R2RML.RefObjectMap))
        mapping.add((object_map_iri, R2RML.parentTriplesMap,
                     parent_triplesmap_iri))
        mapping.add((object_map_iri, R2RML.joinCondition, join_condition_iri))
        mapping.add((predicate_object_map_iri, R2RML.predicateMap,
                     predicate_map_iri))
        mapping.add((predicate_object_map_iri, R2RML.objectMap,
                     object_map_iri))
        mapping.add((predicate_object_map_iri, RDF.type,
                     R2RML.PredicateObjectMap))
        mapping.add((triplesmap_iri, R2RML.predicateObjectMap,
                     predicate_object_map_iri))

        return join_condition_iri

    def _generate_mapping(self) -> Graph:
        """Generate a [R2]RML mapping for a Joins Percentage instance.

        Returns
        -------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.

        Raises
        ------
        NotImplementedError
            If the data format is neither "csv" nor "postgresql".
        """
        mapping: Graph = Graph(base='http://ex.com/')
        mapping.bind('rr', R2RML)
        mapping.bind('ql', QL)
        mapping.bind('ex', EX)
        subject1_template = Literal('http://ex.com/table1/{id}')
        subject2_template = Literal('http://ex.com/table2/{id}')
        if self._data_format == 'postgresql':
            # NOTE(review): both Triples Maps use the same name 'data',
            # confirm this is intended for the two data sets
            triples_map1_iri = self._add_triples_map(mapping,
                                                     subject1_template,
                                                     Literal('data'), number=1)
            triples_map2_iri = self._add_triples_map(mapping,
                                                     subject2_template,
                                                     Literal('data'), number=2)
        elif self._data_format == 'csv':
            triples_map1_iri = \
                self._add_triples_map_source(mapping, subject1_template,
                                             Literal('/data/shared/data1.csv'),
                                             number=1)
            triples_map2_iri = \
                self._add_triples_map_source(mapping, subject2_template,
                                             Literal('/data/shared/data2.csv'),
                                             number=2)
        else:
            raise NotImplementedError(f'{self._data_format} not implemented')

        # Join the first Triples Map with the second one on the 'id' column
        self._add_join_predicate_object_map(mapping, triples_map1_iri,
                                            EX['j1'], Literal('p1'),
                                            triples_map2_iri, Literal('id'),
                                            Literal('id'))

        return mapping

    def _write_data_files(self, shared_directory: str) -> None:
        """Generate both data sets and write them as CSV files.

        Parameters
        ----------
        shared_directory : str
            Existing directory to write the CSV files to.
        """
        dataframe1 = self._generate_dataframe()
        # Offset the second data set to avoid accidental matches, only the
        # members picked by _update_one_on_one will join
        dataframe2 = self._generate_dataframe(self._number_of_members + 1,
                                              self._number_of_properties + 1)
        dataframe1, dataframe2 = self._update_one_on_one(dataframe1,
                                                         dataframe2)
        dataframe1.to_csv(os.path.join(shared_directory, DATA_FILE1),
                          index=False)
        dataframe2.to_csv(os.path.join(shared_directory, DATA_FILE2),
                          index=False)

    def _generate_csv(self) -> bool:
        """Generate the instance as CSV files.

        Returns
        -------
        success : bool
            Always True, failures are reported through exceptions.
        """
        # Resolve the instance path once, each call logs and creates it
        shared_directory = os.path.join(self.path(), 'data', 'shared')
        os.makedirs(shared_directory, exist_ok=True)
        self._write_data_files(shared_directory)

        mapping_path = os.path.join(shared_directory, CSV_MAPPING_FILE)
        mapping: Graph = self._generate_mapping()
        mapping.serialize(destination=mapping_path, format='turtle')
        self._generate_scenario()

        return True

    def _generate_postgresql(self) -> bool:
        """Generate the instance as PostgreSQL with CSV files to load.

        The data sets are generated in the same way as for CSV, the join
        percentage is honored for this data format as well.

        Returns
        -------
        success : bool
            Always True, failures are reported through exceptions.
        """
        shared_directory = os.path.join(self.path(), 'data', 'shared')
        os.makedirs(shared_directory, exist_ok=True)
        self._write_data_files(shared_directory)

        mapping_path = os.path.join(shared_directory, RDB_MAPPING_FILE)
        mapping: Graph = self._generate_mapping()
        mapping.serialize(destination=mapping_path, format='turtle')
        self._generate_scenario()

        return True

    def _generate_scenario(self) -> bool:
        """Generate the metadata for this scenario.

        Configures the execution pipeline automatically.

        Returns
        -------
        success : bool
            Result of the metadata generation of the Scenario base class.

        Raises
        ------
        NotImplementedError
            If the data format is neither "csv" nor "postgresql".
        """
        name: str = f'join_percentage_{self._percentage}'
        description: str = f'Join Percentage {self._percentage}% '
        iri: str = f'http://example.org/join-percentage/{self._percentage}/'

        if self._data_format == 'postgresql':
            return self._generate_metadata(iri, name, description,
                                           RDB_MAPPING_FILE)
        elif self._data_format == 'csv':
            return self._generate_metadata(iri, name, description,
                                           CSV_MAPPING_FILE)
        else:
            raise NotImplementedError(f'{self._data_format} not implemented')
Scenario that generates two data sets joined one-on-one on their 'id' column, in which a configurable percentage of the members match the join condition.
JoinsPercentage( main_directory: str, verbose: bool, percentage: float, number_of_members: int, number_of_properties: int, value_size: int, data_format: str, engine: str, seed: int = 0)
def __init__(self, main_directory: str, verbose: bool, percentage: float,
             number_of_members: int, number_of_properties: int,
             value_size: int, data_format: str, engine: str,
             seed: int = 0):
    """Initialize a Joins Percentage scenario.

    Parameters
    ----------
    main_directory : str
        Root directory for generating instances of Joins Percentage.
    verbose : bool
        Verbose logging enabled or not.
    percentage : float
        Percentage of members which should result in a join.
    number_of_members : int
        Number of members to generate, for example 5000 for 5K rows in a
        tabular data structure.
    number_of_properties : int
        Number of properties per member to generate, for example 20 for
        20 columns in a tabular data structure.
    value_size : int
        Number of characters to add to default value generation,
        for example: 256 will expand all values to 256 characters.
    data_format : str
        Data format to use for generating the data set. Only "csv" is
        accepted at the moment, see Raises.
    engine : str
        Engine to use for execution of the generated scenario's instance,
        for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
        or "OntopMaterialize"
    seed : int
        Random seed to use, default 0.

    Raises
    ------
    NotImplementedError
        If the data format is not "csv".
    """
    self._percentage = percentage
    self._number_of_members: int = number_of_members
    self._number_of_properties: int = number_of_properties
    self._value_size: int = value_size
    # Seeds the module-level generator of 'random', random choices made
    # afterwards are therefore reproducible for a given seed
    random.seed(seed)

    # NOTE(review): self._data_format, self._main_directory and
    # self._verbose are presumably set by the base class - confirm
    super().__init__(data_format, engine, main_directory, verbose)

    if self._data_format != 'csv':
        raise NotImplementedError(f'Data format {self._data_format} '
                                  f'is not implemented by {__name__}')

    self._logger = Logger(__name__, self._main_directory, self._verbose)
    self._logger.debug(f'Generating join percentage'
                       f' with {self._percentage}% of members,')
Initialize a Joins Percentage scenario.
Parameters
- main_directory (str): Root directory for generating instances of Joins Percentage.
- verbose (bool): Verbose logging enabled or not.
- percentage (float): Percentage of members which should result in a join.
- number_of_members (int): Number of members to generate, for example 5000 for 5K rows in a tabular data structure.
- number_of_properties (int): Number of properties per member to generate, for example 20 for 20 columns in a tabular data structure.
- value_size (int): Number of characters to add to default value generation, for example: 256 will expand all values to 256 characters.
- data_format (str): Data format to use for generating the data set, for example: "csv", "json", "xml", "postgresql", "mysql"
- engine (str): Engine to use for execution of the generated scenario's instance, for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC", or "OntopMaterialize"
- seed (int): Random seed to use, default 0.
def
generate(self) -> bool:
def generate(self) -> bool:
    """Generate the instance using the Joins Percentage scenario.

    Dispatches on the data format of the instance. Only CSV files are
    currently accepted by the constructor, the PostgreSQL branch is kept
    for when that restriction is lifted.

    Returns
    -------
    success : bool
        Result of the format specific generator.

    Raises
    ------
    NotImplementedError
        If the data format is neither "csv" nor "postgresql".
    """
    if self._data_format == 'csv':
        return self._generate_csv()
    elif self._data_format == 'postgresql':
        return self._generate_postgresql()
    else:
        raise NotImplementedError(f'Data format {self._data_format} '
                                  f'is not implemented by {__name__}')
Generate the instance using the Joins Percentage scenario.
Only CSV files are currently implemented!
def
path(self) -> str:
def path(self) -> str:
    """Builds the file path for the instance of a Joins Percentage scenario.

    The path consists of the main directory, the engine, the data format
    and a key holding the percentage. The directory is created when it
    does not exist yet, and the path is logged on debug level.

    Returns
    -------
    path : str
        File path for the Joins Percentage's instance.
    """
    key = f'joins_perc_1-1_{self._percentage}'
    path = os.path.join(self._main_directory, self._engine,
                        self._data_format, key)
    self._logger.debug(f'Generating to {path}')
    os.makedirs(path, exist_ok=True)
    return path
Builds the file path for the instance of a Joins Percentage scenario.
Returns
- path (str): File path for the Joins Percentage's instance.