bench_generator.joins_duplicate
This module holds the JoinsDuplicate class which generates datasets in which a configurable percentage of members result in joins with a configurable number of duplicate members, for example duplicate rows in tabular data.
#!/usr/bin/env python3

"""
This module holds the JoinsDuplicate class which generates datasets in which
a configurable percentage of members result in joins with a configurable
number of duplicate members, for example duplicate rows in tabular data.
"""

import os
import string
import random
from typing import Tuple
from pandas import DataFrame
from rdflib.namespace import RDF
from rdflib import Graph, URIRef, BNode, Literal, Namespace
from bench_generator.scenario import Scenario
from bench_generator.logger import Logger

DATA_FILE1 = 'data1.csv'
DATA_FILE2 = 'data2.csv'
RDB_MAPPING_FILE = 'mapping.r2rml.ttl'
CSV_MAPPING_FILE = 'mapping.rml.ttl'
R2RML = Namespace('http://www.w3.org/ns/r2rml#')
QL = Namespace('http://semweb.mmlab.be/ns/ql#')
EX = Namespace('http://example.com/')


class JoinsDuplicate(Scenario):
    def __init__(self, main_directory: str, verbose: bool, percentage: float,
                 number_of_duplicates: int, number_of_members: int,
                 number_of_properties: int, value_size: int, data_format: str,
                 engine: str, seed: int = 0):
        """Initialize a Joins Duplicate scenario.

        Parameters
        ----------
        main_directory : str
            Root directory for generating instances of Joins Duplicate.
        verbose : bool
            Verbose logging enabled or not.
        percentage : float
            Percentage of members which should result in a join.
        number_of_duplicates : int
            Number of duplicates to generate.
        number_of_members : int
            Number of members to generate, for example 5000 for 5K rows in a
            tabular data structure.
        number_of_properties : int
            Number of properties per member to generate, for example 20 for
            20 columns in a tabular data structure.
        value_size : int
            Number of characters to add to default value generation,
            for example: 256 will expand all values to 256 characters.
        data_format : str
            Data format to use for generating the data set, for example:
            "csv", "json", "xml", "postgresql", "mysql"
        engine : str
            Engine to use for execution of the generated scenario's instance,
            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
            or "OntopMaterialize"
        seed : int
            Random seed to use, default 0.
        """
        self._percentage = percentage
        self._number_of_duplicates = number_of_duplicates
        self._number_of_members: int = number_of_members
        self._number_of_properties: int = number_of_properties
        self._value_size: int = value_size
        random.seed(seed)

        super().__init__(data_format, engine, main_directory, verbose)

        if self._data_format != 'csv':
            raise NotImplementedError(f'Data format {self._data_format} '
                                      f'is not implemented by {__name__}')

        self._logger = Logger(__name__, self._main_directory, self._verbose)
        self._logger.debug(f'Generating joins'
                           f' with {self._percentage}% of members')

    def generate(self) -> bool:
        """Generate the instance using the Joins Duplicate scenario.

        Only CSV files are currently implemented!
        """
        if self._data_format == 'csv':
            return self._generate_csv()
        elif self._data_format == 'postgresql':
            return self._generate_postgresql()
        else:
            raise NotImplementedError(f'Data format {self._data_format} '
                                      f'is not implemented by {__name__}')

    def path(self) -> str:
        """Builds the file path for the instance of a Joins Duplicate scenario.

        Returns
        -------
        path : str
            File path for the Joins Duplicate's instance.
        """
        key = f'joins_duplicates_{self._number_of_duplicates}' + \
              f'_{self._percentage}'
        path = os.path.join(self._main_directory, self._engine,
                            self._data_format, key)
        self._logger.debug(f'Generating to {path}')
        os.makedirs(path, exist_ok=True)
        return path

    def _generate_dataframe(self, member_offset: int = 1,
                            property_offset: int = 1) -> DataFrame:
        """Generate a DataFrame of members.

        Parameters
        ----------
        member_offset : int
            Offset to start member ID generation from. Default 1 (no offset).
        property_offset : int
            Offset to start property ID generation from. Default 1 (no offset).

        Returns
        -------
        dataframe : DataFrame
            pandas DataFrame with the generated members.
        """
        subject_id = range(member_offset,
                           self._number_of_members + member_offset)
        value_id = range(property_offset,
                         self._number_of_members + property_offset)
        data: dict = {'id': subject_id}
        n_ascii = len(string.ascii_letters)

        for j in range(1, self._number_of_properties + 1):
            # Append ASCII characters if necessary, use modulo to avoid out of
            # range in ASCII table
            append_value = ''
            if self._value_size > 0:
                append_value = '_'
                for n in range(self._value_size):
                    append_value += string.ascii_letters[n % n_ascii]

            # Generate value V_{property}_{member} honoring the value size
            value = [f'V_{j}-{i}{append_value}' for i in value_id]
            data[f'p{j}'] = value

        return DataFrame(data)

    def _update_duplicates(self,
                           dataframe1: DataFrame,
                           dataframe2: DataFrame) -> Tuple[DataFrame,
                                                           DataFrame]:
        # Rewrite a sample of dataframe2 so that the requested percentage of
        # members join: groups of `number_of_duplicates` rows share one member
        # ID and take their p1 value from dataframe1.
        duplicates = self._number_of_members * (self._percentage / 100.0)
        num_P1s = duplicates / self._number_of_duplicates
        n = min(num_P1s * (self._number_of_duplicates + 1),
                self._number_of_members)

        sample1 = dataframe1.iloc[random.sample(list(dataframe1.index),
                                                int(n))]
        sample2 = dataframe2.iloc[random.sample(list(dataframe2.index),
                                                int(n))]
        values = list(set([m[1]['p1'] for m in sample1.iterrows()]))

        if len(values) > self._number_of_members:
            values = values[:self._number_of_members]

        member_id = -1
        member_count = 0
        for i, j in zip(values, list(sample2.index)):
            if member_id == -1:
                member_id = int(dataframe2.loc[j, 'id'])

            dataframe2.loc[j, 'id'] = member_id
            dataframe2.loc[j, 'p1'] = i
            member_count += 1

            if member_count >= self._number_of_duplicates:
                member_id = -1
                member_count = 0

        return dataframe1, dataframe2

    def _add_join_predicate_object_map(self, mapping: Graph,
                                       triplesmap_iri: URIRef,
                                       predicate_value: URIRef,
                                       object_value: Literal,
                                       parent_triplesmap_iri: URIRef,
                                       child_value: Literal,
                                       parent_value: Literal) -> BNode:
        """Insert a join with join condition into a [R2]RML mapping.

        Parameters
        ----------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.
        triplesmap_iri : URIRef
            IRI of the Triples Map to insert the PredicateObjectMap in.
        predicate_value : URIRef
            Predicate IRI value for PredicateObjectMap.
        object_value : Literal
            Object value for PredicateObjectMap.
        parent_triplesmap_iri : URIRef
            IRI of the parent Triples Map referenced by the join.
        child_value : Literal
            Child reference of the join condition.
        parent_value : Literal
            Parent reference of the join condition.

        Returns
        -------
        join_condition_iri : BNode
            Blank node of the inserted join condition.
        """
        predicate_object_map_iri = BNode()
        predicate_map_iri = BNode()
        object_map_iri = BNode()
        join_condition_iri = BNode()

        mapping.add((join_condition_iri, R2RML.child, child_value))
        mapping.add((join_condition_iri, R2RML.parent, parent_value))
        mapping.add((join_condition_iri, RDF.type, R2RML.JoinCondition))
        mapping.add((predicate_map_iri, R2RML.constant, predicate_value))
        mapping.add((predicate_map_iri, RDF.type, R2RML.PredicateMap))
        mapping.add((object_map_iri, RDF.type, R2RML.ReferenceObjectMap))
        mapping.add((object_map_iri, R2RML.parentTriplesMap,
                     parent_triplesmap_iri))
        mapping.add((object_map_iri, R2RML.joinCondition, join_condition_iri))
        mapping.add((predicate_object_map_iri, R2RML.predicateMap,
                     predicate_map_iri))
        mapping.add((predicate_object_map_iri, R2RML.objectMap,
                     object_map_iri))
        mapping.add((predicate_object_map_iri, RDF.type,
                     R2RML.PredicateObjectMap))
        mapping.add((triplesmap_iri, R2RML.predicateObjectMap,
                     predicate_object_map_iri))

        return join_condition_iri

    def _generate_mapping(self) -> Graph:
        """Generate a [R2]RML mapping for a Joins instance.

        Returns
        -------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.
        """
        mapping: Graph = Graph(base='http://ex.com/')
        mapping.bind('rr', R2RML)
        mapping.bind('ql', QL)
        mapping.bind('ex', EX)
        subject1_template = Literal('http://ex.com/table1/{id}')
        subject2_template = Literal('http://ex.com/table2/{id}')
        if self._data_format == 'postgresql':
            triples_map1_iri = self._add_triples_map(mapping,
                                                     subject1_template,
                                                     Literal('data'), number=1)
            triples_map2_iri = self._add_triples_map(mapping,
                                                     subject2_template,
                                                     Literal('data'), number=2)
        elif self._data_format == 'csv':
            triples_map1_iri = \
                self._add_triples_map_source(mapping, subject1_template,
                                             Literal('/data/shared/data1.csv'),
                                             number=1)
            triples_map2_iri = \
                self._add_triples_map_source(mapping, subject2_template,
                                             Literal('/data/shared/data2.csv'),
                                             number=2)
        else:
            raise NotImplementedError(f'{self._data_format} not implemented')

        self._add_join_predicate_object_map(mapping, triples_map1_iri,
                                            EX['j1'], Literal('p1'),
                                            triples_map2_iri, Literal('p1'),
                                            Literal('p1'))

        return mapping

    def _generate_csv(self) -> bool:
        """Generate the instance as CSV files.

        Returns
        -------
        success : bool
            True if successful, false otherwise.
        """
        os.makedirs(os.path.join(self.path(), 'data', 'shared'), exist_ok=True)
        data1_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE1)
        dataframe1 = self._generate_dataframe()
        data2_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE2)
        dataframe2 = self._generate_dataframe(self._number_of_members + 1,
                                              self._number_of_properties + 1)
        dataframe1, dataframe2 = self._update_duplicates(dataframe1,
                                                         dataframe2)
        dataframe1.to_csv(data1_path, index=False)
        dataframe2.to_csv(data2_path, index=False)

        mapping_path = os.path.join(self.path(), 'data', 'shared',
                                    CSV_MAPPING_FILE)
        mapping: Graph = self._generate_mapping()
        mapping.serialize(destination=mapping_path, format='turtle')
        self._generate_scenario()

        return True

    def _generate_postgresql(self) -> bool:
        """Generate the instance as PostgreSQL with CSV files to load.

        Returns
        -------
        success : bool
            True if successful, false otherwise.
        """
        os.makedirs(os.path.join(self.path(), 'data', 'shared'), exist_ok=True)
        data1_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE1)
        self._generate_dataframe().to_csv(data1_path, index=False)
        data2_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE2)
        self._generate_dataframe().to_csv(data2_path, index=False)

        mapping_path = os.path.join(self.path(), 'data', 'shared',
                                    RDB_MAPPING_FILE)
        mapping: Graph = self._generate_mapping()
        mapping.serialize(destination=mapping_path, format='turtle')
        self._generate_scenario()

        return True

    def _generate_scenario(self) -> bool:
        """Generate the metadata for this scenario.

        Configures the execution pipeline automatically.

        Returns
        -------
        success : bool
            True if successful, false otherwise.
        """
        name: str = f'join_duplicates_{self._number_of_duplicates}' + \
                    f'_{self._percentage}'
        description: str = f'Join Duplicate {self._number_of_duplicates}' + \
                           f'({self._percentage}%)'
        iri: str = 'http://example.org/join-duplicates/' + \
                   f'{self._number_of_duplicates}/{self._percentage}/'

        if self._data_format == 'postgresql':
            return self._generate_metadata(iri, name, description,
                                           RDB_MAPPING_FILE)
        elif self._data_format == 'csv':
            return self._generate_metadata(iri, name, description,
                                           CSV_MAPPING_FILE)
        else:
            raise NotImplementedError(f'{self._data_format} not implemented')

        return False
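To make the sizing in _update_duplicates above concrete, here is a small worked example with assumed parameter values (5000 members, a percentage of 50 and 5 duplicates); the variable names mirror the method's locals:

number_of_members = 5000      # assumed for illustration
percentage = 50.0             # assumed for illustration
number_of_duplicates = 5      # assumed for illustration

duplicates = number_of_members * (percentage / 100.0)   # 2500.0 members should join
num_P1s = duplicates / number_of_duplicates              # 500.0 p1 values to reuse
n = min(num_P1s * (number_of_duplicates + 1),
        number_of_members)                                # 3000.0 rows sampled per table

The sampled rows of dataframe2 are then rewritten in groups of number_of_duplicates rows which share a single id and take their p1 value from dataframe1, so every reused value joins a group of duplicate members.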
DATA_FILE1 = 'data1.csv'
DATA_FILE2 = 'data2.csv'
RDB_MAPPING_FILE = 'mapping.r2rml.ttl'
CSV_MAPPING_FILE = 'mapping.rml.ttl'
R2RML = Namespace('http://www.w3.org/ns/r2rml#')
QL = Namespace('http://semweb.mmlab.be/ns/ql#')
EX = Namespace('http://example.com/')
Scenario which generates two datasets in which a percentage of members result in joins with a given number of duplicate members, together with the [R2]RML mapping to execute them.
JoinsDuplicate(main_directory: str, verbose: bool, percentage: float, number_of_duplicates: int, number_of_members: int, number_of_properties: int, value_size: int, data_format: str, engine: str, seed: int = 0)
Initialize a Joins Duplicate scenario.
Parameters
- main_directory (str): Root directory for generating instances of Joins Duplicate.
- verbose (bool): Verbose logging enabled or not.
- percentage (float): Percentage of members which should result in a join.
- number_of_duplicates (int): Number of duplicates to generate.
- number_of_members (int): Number of members to generate, for example 5000 for 5K rows in a tabular data structure.
- number_of_properties (int): Number of properties per member to generate, for example 20 for 20 columns in a tabular data structure.
- value_size (int): Number of characters to add to default value generation, for example: 256 will expand all values to 256 characters.
- data_format (str): Data format to use for generating the data set, for example: "csv", "json", "xml", "postgresql", "mysql"
- engine (str): Engine to use for execution of the generated scenario's instance, for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC", or "OntopMaterialize"
- seed (int): Random seed to use, default 0.
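A minimal usage sketch, assuming hypothetical parameter values and output directory (only the "csv" data format is accepted by the constructor):

from bench_generator.joins_duplicate import JoinsDuplicate

# Hypothetical instance: 10% of 5000 members join, each join has 5 duplicate
# members, 20 properties per member and no extra value padding.
scenario = JoinsDuplicate(main_directory='/tmp/bench',  # hypothetical path
                          verbose=False,
                          percentage=10.0,
                          number_of_duplicates=5,
                          number_of_members=5000,
                          number_of_properties=20,
                          value_size=0,
                          data_format='csv',
                          engine='RMLMapper',
                          seed=42)
scenario.generate()  # writes the CSV files and RML mapping under scenario.path()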
def generate(self) -> bool:
Generate the instance using the Joins Duplicate scenario.
Only CSV files are currently implemented!
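For the CSV data format, generate() writes the two data files and the RML mapping into a data/shared/ directory below path() and then generates the scenario metadata. A sketch of the resulting layout, assuming a hypothetical instance with engine "RMLMapper", 5 duplicates and a percentage of 10.0:

/tmp/bench/RMLMapper/csv/joins_duplicates_5_10.0/
    data/shared/data1.csv          (DATA_FILE1)
    data/shared/data2.csv          (DATA_FILE2)
    data/shared/mapping.rml.ttl    (CSV_MAPPING_FILE)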
def path(self) -> str:
Builds the file path for the instance of a Joins Duplicate scenario.
Returns
- path (str): File path for the Joins Duplicate's instance.
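The directory name combines the engine, the data format, and a key built from the scenario parameters. For example, with engine "RMLMapper", data format "csv", 5 duplicates and a percentage of 10.0 (hypothetical values), path() resolves to <main_directory>/RMLMapper/csv/joins_duplicates_5_10.0 and creates the directory if it does not exist yet.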