bench_generator.joins_relation
This module holds the Joins class, which scales the dataset size by the number of members in a dataset, such as the number of rows for tabular data.
#!/usr/bin/env python3

"""
This module holds the JoinsRelation class which generates two relations
whose members (for example rows for tabular data) join N-M on a shared
column, together with the mapping and metadata needed to execute them.
"""

import os
import string
import random
from typing import Tuple
from pandas import DataFrame
from rdflib.namespace import RDF
from rdflib import Graph, URIRef, BNode, Literal, Namespace
from bench_generator.scenario import Scenario
from bench_generator.logger import Logger

# File names of the two generated relations.
DATA_FILE1 = 'data1.csv'
DATA_FILE2 = 'data2.csv'
# Mapping file names per data format.
# NOTE(review): both constants hold the same file name -- confirm intended.
RDB_MAPPING_FILE = 'mapping.r2rml.ttl'
CSV_MAPPING_FILE = 'mapping.r2rml.ttl'
# RDF namespaces bound in the generated mapping.
R2RML = Namespace('http://www.w3.org/ns/r2rml#')
QL = Namespace('http://semweb.mmlab.be/ns/ql#')
EX = Namespace('http://example.com/')
# NOTE(review): not referenced anywhere in this module.
MEMBERS_PERCENTAGE = 50.0


class JoinsRelation(Scenario):
    """Scenario that generates two relations joined N-M on column ``p1``.

    Two data frames are generated, part of their ``p1`` values is rewritten
    so that rows match across both relations, and the result is written as
    CSV files together with a mapping and the scenario metadata.
    """

    def __init__(self, main_directory: str, verbose: bool, percentage: float,
                 n: int, m: int, number_of_members: int,
                 number_of_properties: int, value_size: int, data_format: str,
                 engine: str, seed: int = 0):
        """Initialize a Joins Relations scenario.

        Parameters
        ----------
        main_directory : str
            Root directory for generating instances of Joins Relations.
        verbose : bool
            Verbose logging enabled or not.
        percentage : float
            Percentage of relations which should result in a join.
        n : int
            Relation size N.
        m : int
            Relation size M.
        number_of_members : int
            Number of members to generate, for example 5000 for 5K rows in a
            tabular data structure.
        number_of_properties : int
            Number of properties per member to generate, for example 20 for
            20 columns in a tabular data structure.
        value_size : int
            Number of characters to add to default value generation,
            for example: 256 will expand all values to 256 characters.
        data_format : str
            Data format to use for generating the data set. Only "csv" is
            accepted by this constructor.
        engine : str
            Engine to use for execution of the generated scenario's instance,
            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
            or "OntopMaterialize"
        seed : int
            Random seed to use, default 0.

        Raises
        ------
        NotImplementedError
            If ``data_format`` is anything other than ``'csv'``.
        """
        self._percentage = percentage
        self._n = n
        self._m = m
        self._number_of_members: int = number_of_members
        self._number_of_properties: int = number_of_properties
        self._value_size: int = value_size
        # Seeds the module-level RNG used by _update_many_on_many.
        random.seed(seed)

        super().__init__(data_format, engine, main_directory, verbose)

        if self._data_format != 'csv':
            raise NotImplementedError(f'Data format {self._data_format} '
                                      f'is not implemented by {__name__}')

        self._logger = Logger(__name__, self._main_directory, self._verbose)
        self._logger.debug(f'Generating join relations {self._n}-{self._m}'
                           f' with {self._percentage}% of relations,')

    def generate(self) -> bool:
        """Generate the instance using the Joins Relations scenario.

        Only CSV files are currently implemented!

        Returns
        -------
        success : bool
            Result of the format specific generator.

        Raises
        ------
        NotImplementedError
            If the data format is neither ``'csv'`` nor ``'postgresql'``.
        """
        if self._data_format == 'csv':
            return self._generate_csv()
        if self._data_format == 'postgresql':
            return self._generate_postgresql()
        raise NotImplementedError(f'Data format {self._data_format} '
                                  f'is not implemented by {__name__}')

    def path(self) -> str:
        """Build (and create) the directory of this scenario instance.

        Returns
        -------
        path : str
            File path for the Joins Relations instance.
        """
        key = f'joins_relations_{self._n}-{self._m}_{self._percentage}'
        path = os.path.join(self._main_directory, self._engine,
                            self._data_format, key)
        self._logger.debug(f'Generating to {path}')
        os.makedirs(path, exist_ok=True)
        return path

    def _generate_dataframe(self, member_offset: int = 1,
                            property_offset: int = 1) -> DataFrame:
        """Generate one relation.

        Parameters
        ----------
        member_offset : int
            Offset to start member ID generation from. Default 1 (no offset).
        property_offset : int
            Offset to start value ID generation from. Default 1 (no offset).

        Returns
        -------
        dataframe : DataFrame
            DataFrame with an ``id`` column and columns ``p1`` .. ``pK``
            holding values of the form ``V_{property}-{value id}``.
        """
        subject_ids = range(member_offset,
                            self._number_of_members + member_offset)
        value_ids = range(property_offset,
                          self._number_of_members + property_offset)

        # The padding is identical for every column, so build it once.
        # The modulo keeps the index inside the ASCII letters table.
        suffix = ''
        if self._value_size > 0:
            n_ascii = len(string.ascii_letters)
            suffix = '_' + ''.join(string.ascii_letters[k % n_ascii]
                                   for k in range(self._value_size))

        data: dict = {'id': subject_ids}
        for j in range(1, self._number_of_properties + 1):
            data[f'p{j}'] = [f'V_{j}-{i}{suffix}' for i in value_ids]

        return DataFrame(data)

    def _overwrite_join_column(self, dataframe: DataFrame, join_values: list,
                               multiplicity: int) -> None:
        """Write each join value ``multiplicity`` times into random rows."""
        values = join_values * multiplicity
        if len(values) > self._number_of_members:
            values = values[:self._number_of_members]

        positions = random.sample(list(dataframe.index), len(values))
        labels = dataframe.iloc[positions].index
        for value, label in zip(values, labels):
            dataframe.loc[label, 'p1'] = value

    def _update_many_on_many(self,
                             dataframe1: DataFrame,
                             dataframe2: DataFrame) -> Tuple[DataFrame,
                                                             DataFrame]:
        """Rewrite ``p1`` in both relations so that they join N-M.

        Parameters
        ----------
        dataframe1 : DataFrame
            First relation, modified in place.
        dataframe2 : DataFrame
            Second relation, modified in place.

        Returns
        -------
        dataframes : Tuple[DataFrame, DataFrame]
            The two (modified) relations.
        """
        # 0% percentage results in zero matches for the join condition,
        # don't even bother to try to match the dataframes
        if self._percentage == 0.0:
            return dataframe1, dataframe2

        joined_members = self._number_of_members * (self._percentage / 100.0)
        pool_size = int(joined_members)

        pool = dataframe1.iloc[random.sample(list(dataframe1.index),
                                             pool_size)]
        pool = pool.reset_index(drop=True)
        # Earlier versions drew an unused sample from dataframe2 here. The
        # draw is kept so the RNG stream, and thus the rows selected below
        # for a given seed, stays the same.
        random.sample(list(dataframe2.index), pool_size)

        count_n = int(joined_members / self._n + 0.5)
        count_m = int(joined_members / self._m + 0.5)
        # Rounding up may ask for more rows than the pool holds (the pool
        # size is truncated), which random.sample rejects: clamp it.
        k = min(max(count_n, count_m), len(pool))
        sample_members = pool.iloc[random.sample(list(pool.index), k)]
        # Deduplicate while keeping sample order. A set would order the
        # strings by hash, which differs between interpreter runs and
        # defeats the seed.
        candidates = list(dict.fromkeys(sample_members['p1'].tolist()))

        self._overwrite_join_column(dataframe2, candidates[:count_m],
                                    self._m)
        self._overwrite_join_column(dataframe1, candidates[:count_n],
                                    self._n)

        return dataframe1, dataframe2

    def _generate_mapping(self) -> Graph:
        """Generate a [R2]RML mapping for a Joins instance.

        Returns
        -------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.

        Raises
        ------
        NotImplementedError
            If the data format is neither ``'postgresql'`` nor ``'csv'``.
        """
        mapping: Graph = Graph(base='http://ex.com/')
        mapping.bind('rr', R2RML)
        mapping.bind('ql', QL)
        mapping.bind('ex', EX)
        subject1_template = Literal('http://ex.com/table1/{id}')
        subject2_template = Literal('http://ex.com/table2/{id}')
        if self._data_format == 'postgresql':
            triples_map1_iri = self._add_triples_map(mapping,
                                                     subject1_template,
                                                     Literal('data'), number=1)
            triples_map2_iri = self._add_triples_map(mapping,
                                                     subject2_template,
                                                     Literal('data'), number=2)
        elif self._data_format == 'csv':
            triples_map1_iri = \
                self._add_triples_map_source(mapping, subject1_template,
                                             Literal('/data/shared/data1.csv'),
                                             number=1)
            # The second relation gets its own subject template, as in the
            # postgresql branch; it used to reuse the table1 template.
            triples_map2_iri = \
                self._add_triples_map_source(mapping, subject2_template,
                                             Literal('/data/shared/data2.csv'),
                                             number=2)
        else:
            raise NotImplementedError(f'{self._data_format} not implemented')

        self._add_join_predicate_object_map(mapping, triples_map1_iri,
                                            EX['j1'], Literal('p1'),
                                            triples_map2_iri, Literal('p1'),
                                            Literal('p1'))

        return mapping

    def _generate_files(self, mapping_file: str) -> bool:
        """Write both relations, the mapping and the scenario metadata."""
        shared_path = os.path.join(self.path(), 'data', 'shared')
        os.makedirs(shared_path, exist_ok=True)

        dataframe1 = self._generate_dataframe()
        # NOTE(review): the second argument offsets the value IDs by
        # number_of_properties + 1, so the value ranges of both relations
        # overlap when number_of_members > number_of_properties -- confirm
        # this is intended.
        dataframe2 = self._generate_dataframe(self._number_of_members + 1,
                                              self._number_of_properties + 1)
        dataframe1, dataframe2 = self._update_many_on_many(dataframe1,
                                                           dataframe2)
        dataframe1.to_csv(os.path.join(shared_path, DATA_FILE1), index=False)
        dataframe2.to_csv(os.path.join(shared_path, DATA_FILE2), index=False)

        mapping: Graph = self._generate_mapping()
        mapping.serialize(destination=os.path.join(shared_path, mapping_file),
                          format='turtle')
        self._generate_scenario()

        return True

    def _generate_csv(self) -> bool:
        """Generate the instance as CSV files.

        Returns
        -------
        success : bool
            Always True; failures surface as exceptions.
        """
        return self._generate_files(CSV_MAPPING_FILE)

    def _generate_postgresql(self) -> bool:
        """Generate the instance as PostgreSQL with CSV files to load.

        Returns
        -------
        success : bool
            Always True; failures surface as exceptions.
        """
        return self._generate_files(RDB_MAPPING_FILE)

    def _generate_scenario(self) -> bool:
        """Generate the metadata for this scenario.

        Returns
        -------
        success : bool
            Whatever ``_generate_metadata`` returns.

        Raises
        ------
        NotImplementedError
            If the data format is neither ``'postgresql'`` nor ``'csv'``.
        """
        name: str = f'join_relation_{self._n}_{self._m}_{self._percentage}'
        description: str = \
            f'Join Relation {self._n}-{self._m} {self._percentage}% '
        iri: str = ('http://example.org/join-relation/'
                    f'{self._n}-{self._m}/{self._percentage}/')

        if self._data_format == 'postgresql':
            mapping_file = RDB_MAPPING_FILE
        elif self._data_format == 'csv':
            mapping_file = CSV_MAPPING_FILE
        else:
            raise NotImplementedError(f'{self._data_format} not implemented')

        return self._generate_metadata(iri, name, description, mapping_file)
# File names of the two generated relations.
DATA_FILE1 = 'data1.csv'
DATA_FILE2 = 'data2.csv'
# Mapping file names per data format.
# NOTE(review): both constants hold the same file name -- confirm intended.
RDB_MAPPING_FILE = 'mapping.r2rml.ttl'
CSV_MAPPING_FILE = 'mapping.r2rml.ttl'
# RDF namespaces bound in the generated mapping.
R2RML = Namespace('http://www.w3.org/ns/r2rml#')
QL = Namespace('http://semweb.mmlab.be/ns/ql#')
EX = Namespace('http://example.com/')
# NOTE(review): not referenced by the JoinsRelation class shown on this page.
MEMBERS_PERCENTAGE = 50.0
class JoinsRelation(Scenario):
    """Scenario that generates two relations joined N-M on column ``p1``.

    Two data frames are generated, part of their ``p1`` values is rewritten
    so that rows match across both relations, and the result is written as
    CSV files together with a mapping and the scenario metadata.
    """

    def __init__(self, main_directory: str, verbose: bool, percentage: float,
                 n: int, m: int, number_of_members: int,
                 number_of_properties: int, value_size: int, data_format: str,
                 engine: str, seed: int = 0):
        """Initialize a Joins Relations scenario.

        Parameters
        ----------
        main_directory : str
            Root directory for generating instances of Joins Relations.
        verbose : bool
            Verbose logging enabled or not.
        percentage : float
            Percentage of relations which should result in a join.
        n : int
            Relation size N.
        m : int
            Relation size M.
        number_of_members : int
            Number of members to generate, for example 5000 for 5K rows in a
            tabular data structure.
        number_of_properties : int
            Number of properties per member to generate, for example 20 for
            20 columns in a tabular data structure.
        value_size : int
            Number of characters to add to default value generation,
            for example: 256 will expand all values to 256 characters.
        data_format : str
            Data format to use for generating the data set. Only "csv" is
            accepted by this constructor.
        engine : str
            Engine to use for execution of the generated scenario's instance,
            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
            or "OntopMaterialize"
        seed : int
            Random seed to use, default 0.

        Raises
        ------
        NotImplementedError
            If ``data_format`` is anything other than ``'csv'``.
        """
        self._percentage = percentage
        self._n = n
        self._m = m
        self._number_of_members: int = number_of_members
        self._number_of_properties: int = number_of_properties
        self._value_size: int = value_size
        # Seeds the module-level RNG used by _update_many_on_many.
        random.seed(seed)

        super().__init__(data_format, engine, main_directory, verbose)

        if self._data_format != 'csv':
            raise NotImplementedError(f'Data format {self._data_format} '
                                      f'is not implemented by {__name__}')

        self._logger = Logger(__name__, self._main_directory, self._verbose)
        self._logger.debug(f'Generating join relations {self._n}-{self._m}'
                           f' with {self._percentage}% of relations,')

    def generate(self) -> bool:
        """Generate the instance using the Joins Relations scenario.

        Only CSV files are currently implemented!

        Returns
        -------
        success : bool
            Result of the format specific generator.

        Raises
        ------
        NotImplementedError
            If the data format is neither ``'csv'`` nor ``'postgresql'``.
        """
        if self._data_format == 'csv':
            return self._generate_csv()
        if self._data_format == 'postgresql':
            return self._generate_postgresql()
        raise NotImplementedError(f'Data format {self._data_format} '
                                  f'is not implemented by {__name__}')

    def path(self) -> str:
        """Build (and create) the directory of this scenario instance.

        Returns
        -------
        path : str
            File path for the Joins Relations instance.
        """
        key = f'joins_relations_{self._n}-{self._m}_{self._percentage}'
        path = os.path.join(self._main_directory, self._engine,
                            self._data_format, key)
        self._logger.debug(f'Generating to {path}')
        os.makedirs(path, exist_ok=True)
        return path

    def _generate_dataframe(self, member_offset: int = 1,
                            property_offset: int = 1) -> DataFrame:
        """Generate one relation.

        Parameters
        ----------
        member_offset : int
            Offset to start member ID generation from. Default 1 (no offset).
        property_offset : int
            Offset to start value ID generation from. Default 1 (no offset).

        Returns
        -------
        dataframe : DataFrame
            DataFrame with an ``id`` column and columns ``p1`` .. ``pK``
            holding values of the form ``V_{property}-{value id}``.
        """
        subject_ids = range(member_offset,
                            self._number_of_members + member_offset)
        value_ids = range(property_offset,
                          self._number_of_members + property_offset)

        # The padding is identical for every column, so build it once.
        # The modulo keeps the index inside the ASCII letters table.
        suffix = ''
        if self._value_size > 0:
            n_ascii = len(string.ascii_letters)
            suffix = '_' + ''.join(string.ascii_letters[k % n_ascii]
                                   for k in range(self._value_size))

        data: dict = {'id': subject_ids}
        for j in range(1, self._number_of_properties + 1):
            data[f'p{j}'] = [f'V_{j}-{i}{suffix}' for i in value_ids]

        return DataFrame(data)

    def _overwrite_join_column(self, dataframe: DataFrame, join_values: list,
                               multiplicity: int) -> None:
        """Write each join value ``multiplicity`` times into random rows."""
        values = join_values * multiplicity
        if len(values) > self._number_of_members:
            values = values[:self._number_of_members]

        positions = random.sample(list(dataframe.index), len(values))
        labels = dataframe.iloc[positions].index
        for value, label in zip(values, labels):
            dataframe.loc[label, 'p1'] = value

    def _update_many_on_many(self,
                             dataframe1: DataFrame,
                             dataframe2: DataFrame) -> Tuple[DataFrame,
                                                             DataFrame]:
        """Rewrite ``p1`` in both relations so that they join N-M.

        Parameters
        ----------
        dataframe1 : DataFrame
            First relation, modified in place.
        dataframe2 : DataFrame
            Second relation, modified in place.

        Returns
        -------
        dataframes : Tuple[DataFrame, DataFrame]
            The two (modified) relations.
        """
        # 0% percentage results in zero matches for the join condition,
        # don't even bother to try to match the dataframes
        if self._percentage == 0.0:
            return dataframe1, dataframe2

        joined_members = self._number_of_members * (self._percentage / 100.0)
        pool_size = int(joined_members)

        pool = dataframe1.iloc[random.sample(list(dataframe1.index),
                                             pool_size)]
        pool = pool.reset_index(drop=True)
        # Earlier versions drew an unused sample from dataframe2 here. The
        # draw is kept so the RNG stream, and thus the rows selected below
        # for a given seed, stays the same.
        random.sample(list(dataframe2.index), pool_size)

        count_n = int(joined_members / self._n + 0.5)
        count_m = int(joined_members / self._m + 0.5)
        # Rounding up may ask for more rows than the pool holds (the pool
        # size is truncated), which random.sample rejects: clamp it.
        k = min(max(count_n, count_m), len(pool))
        sample_members = pool.iloc[random.sample(list(pool.index), k)]
        # Deduplicate while keeping sample order. A set would order the
        # strings by hash, which differs between interpreter runs and
        # defeats the seed.
        candidates = list(dict.fromkeys(sample_members['p1'].tolist()))

        self._overwrite_join_column(dataframe2, candidates[:count_m],
                                    self._m)
        self._overwrite_join_column(dataframe1, candidates[:count_n],
                                    self._n)

        return dataframe1, dataframe2

    def _generate_mapping(self) -> Graph:
        """Generate a [R2]RML mapping for a Joins instance.

        Returns
        -------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.

        Raises
        ------
        NotImplementedError
            If the data format is neither ``'postgresql'`` nor ``'csv'``.
        """
        mapping: Graph = Graph(base='http://ex.com/')
        mapping.bind('rr', R2RML)
        mapping.bind('ql', QL)
        mapping.bind('ex', EX)
        subject1_template = Literal('http://ex.com/table1/{id}')
        subject2_template = Literal('http://ex.com/table2/{id}')
        if self._data_format == 'postgresql':
            triples_map1_iri = self._add_triples_map(mapping,
                                                     subject1_template,
                                                     Literal('data'), number=1)
            triples_map2_iri = self._add_triples_map(mapping,
                                                     subject2_template,
                                                     Literal('data'), number=2)
        elif self._data_format == 'csv':
            triples_map1_iri = \
                self._add_triples_map_source(mapping, subject1_template,
                                             Literal('/data/shared/data1.csv'),
                                             number=1)
            # The second relation gets its own subject template, as in the
            # postgresql branch; it used to reuse the table1 template.
            triples_map2_iri = \
                self._add_triples_map_source(mapping, subject2_template,
                                             Literal('/data/shared/data2.csv'),
                                             number=2)
        else:
            raise NotImplementedError(f'{self._data_format} not implemented')

        self._add_join_predicate_object_map(mapping, triples_map1_iri,
                                            EX['j1'], Literal('p1'),
                                            triples_map2_iri, Literal('p1'),
                                            Literal('p1'))

        return mapping

    def _generate_files(self, mapping_file: str) -> bool:
        """Write both relations, the mapping and the scenario metadata."""
        shared_path = os.path.join(self.path(), 'data', 'shared')
        os.makedirs(shared_path, exist_ok=True)

        dataframe1 = self._generate_dataframe()
        # NOTE(review): the second argument offsets the value IDs by
        # number_of_properties + 1, so the value ranges of both relations
        # overlap when number_of_members > number_of_properties -- confirm
        # this is intended.
        dataframe2 = self._generate_dataframe(self._number_of_members + 1,
                                              self._number_of_properties + 1)
        dataframe1, dataframe2 = self._update_many_on_many(dataframe1,
                                                           dataframe2)
        dataframe1.to_csv(os.path.join(shared_path, DATA_FILE1), index=False)
        dataframe2.to_csv(os.path.join(shared_path, DATA_FILE2), index=False)

        mapping: Graph = self._generate_mapping()
        mapping.serialize(destination=os.path.join(shared_path, mapping_file),
                          format='turtle')
        self._generate_scenario()

        return True

    def _generate_csv(self) -> bool:
        """Generate the instance as CSV files.

        Returns
        -------
        success : bool
            Always True; failures surface as exceptions.
        """
        return self._generate_files(CSV_MAPPING_FILE)

    def _generate_postgresql(self) -> bool:
        """Generate the instance as PostgreSQL with CSV files to load.

        Returns
        -------
        success : bool
            Always True; failures surface as exceptions.
        """
        return self._generate_files(RDB_MAPPING_FILE)

    def _generate_scenario(self) -> bool:
        """Generate the metadata for this scenario.

        Returns
        -------
        success : bool
            Whatever ``_generate_metadata`` returns.

        Raises
        ------
        NotImplementedError
            If the data format is neither ``'postgresql'`` nor ``'csv'``.
        """
        name: str = f'join_relation_{self._n}_{self._m}_{self._percentage}'
        description: str = \
            f'Join Relation {self._n}-{self._m} {self._percentage}% '
        iri: str = ('http://example.org/join-relation/'
                    f'{self._n}-{self._m}/{self._percentage}/')

        if self._data_format == 'postgresql':
            mapping_file = RDB_MAPPING_FILE
        elif self._data_format == 'csv':
            mapping_file = CSV_MAPPING_FILE
        else:
            raise NotImplementedError(f'{self._data_format} not implemented')

        return self._generate_metadata(iri, name, description, mapping_file)
Helper class that provides a standard way to create an ABC using inheritance.
JoinsRelation( main_directory: str, verbose: bool, percentage: float, n: int, m: int, number_of_members: int, number_of_properties: int, value_size: int, data_format: str, engine: str, seed: int = 0)
def __init__(self, main_directory: str, verbose: bool, percentage: float,
             n: int, m: int, number_of_members: int,
             number_of_properties: int, value_size: int, data_format: str,
             engine: str, seed: int = 0):
    """Set up a Joins Relations scenario.

    Parameters
    ----------
    main_directory : str
        Root directory under which instances are generated.
    verbose : bool
        Whether verbose logging is enabled.
    percentage : float
        Percentage of relations which should result in a join.
    n : int
        Relation size N.
    m : int
        Relation size M.
    number_of_members : int
        Number of members to generate, e.g. 5000 for 5K rows of tabular
        data.
    number_of_properties : int
        Number of properties per member, e.g. 20 for 20 columns of
        tabular data.
    value_size : int
        Number of characters appended to every generated value.
    data_format : str
        Data format of the data set; anything but "csv" is rejected.
    engine : str
        Engine that executes the generated instance, for example
        "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC", or
        "OntopMaterialize".
    seed : int
        Random seed to use, default 0.

    Raises
    ------
    NotImplementedError
        If ``data_format`` is not ``'csv'``.
    """
    # The module-level RNG drives the row sampling of this scenario.
    random.seed(seed)

    # Scenario parameters; stored before the base class is initialized.
    self._value_size = value_size
    self._number_of_properties = number_of_properties
    self._number_of_members = number_of_members
    self._m = m
    self._n = n
    self._percentage = percentage

    super().__init__(data_format, engine, main_directory, verbose)

    if self._data_format != 'csv':
        unsupported = (f'Data format {self._data_format} is not implemented '
                       f'by {__name__}')
        raise NotImplementedError(unsupported)

    self._logger = Logger(__name__, self._main_directory, self._verbose)
    announcement = (f'Generating join relations {self._n}-{self._m} with '
                    f'{self._percentage}% of relations,')
    self._logger.debug(announcement)
Initialize a Joins Relations scenario.
Member's percentage is always set to 50%.
Parameters
- main_directory (str): Root directory for generating instances of Joins Relations.
- verbose (bool): Verbose logging enabled or not.
- percentage (float): Percentage of relations which should result in a join.
- n (int): Relation size N.
- m (int): Relation size M.
- number_of_members (int): Number of members to generate, for example 5000 for 5K rows in a tabular data structure.
- number_of_properties (int): Number of properties per member to generate, for example 20 for 20 columns in a tabular data structure.
- value_size (int): Number of characters to add to default value generation, for example: 256 will expand all values to 256 characters.
- data_format (str): Data format to use for generating the data set, for example: "csv", "json", "xml", "postgresql", "mysql"
- engine (str): Engine to use for execution of the generated scenario's instance, for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC", or "OntopMaterialize"
- seed (int): Random seed to use, default 0.
def
generate(self) -> bool:
def generate(self) -> bool:
    """Run the generator that matches the configured data format.

    Only CSV files are currently implemented!

    Returns
    -------
    success : bool
        Result of the format specific generator.

    Raises
    ------
    NotImplementedError
        If the data format is neither ``'csv'`` nor ``'postgresql'``.
    """
    # Generators are looked up by name so that only the selected one is
    # resolved on the instance.
    generator_names = {
        'csv': '_generate_csv',
        'postgresql': '_generate_postgresql',
    }
    chosen = generator_names.get(self._data_format)
    if chosen is None:
        raise NotImplementedError(f'Data format {self._data_format} '
                                  f'is not implemented by {__name__}')
    return getattr(self, chosen)()
Generate the instance using the Joins Relations scenario.
Only CSV files are currently implemented!
def
path(self) -> str:
def path(self) -> str:
    """Return the directory of this scenario instance, creating it if needed.

    Returns
    -------
    path : str
        ``<main directory>/<engine>/<data format>/<instance key>``.
    """
    instance_key = f'joins_relations_{self._n}-{self._m}_{self._percentage}'
    segments = (self._main_directory, self._engine, self._data_format,
                instance_key)
    target = os.path.join(*segments)
    self._logger.debug(f'Generating to {target}')
    os.makedirs(target, exist_ok=True)
    return target
Builds the file path for the instance of a Joins Relations scenario.
Returns
- path (str): File path for the Joins Relations instance.