bench_generator.duplicates
This module holds the Duplicates class which scales the number of duplicates in a data set with a fixed data size.
#!/usr/bin/env python3

"""
This module holds the Duplicates class which scales the number of duplicates
in a data set with a fixed data size.
"""

import os
import string
import random
import numpy
from pandas import DataFrame
from rdflib.namespace import RDF
from rdflib import Graph, URIRef, BNode, Literal, Namespace
from bench_generator.scenario import Scenario
from bench_generator.logger import Logger

DATA_FILE = 'data.csv'
RDB_MAPPING_FILE = 'mapping.r2rml.ttl'
CSV_MAPPING_FILE = 'mapping.rml.ttl'
R2RML = Namespace('http://www.w3.org/ns/r2rml#')
RML = Namespace('http://semweb.mmlab.be/ns/rml#')
QL = Namespace('http://semweb.mmlab.be/ns/ql#')
EX = Namespace('http://example.com/')


class Duplicates(Scenario):
    """Scenario with a fixed data size and a configurable share of duplicates.

    A percentage of the generated members is overwritten with one identical
    row, so the number of duplicates scales while the data size stays fixed.
    """

    def __init__(self, main_directory: str, verbose: bool,
                 percentage: float, data_format: str, engine: str,
                 seed: int = 0, number_of_members: int = 100000,
                 number_of_properties: int = 20, value_size: int = 0):
        """Initialize a Duplicates scenario.

        Parameters
        ----------
        main_directory : str
            Root directory for generating instances of Duplicates.
        verbose : bool
            Verbose logging enabled or not.
        percentage : float
            Percentage of duplicates to generate, for example 50 results in
            a dataset in which 50% of the members hold the same data values.
        data_format : str
            Data format to use for generating the data set, for example:
            "csv", "json", "xml", "postgresql", "mysql"
        engine : str
            Engine to use for execution of the generated scenario's instance,
            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
            or "OntopMaterialize"
        seed : int
            Random seed to use, default 0.
        number_of_members : int
            Number of members to generate, for example 5000 for 5K rows in a
            tabular data structure. Default 100K members.
        number_of_properties : int
            Number of properties per member to generate, for example 20 for
            20 columns in a tabular data structure. Default 20 properties.
        value_size : int
            Number of characters to append to each generated value, for
            example: 256 appends an underscore followed by 256 ASCII letters.
            Default 0 added characters.

        Raises
        ------
        NotImplementedError
            If the data format is anything other than "csv".
        """
        self._percentage: float = percentage
        self._number_of_members = number_of_members
        self._number_of_properties = number_of_properties
        self._value_size = value_size
        # Seeds the module-level generator used when sampling duplicates.
        random.seed(seed)

        super().__init__(data_format, engine, main_directory, verbose)

        # NOTE(review): _data_format, _main_directory and _verbose are
        # presumably set by Scenario.__init__ -- confirm against the base.
        if self._data_format != 'csv':
            raise NotImplementedError(f'Data format {self._data_format} '
                                      f'is not implemented by {__name__}')

        self._logger = Logger(__name__, self._main_directory, self._verbose)

    def generate(self) -> bool:
        """Generate the instance using the Duplicates scenario.

        Only CSV files are currently implemented!

        Returns
        -------
        success : bool
            Result of the format-specific generator.

        Raises
        ------
        NotImplementedError
            If the data format is neither "csv" nor "postgresql".
        """
        if self._data_format == 'csv':
            return self._generate_csv()
        elif self._data_format == 'postgresql':
            return self._generate_postgresql()
        else:
            raise NotImplementedError(f'Data format {self._data_format} '
                                      f'is not implemented by {__name__}')

    def path(self) -> str:
        """Build and create the directory for this Duplicates instance.

        Returns
        -------
        path : str
            File path for the Duplicates' instance.
        """
        key = f'duplicates_{self._percentage}_percentage'
        path = os.path.join(self._main_directory, self._engine,
                            self._data_format, key)
        self._logger.debug(f'Generating to {path}')
        os.makedirs(path, exist_ok=True)
        return path

    def _generate_dataframe(self, member_offset: int = 1,
                            property_offset: int = 1) -> DataFrame:
        """Generate duplicates data.

        Parameters
        ----------
        member_offset : int
            Offset to start member ID generation from. Default 1 (no offset).
        property_offset : int
            Offset to start the numbering inside the generated values from.
            Default 1 (no offset).

        Returns
        -------
        dataframe : DataFrame
            Panda's DataFrame with an 'id' column and one 'p{j}' column per
            property.
        """
        subject_id = range(member_offset,
                           self._number_of_members + member_offset)
        value_id = range(property_offset,
                         self._number_of_members + property_offset)
        data: dict = {'id': subject_id}

        # The padding is the same for every cell, so build it once instead
        # of once per property. The modulo wraps around the ASCII letters
        # when value_size exceeds their number.
        suffix = ''
        if self._value_size > 0:
            letters = string.ascii_letters
            suffix = '_' + ''.join(letters[n % len(letters)]
                                   for n in range(self._value_size))

        for j in range(1, self._number_of_properties + 1):
            # Values look like V_{property}-{member}, plus the padding.
            data[f'p{j}'] = [f'V_{j}-{i}{suffix}' for i in value_id]

        return DataFrame(data)

    def _update_dataframe(self, dataframe: DataFrame) -> DataFrame:
        """
        Sample a percentage of the dataframe to fill with the same value.

        Parameters
        ----------
        dataframe : DataFrame
            The dataframe to update, it is modified in place.

        Returns
        -------
        dataframe : DataFrame
            The updated dataframe.

        Raises
        ------
        ValueError
            If the percentage asks for a negative number of rows or for
            more rows than the dataframe holds.
        """
        count = int(self._number_of_members * (self._percentage / 100.0))
        if count < 0 or count > len(dataframe):
            # random.sample would raise ValueError here as well, only with
            # a message that does not mention the percentage.
            raise ValueError(f'Percentage {self._percentage} selects {count} '
                             f'of {len(dataframe)} members')

        rows = random.sample(list(dataframe.index), count)
        if not rows:
            return dataframe

        # One vectorised assignment instead of a .loc call per cell.
        columns = [f'p{j}' for j in range(1, self._number_of_properties + 1)]
        if columns:
            dataframe.loc[rows, columns] = 'DUPLICATE'
        # Duplicated members also share their ID so whole rows are equal.
        dataframe.loc[rows, 'id'] = numpy.iinfo(numpy.int64).max

        return dataframe

    def _generate_mapping(self) -> Graph:
        """Generate a [R2]RML mapping for a Duplicates instance.

        Returns
        -------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.

        Raises
        ------
        NotImplementedError
            If the data format is neither "postgresql" nor "csv".
        """
        mapping: Graph = Graph(base='http://ex.com/')
        mapping.bind('rr', R2RML)
        mapping.bind('ql', QL)
        mapping.bind('ex', EX)
        subject_template = Literal('http://ex.com/table/{id}')

        if self._data_format == 'postgresql':
            triples_map_iri = self._add_triples_map_table(mapping,
                                                          subject_template,
                                                          Literal('data'))
        elif self._data_format == 'csv':
            triples_map_iri = \
                self._add_triples_map_source(mapping, subject_template,
                                             Literal('/data/shared/data.csv'))
        else:
            raise NotImplementedError(f'{self._data_format} not implemented')

        # One predicate-object map per generated property column.
        for i in range(1, self._number_of_properties + 1):
            self._add_predicate_object_map(mapping, triples_map_iri,
                                           EX[f'p{i}'], Literal(f'p{i}'))

        return mapping

    def _generate_instance_files(self, mapping_file: str) -> bool:
        """Write the data, the mapping and the metadata of an instance.

        Parameters
        ----------
        mapping_file : str
            File name to serialize the mapping to.

        Returns
        -------
        success : bool
            Always True.
        """
        # Resolve the instance directory once, path() logs and creates it.
        shared_path = os.path.join(self.path(), 'data', 'shared')
        os.makedirs(shared_path, exist_ok=True)

        dataframe = self._update_dataframe(self._generate_dataframe())
        dataframe.to_csv(os.path.join(shared_path, DATA_FILE), index=False)

        mapping: Graph = self._generate_mapping()
        mapping.serialize(destination=os.path.join(shared_path, mapping_file),
                          format='turtle')
        # NOTE(review): the result of _generate_scenario() is ignored, as it
        # was before -- confirm whether it should be propagated.
        self._generate_scenario()

        return True

    def _generate_csv(self) -> bool:
        """Generate the instance as CSV files.

        Returns
        -------
        success : bool
            Always True.
        """
        return self._generate_instance_files(CSV_MAPPING_FILE)

    def _generate_postgresql(self) -> bool:
        """Generate the instance as PostgreSQL with CSV files to load.

        Returns
        -------
        success : bool
            Always True.
        """
        return self._generate_instance_files(RDB_MAPPING_FILE)

    def _generate_scenario(self) -> bool:
        """Generate the metadata for this scenario.

        Returns
        -------
        success : bool
            Result of the metadata generation.

        Raises
        ------
        NotImplementedError
            If the data format is neither "postgresql" nor "csv".
        """
        name: str = f'duplicates_{self._percentage}'
        description: str = f'Duplicates {self._percentage}'
        iri: str = f'http://example.org/duplicates/{self._percentage}/'

        if self._data_format == 'postgresql':
            return self._generate_metadata(iri, name, description,
                                           RDB_MAPPING_FILE)
        elif self._data_format == 'csv':
            return self._generate_metadata(iri, name, description,
                                           CSV_MAPPING_FILE)
        else:
            raise NotImplementedError(f'{self._data_format} not implemented')
# File name of the CSV data set written by the generators.
DATA_FILE =
'data.csv'
# File name the mapping is serialized to for the 'postgresql' data format.
RDB_MAPPING_FILE =
'mapping.r2rml.ttl'
# File name the mapping is serialized to for the 'csv' data format.
CSV_MAPPING_FILE =
'mapping.rml.ttl'
# R2RML vocabulary, bound to the 'rr' prefix in generated mappings.
R2RML =
Namespace('http://www.w3.org/ns/r2rml#')
# RML vocabulary; not referenced by the code visible in this module.
RML =
Namespace('http://semweb.mmlab.be/ns/rml#')
# Query-language vocabulary, bound to the 'ql' prefix in generated mappings.
QL =
Namespace('http://semweb.mmlab.be/ns/ql#')
# Example namespace, bound to 'ex' and used for the property predicates.
EX =
Namespace('http://example.com/')
class Duplicates(Scenario):
    """Scenario with a fixed data size and a configurable share of duplicates.

    A percentage of the generated members is overwritten with one identical
    row, so the number of duplicates scales while the data size stays fixed.
    """

    def __init__(self, main_directory: str, verbose: bool,
                 percentage: float, data_format: str, engine: str,
                 seed: int = 0, number_of_members: int = 100000,
                 number_of_properties: int = 20, value_size: int = 0):
        """Initialize a Duplicates scenario.

        Parameters
        ----------
        main_directory : str
            Root directory for generating instances of Duplicates.
        verbose : bool
            Verbose logging enabled or not.
        percentage : float
            Percentage of duplicates to generate, for example 50 results in
            a dataset in which 50% of the members hold the same data values.
        data_format : str
            Data format to use for generating the data set, for example:
            "csv", "json", "xml", "postgresql", "mysql"
        engine : str
            Engine to use for execution of the generated scenario's instance,
            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
            or "OntopMaterialize"
        seed : int
            Random seed to use, default 0.
        number_of_members : int
            Number of members to generate, for example 5000 for 5K rows in a
            tabular data structure. Default 100K members.
        number_of_properties : int
            Number of properties per member to generate, for example 20 for
            20 columns in a tabular data structure. Default 20 properties.
        value_size : int
            Number of characters to append to each generated value, for
            example: 256 appends an underscore followed by 256 ASCII letters.
            Default 0 added characters.

        Raises
        ------
        NotImplementedError
            If the data format is anything other than "csv".
        """
        self._percentage: float = percentage
        self._number_of_members = number_of_members
        self._number_of_properties = number_of_properties
        self._value_size = value_size
        # Seeds the module-level generator used when sampling duplicates.
        random.seed(seed)

        super().__init__(data_format, engine, main_directory, verbose)

        # NOTE(review): _data_format, _main_directory and _verbose are
        # presumably set by Scenario.__init__ -- confirm against the base.
        if self._data_format != 'csv':
            raise NotImplementedError(f'Data format {self._data_format} '
                                      f'is not implemented by {__name__}')

        self._logger = Logger(__name__, self._main_directory, self._verbose)

    def generate(self) -> bool:
        """Generate the instance using the Duplicates scenario.

        Only CSV files are currently implemented!

        Returns
        -------
        success : bool
            Result of the format-specific generator.

        Raises
        ------
        NotImplementedError
            If the data format is neither "csv" nor "postgresql".
        """
        if self._data_format == 'csv':
            return self._generate_csv()
        elif self._data_format == 'postgresql':
            return self._generate_postgresql()
        else:
            raise NotImplementedError(f'Data format {self._data_format} '
                                      f'is not implemented by {__name__}')

    def path(self) -> str:
        """Build and create the directory for this Duplicates instance.

        Returns
        -------
        path : str
            File path for the Duplicates' instance.
        """
        key = f'duplicates_{self._percentage}_percentage'
        path = os.path.join(self._main_directory, self._engine,
                            self._data_format, key)
        self._logger.debug(f'Generating to {path}')
        os.makedirs(path, exist_ok=True)
        return path

    def _generate_dataframe(self, member_offset: int = 1,
                            property_offset: int = 1) -> DataFrame:
        """Generate duplicates data.

        Parameters
        ----------
        member_offset : int
            Offset to start member ID generation from. Default 1 (no offset).
        property_offset : int
            Offset to start the numbering inside the generated values from.
            Default 1 (no offset).

        Returns
        -------
        dataframe : DataFrame
            Panda's DataFrame with an 'id' column and one 'p{j}' column per
            property.
        """
        subject_id = range(member_offset,
                           self._number_of_members + member_offset)
        value_id = range(property_offset,
                         self._number_of_members + property_offset)
        data: dict = {'id': subject_id}

        # The padding is the same for every cell, so build it once instead
        # of once per property. The modulo wraps around the ASCII letters
        # when value_size exceeds their number.
        suffix = ''
        if self._value_size > 0:
            letters = string.ascii_letters
            suffix = '_' + ''.join(letters[n % len(letters)]
                                   for n in range(self._value_size))

        for j in range(1, self._number_of_properties + 1):
            # Values look like V_{property}-{member}, plus the padding.
            data[f'p{j}'] = [f'V_{j}-{i}{suffix}' for i in value_id]

        return DataFrame(data)

    def _update_dataframe(self, dataframe: DataFrame) -> DataFrame:
        """
        Sample a percentage of the dataframe to fill with the same value.

        Parameters
        ----------
        dataframe : DataFrame
            The dataframe to update, it is modified in place.

        Returns
        -------
        dataframe : DataFrame
            The updated dataframe.

        Raises
        ------
        ValueError
            If the percentage asks for a negative number of rows or for
            more rows than the dataframe holds.
        """
        count = int(self._number_of_members * (self._percentage / 100.0))
        if count < 0 or count > len(dataframe):
            # random.sample would raise ValueError here as well, only with
            # a message that does not mention the percentage.
            raise ValueError(f'Percentage {self._percentage} selects {count} '
                             f'of {len(dataframe)} members')

        rows = random.sample(list(dataframe.index), count)
        if not rows:
            return dataframe

        # One vectorised assignment instead of a .loc call per cell.
        columns = [f'p{j}' for j in range(1, self._number_of_properties + 1)]
        if columns:
            dataframe.loc[rows, columns] = 'DUPLICATE'
        # Duplicated members also share their ID so whole rows are equal.
        dataframe.loc[rows, 'id'] = numpy.iinfo(numpy.int64).max

        return dataframe

    def _generate_mapping(self) -> Graph:
        """Generate a [R2]RML mapping for a Duplicates instance.

        Returns
        -------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.

        Raises
        ------
        NotImplementedError
            If the data format is neither "postgresql" nor "csv".
        """
        mapping: Graph = Graph(base='http://ex.com/')
        mapping.bind('rr', R2RML)
        mapping.bind('ql', QL)
        mapping.bind('ex', EX)
        subject_template = Literal('http://ex.com/table/{id}')

        if self._data_format == 'postgresql':
            triples_map_iri = self._add_triples_map_table(mapping,
                                                          subject_template,
                                                          Literal('data'))
        elif self._data_format == 'csv':
            triples_map_iri = \
                self._add_triples_map_source(mapping, subject_template,
                                             Literal('/data/shared/data.csv'))
        else:
            raise NotImplementedError(f'{self._data_format} not implemented')

        # One predicate-object map per generated property column.
        for i in range(1, self._number_of_properties + 1):
            self._add_predicate_object_map(mapping, triples_map_iri,
                                           EX[f'p{i}'], Literal(f'p{i}'))

        return mapping

    def _generate_instance_files(self, mapping_file: str) -> bool:
        """Write the data, the mapping and the metadata of an instance.

        Parameters
        ----------
        mapping_file : str
            File name to serialize the mapping to.

        Returns
        -------
        success : bool
            Always True.
        """
        # Resolve the instance directory once, path() logs and creates it.
        shared_path = os.path.join(self.path(), 'data', 'shared')
        os.makedirs(shared_path, exist_ok=True)

        dataframe = self._update_dataframe(self._generate_dataframe())
        dataframe.to_csv(os.path.join(shared_path, DATA_FILE), index=False)

        mapping: Graph = self._generate_mapping()
        mapping.serialize(destination=os.path.join(shared_path, mapping_file),
                          format='turtle')
        # NOTE(review): the result of _generate_scenario() is ignored, as it
        # was before -- confirm whether it should be propagated.
        self._generate_scenario()

        return True

    def _generate_csv(self) -> bool:
        """Generate the instance as CSV files.

        Returns
        -------
        success : bool
            Always True.
        """
        return self._generate_instance_files(CSV_MAPPING_FILE)

    def _generate_postgresql(self) -> bool:
        """Generate the instance as PostgreSQL with CSV files to load.

        Returns
        -------
        success : bool
            Always True.
        """
        return self._generate_instance_files(RDB_MAPPING_FILE)

    def _generate_scenario(self) -> bool:
        """Generate the metadata for this scenario.

        Returns
        -------
        success : bool
            Result of the metadata generation.

        Raises
        ------
        NotImplementedError
            If the data format is neither "postgresql" nor "csv".
        """
        name: str = f'duplicates_{self._percentage}'
        description: str = f'Duplicates {self._percentage}'
        iri: str = f'http://example.org/duplicates/{self._percentage}/'

        if self._data_format == 'postgresql':
            return self._generate_metadata(iri, name, description,
                                           RDB_MAPPING_FILE)
        elif self._data_format == 'csv':
            return self._generate_metadata(iri, name, description,
                                           CSV_MAPPING_FILE)
        else:
            raise NotImplementedError(f'{self._data_format} not implemented')
Helper class that provides a standard way to create an ABC using inheritance.
Duplicates( main_directory: str, verbose: bool, percentage: float, data_format: str, engine: str, seed: int = 0, number_of_members: int = 100000, number_of_properties: int = 20, value_size: int = 0)
def __init__(self, main_directory: str, verbose: bool,
             percentage: float, data_format: str, engine: str,
             seed: int = 0, number_of_members: int = 100000,
             number_of_properties: int = 20, value_size: int = 0):
    """Set up a Duplicates scenario.

    Parameters
    ----------
    main_directory : str
        Root directory under which Duplicates instances are generated.
    verbose : bool
        Whether verbose logging is enabled.
    percentage : float
        Percentage of duplicates to generate.
    data_format : str
        Data format of the generated data set, only "csv" is accepted.
    engine : str
        Engine that executes the generated instance, for example
        "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC", or
        "OntopMaterialize".
    seed : int
        Random seed, default 0.
    number_of_members : int
        Number of members (rows) to generate, default 100K.
    number_of_properties : int
        Number of properties (columns) per member, default 20.
    value_size : int
        Number of characters added to each generated value, default 0.

    Raises
    ------
    NotImplementedError
        If the data format is anything other than "csv".
    """
    # Scenario-specific settings are stored before the base initializer.
    self._value_size = value_size
    self._number_of_properties = number_of_properties
    self._number_of_members = number_of_members
    self._percentage: float = percentage
    random.seed(seed)

    super().__init__(data_format, engine, main_directory, verbose)

    csv_requested = self._data_format == 'csv'
    if not csv_requested:
        message = (f'Data format {self._data_format} '
                   f'is not implemented by {__name__}')
        raise NotImplementedError(message)

    self._logger = Logger(__name__, self._main_directory, self._verbose)
Initialize a Duplicates scenario.
Parameters
- main_directory (str): Root directory for generating instances of Duplicates.
- verbose (bool): Verbose logging enabled or not.
- percentage (float): Percentage of duplicates to generate; for example, 50 results in a dataset in which 50% of the members hold the same data values.
- data_format (str): Data format to use for generating the data set, for example: "csv", "json", "xml", "postgresql", "mysql"
- engine (str): Engine to use for execution of the generated scenario's instance, for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC", or "OntopMaterialize"
- seed (int): Random seed to use, default 0.
- number_of_members (int): Number of members to generate, for example 5000 for 5K rows in a tabular data structure. Default 100K members.
- number_of_properties (int): Number of properties per member to generate, for example 20 for 20 columns in a tabular data structure. Default 20 properties.
- value_size (int): Number of characters to append to each generated value, for example: 256 appends an underscore followed by 256 ASCII letters to every value. Default 0 added characters.
def
generate(self) -> bool:
def generate(self) -> bool:
    """Generate the instance for the configured data format.

    Dispatches to the CSV or PostgreSQL generator; only CSV files are
    currently implemented.

    Raises
    ------
    NotImplementedError
        If the data format is neither "csv" nor "postgresql".
    """
    data_format = self._data_format
    if data_format == 'csv':
        return self._generate_csv()
    if data_format == 'postgresql':
        return self._generate_postgresql()

    message = (f'Data format {self._data_format} '
               f'is not implemented by {__name__}')
    raise NotImplementedError(message)
Generate the instance using the Duplicates scenario.
Only CSV files are currently implemented!
def
path(self) -> str:
def path(self) -> str:
    """Return the directory of this Duplicates instance, creating it.

    The directory is nested by main directory, engine and data format;
    its last component names the configured percentage.

    Returns
    -------
    path : str
        File path for the Duplicates' instance.
    """
    instance_dir = os.path.join(
        self._main_directory,
        self._engine,
        self._data_format,
        f'duplicates_{self._percentage}_percentage',
    )
    self._logger.debug(f'Generating to {instance_dir}')
    # exist_ok keeps repeated calls from failing on an existing directory.
    os.makedirs(instance_dir, exist_ok=True)
    return instance_dir
Builds the file path for the instance of a Duplicates scenario.
Returns
- path (str): File path for the Duplicates' instance.