bench_generator.empty_values
This module holds the EmptyValues class which scales the number of empty values in a data set with a fixed data size.
#!/usr/bin/env python3

"""
This module holds the EmptyValues class which scales the number of empty values
in a data set with a fixed data size.
"""

import os
import string
import random
from pandas import DataFrame
from rdflib.namespace import RDF
from rdflib import Graph, URIRef, BNode, Literal, Namespace
from bench_generator.scenario import Scenario
from bench_generator.logger import Logger

DATA_FILE = 'data.csv'
RDB_MAPPING_FILE = 'mapping.r2rml.ttl'
CSV_MAPPING_FILE = 'mapping.rml.ttl'
R2RML = Namespace('http://www.w3.org/ns/r2rml#')
RML = Namespace('http://semweb.mmlab.be/ns/rml#')
QL = Namespace('http://semweb.mmlab.be/ns/ql#')
EX = Namespace('http://example.com/')


class EmptyValues(Scenario):
    """Scenario which scales the percentage of members with empty values.

    The generated data set has a fixed size; a sampled share of its members
    gets every property value replaced by the string 'NULL'.
    """

    def __init__(self, main_directory: str, verbose: bool,
                 percentage: float, data_format: str, engine: str,
                 seed: int = 0, number_of_members: int = 100000,
                 number_of_properties: int = 20, value_size: int = 0):
        """Initialize an EmptyValues scenario.

        Parameters
        ----------
        main_directory : str
            Root directory for generating instances of EmptyValues.
        verbose : bool
            Verbose logging enabled or not.
        percentage : float
            Percentage of members to generate with empty values, for example
            50 results in a dataset in which 50% of the members have all
            their property values set to 'NULL'.
        data_format : str
            Data format to use for generating the data set. Only "csv" is
            currently accepted.
        engine : str
            Engine to use for execution of the generated scenario's instance,
            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
            or "OntopMaterialize"
        seed : int
            Random seed to use, default 0.
        number_of_members : int
            Number of members to generate, for example 5000 for 5K rows in a
            tabular data structure. Default 100K members.
        number_of_properties : int
            Number of properties per member to generate, for example 20 for
            20 columns in a tabular data structure. Default 20 properties.
        value_size : int
            Number of characters to add to default value generation,
            for example: 256 appends an underscore followed by 256 ASCII
            letters to every generated value. Default 0 added characters.

        Raises
        ------
        NotImplementedError
            If `data_format` is anything other than "csv".
        """
        self._percentage: float = percentage
        self._number_of_members = number_of_members
        self._number_of_properties = number_of_properties
        self._value_size = value_size
        # Seeds the module-level random generator used for row sampling.
        random.seed(seed)

        super().__init__(data_format, engine, main_directory, verbose)
        if self._data_format != 'csv':
            raise NotImplementedError(f'Data format {self._data_format} '
                                      f'is not implemented by {__name__}')

        self._logger = Logger(__name__, self._main_directory, self._verbose)

    def generate(self) -> bool:
        """Generate the instance using the EmptyValues scenario.

        Only CSV files are currently implemented!

        Returns
        -------
        success : bool
            Result of the format specific generator.

        Raises
        ------
        NotImplementedError
            If the data format is neither "csv" nor "postgresql".
        """
        if self._data_format == 'csv':
            return self._generate_csv()
        if self._data_format == 'postgresql':
            return self._generate_postgresql()

        raise NotImplementedError(f'Data format {self._data_format} '
                                  f'is not implemented by {__name__}')

    def path(self) -> str:
        """Builds the file path for the instance of an EmptyValues scenario.

        The directory is created when it does not exist yet.

        Returns
        -------
        path : str
            File path for the EmptyValues' instance.
        """
        key = f'empty_{self._percentage}_percentage'
        path = os.path.join(self._main_directory, self._engine,
                            self._data_format, key)
        self._logger.debug(f'Generating to {path}')
        os.makedirs(path, exist_ok=True)
        return path

    def _generate_dataframe(self, member_offset: int = 1,
                            property_offset: int = 1) -> DataFrame:
        """Generate the data before any value is emptied.

        Parameters
        ----------
        member_offset : int
            Offset to start member ID generation from. Default 1 (no offset).
        property_offset : int
            Offset to start property ID generation from. Default 1 (no offset).

        Returns
        -------
        dataframe : DataFrame
            Panda's DataFrame with an 'id' column and one 'p{j}' column per
            property.
        """
        subject_id = range(member_offset,
                           self._number_of_members + member_offset)
        value_id = range(property_offset,
                         self._number_of_members + property_offset)
        data: dict = {'id': subject_id}

        # The padding is the same for every property, so build it once.
        # Modulo keeps the lookup inside the ASCII letters table.
        append_value = ''
        if self._value_size > 0:
            n_ascii = len(string.ascii_letters)
            append_value = '_' + ''.join(string.ascii_letters[n % n_ascii]
                                         for n in range(self._value_size))

        for j in range(1, self._number_of_properties + 1):
            # Generate value V_{property}-{member} honoring the value size
            data[f'p{j}'] = [f'V_{j}-{i}{append_value}' for i in value_id]

        return DataFrame(data)

    def _update_dataframe(self, dataframe: DataFrame) -> DataFrame:
        """
        Sample a percentage of the members and empty all their properties.

        Every property column of a sampled row is set to the string 'NULL'.
        The dataframe is modified in place and returned.

        Parameters
        ----------
        dataframe : DataFrame
            The dataframe to update.

        Returns
        -------
        dataframe : DataFrame
            The updated dataframe.
        """
        percentage_members: float = self._number_of_members * \
            (self._percentage / 100.0)
        # Sample row positions; random.sample raises ValueError when the
        # count is negative or larger than the number of rows.
        rows = random.sample(range(len(dataframe)), int(percentage_members))
        columns = [f'p{j}' for j in range(1, self._number_of_properties + 1)]

        if rows and columns:
            # One vectorised assignment instead of a write per cell.
            dataframe.loc[dataframe.index[rows], columns] = 'NULL'

        return dataframe

    def _generate_mapping(self) -> Graph:
        """Generate a [R2]RML mapping for an EmptyValues instance.

        Returns
        -------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.

        Raises
        ------
        NotImplementedError
            If the data format is neither "postgresql" nor "csv".
        """
        mapping: Graph = Graph(base='http://ex.com/')
        mapping.bind('rr', R2RML)
        mapping.bind('ql', QL)
        mapping.bind('ex', EX)
        subject_template = Literal('http://ex.com/table/{id}')

        if self._data_format == 'postgresql':
            triples_map_iri = self._add_triples_map_table(mapping,
                                                          subject_template,
                                                          Literal('data'))
        elif self._data_format == 'csv':
            triples_map_iri = \
                self._add_triples_map_source(mapping, subject_template,
                                             Literal('/data/shared/data.csv'))
        else:
            raise NotImplementedError(f'{self._data_format} not implemented')

        # One predicate object map per generated property column.
        for i in range(1, self._number_of_properties + 1):
            self._add_predicate_object_map(mapping, triples_map_iri,
                                           EX[f'p{i}'], Literal(f'p{i}'))

        return mapping

    def _generate_files(self, mapping_file: str) -> bool:
        """Write the data, the mapping and the metadata of an instance.

        Parameters
        ----------
        mapping_file : str
            File name to store the serialized mapping under.

        Returns
        -------
        success : bool
            Always True.
        """
        # path() logs and creates the directory on each call: resolve once.
        shared_path = os.path.join(self.path(), 'data', 'shared')
        os.makedirs(shared_path, exist_ok=True)

        dataframe = self._update_dataframe(self._generate_dataframe())
        dataframe.to_csv(os.path.join(shared_path, DATA_FILE), index=False)

        mapping: Graph = self._generate_mapping()
        mapping.serialize(destination=os.path.join(shared_path, mapping_file),
                          format='turtle')
        # NOTE(review): the result of _generate_scenario() is ignored, so a
        # failed metadata generation is still reported as success - confirm.
        self._generate_scenario()

        return True

    def _generate_csv(self) -> bool:
        """Generate the instance as CSV files.

        Returns
        -------
        success : bool
            Always True.
        """
        return self._generate_files(CSV_MAPPING_FILE)

    def _generate_postgresql(self) -> bool:
        """Generate the instance as PostgreSQL with CSV files to load.

        Returns
        -------
        success : bool
            Always True.
        """
        return self._generate_files(RDB_MAPPING_FILE)

    def _generate_scenario(self) -> bool:
        """Generate the metadata for this scenario.

        Configures the execution pipeline automatically.

        Returns
        -------
        success : bool
            Result of the metadata generation.

        Raises
        ------
        NotImplementedError
            If the data format is neither "postgresql" nor "csv".
        """
        name: str = f'empty_{self._percentage}'
        description: str = f'Empty Values {self._percentage}'
        iri: str = f'http://example.org/empty/{self._percentage}/'

        if self._data_format == 'postgresql':
            mapping_file = RDB_MAPPING_FILE
        elif self._data_format == 'csv':
            mapping_file = CSV_MAPPING_FILE
        else:
            raise NotImplementedError(f'{self._data_format} not implemented')

        return self._generate_metadata(iri, name, description, mapping_file)
DATA_FILE =
'data.csv'
RDB_MAPPING_FILE =
'mapping.r2rml.ttl'
CSV_MAPPING_FILE =
'mapping.rml.ttl'
R2RML =
Namespace('http://www.w3.org/ns/r2rml#')
RML =
Namespace('http://semweb.mmlab.be/ns/rml#')
QL =
Namespace('http://semweb.mmlab.be/ns/ql#')
EX =
Namespace('http://example.com/')
class EmptyValues(Scenario):
    """Scenario which scales the percentage of members with empty values.

    The generated data set has a fixed size; a sampled share of its members
    gets every property value replaced by the string 'NULL'.
    """

    def __init__(self, main_directory: str, verbose: bool,
                 percentage: float, data_format: str, engine: str,
                 seed: int = 0, number_of_members: int = 100000,
                 number_of_properties: int = 20, value_size: int = 0):
        """Initialize an EmptyValues scenario.

        Parameters
        ----------
        main_directory : str
            Root directory for generating instances of EmptyValues.
        verbose : bool
            Verbose logging enabled or not.
        percentage : float
            Percentage of members to generate with empty values, for example
            50 results in a dataset in which 50% of the members have all
            their property values set to 'NULL'.
        data_format : str
            Data format to use for generating the data set. Only "csv" is
            currently accepted.
        engine : str
            Engine to use for execution of the generated scenario's instance,
            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
            or "OntopMaterialize"
        seed : int
            Random seed to use, default 0.
        number_of_members : int
            Number of members to generate, for example 5000 for 5K rows in a
            tabular data structure. Default 100K members.
        number_of_properties : int
            Number of properties per member to generate, for example 20 for
            20 columns in a tabular data structure. Default 20 properties.
        value_size : int
            Number of characters to add to default value generation,
            for example: 256 appends an underscore followed by 256 ASCII
            letters to every generated value. Default 0 added characters.

        Raises
        ------
        NotImplementedError
            If `data_format` is anything other than "csv".
        """
        self._percentage: float = percentage
        self._number_of_members = number_of_members
        self._number_of_properties = number_of_properties
        self._value_size = value_size
        # Seeds the module-level random generator used for row sampling.
        random.seed(seed)

        super().__init__(data_format, engine, main_directory, verbose)
        if self._data_format != 'csv':
            raise NotImplementedError(f'Data format {self._data_format} '
                                      f'is not implemented by {__name__}')

        self._logger = Logger(__name__, self._main_directory, self._verbose)

    def generate(self) -> bool:
        """Generate the instance using the EmptyValues scenario.

        Only CSV files are currently implemented!

        Returns
        -------
        success : bool
            Result of the format specific generator.

        Raises
        ------
        NotImplementedError
            If the data format is neither "csv" nor "postgresql".
        """
        if self._data_format == 'csv':
            return self._generate_csv()
        if self._data_format == 'postgresql':
            return self._generate_postgresql()

        raise NotImplementedError(f'Data format {self._data_format} '
                                  f'is not implemented by {__name__}')

    def path(self) -> str:
        """Builds the file path for the instance of an EmptyValues scenario.

        The directory is created when it does not exist yet.

        Returns
        -------
        path : str
            File path for the EmptyValues' instance.
        """
        key = f'empty_{self._percentage}_percentage'
        path = os.path.join(self._main_directory, self._engine,
                            self._data_format, key)
        self._logger.debug(f'Generating to {path}')
        os.makedirs(path, exist_ok=True)
        return path

    def _generate_dataframe(self, member_offset: int = 1,
                            property_offset: int = 1) -> DataFrame:
        """Generate the data before any value is emptied.

        Parameters
        ----------
        member_offset : int
            Offset to start member ID generation from. Default 1 (no offset).
        property_offset : int
            Offset to start property ID generation from. Default 1 (no offset).

        Returns
        -------
        dataframe : DataFrame
            Panda's DataFrame with an 'id' column and one 'p{j}' column per
            property.
        """
        subject_id = range(member_offset,
                           self._number_of_members + member_offset)
        value_id = range(property_offset,
                         self._number_of_members + property_offset)
        data: dict = {'id': subject_id}

        # The padding is the same for every property, so build it once.
        # Modulo keeps the lookup inside the ASCII letters table.
        append_value = ''
        if self._value_size > 0:
            n_ascii = len(string.ascii_letters)
            append_value = '_' + ''.join(string.ascii_letters[n % n_ascii]
                                         for n in range(self._value_size))

        for j in range(1, self._number_of_properties + 1):
            # Generate value V_{property}-{member} honoring the value size
            data[f'p{j}'] = [f'V_{j}-{i}{append_value}' for i in value_id]

        return DataFrame(data)

    def _update_dataframe(self, dataframe: DataFrame) -> DataFrame:
        """
        Sample a percentage of the members and empty all their properties.

        Every property column of a sampled row is set to the string 'NULL'.
        The dataframe is modified in place and returned.

        Parameters
        ----------
        dataframe : DataFrame
            The dataframe to update.

        Returns
        -------
        dataframe : DataFrame
            The updated dataframe.
        """
        percentage_members: float = self._number_of_members * \
            (self._percentage / 100.0)
        # Sample row positions; random.sample raises ValueError when the
        # count is negative or larger than the number of rows.
        rows = random.sample(range(len(dataframe)), int(percentage_members))
        columns = [f'p{j}' for j in range(1, self._number_of_properties + 1)]

        if rows and columns:
            # One vectorised assignment instead of a write per cell.
            dataframe.loc[dataframe.index[rows], columns] = 'NULL'

        return dataframe

    def _generate_mapping(self) -> Graph:
        """Generate a [R2]RML mapping for an EmptyValues instance.

        Returns
        -------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.

        Raises
        ------
        NotImplementedError
            If the data format is neither "postgresql" nor "csv".
        """
        mapping: Graph = Graph(base='http://ex.com/')
        mapping.bind('rr', R2RML)
        mapping.bind('ql', QL)
        mapping.bind('ex', EX)
        subject_template = Literal('http://ex.com/table/{id}')

        if self._data_format == 'postgresql':
            triples_map_iri = self._add_triples_map_table(mapping,
                                                          subject_template,
                                                          Literal('data'))
        elif self._data_format == 'csv':
            triples_map_iri = \
                self._add_triples_map_source(mapping, subject_template,
                                             Literal('/data/shared/data.csv'))
        else:
            raise NotImplementedError(f'{self._data_format} not implemented')

        # One predicate object map per generated property column.
        for i in range(1, self._number_of_properties + 1):
            self._add_predicate_object_map(mapping, triples_map_iri,
                                           EX[f'p{i}'], Literal(f'p{i}'))

        return mapping

    def _generate_files(self, mapping_file: str) -> bool:
        """Write the data, the mapping and the metadata of an instance.

        Parameters
        ----------
        mapping_file : str
            File name to store the serialized mapping under.

        Returns
        -------
        success : bool
            Always True.
        """
        # path() logs and creates the directory on each call: resolve once.
        shared_path = os.path.join(self.path(), 'data', 'shared')
        os.makedirs(shared_path, exist_ok=True)

        dataframe = self._update_dataframe(self._generate_dataframe())
        dataframe.to_csv(os.path.join(shared_path, DATA_FILE), index=False)

        mapping: Graph = self._generate_mapping()
        mapping.serialize(destination=os.path.join(shared_path, mapping_file),
                          format='turtle')
        # NOTE(review): the result of _generate_scenario() is ignored, so a
        # failed metadata generation is still reported as success - confirm.
        self._generate_scenario()

        return True

    def _generate_csv(self) -> bool:
        """Generate the instance as CSV files.

        Returns
        -------
        success : bool
            Always True.
        """
        return self._generate_files(CSV_MAPPING_FILE)

    def _generate_postgresql(self) -> bool:
        """Generate the instance as PostgreSQL with CSV files to load.

        Returns
        -------
        success : bool
            Always True.
        """
        return self._generate_files(RDB_MAPPING_FILE)

    def _generate_scenario(self) -> bool:
        """Generate the metadata for this scenario.

        Configures the execution pipeline automatically.

        Returns
        -------
        success : bool
            Result of the metadata generation.

        Raises
        ------
        NotImplementedError
            If the data format is neither "postgresql" nor "csv".
        """
        name: str = f'empty_{self._percentage}'
        description: str = f'Empty Values {self._percentage}'
        iri: str = f'http://example.org/empty/{self._percentage}/'

        if self._data_format == 'postgresql':
            mapping_file = RDB_MAPPING_FILE
        elif self._data_format == 'csv':
            mapping_file = CSV_MAPPING_FILE
        else:
            raise NotImplementedError(f'{self._data_format} not implemented')

        return self._generate_metadata(iri, name, description, mapping_file)
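Scenario which scales the percentage of members with empty values: the generated data set has a fixed size, and a sampled share of its members gets every property value replaced by the string 'NULL'.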
EmptyValues( main_directory: str, verbose: bool, percentage: float, data_format: str, engine: str, seed: int = 0, number_of_members: int = 100000, number_of_properties: int = 20, value_size: int = 0)
def __init__(self, main_directory: str, verbose: bool,
             percentage: float, data_format: str, engine: str,
             seed: int = 0, number_of_members: int = 100000,
             number_of_properties: int = 20, value_size: int = 0):
    """Initialize an EmptyValues scenario.

    Parameters
    ----------
    main_directory : str
        Root directory for generating instances of EmptyValues.
    verbose : bool
        Verbose logging enabled or not.
    percentage : float
        Percentage of members to generate with empty values.
    data_format : str
        Data format to use for generating the data set. Only "csv" is
        currently accepted.
    engine : str
        Engine to use for execution of the generated scenario's instance,
        for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
        or "OntopMaterialize"
    seed : int
        Random seed to use, default 0.
    number_of_members : int
        Number of members to generate, for example 5000 for 5K rows in a
        tabular data structure. Default 100K members.
    number_of_properties : int
        Number of properties per member to generate, for example 20 for
        20 columns in a tabular data structure. Default 20 properties.
    value_size : int
        Number of characters to add to default value generation.
        Default 0 added characters.

    Raises
    ------
    NotImplementedError
        If `data_format` is anything other than "csv".
    """
    # Sizing of the generated data set.
    self._number_of_members = number_of_members
    self._number_of_properties = number_of_properties
    self._value_size = value_size
    self._percentage: float = percentage
    # Seeds the module-level random generator.
    random.seed(seed)

    super().__init__(data_format, engine, main_directory, verbose)
    if not self._data_format == 'csv':
        raise NotImplementedError(f'Data format {self._data_format} '
                                  f'is not implemented by {__name__}')

    self._logger = Logger(__name__, self._main_directory, self._verbose)
Initialize an EmptyValues scenario.
Parameters
- main_directory (str): Root directory for generating instances of EmptyValues.
- verbose (bool): Verbose logging enabled or not.
- percentage (float): Percentage of members to generate with empty values, for example 50 results in a dataset in which 50% of the members have all their property values set to 'NULL'.
- data_format (str): Data format to use for generating the data set, for example: "csv", "json", "xml", "postgresql", "mysql". Only "csv" is currently accepted by this scenario; any other value raises NotImplementedError.
- engine (str): Engine to use for execution of the generated scenario's instance, for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC", or "OntopMaterialize"
- seed (int): Random seed to use, default 0.
- number_of_members (int): Number of members to generate, for example 5000 for 5K rows in a tabular data structure. Default 100K members.
- number_of_properties (int): Number of properties per member to generate, for example 20 for 20 columns in a tabular data structure. Default 20 properties.
- value_size (int): Number of characters to add to default value generation, for example: 256 appends an underscore followed by 256 ASCII letters to every generated value. Default 0 added characters.
def
generate(self) -> bool:
def generate(self) -> bool:
    """Generate the instance using the EmptyValues scenario.

    Only CSV files are currently implemented!

    Returns
    -------
    success : bool
        Result of the format specific generator.

    Raises
    ------
    NotImplementedError
        If the data format is neither "csv" nor "postgresql".
    """
    requested_format = self._data_format

    if requested_format == 'csv':
        return self._generate_csv()

    if requested_format == 'postgresql':
        return self._generate_postgresql()

    raise NotImplementedError(f'Data format {self._data_format} '
                              f'is not implemented by {__name__}')
Generate the instance using the EmptyValues scenario.
Only CSV files are currently implemented!
def
path(self) -> str:
def path(self) -> str:
    """Builds the file path for the instance of an EmptyValues scenario.

    The directory is created when it does not exist yet.

    Returns
    -------
    path : str
        File path for the EmptyValues' instance.
    """
    instance_directory = f'empty_{self._percentage}_percentage'
    parts = (self._main_directory, self._engine, self._data_format,
             instance_directory)
    target = os.path.join(*parts)

    self._logger.debug(f'Generating to {target}')
    os.makedirs(target, exist_ok=True)

    return target
Builds the file path for the instance of an EmptyValues scenario.
Returns
- path (str): File path for the EmptyValues' instance.