bench_generator.raw_data
This module holds the RawData class which scales the dataset size by the number of members in a dataset such as number of rows for tabular data.
#!/usr/bin/env python3

"""
This module holds the RawData class which scales the dataset size
by the number of members in a dataset such as number of rows for tabular data.
"""

import os
import string
from pandas import DataFrame
from rdflib import Graph, Literal, Namespace
from bench_generator.scenario import Scenario
from bench_generator.logger import Logger

DATA_FILE = 'data.csv'
RDB_MAPPING_FILE = 'mapping.r2rml.ttl'
CSV_MAPPING_FILE = 'mapping.rml.ttl'
R2RML = Namespace('http://www.w3.org/ns/r2rml#')
RML = Namespace('http://semweb.mmlab.be/ns/rml#')
QL = Namespace('http://semweb.mmlab.be/ns/ql#')
EX = Namespace('http://example.com/')


class RawData(Scenario):
    """Scenario that scales the dataset by its number of members.

    An instance consists of a generated CSV data file (one row per member,
    one column per property), a matching [R2]RML mapping and the scenario
    metadata. The 'csv' and 'postgresql' data formats are supported.
    """

    def __init__(self, main_directory: str, verbose: bool,
                 number_of_members: int, number_of_properties: int,
                 value_size: int, data_format: str, engine: str):
        """Initialize a Raw Data scenario.

        Parameters
        ----------
        main_directory : str
            Root directory for generating instances of Raw Data.
        verbose : bool
            Verbose logging enabled or not.
        number_of_members : int
            Number of members to generate, for example 5000 for 5K rows in a
            tabular data structure.
        number_of_properties : int
            Number of properties per member to generate, for example 20 for
            20 columns in a tabular data structure.
        value_size : int
            Number of ASCII letters appended (after an underscore) to every
            generated value, for example: 256 appends 256 letters. 0 or less
            leaves the values unpadded.
        data_format : str
            Data format to use for generating the data set. This scenario
            implements "csv" and "postgresql".
        engine : str
            Engine to use for execution of the generated scenario's instance,
            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
            or "OntopMaterialize"
        """
        self._number_of_members: int = number_of_members
        self._number_of_properties: int = number_of_properties
        self._value_size: int = value_size

        super().__init__(data_format, engine, main_directory, verbose)
        self._logger = Logger(__name__, self._main_directory, self._verbose)

    def generate(self) -> bool:
        """Generate the instance using the Raw Data scenario.

        CSV and PostgreSQL are implemented.

        Returns
        -------
        success : bool
            Result of the format specific generator.

        Raises
        ------
        NotImplementedError
            If the data format is neither 'csv' nor 'postgresql'.
        """
        if self._data_format == 'csv':
            return self._generate_csv()
        elif self._data_format == 'postgresql':
            return self._generate_postgresql()
        else:
            raise NotImplementedError(f'Data format {self._data_format} '
                                      f'is not implemented by {__name__}')

    def path(self) -> str:
        """Builds the file path for the instance of a Raw Data scenario.

        The directory is created when it does not exist yet and the path is
        logged at debug level on every call.

        Returns
        -------
        path : str
            File path for the Raw Data's instance.
        """
        path = os.path.join(self._main_directory, self._engine,
                            self._data_format, self._instance_key())
        self._logger.debug(f'Generating to {path}')
        os.makedirs(path, exist_ok=True)
        return path

    def _instance_key(self) -> str:
        """Return the name shared by the instance directory and metadata."""
        return f'raw_{self._number_of_members}_' \
               f'{self._number_of_properties}_{self._value_size}'

    def _generate_dataframe(self, member_offset: int = 1,
                            property_offset: int = 1) -> DataFrame:
        """Generate raw data.

        Parameters
        ----------
        member_offset : int
            First value of the 'id' column. Default 1 (no offset).
        property_offset : int
            First value of the per-row counter embedded in each generated
            value (the ``{i}`` in ``V_{j}-{i}``). Default 1 (no offset).

        Returns
        -------
        dataframe : DataFrame
            Panda's DataFrame with an 'id' column and one 'p{j}' column per
            property.
        """
        subject_id = range(member_offset,
                           self._number_of_members + member_offset)
        value_id = range(property_offset,
                         self._number_of_members + property_offset)
        data: dict = {'id': subject_id}

        # The padding is identical for every value, build it only once.
        # Modulo wraps around the ASCII letters for large value sizes.
        append_value = ''
        if self._value_size > 0:
            letters = string.ascii_letters
            append_value = '_' + ''.join(letters[n % len(letters)]
                                         for n in range(self._value_size))

        for j in range(1, self._number_of_properties + 1):
            # Generate value V_{property}-{member} honoring the value size
            data[f'p{j}'] = [f'V_{j}-{i}{append_value}' for i in value_id]

        return DataFrame(data)

    def _generate_mapping(self) -> Graph:
        """Generate a [R2]RML mapping for a RawData instance.

        Returns
        -------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.

        Raises
        ------
        NotImplementedError
            If the data format is neither 'postgresql' nor 'csv'.
        """
        mapping: Graph = Graph(base='http://ex.com/')
        mapping.bind('rr', R2RML)
        mapping.bind('rml', RML)
        mapping.bind('ql', QL)
        mapping.bind('ex', EX)
        subject_template = Literal('http://ex.com/table/{id}')

        if self._data_format == 'postgresql':
            triples_map_iri = self._add_triples_map_table(mapping,
                                                          subject_template,
                                                          Literal('data'))
        elif self._data_format == 'csv':
            triples_map_iri = \
                self._add_triples_map_source(mapping, subject_template,
                                             Literal('/data/shared/'
                                                     f'{DATA_FILE}'))
        else:
            raise NotImplementedError(f'{self._data_format} not implemented')

        # One predicate object map per generated property column
        for i in range(1, self._number_of_properties + 1):
            self._add_predicate_object_map(mapping, triples_map_iri,
                                           EX[f'p{i}'], Literal(f'p{i}'))

        return mapping

    def _write_raw_data_files(self, mapping_file: str) -> bool:
        """Write the data file, the mapping and the scenario metadata.

        Parameters
        ----------
        mapping_file : str
            File name of the mapping inside the instance's shared directory.

        Returns
        -------
        success : bool
            Always True, failures surface as exceptions.
        """
        # Resolve the instance path once: every path() call logs and
        # re-creates the directory.
        shared_path = os.path.join(self.path(), 'data', 'shared')
        os.makedirs(shared_path, exist_ok=True)

        data_path = os.path.join(shared_path, DATA_FILE)
        self._generate_dataframe().to_csv(data_path, index=False)

        mapping_path = os.path.join(shared_path, mapping_file)
        mapping: Graph = self._generate_mapping()
        mapping.serialize(destination=mapping_path, format='turtle')
        # NOTE(review): the result of _generate_scenario() is not propagated,
        # matching the previous behaviour - confirm whether it should be.
        self._generate_scenario()

        return True

    def _generate_csv(self) -> bool:
        """Generate the instance as CSV files.

        Returns
        -------
        success : bool
            True if successful
        """
        return self._write_raw_data_files(CSV_MAPPING_FILE)

    def _generate_postgresql(self) -> bool:
        """Generate the instance as PostgreSQL with CSV files to load.

        Returns
        -------
        success : bool
            True if successful
        """
        return self._write_raw_data_files(RDB_MAPPING_FILE)

    def _generate_scenario(self) -> bool:
        """Generate the metadata for this scenario.

        Configures the execution pipeline automatically.

        Returns
        -------
        success : bool
            Result of the metadata generation.

        Raises
        ------
        NotImplementedError
            If the data format is neither 'postgresql' nor 'csv'.
        """
        name: str = self._instance_key()
        description: str = f'Raw Data Values {self._number_of_members} ' \
                           f'members, {self._number_of_properties} ' \
                           f'properties, and {self._value_size} value size'
        iri: str = f'http://example.org/raw/{self._number_of_members}/' \
                   f'{self._number_of_properties}/{self._value_size}'

        if self._data_format == 'postgresql':
            return self._generate_metadata(iri, name, description,
                                           RDB_MAPPING_FILE)
        elif self._data_format == 'csv':
            return self._generate_metadata(iri, name, description,
                                           CSV_MAPPING_FILE)
        else:
            raise NotImplementedError(f'{self._data_format} not implemented')
DATA_FILE =
'data.csv'
RDB_MAPPING_FILE =
'mapping.r2rml.ttl'
CSV_MAPPING_FILE =
'mapping.rml.ttl'
R2RML =
Namespace('http://www.w3.org/ns/r2rml#')
RML =
Namespace('http://semweb.mmlab.be/ns/rml#')
QL =
Namespace('http://semweb.mmlab.be/ns/ql#')
EX =
Namespace('http://example.com/')
class RawData(Scenario):
    """Scenario that scales the dataset by its number of members.

    An instance consists of a generated CSV data file (one row per member,
    one column per property), a matching [R2]RML mapping and the scenario
    metadata. The 'csv' and 'postgresql' data formats are supported.
    """

    def __init__(self, main_directory: str, verbose: bool,
                 number_of_members: int, number_of_properties: int,
                 value_size: int, data_format: str, engine: str):
        """Initialize a Raw Data scenario.

        Parameters
        ----------
        main_directory : str
            Root directory for generating instances of Raw Data.
        verbose : bool
            Verbose logging enabled or not.
        number_of_members : int
            Number of members to generate, for example 5000 for 5K rows in a
            tabular data structure.
        number_of_properties : int
            Number of properties per member to generate, for example 20 for
            20 columns in a tabular data structure.
        value_size : int
            Number of ASCII letters appended (after an underscore) to every
            generated value, for example: 256 appends 256 letters. 0 or less
            leaves the values unpadded.
        data_format : str
            Data format to use for generating the data set. This scenario
            implements "csv" and "postgresql".
        engine : str
            Engine to use for execution of the generated scenario's instance,
            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
            or "OntopMaterialize"
        """
        self._number_of_members: int = number_of_members
        self._number_of_properties: int = number_of_properties
        self._value_size: int = value_size

        super().__init__(data_format, engine, main_directory, verbose)
        self._logger = Logger(__name__, self._main_directory, self._verbose)

    def generate(self) -> bool:
        """Generate the instance using the Raw Data scenario.

        CSV and PostgreSQL are implemented.

        Returns
        -------
        success : bool
            Result of the format specific generator.

        Raises
        ------
        NotImplementedError
            If the data format is neither 'csv' nor 'postgresql'.
        """
        if self._data_format == 'csv':
            return self._generate_csv()
        elif self._data_format == 'postgresql':
            return self._generate_postgresql()
        else:
            raise NotImplementedError(f'Data format {self._data_format} '
                                      f'is not implemented by {__name__}')

    def path(self) -> str:
        """Builds the file path for the instance of a Raw Data scenario.

        The directory is created when it does not exist yet and the path is
        logged at debug level on every call.

        Returns
        -------
        path : str
            File path for the Raw Data's instance.
        """
        path = os.path.join(self._main_directory, self._engine,
                            self._data_format, self._instance_key())
        self._logger.debug(f'Generating to {path}')
        os.makedirs(path, exist_ok=True)
        return path

    def _instance_key(self) -> str:
        """Return the name shared by the instance directory and metadata."""
        return f'raw_{self._number_of_members}_' \
               f'{self._number_of_properties}_{self._value_size}'

    def _generate_dataframe(self, member_offset: int = 1,
                            property_offset: int = 1) -> DataFrame:
        """Generate raw data.

        Parameters
        ----------
        member_offset : int
            First value of the 'id' column. Default 1 (no offset).
        property_offset : int
            First value of the per-row counter embedded in each generated
            value (the ``{i}`` in ``V_{j}-{i}``). Default 1 (no offset).

        Returns
        -------
        dataframe : DataFrame
            Panda's DataFrame with an 'id' column and one 'p{j}' column per
            property.
        """
        subject_id = range(member_offset,
                           self._number_of_members + member_offset)
        value_id = range(property_offset,
                         self._number_of_members + property_offset)
        data: dict = {'id': subject_id}

        # The padding is identical for every value, build it only once.
        # Modulo wraps around the ASCII letters for large value sizes.
        append_value = ''
        if self._value_size > 0:
            letters = string.ascii_letters
            append_value = '_' + ''.join(letters[n % len(letters)]
                                         for n in range(self._value_size))

        for j in range(1, self._number_of_properties + 1):
            # Generate value V_{property}-{member} honoring the value size
            data[f'p{j}'] = [f'V_{j}-{i}{append_value}' for i in value_id]

        return DataFrame(data)

    def _generate_mapping(self) -> Graph:
        """Generate a [R2]RML mapping for a RawData instance.

        Returns
        -------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.

        Raises
        ------
        NotImplementedError
            If the data format is neither 'postgresql' nor 'csv'.
        """
        mapping: Graph = Graph(base='http://ex.com/')
        mapping.bind('rr', R2RML)
        mapping.bind('rml', RML)
        mapping.bind('ql', QL)
        mapping.bind('ex', EX)
        subject_template = Literal('http://ex.com/table/{id}')

        if self._data_format == 'postgresql':
            triples_map_iri = self._add_triples_map_table(mapping,
                                                          subject_template,
                                                          Literal('data'))
        elif self._data_format == 'csv':
            triples_map_iri = \
                self._add_triples_map_source(mapping, subject_template,
                                             Literal('/data/shared/'
                                                     f'{DATA_FILE}'))
        else:
            raise NotImplementedError(f'{self._data_format} not implemented')

        # One predicate object map per generated property column
        for i in range(1, self._number_of_properties + 1):
            self._add_predicate_object_map(mapping, triples_map_iri,
                                           EX[f'p{i}'], Literal(f'p{i}'))

        return mapping

    def _write_raw_data_files(self, mapping_file: str) -> bool:
        """Write the data file, the mapping and the scenario metadata.

        Parameters
        ----------
        mapping_file : str
            File name of the mapping inside the instance's shared directory.

        Returns
        -------
        success : bool
            Always True, failures surface as exceptions.
        """
        # Resolve the instance path once: every path() call logs and
        # re-creates the directory.
        shared_path = os.path.join(self.path(), 'data', 'shared')
        os.makedirs(shared_path, exist_ok=True)

        data_path = os.path.join(shared_path, DATA_FILE)
        self._generate_dataframe().to_csv(data_path, index=False)

        mapping_path = os.path.join(shared_path, mapping_file)
        mapping: Graph = self._generate_mapping()
        mapping.serialize(destination=mapping_path, format='turtle')
        # NOTE(review): the result of _generate_scenario() is not propagated,
        # matching the previous behaviour - confirm whether it should be.
        self._generate_scenario()

        return True

    def _generate_csv(self) -> bool:
        """Generate the instance as CSV files.

        Returns
        -------
        success : bool
            True if successful
        """
        return self._write_raw_data_files(CSV_MAPPING_FILE)

    def _generate_postgresql(self) -> bool:
        """Generate the instance as PostgreSQL with CSV files to load.

        Returns
        -------
        success : bool
            True if successful
        """
        return self._write_raw_data_files(RDB_MAPPING_FILE)

    def _generate_scenario(self) -> bool:
        """Generate the metadata for this scenario.

        Configures the execution pipeline automatically.

        Returns
        -------
        success : bool
            Result of the metadata generation.

        Raises
        ------
        NotImplementedError
            If the data format is neither 'postgresql' nor 'csv'.
        """
        name: str = self._instance_key()
        description: str = f'Raw Data Values {self._number_of_members} ' \
                           f'members, {self._number_of_properties} ' \
                           f'properties, and {self._value_size} value size'
        iri: str = f'http://example.org/raw/{self._number_of_members}/' \
                   f'{self._number_of_properties}/{self._value_size}'

        if self._data_format == 'postgresql':
            return self._generate_metadata(iri, name, description,
                                           RDB_MAPPING_FILE)
        elif self._data_format == 'csv':
            return self._generate_metadata(iri, name, description,
                                           CSV_MAPPING_FILE)
        else:
            raise NotImplementedError(f'{self._data_format} not implemented')
Helper class that provides a standard way to create an ABC using inheritance.
RawData( main_directory: str, verbose: bool, number_of_members: int, number_of_properties: int, value_size: int, data_format: str, engine: str)
def __init__(
    self,
    main_directory: str,
    verbose: bool,
    number_of_members: int,
    number_of_properties: int,
    value_size: int,
    data_format: str,
    engine: str,
):
    """Set up a Raw Data scenario.

    Parameters
    ----------
    main_directory : str
        Root directory below which Raw Data instances are generated.
    verbose : bool
        Whether verbose logging is enabled.
    number_of_members : int
        How many members to generate, e.g. 5000 yields 5K rows in a
        tabular data structure.
    number_of_properties : int
        How many properties each member gets, e.g. 20 yields 20 columns
        in a tabular data structure.
    value_size : int
        Number of characters added to every generated value.
    data_format : str
        Data format of the generated data set, for example:
        "csv", "json", "xml", "postgresql", "mysql"
    engine : str
        Engine that executes the generated instance, for example:
        "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
        or "OntopMaterialize"
    """
    # Scenario specific sizing parameters
    self._number_of_members: int = number_of_members
    self._number_of_properties: int = number_of_properties
    self._value_size: int = value_size

    # Shared scenario state is initialized by the base class
    super().__init__(data_format, engine, main_directory, verbose)

    self._logger = Logger(__name__, self._main_directory, self._verbose)
Initialize a Raw Data scenario.
Parameters
- main_directory (str): Root directory for generating instances of Raw Data.
- verbose (bool): Verbose logging enabled or not.
- number_of_members (int): Number of members to generate, for example 5000 for 5K rows in a tabular data structure.
- number_of_properties (int): Number of properties per member to generate, for example 20 for 20 columns in a tabular data structure.
- value_size (int): Number of ASCII letters appended (after an underscore) to every generated value, for example: 256 will append 256 letters to each value.
- data_format (str): Data format to use for generating the data set, for example: "csv", "json", "xml", "postgresql", "mysql"
- engine (str): Engine to use for execution of the generated scenario's instance, for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC", or "OntopMaterialize"
def
generate(self) -> bool:
def generate(self) -> bool:
    """Generate the instance using the Raw Data scenario.

    Dispatches on the configured data format: 'csv' and 'postgresql'
    are handled, every other format is rejected.

    Returns
    -------
    success : bool
        Whatever the format specific generator returns.

    Raises
    ------
    NotImplementedError
        If the data format is neither 'csv' nor 'postgresql'.
    """
    data_format = self._data_format

    if data_format == 'csv':
        return self._generate_csv()

    if data_format == 'postgresql':
        return self._generate_postgresql()

    message = f'Data format {self._data_format} ' \
              f'is not implemented by {__name__}'
    raise NotImplementedError(message)
Generate the instance using the Raw Data scenario.
Only the CSV and PostgreSQL data formats are currently implemented; any other data format raises NotImplementedError.
def
path(self) -> str:
def path(self) -> str:
    """Return the directory of this Raw Data instance, creating it.

    The directory is located at
    <main directory>/<engine>/<data format>/raw_<members>_<properties>_<size>.
    It is logged at debug level and created if it does not exist yet.

    Returns
    -------
    path : str
        File path for the Raw Data's instance.
    """
    instance_name = (f'raw_{self._number_of_members}_'
                     f'{self._number_of_properties}_{self._value_size}')
    parts = (self._main_directory, self._engine, self._data_format,
             instance_name)
    instance_dir = os.path.join(*parts)

    self._logger.debug(f'Generating to {instance_dir}')
    os.makedirs(instance_dir, exist_ok=True)

    return instance_dir
Builds the file path for the instance of a Raw Data scenario.
Returns
- path (str): File path for the Raw Data's instance.