bench_generator.empty_values

This module holds the EmptyValues class which scales the number of empty values in a data set with a fixed data size.

  1#!/usr/bin/env python3
  2
  3"""
  4This module holds the EmptyValues class which scales the number of empty values
  5in a data set with a fixed data size.
  6"""
  7
  8import os
  9import string
 10import random
 11from pandas import DataFrame
 12from rdflib.namespace import RDF
 13from rdflib import Graph, URIRef, BNode, Literal, Namespace
 14from bench_generator.scenario import Scenario
 15from bench_generator.logger import Logger
 16
 17DATA_FILE = 'data.csv'
 18RDB_MAPPING_FILE = 'mapping.r2rml.ttl'
 19CSV_MAPPING_FILE = 'mapping.rml.ttl'
 20R2RML = Namespace('http://www.w3.org/ns/r2rml#')
 21RML = Namespace('http://semweb.mmlab.be/ns/rml#')
 22QL = Namespace('http://semweb.mmlab.be/ns/ql#')
 23EX = Namespace('http://example.com/')
 24
 25
 26class EmptyValues(Scenario):
 27    def __init__(self, main_directory: str, verbose: bool,
 28                 percentage: float, data_format: str, engine: str,
 29                 seed: int = 0, number_of_members: int = 100000,
 30                 number_of_properties: int = 20, value_size: int = 0):
 31        """Initialize a EmptyValues scenario.
 32
 33        Parameters
 34        ----------
 35        main_directory : str
 36            Root directory for generating instances of EmptyValues.
 37        verbose : bool
 38            Verbose logging enabled or not.
 39        percentage : float
 40            Percentage empty values to generate, for example 50% results into
 41            a dataset with 50% the same data values.
 42        data_format : str
 43            Data format to use for generating the data set, for example:
 44            "csv", "json", "xml", "postgresql", "mysql"
 45        engine : str
 46            Engine to use for execution of the generated scenario's instance,
 47            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
 48            or "OntopMaterialize"
 49        seed : int
 50            Random seed to use, default 0.
 51        number_of_members : int
 52            Number of members to generate, for example 5000 for 5K rows in a
 53            tabular data structure. Default 100K members.
 54        number_of_properties : int
 55            Number of properties per member to generate, for example 20 for
 56            20 columns in a tabular data structure. Default 20 properties.
 57        value_size : int
 58            Number of characters to add to default value generation,
 59            for example: 256 will expand all values to 256 characters.
 60            Default 0 added characters.
 61        """
 62        self._percentage: float = percentage
 63        self._number_of_members = number_of_members
 64        self._number_of_properties = number_of_properties
 65        self._value_size = value_size
 66        random.seed(seed)
 67
 68        super().__init__(data_format, engine, main_directory, verbose)
 69        if self._data_format != 'csv':
 70            raise NotImplementedError(f'Data format {self._data_format} '
 71                                      f'is not implemented by {__name__}')
 72
 73        self._logger = Logger(__name__, self._main_directory, self._verbose)
 74
 75    def generate(self) -> bool:
 76        """Generate the instance using the Duplciates scenario.
 77
 78        Only CSV files are currently implemented!
 79        """
 80        if self._data_format == 'csv':
 81            return self._generate_csv()
 82        elif self._data_format == 'postgresql':
 83            return self._generate_postgresql()
 84        else:
 85            raise NotImplementedError(f'Data format {self._data_format} '
 86                                      f'is not implemented by {__name__}')
 87
 88    def path(self) -> str:
 89        """Builds the file path for the instance of a EmptyValues scenario.
 90
 91        Returns
 92        -------
 93        path : str
 94            File path for the EmptyValues' instance.
 95        """
 96        key = f'empty_{self._percentage}_percentage'
 97        path = os.path.join(self._main_directory, self._engine,
 98                            self._data_format, key)
 99        self._logger.debug(f'Generating to {path}')
100        os.makedirs(path, exist_ok=True)
101        return path
102
103    def _generate_dataframe(self, member_offset: int = 1,
104                            property_offset: int = 1) -> DataFrame:
105        """Generate empty values data.
106
107        Parameters
108        ----------
109        member_offset : int
110            Offset to start member ID generation from. Default 1 (no offset).
111        property_offset : int
112            Offset to start property ID generation from. Default 1 (no offset).
113
114        Returns
115        -------
116        dataframe : DataFrame
117            Panda's DataFrame with generated data.
118        """
119        subject_id = range(member_offset,
120                           self._number_of_members + member_offset)
121        value_id = range(property_offset,
122                         self._number_of_members + property_offset)
123        data: dict = {'id': subject_id}
124        n_ascii = len(string.ascii_letters)
125
126        for j in range(1, self._number_of_properties + 1):
127            # Append ASCII characters if necessary, use modulo to avoid out of
128            # range in ASCII table
129            append_value = ''
130            if self._value_size > 0:
131                append_value = '_'
132            for n in range(self._value_size):
133                append_value += string.ascii_letters[n % n_ascii]
134
135            # Generate value V_{property}_{member} honoring the value size
136            value = [f'V_{j}-{i}{append_value}' for i in value_id]
137            data[f'p{j}'] = value
138
139        return DataFrame(data)
140
141    def _update_dataframe(self, dataframe: DataFrame):
142        """
143        Sample a percentage of the dataframe to fill with the same value.
144
145        Parameters
146        ----------
147        dataframe : DataFrame
148            The dataframe to update.
149
150        Returns
151        -------
152        dataframe : DataFrame
153            The updated dataframe.
154        """
155        percentage_members: float = self._number_of_members * \
156            (self._percentage / 100.0)
157        sample = dataframe.iloc[random.sample(list(dataframe.index),
158                                              int(percentage_members))]
159        for i in list(sample.index):
160            for j in range(1, self._number_of_properties + 1):
161                dataframe.loc[i, f'p{j}'] = 'NULL'
162
163        return dataframe
164
165    def _generate_mapping(self) -> Graph:
166        """Generate a [R2]RML mapping for a EmptyValues instance.
167
168        Returns
169        -------
170        mapping : Graph
171            [R2]RML mapping as an RDFLib Graph.
172        """
173        mapping: Graph = Graph(base='http://ex.com/')
174        mapping.bind('rr', R2RML)
175        mapping.bind('ql', QL)
176        mapping.bind('ex', EX)
177        subject_template = Literal('http://ex.com/table/{id}')
178
179        if self._data_format == 'postgresql':
180            triples_map_iri = self._add_triples_map_table(mapping,
181                                                          subject_template,
182                                                          Literal('data'))
183        elif self._data_format == 'csv':
184            triples_map_iri = \
185                self._add_triples_map_source(mapping, subject_template,
186                                             Literal('/data/shared/data.csv'))
187        else:
188            raise NotImplementedError(f'{self._data_format} not implemented')
189
190        for i in range(1, self._number_of_properties + 1):
191            self._add_predicate_object_map(mapping, triples_map_iri,
192                                           EX[f'p{i}'], Literal(f'p{i}'))
193
194        return mapping
195
196    def _generate_csv(self) -> bool:
197        """Generate the instance as CSV files.
198
199        Returns
200        -------
201        success : bool
202            True if successfull, false otherwise
203        """
204        os.makedirs(os.path.join(self.path(), 'data', 'shared'), exist_ok=True)
205        data_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE)
206        dataframe = self._generate_dataframe()
207        dataframe = self._update_dataframe(dataframe)
208        dataframe.to_csv(data_path, index=False)
209
210        mapping_path = os.path.join(self.path(), 'data', 'shared',
211                                    CSV_MAPPING_FILE)
212        mapping: Graph = self._generate_mapping()
213        mapping.serialize(destination=mapping_path, format='turtle')
214        self._generate_scenario()
215
216        return True
217
218    def _generate_postgresql(self) -> bool:
219        """Generate the instance as PostgreSQL with CSV files to load.
220
221        Returns
222        -------
223        success : bool
224            True if successfull, false otherwise
225        """
226        os.makedirs(os.path.join(self.path(), 'data', 'shared'), exist_ok=True)
227        data_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE)
228        dataframe = self._generate_dataframe()
229        dataframe = self._update_dataframe(dataframe)
230        dataframe.to_csv(data_path, index=False)
231
232        mapping_path = os.path.join(self.path(), 'data', 'shared',
233                                    RDB_MAPPING_FILE)
234        mapping: Graph = self._generate_mapping()
235        mapping.serialize(destination=mapping_path, format='turtle')
236        self._generate_scenario()
237
238        return True
239
240    def _generate_scenario(self) -> bool:
241        """Generate the metadata for this scenario.
242
243        Configures the execution pipeline automatically.
244
245        Returns
246        -------
247        success : bool
248            True if successfull, false otherwise
249        """
250        name: str = f'empty_{self._percentage}'
251        description: str = f'Empty Values {self._percentage}'
252        iri: str = f'http://example.org/empty/{self._percentage}/'
253
254        if self._data_format == 'postgresql':
255            return self._generate_metadata(iri, name, description,
256                                           RDB_MAPPING_FILE)
257        elif self._data_format == 'csv':
258            return self._generate_metadata(iri, name, description,
259                                           CSV_MAPPING_FILE)
260        else:
261            raise NotImplementedError(f'{self._data_format} not implemented')
262
263        return False
DATA_FILE = 'data.csv'
RDB_MAPPING_FILE = 'mapping.r2rml.ttl'
CSV_MAPPING_FILE = 'mapping.rml.ttl'
R2RML = Namespace('http://www.w3.org/ns/r2rml#')
RML = Namespace('http://semweb.mmlab.be/ns/rml#')
QL = Namespace('http://semweb.mmlab.be/ns/ql#')
EX = Namespace('http://example.com/')
class EmptyValues(bench_generator.scenario.Scenario):
 27class EmptyValues(Scenario):
 28    def __init__(self, main_directory: str, verbose: bool,
 29                 percentage: float, data_format: str, engine: str,
 30                 seed: int = 0, number_of_members: int = 100000,
 31                 number_of_properties: int = 20, value_size: int = 0):
 32        """Initialize a EmptyValues scenario.
 33
 34        Parameters
 35        ----------
 36        main_directory : str
 37            Root directory for generating instances of EmptyValues.
 38        verbose : bool
 39            Verbose logging enabled or not.
 40        percentage : float
 41            Percentage empty values to generate, for example 50% results into
 42            a dataset with 50% the same data values.
 43        data_format : str
 44            Data format to use for generating the data set, for example:
 45            "csv", "json", "xml", "postgresql", "mysql"
 46        engine : str
 47            Engine to use for execution of the generated scenario's instance,
 48            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
 49            or "OntopMaterialize"
 50        seed : int
 51            Random seed to use, default 0.
 52        number_of_members : int
 53            Number of members to generate, for example 5000 for 5K rows in a
 54            tabular data structure. Default 100K members.
 55        number_of_properties : int
 56            Number of properties per member to generate, for example 20 for
 57            20 columns in a tabular data structure. Default 20 properties.
 58        value_size : int
 59            Number of characters to add to default value generation,
 60            for example: 256 will expand all values to 256 characters.
 61            Default 0 added characters.
 62        """
 63        self._percentage: float = percentage
 64        self._number_of_members = number_of_members
 65        self._number_of_properties = number_of_properties
 66        self._value_size = value_size
 67        random.seed(seed)
 68
 69        super().__init__(data_format, engine, main_directory, verbose)
 70        if self._data_format != 'csv':
 71            raise NotImplementedError(f'Data format {self._data_format} '
 72                                      f'is not implemented by {__name__}')
 73
 74        self._logger = Logger(__name__, self._main_directory, self._verbose)
 75
 76    def generate(self) -> bool:
 77        """Generate the instance using the Duplciates scenario.
 78
 79        Only CSV files are currently implemented!
 80        """
 81        if self._data_format == 'csv':
 82            return self._generate_csv()
 83        elif self._data_format == 'postgresql':
 84            return self._generate_postgresql()
 85        else:
 86            raise NotImplementedError(f'Data format {self._data_format} '
 87                                      f'is not implemented by {__name__}')
 88
 89    def path(self) -> str:
 90        """Builds the file path for the instance of a EmptyValues scenario.
 91
 92        Returns
 93        -------
 94        path : str
 95            File path for the EmptyValues' instance.
 96        """
 97        key = f'empty_{self._percentage}_percentage'
 98        path = os.path.join(self._main_directory, self._engine,
 99                            self._data_format, key)
100        self._logger.debug(f'Generating to {path}')
101        os.makedirs(path, exist_ok=True)
102        return path
103
104    def _generate_dataframe(self, member_offset: int = 1,
105                            property_offset: int = 1) -> DataFrame:
106        """Generate empty values data.
107
108        Parameters
109        ----------
110        member_offset : int
111            Offset to start member ID generation from. Default 1 (no offset).
112        property_offset : int
113            Offset to start property ID generation from. Default 1 (no offset).
114
115        Returns
116        -------
117        dataframe : DataFrame
118            Panda's DataFrame with generated data.
119        """
120        subject_id = range(member_offset,
121                           self._number_of_members + member_offset)
122        value_id = range(property_offset,
123                         self._number_of_members + property_offset)
124        data: dict = {'id': subject_id}
125        n_ascii = len(string.ascii_letters)
126
127        for j in range(1, self._number_of_properties + 1):
128            # Append ASCII characters if necessary, use modulo to avoid out of
129            # range in ASCII table
130            append_value = ''
131            if self._value_size > 0:
132                append_value = '_'
133            for n in range(self._value_size):
134                append_value += string.ascii_letters[n % n_ascii]
135
136            # Generate value V_{property}_{member} honoring the value size
137            value = [f'V_{j}-{i}{append_value}' for i in value_id]
138            data[f'p{j}'] = value
139
140        return DataFrame(data)
141
142    def _update_dataframe(self, dataframe: DataFrame):
143        """
144        Sample a percentage of the dataframe to fill with the same value.
145
146        Parameters
147        ----------
148        dataframe : DataFrame
149            The dataframe to update.
150
151        Returns
152        -------
153        dataframe : DataFrame
154            The updated dataframe.
155        """
156        percentage_members: float = self._number_of_members * \
157            (self._percentage / 100.0)
158        sample = dataframe.iloc[random.sample(list(dataframe.index),
159                                              int(percentage_members))]
160        for i in list(sample.index):
161            for j in range(1, self._number_of_properties + 1):
162                dataframe.loc[i, f'p{j}'] = 'NULL'
163
164        return dataframe
165
166    def _generate_mapping(self) -> Graph:
167        """Generate a [R2]RML mapping for a EmptyValues instance.
168
169        Returns
170        -------
171        mapping : Graph
172            [R2]RML mapping as an RDFLib Graph.
173        """
174        mapping: Graph = Graph(base='http://ex.com/')
175        mapping.bind('rr', R2RML)
176        mapping.bind('ql', QL)
177        mapping.bind('ex', EX)
178        subject_template = Literal('http://ex.com/table/{id}')
179
180        if self._data_format == 'postgresql':
181            triples_map_iri = self._add_triples_map_table(mapping,
182                                                          subject_template,
183                                                          Literal('data'))
184        elif self._data_format == 'csv':
185            triples_map_iri = \
186                self._add_triples_map_source(mapping, subject_template,
187                                             Literal('/data/shared/data.csv'))
188        else:
189            raise NotImplementedError(f'{self._data_format} not implemented')
190
191        for i in range(1, self._number_of_properties + 1):
192            self._add_predicate_object_map(mapping, triples_map_iri,
193                                           EX[f'p{i}'], Literal(f'p{i}'))
194
195        return mapping
196
197    def _generate_csv(self) -> bool:
198        """Generate the instance as CSV files.
199
200        Returns
201        -------
202        success : bool
203            True if successfull, false otherwise
204        """
205        os.makedirs(os.path.join(self.path(), 'data', 'shared'), exist_ok=True)
206        data_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE)
207        dataframe = self._generate_dataframe()
208        dataframe = self._update_dataframe(dataframe)
209        dataframe.to_csv(data_path, index=False)
210
211        mapping_path = os.path.join(self.path(), 'data', 'shared',
212                                    CSV_MAPPING_FILE)
213        mapping: Graph = self._generate_mapping()
214        mapping.serialize(destination=mapping_path, format='turtle')
215        self._generate_scenario()
216
217        return True
218
219    def _generate_postgresql(self) -> bool:
220        """Generate the instance as PostgreSQL with CSV files to load.
221
222        Returns
223        -------
224        success : bool
225            True if successfull, false otherwise
226        """
227        os.makedirs(os.path.join(self.path(), 'data', 'shared'), exist_ok=True)
228        data_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE)
229        dataframe = self._generate_dataframe()
230        dataframe = self._update_dataframe(dataframe)
231        dataframe.to_csv(data_path, index=False)
232
233        mapping_path = os.path.join(self.path(), 'data', 'shared',
234                                    RDB_MAPPING_FILE)
235        mapping: Graph = self._generate_mapping()
236        mapping.serialize(destination=mapping_path, format='turtle')
237        self._generate_scenario()
238
239        return True
240
241    def _generate_scenario(self) -> bool:
242        """Generate the metadata for this scenario.
243
244        Configures the execution pipeline automatically.
245
246        Returns
247        -------
248        success : bool
249            True if successfull, false otherwise
250        """
251        name: str = f'empty_{self._percentage}'
252        description: str = f'Empty Values {self._percentage}'
253        iri: str = f'http://example.org/empty/{self._percentage}/'
254
255        if self._data_format == 'postgresql':
256            return self._generate_metadata(iri, name, description,
257                                           RDB_MAPPING_FILE)
258        elif self._data_format == 'csv':
259            return self._generate_metadata(iri, name, description,
260                                           CSV_MAPPING_FILE)
261        else:
262            raise NotImplementedError(f'{self._data_format} not implemented')
263
264        return False

Helper class that provides a standard way to create an ABC using inheritance.

EmptyValues( main_directory: str, verbose: bool, percentage: float, data_format: str, engine: str, seed: int = 0, number_of_members: int = 100000, number_of_properties: int = 20, value_size: int = 0)
28    def __init__(self, main_directory: str, verbose: bool,
29                 percentage: float, data_format: str, engine: str,
30                 seed: int = 0, number_of_members: int = 100000,
31                 number_of_properties: int = 20, value_size: int = 0):
32        """Initialize a EmptyValues scenario.
33
34        Parameters
35        ----------
36        main_directory : str
37            Root directory for generating instances of EmptyValues.
38        verbose : bool
39            Verbose logging enabled or not.
40        percentage : float
41            Percentage empty values to generate, for example 50% results into
42            a dataset with 50% the same data values.
43        data_format : str
44            Data format to use for generating the data set, for example:
45            "csv", "json", "xml", "postgresql", "mysql"
46        engine : str
47            Engine to use for execution of the generated scenario's instance,
48            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
49            or "OntopMaterialize"
50        seed : int
51            Random seed to use, default 0.
52        number_of_members : int
53            Number of members to generate, for example 5000 for 5K rows in a
54            tabular data structure. Default 100K members.
55        number_of_properties : int
56            Number of properties per member to generate, for example 20 for
57            20 columns in a tabular data structure. Default 20 properties.
58        value_size : int
59            Number of characters to add to default value generation,
60            for example: 256 will expand all values to 256 characters.
61            Default 0 added characters.
62        """
63        self._percentage: float = percentage
64        self._number_of_members = number_of_members
65        self._number_of_properties = number_of_properties
66        self._value_size = value_size
67        random.seed(seed)
68
69        super().__init__(data_format, engine, main_directory, verbose)
70        if self._data_format != 'csv':
71            raise NotImplementedError(f'Data format {self._data_format} '
72                                      f'is not implemented by {__name__}')
73
74        self._logger = Logger(__name__, self._main_directory, self._verbose)

Initialize an EmptyValues scenario.

Parameters
  • main_directory (str): Root directory for generating instances of EmptyValues.
  • verbose (bool): Verbose logging enabled or not.
  • percentage (float): Percentage of empty values to generate, for example 50% results in a dataset in which the values of 50% of the members are empty.
  • data_format (str): Data format to use for generating the data set, for example: "csv", "json", "xml", "postgresql", "mysql"
  • engine (str): Engine to use for execution of the generated scenario's instance, for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC", or "OntopMaterialize"
  • seed (int): Random seed to use, default 0.
  • number_of_members (int): Number of members to generate, for example 5000 for 5K rows in a tabular data structure. Default 100K members.
  • number_of_properties (int): Number of properties per member to generate, for example 20 for 20 columns in a tabular data structure. Default 20 properties.
  • value_size (int): Number of characters to add to default value generation, for example: 256 will expand all values to 256 characters. Default 0 added characters.
def generate(self) -> bool:
76    def generate(self) -> bool:
77        """Generate the instance using the Duplciates scenario.
78
79        Only CSV files are currently implemented!
80        """
81        if self._data_format == 'csv':
82            return self._generate_csv()
83        elif self._data_format == 'postgresql':
84            return self._generate_postgresql()
85        else:
86            raise NotImplementedError(f'Data format {self._data_format} '
87                                      f'is not implemented by {__name__}')

Generate the instance using the EmptyValues scenario.

Only CSV files are currently implemented!

def path(self) -> str:
 89    def path(self) -> str:
 90        """Builds the file path for the instance of a EmptyValues scenario.
 91
 92        Returns
 93        -------
 94        path : str
 95            File path for the EmptyValues' instance.
 96        """
 97        key = f'empty_{self._percentage}_percentage'
 98        path = os.path.join(self._main_directory, self._engine,
 99                            self._data_format, key)
100        self._logger.debug(f'Generating to {path}')
101        os.makedirs(path, exist_ok=True)
102        return path

Builds the file path for the instance of an EmptyValues scenario.

Returns
  • path (str): File path for the EmptyValues' instance.