bench_generator.duplicates

This module holds the Duplicates class which scales the number of duplicates in a data set with a fixed data size.

  1#!/usr/bin/env python3
  2
  3"""
  4This module holds the Duplicates class which scales the number of duplicates
  5in a data set with a fixed data size.
  6"""
  7
  8import os
  9import string
 10import random
 11import numpy
 12from pandas import DataFrame
 13from rdflib.namespace import RDF
 14from rdflib import Graph, URIRef, BNode, Literal, Namespace
 15from bench_generator.scenario import Scenario
 16from bench_generator.logger import Logger
 17
 18DATA_FILE = 'data.csv'  # File name of the CSV data written to data/shared.
 19RDB_MAPPING_FILE = 'mapping.r2rml.ttl'  # Mapping file name for 'postgresql'.
 20CSV_MAPPING_FILE = 'mapping.rml.ttl'  # Mapping file name for 'csv'.
 21R2RML = Namespace('http://www.w3.org/ns/r2rml#')  # Bound to prefix 'rr'.
 22RML = Namespace('http://semweb.mmlab.be/ns/rml#')  # Not referenced in the code shown here.
 23QL = Namespace('http://semweb.mmlab.be/ns/ql#')  # Bound to prefix 'ql'.
 24EX = Namespace('http://example.com/')  # Bound to prefix 'ex'; base of the p<N> predicates.
 25
 26
 27class Duplicates(Scenario):
 28    def __init__(self, main_directory: str, verbose: bool,
 29                 percentage: float, data_format: str, engine: str,
 30                 seed: int = 0, number_of_members: int = 100000,
 31                 number_of_properties: int = 20, value_size: int = 0):
 32        """Initialize a Duplicates scenario.
 33
 34        Parameters
 35        ----------
 36        main_directory : str
 37            Root directory for generating instances of Duplicates.
 38        verbose : bool
 39            Verbose logging enabled or not.
 40        percentage : float
 41            Percentage duplicates to generate, for example 50% results into
 42            a dataset with 50% the same data values.
 43        data_format : str
 44            Data format to use for generating the data set, for example:
 45            "csv", "json", "xml", "postgresql", "mysql"
 46        engine : str
 47            Engine to use for execution of the generated scenario's instance,
 48            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
 49            or "OntopMaterialize"
 50        seed : int
 51            Random seed to use, default 0.
 52        number_of_members : int
 53            Number of members to generate, for example 5000 for 5K rows in a
 54            tabular data structure. Default 100K members.
 55        number_of_properties : int
 56            Number of properties per member to generate, for example 20 for
 57            20 columns in a tabular data structure. Default 20 properties.
 58        value_size : int
 59            Number of characters to add to default value generation,
 60            for example: 256 will expand all values to 256 characters.
 61            Default 0 added characters.
 62        """
 63        self._percentage: float = percentage
 64        self._number_of_members = number_of_members
 65        self._number_of_properties = number_of_properties
 66        self._value_size = value_size
 67        random.seed(seed)
 68
 69        super().__init__(data_format, engine, main_directory, verbose)
 70
 71        if self._data_format != 'csv':
 72            raise NotImplementedError(f'Data format {self._data_format} '
 73                                      f'is not implemented by {__name__}')
 74
 75        self._logger = Logger(__name__, self._main_directory, self._verbose)
 76
 77    def generate(self) -> bool:
 78        """Generate the instance using the Duplciates scenario.
 79
 80        Only CSV files are currently implemented!
 81        """
 82        if self._data_format == 'csv':
 83            return self._generate_csv()
 84        elif self._data_format == 'postgresql':
 85            return self._generate_postgresql()
 86        else:
 87            raise NotImplementedError(f'Data format {self._data_format} '
 88                                      f'is not implemented by {__name__}')
 89
 90    def path(self) -> str:
 91        """Builds the file path for the instance of a Duplicates scenario.
 92
 93        Returns
 94        -------
 95        path : str
 96            File path for the Duplicates' instance.
 97        """
 98        key = f'duplicates_{self._percentage}_percentage'
 99        path = os.path.join(self._main_directory, self._engine,
100                            self._data_format, key)
101        self._logger.debug(f'Generating to {path}')
102        os.makedirs(path, exist_ok=True)
103        return path
104
105    def _generate_dataframe(self, member_offset: int = 1,
106                            property_offset: int = 1) -> DataFrame:
107        """Generate duplicates data.
108
109        Parameters
110        ----------
111        member_offset : int
112            Offset to start member ID generation from. Default 1 (no offset).
113        property_offset : int
114            Offset to start property ID generation from. Default 1 (no offset).
115
116        Returns
117        -------
118        dataframe : DataFrame
119            Panda's DataFrame with generated data.
120        """
121        subject_id = range(member_offset,
122                           self._number_of_members + member_offset)
123        value_id = range(property_offset,
124                         self._number_of_members + property_offset)
125        data: dict = {'id': subject_id}
126        n_ascii = len(string.ascii_letters)
127
128        for j in range(1, self._number_of_properties + 1):
129            # Append ASCII characters if necessary, use modulo to avoid out of
130            # range in ASCII table
131            append_value = ''
132            if self._value_size > 0:
133                append_value = '_'
134            for n in range(self._value_size):
135                append_value += string.ascii_letters[n % n_ascii]
136
137            # Generate value V_{property}_{member} honoring the value size
138            value = [f'V_{j}-{i}{append_value}' for i in value_id]
139            data[f'p{j}'] = value
140
141        return DataFrame(data)
142
143    def _update_dataframe(self, dataframe: DataFrame):
144        """
145        Sample a percentage of the dataframe to fill with the same value.
146
147        Parameters
148        ----------
149        dataframe : DataFrame
150            The dataframe to update.
151
152        Returns
153        -------
154        dataframe : DataFrame
155            The updated dataframe.
156        """
157        percentage_members: float = self._number_of_members * \
158            (self._percentage / 100.0)
159        sample = dataframe.iloc[random.sample(list(dataframe.index),
160                                              int(percentage_members))]
161        for i in list(sample.index):
162            for j in range(1, self._number_of_properties + 1):
163                dataframe.loc[i, f'p{j}'] = 'DUPLICATE'
164            dataframe.loc[i, 'id'] = numpy.iinfo(numpy.int64).max
165
166        return dataframe
167
168    def _generate_mapping(self) -> Graph:
169        """Generate a [R2]RML mapping for a Duplicates instance.
170
171        Returns
172        -------
173        mapping : Graph
174            [R2]RML mapping as an RDFLib Graph.
175        """
176        mapping: Graph = Graph(base='http://ex.com/')
177        mapping.bind('rr', R2RML)
178        mapping.bind('ql', QL)
179        mapping.bind('ex', EX)
180        subject_template = Literal('http://ex.com/table/{id}')
181
182        if self._data_format == 'postgresql':
183            triples_map_iri = self._add_triples_map_table(mapping,
184                                                          subject_template,
185                                                          Literal('data'))
186        elif self._data_format == 'csv':
187            triples_map_iri = \
188                self._add_triples_map_source(mapping, subject_template,
189                                             Literal('/data/shared/data.csv'))
190        else:
191            raise NotImplementedError(f'{self._data_format} not implemented')
192
193        for i in range(1, self._number_of_properties + 1):
194            self._add_predicate_object_map(mapping, triples_map_iri,
195                                           EX[f'p{i}'], Literal(f'p{i}'))
196
197        return mapping
198
199    def _generate_csv(self) -> bool:
200        """Generate the instance as CSV files.
201
202        Returns
203        -------
204        success : bool
205            True if successfull, false otherwise
206        """
207        os.makedirs(os.path.join(self.path(), 'data', 'shared'), exist_ok=True)
208        data_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE)
209        dataframe = self._generate_dataframe()
210        dataframe = self._update_dataframe(dataframe)
211        dataframe.to_csv(data_path, index=False)
212
213        mapping_path = os.path.join(self.path(), 'data', 'shared',
214                                    CSV_MAPPING_FILE)
215        mapping: Graph = self._generate_mapping()
216        mapping.serialize(destination=mapping_path, format='turtle')
217        self._generate_scenario()
218
219        return True
220
221    def _generate_postgresql(self) -> bool:
222        """Generate the instance as PostgreSQL with CSV files to load.
223
224        Returns
225        -------
226        success : bool
227            True if successfull, false otherwise
228        """
229        os.makedirs(os.path.join(self.path(), 'data', 'shared'), exist_ok=True)
230        data_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE)
231        dataframe = self._generate_dataframe()
232        dataframe = self._update_dataframe(dataframe)
233        dataframe.to_csv(data_path, index=False)
234
235        mapping_path = os.path.join(self.path(), 'data', 'shared',
236                                    RDB_MAPPING_FILE)
237        mapping: Graph = self._generate_mapping()
238        mapping.serialize(destination=mapping_path, format='turtle')
239        self._generate_scenario()
240
241        return True
242
243    def _generate_scenario(self) -> bool:
244        """Generate the metadata for this scenario.
245
246        Configures the execution pipeline automatically.
247
248        Returns
249        -------
250        success : bool
251            True if successfull, false otherwise
252        """
253        name: str = f'duplicates_{self._percentage}'
254        description: str = f'Duplicates {self._percentage}'
255        iri: str = f'http://example.org/duplicates/{self._percentage}/'
256
257        if self._data_format == 'postgresql':
258            return self._generate_metadata(iri, name, description,
259                                           RDB_MAPPING_FILE)
260        elif self._data_format == 'csv':
261            return self._generate_metadata(iri, name, description,
262                                           CSV_MAPPING_FILE)
263        else:
264            raise NotImplementedError(f'{self._data_format} not implemented')
265
266        return True
# File names written into the 'data/shared' directory of an instance.
DATA_FILE = 'data.csv'
RDB_MAPPING_FILE = 'mapping.r2rml.ttl'  # used for the 'postgresql' format
CSV_MAPPING_FILE = 'mapping.rml.ttl'  # used for the 'csv' format
# RDF namespaces; R2RML, QL and EX are bound to the prefixes 'rr', 'ql' and
# 'ex' in the generated mapping. RML is not referenced in the code shown here.
R2RML = Namespace('http://www.w3.org/ns/r2rml#')
RML = Namespace('http://semweb.mmlab.be/ns/rml#')
QL = Namespace('http://semweb.mmlab.be/ns/ql#')
EX = Namespace('http://example.com/')
class Duplicates(bench_generator.scenario.Scenario):
 28class Duplicates(Scenario):
 29    def __init__(self, main_directory: str, verbose: bool,
 30                 percentage: float, data_format: str, engine: str,
 31                 seed: int = 0, number_of_members: int = 100000,
 32                 number_of_properties: int = 20, value_size: int = 0):
 33        """Initialize a Duplicates scenario.
 34
 35        Parameters
 36        ----------
 37        main_directory : str
 38            Root directory for generating instances of Duplicates.
 39        verbose : bool
 40            Verbose logging enabled or not.
 41        percentage : float
 42            Percentage duplicates to generate, for example 50% results into
 43            a dataset with 50% the same data values.
 44        data_format : str
 45            Data format to use for generating the data set, for example:
 46            "csv", "json", "xml", "postgresql", "mysql"
 47        engine : str
 48            Engine to use for execution of the generated scenario's instance,
 49            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
 50            or "OntopMaterialize"
 51        seed : int
 52            Random seed to use, default 0.
 53        number_of_members : int
 54            Number of members to generate, for example 5000 for 5K rows in a
 55            tabular data structure. Default 100K members.
 56        number_of_properties : int
 57            Number of properties per member to generate, for example 20 for
 58            20 columns in a tabular data structure. Default 20 properties.
 59        value_size : int
 60            Number of characters to add to default value generation,
 61            for example: 256 will expand all values to 256 characters.
 62            Default 0 added characters.
 63        """
 64        self._percentage: float = percentage
 65        self._number_of_members = number_of_members
 66        self._number_of_properties = number_of_properties
 67        self._value_size = value_size
 68        random.seed(seed)
 69
 70        super().__init__(data_format, engine, main_directory, verbose)
 71
 72        if self._data_format != 'csv':
 73            raise NotImplementedError(f'Data format {self._data_format} '
 74                                      f'is not implemented by {__name__}')
 75
 76        self._logger = Logger(__name__, self._main_directory, self._verbose)
 77
 78    def generate(self) -> bool:
 79        """Generate the instance using the Duplciates scenario.
 80
 81        Only CSV files are currently implemented!
 82        """
 83        if self._data_format == 'csv':
 84            return self._generate_csv()
 85        elif self._data_format == 'postgresql':
 86            return self._generate_postgresql()
 87        else:
 88            raise NotImplementedError(f'Data format {self._data_format} '
 89                                      f'is not implemented by {__name__}')
 90
 91    def path(self) -> str:
 92        """Builds the file path for the instance of a Duplicates scenario.
 93
 94        Returns
 95        -------
 96        path : str
 97            File path for the Duplicates' instance.
 98        """
 99        key = f'duplicates_{self._percentage}_percentage'
100        path = os.path.join(self._main_directory, self._engine,
101                            self._data_format, key)
102        self._logger.debug(f'Generating to {path}')
103        os.makedirs(path, exist_ok=True)
104        return path
105
106    def _generate_dataframe(self, member_offset: int = 1,
107                            property_offset: int = 1) -> DataFrame:
108        """Generate duplicates data.
109
110        Parameters
111        ----------
112        member_offset : int
113            Offset to start member ID generation from. Default 1 (no offset).
114        property_offset : int
115            Offset to start property ID generation from. Default 1 (no offset).
116
117        Returns
118        -------
119        dataframe : DataFrame
120            Panda's DataFrame with generated data.
121        """
122        subject_id = range(member_offset,
123                           self._number_of_members + member_offset)
124        value_id = range(property_offset,
125                         self._number_of_members + property_offset)
126        data: dict = {'id': subject_id}
127        n_ascii = len(string.ascii_letters)
128
129        for j in range(1, self._number_of_properties + 1):
130            # Append ASCII characters if necessary, use modulo to avoid out of
131            # range in ASCII table
132            append_value = ''
133            if self._value_size > 0:
134                append_value = '_'
135            for n in range(self._value_size):
136                append_value += string.ascii_letters[n % n_ascii]
137
138            # Generate value V_{property}_{member} honoring the value size
139            value = [f'V_{j}-{i}{append_value}' for i in value_id]
140            data[f'p{j}'] = value
141
142        return DataFrame(data)
143
144    def _update_dataframe(self, dataframe: DataFrame):
145        """
146        Sample a percentage of the dataframe to fill with the same value.
147
148        Parameters
149        ----------
150        dataframe : DataFrame
151            The dataframe to update.
152
153        Returns
154        -------
155        dataframe : DataFrame
156            The updated dataframe.
157        """
158        percentage_members: float = self._number_of_members * \
159            (self._percentage / 100.0)
160        sample = dataframe.iloc[random.sample(list(dataframe.index),
161                                              int(percentage_members))]
162        for i in list(sample.index):
163            for j in range(1, self._number_of_properties + 1):
164                dataframe.loc[i, f'p{j}'] = 'DUPLICATE'
165            dataframe.loc[i, 'id'] = numpy.iinfo(numpy.int64).max
166
167        return dataframe
168
169    def _generate_mapping(self) -> Graph:
170        """Generate a [R2]RML mapping for a Duplicates instance.
171
172        Returns
173        -------
174        mapping : Graph
175            [R2]RML mapping as an RDFLib Graph.
176        """
177        mapping: Graph = Graph(base='http://ex.com/')
178        mapping.bind('rr', R2RML)
179        mapping.bind('ql', QL)
180        mapping.bind('ex', EX)
181        subject_template = Literal('http://ex.com/table/{id}')
182
183        if self._data_format == 'postgresql':
184            triples_map_iri = self._add_triples_map_table(mapping,
185                                                          subject_template,
186                                                          Literal('data'))
187        elif self._data_format == 'csv':
188            triples_map_iri = \
189                self._add_triples_map_source(mapping, subject_template,
190                                             Literal('/data/shared/data.csv'))
191        else:
192            raise NotImplementedError(f'{self._data_format} not implemented')
193
194        for i in range(1, self._number_of_properties + 1):
195            self._add_predicate_object_map(mapping, triples_map_iri,
196                                           EX[f'p{i}'], Literal(f'p{i}'))
197
198        return mapping
199
200    def _generate_csv(self) -> bool:
201        """Generate the instance as CSV files.
202
203        Returns
204        -------
205        success : bool
206            True if successfull, false otherwise
207        """
208        os.makedirs(os.path.join(self.path(), 'data', 'shared'), exist_ok=True)
209        data_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE)
210        dataframe = self._generate_dataframe()
211        dataframe = self._update_dataframe(dataframe)
212        dataframe.to_csv(data_path, index=False)
213
214        mapping_path = os.path.join(self.path(), 'data', 'shared',
215                                    CSV_MAPPING_FILE)
216        mapping: Graph = self._generate_mapping()
217        mapping.serialize(destination=mapping_path, format='turtle')
218        self._generate_scenario()
219
220        return True
221
222    def _generate_postgresql(self) -> bool:
223        """Generate the instance as PostgreSQL with CSV files to load.
224
225        Returns
226        -------
227        success : bool
228            True if successfull, false otherwise
229        """
230        os.makedirs(os.path.join(self.path(), 'data', 'shared'), exist_ok=True)
231        data_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE)
232        dataframe = self._generate_dataframe()
233        dataframe = self._update_dataframe(dataframe)
234        dataframe.to_csv(data_path, index=False)
235
236        mapping_path = os.path.join(self.path(), 'data', 'shared',
237                                    RDB_MAPPING_FILE)
238        mapping: Graph = self._generate_mapping()
239        mapping.serialize(destination=mapping_path, format='turtle')
240        self._generate_scenario()
241
242        return True
243
244    def _generate_scenario(self) -> bool:
245        """Generate the metadata for this scenario.
246
247        Configures the execution pipeline automatically.
248
249        Returns
250        -------
251        success : bool
252            True if successfull, false otherwise
253        """
254        name: str = f'duplicates_{self._percentage}'
255        description: str = f'Duplicates {self._percentage}'
256        iri: str = f'http://example.org/duplicates/{self._percentage}/'
257
258        if self._data_format == 'postgresql':
259            return self._generate_metadata(iri, name, description,
260                                           RDB_MAPPING_FILE)
261        elif self._data_format == 'csv':
262            return self._generate_metadata(iri, name, description,
263                                           CSV_MAPPING_FILE)
264        else:
265            raise NotImplementedError(f'{self._data_format} not implemented')
266
267        return True

Helper class that provides a standard way to create an ABC using inheritance.

Duplicates( main_directory: str, verbose: bool, percentage: float, data_format: str, engine: str, seed: int = 0, number_of_members: int = 100000, number_of_properties: int = 20, value_size: int = 0)
29    def __init__(self, main_directory: str, verbose: bool,
30                 percentage: float, data_format: str, engine: str,
31                 seed: int = 0, number_of_members: int = 100000,
32                 number_of_properties: int = 20, value_size: int = 0):
33        """Initialize a Duplicates scenario.
34
35        Parameters
36        ----------
37        main_directory : str
38            Root directory for generating instances of Duplicates.
39        verbose : bool
40            Verbose logging enabled or not.
41        percentage : float
42            Percentage duplicates to generate, for example 50% results into
43            a dataset with 50% the same data values.
44        data_format : str
45            Data format to use for generating the data set, for example:
46            "csv", "json", "xml", "postgresql", "mysql"
47        engine : str
48            Engine to use for execution of the generated scenario's instance,
49            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
50            or "OntopMaterialize"
51        seed : int
52            Random seed to use, default 0.
53        number_of_members : int
54            Number of members to generate, for example 5000 for 5K rows in a
55            tabular data structure. Default 100K members.
56        number_of_properties : int
57            Number of properties per member to generate, for example 20 for
58            20 columns in a tabular data structure. Default 20 properties.
59        value_size : int
60            Number of characters to add to default value generation,
61            for example: 256 will expand all values to 256 characters.
62            Default 0 added characters.
63        """
64        self._percentage: float = percentage
65        self._number_of_members = number_of_members
66        self._number_of_properties = number_of_properties
67        self._value_size = value_size
68        random.seed(seed)
69
70        super().__init__(data_format, engine, main_directory, verbose)
71
72        if self._data_format != 'csv':
73            raise NotImplementedError(f'Data format {self._data_format} '
74                                      f'is not implemented by {__name__}')
75
76        self._logger = Logger(__name__, self._main_directory, self._verbose)

Initialize a Duplicates scenario.

Parameters
  • main_directory (str): Root directory for generating instances of Duplicates.
  • verbose (bool): Verbose logging enabled or not.
  • percentage (float): Percentage of duplicates to generate; for example, 50% results in a data set in which 50% of the members share the same data values.
  • data_format (str): Data format to use for generating the data set, for example: "csv", "json", "xml", "postgresql", "mysql"
  • engine (str): Engine to use for execution of the generated scenario's instance, for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC", or "OntopMaterialize"
  • seed (int): Random seed to use, default 0.
  • number_of_members (int): Number of members to generate, for example 5000 for 5K rows in a tabular data structure. Default 100K members.
  • number_of_properties (int): Number of properties per member to generate, for example 20 for 20 columns in a tabular data structure. Default 20 properties.
  • value_size (int): Number of characters to add to default value generation, for example: 256 will expand all values to 256 characters. Default 0 added characters.
def generate(self) -> bool:
78    def generate(self) -> bool:
79        """Generate the instance using the Duplciates scenario.
80
81        Only CSV files are currently implemented!
82        """
83        if self._data_format == 'csv':
84            return self._generate_csv()
85        elif self._data_format == 'postgresql':
86            return self._generate_postgresql()
87        else:
88            raise NotImplementedError(f'Data format {self._data_format} '
89                                      f'is not implemented by {__name__}')

Generate the instance using the Duplicates scenario.

Only CSV files are currently implemented!

def path(self) -> str:
 91    def path(self) -> str:
 92        """Builds the file path for the instance of a Duplicates scenario.
 93
 94        Returns
 95        -------
 96        path : str
 97            File path for the Duplicates' instance.
 98        """
 99        key = f'duplicates_{self._percentage}_percentage'
100        path = os.path.join(self._main_directory, self._engine,
101                            self._data_format, key)
102        self._logger.debug(f'Generating to {path}')
103        os.makedirs(path, exist_ok=True)
104        return path

Builds the file path for the instance of a Duplicates scenario.

Returns
  • path (str): File path for the Duplicates' instance.