bench_generator.joins_duplicate

This module holds the JoinsDuplicate class which scales the number of duplicates in the join values of a dataset, for example duplicated values across rows for tabular data.
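
A minimal usage sketch (illustrative, not part of the module; the argument values and the /tmp/bench output directory are assumptions):

    from bench_generator.joins_duplicate import JoinsDuplicate

    scenario = JoinsDuplicate(main_directory='/tmp/bench',   # assumed output root
                              verbose=True,
                              percentage=25.0,               # 25% of members should join
                              number_of_duplicates=5,
                              number_of_members=5000,
                              number_of_properties=20,
                              value_size=0,
                              data_format='csv',             # only 'csv' passes the constructor check
                              engine='RMLMapper',
                              seed=0)
    scenario.generate()   # writes the data files and mapping under scenario.path()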

#!/usr/bin/env python3

"""
This module holds the JoinsDuplicate class which scales the number of
duplicates in the join values of a dataset, for example duplicated values
across rows for tabular data.
"""

import os
import string
import random
from typing import Tuple
from pandas import DataFrame
from rdflib.namespace import RDF
from rdflib import Graph, URIRef, BNode, Literal, Namespace
from bench_generator.scenario import Scenario
from bench_generator.logger import Logger

DATA_FILE1 = 'data1.csv'
DATA_FILE2 = 'data2.csv'
RDB_MAPPING_FILE = 'mapping.r2rml.ttl'
CSV_MAPPING_FILE = 'mapping.rml.ttl'
R2RML = Namespace('http://www.w3.org/ns/r2rml#')
QL = Namespace('http://semweb.mmlab.be/ns/ql#')
EX = Namespace('http://example.com/')


class JoinsDuplicate(Scenario):
    def __init__(self, main_directory: str, verbose: bool, percentage: float,
                 number_of_duplicates: int, number_of_members: int,
                 number_of_properties: int, value_size: int, data_format: str,
                 engine: str, seed: int = 0):
        """Initialize a Joins Duplicate scenario.

        Parameters
        ----------
        main_directory : str
            Root directory for generating instances of Joins Duplicate.
        verbose : bool
            Verbose logging enabled or not.
        percentage : float
            Percentage of members which should result in a join.
        number_of_duplicates : int
            Number of duplicates to generate.
        number_of_members : int
            Number of members to generate, for example 5000 for 5K rows in a
            tabular data structure.
        number_of_properties : int
            Number of properties per member to generate, for example 20 for
            20 columns in a tabular data structure.
        value_size : int
            Number of characters to add to default value generation,
            for example: 256 will expand all values to 256 characters.
        data_format : str
            Data format to use for generating the data set, for example:
            "csv", "json", "xml", "postgresql", "mysql"
        engine : str
            Engine to use for execution of the generated scenario's instance,
            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
            or "OntopMaterialize"
        seed : int
            Random seed to use, default 0.
        """
        self._percentage = percentage
        self._number_of_duplicates = number_of_duplicates
        self._number_of_members: int = number_of_members
        self._number_of_properties: int = number_of_properties
        self._value_size: int = value_size
        random.seed(seed)

        super().__init__(data_format, engine, main_directory, verbose)

        if self._data_format != 'csv':
            raise NotImplementedError(f'Data format {self._data_format} '
                                      f'is not implemented by {__name__}')

        self._logger = Logger(__name__, self._main_directory, self._verbose)
        self._logger.debug(f'Generating join percentage'
                           f' with {self._percentage}% of members.')

    def generate(self) -> bool:
        """Generate the instance using the Joins Duplicate scenario.

        Only CSV files are currently implemented!
        """
        if self._data_format == 'csv':
            return self._generate_csv()
        elif self._data_format == 'postgresql':
            return self._generate_postgresql()
        else:
            raise NotImplementedError(f'Data format {self._data_format} '
                                      f'is not implemented by {__name__}')

    def path(self) -> str:
        """Builds the file path for the instance of a Joins Duplicate scenario.

        Returns
        -------
        path : str
            File path for the Joins Duplicate's instance.
        """
        key = f'joins_duplicates_{self._number_of_duplicates}' + \
              f'_{self._percentage}'
        path = os.path.join(self._main_directory, self._engine,
                            self._data_format, key)
        self._logger.debug(f'Generating to {path}')
        os.makedirs(path, exist_ok=True)
        return path

    def _generate_dataframe(self, member_offset: int = 1,
                            property_offset: int = 1) -> DataFrame:
        """Generate a DataFrame of members and their property values.

        Parameters
        ----------
        member_offset : int
            Offset to start member ID generation from. Default 1 (no offset).
        property_offset : int
            Offset to start property ID generation from. Default 1 (no offset).

        Returns
        -------
        dataframe : DataFrame
            pandas DataFrame with the generated members and property values.
        """
        subject_id = range(member_offset,
                           self._number_of_members + member_offset)
        value_id = range(property_offset,
                         self._number_of_members + property_offset)
        data: dict = {'id': subject_id}
        n_ascii = len(string.ascii_letters)

        for j in range(1, self._number_of_properties + 1):
            # Append ASCII characters if necessary, use modulo to avoid out of
            # range in ASCII table
            append_value = ''
            if self._value_size > 0:
                append_value = '_'
            for n in range(self._value_size):
                append_value += string.ascii_letters[n % n_ascii]

            # Generate value V_{property}_{member} honoring the value size
            value = [f'V_{j}-{i}{append_value}' for i in value_id]
            data[f'p{j}'] = value

        return DataFrame(data)

    def _update_duplicates(self,
                           dataframe1: DataFrame,
                           dataframe2: DataFrame) -> Tuple[DataFrame,
                                                           DataFrame]:
        """Copy 'p1' join values from a sample of dataframe1 into dataframe2,
        grouping them under shared member IDs so that the configured
        percentage of members results in a join with the configured number
        of duplicates.
        """
        duplicates = self._number_of_members * (self._percentage / 100.0)
        num_P1s = duplicates / self._number_of_duplicates
        n = min(num_P1s * (self._number_of_duplicates + 1),
                self._number_of_members)

        sample1 = dataframe1.iloc[random.sample(list(dataframe1.index),
                                                int(n))]
        sample2 = dataframe2.iloc[random.sample(list(dataframe2.index),
                                                int(n))]
        values = list(set([m[1]['p1'] for m in sample1.iterrows()]))

        if len(values) > self._number_of_members:
            values = values[:self._number_of_members]

        member_id = -1
        member_count = 0
        for i, j in zip(values, list(sample2.index)):
            if member_id == -1:
                member_id = int(dataframe2.loc[j, 'id'])

            dataframe2.loc[j, 'id'] = member_id
            dataframe2.loc[j, 'p1'] = i
            member_count += 1

            if member_count >= self._number_of_duplicates:
                member_id = -1
                member_count = 0

        return dataframe1, dataframe2

    def _add_join_predicate_object_map(self, mapping: Graph,
                                       triplesmap_iri: URIRef,
                                       predicate_value: URIRef,
                                       object_value: Literal,
                                       parent_triplesmap_iri: URIRef,
                                       child_value: Literal,
                                       parent_value: Literal) -> BNode:
        """Insert a join with join condition into a [R2]RML mapping.

        Parameters
        ----------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.
        triplesmap_iri : URIRef
            IRI of the Triples Map to insert the PredicateObjectMap in.
        predicate_value : URIRef
            Predicate IRI value for PredicateObjectMap.
        object_value : Literal
            Object value for PredicateObjectMap (currently unused).
        parent_triplesmap_iri : URIRef
            IRI of the parent Triples Map referenced by the join.
        child_value : Literal
            Child reference of the join condition.
        parent_value : Literal
            Parent reference of the join condition.

        Returns
        -------
        join_condition_iri : BNode
            Blank node ID of the inserted join condition.
        """
        predicate_object_map_iri = BNode()
        predicate_map_iri = BNode()
        object_map_iri = BNode()
        join_condition_iri = BNode()

        mapping.add((join_condition_iri, R2RML.child, child_value))
        mapping.add((join_condition_iri, R2RML.parent, parent_value))
        mapping.add((join_condition_iri, RDF.type, R2RML.JoinCondition))
        mapping.add((predicate_map_iri, R2RML.constant, predicate_value))
        mapping.add((predicate_map_iri, RDF.type, R2RML.PredicateMap))
        mapping.add((object_map_iri, RDF.type, R2RML.ReferenceObjectMap))
        mapping.add((object_map_iri, R2RML.parentTriplesMap,
                     parent_triplesmap_iri))
        mapping.add((object_map_iri, R2RML.joinCondition, join_condition_iri))
        mapping.add((predicate_object_map_iri, R2RML.predicateMap,
                     predicate_map_iri))
        mapping.add((predicate_object_map_iri, R2RML.objectMap,
                     object_map_iri))
        mapping.add((predicate_object_map_iri, RDF.type,
                     R2RML.PredicateObjectMap))
        mapping.add((triplesmap_iri, R2RML.predicateObjectMap,
                     predicate_object_map_iri))

        return join_condition_iri

    def _generate_mapping(self) -> Graph:
        """Generate a [R2]RML mapping for a Joins instance.

        Returns
        -------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.
        """
        mapping: Graph = Graph(base='http://ex.com/')
        mapping.bind('rr', R2RML)
        mapping.bind('ql', QL)
        mapping.bind('ex', EX)
        subject1_template = Literal('http://ex.com/table1/{id}')
        subject2_template = Literal('http://ex.com/table2/{id}')
        if self._data_format == 'postgresql':
            triples_map1_iri = self._add_triples_map(mapping,
                                                     subject1_template,
                                                     Literal('data'), number=1)
            triples_map2_iri = self._add_triples_map(mapping,
                                                     subject2_template,
                                                     Literal('data'), number=2)
        elif self._data_format == 'csv':
            triples_map1_iri = \
                self._add_triples_map_source(mapping, subject1_template,
                                             Literal('/data/shared/data1.csv'),
                                             number=1)
            triples_map2_iri = \
                self._add_triples_map_source(mapping, subject2_template,
                                             Literal('/data/shared/data2.csv'),
                                             number=2)
        else:
            raise NotImplementedError(f'{self._data_format} not implemented')

        self._add_join_predicate_object_map(mapping, triples_map1_iri,
                                            EX['j1'], Literal('p1'),
                                            triples_map2_iri, Literal('p1'),
                                            Literal('p1'))

        return mapping

    def _generate_csv(self) -> bool:
        """Generate the instance as CSV files.

        Returns
        -------
        success : bool
            True if successful, False otherwise.
        """
        os.makedirs(os.path.join(self.path(), 'data', 'shared'), exist_ok=True)
        data1_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE1)
        dataframe1 = self._generate_dataframe()
        data2_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE2)
        dataframe2 = self._generate_dataframe(self._number_of_members + 1,
                                              self._number_of_properties + 1)
        dataframe1, dataframe2 = self._update_duplicates(dataframe1,
                                                         dataframe2)
        dataframe1.to_csv(data1_path, index=False)
        dataframe2.to_csv(data2_path, index=False)

        mapping_path = os.path.join(self.path(), 'data', 'shared',
                                    CSV_MAPPING_FILE)
        mapping: Graph = self._generate_mapping()
        mapping.serialize(destination=mapping_path, format='turtle')
        self._generate_scenario()

        return True

    def _generate_postgresql(self) -> bool:
        """Generate the instance as PostgreSQL with CSV files to load.

        Returns
        -------
        success : bool
            True if successful, False otherwise.
        """
        os.makedirs(os.path.join(self.path(), 'data', 'shared'), exist_ok=True)
        data1_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE1)
        self._generate_dataframe().to_csv(data1_path, index=False)
        data2_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE2)
        self._generate_dataframe().to_csv(data2_path, index=False)

        mapping_path = os.path.join(self.path(), 'data', 'shared',
                                    RDB_MAPPING_FILE)
        mapping: Graph = self._generate_mapping()
        mapping.serialize(destination=mapping_path, format='turtle')
        self._generate_scenario()

        return True

    def _generate_scenario(self) -> bool:
        """Generate the metadata for this scenario.

        Configures the execution pipeline automatically.

        Returns
        -------
        success : bool
            True if successful, False otherwise.
        """
        name: str = f'join_duplicates_{self._number_of_duplicates}' + \
                    f'_{self._percentage}'
        description: str = f'Join Duplicate {self._number_of_duplicates}' + \
                           f' ({self._percentage}%)'
        iri: str = 'http://example.org/join-duplicates/' + \
                   f'{self._number_of_duplicates}/{self._percentage}/'

        if self._data_format == 'postgresql':
            return self._generate_metadata(iri, name, description,
                                           RDB_MAPPING_FILE)
        elif self._data_format == 'csv':
            return self._generate_metadata(iri, name, description,
                                           CSV_MAPPING_FILE)
        else:
            raise NotImplementedError(f'{self._data_format} not implemented')

        return False

class JoinsDuplicate(bench_generator.scenario.Scenario):

Scenario that generates two datasets in which a configurable percentage of members carries duplicated join values, grouped into a configurable number of duplicates, together with an [R2]RML mapping that joins them on p1.

JoinsDuplicate(main_directory: str, verbose: bool, percentage: float, number_of_duplicates: int, number_of_members: int, number_of_properties: int, value_size: int, data_format: str, engine: str, seed: int = 0)

Initialize a Joins Duplicate scenario.

Parameters
  • main_directory (str): Root directory for generating instances of Joins Duplicate.
  • verbose (bool): Verbose logging enabled or not.
  • percentage (float): Percentage of members which should result in a join; see the sketch after this parameter list for how it interacts with number_of_duplicates.
  • number_of_duplicates (int): Number of duplicates to generate.
  • number_of_members (int): Number of members to generate, for example 5000 for 5K rows in a tabular data structure.
  • number_of_properties (int): Number of properties per member to generate, for example 20 for 20 columns in a tabular data structure.
  • value_size (int): Number of characters to add to default value generation, for example: 256 will expand all values to 256 characters.
  • data_format (str): Data format to use for generating the data set, for example: "csv", "json", "xml", "postgresql", "mysql"
  • engine (str): Engine to use for execution of the generated scenario's instance, for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC", or "OntopMaterialize"
  • seed (int): Random seed to use, default 0.
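
For intuition on how percentage and number_of_duplicates interact, the sample size used internally by _update_duplicates (see the source above) can be worked out by hand. A small sketch with assumed values:

    number_of_members = 5000
    percentage = 25.0
    number_of_duplicates = 5

    duplicates = number_of_members * (percentage / 100.0)  # 1250.0 members should take part in a join
    num_p1s = duplicates / number_of_duplicates             # 250.0 groups of shared 'p1' values
    n = min(num_p1s * (number_of_duplicates + 1),
            number_of_members)                               # 1500 rows sampled from each table
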
def generate(self) -> bool:

Generate the instance using the Joins Duplicate scenario.

Only CSV files are currently implemented!
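
Based on _generate_csv in the source above, a successful generate() call for the 'csv' data format writes the following files under the instance directory returned by path(), alongside the scenario metadata produced by _generate_scenario() (layout shown for illustration):

    <path()>/data/shared/data1.csv        # first generated table
    <path()>/data/shared/data2.csv        # second table, with duplicated 'p1' join values
    <path()>/data/shared/mapping.rml.ttl  # RML mapping joining both tables on 'p1'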

def path(self) -> str:

Builds the file path for the instance of a Joins Duplicate scenario.

Returns
  • path (str): File path for the Joins Duplicate's instance.
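
For example, with the assumed values from the earlier sketch (main_directory='/tmp/bench', engine='RMLMapper', data_format='csv', number_of_duplicates=5, percentage=25.0), the instance path resolves to:

    /tmp/bench/RMLMapper/csv/joins_duplicates_5_25.0

The directory is created on the fly (os.makedirs with exist_ok=True), so calling path() before generate() is safe.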