bench_generator.joins_percentage

This module holds the JoinsPercentage class, which generates two datasets in which a configurable percentage of the members match on the join condition.

  1#!/usr/bin/env python3
  2
  3"""
  4This module holds the Joins class which scales the dataset size
  5by the number of members in a dataset such as number of rows for tabular data.
  6"""
  7
  8import os
  9import string
 10import random
 11from typing import Tuple
 12from pandas import DataFrame
 13from rdflib.namespace import RDF
 14from rdflib import Graph, URIRef, BNode, Literal, Namespace
 15from bench_generator.scenario import Scenario
 16from bench_generator.logger import Logger
 17
 18DATA_FILE1 = 'data1.csv'
 19DATA_FILE2 = 'data2.csv'
 20RDB_MAPPING_FILE = 'mapping.r2rml.ttl'
 21CSV_MAPPING_FILE = 'mapping.rml.ttl'
 22R2RML = Namespace('http://www.w3.org/ns/r2rml#')
 23QL = Namespace('http://semweb.mmlab.be/ns/ql#')
 24EX = Namespace('http://example.com/')
 25
 26
 27class JoinsPercentage(Scenario):
 28    def __init__(self, main_directory: str, verbose: bool, percentage: float,
 29                 number_of_members: int, number_of_properties: int,
 30                 value_size: int, data_format: str, engine: str,
 31                 seed: int = 0):
 32        """Initialize a Joins Percentage scenario.
 33
 34        Parameters
 35        ----------
 36        main_directory : str
 37            Root directory for generating instances of Joins Percentage.
 38        verbose : bool
 39            Verbose logging enabled or not.
 40        percentage : float
 41            Percentage of members which should result into a join.
 42        number_of_members : int
 43            Number of members to generate, for example 5000 for 5K rows in a
 44            tabular data structure.
 45        number_of_properties : int
 46            Number of properties per member to generate, for example 20 for
 47            20 columns in a tabular data structure.
 48        value_size : int
 49            Number of characters to add to default value generation,
 50            for example: 256 will expand all values to 256 characters.
 51        data_format : str
 52            Data format to use for generating the data set, for example:
 53            "csv", "json", "xml", "postgresql", "mysql"
 54        engine : str
 55            Engine to use for execution of the generated scenario's instance,
 56            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
 57            or "OntopMaterialize"
 58        seed : int
 59            Random seed to use, default 0.
 60        """
 61        self._percentage = percentage
 62        self._number_of_members: int = number_of_members
 63        self._number_of_properties: int = number_of_properties
 64        self._value_size: int = value_size
 65        random.seed(seed)
 66
 67        super().__init__(data_format, engine, main_directory, verbose)
 68
 69        if self._data_format != 'csv':
 70            raise NotImplementedError(f'Data format {self._data_format} '
 71                                      f'is not implemented by {__name__}')
 72
 73        self._logger = Logger(__name__, self._main_directory, self._verbose)
 74        self._logger.debug(f'Generating join percentage'
 75                           f' with {self._percentage}% of members,')
 76
 77    def generate(self) -> bool:
 78        """Generate the instance using the Joins Percentage scenario.
 79
 80        Only CSV files are currently implemented!
 81        """
 82        if self._data_format == 'csv':
 83            return self._generate_csv()
 84        elif self._data_format == 'postgresql':
 85            return self._generate_postgresql()
 86        else:
 87            raise NotImplementedError(f'Data format {self._data_format} '
 88                                      f'is not implemented by {__name__}')
 89
 90    def path(self) -> str:
 91        """Builds the file path for the instance of a Joins Percentage scenario.
 92
 93        Returns
 94        -------
 95        path : str
 96            File path for the Joins Percentage's instance.
 97        """
 98        key = f'joins_perc_1-1_{self._percentage}'
 99        path = os.path.join(self._main_directory, self._engine,
100                            self._data_format, key)
101        self._logger.debug(f'Generating to {path}')
102        os.makedirs(path, exist_ok=True)
103        return path
104
105    def _generate_dataframe(self, member_offset: int = 1,
106                            property_offset: int = 1) -> DataFrame:
107        """Generate joins.
108
109        Parameters
110        ----------
111        member_offset : int
112            Offset to start member ID generation from. Default 1 (no offset).
113        property_offset : int
114            Offset to start property ID generation from. Default 1 (no offset).
115
116        Returns
117        -------
118        dataframe : DataFrame
119            Panda's DataFrame with generated joins.
120        """
121        subject_id = range(member_offset,
122                           self._number_of_members + member_offset)
123        value_id = range(property_offset,
124                         self._number_of_members + property_offset)
125        data: dict = {'id': subject_id}
126        n_ascii = len(string.ascii_letters)
127
128        for j in range(1, self._number_of_properties + 1):
129            # Append ASCII characters if necessary, use modulo to avoid out of
130            # range in ASCII table
131            append_value = ''
132            if self._value_size > 0:
133                append_value = '_'
134            for n in range(self._value_size):
135                append_value += string.ascii_letters[n % n_ascii]
136
137            # Generate value V_{property}_{member} honoring the value size
138            value = [f'V_{j}-{i}{append_value}' for i in value_id]
139            data[f'p{j}'] = value
140
141        return DataFrame(data)
142
143    def _update_one_on_one(self, dataframe1: DataFrame, dataframe2: DataFrame)\
144            -> Tuple[DataFrame, DataFrame]:
145        # 0% percentage results in zero matches for the join condition,
146        # don't even bother to try to match the dataframes
147        if self._percentage == 0.0:
148            return dataframe1, dataframe2
149
150        percentaged_members = \
151            self._number_of_members * (self._percentage / 100.0)
152
153        sample1 = dataframe1.iloc[random.sample(list(dataframe1.index),
154                                                int(percentaged_members))]
155        sample2 = dataframe2.iloc[random.sample(list(dataframe2.index),
156                                                int(percentaged_members))]
157
158        for i, j in zip(list(sample1.index), list(sample2.index)):
159            dataframe2.loc[j, 'id'] = dataframe1.loc[i, 'id']
160
161        return dataframe1, dataframe2
162
163    def _add_join_predicate_object_map(self, mapping: Graph,
164                                       triplesmap_iri: URIRef,
165                                       predicate_value: URIRef,
166                                       object_value: Literal,
167                                       parent_triplesmap_iri: URIRef,
168                                       child_value: Literal,
169                                       parent_value: Literal) -> BNode:
170        """Insert a join with join condition into a [R2]RML mapping
171
172        Parameters
173        ----------
174        mapping : Graph
175            [R2]RML mapping as an RDFLib Graph.
176        triples_map_iri : URIRef
177            IRI of the Triples Map to insert the PredicateObjectMap in.
178        predicate_value : URIRef
179            Predicate IRI value for PredicateObjectMap.
180        object_value : Literal
181            Object value for PredicateObjectMap.
182
183        Returns
184        -------
185        predicat_object_map_with_join_iri : BNode
186            Predicate Object Map with join blank node ID.
187        """
188        predicate_object_map_iri = BNode()
189        predicate_map_iri = BNode()
190        object_map_iri = BNode()
191        join_condition_iri = BNode()
192
193        mapping.add((join_condition_iri, R2RML.child, child_value))
194        mapping.add((join_condition_iri, R2RML.parent, parent_value))
195        mapping.add((join_condition_iri, RDF.type, R2RML.JoinCondition))
196        mapping.add((predicate_map_iri, R2RML.constant, predicate_value))
197        mapping.add((predicate_map_iri, RDF.type, R2RML.PredicateMap))
198        mapping.add((object_map_iri, RDF.type, R2RML.ReferenceObjectMap))
199        mapping.add((object_map_iri, R2RML.parentTriplesMap,
200                     parent_triplesmap_iri))
201        mapping.add((object_map_iri, R2RML.joinCondition, join_condition_iri))
202        mapping.add((predicate_object_map_iri, R2RML.predicateMap,
203                     predicate_map_iri))
204        mapping.add((predicate_object_map_iri, R2RML.objectMap,
205                     object_map_iri))
206        mapping.add((predicate_object_map_iri, RDF.type,
207                     R2RML.PredicateObjectMap))
208        mapping.add((triplesmap_iri, R2RML.predicateObjectMap,
209                     predicate_object_map_iri))
210
211        return join_condition_iri
212
213    def _generate_mapping(self) -> Graph:
214        """Generate a [R2]RML mapping for a Joins instance.
215
216        Returns
217        -------
218        mapping : Graph
219            [R2]RML mapping as an RDFLib Graph.
220        """
221        mapping: Graph = Graph(base='http://ex.com/')
222        mapping.bind('rr', R2RML)
223        mapping.bind('ql', QL)
224        mapping.bind('ex', EX)
225        subject1_template = Literal('http://ex.com/table1/{id}')
226        subject2_template = Literal('http://ex.com/table2/{id}')
227        if self._data_format == 'postgresql':
228            triples_map1_iri = self._add_triples_map(mapping,
229                                                     subject1_template,
230                                                     Literal('data'), number=1)
231            triples_map2_iri = self._add_triples_map(mapping,
232                                                     subject2_template,
233                                                     Literal('data'), number=2)
234        elif self._data_format == 'csv':
235            triples_map1_iri = \
236                self._add_triples_map_source(mapping, subject1_template,
237                                             Literal('/data/shared/data1.csv'),
238                                             number=1)
239            triples_map2_iri = \
240                self._add_triples_map_source(mapping, subject2_template,
241                                             Literal('/data/shared/data2.csv'),
242                                             number=2)
243        else:
244            raise NotImplementedError(f'{self._data_format} not implemented')
245
246        self._add_join_predicate_object_map(mapping, triples_map1_iri,
247                                            EX['j1'], Literal('p1'),
248                                            triples_map2_iri, Literal('id'),
249                                            Literal('id'))
250
251        return mapping
252
253    def _generate_csv(self) -> bool:
254        """Generate the instance as CSV files.
255
256        Returns
257        -------
258        success : bool
259            True if successfull, false otherwise
260        """
261        os.makedirs(os.path.join(self.path(), 'data', 'shared'), exist_ok=True)
262        data1_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE1)
263        dataframe1 = self._generate_dataframe()
264        data2_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE2)
265        dataframe2 = self._generate_dataframe(self._number_of_members + 1,
266                                              self._number_of_properties + 1)
267        dataframe1, dataframe2 = self._update_one_on_one(dataframe1,
268                                                         dataframe2)
269        dataframe1.to_csv(data1_path, index=False)
270        dataframe2.to_csv(data2_path, index=False)
271
272        mapping_path = os.path.join(self.path(), 'data', 'shared', CSV_MAPPING_FILE)
273        mapping: Graph = self._generate_mapping()
274        mapping.serialize(destination=mapping_path, format='turtle')
275        self._generate_scenario()
276
277        return True
278
279    def _generate_postgresql(self) -> bool:
280        """Generate the instance as PostgreSQL with CSV files to load.
281
282        Returns
283        -------
284        success : bool
285            True if successfull, false otherwise
286        """
287        os.makedirs(os.path.join(self.path(), 'data', 'shared'), exist_ok=True)
288        data1_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE1)
289        self._generate_dataframe().to_csv(data1_path, index=False)
290        data2_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE2)
291        self._generate_dataframe().to_csv(data2_path, index=False)
292
293        mapping_path = os.path.join(self.path(), 'data', 'shared', RDB_MAPPING_FILE)
294        mapping: Graph = self._generate_mapping()
295        mapping.serialize(destination=mapping_path, format='turtle')
296        self._generate_scenario()
297
298        return True
299
300    def _generate_scenario(self) -> bool:
301        """Generate the metadata for this scenario.
302
303        Configures the execution pipeline automatically.
304
305        Returns
306        -------
307        success : bool
308            True if successfull, false otherwise
309        """
310        name: str = f'join_percentage_{self._percentage}'
311        description: str = f'Join Percentage {self._percentage}% '
312        iri: str = f'http://example.org/join-percentage/{self._percentage}/'
313
314        if self._data_format == 'postgresql':
315            return self._generate_metadata(iri, name, description,
316                                           RDB_MAPPING_FILE)
317        elif self._data_format == 'csv':
318            return self._generate_metadata(iri, name, description,
319                                           CSV_MAPPING_FILE)
320        else:
321            raise NotImplementedError(f'{self._data_format} not implemented')
322
323        return False
DATA_FILE1 = 'data1.csv'
DATA_FILE2 = 'data2.csv'
RDB_MAPPING_FILE = 'mapping.r2rml.ttl'
CSV_MAPPING_FILE = 'mapping.rml.ttl'
R2RML = Namespace('http://www.w3.org/ns/r2rml#')
QL = Namespace('http://semweb.mmlab.be/ns/ql#')
EX = Namespace('http://example.com/')
class JoinsPercentage(bench_generator.scenario.Scenario):
 28class JoinsPercentage(Scenario):
 29    def __init__(self, main_directory: str, verbose: bool, percentage: float,
 30                 number_of_members: int, number_of_properties: int,
 31                 value_size: int, data_format: str, engine: str,
 32                 seed: int = 0):
 33        """Initialize a Joins Percentage scenario.
 34
 35        Parameters
 36        ----------
 37        main_directory : str
 38            Root directory for generating instances of Joins Percentage.
 39        verbose : bool
 40            Verbose logging enabled or not.
 41        percentage : float
 42            Percentage of members which should result into a join.
 43        number_of_members : int
 44            Number of members to generate, for example 5000 for 5K rows in a
 45            tabular data structure.
 46        number_of_properties : int
 47            Number of properties per member to generate, for example 20 for
 48            20 columns in a tabular data structure.
 49        value_size : int
 50            Number of characters to add to default value generation,
 51            for example: 256 will expand all values to 256 characters.
 52        data_format : str
 53            Data format to use for generating the data set, for example:
 54            "csv", "json", "xml", "postgresql", "mysql"
 55        engine : str
 56            Engine to use for execution of the generated scenario's instance,
 57            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
 58            or "OntopMaterialize"
 59        seed : int
 60            Random seed to use, default 0.
 61        """
 62        self._percentage = percentage
 63        self._number_of_members: int = number_of_members
 64        self._number_of_properties: int = number_of_properties
 65        self._value_size: int = value_size
 66        random.seed(seed)
 67
 68        super().__init__(data_format, engine, main_directory, verbose)
 69
 70        if self._data_format != 'csv':
 71            raise NotImplementedError(f'Data format {self._data_format} '
 72                                      f'is not implemented by {__name__}')
 73
 74        self._logger = Logger(__name__, self._main_directory, self._verbose)
 75        self._logger.debug(f'Generating join percentage'
 76                           f' with {self._percentage}% of members,')
 77
 78    def generate(self) -> bool:
 79        """Generate the instance using the Joins Percentage scenario.
 80
 81        Only CSV files are currently implemented!
 82        """
 83        if self._data_format == 'csv':
 84            return self._generate_csv()
 85        elif self._data_format == 'postgresql':
 86            return self._generate_postgresql()
 87        else:
 88            raise NotImplementedError(f'Data format {self._data_format} '
 89                                      f'is not implemented by {__name__}')
 90
 91    def path(self) -> str:
 92        """Builds the file path for the instance of a Joins Percentage scenario.
 93
 94        Returns
 95        -------
 96        path : str
 97            File path for the Joins Percentage's instance.
 98        """
 99        key = f'joins_perc_1-1_{self._percentage}'
100        path = os.path.join(self._main_directory, self._engine,
101                            self._data_format, key)
102        self._logger.debug(f'Generating to {path}')
103        os.makedirs(path, exist_ok=True)
104        return path
105
106    def _generate_dataframe(self, member_offset: int = 1,
107                            property_offset: int = 1) -> DataFrame:
108        """Generate joins.
109
110        Parameters
111        ----------
112        member_offset : int
113            Offset to start member ID generation from. Default 1 (no offset).
114        property_offset : int
115            Offset to start property ID generation from. Default 1 (no offset).
116
117        Returns
118        -------
119        dataframe : DataFrame
120            Panda's DataFrame with generated joins.
121        """
122        subject_id = range(member_offset,
123                           self._number_of_members + member_offset)
124        value_id = range(property_offset,
125                         self._number_of_members + property_offset)
126        data: dict = {'id': subject_id}
127        n_ascii = len(string.ascii_letters)
128
129        for j in range(1, self._number_of_properties + 1):
130            # Append ASCII characters if necessary, use modulo to avoid out of
131            # range in ASCII table
132            append_value = ''
133            if self._value_size > 0:
134                append_value = '_'
135            for n in range(self._value_size):
136                append_value += string.ascii_letters[n % n_ascii]
137
138            # Generate value V_{property}_{member} honoring the value size
139            value = [f'V_{j}-{i}{append_value}' for i in value_id]
140            data[f'p{j}'] = value
141
142        return DataFrame(data)
143
144    def _update_one_on_one(self, dataframe1: DataFrame, dataframe2: DataFrame)\
145            -> Tuple[DataFrame, DataFrame]:
146        # 0% percentage results in zero matches for the join condition,
147        # don't even bother to try to match the dataframes
148        if self._percentage == 0.0:
149            return dataframe1, dataframe2
150
151        percentaged_members = \
152            self._number_of_members * (self._percentage / 100.0)
153
154        sample1 = dataframe1.iloc[random.sample(list(dataframe1.index),
155                                                int(percentaged_members))]
156        sample2 = dataframe2.iloc[random.sample(list(dataframe2.index),
157                                                int(percentaged_members))]
158
159        for i, j in zip(list(sample1.index), list(sample2.index)):
160            dataframe2.loc[j, 'id'] = dataframe1.loc[i, 'id']
161
162        return dataframe1, dataframe2
163
164    def _add_join_predicate_object_map(self, mapping: Graph,
165                                       triplesmap_iri: URIRef,
166                                       predicate_value: URIRef,
167                                       object_value: Literal,
168                                       parent_triplesmap_iri: URIRef,
169                                       child_value: Literal,
170                                       parent_value: Literal) -> BNode:
171        """Insert a join with join condition into a [R2]RML mapping
172
173        Parameters
174        ----------
175        mapping : Graph
176            [R2]RML mapping as an RDFLib Graph.
177        triples_map_iri : URIRef
178            IRI of the Triples Map to insert the PredicateObjectMap in.
179        predicate_value : URIRef
180            Predicate IRI value for PredicateObjectMap.
181        object_value : Literal
182            Object value for PredicateObjectMap.
183
184        Returns
185        -------
186        predicat_object_map_with_join_iri : BNode
187            Predicate Object Map with join blank node ID.
188        """
189        predicate_object_map_iri = BNode()
190        predicate_map_iri = BNode()
191        object_map_iri = BNode()
192        join_condition_iri = BNode()
193
194        mapping.add((join_condition_iri, R2RML.child, child_value))
195        mapping.add((join_condition_iri, R2RML.parent, parent_value))
196        mapping.add((join_condition_iri, RDF.type, R2RML.JoinCondition))
197        mapping.add((predicate_map_iri, R2RML.constant, predicate_value))
198        mapping.add((predicate_map_iri, RDF.type, R2RML.PredicateMap))
199        mapping.add((object_map_iri, RDF.type, R2RML.ReferenceObjectMap))
200        mapping.add((object_map_iri, R2RML.parentTriplesMap,
201                     parent_triplesmap_iri))
202        mapping.add((object_map_iri, R2RML.joinCondition, join_condition_iri))
203        mapping.add((predicate_object_map_iri, R2RML.predicateMap,
204                     predicate_map_iri))
205        mapping.add((predicate_object_map_iri, R2RML.objectMap,
206                     object_map_iri))
207        mapping.add((predicate_object_map_iri, RDF.type,
208                     R2RML.PredicateObjectMap))
209        mapping.add((triplesmap_iri, R2RML.predicateObjectMap,
210                     predicate_object_map_iri))
211
212        return join_condition_iri
213
214    def _generate_mapping(self) -> Graph:
215        """Generate a [R2]RML mapping for a Joins instance.
216
217        Returns
218        -------
219        mapping : Graph
220            [R2]RML mapping as an RDFLib Graph.
221        """
222        mapping: Graph = Graph(base='http://ex.com/')
223        mapping.bind('rr', R2RML)
224        mapping.bind('ql', QL)
225        mapping.bind('ex', EX)
226        subject1_template = Literal('http://ex.com/table1/{id}')
227        subject2_template = Literal('http://ex.com/table2/{id}')
228        if self._data_format == 'postgresql':
229            triples_map1_iri = self._add_triples_map(mapping,
230                                                     subject1_template,
231                                                     Literal('data'), number=1)
232            triples_map2_iri = self._add_triples_map(mapping,
233                                                     subject2_template,
234                                                     Literal('data'), number=2)
235        elif self._data_format == 'csv':
236            triples_map1_iri = \
237                self._add_triples_map_source(mapping, subject1_template,
238                                             Literal('/data/shared/data1.csv'),
239                                             number=1)
240            triples_map2_iri = \
241                self._add_triples_map_source(mapping, subject2_template,
242                                             Literal('/data/shared/data2.csv'),
243                                             number=2)
244        else:
245            raise NotImplementedError(f'{self._data_format} not implemented')
246
247        self._add_join_predicate_object_map(mapping, triples_map1_iri,
248                                            EX['j1'], Literal('p1'),
249                                            triples_map2_iri, Literal('id'),
250                                            Literal('id'))
251
252        return mapping
253
254    def _generate_csv(self) -> bool:
255        """Generate the instance as CSV files.
256
257        Returns
258        -------
259        success : bool
260            True if successfull, false otherwise
261        """
262        os.makedirs(os.path.join(self.path(), 'data', 'shared'), exist_ok=True)
263        data1_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE1)
264        dataframe1 = self._generate_dataframe()
265        data2_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE2)
266        dataframe2 = self._generate_dataframe(self._number_of_members + 1,
267                                              self._number_of_properties + 1)
268        dataframe1, dataframe2 = self._update_one_on_one(dataframe1,
269                                                         dataframe2)
270        dataframe1.to_csv(data1_path, index=False)
271        dataframe2.to_csv(data2_path, index=False)
272
273        mapping_path = os.path.join(self.path(), 'data', 'shared', CSV_MAPPING_FILE)
274        mapping: Graph = self._generate_mapping()
275        mapping.serialize(destination=mapping_path, format='turtle')
276        self._generate_scenario()
277
278        return True
279
280    def _generate_postgresql(self) -> bool:
281        """Generate the instance as PostgreSQL with CSV files to load.
282
283        Returns
284        -------
285        success : bool
286            True if successfull, false otherwise
287        """
288        os.makedirs(os.path.join(self.path(), 'data', 'shared'), exist_ok=True)
289        data1_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE1)
290        self._generate_dataframe().to_csv(data1_path, index=False)
291        data2_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE2)
292        self._generate_dataframe().to_csv(data2_path, index=False)
293
294        mapping_path = os.path.join(self.path(), 'data', 'shared', RDB_MAPPING_FILE)
295        mapping: Graph = self._generate_mapping()
296        mapping.serialize(destination=mapping_path, format='turtle')
297        self._generate_scenario()
298
299        return True
300
301    def _generate_scenario(self) -> bool:
302        """Generate the metadata for this scenario.
303
304        Configures the execution pipeline automatically.
305
306        Returns
307        -------
308        success : bool
309            True if successfull, false otherwise
310        """
311        name: str = f'join_percentage_{self._percentage}'
312        description: str = f'Join Percentage {self._percentage}% '
313        iri: str = f'http://example.org/join-percentage/{self._percentage}/'
314
315        if self._data_format == 'postgresql':
316            return self._generate_metadata(iri, name, description,
317                                           RDB_MAPPING_FILE)
318        elif self._data_format == 'csv':
319            return self._generate_metadata(iri, name, description,
320                                           CSV_MAPPING_FILE)
321        else:
322            raise NotImplementedError(f'{self._data_format} not implemented')
323
324        return False

Helper class that provides a standard way to create an ABC using inheritance.

JoinsPercentage( main_directory: str, verbose: bool, percentage: float, number_of_members: int, number_of_properties: int, value_size: int, data_format: str, engine: str, seed: int = 0)
29    def __init__(self, main_directory: str, verbose: bool, percentage: float,
30                 number_of_members: int, number_of_properties: int,
31                 value_size: int, data_format: str, engine: str,
32                 seed: int = 0):
33        """Initialize a Joins Percentage scenario.
34
35        Parameters
36        ----------
37        main_directory : str
38            Root directory for generating instances of Joins Percentage.
39        verbose : bool
40            Verbose logging enabled or not.
41        percentage : float
42            Percentage of members which should result into a join.
43        number_of_members : int
44            Number of members to generate, for example 5000 for 5K rows in a
45            tabular data structure.
46        number_of_properties : int
47            Number of properties per member to generate, for example 20 for
48            20 columns in a tabular data structure.
49        value_size : int
50            Number of characters to add to default value generation,
51            for example: 256 will expand all values to 256 characters.
52        data_format : str
53            Data format to use for generating the data set, for example:
54            "csv", "json", "xml", "postgresql", "mysql"
55        engine : str
56            Engine to use for execution of the generated scenario's instance,
57            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
58            or "OntopMaterialize"
59        seed : int
60            Random seed to use, default 0.
61        """
62        self._percentage = percentage
63        self._number_of_members: int = number_of_members
64        self._number_of_properties: int = number_of_properties
65        self._value_size: int = value_size
66        random.seed(seed)
67
68        super().__init__(data_format, engine, main_directory, verbose)
69
70        if self._data_format != 'csv':
71            raise NotImplementedError(f'Data format {self._data_format} '
72                                      f'is not implemented by {__name__}')
73
74        self._logger = Logger(__name__, self._main_directory, self._verbose)
75        self._logger.debug(f'Generating join percentage'
76                           f' with {self._percentage}% of members,')

Initialize a Joins Percentage scenario.

Parameters
  • main_directory (str): Root directory for generating instances of Joins Percentage.
  • verbose (bool): Verbose logging enabled or not.
  • percentage (float): Percentage of members which should result into a join.
  • number_of_members (int): Number of members to generate, for example 5000 for 5K rows in a tabular data structure.
  • number_of_properties (int): Number of properties per member to generate, for example 20 for 20 columns in a tabular data structure.
  • value_size (int): Number of characters to add to default value generation, for example: 256 will expand all values to 256 characters.
  • data_format (str): Data format to use for generating the data set, for example: "csv", "json", "xml", "postgresql", "mysql"
  • engine (str): Engine to use for execution of the generated scenario's instance, for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC", or "OntopMaterialize"
  • seed (int): Random seed to use, default 0.
def generate(self) -> bool:
78    def generate(self) -> bool:
79        """Generate the instance using the Joins Percentage scenario.
80
81        Only CSV files are currently implemented!
82        """
83        if self._data_format == 'csv':
84            return self._generate_csv()
85        elif self._data_format == 'postgresql':
86            return self._generate_postgresql()
87        else:
88            raise NotImplementedError(f'Data format {self._data_format} '
89                                      f'is not implemented by {__name__}')

Generate the instance using the Joins Percentage scenario.

Only CSV files are currently implemented!

def path(self) -> str:
 91    def path(self) -> str:
 92        """Builds the file path for the instance of a Joins Percentage scenario.
 93
 94        Returns
 95        -------
 96        path : str
 97            File path for the Joins Percentage's instance.
 98        """
 99        key = f'joins_perc_1-1_{self._percentage}'
100        path = os.path.join(self._main_directory, self._engine,
101                            self._data_format, key)
102        self._logger.debug(f'Generating to {path}')
103        os.makedirs(path, exist_ok=True)
104        return path

Builds the file path for the instance of a Joins Percentage scenario.

Returns
  • path (str): File path for the Joins Percentage's instance.