bench_generator.joins_multiple

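This module holds the JoinsMultiple class, which generates two tabular datasets whose members are joined N-M on one or more join conditions for a configurable percentage of members. The dataset size is set by the number of members, such as the number of rows for tabular data.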

  1#!/usr/bin/env python3
  2
  3"""
  4This module holds the Joins class which scales the dataset size
  5by the number of members in a dataset such as number of rows for tabular data.
  6"""
  7
  8import os
  9import string
 10import random
 11from typing import Tuple
 12from pandas import DataFrame
 13from rdflib.namespace import RDF
 14from rdflib import Graph, URIRef, BNode, Literal, Namespace
 15from bench_generator.scenario import Scenario
 16from bench_generator.logger import Logger
 17
 18DATA_FILE1 = 'data1.csv'
 19DATA_FILE2 = 'data2.csv'
 20RDB_MAPPING_FILE = 'mapping.r2rml.ttl'
 21CSV_MAPPING_FILE = 'mapping.rml.ttl'
 22R2RML = Namespace('http://www.w3.org/ns/r2rml#')
 23QL = Namespace('http://semweb.mmlab.be/ns/ql#')
 24EX = Namespace('http://example.com/')
 25MEMBERS_PERCENTAGE = 50.0
 26
 27
 28class JoinsMultiple(Scenario):
 29    def __init__(self, main_directory: str, verbose: bool, percentage: float,
 30                 n: int, m: int, jc: int, number_of_members: int,
 31                 number_of_properties: int, value_size: int, data_format: str,
 32                 engine: str, seed: int = 0):
 33        """Initialize a Joins Multiple scenario.
 34
 35        Member's percentage is always set to 50%.
 36
 37        Parameters
 38        ----------
 39        main_directory : str
 40            Root directory for generating instances of Joins Multiple.
 41        verbose : bool
 42            Verbose logging enabled or not.
 43        percentage : float
 44            Percentage of relations which should result into a join.
 45        n : int
 46            Relation size N.
 47        m : int
 48            Relation size M.
 49        jc : int
 50            Number of Join Conditions.
 51        number_of_members : int
 52            Number of members to generate, for example 5000 for 5K rows in a
 53            tabular data structure.
 54        number_of_properties : int
 55            Number of properties per member to generate, for example 20 for
 56            20 columns in a tabular data structure.
 57        value_size : int
 58            Number of characters to add to default value generation,
 59            for example: 256 will expand all values to 256 characters.
 60        data_format : str
 61            Data format to use for generating the data set, for example:
 62            "csv", "json", "xml", "postgresql", "mysql"
 63        engine : str
 64            Engine to use for execution of the generated scenario's instance,
 65            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
 66            or "OntopMaterialize"
 67        seed : int
 68            Random seed to use, default 0.
 69        """
 70        self._percentage = percentage
 71        self._n = n
 72        self._m = m
 73        self._jc = jc
 74        self._number_of_members: int = number_of_members
 75        self._number_of_properties: int = number_of_properties
 76        self._value_size: int = value_size
 77        random.seed(seed)
 78
 79        super().__init__(data_format, engine, main_directory, verbose)
 80
 81        if self._data_format != 'csv':
 82            raise NotImplementedError(f'Data format {self._data_format} '
 83                                      f'is not implemented by {__name__}')
 84
 85        self._logger = Logger(__name__, self._main_directory, self._verbose)
 86        self._logger.debug(f'Generating join relations {self._n}-{self._m}'
 87                           f' with {self._percentage}% of relations,')
 88
 89    def generate(self) -> bool:
 90        """Generate the instance using the Joins Multiple scenario.
 91
 92        Only CSV files are currently implemented!
 93        """
 94        if self._data_format == 'csv':
 95            return self._generate_csv()
 96        elif self._data_format == 'postgresql':
 97            return self._generate_postgresql()
 98        else:
 99            raise NotImplementedError(f'Data format {self._data_format} '
100                                      f'is not implemented by {__name__}')
101
102    def path(self) -> str:
103        """Builds the file path for the instance of a Joins Multiple scenario.
104
105        Returns
106        -------
107        path : str
108            File path for the Joins Multiple's instance.
109        """
110        key = f'joins_mutiple_{self._n}-{self._m}_{self._jc}jc' + \
111              f'_{self._percentage}'
112        path = os.path.join(self._main_directory, self._engine,
113                            self._data_format, key)
114        self._logger.debug(f'Generating to {path}')
115        os.makedirs(path, exist_ok=True)
116        return path
117
118    def _generate_dataframe(self, member_offset: int = 1,
119                            property_offset: int = 1) -> DataFrame:
120        """Generate joins.
121
122        Parameters
123        ----------
124        member_offset : int
125            Offset to start member ID generation from. Default 1 (no offset).
126        property_offset : int
127            Offset to start property ID generation from. Default 1 (no offset).
128
129        Returns
130        -------
131        dataframe : DataFrame
132            Panda's DataFrame with generated joins.
133        """
134        subject_id = range(member_offset,
135                           self._number_of_members + member_offset)
136        value_id = range(property_offset,
137                         self._number_of_members + property_offset)
138        data: dict = {'id': subject_id}
139        n_ascii = len(string.ascii_letters)
140
141        for j in range(1, self._number_of_properties + 1):
142            # Append ASCII characters if necessary, use modulo to avoid out of
143            # range in ASCII table
144            append_value = ''
145            if self._value_size > 0:
146                append_value = '_'
147            for n in range(self._value_size):
148                append_value += string.ascii_letters[n % n_ascii]
149
150            # Generate value V_{property}_{member} honoring the value size
151            value = [f'V_{j}-{i}{append_value}' for i in value_id]
152            data[f'p{j}'] = value
153
154        return DataFrame(data)
155
156    def _update_many_on_many(self,
157                             dataframe1: DataFrame,
158                             dataframe2: DataFrame) -> Tuple[DataFrame,
159                                                             DataFrame]:
160        # 0% percentage results in zero matches for the join condition,
161        # don't even bother to try to match the dataframes
162        if self._percentage == 0.0:
163            return dataframe1, dataframe2
164
165        percentaged_members = \
166            self._number_of_members * (self._percentage / 100.0)
167
168        sample1 = dataframe1.iloc[random.sample(list(dataframe1.index),
169                                                int(percentaged_members))]
170        sample1_v = sample1.reset_index(drop=True)
171        sample2 = dataframe2.iloc[random.sample(list(dataframe2.index),
172                                                int(percentaged_members))]
173        sample2_v = sample2.reset_index(drop=True)
174
175        number_of_members_n = self._number_of_members * (self._percentage / 100.0)
176        number_of_members_m = self._number_of_members * (self._percentage / 100.0)
177        members_to_join_n = number_of_members_n / self._n
178        members_to_join_m = number_of_members_m / self._m
179
180        k = max(int(members_to_join_n + 0.5), int(members_to_join_m + 0.5))
181        sample_members = sample1_v.iloc[random.sample(list(sample1_v.index),
182                                                      k)]
183        values = list(set([m[1]['p1'] for m in sample_members.iterrows()]))[:int(members_to_join_m + 0.5)]
184        values = values * self._m
185        if len(values) > self._number_of_members:
186            values = values[:self._number_of_members]
187
188        sample2_v = dataframe2.iloc[random.sample(list(dataframe2.index),
189                                                  len(values))]
190
191        for jc in range(1, self._jc + 1):
192            for i, j in zip(values, list(sample2_v.index)):
193                dataframe2.loc[j, f'p{jc}'] = i
194
195        ####
196        values = list(set([m[1]['p1'] for m in sample_members.iterrows()]))[:int(members_to_join_n + 0.5)]
197        values = values * self._n
198        if len(values) > self._number_of_members:
199            values = values[:self._number_of_members]
200
201        sample1_v = dataframe1.iloc[random.sample(list(dataframe1.index),
202                                                  len(values))]
203        for jc in range(1, self._jc + 1):
204            for i, j in zip(values, list(sample1_v.index)):
205                dataframe1.loc[j, f'p{jc}'] = i
206
207        return dataframe1, dataframe2
208
209    def _add_join_multiple_predicate_object_map(self, mapping: Graph,
210                                                triplesmap_iri: URIRef,
211                                                predicate_value: URIRef,
212                                                object_value: Literal,
213                                                parent_triplesmap_iri: URIRef,
214                                                jc_values: list) -> Graph:
215        predicate_object_map_iri = BNode()
216        predicate_map_iri = BNode()
217        object_map_iri = BNode()
218
219        for jc in jc_values:
220            join_condition_iri = BNode()
221            mapping.add((join_condition_iri, R2RML.child, jc['child']))
222            mapping.add((join_condition_iri, R2RML.parent, jc['parent']))
223            mapping.add((join_condition_iri, RDF.type, R2RML.JoinCondition))
224            mapping.add((object_map_iri, R2RML.joinCondition, join_condition_iri))
225
226        mapping.add((predicate_map_iri, R2RML.constant, predicate_value))
227        mapping.add((predicate_map_iri, RDF.type, R2RML.PredicateMap))
228        mapping.add((object_map_iri, RDF.type, R2RML.ReferenceObjectMap))
229        mapping.add((object_map_iri, R2RML.parentTriplesMap, parent_triplesmap_iri))
230        mapping.add((predicate_object_map_iri, R2RML.predicateMap, predicate_map_iri))
231        mapping.add((predicate_object_map_iri, R2RML.objectMap, object_map_iri))
232        mapping.add((predicate_object_map_iri, RDF.type, R2RML.PredicateObjectMap))
233        mapping.add((triplesmap_iri, R2RML.predicateObjectMap, predicate_object_map_iri))
234
235        return mapping
236
237    def _generate_mapping(self) -> Graph:
238        """Generate a [R2]RML mapping for a Joins instance.
239
240        Returns
241        -------
242        mapping : Graph
243            [R2]RML mapping as an RDFLib Graph.
244        """
245        mapping: Graph = Graph(base='http://ex.com/')
246        mapping.bind('rr', R2RML)
247        mapping.bind('ql', QL)
248        mapping.bind('ex', EX)
249        subject1_template = Literal('http://ex.com/table1/{id}')
250        subject2_template = Literal('http://ex.com/table2/{id}')
251        if self._data_format == 'postgresql':
252            triples_map1_iri = self._add_triples_map(mapping,
253                                                     subject1_template,
254                                                     Literal('data'), number=1)
255            triples_map2_iri = self._add_triples_map(mapping,
256                                                     subject2_template,
257                                                     Literal('data'), number=2)
258        elif self._data_format == 'csv':
259            triples_map1_iri = \
260                self._add_triples_map_source(mapping, subject1_template,
261                                             Literal('/data/shared/data1.csv'),
262                                             number=1)
263            triples_map2_iri = \
264                self._add_triples_map_source(mapping, subject1_template,
265                                             Literal('/data/shared/data2.csv'),
266                                             number=2)
267        else:
268            raise NotImplementedError(f'{self._data_format} not implemented')
269
270        jc_values = []
271        for i in range(1, self._jc + 1):
272            jc_values.append({
273                'child': Literal(f'p{i}'),
274                'parent': Literal(f'p{i}')
275            })
276
277        self._add_join_multiple_predicate_object_map(mapping, triples_map1_iri,
278                                                     EX['j1'], Literal('p1'),
279                                                     triples_map2_iri,
280                                                     jc_values)
281
282        return mapping
283
284    def _generate_csv(self) -> bool:
285        """Generate the instance as CSV files.
286
287        Returns
288        -------
289        success : bool
290            True if successfull, false otherwise
291        """
292        os.makedirs(os.path.join(self.path(), 'data', 'shared'), exist_ok=True)
293        data1_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE1)
294        dataframe1 = self._generate_dataframe()
295        data2_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE2)
296        dataframe2 = self._generate_dataframe(self._number_of_members + 1,
297                                              self._number_of_properties + 1)
298        dataframe1, dataframe2 = self._update_many_on_many(dataframe1,
299                                                           dataframe2)
300        dataframe1.to_csv(data1_path, index=False)
301        dataframe2.to_csv(data2_path, index=False)
302
303        mapping_path = os.path.join(self.path(), 'data', 'shared', CSV_MAPPING_FILE)
304        mapping: Graph = self._generate_mapping()
305        mapping.serialize(destination=mapping_path, format='turtle')
306        self._generate_scenario()
307
308        return True
309
310    def _generate_postgresql(self) -> bool:
311        """Generate the instance as PostgreSQL with CSV files to load.
312
313        Returns
314        -------
315        success : bool
316            True if successfull, false otherwise
317        """
318        os.makedirs(os.path.join(self.path(), 'data', 'shared'), exist_ok=True)
319        data1_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE1)
320        self._generate_dataframe().to_csv(data1_path, index=False)
321        data2_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE2)
322        self._generate_dataframe().to_csv(data2_path, index=False)
323
324        mapping_path = os.path.join(self.path(), 'data', 'shared', RDB_MAPPING_FILE)
325        mapping: Graph = self._generate_mapping()
326        mapping.serialize(destination=mapping_path, format='turtle')
327        self._generate_scenario()
328
329        return True
330
331    def _generate_scenario(self) -> bool:
332        """Generate the metadata for this scenario.
333
334        Configures the execution pipeline automatically.
335
336        Returns
337        -------
338        success : bool
339            True if successfull, false otherwise
340        """
341        name: str = f'join_multiple_{self._n}-{self._m}_{self._jc}_{self._percentage}'
342        description: str = f'Join Multiple {self._n}-{self._m} {self._jc}JC {self._percentage}% '
343        iri: str = f'http://example.org/join-percentage/{self._percentage}/'
344
345        if self._data_format == 'postgresql':
346            return self._generate_metadata(iri, name, description,
347                                           RDB_MAPPING_FILE)
348        elif self._data_format == 'csv':
349            return self._generate_metadata(iri, name, description,
350                                           CSV_MAPPING_FILE)
351        else:
352            raise NotImplementedError(f'{self._data_format} not implemented')
353
354        return False
# File names of the two generated data tables.
DATA_FILE1 = 'data1.csv'
DATA_FILE2 = 'data2.csv'
# File names of the serialized mapping: R2RML for the PostgreSQL format,
# RML for the CSV format.
RDB_MAPPING_FILE = 'mapping.r2rml.ttl'
CSV_MAPPING_FILE = 'mapping.rml.ttl'
# Namespaces bound as prefixes 'rr', 'ql' and 'ex' in the generated mapping.
R2RML = Namespace('http://www.w3.org/ns/r2rml#')
QL = Namespace('http://semweb.mmlab.be/ns/ql#')
EX = Namespace('http://example.com/')
# NOTE(review): not referenced by the JoinsMultiple class in this listing.
MEMBERS_PERCENTAGE = 50.0
class JoinsMultiple(bench_generator.scenario.Scenario):
 29class JoinsMultiple(Scenario):
 30    def __init__(self, main_directory: str, verbose: bool, percentage: float,
 31                 n: int, m: int, jc: int, number_of_members: int,
 32                 number_of_properties: int, value_size: int, data_format: str,
 33                 engine: str, seed: int = 0):
 34        """Initialize a Joins Multiple scenario.
 35
 36        Member's percentage is always set to 50%.
 37
 38        Parameters
 39        ----------
 40        main_directory : str
 41            Root directory for generating instances of Joins Multiple.
 42        verbose : bool
 43            Verbose logging enabled or not.
 44        percentage : float
 45            Percentage of relations which should result into a join.
 46        n : int
 47            Relation size N.
 48        m : int
 49            Relation size M.
 50        jc : int
 51            Number of Join Conditions.
 52        number_of_members : int
 53            Number of members to generate, for example 5000 for 5K rows in a
 54            tabular data structure.
 55        number_of_properties : int
 56            Number of properties per member to generate, for example 20 for
 57            20 columns in a tabular data structure.
 58        value_size : int
 59            Number of characters to add to default value generation,
 60            for example: 256 will expand all values to 256 characters.
 61        data_format : str
 62            Data format to use for generating the data set, for example:
 63            "csv", "json", "xml", "postgresql", "mysql"
 64        engine : str
 65            Engine to use for execution of the generated scenario's instance,
 66            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
 67            or "OntopMaterialize"
 68        seed : int
 69            Random seed to use, default 0.
 70        """
 71        self._percentage = percentage
 72        self._n = n
 73        self._m = m
 74        self._jc = jc
 75        self._number_of_members: int = number_of_members
 76        self._number_of_properties: int = number_of_properties
 77        self._value_size: int = value_size
 78        random.seed(seed)
 79
 80        super().__init__(data_format, engine, main_directory, verbose)
 81
 82        if self._data_format != 'csv':
 83            raise NotImplementedError(f'Data format {self._data_format} '
 84                                      f'is not implemented by {__name__}')
 85
 86        self._logger = Logger(__name__, self._main_directory, self._verbose)
 87        self._logger.debug(f'Generating join relations {self._n}-{self._m}'
 88                           f' with {self._percentage}% of relations,')
 89
 90    def generate(self) -> bool:
 91        """Generate the instance using the Joins Multiple scenario.
 92
 93        Only CSV files are currently implemented!
 94        """
 95        if self._data_format == 'csv':
 96            return self._generate_csv()
 97        elif self._data_format == 'postgresql':
 98            return self._generate_postgresql()
 99        else:
100            raise NotImplementedError(f'Data format {self._data_format} '
101                                      f'is not implemented by {__name__}')
102
103    def path(self) -> str:
104        """Builds the file path for the instance of a Joins Multiple scenario.
105
106        Returns
107        -------
108        path : str
109            File path for the Joins Multiple's instance.
110        """
111        key = f'joins_mutiple_{self._n}-{self._m}_{self._jc}jc' + \
112              f'_{self._percentage}'
113        path = os.path.join(self._main_directory, self._engine,
114                            self._data_format, key)
115        self._logger.debug(f'Generating to {path}')
116        os.makedirs(path, exist_ok=True)
117        return path
118
119    def _generate_dataframe(self, member_offset: int = 1,
120                            property_offset: int = 1) -> DataFrame:
121        """Generate joins.
122
123        Parameters
124        ----------
125        member_offset : int
126            Offset to start member ID generation from. Default 1 (no offset).
127        property_offset : int
128            Offset to start property ID generation from. Default 1 (no offset).
129
130        Returns
131        -------
132        dataframe : DataFrame
133            Panda's DataFrame with generated joins.
134        """
135        subject_id = range(member_offset,
136                           self._number_of_members + member_offset)
137        value_id = range(property_offset,
138                         self._number_of_members + property_offset)
139        data: dict = {'id': subject_id}
140        n_ascii = len(string.ascii_letters)
141
142        for j in range(1, self._number_of_properties + 1):
143            # Append ASCII characters if necessary, use modulo to avoid out of
144            # range in ASCII table
145            append_value = ''
146            if self._value_size > 0:
147                append_value = '_'
148            for n in range(self._value_size):
149                append_value += string.ascii_letters[n % n_ascii]
150
151            # Generate value V_{property}_{member} honoring the value size
152            value = [f'V_{j}-{i}{append_value}' for i in value_id]
153            data[f'p{j}'] = value
154
155        return DataFrame(data)
156
157    def _update_many_on_many(self,
158                             dataframe1: DataFrame,
159                             dataframe2: DataFrame) -> Tuple[DataFrame,
160                                                             DataFrame]:
161        # 0% percentage results in zero matches for the join condition,
162        # don't even bother to try to match the dataframes
163        if self._percentage == 0.0:
164            return dataframe1, dataframe2
165
166        percentaged_members = \
167            self._number_of_members * (self._percentage / 100.0)
168
169        sample1 = dataframe1.iloc[random.sample(list(dataframe1.index),
170                                                int(percentaged_members))]
171        sample1_v = sample1.reset_index(drop=True)
172        sample2 = dataframe2.iloc[random.sample(list(dataframe2.index),
173                                                int(percentaged_members))]
174        sample2_v = sample2.reset_index(drop=True)
175
176        number_of_members_n = self._number_of_members * (self._percentage / 100.0)
177        number_of_members_m = self._number_of_members * (self._percentage / 100.0)
178        members_to_join_n = number_of_members_n / self._n
179        members_to_join_m = number_of_members_m / self._m
180
181        k = max(int(members_to_join_n + 0.5), int(members_to_join_m + 0.5))
182        sample_members = sample1_v.iloc[random.sample(list(sample1_v.index),
183                                                      k)]
184        values = list(set([m[1]['p1'] for m in sample_members.iterrows()]))[:int(members_to_join_m + 0.5)]
185        values = values * self._m
186        if len(values) > self._number_of_members:
187            values = values[:self._number_of_members]
188
189        sample2_v = dataframe2.iloc[random.sample(list(dataframe2.index),
190                                                  len(values))]
191
192        for jc in range(1, self._jc + 1):
193            for i, j in zip(values, list(sample2_v.index)):
194                dataframe2.loc[j, f'p{jc}'] = i
195
196        ####
197        values = list(set([m[1]['p1'] for m in sample_members.iterrows()]))[:int(members_to_join_n + 0.5)]
198        values = values * self._n
199        if len(values) > self._number_of_members:
200            values = values[:self._number_of_members]
201
202        sample1_v = dataframe1.iloc[random.sample(list(dataframe1.index),
203                                                  len(values))]
204        for jc in range(1, self._jc + 1):
205            for i, j in zip(values, list(sample1_v.index)):
206                dataframe1.loc[j, f'p{jc}'] = i
207
208        return dataframe1, dataframe2
209
210    def _add_join_multiple_predicate_object_map(self, mapping: Graph,
211                                                triplesmap_iri: URIRef,
212                                                predicate_value: URIRef,
213                                                object_value: Literal,
214                                                parent_triplesmap_iri: URIRef,
215                                                jc_values: list) -> Graph:
216        predicate_object_map_iri = BNode()
217        predicate_map_iri = BNode()
218        object_map_iri = BNode()
219
220        for jc in jc_values:
221            join_condition_iri = BNode()
222            mapping.add((join_condition_iri, R2RML.child, jc['child']))
223            mapping.add((join_condition_iri, R2RML.parent, jc['parent']))
224            mapping.add((join_condition_iri, RDF.type, R2RML.JoinCondition))
225            mapping.add((object_map_iri, R2RML.joinCondition, join_condition_iri))
226
227        mapping.add((predicate_map_iri, R2RML.constant, predicate_value))
228        mapping.add((predicate_map_iri, RDF.type, R2RML.PredicateMap))
229        mapping.add((object_map_iri, RDF.type, R2RML.ReferenceObjectMap))
230        mapping.add((object_map_iri, R2RML.parentTriplesMap, parent_triplesmap_iri))
231        mapping.add((predicate_object_map_iri, R2RML.predicateMap, predicate_map_iri))
232        mapping.add((predicate_object_map_iri, R2RML.objectMap, object_map_iri))
233        mapping.add((predicate_object_map_iri, RDF.type, R2RML.PredicateObjectMap))
234        mapping.add((triplesmap_iri, R2RML.predicateObjectMap, predicate_object_map_iri))
235
236        return mapping
237
238    def _generate_mapping(self) -> Graph:
239        """Generate a [R2]RML mapping for a Joins instance.
240
241        Returns
242        -------
243        mapping : Graph
244            [R2]RML mapping as an RDFLib Graph.
245        """
246        mapping: Graph = Graph(base='http://ex.com/')
247        mapping.bind('rr', R2RML)
248        mapping.bind('ql', QL)
249        mapping.bind('ex', EX)
250        subject1_template = Literal('http://ex.com/table1/{id}')
251        subject2_template = Literal('http://ex.com/table2/{id}')
252        if self._data_format == 'postgresql':
253            triples_map1_iri = self._add_triples_map(mapping,
254                                                     subject1_template,
255                                                     Literal('data'), number=1)
256            triples_map2_iri = self._add_triples_map(mapping,
257                                                     subject2_template,
258                                                     Literal('data'), number=2)
259        elif self._data_format == 'csv':
260            triples_map1_iri = \
261                self._add_triples_map_source(mapping, subject1_template,
262                                             Literal('/data/shared/data1.csv'),
263                                             number=1)
264            triples_map2_iri = \
265                self._add_triples_map_source(mapping, subject1_template,
266                                             Literal('/data/shared/data2.csv'),
267                                             number=2)
268        else:
269            raise NotImplementedError(f'{self._data_format} not implemented')
270
271        jc_values = []
272        for i in range(1, self._jc + 1):
273            jc_values.append({
274                'child': Literal(f'p{i}'),
275                'parent': Literal(f'p{i}')
276            })
277
278        self._add_join_multiple_predicate_object_map(mapping, triples_map1_iri,
279                                                     EX['j1'], Literal('p1'),
280                                                     triples_map2_iri,
281                                                     jc_values)
282
283        return mapping
284
285    def _generate_csv(self) -> bool:
286        """Generate the instance as CSV files.
287
288        Returns
289        -------
290        success : bool
291            True if successfull, false otherwise
292        """
293        os.makedirs(os.path.join(self.path(), 'data', 'shared'), exist_ok=True)
294        data1_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE1)
295        dataframe1 = self._generate_dataframe()
296        data2_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE2)
297        dataframe2 = self._generate_dataframe(self._number_of_members + 1,
298                                              self._number_of_properties + 1)
299        dataframe1, dataframe2 = self._update_many_on_many(dataframe1,
300                                                           dataframe2)
301        dataframe1.to_csv(data1_path, index=False)
302        dataframe2.to_csv(data2_path, index=False)
303
304        mapping_path = os.path.join(self.path(), 'data', 'shared', CSV_MAPPING_FILE)
305        mapping: Graph = self._generate_mapping()
306        mapping.serialize(destination=mapping_path, format='turtle')
307        self._generate_scenario()
308
309        return True
310
311    def _generate_postgresql(self) -> bool:
312        """Generate the instance as PostgreSQL with CSV files to load.
313
314        Returns
315        -------
316        success : bool
317            True if successfull, false otherwise
318        """
319        os.makedirs(os.path.join(self.path(), 'data', 'shared'), exist_ok=True)
320        data1_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE1)
321        self._generate_dataframe().to_csv(data1_path, index=False)
322        data2_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE2)
323        self._generate_dataframe().to_csv(data2_path, index=False)
324
325        mapping_path = os.path.join(self.path(), 'data', 'shared', RDB_MAPPING_FILE)
326        mapping: Graph = self._generate_mapping()
327        mapping.serialize(destination=mapping_path, format='turtle')
328        self._generate_scenario()
329
330        return True
331
332    def _generate_scenario(self) -> bool:
333        """Generate the metadata for this scenario.
334
335        Configures the execution pipeline automatically.
336
337        Returns
338        -------
339        success : bool
340            True if successfull, false otherwise
341        """
342        name: str = f'join_multiple_{self._n}-{self._m}_{self._jc}_{self._percentage}'
343        description: str = f'Join Multiple {self._n}-{self._m} {self._jc}JC {self._percentage}% '
344        iri: str = f'http://example.org/join-percentage/{self._percentage}/'
345
346        if self._data_format == 'postgresql':
347            return self._generate_metadata(iri, name, description,
348                                           RDB_MAPPING_FILE)
349        elif self._data_format == 'csv':
350            return self._generate_metadata(iri, name, description,
351                                           CSV_MAPPING_FILE)
352        else:
353            raise NotImplementedError(f'{self._data_format} not implemented')
354
355        return False

Helper class that provides a standard way to create an ABC using inheritance.

JoinsMultiple( main_directory: str, verbose: bool, percentage: float, n: int, m: int, jc: int, number_of_members: int, number_of_properties: int, value_size: int, data_format: str, engine: str, seed: int = 0)
30    def __init__(self, main_directory: str, verbose: bool, percentage: float,
31                 n: int, m: int, jc: int, number_of_members: int,
32                 number_of_properties: int, value_size: int, data_format: str,
33                 engine: str, seed: int = 0):
34        """Initialize a Joins Multiple scenario.
35
36        Member's percentage is always set to 50%.
37
38        Parameters
39        ----------
40        main_directory : str
41            Root directory for generating instances of Joins Multiple.
42        verbose : bool
43            Verbose logging enabled or not.
44        percentage : float
45            Percentage of relations which should result into a join.
46        n : int
47            Relation size N.
48        m : int
49            Relation size M.
50        jc : int
51            Number of Join Conditions.
52        number_of_members : int
53            Number of members to generate, for example 5000 for 5K rows in a
54            tabular data structure.
55        number_of_properties : int
56            Number of properties per member to generate, for example 20 for
57            20 columns in a tabular data structure.
58        value_size : int
59            Number of characters to add to default value generation,
60            for example: 256 will expand all values to 256 characters.
61        data_format : str
62            Data format to use for generating the data set, for example:
63            "csv", "json", "xml", "postgresql", "mysql"
64        engine : str
65            Engine to use for execution of the generated scenario's instance,
66            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
67            or "OntopMaterialize"
68        seed : int
69            Random seed to use, default 0.
70        """
71        self._percentage = percentage
72        self._n = n
73        self._m = m
74        self._jc = jc
75        self._number_of_members: int = number_of_members
76        self._number_of_properties: int = number_of_properties
77        self._value_size: int = value_size
78        random.seed(seed)
79
80        super().__init__(data_format, engine, main_directory, verbose)
81
82        if self._data_format != 'csv':
83            raise NotImplementedError(f'Data format {self._data_format} '
84                                      f'is not implemented by {__name__}')
85
86        self._logger = Logger(__name__, self._main_directory, self._verbose)
87        self._logger.debug(f'Generating join relations {self._n}-{self._m}'
88                           f' with {self._percentage}% of relations,')

Initialize a Joins Multiple scenario.

Member's percentage is always set to 50%.

Parameters
  • main_directory (str): Root directory for generating instances of Joins Multiple.
  • verbose (bool): Verbose logging enabled or not.
  • percentage (float): Percentage of relations which should result into a join.
  • n (int): Relation size N.
  • m (int): Relation size M.
  • jc (int): Number of Join Conditions.
  • number_of_members (int): Number of members to generate, for example 5000 for 5K rows in a tabular data structure.
  • number_of_properties (int): Number of properties per member to generate, for example 20 for 20 columns in a tabular data structure.
  • value_size (int): Number of characters to add to default value generation, for example: 256 will expand all values to 256 characters.
  • data_format (str): Data format to use for generating the data set, for example: "csv", "json", "xml", "postgresql", "mysql"
  • engine (str): Engine to use for execution of the generated scenario's instance, for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC", or "OntopMaterialize"
  • seed (int): Random seed to use, default 0.
def generate(self) -> bool:
 90    def generate(self) -> bool:
 91        """Generate the instance using the Joins Multiple scenario.
 92
 93        Only CSV files are currently implemented!
 94        """
 95        if self._data_format == 'csv':
 96            return self._generate_csv()
 97        elif self._data_format == 'postgresql':
 98            return self._generate_postgresql()
 99        else:
100            raise NotImplementedError(f'Data format {self._data_format} '
101                                      f'is not implemented by {__name__}')

Generate the instance using the Joins Multiple scenario.

Only CSV files are currently implemented!

def path(self) -> str:
103    def path(self) -> str:
104        """Builds the file path for the instance of a Joins Multiple scenario.
105
106        Returns
107        -------
108        path : str
109            File path for the Joins Multiple's instance.
110        """
111        key = f'joins_mutiple_{self._n}-{self._m}_{self._jc}jc' + \
112              f'_{self._percentage}'
113        path = os.path.join(self._main_directory, self._engine,
114                            self._data_format, key)
115        self._logger.debug(f'Generating to {path}')
116        os.makedirs(path, exist_ok=True)
117        return path

Builds the file path for the instance of a Joins Multiple scenario.

Returns
  • path (str): File path for the Joins Multiple's instance.