bench_generator.joins_relation

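This module holds the JoinsRelation class, which generates two data sets whose members are joined in an N-M relation for a configurable percentage of the members. The size of each data set is determined by the number of members, such as the number of rows for tabular data.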

  1#!/usr/bin/env python3
  2
  3"""
  4This module holds the Joins class which scales the dataset size
  5by the number of members in a dataset such as number of rows for tabular data.
  6"""
  7
  8import os
  9import string
 10import random
 11from typing import Tuple
 12from pandas import DataFrame
 13from rdflib.namespace import RDF
 14from rdflib import Graph, URIRef, BNode, Literal, Namespace
 15from bench_generator.scenario import Scenario
 16from bench_generator.logger import Logger
 17
# File names of the two generated tables; they are written below
# <instance path>/data/shared.
DATA_FILE1 = 'data1.csv'
DATA_FILE2 = 'data2.csv'
# Mapping file name used for the PostgreSQL and the CSV data format
# respectively; both currently have the same value.
RDB_MAPPING_FILE = 'mapping.r2rml.ttl'
CSV_MAPPING_FILE = 'mapping.r2rml.ttl'
# Namespaces bound to the prefixes 'rr', 'ql' and 'ex' in the generated mapping.
R2RML = Namespace('http://www.w3.org/ns/r2rml#')
QL = Namespace('http://semweb.mmlab.be/ns/ql#')
EX = Namespace('http://example.com/')
# NOTE(review): not referenced anywhere in the visible part of this module;
# confirm whether it is still needed.
MEMBERS_PERCENTAGE = 50.0
 26
 27
 28class JoinsRelation(Scenario):
 29    def __init__(self, main_directory: str, verbose: bool, percentage: float,
 30                 n: int, m: int, number_of_members: int,
 31                 number_of_properties: int, value_size: int, data_format: str,
 32                 engine: str, seed: int = 0):
 33        """Initialize a Joins Relations scenario.
 34
 35        Member's percentage is always set to 50%.
 36
 37        Parameters
 38        ----------
 39        main_directory : str
 40            Root directory for generating instances of Joins Relations.
 41        verbose : bool
 42            Verbose logging enabled or not.
 43        percentage : float
 44            Percentage of relations which should result into a join.
 45        n : int
 46            Relation size N.
 47        m : int
 48            Relation size M.
 49        number_of_members : int
 50            Number of members to generate, for example 5000 for 5K rows in a
 51            tabular data structure.
 52        number_of_properties : int
 53            Number of properties per member to generate, for example 20 for
 54            20 columns in a tabular data structure.
 55        value_size : int
 56            Number of characters to add to default value generation,
 57            for example: 256 will expand all values to 256 characters.
 58        data_format : str
 59            Data format to use for generating the data set, for example:
 60            "csv", "json", "xml", "postgresql", "mysql"
 61        engine : str
 62            Engine to use for execution of the generated scenario's instance,
 63            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
 64            or "OntopMaterialize"
 65        seed : int
 66            Random seed to use, default 0.
 67        """
 68        self._percentage = percentage
 69        self._n = n
 70        self._m = m
 71        self._number_of_members: int = number_of_members
 72        self._number_of_properties: int = number_of_properties
 73        self._value_size: int = value_size
 74        random.seed(seed)
 75
 76        super().__init__(data_format, engine, main_directory, verbose)
 77
 78        if self._data_format != 'csv':
 79            raise NotImplementedError(f'Data format {self._data_format} '
 80                                      f'is not implemented by {__name__}')
 81
 82        self._logger = Logger(__name__, self._main_directory, self._verbose)
 83        self._logger.debug(f'Generating join relations {self._n}-{self._m}'
 84                           f' with {self._percentage}% of relations,')
 85
 86    def generate(self) -> bool:
 87        """Generate the instance using the Joins Relations scenario.
 88
 89        Only CSV files are currently implemented!
 90        """
 91        if self._data_format == 'csv':
 92            return self._generate_csv()
 93        elif self._data_format == 'postgresql':
 94            return self._generate_postgresql()
 95        else:
 96            raise NotImplementedError(f'Data format {self._data_format} '
 97                                      f'is not implemented by {__name__}')
 98
 99    def path(self) -> str:
100        """Builds the file path for the instance of a Joins Relations scenario.
101
102        Returns
103        -------
104        path : str
105            File path for the Joins Relations's instance.
106        """
107        key = f'joins_relations_{self._n}-{self._m}_{self._percentage}'
108        path = os.path.join(self._main_directory, self._engine,
109                            self._data_format, key)
110        self._logger.debug(f'Generating to {path}')
111        os.makedirs(path, exist_ok=True)
112        return path
113
114    def _generate_dataframe(self, member_offset: int = 1,
115                            property_offset: int = 1) -> DataFrame:
116        """Generate joins.
117
118        Parameters
119        ----------
120        member_offset : int
121            Offset to start member ID generation from. Default 1 (no offset).
122        property_offset : int
123            Offset to start property ID generation from. Default 1 (no offset).
124
125        Returns
126        -------
127        dataframe : DataFrame
128            Panda's DataFrame with generated joins.
129        """
130        subject_id = range(member_offset,
131                           self._number_of_members + member_offset)
132        value_id = range(property_offset,
133                         self._number_of_members + property_offset)
134        data: dict = {'id': subject_id}
135        n_ascii = len(string.ascii_letters)
136
137        for j in range(1, self._number_of_properties + 1):
138            # Append ASCII characters if necessary, use modulo to avoid out of
139            # range in ASCII table
140            append_value = ''
141            if self._value_size > 0:
142                append_value = '_'
143            for n in range(self._value_size):
144                append_value += string.ascii_letters[n % n_ascii]
145
146            # Generate value V_{property}_{member} honoring the value size
147            value = [f'V_{j}-{i}{append_value}' for i in value_id]
148            data[f'p{j}'] = value
149
150        return DataFrame(data)
151
152    def _update_many_on_many(self,
153                             dataframe1: DataFrame,
154                             dataframe2: DataFrame) -> Tuple[DataFrame,
155                                                             DataFrame]:
156        # 0% percentage results in zero matches for the join condition,
157        # don't even bother to try to match the dataframes
158        if self._percentage == 0.0:
159            return dataframe1, dataframe2
160
161        percentaged_members = \
162            self._number_of_members * (self._percentage / 100.0)
163
164        sample1 = dataframe1.iloc[random.sample(list(dataframe1.index),
165                                                int(percentaged_members))]
166        sample1_v = sample1.reset_index(drop=True)
167        sample2 = dataframe2.iloc[random.sample(list(dataframe2.index),
168                                                int(percentaged_members))]
169        sample2_v = sample2.reset_index(drop=True)
170
171        number_of_members_n = self._number_of_members * (self._percentage / 100.0)
172        number_of_members_m = self._number_of_members * (self._percentage / 100.0)
173        members_to_join_n = number_of_members_n / self._n
174        members_to_join_m = number_of_members_m / self._m
175
176        k = max(int(members_to_join_n + 0.5), int(members_to_join_m + 0.5))
177        sample_members = sample1_v.iloc[random.sample(list(sample1_v.index),
178                                                      k)]
179        values = list(set([m[1]['p1'] for m in sample_members.iterrows()]))[:int(members_to_join_m + 0.5)]
180        values = values * self._m
181        if len(values) > self._number_of_members:
182            values = values[:self._number_of_members]
183
184        sample2_v = dataframe2.iloc[random.sample(list(dataframe2.index),
185                                                  len(values))]
186
187        for i, j in zip(values, list(sample2_v.index)):
188            dataframe2.loc[j, 'p1'] = i
189
190        ####
191        values = list(set([m[1]['p1'] for m in sample_members.iterrows()]))[:int(members_to_join_n + 0.5)]
192        values = values * self._n
193        if len(values) > self._number_of_members:
194            values = values[:self._number_of_members]
195
196        sample1_v = dataframe1.iloc[random.sample(list(dataframe1.index),
197                                                  len(values))]
198        for i, j in zip(values, list(sample1_v.index)):
199            dataframe1.loc[j, 'p1'] = i
200
201        return dataframe1, dataframe2
202
203    def _generate_mapping(self) -> Graph:
204        """Generate a [R2]RML mapping for a Joins instance.
205
206        Returns
207        -------
208        mapping : Graph
209            [R2]RML mapping as an RDFLib Graph.
210        """
211        mapping: Graph = Graph(base='http://ex.com/')
212        mapping.bind('rr', R2RML)
213        mapping.bind('ql', QL)
214        mapping.bind('ex', EX)
215        subject1_template = Literal('http://ex.com/table1/{id}')
216        subject2_template = Literal('http://ex.com/table2/{id}')
217        if self._data_format == 'postgresql':
218            triples_map1_iri = self._add_triples_map(mapping,
219                                                     subject1_template,
220                                                     Literal('data'), number=1)
221            triples_map2_iri = self._add_triples_map(mapping,
222                                                     subject2_template,
223                                                     Literal('data'), number=2)
224        elif self._data_format == 'csv':
225            triples_map1_iri = \
226                self._add_triples_map_source(mapping, subject1_template,
227                                             Literal('/data/shared/data1.csv'),
228                                             number=1)
229            triples_map2_iri = \
230                self._add_triples_map_source(mapping, subject1_template,
231                                             Literal('/data/shared/data2.csv'),
232                                             number=2)
233        else:
234            raise NotImplementedError(f'{self._data_format} not implemented')
235
236        self._add_join_predicate_object_map(mapping, triples_map1_iri,
237                                            EX['j1'], Literal('p1'),
238                                            triples_map2_iri, Literal('p1'),
239                                            Literal('p1'))
240
241        return mapping
242
243    def _generate_csv(self) -> bool:
244        """Generate the instance as CSV files.
245
246        Returns
247        -------
248        success : bool
249            True if successfull, false otherwise
250        """
251        os.makedirs(os.path.join(self.path(), 'data', 'shared'), exist_ok=True)
252        data1_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE1)
253        dataframe1 = self._generate_dataframe()
254        data2_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE2)
255        dataframe2 = self._generate_dataframe(self._number_of_members + 1,
256                                              self._number_of_properties + 1)
257        dataframe1, dataframe2 = self._update_many_on_many(dataframe1,
258                                                           dataframe2)
259        dataframe1.to_csv(data1_path, index=False)
260        dataframe2.to_csv(data2_path, index=False)
261
262        mapping_path = os.path.join(self.path(), 'data', 'shared', CSV_MAPPING_FILE)
263        mapping: Graph = self._generate_mapping()
264        mapping.serialize(destination=mapping_path, format='turtle')
265        self._generate_scenario()
266
267        return True
268
269    def _generate_postgresql(self) -> bool:
270        """Generate the instance as PostgreSQL with CSV files to load.
271
272        Returns
273        -------
274        success : bool
275            True if successfull, false otherwise
276        """
277        os.makedirs(os.path.join(self.path(), 'data', 'shared'), exist_ok=True)
278        data1_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE1)
279        dataframe1 = self._generate_dataframe()
280        data2_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE2)
281        dataframe2 = self._generate_dataframe(self._number_of_members + 1,
282                                              self._number_of_properties + 1)
283        dataframe1, dataframe2 = self._update_many_on_many(dataframe1,
284                                                           dataframe2)
285        dataframe1.to_csv(data1_path, index=False)
286        dataframe2.to_csv(data2_path, index=False)
287
288        mapping_path = os.path.join(self.path(), 'data', 'shared', RDB_MAPPING_FILE)
289        mapping: Graph = self._generate_mapping()
290        mapping.serialize(destination=mapping_path, format='turtle')
291        self._generate_scenario()
292
293        return True
294
295    def _generate_scenario(self) -> bool:
296        """Generate the metadata for this scenario.
297
298        Configures the execution pipeline automatically.
299
300        Returns
301        -------
302        success : bool
303            True if successfull, false otherwise
304        """
305        name: str = f'join_relation_{self._n}_{self._m}_{self._percentage}'
306        description: str = f'Join Relation {self._n}-{self._m} {self._percentage}% '
307        iri: str = f'http://example.org/join-relation/{self._n}-{self._m}/{self._percentage}/'
308
309        if self._data_format == 'postgresql':
310            return self._generate_metadata(iri, name, description,
311                                           RDB_MAPPING_FILE)
312        elif self._data_format == 'csv':
313            return self._generate_metadata(iri, name, description,
314                                           CSV_MAPPING_FILE)
315        else:
316            raise NotImplementedError(f'{self._data_format} not implemented')
317
318        return False
# File names of the two generated tables; they are written below
# <instance path>/data/shared.
DATA_FILE1 = 'data1.csv'
DATA_FILE2 = 'data2.csv'
# Mapping file name used for the PostgreSQL and the CSV data format
# respectively; both currently have the same value.
RDB_MAPPING_FILE = 'mapping.r2rml.ttl'
CSV_MAPPING_FILE = 'mapping.r2rml.ttl'
# Namespaces bound to the prefixes 'rr', 'ql' and 'ex' in the generated mapping.
R2RML = Namespace('http://www.w3.org/ns/r2rml#')
QL = Namespace('http://semweb.mmlab.be/ns/ql#')
EX = Namespace('http://example.com/')
# NOTE(review): not referenced anywhere in the visible part of this module;
# confirm whether it is still needed.
MEMBERS_PERCENTAGE = 50.0
class JoinsRelation(bench_generator.scenario.Scenario):
 29class JoinsRelation(Scenario):
 30    def __init__(self, main_directory: str, verbose: bool, percentage: float,
 31                 n: int, m: int, number_of_members: int,
 32                 number_of_properties: int, value_size: int, data_format: str,
 33                 engine: str, seed: int = 0):
 34        """Initialize a Joins Relations scenario.
 35
 36        Member's percentage is always set to 50%.
 37
 38        Parameters
 39        ----------
 40        main_directory : str
 41            Root directory for generating instances of Joins Relations.
 42        verbose : bool
 43            Verbose logging enabled or not.
 44        percentage : float
 45            Percentage of relations which should result into a join.
 46        n : int
 47            Relation size N.
 48        m : int
 49            Relation size M.
 50        number_of_members : int
 51            Number of members to generate, for example 5000 for 5K rows in a
 52            tabular data structure.
 53        number_of_properties : int
 54            Number of properties per member to generate, for example 20 for
 55            20 columns in a tabular data structure.
 56        value_size : int
 57            Number of characters to add to default value generation,
 58            for example: 256 will expand all values to 256 characters.
 59        data_format : str
 60            Data format to use for generating the data set, for example:
 61            "csv", "json", "xml", "postgresql", "mysql"
 62        engine : str
 63            Engine to use for execution of the generated scenario's instance,
 64            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
 65            or "OntopMaterialize"
 66        seed : int
 67            Random seed to use, default 0.
 68        """
 69        self._percentage = percentage
 70        self._n = n
 71        self._m = m
 72        self._number_of_members: int = number_of_members
 73        self._number_of_properties: int = number_of_properties
 74        self._value_size: int = value_size
 75        random.seed(seed)
 76
 77        super().__init__(data_format, engine, main_directory, verbose)
 78
 79        if self._data_format != 'csv':
 80            raise NotImplementedError(f'Data format {self._data_format} '
 81                                      f'is not implemented by {__name__}')
 82
 83        self._logger = Logger(__name__, self._main_directory, self._verbose)
 84        self._logger.debug(f'Generating join relations {self._n}-{self._m}'
 85                           f' with {self._percentage}% of relations,')
 86
 87    def generate(self) -> bool:
 88        """Generate the instance using the Joins Relations scenario.
 89
 90        Only CSV files are currently implemented!
 91        """
 92        if self._data_format == 'csv':
 93            return self._generate_csv()
 94        elif self._data_format == 'postgresql':
 95            return self._generate_postgresql()
 96        else:
 97            raise NotImplementedError(f'Data format {self._data_format} '
 98                                      f'is not implemented by {__name__}')
 99
100    def path(self) -> str:
101        """Builds the file path for the instance of a Joins Relations scenario.
102
103        Returns
104        -------
105        path : str
106            File path for the Joins Relations's instance.
107        """
108        key = f'joins_relations_{self._n}-{self._m}_{self._percentage}'
109        path = os.path.join(self._main_directory, self._engine,
110                            self._data_format, key)
111        self._logger.debug(f'Generating to {path}')
112        os.makedirs(path, exist_ok=True)
113        return path
114
115    def _generate_dataframe(self, member_offset: int = 1,
116                            property_offset: int = 1) -> DataFrame:
117        """Generate joins.
118
119        Parameters
120        ----------
121        member_offset : int
122            Offset to start member ID generation from. Default 1 (no offset).
123        property_offset : int
124            Offset to start property ID generation from. Default 1 (no offset).
125
126        Returns
127        -------
128        dataframe : DataFrame
129            Panda's DataFrame with generated joins.
130        """
131        subject_id = range(member_offset,
132                           self._number_of_members + member_offset)
133        value_id = range(property_offset,
134                         self._number_of_members + property_offset)
135        data: dict = {'id': subject_id}
136        n_ascii = len(string.ascii_letters)
137
138        for j in range(1, self._number_of_properties + 1):
139            # Append ASCII characters if necessary, use modulo to avoid out of
140            # range in ASCII table
141            append_value = ''
142            if self._value_size > 0:
143                append_value = '_'
144            for n in range(self._value_size):
145                append_value += string.ascii_letters[n % n_ascii]
146
147            # Generate value V_{property}_{member} honoring the value size
148            value = [f'V_{j}-{i}{append_value}' for i in value_id]
149            data[f'p{j}'] = value
150
151        return DataFrame(data)
152
153    def _update_many_on_many(self,
154                             dataframe1: DataFrame,
155                             dataframe2: DataFrame) -> Tuple[DataFrame,
156                                                             DataFrame]:
157        # 0% percentage results in zero matches for the join condition,
158        # don't even bother to try to match the dataframes
159        if self._percentage == 0.0:
160            return dataframe1, dataframe2
161
162        percentaged_members = \
163            self._number_of_members * (self._percentage / 100.0)
164
165        sample1 = dataframe1.iloc[random.sample(list(dataframe1.index),
166                                                int(percentaged_members))]
167        sample1_v = sample1.reset_index(drop=True)
168        sample2 = dataframe2.iloc[random.sample(list(dataframe2.index),
169                                                int(percentaged_members))]
170        sample2_v = sample2.reset_index(drop=True)
171
172        number_of_members_n = self._number_of_members * (self._percentage / 100.0)
173        number_of_members_m = self._number_of_members * (self._percentage / 100.0)
174        members_to_join_n = number_of_members_n / self._n
175        members_to_join_m = number_of_members_m / self._m
176
177        k = max(int(members_to_join_n + 0.5), int(members_to_join_m + 0.5))
178        sample_members = sample1_v.iloc[random.sample(list(sample1_v.index),
179                                                      k)]
180        values = list(set([m[1]['p1'] for m in sample_members.iterrows()]))[:int(members_to_join_m + 0.5)]
181        values = values * self._m
182        if len(values) > self._number_of_members:
183            values = values[:self._number_of_members]
184
185        sample2_v = dataframe2.iloc[random.sample(list(dataframe2.index),
186                                                  len(values))]
187
188        for i, j in zip(values, list(sample2_v.index)):
189            dataframe2.loc[j, 'p1'] = i
190
191        ####
192        values = list(set([m[1]['p1'] for m in sample_members.iterrows()]))[:int(members_to_join_n + 0.5)]
193        values = values * self._n
194        if len(values) > self._number_of_members:
195            values = values[:self._number_of_members]
196
197        sample1_v = dataframe1.iloc[random.sample(list(dataframe1.index),
198                                                  len(values))]
199        for i, j in zip(values, list(sample1_v.index)):
200            dataframe1.loc[j, 'p1'] = i
201
202        return dataframe1, dataframe2
203
204    def _generate_mapping(self) -> Graph:
205        """Generate a [R2]RML mapping for a Joins instance.
206
207        Returns
208        -------
209        mapping : Graph
210            [R2]RML mapping as an RDFLib Graph.
211        """
212        mapping: Graph = Graph(base='http://ex.com/')
213        mapping.bind('rr', R2RML)
214        mapping.bind('ql', QL)
215        mapping.bind('ex', EX)
216        subject1_template = Literal('http://ex.com/table1/{id}')
217        subject2_template = Literal('http://ex.com/table2/{id}')
218        if self._data_format == 'postgresql':
219            triples_map1_iri = self._add_triples_map(mapping,
220                                                     subject1_template,
221                                                     Literal('data'), number=1)
222            triples_map2_iri = self._add_triples_map(mapping,
223                                                     subject2_template,
224                                                     Literal('data'), number=2)
225        elif self._data_format == 'csv':
226            triples_map1_iri = \
227                self._add_triples_map_source(mapping, subject1_template,
228                                             Literal('/data/shared/data1.csv'),
229                                             number=1)
230            triples_map2_iri = \
231                self._add_triples_map_source(mapping, subject1_template,
232                                             Literal('/data/shared/data2.csv'),
233                                             number=2)
234        else:
235            raise NotImplementedError(f'{self._data_format} not implemented')
236
237        self._add_join_predicate_object_map(mapping, triples_map1_iri,
238                                            EX['j1'], Literal('p1'),
239                                            triples_map2_iri, Literal('p1'),
240                                            Literal('p1'))
241
242        return mapping
243
244    def _generate_csv(self) -> bool:
245        """Generate the instance as CSV files.
246
247        Returns
248        -------
249        success : bool
250            True if successfull, false otherwise
251        """
252        os.makedirs(os.path.join(self.path(), 'data', 'shared'), exist_ok=True)
253        data1_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE1)
254        dataframe1 = self._generate_dataframe()
255        data2_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE2)
256        dataframe2 = self._generate_dataframe(self._number_of_members + 1,
257                                              self._number_of_properties + 1)
258        dataframe1, dataframe2 = self._update_many_on_many(dataframe1,
259                                                           dataframe2)
260        dataframe1.to_csv(data1_path, index=False)
261        dataframe2.to_csv(data2_path, index=False)
262
263        mapping_path = os.path.join(self.path(), 'data', 'shared', CSV_MAPPING_FILE)
264        mapping: Graph = self._generate_mapping()
265        mapping.serialize(destination=mapping_path, format='turtle')
266        self._generate_scenario()
267
268        return True
269
270    def _generate_postgresql(self) -> bool:
271        """Generate the instance as PostgreSQL with CSV files to load.
272
273        Returns
274        -------
275        success : bool
276            True if successfull, false otherwise
277        """
278        os.makedirs(os.path.join(self.path(), 'data', 'shared'), exist_ok=True)
279        data1_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE1)
280        dataframe1 = self._generate_dataframe()
281        data2_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE2)
282        dataframe2 = self._generate_dataframe(self._number_of_members + 1,
283                                              self._number_of_properties + 1)
284        dataframe1, dataframe2 = self._update_many_on_many(dataframe1,
285                                                           dataframe2)
286        dataframe1.to_csv(data1_path, index=False)
287        dataframe2.to_csv(data2_path, index=False)
288
289        mapping_path = os.path.join(self.path(), 'data', 'shared', RDB_MAPPING_FILE)
290        mapping: Graph = self._generate_mapping()
291        mapping.serialize(destination=mapping_path, format='turtle')
292        self._generate_scenario()
293
294        return True
295
296    def _generate_scenario(self) -> bool:
297        """Generate the metadata for this scenario.
298
299        Configures the execution pipeline automatically.
300
301        Returns
302        -------
303        success : bool
304            True if successfull, false otherwise
305        """
306        name: str = f'join_relation_{self._n}_{self._m}_{self._percentage}'
307        description: str = f'Join Relation {self._n}-{self._m} {self._percentage}% '
308        iri: str = f'http://example.org/join-relation/{self._n}-{self._m}/{self._percentage}/'
309
310        if self._data_format == 'postgresql':
311            return self._generate_metadata(iri, name, description,
312                                           RDB_MAPPING_FILE)
313        elif self._data_format == 'csv':
314            return self._generate_metadata(iri, name, description,
315                                           CSV_MAPPING_FILE)
316        else:
317            raise NotImplementedError(f'{self._data_format} not implemented')
318
319        return False

Helper class that provides a standard way to create an ABC using inheritance.

JoinsRelation( main_directory: str, verbose: bool, percentage: float, n: int, m: int, number_of_members: int, number_of_properties: int, value_size: int, data_format: str, engine: str, seed: int = 0)
30    def __init__(self, main_directory: str, verbose: bool, percentage: float,
31                 n: int, m: int, number_of_members: int,
32                 number_of_properties: int, value_size: int, data_format: str,
33                 engine: str, seed: int = 0):
34        """Initialize a Joins Relations scenario.
35
36        Member's percentage is always set to 50%.
37
38        Parameters
39        ----------
40        main_directory : str
41            Root directory for generating instances of Joins Relations.
42        verbose : bool
43            Verbose logging enabled or not.
44        percentage : float
45            Percentage of relations which should result into a join.
46        n : int
47            Relation size N.
48        m : int
49            Relation size M.
50        number_of_members : int
51            Number of members to generate, for example 5000 for 5K rows in a
52            tabular data structure.
53        number_of_properties : int
54            Number of properties per member to generate, for example 20 for
55            20 columns in a tabular data structure.
56        value_size : int
57            Number of characters to add to default value generation,
58            for example: 256 will expand all values to 256 characters.
59        data_format : str
60            Data format to use for generating the data set, for example:
61            "csv", "json", "xml", "postgresql", "mysql"
62        engine : str
63            Engine to use for execution of the generated scenario's instance,
64            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
65            or "OntopMaterialize"
66        seed : int
67            Random seed to use, default 0.
68        """
69        self._percentage = percentage
70        self._n = n
71        self._m = m
72        self._number_of_members: int = number_of_members
73        self._number_of_properties: int = number_of_properties
74        self._value_size: int = value_size
75        random.seed(seed)
76
77        super().__init__(data_format, engine, main_directory, verbose)
78
79        if self._data_format != 'csv':
80            raise NotImplementedError(f'Data format {self._data_format} '
81                                      f'is not implemented by {__name__}')
82
83        self._logger = Logger(__name__, self._main_directory, self._verbose)
84        self._logger.debug(f'Generating join relations {self._n}-{self._m}'
85                           f' with {self._percentage}% of relations,')

Initialize a Joins Relations scenario.

Member's percentage is always set to 50%.

Parameters
  • main_directory (str): Root directory for generating instances of Joins Relations.
  • verbose (bool): Verbose logging enabled or not.
  • percentage (float): Percentage of relations which should result in a join.
  • n (int): Relation size N.
  • m (int): Relation size M.
  • number_of_members (int): Number of members to generate, for example 5000 for 5K rows in a tabular data structure.
  • number_of_properties (int): Number of properties per member to generate, for example 20 for 20 columns in a tabular data structure.
  • value_size (int): Number of characters to add to default value generation, for example: 256 will expand all values to 256 characters.
  • data_format (str): Data format to use for generating the data set, for example: "csv", "json", "xml", "postgresql", "mysql"
  • engine (str): Engine to use for execution of the generated scenario's instance, for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC", or "OntopMaterialize"
  • seed (int): Random seed to use, default 0.
def generate(self) -> bool:
87    def generate(self) -> bool:
88        """Generate the instance using the Joins Relations scenario.
89
90        Only CSV files are currently implemented!
91        """
92        if self._data_format == 'csv':
93            return self._generate_csv()
94        elif self._data_format == 'postgresql':
95            return self._generate_postgresql()
96        else:
97            raise NotImplementedError(f'Data format {self._data_format} '
98                                      f'is not implemented by {__name__}')

Generate the instance using the Joins Relations scenario.

Only CSV files are currently implemented!

def path(self) -> str:
100    def path(self) -> str:
101        """Builds the file path for the instance of a Joins Relations scenario.
102
103        Returns
104        -------
105        path : str
106            File path for the Joins Relations's instance.
107        """
108        key = f'joins_relations_{self._n}-{self._m}_{self._percentage}'
109        path = os.path.join(self._main_directory, self._engine,
110                            self._data_format, key)
111        self._logger.debug(f'Generating to {path}')
112        os.makedirs(path, exist_ok=True)
113        return path

Builds the file path for the instance of a Joins Relations scenario.

Returns
  • path (str): File path for the Joins Relations instance.