bench_generator.joins

This module holds the Joins class, which scales the dataset size by the number of members in a dataset, such as the number of rows for tabular data.
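
A minimal usage sketch (illustrative, not part of the module source; the
directory name, engine, and parameter values are assumptions):

    from bench_generator.joins import Joins

    # Generate one Joins instance: two CSV files plus an R2RML mapping with a
    # join condition, written under <main_directory>/<engine>/<data_format>/.
    # Small parameter values keep the sketch lightweight.
    scenario = Joins(main_directory='bench-instances', verbose=True,
                     percentage=1.0, number_of_members=100,
                     number_of_properties=3, value_size=0,
                     data_format='csv', engine='RMLMapper',
                     seed=0, join_n=1, join_m=1)
    scenario.generate()
    print(scenario.path())  # bench-instances/RMLMapper/csv/joins_1-1_1.0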

#!/usr/bin/env python3

"""
This module holds the Joins class, which scales the dataset size
by the number of members in a dataset, such as the number of rows
for tabular data.
"""

import os
import string
import random
from typing import Tuple
from pandas import DataFrame
from rdflib.namespace import RDF
from rdflib import Graph, URIRef, BNode, Literal, Namespace
from bench_generator.scenario import Scenario
from bench_generator.logger import Logger

DATA_FILE1 = 'data1.csv'
DATA_FILE2 = 'data2.csv'
MAPPING_FILE = 'mapping.r2rml.ttl'
R2RML = Namespace('http://www.w3.org/ns/r2rml#')
QL = Namespace('http://semweb.mmlab.be/ns/ql#')
EX = Namespace('http://example.com/')


class Joins(Scenario):
    def __init__(self, main_directory: str, verbose: bool, percentage: float,
                 number_of_members: int, number_of_properties: int,
                 value_size: int, data_format: str, engine: str,
                 seed: int = 0, join_n: int = 1, join_m: int = 1):
        """Initialize a Joins scenario.

        Parameters
        ----------
        main_directory : str
            Root directory for generating instances of the Joins scenario.
        verbose : bool
            Verbose logging enabled or not.
        percentage : float
            Percentage of members which should result in a join.
        number_of_members : int
            Number of members to generate, for example 5000 for 5K rows in a
            tabular data structure.
        number_of_properties : int
            Number of properties per member to generate, for example 20 for
            20 columns in a tabular data structure.
        value_size : int
            Number of characters to append to each generated value,
            for example: 256 appends 256 characters to every value.
        data_format : str
            Data format to use for generating the data set, for example:
            "csv", "json", "xml", "postgresql", "mysql"
        engine : str
            Engine to use for execution of the generated scenario's instance,
            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
            or "OntopMaterialize"
        seed : int
            Random seed to use, default 0.
        join_n : int
            Join N-M relationship value N, default 1.
        join_m : int
            Join N-M relationship value M, default 1.
        """
        self._percentage = percentage
        self._number_of_members: int = number_of_members
        self._number_of_properties: int = number_of_properties
        self._value_size: int = value_size
        self._data_format: str = data_format
        self._engine: str = engine
        self._join_n: int = join_n
        self._join_m: int = join_m
        random.seed(seed)

        if self._data_format != 'csv':
            raise NotImplementedError(f'Data format {self._data_format} '
                                      f'is not implemented by {__name__}')

        super().__init__(main_directory, verbose)
        self._logger = Logger(__name__, self._main_directory, self._verbose)
        self._logger.debug(f'Generating join {self._join_n}-{self._join_m}'
                           f' with {self._percentage}%')

    def generate(self) -> bool:
        """Generate the instance using the Joins scenario.

        Only CSV files are currently implemented!
        """
        if self._data_format == 'csv':
            return self._generate_csv()
        elif self._data_format == 'postgresql':
            return self._generate_postgresql()
        else:
            raise NotImplementedError(f'Data format {self._data_format} '
                                      f'is not implemented by {__name__}')

    def path(self) -> str:
        """Builds the file path for the instance of a Joins scenario.

        Returns
        -------
        path : str
            File path for the Joins instance.
        """
        key = f'joins_{self._join_n}-{self._join_m}_{self._percentage}'
        path = os.path.join(self._main_directory, self._engine,
                            self._data_format, key)
        self._logger.debug(f'Generating to {path}')
        os.makedirs(path, exist_ok=True)
        return path

    def _generate_dataframe(self, member_offset: int = 1,
                            property_offset: int = 1) -> DataFrame:
        """Generate a DataFrame of members and their property values.

        Parameters
        ----------
        member_offset : int
            Offset to start member ID generation from. Default 1 (no offset).
        property_offset : int
            Offset to start property ID generation from. Default 1 (no offset).

        Returns
        -------
        dataframe : DataFrame
            Pandas DataFrame with the generated members.
        """
        subject_id = range(member_offset,
                           self._number_of_members + member_offset)
        value_id = range(property_offset,
                         self._number_of_members + property_offset)
        data: dict = {'id': subject_id}
        n_ascii = len(string.ascii_letters)

        for j in range(1, self._number_of_properties + 1):
            # Append ASCII characters if necessary, use modulo to avoid out of
            # range in ASCII table
            append_value = ''
            if self._value_size > 0:
                append_value = '_'
            for n in range(self._value_size):
                append_value += string.ascii_letters[n % n_ascii]

            # Generate value V_{property}-{member} honoring the value size
            value = [f'V_{j}-{i}{append_value}' for i in value_id]
            data[f'p{j}'] = value

        return DataFrame(data)

    def _update_one_on_one(
            self, dataframe1: DataFrame,
            dataframe2: DataFrame) -> Tuple[DataFrame, DataFrame]:
        # 0% percentage results in zero matches for the join condition,
        # don't even bother to try to match the dataframes
        if self._percentage == 0.0:
            return dataframe1, dataframe2

        # Sample both dataframes
        percentage_members = int(self._number_of_members *
                                 (self._percentage / 100.0))
        dataframe1_sample = dataframe1 \
            .loc[random.sample(list(dataframe1.index), percentage_members)]
        dataframe1_sample.reset_index(drop=True)
        number_of_members_n = self._number_of_members * percentage_members
        number_of_members_to_join_n = number_of_members_n / self._join_n
        number_of_members_m = self._number_of_members * percentage_members
        number_of_members_to_join_m = number_of_members_m / self._join_m

        members_sample_size = max(int(number_of_members_to_join_n + 0.5),
                                  int(number_of_members_to_join_m + 0.5))
        members_sample = dataframe1.iloc[random.sample(list(dataframe1.index),
                                         members_sample_size)]

        # Extract unique values of p1 from dataframe 1, only those sampled for
        # percentage to dataframe 2
        members_value = \
            list(set([row[1]['p1'] for row in members_sample.iterrows()]))\
            [:int(number_of_members_to_join_m + 0.5)]
        # Repeat the values M times to honor the relation size
        members_value = members_value * self._join_m

        # Limit number of values because we may have more values than members
        if len(members_value) > self._number_of_members:
            members_value = members_value[:self._number_of_members]

        dataframe2_sample = dataframe2 \
            .loc[random.sample(list(dataframe2.index), percentage_members)]

        # Update dataframe2 to match with dataframe1
        for i, j in zip(members_value, list(dataframe1_sample.index)):
            dataframe2.loc[j, 'id'] = i

        # Extract unique values of p1 from dataframe 2, only those sampled for
        # percentage to dataframe 1
        members_value = \
            list(set([row[1]['p1'] for row in members_sample.iterrows()]))\
            [:int(number_of_members_to_join_n + 0.5)]
        # Repeat the values N times to honor the relation size
        members_value = members_value * self._join_n

        if len(members_value) > self._number_of_members:
            members_value = members_value[:self._number_of_members]

        dataframe1_sample = dataframe1 \
            .loc[random.sample(list(dataframe1.index), percentage_members)]

        # Update dataframe1 to match with dataframe2
        for i, j in zip(members_value, list(dataframe2_sample.index)):
            dataframe1.loc[j, 'id'] = i

        return dataframe1, dataframe2

    def _add_predicate_object_map(self, mapping: Graph, triplesmap_iri: URIRef,
                                  predicate_value: URIRef,
                                  object_value: Literal) -> BNode:
        """Insert a PredicateObjectMap into a [R2]RML mapping.

        Parameters
        ----------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.
        triplesmap_iri : URIRef
            IRI of the Triples Map to insert the PredicateObjectMap in.
        predicate_value : URIRef
            Predicate IRI value for the PredicateObjectMap.
        object_value : Literal
            Object column value for the PredicateObjectMap.

        Returns
        -------
        predicate_object_map_iri : BNode
            Predicate Object Map blank node ID.
        """
        predicate_object_map_iri = BNode()
        predicate_map_iri = BNode()
        object_map_iri = BNode()

        mapping.add((predicate_map_iri, R2RML.constant, predicate_value))
        mapping.add((predicate_map_iri, RDF.type, R2RML.PredicateMap))
        mapping.add((object_map_iri, R2RML.column, object_value))
        mapping.add((object_map_iri, RDF.type, R2RML.ObjectMap))
        mapping.add((predicate_object_map_iri, R2RML.predicateMap,
                     predicate_map_iri))
        mapping.add((predicate_object_map_iri, R2RML.objectMap,
                     object_map_iri))
        mapping.add((predicate_object_map_iri, RDF.type,
                     R2RML.PredicateObjectMap))
        mapping.add((triplesmap_iri, R2RML.predicateObjectMap,
                     predicate_object_map_iri))

        return predicate_object_map_iri

    def _add_join_predicate_object_map(self, mapping: Graph,
                                       triplesmap_iri: URIRef,
                                       predicate_value: URIRef,
                                       object_value: Literal,
                                       parent_triplesmap_iri: URIRef,
                                       child_value: Literal,
                                       parent_value: Literal) -> BNode:
        """Insert a join with a join condition into a [R2]RML mapping.

        Parameters
        ----------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.
        triplesmap_iri : URIRef
            IRI of the Triples Map to insert the PredicateObjectMap in.
        predicate_value : URIRef
            Predicate IRI value for the PredicateObjectMap.
        object_value : Literal
            Object value for the PredicateObjectMap (currently unused).
        parent_triplesmap_iri : URIRef
            IRI of the parent Triples Map referenced by the ObjectMap.
        child_value : Literal
            Child column of the join condition.
        parent_value : Literal
            Parent column of the join condition.

        Returns
        -------
        join_condition_iri : BNode
            Join Condition blank node ID.
        """
        predicate_object_map_iri = BNode()
        predicate_map_iri = BNode()
        object_map_iri = BNode()
        join_condition_iri = BNode()

        mapping.add((join_condition_iri, R2RML.child, child_value))
        mapping.add((join_condition_iri, R2RML.parent, parent_value))
        mapping.add((join_condition_iri, RDF.type, R2RML.JoinCondition))
        mapping.add((predicate_map_iri, R2RML.constant, predicate_value))
        mapping.add((predicate_map_iri, RDF.type, R2RML.PredicateMap))
        mapping.add((object_map_iri, RDF.type, R2RML.ReferenceObjectMap))
        mapping.add((object_map_iri, R2RML.parentTriplesMap,
                     parent_triplesmap_iri))
        mapping.add((object_map_iri, R2RML.joinCondition, join_condition_iri))
        mapping.add((predicate_object_map_iri, R2RML.predicateMap,
                     predicate_map_iri))
        mapping.add((predicate_object_map_iri, R2RML.objectMap,
                     object_map_iri))
        mapping.add((predicate_object_map_iri, RDF.type,
                     R2RML.PredicateObjectMap))
        mapping.add((triplesmap_iri, R2RML.predicateObjectMap,
                     predicate_object_map_iri))

        return join_condition_iri

    def _add_triples_map(self, mapping: Graph, subject_value: Literal,
                         table_name: Literal, number: int = 1) -> URIRef:
        """Insert a TriplesMap into a [R2]RML mapping.

        Parameters
        ----------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.
        subject_value : Literal
            Subject IRI template value.
        table_name : Literal
            SQL table name to add.
        number : int
            Triples Map number, default 1.

        Returns
        -------
        triples_map_iri : URIRef
            IRI of the Triples Map inserted into the mapping.
        """
        triples_map_iri = URIRef(f'{mapping.base}#TriplesMap{number}')
        subject_map_iri = BNode()
        logical_table_iri = BNode()

        mapping.add((logical_table_iri, R2RML.tableName, table_name))
        mapping.add((triples_map_iri, R2RML.logicalTable, logical_table_iri))
        mapping.add((triples_map_iri, R2RML.subjectMap, subject_map_iri))
        mapping.add((triples_map_iri, RDF.type, R2RML.TriplesMap))
        mapping.add((subject_map_iri, R2RML.template, subject_value))

        return triples_map_iri

    def _generate_mapping(self) -> Graph:
        """Generate a [R2]RML mapping for a Joins instance.

        Returns
        -------
        mapping : Graph
            [R2]RML mapping as an RDFLib Graph.
        """
        mapping: Graph = Graph(base='http://ex.com/')
        mapping.bind('rr', R2RML)
        mapping.bind('ql', QL)
        mapping.bind('ex', EX)
        subject1_template = Literal('http://ex.com/table1/{id}')
        subject2_template = Literal('http://ex.com/table2/{id}')
        triples_map1_iri = self._add_triples_map(mapping, subject1_template,
                                                 Literal('data'), number=1)
        triples_map2_iri = self._add_triples_map(mapping, subject2_template,
                                                 Literal('data'), number=2)

        self._add_join_predicate_object_map(mapping, triples_map1_iri,
                                            EX['j1'], Literal('p1'),
                                            triples_map2_iri, Literal('id'),
                                            Literal('id'))

        return mapping

    def _generate_csv(self) -> bool:
        """Generate the instance as CSV files.

        Returns
        -------
        success : bool
            True if successful, False otherwise.
        """
        data1_path = os.path.join(self.path(), DATA_FILE1)
        dataframe1 = self._generate_dataframe()
        data2_path = os.path.join(self.path(), DATA_FILE2)
        dataframe2 = self._generate_dataframe(self._number_of_members + 1,
                                              self._number_of_properties + 1)
        dataframe1, dataframe2 = self._update_one_on_one(dataframe1,
                                                         dataframe2)
        dataframe1.to_csv(data1_path, index=False)
        dataframe2.to_csv(data2_path, index=False)

        mapping_path = os.path.join(self.path(), MAPPING_FILE)
        mapping: Graph = self._generate_mapping()
        mapping.serialize(destination=mapping_path, format='turtle')

        return True

    def _generate_postgresql(self) -> bool:
        """Generate the instance as PostgreSQL with CSV files to load.

        Returns
        -------
        success : bool
            True if successful, False otherwise.
        """
        data1_path = os.path.join(self.path(), DATA_FILE1)
        self._generate_dataframe().to_csv(data1_path, index=False)
        data2_path = os.path.join(self.path(), DATA_FILE2)
        self._generate_dataframe().to_csv(data2_path, index=False)

        mapping_path = os.path.join(self.path(), MAPPING_FILE)
        mapping: Graph = self._generate_mapping()
        mapping.serialize(destination=mapping_path, format='turtle')

        return True
DATA_FILE1 = 'data1.csv'
DATA_FILE2 = 'data2.csv'
MAPPING_FILE = 'mapping.r2rml.ttl'
R2RML = Namespace('http://www.w3.org/ns/r2rml#')
QL = Namespace('http://semweb.mmlab.be/ns/ql#')
EX = Namespace('http://example.com/')
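
The constants above name the files written by generate(). For a small
configuration (say number_of_members=3, number_of_properties=2, value_size=0
and percentage=0.0), the generated data1.csv and the core of the serialized
mapping.r2rml.ttl look roughly as follows; blank-node labels, triple order and
Turtle layout depend on rdflib and are only sketched here:

    id,p1,p2
    1,V_1-1,V_2-1
    2,V_1-2,V_2-2
    3,V_1-3,V_2-3

    @base <http://ex.com/> .
    @prefix rr: <http://www.w3.org/ns/r2rml#> .
    @prefix ex: <http://example.com/> .

    <#TriplesMap1> a rr:TriplesMap ;
        rr:logicalTable [ rr:tableName "data" ] ;
        rr:subjectMap [ rr:template "http://ex.com/table1/{id}" ] ;
        rr:predicateObjectMap [
            rr:predicateMap [ rr:constant ex:j1 ] ;
            rr:objectMap [
                rr:parentTriplesMap <#TriplesMap2> ;
                rr:joinCondition [ rr:child "id" ; rr:parent "id" ]
            ]
        ] .

    <#TriplesMap2> a rr:TriplesMap ;
        rr:logicalTable [ rr:tableName "data" ] ;
        rr:subjectMap [ rr:template "http://ex.com/table2/{id}" ] .
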
class Joins(bench_generator.scenario.Scenario):

Scenario that scales a dataset by its number of members and generates two related datasets whose members are linked through an N-M join relationship, together with an [R2]RML mapping containing the corresponding join condition.

Joins(main_directory: str, verbose: bool, percentage: float, number_of_members: int, number_of_properties: int, value_size: int, data_format: str, engine: str, seed: int = 0, join_n: int = 1, join_m: int = 1)

Initialize a Joins scenario.

Parameters
  • main_directory (str): Root directory for generating instances of the Joins scenario.
  • verbose (bool): Verbose logging enabled or not.
  • percentage (float): Percentage of members which should result in a join.
  • number_of_members (int): Number of members to generate, for example 5000 for 5K rows in a tabular data structure.
  • number_of_properties (int): Number of properties per member to generate, for example 20 for 20 columns in a tabular data structure.
  • value_size (int): Number of characters to append to each generated value, for example: 256 appends 256 characters to every value.
  • data_format (str): Data format to use for generating the data set, for example: "csv", "json", "xml", "postgresql", "mysql"
  • engine (str): Engine to use for execution of the generated scenario's instance, for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC", or "OntopMaterialize"
  • seed (int): Random seed to use, default 0.
  • join_n (int): Join N-M relationship value N, default 1.
  • join_m (int): Join N-M relationship value M, default 1.
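
As a rough worked example of how these parameters interact (based on the
percentage_members computation in _update_one_on_one; the numbers are only
illustrative):

    percentage_members = int(number_of_members * (percentage / 100.0))
                       = int(5000 * (25.0 / 100.0))
                       = 1250

join_n and join_m set the N-M arity of the join relation and, together with
percentage, also form the instance key used by path(), for example
joins_3-5_25.0 for join_n=3, join_m=5 and percentage=25.0.
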
def generate(self) -> bool:

Generate the instance using the Joins scenario.

Only CSV files are currently implemented!
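
For the CSV format a successful call writes three files into path(); a sketch,
reusing the scenario object from the usage example at the top of this page
(note that although generate() also dispatches on 'postgresql', the
constructor currently rejects any data_format other than 'csv'):

    scenario.generate()
    # <path>/data1.csv          first dataset
    # <path>/data2.csv          second dataset, member and property IDs offset
    # <path>/mapping.r2rml.ttl  [R2]RML mapping containing the join condition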

def path(self) -> str:

Builds the file path for the instance of a Joins scenario.

Returns
  • path (str): File path for the Joins instance.
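
A sketch of the resulting layout, with assumed values main_directory='bench-instances',
engine='RMLMapper', data_format='csv', join_n=1, join_m=5 and percentage=50.0:

    scenario.path()
    # 'bench-instances/RMLMapper/csv/joins_1-5_50.0'
    # i.e. <main_directory>/<engine>/<data_format>/joins_<N>-<M>_<percentage>;
    # the directory is created if it does not exist yet.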