bench_generator.named_graph

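This module holds the NamedGraph class, which generates benchmark instances whose [R2]RML mappings attach a configurable number of named graphs to Subject Maps and Predicate Object Maps. The size of the generated dataset scales with the number of members in the dataset, such as the number of rows for tabular data.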

  1#!/usr/bin/env python3
  2
  3"""
  4This module holds the NamedGraph class which scales the dataset size
  5by the number of members in a dataset such as number of rows for tabular data.
  6"""
  7
  8import os
  9import string
 10from pandas import DataFrame
 11from rdflib.namespace import RDF
 12from rdflib import Graph, URIRef, BNode, Literal, Namespace
 13from bench_generator.scenario import Scenario
 14from bench_generator.logger import Logger
 15
 16DATA_FILE = 'data.csv'
 17CSV_MAPPING_FILE = 'mapping.rml.ttl'
 18RDB_MAPPING_FILE = 'mapping.r2rml.ttl'
 19R2RML = Namespace('http://www.w3.org/ns/r2rml#')
 20RML = Namespace('http://semweb.mmlab.be/ns/rml#')
 21QL = Namespace('http://semweb.mmlab.be/ns/ql#')
 22EX = Namespace('http://example.com/')
 23
 24
 25class NamedGraph(Scenario):
 26    def __init__(self, main_directory: str, verbose: bool,
 27                 number_of_ng_pom: int, number_of_ng_s: int, static: bool,
 28                 number_of_tms: int, number_of_poms: int,
 29                 number_of_members: int, number_of_properties: int,
 30                 value_size: int, data_format: str, engine: str):
 31        """Initialize a NamedGraph scenario.
 32
 33        Parameters
 34        ----------
 35        main_directory : str
 36            Root directory for generating instances of NamedGraph.
 37        verbose : bool
 38            Verbose logging enabled or not.
 39        number_of_ng_pom : int
 40            Number of named graphs per Predicate Object Map.
 41        number_of_ng_s : int
 42            Number of named graphs for Subject Map.
 43        number_of_members : int
 44            Number of members to generate, for example 5000 for 5K rows in a
 45            tabular data structure.
 46        number_of_properties : int
 47            Number of properties per member to generate, for example 20 for
 48            20 columns in a tabular data structure.
 49        value_size : int
 50            Number of characters to add to default value generation,
 51            for example: 256 will expand all values to 256 characters.
 52        data_format : str
 53            Data format to use for generating the data set, for example:
 54            "csv", "json", "xml", "postgresql", "mysql"
 55        engine : str
 56            Engine to use for execution of the generated scenario's instance,
 57            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
 58            or "OntopMaterialize"
 59        """
 60        self._number_of_ng_pom: int = number_of_ng_pom
 61        self._number_of_ng_s: int = number_of_ng_s
 62        self._static: bool = static
 63        self._number_of_tms: int = number_of_tms
 64        self._number_of_poms: int = number_of_poms
 65        self._number_of_members: int = number_of_members
 66        self._number_of_properties: int = number_of_properties
 67        self._value_size: int = value_size
 68
 69        super().__init__(data_format, engine, main_directory, verbose)
 70        self._logger = Logger(__name__, self._main_directory, self._verbose)
 71
 72    def generate(self) -> bool:
 73        """Generate the instance using the NamedGraph scenario.
 74
 75        Only CSV files are currently implemented!
 76        """
 77        if self._data_format == 'csv':
 78            return self._generate_csv()
 79        elif self._data_format == 'postgresql':
 80            return self._generate_postgresql()
 81        else:
 82            raise NotImplementedError(f'Data format {self._data_format} '
 83                                      f'is not implemented by {__name__}')
 84
 85    def path(self) -> str:
 86        """Builds the file path for the instance of a NamedGraph scenario.
 87
 88        Returns
 89        -------
 90        path : str
 91            File path for the NamedGraph's instance.
 92        """
 93        key = f'namedgraph_{self._number_of_ng_s}SM-NG_' \
 94              f'{self._number_of_ng_pom}POM-NG_{self._number_of_tms}TM_' \
 95              f'{self._number_of_poms}POM_{self._static}'
 96        path = os.path.join(self._main_directory, self._engine,
 97                            self._data_format, key)
 98        self._logger.debug(f'Generating to {path}')
 99        os.makedirs(path, exist_ok=True)
100        return path
101
102    def _generate_dataframe(self, member_offset: int = 1,
103                            property_offset: int = 1) -> DataFrame:
104        """Generate mappings.
105
106        Parameters
107        ----------
108        member_offset : int
109            Offset to start member ID generation from. Default 1 (no offset).
110        property_offset : int
111            Offset to start property ID generation from. Default 1 (no offset).
112
113        Returns
114        -------
115        dataframe : DataFrame
116            Panda's DataFrame with generated mappings.
117        """
118        subject_id = range(member_offset,
119                           self._number_of_members + member_offset)
120        value_id = range(property_offset,
121                         self._number_of_members + property_offset)
122        data: dict = {'id': subject_id}
123        n_ascii = len(string.ascii_letters)
124
125        for j in range(1, self._number_of_properties + 1):
126            # Append ASCII characters if necessary, use modulo to avoid out of
127            # range in ASCII table
128            append_value = ''
129            if self._value_size > 0:
130                append_value = '_'
131            for n in range(self._value_size):
132                append_value += string.ascii_letters[n % n_ascii]
133
134            # Generate value V_{property}_{member} honoring the value size
135            value = [f'V_{j}-{i}{append_value}' for i in value_id]
136            data[f'p{j}'] = value
137
138        return DataFrame(data)
139
140    def _add_predicate_object_map(self, mapping: Graph, triplesmap_iri: URIRef,
141                                  predicate_value: URIRef,
142                                  object_value: Literal, named_graphs: int = 0,
143                                  static: bool = True) -> BNode:
144        """Insert a PredicateObjectMap into a [R2]RML mapping
145
146        Parameters
147        ----------
148        mapping : Graph
149            [R2]RML mapping as an RDFLib Graph.
150        triples_map_iri : URIRef
151            IRI of the Triples Map to insert the PredicateObjectMap in.
152        predicate_value : Literal
153            Predicate IRI value for PredicateObjectMap.
154        object_value : Literal
155            Object value for PredicateObjectMap.
156
157        Returns
158        -------
159        predicate_object_map_iri : BNode
160            Predicate Object Map blank node ID.
161        """
162        predicate_object_map_iri = BNode()
163        predicate_map_iri = BNode()
164        object_map_iri = BNode()
165
166        mapping.add((predicate_map_iri, R2RML.constant, predicate_value))
167        mapping.add((predicate_map_iri, RDF.type, R2RML.PredicateMap))
168        if self._data_format == 'postgresql':
169            mapping.add((object_map_iri, R2RML.column, object_value))
170        else:
171            mapping.add((object_map_iri, RML.reference, object_value))
172        mapping.add((object_map_iri, RDF.type, R2RML.ObjectMap))
173        mapping.add((predicate_object_map_iri, R2RML.predicateMap,
174                     predicate_map_iri))
175        mapping.add((predicate_object_map_iri, R2RML.objectMap,
176                     object_map_iri))
177        mapping.add((predicate_object_map_iri, RDF.type,
178                     R2RML.PredicateObjectMap))
179        mapping.add((triplesmap_iri, R2RML.predicateObjectMap,
180                     predicate_object_map_iri))
181
182        for i in range(1, named_graphs + 1):
183            if static:
184                if self._number_of_ng_s == 0:
185                    mapping.add((predicate_object_map_iri, R2RML.graph,
186                                URIRef(f'http://example.org/graph{i}')))
187                else:
188                    mapping.add((predicate_object_map_iri, R2RML.graph,
189                                URIRef(f'http://example.org/pom/graph{i}')))
190            else:
191                graph_map_iri = BNode()
192                mapping.add((predicate_object_map_iri, R2RML.graphMap,
193                             graph_map_iri))
194                if self._number_of_ng_s == 0:
195                    mapping.add((graph_map_iri, R2RML.template,
196                                 Literal(f'http://example.org/graph{{p{i}}}')))
197                else:
198                    mapping.add((graph_map_iri, R2RML.template,
199                                 Literal('http://example.org/pom/'
200                                         f'graph{{p{i}}}')))
201
202        return predicate_object_map_iri
203
204    def _add_triples_map_source(self, mapping: Graph, subject_value: Literal,
205                                source_path: Literal, number: int = 1,
206                                named_graphs: int = 0,
207                                static: bool = True) -> URIRef:
208        """Insert a TriplesMap into a RML mapping with a Logical Source
209
210        Parameters
211        ----------
212        mapping : Graph
213            [R2]RML mapping as an RDFLib Graph.
214        subject_value : Literal
215            Subject IRI template value.
216        source_path : Literal
217            Path to source file.
218        number : int
219            Triples Map number, default 1.
220        named_graphs : int
221            Number of named graphs, default 0.
222
223        Returns
224        -------
225        triples_map_iri : URIRef
226            IRI of the Triples Map inserted into the mapping.
227        """
228        triples_map_iri = URIRef(f'{mapping.base}#TriplesMap{number}')
229        subject_map_iri = BNode()
230        logical_source_iri = BNode()
231
232        mapping.add((logical_source_iri, RML.source, source_path))
233        mapping.add((logical_source_iri, RML.referenceFormulation, QL.CSV))
234        mapping.add((logical_source_iri, RDF.type, RML.LogicalSource))
235        mapping.add((triples_map_iri, RML.logicalSource, logical_source_iri))
236        mapping.add((triples_map_iri, R2RML.subjectMap, subject_map_iri))
237        mapping.add((triples_map_iri, RDF.type, R2RML.TriplesMap))
238        mapping.add((subject_map_iri, R2RML.template, subject_value))
239
240        for i in range(1, named_graphs + 1):
241            if static:
242                mapping.add((subject_map_iri, R2RML.graph,
243                             URIRef(f'http://example.org/graph{i}')))
244            else:
245                graph_map_iri = BNode()
246                mapping.add((subject_map_iri, R2RML.graphMap, graph_map_iri))
247                mapping.add((graph_map_iri, R2RML.template,
248                             Literal(f'http://example.org/graph{{p{i}}}')))
249
250        return triples_map_iri
251
252    def _add_triples_map_table(self, mapping: Graph, subject_value: Literal,
253                               table_name: Literal, number: int = 1,
254                               named_graphs: int = 0,
255                               static: bool = True) -> URIRef:
256        """Insert a TriplesMap into a [R2]RML mapping with a Logical Table
257
258        Parameters
259        ----------
260        mapping : Graph
261            [R2]RML mapping as an RDFLib Graph.
262        subject_value : Literal
263            Subject IRI template value.
264        table_name : Literal
265            SQL table name to add.
266        number : int
267            Triples Map number, default 1.
268        named_graphs : int
269            Number of named graphs, default 0.
270
271        Returns
272        -------
273        triples_map_iri : URIRef
274            IRI of the Triples Map inserted into the mapping.
275        """
276        triples_map_iri = URIRef(f'{mapping.base}#TriplesMap{number}')
277        subject_map_iri = BNode()
278        logical_table_iri = BNode()
279
280        mapping.add((logical_table_iri, R2RML.tableName, table_name))
281        mapping.add((logical_table_iri, RDF.type, R2RML.LogicalTable))
282        mapping.add((triples_map_iri, R2RML.logicalTable, logical_table_iri))
283        mapping.add((triples_map_iri, R2RML.subjectMap, subject_map_iri))
284        mapping.add((triples_map_iri, RDF.type, R2RML.TriplesMap))
285        mapping.add((subject_map_iri, R2RML.template, subject_value))
286
287        for i in range(1, named_graphs + 1):
288            if static:
289                mapping.add((subject_map_iri, R2RML.graph,
290                             URIRef(f'http://example.org/graph{i}')))
291            else:
292                graph_map_iri = BNode()
293                mapping.add((subject_map_iri, R2RML.graphMap, graph_map_iri))
294                mapping.add((graph_map_iri, R2RML.template,
295                             Literal(f'http://example.org/graph{{p{i}}}')))
296
297        return triples_map_iri
298
299    def _generate_mapping(self) -> Graph:
300        """Generate a [R2]RML mapping for a NamedGraph instance.
301
302        Returns
303        -------
304        mapping : Graph
305            [R2]RML mapping as an RDFLib Graph.
306        """
307        mapping: Graph = Graph(base='http://ex.com/')
308        mapping.bind('rr', R2RML)
309        mapping.bind('ql', QL)
310        mapping.bind('ex', EX)
311
312        for i in range(1, self._number_of_tms + 1):
313            subject_template = Literal(f'http://ex.com/table/{{p{i}}}')
314            if self._data_format == 'postgresql':
315                triples_map_iri = self._add_triples_map_table(mapping,
316                                                              subject_template,
317                                                              Literal('data'),
318                                                              number=i,
319                                                              named_graphs=self._number_of_ng_s,
320                                                              static=self._static)
321            elif self._data_format == 'csv':
322                csv_path = Literal('/data/shared/data.csv')
323                triples_map_iri = \
324                    self._add_triples_map_source(mapping, subject_template,
325                                                 csv_path, number=i,
326                                                 named_graphs=self._number_of_ng_s,
327                                                 static=self._static)
328            else:
329                msg = f'{self._data_format} not implemented'
330                raise NotImplementedError(msg)
331
332            for j in range(1, self._number_of_poms + 1):
333                self._add_predicate_object_map(mapping, triples_map_iri,
334                                               EX[f'p{j}'], Literal(f'p{j}'),
335                                               named_graphs=self._number_of_ng_pom,
336                                               static=self._static)
337
338        return mapping
339
340    def _generate_csv(self) -> bool:
341        """Generate the instance as CSV files.
342
343        Returns
344        -------
345        success : bool
346            True if successfull, false otherwise
347        """
348        os.makedirs(os.path.join(self.path(), 'data', 'shared'), exist_ok=True)
349        data_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE)
350        self._generate_dataframe().to_csv(data_path, index=False)
351
352        mapping_path = os.path.join(self.path(), 'data', 'shared',
353                                    CSV_MAPPING_FILE)
354        mapping: Graph = self._generate_mapping()
355        mapping.serialize(destination=mapping_path, format='turtle')
356        self._generate_scenario()
357
358        return True
359
360    def _generate_postgresql(self) -> bool:
361        """Generate the instance as PostgreSQL with CSV files to load.
362
363        Returns
364        -------
365        success : bool
366            True if successfull, false otherwise
367        """
368        os.makedirs(os.path.join(self.path(), 'data', 'shared'), exist_ok=True)
369        data_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE)
370        self._generate_dataframe().to_csv(data_path, index=False)
371
372        mapping_path = os.path.join(self.path(), 'data', 'shared',
373                                    RDB_MAPPING_FILE)
374        mapping: Graph = self._generate_mapping()
375        mapping.serialize(destination=mapping_path, format='turtle')
376        self._generate_scenario()
377
378        return True
379
380    def _generate_scenario(self) -> bool:
381        """Generate the metadata for this scenario.
382
383        Configures the execution pipeline automatically.
384
385        Returns
386        -------
387        success : bool
388            True if successfull, false otherwise
389        """
390        name: str = f'namedgraph_{self._number_of_ng_s}_' \
391                    f'{self._number_of_ng_pom}_{self._number_of_tms}_' \
392                    f'{self._number_of_poms}_{self._static}'
393        description: str = f'NamedGraph {self._number_of_tms}TM + ' + \
394                           f'{self._number_of_poms}POMs'
395        iri: str = f'http://example.org/mappings/{self._number_of_tms}/' + \
396                   f'{self._number_of_poms}/{self._number_of_ng_s}/' + \
397                   f'{self._number_of_ng_pom}/{self._static}'
398
399        if self._data_format == 'postgresql':
400            return self._generate_metadata(iri, name, description,
401                                           RDB_MAPPING_FILE,
402                                           serialization='nquads')
403        elif self._data_format == 'csv':
404            return self._generate_metadata(iri, name, description,
405                                           CSV_MAPPING_FILE,
406                                           serialization='nquads')
407        else:
408            raise NotImplementedError(f'{self._data_format} not implemented')
409
410        return False
DATA_FILE = 'data.csv'
CSV_MAPPING_FILE = 'mapping.rml.ttl'
RDB_MAPPING_FILE = 'mapping.r2rml.ttl'
R2RML = Namespace('http://www.w3.org/ns/r2rml#')
RML = Namespace('http://semweb.mmlab.be/ns/rml#')
QL = Namespace('http://semweb.mmlab.be/ns/ql#')
EX = Namespace('http://example.com/')
class NamedGraph(bench_generator.scenario.Scenario):
 26class NamedGraph(Scenario):
 27    def __init__(self, main_directory: str, verbose: bool,
 28                 number_of_ng_pom: int, number_of_ng_s: int, static: bool,
 29                 number_of_tms: int, number_of_poms: int,
 30                 number_of_members: int, number_of_properties: int,
 31                 value_size: int, data_format: str, engine: str):
 32        """Initialize a NamedGraph scenario.
 33
 34        Parameters
 35        ----------
 36        main_directory : str
 37            Root directory for generating instances of NamedGraph.
 38        verbose : bool
 39            Verbose logging enabled or not.
 40        number_of_ng_pom : int
 41            Number of named graphs per Predicate Object Map.
 42        number_of_ng_s : int
 43            Number of named graphs for Subject Map.
 44        number_of_members : int
 45            Number of members to generate, for example 5000 for 5K rows in a
 46            tabular data structure.
 47        number_of_properties : int
 48            Number of properties per member to generate, for example 20 for
 49            20 columns in a tabular data structure.
 50        value_size : int
 51            Number of characters to add to default value generation,
 52            for example: 256 will expand all values to 256 characters.
 53        data_format : str
 54            Data format to use for generating the data set, for example:
 55            "csv", "json", "xml", "postgresql", "mysql"
 56        engine : str
 57            Engine to use for execution of the generated scenario's instance,
 58            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
 59            or "OntopMaterialize"
 60        """
 61        self._number_of_ng_pom: int = number_of_ng_pom
 62        self._number_of_ng_s: int = number_of_ng_s
 63        self._static: bool = static
 64        self._number_of_tms: int = number_of_tms
 65        self._number_of_poms: int = number_of_poms
 66        self._number_of_members: int = number_of_members
 67        self._number_of_properties: int = number_of_properties
 68        self._value_size: int = value_size
 69
 70        super().__init__(data_format, engine, main_directory, verbose)
 71        self._logger = Logger(__name__, self._main_directory, self._verbose)
 72
 73    def generate(self) -> bool:
 74        """Generate the instance using the NamedGraph scenario.
 75
 76        Only CSV files are currently implemented!
 77        """
 78        if self._data_format == 'csv':
 79            return self._generate_csv()
 80        elif self._data_format == 'postgresql':
 81            return self._generate_postgresql()
 82        else:
 83            raise NotImplementedError(f'Data format {self._data_format} '
 84                                      f'is not implemented by {__name__}')
 85
 86    def path(self) -> str:
 87        """Builds the file path for the instance of a NamedGraph scenario.
 88
 89        Returns
 90        -------
 91        path : str
 92            File path for the NamedGraph's instance.
 93        """
 94        key = f'namedgraph_{self._number_of_ng_s}SM-NG_' \
 95              f'{self._number_of_ng_pom}POM-NG_{self._number_of_tms}TM_' \
 96              f'{self._number_of_poms}POM_{self._static}'
 97        path = os.path.join(self._main_directory, self._engine,
 98                            self._data_format, key)
 99        self._logger.debug(f'Generating to {path}')
100        os.makedirs(path, exist_ok=True)
101        return path
102
103    def _generate_dataframe(self, member_offset: int = 1,
104                            property_offset: int = 1) -> DataFrame:
105        """Generate mappings.
106
107        Parameters
108        ----------
109        member_offset : int
110            Offset to start member ID generation from. Default 1 (no offset).
111        property_offset : int
112            Offset to start property ID generation from. Default 1 (no offset).
113
114        Returns
115        -------
116        dataframe : DataFrame
117            Panda's DataFrame with generated mappings.
118        """
119        subject_id = range(member_offset,
120                           self._number_of_members + member_offset)
121        value_id = range(property_offset,
122                         self._number_of_members + property_offset)
123        data: dict = {'id': subject_id}
124        n_ascii = len(string.ascii_letters)
125
126        for j in range(1, self._number_of_properties + 1):
127            # Append ASCII characters if necessary, use modulo to avoid out of
128            # range in ASCII table
129            append_value = ''
130            if self._value_size > 0:
131                append_value = '_'
132            for n in range(self._value_size):
133                append_value += string.ascii_letters[n % n_ascii]
134
135            # Generate value V_{property}_{member} honoring the value size
136            value = [f'V_{j}-{i}{append_value}' for i in value_id]
137            data[f'p{j}'] = value
138
139        return DataFrame(data)
140
141    def _add_predicate_object_map(self, mapping: Graph, triplesmap_iri: URIRef,
142                                  predicate_value: URIRef,
143                                  object_value: Literal, named_graphs: int = 0,
144                                  static: bool = True) -> BNode:
145        """Insert a PredicateObjectMap into a [R2]RML mapping
146
147        Parameters
148        ----------
149        mapping : Graph
150            [R2]RML mapping as an RDFLib Graph.
151        triples_map_iri : URIRef
152            IRI of the Triples Map to insert the PredicateObjectMap in.
153        predicate_value : Literal
154            Predicate IRI value for PredicateObjectMap.
155        object_value : Literal
156            Object value for PredicateObjectMap.
157
158        Returns
159        -------
160        predicate_object_map_iri : BNode
161            Predicate Object Map blank node ID.
162        """
163        predicate_object_map_iri = BNode()
164        predicate_map_iri = BNode()
165        object_map_iri = BNode()
166
167        mapping.add((predicate_map_iri, R2RML.constant, predicate_value))
168        mapping.add((predicate_map_iri, RDF.type, R2RML.PredicateMap))
169        if self._data_format == 'postgresql':
170            mapping.add((object_map_iri, R2RML.column, object_value))
171        else:
172            mapping.add((object_map_iri, RML.reference, object_value))
173        mapping.add((object_map_iri, RDF.type, R2RML.ObjectMap))
174        mapping.add((predicate_object_map_iri, R2RML.predicateMap,
175                     predicate_map_iri))
176        mapping.add((predicate_object_map_iri, R2RML.objectMap,
177                     object_map_iri))
178        mapping.add((predicate_object_map_iri, RDF.type,
179                     R2RML.PredicateObjectMap))
180        mapping.add((triplesmap_iri, R2RML.predicateObjectMap,
181                     predicate_object_map_iri))
182
183        for i in range(1, named_graphs + 1):
184            if static:
185                if self._number_of_ng_s == 0:
186                    mapping.add((predicate_object_map_iri, R2RML.graph,
187                                URIRef(f'http://example.org/graph{i}')))
188                else:
189                    mapping.add((predicate_object_map_iri, R2RML.graph,
190                                URIRef(f'http://example.org/pom/graph{i}')))
191            else:
192                graph_map_iri = BNode()
193                mapping.add((predicate_object_map_iri, R2RML.graphMap,
194                             graph_map_iri))
195                if self._number_of_ng_s == 0:
196                    mapping.add((graph_map_iri, R2RML.template,
197                                 Literal(f'http://example.org/graph{{p{i}}}')))
198                else:
199                    mapping.add((graph_map_iri, R2RML.template,
200                                 Literal('http://example.org/pom/'
201                                         f'graph{{p{i}}}')))
202
203        return predicate_object_map_iri
204
205    def _add_triples_map_source(self, mapping: Graph, subject_value: Literal,
206                                source_path: Literal, number: int = 1,
207                                named_graphs: int = 0,
208                                static: bool = True) -> URIRef:
209        """Insert a TriplesMap into a RML mapping with a Logical Source
210
211        Parameters
212        ----------
213        mapping : Graph
214            [R2]RML mapping as an RDFLib Graph.
215        subject_value : Literal
216            Subject IRI template value.
217        source_path : Literal
218            Path to source file.
219        number : int
220            Triples Map number, default 1.
221        named_graphs : int
222            Number of named graphs, default 0.
223
224        Returns
225        -------
226        triples_map_iri : URIRef
227            IRI of the Triples Map inserted into the mapping.
228        """
229        triples_map_iri = URIRef(f'{mapping.base}#TriplesMap{number}')
230        subject_map_iri = BNode()
231        logical_source_iri = BNode()
232
233        mapping.add((logical_source_iri, RML.source, source_path))
234        mapping.add((logical_source_iri, RML.referenceFormulation, QL.CSV))
235        mapping.add((logical_source_iri, RDF.type, RML.LogicalSource))
236        mapping.add((triples_map_iri, RML.logicalSource, logical_source_iri))
237        mapping.add((triples_map_iri, R2RML.subjectMap, subject_map_iri))
238        mapping.add((triples_map_iri, RDF.type, R2RML.TriplesMap))
239        mapping.add((subject_map_iri, R2RML.template, subject_value))
240
241        for i in range(1, named_graphs + 1):
242            if static:
243                mapping.add((subject_map_iri, R2RML.graph,
244                             URIRef(f'http://example.org/graph{i}')))
245            else:
246                graph_map_iri = BNode()
247                mapping.add((subject_map_iri, R2RML.graphMap, graph_map_iri))
248                mapping.add((graph_map_iri, R2RML.template,
249                             Literal(f'http://example.org/graph{{p{i}}}')))
250
251        return triples_map_iri
252
253    def _add_triples_map_table(self, mapping: Graph, subject_value: Literal,
254                               table_name: Literal, number: int = 1,
255                               named_graphs: int = 0,
256                               static: bool = True) -> URIRef:
257        """Insert a TriplesMap into a [R2]RML mapping with a Logical Table
258
259        Parameters
260        ----------
261        mapping : Graph
262            [R2]RML mapping as an RDFLib Graph.
263        subject_value : Literal
264            Subject IRI template value.
265        table_name : Literal
266            SQL table name to add.
267        number : int
268            Triples Map number, default 1.
269        named_graphs : int
270            Number of named graphs, default 0.
271
272        Returns
273        -------
274        triples_map_iri : URIRef
275            IRI of the Triples Map inserted into the mapping.
276        """
277        triples_map_iri = URIRef(f'{mapping.base}#TriplesMap{number}')
278        subject_map_iri = BNode()
279        logical_table_iri = BNode()
280
281        mapping.add((logical_table_iri, R2RML.tableName, table_name))
282        mapping.add((logical_table_iri, RDF.type, R2RML.LogicalTable))
283        mapping.add((triples_map_iri, R2RML.logicalTable, logical_table_iri))
284        mapping.add((triples_map_iri, R2RML.subjectMap, subject_map_iri))
285        mapping.add((triples_map_iri, RDF.type, R2RML.TriplesMap))
286        mapping.add((subject_map_iri, R2RML.template, subject_value))
287
288        for i in range(1, named_graphs + 1):
289            if static:
290                mapping.add((subject_map_iri, R2RML.graph,
291                             URIRef(f'http://example.org/graph{i}')))
292            else:
293                graph_map_iri = BNode()
294                mapping.add((subject_map_iri, R2RML.graphMap, graph_map_iri))
295                mapping.add((graph_map_iri, R2RML.template,
296                             Literal(f'http://example.org/graph{{p{i}}}')))
297
298        return triples_map_iri
299
300    def _generate_mapping(self) -> Graph:
301        """Generate a [R2]RML mapping for a NamedGraph instance.
302
303        Returns
304        -------
305        mapping : Graph
306            [R2]RML mapping as an RDFLib Graph.
307        """
308        mapping: Graph = Graph(base='http://ex.com/')
309        mapping.bind('rr', R2RML)
310        mapping.bind('ql', QL)
311        mapping.bind('ex', EX)
312
313        for i in range(1, self._number_of_tms + 1):
314            subject_template = Literal(f'http://ex.com/table/{{p{i}}}')
315            if self._data_format == 'postgresql':
316                triples_map_iri = self._add_triples_map_table(mapping,
317                                                              subject_template,
318                                                              Literal('data'),
319                                                              number=i,
320                                                              named_graphs=self._number_of_ng_s,
321                                                              static=self._static)
322            elif self._data_format == 'csv':
323                csv_path = Literal('/data/shared/data.csv')
324                triples_map_iri = \
325                    self._add_triples_map_source(mapping, subject_template,
326                                                 csv_path, number=i,
327                                                 named_graphs=self._number_of_ng_s,
328                                                 static=self._static)
329            else:
330                msg = f'{self._data_format} not implemented'
331                raise NotImplementedError(msg)
332
333            for j in range(1, self._number_of_poms + 1):
334                self._add_predicate_object_map(mapping, triples_map_iri,
335                                               EX[f'p{j}'], Literal(f'p{j}'),
336                                               named_graphs=self._number_of_ng_pom,
337                                               static=self._static)
338
339        return mapping
340
341    def _generate_csv(self) -> bool:
342        """Generate the instance as CSV files.
343
344        Returns
345        -------
346        success : bool
347            True if successfull, false otherwise
348        """
349        os.makedirs(os.path.join(self.path(), 'data', 'shared'), exist_ok=True)
350        data_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE)
351        self._generate_dataframe().to_csv(data_path, index=False)
352
353        mapping_path = os.path.join(self.path(), 'data', 'shared',
354                                    CSV_MAPPING_FILE)
355        mapping: Graph = self._generate_mapping()
356        mapping.serialize(destination=mapping_path, format='turtle')
357        self._generate_scenario()
358
359        return True
360
361    def _generate_postgresql(self) -> bool:
362        """Generate the instance as PostgreSQL with CSV files to load.
363
364        Returns
365        -------
366        success : bool
367            True if successfull, false otherwise
368        """
369        os.makedirs(os.path.join(self.path(), 'data', 'shared'), exist_ok=True)
370        data_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE)
371        self._generate_dataframe().to_csv(data_path, index=False)
372
373        mapping_path = os.path.join(self.path(), 'data', 'shared',
374                                    RDB_MAPPING_FILE)
375        mapping: Graph = self._generate_mapping()
376        mapping.serialize(destination=mapping_path, format='turtle')
377        self._generate_scenario()
378
379        return True
380
381    def _generate_scenario(self) -> bool:
382        """Generate the metadata for this scenario.
383
384        Configures the execution pipeline automatically.
385
386        Returns
387        -------
388        success : bool
389            True if successfull, false otherwise
390        """
391        name: str = f'namedgraph_{self._number_of_ng_s}_' \
392                    f'{self._number_of_ng_pom}_{self._number_of_tms}_' \
393                    f'{self._number_of_poms}_{self._static}'
394        description: str = f'NamedGraph {self._number_of_tms}TM + ' + \
395                           f'{self._number_of_poms}POMs'
396        iri: str = f'http://example.org/mappings/{self._number_of_tms}/' + \
397                   f'{self._number_of_poms}/{self._number_of_ng_s}/' + \
398                   f'{self._number_of_ng_pom}/{self._static}'
399
400        if self._data_format == 'postgresql':
401            return self._generate_metadata(iri, name, description,
402                                           RDB_MAPPING_FILE,
403                                           serialization='nquads')
404        elif self._data_format == 'csv':
405            return self._generate_metadata(iri, name, description,
406                                           CSV_MAPPING_FILE,
407                                           serialization='nquads')
408        else:
409            raise NotImplementedError(f'{self._data_format} not implemented')
410
411        return False

Helper class that provides a standard way to create an ABC using inheritance.

NamedGraph( main_directory: str, verbose: bool, number_of_ng_pom: int, number_of_ng_s: int, static: bool, number_of_tms: int, number_of_poms: int, number_of_members: int, number_of_properties: int, value_size: int, data_format: str, engine: str)
27    def __init__(self, main_directory: str, verbose: bool,
28                 number_of_ng_pom: int, number_of_ng_s: int, static: bool,
29                 number_of_tms: int, number_of_poms: int,
30                 number_of_members: int, number_of_properties: int,
31                 value_size: int, data_format: str, engine: str):
32        """Initialize a NamedGraph scenario.
33
34        Parameters
35        ----------
36        main_directory : str
37            Root directory for generating instances of NamedGraph.
38        verbose : bool
39            Verbose logging enabled or not.
40        number_of_ng_pom : int
41            Number of named graphs per Predicate Object Map.
42        number_of_ng_s : int
43            Number of named graphs for Subject Map.
44        number_of_members : int
45            Number of members to generate, for example 5000 for 5K rows in a
46            tabular data structure.
47        number_of_properties : int
48            Number of properties per member to generate, for example 20 for
49            20 columns in a tabular data structure.
50        value_size : int
51            Number of characters to add to default value generation,
52            for example: 256 will expand all values to 256 characters.
53        data_format : str
54            Data format to use for generating the data set, for example:
55            "csv", "json", "xml", "postgresql", "mysql"
56        engine : str
57            Engine to use for execution of the generated scenario's instance,
58            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
59            or "OntopMaterialize"
60        """
61        self._number_of_ng_pom: int = number_of_ng_pom
62        self._number_of_ng_s: int = number_of_ng_s
63        self._static: bool = static
64        self._number_of_tms: int = number_of_tms
65        self._number_of_poms: int = number_of_poms
66        self._number_of_members: int = number_of_members
67        self._number_of_properties: int = number_of_properties
68        self._value_size: int = value_size
69
70        super().__init__(data_format, engine, main_directory, verbose)
71        self._logger = Logger(__name__, self._main_directory, self._verbose)

Initialize a NamedGraph scenario.

Parameters
  • main_directory (str): Root directory for generating instances of NamedGraph.
  • verbose (bool): Verbose logging enabled or not.
  • number_of_ng_pom (int): Number of named graphs per Predicate Object Map.
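  • number_of_ng_s (int): Number of named graphs for Subject Map.
  • static (bool): Whether named graphs are added as constant graph IRIs (True) or as templated graph maps (False).
  • number_of_tms (int): Number of Triples Maps to generate in the mapping.
  • number_of_poms (int): Number of Predicate Object Maps to generate per Triples Map.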
  • number_of_members (int): Number of members to generate, for example 5000 for 5K rows in a tabular data structure.
  • number_of_properties (int): Number of properties per member to generate, for example 20 for 20 columns in a tabular data structure.
  • value_size (int): Number of characters to add to default value generation, for example: 256 will expand all values to 256 characters.
  • data_format (str): Data format to use for generating the data set, for example: "csv", "json", "xml", "postgresql", "mysql"
  • engine (str): Engine to use for execution of the generated scenario's instance, for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC", or "OntopMaterialize"
def generate(self) -> bool:
73    def generate(self) -> bool:
74        """Generate the instance using the NamedGraph scenario.
75
76        Only CSV files are currently implemented!
77        """
78        if self._data_format == 'csv':
79            return self._generate_csv()
80        elif self._data_format == 'postgresql':
81            return self._generate_postgresql()
82        else:
83            raise NotImplementedError(f'Data format {self._data_format} '
84                                      f'is not implemented by {__name__}')

Generate the instance using the NamedGraph scenario.

Only the CSV and PostgreSQL data formats are currently implemented!

def path(self) -> str:
 86    def path(self) -> str:
 87        """Builds the file path for the instance of a NamedGraph scenario.
 88
 89        Returns
 90        -------
 91        path : str
 92            File path for the NamedGraph's instance.
 93        """
 94        key = f'namedgraph_{self._number_of_ng_s}SM-NG_' \
 95              f'{self._number_of_ng_pom}POM-NG_{self._number_of_tms}TM_' \
 96              f'{self._number_of_poms}POM_{self._static}'
 97        path = os.path.join(self._main_directory, self._engine,
 98                            self._data_format, key)
 99        self._logger.debug(f'Generating to {path}')
100        os.makedirs(path, exist_ok=True)
101        return path

Builds the file path for the instance of a NamedGraph scenario.

Returns
  • path (str): File path for the NamedGraph's instance.