bench_executor.ontop

Ontop is a Virtual Knowledge Graph system. It exposes the content of arbitrary relational databases as knowledge graphs. These graphs are virtual, which means that data remains in the data sources instead of being moved to another database.

Website: https://ontop-vkg.org
Repository: https://github.com/ontop/ontop

  1#!/usr/bin/env python3
  2
  3"""
  4Ontop is a Virtual Knowledge Graph system. It exposes the content of
  5arbitrary relational databases as knowledge graphs. These graphs are virtual,
  6which means that data remains in the data sources instead of being moved
  7to another database.
  8
  9**Website**: https://ontop-vkg.org<br>
 10**Repository**: https://github.com/ontop/ontop
 11"""
 12
 13import os
 14import psutil
 15import configparser
 16from rdflib import Graph, Namespace, RDF, URIRef
 17from timeout_decorator import timeout, TimeoutError  # type: ignore
 18from typing import Dict, Optional, cast
 19from bench_executor.container import Container
 20from bench_executor.logger import Logger
 21
 22VERSION = '5.0.0'  # Ontop release, used as the container image tag
 23TIMEOUT = 6 * 3600  # 6 hours, limit applied to materialization runs
 24R2RML = Namespace('http://www.w3.org/ns/r2rml#')  # R2RML vocabulary
 25
 26
 27class Ontop(Container):
 28    """Ontop container super class for OntopMaterialize and OntopVirtualize."""
 29    def __init__(self, name: str, data_path: str, logger: Logger, mode: str):
 30        """Creates an instance of the Ontop class.
 31
 32        Parameters
 33        ----------
 34        name : str
 35            Pretty name of the container.
 36        data_path: str
 37            Path to the data directory of the case.
 38        logger : Logger
 39            Logger to use for log messages.
 40        mode : str
 41            Ontop mode: `materialize` or `endpoint`
 42        """
 43        self._mode = mode
 44        self._headers: Dict[str, Dict[str, str]] = {}
 45        self._logger = logger
 46        self._data_path = data_path
 47
 48        if self._mode == 'endpoint':
 49            subdir = 'ontopvirtualize'
 50        elif self._mode == 'materialize':
 51            subdir = 'ontopmaterialize'
 52        else:
 53            raise ValueError(f'Unknown Ontop mode: "{self._mode}"')
 54        os.makedirs(os.path.join(self._data_path, subdir), exist_ok=True)
 55
 56        # Set Java heap to 1/2 of available memory instead of the default 1/4
 57        max_heap = int(psutil.virtual_memory().total * (1/2))
 58
 59        # Configure logging
 60        log_level = 'info'
 61        if self._logger.verbose:
 62            log_level = 'debug'
 63        self._logger.info(f'Initialized Ontop logger at "{log_level}" level')
 64
 65        environment = {'ONTOP_JAVA_ARGS': f'-Xmx{max_heap} -Xms{max_heap}',
 66                       'ONTOP_LOG_LEVEL': log_level}
 67        super().__init__(f'kgconstruct/ontop:v{VERSION}', name,
 68                         self._logger,
 69                         ports={'8888': '8888'},
 70                         environment=environment,
 71                         volumes=[f'{self._data_path}/'
 72                                  f'{self.root_mount_directory}:/data',
 73                                  f'{self._data_path}/shared:/data/shared'])
 74
 75    @property
 76    def root_mount_directory(self) -> str:
 77        """Subdirectory in the root directory of the case for Ontop.
 78
 79        Returns
 80        -------
 81        subdirectory : str
 82            Subdirectory of the root directory for Ontop.
 83
 84        """
 85        if self._mode == 'endpoint':
 86            return 'ontopvirtualize'
 87        elif self._mode == 'materialize':
 88            return 'ontopmaterialize'
 89        else:
 90            raise ValueError(f'Unknown Ontop mode: "{self._mode}"')
 91
 92    @property
 93    def endpoint(self) -> str:
 94        """SPARQL endpoint URL for Ontop.
 95
 96        Returns
 97        -------
 98        url : str
 99            SPARQL endpoint URL.
100        """
101        return 'http://localhost:8888/sparql'
102
103    @property
104    def headers(self) -> dict:
105        """HTTP headers of SPARQL queries for serialization formats.
106
107        Only supported serialization formats are included in the dictionary.
108        Currently, the following formats are supported:
109        - N-Triples
110        - N-Quads
111        - Turtle
112        - CSV
113        - RDF/JSON
114        - RDF/XML
115        - JSON-LD
116
117        Returns
118        -------
119        headers : dict
120            Dictionary of headers to use for each serialization format.
121        """
122        return self._headers
123
124    def _execute(self, arguments: list) -> bool:
125        """Execute Ontop with given arguments.
126
127        Parameters
128        ----------
129        arguments : list
130            Arguments to supply to Ontop.
131
132        Returns
133        -------
134        success : bool
135            Whether the execution succeeded or not.
136        """
137
138        cmd = f'/ontop/ontop {self._mode} {" ".join(arguments)}'
139        self._logger.info(f'Executing Ontop with command: {cmd}')
140        if self._mode == 'endpoint':
141            log_line = 'OntopEndpointApplication - Started ' + \
142                       'OntopEndpointApplication'
143            success = self.run_and_wait_for_log(log_line, cmd)
144        elif self._mode == 'materialize':
145            success = self.run_and_wait_for_exit(cmd)
146        else:
147            self._logger.error(f'Unknown Ontop mode "{self._mode}"')
148            success = False
149
150        return success
151
152    def _execute_mapping(self,
153                         config_file: str,
154                         arguments: list,
155                         mapping_file: str,
156                         output_file: Optional[str],
157                         rdb_username: str,
158                         rdb_password: str,
159                         rdb_host: str,
160                         rdb_port: int,
161                         rdb_name: str,
162                         rdb_type: str) -> bool:
163        """Execute a mapping file with Ontop.
164
165        Only relational databases are supported by
166        Ontop, thus the relational database parameters are mandantory.
167
168        Parameters
169        ----------
170        config_file : str
171            Name of the generated config file for Ontop.
172        arguments : list
173            List of arguments to pass to Ontop.
174        mapping_file : str
175            Name of the mapping file to use.
176        output_file : Optional[str]
177            Name of the output file to use. Only applicable for
178            materialization.
179        rdb_username : str
180            Username for the database.
181        rdb_password : str
182            Password for the database.
183        rdb_host : str
184            Hostname for the database.
185        rdb_port : int
186            Port for the database.
187        rdb_name : str
188            Database name for the database.
189        rdb_type : str
190            Database type.
191
192        Returns
193        -------
194        success : bool
195            Whether the execution was successfull or not.
196        """
197        # Generate INI configuration file since no CLI is available
198        config = configparser.ConfigParser()
199        config['root'] = {}
200        if rdb_type == 'MySQL':
201            dsn = f'jdbc:mysql://{rdb_host}:{rdb_port}/{rdb_name}'
202            config['root']['jdbc.url'] = dsn
203            config['root']['jdbc.driver'] = 'com.mysql.cj.jdbc.Driver'
204        elif rdb_type == 'PostgreSQL':
205            dsn = f'jdbc:postgresql://{rdb_host}:{rdb_port}/{rdb_name}'
206            config['root']['jdbc.url'] = dsn
207            config['root']['jdbc.driver'] = 'org.postgresql.Driver'
208        else:
209            msg = f'Unknown RDB type: "{rdb_type}"'
210            self._logger.error(msg)
211            raise ValueError(msg)
212        config['root']['jdbc.user'] = rdb_username
213        config['root']['jdbc.password'] = rdb_password
214
215        path = os.path.join(self._data_path, self.root_mount_directory)
216        os.makedirs(path, exist_ok=True)
217        with open(os.path.join(path, 'config.properties'), 'w') as f:
218            config.write(f, space_around_delimiters=False)
219
220        # .properties files are like .ini files but without a [HEADER]
221        # Use a [root] header and remove it after writing
222        with open(os.path.join(path, 'config.properties'), 'r') as f:
223            data = f.read()
224
225        with open(os.path.join(path, 'config.properties'), 'w') as f:
226            f.write(data.replace('[root]\n', ''))
227
228        # Compatibility with Ontop requiring rr:class
229        # Replace any rdf:type construction with rr:class
230        # Without this, a strange error is raised: 'The definition of the
231        # predicate is not always a ground term triple(s,p,o)'
232        g = Graph()
233        g.bind('r2rml', R2RML)
234        g.bind('rdf', RDF)
235        g.parse(os.path.join(self._data_path, 'shared',
236                             os.path.basename(mapping_file)))
237
238        for triples_map_iri, p, o in g.triples((None, RDF.type,
239                                                R2RML.TriplesMap)):
240            subject_map_iri = g.value(triples_map_iri, R2RML.subjectMap)
241
242            if subject_map_iri is None:
243                self._logger.warning("Subject Map not present in Triples Map")
244                break
245
246            iter_pom = g.triples((triples_map_iri,
247                                  R2RML.predicateObjectMap,
248                                  None))
249            for s, p, predicate_object_map_iri in iter_pom:
250                predicate_map_iri = g.value(predicate_object_map_iri,
251                                            R2RML.predicateMap)
252                object_map_iri = g.value(predicate_object_map_iri,
253                                         R2RML.objectMap)
254
255                if predicate_map_iri is None or object_map_iri is None:
256                    continue
257
258                # Check if PredicateObjectMap is pointing to a PredicateMap
259                # specifying rdf:type. Skip this PredicateObjectMap if not
260                if g.value(predicate_map_iri, R2RML.constant) != RDF.type:
261                    continue
262
263                # Retrieve the ObjectMap rr:constant value and add it as
264                # rr:class to the Subject Map is present
265                rdf_type_value = cast(URIRef,
266                                      g.value(object_map_iri, R2RML.constant))
267                if rdf_type_value is not None:
268                    iri = URIRef(rdf_type_value.toPython())
269                    g.add((subject_map_iri, R2RML['class'], iri))
270                else:
271                    msg = 'Cannot extract rr:class value, rdf:type value ' + \
272                          'is not a constant value!'
273                    self._logger.error(msg)
274                    return False
275
276                # Remove all triples associated with the rdf:type PredicateMap
277                for s, p, o in g.triples((predicate_map_iri, None, None)):
278                    g.remove((s, p, o))
279
280                # Remove all triples associated with the rdf:type ObjectMap
281                for s, p, o in g.triples((object_map_iri, None, None)):
282                    g.remove((s, p, o))
283
284                # Remove all triples associated with the
285                # rdf:type PredicateObjectMap
286                for s, p, o in g.triples((object_map_iri, None, None)):
287                    g.remove((s, p, o))
288
289                # Remove PredicateObjectMap from Triples Map
290                g.remove((triples_map_iri, R2RML.predicateObjectMap,
291                          predicate_object_map_iri))
292
293            destination = os.path.join(self._data_path,
294                                       self.root_mount_directory,
295                                       'mapping_converted.r2rml.ttl')
296            g.serialize(destination=destination, format='turtle')
297
298        arguments.append('-m')
299        arguments.append('/data/mapping_converted.r2rml.ttl')
300        if output_file is not None:
301            arguments.append('-o')
302            arguments.append(os.path.join('/data/shared/', output_file))
303        arguments.append('-p')
304        arguments.append('/data/config.properties')
305
306        return self._execute(arguments)
307
308
class OntopVirtualize(Ontop):
    """Ontop container that exposes a mapping as a SPARQL endpoint."""

    def __init__(self, data_path: str, config_path: str, directory: str,
                 verbose: bool):
        """Set up Ontop in `endpoint` mode.

        Parameters
        ----------
        data_path : str
            Path to the data directory of the case.
        config_path : str
            Path to the config directory of the case.
        directory : str
            Path to the directory to store logs.
        verbose : bool
            Enable verbose logs.
        """
        absolute_data_path = os.path.abspath(data_path)
        self._data_path = absolute_data_path
        self._config_path = os.path.abspath(config_path)
        endpoint_logger = Logger(__name__, directory, verbose)
        self._logger = endpoint_logger
        super().__init__('Ontop-Virtualize', absolute_data_path,
                         endpoint_logger, 'endpoint')

    def execute_mapping(self,
                        mapping_file: str,
                        serialization: str,
                        rdb_username: str,
                        rdb_password: str,
                        rdb_host: str,
                        rdb_port: int,
                        rdb_name: str,
                        rdb_type: str) -> bool:
        """Start an Ontop SPARQL endpoint for the given mapping.

        Ontop only works on relational databases, so all database
        parameters are required. The HTTP `Accept` headers are registered
        under these serialization keys: `ntriples`, `nquads`, `turtle`,
        `rdfjson`, `rdfxml`, `jsonld` and `csv`.

        Parameters
        ----------
        mapping_file : str
            Path to the mapping file to execute.
        serialization : str
            Serialization format to use, one of the keys listed above.
        rdb_username : str
            Username for the database.
        rdb_password : str
            Password for the database.
        rdb_host : str
            Hostname for the database.
        rdb_port : int
            Port for the database.
        rdb_name : str
            Database name for the database.
        rdb_type : str
            Database type.

        Returns
        -------
        success : bool
            Whether the endpoint was started or not.

        Raises
        ------
        ValueError
            If `serialization` is not one of the registered keys.
        """
        properties_file = (f'{self._data_path}/{self.root_mount_directory}'
                           '/config.properties')
        endpoint_arguments = ['--cors-allowed-origins=*', '--port=8888']

        # Register the Accept header of every format the endpoint can serve
        self._headers.update({
            'ntriples': {'Accept': 'application/n-triples'},
            'nquads': {'Accept': 'application/n-quads'},
            'turtle': {'Accept': 'text/turtle'},
            'rdfjson': {'Accept': 'application/rdf+json'},
            'rdfxml': {'Accept': 'application/rdf+xml'},
            'jsonld': {'Accept': 'application/ld+json'},
            'csv': {'Accept': 'text/csv'},
        })

        if serialization not in self._headers:
            msg = f'Unsupported serialization format "{serialization}" ' \
                  'for Ontop'
            self._logger.error(msg)
            raise ValueError(msg)

        # No output file: the endpoint answers queries instead of writing
        return super()._execute_mapping(properties_file, endpoint_arguments,
                                        mapping_file, None, rdb_username,
                                        rdb_password, rdb_host, rdb_port,
                                        rdb_name, rdb_type)
396
397
class OntopMaterialize(Ontop):
    """Ontop container that materializes a R2RML mapping into a file."""

    def __init__(self, data_path: str, config_path: str, directory: str,
                 verbose: bool):
        """Set up Ontop in `materialize` mode.

        Parameters
        ----------
        data_path : str
            Path to the data directory of the case.
        config_path : str
            Path to the config directory of the case.
        directory : str
            Path to the directory to store logs.
        verbose : bool
            Enable verbose logs.
        """
        absolute_data_path = os.path.abspath(data_path)
        self._data_path = absolute_data_path
        self._config_path = os.path.abspath(config_path)
        materialize_logger = Logger(__name__, directory, verbose)
        self._logger = materialize_logger

        # Make sure the Ontop directory of the case exists up front
        ontop_directory = os.path.join(absolute_data_path, 'ontopmaterialize')
        os.makedirs(ontop_directory, exist_ok=True)

        super().__init__('Ontop-Materialize', absolute_data_path,
                         materialize_logger, 'materialize')

    @timeout(TIMEOUT)
    def _execute_mapping_with_timeout(self, mapping_file: str,
                                      output_file: str,
                                      serialization: str,
                                      rdb_username: str,
                                      rdb_password: str,
                                      rdb_host: str,
                                      rdb_port: int,
                                      rdb_name: str,
                                      rdb_type: str) -> bool:
        """Run the materialization, guarded by the `TIMEOUT` decorator.

        Returns
        -------
        success : bool
            Whether the materialization succeeded or not.
        """
        properties_file = (f'{self._data_path}/{self.root_mount_directory}'
                           '/config.properties')
        materialize_arguments = ['-f', serialization]
        # Materialization writes a file, no HTTP headers are involved
        self._headers = {}
        return super()._execute_mapping(properties_file,
                                        materialize_arguments,
                                        mapping_file, output_file,
                                        rdb_username, rdb_password,
                                        rdb_host, rdb_port, rdb_name,
                                        rdb_type)

    def execute_mapping(self,
                        mapping_file: str,
                        output_file: str,
                        serialization: str,
                        rdb_username: str,
                        rdb_password: str,
                        rdb_host: str,
                        rdb_port: int,
                        rdb_name: str,
                        rdb_type: str) -> bool:
        """Materialize a R2RML mapping with Ontop.

        The run is aborted once `TIMEOUT` seconds have passed; a timeout is
        logged as a warning and reported as a failed execution. Ontop only
        works on relational databases, so all database parameters are
        required.

        Parameters
        ----------
        mapping_file : str
            Path to the mapping file to execute.
        output_file : str
            Name of the output file to store the triples in.
        serialization : str
            Serialization format to use, passed to Ontop with `-f`.
        rdb_username : str
            Username for the database.
        rdb_password : str
            Password for the database.
        rdb_host : str
            Hostname for the database.
        rdb_port : int
            Port for the database.
        rdb_name : str
            Database name for the database.
        rdb_type : str
            Database type.

        Returns
        -------
        success : bool
            Whether the materialization succeeded or not.
        """
        try:
            success = self._execute_mapping_with_timeout(mapping_file,
                                                         output_file,
                                                         serialization,
                                                         rdb_username,
                                                         rdb_password,
                                                         rdb_host,
                                                         rdb_port,
                                                         rdb_name,
                                                         rdb_type)
        except TimeoutError:
            self._logger.warning(
                f'Timeout ({TIMEOUT}s) reached for Ontop Materialize')
            success = False

        return success
# Ontop release, used as the tag of the container image.
VERSION = '5.0.0'
# Time limit in seconds (6 hours) applied to materialization runs.
TIMEOUT = 21600
# Namespace of the R2RML vocabulary.
R2RML = Namespace('http://www.w3.org/ns/r2rml#')
class Ontop(bench_executor.container.Container):
 28class Ontop(Container):
 29    """Ontop container super class for OntopMaterialize and OntopVirtualize."""
 30    def __init__(self, name: str, data_path: str, logger: Logger, mode: str):
 31        """Creates an instance of the Ontop class.
 32
 33        Parameters
 34        ----------
 35        name : str
 36            Pretty name of the container.
 37        data_path: str
 38            Path to the data directory of the case.
 39        logger : Logger
 40            Logger to use for log messages.
 41        mode : str
 42            Ontop mode: `materialize` or `endpoint`
 43        """
 44        self._mode = mode
 45        self._headers: Dict[str, Dict[str, str]] = {}
 46        self._logger = logger
 47        self._data_path = data_path
 48
 49        if self._mode == 'endpoint':
 50            subdir = 'ontopvirtualize'
 51        elif self._mode == 'materialize':
 52            subdir = 'ontopmaterialize'
 53        else:
 54            raise ValueError(f'Unknown Ontop mode: "{self._mode}"')
 55        os.makedirs(os.path.join(self._data_path, subdir), exist_ok=True)
 56
 57        # Set Java heap to 1/2 of available memory instead of the default 1/4
 58        max_heap = int(psutil.virtual_memory().total * (1/2))
 59
 60        # Configure logging
 61        log_level = 'info'
 62        if self._logger.verbose:
 63            log_level = 'debug'
 64        self._logger.info(f'Initialized Ontop logger at "{log_level}" level')
 65
 66        environment = {'ONTOP_JAVA_ARGS': f'-Xmx{max_heap} -Xms{max_heap}',
 67                       'ONTOP_LOG_LEVEL': log_level}
 68        super().__init__(f'kgconstruct/ontop:v{VERSION}', name,
 69                         self._logger,
 70                         ports={'8888': '8888'},
 71                         environment=environment,
 72                         volumes=[f'{self._data_path}/'
 73                                  f'{self.root_mount_directory}:/data',
 74                                  f'{self._data_path}/shared:/data/shared'])
 75
 76    @property
 77    def root_mount_directory(self) -> str:
 78        """Subdirectory in the root directory of the case for Ontop.
 79
 80        Returns
 81        -------
 82        subdirectory : str
 83            Subdirectory of the root directory for Ontop.
 84
 85        """
 86        if self._mode == 'endpoint':
 87            return 'ontopvirtualize'
 88        elif self._mode == 'materialize':
 89            return 'ontopmaterialize'
 90        else:
 91            raise ValueError(f'Unknown Ontop mode: "{self._mode}"')
 92
 93    @property
 94    def endpoint(self) -> str:
 95        """SPARQL endpoint URL for Ontop.
 96
 97        Returns
 98        -------
 99        url : str
100            SPARQL endpoint URL.
101        """
102        return 'http://localhost:8888/sparql'
103
104    @property
105    def headers(self) -> dict:
106        """HTTP headers of SPARQL queries for serialization formats.
107
108        Only supported serialization formats are included in the dictionary.
109        Currently, the following formats are supported:
110        - N-Triples
111        - N-Quads
112        - Turtle
113        - CSV
114        - RDF/JSON
115        - RDF/XML
116        - JSON-LD
117
118        Returns
119        -------
120        headers : dict
121            Dictionary of headers to use for each serialization format.
122        """
123        return self._headers
124
125    def _execute(self, arguments: list) -> bool:
126        """Execute Ontop with given arguments.
127
128        Parameters
129        ----------
130        arguments : list
131            Arguments to supply to Ontop.
132
133        Returns
134        -------
135        success : bool
136            Whether the execution succeeded or not.
137        """
138
139        cmd = f'/ontop/ontop {self._mode} {" ".join(arguments)}'
140        self._logger.info(f'Executing Ontop with command: {cmd}')
141        if self._mode == 'endpoint':
142            log_line = 'OntopEndpointApplication - Started ' + \
143                       'OntopEndpointApplication'
144            success = self.run_and_wait_for_log(log_line, cmd)
145        elif self._mode == 'materialize':
146            success = self.run_and_wait_for_exit(cmd)
147        else:
148            self._logger.error(f'Unknown Ontop mode "{self._mode}"')
149            success = False
150
151        return success
152
153    def _execute_mapping(self,
154                         config_file: str,
155                         arguments: list,
156                         mapping_file: str,
157                         output_file: Optional[str],
158                         rdb_username: str,
159                         rdb_password: str,
160                         rdb_host: str,
161                         rdb_port: int,
162                         rdb_name: str,
163                         rdb_type: str) -> bool:
164        """Execute a mapping file with Ontop.
165
166        Only relational databases are supported by
167        Ontop, thus the relational database parameters are mandantory.
168
169        Parameters
170        ----------
171        config_file : str
172            Name of the generated config file for Ontop.
173        arguments : list
174            List of arguments to pass to Ontop.
175        mapping_file : str
176            Name of the mapping file to use.
177        output_file : Optional[str]
178            Name of the output file to use. Only applicable for
179            materialization.
180        rdb_username : str
181            Username for the database.
182        rdb_password : str
183            Password for the database.
184        rdb_host : str
185            Hostname for the database.
186        rdb_port : int
187            Port for the database.
188        rdb_name : str
189            Database name for the database.
190        rdb_type : str
191            Database type.
192
193        Returns
194        -------
195        success : bool
196            Whether the execution was successfull or not.
197        """
198        # Generate INI configuration file since no CLI is available
199        config = configparser.ConfigParser()
200        config['root'] = {}
201        if rdb_type == 'MySQL':
202            dsn = f'jdbc:mysql://{rdb_host}:{rdb_port}/{rdb_name}'
203            config['root']['jdbc.url'] = dsn
204            config['root']['jdbc.driver'] = 'com.mysql.cj.jdbc.Driver'
205        elif rdb_type == 'PostgreSQL':
206            dsn = f'jdbc:postgresql://{rdb_host}:{rdb_port}/{rdb_name}'
207            config['root']['jdbc.url'] = dsn
208            config['root']['jdbc.driver'] = 'org.postgresql.Driver'
209        else:
210            msg = f'Unknown RDB type: "{rdb_type}"'
211            self._logger.error(msg)
212            raise ValueError(msg)
213        config['root']['jdbc.user'] = rdb_username
214        config['root']['jdbc.password'] = rdb_password
215
216        path = os.path.join(self._data_path, self.root_mount_directory)
217        os.makedirs(path, exist_ok=True)
218        with open(os.path.join(path, 'config.properties'), 'w') as f:
219            config.write(f, space_around_delimiters=False)
220
221        # .properties files are like .ini files but without a [HEADER]
222        # Use a [root] header and remove it after writing
223        with open(os.path.join(path, 'config.properties'), 'r') as f:
224            data = f.read()
225
226        with open(os.path.join(path, 'config.properties'), 'w') as f:
227            f.write(data.replace('[root]\n', ''))
228
229        # Compatibility with Ontop requiring rr:class
230        # Replace any rdf:type construction with rr:class
231        # Without this, a strange error is raised: 'The definition of the
232        # predicate is not always a ground term triple(s,p,o)'
233        g = Graph()
234        g.bind('r2rml', R2RML)
235        g.bind('rdf', RDF)
236        g.parse(os.path.join(self._data_path, 'shared',
237                             os.path.basename(mapping_file)))
238
239        for triples_map_iri, p, o in g.triples((None, RDF.type,
240                                                R2RML.TriplesMap)):
241            subject_map_iri = g.value(triples_map_iri, R2RML.subjectMap)
242
243            if subject_map_iri is None:
244                self._logger.warning("Subject Map not present in Triples Map")
245                break
246
247            iter_pom = g.triples((triples_map_iri,
248                                  R2RML.predicateObjectMap,
249                                  None))
250            for s, p, predicate_object_map_iri in iter_pom:
251                predicate_map_iri = g.value(predicate_object_map_iri,
252                                            R2RML.predicateMap)
253                object_map_iri = g.value(predicate_object_map_iri,
254                                         R2RML.objectMap)
255
256                if predicate_map_iri is None or object_map_iri is None:
257                    continue
258
259                # Check if PredicateObjectMap is pointing to a PredicateMap
260                # specifying rdf:type. Skip this PredicateObjectMap if not
261                if g.value(predicate_map_iri, R2RML.constant) != RDF.type:
262                    continue
263
264                # Retrieve the ObjectMap rr:constant value and add it as
265                # rr:class to the Subject Map is present
266                rdf_type_value = cast(URIRef,
267                                      g.value(object_map_iri, R2RML.constant))
268                if rdf_type_value is not None:
269                    iri = URIRef(rdf_type_value.toPython())
270                    g.add((subject_map_iri, R2RML['class'], iri))
271                else:
272                    msg = 'Cannot extract rr:class value, rdf:type value ' + \
273                          'is not a constant value!'
274                    self._logger.error(msg)
275                    return False
276
277                # Remove all triples associated with the rdf:type PredicateMap
278                for s, p, o in g.triples((predicate_map_iri, None, None)):
279                    g.remove((s, p, o))
280
281                # Remove all triples associated with the rdf:type ObjectMap
282                for s, p, o in g.triples((object_map_iri, None, None)):
283                    g.remove((s, p, o))
284
285                # Remove all triples associated with the
286                # rdf:type PredicateObjectMap
287                for s, p, o in g.triples((object_map_iri, None, None)):
288                    g.remove((s, p, o))
289
290                # Remove PredicateObjectMap from Triples Map
291                g.remove((triples_map_iri, R2RML.predicateObjectMap,
292                          predicate_object_map_iri))
293
294            destination = os.path.join(self._data_path,
295                                       self.root_mount_directory,
296                                       'mapping_converted.r2rml.ttl')
297            g.serialize(destination=destination, format='turtle')
298
299        arguments.append('-m')
300        arguments.append('/data/mapping_converted.r2rml.ttl')
301        if output_file is not None:
302            arguments.append('-o')
303            arguments.append(os.path.join('/data/shared/', output_file))
304        arguments.append('-p')
305        arguments.append('/data/config.properties')
306
307        return self._execute(arguments)

Ontop container super class for OntopMaterialize and OntopVirtualize.

Ontop( name: str, data_path: str, logger: bench_executor.logger.Logger, mode: str)
30    def __init__(self, name: str, data_path: str, logger: Logger, mode: str):
31        """Creates an instance of the Ontop class.
32
33        Parameters
34        ----------
35        name : str
36            Pretty name of the container.
37        data_path: str
38            Path to the data directory of the case.
39        logger : Logger
40            Logger to use for log messages.
41        mode : str
42            Ontop mode: `materialize` or `endpoint`
43        """
44        self._mode = mode
45        self._headers: Dict[str, Dict[str, str]] = {}
46        self._logger = logger
47        self._data_path = data_path
48
49        if self._mode == 'endpoint':
50            subdir = 'ontopvirtualize'
51        elif self._mode == 'materialize':
52            subdir = 'ontopmaterialize'
53        else:
54            raise ValueError(f'Unknown Ontop mode: "{self._mode}"')
55        os.makedirs(os.path.join(self._data_path, subdir), exist_ok=True)
56
57        # Set Java heap to 1/2 of available memory instead of the default 1/4
58        max_heap = int(psutil.virtual_memory().total * (1/2))
59
60        # Configure logging
61        log_level = 'info'
62        if self._logger.verbose:
63            log_level = 'debug'
64        self._logger.info(f'Initialized Ontop logger at "{log_level}" level')
65
66        environment = {'ONTOP_JAVA_ARGS': f'-Xmx{max_heap} -Xms{max_heap}',
67                       'ONTOP_LOG_LEVEL': log_level}
68        super().__init__(f'kgconstruct/ontop:v{VERSION}', name,
69                         self._logger,
70                         ports={'8888': '8888'},
71                         environment=environment,
72                         volumes=[f'{self._data_path}/'
73                                  f'{self.root_mount_directory}:/data',
74                                  f'{self._data_path}/shared:/data/shared'])

Creates an instance of the Ontop class.

Parameters
  • name (str): Pretty name of the container.
  • data_path (str): Path to the data directory of the case.
  • logger (Logger): Logger to use for log messages.
  • mode (str): Ontop mode: materialize or endpoint
root_mount_directory: str
76    @property
77    def root_mount_directory(self) -> str:
78        """Subdirectory in the root directory of the case for Ontop.
79
80        Returns
81        -------
82        subdirectory : str
83            Subdirectory of the root directory for Ontop.
84
85        """
86        if self._mode == 'endpoint':
87            return 'ontopvirtualize'
88        elif self._mode == 'materialize':
89            return 'ontopmaterialize'
90        else:
91            raise ValueError(f'Unknown Ontop mode: "{self._mode}"')

Subdirectory in the root directory of the case for Ontop.

Returns
  • subdirectory (str): Subdirectory of the root directory for Ontop.
endpoint: str
 93    @property
 94    def endpoint(self) -> str:
 95        """SPARQL endpoint URL for Ontop.
 96
 97        Returns
 98        -------
 99        url : str
100            SPARQL endpoint URL.
101        """
102        return 'http://localhost:8888/sparql'

SPARQL endpoint URL for Ontop.

Returns
  • url (str): SPARQL endpoint URL.
headers: dict
104    @property
105    def headers(self) -> dict:
106        """HTTP headers of SPARQL queries for serialization formats.
107
108        Only supported serialization formats are included in the dictionary.
109        Currently, the following formats are supported:
110        - N-Triples
111        - N-Quads
112        - Turtle
113        - CSV
114        - RDF/JSON
115        - RDF/XML
116        - JSON-LD
117
118        Returns
119        -------
120        headers : dict
121            Dictionary of headers to use for each serialization format.
122        """
123        return self._headers

HTTP headers of SPARQL queries for serialization formats.

Only supported serialization formats are included in the dictionary. Currently, the following formats are supported:

  • N-Triples
  • N-Quads
  • Turtle
  • CSV
  • RDF/JSON
  • RDF/XML
  • JSON-LD
Returns
  • headers (dict): Dictionary of headers to use for each serialization format.
class OntopVirtualize(Ontop):
class OntopVirtualize(Ontop):
    """Ontop container that serves a mapping as a SPARQL endpoint."""
    def __init__(self, data_path: str, config_path: str, directory: str,
                 verbose: bool):
        """Create an OntopVirtualize instance running in `endpoint` mode.

        Parameters
        ----------
        data_path : str
            Path to the data directory of the case.
        config_path : str
            Path to the config directory of the case.
        directory : str
            Path to the directory to store logs.
        verbose : bool
            Enable verbose logs.
        """
        # Both paths are stored in absolute form
        self._data_path = data_dir = os.path.abspath(data_path)
        self._config_path = os.path.abspath(config_path)
        self._logger = log = Logger(__name__, directory, verbose)
        super().__init__(name='Ontop-Virtualize', data_path=data_dir,
                         logger=log, mode='endpoint')

    def execute_mapping(self,
                        mapping_file: str,
                        serialization: str,
                        rdb_username: str,
                        rdb_password: str,
                        rdb_host: str,
                        rdb_port: int,
                        rdb_name: str,
                        rdb_type: str) -> bool:
        """Start an Ontop SPARQL endpoint with a mapping.

        Only relational databases are supported by Ontop, thus the
        relational database parameters are mandatory. Accept headers are
        registered for the following serialization formats:
        - N-Triples (Ontop v5+)
        - N-Quads (Ontop v5+)
        - Turtle
        - RDF/JSON
        - RDF/XML
        - JSON-LD
        - CSV

        Parameters
        ----------
        mapping_file : str
            Path to the mapping file to execute.
        serialization : str
            Serialization format to use.
        rdb_username : str
            Username for the database.
        rdb_password : str
            Password for the database.
        rdb_host : str
            Hostname for the database.
        rdb_port : int
            Port for the database.
        rdb_name : str
            Database name for the database.
        rdb_type : str
            Database type.

        Returns
        -------
        success : bool
            Whether the execution was successful or not.

        Raises
        ------
        ValueError
            If `serialization` is not one of the registered formats.
        """
        config_file = (f'{self._data_path}/{self.root_mount_directory}'
                       '/config.properties')
        arguments = ['--cors-allowed-origins=*', '--port=8888']

        # Register the Accept header of every format before validating, so
        # the headers are available even when the request is rejected.
        self._headers.update({
            'ntriples': {'Accept': 'application/n-triples'},
            'nquads': {'Accept': 'application/n-quads'},
            'turtle': {'Accept': 'text/turtle'},
            'rdfjson': {'Accept': 'application/rdf+json'},
            'rdfxml': {'Accept': 'application/rdf+xml'},
            'jsonld': {'Accept': 'application/ld+json'},
            'csv': {'Accept': 'text/csv'},
        })

        if serialization not in self._headers:
            msg = f'Unsupported serialization format "{serialization}" ' \
                  'for Ontop'
            self._logger.error(msg)
            raise ValueError(msg)

        # An endpoint writes no output file, hence None
        return super()._execute_mapping(config_file, arguments,
                                        mapping_file, None, rdb_username,
                                        rdb_password, rdb_host, rdb_port,
                                        rdb_name, rdb_type)

OntopVirtualize container for setting up an Ontop SPARQL endpoint.

OntopVirtualize(data_path: str, config_path: str, directory: str, verbose: bool)
312    def __init__(self, data_path: str, config_path: str, directory: str,
313                 verbose: bool):
314        """Creates an instance of the OntopVirtualize class.
315
316        Parameters
317        ----------
318        data_path : str
319            Path to the data directory of the case.
320        config_path : str
321            Path to the config directory of the case.
322        directory : str
323            Path to the directory to store logs.
324        verbose : bool
325            Enable verbose logs.
326        """
327        self._data_path = os.path.abspath(data_path)
328        self._config_path = os.path.abspath(config_path)
329        self._logger = Logger(__name__, directory, verbose)
330        super().__init__('Ontop-Virtualize', self._data_path, self._logger,
331                         'endpoint')

Creates an instance of the OntopVirtualize class.

Parameters
  • data_path (str): Path to the data directory of the case.
  • config_path (str): Path to the config directory of the case.
  • directory (str): Path to the directory to store logs.
  • verbose (bool): Enable verbose logs.
def execute_mapping( self, mapping_file: str, serialization: str, rdb_username: str, rdb_password: str, rdb_host: str, rdb_port: int, rdb_name: str, rdb_type: str) -> bool:
333    def execute_mapping(self,
334                        mapping_file: str,
335                        serialization: str,
336                        rdb_username: str,
337                        rdb_password: str,
338                        rdb_host: str,
339                        rdb_port: int,
340                        rdb_name: str,
341                        rdb_type: str) -> bool:
342        """Start an Ontop SPARQL endpoint with a mapping.
343
344        Only relational databases are supported by
345        Ontop, thus the relational database parameters are mandantory.
346        Ontop SPARQL endpoint supports the following serialization formats:
347        - N-Triples (Ontop v5+)
348        - N-Quads (Ontop v5+)
349        - Turtle
350        - RDF/JSON
351        - JSON-LD
352        - CSV
353
354        Parameters
355        ----------
356        mapping_file : str
357            Path to the mapping file to execute.
358        serialization : str
359            Serialization format to use.
360        rdb_username : str
361            Username for the database.
362        rdb_password : str
363            Password for the database.
364        rdb_host : str
365            Hostname for the database.
366        rdb_port : int
367            Port for the database.
368        rdb_name : str
369            Database name for the database.
370        rdb_type : str
371            Database type.
372
373        Returns
374        -------
375        success : bool
376            Whether the execution was successfull or not.
377        """
378        config_file = f'{self._data_path}/{self.root_mount_directory}' + \
379                      '/config.properties'
380        arguments = ['--cors-allowed-origins=*', '--port=8888']
381        self._headers['ntriples'] = {'Accept': 'application/n-triples'}
382        self._headers['nquads'] = {'Accept': 'application/n-quads'}
383        self._headers['turtle'] = {'Accept': 'text/turtle'}
384        self._headers['rdfjson'] = {'Accept': 'application/rdf+json'}
385        self._headers['rdfxml'] = {'Accept': 'application/rdf+xml'}
386        self._headers['jsonld'] = {'Accept': 'application/ld+json'}
387        self._headers['csv'] = {'Accept': 'text/csv'}
388        if serialization not in self._headers.keys():
389            msg = 'Unsupported serialization format ' + \
390                  f'"{serialization}" for Ontop'
391            self._logger.error(msg)
392            raise ValueError(msg)
393        return super()._execute_mapping(config_file, arguments,
394                                        mapping_file, None, rdb_username,
395                                        rdb_password, rdb_host, rdb_port,
396                                        rdb_name, rdb_type)

Start an Ontop SPARQL endpoint with a mapping.

Only relational databases are supported by Ontop, thus the relational database parameters are mandatory. Ontop SPARQL endpoint supports the following serialization formats:

  • N-Triples (Ontop v5+)
  • N-Quads (Ontop v5+)
  • Turtle
  • RDF/JSON
  • JSON-LD
  • CSV
Parameters
  • mapping_file (str): Path to the mapping file to execute.
  • serialization (str): Serialization format to use.
  • rdb_username (str): Username for the database.
  • rdb_password (str): Password for the database.
  • rdb_host (str): Hostname for the database.
  • rdb_port (int): Port for the database.
  • rdb_name (str): Database name for the database.
  • rdb_type (str): Database type.
Returns
  • success (bool): Whether the execution was successful or not.
class OntopMaterialize(Ontop):
class OntopMaterialize(Ontop):
    """Ontop container that materializes the triples of a R2RML mapping."""
    def __init__(self, data_path: str, config_path: str, directory: str,
                 verbose: bool):
        """Create an OntopMaterialize instance running in `materialize` mode.

        Parameters
        ----------
        data_path : str
            Path to the data directory of the case.
        config_path : str
            Path to the config directory of the case.
        directory : str
            Path to the directory to store logs.
        verbose : bool
            Enable verbose logs.
        """
        # Both paths are stored in absolute form
        self._data_path = data_dir = os.path.abspath(data_path)
        self._config_path = os.path.abspath(config_path)
        self._logger = log = Logger(__name__, directory, verbose)

        # Create the materialize directory before the base class is set up
        materialize_dir = os.path.join(data_dir, 'ontopmaterialize')
        os.makedirs(materialize_dir, exist_ok=True)
        super().__init__(name='Ontop-Materialize', data_path=data_dir,
                         logger=log, mode='materialize')

    @timeout(TIMEOUT)
    def _execute_mapping_with_timeout(self, mapping_file: str,
                                      output_file: str,
                                      serialization: str,
                                      rdb_username: str,
                                      rdb_password: str,
                                      rdb_host: str,
                                      rdb_port: int,
                                      rdb_name: str,
                                      rdb_type: str) -> bool:
        """Execute a mapping, guarded by the `TIMEOUT` decorator.

        Returns
        -------
        success : bool
            Whether the execution was successful or not.
        """
        # Materialization does not go over HTTP: no headers are offered
        self._headers = {}
        arguments = ['-f', serialization]
        config_file = (f'{self._data_path}/{self.root_mount_directory}'
                       '/config.properties')
        return super()._execute_mapping(config_file, arguments,
                                        mapping_file, output_file,
                                        rdb_username, rdb_password,
                                        rdb_host, rdb_port, rdb_name, rdb_type)

    def execute_mapping(self,
                        mapping_file: str,
                        output_file: str,
                        serialization: str,
                        rdb_username: str,
                        rdb_password: str,
                        rdb_host: str,
                        rdb_port: int,
                        rdb_name: str,
                        rdb_type: str) -> bool:
        """Execute a R2RML mapping with Ontop.

        N-Quads and N-Triples are currently supported as serialization
        for Ontop materialize. Only relational databases are supported by
        Ontop, thus the relational database parameters are mandatory.

        Parameters
        ----------
        mapping_file : str
            Path to the mapping file to execute.
        output_file : str
            Name of the output file to store the triples in. This is not used
            for OntopVirtualize.
        serialization : str
            Serialization format to use.
        rdb_username : str
            Username for the database.
        rdb_password : str
            Password for the database.
        rdb_host : str
            Hostname for the database.
        rdb_port : int
            Port for the database.
        rdb_name : str
            Database name for the database.
        rdb_type : str
            Database type.

        Returns
        -------
        success : bool
            Whether the execution was successful or not. A timeout is
            logged as a warning and reported as a failure.
        """
        try:
            success = self._execute_mapping_with_timeout(mapping_file,
                                                         output_file,
                                                         serialization,
                                                         rdb_username,
                                                         rdb_password,
                                                         rdb_host,
                                                         rdb_port,
                                                         rdb_name,
                                                         rdb_type)
        except TimeoutError:
            # Running out of time counts as an unsuccessful execution
            self._logger.warning(
                f'Timeout ({TIMEOUT}s) reached for Ontop Materialize')
            success = False

        return success

OntopMaterialize container to execute a R2RML mapping.

OntopMaterialize(data_path: str, config_path: str, directory: str, verbose: bool)
401    def __init__(self, data_path: str, config_path: str, directory: str,
402                 verbose: bool):
403        """Creates an instance of the OntopMaterialize class.
404
405        Parameters
406        ----------
407        data_path : str
408            Path to the data directory of the case.
409        config_path : str
410            Path to the config directory of the case.
411        directory : str
412            Path to the directory to store logs.
413        verbose : bool
414            Enable verbose logs.
415        """
416        self._data_path = os.path.abspath(data_path)
417        self._config_path = os.path.abspath(config_path)
418        self._logger = Logger(__name__, directory, verbose)
419        os.makedirs(os.path.join(self._data_path, 'ontopmaterialize'),
420                    exist_ok=True)
421        super().__init__('Ontop-Materialize', self._data_path, self._logger,
422                         'materialize')

Creates an instance of the OntopMaterialize class.

Parameters
  • data_path (str): Path to the data directory of the case.
  • config_path (str): Path to the config directory of the case.
  • directory (str): Path to the directory to store logs.
  • verbose (bool): Enable verbose logs.
def execute_mapping( self, mapping_file: str, output_file: str, serialization: str, rdb_username: str, rdb_password: str, rdb_host: str, rdb_port: int, rdb_name: str, rdb_type: str) -> bool:
450    def execute_mapping(self,
451                        mapping_file: str,
452                        output_file: str,
453                        serialization: str,
454                        rdb_username: str,
455                        rdb_password: str,
456                        rdb_host: str,
457                        rdb_port: int,
458                        rdb_name: str,
459                        rdb_type: str) -> bool:
460        """Execute a R2RML mapping with Ontop
461
462        N-Quads and N-Triples are currently supported as serialization
463        for Ontop materialize. Only relational databases are supported by
464        Ontop, thus the relational database parameters are mandantory.
465
466        Parameters
467        ----------
468        mapping_file : str
469            Path to the mapping file to execute.
470        output_file : str
471            Name of the output file to store the triples in. This is not used
472            for OntopVirtualize.
473        serialization : str
474            Serialization format to use.
475        rdb_username : str
476            Username for the database.
477        rdb_password : str
478            Password for the database.
479        rdb_host : str
480            Hostname for the database.
481        rdb_port : int
482            Port for the database.
483        rdb_name : str
484            Database name for the database.
485        rdb_type : str
486            Database type.
487
488        Returns
489        -------
490        success : bool
491            Whether the execution was successfull or not.
492        """
493        try:
494            return self._execute_mapping_with_timeout(mapping_file,
495                                                      output_file,
496                                                      serialization,
497                                                      rdb_username,
498                                                      rdb_password,
499                                                      rdb_host,
500                                                      rdb_port,
501                                                      rdb_name,
502                                                      rdb_type)
503        except TimeoutError:
504            msg = f'Timeout ({TIMEOUT}s) reached for Ontop Materialize'
505            self._logger.warning(msg)
506
507        return False

Execute a R2RML mapping with Ontop

N-Quads and N-Triples are currently supported as serialization for Ontop materialize. Only relational databases are supported by Ontop, thus the relational database parameters are mandatory.

Parameters
  • mapping_file (str): Path to the mapping file to execute.
  • output_file (str): Name of the output file to store the triples in. This is not used for OntopVirtualize.
  • serialization (str): Serialization format to use.
  • rdb_username (str): Username for the database.
  • rdb_password (str): Password for the database.
  • rdb_host (str): Hostname for the database.
  • rdb_port (int): Port for the database.
  • rdb_name (str): Database name for the database.
  • rdb_type (str): Database type.
Returns
  • success (bool): Whether the execution was successful or not.