bench_executor.ontop
Ontop is a Virtual Knowledge Graph system. It exposes the content of arbitrary relational databases as knowledge graphs. These graphs are virtual, which means that data remains in the data sources instead of being moved to another database.
Website: https://ontop-vkg.org
Repository: https://github.com/ontop/ontop
#!/usr/bin/env python3

"""
Ontop is a Virtual Knowledge Graph system. It exposes the content of
arbitrary relational databases as knowledge graphs. These graphs are virtual,
which means that data remains in the data sources instead of being moved
to another database.

**Website**: https://ontop-vkg.org<br>
**Repository**: https://github.com/ontop/ontop
"""

import io
import os
import psutil
import configparser
from rdflib import Graph, Namespace, RDF, URIRef
from timeout_decorator import timeout, TimeoutError  # type: ignore
from typing import Dict, Optional, cast
from bench_executor.container import Container
from bench_executor.logger import Logger

VERSION = '5.0.0'
TIMEOUT = 6 * 3600  # 6 hours
R2RML = Namespace('http://www.w3.org/ns/r2rml#')


class Ontop(Container):
    """Ontop container super class for OntopMaterialize and OntopVirtualize."""

    def __init__(self, name: str, data_path: str, logger: Logger, mode: str):
        """Creates an instance of the Ontop class.

        Parameters
        ----------
        name : str
            Pretty name of the container.
        data_path : str
            Path to the data directory of the case.
        logger : Logger
            Logger to use for log messages.
        mode : str
            Ontop mode: `materialize` or `endpoint`.

        Raises
        ------
        ValueError
            If `mode` is neither `materialize` nor `endpoint`.
        """
        self._mode = mode
        self._headers: Dict[str, Dict[str, str]] = {}
        self._logger = logger
        self._data_path = data_path

        # The mode-specific directory is mounted as /data below, so it has
        # to exist before the container is configured.
        subdirs = {'endpoint': 'ontopvirtualize',
                   'materialize': 'ontopmaterialize'}
        if self._mode not in subdirs:
            raise ValueError(f'Unknown Ontop mode: "{self._mode}"')
        os.makedirs(os.path.join(self._data_path, subdirs[self._mode]),
                    exist_ok=True)

        # Set Java heap to 1/2 of the total system memory instead of the
        # default 1/4
        total_memory = psutil.virtual_memory().total
        max_heap = int(total_memory * (1/2))

        # Configure logging
        log_level = 'debug' if self._logger.verbose else 'info'
        self._logger.info(f'Initialized Ontop logger at "{log_level}" level')

        java_args = f'-Xmx{max_heap} -Xms{max_heap}'
        environment = {'ONTOP_JAVA_ARGS': java_args,
                       'ONTOP_LOG_LEVEL': log_level}
        volumes = [f'{self._data_path}/{self.root_mount_directory}:/data',
                   f'{self._data_path}/shared:/data/shared']
        super().__init__(f'kgconstruct/ontop:v{VERSION}', name,
                         self._logger,
                         ports={'8888': '8888'},
                         environment=environment,
                         volumes=volumes)

    @property
    def root_mount_directory(self) -> str:
        """Subdirectory in the root directory of the case for Ontop.

        Returns
        -------
        subdirectory : str
            Subdirectory of the root directory for Ontop.

        Raises
        ------
        ValueError
            If the configured mode is neither `materialize` nor `endpoint`.
        """
        mode = self._mode
        if mode == 'materialize':
            return 'ontopmaterialize'
        if mode == 'endpoint':
            return 'ontopvirtualize'
        raise ValueError(f'Unknown Ontop mode: "{self._mode}"')

    @property
    def endpoint(self) -> str:
        """SPARQL endpoint URL for Ontop.

        Returns
        -------
        url : str
            SPARQL endpoint URL.
        """
        # Port 8888 matches the port mapping configured in __init__.
        url = 'http://localhost:8888/sparql'
        return url

    @property
    def headers(self) -> dict:
        """HTTP headers of SPARQL queries for serialization formats.

        Only supported serialization formats are included in the dictionary.
        Currently, the following formats are supported:
        - N-Triples
        - N-Quads
        - Turtle
        - CSV
        - RDF/JSON
        - RDF/XML
        - JSON-LD

        Returns
        -------
        headers : dict
            Dictionary of headers to use for each serialization format.
        """
        registered = self._headers
        return registered

    def _execute(self, arguments: list) -> bool:
        """Execute Ontop with given arguments.

        Parameters
        ----------
        arguments : list
            Arguments to supply to Ontop.

        Returns
        -------
        success : bool
            Whether the execution succeeded or not.
        """
        cmd = f'/ontop/ontop {self._mode} {" ".join(arguments)}'
        self._logger.info(f'Executing Ontop with command: {cmd}')

        if self._mode == 'endpoint':
            # The endpoint is a long-running process: wait for the log line
            # announcing that the application started instead of its exit.
            log_line = 'OntopEndpointApplication - Started ' + \
                       'OntopEndpointApplication'
            return self.run_and_wait_for_log(log_line, cmd)
        if self._mode == 'materialize':
            return self.run_and_wait_for_exit(cmd)

        self._logger.error(f'Unknown Ontop mode "{self._mode}"')
        return False

    def _write_properties(self,
                          rdb_username: str,
                          rdb_password: str,
                          rdb_host: str,
                          rdb_port: int,
                          rdb_name: str,
                          rdb_type: str) -> None:
        """Write the JDBC `config.properties` file for Ontop.

        Raises
        ------
        ValueError
            If `rdb_type` is neither `MySQL` nor `PostgreSQL`.
        """
        if rdb_type == 'MySQL':
            scheme, driver = 'mysql', 'com.mysql.cj.jdbc.Driver'
        elif rdb_type == 'PostgreSQL':
            scheme, driver = 'postgresql', 'org.postgresql.Driver'
        else:
            msg = f'Unknown RDB type: "{rdb_type}"'
            self._logger.error(msg)
            raise ValueError(msg)

        # interpolation=None: credentials may legitimately contain '%',
        # which the default BasicInterpolation rejects with a ValueError.
        config = configparser.ConfigParser(interpolation=None)
        config['root'] = {
            'jdbc.url': f'jdbc:{scheme}://{rdb_host}:{rdb_port}/{rdb_name}',
            'jdbc.driver': driver,
            'jdbc.user': rdb_username,
            'jdbc.password': rdb_password,
        }

        # .properties files are like .ini files but without a [HEADER].
        # Render with a [root] header in memory and strip it before the
        # file is written, so the file never contains the header.
        buffer = io.StringIO()
        config.write(buffer, space_around_delimiters=False)
        properties = buffer.getvalue().replace('[root]\n', '')

        path = os.path.join(self._data_path, self.root_mount_directory)
        os.makedirs(path, exist_ok=True)
        with open(os.path.join(path, 'config.properties'), 'w') as f:
            f.write(properties)

    def _replace_rdf_type_with_class(self, g: Graph) -> bool:
        """Rewrite constant rdf:type PredicateObjectMaps into rr:class.

        Returns
        -------
        success : bool
            False if an rdf:type PredicateObjectMap has no constant object.
        """
        # Snapshot every iterator with list(): the graph is modified inside
        # the loops and must not be mutated while it is being iterated.
        triples_maps = list(g.triples((None, RDF.type, R2RML.TriplesMap)))
        for triples_map_iri, _, _ in triples_maps:
            subject_map_iri = g.value(triples_map_iri, R2RML.subjectMap)

            if subject_map_iri is None:
                # Skip only this Triples Map; the remaining ones may still
                # need to be converted.
                self._logger.warning("Subject Map not present in Triples Map")
                continue

            poms = list(g.triples((triples_map_iri,
                                   R2RML.predicateObjectMap,
                                   None)))
            for _, _, predicate_object_map_iri in poms:
                predicate_map_iri = g.value(predicate_object_map_iri,
                                            R2RML.predicateMap)
                object_map_iri = g.value(predicate_object_map_iri,
                                         R2RML.objectMap)

                if predicate_map_iri is None or object_map_iri is None:
                    continue

                # Only PredicateObjectMaps whose PredicateMap is the
                # constant rdf:type are rewritten.
                if g.value(predicate_map_iri, R2RML.constant) != RDF.type:
                    continue

                # The ObjectMap rr:constant value becomes the rr:class of
                # the Subject Map.
                rdf_type_value = cast(URIRef,
                                      g.value(object_map_iri, R2RML.constant))
                if rdf_type_value is None:
                    msg = 'Cannot extract rr:class value, rdf:type value ' + \
                          'is not a constant value!'
                    self._logger.error(msg)
                    return False
                iri = URIRef(rdf_type_value.toPython())
                g.add((subject_map_iri, R2RML['class'], iri))

                # Drop the rdf:type PredicateMap, ObjectMap and
                # PredicateObjectMap, then unlink the latter from the
                # Triples Map. None acts as a wildcard in Graph.remove().
                g.remove((predicate_map_iri, None, None))
                g.remove((object_map_iri, None, None))
                g.remove((predicate_object_map_iri, None, None))
                g.remove((triples_map_iri, R2RML.predicateObjectMap,
                          predicate_object_map_iri))

        return True

    def _execute_mapping(self,
                         config_file: str,
                         arguments: list,
                         mapping_file: str,
                         output_file: Optional[str],
                         rdb_username: str,
                         rdb_password: str,
                         rdb_host: str,
                         rdb_port: int,
                         rdb_name: str,
                         rdb_type: str) -> bool:
        """Execute a mapping file with Ontop.

        Only relational databases are supported by
        Ontop, thus the relational database parameters are mandatory.

        Parameters
        ----------
        config_file : str
            Name of the generated config file for Ontop. Currently unused:
            the file is always written as `config.properties` in the
            mode-specific directory of the case.
        arguments : list
            List of arguments to pass to Ontop. The mapping, output and
            properties arguments are appended to this list.
        mapping_file : str
            Name of the mapping file to use. Only its base name is used; the
            file is read from the `shared` directory of the case.
        output_file : Optional[str]
            Name of the output file to use. Only applicable for
            materialization.
        rdb_username : str
            Username for the database.
        rdb_password : str
            Password for the database.
        rdb_host : str
            Hostname for the database.
        rdb_port : int
            Port for the database.
        rdb_name : str
            Database name for the database.
        rdb_type : str
            Database type: `MySQL` or `PostgreSQL`.

        Returns
        -------
        success : bool
            Whether the execution was successful or not.

        Raises
        ------
        ValueError
            If `rdb_type` is not a known database type.
        """
        # Generate INI configuration file since no CLI is available
        self._write_properties(rdb_username, rdb_password, rdb_host,
                               rdb_port, rdb_name, rdb_type)

        # Compatibility with Ontop requiring rr:class
        # Replace any rdf:type construction with rr:class
        # Without this, a strange error is raised: 'The definition of the
        # predicate is not always a ground term triple(s,p,o)'
        g = Graph()
        g.bind('r2rml', R2RML)
        g.bind('rdf', RDF)
        g.parse(os.path.join(self._data_path, 'shared',
                             os.path.basename(mapping_file)))
        if not self._replace_rdf_type_with_class(g):
            return False

        destination = os.path.join(self._data_path,
                                   self.root_mount_directory,
                                   'mapping_converted.r2rml.ttl')
        g.serialize(destination=destination, format='turtle')

        # Paths below are the locations inside the container.
        arguments.extend(['-m', '/data/mapping_converted.r2rml.ttl'])
        if output_file is not None:
            arguments.extend(['-o',
                              os.path.join('/data/shared/', output_file)])
        arguments.extend(['-p', '/data/config.properties'])

        return self._execute(arguments)


class OntopVirtualize(Ontop):
    """OntopVirtualize container for setting up an Ontop SPARQL endpoint."""

    def __init__(self, data_path: str, config_path: str, directory: str,
                 verbose: bool):
        """Creates an instance of the OntopVirtualize class.

        Parameters
        ----------
        data_path : str
            Path to the data directory of the case.
        config_path : str
            Path to the config directory of the case.
        directory : str
            Path to the directory to store logs.
        verbose : bool
            Enable verbose logs.
        """
        abs_data_path = os.path.abspath(data_path)
        self._data_path = abs_data_path
        self._config_path = os.path.abspath(config_path)
        case_logger = Logger(__name__, directory, verbose)
        self._logger = case_logger
        super().__init__('Ontop-Virtualize', abs_data_path, case_logger,
                         'endpoint')

    def execute_mapping(self,
                        mapping_file: str,
                        serialization: str,
                        rdb_username: str,
                        rdb_password: str,
                        rdb_host: str,
                        rdb_port: int,
                        rdb_name: str,
                        rdb_type: str) -> bool:
        """Start an Ontop SPARQL endpoint with a mapping.

        Only relational databases are supported by
        Ontop, thus the relational database parameters are mandatory.
        Accept headers are registered for the following serialization
        formats:
        - N-Triples (Ontop v5+)
        - N-Quads (Ontop v5+)
        - Turtle
        - RDF/JSON
        - RDF/XML
        - JSON-LD
        - CSV

        Parameters
        ----------
        mapping_file : str
            Path to the mapping file to execute.
        serialization : str
            Serialization format to use.
        rdb_username : str
            Username for the database.
        rdb_password : str
            Password for the database.
        rdb_host : str
            Hostname for the database.
        rdb_port : int
            Port for the database.
        rdb_name : str
            Database name for the database.
        rdb_type : str
            Database type.

        Returns
        -------
        success : bool
            Whether the execution was successful or not.

        Raises
        ------
        ValueError
            If `serialization` is not one of the registered formats.
        """
        mount = self.root_mount_directory
        config_file = f'{self._data_path}/{mount}/config.properties'
        arguments = ['--cors-allowed-origins=*', '--port=8888']

        self._headers.update({
            'ntriples': {'Accept': 'application/n-triples'},
            'nquads': {'Accept': 'application/n-quads'},
            'turtle': {'Accept': 'text/turtle'},
            'rdfjson': {'Accept': 'application/rdf+json'},
            'rdfxml': {'Accept': 'application/rdf+xml'},
            'jsonld': {'Accept': 'application/ld+json'},
            'csv': {'Accept': 'text/csv'},
        })
        if serialization not in self._headers:
            msg = f'Unsupported serialization format "{serialization}" ' + \
                  'for Ontop'
            self._logger.error(msg)
            raise ValueError(msg)

        # No output file: the endpoint serves the triples over HTTP.
        return super()._execute_mapping(config_file, arguments,
                                        mapping_file, None, rdb_username,
                                        rdb_password, rdb_host, rdb_port,
                                        rdb_name, rdb_type)


class OntopMaterialize(Ontop):
    """OntopMaterialize container to execute a R2RML mapping."""

    def __init__(self, data_path: str, config_path: str, directory: str,
                 verbose: bool):
        """Creates an instance of the OntopMaterialize class.

        Parameters
        ----------
        data_path : str
            Path to the data directory of the case.
        config_path : str
            Path to the config directory of the case.
        directory : str
            Path to the directory to store logs.
        verbose : bool
            Enable verbose logs.
        """
        self._data_path = os.path.abspath(data_path)
        self._config_path = os.path.abspath(config_path)
        self._logger = Logger(__name__, directory, verbose)
        # The 'ontopmaterialize' directory is created by Ontop.__init__.
        super().__init__('Ontop-Materialize', self._data_path, self._logger,
                         'materialize')

    @timeout(TIMEOUT)
    def _execute_mapping_with_timeout(self, mapping_file: str,
                                      output_file: str,
                                      serialization: str,
                                      rdb_username: str,
                                      rdb_password: str,
                                      rdb_host: str,
                                      rdb_port: int,
                                      rdb_name: str,
                                      rdb_type: str) -> bool:
        """Execute a mapping with a provided timeout.

        Returns
        -------
        success : bool
            Whether the execution was successful or not.
        """
        config_file = f'{self._data_path}/{self.root_mount_directory}' + \
                      '/config.properties'
        arguments = ['-f', serialization]
        # Materialization writes a file; no HTTP headers apply.
        self._headers = {}
        return super()._execute_mapping(config_file, arguments,
                                        mapping_file, output_file,
                                        rdb_username, rdb_password,
                                        rdb_host, rdb_port, rdb_name,
                                        rdb_type)

    def execute_mapping(self,
                        mapping_file: str,
                        output_file: str,
                        serialization: str,
                        rdb_username: str,
                        rdb_password: str,
                        rdb_host: str,
                        rdb_port: int,
                        rdb_name: str,
                        rdb_type: str) -> bool:
        """Execute a R2RML mapping with Ontop

        N-Quads and N-Triples are currently supported as serialization
        for Ontop materialize. Only relational databases are supported by
        Ontop, thus the relational database parameters are mandatory.

        Parameters
        ----------
        mapping_file : str
            Path to the mapping file to execute.
        output_file : str
            Name of the output file to store the triples in. This is not used
            for OntopVirtualize.
        serialization : str
            Serialization format to use.
        rdb_username : str
            Username for the database.
        rdb_password : str
            Password for the database.
        rdb_host : str
            Hostname for the database.
        rdb_port : int
            Port for the database.
        rdb_name : str
            Database name for the database.
        rdb_type : str
            Database type.

        Returns
        -------
        success : bool
            Whether the execution was successful or not. False is also
            returned when the timeout is reached.
        """
        try:
            return self._execute_mapping_with_timeout(mapping_file,
                                                      output_file,
                                                      serialization,
                                                      rdb_username,
                                                      rdb_password,
                                                      rdb_host,
                                                      rdb_port,
                                                      rdb_name,
                                                      rdb_type)
        except TimeoutError:
            msg = f'Timeout ({TIMEOUT}s) reached for Ontop Materialize'
            self._logger.warning(msg)

        return False
class Ontop(Container):
    """Ontop container super class for OntopMaterialize and OntopVirtualize."""

    def __init__(self, name: str, data_path: str, logger: Logger, mode: str):
        """Creates an instance of the Ontop class.

        Parameters
        ----------
        name : str
            Pretty name of the container.
        data_path : str
            Path to the data directory of the case.
        logger : Logger
            Logger to use for log messages.
        mode : str
            Ontop mode: `materialize` or `endpoint`.

        Raises
        ------
        ValueError
            If `mode` is neither `materialize` nor `endpoint`.
        """
        self._mode = mode
        self._headers: Dict[str, Dict[str, str]] = {}
        self._logger = logger
        self._data_path = data_path

        # The mode-specific directory is mounted as /data below, so it has
        # to exist before the container is configured.
        subdirs = {'endpoint': 'ontopvirtualize',
                   'materialize': 'ontopmaterialize'}
        if self._mode not in subdirs:
            raise ValueError(f'Unknown Ontop mode: "{self._mode}"')
        os.makedirs(os.path.join(self._data_path, subdirs[self._mode]),
                    exist_ok=True)

        # Set Java heap to 1/2 of the total system memory instead of the
        # default 1/4
        total_memory = psutil.virtual_memory().total
        max_heap = int(total_memory * (1/2))

        # Configure logging
        log_level = 'debug' if self._logger.verbose else 'info'
        self._logger.info(f'Initialized Ontop logger at "{log_level}" level')

        java_args = f'-Xmx{max_heap} -Xms{max_heap}'
        environment = {'ONTOP_JAVA_ARGS': java_args,
                       'ONTOP_LOG_LEVEL': log_level}
        volumes = [f'{self._data_path}/{self.root_mount_directory}:/data',
                   f'{self._data_path}/shared:/data/shared']
        super().__init__(f'kgconstruct/ontop:v{VERSION}', name,
                         self._logger,
                         ports={'8888': '8888'},
                         environment=environment,
                         volumes=volumes)

    @property
    def root_mount_directory(self) -> str:
        """Subdirectory in the root directory of the case for Ontop.

        Returns
        -------
        subdirectory : str
            Subdirectory of the root directory for Ontop.

        Raises
        ------
        ValueError
            If the configured mode is neither `materialize` nor `endpoint`.
        """
        mode = self._mode
        if mode == 'materialize':
            return 'ontopmaterialize'
        if mode == 'endpoint':
            return 'ontopvirtualize'
        raise ValueError(f'Unknown Ontop mode: "{self._mode}"')

    @property
    def endpoint(self) -> str:
        """SPARQL endpoint URL for Ontop.

        Returns
        -------
        url : str
            SPARQL endpoint URL.
        """
        # Port 8888 matches the port mapping configured in __init__.
        url = 'http://localhost:8888/sparql'
        return url

    @property
    def headers(self) -> dict:
        """HTTP headers of SPARQL queries for serialization formats.

        Only supported serialization formats are included in the dictionary.
        Currently, the following formats are supported:
        - N-Triples
        - N-Quads
        - Turtle
        - CSV
        - RDF/JSON
        - RDF/XML
        - JSON-LD

        Returns
        -------
        headers : dict
            Dictionary of headers to use for each serialization format.
        """
        registered = self._headers
        return registered

    def _execute(self, arguments: list) -> bool:
        """Execute Ontop with given arguments.

        Parameters
        ----------
        arguments : list
            Arguments to supply to Ontop.

        Returns
        -------
        success : bool
            Whether the execution succeeded or not.
        """
        cmd = f'/ontop/ontop {self._mode} {" ".join(arguments)}'
        self._logger.info(f'Executing Ontop with command: {cmd}')

        if self._mode == 'endpoint':
            # The endpoint is a long-running process: wait for the log line
            # announcing that the application started instead of its exit.
            log_line = 'OntopEndpointApplication - Started ' + \
                       'OntopEndpointApplication'
            return self.run_and_wait_for_log(log_line, cmd)
        if self._mode == 'materialize':
            return self.run_and_wait_for_exit(cmd)

        self._logger.error(f'Unknown Ontop mode "{self._mode}"')
        return False

    def _write_properties(self,
                          rdb_username: str,
                          rdb_password: str,
                          rdb_host: str,
                          rdb_port: int,
                          rdb_name: str,
                          rdb_type: str) -> None:
        """Write the JDBC `config.properties` file for Ontop.

        Raises
        ------
        ValueError
            If `rdb_type` is neither `MySQL` nor `PostgreSQL`.
        """
        # Local import: keeps this class self-contained with respect to the
        # module's import block.
        import io

        if rdb_type == 'MySQL':
            scheme, driver = 'mysql', 'com.mysql.cj.jdbc.Driver'
        elif rdb_type == 'PostgreSQL':
            scheme, driver = 'postgresql', 'org.postgresql.Driver'
        else:
            msg = f'Unknown RDB type: "{rdb_type}"'
            self._logger.error(msg)
            raise ValueError(msg)

        # interpolation=None: credentials may legitimately contain '%',
        # which the default BasicInterpolation rejects with a ValueError.
        config = configparser.ConfigParser(interpolation=None)
        config['root'] = {
            'jdbc.url': f'jdbc:{scheme}://{rdb_host}:{rdb_port}/{rdb_name}',
            'jdbc.driver': driver,
            'jdbc.user': rdb_username,
            'jdbc.password': rdb_password,
        }

        # .properties files are like .ini files but without a [HEADER].
        # Render with a [root] header in memory and strip it before the
        # file is written, so the file never contains the header.
        buffer = io.StringIO()
        config.write(buffer, space_around_delimiters=False)
        properties = buffer.getvalue().replace('[root]\n', '')

        path = os.path.join(self._data_path, self.root_mount_directory)
        os.makedirs(path, exist_ok=True)
        with open(os.path.join(path, 'config.properties'), 'w') as f:
            f.write(properties)

    def _replace_rdf_type_with_class(self, g: Graph) -> bool:
        """Rewrite constant rdf:type PredicateObjectMaps into rr:class.

        Returns
        -------
        success : bool
            False if an rdf:type PredicateObjectMap has no constant object.
        """
        # Snapshot every iterator with list(): the graph is modified inside
        # the loops and must not be mutated while it is being iterated.
        triples_maps = list(g.triples((None, RDF.type, R2RML.TriplesMap)))
        for triples_map_iri, _, _ in triples_maps:
            subject_map_iri = g.value(triples_map_iri, R2RML.subjectMap)

            if subject_map_iri is None:
                # Skip only this Triples Map; the remaining ones may still
                # need to be converted.
                self._logger.warning("Subject Map not present in Triples Map")
                continue

            poms = list(g.triples((triples_map_iri,
                                   R2RML.predicateObjectMap,
                                   None)))
            for _, _, predicate_object_map_iri in poms:
                predicate_map_iri = g.value(predicate_object_map_iri,
                                            R2RML.predicateMap)
                object_map_iri = g.value(predicate_object_map_iri,
                                         R2RML.objectMap)

                if predicate_map_iri is None or object_map_iri is None:
                    continue

                # Only PredicateObjectMaps whose PredicateMap is the
                # constant rdf:type are rewritten.
                if g.value(predicate_map_iri, R2RML.constant) != RDF.type:
                    continue

                # The ObjectMap rr:constant value becomes the rr:class of
                # the Subject Map.
                rdf_type_value = cast(URIRef,
                                      g.value(object_map_iri, R2RML.constant))
                if rdf_type_value is None:
                    msg = 'Cannot extract rr:class value, rdf:type value ' + \
                          'is not a constant value!'
                    self._logger.error(msg)
                    return False
                iri = URIRef(rdf_type_value.toPython())
                g.add((subject_map_iri, R2RML['class'], iri))

                # Drop the rdf:type PredicateMap, ObjectMap and
                # PredicateObjectMap, then unlink the latter from the
                # Triples Map. None acts as a wildcard in Graph.remove().
                g.remove((predicate_map_iri, None, None))
                g.remove((object_map_iri, None, None))
                g.remove((predicate_object_map_iri, None, None))
                g.remove((triples_map_iri, R2RML.predicateObjectMap,
                          predicate_object_map_iri))

        return True

    def _execute_mapping(self,
                         config_file: str,
                         arguments: list,
                         mapping_file: str,
                         output_file: Optional[str],
                         rdb_username: str,
                         rdb_password: str,
                         rdb_host: str,
                         rdb_port: int,
                         rdb_name: str,
                         rdb_type: str) -> bool:
        """Execute a mapping file with Ontop.

        Only relational databases are supported by
        Ontop, thus the relational database parameters are mandatory.

        Parameters
        ----------
        config_file : str
            Name of the generated config file for Ontop. Currently unused:
            the file is always written as `config.properties` in the
            mode-specific directory of the case.
        arguments : list
            List of arguments to pass to Ontop. The mapping, output and
            properties arguments are appended to this list.
        mapping_file : str
            Name of the mapping file to use. Only its base name is used; the
            file is read from the `shared` directory of the case.
        output_file : Optional[str]
            Name of the output file to use. Only applicable for
            materialization.
        rdb_username : str
            Username for the database.
        rdb_password : str
            Password for the database.
        rdb_host : str
            Hostname for the database.
        rdb_port : int
            Port for the database.
        rdb_name : str
            Database name for the database.
        rdb_type : str
            Database type: `MySQL` or `PostgreSQL`.

        Returns
        -------
        success : bool
            Whether the execution was successful or not.

        Raises
        ------
        ValueError
            If `rdb_type` is not a known database type.
        """
        # Generate INI configuration file since no CLI is available
        self._write_properties(rdb_username, rdb_password, rdb_host,
                               rdb_port, rdb_name, rdb_type)

        # Compatibility with Ontop requiring rr:class
        # Replace any rdf:type construction with rr:class
        # Without this, a strange error is raised: 'The definition of the
        # predicate is not always a ground term triple(s,p,o)'
        g = Graph()
        g.bind('r2rml', R2RML)
        g.bind('rdf', RDF)
        g.parse(os.path.join(self._data_path, 'shared',
                             os.path.basename(mapping_file)))
        if not self._replace_rdf_type_with_class(g):
            return False

        destination = os.path.join(self._data_path,
                                   self.root_mount_directory,
                                   'mapping_converted.r2rml.ttl')
        g.serialize(destination=destination, format='turtle')

        # Paths below are the locations inside the container.
        arguments.extend(['-m', '/data/mapping_converted.r2rml.ttl'])
        if output_file is not None:
            arguments.extend(['-o',
                              os.path.join('/data/shared/', output_file)])
        arguments.extend(['-p', '/data/config.properties'])

        return self._execute(arguments)
Ontop container super class for OntopMaterialize and OntopVirtualize.
def __init__(self, name: str, data_path: str, logger: Logger, mode: str):
    """Creates an instance of the Ontop class.

    Parameters
    ----------
    name : str
        Pretty name of the container.
    data_path : str
        Path to the data directory of the case.
    logger : Logger
        Logger to use for log messages.
    mode : str
        Ontop mode: `materialize` or `endpoint`.

    Raises
    ------
    ValueError
        If `mode` is neither `materialize` nor `endpoint`.
    """
    self._mode = mode
    self._headers: Dict[str, Dict[str, str]] = {}
    self._logger = logger
    self._data_path = data_path

    # The mode-specific directory is mounted as /data below, so it has to
    # exist before the container is configured.
    subdirs = {'endpoint': 'ontopvirtualize',
               'materialize': 'ontopmaterialize'}
    if self._mode not in subdirs:
        raise ValueError(f'Unknown Ontop mode: "{self._mode}"')
    os.makedirs(os.path.join(self._data_path, subdirs[self._mode]),
                exist_ok=True)

    # Set Java heap to 1/2 of the total system memory instead of the
    # default 1/4
    total_memory = psutil.virtual_memory().total
    max_heap = int(total_memory * (1/2))

    # Configure logging
    log_level = 'debug' if self._logger.verbose else 'info'
    self._logger.info(f'Initialized Ontop logger at "{log_level}" level')

    java_args = f'-Xmx{max_heap} -Xms{max_heap}'
    environment = {'ONTOP_JAVA_ARGS': java_args,
                   'ONTOP_LOG_LEVEL': log_level}
    volumes = [f'{self._data_path}/{self.root_mount_directory}:/data',
               f'{self._data_path}/shared:/data/shared']
    super().__init__(f'kgconstruct/ontop:v{VERSION}', name,
                     self._logger,
                     ports={'8888': '8888'},
                     environment=environment,
                     volumes=volumes)
Creates an instance of the Ontop class.
Parameters
- name (str): Pretty name of the container.
- data_path (str): Path to the data directory of the case.
- logger (Logger): Logger to use for log messages.
- mode (str): Ontop mode: `materialize` or `endpoint`.
@property
def root_mount_directory(self) -> str:
    """Subdirectory in the root directory of the case for Ontop.

    Returns
    -------
    subdirectory : str
        Subdirectory of the root directory for Ontop.

    Raises
    ------
    ValueError
        If the configured mode is neither `materialize` nor `endpoint`.
    """
    mode = self._mode
    if mode == 'materialize':
        return 'ontopmaterialize'
    if mode == 'endpoint':
        return 'ontopvirtualize'
    raise ValueError(f'Unknown Ontop mode: "{self._mode}"')
Subdirectory in the root directory of the case for Ontop.
Returns
- subdirectory (str): Subdirectory of the root directory for Ontop.
@property
def endpoint(self) -> str:
    """SPARQL endpoint URL for Ontop.

    Returns
    -------
    url : str
        SPARQL endpoint URL.
    """
    url = 'http://localhost:8888/sparql'
    return url
SPARQL endpoint URL for Ontop.
Returns
- url (str): SPARQL endpoint URL.
@property
def headers(self) -> dict:
    """HTTP headers of SPARQL queries for serialization formats.

    Only supported serialization formats are included in the dictionary.
    Currently, the following formats are supported:
    - N-Triples
    - N-Quads
    - Turtle
    - CSV
    - RDF/JSON
    - RDF/XML
    - JSON-LD

    Returns
    -------
    headers : dict
        Dictionary of headers to use for each serialization format.
    """
    registered = self._headers
    return registered
HTTP headers of SPARQL queries for serialization formats.
Only supported serialization formats are included in the dictionary. Currently, the following formats are supported:
- N-Triples
- N-Quads
- Turtle
- CSV
- RDF/JSON
- RDF/XML
- JSON-LD
Returns
- headers (dict): Dictionary of headers to use for each serialization format.
Inherited Members
class OntopVirtualize(Ontop):
    """OntopVirtualize container for setting up an Ontop SPARQL endpoint."""

    def __init__(self, data_path: str, config_path: str, directory: str,
                 verbose: bool):
        """Creates an instance of the OntopVirtualize class.

        Parameters
        ----------
        data_path : str
            Path to the data directory of the case.
        config_path : str
            Path to the config directory of the case.
        directory : str
            Path to the directory to store logs.
        verbose : bool
            Enable verbose logs.
        """
        abs_data_path = os.path.abspath(data_path)
        self._data_path = abs_data_path
        self._config_path = os.path.abspath(config_path)
        case_logger = Logger(__name__, directory, verbose)
        self._logger = case_logger
        super().__init__('Ontop-Virtualize', abs_data_path, case_logger,
                         'endpoint')

    def execute_mapping(self,
                        mapping_file: str,
                        serialization: str,
                        rdb_username: str,
                        rdb_password: str,
                        rdb_host: str,
                        rdb_port: int,
                        rdb_name: str,
                        rdb_type: str) -> bool:
        """Start an Ontop SPARQL endpoint with a mapping.

        Only relational databases are supported by
        Ontop, thus the relational database parameters are mandatory.
        Accept headers are registered for the following serialization
        formats:
        - N-Triples (Ontop v5+)
        - N-Quads (Ontop v5+)
        - Turtle
        - RDF/JSON
        - RDF/XML
        - JSON-LD
        - CSV

        Parameters
        ----------
        mapping_file : str
            Path to the mapping file to execute.
        serialization : str
            Serialization format to use.
        rdb_username : str
            Username for the database.
        rdb_password : str
            Password for the database.
        rdb_host : str
            Hostname for the database.
        rdb_port : int
            Port for the database.
        rdb_name : str
            Database name for the database.
        rdb_type : str
            Database type.

        Returns
        -------
        success : bool
            Whether the execution was successful or not.

        Raises
        ------
        ValueError
            If `serialization` is not one of the registered formats.
        """
        mount = self.root_mount_directory
        config_file = f'{self._data_path}/{mount}/config.properties'
        arguments = ['--cors-allowed-origins=*', '--port=8888']

        self._headers.update({
            'ntriples': {'Accept': 'application/n-triples'},
            'nquads': {'Accept': 'application/n-quads'},
            'turtle': {'Accept': 'text/turtle'},
            'rdfjson': {'Accept': 'application/rdf+json'},
            'rdfxml': {'Accept': 'application/rdf+xml'},
            'jsonld': {'Accept': 'application/ld+json'},
            'csv': {'Accept': 'text/csv'},
        })
        if serialization not in self._headers:
            msg = f'Unsupported serialization format "{serialization}" ' + \
                  'for Ontop'
            self._logger.error(msg)
            raise ValueError(msg)

        # No output file: the endpoint serves the triples over HTTP.
        return super()._execute_mapping(config_file, arguments,
                                        mapping_file, None, rdb_username,
                                        rdb_password, rdb_host, rdb_port,
                                        rdb_name, rdb_type)
OntopVirtualize container for setting up an Ontop SPARQL endpoint.
def __init__(self, data_path: str, config_path: str, directory: str,
             verbose: bool):
    """Creates an instance of the OntopVirtualize class.

    Parameters
    ----------
    data_path : str
        Path to the data directory of the case.
    config_path : str
        Path to the config directory of the case.
    directory : str
        Path to the directory to store logs.
    verbose : bool
        Enable verbose logs.
    """
    abs_data_path = os.path.abspath(data_path)
    self._data_path = abs_data_path
    self._config_path = os.path.abspath(config_path)
    case_logger = Logger(__name__, directory, verbose)
    self._logger = case_logger
    super().__init__('Ontop-Virtualize', abs_data_path, case_logger,
                     'endpoint')
Creates an instance of the OntopVirtualize class.
Parameters
- data_path (str): Path to the data directory of the case.
- config_path (str): Path to the config directory of the case.
- directory (str): Path to the directory to store logs.
- verbose (bool): Enable verbose logs.
def execute_mapping(self,
                    mapping_file: str,
                    serialization: str,
                    rdb_username: str,
                    rdb_password: str,
                    rdb_host: str,
                    rdb_port: int,
                    rdb_name: str,
                    rdb_type: str) -> bool:
    """Start an Ontop SPARQL endpoint with a mapping.

    Only relational databases are supported by
    Ontop, thus the relational database parameters are mandatory.
    Accept headers are registered for the following serialization formats:
    - N-Triples (Ontop v5+)
    - N-Quads (Ontop v5+)
    - Turtle
    - RDF/JSON
    - RDF/XML
    - JSON-LD
    - CSV

    Parameters
    ----------
    mapping_file : str
        Path to the mapping file to execute.
    serialization : str
        Serialization format to use.
    rdb_username : str
        Username for the database.
    rdb_password : str
        Password for the database.
    rdb_host : str
        Hostname for the database.
    rdb_port : int
        Port for the database.
    rdb_name : str
        Database name for the database.
    rdb_type : str
        Database type.

    Returns
    -------
    success : bool
        Whether the execution was successful or not.

    Raises
    ------
    ValueError
        If `serialization` is not one of the registered formats.
    """
    mount = self.root_mount_directory
    config_file = f'{self._data_path}/{mount}/config.properties'
    arguments = ['--cors-allowed-origins=*', '--port=8888']

    self._headers.update({
        'ntriples': {'Accept': 'application/n-triples'},
        'nquads': {'Accept': 'application/n-quads'},
        'turtle': {'Accept': 'text/turtle'},
        'rdfjson': {'Accept': 'application/rdf+json'},
        'rdfxml': {'Accept': 'application/rdf+xml'},
        'jsonld': {'Accept': 'application/ld+json'},
        'csv': {'Accept': 'text/csv'},
    })
    if serialization not in self._headers:
        msg = f'Unsupported serialization format "{serialization}" ' + \
              'for Ontop'
        self._logger.error(msg)
        raise ValueError(msg)

    # No output file: the endpoint serves the triples over HTTP.
    return super()._execute_mapping(config_file, arguments,
                                    mapping_file, None, rdb_username,
                                    rdb_password, rdb_host, rdb_port,
                                    rdb_name, rdb_type)
Start an Ontop SPARQL endpoint with a mapping.
Only relational databases are supported by Ontop, thus the relational database parameters are mandatory. Ontop SPARQL endpoint supports the following serialization formats:
- N-Triples (Ontop v5+)
- N-Quads (Ontop v5+)
- Turtle
- RDF/JSON
- JSON-LD
- CSV
Parameters
- mapping_file (str): Path to the mapping file to execute.
- serialization (str): Serialization format to use.
- rdb_username (str): Username for the database.
- rdb_password (str): Password for the database.
- rdb_host (str): Hostname for the database.
- rdb_port (int): Port for the database.
- rdb_name (str): Database name for the database.
- rdb_type (str): Database type.
Returns
- success (bool): Whether the execution was successful or not.
class OntopMaterialize(Ontop):
    """Ontop container that executes a R2RML mapping in materialize mode."""

    def __init__(self, data_path: str, config_path: str, directory: str,
                 verbose: bool):
        """Set up an Ontop container that runs in materialize mode.

        Parameters
        ----------
        data_path : str
            Path to the data directory of the case.
        config_path : str
            Path to the config directory of the case.
        directory : str
            Path to the directory to store logs.
        verbose : bool
            Enable verbose logs.
        """
        self._data_path = os.path.abspath(data_path)
        self._config_path = os.path.abspath(config_path)
        self._logger = Logger(__name__, directory, verbose)

        # Make sure the 'ontopmaterialize' directory exists in the data
        # directory before the parent constructor runs.
        materialize_dir = os.path.join(self._data_path, 'ontopmaterialize')
        os.makedirs(materialize_dir, exist_ok=True)

        super().__init__(
            'Ontop-Materialize',
            self._data_path,
            self._logger,
            'materialize',
        )

    @timeout(TIMEOUT)
    def _execute_mapping_with_timeout(self, mapping_file: str,
                                      output_file: str,
                                      serialization: str,
                                      rdb_username: str,
                                      rdb_password: str,
                                      rdb_host: str,
                                      rdb_port: int,
                                      rdb_name: str,
                                      rdb_type: str) -> bool:
        """Execute a mapping, guarded by the `TIMEOUT` decorator.

        Returns
        -------
        success : bool
            Result of the parent class' mapping execution.
        """
        mount_dir = f'{self._data_path}/{self.root_mount_directory}'
        config_file = mount_dir + '/config.properties'

        # The serialization format is passed through the '-f' argument.
        cli_arguments = ['-f', serialization]

        # The headers dictionary is reset to empty for materialization.
        self._headers = {}

        return super()._execute_mapping(
            config_file, cli_arguments,
            mapping_file, output_file,
            rdb_username, rdb_password,
            rdb_host, rdb_port, rdb_name, rdb_type,
        )

    def execute_mapping(self,
                        mapping_file: str,
                        output_file: str,
                        serialization: str,
                        rdb_username: str,
                        rdb_password: str,
                        rdb_host: str,
                        rdb_port: int,
                        rdb_name: str,
                        rdb_type: str) -> bool:
        """Execute a R2RML mapping with Ontop.

        N-Quads and N-Triples are currently supported as serialization
        for Ontop materialize. Only relational databases are supported by
        Ontop, thus the relational database parameters are mandatory.

        Parameters
        ----------
        mapping_file : str
            Path to the mapping file to execute.
        output_file : str
            Name of the output file to store the triples in. This is not
            used for OntopVirtualize.
        serialization : str
            Serialization format to use.
        rdb_username : str
            Username for the database.
        rdb_password : str
            Password for the database.
        rdb_host : str
            Hostname for the database.
        rdb_port : int
            Port for the database.
        rdb_name : str
            Database name for the database.
        rdb_type : str
            Database type.

        Returns
        -------
        success : bool
            Whether the execution was successful or not. A timeout is
            reported as `False`.
        """
        try:
            outcome = self._execute_mapping_with_timeout(
                mapping_file, output_file, serialization,
                rdb_username, rdb_password,
                rdb_host, rdb_port, rdb_name, rdb_type,
            )
        except TimeoutError:
            # A timeout is logged as a warning and reported as a failure.
            msg = f'Timeout ({TIMEOUT}s) reached for Ontop Materialize'
            self._logger.warning(msg)
            return False

        return outcome
OntopMaterialize container to execute a R2RML mapping.
def __init__(self, data_path: str, config_path: str, directory: str,
             verbose: bool):
    """Set up an Ontop container that runs in materialize mode.

    Parameters
    ----------
    data_path : str
        Path to the data directory of the case.
    config_path : str
        Path to the config directory of the case.
    directory : str
        Path to the directory to store logs.
    verbose : bool
        Enable verbose logs.
    """
    self._data_path = os.path.abspath(data_path)
    self._config_path = os.path.abspath(config_path)
    self._logger = Logger(__name__, directory, verbose)

    # Make sure the 'ontopmaterialize' directory exists in the data
    # directory before the parent constructor runs.
    materialize_dir = os.path.join(self._data_path, 'ontopmaterialize')
    os.makedirs(materialize_dir, exist_ok=True)

    super().__init__(
        'Ontop-Materialize',
        self._data_path,
        self._logger,
        'materialize',
    )
Creates an instance of the OntopMaterialize class.
Parameters
- data_path (str): Path to the data directory of the case.
- config_path (str): Path to the config directory of the case.
- directory (str): Path to the directory to store logs.
- verbose (bool): Enable verbose logs.
def execute_mapping(self,
                    mapping_file: str,
                    output_file: str,
                    serialization: str,
                    rdb_username: str,
                    rdb_password: str,
                    rdb_host: str,
                    rdb_port: int,
                    rdb_name: str,
                    rdb_type: str) -> bool:
    """Execute a R2RML mapping with Ontop.

    N-Quads and N-Triples are currently supported as serialization
    for Ontop materialize. Only relational databases are supported by
    Ontop, thus the relational database parameters are mandatory.

    Parameters
    ----------
    mapping_file : str
        Path to the mapping file to execute.
    output_file : str
        Name of the output file to store the triples in. This is not used
        for OntopVirtualize.
    serialization : str
        Serialization format to use.
    rdb_username : str
        Username for the database.
    rdb_password : str
        Password for the database.
    rdb_host : str
        Hostname for the database.
    rdb_port : int
        Port for the database.
    rdb_name : str
        Database name for the database.
    rdb_type : str
        Database type.

    Returns
    -------
    success : bool
        Whether the execution was successful or not. A timeout is
        reported as `False`.
    """
    try:
        outcome = self._execute_mapping_with_timeout(
            mapping_file, output_file, serialization,
            rdb_username, rdb_password,
            rdb_host, rdb_port, rdb_name, rdb_type,
        )
    except TimeoutError:
        # A timeout is logged as a warning and reported as a failure.
        msg = f'Timeout ({TIMEOUT}s) reached for Ontop Materialize'
        self._logger.warning(msg)
        return False

    return outcome
Execute a R2RML mapping with Ontop
N-Quads and N-Triples are currently supported as serialization for Ontop materialize. Only relational databases are supported by Ontop, thus the relational database parameters are mandatory.
Parameters
- mapping_file (str): Path to the mapping file to execute.
- output_file (str): Name of the output file to store the triples in. This is not used for OntopVirtualize.
- serialization (str): Serialization format to use.
- rdb_username (str): Username for the database.
- rdb_password (str): Password for the database.
- rdb_host (str): Hostname for the database.
- rdb_port (int): Port for the database.
- rdb_name (str): Database name for the database.
- rdb_type (str): Database type.
Returns
- success (bool): Whether the execution was successful or not.