Source code for helper_functions.RDFparsers

#!/usr/bin/python
# -*- coding: utf-8 -*-
from entities.models import Place, Institution, Person, Work
from relations.models import (
    InstitutionInstitution, InstitutionPlace, PersonPerson,
    PersonWork, PersonPlace, PersonInstitution, PlacePlace, PlaceEvent, PlaceWork)
from metainfo.models import Uri as genUri
from labels.models import Label
from vocabularies.models import (
    LabelType, InstitutionInstitutionRelation,
    InstitutionPlaceRelation, VocabsUri, ProfessionType, PersonWorkRelation,
    PersonPersonRelation, PersonPlaceRelation, WorkType, PersonInstitutionRelation,
    PlaceType, PlacePlaceRelation)
from apis.settings.NER_settings import geonames_feature_codes as gn_f
from apis.settings.RDF_settings import sett_RDF_generic
from metainfo.models import Collection, Uri

import rdflib
from rdflib import ConjunctiveGraph, URIRef, RDFS, Literal, OWL
import re
from datetime import datetime
import types
from django.db.models import Q
from django.contrib.contenttypes.models import ContentType
from django.core.exceptions import FieldError

class GenericRDFParser(object):
    """A generic class for parsing RDFs to the APIS data model and save objects to the db.

    Attributes:

    - self.objct: (object) the object created by the parser
    - self.labels: (list) list of labels created by the parser
      (only saved when self.save() is called)
    - self.related_objcts: (list) list of related objects (relations)
      (only saved when self.save() is called)
    - self.kind: (string) kind of entity of the object (Person, Place, Institution, Work, Event)
    - self.uri: (string) uri provided when initializing the object
    - self.saved: (boolean) indicates whether the object was saved to the db
    - self.created: (boolean) indicates whether the object was created
    """

    def save(self):
        """Persist the parsed object together with its URI, labels and relations.

        Nothing is written when the parser resolved to an already existing
        object (``self.created`` is False).

        :return: django object saved to db or False if nothing was saved
        """
        if not self.created:
            return False
        self.objct.status = 'distinct'
        self.objct.save()
        def_coll, _ = Collection.objects.get_or_create(name='Default import collection')
        self.objct.collection.add(def_coll)
        self.saved = True
        Uri.objects.create(uri=self.uri, entity=self.objct)
        for lab in self.labels:
            lab.temp_entity = self.objct
            lab.save()
        kind = self.kind.lower()
        for obj in self.related_objcts:
            if hasattr(obj, 'related_' + kind + 'B'):
                # Relation between two entities of the same kind: the "B" side
                # was filled in by __init__, this object becomes the "A" side.
                setattr(obj, 'related_' + kind + 'A_id', self.objct.pk)
            else:
                setattr(obj, 'related_' + kind + '_id', self.objct.pk)
            obj.save()
        return self.objct

    def merge(self, m_obj, app_label_relations='relations'):
        """Merge ``m_obj`` into the parsed object and delete ``m_obj``.

        Relations, URIs, labels, collections and annotations pointing to
        ``m_obj`` are re-pointed to ``self.objct``; the name of ``m_obj`` is
        kept as a label of type 'legacy name'.

        :param m_obj: the object to merge with (must be an django model object instance)
        :param app_label_relations: (string) the label of the Django app that contains the relations
        :return: the django object the data was merged into (``self.objct``)
        """
        kind = self.kind.lower()
        for rel in ContentType.objects.filter(app_label=app_label_relations, model__icontains=kind):
            rel2 = rel.model_class()
            if rel2 is None:
                # Stale content type whose model class no longer exists.
                continue
            try:
                for rel_exst in rel2.objects.filter(**{'related_' + kind: m_obj}):
                    setattr(rel_exst, 'related_' + kind + '_id', self.objct.pk)
                    rel_exst.save()
            except FieldError:
                # e.g. PlacePlace relations have different related_ fields
                for side in ('A', 'B'):
                    for rel_exst in rel2.objects.filter(**{'related_' + kind + side: m_obj}):
                        setattr(rel_exst, 'related_' + kind + side + '_id', self.objct.pk)
                        rel_exst.save()
        for z in genUri.objects.filter(entity=m_obj):
            z.entity_id = self.objct.pk
            z.save()
        for z in Label.objects.filter(temp_entity=m_obj):
            z.temp_entity_id = self.objct.pk
            z.save()
        if hasattr(m_obj, 'first_name'):
            legacy_name = '{}, {}'.format(m_obj.name, m_obj.first_name)
        else:
            legacy_name = m_obj.name
        lt, _ = LabelType.objects.get_or_create(name='legacy name')
        Label.objects.create(temp_entity_id=self.objct.pk, label=legacy_name, label_type=lt)
        for col in m_obj.collection.all():
            self.objct.collection.add(col)
        for ann in m_obj.annotation_set.all():
            ann.entity_link.remove(m_obj)
            ann.entity_link.add(self.objct)
        m_obj.delete()
        return self.objct

    def get_or_create(self):
        """
        :return: Returns the parsed object. Saves it to the db when needed
        """
        if self.created and not self.saved:
            return self.save()
        return self.objct

    @staticmethod
    def _prep_string(tupl):
        """Extract the string to store from a ``(value, extractor)`` pair.

        :param tupl: either a plain string (returned unchanged) or a tuple
            ``(value, extractor)``. ``extractor`` is falsy or a pair
            ``(regex, group)``; a falsy ``group`` selects the whole match.
        :return: the stripped value when no extractor is given, the matched
            group when the regex matches, otherwise the value unchanged.
        """
        if isinstance(tupl, str):
            return tupl
        value, extractor = tupl[0], tupl[1]
        if not extractor:
            return value.strip()
        pattern, group = extractor[0], extractor[1]
        match = re.match(pattern, value)
        if match is None:
            return value
        try:
            return match.group(group or 0)
        except IndexError:
            # The configured group does not exist in the pattern.
            return value

    def __init__(self, uri, kind, app_label_entities="entities", app_label_relations="relations",
                 app_label_vocabularies="vocabularies", **kwargs):
        """
        :param uri: (url) Uri to parse the object from (http://test.at). The uri must start with a base url
            mentioned in the RDF parser settings file.
        :param kind: (string) Kind of entity (Person, Place, Institution, Work, Event)
        :param app_label_entities: (string) Name of the Django app that contains the entities that we create.
        :param app_label_relations: (string) Name of the Django app that contains the relations for the
            merging process.
        :param app_label_vocabularies: (string) Name of the Django app that contains the vocabularies
            defining the entities and relations.
        :param kwargs: ``force`` (default None): parse the RDF even if an object with this uri already
            exists. ``drill_down`` (default True): also parse the linked objects defined in the settings.
        """
        # NOTE(review): this code was recovered from a listing that had lost its
        # indentation; the nesting below was reconstructed from the data flow
        # (which names each statement reads) - confirm against version control.
        owl = "http://www.w3.org/2002/07/owl#"

        def exist(uri):
            """Return ``(True, obj)`` if an entity with this uri exists, else ``(False, False)``."""
            matches = objct.objects.filter(uri__uri=uri)
            if matches.exists():
                return True, matches.get()
            return False, False

        objct = ContentType.objects.get(app_label=app_label_entities, model=kind.lower()).model_class()
        force = kwargs.get('force', None)
        res_attrb = dict()
        labels = []
        related_objcts = []
        test = exist(uri)
        self.uri = uri
        self.kind = kind
        self.saved = False
        if test[0] and not force:
            self.objct = test[1]
            self.created = False
            return

        self.created = True
        # Stays False unless at least one settings entry matches the uri.
        self.settings_defined = False
        for conf in sett_RDF_generic[kind]['data']:
            if not uri.startswith(conf['base_url']):
                continue
            self.settings_defined = True
            g = rdflib.Graph()
            uri_2 = uri
            if not uri_2.endswith('/'):
                uri_2 += '/'
            o2 = rdflib.term.URIRef(uri)
            g.parse('{}{}'.format(uri_2.strip(), conf['url_appendix']), format='xml')
            sameas = rdflib.term.URIRef(owl + 'sameAs')
            list_sameas = [genUri(uri=p) for p in g.objects(subject=o2, predicate=sameas)]
            self.sameas = list_sameas

            if 'kind' in conf:
                kind_val = None
                for k in conf['kind']:
                    # k is (predicate uri, fallback value)
                    kind_val = g.value(o2, rdflib.term.URIRef(k[0]))
                    if kind_val is not None:
                        break
                    kind_val = k[1]
                if kind_val is not None:
                    kind_model = ContentType.objects.get(
                        app_label=app_label_vocabularies,
                        model=kind.lower() + 'type').model_class()
                    kind_objct, _ = kind_model.objects.get_or_create(name=kind_val)
                    res_attrb['kind'] = kind_objct

            for same_uri in list_sameas:
                test = exist(same_uri)
                if test[0]:
                    # The entity is already known under an equivalent uri:
                    # reuse it and register the new uri for it.
                    self.objct = test[1]
                    self.created = False
                    genUri(uri=uri, entity=self.objct).save()
                    break

            for xx in conf['attributes']:
                subj2 = []
                results = []   # (type name, rdf node, index of the subject it came from)
                ind_type = ()  # (index into results, type name)
                for z in xx['identifiers']:
                    if len(results) > 0:
                        # An earlier identifier already produced values.
                        continue
                    # z alternates (selector, operator, selector, ...): '>' follows
                    # the found nodes as new subjects, '=' stores them as results.
                    cnt = 0
                    cnt_2 = 1
                    try:
                        k = z[cnt_2]
                    except IndexError:
                        k = '='
                    subj = [o2, ]
                    while k:
                        for indx, s in enumerate(subj):
                            if z[cnt][0] == 'objects':
                                pred = rdflib.term.URIRef(z[cnt][2])
                                res = g.objects(subject=s, predicate=pred)
                                if not isinstance(res, types.GeneratorType):
                                    break
                                for r in res:
                                    if z[cnt][3]:
                                        # optional (attribute, value) filter, e.g. language
                                        if not getattr(r, z[cnt][3][0]) == z[cnt][3][1]:
                                            continue
                                    if k == '>':
                                        subj2.append(r)
                                    elif k == '=':
                                        results.append((z[cnt][1], r, indx))
                                        ind_type += ((len(ind_type), z[cnt][1]),)
                        cnt_2 += 2
                        try:
                            k = z[cnt_2]
                        except IndexError:
                            k = '='
                        if cnt + 2 > len(z):
                            k = None
                        cnt += 2
                        subj = subj2

                matching = sett_RDF_generic[kind]['matching']

                for attrb, alternatives in matching['attributes'].items():
                    res_2 = []
                    for alt in alternatives:
                        for s in alt:
                            for ind, res_kind in ind_type:
                                if res_kind != s[0]:
                                    continue
                                res_2.append(self._prep_string((results[ind][1], s[1])))
                            if isinstance(s, str):
                                # literal text inserted between the parsed parts
                                res_2.append(s)
                        if len(res_2) == len(alt):
                            res_attrb[attrb] = ''.join(res_2)

                for lab, specs in matching['labels'].items():
                    lb_type, _ = LabelType.objects.get_or_create(name=lab)
                    for spec in specs:
                        for ind, res_kind in ind_type:
                            if res_kind != spec[0]:
                                continue
                            elem = results[ind][1]
                            labels.append(Label(
                                label=self._prep_string((elem, spec[1])),
                                isoCode_639_3=elem.language,
                                label_type=lb_type))

                if kwargs.get('drill_down', True):
                    own = kind.lower()
                    for con in matching['linked objects']:
                        other = con['type'].lower()
                        for spec in con['object']:
                            for ind, res_kind in ind_type:
                                if res_kind != spec[0]:
                                    continue
                                elem = results[ind][1]
                                ob = GenericRDFParser(
                                    elem, con['type'],
                                    app_label_entities=app_label_entities,
                                    app_label_relations=app_label_relations,
                                    app_label_vocabularies=app_label_vocabularies,
                                    drill_down=False)
                                if ob.created and not ob.saved:
                                    # TODO: We should move the save of related objects in the save routine
                                    ob.save()
                                try:
                                    u = ContentType.objects.get(
                                        app_label=app_label_relations, model=own + other)
                                    u_kind = ContentType.objects.get(
                                        app_label=app_label_vocabularies,
                                        model=own + other + 'relation')
                                except ContentType.DoesNotExist:
                                    # The relation model is named the other way round.
                                    u = ContentType.objects.get(
                                        app_label=app_label_relations, model=other + own)
                                    u_kind = ContentType.objects.get(
                                        app_label=app_label_vocabularies,
                                        model=other + own + 'relation')
                                u_kind_2 = u_kind.model_class()
                                u2 = u.model_class()()
                                uk, _ = u_kind_2.objects.get_or_create(name=con['kind'])
                                if con['type'] == kind:
                                    setattr(u2, 'related_' + other + 'B_id', ob.objct.pk)
                                else:
                                    setattr(u2, 'related_' + other + '_id', ob.objct.pk)
                                setattr(u2, 'relation_type_id', uk.pk)
                                related_objcts.append(u2)

        if self.created:
            # Keep an object found through owl:sameAs instead of replacing it
            # with a fresh, unsaved instance.
            self.objct = objct(**res_attrb)
        self.labels = labels
        self.related_objcts = related_objcts