Source code for djiffy.models

from collections import OrderedDict
from collections.abc import Mapping
import json
import os.path
import urllib
import urllib.parse

from attrdict import AttrMap
from django.conf import settings
from django.db import models
from django.urls import reverse
from jsonfield import JSONField
from piffle import iiif
import rdflib
from rdflib.namespace import DC
import requests


def get_iiif_url(url):
    '''Retrieve ``url`` with :meth:`requests.get`, adding an
    ``auth_token`` query parameter when the network location of the url
    has a token configured in the **DJIFFY_AUTH_TOKENS** django setting.

    :param url: url to be requested
    :returns: whatever :meth:`requests.get` returns for the request
    '''
    tokens = getattr(settings, 'DJIFFY_AUTH_TOKENS', None)
    # setting missing or empty: nothing to add to the request
    if not tokens:
        return requests.get(url)

    # tokens are looked up by the netloc portion of the requested url
    netloc = urllib.parse.urlparse(url).netloc
    if netloc not in tokens:
        return requests.get(url)

    return requests.get(url, params={'auth_token': tokens[netloc]})


class IIIFException(Exception):
    '''Exception raised for errors specific to IIIF content or djiffy.'''


class Manifest(models.Model):
    '''Minimal db model representation of an IIIF presentation manifest'''
    #: label
    label = models.TextField()
    #: short id extracted from URI
    short_id = models.CharField(max_length=255, unique=True)
    #: URI
    uri = models.URLField()
    #: iiif presentation metadata for display
    metadata = JSONField(load_kwargs={'object_pairs_hook': OrderedDict})
    #: date local manifest cache was created
    created = models.DateField(auto_now_add=True)
    #: date local manifest cache was last modified
    last_modified = models.DateField(auto_now=True)
    #: extra data provided via a 'seeAlso' reference
    extra_data = JSONField(load_kwargs={'object_pairs_hook': OrderedDict},
        default=OrderedDict)

    class Meta:
        verbose_name = 'IIIF Manifest'
        # add custom permissions; change and delete provided by django
        # NOTE(review): the codename here is 'view_canvas' although the
        # label describes the manifest (Canvas.Meta uses 'view_manifest').
        # Left unchanged because renaming a codename requires a migration
        # and could break existing permission checks -- confirm intent.
        permissions = (
            ('view_canvas', 'Can view %s' % verbose_name),
        )

    # todo: metadata? thumbnail references
    # - should we cache the actual manifest file?

    def __str__(self):
        # fall back to the short id when the label is empty
        return self.label or self.short_id

    @property
    def thumbnail(self):
        '''First associated canvas flagged as thumbnail, or None if no
        canvas is flagged.'''
        return self.canvases.filter(thumbnail=True).first()

    def get_absolute_url(self):
        '''url for this manifest within the django site'''
        return reverse('djiffy:manifest', args=[self.short_id])

    def admin_thumbnail(self):
        '''thumbnail for convenience display in admin interface;
        returns None when the manifest has no thumbnail canvas'''
        if self.thumbnail:
            return self.thumbnail.admin_thumbnail()
    admin_thumbnail.short_description = 'Thumbnail'
    admin_thumbnail.allow_tags = True

    @property
    def logo(self):
        '''manifest logo, if there is one'''
        return self.extra_data.get('logo', None)

    @property
    def license(self):
        '''manifest license, if there is one'''
        return self.extra_data.get('license', None)

    @property
    def rights_statement_id(self):
        '''Short id for a rightsstatements.org license: the second to last
        path segment of the license URI, e.g. ``NKC`` for
        ``http://rightsstatements.org/vocab/NKC/1.0/``.  Returns None
        for any other license.'''
        if self.license and 'rightsstatements.org' in self.license:
            return self.license.rstrip(' /').split('/')[-2]

    # rdflib graph with data about the license URI; populated by
    # license_label the first time a lookup by URI is attempted
    _rights_graph = None

    def license_label(self, lang='en'):
        '''Get the text label for the rights license.  Uses local
        value from edm rights if available; otherwise uses
        data for the URI to get the preferred label or title.

        :param lang: language code for the label looked up via the
            license URI (ignored when a local edm rights label is found)
        :returns: label as a string, or None if no label could be found
        '''

        # Some manifests have seeAlso data that contains an "edm_rights"
        # section with a label for the rights statement.
        # Use that if available (NOTE: ignores specified language)
        # NOTE: possibly PUL specific, but shouldn't hurt to look locally first
        for data in self.extra_data.values():
            # extra_data may also hold plain values (e.g. the logo and
            # license entries used by the properties on this class);
            # only mapping entries can carry an edm_rights section
            if not isinstance(data, Mapping):
                continue
            edm_rights = data.get('edm_rights')
            if isinstance(edm_rights, Mapping) and 'pref_label' in edm_rights:
                return edm_rights['pref_label']

        # if license/rights label is not available locally, get via uri
        if self._rights_graph is None:
            # if license is defined and a url
            if self.license and urllib.parse.urlparse(self.license).scheme in ['http', 'https']:
                # graph is stored before loading, so a failed load is
                # not attempted again for this instance
                self._rights_graph = rdflib.Graph()
                try:
                    # rights statement org does content-negotiation for json-ld,
                    # but rdflib doesn't handle that automatically
                    if 'rightsstatements.org' in self.license:
                        resp = requests.get(self.license,
                                            headers={'Accept': 'application/json'},
                                            allow_redirects=False)
                        # the redirect target is the json-ld document
                        if resp.status_code == requests.codes.see_other:
                            self._rights_graph.parse(resp.headers['location'], format='json-ld')

                    # creative commons doesn't support content negotiation,
                    # but you can add rdf to the end of the url
                    elif 'creativecommons.org' in self.license:
                        rdf_uri = '/'.join([self.license.rstrip('/'), 'rdf'])
                        self._rights_graph.parse(rdf_uri)

                except Exception:
                    # possible to get an exception when parsing the
                    # rdf, maybe on the request; don't choke if we do!

                    # NOTE: using generic Exception here because unfortunately
                    # that is what rdflib raises when it can't parse RDF
                    pass

        # get the preferred label for this license in the requested language;
        # returns a list of label, value; use the first value
        if self._rights_graph:
            license_uri = rdflib.URIRef(self.license)
            preflabel = self._rights_graph.preferredLabel(license_uri,
                                                          lang=lang)
            if preflabel:
                # convert rdflib Literal to string
                return str(preflabel[0][1])
            # otherwise, get dc title
            # iterate over all titles and return one with a matching language code
            for title in self._rights_graph.objects(subject=license_uri, predicate=DC.title):
                if title.language == lang:
                    return str(title)


class IIIFImage(iiif.IIIFImageClient):
    '''Subclass of :class:`piffle.iiif.IIIFImageClient`, for generating
    IIIF Image URIs for manifest canvas images.'''

    #: long edge size for single page display
    single_page_size = 1000
    #: long edge size for thumbnail
    thumbnail_size = 300
    #: long edge size for mini thumbnail
    mini_thumbnail_size = 100

    def thumbnail(self):
        '''thumbnail: height and width both set to :attr:`thumbnail_size`
        with ``exact=True``, in png format'''
        edge = self.thumbnail_size
        sized = self.size(height=edge, width=edge, exact=True)
        return sized.format('png')

    def mini_thumbnail(self):
        '''mini thumbnail: height and width both set to
        :attr:`mini_thumbnail_size` with ``exact=True``, in png format'''
        edge = self.mini_thumbnail_size
        sized = self.size(height=edge, width=edge, exact=True)
        return sized.format('png')

    def page_size(self):
        '''page size for display: height and width both set to
        :attr:`single_page_size` with ``exact=True``'''
        edge = self.single_page_size
        return self.size(height=edge, width=edge, exact=True)


class Canvas(models.Model):
    '''Minimal db model representation of a canvas from an IIIF manifest'''

    #: label
    label = models.TextField()
    #: short id extracted from URI
    short_id = models.CharField(max_length=255)
    #: URI
    uri = models.URLField()
    #: URL of IIIF image for this canvas
    iiif_image_id = models.URLField()
    #: :class:`Manifest` this canvas belongs to
    manifest = models.ForeignKey(Manifest, related_name='canvases')
    #: boolean flag to indicate if this canvas should be used as thumbnail
    thumbnail = models.BooleanField(default=False)
    #: order of this canvas within associated manifest primary sequence
    order = models.PositiveIntegerField()
    # (for now only stores a single sequence, so just store order on the page)
    # format? size? (ocr text eventually?)
    #: extra data not otherwise given its own field, serialized as json
    extra_data = JSONField(load_kwargs={'object_pairs_hook': OrderedDict},
        default=OrderedDict)

    class Meta:
        ordering = ["manifest", "order"]
        verbose_name = 'IIIF Canvas'
        verbose_name_plural = 'IIIF Canvases'
        unique_together = ("short_id", "manifest")
        # add custom permissions; change and delete provided by django
        # NOTE(review): the codename here is 'view_manifest' although the
        # label describes the canvas (Manifest.Meta uses 'view_canvas');
        # confirm whether the two codenames were swapped intentionally.
        permissions = (
            ('view_manifest', 'Can view %s' % verbose_name),
        )

    def __str__(self):
        # order is displayed 1-based; a trailing * marks the thumbnail canvas
        thumbnail_marker = '*' if self.thumbnail else ''
        return '%s %d (%s)%s' % (self.manifest, self.order + 1, self.label,
                                 thumbnail_marker)

    @property
    def image(self):
        '''Associated IIIF image for this canvas as :class:`IIIFImage`'''
        # NOTE: piffle iiif image wants service & id split out.
        # Should update to handle iiif image ids as provided in manifests
        # for now, split on the last slash into service and image id.
        # (is this reliable?)
        service_and_id = self.iiif_image_id.rsplit('/', 1)
        return IIIFImage(*service_and_id)

    @property
    def plain_text_url(self):
        '''Plain text url for this canvas, taken from the ``@id`` of the
        first ``text/plain`` entry in the ``rendering`` extra data;
        None if there is no rendering or no plain text entry.'''
        rendering = self.extra_data.get('rendering', None)
        if not rendering:
            return None

        # rendering may be a list of entries or a single entry;
        # treat both the same way by always looping over a list
        candidates = rendering if isinstance(rendering, list) else [rendering]
        for entry in candidates:
            if 'format' in entry and entry['format'] == 'text/plain':
                return entry['@id']

        # no plain text rendering available
        return None

    def get_absolute_url(self):
        '''url for this canvas within the django site'''
        url_args = [self.manifest.short_id, self.short_id]
        return reverse('djiffy:canvas', args=url_args)

    def next(self):
        '''Next canvas after this one in sequence (within manifest
        primary sequence).  Returns None if there is no next canvas.'''
        following = Canvas.objects.filter(manifest=self.manifest,
                                          order__gt=self.order)
        return following.first()

    def prev(self):
        '''Previous canvas before this one in sequence
        (within manifest primary sequence).  Returns None
        if there is no previous canvas.'''
        preceding = Canvas.objects.filter(manifest=self.manifest,
                                          order__lt=self.order)
        return preceding.last()

    def admin_thumbnail(self):
        '''thumbnail for convenience display in admin interface'''
        thumbnail_url = self.image.mini_thumbnail()
        return u'<img src="%s" />' % thumbnail_url
    admin_thumbnail.short_description = 'Thumbnail'
    admin_thumbnail.allow_tags = True


class IIIFPresentation(AttrMap):
    ''':class:`attrdict.AttrMap` subclass for read access to IIIF Presentation
    content'''

    # TODO: document sample use, e.g. @ fields

    #: keys that are stored with a leading @ in the underlying data but
    #: are accessed here without it (e.g. ``id`` for ``@id``)
    at_fields = ['type', 'id', 'context']

    @classmethod
    def from_file(cls, path):
        '''Initialize :class:`IIIFPresentation` from a file.

        :param path: path to a local JSON file
        '''
        # JSON interchange text is UTF-8; read it as such rather than
        # depending on the platform/locale default encoding
        with open(path, encoding='utf-8') as manifest:
            data = json.load(manifest)
        return cls(data)

    @classmethod
    def from_url(cls, uri):
        '''Initialize :class:`IIIFPresentation` from a URL.

        :param uri: url for the IIIF presentation content
        :raises: :class:`IIIFException` if URL is not retrieved successfully,
            if the response is not JSON content, or if the JSON cannot be parsed.
        '''
        response = get_iiif_url(uri)
        if response.status_code != requests.codes.ok:
            raise IIIFException('Error retrieving manifest at %s: %s %s' %
                (uri, response.status_code, response.reason))

        try:
            data = response.json()
        except ValueError as err:
            # ValueError is the common base of the JSON decode errors
            # (including json.decoder.JSONDecodeError), so this also
            # covers responses decoded by a non-stdlib json backend.
            # if json fails, two possibilities:
            # - we didn't actually get json (e.g. redirect for auth)
            content_type = response.headers.get('content-type', '')
            if 'application/json' not in content_type:
                raise IIIFException('No JSON found at %s' % uri) from err
            # - there is something wrong with the json
            raise IIIFException('Error parsing JSON for %s: %s' %
                (uri, err)) from err

        return cls(data)

    @classmethod
    def is_url(cls, url):
        '''Utility method to check if a path is a url or file; anything
        that parses with a non-empty scheme is treated as a url.'''
        return urllib.parse.urlparse(url).scheme != ""

    @classmethod
    def from_file_or_url(cls, path):
        '''Initialize :class:`IIIFPresentation` from a file or a url.

        :param path: local file path or url; an existing file takes
            precedence over url handling
        :raises: :class:`IIIFException` if path is neither an existing
            file nor a url
        '''
        if os.path.isfile(path):
            return cls.from_file(path)
        elif cls.is_url(path):
            return cls.from_url(path)
        else:
            raise IIIFException('File not found: %s' % path)

    @classmethod
    def short_id(cls, uri):
        '''Generate a short id from full manifest/canvas uri identifiers
        for use in local urls.  Logic is based on the recommended
        url pattern from the IIIF Presentation 2.0 specification.'''

        # shortening should work reliably for uris that follow
        # recommended url patterns from the spec
        # http://iiif.io/api/presentation/2.0/#a-summary-of-recommended-uri-patterns
        #   manifest:  {scheme}://{host}/{prefix}/{identifier}/manifest
        #   canvas: {scheme}://{host}/{prefix}/{identifier}/canvas/{name}

        # remove trailing /manifest at the end of the url, if present
        if uri.endswith('/manifest'):
            uri = uri[:-len('/manifest')]
        # split on slashes and return the last portion
        return uri.split('/')[-1]

    def __getattr__(self, key):
        """
        Access an item as an attribute.
        """
        # override getattr to allow use of keys with leading @,
        # which are otherwise not detected as present and not valid
        at_key = self._handle_at_keys(key)
        if key not in self or \
                (key not in self.at_fields and at_key not in self) or \
                not self._valid_name(key):
            raise AttributeError(
                "'{cls}' instance has no attribute '{name}'".format(
                    cls=self.__class__.__name__, name=key
                )
            )
        return self._build(self[key])

    def _handle_at_keys(self, key):
        '''Return the key as stored in the underlying mapping: keys listed
        in :attr:`at_fields` get a leading @, all others are unchanged.'''
        if key in self.at_fields:
            key = '@%s' % key
        return key

    def __getitem__(self, key):
        """
        Access a value associated with a key.
        """
        return self._mapping[self._handle_at_keys(key)]

    def __setitem__(self, key, value):
        """
        Add a key-value pair to the instance.
        """
        self._mapping[self._handle_at_keys(key)] = value

    def __delitem__(self, key):
        """
        Delete a key-value pair
        """
        del self._mapping[self._handle_at_keys(key)]

    @property
    def first_label(self):
        '''The label itself when it is a string; otherwise the first
        item of the label.'''
        # label can be a string or list of strings
        if isinstance(self.label, str):
            return self.label
        else:
            return self.label[0]