Source code for dvt.annotate.face

# -*- coding: utf-8 -*-
"""Annotators to detect and identify faces.

Identifying individuals in an image generally requires two distinct steps. The
first is detecting bounding boxes for faces in the image and the second is
identifying the faces themselves. Currently the most common method for doing
the second step is to project a detected face into a high-dimensional space
designed such that different images of the same person will be close together
and images of different people will be farther apart. This module is built
around this paradigm, allowing for the specification of custom detectors and
embeddings into the model.
"""

from importlib import import_module

from numpy import float32, expand_dims

from ..abstract import FrameAnnotator
from ..utils import (
    process_output_values,
    sub_image,
    _proc_frame_list,
    _which_frames,
    _trim_bbox
)


class FaceAnnotator(FrameAnnotator):
    """Annotator for detecting faces and embedding them as a face vector.

    For every selected frame the detector is asked for faces; each detected
    face is tagged with its frame name, optionally extended with an
    embedding, and passed through ``process_output_values`` before being
    collected into the returned list.

    Attributes:
        detector: An object with a method called detect that takes an image
            and returns an iterable of detected faces (one dictionary per
            face). Can be set to None (default) as a pass-through option for
            testing, in which case no faces are reported.
        embedding: An object with a method embed that takes an image along
            with the dictionary describing one detected face and returns the
            embedding of that face. Set to None (default) to only run the
            face detector.
        freq (int): How often to perform the embedding. For example, setting
            the frequency to 2 will embed every other frame in the batch.
        frames (array of ints): An optional list of frames to process. This
            should be a list of integers or a 1D numpy array of integers. If
            set to something other than None, the freq input is ignored.
        name (str): A description of the annotator. Used as a key in the
            output data.
    """

    name = "face"

    def __init__(self, **kwargs):
        """Store the detector, embedding and frame-selection options.

        Args:
            **kwargs: Recognised keys are "freq" (default 1), "detector"
                (default None), "embedding" (default None) and "frames"
                (default None). All keyword arguments are forwarded to the
                parent class constructor as well.
        """
        self.freq = kwargs.get("freq", 1)
        self.detector = kwargs.get("detector")
        self.embedding = kwargs.get("embedding", None)
        self.frames = _proc_frame_list(kwargs.get("frames", None))

        super().__init__(**kwargs)

    def annotate(self, batch):
        """Annotate the batch of frames with the face annotator.

        Args:
            batch (FrameBatch): A batch of images to annotate.

        Returns:
            A list built from ``process_output_values`` applied to every
            detected face. Each face dictionary carries whatever the detector
            returned plus a "frame" entry and, when an embedding object is
            configured, an "embed" entry holding the embedding wrapped in a
            one-element list. The list is empty when no detector is set or
            when no faces are found.
        """
        # Documented pass-through mode: with no detector there is nothing to
        # detect, so report "no faces" instead of failing on None.detect().
        if self.detector is None:
            return []

        # The frame names do not change while the batch is being processed,
        # so fetch them once rather than once per detected face.
        frame_names = batch.get_frame_names()

        f_faces = []
        for fnum in _which_frames(batch, self.freq, self.frames):
            img = batch.img[fnum, :, :, :]
            for face in self.detector.detect(img):
                face["frame"] = frame_names[fnum]
                if self.embedding is not None:
                    face["embed"] = [self.embedding.embed(img, face)]
                f_faces.extend(process_output_values(face))

        return f_faces


class FaceDetectMtcnn:
    """Face detector backed by the Multi-task Cascaded CNN (MTCNN) model.

    Attributes:
        cutoff (float): Minimum confidence a detection must reach to be kept
            in the output. The default of zero keeps every detection.
    """

    def __init__(self, cutoff=0):
        """Load the mtcnn package lazily and build the detector.

        Args:
            cutoff (float): Confidence threshold applied in ``detect``.
        """
        # Imported at construction time so that the module can be loaded
        # without the mtcnn package being importable.
        self.mtcnn = import_module("mtcnn.mtcnn")
        self.cutoff = cutoff
        self._mt = self.mtcnn.MTCNN(min_face_size=20)

    def detect(self, img):
        """Find faces in a single image.

        Args:
            img (numpy array): A single image stored as a three-dimensional
                numpy array.

        Returns:
            A list with one dictionary per retained detection. Each
            dictionary holds the trimmed bounding box under the keys "top",
            "right", "bottom" and "left", plus the detection confidence
            wrapped in a one-element list under "confidence".
        """
        found = []
        for hit in self._mt.detect_faces(img):
            # Skip detections that do not reach the confidence threshold.
            if not hit["confidence"] >= self.cutoff:
                continue

            # The raw box is read as (x, y, width, height); turn it into
            # (top, right, bottom, left) edges before trimming.
            box = hit["box"]
            x_min = box[0]
            y_min = box[1]
            edges = (y_min, x_min + box[2], y_min + box[3], x_min)
            trimmed = _trim_bbox(edges, img.shape)

            found.append(
                {
                    "top": trimmed[0],
                    "right": trimmed[1],
                    "bottom": trimmed[2],
                    "left": trimmed[3],
                    "confidence": [hit["confidence"]],
                }
            )

        return found


class FaceEmbedVgg2:
    """Embed faces using the VGGFace2 model.

    A face embedding with state-of-the-art results, particularly suitable
    when there are small or non-forward-facing examples in the dataset.
    """

    # Offsets subtracted from channels 0, 1 and 2 after the channel order has
    # been reversed. Presumably the VGGFace2 training-set channel means in
    # BGR order -- TODO confirm against the model's preprocessing.
    _CHANNEL_OFFSETS = (91.4953, 103.8827, 131.0912)

    def __init__(self):
        """Download (if needed) and load the pretrained model weights."""
        # Keras is imported lazily so the module can be imported without it.
        from keras.models import load_model
        from keras.utils import get_file
        from keras import backend as K

        mloc = get_file(
            "vggface2-resnet50.h5",
            origin="https://github.com/distant-viewing/dvt/"
            "releases/download/0.0.1/"
            "vggface2-resnet50.h5",
        )
        self._model = load_model(mloc)
        self._iformat = K.image_data_format()

    def embed(self, img, face):
        """Embed one detected face of an image.

        Args:
            img (numpy array): A single image stored as a three-dimensional
                numpy array.
            face (dict): Location of the detected face in the image, given by
                the keys "top", "right", "bottom" and "left".

        Returns:
            A one-dimensional numpy array holding the embedding of the face
            (the slice ``[0, 0, 0, :]`` of the model prediction).
        """
        # Crop an enlarged (factor 1.3) region around the face and resize it
        # to the 224 x 224 input requested from sub_image.
        crop = sub_image(
            img=img,
            top=face["top"],
            right=face["right"],
            bottom=face["bottom"],
            left=face["left"],
            fct=1.3,
            output_shape=(224, 224),
        )
        iscale = self._proc_image(crop)
        return self._model.predict(iscale)[0, 0, 0, :]

    def _proc_image(self, iscale):
        """Turn a cropped face into a normalised single-image batch.

        The input is never modified: the conversion to float32 always works
        on a fresh copy, so the in-place subtraction below cannot leak back
        into the caller's array.

        Args:
            iscale (numpy array): The cropped face image.

        Returns:
            A float32 numpy array with a leading batch axis of length one,
            the channel axis reversed and the per-channel offsets subtracted.
        """
        # astype() copies unconditionally, unlike float32(array), which may
        # hand back the very same object when the dtype already matches.
        batch = expand_dims(iscale, axis=0).astype(float32)

        if self._iformat == "channels_first":  # pragma: no cover
            # NOTE(review): this reverses axis 1, which is the channel axis
            # only if the crop is already laid out channels-first -- confirm
            # the layout returned by sub_image.
            batch = batch[:, ::-1, ...]
            for chan, offset in enumerate(self._CHANNEL_OFFSETS):
                batch[:, chan, :, :] -= offset
        else:
            batch = batch[..., ::-1]
            for chan, offset in enumerate(self._CHANNEL_OFFSETS):
                batch[..., chan] -= offset

        return batch