# Source code for dvt.aggregate.people

# -*- coding: utf-8 -*-
"""Aggregate frame level information to detect people in shots.

The aggregator functions here take face embeddings and tries to predict the
identity of people within each shot.
"""

from os.path import join, basename, splitext

from numpy import argmin, min as npmin, vstack
from numpy.linalg import norm

from ..abstract import Aggregator
from ..annotate.face import FaceAnnotator, FaceDetectMtcnn, FaceEmbedVgg2
from ..core import DataExtraction, ImageInput
from ..utils import _check_data_exists


class PeopleAggregator(Aggregator):
    """Uses face embeddings to identify the identity of people in the frame.

    You will need to provide baseline faces for the annotator to compare to.
    Note that the annotator returns the nearest faces along with the distance
    to each face.

    Attributes:
        face_names (list): List of names associated with each face in the set
            of predefined faces
        fprint (numpy array): A numpy array giving the embedding vectors for
            the predefined faces. Each row should correspond with one face id
            and the number of columns should match the number of columns in
            your embedding.
        name (str): A description of the aggregator. Used as a key in the
            output data.
    """

    name = "people"

    def __init__(self, **kwargs):
        self.face_names = kwargs.get("face_names")
        self.fprint = kwargs.get("fprint")

        # Validate with explicit exceptions rather than `assert`, which is
        # silently stripped when Python runs with -O; also give a clear
        # message instead of an opaque AttributeError when inputs are missing.
        if self.face_names is None or self.fprint is None:
            raise ValueError(
                "PeopleAggregator requires both 'face_names' and 'fprint'."
            )
        if self.fprint.shape[0] != len(self.face_names):
            raise ValueError(
                "Number of rows in 'fprint' must equal the number of entries "
                "in 'face_names'."
            )

        super().__init__(**kwargs)

    def aggregate(self, ldframe, **kwargs):
        """Aggregate faces.

        Args:
            ldframe (dict): A dictionary of DictFrames from a FrameAnnotator.
                Must contain an entry with the key 'face', which is used in
                the annotation.

        Returns:
            A dictionary frame giving the detected people, with one row per
            detected face.
        """
        # make sure annotators have been run
        _check_data_exists(ldframe, ["face"])

        # grab the data and create new output
        ops = ldframe["face"]
        output = {
            "frame": ops.frame.values.copy(),
            "top": ops.top.values.copy(),
            "left": ops.left.values.copy(),
            "bottom": ops.bottom.values.copy(),
            "right": ops.right.values.copy(),
            "confidence": ops.confidence.values.copy(),
            "person": [""] * len(ops.frame.values),
            "person-dist": [""] * len(ops.frame.values),
        }

        # cycle through frames and detect closest face; let the user filter as
        # needed
        for fid, face in enumerate(ops.embed.values):
            # Euclidean distance from this face to every reference face; the
            # nearest reference face supplies the predicted identity.
            dists = norm(face - self.fprint, axis=1)
            output["person"][fid] = self.face_names[argmin(dists)]
            output["person-dist"][fid] = npmin(dists)

        return output
def make_fprint_from_images(dinput):
    """Create face fingerprints from a directory of faces.

    This function takes as an input a directory containing image files, with
    each image given the name of a person or character. The function returns
    the 'fingerprints' (stereotypical embedding) of the faces in a format
    that can be passed to the PeopleAggregator.

    Args:
        dinput (str): Path to a directory of image files. Each file's base
            name (without extension) is used as the name of the person in
            the image.

    Returns:
        A tuple giving the numpy array of embedding vectors and a list of
        the names of the people in the images.
    """
    # run face detection + embedding over every file in the directory
    dextra = DataExtraction(ImageInput(input_paths=join(dinput, "", "*")))
    dextra.run_annotators([FaceAnnotator(
        detector=FaceDetectMtcnn(), embedding=FaceEmbedVgg2()
    )])

    faces = dextra.get_data()['face']
    # person names come from the image file names, extension stripped
    face_names = [
        splitext(basename(x))[0] for x in dextra.get_data()["meta"]["paths"]
    ]

    return vstack(faces.embed), face_names