# Source code for dvt.aggregate.length

# -*- coding: utf-8 -*-
"""Aggregate frame level information to estimate shot length.

The aggregator functions here takes detected faces and objects to estimate the
shot length. It also provides aggregated information about the detected faces
and objects for each frame.
"""

from numpy import argmax, array, max as npmax, nonzero

from ..abstract import Aggregator
from ..utils import _check_data_exists


class ShotLengthAggregator(Aggregator):
    """Uses detected faces and objects to estimate shot length.

    You can change the cut-offs and names of the face types.

    Attributes:
        min_obj_score (float): minimum confidence score to include a
            detected object in the computation
        min_face_score (float): minimum confidence score to include a
            detected face in the computation
        max_person_dist (float): maximum distance to a face to categorize
            as a known person.
        shot_names (list): a list of shot names, from the longest shot to
            the tightest. Set to None to use the default settings.
        shot_sizes (list): as list of shot size cut-offs given as a
            proportion (vertical) of face size to the entire shot. Should
            be an increasing list starting at zero and the same length as
            shot_names. Set to None to use the default settings.
        frames (list): an optional list of frames to aggregate over. Set
            to None (the default) to use every frame that has a detected
            face or object.
        name (str): A description of the aggregator. Used as a key in the
            output data.
    """

    name = "length"

    def __init__(self, **kwargs):
        self.min_obj_score = kwargs.get("min_obj_score", 0.7)
        self.min_face_score = kwargs.get("min_face_score", 0.7)
        self.max_person_dist = kwargs.get("max_person_dist", 100)
        self.shot_sizes = array(
            kwargs.get("shot_sizes", [0, 0.05, 0.15, 0.2, 0.3, 0.5, 0.7])
        )
        self.shot_names = kwargs.get(
            "shot_names",
            ["1-VLS", "2-LS", "3-MLS", "4-MS", "5-MCU", "6-CU", "7-BCU"],
        )
        self.frames = kwargs.get("frames", None)

        # argmax over (shot_sizes >= largest_face) indexes shot_names, so
        # the two sequences must line up one-to-one. Use an explicit error
        # rather than `assert`, which is stripped under `python -O`.
        if len(self.shot_sizes) != len(self.shot_names):
            raise ValueError(
                "shot_sizes and shot_names must have the same length"
            )

        super().__init__(**kwargs)

    def aggregate(self, ldframe, **kwargs):
        """Determine shot lengths using detected faces and objects.

        Args:
            ldframe (dict): A dictionary of DictFrames from a
                FrameAnnotator. Must contain an entry with the keys
                'meta', 'face' and 'obj', which are used in the
                annotation. If 'meta' has no 'height' column, an 'img'
                entry is also required to supply per-frame image sizes.

        Returns:
            A dictionary frame giving the detected people, with one row
            per frame in the original input. Frames are taken from the
            `frames` constructor option when given; otherwise every frame
            with a detected face or object is included.
        """
        # make sure annotators have been run
        _check_data_exists(ldframe, ["face", "obj", "meta"])

        # grab the data sets
        face = ldframe["face"]
        objs = ldframe["obj"]

        # get heights; different depending on input type (video or images):
        # video input has one global height/width in 'meta', image input
        # carries per-frame sizes in 'img'
        if "height" in ldframe["meta"].keys():
            face = face.assign(
                img_height=ldframe["meta"].height.values[0],
                img_width=ldframe["meta"].width.values[0],
            )
            objs = objs.assign(
                img_height=ldframe["meta"].height.values[0],
                img_width=ldframe["meta"].width.values[0],
            )
        else:
            face = face.merge(ldframe["img"], on="frame")
            objs = objs.merge(ldframe["img"], on="frame")

        # compute data using vectorized numpy arrays, where possible;
        # heights are expressed as a proportion of the image height
        face_height = (
            face.bottom.values - face.top.values
        ) / face.img_height.values
        objs_height = (
            objs.bottom.values - objs.top.values
        ) / objs.img_height.values

        # face recognition columns may be absent when no embedding
        # annotator was run; fill with harmless defaults
        if "person" not in face:
            face["person"] = ""
        if "person_dist" not in face:
            face["person_dist"] = 0

        # what frames to include? BUG FIX: use a local rather than
        # assigning back to self.frames — the old code cached the computed
        # frame set on the instance, so a second aggregate() call on a
        # different dataset silently reused the first dataset's frames
        frames = self.frames
        if frames is None:
            frames = set(face.frame.values).union(objs.frame.values)
        frames = sorted(frames)

        # create the output, one slot per frame
        output = {
            "frame": frames,
            "num_faces": [0] * len(frames),
            "num_people": [0] * len(frames),
            "largest_face": [0.0] * len(frames),
            "largest_body": [0.0] * len(frames),
            "shot_length": [""] * len(frames),
            "objects": [""] * len(frames),
            "people": [""] * len(frames),
        }

        # hoist loop-invariant column extractions out of the per-frame loop
        face_frames = face.frame.values
        face_conf = face.confidence.values
        face_dist = face.person_dist.values
        face_person = face.person.values
        objs_frames = objs.frame.values
        objs_cat = objs.category.values
        objs_score = objs.score.values

        for fnum, frame in enumerate(frames):
            # confident faces in this frame
            face_ids = nonzero(
                (face_frames == frame) & (face_conf > self.min_face_score)
            )[0]
            # confident faces close enough to a known identity
            face_person_ids = nonzero(
                (face_frames == frame)
                & (face_conf > self.min_face_score)
                & (face_dist < self.max_person_dist)
            )[0]
            # confident 'person' object detections
            objs_ids = nonzero(
                (objs_frames == frame)
                & (objs_cat == "person")
                & (objs_score > self.min_obj_score)
            )[0]
            # all object detections, regardless of category/score
            aobj_ids = nonzero(objs_frames == frame)[0]

            output["num_faces"][fnum] = len(face_ids)
            output["num_people"][fnum] = len(objs_ids)
            # initial=0 keeps npmax well-defined on empty selections
            output["largest_face"][fnum] = npmax(
                face_height[face_ids], initial=0
            )
            output["largest_body"][fnum] = npmax(
                objs_height[objs_ids], initial=0
            )
            output["objects"][fnum] = "; ".join(
                sorted(set(objs_cat[aobj_ids]))
            )
            output["people"][fnum] = "; ".join(
                sorted(set(face_person[face_person_ids]))
            )
            # first cut-off that is >= the largest face picks the shot name
            output["shot_length"][fnum] = self.shot_names[
                argmax(self.shot_sizes >= output["largest_face"][fnum])
            ]

        return output