# Source code for bob.db.ijbc.reader

#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# @author: Manuel Gunther <siebenkopf@googlemail.com>
# @date:   Mon Oct 16 18:35:14 MDT 2017

"""
This module provides utility classes that parse the original database files.
"""

import pkg_resources
import os

import bob.db.base
import csv
import numpy

import logging
import six

# Module-level logger; it is registered under the package name "bob.db.ijbc".
logger = logging.getLogger("bob.db.ijbc")


class Annotation:
    """Annotations for a File of the IJB-C dataset.

    The object is built from the 30 numeric annotation values of one file:

    * entries 0 to 3 describe the face bounding box and must not all be NaN;
      they are stored as ``topleft = (annots[1], annots[0])`` and
      ``size = (annots[3], annots[2])`` (presumably ``(y, x)`` and
      ``(height, width)`` -- TODO confirm against the metadata header);
    * entries 4 to 11 are optional attributes (``frame``, ``facial_hair``,
      ``age``, ``indoor``, ``skintone``, ``gender``, ``yaw``, ``roll``), which
      are stored as ``None`` when the value is NaN;
    * entries 12 to 29 are kept unchanged in ``occlusion``.

    Calling an instance returns the bounding box dictionary with the keys
    ``topleft``, ``bottomright`` and ``size``.
    """

    @staticmethod
    def _nan_to_none(value):
        """Returns ``None`` if ``value`` is NaN, and ``value`` itself otherwise."""
        return None if numpy.isnan(value) else value

    def __init__(self, annots):
        """Extracts the bounding box and the attributes from ``annots``.

        Parameters
        ----------

        annots : sequence of float
          The 30 annotation values of one file; missing values are NaN.

        Raises
        ------

        AssertionError
          If ``annots`` does not contain exactly 30 values, or if all four
          bounding box values are NaN.
        """
        # assure that we have all annotations
        assert len(annots) == 30, "Expected 30 annotation values, but got %d" % len(annots)

        # assure that a face bounding box is present
        assert not numpy.all(numpy.isnan(annots[:4])), "The face bounding box is missing"
        self.topleft = (annots[1], annots[0])
        self.size = (annots[3], annots[2])
        self.bottomright = tuple(self.topleft[i] + self.size[i] for i in range(2))

        # optional attributes: NaN marks a missing value
        self.frame = self._nan_to_none(annots[4])
        self.facial_hair = self._nan_to_none(annots[5])
        self.age = self._nan_to_none(annots[6])
        self.indoor = self._nan_to_none(annots[7])
        self.skintone = self._nan_to_none(annots[8])
        self.gender = self._nan_to_none(annots[9])
        self.yaw = self._nan_to_none(annots[10])
        self.roll = self._nan_to_none(annots[11])

        # the remaining 18 values are kept as they are
        self.occlusion = annots[12:30]
        self.annotation = dict(topleft=self.topleft, bottomright=self.bottomright, size=self.size)

    def __call__(self):
        """Returns the bounding box dictionary (``topleft``, ``bottomright``, ``size``)."""
        return self.annotation


class File(bob.db.base.File):
    """
    IJB-C File class

    Different from its base class, this one takes the ``subject_id`` and the
    original ``path`` (including the file name extension) as input.
    The extension is split off and stored in ``self.extension``, and the
    file id is generated from the extension-free path and the ``subject_id``
    using :py:meth:`make_id`.
    """

    @staticmethod
    def make_id(path, subject_id):
        """Returns the file id ``"<path>-<subject_id>"`` for the given (extension-free) path and subject id."""
        return "%s-%s" % (path, subject_id)

    def __init__(self, subject_id, path, annotation=None):
        """**Constructor Documentation**

        Initialize the File object with the minimum required data.

        Parameters
        ----------

        subject_id : various type
          The id of the client, this file belongs to, typically the ``subject_id`` of the protocol file.

        path : str
          The path of this file, relative to the basic directory.

        annotation : :py:class:`Annotation` or ``None``
          The annotation of the file, if present
        """
        # the base class gets the path without extension; the extension is kept separately
        path, self.extension = os.path.splitext(path)
        super(File, self).__init__(path, self.make_id(path, subject_id))
        self.client_id = subject_id
        self.annotation = annotation

    def make_path(self, directory=None, extension=None, add_client_id=True):
        """Wraps the current path so that a complete path is formed.
        By default, the file name will be a unique file name, as there might be several ``File`` objects with the same path.
        To get the original file name, please set the ``add_client_id`` flag to ``False``.

        Keyword parameters:

        directory : str or ``None``
          An optional directory name that will be prefixed to the returned result.

        extension : str or ``None``
          An optional extension that will be suffixed to the returned filename.
          The extension normally includes the leading ``.`` character as in ``.jpg`` or ``.hdf5``.

        add_client_id : bool
          By default, the client_id is added to generate a unique path.
          If set to false, the client_id will not be added.

        Returns a string containing the newly generated file path, which by default is unique.
        """
        # assure that directory and extension are actually strings
        if directory is None:
            directory = ''
        if extension is None:
            extension = ''
        # ``self.id`` contains the client id, ``self.path`` does not
        # (both are presumably set by the base class constructor)
        stem = self.id if add_client_id else self.path
        # create the path
        return os.path.join(directory, "%s%s" % (stem, extension))


class Template:
    """A ``Template`` groups :py:class:`File` objects that belong to the same
    subject (there might be several templates per subject).

    The grouped files are available in the ``self.files`` list.

    Attributes set by the constructor:

    * ``id``: the ``template_id`` that was passed in
    * ``client_id``: the ``subject_id`` that was passed in
    * ``files``: the given list of files, or a new empty list
    * ``path``: the string representation of the ``template_id``

    Templates are ordered by their ``id`` (see :py:meth:`__lt__`).
    """

    def __init__(self, template_id, subject_id, files=None):
        # every template gets its own list, unless the caller provides one
        if files is None:
            files = []
        self.files = files

        self.client_id = subject_id
        self.id = template_id
        self.path = str(template_id)

    def __lt__(self, other):
        """Compares two templates by their ``id``, so that sorting a list of
        Template objects yields ascending IDs."""
        return self.id < other.id


class Protocol:
    """The list of protocols and their according files.

    All protocol files are read lazily from the ``protocol`` directory of this
    package, and everything that was read is cached in the instance:

    * ``_files``: file id -> :py:class:`File`, read from the metadata file
    * ``_templates``: list name -> {template id -> :py:class:`Template`}
    * ``_matches``: protocol -> {model template id -> [probe template ids]}
    * ``_covariates``: purpose -> {template id -> :py:class:`Template`}
    """

    def __init__(self):
        """Locates the protocol directory and initializes the empty caches.

        Raises
        ------

        IOError
          If the protocol directory does not exist.
        """
        self.base_directory = pkg_resources.resource_filename(__name__, "protocol")
        if not os.path.isdir(self.base_directory):
            raise IOError(
                "The protocol directory %s cannot be found? Did you forget to download the protocol files with 'bob_dbmanage.py ijbc download'?" % self.base_directory)
        self._files = {}
        self._templates = {}
        self._matches = {}
        self._covariates = {}

        self.protocol_names = [
            "1:1", "Covariates"
        ]
        # TODO: Take care of other protocols
        #self.protocol_names = [
        #    "1:1", "Covariates",
        #    "1:N-G1-Image", "1:N-G2-Image", "1:N-Image",
        #    "1:N-G1-Mixed", "1:N-G2-Mixed", "1:N-Mixed",
        #    "1:N-G1-Video", "1:N-G2-Video", "1:N-Video"
        #]

        self.purpose_names = ["enroll", "probe"]

    def _read_metadata(self):
        """Reads the meta-data file if not yet done.

        Each row (after the header) provides the subject id in column 0, the
        file path in column 1, and the annotation values from column 3 on.
        When several rows result in the same file id, only the first one is
        kept.
        """
        if self._files:
            return

        with open(os.path.join(self.base_directory, "ijbc_metadata.csv")) as p:
            reader = csv.reader(p)
            # skip header row
            six.next(reader)
            for splits in reader:
                # generate annotations; rows without any annotation get None
                annots = [float(a) for a in splits[3:]]
                annotation = None if numpy.all(numpy.isnan(annots)) else Annotation(annots)

                # create file; a NaN subject id is stored as None
                subject_id = None if numpy.isnan(float(splits[0])) else int(splits[0])
                entry = File(subject_id, splits[1], annotation)
                # duplicate entries are ignored, the first occurrence wins
                if entry.id not in self._files:
                    self._files[entry.id] = entry

    def _read_template_list(self, which, protocol_file):
        """Reads the templates of the given protocol file if not yet done.

        Parameters
        ----------

        which : str
          The name under which the templates are cached in ``self._templates``.

        protocol_file : str
          The name of the CSV file inside the protocol directory; each row
          (after the header) contains the template id, the subject id and
          the file path in its first three columns.

        Returns
        -------

        dict
          The templates of the list: template id -> :py:class:`Template`.
        """
        if which not in self._templates:
            templates = {}
            with open(os.path.join(self.base_directory, protocol_file)) as p:
                reader = csv.reader(p)
                # skip header row
                six.next(reader)
                for splits in reader:
                    # generate file id
                    subject_id = None if numpy.isnan(float(splits[1])) else int(splits[1])
                    file_id = File.make_id(os.path.splitext(splits[2])[0], subject_id)

                    # make sure we know that file already
                    assert file_id in self._files, "The file id '%s' is not contained in the metadata" % file_id

                    # add it to the template, or create it if not done yet
                    template_id = int(splits[0])
                    if template_id not in templates:
                        templates[template_id] = Template(template_id, subject_id)
                    templates[template_id].files.append(self._files[file_id])

                    # TODO: check that the annotations match

            # register the list only after it was read completely, so that a
            # failed read does not leave an empty list in the cache
            self._templates[which] = templates

        return self._templates[which]

    def _read_match_file(self, protocol, protocol_file):
        """Reads the match file of the given protocol if not yet done.

        Parameters
        ----------

        protocol : str
          The protocol name under which the matches are cached.

        protocol_file : str
          The name of the CSV file (without header row) inside the protocol
          directory; each row contains a model id and a probe id.

        Returns
        -------

        dict
          model template id -> list of probe template ids
        """
        if protocol not in self._matches:
            # assure that the probe is loaded
            if protocol == "1:1":
                self.get_templates(protocol, "probe")

            # read match files
            match_file = os.path.join(self.base_directory, protocol_file)
            matches = {}

            with open(match_file) as f:
                reader = csv.reader(f)
                for splits in reader:
                    # each row is a (model id, probe id) pair
                    assert len(splits) == 2, "Expected 2 columns in the match file, but got %d" % len(splits)
                    model_id = int(splits[0])
                    probe_id = int(splits[1])
                    matches.setdefault(model_id, []).append(probe_id)

            # register the matches only after the file was read completely
            self._matches[protocol] = matches

        return self._matches[protocol]

    def get_templates(self, protocol, purpose=None):
        """Returns all :py:class:`Template`'s for the given protocol and purpose.

        Parameters
        ----------

        protocol : str
          One of ``self.protocol_names``.

        purpose : str
          One of ``self.purpose_names``; note that the default ``None`` is
          not a valid purpose, so the purpose needs to be specified.

        Returns
        -------

        dict
          template id -> :py:class:`Template`

        Raises
        ------

        AssertionError
          If the ``protocol`` or the ``purpose`` is not known.
        """
        assert protocol in self.protocol_names, "Unknown protocol '%s'" % protocol
        assert purpose in self.purpose_names, "Unknown purpose '%s'" % purpose
        # read metadata if not done yet
        self._read_metadata()

        if protocol == "Covariates":
            # for the covariates, we do not use the default gallery
            if not self._covariates:
                # first, read all templates
                all_templates = self._read_template_list("Covariates", "ijbc_11_covariate_probe_reference.csv")
                # and now split them into model and probe (overlapping)
                matches = self._read_match_file("Covariates", "ijbc_11_covariate_matches.csv")
                probe_ids = set(p for probes in matches.values() for p in probes)
                self._covariates["enroll"] = {x: all_templates[x] for x in matches}
                self._covariates["probe"] = {x: all_templates[x] for x in probe_ids}
            return self._covariates[purpose]

        elif purpose == "enroll":
            # otherwise, we have the same templates for enrollment, throughout
            # NOTE(review): the gallery is selected by "S1"/"S2", while the
            # (currently disabled) 1:N protocol names use "G1"/"G2" -- confirm
            # before enabling those protocols
            if "S2" not in protocol:
                self._read_template_list("G1", "ijbc_1N_gallery_G1.csv")
            if "S1" not in protocol:
                self._read_template_list("G2", "ijbc_1N_gallery_G2.csv")

            if "S1" in protocol:
                return self._templates["G1"]
            elif "S2" in protocol:
                return self._templates["G2"]
            else:
                # merge both galleries once, and cache the result under the
                # same key that is looked up here
                if "G1G2" not in self._templates:
                    merged = self._templates["G1"].copy()
                    merged.update(self._templates["G2"])
                    self._templates["G1G2"] = merged
                return self._templates["G1G2"]

        else:
            # probes for 1:N protocol
            if "Image" in protocol:
                return self._read_template_list("Image", "ijbc_1N_probe_img.csv")
            elif "Video" in protocol:
                return self._read_template_list("Video", "ijbc_1N_probe_video.csv")
            else:
                # This file is used for both the 1:1 protocol (as probes) and the 1:N-Mixed protocols
                return self._read_template_list("Mixed", "ijbc_1N_probe_mixed.csv")

    def enroll_template(self, protocol, model_id):
        """Returns the enrollment template for the given model_id.

        Raises an ``AssertionError`` if ``model_id`` is not the id of an
        enrollment template of the given protocol.
        """
        templates = self.get_templates(protocol=protocol, purpose="enroll")
        assert model_id in templates, "The given model id '%s' is not a gallery template ID" % model_id
        return templates[model_id]

    def probe_templates(self, protocol, model_id):
        """Returns the list of probe templates for the given model_id.

        For the ``"1:1"`` and ``"Covariates"`` protocols, only the probes that
        are listed in the match file for ``model_id`` are returned (a
        ``KeyError`` is raised if ``model_id`` has no matches).
        For all other protocols, all probe templates are returned and
        ``model_id`` is ignored.
        """
        if protocol == "1:1":
            # make sure that the probe templates are loaded before they are looked up
            self.get_templates(protocol, "probe")
            matches = self._read_match_file("1:1", "ijbc_11_G1_G2_matches.csv")[model_id]
            return [self._templates["Mixed"][m] for m in matches]
        elif protocol == "Covariates":
            # make sure that the covariate templates are loaded before they are looked up
            self.get_templates(protocol, "probe")
            matches = self._read_match_file("Covariates", "ijbc_11_covariate_matches.csv")[model_id]
            return [self._templates["Covariates"][m] for m in matches]
        else:
            # for 1:N protocols, return all probe files (as a list, like the other branches)
            return list(self.get_templates(protocol, "probe").values())