Source code for bob.bio.face.database.ijbc

from bob.bio.base.pipelines.vanilla_biometrics.abstract_classes import Database
import pandas as pd
from bob.pipelines.sample import DelayedSample, SampleSet
from bob.extension import rc
import os
import bob.io.image
from functools import partial
from bob.pipelines.utils import hash_string
import copy
import logging

logger = logging.getLogger(__name__)


def _make_sample_from_template_row(row, image_directory):

    # Appending this key, so we can handle parallel writting done correctly
    # paying the penalty of having duplicate files
    key = os.path.splitext(row["FILENAME"])[0] + "-" + str(row["SUBJECT_ID"])

    return DelayedSample(
        load=partial(bob.io.image.load, os.path.join(image_directory, row["FILENAME"])),
        reference_id=str(row["TEMPLATE_ID"]),
        subject_id=str(row["SUBJECT_ID"]),
        key=key,
        # gender=row["GENDER"],
        # indoor_outdoor=row["INDOOR_OUTDOOR"],
        # skintone=row["SKINTONE"],
        # yaw=row["YAW"],
        # rool=row["ROLL"],
        # occ1=row["OCC1"],
        # occ2=row["OCC2"],
        # occ3=row["OCC3"],
        # occ4=row["OCC4"],
        # occ5=row["OCC5"],
        # occ6=row["OCC6"],
        # occ7=row["OCC7"],
        # occ8=row["OCC8"],
        # occ9=row["OCC9"],
        # occ10=row["OCC10"],
        # occ11=row["OCC11"],
        # occ12=row["OCC12"],
        # occ13=row["OCC13"],
        # occ14=row["OCC14"],
        # occ15=row["OCC15"],
        # occ16=row["OCC16"],
        # occ17=row["OCC17"],
        # occ18=row["OCC18"],
        annotations={
            "topleft": (float(row["FACE_Y"]), float(row["FACE_X"])),
            "bottomright": (
                float(row["FACE_Y"]) + float(row["FACE_HEIGHT"]),
                float(row["FACE_X"]) + float(row["FACE_WIDTH"]),
            ),
            "size": (float(row["FACE_HEIGHT"]), float(row["FACE_WIDTH"])),
        },
    )


def _make_sample_set_from_template_group(template_group, image_directory):

    samples = list(
        template_group.apply(
            _make_sample_from_template_row, axis=1, image_directory=image_directory
        )
    )
    return SampleSet(
        samples,
        reference_id=samples[0].reference_id,
        subject_id=samples[0].subject_id,
        key=samples[0].reference_id,
    )


[docs]class IJBCDatabase(Database):
    """

    This package contains the access API and descriptions for the IARPA Janus Benchmark C -- IJB-C database.
    The actual raw data can be downloaded from the original web page: http://www.nist.gov/programs-projects/face-challenges (note that not everyone might be eligible for downloading the data).

    Included in the database, there are list files defining verification as well as closed- and open-set identification protocols.
    For verification, two different protocols are provided.
    For the ``1:1`` protocol, gallery and probe templates are combined using several images and video frames for each subject.
    Compared gallery and probe templates share the same gender and skin tone -- these have been matched to make the comparisions more realistic and difficult.

    For closed-set identification, the gallery of the ``1:1`` protocol is used, while probes stem from either only images, mixed images and video frames, or plain videos.
    For open-set identification, the same probes are evaluated, but the gallery is split into two parts, either of which is left out to provide unknown probe templates, i.e., probe templates with no matching subject in the gallery.
    In any case, scores are computed between all (active) gallery templates and all probes.

    The IJB-C dataset provides additional evaluation protocols for face detection and clustering, but these are (not yet) part of this interface.


    .. warning::
      
      To use this dataset protocol, you need to have the original files of the IJBC datasets.
      Once you have it downloaded, please run the following command to set the path for Bob

        .. code-block:: sh

            bob config set bob.bio.face.ijbc.directory [IJBC PATH]

    
    The code below allows you to fetch the galery and probes of the "1:1" protocol.

    .. code-block:: python

        >>> from bob.bio.face.database import IJBCDatabase
        >>> ijbc = IJBCDatabase(protocol="test1")
        >>>
        >>> # Fetching the gallery 
        >>> references = ijbc.references()
        >>> # Fetching the probes 
        >>> probes = ijbc.probes()
    
    """

    def __init__(
        self,
        protocol,
        original_directory=rc.get("bob.bio.face.ijbc.directory"),
        **kwargs,
    ):

        if original_directory is None or not os.path.exists(original_directory):
            raise ValueError(
                "Invalid or non existant `original_directory`: f{original_directory}"
            )

        self._check_protocol(protocol)
        super().__init__(
            name="ijbc",
            protocol=protocol,
            allow_scoring_with_all_biometric_references=False,
            annotation_type="bounding-box",
            fixed_positions=None,
            memory_demanding=True,
        )

        self.image_directory = os.path.join(original_directory, "images")
        self.protocol_directory = os.path.join(original_directory, "protocols")
        self._cached_probes = None
        self._cached_references = None
        self.hash_fn = hash_string

        self._load_metadata(protocol)

        ### For the test4 protocols
        if "test4" in protocol:
            self.allow_scoring_with_all_biometric_references = True

    def _load_metadata(self, protocol):
        # Load CSV files
        if protocol == "test1" or protocol == "test2":
            self.reference_templates = pd.read_csv(
                os.path.join(self.protocol_directory, protocol, "enroll_templates.csv")
            )

            self.probe_templates = pd.read_csv(
                os.path.join(self.protocol_directory, protocol, "verif_templates.csv")
            )

            self.matches = pd.read_csv(
                os.path.join(self.protocol_directory, protocol, "match.csv"),
                names=["ENROLL_TEMPLATE_ID", "VERIF_TEMPLATE_ID"],
            ).astype("str")

            # TODO: temporarily disabling the metadata
            """
            self.metadata = pd.read_csv(
                os.path.join(self.protocol_directory, "ijbc_metadata_with_age.csv"),
                usecols=[
                    "SUBJECT_ID",
                    "FILENAME",
                    "FACE_X",
                    "FACE_Y",
                    "FACE_WIDTH",
                    "FACE_HEIGHT",
                    "SIGHTING_ID",
                    "FACIAL_HAIR",
                    "AGE",
                    "INDOOR_OUTDOOR",
                    "SKINTONE",
                    "GENDER",
                    "YAW",
                    "ROLL",
                ]
                + [f"OCC{i}" for i in range(1, 19)],
            )

            # LEFT JOIN WITH METADATA
            self.probe_templates = pd.merge(
                self.probe_templates,
                self.metadata,
                on=[
                    "SUBJECT_ID",
                    "FILENAME",
                    "FACE_X",
                    "FACE_Y",
                    "FACE_WIDTH",
                    "FACE_HEIGHT",
                ],
                how="left",
            )

            # LEFT JOIN WITH METADATA
            self.reference_templates = pd.merge(
                self.reference_templates,
                self.metadata,
                on=[
                    "SUBJECT_ID",
                    "FILENAME",
                    "FACE_X",
                    "FACE_Y",
                    "FACE_WIDTH",
                    "FACE_HEIGHT",
                ],
                how="left",
            )
            """

        elif "test4" in protocol:
            gallery_file = "gallery_G1.csv" if "G1" in protocol else "gallery_G2.csv"

            self.reference_templates = pd.read_csv(
                os.path.join(self.protocol_directory, "test4", gallery_file)
            )

            self.probe_templates = pd.read_csv(
                os.path.join(self.protocol_directory, "test4", "probes.csv")
            )

            self.matches = None

        else:
            raise ValueError(
                f"Protocol `{protocol}` not supported. We do accept merge requests :-)"
            )

[docs]    def background_model_samples(self):
        return None

[docs]    def probes(self, group="dev"):
        self._check_group(group)
        if self._cached_probes is None:

            logger.info("Loading probes. This operation might take some minutes")

            self._cached_probes = list(
                self.probe_templates.groupby("TEMPLATE_ID").apply(
                    _make_sample_set_from_template_group,
                    image_directory=self.image_directory,
                )
            )

            # Wirering probes with references
            if self.protocol == "test1" or self.protocol == "test2":
                # Link probes to the references they have to be compared with
                # We might make that faster if we manage to write it as a Panda instruction
                grouped_matches = self.matches.groupby("VERIF_TEMPLATE_ID")
                for probe_sampleset in self._cached_probes:
                    probe_sampleset.references = list(
                        grouped_matches.get_group(probe_sampleset.reference_id)[
                            "ENROLL_TEMPLATE_ID"
                        ]
                    )
            elif "test4" in self.protocol:
                references = [s.reference_id for s in self.references()]
                # You compare with all biometric references
                for probe_sampleset in self._cached_probes:
                    probe_sampleset.references = copy.deepcopy(references)
                    pass

            else:
                raise ValueError(f"Invalid protocol: {self.protocol}")

        return self._cached_probes

[docs]    def references(self, group="dev"):
        self._check_group(group)
        if self._cached_references is None:

            logger.info("Loading templates. This operation might take some minutes")

            self._cached_references = list(
                self.reference_templates.groupby("TEMPLATE_ID").apply(
                    _make_sample_set_from_template_group,
                    image_directory=self.image_directory,
                )
            )

        return self._cached_references

[docs]    def all_samples(self, group="dev"):
        self._check_group(group)

        return self.references() + self.probes()

[docs]    def groups(self):
        return ["dev"]

[docs]    def protocols(self):
        return ["test1", "test2", "test4-G1", "test4-G2"]

    def _check_protocol(self, protocol):
        assert protocol in self.protocols(), "Unvalid protocol `{}` not in {}".format(
            protocol, self.protocols()
        )

    def _check_group(self, group):
        assert group in self.groups(), "Unvalid group `{}` not in {}".format(
            group, self.groups()
        )