Source code for bob.bio.face.database.rfw

import copy
import logging
import os

from functools import partial

import numpy as np

import bob.io.base

from bob.bio.base.pipelines.abstract_classes import Database
from bob.extension import rc
from bob.extension.download import get_file
from bob.pipelines.sample import DelayedSample, SampleSet

logger = logging.getLogger("bob.bio.face")


class RFWDatabase(Database):
    """
    Dataset interface for the Racial Faces in the Wild (RFW) dataset.

    RFW is a subset of the MS-Celeb-1M dataset, composed of 44,332 images from 11,416 identities.
    There are four "race" labels in this dataset (`African`, `Asian`, `Caucasian`, and `Indian`).
    Furthermore, with the help of https://query.wikidata.org/ we've added information about gender and
    country of birth.

    We offer two evaluation protocols.
    The first one, called "original", is the protocol from the original publication. It contains ~24k comparisons in total.
    Note that this protocol has a limitation: it considers only comparisons between pairs of images of the same
    "race".
    To close this gap, we've created a protocol called "idiap" that extends the original one with cross-race impostor
    (non-mated) comparisons, which is closer to a real-world scenario.

    .. warning::
        The following identities are associated with two races in the original dataset:

        - m.023915
        - m.0z08d8y
        - m.0bk56n
        - m.04f4wpb
        - m.0gc2xf9
        - m.08dyjb
        - m.05y2fd
        - m.0gbz836
        - m.01pw5d
        - m.0cm83zb
        - m.02qmpkk
        - m.05xpnv


    For more information, check:

    .. code-block:: latex

        @inproceedings{wang2019racial,
        title={Racial faces in the wild: Reducing racial bias by information maximization adaptation network},
        author={Wang, Mei and Deng, Weihong and Hu, Jiani and Tao, Xunqiang and Huang, Yaohai},
        booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
        pages={692--702},
        year={2019}
        }
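
    Example usage (a minimal sketch; it assumes that the ``bob.bio.face.rfw.directory``
    configuration variable, or the ``original_directory`` argument, points to your
    local copy of the dataset):

    .. code-block:: python

        from bob.bio.face.database.rfw import RFWDatabase

        database = RFWDatabase(protocol="original")
        references = database.references()  # enrollment SampleSets
        probes = database.probes()  # probe SampleSets, each listing its references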

    """

    def __init__(
        self,
        protocol,
        original_directory=rc.get("bob.bio.face.rfw.directory"),
        **kwargs,
    ):

        if original_directory is None or not os.path.exists(original_directory):
            raise ValueError(
                f"Invalid or non-existent `original_directory`: {original_directory}"
            )

        self._check_protocol(protocol)
        self._races = ["African", "Asian", "Caucasian", "Indian"]
        self.original_directory = original_directory
        self._default_extension = ".jpg"

        super().__init__(
            name="rfw",
            protocol=protocol,
            score_all_vs_all=False,
            annotation_type="eyes-center",
            fixed_positions=None,
            memory_demanding=False,
        )

        self._pairs = dict()
        self._first_reference_of_subject = dict()  # Used with the idiap protocol
        self._inverted_pairs = dict()
        self._id_race = dict()  # ID -- > RACE
        self._race_ids = dict()  # RACE --> ID
        self._landmarks = dict()
        self._cached_biometric_references = None
        self._cached_probes = None
        self._cached_zprobes = None
        self._cached_treferences = None
        self._discarded_subjects = []  # Some subjects were labeled with both races
        self._load_metadata(target_set="test")
        self._demographics = None
        self._demographics = self._get_demographics_dict()

        # Setting the seed for the IDIAP PROTOCOL,
        # so we have a consistent set of probes
        self._idiap_protocol_seed = 652

        # Number of samples used to Z-Norm and T-Norm (per race)
        self._nzprobes = 25
        self._ntreferences = 25
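
        # Illustrative sketch (hypothetical identifiers) of the bookkeeping
        # built by `_load_metadata`, to document the dictionary shapes:
        #   self._id_race:  {"m.0xxxx": "African", ...}
        #   self._race_ids: {"African": ["m.0xxxx", ...], ...}
        #   self._pairs:    {"African/m.0xxxx/m.0xxxx_0001":
        #                        ["African/m.0xxxx/m.0xxxx_0002", ...], ...}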

    @staticmethod
    def urls():
        return [
            "https://www.idiap.ch/software/bob/databases/latest/msceleb_wikidata_demographics.csv.tar.gz",
            "http://www.idiap.ch/software/bob/databases/latest/msceleb_wikidata_demographics.csv.tar.gz",
        ]

    def _get_demographics_dict(self):
        """
        Get the dictionary with GENDER and COUNTRY of birth.
        Data obtained from Wikidata (`https://query.wikidata.org/`) using the following SPARQL query

        '''
        SELECT ?item ?itemLabel ?genderLabel ?countryLabel WHERE {
          ?item wdt:P31 wd:Q5.
          ?item ?label "{MY_NAME_HERE}"@en .
          optional{ ?item wdt:P21 ?gender.}
          optional{ ?item wdt:P27 ?country.}
          SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
        }
        '''
        """
        urls = RFWDatabase.urls()
        filename = get_file(
            "msceleb_wikidata_demographics.csv.tar.gz",
            urls,
            file_hash="8eb0e3c93647dfa0c13fade5db96d73a",
            extract=True,
        )[:-7]

        if self._demographics is None:
            self._demographics = dict()
            with open(filename) as f:
                for line in f.readlines():
                    line = line.split(",")
                    self._demographics[line[0]] = [
                        line[2],
                        line[3].rstrip("\n"),
                    ]

        return self._demographics

    def _get_subject_from_key(self, key):
        # Keys follow the pattern `m.0xxxx_000N`; dropping the last five
        # characters yields the subject identifier `m.0xxxx`
        return key[:-5]

    def _load_metadata(self, target_set="test"):
        for race in self._races:
            pair_file = os.path.join(
                self.original_directory,
                target_set,
                "txts",
                race,
                f"{race}_pairs.txt",
            )

            for line in open(pair_file).readlines():
                line = line.split("\t")
                line[-1] = line[-1].rstrip("\n")

                key = f"{line[0]}_000{line[1]}"
                subject_id = self._get_subject_from_key(key)
                dict_key = f"{race}/{subject_id}/{key}"

                if subject_id not in self._id_race:
                    self._id_race[subject_id] = race
                else:
                    if (
                        self._id_race[subject_id] != race
                        and subject_id not in self._discarded_subjects
                    ):
                        logger.warning(
                            f"{subject_id} was already labeled as {self._id_race[subject_id]}, "
                            f"so relabeling it as {race} is inconsistent. "
                            f"This seems to be a problem with the RFW dataset, so we are removing all samples linking {subject_id} to {race}"
                        )
                        self._discarded_subjects.append(subject_id)
                        continue

                # Positive pairs have 3 fields; negative pairs have 4
                if len(line) == 3:
                    k_value = f"{line[0]}_000{line[2]}"
                    dict_value = f"{race}/{self._get_subject_from_key(k_value)}/{k_value}"
                else:
                    k_value = f"{line[2]}_000{line[3]}"
                    dict_value = f"{race}/{self._get_subject_from_key(k_value)}/{k_value}"

                if dict_key not in self._pairs:
                    self._pairs[dict_key] = []
                self._pairs[dict_key].append(dict_value)

        # Picking the first reference of each subject (used by the idiap protocol)
        if self.protocol == "idiap":
            for p in self._pairs:
                _, subject_id, reference_id = p.split("/")
                if subject_id in self._first_reference_of_subject:
                    continue
                self._first_reference_of_subject[subject_id] = reference_id

        # Preparing the probes
        self._inverted_pairs = self._invert_dict(self._pairs)
        self._race_ids = self._invert_dict(self._id_race)

    def _invert_dict(self, dict_pairs):
        inverted_pairs = dict()

        for k in dict_pairs:
            if isinstance(dict_pairs[k], list):
                for v in dict_pairs[k]:
                    if v not in inverted_pairs:
                        inverted_pairs[v] = []
                    inverted_pairs[v].append(k)
            else:
                v = dict_pairs[k]
                if v not in inverted_pairs:
                    inverted_pairs[v] = []
                inverted_pairs[v].append(k)
        return inverted_pairs
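
    # Illustration (not executed): `_invert_dict` maps each value back to the
    # keys that reference it, handling both list- and scalar-valued dicts:
    #   {"a": ["x", "y"], "b": "x"}  ->  {"x": ["a", "b"], "y": ["a"]}
    #   {"id1": "African"}           ->  {"African": ["id1"]}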

    def background_model_samples(self):
        return []

    def _get_zt_samples(self, seed):
        cache = []

        # Setting the seed for the IDIAP PROTOCOL,
        # so we have a consistent set of z-probes and t-references
        np.random.seed(seed)

        for race in self._races:
            data_dir = os.path.join(
                self.original_directory, "train", "data", race
            )
            files = os.listdir(data_dir)
            # SHUFFLING
            np.random.shuffle(files)
            files = files[0 : self._nzprobes]

            # The original RFW data is not consistently organized:
            # train data from Caucasians is stored differently
            if race == "Caucasian":
                for f in files:
                    reference_id = os.listdir(os.path.join(data_dir, f))[0]
                    key = f"{race}/{f}/{reference_id[:-4]}"
                    cache.append(
                        self._make_sampleset(
                            key, target_set="train", get_demographic=False
                        )
                    )
            else:
                for f in files:
                    key = f"{race}/{race}/{f[:-4]}"
                    cache.append(
                        self._make_sampleset(
                            key, target_set="train", get_demographic=False
                        )
                    )

        return cache

    def zprobes(self, group="dev", proportion=1.0):
        if self._cached_zprobes is None:
            self._cached_zprobes = self._get_zt_samples(
                self._idiap_protocol_seed + 1
            )
            references = list(
                set([s.reference_id for s in self.references(group=group)])
            )
            for p in self._cached_zprobes:
                p.references = copy.deepcopy(references)

        return self._cached_zprobes

    def treferences(self, group="dev", proportion=1.0):
        if self._cached_treferences is None:
            self._cached_treferences = self._get_zt_samples(
                self._idiap_protocol_seed + 2
            )

        return self._cached_treferences

    def probes(self, group="dev"):
        self._check_group(group)
        if self._cached_probes is None:

            # Setting the seed for the IDIAP PROTOCOL,
            # so we have a consistent set of probes
            np.random.seed(self._idiap_protocol_seed)

            self._cached_probes = []
            for key in self._inverted_pairs:
                sset = self._make_sampleset(key)
                sset.references = [
                    k.split("/")[-1] for k in self._inverted_pairs[key]
                ]

                # For the idiap protocol we
                # extend the list of comparisons
                if self.protocol == "idiap":
                    # Picking one reference per race
                    extra_references = []
                    for k in self._race_ids:
                        # Discard samples from the same race
                        if k == sset.race:
                            continue

                        index = np.random.randint(len(self._race_ids[k]))
                        random_subject_id = self._race_ids[k][index]

                        # Take the first reference id of this identity
                        extra_references.append(
                            self._first_reference_of_subject[random_subject_id]
                        )

                    assert len(extra_references) == 3
                    sset.references += extra_references

                self._cached_probes.append(sset)
        return self._cached_probes
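
    # Note on the "idiap" protocol (illustrative): each probe keeps its original
    # same-race references and gains one randomly drawn reference from each of
    # the other three races, i.e., exactly 3 extra cross-race (non-mated)
    # comparisons per probe.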

    def _fetch_landmarks(self, filename, key):
        if key not in self._landmarks:
            with open(filename) as f:
                for line in f.readlines():
                    line = line.split("\t")
                    # pattern 'm.0c7mh2_0003.jpg'[:-4]
                    k = line[0].split("/")[-1][:-4]
                    self._landmarks[k] = dict()
                    self._landmarks[k]["reye"] = (
                        float(line[3]),
                        float(line[2]),
                    )
                    self._landmarks[k]["leye"] = (
                        float(line[5]),
                        float(line[4]),
                    )

        return self._landmarks[key]

    def _make_sampleset(self, item, target_set="test", get_demographic=True):
        race, subject_id, reference_id = item.split("/")

        # The original RFW data is not consistently organized:
        # test and train data are stored differently
        key = f"{race}/{subject_id}/{reference_id}"
        path = (
            os.path.join(
                self.original_directory,
                f"{target_set}/data/{race}",
                subject_id,
                reference_id + self._default_extension,
            )
            if (target_set == "test" or race == "Caucasian")
            else os.path.join(
                self.original_directory,
                f"{target_set}/data/{race}",
                reference_id + self._default_extension,
            )
        )

        annotations = (
            self._fetch_landmarks(
                os.path.join(
                    self.original_directory, "erratum1", "Caucasian_lmk.txt"
                ),
                reference_id,
            )
            if (target_set == "train" and race == "Caucasian")
            else self._fetch_landmarks(
                os.path.join(
                    self.original_directory,
                    f"{target_set}/txts/{race}/{race}_lmk.txt",
                ),
                reference_id,
            )
        )

        samples = [
            DelayedSample(
                partial(
                    bob.io.base.load,
                    path,
                ),
                key=key,
                annotations=annotations,
                reference_id=reference_id,
                subject_id=subject_id,
            )
        ]

        if get_demographic:
            gender = self._demographics[subject_id][0]
            country = self._demographics[subject_id][1]
            return SampleSet(
                samples,
                key=key,
                reference_id=reference_id,
                subject_id=subject_id,
                race=race,
                gender=gender,
                country=country,
            )
        else:
            return SampleSet(
                samples,
                key=key,
                reference_id=reference_id,
                subject_id=subject_id,
                race=race,
            )

    def references(self, group="dev"):
        self._check_group(group)
        if self._cached_biometric_references is None:
            self._cached_biometric_references = []
            for key in self._pairs:
                self._cached_biometric_references.append(
                    self._make_sampleset(key)
                )

        return self._cached_biometric_references

    def all_samples(self, group="dev"):
        self._check_group(group)
        return self.references() + self.probes()

    def groups(self):
        return ["dev"]

    def protocols(self):
        return ["original", "idiap"]

    def _check_protocol(self, protocol):
        assert (
            protocol in self.protocols()
        ), "Invalid protocol `{}` not in {}".format(protocol, self.protocols())

    def _check_group(self, group):
        assert group in self.groups(), "Invalid group `{}` not in {}".format(
            group, self.groups()
        )