#!/usr/bin/env python
# vim: set fileencoding=utf-8 :

"""
Implementation of the PipelineSimple using Dask (see :ref:`bob.bio.base.struct_bio_rec_sys`)

This file contains simple processing blocks meant to be used
for bob.bio experiments
"""

import logging
import tempfile

from sklearn.base import BaseEstimator
from sklearn.pipeline import Pipeline

from bob.bio.base.pipelines.abstract_classes import BioAlgorithm
from bob.pipelines import SampleWrapper, is_instance_nested, wrap

from .score_writers import FourColumnsScoreWriter

logger = logging.getLogger(__name__)


class PipelineSimple:
    """
    The simplest possible pipeline

    This is the backbone of most biometric recognition systems.
    It implements the following four subpipelines:

     - :py:meth:`PipelineSimple.train_background_model`: Initializes or trains your transformer.
        It will run :py:meth:`sklearn.base.BaseEstimator.fit`

     - :py:meth:`PipelineSimple.enroll_templates`: Creates enrollment templates.
        It will run :py:meth:`sklearn.base.BaseEstimator.transform` followed by a sequence of
        :py:meth:`bob.bio.base.pipelines.abstract_classes.BioAlgorithm.create_templates`

     - :py:meth:`PipelineSimple.probe_templates`: Creates probe templates.
        It will run :py:meth:`sklearn.base.BaseEstimator.transform` followed by a sequence of
        :py:meth:`bob.bio.base.pipelines.abstract_classes.BioAlgorithm.create_templates`

     - :py:meth:`PipelineSimple.compute_scores`: Computes scores.
        It will run :py:meth:`bob.bio.base.pipelines.abstract_classes.BioAlgorithm.compare`


    Example
    -------
       >>> from sklearn.preprocessing import FunctionTransformer
       >>> from sklearn.pipeline import make_pipeline
       >>> from bob.bio.base.algorithm import Distance
       >>> from bob.bio.base.pipelines import PipelineSimple
       >>> from bob.pipelines import wrap
       >>> import numpy
       >>> linearize = lambda samples: [numpy.reshape(x, (-1,)) for x in samples]
       >>> transformer = wrap(["sample"], FunctionTransformer(linearize))
       >>> transformer_pipeline = make_pipeline(transformer)
       >>> biometric_algorithm = Distance()
       >>> pipeline = PipelineSimple(transformer_pipeline, biometric_algorithm)
       >>> pipeline(samples_for_training_background_model, samplesets_for_enroll, samplesets_for_scoring)  # doctest: +SKIP


    To run this pipeline using Dask, use the function
    :py:func:`dask_bio_pipeline`.

    Example
    -------
      >>> from bob.bio.base.pipelines import dask_bio_pipeline
      >>> pipeline = PipelineSimple(transformer_pipeline, biometric_algorithm)
      >>> pipeline = dask_bio_pipeline(pipeline)
      >>> pipeline(samples_for_training_background_model, samplesets_for_enroll, samplesets_for_scoring).compute()  # doctest: +SKIP


    Parameters
    ----------

    transformer: :py:class:`sklearn.pipeline.Pipeline` or :py:class:`sklearn.base.BaseEstimator`
        Transformer that will preprocess your data

    biometric_algorithm: :py:class:`bob.bio.base.pipelines.abstract_classes.BioAlgorithm`
        Biometric algorithm object that implements the `enroll` and `score`
        methods

    score_writer: :any:`bob.bio.base.pipelines.ScoreWriter`
        Format to write scores. Defaults to
        :any:`bob.bio.base.pipelines.FourColumnsScoreWriter`
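
    Example
    -------
    A custom score writer can be passed instead of the default one (a minimal
    sketch; the output path ``./scores`` is illustrative):

       >>> from bob.bio.base.pipelines import FourColumnsScoreWriter
       >>> score_writer = FourColumnsScoreWriter("./scores")  # doctest: +SKIP
       >>> pipeline = PipelineSimple(transformer_pipeline, biometric_algorithm, score_writer)  # doctest: +SKIP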

    """

    def __init__(
        self,
        transformer: Pipeline,
        biometric_algorithm: BioAlgorithm,
        score_writer=None,
    ):
        self.transformer = transformer
        self.biometric_algorithm = biometric_algorithm
        self.score_writer = score_writer
        if self.score_writer is None:
            # keep a reference so the temporary directory outlives __init__
            self._tempdir = tempfile.TemporaryDirectory()
            self.score_writer = FourColumnsScoreWriter(self._tempdir.name)

        check_valid_pipeline(self)

    def __call__(
        self,
        background_model_samples,
        biometric_reference_samples,
        probe_samples,
        score_all_vs_all=True,
        return_templates=False,
    ):
        logger.info(" >> PipelineSimple: Training background model")
        self.train_background_model(background_model_samples)

        logger.info(" >> PipelineSimple: Creating enroll templates")
        enroll_templates = self.enroll_templates(biometric_reference_samples)

        logger.info(" >> PipelineSimple: Creating probe templates")
        probe_templates = self.probe_templates(probe_samples)

        logger.info(" >> PipelineSimple: Computing scores")
        scores = self.compute_scores(
            probe_templates,
            enroll_templates,
            score_all_vs_all,
        )

        if return_templates:
            return scores, enroll_templates, probe_templates
        else:
            return scores

    def train_background_model(self, background_model_samples):
        # background_model_samples is a list of Samples
        # We might have algorithms that have no data for training
        if len(background_model_samples) > 0:
            self.transformer.fit(background_model_samples)
        else:
            logger.warning(
                "There's no data to train the background model. "
                "For the rest of the execution it will be assumed that the "
                "pipeline does not require fit."
            )
        return self.transformer
    def enroll_templates(self, biometric_reference_samples):
        biometric_reference_features = self.transformer.transform(
            biometric_reference_samples
        )
        enroll_templates = (
            self.biometric_algorithm.create_templates_from_samplesets(
                biometric_reference_features, enroll=True
            )
        )  # a list of Samples
        return enroll_templates

    def probe_templates(self, probe_samples):
        probe_features = self.transformer.transform(probe_samples)
        probe_templates = (
            self.biometric_algorithm.create_templates_from_samplesets(
                probe_features, enroll=False
            )
        )  # a list of Samples
        return probe_templates

    def compute_scores(
        self,
        probe_templates,
        enroll_templates,
        score_all_vs_all,
    ):
        return self.biometric_algorithm.score_sample_templates(
            probe_templates, enroll_templates, score_all_vs_all
        )

    def write_scores(self, scores):
        if self.score_writer is None:
            raise ValueError("No score writer defined in the pipeline")
        return self.score_writer.write(scores)

    def post_process(self, score_paths, filename):
        if self.score_writer is None:
            raise ValueError("No score writer defined in the pipeline")
        return self.score_writer.post_process(score_paths, filename)


def check_valid_pipeline(pipeline_simple):
    """
    Applies some sanity checks to the PipelineSimple
    """

    # CHECKING THE TRANSFORMER
    # Checking if it's a scikit-learn Pipeline or an estimator
    if isinstance(pipeline_simple.transformer, Pipeline):
        # Checking if all steps are wrapped as samples; if not, wrap them
        for idx, (name, p) in enumerate(pipeline_simple.transformer.steps):
            if not is_instance_nested(p, "estimator", SampleWrapper):
                pipeline_simple.transformer.steps[idx] = (
                    name,
                    wrap(["sample"], p),
                )

    # In this case it can be a simple estimator:
    # checking if it's sample-wrapped and, if not, doing it
    elif is_instance_nested(
        pipeline_simple.transformer, "estimator", BaseEstimator
    ):
        if not is_instance_nested(
            pipeline_simple.transformer, "estimator", SampleWrapper
        ):
            pipeline_simple.transformer = wrap(
                ["sample"], pipeline_simple.transformer
            )
    else:
        raise ValueError(
            "pipeline_simple.transformer should be an instance of either "
            "`sklearn.pipeline.Pipeline` or `sklearn.base.BaseEstimator`, "
            f"not {pipeline_simple.transformer}"
        )

    # Checking the biometric algorithm
    if not isinstance(pipeline_simple.biometric_algorithm, BioAlgorithm):
        raise ValueError(
            "pipeline_simple.biometric_algorithm should be an instance of "
            f"`BioAlgorithm`, not {pipeline_simple.biometric_algorithm}"
        )

    return True
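

# A minimal, hypothetical usage sketch (comments only, not executed here).
# Because check_valid_pipeline sample-wraps bare estimators, a plain
# scikit-learn estimator can be given directly as the transformer:
#
#   >>> from sklearn.preprocessing import FunctionTransformer
#   >>> from bob.bio.base.algorithm import Distance
#   >>> pipeline = PipelineSimple(FunctionTransformer(), Distance())  # doctest: +SKIP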