# Source code for bob.bio.base.algorithm.LDA

#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Manuel Guenther <Manuel.Guenther@idiap.ch>

import bob.io.base
import bob.learn.linear

import numpy
import scipy.spatial

from .Algorithm import Algorithm

import logging
# Module-level logger; every message of this module is emitted under the "bob.bio.base" logger name.
logger = logging.getLogger("bob.bio.base")

class LDA (Algorithm):
  """Computes a linear discriminant analysis (LDA) on the given data, possibly after computing a principal component analysis (PCA).

  This algorithm computes an LDA projection (:py:class:`bob.learn.linear.FisherLDATrainer`) on the given training features, projects the features to Fisher space and computes the distance of two projected features in Fisher space.
  For example, the Fisher faces algorithm as proposed by [ZKC+98]_ can be run with this class.


  Additionally, a PCA projection matrix can be computed beforehand, to reduce the dimensionality of the input vectors.
  In that case, the finally stored projection matrix is the combination of the PCA and LDA projection.

  **Parameters:**

  lda_subspace_dimension : int or ``None``
    If specified, the LDA subspace will be truncated to the given number of dimensions.
    By default (``None``) it is limited to the number of classes in the training set - 1.

  pca_subspace_dimension : int or float or ``None``
    If specified, a combined PCA + LDA projection matrix will be computed.
    If specified as ``int``, defines the number of eigenvectors used in the PCA projection matrix.
    If specified as ``float`` (between 0 and 1), the number of eigenvectors is calculated such that the given percentage of variance is kept.
    A :py:class:`ValueError` is raised when an ``int`` value is smaller than ``lda_subspace_dimension``.

  use_pinv : bool
    Use the Pseudo-inverse to compute the LDA projection matrix?
    Sometimes, the training fails because it is impossible to invert the covariance matrix.
    In these cases, you might want to set ``use_pinv`` to ``True``, which works around this problem, but slows down the processing noticeably.

  distance_function : function
    A function taking two parameters and returning a float.
    If ``uses_variances`` is set to ``True``, the function is provided with a third parameter, which is the vector of variances (aka. eigenvalues).

  is_distance_function : bool
    Set this flag to ``False`` if the given ``distance_function`` computes a similarity value (i.e., higher values are better).

  uses_variances : bool
    If set to ``True``, the ``distance_function`` is provided with a third argument, which is the vector of variances (aka. eigenvalues).

  kwargs : ``key=value`` pairs
    A list of keyword arguments directly passed to the :py:class:`Algorithm` base class constructor.
  """

  def __init__(
      self,
      lda_subspace_dimension = None, # optional number of LDA dimensions to keep
      pca_subspace_dimension = None, # optional PCA truncation applied before the LDA; either an int or a float
      use_pinv = False,
      distance_function = scipy.spatial.distance.euclidean,
      is_distance_function = True,
      uses_variances = False,
      **kwargs  # everything else is handed to the base class untouched
  ):
    """Stores the configuration and registers it with the :py:class:`Algorithm` base class.

    Raises a :py:class:`ValueError` when an integral ``pca_subspace_dimension`` is smaller than a given ``lda_subspace_dimension``.
    """

    # The base class is told that this algorithm projects features and that it wants its training features grouped by client.
    # The remaining arguments are handed over as well; the distance function is passed as its string representation.
    super(LDA, self).__init__(
        performs_projection = True,
        split_training_features_by_client = True,

        lda_subspace_dimension = lda_subspace_dimension,
        pca_subspace_dimension = pca_subspace_dimension,
        use_pinv = use_pinv,
        distance_function = str(distance_function),
        is_distance_function = is_distance_function,
        uses_variances = uses_variances,

        **kwargs
    )

    # subspace sizes; an integral PCA size must not be smaller than the requested LDA size
    self.pca_subspace = pca_subspace_dimension
    self.lda_subspace = lda_subspace_dimension
    pca_is_integral = isinstance(self.pca_subspace, int)
    if pca_is_integral and self.lda_subspace and self.pca_subspace < self.lda_subspace:
      raise ValueError("The LDA subspace is larger than the PCA subspace size. This won't work properly. Please check your setup!")

    # training options and scoring configuration
    self.use_pinv = use_pinv
    self.machine = None
    self.distance_function = distance_function
    self.uses_variances = uses_variances

    # distances are negated in score() so that higher values always mean a better match
    if is_distance_function:
      self.factor = -1
    else:
      self.factor = 1.


  def _check_feature(self, feature, projected=False):
    """Checks that the features are appropriate."""
    if not isinstance(feature, numpy.ndarray) or feature.ndim != 1 or feature.dtype != numpy.float64:
      raise ValueError("The given feature is not appropriate")
    index = 1 if projected else 0
    if self.machine is not None and feature.shape[0] != self.machine.shape[index]:
      raise ValueError("The given feature is expected to have %d elements, but it has %d" % (self.machine.shape[index], feature.shape[0]))


  def _arrange_data(self, training_files):
    """Arranges the data to train the LDA projection matrix."""
    data = []
    for client_files in training_files:
      # at least two files per client are required!
      if len(client_files) < 2:
        logger.warn("Skipping one client since the number of client files is only %d", len(client_files))
        continue
      data.append(numpy.vstack(feature.flatten() for feature in client_files))

    # Returns the list of lists of arrays
    return data


  def _train_pca(self, training_set):
    """Trains and returns a LinearMachine that is trained using PCA.

    The PCA is trained on all features of all clients in ``training_set``.
    As a side effect, ``self.pca_subspace`` is updated: a ``float`` value is replaced by the integral number of
    dimensions derived from the eigenvalues, and the value is increased to ``self.lda_subspace + 1`` if it would
    otherwise not exceed the requested LDA subspace size.

    **Parameters:**

    training_set : [[1D :py:class:`numpy.ndarray`]]
      The training features, one sub-list per client.

    **Returns:**

    machine : :py:class:`bob.learn.linear.Machine`
      The PCA projection machine, resized to ``self.pca_subspace`` output dimensions.
    """
    # numpy.vstack requires a real sequence; passing a generator is deprecated since NumPy 1.16 and rejected by recent versions
    data = numpy.vstack([feature for client in training_set for feature in client])

    logger.info("  -> Training Linear Machine using PCA")
    trainer = bob.learn.linear.PCATrainer()
    machine, eigen_values = trainer.train(data)

    if isinstance(self.pca_subspace, float):
      # translate the requested fraction of variance into a number of eigenvectors:
      # take the first index whose cumulated variance exceeds the fraction, or the last index if none does
      # NOTE(review): keeping ``index`` (not ``index + 1``) components retains slightly less than the requested variance -- kept as is to preserve existing behavior; confirm intent
      cumulated = numpy.cumsum(eigen_values) / numpy.sum(eigen_values)
      for index, fraction in enumerate(cumulated):
        if fraction > self.pca_subspace:
          break
      self.pca_subspace = index

    if self.lda_subspace is not None and self.pca_subspace <= self.lda_subspace:
      logger.warning("  ... Extending the PCA subspace dimension from %d to %d", self.pca_subspace, self.lda_subspace + 1)
      self.pca_subspace = self.lda_subspace + 1
    else:
      logger.info("  ... Limiting PCA subspace to %d dimensions", self.pca_subspace)

    # limit number of pcs
    machine.resize(machine.shape[0], self.pca_subspace)
    return machine


  def _perform_pca(self, machine, training_set):
    """Perform PCA on data of the training set"""
    return [numpy.vstack(machine(feature) for feature in client_features) for client_features in training_set]


[docs]  def train_projector(self, training_features, projector_file):
    """Generates the LDA or PCA+LDA projection matrix from the given features (that are sorted by identity).

    **Parameters:**

    training_features : [[1D :py:class:`numpy.ndarray`]]
      A list of lists of 1D training arrays (vectors) to train the LDA projection matrix with.
      Each sub-list contains the features of one client.

    projector_file : str
      A writable file, into which the LDA or PCA+LDA projection matrix (as a :py:class:`bob.learn.linear.Machine`) and the eigenvalues will be written.
    """
    # check data
    [self._check_feature(feature) for client_features in training_features for feature in client_features]

    # arrange LDA training data
    data = self._arrange_data(training_features)

    # train PCA of wanted
    if self.pca_subspace:
      # train on all training features
      pca_machine = self._train_pca(training_features)
      # project only the features that are used for training
      logger.info("  -> Projecting training data to PCA subspace")
      data = self._perform_pca(pca_machine, data)

    logger.info("  -> Training Linear Machine using LDA")
    trainer = bob.learn.linear.FisherLDATrainer(use_pinv = self.use_pinv, strip_to_rank = (self.lda_subspace is None))
    self.machine, self.variances = trainer.train(data)
    if self.lda_subspace is not None:
      self.machine.resize(self.machine.shape[0], self.lda_subspace)
      self.variances = self.variances.copy()
      self.variances = numpy.resize(self.variances, (self.lda_subspace))

    if self.pca_subspace is not None:
      # compute combined PCA/LDA projection matrix
      combined_matrix = numpy.dot(pca_machine.weights, self.machine.weights)
      # set new weight matrix (and new mean vector) of novel machine
      self.machine = bob.learn.linear.Machine(combined_matrix)
      self.machine.input_subtract = pca_machine.input_subtract

    hdf5 = bob.io.base.HDF5File(projector_file, "w")
    hdf5.set("Eigenvalues", self.variances)
    hdf5.create_group("/Machine")
    hdf5.cd("/Machine")
    self.machine.save(hdf5)


[docs]  def load_projector(self, projector_file):
    """Reads the projection matrix and the eigenvalues from file.

    **Parameters:**

    projector_file : str
      An existing file, from which the PCA or PCA+LDA projection matrix and the eigenvalues are read.
    """
    # read LDA projector
    hdf5 = bob.io.base.HDF5File(projector_file)
    self.variances = hdf5.read("Eigenvalues")
    hdf5.cd("/Machine")
    self.machine = bob.learn.linear.Machine(hdf5)


[docs]  def project(self, feature):
    """project(feature) -> projected

    Projects the given feature into Fisher space.

    **Parameters:**

    feature : 1D :py:class:`numpy.ndarray`
      The 1D feature to be projected.

    **Returns:**

    projected : 1D :py:class:`numpy.ndarray`
      The ``feature`` projected into Fisher space.
    """
    self._check_feature(feature)
    # Projects the data
    return self.machine(feature)


[docs]  def enroll(self, enroll_features):
    """enroll(enroll_features) -> model

    Enrolls the model by storing all given input vectors.

    **Parameters:**

    enroll_features : [1D :py:class:`numpy.ndarray`]
      The list of projected features to enroll the model from.

    **Returns:**

    model : 2D :py:class:`numpy.ndarray`
      The enrolled model.
    """
    assert len(enroll_features)
    [self._check_feature(feature, True) for feature in enroll_features]
    # just store all the features
    return numpy.vstack(enroll_features)


[docs]  def score(self, model, probe):
    """score(model, probe) -> float

    Computes the distance of the model to the probe using the distance function specified in the constructor.

    **Parameters:**

    model : 2D :py:class:`numpy.ndarray`
      The model storing all enrollment features.

    probe : 1D :py:class:`numpy.ndarray`
      The probe feature vector in Fisher space.

    **Returns:**

    score : float
      A similarity value between ``model`` and ``probe``
    """
    self._check_feature(probe, True)
    # return the negative distance (as a similarity measure)
    if len(model.shape) == 2:
      # we have multiple models, so we use the multiple model scoring
      return self.score_for_multiple_models(model, probe)
    elif self.uses_variances:
      # single model, single probe (multiple probes have already been handled)
      return self.factor * self.distance_function(model, probe, self.variances)
    else:
      # single model, single probe (multiple probes have already been handled)
      return self.factor * self.distance_function(model, probe)

  # re-define unused functions, just so that they do not get documented
[docs]  def train_enroller(*args,**kwargs): raise NotImplementedError()
[docs]  def load_enroller(*args,**kwargs): pass