# Source code for bob.bio.base.algorithm.PCA

```
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Manuel Guenther <Manuel.Guenther@idiap.ch>
import bob.learn.linear
import bob.io.base
import numpy
import scipy.spatial
from .Algorithm import Algorithm
import logging
logger = logging.getLogger("bob.bio.base")
class PCA(Algorithm):
    """Performs a principal component analysis (PCA) on the given data.

    This algorithm computes a PCA projection (:py:class:`bob.learn.linear.PCATrainer`) on the given training features, projects the features to eigenspace and computes the distance of two projected features in eigenspace.
    For example, the eigenface algorithm as proposed by [TP91]_ can be run with this class.

    **Parameters:**

    subspace_dimension : int or float
      If specified as ``int``, defines the number of eigenvectors used in the PCA projection matrix.
      If specified as ``float`` (between 0 and 1), the smallest number of eigenvectors is used whose cumulated variance exceeds the given percentage of the total variance.

    distance_function : function
      A function taking two parameters and returning a float.
      If ``uses_variances`` is set to ``True``, the function is provided with a third parameter, which is the vector of variances (aka. eigenvalues).

    is_distance_function : bool
      Set this flag to ``False`` if the given ``distance_function`` computes a similarity value (i.e., higher values are better).

    uses_variances : bool
      If set to ``True``, the ``distance_function`` is provided with a third argument, which is the vector of variances (aka. eigenvalues).

    kwargs : ``key=value`` pairs
      A list of keyword arguments directly passed to the :py:class:`Algorithm` base class constructor.
    """

    def __init__(
        self,
        subspace_dimension,  # if int, number of subspace dimensions; if float, percentage of variance to keep
        distance_function=scipy.spatial.distance.euclidean,
        is_distance_function=True,
        uses_variances=False,
        **kwargs  # parameters directly sent to the base class
    ):
        # call base class constructor and register that the algorithm performs a projection;
        # the distance function is handed over in its string form
        super(PCA, self).__init__(
            performs_projection=True,
            subspace_dimension=subspace_dimension,
            distance_function=str(distance_function),
            is_distance_function=is_distance_function,
            uses_variances=uses_variances,
            **kwargs
        )

        self.subspace_dim = subspace_dimension
        # projection machine and eigenvalues are only available after
        # train_projector() or load_projector() has been called
        self.machine = None
        self.variances = None
        self.distance_function = distance_function
        # distances are negated, so that score() always returns "higher is better" values
        self.factor = -1. if is_distance_function else 1.
        self.uses_variances = uses_variances

    def _check_feature(self, feature, projected=False):
        """Checks that the given feature is a 1D ``float64`` array of the expected length.

        **Parameters:**

        feature : object
          The feature to check.

        projected : bool
          If ``True``, the length is compared with the output dimension of the machine, otherwise with its input dimension.
          The length is only checked when a machine is available.

        **Raises:**

        ValueError
          If the feature has the wrong type, dimensionality, data type or length.
        """
        if not isinstance(feature, numpy.ndarray) or feature.ndim != 1 or feature.dtype != numpy.float64:
            raise ValueError("The given feature is not appropriate")
        # machine.shape is (input dimension, output dimension)
        index = 1 if projected else 0
        if self.machine is not None and feature.shape[0] != self.machine.shape[index]:
            raise ValueError("The given feature is expected to have %d elements, but it has %d" % (self.machine.shape[index], feature.shape[0]))

    def train_projector(self, training_features, projector_file):
        """Trains the PCA projection matrix and writes it into the given projector_file.

        **Parameters:**

        training_features : [1D :py:class:`numpy.ndarray`]
          A list of 1D training arrays (vectors) to train the PCA projection matrix with.

        projector_file : str
          A writable file, into which the PCA projection matrix (as a :py:class:`bob.learn.linear.Machine`) and the eigenvalues will be written.
        """
        # Assure that all data are 1D
        for feature in training_features:
            self._check_feature(feature)

        # Initializes the data
        data = numpy.vstack(training_features)
        logger.info(" -> Training LinearMachine using PCA")
        trainer = bob.learn.linear.PCATrainer()
        self.machine, self.variances = trainer.train(data)
        # For re-shaping, we need to copy...
        self.variances = self.variances.copy()

        # compute variance percentage, if desired
        if isinstance(self.subspace_dim, float):
            cumulated = numpy.cumsum(self.variances) / numpy.sum(self.variances)
            # index of the first component whose cumulated variance exceeds the threshold;
            # that component has to be kept as well, hence the number of components is index + 1
            kept = int(numpy.searchsorted(cumulated, self.subspace_dim, side="right")) + 1
            # when the threshold is never exceeded (e.g., 1.0), all components are kept
            self.subspace_dim = min(kept, len(cumulated))
            logger.info(" ... Keeping %d PCA dimensions", self.subspace_dim)

        # re-shape machine
        self.machine.resize(self.machine.shape[0], self.subspace_dim)
        self.variances = numpy.resize(self.variances, (self.subspace_dim))

        # eigenvalues are stored at top level, the machine in its own group
        f = bob.io.base.HDF5File(projector_file, "w")
        f.set("Eigenvalues", self.variances)
        f.create_group("Machine")
        f.cd("/Machine")
        self.machine.save(f)

    def load_projector(self, projector_file):
        """Reads the PCA projection matrix and the eigenvalues from file.

        **Parameters:**

        projector_file : str
          An existing file, from which the PCA projection matrix and the eigenvalues are read.
        """
        # read PCA projector
        f = bob.io.base.HDF5File(projector_file)
        self.variances = f.read("Eigenvalues")
        f.cd("/Machine")
        self.machine = bob.learn.linear.Machine(f)

    def project(self, feature):
        """project(feature) -> projected

        Projects the given feature into eigenspace.

        **Parameters:**

        feature : 1D :py:class:`numpy.ndarray`
          The 1D feature to be projected.

        **Returns:**

        projected : 1D :py:class:`numpy.ndarray`
          The ``feature`` projected into eigenspace.
        """
        self._check_feature(feature)
        # Projects the data
        return self.machine(feature)

    def enroll(self, enroll_features):
        """enroll(enroll_features) -> model

        Enrolls the model by storing all given input vectors.

        **Parameters:**

        enroll_features : [1D :py:class:`numpy.ndarray`]
          The list of projected features to enroll the model from.

        **Returns:**

        model : 2D :py:class:`numpy.ndarray`
          The enrolled model.

        **Raises:**

        ValueError
          If no feature is given, or if one of the features is not appropriate.
        """
        # explicit check instead of ``assert``, which would be stripped with ``python -O``
        if not len(enroll_features):
            raise ValueError("At least one feature is required to enroll a model")
        for feature in enroll_features:
            self._check_feature(feature, True)
        # just store all the features
        return numpy.vstack(enroll_features)

    def score(self, model, probe):
        """score(model, probe) -> float

        Computes the distance of the model to the probe using the distance function specified in the constructor.

        **Parameters:**

        model : 2D :py:class:`numpy.ndarray`
          The model storing all enrollment features.

        probe : 1D :py:class:`numpy.ndarray`
          The probe feature vector in eigenspace.

        **Returns:**

        score : float
          A similarity value between ``model`` and ``probe``
        """
        self._check_feature(probe, True)
        # return the negative distance (as a similarity measure)
        if len(model.shape) == 2:
            # we have multiple models, so we use the multiple model scoring
            return self.score_for_multiple_models(model, probe)
        elif self.uses_variances:
            # single model, single probe (multiple probes have already been handled)
            return self.factor * self.distance_function(model, probe, self.variances)
        else:
            # single model, single probe (multiple probes have already been handled)
            return self.factor * self.distance_function(model, probe)
# re-define unused functions, just so that they do not get documented
```