# Source code for bob.bio.base.algorithm.PCA

#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Manuel Guenther <Manuel.Guenther@idiap.ch>

import bob.learn.linear
import bob.io.base

import numpy
import scipy.spatial

from .Algorithm import Algorithm

import logging
logger = logging.getLogger("bob.bio.base")

class PCA (Algorithm):
    """Performs a principal component analysis (PCA) on the given data.

    This algorithm computes a PCA projection (:py:class:`bob.learn.linear.PCATrainer`) on the given training features,
    projects the features to eigenspace and computes the distance of two projected features in eigenspace.
    For example, the eigenface algorithm as proposed by [TP91]_ can be run with this class.

    **Parameters:**

    subspace_dimension : int or float
      If specified as ``int``, defines the number of eigenvectors used in the PCA projection matrix.
      If specified as ``float`` (between 0 and 1), the number of eigenvectors is calculated such that the given percentage of variance is kept.

    distance_function : function
      A function taking two parameters and returns a float.
      If ``uses_variances`` is set to ``True``, the function is provided with a third parameter, which is the vector of variances (aka. eigenvalues).

    is_distance_function : bool
      Set this flag to ``False`` if the given ``distance_function`` computes a similarity value (i.e., higher values are better).

    uses_variances : bool
      If set to ``True``, the ``distance_function`` is provided with a third argument, which is the vector of variances (aka. eigenvalues).

    kwargs : ``key=value`` pairs
      A list of keyword arguments directly passed to the :py:class:`Algorithm` base class constructor.
    """

    def __init__(
        self,
        subspace_dimension,  # if int, number of subspace dimensions; if float, percentage of variance to keep
        distance_function=scipy.spatial.distance.euclidean,
        is_distance_function=True,
        uses_variances=False,
        **kwargs  # parameters directly sent to the base class
    ):
        # call base class constructor and register that the algorithm performs a projection
        super(PCA, self).__init__(
            performs_projection=True,

            subspace_dimension=subspace_dimension,
            distance_function=str(distance_function),
            is_distance_function=is_distance_function,
            uses_variances=uses_variances,

            **kwargs
        )

        self.subspace_dim = subspace_dimension
        self.machine = None
        self.distance_function = distance_function
        # distances are negated so that higher scores always mean better matches
        self.factor = -1. if is_distance_function else 1.
        self.uses_variances = uses_variances

    def _check_feature(self, feature, projected=False):
        """Checks that the feature is a 1D :py:class:`numpy.ndarray` of ``float64`` with the size expected by the machine.

        Raises a :py:class:`ValueError` if the feature has the wrong type, dimensionality or size.
        """
        if not isinstance(feature, numpy.ndarray) or feature.ndim != 1 or feature.dtype != numpy.float64:
            raise ValueError("The given feature is not appropriate")
        # before projection, features must match the machine's input size; after projection, its output size
        index = 1 if projected else 0
        # fix: compare the length of the feature (feature.shape[0]), not the shape tuple itself;
        # the tuple never equals an int, and "%d" cannot format a tuple
        if self.machine is not None and feature.shape[0] != self.machine.shape[index]:
            raise ValueError("The given feature is expected to have %d elements, but it has %d" % (self.machine.shape[index], feature.shape[0]))

    def train_projector(self, training_features, projector_file):
        """Generates the PCA covariance matrix and writes it into the given projector_file.

        **Parameters:**

        training_features : [1D :py:class:`numpy.ndarray`]
          A list of 1D training arrays (vectors) to train the PCA projection matrix with.

        projector_file : str
          A writable file, into which the PCA projection matrix (as a :py:class:`bob.learn.linear.Machine`) and the eigenvalues will be written.
        """
        # Assure that all data are 1D
        for feature in training_features:
            self._check_feature(feature)

        # Initializes the data
        data = numpy.vstack(training_features)
        logger.info("  -> Training LinearMachine using PCA")
        t = bob.learn.linear.PCATrainer()
        self.machine, self.variances = t.train(data)
        # For re-shaping, we need to copy...
        self.variances = self.variances.copy()

        # compute variance percentage, if desired
        if isinstance(self.subspace_dim, float):
            # keep the smallest number of eigenvectors that covers the requested variance ratio
            cummulated = numpy.cumsum(self.variances) / numpy.sum(self.variances)
            for index in range(len(cummulated)):
                if cummulated[index] > self.subspace_dim:
                    self.subspace_dim = index
                    break
            self.subspace_dim = index
            logger.info("    ... Keeping %d PCA dimensions", self.subspace_dim)
        # re-shape machine
        # fix: resize takes the input dimension (machine.shape[0]) and the new output
        # dimension as ints, not the whole shape tuple
        self.machine.resize(self.machine.shape[0], self.subspace_dim)
        self.variances = numpy.resize(self.variances, (self.subspace_dim))

        # write the machine and the eigenvalues to the projector file
        f = bob.io.base.HDF5File(projector_file, "w")
        f.set("Eigenvalues", self.variances)
        f.create_group("Machine")
        f.cd("/Machine")
        self.machine.save(f)

    def load_projector(self, projector_file):
        """Reads the PCA projection matrix and the eigenvalues from file.

        **Parameters:**

        projector_file : str
          An existing file, from which the PCA projection matrix and the eigenvalues are read.
        """
        # read PCA projector
        f = bob.io.base.HDF5File(projector_file)
        self.variances = f.read("Eigenvalues")
        f.cd("/Machine")
        self.machine = bob.learn.linear.Machine(f)

    def project(self, feature):
        """project(feature) -> projected

        Projects the given feature into eigenspace.

        **Parameters:**

        feature : 1D :py:class:`numpy.ndarray`
          The 1D feature to be projected.

        **Returns:**

        projected : 1D :py:class:`numpy.ndarray`
          The feature projected into eigenspace.
        """
        self._check_feature(feature)
        # Projects the data
        return self.machine(feature)

    def enroll(self, enroll_features):
        """enroll(enroll_features) -> model

        Enrolls the model by storing all given input vectors.

        **Parameters:**

        enroll_features : [1D :py:class:`numpy.ndarray`]
          The list of projected features to enroll the model from.

        **Returns:**

        model : 2D :py:class:`numpy.ndarray`
          The enrolled model.
        """
        assert len(enroll_features)
        for feature in enroll_features:
            self._check_feature(feature, True)
        # just store all the features
        return numpy.vstack(enroll_features)

    def score(self, model, probe):
        """score(model, probe) -> float

        Computes the distance of the model to the probe using the distance function specified in the constructor.

        **Parameters:**

        model : 2D :py:class:`numpy.ndarray`
          The model storing all enrollment features.

        probe : 1D :py:class:`numpy.ndarray`
          The probe feature vector in eigenspace.

        **Returns:**

        score : float
          A similarity value between ``model`` and ``probe``.
        """
        self._check_feature(probe, True)
        # return the negative distance (as a similarity measure)
        if len(model.shape) == 2:
            # we have multiple models, so we use the multiple model scoring
            return self.score_for_multiple_models(model, probe)
        elif self.uses_variances:
            # single model, single probe (multiple probes have already been handled)
            return self.factor * self.distance_function(model, probe, self.variances)
        else:
            # single model, single probe (multiple probes have already been handled)
            return self.factor * self.distance_function(model, probe)

    # re-define unused functions, just so that they do not get documented
    def train_enroller(*args, **kwargs): raise NotImplementedError()
    def load_enroller(*args, **kwargs): pass