Source code for bob.pipelines.sample

"""Base definition of sample."""

from collections.abc import MutableSequence, Sequence
from typing import Any

import numpy as np

from bob.io.base import vstack_features

# Attribute names that hold the payload of a sample / sample set; they are
# never propagated by ``_copy_attributes``.
SAMPLE_DATA_ATTRS = ("data", "samples")


def _copy_attributes(sample, parent, kwargs, exclude_list=None):
    """Populate ``sample`` with attributes taken from ``parent`` and ``kwargs``.

    Attributes of ``parent`` (the keys of ``parent.__dict__``) are copied
    first, then the entries of ``kwargs`` are set, so ``kwargs`` wins when a
    name appears in both.

    A name is skipped when it starts with an underscore, is listed in
    ``SAMPLE_DATA_ATTRS`` or is contained in ``exclude_list``.

    Parameters
    ----------

        sample : object
            Object receiving the attributes (via ``setattr``).

        parent : object or None
            Object whose attributes are inherited; nothing is inherited when
            ``None``.

        kwargs : dict
            Mapping of attribute name to value to set on ``sample``.

        exclude_list : container or None
            Extra names that must not be copied.
    """
    excluded = exclude_list if exclude_list else []

    def _is_copyable(name):
        # Mirror of the three skip rules documented above.
        return not (
            name.startswith("_")
            or name in SAMPLE_DATA_ATTRS
            or name in excluded
        )

    if parent is not None:
        for name in filter(_is_copyable, parent.__dict__):
            setattr(sample, name, getattr(parent, name))

    for name, value in kwargs.items():
        if _is_copyable(name):
            setattr(sample, name, value)


class _ReprMixin:
    """Mixin providing ``__repr__`` and ``__eq__`` based on ``__dict__``."""

    def __repr__(self):
        """Return ``ClassName(attr=value, ...)`` for the public attributes.

        Attributes whose name starts with ``_`` are left out.
        """
        public = ", ".join(
            f"{k}={v!r}"
            for k, v in self.__dict__.items()
            if not k.startswith("_")
        )
        return f"{self.__class__.__name__}({public})"

    def __eq__(self, other):
        """Compare two objects attribute by attribute.

        Two objects are equal when their ``__dict__`` hold exactly the same
        attribute names and every pair of values is equal. When at least one
        value of a pair is a :any:`numpy.ndarray`, the pair is compared with
        :any:`numpy.allclose` instead of ``!=``.

        Returns ``NotImplemented`` when ``other`` has no ``__dict__`` so that
        Python can fall back to its default comparison.
        """
        other_attrs = getattr(other, "__dict__", None)
        if other_attrs is None:
            return NotImplemented
        self_attrs = self.__dict__

        # Comparing the key sets first catches objects that differ only in
        # the number of attributes (zipping the two would silently truncate).
        if self_attrs.keys() != other_attrs.keys():
            return False

        for key in sorted(self_attrs):
            mine, theirs = self_attrs[key], other_attrs[key]
            if isinstance(mine, np.ndarray) or isinstance(theirs, np.ndarray):
                if not _ReprMixin._arrays_match(mine, theirs):
                    return False
            elif mine != theirs:
                return False

        return True

    @staticmethod
    def _arrays_match(first, second):
        """Tell if two array-like values are numerically close."""
        try:
            return bool(np.allclose(first, second))
        except ValueError:
            # Shapes that cannot be broadcast together are never equal.
            return False
        except TypeError:
            # Non-numeric content (e.g. strings): fall back to exact equality.
            return bool(np.array_equal(first, second))


class Sample(_ReprMixin):
    """Simple container wrapping one data-point.

    See :ref:`bob.pipelines.sample` for the general description of samples.

    Every sample exposes at least:

        * attribute ``data``: the data held by this sample


    Parameters
    ----------

        data : object
            The data this sample is initialized with.

        parent : object
            Optional object whose other attributes (everything but ``data``)
            are inherited by this sample.

        kwargs : dict
            Extra attributes to attach to this sample; they take precedence
            over the ones inherited from ``parent``.
    """

    def __init__(self, data, parent=None, **kwargs):
        # ``data`` is assigned before the metadata so that it comes first in
        # the instance ``__dict__`` (and therefore in ``repr``).
        self.data = data
        _copy_attributes(sample=self, parent=parent, kwargs=kwargs)


class DelayedSample(Sample):
    """Representation of sample that can be loaded via a callable.

    The optional ``**kwargs`` argument allows you to attach more attributes to
    this sample instance.


    Parameters
    ----------

        load
            A python function that can be called parameterlessly, to load the
            sample in question from whatever medium

        parent : :any:`DelayedSample`, :any:`Sample`, None
            If passed, consider this as a parent of this sample, to copy
            information

        delayed_attributes : dict or None
            A dictionary of name : load_fn pairs that will be used to create
            attributes of name : load_fn() in this class. Use this option to
            create more delayed attributes than just ``sample.data``.

        kwargs : dict
            Further attributes of this sample, to be stored and eventually
            transmitted to transformed versions of the sample
    """

    def __init__(self, load, parent=None, delayed_attributes=None, **kwargs):
        # Marker checked by ``__setattr__``: while it is present, assignments
        # never remove entries from ``_delayed_attributes``.
        self.__running_init__ = True
        # Merge parent's and param's delayed_attributes (param's entries win).
        # Copies are used so the parent's / caller's dicts are not modified.
        parent_attr = getattr(parent, "_delayed_attributes", None)
        self._delayed_attributes = None
        if parent_attr is not None:
            self._delayed_attributes = parent_attr.copy()

        if delayed_attributes is not None:
            if self._delayed_attributes is None:
                self._delayed_attributes = delayed_attributes.copy()
            else:
                self._delayed_attributes.update(delayed_attributes)

        # Inherit attributes from parent, without calling delayed_attributes
        for key in getattr(parent, "__dict__", []):
            if key.startswith("_"):
                continue
            if key in SAMPLE_DATA_ATTRS:
                continue
            if self._delayed_attributes is not None:
                if key in self._delayed_attributes:
                    continue
            setattr(self, key, getattr(parent, key))

        # Create the delayed attributes, but leave their values as None for now.
        # The placeholder makes the names show up in ``__dict__``; reads are
        # intercepted by ``__getattribute__``.
        if self._delayed_attributes is not None:
            kwargs.update({k: None for k in self._delayed_attributes})
        # Set attribute from kwargs
        _copy_attributes(self, None, kwargs)
        self._load = load
        del self.__running_init__

    def __getattribute__(self, name: str) -> Any:
        """Return attribute ``name``, calling its loader when it is delayed.

        When ``name`` is a key of ``_delayed_attributes`` the stored callable
        is invoked (on every access) and its result returned; any other name
        follows the regular attribute lookup.
        """
        try:
            delayed_attributes = super().__getattribute__("_delayed_attributes")
        except AttributeError:
            # ``_delayed_attributes`` does not exist yet (early in __init__).
            delayed_attributes = None
        if delayed_attributes is None or name not in delayed_attributes:
            return super().__getattribute__(name)
        return delayed_attributes[name]()

    def __setattr__(self, name: str, value: Any) -> None:
        """Set attribute ``name``; un-delay it when set after ``__init__``."""
        if (
            name != "delayed_attributes"
            and "__running_init__" not in self.__dict__
        ):
            delayed_attributes = getattr(self, "_delayed_attributes", None)
            # if setting an attribute which was delayed, remove it from delayed_attributes
            if delayed_attributes is not None and name in delayed_attributes:
                del delayed_attributes[name]

        super().__setattr__(name, value)

    @property
    def data(self):
        """Return the data of this sample by calling the ``load`` callable."""
        return self._load()

    @classmethod
    def from_sample(cls, sample: Sample, **kwargs):
        """Creates a DelayedSample from another DelayedSample or a Sample.

        If the sample is a DelayedSample, its data will not be loaded.

        Parameters
        ----------

        sample : :any:`Sample`
            The sample to convert to a DelayedSample

        kwargs : dict
            Further attributes forwarded to the constructor.
        """
        if hasattr(sample, "_load"):
            # Re-use the existing loader so nothing gets loaded here.
            data = sample._load
        else:

            def data():
                return sample.data

        return cls(data, parent=sample, **kwargs)
class SampleSet(MutableSequence, _ReprMixin):
    """A set of samples with extra attributes

    The sequence protocol (length, indexing, item assignment / deletion and
    insertion) is delegated to the ``samples`` object.


    Parameters
    ----------

        samples
            The underlying mutable sequence of samples.

        parent : object or None
            If passed, its attributes are copied to this set, except the
            names found in its ``_delayed_attributes`` (if any).

        kwargs : dict
            Further attributes to attach to this set.
    """

    def __init__(self, samples, parent=None, **kwargs):
        self.samples = samples
        _copy_attributes(
            self,
            parent,
            kwargs,
            exclude_list=getattr(parent, "_delayed_attributes", None),
        )

    def __len__(self):
        """Return the number of samples."""
        return len(self.samples)

    def __getitem__(self, item):
        """Return ``self.samples[item]``."""
        return self.samples.__getitem__(item)

    def __setitem__(self, key, item):
        """Replace the sample(s) at ``key`` with ``item``."""
        return self.samples.__setitem__(key, item)

    def __delitem__(self, item):
        """Remove the sample(s) at ``item``."""
        return self.samples.__delitem__(item)

    def insert(self, index, item):
        """Insert ``item`` before position ``index`` of ``samples``."""
        self.samples.insert(index, item)
class DelayedSampleSet(SampleSet):
    """A set of samples with extra attributes, loaded through a callable

    ``samples`` is a read-only property that calls ``load`` on every access.


    Parameters
    ----------

        load
            A callable without parameters returning the samples.

        parent : object or None
            If passed, its attributes are copied to this set, except the
            names found in its ``_delayed_attributes`` (if any).

        kwargs : dict
            Further attributes to attach to this set.
    """

    def __init__(self, load, parent=None, **kwargs):
        # ``SampleSet.__init__`` is not called on purpose: ``samples`` is a
        # read-only property here and cannot be assigned.
        self._load = load
        _copy_attributes(
            self,
            parent,
            kwargs,
            exclude_list=getattr(parent, "_delayed_attributes", None),
        )

    @property
    def samples(self):
        """Return the samples by calling the ``load`` callable."""
        return self._load()


class DelayedSampleSetCached(DelayedSampleSet):
    """A cached version of DelayedSampleSet

    ``load`` is called on first access to ``samples``; its result is kept and
    returned by later accesses (unless that result is ``None``).
    """

    def __init__(self, load, parent=None, **kwargs):
        # Forward the extra attributes as keyword arguments; passing
        # ``kwargs=kwargs`` would store the whole dict as a ``kwargs``
        # attribute on the instance.
        super().__init__(load, parent=parent, **kwargs)
        self._data = None

    @property
    def samples(self):
        """Return the samples, loading them only when not cached yet."""
        if self._data is None:
            self._data = self._load()
        return self._data


class SampleBatch(Sequence, _ReprMixin):
    """A batch of samples that looks like [s.data for s in samples]

    However, when you call np.array(SampleBatch), it will construct a numpy
    array from sample.data attributes in a memory efficient way.


    Parameters
    ----------

        samples
            The sequence of samples to expose.

        sample_attribute : str
            Name of the attribute read from each sample (``"data"`` by
            default).
    """

    def __init__(self, samples, sample_attribute="data"):
        self.samples = samples
        self.sample_attribute = sample_attribute

    def __len__(self):
        """Return the number of samples in the batch."""
        return len(self.samples)

    def __getitem__(self, item):
        """Return the ``sample_attribute`` of the sample at ``item``."""
        return getattr(self.samples[item], self.sample_attribute)

    def __array__(self, dtype=None, *args, **kwargs):
        """Convert the batch to a :any:`numpy.ndarray`.

        When the attribute of the first sample has a ``shape``, the values
        are stacked sample-wise with ``vstack_features``; otherwise (e.g.
        string data) a plain list of the values is converted.
        """

        def _reader(s):
            # adding one more dimension to data so they get stacked sample-wise
            return getattr(s, self.sample_attribute)[None, ...]

        if self.samples and hasattr(
            getattr(self.samples[0], self.sample_attribute), "shape"
        ):
            try:
                arr = vstack_features(_reader, self.samples, dtype=dtype)
            except Exception as e:
                try:
                    # try computing one feature to show a better traceback
                    getattr(self.samples[0], self.sample_attribute)
                except Exception as e2:
                    raise e2 from e
                # The probe succeeded: surface the stacking error unchanged
                # (chaining it to itself would make it its own cause).
                raise
        else:
            # to handle string data
            arr = [getattr(s, self.sample_attribute) for s in self.samples]
        return np.asarray(arr, dtype, *args, **kwargs)