Source code for bob.io.base

# import libraries of other bob packages
import numpy
import bob.core

# import our own library
import bob.extension
bob.extension.load_bob_library('bob.io.base', __file__)

from ._library import File as _File_C, HDF5File as _HDF5File_C, extensions
from . import version
from .version import module as __version__
from .version import api as __api_version__

import os


class File(_File_C):
  __doc__ = _File_C.__doc__

  def __enter__(self):
    return self

  def __exit__(self, type, value, traceback):
    self.close()
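
# ``File`` implements the context-manager protocol, so files are closed
# automatically when a ``with`` block is left. A minimal sketch, assuming a
# hypothetical file 'array.hdf5' in a format supported by this library:
#
#   >>> with File('array.hdf5', 'r') as f:        # doctest: +SKIP
#   ...     data = f.read()   # reads the full contents into a numpy.ndarray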


class HDF5File(_HDF5File_C):
  __doc__ = _HDF5File_C.__doc__

  def __enter__(self):
    return self

  def __exit__(self, type, value, traceback):
    return self.close()

  def __contains__(self, x):
    """See :py:meth:`has_key`."""
    return self.has_key(x)

  def __iter__(self):
    """See :py:meth:`keys`."""
    return iter(self.keys())

  def __getitem__(self, name):
    """See :py:meth:`get`."""
    return self.get(name)

  def __setitem__(self, name, value):
    """See :py:meth:`set`."""
    return self.set(name, value)

  def values(self):
    '''Yields the datasets contained in the current directory.

    Yields
    -------
    object
        The datasets that are being read.
    '''
    return (self[key] for key in self)

  def items(self):
    '''Yields the keys and the datasets contained in the current directory.

    Yields
    -------
    tuple
        The key and the datasets that are being read in a tuple.
    '''
    return ((key, self[key]) for key in self)
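
# Because ``HDF5File`` defines ``__contains__``, ``__iter__``, ``__getitem__``
# and ``__setitem__`` above, it behaves much like a dictionary of datasets. A
# minimal sketch, assuming a hypothetical file 'data.hdf5':
#
#   >>> with HDF5File('data.hdf5', 'w') as f:     # doctest: +SKIP
#   ...     f['key'] = numpy.arange(5)   # same as f.set('key', ...)
#   ...     'key' in f                   # same as f.has_key('key')
#   ...     list(f)                      # iterates over f.keys()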


def _is_string(s):
  """Returns ``True`` if the given object is a string

  This method can be used with Python-2.x or 3.x and returns a string
  respecting each environment's constraints.
  """

  from sys import version_info

  return (version_info[0] < 3 and isinstance(s, (str, unicode))) or \
      isinstance(s, (bytes, str))


@numpy.deprecate(new_name="os.makedirs(directory, exist_ok=True)")
def create_directories_safe(directory, dryrun=False):
  """Creates a directory if it does not exists, with concurrent access support.
  This function will also create any parent directories that might be required.
  If the dryrun option is selected, it does not actually create the directory,
  but just writes the (Linux) command that would have been executed.

  **Parameters:**

  ``directory`` : str
    The directory that you want to create.

  ``dryrun`` : bool
    Only ``print`` the command to console, but do not execute it.
  """
  if dryrun:
    print("[dry-run] mkdir -p '%s'" % directory)
  else:
    os.makedirs(directory, exist_ok=True)
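
# A quick sketch of the ``dryrun`` behavior, using a hypothetical path; the
# printed command mirrors what would otherwise have been executed:
#
#   >>> create_directories_safe('/tmp/a/b/c', dryrun=True)   # doctest: +SKIP
#   [dry-run] mkdir -p '/tmp/a/b/c'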


def load(inputs):
  """load(inputs) -> data

  Loads the contents of a file, an iterable of files, or an iterable of
  :py:class:`bob.io.base.File`'s into a :py:class:`numpy.ndarray`.

  **Parameters:**

  ``inputs`` : various types

    This might represent several different entities:

    1. The name of a file (full path) from where to load the data. In this
       case, this assumes that the file contains an array and returns a loaded
       numpy ndarray.
    2. An iterable of filenames to be loaded in memory. In this case, this
       function assumes that each file contains a single 1D sample or a set of
       1D samples, loads them in memory and concatenates them into a single
       2D :py:class:`numpy.ndarray`, which is returned.
    3. An iterable of :py:class:`File`. In this case, this function assumes
       that each :py:class:`File` contains a single 1D sample or a set of 1D
       samples, loads them in memory if required and concatenates them into a
       single 2D :py:class:`numpy.ndarray`, which is returned.
    4. An iterable with mixed filenames and :py:class:`File`. In this case,
       this function returns a 2D :py:class:`numpy.ndarray`, as described by
       points 2 and 3 above.

  **Returns:**

  ``data`` : :py:class:`numpy.ndarray`
    The data loaded from the given ``inputs``.
  """

  from collections.abc import Iterable
  if _is_string(inputs):
    return File(inputs, 'r').read()
  elif isinstance(inputs, Iterable):
    retval = []
    for obj in inputs:
      if _is_string(obj):
        retval.append(load(obj))
      elif isinstance(obj, File):
        retval.append(obj.read())
      else:
        raise TypeError(
            "Iterable contains an object which is not a filename nor a "
            "bob.io.base.File.")
    return numpy.vstack(retval)
  else:
    raise TypeError(
        "Unexpected input object. This function is expecting a filename, "
        "or an iterable of filenames and/or bob.io.base.File's")


def merge(filenames):
  """merge(filenames) -> files

  Converts an iterable of filenames into an iterable over read-only
  :py:class:`bob.io.base.File`'s.

  **Parameters:**

  ``filenames`` : str or [str]

    A list of file names.
    This might represent:

    1. A single filename. In this case, an iterable with a single
       :py:class:`File` is returned.
    2. An iterable of filenames to be converted into an iterable of
       :py:class:`File`'s.

  **Returns:**

  ``files`` : [:py:class:`File`]
    The list of files.
  """

  from collections.abc import Iterable
  from .utils import is_string
  if is_string(filenames):
    return [File(filenames, 'r')]
  elif isinstance(filenames, Iterable):
    return [File(k, 'r') for k in filenames]
  else:
    raise TypeError(
        "Unexpected input object. This function is expecting an "
        "iterable of filenames.")


def save(array, filename, create_directories=False):
  """Saves the contents of an array-like object to file.

  Effectively, this is the same as creating a :py:class:`File` object
  with the mode flag set to ``'w'`` (write with truncation) and calling
  :py:meth:`File.write` passing ``array`` as parameter.

  Parameters:

  ``array`` : array_like
    The array-like object to be saved on the file

  ``filename`` : str
    The name of the file where you need the contents saved to

  ``create_directories`` : bool
    Automatically generate the directories if required (defaults to ``False``
    because of compatibility reasons; might change in future to default to
    ``True``)
  """
  # create directory if not existent yet
  if create_directories:
    create_directories_safe(os.path.dirname(filename))

  # requires data is c-contiguous and aligned, will create a copy otherwise
  array = numpy.require(array, requirements=('C_CONTIGUOUS', 'ALIGNED'))

  return File(filename, 'w').write(array)

# Just to make it homogeneous with the C++ API
write = save
read = load
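
# A save/load round trip as a sketch, assuming a hypothetical writable path;
# ``write`` and ``read`` are the aliases defined just above:
#
#   >>> a = numpy.arange(6).reshape(2, 3)                    # doctest: +SKIP
#   >>> save(a, '/tmp/a.hdf5', create_directories=True)      # doctest: +SKIP
#   >>> numpy.allclose(read('/tmp/a.hdf5'), a)               # doctest: +SKIP
#   True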


def append(array, filename):
  """append(array, filename) -> position

  Appends the contents of an array-like object to file.

  Effectively, this is the same as creating a :py:class:`File` object
  with the mode flag set to ``'a'`` (append) and calling
  :py:meth:`File.append` passing ``array`` as parameter.

  **Parameters:**

  ``array`` : array_like
    The array-like object to be saved on the file

  ``filename`` : str
    The name of the file where you need the contents saved to

  **Returns:**

  ``position`` : int
    See :py:meth:`File.append`
  """

  # requires data is c-contiguous and aligned, will create a copy otherwise
  array = numpy.require(array, requirements=('C_CONTIGUOUS', 'ALIGNED'))

  return File(filename, 'a').append(array)
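
# A sketch of appending to the same (hypothetical) file twice; each call
# returns the position reported by :py:meth:`File.append`:
#
#   >>> pos = append(numpy.zeros((3,)), '/tmp/acc.hdf5')     # doctest: +SKIP
#   >>> pos = append(numpy.ones((3,)), '/tmp/acc.hdf5')      # doctest: +SKIP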


def peek(filename):
  """peek(filename) -> dtype, shape, stride

  Returns the type of array (frame or sample) saved in the given file.

  Effectively, this is the same as creating a :py:class:`File` object
  with the mode flag set to ``'r'`` (read-only) and calling
  :py:meth:`File.describe`.

  **Parameters**:

  ``filename`` : str
    The name of the file to peek information from

  **Returns:**

  ``dtype, shape, stride`` : see :py:meth:`File.describe`
  """
  return File(filename, 'r').describe()


def peek_all(filename):
  """peek_all(filename) -> dtype, shape, stride

  Returns the type of array (for full readouts) saved in the given file.

  Effectively, this is the same as creating a :py:class:`File` object
  with the mode flag set to ``'r'`` (read-only) and calling
  :py:meth:`File.describe` with its parameter ``all`` set to ``True``.

  **Parameters:**

  ``filename`` : str
    The name of the file to peek information from

  **Returns:**

  ``dtype, shape, stride`` : see :py:meth:`File.describe`
  """
  return File(filename, 'r').describe(all=True)
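
# A sketch contrasting the two peek variants on a hypothetical file holding
# several samples; neither call loads the actual data:
#
#   >>> dtype, shape, stride = peek('samples.hdf5')          # doctest: +SKIP
#   >>> dtype, shape, stride = peek_all('samples.hdf5')      # doctest: +SKIP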


# Keeps compatibility with the previously existing API
open = File


def get_config():
  """Returns a string containing the configuration information.
  """
  return bob.extension.get_config(__name__, version.externals, version.api)


def get_include_directories():
  """get_include_directories() -> includes

  Returns a list of include directories for dependent libraries, such as HDF5.
  This function is automatically used by
  :py:func:`bob.extension.get_bob_libraries` to retrieve the non-standard
  include directories that are required to use the C bindings of this library
  in dependent classes. You shouldn't normally need to call this function by
  hand.

  **Returns:**

  ``includes`` : [str]
    The list of non-standard include directories required to use the C bindings
    of this class. For now, only the directory for the HDF5 headers is
    returned.
  """
  # try to locate the HDF5 header on our own first
  try:
    from bob.extension.utils import find_header
    header = 'hdf5.h'
    candidates = find_header(header)
    if not candidates:
      raise RuntimeError(
          "could not find %s's `%s' - have you installed %s on this "
          "machine?" % ('hdf5', header, 'hdf5'))

    return [os.path.dirname(candidates[0])]
  except RuntimeError:
    # fall back to pkg-config
    from bob.extension import pkgconfig
    pkg = pkgconfig('hdf5')
    return pkg.include_directories()


def get_macros():
  """get_macros() -> macros

  Returns a list of preprocessor macros, such as ``(HAVE_HDF5, 1)``. This
  function is automatically used by :py:func:`bob.extension.get_bob_libraries`
  to retrieve the preprocessor definitions that are required to use the C
  bindings of this library in dependent classes. You shouldn't normally need to
  call this function by hand.

  **Returns:**

  ``macros`` : [(str,str)]
    The list of preprocessor macros required to use the C bindings of this
    class. For now, only ``('HAVE_HDF5', '1')`` is returned, when applicable.
  """
  # the macro only applies when the HDF5 include directories can be found
  if get_include_directories():
    return [('HAVE_HDF5', '1')]
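
# A build-configuration sketch: a dependent package could feed these values to
# a setuptools ``Extension`` (simplified; the real wiring goes through
# ``bob.extension``):
#
#   >>> include_dirs = get_include_directories()             # doctest: +SKIP
#   >>> define_macros = get_macros()                         # doctest: +SKIP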


def _generate_features(reader, paths, same_size=False):
  """Load and stack features in a memory efficient way. This function is meant
  to be used inside :py:func:`vstack_features`.

  Parameters
  ----------
  reader : ``collections.abc.Callable``
    See the documentation of :py:func:`vstack_features`.
  paths : ``collections.abc.Iterable``
    See the documentation of :py:func:`vstack_features`.
  same_size : :obj:`bool`, optional
    See the documentation of :py:func:`vstack_features`.

  Yields
  ------
  object
    The first object returned is a tuple of :py:class:`numpy.dtype` of
    features and the shape of the first feature. The rest of objects are
    the actual values in features. The features are returned in C order.
  """

  shape_determined = False
  for i, path in enumerate(paths):

    feature = numpy.atleast_2d(reader(path))
    feature = numpy.ascontiguousarray(feature)
    if not shape_determined:
      shape_determined = True
      dtype = feature.dtype
      shape = list(feature.shape)
      yield (dtype, shape)
    else:
      # make sure all features have the same shape and dtype
      if same_size:
        assert shape == list(feature.shape)
      else:
        assert shape[1:] == list(feature.shape[1:])
      assert dtype == feature.dtype

    for value in feature.flat:
      yield value


def vstack_features(reader, paths, same_size=False):
  """Stacks all features in a memory efficient way.

  Parameters
  ----------
  reader : ``collections.abc.Callable``
    The function to load the features. The function should only take one
    argument ``path`` and return the loaded features. Use
    :any:`functools.partial` to accommodate your reader to this format. The
    features returned by ``reader`` are expected to have the same
    :py:class:`numpy.dtype` and the same shape except for their first
    dimension. The first dimension should correspond to the number of samples.
  paths : ``collections.abc.Iterable``
    An iterable of paths to iterate on. Whatever is inside a path is given to
    ``reader``, so the entries do not necessarily need to be paths to actual
    files. If ``same_size`` is ``True``, ``len(paths)`` must be valid.
  same_size : :obj:`bool`, optional
    If ``True``, it assumes that arrays inside all the paths are the same
    shape. If you know the features are the same size in all paths, set this
    to ``True`` to improve the performance.

  Returns
  -------
  numpy.ndarray
    The read features with the shape ``(n_samples, *features_shape[1:])``.

  Examples
  --------
  This function is essentially equivalent to calling
  ``numpy.vstack(reader(p) for p in paths)``.

  >>> import numpy
  >>> from bob.io.base import vstack_features
  >>> def reader(path):
  ...     # in each file, there are 5 samples and features are 2 dimensional.
  ...     return numpy.arange(10).reshape(5,2)
  >>> paths = ['path1', 'path2']
  >>> all_features = vstack_features(reader, paths)
  >>> numpy.allclose(all_features, numpy.array(
  ...     [[0, 1],
  ...      [2, 3],
  ...      [4, 5],
  ...      [6, 7],
  ...      [8, 9],
  ...      [0, 1],
  ...      [2, 3],
  ...      [4, 5],
  ...      [6, 7],
  ...      [8, 9]]))
  True
  >>> all_features_with_more_memory = numpy.vstack(reader(p) for p in paths)
  >>> numpy.allclose(all_features, all_features_with_more_memory)
  True

  You can allocate the array at once to improve the performance if you know
  that all features in paths have the same shape and you know the total number
  of the paths:

  >>> all_features = vstack_features(reader, paths, same_size=True)
  >>> numpy.allclose(all_features, numpy.array(
  ...     [[0, 1],
  ...      [2, 3],
  ...      [4, 5],
  ...      [6, 7],
  ...      [8, 9],
  ...      [0, 1],
  ...      [2, 3],
  ...      [4, 5],
  ...      [6, 7],
  ...      [8, 9]]))
  True

  .. note::

    This function runs very slowly. Only use it when RAM is precious.
  """
  iterable = _generate_features(reader, paths, same_size)
  dtype, shape = next(iterable)
  if same_size:
    total_size = int(len(paths) * numpy.prod(shape))
    all_features = numpy.fromiter(iterable, dtype, total_size)
  else:
    all_features = numpy.fromiter(iterable, dtype)

  # the shape is assumed to be (n_samples, ...); it can be (5, 2) or (5, 3, 4)
  shape = list(shape)
  shape[0] = -1
  return numpy.reshape(all_features, shape, order="C")
# gets sphinx autodoc done right - don't remove it
__all__ = [_ for _ in dir() if not _.startswith('_')]