#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# @author: Manuel Guenther <>
# @author: Pavel Korshunov <>
# @date: Wed 19 Aug 13:43:21 2015

import os

import logging

logger = logging.getLogger("bob.pad.base")

from .FileSelector import FileSelector
from import utils
from .preprocessor import read_preprocessed_data
from import read_features

def train_extractor(extractor, preprocessor, allow_missing_files=False, force=False):
    """Trains the feature extractor using preprocessed data of the ``'train'`` group, if the feature extractor requires training.

    This function should only be called, when the ``extractor`` actually requires training.
    The given ``extractor`` is trained using preprocessed data.
    It writes the extractor to the ``extractor_file`` reported by the :py:class:`FileSelector` instance.
    By default, if the target file already exist, it is not re-created.

    **Parameters:**

    extractor : object
      The extractor to be trained.
      This function reads its ``requires_training``, ``min_extractor_file_size`` and
      ``split_training_data_by_client`` attributes and calls its ``train(data, file)`` method.

    preprocessor : object
      The preprocessor, used for reading the preprocessed data; it is passed on to
      ``read_preprocessed_data`` unchanged.

    allow_missing_files : bool
      If set to ``True``, preprocessed data files that are not found are silently ignored during training.
      The flag is forwarded to ``read_preprocessed_data``.

    force : bool
      If given, the extractor file is regenerated, even if it already exists.
      The flag is forwarded to ``utils.check_file``.
    """
    if not extractor.requires_training:
        # ``Logger.warn`` is a deprecated alias; ``warning`` is the supported spelling
        logger.warning(
            "The train_extractor function should not have been called, since the extractor does not need training.")
        return

    # the file selector object
    fs = FileSelector.instance()

    # the file to write; nothing to do when an acceptable file is already there
    if utils.check_file(fs.extractor_file, force, extractor.min_extractor_file_size):
        logger.info("- Extraction: extractor '%s' already exists.", fs.extractor_file)
        return

    # read training files
    train_files = fs.training_list(
        'preprocessed', 'train_extractor',
        combined=not extractor.split_training_data_by_client)
    train_data = read_preprocessed_data(
        train_files, preprocessor,
        extractor.split_training_data_by_client, allow_missing_files)

    if extractor.split_training_data_by_client:
        logger.info("- Extraction: training extractor '%s' using %d classes:",
                    fs.extractor_file, len(train_files))
    else:
        logger.info("- Extraction: training extractor '%s' using %d training files:",
                    fs.extractor_file, len(train_files))

    # train model
    extractor.train(train_data, fs.extractor_file)
def _ensure_parent_directory(path):
    """Creates the directory that will contain ``path``, if it does not exist yet."""
    directory = os.path.dirname(path)
    if not directory or os.path.isdir(directory):
        return
    try:
        os.makedirs(directory)
    except OSError:
        # a job running in parallel may have created the directory in the meantime
        if not os.path.isdir(directory):
            raise


def extract(extractor, preprocessor, groups=None, indices=None, allow_missing_files=False, force=False):
    """Extracts features from the preprocessed data using the given extractor.

    The given ``extractor`` is used to extract all features required for the current experiment.
    It writes the extracted data into the files listed by the :py:class:`FileSelector` instance.
    By default, if target files already exist, they are not re-created.

    The preprocessor is only used to load the data in a coherent way.

    **Parameters:**

    extractor : object
      The extractor, used for extracting and writing the features.
      This function calls its ``load(file)`` method, calls the object itself on each data item,
      reads ``min_feature_file_size`` and calls ``write_feature(feature, file)``.

    preprocessor : object
      The preprocessor, used for reading the preprocessed data.
      This function reads its ``writes_data`` attribute and calls ``read_data(file)``.

    groups : some of ``('train', 'dev', 'eval')`` or ``None``
      The list of groups, for which the data should be extracted.

    indices : (int, int) or None
      If specified, only the features for the given index range ``range(begin, end)`` should be extracted.
      This is usually given, when parallel threads are executed.

    allow_missing_files : bool
      If set to ``True``, preprocessed data files that are not found are silently ignored.
      The same flag also makes a failed extraction (``extractor`` returning ``None``) be skipped
      instead of raising.

    force : bool
      If given, files are regenerated, even if they already exist.

    **Raises:**

    RuntimeError
      If the extractor returns ``None`` for a data file and ``allow_missing_files`` is ``False``.
    """
    # the file selector object
    fs = FileSelector.instance()
    extractor.load(fs.extractor_file)
    data_files = fs.preprocessed_data_list(groups=groups)
    feature_files = fs.feature_list(groups=groups)

    # select a subset of indices to iterate
    if indices is not None:
        index_range = range(indices[0], indices[1])
        logger.info("- Extraction: splitting of index range %s", str(indices))
    else:
        index_range = range(len(data_files))

    logger.info("- Extraction: extracting %d features from directory '%s' to directory '%s'",
                len(index_range), fs.directories['preprocessed'], fs.directories['extracted'])

    for i in index_range:
        data_file = data_files[i]
        feature_file = feature_files[i]

        if not os.path.exists(data_file) and preprocessor.writes_data:
            if allow_missing_files:
                logger.debug("... Cannot find preprocessed data file %s; skipping", data_file)
                continue
            # Only reported here, not raised: when the feature file already exists
            # the data file is not needed and the entry is skipped below.
            logger.error("Cannot find preprocessed data file %s", data_file)

        if utils.check_file(feature_file, force, extractor.min_feature_file_size):
            logger.debug("... Skipping preprocessed data '%s' since feature file '%s' exists",
                         data_file, feature_file)
            continue

        logger.debug("... Extracting features for data file '%s'", data_file)
        # create output directory before reading the data file (is
        # sometimes required, when relative directories are specified,
        # especially, including a .. somewhere)
        _ensure_parent_directory(feature_file)

        # load data
        data = preprocessor.read_data(data_file)
        # extract feature
        feature = extractor(data)

        if feature is None:
            if allow_missing_files:
                logger.debug("... Feature extraction for data file %s failed; skipping", data_file)
                continue
            # format the message here: exceptions do not interpolate extra arguments
            raise RuntimeError(
                "Feature extraction of file '%s' was not successful" % data_file)

        # write feature
        extractor.write_feature(feature, feature_file)