# Source code for bob.bio.spear.extractor.cepstral_extended

#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Pavel Korshunov <pavel.korshunov@idiap.ch>
# Fri 6 Nov 17:13:22 CEST 2015
#
# Copyright (C) 2011-2012 Idiap Research Institute, Martigny, Switzerland
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


from __future__ import print_function

import numpy
import bob
import bob.core

import logging
# Logger registered under the "bob.bio.spear" name; used by the extractor below
# for its info/error messages.
logger = logging.getLogger("bob.bio.spear")
# NOTE(review): this sets the logger's level to DEBUG as a side effect of
# importing this module — confirm that overriding the application's logging
# configuration here is intended.
logger.setLevel(logging.DEBUG)

from bob.bio.base.extractor import Extractor
from .. import utils


class CepstralExtended(Extractor):
    """Extract cepstral features from a VAD-labelled audio sample.

    The features are computed with ``bob.ap.Ceps`` (see :py:meth:`compute_ceps`),
    optionally reduced to their delta / delta-delta part, filtered with the VAD
    labels through ``utils.vad_filter_features`` and optionally normalized per
    feature dimension.

    **Parameters**

    ``win_length_ms``, ``win_shift_ms``, ``n_filters``, ``n_ceps``, ``f_min``, ``f_max``, ``delta_win``, ``pre_emphasis_coef``
      Passed positionally to the ``bob.ap.Ceps`` constructor.

    ``dct_norm``, ``mel_scale``, ``rect_filter``, ``inverse_filter``, ``with_energy``, ``with_delta``, ``with_delta_delta``, ``ssfc_features``, ``scfc_features``, ``scmc_features``, ``normalize_spectrum``, ``log_filter``, ``energy_filter``
      Copied onto the ``bob.ap.Ceps`` object as attributes of the same name.

    ``keep_only_deltas``
      If true, the first ``n_ceps`` columns of the computed features are dropped.

    ``vad_filter``
      Filter name handed to ``utils.vad_filter_features``.

    ``normalize_feature_vector``
      If true, the filtered features are mean/variance normalized per column.

    ``kwargs``
      Forwarded to the ``Extractor`` base class constructor.
    """

    def __init__(
            self,
            win_length_ms=20.,  # 20 ms
            win_shift_ms=10.,  # 10 ms
            n_filters=40,
            f_min=0.0,  # 0 Hz
            f_max=8000,  # 8 KHz - this is an important value. Normally it should be half of the sampling frequency
            pre_emphasis_coef=1.0,
            mel_scale=True,
            rect_filter=False,
            inverse_filter=False,
            delta_win=2,
            n_ceps=19,  # 0-->18,
            dct_norm=False,
            ssfc_features=False,
            scfc_features=False,
            scmc_features=False,
            with_delta=True,
            with_delta_delta=True,
            with_energy=False,
            normalize_spectrum=False,
            keep_only_deltas=True,
            log_filter=True,
            energy_filter=False,
            vad_filter="no_filter",  # we do not apply any trim filter by default
            normalize_feature_vector=False,
            **kwargs
    ):
        # call base class constructor with its set of parameters
        Extractor.__init__(
            self,
            requires_training=False, split_training_data_by_client=False,
            **kwargs
        )
        # copy parameters
        self.win_length_ms = win_length_ms
        self.win_shift_ms = win_shift_ms
        self.n_filters = n_filters
        self.f_min = f_min
        self.f_max = f_max
        self.pre_emphasis_coef = pre_emphasis_coef
        self.mel_scale = mel_scale
        self.rect_filter = rect_filter
        self.inverse_filter = inverse_filter
        self.delta_win = delta_win
        self.n_ceps = n_ceps
        self.dct_norm = dct_norm
        self.ssfc_features = ssfc_features
        self.scfc_features = scfc_features
        self.scmc_features = scmc_features
        self.with_delta = with_delta
        self.with_delta_delta = with_delta_delta
        self.with_energy = with_energy
        self.normalize_spectrum = normalize_spectrum
        self.keep_only_deltas = keep_only_deltas
        self.log_filter = log_filter
        self.energy_filter = energy_filter
        self.vad_filter = vad_filter
        self.normalize_feature_vector = normalize_feature_vector

        # Compute the size of the feature vector; it is used for the zero-vector
        # returned by __call__ when no valid features can be produced.
        self.features_len = self.n_ceps
        if self.with_delta:
            self.features_len += self.n_ceps
        if self.with_delta_delta:
            self.features_len += self.n_ceps
        # compute_ceps() drops the first n_ceps columns when only the deltas
        # are kept, so the fallback vector has to shrink by the same amount.
        if self.keep_only_deltas:
            self.features_len -= self.n_ceps
        # NOTE(review): with_energy is not reflected here; if bob.ap.Ceps widens
        # its output when energy is enabled, this length needs to follow - confirm.

    def normalize_features(self, features):
        """Normalize ``features`` to zero mean and unit variance along axis 0.

        Columns whose standard deviation is zero (constant columns, or input
        with a single row) are only mean-centered, which leaves them at zero
        instead of producing NaN/inf through a division by zero.

        **Parameters**

        ``features`` : array_like
          Feature matrix; statistics are computed along its first axis.

        **Returns**

        The normalized features, same shape as ``features``.
        """
        mean = numpy.mean(features, axis=0)
        std = numpy.std(features, axis=0)
        # avoid dividing by zero for constant feature dimensions
        std = numpy.where(std == 0, 1.0, std)
        return numpy.divide(features - mean, std)

    def compute_ceps(self, rate, data):
        """Compute the cepstral features of ``data`` with ``bob.ap.Ceps``.

        **Parameters**

        ``rate``
          Sampling rate, passed as first argument to ``bob.ap.Ceps``.

        ``data``
          Audio samples handed to the configured ``bob.ap.Ceps`` object.

        **Returns**

        The array returned by ``bob.ap.Ceps``; when ``keep_only_deltas`` is set,
        its first ``n_ceps`` columns are removed.
        """
        # The module only imports ``bob`` and ``bob.core`` at file level; import
        # the sub-package used here explicitly so ``bob.ap`` is always resolvable.
        import bob.ap

        ceps = bob.ap.Ceps(rate, self.win_length_ms, self.win_shift_ms, self.n_filters, self.n_ceps, self.f_min,
                           self.f_max, self.delta_win, self.pre_emphasis_coef)
        ceps.dct_norm = self.dct_norm
        ceps.mel_scale = self.mel_scale
        ceps.rect_filter = self.rect_filter
        ceps.inverse_filter = self.inverse_filter
        ceps.with_energy = self.with_energy
        ceps.with_delta = self.with_delta
        ceps.with_delta_delta = self.with_delta_delta
        ceps.ssfc_features = self.ssfc_features
        ceps.scfc_features = self.scfc_features
        ceps.scmc_features = self.scmc_features
        ceps.normalize_spectrum = self.normalize_spectrum
        ceps.log_filter = self.log_filter
        ceps.energy_filter = self.energy_filter

        cepstral_features = ceps(data)

        if self.keep_only_deltas:  # do not take the actual coefficients, only delta with delta-delta
            cepstral_features = cepstral_features[:, self.n_ceps:]
        return cepstral_features

    def __call__(self, input_data, annotations=None):
        """Compute VAD-filtered cepstral features for one audio sample.

        **Parameters**

        ``input_data`` : sequence
          * ``input_data[0]`` --> sampling rate
          * ``input_data[1]`` --> audio signal (an array; its ``size`` is checked)
          * ``input_data[2]`` --> labels produced by the VAD preprocessor

        ``annotations``
          Not used by this extractor.

        **Returns**

        A ``float64`` array of filtered (and optionally normalized) features.
        If the signal is empty or the filtered features contain NaN, a single
        zero-vector of length ``features_len`` is returned instead.
        """
        rate = input_data[0]
        wav_sample = input_data[1]
        labels = input_data[2]  # results of the VAD preprocessor

        # Trailing zeros of the signal are deliberately NOT trimmed, so that the
        # features stay aligned with the VAD output.

        if not wav_sample.size:
            logger.error("- Extraction: WAV sample is empty")
            return numpy.array([numpy.zeros(self.features_len)])

        cepstral_coeff = self.compute_ceps(rate, wav_sample)

        # SSFC features are a frame shorter than labels,
        # since they are computed using the difference between neighboring frames
        if self.ssfc_features:
            labels = labels[1:]
        logger.info("- Extraction: size of cepstral features %s", str(cepstral_coeff.shape))

        filtered_features = utils.vad_filter_features(labels, cepstral_coeff, self.vad_filter)
        logger.info("- Extraction: size of filtered cepstral features %s", str(filtered_features.shape))

        # a NaN anywhere propagates into the sum, so this checks the whole array
        if numpy.isnan(numpy.sum(filtered_features)):
            logger.error("- Extraction: cepstral coefficients have NaN values, returning zero-vector...")
            return numpy.array([numpy.zeros(self.features_len)])

        if self.normalize_feature_vector:
            filtered_features = self.normalize_features(filtered_features)

        return numpy.asarray(filtered_features, dtype=numpy.float64)


# Module-level instance built with all default constructor arguments.
# NOTE(review): presumably exposed as a ready-made extractor configuration —
# confirm against whatever imports this name.
extractor = CepstralExtended()