Source code for bob.pad.voice.extractor.spectrogram_extended

#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Pavel Korshunov <>
# Mon  6 Aug 15:12:22 CEST 2015

from __future__ import print_function

import bob.measure
from import Extractor
from .. import utils

import numpy
import bob.ap
import math

import logging
logger = logging.getLogger("bob.pad.voice")

class SpectrogramExtended(Extractor):
    """Extract energy bands from spectrogram and VAD labels based on the modulation of the energy around 4 Hz"""

    def __init__(
            win_length_ms=20.,  # 20 ms
            win_shift_ms=10.,  # 10 ms
            f_min=0.0,  # 0 Hz
            f_max=4000,  # 4 KHz
            normalize_feature_vector = False,
        # call base class constructor with its set of parameters
            requires_training=False, split_training_data_by_client=False,
        # copy parameters
        self.win_length_ms = win_length_ms
        self.win_shift_ms = win_shift_ms
        self.n_filters = n_filters
        self.f_min = f_min
        self.f_max = f_max
        self.pre_emphasis_coef = pre_emphasis_coef
        self.mel_scale = mel_scale
        self.rect_filter = rect_filter
        self.inverse_filter = inverse_filter
        self.normalize_spectrum = normalize_spectrum
        self.log_filter = log_filter
        self.energy_filter = energy_filter
        self.energy_bands = energy_bands
        self.vad_filter = vad_filter
        self.normalize_feature_vector = normalize_feature_vector

        if self.energy_bands:
            self.features_len = self.n_filters
            self.features_len = int(math.pow(2.0, math.ceil(math.log(self.win_length_ms)/math.log(2))))

[docs] def normalize_features(self, features): mean = numpy.mean(features, axis=0) std = numpy.std(features, axis=0) return numpy.divide(features - mean, std)
[docs] def compute_spectrogram(self, rate, data): spectrogram = bob.ap.Spectrogram(rate, self.win_length_ms, self.win_shift_ms, self.n_filters, self.f_min, self.f_max, self.pre_emphasis_coef) spectrogram.energy_bands = self.energy_bands spectrogram.mel_scale = self.mel_scale spectrogram.rect_filter = self.rect_filter spectrogram.inverse_filter = self.inverse_filter spectrogram.normalize_spectrum = self.normalize_spectrum spectrogram.log_filter = self.log_filter spectrogram.energy_filter = self.energy_filter energy_bands = spectrogram(data) return energy_bands
def __call__(self, input_data, annotations=None): """labels speech (1) and non-speech (0) parts of the given input wave file using 4Hz modulation energy and energy, as well as, compute energy of the signal and split it in bands using on linear or mel-filters Input parameter: * input_signal[0] --> rate * input_signal[1] --> signal """ rate = input_data[0] wav_sample = input_data[1] labels = input_data[2] # results of the VAD preprocessor # remove trailing zeros from the wav_sample # wav_sample = numpy.trim_zeros(wav_sample) # comment it out to align with VAD output if wav_sample.size: spectrogram = self.compute_spectrogram(rate, wav_sample)"- Extraction: size of spectrogram features %s", str(spectrogram.shape)) filtered_spectrogram = utils.vad_filter_features(labels, spectrogram, self.vad_filter)"- Extraction: size of filtered spectrogram features %s", str(filtered_spectrogram.shape)) if numpy.isnan(numpy.sum(filtered_spectrogram)): logger.error("- Extraction: cepstral coefficients have NaN values, returning zero-vector...") return numpy.array([numpy.zeros(self.features_len)]) if self.normalize_feature_vector: filtered_spectrogram = self.normalize_features(filtered_spectrogram) return numpy.asarray(filtered_spectrogram, dtype=numpy.float64) logger.error("- Extraction: WAV sample is empty") return numpy.array([numpy.zeros(self.features_len)]) extractor = SpectrogramExtended()