Source code for bob.kaldi.mfcc

#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Marc Ferras Font <marc.ferras@idiap.ch>
# Mon 11 Jul 2016 10:39:15 CEST

import logging
import os
import tempfile
from os.path import isfile
from subprocess import PIPE
from subprocess import Popen

import numpy as np

from . import io

logger = logging.getLogger(__name__)


[docs]def mfcc( data, rate=8000, preemphasis_coefficient=0.97, raw_energy=True, frame_length=25, frame_shift=10, num_ceps=13, num_mel_bins=23, cepstral_lifter=22, low_freq=20, high_freq=0, dither=1.0, snip_edges=True, normalization=True, ): """Computes the MFCCs for given speech samples. Parameters ---------- data : numpy.ndarray A 1D numpy ndarray object containing 64-bit float numbers with the audio signal to calculate the MFCCs from. The input needs to be normalized between [-1, 1]. rate : float The sampling rate of the input signal in ``data``. preemphasis_coefficient : :obj:`float`, optional Coefficient for use in signal preemphasis raw_energy : :obj:`bool`, optional If true, compute energy before preemphasis and windowing frame_length : :obj:`int`, optional Frame length in milliseconds frame_shift : :obj:`int`, optional Frame shift in milliseconds num_ceps : :obj:`int`, optional Number of cepstra in MFCC computation (including C0) num_mel_bins : :obj:`int`, optional Number of triangular mel-frequency bins cepstral_lifter : :obj:`int`, optional Constant that controls scaling of MFCCs low_freq : :obj:`int`, optional Low cutoff frequency for mel bins high_freq : :obj:`int`, optional High cutoff frequency for mel bins (if < 0, offset from Nyquist) dither : :obj:`float`, optional Dithering constant (0.0 means no dither) snip_edges : :obj:`bool`, optional If true, end effects will be handled by outputting only frames that completely fit in the file, and the number of frames depends on the frame-length. If false, the number of frames depends only on the frame-shift, and we reflect the data at the ends. normalization : :obj:`bool`, optional If true, the input samples in ``data`` are normalized to [-1, 1]. Returns ------- numpy.ndarray The MFCCs calculated for the input signal (2D array of 32-bit floats). """ binary1 = "compute-mfcc-feats" cmd1 = [binary1] binary2 = "add-deltas" cmd2 = [binary2] binary3 = "apply-cmvn-sliding" cmd3 = [binary3] # compute features plus deltas and sliding cmvn into the ark file cmd1 += [ "--sample-frequency=" + str(rate), "--preemphasis-coefficient=" + str(preemphasis_coefficient), "--raw-energy=" + str(raw_energy).lower(), "--frame-length=" + str(frame_length), "--frame-shift=" + str(frame_shift), "--num-ceps=" + str(num_ceps), "--num-mel-bins=" + str(num_mel_bins), "--cepstral-lifter=" + str(cepstral_lifter), "--dither=" + str(dither), "--snip-edges=" + str(snip_edges).lower(), "ark:-", "ark:-", ] cmd2 += [ "ark:-", "ark:-", ] cmd3 += [ "--norm-vars=false", "--center=true", "--cmn-window=300", "ark:-", "ark:-", ] # import ipdb; ipdb.set_trace() if normalization: data /= np.max(np.abs(data), axis=0) # normalize to [-1,1] with open(os.devnull, "w") as fnull: pipe1 = Popen(cmd1, stdin=PIPE, stdout=PIPE, stderr=fnull) pipe2 = Popen(cmd2, stdout=PIPE, stdin=pipe1.stdout, stderr=fnull) pipe3 = Popen(cmd3, stdout=PIPE, stdin=pipe2.stdout, stderr=fnull) # write wav file name (as if it were a Kaldi ark file) pipe1.stdin.write(b"abc ") # write WAV file in 16-bit format io.write_wav(pipe1.stdin, data, rate) pipe1.stdin.close() ret = [mat for name, mat in io.read_mat_ark(pipe3.stdout)][0] return ret
[docs]def mfcc_from_path( filename, channel=0, preemphasis_coefficient=0.97, raw_energy=True, frame_length=25, frame_shift=10, num_ceps=13, num_mel_bins=23, cepstral_lifter=22, low_freq=20, high_freq=0, dither=1.0, snip_edges=True, ): """Computes the MFCCs for a given input signal recorded into a file Parameters ---------- filename : str A path to a valid WAV or NIST Sphere file to read data from channel : int The audio channel to read from inside the file preemphasis_coefficient : :obj:`float`, optional Coefficient for use in signal preemphasis raw_energy : :obj:`bool`, optional If true, compute energy before preemphasis and windowing frame_length : :obj:`int`, optional Frame length in milliseconds frame_shift : :obj:`int`, optional Frame shift in milliseconds num_ceps : :obj:`int`, optional Number of cepstra in MFCC computation (including C0) num_mel_bins : :obj:`int`, optional Number of triangular mel-frequency bins cepstral_lifter : :obj:`int`, optional Constant that controls scaling of MFCCs low_freq : :obj:`int`, optional Low cutoff frequency for mel bins high_freq : :obj:`int`, optional High cutoff frequency for mel bins (if < 0, offset from Nyquist) dither : :obj:`float`, optional Dithering constant (0.0 means no dither) snip_edges : :obj:`bool`, optional If true, end effects will be handled by outputting only frames that completely fit in the file, and the number of frames depends on the frame-length. If false, the number of frames depends only on the frame-shift, and we reflect the data at the ends Returns ------- numpy.ndarray The MFCCs calculated for the input signal (2D array of 32-bit floats). """ binary1 = "compute-mfcc-feats" cmd1 = [binary1] binary2 = "add-deltas" cmd2 = [binary2] binary3 = "apply-cmvn-sliding" cmd3 = [binary3] # compute features into the ark file cmd1 += [ "--channel=" + str(channel), "--preemphasis-coefficient=" + str(preemphasis_coefficient), "--raw-energy=" + str(raw_energy).lower(), "--frame-length=" + str(frame_length), "--frame-shift=" + str(frame_shift), "--num-ceps=" + str(num_ceps), "--num-mel-bins=" + str(num_mel_bins), "--cepstral-lifter=" + str(cepstral_lifter), "--dither=" + str(dither), "--snip-edges=" + str(snip_edges).lower(), "scp:-", "ark:-", ] cmd2 += [ "ark:-", "ark:-", ] cmd3 += [ "--norm-vars=false", "--center=true", "--cmn-window=300", "ark:-", "ark:-", ] # import ipdb; ipdb.set_trace() assert isfile(filename) with open(os.devnull, "w") as fnull: pipe1 = Popen(cmd1, stdin=PIPE, stdout=PIPE, stderr=fnull) pipe2 = Popen(cmd2, stdout=PIPE, stdin=pipe1.stdout, stderr=fnull) pipe3 = Popen(cmd3, stdout=PIPE, stdin=pipe2.stdout, stderr=fnull) # write scp file into pipe.stdin strwrite = "abc " + filename pipe1.stdin.write(strwrite.encode("utf-8")) pipe1.stdin.close() # pipe3.communicate() # read ark from pipe3.stdout ret = [mat for name, mat in io.read_mat_ark(pipe3.stdout)][0] return ret
[docs]def compute_vad( samples, rate, vad_energy_mean_scale=0.5, vad_energy_th=5, vad_frames_context=0, vad_proportion_th=0.6, ): """Performs Voice Activity Detection on a Kaldi feature matrix Parameters ---------- feats : numpy.ndarray A 2-D numpy array, with log-energy being in the first component of each feature vector rate : float The sampling rate of the input signal in ``samples``. vad_energy_mean_scale: :obj:`float`, optional If this is set to s, to get the actual threshold we let m be the mean log-energy of the file, and use s*m + vad-energy-th vad_energy_th: :obj:`float`, optional Constant term in energy threshold for MFCC0 for VAD. vad_frames_context: :obj:`int`, optional Number of frames of context on each side of central frame, in window for which energy is monitored vad_proportion_th: :obj:`float`, optional Parameter controlling the proportion of frames within the window that need to have more energy than the threshold Returns ------- numpy.ndarray The labels [1/0] of voiced features (1D array of floats). """ binary1 = "compute-mfcc-feats" cmd1 = [binary1] binary2 = "compute-vad" cmd2 = [binary2] cmd1 += [ "--sample-frequency=" + str(rate), "ark:-", "ark:-", ] cmd2 += [ "--vad-energy-mean-scale=" + str(vad_energy_mean_scale), "--vad-energy-threshold=" + str(vad_energy_th), "--vad-frames-context=" + str(vad_frames_context), "--vad-proportion-threshold=" + str(vad_proportion_th), "ark:-", "ark:-", ] samples /= np.max(np.abs(samples), axis=0) # normalize to [-1,1] with tempfile.NamedTemporaryFile(suffix=".log") as logfile: pipe1 = Popen(cmd1, stdin=PIPE, stdout=PIPE, stderr=logfile) pipe2 = Popen(cmd2, stdin=pipe1.stdout, stdout=PIPE, stderr=logfile) pipe1.stdin.write(b"abc ") io.write_wav(pipe1.stdin, samples, rate) pipe1.stdin.close() with open(logfile.name) as fp: logtxt = fp.read() logger.debug("%s", logtxt) # read ark from pipe2.stdout ret = [mat for name, mat in io.read_vec_flt_ark(pipe2.stdout)][0] return ret