#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Milos Cernak <milos.cernak@idiap.ch>
# August 28, 2017
import logging
import os
import tempfile
from subprocess import PIPE
from subprocess import Popen
import numpy as np
from . import io
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
def _close_and_wait(*procs):
    """Close the parent's pipe ends of ``procs`` and reap the children.

    Every pipe is closed before any process is waited for, so a child that
    is stuck writing to a pipe nobody reads gets a write error instead of
    making ``wait()`` hang.
    """
    for proc in procs:
        for stream in (proc.stdin, proc.stdout):
            if stream is None or stream.closed:
                continue
            try:
                stream.close()
            except (IOError, OSError):
                # Best-effort cleanup: flushing stdin of a child that has
                # already exited fails with a broken pipe.
                pass
    for proc in procs:
        proc.wait()


def _first_matrix(stream, tool):
    """Read the whole Kaldi archive from ``stream`` and return its first matrix.

    ``tool`` is the name of the program that produced the stream; it is only
    used to build the error message.

    Raises
    ------
    RuntimeError
        If the archive does not contain any matrix.
    """
    matrices = [mat for _, mat in io.read_mat_ark(stream)]
    if not matrices:
        raise RuntimeError(
            "'{}' produced no output (its stderr is discarded, run the "
            "command manually to see the cause)".format(tool)
        )
    return matrices[0]


def cepstral(
    data,
    cepstral_type,
    rate=8000,
    preemphasis_coefficient=0.97,
    raw_energy=True,
    delta_order=2,
    frame_length=25,
    frame_shift=10,
    num_ceps=13,
    num_mel_bins=23,
    cepstral_lifter=22,
    low_freq=20,
    high_freq=0,
    dither=1.0,
    snip_edges=True,
    normalization=True,
):
    """Computes the cepstral (mfcc/plp) features for given speech samples.

    The samples are piped through the Kaldi command-line tools
    ``compute-<cepstral_type>-feats``, ``compute-cmvn-stats``,
    ``apply-cmvn`` and ``add-deltas``, in that order.

    Parameters
    ----------
    data : numpy.ndarray
        A 1D numpy ndarray object containing 64-bit float numbers with
        the audio signal to calculate the cepstral features from. Unless
        ``normalization`` is true, the input needs to be normalized
        between [-1, 1]. The array is never modified by the
        normalization step.
    cepstral_type: str
        The type of cepstral features: mfcc or plp
    rate : float
        The sampling rate of the input signal in ``data``.
    preemphasis_coefficient : :obj:`float`, optional
        Coefficient for use in signal preemphasis
    raw_energy : :obj:`bool`, optional
        If true, compute energy before preemphasis and windowing
    delta_order : :obj:`int`, optional
        Add deltas to raw mfcc or plp features
    frame_length : :obj:`int`, optional
        Frame length in milliseconds
    frame_shift : :obj:`int`, optional
        Frame shift in milliseconds
    num_ceps : :obj:`int`, optional
        Number of cepstra in MFCC computation (including C0)
    num_mel_bins : :obj:`int`, optional
        Number of triangular mel-frequency bins
    cepstral_lifter : :obj:`int`, optional
        Constant that controls scaling of MFCCs
    low_freq : :obj:`int`, optional
        Low cutoff frequency for mel bins
    high_freq : :obj:`int`, optional
        High cutoff frequency for mel bins (if < 0, offset from Nyquist)
    dither : :obj:`float`, optional
        Dithering constant (0.0 means no dither)
    snip_edges : :obj:`bool`, optional
        If true, end effects will be handled by outputting only frames
        that completely fit in the file, and the number of frames
        depends on the frame-length. If false, the number of frames
        depends only on the frame-shift, and we reflect the data at
        the ends.
    normalization : :obj:`bool`, optional
        If true, a copy of the samples in ``data`` is scaled by its peak
        absolute value so that it lies in [-1, 1]. An all-zero signal is
        passed on unchanged.

    Returns
    -------
    numpy.ndarray
        The cepstral features calculated for the input signal (2D
        array of 32-bit floats).

    Raises
    ------
    ValueError
        If ``cepstral_type`` is neither ``"mfcc"`` nor ``"plp"``.
    RuntimeError
        If one of the Kaldi tools produces no features.
    OSError
        If one of the Kaldi executables cannot be started.
    """
    # A real exception instead of ``assert`` so the check survives ``-O``.
    if cepstral_type not in ("mfcc", "plp"):
        raise ValueError(
            "cepstral_type must be 'mfcc' or 'plp', got {!r}".format(
                cepstral_type
            )
        )

    extract_cmd = [
        "compute-" + cepstral_type + "-feats",
        "--sample-frequency=" + str(rate),
        "--preemphasis-coefficient=" + str(preemphasis_coefficient),
        "--raw-energy=" + str(raw_energy).lower(),
        "--frame-length=" + str(frame_length),
        "--frame-shift=" + str(frame_shift),
        "--num-ceps=" + str(num_ceps),
        "--num-mel-bins=" + str(num_mel_bins),
        "--cepstral-lifter=" + str(cepstral_lifter),
        # Forward the documented mel-bank cutoffs to the Kaldi binary.
        "--low-freq=" + str(low_freq),
        "--high-freq=" + str(high_freq),
        "--dither=" + str(dither),
        "--snip-edges=" + str(snip_edges).lower(),
        "ark:-",
        "ark:-",
    ]
    delta_cmd = [
        "add-deltas",
        "--delta-order=" + str(delta_order),
        "ark:-",
        "ark:-",
    ]
    # Utterance key used for the single entry of every archive in the chain.
    key = b"abc"

    samples = data
    if normalization:
        peak = np.max(np.abs(data), axis=0)
        # A silent signal has a zero peak; divide by one instead so it stays
        # silent rather than turning into NaNs.
        peak = np.where(peak == 0, np.ones_like(peak), peak)
        # Out-of-place division: the caller's array must not be modified.
        samples = data / peak

    # Stage 1: static features.
    with open(os.devnull, "w") as fnull:
        extractor = Popen(extract_cmd, stdin=PIPE, stdout=PIPE, stderr=fnull)
        try:
            # Write the utterance key (as if it were a Kaldi ark file),
            # followed by the signal as a WAV stream.
            extractor.stdin.write(key + b" ")
            io.write_wav(extractor.stdin, samples, rate)
            extractor.stdin.close()
            feats = _first_matrix(extractor.stdout, extract_cmd[0])
        finally:
            _close_and_wait(extractor)
    if len(feats) == 0:
        raise RuntimeError(
            "'{}' returned a feature matrix without frames".format(
                extract_cmd[0]
            )
        )

    # Stage 2: compute the CMVN statistics into a temporary file, apply them
    # and append the deltas.
    with tempfile.NamedTemporaryFile(suffix=".cmvn") as cmvnfile, open(
        os.devnull, "w"
    ) as fnull:
        stats = Popen(
            ["compute-cmvn-stats", "ark:-", cmvnfile.name],
            stdin=PIPE,
            stdout=PIPE,
            stderr=fnull,
        )
        try:
            io.write_mat(stats.stdin, feats, key=key)
            # Closes stdin, drains stdout and waits for the process, so the
            # statistics file is complete before it is used below.
            stats.communicate()
        finally:
            _close_and_wait(stats)

        normalizer = Popen(
            ["apply-cmvn", cmvnfile.name, "ark:-", "ark:-"],
            stdin=PIPE,
            stdout=PIPE,
            stderr=fnull,
        )
        started = [normalizer]
        try:
            deltas = Popen(
                delta_cmd, stdin=normalizer.stdout, stdout=PIPE, stderr=fnull
            )
            started.append(deltas)
            # The read end now belongs to ``add-deltas``; drop the parent's
            # copy so ``apply-cmvn`` is notified if its consumer exits early.
            normalizer.stdout.close()
            io.write_mat(normalizer.stdin, feats, key=key)
            normalizer.stdin.close()
            return _first_matrix(deltas.stdout, delta_cmd[0])
        finally:
            _close_and_wait(*started)