Source code for bob.kaldi.dnn

#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Milos Cernak <milos.cernak@idiap.ch>
# August 31, 2017

import os

import numpy as np

from . import io
from subprocess import PIPE, Popen
from os.path import isfile
import tempfile
# import shutil
import logging
import pkg_resources

import bob.kaldi

logger = logging.getLogger(__name__)


def nnet_forward(feats, nnet, feats_transform='', apply_log=False,
                 no_softmax=False, prior_floor=1e-10, prior_scale=1,
                 use_gpu=False):
    """Computes the forward pass for given features.

    The model (and, if given, the feature transform) is written to a
    temporary file and handed to the Kaldi ``nnet-forward`` binary; the
    features are streamed to it on stdin and the result is read back
    from its stdout. The temporary files are removed and the child
    process is reaped even when the forward pass fails.

    Parameters
    ----------
    feats: numpy.ndarray
        The input cepstral features (2D array of 32-bit floats).
    nnet: str
        The neural network, given as the text content of the model
        (it is written verbatim to a temporary ``.nnet`` file).

    feats_transform : :obj:`str`, optional
        The input feature transform for ``feats``, given as the text
        content of the transform model. An empty string (the default)
        means no transform is passed to ``nnet-forward``.
    apply_log : :obj:`bool`, optional
        Transform NN output by log().
    no_softmax : :obj:`bool`, optional
        Removes the last component with Softmax.
    prior_floor : :obj:`float`, optional
        Flooring constant for prior probability.
    prior_scale : :obj:`float`, optional
        Scaling factor to be applied on pdf-log-priors.
    use_gpu : :obj:`bool`, optional
        Compute forward pass on GPU.

    Returns
    -------
    numpy.ndarray
        The posterior features (the first matrix emitted by
        ``nnet-forward``).

    Raises
    ------
    IndexError
        If ``nnet-forward`` produced no output matrix. Its stderr is
        emitted on this module's logger at DEBUG level.

    """

    cmd1 = [
        'nnet-forward',
        '--apply-log=' + str(apply_log).lower(),
        '--no-softmax=' + str(no_softmax).lower(),
        '--prior-floor=' + str(prior_floor),
        '--prior-scale=' + str(prior_scale),
        '--use-gpu=' + str(use_gpu).lower(),
    ]

    # Paths are recorded as soon as the files exist so that the
    # ``finally`` clause below can remove them whatever happens later.
    model_paths = []
    try:
        # save nnet model to a file
        with tempfile.NamedTemporaryFile(
                mode='wt', delete=False, suffix='.nnet') as dnn:
            model_paths.append(dnn.name)
            dnn.write(nnet)

        if feats_transform != '':
            # save nnet transform model to a file
            with tempfile.NamedTemporaryFile(
                    mode='wt', delete=False, suffix='.nnet') as transf:
                model_paths.append(transf.name)
                transf.write(feats_transform)

            cmd1 += ['--feature-transform=' + transf.name]

        cmd1 += [dnn.name, 'ark:-', 'ark:-']

        with tempfile.NamedTemporaryFile(suffix='.log') as logfile:
            pipe1 = Popen(cmd1, stdin=PIPE, stdout=PIPE, stderr=logfile)
            try:
                io.write_mat(pipe1.stdin, feats, key=b'abc')
                # Closing stdin signals end of input to nnet-forward.
                pipe1.stdin.close()

                mats = [mat for _, mat in io.read_mat_ark(pipe1.stdout)]
                if not mats:
                    raise IndexError(
                        'nnet-forward produced no output; '
                        'enable DEBUG logging to see its stderr')
                posts = mats[0]
            finally:
                # Best-effort close: a pipe may already be broken if the
                # binary died, and that must not mask the primary error.
                for stream in (pipe1.stdin, pipe1.stdout):
                    try:
                        stream.close()
                    except (IOError, OSError):
                        pass
                # Reap the child so it does not linger as a zombie and
                # so its stderr is completely written before we read it.
                pipe1.wait()

                with open(logfile.name) as fp:
                    logger.debug("%s", fp.read())
    finally:
        for path in model_paths:
            os.unlink(path)

    return posts

def compute_dnn_vad(samples, rate, silence_threshold=0.9, posterior=0):
    """Performs DNN-based Voice Activity Detection on an input signal

    MFCC features are computed from ``samples`` with
    :py:func:`bob.kaldi.cepstral` (without normalization) and passed
    through the AMI network and feature transform shipped with this
    package. A frame is labelled as unvoiced when the selected
    posterior exceeds ``silence_threshold``.

    Parameters
    ----------
    samples : numpy.ndarray
        The input signal, forwarded as-is to
        :py:func:`bob.kaldi.cepstral`.
    rate : float
        The sampling rate of the input signal in ``samples``.
    silence_threshold: :obj:`float`, optional
        Silence threshold to be used for silence posterior
        evaluation.
    posterior: :obj:`int`, optional
        Index of posterior feature to be used for detection. Useful
        ones are 0, 1 and 2, for silence, laughter and
        noise, respectively.

    Returns
    -------
    list of float
        The labels [1/0] of voiced features, one per row of the
        posterior matrix: ``0.0`` where the selected posterior is
        strictly greater than ``silence_threshold``, ``1.0``
        otherwise.
    """

    nnetfile = pkg_resources.resource_filename(
        __name__, 'test/dnn/ami.nnet.txt')
    transfile = pkg_resources.resource_filename(
        __name__, 'test/dnn/ami.feature_transform.txt')

    feats = bob.kaldi.cepstral(samples, 'mfcc', rate,
                               normalization=False)

    # Only the file contents are needed; close both files before the
    # (potentially long) forward pass.
    with open(nnetfile) as nnetf, open(transfile) as trnf:
        dnn = nnetf.read()
        trn = trnf.read()

    post = bob.kaldi.nnet_forward(feats, dnn, trn)

    return [0.0 if row[posterior] > silence_threshold else 1.0
            for row in post]

def compute_dnn_phone(samples, rate):
    """Computes phone posteriors for an input signal

    MFCC features are computed from ``samples`` with
    :py:func:`bob.kaldi.cepstral` (without normalization) and passed
    through the AMI network and feature transform shipped with this
    package. The phone labels are read from the bundled
    ``ami.phones.txt`` table.

    Parameters
    ----------
    samples : numpy.ndarray
        The input signal, forwarded as-is to
        :py:func:`bob.kaldi.cepstral`.
    rate : float
        The sampling rate of the input signal in ``samples``.

    Returns
    -------
    list
        A two-element list ``[post, lab]``: ``post`` is the posterior
        matrix returned by :py:func:`bob.kaldi.nnet_forward` and
        ``lab`` is the list of labels, taken from the first field of
        every row of the label table (its header line is skipped).
    """

    nnetfile = pkg_resources.resource_filename(
        __name__, 'test/dnn/ami.nnet.txt')
    transfile = pkg_resources.resource_filename(
        __name__, 'test/dnn/ami.feature_transform.txt')
    labfile = pkg_resources.resource_filename(
        __name__, 'test/dnn/ami.phones.txt')

    feats = bob.kaldi.cepstral(samples, 'mfcc', rate,
                               normalization=False)

    # Only the file contents are needed; close both files before the
    # (potentially long) forward pass.
    with open(nnetfile) as nnetf, open(transfile) as trnf:
        dnn = nnetf.read()
        trn = trnf.read()

    post = bob.kaldi.nnet_forward(feats, dnn, trn)

    labels = np.genfromtxt(labfile, dtype='str', skip_header=1)
    lab = [row[0] for row in labels]

    return [post, lab]