Coverage for src/bob/bio/spear/annotator/energy_thr.py: 100%
44 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-12-06 22:04 +0100
1#!/usr/bin/env python
2# vim: set fileencoding=utf-8 :
3# Elie Khoury <Elie.Khoury@idiap.ch>
4# Tue 9 Jun 16:56:01 CEST 2015
5#
6# Copyright (C) 2012-2015 Idiap Research Institute, Martigny, Switzerland
7#
8# This program is free software: you can redistribute it and/or modify
9# it under the terms of the GNU General Public License as published by
10# the Free Software Foundation, version 3 of the License.
11#
12# This program is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15# GNU General Public License for more details.
16#
17# You should have received a copy of the GNU General Public License
18# along with this program. If not, see <http://www.gnu.org/licenses/>.
20"""Energy-based voice activity detection for speaker recognition"""
22import logging
24import numpy
26from bob.bio.base.annotator import Annotator
28from .. import audio_processing as ap
29from .. import utils
31logger = logging.getLogger(__name__)
class Energy_Thr(Annotator):
    """Voice activity detection (VAD) based on an energy threshold.

    Frames whose log-energy is within ``ratio_threshold`` (on a log scale)
    of the loudest frame are labeled speech (1); all other frames are
    non-speech (0). Isolated detections are then removed with a smoothing
    window.
    """

    def __init__(
        self,
        win_length_ms=20.0,  # analysis window length: 20 ms
        win_shift_ms=10.0,  # window shift: 10 ms
        smoothing_window=10,  # 10 frames (i.e. 100 ms)
        ratio_threshold=0.15,  # fraction of the maximum energy kept as speech
        **kwargs
    ):
        super().__init__(**kwargs)
        self.win_length_ms = win_length_ms
        self.win_shift_ms = win_shift_ms
        self.smoothing_window = smoothing_window
        self.ratio_threshold = ratio_threshold

    def _voice_activity_detection(self, energy):
        """Label each energy frame as speech (1) or non-speech (0).

        Parameters
        ----------
        energy : 1D array of per-frame log-energies.

        Returns
        -------
        ``numpy.int16`` array of the same length: 1 where the frame's
        energy exceeds the threshold, 0 elsewhere (all zeros when the
        energy is nearly constant, i.e. probably not audio).
        """
        n_samples = len(energy)
        # threshold = max(energy) - 2*log(1/ratio_threshold): frames whose
        # log-energy is within that margin of the loudest frame are speech.
        threshold = numpy.max(energy) - numpy.log(
            (1.0 / self.ratio_threshold) * (1.0 / self.ratio_threshold)
        )

        # If energy barely varies, it's probably not audio.
        # NOTE(review): 10e-5 == 1e-4 — possibly a typo for 1e-5; kept
        # as-is to preserve the original behavior.
        if numpy.std(energy) < 10e-5:
            return numpy.zeros(n_samples, dtype=numpy.int16)

        # Vectorized comparison replaces the original per-frame loop,
        # which contained a no-op branch (label[i] = label[i] * 1).
        return numpy.asarray(energy > threshold, dtype=numpy.int16)

    def _compute_energy(self, data, sample_rate):
        """Compute per-frame speech/non-speech labels for one audio signal.

        Extracts per-frame energies, thresholds them, and smooths the
        resulting labels to discard isolated speech frames.
        """
        energy_array = ap.energy(
            data,
            sample_rate,
            win_length_ms=self.win_length_ms,
            win_shift_ms=self.win_shift_ms,
        )
        labels = self._voice_activity_detection(energy_array)
        # discard isolated speech: a number of frames defined in smoothing_window
        labels = utils.smoothing(labels, self.smoothing_window)
        logger.info(
            "After thresholded Energy-based VAD there are %d frames remaining over %d",
            numpy.sum(labels),
            len(labels),
        )
        return labels

    def transform_one(self, data, sample_rate, annotations=None):
        """Label speech (1) and non-speech (0) frames of one audio signal.

        Parameters
        ----------
        data : the waveform samples of one audio signal.
        sample_rate : the sampling rate of ``data`` in Hz.
        annotations : unused; present for pipeline interface compatibility.

        Returns
        -------
        The per-frame label array, or ``None`` when no speech frame at
        all was detected.
        """
        labels = self._compute_energy(data, sample_rate)
        if (labels == 0).all():
            logger.warning("No Audio was detected in the sample!")
            return None
        return labels

    def transform(
        self, audio_signals: "list[numpy.ndarray]", sample_rates: "list[int]"
    ):
        """Apply :meth:`transform_one` to each (signal, rate) pair."""
        results = []
        for audio_signal, sample_rate in zip(audio_signals, sample_rates):
            results.append(self.transform_one(audio_signal, sample_rate))
        return results

    def _more_tags(self):
        # Declares pipeline wiring: no fitting needed; sample rates come
        # from the samples' "rate" attribute; output goes to "annotations".
        return {
            "requires_fit": False,
            "bob_transform_extra_input": (("sample_rates", "rate"),),
            "bob_output": "annotations",
        }