Coverage for src/bob/bio/spear/annotator/energy_thr.py: 100%

44 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-12-06 22:04 +0100

1#!/usr/bin/env python 

2# vim: set fileencoding=utf-8 : 

3# Elie Khoury <Elie.Khoury@idiap.ch> 

4# Tue 9 Jun 16:56:01 CEST 2015 

5# 

6# Copyright (C) 2012-2015 Idiap Research Institute, Martigny, Switzerland 

7# 

8# This program is free software: you can redistribute it and/or modify 

9# it under the terms of the GNU General Public License as published by 

10# the Free Software Foundation, version 3 of the License. 

11# 

12# This program is distributed in the hope that it will be useful, 

13# but WITHOUT ANY WARRANTY; without even the implied warranty of 

14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

15# GNU General Public License for more details. 

16# 

17# You should have received a copy of the GNU General Public License 

18# along with this program. If not, see <http://www.gnu.org/licenses/>. 

19 

20"""Energy-based voice activity detection for speaker recognition""" 

21 

22import logging 

23 

24import numpy 

25 

26from bob.bio.base.annotator import Annotator 

27 

28from .. import audio_processing as ap 

29from .. import utils 

30 

31logger = logging.getLogger(__name__) 

32 

33 

class Energy_Thr(Annotator):
    """Voice activity detection (VAD) based on an energy threshold.

    Frames whose log-energy lies within a fixed margin of the loudest
    frame's energy are labeled speech (1); all other frames are labeled
    non-speech (0). Isolated speech frames are then discarded by a
    smoothing pass.
    """

    def __init__(
        self,
        win_length_ms=20.0,  # analysis window length: 20 ms
        win_shift_ms=10.0,  # analysis window shift: 10 ms
        smoothing_window=10,  # 10 frames (i.e. 100 ms)
        ratio_threshold=0.15,  # fraction of the maximum energy kept as speech
        **kwargs
    ):
        super().__init__(**kwargs)
        self.win_length_ms = win_length_ms
        self.win_shift_ms = win_shift_ms
        self.smoothing_window = smoothing_window
        self.ratio_threshold = ratio_threshold

    def _voice_activity_detection(self, energy):
        """Label each frame of ``energy`` as speech (1) or non-speech (0).

        The threshold is ``max(energy) - 2 * log(1 / ratio_threshold)``:
        frames within that log-energy margin of the loudest frame count
        as speech. Returns an ``int16`` array of the same length.
        """
        energy = numpy.asarray(energy)

        # If energy barely varies, the sample probably contains no audio.
        if numpy.std(energy) < 10e-5:
            return numpy.zeros(len(energy), dtype=numpy.int16)

        threshold = numpy.max(energy) - numpy.log(
            (1.0 / self.ratio_threshold) * (1.0 / self.ratio_threshold)
        )
        # Vectorized replacement of the original per-frame loop
        # (which included the no-op `label[i] = label[i] * 1`).
        return (energy > threshold).astype(numpy.int16)

    def _compute_energy(self, data, sample_rate):
        """Return per-frame speech (1) / non-speech (0) labels for ``data``.

        Computes frame energies via ``audio_processing.energy``, thresholds
        them, then smooths the labels to discard isolated speech shorter
        than ``smoothing_window`` frames.
        """
        energy_array = ap.energy(
            data,
            sample_rate,
            win_length_ms=self.win_length_ms,
            win_shift_ms=self.win_shift_ms,
        )
        labels = self._voice_activity_detection(energy_array)
        # Discard isolated speech: a number of frames defined in smoothing_window
        labels = utils.smoothing(labels, self.smoothing_window)
        logger.info(
            "After thresholded Energy-based VAD there are %d frames remaining over %d",
            numpy.sum(labels),
            len(labels),
        )
        return labels

    def transform_one(self, data, sample_rate, annotations=None):
        """Label speech (1) and non-speech (0) parts of one audio signal.

        Parameters
        ----------
        data : numpy.ndarray
            The audio signal (1D waveform).
        sample_rate : int
            Sampling rate of ``data`` in Hz.
        annotations : None
            Unused; kept for interface compatibility.

        Returns
        -------
        numpy.ndarray or None
            Per-frame labels, or ``None`` when no speech was detected.
        """
        labels = self._compute_energy(data, sample_rate)
        if (labels == 0).all():
            logger.warning("No Audio was detected in the sample!")
            return None

        return labels

    def transform(
        self, audio_signals: "list[numpy.ndarray]", sample_rates: "list[int]"
    ):
        """Apply :meth:`transform_one` to each (signal, rate) pair."""
        return [
            self.transform_one(audio_signal, sample_rate)
            for audio_signal, sample_rate in zip(audio_signals, sample_rates)
        ]

    def _more_tags(self):
        # scikit-learn/bob pipeline tags: stateless transformer that reads the
        # sample's `rate` attribute and writes its output as `annotations`.
        return {
            "requires_fit": False,
            "bob_transform_extra_input": (("sample_rates", "rate"),),
            "bob_output": "annotations",
        }