Coverage for src/bob/bio/spear/annotator/energy_thr.py: 100%

44 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-12-06 22:04 +0100

1#!/usr/bin/env python 

2# vim: set fileencoding=utf-8 : 

3# Elie Khoury <Elie.Khoury@idiap.ch> 

4# Tue 9 Jun 16:56:01 CEST 2015 

5# 

6# Copyright (C) 2012-2015 Idiap Research Institute, Martigny, Switzerland 

7# 

8# This program is free software: you can redistribute it and/or modify 

9# it under the terms of the GNU General Public License as published by 

10# the Free Software Foundation, version 3 of the License. 

11# 

12# This program is distributed in the hope that it will be useful, 

13# but WITHOUT ANY WARRANTY; without even the implied warranty of 

14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

15# GNU General Public License for more details. 

16# 

17# You should have received a copy of the GNU General Public License 

18# along with this program. If not, see <http://www.gnu.org/licenses/>. 

19 

20"""Energy-based voice activity detection for speaker recognition""" 

21 

22import logging 

23 

24import numpy 

25 

26from bob.bio.base.annotator import Annotator 

27 

28from .. import audio_processing as ap 

29from .. import utils 

30 

31logger = logging.getLogger(__name__) 

32 

33 

class Energy_Thr(Annotator):
    """Voice activity detection (VAD) based on an energy threshold.

    Frames whose log-energy lies within a fixed margin of the loudest
    frame's energy are labeled speech (1); all other frames are labeled
    non-speech (0). Isolated speech frames are then discarded by a
    smoothing pass.
    """

    def __init__(
        self,
        win_length_ms=20.0,  # analysis window length: 20 ms
        win_shift_ms=10.0,  # analysis window shift: 10 ms
        smoothing_window=10,  # 10 frames (i.e. 100 ms)
        ratio_threshold=0.15,  # fraction of the maximum energy kept as speech
        **kwargs
    ):
        super().__init__(**kwargs)
        self.win_length_ms = win_length_ms
        self.win_shift_ms = win_shift_ms
        self.smoothing_window = smoothing_window
        self.ratio_threshold = ratio_threshold

    def _voice_activity_detection(self, energy):
        """Label each frame of ``energy`` as speech (1) or non-speech (0).

        The threshold is ``max(energy) - 2 * log(1 / ratio_threshold)``:
        frames within that log-energy margin of the loudest frame count
        as speech. Returns an ``int16`` array of the same length.
        """
        energy = numpy.asarray(energy)

        # If energy barely varies, the sample probably contains no audio.
        if numpy.std(energy) < 10e-5:
            return numpy.zeros(len(energy), dtype=numpy.int16)

        threshold = numpy.max(energy) - numpy.log(
            (1.0 / self.ratio_threshold) * (1.0 / self.ratio_threshold)
        )
        # Vectorized replacement of the original per-frame loop
        # (which included the no-op `label[i] = label[i] * 1`).
        return (energy > threshold).astype(numpy.int16)

    def _compute_energy(self, data, sample_rate):
        """Return per-frame speech (1) / non-speech (0) labels for ``data``.

        Computes frame energies via ``audio_processing.energy``, thresholds
        them, then smooths the labels to discard isolated speech shorter
        than ``smoothing_window`` frames.
        """
        energy_array = ap.energy(
            data,
            sample_rate,
            win_length_ms=self.win_length_ms,
            win_shift_ms=self.win_shift_ms,
        )
        labels = self._voice_activity_detection(energy_array)
        # Discard isolated speech: a number of frames defined in smoothing_window
        labels = utils.smoothing(labels, self.smoothing_window)
        logger.info(
            "After thresholded Energy-based VAD there are %d frames remaining over %d",
            numpy.sum(labels),
            len(labels),
        )
        return labels

    def transform_one(self, data, sample_rate, annotations=None):
        """Label speech (1) and non-speech (0) parts of one audio signal.

        Parameters
        ----------
        data : numpy.ndarray
            The audio signal (1D waveform).
        sample_rate : int
            Sampling rate of ``data`` in Hz.
        annotations : None
            Unused; kept for interface compatibility.

        Returns
        -------
        numpy.ndarray or None
            Per-frame labels, or ``None`` when no speech was detected.
        """
        labels = self._compute_energy(data, sample_rate)
        if (labels == 0).all():
            logger.warning("No Audio was detected in the sample!")
            return None

        return labels

    def transform(
        self, audio_signals: "list[numpy.ndarray]", sample_rates: "list[int]"
    ):
        """Apply :meth:`transform_one` to each (signal, rate) pair."""
        return [
            self.transform_one(audio_signal, sample_rate)
            for audio_signal, sample_rate in zip(audio_signals, sample_rates)
        ]

    def _more_tags(self):
        # scikit-learn/bob pipeline tags: stateless transformer that reads the
        # sample's `rate` attribute and writes its output as `annotations`.
        return {
            "requires_fit": False,
            "bob_transform_extra_input": (("sample_rates", "rate"),),
            "bob_output": "annotations",
        }