Coverage for src/bob/bio/spear/extractor/Cepstral.py: 85%

55 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-12-06 22:04 +0100

1#!/usr/bin/env python 

2# Elie Khoury <Elie.Khoury@idiap.ch> 

3# Amir Mohammadi <amir.mohammadi@idiap.ch> 

4 

5"""Cepstral Features for speaker recognition""" 

6 

7import logging 

8 

9import numpy 

10 

11from sklearn.base import BaseEstimator, TransformerMixin 

12 

13from .. import audio_processing as ap 

14 

15logger = logging.getLogger(__name__) 

16 

17 

class Cepstral(BaseEstimator, TransformerMixin):
    """Extracts the Cepstral features of audio wav data.

    Use a SampleWrapper to use with bob pipelines to pass the `rate` and
    `annotations` attributes to the arguments of `transform`:

    >>> wrap(
    ...     ["sample"],
    ...     Cepstral(),
    ...     transform_extra_arguments=[
    ...         ("sample_rate", "rate"), ("vad_labels", "annotations")
    ...     ]
    ... )
    """

    def __init__(
        self,
        win_length_ms=20,
        win_shift_ms=10,
        n_filters=24,
        dct_norm=False,
        f_min=0.0,
        f_max=4000.0,
        delta_win=2,
        mel_scale=True,
        with_energy=True,
        with_delta=True,
        with_delta_delta=True,
        n_ceps=19,  # 0-->18
        pre_emphasis_coef=0.95,
        features_mask=None,
        normalize_flag=True,
        **kwargs,
    ):
        """Most parameters are passed to `ap.cepstral`.

        Parameters
        ----------
        features_mask:
            Index applied on the feature (column) axis to select the features
            to keep. Only applied if VAD annotations are present.
        normalize_flag: bool
            Controls the normalization of the feature vectors after Cepstral.
        **kwargs:
            Forwarded unchanged to the parent class constructor.
        """
        super().__init__(**kwargs)
        self.win_length_ms = win_length_ms
        self.win_shift_ms = win_shift_ms
        self.n_filters = n_filters
        self.dct_norm = dct_norm
        self.f_min = f_min
        self.f_max = f_max
        self.delta_win = delta_win
        self.mel_scale = mel_scale
        self.with_energy = with_energy
        self.with_delta = with_delta
        self.with_delta_delta = with_delta_delta
        self.n_ceps = n_ceps
        self.pre_emphasis_coef = pre_emphasis_coef
        self.features_mask = features_mask
        self.normalize_flag = normalize_flag

    def normalize_features(self, params: numpy.ndarray):
        """Returns the features normalized along the columns.

        Each column has its mean subtracted and is divided by its standard
        deviation. The input is returned unchanged when it cannot be
        normalized: fewer than two rows, or at least one column with zero
        standard deviation (which would lead to a division by zero).

        Parameters
        ----------
        params:
            2D array of feature vectors.

        Returns
        -------
        numpy.ndarray
            The normalized array, or `params` itself if left untouched.
        """
        # With zero or one frame there is nothing to normalize; bailing out
        # early also avoids NaNs and runtime warnings on empty input.
        if len(params) <= 1:
            return params

        std = params.std(axis=0)
        if (std == 0).any():
            return params
        return (params - params.mean(axis=0)) / std

    def transform_one(
        self,
        wav_data: numpy.ndarray,
        sample_rate: float,
        vad_labels: numpy.ndarray,
    ):
        """Computes and returns cepstral features for one given audio signal.

        Parameters
        ----------
        wav_data:
            Audio signal, passed as is to `ap.cepstral`.
        sample_rate:
            Sample rate of `wav_data`, passed as is to `ap.cepstral`.
        vad_labels:
            Per-frame labels; only frames whose label equals 1 are kept. When
            `None`, all frames are kept and `features_mask` is not applied.

        Returns
        -------
        numpy.ndarray
            The (optionally filtered and normalized) features. If no frame
            remains, a single all-zero row is returned instead of an empty
            array.
        """
        logger.debug("Cepstral transform.")

        cepstral_features = ap.cepstral(
            wav_data,
            sample_rate,
            win_length_ms=self.win_length_ms,
            win_shift_ms=self.win_shift_ms,
            n_filters=self.n_filters,
            f_min=self.f_min,
            f_max=self.f_max,
            pre_emphasis_coef=self.pre_emphasis_coef,
            mel_scale=self.mel_scale,
            n_ceps=self.n_ceps,
            delta_win=self.delta_win,
            dct_norm=self.dct_norm,
            with_energy=self.with_energy,
            with_delta=self.with_delta,
            with_delta_delta=self.with_delta_delta,
        )

        if vad_labels is not None:  # Don't apply VAD if labels are not present
            # Ensure array, as `list == 1` is `False`
            vad_labels = numpy.asarray(vad_labels)
            filtered_features = cepstral_features[vad_labels == 1]
            if self.features_mask is not None:
                filtered_features = filtered_features[:, self.features_mask]
        else:
            filtered_features = cepstral_features

        if filtered_features.shape[0] == 0:
            logger.warning("No speech found for this utterance")
            # But do not keep it empty!!! This avoids errors in next steps.
            # The width comes from the (possibly masked) feature array itself
            # rather than from a hard-coded value or `len(features_mask)`, so
            # it matches the non-empty output for every configuration and
            # never evaluates the truth value of an array-like mask.
            return numpy.zeros((1, filtered_features.shape[1]))

        if self.normalize_flag:
            return self.normalize_features(filtered_features)
        return filtered_features

    def transform(
        self,
        wav_data_set: "list[numpy.ndarray]",
        sample_rate: "list[float]",
        vad_labels: "list[numpy.ndarray]",
    ):
        """Applies `transform_one` to each (signal, rate, labels) triplet.

        The three inputs are iterated in lockstep; the result is a list with
        one feature array per triplet.
        """
        return [
            self.transform_one(wav_data, rate, annotations)
            for wav_data, rate, annotations in zip(
                wav_data_set, sample_rate, vad_labels
            )
        ]

    def fit(self, X, y=None, **fit_params):
        """Does nothing and returns `self`; this transformer has no state to fit."""
        return self

    def _more_tags(self):
        """Returns the estimator tags: no fit required, plus the mapping of
        `transform` extra arguments to sample attributes."""
        return {
            "requires_fit": False,
            "bob_transform_extra_input": (
                ("sample_rate", "rate"),
                ("vad_labels", "annotations"),
            ),
        }

166 ("vad_labels", "annotations"), 

167 ), 

168 }