Coverage for src/bob/bio/spear/extractor/Cepstral.py: 85%

1#!/usr/bin/env python

2# Elie Khoury <Elie.Khoury@idiap.ch>

3# Amir Mohammadi <amir.mohammadi@idiap.ch>

5"""Cepstral Features for speaker recognition"""

7import logging

9import numpy

11from sklearn.base import BaseEstimator, TransformerMixin

13from .. import audio_processing as ap

# Module-level logger named after this module, so that the messages emitted by
# the extractor can be filtered and configured through the logging hierarchy.
logger = logging.getLogger(__name__)

class Cepstral(BaseEstimator, TransformerMixin):
    """Extracts the Cepstral features of audio wav data.

    Use a SampleWrapper to use with bob pipelines to pass the `rate` and
    `annotations` attributes to the arguments of `transform`:

    >>> wrap(
    ...     ["sample"],
    ...     Cepstral(),
    ...     transform_extra_arguments=[
    ...         ("sample_rate", "rate"), ("vad_labels", "annotations")
    ...     ]
    ... )
    """

    def __init__(
        self,
        win_length_ms=20,
        win_shift_ms=10,
        n_filters=24,
        dct_norm=False,
        f_min=0.0,
        f_max=4000.0,
        delta_win=2,
        mel_scale=True,
        with_energy=True,
        with_delta=True,
        with_delta_delta=True,
        n_ceps=19,  # 0-->18
        pre_emphasis_coef=0.95,
        features_mask=None,
        normalize_flag=True,
        **kwargs,
    ):
        """Most parameters are passed to `ap.cepstral`.

        Parameters
        ----------
        features_mask: numpy slice
            Indices of features to keep (only applied if VAD annotations are
            present).
        normalize_flag: bool
            Controls the normalization of the feature vectors after Cepstral.
        **kwargs:
            Forwarded untouched to the parent class constructor.
        """
        super().__init__(**kwargs)
        self.win_length_ms = win_length_ms
        self.win_shift_ms = win_shift_ms
        self.n_filters = n_filters
        self.dct_norm = dct_norm
        self.f_min = f_min
        self.f_max = f_max
        self.delta_win = delta_win
        self.mel_scale = mel_scale
        self.with_energy = with_energy
        self.with_delta = with_delta
        self.with_delta_delta = with_delta_delta
        self.n_ceps = n_ceps
        self.pre_emphasis_coef = pre_emphasis_coef
        self.features_mask = features_mask
        self.normalize_flag = normalize_flag

    def normalize_features(self, params: numpy.ndarray):
        """Returns the features normalized along the columns.

        Each column has its mean subtracted and is divided by its standard
        deviation. The input is returned unchanged when this is not possible:
        fewer than two frames, or at least one column with a zero standard
        deviation.

        Parameters
        ----------
        params:
            2D array of feature vectors.

        Returns
        -------
        numpy.ndarray
            The normalized features, or `params` itself when normalization is
            skipped.
        """
        # With zero or one frame there is nothing to normalize; returning early
        # also avoids computing statistics over an empty array.
        if len(params) < 2:
            return params

        std = params.std(axis=0)
        # A constant column would cause a division by zero: leave everything as is
        if (std == 0).any():
            return params

        # mean std normalized version of params per feature dimension
        return (params - params.mean(axis=0)) / std

    def transform_one(
        self,
        wav_data: numpy.ndarray,
        sample_rate: float,
        vad_labels: numpy.ndarray,
    ):
        """Computes and returns cepstral features for one given audio signal.

        Parameters
        ----------
        wav_data:
            Audio signal, passed as is to `ap.cepstral`.
        sample_rate:
            Sample rate of `wav_data`, passed as is to `ap.cepstral`.
        vad_labels:
            Per-frame labels; only the frames labeled ``1`` are kept. When
            ``None``, all the frames are kept and `features_mask` is not
            applied.

        Returns
        -------
        numpy.ndarray
            The (optionally filtered and normalized) features. When no frame
            remains, a single all-zero frame of the same feature width is
            returned instead of an empty array.
        """
        logger.debug("Cepstral transform.")

        # NOTE(review): presumably a 2D (frames, features) array -- confirm
        # against `ap.cepstral`.
        cepstral_features = ap.cepstral(
            wav_data,
            sample_rate,
            win_length_ms=self.win_length_ms,
            win_shift_ms=self.win_shift_ms,
            n_filters=self.n_filters,
            f_min=self.f_min,
            f_max=self.f_max,
            pre_emphasis_coef=self.pre_emphasis_coef,
            mel_scale=self.mel_scale,
            n_ceps=self.n_ceps,
            delta_win=self.delta_win,
            dct_norm=self.dct_norm,
            with_energy=self.with_energy,
            with_delta=self.with_delta,
            with_delta_delta=self.with_delta_delta,
        )

        if vad_labels is not None:  # Don't apply VAD if labels are not present
            # Ensure array, as `list == 1` is `False`
            vad_labels = numpy.array(vad_labels)
            filtered_features = cepstral_features[vad_labels == 1]
            if self.features_mask is not None:
                filtered_features = filtered_features[:, self.features_mask]
        else:
            filtered_features = cepstral_features

        if self.normalize_flag:
            normalized_features = self.normalize_features(filtered_features)
        else:
            normalized_features = filtered_features

        if normalized_features.shape[0] == 0:
            logger.warning("No speech found for this utterance")
            # But do not keep it empty!!! This avoids errors in next steps.
            # The width is taken from the empty array itself, so it is correct
            # for any configuration and any kind of mask (slice, list or
            # array), whether or not the mask was applied.
            normalized_features = numpy.zeros(
                (1,) + normalized_features.shape[1:]
            )
        return normalized_features

    def transform(
        self,
        wav_data_set: "list[numpy.ndarray]",
        sample_rate: "list[float]",
        vad_labels: "list[numpy.ndarray]",
    ):
        """Computes the cepstral features of each given audio signal.

        Parameters
        ----------
        wav_data_set:
            The audio signals to process.
        sample_rate:
            The sample rate of each signal, in the same order.
        vad_labels:
            The VAD labels of each signal, in the same order. ``None`` means
            that no labels are available for any of the signals.

        Returns
        -------
        list
            The result of `transform_one` for each signal.
        """
        if vad_labels is None:
            # `transform_one` accepts missing labels; do the same for the set
            return [
                self.transform_one(wav_data, rate, None)
                for wav_data, rate in zip(wav_data_set, sample_rate)
            ]

        return [
            self.transform_one(wav_data, rate, annotations)
            for wav_data, rate, annotations in zip(
                wav_data_set, sample_rate, vad_labels
            )
        ]

    def fit(self, X, y=None, **fit_params):
        """Does nothing: there is nothing to fit. Returns `self`."""
        return self

    def _more_tags(self):
        """Returns the estimator tags: no fit needed, and which extra inputs
        `transform` expects (argument name, sample attribute name)."""
        return {
            "requires_fit": False,
            "bob_transform_extra_input": (
                ("sample_rate", "rate"),
                ("vad_labels", "annotations"),
            ),
        }