Coverage for src/bob/bio/spear/extractor/Cepstral.py: 85%
55 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-12-06 22:04 +0100
« prev ^ index » next coverage.py v7.3.2, created at 2023-12-06 22:04 +0100
1#!/usr/bin/env python
2# Elie Khoury <Elie.Khoury@idiap.ch>
3# Amir Mohammadi <amir.mohammadi@idiap.ch>
5"""Cepstral Features for speaker recognition"""
7import logging
9import numpy
11from sklearn.base import BaseEstimator, TransformerMixin
13from .. import audio_processing as ap
15logger = logging.getLogger(__name__)
class Cepstral(BaseEstimator, TransformerMixin):
    """Extracts the Cepstral features of audio wav data.

    Use a SampleWrapper to use with bob pipelines to pass the `rate` and
    `annotations` attributes to the arguments of `transform`:

    >>> wrap(
    ...     ["sample"],
    ...     Cepstral(),
    ...     transform_extra_arguments=[
    ...         ("sample_rate", "rate"), ("vad_labels", "annotations")
    ...     ]
    ... )
    """

    def __init__(
        self,
        win_length_ms=20,
        win_shift_ms=10,
        n_filters=24,
        dct_norm=False,
        f_min=0.0,
        f_max=4000.0,
        delta_win=2,
        mel_scale=True,
        with_energy=True,
        with_delta=True,
        with_delta_delta=True,
        n_ceps=19,  # 0-->18
        pre_emphasis_coef=0.95,
        features_mask=None,
        normalize_flag=True,
        **kwargs,
    ):
        """Most parameters are passed to `ap.cepstral`.

        Parameters
        ----------
        features_mask: numpy slice
            Indices of features to keep; used as a column indexer
            (``features[:, features_mask]``). Only applied if VAD annotations
            are present.
        normalize_flag: bool
            Controls the normalization of the feature vectors after Cepstral.
        """
        super().__init__(**kwargs)
        self.win_length_ms = win_length_ms
        self.win_shift_ms = win_shift_ms
        self.n_filters = n_filters
        self.dct_norm = dct_norm
        self.f_min = f_min
        self.f_max = f_max
        self.delta_win = delta_win
        self.mel_scale = mel_scale
        self.with_energy = with_energy
        self.with_delta = with_delta
        self.with_delta_delta = with_delta_delta
        self.n_ceps = n_ceps
        self.pre_emphasis_coef = pre_emphasis_coef
        self.features_mask = features_mask
        self.normalize_flag = normalize_flag

    def normalize_features(self, params: numpy.ndarray):
        """Returns the features normalized along the columns.

        Every column has its mean subtracted and is divided by its standard
        deviation. The input is returned unchanged when it holds fewer than
        two frames, or when at least one column has a zero standard deviation.

        Parameters
        ----------
        params:
            2D array of feature vectors.
        """
        # With zero or one frame there is nothing to normalize; returning early
        # also avoids NaN arithmetic and warnings on an empty array.
        if len(params) <= 1:
            return params

        std = params.std(axis=0)
        # A constant column would cause a division by zero.
        if (std == 0).any():
            return params

        # Mean/std normalization per feature dimension.
        return (params - params.mean(axis=0)) / std

    def transform_one(
        self,
        wav_data: numpy.ndarray,
        sample_rate: float,
        vad_labels: numpy.ndarray,
    ):
        """Computes and returns cepstral features for one given audio signal.

        Parameters
        ----------
        wav_data:
            Audio signal, forwarded to `ap.cepstral`.
        sample_rate:
            Sample rate of the signal, forwarded to `ap.cepstral`.
        vad_labels:
            Per-frame labels; only the frames whose label equals 1 are kept.
            When `None`, neither the frame selection nor `features_mask` is
            applied.

        Returns
        -------
        The (optionally normalized) feature vectors. When no frame is left, a
        single all-zero frame is returned instead of an empty array.
        """
        logger.debug("Cepstral transform.")

        cepstral_features = ap.cepstral(
            wav_data,
            sample_rate,
            win_length_ms=self.win_length_ms,
            win_shift_ms=self.win_shift_ms,
            n_filters=self.n_filters,
            f_min=self.f_min,
            f_max=self.f_max,
            pre_emphasis_coef=self.pre_emphasis_coef,
            mel_scale=self.mel_scale,
            n_ceps=self.n_ceps,
            delta_win=self.delta_win,
            dct_norm=self.dct_norm,
            with_energy=self.with_energy,
            with_delta=self.with_delta,
            with_delta_delta=self.with_delta_delta,
        )

        if vad_labels is not None:  # Don't apply VAD if labels are not present
            # Ensure array, as `list == 1` is `False`
            speech_frames = numpy.asarray(vad_labels) == 1
            filtered_features = cepstral_features[speech_frames]
            if self.features_mask is not None:
                filtered_features = filtered_features[:, self.features_mask]
        else:
            filtered_features = cepstral_features

        if self.normalize_flag:
            normalized_features = self.normalize_features(filtered_features)
        else:
            normalized_features = filtered_features

        if normalized_features.shape[0] == 0:
            logger.warning("No speech found for this utterance")
            # But do not keep it empty!!! This avoids errors in next steps.
            # The placeholder width is read from the empty feature matrix so it
            # always matches what a non-empty utterance would produce, whatever
            # the kind of `features_mask` (slice, index list, boolean mask,
            # numpy array) and whatever the extraction settings.
            if normalized_features.ndim == 2:
                feature_length = normalized_features.shape[1]
            else:
                # NOTE(review): 60 is the previously hard-coded width; kept
                # only as a fallback for a non-2D result -- confirm it matches
                # the default configuration.
                feature_length = 60
            normalized_features = numpy.zeros((1, feature_length))

        return normalized_features

    def transform(
        self,
        wav_data_set: "list[numpy.ndarray]",
        sample_rate: "list[float]",
        vad_labels: "list[numpy.ndarray]",
    ):
        """Applies `transform_one` to each (signal, rate, labels) triplet.

        The three sequences are iterated in lockstep; the result is a list
        with one feature array per signal.
        """
        return [
            self.transform_one(wav_data, rate, annotations)
            for wav_data, rate, annotations in zip(
                wav_data_set, sample_rate, vad_labels
            )
        ]

    def fit(self, X, y=None, **fit_params):
        """Does nothing and returns `self`."""
        return self

    def _more_tags(self):
        """Returns the estimator tags: no fit needed, extra `transform` inputs."""
        return {
            "requires_fit": False,
            "bob_transform_extra_input": (
                ("sample_rate", "rate"),
                ("vad_labels", "annotations"),
            ),
        }