Coverage for src/bob/bio/base/script/gen.py: 93%
86 statements
« prev ^ index » next coverage.py v7.6.0, created at 2024-07-12 22:34 +0200
« prev ^ index » next coverage.py v7.6.0, created at 2024-07-12 22:34 +0200
1"""Generate random scores.
2"""
3import csv
4import logging
5import os
7import click
8import numpy
10from clapper.click import verbosity_option
# Module-level logger: used by the helpers in this file and handed to
# ``verbosity_option`` on the ``gen`` command.
logger = logging.getLogger(__name__)
def gen_score_distr(
    mean_neg,
    mean_pos,
    sigma_neg=10,
    sigma_pos=10,
    n_neg=5000,
    n_pos=5000,
    seed=0,
):
    """Generate negative and positive scores from two normal distributions.

    Parameters
    ----------
    mean_neg : float
        Mean for negative scores
    mean_pos : float
        Mean for positive scores
    sigma_neg : float
        Standard deviation for negative scores
    sigma_pos : float
        Standard deviation for positive scores
    n_neg : int
        The number of negative scores generated
    n_pos : int
        The number of positive scores generated
    seed : int
        A value to initialize the random number generator. Giving the same
        value (or not specifying ``seed``) on two different calls will
        generate the same arrays of scores.

    Returns
    -------
    neg_scores : :py:class:`numpy.ndarray`
        Negative scores, 1D array of length ``n_neg``
    pos_scores : :py:class:`numpy.ndarray`
        Positive scores, 1D array of length ``n_pos``
    """
    logger.debug("Initializing RNG.")
    # A private legacy generator produces exactly the stream that
    # ``numpy.random.seed(seed)`` followed by ``numpy.random.normal`` would,
    # but does not reset the process-wide RNG state as a side effect.
    rng = numpy.random.RandomState(seed)

    logger.info(f"Generating {n_neg} negative and {n_pos} positive scores.")

    # Negatives are drawn first, then positives, from the same stream.
    neg_scores = rng.normal(loc=mean_neg, scale=sigma_neg, size=n_neg)
    pos_scores = rng.normal(loc=mean_pos, scale=sigma_pos, size=n_pos)

    return neg_scores, pos_scores
def write_scores_to_file(
    neg,
    pos,
    filename,
    n_subjects=5,
    n_probes_per_subject=5,
    n_unknown_subjects=0,
    neg_unknown=None,
    to_csv=True,
    five_col=False,
    metadata=None,
):
    """Write score distributions to a score file.

    Parameters
    ----------
    neg : :py:class:`numpy.ndarray`
        Scores for negative samples.
    pos : :py:class:`numpy.ndarray`
        Scores for positive samples.
    filename : str
        The path to write the scores to. Missing parent directories are
        created; a bare file name is written to the current directory.
    n_subjects : int
        Number of different subjects
    n_probes_per_subject : int
        Number of different samples used as probe for each subject
    n_unknown_subjects : int
        The number of unknown (no registered model) subjects
    neg_unknown : None or list
        The scores of the unknown subjects. Nothing is written for them when
        ``None``.
    to_csv : bool
        Use the CSV format, else the legacy 4 or 5 columns format.
    five_col : bool
        If True, use the 5-column legacy format, else the 4-column one. Only
        relevant when ``to_csv`` is False.
    metadata : None or dict
        Extra CSV columns: the keys are appended to the header and the values
        to every row. ``None`` selects
        ``{"meta0": "data0", "meta1": "data1"}``. Ignored in legacy formats.
    """
    if metadata is None:
        # Built per call instead of being a shared mutable default argument.
        metadata = {"meta0": "data0", "meta1": "data1"}
    meta_values = list(metadata.values())

    logger.debug(f"Creating result directories ('{filename}').")
    out_dir = os.path.dirname(filename)
    if out_dir:
        # ``os.makedirs("")`` raises, so skip it for bare file names.
        os.makedirs(out_dir, exist_ok=True)
    s_subjects = ["x%d" % i for i in range(n_subjects)]

    logger.debug("Writing scores to files.")

    # The csv module requires ``newline=""`` to avoid doubled carriage
    # returns on platforms that translate line endings; the legacy text
    # format keeps the platform default.
    with open(filename, "wt", newline="" if to_csv else None) as f:
        csv_writer = csv.writer(f) if to_csv else None
        if to_csv:
            csv_writer.writerow(
                ["bio_ref_subject_id", "probe_subject_id", "key", "score"]
                + list(metadata)
            )

        def write_row(ref, probe, probe_id, score):
            """Write one comparison in the selected output format."""
            if to_csv:
                csv_writer.writerow([ref, probe, probe_id, score] + meta_values)
            else:
                # The optional fifth column sits between reference and probe
                # and must be space-delimited on both sides.
                s_five = " d%s " % ref if five_col else " "
                f.write(
                    "%s%s%s %s %f\n" % (ref, s_five, probe, probe_id, score)
                )

        # Generate one line per probe (unless "--force-count" specified)
        logger.debug("Writing positive scores.")
        for i, score in enumerate(pos):
            s_name = s_subjects[(i // n_probes_per_subject) % n_subjects]
            probe_id = "%s_%d" % (s_name, i % n_probes_per_subject)
            write_row(s_name, s_name, probe_id, score)

        # Generate one line per probe against each ref (unless "--force-count" specified)
        logger.debug("Writing negative scores.")
        n_impostors = n_subjects - 1
        for i, score in enumerate(neg):
            ref_idx = (i // (n_probes_per_subject * n_impostors)) % n_subjects
            # The impostors of a reference are all subjects but itself, in
            # order: skip over the reference's own index.
            impostor_idx = (i // n_probes_per_subject) % n_impostors
            if impostor_idx >= ref_idx:
                impostor_idx += 1
            probe = s_subjects[impostor_idx]
            probe_id = "%s_%d" % (probe, i % n_probes_per_subject)
            write_row(s_subjects[ref_idx], probe, probe_id, score)

        logger.debug("Writing unknown scores.")
        if neg_unknown is not None:
            s_unknown_subjects = ["u%d" % i for i in range(n_unknown_subjects)]
            for i, score in enumerate(neg_unknown):
                ref = s_subjects[
                    (i // (n_probes_per_subject * n_unknown_subjects))
                    % n_subjects
                ]
                probe = s_unknown_subjects[
                    (i // n_probes_per_subject) % n_unknown_subjects
                ]
                probe_id = "%s_%d" % (probe, i % n_probes_per_subject)
                write_row(ref, probe, probe_id, score)
@click.command(
    epilog="""
Scores generation examples:

Output 'scores-dev.csv' and 'scores-eval.csv' in a new folder 'generated_scores/':

  $ bob bio gen ./generated_scores

Output scores similar to a system evaluated on the AT&T dataset dev group:

  $ bob bio gen -s 20 -p 5 ./generated_scores

Output a given number of scores in each file:

  $ bob bio gen -f --n-neg 500 --n-pos 100 ./generated_scores

Include unknown subjects scores:

  $ bob bio gen -s 5 -u 2 ./generated_scores

Change the mean and standard deviation of the scores distributions:

  $ bob bio gen -mm 1 -sp 0.3 -mnm -1 -sn 0.5 ./generated_scores

You can observe the distributions histograms in a pdf file with:

  $ bob bio hist -e ./generated_scores/scores-{dev,eval}.csv -o hist_gen.pdf
"""
)
@click.argument("outdir")
@click.option(
    "-mm",
    "--mean-match",
    default=10,
    type=click.FLOAT,
    show_default=True,
    help="Mean for the positive scores distribution",
)
@click.option(
    "-mnm",
    "--mean-non-match",
    default=-10,
    type=click.FLOAT,
    show_default=True,
    help="Mean for the negative scores distribution",
)
@click.option(
    "-p",
    "--n-probes-per-subject",
    default=5,
    type=click.INT,
    show_default=True,
    help="Number of probes per subject",
)
@click.option(
    "-s",
    "--n-subjects",
    default=50,
    type=click.INT,
    show_default=True,
    help="Number of subjects",
)
@click.option(
    "-sp",
    "--sigma-positive",
    default=10,
    type=click.FLOAT,
    show_default=True,
    help="Standard deviation for the positive score distributions",
)
@click.option(
    "-sn",
    "--sigma-negative",
    default=10,
    type=click.FLOAT,
    show_default=True,
    help="Standard deviation for the negative score distributions",
)
@click.option(
    "-u",
    "--n-unknown-subjects",
    default=0,
    type=click.INT,
    show_default=True,
    help="Number of unknown subjects (useful for open-set plots)",
)
@click.option(
    "-f",
    "--force-count",
    "force_count",
    is_flag=True,
    help="Use --n-pos and --n-neg amounts instead of the subject and sample counts",
)
@click.option(
    "--n-pos",
    "n_pos",
    default=5000,
    type=click.INT,
    show_default=True,
    help="Number of Positive verifications (number of lines in the file)",
)
@click.option(
    "--n-neg",
    "n_neg",
    default=5000,
    type=click.INT,
    show_default=True,
    help="Number of Negative verifications (number of lines in the file)",
)
@click.option(
    "--n-unk",
    "n_unk",
    default=5000,
    type=click.INT,
    show_default=True,
    help="Number of Unknown verifications (number of lines in the file)",
)
@click.option(
    "--csv/--legacy",
    default=True,
    show_default=True,
    help="Write CSV score files, or the legacy 4/5-column text format",
)
@click.option(
    "--five-col/--four-col",
    default=False,
    show_default=True,
    help="With --legacy, write five columns instead of four",
)
@verbosity_option(logger=logger)
def gen(
    outdir,
    mean_match,
    mean_non_match,
    n_probes_per_subject,
    n_subjects,
    sigma_positive,
    sigma_negative,
    n_unknown_subjects,
    csv,
    five_col,
    force_count,
    n_pos,
    n_neg,
    n_unk,
    **kwargs,
):
    """Generate random scores.

    Generates random scores in CSV (default), 4col or 5col format. The scores
    are generated using Gaussian distributions whose means and standard
    deviations are input parameters. The generated scores can be used as
    hypothetical datasets.

    This command generates scores relative to the number of subjects and
    probes per subjects, unless the -f flag is set. In that case, the --n-pos
    and --n-neg options are used as number of genuine and impostor
    comparisons.
    """
    # NOTE: inside this function ``csv`` is the boolean --csv/--legacy flag,
    # not the standard ``csv`` module.

    # Compute the number of verifications needed
    if force_count:
        neg_count, pos_count, unknown_count = n_neg, n_pos, n_unk
    else:
        # One reference (model), and `n_probes_per_subject` probes per subject
        neg_count = n_subjects * n_probes_per_subject * (n_subjects - 1)
        pos_count = n_probes_per_subject * n_subjects
        unknown_count = n_unknown_subjects * n_subjects * n_probes_per_subject

    # Generate the data: each group gets its own fixed seed so the two files
    # differ from one another but are reproducible between runs.
    scores = {}
    for group, seed in (("dev", 0), ("eval", 1)):
        logger.info(f"Generating {group} scores.")
        scores[group] = gen_score_distr(
            mean_non_match,
            mean_match,
            sigma_negative,
            sigma_positive,
            n_neg=neg_count,
            n_pos=pos_count,
            seed=seed,
        )

    # For simplicity I will use the same distribution for dev-eval
    if n_unknown_subjects:
        logger.info("Generating unknown scores.")
        neg_unknown, _ = gen_score_distr(
            mean_non_match,
            mean_match,
            sigma_negative,
            sigma_positive,
            n_neg=unknown_count,
            n_pos=0,
            seed=2,
        )
    else:
        neg_unknown = None

    # Write the data into files. The ".csv" file names are used for the
    # legacy formats as well.
    logger.info("Saving results.")
    for group, (neg_scores, pos_scores) in scores.items():
        write_scores_to_file(
            neg_scores,
            pos_scores,
            os.path.join(outdir, f"scores-{group}.csv"),
            n_subjects,
            n_probes_per_subject,
            n_unknown_subjects,
            neg_unknown,
            csv,
            five_col,
        )