Coverage for src/bob/bio/base/script/gen.py: 93%

1"""Generate random scores.

2"""

3import csv

4import logging

5import os

7import click

8import numpy

10from clapper.click import verbosity_option

12logger = logging.getLogger(__name__)

def gen_score_distr(
    mean_neg,
    mean_pos,
    sigma_neg=10,
    sigma_pos=10,
    n_neg=5000,
    n_pos=5000,
    seed=0,
):
    """Generate scores from normal distributions

    Parameters
    ----------
    mean_neg : float
        Mean for negative scores
    mean_pos : float
        Mean for positive scores
    sigma_neg : float
        STDev for negative scores
    sigma_pos : float
        STDev for positive scores
    n_pos: int
        The number of positive scores generated
    n_neg: int
        The number of negative scores generated
    seed: int
        A value to initialize the Random Number generator. Giving the same
        value (or not specifying 'seed') on two different calls will generate
        the same arrays of scores.

    Returns
    -------
    neg_scores : :py:class:`numpy.ndarray`
        Negatives scores, 1D array of length ``n_neg``
    pos_scores : :py:class:`numpy.ndarray`
        Positive scores, 1D array of length ``n_pos``
    """
    logger.debug("Initializing RNG.")
    # A private legacy ``RandomState`` produces exactly the same stream as
    # ``numpy.random.seed(seed)`` followed by ``numpy.random.normal(...)``,
    # but leaves the process-wide NumPy generator untouched for the caller.
    rng = numpy.random.RandomState(seed)

    logger.info(
        "Generating %s negative and %s positive scores.", n_neg, n_pos
    )

    # Negatives are drawn first, then positives, from the same stream; the
    # order matters for reproducibility of the generated values.
    neg_scores = rng.normal(loc=mean_neg, scale=sigma_neg, size=n_neg)
    pos_scores = rng.normal(loc=mean_pos, scale=sigma_pos, size=n_pos)

    return neg_scores, pos_scores

def write_scores_to_file(
    neg,
    pos,
    filename,
    n_subjects=5,
    n_probes_per_subject=5,
    n_unknown_subjects=0,
    neg_unknown=None,
    to_csv=True,
    five_col=False,
    metadata={"meta0": "data0", "meta1": "data1"},
):
    """Writes score distributions

    Positive scores are written first, then negative scores, then (when
    given) the scores of unknown subjects. Subjects are named ``x<i>``,
    unknown subjects ``u<i>`` and probes ``<subject>_<j>``.

    Parameters
    ----------
    neg : :py:class:`numpy.ndarray`
        Scores for negative samples.
    pos : :py:class:`numpy.ndarray`
        Scores for positive samples.
    filename : str
        The path to write the score to.
    n_subjects: int
        Number of different subjects
    n_probes_per_subject: int
        Number of different samples used as probe for each subject
    n_unknown_subjects: int
        The number of unknown (no registered model) subjects
    neg_unknown: None or list
        The scores of unknown subjects
    to_csv: bool
        Use the CSV format, else the legacy 4 or 5 columns format.
    five_col : bool
        If 5-column format, else 4-column
    metadata : dict
        Extra columns appended to every CSV row: the keys extend the header,
        the values are repeated on each line. Ignored in the legacy formats.
        The default dictionary is only read, never modified.
    """
    logger.debug("Creating result directories ('%s').", filename)
    # A bare file name has an empty dirname, which os.makedirs rejects.
    out_dir = os.path.dirname(filename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    s_subjects = ["x%d" % i for i in range(n_subjects)]
    meta_values = list(metadata.values())

    logger.debug("Writing scores to files.")

    # The csv module expects files opened with newline="" so that its own
    # line terminator is not translated a second time by the text layer.
    with open(filename, "wt", newline="" if to_csv else None) as f:
        csv_writer = csv.writer(f) if to_csv else None
        if to_csv:
            csv_writer.writerow(
                ["bio_ref_subject_id", "probe_subject_id", "key", "score"]
                + list(metadata.keys())
            )

        def write_line(ref, probe, probe_id, score):
            # Emit one comparison in the selected output format.
            if to_csv:
                csv_writer.writerow([ref, probe, probe_id, score] + meta_values)
                return
            # In the 5-column format the extra second column is "d<ref>" and
            # must be space-separated from the probe subject that follows.
            middle = " d" + ref + " " if five_col else " "
            f.write("%s%s%s %s %f\n" % (ref, middle, probe, probe_id, score))

        # Generate one line per probe (unless "--force-count" specified)
        logger.debug("Writing positive scores.")
        for i, score in enumerate(pos):
            s_name = s_subjects[(i // n_probes_per_subject) % n_subjects]
            probe_id = "%s_%d" % (s_name, i % n_probes_per_subject)
            write_line(s_name, s_name, probe_id, score)

        # Generate one line per probe against each ref (unless "--force-count" specified)
        logger.debug("Writing negative scores.")
        n_impostors = n_subjects - 1
        for i, score in enumerate(neg):
            ref_idx = (i // (n_probes_per_subject * n_impostors)) % n_subjects
            # Position of the probe subject in the subject list with the
            # reference removed: entries at or after the reference are
            # shifted by one, which skips the reference itself.
            imp_idx = (i // n_probes_per_subject) % n_impostors
            probe_idx = imp_idx if imp_idx < ref_idx else imp_idx + 1
            ref = s_subjects[ref_idx]
            probe = s_subjects[probe_idx]
            probe_id = "%s_%d" % (probe, i % n_probes_per_subject)
            write_line(ref, probe, probe_id, score)

        logger.debug("Writing unknown scores.")
        if neg_unknown is not None:
            s_unknown_subjects = ["u%d" % i for i in range(n_unknown_subjects)]
            for i, score in enumerate(neg_unknown):
                ref = s_subjects[
                    (i // (n_probes_per_subject * n_unknown_subjects))
                    % n_subjects
                ]
                probe = s_unknown_subjects[
                    (i // n_probes_per_subject) % n_unknown_subjects
                ]
                probe_id = "%s_%d" % (probe, i % n_probes_per_subject)
                write_line(ref, probe, probe_id, score)

168

169

@click.command(
    epilog="""
Scores generation examples:

Output 'scores-dev.csv' and 'scores-eval.csv' in a new folder 'generated_scores/':

  $ bob bio gen ./generated_scores

Output scores similar to a system evaluated on the AT&T dataset dev group:

  $ bob bio gen -s 20 -p 5 ./generated_scores

Output a given number of scores in each file:

  $ bob bio gen -f --n-neg 500 --n-pos 100 ./generated_scores

Include unknown subjects scores:

  $ bob bio gen -s 5 -u 2 ./generated_scores

Change the mean and standard deviation of the scores distributions:

  $ bob bio gen -mm 1 -sp 0.3 -mnm -1 -sn 0.5 ./generated_scores

You can observe the distributions histograms in a pdf file with:

  $ bob bio hist -e ./generated_scores/scores-{dev,eval}.csv -o hist_gen.pdf
"""
)
@click.argument("outdir")
@click.option(
    "-mm",
    "--mean-match",
    default=10,
    type=click.FLOAT,
    show_default=True,
    help="Mean for the positive scores distribution",
)
@click.option(
    "-mnm",
    "--mean-non-match",
    default=-10,
    type=click.FLOAT,
    show_default=True,
    help="Mean for the negative scores distribution",
)
@click.option(
    "-p",
    "--n-probes-per-subject",
    default=5,
    type=click.INT,
    show_default=True,
    help="Number of probes per subject",
)
@click.option(
    "-s",
    "--n-subjects",
    default=50,
    type=click.INT,
    show_default=True,
    help="Number of subjects",
)
@click.option(
    "-sp",
    "--sigma-positive",
    default=10,
    type=click.FLOAT,
    show_default=True,
    help="Standard deviation for the positive score distributions",
)
@click.option(
    "-sn",
    "--sigma-negative",
    default=10,
    type=click.FLOAT,
    show_default=True,
    help="Standard deviation for the negative score distributions",
)
@click.option(
    "-u",
    "--n-unknown-subjects",
    default=0,
    type=click.INT,
    show_default=True,
    help="Number of unknown subjects (useful for open-set plots)",
)
@click.option(
    "-f",
    "--force-count",
    "force_count",
    is_flag=True,
    help="Use --n-pos and --n-neg amounts instead of the subject and sample counts",
)
@click.option(
    "--n-pos",
    "n_pos",
    default=5000,
    type=click.INT,
    show_default=True,
    help="Number of Positive verifications (number of lines in the file)",
)
@click.option(
    "--n-neg",
    "n_neg",
    default=5000,
    type=click.INT,
    show_default=True,
    help="Number of Negative verifications (number of lines in the file)",
)
@click.option(
    "--n-unk",
    "n_unk",
    default=5000,
    type=click.INT,
    show_default=True,
    help="Number of Unknown verifications (number of lines in the file)",
)
@click.option("--csv/--legacy", default=True, show_default=True)
@click.option("--five-col/--four-col", default=False, show_default=True)
@verbosity_option(logger=logger)
def gen(
    outdir,
    mean_match,
    mean_non_match,
    n_probes_per_subject,
    n_subjects,
    sigma_positive,
    sigma_negative,
    n_unknown_subjects,
    csv,
    five_col,
    force_count,
    n_pos,
    n_neg,
    n_unk,
    **kwargs,
):
    """Generate random scores.

    Generates random scores in CSV (default), 4col or 5col format. The scores
    are generated using Gaussian distributions whose mean and standard
    deviation are input parameters. The generated scores can be used as
    hypothetical datasets.

    This command generates scores relative to the number of subjects and
    probes per subjects, unless the -f flag is set. In that case, the --n-pos,
    --n-neg and --n-unk options are used as number of genuine, impostor and
    unknown-subject comparisons. Unknown-subject scores are only written when
    -u is greater than zero.
    """
    # NOTE: inside this function ``csv`` is the boolean --csv/--legacy flag
    # (click derives the parameter name from the option), not the csv module.

    # Compute the number of verifications needed
    if force_count:
        neg_count, pos_count, unknown_count = n_neg, n_pos, n_unk
    else:
        # One reference (model), and `n_probes_per_subject` probes per subject
        neg_count = n_subjects * n_probes_per_subject * (n_subjects - 1)
        pos_count = n_probes_per_subject * n_subjects
        unknown_count = n_unknown_subjects * n_subjects * n_probes_per_subject

    # Generate the data; dev and eval only differ by the RNG seed
    logger.info("Generating dev scores.")
    neg_dev, pos_dev = gen_score_distr(
        mean_non_match,
        mean_match,
        sigma_negative,
        sigma_positive,
        n_neg=neg_count,
        n_pos=pos_count,
        seed=0,
    )
    logger.info("Generating eval scores.")
    neg_eval, pos_eval = gen_score_distr(
        mean_non_match,
        mean_match,
        sigma_negative,
        sigma_positive,
        n_neg=neg_count,
        n_pos=pos_count,
        seed=1,
    )

    # For simplicity I will use the same distribution for dev-eval
    if n_unknown_subjects:
        logger.info("Generating unknown scores.")
        # Only the negative half of the result is needed for unknown probes
        neg_unknown, _ = gen_score_distr(
            mean_non_match,
            mean_match,
            sigma_negative,
            sigma_positive,
            n_neg=unknown_count,
            n_pos=0,
            seed=2,
        )
    else:
        neg_unknown = None

    # Write the data into files; the file names keep the ".csv" extension
    # whichever output format is selected
    logger.info("Saving results.")
    write_scores_to_file(
        neg_dev,
        pos_dev,
        os.path.join(outdir, "scores-dev.csv"),
        n_subjects,
        n_probes_per_subject,
        n_unknown_subjects,
        neg_unknown,
        csv,
        five_col,
    )

    write_scores_to_file(
        neg_eval,
        pos_eval,
        os.path.join(outdir, "scores-eval.csv"),
        n_subjects,
        n_probes_per_subject,
        n_unknown_subjects,
        neg_unknown,
        csv,
        five_col,
    )