Coverage for src/bob/bio/base/script/gen.py: 93%

86 statements  

« prev     ^ index     » next       coverage.py v7.6.0, created at 2024-07-12 22:34 +0200

1"""Generate random scores. 

2""" 

3import csv 

4import logging 

5import os 

6 

7import click 

8import numpy 

9 

10from clapper.click import verbosity_option 

11 

12logger = logging.getLogger(__name__) 

13 

14 

def gen_score_distr(
    mean_neg,
    mean_pos,
    sigma_neg=10,
    sigma_pos=10,
    n_neg=5000,
    n_pos=5000,
    seed=0,
):
    """Generate negative and positive scores from two normal distributions.

    Parameters
    ----------
    mean_neg : float
        Mean for negative scores
    mean_pos : float
        Mean for positive scores
    sigma_neg : float
        Standard deviation for negative scores
    sigma_pos : float
        Standard deviation for positive scores
    n_neg : int
        The number of negative scores generated
    n_pos : int
        The number of positive scores generated
    seed : int
        A value to initialize the random number generator. Giving the same
        value (or not specifying ``seed``) on two different calls will
        generate the same scores.

    Returns
    -------
    neg_scores : :py:class:`numpy.ndarray`
        Negative scores (``n_neg`` values)
    pos_scores : :py:class:`numpy.ndarray`
        Positive scores (``n_pos`` values)
    """

    logger.debug("Initializing RNG.")
    # Use a private legacy generator instead of ``numpy.random.seed``: it
    # produces the same stream for a given seed, but does not reseed the
    # process-wide NumPy RNG behind the caller's back.
    rng = numpy.random.RandomState(seed)

    logger.info(f"Generating {n_neg} negative and {n_pos} positive scores.")

    # Negatives are drawn first so the values match the historical order.
    neg_scores = rng.normal(loc=mean_neg, scale=sigma_neg, size=n_neg)
    pos_scores = rng.normal(loc=mean_pos, scale=sigma_pos, size=n_pos)

    return neg_scores, pos_scores

62 

63 

def write_scores_to_file(
    neg,
    pos,
    filename,
    n_subjects=5,
    n_probes_per_subject=5,
    n_unknown_subjects=0,
    neg_unknown=None,
    to_csv=True,
    five_col=False,
    metadata=None,
):
    """Write score distributions to a file.

    Positive scores are written first, then negative scores, then (when
    given) the scores of unknown subjects.

    Parameters
    ----------
    neg : :py:class:`numpy.ndarray`
        Scores for negative samples.
    pos : :py:class:`numpy.ndarray`
        Scores for positive samples.
    filename : str
        The path to write the scores to. Missing parent directories are
        created; a bare file name (no directory part) is accepted.
    n_subjects : int
        Number of different subjects
    n_probes_per_subject : int
        Number of different samples used as probe for each subject
    n_unknown_subjects : int
        The number of unknown (no registered model) subjects
    neg_unknown : None or list
        The scores of unknown subjects; nothing is written for them when
        ``None``.
    to_csv : bool
        Use the CSV format, else the legacy 4 or 5 columns format.
    five_col : bool
        If True use the 5-column legacy format, else the 4-column one.
        Ignored when ``to_csv`` is True.
    metadata : None or dict
        Extra columns appended to every CSV row (keys go to the header,
        values to each row). Defaults to
        ``{"meta0": "data0", "meta1": "data1"}``.
    """
    if metadata is None:
        metadata = {"meta0": "data0", "meta1": "data1"}

    logger.debug(f"Creating result directories ('{filename}').")
    out_dir = os.path.dirname(filename)
    if out_dir:
        # ``os.makedirs("")`` raises, so only create a directory when the
        # path actually has a directory component.
        os.makedirs(out_dir, exist_ok=True)
    s_subjects = ["x%d" % i for i in range(n_subjects)]

    logger.debug("Writing scores to files.")

    # The csv module handles line endings itself and requires newline="".
    with open(filename, "wt", newline="" if to_csv else None) as f:
        if to_csv:
            csv_writer = csv.writer(f)
            csv_writer.writerow(
                ["bio_ref_subject_id", "probe_subject_id", "key", "score"]
                + list(metadata.keys())
            )
            meta_values = list(metadata.values())

            def write_row(ref, probe, probe_id, score):
                csv_writer.writerow([ref, probe, probe_id, score] + meta_values)

        else:

            def write_row(ref, probe, probe_id, score):
                # The 5-column format inserts a claimed-id column ("d<ref>")
                # which must be space-separated on both sides.
                claimed = " d" + ref + " " if five_col else " "
                f.write(
                    "%s%s%s %s %f\n" % (ref, claimed, probe, probe_id, score)
                )

        # Generate one line per probe (unless "--force-count" specified)
        logger.debug("Writing positive scores.")
        for i, score in enumerate(pos):
            s_name = s_subjects[(i // n_probes_per_subject) % n_subjects]
            probe_id = "%s_%d" % (s_name, i % n_probes_per_subject)
            write_row(s_name, s_name, probe_id, score)

        # Generate one line per probe against each ref (unless "--force-count" specified)
        logger.debug("Writing negative scores.")
        n_impostors = n_subjects - 1
        for i, score in enumerate(neg):
            ref_idx = (i // (n_probes_per_subject * n_impostors)) % n_subjects
            # Index among the subjects other than the reference: positions at
            # or after the reference are shifted by one to skip it (ignore pos).
            probe_idx = (i // n_probes_per_subject) % n_impostors
            if probe_idx >= ref_idx:
                probe_idx += 1
            ref = s_subjects[ref_idx]
            probe = s_subjects[probe_idx]
            probe_id = "%s_%d" % (probe, i % n_probes_per_subject)
            write_row(ref, probe, probe_id, score)

        logger.debug("Writing unknown scores.")
        if neg_unknown is not None:
            s_unknown_subjects = ["u%d" % i for i in range(n_unknown_subjects)]
            for i, score in enumerate(neg_unknown):
                ref = s_subjects[
                    (i // (n_probes_per_subject * n_unknown_subjects))
                    % n_subjects
                ]
                probe = s_unknown_subjects[
                    (i // n_probes_per_subject) % n_unknown_subjects
                ]
                probe_id = "%s_%d" % (probe, i % n_probes_per_subject)
                write_row(ref, probe, probe_id, score)

168 

169 

# Command-line entry point: draws dev/eval (and optionally unknown-subject)
# scores and writes them to ``<outdir>/scores-dev.csv`` and
# ``<outdir>/scores-eval.csv``.
@click.command(
    epilog="""
Scores generation examples:

Output 'scores-dev.csv' and 'scores-eval.csv' in a new folder 'generated_scores/':

  $ bob bio gen ./generated_scores

Output scores similar to a system evaluated on the AT&T dataset dev group:

  $ bob bio gen -s 20 -p 5 ./generated_scores

Output a given number of scores in each file:

  $ bob bio gen -f --n-neg 500 --n-pos 100 ./generated_scores

Include unknown subjects scores:

  $ bob bio gen -s 5 -u 2 ./generated_scores

Change the mean and standard deviation of the scores distributions:

  $ bob bio gen -mm 1 -sp 0.3 -mnm -1 -sn 0.5 ./generated_scores

You can observe the distributions histograms in a pdf file with:

  $ bob bio hist -e ./generated_scores/scores-{dev,eval}.csv -o hist_gen.pdf
"""
)
@click.argument("outdir")
@click.option(
    "-mm",
    "--mean-match",
    default=10,
    type=click.FLOAT,
    show_default=True,
    help="Mean for the positive scores distribution",
)
@click.option(
    "-mnm",
    "--mean-non-match",
    default=-10,
    type=click.FLOAT,
    show_default=True,
    help="Mean for the negative scores distribution",
)
@click.option(
    "-p",
    "--n-probes-per-subject",
    default=5,
    type=click.INT,
    show_default=True,
    help="Number of probes per subject",
)
@click.option(
    "-s",
    "--n-subjects",
    default=50,
    type=click.INT,
    show_default=True,
    help="Number of subjects",
)
@click.option(
    "-sp",
    "--sigma-positive",
    default=10,
    type=click.FLOAT,
    show_default=True,
    help="Standard deviation for the positive score distributions",
)
@click.option(
    "-sn",
    "--sigma-negative",
    default=10,
    type=click.FLOAT,
    show_default=True,
    help="Standard deviation for the negative score distributions",
)
@click.option(
    "-u",
    "--n-unknown-subjects",
    default=0,
    type=click.INT,
    show_default=True,
    help="Number of unknown subjects (useful for open-set plots)",
)
@click.option(
    "-f",
    "--force-count",
    "force_count",
    is_flag=True,
    help="Use --n-pos and --n-neg amounts instead of the subject and sample counts",
)
@click.option(
    "--n-pos",
    "n_pos",
    default=5000,
    type=click.INT,
    show_default=True,
    help="Number of Positive verifications (number of lines in the file)",
)
@click.option(
    "--n-neg",
    "n_neg",
    default=5000,
    type=click.INT,
    show_default=True,
    help="Number of Negative verifications (number of lines in the file)",
)
@click.option(
    "--n-unk",
    "n_unk",
    default=5000,
    type=click.INT,
    show_default=True,
    help="Number of Unknown verifications (number of lines in the file)",
)
@click.option("--csv/--legacy", default=True, show_default=True)
@click.option("--five-col/--four-col", default=False, show_default=True)
@verbosity_option(logger=logger)
def gen(
    outdir,
    mean_match,
    mean_non_match,
    n_probes_per_subject,
    n_subjects,
    sigma_positive,
    sigma_negative,
    n_unknown_subjects,
    csv,
    five_col,
    force_count,
    n_pos,
    n_neg,
    n_unk,
    **kwargs,
):
    """Generate random scores.

    Generates random scores in CSV (default), 4col or 5col format. The scores
    are generated using Gaussian distributions whose mean and standard
    deviation are input parameters. The generated scores can be used as
    hypothetical datasets.

    This command generates scores relative to the number of subjects and
    probes per subjects, unless the -f flag is set. In that case, the --n-pos
    and --n-neg options are used as number of genuine and impostor
    comparisons.
    """
    # NOTE: the ``csv`` parameter (the --csv/--legacy flag) shadows the
    # ``csv`` module inside this function; only the flag is used here.
    # ``kwargs`` collects any extra values passed by the decorators and is
    # not used.

    # Compute the number of verifications needed
    if force_count:
        neg_count, pos_count, unknown_count = n_neg, n_pos, n_unk
    else:
        # One reference (model), and `n_probes_per_subject` probes per subject
        pos_count = n_probes_per_subject * n_subjects
        neg_count = n_subjects * n_probes_per_subject * (n_subjects - 1)
        unknown_count = n_unknown_subjects * n_subjects * n_probes_per_subject

    # Generate the data; each group has its own fixed seed so that the dev
    # and eval sets differ but are reproducible.
    scores = {}
    for group, seed in (("dev", 0), ("eval", 1)):
        logger.info(f"Generating {group} scores.")
        scores[group] = gen_score_distr(
            mean_non_match,
            mean_match,
            sigma_negative,
            sigma_positive,
            n_neg=neg_count,
            n_pos=pos_count,
            seed=seed,
        )

    # For simplicity the same unknown-subject scores are used for dev and eval
    if n_unknown_subjects:
        logger.info("Generating unknown scores.")
        neg_unknown, _ = gen_score_distr(
            mean_non_match,
            mean_match,
            sigma_negative,
            sigma_positive,
            n_neg=unknown_count,
            n_pos=0,
            seed=2,
        )
    else:
        neg_unknown = None

    # Write the data into files (the file names end in ".csv" whatever the
    # selected format is)
    logger.info("Saving results.")
    for group, (neg_scores, pos_scores) in scores.items():
        write_scores_to_file(
            neg_scores,
            pos_scores,
            os.path.join(outdir, f"scores-{group}.csv"),
            n_subjects,
            n_probes_per_subject,
            n_unknown_subjects,
            neg_unknown,
            csv,
            five_col,
        )