#!/usr/bin/env python
# coding=utf-8
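"""Significance analysis between two systems' predictions on the same dataset.

Implements the ``bob binseg significance`` command: it compares the sliding
window performances of two systems, subject to a priori threshold tuning, and
writes figures and a textual summary of the statistical analysis.
"""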

import gzip
import logging
import os
import pickle
import sys

import click
import numpy

from bob.extension.scripts.click_helper import (
    ConfigCommand,
    ResourceOption,
    verbosity_option,
)

from ..engine.significance import (
    PERFORMANCE_FIGURES,
    index_of_outliers,
    sliding_window_performances,
    visual_performances,
    write_analysis_figures,
    write_analysis_text,
)
from .evaluate import _validate_threshold
from .evaluate import run as run_evaluation

logger = logging.getLogger(__name__)


def _eval_sliding_windows(
    system_name,
    threshold,
    evaluate,
    preddir,
    dataset,
    steps,
    size,
    stride,
    outdir,
    figure,
    nproc,
    checkpointdir,
):
    """Calculates the sliding window performances on a dataset

    Parameters
    ----------

    system_name : str
        The name of the current system being analyzed

    threshold : :py:class:`float`, :py:class:`str`
        This number is used to define positives and negatives from probability
        maps, and to report F1-scores (a priori). By default, we expect a set
        named 'validation' to be available at the input data. If that is not
        the case, we use 'train', if available. You may provide the name of
        another dataset to be used for threshold tuning otherwise. If not
        set, or if a string is given, threshold tuning is done per system,
        individually. Optionally, you may also provide a floating-point number
        between [0.0, 1.0] as the threshold to use for both systems.

    evaluate : str
        Name of the dataset key to use from ``dataset`` to evaluate
        (typically, ``test``)

    preddir : str
        Root path to the predictions generated by system ``system_name``. The
        final subpath inside ``preddir`` that will be used will have the value
        of this variable suffixed with the value of ``evaluate``. We will
        search for ``<preddir>/<evaluate>/<stems>.hdf5``.

    dataset : dict
        A dictionary mapping string keys to
        :py:class:`torch.utils.data.dataset.Dataset` instances

    steps : int
        The number of threshold steps to consider when evaluating the highest
        possible F1-score on train/test data.

    size : tuple
        Two values indicating the size of windows to be used for the sliding
        window analysis. The values represent height and width respectively.

    stride : tuple
        Two values indicating the stride of windows to be used for the sliding
        window analysis. The values represent height and width respectively.

    outdir : str
        Path where to store visualizations. If set to ``None``, then do not
        store performance visualizations.

    figure : str
        The name of a performance figure (e.g. ``f1_score``, ``jaccard``, or
        ``accuracy``) to use when comparing performances

    nproc : int
        Sets the number of parallel processes to use when running using
        multiprocessing. A value of zero uses all reported cores. A value of
        ``1`` avoids multiprocessing altogether and runs all chores in the
        current processing context.

    checkpointdir : str
        If set to a string (instead of ``None``), then stores a cached version
        of the sliding window performances on disk, for a particular system.


    Returns
    -------

    d : dict
        A dictionary in which keys are filename stems and values are
        dictionaries with the following contents:

        ``winperf`` : numpy.ndarray
            An array with all the sliding window performances aggregated,
            for all input images.

        ``n`` : numpy.ndarray
            A 2D numpy array containing the number of performance scores for
            every pixel in the original image

        ``avg`` : numpy.ndarray
            A 2D numpy array containing the average performances for every
            pixel on the input image, considering the sliding window sizes and
            strides applied to the image

        ``std`` : numpy.ndarray
            A 2D numpy array containing the (unbiased) standard deviations of
            the provided performance figure, for every pixel on the input
            image, considering the sliding window sizes and strides applied to
            the image

    """

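    # When a checkpoint directory is given, performances are cached on disk as
    # gzip-compressed pickles, keyed by system, evaluation set, threshold,
    # window geometry and figure, so repeated runs can skip recomputation.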

    if checkpointdir is not None:
        chkpt_fname = os.path.join(
            checkpointdir,
            f"{system_name}-{evaluate}-{threshold}-"
            f"{size[0]}x{size[1]}+{stride[0]}x{stride[1]}-{figure}.pkl.gz",
        )
        os.makedirs(os.path.dirname(chkpt_fname), exist_ok=True)
        if os.path.exists(chkpt_fname):
            logger.info(f"Loading checkpoint from {chkpt_fname}...")
            # loads and returns checkpoint from file
            try:
                with gzip.GzipFile(chkpt_fname, "r") as f:
                    return pickle.load(f)
            except EOFError as e:
                logger.warning(
                    f"Could not load sliding window performance "
                    f"from {chkpt_fname}: {e}. Calculating..."
                )
        else:
            logger.debug(
                f"Checkpoint not available at {chkpt_fname}. Calculating..."
            )
    else:
        chkpt_fname = None

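    # A string-valued threshold names a dataset split: the a priori threshold
    # is then tuned on that split, by maximizing the F1-score over `steps`
    # candidate values, before being applied to the evaluation set.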

    if not isinstance(threshold, float):

        assert threshold in dataset, f"No dataset named '{threshold}'"

        logger.info(
            f"Evaluating threshold on '{threshold}' set for "
            f"'{system_name}' using {steps} steps"
        )
        threshold = run_evaluation(
            dataset[threshold], threshold, preddir, steps=steps
        )
        logger.info(f"Set --threshold={threshold:.5f} for '{system_name}'")

    # for a given threshold on each system, calculate sliding window performances
    logger.info(
        f"Evaluating sliding window '{figure}' on '{evaluate}' set for "
        f"'{system_name}' using windows of size {size} and stride {stride}"
    )

    retval = sliding_window_performances(
        dataset,
        evaluate,
        preddir,
        threshold,
        size,
        stride,
        figure,
        nproc,
        outdir,
    )

    # cache sliding window performance for later use, if necessary
    if chkpt_fname is not None:
        logger.debug(f"Storing checkpoint at {chkpt_fname}...")
        with gzip.GzipFile(chkpt_fname, "w") as f:
            pickle.dump(retval, f)

    return retval


def _eval_differences(
    names,
    perfs,
    evaluate,
    dataset,
    size,
    stride,
    outdir,
    figure,
    nproc,
    checkpointdir,
):
    """Evaluates differences in sliding window performances between two systems

    Parameters
    ----------

    names : :py:class:`tuple` of :py:class:`str`
        Names of the first and second systems

    perfs : :py:class:`tuple` of :py:class:`dict`
        Dictionaries for the sliding window performances of each system, as
        returned by :py:func:`_eval_sliding_windows`

    evaluate : str
        Name of the dataset key to use from ``dataset`` to evaluate
        (typically, ``test``)

    dataset : dict
        A dictionary mapping string keys to
        :py:class:`torch.utils.data.dataset.Dataset` instances

    size : tuple
        Two values indicating the size of windows to be used for sliding
        window analysis. The values represent height and width respectively.

    stride : tuple
        Two values indicating the stride of windows to be used for sliding
        window analysis. The values represent height and width respectively.

    outdir : str
        If set to ``None``, then do not output performance visualizations.
        Otherwise, in directory ``outdir``, dumps the visualizations for the
        performance differences between both systems.

    figure : str
        The name of a performance figure (e.g. ``f1_score``, or ``jaccard``)
        to use when comparing performances

    nproc : int
        Sets the number of parallel processes to use when running using
        multiprocessing. A value of zero uses all reported cores. A value of
        ``1`` avoids multiprocessing altogether and runs all chores in the
        current processing context.

    checkpointdir : str
        If set to a string (instead of ``None``), then stores a cached version
        of the sliding window performances on disk, for a particular difference
        between systems.


    Returns
    -------

    d : dict
        A dictionary representing sliding window performance differences
        across all files and sliding windows. The format of this is similar
        to the individual entries of ``perfs``.

    """

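    # Same gzip-pickle checkpointing scheme as in _eval_sliding_windows(),
    # except that the cache key combines both system names.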

    if checkpointdir is not None:
        chkpt_fname = os.path.join(
            checkpointdir,
            f"{names[0]}-{names[1]}-{evaluate}-"
            f"{size[0]}x{size[1]}+{stride[0]}x{stride[1]}-{figure}.pkl.gz",
        )
        os.makedirs(os.path.dirname(chkpt_fname), exist_ok=True)
        if os.path.exists(chkpt_fname):
            logger.info(f"Loading checkpoint from {chkpt_fname}...")
            # loads and returns checkpoint from file
            try:
                with gzip.GzipFile(chkpt_fname, "r") as f:
                    return pickle.load(f)
            except EOFError as e:
                logger.warning(
                    f"Could not load sliding window performance "
                    f"from {chkpt_fname}: {e}. Calculating..."
                )
        else:
            logger.debug(
                f"Checkpoint not available at {chkpt_fname}. Calculating..."
            )
    else:
        chkpt_fname = None

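    # Element-wise difference of the raw per-window performance arrays of
    # both systems, keyed by filename stem (both dictionaries are assumed to
    # share the same stems, as they derive from the same dataset).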

    perf_diff = {
        k: perfs[0][k]["winperf"] - perfs[1][k]["winperf"] for k in perfs[0]
    }

    # for the given per-window differences, calculate (and optionally
    # visualize) per-pixel performance differences
    logger.info(
        f"Evaluating sliding window '{figure}' differences on '{evaluate}' "
        f"set on '{names[0]}-{names[1]}' using windows of size {size} and "
        f"stride {stride}"
    )

    retval = visual_performances(
        dataset,
        evaluate,
        perf_diff,
        size,
        stride,
        figure,
        nproc,
        outdir,
    )

    # cache sliding window performance for later use, if necessary
    if chkpt_fname is not None:
        logger.debug(f"Storing checkpoint at {chkpt_fname}...")
        with gzip.GzipFile(chkpt_fname, "w") as f:
            pickle.dump(retval, f)

    return retval


@click.command(
    entry_point_group="bob.ip.binseg.config",
    cls=ConfigCommand,
    epilog="""Examples:

\b
    1. Runs a significance test based on the calculated predictions of two
       different systems, on the **same** dataset:
\b
       $ bob binseg significance -vv drive --names system1 system2 --predictions=path/to/predictions/system-1 path/to/predictions/system-2
\b
    2. By default, we use a "validation" dataset, if available, to infer the
       a priori threshold for the comparison of two systems. Otherwise, you
       may need to specify the name of a set to be used as validation set for
       choosing a threshold. The same goes for the set to be used for testing
       the hypothesis - by default we use the "test" dataset, if available,
       otherwise you must specify one:
\b
       $ bob binseg significance -vv drive --names system1 system2 --predictions=path/to/predictions/system-1 path/to/predictions/system-2 --threshold=train --evaluate=alternate-test
""",
)
@click.option(
    "--names",
    "-n",
    help="Names of the two systems to compare",
    nargs=2,
    required=True,
    type=str,
    cls=ResourceOption,
)
@click.option(
    "--predictions",
    "-p",
    help="Paths where the predictions of systems 1 and 2 are currently "
    "stored. You may also input predictions from a second annotator. This "
    "application will adequately handle it.",
    nargs=2,
    required=True,
    type=click.Path(exists=True, file_okay=False, dir_okay=True),
    cls=ResourceOption,
)
@click.option(
    "--dataset",
    "-d",
    help="A dictionary mapping string keys to "
    "torch.utils.data.dataset.Dataset instances",
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--threshold",
    "-t",
    help="This number is used to define positives and negatives from "
    "probability maps, and report F1-scores (a priori). By default, we "
    "expect a set named 'validation' to be available at the input data. "
    "If that is not the case, we use 'train', if available. You may provide "
    "the name of another dataset to be used for threshold tuning otherwise. "
    "If not set, or if a string is given, threshold tuning is done per "
    "system, individually. Optionally, you may also provide a floating-point "
    "number between [0.0, 1.0] as the threshold to use for both systems.",
    default="validation",
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--evaluate",
    "-e",
    help="Name of the dataset to evaluate",
    default="test",
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--steps",
    "-S",
    help="Number of threshold steps to consider when evaluating the highest "
    "possible F1-score on train/test data.",
    default=1000,
    type=int,
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--size",
    "-s",
    help="This is a tuple with two values indicating the size of windows to "
    "be used for sliding window analysis. The values represent height and "
    "width respectively.",
    default=(128, 128),
    nargs=2,
    type=int,
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--stride",
    "-r",  # short flag: -t is already taken by --threshold
    help="This is a tuple with two values indicating the stride of windows to "
    "be used for sliding window analysis. The values represent height and "
    "width respectively.",
    default=(32, 32),
    nargs=2,
    type=int,
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--figure",
    "-f",
    help="The name of a performance figure (e.g. f1_score, or jaccard) to "
    "use when comparing performances",
    default="accuracy",
    type=str,
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--output-folder",
    "-o",
    help="Path where to store visualizations",
    required=False,
    type=click.Path(),
    show_default=True,
    cls=ResourceOption,
)
@click.option(
    "--remove-outliers/--no-remove-outliers",
    "-R",
    help="If set, removes outliers from both score distributions before "
    "running statistical analysis. Outlier removal follows a 1.5 IQR range "
    "check from the difference in figures between both systems and assumes "
    "most of the distribution is contained within that range (like in a "
    "normal distribution)",
    default=False,
    required=True,
    show_default=True,
    cls=ResourceOption,
)
@click.option(
    "--remove-zeros/--no-remove-zeros",
    "-Z",  # short flag: -R is already taken by --remove-outliers
    help="If set, removes instances from the statistical analysis in which "
    "both systems had a performance equal to zero.",
    default=False,
    required=True,
    show_default=True,
    cls=ResourceOption,
)
@click.option(
    "--parallel",
    "-x",
    help="Sets the number of parallel processes to use when running using "
    "multiprocessing. A value of zero uses all reported cores.",
    default=1,
    type=int,
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--checkpoint-folder",
    "-k",
    help="Path where to store checkpointed versions of sliding window "
    "performances",
    required=False,
    type=click.Path(),
    show_default=True,
    cls=ResourceOption,
)
@verbosity_option(cls=ResourceOption)
def significance(
    names,
    predictions,
    dataset,
    threshold,
    evaluate,
    steps,
    size,
    stride,
    figure,
    output_folder,
    remove_outliers,
    remove_zeros,
    parallel,
    checkpoint_folder,
    **kwargs,
):
    """Evaluates how significantly different two models are on the same dataset

    This application calculates the significance of the results of two models
    operating on the same dataset, subject to a priori threshold tuning.
    """

    # minimal validation to start up
    threshold = _validate_threshold(threshold, dataset)
    assert evaluate in dataset, f"No dataset named '{evaluate}'"

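    # Evaluate each system independently, with the same threshold policy and
    # window geometry; per-system visualizations (if requested) are written
    # to a subfolder named after each system.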

    perf1 = _eval_sliding_windows(
        names[0],
        threshold,
        evaluate,
        predictions[0],
        dataset,
        steps,
        size,
        stride,
        (
            output_folder
            if output_folder is None
            else os.path.join(output_folder, names[0])
        ),
        figure,
        parallel,
        checkpoint_folder,
    )

    perf2 = _eval_sliding_windows(
        names[1],
        threshold,
        evaluate,
        predictions[1],
        dataset,
        steps,
        size,
        stride,
        (
            output_folder
            if output_folder is None
            else os.path.join(output_folder, names[1])
        ),
        figure,
        parallel,
        checkpoint_folder,
    )

    # perf_diff = _eval_differences(
    #     names,
    #     (perf1, perf2),
    #     evaluate,
    #     dataset,
    #     size,
    #     stride,
    #     (
    #         output_folder
    #         if output_folder is None
    #         else os.path.join(output_folder, "diff")
    #     ),
    #     figure,
    #     parallel,
    #     checkpoint_folder,
    # )

    # loads all scores of the chosen figure, for the given threshold
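    # `winperf` is assumed to be indexed by performance figure along its first
    # axis, in the same order as PERFORMANCE_FIGURES; flattening then yields
    # one paired score per sliding window, pooled over all input images.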

    stems = list(perf1.keys())
    figindex = PERFORMANCE_FIGURES.index(figure)
    da = numpy.array([perf1[k]["winperf"][figindex] for k in stems]).flatten()
    db = numpy.array([perf2[k]["winperf"][figindex] for k in stems]).flatten()
    diff = da - db

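    # Iteratively trims outliers from the paired differences:
    # `index_of_outliers` is assumed to return a boolean mask flagging entries
    # outside the 1.5 IQR range (see the --remove-outliers help above); we
    # repeat until nothing is flagged, keeping `da` and `db` aligned with
    # `diff`.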

    while remove_outliers:
        outliers_diff = index_of_outliers(diff)
        if sum(outliers_diff) == 0:
            break
        diff = diff[~outliers_diff]
        da = da[~outliers_diff]
        db = db[~outliers_diff]

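    # Optionally drops paired observations in which both systems scored
    # exactly zero (see the --remove-zeros help above).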

    if remove_zeros:
        zero_mask = (da == 0) & (db == 0)
        diff = diff[~zero_mask]
        da = da[~zero_mask]
        db = db[~zero_mask]

    if output_folder is not None:
        fname = os.path.join(output_folder, "analysis.pdf")
        os.makedirs(os.path.dirname(fname), exist_ok=True)
        logger.info(f"Writing analysis figures to {fname} (multipage PDF)...")
        write_analysis_figures(names, da, db, fname)

    if output_folder is not None:
        fname = os.path.join(output_folder, "analysis.txt")
        os.makedirs(os.path.dirname(fname), exist_ok=True)
        logger.info(f"Writing analysis summary to {fname}...")
        with open(fname, "wt") as f:
            write_analysis_text(names, da, db, f)
    write_analysis_text(names, da, db, sys.stdout)