# src/deepdraw/script/significance.py

# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-License-Identifier: GPL-3.0-or-later

import gzip
import os
import pickle
import sys

import click
import numpy

from clapper.click import ConfigCommand, ResourceOption, verbosity_option
from clapper.logging import setup

logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")

from ..engine.evaluator import run as run_evaluation
from ..engine.significance import (
    PERFORMANCE_FIGURES,
    index_of_outliers,
    sliding_window_performances,
    visual_performances,
    write_analysis_figures,
    write_analysis_text,
)


@click.command(
    entry_point_group="deepdraw.config",
    cls=ConfigCommand,
    epilog="""Examples:

\b
    1. Runs a significance test using as a basis the calculated predictions
       of two different systems, on the **same** dataset:

       .. code:: sh

          $ deepdraw significance -vv drive --names system1 system2 --predictions=path/to/predictions/system-1 path/to/predictions/system-2

\b
    2. By default, we use a "validation" dataset, if available, to infer the
       a priori threshold for the comparison of the two systems.  Otherwise,
       you may need to specify the name of a set to be used as the validation
       set for choosing a threshold.  The same goes for the set used for
       testing the hypothesis - by default we use the "test" dataset, if
       available; otherwise, specify one:

       .. code:: sh

          $ deepdraw significance -vv drive --names system1 system2 --predictions=path/to/predictions/system-1 path/to/predictions/system-2 --threshold=train --evaluate=alternate-test
""",
)
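# Note: via clapper's ConfigCommand/ResourceOption machinery, the options
# below may also be supplied through configuration resources registered under
# the "deepdraw.config" entry-point group (e.g. the "drive" dataset used in
# the examples above).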
@click.option(
    "--names",
    "-n",
    help="Names of the two systems to compare",
    nargs=2,
    required=True,
    type=str,
    cls=ResourceOption,
)
@click.option(
    "--predictions",
    "-p",
    help="Paths where the predictions of the two systems are stored. You "
    "may also input predictions from a second annotator. This application "
    "will handle them adequately.",
    nargs=2,
    required=True,
    type=click.Path(exists=True, file_okay=False, dir_okay=True),
    cls=ResourceOption,
)
@click.option(
    "--dataset",
    "-d",
    help="A dictionary mapping string keys to "
    "torch.utils.data.dataset.Dataset instances",
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--threshold",
    "-t",
    help="This value is used to define positives and negatives from "
    "probability maps, and to report F1-scores (a priori). By default, we "
    "expect a set named 'validation' to be available in the input data. "
    "If that is not the case, we use 'train', if available. Otherwise, you "
    "may provide the name of another dataset to be used for threshold "
    "tuning. If a string is input, threshold tuning is done per system, "
    "individually. Optionally, you may also provide a floating-point number "
    "within [0.0, 1.0] as the threshold to use for both systems.",
    default="validation",
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--evaluate",
    "-e",
    help="Name of the dataset to evaluate",
    default="test",
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--steps",
    "-S",
    help="This number defines the number of threshold steps to consider "
    "when evaluating the highest possible F1-score on train/test data.",
    default=1000,
    type=int,
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--size",
    "-s",
    help="This is a tuple with two values indicating the size of windows to "
    "be used for sliding window analysis. The values represent height and "
    "width, respectively.",
    default=(128, 128),
    nargs=2,
    type=int,
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--stride",
    "-T",  # NB: lowercase "-t" is already taken by --threshold
    help="This is a tuple with two values indicating the stride of windows "
    "to be used for sliding window analysis. The values represent height "
    "and width, respectively.",
    default=(32, 32),
    nargs=2,
    type=int,
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--figure",
    "-f",
    help="The name of a performance figure (e.g. f1_score, or jaccard) to "
    "use when comparing performances",
    default="accuracy",
    type=str,
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--output-folder",
    "-o",
    help="Path where to store visualizations",
    required=False,
    type=click.Path(),
    show_default=True,
    cls=ResourceOption,
)
@click.option(
    "--remove-outliers/--no-remove-outliers",
    "-R",
    help="If set, removes outliers from both score distributions before "
    "running statistical analysis. Outlier removal follows a 1.5 IQR range "
    "check on the difference in figures between both systems and assumes "
    "most of the distribution is contained within that range (as in a "
    "normal distribution)",
    default=False,
    required=True,
    show_default=True,
    cls=ResourceOption,
)
@click.option(
    "--remove-zeros/--no-remove-zeros",
    "-Z",  # NB: "-R" is already taken by --remove-outliers
    help="If set, removes instances from the statistical analysis in which "
    "both systems had a performance equal to zero.",
    default=False,
    required=True,
    show_default=True,
    cls=ResourceOption,
)
@click.option(
    "--parallel",
    "-x",
    help="Set the number of parallel processes to use when running using "
    "multiprocessing. A value of zero uses all reported cores.",
    default=1,
    type=int,
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--checkpoint-folder",
    "-k",
    help="Path where to store checkpointed versions of sliding window "
    "performances",
    required=False,
    type=click.Path(),
    show_default=True,
    cls=ResourceOption,
)
@verbosity_option(logger=logger, cls=ResourceOption)
@click.pass_context
def significance(
    ctx,
    names,
    predictions,
    dataset,
    threshold,
    evaluate,
    steps,
    size,
    stride,
    figure,
    output_folder,
    remove_outliers,
    remove_zeros,
    parallel,
    checkpoint_folder,
    verbose,
    **kwargs,
):
    """Evaluates how significantly different two models are on the same
    dataset.

    This application calculates the significance of the results of two
    models operating on the same dataset, subject to a priori threshold
    tuning.
    """
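    # Overall flow: validate the threshold choice, compute per-system
    # sliding-window performances (optionally cached on disk), then compare
    # the paired per-window figures of both systems statistically.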

    def _validate_threshold(t, dataset):
        """Validates the user threshold selection.

        Returns the parsed threshold.
        """
        if t is None:
            return 0.5

        try:
            # we try to convert it to float first
            t = float(t)
        except ValueError:
            # it is a bit of text - assert a dataset with that name is available
            if not isinstance(dataset, dict):
                raise ValueError(
                    "Threshold should be a floating-point number "
                    "if you provide only a single dataset for evaluation"
                )
            if t not in dataset:
                raise ValueError(
                    f"Text thresholds should match dataset names, "
                    f"but '{t}' is not available among the datasets provided "
                    f"({', '.join(dataset.keys())})"
                )
        else:
            if t < 0.0 or t > 1.0:
                raise ValueError(
                    "Float thresholds must be within range [0.0, 1.0]"
                )

        return t
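    # Illustration (hypothetical inputs): "0.25" parses to the float 0.25;
    # "validation" is accepted only if the dataset dictionary has such a key;
    # None falls back to 0.5.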

    def _eval_sliding_windows(
        system_name,
        threshold,
        evaluate,
        preddir,
        dataset,
        steps,
        size,
        stride,
        outdir,
        figure,
        nproc,
        checkpointdir,
    ):
        """Calculates the sliding window performances on a dataset.

        Parameters
        ==========

        system_name : str
            The name of the current system being analyzed

        threshold : :py:class:`float`, :py:class:`str`
            This value is used to define positives and negatives from
            probability maps, and to report F1-scores (a priori). By default,
            we expect a set named 'validation' to be available in the input
            data. If that is not the case, we use 'train', if available.
            Otherwise, you may provide the name of another dataset to be used
            for threshold tuning. If a string is input, threshold tuning is
            done per system, individually. Optionally, you may also provide a
            floating-point number within [0.0, 1.0] as the threshold to use
            for both systems.

        evaluate : str
            Name of the dataset key to use from ``dataset`` to evaluate
            (typically, ``test``)

        preddir : str
            Root path to the predictions generated by system ``system_name``.
            The final subpath inside ``preddir`` that will be used will have
            the value of this variable suffixed with the value of
            ``evaluate``. We will search for
            ``<preddir>/<evaluate>/<stems>.hdf5``.

        dataset : dict
            A dictionary mapping string keys to
            :py:class:`torch.utils.data.dataset.Dataset` instances

        steps : int
            The number of threshold steps to consider when evaluating the
            highest possible F1-score on train/test data.

        size : tuple
            Two values indicating the size of windows to be used for the
            sliding window analysis. The values represent height and width,
            respectively.

        stride : tuple
            Two values indicating the stride of windows to be used for the
            sliding window analysis. The values represent height and width,
            respectively.

        outdir : str
            Path where to store visualizations. If set to ``None``, then do
            not store performance visualizations.

        figure : str
            The name of a performance figure (e.g. ``f1_score``, ``jaccard``,
            or ``accuracy``) to use when comparing performances

        nproc : int
            Sets the number of parallel processes to use when running using
            multiprocessing. A value of zero uses all reported cores. A value
            of ``1`` completely avoids the use of multiprocessing and runs
            all chores in the current processing context.

        checkpointdir : str
            If set to a string (instead of ``None``), then stores a cached
            version of the sliding window performances on disk, for a
            particular system.


        Returns
        =======

        d : dict
            A dictionary in which keys are filename stems and values are
            dictionaries with the following contents:

            ``winperf``: numpy.ndarray
                An array with all the sliding window performances, aggregated
                for all input images.

            ``n`` : numpy.ndarray
                A 2D numpy array containing the number of performance scores
                for every pixel in the original image

            ``avg`` : numpy.ndarray
                A 2D numpy array containing the average performances for
                every pixel on the input image, considering the sliding
                window sizes and strides applied to the image

            ``std`` : numpy.ndarray
                A 2D numpy array containing the (unbiased) standard
                deviations for the provided performance figure, for every
                pixel on the input image, considering the sliding window
                sizes and strides applied to the image
        """

        if checkpointdir is not None:
            chkpt_fname = os.path.join(
                checkpointdir,
                f"{system_name}-{evaluate}-{threshold}-"
                f"{size[0]}x{size[1]}+{stride[0]}x{stride[1]}-{figure}.pkl.gz",
            )
            os.makedirs(os.path.dirname(chkpt_fname), exist_ok=True)
            if os.path.exists(chkpt_fname):
                logger.info(f"Loading checkpoint from {chkpt_fname}...")
                # loads and returns checkpoint from file
                try:
                    with gzip.GzipFile(chkpt_fname, "r") as f:
                        return pickle.load(f)
                except EOFError as e:
                    logger.warning(
                        f"Could not load sliding window performance "
                        f"from {chkpt_fname}: {e}. Calculating..."
                    )
            else:
                logger.debug(
                    f"Checkpoint not available at {chkpt_fname}. "
                    f"Calculating..."
                )
        else:
            chkpt_fname = None
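        # Checkpoint files are gzip-compressed pickles with names such as
        # "system1-test-validation-128x128+32x32-accuracy.pkl.gz"
        # (illustrative values); a truncated file surfaces as the EOFError
        # handled above and is simply treated as a cache miss.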

        if not isinstance(threshold, float):
            assert threshold in dataset, f"No dataset named '{threshold}'"

            logger.info(
                f"Evaluating threshold on '{threshold}' set for "
                f"'{system_name}' using {steps} steps"
            )
            threshold = run_evaluation(
                dataset[threshold], threshold, preddir, steps=steps
            )
            logger.info(f"Set --threshold={threshold:.5f} for '{system_name}'")
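            # Note: run_evaluation sweeps `steps` threshold values on the
            # named set and returns a single a priori threshold (per the
            # --steps help above, presumably the one maximizing the F1-score).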

        # for a given threshold on each system, calculate sliding window
        # performances
        logger.info(
            f"Evaluating sliding window '{figure}' on '{evaluate}' set for "
            f"'{system_name}' using windows of size {size} and stride {stride}"
        )

        retval = sliding_window_performances(
            dataset,
            evaluate,
            preddir,
            threshold,
            size,
            stride,
            figure,
            nproc,
            outdir,
        )
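        # retval maps filename stems to per-image records with the keys
        # "winperf", "n", "avg" and "std" described in the docstring above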

        # cache sliding window performance for later use, if necessary
        if chkpt_fname is not None:
            logger.debug(f"Storing checkpoint at {chkpt_fname}...")
            with gzip.GzipFile(chkpt_fname, "w") as f:
                pickle.dump(retval, f)

        return retval

    def _eval_differences(
        names,
        perfs,
        evaluate,
        dataset,
        size,
        stride,
        outdir,
        figure,
        nproc,
        checkpointdir,
    ):
        """Evaluates differences in the performance sliding windows between
        two systems.

        Parameters
        ----------

        names : :py:class:`tuple` of :py:class:`str`
            Names of the first and second systems

        perfs : :py:class:`tuple` of :py:class:`dict`
            Dictionaries for the sliding window performances of each system,
            as returned by :py:func:`_eval_sliding_windows`

        evaluate : str
            Name of the dataset key to use from ``dataset`` to evaluate
            (typically, ``test``)

        dataset : dict
            A dictionary mapping string keys to
            :py:class:`torch.utils.data.dataset.Dataset` instances

        size : tuple
            Two values indicating the size of windows to be used for sliding
            window analysis. The values represent height and width,
            respectively.

        stride : tuple
            Two values indicating the stride of windows to be used for
            sliding window analysis. The values represent height and width,
            respectively.

        outdir : str
            If set to ``None``, then do not output performance
            visualizations. Otherwise, in directory ``outdir``, dumps the
            visualizations for the performance differences between both
            systems.

        figure : str
            The name of a performance figure (e.g. ``f1_score``, or
            ``jaccard``) to use when comparing performances

        nproc : int
            Sets the number of parallel processes to use when running using
            multiprocessing. A value of zero uses all reported cores. A value
            of ``1`` completely avoids the use of multiprocessing and runs
            all chores in the current processing context.

        checkpointdir : str
            If set to a string (instead of ``None``), then stores a cached
            version of the sliding window performances on disk, for a
            particular difference between systems.


        Returns
        -------

        d : dict
            A dictionary representing sliding window performance differences
            across all files and sliding windows. The format of this is
            similar to the individual entries of ``perfs``.
        """

        if checkpointdir is not None:
            chkpt_fname = os.path.join(
                checkpointdir,
                f"{names[0]}-{names[1]}-{evaluate}-"
                f"{size[0]}x{size[1]}+{stride[0]}x{stride[1]}-{figure}.pkl.gz",
            )
            os.makedirs(os.path.dirname(chkpt_fname), exist_ok=True)
            if os.path.exists(chkpt_fname):
                logger.info(f"Loading checkpoint from {chkpt_fname}...")
                # loads and returns checkpoint from file
                try:
                    with gzip.GzipFile(chkpt_fname, "r") as f:
                        return pickle.load(f)
                except EOFError as e:
                    logger.warning(
                        f"Could not load sliding window performance "
                        f"from {chkpt_fname}: {e}. Calculating..."
                    )
            else:
                logger.debug(
                    f"Checkpoint not available at {chkpt_fname}. "
                    f"Calculating..."
                )
        else:
            chkpt_fname = None

        perf_diff = {
            k: perfs[0][k]["winperf"] - perfs[1][k]["winperf"] for k in perfs[0]
        }
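        # perf_diff now holds, per filename stem, the element-wise difference
        # of the aggregated window performances (first system minus second)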

        # for the given thresholds on each system, calculate sliding window
        # performance differences
        logger.info(
            f"Evaluating sliding window '{figure}' differences on '{evaluate}' "
            f"set on '{names[0]}-{names[1]}' using windows of size {size} and "
            f"stride {stride}"
        )

        retval = visual_performances(
            dataset,
            evaluate,
            perf_diff,
            size,
            stride,
            figure,
            nproc,
            outdir,
        )
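        # visual_performances renders the precomputed per-window differences
        # (presumably into per-pixel maps, and figures when outdir is set,
        # mirroring sliding_window_performances above)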

        # cache sliding window performance differences for later use, if
        # necessary
        if chkpt_fname is not None:
            logger.debug(f"Storing checkpoint at {chkpt_fname}...")
            with gzip.GzipFile(chkpt_fname, "w") as f:
                pickle.dump(retval, f)

        return retval

    # minimal validation at startup
    threshold = _validate_threshold(threshold, dataset)
    assert evaluate in dataset, f"No dataset named '{evaluate}'"

    perf1 = _eval_sliding_windows(
        names[0],
        threshold,
        evaluate,
        predictions[0],
        dataset,
        steps,
        size,
        stride,
        (
            output_folder
            if output_folder is None
            else os.path.join(output_folder, names[0])
        ),
        figure,
        parallel,
        checkpoint_folder,
    )

    perf2 = _eval_sliding_windows(
        names[1],
        threshold,
        evaluate,
        predictions[1],
        dataset,
        steps,
        size,
        stride,
        (
            output_folder
            if output_folder is None
            else os.path.join(output_folder, names[1])
        ),
        figure,
        parallel,
        checkpoint_folder,
    )

    # perf_diff = _eval_differences(
    #     names,
    #     (perf1, perf2),
    #     evaluate,
    #     dataset,
    #     size,
    #     stride,
    #     (
    #         output_folder
    #         if output_folder is None
    #         else os.path.join(output_folder, "diff")
    #     ),
    #     figure,
    #     parallel,
    #     checkpoint_folder,
    # )

    # loads all figures for the given threshold
    stems = list(perf1.keys())
    figindex = PERFORMANCE_FIGURES.index(figure)
    da = numpy.array([perf1[k]["winperf"][figindex] for k in stems]).flatten()
    db = numpy.array([perf2[k]["winperf"][figindex] for k in stems]).flatten()
    diff = da - db
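    # da/db hold one value of the chosen figure per sliding window,
    # concatenated over all stems, so diff contains paired per-window
    # differences between the two systems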

    while remove_outliers:
        outliers_diff = index_of_outliers(diff)
        if sum(outliers_diff) == 0:
            break
        diff = diff[~outliers_diff]
        da = da[~outliers_diff]
        db = db[~outliers_diff]
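    # the loop above trims outliers iteratively: each pass recomputes the
    # mask (a 1.5 IQR check, per the --remove-outliers help) on the surviving
    # differences, filtering da/db in lockstep to keep the pairing aligned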

    if remove_zeros:
        zeros_mask = (da == 0) & (db == 0)
        diff = diff[~zeros_mask]
        da = da[~zeros_mask]
        db = db[~zeros_mask]
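    # windows where both systems score exactly zero carry no comparative
    # information and are dropped on request (an assumption as to why this
    # happens: such windows likely contain no foreground at all)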

    if output_folder is not None:
        fname = os.path.join(output_folder, "analysis.pdf")
        os.makedirs(os.path.dirname(fname), exist_ok=True)
        logger.info(f"Writing analysis figures to {fname} (multipage PDF)...")
        write_analysis_figures(names, da, db, fname)

    if output_folder is not None:
        fname = os.path.join(output_folder, "analysis.txt")
        os.makedirs(os.path.dirname(fname), exist_ok=True)
        logger.info(f"Writing analysis summary to {fname}...")
        with open(fname, "w") as f:
            write_analysis_text(names, da, db, f)
    write_analysis_text(names, da, db, sys.stdout)
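

# A minimal invocation sketch (config name and paths are illustrative):
#
#   deepdraw significance -vv drive \
#       --names system1 system2 \
#       --predictions path/to/predictions/system-1 path/to/predictions/system-2 \
#       --output-folder results/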