# src/deepdraw/script/significance.py

# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-License-Identifier: GPL-3.0-or-later

import gzip
import os
import pickle
import sys

import click
import numpy

from clapper.click import ConfigCommand, ResourceOption, verbosity_option
from clapper.logging import setup

logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")

from ..engine.evaluator import run as run_evaluation
from ..engine.significance import (
    PERFORMANCE_FIGURES,
    index_of_outliers,
    sliding_window_performances,
    visual_performances,
    write_analysis_figures,
    write_analysis_text,
)


@click.command(
    entry_point_group="deepdraw.config",
    cls=ConfigCommand,
    epilog="""Examples:

\b
    1. Runs a significance test using as a basis the calculated predictions
       of two different systems, on the **same** dataset:

       .. code:: sh

          $ deepdraw significance -vv drive --names system1 system2 --predictions=path/to/predictions/system-1 path/to/predictions/system-2

\b
    2. By default, we use a "validation" dataset, if available, to infer the
       a priori threshold for the comparison of the two systems.  Otherwise,
       you may need to specify the name of a set to be used as the validation
       set for choosing a threshold.  The same goes for the set used for
       testing the hypothesis - by default we use the "test" dataset, if
       available; otherwise, specify one:

       .. code:: sh

          $ deepdraw significance -vv drive --names system1 system2 --predictions=path/to/predictions/system-1 path/to/predictions/system-2 --threshold=train --evaluate=alternate-test
""",
)
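# Note: via clapper's ConfigCommand/ResourceOption machinery, the options
# below may also be supplied through configuration resources registered under
# the "deepdraw.config" entry-point group (e.g. the "drive" dataset used in
# the examples above).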
@click.option(
    "--names",
    "-n",
    help="Names of the two systems to compare",
    nargs=2,
    required=True,
    type=str,
    cls=ResourceOption,
)
@click.option(
    "--predictions",
    "-p",
    help="Paths where the predictions of the two systems are stored. You "
    "may also input predictions from a second annotator. This application "
    "will handle them adequately.",
    nargs=2,
    required=True,
    type=click.Path(exists=True, file_okay=False, dir_okay=True),
    cls=ResourceOption,
)
@click.option(
    "--dataset",
    "-d",
    help="A dictionary mapping string keys to "
    "torch.utils.data.dataset.Dataset instances",
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--threshold",
    "-t",
    help="This value is used to define positives and negatives from "
    "probability maps, and to report F1-scores (a priori). By default, we "
    "expect a set named 'validation' to be available in the input data. "
    "If that is not the case, we use 'train', if available. Otherwise, you "
    "may provide the name of another dataset to be used for threshold "
    "tuning. If a string is input, threshold tuning is done per system, "
    "individually. Optionally, you may also provide a floating-point number "
    "within [0.0, 1.0] as the threshold to use for both systems.",
    default="validation",
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--evaluate",
    "-e",
    help="Name of the dataset to evaluate",
    default="test",
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--steps",
    "-S",
    help="This number defines the number of threshold steps to consider "
    "when evaluating the highest possible F1-score on train/test data.",
    default=1000,
    type=int,
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--size",
    "-s",
    help="This is a tuple with two values indicating the size of windows to "
    "be used for sliding window analysis. The values represent height and "
    "width, respectively.",
    default=(128, 128),
    nargs=2,
    type=int,
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--stride",
    "-T",  # NB: lowercase "-t" is already taken by --threshold
    help="This is a tuple with two values indicating the stride of windows "
    "to be used for sliding window analysis. The values represent height "
    "and width, respectively.",
    default=(32, 32),
    nargs=2,
    type=int,
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--figure",
    "-f",
    help="The name of a performance figure (e.g. f1_score, or jaccard) to "
    "use when comparing performances",
    default="accuracy",
    type=str,
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--output-folder",
    "-o",
    help="Path where to store visualizations",
    required=False,
    type=click.Path(),
    show_default=True,
    cls=ResourceOption,
)
@click.option(
    "--remove-outliers/--no-remove-outliers",
    "-R",
    help="If set, removes outliers from both score distributions before "
    "running statistical analysis. Outlier removal follows a 1.5 IQR range "
    "check on the difference in figures between both systems and assumes "
    "most of the distribution is contained within that range (as in a "
    "normal distribution)",
    default=False,
    required=True,
    show_default=True,
    cls=ResourceOption,
)
@click.option(
    "--remove-zeros/--no-remove-zeros",
    "-Z",  # NB: "-R" is already taken by --remove-outliers
    help="If set, removes instances from the statistical analysis in which "
    "both systems had a performance equal to zero.",
    default=False,
    required=True,
    show_default=True,
    cls=ResourceOption,
)
@click.option(
    "--parallel",
    "-x",
    help="Set the number of parallel processes to use when running using "
    "multiprocessing. A value of zero uses all reported cores.",
    default=1,
    type=int,
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--checkpoint-folder",
    "-k",
    help="Path where to store checkpointed versions of sliding window "
    "performances",
    required=False,
    type=click.Path(),
    show_default=True,
    cls=ResourceOption,
)
@verbosity_option(logger=logger, cls=ResourceOption)
@click.pass_context
def significance(
    ctx,
    names,
    predictions,
    dataset,
    threshold,
    evaluate,
    steps,
    size,
    stride,
    figure,
    output_folder,
    remove_outliers,
    remove_zeros,
    parallel,
    checkpoint_folder,
    verbose,
    **kwargs,
):
    """Evaluates how significantly different two models are on the same
    dataset.

    This application calculates the significance of the results of two
    models operating on the same dataset, subject to a priori threshold
    tuning.
    """
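    # Overall flow: validate the threshold choice, compute per-system
    # sliding-window performances (optionally cached on disk), then compare
    # the paired per-window figures of both systems statistically.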

    def _validate_threshold(t, dataset):
        """Validates the user threshold selection.

        Returns the parsed threshold.
        """
        if t is None:
            return 0.5

        try:
            # we try to convert it to float first
            t = float(t)
        except ValueError:
            # it is a bit of text - assert a dataset with that name is available
            if not isinstance(dataset, dict):
                raise ValueError(
                    "Threshold should be a floating-point number "
                    "if you provide only a single dataset for evaluation"
                )
            if t not in dataset:
                raise ValueError(
                    f"Text thresholds should match dataset names, "
                    f"but '{t}' is not available among the datasets provided "
                    f"({', '.join(dataset.keys())})"
                )
        else:
            if t < 0.0 or t > 1.0:
                raise ValueError(
                    "Float thresholds must be within range [0.0, 1.0]"
                )

        return t
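    # Illustration (hypothetical inputs): "0.25" parses to the float 0.25;
    # "validation" is accepted only if the dataset dictionary has such a key;
    # None falls back to 0.5.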

    def _eval_sliding_windows(
        system_name,
        threshold,
        evaluate,
        preddir,
        dataset,
        steps,
        size,
        stride,
        outdir,
        figure,
        nproc,
        checkpointdir,
    ):
        """Calculates the sliding window performances on a dataset.

        Parameters
        ==========

        system_name : str
            The name of the current system being analyzed

        threshold : :py:class:`float`, :py:class:`str`
            This value is used to define positives and negatives from
            probability maps, and to report F1-scores (a priori). By default,
            we expect a set named 'validation' to be available in the input
            data. If that is not the case, we use 'train', if available.
            Otherwise, you may provide the name of another dataset to be used
            for threshold tuning. If a string is input, threshold tuning is
            done per system, individually. Optionally, you may also provide a
            floating-point number within [0.0, 1.0] as the threshold to use
            for both systems.

        evaluate : str
            Name of the dataset key to use from ``dataset`` to evaluate
            (typically, ``test``)

        preddir : str
            Root path to the predictions generated by system ``system_name``.
            The final subpath inside ``preddir`` that will be used will have
            the value of this variable suffixed with the value of
            ``evaluate``. We will search for
            ``<preddir>/<evaluate>/<stems>.hdf5``.

        dataset : dict
            A dictionary mapping string keys to
            :py:class:`torch.utils.data.dataset.Dataset` instances

        steps : int
            The number of threshold steps to consider when evaluating the
            highest possible F1-score on train/test data.

        size : tuple
            Two values indicating the size of windows to be used for the
            sliding window analysis. The values represent height and width,
            respectively.

        stride : tuple
            Two values indicating the stride of windows to be used for the
            sliding window analysis. The values represent height and width,
            respectively.

        outdir : str
            Path where to store visualizations. If set to ``None``, then do
            not store performance visualizations.

        figure : str
            The name of a performance figure (e.g. ``f1_score``, ``jaccard``,
            or ``accuracy``) to use when comparing performances

        nproc : int
            Sets the number of parallel processes to use when running using
            multiprocessing. A value of zero uses all reported cores. A value
            of ``1`` completely avoids the use of multiprocessing and runs
            all chores in the current processing context.

        checkpointdir : str
            If set to a string (instead of ``None``), then stores a cached
            version of the sliding window performances on disk, for a
            particular system.


        Returns
        =======

        d : dict
            A dictionary in which keys are filename stems and values are
            dictionaries with the following contents:

            ``winperf``: numpy.ndarray
                An array with all the sliding window performances, aggregated
                for all input images.

            ``n`` : numpy.ndarray
                A 2D numpy array containing the number of performance scores
                for every pixel in the original image

            ``avg`` : numpy.ndarray
                A 2D numpy array containing the average performances for
                every pixel on the input image, considering the sliding
                window sizes and strides applied to the image

            ``std`` : numpy.ndarray
                A 2D numpy array containing the (unbiased) standard
                deviations for the provided performance figure, for every
                pixel on the input image, considering the sliding window
                sizes and strides applied to the image
        """

        if checkpointdir is not None:
            chkpt_fname = os.path.join(
                checkpointdir,
                f"{system_name}-{evaluate}-{threshold}-"
                f"{size[0]}x{size[1]}+{stride[0]}x{stride[1]}-{figure}.pkl.gz",
            )
            os.makedirs(os.path.dirname(chkpt_fname), exist_ok=True)
            if os.path.exists(chkpt_fname):
                logger.info(f"Loading checkpoint from {chkpt_fname}...")
                # loads and returns checkpoint from file
                try:
                    with gzip.GzipFile(chkpt_fname, "r") as f:
                        return pickle.load(f)
                except EOFError as e:
                    logger.warning(
                        f"Could not load sliding window performance "
                        f"from {chkpt_fname}: {e}. Calculating..."
                    )
            else:
                logger.debug(
                    f"Checkpoint not available at {chkpt_fname}. "
                    f"Calculating..."
                )
        else:
            chkpt_fname = None
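        # Checkpoint files are gzip-compressed pickles with names such as
        # "system1-test-validation-128x128+32x32-accuracy.pkl.gz"
        # (illustrative values); a truncated file surfaces as the EOFError
        # handled above and is simply treated as a cache miss.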

        if not isinstance(threshold, float):
            assert threshold in dataset, f"No dataset named '{threshold}'"

            logger.info(
                f"Evaluating threshold on '{threshold}' set for "
                f"'{system_name}' using {steps} steps"
            )
            threshold = run_evaluation(
                dataset[threshold], threshold, preddir, steps=steps
            )
            logger.info(f"Set --threshold={threshold:.5f} for '{system_name}'")
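            # Note: run_evaluation sweeps `steps` threshold values on the
            # named set and returns a single a priori threshold (per the
            # --steps help above, presumably the one maximizing the F1-score).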

        # for a given threshold on each system, calculate sliding window
        # performances
        logger.info(
            f"Evaluating sliding window '{figure}' on '{evaluate}' set for "
            f"'{system_name}' using windows of size {size} and stride {stride}"
        )

        retval = sliding_window_performances(
            dataset,
            evaluate,
            preddir,
            threshold,
            size,
            stride,
            figure,
            nproc,
            outdir,
        )
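        # retval maps filename stems to per-image records with the keys
        # "winperf", "n", "avg" and "std" described in the docstring above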

        # cache sliding window performance for later use, if necessary
        if chkpt_fname is not None:
            logger.debug(f"Storing checkpoint at {chkpt_fname}...")
            with gzip.GzipFile(chkpt_fname, "w") as f:
                pickle.dump(retval, f)

        return retval

    def _eval_differences(
        names,
        perfs,
        evaluate,
        dataset,
        size,
        stride,
        outdir,
        figure,
        nproc,
        checkpointdir,
    ):
        """Evaluates differences in the performance sliding windows between
        two systems.

        Parameters
        ----------

        names : :py:class:`tuple` of :py:class:`str`
            Names of the first and second systems

        perfs : :py:class:`tuple` of :py:class:`dict`
            Dictionaries for the sliding window performances of each system,
            as returned by :py:func:`_eval_sliding_windows`

        evaluate : str
            Name of the dataset key to use from ``dataset`` to evaluate
            (typically, ``test``)

        dataset : dict
            A dictionary mapping string keys to
            :py:class:`torch.utils.data.dataset.Dataset` instances

        size : tuple
            Two values indicating the size of windows to be used for sliding
            window analysis. The values represent height and width,
            respectively.

        stride : tuple
            Two values indicating the stride of windows to be used for
            sliding window analysis. The values represent height and width,
            respectively.

        outdir : str
            If set to ``None``, then do not output performance
            visualizations. Otherwise, in directory ``outdir``, dumps the
            visualizations for the performance differences between both
            systems.

        figure : str
            The name of a performance figure (e.g. ``f1_score``, or
            ``jaccard``) to use when comparing performances

        nproc : int
            Sets the number of parallel processes to use when running using
            multiprocessing. A value of zero uses all reported cores. A value
            of ``1`` completely avoids the use of multiprocessing and runs
            all chores in the current processing context.

        checkpointdir : str
            If set to a string (instead of ``None``), then stores a cached
            version of the sliding window performances on disk, for a
            particular difference between systems.


        Returns
        -------

        d : dict
            A dictionary representing sliding window performance differences
            across all files and sliding windows. The format of this is
            similar to the individual entries of ``perfs``.
        """

        if checkpointdir is not None:
            chkpt_fname = os.path.join(
                checkpointdir,
                f"{names[0]}-{names[1]}-{evaluate}-"
                f"{size[0]}x{size[1]}+{stride[0]}x{stride[1]}-{figure}.pkl.gz",
            )
            os.makedirs(os.path.dirname(chkpt_fname), exist_ok=True)
            if os.path.exists(chkpt_fname):
                logger.info(f"Loading checkpoint from {chkpt_fname}...")
                # loads and returns checkpoint from file
                try:
                    with gzip.GzipFile(chkpt_fname, "r") as f:
                        return pickle.load(f)
                except EOFError as e:
                    logger.warning(
                        f"Could not load sliding window performance "
                        f"from {chkpt_fname}: {e}. Calculating..."
                    )
            else:
                logger.debug(
                    f"Checkpoint not available at {chkpt_fname}. "
                    f"Calculating..."
                )
        else:
            chkpt_fname = None

        perf_diff = {
            k: perfs[0][k]["winperf"] - perfs[1][k]["winperf"] for k in perfs[0]
        }
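        # perf_diff now holds, per filename stem, the element-wise difference
        # of the aggregated window performances (first system minus second)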

        # for the given thresholds on each system, calculate sliding window
        # performance differences
        logger.info(
            f"Evaluating sliding window '{figure}' differences on '{evaluate}' "
            f"set on '{names[0]}-{names[1]}' using windows of size {size} and "
            f"stride {stride}"
        )

        retval = visual_performances(
            dataset,
            evaluate,
            perf_diff,
            size,
            stride,
            figure,
            nproc,
            outdir,
        )
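        # visual_performances renders the precomputed per-window differences
        # (presumably into per-pixel maps, and figures when outdir is set,
        # mirroring sliding_window_performances above)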

        # cache sliding window performance differences for later use, if
        # necessary
        if chkpt_fname is not None:
            logger.debug(f"Storing checkpoint at {chkpt_fname}...")
            with gzip.GzipFile(chkpt_fname, "w") as f:
                pickle.dump(retval, f)

        return retval

    # minimal validation at startup
    threshold = _validate_threshold(threshold, dataset)
    assert evaluate in dataset, f"No dataset named '{evaluate}'"

    perf1 = _eval_sliding_windows(
        names[0],
        threshold,
        evaluate,
        predictions[0],
        dataset,
        steps,
        size,
        stride,
        (
            output_folder
            if output_folder is None
            else os.path.join(output_folder, names[0])
        ),
        figure,
        parallel,
        checkpoint_folder,
    )

    perf2 = _eval_sliding_windows(
        names[1],
        threshold,
        evaluate,
        predictions[1],
        dataset,
        steps,
        size,
        stride,
        (
            output_folder
            if output_folder is None
            else os.path.join(output_folder, names[1])
        ),
        figure,
        parallel,
        checkpoint_folder,
    )

    # perf_diff = _eval_differences(
    #     names,
    #     (perf1, perf2),
    #     evaluate,
    #     dataset,
    #     size,
    #     stride,
    #     (
    #         output_folder
    #         if output_folder is None
    #         else os.path.join(output_folder, "diff")
    #     ),
    #     figure,
    #     parallel,
    #     checkpoint_folder,
    # )

    # loads all figures for the given threshold
    stems = list(perf1.keys())
    figindex = PERFORMANCE_FIGURES.index(figure)
    da = numpy.array([perf1[k]["winperf"][figindex] for k in stems]).flatten()
    db = numpy.array([perf2[k]["winperf"][figindex] for k in stems]).flatten()
    diff = da - db
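    # da/db hold one value of the chosen figure per sliding window,
    # concatenated over all stems, so diff contains paired per-window
    # differences between the two systems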

    while remove_outliers:
        outliers_diff = index_of_outliers(diff)
        if sum(outliers_diff) == 0:
            break
        diff = diff[~outliers_diff]
        da = da[~outliers_diff]
        db = db[~outliers_diff]
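    # the loop above trims outliers iteratively: each pass recomputes the
    # mask (a 1.5 IQR check, per the --remove-outliers help) on the surviving
    # differences, filtering da/db in lockstep to keep the pairing aligned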

    if remove_zeros:
        zeros_mask = (da == 0) & (db == 0)
        diff = diff[~zeros_mask]
        da = da[~zeros_mask]
        db = db[~zeros_mask]
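    # windows where both systems score exactly zero carry no comparative
    # information and are dropped on request (an assumption as to why this
    # happens: such windows likely contain no foreground at all)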

    if output_folder is not None:
        fname = os.path.join(output_folder, "analysis.pdf")
        os.makedirs(os.path.dirname(fname), exist_ok=True)
        logger.info(f"Writing analysis figures to {fname} (multipage PDF)...")
        write_analysis_figures(names, da, db, fname)

    if output_folder is not None:
        fname = os.path.join(output_folder, "analysis.txt")
        os.makedirs(os.path.dirname(fname), exist_ok=True)
        logger.info(f"Writing analysis summary to {fname}...")
        with open(fname, "w") as f:
            write_analysis_text(names, da, db, f)
    write_analysis_text(names, da, db, sys.stdout)
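

# A minimal invocation sketch (config name and paths are illustrative):
#
#   deepdraw significance -vv drive \
#       --names system1 system2 \
#       --predictions path/to/predictions/system-1 path/to/predictions/system-2 \
#       --output-folder results/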