Coverage for src/deepdraw/script/significance.py: 0%
119 statements
« prev ^ index » next coverage.py v7.3.1, created at 2023-11-30 15:00 +0100
1# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
2#
3# SPDX-License-Identifier: GPL-3.0-or-later
5import os
6import sys
8import click
9import numpy
11from clapper.click import ConfigCommand, ResourceOption, verbosity_option
12from clapper.logging import setup
14logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
16from ..engine.evaluator import run as run_evaluation
17from ..engine.significance import (
18 PERFORMANCE_FIGURES,
19 index_of_outliers,
20 sliding_window_performances,
21 visual_performances,
22 write_analysis_figures,
23 write_analysis_text,
24)
@click.command(
    entry_point_group="deepdraw.config",
    cls=ConfigCommand,
    epilog="""Examples:

\b
    1. Runs a significance test using as base the calculated predictions of two
       different systems, on the **same** dataset:

       .. code:: sh

          $ deepdraw significance -vv drive --names system1 system2 --predictions=path/to/predictions/system-1 path/to/predictions/system-2

\b
    2. By default, we use a "validation" dataset if it is available, to infer
       the a priori threshold for the comparison of two systems. Otherwise,
       you may need to specify the name of a set to be used as validation set
       for choosing a threshold. The same goes for the set to be used for
       testing the hypothesis - by default we use the "test" dataset if it is
       available, otherwise, specify.

       .. code:: sh

          $ deepdraw significance -vv drive --names system1 system2 --predictions=path/to/predictions/system-1 path/to/predictions/system-2 --threshold=train --evaluate=alternate-test
""",
)
@click.option(
    "--names",
    "-n",
    help="Names of the two systems to compare",
    nargs=2,
    required=True,
    type=str,
    cls=ResourceOption,
)
@click.option(
    "--predictions",
    "-p",
    help="Path where predictions of system 2 are currently stored. You may "
    "also input predictions from a second-annotator. This application "
    "will adequately handle it.",
    nargs=2,
    required=True,
    type=click.Path(exists=True, file_okay=False, dir_okay=True),
    cls=ResourceOption,
)
@click.option(
    "--dataset",
    "-d",
    help="A dictionary mapping string keys to "
    "torch.utils.data.dataset.Dataset instances",
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--threshold",
    "-t",
    help="This number is used to define positives and negatives from "
    "probability maps, and report F1-scores (a priori). By default, we "
    "expect a set named 'validation' to be available at the input data. "
    "If that is not the case, we use 'train', if available. You may provide "
    "the name of another dataset to be used for threshold tunning otherwise. "
    "If not set, or a string is input, threshold tunning is done per system, "
    "individually. Optionally, you may also provide a floating-point number "
    "between [0.0, 1.0] as the threshold to use for both systems.",
    default="validation",
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--evaluate",
    "-e",
    help="Name of the dataset to evaluate",
    default="test",
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--steps",
    "-S",
    help="This number is used to define the number of threshold steps to "
    "consider when evaluating the highest possible F1-score on train/test data.",
    default=1000,
    type=int,
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--size",
    "-s",
    help="This is a tuple with two values indicating the size of windows to "
    "be used for sliding window analysis. The values represent height and "
    "width respectively.",
    default=(128, 128),
    nargs=2,
    type=int,
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--stride",
    # BUGFIX: this option used "-t" as its short name, clashing with
    # --threshold above; click silently lets only one of them own "-t".
    "-T",
    help="This is a tuple with two values indicating the stride of windows to "
    "be used for sliding window analysis. The values represent height and "
    "width respectively.",
    default=(32, 32),
    nargs=2,
    type=int,
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--figure",
    "-f",
    help="The name of a performance figure (e.g. f1_score, or jaccard) to "
    "use when comparing performances",
    default="accuracy",
    type=str,
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--output-folder",
    "-o",
    help="Path where to store visualizations",
    required=False,
    type=click.Path(),
    show_default=True,
    cls=ResourceOption,
)
@click.option(
    "--remove-outliers/--no-remove-outliers",
    "-R",
    help="If set, removes outliers from both score distributions before "
    "running statistical analysis. Outlier removal follows a 1.5 IQR range "
    "check from the difference in figures between both systems and assumes "
    "most of the distribution is contained within that range (like in a "
    "normal distribution)",
    default=False,
    required=True,
    show_default=True,
    cls=ResourceOption,
)
@click.option(
    "--remove-zeros/--no-remove-zeros",
    # BUGFIX: this option used "-R" as its short name, clashing with
    # --remove-outliers above.
    "-Z",
    help="If set, removes instances from the statistical analysis in which "
    "both systems had a performance equal to zero.",
    default=False,
    required=True,
    show_default=True,
    cls=ResourceOption,
)
@click.option(
    "--parallel",
    "-x",
    help="Set the number of parallel processes to use when running using "
    "multiprocessing. A value of zero uses all reported cores.",
    default=1,
    type=int,
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--checkpoint-folder",
    "-k",
    help="Path where to store checkpointed versions of sliding window "
    "performances",
    required=False,
    type=click.Path(),
    show_default=True,
    cls=ResourceOption,
)
@verbosity_option(logger=logger, cls=ResourceOption)
@click.pass_context
def significance(
    ctx,
    names,
    predictions,
    dataset,
    threshold,
    evaluate,
    steps,
    size,
    stride,
    figure,
    output_folder,
    remove_outliers,
    remove_zeros,
    parallel,
    checkpoint_folder,
    verbose,
    **kwargs,
):
    """Evaluates how significantly different are two models on the same
    dataset.

    This application calculates the significance of results of two
    models operating on the same dataset, and subject to a priori
    threshold tunning.
    """

    def _validate_threshold(t, dataset):
        """Validate the user threshold selection.

        Returns the parsed threshold: either a float within [0.0, 1.0] or
        the name of a dataset (a key of ``dataset``) to use for a priori
        threshold tunning.

        Raises :py:exc:`ValueError` if a float is out of range, or if a
        textual threshold does not name an available dataset.
        """

        if t is None:
            return 0.5

        try:
            # we try to convert it to float first
            value = float(t)
        except ValueError:
            # it is a bit of text - assert dataset with name is available
            if not isinstance(dataset, dict):
                raise ValueError(
                    "Threshold should be a floating-point number "
                    "if your provide only a single dataset for evaluation"
                )
            if t not in dataset:
                raise ValueError(
                    f"Text thresholds should match dataset names, "
                    f"but {t} is not available among the datasets provided ("
                    f"({', '.join(dataset.keys())})"
                )
            return t

        # BUGFIX: this range check used to live *inside* the try block above,
        # so the ValueError it raised was swallowed by the except clause and
        # an out-of-range float ended up (wrongly) treated as a dataset name.
        if value < 0.0 or value > 1.0:
            raise ValueError(
                "Float thresholds must be within range [0.0, 1.0]"
            )
        return value

    def _load_checkpoint(chkpt_fname):
        """Loads a cached (gzipped-pickle) payload from ``chkpt_fname``.

        Returns the unpickled object, or ``None`` when the checkpoint file
        is absent or truncated - in which case the caller re-calculates.
        """
        import gzip
        import pickle

        if os.path.exists(chkpt_fname):
            logger.info(f"Loading checkpoint from {chkpt_fname}...")
            try:
                with gzip.GzipFile(chkpt_fname, "r") as f:
                    return pickle.load(f)
            except EOFError as e:
                # truncated/corrupt checkpoint: fall through and recalculate
                logger.warning(
                    f"Could not load sliding window performance "
                    f"from {chkpt_fname}: {e}. Calculating..."
                )
        else:
            logger.debug(
                f"Checkpoint not available at {chkpt_fname}. "
                f"Calculating..."
            )
        return None

    def _save_checkpoint(chkpt_fname, data):
        """Stores ``data`` as a gzipped pickle at ``chkpt_fname``."""
        import gzip
        import pickle

        logger.debug(f"Storing checkpoint at {chkpt_fname}...")
        with gzip.GzipFile(chkpt_fname, "w") as f:
            pickle.dump(data, f)

    def _eval_sliding_windows(
        system_name,
        threshold,
        evaluate,
        preddir,
        dataset,
        steps,
        size,
        stride,
        outdir,
        figure,
        nproc,
        checkpointdir,
    ):
        """Calculates the sliding window performances on a dataset.

        Parameters
        ==========

        system_name : str
            The name of the current system being analyzed

        threshold : :py:class:`float`, :py:class:`str`
            This number is used to define positives and negatives from
            probability maps, and report F1-scores (a priori). By default, we
            expect a set named 'validation' to be available at the input data.
            If that is not the case, we use 'train', if available. You may
            provide the name of another dataset to be used for threshold
            tunning otherwise. If not set, or a string is input, threshold
            tunning is done per system, individually. Optionally, you may
            also provide a floating-point number between [0.0, 1.0] as the
            threshold to use for both systems.

        evaluate : str
            Name of the dataset key to use from ``dataset`` to evaluate
            (typically, ``test``)

        preddir : str
            Root path to the predictions generated by system ``system_name``.
            The final subpath inside ``preddir`` that will be used will have
            the value of this variable suffixed with the value of
            ``evaluate``. We will search for
            ``<preddir>/<evaluate>/<stems>.hdf5``.

        dataset : dict
            A dictionary mapping string keys to
            :py:class:`torch.utils.data.dataset.Dataset` instances

        steps : int
            The number of threshold steps to consider when evaluating the
            highest possible F1-score on train/test data.

        size : tuple
            Two values indicating the size of windows to be used for the
            sliding window analysis. The values represent height and width
            respectively

        stride : tuple
            Two values indicating the stride of windows to be used for the
            sliding window analysis. The values represent height and width
            respectively

        outdir : str
            Path where to store visualizations. If set to ``None``, then do
            not store performance visualizations.

        figure : str
            The name of a performance figure (e.g. ``f1_score``, ``jaccard``,
            or ``accuracy``) to use when comparing performances

        nproc : int
            Sets the number of parallel processes to use when running using
            multiprocessing. A value of zero uses all reported cores. A value
            of ``1`` avoids completely the use of multiprocessing and runs
            all chores in the current processing context.

        checkpointdir : str
            If set to a string (instead of ``None``), then stores a cached
            version of the sliding window performances on disk, for a
            particular system.


        Returns
        =======

        d : dict
            A dictionary in which keys are filename stems and values are
            dictionaries with the following contents:

            ``winperf``: numpy.ndarray
                A dataframe with all the sliding window performances
                aggregated, for all input images.

            ``n`` : numpy.ndarray
                A 2D numpy array containing the number of performance scores
                for every pixel in the original image

            ``avg`` : numpy.ndarray
                A 2D numpy array containing the average performances for
                every pixel on the input image considering the sliding window
                sizes and strides applied to the image

            ``std`` : numpy.ndarray
                A 2D numpy array containing the (unbiased) standard
                deviations for the provided performance figure, for every
                pixel on the input image considering the sliding window sizes
                and strides applied to the image
        """

        chkpt_fname = None
        if checkpointdir is not None:
            chkpt_fname = os.path.join(
                checkpointdir,
                f"{system_name}-{evaluate}-{threshold}-"
                f"{size[0]}x{size[1]}+{stride[0]}x{stride[1]}-{figure}.pkl.gz",
            )
            os.makedirs(os.path.dirname(chkpt_fname), exist_ok=True)
            cached = _load_checkpoint(chkpt_fname)
            if cached is not None:
                return cached

        if not isinstance(threshold, float):
            # threshold is a dataset name: tune an a priori threshold on it
            assert threshold in dataset, f"No dataset named '{threshold}'"

            logger.info(
                f"Evaluating threshold on '{threshold}' set for "
                f"'{system_name}' using {steps} steps"
            )
            threshold = run_evaluation(
                dataset[threshold], threshold, preddir, steps=steps
            )
            logger.info(f"Set --threshold={threshold:.5f} for '{system_name}'")

        # for a given threshold on each system, calculate sliding window performances
        logger.info(
            f"Evaluating sliding window '{figure}' on '{evaluate}' set for "
            f"'{system_name}' using windows of size {size} and stride {stride}"
        )

        retval = sliding_window_performances(
            dataset,
            evaluate,
            preddir,
            threshold,
            size,
            stride,
            figure,
            nproc,
            outdir,
        )

        # cache sliding window performance for later use, if necessary
        if chkpt_fname is not None:
            _save_checkpoint(chkpt_fname, retval)

        return retval

    def _eval_differences(
        names,
        perfs,
        evaluate,
        dataset,
        size,
        stride,
        outdir,
        figure,
        nproc,
        checkpointdir,
    ):
        """Evaluate differences in the performance sliding windows between two
        systems.

        Parameters
        ----------

        names : :py:class:`tuple` of :py:class:`str`
            Names of the first and second systems

        perfs : :py:class:`tuple` of :py:class:`dict`
            Dictionaries for the sliding window performances of each system,
            as returned by :py:func:`_eval_sliding_windows`

        evaluate : str
            Name of the dataset key to use from ``dataset`` to evaluate
            (typically, ``test``)

        dataset : dict
            A dictionary mapping string keys to
            :py:class:`torch.utils.data.dataset.Dataset` instances

        size : tuple
            Two values indicating the size of windows to be used for sliding
            window analysis. The values represent height and width
            respectively

        stride : tuple
            Two values indicating the stride of windows to be used for
            sliding window analysis. The values represent height and width
            respectively

        outdir : str
            If set to ``None``, then do not output performance
            visualizations. Otherwise, in directory ``outdir``, dumps the
            visualizations for the performance differences between both
            systems.

        figure : str
            The name of a performance figure (e.g. ``f1_score``, or
            ``jaccard``) to use when comparing performances

        nproc : int
            Sets the number of parallel processes to use when running using
            multiprocessing. A value of zero uses all reported cores. A value
            of ``1`` avoids completely the use of multiprocessing and runs
            all chores in the current processing context.

        checkpointdir : str
            If set to a string (instead of ``None``), then stores a cached
            version of the sliding window performances on disk, for a
            particular difference between systems.


        Returns
        -------

        d : dict
            A dictionary representing sliding window performance differences
            across all files and sliding windows. The format of this is
            similar to the individual inputs ``perf1`` and ``perf2``.
        """

        chkpt_fname = None
        if checkpointdir is not None:
            chkpt_fname = os.path.join(
                checkpointdir,
                f"{names[0]}-{names[1]}-{evaluate}-"
                f"{size[0]}x{size[1]}+{stride[0]}x{stride[1]}-{figure}.pkl.gz",
            )
            os.makedirs(os.path.dirname(chkpt_fname), exist_ok=True)
            cached = _load_checkpoint(chkpt_fname)
            if cached is not None:
                return cached

        # per-stem difference of raw sliding-window performances
        perf_diff = {
            k: perfs[0][k]["winperf"] - perfs[1][k]["winperf"] for k in perfs[0]
        }

        # for a given threshold on each system, calculate sliding window performances
        logger.info(
            f"Evaluating sliding window '{figure}' differences on '{evaluate}' "
            f"set on '{names[0]}-{names[1]}' using windows of size {size} and "
            f"stride {stride}"
        )

        retval = visual_performances(
            dataset,
            evaluate,
            perf_diff,
            size,
            stride,
            figure,
            nproc,
            outdir,
        )

        # cache sliding window performance for later use, if necessary
        if chkpt_fname is not None:
            _save_checkpoint(chkpt_fname, retval)

        return retval

    # minimal validation to startup
    threshold = _validate_threshold(threshold, dataset)
    assert evaluate in dataset, f"No dataset named '{evaluate}'"

    perf1 = _eval_sliding_windows(
        names[0],
        threshold,
        evaluate,
        predictions[0],
        dataset,
        steps,
        size,
        stride,
        (
            output_folder
            if output_folder is None
            else os.path.join(output_folder, names[0])
        ),
        figure,
        parallel,
        checkpoint_folder,
    )

    perf2 = _eval_sliding_windows(
        names[1],
        threshold,
        evaluate,
        predictions[1],
        dataset,
        steps,
        size,
        stride,
        (
            output_folder
            if output_folder is None
            else os.path.join(output_folder, names[1])
        ),
        figure,
        parallel,
        checkpoint_folder,
    )

    # NOTE(review): visual difference analysis is currently disabled; it was
    # commented out upstream.  Re-enable by calling _eval_differences() here:
    # perf_diff = _eval_differences(
    #     names,
    #     (perf1, perf2),
    #     evaluate,
    #     dataset,
    #     size,
    #     stride,
    #     (
    #         output_folder
    #         if output_folder is None
    #         else os.path.join(output_folder, "diff")
    #     ),
    #     figure,
    #     parallel,
    #     checkpoint_folder,
    # )

    # loads all figures for the given threshold
    stems = list(perf1.keys())
    figindex = PERFORMANCE_FIGURES.index(figure)
    da = numpy.array([perf1[k]["winperf"][figindex] for k in stems]).flatten()
    db = numpy.array([perf2[k]["winperf"][figindex] for k in stems]).flatten()
    diff = da - db

    # iteratively strip 1.5-IQR outliers (computed on the difference) until
    # none remain, keeping da/db/diff aligned
    while remove_outliers:
        outliers_diff = index_of_outliers(diff)
        if sum(outliers_diff) == 0:
            break
        diff = diff[~outliers_diff]
        da = da[~outliers_diff]
        db = db[~outliers_diff]

    if remove_zeros:
        # mask of windows where both systems scored exactly zero
        zero_mask = (da == 0) & (db == 0)
        diff = diff[~zero_mask]
        da = da[~zero_mask]
        db = db[~zero_mask]

    if output_folder is not None:
        fname = os.path.join(output_folder, "analysis.pdf")
        os.makedirs(os.path.dirname(fname), exist_ok=True)
        logger.info(f"Writing analysis figures to {fname} (multipage PDF)...")
        write_analysis_figures(names, da, db, fname)

        fname = os.path.join(output_folder, "analysis.txt")
        os.makedirs(os.path.dirname(fname), exist_ok=True)
        logger.info(f"Writing analysis summary to {fname}...")
        with open(fname, "w") as f:
            write_analysis_text(names, da, db, f)

    # summary always goes to the terminal as well
    write_analysis_text(names, da, db, sys.stdout)