#!/usr/bin/env python
# coding=utf-8

import gzip
import logging
import os
import pickle
import sys

import click
import numpy

from bob.extension.scripts.click_helper import (
    ConfigCommand,
    ResourceOption,
    verbosity_option,
)

from ..engine.significance import (
    PERFORMANCE_FIGURES,
    index_of_outliers,
    sliding_window_performances,
    visual_performances,
    write_analysis_figures,
    write_analysis_text,
)
from .evaluate import _validate_threshold
from .evaluate import run as run_evaluation

logger = logging.getLogger(__name__)
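
# Both helpers below use the same gzip+pickle checkpointing pattern to cache
# expensive sliding window computations.  A minimal standalone sketch of that
# pattern (illustrative only; ``_cached`` is not a helper this module
# defines):
#
#   def _cached(path, compute):
#       if path is not None and os.path.exists(path):
#           with gzip.GzipFile(path, "r") as f:
#               return pickle.load(f)
#       value = compute()
#       if path is not None:
#           with gzip.GzipFile(path, "w") as f:
#               pickle.dump(value, f)
#       return value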


def _eval_sliding_windows(
    system_name,
    threshold,
    evaluate,
    preddir,
    dataset,
    steps,
    size,
    stride,
    outdir,
    figure,
    nproc,
    checkpointdir,
):
45 """Calculates the sliding window performances on a dataset
48 Parameters
49 ==========
51 system_name : str
52 The name of the current system being analyzed
54 threshold : :py:class:`float`, :py:class:`str`
55 This number is used to define positives and negatives from probability
56 maps, and report F1-scores (a priori). By default, we expect a set
57 named 'validation' to be available at the input data. If that is not
58 the case, we use 'train', if available. You may provide the name of
59 another dataset to be used for threshold tunning otherwise. If not
60 set, or a string is input, threshold tunning is done per system,
61 individually. Optionally, you may also provide a floating-point number
62 between [0.0, 1.0] as the threshold to use for both systems.
64 evaluate : str
65 Name of the dataset key to use from ``dataset`` to evaluate (typically,
66 ``test``)
68 preddir : str
69 Root path to the predictions generated by system ``system_name``. The
70 final subpath inside ``preddir`` that will be used will have the value
71 of this variable suffixed with the value of ``evaluate``. We will
72 search for ``<preddir>/<evaluate>/<stems>.hdf5``.
74 dataset : dict
75 A dictionary mapping string keys to
76 :py:class:`torch.utils.data.dataset.Dataset` instances
78 steps : int
79 The number of threshold steps to consider when evaluating the highest
80 possible F1-score on train/test data.
82 size : tuple
83 Two values indicating the size of windows to be used for the sliding
84 window analysis. The values represent height and width respectively
86 stride : tuple
87 Two values indicating the stride of windows to be used for the sliding
88 window analysis. The values represent height and width respectively
90 outdir : str
91 Path where to store visualizations. If set to ``None``, then do not
92 store performance visualizations.
94 figure : str
95 The name of a performance figure (e.g. ``f1_score``, ``jaccard``, or
96 ``accuracy``) to use when comparing performances
98 nproc : int
99 Sets the number of parallel processes to use when running using
100 multiprocessing. A value of zero uses all reported cores. A value of
101 ``1`` avoids completely the use of multiprocessing and runs all chores
102 in the current processing context.
104 checkpointdir : str
105 If set to a string (instead of ``None``), then stores a cached version
106 of the sliding window performances on disk, for a particular system.
109 Returns
110 =======
112 d : dict
113 A dictionary in which keys are filename stems and values are
114 dictionaries with the following contents:
116 ``winperf``: numpy.ndarray
117 A dataframe with all the sliding window performances aggregated,
118 for all input images.
120 ``n`` : numpy.ndarray
121 A 2D numpy array containing the number of performance scores for
122 every pixel in the original image
124 ``avg`` : numpy.ndarray
125 A 2D numpy array containing the average performances for every
126 pixel on the input image considering the sliding window sizes and
127 strides applied to the image
129 ``std`` : numpy.ndarray
130 A 2D numpy array containing the (unbiased) standard deviations for
131 the provided performance figure, for every pixel on the input image
132 considering the sliding window sizes and strides applied to the
133 image
135 """

    if checkpointdir is not None:
        chkpt_fname = os.path.join(
            checkpointdir,
            f"{system_name}-{evaluate}-{threshold}-"
            f"{size[0]}x{size[1]}+{stride[0]}x{stride[1]}-{figure}.pkl.gz",
        )
        os.makedirs(os.path.dirname(chkpt_fname), exist_ok=True)
        if os.path.exists(chkpt_fname):
            logger.info(f"Loading checkpoint from {chkpt_fname}...")
            # loads and returns checkpoint from file
            try:
                with gzip.GzipFile(chkpt_fname, "r") as f:
                    return pickle.load(f)
            except EOFError as e:
                logger.warning(
                    f"Could not load sliding window performance "
                    f"from {chkpt_fname}: {e}. Calculating..."
                )
        else:
            logger.debug(
                f"Checkpoint not available at {chkpt_fname}. Calculating..."
            )
    else:
        chkpt_fname = None

    if not isinstance(threshold, float):

        assert threshold in dataset, f"No dataset named '{threshold}'"

        logger.info(
            f"Evaluating threshold on '{threshold}' set for "
            f"'{system_name}' using {steps} steps"
        )
        threshold = run_evaluation(
            dataset[threshold], threshold, preddir, steps=steps
        )
        logger.info(f"Set --threshold={threshold:.5f} for '{system_name}'")

    # for a given threshold on each system, calculate sliding window
    # performances
    logger.info(
        f"Evaluating sliding window '{figure}' on '{evaluate}' set for "
        f"'{system_name}' using windows of size {size} and stride {stride}"
    )

    retval = sliding_window_performances(
        dataset,
        evaluate,
        preddir,
        threshold,
        size,
        stride,
        figure,
        nproc,
        outdir,
    )

    # cache sliding window performance for later use, if necessary
    if chkpt_fname is not None:
        logger.debug(f"Storing checkpoint at {chkpt_fname}...")
        with gzip.GzipFile(chkpt_fname, "w") as f:
            pickle.dump(retval, f)

    return retval
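
# A minimal usage sketch for _eval_sliding_windows (the prediction path and
# dataset object are hypothetical; any supported performance figure works):
#
#   perf = _eval_sliding_windows(
#       "system1", "validation", "test", "/path/to/predictions/system-1",
#       dataset, 1000, (128, 128), (32, 32), None, "accuracy", 1, None,
#   )
#   for stem, stats in perf.items():
#       print(stem, stats["avg"].shape, stats["std"].shape)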


def _eval_differences(
    names,
    perfs,
    evaluate,
    dataset,
    size,
    stride,
    outdir,
    figure,
    nproc,
    checkpointdir,
):
214 """Evaluate differences in the performance sliding windows between two systems
216 Parameters
217 ----------
219 names : :py:class:`tuple` of :py:class:`str`
220 Names of the first and second systems
222 perfs : :py:class:`tuple` of :py:class:`dict`
223 Dictionaries for the sliding window performances of each system, as
224 returned by :py:func:`_eval_sliding_windows`
226 evaluate : str
227 Name of the dataset key to use from ``dataset`` to evaluate (typically,
228 ``test``)
230 dataset : dict
231 A dictionary mapping string keys to
232 :py:class:`torch.utils.data.dataset.Dataset` instances
234 size : tuple
235 Two values indicating the size of windows to be used for sliding window
236 analysis. The values represent height and width respectively
238 stride : tuple
239 Two values indicating the stride of windows to be used for sliding
240 window analysis. The values represent height and width respectively
242 outdir : str
243 If set to ``None``, then do not output performance visualizations.
244 Otherwise, in directory ``outdir``, dumps the visualizations for the
245 performance differences between both systems.
247 figure : str
248 The name of a performance figure (e.g. ``f1_score``, or ``jaccard``) to
249 use when comparing performances
251 nproc : int
252 Sets the number of parallel processes to use when running using
253 multiprocessing. A value of zero uses all reported cores. A value of
254 ``1`` avoids completely the use of multiprocessing and runs all chores
255 in the current processing context.
257 checkpointdir : str
258 If set to a string (instead of ``None``), then stores a cached version
259 of the sliding window performances on disk, for a particular difference
260 between systems.
263 Returns
264 -------
266 d : dict
267 A dictionary representing sliding window performance differences across
268 all files and sliding windows. The format of this is similar to the
269 individual inputs ``perf1`` and ``perf2``.
271 """

    if checkpointdir is not None:
        chkpt_fname = os.path.join(
            checkpointdir,
            f"{names[0]}-{names[1]}-{evaluate}-"
            f"{size[0]}x{size[1]}+{stride[0]}x{stride[1]}-{figure}.pkl.gz",
        )
        os.makedirs(os.path.dirname(chkpt_fname), exist_ok=True)
        if os.path.exists(chkpt_fname):
            logger.info(f"Loading checkpoint from {chkpt_fname}...")
            # loads and returns checkpoint from file
            try:
                with gzip.GzipFile(chkpt_fname, "r") as f:
                    return pickle.load(f)
            except EOFError as e:
                logger.warning(
                    f"Could not load sliding window performance "
                    f"from {chkpt_fname}: {e}. Calculating..."
                )
        else:
            logger.debug(
                f"Checkpoint not available at {chkpt_fname}. Calculating..."
            )
    else:
        chkpt_fname = None

    perf_diff = {
        k: perfs[0][k]["winperf"] - perfs[1][k]["winperf"] for k in perfs[0]
    }

    # calculates per-stem sliding window performance differences for the
    # given threshold
    logger.info(
        f"Evaluating sliding window '{figure}' differences on '{evaluate}' "
        f"set on '{names[0]}-{names[1]}' using windows of size {size} and "
        f"stride {stride}"
    )

    retval = visual_performances(
        dataset,
        evaluate,
        perf_diff,
        size,
        stride,
        figure,
        nproc,
        outdir,
    )

    # cache sliding window performance for later use, if necessary
    if chkpt_fname is not None:
        logger.debug(f"Storing checkpoint at {chkpt_fname}...")
        with gzip.GzipFile(chkpt_fname, "w") as f:
            pickle.dump(retval, f)

    return retval
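
# A minimal sketch of the difference evaluation (hypothetical inputs; perf1
# and perf2 as returned by _eval_sliding_windows for each system):
#
#   diff = _eval_differences(
#       ("system1", "system2"), (perf1, perf2), "test", dataset,
#       (128, 128), (32, 32), None, "accuracy", 1, None,
#   )
#   # diff[stem] derives from perf1[stem]["winperf"] - perf2[stem]["winperf"]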


@click.command(
    entry_point_group="bob.ip.binseg.config",
    cls=ConfigCommand,
    epilog="""Examples:

\b
    1. Runs a significance test using as base the calculated predictions of
       two different systems, on the **same** dataset:
\b
       $ bob binseg significance -vv drive --names system1 system2 --predictions=path/to/predictions/system-1 path/to/predictions/system-2
\b
    2. By default, we use a "validation" dataset if it is available, to infer
       the a priori threshold for the comparison of two systems. Otherwise,
       you may need to specify the name of a set to be used as validation set
       for choosing a threshold. The same goes for the set to be used for
       testing the hypothesis - by default we use the "test" dataset if it is
       available; otherwise, specify one.
\b
       $ bob binseg significance -vv drive --names system1 system2 --predictions=path/to/predictions/system-1 path/to/predictions/system-2 --threshold=train --evaluate=alternate-test
""",
)
@click.option(
    "--names",
    "-n",
    help="Names of the two systems to compare",
    nargs=2,
    required=True,
    type=str,
    cls=ResourceOption,
)
@click.option(
    "--predictions",
    "-p",
    help="Paths where the predictions of systems 1 and 2 are currently "
    "stored. You may also input predictions from a second-annotator. This "
    "application will adequately handle it.",
    nargs=2,
    required=True,
    type=click.Path(exists=True, file_okay=False, dir_okay=True),
    cls=ResourceOption,
)
@click.option(
    "--dataset",
    "-d",
    help="A dictionary mapping string keys to "
    "torch.utils.data.dataset.Dataset instances",
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--threshold",
    "-t",
    help="This number is used to define positives and negatives from "
    "probability maps, and report F1-scores (a priori). By default, we "
    "expect a set named 'validation' to be available at the input data. "
    "If that is not the case, we use 'train', if available. You may provide "
    "the name of another dataset to be used for threshold tuning otherwise. "
    "If not set, or a string is input, threshold tuning is done per system, "
    "individually. Optionally, you may also provide a floating-point number "
    "between [0.0, 1.0] as the threshold to use for both systems.",
    default="validation",
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--evaluate",
    "-e",
    help="Name of the dataset key to evaluate (typically, 'test')",
    default="test",
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--steps",
    "-S",
    help="Number of threshold steps to consider when evaluating the highest "
    "possible F1-score on train/test data.",
    default=1000,
    type=int,
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--size",
    "-s",
    help="This is a tuple with two values indicating the size of windows to "
    "be used for sliding window analysis. The values represent height and "
    "width respectively.",
    default=(128, 128),
    nargs=2,
    type=int,
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--stride",
    "-T",
    help="This is a tuple with two values indicating the stride of windows "
    "to be used for sliding window analysis. The values represent height and "
    "width respectively.",
    default=(32, 32),
    nargs=2,
    type=int,
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--figure",
    "-f",
    help="The name of a performance figure (e.g. f1_score, or jaccard) to "
    "use when comparing performances",
    default="accuracy",
    type=str,
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--output-folder",
    "-o",
    help="Path where to store visualizations",
    required=False,
    type=click.Path(),
    show_default=True,
    cls=ResourceOption,
)
@click.option(
    "--remove-outliers/--no-remove-outliers",
    "-R",
    help="If set, removes outliers from both score distributions before "
    "running statistical analysis. Outlier removal follows a 1.5 IQR range "
    "check from the difference in figures between both systems and assumes "
    "most of the distribution is contained within that range (like in a "
    "normal distribution)",
    default=False,
    required=True,
    show_default=True,
    cls=ResourceOption,
)
@click.option(
    "--remove-zeros/--no-remove-zeros",
    "-Z",
    help="If set, removes instances from the statistical analysis in which "
    "both systems had a performance equal to zero.",
    default=False,
    required=True,
    show_default=True,
    cls=ResourceOption,
)
@click.option(
    "--parallel",
    "-x",
    help="Set the number of parallel processes to use when running using "
    "multiprocessing. A value of zero uses all reported cores; a value of "
    "one avoids multiprocessing altogether.",
    default=1,
    type=int,
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--checkpoint-folder",
    "-k",
    help="Path where to store checkpointed versions of sliding window "
    "performances",
    required=False,
    type=click.Path(),
    show_default=True,
    cls=ResourceOption,
)
@verbosity_option(cls=ResourceOption)
def significance(
    names,
    predictions,
    dataset,
    threshold,
    evaluate,
    steps,
    size,
    stride,
    figure,
    output_folder,
    remove_outliers,
    remove_zeros,
    parallel,
    checkpoint_folder,
    **kwargs,
):
522 """Evaluates how significantly different are two models on the same dataset
524 This application calculates the significance of results of two models
525 operating on the same dataset, and subject to a priori threshold tunning.
526 """

    # minimal validation to startup
    threshold = _validate_threshold(threshold, dataset)
    assert evaluate in dataset, f"No dataset named '{evaluate}'"

    perf1 = _eval_sliding_windows(
        names[0],
        threshold,
        evaluate,
        predictions[0],
        dataset,
        steps,
        size,
        stride,
        (
            output_folder
            if output_folder is None
            else os.path.join(output_folder, names[0])
        ),
        figure,
        parallel,
        checkpoint_folder,
    )

    perf2 = _eval_sliding_windows(
        names[1],
        threshold,
        evaluate,
        predictions[1],
        dataset,
        steps,
        size,
        stride,
        (
            output_folder
            if output_folder is None
            else os.path.join(output_folder, names[1])
        ),
        figure,
        parallel,
        checkpoint_folder,
    )

    # perf_diff = _eval_differences(
    #     names,
    #     (perf1, perf2),
    #     evaluate,
    #     dataset,
    #     size,
    #     stride,
    #     (
    #         output_folder
    #         if output_folder is None
    #         else os.path.join(output_folder, "diff")
    #     ),
    #     figure,
    #     parallel,
    #     checkpoint_folder,
    # )

    # loads all figures for the given threshold
    stems = list(perf1.keys())
    figindex = PERFORMANCE_FIGURES.index(figure)
    da = numpy.array([perf1[k]["winperf"][figindex] for k in stems]).flatten()
    db = numpy.array([perf2[k]["winperf"][figindex] for k in stems]).flatten()
    diff = da - db
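
    # ``da`` and ``db`` are 1D score vectors with one entry per sliding
    # window position, concatenated over all input images; ``diff`` is their
    # paired difference.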

    while remove_outliers:
        outliers_diff = index_of_outliers(diff)
        if sum(outliers_diff) == 0:
            break
        diff = diff[~outliers_diff]
        da = da[~outliers_diff]
        db = db[~outliers_diff]
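
    # ``index_of_outliers`` is assumed (per the --remove-outliers help above)
    # to flag samples outside the 1.5*IQR whisker range.  A minimal numpy
    # sketch of that rule, for reference only:
    #
    #   def iqr_outliers(x):
    #       q1, q3 = numpy.percentile(x, (25, 75))
    #       low = q1 - 1.5 * (q3 - q1)
    #       high = q3 + 1.5 * (q3 - q1)
    #       return (x < low) | (x > high)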

    if remove_zeros:
        # mask out samples in which both systems scored exactly zero
        zero_mask = (da == 0) & (db == 0)
        diff = diff[~zero_mask]
        da = da[~zero_mask]
        db = db[~zero_mask]

    if output_folder is not None:
        fname = os.path.join(output_folder, "analysis.pdf")
        os.makedirs(os.path.dirname(fname), exist_ok=True)
        logger.info(f"Writing analysis figures to {fname} (multipage PDF)...")
        write_analysis_figures(names, da, db, fname)

    if output_folder is not None:
        fname = os.path.join(output_folder, "analysis.txt")
        os.makedirs(os.path.dirname(fname), exist_ok=True)
        logger.info(f"Writing analysis summary to {fname}...")
        with open(fname, "wt") as f:
            write_analysis_text(names, da, db, f)

    write_analysis_text(names, da, db, sys.stdout)
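
# When --output-folder is given, the command above writes (per the code):
#
#   <output-folder>/analysis.pdf    -- multipage PDF with analysis figures
#   <output-folder>/analysis.txt    -- text summary, also echoed to stdout
#   <output-folder>/<system-name>/  -- per-system visualizations, if enabled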