Coverage for src/deepdraw/engine/evaluator.py: 99%
172 statements
# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-License-Identifier: GPL-3.0-or-later

"""Defines functionality for the evaluation of predictions."""

import itertools
import logging
import multiprocessing
import os

import h5py
import numpy
import pandas
import PIL.Image
import PIL.ImageOps
import torch
import torch.nn.functional
import torchvision.transforms.functional as VF

from tqdm import tqdm

from ..utils.measure import base_measures, bayesian_measures

logger = logging.getLogger(__name__)


def _posneg(pred, gt, threshold):
    """Calculates true and false positives and negatives.

    Parameters
    ----------

    pred : torch.Tensor
        pixel-wise predictions

    gt : torch.Tensor
        ground-truth (annotations)

    threshold : float
        a particular threshold at which to calculate the performance
        measures


    Returns
    -------

    tp_tensor : torch.Tensor
        boolean tensor with true positives, considering all observations

    fp_tensor : torch.Tensor
        boolean tensor with false positives, considering all observations

    tn_tensor : torch.Tensor
        boolean tensor with true negatives, considering all observations

    fn_tensor : torch.Tensor
        boolean tensor with false negatives, considering all observations
    """

    gt = gt.byte()  # byte tensor

    # threshold
    binary_pred = torch.gt(pred, threshold).byte()

    # equals and not-equals
    equals = torch.eq(binary_pred, gt).type(torch.uint8)  # tensor
    notequals = torch.ne(binary_pred, gt).type(torch.uint8)  # tensor

    # true positives
    tp_tensor = gt * binary_pred

    # false positives
    fp_tensor = torch.eq((binary_pred + tp_tensor), 1).byte()

    # true negatives
    tn_tensor = equals - tp_tensor

    # false negatives
    fn_tensor = notequals - fp_tensor

    return tp_tensor, fp_tensor, tn_tensor, fn_tensor
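
# A minimal sketch of how _posneg() decomposes a thresholded prediction into
# TP/FP/TN/FN tensors (hypothetical values; kept as a comment so nothing runs
# at import time):
#
#     pred = torch.tensor([0.9, 0.2, 0.7, 0.1])
#     gt = torch.tensor([1, 0, 0, 1])
#     tp, fp, tn, fn = _posneg(pred, gt, 0.5)
#     # tp -> [1, 0, 0, 0], fp -> [0, 0, 1, 0]
#     # tn -> [0, 1, 0, 0], fn -> [0, 0, 0, 1]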


def sample_measures_for_threshold(pred, gt, mask, threshold):
    """Calculates counts on one single sample, for a specific threshold.

    Parameters
    ----------

    pred : torch.Tensor
        pixel-wise predictions

    gt : torch.Tensor
        ground-truth (annotations)

    mask : torch.Tensor
        region mask (used only if available). May be set to ``None``.

    threshold : float
        a particular threshold at which to calculate the performance
        measures


    Returns
    -------

    tp : int
        number of true positives

    fp : int
        number of false positives

    tn : int
        number of true negatives

    fn : int
        number of false negatives
    """

    tp_tensor, fp_tensor, tn_tensor, fn_tensor = _posneg(pred, gt, threshold)

    # if a mask is provided, consider only TP/FP/TN/FN **within** the region
    # of interest defined by the mask
    if mask is not None:
        antimask = torch.le(mask, 0.5)
        tp_tensor[antimask] = 0
        fp_tensor[antimask] = 0
        tn_tensor[antimask] = 0
        fn_tensor[antimask] = 0

    # reduce the (masked) tensors to scalar counts
    tp_count = torch.sum(tp_tensor).item()
    fp_count = torch.sum(fp_tensor).item()
    tn_count = torch.sum(tn_tensor).item()
    fn_count = torch.sum(fn_tensor).item()

    return tp_count, fp_count, tn_count, fn_count
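
# A small sketch of counting at a fixed threshold with a region-of-interest
# mask (hypothetical tensors, not part of this module):
#
#     pred = torch.tensor([[0.9, 0.2], [0.7, 0.1]])
#     gt = torch.tensor([[1, 0], [0, 1]])
#     mask = torch.tensor([[1.0, 1.0], [0.0, 0.0]])  # bottom row is ignored
#     tp, fp, tn, fn = sample_measures_for_threshold(pred, gt, mask, 0.5)
#     # only the top row is counted: tp == 1, fp == 0, tn == 1, fn == 0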


def _sample_measures(pred, gt, mask, steps):
    """Calculates measures on one single sample.

    Parameters
    ----------

    pred : torch.Tensor
        pixel-wise predictions

    gt : torch.Tensor
        ground-truth (annotations)

    mask : torch.Tensor
        region mask (used only if available). May be set to ``None``.

    steps : int
        number of steps to use for threshold analysis. The step size is
        computed as ``1.0/steps``


    Returns
    -------

    measures : pandas.DataFrame

        A pandas dataframe with the following columns:

        * threshold: float
        * tp: int
        * fp: int
        * tn: int
        * fn: int
    """

    step_size = 1.0 / steps
    data = [
        (index, threshold)
        + sample_measures_for_threshold(pred, gt, mask, threshold)
        for index, threshold in enumerate(numpy.arange(0.0, 1.0, step_size))
    ]

    retval = pandas.DataFrame(
        data,
        columns=(
            "index",
            "threshold",
            "tp",
            "fp",
            "tn",
            "fn",
        ),
    )
    retval.set_index("index", inplace=True)
    return retval
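
# _sample_measures() sweeps numpy.arange(0.0, 1.0, 1.0/steps), so e.g.
# steps=4 evaluates thresholds 0.00, 0.25, 0.50 and 0.75. A sketch of the
# resulting frame (hypothetical inputs):
#
#     df = _sample_measures(pred, gt, None, 4)
#     # df has 4 rows indexed 0..3, with columns
#     # "threshold", "tp", "fp", "tn" and "fn"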


def _sample_analysis(
    img,
    pred,
    gt,
    mask,
    threshold,
    tp_color=(0, 255, 0),  # green (alternative: (128, 128, 128) gray)
    fp_color=(0, 0, 255),  # blue (alternative: (70, 240, 240) cyan)
    fn_color=(255, 0, 0),  # red (alternative: (245, 130, 48) orange)
    overlay=True,
):
202 """Visualizes true positives, false positives and false negatives.
204 Parameters
205 ----------
207 img : torch.Tensor
208 original image
210 pred : torch.Tensor
211 pixel-wise predictions
213 gt : torch.Tensor
214 ground-truth (annotations)
216 mask : torch.Tensor
217 region mask (used only if available). May be set to ``None``.
219 threshold : float
220 The threshold to be used while analyzing this image's probability map
222 tp_color : tuple
223 RGB value for true positives
225 fp_color : tuple
226 RGB value for false positives
228 fn_color : tuple
229 RGB value for false negatives
231 overlay : :py:class:`bool`, Optional
232 If set to ``True`` (which is the default), then overlay annotations on
233 top of the image. Otherwise, represent data on a black canvas.
236 Returns
237 -------
239 figure : PIL.Image.Image
241 A PIL image that contains the overlayed analysis of true-positives
242 (TP), false-positives (FP) and false negatives (FN).
243 """

    tp_tensor, fp_tensor, tn_tensor, fn_tensor = _posneg(pred, gt, threshold)

    # if a mask is provided, consider only TP/FP/TN/FN **within** the region
    # of interest defined by the mask
    if mask is not None:
        antimask = torch.le(mask, 0.5)
        tp_tensor[antimask] = 0
        fp_tensor[antimask] = 0
        tn_tensor[antimask] = 0
        fn_tensor[antimask] = 0

    # change to PIL representation
    tp_pil = VF.to_pil_image(tp_tensor.float())
    tp_pil_colored = PIL.ImageOps.colorize(tp_pil, (0, 0, 0), tp_color)

    fp_pil = VF.to_pil_image(fp_tensor.float())
    fp_pil_colored = PIL.ImageOps.colorize(fp_pil, (0, 0, 0), fp_color)

    fn_pil = VF.to_pil_image(fn_tensor.float())
    fn_pil_colored = PIL.ImageOps.colorize(fn_pil, (0, 0, 0), fn_color)

    tp_pil_colored.paste(fp_pil_colored, mask=fp_pil)
    tp_pil_colored.paste(fn_pil_colored, mask=fn_pil)

    if overlay:
        img = VF.to_pil_image(img)  # PIL Image
        # blend fades the original image into the overlay; otherwise its
        # brightness may obscure the colors of the vessel map
        tp_pil_colored = PIL.Image.blend(img, tp_pil_colored, 0.5)

    return tp_pil_colored
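
# A hedged sketch of producing one overlay image (hypothetical tensors, in
# the usual CxHxW torch convention):
#
#     viz = _sample_analysis(img, pred, gt, mask, threshold=0.5)
#     viz.save("analysis.png")  # TP in green, FP in blue, FN in red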


def _summarize(data):
    """Summarizes collected dataframes and adds Bayesian figures."""

    _entries = (
        "mean_precision",
        "mode_precision",
        "lower_precision",
        "upper_precision",
        "mean_recall",
        "mode_recall",
        "lower_recall",
        "upper_recall",
        "mean_specificity",
        "mode_specificity",
        "lower_specificity",
        "upper_specificity",
        "mean_accuracy",
        "mode_accuracy",
        "lower_accuracy",
        "upper_accuracy",
        "mean_jaccard",
        "mode_jaccard",
        "lower_jaccard",
        "upper_jaccard",
        "mean_f1_score",
        "mode_f1_score",
        "lower_f1_score",
        "upper_f1_score",
        "frequentist_precision",
        "frequentist_recall",
        "frequentist_specificity",
        "frequentist_accuracy",
        "frequentist_jaccard",
        "frequentist_f1_score",
    )

    def _row_summary(r):
        # run bayesian_measures(), flatten tuple of tuples, name entries
        bayesian = [
            item
            for sublist in bayesian_measures(
                r.tp,
                r.fp,
                r.tn,
                r.fn,
                lambda_=0.5,
                coverage=0.95,
            )
            for item in sublist
        ]

        # evaluate frequentist measures
        frequentist = base_measures(r.tp, r.fp, r.tn, r.fn)
        return pandas.Series(bayesian + list(frequentist), index=_entries)

    # Merges all dataframes together
    sums = pandas.concat(data.values()).groupby("index").sum()
    sums["threshold"] /= len(data)

    # create a new dataframe with these
    measures = sums.apply(lambda r: _row_summary(r), axis=1)

    # merge sums and measures into a single dataframe
    return pandas.concat([sums, measures.reindex(sums.index)], axis=1).copy()
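
# _summarize() expects a dict mapping sample stems to the per-sample frames
# produced by _sample_measures(): counts are summed per threshold index,
# thresholds are averaged, and Bayesian plus frequentist figures appended.
# A sketch (hypothetical inputs):
#
#     summary = _summarize({"img1": df1, "img2": df2})
#     # one row per threshold, with tp/fp/tn/fn sums plus mean/mode/lower/
#     # upper and frequentist columns for each measure in _entries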


def _evaluate_sample_worker(args):
    """Runs all of the evaluation steps on a single sample.

    Parameters
    ----------

    args : tuple
        A tuple containing the following sub-arguments:

        sample : tuple
            Sample to be processed, containing the stem of the filepath
            relative to the database root, the image, the ground-truth, and
            possibly the mask to define the region of interest to be
            processed.

        name : str
            the local name of the dataset (e.g. ``train``, or ``test``), to
            be used when saving measures files.

        steps : :py:class:`int`, Optional
            number of threshold steps to consider when evaluating thresholds.

        threshold : :py:class:`float`, Optional
            if ``overlayed_folder`` is set, then this should be the threshold
            (floating point) to apply to prediction maps to decide on
            positives and negatives for overlaying analysis (graphical
            output). This number should come from the training set or a
            separate validation set. Using a test set value may bias your
            analysis. This number is also used to print the a priori
            F1-score on the evaluated set.

        use_predictions_folder : str
            Folder where predictions for the dataset images have been
            previously stored

        output_folder : str, None
            If not ``None``, then outputs a copy of the evaluation for this
            sample in CSV format at this directory, but respecting the
            sample ``stem``.

        overlayed_folder : str, None
            If not ``None``, then outputs a version of the input image with
            predictions overlaid, in PNG format, but respecting the sample
            ``stem``.


    Returns
    -------

    stem : str
        The unique sample stem

    data : pandas.DataFrame
        Dataframe containing the evaluation performance on this single
        sample
    """

    (
        sample,
        name,
        steps,
        threshold,
        use_predictions_folder,
        output_folder,
        overlayed_folder,
    ) = args

    stem = sample[0]
    image = sample[1]
    gt = sample[2]
    mask = None if len(sample) <= 3 else sample[3]
    pred_fullpath = os.path.join(use_predictions_folder, stem + ".hdf5")
    with h5py.File(pred_fullpath, "r") as f:
        pred = f["array"][:]
    pred = torch.from_numpy(pred)
    retval = _sample_measures(pred, gt, mask, steps)

    if output_folder is not None:
        fullpath = os.path.join(output_folder, name, f"{stem}.csv")
        tqdm.write(f"Saving {fullpath}...")
        os.makedirs(os.path.dirname(fullpath), exist_ok=True)
        retval.to_csv(fullpath)

    if overlayed_folder is not None:
        overlay_image = _sample_analysis(
            image, pred, gt, mask, threshold=threshold, overlay=True
        )
        fullpath = os.path.join(overlayed_folder, name, f"{stem}.png")
        tqdm.write(f"Saving {fullpath}...")
        os.makedirs(os.path.dirname(fullpath), exist_ok=True)
        overlay_image.save(fullpath)

    return stem, retval
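
# The worker takes a single packed tuple so it can be fed straight through
# multiprocessing.Pool.imap_unordered(). A sketch (hypothetical values):
#
#     stem, df = _evaluate_sample_worker(
#         (sample, "test", 1000, None, "predictions/test", None, None)
#     )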


def run(
    dataset,
    name,
    predictions_folder,
    output_folder=None,
    overlayed_folder=None,
    threshold=None,
    steps=1000,
    parallel=-1,
):
446 """Runs inference and calculates measures.
448 Parameters
449 ---------
451 dataset : py:class:`torch.utils.data.Dataset`
452 a dataset to iterate on
454 name : str
455 the local name of this dataset (e.g. ``train``, or ``test``), to be
456 used when saving measures files.
458 predictions_folder : str
459 folder where predictions for the dataset images have been previously
460 stored
462 output_folder : :py:class:`str`, Optional
463 folder where to store results. If not provided, then do not store any
464 analysis (useful for quickly calculating overlay thresholds)
466 overlayed_folder : :py:class:`str`, Optional
467 if not ``None``, then it should be the name of a folder where to store
468 overlayed versions of the images and ground-truths
470 threshold : :py:class:`float`, Optional
471 if ``overlayed_folder``, then this should be threshold (floating point)
472 to apply to prediction maps to decide on positives and negatives for
473 overlaying analysis (graphical output). This number should come from
474 the training set or a separate validation set. Using a test set value
475 may bias your analysis. This number is also used to print the a priori
476 F1-score on the evaluated set.
478 steps : :py:class:`float`, Optional
479 number of threshold steps to consider when evaluating thresholds.
481 parallel : :py:class:`int`, Optional
482 If set to a value different >= 0, uses multiprocessing for estimating
483 thresholds for each sample through a processing pool. A value of zero
484 will create as many processes in the pool as cores in the machine. A
485 negative value disables multiprocessing altogether. A value greater
486 than zero will spawn as many processes as requested.
489 Returns
490 -------
492 threshold : float
493 Threshold to achieve the highest possible F1-score for this dataset
494 """

    # Collect overall measures
    data = {}

    use_predictions_folder = os.path.join(predictions_folder, name)
    if not os.path.exists(use_predictions_folder):
        use_predictions_folder = predictions_folder

    if parallel < 0:  # turns off multiprocessing
        for sample in tqdm(dataset, desc="sample"):
            k, v = _evaluate_sample_worker(
                (
                    sample,
                    name,
                    steps,
                    threshold,
                    use_predictions_folder,
                    output_folder,
                    overlayed_folder,
                )
            )
            data[k] = v
    else:
        parallel = parallel or multiprocessing.cpu_count()
        with multiprocessing.Pool(processes=parallel) as pool, tqdm(
            total=len(dataset),
            desc="sample",
        ) as pbar:
            for k, v in pool.imap_unordered(
                _evaluate_sample_worker,
                zip(
                    dataset,
                    itertools.repeat(name),
                    itertools.repeat(steps),
                    itertools.repeat(threshold),
                    itertools.repeat(use_predictions_folder),
                    itertools.repeat(output_folder),
                    itertools.repeat(overlayed_folder),
                ),
            ):
                pbar.update()
                data[k] = v

    # Merges all dataframes together
    measures = _summarize(data)

    maxf1 = measures["mean_f1_score"].max()
    maxf1_index = measures["mean_f1_score"].idxmax()
    maxf1_threshold = measures["threshold"][maxf1_index]

    logger.info(
        f"Maximum F1-score of {maxf1:.5f}, achieved at "
        f"threshold {maxf1_threshold:.3f} (chosen *a posteriori*)"
    )

    if threshold is not None:
        # get the closest possible threshold we have
        index = int(round(steps * threshold))
        f1_a_priori = measures["mean_f1_score"][index]
        actual_threshold = measures["threshold"][index]

        # mark the threshold chosen a priori on this dataset
        measures["threshold_a_priori"] = False
        measures.loc[index, "threshold_a_priori"] = True

        logger.info(
            f"F1-score of {f1_a_priori:.5f}, at threshold "
            f"{actual_threshold:.3f} (chosen *a priori*)"
        )

    if output_folder is not None:
        logger.info(f"Output folder: {output_folder}")
        os.makedirs(output_folder, exist_ok=True)
        measures_path = os.path.join(output_folder, f"{name}.csv")
        logger.info(
            f"Saving measures over all input images at {measures_path}..."
        )
        measures.to_csv(measures_path)

    return maxf1_threshold
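
# A hedged usage sketch for run() (hypothetical dataset and paths):
#
#     best = run(dataset, "train", "predictions",
#                output_folder="analysis", steps=1000, parallel=0)
#     # "best" is the threshold maximizing the mean F1-score on "train";
#     # it can be passed as "threshold" when evaluating the test set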


def _compare_annotators_worker(args):
    """Runs all of the comparison steps on a single sample pair.

    Parameters
    ----------

    args : tuple
        A tuple containing the following sub-arguments:

        baseline_sample : tuple
            Baseline sample to be processed, containing the stem of the
            filepath relative to the database root, the image, the
            ground-truth, and possibly the mask to define the region of
            interest to be processed.

        other_sample : tuple
            Another sample that is identical to the first, but carries a
            ground-truth annotation drawn by a different annotator

        name : str
            the local name of the dataset (e.g. ``train``, or ``test``), to
            be used when saving measures files.

        output_folder : str, None
            If not ``None``, then outputs a copy of the evaluation for this
            sample in CSV format at this directory, but respecting the
            sample ``stem``.

        overlayed_folder : str, None
            If not ``None``, then outputs a version of the input image with
            predictions overlaid, in PNG format, but respecting the sample
            ``stem``.


    Returns
    -------

    stem : str
        The unique sample stem

    data : pandas.DataFrame
        Dataframe containing the evaluation performance on this single
        sample
    """

    (
        baseline_sample,
        other_sample,
        name,
        output_folder,
        overlayed_folder,
    ) = args

    assert baseline_sample[0] == other_sample[0], (
        f"Mismatch between datasets for second-annotator analysis "
        f"({baseline_sample[0]} != {other_sample[0]}). This typically "
        f"occurs when the second annotator (`other`) comes from a "
        f"different dataset than the `baseline` dataset"
    )

    stem = baseline_sample[0]
    image = baseline_sample[1]
    gt = baseline_sample[2]
    pred = other_sample[2]  # works as a prediction
    mask = None if len(baseline_sample) < 4 else baseline_sample[3]
    retval = _sample_measures(pred, gt, mask, 2)

    if output_folder is not None:
        fullpath = os.path.join(
            output_folder, "second-annotator", name, f"{stem}.csv"
        )
        tqdm.write(f"Saving {fullpath}...")
        os.makedirs(os.path.dirname(fullpath), exist_ok=True)
        retval.to_csv(fullpath)

    if overlayed_folder is not None:
        overlay_image = _sample_analysis(
            image, pred, gt, mask, threshold=0.5, overlay=True
        )
        fullpath = os.path.join(
            overlayed_folder, "second-annotator", name, f"{stem}.png"
        )
        tqdm.write(f"Saving {fullpath}...")
        os.makedirs(os.path.dirname(fullpath), exist_ok=True)
        overlay_image.save(fullpath)

    return stem, retval
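
# Note: this worker calls _sample_measures(pred, gt, mask, 2), so only the
# thresholds 0.0 and 0.5 are evaluated; since annotations are binary, 0.5 is
# the meaningful operating point (compare_annotators() later drops the 0.0
# row).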


def compare_annotators(
    baseline,
    other,
    name,
    output_folder,
    overlayed_folder=None,
    parallel=-1,
):
673 """Compares annotations on the **same** dataset.
675 Parameters
676 ---------
678 baseline : py:class:`torch.utils.data.Dataset`
679 a dataset to iterate on, containing the baseline annotations
681 other : py:class:`torch.utils.data.Dataset`
682 a second dataset, with the same samples as ``baseline``, but annotated
683 by a different annotator than in the first dataset. The key values
684 must much between ``baseline`` and this dataset.
686 name : str
687 the local name of this dataset (e.g. ``train-second-annotator``, or
688 ``test-second-annotator``), to be used when saving measures files.
690 output_folder : str
691 folder where to store results
693 overlayed_folder : :py:class:`str`, Optional
694 if not ``None``, then it should be the name of a folder where to store
695 overlayed versions of the images and ground-truths
697 parallel : :py:class:`int`, Optional
698 If set to a value different >= 0, uses multiprocessing for estimating
699 thresholds for each sample through a processing pool. A value of zero
700 will create as many processes in the pool as cores in the machine. A
701 negative value disables multiprocessing altogether. A value greater
702 than zero will spawn as many processes as requested.
703 """

    logger.info(f"Output folder: {output_folder}")
    os.makedirs(output_folder, exist_ok=True)

    # Collect overall measures
    data = {}

    if parallel < 0:  # turns off multiprocessing
        for baseline_sample, other_sample in tqdm(
            list(zip(baseline, other)),
            desc="samples",
            leave=False,
            disable=None,
        ):
            k, v = _compare_annotators_worker(
                (
                    baseline_sample,
                    other_sample,
                    name,
                    output_folder,
                    overlayed_folder,
                )
            )
            data[k] = v
    else:
        parallel = parallel or multiprocessing.cpu_count()
        with multiprocessing.Pool(processes=parallel) as pool, tqdm(
            total=len(baseline),
            desc="sample",
        ) as pbar:
            for k, v in pool.imap_unordered(
                _compare_annotators_worker,
                zip(
                    baseline,
                    other,
                    itertools.repeat(name),
                    itertools.repeat(output_folder),
                    itertools.repeat(overlayed_folder),
                ),
            ):
                pbar.update()
                data[k] = v

    measures = _summarize(data)
    measures.drop(0, inplace=True)  # removes threshold == 0.0, keeps 0.5 only

    measures_path = os.path.join(
        output_folder, "second-annotator", f"{name}.csv"
    )
    os.makedirs(os.path.dirname(measures_path), exist_ok=True)
    logger.info(f"Saving summaries over all input images at {measures_path}...")
    measures.to_csv(measures_path)

    maxf1 = measures["mean_f1_score"].max()
    logger.info(f"F1-score of {maxf1:.5f} (second annotator; threshold=0.5)")