Coverage for src/deepdraw/script/evaluate.py: 85%
54 statements
coverage.py v7.3.1, created at 2023-11-30 15:00 +0100
# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-License-Identifier: GPL-3.0-or-later
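
"""Evaluate model predictions against annotated ground-truth."""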

import click

from clapper.click import ConfigCommand, ResourceOption, verbosity_option
from clapper.logging import setup

logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")

from ..engine.evaluator import compare_annotators, run


@click.command(
    entry_point_group="deepdraw.config",
    cls=ConfigCommand,
    epilog="""Examples:

\b
    1. Runs evaluation on an existing dataset configuration:

       .. code:: sh

          $ deepdraw evaluate -vv drive --predictions-folder=path/to/predictions --output-folder=path/to/results

\b
    2. To run evaluation on a folder with your own images and annotations, you
       must first specify resizing, cropping, etc., so that the images can be
       correctly input to the model.  Failing to do so will likely result in
       poor performance.  To figure out such specifications, you must consult
       the dataset configuration used for **training** the provided model.
       Once you have figured this out, do the following:

       .. code:: sh

          $ deepdraw config copy csv-dataset-example mydataset.py
          # modify "mydataset.py" to your liking
          $ deepdraw evaluate -vv mydataset.py --predictions-folder=path/to/predictions --output-folder=path/to/results
""",
)
@click.option(
    "--output-folder",
    "-o",
    help="Path where to store the analysis result (created if it does not exist)",
    required=True,
    default="results",
    type=click.Path(),
    cls=ResourceOption,
)
@click.option(
    "--predictions-folder",
    "-p",
    help="Path where predictions are currently stored",
    required=True,
    type=click.Path(exists=True, file_okay=False, dir_okay=True),
    cls=ResourceOption,
)
@click.option(
    "--dataset",
    "-d",
    help="A torch.utils.data.dataset.Dataset instance implementing a dataset "
    "to be used for evaluation purposes, possibly including all pre-processing "
    "pipelines required or, optionally, a dictionary mapping string keys to "
    "torch.utils.data.dataset.Dataset instances.  All keys that do not start "
    "with an underscore (_) will be processed.",
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--second-annotator",
    "-S",
    help="A dataset or dictionary, like in --dataset, with the same "
    "sample keys, but with annotations from a different annotator that is "
    "going to be compared to the one in --dataset.  The same rules regarding "
    "dataset naming conventions apply.",
    required=False,
    default=None,
    cls=ResourceOption,
    show_default=True,
)
@click.option(
    "--overlayed",
    "-O",
    help="Creates overlayed representations of the output probability maps, "
    "similar to --overlayed in prediction-mode, except it includes "
    "distinctive colours for true and false positives and false negatives.  "
    "If not set, or empty, overlayed images are **NOT** output.  "
    "Otherwise, the parameter represents the name of a folder where to "
    "store them",
    show_default=True,
    default=None,
    required=False,
    cls=ResourceOption,
)
@click.option(
    "--threshold",
    "-t",
    help="This number is used to separate positives from negatives in "
    "probability maps, and to report (a priori) F1-scores.  It "
    "should either come from the training set or a separate validation set "
    "to avoid biasing the analysis.  Optionally, if you provide a multi-set "
    "dataset as input, this may also be the name of an existing set from "
    "which the threshold will be estimated (highest F1-score) and then "
    "applied to the subsequent sets.  This number is also used to report "
    "the a priori F1-score on the test set",
    default=None,
    show_default=False,
    required=False,
    cls=ResourceOption,
)
@click.option(
    "--steps",
    "-s",  # lowercase, so it does not shadow -S of --second-annotator
    help="This number is used to define the number of threshold steps to "
    "consider when evaluating the highest possible F1-score on test data.",
    default=1000,
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--parallel",
    "-P",
    help="""Use multiprocessing for data processing: if set to -1 (default),
    disables multiprocessing.  Set to 0 to create as many data loading
    instances as there are processing cores available on the system.  Set to
    >= 1 to enable that many multiprocessing instances for data
    processing.""",
    type=click.IntRange(min=-1),
    show_default=True,
    required=True,
    default=-1,
    cls=ResourceOption,
)
@verbosity_option(logger=logger, cls=ResourceOption)
@click.pass_context
def evaluate(
    ctx,
    output_folder,
    predictions_folder,
    dataset,
    second_annotator,
    overlayed,
    threshold,
    steps,
    parallel,
    verbose,
    **kwargs,
):
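    """Evaluate model predictions against annotated ground-truth, optionally
    comparing them to the annotations of a second annotator."""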

    def _validate_threshold(t, dataset):
        """Validate the user threshold selection.

        Returns parsed threshold.
        """
        if t is None:
            return 0.5

        try:
            # try to convert it to a float first
            t = float(t)
        except ValueError:
            # it is a bit of text - assert a dataset with that name is available
            if not isinstance(dataset, dict):
                raise ValueError(
                    "Threshold should be a floating-point number "
                    "if you provide only a single dataset for evaluation"
                )
            if t not in dataset:
                raise ValueError(
                    f"Text thresholds should match dataset names, "
                    f"but {t} is not available among the datasets provided "
                    f"({', '.join(dataset.keys())})"
                )
        else:
            # the range check lives in the else-clause so its ValueError is
            # not mistaken for a failed float conversion above
            if t < 0.0 or t > 1.0:
                raise ValueError(
                    "Float thresholds must be within range [0.0, 1.0]"
                )

        return t
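
    # the threshold is either a float in [0.0, 1.0] or the name of one of
    # the datasets provided below, from which it will later be estimated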
    threshold = _validate_threshold(threshold, dataset)

    if not isinstance(dataset, dict):
        dataset = {"test": dataset}

    if second_annotator is None:
        second_annotator = {}
    elif not isinstance(second_annotator, dict):
        second_annotator = {"test": second_annotator}
    # else, second_annotator must be a dict

    if isinstance(threshold, str):
        # first run evaluation for reference dataset, do not save overlays
        logger.info(f"Evaluating threshold on '{threshold}' set")
        threshold = run(
            dataset[threshold], threshold, predictions_folder, steps=steps
        )
        logger.info(f"Set --threshold={threshold:.5f}")

    # clean-up the overlayed path
    if overlayed is not None:
        overlayed = overlayed.strip()

    # now run the evaluation on each dataset that should be analyzed
    for k, v in dataset.items():
        if k.startswith("_"):
            logger.info(f"Skipping dataset '{k}' (not to be evaluated)")
            continue
        logger.info(f"Analyzing '{k}' set...")
        run(
            v,
            k,
            predictions_folder,
            output_folder,
            overlayed,
            threshold,
            steps=steps,
            parallel=parallel,
        )
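
        # when annotations from a second annotator are available for this
        # set, compare them against the reference annotations as well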
        second = second_annotator.get(k)
        if second is not None:
            if not second.all_keys_match(v):
                logger.warning(
                    f"Key mismatch between `dataset[{k}]` and "
                    f"`second_annotator[{k}]` - skipping "
                    f"second-annotator comparisons for {k} subset"
                )
            else:
                compare_annotators(
                    v, second, k, output_folder, overlayed, parallel=parallel
                )