Coverage for src/deepdraw/script/evaluate.py: 85%

54 statements  

coverage.py v7.3.1, created at 2023-11-30 15:00 +0100

# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-License-Identifier: GPL-3.0-or-later

import click

from clapper.click import ConfigCommand, ResourceOption, verbosity_option
from clapper.logging import setup

logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")

from ..engine.evaluator import compare_annotators, run


@click.command(
    entry_point_group="deepdraw.config",
    cls=ConfigCommand,
    epilog="""Examples:

\b
 1. Runs evaluation on an existing dataset configuration:

    .. code:: sh

       $ deepdraw evaluate -vv drive --predictions-folder=path/to/predictions --output-folder=path/to/results

\b
 2. To run evaluation on a folder with your own images and annotations, you
    must first specify resizing, cropping, etc., so that the images can be
    correctly input to the model.  Failing to do so will likely result in
    poor performance.  To figure out such specifications, you must consult
    the dataset configuration used for **training** the provided model.
    Once you have figured this out, do the following:

    .. code:: sh

       $ deepdraw config copy csv-dataset-example mydataset.py
       # modify "mydataset.py" to your liking
       $ deepdraw evaluate -vv mydataset.py --predictions-folder=path/to/predictions --output-folder=path/to/results
""",
)
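# every option below uses clapper's ResourceOption, so its value may be set on
# the command line or come from configuration files/resources passed to this
# command (resolved through the "deepdraw.config" entry-point group)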

@click.option(
    "--output-folder",
    "-o",
    help="Path where to store the analysis result (created if it does not exist)",
    required=True,
    default="results",
    type=click.Path(),
    cls=ResourceOption,
)
@click.option(
    "--predictions-folder",
    "-p",
    help="Path where predictions are currently stored",
    required=True,
    type=click.Path(exists=True, file_okay=False, dir_okay=True),
    cls=ResourceOption,
)
@click.option(
    "--dataset",
    "-d",
    help="A torch.utils.data.dataset.Dataset instance implementing a dataset "
    "to be used for evaluation purposes, possibly including all pre-processing "
    "pipelines required or, optionally, a dictionary mapping string keys to "
    "torch.utils.data.dataset.Dataset instances.  All keys that do not start "
    "with an underscore (_) will be processed.",
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--second-annotator",
    "-S",
    help="A dataset or dictionary, like in --dataset, with the same "
    "sample keys, but with annotations from a different annotator that is "
    "going to be compared to the one in --dataset.  The same rules regarding "
    "dataset naming conventions apply.",
    required=False,
    default=None,
    cls=ResourceOption,
    show_default=True,
)
@click.option(
    "--overlayed",
    "-O",
    help="Creates overlayed representations of the output probability maps, "
    "similar to --overlayed in prediction-mode, except it includes "
    "distinctive colours for true positives, false positives and false "
    "negatives.  If not set, or set to an empty value, overlayed images are "
    "**NOT** produced.  Otherwise, this parameter names the folder where "
    "they will be stored.",
    show_default=True,
    default=None,
    required=False,
    cls=ResourceOption,
)
@click.option(
    "--threshold",
    "-t",
    help="This number is used to define positives and negatives from "
    "probability maps, and to report F1-scores (a priori).  It should "
    "either come from the training set or from a separate validation set, "
    "to avoid biasing the analysis.  Optionally, if you provide a multi-set "
    "dataset as input, this may also be the name of an existing set from "
    "which the threshold will be estimated (highest F1-score) and then "
    "applied to the subsequent sets.  This number is also used to report "
    "the a priori F1-score performance on the test set.",
    default=None,
    show_default=False,
    required=False,
    cls=ResourceOption,
)
@click.option(
    "--steps",
    "-S",
    help="This number is used to define the number of threshold steps to "
    "consider when evaluating the highest possible F1-score on test data.",
    default=1000,
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--parallel",
    "-P",
    help="""Use multiprocessing for data processing: if set to -1 (default),
    disables multiprocessing.  Set to 0 to enable as many data loading
    instances as there are processing cores available in the system.  Set to
    >= 1 to enable that many multiprocessing instances for data
    processing.""",
    type=click.IntRange(min=-1),
    show_default=True,
    required=True,
    default=-1,
    cls=ResourceOption,
)
@verbosity_option(logger=logger, cls=ResourceOption)
@click.pass_context
def evaluate(
    ctx,
    output_folder,
    predictions_folder,
    dataset,
    second_annotator,
    overlayed,
    threshold,
    steps,
    parallel,
    verbose,
    **kwargs,
):
    def _validate_threshold(t, dataset):
        """Validate the user threshold selection.

        Returns the parsed threshold.
        """
        if t is None:
            return 0.5

        try:
            # we try to convert it to float first
            t = float(t)
        except ValueError:
            # it is a bit of text - assert a dataset with that name is available
            if not isinstance(dataset, dict):
                raise ValueError(
                    "Threshold should be a floating-point number "
                    "if you provide only a single dataset for evaluation"
                )
            if t not in dataset:
                raise ValueError(
                    f"Text thresholds should match dataset names, "
                    f"but {t} is not available among the datasets provided "
                    f"({', '.join(dataset.keys())})"
                )
        else:
            if t < 0.0 or t > 1.0:
                raise ValueError(
                    "Float thresholds must be within range [0.0, 1.0]"
                )

        return t
180 

181 threshold = _validate_threshold(threshold, dataset) 

182 

183 if not isinstance(dataset, dict): 

184 dataset = {"test": dataset} 

185 

186 if second_annotator is None: 

187 second_annotator = {} 

188 elif not isinstance(second_annotator, dict): 

189 second_annotator = {"test": second_annotator} 

190 # else, second_annotator must be a dict 

191 

192 if isinstance(threshold, str): 

193 # first run evaluation for reference dataset, do not save overlays 

194 logger.info(f"Evaluating threshold on '{threshold}' set") 

195 threshold = run( 

196 dataset[threshold], threshold, predictions_folder, steps=steps 

197 ) 

198 logger.info(f"Set --threshold={threshold:.5f}") 

199 

200 # clean-up the overlayed path 

201 if overlayed is not None: 

202 overlayed = overlayed.strip() 

203 

204 # now run with the 

205 for k, v in dataset.items(): 

206 if k.startswith("_"): 

207 logger.info(f"Skipping dataset '{k}' (not to be evaluated)") 

208 continue 

209 logger.info(f"Analyzing '{k}' set...") 

210 run( 

211 v, 

212 k, 

213 predictions_folder, 

214 output_folder, 

215 overlayed, 

216 threshold, 

217 steps=steps, 

218 parallel=parallel, 

219 ) 

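        # if a second annotator was provided for this subset, compare its
        # annotations against the reference annotations in --dataset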
        second = second_annotator.get(k)
        if second is not None:
            if not second.all_keys_match(v):
                logger.warning(
                    f"Key mismatch between `dataset[{k}]` and "
                    f"`second_annotator[{k}]` - skipping "
                    f"second-annotator comparisons for {k} subset"
                )
            else:
                compare_annotators(
                    v, second, k, output_folder, overlayed, parallel=parallel
                )
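
For reference, the sketch below (a hypothetical, standalone helper, not part of the deepdraw API) condenses the set-selection rules implemented above: a single dataset is wrapped under the "test" key, keys starting with an underscore are skipped, and a textual --threshold must name one of the provided sets.

.. code:: python

   def select_sets(dataset, threshold):
       """Mimic the normalisation and filtering performed by ``evaluate``."""
       if not isinstance(dataset, dict):
           dataset = {"test": dataset}
       if isinstance(threshold, str) and threshold not in dataset:
           raise ValueError(f"'{threshold}' does not name a provided set")
       return [k for k in dataset if not k.startswith("_")]

   # only "train" and "test" are analyzed; "_extra" is skipped
   print(select_sets({"train": ..., "test": ..., "_extra": ...}, "train"))
   # a single dataset is analyzed under the "test" key
   print(select_sets(object(), 0.5))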