Coverage for src/deepdraw/script/evaluate.py: 85%

54 statements  

coverage.py v7.3.1, created at 2023-11-30 15:00 +0100

# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-License-Identifier: GPL-3.0-or-later

import click

from clapper.click import ConfigCommand, ResourceOption, verbosity_option
from clapper.logging import setup

logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")

from ..engine.evaluator import compare_annotators, run


@click.command(
    entry_point_group="deepdraw.config",
    cls=ConfigCommand,
    epilog="""Examples:

\b
 1. Runs evaluation on an existing dataset configuration:

    .. code:: sh

       $ deepdraw evaluate -vv drive --predictions-folder=path/to/predictions --output-folder=path/to/results

\b
 2. To run evaluation on a folder with your own images and annotations, you
    must first specify resizing, cropping, etc., so that the images can be
    correctly input to the model.  Failing to do so will likely result in
    poor performance.  To figure out such specifications, you must consult
    the dataset configuration used for **training** the provided model.
    Once you have figured this out, do the following:

    .. code:: sh

       $ deepdraw config copy csv-dataset-example mydataset.py
       # modify "mydataset.py" to your liking
       $ deepdraw evaluate -vv mydataset.py --predictions-folder=path/to/predictions --output-folder=path/to/results
""",
)
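# every option below uses clapper's ResourceOption, so its value may be set on
# the command line or come from configuration files/resources passed to this
# command (resolved through the "deepdraw.config" entry-point group)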

@click.option(
    "--output-folder",
    "-o",
    help="Path where to store the analysis result (created if it does not exist)",
    required=True,
    default="results",
    type=click.Path(),
    cls=ResourceOption,
)
@click.option(
    "--predictions-folder",
    "-p",
    help="Path where predictions are currently stored",
    required=True,
    type=click.Path(exists=True, file_okay=False, dir_okay=True),
    cls=ResourceOption,
)
@click.option(
    "--dataset",
    "-d",
    help="A torch.utils.data.dataset.Dataset instance implementing a dataset "
    "to be used for evaluation purposes, possibly including all pre-processing "
    "pipelines required or, optionally, a dictionary mapping string keys to "
    "torch.utils.data.dataset.Dataset instances.  All keys that do not start "
    "with an underscore (_) will be processed.",
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--second-annotator",
    "-S",
    help="A dataset or dictionary, like in --dataset, with the same "
    "sample keys, but with annotations from a different annotator that is "
    "going to be compared to the one in --dataset.  The same rules regarding "
    "dataset naming conventions apply.",
    required=False,
    default=None,
    cls=ResourceOption,
    show_default=True,
)
@click.option(
    "--overlayed",
    "-O",
    help="Creates overlayed representations of the output probability maps, "
    "similar to --overlayed in prediction-mode, except it includes "
    "distinctive colours for true positives, false positives and false "
    "negatives.  If not set, or set to an empty value, overlayed images are "
    "**NOT** produced.  Otherwise, this parameter names the folder where "
    "they will be stored.",
    show_default=True,
    default=None,
    required=False,
    cls=ResourceOption,
)
@click.option(
    "--threshold",
    "-t",
    help="This number is used to define positives and negatives from "
    "probability maps, and to report F1-scores (a priori).  It should "
    "either come from the training set or from a separate validation set, "
    "to avoid biasing the analysis.  Optionally, if you provide a multi-set "
    "dataset as input, this may also be the name of an existing set from "
    "which the threshold will be estimated (highest F1-score) and then "
    "applied to the subsequent sets.  This number is also used to report "
    "the a priori F1-score performance on the test set.",
    default=None,
    show_default=False,
    required=False,
    cls=ResourceOption,
)
@click.option(
    "--steps",
    "-S",
    help="This number is used to define the number of threshold steps to "
    "consider when evaluating the highest possible F1-score on test data.",
    default=1000,
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--parallel",
    "-P",
    help="""Use multiprocessing for data processing: if set to -1 (default),
    disables multiprocessing.  Set to 0 to enable as many data loading
    instances as there are processing cores available in the system.  Set to
    >= 1 to enable that many multiprocessing instances for data
    processing.""",
    type=click.IntRange(min=-1),
    show_default=True,
    required=True,
    default=-1,
    cls=ResourceOption,
)
@verbosity_option(logger=logger, cls=ResourceOption)
@click.pass_context
def evaluate(
    ctx,
    output_folder,
    predictions_folder,
    dataset,
    second_annotator,
    overlayed,
    threshold,
    steps,
    parallel,
    verbose,
    **kwargs,
):
    def _validate_threshold(t, dataset):
        """Validate the user threshold selection.

        Returns the parsed threshold.
        """
        if t is None:
            return 0.5

        try:
            # we try to convert it to float first
            t = float(t)
        except ValueError:
            # it is a bit of text - assert a dataset with that name is available
            if not isinstance(dataset, dict):
                raise ValueError(
                    "Threshold should be a floating-point number "
                    "if you provide only a single dataset for evaluation"
                )
            if t not in dataset:
                raise ValueError(
                    f"Text thresholds should match dataset names, "
                    f"but {t} is not available among the datasets provided "
                    f"({', '.join(dataset.keys())})"
                )
        else:
            if t < 0.0 or t > 1.0:
                raise ValueError(
                    "Float thresholds must be within range [0.0, 1.0]"
                )

        return t
180 

181 threshold = _validate_threshold(threshold, dataset) 

182 

183 if not isinstance(dataset, dict): 

184 dataset = {"test": dataset} 

185 

186 if second_annotator is None: 

187 second_annotator = {} 

188 elif not isinstance(second_annotator, dict): 

189 second_annotator = {"test": second_annotator} 

190 # else, second_annotator must be a dict 

191 

192 if isinstance(threshold, str): 

193 # first run evaluation for reference dataset, do not save overlays 

194 logger.info(f"Evaluating threshold on '{threshold}' set") 

195 threshold = run( 

196 dataset[threshold], threshold, predictions_folder, steps=steps 

197 ) 

198 logger.info(f"Set --threshold={threshold:.5f}") 

199 

200 # clean-up the overlayed path 

201 if overlayed is not None: 

202 overlayed = overlayed.strip() 

203 

204 # now run with the 

205 for k, v in dataset.items(): 

206 if k.startswith("_"): 

207 logger.info(f"Skipping dataset '{k}' (not to be evaluated)") 

208 continue 

209 logger.info(f"Analyzing '{k}' set...") 

210 run( 

211 v, 

212 k, 

213 predictions_folder, 

214 output_folder, 

215 overlayed, 

216 threshold, 

217 steps=steps, 

218 parallel=parallel, 

219 ) 

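        # if a second annotator was provided for this subset, compare its
        # annotations against the reference annotations in --dataset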
        second = second_annotator.get(k)
        if second is not None:
            if not second.all_keys_match(v):
                logger.warning(
                    f"Key mismatch between `dataset[{k}]` and "
                    f"`second_annotator[{k}]` - skipping "
                    f"second-annotator comparisons for {k} subset"
                )
            else:
                compare_annotators(
                    v, second, k, output_folder, overlayed, parallel=parallel
                )
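
For reference, the sketch below (a hypothetical, standalone helper, not part of the deepdraw API) condenses the set-selection rules implemented above: a single dataset is wrapped under the "test" key, keys starting with an underscore are skipped, and a textual --threshold must name one of the provided sets.

.. code:: python

   def select_sets(dataset, threshold):
       """Mimic the normalisation and filtering performed by ``evaluate``."""
       if not isinstance(dataset, dict):
           dataset = {"test": dataset}
       if isinstance(threshold, str) and threshold not in dataset:
           raise ValueError(f"'{threshold}' does not name a provided set")
       return [k for k in dataset if not k.startswith("_")]

   # only "train" and "test" are analyzed; "_extra" is skipped
   print(select_sets({"train": ..., "test": ..., "_extra": ...}, "train"))
   # a single dataset is analyzed under the "test" key
   print(select_sets(object(), 0.5))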