Coverage for src/bob/bio/base/script/pipeline_simple.py: 100%


#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Tiago de Freitas Pereira <tiago.pereira@idiap.ch>


"""Executes biometric pipeline"""

import logging

import click

from clapper.click import ConfigCommand, ResourceOption, verbosity_option

from bob.pipelines.distributed import VALID_DASK_CLIENT_STRINGS

logger = logging.getLogger(__name__)


EPILOG = """\b

Command line examples\n
-----------------------

$ bob bio pipeline simple -vv DATABASE PIPELINE

See the help of the CONFIG argument on top of this help message
for a list of available configurations.

It is possible to provide the database and the pipeline through a configuration file.
Generate an example configuration file with:

$ bob bio pipeline simple --dump-config my_experiment.py

and execute it with:

$ bob bio pipeline simple -vv my_experiment.py

my_experiment.py must contain the following elements:

   >>> transformer = ... # A scikit-learn pipeline wrapped with bob.pipelines' SampleWrapper\n
   >>> algorithm = ... # A BioAlgorithm\n
   >>> pipeline = PipelineSimple(transformer, algorithm)\n
   >>> database = ...  # A biometric database (class that implements the methods: `background_model_samples`, `references` and `probes`)
\b"""

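# For illustration, a minimal ``my_experiment.py`` could look like the sketch
# below. It is only a sketch: ``MyTransformer`` and ``MyAlgorithm`` are
# hypothetical placeholders, and wrapping through ``bob.pipelines.wrap`` is an
# assumption about how the SampleWrapper is usually applied.
#
#     from sklearn.pipeline import Pipeline
#
#     from bob.bio.base.pipelines import PipelineSimple
#     from bob.pipelines import wrap
#
#     # Wrap a scikit-learn pipeline so that it consumes and produces Samples
#     transformer = wrap(["sample"], Pipeline([("features", MyTransformer())]))
#     algorithm = MyAlgorithm()  # any BioAlgorithm implementing enroll() and score()
#     pipeline = PipelineSimple(transformer, algorithm)
#     database = ...  # a database implementing background_model_samples, references and probes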

@click.command(
    name="simple",
    entry_point_group="bob.bio.config",
    cls=ConfigCommand,
    epilog=EPILOG,
)
@click.option(
    "--pipeline",
    "-p",
    required=True,
    entry_point_group="bob.bio.pipeline",
    help="The simplest possible pipeline, composed of a scikit-learn Pipeline and a BioAlgorithm.",
    cls=ResourceOption,
)
@click.option(
    "--database",
    "-d",
    entry_point_group="bob.bio.database",
    required=True,
    help="Biometric database connector (class that implements the methods: `background_model_samples`, `references` and `probes`).",
    cls=ResourceOption,
)
@click.option(
    "--dask-client",
    "-l",
    entry_point_group="dask.client",
    string_exceptions=VALID_DASK_CLIENT_STRINGS,
    default="single-threaded",
    help="Dask client for the execution of the pipeline.",
    cls=ResourceOption,
)
@click.option(
    "--group",
    "-g",
    "groups",
    type=click.Choice(["dev", "eval"]),
    multiple=True,
    default=("dev",),
    help="If given, limits the experiment to the given protocol groups.",
    cls=ResourceOption,
)
@click.option(
    "--output",
    "-o",
    show_default=True,
    default="results",
    help="Name of the output directory where the scores will be saved.",
    cls=ResourceOption,
)
@click.option(
    "--write-metadata-scores/--write-column-scores",
    "-meta/-nmeta",
    default=True,
    help="If set, all the scores will be written with their metadata, using the `CSVScoreWriter`.",
    cls=ResourceOption,
)
@click.option(
    "--memory",
    "-m",
    is_flag=True,
    help="If set, runs the experiment keeping all objects in memory, with nothing checkpointed. If not set, checkpoints are saved in `--output`.",
    cls=ResourceOption,
)
@click.option(
    "--checkpoint-dir",
    "-c",
    show_default=True,
    default=None,
    help="Name of the output directory where the checkpoints will be saved. Unless `--memory` is set, checkpoints will be saved in this directory.",
    cls=ResourceOption,
)
@click.option(
    "--dask-partition-size",
    "-s",
    help="If using Dask, this option defines the maximum size of each dask.bag partition. "
    "Use this option if the heuristic that sets this value does not suit your experiment "
    "(https://docs.dask.org/en/latest/bag-api.html?highlight=partition_size#dask.bag.from_sequence).",
    default=None,
    type=click.INT,
    cls=ResourceOption,
)
@click.option(
    "--dask-n-partitions",
    "-n",
    help="If using Dask, this option defines a fixed number of dask.bag partitions for "
    "each set of data. Use this option if the heuristic that sets this value "
    "does not suit your experiment "
    "(https://docs.dask.org/en/latest/bag-api.html?highlight=partition_size#dask.bag.from_sequence).",
    default=None,
    type=click.INT,
    cls=ResourceOption,
)
@click.option(
    "--dask-n-workers",
    "-w",
    help="If using Dask, this option defines the number of workers to start the experiment with. "
    "Dask automatically scales the number of workers up and down, based on the current load of tasks. "
    "Use this option if the initial number of workers does not suit you.",
    default=None,
    type=click.INT,
    cls=ResourceOption,
)
@click.option(
    "--force",
    "-f",
    is_flag=True,
    help="If set, regenerates all the checkpoints of the experiment. This option has no effect if `--memory` is set.",
    cls=ResourceOption,
)
@click.option(
    "--no-dask",
    is_flag=True,
    help="If set, does not use Dask to run the experiment.",
    cls=ResourceOption,
)
@verbosity_option(cls=ResourceOption, logger=logger)
def pipeline_simple(
    pipeline,
    database,
    dask_client,
    groups,
    output,
    write_metadata_scores,
    memory,
    checkpoint_dir,
    dask_partition_size,
    dask_n_workers,
    dask_n_partitions,
    force,
    no_dask,
    **kwargs,
):
    """Runs the simplest biometrics pipeline.

    Such a pipeline consists of two major components.
    The first component is a scikit-learn `Pipeline`,
    where a sequence of transformations of the input data
    is defined.
    The second component is a `BioAlgorithm` that defines the primitives
    `enroll` and `score`.

    With those two components, any biometric experiment can be run.
    A biometric experiment consists of three sub-pipelines,
    defined below:

    Sub-pipeline 1:\n
    ---------------

    Training of the background model.
    Some biometric algorithms demand the training of a background model, for instance a neural network.

    \b
    This pipeline runs: `Pipeline.fit(DATA_FOR_FIT)`

    \b

    Sub-pipeline 2:\n
    ---------------

    Creation of biometric references: This is a standard step in biometric pipelines.
    Given a set of samples of one identity, create a biometric reference (a.k.a. template) for each identity.

    \b
    raw_data --> preprocessing >> feature extraction >> enroll(background_model) --> biometric_reference

    This pipeline runs: `BioAlgorithm.enroll(Pipeline.transform(DATA_ENROLL))` >> biometric_references

    Sub-pipeline 3:\n
    ---------------

    Probing: This is another standard step in biometric pipelines.
    Given one sample and one biometric reference, compute a score.
    Such a score has different meanings, depending on the scoring method your biometric algorithm uses.
    Explaining what scoring means for each biometric algorithm is out of the scope of this help message.

    This pipeline runs: `BioAlgorithm.score(Pipeline.transform(DATA_SCORE), biometric_references)` >> scores
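
    Schematically, mirroring the enrollment flow above:

    \b
    probe_data --> preprocessing >> feature extraction >> score(biometric_reference) --> score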

    .. note::
        Refrain from calling this function directly from a script. Prefer
        :py:func:`bob.bio.base.pipelines.execute_pipeline_simple`
        instead.


    Using Dask
    ----------

    This pipeline is intended to work with Dask to split the workload between
    processes on a machine or workers on a distributed grid system. By default, the
    local machine is used in single-threaded mode. However, a different Dask client
    can be selected with the `--dask-client` option.

    When using multiple workers, a few things have to be considered:
    - The number of partitions in the data.
    - The number of workers to process the data.

    Ideally (and this is the default behavior), you want to split all the data between
    the available workers, with all the workers working at the same time on all the data.
    But the number of workers may be limited, or one partition of data may fill
    the memory of a worker. Moreover, having many small tasks (by splitting the data
    into many partitions) is not recommended, as the scheduler will then spend more time
    organizing and communicating with the workers.

    To solve speed or memory issues, options are available to split the data
    differently (`--dask-n-partitions` or `--dask-partition-size`). If you encounter
    memory issues on a worker, try increasing the number of partitions; if your
    scheduler is not keeping up, try reducing that number.
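
    For example (an illustrative invocation; DATABASE, PIPELINE and DASK-CLIENT
    stand in for actual resource names):

    \b
    $ bob bio pipeline simple -vv -d DATABASE -p PIPELINE -l DASK-CLIENT --dask-n-partitions 100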

    """
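    # For programmatic use, call execute_pipeline_simple directly instead of
    # this click command. A minimal sketch (assuming `my_pipeline` and
    # `my_database` are built as described in the epilog; the keyword
    # arguments mirror the call at the end of this function):
    #
    #     from bob.bio.base.pipelines import execute_pipeline_simple
    #
    #     execute_pipeline_simple(
    #         pipeline=my_pipeline,
    #         database=my_database,
    #         dask_client=None,  # run locally, without Dask
    #         groups=("dev",),
    #         output="results",
    #         write_metadata_scores=True,
    #         checkpoint=True,
    #         dask_partition_size=None,
    #         dask_n_partitions=None,
    #         dask_n_workers=None,
    #         checkpoint_dir=None,
    #         force=False,
    #     )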

    from bob.bio.base.pipelines import execute_pipeline_simple

    if no_dask:
        dask_client = None

    checkpoint = not memory

    logger.debug("Executing PipelineSimple with:")
    logger.debug(f"pipeline: {pipeline}")
    logger.debug(f"  transformer: {pipeline.transformer}")
    logger.debug(f"  biometric_algorithm: {pipeline.biometric_algorithm}")
    logger.debug(f"database: {database}")

    execute_pipeline_simple(
        pipeline=pipeline,
        database=database,
        dask_client=dask_client,
        groups=groups,
        output=output,
        write_metadata_scores=write_metadata_scores,
        checkpoint=checkpoint,
        dask_partition_size=dask_partition_size,
        dask_n_partitions=dask_n_partitions,
        dask_n_workers=dask_n_workers,
        checkpoint_dir=checkpoint_dir,
        force=force,
    )

    logger.info("Experiment finished!")
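

# Optional: allow running this module directly. This guard is an addition, not
# part of the packaged CLI, which registers the command through an entry point.
if __name__ == "__main__":
    pipeline_simple()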