Coverage for src/bob/bio/base/script/pipeline_simple.py: 100%
33 statements
coverage.py v7.6.0, created at 2024-07-12 22:15 +0200
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Tiago de Freitas Pereira <tiago.pereira@idiap.ch>


"""Executes biometric pipeline"""

import logging

import click

from clapper.click import ConfigCommand, ResourceOption, verbosity_option

from bob.pipelines.distributed import VALID_DASK_CLIENT_STRINGS

logger = logging.getLogger(__name__)

EPILOG = """\b

Command line examples\n
-----------------------

$ bob bio pipeline simple -vv DATABASE PIPELINE

See the help of the CONFIG argument on top of this help message
for a list of available configurations.

It is possible to provide database and pipeline through a configuration file.
Generate an example configuration file with:

$ bob bio pipeline simple --dump-config my_experiment.py

and execute it with:

$ bob bio pipeline simple -vv my_experiment.py

my_experiment.py must contain the following elements:

 >>> transformer = ... # A scikit-learn pipeline wrapped with bob.pipelines' SampleWrapper\n
 >>> algorithm = ... # A BioAlgorithm\n
 >>> pipeline = PipelineSimple(transformer, algorithm)\n
 >>> database = ... # Biometric Database (a class that implements the methods: `background_model_samples`, `references` and `probes`)

\b"""
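
# A minimal sketch of what ``my_experiment.py`` could look like. The concrete
# classes used below (MyPreprocessor, MyExtractor, MyBioAlgorithm, MyDatabase)
# are placeholders for illustration only and are not part of this package:
#
#   from sklearn.pipeline import make_pipeline
#
#   from bob.bio.base.pipelines import PipelineSimple
#   from bob.pipelines import wrap
#
#   # wrap the scikit-learn estimators so they consume bob.pipelines Samples
#   transformer = wrap(["sample"], make_pipeline(MyPreprocessor(), MyExtractor()))
#   algorithm = MyBioAlgorithm()  # any object implementing `enroll` and `score`
#   pipeline = PipelineSimple(transformer, algorithm)
#   database = MyDatabase(protocol="default")  # implements `background_model_samples`, `references` and `probes`

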
@click.command(
    name="simple",
    entry_point_group="bob.bio.config",
    cls=ConfigCommand,
    epilog=EPILOG,
)
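# Because this command uses ConfigCommand and every option below is a
# ResourceOption, the options can also be provided through a configuration
# file or a registered entry point instead of the command line (see EPILOG).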
@click.option(
    "--pipeline",
    "-p",
    required=True,
    entry_point_group="bob.bio.pipeline",
    help="The simplest pipeline possible composed of a scikit-learn Pipeline and a BioAlgorithm",
    cls=ResourceOption,
)
@click.option(
    "--database",
    "-d",
    entry_point_group="bob.bio.database",
    required=True,
    help="Biometric Database connector (class that implements the methods: `background_model_samples`, `references` and `probes`)",
    cls=ResourceOption,
)
@click.option(
    "--dask-client",
    "-l",
    entry_point_group="dask.client",
    string_exceptions=VALID_DASK_CLIENT_STRINGS,
    default="single-threaded",
    help="Dask client for the execution of the pipeline.",
    cls=ResourceOption,
)
@click.option(
    "--group",
    "-g",
    "groups",
    type=click.Choice(["dev", "eval"]),
    multiple=True,
    default=("dev",),
    help="If given, this value will limit the experiment to a particular protocol group (`dev` and/or `eval`).",
    cls=ResourceOption,
)
@click.option(
    "--output",
    "-o",
    show_default=True,
    default="results",
    help="Name of output directory where output scores will be saved.",
    cls=ResourceOption,
)
@click.option(
    "--write-metadata-scores/--write-column-scores",
    "-meta/-nmeta",
    default=True,
    help="If set, all the scores will be written with all their metadata using the `CSVScoreWriter`",
    cls=ResourceOption,
)
@click.option(
    "--memory",
    "-m",
    is_flag=True,
    help="If set, it will run the experiment keeping all objects in memory with nothing checkpointed. If not set, checkpoints will be saved in `--output`.",
    cls=ResourceOption,
)
@click.option(
    "--checkpoint-dir",
    "-c",
    show_default=True,
    default=None,
    help="Name of output directory where the checkpoints will be saved. If not given, checkpoints are saved in the `--output` directory.",
    cls=ResourceOption,
)
@click.option(
    "--dask-partition-size",
    "-s",
    help="If using Dask, this option defines the maximum size of each dask.bag partition. "
    "Use this option if the current heuristic that sets this value doesn't suit your experiment. "
    "(https://docs.dask.org/en/latest/bag-api.html?highlight=partition_size#dask.bag.from_sequence).",
    default=None,
    type=click.INT,
    cls=ResourceOption,
)
@click.option(
    "--dask-n-partitions",
    "-n",
    help="If using Dask, this option defines a fixed number of dask.bag partitions for "
    "each set of data. Use this option if the current heuristic that sets this value "
    "doesn't suit your experiment. "
    "(https://docs.dask.org/en/latest/bag-api.html?highlight=partition_size#dask.bag.from_sequence).",
    default=None,
    type=click.INT,
    cls=ResourceOption,
)
@click.option(
    "--dask-n-workers",
    "-w",
    help="If using Dask, this option defines the number of workers to start your experiment with. "
    "Dask automatically scales the number of workers up and down according to the current load of tasks to be solved. "
    "Use this option if the number of workers an experiment starts with doesn't suit you.",
    default=None,
    type=click.INT,
    cls=ResourceOption,
)
@click.option(
    "--force",
    "-f",
    is_flag=True,
    help="If set, it will force the regeneration of all the checkpoints of an experiment. This option has no effect if `--memory` is set.",
    cls=ResourceOption,
)
@click.option(
    "--no-dask",
    is_flag=True,
    help="If set, it will not use Dask to run the experiment.",
    cls=ResourceOption,
)
@verbosity_option(cls=ResourceOption, logger=logger)
def pipeline_simple(
    pipeline,
    database,
    dask_client,
    groups,
    output,
    write_metadata_scores,
    memory,
    checkpoint_dir,
    dask_partition_size,
    dask_n_workers,
    dask_n_partitions,
    force,
    no_dask,
    **kwargs,
):
179 """Runs the simplest biometrics pipeline.
181 Such pipeline consists into two major components.
182 The first component consists of a scikit-learn `Pipeline`,
183 where a sequence of transformations of the input data
184 is defined.
185 The second component is a `BioAlgorithm` that defines the primitives
186 `enroll` and `score`
188 With those two components any Biometric Experiment can be done.
189 A Biometric experiment consists of three sub-pipelines and
190 they are defined below:

    Sub-pipeline 1:\n
    ---------------

    Training background model.
    Some biometric algorithms demand the training of a background model, for instance a neural network.

    \b
    This pipeline runs: `Pipeline.fit(DATA_FOR_FIT)`


    \b

    Sub-pipeline 2:\n
    ---------------

    Creation of biometric references: This is a standard step in biometric pipelines.
    Given a set of samples of one identity, create a biometric reference (a.k.a. template) for that identity.

    \b
    raw_data --> preprocessing >> feature extraction >> enroll(background_model) --> biometric_reference

    This pipeline runs: `BioAlgorithm.enroll(Pipeline.transform(DATA_ENROLL))` >> biometric_references

    Sub-pipeline 3:\n
    ---------------

    Probing: This is another standard step in biometric pipelines.
    Given one sample and one biometric reference, computes a score.
    Such a score has different meanings depending on the scoring method your biometric algorithm uses.
    Explaining what scoring means for the different biometric algorithms is out of the scope of this help message.

    This pipeline runs: `BioAlgorithm.score(Pipeline.transform(DATA_SCORE, biometric_references))` >> scores

    .. Note::
        Refrain from calling this function directly from a script. Prefer
        :py:func:`bob.bio.base.pipelines.execute_pipeline_simple`
        instead.

    Using Dask
    ----------

    This pipeline is intended to work with Dask to split the workload between
    processes on a machine or workers on a distributed grid system. By default, the
    local machine is used in single-threaded mode. However, a Dask client can be
    specified with the `--dask-client` option.

    When using multiple workers, a few things have to be considered:

    - The number of partitions in the data.
    - The number of workers to process the data.

    Ideally (and this is the default behavior), you want to split all the data between
    the available workers, with all the workers working at the same time on all the data.
    But the number of workers may be limited, or one partition of data may be filling
    the memory of one worker. Moreover, having many small tasks (by splitting the data
    into many partitions) is not recommended, as the scheduler will then spend more time
    organizing and communicating with the workers.

    To solve speed or memory issues, options are available to split the data
    differently (`--dask-n-partitions` or `--dask-partition-size`). If you encounter
    memory issues on a worker, try increasing the number of partitions, and if your
    scheduler is not keeping up, try reducing that number.
    """
    from bob.bio.base.pipelines import execute_pipeline_simple
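
    # `--no-dask` discards any configured Dask client, so the experiment runs
    # without Dask.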
    if no_dask:
        dask_client = None
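
    # Checkpointing is enabled by default; `--memory` keeps everything in
    # memory and writes no checkpoints.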
    checkpoint = not memory

    logger.debug("Executing PipelineSimple with:")
    logger.debug(f"pipeline: {pipeline}")
    logger.debug(f" transformer: {pipeline.transformer}")
    logger.debug(f" biometric_algorithm: {pipeline.biometric_algorithm}")
    logger.debug(f"database: {database}")
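
    # The actual work (training, enrollment, scoring and score writing) is
    # delegated to the library-level executor; this command only maps the CLI
    # options onto its arguments.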
    execute_pipeline_simple(
        pipeline=pipeline,
        database=database,
        dask_client=dask_client,
        groups=groups,
        output=output,
        write_metadata_scores=write_metadata_scores,
        checkpoint=checkpoint,
        dask_partition_size=dask_partition_size,
        dask_n_partitions=dask_n_partitions,
        dask_n_workers=dask_n_workers,
        checkpoint_dir=checkpoint_dir,
        force=force,
    )

    logger.info("Experiment finished!")