Coverage for src/bob/bio/base/script/pipeline_simple.py: 100%
33 statements
coverage.py v7.6.0, created at 2024-07-12 22:15 +0200
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Tiago de Freitas Pereira <tiago.pereira@idiap.ch>


"""Executes biometric pipeline"""

import logging

import click

from clapper.click import ConfigCommand, ResourceOption, verbosity_option

from bob.pipelines.distributed import VALID_DASK_CLIENT_STRINGS

logger = logging.getLogger(__name__)

EPILOG = """\b

Command line examples\n
-----------------------

$ bob bio pipeline simple -vv DATABASE PIPELINE

See the help of the CONFIG argument on top of this help message
for a list of available configurations.

It is possible to provide database and pipeline through a configuration file.
Generate an example configuration file with:

$ bob bio pipeline simple --dump-config my_experiment.py

and execute it with:

$ bob bio pipeline simple -vv my_experiment.py

my_experiment.py must contain the following elements:

 >>> transformer = ... # A scikit-learn pipeline wrapped with bob.pipelines' SampleWrapper\n
 >>> algorithm = ... # A BioAlgorithm\n
 >>> pipeline = PipelineSimple(transformer, algorithm)\n
 >>> database = ... # Biometric Database (a class that implements the methods: `background_model_samples`, `references` and `probes`)

\b"""
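
# A minimal sketch of what ``my_experiment.py`` could look like. The concrete
# classes used below (MyPreprocessor, MyExtractor, MyBioAlgorithm, MyDatabase)
# are placeholders for illustration only and are not part of this package:
#
#   from sklearn.pipeline import make_pipeline
#
#   from bob.bio.base.pipelines import PipelineSimple
#   from bob.pipelines import wrap
#
#   # wrap the scikit-learn estimators so they consume bob.pipelines Samples
#   transformer = wrap(["sample"], make_pipeline(MyPreprocessor(), MyExtractor()))
#   algorithm = MyBioAlgorithm()  # any object implementing `enroll` and `score`
#   pipeline = PipelineSimple(transformer, algorithm)
#   database = MyDatabase(protocol="default")  # implements `background_model_samples`, `references` and `probes`

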
@click.command(
    name="simple",
    entry_point_group="bob.bio.config",
    cls=ConfigCommand,
    epilog=EPILOG,
)
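# Because this command uses ConfigCommand and every option below is a
# ResourceOption, the options can also be provided through a configuration
# file or a registered entry point instead of the command line (see EPILOG).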
@click.option(
    "--pipeline",
    "-p",
    required=True,
    entry_point_group="bob.bio.pipeline",
    help="The simplest pipeline possible composed of a scikit-learn Pipeline and a BioAlgorithm",
    cls=ResourceOption,
)
@click.option(
    "--database",
    "-d",
    entry_point_group="bob.bio.database",
    required=True,
    help="Biometric Database connector (class that implements the methods: `background_model_samples`, `references` and `probes`)",
    cls=ResourceOption,
)
@click.option(
    "--dask-client",
    "-l",
    entry_point_group="dask.client",
    string_exceptions=VALID_DASK_CLIENT_STRINGS,
    default="single-threaded",
    help="Dask client for the execution of the pipeline.",
    cls=ResourceOption,
)
@click.option(
    "--group",
    "-g",
    "groups",
    type=click.Choice(["dev", "eval"]),
    multiple=True,
    default=("dev",),
    help="If given, this value will limit the experiment to a particular protocol group (`dev` and/or `eval`).",
    cls=ResourceOption,
)
@click.option(
    "--output",
    "-o",
    show_default=True,
    default="results",
    help="Name of output directory where output scores will be saved.",
    cls=ResourceOption,
)
@click.option(
    "--write-metadata-scores/--write-column-scores",
    "-meta/-nmeta",
    default=True,
    help="If set, all the scores will be written with all their metadata using the `CSVScoreWriter`",
    cls=ResourceOption,
)
@click.option(
    "--memory",
    "-m",
    is_flag=True,
    help="If set, it will run the experiment keeping all objects in memory with nothing checkpointed. If not set, checkpoints will be saved in `--output`.",
    cls=ResourceOption,
)
@click.option(
    "--checkpoint-dir",
    "-c",
    show_default=True,
    default=None,
    help="Name of output directory where the checkpoints will be saved. If not given, checkpoints are saved in the `--output` directory.",
    cls=ResourceOption,
)
@click.option(
    "--dask-partition-size",
    "-s",
    help="If using Dask, this option defines the maximum size of each dask.bag partition. "
    "Use this option if the current heuristic that sets this value doesn't suit your experiment. "
    "(https://docs.dask.org/en/latest/bag-api.html?highlight=partition_size#dask.bag.from_sequence).",
    default=None,
    type=click.INT,
    cls=ResourceOption,
)
@click.option(
    "--dask-n-partitions",
    "-n",
    help="If using Dask, this option defines a fixed number of dask.bag partitions for "
    "each set of data. Use this option if the current heuristic that sets this value "
    "doesn't suit your experiment. "
    "(https://docs.dask.org/en/latest/bag-api.html?highlight=partition_size#dask.bag.from_sequence).",
    default=None,
    type=click.INT,
    cls=ResourceOption,
)
@click.option(
    "--dask-n-workers",
    "-w",
    help="If using Dask, this option defines the number of workers to start your experiment with. "
    "Dask automatically scales the number of workers up and down according to the current load of tasks to be solved. "
    "Use this option if the number of workers an experiment starts with doesn't suit you.",
    default=None,
    type=click.INT,
    cls=ResourceOption,
)
@click.option(
    "--force",
    "-f",
    is_flag=True,
    help="If set, it will force the regeneration of all the checkpoints of an experiment. This option has no effect if `--memory` is set.",
    cls=ResourceOption,
)
@click.option(
    "--no-dask",
    is_flag=True,
    help="If set, it will not use Dask to run the experiment.",
    cls=ResourceOption,
)
@verbosity_option(cls=ResourceOption, logger=logger)
def pipeline_simple(
    pipeline,
    database,
    dask_client,
    groups,
    output,
    write_metadata_scores,
    memory,
    checkpoint_dir,
    dask_partition_size,
    dask_n_workers,
    dask_n_partitions,
    force,
    no_dask,
    **kwargs,
):
179 """Runs the simplest biometrics pipeline.
181 Such pipeline consists into two major components.
182 The first component consists of a scikit-learn `Pipeline`,
183 where a sequence of transformations of the input data
184 is defined.
185 The second component is a `BioAlgorithm` that defines the primitives
186 `enroll` and `score`
188 With those two components any Biometric Experiment can be done.
189 A Biometric experiment consists of three sub-pipelines and
190 they are defined below:

    Sub-pipeline 1:\n
    ---------------

    Training background model.
    Some biometric algorithms demand the training of a background model, for instance a neural network.

    \b
    This pipeline runs: `Pipeline.fit(DATA_FOR_FIT)`


    \b

    Sub-pipeline 2:\n
    ---------------

    Creation of biometric references: This is a standard step in biometric pipelines.
    Given a set of samples of one identity, create a biometric reference (a.k.a. template) for that identity.

    \b
    raw_data --> preprocessing >> feature extraction >> enroll(background_model) --> biometric_reference

    This pipeline runs: `BioAlgorithm.enroll(Pipeline.transform(DATA_ENROLL))` >> biometric_references

    Sub-pipeline 3:\n
    ---------------

    Probing: This is another standard step in biometric pipelines.
    Given one sample and one biometric reference, computes a score.
    Such a score has different meanings depending on the scoring method your biometric algorithm uses.
    Explaining what scoring means for the different biometric algorithms is out of the scope of this help message.

    This pipeline runs: `BioAlgorithm.score(Pipeline.transform(DATA_SCORE, biometric_references))` >> scores

    .. Note::
        Refrain from calling this function directly from a script. Prefer
        :py:func:`bob.bio.base.pipelines.execute_pipeline_simple`
        instead.

    Using Dask
    ----------

    This pipeline is intended to work with Dask to split the workload between
    processes on a machine or workers on a distributed grid system. By default, the
    local machine is used in single-threaded mode. However, a Dask client can be
    specified with the `--dask-client` option.

    When using multiple workers, a few things have to be considered:

    - The number of partitions in the data.
    - The number of workers to process the data.

    Ideally (and this is the default behavior), you want to split all the data between
    the available workers, with all the workers working at the same time on all the data.
    But the number of workers may be limited, or one partition of data may be filling
    the memory of one worker. Moreover, having many small tasks (by splitting the data
    into many partitions) is not recommended, as the scheduler will then spend more time
    organizing and communicating with the workers.

    To solve speed or memory issues, options are available to split the data
    differently (`--dask-n-partitions` or `--dask-partition-size`). If you encounter
    memory issues on a worker, try increasing the number of partitions, and if your
    scheduler is not keeping up, try reducing that number.
    """
    from bob.bio.base.pipelines import execute_pipeline_simple
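
    # `--no-dask` discards any configured Dask client, so the experiment runs
    # without Dask.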
    if no_dask:
        dask_client = None
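
    # Checkpointing is enabled by default; `--memory` keeps everything in
    # memory and writes no checkpoints.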
    checkpoint = not memory

    logger.debug("Executing PipelineSimple with:")
    logger.debug(f"pipeline: {pipeline}")
    logger.debug(f" transformer: {pipeline.transformer}")
    logger.debug(f" biometric_algorithm: {pipeline.biometric_algorithm}")
    logger.debug(f"database: {database}")
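
    # The actual work (training, enrollment, scoring and score writing) is
    # delegated to the library-level executor; this command only maps the CLI
    # options onto its arguments.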
    execute_pipeline_simple(
        pipeline=pipeline,
        database=database,
        dask_client=dask_client,
        groups=groups,
        output=output,
        write_metadata_scores=write_metadata_scores,
        checkpoint=checkpoint,
        dask_partition_size=dask_partition_size,
        dask_n_partitions=dask_n_partitions,
        dask_n_workers=dask_n_workers,
        checkpoint_dir=checkpoint_dir,
        force=force,
    )

    logger.info("Experiment finished!")