Coverage for src/bob/bio/base/script/pipeline_train.py: 100% (31 statements)
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Tiago de Freitas Pereira <tiago.pereira@idiap.ch>

"""Executes only the training part of a biometric pipeline"""

import logging

import click

from clapper.click import ConfigCommand, ResourceOption, verbosity_option

from bob.pipelines.distributed import VALID_DASK_CLIENT_STRINGS

logger = logging.getLogger(__name__)
EPILOG = """\b

Command line examples\n
-----------------------

$ bob bio pipeline train -vv DATABASE PIPELINE

See the help of the CONFIG argument on top of this help message
for a list of available configurations.

It is possible to provide the database and the pipeline through a configuration file.
Generate an example configuration file with:

$ bob bio pipeline train --dump-config my_experiment.py

and execute it with:

$ bob bio pipeline train -vv my_experiment.py

my_experiment.py must contain the following elements:

 >>> pipeline = ...  # A scikit-learn pipeline wrapped with bob.pipelines' SampleWrapper\n
 >>> database = ...  # Biometric database (a class that implements the methods: `background_model_samples`, `references` and `probes`)
\b"""
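
# A minimal sketch of what ``my_experiment.py`` could contain (illustrative
# only: the scikit-learn estimator and the ``bob.pipelines.wrap`` call are
# assumptions for this example, not requirements of this module):
#
#     from sklearn.pipeline import make_pipeline
#     from sklearn.preprocessing import StandardScaler
#
#     import bob.pipelines
#
#     # Wrap the estimators so they consume bob.pipelines Samples.
#     pipeline = bob.pipelines.wrap(["sample"], make_pipeline(StandardScaler()))
#
#     # Any object implementing `background_model_samples`, `references`
#     # and `probes` (e.g. a `bob.bio.database` entry-point resource).
#     database = ...
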
@click.command(
    name="train",
    entry_point_group="bob.bio.config",
    cls=ConfigCommand,
    epilog=EPILOG,
)
@click.option(
    "--pipeline",
    "-p",
    required=True,
    entry_point_group="bob.bio.pipeline",
    help="A PipelineSimple or a scikit-learn Pipeline.",
    cls=ResourceOption,
)
@click.option(
    "--database",
    "-d",
    entry_point_group="bob.bio.database",
    required=True,
    help="Biometric database connector (a class that implements the methods: `background_model_samples`, `references` and `probes`).",
    cls=ResourceOption,
)
@click.option(
    "--dask-client",
    "-l",
    entry_point_group="dask.client",
    string_exceptions=VALID_DASK_CLIENT_STRINGS,
    default="single-threaded",
    help="Dask client for the execution of the pipeline.",
    cls=ResourceOption,
)
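# A configuration file can also provide the Dask client directly (a sketch,
# assuming the option is resolved from a variable named after it; the
# LocalCluster parameters are illustrative):
#
#     from dask.distributed import Client, LocalCluster
#
#     dask_client = Client(LocalCluster(n_workers=4))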
@click.option(
    "--output",
    "-o",
    show_default=True,
    default="results",
    help="Name of the output directory where output files will be saved.",
    cls=ResourceOption,
)
@click.option(
    "--memory",
    "-m",
    is_flag=True,
    help="If set, the experiment runs keeping all objects in memory and nothing is checkpointed. If not set, checkpoints are saved in `--output`.",
    cls=ResourceOption,
)
@click.option(
    "--checkpoint-dir",
    "-c",
    show_default=True,
    default=None,
    help="Name of the directory where the checkpoints will be saved. If `--memory` is not set, checkpoints are saved in this directory instead of `--output`.",
    cls=ResourceOption,
)
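# Example invocations for the two checkpointing modes (a sketch; DATABASE,
# PIPELINE and the checkpoint path are placeholders):
#
#     # Keep every intermediate object in memory, write no checkpoints:
#     $ bob bio pipeline train -vv --memory DATABASE PIPELINE
#
#     # Write checkpoints to a custom directory instead of `--output`:
#     $ bob bio pipeline train -vv -c /scratch/checkpoints DATABASE PIPELINE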
@click.option(
    "--dask-partition-size",
    "-s",
    help="If using Dask, this option defines the maximum size of each dask.bag partition. "
    "Use this option if the current heuristic that sets this value doesn't suit your experiment "
    "(https://docs.dask.org/en/latest/bag-api.html?highlight=partition_size#dask.bag.from_sequence).",
    default=None,
    type=click.INT,
    cls=ResourceOption,
)
@click.option(
    "--dask-n-partitions",
    "-n",
    help="If using Dask, this option defines a fixed number of dask.bag partitions for "
    "each set of data. Use this option if the current heuristic that sets this value "
    "doesn't suit your experiment "
    "(https://docs.dask.org/en/latest/bag-api.html?highlight=partition_size#dask.bag.from_sequence).",
    default=None,
    type=click.INT,
    cls=ResourceOption,
)
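# For reference, a sketch of what these two options map to in dask.bag (the
# actual heuristic lives upstream in bob.pipelines; this only contrasts the
# two knobs):
#
#     import dask.bag
#
#     samples = list(range(1000))
#     # --dask-partition-size: cap the number of samples per partition.
#     bag = dask.bag.from_sequence(samples, partition_size=100)
#     # --dask-n-partitions: fix the number of partitions instead.
#     bag = dask.bag.from_sequence(samples, npartitions=10)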
@click.option(
    "--dask-n-workers",
    "-w",
    help="If using Dask, this option defines the number of workers to start your experiment with. "
    "Dask automatically scales the number of workers up or down according to the current load of tasks to be solved. "
    "Use this option if the default number of starting workers doesn't suit your experiment.",
    default=None,
    type=click.INT,
    cls=ResourceOption,
)
@click.option(
    "--force",
    "-f",
    is_flag=True,
    help="If set, it will force the regeneration of all the checkpoints of an experiment. This option has no effect if `--memory` is set.",
    cls=ResourceOption,
)
@click.option(
    "--no-dask",
    is_flag=True,
    help="If set, Dask will not be used to run the experiment.",
    cls=ResourceOption,
)
@click.option(
    "--split-training",
    is_flag=True,
    help="Splits the training set into partitions and trains the pipeline in multiple steps.",
    cls=ResourceOption,
)
@click.option(
    "--n-splits",
    default=3,
    help="Number of partitions to split the training set into. "
    "Each partition is used to train the pipeline in a separate step.",
    cls=ResourceOption,
)
@verbosity_option(cls=ResourceOption, logger=logger)
def pipeline_train(
    pipeline,
    database,
    dask_client,
    output,
    memory,
    checkpoint_dir,
    dask_partition_size,
    dask_n_workers,
    dask_n_partitions,
    force,
    no_dask,
    split_training,
    n_splits,
    **kwargs,
):
    """Runs the training part of a biometric pipeline.

    This pipeline consists of only one component, contrary to the ``simple`` pipeline.
    This component is a scikit-learn ``Pipeline``, in which a sequence of transformations
    of the input data is defined.

    The pipeline is trained on the database and the resulting model is saved in the
    output directory.

    It is possible to split the training data into multiple partitions that are used
    to train the pipeline in multiple steps, which helps with big datasets that would
    not fit in memory if trained all at once. Passing the ``--split-training`` option
    splits the training data into ``--n-splits`` partitions and trains the pipeline
    sequentially with each partition. The pipeline must support "continuous learning"
    (a call to ``fit`` on an already trained pipeline should continue the training);
    a sketch of this contract follows below.
    """
    from bob.bio.base.pipelines import execute_pipeline_train

    if no_dask:
        dask_client = None

    checkpoint = not memory

    logger.debug("Executing pipeline training with:")
    logger.debug(f"pipeline: {pipeline}")
    logger.debug(f"database: {database}")

    execute_pipeline_train(
        pipeline=pipeline,
        database=database,
        dask_client=dask_client,
        output=output,
        checkpoint=checkpoint,
        dask_partition_size=dask_partition_size,
        dask_n_partitions=dask_n_partitions,
        dask_n_workers=dask_n_workers,
        checkpoint_dir=checkpoint_dir,
        force=force,
        split_training=split_training,
        n_splits=n_splits,
        **kwargs,
    )

    logger.info(f"Experiment finished! ({output=})")