Coverage for src/bob/bio/base/script/pipeline_train.py: 100%

31 statements  

coverage.py v7.6.0, created at 2024-07-12 22:34 +0200

#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Tiago de Freitas Pereira <tiago.pereira@idiap.ch>


"""Executes only the train part of a biometric pipeline"""

import logging

import click

from clapper.click import ConfigCommand, ResourceOption, verbosity_option

from bob.pipelines.distributed import VALID_DASK_CLIENT_STRINGS

logger = logging.getLogger(__name__)


EPILOG = """\b

Command line examples
---------------------

$ bob bio pipeline train -vv DATABASE PIPELINE

See the help of the CONFIG argument on top of this help message
for a list of available configurations.

It is possible to provide the database and the pipeline through a configuration file.
Generate an example configuration file with:

$ bob bio pipeline train --dump-config my_experiment.py

and execute it with:

$ bob bio pipeline train -vv my_experiment.py

my_experiment.py must contain the following elements:

>>> pipeline = ...  # A scikit-learn pipeline wrapped with bob.pipelines' SampleWrapper
>>> database = ...  # Biometric Database (a class implementing the methods: `background_model_samples`, `references` and `probes`)
\b"""
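
# For reference, a minimal sketch of what such a my_experiment.py could
# contain (hedged: the estimators below are placeholders; ``wrap`` is the
# bob.pipelines helper that applies the SampleWrapper mentioned above):
#
#     from sklearn.pipeline import make_pipeline
#     from sklearn.preprocessing import StandardScaler
#
#     from bob.pipelines import wrap
#
#     # Wrap the scikit-learn pipeline so it consumes Sample objects
#     pipeline = wrap(["sample"], make_pipeline(StandardScaler()))
#
#     # Any connector implementing `background_model_samples`,
#     # `references` and `probes` (e.g. a bob.bio.database resource)
#     database = ...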

@click.command(
    name="train",
    entry_point_group="bob.bio.config",
    cls=ConfigCommand,
    epilog=EPILOG,
)
@click.option(
    "--pipeline",
    "-p",
    required=True,
    entry_point_group="bob.bio.pipeline",
    help="A PipelineSimple or an sklearn.pipeline.",
    cls=ResourceOption,
)
@click.option(
    "--database",
    "-d",
    entry_point_group="bob.bio.database",
    required=True,
    help="Biometric Database connector (class that implements the methods: `background_model_samples`, `references` and `probes`).",
    cls=ResourceOption,
)
@click.option(
    "--dask-client",
    "-l",
    entry_point_group="dask.client",
    string_exceptions=VALID_DASK_CLIENT_STRINGS,
    default="single-threaded",
    help="Dask client for the execution of the pipeline.",
    cls=ResourceOption,
)
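# Besides the accepted strings in VALID_DASK_CLIENT_STRINGS (e.g. the default
# "single-threaded"), a configuration file can provide an actual client. A
# hedged sketch using a local dask.distributed cluster (the parameters are
# illustrative, not a recommendation):
#
#     from dask.distributed import Client
#
#     dask_client = Client(n_workers=4, threads_per_worker=1)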

@click.option(
    "--output",
    "-o",
    show_default=True,
    default="results",
    help="Name of the output directory where output files will be saved.",
    cls=ResourceOption,
)
@click.option(
    "--memory",
    "-m",
    is_flag=True,
    help="If set, it will run the experiment keeping all objects in memory with nothing checkpointed. If not set, checkpoints will be saved in `--output`.",
    cls=ResourceOption,
)
@click.option(
    "--checkpoint-dir",
    "-c",
    show_default=True,
    default=None,
    help="Name of the directory where the checkpoints will be saved. If `--memory` is not set, checkpoints will be saved in this directory.",
    cls=ResourceOption,
)
@click.option(
    "--dask-partition-size",
    "-s",
    help="If using Dask, this option defines the maximum size of each dask.bag partition. "
    "Use this option if the current heuristic that sets this value doesn't suit your experiment "
    "(https://docs.dask.org/en/latest/bag-api.html?highlight=partition_size#dask.bag.from_sequence).",
    default=None,
    type=click.INT,
    cls=ResourceOption,
)
@click.option(
    "--dask-n-partitions",
    "-n",
    help="If using Dask, this option defines a fixed number of dask.bag partitions for "
    "each set of data. Use this option if the current heuristic that sets this value "
    "doesn't suit your experiment "
    "(https://docs.dask.org/en/latest/bag-api.html?highlight=partition_size#dask.bag.from_sequence).",
    default=None,
    type=click.INT,
    cls=ResourceOption,
)
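# For intuition, the two options above map to the two ways of partitioning a
# dask.bag. A hedged sketch (the actual wiring happens inside bob.pipelines,
# not in this script; the numbers are placeholders):
#
#     import dask.bag
#
#     # --dask-partition-size: fixed size per partition
#     bag = dask.bag.from_sequence(samples, partition_size=200)
#     # --dask-n-partitions: fixed number of partitions
#     bag = dask.bag.from_sequence(samples, npartitions=10)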

@click.option(
    "--dask-n-workers",
    "-w",
    help="If using Dask, this option defines the number of workers to start your experiment with. "
    "Dask automatically scales the number of workers up and down according to the current load of tasks. "
    "Use this option if the default number of starting workers doesn't suit your experiment.",
    default=None,
    type=click.INT,
    cls=ResourceOption,
)
@click.option(
    "--force",
    "-f",
    is_flag=True,
    help="If set, it will force the regeneration of all the checkpoints of an experiment. This option has no effect if `--memory` is set.",
    cls=ResourceOption,
)
@click.option(
    "--no-dask",
    is_flag=True,
    help="If set, it will not use Dask to run the experiment.",
    cls=ResourceOption,
)
@click.option(
    "--split-training",
    is_flag=True,
    help="Splits the training set into partitions and trains the pipeline in multiple steps.",
    cls=ResourceOption,
)
@click.option(
    "--n-splits",
    default=3,
    help="Number of partitions to split the training set into. "
    "The pipeline is trained on each partition in a separate step.",
    cls=ResourceOption,
)

@verbosity_option(cls=ResourceOption, logger=logger)
def pipeline_train(
    pipeline,
    database,
    dask_client,
    output,
    memory,
    checkpoint_dir,
    dask_partition_size,
    dask_n_workers,
    dask_n_partitions,
    force,
    no_dask,
    split_training,
    n_splits,
    **kwargs,
):
    """Runs the training part of a biometrics pipeline.

    This pipeline consists of only one component, contrary to the ``simple``
    pipeline. That component is a scikit-learn ``Pipeline``, which defines a
    sequence of transformations of the input data.

    The pipeline is trained on the database and the resulting model is saved
    in the output directory.

    It is possible to split the training data into multiple partitions that
    are used to train the pipeline in multiple steps, which helps with big
    datasets that would not fit in memory if trained all at once. Passing the
    ``--split-training`` option will split the training data into
    ``--n-splits`` partitions and train the pipeline sequentially on each
    partition. The pipeline must support "continuous learning" (a call to
    ``fit`` on an already trained pipeline should continue the training).
    """
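    # "Continuous learning" means that each ``fit`` call resumes from the
    # current model state instead of retraining from scratch. A minimal
    # sketch of a compatible transformer (hypothetical class, for
    # illustration only):
    #
    #     import numpy
    #     from sklearn.base import BaseEstimator, TransformerMixin
    #
    #     class RunningMeanCenterer(BaseEstimator, TransformerMixin):
    #         def __init__(self):
    #             self.sum_, self.count_ = 0.0, 0
    #
    #         def fit(self, X, y=None):
    #             X = numpy.asarray(X)
    #             self.sum_ = self.sum_ + X.sum(axis=0)  # accumulate stats
    #             self.count_ += len(X)  # instead of resetting them
    #             return self
    #
    #         def transform(self, X):
    #             return numpy.asarray(X) - self.sum_ / self.count_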

    from bob.bio.base.pipelines import execute_pipeline_train

    # Running with --no-dask: no client is handed to the executor
    if no_dask:
        dask_client = None

    # In-memory mode (--memory) disables checkpointing
    checkpoint = not memory

    logger.debug("Executing pipeline training with:")
    logger.debug(f"pipeline: {pipeline}")
    logger.debug(f"database: {database}")

    execute_pipeline_train(
        pipeline=pipeline,
        database=database,
        dask_client=dask_client,
        output=output,
        checkpoint=checkpoint,
        dask_partition_size=dask_partition_size,
        dask_n_partitions=dask_n_partitions,
        dask_n_workers=dask_n_workers,
        checkpoint_dir=checkpoint_dir,
        force=force,
        split_training=split_training,
        n_splits=n_splits,
        **kwargs,
    )

    logger.info(f"Experiment finished! ({output=})")