Coverage for src/deepdraw/script/train_analysis.py: 96%

3# SPDX-License-Identifier: GPL-3.0-or-later

5import os

7import click

8import matplotlib.pyplot as plt

9import numpy

10import pandas

12from clapper.click import ConfigCommand, ResourceOption, verbosity_option

13from clapper.logging import setup

14from matplotlib.backends.backend_pdf import PdfPages

# Logger named after the first dotted component of this module's name;
# messages are rendered as "<LEVEL>: <message>".
logger = setup(__name__.partition(".")[0], format="%(levelname)s: %(message)s")

def _loss_evolution(df):
    """Plot the loss evolution over time (epochs).

    Parameters
    ----------

    df : pandas.DataFrame
        dataframe containing the training logs.  The ``epoch`` and ``loss``
        columns are always read; ``validation_loss`` and
        ``extra_validation_losses`` are plotted when present.  The dataframe
        is left unmodified.


    Returns
    -------

    figure : matplotlib.figure.Figure
        figure to be displayed or saved to file
    """

    figure = plt.figure()
    axes = figure.gca()

    epochs = df.epoch.values

    axes.plot(epochs, df.loss.values, label="Training")

    if "validation_loss" in df.columns:
        validation = df.validation_loss.values
        axes.plot(epochs, validation, label="Validation")

        # shows a magenta dot ("mo") on the location with the minima on the
        # validation set
        lowest_index = numpy.argmin(df["validation_loss"])

        # ``lowest_index`` is a *position*, so it is applied to the raw arrays
        # rather than to the Series (whose ``[]`` performs label lookup and
        # would pick the wrong row, or fail, on a non-default index)
        lowest_loss = validation[lowest_index]
        lowest_epoch = epochs[lowest_index]

        axes.plot(
            lowest_epoch,
            lowest_loss,
            "mo",
            label=f"Lowest validation ({lowest_loss:.3f}@{lowest_epoch})",
        )

    if "extra_validation_losses" in df.columns:
        # These losses are in array format (a string such as "[0.1 0.2]" per
        # row).  We parse all rows into a local list -- the caller's dataframe
        # is not overwritten, so this function can be called again on the same
        # data -- then create a 2d array.  We transpose the array to iterate
        # over each column and plot the losses individually.  They are
        # numbered from 1.
        parsed = [
            numpy.fromstring(row.strip("[]"), sep=" ")
            for row in df["extra_validation_losses"]
        ]
        losses = numpy.vstack(parsed).T
        for n, k in enumerate(losses, start=1):
            axes.plot(epochs, k, label=f"Extra validation {n}")

    axes.set_title("Loss over time")
    axes.set_xlabel("Epoch")
    axes.set_ylabel("Loss")

    axes.legend(loc="best")
    axes.grid(alpha=0.3)
    figure.set_layout_engine("tight")

    return figure

def _hardware_utilisation(df, const):
    """Draw relative hardware usage per epoch (CPU, plus GPU when logged).

    Parameters
    ----------

    df : pandas.DataFrame
        dataframe containing the training logs

    const : dict
        training and hardware constants


    Returns
    -------

    figure : matplotlib.figure.Figure
        figure to be displayed or saved to file
    """
    fig = plt.figure()
    ax = fig.gca()

    # CPU load is divided by the number of cores; resident memory is turned
    # into a percentage of the total CPU memory
    cpu_load = df.cpu_percent.values / const["cpu_count"]
    cpu_mem = 100 * df.cpu_rss / const["cpu_memory_total"]

    epochs = df.epoch.values

    ax.plot(
        epochs,
        cpu_load,
        label=f"CPU usage (cores: {const['cpu_count']})",
    )
    ax.plot(
        epochs,
        cpu_mem,
        label=f"CPU memory (total: {const['cpu_memory_total']:.1f} Gb)",
    )

    # GPU curves are optional: each one is drawn only if its column was
    # logged.  Labels are built lazily so that the GPU constants are only
    # looked up when the matching column exists.
    optional_curves = (
        (
            "gpu_percent",
            lambda: f"GPU usage (type: {const['gpu_name']})",
        ),
        (
            "gpu_memory_percent",
            lambda: f"GPU memory (total: {const['gpu_memory_total']:.1f} Gb)",
        ),
    )
    for column, make_label in optional_curves:
        if column in df:
            ax.plot(epochs, df[column].values, label=make_label())

    ax.set_title("Hardware utilisation over time")
    ax.set_xlabel("Epoch")
    ax.set_ylabel("Relative utilisation (%)")
    ax.set_ylim([0, 100])

    ax.legend(loc="best")
    ax.grid(alpha=0.3)
    fig.set_layout_engine("tight")

    return fig

133

134

@click.command(
    entry_point_group="deepdraw.config",
    cls=ConfigCommand,
    epilog="""Examples:

\b
    1. Analyzes a training log and produces various plots:

       .. code:: sh

          $ deepdraw train-analysis -vv log.csv constants.csv
""",
)
@click.argument(
    "log",
    type=click.Path(dir_okay=False, exists=True),
)
@click.argument(
    "constants",
    type=click.Path(dir_okay=False, exists=True),
)
@click.option(
    "--output-pdf",
    "-o",
    help="Name of the output file to dump",
    required=True,
    show_default=True,
    default="trainlog.pdf",
)
@verbosity_option(logger=logger, cls=ResourceOption)
@click.pass_context
def train_analysis(ctx, log, constants, output_pdf, verbose, **kwargs):
    """Analyze the training logs for loss evolution and resource
    utilisation."""
    # NOTE: the docstring above is kept short on purpose -- it is the
    # function's only docstring and is left exactly as originally written.
    #
    # Arguments, as used below:
    #   log        -- path to the CSV file with the per-epoch training log
    #   constants  -- path to the CSV file whose first data row holds the
    #                 constants handed to ``_hardware_utilisation``
    #   output_pdf -- path of the PDF to write (one page per figure)
    # ``ctx``, ``verbose`` and ``kwargs`` are accepted but not used here.

    # only the first data row of the constants file is used, as a
    # column-name -> value mapping
    constants_table = pandas.read_csv(constants)
    constants = dict(zip(constants_table.keys(), constants_table.values[0]))
    data = pandas.read_csv(log)

    # makes sure the directory to save the output PDF is there;
    # ``exist_ok=True`` avoids the check-then-create race of testing for
    # existence first
    dirname = os.path.dirname(os.path.realpath(output_pdf))
    os.makedirs(dirname, exist_ok=True)

    # now, do the analysis: each figure is written as one PDF page and closed
    # right away so that it does not linger in pyplot's figure registry
    with PdfPages(output_pdf) as pdf:
        figure = _loss_evolution(data)
        pdf.savefig(figure)
        plt.close(figure)

        figure = _hardware_utilisation(data, constants)
        pdf.savefig(figure)
        plt.close(figure)

Coverage for src/deepdraw/script/train_analysis.py: 96%

68 statements