Coverage for src/deepdraw/script/train_analysis.py: 96%
68 statements
« prev ^ index » next coverage.py v7.3.1, created at 2023-11-30 15:00 +0100
« prev ^ index » next coverage.py v7.3.1, created at 2023-11-30 15:00 +0100
1# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
2#
3# SPDX-License-Identifier: GPL-3.0-or-later
5import os
7import click
8import matplotlib.pyplot as plt
9import numpy
10import pandas
12from clapper.click import ConfigCommand, ResourceOption, verbosity_option
13from clapper.logging import setup
14from matplotlib.backends.backend_pdf import PdfPages
# Module-level logger.  It is named after the first dotted component of this
# module's ``__name__`` and is set up through ``clapper.logging.setup`` with a
# "LEVELNAME: message" format string.
logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
def _loss_evolution(df):
    """Plots the loss evolution over time (epochs)

    Parameters
    ----------

    df : pandas.DataFrame
        dataframe containing the training logs.  The ``epoch`` and ``loss``
        columns are always read.  The ``validation_loss`` and
        ``extra_validation_losses`` columns are plotted when present.  The
        dataframe is left unmodified.


    Returns
    -------

    figure : matplotlib.figure.Figure
        figure to be displayed or saved to file
    """

    figure = plt.figure()
    axes = figure.gca()

    epochs = df.epoch.values

    axes.plot(epochs, df.loss.values, label="Training")
    if "validation_loss" in df.columns:
        validation = df.validation_loss.values
        axes.plot(epochs, validation, label="Validation")

        # shows a magenta dot on the location with the minima on the
        # validation set
        lowest_index = numpy.argmin(df["validation_loss"])

        # ``argmin`` yields a *position*, not an index label: look the values
        # up in the raw arrays so this also works when ``df`` does not carry
        # the default ``RangeIndex`` (e.g. after filtering or re-indexing).
        lowest_loss = validation[lowest_index]
        lowest_epoch = epochs[lowest_index]

        axes.plot(
            lowest_epoch,
            lowest_loss,
            "mo",
            label=f"Lowest validation ({lowest_loss:.3f}@{lowest_epoch})",
        )

    if "extra_validation_losses" in df.columns:
        # These losses are stored as text in array format (e.g. "[0.1 0.2]").
        # We parse every row, stack the rows into a 2d array and transpose it
        # to iterate over each column and plot the losses individually.  They
        # are numbered from 1.  The parsed arrays are kept in a local list so
        # the caller's dataframe is not overwritten (which would make a second
        # call on the same dataframe fail).
        parsed_rows = [
            numpy.fromstring(row.strip("[]"), sep=" ")
            for row in df["extra_validation_losses"]
        ]
        losses = numpy.vstack(parsed_rows).T
        for n, k in enumerate(losses, start=1):
            axes.plot(epochs, k, label=f"Extra validation {n}")

    axes.set_title("Loss over time")
    axes.set_xlabel("Epoch")
    axes.set_ylabel("Loss")

    axes.legend(loc="best")
    axes.grid(alpha=0.3)
    figure.set_layout_engine("tight")

    return figure
def _hardware_utilisation(df, const):
    """Plot the relative hardware utilisation over time (epochs).

    CPU usage and CPU memory are always drawn.  GPU usage and GPU memory are
    added when the corresponding columns exist in ``df``.

    Parameters
    ----------

    df : pandas.DataFrame
        dataframe containing the training logs.  Columns read: ``epoch``,
        ``cpu_percent``, ``cpu_rss`` and, when present, ``gpu_percent`` and
        ``gpu_memory_percent``.

    const : dict
        training and hardware constants.  Keys read: ``cpu_count``,
        ``cpu_memory_total`` and, when the matching GPU column is present,
        ``gpu_name`` and ``gpu_memory_total``.


    Returns
    -------

    figure : matplotlib.figure.Figure
        figure to be displayed or saved to file
    """
    figure = plt.figure()
    axes = figure.gca()

    epochs = df.epoch.values
    n_cores = const["cpu_count"]
    total_ram = const["cpu_memory_total"]

    # (y-values, legend label) pairs, in drawing order
    curves = [
        # CPU percentage is divided by the core count
        (df.cpu_percent.values / n_cores, f"CPU usage (cores: {n_cores})"),
        # resident memory expressed as a percentage of the total
        (
            100 * df.cpu_rss / total_ram,
            f"CPU memory (total: {total_ram:.1f} Gb)",
        ),
    ]

    if "gpu_percent" in df:
        curves.append(
            (
                df.gpu_percent.values,
                f"GPU usage (type: {const['gpu_name']})",
            )
        )

    if "gpu_memory_percent" in df:
        curves.append(
            (
                df.gpu_memory_percent.values,
                f"GPU memory (total: {const['gpu_memory_total']:.1f} Gb)",
            )
        )

    for values, legend in curves:
        axes.plot(epochs, values, label=legend)

    axes.set_title("Hardware utilisation over time")
    axes.set_xlabel("Epoch")
    axes.set_ylabel("Relative utilisation (%)")
    axes.set_ylim([0, 100])

    axes.legend(loc="best")
    axes.grid(alpha=0.3)
    figure.set_layout_engine("tight")

    return figure
@click.command(
    entry_point_group="deepdraw.config",
    cls=ConfigCommand,
    epilog="""Examples:

\b
  1. Analyzes a training log and produces various plots:

     .. code:: sh

        $ deepdraw train-analysis -vv log.csv constants.csv
""",
)
@click.argument(
    "log",
    type=click.Path(dir_okay=False, exists=True),
)
@click.argument(
    "constants",
    type=click.Path(dir_okay=False, exists=True),
)
@click.option(
    "--output-pdf",
    "-o",
    help="Name of the output file to dump",
    required=True,
    show_default=True,
    default="trainlog.pdf",
)
@verbosity_option(logger=logger, cls=ResourceOption)
@click.pass_context
def train_analysis(ctx, log, constants, output_pdf, verbose, **kwargs):
    """Analyze the training logs for loss evolution and resource
    utilisation."""
    # NOTE: the docstring above is kept verbatim because click shows it as
    # the command help text.
    #
    # ``log`` and ``constants`` are paths to existing CSV files and
    # ``output_pdf`` is the path of the PDF to write.  The PDF gets one page
    # with the loss evolution followed by one page with the hardware
    # utilisation.

    # only the first data row of the constants file is used; it is turned
    # into a ``{column name: value}`` dictionary
    constants = pandas.read_csv(constants)
    constants = dict(zip(constants.keys(), constants.values[0]))
    data = pandas.read_csv(log)

    # makes sure the directory to save the output PDF is there;
    # ``exist_ok=True`` avoids the race between checking and creating it
    os.makedirs(os.path.dirname(os.path.realpath(output_pdf)), exist_ok=True)

    # now, do the analysis
    with PdfPages(output_pdf) as pdf:
        figure = _loss_evolution(data)
        try:
            pdf.savefig(figure)
        finally:
            # release the figure even if saving the page fails
            plt.close(figure)

        figure = _hardware_utilisation(data, constants)
        try:
            pdf.savefig(figure)
        finally:
            plt.close(figure)