Coverage for src/deepdraw/script/train_analysis.py: 96%

68 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-11-30 15:00 +0100

1# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> 

2# 

3# SPDX-License-Identifier: GPL-3.0-or-later 

4 

5import os 

6 

7import click 

8import matplotlib.pyplot as plt 

9import numpy 

10import pandas 

11 

12from clapper.click import ConfigCommand, ResourceOption, verbosity_option 

13from clapper.logging import setup 

14from matplotlib.backends.backend_pdf import PdfPages 

15 

# Module-wide logger.  It is named after the top-level package (the first
# component of this module's dotted name) and prints records as
# "LEVEL: message".  The same object is handed to ``verbosity_option`` below.
logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")

17 

18 

def _loss_evolution(df):
    """Plot the loss evolution over time (epochs).

    The training loss is always drawn.  When the corresponding columns exist,
    the validation loss (with a marker on its minimum) and one curve per
    entry of ``extra_validation_losses`` are drawn as well.

    Parameters
    ----------

    df : pandas.DataFrame
        dataframe containing the training logs.  Columns ``epoch`` and
        ``loss`` are required; ``validation_loss`` and
        ``extra_validation_losses`` are optional.  Each cell of
        ``extra_validation_losses`` is expected to be a string holding
        space-separated numbers, optionally wrapped in square brackets.
        The dataframe is left unmodified.


    Returns
    -------

    figure : matplotlib.figure.Figure
        figure to be displayed or saved to file
    """

    figure = plt.figure()
    axes = figure.gca()

    epochs = df.epoch.values

    axes.plot(epochs, df.loss.values, label="Training")
    if "validation_loss" in df.columns:
        validation = df.validation_loss.values
        axes.plot(epochs, validation, label="Validation")

        # shows a magenta dot on the location with the minima on the
        # validation set.  ``argmin`` yields a *position*, so the matching
        # values are read from the plain arrays instead of going through the
        # label-based ``Series[...]`` indexer (which only agrees with
        # positions when the dataframe carries a default integer index).
        lowest_index = numpy.argmin(df["validation_loss"])
        lowest_loss = validation[lowest_index]
        lowest_epoch = epochs[lowest_index]

        axes.plot(
            lowest_epoch,
            lowest_loss,
            "mo",
            label=f"Lowest validation ({lowest_loss:.3f}@{lowest_epoch})",
        )

    if "extra_validation_losses" in df.columns:
        # These losses are in array format. So, we read all rows, then create a
        # 2d array. We transpose the array to iterate over each column and
        # plot the losses individually. They are numbered from 1.
        #
        # The parsed rows are kept in a local list: writing them back into
        # ``df`` would alter the caller's dataframe and make a second call on
        # the same object fail (arrays have no ``strip``).
        parsed_rows = [
            numpy.fromstring(row.strip("[]"), sep=" ")
            for row in df["extra_validation_losses"]
        ]
        losses = numpy.vstack(parsed_rows).T
        for n, k in enumerate(losses):
            axes.plot(epochs, k, label=f"Extra validation {n+1}")

    axes.set_title("Loss over time")
    axes.set_xlabel("Epoch")
    axes.set_ylabel("Loss")

    axes.legend(loc="best")
    axes.grid(alpha=0.3)
    figure.set_layout_engine("tight")

    return figure

74 

75 

def _hardware_utilisation(df, const):
    """Draw the relative hardware utilisation against the epoch number.

    Two CPU curves are always drawn: ``cpu_percent`` divided by the number of
    cores, and ``cpu_rss`` expressed as a percentage of the total CPU memory.
    GPU usage and GPU memory curves are added when the columns
    ``gpu_percent`` and ``gpu_memory_percent`` are present.  The vertical axis
    is fixed to the 0-100 range.

    Parameters
    ----------

    df : pandas.DataFrame
        dataframe containing the training logs

    const : dict
        training and hardware constants.  The keys ``cpu_count`` and
        ``cpu_memory_total`` are always read; ``gpu_name`` and
        ``gpu_memory_total`` are read only when the matching GPU column
        exists in ``df``.


    Returns
    -------

    figure : matplotlib.figure.Figure
        figure to be displayed or saved to file
    """
    fig = plt.figure()
    ax = fig.gca()

    cpu_usage = df.cpu_percent.values / const["cpu_count"]
    cpu_ram = 100 * df.cpu_rss / const["cpu_memory_total"]
    epochs = df.epoch.values

    # (y-values, legend label) pairs, listed in drawing order
    curves = [
        (cpu_usage, f"CPU usage (cores: {const['cpu_count']})"),
        (
            cpu_ram,
            f"CPU memory (total: {const['cpu_memory_total']:.1f} Gb)",
        ),
    ]

    # GPU entries are optional: their constants are only looked up when the
    # log actually carries the corresponding column
    if "gpu_percent" in df.columns:
        curves.append(
            (
                df.gpu_percent.values,
                f"GPU usage (type: {const['gpu_name']})",
            )
        )
    if "gpu_memory_percent" in df.columns:
        curves.append(
            (
                df.gpu_memory_percent.values,
                f"GPU memory (total: {const['gpu_memory_total']:.1f} Gb)",
            )
        )

    for values, legend in curves:
        ax.plot(epochs, values, label=legend)

    ax.set_title("Hardware utilisation over time")
    ax.set_xlabel("Epoch")
    ax.set_ylabel("Relative utilisation (%)")
    ax.set_ylim([0, 100])

    ax.legend(loc="best")
    ax.grid(alpha=0.3)
    fig.set_layout_engine("tight")

    return fig

133 

134 

135@click.command( 

136 entry_point_group="deepdraw.config", 

137 cls=ConfigCommand, 

138 epilog="""Examples: 

139 

140\b 

141 1. Analyzes a training log and produces various plots: 

142 

143 .. code:: sh 

144 

145 $ deepdraw train-analysis -vv log.csv constants.csv 

146""", 

147) 

148@click.argument( 

149 "log", 

150 type=click.Path(dir_okay=False, exists=True), 

151) 

152@click.argument( 

153 "constants", 

154 type=click.Path(dir_okay=False, exists=True), 

155) 

156@click.option( 

157 "--output-pdf", 

158 "-o", 

159 help="Name of the output file to dump", 

160 required=True, 

161 show_default=True, 

162 default="trainlog.pdf", 

163) 

164@verbosity_option(logger=logger, cls=ResourceOption) 

165@click.pass_context 

def train_analysis(ctx, log, constants, output_pdf, verbose, **kwargs):
    """Analyze the training logs for loss evolution and resource
    utilisation."""
    # NOTE: the docstring above is kept short on purpose; click shows a
    # command function's docstring as its help text.
    #
    # Parameters (filled in by the click decorators on this function):
    #   log        -- path to the CSV file with the per-epoch training log
    #   constants  -- path to the CSV file with training/hardware constants
    #   output_pdf -- path of the PDF to write; one page per figure
    # ``ctx``, ``verbose`` and ``kwargs`` are accepted but not used here.

    # Only the first data row of the constants file is used; it is turned
    # into a ``{column name: value}`` dictionary.
    constants = pandas.read_csv(constants)
    constants = dict(zip(constants.keys(), constants.values[0]))
    data = pandas.read_csv(log)

    # makes sure the directory to save the output PDF is there.
    # ``exist_ok=True`` avoids the check-then-create race of testing for the
    # directory before calling ``makedirs``.
    os.makedirs(os.path.dirname(os.path.realpath(output_pdf)), exist_ok=True)

    # now, do the analysis: one plotting function (with its arguments) per
    # page, in output order
    pages = (
        (_loss_evolution, (data,)),
        (_hardware_utilisation, (data, constants)),
    )

    with PdfPages(output_pdf) as pdf:
        for plotter, args in pages:
            figure = plotter(*args)
            try:
                pdf.savefig(figure)
            finally:
                # always release the figure, even if writing the page fails,
                # so it does not stay registered with pyplot
                plt.close(figure)