#!/usr/bin/env python
# vim: set fileencoding=utf-8 :

"""Tools for interacting with the running computer or GPU"""

import logging
import shutil
import subprocess

import psutil

logger = logging.getLogger(__name__)

_nvidia_smi = shutil.which("nvidia-smi")
"""Location of the nvidia-smi program, if one exists"""


GB = float(2 ** 30)
"""The number of bytes in a gigabyte"""

def run_nvidia_smi(query, rename=None):
    """Returns GPU information from a query

    For a comprehensive list of options and help, execute ``nvidia-smi
    --help-query-gpu`` on a host with a GPU.


    Parameters
    ----------

    query : list
        A list of query strings as defined by ``nvidia-smi --help-query-gpu``

    rename : :py:class:`list`, optional
        A list of keys to yield in the return value for each entry above.  It
        gives you the opportunity to rewrite some key names for convenience.
        This list, if provided, must be of the same length as ``query``.


    Returns
    -------

    data : :py:class:`tuple`, None
        A tuple of ``(name, value)`` pairs containing the queried parameters
        (using the ``rename`` keys, if provided).  If ``nvidia-smi`` is not
        available, returns ``None``.  Percentage information is left alone,
        memory information is transformed to gigabytes (floating-point).

    """

    if _nvidia_smi is not None:

        if rename is None:
            rename = query
        else:
            assert len(rename) == len(query)

        # asks nvidia-smi for a single CSV line with the requested fields
        values = subprocess.getoutput(
            "%s --query-gpu=%s --format=csv,noheader"
            % (_nvidia_smi, ",".join(query))
        )
        values = [k.strip() for k in values.split(",")]
        t_values = []
        for k in values:
            if k.endswith("%"):
                t_values.append(float(k[:-1].strip()))
            elif k.endswith("MiB"):
                t_values.append(float(k[:-3].strip()) / 1024)  # MiB -> GB
            else:
                t_values.append(k)  # unchanged
        return tuple(zip(rename, t_values))
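
# Editor's note: a minimal usage sketch for :py:func:`run_nvidia_smi`.  The
# ``temperature.gpu`` field is a standard ``nvidia-smi --help-query-gpu``
# entry, but this helper is hypothetical and not part of the original module.
def _example_gpu_temperature():
    """Illustrative only: fetches the current GPU temperature (in Celsius)"""
    data = run_nvidia_smi(["temperature.gpu"], ["gpu_temperature"])
    if data is None:  # no nvidia-smi on this host
        return None
    return data[0][1]  # nvidia-smi reports this value as a plain string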

def gpu_constants():
    """Returns GPU (static) information using nvidia-smi

    See :py:func:`run_nvidia_smi` for operational details.

    Returns
    -------

    data : :py:class:`tuple`, None
        If ``nvidia-smi`` is not available, returns ``None``, otherwise, we
        return a tuple of ``(name, value)`` pairs containing the following
        ``nvidia-smi`` query information:

        * ``gpu_name``, as ``gpu_name`` (:py:class:`str`)
        * ``driver_version``, as ``gpu_driver_version`` (:py:class:`str`)
        * ``memory.total``, as ``gpu_memory_total`` (transformed to gigabytes,
          :py:class:`float`)

    """

    return run_nvidia_smi(
        ("gpu_name", "driver_version", "memory.total"),
        ("gpu_name", "gpu_driver_version", "gpu_memory_total"),
    )
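
# Editor's sketch (not part of the original module): one way to record the
# static GPU information once, e.g. at application start-up.  The helper name
# is hypothetical; it simply converts the returned pairs into a dictionary.
def _example_log_gpu_constants():
    constants = gpu_constants()
    if constants is None:
        logger.info("nvidia-smi not found - no GPU information available")
    else:
        logger.info("GPU constants: %s", dict(constants))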

def gpu_log():
    """Returns GPU information on the current (non-static) status using nvidia-smi

    See :py:func:`run_nvidia_smi` for operational details.

    Returns
    -------

    data : :py:class:`tuple`, None
        If ``nvidia-smi`` is not available, returns ``None``, otherwise, we
        return a tuple of ``(name, value)`` pairs containing the following
        ``nvidia-smi`` query information:

        * ``memory.used``, as ``gpu_memory_used`` (transformed to gigabytes,
          :py:class:`float`)
        * ``memory.free``, as ``gpu_memory_free`` (transformed to gigabytes,
          :py:class:`float`)
        * ``100*memory.used/memory.total``, as ``gpu_memory_percent``
          (:py:class:`float`, in percent)
        * ``utilization.gpu``, as ``gpu_percent``
          (:py:class:`float`, in percent)

    """

    retval = run_nvidia_smi(
        (
            "memory.total",
            "memory.used",
            "memory.free",
            "utilization.gpu",
        ),
        (
            "gpu_memory_total",
            "gpu_memory_used",
            "gpu_memory_free",
            "gpu_percent",
        ),
    )

    if retval is None:  # no nvidia-smi on this host
        return None

    # re-compose the output to generate expected values
    return (
        retval[1],  # gpu_memory_used
        retval[2],  # gpu_memory_free
        ("gpu_memory_percent", 100 * (retval[1][1] / retval[0][1])),
        retval[3],  # gpu_percent
    )
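
# Editor's sketch (not part of the original module): periodic sampling of the
# dynamic GPU status.  The sample count and interval below are arbitrary and
# for illustration only.
def _example_monitor_gpu(samples=3, interval=1.0):
    import time

    for _ in range(samples):
        status = gpu_log()
        if status is None:  # no GPU (or no nvidia-smi) on this host
            break
        logger.info("GPU status: %s", dict(status))
        time.sleep(interval)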

_CLUSTER = []
"""List of processes currently being monitored"""


def cpu_constants():
    """Returns static CPU information about the current system.


    Returns
    -------

    data : tuple
        A tuple of ``(name, value)`` pairs containing these entries:

        0. ``cpu_memory_total`` (:py:class:`float`): total memory available,
           in gigabytes
        1. ``cpu_count`` (:py:class:`int`): number of logical CPUs available

    """

    return (
        ("cpu_memory_total", psutil.virtual_memory().total / GB),
        ("cpu_count", psutil.cpu_count(logical=True)),
    )

def cpu_log():
    """Returns process (+children) information using ``psutil``.

    This call examines the current process plus any spawned children and
    returns the combined resource usage summary for the process group.


    Returns
    -------

    data : tuple
        A tuple of ``(name, value)`` pairs containing these entries:

        0. ``cpu_memory_used`` (:py:class:`float`): total memory used from
           the system, in gigabytes
        1. ``cpu_rss`` (:py:class:`float`): RAM currently used by
           process and children, in gigabytes
        2. ``cpu_vms`` (:py:class:`float`): total memory (RAM + swap) currently
           used by process and children, in gigabytes
        3. ``cpu_percent`` (:py:class:`float`): percentage of the total CPU
           used by this process and children (recursively) since the last call
           (the value from the first call should be ignored).  This number
           depends on the number of CPUs in the system and can be greater
           than 100%
        4. ``cpu_processes`` (:py:class:`int`): total number of processes
           including self and children (recursively)
        5. ``cpu_open_files`` (:py:class:`int`): total number of open files
           held by self and children

    """

    global _CLUSTER
    if (not _CLUSTER) or (_CLUSTER[0] != psutil.Process()):  # initialization
        this = psutil.Process()
        _CLUSTER = [this] + this.children(recursive=True)
        # touch cpu_percent() at least once for all
        [k.cpu_percent(interval=None) for k in _CLUSTER]
    else:
        # check all cluster components and update process list
        # done so we can keep the cpu_percent() initialization
        stored_children = set(_CLUSTER[1:])
        current_children = set(_CLUSTER[0].children())
        # keep children that are still alive (already initialized), then add
        # newly spawned children and prime their cpu_percent() counters
        keep_children = stored_children & current_children
        new_children = current_children - stored_children
        [k.cpu_percent(interval=None) for k in new_children]
        _CLUSTER = _CLUSTER[:1] + list(keep_children) + list(new_children)

    memory_info = [k.memory_info() for k in _CLUSTER]

    return (
        ("cpu_memory_used", psutil.virtual_memory().used / GB),
        ("cpu_rss", sum([k.rss for k in memory_info]) / GB),
        ("cpu_vms", sum([k.vms for k in memory_info]) / GB),
        ("cpu_percent", sum(k.cpu_percent(interval=None) for k in _CLUSTER)),
        ("cpu_processes", len(_CLUSTER)),
        ("cpu_open_files", sum(len(k.open_files()) for k in _CLUSTER)),
    )