Coverage for src/bob/io/base/__init__.py: 84%

114 statements  

« prev     ^ index     » next       coverage.py v7.0.5, created at 2023-06-16 13:56 +0200

1# import Libraries of other lib packages 

2import logging 

3 

4import h5py 

5import imageio 

6import numpy as np 

7 

8logger = logging.getLogger(__name__) 

9import os 

10 

11# Allowing the loading of truncated files in case PIL is used 

12# https://github.com/kirumang/Pix2Pose/issues/2 

13from PIL import ImageFile 

14 

15ImageFile.LOAD_TRUNCATED_IMAGES = True 

16 

17 

# File extensions recognized by open_file/write_file (all lower-case).
# NOTE: the original list repeated ".hdf5"/".h5"/".hdf" several times;
# duplicates are useless for membership tests and are removed here.
hdf5_extensions = [".hdf5", ".h5", ".hdf"]
image_extensions = [
    ".jpg",
    ".jpeg",
    ".png",
    ".bmp",
    ".gif",
    ".tif",
    ".tiff",
    ".pgm",
    ".pbm",
    ".pnm",
    ".ppm",
]

32 

33 

34def _is_string(s): 

35 """Returns ``True`` if the given object is a string or bytes.""" 

36 return isinstance(s, (bytes, str)) 

37 

38 

def create_directories_safe(directory, dryrun=False):
    """Creates a directory if it does not exist, with concurrent access
    support. This function will also create any parent directories that might
    be required. If the dryrun option is selected, it does not actually create
    the directory, but just writes the (Linux) command that would have been
    executed.

    .. deprecated::
        Use ``os.makedirs(directory, exist_ok=True)`` instead.

    **Parameters:**

    ``directory`` : str
      The directory that you want to create.

    ``dryrun`` : bool
      Only ``print`` the command to console, but do not execute it.
    """
    # ``numpy.deprecate`` was removed in NumPy 2.0; emit the same kind of
    # deprecation message through the standard library instead, so this
    # module keeps importing with modern NumPy versions.
    import warnings

    warnings.warn(
        "create_directories_safe is deprecated, "
        "use os.makedirs(directory, exist_ok=True) instead",
        DeprecationWarning,
        stacklevel=2,
    )
    if dryrun:
        print("[dry-run] mkdir -p '%s'" % directory)
    else:
        os.makedirs(directory, exist_ok=True)

59 

60 

def open_file(filename) -> np.ndarray:
    """Reads the content of a file (HDF5 or image) into a numpy array.

    Parameters
    ----------

    ``filename`` : str
        The name of the file to open. The (case-insensitive) extension
        selects the loader: HDF5 files go through h5py, images through
        imageio.

    Returns
    -------
    numpy.ndarray
        The loaded data. Gray-scale images are returned as 2D arrays;
        color images are converted with ``to_bob``.

    Raises
    ------
    RuntimeError
        If an HDF5 file with several datasets does not contain the
        conventional ``array`` key.
    ValueError
        If the file extension is not a known HDF5 or image extension.
    """

    def check_gray(img):
        # Collapse 3-channel images whose channels are all identical
        # into a single gray-scale plane.
        if (
            img.ndim > 2
            and np.array_equal(img[:, :, 0], img[:, :, 1])
            and np.array_equal(img[:, :, 0], img[:, :, 2])
        ):
            img = img[:, :, 0]
        return img

    # get the extension (lower-cased once, so comparisons below are
    # case-insensitive)
    extension = os.path.splitext(filename)[1].lower()

    if extension in hdf5_extensions:
        with h5py.File(filename, "r") as f:
            keys = list(f.keys())
            # a single dataset is loaded regardless of its name; several
            # datasets require the conventional "array" key (the one
            # write_file produces)
            if len(keys) == 1:
                key = keys[0]
            else:
                key = "array"
                if key not in keys:
                    # include the offending filename in the message (the
                    # previous message contained a "(unknown)" placeholder)
                    raise RuntimeError(
                        f"The file {filename} does not contain the key {key}"
                    )
            dataset = f[key]
            # if the data was saved as a string, load it back as string
            string_dtype = h5py.check_string_dtype(dataset.dtype)
            if string_dtype is not None:
                dataset = dataset.asstr()
            return dataset[()]

    elif extension in image_extensions:
        from ..image import to_bob

        img = imageio.imread(filename)

        # PNGs have a 4th channel, which we don't want
        # Alpha channels for instance have to be ignored
        if img.ndim > 2:
            # ``extension`` is already lower-cased above
            if extension == ".png":
                img = img[:, :, 0:3]

        img = check_gray(img)
        return img if img.ndim == 2 else to_bob(img)
    else:
        raise ValueError(f"Unknown file extension: {extension}")

117 

118 

def write_file(filename, data, format="pillow") -> None:
    """Writes the contents of a :py:class:`numpy.ndarray` to a file.

    Parameters
    ----------

    ``filename`` : str
        The name of the file to write to. The (case-insensitive) extension
        selects the container: HDF5 or image.

    ``data`` : :py:class:`numpy.ndarray`
        The data to write to the file.

    ``format`` : str
        The imageio plugin used to write image files. Defaults to
        ``"pillow"``, which has the best support for all image formats.

    Raises
    ------
    RuntimeError
        If the file extension is not a known HDF5 or image extension.
    """
    # get the extension, lower-cased so matching is case-insensitive,
    # consistent with ``open_file`` (previously "X.HDF5" could be read
    # but not written)
    extension = os.path.splitext(filename)[1].lower()

    if extension in hdf5_extensions:
        with h5py.File(filename, "w") as f:
            # use the conventional "array" key that ``open_file`` expects
            f["array"] = data
    elif extension in image_extensions:
        # Pillow is the format with the best support for all image formats
        from ..image import to_matplotlib

        imageio.imwrite(filename, to_matplotlib(data), format=format)
    else:
        raise RuntimeError(f"Unknown file extension: {extension}")

146 

147 

def load(inputs) -> np.ndarray:
    """Loads the content of a file.

    Will take a filename (or an iterable of filenames) and put the content into a
    :py:class:`numpy.ndarray`.

    **Parameters:**

    ``inputs`` : various types

      This might represent several different entities:

      1. The name of a file (full path) from where to load the data. In this
         case, this assumes that the file contains an array and returns a loaded
         numpy ndarray.
      2. An iterable of filenames to be loaded in memory. In this case, this
         would assume that each file contains a single 1D sample or a set of 1D
         samples, load them in memory and concatenate them into a single and
         returned 2D :py:class:`numpy.ndarray`.

    **Returns:**

    ``data`` : :py:class:`numpy.ndarray`
        The data loaded from the given ``inputs``.

    **Raises:**

    ``RuntimeError``
        If a given filename does not exist, or its contents cannot be loaded.

    ``TypeError``
        If ``inputs`` is neither a filename nor an iterable of filenames.
    """
    from collections.abc import Iterable

    if _is_string(inputs):
        if not os.path.exists(inputs):
            raise RuntimeError(f"`{inputs}' does not exist!")
        try:
            return open_file(inputs)
        except Exception as e:
            raise RuntimeError(f"Could not load `{inputs}'!") from e

    elif isinstance(inputs, Iterable):
        retval = []
        for obj in inputs:
            if _is_string(obj):
                retval.append(load(obj))
            else:
                raise TypeError(
                    "Iterable contains an object which is not a filename"
                )
        # stack all loaded samples into a single 2D array; uses the
        # module-level ``np`` (the previous local ``import numpy`` was
        # redundant)
        return np.vstack(retval)
    else:
        raise TypeError(
            "Unexpected input object. This function is expecting a filename, "
            "or an iterable of filenames."
        )

200 

201 

def save(array, filename, create_directories=False):
    """Saves the contents of an array-like object to file.

    Effectively, this is the same as opening a file with the mode flag set to ``'w'``
    (write with truncation) and calling ``file.write`` passing ``array`` as parameter.

    Parameters:

    ``array`` : array_like
        The array-like object to be saved on the file

    ``filename`` : str
        The name of the file where you need the contents saved to

    ``create_directories`` : bool
        Automatically generate the directories if required (defaults to ``False``
        because of compatibility reasons; might change in future to default to
        ``True``)
    """
    # create directory if not existent yet; guard against an empty dirname
    # (a bare filename has no directory component, and os.makedirs("")
    # would raise FileNotFoundError)
    if create_directories:
        directory = os.path.dirname(filename)
        if directory:
            os.makedirs(directory, exist_ok=True)

    # if array is a string, don't create a numpy array
    if not isinstance(array, str):
        # requires data is c-contiguous and aligned, will create a copy otherwise
        array = np.require(array, requirements=("C_CONTIGUOUS", "ALIGNED"))

    write_file(filename, array)

231 

232 

# Just to make it homogenous with the C++ API:
# ``write`` is an alias of :py:func:`save`, ``read`` of :py:func:`load`.
write = save
read = load

236 

237 

238# Keeps compatibility with the previously existing API 

239# open = File 

240 

241 

242def _generate_features(reader, paths, same_size=False): 

243 """Load and stack features in a memory efficient way. This function is 

244 meant to be used inside :py:func:`vstack_features`. 

245 

246 Parameters 

247 ---------- 

248 reader : ``collections.Callable`` 

249 See the documentation of :py:func:`vstack_features`. 

250 paths : ``collections.Iterable`` 

251 See the documentation of :py:func:`vstack_features`. 

252 same_size : :obj:`bool`, optional 

253 See the documentation of :py:func:`vstack_features`. 

254 

255 Yields 

256 ------ 

257 object 

258 The first object returned is a tuple of :py:class:`numpy.dtype` of 

259 features and the shape of the first feature. The rest of objects are 

260 the actual values in features. The features are returned in C order. 

261 """ 

262 shape_determined = False 

263 for i, path in enumerate(paths): 

264 

265 feature = np.atleast_2d(reader(path)) 

266 feature = np.ascontiguousarray(feature) 

267 if not shape_determined: 

268 shape_determined = True 

269 dtype = feature.dtype 

270 shape = list(feature.shape) 

271 yield (dtype, shape) 

272 else: 

273 # make sure all features have the same shape and dtype 

274 if same_size: 

275 assert shape == list( 

276 feature.shape 

277 ), f"Expected feature shape of {shape}, got {feature.shape}" 

278 else: 

279 assert shape[1:] == list( 

280 feature.shape[1:] 

281 ), f"Ignoring first dimension, expected feature shape of {shape}, got {feature.shape}" 

282 assert dtype == feature.dtype 

283 

284 if same_size: 

285 yield (feature.ravel(),) 

286 else: 

287 for feat in feature: 

288 yield (feat.ravel(),) 

289 

290 

def vstack_features(reader, paths, same_size=False, dtype=None):
    """Stacks all features in a memory efficient way.

    Parameters
    ----------
    reader : ``collections.Callable``
        The function to load the features. The function should only take one
        argument ``path`` and return loaded features. Use :any:`functools.partial`
        to accommodate your reader to this format.
        The features returned by ``reader`` are expected to have the same
        :py:class:`numpy.dtype` and the same shape except for their first
        dimension. First dimension should correspond to the number of samples.
    paths : ``collections.Iterable``
        An iterable of paths to iterate on. Whatever is inside path is given to
        ``reader`` so they do not need to be necessarily paths to actual files.
        If ``same_size`` is ``True``, ``len(paths)`` must be valid.
    same_size : :obj:`bool`, optional
        If ``True``, it assumes that arrays inside all the paths are the same
        shape. If you know the features are the same size in all paths, set this
        to ``True`` to improve the performance.
    dtype : :py:class:`numpy.dtype`, optional
        If provided, the data will be casted to this format.

    Returns
    -------
    numpy.ndarray
        The read features with the shape ``(n_samples, *features_shape[1:])``.

    Examples
    --------
    This function in a simple way is equivalent to calling
    ``numpy.vstack([reader(p) for p in paths])``.

    >>> import numpy
    >>> from bob.io.base import vstack_features
    >>> def reader(path):
    ...     # in each file, there are 5 samples and features are 2 dimensional.
    ...     return numpy.arange(10).reshape(5,2)
    >>> paths = ['path1', 'path2']
    >>> all_features = vstack_features(reader, paths)
    >>> numpy.allclose(all_features, numpy.array(
    ...     [[0, 1],
    ...      [2, 3],
    ...      [4, 5],
    ...      [6, 7],
    ...      [8, 9],
    ...      [0, 1],
    ...      [2, 3],
    ...      [4, 5],
    ...      [6, 7],
    ...      [8, 9]]))
    True
    >>> all_features_with_more_memory = numpy.vstack([reader(p) for p in paths])
    >>> numpy.allclose(all_features, all_features_with_more_memory)
    True

    You can allocate the array at once to improve the performance if you know
    that all features in paths have the same shape and you know the total number
    of the paths:

    >>> all_features = vstack_features(reader, paths, same_size=True)
    >>> numpy.allclose(all_features, numpy.array(
    ...     [[0, 1],
    ...      [2, 3],
    ...      [4, 5],
    ...      [6, 7],
    ...      [8, 9],
    ...      [0, 1],
    ...      [2, 3],
    ...      [4, 5],
    ...      [6, 7],
    ...      [8, 9]]))
    True
    """
    iterable = _generate_features(reader, paths, same_size)
    # the generator's first item is metadata: (dtype, shape) of the first
    # feature; every subsequent item is a 1-tuple with a flattened record
    data_dtype, shape = next(iterable)
    if dtype is None:
        dtype = data_dtype
    if same_size:
        # numpy black magic: https://stackoverflow.com/a/12473478/1286165
        # a structured dtype with one unnamed field per record lets
        # np.fromiter consume whole rows at once
        field_dtype = [("", (dtype, (np.prod(shape),)))]
        # one record per path, so the output can be pre-allocated
        total_size = len(paths)
        all_features = np.fromiter(iterable, field_dtype, total_size)
    else:
        # record length excludes the (variable) first dimension; the total
        # number of records is unknown, so np.fromiter grows dynamically
        field_dtype = [("", (dtype, (np.prod(shape[1:]),)))]
        all_features = np.fromiter(iterable, field_dtype)

    # go from a field array to a normal array
    all_features = all_features.view(dtype)
    # the shape is assumed to be (n_samples, ...) it can be (5, 2) or (5, 3, 4).
    shape = list(shape)
    # -1 lets numpy infer the sample count from the flattened data
    shape[0] = -1
    return np.reshape(all_features, shape, order="C")

384 

385 

# gets sphinx autodoc done right - don't remove it
# (exports every public name defined or imported above)
__all__ = [_ for _ in dir() if not _.startswith("_")]