Coverage for src/bob/bio/base/utils/io.py: 31%

75 statements  

« prev     ^ index     » next       coverage.py v7.6.0, created at 2024-07-12 22:34 +0200

1import logging 

2import os 

3import tarfile 

4import tempfile 

5 

6import h5py 

7import numpy 

8 

9logger = logging.getLogger("bob.bio.base") 

10 

11import bob.io.base 

12 

13 

def filter_missing_files(
    file_names, split_by_client=False, allow_missing_files=True
):
    """Drop the entries of ``file_names`` that do not exist on disk.

    Filtering only happens when ``allow_missing_files`` is ``True``; otherwise
    ``file_names`` is handed back untouched.

    Parameters
    ----------
    file_names : list
        A flat list of paths, or a list of per-client lists of paths when
        ``split_by_client`` is enabled.
    split_by_client : bool
        Treat ``file_names`` as a list of lists; clients that end up with no
        existing file are removed entirely.
    allow_missing_files : bool
        Enables the filtering.

    Returns
    -------
    list
        The existing files, in the same (flat or nested) layout as the input.
    """
    if not allow_missing_files:
        return file_names

    if not split_by_client:
        # flat list: keep only the paths that exist
        return [name for name in file_names if os.path.exists(name)]

    # nested list: filter each client and skip clients left without files
    kept = []
    for client_files in file_names:
        present = [name for name in client_files if os.path.exists(name)]
        if present:
            kept.append(present)
    return kept

35 

36 

def filter_none(data, split_by_client=False):
    """Remove ``None`` entries from ``data``.

    Parameters
    ----------
    data : list
        A flat list, or a list of per-client lists when ``split_by_client`` is
        enabled.
    split_by_client : bool
        Treat ``data`` as a list of lists; clients that contain nothing but
        ``None`` (or nothing at all) are removed entirely.

    Returns
    -------
    list
        The remaining elements, in the same (flat or nested) layout.
    """
    if not split_by_client:
        return [item for item in data if item is not None]

    # strip the None values per client, then drop clients that became empty
    per_client = (
        [item for item in client_data if item is not None]
        for client_data in data
    )
    return [items for items in per_client if items]

52 

53 

def check_file(filename, force, expected_file_size=1):
    """Tell whether ``filename`` exists and is at least ``expected_file_size`` bytes large.

    If the file is too small, **or** if ``force`` is ``True``, the file is removed.

    Returns ``True`` if the file exists (and has not been removed), otherwise ``False``.
    """
    if not os.path.exists(filename):
        return False

    # ``force`` short-circuits the size check, exactly like the removal rule above
    keep = not force and os.path.getsize(filename) >= expected_file_size
    if keep:
        return True

    logger.debug("  .. Removing old file '%s'.", filename)
    os.remove(filename)
    return False

67 

68 

def read_original_data(biofile, directory, extension):
    """Read the original data through the given ``biofile`` instance.

    This is a thin wrapper that forwards to ``biofile.load(directory, extension)``.

    Parameters
    ----------
    biofile : :py:class:`bob.bio.base.database.BioFile` or one of its derivatives
        The file object whose original data should be read.
    directory : str
        The base directory of the database.
    extension : str or ``None``
        The extension of the original data.
        Might be ``None`` if the ``biofile`` itself has the extension stored.

    Returns
    -------
    object
        Whatever ``biofile.load`` returns; usually a :py:class:`numpy.ndarray`
    """
    loader = biofile.load
    return loader(directory, extension)

89 

90 

def load(file):
    """Load data from ``file``.

    ``file`` is either an already opened :py:class:`h5py.File`, from which the
    ``"array"`` entry is read into a :py:class:`numpy.ndarray`, or anything
    else (typically a file name), which is handed to ``bob.io.base.load``.
    """
    if not isinstance(file, h5py.File):
        return bob.io.base.load(file)
    return numpy.array(file["array"])

97 

98 

def save(data, file, compression=0):
    """Save ``data`` to ``file`` using HDF5.

    Parameters
    ----------
    data : object
        If it has a ``save`` method, that method is called with the HDF5 file;
        otherwise the data is stored under the ``"array"`` key.
    file : :py:class:`h5py.File` or str
        An HDF5 file open for writing, or a file name. A file name is opened
        with mode ``"w"`` and closed again before this function returns; a
        handle passed in by the caller is left open.
    compression : int
        Accepted for backward compatibility; this function does not use it.
    """
    # Only a file opened here is ours to close; closing it also makes sure the
    # written data is flushed to disk instead of depending on garbage collection.
    opened_here = not isinstance(file, h5py.File)
    f = h5py.File(file, "w") if opened_here else file
    try:
        if hasattr(data, "save"):
            data.save(f)
        else:
            f["array"] = data
    finally:
        if opened_here:
            f.close()

109 

110 

def open_compressed(filename, open_flag="r", compression_type="bz2"):
    """Open a compressed HDF5 file with the given opening flag.

    For the ``'r'`` flag, the first member of the tar archive ``filename`` is
    extracted into a temporary HDF5 file, which is then opened.
    For any other flag (e.g. ``'w'``), the temporary HDF5 file is opened
    directly with that flag and ``filename`` is not touched.

    The returned :py:class:`h5py.File` needs to be closed using
    :py:func:`close_compressed`, which also removes the temporary file.

    Parameters
    ----------
    filename : str
        The compressed tar archive (only read when ``open_flag`` is ``'r'``).
    open_flag : str
        The mode handed to :py:class:`h5py.File`.
    compression_type : str
        The tar compression: ``'gz'``, ``'bz2'`` or ``''``.

    Raises
    ------
    IOError
        If the archive has no member, or its first member is not a regular file.
    """
    # mkstemp returns an *open* OS-level descriptor along with the path; only
    # the path is needed, so close the descriptor instead of leaking it.
    fd, hdf5_file_name = tempfile.mkstemp(".hdf5", "bob_")
    os.close(fd)

    try:
        if open_flag == "r":
            # extract the HDF5 file from the archive into the temporary file
            with tarfile.open(filename, mode="r:" + compression_type) as tar:
                member = tar.next()
                memory_file = (
                    tar.extractfile(member) if member is not None else None
                )
                if memory_file is None:
                    raise IOError(
                        "The compressed file '%s' does not contain a regular file"
                        % filename
                    )
                with memory_file, open(hdf5_file_name, "wb") as real_file:
                    real_file.write(memory_file.read())

        return h5py.File(hdf5_file_name, open_flag)
    except Exception:
        # nobody will receive the handle, so do not leave the temp file behind
        os.remove(hdf5_file_name)
        raise

131 

132 

def close_compressed(
    filename, hdf5_file, compression_type="bz2", create_link=False
):
    """Close an ``hdf5_file`` that was opened with :py:func:`open_compressed`.

    When the file was opened for writing, the temporary HDF5 file is packed
    into a tar archive called ``filename``, using ``compression_type``.
    In every case the temporary HDF5 file is removed afterwards.

    Parameters
    ----------
    filename : str
        The name of the compressed archive to create (when writable).
    hdf5_file : :py:class:`h5py.File`
        The handle returned by :py:func:`open_compressed`.
    compression_type : str
        The tar compression: ``'gz'``, ``'bz2'`` or ``''``.
    create_link : bool
        If ``True``, a symbolic link ``filename`` + ``.tar`` / ``.tar.bz2`` /
        ``.tar.gz`` pointing to the archive is created (unless it exists), so
        that regular tools recognize the file by its extension.
    """
    hdf5_file_name = hdf5_file.filename
    # Both values must be read before close(). Objects exposing ``writable``
    # keep working; h5py.File has no such attribute and reports its state via
    # ``mode`` ('r' for read-only) instead.
    is_writable = getattr(hdf5_file, "writable", None)
    if is_writable is None:
        is_writable = hdf5_file.mode != "r"
    hdf5_file.close()

    try:
        if is_writable:
            # create compressed tar file
            with tarfile.open(filename, mode="w:" + compression_type) as tar:
                tar.add(hdf5_file_name, os.path.basename(filename))

        if create_link:
            extension = {"": ".tar", "bz2": ".tar.bz2", "gz": ".tar.gz"}[
                compression_type
            ]
            link_file = filename + extension
            # lexists also detects a dangling link, which symlink() cannot overwrite
            if not os.path.lexists(link_file):
                os.symlink(os.path.basename(filename), link_file)
    finally:
        # clean up locally generated files, even if archiving failed
        os.remove(hdf5_file_name)

160 

161 

def load_compressed(filename, compression_type="bz2"):
    """Extract the compressed ``filename`` to a temporary HDF5 file and read its ``"array"`` entry.

    Note that, though the file name is .hdf5, it contains compressed data!
    Accepted compression types are 'gz', 'bz2', ''

    Parameters
    ----------
    filename : str
        The compressed archive to read.
    compression_type : str
        The tar compression that was used to write ``filename``.

    Returns
    -------
    numpy.ndarray
        The contents of the ``"array"`` entry.
    """
    # read from compressed HDF5; the requested compression type has to be
    # forwarded, otherwise anything but the default 'bz2' cannot be opened
    hdf5 = open_compressed(filename, "r", compression_type)
    try:
        data = numpy.array(hdf5["array"])
    finally:
        # always release the handle and remove the temporary file
        close_compressed(filename, hdf5, compression_type)

    return data

172 

173 

def save_compressed(data, filename, compression_type="bz2", create_link=False):
    """Write ``data`` to a temporary HDF5 file and pack it into the compressed ``filename``.

    The data is first saved to a temporary file using HDF5; that file is then
    compressed with the given compression method and stored as ``filename``.
    Note that, though the file name will be .hdf5, it will contain compressed data!
    Accepted compression types are 'gz', 'bz2', ''
    """
    # the temporary HDF5 file is created by open_compressed ...
    temp_hdf5 = open_compressed(filename, open_flag="w")
    save(data, temp_hdf5)
    # ... and compressed into ``filename`` (then removed) by close_compressed
    close_compressed(
        filename,
        temp_hdf5,
        compression_type=compression_type,
        create_link=create_link,
    )

182 close_compressed(filename, hdf5, compression_type, create_link)