Coverage for src/bob/bio/base/database/filelist/models.py: 91%

1#!/usr/bin/env python

2# vim: set fileencoding=utf-8 :

3# @author: Manuel Guenther <Manuel.Guenther@idiap.ch>

4# @date: Wed Oct 24 10:47:43 CEST 2012

8# This program is free software: you can redistribute it and/or modify

9# it under the terms of the GNU General Public License as published by

10# the Free Software Foundation, version 3 of the License.

11#

12# This program is distributed in the hope that it will be useful,

13# but WITHOUT ANY WARRANTY; without even the implied warranty of

14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

15# GNU General Public License for more details.

16#

17# You should have received a copy of the GNU General Public License

18# along with this program. If not, see <http://www.gnu.org/licenses/>.

20"""

21This file defines a simple interface that is comparable with other bob.db databases.

22"""

24import fileinput

25import os

26import re

class FileListFile(object):
    """
    A single file entry of a file list.

    If the ``model_id`` is not specified, ``model_id`` and ``client_id`` are identical.
    If the ``claimed_id`` is not specified, it is expected to be the ``client_id``.

    Parameters
    ----------

    file_name : str
        The path of this file, relative to the basic directory.
        Please do not specify any file extensions.
        The value is stored both as ``path`` and as ``id`` of this object.

    client_id : various type
        The id of the client, this file belongs to.
        The type of it is dependent on your implementation.

    model_id : various type, optional
        The id of the model associated with this file.
        When ``None``, the ``client_id`` is used instead.

    claimed_id : various type, optional
        The claimed client id for this file.
        When ``None``, the ``client_id`` is used instead.
    """

    def __init__(self, file_name, client_id, model_id=None, claimed_id=None):
        super(FileListFile, self).__init__()
        self.client_id = client_id
        # The file name serves both as the relative path and as the file id.
        self.path = file_name
        self.id = file_name

        # Note: in case of probe files, model ids are considered to be the ids
        # of the model for the given probe file. Hence, there might be several
        # probe files with the same file id, but different model ids.
        # Therefore, please DO NOT USE the model_id outside of this class (or
        # the according database queries).
        # Only ``None`` triggers the fallback, so falsy ids such as 0 or ""
        # are kept as given.
        self._model_id = client_id if model_id is None else model_id
        self.claimed_id = client_id if claimed_id is None else claimed_id

72#############################################################################

73# internal access functions for the file lists; do not export!

74#############################################################################

class ListReader(object):
    """
    Reads file lists from disk and, optionally, keeps the parsed results in memory.

    Parameters
    ----------

    store_lists : bool
        When true, lists and model dictionaries that have been read are
        remembered in ``m_read_lists`` and ``m_model_dicts`` and returned
        again on later requests instead of re-reading the list file.
    """

    # One column of a list line: word characters, '/', and the characters in
    # the range '(' to '.' (that is ``( ) * + , - .``). Compiled once instead
    # of once per parsed line.
    _COLUMN_PATTERN = re.compile(r"[\w/(-.)]+")

    # Groups whose lists are stored directly per group (no per-type level).
    _WORLD_GROUPS = ("world", "optional_world_1", "optional_world_2")

    def __init__(self, store_lists):
        self.m_read_lists = {}
        self.m_model_dicts = {}
        self.m_store_lists = store_lists

    def _read_multi_column_list(self, list_file):
        """
        Parses the given list file into a list of rows.

        Lines whose first non-blank character is ``#`` and lines without any
        column are skipped. Every remaining line must have 2, 3 or 4 columns,
        and all lines must have the same number of columns as the first one.

        Parameters
        ----------

        list_file : str
            The path of the list file to read.

        Returns
        -------

        rows : [[str]]
            One list of column strings per parsed line, in file order.

        Raises
        ------

        RuntimeError
            If the file does not exist, cannot be read, or contains a line
            that violates the column rules above.
        """
        if not os.path.isfile(list_file):
            raise RuntimeError("File %s does not exist." % (list_file,))

        rows = []
        try:
            # A context manager closes the file on every path. The module
            # global ``fileinput`` state used previously stayed open after a
            # parse error and made every later read fail.
            with open(list_file) as handle:
                for line in handle:
                    if line.strip().startswith("#"):
                        continue
                    parsed_line = self._COLUMN_PATTERN.findall(line)
                    if not parsed_line:
                        continue
                    # perform some sanity checks
                    if len(parsed_line) not in (2, 3, 4):
                        raise IOError(
                            "The read line '%s' from file '%s' could not be parsed successfully!"
                            % (line.rstrip(), list_file)
                        )
                    if rows and len(rows[0]) != len(parsed_line):
                        raise IOError(
                            "The parsed line '%s' from file '%s' has a different number of elements than the first parsed line '%s'!"
                            % (parsed_line, list_file, rows[0])
                        )
                    rows.append(parsed_line)
        except IOError as e:
            # Both real I/O failures and the sanity-check failures above are
            # reported to the caller as RuntimeError.
            raise RuntimeError(
                "Error reading the file '%s' : '%s'." % (list_file, e)
            )

        return rows

    def _read_column_list(self, list_file, column_count):
        """
        Reads the given list file and turns every row into a ``FileListFile``.

        Parameters
        ----------

        list_file : str
            The path of the list file to read.

        column_count : int
            The expected layout of the rows:

            * 2: ``filename client_id``
            * 3: ``filename model_id [client_id]``; without the third column
              the model id is also used as client id.
            * 4: ``filename model_id claimed_id [client_id]``; without the
              fourth column the model id is also used as client id.

        Returns
        -------

        file_list : [FileListFile]
            One object per row of the list file, in file order.

        Raises
        ------

        ValueError
            If ``column_count`` is not 2, 3 or 4.
        RuntimeError
            If the list file cannot be read or parsed.
        """
        rows = self._read_multi_column_list(list_file)

        # Checked once up front so that an unsupported count is reported even
        # when the list file contains no rows.
        if column_count not in (2, 3, 4):
            raise ValueError(
                "The given column count %d cannot be interpreted. This is a BUG, please report to the author."
                % column_count
            )

        file_list = []
        for row in rows:
            if column_count == 2:
                assert len(row) == 2
                # we expect: filename client_id
                entry = FileListFile(file_name=row[0], client_id=row[1])
            elif column_count == 3:
                assert len(row) in (2, 3)
                # we expect: filename, model_id, client_id
                entry = FileListFile(
                    file_name=row[0],
                    client_id=row[2] if len(row) > 2 else row[1],
                    model_id=row[1],
                )
            else:
                assert len(row) in (3, 4)
                # we expect: filename, model_id, claimed_id, client_id
                entry = FileListFile(
                    file_name=row[0],
                    client_id=row[3] if len(row) > 3 else row[1],
                    model_id=row[1],
                    claimed_id=row[2],
                )
            file_list.append(entry)

        return file_list

    def _create_model_dictionary(self, files):
        """
        Builds a dictionary from model ids to client ids for the given files.

        Parameters
        ----------

        files : iterable
            Objects providing the attributes ``_model_id`` and ``client_id``.

        Returns
        -------

        model_dict : dict
            Maps each model id to the client id of the files carrying it.

        Raises
        ------

        ValueError
            If one model id occurs with two different client ids.
        """
        model_dict = {}
        for file in files:
            if file._model_id not in model_dict:
                model_dict[file._model_id] = file.client_id
            elif model_dict[file._model_id] != file.client_id:
                raise ValueError(
                    "The read model id '%s' is associated to two different client ids '%s' and '%s'!"
                    % (
                        file._model_id,
                        file.client_id,
                        model_dict[file._model_id],
                    )
                )
        return model_dict

    def read_list(self, list_file, group, type=None):
        """
        Reads the list of Files from the given list file (if not done yet) and returns it.

        Parameters
        ----------

        list_file : str
            The path of the list file; only read when no stored list exists
            for the requested ``group`` (and ``type``).

        group : str
            For ``"world"``, ``"optional_world_1"`` and ``"optional_world_2"``
            a two-column list is read and ``type`` is ignored. Any other
            group additionally requires ``type``.

        type : str, optional
            One of ``"for_models"``, ``"for_tnorm"`` (three columns),
            ``"for_scores"`` (four columns), ``"for_probes"`` or
            ``"for_znorm"`` (two columns).

        Returns
        -------

        file_list : [FileListFile]
            The files of the requested list.

        Raises
        ------

        ValueError
            If ``type`` is not one of the supported values for a non-world group.
        RuntimeError
            If the list file cannot be read or parsed.
        """
        if group in self._WORLD_GROUPS:
            if group in self.m_read_lists:
                # just return the previously read list
                return self.m_read_lists[group]
            # read the world list
            file_list = self._read_column_list(list_file, 2)
            if self.m_store_lists:
                self.m_read_lists[group] = file_list
            return file_list

        if group not in self.m_read_lists:
            self.m_read_lists[group] = {}
        lists_of_group = self.m_read_lists[group]
        if type in lists_of_group:
            return lists_of_group[type]

        if type in ("for_models", "for_tnorm"):
            column_count = 3
        elif type == "for_scores":
            column_count = 4
        elif type in ("for_probes", "for_znorm"):
            column_count = 2
        else:
            raise ValueError(
                "The given type must be one of %s, but not '%s'"
                % (
                    (
                        "for_models",
                        "for_scores",
                        "for_probes",
                        "for_tnorm",
                        "for_znorm",
                    ),
                    type,
                )
            )

        file_list = self._read_column_list(list_file, column_count)
        if self.m_store_lists:
            lists_of_group[type] = file_list
        return file_list

    def read_models(self, list_file, group, type=None):
        """
        Generates a dictionary from model_ids to client_ids for the given list file, if not done yet, and returns it.

        Parameters
        ----------

        list_file : str
            The path of the list file; only read when no stored dictionary
            exists for the requested ``group`` and ``type``.

        group : str
            One of ``"dev"``, ``"eval"``, ``"world"``, ``"optional_world_1"``
            or ``"optional_world_2"``.

        type : str
            Either ``"for_models"`` or ``"for_tnorm"``.

        Returns
        -------

        model_dict : dict
            Maps model ids to client ids.

        Raises
        ------

        AssertionError
            If ``group`` or ``type`` is not one of the supported values.
        ValueError
            If one model id occurs with two different client ids.
        """
        assert group in (
            "dev",
            "eval",
            "world",
            "optional_world_1",
            "optional_world_2",
        )
        assert type in ("for_models", "for_tnorm")

        if group not in self.m_model_dicts:
            self.m_model_dicts[group] = {}
        dicts_of_group = self.m_model_dicts[group]
        if type in dicts_of_group:
            return dicts_of_group[type]

        model_dict = self._create_model_dictionary(
            self.read_list(list_file, group, type)
        )
        if self.m_store_lists:
            dicts_of_group[type] = model_dict
        return model_dict