Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1#!/usr/bin/env python
2# coding=utf-8
4import csv
5import json
6import logging
7import os
8import pathlib
10logger = logging.getLogger(__name__)
class JSONDataset:
    """
    Generic multi-protocol/subset filelist dataset that yields samples

    To create a new dataset, you need to provide one or more JSON formatted
    filelists (one per protocol) with the following contents:

    .. code-block:: json

       {
           "subset1": [
               [
                   "value1",
                   "value2",
                   "value3"
               ],
               [
                   "value4",
                   "value5",
                   "value6"
               ]
           ],
           "subset2": [
           ]
       }

    Your dataset may contain any number of subsets, but all sample entries
    must contain the same number of fields.


    Parameters
    ----------

    protocols : list, dict
        Paths to one or more JSON formatted files containing the various
        protocols to be recognized by this dataset, or a dictionary, mapping
        protocol names to paths (or opened file objects) of JSON files.
        Internally, we save a dictionary where keys default to the basename of
        paths (list input).

    fieldnames : list, tuple
        An iterable over the field names (strings) to assign to each entry in
        the JSON file. It should have as many items as fields in each entry of
        the JSON file.

    loader : object
        A function that receives as input, a context dictionary (with at least
        a "protocol" and "subset" keys indicating which protocol and subset are
        being served), and a dictionary with ``{fieldname: value}`` entries,
        and returns an object with at least 2 attributes:

        * ``key``: which must be a unique string for every sample across
          subsets in a protocol, and
        * ``data``: which contains the data associated with this sample

    """

    def __init__(self, protocols, fieldnames, loader):

        if isinstance(protocols, dict):
            self._protocols = protocols
        else:
            # list input: key each path by its extension-less basename
            self._protocols = dict(
                (os.path.splitext(os.path.basename(k))[0], k) for k in protocols
            )
        self.fieldnames = fieldnames
        self._loader = loader

    def check(self, limit=0):
        """For each protocol, check if all data can be correctly accessed

        This function assumes each sample has a ``data`` and a ``key``
        attribute. The ``key`` attribute should be a string, or representable
        as such.


        Parameters
        ----------

        limit : int
            Maximum number of samples to check (in each protocol/subset
            combination) in this dataset. If set to zero, then check
            everything.


        Returns
        -------

        errors : int
            Number of errors found

        """

        logger.info("Checking dataset...")
        errors = 0
        for proto in self._protocols:
            logger.info(f"Checking protocol '{proto}'...")
            for name, samples in self.subsets(proto).items():
                logger.info(f"Checking subset '{name}'...")
                if limit:
                    logger.info(f"Checking at most first '{limit}' samples...")
                    samples = samples[:limit]
                for pos, sample in enumerate(samples):
                    try:
                        sample.data  # may trigger data loading
                        logger.info(f"{sample.key}: OK")
                    except Exception as e:
                        logger.error(
                            f"Found error loading entry {pos} in subset {name} "
                            f"of protocol {proto} from file "
                            f"'{self._protocols[proto]}': {e}"
                        )
                        errors += 1
        return errors

    def subsets(self, protocol):
        """Returns all subsets in a protocol

        This method will load JSON information for a given protocol and return
        all subsets of the given protocol after converting each entry through
        the loader function.

        Parameters
        ----------

        protocol : str
            Name of the protocol data to load


        Returns
        -------

        subsets : dict
            A dictionary mapping subset names to lists of objects (respecting
            the ``key``, ``data`` interface).

        """

        fileobj = self._protocols[protocol]
        if isinstance(fileobj, (str, bytes, pathlib.Path)):
            with open(self._protocols[protocol], "r") as f:
                data = json.load(f)
        else:
            # fix: was ``json.load(f)`` — ``f`` is unbound in this branch
            data = json.load(fileobj)
            fileobj.seek(0)  # rewind so the object can be re-read later

        retval = {}
        for subset, samples in data.items():
            retval[subset] = [
                self._loader(
                    dict(protocol=protocol, subset=subset, order=n),
                    dict(zip(self.fieldnames, k)),
                )
                for n, k in enumerate(samples)
            ]

        return retval
class CSVDataset:
    """
    Generic multi-subset filelist dataset that yields samples

    To create a new dataset, you only need to provide a CSV formatted filelist
    using any separator (e.g. comma, space, semi-colon) with the following
    information:

    .. code-block:: text

       value1,value2,value3
       value4,value5,value6
       ...

    Notice that all rows must have the same number of entries.

    Parameters
    ----------

    subsets : list, dict
        Paths to one or more CSV formatted files containing the various subsets
        to be recognized by this dataset, or a dictionary, mapping subset names
        to paths (or opened file objects) of CSV files. Internally, we save a
        dictionary where keys default to the basename of paths (list input).

    fieldnames : list, tuple
        An iterable over the field names (strings) to assign to each column in
        the CSV file. It should have as many items as fields in each row of
        the CSV file(s).

    loader : object
        A function that receives as input, a context dictionary (with, at
        least, a "subset" key indicating which subset is being served), and a
        dictionary with ``{key: path}`` entries, and returns a dictionary with
        the loaded data.

    """

    def __init__(self, subsets, fieldnames, loader):

        if isinstance(subsets, dict):
            self._subsets = subsets
        else:
            # list input: derive subset names from extension-less basenames
            self._subsets = {
                os.path.splitext(os.path.basename(path))[0]: path
                for path in subsets
            }
        self.fieldnames = fieldnames
        self._loader = loader

    def check(self, limit=0):
        """For each subset, check if all data can be correctly accessed

        This function assumes each sample has a ``data`` and a ``key``
        attribute. The ``key`` attribute should be a string, or representable
        as such.


        Parameters
        ----------

        limit : int
            Maximum number of samples to check (in each protocol/subset
            combination) in this dataset. If set to zero, then check
            everything.


        Returns
        -------

        errors : int
            Number of errors found

        """

        logger.info("Checking dataset...")
        return sum(self._check_subset(name, limit) for name in self._subsets)

    def _check_subset(self, name, limit):
        """Checks a single subset; returns the number of loading errors"""

        logger.info(f"Checking subset '{name}'...")
        samples = self.samples(name)
        if limit:
            logger.info(f"Checking at most first '{limit}' samples...")
            samples = samples[:limit]
        errors = 0
        for pos, sample in enumerate(samples):
            try:
                sample.data  # accessing this attribute may trigger loading
                logger.info(f"{sample.key}: OK")
            except Exception as e:
                logger.error(
                    f"Found error loading entry {pos} in subset {name} "
                    f"from file '{self._subsets[name]}': {e}"
                )
                errors += 1
        return errors

    def subsets(self):
        """Returns all available subsets at once

        Returns
        -------

        subsets : dict
            A dictionary mapping subset names to lists of objects (respecting
            the ``key``, ``data`` interface).

        """

        return {name: self.samples(name) for name in self._subsets}

    def samples(self, subset):
        """Returns all samples in a subset

        This method will load CSV information for a given subset and return
        all samples of the given subset after passing each entry through the
        loading function.

        Parameters
        ----------

        subset : str
            Name of the subset data to load


        Returns
        -------

        subset : list
            A lists of objects (respecting the ``key``, ``data`` interface).

        """

        source = self._subsets[subset]
        if isinstance(source, (str, bytes, pathlib.Path)):
            with open(self._subsets[subset], newline="") as f:
                rows = list(csv.reader(f))
        else:
            rows = list(csv.reader(source))
            source.seek(0)  # rewind so the object can be re-read later

        return [
            self._loader(
                dict(subset=subset, order=i), dict(zip(self.fieldnames, row))
            )
            for i, row in enumerate(rows)
        ]