Coverage for src/bob/bio/base/score/load.py: 78%
189 statements
« prev ^ index » next coverage.py v7.6.5, created at 2024-11-14 21:41 +0100
« prev ^ index » next coverage.py v7.6.5, created at 2024-11-14 21:41 +0100
1#!/usr/bin/env python
2# vim: set fileencoding=utf-8 :
3# Mon 23 May 2011 16:23:05 CEST
5"""A set of utilities to load score files with different formats."""
7import csv
8import logging
9import os
10import tarfile
12from collections import defaultdict
13from pathlib import Path
15import dask.dataframe
16import numpy
18logger = logging.getLogger(__name__)
def iscsv(filename):
    """Tell whether ``filename`` designates a CSV score file.

    A name counts as CSV when ``.csv`` appears anywhere among its suffixes, so
    compound extensions such as ``scores.csv.gz`` are recognised as well.

    Parameters:

      filename (:py:class:`str`, path-like, ``file-like``): The path to check,
        or an open file object. For a file object the decision is based on its
        ``name`` attribute when that attribute holds a path.

    Returns:

      bool: ``True`` if ``.csv`` is one of the suffixes of the name, ``False``
      otherwise, including for file objects that carry no usable name.
    """
    if not isinstance(filename, (str, os.PathLike)):
        # The loaders of this module also accept open file objects; handing
        # one of those to ``Path`` directly would raise a ``TypeError``.
        filename = getattr(filename, "name", None)
        if not isinstance(filename, (str, os.PathLike)):
            return False
    return ".csv" in Path(filename).suffixes
def open_file(filename, mode="rt"):
    """Opens the given score file for reading.

    Score files might be raw text files, or a tar-file including a single score
    file inside.

    Parameters:

      filename (:py:class:`str`, ``file-like``): The name of the score file to
        open, or a file-like object open for reading. If a file name is given,
        the according file might be a raw text file or a (compressed) tar file
        containing a raw text file.

      mode (:py:class:`str`): The mode handed to :py:func:`open` for raw
        files. It is not applied to file-like inputs nor to tar archives,
        whose first regular member is returned as produced by
        :py:meth:`tarfile.TarFile.extractfile`.

    Returns:

      ``file-like``: ``filename`` itself when it already is a file-like
      object, otherwise a readable file object for the score data.

    Raises:

      IOError: If ``filename`` does not exist, or if it is a tar archive that
        does not contain any regular file.
    """
    if not isinstance(filename, str) and hasattr(filename, "read"):
        # It seems that this is an open file
        return filename

    if not os.path.isfile(filename):
        raise IOError("Score file '%s' does not exist." % filename)
    if not tarfile.is_tarfile(filename):
        return open(filename, mode)

    # open the tar file for reading
    tar = tarfile.open(filename, "r")
    try:
        # get the first regular file in the tar file
        tar_info = next((info for info in tar if info.isfile()), None)
        if tar_info is None:
            raise IOError(
                "The given file is a .tar file, but it does not contain any file."
            )
        # The archive has to stay open here: the returned reader pulls its
        # data from it.
        return tar.extractfile(tar_info)
    except Exception:
        # Nothing is handed back to the caller, so release the archive handle
        # instead of leaking it.
        tar.close()
        raise
def four_column(filename):
    """Iterate over the lines of a four column score file.

    The work is delegated to ``_iterate_score_file``, which produces the lines
    one at a time instead of reading the whole file into memory. Each line of
    the file is expected to look like:

    .. code-block:: text

       claimed_id real_id test_label score

    Parameters:

      filename (:py:class:`str`, ``file-like``): The score file, given as a
        path or as a file object.

    Returns:

      An iterator over the parsed lines. For the layout above each item holds
      ``claimed_id`` (client name of the model), ``real_id`` (client name of
      the probe) and ``test_label`` (probe file name or id) as
      :py:class:`str`, followed by the ``score`` as :py:class:`float`.
    """
    parsed_lines = _iterate_score_file(filename)
    return parsed_lines
def split_four_column(filename):
    """Read a four column score file and separate impostor from genuine scores.

    The file has to follow the layout described in :py:func:`four_column`.
    Only the score values are kept; the identity strings of each line are used
    for the comparison and then dropped.

    Parameters:

      filename (:py:class:`str`, ``file-like``): The score file, given as a
        path or as a file object.

    Returns:

      array: negatives, the scores of all lines whose ``claimed_id`` and
      ``real_id`` differ (see :py:func:`four_column`)

      array: positives, the scores of all lines whose ``claimed_id`` and
      ``real_id`` are identical (see :py:func:`four_column`)
    """
    # ``real_id`` sits in the second column of the four column layout.
    return _split_scores(score_lines=four_column(filename), real_id_index=1)
def get_split_dataframe(filename):
    """Loads a score set that was written with :any:`bob.bio.base.pipelines.CSVScoreWriter`

    Returns two dataframes, split between positives and negatives.

    Parameters
    ----------

    filename
        The CSV score file; it is handed to ``dask.dataframe.read_csv``
        unchanged.

    Returns
    -------

    dataframe: negatives, the rows (scores and metadata) whose
        ``bio_ref_subject_id`` and ``probe_subject_id`` columns differ. (see
        :ref:`bob.bio.base.pipeline_simple_advanced_features`)

    dataframe: positives, the rows (scores and metadata) whose
        ``bio_ref_subject_id`` and ``probe_subject_id`` columns are identical.
        (see :ref:`bob.bio.base.pipeline_simple_advanced_features`)
    """
    # Every column is read as text except ``score``, which is read as float.
    column_types = defaultdict(lambda: str, {"score": float})
    table = dask.dataframe.read_csv(filename, dtype=column_types)

    same_subject = table.probe_subject_id == table.bio_ref_subject_id
    other_subject = table.probe_subject_id != table.bio_ref_subject_id

    return table[other_subject], table[same_subject]
def split_csv_scores(filename, score_column: str = "score"):
    """Loads a score set that was written with :any:`bob.bio.base.pipelines.CSVScoreWriter`

    Parameters
    ----------

    filename
        The CSV score file; it is handed to ``dask.dataframe.read_csv``
        unchanged.

    score_column: The CSV column that contains the score values. This column
        is read as float, every other column as str.

    Returns
    -------

    array: negatives, 1D float array containing the list of scores, for which
        the fields of the ``bio_ref_subject_id`` and ``probe_subject_id``
        columns are different. (see
        :ref:`bob.bio.base.pipeline_simple_advanced_features`)

    array: positives, 1D float array containing the list of scores, for which
        the fields of the ``bio_ref_subject_id`` and ``probe_subject_id``
        columns are identical. (see
        :ref:`bob.bio.base.pipeline_simple_advanced_features`)
    """
    # The float dtype must follow ``score_column``: with a fixed "score" key a
    # custom score column would fall back to the str default.
    df = dask.dataframe.read_csv(
        filename, dtype=defaultdict(lambda: str, {score_column: float})
    )

    genuines = df[df.probe_subject_id == df.bio_ref_subject_id]
    impostors = df[df.probe_subject_id != df.bio_ref_subject_id]

    return (
        impostors[score_column].to_dask_array().compute(),
        genuines[score_column].to_dask_array().compute(),
    )
def cmc_four_column(filename):
    """Read a four column score file and group its scores per probe for CMC.

    The file has to follow the layout described in :py:func:`four_column`,
    with the ``test_label`` (column 3) holding the probe file name or a probe
    id. Scores are grouped by that label.

    For every probe a tuple of negative and positive scores is produced.
    Usually there is a single positive score per probe, but more are allowed.
    The result can be passed directly to, e.g., the
    :py:func:`bob.measure.cmc` function.

    Parameters:

      filename (:py:class:`str`, ``file-like``): The score file, given as a
        path or as a file object.

    Returns:

      :any:`list`: One ``(negatives, positives)`` tuple per probe. Each of the
      two entries is either a 1D :py:class:`numpy.ndarray` of type ``float``
      or ``None`` when the probe has no score of that kind.
    """
    # ``real_id`` sits in the second column of the four column layout.
    return _split_cmc_scores(
        score_lines=four_column(filename), real_id_index=1
    )
def five_column(filename):
    """Iterate over the lines of a five column score file.

    The work is delegated to ``_iterate_score_file``, which produces the lines
    one at a time instead of reading the whole file into memory. Each line of
    the file is expected to look like:

    .. code-block:: text

       claimed_id model_label real_id test_label score

    Parameters:

      filename (:py:class:`str`, ``file-like``): The score file, given as a
        path or as a file object.

    Returns:

      An iterator over the parsed lines. For the layout above each item holds
      ``claimed_id`` (client name of the model), ``model_label`` (model file
      name or id), ``real_id`` (client name of the probe) and ``test_label``
      (probe file name or id) as :py:class:`str`, followed by the ``score`` as
      :py:class:`float`.
    """
    parsed_lines = _iterate_score_file(filename)
    return parsed_lines
def split_five_column(filename):
    """Loads a score set from a single file and splits the scores

    Loads a score set from a single file in five column format and splits the
    scores between negatives and positives. The score file has to respect the 5
    column format as defined in the method :py:func:`five_column`.

    Only the score values are kept; the identity strings of each line are used
    for the comparison and then dropped.

    Parameters:

      filename (:py:class:`str`, ``file-like``): The file object that will be
        opened with :py:func:`open_file` containing the scores.

    Returns:

      array: negatives, 1D float array containing the list of scores, for which
      the ``claimed_id`` and the ``real_id`` are different
      (see :py:func:`five_column`)

      array: positives, 1D float array containing the list of scores, for which
      the ``claimed_id`` and the ``real_id`` are identical
      (see :py:func:`five_column`)
    """
    # Read through the five column entry point so the code matches the format
    # it handles; ``real_id`` is the third column of that layout.
    score_lines = five_column(filename)
    return _split_scores(score_lines, 2)
def cmc_five_column(filename):
    """Loads scores to compute CMC curves from a file in five column format.

    The five column file needs to be in the same format as described in
    :py:func:`five_column`, and the ``test_label`` (column 4) has to contain the
    test/probe file name or a probe id.

    This function returns a list of tuples. For each probe file, the tuple
    consists of a list of negative scores and a list of positive scores.
    Usually, the list of positive scores should contain only one element, but
    more are allowed. The result of this function can directly be passed to,
    e.g., the :py:func:`bob.measure.cmc` function.

    Parameters:

      filename (:py:class:`str`, ``file-like``): The file object that will be
        opened with :py:func:`open_file` containing the scores.

    Returns:

      :any:`list`: A list of tuples, where each tuple contains the
      ``negative`` and ``positive`` scores for one probe of the database. Both
      ``negatives`` and ``positives`` can be either an 1D
      :py:class:`numpy.ndarray` of type ``float``, or ``None``.
    """
    # Read through the five column entry point so the code matches the format
    # it handles; ``real_id`` is the third column of that layout.
    score_lines = five_column(filename)
    return _split_cmc_scores(score_lines, 2)
def scores(filename, ncolumns=None):
    """Iterate over the lines of a score file, whatever its column count.

    Depending on the score file format, each item holds four or five elements,
    see :py:func:`bob.bio.base.score.load.four_column` and
    :py:func:`bob.bio.base.score.load.five_column` for details.

    Parameters:

      filename: :py:class:`str`, ``file-like``:
        The score file, given as a path or as a file object.

      ncolumns: any
        ignored

    Returns:

      An iterator over the parsed lines, see
      :py:func:`bob.bio.base.score.load.four_column` or
      :py:func:`bob.bio.base.score.load.five_column`
    """
    parsed_lines = _iterate_score_file(filename)
    return parsed_lines
def split(filename, ncolumns=None, sort=False, csv_score_column: str = "score"):
    """Load a score file and separate its scores into negatives and positives.

    CSV files are handled by
    :py:func:`bob.bio.base.score.load.split_csv_scores`; any other file goes
    through :py:func:`bob.bio.base.score.load.split_four_column` or
    :py:func:`bob.bio.base.score.load.split_five_column`, depending on its
    number of columns.

    Parameters
    ----------

    filename : str
        The path to the score file.
    ncolumns : int or ``None``
        If specified to be ``4`` or ``5``, the score file will be assumed to be
        in the given format. If not specified, the score file format will be
        estimated automatically. Not used for CSV files.
    sort : :obj:`bool`, optional
        If ``True``, both returned arrays are sorted in place before being
        returned.
    csv_score_column :
        When loading a CSV file, specifies the column that holds scores.

    Returns
    -------

    negatives : 1D :py:class:`numpy.ndarray` of type float
        The scores for which the ``claimed_id`` and the ``real_id`` are
        different (see :py:func:`four_column`)
    positives : 1D :py:class:`numpy.ndarray` of type float
        The scores for which the ``claimed_id`` and the ``real_id`` are
        identical (see :py:func:`four_column`)
    """
    if iscsv(filename):
        neg, pos = split_csv_scores(filename, score_column=csv_score_column)
    else:
        ncolumns = _estimate_score_file_format(filename, ncolumns)
        if ncolumns != 4:
            # Anything that is not four columns has to be five columns.
            assert ncolumns == 5
        loader = split_four_column if ncolumns == 4 else split_five_column
        neg, pos = loader(filename)

    if sort:
        for values in (neg, pos):
            values.sort()

    return neg, pos
def cmc(filename, ncolumns=None, csv_score_column: str = "score"):
    """cmc(filename, ncolumns=None) -> list

    Loads scores to compute CMC curves.

    Depending on the score file format, it calls see
    :py:func:`bob.bio.base.score.load.cmc_four_column` and
    :py:func:`bob.bio.base.score.load.cmc_five_column` for details. CSV files
    are read as four column data.

    Parameters:

      filename (:py:class:`str` or ``file-like``): The file object that will be
        opened with :py:func:`open_file` containing the scores.

      ncolumns: (:py:class:`int`, Optional): If specified to be ``4`` or ``5``,
        the score file will be assumed to be in the given format. If not
        specified, the score file format will be estimated automatically.
        Not used for CSV files.

      csv_score_column: When loading a CSV file, specifies the column that holds
        scores.

    Returns:

      :any:`list`: [(neg,pos)] A list of tuples, where each tuple contains the
      ``negative`` and ``positive`` scores for one probe of the database.

    """
    if iscsv(filename):
        # Read the CSV rows directly so that ``csv_score_column`` is honoured;
        # going through ``cmc_four_column`` would always read the "score"
        # column. The rows come back in four column layout, hence index 1.
        score_lines = _iterate_score_file(
            filename, csv_score_column=csv_score_column
        )
        return _split_cmc_scores(score_lines, 1)

    ncolumns = _estimate_score_file_format(filename, ncolumns)

    if ncolumns == 4:
        return cmc_four_column(filename)
    else:
        assert ncolumns == 5
        return cmc_five_column(filename)
def load_score(filename, ncolumns=None, minimal=False, **kwargs):
    """Load scores using numpy.genfromtxt and return the data as a numpy array.

    Parameters:

      filename (:py:class:`str`, ``file-like``): The file object that will be
        opened with :py:func:`open_file` containing the scores.

      ncolumns (:py:class:`int`, optional): 4, 5 or None (the default),
        specifying the number of columns in the score file. If None is provided,
        the number of columns will be guessed.

      minimal (:py:class:`bool`, optional): If True, only loads ``claimed_id``,
        ``real_id``, and ``scores``. This overrides any ``usecols`` keyword.

      **kwargs: Keyword arguments passed to :py:func:`numpy.genfromtxt`


    Returns:

      array: An array which contains not only the actual ``score`` but also the
      ``claimed_id``, ``real_id``, ``test_label`` and ``['model_label']``

    Raises:

      ValueError: If the number of columns is neither 4 nor 5.

    """

    def convertfunc(x):
        # Identity converter: keeps the text columns as they are read.
        return x

    ncolumns = _estimate_score_file_format(filename, ncolumns)

    usecols = kwargs.pop("usecols", None)
    if ncolumns == 4:
        names = ("claimed_id", "real_id", "test_label", "score")
        converters = {0: convertfunc, 1: convertfunc, 2: convertfunc, 3: float}
        if minimal:
            usecols = (0, 1, 3)

    elif ncolumns == 5:
        names = ("claimed_id", "model_label", "real_id", "test_label", "score")
        converters = {
            0: convertfunc,
            1: convertfunc,
            2: convertfunc,
            3: convertfunc,
            4: float,
        }
        if minimal:
            usecols = (0, 2, 4)
    else:
        raise ValueError("ncolumns of 4 and 5 are supported only.")

    opened = open_file(filename, mode="rb")
    try:
        score_lines = numpy.genfromtxt(
            opened,
            dtype=None,
            names=names,
            converters=converters,
            invalid_raise=True,
            usecols=usecols,
            **kwargs,
        )
    finally:
        # Close only what was opened here; a file object supplied by the
        # caller is returned unchanged by open_file and stays theirs.
        if opened is not filename:
            opened.close()

    # Re-declare the byte string ("S") fields as unicode ("U") fields; the
    # last field is always the score and is stored as float.
    new_dtype = [
        (name, str(score_lines.dtype[name]).replace("S", "U"))
        for name in score_lines.dtype.names[:-1]
    ]
    new_dtype.append(("score", float))
    score_lines = numpy.array(score_lines, new_dtype)
    return score_lines
def load_files(filenames, func_load):
    """Apply a loading function to every file of a list.

    Parameters
    ----------

    filenames : :any:`list`
        list of file paths, or ``None``
    func_load :
        function that can read files in the list; it is called once per path
        with that path as its only argument

    Returns
    -------

    :any:`list`: The results of ``func_load``, one per file and in the order
        of ``filenames``; typically ``(neg, pos)`` tuples. ``None`` is
        returned when ``filenames`` is ``None``.

    """
    if filenames is None:
        return None
    return [func_load(path) for path in filenames]
def get_negatives_positives(score_lines, score_column: str = "score"):
    """Split the output of load_score into negative and positive scores.

    An alternative to split_four_column and split_five_column that works on
    already loaded data instead of a file.

    A score is positive when the ``claimed_id`` and ``real_id`` fields of its
    entry are equal and negative otherwise. The values are taken from the
    ``score_column`` field. Returns the tuple ``(negatives, positives)``.
    """
    is_positive = score_lines["claimed_id"] == score_lines["real_id"]
    is_negative = numpy.logical_not(is_positive)
    positives = score_lines[score_column][is_positive]
    negatives = score_lines[score_column][is_negative]
    return (negatives, positives)
def get_negatives_positives_from_file(filename, **kwargs):
    """Load a score file in minimal form and split it into negatives and
    positives.

    The file is read by load_score with ``minimal=True`` (extra keyword
    arguments are forwarded to it) and the result is handed to
    get_negatives_positives, using the ``score`` field."""
    loaded = load_score(filename, minimal=True, **kwargs)
    return get_negatives_positives(loaded, score_column="score")
def get_negatives_positives_all(score_lines_list, score_column: str = "score"):
    """Split several outputs of load_score and stack the results.

    Each element of ``score_lines_list`` is split with
    get_negatives_positives. The negatives of all elements are stacked with
    ``numpy.vstack`` and transposed, and likewise the positives. Returns the
    tuple ``(negatives, positives)``.
    """
    split_pairs = [
        get_negatives_positives(score_lines, score_column=score_column)
        for score_lines in score_lines_list
    ]
    stacked_negatives = numpy.vstack([pair[0] for pair in split_pairs])
    stacked_positives = numpy.vstack([pair[1] for pair in split_pairs])
    return (stacked_negatives.T, stacked_positives.T)
def get_all_scores(score_lines_list, score_column: str = "score"):
    """Stack the ``score_column`` field of several outputs of load_score.

    The fields are stacked with ``numpy.vstack`` and the result is transposed
    before it is returned.
    """
    per_file_scores = [
        score_lines[score_column] for score_lines in score_lines_list
    ]
    stacked = numpy.vstack(per_file_scores)
    return stacked.T
def dump_score(filename, score_lines):
    """Write scores that were loaded using :py:func:`load_score` to a file.

    The output format is selected from the number of fields of
    ``score_lines.dtype``: all fields are written as text, separated by
    spaces, except the last one which is written with nine decimals. The
    writing itself is done by ``numpy.savetxt``.

    Raises a ``ValueError`` when the number of fields is neither 4 nor 5.
    """
    formats_by_width = {
        5: "%s %s %s %s %.9f",
        4: "%s %s %s %.9f",
    }
    width = len(score_lines.dtype)
    if width not in formats_by_width:
        raise ValueError("Only scores with 4 and 5 columns are supported.")
    numpy.savetxt(filename, score_lines, fmt=formats_by_width[width])
def _estimate_score_file_format(filename, ncolumns=None):
    """Estimates the score file format from the given score file.

    If ``ncolumns`` is in ``(4,5)``, then ``ncolumns`` is returned instead.
    Otherwise the first line of the file is read and the number of its
    whitespace separated fields is returned. If reading or splitting that
    line fails, a warning is logged and ``4`` is returned.

    A file object supplied by the caller is left open and, when it reports a
    position, is put back to where it was before the first line was read.
    """
    if ncolumns in (4, 5):
        return ncolumns

    f = open_file(filename, "rb")
    # open_file returns caller-supplied file objects unchanged. Those are
    # still needed by the caller to read the scores afterwards, so they must
    # be neither closed nor left advanced past their first line.
    caller_owned = f is filename
    start = None
    if caller_owned:
        try:
            start = f.tell()
        except (AttributeError, OSError, ValueError):
            # Streams without a usable position cannot be rewound.
            start = None
    try:
        line = f.readline()
        ncolumns = len(line.split())
    except Exception:
        logger.warning(
            "Could not guess the number of columns in file: %s. "
            "Assuming 4 column format.",
            filename,
        )
        ncolumns = 4
    finally:
        if not caller_owned:
            f.close()
        elif start is not None:
            f.seek(start)
    return ncolumns
def _iterate_score_file(filename, csv_score_column: str = "score"):
    """Opens the score file and yields the score file lines in a tuple/list.

    The last element of the line (which is the score) will be transformed to
    float, the other elements will be str.

    CSV files (see ``iscsv``) are read through ``_iterate_csv_score_file`` and
    reduced to the four column layout ``bio_ref_subject_id``,
    ``probe_subject_id``, ``probe_template_id`` and ``csv_score_column``. Any
    other file is parsed as space separated text; blank lines are skipped.

    A file opened by this function is closed once the iteration ends or the
    generator is closed. A file object supplied by the caller is left open.
    """
    if iscsv(filename):
        for row in _iterate_csv_score_file(
            filename, score_column=csv_score_column
        ):
            yield [
                row["bio_ref_subject_id"],
                row["probe_subject_id"],
                row["probe_template_id"],
                row[csv_score_column],
            ]
        return

    import io

    opened = open_file(filename, "rb")
    owns_file = opened is not filename
    wrapper = None
    if isinstance(opened, io.TextIOBase):
        # Already a text stream (e.g. a text-mode file or a StringIO handed in
        # by the caller); wrapping it again would break reading.
        text = opened
    else:
        text = wrapper = io.TextIOWrapper(opened, newline="")

    try:
        for splits in csv.reader(text, delimiter=" "):
            if not splits:
                # csv.reader reports a blank line as an empty list, which has
                # no score to convert.
                continue
            splits[-1] = float(splits[-1])
            yield splits
    finally:
        if owns_file:
            text.close()
        elif wrapper is not None:
            # Release the caller's stream so that discarding the wrapper does
            # not close it.
            wrapper.detach()
def _iterate_csv_score_file(filename, score_column: str = "score"):
    """Opens a CSV score file for reading and yields each line in a dict.

    The ``score_column`` field of the line will be cast to float, the other
    elements will be str.

    A file opened by this function is closed once the iteration ends or the
    generator is closed. A file object supplied by the caller is left open.
    """
    import io

    opened = open_file(filename)
    owns_file = opened is not filename
    wrapper = None
    if isinstance(opened, (io.RawIOBase, io.BufferedIOBase)):
        # Binary streams (a tar archive member, or a file opened in binary
        # mode by the caller) yield bytes, which the csv module rejects.
        wrapper = opened = io.TextIOWrapper(opened, newline="")

    try:
        for row in csv.DictReader(opened):
            row[score_column] = float(row[score_column])
            yield row
    finally:
        if owns_file:
            opened.close()
        elif wrapper is not None:
            # Release the caller's stream so that discarding the wrapper does
            # not close it.
            wrapper.detach()
def _split_scores(
    score_lines, real_id_index, claimed_id_index=0, score_index=-1
):
    """Separate the lines produced by :py:func:`four_column` or
    :py:func:`five_column` into negative and positive scores.

    A line is positive when its entries at ``claimed_id_index`` and
    ``real_id_index`` are equal. Only the entry at ``score_index`` is kept.
    Returns the tuple ``(negatives, positives)`` of numpy arrays.
    """
    genuine_scores = []
    impostor_scores = []
    for entry in score_lines:
        if entry[claimed_id_index] == entry[real_id_index]:
            genuine_scores.append(entry[score_index])
        else:
            impostor_scores.append(entry[score_index])

    negatives = numpy.array(impostor_scores)
    positives = numpy.array(genuine_scores)
    return (negatives, positives)
def _split_cmc_scores(
    score_lines,
    real_id_index,
    probe_name_index=None,
    claimed_id_index=0,
    score_index=-1,
):
    """Group the lines produced by :py:func:`four_column` or
    :py:func:`five_column` into per-probe CMC scores.

    A line is positive when its entries at ``claimed_id_index`` and
    ``real_id_index`` are equal. Scores are grouped by the entry at
    ``probe_name_index``, which defaults to the column right after
    ``real_id_index``.

    Returns a list with one ``(negatives, positives)`` tuple per probe, in
    sorted order of the probe names. Each entry is a float64 numpy array, or
    ``None`` when the probe has no score of that kind.
    """
    if probe_name_index is None:
        probe_name_index = real_id_index + 1

    positives_by_probe = {}
    negatives_by_probe = {}
    for entry in score_lines:
        is_genuine = entry[claimed_id_index] == entry[real_id_index]
        bucket = positives_by_probe if is_genuine else negatives_by_probe
        bucket.setdefault(entry[probe_name_index], []).append(
            entry[score_index]
        )

    def as_array(bucket, probe):
        # Probes without any score of this kind are reported as None.
        values = bucket.get(probe)
        if values is None:
            return None
        return numpy.array(values, numpy.float64)

    ordered_probes = sorted(set(negatives_by_probe) | set(positives_by_probe))
    return [
        (
            as_array(negatives_by_probe, probe),
            as_array(positives_by_probe, probe),
        )
        for probe in ordered_probes
    ]
def split_csv_vuln(filename, score_column: str = "score"):
    """Loads vulnerability scores from a CSV score file.

    The scores are sorted into three groups: licit negatives, licit positives
    and presentation attacks (spoof).

    The CSV must contain a ``probe_attack_type`` column. A row with a
    non-empty value in that column is a spoof row; a row with an empty value
    is licit, and is positive when its ``probe_subject_id`` equals its
    ``bio_ref_subject_id`` and negative otherwise.

    Parameters
    ----------

    filename: str
        The path to a CSV file containing all the scores

    score_column: str
        The CSV column that contains the score values.

    Returns
    -------

    split_scores: dict of str: numpy.ndarray
        The float64 scores under the keys ``licit_neg``, ``licit_pos`` and
        ``spoof``.
    """
    logger.debug(f"Loading CSV score file: '{filename}'")
    split_scores = {"licit_neg": [], "licit_pos": [], "spoof": []}
    for row in _iterate_csv_score_file(filename, score_column=score_column):
        if row["probe_attack_type"]:
            group = "spoof"
        elif row["probe_subject_id"] == row["bio_ref_subject_id"]:
            group = "licit_pos"
        else:
            group = "licit_neg"
        split_scores[group].append(row[score_column])
    logger.debug(
        f"Found {len(split_scores['licit_neg'])} negative (ZEI), "
        f"{len(split_scores['licit_pos'])} positive (licit), and "
        f"{len(split_scores['spoof'])} PA (spoof) scores."
    )
    # Cast to numpy float
    return {
        key: numpy.array(val, dtype=numpy.float64)
        for key, val in split_scores.items()
    }