Coverage for src/bob/bio/base/score/load.py: 78%

1#!/usr/bin/env python

2# vim: set fileencoding=utf-8 :

3# Mon 23 May 2011 16:23:05 CEST

5"""A set of utilities to load score files with different formats."""

7import csv

8import logging

9import os

10import tarfile

12from collections import defaultdict

13from pathlib import Path

15import dask.dataframe

16import numpy

18logger = logging.getLogger(__name__)

def iscsv(filename):
    """Tells whether ``filename`` designates a CSV score file.

    A file counts as CSV when ``.csv`` appears among its suffixes, so both
    ``scores.csv`` and ``scores.csv.gz`` qualify.

    Parameters
    ----------

    filename : str, path-like or ``file-like``
        A path, or an open file object. For a file object, its ``name``
        attribute is inspected when it holds a path.

    Returns
    -------

    bool
        ``True`` if one of the suffixes is ``.csv``; ``False`` otherwise,
        including for file objects that carry no usable name.
    """
    if not isinstance(filename, (str, os.PathLike)):
        # Open file objects cannot be fed to ``Path`` (it raises TypeError).
        # Fall back on their name, which may be absent or a file descriptor.
        filename = getattr(filename, "name", None)
        if not isinstance(filename, (str, os.PathLike)):
            return False
    return ".csv" in Path(filename).suffixes

def open_file(filename, mode="rt"):
    """Opens the given score file for reading.

    Score files might be raw text files, or a tar-file including a single score
    file inside.


    Parameters:

      filename (:py:class:`str`, ``file-like``): The name of the score file to
        open, or a file-like object open for reading. If a file name is given,
        the according file might be a raw text file or a (compressed) tar file
        containing a raw text file.

      mode (:py:class:`str`): The mode handed to :py:func:`open` for raw
        files. For tar archives, the first regular member is returned as a
        binary stream when ``mode`` contains ``"b"`` and as a text stream
        otherwise.


    Returns:


      ``file-like``: A read-only file-like object as it would be returned by
      :py:func:`open`. A file-like ``filename`` is returned unchanged.


    Raises:

      IOError: If ``filename`` does not exist, or if it is a tar archive that
        holds no regular file.

    """

    if not isinstance(filename, str) and hasattr(filename, "read"):
        # It seems that this is an open file
        return filename

    if not os.path.isfile(filename):
        raise IOError("Score file '%s' does not exist." % filename)
    if not tarfile.is_tarfile(filename):
        return open(filename, mode)

    # open the tar file for reading
    tar = tarfile.open(filename, "r")
    try:
        # get the first regular file in the tar file
        member = next((info for info in tar if info.isfile()), None)
        if member is None:
            raise IOError(
                "The given file is a .tar file, but it does not contain any file."
            )
        stream = tar.extractfile(member)
    except Exception:
        # Nothing is handed back to the caller, so release the archive here.
        tar.close()
        raise

    if "b" in mode:
        return stream

    # ``extractfile`` always yields bytes; decode them so that text mode
    # behaves like :py:func:`open` and the result can feed the csv readers.
    import io

    return io.TextIOWrapper(stream)

def four_column(filename):
    """Iterates lazily over the lines of a four column score file.

    The file is not loaded into memory at once; lines are produced on demand
    by ``_iterate_score_file``. Each line of the score file must contain:

    .. code-block:: text

       claimed_id real_id test_label score


    Parameters:

      filename (:py:class:`str`, ``file-like``): The score file, handed over
        to ``_iterate_score_file``.


    Yields:

      str: The claimed identity -- the client name of the model that was used
      in the comparison

      str: The real identity -- the client name of the probe that was used in
      the comparison

      str: A label of the probe -- usually the probe file name, or the probe id

      float: The result of the comparison of the model and the probe

    """
    line_iterator = _iterate_score_file(filename)
    return line_iterator

106

107

def split_four_column(filename):
    """Reads a four column score file and separates negatives from positives.

    The file has to follow the layout described in :py:func:`four_column`.
    Only the score values are kept; the identity and label strings of each
    line are dropped as soon as the line has been classified.


    Parameters:

      filename (:py:class:`str`, ``file-like``): The score file, read through
        :py:func:`four_column`.


    Returns:

      array: negatives, 1D float array with the scores of the lines whose
        ``claimed_id`` and ``real_id`` differ

      array: positives, 1D float array with the scores of the lines whose
        ``claimed_id`` and ``real_id`` are identical

    """
    # In the four column layout, ``real_id`` is the second field (index 1).
    return _split_scores(four_column(filename), 1)

139

140

def get_split_dataframe(filename):
    """Loads a score set that was written with :any:`bob.bio.base.pipelines.CSVScoreWriter`

    The CSV file is read with :py:func:`dask.dataframe.read_csv` and its rows
    are split into two dataframes: impostor comparisons and genuine ones.

    Parameters
    ----------

    filename :
        The CSV score file, passed as is to ``dask.dataframe.read_csv``.

    Returns
    -------

    dataframe: negatives, the rows (scores and metadata) whose
        ``bio_ref_subject_id`` and ``probe_subject_id`` fields are different.
        (see :ref:`bob.bio.base.pipeline_simple_advanced_features`)

    dataframe: positives, the rows (scores and metadata) whose
        ``bio_ref_subject_id`` and ``probe_subject_id`` fields are identical.
        (see :ref:`bob.bio.base.pipeline_simple_advanced_features`)

    """
    # Every column is read as text, except the score column.
    column_types = defaultdict(lambda: str, score=float)
    table = dask.dataframe.read_csv(filename, dtype=column_types)

    same_subject = table.probe_subject_id == table.bio_ref_subject_id
    other_subject = table.probe_subject_id != table.bio_ref_subject_id

    positives = table[same_subject]
    negatives = table[other_subject]
    return negatives, positives

174

175

def split_csv_scores(filename, score_column: str = "score"):
    """Loads a score set that was written with :any:`bob.bio.base.pipelines.CSVScoreWriter`

    The CSV file is read with :py:func:`dask.dataframe.read_csv`; only the
    values of ``score_column`` are returned, split between impostor and
    genuine comparisons.

    Parameters
    ----------

    filename :
        The CSV score file, passed as is to ``dask.dataframe.read_csv``.

    score_column : str
        The CSV column that contains the score values. This column is parsed
        as float; every other column is read as text.

    Returns
    -------

    array: negatives, 1D float array containing the list of scores, for which
        the fields of the ``bio_ref_subject_id`` and ``probe_subject_id``
        columns are different. (see
        :ref:`bob.bio.base.pipeline_simple_advanced_features`)

    array: positives, 1D float array containing the list of scores, for which
        the fields of the ``bio_ref_subject_id`` and ``probe_subject_id``
        columns are identical. (see
        :ref:`bob.bio.base.pipeline_simple_advanced_features`)

    """
    # The float dtype must follow the requested column: keying it on a fixed
    # "score" name would return any other score column as strings.
    df = dask.dataframe.read_csv(
        filename, dtype=defaultdict(lambda: str, {score_column: float})
    )

    genuines = df[df.probe_subject_id == df.bio_ref_subject_id]
    impostors = df[df.probe_subject_id != df.bio_ref_subject_id]

    return (
        impostors[score_column].to_dask_array().compute(),
        genuines[score_column].to_dask_array().compute(),
    )

212

213

def cmc_four_column(filename):
    """Loads scores to compute CMC curves from a file in four column format.

    The file must follow the layout described in :py:func:`four_column`, with
    the ``test_label`` (column 3) holding the test/probe file name or a probe
    id; scores are grouped by that label.

    The result is a list with one tuple per probe. Each tuple holds the
    negative scores and the positive scores of that probe. Usually a probe
    has a single positive score, but more are allowed. The result can be
    passed directly to, e.g., the :py:func:`bob.measure.cmc` function.


    Parameters:

      filename (:py:class:`str`, ``file-like``): The score file, read through
        :py:func:`four_column`.


    Returns:

      :any:`list`: A list of tuples, where each tuple contains the
      ``negative`` and ``positive`` scores for one probe of the database. Both
      ``negatives`` and ``positives`` can be either an 1D
      :py:class:`numpy.ndarray` of type ``float``, or ``None``.

    """
    # ``real_id`` sits at index 1; the probe label is then taken from the
    # following field by ``_split_cmc_scores``.
    return _split_cmc_scores(four_column(filename), 1)

245

246

def five_column(filename):
    """Iterates lazily over the lines of a five column score file.

    The file is not loaded into memory at once; lines are produced on demand
    by ``_iterate_score_file``. Each line of the score file must contain:

    .. code-block:: text

       claimed_id model_label real_id test_label score


    Parameters:

      filename (:py:class:`str`, ``file-like``): The score file, handed over
        to ``_iterate_score_file``.


    Yields:

      str: The claimed identity -- the client name of the model that was used
      in the comparison

      str: A label for the model -- usually the model file name, or the model
      id

      str: The real identity -- the client name of the probe that was used in
      the comparison

      str: A label of the probe -- usually the probe file name, or the probe id

      float: The result of the comparison of the model and the probe

    """
    line_iterator = _iterate_score_file(filename)
    return line_iterator

283

284

def split_five_column(filename):
    """Loads a score set from a single file and splits the scores

    Loads a score set from a single file in five column format and splits the
    scores between negatives and positives. The score file has to respect the 5
    column format as defined in the method :py:func:`five_column`.

    Only the scores are kept; the strings of each line are dropped once the
    line has been classified.


    Parameters:

      filename (:py:class:`str`, ``file-like``): The score file, read through
        :py:func:`five_column`.


    Returns:

      array: negatives, 1D float array containing the list of scores, for which
        the ``claimed_id`` and the ``real_id`` are different
        (see :py:func:`five_column`)

      array: positives, 1D float array containing the list of scores, for which
        the ``claimed_id`` and the ``real_id`` are identical
        (see :py:func:`five_column`)

    """

    # Read through the five column loader so the code matches the documented
    # format; ``real_id`` is the third field (index 2) in that layout.
    score_lines = five_column(filename)
    return _split_scores(score_lines, 2)

316

317

def cmc_five_column(filename):
    """Loads scores to compute CMC curves from a file in five column format.

    The five column file needs to be in the same format as described in
    :py:func:`five_column`, and the ``test_label`` (column 4) has to contain the
    test/probe file name or a probe id.

    This function returns a list of tuples. For each probe file, the tuple
    consists of a list of negative scores and a list of positive scores.
    Usually, the list of positive scores should contain only one element, but
    more are allowed. The result of this function can directly be passed to,
    e.g., the :py:func:`bob.measure.cmc` function.


    Parameters:

      filename (:py:class:`str`, ``file-like``): The score file, read through
        :py:func:`five_column`.


    Returns:

      :any:`list`: A list of tuples, where each tuple contains the
      ``negative`` and ``positive`` scores for one probe of the database. Both
      can be either an 1D :py:class:`numpy.ndarray` of type ``float``, or
      ``None`` when the probe has no score of that kind.

    """
    # Read through the five column loader so the code matches the documented
    # format; ``real_id`` is at index 2 and the probe label follows it.
    score_lines = five_column(filename)
    return _split_cmc_scores(score_lines, 2)

346

347

def scores(filename, ncolumns=None):
    """Iterates lazily over the lines of the given score file.

    Depending on the score file format, four or five elements are produced
    per line, see :py:func:`bob.bio.base.score.load.four_column` and
    :py:func:`bob.bio.base.score.load.five_column` for details.

    Parameters:

      filename: :py:class:`str`, ``file-like``:
        The score file, handed over to ``_iterate_score_file``.

      ncolumns: any
        ignored

    Yields:

      tuple:
        see :py:func:`bob.bio.base.score.load.four_column` or
        :py:func:`bob.bio.base.score.load.five_column`
    """
    # ``ncolumns`` is deliberately unused: lines are returned as they are.
    line_iterator = _iterate_score_file(filename)
    return line_iterator

370

371

def split(filename, ncolumns=None, sort=False, csv_score_column: str = "score"):
    """Loads the scores from the given score file and splits them into positives
    and negatives.

    CSV files are handled by :py:func:`split_csv_scores`. For other files, see
    :py:func:`bob.bio.base.score.load.split_four_column` and
    :py:func:`bob.bio.base.score.load.split_five_column` for details.

    Parameters
    ----------

    filename : str
        The path to the score file.
    ncolumns : int or ``None``
        If specified to be ``4`` or ``5``, the score file will be assumed to be
        in the given format. If not specified, the score file format will be
        estimated automatically. Ignored for CSV files.
    sort : :obj:`bool`, optional
        If ``True``, will return sorted negatives and positives
    csv_score_column :
        When loading a CSV file, specifies the column that holds scores.

    Returns
    -------

    negatives : 1D :py:class:`numpy.ndarray` of type float
        This array contains the list of scores, for which the ``claimed_id`` and
        the ``real_id`` are different (see :py:func:`four_column`)
    positives : 1D :py:class:`numpy.ndarray` of type float
        This array contains the list of scores, for which the ``claimed_id`` and
        the ``real_id`` are identical (see :py:func:`four_column`)

    Raises
    ------

    ValueError
        If the file is not a CSV file and its number of columns is neither 4
        nor 5.
    """
    if iscsv(filename):
        neg, pos = split_csv_scores(filename, score_column=csv_score_column)
    else:
        ncolumns = _estimate_score_file_format(filename, ncolumns)
        if ncolumns == 4:
            neg, pos = split_four_column(filename)
        elif ncolumns == 5:
            neg, pos = split_five_column(filename)
        else:
            # An explicit error instead of ``assert``: assertions vanish under
            # ``python -O`` and the file would be parsed as five column.
            raise ValueError(
                "Only score files with 4 or 5 columns are supported, "
                "got %r columns." % (ncolumns,)
            )

    if sort:
        # Both arrays are sorted in place, in ascending order.
        neg.sort()
        pos.sort()

    return neg, pos

418

419

def cmc(filename, ncolumns=None, csv_score_column: str = "score"):
    """cmc(filename, ncolumns=None, csv_score_column="score") -> list

    Loads scores to compute CMC curves.

    Depending on the score file format, it calls see
    :py:func:`bob.bio.base.score.load.cmc_four_column` and
    :py:func:`bob.bio.base.score.load.cmc_five_column` for details. CSV files
    are mapped onto the four column layout.

    Parameters:

      filename (:py:class:`str` or ``file-like``): The score file to load.

      ncolumns: (:py:class:`int`, Optional): If specified to be ``4`` or ``5``,
        the score file will be assumed to be in the given format. If not
        specified, the score file format will be estimated automatically.
        Ignored for CSV files.

      csv_score_column: When loading a CSV file, specifies the column that holds
        scores.

    Returns:

      :any:`list`: [(neg,pos)] A list of tuples, where each tuple contains the
      ``negative`` and ``positive`` scores for one probe of the database.

    Raises:

      ValueError: If the file is not a CSV file and its number of columns is
        neither 4 nor 5.

    """
    if iscsv(filename):
        # Going through ``cmc_four_column`` would drop ``csv_score_column``;
        # iterate the CSV rows directly so the requested column is honoured.
        # Rows come out in four column order, hence ``real_id`` at index 1.
        csv_lines = _iterate_score_file(
            filename, csv_score_column=csv_score_column
        )
        return _split_cmc_scores(csv_lines, 1)

    ncolumns = _estimate_score_file_format(filename, ncolumns)

    if ncolumns == 4:
        return cmc_four_column(filename)
    if ncolumns == 5:
        return cmc_five_column(filename)

    # An explicit error instead of ``assert``: assertions vanish under
    # ``python -O`` and the file would be parsed as five column.
    raise ValueError(
        "Only score files with 4 or 5 columns are supported, "
        "got %r columns." % (ncolumns,)
    )

459

460

def load_score(filename, ncolumns=None, minimal=False, **kwargs):
    """Load scores using numpy.genfromtxt and return the data as a numpy array.

    Parameters:

      filename (:py:class:`str`, ``file-like``): The file object that will be
        opened with :py:func:`open_file` containing the scores.

      ncolumns (:py:class:`int`, optional): 4, 5 or None (the default),
        specifying the number of columns in the score file. If None is provided,
        the number of columns will be guessed.

      minimal (:py:class:`bool`, optional): If True, only loads ``claimed_id``,
        ``real_id``, and ``scores``. This overrides any ``usecols`` given in
        ``kwargs``.

      **kwargs: Keyword arguments passed to :py:func:`numpy.genfromtxt`


    Returns:

      array: A structured array which contains not only the actual ``score``
        but also the ``claimed_id``, ``real_id``, ``test_label`` and
        ``['model_label']``

    Raises:

      ValueError: If the number of columns is neither 4 nor 5.

    """

    def convertfunc(x):
        # Identity converter: keeps the text columns exactly as read.
        return x

    ncolumns = _estimate_score_file_format(filename, ncolumns)

    usecols = kwargs.pop("usecols", None)
    if ncolumns == 4:
        names = ("claimed_id", "real_id", "test_label", "score")
        converters = {0: convertfunc, 1: convertfunc, 2: convertfunc, 3: float}
        if minimal:
            usecols = (0, 1, 3)

    elif ncolumns == 5:
        names = ("claimed_id", "model_label", "real_id", "test_label", "score")
        converters = {
            0: convertfunc,
            1: convertfunc,
            2: convertfunc,
            3: convertfunc,
            4: float,
        }
        if minimal:
            usecols = (0, 2, 4)
    else:
        raise ValueError("ncolumns of 4 and 5 are supported only.")

    source = open_file(filename, mode="rb")
    try:
        score_lines = numpy.genfromtxt(
            source,
            dtype=None,
            names=names,
            converters=converters,
            invalid_raise=True,
            usecols=usecols,
            **kwargs,
        )
    finally:
        # Release the handle when it was opened here; ``open_file`` returns a
        # file-like ``filename`` unchanged, and that one belongs to the caller.
        if source is not filename:
            source.close()

    # Re-type the text fields from byte strings ("S") to unicode ("U"); the
    # last field is the score and is stored as float.
    new_dtype = [
        (name, str(score_lines.dtype[name]).replace("S", "U"))
        for name in score_lines.dtype.names[:-1]
    ]
    new_dtype.append(("score", float))
    score_lines = numpy.array(score_lines, new_dtype)
    return score_lines

527

528

def load_files(filenames, func_load):
    """Applies a loading function to every file of a list.

    Parameters
    ----------

    filenames : :any:`list`
        list of file paths, or ``None``
    func_load :
        function that can read files in the list; it is called once per path

    Returns
    -------

    :any:`list`: The values returned by ``func_load``, in the order of
        ``filenames``; typically one ``(neg, pos)`` tuple of negative and
        positive scores per file. ``None`` is returned when ``filenames`` is
        ``None``.

    """
    if filenames is None:
        return None
    return [func_load(path) for path in filenames]

553

554

def get_negatives_positives(score_lines, score_column: str = "score"):
    """Splits the output of load_score into negatives and positives.

    A line is positive when its ``claimed_id`` field equals its ``real_id``
    field, negative otherwise. This function is an alternative to
    split_four_column and split_five_column that works on already loaded
    scores instead of a file.

    Returns the tuple ``(negatives, positives)`` holding the values of
    ``score_column`` for each group.
    """
    is_genuine = score_lines["claimed_id"] == score_lines["real_id"]
    is_impostor = numpy.logical_not(is_genuine)

    values = score_lines[score_column]
    return (values[is_impostor], values[is_genuine])

565

566

def get_negatives_positives_from_file(filename, **kwargs):
    """Loads a score file in minimal form and splits it with
    get_negatives_positives.

    ``kwargs`` are forwarded to load_score, which is called with
    ``minimal=True``. Returns the tuple ``(negatives, positives)``.
    """
    loaded = load_score(filename, minimal=True, **kwargs)
    return get_negatives_positives(loaded, score_column="score")

572

573

def get_negatives_positives_all(score_lines_list, score_column: str = "score"):
    """Splits several outputs of load_score and stacks the results.

    Each element of ``score_lines_list`` is split with
    get_negatives_positives. The negatives of all elements are stacked with
    ``numpy.vstack`` and transposed, so that each column comes from one
    element of the list; the same is done for the positives.

    Returns the tuple ``(negatives, positives)``.
    """
    split_pairs = [
        get_negatives_positives(score_lines, score_column=score_column)
        for score_lines in score_lines_list
    ]
    negatives = numpy.vstack([neg for neg, _ in split_pairs]).T
    positives = numpy.vstack([pos for _, pos in split_pairs]).T
    return (negatives, positives)

589

590

def get_all_scores(score_lines_list, score_column: str = "score"):
    """Stacks the score column of several outputs of load_score.

    The ``score_column`` of every element is stacked with ``numpy.vstack``
    and the result is transposed, so that each column of the returned array
    comes from one element of ``score_lines_list``.
    """
    columns = [loaded[score_column] for loaded in score_lines_list]
    stacked = numpy.vstack(columns)
    return stacked.T

597

598

def dump_score(filename, score_lines):
    """Writes scores that were loaded using :py:func:`load_score` to a file.

    The number of columns is detected from the number of fields of
    ``score_lines.dtype``. Lines are written with ``numpy.savetxt``: the
    leading fields as text and the last one as a float with 9 decimals.

    Raises ``ValueError`` if ``score_lines`` has neither 4 nor 5 fields.
    """
    line_formats = {
        5: "%s %s %s %s %.9f",
        4: "%s %s %s %.9f",
    }
    field_count = len(score_lines.dtype)
    if field_count not in line_formats:
        raise ValueError("Only scores with 4 and 5 columns are supported.")
    numpy.savetxt(filename, score_lines, fmt=line_formats[field_count])

611

612

def _estimate_score_file_format(filename, ncolumns=None):
    """Estimates the score file format from the given score file.

    If ``ncolumns`` is in ``(4,5)``, then ``ncolumns`` is returned instead.
    Otherwise the first line of the file is read and the number of its
    whitespace-separated fields is returned. If reading fails, a warning is
    logged and ``4`` is returned.

    A file-like ``filename`` is left open and, when it supports it, rewound
    to the position it had on entry, so that it can still be read afterwards.
    """
    if ncolumns in (4, 5):
        return ncolumns

    f = open_file(filename, "rb")
    # ``open_file`` hands a file-like ``filename`` back unchanged; such a
    # stream belongs to the caller and must neither be closed nor consumed.
    caller_owned = f is filename
    start = None
    if caller_owned:
        try:
            start = f.tell()
        except (AttributeError, OSError, ValueError):
            # Stream without position support: it cannot be rewound.
            start = None

    try:
        line = f.readline()
        ncolumns = len(line.split())
    except Exception:
        # ``Logger.warn`` is deprecated and absent from recent Python
        # versions; ``warning`` is the supported spelling.
        logger.warning(
            "Could not guess the number of columns in file: {}. "
            "Assuming 4 column format.".format(filename)
        )
        ncolumns = 4
    finally:
        if not caller_owned:
            f.close()
        elif start is not None:
            try:
                f.seek(start)
            except (AttributeError, OSError, ValueError):
                # Best effort only: rewinding is not essential to the result.
                pass
    return ncolumns

633

634

def _iterate_score_file(filename, csv_score_column: str = "score"):
    """Opens the score file and yields the score file lines in a tuple/list.

    The last element of the line (which is the score) will be transformed to
    float, the other elements will be str.

    For CSV files, each row is mapped onto the four column layout: the
    ``bio_ref_subject_id``, ``probe_subject_id`` and ``probe_template_id``
    fields followed by the value of ``csv_score_column``. Other files are
    split on single spaces; empty lines are skipped.

    A file opened by this function is closed once the iteration ends; a
    file-like ``filename`` is left open.
    """
    if iscsv(filename):
        for row in _iterate_csv_score_file(
            filename, score_column=csv_score_column
        ):
            yield [
                row["bio_ref_subject_id"],
                row["probe_subject_id"],
                row["probe_template_id"],
                row[csv_score_column],
            ]
        return

    import io

    raw = open_file(filename, "rb")
    # ``open_file`` hands a file-like ``filename`` back unchanged; such a
    # stream belongs to the caller and is not closed here.
    caller_owned = raw is filename

    # The csv reader needs text: wrap binary streams, keep text ones as is.
    if isinstance(raw, io.TextIOBase):
        text = raw
    else:
        text = io.TextIOWrapper(raw, newline="")

    try:
        for splits in csv.reader(text, delimiter=" "):
            if not splits:
                # Blank line: there is no score field to convert.
                continue
            splits[-1] = float(splits[-1])
            yield splits
    finally:
        if not caller_owned:
            text.close()
        elif text is not raw:
            # Drop the wrapper without closing the caller's binary stream.
            text.detach()

662

663

def _iterate_csv_score_file(filename, score_column: str = "score"):
    """Opens a CSV score file for reading and yields each line in a dict.

    The ``score_column`` field of the line will be cast to float, the other
    elements will be str.

    A file opened by this function is closed once the iteration ends; a
    file-like ``filename`` is left open.
    """
    opened = open_file(filename)
    try:
        for row in csv.DictReader(opened):
            row[score_column] = float(row[score_column])
            yield row
    finally:
        # ``open_file`` hands a file-like ``filename`` back unchanged; only
        # close the handle when it was created here.
        if opened is not filename:
            opened.close()

675

676

677def _split_scores(

678 score_lines, real_id_index, claimed_id_index=0, score_index=-1

679):

680 """Take the output of :py:func:`four_column` or :py:func:`five_column` and

681 return negatives and positives.

682 """

683 positives, negatives = [], []

684 for line in score_lines:

685 which = (

686 positives

687 if line[claimed_id_index] == line[real_id_index]

688 else negatives

689 )

690 which.append(line[score_index])

691

692 return (numpy.array(negatives), numpy.array(positives))

693

694

695def _split_cmc_scores(

696 score_lines,

697 real_id_index,

698 probe_name_index=None,

699 claimed_id_index=0,

700 score_index=-1,

701):

702 """Takes the output of :py:func:`four_column` or :py:func:`five_column` and

703 return cmc scores.

704 """

705 if probe_name_index is None:

706 probe_name_index = real_id_index + 1

707 # extract positives and negatives

708

709 pos_dict = {}

710 neg_dict = {}

711 # read four column list

712 for line in score_lines:

713 which = (

714 pos_dict

715 if line[claimed_id_index] == line[real_id_index]

716 else neg_dict

717 )

718 probe_name = line[probe_name_index]

719 # append score

720 if probe_name not in which:

721 which[probe_name] = []

722 which[probe_name].append(line[score_index])

723

724 # convert to lists of tuples of ndarrays (or None)

725 probe_names = sorted(set(neg_dict.keys()).union(set(pos_dict.keys())))

726 # get all scores in the desired format

727 return [

728 (

729 (

730 numpy.array(neg_dict[probe_name], numpy.float64)

731 if probe_name in neg_dict

732 else None

733 ),

734 (

735 numpy.array(pos_dict[probe_name], numpy.float64)

736 if probe_name in pos_dict

737 else None

738 ),

739 )

740 for probe_name in probe_names

741 ]

742

743

def split_csv_vuln(filename, score_column: str = "score"):
    """Loads vulnerability scores from a CSV score file.

    The scores are sorted into three groups: licit negatives, licit positives
    and presentation attacks (spoof).

    The CSV must contain a ``probe_attack_type`` column. A row with an empty
    field in that column is licit; it is positive when ``probe_subject_id``
    equals ``bio_ref_subject_id`` and negative otherwise. A row with a
    non-empty field is counted as spoof.

    Parameters
    ----------

    filename: str
        The path to a CSV file containing all the scores

    score_column: str
        The CSV column that contains the score values.

    Returns
    -------

    split_scores: dict of str: numpy.ndarray
        The float64 arrays of scores under the keys ``licit_neg``,
        ``licit_pos`` and ``spoof``.
    """
    logger.debug(f"Loading CSV score file: '{filename}'")

    licit_neg, licit_pos, spoof = [], [], []
    for row in _iterate_csv_score_file(filename, score_column=score_column):
        if row["probe_attack_type"]:
            # Any attack type marks the probe as a presentation attack.
            spoof.append(row[score_column])
        elif row["probe_subject_id"] == row["bio_ref_subject_id"]:
            licit_pos.append(row[score_column])
        else:
            licit_neg.append(row[score_column])

    logger.debug(
        f"Found {len(licit_neg)} negative (ZEI), "
        f"{len(licit_pos)} positive (licit), and "
        f"{len(spoof)} PA (spoof) scores."
    )

    # Cast to numpy float
    collected = {"licit_neg": licit_neg, "licit_pos": licit_pos, "spoof": spoof}
    return {
        key: numpy.array(values, dtype=numpy.float64)
        for key, values in collected.items()
    }

783 return split_scores