Coverage for src/bob/measure/_library.py: 28%

366 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-12-06 21:23 +0100

1#!/usr/bin/env python 

2# coding=utf-8 

3 

4"""Various functions for performance assessment 

5 

6Most of these were imported from older C++ implementations. 

7""" 

8 

9import logging 

10 

11from functools import wraps 

12 

13import numpy 

14import numpy.linalg 

15 

16from numba import jit, objmode 

17 

18logger = logging.getLogger(__name__) 

19 

20 

def _lists_to_arrays(*args, **kwargs):
    """Converts every ``list`` argument into a :py:class:`numpy.ndarray`

    Only values that are instances of :py:class:`list` are converted (through
    :py:func:`numpy.asarray`); every other value, tuples included, is passed
    through untouched.


    Parameters
    ==========

    *args
        Positional values to inspect

    **kwargs
        Keyword values to inspect


    Returns
    =======

    args : list
        The positional values, in their original order

    kwargs : dict
        The keyword values, under their original names

    """

    def _convert(value):
        # only genuine lists are promoted, exactly like an isinstance check
        if isinstance(value, list):
            return numpy.asarray(value)
        return value

    positional = [_convert(item) for item in args]
    named = {key: _convert(item) for key, item in kwargs.items()}
    return positional, named

28 

29 

def array_jit(func):
    """Decorator: compiles ``func`` with numba and lets it accept lists

    ``func`` is compiled with ``jit(func, nopython=True)``. The returned
    wrapper converts every ``list`` argument to a :py:class:`numpy.ndarray`
    (see :py:func:`_lists_to_arrays`) before forwarding the call to the
    compiled function.


    Parameters
    ==========

    func : callable
        The pure-python function to compile


    Returns
    =======

    new_func : callable
        The list-tolerant wrapper. The compiled function itself is reachable
        through its ``jit_func`` attribute.

    """
    compiled = jit(func, nopython=True)

    @wraps(compiled)
    def new_func(*args, **kwargs):
        converted_args, converted_kwargs = _lists_to_arrays(*args, **kwargs)
        return compiled(*converted_args, **converted_kwargs)

    # keeps the compiled function reachable, so callers can bypass the
    # list-conversion wrapper
    new_func.jit_func = compiled
    return new_func

40 

41 

@jit(nopython=True)
def _log_values(points, min_power):
    """Computes a logarithmic scale of ``points`` values ending at 1

    The values are ``10 ** (k / step)`` for every integer ``k`` going from
    ``1 - points`` up to ``0``, with ``step = int(points / (-min_power))``.


    Parameters
    ==========

    points : int
        Number of points to consider

    min_power : int
        Negative integer with the minimum power


    Returns
    =======

    logscale : numpy.ndarray (float, 1D)
        The log-scaled values, in ascending order; the last one is 1

    """
    steps_per_decade = int(points / (-min_power))
    exponents = numpy.arange(1 - points, 1) / steps_per_decade
    return 10**exponents

66 

67 

@jit(nopython=True)
def _meaningful_thresholds(negatives, positives, points, min_far, is_sorted):
    """Returns the sorted thresholds at which a ROC curve is worth sampling

    Instead of spreading thresholds evenly between the smallest and the
    largest score, this function builds two log-scaled lists of target error
    rates (see :py:func:`_log_values`) and asks ``_jit_frr_threshold`` and
    ``_jit_far_threshold`` for the threshold matching each target. The first
    ``points // 2`` thresholds come from the FRR targets, the remaining ones
    from the FAR targets. The result is sorted in place before being
    returned; duplicates are not removed.


    Parameters
    ==========

    negatives, positives : numpy.ndarray (1D, float)
        The negative and positive scores for which the thresholds are
        calculated

    points : int
        The total number of thresholds to produce

    min_far : int
        Minimum FAR, as a (negative) power of 10. The same value is used for
        the FRR targets.

    is_sorted : bool
        Set this to ``True`` if both sets of scores are already sorted in
        ascending order. If ``False``, sorted copies are created internally.


    Returns
    =======

    thresholds : numpy.ndarray (1D, float)
        The ``points`` thresholds, sorted in ascending order

    """
    n_frr = points // 2
    n_far = points - n_frr

    # sorted copies are only made when the caller did not pre-sort
    sorted_neg = negatives if is_sorted else numpy.sort(negatives)
    sorted_pos = positives if is_sorted else numpy.sort(positives)

    frr_targets = _log_values(n_frr, min_far)
    far_targets = _log_values(n_far, min_far)

    thresholds = numpy.zeros((points,))
    thresholds[:n_frr] = [
        _jit_frr_threshold(sorted_neg, sorted_pos, target, True)
        for target in frr_targets
    ]
    thresholds[n_frr:] = [
        _jit_far_threshold(sorted_neg, sorted_pos, target, True)
        for target in far_targets
    ]

    thresholds.sort()
    return thresholds

132 

133 

def correctly_classified_negatives(negatives, threshold):
    """Evaluates correctly classified negatives in a set, based on a threshold

    This method returns an array composed of booleans that pin-point which
    negatives were correctly classified for the given threshold.

    The pseudo-code for this function is:

    .. code-block:: python

       classified = []
       for k in negatives:
           if k < threshold:
               classified.append(True)
           else:
               classified.append(False)


    Parameters
    ==========

    negatives : numpy.ndarray (1D, float)

        The scores generated by comparing objects of different classes. A
        plain ``list`` or ``tuple`` is also accepted and converted to an
        array first.

    threshold : float

        The threshold, for which scores should be considered to be
        correctly classified


    Returns
    =======

    classified : numpy.ndarray (1D, bool)

        The decision for each of the ``negatives``

    """
    # a python sequence cannot be compared element-wise against a float
    # (``[0.1] < 0.5`` raises TypeError), so it is promoted to an array
    if isinstance(negatives, (list, tuple)):
        negatives = numpy.asarray(negatives)
    return negatives < threshold

174 

175 

def correctly_classified_positives(positives, threshold):
    """Evaluates correctly classified positives in a set, based on a threshold

    This method returns an array composed of booleans that pin-point which
    positives were correctly classified for the given threshold.

    The pseudo-code for this function is:

    .. code-block:: python

       classified = []
       for k in positives:
           if k >= threshold:
               classified.append(True)
           else:
               classified.append(False)


    Parameters
    ==========

    positives : numpy.ndarray (1D, float)

        The scores generated by comparing objects of the same class. A plain
        ``list`` or ``tuple`` is also accepted and converted to an array
        first.

    threshold : float

        The threshold, for which scores should be considered to be
        correctly classified


    Returns
    =======

    classified : numpy.ndarray (1D, bool)

        The decision for each of the ``positives``

    """
    # a python sequence cannot be compared element-wise against a float
    # (``[0.1] >= 0.5`` raises TypeError), so it is promoted to an array
    if isinstance(positives, (list, tuple)):
        positives = numpy.asarray(positives)
    return positives >= threshold

216 

217 

def det(negatives, positives, n_points, min_far=-8):
    """Calculates points of a Detection Error-Tradeoff (DET) curve

    The DET curve is obtained by computing the ROC curve for the given scores
    (see :py:func:`roc`) and passing the result through :py:func:`ppndf`.

    To plot it, draw row 0 against row 1 of the returned array and replace
    the X/Y axis annotation with a pre-determined set of tickmarks, as
    recommended by NIST.


    Parameters
    ==========

    negatives, positives : numpy.ndarray (1D, float)

        The list of negative and positive scores to compute the DET for

    n_points : int

        The number of points on the DET curve, for which the DET should be
        evaluated

    min_far : :class:`int`, Optional

        Minimum FAR, given as a power of 10 (``10 ** min_far``). This value
        is also used for ``min_frr``. Values should be negative.


    Returns
    =======

    curve : numpy.ndarray (2D, float)

        The DET curve, with the FPR in the first and the FNR in the second
        row:

        0. X axis values in the normal deviate scale for the false-accepts
        1. Y axis values in the normal deviate scale for the false-rejections

    """
    roc_points = roc(negatives, positives, n_points, min_far)
    return ppndf(roc_points)

262 

263 

def pavx_1(y, ghat, index, len):
    """First pass of the Pool-Adjacent-Violators (PAV) algorithm

    Walks over ``y`` once and groups its entries into consecutive intervals
    whose means are strictly increasing. Whenever the mean of a newly opened
    interval is not larger than the mean of the previous one, both intervals
    are pooled and their weighted mean is kept.

    All three working arrays are modified **in place**. After the call, only
    their first ``ci + 1`` entries are meaningful: entry ``k`` describes the
    ``k``-th interval.


    Parameters
    ==========

    y : numpy.ndarray (1D, float)

        The input values; must contain at least one element

    ghat : numpy.ndarray (1D, float)

        Output: mean of the ``y`` values of each interval. Must be at least
        as long as ``y``.

    index : numpy.ndarray (1D, size_t)

        Output: left end-point (position in ``y``) of each interval. Must be
        at least as long as ``y``.

    len : numpy.ndarray (1D, size_t)

        Output: number of ``y`` values in each interval. Must be at least as
        long as ``y``. (The name shadows the builtin; it is kept for
        interface compatibility.)


    Returns
    =======

    ci : int

        The index of the last interval, i.e., the number of intervals minus
        one

    """
    # Resets the output and working arrays. Slice assignment is required
    # here: a plain ``index = 0`` would only rebind the local name to a
    # scalar and make the indexing below fail.
    index[:] = 0
    len[:] = 0
    ghat[:] = 0.0

    # ci is the index of the interval currently considered
    # ghat[ci] is the mean of the y-values within this interval
    ci = 0
    index[0] = 0
    len[0] = 1
    ghat[0] = y[0]
    for j in range(1, y.shape[0]):
        # a new interval holding only y[j] is created
        ci += 1
        index[ci] = j
        len[ci] = 1
        ghat[ci] = y[j]
        # ``ci >= 1`` short-circuits, so ``ci - 1`` is never negative here
        while ci >= 1 and ghat[ci - 1] >= ghat[ci]:
            # "pool adjacent violators": merges interval ci into ci - 1
            merged = len[ci - 1] + len[ci]
            ghat[ci - 1] += (len[ci] / merged) * (ghat[ci] - ghat[ci - 1])
            len[ci - 1] = merged
            ci -= 1
    return ci

328 

329 

def pavx_2(ghat, index, ci):
    """Second pass of the Pool-Adjacent-Violators (PAV) algorithm

    Expands the per-interval means computed by :py:func:`pavx_1` into a
    per-sample vector: every position covered by interval ``k`` receives the
    mean ``ghat[k]``. Interval ``k`` starts at ``index[k]`` and ends right
    before the start of interval ``k + 1`` (or at the end of ``ghat`` for the
    last interval). ``ghat`` is modified in place.


    Parameters
    ==========

    ghat : numpy.ndarray (1D, float)

        On input, the interval means in its first ``ci + 1`` entries; on
        output, the mean associated to every position

    index : numpy.ndarray (1D, size_t)

        The left end-point of each interval, as set by :py:func:`pavx_1`

    ci : int

        The index of the last interval


    Returns
    =======

    ghat : numpy.ndarray (1D, float)

        The same array that was passed in

    """
    # Intervals are processed from the last to the first one. Interval k
    # only writes positions >= index[k] >= k, so the means of the intervals
    # still to be processed (stored at positions < k) are never overwritten.
    stop = ghat.shape[0]
    for k in range(int(ci), -1, -1):
        start = int(index[k])
        ghat[start:stop] = ghat[k]
        stop = start
    return ghat

359 

360 

def pavxWidth(input, output):
    """Applies the Pool-Adjacent-Violators Algorithm and returns the width.

    This is a simplified port of the isotonic regression code made available
    at the University of Bern website. It runs :py:func:`pavx_1` followed by
    :py:func:`pavx_2`.

    Parameters
    ==========

    input : array_like (float, 1D)

        The input matrix for the PAV algorithm.

    output : array_like (float, 1D)

        The output matrix, must be of the same size as input. When a
        :py:class:`numpy.ndarray` is given, the fitted values are written
        into it in place.

    Returns
    =======

    width : array_like (uint64, 1D)

        The number of input samples pooled into each interval, one entry per
        interval
    """
    values = numpy.array(input)
    # ``asarray`` instead of ``array``: a copy here would receive the fitted
    # values and then be thrown away, leaving the caller's buffer untouched
    fitted = numpy.asarray(output)

    # Define working arrays: An interval of indices is represented by its left
    # endpoint "index" and its length "lengths"
    n_samples = values.shape[0]
    index = numpy.zeros(n_samples, dtype=numpy.uint64)
    lengths = numpy.zeros(n_samples, dtype=numpy.uint64)

    # First step: pools the samples into intervals
    ci = pavx_1(values, fitted, index, lengths)

    # Only the first ci + 1 entries describe actual intervals
    width = lengths[: ci + 1]

    # Second step: expands the interval means over all samples
    pavx_2(fitted, index, ci)

    return width

403 

404 

def rocch(negatives, positives):
    """Calculates the ROC Convex Hull (ROCCH) curve given a set of positive and negative scores

    .. warning::

       This function currently **always** raises
       :py:class:`NotImplementedError` right before the PAV step, because
       the python port of :py:func:`pavxWidth` has not been validated. The
       code after that point documents the intended computation but is not
       reachable.


    Parameters
    ==========

    negatives, positives : numpy.ndarray (1D, float)

        The set of negative and positive scores to compute the curve


    Returns
    =======

    curve : numpy.ndarray (2D, float)

        The ROC curve, with the first row containing the FPR, and the second
        row containing the FNR.


    Raises
    ======

    NotImplementedError

        Always, see the warning above

    """

    # Number of positive and negative scores
    n_pos = len(positives)
    n_neg = len(negatives)
    total = n_pos + n_neg

    # All scores in a single array, positives first
    all_scores = numpy.concatenate((positives, negatives))

    # A stable sort is required: scores that compare equal must keep their
    # relative order
    order = numpy.argsort(all_scores, kind="stable")

    # Ideal posterior: 1.0 where ``order`` refers to a positive score
    ideal = numpy.zeros((total,))
    ideal[order < n_pos] = 1.0

    # Buffer for the output of the PAV algorithm
    optimal = numpy.ndarray((total,))
    raise NotImplementedError(
        "An auto generated implementation of pavxWidth is available but no test has been done."
    )

    # --- unreachable until the guard above is lifted ---
    width = pavxWidth(ideal, optimal)

    # Allocates output: row 0 is the FAR, row 1 the FRR
    n_bins = len(width)
    curve = numpy.zeros((2, n_bins + 1))

    # Fills in output
    consumed = 0
    false_accepts = n_neg
    misses = 0

    for b in range(n_bins):
        curve[0, b] = false_accepts / n_neg  # pfa
        curve[1, b] = misses / n_pos  # pmiss
        consumed += int(width[b])

        misses = ideal[:consumed].sum() if consumed >= 1 else 0.0

        if ideal.shape[0] - 1 >= consumed:
            false_accepts = total - consumed - ideal[consumed:].sum()
        else:
            false_accepts = 0

    curve[0, n_bins] = false_accepts / n_neg  # pfa
    curve[1, n_bins] = misses / n_pos  # pmiss

    return curve

478 

479 

@array_jit
def rocch2eer(pmiss_pfa):
    """Calculates the equal-error-rate (EER) from the vertices of a ROC convex hull

    Every pair of consecutive columns of ``pmiss_pfa`` defines one segment of
    the hull. For each segment, the line coefficients ``seg`` such that
    ``XY . seg = 1`` are solved for, and ``1 / sum(seg)`` is taken as the EER
    candidate of that segment. Segments whose extent along either axis is
    below the machine epsilon contribute a candidate of 0. The largest
    candidate is returned.


    Parameters
    ==========

    pmiss_pfa : numpy.ndarray (2D, float)

        An array with exactly 2 rows. Row 0 is read as the false-acceptance
        probabilities and row 1 as the miss probabilities. Along the columns,
        row 0 must be non-increasing and row 1 non-decreasing; this is
        checked with assertions.


    Returns
    =======

    eer : float

        The largest EER candidate found over all segments (0 if there are no
        segments)

    """

    assert pmiss_pfa.shape[0] == 2

    n_vertices = pmiss_pfa.shape[1]
    tiny = numpy.finfo(numpy.float64).eps
    rhs = numpy.ones((2,))
    segment = numpy.empty((2, 2))
    best = 0.0
    candidate = 0.0

    for k in range(n_vertices - 1):
        # end-points of the current segment: one row per vertex
        segment[0, 0] = pmiss_pfa[0, k]  # pfa
        segment[0, 1] = pmiss_pfa[1, k]  # pmiss
        segment[1, 0] = pmiss_pfa[0, k + 1]  # pfa
        segment[1, 1] = pmiss_pfa[1, k + 1]  # pmiss

        # both coordinates should be sorted
        assert segment[1, 0] <= segment[0, 0] and segment[0, 1] <= segment[1, 1]

        # extent of the segment along each axis
        extent_pfa = abs(segment[0, 0] - segment[1, 0])
        extent_pmiss = abs(segment[0, 1] - segment[1, 1])

        if min(extent_pfa, extent_pmiss) < tiny:
            # degenerate (axis-parallel) segment
            candidate = 0.0
        else:
            # line coefficients such that segment . coefficients = 1
            coefficients = numpy.linalg.solve(segment, rhs)
            candidate = 1.0 / coefficients.sum()

        best = max(best, candidate)

    return best

535 

536 

def eer_rocch(negatives, positives):
    """Equal-error-rate (EER) given the input data, on the ROC Convex Hull

    Chains :py:func:`rocch` and :py:func:`rocch2eer`. It is meant to
    replicate the EER calculation from the Bosaris toolkit
    (https://sites.google.com/site/bosaristoolkit/).


    Parameters
    ==========

    negatives : numpy.ndarray (1D, float)

        The set of negative scores to compute the EER

    positives : numpy.ndarray (1D, float)

        The set of positive scores to compute the EER


    Returns
    =======

    eer : float

        The value returned by :py:func:`rocch2eer` for the convex hull of the
        given scores

    """
    hull = rocch(negatives, positives)
    return rocch2eer(hull)

565 

566 

@jit("float64(float64, float64, float64)", nopython=True)
def _abs_diff(a, b, cost):
    """Returns ``|a - b|``; ``cost`` is accepted but not used"""
    difference = a - b
    return abs(difference)

570 

571 

@jit("float64(float64, float64, float64)", nopython=True)
def _weighted_err(far, frr, cost):
    """Returns ``cost * far + (1 - cost) * frr``"""
    far_term = cost * far
    frr_term = (1.0 - cost) * frr
    return far_term + frr_term

575 

576 

@jit(nopython=True)
def _minimizing_threshold(negatives, positives, criterion, cost=0.5):
    """Calculates the threshold that minimizes a given criterion

    Both score sets are swept simultaneously, from the lowest to the highest
    score. The sweep starts with a FAR of 1 and a FRR of 0; every negative
    that is passed lowers the FAR by ``1 / len(negatives)`` and every
    positive that is passed raises the FRR by ``1 / len(positives)``
    (duplicated scores are passed together). Candidate thresholds are placed
    half-way between the score just passed and the next one. The criterion is
    evaluated for every candidate and the threshold giving the lowest value
    is returned.

    This only works with single-minimum, smooth criteria, and requires both
    score sets to be sorted in ascending order.


    Parameters
    ==========

    negatives : numpy.ndarray (1D, float)
        Negative scores, sorted ascendantly

    positives : numpy.ndarray (1D, float)
        Positive scores, sorted ascendantly

    criterion : str
        One of ("absolute-difference", "weighted-error"), selecting
        :py:func:`_abs_diff` or :py:func:`_weighted_err`

    cost : float
        Extra cost argument passed to the criterion

    Returns
    =======

    threshold : float
        The optimal threshold given the criterion and the scores


    Raises
    ======

    ValueError
        If ``criterion`` is not one of the supported names

    RuntimeError
        If either ``negatives`` or ``positives`` is empty

    """
    if criterion not in ("absolute-difference", "weighted-error"):
        raise ValueError("Uknown criterion")

    def evaluate(fa_ratio, fr_ratio, weight):
        if criterion == "absolute-difference":
            return _abs_diff(fa_ratio, fr_ratio, weight)
        return _weighted_err(fa_ratio, fr_ratio, weight)

    if len(negatives) == 0 or len(positives) == 0:
        raise RuntimeError(
            "Cannot compute threshold when no positives or "
            "no negatives are provided"
        )

    # best value of the criterion seen so far, and where it was obtained
    best_value = 1e8
    best_threshold = 1e8
    value = 1e8

    # the sweep starts at the extreme values for far and frr
    far = 1.0
    frr = 0.0

    # change in far/frr when passing one negative/positive
    n_neg = len(negatives)
    n_pos = len(positives)
    far_step = 1.0 / n_neg
    frr_step = 1.0 / n_pos

    # cursors over both (sorted) score lists
    p = 0
    n = 0

    # the first candidate is the minimum score
    threshold = min(negatives[n], positives[p])

    # continues until one of the two cursors reaches the end of its list
    while p < n_pos and n < n_neg:
        value = evaluate(far, frr, cost)

        if value <= best_value:
            best_value = value
            best_threshold = threshold

        # passes the lowest pending score (a negative, in case of ties)
        if positives[p] >= negatives[n]:
            threshold = negatives[n]
            n += 1
            far -= far_step
        else:
            threshold = positives[p]
            p += 1
            frr += frr_step

        # negatives equal to the score just passed are now rejected too, so
        # each one of them lowers the far
        while n < n_neg and threshold == negatives[n]:
            n += 1
            far -= far_step

        # positives equal to the score just passed are now rejected too, so
        # each one of them raises the frr
        while p < n_pos and threshold == positives[p]:
            p += 1
            frr += frr_step

        # moves the candidate to the center between the score just passed and
        # the next pending score, unless both lists are exhausted
        if n < n_neg or p < n_pos:
            if n < n_neg and p < n_pos:
                threshold += min(negatives[n], positives[p])
            elif n < n_neg:
                threshold += negatives[n]
            else:
                threshold += positives[p]
            threshold /= 2

    # one list is exhausted: evaluates the criterion for the last candidate
    value = evaluate(far, frr, cost)
    if value < best_value:
        best_value = value
        best_threshold = threshold

    # double-checks that a threshold above all scores does not do better
    if n < n_neg or p < n_pos:
        beyond = threshold
        if n < n_neg:
            beyond = numpy.nextafter(negatives[-1], negatives[-1] + 1)
        elif p < n_pos:
            beyond = numpy.nextafter(positives[-1], positives[-1] + 1)
        value = evaluate(0.0, 1.0, cost)
        if value < best_value:
            best_value = value
            best_threshold = beyond

    return best_threshold

737 

738 

def eer_threshold(negatives, positives, is_sorted=False):
    """Calculates threshold as close as possible to the equal error rate (EER)

    The EER is the point where the FPR equals the FNR. Graphically, this is
    the intersection between the ROC (or DET) curve and the identity. The
    threshold is searched with :py:func:`_minimizing_threshold`, using the
    ``"absolute-difference"`` criterion.

    .. note::

       Unless ``is_sorted`` is ``True``, sorted copies of both score sets are
       created internally. To avoid these copies, sort both sets externally
       in ascending order and set ``is_sorted`` to ``True``.


    Parameters
    ==========

    negatives : numpy.ndarray (1D, float)

        The set of negative scores to compute the threshold

    positives : numpy.ndarray (1D, float)

        The set of positive scores to compute the threshold

    is_sorted : :py:class:`bool`, Optional

        Set this to ``True`` if both ``negatives`` and ``positives`` are
        already sorted in ascending order. If ``False``, scores will be
        sorted internally, which will require more memory.


    Returns
    =======

    threshold : float

        The threshold (i.e., as used in :py:func:`farfrr`) where FPR and FNR
        are as close as possible

    """
    if is_sorted:
        neg, pos = negatives, positives
    else:
        # numpy.sort returns sorted copies, the inputs are left untouched
        neg, pos = numpy.sort(negatives), numpy.sort(positives)

    return _minimizing_threshold(neg, pos, "absolute-difference")

786 

787 

@array_jit
def epc(
    dev_negatives,
    dev_positives,
    test_negatives,
    test_positives,
    n_points,
    is_sorted=False,
    thresholds=False,
):
    """Calculates points of an Expected Performance Curve (EPC)

    ``n_points`` cost values are spread uniformly over ``[0.0, 1.0]``. For
    each cost, the threshold minimizing the weighted error rate on the
    development set is computed (see
    :py:func:`min_weighted_error_rate_threshold`). That threshold is then
    applied to the test set, and the mean of the resulting FAR and FRR (the
    HTER) is stored as the curve value.

    The result is a two-dimensional array of floats with shape
    ``(2, n_points)``, or ``(3, n_points)`` when ``thresholds`` is ``True``.

    .. note::

       It is more memory efficient to provide sorted development scores and
       set ``is_sorted`` to ``True``.


    Parameters
    ==========

    dev_negatives : numpy.ndarray (1D, float)

        Negative scores on the development set

    dev_positives : numpy.ndarray (1D, float)

        Positive scores on the development set

    test_negatives : numpy.ndarray (1D, float)

        Negative scores on the test set

    test_positives : numpy.ndarray (1D, float)

        Positive scores on the test set

    n_points : int

        The number of cost values for which the EPC curve should be computed

    is_sorted : :py:class:`bool`, Optional

        Set this to ``True`` if the development scores are already sorted in
        ascending order. If ``False``, they will be sorted internally, which
        will require more memory. The test scores are used as given.

    thresholds : :py:class:`bool`, Optional

        If ``True``, a third row holding the thresholds calculated on the
        development set is appended to the result.


    Returns
    =======

    curve : numpy.ndarray (2D, float)

        The EPC curve: the first row contains the cost values and the second
        row the errors on the test set. If ``thresholds`` is ``True``, a
        third row contains the thresholds calculated on the development set.

    """

    # sorted copies are only made when the caller did not pre-sort
    sorted_dev_neg = dev_negatives if is_sorted else numpy.sort(dev_negatives)
    sorted_dev_pos = dev_positives if is_sorted else numpy.sort(dev_positives)

    # numpy.linspace is more stable than numpy.arange for non-integer steps.
    # However, both arange and linspace are buggy in numba. Using objmode for a
    # workaround. TODO(amir): remove objmode once
    # https://github.com/numba/numba/issues/6768 is resolved.
    with objmode(costs="float64[:]"):
        costs = numpy.linspace(0, 1.0, n_points)

    dev_thresholds = numpy.empty_like(costs)
    test_errors = numpy.empty_like(costs)

    for position, cost in enumerate(costs):
        # threshold is tuned on the development set ...
        dev_thresholds[position] = _jit_min_weighted_error_rate_threshold(
            sorted_dev_neg, sorted_dev_pos, cost, True
        )
        # ... and evaluated on the test set
        far_frr = _jit_farfrr(
            test_negatives, test_positives, dev_thresholds[position]
        )
        rates = numpy.empty((2,))
        rates[0] = far_frr[0]
        rates[1] = far_frr[1]
        test_errors[position] = numpy.mean(rates)

    if thresholds:
        return numpy.vstack((costs, test_errors, dev_thresholds))
    return numpy.vstack((costs, test_errors))

900 

901 

def f_score(negatives, positives, threshold, weight=1.0):
    r"""Computes the F-score of the accuracy of the classification

    The F-score is a weighted mean of precision and recall measurements, see
    :py:func:`precision_recall`. It is computed as:

    .. math::

       \mathrm{\text{f-score}}(w) = (1 + w^2)\frac{\mathrm{precision}\cdot{}\mathrm{recall}}{w^2\cdot{}\mathrm{precision} + \mathrm{recall}}

    With a weight of 1 (the default), the F-score is called F1 score and is
    the harmonic mean between precision and recall. A weight that is not
    strictly positive is replaced by 1.


    Parameters
    ==========

    negatives : numpy.ndarray (1D, float)

        The set of negative scores to compute the precision and recall

    positives : numpy.ndarray (1D, float)

        The set of positive scores to compute the precision and recall

    threshold : float

        The threshold to compute the precision and recall for

    weight : :py:class:`float`, Optional

        The weight :math:`w` between precision and recall


    Returns
    =======

    f_score : float

        The computed f-score for the given scores and the given threshold;
        0 when both precision and recall are 0

    """
    # anything that is not strictly positive falls back to the F1 weight
    if not weight > 0:
        weight = 1
    weight_sq = weight**2

    precision, recall = precision_recall(negatives, positives, threshold)

    # avoids the 0 / 0 division below
    if precision == 0.0 and recall == 0.0:
        return 0.0

    numerator = (1 + weight_sq) * (precision * recall)
    denominator = (weight_sq * precision) + recall
    return numerator / denominator

951 

952 

@array_jit
def far_threshold(negatives, positives, far_value=0.001, is_sorted=False):
    """Threshold such that the real FPR is **at most** the requested ``far_value`` if possible

    The negative scores are visited from the highest to the lowest one. The
    threshold is lowered to each distinct score for as long as the fraction
    of negatives at or above it does not exceed ``far_value``.

    .. note::

       Unless ``is_sorted`` is ``True``, a sorted copy of the ``negatives``
       is created internally. To avoid this copy, sort them externally in
       ascending order and set ``is_sorted`` to ``True``.


    Parameters
    ==========

    negatives : numpy.ndarray (1D, float)

        The set of negative scores to compute the FPR threshold; at least 2
        scores are required

    positives : numpy.ndarray (1D, float)

        Ignored, but needs to be specified -- may be given as ``[]``

    far_value : :py:class:`float`, Optional

        The FPR value, for which the threshold should be computed

    is_sorted : :py:class:`bool`, Optional

        Set this to ``True`` if the ``negatives`` are already sorted in
        ascending order. If ``False``, scores will be sorted internally, which
        will require more memory.


    Returns
    =======

    threshold : float

        The threshold such that the real FPR is at most ``far_value``


    Raises
    ======

    RuntimeError

        If ``far_value`` is below 0 or above 1, or if fewer than 2 negative
        scores are given

    """

    # N.B.: Unoptimized version ported from C++

    if far_value < 0.0 or far_value > 1.0:
        raise RuntimeError("`far_value' must be in the interval [0.,1.]")

    if len(negatives) < 2:
        raise RuntimeError("the number of negative scores must be at least 2")

    tiny = numpy.finfo(numpy.float64).eps
    # sorted copy is only made when the caller did not pre-sort
    ascending = negatives if is_sorted else numpy.sort(negatives)

    # far == 1 needs no iteration: anything just below the lowest score
    # accepts every negative
    if far_value >= (1 - tiny):
        return numpy.nextafter(ascending[0], ascending[0] - 1)

    # Highest score first. This way the code below is very similar to the
    # implementation in the frr_threshold function. The implementations are
    # not exactly the same though.
    descending = numpy.flip(ascending)
    n_scores = len(descending)
    cursor = 0

    # since the comparison is `if score >= threshold then accept as genuine`,
    # the smallest value above the largest score gives a FAR of 0%
    accepted = numpy.nextafter(descending[cursor], descending[cursor] + 1)
    candidate = 0.0

    # Lowers the threshold score by score until the desired FAR is crossed
    while cursor < n_scores:
        candidate = descending[cursor]
        # moves to the last occurrence of a repeated score
        while cursor < (n_scores - 1) and descending[cursor + 1] == candidate:
            cursor += 1
        # every score up to and including the cursor would be falsely
        # accepted with this candidate
        far_if_accepted = (cursor + 1) / n_scores
        if far_if_accepted > far_value:
            break
        accepted = candidate
        cursor += 1

    return accepted


_jit_far_threshold = far_threshold.jit_func

1051 

1052 

@array_jit
def farfrr(negatives, positives, threshold):
    """Computes the false-positive and false-negative rates at a threshold

    ``positives`` are the scores of samples labelled as belonging to the class
    of interest (a.k.a. 'signal' or 'client'); ``negatives`` are the scores of
    samples labelled as **not** belonging to it (a.k.a. 'noise' or
    'impostor').  Positive scores are expected to be, by design, greater than
    negative ones.  If your system produces scores the other way around,
    invert them before calling this function.

    A score lying exactly on the threshold is treated as accepted: a positive
    on the threshold is correctly classified, a negative on the threshold is a
    false acceptance.  This is equivalent to the following pseudo-code:

    .. code-block:: python

       false_rejects = 0
       false_accepts = 0
       for k in positives:
           if k < threshold:
               false_rejects += 1
       for k in negatives:
           if k >= threshold:
               false_accepts += 1

    The threshold does not have to lie within the range covered by the
    scores.  If it lies outside, the result is either ``(1.0, 0.0)`` or
    ``(0.0, 1.0)``, depending on the side on which it falls.


    Parameters
    ==========

    negatives : numpy.ndarray (1D, float)

        The scores for comparisons of objects of different classes

    positives : numpy.ndarray (1D, float)

        The scores for comparisons of objects of the same class

    threshold : float

        The threshold separating accepted from rejected scores


    Returns
    =======

    far : float

        The False Positive Rate (FPR) at the given threshold, in ``[0, 1]``

    frr : float

        The False Negative Rate (FNR) at the given threshold, in ``[0, 1]``

    If ``threshold`` is NaN, a message is printed and ``(1.0, 1.0)`` is
    returned.


    Raises
    ======

    RuntimeError

        If ``negatives`` or ``positives`` is empty

    """

    # A NaN threshold cannot be compared against: report the worst case.
    if numpy.isnan(threshold):
        print("Error: Cannot compute FPR (FAR) or FNR (FRR) with NaN threshold")
        return (1.0, 1.0)

    n_negatives = len(negatives)
    if n_negatives == 0:
        raise RuntimeError(
            "Cannot compute FPR (FAR) when no negatives are given"
        )

    n_positives = len(positives)
    if n_positives == 0:
        raise RuntimeError(
            "Cannot compute FNR (FRR) when no positives are given"
        )

    false_accepts = (negatives >= threshold).sum()
    false_rejects = (positives < threshold).sum()
    return false_accepts / n_negatives, false_rejects / n_positives


# Raw numba dispatcher, used by the other jitted functions of this module.
_jit_farfrr = farfrr.jit_func

1148 

1149 

@array_jit
def frr_threshold(negatives, positives, frr_value=0.001, is_sorted=False):
    """Computes the threshold such that the real FNR is **at most** ``frr_value``

    The threshold is selected among the positive scores, under the convention
    that a score is accepted when ``score >= threshold`` (see
    :py:func:`farfrr`).

    .. note::

       The scores will be sorted internally, requiring the scores to be copied.
       To avoid this copy, you can sort the ``positives`` scores externally in
       ascendant order, and set the ``is_sorted`` parameter to ``True``.


    Parameters
    ==========

    negatives : numpy.ndarray (1D, float)

        Ignored, but needs to be specified -- may be given as ``[]``.

    positives : numpy.ndarray (1D, float)

        The set of positive scores to compute the FNR threshold.

    frr_value : :py:class:`float`, Optional

        The FNR value, for which the threshold should be computed.

    is_sorted : :py:class:`bool`, Optional

        Set this to ``True`` if the ``positives`` are already sorted in
        ascending order.  If ``False``, scores will be sorted internally, which
        will require more memory.


    Returns
    =======

    threshold : float

        The threshold such that the real FNR is at most ``frr_value``.


    Raises
    ======

    RuntimeError

        If ``frr_value`` lies outside ``[0, 1]`` or fewer than two positive
        scores are given

    """

    # N.B.: Unoptimized version ported from C++

    if frr_value < 0.0 or frr_value > 1.0:
        raise RuntimeError("`frr_value' value must be in the interval [0.,1.]")

    if len(positives) < 2:
        raise RuntimeError("the number of positive scores must be at least 2")

    epsilon = numpy.finfo(numpy.float64).eps
    # if not pre-sorted, copies and sorts
    scores = positives if is_sorted else numpy.sort(positives)

    # handles the special case of frr == 1 without any iterating: a threshold
    # just above the largest positive rejects every positive score
    if frr_value >= (1 - epsilon):
        return numpy.nextafter(scores[-1], scores[-1] + 1)

    # Move towards the end of the array changing the threshold until we cross
    # the desired FRR value.  Starting with a threshold that corresponds to
    # FRR == 0.
    total_count = len(scores)
    current_position = 0

    # Since the comparison is `if score >= threshold then accept as genuine`,
    # the smallest positive score itself rejects no positive, i.e. it yields
    # 0% FRR.  (A value just *above* it would already reject that score and
    # all its duplicates, which could exceed ``frr_value``.)
    valid_threshold = scores[current_position]
    current_threshold = 0.0

    while current_position < total_count:
        current_threshold = scores[current_position]
        # keep iterating if values are repeated
        while (
            current_position < (total_count - 1)
            and scores[current_position + 1] == current_threshold
        ):
            current_position += 1
        # Scores before the current position are rejected when the threshold
        # is set to the current score.  With repeated scores the position is
        # that of the last duplicate, so this is an upper bound on the FRR.
        future_frr = current_position / total_count
        if future_frr > frr_value:
            break
        valid_threshold = current_threshold
        current_position += 1

    return valid_threshold


# Raw numba dispatcher, used by the other jitted functions of this module.
_jit_frr_threshold = frr_threshold.jit_func

1243 

1244 

def min_hter_threshold(negatives, positives, is_sorted=False):
    """Calculates the :py:func:`min_weighted_error_rate_threshold` with ``cost=0.5``

    Parameters
    ==========

    negatives, positives : numpy.ndarray (1D, float)

        The set of negative and positive scores to compute the threshold

    is_sorted : :py:class:`bool`, Optional

        Set this to ``True`` if both sets of scores are already sorted in
        ascending order.  The flag is forwarded unchanged to
        :py:func:`min_weighted_error_rate_threshold`, which sorts the scores
        internally when it is ``False``.


    Returns
    =======

    threshold : float

        The threshold for which the weighted error rate is minimal

    """
    # Equal weight on false acceptances and false rejections.
    equal_cost = 0.5
    return min_weighted_error_rate_threshold(
        negatives, positives, equal_cost, is_sorted
    )

1273 

1274 

@array_jit
def min_weighted_error_rate_threshold(
    negatives, positives, cost, is_sorted=False
):
    """Calculates the threshold that minimizes the weighted error rate

    The ``cost`` parameter sets the relative importance of false-accepts and
    false-rejections.  It should lie between 0 and 1 and is clipped to those
    extremes.  The value to minimize becomes:
    :math:`ER_{cost} = cost * FPR + (1-cost) * FNR`.  The higher the cost, the
    higher the importance given to **not** making mistakes classifying
    negatives/noise/impostors.

    .. note::

       The scores will be sorted internally, requiring the scores to be copied.
       To avoid this copy, you can sort both sets of scores externally in
       ascendant order, and set the ``is_sorted`` parameter to ``True``.


    Parameters
    ==========

    negatives, positives : numpy.ndarray (1D, float)

        The set of negative and positive scores to compute the threshold

    cost : float

        The relative cost over FPR with respect to FNR in the threshold
        calculation

    is_sorted : :py:class:`bool`, Optional

        Set this to ``True`` if **both** ``negatives`` and ``positives`` are
        already sorted in ascending order.  If ``False``, both sets are sorted
        internally, which will require more memory.


    Returns
    =======

    threshold : float

        The threshold for which the weighted error rate is minimal

    """

    # clip the cost to the [0, 1] interval
    if cost < 0.0:
        cost = 0.0
    elif cost > 1.0:
        cost = 1.0

    # if not pre-sorted, copies and sorts
    neg = negatives if is_sorted else numpy.sort(negatives)
    pos = positives if is_sorted else numpy.sort(positives)

    return _minimizing_threshold(neg, pos, "weighted-error", cost)


# Raw numba dispatcher, used by the other jitted functions of this module.
_jit_min_weighted_error_rate_threshold = (
    min_weighted_error_rate_threshold.jit_func
)

1337 

1338 

1339# @jit([(numba.float64[:, :],)], nopython=True) 

def ppndf(p):
    """Returns the Deviate Scale equivalent of a false rejection/acceptance ratio

    The algorithm that calculates the deviate scale is based on function
    ``ppndf()`` from the NIST package DETware version 2.1, freely available on
    the internet.  Please consult it for more details.  By 20.04.2011, you
    could find such package `here <http://www.itl.nist.gov/iad/mig/tools/>`_.

    The input to this function is a cumulative probability.  The output from
    this function is the Normal deviate that corresponds to that probability.
    For example::

        -------+--------
         INPUT | OUTPUT
        -------+--------
         0.001 | -3.090
         0.01  | -2.326
         0.1   | -1.282
         0.5   |  0.0
         0.9   |  1.282
         0.99  |  2.326
         0.999 |  3.090
        -------+--------

    Inputs greater than or equal to 1 are replaced by ``1 - eps`` and inputs
    smaller than or equal to 0 by ``eps`` (``eps`` being the float64 machine
    epsilon) before the conversion.


    Parameters
    ==========

    p : numpy.ndarray (float)

        The value (usually FPR or FNR) for which the PPNDF should be calculated


    Returns
    =======

    ppndf : numpy.ndarray (float)

        The deviate scale of the given value, with the same shape as ``p``


    Raises
    ======

    RuntimeError

        If a tail probability is not strictly positive after clamping

    """

    eps = numpy.finfo(numpy.float64).eps

    # keep probabilities strictly inside the open interval (0, 1)
    prob = numpy.copy(p)
    prob = numpy.where(prob >= 1.0, 1.0 - eps, prob)
    prob = numpy.where(prob <= 0.0, eps, prob)

    centered = prob - 0.5
    central = numpy.abs(centered) <= 0.42
    tails = ~central

    deviate = numpy.zeros_like(prob)

    # central region, |q| <= 0.42: rational approximation in q**2
    q_central = centered[central]
    r = numpy.square(q_central)
    numerator = (
        (-25.4410604963 * r + 41.3911977353) * r + -18.6150006252
    ) * r + 2.5066282388
    denominator = (
        ((3.1308290983 * r + -21.0622410182) * r + 23.0833674374) * r
        + -8.4735109309
    ) * r + 1.0
    deviate[central] = q_central * numerator / denominator

    # tails, |q| > 0.42: work on the probability mass of the nearest tail
    q_tail = centered[tails]
    upper = q_tail > 0
    tail_mass = prob[tails]  # boolean indexing copies, ``prob`` is untouched
    tail_mass[upper] = 1 - tail_mass[upper]
    if (tail_mass <= 0).any():
        raise RuntimeError("measure::ppndf(): r <= 0.0!")

    r = numpy.sqrt(-1 * numpy.log(tail_mass))
    magnitude = (
        ((2.3212127685 * r + 4.8501412713) * r + -2.2979647913) * r
        + -2.7871893113
    ) / ((1.6370678189 * r + 3.5438892476) * r + 1.0)
    # the lower tail maps to negative deviates
    deviate[tails] = numpy.where(q_tail < 0, -magnitude, magnitude)

    return deviate

1431 

1432 

@array_jit
def precision_recall(negatives, positives, threshold):
    r"""Calculates precision and recall (sensitivity) at a given threshold

    Precision and recall are computed as:

    .. math::

       \mathrm{precision} = \frac{tp}{tp + fp}

       \mathrm{recall} = \frac{tp}{tp + fn}

    where :math:`tp` are the true positives, :math:`fp` are the false positives
    and :math:`fn` are the false negatives.

    ``positives`` holds the score information for samples that are labeled to
    belong to a certain class (a.k.a., 'signal' or 'client').  ``negatives``
    holds the score information for samples that are labeled **not** to belong
    to the class (a.k.a., 'noise' or 'impostor').  A score is counted as
    accepted when it is greater than or equal to ``threshold``; for more
    details about how error rates are considered, see :py:func:`farfrr`.

    When no score at all is accepted, the precision is reported as ``0``.


    Parameters
    ==========

    negatives, positives : numpy.ndarray (1D, float)

        The set of negative and positive scores to compute the measurements

    threshold : float

        The threshold to compute the measures for


    Returns
    =======

    precision : float

        The precision value for the given negatives and positives

    recall : float

        The recall value for the given negatives and positives


    Raises
    ======

    RuntimeError

        If ``negatives`` or ``positives`` is empty

    """

    if len(positives) == 0 or len(negatives) == 0:
        raise RuntimeError(
            "Cannot compute precision or recall when no "
            "positives or no negatives are given"
        )

    true_positives = (positives >= threshold).sum()
    false_positives = (negatives >= threshold).sum()

    # avoid dividing by zero when nothing is accepted (true_positives is 0)
    accepted = true_positives + false_positives
    if accepted == 0:
        accepted = 1

    precision = true_positives / accepted
    recall = true_positives / len(positives)
    return precision, recall


# Raw numba dispatcher, used by the other jitted functions of this module.
_jit_precision_recall = precision_recall.jit_func

1495 

1496 

@array_jit
def precision_recall_curve(negatives, positives, n_points):
    """Calculates the precision-recall curve for a set of scores

    Thresholds are obtained from ``_meaningful_thresholds`` (called with
    ``min_far=-8`` and unsorted scores) and precision and recall are evaluated
    at each of them with :py:func:`precision_recall`.

    The output has one column per threshold actually returned by
    ``_meaningful_thresholds``, exactly like :py:func:`roc`, so every returned
    value is initialised even if that count differs from ``n_points``.


    Parameters
    ==========

    negatives, positives : numpy.ndarray (1D, float)

        The set of negative and positive scores to compute the measurements

    n_points : int

        The number of thresholds requested for the evaluation of precision
        and recall


    Returns
    =======

    curve : numpy.ndarray (2D, float)

        2D array of floats that express the X (precision, row 0) and Y
        (recall, row 1) coordinates.

    """
    thresholds = _meaningful_thresholds(
        negatives, positives, n_points, -8, False
    )
    # size the output by the thresholds really produced, not by the request
    curve = numpy.empty((2, len(thresholds)))
    for i, k in enumerate(thresholds):
        precision, recall = _jit_precision_recall(negatives, positives, k)
        curve[0, i] = precision
        curve[1, i] = recall
    return curve

1541 

1542 

@array_jit
def roc(negatives, positives, n_points, min_far=-8):
    r"""Calculates points of a Receiver Operating Characteristic (ROC)

    Calculates the ROC curve given a set of negative and positive scores and a
    desired number of points.  Thresholds are obtained from
    ``_meaningful_thresholds`` and the error rates at each of them are
    computed with :py:func:`farfrr`.


    Parameters
    ==========

    negatives, positives : numpy.ndarray (1D, float)

        The negative and positive scores, for which the ROC curve should be
        calculated.

    n_points : int

        The number of points requested for the curve.  The output holds one
        column per threshold returned by ``_meaningful_thresholds``.

    min_far : int

        Minimum FAR in terms of :math:`10^{\text{min\_far}}`.  This value is
        also used for ``min_frr``.  Values should be negative.


    Returns
    =======

    curve : numpy.ndarray (2D, float)

        A two-dimensional array of doubles that express the X (FPR, row 0) and
        Y (FNR, row 1) coordinates in this order

    """

    thresholds = _meaningful_thresholds(
        negatives, positives, n_points, min_far, False
    )
    curve = numpy.empty((2, len(thresholds)))
    for column, threshold in enumerate(thresholds):
        far, frr = _jit_farfrr(negatives, positives, threshold)
        curve[0, column] = far
        curve[1, column] = frr
    return curve

1586 

1587 

@array_jit
def roc_for_far(negatives, positives, far_list, is_sorted=False):
    """Calculates ROC points for a list of requested FPR values

    For every requested value, a threshold is derived from the negatives with
    :py:func:`far_threshold` and the error rates at that threshold are then
    measured with :py:func:`farfrr`.

    .. note::

       The scores will be sorted internally, requiring the scores to be copied.
       To avoid this copy, you can sort both sets of scores externally in
       ascendant order, and set the ``is_sorted`` parameter to ``True``.


    Parameters
    ==========

    negatives, positives : numpy.ndarray (1D, float)

        The set of negative and positive scores to compute the curve

    far_list : numpy.ndarray (1D, float)

        A list of FPR values, for which the FNR values should be computed

    is_sorted : :py:class:`bool`, Optional

        Set this to ``True`` if both sets of scores are already sorted in
        ascending order.  If ``False``, scores will be sorted internally, which
        will require more memory.


    Returns
    =======

    curve : numpy.ndarray (2D, float)

        The ROC curve, with one column per entry of ``far_list``.  Row 0 holds
        the FPR measured at the threshold derived for the requested value, and
        row 1 the corresponding FNR.


    Raises
    ======

    RuntimeError

        If ``negatives`` or ``positives`` is empty

    """
    if not len(negatives):
        raise RuntimeError("The given set of negatives is empty.")

    if not len(positives):
        raise RuntimeError("The given set of positives is empty.")

    # if not pre-sorted, copies and sorts
    neg = negatives if is_sorted else numpy.sort(negatives)
    pos = positives if is_sorted else numpy.sort(positives)

    # Get the threshold for each requested far value and calculate far and
    # frr values based on that threshold.
    curve = numpy.empty((2, len(far_list)))
    for column, requested_far in enumerate(far_list):
        threshold = _jit_far_threshold(neg, pos, requested_far, True)
        far, frr = _jit_farfrr(neg, pos, threshold)
        curve[0, column] = far
        curve[1, column] = frr
    return curve