Coverage for src/credible/bayesian/utils.py: 87%

68 statements  

coverage.py v7.4.2, created at 2024-04-16 09:18 +0200

1# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> 

2# 

3# SPDX-License-Identifier: GPL-3.0-or-later 

4"""Functions to evalute the (Bayesian) credible region of measures. 

5 

6(Bayesian) credible region interpretation, with 95% coverage: The probability 

7that the true proportion will lie within the 95% credible interval is 0.95. 

8 

9Contrary to frequentist approaches, in which one can only say that if the test 

10were repeated an infinite number of times, and one constructed a confidence 

11interval each time, then X% of the confidence intervals would contain the true 

12rate, here we can say that given our observed data, there is a X% probability 

13that the true value of :math:`k/n` falls within the provided interval. 

14 

15See a discussion in `Five Confidence Intervals for Proportions That You 

16Should Know About <ci-evaluation_>`_ for a study on coverage for most common 

17methods. 

18 

19.. note:: 

20 

21 For a disambiguation with `Confidence Interval <confidence-interval_>`_ (the 

22 frequentist approach), read `Credible Regions or Intervals 

23 <credible-interval_>`_. 

24 

25.. include:: ../links.rst 

26""" 

27 

28import typing 

29 

30import numpy 

31import numpy.random 

32import numpy.typing 

33import scipy.special 

34 

35from ..utils import as_int_arrays 

36 

37 

38def _beta_ndarray( 

39 successes: numpy.typing.NDArray[numpy.integer], 

40 failures: numpy.typing.NDArray[numpy.integer], 

41 lambda_: float, 

42 coverage: float, 

43) -> tuple[ 

44 numpy.typing.NDArray[numpy.double], 

45 numpy.typing.NDArray[numpy.double], 

46 numpy.typing.NDArray[numpy.double], 

47 numpy.typing.NDArray[numpy.double], 

48]: 

49 r""":py:func:`beta`, for multiple systems. 

50 

51 Parameters 

52 ---------- 

53 successes 

54 Number of successes (``k``) observed on the experiment. 

55 failures 

56 Number of failures (``l``) observed on the experiment. 

57 lambda_ 

58 The parameterisation of the Beta prior to consider. Use 

59 :math:`\lambda=1` for a flat prior. Use :math:`\lambda=0.5` for 

60 Jeffrey's prior. If unsure, use 0.5. 

61 coverage 

62 A floating-point number between 0 and 1.0 indicating the 

63 coverage you're expecting. A value of 0.95 will ensure 95% 

64 of the area under the probability density of the posterior 

65 is covered by the returned equal-tailed interval. 

66 

67 Returns 

68 ------- 

69 A tuple containing 4 floating-point arrays representing:

70 

71 * mean: The mean of the posterior distribution 

72 * mode: The mode of the posterior distribution 

73 * lower: The lower bounds of the credible region 

74 * upper: The upper bounds of the credible region 

75 

76 If the input was a 1-D array with multiple experiments, then the return 

77 value of this function is also composed of 1-D arrays, representing the 

78 mean, mode, lower and upper bounds of the various credible regions 

79 defined for each of the success/failure tuples on the input. 

80 

81 Raises 

82 ------ 

83 TypeError 

84 If the dimensions of ``successes`` and ``failures`` do not match, or in 

85 case the input types are unsupported. 

86 """ 

87 

88 # we return the equal-tailed range
89 right = (1.0 - coverage) / 2 # probability mass excluded on each side

90 lower = scipy.special.betaincinv( 

91 successes + lambda_, failures + lambda_, right 

92 ) 

93 upper = scipy.special.betaincinv( 

94 successes + lambda_, failures + lambda_, 1.0 - right 

95 ) 

96 

97 # evaluate mean and mode (https://en.wikipedia.org/wiki/Beta_distribution) 

98 alpha = successes + lambda_ 

99 beta = failures + lambda_ 

100 

101 mean = numpy.nan_to_num(alpha / (alpha + beta)) 

102 

103 # the mode of a beta distribution is a bit tricky 

104 mode = numpy.zeros_like(lower) 

105 cond = (alpha > 1) & (beta > 1) 

106 mode[cond] = (alpha[cond] - 1) / (alpha[cond] + beta[cond] - 2) 

107 # In the case of precision, if the threshold is close to 1.0, both TP 

108 # and FP can be zero, which may cause this condition to be reached, if 

109 # the prior is exactly 1 (flat prior). This is a weird situation, 

110 # because effectively we are trying to compute the posterior when the 

111 # total number of experiments is zero. So, only the prior counts - but 

112 # the prior is flat, so we should just pick a value. We choose the 

113 # middle of the range. 

114 # cond = (alpha == 1) & (beta == 1)
115 # mode[cond] = 0.0
116 # cond = (alpha <= 1) & (beta > 1)
117 # mode[cond] = 0.0

118 mode[(alpha > 1) & (beta <= 1)] = 1.0 

119 # else: #elif alpha < 1 and beta < 1: 

120 # in the case of precision, if the threshold is close to 1.0, both TP 

121 # and FP can be zero, which may cause this condition to be reached, if 

122 # the prior is smaller than 1. This is a weird situation, because 

123 # effectively we are trying to compute the posterior when the total 

124 # number of experiments is zero. So, only the prior counts - but the 

125 # prior is bimodal, so we should just pick a value. We choose the 

126 # left of the range. 

127 # n.b.: could also be 1.0 as the prior is bimodal 

128 # mode[(alpha < 1) & (beta < 1)] = 0.0

129 

130 return mean, mode, lower, upper 

131 

132 
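# Minimal sketch of the equal-tailed computation above for a single,
# hypothetical experiment (k=45 successes, l=5 failures, Jeffrey's prior,
# 95% coverage); the counts are illustrative, not from this module.
_k, _l, _lambda, _coverage = 45, 5, 0.5, 0.95
_half = (1.0 - _coverage) / 2  # probability mass excluded on each side
_lo = scipy.special.betaincinv(_k + _lambda, _l + _lambda, _half)
_hi = scipy.special.betaincinv(_k + _lambda, _l + _lambda, 1.0 - _half)
# _lo and _hi bracket 95% of the posterior density of k/(k+l).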

133def beta_array( 

134 successes: typing.Iterable[int], 

135 failures: typing.Iterable[int], 

136 lambda_: float, 

137 coverage: float, 

138) -> tuple[ 

139 numpy.typing.NDArray[numpy.double], 

140 numpy.typing.NDArray[numpy.double], 

141 numpy.typing.NDArray[numpy.double], 

142 numpy.typing.NDArray[numpy.double], 

143]: 

144 r""":py:func:`beta`, for multiple systems. 

145 

146 Parameters 

147 ---------- 

148 successes 

149 Number of successes (``k``) observed on the experiment. 

150 failures 

151 Number of failures (``l``) observed on the experiment. 

152 lambda_ 

153 The parameterisation of the Beta prior to consider. Use 

154 :math:`\lambda=1` for a flat prior. Use :math:`\lambda=0.5` for 

155 Jeffrey's prior. If unsure, use 0.5. 

156 coverage 

157 A floating-point number between 0 and 1.0 indicating the 

158 coverage you're expecting. A value of 0.95 will ensure 95% 

159 of the area under the probability density of the posterior 

160 is covered by the returned equal-tailed interval. 

161 

162 Returns 

163 ------- 

164 A tuple containing 4 floating-point arrays representing:

165 

166 * mean: The mean of the posterior distribution 

167 * mode: The mode of the posterior distribution 

168 * lower: The lower bounds of the credible region 

169 * upper: The upper bounds of the credible region 

170 

171 If the input was a 1-D array with multiple experiments, then the return 

172 value of this function is also composed of 1-D arrays, representing the 

173 mean, mode, lower and upper bounds of the various credible regions 

174 defined for each of the success/failure tuples on the input. 

175 

176 Raises 

177 ------ 

178 TypeError 

179 If the dimensions of ``successes`` and ``failures`` do not match, or in 

180 case the input types are unsupported. 

181 """ 

182 successes_array, failures_array = as_int_arrays((successes, failures)) 

183 return _beta_ndarray(successes_array, failures_array, lambda_, coverage) 

184 

185 
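# Usage sketch for beta_array (illustrative counts): 95% equal-tailed credible
# regions for two experiments at once, using a flat prior.
_means, _modes, _lowers, _uppers = beta_array(
    successes=[90, 10], failures=[10, 90], lambda_=1.0, coverage=0.95
)
# Each returned value is a 1-D array with one entry per experiment.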

186def beta( 

187 successes: int, failures: int, lambda_: float, coverage: float 

188) -> tuple[float, float, float, float]: 

189 r"""Return the mode, upper and lower bounds of the equal-tailed credible 

190 region of a probability estimate following Bernoulli trials. 

191 

192 This technique is not very conservative: in most cases, coverage close to
193 the extremes (0 or 1) is lower than expected (but still greater than
194 85%).

195 

196 This implementation is based on [GOUTTE-2005]_. It assumes :math:`k` 

197 successes and :math:`l` failures (:math:`n = k+l` total trials) are issued 

198 from a series of Bernoulli trials (likelihood is binomial). The posterior 

199 is derived using Bayes' theorem with a beta prior. As there is no

200 reason to favour high vs. low precision, we use a symmetric Beta prior 

201 (:math:`\alpha=\beta`): 

202 

203 .. math:: 

204 

205 P(p|k,n) &= \frac{P(k,n|p)P(p)}{P(k,n)} \\ 

206 P(p|k,n) &= \frac{\frac{n!}{k!(n-k)!}p^{k}(1-p)^{n-k}P(p)}{P(k,n)} \\

207 P(p|k,n) &= \frac{1}{B(k+\alpha, n-k+\beta)}p^{k+\alpha-1}(1-p)^{n-k+\beta-1} \\ 

208 P(p|k,n) &= \frac{1}{B(k+\alpha, n-k+\alpha)}p^{k+\alpha-1}(1-p)^{n-k+\alpha-1} 

209 

210 The mode for this posterior (also the maximum a posteriori) is: 

211 

212 .. math:: 

213 

214 \text{mode}(p) = \frac{k+\lambda-1}{n+2\lambda-2} 

215 

216 Concretely, the prior may be flat (all rates are equally likely, 

217 :math:`\lambda=1`) or we may use Jeffrey's prior (:math:`\lambda=0.5`),
218 which is invariant under re-parameterisation. Jeffrey's prior indicates
219 that rates close to zero or one are more likely.

220 

221 The mode above works if :math:`k+\alpha > 1` and :math:`n-k+\alpha > 1`,
222 which is usually the case for a reasonably well-tuned system with more
223 than a few samples for analysis. At the limits of system performance,
224 :math:`k` may be 0, which makes the mode become zero.

225 

226 For our purposes, it is more convenient to write :math:`n = k + l`, with
227 :math:`k` the number of successes and :math:`l` the number of failures in
228 the binomial experiment, which leads to this alternative
229 representation:

230 

231 .. math:: 

232 

233 P(p|k,l) &= \frac{1}{B(k+\alpha, l+\alpha)}p^{k+\alpha-1}(1-p)^{l+\alpha-1} \\ 

234 \text{mode}(p) &= \frac{k+\lambda-1}{k+l+2\lambda-2} 

235 

236 This can be mapped to most rates calculated in the context of binary 

237 classification this way: 

238 

239 * Precision or Positive-Predictive Value (PPV): p = TP/(TP+FP), so k=TP, l=FP 

240 * Recall, Sensitivity, or True Positive Rate: r = TP/(TP+FN), so k=TP, l=FN 

241 * Specificity or True Negative Rate: s = TN/(TN+FP), so k=TN, l=FP
242 * Accuracy: acc = (TP+TN)/(TP+TN+FP+FN), so k=TP+TN, l=FP+FN

243 * Jaccard: j = TP/(TP+FP+FN), so k=TP, l=FP+FN 

244 

245 .. note:: **Important** 

246 

247 To calculate the limits given the required coverage, we use the 

248 incomplete **inverse** (regularized, or normalized) beta function, 

249 :any:`scipy.special.betaincinv` instead of :any:`scipy.special.betainc`. 

250 The latter requires we provide the bounds and returns the coverage, 

251 whereas here we are interested in the *inverse* behaviour. 

252 

253 Parameters 

254 ---------- 

255 successes 

256 Number of successes (``k``) observed on the experiment. 

257 failures 

258 Number of failures (``l``) observed on the experiment. 

259 lambda_ 

260 The parameterisation of the Beta prior to consider. Use 

261 :math:`\lambda=1` for a flat prior. Use :math:`\lambda=0.5` for 

262 Jeffrey's prior. If unsure, use 0.5. 

263 coverage 

264 A floating-point number between 0 and 1.0 indicating the 

265 coverage you're expecting. A value of 0.95 will ensure 95% 

266 of the area under the probability density of the posterior 

267 is covered by the returned equal-tailed interval. 

268 

269 Returns 

270 ------- 

271 A tuple containing 4 floating point numbers representing: 

272 

273 * mean: The mean of the posterior distribution 

274 * mode: The mode of the posterior distribution 

275 * lower: The lower bounds of the credible region 

276 * upper: The upper bounds of the credible region 

277 

278 For a version of this function that accepts 1-D arrays of successes and
279 failures, and returns arrays with the mean, mode, lower and upper bounds
280 of the credible regions for each success/failure pair, see
281 :py:func:`beta_array`.

282 

283 Raises 

284 ------ 

285 TypeError 

286 If the dimensions of ``successes`` and ``failures`` do not match, or in 

287 case the input types are unsupported. 

288 """ 

289 retval = beta_array([successes], [failures], lambda_, coverage) 

290 return ( 

291 retval[0].item(), 

292 retval[1].item(), 

293 retval[2].item(), 

294 retval[3].item(), 

295 ) 

296 

297 
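# Usage sketch for beta (illustrative counts): 95% credible region for the
# precision of a hypothetical system with TP=90 and FP=10, Jeffrey's prior.
_mean, _mode, _lower, _upper = beta(
    successes=90, failures=10, lambda_=0.5, coverage=0.95
)
# _mean is about 0.90; (_lower, _upper) bracket 95% of the posterior density.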

298def beta_posterior( 

299 successes: int, 

300 failures: int, 

301 lambda_: float, 

302 nb_samples: int, 

303 rng: numpy.random.Generator, 

304) -> numpy.typing.NDArray[numpy.double]: 

305 r"""Simulate the beta posterior of a system with the provided markings. 

306 

307 This implementation is based on [GOUTTE-2005]_, Equation 7. 

308 

309 Figures of merit that are supported by this procedure are those which have 

310 the form :math:`v = k / (k + l)` (successes over successes plus failures): 

311 

312 * Precision or Positive-Predictive Value (PPV): :math:`p = TP/(TP+FP)`, so 

313 :math:`k=TP`, :math:`l=FP` 

314 * Recall, Sensitivity, or True Positive Rate: :math:`r = TP/(TP+FN)`, so 

315 :math:`k=TP`, :math:`l=FN` 

316 * Specificity or True Negative Rate: :math:`s = TN/(TN+FP)`, so :math:`k=TN`,

317 :math:`l=FP` 

318 * Accuracy: :math:`acc = (TP+TN)/(TP+TN+FP+FN)`, so :math:`k=TP+TN`,

319 :math:`l=FP+FN` 

320 * Jaccard Index: :math:`j = TP/(TP+FP+FN)`, so :math:`k=TP`, :math:`l=FP+FN` 

321 

322 Parameters 

323 ---------- 

324 successes 

325 Number of successes (``k``) observed on the experiment. May be a 

326 scalar of any integral type, or an array of integral values indicating 

327 a series of experiments to calculate the credible regions for. 

328 failures 

329 Number of failures (``l``) observed on the experiment. May be a scalar 

330 of any integral type, or an array of integral values indicating a 

331 series of experiments to calculate the credible regions for. 

332 lambda_ 

333 The parameterisation of the Beta prior to consider. Use 

334 :math:`\lambda=1` for a flat prior. Use :math:`\lambda=0.5` for 

335 Jeffrey's prior. 

336 nb_samples 

337 Number of generated gamma distribution values. 

338 rng 

339 An initialized numpy random number generator. 

340 

341 Returns 

342 ------- 

343 Variates: An array with size ``nb_samples`` containing a realization of 

344 Equation 7 from [GOUTTE-2005]_. If ``successes`` and ``failures`` are 

345 sequences of arrays, then returns a 2D array where each row represents 

346 one set of realisations. 

347 

348 Raises 

349 ------ 

350 TypeError 

351 If the dimensions of ``successes`` and ``failures`` do not match. 

352 """ 

353 return rng.beta( 

354 a=successes + lambda_, b=failures + lambda_, size=nb_samples 

355 ) 

356 

357 
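# Usage sketch for beta_posterior (illustrative counts): Monte-Carlo samples
# from the posterior of a recall with TP=80 and FN=20, Jeffrey's prior.
_rng = numpy.random.default_rng(1234)
_recall_samples = beta_posterior(80, 20, lambda_=0.5, nb_samples=10000, rng=_rng)
# _recall_samples has shape (10000,); its histogram approximates the posterior.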

358def average_beta_posterior( 

359 successes: typing.Iterable[int], 

360 failures: typing.Iterable[int], 

361 lambda_: float, 

362 nb_samples: int, 

363 rng: numpy.random.Generator, 

364) -> numpy.typing.NDArray[numpy.double]: 

365 r"""Simulate the average beta posterior of many systems. 

366 

367 This implementation is based on [GOUTTE-2005]_, Equation 7. 

368 

369 Figures of merit that are supported by this procedure are those which have 

370 the form :math:`v = k / (k + l)` (successes over successes plus failures): 

371 

372 * Precision or Positive-Predictive Value (PPV): :math:`p = TP/(TP+FP)`, so 

373 :math:`k=TP`, :math:`l=FP` 

374 * Recall, Sensitivity, or True Positive Rate: :math:`r = TP/(TP+FN)`, so 

375 :math:`k=TP`, :math:`l=FN` 

376 * Specificity or True Negative Rate: :math:`s = TN/(TN+FP)`, so :math:`k=TN`, 

377 :math:`l=FP` 

378 * Accuracy: :math:`acc = (TP+TN)/(TP+TN+FP+FN)`, so :math:`k=TP+TN`,

379 :math:`l=FP+FN` 

380 * Jaccard Index: :math:`j = TP/(TP+FP+FN)`, so :math:`k=TP`, :math:`l=FP+FN` 

381 

382 Parameters 

383 ---------- 

384 successes 

385 Sequence of integers expressing the number of successes (``k``) 

386 observed on the various experiments whose metric is to be averaged. 

387 failures 

388 Sequence of integers expressing the number of failures (``l``) 

389 observed on the various experiments whose metric is to be averaged. 

390 lambda_ 

391 The parameterisation of the Beta prior to consider. Use 

392 :math:`\lambda=1` for a flat prior. Use :math:`\lambda=0.5` for 

393 Jeffrey's prior. 

394 nb_samples 

395 Number of generated variates for the M-C simulation. 

396 rng 

397 An initialized numpy random number generator. 

398 

399 Returns 

400 ------- 

401 Variates: An array with size ``nb_samples`` containing a realization of 

402 Equation 7 from [GOUTTE-2005]_, considering the averaging of all input 

403 systems. 

404 """ 

405 successes_array, failures_array = as_int_arrays((successes, failures)) 

406 

407 if successes_array.ndim == 0: 

408 successes_array = numpy.atleast_1d(successes_array) 

409 failures_array = numpy.atleast_1d(failures_array) 

410 

411 return numpy.mean( 

412 [ 

413 beta_posterior(kk, ll, lambda_, nb_samples, rng) 

414 for kk, ll in zip(successes_array, failures_array) 

415 ], 

416 axis=0, 

417 ) 

418 

419 
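# Usage sketch for average_beta_posterior (illustrative counts): posterior of
# the average recall over two hypothetical cross-validation folds.
_rng = numpy.random.default_rng(1234)
_avg_samples = average_beta_posterior(
    successes=[80, 75], failures=[20, 25], lambda_=0.5, nb_samples=10000, rng=_rng
)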

420def evaluate_statistics( 

421 variates: typing.Iterable[float], 

422 coverage: float, 

423 bins: int | str, 

424) -> tuple[float, float, float, float]: 

425 """Evaluate the left and right margins for a given M-C distribution. 

426 

427 Parameters 

428 ---------- 

429 variates 

430 A 1-D array containing the simulated variates. 

431 coverage 

432 A number, between 0 and 1 to indicate the desired coverage. Typically, 

433 this number is set to 0.95 (95% coverage). 

434 bins 

435 Histogram binning used to estimate the mode of the original distribution
436 from which the variates were sampled. A larger number of bins will make
437 it unlikely to discover the true mode. You may also pass any of the

438 string method names defined at :py:func:`numpy.histogram_bin_edges`. A 

439 method such as ``doane`` should work fine in most cases. 

440 

441 Returns 

442 ------- 

443 statistics: mean, (a rough estimate of the) mode, and lower and upper

444 bounds of the credible intervals for the input simulation. 

445 

446 .. note:: 

447 

448 Mode estimation depends on the number of input ``variates``, and 

449 histogram binning (``bins``). Using :py:func:`scipy.stats.mode` can 

450 only evaluate the most common value, which is **not reliable** for

451 continuous variables. 

452 """ 

453 

454 left_half = (1 - coverage) / 2 # size of excluded (half) area 

455 variates_array = numpy.asarray(variates, dtype=numpy.double) 

456 sorted_variates = numpy.sort(variates_array) 

457 

458 # n.b.: we return the equal-tailed range

459 

460 # position of the score which excludes the lower tail (left)

461 lower_index = int(round(len(variates_array) * left_half)) 

462 

463 # position of the score which excludes the upper tail (right)

464 upper_index = int(round(len(variates_array) * (1 - left_half))) 

465 

466 lower = sorted_variates[lower_index - 1] 

467 upper = sorted_variates[upper_index - 1] 

468 

469 # we evaluate the histogram and get the maximum value from there 

470 hist, ranges = numpy.histogram(variates_array, bins=bins) 

471 idx = numpy.argmax(hist) 

472 mode = (ranges[idx] + ranges[idx + 1]) / 2 

473 

474 return ( 

475 numpy.mean(variates_array).item(), 

476 mode, 

477 lower, 

478 upper, 

479 ) 

480 

481 
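# Usage sketch for evaluate_statistics (illustrative): summarise simulated
# variates (here, a plain beta posterior) with 95% coverage and "auto" binning.
_rng = numpy.random.default_rng(1234)
_variates = beta_posterior(80, 20, lambda_=0.5, nb_samples=10000, rng=_rng)
_stats = evaluate_statistics(_variates, coverage=0.95, bins="auto")
# _stats is the tuple (mean, mode, lower, upper) estimated from the simulation.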

482def average_beta( 

483 successes: typing.Iterable[int], 

484 failures: typing.Iterable[int], 

485 lambda_: float, 

486 coverage: float, 

487 nb_samples: int, 

488 rng: numpy.random.Generator, 

489) -> tuple[float, float, float, float]: 

490 r"""Return mean, mode, upper and lower bounds of the credible region of an 

491 average of measures with beta posteriors. 

492 

493 This implementation is based on [GOUTTE-2005]_. 

494 

495 Parameters 

496 ---------- 

497 successes 

498 Sequence of integers expressing the number of successes (``k``) 

499 observed on the various experiments whose metric is to be averaged. 

500 failures 

501 Sequence of integers expressing the number of failures (``l``) 

502 observed on the various experiments whose metric is to be averaged. 

503 lambda_ 

504 The parameterisation of the Beta prior to consider. Use 

505 :math:`\lambda=1` for a flat prior. Use :math:`\lambda=0.5` for 

506 Jeffrey's prior. 

507 coverage 

508 A floating-point number between 0 and 1.0 indicating the coverage 

509 you're expecting. A value of 0.95 will ensure 95% of the area under 

510 the probability density of the posterior is covered by the returned 

511 equal-tailed interval. 

512 nb_samples 

513 Number of generated variates for the M-C simulation. 

514 rng 

515 An initialized numpy random number generator. 

516 

517 Returns 

518 ------- 

519 statistics: mean, mode, lower and upper bounds of the credible interval 

520 for the provided coverage. 

521 """ 

522 

523 variates = average_beta_posterior( 

524 successes, failures, lambda_, nb_samples, rng 

525 ) 

526 return evaluate_statistics(variates, coverage, "auto") 

527 

528 
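# Usage sketch for average_beta (illustrative counts): mean, mode and 95%
# credible interval of the average recall over two hypothetical folds.
_rng = numpy.random.default_rng(1234)
_avg_stats = average_beta(
    successes=[80, 75], failures=[20, 25], lambda_=0.5, coverage=0.95,
    nb_samples=10000, rng=_rng,
)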

529def compare_beta_posteriors( 

530 k1: int, 

531 l1: int, 

532 k2: int, 

533 l2: int, 

534 lambda_: float, 

535 nb_samples: int, 

536 rng: numpy.random.Generator, 

537) -> float: 

538 r"""Return the probability that system 1 is better than system 2 for a 

539 given figure of merit. 

540 

541 This implementation is based on [GOUTTE-2005]_. 

542 

543 Figures of merit that are supported by this procedure are those which have 

544 the form :math:`v = k / (k + l)`: 

545 

546 * Precision or Positive-Predictive Value (PPV): :math:`p = TP/(TP+FP)`, so 

547 :math:`k=TP`, :math:`l=FP` 

548 * Recall, Sensitivity, or True Positive Rate: :math:`r = TP/(TP+FN)`, so 

549 :math:`k=TP`, :math:`l=FN` 

550 * Specificity or True Negative Rate: :math:`s = TN/(TN+FP)`, so :math:`k=TN`,

551 :math:`l=FP` 

552 * Accuracy: :math:`acc = (TP+TN)/(TP+TN+FP+FN)`, so :math:`k=TP+TN`,

553 :math:`l=FP+FN` 

554 * Jaccard Index: :math:`j = TP/(TP+FP+FN)`, so :math:`k=TP`, :math:`l=FP+FN` 

555 

556 Parameters 

557 ---------- 

558 k1 

559 Number of successes for the first system. See various definitions 

560 above. 

561 l1 

562 Number of failures for the first system. See various definitions 

563 above. 

564 k2 

565 Number of successes for the second system. See various definitions 

566 above. Must be syntactically equal to ``k1``.

567 l2 

568 Number of failures for the second system. See various definitions 

569 above. Must be syntactically equal to ``l1``.

570 lambda_ 

571 The parameterisation of the Beta prior to consider. Use 

572 :math:`\lambda=1` for a flat prior. Use :math:`\lambda=0.5` for 

573 Jeffrey's prior. If unsure, use 0.5. 

574 nb_samples 

575 Number of generated variates for the M-C simulation. 

576 rng 

577 An initialized numpy random number generator. 

578 

579 Returns 

580 ------- 

581 A number between 0.0 and 1.0 that describes the probability that the 

582 first system has a bigger measurement than the second. 

583 """ 

584 

585 v1 = beta_posterior(k1, l1, lambda_, nb_samples, rng) 

586 v2 = beta_posterior(k2, l2, lambda_, nb_samples, rng) 

587 return numpy.count_nonzero(v1 > v2) / nb_samples 

588 

589 
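# Usage sketch for compare_beta_posteriors (illustrative counts): probability
# that system 1 (TP=90, FN=10) has a higher recall than system 2 (TP=85, FN=15).
_rng = numpy.random.default_rng(1234)
_prob = compare_beta_posteriors(90, 10, 85, 15, lambda_=0.5, nb_samples=100000, rng=_rng)
# Values close to 1.0 favour system 1; values close to 0.0 favour system 2.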

590def f1_posterior( 

591 tp: int, 

592 fp: int, 

593 fn: int, 

594 lambda_: float, 

595 nb_samples: int, 

596 rng: numpy.random.Generator, 

597) -> numpy.typing.NDArray[numpy.double]: 

598 r"""Simulate the F1-score posterior of a system with the provided markings. 

599 

600 This implementation is based on [GOUTTE-2005]_, Equation 11. 

601 

602 Parameters 

603 ---------- 

604 tp 

605 True positive count, AKA "hit", as an integer scalar. 

606 fp 

607 False positive count, AKA "false alarm", or "Type I error". 

608 fn 

609 False Negative count, AKA "miss", or "Type II error", as a scalar. 

610 lambda_ 

611 The parameterisation of the Beta prior to consider. Use 

612 :math:`\lambda=1` for a flat prior. Use :math:`\lambda=0.5` for 

613 Jeffrey's prior. If unsure, use 0.5. 

614 nb_samples 

615 Number of generated variates for the M-C simulation. 

616 rng 

617 An initialized numpy random number generator. 

618 

619 Returns 

620 ------- 

621 variates: An array with size ``nb_samples`` containing a realization of 

622 Equation 11 of [GOUTTE-2005]_. 

623 """ 

624 u = rng.gamma(shape=(tp + lambda_), scale=2.0, size=nb_samples) 

625 v = rng.gamma(shape=(fp + fn + (2 * lambda_)), scale=1.0, size=nb_samples) 

626 return u / (u + v) 

627 

628 
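# Usage sketch for f1_posterior (illustrative counts): simulate the F1-score
# posterior for TP=80, FP=15, FN=20 and summarise it with 95% coverage.
_rng = numpy.random.default_rng(1234)
_f1_samples = f1_posterior(80, 15, 20, lambda_=0.5, nb_samples=10000, rng=_rng)
_f1_stats = evaluate_statistics(_f1_samples, coverage=0.95, bins="auto")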

629def average_f1_posterior( 

630 tp: typing.Iterable[int], 

631 fp: typing.Iterable[int], 

632 fn: typing.Iterable[int], 

633 lambda_: float, 

634 nb_samples: int, 

635 rng: numpy.random.Generator, 

636) -> numpy.typing.NDArray[numpy.double]: 

637 r"""Simulate the F1-score posterior of an average system with the provided 

638 markings. 

639 

640 This implementation is based on [GOUTTE-2005]_, Equation 11. 

641 

642 Parameters 

643 ---------- 

644 tp 

645 Arrays containing true positive counts, AKA "hit", for all systems to 

646 be considered on the average. 

647 fp 

648 Arrays containing false positive counts, AKA "false alarm", or "Type I 

649 error" for all systems to be considered on the average. 

650 fn 

651 Arrays containing false negative counts, AKA "miss", or "Type II
652 error", for all systems to be considered on the average.

653 lambda_ 

654 The parameterisation of the Beta prior to consider. Use 

655 :math:`\lambda=1` for a flat prior. Use :math:`\lambda=0.5` for 

656 Jeffrey's prior. If unsure, use 0.5. 

657 nb_samples 

658 Number of generated gamma distribution values. 

659 rng 

660 An initialized numpy random number generator. 

661 

662 Returns 

663 ------- 

664 variates: An array with size ``nb_samples`` containing a realization of 

665 Equation 11 from [GOUTTE-2005]_. 

666 """ 

667 tp_array, fp_array, fn_array = as_int_arrays((tp, fp, fn)) 

668 return numpy.mean( 

669 [ 

670 numpy.asarray( 

671 f1_posterior(tp_, fp_, fn_, lambda_, nb_samples, rng), 

672 dtype=numpy.double, 

673 ) 

674 for tp_, fp_, fn_ in zip(tp_array, fp_array, fn_array) 

675 ], 

676 axis=0, 

677 ) 

678 

679 
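# Usage sketch for average_f1_posterior (illustrative counts): posterior of the
# average F1-score over two hypothetical folds.
_rng = numpy.random.default_rng(1234)
_avg_f1 = average_f1_posterior(
    tp=[80, 78], fp=[15, 18], fn=[20, 22], lambda_=0.5, nb_samples=10000, rng=_rng
)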

680def compare_f1_scores( 

681 tp1: int, 

682 fp1: int, 

683 fn1: int, 

684 tp2: int, 

685 fp2: int, 

686 fn2: int, 

687 lambda_: float, 

688 nb_samples: int, 

689 rng: numpy.random.Generator, 

690) -> float: 

691 r"""Return the probability that the F1-score from 1 system is bigger than 

692 the F1-score of a second system. 

693 

694 This implementation is based on [GOUTTE-2005]_. 

695 

696 Parameters 

697 ---------- 

698 tp1 

699 True positive count, AKA "hit" for the first system. 

700 fp1 

701 False positive count, AKA "false alarm", or "Type I error" for the 

702 first system. 

703 fn1 

704 False Negative count, AKA "miss", or "Type II error", for the first 

705 system. 

706 tp2 

707 True positive count, AKA "hit" for the second system. 

708 fp2 

709 False positive count, AKA "false alarm", or "Type I error" for the 

710 second system. 

711 fn2 

712 False Negative count, AKA "miss", or "Type II error", for the second 

713 system. 

714 lambda_ 

715 The parameterisation of the Beta prior to consider. Use 

716 :math:`\lambda=1` for a flat prior. Use :math:`\lambda=0.5` for 

717 Jeffrey's prior. 

718 nb_samples 

719 Number of generated variates for the M-C simulation. 

720 rng 

721 An initialized numpy random number generator. 

722 

723 Returns 

724 ------- 

725 prob: A number between 0.0 and 1.0 that describes the probability that 

726 the first system has a bigger F1-score than the second. 

727 """ 

728 

729 f1 = f1_posterior(tp1, fp1, fn1, lambda_, nb_samples, rng) 

730 f2 = f1_posterior(tp2, fp2, fn2, lambda_, nb_samples, rng) 

731 return numpy.count_nonzero(f1 > f2) / nb_samples 

732 

733 
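# Usage sketch for compare_f1_scores (illustrative counts): probability that
# system 1 (TP=80, FP=15, FN=20) scores a higher F1 than system 2
# (TP=75, FP=20, FN=25).
_rng = numpy.random.default_rng(1234)
_prob_f1 = compare_f1_scores(
    80, 15, 20, 75, 20, 25, lambda_=0.5, nb_samples=100000, rng=_rng
)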

734def compare_systems( 

735 n: typing.Sequence[int], 

736 lambda_: typing.Sequence[float], 

737 nb_samples: int, 

738 rng: numpy.random.Generator, 

739) -> float: 

740 r"""Compare 2 system (binary) outputs using a Dirichlet posterior. 

741 

742 This function returns the empirical probability that a system (1) is better
743 than another system (2), based on their binary outputs. The comparison is

744 carried-out as described in [GOUTTE-2005]_, equations 16 and 19, via a 

745 Monte-Carlo simulation, since the integral of the probability cannot be 

746 resolved analytically. 

747 

748 To do so, we compute :math:`P(\pi_1 > \pi_2)`, i.e. the probability that
749 the rate at which system 1 gives the expected output while system 2 does
750 not is greater than the rate at which system 1 is incorrect while
751 system 2 gives the correct answer. It assumes, therefore, that systems 1
752 and 2 are tuned (thresholded) and provide binary outputs that can be
753 compared to generate 3 numbers:

754 

755 * :math:`n_1`: The measured number of times system 1 provides the correct 

756 answer, whereas system 2 does not 

757 * :math:`n_2`: The measured number of times system 2 provides the correct 

758 answer, whereas system 1 does not 

759 * :math:`n_3`: The measured number of times systems 1 and 2 agree, giving
760 the same answer (wrong or right, it does not matter)

761 

762 Notice that :math:`\pi_1 = \frac{n_1}{n_1 + n_2 + n_3}`, and so, 

763 analogously, you may calculate :math:`\pi_2` and :math:`\pi_3`. 

764 

765 We then plug these numbers into a Dirichlet distribution (a generalisation
766 of the Beta distribution to multiple variables) by setting:

767 

768 * :math:`\alpha_1 = n_1 + \lambda_1` 

769 * :math:`\alpha_2 = n_2 + \lambda_2` 

770 * :math:`\alpha_3 = n_3 + \lambda_3`

771 

772 Each :math:`\lambda_i` corresponds to the prior to be imputed to that
773 particular variable. We typically select :math:`\lambda_1 = \lambda_2`.

774 

775 Parameters 

776 ---------- 

777 n 

778 A triple with 3 integers representing :math:`n_1`, :math:`n_2` and 

779 :math:`n_3`. 

780 lambda_ 

781 A tuple with length 3, containing floating point numbers describing the 

782 parameterisation of the Dirichlet priors to consider. Use 

783 :math:`\lambda_i=0.5` for Jeffrey's prior. 

784 nb_samples 

785 Number of generated Dirichlet distribution values (make this large for
786 higher precision in the simulation).

787 rng 

788 An initialized numpy random number generator. 

789 

790 Returns 

791 ------- 

792 probability: A number between 0.0 and 1.0 that describes the 

793 probability that the first system is better than the second one. 

794 """ 

795 assert len(n) == 3 

796 assert len(lambda_) == 3 

797 samples = rng.dirichlet( 

798 numpy.array(n) + numpy.array(lambda_), size=nb_samples 

799 ) 

800 return numpy.count_nonzero(samples[:, 0] > samples[:, 1]) / nb_samples
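# Usage sketch for compare_systems (illustrative counts): system 1 is correct
# while system 2 errs 25 times, the opposite happens 15 times, and both systems
# agree 160 times; Jeffrey's prior on all three Dirichlet components.
_rng = numpy.random.default_rng(1234)
_prob_sys1 = compare_systems(
    n=[25, 15, 160], lambda_=[0.5, 0.5, 0.5], nb_samples=100000, rng=_rng
)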