Coverage for src/credible/bayesian/utils.py: 87%
68 statements
« prev ^ index » next coverage.py v7.4.2, created at 2024-04-16 09:18 +0200
« prev ^ index » next coverage.py v7.4.2, created at 2024-04-16 09:18 +0200
1# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
2#
3# SPDX-License-Identifier: GPL-3.0-or-later
4"""Functions to evalute the (Bayesian) credible region of measures.
6(Bayesian) credible region interpretation, with 95% coverage: The probability
7that the true proportion will lie within the 95% credible interval is 0.95.
9Contrary to frequentist approaches, in which one can only say that if the test
10were repeated an infinite number of times, and one constructed a confidence
11interval each time, then X% of the confidence intervals would contain the true
12rate, here we can say that given our observed data, there is a X% probability
13that the true value of :math:`k/n` falls within the provided interval.
15See a discussion in `Five Confidence Intervals for Proportions That You
16Should Know About <ci-evaluation_>`_ for a study on coverage for most common
17methods.
19.. note::
21 For a disambiguation with `Confidence Interval <confidence-interval_>`_ (the
22 frequentist approach), read `Credible Regions or Intervals
23 <credible-interval_>`_.
25.. include:: ../links.rst
26"""
28import typing
30import numpy
31import numpy.random
32import numpy.typing
33import scipy.special
35from ..utils import as_int_arrays
38def _beta_ndarray(
39 successes: numpy.typing.NDArray[numpy.integer],
40 failures: numpy.typing.NDArray[numpy.integer],
41 lambda_: float,
42 coverage: float,
43) -> tuple[
44 numpy.typing.NDArray[numpy.double],
45 numpy.typing.NDArray[numpy.double],
46 numpy.typing.NDArray[numpy.double],
47 numpy.typing.NDArray[numpy.double],
48]:
49 r""":py:func:`beta`, for multiple systems.
51 Parameters
52 ----------
53 successes
54 Number of successes (``k``) observed on the experiment.
55 failures
56 Number of failures (``l``) observed on the experiment.
57 lambda_
58 The parameterisation of the Beta prior to consider. Use
59 :math:`\lambda=1` for a flat prior. Use :math:`\lambda=0.5` for
60 Jeffrey's prior. If unsure, use 0.5.
61 coverage
62 A floating-point number between 0 and 1.0 indicating the
63 coverage you're expecting. A value of 0.95 will ensure 95%
64 of the area under the probability density of the posterior
65 is covered by the returned equal-tailed interval.
67 Returns
68 -------
69 A tuple containing 4 floating point numbers representing:
71 * mean: The mean of the posterior distribution
72 * mode: The mode of the posterior distribution
73 * lower: The lower bounds of the credible region
74 * upper: The upper bounds of the credible region
76 If the input was a 1-D array with multiple experiments, then the return
77 value of this function is also composed of 1-D arrays, representing the
78 mean, mode, lower and upper bounds of the various credible regions
79 defined for each of the success/failure tuples on the input.
81 Raises
82 ------
83 TypeError
84 If the dimensions of ``successes`` and ``failures`` do not match, or in
85 case the input types are unsupported.
86 """
88 # we return the equally-tailed range
89 right = (1.0 - coverage) / 2 # half-width in each side
90 lower = scipy.special.betaincinv(
91 successes + lambda_, failures + lambda_, right
92 )
93 upper = scipy.special.betaincinv(
94 successes + lambda_, failures + lambda_, 1.0 - right
95 )
97 # evaluate mean and mode (https://en.wikipedia.org/wiki/Beta_distribution)
98 alpha = successes + lambda_
99 beta = failures + lambda_
101 mean = numpy.nan_to_num(alpha / (alpha + beta))
103 # the mode of a beta distribution is a bit tricky
104 mode = numpy.zeros_like(lower)
105 cond = (alpha > 1) & (beta > 1)
106 mode[cond] = (alpha[cond] - 1) / (alpha[cond] + beta[cond] - 2)
107 # In the case of precision, if the threshold is close to 1.0, both TP
108 # and FP can be zero, which may cause this condition to be reached, if
109 # the prior is exactly 1 (flat prior). This is a weird situation,
110 # because effectively we are trying to compute the posterior when the
111 # total number of experiments is zero. So, only the prior counts - but
112 # the prior is flat, so we should just pick a value. We choose the
113 # middle of the range.
114 # conda = alpha == 1 and beta == 1
115 # mode[cond] = 0.0
116 # conda = alpha <= 1 and beta > 1
117 # mode[cond] = 0.0
118 mode[(alpha > 1) & (beta <= 1)] = 1.0
119 # else: #elif alpha < 1 and beta < 1:
120 # in the case of precision, if the threshold is close to 1.0, both TP
121 # and FP can be zero, which may cause this condition to be reached, if
122 # the prior is smaller than 1. This is a weird situation, because
123 # effectively we are trying to compute the posterior when the total
124 # number of experiments is zero. So, only the prior counts - but the
125 # prior is bimodal, so we should just pick a value. We choose the
126 # left of the range.
127 # n.b.: could also be 1.0 as the prior is bimodal
128 # mode[alpha < 1 and beta < 1] = 0.0
130 return mean, mode, lower, upper
133def beta_array(
134 successes: typing.Iterable[int],
135 failures: typing.Iterable[int],
136 lambda_: float,
137 coverage: float,
138) -> tuple[
139 numpy.typing.NDArray[numpy.double],
140 numpy.typing.NDArray[numpy.double],
141 numpy.typing.NDArray[numpy.double],
142 numpy.typing.NDArray[numpy.double],
143]:
144 r""":py:func:`beta`, for multiple systems.
146 Parameters
147 ----------
148 successes
149 Number of successes (``k``) observed on the experiment.
150 failures
151 Number of failures (``l``) observed on the experiment.
152 lambda_
153 The parameterisation of the Beta prior to consider. Use
154 :math:`\lambda=1` for a flat prior. Use :math:`\lambda=0.5` for
155 Jeffrey's prior. If unsure, use 0.5.
156 coverage
157 A floating-point number between 0 and 1.0 indicating the
158 coverage you're expecting. A value of 0.95 will ensure 95%
159 of the area under the probability density of the posterior
160 is covered by the returned equal-tailed interval.
162 Returns
163 -------
164 A tuple containing 4 floating point numbers representing:
166 * mean: The mean of the posterior distribution
167 * mode: The mode of the posterior distribution
168 * lower: The lower bounds of the credible region
169 * upper: The upper bounds of the credible region
171 If the input was a 1-D array with multiple experiments, then the return
172 value of this function is also composed of 1-D arrays, representing the
173 mean, mode, lower and upper bounds of the various credible regions
174 defined for each of the success/failure tuples on the input.
176 Raises
177 ------
178 TypeError
179 If the dimensions of ``successes`` and ``failures`` do not match, or in
180 case the input types are unsupported.
181 """
182 successes_array, failures_array = as_int_arrays((successes, failures))
183 return _beta_ndarray(successes_array, failures_array, lambda_, coverage)
186def beta(
187 successes: int, failures: int, lambda_: float, coverage: float
188) -> tuple[float, float, float, float]:
189 r"""Return the mode, upper and lower bounds of the equal-tailed credible
190 region of a probability estimate following Bernoulli trials.
192 This technique is (not) very conservative - in most of the cases, coverage
193 closer to the extremes (0 or 1) is lower than expected (but still greater
194 than 85%).
196 This implementation is based on [GOUTTE-2005]_. It assumes :math:`k`
197 successes and :math:`l` failures (:math:`n = k+l` total trials) are issued
198 from a series of Bernoulli trials (likelihood is binomial). The posterior
199 is derivated using the Bayes Theorem with a beta prior. As there is no
200 reason to favour high vs. low precision, we use a symmetric Beta prior
201 (:math:`\alpha=\beta`):
203 .. math::
205 P(p|k,n) &= \frac{P(k,n|p)P(p)}{P(k,n)} \\
206 P(p|k,n) &= \frac{\frac{n!}{k!(n-k)!}p^{k}(1-p)^{n-k}P(p)}{P(k)} \\
207 P(p|k,n) &= \frac{1}{B(k+\alpha, n-k+\beta)}p^{k+\alpha-1}(1-p)^{n-k+\beta-1} \\
208 P(p|k,n) &= \frac{1}{B(k+\alpha, n-k+\alpha)}p^{k+\alpha-1}(1-p)^{n-k+\alpha-1}
210 The mode for this posterior (also the maximum a posteriori) is:
212 .. math::
214 \text{mode}(p) = \frac{k+\lambda-1}{n+2\lambda-2}
216 Concretely, the prior may be flat (all rates are equally likely,
217 :math:`\lambda=1`) or we may use Jeoffrey's prior (:math:`\lambda=0.5`),
218 that is invariant through re-parameterisation. Jeffrey's prior indicate
219 that rates close to zero or one are more likely.
221 The mode above works if :math:`k+{\alpha},n-k+{\alpha} > 1`, which is
222 usually the case for a resonably well tunned system, with more than a few
223 samples for analysis. In the limit of the system performance, :math:`k`
224 may be 0, which will make the mode become zero.
226 For our purposes, it may be more suitable to represent :math:`n = k + l`,
227 with :math:`k`, the number of successes and :math:`l`, the number of
228 failures in the binomial experiment, and find this more suitable
229 representation:
231 .. math::
233 P(p|k,l) &= \frac{1}{B(k+\alpha, l+\alpha)}p^{k+\alpha-1}(1-p)^{l+\alpha-1} \\
234 \text{mode}(p) &= \frac{k+\lambda-1}{k+l+2\lambda-2}
236 This can be mapped to most rates calculated in the context of binary
237 classification this way:
239 * Precision or Positive-Predictive Value (PPV): p = TP/(TP+FP), so k=TP, l=FP
240 * Recall, Sensitivity, or True Positive Rate: r = TP/(TP+FN), so k=TP, l=FN
241 * Specificity or True Negative Rage: s = TN/(TN+FP), so k=TN, l=FP
242 * Accuracy: acc = TP+TN/(TP+TN+FP+FN), so k=TP+TN, l=FP+FN
243 * Jaccard: j = TP/(TP+FP+FN), so k=TP, l=FP+FN
245 .. note:: **Important**
247 To calculate the limits given the required coverage, we use the
248 incomplete **inverse** (regularized, or normalized) beta function,
249 :any:`scipy.special.betaincinv` instead of :any:`scipy.special.betainc`.
250 The latter requires we provide the bounds and returns the coverage,
251 whereas here we are interested in the *inverse* behaviour.
253 Parameters
254 ----------
255 successes
256 Number of successes (``k``) observed on the experiment.
257 failures
258 Number of failures (``l``) observed on the experiment.
259 lambda_
260 The parameterisation of the Beta prior to consider. Use
261 :math:`\lambda=1` for a flat prior. Use :math:`\lambda=0.5` for
262 Jeffrey's prior. If unsure, use 0.5.
263 coverage
264 A floating-point number between 0 and 1.0 indicating the
265 coverage you're expecting. A value of 0.95 will ensure 95%
266 of the area under the probability density of the posterior
267 is covered by the returned equal-tailed interval.
269 Returns
270 -------
271 A tuple containing 4 floating point numbers representing:
273 * mean: The mean of the posterior distribution
274 * mode: The mode of the posterior distribution
275 * lower: The lower bounds of the credible region
276 * upper: The upper bounds of the credible region
278 If the input was a 1-D array with multiple experiments, then the return
279 value of this function is also composed of 1-D arrays, representing the
280 mean, mode, lower and upper bounds of the various credible regions
281 defined for each of the success/failure tuples on the input.
283 Raises
284 ------
285 TypeError
286 If the dimensions of ``successes`` and ``failures`` do not match, or in
287 case the input types are unsupported.
288 """
289 retval = beta_array([successes], [failures], lambda_, coverage)
290 return (
291 retval[0].item(),
292 retval[1].item(),
293 retval[2].item(),
294 retval[3].item(),
295 )
298def beta_posterior(
299 successes: int,
300 failures: int,
301 lambda_: float,
302 nb_samples: int,
303 rng: numpy.random.Generator,
304) -> numpy.typing.NDArray[numpy.double]:
305 r"""Simulate the beta posterior of a system with the provided markings.
307 This implementation is based on [GOUTTE-2005]_, Equation 7.
309 Figures of merit that are supported by this procedure are those which have
310 the form :math:`v = k / (k + l)` (successes over successes plus failures):
312 * Precision or Positive-Predictive Value (PPV): :math:`p = TP/(TP+FP)`, so
313 :math:`k=TP`, :math:`l=FP`
314 * Recall, Sensitivity, or True Positive Rate: :math:`r = TP/(TP+FN)`, so
315 :math:`k=TP`, :math:`l=FN`
316 * Specificity or True Negative Rage: :math:`s = TN/(TN+FP)`, so :math:`k=TN`,
317 :math:`l=FP`
318 * Accuracy: :math:`acc = TP+TN/(TP+TN+FP+FN)`, so :math:`k=TP+TN`,
319 :math:`l=FP+FN`
320 * Jaccard Index: :math:`j = TP/(TP+FP+FN)`, so :math:`k=TP`, :math:`l=FP+FN`
322 Parameters
323 ----------
324 successes
325 Number of successes (``k``) observed on the experiment. May be a
326 scalar of any integral type, or an array of integral values indicating
327 a series of experiments to calculate the credible regions for.
328 failures
329 Number of failures (``l``) observed on the experiment. May be a scalar
330 of any integral type, or an array of integral values indicating a
331 series of experiments to calculate the credible regions for.
332 lambda_
333 The parameterisation of the Beta prior to consider. Use
334 :math:`\lambda=1` for a flat prior. Use :math:`\lambda=0.5` for
335 Jeffrey's prior.
336 nb_samples
337 Number of generated gamma distribution values.
338 rng
339 An initialized numpy random number generator.
341 Returns
342 -------
343 Variates: An array with size ``nb_samples`` containing a realization of
344 Equation 7 from [GOUTTE-2005]_. If ``successes`` and ``failures`` are
345 sequences of arrays, then returns a 2D array where each row represents
346 one set of realisations.
348 Raises
349 ------
350 TypeError
351 If the dimensions of ``successes`` and ``failures`` do not match.
352 """
353 return rng.beta(
354 a=successes + lambda_, b=failures + lambda_, size=nb_samples
355 )
358def average_beta_posterior(
359 successes: typing.Iterable[int],
360 failures: typing.Iterable[int],
361 lambda_: float,
362 nb_samples: int,
363 rng: numpy.random.Generator,
364) -> numpy.typing.NDArray[numpy.double]:
365 r"""Simulate the average beta posterior of many systems.
367 This implementation is based on [GOUTTE-2005]_, Equation 7.
369 Figures of merit that are supported by this procedure are those which have
370 the form :math:`v = k / (k + l)` (successes over successes plus failures):
372 * Precision or Positive-Predictive Value (PPV): :math:`p = TP/(TP+FP)`, so
373 :math:`k=TP`, :math:`l=FP`
374 * Recall, Sensitivity, or True Positive Rate: :math:`r = TP/(TP+FN)`, so
375 :math:`k=TP`, :math:`l=FN`
376 * Specificity or True Negative Rate: :math:`s = TN/(TN+FP)`, so :math:`k=TN`,
377 :math:`l=FP`
378 * Accuracy: :math:`acc = TP+TN/(TP+TN+FP+FN)`, so :math:`k=TP+TN`,
379 :math:`l=FP+FN`
380 * Jaccard Index: :math:`j = TP/(TP+FP+FN)`, so :math:`k=TP`, :math:`l=FP+FN`
382 Parameters
383 ----------
384 successes
385 Sequence of integers expressing the number of successes (``k``)
386 observed on the various experiments whose metric is to be averaged.
387 failures
388 Sequence of integers expressing the number of failures (``l``)
389 observed on the various experiments whose metric is to be averaged.
390 lambda_
391 The parameterisation of the Beta prior to consider. Use
392 :math:`\lambda=1` for a flat prior. Use :math:`\lambda=0.5` for
393 Jeffrey's prior.
394 nb_samples
395 Number of generated variates for the M-C simulation.
396 rng
397 An initialized numpy random number generator.
399 Returns
400 -------
401 Variates: An array with size ``nb_samples`` containing a realization of
402 Equation 7 from [GOUTTE-2005]_, considering the averaging of all input
403 systems.
404 """
405 successes_array, failures_array = as_int_arrays((successes, failures))
407 if successes_array.ndim == 0:
408 successes_array = numpy.atleast_1d(successes_array)
409 failures_array = numpy.atleast_1d(failures_array)
411 return numpy.mean(
412 [
413 beta_posterior(kk, ll, lambda_, nb_samples, rng)
414 for kk, ll in zip(successes_array, failures_array)
415 ],
416 axis=0,
417 )
420def evaluate_statistics(
421 variates: typing.Iterable[float],
422 coverage: float,
423 bins: int | str,
424) -> tuple[float, float, float, float]:
425 """Evaluate the left and right margins for a given M-C distribution.
427 Parameters
428 ----------
429 variates
430 A 1-D array containing the simulated variates.
431 coverage
432 A number, between 0 and 1 to indicate the desired coverage. Typically,
433 this number is set to 0.95 (95% coverage).
434 bins
435 Histogram binning for defining the mode of the original distribution
436 where variates where sampled from. A larger number of bins will make
437 it unlikely to discover the true mode. You may also pass any of the
438 string method names defined at :py:func:`numpy.histogram_bin_edges`. A
439 method such as ``doane`` should work fine in most cases.
441 Returns
442 -------
443 statistics: mean, (a rough estimated of the) mode and lower and upper
444 bounds of the credible intervals for the input simulation.
446 .. note::
448 Mode estimation depends on the number of input ``variates``, and
449 histogram binning (``bins``). Using :py:func:`scipy.stats.mode` can
450 only evaluate most common value, which is **not reliable** for
451 continuous variables.
452 """
454 left_half = (1 - coverage) / 2 # size of excluded (half) area
455 variates_array = numpy.asarray(variates, dtype=numpy.double)
456 sorted_variates = numpy.sort(variates_array)
458 # n.b.: we return the equally tailed range
460 # calculates position of score which would exclude the left_half (left)
461 lower_index = int(round(len(variates_array) * left_half))
463 # calculates position of score which would exclude the right_half (right)
464 upper_index = int(round(len(variates_array) * (1 - left_half)))
466 lower = sorted_variates[lower_index - 1]
467 upper = sorted_variates[upper_index - 1]
469 # we evaluate the histogram and get the maximum value from there
470 hist, ranges = numpy.histogram(variates_array, bins=bins)
471 idx = numpy.argmax(hist)
472 mode = (ranges[idx] + ranges[idx + 1]) / 2
474 return (
475 numpy.mean(variates_array).item(),
476 mode,
477 lower,
478 upper,
479 )
482def average_beta(
483 successes: typing.Iterable[int],
484 failures: typing.Iterable[int],
485 lambda_: float,
486 coverage: float,
487 nb_samples: int,
488 rng: numpy.random.Generator,
489) -> tuple[float, float, float, float]:
490 r"""Return mean, mode, upper and lower bounds of the credible region of an
491 average of measures with beta posteriors.
493 This implementation is based on [GOUTTE-2005]_.
495 Parameters
496 ----------
497 successes
498 Sequence of integers expressing the number of successes (``k``)
499 observed on the various experiments whose metric is to be averaged.
500 failures
501 Sequence of integers expressing the number of failures (``l``)
502 observed on the various experiments whose metric is to be averaged.
503 lambda_
504 The parameterisation of the Beta prior to consider. Use
505 :math:`\lambda=1` for a flat prior. Use :math:`\lambda=0.5` for
506 Jeffrey's prior.
507 coverage
508 A floating-point number between 0 and 1.0 indicating the coverage
509 you're expecting. A value of 0.95 will ensure 95% of the area under
510 the probability density of the posterior is covered by the returned
511 equal-tailed interval.
512 nb_samples
513 Number of generated variates for the M-C simulation.
514 rng
515 An initialized numpy random number generator.
517 Returns
518 -------
519 statistics: mean, mode, lower and upper bounds of the credible interval
520 for the provided coverage.
521 """
523 variates = average_beta_posterior(
524 successes, failures, lambda_, nb_samples, rng
525 )
526 return evaluate_statistics(variates, coverage, "auto")
529def compare_beta_posteriors(
530 k1: int,
531 l1: int,
532 k2: int,
533 l2: int,
534 lambda_: float,
535 nb_samples: int,
536 rng: numpy.random.Generator,
537) -> float:
538 r"""Return the probability that system 1 is better than system 2 for a
539 given figure of merit.
541 This implementation is based on [GOUTTE-2005]_.
543 Figures of merit that are supported by this procedure are those which have
544 the form :math:`v = k / (k + l)`:
546 * Precision or Positive-Predictive Value (PPV): :math:`p = TP/(TP+FP)`, so
547 :math:`k=TP`, :math:`l=FP`
548 * Recall, Sensitivity, or True Positive Rate: :math:`r = TP/(TP+FN)`, so
549 :math:`k=TP`, :math:`l=FN`
550 * Specificity or True Negative Rage: :math:`s = TN/(TN+FP)`, so :math:`k=TN`,
551 :math:`l=FP`
552 * Accuracy: :math:`acc = TP+TN/(TP+TN+FP+FN)`, so :math:`k=TP+TN`,
553 :math:`l=FP+FN`
554 * Jaccard Index: :math:`j = TP/(TP+FP+FN)`, so :math:`k=TP`, :math:`l=FP+FN`
556 Parameters
557 ----------
558 k1
559 Number of successes for the first system. See various definitions
560 above.
561 l1
562 Number of failures for the first system. See various definitions
563 above.
564 k2
565 Number of successes for the second system. See various definitions
566 above. Must be syntatically equal to ``k1``.
567 l2
568 Number of failures for the second system. See various definitions
569 above. Must be syntatically equal to ``l1``.
570 lambda_
571 The parameterisation of the Beta prior to consider. Use
572 :math:`\lambda=1` for a flat prior. Use :math:`\lambda=0.5` for
573 Jeffrey's prior. If unsure, use 0.5.
574 nb_samples
575 Number of generated variates for the M-C simulation.
576 rng
577 An initialized numpy random number generator.
579 Returns
580 -------
581 A number between 0.0 and 1.0 that describes the probability that the
582 first system has a bigger measurement than the second.
583 """
585 v1 = beta_posterior(k1, l1, lambda_, nb_samples, rng)
586 v2 = beta_posterior(k2, l2, lambda_, nb_samples, rng)
587 return numpy.count_nonzero(v1 > v2) / nb_samples
590def f1_posterior(
591 tp: int,
592 fp: int,
593 fn: int,
594 lambda_: float,
595 nb_samples: int,
596 rng: numpy.random.Generator,
597) -> numpy.typing.NDArray[numpy.double]:
598 r"""Simulate the F1-score posterior of a system with the provided markings.
600 This implementation is based on [GOUTTE-2005]_, Equation 11.
602 Parameters
603 ----------
604 tp
605 True positive count, AKA "hit", as an integer scalar.
606 fp
607 False positive count, AKA "false alarm", or "Type I error".
608 fn
609 False Negative count, AKA "miss", or "Type II error", as a scalar.
610 lambda_
611 The parameterisation of the Beta prior to consider. Use
612 :math:`\lambda=1` for a flat prior. Use :math:`\lambda=0.5` for
613 Jeffrey's prior. If unsure, use 0.5.
614 nb_samples
615 Number of generated variates for the M-C simulation.
616 rng
617 An initialized numpy random number generator.
619 Returns
620 -------
621 variates: An array with size ``nb_samples`` containing a realization of
622 Equation 11 of [GOUTTE-2005]_.
623 """
624 u = rng.gamma(shape=(tp + lambda_), scale=2.0, size=nb_samples)
625 v = rng.gamma(shape=(fp + fn + (2 * lambda_)), scale=1.0, size=nb_samples)
626 return u / (u + v)
629def average_f1_posterior(
630 tp: typing.Iterable[int],
631 fp: typing.Iterable[int],
632 fn: typing.Iterable[int],
633 lambda_: float,
634 nb_samples: int,
635 rng: numpy.random.Generator,
636) -> numpy.typing.NDArray[numpy.double]:
637 r"""Simulate the F1-score posterior of an average system with the provided
638 markings.
640 This implementation is based on [GOUTTE-2005]_, Equation 11.
642 Parameters
643 ----------
644 tp
645 Arrays containing true positive counts, AKA "hit", for all systems to
646 be considered on the average.
647 fp
648 Arrays containing false positive counts, AKA "false alarm", or "Type I
649 error" for all systems to be considered on the average.
650 fn
651 Arrays containing false Negative counts, AKA "miss", or "Type II
652 error" for all systems to be considered on the average..
653 lambda_
654 The parameterisation of the Beta prior to consider. Use
655 :math:`\lambda=1` for a flat prior. Use :math:`\lambda=0.5` for
656 Jeffrey's prior. If unsure, use 0.5.
657 nb_samples
658 Number of generated gamma distribution values.
659 rng
660 An initialized numpy random number generator.
662 Returns
663 -------
664 variates: An array with size ``nb_samples`` containing a realization of
665 Equation 11 from [GOUTTE-2005]_.
666 """
667 tp_array, fp_array, fn_array = as_int_arrays((tp, fp, fn))
668 return numpy.mean(
669 [
670 numpy.asarray(
671 f1_posterior(tp_, fp_, fn_, lambda_, nb_samples, rng),
672 dtype=numpy.double,
673 )
674 for tp_, fp_, fn_ in zip(tp_array, fp_array, fn_array)
675 ],
676 axis=0,
677 )
680def compare_f1_scores(
681 tp1: int,
682 fp1: int,
683 fn1: int,
684 tp2: int,
685 fp2: int,
686 fn2: int,
687 lambda_: float,
688 nb_samples: int,
689 rng: numpy.random.Generator,
690) -> float:
691 r"""Return the probability that the F1-score from 1 system is bigger than
692 the F1-score of a second system.
694 This implementation is based on [GOUTTE-2005]_.
696 Parameters
697 ----------
698 tp1
699 True positive count, AKA "hit" for the first system.
700 fp1
701 False positive count, AKA "false alarm", or "Type I error" for the
702 first system.
703 fn1
704 False Negative count, AKA "miss", or "Type II error", for the first
705 system.
706 tp2
707 True positive count, AKA "hit" for the second system.
708 fp2
709 False positive count, AKA "false alarm", or "Type I error" for the
710 second system.
711 fn2
712 False Negative count, AKA "miss", or "Type II error", for the second
713 system.
714 lambda_
715 The parameterisation of the Beta prior to consider. Use
716 :math:`\lambda=1` for a flat prior. Use :math:`\lambda=0.5` for
717 Jeffrey's prior.
718 nb_samples
719 Number of generated variates for the M-C simulation.
720 rng
721 An initialized numpy random number generator.
723 Returns
724 -------
725 prob: A number between 0.0 and 1.0 that describes the probability that
726 the first system has a bigger F1-score than the second.
727 """
729 f1 = f1_posterior(tp1, fp1, fn1, lambda_, nb_samples, rng)
730 f2 = f1_posterior(tp2, fp2, fn2, lambda_, nb_samples, rng)
731 return numpy.count_nonzero(f1 > f2) / nb_samples
734def compare_systems(
735 n: typing.Sequence[int],
736 lambda_: typing.Sequence[float],
737 nb_samples: int,
738 rng: numpy.random.Generator,
739) -> float:
740 r"""Compare 2 system (binary) outputs using a Dirichlet posterior.
742 This function returns the empyrical probability that a system (1) is better
743 another system (2), based on their binary outputs. The comparison is
744 carried-out as described in [GOUTTE-2005]_, equations 16 and 19, via a
745 Monte-Carlo simulation, since the integral of the probability cannot be
746 resolved analytically.
748 To do so, we compute the probability that :math:`P(\pi_1 > \pi_2)`, i.e.,
749 the probability that system 1 gives the expected output while system 2 does
750 not, is greater than the probability that system 1 is incorrect while
751 system 2 gives the correct answer. It assumes, therefore, systems 1 and 2
752 are tuned (thresholded), and provide binary outputs that can be compared to
753 generate 3 numbers:
755 * :math:`n_1`: The measured number of times system 1 provides the correct
756 answer, whereas system 2 does not
757 * :math:`n_2`: The measured number of times system 2 provides the correct
758 answer, whereas system 1 does not
759 * :math:`n_3`: The measured number of times system 1 and 2 agree, giving
760 the same answer (wrong or write, it does not matter)
762 Notice that :math:`\pi_1 = \frac{n_1}{n_1 + n_2 + n_3}`, and so,
763 analogously, you may calculate :math:`\pi_2` and :math:`\pi_3`.
765 We then plug these numbers to simulate a Dirichlet (generalisation of the
766 Beta distribution for multiple variables) by setting:
768 * :math:`\alpha_1 = n_1 + \lambda_1`
769 * :math:`\alpha_2 = n_2 + \lambda_2`
770 * :math:`\alpha_3 = n_2 + \lambda_3`
772 Where each :math:`\lambda_i` correspond to the prior to be imputed to that
773 particular variable. We typically select :math:`\lambda_1 = \lambda_2`,
775 Parameters
776 ----------
777 n
778 A triple with 3 integers representing :math:`n_1`, :math:`n_2` and
779 :math:`n_3`.
780 lambda_
781 A tuple with length 3, containing floating point numbers describing the
782 parameterisation of the Dirichlet priors to consider. Use
783 :math:`\lambda_i=0.5` for Jeffrey's prior.
784 nb_samples
785 Number of generated dirichlet distribution values (make this high, for
786 a higher precision on the simulation).
787 rng
788 An initialized numpy random number generator.
790 Returns
791 -------
792 probability: A number between 0.0 and 1.0 that describes the
793 probability that the first system is better than the second one.
794 """
795 assert len(n) == 3
796 assert len(lambda_) == 3
797 samples = rng.dirichlet(
798 numpy.array(n) + numpy.array(lambda_), size=nb_samples
799 )
800 return numpy.count_nonzero(samples[:, 0] > samples[:, 1]) / nb_samples