1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
4from collections import deque
5
6import numpy
7import scipy.special
8import torch
9
10
class SmoothedValue:
    """Track a series of values and provide access to smoothed values over a
    window (the most recent ``window_size`` updates).

    Parameters
    ----------

    window_size : :py:class:`int`, Optional
        Maximum number of most recent values retained for smoothing.  Older
        values are discarded automatically once the window is full.
    """

    def __init__(self, window_size=20):
        self.deque = deque(maxlen=window_size)

    def update(self, value):
        """Records a new value in the smoothing window"""
        self.deque.append(value)

    @property
    def median(self):
        """Median of the values currently in the window"""
        # cast to float explicitly: torch reductions such as mean() are not
        # implemented for integer dtypes, and callers may push int values
        d = torch.tensor(list(self.deque), dtype=torch.float64)
        return d.median().item()

    @property
    def avg(self):
        """Arithmetic mean of the values currently in the window"""
        # see note in median() about the explicit float dtype
        d = torch.tensor(list(self.deque), dtype=torch.float64)
        return d.mean().item()
31
32
def tricky_division(n, d):
    """Divides ``n`` by ``d``.  Returns 0.0 in case of a division by zero.

    The previous implementation (``n / (d + (d == 0))``) only honoured this
    contract when ``n`` was also zero; for ``d == 0`` and ``n != 0`` it
    silently returned ``n``.  All rate computations in this module have
    ``n <= d``, so the observable behaviour for in-file callers is unchanged.
    """

    return n / d if d else 0.0
37
38
def base_measures(tp, fp, tn, fn):
    """Calculates frequentist measures from true/false positive and negative
    counts

    Computes (frequentist versions of) standard machine learning measures
    from true and false positive counts of positives and negatives.  For a
    thorough look into these and alternate names for the returned values,
    check Wikipedia's entry on `Precision and Recall
    <https://en.wikipedia.org/wiki/Precision_and_recall>`_.


    Parameters
    ----------

    tp : int
        True positive count, AKA "hit"

    fp : int
        False positive count, AKA "false alarm", or "Type I error"

    tn : int
        True negative count, AKA "correct rejection"

    fn : int
        False Negative count, AKA "miss", or "Type II error"


    Returns
    -------

    precision : float
        P, AKA positive predictive value (PPV), ``tp/(tp+fp)``.  Zero if
        ``tp+fp == 0``.

    recall : float
        R, AKA sensitivity, hit rate, or true positive rate (TPR),
        ``tp/(tp+fn)``.  Zero if ``tp+fn == 0``.

    specificity : float
        S, AKA selectivity or true negative rate (TNR), ``tn/(tn+fp)``.
        Zero if ``tn+fp == 0``.

    accuracy : float
        A, the proportion of correct predictions (both true positives and
        true negatives) among the total number of pixels examined:
        ``(tp+tn)/(tp+tn+fp+fn)``.  See `Accuracy
        <https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers>`_.
        Because it includes true negatives in the numerator, it remains
        sensitive to data or regions without annotations.

    jaccard : float
        J, the `Jaccard Index or Similarity
        <https://en.wikipedia.org/wiki/Jaccard_index>`_, ``tp/(tp+fp+fn)``.
        Zero if ``tp+fp+fn == 0``.  With a TP-only numerator (like F1), it
        is always zero for regions without annotations, irrespective of the
        model output; accuracy may be a better proxy if true absence of
        annotations must count towards the measure.

    f1_score : float
        F1, see `F1-score <https://en.wikipedia.org/wiki/F1_score>`_,
        arithmetically ``2*P*R/(P+R)`` or ``2*tp/(2*tp+fp+fn)``.  Zero if
        ``P+R == (2*tp+fp+fn) == 0``.  Like the Jaccard index, the F1/Dice
        score has a TP-only numerator and is always zero for regions
        without annotations; accuracy may be a better proxy in that case.

    """

    precision = tricky_division(tp, tp + fp)
    recall = tricky_division(tp, tp + fn)
    specificity = tricky_division(tn, fp + tn)
    accuracy = tricky_division(tp + tn, tp + fp + fn + tn)
    jaccard = tricky_division(tp, tp + fp + fn)
    f1_score = tricky_division(2 * tp, (2 * tp) + fp + fn)

    return precision, recall, specificity, accuracy, jaccard, f1_score
125
126
def beta_credible_region(k, l, lambda_, coverage):
    """
    Returns the mean, mode, upper and lower bounds of the equal-tailed
    credible region of a probability estimate following Bernoulli trials.

    This implementation is based on [GOUTTE-2005]_.  It assumes :math:`k`
    successes and :math:`l` failures (:math:`n = k+l` total trials) are issued
    from a series of Bernoulli trials (likelihood is binomial).  The posterior
    is derivated using the Bayes Theorem with a beta prior.  As there is no
    reason to favour high vs. low precision, we use a symmetric Beta prior
    (:math:`\\alpha=\\beta`):

    .. math::

       P(p|k,n) &= \\frac{P(k,n|p)P(p)}{P(k,n)} \\\\
       P(p|k,n) &= \\frac{\\frac{n!}{k!(n-k)!}p^{k}(1-p)^{n-k}P(p)}{P(k)} \\\\
       P(p|k,n) &= \\frac{1}{B(k+\\alpha, n-k+\\beta)}p^{k+\\alpha-1}(1-p)^{n-k+\\beta-1} \\\\
       P(p|k,n) &= \\frac{1}{B(k+\\alpha, n-k+\\alpha)}p^{k+\\alpha-1}(1-p)^{n-k+\\alpha-1}

    The mode for this posterior (also the maximum a posteriori) is:

    .. math::

       \\text{mode}(p) = \\frac{k+\\lambda-1}{n+2\\lambda-2}

    Concretely, the prior may be flat (all rates are equally likely,
    :math:`\\lambda=1`) or we may use Jeffreys' prior
    (:math:`\\lambda=0.5`), that is invariant through re-parameterisation.
    Jeffreys' prior indicates that rates close to zero or one are more likely.

    The mode above works if :math:`k+{\\alpha},n-k+{\\alpha} > 1`, which is
    usually the case for a reasonably well tuned system, with more than a few
    samples for analysis.  In the limit of the system performance, :math:`k`
    may be 0, which will make the mode become zero.

    For our purposes, it may be more suitable to represent :math:`n = k + l`,
    with :math:`k`, the number of successes and :math:`l`, the number of
    failures in the binomial experiment, and find this more suitable
    representation:

    .. math::

       P(p|k,l) &= \\frac{1}{B(k+\\alpha, l+\\alpha)}p^{k+\\alpha-1}(1-p)^{l+\\alpha-1} \\\\
       \\text{mode}(p) &= \\frac{k+\\lambda-1}{k+l+2\\lambda-2}

    This can be mapped to most rates calculated in the context of binary
    classification this way:

    * Precision or Positive-Predictive Value (PPV): p = TP/(TP+FP), so k=TP, l=FP
    * Recall, Sensitivity, or True Positive Rate: r = TP/(TP+FN), so k=TP, l=FN
    * Specificity or True Negative Rate: s = TN/(TN+FP), so k=TN, l=FP
    * F1-score: f1 = 2TP/(2TP+FP+FN), so k=2TP, l=FP+FN
    * Accuracy: acc = TP+TN/(TP+TN+FP+FN), so k=TP+TN, l=FP+FN
    * Jaccard: j = TP/(TP+FP+FN), so k=TP, l=FP+FN

    Contrary to frequentist approaches, in which one can only
    say that if the test were repeated an infinite number of times,
    and one constructed a confidence interval each time, then X%
    of the confidence intervals would contain the true rate, here
    we can say that given our observed data, there is a X% probability
    that the true value of :math:`k/n` falls within the provided
    interval.


    .. note::

       For a disambiguation with Confidence Interval, read
       https://en.wikipedia.org/wiki/Credible_interval.


    Parameters
    ==========

    k : int
        Number of successes observed on the experiment

    l : int
        Number of failures observed on the experiment

    lambda_ : float
        The parameterisation of the Beta prior to consider. Use
        :math:`\\lambda=1` for a flat prior. Use :math:`\\lambda=0.5` for
        Jeffreys' prior.

    coverage : float
        A floating-point number between 0 and 1.0 indicating the
        coverage you're expecting. A value of 0.95 will ensure 95%
        of the area under the probability density of the posterior
        is covered by the returned equal-tailed interval.


    Returns
    =======

    mean : float
        The mean of the posterior distribution

    mode : float
        The mode of the posterior distribution

    lower, upper: float
        The lower and upper bounds of the credible region

    """

    # posterior Beta parameters
    # (https://en.wikipedia.org/wiki/Beta_distribution)
    alpha = k + lambda_
    beta = l + lambda_

    # we return the equally-tailed range: exclude half the uncovered
    # probability mass on each side of the posterior
    right = (1.0 - coverage) / 2  # half-width in each side
    lower = scipy.special.betaincinv(alpha, beta, right)
    upper = scipy.special.betaincinv(alpha, beta, 1.0 - right)

    # posterior mean
    E = alpha / (alpha + beta)

    # the mode of a beta distribution is a bit tricky
    if alpha > 1 and beta > 1:
        mode = (alpha - 1) / (alpha + beta - 2)
    elif alpha == 1 and beta == 1:
        # In the case of precision, if the threshold is close to 1.0, both TP
        # and FP can be zero, which may cause this condition to be reached, if
        # the prior is exactly 1 (flat prior).  This is a weird situation,
        # because effectively we are trying to compute the posterior when the
        # total number of experiments is zero.  So, only the prior counts -
        # but the prior is flat, so every value is equally likely and any
        # choice of mode is arbitrary: we pick 0.0.
        mode = 0.0
    elif alpha <= 1 and beta > 1:
        mode = 0.0
    elif alpha > 1 and beta <= 1:
        mode = 1.0
    else:  # elif alpha < 1 and beta < 1:
        # in the case of precision, if the threshold is close to 1.0, both TP
        # and FP can be zero, which may cause this condition to be reached, if
        # the prior is smaller than 1.  This is a weird situation, because
        # effectively we are trying to compute the posterior when the total
        # number of experiments is zero.  So, only the prior counts - but the
        # prior is bimodal, so we should just pick a value.  We choose the
        # left of the range.
        mode = 0.0  # could also be 1.0 as the prior is bimodal

    return E, mode, lower, upper
270
271
def bayesian_measures(tp, fp, tn, fn, lambda_, coverage):
    """Calculates mean and mode from true/false positive and negative counts
    with credible regions

    Produces bayesian estimates of standard machine learning measures from
    true and false positive counts of positives and negatives.  For a
    thorough look into these and alternate names for the returned values,
    check Wikipedia's entry on `Precision and Recall
    <https://en.wikipedia.org/wiki/Precision_and_recall>`_.  See
    :py:func:`beta_credible_region` for details on the calculation of
    returned values.


    Parameters
    ----------

    tp : int
        True positive count, AKA "hit"

    fp : int
        False positive count, AKA "false alarm", or "Type I error"

    tn : int
        True negative count, AKA "correct rejection"

    fn : int
        False Negative count, AKA "miss", or "Type II error"

    lambda_ : float
        The parameterisation of the Beta prior to consider. Use
        :math:`\\lambda=1` for a flat prior. Use :math:`\\lambda=0.5` for
        Jeffrey's prior.

    coverage : float
        A floating-point number between 0 and 1.0 indicating the
        coverage you're expecting. A value of 0.95 will ensure 95%
        of the area under the probability density of the posterior
        is covered by the returned equal-tailed interval.


    Returns
    -------

    precision : (float, float, float, float)
        P, AKA positive predictive value (PPV); mean, mode and credible
        interval bounds.  Corresponds arithmetically to ``tp/(tp+fp)``.

    recall : (float, float, float, float)
        R, AKA sensitivity, hit rate, or true positive rate (TPR); mean,
        mode and credible interval bounds.  Corresponds arithmetically to
        ``tp/(tp+fn)``.

    specificity : (float, float, float, float)
        S, AKA selectivity or true negative rate (TNR); mean, mode and
        credible interval bounds.  Corresponds arithmetically to
        ``tn/(tn+fp)``.

    accuracy : (float, float, float, float)
        A; mean, mode and credible interval bounds.  The proportion of
        correct predictions (both true positives and true negatives) among
        the total number of pixels examined, ``(tp+tn)/(tp+tn+fp+fn)``.
        See `Accuracy
        <https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers>`_.
        Because it includes true negatives in the numerator, it remains
        sensitive to data or regions without annotations.

    jaccard : (float, float, float, float)
        J; mean, mode and credible interval bounds.  See `Jaccard Index or
        Similarity <https://en.wikipedia.org/wiki/Jaccard_index>`_,
        ``tp/(tp+fp+fn)``.  With a TP-only numerator (like F1), it is
        always zero for regions without annotations, irrespective of the
        model output; accuracy may be a better proxy if true absence of
        annotations must count towards the measure.

    f1_score : (float, float, float, float)
        F1; mean, mode and credible interval bounds.  See `F1-score
        <https://en.wikipedia.org/wiki/F1_score>`_, arithmetically
        ``2*P*R/(P+R)`` or ``2*tp/(2*tp+fp+fn)``.  Like the Jaccard index,
        the F1/Dice score has a TP-only numerator and is always zero for
        regions without annotations; accuracy may be a better proxy in
        that case.
    """

    precision = beta_credible_region(tp, fp, lambda_, coverage)
    recall = beta_credible_region(tp, fn, lambda_, coverage)
    specificity = beta_credible_region(tn, fp, lambda_, coverage)
    accuracy = beta_credible_region(tp + tn, fp + fn, lambda_, coverage)
    jaccard = beta_credible_region(tp, fp + fn, lambda_, coverage)
    f1_score = beta_credible_region(2 * tp, fp + fn, lambda_, coverage)

    return precision, recall, specificity, accuracy, jaccard, f1_score
368
369
def auc(x, y):
    """Calculates the area under the precision-recall curve (AUC)

    This function requires a minimum of 2 points and integrates ``y`` over
    the fixed grid ``[0.0, 1.0)`` (step 0.001) using a rectangle-rule
    (Riemann) sum of the linearly interpolated curve.  Points outside the
    provided ``x`` range are extrapolated as 1.0 (left) and 0.0 (right),
    matching precision-recall conventions.  The input ``x`` should be
    continuously increasing or decreasing.


    Parameters
    ----------

    x : numpy.ndarray
        A 1D numpy array containing continuously increasing or decreasing
        values for the X coordinate.

    y : numpy.ndarray
        A 1D numpy array containing the Y coordinates of the X values provided
        in ``x``.


    Returns
    -------

    area : float
        The area under the interpolated curve.


    Raises
    ------

    ValueError
        If ``x`` and ``y`` have different lengths, or if ``x`` is neither
        monotonically increasing nor monotonically decreasing.

    """

    x = numpy.array(x)
    y = numpy.array(y)

    # raise (instead of assert) so the check survives python -O
    if len(x) != len(y):
        raise ValueError("x and y sequences must have the same length")

    dx = numpy.diff(x)
    if numpy.any(dx < 0):
        if numpy.all(dx <= 0):
            # monotonically decreasing input: flip so numpy.interp sees an
            # increasing abscissa, as it requires
            x = x[::-1]
            y = y[::-1]
        else:
            raise ValueError(
                "x is neither increasing nor decreasing: {}".format(x)
            )

    y_interp = numpy.interp(
        numpy.arange(0, 1, 0.001),
        x,
        y,
        left=1.0,
        right=0.0,
    )
    # 1000 evenly spaced samples times the 0.001 grid step
    return y_interp.sum() * 0.001