1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
4from collections import deque
5
6import numpy
7import scipy.special
8import torch
9
10
class SmoothedValue:
    """Track a series of values and provide access to smoothed values over a
    window (the most recent ``window_size`` updates).

    Parameters
    ----------

    window_size : :py:class:`int`, Optional
        Maximum number of most recent values retained for smoothing.  Older
        values are discarded automatically once the window is full.
    """

    def __init__(self, window_size=20):
        self.deque = deque(maxlen=window_size)

    def update(self, value):
        """Records a new value in the smoothing window"""
        self.deque.append(value)

    @property
    def median(self):
        """Median of the values currently in the window"""
        # cast to float explicitly: torch reductions such as mean() are not
        # implemented for integer dtypes, and callers may push int values
        d = torch.tensor(list(self.deque), dtype=torch.float64)
        return d.median().item()

    @property
    def avg(self):
        """Arithmetic mean of the values currently in the window"""
        # see note in median() about the explicit float dtype
        d = torch.tensor(list(self.deque), dtype=torch.float64)
        return d.mean().item()
31
32
def tricky_division(n, d):
    """Divides ``n`` by ``d``.  Returns 0.0 in case of a division by zero.

    The previous implementation (``n / (d + (d == 0))``) only honoured this
    contract when ``n`` was also zero; for ``d == 0`` and ``n != 0`` it
    silently returned ``n``.  All rate computations in this module have
    ``n <= d``, so the observable behaviour for in-file callers is unchanged.
    """

    return n / d if d else 0.0
37
38
def base_measures(tp, fp, tn, fn):
    """Calculates frequentist measures from true/false positive and negative
    counts

    Computes (frequentist versions of) standard machine learning measures
    from true and false positive counts of positives and negatives.  For a
    thorough look into these and alternate names for the returned values,
    check Wikipedia's entry on `Precision and Recall
    <https://en.wikipedia.org/wiki/Precision_and_recall>`_.


    Parameters
    ----------

    tp : int
        True positive count, AKA "hit"

    fp : int
        False positive count, AKA "false alarm", or "Type I error"

    tn : int
        True negative count, AKA "correct rejection"

    fn : int
        False Negative count, AKA "miss", or "Type II error"


    Returns
    -------

    precision : float
        P, AKA positive predictive value (PPV), ``tp/(tp+fp)``.  Zero if
        ``tp+fp == 0``.

    recall : float
        R, AKA sensitivity, hit rate, or true positive rate (TPR),
        ``tp/(tp+fn)``.  Zero if ``tp+fn == 0``.

    specificity : float
        S, AKA selectivity or true negative rate (TNR), ``tn/(tn+fp)``.
        Zero if ``tn+fp == 0``.

    accuracy : float
        A, the proportion of correct predictions (both true positives and
        true negatives) among the total number of pixels examined:
        ``(tp+tn)/(tp+tn+fp+fn)``.  See `Accuracy
        <https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers>`_.
        Because it includes true negatives in the numerator, it remains
        sensitive to data or regions without annotations.

    jaccard : float
        J, the `Jaccard Index or Similarity
        <https://en.wikipedia.org/wiki/Jaccard_index>`_, ``tp/(tp+fp+fn)``.
        Zero if ``tp+fp+fn == 0``.  With a TP-only numerator (like F1), it
        is always zero for regions without annotations, irrespective of the
        model output; accuracy may be a better proxy if true absence of
        annotations must count towards the measure.

    f1_score : float
        F1, see `F1-score <https://en.wikipedia.org/wiki/F1_score>`_,
        arithmetically ``2*P*R/(P+R)`` or ``2*tp/(2*tp+fp+fn)``.  Zero if
        ``P+R == (2*tp+fp+fn) == 0``.  Like the Jaccard index, the F1/Dice
        score has a TP-only numerator and is always zero for regions
        without annotations; accuracy may be a better proxy in that case.

    """

    precision = tricky_division(tp, tp + fp)
    recall = tricky_division(tp, tp + fn)
    specificity = tricky_division(tn, fp + tn)
    accuracy = tricky_division(tp + tn, tp + fp + fn + tn)
    jaccard = tricky_division(tp, tp + fp + fn)
    f1_score = tricky_division(2 * tp, (2 * tp) + fp + fn)

    return precision, recall, specificity, accuracy, jaccard, f1_score
125
126
def beta_credible_region(k, l, lambda_, coverage):
    """
    Returns the mean, mode, upper and lower bounds of the equal-tailed
    credible region of a probability estimate following Bernoulli trials.

    This implementation is based on [GOUTTE-2005]_.  It assumes :math:`k`
    successes and :math:`l` failures (:math:`n = k+l` total trials) are issued
    from a series of Bernoulli trials (likelihood is binomial).  The posterior
    is derivated using the Bayes Theorem with a beta prior.  As there is no
    reason to favour high vs. low precision, we use a symmetric Beta prior
    (:math:`\\alpha=\\beta`):

    .. math::

       P(p|k,n) &= \\frac{P(k,n|p)P(p)}{P(k,n)} \\\\
       P(p|k,n) &= \\frac{\\frac{n!}{k!(n-k)!}p^{k}(1-p)^{n-k}P(p)}{P(k)} \\\\
       P(p|k,n) &= \\frac{1}{B(k+\\alpha, n-k+\\beta)}p^{k+\\alpha-1}(1-p)^{n-k+\\beta-1} \\\\
       P(p|k,n) &= \\frac{1}{B(k+\\alpha, n-k+\\alpha)}p^{k+\\alpha-1}(1-p)^{n-k+\\alpha-1}

    The mode for this posterior (also the maximum a posteriori) is:

    .. math::

       \\text{mode}(p) = \\frac{k+\\lambda-1}{n+2\\lambda-2}

    Concretely, the prior may be flat (all rates are equally likely,
    :math:`\\lambda=1`) or we may use Jeffreys' prior
    (:math:`\\lambda=0.5`), that is invariant through re-parameterisation.
    Jeffreys' prior indicates that rates close to zero or one are more likely.

    The mode above works if :math:`k+{\\alpha},n-k+{\\alpha} > 1`, which is
    usually the case for a reasonably well tuned system, with more than a few
    samples for analysis.  In the limit of the system performance, :math:`k`
    may be 0, which will make the mode become zero.

    For our purposes, it may be more suitable to represent :math:`n = k + l`,
    with :math:`k`, the number of successes and :math:`l`, the number of
    failures in the binomial experiment, and find this more suitable
    representation:

    .. math::

       P(p|k,l) &= \\frac{1}{B(k+\\alpha, l+\\alpha)}p^{k+\\alpha-1}(1-p)^{l+\\alpha-1} \\\\
       \\text{mode}(p) &= \\frac{k+\\lambda-1}{k+l+2\\lambda-2}

    This can be mapped to most rates calculated in the context of binary
    classification this way:

    * Precision or Positive-Predictive Value (PPV): p = TP/(TP+FP), so k=TP, l=FP
    * Recall, Sensitivity, or True Positive Rate: r = TP/(TP+FN), so k=TP, l=FN
    * Specificity or True Negative Rate: s = TN/(TN+FP), so k=TN, l=FP
    * F1-score: f1 = 2TP/(2TP+FP+FN), so k=2TP, l=FP+FN
    * Accuracy: acc = TP+TN/(TP+TN+FP+FN), so k=TP+TN, l=FP+FN
    * Jaccard: j = TP/(TP+FP+FN), so k=TP, l=FP+FN

    Contrary to frequentist approaches, in which one can only
    say that if the test were repeated an infinite number of times,
    and one constructed a confidence interval each time, then X%
    of the confidence intervals would contain the true rate, here
    we can say that given our observed data, there is a X% probability
    that the true value of :math:`k/n` falls within the provided
    interval.


    .. note::

       For a disambiguation with Confidence Interval, read
       https://en.wikipedia.org/wiki/Credible_interval.


    Parameters
    ==========

    k : int
        Number of successes observed on the experiment

    l : int
        Number of failures observed on the experiment

    lambda_ : float
        The parameterisation of the Beta prior to consider. Use
        :math:`\\lambda=1` for a flat prior. Use :math:`\\lambda=0.5` for
        Jeffreys' prior.

    coverage : float
        A floating-point number between 0 and 1.0 indicating the
        coverage you're expecting. A value of 0.95 will ensure 95%
        of the area under the probability density of the posterior
        is covered by the returned equal-tailed interval.


    Returns
    =======

    mean : float
        The mean of the posterior distribution

    mode : float
        The mode of the posterior distribution

    lower, upper: float
        The lower and upper bounds of the credible region

    """

    # posterior Beta parameters
    # (https://en.wikipedia.org/wiki/Beta_distribution)
    alpha = k + lambda_
    beta = l + lambda_

    # we return the equally-tailed range: exclude half the uncovered
    # probability mass on each side of the posterior
    right = (1.0 - coverage) / 2  # half-width in each side
    lower = scipy.special.betaincinv(alpha, beta, right)
    upper = scipy.special.betaincinv(alpha, beta, 1.0 - right)

    # posterior mean
    E = alpha / (alpha + beta)

    # the mode of a beta distribution is a bit tricky
    if alpha > 1 and beta > 1:
        mode = (alpha - 1) / (alpha + beta - 2)
    elif alpha == 1 and beta == 1:
        # In the case of precision, if the threshold is close to 1.0, both TP
        # and FP can be zero, which may cause this condition to be reached, if
        # the prior is exactly 1 (flat prior).  This is a weird situation,
        # because effectively we are trying to compute the posterior when the
        # total number of experiments is zero.  So, only the prior counts -
        # but the prior is flat, so every value is equally likely and any
        # choice of mode is arbitrary: we pick 0.0.
        mode = 0.0
    elif alpha <= 1 and beta > 1:
        mode = 0.0
    elif alpha > 1 and beta <= 1:
        mode = 1.0
    else:  # elif alpha < 1 and beta < 1:
        # in the case of precision, if the threshold is close to 1.0, both TP
        # and FP can be zero, which may cause this condition to be reached, if
        # the prior is smaller than 1.  This is a weird situation, because
        # effectively we are trying to compute the posterior when the total
        # number of experiments is zero.  So, only the prior counts - but the
        # prior is bimodal, so we should just pick a value.  We choose the
        # left of the range.
        mode = 0.0  # could also be 1.0 as the prior is bimodal

    return E, mode, lower, upper
270
271
def bayesian_measures(tp, fp, tn, fn, lambda_, coverage):
    """Calculates mean and mode from true/false positive and negative counts
    with credible regions

    Produces bayesian estimates of standard machine learning measures from
    true and false positive counts of positives and negatives.  For a
    thorough look into these and alternate names for the returned values,
    check Wikipedia's entry on `Precision and Recall
    <https://en.wikipedia.org/wiki/Precision_and_recall>`_.  See
    :py:func:`beta_credible_region` for details on the calculation of
    returned values.


    Parameters
    ----------

    tp : int
        True positive count, AKA "hit"

    fp : int
        False positive count, AKA "false alarm", or "Type I error"

    tn : int
        True negative count, AKA "correct rejection"

    fn : int
        False Negative count, AKA "miss", or "Type II error"

    lambda_ : float
        The parameterisation of the Beta prior to consider. Use
        :math:`\\lambda=1` for a flat prior. Use :math:`\\lambda=0.5` for
        Jeffrey's prior.

    coverage : float
        A floating-point number between 0 and 1.0 indicating the
        coverage you're expecting. A value of 0.95 will ensure 95%
        of the area under the probability density of the posterior
        is covered by the returned equal-tailed interval.


    Returns
    -------

    precision : (float, float, float, float)
        P, AKA positive predictive value (PPV); mean, mode and credible
        interval bounds.  Corresponds arithmetically to ``tp/(tp+fp)``.

    recall : (float, float, float, float)
        R, AKA sensitivity, hit rate, or true positive rate (TPR); mean,
        mode and credible interval bounds.  Corresponds arithmetically to
        ``tp/(tp+fn)``.

    specificity : (float, float, float, float)
        S, AKA selectivity or true negative rate (TNR); mean, mode and
        credible interval bounds.  Corresponds arithmetically to
        ``tn/(tn+fp)``.

    accuracy : (float, float, float, float)
        A; mean, mode and credible interval bounds.  The proportion of
        correct predictions (both true positives and true negatives) among
        the total number of pixels examined, ``(tp+tn)/(tp+tn+fp+fn)``.
        See `Accuracy
        <https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers>`_.
        Because it includes true negatives in the numerator, it remains
        sensitive to data or regions without annotations.

    jaccard : (float, float, float, float)
        J; mean, mode and credible interval bounds.  See `Jaccard Index or
        Similarity <https://en.wikipedia.org/wiki/Jaccard_index>`_,
        ``tp/(tp+fp+fn)``.  With a TP-only numerator (like F1), it is
        always zero for regions without annotations, irrespective of the
        model output; accuracy may be a better proxy if true absence of
        annotations must count towards the measure.

    f1_score : (float, float, float, float)
        F1; mean, mode and credible interval bounds.  See `F1-score
        <https://en.wikipedia.org/wiki/F1_score>`_, arithmetically
        ``2*P*R/(P+R)`` or ``2*tp/(2*tp+fp+fn)``.  Like the Jaccard index,
        the F1/Dice score has a TP-only numerator and is always zero for
        regions without annotations; accuracy may be a better proxy in
        that case.
    """

    precision = beta_credible_region(tp, fp, lambda_, coverage)
    recall = beta_credible_region(tp, fn, lambda_, coverage)
    specificity = beta_credible_region(tn, fp, lambda_, coverage)
    accuracy = beta_credible_region(tp + tn, fp + fn, lambda_, coverage)
    jaccard = beta_credible_region(tp, fp + fn, lambda_, coverage)
    f1_score = beta_credible_region(2 * tp, fp + fn, lambda_, coverage)

    return precision, recall, specificity, accuracy, jaccard, f1_score
368
369
def auc(x, y):
    """Calculates the area under the precision-recall curve (AUC)

    This function requires a minimum of 2 points and integrates ``y`` over
    the fixed grid ``[0.0, 1.0)`` (step 0.001) using a rectangle-rule
    (Riemann) sum of the linearly interpolated curve.  Points outside the
    provided ``x`` range are extrapolated as 1.0 (left) and 0.0 (right),
    matching precision-recall conventions.  The input ``x`` should be
    continuously increasing or decreasing.


    Parameters
    ----------

    x : numpy.ndarray
        A 1D numpy array containing continuously increasing or decreasing
        values for the X coordinate.

    y : numpy.ndarray
        A 1D numpy array containing the Y coordinates of the X values provided
        in ``x``.


    Returns
    -------

    area : float
        The area under the interpolated curve.


    Raises
    ------

    ValueError
        If ``x`` and ``y`` have different lengths, or if ``x`` is neither
        monotonically increasing nor monotonically decreasing.

    """

    x = numpy.array(x)
    y = numpy.array(y)

    # raise (instead of assert) so the check survives python -O
    if len(x) != len(y):
        raise ValueError("x and y sequences must have the same length")

    dx = numpy.diff(x)
    if numpy.any(dx < 0):
        if numpy.all(dx <= 0):
            # monotonically decreasing input: flip so numpy.interp sees an
            # increasing abscissa, as it requires
            x = x[::-1]
            y = y[::-1]
        else:
            raise ValueError(
                "x is neither increasing nor decreasing: {}".format(x)
            )

    y_interp = numpy.interp(
        numpy.arange(0, 1, 0.001),
        x,
        y,
        left=1.0,
        right=0.0,
    )
    # 1000 evenly spaced samples times the 0.001 grid step
    return y_interp.sum() * 0.001