#!/usr/bin/env python
# -*- coding: utf-8 -*-

from collections import deque

import numpy
import torch
import scipy.special


class SmoothedValue:
    """Track a series of values and provide access to smoothed values over a
    window or the global series average.
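
    A minimal usage sketch (illustrative only):

    >>> v = SmoothedValue(window_size=3)
    >>> for x in [1.0, 2.0, 3.0, 4.0]:
    ...     v.update(x)
    >>> v.median  # only the last window_size values are kept
    3.0
    >>> v.avg
    3.0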

13 """ 

14 

15 def __init__(self, window_size=20): 

16 self.deque = deque(maxlen=window_size) 

17 

18 def update(self, value): 

19 self.deque.append(value) 

20 

21 @property 

22 def median(self): 

23 d = torch.tensor(list(self.deque)) 

24 return d.median().item() 

25 

26 @property 

27 def avg(self): 

28 d = torch.tensor(list(self.deque)) 

29 return d.mean().item() 

30 

31 

32def tricky_division(n, d): 

33 """Divides n by d. Returns 0.0 in case of a division by zero""" 

    """

    return n / (d + (d == 0))


def base_measures(tp, fp, tn, fn):
    """Calculates measures from true/false positive and negative counts

    This function can return standard machine learning measures from true and
    false positive counts of positives and negatives.  For a thorough look
    into these and alternate names for the returned values, please check
    Wikipedia's entry on `Precision and Recall
    <https://en.wikipedia.org/wiki/Precision_and_recall>`_.


    Parameters
    ----------

    tp : int
        True positive count, AKA "hit"

    fp : int
        False positive count, AKA "false alarm", or "Type I error"

    tn : int
        True negative count, AKA "correct rejection"

    fn : int
        False negative count, AKA "miss", or "Type II error"


    Returns
    -------

    precision : float
        P, AKA positive predictive value (PPV).  It corresponds
        arithmetically to ``tp/(tp+fp)``.  In the case ``tp+fp == 0``, this
        function returns zero for precision.

    recall : float
        R, AKA sensitivity, hit rate, or true positive rate (TPR).  It
        corresponds arithmetically to ``tp/(tp+fn)``.  In the special case
        where ``tp+fn == 0``, this function returns zero for recall.

    specificity : float
        S, AKA selectivity or true negative rate (TNR).  It corresponds
        arithmetically to ``tn/(tn+fp)``.  In the special case where
        ``tn+fp == 0``, this function returns zero for specificity.

    accuracy : float
        A, see `Accuracy
        <https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers>`_.
        It is the proportion of correct predictions (both true positives and
        true negatives) among the total number of pixels examined.  It
        corresponds arithmetically to ``(tp+tn)/(tp+tn+fp+fn)``.  This
        measure includes both true negatives and true positives in the
        numerator, which makes it sensitive to data or regions without
        annotations.

    jaccard : float
        J, see `Jaccard Index or Similarity
        <https://en.wikipedia.org/wiki/Jaccard_index>`_.  It corresponds
        arithmetically to ``tp/(tp+fp+fn)``.  In the special case where
        ``tp+fp+fn == 0``, this function returns zero for the Jaccard index.
        The Jaccard index depends on a TP-only numerator, similarly to the
        F1 score.  For regions where there are no annotations, the Jaccard
        index will always be zero, irrespective of the model output.
        Accuracy may be a better proxy if one needs to consider the true
        absence of annotations in a region as part of the measure.

    f1_score : float
        F1, see `F1-score <https://en.wikipedia.org/wiki/F1_score>`_.  It
        corresponds arithmetically to ``2*P*R/(P+R)`` or
        ``2*tp/(2*tp+fp+fn)``.  In the special case where
        ``P+R == (2*tp+fp+fn) == 0``, this function returns zero for the
        F1-score.  The F1 or Dice score depends on a TP-only numerator,
        similarly to the Jaccard index.  For regions where there are no
        annotations, the F1-score will always be zero, irrespective of the
        model output.  Accuracy may be a better proxy if one needs to
        consider the true absence of annotations in a region as part of the
        measure.
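
    Example (doctest sketch; the counts are arbitrary):

    >>> p, r, s, a, j, f1 = base_measures(10, 2, 30, 3)
    >>> round(p, 2), round(f1, 2)
    (0.83, 0.8)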

113 """ 

114 

115 return ( 

116 tricky_division(tp, tp + fp), #precision 

117 tricky_division(tp, tp + fn), #recall 

118 tricky_division(tn, fp + tn), #specificity 

119 tricky_division(tp + tn, tp + fp + fn + tn), #accuracy 

120 tricky_division(tp, tp + fp + fn), #jaccard index 

121 tricky_division(2*tp, (2*tp) + fp + fn), #f1-score 

122 ) 

123 

124def beta_credible_region(k, l, lambda_, coverage): 

125 """ 

126 Returns the mode, upper and lower bounds of the equal-tailed credible 

127 region of a probability estimate following Bernoulli trials. 

128 

129 This implemetnation is based on [GOUTTE-2005]_. It assumes :math:`k` 

130 successes and :math:`l` failures (:math:`n = k+l` total trials) are issued 

131 from a series of Bernoulli trials (likelihood is binomial). The posterior 

132 is derivated using the Bayes Theorem with a beta prior. As there is no 

133 reason to favour high vs. low precision, we use a symmetric Beta prior 

134 (:math:`\\alpha=\\beta`): 

135 

136 .. math:: 

137 

138 P(p|k,n) &= \\frac{P(k,n|p)P(p)}{P(k,n)} \\\\ 

139 P(p|k,n) &= \\frac{\\frac{n!}{k!(n-k)!}p^{k}(1-p)^{n-k}P(p)}{P(k)} \\\\ 

140 P(p|k,n) &= \\frac{1}{B(k+\\alpha, n-k+\beta)}p^{k+\\alpha-1}(1-p)^{n-k+\\beta-1} \\\\ 

141 P(p|k,n) &= \\frac{1}{B(k+\\alpha, n-k+\\alpha)}p^{k+\\alpha-1}(1-p)^{n-k+\\alpha-1} 

142 

143 The mode for this posterior (also the maximum a posteriori) is: 

144 

145 .. math:: 

146 

147 \\text{mode}(p) = \\frac{k+\\lambda-1}{n+2\\lambda-2} 

148 

149 Concretely, the prior may be flat (all rates are equally likely, 

150 :math:`\\lambda=1`) or we may use Jeoffrey's prior 

151 (:math:`\\lambda=0.5`), that is invariant through re-parameterisation. 

152 Jeffrey's prior indicate that rates close to zero or one are more likely. 

153 

154 The mode above works if :math:`k+{\\alpha},n-k+{\\alpha} > 1`, which is 

155 usually the case for a resonably well tunned system, with more than a few 

156 samples for analysis. In the limit of the system performance, :math:`k` 

157 may be 0, which will make the mode become zero. 

158 

159 For our purposes, it may be more suitable to represent :math:`n = k + l`, 

160 with :math:`k`, the number of successes and :math:`l`, the number of 

161 failures in the binomial experiment, and find this more suitable 

162 representation: 

163 

164 .. math:: 

165 

166 P(p|k,l) &= \\frac{1}{B(k+\\alpha, l+\\alpha)}p^{k+\\alpha-1}(1-p)^{l+\\alpha-1} \\\\ 

167 \\text{mode}(p) &= \\frac{k+\\lambda-1}{k+l+2\\lambda-2} 

168 

169 This can be mapped to most rates calculated in the context of binary 

170 classification this way: 

171 

172 * Precision or Positive-Predictive Value (PPV): p = TP/(TP+FP), so k=TP, l=FP 

173 * Recall, Sensitivity, or True Positive Rate: r = TP/(TP+FN), so k=TP, l=FN 

174 * Specificity or True Negative Rage: s = TN/(TN+FP), so k=TN, l=FP 

175 * F1-score: f1 = 2TP/(2TP+FP+FN), so k=2TP, l=FP+FN 

176 * Accuracy: acc = TP+TN/(TP+TN+FP+FN), so k=TP+TN, l=FP+FN 

177 * Jaccard: j = TP/(TP+FP+FN), so k=TP, l=FP+FN 

178 

179 Contrary to frequentist approaches, in which one can only 

180 say that if the test were repeated an infinite number of times, 

181 and one constructed a confidence interval each time, then X% 

182 of the confidence intervals would contain the true rate, here 

183 we can say that given our observed data, there is a X% probability 

184 that the true value of :math:`k/n` falls within the provided 

185 interval. 

186 

187 

188 .. note:: 

189 

190 For a disambiguation with Confidence Interval, read 

191 https://en.wikipedia.org/wiki/Credible_interval. 

192 

193 

194 Parameters 

195 ========== 

196 

197 k : int 

198 Number of successes observed on the experiment 

199 

200 l : int 

201 Number of failures observed on the experiment 

202 

203 lambda__ : :py:class:`float`, Optional 

204 The parameterisation of the Beta prior to consider. Use 

205 :math:`\\lambda=1` for a flat prior. Use :math:`\\lambda=0.5` for 

206 Jeffrey's prior (the default). 

207 

208 coverage : :py:class:`float`, Optional 

209 A floating-point number between 0 and 1.0 indicating the 

210 coverage you're expecting. A value of 0.95 will ensure 95% 

211 of the area under the probability density of the posterior 

212 is covered by the returned equal-tailed interval. 

213 

214 

215 Returns 

216 ======= 

217 

218 mean : float 

219 The mean of the posterior distribution 

220 

221 mode : float 

222 The mode of the posterior distribution 

223 

224 lower, upper: float 

225 The lower and upper bounds of the credible region 

226 
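
    A doctest sketch with a symmetric posterior (k = l = 10, Jeffrey's
    prior, 95% coverage), for illustration only:

    >>> mean, mode, lower, upper = beta_credible_region(10, 10, 0.5, 0.95)
    >>> round(mean, 2), round(mode, 2)
    (0.5, 0.5)
    >>> 0.0 < lower < mean < upper < 1.0
    True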

227 """ 

228 

229 # we return the equally-tailed range 

230 right = (1.0-coverage)/2 #half-width in each side 

231 lower = scipy.special.betaincinv(k+lambda_, l+lambda_, right) 

232 upper = scipy.special.betaincinv(k+lambda_, l+lambda_, 1.0-right) 

233 

234 # evaluate mean and mode (https://en.wikipedia.org/wiki/Beta_distribution) 

235 alpha = k+lambda_ 

236 beta = l+lambda_ 

237 

238 E = alpha / (alpha + beta) 

239 

240 # the mode of a beta distribution is a bit tricky 

241 if alpha > 1 and beta > 1: 

242 mode = (alpha-1) / (alpha+beta-2) 

243 elif alpha == 1 and beta == 1: 

244 # In the case of precision, if the threshold is close to 1.0, both TP 

245 # and FP can be zero, which may cause this condition to be reached, if 

246 # the prior is exactly 1 (flat prior). This is a weird situation, 

247 # because effectively we are trying to compute the posterior when the 

248 # total number of experiments is zero. So, only the prior counts - but 

249 # the prior is flat, so we should just pick a value. We choose the 

250 # middle of the range. 

251 mode = 0.0 #any value would do, we just pick this one 

252 elif alpha <= 1 and beta > 1: 

253 mode = 0.0 

254 elif alpha > 1 and beta <= 1: 

255 mode = 1.0 

256 else: #elif alpha < 1 and beta < 1: 

257 # in the case of precision, if the threshold is close to 1.0, both TP 

258 # and FP can be zero, which may cause this condition to be reached, if 

259 # the prior is smaller than 1. This is a weird situation, because 

260 # effectively we are trying to compute the posterior when the total 

261 # number of experiments is zero. So, only the prior counts - but the 

262 # prior is bimodal, so we should just pick a value. We choose the 

263 # left of the range. 

264 mode = 0.0 #could also be 1.0 as the prior is bimodal 

265 

266 return E, mode, lower, upper 

267 

268 

def bayesian_measures(tp, fp, tn, fn, lambda_, coverage):
    """Calculates mean and mode from true/false positive and negative counts
    with credible regions

    This function can return bayesian estimates of standard machine learning
    measures from true and false positive counts of positives and negatives.
    For a thorough look into these and alternate names for the returned
    values, please check Wikipedia's entry on `Precision and Recall
    <https://en.wikipedia.org/wiki/Precision_and_recall>`_.  See
    :py:func:`beta_credible_region` for details on the calculation of
    returned values.


    Parameters
    ----------

    tp : int
        True positive count, AKA "hit"

    fp : int
        False positive count, AKA "false alarm", or "Type I error"

    tn : int
        True negative count, AKA "correct rejection"

    fn : int
        False negative count, AKA "miss", or "Type II error"

    lambda_ : float
        The parameterisation of the Beta prior to consider.  Use
        :math:`\\lambda=1` for a flat prior.  Use :math:`\\lambda=0.5` for
        Jeffrey's prior.

    coverage : float
        A floating-point number between 0 and 1.0 indicating the coverage
        you are expecting.  A value of 0.95 will ensure 95% of the area
        under the probability density of the posterior is covered by the
        returned equal-tailed interval.


    Returns
    -------

    precision : (float, float, float, float)
        P, AKA positive predictive value (PPV): mean, mode and credible
        region bounds for the given ``coverage``.  It corresponds
        arithmetically to ``tp/(tp+fp)``.

    recall : (float, float, float, float)
        R, AKA sensitivity, hit rate, or true positive rate (TPR): mean,
        mode and credible region bounds for the given ``coverage``.  It
        corresponds arithmetically to ``tp/(tp+fn)``.

    specificity : (float, float, float, float)
        S, AKA selectivity or true negative rate (TNR): mean, mode and
        credible region bounds for the given ``coverage``.  It corresponds
        arithmetically to ``tn/(tn+fp)``.

    accuracy : (float, float, float, float)
        A: mean, mode and credible region bounds for the given
        ``coverage``.  See `Accuracy
        <https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers>`_.
        It is the proportion of correct predictions (both true positives
        and true negatives) among the total number of pixels examined.  It
        corresponds arithmetically to ``(tp+tn)/(tp+tn+fp+fn)``.  This
        measure includes both true negatives and true positives in the
        numerator, which makes it sensitive to data or regions without
        annotations.

    jaccard : (float, float, float, float)
        J: mean, mode and credible region bounds for the given
        ``coverage``.  See `Jaccard Index or Similarity
        <https://en.wikipedia.org/wiki/Jaccard_index>`_.  It corresponds
        arithmetically to ``tp/(tp+fp+fn)``.  The Jaccard index depends on
        a TP-only numerator, similarly to the F1 score.  For regions where
        there are no annotations, the Jaccard index will always be zero,
        irrespective of the model output.  Accuracy may be a better proxy
        if one needs to consider the true absence of annotations in a
        region as part of the measure.

    f1_score : (float, float, float, float)
        F1: mean, mode and credible region bounds for the given
        ``coverage``.  See `F1-score
        <https://en.wikipedia.org/wiki/F1_score>`_.  It corresponds
        arithmetically to ``2*P*R/(P+R)`` or ``2*tp/(2*tp+fp+fn)``.  The F1
        or Dice score depends on a TP-only numerator, similarly to the
        Jaccard index.  For regions where there are no annotations, the
        F1-score will always be zero, irrespective of the model output.
        Accuracy may be a better proxy if one needs to consider the true
        absence of annotations in a region as part of the measure.
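
    Example (doctest sketch; arbitrary counts, Jeffrey's prior, 95%
    coverage):

    >>> p, r, s, a, j, f1 = bayesian_measures(10, 2, 30, 3, 0.5, 0.95)
    >>> round(p[0], 2)  # posterior mean of precision
    0.81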

355 """ 

356 

357 return ( 

358 beta_credible_region(tp, fp, lambda_, coverage), #precision 

359 beta_credible_region(tp, fn, lambda_, coverage), #recall 

360 beta_credible_region(tn, fp, lambda_, coverage), #specificity 

361 beta_credible_region(tp+tn, fp+fn, lambda_, coverage), #accuracy 

362 beta_credible_region(tp, fp+fn, lambda_, coverage), #jaccard index 

363 beta_credible_region(2*tp, fp+fn, lambda_, coverage), #f1-score 

364 ) 

365 

def get_centered_maxf1(f1_scores, thresholds):
    """Return the centered max F1 score and threshold when multiple
    thresholds give the same max F1 score


    Parameters
    ----------

    f1_scores : numpy.ndarray
        1D array of F1 scores

    thresholds : numpy.ndarray
        1D array of thresholds


    Returns
    -------

    maxf1 : float
        The maximum F1 score

    threshold : float
        The threshold corresponding to the maximum F1 score (centered, if
        several thresholds yield the same maximum)
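
    Example (doctest sketch; three consecutive thresholds tie at the
    maximum, so the middle one is returned):

    >>> import numpy
    >>> f1 = numpy.array([0.5, 0.9, 0.9, 0.9, 0.3])
    >>> thr = numpy.array([0.1, 0.2, 0.3, 0.4, 0.5])
    >>> maxf1, threshold = get_centered_maxf1(f1, thr)
    >>> float(maxf1), float(threshold)
    (0.9, 0.3)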

387 """ 

388 maxf1 = f1_scores.max() 

389 maxf1_indices = numpy.where(f1_scores == maxf1)[0] 

390 

391 # If multiple thresholds give the same max F1 score 

392 if len(maxf1_indices) > 1: 

393 mean_maxf1_index = int(round(numpy.mean(maxf1_indices))) 

394 else: 

395 mean_maxf1_index = maxf1_indices[0] 

396 

397 return maxf1, thresholds[mean_maxf1_index]