#!/usr/bin/env python
# -*- coding: utf-8 -*-

from collections import deque

import numpy
import torch
import scipy.special


class SmoothedValue:
    """Track a series of values and provide access to smoothed values over a
    window or the global series average.
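
    A minimal usage sketch (illustrative only):

    >>> v = SmoothedValue(window_size=3)
    >>> for x in [1.0, 2.0, 3.0, 4.0]:
    ...     v.update(x)
    >>> v.median  # only the last window_size values are kept
    3.0
    >>> v.avg
    3.0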

13 """ 

14 

15 def __init__(self, window_size=20): 

16 self.deque = deque(maxlen=window_size) 

17 

18 def update(self, value): 

19 self.deque.append(value) 

20 

21 @property 

22 def median(self): 

23 d = torch.tensor(list(self.deque)) 

24 return d.median().item() 

25 

26 @property 

27 def avg(self): 

28 d = torch.tensor(list(self.deque)) 

29 return d.mean().item() 

30 

31 

32def tricky_division(n, d): 

33 """Divides n by d. Returns 0.0 in case of a division by zero""" 

    """

    return n / (d + (d == 0))


def base_measures(tp, fp, tn, fn):
    """Calculates measures from true/false positive and negative counts

    This function can return standard machine learning measures from true and
    false positive counts of positives and negatives.  For a thorough look
    into these and alternate names for the returned values, please check
    Wikipedia's entry on `Precision and Recall
    <https://en.wikipedia.org/wiki/Precision_and_recall>`_.


    Parameters
    ----------

    tp : int
        True positive count, AKA "hit"

    fp : int
        False positive count, AKA "false alarm", or "Type I error"

    tn : int
        True negative count, AKA "correct rejection"

    fn : int
        False negative count, AKA "miss", or "Type II error"


    Returns
    -------

    precision : float
        P, AKA positive predictive value (PPV).  It corresponds
        arithmetically to ``tp/(tp+fp)``.  In the case ``tp+fp == 0``, this
        function returns zero for precision.

    recall : float
        R, AKA sensitivity, hit rate, or true positive rate (TPR).  It
        corresponds arithmetically to ``tp/(tp+fn)``.  In the special case
        where ``tp+fn == 0``, this function returns zero for recall.

    specificity : float
        S, AKA selectivity or true negative rate (TNR).  It corresponds
        arithmetically to ``tn/(tn+fp)``.  In the special case where
        ``tn+fp == 0``, this function returns zero for specificity.

    accuracy : float
        A, see `Accuracy
        <https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers>`_.
        It is the proportion of correct predictions (both true positives and
        true negatives) among the total number of pixels examined.  It
        corresponds arithmetically to ``(tp+tn)/(tp+tn+fp+fn)``.  This
        measure includes both true negatives and true positives in the
        numerator, which makes it sensitive to data or regions without
        annotations.

    jaccard : float
        J, see `Jaccard Index or Similarity
        <https://en.wikipedia.org/wiki/Jaccard_index>`_.  It corresponds
        arithmetically to ``tp/(tp+fp+fn)``.  In the special case where
        ``tp+fp+fn == 0``, this function returns zero for the Jaccard index.
        The Jaccard index depends on a TP-only numerator, similarly to the
        F1 score.  For regions where there are no annotations, the Jaccard
        index will always be zero, irrespective of the model output.
        Accuracy may be a better proxy if one needs to consider the true
        absence of annotations in a region as part of the measure.

    f1_score : float
        F1, see `F1-score <https://en.wikipedia.org/wiki/F1_score>`_.  It
        corresponds arithmetically to ``2*P*R/(P+R)`` or
        ``2*tp/(2*tp+fp+fn)``.  In the special case where
        ``P+R == (2*tp+fp+fn) == 0``, this function returns zero for the
        F1-score.  The F1 or Dice score depends on a TP-only numerator,
        similarly to the Jaccard index.  For regions where there are no
        annotations, the F1-score will always be zero, irrespective of the
        model output.  Accuracy may be a better proxy if one needs to
        consider the true absence of annotations in a region as part of the
        measure.
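
    Example (doctest sketch; the counts are arbitrary):

    >>> p, r, s, a, j, f1 = base_measures(10, 2, 30, 3)
    >>> round(p, 2), round(f1, 2)
    (0.83, 0.8)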

113 """ 

114 

115 return ( 

116 tricky_division(tp, tp + fp), #precision 

117 tricky_division(tp, tp + fn), #recall 

118 tricky_division(tn, fp + tn), #specificity 

119 tricky_division(tp + tn, tp + fp + fn + tn), #accuracy 

120 tricky_division(tp, tp + fp + fn), #jaccard index 

121 tricky_division(2*tp, (2*tp) + fp + fn), #f1-score 

122 ) 

123 

124def beta_credible_region(k, l, lambda_, coverage): 

125 """ 

126 Returns the mode, upper and lower bounds of the equal-tailed credible 

127 region of a probability estimate following Bernoulli trials. 

128 

129 This implemetnation is based on [GOUTTE-2005]_. It assumes :math:`k` 

130 successes and :math:`l` failures (:math:`n = k+l` total trials) are issued 

131 from a series of Bernoulli trials (likelihood is binomial). The posterior 

132 is derivated using the Bayes Theorem with a beta prior. As there is no 

133 reason to favour high vs. low precision, we use a symmetric Beta prior 

134 (:math:`\\alpha=\\beta`): 

135 

136 .. math:: 

137 

138 P(p|k,n) &= \\frac{P(k,n|p)P(p)}{P(k,n)} \\\\ 

139 P(p|k,n) &= \\frac{\\frac{n!}{k!(n-k)!}p^{k}(1-p)^{n-k}P(p)}{P(k)} \\\\ 

140 P(p|k,n) &= \\frac{1}{B(k+\\alpha, n-k+\beta)}p^{k+\\alpha-1}(1-p)^{n-k+\\beta-1} \\\\ 

141 P(p|k,n) &= \\frac{1}{B(k+\\alpha, n-k+\\alpha)}p^{k+\\alpha-1}(1-p)^{n-k+\\alpha-1} 

142 

143 The mode for this posterior (also the maximum a posteriori) is: 

144 

145 .. math:: 

146 

147 \\text{mode}(p) = \\frac{k+\\lambda-1}{n+2\\lambda-2} 

148 

149 Concretely, the prior may be flat (all rates are equally likely, 

150 :math:`\\lambda=1`) or we may use Jeoffrey's prior 

151 (:math:`\\lambda=0.5`), that is invariant through re-parameterisation. 

152 Jeffrey's prior indicate that rates close to zero or one are more likely. 

153 

154 The mode above works if :math:`k+{\\alpha},n-k+{\\alpha} > 1`, which is 

155 usually the case for a resonably well tunned system, with more than a few 

156 samples for analysis. In the limit of the system performance, :math:`k` 

157 may be 0, which will make the mode become zero. 

158 

159 For our purposes, it may be more suitable to represent :math:`n = k + l`, 

160 with :math:`k`, the number of successes and :math:`l`, the number of 

161 failures in the binomial experiment, and find this more suitable 

162 representation: 

163 

164 .. math:: 

165 

166 P(p|k,l) &= \\frac{1}{B(k+\\alpha, l+\\alpha)}p^{k+\\alpha-1}(1-p)^{l+\\alpha-1} \\\\ 

167 \\text{mode}(p) &= \\frac{k+\\lambda-1}{k+l+2\\lambda-2} 

168 

169 This can be mapped to most rates calculated in the context of binary 

170 classification this way: 

171 

172 * Precision or Positive-Predictive Value (PPV): p = TP/(TP+FP), so k=TP, l=FP 

173 * Recall, Sensitivity, or True Positive Rate: r = TP/(TP+FN), so k=TP, l=FN 

174 * Specificity or True Negative Rage: s = TN/(TN+FP), so k=TN, l=FP 

175 * F1-score: f1 = 2TP/(2TP+FP+FN), so k=2TP, l=FP+FN 

176 * Accuracy: acc = TP+TN/(TP+TN+FP+FN), so k=TP+TN, l=FP+FN 

177 * Jaccard: j = TP/(TP+FP+FN), so k=TP, l=FP+FN 

178 

179 Contrary to frequentist approaches, in which one can only 

180 say that if the test were repeated an infinite number of times, 

181 and one constructed a confidence interval each time, then X% 

182 of the confidence intervals would contain the true rate, here 

183 we can say that given our observed data, there is a X% probability 

184 that the true value of :math:`k/n` falls within the provided 

185 interval. 

186 

187 

188 .. note:: 

189 

190 For a disambiguation with Confidence Interval, read 

191 https://en.wikipedia.org/wiki/Credible_interval. 

192 

193 

194 Parameters 

195 ========== 

196 

197 k : int 

198 Number of successes observed on the experiment 

199 

200 l : int 

201 Number of failures observed on the experiment 

202 

203 lambda__ : :py:class:`float`, Optional 

204 The parameterisation of the Beta prior to consider. Use 

205 :math:`\\lambda=1` for a flat prior. Use :math:`\\lambda=0.5` for 

206 Jeffrey's prior (the default). 

207 

208 coverage : :py:class:`float`, Optional 

209 A floating-point number between 0 and 1.0 indicating the 

210 coverage you're expecting. A value of 0.95 will ensure 95% 

211 of the area under the probability density of the posterior 

212 is covered by the returned equal-tailed interval. 

213 

214 

215 Returns 

216 ======= 

217 

218 mean : float 

219 The mean of the posterior distribution 

220 

221 mode : float 

222 The mode of the posterior distribution 

223 

224 lower, upper: float 

225 The lower and upper bounds of the credible region 

226 
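
    A doctest sketch with a symmetric posterior (k = l = 10, Jeffrey's
    prior, 95% coverage), for illustration only:

    >>> mean, mode, lower, upper = beta_credible_region(10, 10, 0.5, 0.95)
    >>> round(mean, 2), round(mode, 2)
    (0.5, 0.5)
    >>> 0.0 < lower < mean < upper < 1.0
    True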

227 """ 

228 

229 # we return the equally-tailed range 

230 right = (1.0-coverage)/2 #half-width in each side 

231 lower = scipy.special.betaincinv(k+lambda_, l+lambda_, right) 

232 upper = scipy.special.betaincinv(k+lambda_, l+lambda_, 1.0-right) 

233 

234 # evaluate mean and mode (https://en.wikipedia.org/wiki/Beta_distribution) 

235 alpha = k+lambda_ 

236 beta = l+lambda_ 

237 

238 E = alpha / (alpha + beta) 

239 

240 # the mode of a beta distribution is a bit tricky 

241 if alpha > 1 and beta > 1: 

242 mode = (alpha-1) / (alpha+beta-2) 

243 elif alpha == 1 and beta == 1: 

244 # In the case of precision, if the threshold is close to 1.0, both TP 

245 # and FP can be zero, which may cause this condition to be reached, if 

246 # the prior is exactly 1 (flat prior). This is a weird situation, 

247 # because effectively we are trying to compute the posterior when the 

248 # total number of experiments is zero. So, only the prior counts - but 

249 # the prior is flat, so we should just pick a value. We choose the 

250 # middle of the range. 

251 mode = 0.0 #any value would do, we just pick this one 

252 elif alpha <= 1 and beta > 1: 

253 mode = 0.0 

254 elif alpha > 1 and beta <= 1: 

255 mode = 1.0 

256 else: #elif alpha < 1 and beta < 1: 

257 # in the case of precision, if the threshold is close to 1.0, both TP 

258 # and FP can be zero, which may cause this condition to be reached, if 

259 # the prior is smaller than 1. This is a weird situation, because 

260 # effectively we are trying to compute the posterior when the total 

261 # number of experiments is zero. So, only the prior counts - but the 

262 # prior is bimodal, so we should just pick a value. We choose the 

263 # left of the range. 

264 mode = 0.0 #could also be 1.0 as the prior is bimodal 

265 

266 return E, mode, lower, upper 

267 

268 

def bayesian_measures(tp, fp, tn, fn, lambda_, coverage):
    """Calculates mean and mode from true/false positive and negative counts
    with credible regions

    This function can return bayesian estimates of standard machine learning
    measures from true and false positive counts of positives and negatives.
    For a thorough look into these and alternate names for the returned
    values, please check Wikipedia's entry on `Precision and Recall
    <https://en.wikipedia.org/wiki/Precision_and_recall>`_.  See
    :py:func:`beta_credible_region` for details on the calculation of
    returned values.


    Parameters
    ----------

    tp : int
        True positive count, AKA "hit"

    fp : int
        False positive count, AKA "false alarm", or "Type I error"

    tn : int
        True negative count, AKA "correct rejection"

    fn : int
        False negative count, AKA "miss", or "Type II error"

    lambda_ : float
        The parameterisation of the Beta prior to consider.  Use
        :math:`\\lambda=1` for a flat prior.  Use :math:`\\lambda=0.5` for
        Jeffrey's prior.

    coverage : float
        A floating-point number between 0 and 1.0 indicating the coverage
        you are expecting.  A value of 0.95 will ensure 95% of the area
        under the probability density of the posterior is covered by the
        returned equal-tailed interval.


    Returns
    -------

    precision : (float, float, float, float)
        P, AKA positive predictive value (PPV): mean, mode and credible
        region bounds for the given ``coverage``.  It corresponds
        arithmetically to ``tp/(tp+fp)``.

    recall : (float, float, float, float)
        R, AKA sensitivity, hit rate, or true positive rate (TPR): mean,
        mode and credible region bounds for the given ``coverage``.  It
        corresponds arithmetically to ``tp/(tp+fn)``.

    specificity : (float, float, float, float)
        S, AKA selectivity or true negative rate (TNR): mean, mode and
        credible region bounds for the given ``coverage``.  It corresponds
        arithmetically to ``tn/(tn+fp)``.

    accuracy : (float, float, float, float)
        A: mean, mode and credible region bounds for the given
        ``coverage``.  See `Accuracy
        <https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers>`_.
        It is the proportion of correct predictions (both true positives
        and true negatives) among the total number of pixels examined.  It
        corresponds arithmetically to ``(tp+tn)/(tp+tn+fp+fn)``.  This
        measure includes both true negatives and true positives in the
        numerator, which makes it sensitive to data or regions without
        annotations.

    jaccard : (float, float, float, float)
        J: mean, mode and credible region bounds for the given
        ``coverage``.  See `Jaccard Index or Similarity
        <https://en.wikipedia.org/wiki/Jaccard_index>`_.  It corresponds
        arithmetically to ``tp/(tp+fp+fn)``.  The Jaccard index depends on
        a TP-only numerator, similarly to the F1 score.  For regions where
        there are no annotations, the Jaccard index will always be zero,
        irrespective of the model output.  Accuracy may be a better proxy
        if one needs to consider the true absence of annotations in a
        region as part of the measure.

    f1_score : (float, float, float, float)
        F1: mean, mode and credible region bounds for the given
        ``coverage``.  See `F1-score
        <https://en.wikipedia.org/wiki/F1_score>`_.  It corresponds
        arithmetically to ``2*P*R/(P+R)`` or ``2*tp/(2*tp+fp+fn)``.  The F1
        or Dice score depends on a TP-only numerator, similarly to the
        Jaccard index.  For regions where there are no annotations, the
        F1-score will always be zero, irrespective of the model output.
        Accuracy may be a better proxy if one needs to consider the true
        absence of annotations in a region as part of the measure.
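
    Example (doctest sketch; arbitrary counts, Jeffrey's prior, 95%
    coverage):

    >>> p, r, s, a, j, f1 = bayesian_measures(10, 2, 30, 3, 0.5, 0.95)
    >>> round(p[0], 2)  # posterior mean of precision
    0.81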

355 """ 

356 

357 return ( 

358 beta_credible_region(tp, fp, lambda_, coverage), #precision 

359 beta_credible_region(tp, fn, lambda_, coverage), #recall 

360 beta_credible_region(tn, fp, lambda_, coverage), #specificity 

361 beta_credible_region(tp+tn, fp+fn, lambda_, coverage), #accuracy 

362 beta_credible_region(tp, fp+fn, lambda_, coverage), #jaccard index 

363 beta_credible_region(2*tp, fp+fn, lambda_, coverage), #f1-score 

364 ) 

365 

def get_centered_maxf1(f1_scores, thresholds):
    """Return the centered max F1 score and threshold when multiple
    thresholds give the same max F1 score


    Parameters
    ----------

    f1_scores : numpy.ndarray
        1D array of F1 scores

    thresholds : numpy.ndarray
        1D array of thresholds


    Returns
    -------

    maxf1 : float
        The maximum F1 score

    threshold : float
        The threshold corresponding to the maximum F1 score (centered, if
        several thresholds yield the same maximum)
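
    Example (doctest sketch; three consecutive thresholds tie at the
    maximum, so the middle one is returned):

    >>> import numpy
    >>> f1 = numpy.array([0.5, 0.9, 0.9, 0.9, 0.3])
    >>> thr = numpy.array([0.1, 0.2, 0.3, 0.4, 0.5])
    >>> maxf1, threshold = get_centered_maxf1(f1, thr)
    >>> float(maxf1), float(threshold)
    (0.9, 0.3)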

387 """ 

388 maxf1 = f1_scores.max() 

389 maxf1_indices = numpy.where(f1_scores == maxf1)[0] 

390 

391 # If multiple thresholds give the same max F1 score 

392 if len(maxf1_indices) > 1: 

393 mean_maxf1_index = int(round(numpy.mean(maxf1_indices))) 

394 else: 

395 mean_maxf1_index = maxf1_indices[0] 

396 

397 return maxf1, thresholds[mean_maxf1_index]