#!/usr/bin/env python
# -*- coding: utf-8 -*-

from collections import deque

import numpy
import scipy.special
import torch


class SmoothedValue:
    """Track a series of values and provide access to smoothed values (median
    or average) over a fixed-size window.
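
    Examples
    --------

    Illustrative: with ``window_size=3``, the oldest value falls off once a
    fourth one arrives:

    >>> v = SmoothedValue(window_size=3)
    >>> for x in (1.0, 2.0, 3.0, 4.0):
    ...     v.update(x)
    >>> v.median
    3.0
    >>> v.avg
    3.0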

14 """ 

15 

16 def __init__(self, window_size=20): 

17 self.deque = deque(maxlen=window_size) 

18 

19 def update(self, value): 

20 self.deque.append(value) 

21 

22 @property 

23 def median(self): 

24 d = torch.tensor(list(self.deque)) 

25 return d.median().item() 

26 

27 @property 

28 def avg(self): 

29 d = torch.tensor(list(self.deque)) 

30 return d.mean().item() 

31 

32 

def tricky_division(n, d):
    """Divides ``n`` by ``d``, avoiding division-by-zero errors

    When ``d`` is zero, the divisor is replaced by one, so the result is 0.0
    whenever the numerator is also zero (always the case for the count ratios
    computed in this module).

    """

    return n / (d + (d == 0))


def base_measures(tp, fp, tn, fn):
    """Calculates frequentist measures from true/false positive and negative
    counts

    This function can return (frequentist versions of) standard machine
    learning measures from true and false positive counts of positives and
    negatives.  For a thorough look into these and alternate names for the
    returned values, please check Wikipedia's entry on `Precision and Recall
    <https://en.wikipedia.org/wiki/Precision_and_recall>`_.


    Parameters
    ----------

    tp : int
        True positive count, AKA "hit"

    fp : int
        False positive count, AKA "false alarm", or "Type I error"

    tn : int
        True negative count, AKA "correct rejection"

    fn : int
        False negative count, AKA "miss", or "Type II error"


    Returns
    -------

    precision : float
        P, AKA positive predictive value (PPV).  It corresponds
        arithmetically to ``tp/(tp+fp)``.  In the special case where
        ``tp+fp == 0``, this function returns zero for precision.

    recall : float
        R, AKA sensitivity, hit rate, or true positive rate (TPR).  It
        corresponds arithmetically to ``tp/(tp+fn)``.  In the special case
        where ``tp+fn == 0``, this function returns zero for recall.

    specificity : float
        S, AKA selectivity or true negative rate (TNR).  It corresponds
        arithmetically to ``tn/(tn+fp)``.  In the special case where
        ``tn+fp == 0``, this function returns zero for specificity.

    accuracy : float
        A, see `Accuracy
        <https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers>`_.
        It is the proportion of correct predictions (both true positives and
        true negatives) among the total number of pixels examined.  It
        corresponds arithmetically to ``(tp+tn)/(tp+tn+fp+fn)``.  This
        measure includes both true negatives and true positives in the
        numerator, which makes it sensitive to data or regions without
        annotations.

    jaccard : float
        J, see `Jaccard Index or Similarity
        <https://en.wikipedia.org/wiki/Jaccard_index>`_.  It corresponds
        arithmetically to ``tp/(tp+fp+fn)``.  In the special case where
        ``tp+fp+fn == 0``, this function returns zero for the Jaccard index.
        The Jaccard index depends on a TP-only numerator, similarly to the F1
        score.  For regions where there are no annotations, the Jaccard index
        will always be zero, irrespective of the model output.  Accuracy may
        be a better proxy if one needs to consider the true absence of
        annotations in a region as part of the measure.

    f1_score : float
        F1, see `F1-score <https://en.wikipedia.org/wiki/F1_score>`_.  It
        corresponds arithmetically to ``2*P*R/(P+R)`` or
        ``2*tp/(2*tp+fp+fn)``.  In the special case where
        ``P+R == (2*tp+fp+fn) == 0``, this function returns zero for the
        F1-score.  The F1 or Dice score depends on a TP-only numerator,
        similarly to the Jaccard index.  For regions where there are no
        annotations, the F1-score will always be zero, irrespective of the
        model output.  Accuracy may be a better proxy if one needs to
        consider the true absence of annotations in a region as part of the
        measure.
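
    Examples
    --------

    An illustrative, hand-checkable case (all values follow directly from
    the formulas above):

    >>> base_measures(6, 2, 10, 2)
    (0.75, 0.75, 0.8333333333333334, 0.8, 0.6, 0.75)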

115 """ 

116 

117 return ( 

118 tricky_division(tp, tp + fp), # precision 

119 tricky_division(tp, tp + fn), # recall 

120 tricky_division(tn, fp + tn), # specificity 

121 tricky_division(tp + tn, tp + fp + fn + tn), # accuracy 

122 tricky_division(tp, tp + fp + fn), # jaccard index 

123 tricky_division(2 * tp, (2 * tp) + fp + fn), # f1-score 

124 ) 

125 

126 

def beta_credible_region(k, l, lambda_, coverage):
    """Returns the mean, mode, and the upper and lower bounds of the
    equal-tailed credible region of a probability estimate following
    Bernoulli trials.

    This implementation is based on [GOUTTE-2005]_.  It assumes :math:`k`
    successes and :math:`l` failures (:math:`n = k+l` total trials) are
    issued from a series of Bernoulli trials (the likelihood is binomial).
    The posterior is derived using Bayes' theorem with a beta prior.  As
    there is no reason to favour high vs. low precision, we use a symmetric
    beta prior (:math:`\\alpha=\\beta`):

    .. math::

       P(p|k,n) &= \\frac{P(k,n|p)P(p)}{P(k,n)} \\\\
       P(p|k,n) &= \\frac{\\frac{n!}{k!(n-k)!}p^{k}(1-p)^{n-k}P(p)}{P(k,n)} \\\\
       P(p|k,n) &= \\frac{1}{B(k+\\alpha, n-k+\\beta)}p^{k+\\alpha-1}(1-p)^{n-k+\\beta-1} \\\\
       P(p|k,n) &= \\frac{1}{B(k+\\alpha, n-k+\\alpha)}p^{k+\\alpha-1}(1-p)^{n-k+\\alpha-1}

    The mode for this posterior (also the maximum a posteriori) is:

    .. math::

       \\text{mode}(p) = \\frac{k+\\lambda-1}{n+2\\lambda-2}

    Concretely, the prior may be flat (all rates are equally likely,
    :math:`\\lambda=1`), or we may use the Jeffreys prior
    (:math:`\\lambda=0.5`), which is invariant under re-parameterisation.
    The Jeffreys prior indicates that rates close to zero or one are more
    likely.

    The mode above is valid if :math:`k+\\alpha > 1` and
    :math:`n-k+\\alpha > 1`, which is usually the case for a reasonably
    well-tuned system, with more than a few samples for analysis.  At the
    limits of system performance, :math:`k` may be 0, which makes the mode
    become zero as well.

    For our purposes, it is more convenient to write :math:`n = k + l`, with
    :math:`k` the number of successes and :math:`l` the number of failures
    in the binomial experiment, which leads to this equivalent
    representation:

    .. math::

       P(p|k,l) &= \\frac{1}{B(k+\\alpha, l+\\alpha)}p^{k+\\alpha-1}(1-p)^{l+\\alpha-1} \\\\
       \\text{mode}(p) &= \\frac{k+\\lambda-1}{k+l+2\\lambda-2}

    This can be mapped to most rates calculated in the context of binary
    classification this way:

    * Precision or Positive-Predictive Value (PPV): p = TP/(TP+FP), so k=TP, l=FP
    * Recall, Sensitivity, or True Positive Rate: r = TP/(TP+FN), so k=TP, l=FN
    * Specificity or True Negative Rate: s = TN/(TN+FP), so k=TN, l=FP
    * F1-score: f1 = 2TP/(2TP+FP+FN), so k=2TP, l=FP+FN
    * Accuracy: acc = (TP+TN)/(TP+TN+FP+FN), so k=TP+TN, l=FP+FN
    * Jaccard: j = TP/(TP+FP+FN), so k=TP, l=FP+FN

    Contrary to frequentist approaches, in which one can only say that, if
    the test were repeated an infinite number of times and a confidence
    interval were constructed each time, then X% of the confidence intervals
    would contain the true rate, here we can say that, given our observed
    data, there is an X% probability that the true value of :math:`k/n`
    falls within the provided interval.


    .. note::

       For a disambiguation with Confidence Interval, read
       https://en.wikipedia.org/wiki/Credible_interval.


    Parameters
    ----------

    k : int
        Number of successes observed on the experiment

    l : int
        Number of failures observed on the experiment

    lambda_ : float
        The parameterisation of the beta prior to consider.  Use
        :math:`\\lambda=1` for a flat prior.  Use :math:`\\lambda=0.5` for
        the Jeffreys prior.

    coverage : float
        A floating-point number between 0 and 1.0 indicating the coverage
        you are expecting.  A value of 0.95 will ensure 95% of the area
        under the probability density of the posterior is covered by the
        returned equal-tailed interval.


    Returns
    -------

    mean : float
        The mean of the posterior distribution

    mode : float
        The mode of the posterior distribution

    lower, upper : float
        The lower and upper bounds of the credible region
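
    Examples
    --------

    A degenerate but hand-checkable case: with no observations and a flat
    prior (:math:`\\lambda=1`), the posterior is uniform, so the mean is 0.5
    and the 95% equal-tailed region spans [0.025, 0.975] (the mode is
    arbitrary here; this implementation picks 0.0):

    >>> mean, mode, lower, upper = beta_credible_region(0, 0, 1.0, 0.95)
    >>> (mean, mode)
    (0.5, 0.0)
    >>> (round(float(lower), 3), round(float(upper), 3))
    (0.025, 0.975)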

230 """ 

231 

232 # we return the equally-tailed range 

233 right = (1.0 - coverage) / 2 # half-width in each side 

234 lower = scipy.special.betaincinv(k + lambda_, l + lambda_, right) 

235 upper = scipy.special.betaincinv(k + lambda_, l + lambda_, 1.0 - right) 

236 

237 # evaluate mean and mode (https://en.wikipedia.org/wiki/Beta_distribution) 

238 alpha = k + lambda_ 

239 beta = l + lambda_ 

240 

241 E = alpha / (alpha + beta) 

242 

243 # the mode of a beta distribution is a bit tricky 

244 if alpha > 1 and beta > 1: 

245 mode = (alpha - 1) / (alpha + beta - 2) 

246 elif alpha == 1 and beta == 1: 

247 # In the case of precision, if the threshold is close to 1.0, both TP 

248 # and FP can be zero, which may cause this condition to be reached, if 

249 # the prior is exactly 1 (flat prior). This is a weird situation, 

250 # because effectively we are trying to compute the posterior when the 

251 # total number of experiments is zero. So, only the prior counts - but 

252 # the prior is flat, so we should just pick a value. We choose the 

253 # middle of the range. 

254 mode = 0.0 # any value would do, we just pick this one 

255 elif alpha <= 1 and beta > 1: 

256 mode = 0.0 

257 elif alpha > 1 and beta <= 1: 

258 mode = 1.0 

259 else: # elif alpha < 1 and beta < 1: 

260 # in the case of precision, if the threshold is close to 1.0, both TP 

261 # and FP can be zero, which may cause this condition to be reached, if 

262 # the prior is smaller than 1. This is a weird situation, because 

263 # effectively we are trying to compute the posterior when the total 

264 # number of experiments is zero. So, only the prior counts - but the 

265 # prior is bimodal, so we should just pick a value. We choose the 

266 # left of the range. 

267 mode = 0.0 # could also be 1.0 as the prior is bimodal 

268 

269 return E, mode, lower, upper 

270 

271 

def bayesian_measures(tp, fp, tn, fn, lambda_, coverage):
    """Calculates mean and mode from true/false positive and negative counts
    with credible regions

    This function can return Bayesian estimates of standard machine learning
    measures from true and false positive counts of positives and negatives.
    For a thorough look into these and alternate names for the returned
    values, please check Wikipedia's entry on `Precision and Recall
    <https://en.wikipedia.org/wiki/Precision_and_recall>`_.  See
    :py:func:`beta_credible_region` for details on the calculation of the
    returned values.


    Parameters
    ----------

    tp : int
        True positive count, AKA "hit"

    fp : int
        False positive count, AKA "false alarm", or "Type I error"

    tn : int
        True negative count, AKA "correct rejection"

    fn : int
        False negative count, AKA "miss", or "Type II error"

    lambda_ : float
        The parameterisation of the beta prior to consider.  Use
        :math:`\\lambda=1` for a flat prior.  Use :math:`\\lambda=0.5` for
        the Jeffreys prior.

    coverage : float
        A floating-point number between 0 and 1.0 indicating the coverage
        you are expecting.  A value of 0.95 will ensure 95% of the area
        under the probability density of the posterior is covered by the
        returned equal-tailed interval.


    Returns
    -------

    Each returned value is a 4-tuple with the posterior mean, the posterior
    mode, and the lower and upper bounds of the equal-tailed credible region
    (at the requested ``coverage``) for the corresponding rate.

    precision : (float, float, float, float)
        P, AKA positive predictive value (PPV).  It corresponds
        arithmetically to ``tp/(tp+fp)``.

    recall : (float, float, float, float)
        R, AKA sensitivity, hit rate, or true positive rate (TPR).  It
        corresponds arithmetically to ``tp/(tp+fn)``.

    specificity : (float, float, float, float)
        S, AKA selectivity or true negative rate (TNR).  It corresponds
        arithmetically to ``tn/(tn+fp)``.

    accuracy : (float, float, float, float)
        A, see `Accuracy
        <https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers>`_.
        It is the proportion of correct predictions (both true positives and
        true negatives) among the total number of pixels examined.  It
        corresponds arithmetically to ``(tp+tn)/(tp+tn+fp+fn)``.  This
        measure includes both true negatives and true positives in the
        numerator, which makes it sensitive to data or regions without
        annotations.

    jaccard : (float, float, float, float)
        J, see `Jaccard Index or Similarity
        <https://en.wikipedia.org/wiki/Jaccard_index>`_.  It corresponds
        arithmetically to ``tp/(tp+fp+fn)``.  The Jaccard index depends on a
        TP-only numerator, similarly to the F1 score.  For regions where
        there are no annotations, the Jaccard index will always be zero,
        irrespective of the model output.  Accuracy may be a better proxy if
        one needs to consider the true absence of annotations in a region as
        part of the measure.

    f1_score : (float, float, float, float)
        F1, see `F1-score <https://en.wikipedia.org/wiki/F1_score>`_.  It
        corresponds arithmetically to ``2*P*R/(P+R)`` or
        ``2*tp/(2*tp+fp+fn)``.  The F1 or Dice score depends on a TP-only
        numerator, similarly to the Jaccard index.  For regions where there
        are no annotations, the F1-score will always be zero, irrespective
        of the model output.  Accuracy may be a better proxy if one needs to
        consider the true absence of annotations in a region as part of the
        measure.
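
    Examples
    --------

    An illustrative sketch (the exact bounds depend on SciPy's beta quantile
    evaluation, so we only check a property that must hold):

    >>> p, r, s, a, j, f1 = bayesian_measures(10, 2, 80, 8, 0.5, 0.95)
    >>> bool(p[2] < p[0] < p[3])  # the mean lies inside the credible region
    True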

358 """ 

359 

360 return ( 

361 beta_credible_region(tp, fp, lambda_, coverage), # precision 

362 beta_credible_region(tp, fn, lambda_, coverage), # recall 

363 beta_credible_region(tn, fp, lambda_, coverage), # specificity 

364 beta_credible_region(tp + tn, fp + fn, lambda_, coverage), # accuracy 

365 beta_credible_region(tp, fp + fn, lambda_, coverage), # jaccard index 

366 beta_credible_region(2 * tp, fp + fn, lambda_, coverage), # f1-score 

367 ) 

368 

369 

def auc(x, y):
    """Calculates the area under the precision-recall curve (AUC)

    This function requires a minimum of 2 points.  It interpolates the curve
    onto a regular grid of step 0.001 over ``[0.0, 1.0]`` (filling in missing
    points if required) and approximates the area with a fixed-step
    (rectangle) sum.  The input ``x`` must be monotonically increasing or
    decreasing.


    Parameters
    ----------

    x : numpy.ndarray
        A 1D numpy array containing monotonically increasing or decreasing
        values for the X coordinate.

    y : numpy.ndarray
        A 1D numpy array containing the Y coordinates of the X values
        provided in ``x``.


    Returns
    -------

    area : float
        The area under the curve, evaluated on the ``[0.0, 1.0]`` interval.
        Points to the left of the provided range are assumed to have
        ``y = 1.0`` and points to the right, ``y = 0.0`` (the
        precision-recall convention).

390 """ 

391 

392 x = numpy.array(x) 

393 y = numpy.array(y) 

394 

395 assert len(x) == len(y), "x and y sequences must have the same length" 

396 

397 dx = numpy.diff(x) 

398 if numpy.any(dx < 0): 

399 if numpy.all(dx <= 0): 

400 # invert direction 

401 x = x[::-1] 

402 y = y[::-1] 

403 else: 

404 raise ValueError( 

405 "x is neither increasing nor decreasing " ": {}.".format(x) 

406 ) 

407 

408 y_interp = numpy.interp( 

409 numpy.arange(0, 1, 0.001), 

410 numpy.array(x), 

411 numpy.array(y), 

412 left=1.0, 

413 right=0.0, 

414 ) 

415 return y_interp.sum() * 0.001