Coverage for src/deepdraw/engine/evaluator.py: 99% (172 statements)
coverage.py v7.3.1, created at 2023-11-30 15:00 +0100

# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-License-Identifier: GPL-3.0-or-later

"""Defines functionality for the evaluation of predictions."""

import itertools
import logging
import multiprocessing
import os

import h5py
import numpy
import pandas
import PIL.Image
import PIL.ImageOps
import torch
import torch.nn.functional
import torchvision.transforms.functional as VF

from tqdm import tqdm

from ..utils.measure import base_measures, bayesian_measures

logger = logging.getLogger(__name__)


def _posneg(pred, gt, threshold):
    """Calculates true and false positives and negatives.

    Parameters
    ----------

    pred : torch.Tensor
        pixel-wise predictions

    gt : torch.Tensor
        ground-truth (annotations)

    threshold : float
        a particular threshold at which to calculate the performance
        measures


    Returns
    -------

    tp_tensor : torch.Tensor
        boolean tensor with true positives, considering all observations

    fp_tensor : torch.Tensor
        boolean tensor with false positives, considering all observations

    tn_tensor : torch.Tensor
        boolean tensor with true negatives, considering all observations

    fn_tensor : torch.Tensor
        boolean tensor with false negatives, considering all observations
    """

    gt = gt.byte()  # byte tensor

    # threshold
    binary_pred = torch.gt(pred, threshold).byte()

    # equals and not-equals
    equals = torch.eq(binary_pred, gt).type(torch.uint8)  # tensor
    notequals = torch.ne(binary_pred, gt).type(torch.uint8)  # tensor

    # true positives
    tp_tensor = gt * binary_pred

    # false positives
    fp_tensor = torch.eq((binary_pred + tp_tensor), 1).byte()

    # true negatives
    tn_tensor = equals - tp_tensor

    # false negatives
    fn_tensor = notequals - fp_tensor

    return tp_tensor, fp_tensor, tn_tensor, fn_tensor

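# A minimal sanity check (illustrative only, not part of the module): the
# four returned tensors partition every pixel exactly once, e.g.:
#
#   pred = torch.tensor([[0.9, 0.2], [0.6, 0.1]])
#   gt = torch.tensor([[1.0, 0.0], [0.0, 0.0]])
#   tp, fp, tn, fn = _posneg(pred, gt, 0.5)
#   assert ((tp + fp + tn + fn) == 1).all()
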

def sample_measures_for_threshold(pred, gt, mask, threshold):
    """Calculates counts on one single sample, for a specific threshold.

    Parameters
    ----------

    pred : torch.Tensor
        pixel-wise predictions

    gt : torch.Tensor
        ground-truth (annotations)

    mask : torch.Tensor
        region mask (used only if available). May be set to ``None``.

    threshold : float
        a particular threshold at which to calculate the performance
        measures


    Returns
    -------

    tp : int

    fp : int

    tn : int

    fn : int
    """

    tp_tensor, fp_tensor, tn_tensor, fn_tensor = _posneg(pred, gt, threshold)

    # if a mask is provided, consider only TP/FP/TN/FN **within** the region
    # of interest defined by the mask
    if mask is not None:
        antimask = torch.le(mask, 0.5)
        tp_tensor[antimask] = 0
        fp_tensor[antimask] = 0
        tn_tensor[antimask] = 0
        fn_tensor[antimask] = 0

    # reduce tensors to scalar counts
    tp_count = torch.sum(tp_tensor).item()
    fp_count = torch.sum(fp_tensor).item()
    tn_count = torch.sum(tn_tensor).item()
    fn_count = torch.sum(fn_tensor).item()

    return tp_count, fp_count, tn_count, fn_count

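# Usage sketch (hypothetical tensors): counting TP/FP/TN/FN for one sample
# at threshold 0.5, without a region-of-interest mask:
#
#   pred = torch.tensor([[0.9, 0.2], [0.6, 0.1]])
#   gt = torch.tensor([[1, 0], [1, 0]])
#   tp, fp, tn, fn = sample_measures_for_threshold(pred, gt, None, 0.5)
#   # -> (2, 0, 2, 0)
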

def _sample_measures(pred, gt, mask, steps):
    """Calculates measures on one single sample.

    Parameters
    ----------

    pred : torch.Tensor
        pixel-wise predictions

    gt : torch.Tensor
        ground-truth (annotations)

    mask : torch.Tensor
        region mask (used only if available). May be set to ``None``.

    steps : int
        number of steps to use for threshold analysis. The step size is
        calculated as ``1.0/steps``


    Returns
    -------

    measures : pandas.DataFrame

        A pandas dataframe with the following columns:

        * threshold: float
        * tp: int
        * fp: int
        * tn: int
        * fn: int
    """

    step_size = 1.0 / steps
    data = [
        (index, threshold)
        + sample_measures_for_threshold(pred, gt, mask, threshold)
        for index, threshold in enumerate(numpy.arange(0.0, 1.0, step_size))
    ]

    retval = pandas.DataFrame(
        data,
        columns=(
            "index",
            "threshold",
            "tp",
            "fp",
            "tn",
            "fn",
        ),
    )
    retval.set_index("index", inplace=True)
    return retval

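# For example (illustrative): with steps=4, thresholds 0.0, 0.25, 0.5 and
# 0.75 are evaluated, and the result carries one row per threshold:
#
#   df = _sample_measures(pred, gt, None, 4)
#   df.loc[2, "threshold"]  # -> 0.5
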

def _sample_analysis(
    img,
    pred,
    gt,
    mask,
    threshold,
    tp_color=(0, 255, 0),  # green
    fp_color=(0, 0, 255),  # blue
    fn_color=(255, 0, 0),  # red
    overlay=True,
):
    """Visualizes true positives, false positives and false negatives.

    Parameters
    ----------

    img : torch.Tensor
        original image

    pred : torch.Tensor
        pixel-wise predictions

    gt : torch.Tensor
        ground-truth (annotations)

    mask : torch.Tensor
        region mask (used only if available). May be set to ``None``.

    threshold : float
        The threshold to be used while analyzing this image's probability map

    tp_color : tuple
        RGB value for true positives

    fp_color : tuple
        RGB value for false positives

    fn_color : tuple
        RGB value for false negatives

    overlay : :py:class:`bool`, Optional
        If set to ``True`` (which is the default), then overlay annotations
        on top of the image. Otherwise, represent data on a black canvas.


    Returns
    -------

    figure : PIL.Image.Image

        A PIL image that contains the overlaid analysis of true-positives
        (TP), false-positives (FP) and false negatives (FN).
    """

    tp_tensor, fp_tensor, tn_tensor, fn_tensor = _posneg(pred, gt, threshold)

    # if a mask is provided, consider only TP/FP/TN/FN **within** the region
    # of interest defined by the mask
    if mask is not None:
        antimask = torch.le(mask, 0.5)
        tp_tensor[antimask] = 0
        fp_tensor[antimask] = 0
        tn_tensor[antimask] = 0
        fn_tensor[antimask] = 0

    # change to PIL representation
    tp_pil = VF.to_pil_image(tp_tensor.float())
    tp_pil_colored = PIL.ImageOps.colorize(tp_pil, (0, 0, 0), tp_color)

    fp_pil = VF.to_pil_image(fp_tensor.float())
    fp_pil_colored = PIL.ImageOps.colorize(fp_pil, (0, 0, 0), fp_color)

    fn_pil = VF.to_pil_image(fn_tensor.float())
    fn_pil_colored = PIL.ImageOps.colorize(fn_pil, (0, 0, 0), fn_color)

    tp_pil_colored.paste(fp_pil_colored, mask=fp_pil)
    tp_pil_colored.paste(fn_pil_colored, mask=fn_pil)

    if overlay:
        img = VF.to_pil_image(img)  # PIL Image
        # blending fades the original image being overlaid, otherwise its
        # brightness may obfuscate colors from the vessel map
        tp_pil_colored = PIL.Image.blend(img, tp_pil_colored, 0.5)

    return tp_pil_colored

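# Example use (illustrative): producing an overlay PNG for one sample, with
# TP in green, FP in blue and FN in red over the original image:
#
#   fig = _sample_analysis(img, pred, gt, None, threshold=0.5)
#   fig.save("analysis.png")
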

def _summarize(data):
    """Summarizes collected dataframes and adds Bayesian figures."""

    _entries = (
        "mean_precision",
        "mode_precision",
        "lower_precision",
        "upper_precision",
        "mean_recall",
        "mode_recall",
        "lower_recall",
        "upper_recall",
        "mean_specificity",
        "mode_specificity",
        "lower_specificity",
        "upper_specificity",
        "mean_accuracy",
        "mode_accuracy",
        "lower_accuracy",
        "upper_accuracy",
        "mean_jaccard",
        "mode_jaccard",
        "lower_jaccard",
        "upper_jaccard",
        "mean_f1_score",
        "mode_f1_score",
        "lower_f1_score",
        "upper_f1_score",
        "frequentist_precision",
        "frequentist_recall",
        "frequentist_specificity",
        "frequentist_accuracy",
        "frequentist_jaccard",
        "frequentist_f1_score",
    )

    def _row_summary(r):
        # run bayesian_measures(), flatten tuple of tuples, name entries
        bayesian = [
            item
            for sublist in bayesian_measures(
                r.tp,
                r.fp,
                r.tn,
                r.fn,
                lambda_=0.5,
                coverage=0.95,
            )
            for item in sublist
        ]

        # evaluate frequentist measures
        frequentist = base_measures(r.tp, r.fp, r.tn, r.fn)
        return pandas.Series(bayesian + list(frequentist), index=_entries)

    # merges all dataframes together, summing counts threshold by threshold
    sums = pandas.concat(data.values()).groupby("index").sum()
    sums["threshold"] /= len(data)

    # create a new dataframe with the summarized measures
    measures = sums.apply(lambda r: _row_summary(r), axis=1)

    # merge sums and measures into a single dataframe
    return pandas.concat([sums, measures.reindex(sums.index)], axis=1).copy()

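# Sketch of the expected input (hypothetical names): ``data`` maps sample
# stems to the per-sample dataframes produced by _sample_measures(); counts
# are summed across samples before the Bayesian and frequentist measures
# are derived:
#
#   summary = _summarize({"img1": df1, "img2": df2})
#   summary["mean_f1_score"].idxmax()  # index of the best threshold
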

def _evaluate_sample_worker(args):
    """Runs all of the evaluation steps on a single sample.

    Parameters
    ----------

    args : tuple
        A tuple containing the following sub-arguments:

        sample : tuple
            Sample to be processed, containing the stem of the filepath
            relative to the database root, the image, the ground-truth, and
            possibly the mask to define the region of interest to be
            processed.

        name : str
            the local name of the dataset (e.g. ``train``, or ``test``), to
            be used when saving measures files.

        steps : :py:class:`int`, Optional
            number of threshold steps to consider when evaluating
            thresholds.

        threshold : :py:class:`float`, Optional
            if ``overlayed_folder`` is set, then this should be the
            threshold (floating point) to apply to prediction maps to decide
            on positives and negatives for overlaying analysis (graphical
            output). This number should come from the training set or a
            separate validation set. Using a test set value may bias your
            analysis. This number is also used to print the a priori
            F1-score on the evaluated set.

        use_predictions_folder : str
            Folder where predictions for the dataset images have been
            previously stored

        output_folder : str, None
            If not ``None``, then outputs a copy of the evaluation for this
            sample in CSV format at this directory, but respecting the
            sample ``stem``.

        overlayed_folder : str, None
            If not ``None``, then outputs a version of the input image with
            predictions overlaid, in PNG format, but respecting the sample
            ``stem``.


    Returns
    -------

    stem : str
        The unique sample stem

    data : pandas.DataFrame
        Dataframe containing the evaluation performance on this single
        sample
    """

    (
        sample,
        name,
        steps,
        threshold,
        use_predictions_folder,
        output_folder,
        overlayed_folder,
    ) = args

    stem = sample[0]
    image = sample[1]
    gt = sample[2]
    mask = None if len(sample) <= 3 else sample[3]
    pred_fullpath = os.path.join(use_predictions_folder, stem + ".hdf5")
    with h5py.File(pred_fullpath, "r") as f:
        pred = f["array"][:]
    pred = torch.from_numpy(pred)
    retval = _sample_measures(pred, gt, mask, steps)

    if output_folder is not None:
        fullpath = os.path.join(output_folder, name, f"{stem}.csv")
        tqdm.write(f"Saving {fullpath}...")
        os.makedirs(os.path.dirname(fullpath), exist_ok=True)
        retval.to_csv(fullpath)

    if overlayed_folder is not None:
        overlay_image = _sample_analysis(
            image, pred, gt, mask, threshold=threshold, overlay=True
        )
        fullpath = os.path.join(overlayed_folder, name, f"{stem}.png")
        tqdm.write(f"Saving {fullpath}...")
        os.makedirs(os.path.dirname(fullpath), exist_ok=True)
        overlay_image.save(fullpath)

    return stem, retval

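# The worker takes a single packed tuple so it can be fed through
# multiprocessing.Pool.imap_unordered(). A hypothetical direct call:
#
#   stem, df = _evaluate_sample_worker(
#       (sample, "test", 1000, 0.5, "predictions/test", "output", None)
#   )
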

def run(
    dataset,
    name,
    predictions_folder,
    output_folder=None,
    overlayed_folder=None,
    threshold=None,
    steps=1000,
    parallel=-1,
):
    """Runs evaluation on pre-computed predictions and calculates measures.

    Parameters
    ----------

    dataset : :py:class:`torch.utils.data.Dataset`
        a dataset to iterate on

    name : str
        the local name of this dataset (e.g. ``train``, or ``test``), to be
        used when saving measures files.

    predictions_folder : str
        folder where predictions for the dataset images have been previously
        stored

    output_folder : :py:class:`str`, Optional
        folder where to store results. If not provided, then do not store
        any analysis (useful for quickly calculating overlay thresholds)

    overlayed_folder : :py:class:`str`, Optional
        if not ``None``, then it should be the name of a folder where to
        store overlaid versions of the images and ground-truths

    threshold : :py:class:`float`, Optional
        if ``overlayed_folder`` is set, then this should be the threshold
        (floating point) to apply to prediction maps to decide on positives
        and negatives for overlaying analysis (graphical output). This
        number should come from the training set or a separate validation
        set. Using a test set value may bias your analysis. This number is
        also used to print the a priori F1-score on the evaluated set.

    steps : :py:class:`int`, Optional
        number of threshold steps to consider when evaluating thresholds.

    parallel : :py:class:`int`, Optional
        If set to a value >= 0, uses multiprocessing for estimating measures
        for each sample through a processing pool. A value of zero will
        create as many processes in the pool as cores in the machine. A
        negative value disables multiprocessing altogether. A value greater
        than zero will spawn as many processes as requested.


    Returns
    -------

    threshold : float
        Threshold to achieve the highest possible F1-score for this dataset
    """

    # Collect overall measures
    data = {}

    use_predictions_folder = os.path.join(predictions_folder, name)
    if not os.path.exists(use_predictions_folder):
        use_predictions_folder = predictions_folder

    if parallel < 0:  # turns off multiprocessing
        for sample in tqdm(dataset, desc="sample"):
            k, v = _evaluate_sample_worker(
                (
                    sample,
                    name,
                    steps,
                    threshold,
                    use_predictions_folder,
                    output_folder,
                    overlayed_folder,
                )
            )
            data[k] = v
    else:
        parallel = parallel or multiprocessing.cpu_count()
        with multiprocessing.Pool(processes=parallel) as pool, tqdm(
            total=len(dataset),
            desc="sample",
        ) as pbar:
            for k, v in pool.imap_unordered(
                _evaluate_sample_worker,
                zip(
                    dataset,
                    itertools.repeat(name),
                    itertools.repeat(steps),
                    itertools.repeat(threshold),
                    itertools.repeat(use_predictions_folder),
                    itertools.repeat(output_folder),
                    itertools.repeat(overlayed_folder),
                ),
            ):
                pbar.update()
                data[k] = v

    # Merges all dataframes together
    measures = _summarize(data)

    maxf1 = measures["mean_f1_score"].max()
    maxf1_index = measures["mean_f1_score"].idxmax()
    maxf1_threshold = measures["threshold"][maxf1_index]

    logger.info(
        f"Maximum F1-score of {maxf1:.5f}, achieved at "
        f"threshold {maxf1_threshold:.3f} (chosen *a posteriori*)"
    )

    if threshold is not None:
        # get the closest possible threshold we have
        index = int(round(steps * threshold))
        f1_a_priori = measures["mean_f1_score"][index]
        actual_threshold = measures["threshold"][index]

        # mark the threshold chosen a priori on this dataset
        measures["threshold_a_priori"] = False
        measures.loc[index, "threshold_a_priori"] = True

        logger.info(
            f"F1-score of {f1_a_priori:.5f}, at threshold "
            f"{actual_threshold:.3f} (chosen *a priori*)"
        )

    if output_folder is not None:
        logger.info(f"Output folder: {output_folder}")
        os.makedirs(output_folder, exist_ok=True)
        measures_path = os.path.join(output_folder, f"{name}.csv")
        logger.info(
            f"Saving measures over all input images at {measures_path}..."
        )
        measures.to_csv(measures_path)

    return maxf1_threshold

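# Typical use (paths and dataset names are hypothetical): pick the best
# threshold on a validation split, then evaluate the test split with it:
#
#   best = run(valid_set, "validation", "predictions")
#   run(test_set, "test", "predictions", output_folder="analysis",
#       overlayed_folder="overlays", threshold=best)
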

def _compare_annotators_worker(args):
    """Runs all of the comparison steps on a single sample pair.

    Parameters
    ----------

    args : tuple
        A tuple containing the following sub-arguments:

        baseline_sample : tuple
            Baseline sample to be processed, containing the stem of the
            filepath relative to the database root, the image, the
            ground-truth, and possibly the mask to define the region of
            interest to be processed.

        other_sample : tuple
            Another sample that is identical to the first, but has a
            different mask (drawn by a different annotator)

        name : str
            the local name of the dataset (e.g. ``train``, or ``test``), to
            be used when saving measures files.

        output_folder : str, None
            If not ``None``, then outputs a copy of the evaluation for this
            sample in CSV format at this directory, but respecting the
            sample ``stem``.

        overlayed_folder : str, None
            If not ``None``, then outputs a version of the input image with
            predictions overlaid, in PNG format, but respecting the sample
            ``stem``.


    Returns
    -------

    stem : str
        The unique sample stem

    data : pandas.DataFrame
        Dataframe containing the evaluation performance on this single
        sample
    """

    (
        baseline_sample,
        other_sample,
        name,
        output_folder,
        overlayed_folder,
    ) = args

    assert baseline_sample[0] == other_sample[0], (
        f"Mismatch between datasets for second-annotator analysis "
        f"({baseline_sample[0]} != {other_sample[0]}). This typically "
        f"occurs when the second annotator (`other`) comes from a different "
        f"dataset than the `baseline` dataset"
    )

    stem = baseline_sample[0]
    image = baseline_sample[1]
    gt = baseline_sample[2]
    pred = other_sample[2]  # works as a prediction
    mask = None if len(baseline_sample) < 4 else baseline_sample[3]
    retval = _sample_measures(pred, gt, mask, 2)

    if output_folder is not None:
        fullpath = os.path.join(
            output_folder, "second-annotator", name, f"{stem}.csv"
        )
        tqdm.write(f"Saving {fullpath}...")
        os.makedirs(os.path.dirname(fullpath), exist_ok=True)
        retval.to_csv(fullpath)

    if overlayed_folder is not None:
        overlay_image = _sample_analysis(
            image, pred, gt, mask, threshold=0.5, overlay=True
        )
        fullpath = os.path.join(
            overlayed_folder, "second-annotator", name, f"{stem}.png"
        )
        tqdm.write(f"Saving {fullpath}...")
        os.makedirs(os.path.dirname(fullpath), exist_ok=True)
        overlay_image.save(fullpath)

    return stem, retval

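# Note (illustrative): the second annotation is treated as a binary
# "prediction", so only steps=2 thresholds (0.0 and 0.5) are computed; the
# row at threshold 0.5 carries the agreement figures, e.g.:
#
#   stem, df = _compare_annotators_worker(
#       (baseline_sample, other_sample, "test", None, None)
#   )
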

def compare_annotators(
    baseline,
    other,
    name,
    output_folder,
    overlayed_folder=None,
    parallel=-1,
):
    """Compares annotations on the **same** dataset.

    Parameters
    ----------

    baseline : :py:class:`torch.utils.data.Dataset`
        a dataset to iterate on, containing the baseline annotations

    other : :py:class:`torch.utils.data.Dataset`
        a second dataset, with the same samples as ``baseline``, but
        annotated by a different annotator than in the first dataset. The
        key values must match between ``baseline`` and this dataset.

    name : str
        the local name of this dataset (e.g. ``train-second-annotator``, or
        ``test-second-annotator``), to be used when saving measures files.

    output_folder : str
        folder where to store results

    overlayed_folder : :py:class:`str`, Optional
        if not ``None``, then it should be the name of a folder where to
        store overlaid versions of the images and ground-truths

    parallel : :py:class:`int`, Optional
        If set to a value >= 0, uses multiprocessing for estimating measures
        for each sample through a processing pool. A value of zero will
        create as many processes in the pool as cores in the machine. A
        negative value disables multiprocessing altogether. A value greater
        than zero will spawn as many processes as requested.
    """

    logger.info(f"Output folder: {output_folder}")
    os.makedirs(output_folder, exist_ok=True)

    # Collect overall measures
    data = {}

    if parallel < 0:  # turns off multiprocessing
        for baseline_sample, other_sample in tqdm(
            list(zip(baseline, other)),
            desc="samples",
            leave=False,
            disable=None,
        ):
            k, v = _compare_annotators_worker(
                (
                    baseline_sample,
                    other_sample,
                    name,
                    output_folder,
                    overlayed_folder,
                )
            )
            data[k] = v
    else:
        parallel = parallel or multiprocessing.cpu_count()
        with multiprocessing.Pool(processes=parallel) as pool, tqdm(
            total=len(baseline),
            desc="sample",
        ) as pbar:
            for k, v in pool.imap_unordered(
                _compare_annotators_worker,
                zip(
                    baseline,
                    other,
                    itertools.repeat(name),
                    itertools.repeat(output_folder),
                    itertools.repeat(overlayed_folder),
                ),
            ):
                pbar.update()
                data[k] = v

    measures = _summarize(data)
    measures.drop(0, inplace=True)  # removes threshold == 0.0, keeps 0.5 only

    measures_path = os.path.join(
        output_folder, "second-annotator", f"{name}.csv"
    )
    os.makedirs(os.path.dirname(measures_path), exist_ok=True)
    logger.info(
        f"Saving summaries over all input images at {measures_path}..."
    )
    measures.to_csv(measures_path)

    maxf1 = measures["mean_f1_score"].max()
    logger.info(f"F1-score of {maxf1:.5f} (second annotator; threshold=0.5)")
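
# Typical use (dataset names are hypothetical): measuring inter-annotator
# agreement on a split that ships a second set of manual annotations:
#
#   compare_annotators(first_annotator_set, second_annotator_set,
#                      "test-second-annotator", "analysis")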