Coverage for src/deepdraw/engine/evaluator.py: 99%
172 statements
# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-License-Identifier: GPL-3.0-or-later

"""Defines functionality for the evaluation of predictions."""

import itertools
import logging
import multiprocessing
import os

import h5py
import numpy
import pandas
import PIL.Image
import PIL.ImageOps
import torch
import torch.nn.functional
import torchvision.transforms.functional as VF

from tqdm import tqdm

from ..utils.measure import base_measures, bayesian_measures

logger = logging.getLogger(__name__)


def _posneg(pred, gt, threshold):
    """Calculates true and false positives and negatives.

    Parameters
    ----------

    pred : torch.Tensor
        pixel-wise predictions

    gt : torch.Tensor
        ground-truth (annotations)

    threshold : float
        a particular threshold at which to calculate the performance
        measures


    Returns
    -------

    tp_tensor : torch.Tensor
        boolean tensor with true positives, considering all observations

    fp_tensor : torch.Tensor
        boolean tensor with false positives, considering all observations

    tn_tensor : torch.Tensor
        boolean tensor with true negatives, considering all observations

    fn_tensor : torch.Tensor
        boolean tensor with false negatives, considering all observations
    """

    gt = gt.byte()  # byte tensor

    # threshold
    binary_pred = torch.gt(pred, threshold).byte()

    # equals and not-equals
    equals = torch.eq(binary_pred, gt).type(torch.uint8)  # tensor
    notequals = torch.ne(binary_pred, gt).type(torch.uint8)  # tensor

    # true positives
    tp_tensor = gt * binary_pred

    # false positives
    fp_tensor = torch.eq((binary_pred + tp_tensor), 1).byte()

    # true negatives
    tn_tensor = equals - tp_tensor

    # false negatives
    fn_tensor = notequals - fp_tensor

    return tp_tensor, fp_tensor, tn_tensor, fn_tensor
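
# A minimal sketch of how _posneg() decomposes a thresholded prediction into
# TP/FP/TN/FN tensors (hypothetical values; kept as a comment so nothing runs
# at import time):
#
#     pred = torch.tensor([0.9, 0.2, 0.7, 0.1])
#     gt = torch.tensor([1, 0, 0, 1])
#     tp, fp, tn, fn = _posneg(pred, gt, 0.5)
#     # tp -> [1, 0, 0, 0], fp -> [0, 0, 1, 0]
#     # tn -> [0, 1, 0, 0], fn -> [0, 0, 0, 1]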


def sample_measures_for_threshold(pred, gt, mask, threshold):
    """Calculates counts on one single sample, for a specific threshold.

    Parameters
    ----------

    pred : torch.Tensor
        pixel-wise predictions

    gt : torch.Tensor
        ground-truth (annotations)

    mask : torch.Tensor
        region mask (used only if available). May be set to ``None``.

    threshold : float
        a particular threshold at which to calculate the performance
        measures


    Returns
    -------

    tp : int
        number of true positives

    fp : int
        number of false positives

    tn : int
        number of true negatives

    fn : int
        number of false negatives
    """

    tp_tensor, fp_tensor, tn_tensor, fn_tensor = _posneg(pred, gt, threshold)

    # if a mask is provided, consider only TP/FP/TN/FN **within** the region
    # of interest defined by the mask
    if mask is not None:
        antimask = torch.le(mask, 0.5)
        tp_tensor[antimask] = 0
        fp_tensor[antimask] = 0
        tn_tensor[antimask] = 0
        fn_tensor[antimask] = 0

    # reduce the (masked) tensors to scalar counts
    tp_count = torch.sum(tp_tensor).item()
    fp_count = torch.sum(fp_tensor).item()
    tn_count = torch.sum(tn_tensor).item()
    fn_count = torch.sum(fn_tensor).item()

    return tp_count, fp_count, tn_count, fn_count
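
# A small sketch of counting at a fixed threshold with a region-of-interest
# mask (hypothetical tensors, not part of this module):
#
#     pred = torch.tensor([[0.9, 0.2], [0.7, 0.1]])
#     gt = torch.tensor([[1, 0], [0, 1]])
#     mask = torch.tensor([[1.0, 1.0], [0.0, 0.0]])  # bottom row is ignored
#     tp, fp, tn, fn = sample_measures_for_threshold(pred, gt, mask, 0.5)
#     # only the top row is counted: tp == 1, fp == 0, tn == 1, fn == 0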


def _sample_measures(pred, gt, mask, steps):
    """Calculates measures on one single sample.

    Parameters
    ----------

    pred : torch.Tensor
        pixel-wise predictions

    gt : torch.Tensor
        ground-truth (annotations)

    mask : torch.Tensor
        region mask (used only if available). May be set to ``None``.

    steps : int
        number of steps to use for threshold analysis. The step size is
        computed as ``1.0/steps``


    Returns
    -------

    measures : pandas.DataFrame

        A pandas dataframe with the following columns:

        * threshold: float
        * tp: int
        * fp: int
        * tn: int
        * fn: int
    """

    step_size = 1.0 / steps
    data = [
        (index, threshold)
        + sample_measures_for_threshold(pred, gt, mask, threshold)
        for index, threshold in enumerate(numpy.arange(0.0, 1.0, step_size))
    ]

    retval = pandas.DataFrame(
        data,
        columns=(
            "index",
            "threshold",
            "tp",
            "fp",
            "tn",
            "fn",
        ),
    )
    retval.set_index("index", inplace=True)
    return retval
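
# _sample_measures() sweeps numpy.arange(0.0, 1.0, 1.0/steps), so e.g.
# steps=4 evaluates thresholds 0.00, 0.25, 0.50 and 0.75. A sketch of the
# resulting frame (hypothetical inputs):
#
#     df = _sample_measures(pred, gt, None, 4)
#     # df has 4 rows indexed 0..3, with columns
#     # "threshold", "tp", "fp", "tn" and "fn"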


def _sample_analysis(
    img,
    pred,
    gt,
    mask,
    threshold,
    tp_color=(0, 255, 0),  # green (alternative: (128, 128, 128) gray)
    fp_color=(0, 0, 255),  # blue (alternative: (70, 240, 240) cyan)
    fn_color=(255, 0, 0),  # red (alternative: (245, 130, 48) orange)
    overlay=True,
):
202 """Visualizes true positives, false positives and false negatives.
204 Parameters
205 ----------
207 img : torch.Tensor
208 original image
210 pred : torch.Tensor
211 pixel-wise predictions
213 gt : torch.Tensor
214 ground-truth (annotations)
216 mask : torch.Tensor
217 region mask (used only if available). May be set to ``None``.
219 threshold : float
220 The threshold to be used while analyzing this image's probability map
222 tp_color : tuple
223 RGB value for true positives
225 fp_color : tuple
226 RGB value for false positives
228 fn_color : tuple
229 RGB value for false negatives
231 overlay : :py:class:`bool`, Optional
232 If set to ``True`` (which is the default), then overlay annotations on
233 top of the image. Otherwise, represent data on a black canvas.
236 Returns
237 -------
239 figure : PIL.Image.Image
241 A PIL image that contains the overlayed analysis of true-positives
242 (TP), false-positives (FP) and false negatives (FN).
243 """

    tp_tensor, fp_tensor, tn_tensor, fn_tensor = _posneg(pred, gt, threshold)

    # if a mask is provided, consider only TP/FP/TN/FN **within** the region
    # of interest defined by the mask
    if mask is not None:
        antimask = torch.le(mask, 0.5)
        tp_tensor[antimask] = 0
        fp_tensor[antimask] = 0
        tn_tensor[antimask] = 0
        fn_tensor[antimask] = 0

    # change to PIL representation
    tp_pil = VF.to_pil_image(tp_tensor.float())
    tp_pil_colored = PIL.ImageOps.colorize(tp_pil, (0, 0, 0), tp_color)

    fp_pil = VF.to_pil_image(fp_tensor.float())
    fp_pil_colored = PIL.ImageOps.colorize(fp_pil, (0, 0, 0), fp_color)

    fn_pil = VF.to_pil_image(fn_tensor.float())
    fn_pil_colored = PIL.ImageOps.colorize(fn_pil, (0, 0, 0), fn_color)

    tp_pil_colored.paste(fp_pil_colored, mask=fp_pil)
    tp_pil_colored.paste(fn_pil_colored, mask=fn_pil)

    if overlay:
        img = VF.to_pil_image(img)  # PIL Image
        # blend fades the original image into the overlay; otherwise its
        # brightness may obscure the colors of the vessel map
        tp_pil_colored = PIL.Image.blend(img, tp_pil_colored, 0.5)

    return tp_pil_colored
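
# A hedged sketch of producing one overlay image (hypothetical tensors, in
# the usual CxHxW torch convention):
#
#     viz = _sample_analysis(img, pred, gt, mask, threshold=0.5)
#     viz.save("analysis.png")  # TP in green, FP in blue, FN in red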


def _summarize(data):
    """Summarizes collected dataframes and adds Bayesian figures."""

    _entries = (
        "mean_precision",
        "mode_precision",
        "lower_precision",
        "upper_precision",
        "mean_recall",
        "mode_recall",
        "lower_recall",
        "upper_recall",
        "mean_specificity",
        "mode_specificity",
        "lower_specificity",
        "upper_specificity",
        "mean_accuracy",
        "mode_accuracy",
        "lower_accuracy",
        "upper_accuracy",
        "mean_jaccard",
        "mode_jaccard",
        "lower_jaccard",
        "upper_jaccard",
        "mean_f1_score",
        "mode_f1_score",
        "lower_f1_score",
        "upper_f1_score",
        "frequentist_precision",
        "frequentist_recall",
        "frequentist_specificity",
        "frequentist_accuracy",
        "frequentist_jaccard",
        "frequentist_f1_score",
    )

    def _row_summary(r):
        # run bayesian_measures(), flatten tuple of tuples, name entries
        bayesian = [
            item
            for sublist in bayesian_measures(
                r.tp,
                r.fp,
                r.tn,
                r.fn,
                lambda_=0.5,
                coverage=0.95,
            )
            for item in sublist
        ]

        # evaluate frequentist measures
        frequentist = base_measures(r.tp, r.fp, r.tn, r.fn)
        return pandas.Series(bayesian + list(frequentist), index=_entries)

    # Merges all dataframes together
    sums = pandas.concat(data.values()).groupby("index").sum()
    sums["threshold"] /= len(data)

    # create a new dataframe with these
    measures = sums.apply(lambda r: _row_summary(r), axis=1)

    # merge sums and measures into a single dataframe
    return pandas.concat([sums, measures.reindex(sums.index)], axis=1).copy()
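
# _summarize() expects a dict mapping sample stems to the per-sample frames
# produced by _sample_measures(): counts are summed per threshold index,
# thresholds are averaged, and Bayesian plus frequentist figures appended.
# A sketch (hypothetical inputs):
#
#     summary = _summarize({"img1": df1, "img2": df2})
#     # one row per threshold, with tp/fp/tn/fn sums plus mean/mode/lower/
#     # upper and frequentist columns for each measure in _entries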


def _evaluate_sample_worker(args):
    """Runs all of the evaluation steps on a single sample.

    Parameters
    ----------

    args : tuple
        A tuple containing the following sub-arguments:

        sample : tuple
            Sample to be processed, containing the stem of the filepath
            relative to the database root, the image, the ground-truth, and
            possibly the mask to define the region of interest to be
            processed.

        name : str
            the local name of the dataset (e.g. ``train``, or ``test``), to
            be used when saving measures files.

        steps : :py:class:`int`, Optional
            number of threshold steps to consider when evaluating thresholds.

        threshold : :py:class:`float`, Optional
            if ``overlayed_folder`` is set, then this should be the threshold
            (floating point) to apply to prediction maps to decide on
            positives and negatives for overlaying analysis (graphical
            output). This number should come from the training set or a
            separate validation set. Using a test set value may bias your
            analysis. This number is also used to print the a priori
            F1-score on the evaluated set.

        use_predictions_folder : str
            Folder where predictions for the dataset images have been
            previously stored

        output_folder : str, None
            If not ``None``, then outputs a copy of the evaluation for this
            sample in CSV format at this directory, but respecting the
            sample ``stem``.

        overlayed_folder : str, None
            If not ``None``, then outputs a version of the input image with
            predictions overlaid, in PNG format, but respecting the sample
            ``stem``.


    Returns
    -------

    stem : str
        The unique sample stem

    data : pandas.DataFrame
        Dataframe containing the evaluation performance on this single
        sample
    """

    (
        sample,
        name,
        steps,
        threshold,
        use_predictions_folder,
        output_folder,
        overlayed_folder,
    ) = args

    stem = sample[0]
    image = sample[1]
    gt = sample[2]
    mask = None if len(sample) <= 3 else sample[3]
    pred_fullpath = os.path.join(use_predictions_folder, stem + ".hdf5")
    with h5py.File(pred_fullpath, "r") as f:
        pred = f["array"][:]
    pred = torch.from_numpy(pred)
    retval = _sample_measures(pred, gt, mask, steps)

    if output_folder is not None:
        fullpath = os.path.join(output_folder, name, f"{stem}.csv")
        tqdm.write(f"Saving {fullpath}...")
        os.makedirs(os.path.dirname(fullpath), exist_ok=True)
        retval.to_csv(fullpath)

    if overlayed_folder is not None:
        overlay_image = _sample_analysis(
            image, pred, gt, mask, threshold=threshold, overlay=True
        )
        fullpath = os.path.join(overlayed_folder, name, f"{stem}.png")
        tqdm.write(f"Saving {fullpath}...")
        os.makedirs(os.path.dirname(fullpath), exist_ok=True)
        overlay_image.save(fullpath)

    return stem, retval
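
# The worker takes a single packed tuple so it can be fed straight through
# multiprocessing.Pool.imap_unordered(). A sketch (hypothetical values):
#
#     stem, df = _evaluate_sample_worker(
#         (sample, "test", 1000, None, "predictions/test", None, None)
#     )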


def run(
    dataset,
    name,
    predictions_folder,
    output_folder=None,
    overlayed_folder=None,
    threshold=None,
    steps=1000,
    parallel=-1,
):
446 """Runs inference and calculates measures.
448 Parameters
449 ---------
451 dataset : py:class:`torch.utils.data.Dataset`
452 a dataset to iterate on
454 name : str
455 the local name of this dataset (e.g. ``train``, or ``test``), to be
456 used when saving measures files.
458 predictions_folder : str
459 folder where predictions for the dataset images have been previously
460 stored
462 output_folder : :py:class:`str`, Optional
463 folder where to store results. If not provided, then do not store any
464 analysis (useful for quickly calculating overlay thresholds)
466 overlayed_folder : :py:class:`str`, Optional
467 if not ``None``, then it should be the name of a folder where to store
468 overlayed versions of the images and ground-truths
470 threshold : :py:class:`float`, Optional
471 if ``overlayed_folder``, then this should be threshold (floating point)
472 to apply to prediction maps to decide on positives and negatives for
473 overlaying analysis (graphical output). This number should come from
474 the training set or a separate validation set. Using a test set value
475 may bias your analysis. This number is also used to print the a priori
476 F1-score on the evaluated set.
478 steps : :py:class:`float`, Optional
479 number of threshold steps to consider when evaluating thresholds.
481 parallel : :py:class:`int`, Optional
482 If set to a value different >= 0, uses multiprocessing for estimating
483 thresholds for each sample through a processing pool. A value of zero
484 will create as many processes in the pool as cores in the machine. A
485 negative value disables multiprocessing altogether. A value greater
486 than zero will spawn as many processes as requested.
489 Returns
490 -------
492 threshold : float
493 Threshold to achieve the highest possible F1-score for this dataset
494 """

    # Collect overall measures
    data = {}

    use_predictions_folder = os.path.join(predictions_folder, name)
    if not os.path.exists(use_predictions_folder):
        use_predictions_folder = predictions_folder

    if parallel < 0:  # turns off multiprocessing
        for sample in tqdm(dataset, desc="sample"):
            k, v = _evaluate_sample_worker(
                (
                    sample,
                    name,
                    steps,
                    threshold,
                    use_predictions_folder,
                    output_folder,
                    overlayed_folder,
                )
            )
            data[k] = v
    else:
        parallel = parallel or multiprocessing.cpu_count()
        with multiprocessing.Pool(processes=parallel) as pool, tqdm(
            total=len(dataset),
            desc="sample",
        ) as pbar:
            for k, v in pool.imap_unordered(
                _evaluate_sample_worker,
                zip(
                    dataset,
                    itertools.repeat(name),
                    itertools.repeat(steps),
                    itertools.repeat(threshold),
                    itertools.repeat(use_predictions_folder),
                    itertools.repeat(output_folder),
                    itertools.repeat(overlayed_folder),
                ),
            ):
                pbar.update()
                data[k] = v

    # Merges all dataframes together
    measures = _summarize(data)

    maxf1 = measures["mean_f1_score"].max()
    maxf1_index = measures["mean_f1_score"].idxmax()
    maxf1_threshold = measures["threshold"][maxf1_index]

    logger.info(
        f"Maximum F1-score of {maxf1:.5f}, achieved at "
        f"threshold {maxf1_threshold:.3f} (chosen *a posteriori*)"
    )

    if threshold is not None:
        # get the closest possible threshold we have
        index = int(round(steps * threshold))
        f1_a_priori = measures["mean_f1_score"][index]
        actual_threshold = measures["threshold"][index]

        # mark the threshold chosen a priori on this dataset
        measures["threshold_a_priori"] = False
        measures.loc[index, "threshold_a_priori"] = True

        logger.info(
            f"F1-score of {f1_a_priori:.5f}, at threshold "
            f"{actual_threshold:.3f} (chosen *a priori*)"
        )

    if output_folder is not None:
        logger.info(f"Output folder: {output_folder}")
        os.makedirs(output_folder, exist_ok=True)
        measures_path = os.path.join(output_folder, f"{name}.csv")
        logger.info(
            f"Saving measures over all input images at {measures_path}..."
        )
        measures.to_csv(measures_path)

    return maxf1_threshold
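
# A hedged usage sketch for run() (hypothetical dataset and paths):
#
#     best = run(dataset, "train", "predictions",
#                output_folder="analysis", steps=1000, parallel=0)
#     # "best" is the threshold maximizing the mean F1-score on "train";
#     # it can be passed as "threshold" when evaluating the test set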


def _compare_annotators_worker(args):
    """Runs all of the comparison steps on a single sample pair.

    Parameters
    ----------

    args : tuple
        A tuple containing the following sub-arguments:

        baseline_sample : tuple
            Baseline sample to be processed, containing the stem of the
            filepath relative to the database root, the image, the
            ground-truth, and possibly the mask to define the region of
            interest to be processed.

        other_sample : tuple
            Another sample that is identical to the first, but carries a
            ground-truth annotation drawn by a different annotator

        name : str
            the local name of the dataset (e.g. ``train``, or ``test``), to
            be used when saving measures files.

        output_folder : str, None
            If not ``None``, then outputs a copy of the evaluation for this
            sample in CSV format at this directory, but respecting the
            sample ``stem``.

        overlayed_folder : str, None
            If not ``None``, then outputs a version of the input image with
            predictions overlaid, in PNG format, but respecting the sample
            ``stem``.


    Returns
    -------

    stem : str
        The unique sample stem

    data : pandas.DataFrame
        Dataframe containing the evaluation performance on this single
        sample
    """

    (
        baseline_sample,
        other_sample,
        name,
        output_folder,
        overlayed_folder,
    ) = args

    assert baseline_sample[0] == other_sample[0], (
        f"Mismatch between datasets for second-annotator analysis "
        f"({baseline_sample[0]} != {other_sample[0]}). This typically "
        f"occurs when the second annotator (`other`) comes from a "
        f"different dataset than the `baseline` dataset"
    )

    stem = baseline_sample[0]
    image = baseline_sample[1]
    gt = baseline_sample[2]
    pred = other_sample[2]  # works as a prediction
    mask = None if len(baseline_sample) < 4 else baseline_sample[3]
    retval = _sample_measures(pred, gt, mask, 2)

    if output_folder is not None:
        fullpath = os.path.join(
            output_folder, "second-annotator", name, f"{stem}.csv"
        )
        tqdm.write(f"Saving {fullpath}...")
        os.makedirs(os.path.dirname(fullpath), exist_ok=True)
        retval.to_csv(fullpath)

    if overlayed_folder is not None:
        overlay_image = _sample_analysis(
            image, pred, gt, mask, threshold=0.5, overlay=True
        )
        fullpath = os.path.join(
            overlayed_folder, "second-annotator", name, f"{stem}.png"
        )
        tqdm.write(f"Saving {fullpath}...")
        os.makedirs(os.path.dirname(fullpath), exist_ok=True)
        overlay_image.save(fullpath)

    return stem, retval
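
# Note: this worker calls _sample_measures(pred, gt, mask, 2), so only the
# thresholds 0.0 and 0.5 are evaluated; since annotations are binary, 0.5 is
# the meaningful operating point (compare_annotators() later drops the 0.0
# row).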


def compare_annotators(
    baseline,
    other,
    name,
    output_folder,
    overlayed_folder=None,
    parallel=-1,
):
673 """Compares annotations on the **same** dataset.
675 Parameters
676 ---------
678 baseline : py:class:`torch.utils.data.Dataset`
679 a dataset to iterate on, containing the baseline annotations
681 other : py:class:`torch.utils.data.Dataset`
682 a second dataset, with the same samples as ``baseline``, but annotated
683 by a different annotator than in the first dataset. The key values
684 must much between ``baseline`` and this dataset.
686 name : str
687 the local name of this dataset (e.g. ``train-second-annotator``, or
688 ``test-second-annotator``), to be used when saving measures files.
690 output_folder : str
691 folder where to store results
693 overlayed_folder : :py:class:`str`, Optional
694 if not ``None``, then it should be the name of a folder where to store
695 overlayed versions of the images and ground-truths
697 parallel : :py:class:`int`, Optional
698 If set to a value different >= 0, uses multiprocessing for estimating
699 thresholds for each sample through a processing pool. A value of zero
700 will create as many processes in the pool as cores in the machine. A
701 negative value disables multiprocessing altogether. A value greater
702 than zero will spawn as many processes as requested.
703 """

    logger.info(f"Output folder: {output_folder}")
    os.makedirs(output_folder, exist_ok=True)

    # Collect overall measures
    data = {}

    if parallel < 0:  # turns off multiprocessing
        for baseline_sample, other_sample in tqdm(
            list(zip(baseline, other)),
            desc="samples",
            leave=False,
            disable=None,
        ):
            k, v = _compare_annotators_worker(
                (
                    baseline_sample,
                    other_sample,
                    name,
                    output_folder,
                    overlayed_folder,
                )
            )
            data[k] = v
    else:
        parallel = parallel or multiprocessing.cpu_count()
        with multiprocessing.Pool(processes=parallel) as pool, tqdm(
            total=len(baseline),
            desc="sample",
        ) as pbar:
            for k, v in pool.imap_unordered(
                _compare_annotators_worker,
                zip(
                    baseline,
                    other,
                    itertools.repeat(name),
                    itertools.repeat(output_folder),
                    itertools.repeat(overlayed_folder),
                ),
            ):
                pbar.update()
                data[k] = v

    measures = _summarize(data)
    measures.drop(0, inplace=True)  # removes threshold == 0.0, keeps 0.5 only

    measures_path = os.path.join(
        output_folder, "second-annotator", f"{name}.csv"
    )
    os.makedirs(os.path.dirname(measures_path), exist_ok=True)
    logger.info(f"Saving summaries over all input images at {measures_path}...")
    measures.to_csv(measures_path)

    maxf1 = measures["mean_f1_score"].max()
    logger.info(f"F1-score of {maxf1:.5f} (second annotator; threshold=0.5)")