Source code for bob.ip.binseg.engine.evaluator

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""Defines functionality for the evaluation of predictions"""

import logging
import os

import h5py
import numpy
import pandas
import PIL
import torch
import torch.nn.functional
import torchvision.transforms.functional as VF

from tqdm import tqdm

from ..utils.measure import base_measures, bayesian_measures

logger = logging.getLogger(__name__)


def _posneg(pred, gt, threshold):
    """Calculates true and false positives and negatives


    Parameters
    ----------

    pred : torch.Tensor
        pixel-wise predictions

    gt : torch.Tensor
        ground-truth (annotations)

    threshold : float
        a particular threshold in which to calculate the performance
        measures


    Returns
    -------

    tp_tensor : torch.Tensor
        boolean tensor with true positives, considering all observations

    fp_tensor : torch.Tensor
        boolean tensor with false positives, considering all observations

    tn_tensor : torch.Tensor
        boolean tensor with true negatives, considering all observations

    fn_tensor : torch.Tensor
        boolean tensor with false negatives, considering all observations

    """

    gt = gt.byte()  # byte tensor

    # threshold
    binary_pred = torch.gt(pred, threshold).byte()

    # equals and not-equals
    equals = torch.eq(binary_pred, gt).type(torch.uint8)  # tensor
    notequals = torch.ne(binary_pred, gt).type(torch.uint8)  # tensor

    # true positives
    tp_tensor = gt * binary_pred

    # false positives
    fp_tensor = torch.eq((binary_pred + tp_tensor), 1).byte()

    # true negatives
    tn_tensor = equals - tp_tensor

    # false negatives
    fn_tensor = notequals - fp_tensor

    return tp_tensor, fp_tensor, tn_tensor, fn_tensor


[docs]def sample_measures_for_threshold(pred, gt, mask, threshold):
    """
    Calculates counts on one single sample, for a specific threshold


    Parameters
    ----------

    pred : torch.Tensor
        pixel-wise predictions

    gt : torch.Tensor
        ground-truth (annotations)

    mask : torch.Tensor
        region mask (used only if available).  May be set to ``None``.

    threshold : float
        a particular threshold in which to calculate the performance
        measures


    Returns
    -------

    tp : int

    fp : int

    tn : int

    fn : int

    """

    tp_tensor, fp_tensor, tn_tensor, fn_tensor = _posneg(pred, gt, threshold)

    # if a mask is provided, consider only TP/FP/TN/FN **within** the region of
    # interest defined by the mask
    if mask is not None:
        antimask = torch.le(mask, 0.5)
        tp_tensor[antimask] = 0
        fp_tensor[antimask] = 0
        tn_tensor[antimask] = 0
        fn_tensor[antimask] = 0

    # calc measures from scalars
    tp_count = torch.sum(tp_tensor).item()
    fp_count = torch.sum(fp_tensor).item()
    tn_count = torch.sum(tn_tensor).item()
    fn_count = torch.sum(fn_tensor).item()

    return tp_count, fp_count, tn_count, fn_count


def _sample_measures(pred, gt, mask, steps):
    """
    Calculates measures on one single sample


    Parameters
    ----------

    pred : torch.Tensor
        pixel-wise predictions

    gt : torch.Tensor
        ground-truth (annotations)

    mask : torch.Tensor
        region mask (used only if available).  May be set to ``None``.

    steps : int
        number of steps to use for threshold analysis.  The step size is
        calculated from this by dividing ``1.0/steps``


    Returns
    -------

    measures : pandas.DataFrame

        A pandas dataframe with the following columns:

        * tp: int
        * fp: int
        * tn: int
        * fn: int

    """

    step_size = 1.0 / steps
    data = [
        (index, threshold)
        + sample_measures_for_threshold(pred, gt, mask, threshold)
        for index, threshold in enumerate(numpy.arange(0.0, 1.0, step_size))
    ]

    retval = pandas.DataFrame(
        data,
        columns=(
            "index",
            "threshold",
            "tp",
            "fp",
            "tn",
            "fn",
        ),
    )
    retval.set_index("index", inplace=True)
    return retval


def _sample_analysis(
    img,
    pred,
    gt,
    mask,
    threshold,
    tp_color=(0, 255, 0),  # (128,128,128) Gray
    fp_color=(0, 0, 255),  # (70, 240, 240) Cyan
    fn_color=(255, 0, 0),  # (245, 130, 48) Orange
    overlay=True,
):
    """Visualizes true positives, false positives and false negatives


    Parameters
    ----------

    img : torch.Tensor
        original image

    pred : torch.Tensor
        pixel-wise predictions

    gt : torch.Tensor
        ground-truth (annotations)

    mask : torch.Tensor
        region mask (used only if available).  May be set to ``None``.

    threshold : float
        The threshold to be used while analyzing this image's probability map

    tp_color : tuple
        RGB value for true positives

    fp_color : tuple
        RGB value for false positives

    fn_color : tuple
        RGB value for false negatives

    overlay : :py:class:`bool`, Optional
        If set to ``True`` (which is the default), then overlay annotations on
        top of the image.  Otherwise, represent data on a black canvas.


    Returns
    -------

    figure : PIL.Image.Image

        A PIL image that contains the overlayed analysis of true-positives
        (TP), false-positives (FP) and false negatives (FN).

    """

    tp_tensor, fp_tensor, tn_tensor, fn_tensor = _posneg(pred, gt, threshold)

    # if a mask is provided, consider only TP/FP/TN/FN **within** the region of
    # interest defined by the mask
    if mask is not None:
        antimask = torch.le(mask, 0.5)
        tp_tensor[antimask] = 0
        fp_tensor[antimask] = 0
        tn_tensor[antimask] = 0
        fn_tensor[antimask] = 0

    # change to PIL representation
    tp_pil = VF.to_pil_image(tp_tensor.float())
    tp_pil_colored = PIL.ImageOps.colorize(tp_pil, (0, 0, 0), tp_color)

    fp_pil = VF.to_pil_image(fp_tensor.float())
    fp_pil_colored = PIL.ImageOps.colorize(fp_pil, (0, 0, 0), fp_color)

    fn_pil = VF.to_pil_image(fn_tensor.float())
    fn_pil_colored = PIL.ImageOps.colorize(fn_pil, (0, 0, 0), fn_color)

    tp_pil_colored.paste(fp_pil_colored, mask=fp_pil)
    tp_pil_colored.paste(fn_pil_colored, mask=fn_pil)

    if overlay:
        img = VF.to_pil_image(img)  # PIL Image
        # using blend here, to fade original image being overlayed, or
        # its brightness may obfuscate colors from the vessel map
        tp_pil_colored = PIL.Image.blend(img, tp_pil_colored, 0.5)

    return tp_pil_colored


def _summarize(data):
    """Summarizes collected dataframes and adds bayesian figures"""

    _entries = (
        "mean_precision",
        "mode_precision",
        "lower_precision",
        "upper_precision",
        "mean_recall",
        "mode_recall",
        "lower_recall",
        "upper_recall",
        "mean_specificity",
        "mode_specificity",
        "lower_specificity",
        "upper_specificity",
        "mean_accuracy",
        "mode_accuracy",
        "lower_accuracy",
        "upper_accuracy",
        "mean_jaccard",
        "mode_jaccard",
        "lower_jaccard",
        "upper_jaccard",
        "mean_f1_score",
        "mode_f1_score",
        "lower_f1_score",
        "upper_f1_score",
        "frequentist_precision",
        "frequentist_recall",
        "frequentist_specificity",
        "frequentist_accuracy",
        "frequentist_jaccard",
        "frequentist_f1_score",
    )

    def _row_summary(r):

        # run bayesian_measures(), flatten tuple of tuples, name entries
        bayesian = [
            item
            for sublist in bayesian_measures(
                r.tp,
                r.fp,
                r.tn,
                r.fn,
                lambda_=0.5,
                coverage=0.95,
            )
            for item in sublist
        ]

        # evaluate frequentist measures
        frequentist = base_measures(r.tp, r.fp, r.tn, r.fn)
        return pandas.Series(bayesian + list(frequentist), index=_entries)

    # Merges all dataframes together
    sums = pandas.concat(data.values()).groupby("index").sum()
    sums["threshold"] /= len(data)

    # create a new dataframe with these
    measures = sums.apply(lambda r: _row_summary(r), axis=1)

    # merge sums and measures into a single dataframe
    return pandas.concat([sums, measures.reindex(sums.index)], axis=1).copy()


[docs]def run(
    dataset,
    name,
    predictions_folder,
    output_folder=None,
    overlayed_folder=None,
    threshold=None,
    steps=1000,
):
    """
    Runs inference and calculates measures


    Parameters
    ---------

    dataset : py:class:`torch.utils.data.Dataset`
        a dataset to iterate on

    name : str
        the local name of this dataset (e.g. ``train``, or ``test``), to be
        used when saving measures files.

    predictions_folder : str
        folder where predictions for the dataset images has been previously
        stored

    output_folder : :py:class:`str`, Optional
        folder where to store results.  If not provided, then do not store any
        analysis (useful for quickly calculating overlay thresholds)

    overlayed_folder : :py:class:`str`, Optional
        if not ``None``, then it should be the name of a folder where to store
        overlayed versions of the images and ground-truths

    threshold : :py:class:`float`, Optional
        if ``overlayed_folder``, then this should be threshold (floating point)
        to apply to prediction maps to decide on positives and negatives for
        overlaying analysis (graphical output).  This number should come from
        the training set or a separate validation set.  Using a test set value
        may bias your analysis.  This number is also used to print the a priori
        F1-score on the evaluated set.

    steps : :py:class:`float`, Optional
        number of threshold steps to consider when evaluating thresholds.


    Returns
    -------

    threshold : float
        Threshold to achieve the highest possible F1-score for this dataset

    """

    # Collect overall measures
    data = {}

    use_predictions_folder = os.path.join(predictions_folder, name)
    if not os.path.exists(use_predictions_folder):
        use_predictions_folder = predictions_folder

    for sample in tqdm(dataset):
        stem = sample[0]
        image = sample[1]
        gt = sample[2]
        mask = None if len(sample) <= 3 else sample[3]
        pred_fullpath = os.path.join(use_predictions_folder, stem + ".hdf5")
        with h5py.File(pred_fullpath, "r") as f:
            pred = f["array"][:]
        pred = torch.from_numpy(pred)
        if stem in data:
            raise RuntimeError(
                f"{stem} entry already exists in data. Cannot overwrite."
            )
        data[stem] = _sample_measures(pred, gt, mask, steps)

        if output_folder is not None:
            fullpath = os.path.join(output_folder, name, f"{stem}.csv")
            tqdm.write(f"Saving {fullpath}...")
            os.makedirs(os.path.dirname(fullpath), exist_ok=True)
            data[stem].to_csv(fullpath)

        if overlayed_folder is not None:
            overlay_image = _sample_analysis(
                image, pred, gt, mask, threshold=threshold, overlay=True
            )
            fullpath = os.path.join(overlayed_folder, name, f"{stem}.png")
            tqdm.write(f"Saving {fullpath}...")
            os.makedirs(os.path.dirname(fullpath), exist_ok=True)
            overlay_image.save(fullpath)

    # Merges all dataframes together
    measures = _summarize(data)

    maxf1 = measures["mean_f1_score"].max()
    maxf1_index = measures["mean_f1_score"].idxmax()
    maxf1_threshold = measures["threshold"][maxf1_index]

    logger.info(
        f"Maximum F1-score of {maxf1:.5f}, achieved at "
        f"threshold {maxf1_threshold:.3f} (chosen *a posteriori*)"
    )

    if threshold is not None:

        # get the closest possible threshold we have
        index = int(round(steps * threshold))
        f1_a_priori = measures["mean_f1_score"][index]
        actual_threshold = measures["threshold"][index]

        # mark threshold a priori chosen on this dataset
        measures["threshold_a_priori"] = False
        measures["threshold_a_priori", index] = True

        logger.info(
            f"F1-score of {f1_a_priori:.5f}, at threshold "
            f"{actual_threshold:.3f} (chosen *a priori*)"
        )

    if output_folder is not None:
        logger.info(f"Output folder: {output_folder}")
        os.makedirs(output_folder, exist_ok=True)
        measures_path = os.path.join(output_folder, f"{name}.csv")
        logger.info(
            f"Saving measures over all input images at {measures_path}..."
        )
        measures.to_csv(measures_path)

    return maxf1_threshold


[docs]def compare_annotators(
    baseline, other, name, output_folder, overlayed_folder=None
):
    """
    Compares annotations on the **same** dataset


    Parameters
    ---------

    baseline : py:class:`torch.utils.data.Dataset`
        a dataset to iterate on, containing the baseline annotations

    other : py:class:`torch.utils.data.Dataset`
        a second dataset, with the same samples as ``baseline``, but annotated
        by a different annotator than in the first dataset.  The key values
        must much between ``baseline`` and this dataset.

    name : str
        the local name of this dataset (e.g. ``train-second-annotator``, or
        ``test-second-annotator``), to be used when saving measures files.

    output_folder : str
        folder where to store results

    overlayed_folder : :py:class:`str`, Optional
        if not ``None``, then it should be the name of a folder where to store
        overlayed versions of the images and ground-truths

    """

    logger.info(f"Output folder: {output_folder}")
    os.makedirs(output_folder, exist_ok=True)

    # Collect overall measures
    data = {}

    for baseline_sample, other_sample in tqdm(
        list(zip(baseline, other)), desc="samples", leave=False, disable=None
    ):
        assert baseline_sample[0] == other_sample[0], (
            f"Mismatch between "
            f"datasets for second-annotator analysis "
            f"({baseline_sample[0]} != {other_sample[0]}).  This "
            f"typically occurs when the second annotator (`other`) "
            f"comes from a different dataset than the `baseline` dataset"
        )

        stem = baseline_sample[0]
        image = baseline_sample[1]
        gt = baseline_sample[2]
        pred = other_sample[2]  # works as a prediction
        mask = None if len(baseline_sample) < 4 else baseline_sample[3]
        if stem in data:
            raise RuntimeError(
                f"{stem} entry already exists in data. " f"Cannot overwrite."
            )
        data[stem] = _sample_measures(pred, gt, mask, 2)

        if output_folder is not None:
            fullpath = os.path.join(
                output_folder, "second-annotator", name, f"{stem}.csv"
            )
            tqdm.write(f"Saving {fullpath}...")
            os.makedirs(os.path.dirname(fullpath), exist_ok=True)
            data[stem].to_csv(fullpath)

        if overlayed_folder is not None:
            overlay_image = _sample_analysis(
                image, pred, gt, mask, threshold=0.5, overlay=True
            )
            fullpath = os.path.join(
                overlayed_folder, "second-annotator", name, f"{stem}.png"
            )
            tqdm.write(f"Saving {fullpath}...")
            os.makedirs(os.path.dirname(fullpath), exist_ok=True)
            overlay_image.save(fullpath)

    measures = _summarize(data)
    measures.drop(0, inplace=True)  # removes threshold == 0.0, keeps 0.5 only

    measures_path = os.path.join(
        output_folder, "second-annotator", f"{name}.csv"
    )
    os.makedirs(os.path.dirname(measures_path), exist_ok=True)
    logger.info(f"Saving summaries over all input images at {measures_path}...")
    measures.to_csv(measures_path)

    maxf1 = measures["mean_f1_score"].max()
    logger.info(f"F1-score of {maxf1:.5f} (second annotator; threshold=0.5)")