Source code for bob.ip.binseg.engine.evaluator

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""Defines functionality for the evaluation of predictions"""

import itertools
import logging
import multiprocessing
import os

import h5py
import numpy
import pandas
import PIL.Image
import PIL.ImageOps
import torch
import torch.nn.functional
import torchvision.transforms.functional as VF

from tqdm import tqdm

from ...common.utils.measure import base_measures, bayesian_measures

logger = logging.getLogger(__name__)


def _posneg(pred, gt, threshold):
    """Calculates true and false positives and negatives


    Parameters
    ----------

    pred : torch.Tensor
        pixel-wise predictions

    gt : torch.Tensor
        ground-truth (annotations)

    threshold : float
        a particular threshold at which to calculate the performance
        measures


    Returns
    -------

    tp_tensor : torch.Tensor
        boolean tensor with true positives, considering all observations

    fp_tensor : torch.Tensor
        boolean tensor with false positives, considering all observations

    tn_tensor : torch.Tensor
        boolean tensor with true negatives, considering all observations

    fn_tensor : torch.Tensor
        boolean tensor with false negatives, considering all observations

    """

    gt = gt.byte()  # byte tensor

    # threshold
    binary_pred = torch.gt(pred, threshold).byte()

    # equals and not-equals
    equals = torch.eq(binary_pred, gt).type(torch.uint8)  # tensor
    notequals = torch.ne(binary_pred, gt).type(torch.uint8)  # tensor

    # true positives
    tp_tensor = gt * binary_pred

    # false positives
    fp_tensor = torch.eq((binary_pred + tp_tensor), 1).byte()

    # true negatives
    tn_tensor = equals - tp_tensor

    # false negatives
    fn_tensor = notequals - fp_tensor

    return tp_tensor, fp_tensor, tn_tensor, fn_tensor
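

# Illustrative sketch: a tiny worked example of what _posneg() computes at a
# threshold of 0.5.  The tensors below are hypothetical and exist only for
# demonstration purposes.
def _example_posneg():
    pred = torch.tensor([[0.9, 0.2], [0.6, 0.1]])  # hypothetical probabilities
    gt = torch.tensor([[1.0, 0.0], [0.0, 0.0]])  # hypothetical annotations
    tp, fp, tn, fn = _posneg(pred, gt, 0.5)
    # pixel (0, 0): 0.9 > 0.5 and gt == 1 -> true positive
    # pixel (1, 0): 0.6 > 0.5 but gt == 0 -> false positive
    # the two remaining pixels are true negatives; no false negatives here
    return tp.sum().item(), fp.sum().item(), tn.sum().item(), fn.sum().item()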


def sample_measures_for_threshold(pred, gt, mask, threshold):
    """Calculates counts on one single sample, for a specific threshold


    Parameters
    ----------

    pred : torch.Tensor
        pixel-wise predictions

    gt : torch.Tensor
        ground-truth (annotations)

    mask : torch.Tensor
        region mask (used only if available).  May be set to ``None``.

    threshold : float
        a particular threshold at which to calculate the performance
        measures


    Returns
    -------

    tp : int

    fp : int

    tn : int

    fn : int

    """

    tp_tensor, fp_tensor, tn_tensor, fn_tensor = _posneg(pred, gt, threshold)

    # if a mask is provided, consider only TP/FP/TN/FN **within** the region
    # of interest defined by the mask
    if mask is not None:
        antimask = torch.le(mask, 0.5)
        tp_tensor[antimask] = 0
        fp_tensor[antimask] = 0
        tn_tensor[antimask] = 0
        fn_tensor[antimask] = 0

    # calculate measures from scalars
    tp_count = torch.sum(tp_tensor).item()
    fp_count = torch.sum(fp_tensor).item()
    tn_count = torch.sum(tn_tensor).item()
    fn_count = torch.sum(fn_tensor).item()

    return tp_count, fp_count, tn_count, fn_count
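

# Illustrative usage sketch: restricting the counts to a region of interest.
# The tensors and the 0.5 threshold are hypothetical; pixels where the mask
# is <= 0.5 are excluded from all four counts.
def _example_sample_measures_for_threshold():
    pred = torch.tensor([[0.9, 0.2], [0.6, 0.1]])  # hypothetical probabilities
    gt = torch.tensor([[1.0, 0.0], [0.0, 0.0]])  # hypothetical annotations
    mask = torch.tensor([[1.0, 1.0], [0.0, 0.0]])  # only the first row counts
    # without the mask this would be (1, 1, 2, 0); the false positive at
    # pixel (1, 0) lies outside the region of interest, so it is dropped
    return sample_measures_for_threshold(pred, gt, mask, 0.5)  # (1, 0, 1, 0)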


def _sample_measures(pred, gt, mask, steps):
    """Calculates measures on one single sample


    Parameters
    ----------

    pred : torch.Tensor
        pixel-wise predictions

    gt : torch.Tensor
        ground-truth (annotations)

    mask : torch.Tensor
        region mask (used only if available).  May be set to ``None``.

    steps : int
        number of steps to use for threshold analysis.  The step size is
        calculated as ``1.0/steps``


    Returns
    -------

    measures : pandas.DataFrame

        A pandas dataframe with the following columns:

        * tp: int
        * fp: int
        * tn: int
        * fn: int

    """

    step_size = 1.0 / steps
    data = [
        (index, threshold)
        + sample_measures_for_threshold(pred, gt, mask, threshold)
        for index, threshold in enumerate(numpy.arange(0.0, 1.0, step_size))
    ]

    retval = pandas.DataFrame(
        data,
        columns=(
            "index",
            "threshold",
            "tp",
            "fp",
            "tn",
            "fn",
        ),
    )
    retval.set_index("index", inplace=True)
    return retval


def _sample_analysis(
    img,
    pred,
    gt,
    mask,
    threshold,
    tp_color=(0, 255, 0),  # green (alternative: (128, 128, 128), gray)
    fp_color=(0, 0, 255),  # blue (alternative: (70, 240, 240), cyan)
    fn_color=(255, 0, 0),  # red (alternative: (245, 130, 48), orange)
    overlay=True,
):
    """Visualizes true positives, false positives and false negatives


    Parameters
    ----------

    img : torch.Tensor
        original image

    pred : torch.Tensor
        pixel-wise predictions

    gt : torch.Tensor
        ground-truth (annotations)

    mask : torch.Tensor
        region mask (used only if available).  May be set to ``None``.

    threshold : float
        The threshold to be used while analyzing this image's probability map

    tp_color : tuple
        RGB value for true positives

    fp_color : tuple
        RGB value for false positives

    fn_color : tuple
        RGB value for false negatives

    overlay : :py:class:`bool`, Optional
        If set to ``True`` (which is the default), then overlay annotations
        on top of the image.  Otherwise, represent data on a black canvas.


    Returns
    -------

    figure : PIL.Image.Image

        A PIL image that contains the overlayed analysis of true-positives
        (TP), false-positives (FP) and false negatives (FN).

    """

    tp_tensor, fp_tensor, tn_tensor, fn_tensor = _posneg(pred, gt, threshold)

    # if a mask is provided, consider only TP/FP/TN/FN **within** the region
    # of interest defined by the mask
    if mask is not None:
        antimask = torch.le(mask, 0.5)
        tp_tensor[antimask] = 0
        fp_tensor[antimask] = 0
        tn_tensor[antimask] = 0
        fn_tensor[antimask] = 0

    # change to PIL representation
    tp_pil = VF.to_pil_image(tp_tensor.float())
    tp_pil_colored = PIL.ImageOps.colorize(tp_pil, (0, 0, 0), tp_color)

    fp_pil = VF.to_pil_image(fp_tensor.float())
    fp_pil_colored = PIL.ImageOps.colorize(fp_pil, (0, 0, 0), fp_color)

    fn_pil = VF.to_pil_image(fn_tensor.float())
    fn_pil_colored = PIL.ImageOps.colorize(fn_pil, (0, 0, 0), fn_color)

    tp_pil_colored.paste(fp_pil_colored, mask=fp_pil)
    tp_pil_colored.paste(fn_pil_colored, mask=fn_pil)

    if overlay:
        img = VF.to_pil_image(img)  # PIL Image
        # blend fades the original image being overlayed, otherwise its
        # brightness may obfuscate colors from the vessel map
        tp_pil_colored = PIL.Image.blend(img, tp_pil_colored, 0.5)

    return tp_pil_colored


def _summarize(data):
    """Summarizes collected dataframes and adds Bayesian figures"""

    _entries = (
        "mean_precision",
        "mode_precision",
        "lower_precision",
        "upper_precision",
        "mean_recall",
        "mode_recall",
        "lower_recall",
        "upper_recall",
        "mean_specificity",
        "mode_specificity",
        "lower_specificity",
        "upper_specificity",
        "mean_accuracy",
        "mode_accuracy",
        "lower_accuracy",
        "upper_accuracy",
        "mean_jaccard",
        "mode_jaccard",
        "lower_jaccard",
        "upper_jaccard",
        "mean_f1_score",
        "mode_f1_score",
        "lower_f1_score",
        "upper_f1_score",
        "frequentist_precision",
        "frequentist_recall",
        "frequentist_specificity",
        "frequentist_accuracy",
        "frequentist_jaccard",
        "frequentist_f1_score",
    )

    def _row_summary(r):

        # run bayesian_measures(), flatten tuple of tuples, name entries
        bayesian = [
            item
            for sublist in bayesian_measures(
                r.tp,
                r.fp,
                r.tn,
                r.fn,
                lambda_=0.5,
                coverage=0.95,
            )
            for item in sublist
        ]

        # evaluate frequentist measures
        frequentist = base_measures(r.tp, r.fp, r.tn, r.fn)
        return pandas.Series(bayesian + list(frequentist), index=_entries)

    # Merges all dataframes together
    sums = pandas.concat(data.values()).groupby("index").sum()
    sums["threshold"] /= len(data)

    # create a new dataframe with the summarized measures
    measures = sums.apply(lambda r: _row_summary(r), axis=1)

    # merge sums and measures into a single dataframe
    return pandas.concat([sums, measures.reindex(sums.index)], axis=1).copy()


def _evaluate_sample_worker(args):
    """Runs all of the evaluation steps on a single sample


    Parameters
    ----------

    args : tuple
        A tuple containing the following sub-arguments:

        sample : tuple
            Sample to be processed, containing the stem of the filepath
            relative to the database root, the image, the ground-truth, and
            possibly the mask to define the region of interest to be
            processed.

        name : str
            the local name of the dataset (e.g. ``train``, or ``test``), to
            be used when saving measures files.

        steps : :py:class:`int`, Optional
            number of threshold steps to consider when evaluating thresholds.

        threshold : :py:class:`float`, Optional
            if ``overlayed_folder``, then this should be the threshold
            (floating point) to apply to prediction maps to decide on
            positives and negatives for overlaying analysis (graphical
            output).  This number should come from the training set or a
            separate validation set.  Using a test set value may bias your
            analysis.  This number is also used to print the a priori
            F1-score on the evaluated set.

        use_predictions_folder : str
            Folder where predictions for the dataset images have been
            previously stored

        output_folder : str, None
            If not ``None``, then outputs a copy of the evaluation for this
            sample in CSV format at this directory, but respecting the
            sample ``stem``.

        overlayed_folder : str, None
            If not ``None``, then outputs a version of the input image with
            predictions overlayed, in PNG format, but respecting the sample
            ``stem``.


    Returns
    -------

    stem : str
        The unique sample stem

    data : pandas.DataFrame
        Dataframe containing the evaluation performance on this single
        sample

    """

    (
        sample,
        name,
        steps,
        threshold,
        use_predictions_folder,
        output_folder,
        overlayed_folder,
    ) = args

    stem = sample[0]
    image = sample[1]
    gt = sample[2]
    mask = None if len(sample) <= 3 else sample[3]

    pred_fullpath = os.path.join(use_predictions_folder, stem + ".hdf5")
    with h5py.File(pred_fullpath, "r") as f:
        pred = f["array"][:]
    pred = torch.from_numpy(pred)

    retval = _sample_measures(pred, gt, mask, steps)

    if output_folder is not None:
        fullpath = os.path.join(output_folder, name, f"{stem}.csv")
        tqdm.write(f"Saving {fullpath}...")
        os.makedirs(os.path.dirname(fullpath), exist_ok=True)
        retval.to_csv(fullpath)

    if overlayed_folder is not None:
        overlay_image = _sample_analysis(
            image, pred, gt, mask, threshold=threshold, overlay=True
        )
        fullpath = os.path.join(overlayed_folder, name, f"{stem}.png")
        tqdm.write(f"Saving {fullpath}...")
        os.makedirs(os.path.dirname(fullpath), exist_ok=True)
        overlay_image.save(fullpath)

    return stem, retval
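

# Illustrative sketch of the on-disk layout assumed by
# _evaluate_sample_worker(): one HDF5 file per sample, named after the sample
# stem, with the probability map stored in a dataset called "array".  The
# helper name, its arguments and the gzip compression choice are hypothetical.
def _example_write_prediction(folder, stem, probabilities):
    """Writes a prediction file that the worker above can read back.

    ``probabilities`` is assumed to be a numpy array of floats in [0, 1].
    """
    fullpath = os.path.join(folder, stem + ".hdf5")
    os.makedirs(os.path.dirname(fullpath), exist_ok=True)
    with h5py.File(fullpath, "w") as f:
        f.create_dataset("array", data=probabilities, compression="gzip")
    return fullpath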


def run(
    dataset,
    name,
    predictions_folder,
    output_folder=None,
    overlayed_folder=None,
    threshold=None,
    steps=1000,
    parallel=-1,
):
    """Runs evaluation and calculates measures


    Parameters
    ----------

    dataset : :py:class:`torch.utils.data.Dataset`
        a dataset to iterate on

    name : str
        the local name of this dataset (e.g. ``train``, or ``test``), to be
        used when saving measures files.

    predictions_folder : str
        folder where predictions for the dataset images have been previously
        stored

    output_folder : :py:class:`str`, Optional
        folder where to store results.  If not provided, then do not store
        any analysis (useful for quickly calculating overlay thresholds)

    overlayed_folder : :py:class:`str`, Optional
        if not ``None``, then it should be the name of a folder where to
        store overlayed versions of the images and ground-truths

    threshold : :py:class:`float`, Optional
        if ``overlayed_folder``, then this should be the threshold (floating
        point) to apply to prediction maps to decide on positives and
        negatives for overlaying analysis (graphical output).  This number
        should come from the training set or a separate validation set.
        Using a test set value may bias your analysis.  This number is also
        used to print the a priori F1-score on the evaluated set.

    steps : :py:class:`int`, Optional
        number of threshold steps to consider when evaluating thresholds.

    parallel : :py:class:`int`, Optional
        If set to a value >= 0, uses multiprocessing for estimating measures
        for each sample through a processing pool.  A value of zero will
        create as many processes in the pool as cores in the machine.  A
        negative value disables multiprocessing altogether.  A value greater
        than zero will spawn as many processes as requested.


    Returns
    -------

    threshold : float
        Threshold to achieve the highest possible F1-score for this dataset

    """

    # Collect overall measures
    data = {}

    use_predictions_folder = os.path.join(predictions_folder, name)
    if not os.path.exists(use_predictions_folder):
        use_predictions_folder = predictions_folder

    if parallel < 0:  # turns off multiprocessing
        for sample in tqdm(dataset, desc="sample"):
            k, v = _evaluate_sample_worker(
                (
                    sample,
                    name,
                    steps,
                    threshold,
                    use_predictions_folder,
                    output_folder,
                    overlayed_folder,
                )
            )
            data[k] = v
    else:
        parallel = parallel or multiprocessing.cpu_count()
        with multiprocessing.Pool(processes=parallel) as pool, tqdm(
            total=len(dataset),
            desc="sample",
        ) as pbar:
            for k, v in pool.imap_unordered(
                _evaluate_sample_worker,
                zip(
                    dataset,
                    itertools.repeat(name),
                    itertools.repeat(steps),
                    itertools.repeat(threshold),
                    itertools.repeat(use_predictions_folder),
                    itertools.repeat(output_folder),
                    itertools.repeat(overlayed_folder),
                ),
            ):
                pbar.update()
                data[k] = v

    # Merges all dataframes together
    measures = _summarize(data)

    maxf1 = measures["mean_f1_score"].max()
    maxf1_index = measures["mean_f1_score"].idxmax()
    maxf1_threshold = measures["threshold"][maxf1_index]

    logger.info(
        f"Maximum F1-score of {maxf1:.5f}, achieved at "
        f"threshold {maxf1_threshold:.3f} (chosen *a posteriori*)"
    )

    if threshold is not None:

        # get the closest possible threshold we have
        index = int(round(steps * threshold))
        f1_a_priori = measures["mean_f1_score"][index]
        actual_threshold = measures["threshold"][index]

        # mark the threshold chosen a priori on this dataset
        measures["threshold_a_priori"] = False
        measures.loc[index, "threshold_a_priori"] = True

        logger.info(
            f"F1-score of {f1_a_priori:.5f}, at threshold "
            f"{actual_threshold:.3f} (chosen *a priori*)"
        )

    if output_folder is not None:

        logger.info(f"Output folder: {output_folder}")
        os.makedirs(output_folder, exist_ok=True)

        measures_path = os.path.join(output_folder, f"{name}.csv")
        logger.info(
            f"Saving measures over all input images at {measures_path}..."
        )
        measures.to_csv(measures_path)

    return maxf1_threshold
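

# Illustrative usage sketch (hypothetical splits and paths): pick the best
# threshold a posteriori on a validation split, then re-use it a priori when
# evaluating the test split, also producing CSV summaries and overlay images.
def _example_run_evaluation(dataset):
    # "dataset" is assumed to be a dict of torch.utils.data.Dataset splits
    best_threshold = run(
        dataset["validation"],
        "validation",
        "results/predictions",
    )
    run(
        dataset["test"],
        "test",
        "results/predictions",
        output_folder="results/analysis",
        overlayed_folder="results/overlays",
        threshold=best_threshold,
        steps=1000,
        parallel=-1,
    )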


def _compare_annotators_worker(args):
    """Runs all of the comparison steps on a single sample pair


    Parameters
    ----------

    args : tuple
        A tuple containing the following sub-arguments:

        baseline_sample : tuple
            Baseline sample to be processed, containing the stem of the
            filepath relative to the database root, the image, the
            ground-truth, and possibly the mask to define the region of
            interest to be processed.

        other_sample : tuple
            Another sample that is identical to the first, but has a
            different mask (drawn by a different annotator)

        name : str
            the local name of the dataset (e.g. ``train``, or ``test``), to
            be used when saving measures files.

        output_folder : str, None
            If not ``None``, then outputs a copy of the evaluation for this
            sample in CSV format at this directory, but respecting the
            sample ``stem``.

        overlayed_folder : str, None
            If not ``None``, then outputs a version of the input image with
            predictions overlayed, in PNG format, but respecting the sample
            ``stem``.


    Returns
    -------

    stem : str
        The unique sample stem

    data : pandas.DataFrame
        Dataframe containing the evaluation performance on this single
        sample

    """

    (
        baseline_sample,
        other_sample,
        name,
        output_folder,
        overlayed_folder,
    ) = args

    assert baseline_sample[0] == other_sample[0], (
        f"Mismatch between datasets for second-annotator analysis "
        f"({baseline_sample[0]} != {other_sample[0]}).  This typically "
        f"occurs when the second annotator (`other`) comes from a "
        f"different dataset than the `baseline` dataset"
    )

    stem = baseline_sample[0]
    image = baseline_sample[1]
    gt = baseline_sample[2]
    pred = other_sample[2]  # works as a prediction
    mask = None if len(baseline_sample) < 4 else baseline_sample[3]
    retval = _sample_measures(pred, gt, mask, 2)

    if output_folder is not None:
        fullpath = os.path.join(
            output_folder, "second-annotator", name, f"{stem}.csv"
        )
        tqdm.write(f"Saving {fullpath}...")
        os.makedirs(os.path.dirname(fullpath), exist_ok=True)
        retval.to_csv(fullpath)

    if overlayed_folder is not None:
        overlay_image = _sample_analysis(
            image, pred, gt, mask, threshold=0.5, overlay=True
        )
        fullpath = os.path.join(
            overlayed_folder, "second-annotator", name, f"{stem}.png"
        )
        tqdm.write(f"Saving {fullpath}...")
        os.makedirs(os.path.dirname(fullpath), exist_ok=True)
        overlay_image.save(fullpath)

    return stem, retval


def compare_annotators(
    baseline,
    other,
    name,
    output_folder,
    overlayed_folder=None,
    parallel=-1,
):
    """Compares annotations on the **same** dataset


    Parameters
    ----------

    baseline : :py:class:`torch.utils.data.Dataset`
        a dataset to iterate on, containing the baseline annotations

    other : :py:class:`torch.utils.data.Dataset`
        a second dataset, with the same samples as ``baseline``, but
        annotated by a different annotator than in the first dataset.  The
        key values must match between ``baseline`` and this dataset.

    name : str
        the local name of this dataset (e.g. ``train-second-annotator``, or
        ``test-second-annotator``), to be used when saving measures files.

    output_folder : str
        folder where to store results

    overlayed_folder : :py:class:`str`, Optional
        if not ``None``, then it should be the name of a folder where to
        store overlayed versions of the images and ground-truths

    parallel : :py:class:`int`, Optional
        If set to a value >= 0, uses multiprocessing for estimating measures
        for each sample through a processing pool.  A value of zero will
        create as many processes in the pool as cores in the machine.  A
        negative value disables multiprocessing altogether.  A value greater
        than zero will spawn as many processes as requested.

    """

    logger.info(f"Output folder: {output_folder}")
    os.makedirs(output_folder, exist_ok=True)

    # Collect overall measures
    data = {}

    if parallel < 0:  # turns off multiprocessing
        for baseline_sample, other_sample in tqdm(
            list(zip(baseline, other)),
            desc="samples",
            leave=False,
            disable=None,
        ):
            k, v = _compare_annotators_worker(
                (
                    baseline_sample,
                    other_sample,
                    name,
                    output_folder,
                    overlayed_folder,
                )
            )
            data[k] = v
    else:
        parallel = parallel or multiprocessing.cpu_count()
        with multiprocessing.Pool(processes=parallel) as pool, tqdm(
            total=len(baseline),
            desc="sample",
        ) as pbar:
            for k, v in pool.imap_unordered(
                _compare_annotators_worker,
                zip(
                    baseline,
                    other,
                    itertools.repeat(name),
                    itertools.repeat(output_folder),
                    itertools.repeat(overlayed_folder),
                ),
            ):
                pbar.update()
                data[k] = v

    measures = _summarize(data)
    measures.drop(0, inplace=True)  # removes threshold == 0.0, keeps 0.5 only

    measures_path = os.path.join(
        output_folder, "second-annotator", f"{name}.csv"
    )
    os.makedirs(os.path.dirname(measures_path), exist_ok=True)
    logger.info(
        f"Saving summaries over all input images at {measures_path}..."
    )
    measures.to_csv(measures_path)

    maxf1 = measures["mean_f1_score"].max()
    logger.info(f"F1-score of {maxf1:.5f} (second annotator; threshold=0.5)")
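

# Illustrative usage sketch (hypothetical datasets and paths): measures the
# agreement between the default annotations of a dataset and those produced
# by a second human annotator for the same samples.
def _example_compare_annotators(dataset, second_annotator):
    # "dataset" and "second_annotator" are assumed to be dicts of
    # torch.utils.data.Dataset splits with matching sample stems
    compare_annotators(
        dataset["test"],
        second_annotator["test"],
        "test-second-annotator",
        "results/analysis",
        overlayed_folder="results/overlays",
    )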