#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Defines functionality for the evaluation of predictions"""
import os
import numpy
import pandas as pd
import matplotlib.pyplot as plt
import re
import torch
from sklearn import metrics
from bob.measure import eer_threshold
from ..utils.measure import base_measures, get_centered_maxf1
import logging
logger = logging.getLogger(__name__)


def posneg(pred, gt, threshold):
"""Calculates true and false positives and negatives"""
# threshold
binary_pred = torch.gt(pred, threshold)
# equals and not-equals
equals = torch.eq(binary_pred, gt).type(torch.uint8)
notequals = torch.ne(binary_pred, gt).type(torch.uint8)
# true positives
tp_tensor = (gt * binary_pred).type(torch.uint8)
# false positives
fp_tensor = torch.eq((binary_pred + tp_tensor), 1).type(torch.uint8)
# true negatives
tn_tensor = (equals - tp_tensor).type(torch.uint8)
# false negatives
    fn_tensor = (notequals - fp_tensor).type(torch.uint8)
return tp_tensor, fp_tensor, tn_tensor, fn_tensor
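

# Illustrative sketch, not part of the original module: a tiny, hypothetical
# prediction/ground-truth pair showing what ``posneg`` returns.  Wrapped in a
# helper function so nothing executes at import time; safe to remove.
def _posneg_toy_example():
    pred = torch.tensor([0.1, 0.8, 0.4, 0.9])
    gt = torch.tensor([0.0, 1.0, 1.0, 1.0])
    tp, fp, tn, fn = posneg(pred, gt, threshold=0.5)
    # for these toy values the four tensors sum to 2, 0, 1 and 1 respectively
    return tp, fp, tn, fn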


def sample_measures_for_threshold(pred, gt, threshold):
"""
    Calculates measures on a single sample, for a specific threshold
Parameters
----------
pred : torch.Tensor
pixel-wise predictions
gt : torch.Tensor
ground-truth (annotations)
threshold : float
a particular threshold in which to calculate the performance
measures
Returns
-------
precision: float
recall: float
specificity: float
accuracy: float
jaccard: float
f1_score: float
"""
tp_tensor, fp_tensor, tn_tensor, fn_tensor = posneg(pred, gt, threshold)
# calc measures from scalars
tp_count = torch.sum(tp_tensor).item()
fp_count = torch.sum(fp_tensor).item()
tn_count = torch.sum(tn_tensor).item()
fn_count = torch.sum(fn_tensor).item()
return base_measures(tp_count, fp_count, tn_count, fn_count)
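

# Hedged reference sketch, not part of the original module: ``base_measures``
# (imported from ``..utils.measure``) is expected to reduce the four counts to
# the usual binary-classification figures.  The textbook formulas below are
# given for orientation only; the authoritative definitions live in that
# utility module.
def _standard_measures_sketch(tp, fp, tn, fn):
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    specificity = tn / (tn + fp) if (tn + fp) else 0.0
    accuracy = (tp + tn) / (tp + fp + tn + fn)
    jaccard = tp / (tp + fp + fn) if (tp + fp + fn) else 0.0
    f1_score = (2 * tp) / (2 * tp + fp + fn) if (2 * tp + fp + fn) else 0.0
    return precision, recall, specificity, accuracy, jaccard, f1_score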


def run(
dataset,
name,
predictions_folder,
output_folder=None,
f1_thresh=None,
eer_thresh=None,
steps=1000,
):
"""
    Evaluates previously saved predictions and calculates measures
Parameters
    ----------
    dataset : :py:class:`torch.utils.data.Dataset`
a dataset to iterate on
name : str
the local name of this dataset (e.g. ``train``, or ``test``), to be
used when saving measures files.
    predictions_folder : str
        folder where predictions for the dataset images have been previously
        stored
    output_folder : :py:class:`str`, Optional
        folder where to store results; if not set, nothing is written to disk
    f1_thresh : :py:class:`float`, Optional
        threshold, chosen *a priori* (e.g. on the training set or a separate
        validation set), at which to report the F1-score on the evaluated set.
        Using a value derived from the test set may bias your analysis.
    eer_thresh : :py:class:`float`, Optional
        equal-error-rate threshold, chosen *a priori* (e.g. on the training
        set or a separate validation set); it is only logged for reference.
        Using a value derived from the test set may bias your analysis.
    steps : :py:class:`int`, Optional
        number of equally spaced threshold steps to consider when evaluating
        thresholds.
Returns
-------
f1_threshold : float
Threshold to achieve the highest possible F1-score for this dataset
eer_threshold : float
Threshold achieving Equal Error Rate for this dataset
"""
predictions_path = os.path.join(predictions_folder, name, "predictions.csv")
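    # if no per-subset file exists at that location, the caller may have
    # passed the path to a predictions CSV file directly as
    # ``predictions_folder``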
if not os.path.exists(predictions_path):
predictions_path = predictions_folder
# Load predictions
pred_data = pd.read_csv(predictions_path)
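    # each cell of the 'likelihood' and 'ground_truth' columns holds a
    # numpy-printed array (whitespace-separated numbers, possibly spanning
    # several lines); the normalization below turns e.g. "[0.12 0.88]" into
    # the Python literal "[0.12,0.88]" before eval()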
    pred = torch.Tensor(
        [
            eval(re.sub(' +', ' ', x.replace('\n', '')).replace(' ', ','))
            for x in pred_data['likelihood'].values
        ]
    ).double()
    gt = torch.Tensor(
        [
            eval(re.sub(' +', ' ', x.replace('\n', '')).replace(' ', ','))
            for x in pred_data['ground_truth'].values
        ]
    ).double()
    if pred.ndim > 1 and pred.shape[1] == 1 and gt.shape[1] == 1:
pred = torch.flatten(pred)
gt = torch.flatten(gt)
pred_data['likelihood'] = pred
pred_data['ground_truth'] = gt
    # Multiclass case: only the AUC is computed (F1 and EER are not implemented)
if pred.ndim > 1:
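        # with one column per class, scikit-learn should treat the inputs as a
        # multilabel indicator matrix and, with default settings,
        # macro-average the per-class AUCs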
auc = metrics.roc_auc_score(gt, pred)
logger.info("Evaluating multiclass classification")
logger.info(f"AUC: {auc}")
logger.info("F1 and EER are not implemented for multiclass")
return None, None
# Generate measures for each threshold
step_size = 1.0 / steps
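    # e.g. with the default steps=1000 this sweeps thresholds
    # 0.000, 0.001, ..., 0.999 (numpy.arange excludes the upper bound 1.0)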
data = [
(index, threshold) + sample_measures_for_threshold(pred, gt, threshold)
for index, threshold in enumerate(numpy.arange(0.0, 1.0, step_size))
]
data_df = pd.DataFrame(
data,
columns=(
"index",
"threshold",
"precision",
"recall",
"specificity",
"accuracy",
"jaccard",
"f1_score",
)
)
data_df = data_df.set_index("index")
# Save evaluation csv
if output_folder is not None:
fullpath = os.path.join(output_folder, f"{name}.csv")
logger.info(f"Saving {fullpath}...")
os.makedirs(os.path.dirname(fullpath), exist_ok=True)
data_df.to_csv(fullpath)
# Find max F1 score
f1_scores = numpy.asarray(data_df["f1_score"])
thresholds = numpy.asarray(data_df["threshold"])
maxf1, maxf1_threshold = get_centered_maxf1(
f1_scores,
thresholds
)
logger.info(
f"Maximum F1-score of {maxf1:.5f}, achieved at "
f"threshold {maxf1_threshold:.3f} (chosen *a posteriori*)"
)
# Find EER
neg_gt = pred_data.loc[pred_data.loc[:, 'ground_truth'] == 0, :]
pos_gt = pred_data.loc[pred_data.loc[:, 'ground_truth'] == 1, :]
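    # bob.measure.eer_threshold searches for the threshold at which the false
    # acceptance and false rejection rates of the two score distributions are
    # (as close as possible to) equal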
post_eer_threshold = eer_threshold(neg_gt['likelihood'], pos_gt['likelihood'])
logger.info(
f"Equal error rate achieved at "
f"threshold {post_eer_threshold:.3f} (chosen *a posteriori*)"
)
# Save score table
if output_folder is not None:
fig, axes = plt.subplots(1)
fig.tight_layout(pad=3.0)
# Names and bounds
axes.set_xlabel("Score")
axes.set_ylabel("Normalized counts")
axes.set_xlim(0.0, 1.0)
neg_weights = numpy.ones_like(neg_gt['likelihood']) / len(pred_data['likelihood'])
pos_weights = numpy.ones_like(pos_gt['likelihood']) / len(pred_data['likelihood'])
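        # dividing by the total number of samples makes both histograms sum to
        # 1.0 jointly, so bar heights are comparable across subsets of
        # different sizes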
axes.hist(
[neg_gt['likelihood'], pos_gt['likelihood']],
weights=[neg_weights, pos_weights],
bins=100, color=['tab:blue', 'tab:orange'],
label=["Negatives", "Positives"])
axes.legend(prop={'size': 10}, loc="upper center")
axes.set_title(f"Score table for {name} subset")
        # hide the top and right spines; keep only the left and bottom axes visible
axes.spines["right"].set_visible(False)
axes.spines["top"].set_visible(False)
axes.spines["left"].set_position(("data", -0.015))
fullpath = os.path.join(output_folder, f"{name}_score_table.pdf")
fig.savefig(fullpath)
if f1_thresh is not None and eer_thresh is not None:
# get the closest possible threshold we have
        index = min(int(round(steps * f1_thresh)), steps - 1)
f1_a_priori = data_df["f1_score"][index]
actual_threshold = data_df["threshold"][index]
logger.info(
f"F1-score of {f1_a_priori:.5f}, at threshold "
f"{actual_threshold:.3f} (chosen *a priori*)"
)
# Print the a priori EER threshold
        logger.info(
            f"Equal-error-rate threshold (chosen *a priori*): {eer_thresh:.3f}"
        )
return maxf1_threshold, post_eer_threshold
# from matplotlib.backends.backend_pdf import PdfPages
# fname = os.path.join(output_folder, name + ".pdf")
# os.makedirs(os.path.dirname(fname), exist_ok=True)
# with PdfPages(fname) as pdf:
# fig, axes = plt.subplots(2, 2, figsize=(12.8, 9.6))
# fig.suptitle(f"Subset: {name}", fontsize=16, fontweight='semibold')
# axes = axes.flatten()
# # Tight layout often produces nice results
# # but requires the title to be spaced accordingly
# fig.tight_layout(pad=3.0)
# fig.subplots_adjust(top=0.92)
# # ------------
# # Score table
# # ------------
# axes[0].set_xlim(0.0, 1.0)
# axes[0].hist(
# [neg_gt['likelihood'], pos_gt['likelihood']],
# bins=30, color=['tab:blue', 'tab:orange'],
# label=["Negatives", "Positives"])
# axes[0].legend(prop={'size': 10})
# axes[0].set_title("Score table")
# # ----------
# # ROC Curve
# # ----------
# # TPR = 1 - FNR
# (line,) = axes[1].plot(
# 1 - data_df['specificity'],
# data_df['recall'],
# color="#1f77b4"
# )
# auc = roc_auc_score(neg_gt['likelihood'], pos_gt['likelihood'])
# axes[1].set(xlabel='1 - specificity', ylabel='Sensitivity',
# title=f'ROC curve (AUC={auc:.4f})')
# # axes[1].plot([0, 1], [0, 1], color='tab:orange', linestyle='--')
# axes[1].grid(linestyle="--", linewidth=1, color="gray", alpha=0.2)
# axes[1].set_xlim([0.0, 1.0])
# axes[1].set_ylim([0.0, 1.0])
# # Equal Error Rate threshold
# EER = eer(neg_gt['likelihood'], pos_gt['likelihood'])
# threshold = eer_threshold(neg_gt['likelihood'], pos_gt['likelihood'])
# threshold_index = data_df['threshold'].sub(threshold).abs().idxmin()
# # hter_threshold = min_hter_threshold(neg_gt['likelihood'], pos_gt['likelihood'])
# # Plot EER
# (marker,) = axes[1].plot(
# 1 - data_df["specificity"][threshold_index],
# data_df["recall"][threshold_index],
# marker="o",
# color="tab:blue",
# markersize=8
# )
# # We should see some of axes 1 axes
# axes[1].spines["right"].set_visible(False)
# axes[1].spines["top"].set_visible(False)
# axes[1].spines["left"].set_position(("data", -0.015))
# axes[1].spines["bottom"].set_position(("data", -0.015))
# # Legend
# label = f"{name} set (EER={EER:.4f})"
# axes[1].legend(
# [tuple([line, marker])],
# [label],
# loc="lower right",
# fancybox=True,
# framealpha=0.7,
# )
# # -----------------------
# # Precision-recall Curve
# # -----------------------
# (line,) = axes[2].plot(data_df['recall'], data_df['precision'])
# prc_auc = metrics.auc(data_df['recall'], data_df['precision'])
# axes[2].set(xlabel='Recall', ylabel='Precision',
# title=f'Precision-recall curve (AUC={prc_auc:.4f})')
# axes[2].grid(linestyle="--", linewidth=1, color="gray", alpha=0.2)
# axes[2].set_xlim([0.0, 1.0])
# axes[2].set_ylim([0.0, 1.0])
# # Annotates plot with F1-score iso-lines
# axes_right = axes[2].twinx()
# f_scores_d = numpy.linspace(0.1, 0.9, num=9)
# tick_locs = []
# tick_labels = []
# for f in f_scores_d:
# x = numpy.linspace(0.01, 1)
# y = f * x / (2 * x - f)
# (l,) = axes_right.plot(x[y >= 0], y[y >= 0], color="green", alpha=0.1)
# tick_locs.append(y[-1])
# tick_labels.append("%.1f" % f)
# axes_right.tick_params(axis="y", which="both", pad=0, right=False, left=False)
# axes_right.set_ylabel("iso-F", color="green", alpha=0.3)
# axes_right.set_ylim([0.0, 1.0])
# axes_right.yaxis.set_label_coords(1.015, 0.97)
# axes_right.set_yticks(tick_locs) # notice these are invisible
# for k in axes_right.set_yticklabels(tick_labels):
# k.set_color("green")
# k.set_alpha(0.3)
# k.set_size(8)
# # We shouldn't see any of axes_right axes
# axes_right.spines["right"].set_visible(False)
# axes_right.spines["top"].set_visible(False)
# axes_right.spines["left"].set_visible(False)
# axes_right.spines["bottom"].set_visible(False)
# # Plot F1 score
# (marker,) = axes[2].plot(
# data_df["recall"][maxf1_index],
# data_df["precision"][maxf1_index],
# marker="o",
# color="tab:blue",
# markersize=8
# )
# # We should see some of axes 2 axes
# axes[2].spines["right"].set_visible(False)
# axes[2].spines["top"].set_visible(False)
# axes[2].spines["left"].set_position(("data", -0.015))
# axes[2].spines["bottom"].set_position(("data", -0.015))
# # Legend
# label = f"{name} set (F1={data_df['f1_score'][maxf1_index]:.4f})"
# axes[2].legend(
# [tuple([line, marker])],
# [label],
# loc="lower left",
# fancybox=True,
# framealpha=0.7,
# )
# # Mean square error given optimal threshold (computed on train set)
# ground_truth = pred_data['ground_truth']
# likelihood = pred_data['likelihood']
# mse_res = mse(likelihood, ground_truth)
# text_mse = f"MSE with a threshold of {threshold:.3f}: {mse_res:.3f}"
# axes[3].text(0.5, 0.5, text_mse, horizontalalignment="center",
# verticalalignment="center")
# axes[3].axis('off')
# pdf.savefig()
# plt.close(fig)
# f1_score = f_score(neg_gt['likelihood'], pos_gt['likelihood'], threshold)
# logger.info(
# f"Maximum F1-score of {f1_score:.5f}, achieved at "
# f"threshold {threshold:.3f} (chosen *a priori*)"
# )
# return threshold
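

# Hedged usage sketch, not part of the original module: how ``run`` might be
# invoked for a hypothetical "test" split whose predictions were previously
# written to results/predictions/test/predictions.csv.  All paths and values
# below are illustrative only; the body of ``run`` above does not consult the
# ``dataset`` argument, it only reads the stored CSV.
def _run_usage_sketch():
    f1_thr, eer_thr = run(
        dataset=None,  # hypothetical: unused by the body of run() above
        name="test",
        predictions_folder="results/predictions",
        output_folder="results/evaluation",
        f1_thresh=0.5,   # hypothetical a-priori thresholds, e.g. taken from a
        eer_thresh=0.5,  # validation split
        steps=1000,
    )
    return f1_thr, eer_thr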