Source code for bob.med.tb.engine.evaluator

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""Defines functionality for the evaluation of predictions"""

import os

import numpy
import pandas as pd
import matplotlib.pyplot as plt
import re

import torch
from sklearn import metrics
from bob.measure import eer_threshold

from ..utils.measure import base_measures, get_centered_maxf1

import logging

logger = logging.getLogger(__name__)


def posneg(pred, gt, threshold):
    """Calculates true and false positives and negatives

    Parameters
    ----------

    pred : torch.Tensor
        predicted likelihoods

    gt : torch.Tensor
        ground-truth (annotations)

    threshold : float
        threshold used to binarize ``pred``

    Returns
    -------

    tp_tensor, fp_tensor, tn_tensor, fn_tensor : torch.Tensor
        ``uint8`` tensors flagging, per position, true positives, false
        positives, true negatives and false negatives, respectively
    """

    # threshold
    binary_pred = torch.gt(pred, threshold)

    # equals and not-equals
    equals = torch.eq(binary_pred, gt).type(torch.uint8)
    notequals = torch.ne(binary_pred, gt).type(torch.uint8)

    # true positives
    tp_tensor = (gt * binary_pred).type(torch.uint8)

    # false positives
    fp_tensor = torch.eq((binary_pred + tp_tensor), 1).type(torch.uint8)

    # true negatives
    tn_tensor = (equals - tp_tensor).type(torch.uint8)

    # false negatives
    fn_tensor = (notequals - fp_tensor).type(torch.uint8)

    return tp_tensor, fp_tensor, tn_tensor, fn_tensor
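
# Usage sketch (editor's illustration, not part of the original module): how
# ``posneg`` behaves on small toy tensors.  The values below are assumptions
# chosen only to show the expected per-position masks and counts.
#
#   >>> import torch
#   >>> pred = torch.tensor([0.1, 0.8, 0.6, 0.3]).double()
#   >>> gt = torch.tensor([0.0, 1.0, 0.0, 1.0]).double()
#   >>> tp, fp, tn, fn = posneg(pred, gt, 0.5)
#   >>> int(tp.sum()), int(fp.sum()), int(tn.sum()), int(fn.sum())
#   (1, 1, 1, 1)
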
def sample_measures_for_threshold(pred, gt, threshold):
    """
    Calculates measures on one single sample, for a specific threshold

    Parameters
    ----------

    pred : torch.Tensor
        pixel-wise predictions

    gt : torch.Tensor
        ground-truth (annotations)

    threshold : float
        a particular threshold in which to calculate the performance
        measures

    Returns
    -------

    precision : float

    recall : float

    specificity : float

    accuracy : float

    jaccard : float

    f1_score : float
    """

    tp_tensor, fp_tensor, tn_tensor, fn_tensor = posneg(pred, gt, threshold)

    # calc measures from scalars
    tp_count = torch.sum(tp_tensor).item()
    fp_count = torch.sum(fp_tensor).item()
    tn_count = torch.sum(tn_tensor).item()
    fn_count = torch.sum(fn_tensor).item()

    return base_measures(tp_count, fp_count, tn_count, fn_count)
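
# Usage sketch (editor's illustration, not part of the original module):
# computing the per-threshold measures for the toy tensors shown above; the
# return order follows the docstring of this function.
#
#   >>> precision, recall, specificity, accuracy, jaccard, f1 = (
#   ...     sample_measures_for_threshold(pred, gt, 0.5))
#   >>> 0.0 <= f1 <= 1.0
#   True
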
def run(
    dataset,
    name,
    predictions_folder,
    output_folder=None,
    f1_thresh=None,
    eer_thresh=None,
    steps=1000,
):
    """
    Runs inference and calculates measures

    Parameters
    ----------

    dataset : :py:class:`torch.utils.data.Dataset`
        a dataset to iterate on

    name : str
        the local name of this dataset (e.g. ``train``, or ``test``), to be
        used when saving measures files.

    predictions_folder : str
        folder where predictions for the dataset images have been previously
        stored

    output_folder : :py:class:`str`, Optional
        folder where to store results.

    f1_thresh : :py:class:`float`, Optional
        This number should come from the training set or a separate validation
        set.  Using a test set value may bias your analysis.  This number is
        also used to print the a priori F1-score on the evaluated set.

    eer_thresh : :py:class:`float`, Optional
        This number should come from the training set or a separate validation
        set.  Using a test set value may bias your analysis.  This number is
        used to print the a priori EER.

    steps : :py:class:`int`, Optional
        number of threshold steps to consider when evaluating thresholds.


    Returns
    -------

    f1_threshold : float
        Threshold to achieve the highest possible F1-score for this dataset
        (``None`` for multiclass data)

    eer_threshold : float
        Threshold achieving Equal Error Rate for this dataset (``None`` for
        multiclass data)
    """

    predictions_path = os.path.join(predictions_folder, name, "predictions.csv")

    if not os.path.exists(predictions_path):
        predictions_path = predictions_folder

    # Load predictions
    pred_data = pd.read_csv(predictions_path)
    pred = torch.Tensor(
        [
            eval(re.sub(' +', ' ', x.replace('\n', '')).replace(' ', ','))
            for x in pred_data['likelihood'].values
        ]
    ).double()
    gt = torch.Tensor(
        [
            eval(re.sub(' +', ' ', x.replace('\n', '')).replace(' ', ','))
            for x in pred_data['ground_truth'].values
        ]
    ).double()

    if pred.shape[1] == 1 and gt.shape[1] == 1:
        pred = torch.flatten(pred)
        gt = torch.flatten(gt)

    pred_data['likelihood'] = pred
    pred_data['ground_truth'] = gt

    # Multiclass f1 score computation
    if pred.ndim > 1:
        auc = metrics.roc_auc_score(gt, pred)
        logger.info("Evaluating multiclass classification")
        logger.info(f"AUC: {auc}")
        logger.info("F1 and EER are not implemented for multiclass")
        return None, None

    # Generate measures for each threshold
    step_size = 1.0 / steps
    data = [
        (index, threshold) + sample_measures_for_threshold(pred, gt, threshold)
        for index, threshold in enumerate(numpy.arange(0.0, 1.0, step_size))
    ]

    data_df = pd.DataFrame(
        data,
        columns=(
            "index",
            "threshold",
            "precision",
            "recall",
            "specificity",
            "accuracy",
            "jaccard",
            "f1_score",
        ),
    )
    data_df = data_df.set_index("index")

    # Save evaluation csv
    if output_folder is not None:
        fullpath = os.path.join(output_folder, f"{name}.csv")
        logger.info(f"Saving {fullpath}...")
        os.makedirs(os.path.dirname(fullpath), exist_ok=True)
        data_df.to_csv(fullpath)

    # Find max F1 score
    f1_scores = numpy.asarray(data_df["f1_score"])
    thresholds = numpy.asarray(data_df["threshold"])

    maxf1, maxf1_threshold = get_centered_maxf1(f1_scores, thresholds)

    logger.info(
        f"Maximum F1-score of {maxf1:.5f}, achieved at "
        f"threshold {maxf1_threshold:.3f} (chosen *a posteriori*)"
    )

    # Find EER
    neg_gt = pred_data.loc[pred_data.loc[:, 'ground_truth'] == 0, :]
    pos_gt = pred_data.loc[pred_data.loc[:, 'ground_truth'] == 1, :]
    post_eer_threshold = eer_threshold(neg_gt['likelihood'], pos_gt['likelihood'])

    logger.info(
        f"Equal error rate achieved at "
        f"threshold {post_eer_threshold:.3f} (chosen *a posteriori*)"
    )

    # Save score table
    if output_folder is not None:
        fig, axes = plt.subplots(1)
        fig.tight_layout(pad=3.0)

        # Names and bounds
        axes.set_xlabel("Score")
        axes.set_ylabel("Normalized counts")
        axes.set_xlim(0.0, 1.0)

        neg_weights = numpy.ones_like(neg_gt['likelihood']) / len(
            pred_data['likelihood']
        )
        pos_weights = numpy.ones_like(pos_gt['likelihood']) / len(
            pred_data['likelihood']
        )

        axes.hist(
            [neg_gt['likelihood'], pos_gt['likelihood']],
            weights=[neg_weights, pos_weights],
            bins=100,
            color=['tab:blue', 'tab:orange'],
            label=["Negatives", "Positives"],
        )
        axes.legend(prop={'size': 10}, loc="upper center")
        axes.set_title(f"Score table for {name} subset")

        # hide the right and top spines; shift the left spine slightly outwards
        axes.spines["right"].set_visible(False)
        axes.spines["top"].set_visible(False)
        axes.spines["left"].set_position(("data", -0.015))

        fullpath = os.path.join(output_folder, f"{name}_score_table.pdf")
        fig.savefig(fullpath)

    if f1_thresh is not None and eer_thresh is not None:
        # get the closest possible threshold we have
        index = int(round(steps * f1_thresh))
        f1_a_priori = data_df["f1_score"][index]
        actual_threshold = data_df["threshold"][index]

        logger.info(
            f"F1-score of {f1_a_priori:.5f}, at threshold "
            f"{actual_threshold:.3f} (chosen *a priori*)"
        )

        # Print the a priori EER threshold
        logger.info(f"Equal error rate (chosen *a priori*) {eer_thresh:.3f}")

    return maxf1_threshold, post_eer_threshold
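
# Usage sketch (editor's illustration, not part of the original module): a
# typical two-pass evaluation, where the *a posteriori* thresholds found on
# the train subset are re-used *a priori* on the test subset.  The dataset
# objects and folder names are assumptions for illustration only.
#
#   >>> # f1_thresh, eer_thresh = run(train_set, "train", "predictions", "results")
#   >>> # run(test_set, "test", "predictions", "results",
#   ... #     f1_thresh=f1_thresh, eer_thresh=eer_thresh)
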
# from matplotlib.backends.backend_pdf import PdfPages

# fname = os.path.join(output_folder, name + ".pdf")
# os.makedirs(os.path.dirname(fname), exist_ok=True)

# with PdfPages(fname) as pdf:

#     fig, axes = plt.subplots(2, 2, figsize=(12.8, 9.6))
#     fig.suptitle(f"Subset: {name}", fontsize=16, fontweight='semibold')
#     axes = axes.flatten()

#     # Tight layout often produces nice results
#     # but requires the title to be spaced accordingly
#     fig.tight_layout(pad=3.0)
#     fig.subplots_adjust(top=0.92)

#     # ------------
#     # Score table
#     # ------------
#     axes[0].set_xlim(0.0, 1.0)
#     axes[0].hist(
#         [neg_gt['likelihood'], pos_gt['likelihood']],
#         bins=30, color=['tab:blue', 'tab:orange'],
#         label=["Negatives", "Positives"])
#     axes[0].legend(prop={'size': 10})
#     axes[0].set_title("Score table")

#     # ----------
#     # ROC Curve
#     # ----------

#     # TPR = 1 - FNR
#     (line,) = axes[1].plot(
#         1 - data_df['specificity'],
#         data_df['recall'],
#         color="#1f77b4"
#     )
#     auc = roc_auc_score(neg_gt['likelihood'], pos_gt['likelihood'])
#     axes[1].set(xlabel='1 - specificity', ylabel='Sensitivity',
#                 title=f'ROC curve (AUC={auc:.4f})')
#     # axes[1].plot([0, 1], [0, 1], color='tab:orange', linestyle='--')
#     axes[1].grid(linestyle="--", linewidth=1, color="gray", alpha=0.2)
#     axes[1].set_xlim([0.0, 1.0])
#     axes[1].set_ylim([0.0, 1.0])

#     # Equal Error Rate threshold
#     EER = eer(neg_gt['likelihood'], pos_gt['likelihood'])
#     threshold = eer_threshold(neg_gt['likelihood'], pos_gt['likelihood'])
#     threshold_index = data_df['threshold'].sub(threshold).abs().idxmin()
#     # hter_threshold = min_hter_threshold(neg_gt['likelihood'], pos_gt['likelihood'])

#     # Plot EER
#     (marker,) = axes[1].plot(
#         1 - data_df["specificity"][threshold_index],
#         data_df["recall"][threshold_index],
#         marker="o",
#         color="tab:blue",
#         markersize=8
#     )

#     # We should see some of axes 1 axes
#     axes[1].spines["right"].set_visible(False)
#     axes[1].spines["top"].set_visible(False)
#     axes[1].spines["left"].set_position(("data", -0.015))
#     axes[1].spines["bottom"].set_position(("data", -0.015))

#     # Legend
#     label = f"{name} set (EER={EER:.4f})"
#     axes[1].legend(
#         [tuple([line, marker])],
#         [label],
#         loc="lower right",
#         fancybox=True,
#         framealpha=0.7,
#     )

#     # -----------------------
#     # Precision-recall Curve
#     # -----------------------
#     (line,) = axes[2].plot(data_df['recall'], data_df['precision'])
#     prc_auc = metrics.auc(data_df['recall'], data_df['precision'])
#     axes[2].set(xlabel='Recall', ylabel='Precision',
#                 title=f'Precision-recall curve (AUC={prc_auc:.4f})')
#     axes[2].grid(linestyle="--", linewidth=1, color="gray", alpha=0.2)
#     axes[2].set_xlim([0.0, 1.0])
#     axes[2].set_ylim([0.0, 1.0])

#     # Annotates plot with F1-score iso-lines
#     axes_right = axes[2].twinx()
#     f_scores_d = numpy.linspace(0.1, 0.9, num=9)
#     tick_locs = []
#     tick_labels = []
#     for f in f_scores_d:
#         x = numpy.linspace(0.01, 1)
#         y = f * x / (2 * x - f)
#         (l,) = axes_right.plot(x[y >= 0], y[y >= 0], color="green", alpha=0.1)
#         tick_locs.append(y[-1])
#         tick_labels.append("%.1f" % f)
#     axes_right.tick_params(axis="y", which="both", pad=0, right=False, left=False)
#     axes_right.set_ylabel("iso-F", color="green", alpha=0.3)
#     axes_right.set_ylim([0.0, 1.0])
#     axes_right.yaxis.set_label_coords(1.015, 0.97)
#     axes_right.set_yticks(tick_locs)  # notice these are invisible
#     for k in axes_right.set_yticklabels(tick_labels):
#         k.set_color("green")
#         k.set_alpha(0.3)
#         k.set_size(8)

#     # We shouldn't see any of axes_right axes
#     axes_right.spines["right"].set_visible(False)
#     axes_right.spines["top"].set_visible(False)
#     axes_right.spines["left"].set_visible(False)
#     axes_right.spines["bottom"].set_visible(False)

#     # Plot F1 score
#     (marker,) = axes[2].plot(
#         data_df["recall"][maxf1_index],
#         data_df["precision"][maxf1_index],
#         marker="o",
#         color="tab:blue",
#         markersize=8
#     )

#     # We should see some of axes 2 axes
#     axes[2].spines["right"].set_visible(False)
#     axes[2].spines["top"].set_visible(False)
#     axes[2].spines["left"].set_position(("data", -0.015))
#     axes[2].spines["bottom"].set_position(("data", -0.015))

#     # Legend
#     label = f"{name} set (F1={data_df['f1_score'][maxf1_index]:.4f})"
#     axes[2].legend(
#         [tuple([line, marker])],
#         [label],
#         loc="lower left",
#         fancybox=True,
#         framealpha=0.7,
#     )

#     # Mean square error given optimal threshold (computed on train set)
#     ground_truth = pred_data['ground_truth']
#     likelihood = pred_data['likelihood']
#     mse_res = mse(likelihood, ground_truth)
#     text_mse = f"MSE with a threshold of {threshold:.3f}: {mse_res:.3f}"
#     axes[3].text(0.5, 0.5, text_mse, horizontalalignment="center",
#                  verticalalignment="center")
#     axes[3].axis('off')

#     pdf.savefig()
#     plt.close(fig)

# f1_score = f_score(neg_gt['likelihood'], pos_gt['likelihood'], threshold)

# logger.info(
#     f"Maximum F1-score of {f1_score:.5f}, achieved at "
#     f"threshold {threshold:.3f} (chosen *a priori*)"
# )

# return threshold