import numpy as np
import sklearn.metrics
import copy


def compute_deferral_metrics(data_test):
    """Compute accuracy and coverage metrics for a classifier-human deferral system.

    Args:
        data_test (dict): dict data with fields 'defers', 'labels', 'hum_preds', 'preds'

    Returns:
        dict: dict with metrics,
            'classifier_all_acc': classifier accuracy on all data,
            'human_all_acc': human accuracy on all data,
            'coverage': fraction of examples the classifier predicts on (not deferred),
            'classifier_nondeferred_acc': classifier accuracy on non-deferred examples,
            'human_deferred_acc': human accuracy on deferred examples,
            'system_acc': accuracy of the combined classifier-human system
    """
    results = {}
    # classifier accuracy on all data, regardless of deferral
    results["classifier_all_acc"] = sklearn.metrics.accuracy_score(
        data_test["labels"], data_test["preds"]
    )
    # human accuracy on all data, regardless of deferral
    results["human_all_acc"] = sklearn.metrics.accuracy_score(
        data_test["labels"], data_test["hum_preds"]
    )
    # coverage: fraction of examples where the classifier predicts (defers == 0)
    results["coverage"] = 1 - np.mean(data_test["defers"])
    # classifier accuracy restricted to non-deferred examples (defers == 0)
    results["classifier_nondeferred_acc"] = sklearn.metrics.accuracy_score(
        data_test["labels"][data_test["defers"] == 0],
        data_test["preds"][data_test["defers"] == 0],
    )
    # human accuracy restricted to deferred examples (defers == 1)
    results["human_deferred_acc"] = sklearn.metrics.accuracy_score(
        data_test["labels"][data_test["defers"] == 1],
        data_test["hum_preds"][data_test["defers"] == 1],
    )
    # system accuracy: classifier prediction when not deferred, human prediction when deferred
    results["system_acc"] = sklearn.metrics.accuracy_score(
        data_test["labels"],
        data_test["preds"] * (1 - data_test["defers"])
        + data_test["hum_preds"] * data_test["defers"],
    )
    return results
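

# Example (illustrative sketch): compute_deferral_metrics assumes numpy arrays for
# every field so that boolean masks like data_test["defers"] == 0 work; with toy values:
#
#     data_test = {
#         "labels":    np.array([0, 1, 1, 0]),
#         "preds":     np.array([0, 1, 0, 0]),
#         "hum_preds": np.array([0, 1, 1, 1]),
#         "defers":    np.array([0, 0, 1, 0]),
#     }
#     compute_deferral_metrics(data_test)
#     # -> {'classifier_all_acc': 0.75, 'human_all_acc': 0.75, 'coverage': 0.75, ...}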


def compute_classification_metrics(data_test):
    """Compute metrics for plain classification (no deferral).

    Args:
        data_test (dict): dict data with fields 'labels', 'preds', and optionally 'preds_proba'

    Returns:
        dict: dict with metrics, 'classifier_all_acc': classifier accuracy on all data;
            for binary tasks also 'classifier_all_f1' and 'auc' (AUC uses 'preds_proba' if given,
            otherwise the hard predictions)
    """
    results = {}
    results["classifier_all_acc"] = sklearn.metrics.accuracy_score(
        data_test["labels"], data_test["preds"]
    )
    # binary-only metrics: F1 and AUC
    if (
        len(np.unique(data_test["labels"])) == 2
        and len(np.unique(data_test["preds"])) == 2
    ):
        results["classifier_all_f1"] = sklearn.metrics.f1_score(
            data_test["labels"], data_test["preds"]
        )
        # prefer soft scores for AUC when available, otherwise fall back to hard predictions
        if "preds_proba" in data_test:
            results["auc"] = sklearn.metrics.roc_auc_score(
                data_test["labels"], data_test["preds_proba"]
            )
        else:
            results["auc"] = sklearn.metrics.roc_auc_score(
                data_test["labels"], data_test["preds"]
            )
    return results
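

# Example (illustrative sketch): for a binary task, soft scores can be passed under
# "preds_proba" and are then used for the AUC instead of the hard predictions:
#
#     compute_classification_metrics({
#         "labels":      np.array([0, 1, 1, 0]),
#         "preds":       np.array([0, 1, 0, 0]),
#         "preds_proba": np.array([0.1, 0.9, 0.4, 0.2]),
#     })
#     # -> includes 'classifier_all_acc', 'classifier_all_f1', and 'auc'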


def compute_coverage_v_acc_curve(data_test):
    """Sweep the rejection-score threshold to trace a coverage vs. accuracy curve.

    Args:
        data_test (dict): dict data with fields 'defers', 'labels', 'hum_preds', 'preds',
            'rej_score' (rejection score per example), 'class_probs'

    Returns:
        list: compute_deferral_metrics(data_test_modified) at different coverage levels;
            the first element is compute_deferral_metrics(data_test) with the original deferral decisions
    """
    # unique rejection scores, in sorted order
    rej_scores = np.unique(data_test["rej_score"])
    # 100 quantiles of the rejection scores, used as deferral thresholds
    rej_scores_quantiles = np.quantile(rej_scores, np.linspace(0, 1, 100))
    # for each threshold, recompute the deferral decision and the resulting metrics
    all_metrics = []
    all_metrics.append(compute_deferral_metrics(data_test))
    for q in rej_scores_quantiles:
        # defer whenever the rejection score exceeds the threshold
        defers = (data_test["rej_score"] > q).astype(int)
        copy_data = copy.deepcopy(data_test)
        copy_data["defers"] = defers
        metrics = compute_deferral_metrics(copy_data)
        all_metrics.append(metrics)
    return all_metrics
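

if __name__ == "__main__":
    # Hedged usage sketch on synthetic data: the 0.8/0.9 accuracy levels and the
    # 0.7 deferral threshold below are illustrative assumptions only.
    rng = np.random.default_rng(0)
    n = 1000
    labels = rng.integers(0, 2, size=n)
    # a classifier that is right ~80% of the time and a human right ~90% of the time
    preds = np.where(rng.random(n) < 0.8, labels, 1 - labels)
    hum_preds = np.where(rng.random(n) < 0.9, labels, 1 - labels)
    # higher rejection score means the model is more inclined to defer
    rej_score = rng.random(n)
    data_test = {
        "labels": labels,
        "preds": preds,
        "hum_preds": hum_preds,
        "defers": (rej_score > 0.7).astype(int),
        "rej_score": rej_score,
    }
    print(compute_deferral_metrics(data_test))
    print(compute_classification_metrics({"labels": labels, "preds": preds}))
    # the coverage/accuracy trade-off can be traced with, e.g.:
    #     curve = compute_coverage_v_acc_curve(data_test)
    #     coverages = [m["coverage"] for m in curve]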