Source code for alpaca.utils.ue_metrics
import numpy as np
from math import log2
from scipy.stats import percentileofscore
import torch
# Public API of the module: the names exported by a star-import.
# ``classification_metric`` is a documented public function of this module,
# so it is listed alongside the regression metrics.
__all__ = [
    "get_uq_metrics",
    "uq_accuracy",
    "dcg",
    "ndcg",
    "uq_ndcg",
    "uq_ll",
    "classification_metric",
]
def get_uq_metrics(estimations, errors, acc_percentile=0.1, **kwargs):
    """
    Compute all uncertainty-quality metrics for one set of estimates.

    :param estimations: uncertainty estimates, one per sample
    :param errors: prediction errors, one per sample
    :param acc_percentile: fraction of samples passed to ``uq_accuracy``
    :param kwargs: extra keyword arguments forwarded to ``uq_ndcg``
    :return: tuple ``(accuracy, ndcg, log_likelihood)`` produced by
        ``uq_accuracy``, ``uq_ndcg`` and ``uq_ll`` respectively
    """
    return (
        uq_accuracy(estimations, errors, acc_percentile),
        uq_ndcg(errors, estimations, **kwargs),
        uq_ll(errors, estimations),
    )
def uq_accuracy(uq, errors, percentile=0.1):
    """
    Overlap between the samples that are worst by uncertainty and by error.

    Both inputs are flattened. The ``k`` samples with the largest uncertainty
    and the ``k`` samples with the largest error are selected, where ``k`` is
    ``percentile`` times the number of samples, and the size of their
    intersection divided by ``k`` is returned.

    :param uq: uncertainty estimates, one per sample
    :param errors: prediction errors, one per sample
    :param percentile: fraction of the samples treated as "worst"
    :return: float in ``[0, 1]``
    :raises ValueError: if ``uq`` is empty
    """
    uq = np.ravel(uq)
    errors = np.ravel(errors)
    n_samples = uq.size
    if n_samples == 0:
        raise ValueError("uq_accuracy requires at least one sample")

    # Keep k within [1, n_samples]: k == 0 would make the ``[-k:]`` slice
    # select the whole array and then divide by zero.
    k = min(max(int(n_samples * percentile), 1), n_samples)

    worst_uq = np.argsort(uq)[-k:]
    worst_error = np.argsort(errors)[-k:]
    return np.intersect1d(worst_uq, worst_error).size / k
def dcg(relevances, scores, k):
    """
    Discounted cumulative gain of the ``k`` best-scored items.

    Items are ordered by descending ``scores``; the relevance of the item at
    zero-based rank ``r`` contributes ``relevance / log2(r + 2)``.
    For UQ the relevance is derived from the error and the score is the
    uncertainty estimate.

    :param relevances: relevance of each item (flattened internally)
    :param scores: ranking score of each item (flattened internally)
    :param k: number of top-ranked items to accumulate
    :return: the accumulated gain (``0`` when ``k`` is 0)
    """
    gains = np.ravel(relevances)
    order = np.flip(np.argsort(np.ravel(scores)))

    total = 0
    # ``position`` starts at 2 so that it is directly the log2 argument.
    for position, item in enumerate(order[:k], start=2):
        total += gains[item] / log2(position)
    return total
def ndcg(relevances, scores):
    """
    Normalized DCG: the DCG of the given ranking divided by the ideal DCG.

    The ideal DCG is the one obtained when the items are ranked by their own
    relevance. Only as many top positions as there are non-zero relevances
    are taken into account.

    :param relevances: relevance of each item; any array-like, flattened
        internally
    :param scores: ranking score of each item
    :return: the ratio of actual to ideal DCG, or ``0.0`` when the ideal DCG
        is zero (for example when every relevance is zero)
    """
    relevances = np.ravel(relevances)
    # A plain int cutoff: ``sum(relevances != 0)`` fails on lists and gives a
    # non-scalar for arrays with more than one dimension.
    k = int(np.count_nonzero(relevances))

    ideal = dcg(relevances, relevances, k)
    if ideal == 0:
        # Nothing relevant to rank; avoid dividing zero by zero.
        return 0.0
    return dcg(relevances, scores, k) / ideal
def uq_ndcg(errors, uq, bins=None):
    """
    NDCG of the uncertainty ranking against binned error percentiles.

    In UQ we care most about the top errors, so each error is replaced by the
    index of the percentile bin it falls into, which gives the largest errors
    the biggest relevance.

    :param errors: prediction errors, one per sample (flattened internally)
    :param uq: uncertainty estimates used as ranking scores
    :param bins: increasing percentile thresholds separating the relevance
        levels; defaults to ``[80, 95, 99]``
    :return: value of ``ndcg`` for the binned relevances and ``uq``
    """
    if bins is None:
        bins = [80, 95, 99]

    # Flatten so column-shaped inputs behave like 1-D ones.
    # percentileofscore only counts values, so no pre-sorting is needed.
    errors = np.ravel(errors)
    errors_percentiles = [percentileofscore(errors, error) for error in errors]
    errors_digitized = np.digitize(errors_percentiles, bins)
    return ndcg(errors_digitized, uq)
def uq_ll(errors, uq):
    """
    Mean Gaussian log-likelihood of the errors, up to an additive constant.

    ``uq`` is squared to obtain a variance (plus ``1e-10`` to keep the
    logarithm and the division finite), and the result is
    ``-mean(log(variance) / 2 + errors ** 2 / (2 * variance))``.

    :param errors: prediction errors (flattened internally)
    :param uq: uncertainty estimates, squared to get the variance
        (flattened internally)
    :return: the mean log-likelihood term; larger is better
    """
    residuals = np.ravel(errors)
    variance = np.square(np.ravel(uq)) + 1e-10

    log_term = np.log(variance) / 2
    fit_term = np.square(residuals) / 2 / variance
    return -np.mean(log_term + fit_term)
def classification_metric(uncertainties, correct_predictions):
    """
    Accuracy over the least uncertain fraction of the predictions.

    Predictions are ordered by ascending uncertainty. For every fraction on
    the grid ``np.arange(0.5, 1.01, 0.01)`` the accuracy is computed over
    that leading share of the ordered predictions.

    :param uncertainties: uncertainty estimate of each prediction
    :param correct_predictions: per-prediction correctness indicator; it is
        passed to ``torch.sum``, so presumably a torch tensor - confirm
        against callers
    :return: transposed numpy array whose first row holds the fractions and
        whose second row holds the matching accuracies
    """
    order = np.argsort(uncertainties)
    # The ordering does not depend on the fraction, so reorder only once.
    ranked = correct_predictions[order]

    curve = []
    for fraction in np.arange(0.5, 1.01, 0.01):
        kept = ranked[: int(fraction * len(order))]
        curve.append([fraction, torch.true_divide(torch.sum(kept), len(kept))])
    return np.array(curve).T