
Commit bc4d747

Feat/autc score metric (#2994)
* initial version
* plotting step visual fix
* changelog
* feedback changes
* change to linspace
* step validation and docstring fix
* plot unit tests codecov
* update tests
* update changelog

Co-authored-by: Dennis Bader <dennis.bader@gmx.ch>
1 parent f443843 commit bc4d747

File tree

7 files changed: +459 -0 lines changed


CHANGELOG.md

Lines changed: 2 additions & 0 deletions
@@ -11,6 +11,8 @@ but cannot always guarantee backwards compatibility. Changes that may **break co
 
 **Improved**
 
+- Added new time aggregated metric `autc()` (Area Under Tolerance Curve): The tolerance curve gives the fraction of predicted target values within tolerance bands of the actual target values across a range of tolerances (defined as % of target range). The AUTC is the normalized area under this tolerance curve and given as a score between [0, 1]. Higher scores are better. [#2994](https://github.com/unit8co/darts/pull/2994) by [Jakub Chłapek](https://github.com/jakubchlapek)
+- Added new plotting function `darts.utils.statistics.plot_tolerance_curve()` to plot the tolerance curve described above. [#2994](https://github.com/unit8co/darts/pull/2994) by [Jakub Chłapek](https://github.com/jakubchlapek)
 - Added `TimeSeries.plotly()` method for interactive time series visualization using Plotly backend. [#2977](https://github.com/unit8co/darts/pull/2977) by [Dustin Brunner](https://github.com/brunnedu).
   - Provides interactive plotting with zoom, pan, hover tooltips, and legend interactions
   - Maintains API consistency with the existing `plot()` method for easy adoption
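To make the changelog entries above concrete, here is a minimal usage sketch of the new metric and plotting function. It is a sketch only: the toy series values are illustrative, and the call signature of `plot_tolerance_curve()` is assumed to mirror the `(actual, predicted)` pair taken by `autc()`; check the function's docstring in the PR for the exact parameters.

```python
import numpy as np

from darts import TimeSeries
from darts.metrics import autc
from darts.utils.statistics import plot_tolerance_curve

# toy actual/predicted series (illustrative, not from the PR)
t = np.linspace(0, 10, 100)
actual = TimeSeries.from_values(np.sin(t))
pred = TimeSeries.from_values(np.sin(t) + np.random.normal(0, 0.1, size=t.shape))

score = autc(actual, pred)  # scalar in [0, 1]; higher means closer alignment
print(f"AUTC: {score:.3f}")

# assumed signature: plots coverage vs. tolerance for the same series pair
plot_tolerance_curve(actual, pred)
```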

darts/metrics/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -30,6 +30,7 @@
     Other metrics:
     - :func:`R2 <darts.metrics.metrics.r2_score>`: Coefficient of Determination
     - :func:`CV <darts.metrics.metrics.coefficient_of_variation>`: Coefficient of Variation
+    - :func:`AUTC <darts.metrics.metrics.autc>`: Area Under Tolerance Curve
 
 - Per time step:
     Absolute metrics:
@@ -99,6 +100,7 @@
     ape,
     arre,
     ase,
+    autc,
     coefficient_of_variation,
     confusion_matrix,
     dtw_metric,
@@ -167,6 +169,7 @@
     "ape",
     "arre",
     "ase",
+    "autc",
     "coefficient_of_variation",
     "dtw_metric",
     "err",
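With the import and `__all__` entries added above, the metric is exposed from the metrics subpackage alongside the existing aggregated metrics. A one-line check (hedged sketch):

```python
import darts.metrics as metrics

assert "autc" in metrics.__all__  # exported via __all__ as shown in the diff
print(metrics.autc)               # the new Area Under Tolerance Curve metric
```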

darts/metrics/metrics.py

Lines changed: 181 additions & 0 deletions
@@ -22,6 +22,7 @@
     _confusion_matrix,
     _get_error_scale,
     _get_quantile_intervals,
+    _get_tolerance_levels,
     _get_values_or_raise,
     _get_wrapped_metric,
     _LabelReduction,
@@ -2403,6 +2404,186 @@ def coefficient_of_variation(
     )
 
 
+@multi_ts_support
+@multivariate_support
+def _tolerance_coverages(
+    actual_series: Union[TimeSeries, Sequence[TimeSeries]],
+    pred_series: Union[TimeSeries, Sequence[TimeSeries]],
+    intersect: bool = True,
+    *,
+    min_tolerance: float = 0.0,
+    max_tolerance: float = 1.0,
+    step: float = 0.01,
+    q: Optional[Union[float, list[float], tuple[np.ndarray, pd.Index]]] = None,
+    component_reduction: Optional[Callable[[np.ndarray], float]] = np.nanmean,
+    series_reduction: Optional[Callable[[np.ndarray], Union[float, np.ndarray]]] = None,
+    n_jobs: int = 1,
+    verbose: bool = False,
+) -> METRIC_OUTPUT_TYPE:
+    """Computes the tolerance coverages for different tolerance levels.
+
+    More info in metric `autc()`.
+    """
+    y_true, y_pred = _get_values_or_raise(
+        actual_series,
+        pred_series,
+        intersect,
+        remove_nan_union=True,
+        q=q,
+    )
+
+    # range of actual values (max - min) for each component
+    y_range = np.nanmax(y_true, axis=TIME_AX) - np.nanmin(y_true, axis=TIME_AX)
+
+    # handle case where range is zero (constant series)
+    if np.any(y_range == 0):
+        raise ValueError(
+            "The range of actual values (max - min) must be strictly positive for all "
+            "components to compute the AUTC. Found zero range for at least one component."
+        )
+
+    tolerances = _get_tolerance_levels(
+        min_tolerance=min_tolerance,
+        max_tolerance=max_tolerance,
+        step=step,
+    )
+
+    # compute absolute errors normalized by half the range
+    abs_errors = np.abs(y_true - y_pred)
+    half_range = y_range / 2
+    normalized_errors = abs_errors / half_range
+
+    # get coverage for each tolerance level (fraction of points within tolerance)
+    # -> (n components, n quantiles, n coverages)
+    coverages = np.nanmean(
+        np.expand_dims(normalized_errors, -1) <= tolerances, axis=TIME_AX
+    )
+    # 'abuse' the first dimension which is normally the time dimension for the coverages
+    # -> (n coverages, n components, n quantiles)
+    return coverages.transpose((2, 0, 1))
+
+
+@multi_ts_support
+@multivariate_support
+def autc(
+    actual_series: Union[TimeSeries, Sequence[TimeSeries]],
+    pred_series: Union[TimeSeries, Sequence[TimeSeries]],
+    intersect: bool = True,
+    *,
+    min_tolerance: float = 0.0,
+    max_tolerance: float = 1.0,
+    step: float = 0.01,
+    q: Optional[Union[float, list[float], tuple[np.ndarray, pd.Index]]] = None,
+    component_reduction: Optional[Callable[[np.ndarray], float]] = np.nanmean,
+    series_reduction: Optional[Callable[[np.ndarray], Union[float, np.ndarray]]] = None,
+    n_jobs: int = 1,
+    verbose: bool = False,
+) -> METRIC_OUTPUT_TYPE:
+    """Area Under Tolerance Curve (AUTC).
+
+    AUTC measures the overall alignment between actual and predicted series across a range of tolerance levels.
+    For each tolerance level, it computes the fraction of points where the prediction is within ±X% of the
+    actual value.
+    The AUTC is the normalized area under this curve, providing a single score in [0, 1] where higher is better.
+
+    For the true series :math:`y` and predicted series :math:`\\hat{y}` of length :math:`T`, tolerance levels
+    :math:`\\tau \\in [0, 1]`, and half-range :math:`H = (\\max(y) - \\min(y)) / 2`:
+
+    .. math::
+
+        \\text{Coverage}(\\tau) = \\frac{1}{T} \\sum_{t=1}^{T} \\mathbb{1}\\left[\\frac{|y_t - \\hat{y}_t|}{H}
+        \\leq \\tau\\right]
+
+        \\text{AUTC} = \\int_0^1 \\text{Coverage}(\\tau) \\, d\\tau
+
+    At tolerance :math:`\\tau`, a prediction is within tolerance if the error is within :math:`\\pm\\tau` of the
+    actual value (as a fraction of half the range). For example, at 10% tolerance, the prediction must be within
+    ±10% of the half-range, i.e., within ±5% of the full range.
+
+    If :math:`\\hat{y}_t` are stochastic (contains several samples) or quantile predictions, use parameter `q` to
+    specify which quantile(s) to compute the metric on. By default, it uses the median 0.5 quantile
+    (over all samples, or, if given, the quantile prediction itself).
+
+    Parameters
+    ----------
+    actual_series
+        The (sequence of) actual series.
+    pred_series
+        The (sequence of) predicted series.
+    intersect
+        For time series that are overlapping in time without having the same time index, setting `True`
+        will consider the values only over their common time interval (intersection in time).
+    min_tolerance
+        The minimum tolerance level as a fraction of the series half-range. Default is 0.0 (0%).
+    max_tolerance
+        The maximum tolerance level as a fraction of the series half-range. Default is 1.0 (100%).
+    step
+        The step size between tolerance levels. Default is 0.01 (1%).
+        For example, with defaults, tolerances are [0.0, 0.01, 0.02, ..., 1.0].
+    q
+        Optionally, the quantile (float [0, 1]) or list of quantiles of interest to compute the metric on.
+    component_reduction
+        Optionally, a function to aggregate the metrics over the component/column axis. It must reduce a
+        `np.ndarray` of shape `(t, c)` to a `np.ndarray` of shape `(t,)`. The function takes as input a
+        ``np.ndarray`` and a parameter named `axis`, and returns the reduced array. The `axis` receives value `1`
+        corresponding to the component axis. If `None`, will return a metric per component.
+    series_reduction
+        Optionally, a function to aggregate the metrics over multiple series. It must reduce a `np.ndarray`
+        of shape `(s, t, c)` to a `np.ndarray` of shape `(t, c)`. The function takes as input a ``np.ndarray``
+        and a parameter named `axis`, and returns the reduced array. The `axis` receives value `0` corresponding
+        to the series axis. For example with `np.nanmean`, will return the average over all series metrics. If
+        `None`, will return a metric per series.
+    n_jobs
+        The number of jobs to run in parallel. Parallel jobs are created only when a ``Sequence[TimeSeries]`` is
+        passed as input, parallelising operations regarding different ``TimeSeries``. Defaults to `1`
+        (sequential). Setting the parameter to `-1` means using all the available processors.
+    verbose
+        Optionally, whether to print operations progress.
+
+    Raises
+    ------
+    ValueError
+        If :math:`\\max_t{y_t} = \\min_t{y_t}` (constant series with zero range).
+
+    Returns
+    -------
+    float
+        A single metric score in [0, 1] (when `len(q) <= 1`) for:
+
+        - a single univariate series.
+        - a single multivariate series with `component_reduction`.
+        - a sequence (list) of uni/multivariate series with `series_reduction` and `component_reduction`.
+    np.ndarray
+        A numpy array of metric scores. The array has shape (n components * n quantiles,) without component
+        reduction, and shape (n quantiles,) with component reduction and `len(q) > 1`.
+        For:
+
+        - the same input arguments that result in the `float` return case from above but with `len(q) > 1`.
+        - a single multivariate series and at least `component_reduction=None`.
+        - a sequence of uni/multivariate series including `series_reduction` and `component_reduction=None`.
+    list[float]
+        Same as for type `float` but for a sequence of series.
+    list[np.ndarray]
+        Same as for type `np.ndarray` but for a sequence of series.
+
+    See Also
+    --------
+    :func:`~darts.utils.statistics.plot_tolerance_curve` : Plot the tolerance curve for visual inspection.
+    """
+    coverages = _get_wrapped_metric(_tolerance_coverages)(
+        actual_series,
+        pred_series,
+        intersect,
+        q=q,
+    )
+    tolerances = _get_tolerance_levels(
+        min_tolerance=min_tolerance,
+        max_tolerance=max_tolerance,
+        step=step,
+    )
+    return np.trapezoid(coverages, tolerances, axis=0)
+
+
 # Dynamic Time Warping
 @multi_ts_support
 @multivariate_support
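To make the Coverage(τ) and AUTC definitions in the docstring concrete, here is a hedged, standalone NumPy sketch for a single univariate pair of arrays. It mirrors the normalized-error logic of `_tolerance_coverages` above, but it is not the library code path (no quantiles, no multivariate or multi-series handling):

```python
import numpy as np


def autc_reference(y_true: np.ndarray, y_pred: np.ndarray, n_levels: int = 101) -> float:
    """Area under the tolerance-coverage curve for 1-D arrays."""
    half_range = (np.max(y_true) - np.min(y_true)) / 2  # H in the docstring
    if half_range == 0:
        raise ValueError("AUTC is undefined for a constant actual series (zero range)")
    normalized_errors = np.abs(y_true - y_pred) / half_range
    tolerances = np.linspace(0.0, 1.0, n_levels)  # tau grid on [0, 1]
    # Coverage(tau): fraction of time steps whose normalized error is within tau
    coverages = (normalized_errors[:, None] <= tolerances[None, :]).mean(axis=0)
    # np.trapezoid requires NumPy >= 2.0 (use np.trapz on older versions)
    return float(np.trapezoid(coverages, tolerances))


y = np.array([0.0, 1.0, 2.0, 3.0, 4.0])
print(autc_reference(y, y + 0.2))  # ~0.9: constant error of 10% of the half-range
```

The last line illustrates the interpretation in the docstring: an error of 10% of the half-range is only "covered" from roughly τ = 0.1 onwards, so the area under the curve is about 0.9 and approaches 1.0 as the errors shrink.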

darts/metrics/utils.py

Lines changed: 24 additions & 0 deletions
@@ -1034,3 +1034,27 @@ def _compute_score(
     # micro f1 score: score_func(sum(x))
     scores = scores.reshape((-1, 1))
     return scores
+
+
+def _get_tolerance_levels(
+    min_tolerance: float,
+    max_tolerance: float,
+    step: float,
+):
+    """Computes normalized tolerance levels."""
+    if not (0.0 <= min_tolerance < max_tolerance <= 1.0):
+        raise_log(
+            ValueError(
+                "min_tolerance must be >= 0, max_tolerance must be <= 1, and min_tolerance < max_tolerance."
+            ),
+            logger=logger,
+        )
+    if step <= 0 or step > (max_tolerance - min_tolerance):
+        raise_log(
+            ValueError(
+                "step must be positive and not larger than (max_tolerance - min_tolerance)."
+            ),
+            logger=logger,
+        )
+    num_steps = int(round((max_tolerance - min_tolerance) / step)) + 1
+    return np.linspace(min_tolerance, max_tolerance, num_steps)
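For intuition on the helper above: rounding `(max - min) / step` before adding 1, and then delegating to `np.linspace`, produces an endpoint-inclusive, evenly spaced grid without the floating-point drift of repeatedly adding `step`. A small sketch with the `autc()` defaults:

```python
import numpy as np

min_tolerance, max_tolerance, step = 0.0, 1.0, 0.01
num_steps = int(round((max_tolerance - min_tolerance) / step)) + 1  # 101
levels = np.linspace(min_tolerance, max_tolerance, num_steps)
print(len(levels), levels[:3], levels[-1])  # 101 [0.   0.01 0.02] 1.0
```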

darts/tests/metrics/test_metrics.py

Lines changed: 42 additions & 0 deletions
@@ -155,6 +155,19 @@ def metric_f1(y_true, y_pred):
     return sklearn.metrics.f1_score(y_true.flatten(), y_pred.flatten(), average="macro")
 
 
+def metric_autc(y_true, y_pred, n_tolerances=101, **kwargs):
+    """Reference implementation for AUTC metric."""
+    y_true = y_true[:, 0]  # univariate
+    y_pred = y_pred[:, 0]
+    y_range = np.max(y_true) - np.min(y_true)
+    abs_errors = np.abs(y_true - y_pred)
+    half_range = y_range / 2
+    normalized_errors = abs_errors / half_range
+    tolerances = np.linspace(0, 1, n_tolerances)
+    coverages = np.array([np.mean(normalized_errors <= tol) for tol in tolerances])
+    return np.trapezoid(coverages, tolerances)
+
+
 class TestMetrics:
     np.random.seed(42)
     pd_train = pd.Series(
@@ -264,6 +277,7 @@ def test_sape_zero_denom(self, metric):
             (metrics.smape, False, {}),
             (metrics.ope, False, {}),
             (metrics.marre, False, {}),
+            (metrics.autc, False, {}),
             (metrics.r2_score, False, {}),
             (metrics.coefficient_of_variation, False, {}),
             (metrics.qr, True, {}),
@@ -852,6 +866,7 @@ def test_output_type_time_dependent(self, config):
             (metrics.mae, False),
             (metrics.mse, False),
             (metrics.rmse, False),
+            (metrics.autc, False),
             (metrics.rmsle, False),
             (metrics.mase, False),
             (metrics.msse, False),
@@ -959,6 +974,7 @@ def test_reduction_fn_validity(self, config):
             (metrics.mae, 0, False, {}),
             (metrics.mse, 0, False, {}),
             (metrics.rmse, 0, False, {}),
+            (metrics.autc, 1, False, {}),
             (metrics.rmsle, 0, False, {}),
             (metrics.mase, 0, False, {}),
             (metrics.msse, 0, False, {}),
@@ -1414,6 +1430,7 @@ def test_multiple_ts_rmse(self):
             (metrics.mae, "max", {}),
             (metrics.mse, "max", {}),
             (metrics.rmse, "max", {}),
+            (metrics.autc, "min", {}),
             (metrics.rmsle, "max", {}),
             (metrics.mape, "max", {}),
             (metrics.wmape, "max", {}),
@@ -1513,6 +1530,7 @@ def test_multiple_ts(self, config):
             (metrics.smape, metric_smape, {}, {}),
             (metrics.ope, metric_ope, {}, {}),
             (metrics.marre, metric_marre, {}, {}),
+            (metrics.autc, metric_autc, {}, {}),
             (metrics.r2_score, sklearn.metrics.r2_score, {}, {}),
             (metrics.coefficient_of_variation, metric_cov, {}, {}),
             (metrics.accuracy, metric_macc, {}, {}),
@@ -1708,6 +1726,7 @@ def helper_test_non_aggregate(self, metric, is_aggregate, val_exp=None):
             metrics.sape,
             metrics.arre,
             metrics.ql,
+            metrics.autc,
             # time aggregates
             metrics.merr,
             metrics.mae,
@@ -2261,3 +2280,26 @@ def test_wrapped_metrics(self):
         with pytest.raises(NotImplementedError) as exc:
             utils._get_wrapped_metric(None, n_wrappers=4)
         assert str(exc.value) == "Only 2-3 wrappers are currently supported"
+
+    @pytest.mark.parametrize(
+        "kwargs,match",
+        [
+            ({"min_tolerance": -0.1}, "min_tolerance must be >= 0"),
+            ({"max_tolerance": 1.5}, "max_tolerance must be <= 1"),
+            (
+                {"min_tolerance": 0.8, "max_tolerance": 0.5},
+                "min_tolerance must be >= 0",
+            ),
+            ({"step": 0}, "step must be positive"),
+            ({"step": -0.1}, "step must be positive"),
+            ({"step": 2.0}, "step must be positive"),
+        ],
+    )
+    def test_autc_invalid_params(self, kwargs, match):
+        with pytest.raises(ValueError, match=match):
+            metrics.autc(self.series1, self.series2, **kwargs)
+
+    def test_autc_constant_series(self):
+        series1_const = self.series1.with_values(np.ones(self.series1.shape))
+        with pytest.raises(ValueError, match="range of actual values"):
+            metrics.autc(series1_const, self.series2)
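As an additional sanity check on the behaviour exercised by the tests above (a hedged sketch, not part of the PR's test suite): a perfect forecast is covered at every tolerance level, so its AUTC should be exactly 1.0.

```python
import numpy as np

from darts import TimeSeries
from darts.metrics import autc

series = TimeSeries.from_values(np.arange(20, dtype=float))
assert abs(autc(series, series) - 1.0) < 1e-12  # full coverage at every tolerance
```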
