
Commit bc4d747

Feat/autc score metric (#2994)
* initial version
* plotting step visual fix
* changelog
* feedback changes
* change to linspace
* step validation and docstring fix
* plot unit tests codecov
* update tests
* update changelog

Co-authored-by: Dennis Bader <dennis.bader@gmx.ch>
1 parent f443843 commit bc4d747

File tree

7 files changed: +459 -0 lines changed


CHANGELOG.md

Lines changed: 2 additions & 0 deletions
@@ -11,6 +11,8 @@ but cannot always guarantee backwards compatibility. Changes that may **break co
 
 **Improved**
 
+- Added new time aggregated metric `autc()` (Area Under Tolerance Curve): The tolerance curve gives the fraction of predicted target values within tolerance bands of the actual target values across a range of tolerances (defined as % of target range). The AUTC is the normalized area under this tolerance curve and given as a score between [0, 1]. Higher scores are better. [#2994](https://github.com/unit8co/darts/pull/2994) by [Jakub Chłapek](https://github.com/jakubchlapek)
+- Added new plotting function `darts.utils.statistics.plot_tolerance_curve()` to plot the tolerance curve described above. [#2994](https://github.com/unit8co/darts/pull/2994) by [Jakub Chłapek](https://github.com/jakubchlapek)
 - Added `TimeSeries.plotly()` method for interactive time series visualization using Plotly backend. [#2977](https://github.com/unit8co/darts/pull/2977) by [Dustin Brunner](https://github.com/brunnedu).
   - Provides interactive plotting with zoom, pan, hover tooltips, and legend interactions
   - Maintains API consistency with the existing `plot()` method for easy adoption
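To make the changelog entries above concrete, here is a minimal usage sketch of the new metric and plotting function. It is a sketch only: the toy series values are illustrative, and the call signature of `plot_tolerance_curve()` is assumed to mirror the `(actual, predicted)` pair taken by `autc()`; check the function's docstring in the PR for the exact parameters.

```python
import numpy as np

from darts import TimeSeries
from darts.metrics import autc
from darts.utils.statistics import plot_tolerance_curve

# toy actual/predicted series (illustrative, not from the PR)
t = np.linspace(0, 10, 100)
actual = TimeSeries.from_values(np.sin(t))
pred = TimeSeries.from_values(np.sin(t) + np.random.normal(0, 0.1, size=t.shape))

score = autc(actual, pred)  # scalar in [0, 1]; higher means closer alignment
print(f"AUTC: {score:.3f}")

# assumed signature: plots coverage vs. tolerance for the same series pair
plot_tolerance_curve(actual, pred)
```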

darts/metrics/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -30,6 +30,7 @@
     Other metrics:
     - :func:`R2 <darts.metrics.metrics.r2_score>`: Coefficient of Determination
     - :func:`CV <darts.metrics.metrics.coefficient_of_variation>`: Coefficient of Variation
+    - :func:`AUTC <darts.metrics.metrics.autc>`: Area Under Tolerance Curve
 
 - Per time step:
     Absolute metrics:
@@ -99,6 +100,7 @@
     ape,
     arre,
     ase,
+    autc,
     coefficient_of_variation,
     confusion_matrix,
     dtw_metric,
@@ -167,6 +169,7 @@
     "ape",
     "arre",
     "ase",
+    "autc",
     "coefficient_of_variation",
     "dtw_metric",
     "err",
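With the import and `__all__` entries added above, the metric is exposed from the metrics subpackage alongside the existing aggregated metrics. A one-line check (hedged sketch):

```python
import darts.metrics as metrics

assert "autc" in metrics.__all__  # exported via __all__ as shown in the diff
print(metrics.autc)               # the new Area Under Tolerance Curve metric
```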

darts/metrics/metrics.py

Lines changed: 181 additions & 0 deletions
@@ -22,6 +22,7 @@
     _confusion_matrix,
     _get_error_scale,
     _get_quantile_intervals,
+    _get_tolerance_levels,
     _get_values_or_raise,
     _get_wrapped_metric,
     _LabelReduction,
@@ -2403,6 +2404,186 @@ def coefficient_of_variation(
     )
 
 
+@multi_ts_support
+@multivariate_support
+def _tolerance_coverages(
+    actual_series: Union[TimeSeries, Sequence[TimeSeries]],
+    pred_series: Union[TimeSeries, Sequence[TimeSeries]],
+    intersect: bool = True,
+    *,
+    min_tolerance: float = 0.0,
+    max_tolerance: float = 1.0,
+    step: float = 0.01,
+    q: Optional[Union[float, list[float], tuple[np.ndarray, pd.Index]]] = None,
+    component_reduction: Optional[Callable[[np.ndarray], float]] = np.nanmean,
+    series_reduction: Optional[Callable[[np.ndarray], Union[float, np.ndarray]]] = None,
+    n_jobs: int = 1,
+    verbose: bool = False,
+) -> METRIC_OUTPUT_TYPE:
+    """Computes the tolerance coverages for different tolerance levels.
+
+    More info in metric `autc()`.
+    """
+    y_true, y_pred = _get_values_or_raise(
+        actual_series,
+        pred_series,
+        intersect,
+        remove_nan_union=True,
+        q=q,
+    )
+
+    # range of actual values (max - min) for each component
+    y_range = np.nanmax(y_true, axis=TIME_AX) - np.nanmin(y_true, axis=TIME_AX)
+
+    # handle case where range is zero (constant series)
+    if np.any(y_range == 0):
+        raise ValueError(
+            "The range of actual values (max - min) must be strictly positive for all "
+            "components to compute the AUTC. Found zero range for at least one component."
+        )
+
+    tolerances = _get_tolerance_levels(
+        min_tolerance=min_tolerance,
+        max_tolerance=max_tolerance,
+        step=step,
+    )
+
+    # compute absolute errors normalized by half the range
+    abs_errors = np.abs(y_true - y_pred)
+    half_range = y_range / 2
+    normalized_errors = abs_errors / half_range
+
+    # get coverage for each tolerance level (fraction of points within tolerance)
+    # -> (n components, n quantiles, n coverages)
+    coverages = np.nanmean(
+        np.expand_dims(normalized_errors, -1) <= tolerances, axis=TIME_AX
+    )
+    # 'abuse' the first dimension which is normally the time dimension for the coverages
+    # -> (n coverages, n components, n quantiles)
+    return coverages.transpose((2, 0, 1))
+
+
+@multi_ts_support
+@multivariate_support
+def autc(
+    actual_series: Union[TimeSeries, Sequence[TimeSeries]],
+    pred_series: Union[TimeSeries, Sequence[TimeSeries]],
+    intersect: bool = True,
+    *,
+    min_tolerance: float = 0.0,
+    max_tolerance: float = 1.0,
+    step: float = 0.01,
+    q: Optional[Union[float, list[float], tuple[np.ndarray, pd.Index]]] = None,
+    component_reduction: Optional[Callable[[np.ndarray], float]] = np.nanmean,
+    series_reduction: Optional[Callable[[np.ndarray], Union[float, np.ndarray]]] = None,
+    n_jobs: int = 1,
+    verbose: bool = False,
+) -> METRIC_OUTPUT_TYPE:
+    """Area Under Tolerance Curve (AUTC).
+
+    AUTC measures the overall alignment between actual and predicted series across a range of tolerance levels.
+    For each tolerance level, it computes the fraction of points where the prediction is within ±X% of the
+    actual value.
+    The AUTC is the normalized area under this curve, providing a single score in [0, 1] where higher is better.
+
+    For the true series :math:`y` and predicted series :math:`\\hat{y}` of length :math:`T`, tolerance levels
+    :math:`\\tau \\in [0, 1]`, and half-range :math:`H = (\\max(y) - \\min(y)) / 2`:
+
+    .. math::
+
+        \\text{Coverage}(\\tau) = \\frac{1}{T} \\sum_{t=1}^{T} \\mathbb{1}\\left[\\frac{|y_t - \\hat{y}_t|}{H}
+        \\leq \\tau\\right]
+
+        \\text{AUTC} = \\int_0^1 \\text{Coverage}(\\tau) \\, d\\tau
+
+    At tolerance :math:`\\tau`, a prediction is within tolerance if the error is within :math:`\\pm\\tau` of the
+    actual value (as a fraction of half the range). For example, at 10% tolerance, the prediction must be within
+    ±10% of the half-range, i.e., within ±5% of the full range.
+
+    If :math:`\\hat{y}_t` are stochastic (contains several samples) or quantile predictions, use parameter `q` to
+    specify which quantile(s) to compute the metric on. By default, it uses the median 0.5 quantile
+    (over all samples, or, if given, the quantile prediction itself).
+
+    Parameters
+    ----------
+    actual_series
+        The (sequence of) actual series.
+    pred_series
+        The (sequence of) predicted series.
+    intersect
+        For time series that are overlapping in time without having the same time index, setting `True`
+        will consider the values only over their common time interval (intersection in time).
+    min_tolerance
+        The minimum tolerance level as a fraction of the series half-range. Default is 0.0 (0%).
+    max_tolerance
+        The maximum tolerance level as a fraction of the series half-range. Default is 1.0 (100%).
+    step
+        The step size between tolerance levels. Default is 0.01 (1%).
+        For example, with defaults, tolerances are [0.0, 0.01, 0.02, ..., 1.0].
+    q
+        Optionally, the quantile (float [0, 1]) or list of quantiles of interest to compute the metric on.
+    component_reduction
+        Optionally, a function to aggregate the metrics over the component/column axis. It must reduce a
+        `np.ndarray` of shape `(t, c)` to a `np.ndarray` of shape `(t,)`. The function takes as input a
+        ``np.ndarray`` and a parameter named `axis`, and returns the reduced array. The `axis` receives value `1`
+        corresponding to the component axis. If `None`, will return a metric per component.
+    series_reduction
+        Optionally, a function to aggregate the metrics over multiple series. It must reduce a `np.ndarray`
+        of shape `(s, t, c)` to a `np.ndarray` of shape `(t, c)`. The function takes as input a ``np.ndarray``
+        and a parameter named `axis`, and returns the reduced array. The `axis` receives value `0` corresponding
+        to the series axis. For example with `np.nanmean`, will return the average over all series metrics. If
+        `None`, will return a metric per series.
+    n_jobs
+        The number of jobs to run in parallel. Parallel jobs are created only when a ``Sequence[TimeSeries]`` is
+        passed as input, parallelising operations regarding different ``TimeSeries``. Defaults to `1`
+        (sequential). Setting the parameter to `-1` means using all the available processors.
+    verbose
+        Optionally, whether to print operations progress.
+
+    Raises
+    ------
+    ValueError
+        If :math:`\\max_t{y_t} = \\min_t{y_t}` (constant series with zero range).
+
+    Returns
+    -------
+    float
+        A single metric score in [0, 1] (when `len(q) <= 1`) for:
+
+        - a single univariate series.
+        - a single multivariate series with `component_reduction`.
+        - a sequence (list) of uni/multivariate series with `series_reduction` and `component_reduction`.
+    np.ndarray
+        A numpy array of metric scores. The array has shape (n components * n quantiles,) without component
+        reduction, and shape (n quantiles,) with component reduction and `len(q) > 1`.
+        For:
+
+        - the same input arguments that result in the `float` return case from above but with `len(q) > 1`.
+        - a single multivariate series and at least `component_reduction=None`.
+        - a sequence of uni/multivariate series including `series_reduction` and `component_reduction=None`.
+    list[float]
+        Same as for type `float` but for a sequence of series.
+    list[np.ndarray]
+        Same as for type `np.ndarray` but for a sequence of series.
+
+    See Also
+    --------
+    :func:`~darts.utils.statistics.plot_tolerance_curve` : Plot the tolerance curve for visual inspection.
+    """
+    coverages = _get_wrapped_metric(_tolerance_coverages)(
+        actual_series,
+        pred_series,
+        intersect,
+        q=q,
+    )
+    tolerances = _get_tolerance_levels(
+        min_tolerance=min_tolerance,
+        max_tolerance=max_tolerance,
+        step=step,
+    )
+    return np.trapezoid(coverages, tolerances, axis=0)
+
+
 # Dynamic Time Warping
 @multi_ts_support
 @multivariate_support
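To make the Coverage(τ) and AUTC definitions in the docstring concrete, here is a hedged, standalone NumPy sketch for a single univariate pair of arrays. It mirrors the normalized-error logic of `_tolerance_coverages` above, but it is not the library code path (no quantiles, no multivariate or multi-series handling):

```python
import numpy as np


def autc_reference(y_true: np.ndarray, y_pred: np.ndarray, n_levels: int = 101) -> float:
    """Area under the tolerance-coverage curve for 1-D arrays."""
    half_range = (np.max(y_true) - np.min(y_true)) / 2  # H in the docstring
    if half_range == 0:
        raise ValueError("AUTC is undefined for a constant actual series (zero range)")
    normalized_errors = np.abs(y_true - y_pred) / half_range
    tolerances = np.linspace(0.0, 1.0, n_levels)  # tau grid on [0, 1]
    # Coverage(tau): fraction of time steps whose normalized error is within tau
    coverages = (normalized_errors[:, None] <= tolerances[None, :]).mean(axis=0)
    # np.trapezoid requires NumPy >= 2.0 (use np.trapz on older versions)
    return float(np.trapezoid(coverages, tolerances))


y = np.array([0.0, 1.0, 2.0, 3.0, 4.0])
print(autc_reference(y, y + 0.2))  # ~0.9: constant error of 10% of the half-range
```

The last line illustrates the interpretation in the docstring: an error of 10% of the half-range is only "covered" from roughly τ = 0.1 onwards, so the area under the curve is about 0.9 and approaches 1.0 as the errors shrink.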

darts/metrics/utils.py

Lines changed: 24 additions & 0 deletions
@@ -1034,3 +1034,27 @@ def _compute_score(
     # micro f1 score: score_func(sum(x))
     scores = scores.reshape((-1, 1))
     return scores
+
+
+def _get_tolerance_levels(
+    min_tolerance: float,
+    max_tolerance: float,
+    step: float,
+):
+    """Computes normalized tolerance levels."""
+    if not (0.0 <= min_tolerance < max_tolerance <= 1.0):
+        raise_log(
+            ValueError(
+                "min_tolerance must be >= 0, max_tolerance must be <= 1, and min_tolerance < max_tolerance."
+            ),
+            logger=logger,
+        )
+    if step <= 0 or step > (max_tolerance - min_tolerance):
+        raise_log(
+            ValueError(
+                "step must be positive and not larger than (max_tolerance - min_tolerance)."
+            ),
+            logger=logger,
+        )
+    num_steps = int(round((max_tolerance - min_tolerance) / step)) + 1
+    return np.linspace(min_tolerance, max_tolerance, num_steps)
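For intuition on the helper above: rounding `(max - min) / step` before adding 1, and then delegating to `np.linspace`, produces an endpoint-inclusive, evenly spaced grid without the floating-point drift of repeatedly adding `step`. A small sketch with the `autc()` defaults:

```python
import numpy as np

min_tolerance, max_tolerance, step = 0.0, 1.0, 0.01
num_steps = int(round((max_tolerance - min_tolerance) / step)) + 1  # 101
levels = np.linspace(min_tolerance, max_tolerance, num_steps)
print(len(levels), levels[:3], levels[-1])  # 101 [0.   0.01 0.02] 1.0
```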

darts/tests/metrics/test_metrics.py

Lines changed: 42 additions & 0 deletions
@@ -155,6 +155,19 @@ def metric_f1(y_true, y_pred):
     return sklearn.metrics.f1_score(y_true.flatten(), y_pred.flatten(), average="macro")
 
 
+def metric_autc(y_true, y_pred, n_tolerances=101, **kwargs):
+    """Reference implementation for AUTC metric."""
+    y_true = y_true[:, 0]  # univariate
+    y_pred = y_pred[:, 0]
+    y_range = np.max(y_true) - np.min(y_true)
+    abs_errors = np.abs(y_true - y_pred)
+    half_range = y_range / 2
+    normalized_errors = abs_errors / half_range
+    tolerances = np.linspace(0, 1, n_tolerances)
+    coverages = np.array([np.mean(normalized_errors <= tol) for tol in tolerances])
+    return np.trapezoid(coverages, tolerances)
+
+
 class TestMetrics:
     np.random.seed(42)
     pd_train = pd.Series(
@@ -264,6 +277,7 @@ def test_sape_zero_denom(self, metric):
             (metrics.smape, False, {}),
             (metrics.ope, False, {}),
             (metrics.marre, False, {}),
+            (metrics.autc, False, {}),
             (metrics.r2_score, False, {}),
             (metrics.coefficient_of_variation, False, {}),
             (metrics.qr, True, {}),
@@ -852,6 +866,7 @@ def test_output_type_time_dependent(self, config):
             (metrics.mae, False),
             (metrics.mse, False),
             (metrics.rmse, False),
+            (metrics.autc, False),
             (metrics.rmsle, False),
             (metrics.mase, False),
             (metrics.msse, False),
@@ -959,6 +974,7 @@ def test_reduction_fn_validity(self, config):
             (metrics.mae, 0, False, {}),
             (metrics.mse, 0, False, {}),
             (metrics.rmse, 0, False, {}),
+            (metrics.autc, 1, False, {}),
             (metrics.rmsle, 0, False, {}),
             (metrics.mase, 0, False, {}),
             (metrics.msse, 0, False, {}),
@@ -1414,6 +1430,7 @@ def test_multiple_ts_rmse(self):
             (metrics.mae, "max", {}),
             (metrics.mse, "max", {}),
             (metrics.rmse, "max", {}),
+            (metrics.autc, "min", {}),
             (metrics.rmsle, "max", {}),
             (metrics.mape, "max", {}),
             (metrics.wmape, "max", {}),
@@ -1513,6 +1530,7 @@ def test_multiple_ts(self, config):
             (metrics.smape, metric_smape, {}, {}),
             (metrics.ope, metric_ope, {}, {}),
             (metrics.marre, metric_marre, {}, {}),
+            (metrics.autc, metric_autc, {}, {}),
             (metrics.r2_score, sklearn.metrics.r2_score, {}, {}),
             (metrics.coefficient_of_variation, metric_cov, {}, {}),
             (metrics.accuracy, metric_macc, {}, {}),
@@ -1708,6 +1726,7 @@ def helper_test_non_aggregate(self, metric, is_aggregate, val_exp=None):
             metrics.sape,
             metrics.arre,
             metrics.ql,
+            metrics.autc,
             # time aggregates
             metrics.merr,
             metrics.mae,
@@ -2261,3 +2280,26 @@ def test_wrapped_metrics(self):
         with pytest.raises(NotImplementedError) as exc:
             utils._get_wrapped_metric(None, n_wrappers=4)
         assert str(exc.value) == "Only 2-3 wrappers are currently supported"
+
+    @pytest.mark.parametrize(
+        "kwargs,match",
+        [
+            ({"min_tolerance": -0.1}, "min_tolerance must be >= 0"),
+            ({"max_tolerance": 1.5}, "max_tolerance must be <= 1"),
+            (
+                {"min_tolerance": 0.8, "max_tolerance": 0.5},
+                "min_tolerance must be >= 0",
+            ),
+            ({"step": 0}, "step must be positive"),
+            ({"step": -0.1}, "step must be positive"),
+            ({"step": 2.0}, "step must be positive"),
+        ],
+    )
+    def test_autc_invalid_params(self, kwargs, match):
+        with pytest.raises(ValueError, match=match):
+            metrics.autc(self.series1, self.series2, **kwargs)
+
+    def test_autc_constant_series(self):
+        series1_const = self.series1.with_values(np.ones(self.series1.shape))
+        with pytest.raises(ValueError, match="range of actual values"):
+            metrics.autc(series1_const, self.series2)
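As an additional sanity check on the behaviour exercised by the tests above (a hedged sketch, not part of the PR's test suite): a perfect forecast is covered at every tolerance level, so its AUTC should be exactly 1.0.

```python
import numpy as np

from darts import TimeSeries
from darts.metrics import autc

series = TimeSeries.from_values(np.arange(20, dtype=float))
assert abs(autc(series, series) - 1.0) < 1e-12  # full coverage at every tolerance
```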
