feat: OLS forecast + rolling z-score anomaly detection

dev · dev · commit 1908ab20d84d · 2026-04-09T19:34:31.000+03:00
- transform/forecast.py: per-country OLS projection (plain NumPy/pandas, no
  stats library) with a widening 95% band, plus rolling-baseline z-score
  anomalies on YoY change (shift-1 lookback so a spike cannot skew its own score)
- orchestration: new build_forecast_and_anomalies task wired into the flow
  after dbt build, writing mart.mart_emissions_forecast and
  mart.mart_emissions_anomalies
- migration 004: forecast + anomalies tables with PK + indexes
- dashboard page 05_forecast.py: actuals vs forecast with shaded band and
  anomaly markers, plus side-by-side data tables
- dbt exposures: declare streamlit_dashboard and emissions_forecast_model
  as downstream consumers so dbt docs lineage is complete
- tests: 5 new unit tests covering linear extrapolation, the short-history
  guard, multi-country fan-out, spike detection, and no flags on a steady series
diff --git a/dashboard/pages/05_forecast.py b/dashboard/pages/05_forecast.py
@@ -0,0 +1,91 @@
+"""Forecast + anomaly page: actuals, OLS projection, and flagged outliers."""
+
+from __future__ import annotations
+
+import plotly.graph_objects as go
+import streamlit as st
+
+from dashboard.components.filters import sidebar_filters
+from dashboard.utils.db import query
+
+st.title("Forecast & Anomalies")
+sidebar_filters()
+
+countries = query("select distinct country_code from mart.mart_country_emissions order by 1")
+if countries.empty:
+    st.warning("No data — run `make seed` first.")
+    st.stop()
+
+country = st.selectbox("Country", countries["country_code"].tolist())
+
+actuals = query(
+    "select year, total_emissions_tonnes from mart.mart_country_emissions "
+    "where country_code = %s order by year",
+    (country,),
+)
+forecast = query(
+    "select year, forecast_tonnes, lower_band, upper_band from mart.mart_emissions_forecast "
+    "where country_code = %s order by year",
+    (country,),
+)
+anomalies = query(
+    "select year, total_emissions_tonnes, z_score, severity "
+    "from mart.mart_emissions_anomalies where country_code = %s order by year",
+    (country,),
+)
+
+fig = go.Figure()
+fig.add_trace(
+    go.Scatter(
+        x=actuals["year"],
+        y=actuals["total_emissions_tonnes"],
+        mode="lines+markers",
+        name="actual",
+    )
+)
+if not forecast.empty:
+    fig.add_trace(
+        go.Scatter(
+            x=forecast["year"],
+            y=forecast["forecast_tonnes"],
+            mode="lines+markers",
+            name="forecast",
+            line={"dash": "dash"},
+        )
+    )
+    fig.add_trace(
+        go.Scatter(
+            x=list(forecast["year"]) + list(forecast["year"][::-1]),
+            y=list(forecast["upper_band"]) + list(forecast["lower_band"][::-1]),
+            fill="toself",
+            fillcolor="rgba(255,140,0,0.15)",
+            line={"width": 0},
+            name="95% band",
+            showlegend=True,
+        )
+    )
+if not anomalies.empty:
+    fig.add_trace(
+        go.Scatter(
+            x=anomalies["year"],
+            y=anomalies["total_emissions_tonnes"],
+            mode="markers",
+            marker={"size": 14, "color": "red", "symbol": "x"},
+            name="anomaly",
+        )
+    )
+fig.update_layout(
+    title=f"{country}: actuals, forecast, and anomalies",
+    xaxis_title="year",
+    yaxis_title="tonnes CO₂e",
+    height=550,
+)
+st.plotly_chart(fig, use_container_width=True)
+
+c1, c2 = st.columns(2)
+with c1:
+    st.subheader("Forecast")
+    st.dataframe(forecast, use_container_width=True)
+with c2:
+    st.subheader("Anomalies")
+    st.dataframe(anomalies, use_container_width=True)
diff --git a/dbt_project/models/marts/_marts__exposures.yml b/dbt_project/models/marts/_marts__exposures.yml
@@ -0,0 +1,26 @@
+version: 2
+
+exposures:
+  - name: streamlit_dashboard
+    type: dashboard
+    maturity: high
+    url: http://localhost:8501
+    description: "EU ETS multi-page Streamlit dashboard."
+    depends_on:
+      - ref('mart_country_emissions')
+      - ref('mart_sector_trends')
+      - ref('mart_top_emitters')
+      - ref('mart_compliance_gap')
+    owner:
+      name: data-platform
+      email: data@example.com
+
+  - name: emissions_forecast_model
+    type: ml
+    maturity: medium
+    description: "OLS-based per-country forecast + rolling z-score anomaly detection. Built by orchestration.tasks.build_forecast_and_anomalies and stored in mart.mart_emissions_forecast / mart.mart_emissions_anomalies."
+    depends_on:
+      - ref('mart_country_emissions')
+    owner:
+      name: data-platform
+      email: data@example.com
diff --git a/orchestration/flows.py b/orchestration/flows.py
@@ -11,6 +11,7 @@
 
 from orchestration.alerts import send_alert
 from orchestration.tasks import (
+    build_forecast_and_anomalies,
     clean_table,
     download_source,
     load_country_codes,
@@ -42,6 +43,10 @@ def energy_pipeline() -> dict[str, int]:
 
         run_dbt("deps")
         run_dbt("build --target dev")
+
+        forecast_rows, anomaly_rows = build_forecast_and_anomalies()
+        loaded["forecast"] = forecast_rows
+        loaded["anomalies"] = anomaly_rows
         return loaded
     except Exception as exc:
         send_alert(
diff --git a/orchestration/tasks.py b/orchestration/tasks.py
@@ -12,6 +12,7 @@
 from ingest import loaders, sources
 from ingest.schemas import AllowanceSchema, EmissionSchema, InstallationSchema
 from transform import clean
+from transform.forecast import detect_anomalies, forecast_country_emissions
 
 log = structlog.get_logger(__name__)
 
@@ -113,3 +114,57 @@ def run_dbt(command: str) -> None:
     if result.returncode != 0:
         log.error("dbt_failed", stderr=result.stderr[-2000:])
         raise RuntimeError(f"dbt {command} failed: {result.stderr[-500:]}")
+
+
+@task(retries=2, retry_delay_seconds=30, tags=["transform", "ml"])
+def build_forecast_and_anomalies() -> tuple[int, int]:
+    """Read mart_country_emissions, fit per-country forecasts, flag anomalies.
+
+    Full refresh: truncates, then reloads ``mart.mart_emissions_forecast`` and
+    ``mart.mart_emissions_anomalies``. Returns ``(forecast_rows, anomaly_rows)``.
+    """
+    with loaders.get_conn() as conn, conn.cursor() as cur:
+        cur.execute(
+            "select country_code, year, total_emissions_tonnes from mart.mart_country_emissions"
+        )
+        rows = cur.fetchall()
+    history = pd.DataFrame(rows, columns=["country_code", "year", "total_emissions_tonnes"])
+    log.info("forecast_input", rows=len(history))
+
+    forecast = forecast_country_emissions(history)
+    anomalies = detect_anomalies(history)
+
+    with loaders.get_conn() as conn:
+        loaders.truncate(conn, "mart.mart_emissions_forecast")
+        loaders.truncate(conn, "mart.mart_emissions_anomalies")
+        if not forecast.empty:
+            loaders.copy_dataframe(
+                conn,
+                forecast,
+                "mart.mart_emissions_forecast",
+                ["country_code", "year", "forecast_tonnes", "lower_band", "upper_band", "model"],
+                source_file="forecast_task",
+            )
+        if not anomalies.empty:
+            with conn.cursor() as cur:
+                cur.executemany(
+                    "INSERT INTO mart.mart_emissions_anomalies "
+                    "(country_code, year, total_emissions_tonnes, yoy_pct, z_score, severity) "
+                    "VALUES (%s, %s, %s, %s, %s, %s)",
+                    list(
+                        anomalies[
+                            [
+                                "country_code",
+                                "year",
+                                "total_emissions_tonnes",
+                                "yoy_pct",
+                                "z_score",
+                                "severity",
+                            ]
+                        ].itertuples(index=False, name=None)
+                    ),
+                )
+        conn.commit()
+
+    log.info("forecast_done", forecast_rows=len(forecast), anomaly_rows=len(anomalies))
+    return len(forecast), len(anomalies)
diff --git a/tests/test_forecast.py b/tests/test_forecast.py
@@ -0,0 +1,59 @@
+"""Tests for the forecasting and anomaly detection module."""
+
+from __future__ import annotations
+
+import pandas as pd
+
+from transform.forecast import ForecastConfig, detect_anomalies, forecast_country_emissions
+
+
+def _history(country: str, values: list[float], start: int = 2015) -> pd.DataFrame:
+    return pd.DataFrame(
+        {
+            "country_code": [country] * len(values),
+            "year": list(range(start, start + len(values))),
+            "total_emissions_tonnes": values,
+        }
+    )
+
+
+def test_forecast_extrapolates_linear_trend() -> None:
+    df = _history("DE", [100.0, 110.0, 120.0, 130.0, 140.0])
+    out = forecast_country_emissions(df, ForecastConfig(horizon_years=3))
+    assert len(out) == 3
+    # Perfectly linear input (slope 10/year), so the first forecast year (2020) is 150.
+    assert abs(out.iloc[0]["forecast_tonnes"] - 150.0) < 1e-6
+    assert (out["upper_band"] >= out["forecast_tonnes"]).all()
+    assert (out["lower_band"] <= out["forecast_tonnes"]).all()
+
+
+def test_forecast_skips_short_history() -> None:
+    df = _history("FR", [100.0, 110.0])
+    out = forecast_country_emissions(df, ForecastConfig(min_history_years=4))
+    assert out.empty
+
+
+def test_forecast_handles_multiple_countries() -> None:
+    df = pd.concat(
+        [
+            _history("DE", [100.0, 110.0, 120.0, 130.0, 140.0]),
+            _history("FR", [200.0, 195.0, 190.0, 185.0, 180.0]),
+        ]
+    )
+    out = forecast_country_emissions(df, ForecastConfig(horizon_years=2))
+    assert set(out["country_code"]) == {"DE", "FR"}
+    assert len(out) == 4
+
+
+def test_anomaly_detects_large_spike() -> None:
+    df = _history("DE", [100.0, 102.0, 101.0, 103.0, 102.0, 500.0, 105.0])
+    out = detect_anomalies(df, ForecastConfig(anomaly_z_threshold=2.0))
+    assert not out.empty
+    assert 2020 in out["year"].tolist()
+    assert out["severity"].iloc[0] in {"warning", "critical"}
+
+
+def test_anomaly_returns_empty_for_steady_series() -> None:
+    df = _history("DE", [100.0, 101.0, 102.0, 103.0, 104.0, 105.0])
+    out = detect_anomalies(df)
+    assert out.empty
diff --git a/transform/forecast.py b/transform/forecast.py
@@ -0,0 +1,131 @@
+"""Forecasting and anomaly detection on country-level emissions.
+
+Simple, dependency-free approach designed to run inside the Prefect flow
+*after* dbt has built ``mart.mart_country_emissions``:
+
+* **Forecast:** ordinary least squares on (year, total_emissions) per country,
+  projected ``horizon_years`` forward; band = +/- 1.96 * residual-stderr * sqrt(years ahead).
+* **Anomalies:** z-score of year-over-year % change against a rolling baseline of
+  the prior ``anomaly_window`` (default 3) changes; ``|z| > 2.5`` (default) is flagged.
+
+Both outputs are written to dedicated mart tables so the dashboard can read
+them with the same cached ``query()`` helper as everything else.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+import numpy as np
+import pandas as pd
+import structlog
+
+log = structlog.get_logger(__name__)
+
+
+@dataclass(frozen=True)
+class ForecastConfig:
+    """Configuration for the forecast task."""
+
+    horizon_years: int = 5
+    min_history_years: int = 4
+    anomaly_z_threshold: float = 2.5
+    anomaly_window: int = 3
+
+
+def _fit_linear(years: np.ndarray, values: np.ndarray) -> tuple[float, float, float]:
+    """Return (slope, intercept, residual_stderr) from a 1-D OLS fit."""
+    n = len(years)
+    if n < 2:
+        return 0.0, float(values.mean() if n else 0.0), 0.0
+    x_mean = years.mean()
+    y_mean = values.mean()
+    denom = ((years - x_mean) ** 2).sum()
+    if denom == 0:
+        return 0.0, float(y_mean), 0.0
+    slope = float(((years - x_mean) * (values - y_mean)).sum() / denom)
+    intercept = float(y_mean - slope * x_mean)
+    fitted = slope * years + intercept
+    residuals = values - fitted
+    dof = max(n - 2, 1)
+    stderr = float(np.sqrt((residuals**2).sum() / dof))
+    return slope, intercept, stderr
+
+
+def forecast_country_emissions(
+    history: pd.DataFrame, config: ForecastConfig | None = None
+) -> pd.DataFrame:
+    """Project emissions ``horizon_years`` forward for every country.
+
+    Args:
+        history: Output of ``mart.mart_country_emissions`` with columns
+            ``country_code``, ``year``, ``total_emissions_tonnes``.
+        config: Forecast configuration; defaults to ``ForecastConfig()``.
+
+    Returns:
+        DataFrame with columns ``country_code``, ``year``, ``forecast_tonnes``,
+        ``lower_band``, ``upper_band``, ``model``; empty and column-less if no country qualifies.
+    """
+    cfg = config or ForecastConfig()
+    out_rows: list[dict[str, object]] = []
+
+    for country, group in history.groupby("country_code", sort=True):
+        g = group.sort_values("year")
+        if len(g) < cfg.min_history_years:
+            log.warning("forecast_skipped_insufficient_history", country=country, years=len(g))
+            continue
+        years = g["year"].to_numpy(dtype=float)
+        values = g["total_emissions_tonnes"].to_numpy(dtype=float)
+        slope, intercept, stderr = _fit_linear(years, values)
+
+        last_year = int(years.max())
+        for h in range(1, cfg.horizon_years + 1):
+            yr = last_year + h
+            point = slope * yr + intercept
+            band = 1.96 * stderr * np.sqrt(h)  # widening band
+            out_rows.append(
+                {
+                    "country_code": country,
+                    "year": yr,
+                    "forecast_tonnes": max(0.0, point),
+                    "lower_band": max(0.0, point - band),
+                    "upper_band": max(0.0, point + band),
+                    "model": "ols_linear",
+                }
+            )
+
+    return pd.DataFrame(out_rows)
+
+
+def detect_anomalies(history: pd.DataFrame, config: ForecastConfig | None = None) -> pd.DataFrame:
+    """Flag country-years whose YoY change is a rolling z-score outlier.
+
+    Args:
+        history: ``mart.mart_country_emissions`` rows.
+        config: Anomaly window + threshold.
+
+    Returns:
+        DataFrame containing only the anomalous rows, with the computed
+        ``yoy_pct``, ``z_score``, and a string ``severity`` label.
+    """
+    cfg = config or ForecastConfig()
+    df = history.sort_values(["country_code", "year"]).copy()
+    df["yoy_pct"] = df.groupby("country_code")["total_emissions_tonnes"].pct_change() * 100.0
+
+    # Compute the rolling baseline from prior rows only (shift by 1) so that
+    # an anomalous point does not contaminate its own z-score.
+    grouped = df.groupby("country_code")["yoy_pct"]
+    prior = grouped.shift(1)
+    rolling_mean = prior.groupby(df["country_code"]).transform(
+        lambda s: s.rolling(cfg.anomaly_window, min_periods=2).mean()
+    )
+    rolling_std = prior.groupby(df["country_code"]).transform(
+        lambda s: s.rolling(cfg.anomaly_window, min_periods=2).std()
+    )
+    df["z_score"] = (df["yoy_pct"] - rolling_mean) / rolling_std.replace(0, np.nan)
+
+    anomalies = df[df["z_score"].abs() > cfg.anomaly_z_threshold].copy()
+    anomalies["severity"] = np.where(anomalies["z_score"].abs() > 4.0, "critical", "warning")
+    return anomalies[
+        ["country_code", "year", "total_emissions_tonnes", "yoy_pct", "z_score", "severity"]
+    ].reset_index(drop=True)
diff --git a/warehouse/migrations/004_forecast_tables.sql b/warehouse/migrations/004_forecast_tables.sql