Better handling of start date

jessegrabowski · jessegrabowski · commit 3925964127e3 · 2024-08-24T14:28:18.000+08:00
diff --git a/pymc_experimental/statespace/core/statespace.py b/pymc_experimental/statespace/core/statespace.py
@@ -1685,36 +1685,102 @@ def _build_forecast_index(
         periods: int | None = None,
         use_scenario_index: bool = False,
         scenario: pd.DataFrame | np.ndarray | None = None,
-    ) -> pd.Index:
-        if use_scenario_index:
-            if isinstance(scenario, pd.DataFrame):
-                return scenario.index
-            if isinstance(scenario, dict):
-                first_df = next(
-                    (df for df in scenario.values() if isinstance(df, pd.DataFrame)), None
-                )
-                return first_df.index
+    ) -> tuple[int | pd.Timestamp, pd.RangeIndex | pd.DatetimeIndex]:
+        """
+        Construct a pandas Index for the requested forecast horizon.
 
-        # Otherwise, build an index. It will be a DateTime index if we have all the necessary information, otherwise
-        # use a range index.
-        is_datetime = isinstance(time_index, pd.DatetimeIndex)
-        forecast_index = None
+        Parameters
+        ----------
+        time_index: pd.RangeIndex or pd.DatetimeIndex
+            Index of the data used to fit the model
+        start: int or pd.Timestamp, optional
+            Date from which to begin forecasting. If using a datetime index, integer start will be interpreted
+            as a positional index. Otherwise, start must be found inside the time_index
+        end: int or pd.Timestamp, optional
+            Date at which to end forecasting. If using a datetime index, end must be a timestamp.
+        periods: int, optional
+            Number of periods to forecast
+        scenario: pd.DataFrame, np.ndarray, optional
+            Scenario data to use for forecasting. Only consulted when use_scenario_index is True, in which case
+            its index is used as the forecast index and end and periods are ignored.
+        use_scenario_index: bool, default False
+            If True, the index of the scenario data will be used as the forecast index.
 
-        if is_datetime:
-            freq = time_index.inferred_freq
 
-            if end is not None:
-                forecast_index = pd.date_range(start, end=end, freq=freq)
-            if periods is not None:
-                forecast_index = pd.date_range(start, periods=periods, freq=freq)
+        Returns
+        -------
+        start: int | pd.Timestamp
+            The starting date index or time step from which to generate the forecasts.
+
+        forecast_index: pd.DatetimeIndex or pd.RangeIndex
+            Index for the forecast results
+        """
+
+        def get_or_create_index(x, start=None):  # resolve the index carried by (or implied for) scenario data
+            if isinstance(x, pd.DataFrame | pd.Series):
+                return x.index
+            elif isinstance(x, dict):  # take the index from the first entry; forward start for un-indexed values
+                return get_or_create_index(next(iter(x.values())), start)
+            elif isinstance(x, np.ndarray | list | tuple):
+                if start is None:
+                    raise ValueError(
+                        "Provided scenario has no index and no start date was provided. This combination "
+                        "is ambiguous. Please provide a start date, or add an index to the scenario."
+                    )
+                n = x.shape[0] if isinstance(x, np.ndarray) else len(x)
+                return pd.RangeIndex(start, n + start, step=1, dtype="int")
+            else:
+                raise ValueError(f"{type(x)} is not a valid type for scenario data.")
+
+        x0_idx = None
+
+        if use_scenario_index:
+            forecast_index = get_or_create_index(scenario, start)
+            is_datetime = isinstance(forecast_index, pd.DatetimeIndex)
+
+            # If the user provided an index, we want to take it as-is (without removing the start value). Instead,
+            # step one back and use this as the start value.
+            delta = forecast_index.freq if is_datetime else 1
+            x0_idx = forecast_index[0] - delta
 
         else:
-            if end is not None:
-                forecast_index = pd.RangeIndex(start, end, step=1, dtype="int")
-            if periods is not None:
-                forecast_index = pd.RangeIndex(start, start + periods, step=1, dtype="int")
+            # Otherwise, build an index. It will be a DateTime index if we have all the necessary information, otherwise
+            # use a range index.
+            is_datetime = isinstance(time_index, pd.DatetimeIndex)
+            forecast_index = None
+
+            if is_datetime:
+                freq = time_index.inferred_freq
+                if isinstance(start, int):
+                    start = time_index[start]
+                if end is not None:
+                    forecast_index = pd.date_range(start, end=end, freq=freq)
+                if periods is not None:
+                    forecast_index = pd.date_range(start, periods=periods, freq=freq)
+
+            else:
+                if end is not None:
+                    forecast_index = pd.RangeIndex(start, end, step=1, dtype="int")
+                if periods is not None:
+                    forecast_index = pd.RangeIndex(start, start + periods, step=1, dtype="int")
+
+        if is_datetime:
+            if forecast_index.freq != time_index.freq:
+                raise ValueError(
+                    "The frequency of the forecast index must match the frequency on the data used "
+                    f"to fit the model. Got {forecast_index.freq}, expected {time_index.freq}"
+                )
+
+        if x0_idx is None:
+            x0_idx, forecast_index = forecast_index[0], forecast_index[1:]
+        if x0_idx in forecast_index:
+            raise ValueError("x0_idx should not be in the forecast index")
+        if x0_idx not in time_index:
+            raise ValueError("start must be in the data index used to fit the model.")
 
-        return forecast_index
+        # The starting value should not be included in the forecast index. It will be used only to define x0 and P0,
+        # and no forecast will be associated with it.
+        return x0_idx, forecast_index
 
     def _finalize_scenario_initialization(
         self,
@@ -1876,7 +1942,7 @@ def forecast(
             verbose=verbose,
         )
 
-        forecast_index = self._build_forecast_index(
+        t0, forecast_index = self._build_forecast_index(
             time_index=time_index,
             start=start,
             end=end,
@@ -1892,7 +1958,6 @@ def forecast(
         if all([dim in temp_coords for dim in [filter_time_dim, ALL_STATE_DIM, OBS_STATE_DIM]]):
             dims = [TIME_DIM, ALL_STATE_DIM, OBS_STATE_DIM]
 
-        t0 = forecast_index[0]
         t0_idx = np.flatnonzero(time_index == t0)[0]
 
         temp_coords["data_time"] = time_index
diff --git a/tests/statespace/test_statespace.py b/tests/statespace/test_statespace.py
@@ -292,7 +292,7 @@ def test_sampling_methods(group, kind, ss_mod, idata, rng):
 def _make_time_idx(mod, use_datetime_index=True):
     if use_datetime_index:
         mod._fit_coords["time"] = nile.index
-        time_idx = pd.DatetimeIndex(mod._fit_coords["time"].values, freq=nile.index.inferred_freq)
+        time_idx = nile.index
     else:
         mod._fit_coords["time"] = nile.reset_index().index
         time_idx = pd.RangeIndex(start=0, stop=nile.shape[0], step=1)
@@ -354,34 +354,50 @@ def test_forecast_index(use_datetime_index):
     ss_mod._fit_coords = dict()
     time_idx = _make_time_idx(ss_mod, use_datetime_index)
 
-    # From start and end date
+    # From start and end
     start = time_idx[-1]
-    end = time_idx.shift(10)[-1] if use_datetime_index else time_idx[-1] + 11
+    delta = pd.DateOffset(years=10) if use_datetime_index else 11
+    end = start + delta
 
-    forecast_idx = ss_mod._build_forecast_index(time_idx, start=start, end=end)
-    assert start in forecast_idx
-    assert forecast_idx.shape == (11,)
+    x0_index, forecast_idx = ss_mod._build_forecast_index(time_idx, start=start, end=end)
+    assert start not in forecast_idx
+    assert x0_index == start
+    assert forecast_idx.shape == (10,)
 
     # From start and periods
     start = time_idx[-1]
-    periods = 10
+    periods = 11
+
+    x0_index, forecast_idx = ss_mod._build_forecast_index(time_idx, start=start, periods=periods)
+    assert start not in forecast_idx
+    assert x0_index == start
+    assert forecast_idx.shape == (10,)
 
-    forecast_idx = ss_mod._build_forecast_index(time_idx, start=start, periods=periods)
+    # From integer start
+    start = 10
+    x0_index, forecast_idx = ss_mod._build_forecast_index(time_idx, start=start, periods=periods)
+    delta = forecast_idx.freq if use_datetime_index else 1
+
+    assert x0_index == time_idx[start]
     assert forecast_idx.shape == (10,)
+    assert (forecast_idx == time_idx[start + 1 : start + periods]).all()
 
     # From scenario index
     scenario = pd.DataFrame(0, index=forecast_idx, columns=[0, 1, 2])
-    forecast_idx = ss_mod._build_forecast_index(
+    new_start, forecast_idx = ss_mod._build_forecast_index(
         time_index=time_idx, scenario=scenario, use_scenario_index=True
     )
+    assert x0_index not in forecast_idx
+    assert x0_index == (forecast_idx[0] - delta)
     assert forecast_idx.shape == (10,)
     assert forecast_idx.equals(scenario.index)
 
     # From dictionary of scenarios
     scenario = {"a": pd.DataFrame(0, index=forecast_idx, columns=[0, 1, 2])}
-    forecast_idx = ss_mod._build_forecast_index(
+    x0_index, forecast_idx = ss_mod._build_forecast_index(
         time_index=time_idx, scenario=scenario, use_scenario_index=True
     )
+    assert x0_index == (forecast_idx[0] - delta)
     assert forecast_idx.shape == (10,)
     assert forecast_idx.equals(scenario["a"].index)
 
@@ -484,7 +500,7 @@ def test_finalize_scenario_single(data_type, use_datetime_index):
     scenario = data_type(np.zeros((10,)))
 
     scenario = ss_mod._validate_scenario_data(scenario)
-    forecast_idx = ss_mod._build_forecast_index(time_idx, start=time_idx[-1], periods=10)
+    t0, forecast_idx = ss_mod._build_forecast_index(time_idx, start=time_idx[-1], periods=11)
     scenario = ss_mod._finalize_scenario_initialization(scenario, forecast_index=forecast_idx)
 
     assert isinstance(scenario, pd.DataFrame)
@@ -662,6 +678,9 @@ def test_forecast_with_exog_data(rng, exog_ss_mod, idata_exog):
         .assign_coords(state=["exog[a]", "exog[b]", "exog[c]"])
     )
 
+    print(scenario.index)
+    print(level.coords)
+
     regression_effect = forecast_idata.forecast_observed.isel(observed_state=0) - level
     regression_effect_expected = (betas * scenario_xr).sum(dim=["state"])