Allow explicitly passing the frequency to predict_df (#449)

shchur · web-flow · commit 1f099eb265a4 · 2026-01-19T13:59:08.000+01:00
*Issue #, if available:* #425 *Description of changes:* - Add `freq: str | None` parameter to `predict_df` methods. This can only be set in combination with `validate_inputs=False`. If specified, the user-provided `freq` will be used instead of trying to infer the `freq` from the data. By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice.
diff --git a/src/chronos/base.py b/src/chronos/base.py
@@ -142,6 +142,7 @@ def predict_df(
         prediction_length: int | None = None,
         quantile_levels: list[float] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
         validate_inputs: bool = True,
+        freq: str | None = None,
         **predict_kwargs,
     ) -> "pd.DataFrame":
         """
@@ -164,8 +165,14 @@ def predict_df(
         quantile_levels
             Quantile levels to compute
         validate_inputs
-            When True, the dataframe(s) will be validated before prediction, ensuring that timestamps have a
-            regular frequency, and item IDs match between past and future data. Setting to False disables these checks.
+            [ADVANCED] When True (default), validates dataframes before prediction. Setting to False removes the
+            validation overhead, but may silently lead to wrong predictions if data is misformatted. When False, you
+            must ensure: (1) all dataframes are sorted by (id_column, timestamp_column); (2) future_df (if provided)
+            has the same item IDs as df with exactly prediction_length rows of future timestamps per item; (3) all
+            timestamps are regularly spaced (e.g., with hourly frequency).
+        freq
+            Frequency string for timestamp generation (e.g., "h", "D", "W"). Can only be used when
+            validate_inputs=False. When provided, skips frequency inference from the data.
         **predict_kwargs
             Additional arguments passed to predict_quantiles
 
@@ -200,6 +207,7 @@ def predict_df(
             timestamp_column=timestamp_column,
             target_columns=[target],
             prediction_length=prediction_length,
+            freq=freq,
             validate_inputs=validate_inputs,
         )
 
diff --git a/src/chronos/chronos2/pipeline.py b/src/chronos/chronos2/pipeline.py
@@ -825,6 +825,7 @@ def predict_df(
         context_length: int | None = None,
         cross_learning: bool = False,
         validate_inputs: bool = True,
+        freq: str | None = None,
         **predict_kwargs,
     ) -> "pd.DataFrame":
         """
@@ -864,8 +865,14 @@ def predict_df(
             For optimal results, consider using a batch size around 100 (as used in the Chronos-2 technical report).
             - Cross-learning is most helpful when individual time series have limited historical context, as the model can leverage patterns from related series in the batch.
         validate_inputs
-            When True, the dataframe(s) will be validated before prediction, ensuring that timestamps have a
-            regular frequency, and item IDs match between past and future data. Setting to False disables these checks.
+            [ADVANCED] When True (default), validates dataframes before prediction. Setting to False removes the
+            validation overhead, but may silently lead to wrong predictions if data is misformatted. When False, you
+            must ensure: (1) all dataframes are sorted by (id_column, timestamp_column); (2) future_df (if provided)
+            has the same item IDs as df with exactly prediction_length rows of future timestamps per item; (3) all
+            timestamps are regularly spaced (e.g., with hourly frequency).
+        freq
+            Frequency string for timestamp generation (e.g., "h", "D", "W"). Can only be used when
+            validate_inputs=False. When provided, skips frequency inference from the data.
         **predict_kwargs
             Additional arguments passed to predict_quantiles
 
@@ -896,6 +903,7 @@ def predict_df(
             timestamp_column=timestamp_column,
             target_columns=target,
             prediction_length=prediction_length,
+            freq=freq,
             validate_inputs=validate_inputs,
         )
 
diff --git a/src/chronos/df_utils.py b/src/chronos/df_utils.py
@@ -204,6 +204,7 @@ def convert_df_input_to_list_of_dicts_input(
     id_column: str = "item_id",
     timestamp_column: str = "timestamp",
     validate_inputs: bool = True,
+    freq: str | None = None,
 ) -> tuple[list[dict[str, np.ndarray | dict[str, np.ndarray]]], np.ndarray, dict[str, "pd.DatetimeIndex"]]:
     """
     Convert from dataframe input format to a list of dictionaries input format.
@@ -230,7 +231,14 @@ def convert_df_input_to_list_of_dicts_input(
     timestamp_column
         Name of column containing timestamps
     validate_inputs
-        When True, the dataframe(s) will be validated be conversion
+        [ADVANCED] When True (default), validates dataframes before prediction. Setting to False removes the
+        validation overhead, but may silently lead to wrong predictions if data is misformatted. When False, you
+        must ensure: (1) all dataframes are sorted by (id_column, timestamp_column); (2) future_df (if provided)
+        has the same item IDs as df with exactly prediction_length rows of future timestamps per item; (3) all
+        timestamps are regularly spaced (e.g., with hourly frequency).
+    freq
+        Frequency string for timestamp generation (e.g., "h", "D", "W"). Can only be used
+        when validate_inputs=False. When provided, skips frequency inference from the data.
 
     Returns
     -------
@@ -242,6 +250,16 @@ def convert_df_input_to_list_of_dicts_input(
 
     import pandas as pd
 
+    if freq is not None and validate_inputs:
+        raise ValueError(
+            "freq can only be provided when validate_inputs=False. "
+            "When using freq with validate_inputs=False, you must ensure: "
+            "(1) all dataframes are sorted by (id_column, timestamp_column);  "
+            "(2) future_df (if provided) has the same item IDs as df with exactly "
+            "prediction_length rows of future timestamps per item; "
+            "(3) all timestamps are regularly spaced."
+        )
+
     if validate_inputs:
         df, future_df, freq, series_lengths, original_order = validate_df_inputs(
             df,
@@ -258,19 +276,19 @@ def convert_df_input_to_list_of_dicts_input(
         # Get series lengths
         series_lengths = df[id_column].value_counts(sort=False).to_list()
 
-        # If validation is skipped, the first freq in the dataframe is used
-        timestamp_index = pd.DatetimeIndex(df[timestamp_column])
-        start_idx = 0
-        freq = None
-        for length in series_lengths:
-            if length < 3:
-                start_idx += length
-                continue
-            timestamps = timestamp_index[start_idx : start_idx + length]
-            freq = pd.infer_freq(timestamps)
-            break
-
-        assert freq is not None, "validate is False, but could not infer frequency from the dataframe"
+        # If freq is not provided, infer from the first series with >= 3 points
+        if freq is None:
+            timestamp_index = pd.DatetimeIndex(df[timestamp_column])
+            start_idx = 0
+            for length in series_lengths:
+                if length < 3:
+                    start_idx += length
+                    continue
+                timestamps = timestamp_index[start_idx : start_idx + length]
+                freq = pd.infer_freq(timestamps)
+                break
+
+            assert freq is not None, "validate_inputs is False, but could not infer frequency from the dataframe"
 
     # Convert to list of dicts format
     inputs: list[dict[str, np.ndarray | dict[str, np.ndarray]]] = []
diff --git a/test/test_df_utils.py b/test/test_df_utils.py