Skip to content
Merged
13 changes: 13 additions & 0 deletions bigframes/ml/forecasting.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from __future__ import annotations

from typing import List, Optional
import warnings

from google.cloud import bigquery

Expand Down Expand Up @@ -230,6 +231,18 @@ def _fit(
"""
X, y = utils.batch_convert_to_dataframe(X, y)

# Auto-convert Date to datetime for hourly/per_minute frequency
if self.data_frequency in ["hourly", "per_minute"]:
timestamp_col = X.columns[0]
if "date" in X[timestamp_col].dtype.name:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Non-rhetorical question: What if the column has dtype "datetime"? Is it necessary that we still cast that column to datetime, and warn that the column has "date" type?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The error would not occur if the column has dtype "datetime". In that case, no cast or warning will happen.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmmm, in that case the expression

"date" in X[timestamp_col].dtype.name

would still evaluate to True, right? Is that something we want?

Copy link
Contributor Author

@shuoweil shuoweil Dec 12, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the suggestion. I believe Garrett has a fair point, so I have reverted the code change. This PR now only includes the notebook changes.

warnings.warn(
f"Converting Date column '{timestamp_col}' to datetime for "
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It doesn't seem to make sense to convert date-granularity data in order to predict at hourly or per-minute data frequency. We should just let it emit errors.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That is a fair point. I have reverted the code change and updated the notebook.

f"{self.data_frequency} frequency. This is required because "
f"BigQuery ML doesn't support Date type with hourly frequency."
)
X = X.copy()
X[timestamp_col] = bpd.to_datetime(X[timestamp_col])

if X.columns.size != 1:
raise ValueError("Time series timestamp input X contain at least 1 column.")
if y.columns.size != 1:
Expand Down
574 changes: 574 additions & 0 deletions notebooks/ml/timeseries_analysis.ipynb

Large diffs are not rendered by default.

27 changes: 24 additions & 3 deletions tests/system/large/ml/test_forecasting.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from bigframes.ml import forecasting
from bigframes.testing import utils

ARIMA_EVALUATE_OUTPUT_COL = [
ARIMA_EVALUATE_OUTPUT_COLUMNS = [
"non_seasonal_p",
"non_seasonal_d",
"non_seasonal_q",
Expand Down Expand Up @@ -106,9 +106,9 @@ def test_arima_plus_model_fit_summary(
curr_model = arima_model_w_id if id_col_name else arima_model
result = curr_model.summary().to_pandas()
expected_columns = (
[id_col_name] + ARIMA_EVALUATE_OUTPUT_COL
[id_col_name] + ARIMA_EVALUATE_OUTPUT_COLUMNS
if id_col_name
else ARIMA_EVALUATE_OUTPUT_COL
else ARIMA_EVALUATE_OUTPUT_COLUMNS
)
utils.check_pandas_df_schema_and_index(
result, columns=expected_columns, index=2 if id_col_name else 1
Expand Down Expand Up @@ -190,3 +190,24 @@ def test_arima_plus_model_fit_params(
assert reloaded_model.min_time_series_length == 10
assert reloaded_model.trend_smoothing_window_size == 5
assert reloaded_model.decompose_time_series is False


def test_arima_plus_model_fit_date_conversion(time_series_df_default_index):
    """Fit an hourly ARIMAPlus model on a timestamp column holding plain dates.

    The fit is expected to emit a ``UserWarning`` announcing the conversion of
    the ``parsed_date`` column, and to leave ``_bqml_model`` populated.
    """
    hourly_model = forecasting.ARIMAPlus(data_frequency="hourly")

    # Arrange: work on a copy whose timestamp column is reduced to dates so
    # that the auto-conversion path is exercised.
    frame = time_series_df_default_index.copy()
    frame["parsed_date"] = frame["parsed_date"].dt.date
    timestamps = frame[["parsed_date"]]
    targets = frame[["total_visits"]]

    # pytest treats ``match`` as a regular expression searched in the message.
    expected_warning = (
        "Converting Date column 'parsed_date' to datetime for hourly frequency."
    )

    # Act
    with pytest.warns(UserWarning, match=expected_warning):
        hourly_model.fit(timestamps, targets)

    # Assert
    assert hourly_model._bqml_model is not None