feat: support typed pyarrow.Scalar in assignment (#1930)

tswast · google-labs-jules[bot] · gcf-owl-bot[bot] · web-flow · commit cd28e12b3f70 · 2025-07-23T12:54:45.000-05:00
* Add support for pyarrow.Scalar to infer_literal_type. * Update setup.py * Update tests/unit/core/test_dtypes.py * patch ibis * increase timeout * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * lint --------- Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com> Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py
@@ -659,6 +659,8 @@ def _dtype_from_string(dtype_string: str) -> typing.Optional[Dtype]:
 
 def infer_literal_type(literal) -> typing.Optional[Dtype]:
     # Maybe also normalize literal to canonical python representation to remove this burden from compilers?
+    if isinstance(literal, pa.Scalar):
+        return arrow_dtype_to_bigframes_dtype(literal.type)
     if pd.api.types.is_list_like(literal):
         element_types = [infer_literal_type(i) for i in literal]
         common_type = lcd_type(*element_types)
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
@@ -906,15 +906,53 @@ def test_df_to_pandas_batches(scalars_dfs):
     assert_pandas_df_equal(pd.concat(filtered_batches), pd_result)
 
 
-def test_assign_new_column(scalars_dfs):
+@pytest.mark.parametrize(
+    ("literal", "expected_dtype"),
+    (
+        pytest.param(
+            2,
+            dtypes.INT_DTYPE,
+            id="INT64",
+        ),
+        # ====================================================================
+        # NULL values
+        #
+        # These are regression tests for b/428999884. It needs to be possible to
+        # set a column to NULL with a desired type (not just the pandas default
+        # of float64).
+        # ====================================================================
+        pytest.param(None, dtypes.FLOAT_DTYPE, id="NULL-None"),
+        pytest.param(
+            pa.scalar(None, type=pa.int64()),
+            dtypes.INT_DTYPE,
+            id="NULL-pyarrow-INT64",
+        ),
+        pytest.param(
+            pa.scalar(None, type=pa.timestamp("us", tz="UTC")),
+            dtypes.TIMESTAMP_DTYPE,
+            id="NULL-pyarrow-TIMESTAMP",
+        ),
+        pytest.param(
+            pa.scalar(None, type=pa.timestamp("us")),
+            dtypes.DATETIME_DTYPE,
+            id="NULL-pyarrow-DATETIME",
+        ),
+    ),
+)
+def test_assign_new_column_w_literal(scalars_dfs, literal, expected_dtype):
     scalars_df, scalars_pandas_df = scalars_dfs
-    kwargs = {"new_col": 2}
-    df = scalars_df.assign(**kwargs)
+    df = scalars_df.assign(new_col=literal)
     bf_result = df.to_pandas()
-    pd_result = scalars_pandas_df.assign(**kwargs)
 
-    # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes.
-    pd_result["new_col"] = pd_result["new_col"].astype("Int64")
+    new_col_pd = literal
+    if isinstance(literal, pa.Scalar):
+        # PyArrow integer scalars aren't yet supported in pandas Int64Dtype.
+        new_col_pd = literal.as_py()
+
+    # Pandas might not pick the same dtype as BigFrames, but it should at least
+    # be castable to it.
+    pd_result = scalars_pandas_df.assign(new_col=new_col_pd)
+    pd_result["new_col"] = pd_result["new_col"].astype(expected_dtype)
 
     assert_pandas_df_equal(bf_result, pd_result)
 
diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py
@@ -606,7 +606,7 @@ def test_read_gbq_wildcard(
             "query": {
                 "useQueryCache": True,
                 "maximumBytesBilled": "1000000000",
-                "timeoutMs": 10000,
+                "timeoutMs": 120_000,
             }
         },
         pytest.param(
diff --git a/tests/unit/core/test_dtypes.py b/tests/unit/core/test_dtypes.py
@@ -272,3 +272,19 @@ def test_literal_to_ibis_scalar_throws_on_incompatible_literal():
         ValueError,
     ):
         bigframes.core.compile.ibis_types.literal_to_ibis_scalar({"mykey": "myval"})
+
+
+@pytest.mark.parametrize(
+    ["scalar", "expected_dtype"],
+    [
+        (pa.scalar(1_000_000_000, type=pa.int64()), bigframes.dtypes.INT_DTYPE),
+        (pa.scalar(True, type=pa.bool_()), bigframes.dtypes.BOOL_DTYPE),
+        (pa.scalar("hello", type=pa.string()), bigframes.dtypes.STRING_DTYPE),
+        # Support NULL scalars.
+        (pa.scalar(None, type=pa.int64()), bigframes.dtypes.INT_DTYPE),
+        (pa.scalar(None, type=pa.bool_()), bigframes.dtypes.BOOL_DTYPE),
+        (pa.scalar(None, type=pa.string()), bigframes.dtypes.STRING_DTYPE),
+    ],
+)
+def test_infer_literal_type_arrow_scalar(scalar, expected_dtype):
+    assert bigframes.dtypes.infer_literal_type(scalar) == expected_dtype
diff --git a/third_party/bigframes_vendored/ibis/common/temporal.py b/third_party/bigframes_vendored/ibis/common/temporal.py
@@ -260,3 +260,8 @@ def _from_numpy_datetime64(value):
         raise TypeError("Unable to convert np.datetime64 without pandas")
     else:
         return pd.Timestamp(value).to_pydatetime()
+
+
+@normalize_datetime.register("pyarrow.Scalar")
+def _from_pyarrow_scalar(value):
+    return value.as_py()
diff --git a/third_party/bigframes_vendored/ibis/expr/datatypes/value.py b/third_party/bigframes_vendored/ibis/expr/datatypes/value.py
@@ -27,6 +27,7 @@
 import bigframes_vendored.ibis.expr.datatypes as dt
 from bigframes_vendored.ibis.expr.datatypes.cast import highest_precedence
 from public import public
+import pyarrow as pa
 import toolz
 
 
@@ -71,6 +72,14 @@ def infer_list(values: Sequence[Any]) -> dt.Array:
     return dt.Array(highest_precedence(map(infer, values)))
 
 
+@infer.register("pyarrow.Scalar")
+def infer_pyarrow_scalar(value: "pa.Scalar"):
+    """Infer the type of a PyArrow Scalar value."""
+    import bigframes_vendored.ibis.formats.pyarrow
+
+    return bigframes_vendored.ibis.formats.pyarrow.PyArrowType.to_ibis(value.type)
+
+
 @infer.register(datetime.time)
 def infer_time(value: datetime.time) -> dt.Time:
     return dt.time
@@ -253,6 +262,9 @@ def infer_shapely_multipolygon(value) -> dt.MultiPolygon:
 def normalize(typ, value):
     """Ensure that the Python type underlying a literal resolves to a single type."""
 
+    if pa is not None and isinstance(value, pa.Scalar):
+        value = value.as_py()
+
     dtype = dt.dtype(typ)
     if value is None:
         if not dtype.nullable:
diff --git a/third_party/bigframes_vendored/ibis/formats/pyarrow.py b/third_party/bigframes_vendored/ibis/formats/pyarrow.py
@@ -24,7 +24,6 @@
 @functools.cache
 def _from_pyarrow_types():
     import pyarrow as pa
-    import pyarrow_hotfix  # noqa: F401
 
     return {
         pa.int8(): dt.Int8,
@@ -87,7 +86,6 @@ class PyArrowType(TypeMapper):
     def to_ibis(cls, typ: pa.DataType, nullable=True) -> dt.DataType:
         """Convert a pyarrow type to an ibis type."""
         import pyarrow as pa
-        import pyarrow_hotfix  # noqa: F401
 
         if pa.types.is_null(typ):
             return dt.null

Original file line number	Diff line number	Diff line change
`@@ -606,7 +606,7 @@ def test_read_gbq_wildcard(`
`606`	`606`	`"query": {`
`607`	`607`	`"useQueryCache": True,`
`608`	`608`	`"maximumBytesBilled": "1000000000",`
`609`		`- "timeoutMs": 10000,`
	`609`	`+ "timeoutMs": 120_000,`
`610`	`610`	`}`
`611`	`611`	`},`
`612`	`612`	`pytest.param(`