Skip to content

Commit cd28e12

Browse files
tswastgoogle-labs-jules[bot]gcf-owl-bot[bot]
authored
feat: support typed pyarrow.Scalar in assignment (#1930)
* Add support for pyarrow.Scalar to infer_literal_type. * Update setup.py * Update tests/unit/core/test_dtypes.py * patch ibis * increase timeout * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * lint --------- Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com> Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
1 parent f8d851f commit cd28e12

File tree

7 files changed

+80
-9
lines changed

7 files changed

+80
-9
lines changed

bigframes/dtypes.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -659,6 +659,8 @@ def _dtype_from_string(dtype_string: str) -> typing.Optional[Dtype]:
659659

660660
def infer_literal_type(literal) -> typing.Optional[Dtype]:
661661
# Maybe also normalize literal to canonical python representation to remove this burden from compilers?
662+
if isinstance(literal, pa.Scalar):
663+
return arrow_dtype_to_bigframes_dtype(literal.type)
662664
if pd.api.types.is_list_like(literal):
663665
element_types = [infer_literal_type(i) for i in literal]
664666
common_type = lcd_type(*element_types)

tests/system/small/test_dataframe.py

Lines changed: 44 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -906,15 +906,53 @@ def test_df_to_pandas_batches(scalars_dfs):
906906
assert_pandas_df_equal(pd.concat(filtered_batches), pd_result)
907907

908908

909-
def test_assign_new_column(scalars_dfs):
909+
@pytest.mark.parametrize(
910+
("literal", "expected_dtype"),
911+
(
912+
pytest.param(
913+
2,
914+
dtypes.INT_DTYPE,
915+
id="INT64",
916+
),
917+
# ====================================================================
918+
# NULL values
919+
#
920+
# These are regression tests for b/428999884. It needs to be possible to
921+
# set a column to NULL with a desired type (not just the pandas default
922+
# of float64).
923+
# ====================================================================
924+
pytest.param(None, dtypes.FLOAT_DTYPE, id="NULL-None"),
925+
pytest.param(
926+
pa.scalar(None, type=pa.int64()),
927+
dtypes.INT_DTYPE,
928+
id="NULL-pyarrow-INT64",
929+
),
930+
pytest.param(
931+
pa.scalar(None, type=pa.timestamp("us", tz="UTC")),
932+
dtypes.TIMESTAMP_DTYPE,
933+
id="NULL-pyarrow-TIMESTAMP",
934+
),
935+
pytest.param(
936+
pa.scalar(None, type=pa.timestamp("us")),
937+
dtypes.DATETIME_DTYPE,
938+
id="NULL-pyarrow-DATETIME",
939+
),
940+
),
941+
)
942+
def test_assign_new_column_w_literal(scalars_dfs, literal, expected_dtype):
910943
scalars_df, scalars_pandas_df = scalars_dfs
911-
kwargs = {"new_col": 2}
912-
df = scalars_df.assign(**kwargs)
944+
df = scalars_df.assign(new_col=literal)
913945
bf_result = df.to_pandas()
914-
pd_result = scalars_pandas_df.assign(**kwargs)
915946

916-
# Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes.
917-
pd_result["new_col"] = pd_result["new_col"].astype("Int64")
947+
new_col_pd = literal
948+
if isinstance(literal, pa.Scalar):
949+
# PyArrow integer scalars aren't yet supported in pandas Int64Dtype.
950+
new_col_pd = literal.as_py()
951+
952+
# Pandas might not pick the same dtype as BigFrames, but it should at least
953+
# be castable to it.
954+
pd_result = scalars_pandas_df.assign(new_col=new_col_pd)
955+
pd_result["new_col"] = pd_result["new_col"].astype(expected_dtype)
918956

919957
assert_pandas_df_equal(bf_result, pd_result)
920958

tests/system/small/test_session.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -606,7 +606,7 @@ def test_read_gbq_wildcard(
606606
"query": {
607607
"useQueryCache": True,
608608
"maximumBytesBilled": "1000000000",
609-
"timeoutMs": 10000,
609+
"timeoutMs": 120_000,
610610
}
611611
},
612612
pytest.param(

tests/unit/core/test_dtypes.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,3 +272,19 @@ def test_literal_to_ibis_scalar_throws_on_incompatible_literal():
272272
ValueError,
273273
):
274274
bigframes.core.compile.ibis_types.literal_to_ibis_scalar({"mykey": "myval"})
275+
276+
277+
@pytest.mark.parametrize(
278+
["scalar", "expected_dtype"],
279+
[
280+
(pa.scalar(1_000_000_000, type=pa.int64()), bigframes.dtypes.INT_DTYPE),
281+
(pa.scalar(True, type=pa.bool_()), bigframes.dtypes.BOOL_DTYPE),
282+
(pa.scalar("hello", type=pa.string()), bigframes.dtypes.STRING_DTYPE),
283+
# Support NULL scalars.
284+
(pa.scalar(None, type=pa.int64()), bigframes.dtypes.INT_DTYPE),
285+
(pa.scalar(None, type=pa.bool_()), bigframes.dtypes.BOOL_DTYPE),
286+
(pa.scalar(None, type=pa.string()), bigframes.dtypes.STRING_DTYPE),
287+
],
288+
)
289+
def test_infer_literal_type_arrow_scalar(scalar, expected_dtype):
290+
assert bigframes.dtypes.infer_literal_type(scalar) == expected_dtype

third_party/bigframes_vendored/ibis/common/temporal.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,3 +260,8 @@ def _from_numpy_datetime64(value):
260260
raise TypeError("Unable to convert np.datetime64 without pandas")
261261
else:
262262
return pd.Timestamp(value).to_pydatetime()
263+
264+
265+
@normalize_datetime.register("pyarrow.Scalar")
266+
def _from_pyarrow_scalar(value):
267+
return value.as_py()

third_party/bigframes_vendored/ibis/expr/datatypes/value.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import bigframes_vendored.ibis.expr.datatypes as dt
2828
from bigframes_vendored.ibis.expr.datatypes.cast import highest_precedence
2929
from public import public
30+
import pyarrow as pa
3031
import toolz
3132

3233

@@ -71,6 +72,14 @@ def infer_list(values: Sequence[Any]) -> dt.Array:
7172
return dt.Array(highest_precedence(map(infer, values)))
7273

7374

75+
@infer.register("pyarrow.Scalar")
76+
def infer_pyarrow_scalar(value: "pa.Scalar"):
77+
"""Infer the type of a PyArrow Scalar value."""
78+
import bigframes_vendored.ibis.formats.pyarrow
79+
80+
return bigframes_vendored.ibis.formats.pyarrow.PyArrowType.to_ibis(value.type)
81+
82+
7483
@infer.register(datetime.time)
7584
def infer_time(value: datetime.time) -> dt.Time:
7685
return dt.time
@@ -253,6 +262,9 @@ def infer_shapely_multipolygon(value) -> dt.MultiPolygon:
253262
def normalize(typ, value):
254263
"""Ensure that the Python type underlying a literal resolves to a single type."""
255264

265+
if pa is not None and isinstance(value, pa.Scalar):
266+
value = value.as_py()
267+
256268
dtype = dt.dtype(typ)
257269
if value is None:
258270
if not dtype.nullable:

third_party/bigframes_vendored/ibis/formats/pyarrow.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424
@functools.cache
2525
def _from_pyarrow_types():
2626
import pyarrow as pa
27-
import pyarrow_hotfix # noqa: F401
2827

2928
return {
3029
pa.int8(): dt.Int8,
@@ -87,7 +86,6 @@ class PyArrowType(TypeMapper):
8786
def to_ibis(cls, typ: pa.DataType, nullable=True) -> dt.DataType:
8887
"""Convert a pyarrow type to an ibis type."""
8988
import pyarrow as pa
90-
import pyarrow_hotfix # noqa: F401
9189

9290
if pa.types.is_null(typ):
9391
return dt.null

0 commit comments

Comments
 (0)