diff --git a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py index bc077c1ce3..f3653efc56 100644 --- a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py +++ b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py @@ -1062,7 +1062,7 @@ def isin_op_impl(x: ibis_types.Value, op: ops.IsInOp): if op.match_nulls and contains_nulls: return x.isnull() | x.isin(matchable_ibis_values) else: - return x.isin(matchable_ibis_values) + return x.isin(matchable_ibis_values).fillna(False) @scalar_op_compiler.register_unary_op(ops.ToDatetimeOp, pass_op=True) diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index a3c5d5a80e..dfd4ace18e 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -259,11 +259,9 @@ def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: # TODO: Filter out types that can't be coerced to right type assert isinstance(op, gen_ops.IsInOp) - if op.match_nulls or not any(map(pd.isna, op.values)): - # newer polars version have nulls_equal arg - return input.is_in(op.values) - else: - return input.is_in(op.values) or input.is_null() + assert not op.match_nulls # should be stripped by a lowering step rn + values = pl.Series(op.values, strict=False) + return input.is_in(values) @compile_op.register(gen_ops.FillNaOp) @compile_op.register(gen_ops.CoalesceOp) diff --git a/bigframes/core/compile/polars/lowering.py b/bigframes/core/compile/polars/lowering.py index f6ed6c676c..876ff2794f 100644 --- a/bigframes/core/compile/polars/lowering.py +++ b/bigframes/core/compile/polars/lowering.py @@ -13,8 +13,10 @@ # limitations under the License. import dataclasses +from typing import cast import numpy as np +import pandas as pd from bigframes import dtypes from bigframes.core import bigframe_node, expression @@ -316,6 +318,35 @@ def lower(self, expr: expression.OpExpression) -> expression.Expression: return expr +class LowerIsinOp(op_lowering.OpLoweringRule): + @property + def op(self) -> type[ops.ScalarOp]: + return generic_ops.IsInOp + + def lower(self, expr: expression.OpExpression) -> expression.Expression: + assert isinstance(expr.op, generic_ops.IsInOp) + arg = expr.children[0] + new_values = [] + match_nulls = False + for val in expr.op.values: + # coercible, non-coercible + # float NaN/inf should be treated as distinct from 'true' null values + if cast(bool, pd.isna(val)) and not isinstance(val, float): + if expr.op.match_nulls: + match_nulls = True + elif dtypes.is_compatible(val, arg.output_type): + new_values.append(val) + else: + pass + + new_isin = ops.IsInOp(tuple(new_values), match_nulls=False).as_expr(arg) + if match_nulls: + return ops.coalesce_op.as_expr(new_isin, expression.const(True)) + else: + # polars propagates nulls, so need to coalesce to false + return ops.coalesce_op.as_expr(new_isin, expression.const(False)) + + def _coerce_comparables( expr1: expression.Expression, expr2: expression.Expression, @@ -414,6 +445,7 @@ def _lower_cast(cast_op: ops.AsTypeOp, arg: expression.Expression): LowerModRule(), LowerAsTypeRule(), LowerInvertOp(), + LowerIsinOp(), ) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index bcad00830d..b388ac0097 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2755,11 +2755,11 @@ def isin(self, values) -> DataFrame: False, label=label, dtype=pandas.BooleanDtype() ) result_ids.append(result_id) - return DataFrame(block.select_columns(result_ids)).fillna(value=False) + return DataFrame(block.select_columns(result_ids)) elif utils.is_list_like(values): return self._apply_unary_op( ops.IsInOp(values=tuple(values), match_nulls=True) - ).fillna(value=False) + ) else: raise TypeError( "only list-like objects are allowed to be passed to " diff --git a/bigframes/session/polars_executor.py b/bigframes/session/polars_executor.py index 8aa7fd9002..963b5b06a7 100644 --- a/bigframes/session/polars_executor.py +++ b/bigframes/session/polars_executor.py @@ -63,6 +63,7 @@ generic_ops.FillNaOp, generic_ops.CaseWhenOp, generic_ops.InvertOp, + generic_ops.IsInOp, ) _COMPATIBLE_AGG_OPS = ( agg_ops.SizeOp, diff --git a/tests/system/small/engines/test_generic_ops.py b/tests/system/small/engines/test_generic_ops.py index 9fdb6bca78..1d28c335a6 100644 --- a/tests/system/small/engines/test_generic_ops.py +++ b/tests/system/small/engines/test_generic_ops.py @@ -390,3 +390,36 @@ def test_engines_invert_op(scalars_array_value: array_value.ArrayValue, engine): ) assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) + + +@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +def test_engines_isin_op(scalars_array_value: array_value.ArrayValue, engine): + arr, col_ids = scalars_array_value.compute_values( + [ + ops.IsInOp((1, 2, 3)).as_expr(expression.deref("int64_col")), + ops.IsInOp((None, 123456)).as_expr(expression.deref("int64_col")), + ops.IsInOp((None, 123456), match_nulls=False).as_expr( + expression.deref("int64_col") + ), + ops.IsInOp((1.0, 2.0, 3.0)).as_expr(expression.deref("int64_col")), + ops.IsInOp(("1.0", "2.0")).as_expr(expression.deref("int64_col")), + ops.IsInOp(("1.0", 2.5, 3)).as_expr(expression.deref("int64_col")), + ops.IsInOp(()).as_expr(expression.deref("int64_col")), + ops.IsInOp((1, 2, 3, None)).as_expr(expression.deref("float64_col")), + ] + ) + new_names = ( + "int in ints", + "int in ints w null", + "int in ints w null wo match nulls", + "int in floats", + "int in strings", + "int in mixed", + "int in empty", + "float in ints", + ) + arr = arr.rename_columns( + {old_name: new_names[i] for i, old_name in enumerate(col_ids)} + ) + + assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 3b70dec0e9..f752346bef 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -1591,7 +1591,7 @@ def test_itertuples(scalars_df_index, index, name): assert bf_tuple == pd_tuple -def test_df_isin_list(scalars_dfs): +def test_df_isin_list_w_null(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs values = ["Hello, World!", 55555, 2.51, pd.NA, True] bf_result = ( @@ -1606,6 +1606,21 @@ def test_df_isin_list(scalars_dfs): pandas.testing.assert_frame_equal(bf_result, pd_result.astype("boolean")) +def test_df_isin_list_wo_null(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + values = ["Hello, World!", 55555, 2.51, True] + bf_result = ( + scalars_df[["int64_col", "float64_col", "string_col", "bool_col"]] + .isin(values) + .to_pandas() + ) + pd_result = scalars_pandas_df[ + ["int64_col", "float64_col", "string_col", "bool_col"] + ].isin(values) + + pandas.testing.assert_frame_equal(bf_result, pd_result.astype("boolean")) + + def test_df_isin_dict(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs values = {