diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index 1ba76dee5b..1bfbe0f734 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -301,6 +301,34 @@ def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: assert isinstance(op, string_ops.StrConcatOp) return pl.concat_str(l_input, r_input) + @compile_op.register(string_ops.StrContainsOp) + def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: + assert isinstance(op, string_ops.StrContainsOp) + return input.str.contains(pattern=op.pat, literal=True) + + @compile_op.register(string_ops.StrContainsRegexOp) + def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: + assert isinstance(op, string_ops.StrContainsRegexOp) + return input.str.contains(pattern=op.pat, literal=False) + + @compile_op.register(string_ops.StartsWithOp) + def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: + assert isinstance(op, string_ops.StartsWithOp) + if len(op.pat) == 1: + return input.str.starts_with(op.pat[0]) + else: + return pl.any_horizontal( + *(input.str.starts_with(pat) for pat in op.pat) + ) + + @compile_op.register(string_ops.EndsWithOp) + def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: + assert isinstance(op, string_ops.EndsWithOp) + if len(op.pat) == 1: + return input.str.ends_with(op.pat[0]) + else: + return pl.any_horizontal(*(input.str.ends_with(pat) for pat in op.pat)) + @compile_op.register(dt_ops.StrftimeOp) def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: assert isinstance(op, dt_ops.StrftimeOp) diff --git a/bigframes/session/polars_executor.py b/bigframes/session/polars_executor.py index 6e3f0ca10f..b93d31d255 100644 --- a/bigframes/session/polars_executor.py +++ b/bigframes/session/polars_executor.py @@ -21,7 +21,13 @@ from bigframes.core import array_value, bigframe_node, expression, local_data, nodes import bigframes.operations from bigframes.operations import 
aggregations as agg_ops -from bigframes.operations import bool_ops, comparison_ops, generic_ops, numeric_ops +from bigframes.operations import ( + bool_ops, + comparison_ops, + generic_ops, + numeric_ops, + string_ops, +) from bigframes.session import executor, semi_executor if TYPE_CHECKING: @@ -69,6 +75,10 @@ generic_ops.IsInOp, generic_ops.IsNullOp, generic_ops.NotNullOp, + string_ops.StartsWithOp, + string_ops.EndsWithOp, + string_ops.StrContainsOp, + string_ops.StrContainsRegexOp, ) _COMPATIBLE_AGG_OPS = ( agg_ops.SizeOp, diff --git a/tests/system/small/engines/test_strings.py b/tests/system/small/engines/test_strings.py new file mode 100644 index 0000000000..cbab517ef0 --- /dev/null +++ b/tests/system/small/engines/test_strings.py @@ -0,0 +1,77 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from bigframes.core import array_value +import bigframes.operations as ops +from bigframes.session import polars_executor +from bigframes.testing.engine_utils import assert_equivalence_execution + +pytest.importorskip("polars") + +# Polars used as reference as it's fast and local. Generally though, prefer gbq engine where they disagree. 
+REFERENCE_ENGINE = polars_executor.PolarsExecutor() + + +@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +def test_engines_str_contains(scalars_array_value: array_value.ArrayValue, engine): + arr, _ = scalars_array_value.compute_values( + [ + ops.StrContainsOp("(?i)hEllo").as_expr("string_col"), + ops.StrContainsOp("Hello").as_expr("string_col"), + ops.StrContainsOp("T").as_expr("string_col"), + ops.StrContainsOp(".*").as_expr("string_col"), + ] + ) + assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) + + +@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +def test_engines_str_contains_regex( + scalars_array_value: array_value.ArrayValue, engine +): + arr, _ = scalars_array_value.compute_values( + [ + ops.StrContainsRegexOp("(?i)hEllo").as_expr("string_col"), + ops.StrContainsRegexOp("Hello").as_expr("string_col"), + ops.StrContainsRegexOp("T").as_expr("string_col"), + ops.StrContainsRegexOp(".*").as_expr("string_col"), + ] + ) + assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) + + +@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +def test_engines_str_startswith(scalars_array_value: array_value.ArrayValue, engine): + arr, _ = scalars_array_value.compute_values( + [ + ops.StartsWithOp("He").as_expr("string_col"), + ops.StartsWithOp("llo").as_expr("string_col"), + ops.StartsWithOp(("He", "T", "ca")).as_expr("string_col"), + ] + ) + assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) + + +@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +def test_engines_str_endswith(scalars_array_value: array_value.ArrayValue, engine): + arr, _ = scalars_array_value.compute_values( + [ + ops.EndsWithOp("!").as_expr("string_col"), + ops.EndsWithOp("llo").as_expr("string_col"), + ops.EndsWithOp(("He", "T", "ca")).as_expr("string_col"), + ] + ) + assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine)