Merge branch 'main' into output_schema

ccarpentiere · web-flow · commit c34409d42c2f · 2025-08-28T09:47:48.000-07:00
diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py
@@ -301,6 +301,34 @@ def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr:
             assert isinstance(op, string_ops.StrConcatOp)
             return pl.concat_str(l_input, r_input)
 
+        @compile_op.register(string_ops.StrContainsOp)
+        def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr:  # compile StrContainsOp into a polars str.contains expression
+            assert isinstance(op, string_ops.StrContainsOp)  # type-narrowing guard for the op.pat access below
+            return input.str.contains(pattern=op.pat, literal=True)  # literal=True: op.pat is matched as plain text, not as a regex
+
+        @compile_op.register(string_ops.StrContainsRegexOp)
+        def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr:  # compile StrContainsRegexOp into a polars str.contains expression
+            assert isinstance(op, string_ops.StrContainsRegexOp)  # type-narrowing guard for the op.pat access below
+            return input.str.contains(pattern=op.pat, literal=False)  # literal=False: op.pat is interpreted as a regular expression
+
+        @compile_op.register(string_ops.StartsWithOp)
+        def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr:  # tests whether input starts with any of the prefixes in op.pat
+            assert isinstance(op, string_ops.StartsWithOp)  # type-narrowing guard for the op.pat access below
+            if len(op.pat) == 1:  # single prefix: one starts_with call, nothing to combine
+                return input.str.starts_with(op.pat[0])
+            else:  # NOTE(review): also taken when op.pat is empty, which calls any_horizontal() with no inputs - confirm callers never pass ()
+                return pl.any_horizontal(  # row-wise OR over one starts_with test per prefix
+                    *(input.str.starts_with(pat) for pat in op.pat)
+                )
+
+        @compile_op.register(string_ops.EndsWithOp)
+        def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr:  # tests whether input ends with any of the suffixes in op.pat
+            assert isinstance(op, string_ops.EndsWithOp)  # type-narrowing guard for the op.pat access below
+            if len(op.pat) == 1:  # single suffix: one ends_with call, nothing to combine
+                return input.str.ends_with(op.pat[0])
+            else:  # NOTE(review): also taken when op.pat is empty, which calls any_horizontal() with no inputs - confirm callers never pass ()
+                return pl.any_horizontal(*(input.str.ends_with(pat) for pat in op.pat))  # row-wise OR over one ends_with test per suffix
+
         @compile_op.register(dt_ops.StrftimeOp)
         def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr:
             assert isinstance(op, dt_ops.StrftimeOp)
diff --git a/bigframes/core/expression.py b/bigframes/core/expression.py
@@ -253,6 +253,11 @@ def is_identity(self) -> bool:
     def transform_children(self, t: Callable[[Expression], Expression]) -> Expression:
         ...
 
+    def bottom_up(self, t: Callable[[Expression], Expression]) -> Expression:  # rewrite the tree with t, descendants before this node
+        expr = self.transform_children(lambda child: child.bottom_up(t))  # recurse first; assumes transform_children applies the callable to each direct child (body not shown here)
+        expr = t(expr)  # t then sees this node with its already-rewritten children
+        return expr
+
     def walk(self) -> Generator[Expression, None, None]:
         yield self
         for child in self.children:
diff --git a/bigframes/core/rewrite/op_lowering.py b/bigframes/core/rewrite/op_lowering.py
@@ -44,7 +44,7 @@ def lower_expr_step(expr: expression.Expression) -> expression.Expression:
                     return maybe_rule.lower(expr)
             return expr
 
-        return lower_expr_step(expr.transform_children(lower_expr_step))
+        return expr.bottom_up(lower_expr_step)
 
     def lower_node(node: bigframe_node.BigFrameNode) -> bigframe_node.BigFrameNode:
         if isinstance(
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
@@ -582,6 +582,7 @@ def __getitem__(
             # Index of column labels can be treated the same as a sequence of column labels.
             pandas.Index,
             bigframes.series.Series,
+            slice,
         ],
     ):  # No return type annotations (like pandas) as type cannot always be determined statically
         # NOTE: This implements the operations described in
diff --git a/bigframes/ml/cluster.py b/bigframes/ml/cluster.py
@@ -59,7 +59,7 @@ def __init__(
         warm_start: bool = False,
     ):
         self.n_clusters = n_clusters
-        # allow the alias to be compatible with sklean
+        # allow the alias to be compatible with sklearn
         self.init = "kmeans++" if init == "k-means++" else init
         self.init_col = init_col
         self.distance_type = distance_type
diff --git a/bigframes/ml/forecasting.py b/bigframes/ml/forecasting.py
@@ -211,7 +211,7 @@ def _fit(
         Args:
             X (bigframes.dataframe.DataFrame or bigframes.series.Series,
             or pandas.core.frame.DataFrame or pandas.core.series.Series):
-                A dataframe or series of trainging timestamp.
+                A dataframe or series of training timestamp.
             y (bigframes.dataframe.DataFrame, or bigframes.series.Series,
             or pandas.core.frame.DataFrame, or pandas.core.series.Series):
                 Target values for training.
diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py
@@ -834,7 +834,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> GeminiTextGenerator:
 class Claude3TextGenerator(base.RetriableRemotePredictor):
     """Claude3 text generator LLM model.
 
-    Go to Google Cloud Console -> Vertex AI -> Model Garden page to enabe the models before use. Must have the Consumer Procurement Entitlement Manager Identity and Access Management (IAM) role to enable the models.
+    Go to Google Cloud Console -> Vertex AI -> Model Garden page to enable the models before use. Must have the Consumer Procurement Entitlement Manager Identity and Access Management (IAM) role to enable the models.
     https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/use-partner-models#grant-permissions
 
     .. note::
diff --git a/bigframes/ml/model_selection.py b/bigframes/ml/model_selection.py
@@ -82,7 +82,7 @@ def train_test_split(
     dfs = list(utils.batch_convert_to_dataframe(*arrays))
 
     def _stratify_split(df: bpd.DataFrame, stratify: bpd.Series) -> List[bpd.DataFrame]:
-        """Split a single DF accoding to the stratify Series."""
+        """Split a single DF according to the stratify Series."""
         stratify = stratify.rename("bigframes_stratify_col")  # avoid name conflicts
         merged_df = df.join(stratify.to_frame(), how="outer")
 
diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py
@@ -434,7 +434,7 @@ def _compile_to_sql(
         if columns is None:
             columns = X.columns
         drop = self.drop if self.drop is not None else "none"
-        # minus one here since BQML's inplimentation always includes index 0, and top_k is on top of that.
+        # minus one here since BQML's implementation always includes index 0, and top_k is on top of that.
         top_k = (
             (self.max_categories - 1)
             if self.max_categories is not None
diff --git a/bigframes/session/metrics.py b/bigframes/session/metrics.py
@@ -45,12 +45,17 @@ def count_job_stats(
             bytes_processed = getattr(row_iterator, "total_bytes_processed", 0) or 0
             query_char_count = len(getattr(row_iterator, "query", "") or "")
             slot_millis = getattr(row_iterator, "slot_millis", 0) or 0
-            exec_seconds = 0.0
+            created = getattr(row_iterator, "created", None)
+            ended = getattr(row_iterator, "ended", None)
+            exec_seconds = (
+                (ended - created).total_seconds() if created and ended else 0.0
+            )
 
             self.execution_count += 1
             self.query_char_count += query_char_count
             self.bytes_processed += bytes_processed
             self.slot_millis += slot_millis
+            self.execution_secs += exec_seconds
 
         elif query_job.configuration.dry_run:
             query_char_count = len(query_job.query)
diff --git a/bigframes/session/polars_executor.py b/bigframes/session/polars_executor.py
@@ -21,7 +21,13 @@
 from bigframes.core import array_value, bigframe_node, expression, local_data, nodes
 import bigframes.operations
 from bigframes.operations import aggregations as agg_ops
-from bigframes.operations import bool_ops, comparison_ops, generic_ops, numeric_ops
+from bigframes.operations import (
+    bool_ops,
+    comparison_ops,
+    generic_ops,
+    numeric_ops,
+    string_ops,
+)
 from bigframes.session import executor, semi_executor
 
 if TYPE_CHECKING:
@@ -69,6 +75,10 @@
     generic_ops.IsInOp,
     generic_ops.IsNullOp,
     generic_ops.NotNullOp,
+    string_ops.StartsWithOp,
+    string_ops.EndsWithOp,
+    string_ops.StrContainsOp,
+    string_ops.StrContainsRegexOp,
 )
 _COMPATIBLE_AGG_OPS = (
     agg_ops.SizeOp,
diff --git a/scripts/run_and_publish_benchmark.py b/scripts/run_and_publish_benchmark.py
@@ -84,43 +84,36 @@ def collect_benchmark_result(
     path = pathlib.Path(benchmark_path)
     try:
         results_dict: Dict[str, List[Union[int, float, None]]] = {}
-        bytes_files = sorted(path.rglob("*.bytesprocessed"))
-        millis_files = sorted(path.rglob("*.slotmillis"))
-        bq_seconds_files = sorted(path.rglob("*.bq_exec_time_seconds"))
+        # Use local_seconds_files as the baseline
         local_seconds_files = sorted(path.rglob("*.local_exec_time_seconds"))
-        query_char_count_files = sorted(path.rglob("*.query_char_count"))
-
         error_files = sorted(path.rglob("*.error"))
-
-        if not (
-            len(millis_files)
-            == len(bq_seconds_files)
-            <= len(bytes_files)
-            == len(query_char_count_files)
-            == len(local_seconds_files)
-        ):
-            raise ValueError(
-                "Mismatch in the number of report files for bytes, millis, seconds and query char count: \n"
-                f"millis_files: {len(millis_files)}\n"
-                f"bq_seconds_files: {len(bq_seconds_files)}\n"
-                f"bytes_files: {len(bytes_files)}\n"
-                f"query_char_count_files: {len(query_char_count_files)}\n"
-                f"local_seconds_files: {len(local_seconds_files)}\n"
-            )
-
-        has_full_metrics = len(bq_seconds_files) == len(local_seconds_files)
-
-        for idx in range(len(local_seconds_files)):
-            query_char_count_file = query_char_count_files[idx]
-            local_seconds_file = local_seconds_files[idx]
-            bytes_file = bytes_files[idx]
-            filename = query_char_count_file.relative_to(path).with_suffix("")
-            if filename != local_seconds_file.relative_to(path).with_suffix(
-                ""
-            ) or filename != bytes_file.relative_to(path).with_suffix(""):
-                raise ValueError(
-                    "File name mismatch among query_char_count, bytes and seconds reports."
-                )
+        benchmarks_with_missing_files = []
+
+        for local_seconds_file in local_seconds_files:
+            base_name = local_seconds_file.name.removesuffix(".local_exec_time_seconds")
+            base_path = local_seconds_file.parent / base_name
+            filename = base_path.relative_to(path)
+
+            # Construct paths for other metric files
+            bytes_file = pathlib.Path(f"{base_path}.bytesprocessed")
+            millis_file = pathlib.Path(f"{base_path}.slotmillis")
+            bq_seconds_file = pathlib.Path(f"{base_path}.bq_exec_time_seconds")
+            query_char_count_file = pathlib.Path(f"{base_path}.query_char_count")
+
+            # Check if all corresponding files exist
+            missing_files = []
+            if not bytes_file.exists():
+                missing_files.append(bytes_file.name)
+            if not millis_file.exists():
+                missing_files.append(millis_file.name)
+            if not bq_seconds_file.exists():
+                missing_files.append(bq_seconds_file.name)
+            if not query_char_count_file.exists():
+                missing_files.append(query_char_count_file.name)
+
+            if missing_files:
+                benchmarks_with_missing_files.append((str(filename), missing_files))
+                continue
 
             with open(query_char_count_file, "r") as file:
                 lines = file.read().splitlines()
@@ -135,26 +128,13 @@ def collect_benchmark_result(
                 lines = file.read().splitlines()
                 total_bytes = sum(int(line) for line in lines) / iterations
 
-            if not has_full_metrics:
-                total_slot_millis = None
-                bq_seconds = None
-            else:
-                millis_file = millis_files[idx]
-                bq_seconds_file = bq_seconds_files[idx]
-                if filename != millis_file.relative_to(path).with_suffix(
-                    ""
-                ) or filename != bq_seconds_file.relative_to(path).with_suffix(""):
-                    raise ValueError(
-                        "File name mismatch among query_char_count, bytes, millis, and seconds reports."
-                    )
-
-                with open(millis_file, "r") as file:
-                    lines = file.read().splitlines()
-                    total_slot_millis = sum(int(line) for line in lines) / iterations
+            with open(millis_file, "r") as file:
+                lines = file.read().splitlines()
+                total_slot_millis = sum(int(line) for line in lines) / iterations
 
-                with open(bq_seconds_file, "r") as file:
-                    lines = file.read().splitlines()
-                    bq_seconds = sum(float(line) for line in lines) / iterations
+            with open(bq_seconds_file, "r") as file:
+                lines = file.read().splitlines()
+                bq_seconds = sum(float(line) for line in lines) / iterations
 
             results_dict[str(filename)] = [
                 query_count,
@@ -207,13 +187,9 @@ def collect_benchmark_result(
             f"{index} - query count: {row['Query_Count']},"
             + f" query char count: {row['Query_Char_Count']},"
             + f" bytes processed sum: {row['Bytes_Processed']},"
-            + (f" slot millis sum: {row['Slot_Millis']}," if has_full_metrics else "")
-            + f" local execution time: {formatted_local_exec_time} seconds"
-            + (
-                f", bigquery execution time: {round(row['BigQuery_Execution_Time_Sec'], 1)} seconds"
-                if has_full_metrics
-                else ""
-            )
+            + f" slot millis sum: {row['Slot_Millis']},"
+            + f" local execution time: {formatted_local_exec_time}"
+            + f", bigquery execution time: {round(row['BigQuery_Execution_Time_Sec'], 1)} seconds"
         )
 
     geometric_mean_queries = geometric_mean_excluding_zeros(
@@ -239,30 +215,26 @@ def collect_benchmark_result(
         f"---Geometric mean of queries: {geometric_mean_queries},"
         + f" Geometric mean of queries char counts: {geometric_mean_query_char_count},"
         + f" Geometric mean of bytes processed: {geometric_mean_bytes},"
-        + (
-            f" Geometric mean of slot millis: {geometric_mean_slot_millis},"
-            if has_full_metrics
-            else ""
-        )
+        + f" Geometric mean of slot millis: {geometric_mean_slot_millis},"
         + f" Geometric mean of local execution time: {geometric_mean_local_seconds} seconds"
-        + (
-            f", Geometric mean of BigQuery execution time: {geometric_mean_bq_seconds} seconds---"
-            if has_full_metrics
-            else ""
-        )
+        + f", Geometric mean of BigQuery execution time: {geometric_mean_bq_seconds} seconds---"
     )
 
-    error_message = (
-        "\n"
-        + "\n".join(
-            [
-                f"Failed: {error_file.relative_to(path).with_suffix('')}"
-                for error_file in error_files
-            ]
+    all_errors: List[str] = []
+    if error_files:
+        all_errors.extend(
+            f"Failed: {error_file.relative_to(path).with_suffix('')}"
+            for error_file in error_files
         )
-        if error_files
-        else None
-    )
+    if (
+        benchmarks_with_missing_files
+        and os.getenv("BENCHMARK_AND_PUBLISH", "false") == "true"
+    ):
+        all_errors.extend(
+            f"Missing files for benchmark '{name}': {files}"
+            for name, files in benchmarks_with_missing_files
+        )
+    error_message = "\n" + "\n".join(all_errors) if all_errors else None
     return (
         benchmark_metrics.reset_index().rename(columns={"index": "Benchmark_Name"}),
         error_message,
diff --git a/testing/constraints-3.11.txt b/testing/constraints-3.11.txt
@@ -152,7 +152,7 @@ google-auth==2.38.0
 google-auth-httplib2==0.2.0
 google-auth-oauthlib==1.2.2
 google-cloud-aiplatform==1.106.0
-google-cloud-bigquery==3.35.1
+google-cloud-bigquery==3.36.0
 google-cloud-bigquery-connection==1.18.3
 google-cloud-bigquery-storage==2.32.0
 google-cloud-core==2.4.3
diff --git a/tests/system/small/engines/test_generic_ops.py b/tests/system/small/engines/test_generic_ops.py
@@ -423,3 +423,18 @@ def test_engines_isin_op(scalars_array_value: array_value.ArrayValue, engine):
     )
 
     assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine)
+
+
+@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True)
+def test_engines_isin_op_nested_filter(
+    scalars_array_value: array_value.ArrayValue, engine
+):
+    isin_clause = ops.IsInOp((1, 2, 3)).as_expr(expression.deref("int64_col"))  # int64_col IN (1, 2, 3)
+    filter_clause = ops.invert_op.as_expr(  # overall shape: NOT (bool_col OR NOT isin_clause), i.e. the IsInOp sits under two nested operators
+        ops.or_op.as_expr(
+            expression.deref("bool_col"), ops.invert_op.as_expr(isin_clause)
+        )
+    )
+    arr = scalars_array_value.filter(filter_clause)
+
+    assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine)  # runs the filtered node on `engine` and on REFERENCE_ENGINE; presumably asserts equal results (helper defined elsewhere)
diff --git a/tests/system/small/engines/test_strings.py b/tests/system/small/engines/test_strings.py