Commit 3c314c3

refactor: Unify compile paths with ResultNode (#1636)
1 parent: f68b80c

10 files changed: +224 additions, -116 deletions

bigframes/core/compile/api.py

Lines changed: 9 additions & 5 deletions
@@ -23,7 +23,6 @@
 if TYPE_CHECKING:
     import bigframes.core.nodes
     import bigframes.core.ordering
-    import bigframes.core.schema


 class SQLCompiler:
@@ -35,8 +34,8 @@ def compile(
         limit: Optional[int] = None,
     ) -> str:
         """Compile node into sql where rows are sorted with ORDER BY."""
-        # If we are ordering the query anyways, compiling the slice as a limit is probably a good idea.
-        return compiler.compile_sql(node, ordered=ordered, limit=limit)
+        request = compiler.CompileRequest(node, sort_rows=ordered, peek_count=limit)
+        return compiler.compile_sql(request).sql

     def compile_raw(
         self,
@@ -45,15 +44,20 @@ def compile_raw(
         str, Sequence[bigquery.SchemaField], bigframes.core.ordering.RowOrdering
     ]:
         """Compile node into sql that exposes all columns, including hidden ordering-only columns."""
-        return compiler.compile_raw(node)
+        request = compiler.CompileRequest(
+            node, sort_rows=False, materialize_all_order_keys=True
+        )
+        result = compiler.compile_sql(request)
+        assert result.row_order is not None
+        return result.sql, result.sql_schema, result.row_order


 def test_only_ibis_inferred_schema(node: bigframes.core.nodes.BigFrameNode):
     """Use only for testing paths to ensure ibis inferred schema does not diverge from bigframes inferred schema."""
     import bigframes.core.schema

     node = compiler._replace_unsupported_ops(node)
-    node, _ = rewrite.pull_up_order(node, order_root=False)
+    node = rewrite.bake_order(node)
     ir = compiler.compile_node(node)
     items = tuple(
         bigframes.core.schema.SchemaItem(name, ir.get_column_type(ibis_id))
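A minimal sketch of how a caller exercises the new request/result surface, assuming an already-built plan `node: bigframes.core.nodes.BigFrameNode` (the variable names are illustrative, not from the commit):

import bigframes.core.compile.compiler as compiler

# Ordered compile: sorted rows plus an optional peek limit.
request = compiler.CompileRequest(node, sort_rows=True, peek_count=100)
result = compiler.compile_sql(request)
print(result.sql)         # final SELECT statement
print(result.sql_schema)  # Sequence[bigquery.SchemaField]

# Raw compile: keep hidden ordering keys so row order can be reconstructed later.
raw_request = compiler.CompileRequest(
    node, sort_rows=False, materialize_all_order_keys=True
)
raw_result = compiler.compile_sql(raw_request)
assert raw_result.row_order is not None  # guaranteed when materialize_all_order_keys=True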

bigframes/core/compile/compiled.py

Lines changed: 15 additions & 10 deletions
@@ -69,23 +69,28 @@ def __init__(

     def to_sql(
         self,
-        *,
-        order_by: Sequence[OrderingExpression] = (),
-        limit: Optional[int] = None,
-        selections: Optional[Sequence[str]] = None,
+        order_by: Sequence[OrderingExpression],
+        limit: Optional[int],
+        selections: tuple[tuple[ex.DerefOp, str], ...],
     ) -> str:
         ibis_table = self._to_ibis_expr()
         # This set of output transforms maybe should be its own output node??
-        if (
-            order_by
-            or limit
-            or (selections and (tuple(selections) != tuple(self.column_ids)))
-        ):
+
+        selection_strings = tuple((ref.id.sql, name) for ref, name in selections)
+
+        names_preserved = tuple(name for _, name in selections) == tuple(
+            self.column_ids
+        )
+        is_noop_selection = (
+            all((i[0] == i[1] for i in selection_strings)) and names_preserved
+        )
+
+        if order_by or limit or not is_noop_selection:
             sql = ibis_bigquery.Backend().compile(ibis_table)
             sql = (
                 bigframes.core.compile.googlesql.Select()
                 .from_(sql)
-                .select(selections or self.column_ids)
+                .select(selection_strings)
                 .sql()
             )
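The no-op detection above boils down to a tuple comparison. A rough standalone illustration of that check, with plain (source name, output name) pairs standing in for the real `DerefOp` selections (an assumption about shapes, not the actual types):

def is_noop_selection(selections, column_ids):
    # selections: ((source_sql_id, output_name), ...); column_ids: current output names.
    names_preserved = tuple(name for _, name in selections) == tuple(column_ids)
    return all(src == name for src, name in selections) and names_preserved

# Re-selecting the same columns under the same names: no wrapper SELECT needed.
assert is_noop_selection((("col_0", "col_0"), ("col_1", "col_1")), ("col_0", "col_1"))
# Renaming any column forces the extra SELECT layer.
assert not is_noop_selection((("col_0", "a"), ("col_1", "col_1")), ("col_0", "col_1"))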

bigframes/core/compile/compiler.py

Lines changed: 57 additions & 36 deletions
@@ -13,8 +13,10 @@
 # limitations under the License.
 from __future__ import annotations

+import dataclasses
 import functools
 import typing
+from typing import cast, Optional

 import bigframes_vendored.ibis.backends.bigquery as ibis_bigquery
 import bigframes_vendored.ibis.expr.api as ibis_api
@@ -24,6 +26,7 @@
 import pyarrow as pa

 from bigframes import dtypes, operations
+from bigframes.core import expression
 import bigframes.core.compile.compiled as compiled
 import bigframes.core.compile.concat as concat_impl
 import bigframes.core.compile.explode
@@ -34,48 +37,58 @@
 if typing.TYPE_CHECKING:
     import bigframes.core
-    import bigframes.session


-def compile_sql(
-    node: nodes.BigFrameNode,
-    ordered: bool,
-    limit: typing.Optional[int] = None,
-) -> str:
-    # later steps might add ids, so snapshot before those steps.
-    output_ids = node.schema.names
-    if ordered:
-        # Need to do this before replacing unsupported ops, as that will rewrite slice ops
-        node, pulled_up_limit = rewrites.pullup_limit_from_slice(node)
-        if (pulled_up_limit is not None) and (
-            (limit is None) or limit > pulled_up_limit
-        ):
-            limit = pulled_up_limit
+@dataclasses.dataclass(frozen=True)
+class CompileRequest:
+    node: nodes.BigFrameNode
+    sort_rows: bool
+    materialize_all_order_keys: bool = False
+    peek_count: typing.Optional[int] = None
+
+
+@dataclasses.dataclass(frozen=True)
+class CompileResult:
+    sql: str
+    sql_schema: typing.Sequence[google.cloud.bigquery.SchemaField]
+    row_order: Optional[bf_ordering.RowOrdering]

-    node = _replace_unsupported_ops(node)
+
+def compile_sql(request: CompileRequest) -> CompileResult:
+    output_names = tuple((expression.DerefOp(id), id.sql) for id in request.node.ids)
+    result_node = nodes.ResultNode(
+        request.node,
+        output_cols=output_names,
+        limit=request.peek_count,
+    )
+    if request.sort_rows:
+        # Can only pullup slice if we are doing ORDER BY in outermost SELECT
+        # Need to do this before replacing unsupported ops, as that will rewrite slice ops
+        result_node = rewrites.pull_up_limits(result_node)
+    result_node = _replace_unsupported_ops(result_node)
     # prune before pulling up order to avoid unnnecessary row_number() ops
-    node = rewrites.column_pruning(node)
-    node, ordering = rewrites.pull_up_order(node, order_root=ordered)
-    # final pruning to cleanup up any leftovers unused values
-    node = rewrites.column_pruning(node)
-    return compile_node(node).to_sql(
-        order_by=ordering.all_ordering_columns if ordered else (),
-        limit=limit,
-        selections=output_ids,
+    result_node = cast(nodes.ResultNode, rewrites.column_pruning(result_node))
+    result_node = rewrites.defer_order(
+        result_node, output_hidden_row_keys=request.materialize_all_order_keys
     )
+    if request.sort_rows:
+        result_node = cast(nodes.ResultNode, rewrites.column_pruning(result_node))
+        sql = compile_result_node(result_node)
+        return CompileResult(
+            sql, result_node.schema.to_bigquery(), result_node.order_by
        )

-
-def compile_raw(
-    node: nodes.BigFrameNode,
-) -> typing.Tuple[
-    str, typing.Sequence[google.cloud.bigquery.SchemaField], bf_ordering.RowOrdering
-]:
-    node = _replace_unsupported_ops(node)
-    node = rewrites.column_pruning(node)
-    node, ordering = rewrites.pull_up_order(node, order_root=True)
-    node = rewrites.column_pruning(node)
-    sql = compile_node(node).to_sql()
-    return sql, node.schema.to_bigquery(), ordering
+    ordering: Optional[bf_ordering.RowOrdering] = result_node.order_by
+    result_node = dataclasses.replace(result_node, order_by=None)
+    result_node = cast(nodes.ResultNode, rewrites.column_pruning(result_node))
+    sql = compile_result_node(result_node)
+    # Return the ordering iff no extra columns are needed to define the row order
+    if ordering is not None:
+        output_order = (
+            ordering if ordering.referenced_columns.issubset(result_node.ids) else None
+        )
+    assert (not request.materialize_all_order_keys) or (output_order is not None)
+    return CompileResult(sql, result_node.schema.to_bigquery(), output_order)


 def _replace_unsupported_ops(node: nodes.BigFrameNode):
@@ -86,6 +99,14 @@ def _replace_unsupported_ops(node: nodes.BigFrameNode):
     return node


+def compile_result_node(root: nodes.ResultNode) -> str:
+    return compile_node(root.child).to_sql(
+        order_by=root.order_by.all_ordering_columns if root.order_by else (),
+        limit=root.limit,
+        selections=root.output_cols,
+    )
+
+
 # TODO: Remove cache when schema no longer requires compilation to derive schema (and therefor only compiles for execution)
 @functools.lru_cache(maxsize=5000)
 def compile_node(node: nodes.BigFrameNode) -> compiled.UnorderedIR:
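For reference, the old entry points map onto the new request type roughly as follows. This is a simplified, self-contained stand-in (field names match the diff; the real field types come from BigFrames node, BigQuery schema, and ordering modules):

import dataclasses
from typing import Optional, Sequence

@dataclasses.dataclass(frozen=True)
class CompileRequest:
    node: object                               # nodes.BigFrameNode in the real class
    sort_rows: bool
    materialize_all_order_keys: bool = False
    peek_count: Optional[int] = None

@dataclasses.dataclass(frozen=True)
class CompileResult:
    sql: str
    sql_schema: Sequence[object]               # Sequence[bigquery.SchemaField]
    row_order: Optional[object]                # Optional[bf_ordering.RowOrdering]

# old compile_sql(node, ordered=True, limit=10):
ordered = CompileRequest(node=None, sort_rows=True, peek_count=10)
# old compile_raw(node), which needs hidden ordering keys in the output:
raw = CompileRequest(node=None, sort_rows=False, materialize_all_order_keys=True)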

bigframes/core/compile/googlesql/expression.py

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@
 * `expression`: Models basic SQL expressions.

 Extended classes (not part of standard GoogleSQL syntax, but added for convenience):
-
+i
 * `ColumnExpression`: Represents column references.
 * `TableExpression`: Represents table references.
 * `AliasExpression`: Represents aliased expressions.

bigframes/core/compile/googlesql/query.py

Lines changed: 14 additions & 5 deletions
@@ -63,22 +63,31 @@ class Select(abc.SQLSyntax):

     def select(
         self,
-        columns: typing.Union[typing.Iterable[str], str, None] = None,
+        columns: typing.Union[
+            typing.Iterable[str], typing.Iterable[tuple[str, str]], str, None
+        ] = None,
         distinct: bool = False,
     ) -> Select:
         if isinstance(columns, str):
             columns = [columns]
         self.select_list: typing.List[typing.Union[SelectExpression, SelectAll]] = (
-            [
-                SelectExpression(expression=expr.ColumnExpression(name=column))
-                for column in columns
-            ]
+            [self._select_field(column) for column in columns]
             if columns
             else [SelectAll(expression=expr.StarExpression())]
         )
         self.distinct = distinct
         return self

+    def _select_field(self, field) -> SelectExpression:
+        if isinstance(field, str):
+            return SelectExpression(expression=expr.ColumnExpression(name=field))
+
+        else:
+            alias = field[1] if (field[0] != field[1]) else None
+            return SelectExpression(
+                expression=expr.ColumnExpression(name=field[0]), alias=alias
+            )
+
     def from_(
         self,
         sources: typing.Union[TABLE_SOURCE_TYPE, typing.Iterable[TABLE_SOURCE_TYPE]],
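A small sketch of the new (source name, output name) selection form. The builder calls mirror how compiled.py uses this class above; the rendered SQL shown in the comment is approximate, not an exact output of the library:

from bigframes.core.compile import googlesql

inner = "SELECT * FROM my_table"  # any SQL string used as the FROM source
sql = (
    googlesql.Select()
    .from_(inner)
    .select([("col_0", "a"), ("col_1", "col_1")])  # (source name, output name) pairs
    .sql()
)
# Roughly: SELECT `col_0` AS `a`, `col_1` FROM (SELECT * FROM my_table)
# `col_1` gets no alias because source and output names match (see _select_field).
print(sql)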

bigframes/core/nodes.py

Lines changed: 42 additions & 3 deletions
@@ -36,7 +36,7 @@
 from bigframes.core import identifiers, local_data
 from bigframes.core.bigframe_node import BigFrameNode, COLUMN_SET, Field
 import bigframes.core.expression as ex
-from bigframes.core.ordering import OrderingExpression
+from bigframes.core.ordering import OrderingExpression, RowOrdering
 import bigframes.core.slices as slices
 import bigframes.core.window_spec as window
 import bigframes.dtypes
@@ -1602,11 +1602,50 @@ def remap_refs(


 # Introduced during planing/compilation
+# TODO: Enforce more strictly that this should never be a child node
 @dataclasses.dataclass(frozen=True, eq=False)
 class ResultNode(UnaryNode):
-    output_names: tuple[str, ...]
-    order_by: Tuple[OrderingExpression, ...] = ()
+    output_cols: tuple[tuple[ex.DerefOp, str], ...]
+    order_by: Optional[RowOrdering] = None
     limit: Optional[int] = None
+    # TODO: CTE definitions
+
+    @property
+    def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]:
+        return ()
+
+    def remap_vars(
+        self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId]
+    ) -> ResultNode:
+        return self
+
+    def remap_refs(
+        self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId]
+    ) -> ResultNode:
+        output_names = tuple(
+            (ref.remap_column_refs(mappings), name) for ref, name in self.output_cols
+        )
+        order_by = self.order_by.remap_column_refs(mappings) if self.order_by else None
+        return dataclasses.replace(self, output_names=output_names, order_by=order_by)  # type: ignore
+
+    @property
+    def consumed_ids(self) -> COLUMN_SET:
+        out_refs = frozenset(ref.id for ref, _ in self.output_cols)
+        order_refs = self.order_by.referenced_columns if self.order_by else frozenset()
+        return out_refs | order_refs
+
+    @property
+    def row_count(self) -> Optional[int]:
+        child_count = self.child.row_count
+        if child_count is None:
+            return None
+        if self.limit is None:
+            return child_count
+        return min(self.limit, child_count)
+
+    @property
+    def variables_introduced(self) -> int:
+        return 0


 # Tree operators
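The `row_count` property above is just a limit cap over the child's (possibly unknown) count. A self-contained illustration of that logic with plain values:

def result_row_count(child_count, limit):
    # Mirrors ResultNode.row_count: unknown child counts stay unknown,
    # otherwise the limit caps the child's row count.
    if child_count is None:
        return None
    if limit is None:
        return child_count
    return min(limit, child_count)

assert result_row_count(1000, 10) == 10
assert result_row_count(5, 10) == 5
assert result_row_count(None, 10) is None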

bigframes/core/rewrite/__init__.py

Lines changed: 5 additions & 4 deletions
@@ -15,10 +15,10 @@
 from bigframes.core.rewrite.identifiers import remap_variables
 from bigframes.core.rewrite.implicit_align import try_row_join
 from bigframes.core.rewrite.legacy_align import legacy_join_as_projection
-from bigframes.core.rewrite.order import pull_up_order
+from bigframes.core.rewrite.order import bake_order, defer_order
 from bigframes.core.rewrite.pruning import column_pruning
 from bigframes.core.rewrite.scan_reduction import try_reduce_to_table_scan
-from bigframes.core.rewrite.slices import pullup_limit_from_slice, rewrite_slice
+from bigframes.core.rewrite.slices import pull_up_limits, rewrite_slice
 from bigframes.core.rewrite.timedeltas import rewrite_timedelta_expressions
 from bigframes.core.rewrite.windows import rewrite_range_rolling

@@ -27,10 +27,11 @@
     "try_row_join",
     "rewrite_slice",
     "rewrite_timedelta_expressions",
-    "pullup_limit_from_slice",
+    "pull_up_limits",
     "remap_variables",
-    "pull_up_order",
+    "defer_order",
     "column_pruning",
     "rewrite_range_rolling",
     "try_reduce_to_table_scan",
+    "bake_order",
 ]

bigframes/core/rewrite/order.py

Lines changed: 27 additions & 4 deletions
@@ -15,17 +15,40 @@
 import functools
 from typing import Mapping, Tuple

-from bigframes.core import identifiers
-import bigframes.core.expression
+from bigframes.core import expression, identifiers
 import bigframes.core.nodes
 import bigframes.core.ordering
 import bigframes.core.window_spec
-import bigframes.operations
 from bigframes.operations import aggregations as agg_ops


+def defer_order(
+    root: bigframes.core.nodes.ResultNode, output_hidden_row_keys: bool
+) -> bigframes.core.nodes.ResultNode:
+    new_child, order = _pull_up_order(root.child, order_root=True)
+    order_by = (
+        order.with_ordering_columns(root.order_by.all_ordering_columns)
+        if root.order_by
+        else order
+    )
+    if output_hidden_row_keys:
+        output_names = tuple((expression.DerefOp(id), id.sql) for id in new_child.ids)
+    else:
+        output_names = root.output_cols
+    return dataclasses.replace(
+        root, output_cols=output_names, child=new_child, order_by=order_by
+    )
+
+
+def bake_order(
+    node: bigframes.core.nodes.BigFrameNode,
+) -> bigframes.core.nodes.BigFrameNode:
+    node, _ = _pull_up_order(node, order_root=False)
+    return node
+
+
 # Makes ordering explicit in window definitions
-def pull_up_order(
+def _pull_up_order(
     root: bigframes.core.nodes.BigFrameNode,
     *,
     order_root: bool = True,