refactor: support compiling projection and add_op (#1677)

chelsea-lin · web-flow · commit 8f115e760d65 · 2025-05-07T17:44:06.000-05:00
diff --git a/bigframes/core/compile/sqlglot/compiler.py b/bigframes/core/compile/sqlglot/compiler.py
@@ -163,6 +163,16 @@ def compile_selection(
         )
         return child.select(selected_cols)
 
+    @_compile_node.register
+    def compile_projection(
+        self, node: nodes.ProjectionNode, child: ir.SQLGlotIR
+    ) -> ir.SQLGlotIR:
+        projected_cols: tuple[tuple[str, sge.Expression], ...] = tuple(
+            (id.sql, scalar_compiler.compile_scalar_expression(expr))
+            for expr, id in node.assignments
+        )
+        return child.project(projected_cols)
+
 
 def _replace_unsupported_ops(node: nodes.BigFrameNode):
     node = nodes.bottom_up(node, rewrite.rewrite_slice)
diff --git a/bigframes/core/compile/sqlglot/scalar_compiler.py b/bigframes/core/compile/sqlglot/scalar_compiler.py
@@ -18,6 +18,8 @@
 import sqlglot.expressions as sge
 
 from bigframes.core import expression
+import bigframes.core.compile.sqlglot.sqlglot_ir as ir
+import bigframes.operations as ops
 
 
 @functools.singledispatch
@@ -29,5 +31,47 @@ def compile_scalar_expression(
 
 
 @compile_scalar_expression.register
-def compile_deref_op(expr: expression.DerefOp):
+def compile_deref_expression(expr: expression.DerefOp) -> sge.Expression:
     return sge.ColumnDef(this=sge.to_identifier(expr.id.sql, quoted=True))
+
+
+@compile_scalar_expression.register
+def compile_constant_expression(
+    expr: expression.ScalarConstantExpression,
+) -> sge.Expression:
+    return ir._literal(expr.value, expr.dtype)
+
+
+@compile_scalar_expression.register
+def compile_op_expression(expr: expression.OpExpression):
+    # Recursively compiles the child scalar expressions (via singledispatch).
+    args = tuple(map(compile_scalar_expression, expr.inputs))
+
+    op = expr.op
+    op_name = expr.op.__class__.__name__
+    method_name = f"compile_{op_name.lower()}"
+    method = globals().get(method_name, None)
+    if method is None:
+        raise ValueError(
+            f"Compilation method '{method_name}' not found for operator '{op_name}'."
+        )
+
+    if isinstance(op, ops.UnaryOp):
+        return method(op, args[0])
+    elif isinstance(op, ops.BinaryOp):
+        return method(op, args[0], args[1])
+    elif isinstance(op, ops.TernaryOp):
+        return method(op, args[0], args[1], args[2])
+    elif isinstance(op, ops.NaryOp):
+        return method(op, *args)
+    else:
+        raise TypeError(
+            f"Operator '{op_name}' has an unrecognized arity or type "
+            "and cannot be compiled."
+        )
+
+
+# TODO: add parenthesization for operators
+def compile_addop(op: ops.AddOp, left: sge.Expression, right: sge.Expression):
+    # TODO: support addop for string dtype.
+    return sge.Add(this=left, expression=right)
diff --git a/bigframes/core/compile/sqlglot/sqlglot_ir.py b/bigframes/core/compile/sqlglot/sqlglot_ir.py
@@ -118,6 +118,21 @@ def select(
         new_expr = self._encapsulate_as_cte().select(*cols_expr, append=False)
         return SQLGlotIR(expr=new_expr)
 
+    def project(
+        self,
+        projected_cols: tuple[tuple[str, sge.Expression], ...],
+    ) -> SQLGlotIR:
+        projected_cols_expr = [
+            sge.Alias(
+                this=expr,
+                alias=sge.to_identifier(id, quoted=self.quoted),
+            )
+            for id, expr in projected_cols
+        ]
+        # TODO: some columns are not able to be projected into the same select.
+        select_expr = self.expr.select(*projected_cols_expr, append=True)
+        return SQLGlotIR(expr=select_expr)
+
     def _encapsulate_as_cte(
         self,
     ) -> sge.Select:
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
@@ -609,7 +609,9 @@ def __getitem__(
     def _getitem_label(self, key: blocks.Label):
         col_ids = self._block.cols_matching_label(key)
         if len(col_ids) == 0:
-            raise KeyError(key)
+            raise KeyError(
+                f"{key} not found in DataFrame columns: {self._block.column_labels}"
+            )
         block = self._block.select_columns(col_ids)
         if isinstance(self.columns, pandas.MultiIndex):
             # Multiindex should drop-level if not selecting entire
diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql
@@ -0,0 +1,12 @@
+WITH `bfcte_0` AS (
+  SELECT
+    *,
+    `bfcol_0` AS `bfcol_3`,
+    `bfcol_1` + 1 AS `bfcol_4`
+  FROM UNNEST(ARRAY<STRUCT<`bfcol_0` INT64, `bfcol_1` INT64, `bfcol_2` INT64>>[STRUCT(0, 123456789, 0), STRUCT(1, -987654321, 1), STRUCT(2, 314159, 2), STRUCT(3, CAST(NULL AS INT64), 3), STRUCT(4, -234892, 4), STRUCT(5, 55555, 5), STRUCT(6, 101202303, 6), STRUCT(7, -214748367, 7), STRUCT(8, 2, 8)])
+)
+SELECT
+  `bfcol_3` AS `bfcol_5`,
+  `bfcol_4` AS `bfcol_6`,
+  `bfcol_2` AS `bfcol_7`
+FROM `bfcte_0`
diff --git a/tests/unit/core/compile/sqlglot/test_compile_projection.py b/tests/unit/core/compile/sqlglot/test_compile_projection.py
@@ -0,0 +1,31 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pandas as pd
+import pytest
+
+import bigframes
+import bigframes.pandas as bpd
+
+pytest.importorskip("pytest_snapshot")
+
+
+def test_compile_projection(
+    scalars_types_pandas_df: pd.DataFrame, compiler_session: bigframes.Session, snapshot
+):
+    bf_df = bpd.DataFrame(
+        scalars_types_pandas_df[["int64_col"]], session=compiler_session
+    )
+    bf_df["int64_col"] = bf_df["int64_col"] + 1
+    snapshot.assert_match(bf_df.sql, "out.sql")