Skip to content

Commit 72076c7

Browse files
authored
chore: compile concat nodes by sqlglot (#1824)
* chore: compile concat node * chore: compile concat nodes by sqlglot
1 parent aa32369 commit 72076c7

File tree

5 files changed

+203
-1
lines changed

5 files changed

+203
-1
lines changed

bigframes/core/compile/sqlglot/compiler.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,17 @@ def compile_projection(
190190
)
191191
return child.project(projected_cols)
192192

193+
@_compile_node.register
def compile_concat(
    self, node: nodes.ConcatNode, *children: ir.SQLGlotIR
) -> ir.SQLGlotIR:
    """Compiles a ConcatNode by unioning the already-compiled child IRs."""
    # Child expressions are unioned positionally; the node supplies the
    # output column identifiers for the combined result.
    child_exprs = [compiled.expr for compiled in children]
    return ir.SQLGlotIR.from_union(
        child_exprs,
        output_ids=[out_id.sql for out_id in node.output_ids],
        uid_gen=self.uid_gen,
    )
203+
193204

194205
def _replace_unsupported_ops(node: nodes.BigFrameNode):
195206
node = nodes.bottom_up(node, rewrite.rewrite_slice)

bigframes/core/compile/sqlglot/sqlglot_ir.py

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,57 @@ def from_query_string(
149149
select_expr.set("with", sge.With(expressions=[cte]))
150150
return cls(expr=select_expr, uid_gen=uid_gen)
151151

152+
@classmethod
def from_union(
    cls,
    selects: typing.Sequence[sge.Select],
    output_ids: typing.Sequence[str],
    uid_gen: guid.SequentialUIDGenerator,
) -> SQLGlotIR:
    """Builds a SQLGlot expression representing the UNION ALL of multiple
    select expressions.

    Args:
        selects: Select expressions to union; at least two are required.
        output_ids: Output column names, applied positionally to each
            select's projection list so all union branches align.
        uid_gen: Generator used to mint unique CTE names.

    Returns:
        A new SQLGlotIR selecting ``*`` from the union of the inputs, with
        all input CTEs hoisted onto the final statement.
    """
    # `selects` is a Sequence, so len() applies directly — no copy needed.
    assert (
        len(selects) >= 2
    ), f"At least two select expressions must be provided, but got {selects}."

    existing_ctes: list[sge.CTE] = []
    union_selects: list[sge.Select] = []
    for select in selects:
        assert isinstance(
            select, sge.Select
        ), f"All provided expressions must be of type sge.Select, but got {type(select)}"

        # Hoist each input's WITH clause so every CTE ends up attached to
        # the final statement, then wrap the remaining select body in a
        # fresh, uniquely-named CTE of its own.
        select_expr = select.copy()
        existing_ctes.extend(select_expr.args.pop("with", []))

        new_cte_name = sge.to_identifier(
            next(uid_gen.get_uid_stream("bfcte_")), quoted=cls.quoted
        )
        existing_ctes.append(sge.CTE(this=select_expr, alias=new_cte_name))

        # Re-alias each branch's columns positionally to the shared output
        # ids so the union branches are column-compatible.
        selections = [
            sge.Alias(
                this=expr.alias_or_name,
                alias=sge.to_identifier(output_id, quoted=cls.quoted),
            )
            for expr, output_id in zip(select_expr.expressions, output_ids)
        ]
        union_selects.append(
            sge.Select().select(*selections).from_(sge.Table(this=new_cte_name))
        )

    # UNION ALL (distinct=False) preserves duplicate rows, matching concat
    # semantics; copy=False avoids re-copying the freshly built selects.
    union_expr = sg.union(
        *union_selects,
        distinct=False,
        copy=False,
    )
    final_select_expr = sge.Select().select(sge.Star()).from_(union_expr.subquery())
    final_select_expr.set("with", sge.With(expressions=existing_ctes))
    return cls(expr=final_select_expr, uid_gen=uid_gen)
202+
152203
def select(
153204
self,
154205
selected_cols: tuple[tuple[str, sge.Expression], ...],
@@ -181,7 +232,7 @@ def project(
181232
)
182233
for id, expr in projected_cols
183234
]
184-
new_expr = self._encapsulate_as_cte().select(*projected_cols_expr, append=False)
235+
new_expr = self._encapsulate_as_cte().select(*projected_cols_expr, append=True)
185236
return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen)
186237

187238
def insert(
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
WITH `bfcte_1` AS (
2+
SELECT
3+
*
4+
FROM UNNEST(ARRAY<STRUCT<`bfcol_0` INT64, `bfcol_1` INT64, `bfcol_2` INT64, `bfcol_3` STRING, `bfcol_4` INT64>>[STRUCT(0, 123456789, 0, 'Hello, World!', 0), STRUCT(1, -987654321, 1, 'こんにちは', 1), STRUCT(2, 314159, 2, ' ¡Hola Mundo! ', 2), STRUCT(3, CAST(NULL AS INT64), 3, CAST(NULL AS STRING), 3), STRUCT(4, -234892, 4, 'Hello, World!', 4), STRUCT(5, 55555, 5, 'Güten Tag!', 5), STRUCT(6, 101202303, 6, 'capitalize, This ', 6), STRUCT(7, -214748367, 7, ' سلام', 7), STRUCT(8, 2, 8, 'T', 8)])
5+
), `bfcte_3` AS (
6+
SELECT
7+
`bfcol_0` AS `bfcol_5`,
8+
`bfcol_2` AS `bfcol_6`,
9+
`bfcol_1` AS `bfcol_7`,
10+
`bfcol_3` AS `bfcol_8`,
11+
`bfcol_4` AS `bfcol_9`
12+
FROM `bfcte_1`
13+
), `bfcte_5` AS (
14+
SELECT
15+
*,
16+
`bfcol_9` AS `bfcol_10`
17+
FROM `bfcte_3`
18+
), `bfcte_7` AS (
19+
SELECT
20+
`bfcol_5` AS `bfcol_11`,
21+
`bfcol_6` AS `bfcol_12`,
22+
`bfcol_7` AS `bfcol_13`,
23+
`bfcol_8` AS `bfcol_14`,
24+
`bfcol_10` AS `bfcol_15`
25+
FROM `bfcte_5`
26+
), `bfcte_9` AS (
27+
SELECT
28+
*,
29+
0 AS `bfcol_16`
30+
FROM `bfcte_7`
31+
), `bfcte_10` AS (
32+
SELECT
33+
`bfcol_11` AS `bfcol_17`,
34+
`bfcol_12` AS `bfcol_18`,
35+
`bfcol_13` AS `bfcol_19`,
36+
`bfcol_14` AS `bfcol_20`,
37+
`bfcol_16` AS `bfcol_21`,
38+
`bfcol_15` AS `bfcol_22`
39+
FROM `bfcte_9`
40+
), `bfcte_0` AS (
41+
SELECT
42+
*
43+
FROM UNNEST(ARRAY<STRUCT<`bfcol_23` INT64, `bfcol_24` INT64, `bfcol_25` INT64, `bfcol_26` STRING, `bfcol_27` INT64>>[STRUCT(0, 123456789, 0, 'Hello, World!', 0), STRUCT(1, -987654321, 1, 'こんにちは', 1), STRUCT(2, 314159, 2, ' ¡Hola Mundo! ', 2), STRUCT(3, CAST(NULL AS INT64), 3, CAST(NULL AS STRING), 3), STRUCT(4, -234892, 4, 'Hello, World!', 4), STRUCT(5, 55555, 5, 'Güten Tag!', 5), STRUCT(6, 101202303, 6, 'capitalize, This ', 6), STRUCT(7, -214748367, 7, ' سلام', 7), STRUCT(8, 2, 8, 'T', 8)])
44+
), `bfcte_2` AS (
45+
SELECT
46+
`bfcol_23` AS `bfcol_28`,
47+
`bfcol_25` AS `bfcol_29`,
48+
`bfcol_24` AS `bfcol_30`,
49+
`bfcol_26` AS `bfcol_31`,
50+
`bfcol_27` AS `bfcol_32`
51+
FROM `bfcte_0`
52+
), `bfcte_4` AS (
53+
SELECT
54+
*,
55+
`bfcol_32` AS `bfcol_33`
56+
FROM `bfcte_2`
57+
), `bfcte_6` AS (
58+
SELECT
59+
`bfcol_28` AS `bfcol_34`,
60+
`bfcol_29` AS `bfcol_35`,
61+
`bfcol_30` AS `bfcol_36`,
62+
`bfcol_31` AS `bfcol_37`,
63+
`bfcol_33` AS `bfcol_38`
64+
FROM `bfcte_4`
65+
), `bfcte_8` AS (
66+
SELECT
67+
*,
68+
1 AS `bfcol_39`
69+
FROM `bfcte_6`
70+
), `bfcte_11` AS (
71+
SELECT
72+
`bfcol_34` AS `bfcol_40`,
73+
`bfcol_35` AS `bfcol_41`,
74+
`bfcol_36` AS `bfcol_42`,
75+
`bfcol_37` AS `bfcol_43`,
76+
`bfcol_39` AS `bfcol_44`,
77+
`bfcol_38` AS `bfcol_45`
78+
FROM `bfcte_8`
79+
), `bfcte_12` AS (
80+
SELECT
81+
*
82+
FROM (
83+
SELECT
84+
bfcol_17 AS `bfcol_46`,
85+
bfcol_18 AS `bfcol_47`,
86+
bfcol_19 AS `bfcol_48`,
87+
bfcol_20 AS `bfcol_49`,
88+
bfcol_21 AS `bfcol_50`,
89+
bfcol_22 AS `bfcol_51`
90+
FROM `bfcte_10`
91+
UNION ALL
92+
SELECT
93+
bfcol_40 AS `bfcol_46`,
94+
bfcol_41 AS `bfcol_47`,
95+
bfcol_42 AS `bfcol_48`,
96+
bfcol_43 AS `bfcol_49`,
97+
bfcol_44 AS `bfcol_50`,
98+
bfcol_45 AS `bfcol_51`
99+
FROM `bfcte_11`
100+
)
101+
)
102+
SELECT
103+
`bfcol_46` AS `rowindex`,
104+
`bfcol_47` AS `rowindex_1`,
105+
`bfcol_48` AS `int64_col`,
106+
`bfcol_49` AS `string_col`
107+
FROM `bfcte_12`

tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ WITH `bfcte_0` AS (
88
FROM `test-project`.`test_dataset`.`test_table`
99
), `bfcte_1` AS (
1010
SELECT
11+
*,
1112
`bfcol_0` AS `bfcol_5`,
1213
`bfcol_2` AS `bfcol_6`,
1314
`bfcol_3` AS `bfcol_7`,
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pandas as pd
16+
import pytest
17+
18+
import bigframes
19+
import bigframes.pandas as bpd
20+
21+
pytest.importorskip("pytest_snapshot")
22+
23+
24+
def test_compile_concat(
    scalars_types_pandas_df: pd.DataFrame, compiler_session: bigframes.Session, snapshot
):
    """Snapshot-tests that concatenating a DataFrame with itself compiles to SQL."""
    # TODO: concat two copies of the same dataframe, whose SQL does not get reused.
    # TODO: concat dataframes from a gbq table to trigger the window compiler.
    df1 = bpd.DataFrame(scalars_types_pandas_df, session=compiler_session)
    df1 = df1[["rowindex", "int64_col", "string_col"]]
    concat_df = bpd.concat([df1, df1])
    # Compare the compiled SQL against the stored golden file (out.sql).
    snapshot.assert_match(concat_df.sql, "out.sql")

0 commit comments

Comments
 (0)