Skip to content

Commit bc885bd

Browse files
authored
chore: add compile_explode (#1848)
Fixes internal issue 427306238
1 parent 0709f17 commit bc885bd

File tree

6 files changed

+169
-0
lines changed

6 files changed

+169
-0
lines changed

bigframes/core/compile/sqlglot/compiler.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,14 @@ def compile_concat(
229229
uid_gen=self.uid_gen,
230230
)
231231

232+
@_compile_node.register
def compile_explode(
    self, node: nodes.ExplodeNode, child: ir.SQLGlotIR
) -> ir.SQLGlotIR:
    """Compiles an ExplodeNode by delegating to the child IR's explode().

    Extracts the SQL names of the columns to unnest (and the optional
    offsets column) from the node before handing off.
    """
    # The offsets column is optional; forward its SQL name only when set.
    offsets_col = None if node.offsets_col is None else node.offsets_col.sql
    columns = tuple(ref.id.sql for ref in node.column_ids)
    return child.explode(columns, offsets_col)
239+
232240

233241
def _replace_unsupported_ops(node: nodes.BigFrameNode):
234242
node = nodes.bottom_up(node, rewrite.rewrite_slice)

bigframes/core/compile/sqlglot/sqlglot_ir.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,96 @@ def replace(
290290
).sql(dialect=self.dialect, pretty=self.pretty)
291291
return f"{merge_str}\n{whens_str}"
292292

293+
def explode(
    self,
    column_names: tuple[str, ...],
    offsets_col: typing.Optional[str],
) -> SQLGlotIR:
    """Unnests (explodes) one or more array columns into rows.

    Args:
        column_names: Names of the array-typed columns to explode. When
            several are given they are exploded in lockstep (zipped).
        offsets_col: Optional output column name that receives each
            element's offset within its array.

    Returns:
        A new SQLGlotIR with one row per array element.

    Raises:
        ValueError: If ``column_names`` is empty.
    """
    # Raise rather than assert: asserts are stripped under `python -O`.
    if not column_names:
        raise ValueError("At least one column must be provided for explode.")
    # No need to copy the tuple into a list just to count it.
    if len(column_names) == 1:
        return self._explode_single_column(column_names[0], offsets_col)
    return self._explode_multiple_columns(column_names, offsets_col)
304+
305+
def _explode_single_column(
    self, column_name: str, offsets_col: typing.Optional[str]
) -> SQLGlotIR:
    """Explodes exactly one array column via CROSS JOIN UNNEST.

    The unnested values get a fresh alias, then are swapped back in for
    the original column through a SELECT * REPLACE(...).
    """
    target = sge.to_identifier(column_name, quoted=self.quoted)
    # Fresh, collision-free alias for the unnested scalar values.
    value_alias = sge.to_identifier(
        next(self.uid_gen.get_uid_stream("bfcol_")), quoted=self.quoted
    )
    offset_id = None
    if offsets_col:
        offset_id = sge.to_identifier(offsets_col, quoted=self.quoted)

    unnest = sge.Unnest(
        expressions=[target],
        alias=sge.TableAlias(columns=[value_alias]),
        offset=offset_id,
    )
    replaced_star = sge.Star(replace=[value_alias.as_(target)])
    # TODO: "CROSS" if not keep_empty else "LEFT"
    # TODO: overlaps_with_parent to replace existing column.
    cte = self._encapsulate_as_cte()
    new_expr = cte.select(replaced_star, append=False).join(
        unnest, join_type="CROSS"
    )
    return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen)
331+
332+
def _explode_multiple_columns(
    self,
    column_names: tuple[str, ...],
    offsets_col: typing.Optional[str],
) -> SQLGlotIR:
    """Explodes several array columns in lockstep ("zipping" them).

    BigQuery cannot positionally UNNEST multiple arrays at once, so we
    UNNEST a generated index array sized to the shortest input and index
    each column with that offset:
    https://cloud.google.com/bigquery/docs/arrays#zipping_arrays

    Args:
        column_names: Names of the array-typed columns to explode together.
        offsets_col: Optional output column name for the element offsets.

    Returns:
        A new SQLGlotIR with one row per shared array index.
    """
    offset = (
        sge.to_identifier(offsets_col, quoted=self.quoted) if offsets_col else None
    )
    columns = [
        sge.to_identifier(column_name, quoted=self.quoted)
        for column_name in column_names
    ]

    # GENERATE_ARRAY(0, LEAST(ARRAY_LENGTH(col_i) - 1, ...)) yields the
    # element indices shared by every column (truncated to the shortest).
    # `columns` already holds Identifiers, so use them directly instead of
    # wrapping them in a redundant second to_identifier() call.
    column_lengths = [
        sge.func("ARRAY_LENGTH", column) - 1 for column in columns
    ]
    generate_array = sge.func(
        "GENERATE_ARRAY",
        sge.convert(0),
        sge.func("LEAST", *column_lengths),
    )
    unnested_offset_alias = sge.to_identifier(
        next(self.uid_gen.get_uid_stream("bfcol_")), quoted=self.quoted
    )
    unnest_expr = sge.Unnest(
        expressions=[generate_array],
        alias=sge.TableAlias(columns=[unnested_offset_alias]),
        offset=offset,
    )
    # Replace each original array column with its element at the unnested
    # offset; safe=True emits SAFE_OFFSET, yielding NULL past the end of a
    # shorter array rather than erroring.
    selection = sge.Star(
        replace=[
            sge.Bracket(
                this=column,
                expressions=[unnested_offset_alias],
                safe=True,
                offset=False,
            ).as_(column)
            for column in columns
        ]
    )
    new_expr = (
        self._encapsulate_as_cte()
        .select(selection, append=False)
        .join(unnest_expr, join_type="CROSS")
    )
    return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen)
382+
293383
def _encapsulate_as_cte(
294384
self,
295385
) -> sge.Select:
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
WITH `bfcte_0` AS (
2+
SELECT
3+
`rowindex` AS `bfcol_0`,
4+
`int_list_col` AS `bfcol_1`,
5+
`string_list_col` AS `bfcol_2`
6+
FROM `bigframes-dev`.`sqlglot_test`.`repeated_types`
7+
), `bfcte_1` AS (
8+
SELECT
9+
*
10+
REPLACE (`bfcol_1`[SAFE_OFFSET(`bfcol_13`)] AS `bfcol_1`, `bfcol_2`[SAFE_OFFSET(`bfcol_13`)] AS `bfcol_2`)
11+
FROM `bfcte_0`
12+
CROSS JOIN UNNEST(GENERATE_ARRAY(0, LEAST(ARRAY_LENGTH(`bfcol_1`) - 1, ARRAY_LENGTH(`bfcol_2`) - 1))) AS `bfcol_13` WITH OFFSET AS `bfcol_7`
13+
)
14+
SELECT
15+
`bfcol_0` AS `rowindex`,
16+
`bfcol_0` AS `rowindex_1`,
17+
`bfcol_1` AS `int_list_col`,
18+
`bfcol_2` AS `string_list_col`
19+
FROM `bfcte_1`
20+
ORDER BY
21+
`bfcol_7` ASC NULLS LAST
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
WITH `bfcte_0` AS (
2+
SELECT
3+
`rowindex` AS `bfcol_0`,
4+
`int_list_col` AS `bfcol_1`
5+
FROM `bigframes-dev`.`sqlglot_test`.`repeated_types`
6+
), `bfcte_1` AS (
7+
SELECT
8+
*
9+
REPLACE (`bfcol_8` AS `bfcol_1`)
10+
FROM `bfcte_0`
11+
CROSS JOIN UNNEST(`bfcol_1`) AS `bfcol_8` WITH OFFSET AS `bfcol_4`
12+
)
13+
SELECT
14+
`bfcol_0` AS `rowindex`,
15+
`bfcol_1` AS `int_list_col`
16+
FROM `bfcte_1`
17+
ORDER BY
18+
`bfcol_4` ASC NULLS LAST
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pytest
16+
17+
import bigframes.pandas as bpd
18+
19+
# Skip this whole test module when the snapshot plugin is not installed.
pytest.importorskip("pytest_snapshot")
20+
21+
22+
# TODO: check order by with offset
23+
def test_compile_explode_series(repeated_types_df: bpd.DataFrame, snapshot):
24+
s = repeated_types_df["int_list_col"].explode()
25+
snapshot.assert_match(s.to_frame().sql, "out.sql")
26+
27+
28+
def test_compile_explode_dataframe(repeated_types_df: bpd.DataFrame, snapshot):
    """Multi-column (zipped) explode on a DataFrame should match the snapshot."""
    cols = ["int_list_col", "string_list_col"]
    frame = repeated_types_df[["rowindex", *cols]].explode(cols)
    snapshot.assert_match(frame.sql, "out.sql")

third_party/bigframes_vendored/pandas/core/frame.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4124,6 +4124,7 @@ def explode(
41244124
**Examples:**
41254125
41264126
>>> import bigframes.pandas as bpd
4127+
>>> import numpy as np
41274128
>>> bpd.options.display.progress_bar = None
41284129
41294130
>>> df = bpd.DataFrame({'A': [[0, 1, 2], [], [], [3, 4]],

0 commit comments

Comments
 (0)