refactor: Switch explode node to use column offsets (#978)

TrevorBergeron · web-flow · commit 40113d807944 · 2024-09-11T14:54:44.000-07:00
diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py
@@ -382,9 +382,8 @@ def explode(self, column_ids: typing.Sequence[str]) -> ArrayValue:
         for column_id in column_ids:
             assert bigframes.dtypes.is_array_like(self.get_column_type(column_id))
 
-        return ArrayValue(
-            nodes.ExplodeNode(child=self.node, column_ids=tuple(column_ids))
-        )
+        offsets = tuple(self.get_offset_for_name(id) for id in column_ids)
+        return ArrayValue(nodes.ExplodeNode(child=self.node, column_ids=offsets))
 
     def _uniform_sampling(self, fraction: float) -> ArrayValue:
         """Sampling the table on given fraction.
@@ -393,3 +392,6 @@ def _uniform_sampling(self, fraction: float) -> ArrayValue:
             The row numbers of result is non-deterministic, avoid to use.
         """
         return ArrayValue(nodes.RandomSampleNode(self.node, fraction))
+
+    def get_offset_for_name(self, name: str):
+        return self.schema.names.index(name)
diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py
@@ -401,8 +401,9 @@ def _uniform_sampling(self, fraction: float) -> UnorderedIR:
             columns=columns,
         )
 
-    def explode(self, column_ids: typing.Sequence[str]) -> UnorderedIR:
+    def explode(self, offsets: typing.Sequence[int]) -> UnorderedIR:
         table = self._to_ibis_expr()
+        column_ids = tuple(table.columns[offset] for offset in offsets)
 
         # The offset array ensures null represents empty arrays after unnesting.
         offset_array_id = bigframes.core.guid.generate_guid("offset_array_")
@@ -712,16 +713,20 @@ def _uniform_sampling(self, fraction: float) -> OrderedIR:
             ordering=self._ordering,
         )
 
-    def explode(self, column_ids: typing.Sequence[str]) -> OrderedIR:
+    def explode(self, offsets: typing.Sequence[int]) -> OrderedIR:
         table = self._to_ibis_expr(ordering_mode="unordered", expose_hidden_cols=True)
+        column_ids = tuple(table.columns[offset] for offset in offsets)
 
         offset_array_id = bigframes.core.guid.generate_guid("offset_array_")
         offset_array = (
             vendored_ibis_ops.GenerateArray(
                 ibis.greatest(
                     0,
                     ibis.least(
-                        *[table[column_id].length() - 1 for column_id in column_ids]
+                        *[
+                            table[table.columns[offset]].length() - 1
+                            for offset in offsets
+                        ]
                     ),
                 )
             )
diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py
@@ -42,6 +42,9 @@
 OVERHEAD_VARIABLES = 5
 
 
+COL_OFFSET = int
+
+
 @dataclass(frozen=True)
 class BigFrameNode:
     """
@@ -826,7 +829,7 @@ def variables_introduced(self) -> int:
 
 @dataclass(frozen=True)
 class ExplodeNode(UnaryNode):
-    column_ids: typing.Tuple[str, ...]
+    column_ids: typing.Tuple[COL_OFFSET, ...]
 
     @property
     def row_preserving(self) -> bool:
@@ -844,9 +847,9 @@ def schema(self) -> schemata.ArraySchema:
                     self.child.schema.get_type(name).pyarrow_dtype.value_type
                 ),
             )
-            if name in self.column_ids
+            if offset in self.column_ids
             else schemata.SchemaItem(name, self.child.schema.get_type(name))
-            for name in self.child.schema.names
+            for offset, name in enumerate(self.child.schema.names)
         )
         return schemata.ArraySchema(items)