Skip to content

Commit 40113d8

Browse files
refactor: Switch explode node to use column offsets (#978)
1 parent 3a4a9de commit 40113d8

File tree

3 files changed

+19
-9
lines changed

3 files changed

+19
-9
lines changed

bigframes/core/__init__.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -382,9 +382,8 @@ def explode(self, column_ids: typing.Sequence[str]) -> ArrayValue:
382382
for column_id in column_ids:
383383
assert bigframes.dtypes.is_array_like(self.get_column_type(column_id))
384384

385-
return ArrayValue(
386-
nodes.ExplodeNode(child=self.node, column_ids=tuple(column_ids))
387-
)
385+
offsets = tuple(self.get_offset_for_name(id) for id in column_ids)
386+
return ArrayValue(nodes.ExplodeNode(child=self.node, column_ids=offsets))
388387

389388
def _uniform_sampling(self, fraction: float) -> ArrayValue:
390389
"""Sampling the table on given fraction.
@@ -393,3 +392,6 @@ def _uniform_sampling(self, fraction: float) -> ArrayValue:
393392
The row numbers of result is non-deterministic, avoid to use.
394393
"""
395394
return ArrayValue(nodes.RandomSampleNode(self.node, fraction))
395+
396+
def get_offset_for_name(self, name: str):
397+
return self.schema.names.index(name)

bigframes/core/compile/compiled.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -401,8 +401,9 @@ def _uniform_sampling(self, fraction: float) -> UnorderedIR:
401401
columns=columns,
402402
)
403403

404-
def explode(self, column_ids: typing.Sequence[str]) -> UnorderedIR:
404+
def explode(self, offsets: typing.Sequence[int]) -> UnorderedIR:
405405
table = self._to_ibis_expr()
406+
column_ids = tuple(table.columns[offset] for offset in offsets)
406407

407408
# The offset array ensures null represents empty arrays after unnesting.
408409
offset_array_id = bigframes.core.guid.generate_guid("offset_array_")
@@ -712,16 +713,20 @@ def _uniform_sampling(self, fraction: float) -> OrderedIR:
712713
ordering=self._ordering,
713714
)
714715

715-
def explode(self, column_ids: typing.Sequence[str]) -> OrderedIR:
716+
def explode(self, offsets: typing.Sequence[int]) -> OrderedIR:
716717
table = self._to_ibis_expr(ordering_mode="unordered", expose_hidden_cols=True)
718+
column_ids = tuple(table.columns[offset] for offset in offsets)
717719

718720
offset_array_id = bigframes.core.guid.generate_guid("offset_array_")
719721
offset_array = (
720722
vendored_ibis_ops.GenerateArray(
721723
ibis.greatest(
722724
0,
723725
ibis.least(
724-
*[table[column_id].length() - 1 for column_id in column_ids]
726+
*[
727+
table[table.columns[offset]].length() - 1
728+
for offset in offsets
729+
]
725730
),
726731
)
727732
)

bigframes/core/nodes.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@
4242
OVERHEAD_VARIABLES = 5
4343

4444

45+
COL_OFFSET = int
46+
47+
4548
@dataclass(frozen=True)
4649
class BigFrameNode:
4750
"""
@@ -826,7 +829,7 @@ def variables_introduced(self) -> int:
826829

827830
@dataclass(frozen=True)
828831
class ExplodeNode(UnaryNode):
829-
column_ids: typing.Tuple[str, ...]
832+
column_ids: typing.Tuple[COL_OFFSET, ...]
830833

831834
@property
832835
def row_preserving(self) -> bool:
@@ -844,9 +847,9 @@ def schema(self) -> schemata.ArraySchema:
844847
self.child.schema.get_type(name).pyarrow_dtype.value_type
845848
),
846849
)
847-
if name in self.column_ids
850+
if offset in self.column_ids
848851
else schemata.SchemaItem(name, self.child.schema.get_type(name))
849-
for name in self.child.schema.names
852+
for offset, name in enumerate(self.child.schema.names)
850853
)
851854
return schemata.ArraySchema(items)
852855

0 commit comments

Comments
 (0)