refactor: Introduce slice op to model array slicing (#1055)

TrevorBergeron · web-flow · commit d1b6800a9e15 · 2024-10-09T10:05:57.000-07:00
diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py
@@ -158,10 +158,6 @@ def session(self) -> Session:
     def schema(self) -> schemata.ArraySchema:
         return self.node.schema
 
-    @functools.cached_property
-    def _compiled_schema(self) -> schemata.ArraySchema:
-        return bigframes.core.compile.test_only_ibis_inferred_schema(self.node)
-
     @property
     def explicitly_ordered(self) -> bool:
         # see BigFrameNode.explicitly_ordered
@@ -229,6 +225,23 @@ def order_by(self, by: Sequence[OrderingExpression]) -> ArrayValue:
     def reversed(self) -> ArrayValue:
         return ArrayValue(nodes.ReversedNode(child=self.node))
 
+    def slice(
+        self, start: Optional[int], stop: Optional[int], step: Optional[int]
+    ) -> ArrayValue:  # returns a new ArrayValue wrapping this node in a nodes.SliceNode
+        if self.node.order_ambiguous and not (self.session._strictly_ordered):
+            warnings.warn(  # warning only: the slice node is still built below
+                "Window ordering may be ambiguous, this can cause unstable results.",
+                bigframes.exceptions.AmbiguousWindowWarning,
+            )
+        return ArrayValue(
+            nodes.SliceNode(
+                self.node,
+                start=start,
+                stop=stop,
+                step=step if (step is not None) else 1,  # a None step falls back to 1
+            )
+        )
+
     def promote_offsets(self) -> Tuple[ArrayValue, str]:
         """
         Convenience function to promote copy of column offsets to a value column. Can be used to reset index.
diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
@@ -1465,84 +1465,17 @@ def slice(
         self,
         start: typing.Optional[int] = None,
         stop: typing.Optional[int] = None,
-        step: typing.Optional[int] = None,
-    ) -> bigframes.core.blocks.Block:
-        if step is None:
-            step = 1
+        step: int = 1,
+    ) -> Block:
         if step == 0:
-            raise ValueError("slice step cannot be zero")
-        if step < 0:
-            reverse_start = (-start - 1) if start else 0
-            reverse_stop = (-stop - 1) if stop else None
-            reverse_step = -step
-            return self.reversed()._forward_slice(
-                reverse_start, reverse_stop, reverse_step
-            )
-        return self._forward_slice(start or 0, stop, step)
-
-    def _forward_slice(self, start: int = 0, stop=None, step: int = 1):
-        """Performs slice but only for positive step size."""
-        if step <= 0:
-            raise ValueError("forward_slice only supports positive step size")
-
-        use_postive_offsets = (
-            (start > 0)
-            or ((stop is not None) and (stop >= 0))
-            or ((step > 1) and (start >= 0))
-        )
-        use_negative_offsets = (
-            (start < 0) or (stop and (stop < 0)) or ((step > 1) and (start < 0))
+            raise ValueError("Slice step size must be non-zero")
+        return Block(
+            self.expr.slice(start, stop, step),
+            index_columns=self.index_columns,
+            column_labels=self.column_labels,
+            index_labels=self._index_labels,
         )
 
-        block = self
-
-        # only generate offsets that are used
-        positive_offsets = None
-        negative_offsets = None
-
-        if use_postive_offsets:
-            block, positive_offsets = self.promote_offsets()
-        if use_negative_offsets:
-            block, negative_offsets = block.reversed().promote_offsets()
-            block = block.reversed()
-
-        conditions = []
-        if start != 0:
-            if start > 0:
-                assert positive_offsets
-                conditions.append(ops.ge_op.as_expr(positive_offsets, ex.const(start)))
-            else:
-                assert negative_offsets
-                conditions.append(
-                    ops.le_op.as_expr(negative_offsets, ex.const(-start - 1))
-                )
-        if stop is not None:
-            if stop >= 0:
-                assert positive_offsets
-                conditions.append(ops.lt_op.as_expr(positive_offsets, ex.const(stop)))
-            else:
-                assert negative_offsets
-                conditions.append(
-                    ops.gt_op.as_expr(negative_offsets, ex.const(-stop - 1))
-                )
-        if step > 1:
-            if start >= 0:
-                assert positive_offsets
-                start_diff = ops.sub_op.as_expr(positive_offsets, ex.const(start))
-            else:
-                assert negative_offsets
-                start_diff = ops.sub_op.as_expr(negative_offsets, ex.const(-start + 1))
-            step_cond = ops.eq_op.as_expr(
-                ops.mod_op.as_expr(start_diff, ex.const(step)), ex.const(0)
-            )
-            conditions.append(step_cond)
-
-        for cond in conditions:
-            block, cond_id = block.project_expr(cond)
-            block = block.filter_by_id(cond_id)
-
-        return block.select_columns(self.value_columns)
-
     # Using cache to optimize for Jupyter Notebook's behavior where both '__repr__'
     # and '__repr_html__' are called in a single display action, reducing redundant
     # queries.
diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py
@@ -20,7 +20,7 @@
 import functools
 import itertools
 import typing
-from typing import Callable, Iterable, Sequence, Tuple
+from typing import Callable, Iterable, Optional, Sequence, Tuple
 
 import google.cloud.bigquery as bq
 
@@ -270,6 +270,37 @@ def order_ambiguous(self) -> bool:
         return self.child.order_ambiguous
 
 
+@dataclass(frozen=True, eq=False)
+class SliceNode(UnaryNode):
+    """Logical slice of the child's rows; meant to be lowered to a limit or a filter over row numbers."""
+
+    start: Optional[int]  # slice start; None means "not given"
+    stop: Optional[int]  # slice stop; None means "not given"
+    step: int = 1  # slice step; defaults to 1
+
+    @property
+    def row_preserving(self) -> bool:
+        """Whether this node preserves input rows."""
+        return False
+
+    @property
+    def non_local(self) -> bool:
+        """
+        Whether this node combines information across multiple rows instead of processing rows independently.
+        Used as an approximation for whether the expression may require shuffling to execute (and therefore be expensive).
+        """
+        return True
+
+    # These are overestimates; more accurate numbers are available after converting to concrete limit or analytic+filter ops.
+    @property
+    def variables_introduced(self) -> int:
+        return 2
+
+    @property
+    def relation_ops_created(self) -> int:
+        return 2
+
+
 @dataclass(frozen=True, eq=False)
 class JoinNode(BigFrameNode):
     left_child: BigFrameNode
diff --git a/bigframes/core/rewrite.py b/bigframes/core/rewrite.py
@@ -16,13 +16,15 @@
 import dataclasses
 import functools
 import itertools
-from typing import Mapping, Optional, Sequence, Tuple
+from typing import cast, Mapping, Optional, Sequence, Tuple
 
 import bigframes.core.expression as scalar_exprs
+import bigframes.core.guid as guids
 import bigframes.core.identifiers as ids
 import bigframes.core.join_def as join_defs
 import bigframes.core.nodes as nodes
 import bigframes.core.ordering as order
+import bigframes.core.tree_properties as traversals
 import bigframes.operations as ops
 
 Selection = Tuple[Tuple[scalar_exprs.Expression, ids.ColumnId], ...]
@@ -381,3 +383,172 @@ def common_selection_root(
     if r_node in l_nodes:
         return r_node
     return None
+
+
+def replace_slice_ops(root: nodes.BigFrameNode) -> nodes.BigFrameNode:
+    # Rewrites SliceNodes into filter-based nodes, children first. TODO: pull some slices up into a limit op if near root.
+    if isinstance(root, nodes.SliceNode):
+        root = root.transform_children(replace_slice_ops)  # rewrite descendants before this node
+        return convert_slice_to_filter(cast(nodes.SliceNode, root))
+    else:
+        return root.transform_children(replace_slice_ops)  # not a slice: only recurse into children
+
+
+def get_simplified_slice(node: nodes.SliceNode):
+    """Default the start and, if the input size is known, resolve negative bounds."""
+    row_count = traversals.row_count(node.child)  # bounds index the input rows, not the slice output
+    start, stop, step = node.start, node.stop, node.step
+
+    if start is None:
+        start = 0 if step > 0 else -1
+    if row_count and step > 0:
+        if start and start < 0:
+            start = max(row_count + start, 0)  # clamp at 0, as Python does when -start > length
+        if stop and stop < 0:
+            stop = max(row_count + stop, 0)  # clamp at 0, as Python does when -stop > length
+    return start, stop, step
+
+
+def convert_slice_to_filter(node: nodes.SliceNode):
+    """Lower a SliceNode into offset-promotion, filter and selection nodes (or less if trivial)."""
+    start, stop, step = get_simplified_slice(node)
+
+    # no-op (eg. df[::1]). An explicit stop can always trim rows (eg. df[:-1] drops the
+    # last row), so only a missing stop qualifies.
+    if (not start) and (stop is None) and (step == 1):
+        return node.child
+    # No filtering, just reverse (eg. df[::-1]). stop == 0 (df[:0:-1]) drops the first
+    # row, so only a missing stop qualifies.
+    if ((start is None) or (start == -1)) and (stop is None) and (step == -1):
+        return nodes.ReversedNode(node.child)
+    # Non-negative start/stop with a positive step: a simple predicate on forward offsets.
+    # Negative steps must take the fallback below, which also reverses the row order.
+    if (step > 0) and ((start or 0) >= 0) and ((stop or 0) >= 0):
+        node_w_offset = add_offsets(node.child)
+        predicate = convert_simple_slice(
+            scalar_exprs.DerefOp(node_w_offset.col_id), start or 0, stop, step
+        )
+        filtered = nodes.FilterNode(node_w_offset, predicate)
+        return drop_cols(filtered, (node_w_offset.col_id,))
+
+    # fallback cases, generate both forward and backward offsets
+    if step < 0:
+        forward_offsets = add_offsets(node.child)
+        reversed_offsets = add_offsets(nodes.ReversedNode(forward_offsets))
+        dual_indexed = reversed_offsets
+    else:
+        reversed_offsets = add_offsets(nodes.ReversedNode(node.child))
+        forward_offsets = add_offsets(nodes.ReversedNode(reversed_offsets))
+        dual_indexed = forward_offsets
+    predicate = convert_complex_slice(
+        scalar_exprs.DerefOp(forward_offsets.col_id),
+        scalar_exprs.DerefOp(reversed_offsets.col_id),
+        start,
+        stop,
+        step,
+    )
+    filtered = nodes.FilterNode(dual_indexed, predicate)
+    return drop_cols(filtered, (forward_offsets.col_id, reversed_offsets.col_id))
+
+
+def add_offsets(node: nodes.BigFrameNode) -> nodes.PromoteOffsetsNode:
+    # Wraps node in a PromoteOffsetsNode under a freshly generated column id. TODO: allow providing a custom id generator?
+    offsets_id = ids.ColumnId(guids.generate_guid())
+    return nodes.PromoteOffsetsNode(node, offsets_id)
+
+
+def drop_cols(
+    node: nodes.BigFrameNode, drop_cols: Tuple[ids.ColumnId, ...]
+) -> nodes.SelectionNode:
+    # Selects every id of node except those in drop_cols. NOTE: a whole node that redefines the schema is a lot of overhead; should do something more efficient.
+    selections = tuple(
+        (scalar_exprs.DerefOp(id), id) for id in node.ids if id not in drop_cols
+    )
+    return nodes.SelectionNode(node, selections)
+
+
+def convert_simple_slice(
+    offsets: scalar_exprs.Expression,
+    start: int = 0,
+    stop: Optional[int] = None,
+    step: int = 1,
+) -> scalar_exprs.Expression:
+    """Build a forward-offset predicate for a slice; needs start/stop >= 0, only applies step when > 1."""
+    assert start >= 0
+    assert (stop is None) or (stop >= 0)
+
+    conditions = []
+    if start > 0:  # start == 0 needs no lower bound
+        conditions.append(ops.ge_op.as_expr(offsets, scalar_exprs.const(start)))
+    if (stop is not None) and (stop >= 0):
+        conditions.append(ops.lt_op.as_expr(offsets, scalar_exprs.const(stop)))
+    if step > 1:  # keep offsets that are a whole number of steps past start
+        start_diff = ops.sub_op.as_expr(offsets, scalar_exprs.const(start))
+        step_cond = ops.eq_op.as_expr(
+            ops.mod_op.as_expr(start_diff, scalar_exprs.const(step)),
+            scalar_exprs.const(0),
+        )
+        conditions.append(step_cond)
+
+    return merge_predicates(conditions) or scalar_exprs.const(True)  # presumably falsy when no conditions: keep all rows
+
+
+def convert_complex_slice(
+    forward_offsets: scalar_exprs.Expression,
+    reverse_offsets: scalar_exprs.Expression,
+    start: int,
+    stop: Optional[int],
+    step: int = 1,
+) -> scalar_exprs.Expression:
+    """Build the filter predicate for a slice with negative bounds or a negative step.
+
+    forward_offsets is 0 at the first row; reverse_offsets is assumed to be 0 at the last.
+    """
+    conditions = []
+    assert step != 0
+    if start or ((start is not None) and step < 0):
+        if start > 0 and step > 0:
+            start_cond = ops.ge_op.as_expr(forward_offsets, scalar_exprs.const(start))
+        elif start >= 0 and step < 0:  # >= so that start == 0 (eg. df[0::-1]) is handled
+            start_cond = ops.le_op.as_expr(forward_offsets, scalar_exprs.const(start))
+        elif start < 0 and step > 0:
+            start_cond = ops.le_op.as_expr(
+                reverse_offsets, scalar_exprs.const(-start - 1)
+            )
+        else:
+            assert start < 0 and step < 0
+            start_cond = ops.ge_op.as_expr(
+                reverse_offsets, scalar_exprs.const(-start - 1)
+            )
+        conditions.append(start_cond)
+    if stop is not None:
+        if stop >= 0 and step > 0:
+            stop_cond = ops.lt_op.as_expr(forward_offsets, scalar_exprs.const(stop))
+        elif stop >= 0 and step < 0:
+            stop_cond = ops.gt_op.as_expr(forward_offsets, scalar_exprs.const(stop))
+        elif stop < 0 and step > 0:
+            stop_cond = ops.gt_op.as_expr(
+                reverse_offsets, scalar_exprs.const(-stop - 1)
+            )
+        else:
+            assert (stop < 0) and (step < 0)
+            stop_cond = ops.lt_op.as_expr(
+                reverse_offsets, scalar_exprs.const(-stop - 1)
+            )
+        conditions.append(stop_cond)
+    if step != 1:
+        # Selected rows sit a whole number of steps from the start row. Divisibility does
+        # not depend on the sign of step, so one rule covers both directions.
+        if start >= 0:
+            start_diff = ops.sub_op.as_expr(forward_offsets, scalar_exprs.const(start))
+        else:
+            # index -1 is reverse offset 0, so index `start` is reverse offset -start - 1
+            start_diff = ops.sub_op.as_expr(
+                reverse_offsets, scalar_exprs.const(-start - 1)
+            )
+        step_cond = ops.eq_op.as_expr(
+            ops.mod_op.as_expr(start_diff, scalar_exprs.const(step)),
+            scalar_exprs.const(0),
+        )
+        conditions.append(step_cond)
+    return merge_predicates(conditions) or scalar_exprs.const(True)
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
@@ -3722,7 +3722,9 @@ def _slice(
         stop: typing.Optional[int] = None,
         step: typing.Optional[int] = None,
     ) -> DataFrame:
-        block = self._block.slice(start=start, stop=stop, step=step)
+        block = self._block.slice(
+            start=start, stop=stop, step=step if (step is not None) else 1
+        )
         return DataFrame(block)
 
     def __array_ufunc__(
diff --git a/bigframes/series.py b/bigframes/series.py
@@ -1923,9 +1923,9 @@ def _slice(
         step: typing.Optional[int] = None,
     ) -> bigframes.series.Series:
         return bigframes.series.Series(
-            self._block.slice(start=start, stop=stop, step=step).select_column(
-                self._value_column
-            ),
+            self._block.slice(
+                start=start, stop=stop, step=step if (step is not None) else 1
+            ).select_column(self._value_column),
         )
 
     def cache(self):
diff --git a/bigframes/session/executor.py b/bigframes/session/executor.py