Merge pull request #2884 from mabel-dev/#2877

joocer · web-flow · commit c67656ee3998 · 2025-10-31T08:18:40.000Z
limit pushdown
diff --git a/opteryx/__version__.py b/opteryx/__version__.py
@@ -1,9 +1,9 @@
 # THIS FILE IS AUTOMATICALLY UPDATED DURING THE BUILD PROCESS
 # DO NOT EDIT THIS FILE DIRECTLY
 
-__build__ = 1715
+__build__ = 1716
 __author__ = "@joocer"
-__version__ = "0.26.0-beta.1715"
+__version__ = "0.26.0-beta.1716"
 
 # Store the version here so:
 # 1) we don't load dependencies by storing it in __init__.py
diff --git a/opteryx/planner/optimizer/strategies/limit_pushdown.py b/opteryx/planner/optimizer/strategies/limit_pushdown.py
@@ -12,6 +12,9 @@
 We try to push the limit to the other side of PROJECTS
 """
 
+from typing import Optional
+from typing import Set
+
 from opteryx.connectors.capabilities import LimitPushable
 from opteryx.planner.logical_planner import LogicalPlan
 from opteryx.planner.logical_planner import LogicalPlanNode
@@ -23,55 +26,165 @@
 
 
 class LimitPushdownStrategy(OptimizationStrategy):
+    """Push LIMIT operators towards scans when it is safe to do so."""
+
+    # Node types at which `visit` stops carrying a collected LIMIT: on reaching one
+    # of these (and unless the node is skipped as belonging to a non-targeted
+    # branch) each collected limit is handed to `_place_before_node`.
+    _BARRIER_TYPES = {
+        LogicalPlanStepType.Aggregate,
+        LogicalPlanStepType.AggregateAndGroup,
+        LogicalPlanStepType.Distinct,
+        LogicalPlanStepType.Filter,
+        LogicalPlanStepType.FunctionDataset,
+        LogicalPlanStepType.HeapSort,
+        LogicalPlanStepType.Limit,
+        LogicalPlanStepType.MetadataWriter,
+        LogicalPlanStepType.Order,
+        LogicalPlanStepType.Set,
+        LogicalPlanStepType.Union,
+    }
+
     def visit(self, node: LogicalPlanNode, context: OptimizerContext) -> OptimizerContext:
+        """Collect LIMIT nodes and try to relocate them as later nodes are visited.
+
+        A LIMIT with no OFFSET and a limit that is neither None nor 0 is recorded
+        in ``context.collected_limits``. For every other node, each collected
+        limit is either carried on to later nodes, folded into a scan, or handed
+        to ``_place_before_node``.
+
+        Parameters:
+            node: the plan node currently being visited.
+            context: optimizer state; ``optimized_plan`` and ``collected_limits``
+                are read and updated here.
+
+        Returns:
+            The context object that was passed in, after any updates.
+        """
         if not context.optimized_plan:
-            context.optimized_plan = context.pre_optimized_tree.copy()  # type: ignore
+            context.optimized_plan = context.pre_optimized_tree.copy()  # type: ignore[arg-type]
 
         if node.node_type == LogicalPlanStepType.Limit:
-            if node.offset is not None:
-                # we can't push down limits with offset
+            # limits with an OFFSET, or whose limit is None or 0, are not collected
+            if node.offset is not None or node.limit in (None, 0):
                 return context
             node.nid = context.node_id
+            # relations this limit may still be pushed towards; set once per node
+            if not hasattr(node, "pushdown_targets"):
+                node.pushdown_targets = set(node.all_relations or [])
             context.collected_limits.append(node)
             return context
 
-        if (
-            node.node_type == LogicalPlanStepType.Scan
-            and LimitPushable in node.connector.__class__.mro()
-        ):
-            for limit_node in context.collected_limits:
-                if node.relation in limit_node.all_relations:
-                    self.statistics.optimization_limit_pushdown += 1
-                    context.optimized_plan.remove_node(limit_node.nid, heal=True)
-                    node.limit = limit_node.limit
-                    context.optimized_plan[context.node_id] = node
-        elif node.node_type in (
-            LogicalPlanStepType.Aggregate,
-            LogicalPlanStepType.AggregateAndGroup,
-            LogicalPlanStepType.Distinct,
-            LogicalPlanStepType.Filter,
-            LogicalPlanStepType.Join,
-            LogicalPlanStepType.Order,
-            LogicalPlanStepType.Union,
-            LogicalPlanStepType.Scan,
-        ):
-            # we don't push past here
-            for limit_node in context.collected_limits:
-                self.statistics.optimization_limit_pushdown += 1
-                context.optimized_plan.remove_node(limit_node.nid, heal=True)
-                context.optimized_plan.insert_node_after(
-                    limit_node.nid, limit_node, context.node_id
-                )
-                limit_node.columns = []
-            context.collected_limits.clear()
+        # limits that are still being carried once this node has been handled
+        remaining_limits = []
+        for limit_node in context.collected_limits:
+            # this node only involves relations the limit is not aimed at: keep carrying
+            if self._should_skip_branch(limit_node, node):
+                remaining_limits.append(limit_node)
+                continue
+
+            if node.node_type == LogicalPlanStepType.Scan:
+                # True: the scan took the limit; None: the scan is not a target, so
+                # keep carrying; False: the scan could not take it, so place it here
+                outcome = self._apply_to_scan(limit_node, node, context)
+                if outcome is True:
+                    continue
+                if outcome is None:
+                    remaining_limits.append(limit_node)
+                    continue
+                self._place_before_node(limit_node, node, context)
+                continue
+
+            if node.node_type == LogicalPlanStepType.Join:
+                # a truthy result means the limit may continue down one side of the join
+                if self._refine_targets_for_join(limit_node, node):
+                    remaining_limits.append(limit_node)
+                    continue
+                self._place_before_node(limit_node, node, context)
+                continue
 
+            if node.node_type in self._BARRIER_TYPES:
+                self._place_before_node(limit_node, node, context)
+                continue
+
+            # any other node type does not stop the limit: keep carrying it
+            remaining_limits.append(limit_node)
+
+        context.collected_limits = remaining_limits
         return context
 
     def complete(self, plan: LogicalPlan, context: OptimizerContext) -> LogicalPlan:
+        """Drop any limits still held in ``context.collected_limits`` and return ``plan``."""
-        # No finalization needed for this strategy
+        context.collected_limits.clear()
         return plan
 
-    def should_i_run(self, plan):
-        # only run if there are LIMIT clauses in the plan
+    def should_i_run(self, plan: LogicalPlan) -> bool:
+        """Return True only when the plan contains at least one LIMIT node."""
         candidates = get_nodes_of_type_from_logical_plan(plan, (LogicalPlanStepType.Limit,))
         return len(candidates) > 0
+
+    @staticmethod
+    def _collect_relations(node: LogicalPlanNode) -> Set[str]:
+        """Return the node's ``all_relations`` as a new set; empty when absent or falsy."""
+        return set(getattr(node, "all_relations", None) or ())
+
+    def _should_skip_branch(self, limit_node: LogicalPlanNode, node: LogicalPlanNode) -> bool:
+        """Tell whether ``node`` only involves relations the limit is not targeting.
+
+        Returns False when the limit has no targets or the node reports no
+        relations; otherwise True exactly when the two sets share no relation.
+        """
+        wanted = getattr(limit_node, "pushdown_targets", None)
+        if not wanted:
+            return False
+        present = self._collect_relations(node)
+        if not present:
+            return False
+        return wanted.isdisjoint(present)
+
+    def _apply_to_scan(
+        self,
+        limit_node: LogicalPlanNode,
+        scan_node: LogicalPlanNode,
+        context: OptimizerContext,
+    ) -> Optional[bool]:
+        """Try to fold a collected LIMIT into a scan.
+
+        Parameters:
+            limit_node: the collected LIMIT being pushed.
+            scan_node: the scan currently being visited.
+            context: optimizer state holding the plan being rewritten.
+
+        Returns:
+            True when the scan now carries the limit, None when the scan is not one
+            of the limit's target relations, False when the scan has no connector
+            or its connector is not ``LimitPushable``.
+        """
+        targets: Set[str] = getattr(
+            limit_node, "pushdown_targets", set(limit_node.all_relations or [])
+        )
+        relation_names = {scan_node.relation, getattr(scan_node, "alias", None)}
+        if targets and targets.isdisjoint({name for name in relation_names if name}):
+            return None
+
+        connector = getattr(scan_node, "connector", None)
+        if not connector or LimitPushable not in connector.__class__.mro():
+            return False
+
+        # keep the tighter bound if the scan already carries a limit
+        current_limit = getattr(scan_node, "limit", None)
+        scan_node.limit = (
+            limit_node.limit if current_limit is None else min(current_limit, limit_node.limit)
+        )
+        # A limit routed through a join only bounds one input of that join; the join
+        # can still emit more rows than the limit, so the LIMIT node must stay in the
+        # plan. Only a limit that never crossed a join is replaced by the scan limit.
+        crossed_join = getattr(limit_node, "crossed_join", False)
+        if not crossed_join and limit_node.nid in context.optimized_plan:
+            context.optimized_plan.remove_node(limit_node.nid, heal=True)
+        context.optimized_plan[context.node_id] = scan_node
+        self.statistics.optimization_limit_pushdown += 1
+        return True
+
+    def _refine_targets_for_join(
+        self, limit_node: LogicalPlanNode, join_node: LogicalPlanNode
+    ) -> bool:
+        """Narrow the limit's target relations to the side of a join it may enter.
+
+        A "left outer" join keeps the left-side targets, a "right outer" join the
+        right-side targets, and a "cross join" the side with the smaller reported
+        size when both sides are targeted. Any other join type is not crossed.
+
+        Returns:
+            True when targets remain; ``pushdown_targets`` and ``all_relations`` on
+            the limit are then narrowed and the limit is flagged ``crossed_join``.
+            False when the limit must not go past this join.
+        """
+        join_type = getattr(join_node, "type", None)
+        if not join_type:
+            return False
+
+        # fall back to every relation of the limit when no targets are recorded
+        targets: Set[str] = getattr(limit_node, "pushdown_targets", None) or set(
+            limit_node.all_relations or []
+        )
+
+        left_relations = set(getattr(join_node, "left_relation_names", []) or [])
+        right_relations = set(getattr(join_node, "right_relation_names", []) or [])
+
+        new_targets: Optional[Set[str]] = None
+
+        if join_type == "left outer":
+            new_targets = targets & left_relations
+        elif join_type == "right outer":
+            new_targets = targets & right_relations
+        elif join_type == "cross join":
+            # an unknown size counts as infinite, so a side with a known size wins
+            left_size = getattr(join_node, "left_size", float("inf"))
+            right_size = getattr(join_node, "right_size", float("inf"))
+            left_choice = targets & left_relations
+            right_choice = targets & right_relations
+            if left_choice and right_choice:
+                new_targets = left_choice if left_size <= right_size else right_choice
+            else:
+                new_targets = left_choice or right_choice
+        else:
+            return False
+
+        if not new_targets:
+            return False
+
+        limit_node.pushdown_targets = new_targets
+        limit_node.all_relations = set(new_targets)
+        # From here on the limit bounds a join input, not the query result, so the
+        # original LIMIT node has to remain where it is in the plan.
+        limit_node.crossed_join = True
+        return True
+
+    def _place_before_node(
+        self, limit_node: LogicalPlanNode, _: LogicalPlanNode, context: OptimizerContext
+    ) -> None:
+        """Re-insert the limit in the plan relative to the node currently being visited.
+
+        The second parameter is unused; the position comes from ``context.node_id``.
+        A limit that has crossed a join is left untouched: moving it below the join
+        would take the LIMIT off the join's output and change the query result.
+        """
+        if getattr(limit_node, "crossed_join", False):
+            return
+        if limit_node.nid in context.optimized_plan:
+            context.optimized_plan.remove_node(limit_node.nid, heal=True)
+        context.optimized_plan.insert_node_after(limit_node.nid, limit_node, context.node_id)
+        limit_node.columns = []
+        limit_node.pushdown_targets = set(limit_node.all_relations or [])
+        self.statistics.optimization_limit_pushdown += 1
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "opteryx"
-version = "0.26.0-beta.1715"
+version = "0.26.0-beta.1716"
 description = "Query your data, where it lives"
 requires-python = '>=3.11'
 readme = {file = "README.md", content-type = "text/markdown"}
diff --git a/tests/unit/planner/test_limit_pushdown_join_behaviour.py b/tests/unit/planner/test_limit_pushdown_join_behaviour.py
@@ -0,0 +1,43 @@
+import os
+import sys
+
+sys.path.insert(1, os.path.join(sys.path[0], "../../.."))
+
+import opteryx  # noqa: E402
+import pytest  # noqa: E402
+
+
+def _materialize(query: str):
+    """Run *query* through opteryx, materialize the result and return the cursor."""
+    result = opteryx.query(query)
+    result.materialize()
+    return result
+
+
+def test_limit_pushdown_left_outer_join():
+    """A LIMIT over a LEFT JOIN should show up on the left-hand scan of the executed plan."""
+    sql = (
+        "SELECT s.name FROM testdata.satellites AS s "
+        "LEFT JOIN testdata.planets AS p ON s.planetId = p.id LIMIT 5;"
+    )
+    cursor = _materialize(sql)
+    marker = "READ (testdata.satellites AS s)"
+    scan_line = next(
+        line for line in cursor.stats["executed_plan"].splitlines() if marker in line
+    )
+    assert "LIMIT 5" in scan_line, cursor.stats["executed_plan"]
+    assert cursor.stats["rows_read"] <= 14, cursor.stats
+
+
+def test_limit_pushdown_cross_join_prefers_smaller_side():
+    """A LIMIT over a CROSS JOIN should show up on the planets scan of the executed plan."""
+    sql = "SELECT * FROM testdata.planets AS p CROSS JOIN testdata.satellites AS s LIMIT 5;"
+    cursor = _materialize(sql)
+    marker = "READ (testdata.planets AS p)"
+    scan_line = next(
+        line for line in cursor.stats["executed_plan"].splitlines() if marker in line
+    )
+    assert "LIMIT 5" in scan_line, cursor.stats["executed_plan"]
+    assert cursor.stats["rows_read"] <= 182, cursor.stats
+
+if __name__ == "__main__":  # pragma: no cover
+    pytest.main([__file__])
diff --git a/tests/unit/planner/test_limit_pushdown_parquet_disk.py b/tests/unit/planner/test_limit_pushdown_parquet_disk.py
@@ -2,10 +2,9 @@
 import sys
 import pytest
 
-sys.path.insert(1, os.path.join(sys.path[0], "../.."))
+sys.path.insert(1, os.path.join(sys.path[0], "../../.."))
 
 import opteryx
-from opteryx.utils.formatter import format_sql
 from tests import is_arm, is_mac, is_windows, skip_if
 
 
@@ -35,6 +34,30 @@ def test_parquet_disk_limit_pushdown(query, expected_rows):
     cur.materialize()
     assert cur.stats["rows_read"] == expected_rows, cur.stats
 
+
+@skip_if(is_arm() or is_windows() or is_mac())
+def test_limit_pushdown_projection_plan():
+    """A LIMIT above a projecting subquery should reach the scan and cap the rows read."""
+    cur = opteryx.query("SELECT name FROM (SELECT name FROM testdata.planets) AS s LIMIT 3;")
+    cur.materialize()
+    marker = "READ (testdata.planets)"
+    scan_line = next(
+        line for line in cur.stats["executed_plan"].splitlines() if marker in line
+    )
+    assert "LIMIT 3" in scan_line, cur.stats["executed_plan"]
+    assert cur.stats["rows_read"] == 3, cur.stats
+
+
+@skip_if(is_arm() or is_windows() or is_mac())
+def test_limit_not_pushed_past_heap_sort():
+    """With ORDER BY ... LIMIT the limit should stay on the heap sort, not the scan."""
+    cur = opteryx.query("SELECT name FROM testdata.planets ORDER BY name LIMIT 3;")
+    cur.materialize()
+    plan_lines = cur.stats["executed_plan"].splitlines()
+
+    def first_line_with(marker):
+        # first plan line mentioning the marker; StopIteration if there is none
+        return next(line for line in plan_lines if marker in line)
+
+    heap_sort_line = first_line_with("HEAP SORT")
+    scan_line = first_line_with("READ (testdata.planets)")
+    assert "LIMIT" in heap_sort_line  # fused limit stays with heap sort
+    assert "LIMIT" not in scan_line, cur.stats["executed_plan"]
+    assert cur.stats["rows_read"] == 9, cur.stats
+
 if __name__ == "__main__":  # pragma: no cover
     import shutil
     import time
@@ -68,7 +91,7 @@ def test_parquet_disk_limit_pushdown(query, expected_rows):
                 print(" \033[0;31m*\033[0m")
             else:
                 print()
-        except Exception as err:
+        except (AssertionError, opteryx.exceptions.Error) as err:
             print(f"\033[0;31m{str(int((time.monotonic_ns() - start)/1e6)).rjust(4)}ms ❌ *\033[0m")
             print(">", err)
             failed += 1