
Commit 28bc404

fix(enumerator): restore full pre-split functionality and remove test skips
- Restore source_node_match/destination_node_match filter support
- Restore WHERE + multi-hop path pruning logic
- Remove skip decorators that hid oracle feature gaps
- Keep only legitimate xfail for edge alias on multi-hop (oracle limitation)
- Remove conftest workaround for multi-hop + WHERE
1 parent 8fb926c commit 28bc404
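
For orientation, the WHERE clauses this commit restores are cross-step column comparisons. The sketch below rebuilds that data shape from the dataclasses the commit removes from the enumerator (they now live in graphistry.compute.gfql.same_path_types); the "a"/"b" aliases and the "amount" column are illustrative, not taken from this commit.

from dataclasses import dataclass
from typing import Literal

# Shapes copied from the removed enumerator-local definitions (see the first diff below).
ComparisonOp = Literal["==", "!=", "<", "<=", ">", ">="]

@dataclass(frozen=True)
class StepColumnRef:
    alias: str    # which aliased step of the chain the value comes from
    column: str   # which column on that step's frame

@dataclass(frozen=True)
class WhereComparison:
    left: StepColumnRef
    op: ComparisonOp
    right: StepColumnRef

# "amount seen at step 'a' must be less than amount seen at step 'b'"
where = [WhereComparison(StepColumnRef("a", "amount"), "<", StepColumnRef("b", "amount"))]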

File tree

- graphistry/gfql/ref/enumerator.py
- tests/gfql/ref/conftest.py
- tests/gfql/ref/test_df_executor_amplify.py
- tests/gfql/ref/test_df_executor_core.py
- tests/gfql/ref/test_df_executor_patterns.py

5 files changed: +83 −67 lines changed

graphistry/gfql/ref/enumerator.py

Lines changed: 78 additions & 29 deletions

@@ -1,9 +1,10 @@
 """Minimal GFQL reference enumerator used as the correctness oracle."""
+# ruff: noqa: E501

 from __future__ import annotations

 from dataclasses import dataclass
-from typing import Any, Dict, List, Literal, Optional, Sequence, Set, Tuple
+from typing import Any, Dict, List, Optional, Sequence, Set, Tuple

 import pandas as pd

@@ -16,21 +17,7 @@
 from graphistry.compute.ast import ASTEdge, ASTNode, ASTObject
 from graphistry.compute.chain import Chain
 from graphistry.compute.filter_by_dict import filter_by_dict
-ComparisonOp = Literal["==", "!=", "<", "<=", ">", ">="]
-
-
-
-@dataclass(frozen=True)
-class StepColumnRef:
-    alias: str
-    column: str
-
-
-@dataclass(frozen=True)
-class WhereComparison:
-    left: StepColumnRef
-    op: ComparisonOp
-    right: StepColumnRef
+from graphistry.compute.gfql.same_path_types import ComparisonOp, WhereComparison


 @dataclass(frozen=True)
@@ -52,14 +39,6 @@ class OracleResult:
     edge_hop_labels: Optional[Dict[Any, int]] = None


-def col(alias: str, column: str) -> StepColumnRef:
-    return StepColumnRef(alias, column)
-
-
-def compare(left: StepColumnRef, op: ComparisonOp, right: StepColumnRef) -> WhereComparison:
-    return WhereComparison(left, op, right)
-
-
 def enumerate_chain(
     g: Plottable,
     ops: Sequence[ASTObject],
@@ -140,11 +119,9 @@ def enumerate_chain(
             paths = paths.drop(columns=[current])
             current = node_step["id_col"]
         else:
-            if where:
-                raise ValueError("WHERE clauses not supported for multi-hop edges in enumerator")
-            if edge_step["alias"] or node_step["alias"]:
-                # Alias tagging for multi-hop not yet supported in enumerator
-                raise ValueError("Aliases not supported for multi-hop edges in enumerator")
+            if edge_step["alias"]:
+                # Edge alias tagging for multi-hop not yet supported in enumerator
+                raise ValueError("Edge aliases not supported for multi-hop edges in enumerator")

             dest_allowed: Optional[Set[Any]] = None
             if not node_frame.empty:
@@ -164,6 +141,12 @@
                 for dst in bp_result.seed_to_nodes.get(seed_id, set()):
                     new_rows.append([*row, dst])
             paths = pd.DataFrame(new_rows, columns=[*base_cols, node_step["id_col"]])
+            paths = paths.merge(
+                node_frame,
+                on=node_step["id_col"],
+                how="inner",
+                validate="m:1",
+            )
             current = node_step["id_col"]

         # Stash edges/nodes and hop labels for final selection
@@ -182,6 +165,72 @@

     if where:
         paths = paths[_apply_where(paths, where)]
+
+    # After WHERE filtering, prune collected_nodes/edges to only those in surviving paths
+    # For multi-hop edges, we stored all reachable nodes/edges before WHERE filtering
+    # Now we need to keep only those that participate in valid paths
+    if len(paths) > 0:
+        for i, edge_step in enumerate(edge_steps):
+            if "collected_nodes" not in edge_step:
+                continue
+            start_col = node_steps[i]["id_col"]
+            end_col = node_steps[i + 1]["id_col"]
+            if start_col not in paths.columns or end_col not in paths.columns:
+                continue
+            valid_starts = set(paths[start_col].tolist())
+            valid_ends = set(paths[end_col].tolist())
+
+            # Re-trace paths from valid_starts to valid_ends to find valid nodes/edges
+            # Build adjacency from original edges, respecting direction
+            direction = edge_step.get("direction", "forward")
+            adjacency: Dict[Any, List[Tuple[Any, Any]]] = {}
+            for _, row in edges_df.iterrows():  # type: ignore[assignment]
+                src, dst, eid = row[edge_src], row[edge_dst], row[edge_id]  # type: ignore[call-overload]
+                if direction == "reverse":
+                    # Reverse: traverse dst -> src
+                    adjacency.setdefault(dst, []).append((eid, src))
+                elif direction == "undirected":
+                    # Undirected: traverse both ways
+                    adjacency.setdefault(src, []).append((eid, dst))
+                    adjacency.setdefault(dst, []).append((eid, src))
+                else:
+                    # Forward: traverse src -> dst
+                    adjacency.setdefault(src, []).append((eid, dst))
+
+            # BFS from valid_starts to find paths to valid_ends
+            valid_nodes: Set[Any] = set()
+            valid_edge_ids: Set[Any] = set()
+            min_hops = edge_step.get("min_hops", 1)
+            max_hops = edge_step.get("max_hops", 10)
+
+            for start in valid_starts:
+                # Track paths: (current_node, path_edges, path_nodes)
+                stack: List[Tuple[Any, List[Any], List[Any]]] = [(start, [], [start])]
+                while stack:
+                    node, path_edges, path_nodes = stack.pop()
+                    if len(path_edges) >= max_hops:
+                        continue
+                    for eid, dst in adjacency.get(node, []):
+                        new_edges = path_edges + [eid]
+                        new_nodes = path_nodes + [dst]
+                        # Only include paths within [min_hops, max_hops] range
+                        if dst in valid_ends and len(new_edges) >= min_hops:
+                            # This path reaches a valid end - include all nodes/edges
+                            valid_nodes.update(new_nodes)
+                            valid_edge_ids.update(new_edges)
+                        if len(new_edges) < max_hops:
+                            stack.append((dst, new_edges, new_nodes))
+
+            edge_step["collected_nodes"] = valid_nodes
+            edge_step["collected_edges"] = valid_edge_ids
+    else:
+        # No surviving paths - clear all collected nodes/edges
+        for edge_step in edge_steps:
+            if "collected_nodes" in edge_step:
+                edge_step["collected_nodes"] = set()
+            if "collected_edges" in edge_step:
+                edge_step["collected_edges"] = set()
+
     seq_cols: List[str] = []
     for i, node_step in enumerate(node_steps):
         seq_cols.append(node_step["id_col"])
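
To make the restored pruning pass concrete, here is a condensed, self-contained rerun of the re-tracing idea above on a toy adjacency map: only nodes and edge ids that lie on some path from a WHERE-surviving start to a WHERE-surviving end, within the hop bounds, are kept. Names mirror the diff, but this is an illustration, not the enumerator code.

from typing import Any, Dict, List, Set, Tuple

# Toy forward adjacency: a -e1-> b -e2-> c, plus a dead-end branch a -e3-> x.
adjacency: Dict[Any, List[Tuple[Any, Any]]] = {
    "a": [("e1", "b"), ("e3", "x")],
    "b": [("e2", "c")],
}
valid_starts: Set[Any] = {"a"}  # step-start values still present after WHERE
valid_ends: Set[Any] = {"c"}    # step-end values still present after WHERE
min_hops, max_hops = 1, 3

valid_nodes: Set[Any] = set()
valid_edge_ids: Set[Any] = set()
for start in valid_starts:
    # Depth-first walk over (node, edges so far, nodes so far), bounded by max_hops.
    stack: List[Tuple[Any, List[Any], List[Any]]] = [(start, [], [start])]
    while stack:
        node, path_edges, path_nodes = stack.pop()
        if len(path_edges) >= max_hops:
            continue
        for eid, dst in adjacency.get(node, []):
            new_edges, new_nodes = path_edges + [eid], path_nodes + [dst]
            if dst in valid_ends and len(new_edges) >= min_hops:
                # Keep everything along a path that reaches a surviving end node.
                valid_nodes.update(new_nodes)
                valid_edge_ids.update(new_edges)
            if len(new_edges) < max_hops:
                stack.append((dst, new_edges, new_nodes))

# The dead-end branch (x, e3) is pruned; only the surviving path remains.
assert valid_nodes == {"a", "b", "c"} and valid_edge_ids == {"e1", "e2"}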

tests/gfql/ref/conftest.py

Lines changed: 1 addition & 25 deletions

@@ -5,7 +5,6 @@
 import pytest

 from graphistry.Engine import Engine
-from graphistry.compute.ast import ASTEdge
 from graphistry.compute.gfql.df_executor import (
     build_same_path_inputs,
     DFSamePathExecutor,
@@ -17,17 +16,6 @@
 TEST_CUDF = "TEST_CUDF" in os.environ and os.environ["TEST_CUDF"] == "1"


-def _has_multihop(chain) -> bool:
-    """Check if chain has any multi-hop edges (oracle doesn't support multi-hop + WHERE)."""
-    for op in chain:
-        if isinstance(op, ASTEdge):
-            min_h = op.min_hops if op.min_hops is not None else (op.hops if isinstance(op.hops, int) else 1)
-            max_h = op.max_hops if op.max_hops is not None else (op.hops if isinstance(op.hops, int) else min_h)
-            if min_h != 1 or max_h != 1:
-                return True
-    return False
-
-
 def make_simple_graph():
     """Create a simple account->user graph for basic tests."""
     nodes = pd.DataFrame(
@@ -70,26 +58,14 @@ def make_hop_graph():


 def assert_executor_parity(graph, chain, where):
-    """Assert executor parity with oracle. Tests pandas, and cudf if TEST_CUDF=1.
-
-    For multi-hop + WHERE, oracle comparison is skipped (oracle doesn't support it).
-    We just verify the executor runs and produces valid output.
-    """
+    """Assert executor parity with oracle. Tests pandas, and cudf if TEST_CUDF=1."""
     inputs = build_same_path_inputs(graph, chain, where, Engine.PANDAS)
     executor = DFSamePathExecutor(inputs)
     executor._forward()
     result = executor._run_native()

     assert result._nodes is not None and result._edges is not None

-    # Oracle doesn't support multi-hop + WHERE, skip comparison
-    if where and _has_multihop(chain):
-        # Just verify executor produced valid output
-        assert "id" in result._nodes.columns
-        assert "src" in result._edges.columns
-        assert "dst" in result._edges.columns
-        return
-
     oracle = enumerate_chain(
         graph,
         chain,
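
A hypothetical test built on this helper might look as follows. make_simple_graph is the fixture above and n/e_forward are graphistry's GFQL constructors, but the "type" filter values and the empty WHERE are assumptions for illustration, not taken from this commit.

from graphistry import n, e_forward

def test_account_to_user_parity():
    graph = make_simple_graph()
    chain = [n({"type": "account"}), e_forward(), n({"type": "user"})]
    # Assumes the helper tolerates an empty WHERE; real tests pass their own comparisons.
    assert_executor_parity(graph, chain, where=[])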

tests/gfql/ref/test_df_executor_amplify.py

Lines changed: 0 additions & 3 deletions

@@ -979,7 +979,6 @@ class TestNodeEdgeMatchFilters:
     of the endpoint node filters or WHERE clauses.
     """

-    @pytest.mark.skip(reason="Oracle doesn't support destination_node_match correctly")
     def test_destination_node_match_single_hop(self):
         """
         destination_node_match restricts which nodes can be reached.
@@ -1012,7 +1011,6 @@ def test_destination_node_match_single_hop(self):
         assert "b" in result_nodes, "should reach target type node"
         assert "c" not in result_nodes, "should not reach other type node"

-    @pytest.mark.skip(reason="Oracle doesn't support source_node_match correctly")
     def test_source_node_match_single_hop(self):
         """
         source_node_match restricts which nodes can be traversed FROM.
@@ -1111,7 +1109,6 @@ def test_destination_node_match_multi_hop(self):
         assert "b" in result_nodes, "should reach b (target) at hop 1"
         assert "c" in result_nodes, "should reach c (target) at hop 2"

-    @pytest.mark.skip(reason="Oracle doesn't support source/destination_node_match correctly")
     def test_combined_source_and_dest_match(self):
         """
         Both source_node_match and destination_node_match together.
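
As a reminder of what these restored tests exercise, source_node_match and destination_node_match filter an edge step by its endpoints, independent of the surrounding node steps. A hedged sketch (the "type" values and chain are made up, not from this commit):

from graphistry import n, e_forward

chain = [
    n(),
    # Only traverse edges whose source node matches {"type": "account"}
    # and whose destination node matches {"type": "user"}.
    e_forward(
        source_node_match={"type": "account"},
        destination_node_match={"type": "user"},
    ),
    n(),
]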

tests/gfql/ref/test_df_executor_core.py

Lines changed: 4 additions & 9 deletions

@@ -1282,7 +1282,6 @@ def test_cycle_with_branch(self):

         _assert_parity(graph, chain, where)

-    @pytest.mark.skip(reason="Oracle doesn't support multi-hop + WHERE")
     def test_oracle_cudf_parity_comprehensive(self):
         """
         P0 Test 4: Oracle and cuDF executor must produce identical results.
@@ -1407,7 +1406,6 @@ class TestP1FeatureComposition:
     cuDF executor's handling of multi-hop + WHERE combinations.
     """

-    @pytest.mark.skip(reason="Oracle doesn't support multi-hop + WHERE")
     def test_multi_hop_edge_where_filtering(self):
         """
         P1 Test 5: WHERE must be applied even for multi-hop edges.
@@ -1597,7 +1595,6 @@ class TestUnfilteredStarts:
     instead of hop labels (which become ambiguous when all nodes can be starts).
     """

-    @pytest.mark.skip(reason="Oracle doesn't support multi-hop + WHERE")
     def test_unfiltered_start_node_multihop(self):
         """
         Unfiltered start node with multi-hop works via public API.
@@ -1663,7 +1660,6 @@ def test_unfiltered_start_single_hop(self):
         result = execute_same_path_chain(graph, chain, where, Engine.PANDAS)
         assert set(result._nodes["id"]) == set(oracle.nodes["id"])

-    @pytest.mark.skip(reason="Oracle doesn't support multi-hop + WHERE")
     def test_unfiltered_start_with_cycle(self):
         """
         Unfiltered start with cycle in graph.
@@ -1694,7 +1690,6 @@ def test_unfiltered_start_with_cycle(self):
         result = execute_same_path_chain(graph, chain, where, Engine.PANDAS)
         assert set(result._nodes["id"]) == set(oracle.nodes["id"])

-    @pytest.mark.skip(reason="Oracle doesn't support multi-hop + WHERE")
     def test_unfiltered_start_multihop_reverse(self):
         """
         Unfiltered start node with multi-hop REVERSE traversal + WHERE.
@@ -1729,7 +1724,6 @@ def test_unfiltered_start_multihop_reverse(self):
         result = execute_same_path_chain(graph, chain, where, Engine.PANDAS)
         assert set(result._nodes["id"]) == set(oracle.nodes["id"])

-    @pytest.mark.skip(reason="Oracle doesn't support multi-hop + WHERE")
     def test_unfiltered_start_multihop_undirected(self):
         """
         Unfiltered start node with multi-hop UNDIRECTED traversal + WHERE.
@@ -1762,7 +1756,6 @@ def test_unfiltered_start_multihop_undirected(self):
         result = execute_same_path_chain(graph, chain, where, Engine.PANDAS)
         assert set(result._nodes["id"]) == set(oracle.nodes["id"])

-    @pytest.mark.skip(reason="Oracle doesn't support multi-hop + WHERE")
     def test_filtered_start_multihop_reverse_where(self):
         """
         Filtered start node with multi-hop REVERSE + WHERE.
@@ -1796,7 +1789,6 @@ def test_filtered_start_multihop_reverse_where(self):
         result = execute_same_path_chain(graph, chain, where, Engine.PANDAS)
         assert set(result._nodes["id"]) == set(oracle.nodes["id"])

-    @pytest.mark.skip(reason="Oracle doesn't support multi-hop + WHERE")
     def test_filtered_start_multihop_undirected_where(self):
         """
         Filtered start with multi-hop UNDIRECTED + WHERE.
@@ -1841,7 +1833,10 @@ class TestOracleLimitations:
     These test features the oracle doesn't support.
     """

-    @pytest.mark.skip(reason="Oracle doesn't support edge aliases on multi-hop edges")
+    @pytest.mark.xfail(
+        reason="Oracle doesn't support edge aliases on multi-hop edges",
+        strict=True,
+    )
     def test_edge_alias_on_multihop(self):
         """
         ORACLE LIMITATION: Edge alias on multi-hop edge.
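
Unlike skip, a strict xfail still runs the test and treats an unexpected pass as a failure, so the marker above cannot silently outlive the oracle limitation it documents. Minimal illustration (the test body is a stand-in, not the real test):

import pytest

@pytest.mark.xfail(
    reason="Oracle doesn't support edge aliases on multi-hop edges",
    strict=True,
)
def test_edge_alias_on_multihop_stub():
    # Fails today and is reported as XFAIL; once the oracle gains support and this
    # starts passing, strict=True turns the XPASS into a test-suite failure.
    raise NotImplementedError("edge alias on multi-hop not yet supported")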

tests/gfql/ref/test_df_executor_patterns.py

Lines changed: 0 additions & 1 deletion

@@ -2429,7 +2429,6 @@ def test_string_equality(self):
         # Note: 'b' IS included because it's an intermediate node in the valid path a→b→c
         # The executor returns ALL nodes participating in valid paths, not just endpoints

-    @pytest.mark.skip(reason="Oracle doesn't support multi-hop + WHERE")
     def test_neq_with_nulls(self):
         """!= operator with null values - uses SQL-style semantics where NULL comparisons return False.
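
The SQL-style semantics that docstring refers to can be illustrated in plain pandas: a comparison involving a null counts as no match, whereas raw pandas treats NaN != x as True. This is only a sketch of the semantics, not the executor's implementation:

import pandas as pd

a = pd.Series([1.0, None, 3.0])
b = pd.Series([1.0, 2.0, None])

pandas_neq = a != b                               # [False, True, True]
sql_style_neq = (a != b) & a.notna() & b.notna()  # [False, False, False]: null never matches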
