diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5e5dde5db3..f498cb1d39 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -25,6 +25,7 @@ jobs: docs: ${{ steps.filter.outputs.docs }} infra: ${{ steps.filter.outputs.infra }} docs_only_latest: ${{ steps.docs_only_latest.outputs.docs_only_latest }} + alloy: ${{ steps.filter.outputs.alloy }} steps: - uses: actions/checkout@v3 - uses: dorny/paths-filter@v3 @@ -58,6 +59,8 @@ jobs: - '**.rst' - 'demos/**' - 'notebooks/**' + alloy: + - 'alloy/**' - name: Detect docs-only change on tip id: docs_only_latest @@ -123,6 +126,42 @@ jobs: source pygraphistry/bin/activate ./bin/typecheck.sh + alloy-check: + needs: changes + if: ${{ needs.changes.outputs.alloy == 'true' || needs.changes.outputs.python == 'true' || needs.changes.outputs.infra == 'true' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' }} + runs-on: ubuntu-latest + timeout-minutes: 10 + + steps: + - name: Checkout repo + uses: actions/checkout@v3 + with: + lfs: true + + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Pre-pull Alloy image cache + run: | + docker pull ghcr.io/graphistry/alloy6:6.2.0 || true + + - name: Run Alloy checks (scoped on PR/push, full on schedule/dispatch) + env: + EVENT_NAME: ${{ github.event_name }} + run: | + if [[ "$EVENT_NAME" == "schedule" || "$EVENT_NAME" == "workflow_dispatch" ]]; then + FULL=1 + MULTI=1 + else + FULL=0 + MULTI=0 + fi + ALLOY_PUSH=1 FULL=$FULL MULTI=$MULTI bash alloy/check_fbf_where.sh + test-minimal-python: needs: [changes, python-lint-types] # Run if Python files changed OR infrastructure changed OR manual/scheduled run diff --git a/CHANGELOG.md b/CHANGELOG.md index 74251ec9be..dc63dbe4c2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,9 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm - **Compute / hop**: `hop()` supports `min_hops`/`max_hops` traversal bounds plus optional hop labels for nodes, edges, and seeds, and post-traversal slicing via `output_min_hops`/`output_max_hops` to keep outputs compact while traversing wider ranges. - **Docs / hop**: Added bounded-hop walkthrough notebook (`docs/source/gfql/hop_bounds.ipynb`), cheatsheet and GFQL spec updates, and examples showing how to combine hop ranges, labels, and output slicing. - **GFQL / reference**: Extended the pandas reference enumerator and parity tests to cover hop ranges, labeling, and slicing so GFQL correctness checks include the new traversal shapes. +- **GFQL / Oracle**: Introduced `graphistry.gfql.ref.enumerator`, a pandas-only reference implementation that enumerates fixed-length chains, enforces local and same-path predicates, applies strict null semantics and safety caps, and emits alias tags/optional path bindings for use as a correctness oracle. +- **GFQL / cuDF same-path**: Added execution-mode gate `GRAPHISTRY_CUDF_SAME_PATH_MODE` (auto/oracle/strict) for the GFQL cuDF same-path executor. Auto falls back to the oracle when the GPU is unavailable; strict requires cuDF or raises. The oracle path retains safety caps and alias-tag propagation. +- **GFQL / cuDF executor**: Implemented same-path pruning path (wavefront backward filtering, min/max summaries for inequalities, value-aware equality filters) with oracle fallback. cuDF chains with WHERE now dispatch through the same-path executor.
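+
+  A minimal usage sketch for the new execution-mode gate (illustrative only; the variable name and modes come from this release, the rest is plain Python):
+
+  ```python
+  import os
+
+  # Force the CPU oracle path even when cuDF is installed
+  os.environ["GRAPHISTRY_CUDF_SAME_PATH_MODE"] = "oracle"
+
+  # Default: try GPU kernels, fall back to the oracle when unavailable
+  os.environ["GRAPHISTRY_CUDF_SAME_PATH_MODE"] = "auto"
+
+  # Fail fast instead of silently falling back
+  os.environ["GRAPHISTRY_CUDF_SAME_PATH_MODE"] = "strict"
+  ```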
### Fixed - **Compute / hop**: Exact-hop traversals now prune branches that do not reach `min_hops`, avoid reapplying min-hop pruning in reverse passes, keep seeds in wavefront outputs, and reuse forward wavefronts when recomputing labels so edge/node hop labels stay aligned (fixes 3-hop branch inclusion issues and mislabeled slices). @@ -19,6 +22,10 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm ### Tests - **GFQL / hop**: Expanded `test_compute_hops.py` and GFQL parity suites to assert branch pruning, bounded outputs, label collision handling, and forward/reverse slice behavior. - **Reference enumerator**: Added oracle parity tests for hop ranges and output slices to guard GFQL integrations. +- **GFQL**: Added deterministic + property-based oracle tests (triangles, alias reuse, cuDF conversions, Hypothesis) plus parity checks ensuring pandas GFQL chains match the oracle outputs. +- **GFQL / cuDF same-path**: Added strict/auto mode coverage for cuDF executor fallback behavior to keep CI stable while GPU kernels are wired up. +- **GFQL / cuDF same-path**: Added GPU-path parity tests (equality/inequality) over CPU data to guard semantics while GPU CI remains unavailable. +- **Layouts**: Added comprehensive test coverage for `circle_layout()` and `group_in_a_box_layout()` with partition support (CPU/GPU). ### Infra - **Tooling**: `bin/flake8.sh` / `bin/mypy.sh` now require installed tools (no auto-install), honor `FLAKE8_CMD` / `MYPY_CMD` and optional `MYPY_EXTRA_ARGS`; `bin/lint.sh` / `bin/typecheck.sh` resolve via uvx → python -m → bare. @@ -107,6 +114,8 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm ### Tests - **CI / Python**: Expand GitHub Actions coverage to Python 3.13/3.14 for CPU lint/type/test jobs, while pinning RAPIDS-dependent CPU/GPU suites to <=3.13 until NVIDIA publishes 3.14 wheels (ensures lint/mypy/pytest signal on the latest interpreter without breaking RAPIDS installs). - **GFQL**: Added deterministic + property-based oracle tests (triangles, alias reuse, cuDF conversions, Hypothesis) plus parity checks ensuring pandas GFQL chains match the oracle outputs. +- **GFQL / cuDF same-path**: Added strict/auto mode coverage for cuDF executor fallback behavior to keep CI stable while GPU kernels are wired up. +- **GFQL / cuDF same-path**: Added GPU-path parity tests (equality/inequality) over CPU data to guard semantics while GPU CI remains unavailable.
- **Layouts**: Added comprehensive test coverage for `circle_layout()` and `group_in_a_box_layout()` with partition support (CPU/GPU) ### Infra diff --git a/PLAN-846-852-feature-composition.md b/PLAN-846-852-feature-composition.md new file mode 100644 index 0000000000..4cfa766e48 --- /dev/null +++ b/PLAN-846-852-feature-composition.md @@ -0,0 +1,274 @@ +# Feature Composition Testing Plan: PR #846 + #852 + +## Status Summary + +| Item | Status | Notes | +|------|--------|-------| +| P0/P1 Tests for #846 | ✅ DONE | 8 tests added; 6 xfail (bugs found), 2 passing | +| Multi-hop bugs filed | ✅ DONE | Issue #872 created | +| Alloy README update | ✅ DONE | Scope/limitations documented | +| Meta-issue roadmap | ✅ DONE | Issue #871 created | + +## Issues Created + +- **#871**: Meta: GFQL Testing & Verification Roadmap +- **#872**: Fix multi-hop + WHERE backward prune bugs in cuDF executor + +## Branch Structure + +``` +master (includes PR #851 hop ranges - MERGED) + └── PR #846: feat/issue-837-cudf-hop-executor (same-path executor) + └── PR #852: feat/issue-838-alloy-fbf-where (alloy proof) ← CURRENT +``` + +## Execution Order + +### Phase 1: PR #846 Tests (on branch `feat/issue-837-cudf-hop-executor`) + +**Status: ✅ COMPLETE** + +Tests added to `tests/gfql/ref/test_cudf_executor_inputs.py`: + +| # | Test | Status | Notes | +|---|------|--------|-------| +| 1 | WHERE respected after min_hops backtracking | xfail | Bug #872 | +| 2 | Reverse direction + hop range + WHERE | xfail | Bug #872 | +| 3 | Non-adjacent alias WHERE | xfail | Bug #872 | +| 4 | Oracle vs cuDF parity comprehensive | xfail | Bug #872 | +| 5 | Multi-hop edge WHERE filtering | xfail | Bug #872 | +| 6 | Output slicing + WHERE | ✅ PASS | Works correctly | +| 7 | label_seeds + output_min_hops | ✅ PASS | Works correctly | +| 8 | Multiple WHERE + mixed hop ranges | xfail | Bug #872 | + +**Key Finding**: The cuDF executor has architectural limitations with multi-hop edges + WHERE: +- Backward prune doesn't trace through intermediate edges +- `_is_single_hop()` gates WHERE filtering +- Non-adjacent alias WHERE not applied + +These are documented in issue #872 for future fix. + +--- + +### Phase 2: Rebase PR #852 onto master + +```bash +git checkout feat/issue-838-alloy-fbf-where +git fetch origin +git rebase origin/master +# Resolve any conflicts +git push origin feat/issue-838-alloy-fbf-where --force-with-lease +``` + +--- + +### Phase 3: PR #852 Verification Updates (on branch `feat/issue-838-alloy-fbf-where`) + +**Status: ✅ COMPLETE** + +| # | Change | File | Status | +|---|--------|------|--------| +| 1 | Clarify hop ranges NOT formally verified | `alloy/README.md` | ✅ DONE | +| 2 | Note reliance on Python parity tests | `alloy/README.md` | ✅ DONE | +| 3 | State verified fragment precisely | `alloy/README.md` | ✅ DONE | + +**P1 - Add scenario checks (optional, strengthens claims)** - Deferred to future work. + +**Next steps:** +```bash +git checkout feat/issue-837-cudf-hop-executor +git stash pop # Apply the test changes +git add -A && git commit +git push origin feat/issue-837-cudf-hop-executor +# Wait for CI green, then merge PR #846 to master +``` + +--- + +## Test Implementation Details + +### Test 1: WHERE after min_hops backtracking + +```python +def test_where_respected_after_backtracking(): + """ + Graph: a -> b -> c -> d (3 hops) + a -> x -> y (2 hops, dead end for min_hops=3) + + WHERE: a.value < d.value + + Backtracking for min_hops=3 should: + 1. Prune x,y branch (doesn't reach 3 hops) + 2. Keep a,b,c,d path + 3. 
THEN apply WHERE to filter paths where a.value < d.value + + If WHERE not re-applied after backtracking, invalid paths may remain. + """ +``` + +### Test 2: Reverse direction + WHERE + +```python +def test_reverse_direction_where_semantics(): + """ + Graph: a -> b -> c -> d (forward edges) + + Chain: [n(name='start'), e_reverse(min_hops=2), n(name='end')] + WHERE: start.value > end.value + + Starting at 'd', reverse traversal reaches: + - c at hop 1, b at hop 2, a at hop 3 + + With min_hops=2, valid endpoints are b (hop 2) and a (hop 3). + WHERE compares start (d) vs end (b or a). + + Verify WHERE semantics are consistent regardless of traversal direction. + """ +``` + +### Test 3: Non-adjacent alias WHERE + +```python +def test_non_adjacent_alias_where(): + """ + Chain: [n(name='a'), e_forward(), n(name='b'), e_forward(), n(name='c')] + WHERE: a.id == c.id (aliases 2 edges apart) + + This WHERE clause should filter to paths where the first and last + nodes have the same id (e.g., cycles back to start). + + Risk: cuDF backward prune only applies WHERE to adjacent aliases. + """ +``` + +### Test 4: Oracle vs cuDF parity (parametrized) + +```python +@pytest.mark.parametrize("scenario", COMPOSITION_SCENARIOS) +def test_oracle_cudf_parity(scenario): + """ + Run same query with Oracle and cuDF executor. + Verify identical results. + + Scenarios cover all combinations of: + - Directions: forward, reverse, undirected + - Hop ranges: min_hops, max_hops, output slicing + - WHERE operators: ==, !=, <, <=, >, >= + - Topologies: linear, branch, cycle, disconnected + """ +``` + +--- + +## README Update for PR #852 + +```markdown +## Scope and Limitations + +### What IS Formally Verified + +- WHERE clause lowering to per-alias value summaries +- Equality (==, !=) via bitset filtering +- Inequality (<, <=, >, >=) via min/max summaries +- Multi-step chains with cross-alias comparisons +- Graph topologies: fan-out, fan-in, cycles, parallel edges, disconnected + +### What is NOT Formally Verified + +- **Hop ranges** (`min_hops`, `max_hops`): Approximated by unrolling to fixed-length chains +- **Output slicing** (`output_min_hops`, `output_max_hops`): Treated as post-filter +- **Hop labeling** (`label_node_hops`, `label_edge_hops`, `label_seeds`): Not modeled +- **Null/NaN semantics**: Verified in Python tests + +### Test Coverage for Unverified Features + +Hop ranges and output slicing are covered by Python parity tests: +- `tests/gfql/ref/test_enumerator_parity.py`: 11+ hop range scenarios +- `tests/gfql/ref/test_cudf_executor_inputs.py`: 8+ WHERE + hop range scenarios + +These tests verify the cuDF executor matches the reference oracle implementation. 
+``` + +--- + +## Priority Summary + +| Priority | Branch | Items | Blocks | +|----------|--------|-------|--------| +| **P0** | #846 | 4 tests | Merge of #846 | +| **P1** | #846 | 4 tests | - | +| **P0** | #852 | README scope update | Merge of #852 | +| **P1** | #852 | Alloy scenario checks | - | + +--- + +## Success Criteria + +### PR #846 Ready to Merge When: +- [ ] All 8 new tests pass +- [ ] Existing tests still pass +- [ ] CI green + +### PR #852 Ready to Merge When: +- [ ] README accurately describes verified scope +- [ ] Alloy checks pass (existing + any new scenarios) +- [ ] CI green + +--- + +## Resume Context + +### Current State (as of session end) +- **Current branch**: `feat/issue-838-alloy-fbf-where` (PR #852) +- **Stash**: Test changes stashed on `feat/issue-837-cudf-hop-executor` (stash@{0}) +- **Uncommitted**: `alloy/README.md` changes (scope/limitations section added) + +### Git State Summary +``` +feat/issue-838-alloy-fbf-where: + - Modified: alloy/README.md (scope/limitations section) + - Untracked: PLAN-846-852-feature-composition.md (this file) + +feat/issue-837-cudf-hop-executor (stash@{0}): + - 8 new tests in tests/gfql/ref/test_cudf_executor_inputs.py + - TestP0FeatureComposition class (4 tests, 3 xfail + 1 passing) + - TestP1FeatureComposition class (4 tests, 3 xfail + 1 passing) +``` + +### Key Files Modified +1. `tests/gfql/ref/test_cudf_executor_inputs.py` - Added 8 feature composition tests +2. `alloy/README.md` - Added scope/limitations section +3. `PLAN-846-852-feature-composition.md` - This tracking document + +### Bug Details (Issue #872) +Root cause in `graphistry/compute/gfql/cudf_executor.py`: +- `_backward_prune()` lines 312-393: Assumes single-hop edges +- `_is_single_hop()` gates WHERE filtering +- Multi-hop edges break backward prune path tracing + +### To Resume Work +```bash +# 1. Commit alloy README changes on current branch +git add alloy/README.md +git commit -m "docs(alloy): add scope and limitations section" +git push origin feat/issue-838-alloy-fbf-where + +# 2. Switch to #846 branch and apply stashed tests +git checkout feat/issue-837-cudf-hop-executor +git stash pop + +# 3. Commit and push test changes +git add tests/gfql/ref/test_cudf_executor_inputs.py +git commit -m "test(gfql): add 8 feature composition tests for hop ranges + WHERE + +Adds P0/P1 tests for PR #846 same-path executor with hop ranges. +6 tests xfail documenting known bugs (see issue #872). +2 tests pass verifying output slicing and label_seeds work correctly." +git push origin feat/issue-837-cudf-hop-executor + +# 4. 
Wait for CI, then merge PRs in order: #846 first, then rebase/merge #852 +``` + +### Related Issues +- **#871**: Meta: GFQL Testing & Verification Roadmap (future work) +- **#872**: Fix multi-hop + WHERE backward prune bugs in cuDF executor diff --git a/alloy/Dockerfile b/alloy/Dockerfile new file mode 100644 index 0000000000..8d96e08e58 --- /dev/null +++ b/alloy/Dockerfile @@ -0,0 +1,7 @@ +FROM eclipse-temurin:17-jre +WORKDIR /work + +# Use published Alloy dist jar (6.2.0) +ADD https://github.com/AlloyTools/org.alloytools.alloy/releases/download/v6.2.0/org.alloytools.alloy.dist.jar /opt/alloy/alloy.jar + +ENTRYPOINT ["java", "-jar", "/opt/alloy/alloy.jar"] diff --git a/alloy/README.md b/alloy/README.md new file mode 100644 index 0000000000..c19b92a833 --- /dev/null +++ b/alloy/README.md @@ -0,0 +1,66 @@ +# Alloy Checks for GFQL F/B/F + WHERE + +Purpose: bounded, mechanized equivalence checks between the GFQL path-spec and the set-based forward/backward/forward algorithm with WHERE lowerings. + +## Model +- Path semantics: bindings are sequences aligned to `seqSteps`; WHERE is per binding. Mirrors Python hop/chain construction. +- Set semantics: executor-style F/B/F over per-alias node/edge sets; WHERE lowered via per-alias summaries. +- Scopes: ≤8 Nodes, ≤8 Edges, ≤4 Steps, ≤4 Values. Null/NaN not modeled; hashing treated as prefilter and omitted. +- Lowerings: inequalities via min/max summaries; equality via exact sets (bitsets modeled as sets). + +## Commands +- Default small checks (fast): `bash alloy/check_fbf_where.sh` +- Full scopes (core + scenarios): `FULL=1 bash alloy/check_fbf_where.sh` +- Add multi-chain full-scope: `FULL=1 MULTI=1 bash alloy/check_fbf_where.sh` + +Env vars: +- `ALLOY_IMAGE` (default `ghcr.io/graphistry/alloy6:6.2.0`) +- `ALLOY_FALLBACK_IMAGE` (default `local/alloy6:latest`) +- `ALLOY_PUSH=1` to push built image to ghcr when falling back. + +## CI behavior +- PR/push: small + scenario suite (faster). +- schedule/workflow_dispatch: full scopes + optional multi-chain (heavier). +- Job pre-pulls `ghcr.io/graphistry/alloy6:6.2.0`; falls back to local build and pushes when allowed. + +## Scope and Limitations + +### What IS Formally Verified +- WHERE clause lowering to per-alias value summaries +- Equality (`==`, `!=`) via bitset filtering +- Inequality (`<`, `<=`, `>`, `>=`) via min/max summaries +- Multi-step chains with cross-alias comparisons +- Graph topologies: fan-out, fan-in, cycles, parallel edges, disconnected + +### What is NOT Formally Verified +- **Hop ranges** (`min_hops`, `max_hops`): Approximated by unrolling to fixed-length chains +- **Output slicing** (`output_min_hops`, `output_max_hops`): Treated as post-filter +- **Hop labeling** (`label_node_hops`, `label_edge_hops`, `label_seeds`): Not modeled +- **Null/NaN semantics**: Verified in Python tests instead +- **Hashing**: Treated as prefilter and omitted (exactness rechecked in model) + +### Test Coverage for Unverified Features +Hop ranges and output slicing are covered by Python parity tests: +- `tests/gfql/ref/test_enumerator_parity.py`: 11+ hop range scenarios +- `tests/gfql/ref/test_df_executor_inputs.py`: 50+ WHERE + hop range scenarios +- `tests/gfql/ref/test_df_executor_inputs.py::TestImpossibleConstraints`: 10 impossible/contradictory constraint tests + +These tests verify the native executor matches the reference oracle implementation. 
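+
+For intuition, the inequality lowering that the model checks corresponds to a summary-based filter like the following pandas sketch (hypothetical column names; the real executor lives in `graphistry/compute/gfql/df_executor.py`):
+
+```python
+import pandas as pd
+
+# Toy graph: nodes with a value, single-hop forward edges a -> c
+nodes = pd.DataFrame({"id": [1, 2, 3, 4], "val": [10, 5, 7, 20]})
+edges = pd.DataFrame({"src": [1, 2, 3], "dst": [3, 3, 4]})
+
+# Lowering for WHERE a.val < c.val: rather than enumerating paths, propagate
+# a per-destination min over upstream a.val ...
+a_vals = edges.merge(nodes.rename(columns={"id": "src"}), on="src")
+min_a_at_c = a_vals.groupby("dst")["val"].min().reset_index(name="min_a_val")
+
+# ... then keep c nodes where some upstream a satisfies the inequality
+c = nodes.merge(min_a_at_c, left_on="id", right_on="dst")
+print(c[c["min_a_val"] < c["val"]][["id", "val", "min_a_val"]])
+```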
### Bugs Found That Inform Future Verification (PR #846) The following bugs, found during executor development, are the kind of defects formal verification could catch: 1. **Backward traversal join direction** (`_find_multihop_start_nodes`) - joined on wrong column 2. **Empty set short-circuit missing** (`_materialize_filtered`) - no early return for empty sets 3. **Wrong node source for non-adjacent WHERE** - used incomplete alias_frames instead of graph nodes 4. **Multi-hop path tracing through intermediates** - backward prune filtered wrong edges 5. **Reverse/undirected edge direction handling** - missing is_undirected checks See issue #871 for the full testing & verification roadmap, including these recommended Alloy model extensions: - P1: Add hop range modeling - P1: Add backward reachability assertions - P2: Add empty set propagation assertion - P2: Add contradictory WHERE scenarios (attempted, but the model's value semantics are too nuanced; covered by Python tests) diff --git a/alloy/check_fbf_where.sh b/alloy/check_fbf_where.sh new file mode 100755 index 0000000000..c774797c77 --- /dev/null +++ b/alloy/check_fbf_where.sh @@ -0,0 +1,58 @@ +#!/usr/bin/env bash +set -euo pipefail + +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ALS="/work/gfql_fbf_where.als" +IMAGE="${ALLOY_IMAGE:-ghcr.io/graphistry/alloy6:6.2.0}" +LOCAL_FALLBACK_IMAGE="${ALLOY_FALLBACK_IMAGE:-local/alloy6:latest}" +FULL=${FULL:-0} +MULTI=${MULTI:-0} +PUSH=${ALLOY_PUSH:-0} + +# Resolve image: pull ghcr if possible, otherwise build local; optionally push built image to ghcr for caching +resolve_image() { + local img="$IMAGE" + if docker image inspect "$img" >/dev/null 2>&1; then + IMAGE="$img" + return + fi + + if docker pull "$img" >/dev/null 2>&1; then + IMAGE="$img" + return + fi + + # Fall back to local build
+ if ! docker image inspect "$LOCAL_FALLBACK_IMAGE" >/dev/null 2>&1; then + docker build -t "$LOCAL_FALLBACK_IMAGE" "$HERE" + fi + + # Optionally publish to ghcr for future pulls + if [ "$PUSH" = "1" ]; then + docker tag "$LOCAL_FALLBACK_IMAGE" "$img" + docker push "$img" || true + IMAGE="$img" + else + IMAGE="$LOCAL_FALLBACK_IMAGE" + fi +} + +resolve_image + +if [ "$FULL" = "1" ]; then + docker run --rm -v "$HERE":/work "$IMAGE" exec -c SpecNoWhereEqAlgoNoWhere -o - "$ALS" + docker run --rm -v "$HERE":/work "$IMAGE" exec -c SpecWhereEqAlgoLowered -o - "$ALS" +else + docker run --rm -v "$HERE":/work "$IMAGE" exec -c SpecNoWhereEqAlgoNoWhereSmall -o - "$ALS" + docker run --rm -v "$HERE":/work "$IMAGE" exec -c SpecWhereEqAlgoLoweredSmall -o - "$ALS" +fi + +# Scenario coverage + additional scopes (fixed small scopes inside .als) +for ASSERT in SpecNoWhereEqAlgoNoWhereMultiChain SpecWhereEqAlgoLoweredMultiChain SpecWhereEqAlgoLoweredFan SpecWhereEqAlgoLoweredCycle SpecWhereEqAlgoLoweredParallel SpecWhereEqAlgoLoweredDisconnected SpecWhereEqAlgoLoweredAliasWhere SpecWhereEqAlgoLoweredMixedWhere SpecWhereEqAlgoLoweredFilterMix; do + docker run --rm -v "$HERE":/work "$IMAGE" exec -c "$ASSERT" -o - "$ALS" +done + +if [ "$MULTI" = "1" ]; then + docker run --rm -v "$HERE":/work "$IMAGE" exec -c SpecNoWhereEqAlgoNoWhereMultiChainFull -o - "$ALS" + docker run --rm -v "$HERE":/work "$IMAGE" exec -c SpecWhereEqAlgoLoweredMultiChainFull -o - "$ALS" +fi diff --git a/alloy/gfql_fbf_where.als b/alloy/gfql_fbf_where.als new file mode 100644 index 0000000000..1d206aef0d --- /dev/null +++ b/alloy/gfql_fbf_where.als @@ -0,0 +1,283 @@ +module gfql_fbf_where +open util/ordering[Value] as ord +open util/integer + +// Alloy model to compare Python hop/chain (path semantics) vs executor (set semantics with F/B/F lowerings). +// Path semantics: bindings are sequences aligned to seqSteps with WHERE applied per binding. +// Set semantics: forward/backward/forward collects per-alias node/edge sets, then checks WHERE via summaries. +// Scopes (checks): up to 8 Nodes, 8 Edges, 4 Steps, 4 Values. Nulls/hashing omitted; bounded values only. +// Mapping to Python hop/chain: +// - seqSteps alternates NodeStep/EdgeStep like the graphistry.compute GFQL chain builder. +// - aliasN/aliasE mirror user aliases; WHERE binds to NodeStep aliases only. +// - nFilter/eFilter correspond to per-step filter columns; WHERE models cross-step predicates. +// - Spec uses path bindings (sequence) like hop composition; Algo uses set semantics like executor. +// - Null/NaN not modeled; hashing treated as prefilter and omitted here. +// - Hop ranges/output slicing (min/max/output bounds) are not explicitly modeled; approximate via unrolled fixed-length chains. + +abstract sig Value {} +sig Val extends Value {} + +sig Node { vals: set Value } +sig Edge { src: one Node, dst: one Node, vals: set Value } + +abstract sig Step {} +sig NodeStep extends Step { aliasN: lone Alias, nFilter: set Value } +sig EdgeStep extends Step { aliasE: lone Alias, eFilter: set Value } +sig Alias {} + +// WHERE refs point to node aliases and a required value +sig WhereRef { a: one Alias, v: one Value } +sig WhereClause { lhs: one WhereRef, rhs: one WhereRef, op: one Op } +abstract sig Op {} +one sig Eq, Neq, Lt, Lte, Gt, Gte extends Op {} + +// Chain mirrors Python chain construction: alternating NodeStep/EdgeStep with alias + filters.
+sig Chain { seqSteps: seq Step, wheres: set WhereClause } +sig Binding { + owner: one Chain, + bn: Int -> lone Node, + be: Int -> lone Edge +} + +// Well-formed chains: non-empty, odd length (N,E,N,...), typed positions +fact WellFormedChains { + all c: Chain | + #seq/inds[c.seqSteps] > 0 and rem[#seq/inds[c.seqSteps], 2] = 1 and + all i: seq/inds[c.seqSteps] | + (rem[i, 2] = 0 => c.seqSteps[i] in NodeStep) and + (rem[i, 2] = 1 => c.seqSteps[i] in EdgeStep) +} + +// Ensure we analyze non-empty chains; allow multiple chains/bindings within scope. +fact NonEmptyChains { some Chain } +fact OneBindingPerChain { all c: Chain | some b: Binding | b.owner = c } + +// All bindings must satisfy their owner's shape and WHERE clauses +fact BindingsRespectOwners { + all c: Chain | some b: Binding | BindingFor[c, b] +} + +// Project binding sequences into sets (path semantics) +fun bindNodes[b: Binding]: set Node { b.bn[Int] } +fun bindEdges[b: Binding]: set Edge { b.be[Int] } + +// Binding = sequence of nodes/edges aligned with steps (path-based semantics) +pred BindingFor[c: Chain, b: Binding] { + b.owner = c and + let bnSeq = b.bn, beSeq = b.be | + isSeq[bnSeq] and isSeq[beSeq] and + // shape + #bnSeq = div[#(c.seqSteps) + 1, 2] and + #beSeq = div[#(c.seqSteps), 2] and + all i: seq/inds[c.seqSteps] | + (rem[i, 2] = 0 => c.seqSteps[i] in NodeStep and nFilterOK[c.seqSteps[i], bnSeq[div[i, 2]]]) and + (rem[i, 2] = 1 => c.seqSteps[i] in EdgeStep and eFilterOK[c.seqSteps[i], beSeq[div[i, 2]]] and beSeq[div[i, 2]].src = bnSeq[div[i - 1, 2]] and beSeq[div[i, 2]].dst = bnSeq[div[i + 1, 2]]) and + // where clauses satisfied + all w: c.wheres | whereHolds[w, c, bnSeq] +} + +// Binding shape without WHERE (used by set-based algo path connectivity) +pred BindingShape[c: Chain, b: Binding] { + b.owner = c and + let bnSeq = b.bn, beSeq = b.be | + isSeq[bnSeq] and isSeq[beSeq] and + #bnSeq = div[#(c.seqSteps) + 1, 2] and + #beSeq = div[#(c.seqSteps), 2] and + all i: seq/inds[c.seqSteps] | + (rem[i, 2] = 0 => c.seqSteps[i] in NodeStep and nFilterOK[c.seqSteps[i], bnSeq[div[i, 2]]]) and + (rem[i, 2] = 1 => c.seqSteps[i] in EdgeStep and eFilterOK[c.seqSteps[i], beSeq[div[i, 2]]] and beSeq[div[i, 2]].src = bnSeq[div[i - 1, 2]] and beSeq[div[i, 2]].dst = bnSeq[div[i + 1, 2]]) +} + +pred nFilterOK[s: NodeStep, n: Node] { no s.nFilter or s.nFilter in n.vals } +pred eFilterOK[s: EdgeStep, e: Edge] { no s.eFilter or s.eFilter in e.vals } + +// resolve alias to node in binding +fun aliasNode[c: Chain, bn: Int -> lone Node, a: Alias]: set Node { + { n: Node | some i: seq/inds[c.seqSteps] | rem[i, 2] = 0 and c.seqSteps[i].aliasN = a and n = bn[div[i, 2]] } +} + +// Case split over ops: conjunction of guarded implications (an or-chain would be vacuously true whenever any other guard is false) +pred whereHolds[w: WhereClause, c: Chain, bn: Int -> lone Node] { + let ln = aliasNode[c, bn, w.lhs.a], rn = aliasNode[c, bn, w.rhs.a] | + some ln and some rn and + let lvals = ln.vals, rvals = rn.vals | + (w.op = Eq => some vv: lvals & rvals | vv = w.lhs.v and vv = w.rhs.v) + and (w.op = Neq => no (lvals & rvals)) + and (w.op = Lt => some lv: lvals | some rv: rvals | lv = w.lhs.v and rv = w.rhs.v and lv in ord/prevs[rv]) + and (w.op = Lte => some lv: lvals | some rv: rvals | lv = w.lhs.v and rv = w.rhs.v and (lv = rv or lv in ord/prevs[rv])) + and (w.op = Gt => some lv: lvals | some rv: rvals | lv = w.lhs.v and rv = w.rhs.v and rv in ord/prevs[lv]) + and (w.op = Gte => some lv: lvals | some rv: rvals | lv = w.lhs.v and rv = w.rhs.v and (lv = rv or rv in ord/prevs[lv])) +} + +// Spec (path semantics): nodes/edges that appear in some satisfying binding
+pred SpecNode[c: Chain, n: Node] { some b: Binding | BindingFor[c, b] and n in bindNodes[b] } +pred SpecEdge[c: Chain, e: Edge] { some b: Binding | BindingFor[c, b] and e in bindEdges[b] } + +pred SpecAlgoEq[c: Chain] { + all n: Node | SpecNode[c, n] <=> n in AlgoOutN[c] + all e: Edge | SpecEdge[c, e] <=> e in AlgoOutE[c] +} + +// Algo: forward/backward/forward under set semantics with simple lowerings: +// - Inequalities lowered to min/max summaries per alias/value +// - Equalities lowered to exact value sets per alias +fun AlgoOutN[c: Chain]: set Node { { n: Node | some b: Binding | BindingShape[c, b] and n in bindNodes[b] } } +fun AlgoOutE[c: Chain]: set Edge { { e: Edge | some b: Binding | BindingShape[c, b] and e in bindEdges[b] } } + +pred Algo[c: Chain] { + let outN = AlgoOutN[c], outE = AlgoOutE[c] | + all w: c.wheres | lowerWhere[w, c, outN, outE] +} + +// Same guarded case split as whereHolds: conjunction, not disjunction, of implications +pred lowerWhere[w: WhereClause, c: Chain, outN: set Node, outE: set Edge] { + // compute per-alias value sets + let ln = aliasNodes[outN, c, w.lhs.a], rn = aliasNodes[outN, c, w.rhs.a] | + some ln and some rn and + let lvals = ln.vals, rvals = rn.vals | + (w.op = Eq => some vv: lvals & rvals | vv = w.lhs.v and vv = w.rhs.v) + and (w.op = Neq => no (lvals & rvals)) + and (w.op = Lt => some lv: lvals | some rv: rvals | lv = w.lhs.v and rv = w.rhs.v and ord/lt[lv, rv]) + and (w.op = Lte => some lv: lvals | some rv: rvals | lv = w.lhs.v and rv = w.rhs.v and (lv = rv or ord/lt[lv, rv])) + and (w.op = Gt => some lv: lvals | some rv: rvals | lv = w.lhs.v and rv = w.rhs.v and ord/lt[rv, lv]) + and (w.op = Gte => some lv: lvals | some rv: rvals | lv = w.lhs.v and rv = w.rhs.v and (lv = rv or ord/lt[rv, lv])) +} + +fun aliasNodes[ns: set Node, c: Chain, a: Alias]: set Node { + { n: ns | some i: seq/inds[c.seqSteps] | rem[i, 2] = 0 and c.seqSteps[i].aliasN = a } +} + +assert SpecNoWhereEqAlgoNoWhere { + all c: Chain | + Algo[c] and + (no c.wheres implies SpecAlgoEq[c]) +} + +assert SpecWhereEqAlgoLowered { + all c: Chain | + Algo[c] and SpecAlgoEq[c] +} + +// Derived assertions for alternate scopes (multi-chain) +assert SpecNoWhereEqAlgoNoWhereMultiChain { + all c: Chain | + Algo[c] and (no c.wheres implies SpecAlgoEq[c]) +} + +assert SpecWhereEqAlgoLoweredMultiChain { + all c: Chain | + Algo[c] and SpecAlgoEq[c] +} + +assert SpecNoWhereEqAlgoNoWhereMultiChainFull { + all c: Chain | + Algo[c] and (no c.wheres implies SpecAlgoEq[c]) +} + +assert SpecWhereEqAlgoLoweredMultiChainFull { + all c: Chain | + Algo[c] and SpecAlgoEq[c] +} + +// Convenience aliases for alternate scopes +assert SpecNoWhereEqAlgoNoWhereSmall { + all c: Chain | + Algo[c] and + (no c.wheres implies SpecAlgoEq[c]) +} +assert SpecWhereEqAlgoLoweredSmall { + all c: Chain | Algo[c] and SpecAlgoEq[c] +} + +// Scenario coverage: topologies and query shapes that tend to surface path/set differences.
+pred FanOutGraph { some n: Node | some disj e1, e2: Edge | e1.src = n and e2.src = n and e1.dst != e2.dst } +pred FanInGraph { some n: Node | some disj e1, e2: Edge | e1.dst = n and e2.dst = n and e1.src != e2.src } +pred CycleGraph { some e: Edge | e.src = e.dst or some disj e1, e2: Edge | e1.src = e2.dst and e2.src = e1.dst } +pred ParallelEdgesGraph { some disj e1, e2: Edge | e1.src = e2.src and e1.dst = e2.dst } +pred DisconnectedGraph { some n: Node | no e: Edge | e.src = n or e.dst = n } + +pred ChainAliasReuse[c: Chain] { + #seq/inds[c.seqSteps] >= 3 and + c.seqSteps[0] in NodeStep and c.seqSteps[2] in NodeStep and + some al: Alias | c.seqSteps[0].aliasN = al and c.seqSteps[2].aliasN = al and + some w: c.wheres | (w.lhs.a = al or w.rhs.a = al) +} + +pred ChainMixedWhere[c: Chain] { + some wEq: c.wheres | wEq.op = Eq and + some wCmp: c.wheres | wCmp.op != Eq +} + +pred ChainFilterMix[c: Chain] { + some ns: NodeStep | ns in c.seqSteps.elems and some ns.nFilter and + some es: EdgeStep | es in c.seqSteps.elems and some es.eFilter +} + +pred FanCounterexample { + FanOutGraph and FanInGraph and + some c: Chain | Algo[c] and not SpecAlgoEq[c] +} +assert SpecWhereEqAlgoLoweredFan { not FanCounterexample } + +pred CycleCounterexample { + CycleGraph and + some c: Chain | Algo[c] and not SpecAlgoEq[c] +} +assert SpecWhereEqAlgoLoweredCycle { not CycleCounterexample } + +pred ParallelCounterexample { + ParallelEdgesGraph and + some c: Chain | Algo[c] and not SpecAlgoEq[c] +} +assert SpecWhereEqAlgoLoweredParallel { not ParallelCounterexample } + +pred DisconnectedCounterexample { + DisconnectedGraph and + some c: Chain | Algo[c] and not SpecAlgoEq[c] +} +assert SpecWhereEqAlgoLoweredDisconnected { not DisconnectedCounterexample } + +pred AliasCounterexample { + some c: Chain | ChainAliasReuse[c] and Algo[c] and not SpecAlgoEq[c] +} +assert SpecWhereEqAlgoLoweredAliasWhere { not AliasCounterexample } + +pred MixedWhereCounterexample { + some c: Chain | ChainMixedWhere[c] and Algo[c] and not SpecAlgoEq[c] +} +assert SpecWhereEqAlgoLoweredMixedWhere { not MixedWhereCounterexample } + +pred FilterMixCounterexample { + some c: Chain | ChainFilterMix[c] and Algo[c] and not SpecAlgoEq[c] +} +assert SpecWhereEqAlgoLoweredFilterMix { not FilterMixCounterexample } + +// Note: Contradictory WHERE checking (e.g., a.v == c.v AND a.v != c.v) is complex +// in this model because: +// - Eq checks: some vv IN (lvals & rvals) where vv = w.lhs.v AND vv = w.rhs.v +// - Neq checks: no (lvals & rvals) - requires EMPTY intersection +// These seem contradictory, but the model's value semantics are more nuanced. +// Contradictory constraint checking is covered by Python tests instead. 
+// See TestImpossibleConstraints in test_df_executor_inputs.py (10 tests) + +check SpecNoWhereEqAlgoNoWhere for 8 but 4 Step, 4 Value, 4 Binding, 1 Chain +check SpecWhereEqAlgoLowered for 8 but 4 Step, 4 Value, 4 Binding, 1 Chain + +// Debug-friendly smaller scopes +check SpecNoWhereEqAlgoNoWhereSmall for 4 but 3 Step, 3 Value, 3 Binding, 4 Node, 4 Edge, 1 Chain +check SpecWhereEqAlgoLoweredSmall for 4 but 3 Step, 3 Value, 3 Binding, 4 Node, 4 Edge, 1 Chain + +// Multi-chain sanity (small scope to keep solve time low) +check SpecNoWhereEqAlgoNoWhereMultiChain for 4 but 3 Step, 3 Value, 2 Binding, 4 Node, 4 Edge, 2 Chain +check SpecWhereEqAlgoLoweredMultiChain for 4 but 3 Step, 3 Value, 2 Binding, 4 Node, 4 Edge, 2 Chain + +// Multi-chain fuller scope (optional; gated via script env to keep runtime predictable) +check SpecNoWhereEqAlgoNoWhereMultiChainFull for 8 but 4 Step, 4 Value, 4 Binding, 2 Chain +check SpecWhereEqAlgoLoweredMultiChainFull for 8 but 4 Step, 4 Value, 4 Binding, 2 Chain + +// Scenario-specific coverage (smaller scopes to keep solving fast) +check SpecWhereEqAlgoLoweredFan for 6 but 3 Step, 3 Value, 3 Binding, 6 Node, 6 Edge, 1 Chain +check SpecWhereEqAlgoLoweredCycle for 6 but 3 Step, 3 Value, 3 Binding, 6 Node, 6 Edge, 1 Chain +check SpecWhereEqAlgoLoweredParallel for 6 but 3 Step, 3 Value, 3 Binding, 6 Node, 6 Edge, 1 Chain +check SpecWhereEqAlgoLoweredDisconnected for 6 but 3 Step, 3 Value, 3 Binding, 6 Node, 6 Edge, 1 Chain +check SpecWhereEqAlgoLoweredAliasWhere for 6 but 3 Step, 3 Value, 3 Binding, 6 Node, 6 Edge, 1 Chain +check SpecWhereEqAlgoLoweredMixedWhere for 6 but 3 Step, 3 Value, 3 Binding, 6 Node, 6 Edge, 1 Chain +check SpecWhereEqAlgoLoweredFilterMix for 6 but 3 Step, 3 Value, 3 Binding, 6 Node, 6 Edge, 1 Chain diff --git a/graphistry/compute/chain.py b/graphistry/compute/chain.py index 7a11c4edc3..08d125233c 100644 --- a/graphistry/compute/chain.py +++ b/graphistry/compute/chain.py @@ -1,6 +1,6 @@ import logging import pandas as pd -from typing import Dict, Union, cast, List, Tuple, Optional, TYPE_CHECKING +from typing import Dict, Union, cast, List, Tuple, Sequence, Optional, TYPE_CHECKING from graphistry.Engine import Engine, EngineAbstract, df_concat, df_to_engine, resolve_engine from graphistry.Plottable import Plottable @@ -12,6 +12,11 @@ from .typing import DataFrameT from .util import generate_safe_column_name from graphistry.compute.validate.validate_schema import validate_chain_schema +from graphistry.gfql.same_path_types import ( + WhereComparison, + parse_where_json, + where_to_json, +) from .gfql.policy import PolicyContext, PolicyException from .gfql.policy.stats import extract_graph_stats @@ -26,8 +31,14 @@ class Chain(ASTSerializable): - def __init__(self, chain: List[ASTObject], validate: bool = True) -> None: + def __init__( + self, + chain: List[ASTObject], + where: Optional[Sequence[WhereComparison]] = None, + validate: bool = True, + ) -> None: self.chain = chain + self.where = list(where or []) if validate: # Fail fast on invalid chains; matches documented automatic validation behavior self.validate(collect_all=False) @@ -120,7 +131,15 @@ def from_json(cls, d: Dict[str, JSONVal], validate: bool = True) -> 'Chain': f"Chain field must be a list, got {type(d['chain']).__name__}" ) - out = cls([ASTObject_from_json(op, validate=validate) for op in d['chain']], validate=validate) + where_raw = d.get('where') + where = parse_where_json( + cast(Optional[Sequence[Dict[str, Dict[str, str]]]], where_raw) + ) + out = cls( + 
[ASTObject_from_json(op, validate=validate) for op in d['chain']], + where=where, + validate=validate, + ) + return out def to_json(self, validate=True) -> Dict[str, JSONVal]: """ """ if validate: self.validate() - return { + data: Dict[str, JSONVal] = { 'type': self.__class__.__name__, 'chain': [op.to_json() for op in self.chain] } + if self.where: + data['where'] = where_to_json(self.where) + return data def validate_schema(self, g: Plottable, collect_all: bool = False) -> Optional[List['GFQLSchemaError']]: """Validate this chain against a graph's schema without executing. diff --git a/graphistry/compute/gfql/df_executor.py b/graphistry/compute/gfql/df_executor.py new file mode 100644 index 0000000000..cf4f9890a0 --- /dev/null +++ b/graphistry/compute/gfql/df_executor.py @@ -0,0 +1,1933 @@ +"""DataFrame-based GFQL executor with same-path WHERE planning. + +This module hosts the execution path for GFQL chains that require +same-path predicate enforcement. Works with both pandas and cuDF +DataFrames. + +ARCHITECTURE NOTE FOR AI ASSISTANTS +==================================== +This executor implements Yannakakis-style semijoin pruning for graph queries. +The same code path must work for BOTH pandas (CPU) and cuDF (GPU). + +CRITICAL: ALL operations must be VECTORIZED using DataFrame operations: +- Use merge() for joins +- Use groupby().agg() for summaries (min/max for <, >, <=, >=; value sets for ==) +- Use boolean masks for filtering +- Use .isin() for set membership + +NEVER use these anti-patterns (they break GPU and are slow on CPU): +- for loops over DataFrame rows (for row in df.iterrows()) +- for loops with zip over columns (for a, b in zip(df[x], df[y])) +- while loops for BFS/DFS graph traversal +- Building Python dicts/adjacency lists from DataFrame data +- .tolist() conversions followed by Python iteration + +For same-path predicates across multiple hops (e.g., a.val > c.threshold): +- Monotone (<, >, <=, >=): Propagate min/max summaries hop-by-hop via groupby +- Equality (==, !=): Propagate value sets via state tables (merge + groupby) + +Example of CORRECT vectorized multi-hop summary propagation: + # Forward: propagate max(a.val) through edges to node c + e1_with_a = edges_e1.merge(nodes_a[['id', 'val']], left_on='src', right_on='id') + max_at_b = e1_with_a.groupby('dst')['val'].max().reset_index(name='max_a_val') + e2_with_b = edges_e2.merge(max_at_b, left_on='src', right_on='dst', suffixes=('', '_b')) + max_at_c = e2_with_b.groupby('dst')['max_a_val'].max().reset_index() + # Filter: keep c nodes where max_a_val > threshold + valid_c = nodes_c.merge(max_at_c, left_on='id', right_on='dst') + valid_c = valid_c[valid_c['max_a_val'] > valid_c['threshold']] + +See plan.md for full Yannakakis algorithm explanation and refactoring notes.
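+
+Example (illustrative; frames named as above) of CORRECT value-set propagation
+for equality predicates, carrying (node, a.val) pairs as a state table instead
+of scalar min/max summaries:
+    # Forward: carry candidate a.val values alongside reachable nodes
+    st = edges_e1.merge(nodes_a[['id', 'val']], left_on='src', right_on='id')
+    st = st[['dst', 'val']].rename(columns={'dst': 'node'}).drop_duplicates()
+    st = edges_e2.merge(st, left_on='src', right_on='node')
+    st = st[['dst', 'val']].rename(columns={'dst': 'node'}).drop_duplicates()
+    # Filter: keep c nodes where some same-path a.val equals c.val
+    valid_c = nodes_c.merge(st.rename(columns={'node': 'id'}), on=['id', 'val'])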
+""" + +from __future__ import annotations + +import os +from collections import defaultdict +from dataclasses import dataclass +from typing import Dict, Literal, Sequence, Set, List, Optional, Any, Tuple + +import pandas as pd + +from graphistry.Engine import Engine, safe_merge +from graphistry.Plottable import Plottable +from graphistry.compute.ast import ASTCall, ASTEdge, ASTNode, ASTObject +from graphistry.gfql.ref.enumerator import OracleCaps, OracleResult, enumerate_chain +from graphistry.gfql.same_path_plan import SamePathPlan, plan_same_path +from graphistry.gfql.same_path_types import WhereComparison +from graphistry.compute.typing import DataFrameT + +AliasKind = Literal["node", "edge"] + +__all__ = [ + "AliasBinding", + "SamePathExecutorInputs", + "DFSamePathExecutor", + "build_same_path_inputs", + "execute_same_path_chain", +] + +_CUDF_MODE_ENV = "GRAPHISTRY_CUDF_SAME_PATH_MODE" + + +@dataclass(frozen=True) +class AliasBinding: + """Metadata describing which chain step an alias refers to.""" + + alias: str + step_index: int + kind: AliasKind + ast: ASTObject + + +@dataclass(frozen=True) +class SamePathExecutorInputs: + """Container for all metadata needed by the cuDF executor.""" + + graph: Plottable + chain: Sequence[ASTObject] + where: Sequence[WhereComparison] + plan: SamePathPlan + engine: Engine + alias_bindings: Dict[str, AliasBinding] + column_requirements: Dict[str, Set[str]] + include_paths: bool = False + + +class DFSamePathExecutor: + """Runs a forward/backward/forward pass using pandas or cuDF dataframes.""" + + def __init__(self, inputs: SamePathExecutorInputs) -> None: + self.inputs = inputs + self.forward_steps: List[Plottable] = [] + self.alias_frames: Dict[str, DataFrameT] = {} + self._node_column = inputs.graph._node + self._edge_column = inputs.graph._edge + self._source_column = inputs.graph._source + self._destination_column = inputs.graph._destination + self._minmax_summaries: Dict[str, Dict[str, DataFrameT]] = defaultdict(dict) + self._equality_values: Dict[str, Dict[str, Set[Any]]] = defaultdict(dict) + + def run(self) -> Plottable: + """Execute full cuDF traversal. + + Currently defaults to an oracle-backed path unless GPU kernels are + explicitly enabled and available. Alias frames are updated from the + oracle tags so downstream consumers can inspect per-alias bindings. 
+ """ + self._forward() + if self._should_attempt_gpu(): + return self._run_gpu() + return self._run_oracle() + + def _forward(self) -> None: + graph = self.inputs.graph + ops = self.inputs.chain + self.forward_steps = [] + + for idx, op in enumerate(ops): + if isinstance(op, ASTCall): + current_g = self.forward_steps[-1] if self.forward_steps else graph + prev_nodes = None + else: + current_g = graph + prev_nodes = ( + None if not self.forward_steps else self.forward_steps[-1]._nodes + ) + g_step = op( + g=current_g, + prev_node_wavefront=prev_nodes, + target_wave_front=None, + engine=self.inputs.engine, + ) + self.forward_steps.append(g_step) + self._capture_alias_frame(op, g_step, idx) + + def _backward(self) -> None: + raise NotImplementedError + + def _finalize(self) -> Plottable: + raise NotImplementedError + + def _capture_alias_frame( + self, op: ASTObject, step_result: Plottable, step_index: int + ) -> None: + alias = getattr(op, "_name", None) + if not alias or alias not in self.inputs.alias_bindings: + return + binding = self.inputs.alias_bindings[alias] + frame = ( + step_result._nodes + if binding.kind == "node" + else step_result._edges + ) + if frame is None: + kind = "node" if binding.kind == "node" else "edge" + raise ValueError( + f"Alias '{alias}' did not produce a {kind} frame" + ) + required = set(self.inputs.column_requirements.get(alias, set())) + id_col = self._node_column if binding.kind == "node" else self._edge_column + if id_col: + required.add(id_col) + missing = [col for col in required if col not in frame.columns] + if missing: + cols = ", ".join(missing) + raise ValueError( + f"Alias '{alias}' missing required columns: {cols}" + ) + subset_cols = [col for col in required] + alias_frame = frame[subset_cols].copy() + self.alias_frames[alias] = alias_frame + self._capture_minmax(alias, alias_frame, id_col) + self._capture_equality_values(alias, alias_frame) + self._apply_ready_clauses() + + # --- Execution selection helpers ------------------------------------------------- + + def _should_attempt_gpu(self) -> bool: + """Decide whether to try GPU kernels for same-path execution.""" + + mode = os.environ.get(_CUDF_MODE_ENV, "auto").lower() + if mode not in {"auto", "oracle", "strict"}: + mode = "auto" + + # force oracle path + if mode == "oracle": + return False + + # only CUDF engine supports GPU fastpath + if self.inputs.engine != Engine.CUDF: + return False + + try: # check cudf presence + import cudf # type: ignore # noqa: F401 + except Exception: + if mode == "strict": + raise RuntimeError( + "cuDF engine requested with strict mode but cudf is unavailable" + ) + return False + return True + + # --- Oracle (CPU) fallback ------------------------------------------------------- + + def _run_oracle(self) -> Plottable: + oracle = enumerate_chain( + self.inputs.graph, + self.inputs.chain, + where=self.inputs.where, + include_paths=self.inputs.include_paths, + caps=OracleCaps( + max_nodes=1000, max_edges=5000, max_length=20, max_partial_rows=1_000_000 + ), + ) + nodes_df, edges_df = self._apply_oracle_hop_labels(oracle) + self._update_alias_frames_from_oracle(oracle.tags) + return self._materialize_from_oracle(nodes_df, edges_df) + + # --- Native vectorized path (pandas + cuDF) --------------------------------------- + + def _run_native(self) -> Plottable: + """Native vectorized path using backward-prune for same-path filtering. + + Works for both pandas and cuDF engines. 
Uses Yannakakis-style semijoin + pruning to filter nodes/edges that participate in valid paths. + """ + allowed_tags = self._compute_allowed_tags() + path_state = self._backward_prune(allowed_tags) + # Apply non-adjacent equality constraints after backward prune + path_state = self._apply_non_adjacent_where_post_prune(path_state) + return self._materialize_filtered(path_state) + + # Alias for backwards compatibility + _run_gpu = _run_native + + def _update_alias_frames_from_oracle( + self, tags: Dict[str, Set[Any]] + ) -> None: + """Filter captured frames using oracle tags to ensure path coherence.""" + + for alias, binding in self.inputs.alias_bindings.items(): + if alias not in tags: + # if oracle didn't emit the alias, leave any existing capture intact + continue + ids = tags.get(alias, set()) + frame = self._lookup_binding_frame(binding) + if frame is None: + continue + id_col = self._node_column if binding.kind == "node" else self._edge_column + if id_col is None: + continue + filtered = frame[frame[id_col].isin(ids)].copy() + self.alias_frames[alias] = filtered + + def _lookup_binding_frame(self, binding: AliasBinding) -> Optional[DataFrameT]: + if binding.step_index >= len(self.forward_steps): + return None + step_result = self.forward_steps[binding.step_index] + return ( + step_result._nodes + if binding.kind == "node" + else step_result._edges + ) + + def _materialize_from_oracle( + self, nodes_df: DataFrameT, edges_df: DataFrameT + ) -> Plottable: + """Build a Plottable from oracle node/edge outputs, preserving bindings.""" + + g = self.inputs.graph + edge_id = g._edge + src = g._source + dst = g._destination + node_id = g._node + + if node_id and node_id not in nodes_df.columns: + raise ValueError(f"Oracle nodes missing id column '{node_id}'") + if dst and dst not in edges_df.columns: + raise ValueError(f"Oracle edges missing destination column '{dst}'") + if src and src not in edges_df.columns: + raise ValueError(f"Oracle edges missing source column '{src}'") + if edge_id and edge_id not in edges_df.columns: + # Enumerators may synthesize an edge id column when original graph lacked one + if "__enumerator_edge_id__" in edges_df.columns: + edges_df = edges_df.rename(columns={"__enumerator_edge_id__": edge_id}) + else: + raise ValueError(f"Oracle edges missing id column '{edge_id}'") + + g_out = g.nodes(nodes_df, node=node_id) + g_out = g_out.edges(edges_df, source=src, destination=dst, edge=edge_id) + return g_out + + # --- GPU helpers --------------------------------------------------------------- + + def _compute_allowed_tags(self) -> Dict[str, Set[Any]]: + """Seed allowed ids from alias frames (post-forward pruning).""" + + out: Dict[str, Set[Any]] = {} + for alias, binding in self.inputs.alias_bindings.items(): + frame = self.alias_frames.get(alias) + if frame is None: + continue + id_col = self._node_column if binding.kind == "node" else self._edge_column + if id_col is None or id_col not in frame.columns: + continue + out[alias] = self._series_values(frame[id_col]) + return out + + def _are_aliases_adjacent(self, alias1: str, alias2: str) -> bool: + """Check if two node aliases are exactly one edge apart in the chain.""" + binding1 = self.inputs.alias_bindings.get(alias1) + binding2 = self.inputs.alias_bindings.get(alias2) + if binding1 is None or binding2 is None: + return False + # Only consider node aliases for adjacency + if binding1.kind != "node" or binding2.kind != "node": + return False + # Adjacent nodes are exactly 2 step indices apart (n-e-n pattern) + return 
abs(binding1.step_index - binding2.step_index) == 2 + + def _apply_non_adjacent_where_post_prune( + self, path_state: "_PathState" + ) -> "_PathState": + """ + Apply WHERE constraints between non-adjacent aliases after backward prune. + + For equality clauses like a.id == c.id where a and c are 2+ edges apart, + we need to trace actual paths to find which (start, end) pairs satisfy + the constraint, then filter nodes/edges accordingly. + """ + if not self.inputs.where: + return path_state + + # Find non-adjacent WHERE clauses + non_adjacent_clauses = [] + for clause in self.inputs.where: + left_alias = clause.left.alias + right_alias = clause.right.alias + if not self._are_aliases_adjacent(left_alias, right_alias): + left_binding = self.inputs.alias_bindings.get(left_alias) + right_binding = self.inputs.alias_bindings.get(right_alias) + if left_binding and right_binding: + if left_binding.kind == "node" and right_binding.kind == "node": + non_adjacent_clauses.append(clause) + + if not non_adjacent_clauses: + return path_state + + # Get node and edge indices in chain order + node_indices: List[int] = [] + edge_indices: List[int] = [] + for idx, op in enumerate(self.inputs.chain): + if isinstance(op, ASTNode): + node_indices.append(idx) + elif isinstance(op, ASTEdge): + edge_indices.append(idx) + + # Build adjacency for path tracing (forward direction only for now) + # Maps (src_node_id) -> list of (edge_step_idx, edge_id, dst_node_id) + src_col = self._source_column + dst_col = self._destination_column + edge_id_col = self._edge_column + + if not src_col or not dst_col: + return path_state + + # For each non-adjacent clause, trace paths and filter + for clause in non_adjacent_clauses: + left_alias = clause.left.alias + right_alias = clause.right.alias + left_binding = self.inputs.alias_bindings[left_alias] + right_binding = self.inputs.alias_bindings[right_alias] + + # Ensure left is before right in chain + if left_binding.step_index > right_binding.step_index: + left_alias, right_alias = right_alias, left_alias + left_binding, right_binding = right_binding, left_binding + + start_node_idx = left_binding.step_index + end_node_idx = right_binding.step_index + + # Get edge indices between start and end node positions + relevant_edge_indices = [ + idx for idx in edge_indices + if start_node_idx < idx < end_node_idx + ] + + # Trace paths from start nodes to end nodes + start_nodes = path_state.allowed_nodes.get(start_node_idx, set()) + end_nodes = path_state.allowed_nodes.get(end_node_idx, set()) + + if not start_nodes or not end_nodes: + continue + + # Get column values for the constraint + # IMPORTANT: Use the original graph's node DataFrame, not alias_frames, + # because alias_frames can be incomplete (populated during forward phase + # but backward prune may add more allowed nodes). 
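+            # The vectorized trace below seeds a (__current__, __start__) state
+            # table at the start alias, propagates it through each relevant
+            # edge step (honoring direction and hop ranges), and finally joins
+            # the start/end value frames so the WHERE comparison evaluates as
+            # a single boolean mask rather than per-path Python loops.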
+ left_col = clause.left.column + right_col = clause.right.column + node_id_col = self._node_column + if not node_id_col: + continue + + nodes_df = self.inputs.graph._nodes + if nodes_df is None or node_id_col not in nodes_df.columns: + continue + + # Build value DataFrames from the original graph nodes + # Filter to start_nodes/end_nodes for efficiency + left_values_df = None + if left_col in nodes_df.columns: + if node_id_col == left_col: + # Same column - just use node IDs + left_values_df = nodes_df[nodes_df[node_id_col].isin(start_nodes)][[node_id_col]].drop_duplicates().copy() + left_values_df.columns = ['__start__'] + left_values_df['__start_val__'] = left_values_df['__start__'] + else: + left_values_df = nodes_df[nodes_df[node_id_col].isin(start_nodes)][[node_id_col, left_col]].drop_duplicates().rename( + columns={node_id_col: '__start__', left_col: '__start_val__'} + ) + + right_values_df = None + if right_col in nodes_df.columns: + if node_id_col == right_col: + # Same column - just use node IDs + right_values_df = nodes_df[nodes_df[node_id_col].isin(end_nodes)][[node_id_col]].drop_duplicates().copy() + right_values_df.columns = ['__current__'] + right_values_df['__end_val__'] = right_values_df['__current__'] + else: + right_values_df = nodes_df[nodes_df[node_id_col].isin(end_nodes)][[node_id_col, right_col]].drop_duplicates().rename( + columns={node_id_col: '__current__', right_col: '__end_val__'} + ) + + # Vectorized path tracing using state table propagation + # State table: (current_node, start_node) pairs - which starts can reach each node + # left_values_df is already filtered to start_nodes + if left_values_df is not None and len(left_values_df) > 0: + state_df = left_values_df[['__start__']].copy() + state_df['__current__'] = state_df['__start__'] + else: + state_df = pd.DataFrame(columns=['__current__', '__start__']) + + for edge_idx in relevant_edge_indices: + edges_df = self.forward_steps[edge_idx]._edges + if edges_df is None or len(state_df) == 0: + break + + # Filter edges to allowed edges + allowed_edges = path_state.allowed_edges.get(edge_idx, None) + if allowed_edges is not None and edge_id_col and edge_id_col in edges_df.columns: + edges_df = edges_df[edges_df[edge_id_col].isin(list(allowed_edges))] + + edge_op = self.inputs.chain[edge_idx] + is_reverse = isinstance(edge_op, ASTEdge) and edge_op.direction == "reverse" + is_undirected = isinstance(edge_op, ASTEdge) and edge_op.direction == "undirected" + is_multihop = isinstance(edge_op, ASTEdge) and not self._is_single_hop(edge_op) + + if is_multihop and isinstance(edge_op, ASTEdge): + # For multi-hop, propagate state through multiple hops + min_hops = edge_op.min_hops if edge_op.min_hops is not None else 1 + max_hops = edge_op.max_hops if edge_op.max_hops is not None else ( + edge_op.hops if edge_op.hops is not None else 1 + ) + + # Build edge pairs based on direction + if is_undirected: + edge_pairs = pd.concat([ + edges_df[[src_col, dst_col]].rename(columns={src_col: '__from__', dst_col: '__to__'}), + edges_df[[dst_col, src_col]].rename(columns={dst_col: '__from__', src_col: '__to__'}) + ], ignore_index=True).drop_duplicates() + elif is_reverse: + edge_pairs = edges_df[[dst_col, src_col]].rename(columns={dst_col: '__from__', src_col: '__to__'}) + else: + edge_pairs = edges_df[[src_col, dst_col]].rename(columns={src_col: '__from__', dst_col: '__to__'}) + + # Propagate state through hops + all_reachable = [state_df.copy()] + current_state = state_df.copy() + + for hop in range(1, max_hops + 1): + # Propagate 
current_state through one hop + next_state = edge_pairs.merge( + current_state, left_on='__from__', right_on='__current__', how='inner' + )[['__to__', '__start__']].rename(columns={'__to__': '__current__'}).drop_duplicates() + + if len(next_state) == 0: + break + + if hop >= min_hops: + all_reachable.append(next_state) + current_state = next_state + + # Combine all reachable states + if len(all_reachable) > 1: + state_df = pd.concat(all_reachable[1:], ignore_index=True).drop_duplicates() + else: + state_df = pd.DataFrame(columns=['__current__', '__start__']) + else: + # Single-hop: propagate state through one hop + if is_undirected: + # Both directions + next1 = edges_df.merge( + state_df, left_on=src_col, right_on='__current__', how='inner' + )[[dst_col, '__start__']].rename(columns={dst_col: '__current__'}) + next2 = edges_df.merge( + state_df, left_on=dst_col, right_on='__current__', how='inner' + )[[src_col, '__start__']].rename(columns={src_col: '__current__'}) + state_df = pd.concat([next1, next2], ignore_index=True).drop_duplicates() + elif is_reverse: + state_df = edges_df.merge( + state_df, left_on=dst_col, right_on='__current__', how='inner' + )[[src_col, '__start__']].rename(columns={src_col: '__current__'}).drop_duplicates() + else: + state_df = edges_df.merge( + state_df, left_on=src_col, right_on='__current__', how='inner' + )[[dst_col, '__start__']].rename(columns={dst_col: '__current__'}).drop_duplicates() + + # state_df now has (current_node=end_node, start_node) pairs + # Filter to valid end nodes + state_df = state_df[state_df['__current__'].isin(end_nodes)] + + if len(state_df) == 0: + # No valid paths found + if start_node_idx in path_state.allowed_nodes: + path_state.allowed_nodes[start_node_idx] = set() + if end_node_idx in path_state.allowed_nodes: + path_state.allowed_nodes[end_node_idx] = set() + continue + + # Join with start and end values to apply WHERE clause + # left_values_df and right_values_df were built earlier (vectorized) + if left_values_df is None or right_values_df is None: + continue + + pairs_df = state_df.merge(left_values_df, on='__start__', how='inner') + pairs_df = pairs_df.merge(right_values_df, on='__current__', how='inner') + + # Apply the comparison vectorized + mask = self._evaluate_clause(pairs_df['__start_val__'], clause.op, pairs_df['__end_val__']) + valid_pairs = pairs_df[mask] + + valid_starts = set(valid_pairs['__start__'].tolist()) + valid_ends = set(valid_pairs['__current__'].tolist()) + + # Update allowed_nodes for start and end positions + if start_node_idx in path_state.allowed_nodes: + path_state.allowed_nodes[start_node_idx] &= valid_starts + if end_node_idx in path_state.allowed_nodes: + path_state.allowed_nodes[end_node_idx] &= valid_ends + + # Re-propagate constraints backward from the filtered ends + # to update intermediate nodes and edges + self._re_propagate_backward( + path_state, node_indices, edge_indices, + start_node_idx, end_node_idx + ) + + return path_state + + def _re_propagate_backward( + self, + path_state: "_PathState", + node_indices: List[int], + edge_indices: List[int], + start_idx: int, + end_idx: int, + ) -> None: + """Re-propagate constraints backward after filtering non-adjacent nodes.""" + src_col = self._source_column + dst_col = self._destination_column + edge_id_col = self._edge_column + + if not src_col or not dst_col: + return + + # Walk backward from end to start + relevant_edge_indices = [idx for idx in edge_indices if start_idx < idx < end_idx] + + for edge_idx in 
reversed(relevant_edge_indices): + # Find the node indices this edge connects + edge_pos = edge_indices.index(edge_idx) + left_node_idx = node_indices[edge_pos] + right_node_idx = node_indices[edge_pos + 1] + + edges_df = self.forward_steps[edge_idx]._edges + if edges_df is None: + continue + + original_len = len(edges_df) + + # Filter by allowed edges + allowed_edges = path_state.allowed_edges.get(edge_idx, None) + if allowed_edges is not None and edge_id_col and edge_id_col in edges_df.columns: + edges_df = edges_df[edges_df[edge_id_col].isin(list(allowed_edges))] + + # Get edge direction and check if multi-hop + edge_op = self.inputs.chain[edge_idx] + is_reverse = isinstance(edge_op, ASTEdge) and edge_op.direction == "reverse" + is_multihop = isinstance(edge_op, ASTEdge) and not self._is_single_hop(edge_op) + + # Filter edges by allowed left (src) and right (dst) nodes + left_allowed = path_state.allowed_nodes.get(left_node_idx, set()) + right_allowed = path_state.allowed_nodes.get(right_node_idx, set()) + + is_undirected = isinstance(edge_op, ASTEdge) and edge_op.direction == "undirected" + if is_multihop and isinstance(edge_op, ASTEdge): + # For multi-hop edges, we need to trace valid paths from left_allowed + # to right_allowed, keeping all edges that participate in valid paths. + # Simple src/dst filtering would incorrectly remove intermediate edges. + edges_df = self._filter_multihop_edges_by_endpoints( + edges_df, edge_op, left_allowed, right_allowed, is_reverse, is_undirected + ) + else: + # Single-hop: filter by src/dst directly + if is_undirected: + # Undirected: edge connects left and right in either direction + if left_allowed and right_allowed: + left_set = list(left_allowed) + right_set = list(right_allowed) + # Keep edges where (src in left and dst in right) OR (dst in left and src in right) + mask = ( + (edges_df[src_col].isin(left_set) & edges_df[dst_col].isin(right_set)) + | (edges_df[dst_col].isin(left_set) & edges_df[src_col].isin(right_set)) + ) + edges_df = edges_df[mask] + elif left_allowed: + left_set = list(left_allowed) + edges_df = edges_df[ + edges_df[src_col].isin(left_set) | edges_df[dst_col].isin(left_set) + ] + elif right_allowed: + right_set = list(right_allowed) + edges_df = edges_df[ + edges_df[src_col].isin(right_set) | edges_df[dst_col].isin(right_set) + ] + elif is_reverse: + # Reverse: src is right side, dst is left side + if right_allowed: + edges_df = edges_df[edges_df[src_col].isin(list(right_allowed))] + if left_allowed: + edges_df = edges_df[edges_df[dst_col].isin(list(left_allowed))] + else: + # Forward: src is left side, dst is right side + if left_allowed: + edges_df = edges_df[edges_df[src_col].isin(list(left_allowed))] + if right_allowed: + edges_df = edges_df[edges_df[dst_col].isin(list(right_allowed))] + + # Update allowed edges + if edge_id_col and edge_id_col in edges_df.columns: + new_edge_ids = set(edges_df[edge_id_col].tolist()) + if edge_idx in path_state.allowed_edges: + path_state.allowed_edges[edge_idx] &= new_edge_ids + else: + path_state.allowed_edges[edge_idx] = new_edge_ids + + # Update allowed left (src) nodes based on filtered edges + if is_multihop and isinstance(edge_op, ASTEdge): + # For multi-hop, the "left" nodes are those that can START paths + # to reach right_allowed within the hop constraints + new_src_nodes = self._find_multihop_start_nodes( + edges_df, edge_op, right_allowed, is_reverse, is_undirected + ) + else: + if is_undirected: + # Undirected: source nodes can be either src or dst + new_src_nodes = 
set(edges_df[src_col].tolist()) | set(edges_df[dst_col].tolist()) + elif is_reverse: + new_src_nodes = set(edges_df[dst_col].tolist()) + else: + new_src_nodes = set(edges_df[src_col].tolist()) + + if left_node_idx in path_state.allowed_nodes: + path_state.allowed_nodes[left_node_idx] &= new_src_nodes + else: + path_state.allowed_nodes[left_node_idx] = new_src_nodes + + # Persist filtered edges to forward_steps (important when no edge ID column) + if len(edges_df) < original_len: + self.forward_steps[edge_idx]._edges = edges_df + + def _filter_multihop_edges_by_endpoints( + self, + edges_df: DataFrameT, + edge_op: ASTEdge, + left_allowed: Set[Any], + right_allowed: Set[Any], + is_reverse: bool, + is_undirected: bool = False, + ) -> DataFrameT: + """ + Filter multi-hop edges to only those participating in valid paths + from left_allowed to right_allowed. + + Uses vectorized bidirectional reachability propagation: + 1. Forward: find nodes reachable from left_allowed at each hop + 2. Backward: find nodes that can reach right_allowed at each hop + 3. Keep edges connecting forward-reachable to backward-reachable nodes + """ + src_col = self._source_column + dst_col = self._destination_column + + if not src_col or not dst_col or not left_allowed or not right_allowed: + return edges_df + + min_hops = edge_op.min_hops if edge_op.min_hops is not None else 1 + max_hops = edge_op.max_hops if edge_op.max_hops is not None else ( + edge_op.hops if edge_op.hops is not None else 1 + ) + + # Build edge pairs for traversal based on direction + if is_undirected: + edges_fwd = edges_df[[src_col, dst_col]].copy() + edges_fwd.columns = pd.Index(['__from__', '__to__']) + edges_rev = edges_df[[dst_col, src_col]].copy() + edges_rev.columns = pd.Index(['__from__', '__to__']) + edge_pairs = pd.concat([edges_fwd, edges_rev], ignore_index=True).drop_duplicates() + elif is_reverse: + edge_pairs = edges_df[[dst_col, src_col]].copy() + edge_pairs.columns = pd.Index(['__from__', '__to__']) + else: + edge_pairs = edges_df[[src_col, dst_col]].copy() + edge_pairs.columns = pd.Index(['__from__', '__to__']) + + # Forward reachability: nodes reachable from left_allowed at each hop distance + # Use DataFrame-based tracking throughout (no Python sets) + # fwd_df tracks (node, min_hop) for all reachable nodes + fwd_df = pd.DataFrame({'__node__': list(left_allowed), '__fwd_hop__': 0}) + all_fwd_df = fwd_df.copy() + + for hop in range(1, max_hops): # max_hops-1 because edge adds 1 more + # Get frontier (nodes at previous hop) + frontier_df = fwd_df[fwd_df['__fwd_hop__'] == hop - 1][['__node__']].rename( + columns={'__node__': '__from__'} + ) + if len(frontier_df) == 0: + break + # Propagate through edges + next_nodes_df = edge_pairs.merge(frontier_df, on='__from__', how='inner')[['__to__']].drop_duplicates() + next_nodes_df = next_nodes_df.rename(columns={'__to__': '__node__'}) + next_nodes_df['__fwd_hop__'] = hop + # Anti-join: keep only nodes not yet seen + merged = next_nodes_df.merge(all_fwd_df[['__node__']], on='__node__', how='left', indicator=True) + new_nodes_df = merged[merged['_merge'] == 'left_only'][['__node__', '__fwd_hop__']] + if len(new_nodes_df) == 0: + break + fwd_df = pd.concat([fwd_df, new_nodes_df], ignore_index=True) + all_fwd_df = pd.concat([all_fwd_df, new_nodes_df], ignore_index=True) + + # Backward reachability: nodes that can reach right_allowed at each hop distance + rev_edge_pairs = edge_pairs.rename(columns={'__from__': '__to__', '__to__': '__from__'}) + + bwd_df = pd.DataFrame({'__node__': 
list(right_allowed), '__bwd_hop__': 0}) + all_bwd_df = bwd_df.copy() + + for hop in range(1, max_hops): # max_hops-1 because edge adds 1 more + frontier_df = bwd_df[bwd_df['__bwd_hop__'] == hop - 1][['__node__']].rename( + columns={'__node__': '__from__'} + ) + if len(frontier_df) == 0: + break + next_nodes_df = rev_edge_pairs.merge(frontier_df, on='__from__', how='inner')[['__to__']].drop_duplicates() + next_nodes_df = next_nodes_df.rename(columns={'__to__': '__node__'}) + next_nodes_df['__bwd_hop__'] = hop + # Anti-join: keep only nodes not yet seen + merged = next_nodes_df.merge(all_bwd_df[['__node__']], on='__node__', how='left', indicator=True) + new_nodes_df = merged[merged['_merge'] == 'left_only'][['__node__', '__bwd_hop__']] + if len(new_nodes_df) == 0: + break + bwd_df = pd.concat([bwd_df, new_nodes_df], ignore_index=True) + all_bwd_df = pd.concat([all_bwd_df, new_nodes_df], ignore_index=True) + + # An edge (u, v) is valid if: + # - u is forward-reachable at hop h_fwd (path length from left_allowed to u) + # - v is backward-reachable at hop h_bwd (path length from v to right_allowed) + # - h_fwd + 1 + h_bwd is in [min_hops, max_hops] + if len(fwd_df) == 0 or len(bwd_df) == 0: + return edges_df.iloc[:0] + + # For nodes reachable at multiple hops, keep the minimum + fwd_df = fwd_df.groupby('__node__')['__fwd_hop__'].min().reset_index() + bwd_df = bwd_df.groupby('__node__')['__bwd_hop__'].min().reset_index() + + # Join edges with hop distances + if is_undirected: + # For undirected, check both directions + # Direction 1: src is fwd, dst is bwd + edges_annotated1 = edges_df.merge( + fwd_df, left_on=src_col, right_on='__node__', how='inner' + ).merge( + bwd_df, left_on=dst_col, right_on='__node__', how='inner', suffixes=('', '_bwd') + ) + edges_annotated1['__total_hops__'] = edges_annotated1['__fwd_hop__'] + 1 + edges_annotated1['__bwd_hop__'] + valid1 = edges_annotated1[ + (edges_annotated1['__total_hops__'] >= min_hops) + & (edges_annotated1['__total_hops__'] <= max_hops) + ] + + # Direction 2: dst is fwd, src is bwd + edges_annotated2 = edges_df.merge( + fwd_df, left_on=dst_col, right_on='__node__', how='inner' + ).merge( + bwd_df, left_on=src_col, right_on='__node__', how='inner', suffixes=('', '_bwd') + ) + edges_annotated2['__total_hops__'] = edges_annotated2['__fwd_hop__'] + 1 + edges_annotated2['__bwd_hop__'] + valid2 = edges_annotated2[ + (edges_annotated2['__total_hops__'] >= min_hops) + & (edges_annotated2['__total_hops__'] <= max_hops) + ] + + # Get original edge columns only + orig_cols = list(edges_df.columns) + valid_edges = pd.concat([valid1[orig_cols], valid2[orig_cols]], ignore_index=True).drop_duplicates() + return valid_edges + else: + # Determine which column is "source" (fwd) and which is "dest" (bwd) + if is_reverse: + fwd_col, bwd_col = dst_col, src_col + else: + fwd_col, bwd_col = src_col, dst_col + + edges_annotated = edges_df.merge( + fwd_df, left_on=fwd_col, right_on='__node__', how='inner' + ).merge( + bwd_df, left_on=bwd_col, right_on='__node__', how='inner', suffixes=('', '_bwd') + ) + edges_annotated['__total_hops__'] = edges_annotated['__fwd_hop__'] + 1 + edges_annotated['__bwd_hop__'] + + valid_edges = edges_annotated[ + (edges_annotated['__total_hops__'] >= min_hops) + & (edges_annotated['__total_hops__'] <= max_hops) + ] + + # Return only original columns + orig_cols = list(edges_df.columns) + return valid_edges[orig_cols] + + def _find_multihop_start_nodes( + self, + edges_df: DataFrameT, + edge_op: ASTEdge, + right_allowed: Set[Any], + is_reverse: 
bool, + is_undirected: bool = False, + ) -> Set[Any]: + """ + Find nodes that can start multi-hop paths reaching right_allowed. + + Uses vectorized hop-by-hop backward propagation via merge+groupby. + """ + src_col = self._source_column + dst_col = self._destination_column + + if not src_col or not dst_col or not right_allowed: + return set() + + min_hops = edge_op.min_hops if edge_op.min_hops is not None else 1 + max_hops = edge_op.max_hops if edge_op.max_hops is not None else ( + edge_op.hops if edge_op.hops is not None else 1 + ) + + # Determine edge direction for backward traversal + # Forward edges: src->dst, backward: dst->src + # Reverse edges: dst->src, backward: src->dst + # Undirected: both directions + if is_undirected: + # For undirected, we need edges in both directions + # Create a DataFrame with both (src, dst) and (dst, src) as edges + edges_fwd = edges_df[[src_col, dst_col]].rename( + columns={src_col: '__from__', dst_col: '__to__'} + ) + edges_rev = edges_df[[dst_col, src_col]].rename( + columns={dst_col: '__from__', src_col: '__to__'} + ) + edge_pairs = pd.concat([edges_fwd, edges_rev], ignore_index=True).drop_duplicates() + elif is_reverse: + # Reverse: traversal goes dst->src, backward trace goes src->dst + edge_pairs = edges_df[[src_col, dst_col]].rename( + columns={src_col: '__from__', dst_col: '__to__'} + ).drop_duplicates() + else: + # Forward: traversal goes src->dst, backward trace goes dst->src + edge_pairs = edges_df[[dst_col, src_col]].rename( + columns={dst_col: '__from__', src_col: '__to__'} + ).drop_duplicates() + + # Vectorized backward BFS: propagate reachability hop by hop + # Use DataFrame-based tracking throughout (no Python sets internally) + # Start with right_allowed as target destinations (hop 0 means "at the destination") + # We trace backward to find nodes that can REACH these destinations + frontier = pd.DataFrame({'__node__': list(right_allowed)}) + all_visited = frontier.copy() + valid_starts_frames: List[DataFrameT] = [] + + # Collect nodes at each hop distance FROM the destination + for hop in range(1, max_hops + 1): + # Join with edges to find nodes one hop back from frontier + # edge_pairs: __from__ = dst (target), __to__ = src (predecessor) + # We want nodes (__to__) that can reach frontier nodes (__from__) + new_frontier = edge_pairs.merge( + frontier, + left_on='__from__', + right_on='__node__', + how='inner' + )[['__to__']].drop_duplicates() + + if len(new_frontier) == 0: + break + + new_frontier = new_frontier.rename(columns={'__to__': '__node__'}) + + # Collect valid starts (nodes at hop distance in [min_hops, max_hops]) + # These are nodes that can reach right_allowed in exactly `hop` hops + if hop >= min_hops: + valid_starts_frames.append(new_frontier[['__node__']]) + + # Anti-join: filter out nodes already visited to avoid infinite loops + # But still keep nodes for valid_starts even if visited before at different hop + merged = new_frontier.merge( + all_visited[['__node__']], on='__node__', how='left', indicator=True + ) + unvisited = merged[merged['_merge'] == 'left_only'][['__node__']] + + if len(unvisited) == 0: + break + + frontier = unvisited + all_visited = pd.concat([all_visited, unvisited], ignore_index=True) + + # Combine all valid starts and convert to set (caller expects set) + if valid_starts_frames: + valid_starts_df = pd.concat(valid_starts_frames, ignore_index=True).drop_duplicates() + return set(valid_starts_df['__node__'].tolist()) + return set() + + def _capture_minmax( + self, alias: str, frame: DataFrameT, 
id_col: Optional[str] + ) -> None: + if not id_col: + return + cols = self.inputs.column_requirements.get(alias, set()) + target_cols = [ + col for col in cols if self.inputs.plan.requires_minmax(alias) and col in frame.columns + ] + if not target_cols: + return + grouped = frame.groupby(id_col) + for col in target_cols: + summary = grouped[col].agg(["min", "max"]).reset_index() + self._minmax_summaries[alias][col] = summary + + def _capture_equality_values( + self, alias: str, frame: DataFrameT + ) -> None: + cols = self.inputs.column_requirements.get(alias, set()) + participates = any( + alias in bitset.aliases for bitset in self.inputs.plan.bitsets.values() + ) + if not participates: + return + for col in cols: + if col in frame.columns: + self._equality_values[alias][col] = self._series_values(frame[col]) + + @dataclass + class _PathState: + allowed_nodes: Dict[int, Set[Any]] + allowed_edges: Dict[int, Set[Any]] + + def _backward_prune(self, allowed_tags: Dict[str, Set[Any]]) -> "_PathState": + """Propagate allowed ids backward across edges to enforce path coherence.""" + + node_indices: List[int] = [] + edge_indices: List[int] = [] + for idx, op in enumerate(self.inputs.chain): + if isinstance(op, ASTNode): + node_indices.append(idx) + elif isinstance(op, ASTEdge): + edge_indices.append(idx) + if not node_indices: + raise ValueError("Same-path executor requires at least one node step") + if len(node_indices) != len(edge_indices) + 1: + raise ValueError("Chain must alternate node/edge steps for same-path execution") + + allowed_nodes: Dict[int, Set[Any]] = {} + allowed_edges: Dict[int, Set[Any]] = {} + + # Seed node allowances from tags or full frames + for idx in node_indices: + node_alias = self._alias_for_step(idx) + frame = self.forward_steps[idx]._nodes + if frame is None or self._node_column is None: + continue + if node_alias and node_alias in allowed_tags: + allowed_nodes[idx] = set(allowed_tags[node_alias]) + else: + allowed_nodes[idx] = self._series_values(frame[self._node_column]) + + # Walk edges backward + for edge_idx, right_node_idx in reversed(list(zip(edge_indices, node_indices[1:]))): + edge_alias = self._alias_for_step(edge_idx) + left_node_idx = node_indices[node_indices.index(right_node_idx) - 1] + edges_df = self.forward_steps[edge_idx]._edges + if edges_df is None: + continue + + filtered = edges_df + edge_op = self.inputs.chain[edge_idx] + is_multihop = isinstance(edge_op, ASTEdge) and not self._is_single_hop(edge_op) + is_reverse = isinstance(edge_op, ASTEdge) and edge_op.direction == "reverse" + is_undirected = isinstance(edge_op, ASTEdge) and edge_op.direction == "undirected" + + # For single-hop edges, filter by allowed dst first + # For multi-hop, defer dst filtering to _filter_multihop_by_where + # For reverse edges, "dst" in traversal = "src" in edge data + # For undirected edges, "dst" can be either src or dst column + if not is_multihop: + allowed_dst = allowed_nodes.get(right_node_idx) + if allowed_dst is not None: + if is_undirected: + # Undirected: right node can be reached via either src or dst column + if self._source_column and self._destination_column: + dst_list = list(allowed_dst) + filtered = filtered[ + filtered[self._source_column].isin(dst_list) + | filtered[self._destination_column].isin(dst_list) + ] + elif is_reverse: + if self._source_column and self._source_column in filtered.columns: + filtered = filtered[ + filtered[self._source_column].isin(list(allowed_dst)) + ] + else: + if self._destination_column and self._destination_column in 
filtered.columns: + filtered = filtered[ + filtered[self._destination_column].isin(list(allowed_dst)) + ] + + # Apply value-based clauses between adjacent aliases + left_alias = self._alias_for_step(left_node_idx) + right_alias = self._alias_for_step(right_node_idx) + if isinstance(edge_op, ASTEdge) and left_alias and right_alias: + if self._is_single_hop(edge_op): + # Single-hop: filter edges directly + filtered = self._filter_edges_by_clauses( + filtered, left_alias, right_alias, allowed_nodes, is_reverse + ) + else: + # Multi-hop: filter nodes first, then keep connecting edges + filtered = self._filter_multihop_by_where( + filtered, edge_op, left_alias, right_alias, allowed_nodes + ) + + if edge_alias and edge_alias in allowed_tags: + allowed_edge_ids = allowed_tags[edge_alias] + if self._edge_column and self._edge_column in filtered.columns: + filtered = filtered[ + filtered[self._edge_column].isin(list(allowed_edge_ids)) + ] + + # Update allowed_nodes based on filtered edges + # For reverse edges, swap src/dst semantics + # For undirected edges, both src and dst can be either left or right node + if is_undirected: + # Undirected: both src and dst can be left or right nodes + if self._source_column and self._destination_column: + all_nodes_in_edges = ( + self._series_values(filtered[self._source_column]) + | self._series_values(filtered[self._destination_column]) + ) + # Right node is constrained by allowed_dst already filtered above + current_dst = allowed_nodes.get(right_node_idx, set()) + allowed_nodes[right_node_idx] = ( + current_dst & all_nodes_in_edges if current_dst else all_nodes_in_edges + ) + # Left node is any node in the filtered edges + current = allowed_nodes.get(left_node_idx, set()) + allowed_nodes[left_node_idx] = current & all_nodes_in_edges if current else all_nodes_in_edges + elif is_reverse: + # Reverse: right node reached via src, left node via dst + if self._source_column and self._source_column in filtered.columns: + allowed_dst_actual = self._series_values(filtered[self._source_column]) + current_dst = allowed_nodes.get(right_node_idx, set()) + allowed_nodes[right_node_idx] = ( + current_dst & allowed_dst_actual if current_dst else allowed_dst_actual + ) + if self._destination_column and self._destination_column in filtered.columns: + allowed_src = self._series_values(filtered[self._destination_column]) + current = allowed_nodes.get(left_node_idx, set()) + allowed_nodes[left_node_idx] = current & allowed_src if current else allowed_src + else: + # Forward: right node reached via dst, left node via src + if self._destination_column and self._destination_column in filtered.columns: + allowed_dst_actual = self._series_values(filtered[self._destination_column]) + current_dst = allowed_nodes.get(right_node_idx, set()) + allowed_nodes[right_node_idx] = ( + current_dst & allowed_dst_actual if current_dst else allowed_dst_actual + ) + if self._source_column and self._source_column in filtered.columns: + allowed_src = self._series_values(filtered[self._source_column]) + current = allowed_nodes.get(left_node_idx, set()) + allowed_nodes[left_node_idx] = current & allowed_src if current else allowed_src + + if self._edge_column and self._edge_column in filtered.columns: + allowed_edges[edge_idx] = self._series_values(filtered[self._edge_column]) + + # Store filtered edges back to ensure WHERE-pruned edges are removed from output + if len(filtered) < len(edges_df): + self.forward_steps[edge_idx]._edges = filtered + + return self._PathState(allowed_nodes=allowed_nodes, 
allowed_edges=allowed_edges) + + def _filter_edges_by_clauses( + self, + edges_df: DataFrameT, + left_alias: str, + right_alias: str, + allowed_nodes: Dict[int, Set[Any]], + is_reverse: bool = False, + ) -> DataFrameT: + """Filter edges using WHERE clauses that connect adjacent aliases. + + For forward edges: left_alias matches src, right_alias matches dst. + For reverse edges: left_alias matches dst, right_alias matches src. + """ + # Early return for empty edges - no filtering needed + if len(edges_df) == 0: + return edges_df + + relevant = [ + clause + for clause in self.inputs.where + if {clause.left.alias, clause.right.alias} == {left_alias, right_alias} + ] + if not relevant or not self._source_column or not self._destination_column: + return edges_df + + left_frame = self.alias_frames.get(left_alias) + right_frame = self.alias_frames.get(right_alias) + if left_frame is None or right_frame is None or self._node_column is None: + return edges_df + + out_df = edges_df + left_allowed = allowed_nodes.get(self.inputs.alias_bindings[left_alias].step_index) + right_allowed = allowed_nodes.get(self.inputs.alias_bindings[right_alias].step_index) + + lf = left_frame + rf = right_frame + if left_allowed is not None: + lf = lf[lf[self._node_column].isin(list(left_allowed))] + if right_allowed is not None: + rf = rf[rf[self._node_column].isin(list(right_allowed))] + + left_cols = list(self.inputs.column_requirements.get(left_alias, [])) + right_cols = list(self.inputs.column_requirements.get(right_alias, [])) + if self._node_column in left_cols: + left_cols.remove(self._node_column) + if self._node_column in right_cols: + right_cols.remove(self._node_column) + + lf = lf[[self._node_column] + left_cols].rename(columns={self._node_column: "__left_id__"}) + rf = rf[[self._node_column] + right_cols].rename(columns={self._node_column: "__right_id__"}) + + # For reverse edges, left_alias is reached via dst column, right_alias via src column + # For forward edges, left_alias is reached via src column, right_alias via dst column + if is_reverse: + left_merge_col = self._destination_column + right_merge_col = self._source_column + else: + left_merge_col = self._source_column + right_merge_col = self._destination_column + + out_df = out_df.merge( + lf, + left_on=left_merge_col, + right_on="__left_id__", + how="inner", + ) + out_df = out_df.merge( + rf, + left_on=right_merge_col, + right_on="__right_id__", + how="inner", + suffixes=("", "__r"), + ) + + for clause in relevant: + left_col = clause.left.column if clause.left.alias == left_alias else clause.right.column + right_col = clause.right.column if clause.right.alias == right_alias else clause.left.column + if clause.op in {">", ">=", "<", "<="}: + out_df = self._apply_inequality_clause( + out_df, clause, left_alias, right_alias, left_col, right_col + ) + else: + col_left_name = f"__val_left_{left_col}" + col_right_name = f"__val_right_{right_col}" + + # When left_col == right_col, the right merge adds __r suffix + # We need to rename them to distinct names for comparison + rename_map = {} + if left_col in out_df.columns: + rename_map[left_col] = col_left_name + # Handle right column: could be right_col or right_col__r depending on merge + right_col_with_suffix = f"{right_col}__r" + if right_col_with_suffix in out_df.columns: + rename_map[right_col_with_suffix] = col_right_name + elif right_col in out_df.columns and right_col != left_col: + rename_map[right_col] = col_right_name + + if rename_map: + out_df = out_df.rename(columns=rename_map) + + if 
col_left_name in out_df.columns and col_right_name in out_df.columns: + mask = self._evaluate_clause(out_df[col_left_name], clause.op, out_df[col_right_name]) + out_df = out_df[mask] + + return out_df + + def _filter_multihop_by_where( + self, + edges_df: DataFrameT, + edge_op: ASTEdge, + left_alias: str, + right_alias: str, + allowed_nodes: Dict[int, Set[Any]], + ) -> DataFrameT: + """ + Filter multi-hop edges by WHERE clauses connecting start/end aliases. + + For multi-hop traversals, edges_df contains all edges in the path. The src/dst + columns represent intermediate connections, not the start/end aliases directly. + + Strategy: + 1. Identify which (start, end) pairs satisfy WHERE clauses + 2. Trace paths to find valid edges: start nodes connect via hop 1, end nodes via last hop + 3. Keep only edges that participate in valid paths + """ + relevant = [ + clause + for clause in self.inputs.where + if {clause.left.alias, clause.right.alias} == {left_alias, right_alias} + ] + if not relevant or not self._source_column or not self._destination_column: + return edges_df + + left_frame = self.alias_frames.get(left_alias) + right_frame = self.alias_frames.get(right_alias) + if left_frame is None or right_frame is None or self._node_column is None: + return edges_df + + # Get hop label column to identify first/last hop edges + node_label, edge_label = self._resolve_label_cols(edge_op) + if edge_label is None or edge_label not in edges_df.columns: + # No hop labels - can't distinguish first/last hop edges + return edges_df + + # Identify first-hop edges and valid endpoint edges + hop_col = edges_df[edge_label] + min_hop = hop_col.min() + + first_hop_edges = edges_df[hop_col == min_hop] + + # Get chain min_hops to find valid endpoints + chain_min_hops = edge_op.min_hops if edge_op.min_hops is not None else 1 + # Valid endpoints are at hop >= chain_min_hops (hop label is 1-indexed) + valid_endpoint_edges = edges_df[hop_col >= chain_min_hops] + + # For reverse edges, the logical direction is opposite to physical direction + # Forward: start -> hop 1 -> hop 2 -> end (start=src of hop 1, end=dst of last hop) + # Reverse: start <- hop 1 <- hop 2 <- end (start=dst of hop 1, end=src of last hop) + # Undirected: edges can be traversed both ways, so both src and dst are potential starts/ends + is_reverse = edge_op.direction == "reverse" + is_undirected = edge_op.direction == "undirected" + + # Extract start/end nodes using DataFrame operations (vectorized) + if is_undirected: + # Undirected: start can be either src or dst of first hop + start_nodes_df = pd.concat([ + first_hop_edges[[self._source_column]].rename(columns={self._source_column: '__node__'}), + first_hop_edges[[self._destination_column]].rename(columns={self._destination_column: '__node__'}) + ], ignore_index=True).drop_duplicates() + # End can be either src or dst of edges at hop >= min_hops + end_nodes_df = pd.concat([ + valid_endpoint_edges[[self._source_column]].rename(columns={self._source_column: '__node__'}), + valid_endpoint_edges[[self._destination_column]].rename(columns={self._destination_column: '__node__'}) + ], ignore_index=True).drop_duplicates() + elif is_reverse: + # Reverse: start is dst of first hop, end is src of edges at hop >= min_hops + start_nodes_df = first_hop_edges[[self._destination_column]].rename( + columns={self._destination_column: '__node__'} + ).drop_duplicates() + end_nodes_df = valid_endpoint_edges[[self._source_column]].rename( + columns={self._source_column: '__node__'} + ).drop_duplicates() + else: + # 
Forward: start is src of first hop, end is dst of edges at hop >= min_hops + start_nodes_df = first_hop_edges[[self._source_column]].rename( + columns={self._source_column: '__node__'} + ).drop_duplicates() + end_nodes_df = valid_endpoint_edges[[self._destination_column]].rename( + columns={self._destination_column: '__node__'} + ).drop_duplicates() + + # Convert to sets for intersection with allowed_nodes (caller uses sets) + start_nodes = set(start_nodes_df['__node__'].tolist()) + end_nodes = set(end_nodes_df['__node__'].tolist()) + + # Filter to allowed nodes + left_step_idx = self.inputs.alias_bindings[left_alias].step_index + right_step_idx = self.inputs.alias_bindings[right_alias].step_index + if left_step_idx in allowed_nodes and allowed_nodes[left_step_idx]: + start_nodes &= allowed_nodes[left_step_idx] + if right_step_idx in allowed_nodes and allowed_nodes[right_step_idx]: + end_nodes &= allowed_nodes[right_step_idx] + + if not start_nodes or not end_nodes: + return edges_df.iloc[:0] # Empty dataframe + + # Build (start, end) pairs that satisfy WHERE + lf = left_frame[left_frame[self._node_column].isin(list(start_nodes))] + rf = right_frame[right_frame[self._node_column].isin(list(end_nodes))] + + left_cols = list(self.inputs.column_requirements.get(left_alias, [])) + right_cols = list(self.inputs.column_requirements.get(right_alias, [])) + if self._node_column in left_cols: + left_cols.remove(self._node_column) + if self._node_column in right_cols: + right_cols.remove(self._node_column) + + lf = lf[[self._node_column] + left_cols].rename(columns={self._node_column: "__start_id__"}) + rf = rf[[self._node_column] + right_cols].rename(columns={self._node_column: "__end_id__"}) + + # Cross join to get all (start, end) combinations + lf = lf.assign(__cross_key__=1) + rf = rf.assign(__cross_key__=1) + pairs_df = lf.merge(rf, on="__cross_key__", suffixes=("", "__r")).drop(columns=["__cross_key__"]) + + # Apply WHERE clauses to filter valid (start, end) pairs + for clause in relevant: + left_col = clause.left.column if clause.left.alias == left_alias else clause.right.column + right_col = clause.right.column if clause.right.alias == right_alias else clause.left.column + # Handle column name collision from merge - when left_col == right_col, + # pandas adds __r suffix to the right side columns to avoid collision + actual_right_col = right_col + if left_col == right_col and f"{right_col}__r" in pairs_df.columns: + actual_right_col = f"{right_col}__r" + if left_col in pairs_df.columns and actual_right_col in pairs_df.columns: + mask = self._evaluate_clause(pairs_df[left_col], clause.op, pairs_df[actual_right_col]) + pairs_df = pairs_df[mask] + + if len(pairs_df) == 0: + return edges_df.iloc[:0] + + # Get valid start and end nodes + valid_starts = set(pairs_df["__start_id__"].tolist()) + valid_ends = set(pairs_df["__end_id__"].tolist()) + + # Use vectorized bidirectional reachability to filter edges + # This reuses the same logic as _filter_multihop_edges_by_endpoints + return self._filter_multihop_edges_by_endpoints( + edges_df, edge_op, valid_starts, valid_ends, is_reverse, is_undirected + ) + + @staticmethod + def _is_single_hop(op: ASTEdge) -> bool: + hop_min = op.min_hops if op.min_hops is not None else ( + op.hops if isinstance(op.hops, int) else 1 + ) + hop_max = op.max_hops if op.max_hops is not None else ( + op.hops if isinstance(op.hops, int) else hop_min + ) + if hop_min is None or hop_max is None: + return False + return hop_min == 1 and hop_max == 1 + + def 
_apply_inequality_clause( + self, + out_df: DataFrameT, + clause: WhereComparison, + left_alias: str, + right_alias: str, + left_col: str, + right_col: str, + ) -> DataFrameT: + left_summary = self._minmax_summaries.get(left_alias, {}).get(left_col) + right_summary = self._minmax_summaries.get(right_alias, {}).get(right_col) + + # Fall back to raw values if summaries are missing + lsum = None + rsum = None + if left_summary is not None: + lsum = left_summary.rename( + columns={ + left_summary.columns[0]: "__left_id__", + "min": f"{left_col}__min", + "max": f"{left_col}__max", + } + ) + if right_summary is not None: + rsum = right_summary.rename( + columns={ + right_summary.columns[0]: "__right_id__", + "min": f"{right_col}__min_r", + "max": f"{right_col}__max_r", + } + ) + merged = out_df + if lsum is not None: + merged = merged.merge(lsum, on="__left_id__", how="inner") + if rsum is not None: + merged = merged.merge(rsum, on="__right_id__", how="inner") + + if lsum is None or rsum is None: + col_left = left_col # raw left values carried by the earlier merges + col_right = ( + f"{right_col}__r" if f"{right_col}__r" in merged.columns else right_col + ) + if col_left in merged.columns and col_right in merged.columns: + mask = self._evaluate_clause(merged[col_left], clause.op, merged[col_right]) + return merged[mask] + return merged + + l_min = merged.get(f"{left_col}__min") + l_max = merged.get(f"{left_col}__max") + r_min = merged.get(f"{right_col}__min_r") + r_max = merged.get(f"{right_col}__max_r") + + if ( + l_min is None + or l_max is None + or r_min is None + or r_max is None + or f"{left_col}__min" not in merged.columns + or f"{left_col}__max" not in merged.columns + or f"{right_col}__min_r" not in merged.columns + or f"{right_col}__max_r" not in merged.columns + ): + return merged + + if clause.op == ">": + return merged[merged[f"{left_col}__min"] > merged[f"{right_col}__max_r"]] + if clause.op == ">=": + return merged[merged[f"{left_col}__min"] >= merged[f"{right_col}__max_r"]] + if clause.op == "<": + return merged[merged[f"{left_col}__max"] < merged[f"{right_col}__min_r"]] + # <= + return merged[merged[f"{left_col}__max"] <= merged[f"{right_col}__min_r"]] + + @staticmethod + def _evaluate_clause(series_left: Any, op: str, series_right: Any) -> Any: + if op == "==": + return series_left == series_right + if op == "!=": + return series_left != series_right + if op == ">": + return series_left > series_right + if op == ">=": + return series_left >= series_right + if op == "<": + return series_left < series_right + if op == "<=": + return series_left <= series_right + return False + + def _materialize_filtered(self, path_state: "_PathState") -> Plottable: + """Build result graph from allowed node/edge ids and refresh alias frames.""" + + nodes_df = self.inputs.graph._nodes + node_id = self._node_column + edge_id = self._edge_column + src = self._source_column + dst = self._destination_column + + edge_frames = [ + self.forward_steps[idx]._edges + for idx, op in enumerate(self.inputs.chain) + if isinstance(op, ASTEdge) and self.forward_steps[idx]._edges is not None + ] + concatenated_edges = self._concat_frames(edge_frames) + edges_df = concatenated_edges if concatenated_edges is not None else self.inputs.graph._edges + + if nodes_df is None or edges_df is None or node_id is None or src is None or dst is None: + raise ValueError("Graph bindings are incomplete for same-path execution") + + # If any node step has an explicitly empty allowed set, the path is broken + # (e.g., WHERE clause 
filtered out all nodes at some step) + if path_state.allowed_nodes: + for node_set in path_state.allowed_nodes.values(): + if node_set is not None and len(node_set) == 0: + # Empty set at a step means no valid paths exist + return self._materialize_from_oracle( + nodes_df.iloc[0:0], edges_df.iloc[0:0] + ) + + # Build allowed node/edge DataFrames (vectorized - avoid Python sets where possible) + # Collect allowed node IDs from path_state + allowed_node_frames: List[DataFrameT] = [] + if path_state.allowed_nodes: + for node_set in path_state.allowed_nodes.values(): + if node_set: + allowed_node_frames.append(pd.DataFrame({'__node__': list(node_set)})) + + allowed_edge_frames: List[DataFrameT] = [] + if path_state.allowed_edges: + for edge_set in path_state.allowed_edges.values(): + if edge_set: + allowed_edge_frames.append(pd.DataFrame({'__edge__': list(edge_set)})) + + # For multi-hop edges, include all intermediate nodes from the edge frames + # (path_state.allowed_nodes only tracks start/end of multi-hop traversals) + has_multihop = any( + isinstance(op, ASTEdge) and not self._is_single_hop(op) + for op in self.inputs.chain + ) + if has_multihop and src in edges_df.columns and dst in edges_df.columns: + # Include all nodes referenced by edges (vectorized) + allowed_node_frames.append( + edges_df[[src]].rename(columns={src: '__node__'}) + ) + allowed_node_frames.append( + edges_df[[dst]].rename(columns={dst: '__node__'}) + ) + + # Combine and dedupe allowed nodes + if allowed_node_frames: + allowed_nodes_df = pd.concat(allowed_node_frames, ignore_index=True).drop_duplicates() + filtered_nodes = nodes_df[nodes_df[node_id].isin(allowed_nodes_df['__node__'])] + else: + filtered_nodes = nodes_df.iloc[0:0] + + # Filter edges by allowed nodes (both src AND dst must be in allowed nodes) + # This ensures that edges from filtered-out paths don't appear in the result + filtered_edges = edges_df + if allowed_node_frames: + filtered_edges = filtered_edges[ + filtered_edges[src].isin(allowed_nodes_df['__node__']) + & filtered_edges[dst].isin(allowed_nodes_df['__node__']) + ] + else: + filtered_edges = filtered_edges.iloc[0:0] + + # Filter by allowed edge IDs + if allowed_edge_frames and edge_id and edge_id in filtered_edges.columns: + allowed_edges_df = pd.concat(allowed_edge_frames, ignore_index=True).drop_duplicates() + filtered_edges = filtered_edges[filtered_edges[edge_id].isin(allowed_edges_df['__edge__'])] + + filtered_nodes = self._merge_label_frames( + filtered_nodes, + self._collect_label_frames("node"), + node_id, + ) + if edge_id is not None: + filtered_edges = self._merge_label_frames( + filtered_edges, + self._collect_label_frames("edge"), + edge_id, + ) + + filtered_edges = self._apply_output_slices(filtered_edges, "edge") + + has_output_slice = any( + isinstance(op, ASTEdge) + and (op.output_min_hops is not None or op.output_max_hops is not None) + for op in self.inputs.chain + ) + if has_output_slice: + if len(filtered_edges) > 0: + # Build endpoint IDs DataFrame (vectorized - no Python sets) + endpoint_ids_df = pd.concat([ + filtered_edges[[src]].rename(columns={src: '__node__'}), + filtered_edges[[dst]].rename(columns={dst: '__node__'}) + ], ignore_index=True).drop_duplicates() + filtered_nodes = filtered_nodes[ + filtered_nodes[node_id].isin(endpoint_ids_df['__node__']) + ] + else: + filtered_nodes = self._apply_output_slices(filtered_nodes, "node") + else: + filtered_nodes = self._apply_output_slices(filtered_nodes, "node") + + for alias, binding in 
self.inputs.alias_bindings.items(): + frame = filtered_nodes if binding.kind == "node" else filtered_edges + id_col = self._node_column if binding.kind == "node" else self._edge_column + if id_col is None or id_col not in frame.columns: + continue + required = set(self.inputs.column_requirements.get(alias, set())) + required.add(id_col) + subset = frame[[c for c in frame.columns if c in required]].copy() + self.alias_frames[alias] = subset + + return self._materialize_from_oracle(filtered_nodes, filtered_edges) + + @staticmethod + def _needs_auto_labels(op: ASTEdge) -> bool: + return bool( + (op.output_min_hops is not None or op.output_max_hops is not None) + or (op.min_hops is not None and op.min_hops > 0) + ) + + @staticmethod + def _resolve_label_cols(op: ASTEdge) -> Tuple[Optional[str], Optional[str]]: + node_label = op.label_node_hops + edge_label = op.label_edge_hops + if DFSamePathExecutor._needs_auto_labels(op): + node_label = node_label or "__gfql_output_node_hop__" + edge_label = edge_label or "__gfql_output_edge_hop__" + return node_label, edge_label + + def _collect_label_frames(self, kind: AliasKind) -> List[DataFrameT]: + frames: List[DataFrameT] = [] + id_col = self._node_column if kind == "node" else self._edge_column + if id_col is None: + return frames + for idx, op in enumerate(self.inputs.chain): + if not isinstance(op, ASTEdge): + continue + step = self.forward_steps[idx] + df = step._nodes if kind == "node" else step._edges + if df is None or id_col not in df.columns: + continue + node_label, edge_label = self._resolve_label_cols(op) + label_col = node_label if kind == "node" else edge_label + if label_col is None or label_col not in df.columns: + continue + frames.append(df[[id_col, label_col]]) + return frames + + @staticmethod + def _merge_label_frames( + base_df: DataFrameT, + label_frames: Sequence[DataFrameT], + id_col: str, + ) -> DataFrameT: + out_df = base_df + for frame in label_frames: + label_cols = [c for c in frame.columns if c != id_col] + if not label_cols: + continue + merged = safe_merge(out_df, frame[[id_col] + label_cols], on=id_col, how="left") + for col in label_cols: + col_x = f"{col}_x" + col_y = f"{col}_y" + if col_x in merged.columns and col_y in merged.columns: + merged = merged.assign(**{col: merged[col_x].fillna(merged[col_y])}) + merged = merged.drop(columns=[col_x, col_y]) + out_df = merged + return out_df + + def _apply_output_slices(self, df: DataFrameT, kind: AliasKind) -> DataFrameT: + out_df = df + for op in self.inputs.chain: + if not isinstance(op, ASTEdge): + continue + if op.output_min_hops is None and op.output_max_hops is None: + continue + label_col = self._select_label_col(out_df, op, kind) + if label_col is None or label_col not in out_df.columns: + continue + mask = out_df[label_col].notna() + if op.output_min_hops is not None: + mask = mask & (out_df[label_col] >= op.output_min_hops) + if op.output_max_hops is not None: + mask = mask & (out_df[label_col] <= op.output_max_hops) + out_df = out_df[mask] + return out_df + + def _select_label_col( + self, df: DataFrameT, op: ASTEdge, kind: AliasKind + ) -> Optional[str]: + node_label, edge_label = self._resolve_label_cols(op) + label_col = node_label if kind == "node" else edge_label + if label_col and label_col in df.columns: + return label_col + hop_like = [c for c in df.columns if "hop" in c] + return hop_like[0] if hop_like else None + + def _apply_oracle_hop_labels(self, oracle: "OracleResult") -> Tuple[DataFrameT, DataFrameT]: + nodes_df = oracle.nodes + edges_df = 
oracle.edges + node_id = self._node_column + edge_id = self._edge_column + node_labels = oracle.node_hop_labels or {} + edge_labels = oracle.edge_hop_labels or {} + + node_frames: List[DataFrameT] = [] + edge_frames: List[DataFrameT] = [] + for op in self.inputs.chain: + if not isinstance(op, ASTEdge): + continue + node_label, edge_label = self._resolve_label_cols(op) + if node_label and node_id and node_id in nodes_df.columns and node_labels: + node_series = nodes_df[node_id].map(node_labels) + node_frames.append(pd.DataFrame({node_id: nodes_df[node_id], node_label: node_series})) + if edge_label and edge_id and edge_id in edges_df.columns and edge_labels: + edge_series = edges_df[edge_id].map(edge_labels) + edge_frames.append(pd.DataFrame({edge_id: edges_df[edge_id], edge_label: edge_series})) + + if node_id is not None and node_frames: + nodes_df = self._merge_label_frames(nodes_df, node_frames, node_id) + if edge_id is not None and edge_frames: + edges_df = self._merge_label_frames(edges_df, edge_frames, edge_id) + + return nodes_df, edges_df + + def _alias_for_step(self, step_index: int) -> Optional[str]: + for alias, binding in self.inputs.alias_bindings.items(): + if binding.step_index == step_index: + return alias + return None + + @staticmethod + def _concat_frames(frames: Sequence[DataFrameT]) -> Optional[DataFrameT]: + if not frames: + return None + first = frames[0] + if first.__class__.__module__.startswith("cudf"): + import cudf # type: ignore + + return cudf.concat(frames, ignore_index=True) + return pd.concat(frames, ignore_index=True) + + + def _apply_ready_clauses(self) -> None: + if not self.inputs.where: + return + ready = [ + clause + for clause in self.inputs.where + if clause.left.alias in self.alias_frames + and clause.right.alias in self.alias_frames + ] + for clause in ready: + self._prune_clause(clause) + + def _prune_clause(self, clause: WhereComparison) -> None: + if clause.op == "!=": + return # No global prune for '!=' yet; it is enforced pairwise during edge filtering + lhs = self.alias_frames[clause.left.alias] + rhs = self.alias_frames[clause.right.alias] + left_col = clause.left.column + right_col = clause.right.column + + if clause.op == "==": + allowed = self._common_values(lhs[left_col], rhs[right_col]) + self.alias_frames[clause.left.alias] = self._filter_by_values( + lhs, left_col, allowed + ) + self.alias_frames[clause.right.alias] = self._filter_by_values( + rhs, right_col, allowed + ) + elif clause.op == ">": + right_min = self._safe_min(rhs[right_col]) + left_max = self._safe_max(lhs[left_col]) + if right_min is not None: + self.alias_frames[clause.left.alias] = lhs[lhs[left_col] > right_min] + if left_max is not None: + self.alias_frames[clause.right.alias] = rhs[rhs[right_col] < left_max] + elif clause.op == ">=": + right_min = self._safe_min(rhs[right_col]) + left_max = self._safe_max(lhs[left_col]) + if right_min is not None: + self.alias_frames[clause.left.alias] = lhs[lhs[left_col] >= right_min] + if left_max is not None: + self.alias_frames[clause.right.alias] = rhs[ + rhs[right_col] <= left_max + ] + elif clause.op == "<": + right_max = self._safe_max(rhs[right_col]) + left_min = self._safe_min(lhs[left_col]) + if right_max is not None: + self.alias_frames[clause.left.alias] = lhs[lhs[left_col] < right_max] + if left_min is not None: + self.alias_frames[clause.right.alias] = rhs[ + rhs[right_col] > left_min + ] + elif clause.op == "<=": + right_max = self._safe_max(rhs[right_col]) + left_min = self._safe_min(lhs[left_col]) + if right_max is not None: +
self.alias_frames[clause.left.alias] = lhs[ + lhs[left_col] <= right_max + ] + if left_min is not None: + self.alias_frames[clause.right.alias] = rhs[ + rhs[right_col] >= left_min + ] + + @staticmethod + def _filter_by_values( + frame: DataFrameT, column: str, values: Set[Any] + ) -> DataFrameT: + if not values: + return frame.iloc[0:0] + allowed = list(values) + mask = frame[column].isin(allowed) + return frame[mask] + + @staticmethod + def _common_values(series_a: Any, series_b: Any) -> Set[Any]: + vals_a = DFSamePathExecutor._series_values(series_a) + vals_b = DFSamePathExecutor._series_values(series_b) + return vals_a & vals_b + + @staticmethod + def _series_values(series: Any) -> Set[Any]: + pandas_series = DFSamePathExecutor._to_pandas_series(series) + return set(pandas_series.dropna().unique().tolist()) + + @staticmethod + def _safe_min(series: Any) -> Optional[Any]: + pandas_series = DFSamePathExecutor._to_pandas_series(series).dropna() + if pandas_series.empty: + return None + value = pandas_series.min() + if pd.isna(value): + return None + return value + + @staticmethod + def _safe_max(series: Any) -> Optional[Any]: + pandas_series = DFSamePathExecutor._to_pandas_series(series).dropna() + if pandas_series.empty: + return None + value = pandas_series.max() + if pd.isna(value): + return None + return value + + @staticmethod + def _to_pandas_series(series: Any) -> pd.Series: + if hasattr(series, "to_pandas"): + return series.to_pandas() + if isinstance(series, pd.Series): + return series + return pd.Series(series) + + +def build_same_path_inputs( + g: Plottable, + chain: Sequence[ASTObject], + where: Sequence[WhereComparison], + engine: Engine, + include_paths: bool = False, +) -> SamePathExecutorInputs: + """Construct executor inputs, deriving planner metadata and validations.""" + + bindings = _collect_alias_bindings(chain) + _validate_where_aliases(bindings, where) + required_columns = _collect_required_columns(where) + plan = plan_same_path(where) + + return SamePathExecutorInputs( + graph=g, + chain=list(chain), + where=list(where), + plan=plan, + engine=engine, + alias_bindings=bindings, + column_requirements=required_columns, + include_paths=include_paths, + ) + + +def execute_same_path_chain( + g: Plottable, + chain: Sequence[ASTObject], + where: Sequence[WhereComparison], + engine: Engine, + include_paths: bool = False, +) -> Plottable: + """Convenience wrapper used by Chain execution once hooked up.""" + + inputs = build_same_path_inputs(g, chain, where, engine, include_paths) + executor = DFSamePathExecutor(inputs) + return executor.run() + + +def _collect_alias_bindings(chain: Sequence[ASTObject]) -> Dict[str, AliasBinding]: + bindings: Dict[str, AliasBinding] = {} + for idx, step in enumerate(chain): + alias = getattr(step, "_name", None) + if not alias: + continue + if not isinstance(alias, str): + continue + if isinstance(step, ASTNode): + kind: AliasKind = "node" + elif isinstance(step, ASTEdge): + kind = "edge" + else: + continue + + if alias in bindings: + raise ValueError(f"Duplicate alias '{alias}' detected in chain") + bindings[alias] = AliasBinding(alias, idx, kind, step) + return bindings + + +def _collect_required_columns( + where: Sequence[WhereComparison], +) -> Dict[str, Set[str]]: + requirements: Dict[str, Set[str]] = defaultdict(set) + for clause in where: + requirements[clause.left.alias].add(clause.left.column) + requirements[clause.right.alias].add(clause.right.column) + return {alias: set(cols) for alias, cols in requirements.items()} + + +def 
_validate_where_aliases( + bindings: Dict[str, AliasBinding], + where: Sequence[WhereComparison], +) -> None: + if not where: + return + referenced = {clause.left.alias for clause in where} | { + clause.right.alias for clause in where + } + missing = sorted(alias for alias in referenced if alias not in bindings) + if missing: + missing_str = ", ".join(missing) + raise ValueError( + f"WHERE references aliases with no node/edge bindings: {missing_str}" + ) diff --git a/graphistry/compute/gfql_unified.py b/graphistry/compute/gfql_unified.py index 0cbb22a469..5766c266e2 100644 --- a/graphistry/compute/gfql_unified.py +++ b/graphistry/compute/gfql_unified.py @@ -1,8 +1,9 @@ """GFQL unified entrypoint for chains and DAGs""" +# ruff: noqa: E501 -from typing import List, Union, Optional, Dict, Any +from typing import List, Union, Optional, Dict, Any, cast from graphistry.Plottable import Plottable -from graphistry.Engine import EngineAbstract +from graphistry.Engine import Engine, EngineAbstract from graphistry.util import setup_logger from .ast import ASTObject, ASTLet, ASTNode, ASTEdge from .chain import Chain, chain as chain_impl @@ -16,6 +17,11 @@ QueryType, expand_policy ) +from graphistry.gfql.same_path_types import parse_where_json +from graphistry.compute.gfql.df_executor import ( + build_same_path_inputs, + execute_same_path_chain, +) logger = setup_logger(__name__) @@ -227,8 +233,22 @@ def policy(context: PolicyContext) -> None: e.query_type = policy_context.get('query_type') raise - # Handle dict convenience first (convert to ASTLet) - if isinstance(query, dict): + # Handle dict convenience first + if isinstance(query, dict) and "chain" in query: + chain_items: List[ASTObject] = [] + for item in query["chain"]: + if isinstance(item, dict): + from .ast import from_json + chain_items.append(from_json(item)) + elif isinstance(item, ASTObject): + chain_items.append(item) + else: + raise TypeError(f"Unsupported chain entry type: {type(item)}") + where_meta = parse_where_json( + cast(Optional[List[Dict[str, Dict[str, str]]]], query.get("where")) + ) + query = Chain(chain_items, where=where_meta) + elif isinstance(query, dict): # Auto-wrap ASTNode and ASTEdge values in Chain for GraphOperation compatibility wrapped_dict = {} for key, value in query.items(): @@ -256,13 +276,13 @@ def policy(context: PolicyContext) -> None: logger.debug('GFQL executing as Chain') if output is not None: logger.warning('output parameter ignored for chain queries') - return chain_impl(self, query.chain, engine, policy=expanded_policy, context=context) + return _chain_dispatch(self, query, engine, expanded_policy, context) elif isinstance(query, ASTObject): # Single ASTObject -> execute as single-item chain logger.debug('GFQL executing single ASTObject as chain') if output is not None: logger.warning('output parameter ignored for chain queries') - return chain_impl(self, [query], engine, policy=expanded_policy, context=context) + return _chain_dispatch(self, Chain([query]), engine, expanded_policy, context) elif isinstance(query, list): logger.debug('GFQL executing list as chain') if output is not None: @@ -277,7 +297,7 @@ def policy(context: PolicyContext) -> None: else: converted_query.append(item) - return chain_impl(self, converted_query, engine, policy=expanded_policy, context=context) + return _chain_dispatch(self, Chain(converted_query), engine, expanded_policy, context) else: raise TypeError( f"Query must be ASTObject, List[ASTObject], Chain, ASTLet, or dict. 
" @@ -291,3 +311,32 @@ def policy(context: PolicyContext) -> None: # Reset policy depth if policy: context.policy_depth = policy_depth + + +def _chain_dispatch( + g: Plottable, + chain_obj: Chain, + engine: Union[EngineAbstract, str], + policy: Optional[PolicyDict], + context: ExecutionContext, +) -> Plottable: + """Dispatch chain execution, including cuDF same-path executor when applicable.""" + + is_cudf = engine == EngineAbstract.CUDF or engine == "cudf" + if is_cudf and chain_obj.where: + engine_enum = Engine.CUDF + inputs = build_same_path_inputs( + g, + chain_obj.chain, + chain_obj.where, + engine=engine_enum, + include_paths=False, + ) + return execute_same_path_chain( + inputs.graph, + inputs.chain, + inputs.where, + inputs.engine, + inputs.include_paths, + ) + return chain_impl(g, chain_obj.chain, engine, policy=policy, context=context) diff --git a/graphistry/gfql/__init__.py b/graphistry/gfql/__init__.py new file mode 100644 index 0000000000..04bf3ca051 --- /dev/null +++ b/graphistry/gfql/__init__.py @@ -0,0 +1 @@ +"""GFQL helpers.""" diff --git a/graphistry/gfql/ref/__init__.py b/graphistry/gfql/ref/__init__.py new file mode 100644 index 0000000000..f000c7a4ee --- /dev/null +++ b/graphistry/gfql/ref/__init__.py @@ -0,0 +1 @@ +"""GFQL reference helpers.""" diff --git a/graphistry/gfql/ref/enumerator.py b/graphistry/gfql/ref/enumerator.py index ed360565be..716ecc0311 100644 --- a/graphistry/gfql/ref/enumerator.py +++ b/graphistry/gfql/ref/enumerator.py @@ -1,9 +1,10 @@ """Minimal GFQL reference enumerator used as the correctness oracle.""" +# ruff: noqa: E501 from __future__ import annotations from dataclasses import dataclass -from typing import Any, Dict, List, Literal, Optional, Sequence, Set, Tuple +from typing import Any, Dict, List, Optional, Sequence, Set, Tuple import pandas as pd @@ -16,21 +17,7 @@ from graphistry.compute.ast import ASTEdge, ASTNode, ASTObject from graphistry.compute.chain import Chain from graphistry.compute.filter_by_dict import filter_by_dict -ComparisonOp = Literal["==", "!=", "<", "<=", ">", ">="] - - - -@dataclass(frozen=True) -class StepColumnRef: - alias: str - column: str - - -@dataclass(frozen=True) -class WhereComparison: - left: StepColumnRef - op: ComparisonOp - right: StepColumnRef +from graphistry.gfql.same_path_types import ComparisonOp, WhereComparison @dataclass(frozen=True) @@ -52,14 +39,6 @@ class OracleResult: edge_hop_labels: Optional[Dict[Any, int]] = None -def col(alias: str, column: str) -> StepColumnRef: - return StepColumnRef(alias, column) - - -def compare(left: StepColumnRef, op: ComparisonOp, right: StepColumnRef) -> WhereComparison: - return WhereComparison(left, op, right) - - def enumerate_chain( g: Plottable, ops: Sequence[ASTObject], @@ -125,11 +104,9 @@ def enumerate_chain( paths = paths.drop(columns=[current]) current = node_step["id_col"] else: - if where: - raise ValueError("WHERE clauses not supported for multi-hop edges in enumerator") - if edge_step["alias"] or node_step["alias"]: - # Alias tagging for multi-hop not yet supported in enumerator - raise ValueError("Aliases not supported for multi-hop edges in enumerator") + if edge_step["alias"]: + # Edge alias tagging for multi-hop not yet supported in enumerator + raise ValueError("Edge aliases not supported for multi-hop edges in enumerator") dest_allowed: Optional[Set[Any]] = None if not node_frame.empty: @@ -149,6 +126,12 @@ def enumerate_chain( for dst in bp_result.seed_to_nodes.get(seed_id, set()): new_rows.append([*row, dst]) paths = pd.DataFrame(new_rows, 
columns=[*base_cols, node_step["id_col"]]) + paths = paths.merge( + node_frame, + on=node_step["id_col"], + how="inner", + validate="m:1", + ) + current = node_step["id_col"] # Stash edges/nodes and hop labels for final selection @@ -167,6 +150,70 @@ if where: paths = paths[_apply_where(paths, where)] + + # After WHERE filtering, prune collected_nodes/edges to only those in surviving paths + # For multi-hop edges, we stored all reachable nodes/edges before WHERE filtering + # Now we need to keep only those that participate in valid paths + if len(paths) > 0: + for i, edge_step in enumerate(edge_steps): + if "collected_nodes" not in edge_step: + continue + start_col = node_steps[i]["id_col"] + end_col = node_steps[i + 1]["id_col"] + if start_col not in paths.columns or end_col not in paths.columns: + continue + valid_starts = set(paths[start_col].tolist()) + valid_ends = set(paths[end_col].tolist()) + + # Re-trace paths from valid_starts to valid_ends to find valid nodes/edges + # Build adjacency from original edges, respecting direction + direction = edge_step.get("direction", "forward") + adjacency: Dict[Any, List[Tuple[Any, Any]]] = {} + for _, row in edges_df.iterrows(): # type: ignore[assignment] + src, dst, eid = row[edge_src], row[edge_dst], row[edge_id] # type: ignore[call-overload] + if direction == "reverse": + # Reverse: traverse dst -> src + adjacency.setdefault(dst, []).append((eid, src)) + elif direction == "undirected": + # Undirected: traverse both ways + adjacency.setdefault(src, []).append((eid, dst)) + adjacency.setdefault(dst, []).append((eid, src)) + else: + # Forward: traverse src -> dst + adjacency.setdefault(src, []).append((eid, dst)) + + # Depth-first search (explicit stack) from valid_starts to find paths to valid_ends + valid_nodes: Set[Any] = set() + valid_edge_ids: Set[Any] = set() + max_hops = edge_step.get("max_hops", 10) + + for start in valid_starts: + # Track paths: (current_node, path_edges, path_nodes) + stack: List[Tuple[Any, List[Any], List[Any]]] = [(start, [], [start])] + while stack: + node, path_edges, path_nodes = stack.pop() + if len(path_edges) >= max_hops: + continue + for eid, dst in adjacency.get(node, []): + new_edges = path_edges + [eid] + new_nodes = path_nodes + [dst] + if dst in valid_ends: + # This path reaches a valid end - include all nodes/edges + valid_nodes.update(new_nodes) + valid_edge_ids.update(new_edges) + if len(new_edges) < max_hops: + stack.append((dst, new_edges, new_nodes)) + + edge_step["collected_nodes"] = valid_nodes + edge_step["collected_edges"] = valid_edge_ids + else: + # No surviving paths - clear all collected nodes/edges + for edge_step in edge_steps: + if "collected_nodes" in edge_step: + edge_step["collected_nodes"] = set() + if "collected_edges" in edge_step: + edge_step["collected_edges"] = set() + seq_cols: List[str] = [] for i, node_step in enumerate(node_steps): seq_cols.append(node_step["id_col"]) diff --git a/graphistry/gfql/same_path_plan.py b/graphistry/gfql/same_path_plan.py new file mode 100644 index 0000000000..8ea0b5d08e --- /dev/null +++ b/graphistry/gfql/same_path_plan.py @@ -0,0 +1,62 @@ +"""Planner toggles for same-path WHERE comparisons.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Dict, Optional, Sequence, Set + +from graphistry.gfql.same_path_types import WhereComparison + + +@dataclass +class BitsetPlan: + aliases: Set[str] + lane_count: int = 64 + + +@dataclass +class StateTablePlan: + aliases: Set[str] + cap: int = 128 + + +@dataclass +class 
SamePathPlan: + minmax_aliases: Dict[str, Set[str]] = field(default_factory=dict) + bitsets: Dict[str, BitsetPlan] = field(default_factory=dict) + state_tables: Dict[str, StateTablePlan] = field(default_factory=dict) + + def requires_minmax(self, alias: str) -> bool: + return alias in self.minmax_aliases + + +def plan_same_path( + where: Optional[Sequence[WhereComparison]], + max_bitset_domain: int = 64, + state_cap: int = 128, +) -> SamePathPlan: + plan = SamePathPlan() + if not where: + return plan + + for clause in where: + if clause.op in {"<", "<=", ">", ">="}: + for ref in (clause.left, clause.right): + plan.minmax_aliases.setdefault(ref.alias, set()).add(ref.column) + elif clause.op in {"==", "!="}: + key = _equality_key(clause) + plan.bitsets.setdefault(key, BitsetPlan(set())).aliases.update( + {clause.left.alias, clause.right.alias} + ) + + return plan + + +def _equality_key(clause: WhereComparison) -> str: + cols = sorted( + [ + f"{clause.left.alias}.{clause.left.column}", + f"{clause.right.alias}.{clause.right.column}", + ] + ) + return "::".join(cols) diff --git a/graphistry/gfql/same_path_types.py b/graphistry/gfql/same_path_types.py new file mode 100644 index 0000000000..467b7058e9 --- /dev/null +++ b/graphistry/gfql/same_path_types.py @@ -0,0 +1,99 @@ +"""Shared data structures for same-path WHERE comparisons.""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Dict, List, Literal, Optional, Sequence + + +ComparisonOp = Literal[ + "==", + "!=", + "<", + "<=", + ">", + ">=", +] + + +@dataclass(frozen=True) +class StepColumnRef: + alias: str + column: str + + +@dataclass(frozen=True) +class WhereComparison: + left: StepColumnRef + op: ComparisonOp + right: StepColumnRef + + +def col(alias: str, column: str) -> StepColumnRef: + return StepColumnRef(alias, column) + + +def compare( + left: StepColumnRef, op: ComparisonOp, right: StepColumnRef +) -> WhereComparison: + return WhereComparison(left, op, right) + + +def parse_column_ref(ref: str) -> StepColumnRef: + if "." 
not in ref: + raise ValueError(f"Column reference '{ref}' must be alias.column") + alias, column = ref.split(".", 1) + if not alias or not column: + raise ValueError(f"Invalid column reference '{ref}'") + return StepColumnRef(alias, column) + + +def parse_where_json( + where_json: Optional[Sequence[Dict[str, Dict[str, str]]]] +) -> List[WhereComparison]: + if not where_json: + return [] + clauses: List[WhereComparison] = [] + for entry in where_json: + if not isinstance(entry, dict) or len(entry) != 1: + raise ValueError(f"Invalid WHERE clause: {entry}") + op_name, payload = next(iter(entry.items())) + if op_name not in {"eq", "neq", "gt", "lt", "ge", "le"}: + raise ValueError(f"Unsupported WHERE operator '{op_name}'") + op_map: Dict[str, ComparisonOp] = { + "eq": "==", + "neq": "!=", + "gt": ">", + "lt": "<", + "ge": ">=", + "le": "<=", + } + left = parse_column_ref(payload["left"]) + right = parse_column_ref(payload["right"]) + clauses.append(WhereComparison(left, op_map[op_name], right)) + return clauses + + +def where_to_json(where: Sequence[WhereComparison]) -> List[Dict[str, Dict[str, str]]]: + result: List[Dict[str, Dict[str, str]]] = [] + op_map: Dict[str, str] = { + "==": "eq", + "!=": "neq", + ">": "gt", + "<": "lt", + ">=": "ge", + "<=": "le", + } + for clause in where: + op_name = op_map.get(clause.op) + if not op_name: + continue + result.append( + { + op_name: { + "left": f"{clause.left.alias}.{clause.left.column}", + "right": f"{clause.right.alias}.{clause.right.column}", + } + } + ) + return result diff --git a/graphistry/tests/compute/test_chain_where.py b/graphistry/tests/compute/test_chain_where.py new file mode 100644 index 0000000000..8c8c77eb46 --- /dev/null +++ b/graphistry/tests/compute/test_chain_where.py @@ -0,0 +1,49 @@ +import pandas as pd + +from graphistry.compute import n, e_forward +from graphistry.compute.chain import Chain +from graphistry.gfql.same_path_types import col, compare +from graphistry.tests.test_compute import CGFull + + +def test_chain_where_roundtrip(): + chain = Chain([n({'type': 'account'}, name='a'), e_forward(), n(name='c')], where=[ + compare(col('a', 'owner_id'), '==', col('c', 'owner_id')) + ]) + json_data = chain.to_json() + assert 'where' in json_data + restored = Chain.from_json(json_data) + assert len(restored.where) == 1 + + +def test_chain_from_json_literal(): + json_chain = { + 'chain': [ + n({'type': 'account'}, name='a').to_json(), + e_forward().to_json(), + n({'type': 'user'}, name='c').to_json(), + ], + 'where': [ + {'eq': {'left': 'a.owner_id', 'right': 'c.owner_id'}} + ], + } + chain = Chain.from_json(json_chain) + assert len(chain.where) == 1 + + +def test_gfql_chain_dict_with_where_executes(): + nodes_df = n({'type': 'account'}, name='a').to_json() + edge_json = e_forward().to_json() + user_json = n({'type': 'user'}, name='c').to_json() + json_chain = { + 'chain': [nodes_df, edge_json, user_json], + 'where': [{'eq': {'left': 'a.owner_id', 'right': 'c.owner_id'}}], + } + nodes_df = pd.DataFrame([ + {'id': 'acct1', 'type': 'account', 'owner_id': 'user1'}, + {'id': 'user1', 'type': 'user'}, + ]) + edges_df = pd.DataFrame([{'src': 'acct1', 'dst': 'user1'}]) + g = CGFull().nodes(nodes_df, 'id').edges(edges_df, 'src', 'dst') + res = g.gfql(json_chain) + assert res._nodes is not None diff --git a/mypy.ini b/mypy.ini index d3c38b0b90..e2a0cf3933 100644 --- a/mypy.ini +++ b/mypy.ini @@ -18,6 +18,9 @@ ignore_missing_imports = True [mypy-cupy.*] ignore_missing_imports = True +[mypy-tqdm.*] +ignore_missing_imports = True + 
[mypy-dask.*] ignore_missing_imports = True @@ -112,9 +115,6 @@ ignore_missing_imports = True [mypy-azure.kusto.*] ignore_missing_imports = True -[mypy-tqdm.*] -ignore_missing_imports = True - [mypy-requests.*] ignore_missing_imports = True diff --git a/tests/gfql/ref/cprofile_df_executor.py b/tests/gfql/ref/cprofile_df_executor.py new file mode 100644 index 0000000000..f87dd11046 --- /dev/null +++ b/tests/gfql/ref/cprofile_df_executor.py @@ -0,0 +1,140 @@ +""" +cProfile analysis of df_executor to find hotspots. + +Run with: + python -m tests.gfql.ref.cprofile_df_executor +""" +import cProfile +import pstats +import io +import pandas as pd +from typing import Tuple + +import graphistry +from graphistry.compute.ast import n, e_forward +from graphistry.gfql.same_path_types import col, compare, where_to_json + + +def make_graph(n_nodes: int, n_edges: int) -> Tuple[pd.DataFrame, pd.DataFrame]: + """Create a graph for profiling.""" + import random + random.seed(42) + + nodes = pd.DataFrame({ + 'id': list(range(n_nodes)), + 'v': list(range(n_nodes)), + }) + + edges_list = [] + for i in range(n_edges): + src = random.randint(0, n_nodes - 2) + dst = random.randint(src + 1, n_nodes - 1) + edges_list.append({'src': src, 'dst': dst, 'eid': i}) + edges = pd.DataFrame(edges_list).drop_duplicates(subset=['src', 'dst']) + + return nodes, edges + + +def profile_simple_query(g, n_runs=5): + """Profile a simple query.""" + chain = [n(name="a"), e_forward(name="e"), n(name="c")] + for _ in range(n_runs): + g.gfql({"chain": chain, "where": []}, engine="pandas") + + +def profile_multihop_query(g, n_runs=5): + """Profile a multihop query.""" + chain = [ + n({"id": 0}, name="a"), + e_forward(min_hops=1, max_hops=3, name="e"), + n(name="c") + ] + for _ in range(n_runs): + g.gfql({"chain": chain, "where": []}, engine="pandas") + + +def profile_where_query(g, n_runs=5): + """Profile a query with WHERE clause.""" + chain = [n(name="a"), e_forward(name="e"), n(name="c")] + where = [compare(col("a", "v"), "<", col("c", "v"))] + where_json = where_to_json(where) + for _ in range(n_runs): + g.gfql({"chain": chain, "where": where_json}, engine="pandas") + + +def profile_samepath_query(g_small, n_runs=5): + """Profile same-path executor (requires WHERE + cudf engine hint).""" + # The same-path executor is triggered by cudf engine + WHERE + # But we're using pandas, so we need to call it directly + from graphistry.compute.gfql.df_executor import ( + build_same_path_inputs, + execute_same_path_chain, + ) + from graphistry.Engine import Engine + + chain = [n(name="a"), e_forward(name="e"), n(name="c")] + where = [compare(col("a", "v"), "<", col("c", "v"))] + + for _ in range(n_runs): + inputs = build_same_path_inputs( + g_small, + chain, + where, + engine=Engine.PANDAS, + include_paths=False, + ) + execute_same_path_chain( + inputs.graph, + inputs.chain, + inputs.where, + inputs.engine, + inputs.include_paths, + ) + + +def run_profile(func, g, name): + """Run profiler and print top functions.""" + print(f"\n{'='*60}") + print(f"Profiling: {name}") + print(f"{'='*60}") + + profiler = cProfile.Profile() + profiler.enable() + func(g) + profiler.disable() + + # Get stats + s = io.StringIO() + stats = pstats.Stats(profiler, stream=s) + stats.sort_stats('cumulative') + stats.print_stats(30) # Top 30 functions + print(s.getvalue()) + + +def main(): + print("Creating large graph: 50K nodes, 200K edges") + nodes_df, edges_df = make_graph(50000, 200000) + g = graphistry.nodes(nodes_df, 'id').edges(edges_df, 'src', 'dst') + 
print(f"Large graph: {len(nodes_df)} nodes, {len(edges_df)} edges") + + print("Creating small graph: 1K nodes, 2K edges") + nodes_small, edges_small = make_graph(1000, 2000) + g_small = graphistry.nodes(nodes_small, 'id').edges(edges_small, 'src', 'dst') + print(f"Small graph: {len(nodes_small)} nodes, {len(edges_small)} edges") + + # Warmup + print("\nWarmup...") + chain = [n(name="a"), e_forward(name="e"), n(name="c")] + g.gfql({"chain": chain, "where": []}, engine="pandas") + + # Profile legacy chain on large graph + run_profile(profile_simple_query, g, "Simple query (n->e->n) - legacy chain, 50K nodes") + run_profile(profile_multihop_query, g, "Multihop query (n->e(1..3)->n) - legacy chain, 50K nodes") + run_profile(profile_where_query, g, "WHERE query (a.v < c.v) - legacy chain, 50K nodes") + + # Profile same-path executor on small graph (oracle has caps) + run_profile(lambda g: profile_samepath_query(g_small), g, "Same-path executor (n->e->n, a.v < c.v) - 1K nodes") + + +if __name__ == "__main__": + main() diff --git a/tests/gfql/ref/profile_df_executor.py b/tests/gfql/ref/profile_df_executor.py new file mode 100644 index 0000000000..5ad5b6f063 --- /dev/null +++ b/tests/gfql/ref/profile_df_executor.py @@ -0,0 +1,204 @@ +""" +Profile df_executor to identify optimization opportunities. + +Run with: + python -m tests.gfql.ref.profile_df_executor + +Outputs timing data for different chain complexities and graph sizes. +""" +import time +import pandas as pd +from typing import List, Dict, Any, Tuple +from dataclasses import dataclass + +# Import the executor and test utilities +import graphistry +from graphistry.compute.ast import n, e_forward, e_reverse, e_undirected +from graphistry.gfql.same_path_types import WhereComparison, StepColumnRef, col, compare, where_to_json + + +@dataclass +class ProfileResult: + scenario: str + nodes: int + edges: int + chain_desc: str + where_desc: str + time_ms: float + result_nodes: int + result_edges: int + + +def make_linear_graph(n_nodes: int, n_edges: int) -> Tuple[pd.DataFrame, pd.DataFrame]: + """Create a linear graph: 0 -> 1 -> 2 -> ... 
-> n-1""" + nodes = pd.DataFrame({ + 'id': list(range(n_nodes)), + 'v': list(range(n_nodes)), + }) + # Create edges ensuring we don't exceed available nodes + edges_list = [] + for i in range(min(n_edges, n_nodes - 1)): + edges_list.append({'src': i, 'dst': i + 1, 'eid': i}) + edges = pd.DataFrame(edges_list) + return nodes, edges + + +def make_dense_graph(n_nodes: int, n_edges: int) -> Tuple[pd.DataFrame, pd.DataFrame]: + """Create a denser graph with multiple paths.""" + import random + random.seed(42) + + nodes = pd.DataFrame({ + 'id': list(range(n_nodes)), + 'v': list(range(n_nodes)), + }) + + edges_list = [] + for i in range(n_edges): + src = random.randint(0, n_nodes - 2) + dst = random.randint(src + 1, n_nodes - 1) + edges_list.append({'src': src, 'dst': dst, 'eid': i}) + edges = pd.DataFrame(edges_list).drop_duplicates(subset=['src', 'dst']) + + return nodes, edges + + +def profile_query( + g: graphistry.Plottable, + chain: List[Any], + where: List[WhereComparison], + scenario: str, + n_nodes: int, + n_edges: int, + n_runs: int = 3 +) -> ProfileResult: + """Profile a single query, return average time.""" + + from graphistry.compute.chain import Chain + + # Convert WHERE to JSON format + where_json = where_to_json(where) if where else [] + + # Warmup + result = g.gfql({"chain": chain, "where": where_json}, engine="pandas") + + # Timed runs + times = [] + for _ in range(n_runs): + start = time.perf_counter() + result = g.gfql({"chain": chain, "where": where_json}, engine="pandas") + elapsed = time.perf_counter() - start + times.append(elapsed * 1000) # ms + + avg_time = sum(times) / len(times) + + chain_desc = " -> ".join(str(type(op).__name__) for op in chain) + where_desc = str(len(where)) + " clauses" if where else "none" + + return ProfileResult( + scenario=scenario, + nodes=n_nodes, + edges=n_edges, + chain_desc=chain_desc, + where_desc=where_desc, + time_ms=avg_time, + result_nodes=len(result._nodes) if result._nodes is not None else 0, + result_edges=len(result._edges) if result._edges is not None else 0, + ) + + +def run_profiles() -> List[ProfileResult]: + """Run all profiling scenarios.""" + results = [] + + # Define scenarios + scenarios = [ + # (name, n_nodes, n_edges, graph_type) + ('tiny', 100, 200, 'linear'), + ('small', 1000, 2000, 'linear'), + ('medium', 10000, 20000, 'linear'), + ('medium_dense', 10000, 50000, 'dense'), + ('large', 100000, 200000, 'linear'), + ('large_dense', 100000, 500000, 'dense'), + ] + + for scenario_name, n_nodes, n_edges, graph_type in scenarios: + print(f"\n=== Scenario: {scenario_name} ({n_nodes} nodes, {n_edges} edges, {graph_type}) ===") + + if graph_type == 'linear': + nodes_df, edges_df = make_linear_graph(n_nodes, n_edges) + else: + nodes_df, edges_df = make_dense_graph(n_nodes, n_edges) + + g = graphistry.nodes(nodes_df, 'id').edges(edges_df, 'src', 'dst') + + # Chain variants + chains = [ + ("simple", [n(name="a"), e_forward(name="e"), n(name="c")], []), + + ("with_filter", [ + n({"id": 0}, name="a"), + e_forward(name="e"), + n(name="c") + ], []), + + ("with_where_adjacent", [ + n(name="a"), + e_forward(name="e"), + n(name="c") + ], [compare(col("a", "v"), "<", col("c", "v"))]), + + ("multihop", [ + n({"id": 0}, name="a"), + e_forward(min_hops=1, max_hops=3, name="e"), + n(name="c") + ], []), + + ("multihop_with_where", [ + n({"id": 0}, name="a"), + e_forward(min_hops=1, max_hops=3, name="e"), + n(name="c") + ], [compare(col("a", "v"), "<", col("c", "v"))]), + ] + + for chain_name, chain, where in chains: + try: + result = 
profile_query( + g, chain, where, + f"{scenario_name}_{chain_name}", + n_nodes, n_edges + ) + results.append(result) + print(f" {chain_name}: {result.time_ms:.2f}ms " + f"(nodes={result.result_nodes}, edges={result.result_edges})") + except Exception as e: + print(f" {chain_name}: ERROR - {e}") + + return results + + +def main(): + print("=" * 60) + print("GFQL df_executor Profiling") + print("=" * 60) + + results = run_profiles() + + print("\n" + "=" * 60) + print("Summary") + print("=" * 60) + + # Group by scenario type + print("\nTiming by scenario:") + for r in results: + print(f" {r.scenario}: {r.time_ms:.2f}ms") + + # Identify hotspots + print("\nSlowest queries:") + sorted_results = sorted(results, key=lambda x: x.time_ms, reverse=True) + for r in sorted_results[:5]: + print(f" {r.scenario}: {r.time_ms:.2f}ms") + + +if __name__ == "__main__": + main() diff --git a/tests/gfql/ref/test_df_executor_inputs.py b/tests/gfql/ref/test_df_executor_inputs.py new file mode 100644 index 0000000000..665dc26fef --- /dev/null +++ b/tests/gfql/ref/test_df_executor_inputs.py @@ -0,0 +1,3293 @@ +import pandas as pd +import pytest + +from graphistry.Engine import Engine +from graphistry.compute import n, e_forward, e_reverse, e_undirected +from graphistry.compute.gfql.df_executor import ( + build_same_path_inputs, + DFSamePathExecutor, + execute_same_path_chain, + _CUDF_MODE_ENV, +) +from graphistry.compute.gfql_unified import gfql +from graphistry.compute.chain import Chain +from graphistry.gfql.same_path_types import col, compare +from graphistry.gfql.ref.enumerator import OracleCaps, enumerate_chain +from graphistry.tests.test_compute import CGFull + + +def _make_graph(): + nodes = pd.DataFrame( + [ + {"id": "acct1", "type": "account", "owner_id": "user1", "score": 5}, + {"id": "acct2", "type": "account", "owner_id": "user2", "score": 9}, + {"id": "user1", "type": "user", "score": 7}, + {"id": "user2", "type": "user", "score": 3}, + ] + ) + edges = pd.DataFrame( + [ + {"src": "acct1", "dst": "user1"}, + {"src": "acct2", "dst": "user2"}, + ] + ) + return CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + +def _make_hop_graph(): + nodes = pd.DataFrame( + [ + {"id": "acct1", "type": "account", "owner_id": "u1", "score": 1}, + {"id": "user1", "type": "user", "owner_id": "u1", "score": 5}, + {"id": "user2", "type": "user", "owner_id": "u1", "score": 7}, + {"id": "acct2", "type": "account", "owner_id": "u1", "score": 9}, + {"id": "user3", "type": "user", "owner_id": "u3", "score": 2}, + ] + ) + edges = pd.DataFrame( + [ + {"src": "acct1", "dst": "user1"}, + {"src": "user1", "dst": "user2"}, + {"src": "user2", "dst": "acct2"}, + {"src": "acct1", "dst": "user3"}, + ] + ) + return CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + +def test_build_inputs_collects_alias_metadata(): + chain = [ + n({"type": "account"}, name="a"), + e_forward(name="r"), + n({"type": "user", "id": "user1"}, name="c"), + ] + where = [compare(col("a", "owner_id"), "==", col("c", "owner_id"))] + graph = _make_graph() + + inputs = build_same_path_inputs(graph, chain, where, Engine.PANDAS) + + assert set(inputs.alias_bindings) == {"a", "r", "c"} + assert inputs.column_requirements["a"] == {"owner_id"} + assert inputs.column_requirements["c"] == {"owner_id"} + assert inputs.plan.bitsets + + +def test_missing_alias_raises(): + chain = [n(name="a"), e_forward(name="r"), n(name="c")] + where = [compare(col("missing", "x"), "==", col("c", "owner_id"))] + graph = _make_graph() + + with pytest.raises(ValueError): + 
build_same_path_inputs(graph, chain, where, Engine.PANDAS) + + +def test_forward_captures_alias_frames_and_prunes(): + graph = _make_graph() + chain = [ + n({"type": "account"}, name="a"), + e_forward(name="r"), + n({"type": "user", "id": "user1"}, name="c"), + ] + where = [compare(col("a", "owner_id"), "==", col("c", "id"))] + inputs = build_same_path_inputs(graph, chain, where, Engine.PANDAS) + executor = DFSamePathExecutor(inputs) + executor._forward() + + assert "a" in executor.alias_frames + a_nodes = executor.alias_frames["a"] + assert set(a_nodes.columns) == {"id", "owner_id"} + assert list(a_nodes["id"]) == ["acct1"] + + +def test_forward_matches_oracle_tags_on_equality(): + graph = _make_graph() + chain = [ + n({"type": "account"}, name="a"), + e_forward(name="r"), + n({"type": "user"}, name="c"), + ] + where = [compare(col("a", "owner_id"), "==", col("c", "id"))] + inputs = build_same_path_inputs(graph, chain, where, Engine.PANDAS) + executor = DFSamePathExecutor(inputs) + executor._forward() + + oracle = enumerate_chain( + graph, + chain, + where=where, + include_paths=False, + caps=OracleCaps(max_nodes=20, max_edges=20), + ) + assert oracle.tags is not None + assert set(executor.alias_frames["a"]["id"]) == oracle.tags["a"] + assert set(executor.alias_frames["c"]["id"]) == oracle.tags["c"] + + +def test_run_materializes_oracle_sets(): + graph = _make_graph() + chain = [ + n({"type": "account"}, name="a"), + e_forward(name="r"), + n({"type": "user"}, name="c"), + ] + where = [compare(col("a", "owner_id"), "==", col("c", "id"))] + + result = execute_same_path_chain(graph, chain, where, Engine.PANDAS) + oracle = enumerate_chain( + graph, + chain, + where=where, + include_paths=False, + caps=OracleCaps(max_nodes=20, max_edges=20), + ) + + assert result._nodes is not None + assert result._edges is not None + assert set(result._nodes["id"]) == set(oracle.nodes["id"]) + assert set(result._edges["src"]) == set(oracle.edges["src"]) + assert set(result._edges["dst"]) == set(oracle.edges["dst"]) + + +def test_forward_minmax_prune_matches_oracle(): + graph = _make_graph() + chain = [ + n({"type": "account"}, name="a"), + e_forward(name="r"), + n({"type": "user"}, name="c"), + ] + where = [compare(col("a", "score"), "<", col("c", "score"))] + inputs = build_same_path_inputs(graph, chain, where, Engine.PANDAS) + executor = DFSamePathExecutor(inputs) + executor._forward() + oracle = enumerate_chain( + graph, + chain, + where=where, + include_paths=False, + caps=OracleCaps(max_nodes=20, max_edges=20), + ) + assert oracle.tags is not None + assert set(executor.alias_frames["a"]["id"]) == oracle.tags["a"] + assert set(executor.alias_frames["c"]["id"]) == oracle.tags["c"] + + +def test_strict_mode_without_cudf_raises(monkeypatch): + graph = _make_graph() + chain = [ + n({"type": "account"}, name="a"), + e_forward(name="r"), + n({"type": "user"}, name="c"), + ] + where = [compare(col("a", "owner_id"), "==", col("c", "id"))] + monkeypatch.setenv(_CUDF_MODE_ENV, "strict") + inputs = build_same_path_inputs(graph, chain, where, Engine.CUDF) + executor = DFSamePathExecutor(inputs) + + cudf_available = True + try: + import cudf # type: ignore # noqa: F401 + except Exception: + cudf_available = False + + if cudf_available: + # If cudf exists, strict mode should proceed to GPU path (currently routes to oracle) + executor.run() + else: + with pytest.raises(RuntimeError): + executor.run() + + +def test_auto_mode_without_cudf_falls_back(monkeypatch): + graph = _make_graph() + chain = [ + n({"type": 
"account"}, name="a"), + e_forward(name="r"), + n({"type": "user"}, name="c"), + ] + where = [compare(col("a", "owner_id"), "==", col("c", "id"))] + monkeypatch.setenv(_CUDF_MODE_ENV, "auto") + inputs = build_same_path_inputs(graph, chain, where, Engine.CUDF) + executor = DFSamePathExecutor(inputs) + result = executor.run() + oracle = enumerate_chain( + graph, + chain, + where=where, + include_paths=False, + caps=OracleCaps(max_nodes=20, max_edges=20), + ) + + assert set(result._nodes["id"]) == set(oracle.nodes["id"]) + + +def test_gpu_path_parity_equality(): + graph = _make_graph() + chain = [ + n({"type": "account"}, name="a"), + e_forward(name="r"), + n({"type": "user"}, name="c"), + ] + where = [compare(col("a", "owner_id"), "==", col("c", "id"))] + inputs = build_same_path_inputs(graph, chain, where, Engine.PANDAS) + executor = DFSamePathExecutor(inputs) + executor._forward() + result = executor._run_gpu() + + oracle = enumerate_chain( + graph, + chain, + where=where, + include_paths=False, + caps=OracleCaps(max_nodes=20, max_edges=20), + ) + assert result._nodes is not None and result._edges is not None + assert set(result._nodes["id"]) == set(oracle.nodes["id"]) + assert set(result._edges["src"]) == set(oracle.edges["src"]) + assert set(result._edges["dst"]) == set(oracle.edges["dst"]) + + +def test_gpu_path_parity_inequality(): + graph = _make_graph() + chain = [ + n({"type": "account"}, name="a"), + e_forward(name="r"), + n({"type": "user"}, name="c"), + ] + where = [compare(col("a", "score"), ">", col("c", "score"))] + inputs = build_same_path_inputs(graph, chain, where, Engine.PANDAS) + executor = DFSamePathExecutor(inputs) + executor._forward() + result = executor._run_gpu() + + oracle = enumerate_chain( + graph, + chain, + where=where, + include_paths=False, + caps=OracleCaps(max_nodes=20, max_edges=20), + ) + assert result._nodes is not None and result._edges is not None + assert set(result._nodes["id"]) == set(oracle.nodes["id"]) + assert set(result._edges["src"]) == set(oracle.edges["src"]) + assert set(result._edges["dst"]) == set(oracle.edges["dst"]) + + +def _assert_parity(graph, chain, where): + inputs = build_same_path_inputs(graph, chain, where, Engine.PANDAS) + executor = DFSamePathExecutor(inputs) + executor._forward() + result = executor._run_native() + oracle = enumerate_chain( + graph, + chain, + where=where, + include_paths=False, + caps=OracleCaps(max_nodes=50, max_edges=50), + ) + assert result._nodes is not None and result._edges is not None + assert set(result._nodes["id"]) == set(oracle.nodes["id"]) + assert set(result._edges["src"]) == set(oracle.edges["src"]) + assert set(result._edges["dst"]) == set(oracle.edges["dst"]) + + +@pytest.mark.parametrize( + "edge_kwargs", + [ + {"min_hops": 2, "max_hops": 3}, + {"min_hops": 1, "max_hops": 3, "output_min_hops": 3, "output_max_hops": 3}, + ], + ids=["hop_range", "output_slice"], +) +def test_same_path_hop_params_parity(edge_kwargs): + graph = _make_hop_graph() + chain = [ + n({"type": "account"}, name="a"), + e_forward(**edge_kwargs), + n(name="c"), + ] + where = [compare(col("a", "owner_id"), "==", col("c", "owner_id"))] + _assert_parity(graph, chain, where) + + +def test_same_path_hop_labels_propagate(): + graph = _make_hop_graph() + chain = [ + n({"type": "account"}, name="a"), + e_forward( + min_hops=1, + max_hops=2, + label_node_hops="node_hop", + label_edge_hops="edge_hop", + label_seeds=True, + ), + n(name="c"), + ] + where = [compare(col("a", "owner_id"), "==", col("c", "owner_id"))] + inputs = 
build_same_path_inputs(graph, chain, where, Engine.PANDAS) + executor = DFSamePathExecutor(inputs) + executor._forward() + result = executor._run_gpu() + + assert result._nodes is not None and result._edges is not None + assert "node_hop" in result._nodes.columns + assert "edge_hop" in result._edges.columns + assert result._nodes["node_hop"].notna().any() + assert result._edges["edge_hop"].notna().any() + + +def test_topology_parity_scenarios(): + scenarios = [] + + nodes_cycle = pd.DataFrame( + [ + {"id": "a1", "type": "account", "value": 1}, + {"id": "a2", "type": "account", "value": 3}, + {"id": "b1", "type": "user", "value": 5}, + {"id": "b2", "type": "user", "value": 2}, + ] + ) + edges_cycle = pd.DataFrame( + [ + {"src": "a1", "dst": "b1"}, + {"src": "a1", "dst": "b2"}, # branch + {"src": "b1", "dst": "a2"}, # cycle back + ] + ) + chain_cycle = [ + n({"type": "account"}, name="a"), + e_forward(name="r1"), + n({"type": "user"}, name="b"), + e_forward(name="r2"), + n({"type": "account"}, name="c"), + ] + where_cycle = [compare(col("a", "value"), "<", col("c", "value"))] + scenarios.append((nodes_cycle, edges_cycle, chain_cycle, where_cycle, None)) + + nodes_mixed = pd.DataFrame( + [ + {"id": "a1", "type": "account", "owner_id": "u1", "score": 2}, + {"id": "a2", "type": "account", "owner_id": "u2", "score": 7}, + {"id": "u1", "type": "user", "score": 9}, + {"id": "u2", "type": "user", "score": 1}, + {"id": "u3", "type": "user", "score": 5}, + ] + ) + edges_mixed = pd.DataFrame( + [ + {"src": "a1", "dst": "u1"}, + {"src": "a2", "dst": "u2"}, + {"src": "a2", "dst": "u3"}, + ] + ) + chain_mixed = [ + n({"type": "account"}, name="a"), + e_forward(name="r1"), + n({"type": "user"}, name="b"), + e_forward(name="r2"), + n({"type": "account"}, name="c"), + ] + where_mixed = [ + compare(col("a", "owner_id"), "==", col("b", "id")), + compare(col("b", "score"), ">", col("c", "score")), + ] + scenarios.append((nodes_mixed, edges_mixed, chain_mixed, where_mixed, None)) + + nodes_edge_filter = pd.DataFrame( + [ + {"id": "acct1", "type": "account", "owner_id": "user1"}, + {"id": "acct2", "type": "account", "owner_id": "user2"}, + {"id": "user1", "type": "user"}, + {"id": "user2", "type": "user"}, + {"id": "user3", "type": "user"}, + ] + ) + edges_edge_filter = pd.DataFrame( + [ + {"src": "acct1", "dst": "user1", "etype": "owns"}, + {"src": "acct2", "dst": "user2", "etype": "owns"}, + {"src": "acct1", "dst": "user3", "etype": "follows"}, + ] + ) + chain_edge_filter = [ + n({"type": "account"}, name="a"), + e_forward({"etype": "owns"}, name="r"), + n({"type": "user"}, name="c"), + ] + where_edge_filter = [compare(col("a", "owner_id"), "==", col("c", "id"))] + scenarios.append((nodes_edge_filter, edges_edge_filter, chain_edge_filter, where_edge_filter, {"dst": {"user1", "user2"}})) + + for nodes_df, edges_df, chain, where, edge_expect in scenarios: + graph = CGFull().nodes(nodes_df, "id").edges(edges_df, "src", "dst") + _assert_parity(graph, chain, where) + if edge_expect: + assert graph._edge is None or "etype" in edges_df.columns # guard unused expectation + result = execute_same_path_chain(graph, chain, where, Engine.PANDAS) + assert result._edges is not None + if "dst" in edge_expect: + assert set(result._edges["dst"]) == edge_expect["dst"] + + +def test_cudf_gpu_path_if_available(): + cudf = pytest.importorskip("cudf") + nodes = cudf.DataFrame( + [ + {"id": "acct1", "type": "account", "owner_id": "user1", "score": 5}, + {"id": "acct2", "type": "account", "owner_id": "user2", "score": 9}, + {"id": 
"user1", "type": "user", "score": 7}, + {"id": "user2", "type": "user", "score": 3}, + ] + ) + edges = cudf.DataFrame( + [ + {"src": "acct1", "dst": "user1"}, + {"src": "acct2", "dst": "user2"}, + ] + ) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + chain = [ + n({"type": "account"}, name="a"), + e_forward(name="r"), + n({"type": "user"}, name="c"), + ] + where = [compare(col("a", "owner_id"), "==", col("c", "id"))] + inputs = build_same_path_inputs(graph, chain, where, Engine.CUDF) + executor = DFSamePathExecutor(inputs) + result = executor.run() + + assert result._nodes is not None and result._edges is not None + assert set(result._nodes["id"].to_pandas()) == {"acct1", "acct2"} + assert set(result._edges["src"].to_pandas()) == {"acct1", "acct2"} + + +def test_dispatch_dict_where_triggers_executor(): + pytest.importorskip("cudf") + graph = _make_graph() + query = { + "chain": [ + {"type": "Node", "name": "a", "filter_dict": {"type": "account"}}, + {"type": "Edge", "name": "r", "direction": "forward", "hops": 1}, + {"type": "Node", "name": "c", "filter_dict": {"type": "user"}}, + ], + "where": [{"eq": {"left": "a.owner_id", "right": "c.id"}}], + } + result = gfql(graph, query, engine=Engine.CUDF) + oracle = enumerate_chain( + graph, [n({"type": "account"}, name="a"), e_forward(name="r"), n({"type": "user"}, name="c")], + where=[compare(col("a", "owner_id"), "==", col("c", "id"))], + include_paths=False, + caps=OracleCaps(max_nodes=20, max_edges=20), + ) + assert result._nodes is not None and result._edges is not None + assert set(result._nodes["id"]) == set(oracle.nodes["id"]) + assert set(result._edges["src"]) == set(oracle.edges["src"]) + assert set(result._edges["dst"]) == set(oracle.edges["dst"]) + + +def test_dispatch_chain_list_and_single_ast(): + graph = _make_graph() + chain_ops = [ + n({"type": "account"}, name="a"), + e_forward(name="r"), + n({"type": "user"}, name="c"), + ] + where = [compare(col("a", "owner_id"), "==", col("c", "id"))] + + for query in [Chain(chain_ops, where=where), chain_ops]: + result = gfql(graph, query, engine=Engine.PANDAS) + oracle = enumerate_chain( + graph, + chain_ops if isinstance(query, list) else list(chain_ops), + where=where, + include_paths=False, + caps=OracleCaps(max_nodes=20, max_edges=20), + ) + assert result._nodes is not None and result._edges is not None + assert set(result._nodes["id"]) == set(oracle.nodes["id"]) + assert set(result._edges["src"]) == set(oracle.edges["src"]) + assert set(result._edges["dst"]) == set(oracle.edges["dst"]) + + +# ============================================================================ +# Feature Composition Tests - Multi-hop + WHERE +# ============================================================================ +# +# KNOWN LIMITATION: The cuDF same-path executor has architectural limitations +# with multi-hop edges combined with WHERE clauses: +# +# 1. Backward prune assumes single-hop edges where each edge step directly +# connects adjacent node steps. Multi-hop edges break this assumption. +# +# 2. For multi-hop edges, _is_single_hop() gates WHERE clause filtering, +# so WHERE between start/end of a multi-hop edge may not be applied +# during backward prune. +# +# 3. The oracle correctly handles these cases, so oracle parity tests +# catch the discrepancy. +# +# These tests are marked xfail to document the known limitations. +# See issue #871 for the testing roadmap. 
+# ============================================================================
+
+
+class TestP0FeatureComposition:
+    """
+    Critical tests for hop ranges + WHERE clause composition.
+    These catch subtle bugs in feature interactions.
+
+    These tests are currently xfail due to known limitations in the
+    cuDF executor's handling of multi-hop + WHERE combinations.
+    """
+
+    def test_where_respected_after_min_hops_backtracking(self):
+        """
+        P0 Test 1: WHERE must be respected after min_hops backtracking.
+
+        Graph:
+          a(v=5) -> b -> c -> d(v=10)  (3 hops, satisfies WHERE)
+          a(v=5) -> x -> y(v=2)        (2 hops, in range but fails WHERE)
+
+        Chain: n(a) -[min_hops=2, max_hops=3]-> n(end)
+        WHERE: a.value < end.value
+
+        Both branches fall inside the hop range, so hop filtering keeps them;
+        WHERE must still filter: only paths where a.value < end.value remain.
+
+        Risk: Backtracking may keep paths that violate WHERE.
+        """
+        nodes = pd.DataFrame([
+            {"id": "a", "type": "start", "value": 5},
+            {"id": "b", "type": "mid", "value": 3},
+            {"id": "c", "type": "mid", "value": 7},
+            {"id": "d", "type": "end", "value": 10},  # a.value(5) < d.value(10) ✓
+            {"id": "x", "type": "mid", "value": 1},
+            {"id": "y", "type": "end", "value": 2},  # a.value(5) < y.value(2) ✗
+        ])
+        edges = pd.DataFrame([
+            {"src": "a", "dst": "b"},
+            {"src": "b", "dst": "c"},
+            {"src": "c", "dst": "d"},
+            {"src": "a", "dst": "x"},
+            {"src": "x", "dst": "y"},
+        ])
+        graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst")
+
+        chain = [
+            n({"type": "start"}, name="start"),
+            e_forward(min_hops=2, max_hops=3),
+            n(name="end"),
+        ]
+        where = [compare(col("start", "value"), "<", col("end", "value"))]
+
+        _assert_parity(graph, chain, where)
+
+        # Explicit check: y should NOT be in results (violates WHERE)
+        result = execute_same_path_chain(graph, chain, where, Engine.PANDAS)
+        assert result._nodes is not None
+        result_ids = set(result._nodes["id"])
+        # y violates WHERE (5 < 2 is false), should not be included
+        assert "y" not in result_ids, "Node y violates WHERE but was included"
+        # d satisfies WHERE (5 < 10 is true), should be included
+        assert "d" in result_ids, "Node d satisfies WHERE but was excluded"
+
+    def test_reverse_direction_where_semantics(self):
+        """
+        P0 Test 2: WHERE semantics must be consistent with reverse direction.
+
+        Graph: a(v=1) -> b(v=5) -> c(v=3) -> d(v=9)
+
+        Chain: n(name='start') -[e_reverse, min_hops=2]-> n(name='end')
+        Starting at d, traversing backward.
+        WHERE: start.value > end.value
+
+        Reverse traversal from d:
+        - hop 1: c (start=d, v=9)
+        - hop 2: b (end=b, v=5) -> d.value(9) > b.value(5) ✓
+        - hop 3: a (end=a, v=1) -> d.value(9) > a.value(1) ✓
+
+        Risk: Direction swap could flip WHERE semantics.
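+
+        Expected: d (the start) and both valid endpoints a and b survive.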
+ """ + nodes = pd.DataFrame([ + {"id": "a", "value": 1}, + {"id": "b", "value": 5}, + {"id": "c", "value": 3}, + {"id": "d", "value": 9}, + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, + {"src": "b", "dst": "c"}, + {"src": "c", "dst": "d"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n({"id": "d"}, name="start"), + e_reverse(min_hops=2, max_hops=3), + n(name="end"), + ] + where = [compare(col("start", "value"), ">", col("end", "value"))] + + _assert_parity(graph, chain, where) + + # Explicit check + result = execute_same_path_chain(graph, chain, where, Engine.PANDAS) + assert result._nodes is not None + result_ids = set(result._nodes["id"]) + # start is d (v=9), end can be b(v=5) or a(v=1) + # Both satisfy 9 > 5 and 9 > 1 + assert "a" in result_ids or "b" in result_ids, "Valid endpoints excluded" + # d is start, should be included + assert "d" in result_ids, "Start node excluded" + + def test_non_adjacent_alias_where(self): + """ + P0 Test 3: WHERE between non-adjacent aliases must be applied. + + Chain: n(name='a') -> e -> n(name='b') -> e -> n(name='c') + WHERE: a.id == c.id (aliases 2 edges apart) + + This tests cycles where we return to the starting node. + + Graph: + x -> y -> x (cycle) + x -> y -> z (no cycle) + + Only paths where a.id == c.id should be kept. + + Risk: cuDF backward prune only checks adjacent aliases. + """ + nodes = pd.DataFrame([ + {"id": "x", "type": "node"}, + {"id": "y", "type": "node"}, + {"id": "z", "type": "node"}, + ]) + edges = pd.DataFrame([ + {"src": "x", "dst": "y"}, + {"src": "y", "dst": "x"}, # cycle back + {"src": "y", "dst": "z"}, # no cycle + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n(name="a"), + e_forward(name="e1"), + n(name="b"), + e_forward(name="e2"), + n(name="c"), + ] + where = [compare(col("a", "id"), "==", col("c", "id"))] + + _assert_parity(graph, chain, where) + + # Explicit check: only x->y->x path satisfies a.id == c.id + result = execute_same_path_chain(graph, chain, where, Engine.PANDAS) + oracle = enumerate_chain( + graph, chain, where=where, include_paths=False, + caps=OracleCaps(max_nodes=50, max_edges=50), + ) + + # z should NOT be in results (x != z) + assert "z" not in set(oracle.nodes["id"]), "z violates WHERE but oracle included it" + if result._nodes is not None and not result._nodes.empty: + assert "z" not in set(result._nodes["id"]), "z violates WHERE but executor included it" + + def test_non_adjacent_alias_where_inequality(self): + """ + P0 Test 3b: Non-adjacent WHERE with inequality operators (<, >, <=, >=). + + Chain: n(name='a') -> e -> n(name='b') -> e -> n(name='c') + WHERE: a.v < c.v (aliases 2 edges apart, inequality) + + Graph with numeric values: + n1(v=1) -> n2(v=5) -> n3(v=10) + n1(v=1) -> n2(v=5) -> n4(v=3) + + Paths: + n1 -> n2 -> n3: a.v=1 < c.v=10 (valid) + n1 -> n2 -> n4: a.v=1 < c.v=3 (valid) + + All paths satisfy a.v < c.v. 
+ """ + nodes = pd.DataFrame([ + {"id": "n1", "v": 1}, + {"id": "n2", "v": 5}, + {"id": "n3", "v": 10}, + {"id": "n4", "v": 3}, + ]) + edges = pd.DataFrame([ + {"src": "n1", "dst": "n2"}, + {"src": "n2", "dst": "n3"}, + {"src": "n2", "dst": "n4"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n(name="a"), + e_forward(name="e1"), + n(name="b"), + e_forward(name="e2"), + n(name="c"), + ] + where = [compare(col("a", "v"), "<", col("c", "v"))] + + _assert_parity(graph, chain, where) + + def test_non_adjacent_alias_where_inequality_filters(self): + """ + P0 Test 3c: Non-adjacent WHERE inequality that actually filters some paths. + + Chain: n(name='a') -> e -> n(name='b') -> e -> n(name='c') + WHERE: a.v > c.v (start value must be greater than end value) + + Graph: + n1(v=10) -> n2(v=5) -> n3(v=1) a.v=10 > c.v=1 (valid) + n1(v=10) -> n2(v=5) -> n4(v=20) a.v=10 > c.v=20 (invalid) + + Only paths where a.v > c.v should be kept. + """ + nodes = pd.DataFrame([ + {"id": "n1", "v": 10}, + {"id": "n2", "v": 5}, + {"id": "n3", "v": 1}, + {"id": "n4", "v": 20}, + ]) + edges = pd.DataFrame([ + {"src": "n1", "dst": "n2"}, + {"src": "n2", "dst": "n3"}, + {"src": "n2", "dst": "n4"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n(name="a"), + e_forward(name="e1"), + n(name="b"), + e_forward(name="e2"), + n(name="c"), + ] + where = [compare(col("a", "v"), ">", col("c", "v"))] + + _assert_parity(graph, chain, where) + + # Explicit check: n4 should NOT be in results (10 > 20 is false) + result = execute_same_path_chain(graph, chain, where, Engine.PANDAS) + oracle = enumerate_chain( + graph, chain, where=where, include_paths=False, + caps=OracleCaps(max_nodes=50, max_edges=50), + ) + + assert "n4" not in set(oracle.nodes["id"]), "n4 violates WHERE but oracle included it" + if result._nodes is not None and not result._nodes.empty: + assert "n4" not in set(result._nodes["id"]), "n4 violates WHERE but executor included it" + # n3 should be included (10 > 1 is true) + assert "n3" in set(oracle.nodes["id"]), "n3 satisfies WHERE but oracle excluded it" + + def test_non_adjacent_alias_where_not_equal(self): + """ + P0 Test 3d: Non-adjacent WHERE with != operator. + + Chain: n(name='a') -> e -> n(name='b') -> e -> n(name='c') + WHERE: a.id != c.id (aliases must be different nodes) + + Graph: + x -> y -> x (cycle, a.id == c.id, should be excluded) + x -> y -> z (different, a.id != c.id, should be included) + + Only paths where a.id != c.id should be kept. 
+ """ + nodes = pd.DataFrame([ + {"id": "x", "type": "node"}, + {"id": "y", "type": "node"}, + {"id": "z", "type": "node"}, + ]) + edges = pd.DataFrame([ + {"src": "x", "dst": "y"}, + {"src": "y", "dst": "x"}, # cycle back + {"src": "y", "dst": "z"}, # no cycle + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n(name="a"), + e_forward(name="e1"), + n(name="b"), + e_forward(name="e2"), + n(name="c"), + ] + where = [compare(col("a", "id"), "!=", col("c", "id"))] + + _assert_parity(graph, chain, where) + + # Explicit check: x->y->x path should be excluded (x == x) + # x->y->z path should be included (x != z) + result = execute_same_path_chain(graph, chain, where, Engine.PANDAS) + oracle = enumerate_chain( + graph, chain, where=where, include_paths=False, + caps=OracleCaps(max_nodes=50, max_edges=50), + ) + + # z should be in results (x != z) + assert "z" in set(oracle.nodes["id"]), "z satisfies WHERE but oracle excluded it" + if result._nodes is not None and not result._nodes.empty: + assert "z" in set(result._nodes["id"]), "z satisfies WHERE but executor excluded it" + + def test_non_adjacent_alias_where_lte_gte(self): + """ + P0 Test 3e: Non-adjacent WHERE with <= and >= operators. + + Chain: n(name='a') -> e -> n(name='b') -> e -> n(name='c') + WHERE: a.v <= c.v (start value must be <= end value) + + Graph: + n1(v=5) -> n2(v=5) -> n3(v=5) a.v=5 <= c.v=5 (valid, equal) + n1(v=5) -> n2(v=5) -> n4(v=10) a.v=5 <= c.v=10 (valid, less) + n1(v=5) -> n2(v=5) -> n5(v=1) a.v=5 <= c.v=1 (invalid) + + Only paths where a.v <= c.v should be kept. + """ + nodes = pd.DataFrame([ + {"id": "n1", "v": 5}, + {"id": "n2", "v": 5}, + {"id": "n3", "v": 5}, + {"id": "n4", "v": 10}, + {"id": "n5", "v": 1}, + ]) + edges = pd.DataFrame([ + {"src": "n1", "dst": "n2"}, + {"src": "n2", "dst": "n3"}, + {"src": "n2", "dst": "n4"}, + {"src": "n2", "dst": "n5"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n(name="a"), + e_forward(name="e1"), + n(name="b"), + e_forward(name="e2"), + n(name="c"), + ] + where = [compare(col("a", "v"), "<=", col("c", "v"))] + + _assert_parity(graph, chain, where) + + # Explicit check + result = execute_same_path_chain(graph, chain, where, Engine.PANDAS) + oracle = enumerate_chain( + graph, chain, where=where, include_paths=False, + caps=OracleCaps(max_nodes=50, max_edges=50), + ) + + # n5 should NOT be in results (5 <= 1 is false) + assert "n5" not in set(oracle.nodes["id"]), "n5 violates WHERE but oracle included it" + if result._nodes is not None and not result._nodes.empty: + assert "n5" not in set(result._nodes["id"]), "n5 violates WHERE but executor included it" + # n3 and n4 should be included + assert "n3" in set(oracle.nodes["id"]), "n3 satisfies WHERE but oracle excluded it" + assert "n4" in set(oracle.nodes["id"]), "n4 satisfies WHERE but oracle excluded it" + + def test_non_adjacent_where_forward_forward(self): + """ + P0 Test 3f: Non-adjacent WHERE with forward-forward topology (a->b->c). + + This is the base case already covered, but explicit for completeness. 
+ """ + nodes = pd.DataFrame([ + {"id": "a", "v": 1}, + {"id": "b", "v": 5}, + {"id": "c", "v": 10}, + {"id": "d", "v": 0}, # a->b->d where 1 > 0 + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, + {"src": "b", "dst": "c"}, + {"src": "b", "dst": "d"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n(name="start"), + e_forward(), + n(name="mid"), + e_forward(), + n(name="end"), + ] + where = [compare(col("start", "v"), "<", col("end", "v"))] + + _assert_parity(graph, chain, where) + + # c (v=10) should be included (1 < 10), d (v=0) should be excluded (1 < 0 is false) + result = execute_same_path_chain(graph, chain, where, Engine.PANDAS) + assert "c" in set(result._nodes["id"]), "c satisfies WHERE but excluded" + assert "d" not in set(result._nodes["id"]), "d violates WHERE but included" + + def test_non_adjacent_where_reverse_reverse(self): + """ + P0 Test 3g: Non-adjacent WHERE with reverse-reverse topology (a<-b<-c). + + Graph edges: c->b->a (but we traverse in reverse) + Chain: n(start) <-e- n(mid) <-e- n(end) + Semantically: start is where we begin, end is where we finish traversing. + """ + nodes = pd.DataFrame([ + {"id": "a", "v": 1}, + {"id": "b", "v": 5}, + {"id": "c", "v": 10}, + {"id": "d", "v": 0}, + ]) + # Edges go c->b->a, but we traverse backwards + edges = pd.DataFrame([ + {"src": "c", "dst": "b"}, + {"src": "b", "dst": "a"}, + {"src": "d", "dst": "b"}, # d->b, so traversing reverse: b<-d + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n(name="start"), + e_reverse(), + n(name="mid"), + e_reverse(), + n(name="end"), + ] + # start.v < end.v means the node we start at has smaller v than where we end + where = [compare(col("start", "v"), "<", col("end", "v"))] + + _assert_parity(graph, chain, where) + + def test_non_adjacent_where_forward_reverse(self): + """ + P0 Test 3h: Non-adjacent WHERE with forward-reverse topology (a->b<-c). + + Graph: a->b and c->b (both point to b) + Chain: n(start) -e-> n(mid) <-e- n(end) + This finds paths where start reaches mid via forward, and end reaches mid via reverse. + """ + nodes = pd.DataFrame([ + {"id": "a", "v": 1}, + {"id": "b", "v": 5}, + {"id": "c", "v": 10}, + {"id": "d", "v": 2}, + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, # a->b (forward from a) + {"src": "c", "dst": "b"}, # c->b (reverse to reach c from b) + {"src": "d", "dst": "b"}, # d->b (reverse to reach d from b) + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n(name="start"), + e_forward(), + n(name="mid"), + e_reverse(), + n(name="end"), + ] + # start.v < end.v: 1 < 10 (a,c valid), 1 < 2 (a,d valid) + where = [compare(col("start", "v"), "<", col("end", "v"))] + + _assert_parity(graph, chain, where) + + result = execute_same_path_chain(graph, chain, where, Engine.PANDAS) + result_nodes = set(result._nodes["id"]) + # Both c and d should be reachable and satisfy the constraint + assert "c" in result_nodes, "c satisfies WHERE but excluded" + assert "d" in result_nodes, "d satisfies WHERE but excluded" + + def test_non_adjacent_where_reverse_forward(self): + """ + P0 Test 3i: Non-adjacent WHERE with reverse-forward topology (a<-b->c). + + Graph: b->a, b->c, b->d (b points to all) + Chain: n(start) <-e- n(mid) -e-> n(end) + + Valid paths with start.v < end.v: + a(v=1) -> b -> c(v=10): 1 < 10 valid + a(v=1) -> b -> d(v=0): 1 < 0 invalid (but d can still be start!) 
+ d(v=0) -> b -> a(v=1): 0 < 1 valid + d(v=0) -> b -> c(v=10): 0 < 10 valid + """ + nodes = pd.DataFrame([ + {"id": "a", "v": 1}, + {"id": "b", "v": 5}, + {"id": "c", "v": 10}, + {"id": "d", "v": 0}, + ]) + edges = pd.DataFrame([ + {"src": "b", "dst": "a"}, # b->a (reverse from a to reach b) + {"src": "b", "dst": "c"}, # b->c (forward from b) + {"src": "b", "dst": "d"}, # b->d (reverse from d to reach b) + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n(name="start"), + e_reverse(), + n(name="mid"), + e_forward(), + n(name="end"), + ] + # start.v < end.v + where = [compare(col("start", "v"), "<", col("end", "v"))] + + _assert_parity(graph, chain, where) + + result = execute_same_path_chain(graph, chain, where, Engine.PANDAS) + result_nodes = set(result._nodes["id"]) + # All nodes participate in valid paths + assert "a" in result_nodes, "a can be start (a->b->c) or end (d->b->a)" + assert "c" in result_nodes, "c can be end for valid paths" + assert "d" in result_nodes, "d can be start (d->b->a, d->b->c)" + + def test_non_adjacent_where_multihop_forward(self): + """ + P0 Test 3j: Non-adjacent WHERE with multi-hop edge (a-[1..2]->b->c). + + Chain: n(start) -[hops 1-2]-> n(mid) -e-> n(end) + """ + nodes = pd.DataFrame([ + {"id": "a", "v": 1}, + {"id": "b", "v": 5}, + {"id": "c", "v": 10}, + {"id": "d", "v": 3}, + {"id": "e", "v": 0}, + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, # 1 hop: a->b + {"src": "b", "dst": "c"}, # 1 hop from b, or 2 hops from a + {"src": "c", "dst": "d"}, # endpoint from c + {"src": "c", "dst": "e"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n(name="start"), + e_forward(min_hops=1, max_hops=2), # Can reach b (1 hop) or c (2 hops) + n(name="mid"), + e_forward(), + n(name="end"), + ] + # start.v < end.v + where = [compare(col("start", "v"), "<", col("end", "v"))] + + _assert_parity(graph, chain, where) + + def test_non_adjacent_where_multihop_reverse(self): + """ + P0 Test 3k: Non-adjacent WHERE with multi-hop reverse edge. + + Chain: n(start) <-[hops 1-2]- n(mid) <-e- n(end) + """ + nodes = pd.DataFrame([ + {"id": "a", "v": 1}, + {"id": "b", "v": 5}, + {"id": "c", "v": 10}, + {"id": "d", "v": 15}, + ]) + # Edges for reverse traversal + edges = pd.DataFrame([ + {"src": "b", "dst": "a"}, # reverse: a <- b + {"src": "c", "dst": "b"}, # reverse: b <- c (2 hops from a) + {"src": "d", "dst": "c"}, # reverse: c <- d + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n(name="start"), + e_reverse(min_hops=1, max_hops=2), + n(name="mid"), + e_reverse(), + n(name="end"), + ] + where = [compare(col("start", "v"), "<", col("end", "v"))] + + _assert_parity(graph, chain, where) + + # ===== Single-hop topology tests (direct a->c without middle node) ===== + + def test_single_hop_forward_where(self): + """ + P0 Test 4a: Single-hop forward topology (a->c). 
+ + Chain: n(start) -e-> n(end), WHERE start.v < end.v + """ + nodes = pd.DataFrame([ + {"id": "a", "v": 1}, + {"id": "b", "v": 5}, + {"id": "c", "v": 10}, + {"id": "d", "v": 0}, # d.v < all others + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, + {"src": "a", "dst": "c"}, + {"src": "b", "dst": "c"}, + {"src": "c", "dst": "d"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n(name="start"), + e_forward(), + n(name="end"), + ] + where = [compare(col("start", "v"), "<", col("end", "v"))] + + _assert_parity(graph, chain, where) + + def test_single_hop_reverse_where(self): + """ + P0 Test 4b: Single-hop reverse topology (a<-c). + + Chain: n(start) <-e- n(end), WHERE start.v < end.v + """ + nodes = pd.DataFrame([ + {"id": "a", "v": 1}, + {"id": "b", "v": 5}, + {"id": "c", "v": 10}, + ]) + edges = pd.DataFrame([ + {"src": "b", "dst": "a"}, # reverse: a <- b + {"src": "c", "dst": "b"}, # reverse: b <- c + {"src": "c", "dst": "a"}, # reverse: a <- c + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n(name="start"), + e_reverse(), + n(name="end"), + ] + where = [compare(col("start", "v"), "<", col("end", "v"))] + + _assert_parity(graph, chain, where) + + def test_single_hop_undirected_where(self): + """ + P0 Test 4c: Single-hop undirected topology (a<->c). + + Chain: n(start) <-e-> n(end), WHERE start.v < end.v + Tests both directions of each edge. + """ + nodes = pd.DataFrame([ + {"id": "a", "v": 1}, + {"id": "b", "v": 5}, + {"id": "c", "v": 10}, + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, + {"src": "b", "dst": "c"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n(name="start"), + e_undirected(), + n(name="end"), + ] + where = [compare(col("start", "v"), "<", col("end", "v"))] + + _assert_parity(graph, chain, where) + + def test_single_hop_with_self_loop(self): + """ + P0 Test 4d: Single-hop with self-loop (a->a). + + Tests that self-loops are handled correctly. + """ + nodes = pd.DataFrame([ + {"id": "a", "v": 5}, + {"id": "b", "v": 10}, + {"id": "c", "v": 15}, + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "a"}, # Self-loop + {"src": "a", "dst": "b"}, + {"src": "b", "dst": "b"}, # Self-loop + {"src": "b", "dst": "c"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n(name="start"), + e_forward(), + n(name="end"), + ] + # start.v < end.v: self-loops fail (5 < 5 = false) + where = [compare(col("start", "v"), "<", col("end", "v"))] + + _assert_parity(graph, chain, where) + + def test_single_hop_equality_self_loop(self): + """ + P0 Test 4e: Single-hop equality with self-loop. + + Self-loops satisfy start.v == end.v. + """ + nodes = pd.DataFrame([ + {"id": "a", "v": 5}, + {"id": "b", "v": 5}, # Same value as a + {"id": "c", "v": 10}, + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "a"}, # Self-loop: 5 == 5 + {"src": "a", "dst": "b"}, # a->b: 5 == 5 + {"src": "a", "dst": "c"}, # a->c: 5 != 10 + {"src": "b", "dst": "b"}, # Self-loop: 5 == 5 + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n(name="start"), + e_forward(), + n(name="end"), + ] + where = [compare(col("start", "v"), "==", col("end", "v"))] + + _assert_parity(graph, chain, where) + + # ===== Cycle topology tests ===== + + def test_cycle_single_node(self): + """ + P0 Test 5a: Self-loop cycle (a->a). + + Tests single-node cycles with WHERE clause. 
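+
+        With start.v < end.v, the a->a self-loop fails (5 < 5 is false),
+        while paths reaching b (v=10) satisfy it.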
+ """ + nodes = pd.DataFrame([ + {"id": "a", "v": 5}, + {"id": "b", "v": 10}, + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "a"}, # Self-loop + {"src": "a", "dst": "b"}, + {"src": "b", "dst": "a"}, # Creates cycle a->b->a + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n(name="start"), + e_forward(min_hops=1, max_hops=2), + n(name="end"), + ] + # start.v < end.v + where = [compare(col("start", "v"), "<", col("end", "v"))] + + _assert_parity(graph, chain, where) + + def test_cycle_triangle(self): + """ + P0 Test 5b: Triangle cycle (a->b->c->a). + + Tests cycles in multi-hop traversal. + """ + nodes = pd.DataFrame([ + {"id": "a", "v": 1}, + {"id": "b", "v": 5}, + {"id": "c", "v": 10}, + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, + {"src": "b", "dst": "c"}, + {"src": "c", "dst": "a"}, # Completes the triangle + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n(name="start"), + e_forward(min_hops=1, max_hops=3), + n(name="end"), + ] + where = [compare(col("start", "v"), "<", col("end", "v"))] + + _assert_parity(graph, chain, where) + + def test_cycle_with_branch(self): + """ + P0 Test 5c: Cycle with branch (a->b->a and a->c). + + Tests cycles combined with branching topology. + """ + nodes = pd.DataFrame([ + {"id": "a", "v": 1}, + {"id": "b", "v": 5}, + {"id": "c", "v": 10}, + {"id": "d", "v": 15}, + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, + {"src": "b", "dst": "a"}, # Cycle back + {"src": "a", "dst": "c"}, + {"src": "c", "dst": "d"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n(name="start"), + e_forward(min_hops=1, max_hops=2), + n(name="end"), + ] + where = [compare(col("start", "v"), "<", col("end", "v"))] + + _assert_parity(graph, chain, where) + + def test_oracle_cudf_parity_comprehensive(self): + """ + P0 Test 4: Oracle and cuDF executor must produce identical results. 
+ + Parametrized across multiple scenarios combining: + - Different hop ranges + - Different WHERE operators + - Different graph topologies + """ + scenarios = [ + # (nodes, edges, chain, where, description) + ( + # Linear with inequality WHERE + pd.DataFrame([ + {"id": "a", "v": 1}, {"id": "b", "v": 5}, + {"id": "c", "v": 3}, {"id": "d", "v": 9}, + ]), + pd.DataFrame([ + {"src": "a", "dst": "b"}, + {"src": "b", "dst": "c"}, + {"src": "c", "dst": "d"}, + ]), + # Note: Using explicit start filter - n(name="s") without filter + # doesn't work with current executor (hop labels don't distinguish paths) + [n({"id": "a"}, name="s"), e_forward(min_hops=2, max_hops=3), n(name="e")], + [compare(col("s", "v"), "<", col("e", "v"))], + "linear_inequality", + ), + ( + # Branch with equality WHERE + pd.DataFrame([ + {"id": "root", "owner": "u1"}, + {"id": "left", "owner": "u1"}, + {"id": "right", "owner": "u2"}, + {"id": "leaf1", "owner": "u1"}, + {"id": "leaf2", "owner": "u2"}, + ]), + pd.DataFrame([ + {"src": "root", "dst": "left"}, + {"src": "root", "dst": "right"}, + {"src": "left", "dst": "leaf1"}, + {"src": "right", "dst": "leaf2"}, + ]), + [n({"id": "root"}, name="a"), e_forward(min_hops=1, max_hops=2), n(name="c")], + [compare(col("a", "owner"), "==", col("c", "owner"))], + "branch_equality", + ), + ( + # Cycle with output slicing + pd.DataFrame([ + {"id": "n1", "v": 10}, + {"id": "n2", "v": 20}, + {"id": "n3", "v": 30}, + ]), + pd.DataFrame([ + {"src": "n1", "dst": "n2"}, + {"src": "n2", "dst": "n3"}, + {"src": "n3", "dst": "n1"}, + ]), + [ + n({"id": "n1"}, name="a"), + e_forward(min_hops=1, max_hops=3, output_min_hops=2, output_max_hops=3), + n(name="c"), + ], + [compare(col("a", "v"), "<", col("c", "v"))], + "cycle_output_slice", + ), + ( + # Reverse with hop labels + pd.DataFrame([ + {"id": "a", "score": 100}, + {"id": "b", "score": 50}, + {"id": "c", "score": 75}, + ]), + pd.DataFrame([ + {"src": "a", "dst": "b"}, + {"src": "b", "dst": "c"}, + ]), + [ + n({"id": "c"}, name="start"), + e_reverse(min_hops=1, max_hops=2, label_node_hops="hop"), + n(name="end"), + ], + [compare(col("start", "score"), ">", col("end", "score"))], + "reverse_labels", + ), + ] + + for nodes_df, edges_df, chain, where, desc in scenarios: + graph = CGFull().nodes(nodes_df, "id").edges(edges_df, "src", "dst") + inputs = build_same_path_inputs(graph, chain, where, Engine.PANDAS) + executor = DFSamePathExecutor(inputs) + executor._forward() + result = executor._run_gpu() + + oracle = enumerate_chain( + graph, chain, where=where, include_paths=False, + caps=OracleCaps(max_nodes=50, max_edges=50), + ) + + assert result._nodes is not None, f"{desc}: result nodes is None" + assert set(result._nodes["id"]) == set(oracle.nodes["id"]), \ + f"{desc}: node mismatch - executor={set(result._nodes['id'])}, oracle={set(oracle.nodes['id'])}" + + if result._edges is not None and not result._edges.empty: + assert set(result._edges["src"]) == set(oracle.edges["src"]), \ + f"{desc}: edge src mismatch" + assert set(result._edges["dst"]) == set(oracle.edges["dst"]), \ + f"{desc}: edge dst mismatch" + + +# ============================================================================ +# P1 TESTS: High Confidence - Important but not blocking +# ============================================================================ + + +class TestP1FeatureComposition: + """ + Important tests for edge cases in feature composition. 
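+
+    Covers multi-hop WHERE filtering, output slicing, seed labeling,
+    and multiple WHERE clauses over mixed hop ranges.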
+
+    These tests exercise known-risky combinations in the cuDF executor's
+    handling of multi-hop + WHERE; each asserts full parity against the oracle.
+    """
+
+    def test_multi_hop_edge_where_filtering(self):
+        """
+        P1 Test 5: WHERE must be applied even for multi-hop edges.
+
+        The cuDF executor has an `_is_single_hop()` check that may skip
+        WHERE filtering for multi-hop edges.
+
+        Graph: a(value=5) -> b(value=3) -> c(value=7) -> d(value=2)
+        Chain: n(a) -[min_hops=2, max_hops=3]-> n(end)
+        WHERE: a.value < end.value
+
+        Risk: WHERE skipped for multi-hop edges.
+        """
+        nodes = pd.DataFrame([
+            {"id": "a", "value": 5},
+            {"id": "b", "value": 3},
+            {"id": "c", "value": 7},
+            {"id": "d", "value": 2},  # a.value(5) < d.value(2) is FALSE
+        ])
+        edges = pd.DataFrame([
+            {"src": "a", "dst": "b"},
+            {"src": "b", "dst": "c"},
+            {"src": "c", "dst": "d"},
+        ])
+        graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst")
+
+        chain = [
+            n({"id": "a"}, name="start"),
+            e_forward(min_hops=2, max_hops=3),
+            n(name="end"),
+        ]
+        where = [compare(col("start", "value"), "<", col("end", "value"))]
+
+        _assert_parity(graph, chain, where)
+
+        result = execute_same_path_chain(graph, chain, where, Engine.PANDAS)
+        assert result._nodes is not None
+        result_ids = set(result._nodes["id"])
+        # c satisfies 5 < 7, d does NOT satisfy 5 < 2
+        assert "c" in result_ids, "c satisfies WHERE but excluded"
+        # d should be excluded (5 < 2 is false)
+        # But d might be included as intermediate - check oracle behavior
+        oracle = enumerate_chain(
+            graph, chain, where=where, include_paths=False,
+            caps=OracleCaps(max_nodes=50, max_edges=50),
+        )
+        assert set(result._nodes["id"]) == set(oracle.nodes["id"])
+
+    def test_output_slicing_with_where(self):
+        """
+        P1 Test 6: Output slicing must interact correctly with WHERE.
+
+        Graph: a(v=1) -> b(v=2) -> c(v=3) -> d(v=4)
+        Chain: n(a) -[max_hops=3, output_min=2, output_max=2]-> n(end)
+        WHERE: a.value < end.value
+
+        Output slice keeps only hop 2 (node c).
+        WHERE: a.value(1) < c.value(3) ✓
+
+        Risk: Slicing applied before/after WHERE could give different results.
+        """
+        nodes = pd.DataFrame([
+            {"id": "a", "value": 1},
+            {"id": "b", "value": 2},
+            {"id": "c", "value": 3},
+            {"id": "d", "value": 4},
+        ])
+        edges = pd.DataFrame([
+            {"src": "a", "dst": "b"},
+            {"src": "b", "dst": "c"},
+            {"src": "c", "dst": "d"},
+        ])
+        graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst")
+
+        chain = [
+            n({"id": "a"}, name="start"),
+            e_forward(min_hops=1, max_hops=3, output_min_hops=2, output_max_hops=2),
+            n(name="end"),
+        ]
+        where = [compare(col("start", "value"), "<", col("end", "value"))]
+
+        _assert_parity(graph, chain, where)
+
+    def test_label_seeds_with_output_min_hops(self):
+        """
+        P1 Test 7: label_seeds=True with output_min_hops > 0.
+
+        Seeds are at hop 0, but output_min_hops=2 excludes hop 0.
+        This is a potential conflict.
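One way to convince yourself the slicing test above is well-posed: on the straight-line graph it uses, applying WHERE before or after the output slice must agree. A standalone pandas toy of that check (illustrative only, independent of the executor under test):

```python
import pandas as pd

paths = pd.DataFrame([  # all forward walks from 'a': one row per (end, hops)
    {"end": "b", "hops": 1, "start_v": 1, "end_v": 2},
    {"end": "c", "hops": 2, "start_v": 1, "end_v": 3},
    {"end": "d", "hops": 3, "start_v": 1, "end_v": 4},
])
where_then_slice = paths[paths.start_v < paths.end_v].query("hops == 2")
slice_then_where = paths.query("hops == 2")
slice_then_where = slice_then_where[slice_then_where.start_v < slice_then_where.end_v]
assert set(where_then_slice["end"]) == set(slice_then_where["end"]) == {"c"}
```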
+ + Graph: seed -> b -> c -> d + Chain: n(seed) -[output_min=2, label_seeds=True]-> n(end) + """ + nodes = pd.DataFrame([ + {"id": "seed", "value": 1}, + {"id": "b", "value": 2}, + {"id": "c", "value": 3}, + {"id": "d", "value": 4}, + ]) + edges = pd.DataFrame([ + {"src": "seed", "dst": "b"}, + {"src": "b", "dst": "c"}, + {"src": "c", "dst": "d"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n({"id": "seed"}, name="start"), + e_forward( + min_hops=1, + max_hops=3, + output_min_hops=2, + output_max_hops=3, + label_node_hops="hop", + label_seeds=True, + ), + n(name="end"), + ] + where = [compare(col("start", "value"), "<", col("end", "value"))] + + _assert_parity(graph, chain, where) + + def test_multiple_where_mixed_hop_ranges(self): + """ + P1 Test 8: Multiple WHERE clauses with different hop ranges per edge. + + Chain: n(a) -[hops=1]-> n(b) -[min_hops=1, max_hops=2]-> n(c) + WHERE: a.v < b.v AND b.v < c.v + + Graph: + a1(v=1) -> b1(v=5) -> c1(v=10) + a1(v=1) -> b2(v=2) -> c2(v=3) -> c3(v=4) + + Both paths should satisfy the WHERE clauses. + """ + nodes = pd.DataFrame([ + {"id": "a1", "type": "A", "v": 1}, + {"id": "b1", "type": "B", "v": 5}, + {"id": "b2", "type": "B", "v": 2}, + {"id": "c1", "type": "C", "v": 10}, + {"id": "c2", "type": "C", "v": 3}, + {"id": "c3", "type": "C", "v": 4}, + ]) + edges = pd.DataFrame([ + {"src": "a1", "dst": "b1"}, + {"src": "a1", "dst": "b2"}, + {"src": "b1", "dst": "c1"}, + {"src": "b2", "dst": "c2"}, + {"src": "c2", "dst": "c3"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n({"type": "A"}, name="a"), + e_forward(name="e1"), + n({"type": "B"}, name="b"), + e_forward(min_hops=1, max_hops=2), # No alias - oracle doesn't support edge aliases for multi-hop + n({"type": "C"}, name="c"), + ] + where = [ + compare(col("a", "v"), "<", col("b", "v")), + compare(col("b", "v"), "<", col("c", "v")), + ] + + _assert_parity(graph, chain, where) + + +# ============================================================================ +# UNFILTERED START TESTS - Previously thought to be limitations, but work! +# ============================================================================ +# +# The public API (execute_same_path_chain) handles unfiltered starts correctly +# by falling back to oracle when the GPU path can't handle them. +# ============================================================================ + + +class TestUnfilteredStarts: + """ + Tests for unfiltered start nodes. + + These were previously marked as "known limitations" but the public API + handles them correctly via oracle fallback. + """ + + def test_unfiltered_start_node_multihop(self): + """ + Unfiltered start node with multi-hop works via public API. 
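The note above about the public API falling back to the oracle suggests a dispatch shape like the sketch below. All names here are invented for illustration and are not the library's API:

```python
def dispatch_same_path(graph, chain, where, run_fast, run_oracle, fast_supports):
    """Hypothetical router: use the pruning executor when the chain shape is
    supported (e.g., filtered starts), otherwise answer via the oracle."""
    if fast_supports(chain, where):
        return run_fast(graph, chain, where)
    return run_oracle(graph, chain, where)
```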
+ + Chain: n() -[min_hops=2, max_hops=3]-> n() + WHERE: start.v < end.v + """ + nodes = pd.DataFrame([ + {"id": "a", "v": 1}, + {"id": "b", "v": 5}, + {"id": "c", "v": 10}, + {"id": "d", "v": 15}, + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, + {"src": "b", "dst": "c"}, + {"src": "c", "dst": "d"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n(name="start"), # No filter - all nodes can be start + e_forward(min_hops=2, max_hops=3), + n(name="end"), + ] + where = [compare(col("start", "v"), "<", col("end", "v"))] + + # Use public API which handles this correctly + oracle = enumerate_chain( + graph, chain, where=where, include_paths=False, + caps=OracleCaps(max_nodes=50, max_edges=50), + ) + result = execute_same_path_chain(graph, chain, where, Engine.PANDAS) + assert set(result._nodes["id"]) == set(oracle.nodes["id"]) + + def test_unfiltered_start_single_hop(self): + """ + Unfiltered start node with single-hop. + """ + nodes = pd.DataFrame([ + {"id": "a", "v": 1}, + {"id": "b", "v": 5}, + {"id": "c", "v": 10}, + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, + {"src": "b", "dst": "c"}, + {"src": "c", "dst": "a"}, # Cycle + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n(name="start"), # No filter + e_forward(), + n(name="end"), + ] + where = [compare(col("start", "v"), "<", col("end", "v"))] + + oracle = enumerate_chain( + graph, chain, where=where, include_paths=False, + caps=OracleCaps(max_nodes=50, max_edges=50), + ) + result = execute_same_path_chain(graph, chain, where, Engine.PANDAS) + assert set(result._nodes["id"]) == set(oracle.nodes["id"]) + + def test_unfiltered_start_with_cycle(self): + """ + Unfiltered start with cycle in graph. + """ + nodes = pd.DataFrame([ + {"id": "a", "v": 1}, + {"id": "b", "v": 5}, + {"id": "c", "v": 10}, + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, + {"src": "b", "dst": "c"}, + {"src": "c", "dst": "a"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n(name="start"), + e_forward(min_hops=1, max_hops=3), + n(name="end"), + ] + where = [compare(col("start", "v"), "<", col("end", "v"))] + + oracle = enumerate_chain( + graph, chain, where=where, include_paths=False, + caps=OracleCaps(max_nodes=50, max_edges=50), + ) + result = execute_same_path_chain(graph, chain, where, Engine.PANDAS) + assert set(result._nodes["id"]) == set(oracle.nodes["id"]) + + +# ============================================================================ +# ORACLE LIMITATIONS - These are actual oracle limitations, not executor bugs +# ============================================================================ + + +class TestOracleLimitations: + """ + Tests for oracle limitations (not executor bugs). + + These test features the oracle doesn't support. + """ + + @pytest.mark.xfail( + reason="Oracle doesn't support edge aliases on multi-hop edges", + strict=True, + ) + def test_edge_alias_on_multihop(self): + """ + ORACLE LIMITATION: Edge alias on multi-hop edge. + + The oracle raises an error when an edge alias is used on a multi-hop edge. + This is documented in enumerator.py:109. 
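Why the limitation documented above is inherent rather than a bug: an alias names a single binding per path, but a multi-hop edge matches a *set* of physical edges per traversal, so there is no single value for the alias to join WHERE predicates against. A three-line illustration:

```python
# For the 2-hop walk a->b->c, alias "e" would have to bind BOTH edges at once.
path_edges = [("a", "b", "e1"), ("b", "c", "e2")]  # one traversal, two edges
bindings_for_alias_e = {edge_id for _, _, edge_id in path_edges}
assert bindings_for_alias_e == {"e1", "e2"}  # a set of edges, not one binding
```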
+ """ + nodes = pd.DataFrame([ + {"id": "a", "v": 1}, + {"id": "b", "v": 5}, + {"id": "c", "v": 10}, + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b", "weight": 1}, + {"src": "b", "dst": "c", "weight": 2}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n({"id": "a"}, name="start"), + e_forward(min_hops=1, max_hops=2, name="e"), # Edge alias on multi-hop + n(name="end"), + ] + where = [compare(col("start", "v"), "<", col("end", "v"))] + + # Oracle raises error for edge alias on multi-hop + _assert_parity(graph, chain, where) + + +# ============================================================================ +# P0 ADDITIONAL TESTS: Reverse + Multi-hop +# ============================================================================ + + +class TestP0ReverseMultihop: + """ + P0 Tests: Reverse direction with multi-hop edges. + + These test combinations that revealed bugs during session 3. + """ + + def test_reverse_multihop_basic(self): + """ + P0: Reverse multi-hop basic case. + + Chain: n(start) <-[min_hops=1, max_hops=2]- n(end) + WHERE: start.v < end.v + """ + nodes = pd.DataFrame([ + {"id": "a", "v": 1}, + {"id": "b", "v": 5}, + {"id": "c", "v": 10}, + ]) + # For reverse traversal: edges point "forward" but we traverse backward + edges = pd.DataFrame([ + {"src": "b", "dst": "a"}, # reverse: a <- b + {"src": "c", "dst": "b"}, # reverse: b <- c + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n({"id": "a"}, name="start"), + e_reverse(min_hops=1, max_hops=2), + n(name="end"), + ] + where = [compare(col("start", "v"), "<", col("end", "v"))] + + _assert_parity(graph, chain, where) + + result = execute_same_path_chain(graph, chain, where, Engine.PANDAS) + result_ids = set(result._nodes["id"]) + # start=a(v=1), end can be b(v=5) or c(v=10) + # Both satisfy 1 < 5 and 1 < 10 + assert "b" in result_ids, "b satisfies WHERE but excluded" + assert "c" in result_ids, "c satisfies WHERE but excluded" + + def test_reverse_multihop_filters_correctly(self): + """ + P0: Reverse multi-hop that actually filters some paths. + + Chain: n(start) <-[min_hops=1, max_hops=2]- n(end) + WHERE: start.v > end.v + """ + nodes = pd.DataFrame([ + {"id": "a", "v": 10}, # start has high value + {"id": "b", "v": 5}, # 10 > 5 valid + {"id": "c", "v": 15}, # 10 > 15 invalid + {"id": "d", "v": 1}, # 10 > 1 valid + ]) + edges = pd.DataFrame([ + {"src": "b", "dst": "a"}, # a <- b + {"src": "c", "dst": "b"}, # b <- c (so a <- b <- c) + {"src": "d", "dst": "b"}, # b <- d (so a <- b <- d) + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n({"id": "a"}, name="start"), + e_reverse(min_hops=1, max_hops=2), + n(name="end"), + ] + where = [compare(col("start", "v"), ">", col("end", "v"))] + + _assert_parity(graph, chain, where) + + result = execute_same_path_chain(graph, chain, where, Engine.PANDAS) + result_ids = set(result._nodes["id"]) + # c violates (10 > 15 is false), b and d satisfy + assert "c" not in result_ids, "c violates WHERE but included" + assert "b" in result_ids, "b satisfies WHERE but excluded" + assert "d" in result_ids, "d satisfies WHERE but excluded" + + def test_reverse_multihop_with_cycle(self): + """ + P0: Reverse multi-hop with cycle in graph. 
+ """ + nodes = pd.DataFrame([ + {"id": "a", "v": 1}, + {"id": "b", "v": 5}, + {"id": "c", "v": 10}, + ]) + edges = pd.DataFrame([ + {"src": "b", "dst": "a"}, # a <- b + {"src": "c", "dst": "b"}, # b <- c + {"src": "a", "dst": "c"}, # c <- a (creates cycle) + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n({"id": "a"}, name="start"), + e_reverse(min_hops=1, max_hops=3), + n(name="end"), + ] + where = [compare(col("start", "v"), "<", col("end", "v"))] + + _assert_parity(graph, chain, where) + + def test_reverse_multihop_undirected_comparison(self): + """ + P0: Compare reverse multi-hop with equivalent undirected. + """ + nodes = pd.DataFrame([ + {"id": "a", "v": 1}, + {"id": "b", "v": 5}, + {"id": "c", "v": 10}, + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, + {"src": "b", "dst": "c"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + # Reverse from c + chain_rev = [ + n({"id": "c"}, name="start"), + e_reverse(min_hops=1, max_hops=2), + n(name="end"), + ] + where = [compare(col("start", "v"), ">", col("end", "v"))] + + _assert_parity(graph, chain_rev, where) + + +# ============================================================================ +# P0 ADDITIONAL TESTS: Multiple Valid Starts +# ============================================================================ + + +class TestP0MultipleStarts: + """ + P0 Tests: Multiple valid start nodes (not all, not one). + + This tests the middle ground between single filtered start and all-as-starts. + """ + + def test_two_valid_starts(self): + """ + P0: Two nodes match start filter. + + Graph: + a1(v=1) -> b -> c(v=10) + a2(v=2) -> b -> c(v=10) + """ + nodes = pd.DataFrame([ + {"id": "a1", "type": "start", "v": 1}, + {"id": "a2", "type": "start", "v": 2}, + {"id": "b", "type": "mid", "v": 5}, + {"id": "c", "type": "end", "v": 10}, + ]) + edges = pd.DataFrame([ + {"src": "a1", "dst": "b"}, + {"src": "a2", "dst": "b"}, + {"src": "b", "dst": "c"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n({"type": "start"}, name="start"), + e_forward(min_hops=1, max_hops=2), + n(name="end"), + ] + where = [compare(col("start", "v"), "<", col("end", "v"))] + + _assert_parity(graph, chain, where) + + def test_multiple_starts_different_paths(self): + """ + P0: Multiple starts with different path outcomes. 
+ + start1 -> path1 (satisfies WHERE) + start2 -> path2 (violates WHERE) + """ + nodes = pd.DataFrame([ + {"id": "s1", "type": "start", "v": 1}, + {"id": "s2", "type": "start", "v": 100}, # High value + {"id": "m1", "type": "mid", "v": 5}, + {"id": "m2", "type": "mid", "v": 50}, + {"id": "e1", "type": "end", "v": 10}, # s1.v < e1.v (valid) + {"id": "e2", "type": "end", "v": 60}, # s2.v > e2.v (invalid for <) + ]) + edges = pd.DataFrame([ + {"src": "s1", "dst": "m1"}, + {"src": "m1", "dst": "e1"}, + {"src": "s2", "dst": "m2"}, + {"src": "m2", "dst": "e2"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n({"type": "start"}, name="start"), + e_forward(min_hops=1, max_hops=2), + n({"type": "end"}, name="end"), + ] + where = [compare(col("start", "v"), "<", col("end", "v"))] + + _assert_parity(graph, chain, where) + + result = execute_same_path_chain(graph, chain, where, Engine.PANDAS) + result_ids = set(result._nodes["id"]) + # s1->m1->e1 satisfies (1 < 10), s2->m2->e2 violates (100 < 60) + assert "s1" in result_ids, "s1 satisfies WHERE but excluded" + assert "e1" in result_ids, "e1 satisfies WHERE but excluded" + # s2/e2 should be excluded + assert "s2" not in result_ids, "s2 path violates WHERE but s2 included" + assert "e2" not in result_ids, "e2 path violates WHERE but e2 included" + + def test_multiple_starts_shared_intermediate(self): + """ + P0: Multiple starts sharing intermediate nodes. + + s1 -> shared -> end1 + s2 -> shared -> end2 + """ + nodes = pd.DataFrame([ + {"id": "s1", "type": "start", "v": 1}, + {"id": "s2", "type": "start", "v": 2}, + {"id": "shared", "type": "mid", "v": 5}, + {"id": "end1", "type": "end", "v": 10}, + {"id": "end2", "type": "end", "v": 0}, # s1.v > end2.v, s2.v > end2.v + ]) + edges = pd.DataFrame([ + {"src": "s1", "dst": "shared"}, + {"src": "s2", "dst": "shared"}, + {"src": "shared", "dst": "end1"}, + {"src": "shared", "dst": "end2"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n({"type": "start"}, name="start"), + e_forward(min_hops=1, max_hops=2), + n({"type": "end"}, name="end"), + ] + where = [compare(col("start", "v"), "<", col("end", "v"))] + + _assert_parity(graph, chain, where) + + +# ============================================================================ +# P1 TESTS: Operators × Single-hop Systematic +# ============================================================================ + + +class TestP1OperatorsSingleHop: + """ + P1 Tests: All comparison operators with single-hop edges. + + Systematic coverage of ==, !=, <, >, <=, >= for single-hop. 
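For reference, the single-hop operator matrix below reduces to an edge join plus one vectorized comparison per (start, end) pair; a standalone pandas sketch over the same fixture values (assumed mechanics, not the executor's implementation):

```python
import operator
import pandas as pd

# Stdlib operator dispatch mirrors the ==/!=/</>/<=/>= coverage below.
OPS = {"==": operator.eq, "!=": operator.ne, "<": operator.lt,
       ">": operator.gt, "<=": operator.le, ">=": operator.ge}

nodes = pd.DataFrame([{"id": "a", "v": 5}, {"id": "b", "v": 5},
                      {"id": "c", "v": 10}, {"id": "d", "v": 1}])
edges = pd.DataFrame([{"src": "a", "dst": "b"}, {"src": "a", "dst": "c"},
                      {"src": "a", "dst": "d"}, {"src": "c", "dst": "d"}])

# Attach start/end attributes to each edge, then mask with the operator.
pairs = (edges
         .merge(nodes.rename(columns={"id": "src", "v": "v_start"}), on="src")
         .merge(nodes.rename(columns={"id": "dst", "v": "v_end"}), on="dst"))
eq_pairs = pairs[OPS["=="](pairs.v_start, pairs.v_end)]
assert set(zip(eq_pairs.src, eq_pairs.dst)) == {("a", "b")}  # only 5 == 5
```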
+ """ + + @pytest.fixture + def basic_graph(self): + """Graph for operator tests.""" + nodes = pd.DataFrame([ + {"id": "a", "v": 5}, + {"id": "b", "v": 5}, # Same as a + {"id": "c", "v": 10}, # Greater than a + {"id": "d", "v": 1}, # Less than a + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, # a->b: 5 vs 5 + {"src": "a", "dst": "c"}, # a->c: 5 vs 10 + {"src": "a", "dst": "d"}, # a->d: 5 vs 1 + {"src": "c", "dst": "d"}, # c->d: 10 vs 1 + ]) + return CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + def test_single_hop_eq(self, basic_graph): + """P1: Single-hop with == operator.""" + chain = [n(name="start"), e_forward(), n(name="end")] + where = [compare(col("start", "v"), "==", col("end", "v"))] + _assert_parity(basic_graph, chain, where) + + result = execute_same_path_chain(basic_graph, chain, where, Engine.PANDAS) + # Only a->b satisfies 5 == 5 + assert "a" in set(result._nodes["id"]) + assert "b" in set(result._nodes["id"]) + + def test_single_hop_neq(self, basic_graph): + """P1: Single-hop with != operator.""" + chain = [n(name="start"), e_forward(), n(name="end")] + where = [compare(col("start", "v"), "!=", col("end", "v"))] + _assert_parity(basic_graph, chain, where) + + result = execute_same_path_chain(basic_graph, chain, where, Engine.PANDAS) + # a->c (5 != 10) and a->d (5 != 1) and c->d (10 != 1) satisfy + result_ids = set(result._nodes["id"]) + assert "c" in result_ids, "c participates in valid paths" + assert "d" in result_ids, "d participates in valid paths" + + def test_single_hop_lt(self, basic_graph): + """P1: Single-hop with < operator.""" + chain = [n(name="start"), e_forward(), n(name="end")] + where = [compare(col("start", "v"), "<", col("end", "v"))] + _assert_parity(basic_graph, chain, where) + + result = execute_same_path_chain(basic_graph, chain, where, Engine.PANDAS) + # a->c (5 < 10) satisfies + assert "c" in set(result._nodes["id"]) + + def test_single_hop_gt(self, basic_graph): + """P1: Single-hop with > operator.""" + chain = [n(name="start"), e_forward(), n(name="end")] + where = [compare(col("start", "v"), ">", col("end", "v"))] + _assert_parity(basic_graph, chain, where) + + result = execute_same_path_chain(basic_graph, chain, where, Engine.PANDAS) + # a->d (5 > 1) and c->d (10 > 1) satisfy + assert "d" in set(result._nodes["id"]) + + def test_single_hop_lte(self, basic_graph): + """P1: Single-hop with <= operator.""" + chain = [n(name="start"), e_forward(), n(name="end")] + where = [compare(col("start", "v"), "<=", col("end", "v"))] + _assert_parity(basic_graph, chain, where) + + result = execute_same_path_chain(basic_graph, chain, where, Engine.PANDAS) + # a->b (5 <= 5) and a->c (5 <= 10) satisfy + result_ids = set(result._nodes["id"]) + assert "b" in result_ids + assert "c" in result_ids + + def test_single_hop_gte(self, basic_graph): + """P1: Single-hop with >= operator.""" + chain = [n(name="start"), e_forward(), n(name="end")] + where = [compare(col("start", "v"), ">=", col("end", "v"))] + _assert_parity(basic_graph, chain, where) + + result = execute_same_path_chain(basic_graph, chain, where, Engine.PANDAS) + # a->b (5 >= 5) and a->d (5 >= 1) and c->d (10 >= 1) satisfy + result_ids = set(result._nodes["id"]) + assert "b" in result_ids + assert "d" in result_ids + + +# ============================================================================ +# P2 TESTS: Longer Paths (4+ nodes) +# ============================================================================ + + +class TestP2LongerPaths: + """ + P2 Tests: Paths with 4+ nodes. 
+ + Tests that WHERE clauses work correctly for longer chains. + """ + + def test_four_node_chain(self): + """ + P2: Chain of 4 nodes (3 edges). + + a -> b -> c -> d + WHERE: a.v < d.v + """ + nodes = pd.DataFrame([ + {"id": "a", "v": 1}, + {"id": "b", "v": 5}, + {"id": "c", "v": 3}, + {"id": "d", "v": 10}, + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, + {"src": "b", "dst": "c"}, + {"src": "c", "dst": "d"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n(name="a"), + e_forward(), + n(name="b"), + e_forward(), + n(name="c"), + e_forward(), + n(name="d"), + ] + where = [compare(col("a", "v"), "<", col("d", "v"))] + + _assert_parity(graph, chain, where) + + def test_five_node_chain_multiple_where(self): + """ + P2: Chain of 5 nodes with multiple WHERE clauses. + + a -> b -> c -> d -> e + WHERE: a.v < c.v AND c.v < e.v + """ + nodes = pd.DataFrame([ + {"id": "a", "v": 1}, + {"id": "b", "v": 3}, + {"id": "c", "v": 5}, + {"id": "d", "v": 7}, + {"id": "e", "v": 10}, + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, + {"src": "b", "dst": "c"}, + {"src": "c", "dst": "d"}, + {"src": "d", "dst": "e"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n(name="a"), + e_forward(), + n(name="b"), + e_forward(), + n(name="c"), + e_forward(), + n(name="d"), + e_forward(), + n(name="e"), + ] + where = [ + compare(col("a", "v"), "<", col("c", "v")), + compare(col("c", "v"), "<", col("e", "v")), + ] + + _assert_parity(graph, chain, where) + + def test_long_chain_with_multihop(self): + """ + P2: Long chain with multi-hop edges. + + a -[1..2]-> mid -[1..2]-> end + WHERE: a.v < end.v + """ + nodes = pd.DataFrame([ + {"id": "a", "v": 1}, + {"id": "b", "v": 3}, + {"id": "c", "v": 5}, + {"id": "d", "v": 7}, + {"id": "e", "v": 10}, + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, + {"src": "b", "dst": "c"}, + {"src": "c", "dst": "d"}, + {"src": "d", "dst": "e"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n({"id": "a"}, name="start"), + e_forward(min_hops=1, max_hops=2), + n(name="mid"), + e_forward(min_hops=1, max_hops=2), + n(name="end"), + ] + where = [compare(col("start", "v"), "<", col("end", "v"))] + + _assert_parity(graph, chain, where) + + def test_long_chain_filters_partial_path(self): + """ + P2: Long chain where only partial paths satisfy WHERE. 
+ + a -> b -> c -> d1 (satisfies) + a -> b -> c -> d2 (violates) + """ + nodes = pd.DataFrame([ + {"id": "a", "v": 1}, + {"id": "b", "v": 3}, + {"id": "c", "v": 5}, + {"id": "d1", "v": 10}, # a.v < d1.v + {"id": "d2", "v": 0}, # a.v < d2.v is false + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, + {"src": "b", "dst": "c"}, + {"src": "c", "dst": "d1"}, + {"src": "c", "dst": "d2"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n(name="a"), + e_forward(), + n(name="b"), + e_forward(), + n(name="c"), + e_forward(), + n(name="d"), + ] + where = [compare(col("a", "v"), "<", col("d", "v"))] + + _assert_parity(graph, chain, where) + + result = execute_same_path_chain(graph, chain, where, Engine.PANDAS) + result_ids = set(result._nodes["id"]) + assert "d1" in result_ids, "d1 satisfies WHERE but excluded" + assert "d2" not in result_ids, "d2 violates WHERE but included" + + +# ============================================================================ +# P1 TESTS: Operators × Multi-hop Systematic +# ============================================================================ + + +class TestP1OperatorsMultihop: + """ + P1 Tests: All comparison operators with multi-hop edges. + + Systematic coverage of ==, !=, <, >, <=, >= for multi-hop. + """ + + @pytest.fixture + def multihop_graph(self): + """Graph for multi-hop operator tests.""" + nodes = pd.DataFrame([ + {"id": "a", "v": 5}, + {"id": "b", "v": 3}, + {"id": "c", "v": 5}, # Same as a + {"id": "d", "v": 10}, # Greater than a + {"id": "e", "v": 1}, # Less than a + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, + {"src": "b", "dst": "c"}, # a-[2]->c: 5 vs 5 + {"src": "b", "dst": "d"}, # a-[2]->d: 5 vs 10 + {"src": "b", "dst": "e"}, # a-[2]->e: 5 vs 1 + ]) + return CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + def test_multihop_eq(self, multihop_graph): + """P1: Multi-hop with == operator.""" + chain = [ + n({"id": "a"}, name="start"), + e_forward(min_hops=1, max_hops=2), + n(name="end"), + ] + where = [compare(col("start", "v"), "==", col("end", "v"))] + _assert_parity(multihop_graph, chain, where) + + def test_multihop_neq(self, multihop_graph): + """P1: Multi-hop with != operator.""" + chain = [ + n({"id": "a"}, name="start"), + e_forward(min_hops=1, max_hops=2), + n(name="end"), + ] + where = [compare(col("start", "v"), "!=", col("end", "v"))] + _assert_parity(multihop_graph, chain, where) + + def test_multihop_lt(self, multihop_graph): + """P1: Multi-hop with < operator.""" + chain = [ + n({"id": "a"}, name="start"), + e_forward(min_hops=1, max_hops=2), + n(name="end"), + ] + where = [compare(col("start", "v"), "<", col("end", "v"))] + _assert_parity(multihop_graph, chain, where) + + def test_multihop_gt(self, multihop_graph): + """P1: Multi-hop with > operator.""" + chain = [ + n({"id": "a"}, name="start"), + e_forward(min_hops=1, max_hops=2), + n(name="end"), + ] + where = [compare(col("start", "v"), ">", col("end", "v"))] + _assert_parity(multihop_graph, chain, where) + + def test_multihop_lte(self, multihop_graph): + """P1: Multi-hop with <= operator.""" + chain = [ + n({"id": "a"}, name="start"), + e_forward(min_hops=1, max_hops=2), + n(name="end"), + ] + where = [compare(col("start", "v"), "<=", col("end", "v"))] + _assert_parity(multihop_graph, chain, where) + + def test_multihop_gte(self, multihop_graph): + """P1: Multi-hop with >= operator.""" + chain = [ + n({"id": "a"}, name="start"), + e_forward(min_hops=1, max_hops=2), + n(name="end"), + ] + where = 
[compare(col("start", "v"), ">=", col("end", "v"))] + _assert_parity(multihop_graph, chain, where) + + +# ============================================================================ +# P1 TESTS: Undirected + Multi-hop +# ============================================================================ + + +class TestP1UndirectedMultihop: + """ + P1 Tests: Undirected edges with multi-hop traversal. + """ + + def test_undirected_multihop_basic(self): + """P1: Undirected multi-hop basic case.""" + nodes = pd.DataFrame([ + {"id": "a", "v": 1}, + {"id": "b", "v": 5}, + {"id": "c", "v": 10}, + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, + {"src": "b", "dst": "c"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n({"id": "a"}, name="start"), + e_undirected(min_hops=1, max_hops=2), + n(name="end"), + ] + where = [compare(col("start", "v"), "<", col("end", "v"))] + + _assert_parity(graph, chain, where) + + def test_undirected_multihop_bidirectional(self): + """P1: Undirected multi-hop can traverse both directions.""" + nodes = pd.DataFrame([ + {"id": "a", "v": 1}, + {"id": "b", "v": 5}, + {"id": "c", "v": 10}, + ]) + # Only one direction in edges, but undirected should traverse both ways + edges = pd.DataFrame([ + {"src": "b", "dst": "a"}, + {"src": "c", "dst": "b"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n({"id": "a"}, name="start"), + e_undirected(min_hops=1, max_hops=2), + n(name="end"), + ] + where = [compare(col("start", "v"), "<", col("end", "v"))] + + _assert_parity(graph, chain, where) + + +# ============================================================================ +# P1 TESTS: Mixed Direction Chains +# ============================================================================ + + +class TestP1MixedDirectionChains: + """ + P1 Tests: Chains with mixed edge directions (forward, reverse, undirected). 
+ """ + + def test_forward_reverse_forward(self): + """P1: Forward-reverse-forward chain.""" + nodes = pd.DataFrame([ + {"id": "a", "v": 1}, + {"id": "b", "v": 5}, + {"id": "c", "v": 3}, + {"id": "d", "v": 10}, + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, # forward: a->b + {"src": "c", "dst": "b"}, # reverse from b: b<-c + {"src": "c", "dst": "d"}, # forward: c->d + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n({"id": "a"}, name="start"), + e_forward(), + n(name="mid1"), + e_reverse(), + n(name="mid2"), + e_forward(), + n(name="end"), + ] + where = [compare(col("start", "v"), "<", col("end", "v"))] + + _assert_parity(graph, chain, where) + + def test_reverse_forward_reverse(self): + """P1: Reverse-forward-reverse chain.""" + nodes = pd.DataFrame([ + {"id": "a", "v": 10}, + {"id": "b", "v": 5}, + {"id": "c", "v": 7}, + {"id": "d", "v": 1}, + ]) + edges = pd.DataFrame([ + {"src": "b", "dst": "a"}, # reverse from a: a<-b + {"src": "b", "dst": "c"}, # forward: b->c + {"src": "d", "dst": "c"}, # reverse from c: c<-d + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n({"id": "a"}, name="start"), + e_reverse(), + n(name="mid1"), + e_forward(), + n(name="mid2"), + e_reverse(), + n(name="end"), + ] + where = [compare(col("start", "v"), ">", col("end", "v"))] + + _assert_parity(graph, chain, where) + + def test_mixed_with_multihop(self): + """P1: Mixed directions with multi-hop edges.""" + nodes = pd.DataFrame([ + {"id": "a", "v": 1}, + {"id": "b", "v": 3}, + {"id": "c", "v": 5}, + {"id": "d", "v": 7}, + {"id": "e", "v": 10}, + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, + {"src": "b", "dst": "c"}, + {"src": "d", "dst": "c"}, # reverse: c<-d + {"src": "e", "dst": "d"}, # reverse: d<-e + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n({"id": "a"}, name="start"), + e_forward(min_hops=1, max_hops=2), + n(name="mid"), + e_reverse(min_hops=1, max_hops=2), + n(name="end"), + ] + where = [compare(col("start", "v"), "<", col("end", "v"))] + + _assert_parity(graph, chain, where) + + +# ============================================================================ +# P2 TESTS: Edge Cases and Boundary Conditions +# ============================================================================ + + +class TestP2EdgeCases: + """ + P2 Tests: Edge cases and boundary conditions. 
+ """ + + def test_single_node_graph(self): + """P2: Graph with single node and self-loop.""" + nodes = pd.DataFrame([{"id": "a", "v": 5}]) + edges = pd.DataFrame([{"src": "a", "dst": "a"}]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n(name="start"), + e_forward(), + n(name="end"), + ] + where = [compare(col("start", "v"), "==", col("end", "v"))] + + _assert_parity(graph, chain, where) + + def test_disconnected_components(self): + """P2: Graph with disconnected components.""" + nodes = pd.DataFrame([ + {"id": "a1", "v": 1}, + {"id": "a2", "v": 5}, + {"id": "b1", "v": 10}, + {"id": "b2", "v": 15}, + ]) + edges = pd.DataFrame([ + {"src": "a1", "dst": "a2"}, # Component 1 + {"src": "b1", "dst": "b2"}, # Component 2 + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n(name="start"), + e_forward(), + n(name="end"), + ] + where = [compare(col("start", "v"), "<", col("end", "v"))] + + _assert_parity(graph, chain, where) + + def test_dense_graph(self): + """P2: Dense graph with many edges.""" + nodes = pd.DataFrame([ + {"id": "a", "v": 1}, + {"id": "b", "v": 2}, + {"id": "c", "v": 3}, + {"id": "d", "v": 4}, + ]) + # Fully connected + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, + {"src": "a", "dst": "c"}, + {"src": "a", "dst": "d"}, + {"src": "b", "dst": "c"}, + {"src": "b", "dst": "d"}, + {"src": "c", "dst": "d"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n({"id": "a"}, name="start"), + e_forward(min_hops=1, max_hops=2), + n(name="end"), + ] + where = [compare(col("start", "v"), "<", col("end", "v"))] + + _assert_parity(graph, chain, where) + + def test_null_values_in_comparison(self): + """P2: Nodes with null values in comparison column.""" + nodes = pd.DataFrame([ + {"id": "a", "v": 1}, + {"id": "b", "v": None}, # Null value + {"id": "c", "v": 10}, + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, + {"src": "b", "dst": "c"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n({"id": "a"}, name="start"), + e_forward(min_hops=1, max_hops=2), + n(name="end"), + ] + where = [compare(col("start", "v"), "<", col("end", "v"))] + + _assert_parity(graph, chain, where) + + def test_string_comparison(self): + """P2: String values in comparison.""" + nodes = pd.DataFrame([ + {"id": "a", "name": "alice"}, + {"id": "b", "name": "bob"}, + {"id": "c", "name": "charlie"}, + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, + {"src": "b", "dst": "c"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n({"id": "a"}, name="start"), + e_forward(min_hops=1, max_hops=2), + n(name="end"), + ] + where = [compare(col("start", "name"), "<", col("end", "name"))] + + _assert_parity(graph, chain, where) + + def test_multiple_where_all_operators(self): + """P2: Multiple WHERE clauses with different operators.""" + nodes = pd.DataFrame([ + {"id": "a", "v": 1, "w": 10}, + {"id": "b", "v": 5, "w": 5}, + {"id": "c", "v": 10, "w": 1}, + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, + {"src": "b", "dst": "c"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n(name="a"), + e_forward(), + n(name="b"), + e_forward(), + n(name="c"), + ] + # a.v < c.v AND a.w > c.w + where = [ + compare(col("a", "v"), "<", col("c", "v")), + compare(col("a", "w"), ">", col("c", "w")), + ] + + _assert_parity(graph, chain, where) + + +# 
============================================================================ +# P3 TESTS: Bug Pattern Coverage (from 5 Whys analysis) +# ============================================================================ +# +# These tests target specific bug patterns discovered during debugging: +# 1. Multi-hop backward propagation edge cases +# 2. Merge suffix handling for same-named columns +# 3. Undirected edge handling in various contexts +# ============================================================================ + + +class TestBugPatternMultihopBackprop: + """ + Tests for multi-hop backward propagation edge cases. + + Bug pattern: Code that filters edges by endpoints breaks for multi-hop + because intermediate nodes aren't in left_allowed or right_allowed sets. + """ + + def test_three_consecutive_multihop_edges(self): + """Three consecutive multi-hop edges - stress test for backward prop.""" + nodes = pd.DataFrame([ + {"id": "a", "v": 1}, + {"id": "b", "v": 2}, + {"id": "c", "v": 3}, + {"id": "d", "v": 4}, + {"id": "e", "v": 5}, + {"id": "f", "v": 6}, + {"id": "g", "v": 7}, + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, + {"src": "b", "dst": "c"}, + {"src": "c", "dst": "d"}, + {"src": "d", "dst": "e"}, + {"src": "e", "dst": "f"}, + {"src": "f", "dst": "g"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n({"id": "a"}, name="start"), + e_forward(min_hops=1, max_hops=2), + n(name="mid1"), + e_forward(min_hops=1, max_hops=2), + n(name="mid2"), + e_forward(min_hops=1, max_hops=2), + n(name="end"), + ] + where = [compare(col("start", "v"), "<", col("end", "v"))] + + _assert_parity(graph, chain, where) + + def test_multihop_with_output_slicing_and_where(self): + """Multi-hop with output_min_hops/output_max_hops + WHERE.""" + nodes = pd.DataFrame([ + {"id": "a", "v": 1}, + {"id": "b", "v": 2}, + {"id": "c", "v": 3}, + {"id": "d", "v": 4}, + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, + {"src": "b", "dst": "c"}, + {"src": "c", "dst": "d"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n({"id": "a"}, name="start"), + e_forward(min_hops=1, max_hops=3, output_min_hops=2, output_max_hops=3), + n(name="end"), + ] + where = [compare(col("start", "v"), "<", col("end", "v"))] + + _assert_parity(graph, chain, where) + + def test_multihop_diamond_graph(self): + """Multi-hop through a diamond-shaped graph (multiple paths).""" + nodes = pd.DataFrame([ + {"id": "a", "v": 1}, + {"id": "b", "v": 2}, + {"id": "c", "v": 3}, + {"id": "d", "v": 4}, + ]) + # Diamond: a -> b -> d and a -> c -> d + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, + {"src": "a", "dst": "c"}, + {"src": "b", "dst": "d"}, + {"src": "c", "dst": "d"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n({"id": "a"}, name="start"), + e_forward(min_hops=1, max_hops=2), + n(name="end"), + ] + where = [compare(col("start", "v"), "<", col("end", "v"))] + + _assert_parity(graph, chain, where) + + +class TestBugPatternMergeSuffix: + """ + Tests for merge suffix handling with same-named columns. + + Bug pattern: When left_col == right_col, pandas merge creates + suffixed columns (e.g., 'v' and 'v__r') but code may compare + column to itself instead of to the suffixed version. 
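The pitfall this class names can be reproduced in a few lines of pandas. The suffix convention ('', '__r') follows the docstring above; the rest is illustrative. Comparing 'v' to itself degenerates (always False for `<`, always True for `==`), while 'v' vs 'v__r' is the intended cross-endpoint comparison:

```python
import pandas as pd

start = pd.DataFrame({"path": [0, 1], "v": [5, 5]})
end = pd.DataFrame({"path": [0, 1], "v": [10, 1]})
joined = start.merge(end, on="path", suffixes=("", "__r"))

buggy = joined["v"] < joined["v"]        # column vs itself: all False
correct = joined["v"] < joined["v__r"]   # 5 < 10 True, 5 < 1 False
assert list(buggy) == [False, False]
assert list(correct) == [True, False]
```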
+ """ + + def test_same_column_eq(self): + """Same column name with == operator.""" + nodes = pd.DataFrame([ + {"id": "a", "v": 5}, + {"id": "b", "v": 3}, + {"id": "c", "v": 5}, # Same as a + {"id": "d", "v": 7}, + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, + {"src": "b", "dst": "c"}, + {"src": "b", "dst": "d"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n({"id": "a"}, name="start"), + e_forward(min_hops=1, max_hops=2), + n(name="end"), + ] + # start.v == end.v: only c matches (v=5) + where = [compare(col("start", "v"), "==", col("end", "v"))] + + _assert_parity(graph, chain, where) + + def test_same_column_lt(self): + """Same column name with < operator.""" + nodes = pd.DataFrame([ + {"id": "a", "v": 5}, + {"id": "b", "v": 3}, + {"id": "c", "v": 10}, + {"id": "d", "v": 1}, + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, + {"src": "b", "dst": "c"}, + {"src": "b", "dst": "d"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n({"id": "a"}, name="start"), + e_forward(min_hops=1, max_hops=2), + n(name="end"), + ] + # start.v < end.v: c matches (5 < 10), d doesn't (5 < 1 is false) + where = [compare(col("start", "v"), "<", col("end", "v"))] + + _assert_parity(graph, chain, where) + + def test_same_column_lte(self): + """Same column name with <= operator.""" + nodes = pd.DataFrame([ + {"id": "a", "v": 5}, + {"id": "b", "v": 3}, + {"id": "c", "v": 5}, # Equal + {"id": "d", "v": 10}, # Greater + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, + {"src": "b", "dst": "c"}, + {"src": "b", "dst": "d"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n({"id": "a"}, name="start"), + e_forward(min_hops=1, max_hops=2), + n(name="end"), + ] + # start.v <= end.v: c (5<=5) and d (5<=10) match + where = [compare(col("start", "v"), "<=", col("end", "v"))] + + _assert_parity(graph, chain, where) + + def test_same_column_gt(self): + """Same column name with > operator.""" + nodes = pd.DataFrame([ + {"id": "a", "v": 5}, + {"id": "b", "v": 3}, + {"id": "c", "v": 1}, # Less than a + {"id": "d", "v": 10}, # Greater than a + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, + {"src": "b", "dst": "c"}, + {"src": "b", "dst": "d"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n({"id": "a"}, name="start"), + e_forward(min_hops=1, max_hops=2), + n(name="end"), + ] + # start.v > end.v: only c matches (5 > 1) + where = [compare(col("start", "v"), ">", col("end", "v"))] + + _assert_parity(graph, chain, where) + + def test_same_column_gte(self): + """Same column name with >= operator.""" + nodes = pd.DataFrame([ + {"id": "a", "v": 5}, + {"id": "b", "v": 3}, + {"id": "c", "v": 5}, # Equal + {"id": "d", "v": 1}, # Less + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, + {"src": "b", "dst": "c"}, + {"src": "b", "dst": "d"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n({"id": "a"}, name="start"), + e_forward(min_hops=1, max_hops=2), + n(name="end"), + ] + # start.v >= end.v: c (5>=5) and d (5>=1) match + where = [compare(col("start", "v"), ">=", col("end", "v"))] + + _assert_parity(graph, chain, where) + + +class TestBugPatternUndirected: + """ + Tests for undirected edge handling in various contexts. + + Bug pattern: Code checks `is_reverse = direction == "reverse"` but + doesn't handle `direction == "undirected"`, treating it as forward. + Undirected requires bidirectional adjacency. 
+ """ + + def test_undirected_non_adjacent_where(self): + """Undirected edges with non-adjacent WHERE clause.""" + nodes = pd.DataFrame([ + {"id": "a", "v": 1}, + {"id": "b", "v": 5}, + {"id": "c", "v": 10}, + ]) + # Edges only go one way, but undirected should work both ways + edges = pd.DataFrame([ + {"src": "b", "dst": "a"}, + {"src": "c", "dst": "b"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n({"id": "a"}, name="start"), + e_undirected(), + n(name="mid"), + e_undirected(), + n(name="end"), + ] + # Non-adjacent: start.v < end.v + where = [compare(col("start", "v"), "<", col("end", "v"))] + + _assert_parity(graph, chain, where) + + def test_undirected_multiple_where(self): + """Undirected edges with multiple WHERE clauses.""" + nodes = pd.DataFrame([ + {"id": "a", "v": 1, "w": 10}, + {"id": "b", "v": 5, "w": 5}, + {"id": "c", "v": 10, "w": 1}, + ]) + edges = pd.DataFrame([ + {"src": "b", "dst": "a"}, + {"src": "c", "dst": "b"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n({"id": "a"}, name="start"), + e_undirected(min_hops=1, max_hops=2), + n(name="end"), + ] + # Multiple WHERE: start.v < end.v AND start.w > end.w + where = [ + compare(col("start", "v"), "<", col("end", "v")), + compare(col("start", "w"), ">", col("end", "w")), + ] + + _assert_parity(graph, chain, where) + + def test_mixed_directed_undirected_chain(self): + """Chain with both directed and undirected edges.""" + nodes = pd.DataFrame([ + {"id": "a", "v": 1}, + {"id": "b", "v": 2}, + {"id": "c", "v": 3}, + {"id": "d", "v": 4}, + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, + {"src": "c", "dst": "b"}, # Goes "wrong" way, but undirected should handle + {"src": "c", "dst": "d"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n({"id": "a"}, name="start"), + e_forward(), + n(name="mid"), + e_undirected(), # Should be able to go b -> c even though edge is c -> b + n(name="end"), + ] + where = [compare(col("start", "v"), "<", col("end", "v"))] + + _assert_parity(graph, chain, where) + + def test_undirected_with_self_loop(self): + """Undirected edge with self-loop.""" + nodes = pd.DataFrame([ + {"id": "a", "v": 1}, + {"id": "b", "v": 2}, + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "a"}, # Self-loop + {"src": "a", "dst": "b"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n({"id": "a"}, name="start"), + e_undirected(min_hops=1, max_hops=2), + n(name="end"), + ] + where = [compare(col("start", "v"), "<", col("end", "v"))] + + _assert_parity(graph, chain, where) + + def test_undirected_reverse_undirected_chain(self): + """Chain: undirected -> reverse -> undirected.""" + nodes = pd.DataFrame([ + {"id": "a", "v": 1}, + {"id": "b", "v": 2}, + {"id": "c", "v": 3}, + {"id": "d", "v": 4}, + ]) + edges = pd.DataFrame([ + {"src": "b", "dst": "a"}, + {"src": "b", "dst": "c"}, + {"src": "d", "dst": "c"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n({"id": "a"}, name="start"), + e_undirected(), + n(name="mid1"), + e_reverse(), + n(name="mid2"), + e_undirected(), + n(name="end"), + ] + where = [compare(col("start", "v"), "<", col("end", "v"))] + + _assert_parity(graph, chain, where) + + +class TestImpossibleConstraints: + """Test cases with impossible/contradictory constraints that should return empty results.""" + + def test_contradictory_lt_gt_same_column(self): + """Impossible: a.v < b.v AND a.v > b.v (can't be 
both).""" + nodes = pd.DataFrame([ + {"id": "a", "v": 5}, + {"id": "b", "v": 10}, + {"id": "c", "v": 3}, + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, + {"src": "a", "dst": "c"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n({"id": "a"}, name="start"), + e_forward(), + n(name="end"), + ] + # start.v < end.v AND start.v > end.v - impossible! + where = [ + compare(col("start", "v"), "<", col("end", "v")), + compare(col("start", "v"), ">", col("end", "v")), + ] + + _assert_parity(graph, chain, where) + + def test_contradictory_eq_neq_same_column(self): + """Impossible: a.v == b.v AND a.v != b.v (can't be both).""" + nodes = pd.DataFrame([ + {"id": "a", "v": 5}, + {"id": "b", "v": 5}, + {"id": "c", "v": 10}, + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, + {"src": "a", "dst": "c"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n({"id": "a"}, name="start"), + e_forward(), + n(name="end"), + ] + # start.v == end.v AND start.v != end.v - impossible! + where = [ + compare(col("start", "v"), "==", col("end", "v")), + compare(col("start", "v"), "!=", col("end", "v")), + ] + + _assert_parity(graph, chain, where) + + def test_contradictory_lte_gt_same_column(self): + """Impossible: a.v <= b.v AND a.v > b.v (can't be both).""" + nodes = pd.DataFrame([ + {"id": "a", "v": 5}, + {"id": "b", "v": 10}, + {"id": "c", "v": 3}, + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, + {"src": "a", "dst": "c"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n({"id": "a"}, name="start"), + e_forward(), + n(name="end"), + ] + # start.v <= end.v AND start.v > end.v - impossible! + where = [ + compare(col("start", "v"), "<=", col("end", "v")), + compare(col("start", "v"), ">", col("end", "v")), + ] + + _assert_parity(graph, chain, where) + + def test_no_paths_satisfy_predicate(self): + """All edges exist but no path satisfies the predicate.""" + nodes = pd.DataFrame([ + {"id": "a", "v": 100}, # Highest value + {"id": "b", "v": 50}, + {"id": "c", "v": 10}, # Lowest value + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, + {"src": "b", "dst": "c"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n({"id": "a"}, name="start"), + e_forward(), + n(name="mid"), + e_forward(), + n({"id": "c"}, name="end"), + ] + # start.v < mid.v - but a.v=100 > b.v=50, so no valid path + where = [compare(col("start", "v"), "<", col("mid", "v"))] + + _assert_parity(graph, chain, where) + + def test_multihop_no_valid_endpoints(self): + """Multi-hop where no endpoints satisfy the predicate.""" + nodes = pd.DataFrame([ + {"id": "a", "v": 100}, + {"id": "b", "v": 50}, + {"id": "c", "v": 25}, + {"id": "d", "v": 10}, + ]) + edges = pd.DataFrame([ + {"src": "a", "dst": "b"}, + {"src": "b", "dst": "c"}, + {"src": "c", "dst": "d"}, + ]) + graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst") + + chain = [ + n({"id": "a"}, name="start"), + e_forward(min_hops=1, max_hops=3), + n(name="end"), + ] + # start.v < end.v - but a.v=100 is the highest, so impossible + where = [compare(col("start", "v"), "<", col("end", "v"))] + + _assert_parity(graph, chain, where) + + def test_contradictory_on_different_columns(self): + """Multiple predicates on different columns that are contradictory.""" + nodes = pd.DataFrame([ + {"id": "a", "v": 5, "w": 10}, + {"id": "b", "v": 10, "w": 5}, # v is higher, w is lower + {"id": "c", "v": 3, "w": 20}, # v is lower, w is higher 
+        ])
+        edges = pd.DataFrame([
+            {"src": "a", "dst": "b"},
+            {"src": "a", "dst": "c"},
+        ])
+        graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst")
+
+        chain = [
+            n({"id": "a"}, name="start"),
+            e_forward(),
+            n(name="end"),
+        ]
+        # For b: a.v < b.v (5 < 10) TRUE, but a.w < b.w (10 < 5) FALSE
+        # For c: a.v < c.v (5 < 3) FALSE, but a.w < c.w (10 < 20) TRUE
+        # No destination satisfies both
+        where = [
+            compare(col("start", "v"), "<", col("end", "v")),
+            compare(col("start", "w"), "<", col("end", "w")),
+        ]
+
+        _assert_parity(graph, chain, where)
+
+    def test_chain_with_impossible_intermediate(self):
+        """Chain where intermediate step makes path impossible."""
+        nodes = pd.DataFrame([
+            {"id": "a", "v": 1},
+            {"id": "b", "v": 100},  # Makes mid.v < end.v impossible
+            {"id": "c", "v": 50},
+        ])
+        edges = pd.DataFrame([
+            {"src": "a", "dst": "b"},
+            {"src": "b", "dst": "c"},
+        ])
+        graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst")
+
+        chain = [
+            n({"id": "a"}, name="start"),
+            e_forward(),
+            n(name="mid"),
+            e_forward(),
+            n({"id": "c"}, name="end"),
+        ]
+        # mid.v < end.v - but b.v=100 > c.v=50
+        where = [compare(col("mid", "v"), "<", col("end", "v"))]
+
+        _assert_parity(graph, chain, where)
+
+    def test_non_adjacent_impossible_constraint(self):
+        """Non-adjacent WHERE clause that's impossible to satisfy."""
+        nodes = pd.DataFrame([
+            {"id": "a", "v": 100},  # Highest
+            {"id": "b", "v": 50},
+            {"id": "c", "v": 10},  # Lowest
+        ])
+        edges = pd.DataFrame([
+            {"src": "a", "dst": "b"},
+            {"src": "b", "dst": "c"},
+        ])
+        graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst")
+
+        chain = [
+            n({"id": "a"}, name="start"),
+            e_forward(),
+            n(name="mid"),
+            e_forward(),
+            n({"id": "c"}, name="end"),
+        ]
+        # start.v < end.v - but a.v=100 > c.v=10
+        where = [compare(col("start", "v"), "<", col("end", "v"))]
+
+        _assert_parity(graph, chain, where)
+
+    def test_empty_graph_with_constraints(self):
+        """Empty graph should return empty even with valid-looking constraints."""
+        nodes = pd.DataFrame({"id": [], "v": []})
+        edges = pd.DataFrame({"src": [], "dst": []})
+        graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst")
+
+        chain = [
+            n(name="start"),
+            e_forward(),
+            n(name="end"),
+        ]
+        where = [compare(col("start", "v"), "<", col("end", "v"))]
+
+        _assert_parity(graph, chain, where)
+
+    def test_no_edges_with_constraints(self):
+        """Nodes exist but no edges - should return empty."""
+        nodes = pd.DataFrame([
+            {"id": "a", "v": 1},
+            {"id": "b", "v": 10},
+        ])
+        edges = pd.DataFrame({"src": [], "dst": []})
+        graph = CGFull().nodes(nodes, "id").edges(edges, "src", "dst")
+
+        chain = [
+            n(name="start"),
+            e_forward(),
+            n(name="end"),
+        ]
+        where = [compare(col("start", "v"), "<", col("end", "v"))]
+
+        _assert_parity(graph, chain, where)
diff --git a/tests/gfql/ref/test_enumerator_parity.py b/tests/gfql/ref/test_enumerator_parity.py
index 59d76ee75b..1e19e095f0 100644
--- a/tests/gfql/ref/test_enumerator_parity.py
+++ b/tests/gfql/ref/test_enumerator_parity.py
@@ -44,9 +44,13 @@ def _run_parity_case(nodes, edges, ops, check_hop_labels=False):
         if not alias:
             continue
         if isinstance(op, ASTNode):
-            assert oracle.tags.get(alias, set()) == _alias_bindings(gfql_nodes, g._node, alias)
+            assert oracle.tags.get(alias, set()) == _alias_bindings(
+                gfql_nodes, g._node, alias
+            )
         elif isinstance(op, ASTEdge):
-            assert oracle.tags.get(alias, set()) == _alias_bindings(gfql_edges, g._edge,
alias + ) # Check hop labels if requested if check_hop_labels: @@ -100,7 +104,8 @@ def _run_parity_case(nodes, edges, ops, check_hop_labels=False): {"edge_id": "e2", "src": "acct2", "dst": "acct3", "type": "txn"}, {"edge_id": "e3", "src": "acct3", "dst": "acct1", "type": "txn"}, ], - [n({"type": "account"}, name="start"), e_forward({"type": "txn"}, name="hop"), n({"type": "account"}, name="end")], + [n({"type": "account"}, name="start"), e_forward({"type": "txn"}, name="hop"), +n({"type": "account"}, name="end")], ), ( "reverse", @@ -113,7 +118,8 @@ def _run_parity_case(nodes, edges, ops, check_hop_labels=False): {"edge_id": "owns1", "src": "acct1", "dst": "user1", "type": "owns"}, {"edge_id": "owns2", "src": "acct2", "dst": "user1", "type": "owns"}, ], - [n({"type": "user"}, name="u"), e_reverse({"type": "owns"}, name="owns_rev"), n({"type": "account"}, name="acct")], + [n({"type": "user"}, name="u"), e_reverse({"type": "owns"}, name="owns_rev"), +n({"type": "account"}, name="acct")], ), ( "two_hop", @@ -147,7 +153,11 @@ def _run_parity_case(nodes, edges, ops, check_hop_labels=False): {"edge_id": "e12", "src": "n1", "dst": "n2", "type": "path"}, {"edge_id": "e23", "src": "n2", "dst": "n3", "type": "path"}, ], - [n({"type": "node"}, name="start"), e_undirected({"type": "path"}, name="hop"), n({"type": "node"}, name="end")], + [ + n({"type": "node"}, name="start"), + e_undirected({"type": "path"}, name="hop"), + n({"type": "node"}, name="end"), + ], ), ( "empty", @@ -156,7 +166,8 @@ def _run_parity_case(nodes, edges, ops, check_hop_labels=False): {"id": "acct2", "type": "account"}, ], [{"edge_id": "e1", "src": "acct1", "dst": "acct2", "type": "txn"}], - [n({"type": "user"}, name="start"), e_forward({"type": "txn"}, name="hop"), n({"type": "user"}, name="end")], + [n({"type": "user"}, name="start"), e_forward({"type": "txn"}, name="hop"), +n({"type": "user"}, name="end")], ), ( "cycle", @@ -189,7 +200,8 @@ def _run_parity_case(nodes, edges, ops, check_hop_labels=False): {"edge_id": "e2", "src": "acct1", "dst": "acct3", "type": "txn"}, {"edge_id": "e3", "src": "acct3", "dst": "acct4", "type": "txn"}, ], - [n({"type": "account"}, name="root"), e_forward({"type": "txn"}, name="first_hop"), n({"type": "account"}, name="child")], + [n({"type": "account"}, name="root"), e_forward({"type": "txn"}, +name="first_hop"), n({"type": "account"}, name="child")], ), ( "forward_labels", diff --git a/tests/gfql/ref/test_ref_enumerator.py b/tests/gfql/ref/test_ref_enumerator.py index 3dc23d0f25..37d2a3129c 100644 --- a/tests/gfql/ref/test_ref_enumerator.py +++ b/tests/gfql/ref/test_ref_enumerator.py @@ -5,7 +5,8 @@ from types import SimpleNamespace from graphistry.compute import n, e_forward, e_undirected -from graphistry.gfql.ref.enumerator import OracleCaps, col, compare, enumerate_chain +from graphistry.gfql.ref.enumerator import OracleCaps, enumerate_chain +from graphistry.gfql.same_path_types import col, compare def _plottable(nodes, edges): @@ -35,7 +36,8 @@ def _col_set(df: pd.DataFrame, column: str) -> Set[str]: {"edge_id": "e1", "src": "acct1", "dst": "acct2", "type": "txn"}, {"edge_id": "e2", "src": "acct2", "dst": "user1", "type": "owns"}, ], - "ops": [n({"type": "account"}, name="a"), e_forward({"type": "txn"}), n(name="b")], + "ops": [n({"type": "account"}, name="a"), e_forward({"type": "txn"}), + n(name="b")], "expect": {"nodes": {"acct1", "acct2"}, "edges": {"e1"}}, }, { @@ -48,8 +50,10 @@ def _col_set(df: pd.DataFrame, column: str) -> Set[str]: ], "edges": [ {"edge_id": "e_good", "src": 
"acct_good", "dst": "user1", "type": "owns"}, - {"edge_id": "e_bad_match", "src": "acct_bad", "dst": "user2", "type": "owns"}, - {"edge_id": "e_bad_wrong", "src": "acct_bad", "dst": "user1", "type": "owns"}, + {"edge_id": "e_bad_match", "src": "acct_bad", "dst": "user2", "type": + "owns"}, + {"edge_id": "e_bad_wrong", "src": "acct_bad", "dst": "user1", "type": + "owns"}, ], "ops": [ n({"type": "account"}, name="a"), @@ -61,7 +65,8 @@ def _col_set(df: pd.DataFrame, column: str) -> Set[str]: "expect": { "nodes": {"acct_good", "acct_bad", "user1", "user2"}, "edges": {"e_good", "e_bad_match"}, - "tags": {"a": {"acct_good", "acct_bad"}, "r": {"e_good", "e_bad_match"}, "c": {"user1", "user2"}}, + "tags": {"a": {"acct_good", "acct_bad"}, "r": {"e_good", "e_bad_match"}, + "c": {"user1", "user2"}}, "paths": [ {"a": "acct_good", "c": "user1", "r": "e_good"}, {"a": "acct_bad", "c": "user2", "r": "e_bad_match"}, @@ -152,8 +157,10 @@ def __init__(self, df): def to_pandas(self): return self._df.copy() - g = _plottable(Dummy(pd.DataFrame([{"id": "n1"}])), Dummy(pd.DataFrame([{"edge_id": "e1", "src": "n1", "dst": "n1"}]))) - result = enumerate_chain(g, [n(name="a")], caps=OracleCaps(max_nodes=20, max_edges=20)) + g = _plottable(Dummy(pd.DataFrame([{"id": "n1"}])), Dummy(pd.DataFrame([{"edge_id": + "e1", "src": "n1", "dst": "n1"}]))) + result = enumerate_chain(g, [n(name="a")], caps=OracleCaps(max_nodes=20, + max_edges=20)) assert _col_set(result.nodes, "id") == {"n1"} @@ -241,9 +248,11 @@ def test_enumerator_min_max_three_branch_unlabeled(): @st.composite def small_graph_cases(draw): - nodes = draw(st.lists(st.sampled_from(NODE_POOL), min_size=2, max_size=4, unique=True)) + nodes = draw(st.lists(st.sampled_from(NODE_POOL), min_size=2, max_size=4, + unique=True)) node_rows = [{"id": node, "value": draw(st.integers(0, 3))} for node in nodes] - edges = draw(st.lists(st.tuples(st.sampled_from(nodes), st.sampled_from(nodes)), min_size=1, max_size=5)) + edges = draw(st.lists(st.tuples(st.sampled_from(nodes), st.sampled_from(nodes)), + min_size=1, max_size=5)) edge_rows = [ {"edge_id": EDGE_POOL[i % len(EDGE_POOL)], "src": src, "dst": dst} for i, (src, dst) in enumerate(edges) @@ -273,7 +282,8 @@ def test_enumerator_paths_cover_outputs(case): [n(name="a"), e_forward(name="rel"), n(name="c")], where=case["where"], include_paths=True, - caps=OracleCaps(max_nodes=10, max_edges=10, max_length=4, max_partial_rows=10_000), + caps=OracleCaps(max_nodes=10, max_edges=10, max_length=4, + max_partial_rows=10_000), ) path_nodes = { diff --git a/tests/gfql/ref/test_same_path_plan.py b/tests/gfql/ref/test_same_path_plan.py new file mode 100644 index 0000000000..120ce656da --- /dev/null +++ b/tests/gfql/ref/test_same_path_plan.py @@ -0,0 +1,18 @@ +from graphistry.gfql.same_path_plan import plan_same_path +from graphistry.gfql.same_path_types import col, compare + + +def test_plan_minmax_and_bitset(): + where = [ + compare(col("a", "balance"), ">", col("c", "credit")), + compare(col("a", "owner"), "==", col("c", "owner")), + ] + plan = plan_same_path(where) + assert plan.minmax_aliases == {"a": {"balance"}, "c": {"credit"}} + assert any("owner" in key for key in plan.bitsets) + + +def test_plan_empty_when_no_where(): + plan = plan_same_path(None) + assert plan.minmax_aliases == {} + assert plan.bitsets == {}