Skip to content

Commit 1aece52

Browse files
committed
fix(hop): undirected single-pass frontier
Add hop fast-path toggle, benchmark scripts, and ref exports.
1 parent 418708a commit 1aece52

File tree

7 files changed

+362
-6
lines changed

7 files changed

+362
-6
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
1111
### Added
1212
- **GFQL / WHERE** (experimental): Added `Chain.where` field for same-path WHERE clause constraints. New modules: `same_path_types.py`, `same_path_plan.py`, `df_executor.py` implementing Yannakakis-style semijoin reduction for efficient WHERE filtering. Supports equality, inequality, and comparison operators on named alias columns.
1313
- **GFQL / cuDF same-path**: Added execution-mode gate `GRAPHISTRY_CUDF_SAME_PATH_MODE` (auto/oracle/strict) for GFQL cuDF same-path executor. Auto falls back to oracle when GPU unavailable; strict requires cuDF or raises.
14+
- **Compute / hop**: Added the `GRAPHISTRY_HOP_FAST_PATH` toggle; set it to `0`/`false`/`off` to disable fast-path traversal for benchmarking or compatibility checks.
1415

1516
### Performance
1617
- **Compute / hop**: Refactored hop traversal to precompute node predicate domains and unify direction handling; synthetic CPU benchmarks show modest median improvements with some regressions on undirected/range scenarios.
@@ -26,6 +27,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
2627

2728
### Infra
2829
- **GFQL / same_path**: Modular architecture for WHERE execution: `same_path_types.py` (types), `same_path_plan.py` (planning), `df_executor.py` (execution), plus `same_path/` submodules for BFS, edge semantics, multihop, post-pruning, and WHERE filtering.
30+
- **Benchmarks**: Added manual hop microbench + frontier sweep scripts under `benchmarks/` (not wired into CI).
2931

3032
### Tests
3133
- **GFQL / df_executor**: Added comprehensive test suite (core, amplify, patterns, dimension) with 200+ tests covering Yannakakis semijoin, WHERE clause filtering, multi-hop paths, and pandas/cuDF parity.

benchmarks/README.md

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Benchmarks
2+
3+
Manual-only scripts for local performance checks. Not wired into CI.
4+
5+
## Hop microbench
6+
7+
Run a small set of hop() scenarios across synthetic graphs.
8+
9+
```bash
10+
uv run python benchmarks/run_hop_microbench.py --runs 5 --output /tmp/hop-microbench.md
11+
```
12+
13+
## Frontier sweep
14+
15+
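Sweep seed sizes on a fixed linear (chain) graph. A chain over `--nodes` nodes has at most `nodes - 1` edges, so a larger `--edges` value is capped.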
16+
17+
```bash
18+
uv run python benchmarks/run_hop_frontier_sweep.py --runs 5 --nodes 100000 --edges 200000 --output /tmp/hop-frontier.md
19+
```
20+
21+
Notes:
22+
- Use `--engine cudf` for GPU runs when cuDF is available.
23+
- Scripts always print a Markdown table to stdout; `--output PATH` additionally writes the results to a Markdown file.
benchmarks/run_hop_frontier_sweep.py

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Frontier-size sweep for hop() on a fixed graph.
4+
"""
5+
6+
from __future__ import annotations
7+
8+
import argparse
9+
import time
10+
from dataclasses import dataclass
11+
from typing import Iterable, List, Optional, Tuple
12+
13+
import pandas as pd
14+
15+
import graphistry
16+
from graphistry.Engine import Engine
17+
18+
19+
@dataclass
class ResultRow:
    """One timing measurement produced by the frontier sweep."""

    # Label of the benchmarked graph; created empty and filled in by main().
    graph: str
    # Number of seed nodes requested for the timed hop() call.
    seed_size: int
    # Mean wall-clock time per call, in milliseconds.
    ms: Optional[float]
24+
25+
26+
def make_linear_graph(n_nodes: int, n_edges: int) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Build a chain graph ``0 -> 1 -> 2 -> ...`` as pandas node/edge frames.

    Args:
        n_nodes: Number of nodes; node ids are ``0 .. n_nodes - 1``.
        n_edges: Requested number of edges. A chain over ``n_nodes`` nodes has
            at most ``n_nodes - 1`` edges, so larger values are capped.

    Returns:
        ``(nodes, edges)`` where ``nodes`` has an ``id`` column and ``edges``
        has ``src``, ``dst`` and ``eid`` columns. Edge ``i`` links node ``i``
        to node ``i + 1``.
    """
    # Clamp at zero so degenerate inputs (n_nodes <= 1, n_edges <= 0) yield an
    # empty edge frame that still carries the src/dst/eid columns.
    edge_count = max(0, min(n_edges, n_nodes - 1))
    nodes = pd.DataFrame({"id": range(n_nodes)})
    # Column-wise construction avoids building one dict per edge.
    edges = pd.DataFrame(
        {
            "src": range(edge_count),
            "dst": range(1, edge_count + 1),
            "eid": range(edge_count),
        }
    )
    return nodes, edges
33+
34+
35+
def build_graph(n_nodes: int, n_edges: int, engine: Engine):
    """Create the linear benchmark graph and bind it with graphistry.

    The frames come from ``make_linear_graph``. When ``engine`` is
    ``Engine.CUDF`` both frames are converted with ``cudf.from_pandas`` before
    binding; cudf is imported lazily so pandas-only runs do not need it.

    Returns:
        The object produced by
        ``graphistry.nodes(nodes, "id").edges(edges, "src", "dst")``.
    """
    frames = make_linear_graph(n_nodes, n_edges)
    if engine == Engine.CUDF:
        import cudf  # type: ignore

        frames = tuple(cudf.from_pandas(frame) for frame in frames)
    node_frame, edge_frame = frames
    return graphistry.nodes(node_frame, "id").edges(edge_frame, "src", "dst")
43+
44+
45+
def _time_call(fn, runs: int) -> float:
46+
times = []
47+
for _ in range(runs):
48+
start = time.perf_counter()
49+
fn()
50+
times.append((time.perf_counter() - start) * 1000)
51+
return sum(times) / len(times)
52+
53+
54+
def run_sweep(g, seed_sizes: List[int], runs: int) -> Iterable[ResultRow]:
    """Time a forward 2-hop traversal for each requested seed size.

    For every entry in ``seed_sizes`` the seeds are ``g._nodes.head(seed_size)``
    and the timed call is ``g.hop`` with ``hops=2``, ``to_fixed_point=False``,
    ``direction="forward"`` and ``return_as_wave_front=True``.

    Yields:
        One ``ResultRow`` per seed size with an empty ``graph`` label and the
        mean time reported by ``_time_call``.
    """
    hop_options = dict(
        hops=2,
        to_fixed_point=False,
        direction="forward",
        return_as_wave_front=True,
    )
    for seed_size in seed_sizes:
        seeds = g._nodes.head(seed_size)
        # The lambda is invoked only inside this iteration, so it always sees
        # the seeds selected just above.
        elapsed = _time_call(lambda: g.hop(nodes=seeds, **hop_options), runs)
        yield ResultRow(graph="", seed_size=seed_size, ms=elapsed)
69+
70+
71+
def write_markdown(results: Iterable[ResultRow], output_path: str) -> None:
    """Write the sweep results to ``output_path`` as a Markdown report.

    Args:
        results: Rows to render; each needs ``graph``, ``seed_size`` and ``ms``.
        output_path: Destination file, created or overwritten as UTF-8.

    A row whose ``ms`` is ``None`` is rendered as ``n/a``.
    """
    lines = [
        "# Hop Frontier Sweep",
        "",
        "Notes:",
        "- Fixed linear graph, forward 2-hop, return_as_wave_front=True.",
        "",
        "| Graph | Seed Size | Time |",
        "|-------|-----------|------|",
    ]
    for row in results:
        # ms is declared Optional; formatting None with ".2f" would raise TypeError.
        time_cell = "n/a" if row.ms is None else f"{row.ms:.2f}ms"
        lines.append(f"| {row.graph} | {row.seed_size} | {time_cell} |")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines) + "\n")
86+
87+
88+
def main() -> None:
    """Parse CLI options, run the frontier sweep, and report the timings.

    A Markdown table is always printed to stdout; when ``--output`` is given
    the results are additionally written to that path via ``write_markdown``.
    """
    parser = argparse.ArgumentParser(description="Hop frontier sweep.")
    parser.add_argument("--engine", default="pandas", choices=["pandas", "cudf"])
    parser.add_argument("--runs", type=int, default=3)
    parser.add_argument("--nodes", type=int, default=100000)
    parser.add_argument("--edges", type=int, default=200000)
    parser.add_argument("--output", default="")
    parser.add_argument(
        "--seed-sizes",
        default="1,10,100,1000,10000",
        help="Comma-separated list of seed sizes",
    )
    args = parser.parse_args()
    if args.runs < 1:
        # Timings are averaged over the runs, so zero runs cannot be reported.
        parser.error("--runs must be >= 1")

    engine = Engine.CUDF if args.engine == "cudf" else Engine.PANDAS
    # Empty items (e.g. from a trailing comma) are skipped.
    seed_sizes = [int(x) for x in args.seed_sizes.split(",") if x.strip()]

    g = build_graph(args.nodes, args.edges, engine)
    results = list(run_sweep(g, seed_sizes, args.runs))
    # run_sweep leaves the graph label empty; fill it in here.
    for row in results:
        row.graph = f"linear_{args.nodes}"

    if args.output:
        write_markdown(results, args.output)

    print("| Graph | Seed Size | Time |")
    print("|-------|-----------|------|")
    for row in results:
        # ms is declared Optional; avoid a TypeError when formatting None.
        time_cell = "n/a" if row.ms is None else f"{row.ms:.2f}ms"
        print(f"| {row.graph} | {row.seed_size} | {time_cell} |")
117+
118+
119+
# Run the sweep only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

benchmarks/run_hop_microbench.py

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Direct hop() microbenchmarks for common traversal shapes.
4+
"""
5+
6+
from __future__ import annotations
7+
8+
import argparse
9+
import time
10+
from dataclasses import dataclass
11+
from typing import Iterable, List, Optional, Tuple
12+
13+
import pandas as pd
14+
15+
import graphistry
16+
from graphistry.Engine import Engine
17+
18+
19+
@dataclass(frozen=True)
class Scenario:
    """Immutable description of one hop() call shape to benchmark."""

    # Label shown in the results table.
    name: str
    # Value passed as hop(hops=...).
    hops: int
    # Value passed as hop(direction=...).
    direction: str
    # "seed0" | "all" -- "seed0" seeds from the node with id 0; any other
    # value passes nodes=None to hop().
    seed_mode: str
    # Value passed as hop(return_as_wave_front=...).
    return_as_wave_front: bool = True
26+
27+
28+
@dataclass(frozen=True)
class GraphSpec:
    """Immutable description of a synthetic benchmark graph."""

    # Label shown in the results table.
    name: str
    # Number of nodes to generate.
    nodes: int
    # Requested number of edges (the generators may produce fewer).
    edges: int
    # "linear" | "dense" -- build_graph treats any value other than "dense"
    # as linear.
    kind: str
34+
35+
36+
@dataclass
class ResultRow:
    """One timing measurement produced by the hop microbenchmark."""

    # Label of the benchmarked graph; created empty and filled in by main().
    graph: str
    # Name of the scenario that was timed.
    scenario: str
    # Mean wall-clock time per call, in milliseconds.
    ms: Optional[float]
41+
42+
43+
def make_linear_graph(n_nodes: int, n_edges: int) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Build a chain graph ``0 -> 1 -> 2 -> ...`` as pandas node/edge frames.

    Args:
        n_nodes: Number of nodes; node ids are ``0 .. n_nodes - 1``.
        n_edges: Requested number of edges. A chain over ``n_nodes`` nodes has
            at most ``n_nodes - 1`` edges, so larger values are capped.

    Returns:
        ``(nodes, edges)`` where ``nodes`` has an ``id`` column and ``edges``
        has ``src``, ``dst`` and ``eid`` columns. Edge ``i`` links node ``i``
        to node ``i + 1``.
    """
    # Clamp at zero so degenerate inputs (n_nodes <= 1, n_edges <= 0) yield an
    # empty edge frame that still carries the src/dst/eid columns.
    edge_count = max(0, min(n_edges, n_nodes - 1))
    nodes = pd.DataFrame({"id": range(n_nodes)})
    # Column-wise construction avoids building one dict per edge.
    edges = pd.DataFrame(
        {
            "src": range(edge_count),
            "dst": range(1, edge_count + 1),
            "eid": range(edge_count),
        }
    )
    return nodes, edges
50+
51+
52+
def make_dense_graph(n_nodes: int, n_edges: int) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Build a reproducible random graph whose edges satisfy ``src < dst``.

    Args:
        n_nodes: Number of nodes; node ids are ``0 .. n_nodes - 1``.
        n_edges: Number of edges to sample before de-duplication.

    Returns:
        ``(nodes, edges)`` where ``nodes`` has an ``id`` column and ``edges``
        has ``src``, ``dst`` and ``eid`` columns. Duplicate ``(src, dst)``
        pairs are dropped, so the frame may hold fewer than ``n_edges`` rows.
        With fewer than two nodes, or a non-positive ``n_edges``, the edge
        frame is empty but keeps its columns.
    """
    import random

    # A private generator gives the same sequence as random.seed(42) without
    # reseeding the process-wide RNG.
    rng = random.Random(42)
    nodes = pd.DataFrame({"id": range(n_nodes)})

    # No src < dst pair exists with fewer than two nodes.
    sample_count = n_edges if n_nodes >= 2 else 0
    src_col = []
    dst_col = []
    for _ in range(sample_count):
        # Draw src first, then dst, to keep the sampling order stable.
        src = rng.randint(0, n_nodes - 2)
        src_col.append(src)
        dst_col.append(rng.randint(src + 1, n_nodes - 1))

    # An explicit dtype keeps the empty frame integer-typed.
    edges = pd.DataFrame(
        {"src": src_col, "dst": dst_col, "eid": list(range(len(src_col)))},
        dtype="int64",
    ).drop_duplicates(subset=["src", "dst"])
    return nodes, edges
64+
65+
66+
def build_graph(spec: GraphSpec, engine: Engine):
    """Create the synthetic graph described by ``spec`` and bind it with graphistry.

    ``spec.kind == "dense"`` selects ``make_dense_graph``; every other kind
    uses ``make_linear_graph``. When ``engine`` is ``Engine.CUDF`` both frames
    are converted with ``cudf.from_pandas`` before binding; cudf is imported
    lazily so pandas-only runs do not need it.

    Returns:
        The object produced by
        ``graphistry.nodes(nodes, "id").edges(edges, "src", "dst")``.
    """
    make_frames = make_dense_graph if spec.kind == "dense" else make_linear_graph
    node_frame, edge_frame = make_frames(spec.nodes, spec.edges)

    if engine == Engine.CUDF:
        import cudf  # type: ignore

        node_frame = cudf.from_pandas(node_frame)
        edge_frame = cudf.from_pandas(edge_frame)

    bound_nodes = graphistry.nodes(node_frame, "id")
    return bound_nodes.edges(edge_frame, "src", "dst")
79+
80+
81+
def _time_call(fn, runs: int) -> float:
82+
times = []
83+
for _ in range(runs):
84+
start = time.perf_counter()
85+
fn()
86+
times.append((time.perf_counter() - start) * 1000)
87+
return sum(times) / len(times)
88+
89+
90+
def run_scenarios(g, scenarios: List[Scenario], runs: int) -> Iterable[ResultRow]:
    """Time ``g.hop`` once per scenario and yield the measurements.

    A scenario with ``seed_mode == "seed0"`` seeds from the rows of
    ``g._nodes`` whose ``id`` equals 0; any other mode passes ``nodes=None``.
    Every call uses ``to_fixed_point=False``.

    Yields:
        One ``ResultRow`` per scenario with an empty ``graph`` label and the
        mean time reported by ``_time_call``.
    """
    for scenario in scenarios:
        if scenario.seed_mode == "seed0":
            seeds = g._nodes[g._nodes["id"] == 0]
        else:
            seeds = None

        hop_options = dict(
            nodes=seeds,
            hops=scenario.hops,
            to_fixed_point=False,
            direction=scenario.direction,
            return_as_wave_front=scenario.return_as_wave_front,
        )
        # The lambda is invoked only inside this iteration, so it always sees
        # the options assembled just above.
        elapsed = _time_call(lambda: g.hop(**hop_options), runs)
        yield ResultRow(graph="", scenario=scenario.name, ms=elapsed)
107+
108+
109+
def build_scenarios() -> List[Scenario]:
    """Return the fixed scenario list: 2-hop forward/undirected, seeded from node 0 or all nodes."""
    return [
        Scenario(
            name="2hop_forward_seed0",
            hops=2,
            direction="forward",
            seed_mode="seed0",
            return_as_wave_front=True,
        ),
        Scenario(
            name="2hop_forward_all",
            hops=2,
            direction="forward",
            seed_mode="all",
            return_as_wave_front=True,
        ),
        Scenario(
            name="2hop_undirected_seed0",
            hops=2,
            direction="undirected",
            seed_mode="seed0",
            return_as_wave_front=True,
        ),
        Scenario(
            name="2hop_undirected_all",
            hops=2,
            direction="undirected",
            seed_mode="all",
            return_as_wave_front=True,
        ),
    ]
116+
117+
118+
def build_graph_specs() -> List[GraphSpec]:
    """Return the fixed set of synthetic graphs the microbenchmark runs on."""
    return [
        GraphSpec(name="small_linear", nodes=1_000, edges=2_000, kind="linear"),
        GraphSpec(name="medium_linear", nodes=10_000, edges=20_000, kind="linear"),
        GraphSpec(name="medium_dense", nodes=10_000, edges=50_000, kind="dense"),
    ]
124+
125+
126+
def write_markdown(results: Iterable[ResultRow], output_path: str) -> None:
    """Write the microbenchmark results to ``output_path`` as a Markdown report.

    Args:
        results: Rows to render; each needs ``graph``, ``scenario`` and ``ms``.
        output_path: Destination file, created or overwritten as UTF-8.

    A row whose ``ms`` is ``None`` is rendered as ``n/a``.
    """
    lines = [
        "# Hop Microbench Results",
        "",
        "Notes:",
        "- Direct hop() calls; no WHERE predicates.",
        "",
        "| Graph | Scenario | Time |",
        "|-------|----------|------|",
    ]
    for row in results:
        # ms is declared Optional; formatting None with ".2f" would raise TypeError.
        time_cell = "n/a" if row.ms is None else f"{row.ms:.2f}ms"
        lines.append(f"| {row.graph} | {row.scenario} | {time_cell} |")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines) + "\n")
141+
142+
143+
def main() -> None:
    """Parse CLI options, run every scenario on every graph spec, and report timings.

    A Markdown table is always printed to stdout; when ``--output`` is given
    the results are additionally written to that path via ``write_markdown``.
    """
    parser = argparse.ArgumentParser(description="Hop microbenchmarks.")
    parser.add_argument("--engine", default="pandas", choices=["pandas", "cudf"])
    parser.add_argument("--runs", type=int, default=3)
    parser.add_argument("--output", default="")
    args = parser.parse_args()
    if args.runs < 1:
        # Timings are averaged over the runs, so zero runs cannot be reported.
        parser.error("--runs must be >= 1")

    engine = Engine.CUDF if args.engine == "cudf" else Engine.PANDAS
    scenarios = build_scenarios()
    results: List[ResultRow] = []
    for spec in build_graph_specs():
        g = build_graph(spec, engine)
        for row in run_scenarios(g, scenarios, args.runs):
            # run_scenarios leaves the graph label empty; fill it in here.
            row.graph = spec.name
            results.append(row)

    if args.output:
        write_markdown(results, args.output)

    print("| Graph | Scenario | Time |")
    print("|-------|----------|------|")
    for row in results:
        # ms is declared Optional; avoid a TypeError when formatting None.
        time_cell = "n/a" if row.ms is None else f"{row.ms:.2f}ms"
        print(f"| {row.graph} | {row.scenario} | {time_cell} |")
166+
167+
168+
# Run the microbenchmarks only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

docs/pr_notes/pr-886-where.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# PR 886 Notes: GFQL WHERE + hop performance
2+
3+
## GPU toggles / experiments
4+
- `GRAPHISTRY_CUDF_SAME_PATH_MODE=auto|oracle|strict` controls same-path executor selection when `Engine.CUDF` is requested.
5+
- `GRAPHISTRY_HOP_FAST_PATH=0` disables hop fast-path traversal for A/B comparisons.
6+
7+
## Commits worth toggling (GPU perf/debug)
8+
- d05d9db9 perf(hop): domain-based fast path traversal
9+
- 6cc23688 perf(hop): undirected single-pass expansion
10+
- d1e11784 perf(df_executor): DF-native cuDF forward prune
11+
- e85fa8e7 fix(filter_by_dict): allow bool filters on object columns
12+
13+
## Manual benchmarks (not in CI)
14+
- `benchmarks/run_hop_microbench.py`
15+
- `benchmarks/run_hop_frontier_sweep.py`
16+
- Example: `uv run python benchmarks/run_hop_microbench.py --runs 5 --output /tmp/hop-microbench.md`

0 commit comments

Comments
 (0)