Commit b4e2f0d

Merge pull request #74 from VectorlyApp/agent-benchmarks
Agent benchmarks
2 parents 61bb6f6 + 827bb36 commit b4e2f0d

File tree

12 files changed: +3014 -301 lines

.gitignore

Lines changed: 2 additions & 1 deletion

@@ -215,4 +215,5 @@ cdp_captures/
 cdp_captures*/
 routine_discovery_output/
 routine_discovery_output*/
-downloads/
+downloads/
+benchmarks/

README.md

Lines changed: 22 additions & 0 deletions

@@ -631,6 +631,28 @@ See `quickstart.py` for a complete interactive example.
 
 - **return_screenshot** — capture and return a screenshot of the page as base64
 
+## Running Benchmarks 📊
+
+Benchmarks validate the routine discovery pipeline against known ground-truth routines. They run both deterministic tests (checking routine structure) and LLM-based tests (evaluating semantic correctness).
+
+```bash
+# Run all benchmarks
+python web_hacker/scripts/run_benchmarks.py
+
+# With verbose output (shows each test result as it runs)
+python web_hacker/scripts/run_benchmarks.py -v
+
+# Use a specific model
+python web_hacker/scripts/run_benchmarks.py --model gpt-4.1
+
+# Custom output directory
+python web_hacker/scripts/run_benchmarks.py --output-dir ./my_benchmarks
+```
+
+Results are saved to the output directory:
+- `{benchmark_name}.json` — Full evaluation results for each benchmark
+- `_summary.json` — Aggregated summary of all benchmark runs
+
 ## Contributing 🤝
 
 We welcome contributions! Here's how to get started:
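
A note on consuming these outputs: since each run writes one `{benchmark_name}.json` plus an aggregated `_summary.json`, a small post-processing script can walk the output directory. A minimal sketch in Python; the `benchmarks/` default directory is an assumption inferred from the new `.gitignore` entry, and no particular JSON schema is assumed:

```python
import json
from pathlib import Path

# Assumed default output directory (the new .gitignore entry suggests "benchmarks/").
output_dir = Path("benchmarks")

# Aggregated summary written by run_benchmarks.py.
summary = json.loads((output_dir / "_summary.json").read_text())
for key, value in summary.items():
    print(f"{key}: {value}")

# Per-benchmark result files ({benchmark_name}.json), skipping the summary itself.
for result_file in sorted(output_dir.glob("*.json")):
    if result_file.name == "_summary.json":
        continue
    results = json.loads(result_file.read_text())
    print(f"{result_file.stem}: loaded {type(results).__name__}")
```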

example_routines/massachusetts_corp_search_routine.json

Lines changed: 8 additions & 4 deletions

@@ -94,10 +94,14 @@
 "timeout_seconds": 1
 },
 {
-"type": "return_html",
-"scope": "element",
-"selector": "table",
-"timeout_ms": 20000
+"type": "js_evaluate",
+"js": "(()=>{const table=document.getElementById('MainContent_SearchControl_grdSearchResultsEntity');if(!table)return{error:'Table not found',results:[]};const rows=table.querySelectorAll('tr.GridRow');const results=[];for(const row of rows){const cells=row.querySelectorAll('td,th');const link=row.querySelector('a.link');results.push({entity_name:cells[0]?.textContent?.trim()||'',id_number:cells[1]?.textContent?.trim()||'',old_id_number:cells[2]?.textContent?.trim()||'',address:cells[3]?.textContent?.trim().replace(/\\s+/g,' ')||'',link:link?.href||''});}return{results,count:results.length};})()",
+"session_storage_key": "corp_search_results",
+"timeout_seconds": 10
+},
+{
+"type": "return",
+"session_storage_key": "corp_search_results"
 }
 ]
 }
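
The change above swaps the raw `return_html` step for a `js_evaluate` action that scrapes the results table in-page and stores structured rows under a session-storage key, followed by a `return` action that reads that same key. A quick consistency check over the routine file, sketched in Python; the name of the top-level key holding the action list (`actions` vs. `steps`) is an assumption about the routine schema:

```python
import json
from pathlib import Path

routine = json.loads(
    Path("example_routines/massachusetts_corp_search_routine.json").read_text()
)

# Schema assumption: the action list may live under "actions" or "steps".
actions = routine.get("actions") or routine.get("steps") or []

js_eval = next(a for a in actions if a.get("type") == "js_evaluate")
ret = next(a for a in actions if a.get("type") == "return")

# Both actions must agree on where the scraped rows are stashed.
assert (
    js_eval["session_storage_key"]
    == ret["session_storage_key"]
    == "corp_search_results"
)
print("js_evaluate/return pair is wired to the same session_storage_key")
```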

pyproject.toml

Lines changed: 2 additions & 1 deletion

@@ -6,7 +6,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "web-hacker"
-version = "1.2.1"
+version = "1.2.2"
 description = "SDK for reverse engineering web apps"
 readme = "README.md"
 requires-python = ">=3.12.3,<3.13"
@@ -46,6 +46,7 @@ dependencies = [
 "beautifulsoup4>=4.14.2",
 "pylint>=3.0.0",
 "pytest>=8.3.5",
+"jmespath>=1.0.1",
 ]
 
 [project.optional-dependencies]
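
`jmespath` joins the runtime dependencies; the diff does not say where it is used, but JMESPath is typically pulled in to query fields out of nested JSON (for example, captured API responses or benchmark result files). A minimal usage sketch with made-up data:

```python
import jmespath

# Hypothetical payload standing in for whatever JSON the SDK needs to query.
payload = {
    "results": [
        {"entity_name": "ACME CORP", "id_number": "001234567"},
        {"entity_name": "EXAMPLE LLC", "id_number": "007654321"},
    ]
}

# jmespath.search(expression, data) evaluates a JMESPath expression against the data.
names = jmespath.search("results[].entity_name", payload)
print(names)  # ['ACME CORP', 'EXAMPLE LLC']
```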

quickstart.py

Lines changed: 1 addition & 10 deletions

@@ -27,6 +27,7 @@
     GREEN, YELLOW, BLUE, CYAN,
     print_colored, print_header, ask_yes_no,
 )
+from web_hacker.utils.infra_utils import clear_directory
 
 # Configuration
 PORT = 9222
@@ -139,16 +140,6 @@ def launch_chrome(port: int) -> Optional[subprocess.Popen]:
     return None
 
 
-def clear_directory(path: Path) -> None:
-    """Clear all files and subdirectories in a directory."""
-    if path.exists():
-        for item in path.iterdir():
-            if item.is_file():
-                item.unlink()
-            elif item.is_dir():
-                shutil.rmtree(item)
-
-
 def step_1_monitor_browser(cdp_captures_dir: Path) -> bool:
     """Step 1: Monitor browser activity (launches Chrome if needed)."""
     print_header("Step 1: Monitor Browser Activity")
