Commit b4e2f0d

Merge pull request #74 from VectorlyApp/agent-benchmarks
Agent benchmarks
2 parents 61bb6f6 + 827bb36 commit b4e2f0d

File tree

12 files changed: +3014 -301 lines

.gitignore

Lines changed: 2 additions & 1 deletion

@@ -215,4 +215,5 @@ cdp_captures/
 cdp_captures*/
 routine_discovery_output/
 routine_discovery_output*/
-downloads/
+downloads/
+benchmarks/

README.md

Lines changed: 22 additions & 0 deletions

@@ -631,6 +631,28 @@ See `quickstart.py` for a complete interactive example.
 
 - **return_screenshot** — capture and return a screenshot of the page as base64
 
+## Running Benchmarks 📊
+
+Benchmarks validate the routine discovery pipeline against known ground-truth routines. They run both deterministic tests (checking routine structure) and LLM-based tests (evaluating semantic correctness).
+
+```bash
+# Run all benchmarks
+python web_hacker/scripts/run_benchmarks.py
+
+# With verbose output (shows each test result as it runs)
+python web_hacker/scripts/run_benchmarks.py -v
+
+# Use a specific model
+python web_hacker/scripts/run_benchmarks.py --model gpt-4.1
+
+# Custom output directory
+python web_hacker/scripts/run_benchmarks.py --output-dir ./my_benchmarks
+```
+
+Results are saved to the output directory:
+- `{benchmark_name}.json` — Full evaluation results for each benchmark
+- `_summary.json` — Aggregated summary of all benchmark runs
+
 ## Contributing 🤝
 
 We welcome contributions! Here's how to get started:
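
A note on consuming these outputs: since each run writes one `{benchmark_name}.json` plus an aggregated `_summary.json`, a small post-processing script can walk the output directory. A minimal sketch in Python; the `benchmarks/` default directory is an assumption inferred from the new `.gitignore` entry, and no particular JSON schema is assumed:

```python
import json
from pathlib import Path

# Assumed default output directory (the new .gitignore entry suggests "benchmarks/").
output_dir = Path("benchmarks")

# Aggregated summary written by run_benchmarks.py.
summary = json.loads((output_dir / "_summary.json").read_text())
for key, value in summary.items():
    print(f"{key}: {value}")

# Per-benchmark result files ({benchmark_name}.json), skipping the summary itself.
for result_file in sorted(output_dir.glob("*.json")):
    if result_file.name == "_summary.json":
        continue
    results = json.loads(result_file.read_text())
    print(f"{result_file.stem}: loaded {type(results).__name__}")
```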

example_routines/massachusetts_corp_search_routine.json

Lines changed: 8 additions & 4 deletions

@@ -94,10 +94,14 @@
 "timeout_seconds": 1
 },
 {
-"type": "return_html",
-"scope": "element",
-"selector": "table",
-"timeout_ms": 20000
+"type": "js_evaluate",
+"js": "(()=>{const table=document.getElementById('MainContent_SearchControl_grdSearchResultsEntity');if(!table)return{error:'Table not found',results:[]};const rows=table.querySelectorAll('tr.GridRow');const results=[];for(const row of rows){const cells=row.querySelectorAll('td,th');const link=row.querySelector('a.link');results.push({entity_name:cells[0]?.textContent?.trim()||'',id_number:cells[1]?.textContent?.trim()||'',old_id_number:cells[2]?.textContent?.trim()||'',address:cells[3]?.textContent?.trim().replace(/\\s+/g,' ')||'',link:link?.href||''});}return{results,count:results.length};})()",
+"session_storage_key": "corp_search_results",
+"timeout_seconds": 10
+},
+{
+"type": "return",
+"session_storage_key": "corp_search_results"
 }
 ]
 }
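
The change above swaps the raw `return_html` step for a `js_evaluate` action that scrapes the results table in-page and stores structured rows under a session-storage key, followed by a `return` action that reads that same key. A quick consistency check over the routine file, sketched in Python; the name of the top-level key holding the action list (`actions` vs. `steps`) is an assumption about the routine schema:

```python
import json
from pathlib import Path

routine = json.loads(
    Path("example_routines/massachusetts_corp_search_routine.json").read_text()
)

# Schema assumption: the action list may live under "actions" or "steps".
actions = routine.get("actions") or routine.get("steps") or []

js_eval = next(a for a in actions if a.get("type") == "js_evaluate")
ret = next(a for a in actions if a.get("type") == "return")

# Both actions must agree on where the scraped rows are stashed.
assert (
    js_eval["session_storage_key"]
    == ret["session_storage_key"]
    == "corp_search_results"
)
print("js_evaluate/return pair is wired to the same session_storage_key")
```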

pyproject.toml

Lines changed: 2 additions & 1 deletion

@@ -6,7 +6,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "web-hacker"
-version = "1.2.1"
+version = "1.2.2"
 description = "SDK for reverse engineering web apps"
 readme = "README.md"
 requires-python = ">=3.12.3,<3.13"
@@ -46,6 +46,7 @@ dependencies = [
 "beautifulsoup4>=4.14.2",
 "pylint>=3.0.0",
 "pytest>=8.3.5",
+"jmespath>=1.0.1",
 ]
 
 [project.optional-dependencies]
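
`jmespath` joins the runtime dependencies; the diff does not say where it is used, but JMESPath is typically pulled in to query fields out of nested JSON (for example, captured API responses or benchmark result files). A minimal usage sketch with made-up data:

```python
import jmespath

# Hypothetical payload standing in for whatever JSON the SDK needs to query.
payload = {
    "results": [
        {"entity_name": "ACME CORP", "id_number": "001234567"},
        {"entity_name": "EXAMPLE LLC", "id_number": "007654321"},
    ]
}

# jmespath.search(expression, data) evaluates a JMESPath expression against the data.
names = jmespath.search("results[].entity_name", payload)
print(names)  # ['ACME CORP', 'EXAMPLE LLC']
```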

quickstart.py

Lines changed: 1 addition & 10 deletions

@@ -27,6 +27,7 @@
     GREEN, YELLOW, BLUE, CYAN,
     print_colored, print_header, ask_yes_no,
 )
+from web_hacker.utils.infra_utils import clear_directory
 
 # Configuration
 PORT = 9222
@@ -139,16 +140,6 @@ def launch_chrome(port: int) -> Optional[subprocess.Popen]:
     return None
 
 
-def clear_directory(path: Path) -> None:
-    """Clear all files and subdirectories in a directory."""
-    if path.exists():
-        for item in path.iterdir():
-            if item.is_file():
-                item.unlink()
-            elif item.is_dir():
-                shutil.rmtree(item)
-
-
 def step_1_monitor_browser(cdp_captures_dir: Path) -> bool:
     """Step 1: Monitor browser activity (launches Chrome if needed)."""
     print_header("Step 1: Monitor Browser Activity")
