Skip to content

Commit c013933

Browse files
Fm/improvements (#61)
* update * update * remove request_id param * fix press releases * update more evals * update readme
1 parent 9358b0e commit c013933

18 files changed

+237
-145
lines changed

README.md

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -449,6 +449,44 @@ config = StagehandConfig(
449449
)
450450
```
451451

452+
## Evaluations
453+
454+
The Stagehand Python SDK includes a set of evaluations to test its core functionality. These evaluations are organized by the primary methods they test: `act`, `extract`, and `observe`.
455+
456+
### Running Evaluations
457+
458+
You can run evaluations using the `run_all_evals.py` script in the `evals/` directory:
459+
460+
```bash
461+
# Run only observe evaluations (default behavior)
462+
python -m evals.run_all_evals
463+
464+
# Run all evaluations (act, extract, and observe)
465+
python -m evals.run_all_evals --all
466+
467+
# Run a specific evaluation
468+
python -m evals.run_all_evals --all --eval observe_taxes
469+
python -m evals.run_all_evals --all --eval google_jobs
470+
471+
# Specify a different model
472+
python -m evals.run_all_evals --model gpt-4o-mini
473+
```
474+
475+
### Evaluation Types
476+
477+
The evaluations test the following capabilities:
478+
479+
- **act**: Tests for browser actions (clicking, typing)
480+
- `google_jobs`: Google jobs search and extraction
481+
482+
- **extract**: Tests for data extraction capabilities
483+
- `extract_press_releases`: Extracting press releases from a dummy site
484+
485+
- **observe**: Tests for element observation and identification
486+
- `observe_taxes`: Tax form elements observation
487+
488+
Results are printed to the console with a summary showing success/failure for each evaluation.
489+
452490
## License
453491

454492
MIT License (c) 2025 Browserbase, Inc.

evals/act/google_jobs.py

Lines changed: 36 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import asyncio
22
import traceback
3-
from typing import Any, Optional, dict
3+
from typing import Any, Optional, Dict
44

55
from pydantic import BaseModel
66

@@ -19,23 +19,43 @@ class JobDetails(BaseModel):
1919
preferred_qualifications: Qualifications
2020

2121

22-
def is_job_details_valid(details: dict[str, Any]) -> bool:
22+
def is_job_details_valid(details: Dict[str, Any] | JobDetails) -> bool:
2323
"""
24-
Validates that each top-level field in the extracted job details is not None.
25-
For nested dictionary values, each sub-value must be non-null and a string
26-
or a number.
24+
Validates that the extracted job details are in the correct format.
25+
application_deadline is allowed to be None.
26+
For qualifications, degree and years_of_experience are allowed to be None.
2727
"""
2828
if not details:
2929
return False
30-
for _key, value in details.items():
31-
if value is None:
30+
31+
# Convert Pydantic model to dict if needed
32+
if hasattr(details, "model_dump"):
33+
details_dict = details.model_dump()
34+
else:
35+
details_dict = details
36+
37+
# application_deadline is allowed to be None
38+
# minimum_qualifications and preferred_qualifications must exist
39+
required_fields = ["minimum_qualifications", "preferred_qualifications"]
40+
for field in required_fields:
41+
if field not in details_dict or details_dict[field] is None:
3242
return False
33-
if isinstance(value, dict):
34-
for v in value.values():
35-
if v is None or not isinstance(v, (str, int, float)):
36-
return False
37-
elif not isinstance(value, (str, int, float)):
43+
44+
# For qualifications, check that they're dictionaries but allow None values
45+
for field in ["minimum_qualifications", "preferred_qualifications"]:
46+
if not isinstance(details_dict[field], dict):
3847
return False
48+
49+
# Each qualification should have the expected structure
50+
quals = details_dict[field]
51+
if "degree" not in quals or "years_of_experience" not in quals:
52+
return False
53+
54+
# Values can be None or proper types
55+
for k, v in quals.items():
56+
if v is not None and not isinstance(v, (str, int, float)):
57+
return False
58+
3959
return True
4060

4161

@@ -79,7 +99,7 @@ async def google_jobs(model_name: str, logger, use_text_extract: bool) -> dict:
7999
)
80100

81101
try:
82-
await stagehand.page.navigate("https://www.google.com/")
102+
await stagehand.page.goto("https://www.google.com/")
83103
await asyncio.sleep(3)
84104
await stagehand.page.act(ActOptions(action="click on the about page"))
85105
await stagehand.page.act(ActOptions(action="click on the careers page"))
@@ -96,11 +116,13 @@ async def google_jobs(model_name: str, logger, use_text_extract: bool) -> dict:
96116
"(degree and years of experience), and preferred qualifications "
97117
"(degree and years of experience)"
98118
),
99-
schemaDefinition=JobDetails.model_json_schema(),
119+
schemaDefinition=JobDetails,
100120
useTextExtract=use_text_extract,
101121
)
102122
)
103123

124+
print("Extracted job details:", job_details)
125+
104126
valid = is_job_details_valid(job_details)
105127

106128
await stagehand.close()

evals/env_loader.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
"""
2+
Environment variable loader for Stagehand evaluations.
3+
4+
This module loads environment variables from an .env file in the evals directory,
5+
making them available to all submodules (act, extract, observe).
6+
"""
7+
import os
8+
from pathlib import Path
9+
from dotenv import load_dotenv
10+
11+
def load_evals_env():
12+
"""
13+
Load environment variables from the .env file in the evals directory.
14+
This ensures all submodules have access to the same environment variables.
15+
"""
16+
# Get the evals directory path (where this file is located)
17+
evals_dir = Path(__file__).parent.absolute()
18+
env_path = evals_dir / '.env'
19+
20+
# Load from root directory as fallback if evals/.env doesn't exist
21+
root_env_path = evals_dir.parent / '.env'
22+
23+
# First try to load from evals/.env
24+
if env_path.exists():
25+
print(f"Loading environment variables from {env_path}")
26+
load_dotenv(env_path)
27+
# Fall back to root .env file if it exists
28+
elif root_env_path.exists():
29+
print(f"Loading environment variables from {root_env_path}")
30+
load_dotenv(root_env_path)
31+
else:
32+
print("No .env file found. Please create one in the evals directory.")
33+
print("Required variables: MODEL_API_KEY, BROWSERBASE_API_KEY, BROWSERBASE_PROJECT_ID")
34+
35+
# Check for essential environment variables
36+
essential_vars = ['MODEL_API_KEY', 'BROWSERBASE_API_KEY', 'BROWSERBASE_PROJECT_ID']
37+
missing_vars = [var for var in essential_vars if not os.getenv(var)]
38+
39+
if missing_vars:
40+
print(f"Warning: Missing essential environment variables: {', '.join(missing_vars)}")
41+
print("Some evaluations may fail without these variables.")

evals/extract/extract_press_releases.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import asyncio
2+
import os
23

34
from pydantic import BaseModel
45

@@ -47,8 +48,8 @@ async def extract_press_releases(model_name: str, logger, use_text_extract: bool
4748
session_url = init_response["sessionUrl"]
4849

4950
# Navigate to the dummy press releases page # TODO - choose a different page
50-
await stagehand.page.navigate(
51-
"https://dummy-press-releases.surge.sh/news", wait_until="networkidle"
51+
await stagehand.page.goto(
52+
"https://dummy-press-releases.surge.sh/news"
5253
)
5354
# Wait for 5 seconds to ensure content has loaded
5455
await asyncio.sleep(5)
@@ -61,14 +62,21 @@ async def extract_press_releases(model_name: str, logger, use_text_extract: bool
6162
"extract the title and corresponding publish date of EACH AND EVERY "
6263
"press releases on this page. DO NOT MISS ANY PRESS RELEASES."
6364
),
64-
schemaDefinition=PressReleases.model_json_schema(),
65+
schemaDefinition=PressReleases,
6566
useTextExtract=use_text_extract,
6667
)
6768
)
6869
print("Raw result:", raw_result)
69-
# Check that the extraction returned a valid dictionary
70-
if not raw_result or not isinstance(raw_result, dict):
71-
error_message = "Extraction did not return a valid dictionary."
70+
71+
# Get the items list from the raw_result, which could be a dict or a PressReleases object
72+
if isinstance(raw_result, PressReleases):
73+
items = raw_result.items
74+
elif isinstance(raw_result, dict) and "items" in raw_result:
75+
# Parse the raw result using the defined schema if it's a dictionary
76+
parsed = PressReleases.model_validate(raw_result)
77+
items = parsed.items
78+
else:
79+
error_message = "Extraction did not return valid press releases data."
7280
logger.error({"message": error_message, "raw_result": raw_result})
7381
return {
7482
"_success": False,
@@ -78,10 +86,6 @@ async def extract_press_releases(model_name: str, logger, use_text_extract: bool
7886
"sessionUrl": session_url,
7987
}
8088

81-
# Parse the raw result using the defined schema.
82-
parsed = PressReleases.parse_obj(raw_result)
83-
items = parsed.items
84-
8589
# Expected results (from the TS eval)
8690
expected_length = 28
8791
expected_first = PressRelease(

evals/run_all_evals.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,10 @@
33
import inspect
44
import os
55

6+
from .env_loader import load_evals_env
7+
8+
# Load environment variables at module import time
9+
load_evals_env()
610

711
# A simple logger to collect logs for the evals
812
class SimpleLogger:

evals/utils.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
import sys
44
from typing import Any, Optional
55

6+
from .env_loader import load_evals_env
7+
68
# Try to import LiteLLM, which is used for model inference
79
try:
810
import litellm
@@ -89,6 +91,9 @@ async def complete(
8991

9092
def setup_environment():
9193
"""Set up the environment for running evaluations."""
94+
# First, load environment variables from .env files
95+
load_evals_env()
96+
9297
# If OPENAI_API_KEY is set but MODEL_API_KEY is not, copy it over
9398
if os.getenv("OPENAI_API_KEY") and not os.getenv("MODEL_API_KEY"):
9499
os.environ["MODEL_API_KEY"] = os.getenv("OPENAI_API_KEY")

stagehand/handlers/extract_handler.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,15 +35,13 @@ def __init__(
3535
async def extract(
3636
self,
3737
options: Optional[ExtractOptions] = None,
38-
request_id: str = "",
3938
schema: Optional[type[BaseModel]] = None,
4039
) -> ExtractResult:
4140
"""
4241
Execute an extraction operation locally.
4342
4443
Args:
4544
options: ExtractOptions containing the instruction and other parameters
46-
request_id: Unique identifier for the request
4745
schema: Optional Pydantic model for structured output
4846
4947
Returns:
@@ -101,7 +99,6 @@ async def extract(
10199
tree_elements=output_string,
102100
schema=transformed_schema,
103101
llm_client=self.stagehand.llm,
104-
request_id=request_id,
105102
user_provided_instructions=self.user_provided_instructions,
106103
logger=self.logger,
107104
log_inference_to_file=False, # TODO: Implement logging to file if needed

stagehand/handlers/observe_handler.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,15 +32,13 @@ def __init__(
3232
async def observe(
3333
self,
3434
options: ObserveOptions,
35-
*request_id: str,
3635
from_act: bool = False,
3736
) -> list[ObserveResult]:
3837
"""
3938
Execute an observation operation locally.
4039
4140
Args:
4241
options: ObserveOptions containing the instruction and other parameters
43-
request_id: Unique identifier for the request
4442
4543
Returns:
4644
list of ObserveResult instances
@@ -80,7 +78,6 @@ async def observe(
8078
instruction=instruction,
8179
tree_elements=output_string,
8280
llm_client=self.stagehand.llm,
83-
request_id=request_id,
8481
user_provided_instructions=self.user_provided_instructions,
8582
logger=self.logger,
8683
log_inference_to_file=False, # TODO: Implement logging to file if needed

stagehand/llm/client.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
11
"""LLM client for model interactions."""
22

33
import logging
4-
import time
54
from typing import Any, Callable, Optional
65

76
import litellm
87

9-
from stagehand.metrics import start_inference_timer, get_inference_time_ms
8+
from stagehand.metrics import get_inference_time_ms, start_inference_timer
109

1110
# Configure logger for the module
1211
logger = logging.getLogger(__name__)

stagehand/llm/inference.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@ def observe(
2525
instruction: str,
2626
tree_elements: str,
2727
llm_client: Any,
28-
request_id: str,
2928
user_provided_instructions: Optional[str] = None,
3029
logger: Optional[Callable] = None,
3130
log_inference_to_file: bool = False,
@@ -38,7 +37,6 @@ def observe(
3837
instruction: The instruction to follow when finding elements
3938
tree_elements: String representation of DOM/accessibility tree elements
4039
llm_client: Client for calling LLM
41-
request_id: Unique ID for this request
4240
user_provided_instructions: Optional custom system instructions
4341
logger: Optional logger function
4442
log_inference_to_file: Whether to log inference to file
@@ -73,7 +71,6 @@ def observe(
7371
messages=messages,
7472
response_format=ObserveInferenceSchema,
7573
temperature=0.1,
76-
request_id=request_id,
7774
function_name="ACT" if from_act else "OBSERVE",
7875
)
7976
inference_time_ms = int((time.time() - start_time) * 1000)
@@ -131,7 +128,6 @@ def extract(
131128
tree_elements: str,
132129
schema: Optional[Union[type[BaseModel], dict]] = None,
133130
llm_client: Any = None,
134-
request_id: str = "",
135131
user_provided_instructions: Optional[str] = None,
136132
logger: Optional[Callable] = None,
137133
log_inference_to_file: bool = False,
@@ -146,7 +142,6 @@ def extract(
146142
tree_elements: The DOM or accessibility tree representation
147143
schema: Pydantic model defining the structure of the data to extract
148144
llm_client: The LLM client to use for the request
149-
request_id: Unique identifier for the request
150145
user_provided_instructions: Optional custom system instructions
151146
logger: Logger instance for logging
152147
log_inference_to_file: Whether to log inference to file
@@ -187,7 +182,6 @@ def extract(
187182
messages=extract_messages,
188183
response_format=response_format,
189184
temperature=0.1,
190-
request_id=request_id,
191185
function_name="EXTRACT", # Always set to EXTRACT
192186
**kwargs,
193187
)
@@ -238,7 +232,6 @@ def extract(
238232
messages=metadata_messages,
239233
response_format=metadata_schema,
240234
temperature=0.1,
241-
request_id=request_id,
242235
function_name="EXTRACT", # Metadata for extraction should also be tracked as EXTRACT
243236
)
244237
metadata_end_time = time.time()

0 commit comments

Comments (0)