Skip to content

Commit 9358b0e

Browse files
Revert "Some evals in python for local-support (#58)" (#60)
This reverts commit 5e6b6ad.
1 parent 5e6b6ad commit 9358b0e

18 files changed

+145
-237
lines changed

README.md

Lines changed: 0 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -449,44 +449,6 @@ config = StagehandConfig(
449449
)
450450
```
451451

452-
## Evaluations
453-
454-
The Stagehand Python SDK includes a set of evaluations to test its core functionality. These evaluations are organized by the primary methods they test: `act`, `extract`, and `observe`.
455-
456-
### Running Evaluations
457-
458-
You can run evaluations using the `run_all_evals.py` script in the `evals/` directory:
459-
460-
```bash
461-
# Run only observe evaluations (default behavior)
462-
python -m evals.run_all_evals
463-
464-
# Run all evaluations (act, extract, and observe)
465-
python -m evals.run_all_evals --all
466-
467-
# Run a specific evaluation
468-
python -m evals.run_all_evals --all --eval observe_taxes
469-
python -m evals.run_all_evals --all --eval google_jobs
470-
471-
# Specify a different model
472-
python -m evals.run_all_evals --model gpt-4o-mini
473-
```
474-
475-
### Evaluation Types
476-
477-
The evaluations test the following capabilities:
478-
479-
- **act**: Tests for browser actions (clicking, typing)
480-
- `google_jobs`: Google jobs search and extraction
481-
482-
- **extract**: Tests for data extraction capabilities
483-
- `extract_press_releases`: Extracting press releases from a dummy site
484-
485-
- **observe**: Tests for element observation and identification
486-
- `observe_taxes`: Tax form elements observation
487-
488-
Results are printed to the console with a summary showing success/failure for each evaluation.
489-
490452
## License
491453

492454
MIT License (c) 2025 Browserbase, Inc.

evals/act/google_jobs.py

Lines changed: 14 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
import asyncio
22
import traceback
3-
from typing import Any, Optional, Dict
3+
from typing import Any, Optional, dict
44

55
from pydantic import BaseModel
66

@@ -19,43 +19,23 @@ class JobDetails(BaseModel):
1919
preferred_qualifications: Qualifications
2020

2121

22-
def is_job_details_valid(details: Dict[str, Any] | JobDetails) -> bool:
22+
def is_job_details_valid(details: dict[str, Any]) -> bool:
2323
"""
24-
Validates that the extracted job details are in the correct format.
25-
application_deadline is allowed to be None.
26-
For qualifications, degree and years_of_experience are allowed to be None.
24+
Validates that each top-level field in the extracted job details is not None.
25+
For nested dictionary values, each sub-value must be non-null and a string
26+
or a number.
2727
"""
2828
if not details:
2929
return False
30-
31-
# Convert Pydantic model to dict if needed
32-
if hasattr(details, "model_dump"):
33-
details_dict = details.model_dump()
34-
else:
35-
details_dict = details
36-
37-
# application_deadline is allowed to be None
38-
# minimum_qualifications and preferred_qualifications must exist
39-
required_fields = ["minimum_qualifications", "preferred_qualifications"]
40-
for field in required_fields:
41-
if field not in details_dict or details_dict[field] is None:
30+
for _key, value in details.items():
31+
if value is None:
4232
return False
43-
44-
# For qualifications, check that they're dictionaries but allow None values
45-
for field in ["minimum_qualifications", "preferred_qualifications"]:
46-
if not isinstance(details_dict[field], dict):
33+
if isinstance(value, dict):
34+
for v in value.values():
35+
if v is None or not isinstance(v, (str, int, float)):
36+
return False
37+
elif not isinstance(value, (str, int, float)):
4738
return False
48-
49-
# Each qualification should have the expected structure
50-
quals = details_dict[field]
51-
if "degree" not in quals or "years_of_experience" not in quals:
52-
return False
53-
54-
# Values can be None or proper types
55-
for k, v in quals.items():
56-
if v is not None and not isinstance(v, (str, int, float)):
57-
return False
58-
5939
return True
6040

6141

@@ -99,7 +79,7 @@ async def google_jobs(model_name: str, logger, use_text_extract: bool) -> dict:
9979
)
10080

10181
try:
102-
await stagehand.page.goto("https://www.google.com/")
82+
await stagehand.page.navigate("https://www.google.com/")
10383
await asyncio.sleep(3)
10484
await stagehand.page.act(ActOptions(action="click on the about page"))
10585
await stagehand.page.act(ActOptions(action="click on the careers page"))
@@ -116,13 +96,11 @@ async def google_jobs(model_name: str, logger, use_text_extract: bool) -> dict:
11696
"(degree and years of experience), and preferred qualifications "
11797
"(degree and years of experience)"
11898
),
119-
schemaDefinition=JobDetails,
99+
schemaDefinition=JobDetails.model_json_schema(),
120100
useTextExtract=use_text_extract,
121101
)
122102
)
123103

124-
print("Extracted job details:", job_details)
125-
126104
valid = is_job_details_valid(job_details)
127105

128106
await stagehand.close()

evals/env_loader.py

Lines changed: 0 additions & 41 deletions
This file was deleted.

evals/extract/extract_press_releases.py

Lines changed: 10 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import asyncio
2-
import os
32

43
from pydantic import BaseModel
54

@@ -48,8 +47,8 @@ async def extract_press_releases(model_name: str, logger, use_text_extract: bool
4847
session_url = init_response["sessionUrl"]
4948

5049
# Navigate to the dummy press releases page # TODO - choose a different page
51-
await stagehand.page.goto(
52-
"https://dummy-press-releases.surge.sh/news"
50+
await stagehand.page.navigate(
51+
"https://dummy-press-releases.surge.sh/news", wait_until="networkidle"
5352
)
5453
# Wait for 5 seconds to ensure content has loaded
5554
await asyncio.sleep(5)
@@ -62,21 +61,14 @@ async def extract_press_releases(model_name: str, logger, use_text_extract: bool
6261
"extract the title and corresponding publish date of EACH AND EVERY "
6362
"press releases on this page. DO NOT MISS ANY PRESS RELEASES."
6463
),
65-
schemaDefinition=PressReleases,
64+
schemaDefinition=PressReleases.model_json_schema(),
6665
useTextExtract=use_text_extract,
6766
)
6867
)
6968
print("Raw result:", raw_result)
70-
71-
# Get the items list from the raw_result, which could be a dict or a PressReleases object
72-
if isinstance(raw_result, PressReleases):
73-
items = raw_result.items
74-
elif isinstance(raw_result, dict) and "items" in raw_result:
75-
# Parse the raw result using the defined schema if it's a dictionary
76-
parsed = PressReleases.model_validate(raw_result)
77-
items = parsed.items
78-
else:
79-
error_message = "Extraction did not return valid press releases data."
69+
# Check that the extraction returned a valid dictionary
70+
if not raw_result or not isinstance(raw_result, dict):
71+
error_message = "Extraction did not return a valid dictionary."
8072
logger.error({"message": error_message, "raw_result": raw_result})
8173
return {
8274
"_success": False,
@@ -86,6 +78,10 @@ async def extract_press_releases(model_name: str, logger, use_text_extract: bool
8678
"sessionUrl": session_url,
8779
}
8880

81+
# Parse the raw result using the defined schema.
82+
parsed = PressReleases.parse_obj(raw_result)
83+
items = parsed.items
84+
8985
# Expected results (from the TS eval)
9086
expected_length = 28
9187
expected_first = PressRelease(

evals/run_all_evals.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,6 @@
33
import inspect
44
import os
55

6-
from .env_loader import load_evals_env
7-
8-
# Load environment variables at module import time
9-
load_evals_env()
106

117
# A simple logger to collect logs for the evals
128
class SimpleLogger:

evals/utils.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@
33
import sys
44
from typing import Any, Optional
55

6-
from .env_loader import load_evals_env
7-
86
# Try to import LiteLLM, which is used for model inference
97
try:
108
import litellm
@@ -91,9 +89,6 @@ async def complete(
9189

9290
def setup_environment():
9391
"""Set up the environment for running evaluations."""
94-
# First, load environment variables from .env files
95-
load_evals_env()
96-
9792
# If OPENAI_API_KEY is set but MODEL_API_KEY is not, copy it over
9893
if os.getenv("OPENAI_API_KEY") and not os.getenv("MODEL_API_KEY"):
9994
os.environ["MODEL_API_KEY"] = os.getenv("OPENAI_API_KEY")

stagehand/handlers/extract_handler.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,15 @@ def __init__(
3535
async def extract(
3636
self,
3737
options: Optional[ExtractOptions] = None,
38+
request_id: str = "",
3839
schema: Optional[type[BaseModel]] = None,
3940
) -> ExtractResult:
4041
"""
4142
Execute an extraction operation locally.
4243
4344
Args:
4445
options: ExtractOptions containing the instruction and other parameters
46+
request_id: Unique identifier for the request
4547
schema: Optional Pydantic model for structured output
4648
4749
Returns:
@@ -99,6 +101,7 @@ async def extract(
99101
tree_elements=output_string,
100102
schema=transformed_schema,
101103
llm_client=self.stagehand.llm,
104+
request_id=request_id,
102105
user_provided_instructions=self.user_provided_instructions,
103106
logger=self.logger,
104107
log_inference_to_file=False, # TODO: Implement logging to file if needed

stagehand/handlers/observe_handler.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,15 @@ def __init__(
3232
async def observe(
3333
self,
3434
options: ObserveOptions,
35+
*request_id: str,
3536
from_act: bool = False,
3637
) -> list[ObserveResult]:
3738
"""
3839
Execute an observation operation locally.
3940
4041
Args:
4142
options: ObserveOptions containing the instruction and other parameters
43+
request_id: Unique identifier for the request
4244
4345
Returns:
4446
list of ObserveResult instances
@@ -78,6 +80,7 @@ async def observe(
7880
instruction=instruction,
7981
tree_elements=output_string,
8082
llm_client=self.stagehand.llm,
83+
request_id=request_id,
8184
user_provided_instructions=self.user_provided_instructions,
8285
logger=self.logger,
8386
log_inference_to_file=False, # TODO: Implement logging to file if needed

stagehand/llm/client.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
"""LLM client for model interactions."""
22

33
import logging
4+
import time
45
from typing import Any, Callable, Optional
56

67
import litellm
78

8-
from stagehand.metrics import get_inference_time_ms, start_inference_timer
9+
from stagehand.metrics import start_inference_timer, get_inference_time_ms
910

1011
# Configure logger for the module
1112
logger = logging.getLogger(__name__)

stagehand/llm/inference.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ def observe(
2525
instruction: str,
2626
tree_elements: str,
2727
llm_client: Any,
28+
request_id: str,
2829
user_provided_instructions: Optional[str] = None,
2930
logger: Optional[Callable] = None,
3031
log_inference_to_file: bool = False,
@@ -37,6 +38,7 @@ def observe(
3738
instruction: The instruction to follow when finding elements
3839
tree_elements: String representation of DOM/accessibility tree elements
3940
llm_client: Client for calling LLM
41+
request_id: Unique ID for this request
4042
user_provided_instructions: Optional custom system instructions
4143
logger: Optional logger function
4244
log_inference_to_file: Whether to log inference to file
@@ -71,6 +73,7 @@ def observe(
7173
messages=messages,
7274
response_format=ObserveInferenceSchema,
7375
temperature=0.1,
76+
request_id=request_id,
7477
function_name="ACT" if from_act else "OBSERVE",
7578
)
7679
inference_time_ms = int((time.time() - start_time) * 1000)
@@ -128,6 +131,7 @@ def extract(
128131
tree_elements: str,
129132
schema: Optional[Union[type[BaseModel], dict]] = None,
130133
llm_client: Any = None,
134+
request_id: str = "",
131135
user_provided_instructions: Optional[str] = None,
132136
logger: Optional[Callable] = None,
133137
log_inference_to_file: bool = False,
@@ -142,6 +146,7 @@ def extract(
142146
tree_elements: The DOM or accessibility tree representation
143147
schema: Pydantic model defining the structure of the data to extract
144148
llm_client: The LLM client to use for the request
149+
request_id: Unique identifier for the request
145150
user_provided_instructions: Optional custom system instructions
146151
logger: Logger instance for logging
147152
log_inference_to_file: Whether to log inference to file
@@ -182,6 +187,7 @@ def extract(
182187
messages=extract_messages,
183188
response_format=response_format,
184189
temperature=0.1,
190+
request_id=request_id,
185191
function_name="EXTRACT", # Always set to EXTRACT
186192
**kwargs,
187193
)
@@ -232,6 +238,7 @@ def extract(
232238
messages=metadata_messages,
233239
response_format=metadata_schema,
234240
temperature=0.1,
241+
request_id=request_id,
235242
function_name="EXTRACT", # Metadata for extraction should also be tracked as EXTRACT
236243
)
237244
metadata_end_time = time.time()

0 commit comments

Comments
 (0)