diff --git a/.changeset/daffy-rapid-turaco.md b/.changeset/daffy-rapid-turaco.md new file mode 100644 index 00000000..4767deb0 --- /dev/null +++ b/.changeset/daffy-rapid-turaco.md @@ -0,0 +1,5 @@ +--- +"stagehand": patch +--- + +Fix parsing schema for extract with no arguments (full page extract) diff --git a/format b/format new file mode 100755 index 00000000..b0451ae0 --- /dev/null +++ b/format @@ -0,0 +1,17 @@ +#!/bin/bash + +# Define source directories (adjust as needed) +SOURCE_DIRS="stagehand" + +# Apply Black formatting first +echo "Applying Black formatting..." +black $SOURCE_DIRS + +# Apply Ruff with autofix for all issues (including import sorting) +echo "Applying Ruff autofixes (including import sorting)..." +ruff check --fix $SOURCE_DIRS + +echo "Checking for remaining issues..." +ruff check $SOURCE_DIRS + +echo "Done! Code has been formatted and linted." \ No newline at end of file diff --git a/stagehand/handlers/extract_handler.py b/stagehand/handlers/extract_handler.py index 21af7a99..89e37b82 100644 --- a/stagehand/handlers/extract_handler.py +++ b/stagehand/handlers/extract_handler.py @@ -7,7 +7,12 @@ from stagehand.a11y.utils import get_accessibility_tree from stagehand.llm.inference import extract as extract_inference from stagehand.metrics import StagehandFunctionName # Changed import location -from stagehand.types import DefaultExtractSchema, ExtractOptions, ExtractResult +from stagehand.types import ( + DefaultExtractSchema, + EmptyExtractSchema, + ExtractOptions, + ExtractResult, +) from stagehand.utils import inject_urls, transform_url_strings_to_ids T = TypeVar("T", bound=BaseModel) @@ -166,4 +171,6 @@ async def _extract_page_text(self) -> ExtractResult: tree = await get_accessibility_tree(self.stagehand_page, self.logger) output_string = tree["simplified"] - return ExtractResult(data=output_string) + output_dict = {"page_text": output_string} + validated_model = EmptyExtractSchema.model_validate(output_dict) + return ExtractResult(data=validated_model).data diff --git a/stagehand/page.py b/stagehand/page.py index 5da0ab25..82946d45 100644 --- a/stagehand/page.py +++ b/stagehand/page.py @@ -16,7 +16,7 @@ ObserveOptions, ObserveResult, ) -from .types import DefaultExtractSchema +from .types import DefaultExtractSchema, EmptyExtractSchema _INJECTION_SCRIPT = None @@ -361,12 +361,17 @@ async def extract( processed_data_payload = result_dict if schema_to_validate_with and isinstance(processed_data_payload, dict): try: - validated_model = schema_to_validate_with.model_validate( - processed_data_payload - ) - processed_data_payload = ( - validated_model # Payload is now the Pydantic model instance - ) + # For extract with no params + if not options_obj: + validated_model = EmptyExtractSchema.model_validate( + processed_data_payload + ) + processed_data_payload = validated_model + else: + validated_model = schema_to_validate_with.model_validate( + processed_data_payload + ) + processed_data_payload = validated_model except Exception as e: self._stagehand.logger.error( f"Failed to validate extracted data against schema {schema_to_validate_with.__name__}: {e}. Keeping raw data dict in .data field." diff --git a/stagehand/types/__init__.py b/stagehand/types/__init__.py index ac1af176..9d6378f7 100644 --- a/stagehand/types/__init__.py +++ b/stagehand/types/__init__.py @@ -23,6 +23,7 @@ ActOptions, ActResult, DefaultExtractSchema, + EmptyExtractSchema, ExtractOptions, ExtractResult, MetadataSchema, @@ -56,4 +57,5 @@ "AgentConfig", "AgentExecuteOptions", "AgentResult", + "EmptyExtractSchema", ] diff --git a/stagehand/types/page.py b/stagehand/types/page.py index ecfee164..83de4c56 100644 --- a/stagehand/types/page.py +++ b/stagehand/types/page.py @@ -9,6 +9,10 @@ class DefaultExtractSchema(BaseModel): extraction: str +class EmptyExtractSchema(BaseModel): + page_text: str + + class ObserveElementSchema(BaseModel): element_id: int description: str = Field(