diff --git a/autogpt_platform/backend/backend/blocks/firecrawl/extract.py b/autogpt_platform/backend/backend/blocks/firecrawl/extract.py old mode 100755 new mode 100644 index 4f54b102a07a..10ef20a3512f --- a/autogpt_platform/backend/backend/blocks/firecrawl/extract.py +++ b/autogpt_platform/backend/backend/blocks/firecrawl/extract.py @@ -19,6 +19,52 @@ from ._config import firecrawl +def normalize_to_json_schema(schema: dict | None) -> dict | None: + """ + Normalize a simplified schema format into valid JSON Schema format. + + Transforms simplified schemas like {"field": "type"} into proper JSON Schema format: + {"type": "object", "properties": {"field": {"type": "type"}}} + + If the schema already appears to be a valid JSON Schema (has "type" or "properties"), + it is returned as-is. + + Args: + schema: The schema to normalize, or None + + Returns: + A valid JSON Schema dict, or None if input was None + """ + if schema is None: + return None + + # If it already has "type" at the root level, assume it's already a JSON Schema + if "type" in schema: + return schema + + # If it already has "properties", assume it's already a JSON Schema + if "properties" in schema: + return schema + + # Otherwise, treat it as a simplified format and transform it + properties = {} + for key, value in schema.items(): + if isinstance(value, str): + # Simple type string like "string", "number", etc. + properties[key] = {"type": value} + elif isinstance(value, dict): + # Already a property definition, use as-is + properties[key] = value + else: + # Fallback: treat as any type + properties[key] = {"type": "string"} + + return { + "type": "object", + "properties": properties, + } + + @cost(BlockCost(2, BlockCostType.RUN)) class FirecrawlExtractBlock(Block): class Input(BlockSchemaInput): @@ -30,7 +76,7 @@ class Input(BlockSchemaInput): description="The prompt to use for the crawl", default=None, advanced=False ) output_schema: dict | None = SchemaField( - description="A Json Schema describing the output structure if more rigid structure is desired.", + description='A JSON Schema describing the output structure. Supports both simplified format (e.g., {"field": "string"}) and full JSON Schema format (e.g., {"type": "object", "properties": {"field": {"type": "string"}}}).', default=None, ) enable_web_search: bool = SchemaField( @@ -59,10 +105,13 @@ async def run( ) -> BlockOutput: app = FirecrawlApp(api_key=credentials.api_key.get_secret_value()) + # Normalize the schema to ensure it's in valid JSON Schema format + normalized_schema = normalize_to_json_schema(input_data.output_schema) + extract_result = app.extract( urls=input_data.urls, prompt=input_data.prompt, - schema=input_data.output_schema, + schema=normalized_schema, enable_web_search=input_data.enable_web_search, )