16
16
ObserveOptions ,
17
17
ObserveResult ,
18
18
)
19
+ from .types import DefaultExtractSchema
19
20
20
21
_INJECTION_SCRIPT = None
21
22
@@ -236,15 +237,36 @@ async def extract(
236
237
# Otherwise use API implementation
237
238
# Allow for no options to extract the entire page
238
239
if options is None :
240
+ options_obj = ExtractOptions ()
239
241
payload = {}
240
242
# Convert string to ExtractOptions if needed
241
243
elif isinstance (options , str ):
242
- options = ExtractOptions (instruction = options )
243
- payload = options .model_dump (exclude_none = True , by_alias = True )
244
+ options_obj = ExtractOptions (instruction = options )
245
+ payload = options_obj .model_dump (exclude_none = True , by_alias = True )
244
246
# Otherwise, it should be an ExtractOptions object
245
247
else :
248
+ options_obj = options
246
249
# Allow extraction without instruction if other options (like schema) are provided
247
- payload = options .model_dump (exclude_none = True , by_alias = True )
250
+ payload = options_obj .model_dump (exclude_none = True , by_alias = True )
251
+
252
+ # Determine the schema to pass to the handler
253
+ schema_to_validate_with = None
254
+ if (
255
+ hasattr (options_obj , "schema_definition" )
256
+ and options_obj .schema_definition != DEFAULT_EXTRACT_SCHEMA
257
+ ):
258
+ if isinstance (options_obj .schema_definition , type ) and issubclass (
259
+ options_obj .schema_definition , BaseModel
260
+ ):
261
+ # Case 1: Pydantic model class
262
+ schema_to_validate_with = options_obj .schema_definition
263
+ elif isinstance (options_obj .schema_definition , dict ):
264
+ # TODO: revisit this case to pass the json_schema since litellm has a bug when passing it directly
265
+ # Case 2: Dictionary
266
+ # Assume it's a direct JSON schema dictionary
267
+ schema_to_validate_with = options_obj .schema_definition
268
+ else :
269
+ schema_to_validate_with = DefaultExtractSchema
248
270
249
271
# If in LOCAL mode, use local implementation
250
272
if self ._stagehand .env == "LOCAL" :
@@ -263,57 +285,39 @@ async def extract(
263
285
)
264
286
return result
265
287
266
- # Convert string to ExtractOptions if needed
267
- if isinstance (options , str ):
268
- options = ExtractOptions (instruction = options )
269
-
270
- # Determine the schema to pass to the handler
271
- schema_to_pass_to_handler = None
272
- if (
273
- hasattr (options , "schema_definition" )
274
- and options .schema_definition != DEFAULT_EXTRACT_SCHEMA
275
- ):
276
- if isinstance (options .schema_definition , type ) and issubclass (
277
- options .schema_definition , BaseModel
278
- ):
279
- # Case 1: Pydantic model class
280
- schema_to_pass_to_handler = options .schema_definition
281
- elif isinstance (options .schema_definition , dict ):
282
- # TODO: revisit this case to pass the json_schema since litellm has a bug when passing it directly
283
- # Case 2: Dictionary
284
- # Assume it's a direct JSON schema dictionary
285
- schema_to_pass_to_handler = options .schema_definition
286
-
287
288
# Call local extract implementation
288
289
result = await self ._extract_handler .extract (
289
- options ,
290
- schema_to_pass_to_handler ,
290
+ options_obj ,
291
+ schema_to_validate_with ,
291
292
)
292
293
return result .data
293
294
295
+ # Use API
294
296
lock = self ._stagehand ._get_lock_for_session ()
295
297
async with lock :
296
- result = await self ._stagehand ._execute ("extract" , payload )
298
+ result_dict = await self ._stagehand ._execute ("extract" , payload )
297
299
298
- # Attempt to parse the result using the base ExtractResult,
299
- # which allows extra fields based on the dynamic schema.
300
- if isinstance (result , dict ):
300
+ if isinstance (result_dict , dict ):
301
301
# Pydantic will validate against known fields + allow extras if configured
302
- try :
303
- # Note: We don't know the exact return structure here,
304
- # ExtractResult allows extra fields.
305
- # The user needs to access data based on their schema.
306
- return ExtractResult (** result )
307
- except Exception as e :
308
- self ._stagehand .logger .error (f"Failed to parse extract result: { e } " )
309
- # Return raw dict if parsing fails, or raise? Returning dict for now.
310
- return result # type: ignore
302
+ processed_data_payload = result_dict
303
+ if schema_to_validate_with and isinstance (processed_data_payload , dict ):
304
+ try :
305
+ validated_model = schema_to_validate_with .model_validate (
306
+ processed_data_payload
307
+ )
308
+ processed_data_payload = (
309
+ validated_model # Payload is now the Pydantic model instance
310
+ )
311
+ except Exception as e :
312
+ self ._stagehand .logger .error (
313
+ f"Failed to validate extracted data against schema { schema_to_validate_with .__name__ } : { e } . Keeping raw data dict in .data field."
314
+ )
315
+ return ExtractResult (data = processed_data_payload ).data
311
316
# Handle unexpected return types
312
317
self ._stagehand .logger .info (
313
- f"Unexpected result type from extract: { type (result )} "
318
+ f"Unexpected result type from extract: { type (result_dict )} "
314
319
)
315
- # Return raw result if not dict or raise error
316
- return result # type: ignore
320
+ return result_dict
317
321
318
322
async def screenshot (self , options : Optional [dict ] = None ) -> str :
319
323
"""
0 commit comments