11"""
22This module provides a function to scrape and extract structured data from a webpage
33using the ScrapeGraph AI API. It allows specifying a schema for the output structure
4- using either a dictionary or a Pydantic model.
4+ using a Pydantic model.
55"""
66
7- from typing import Union
87from pydantic import BaseModel
98import requests
109
11- def scrape (api_key : str , url : str , prompt : str , schema : Union [ dict , BaseModel , None ] = None ) -> str :
10+ def scrape (api_key : str , url : str , prompt : str , schema : BaseModel ) -> str :
1211 """Scrape and extract structured data from a webpage using ScrapeGraph AI.
1312
1413 Args:
1514 api_key (str): Your ScrapeGraph AI API key
1615 url (str): The URL of the webpage to scrape
1716 prompt (str): Natural language prompt describing what data to extract
18- schema (Union[dict, BaseModel, None], optional ): Schema definition for the output structure.
19- Can be either a dictionary or a Pydantic model. If None, returns raw JSON .
17+ schema (BaseModel): Pydantic model defining the output structure.
18+ The model will be converted to a JSON schema before making the request.
2019
2120 Returns:
22- str: Extracted data in JSON format matching the provided schema (if specified)
21+ str: Extracted data in JSON format matching the provided schema
2322 """
2423 endpoint = "https://api.scrapegraph.ai/v1/scrape"
2524 headers = {
@@ -30,10 +29,10 @@ def scrape(api_key: str, url: str, prompt: str, schema: Union[dict, BaseModel, N
3029 payload = {
3130 "url" : url ,
3231 "prompt" : prompt ,
33- "schema" : schema .dict () if isinstance ( schema , BaseModel ) else schema if schema is not None else None
32+ "schema" : schema .model_json_schema ()
3433 }
3534
3635 response = requests .post (endpoint , headers = headers , json = payload )
37- response .raise_for_status () # Raise an exception for bad status codes
36+ response .raise_for_status ()
3837
3938 return response .text
0 commit comments