1+ """
2+ Module for Scraping Web Data with ScrapeGraph AI
3+
4+ This module provides functionality to scrape and extract structured data from
5+ webpages using the ScrapeGraph AI API. It defines the `scrape` function, which
6+ takes a ScrapeGraph client, a URL, a user prompt, and an optional Pydantic schema
7+ to specify the desired output structure. The extracted data is returned as a
8+ JSON-formatted string, adhering to the specified schema if provided. This module is designed to
9+ facilitate the integration of web scraping capabilities into applications utilizing
10+ ScrapeGraph AI services.
11+ """
12+ from typing import Optional
13+
114from pydantic import BaseModel
215import requests
3- from typing import Optional
4- import json
516from .client import ScrapeGraphClient
6- from .exceptions import APIError
17+ from .exceptions import APIError , raise_for_status_code
718
8- def raise_for_status_code (status_code : int , response : requests .Response ):
9- if status_code >= 400 :
10- raise APIError (f"API request failed with status { status_code } " , response = response )
11-
12- def scrape (client : ScrapeGraphClient , url : str , prompt : str , schema : Optional [BaseModel ] = None ) -> str :
19+ def scrape (client : ScrapeGraphClient , url : str , prompt : str ,
20+ schema : Optional [BaseModel ] = None ) -> str :
1321 """Scrape and extract structured data from a webpage using ScrapeGraph AI.
1422
1523 Args:
@@ -25,12 +33,13 @@ def scrape(client: ScrapeGraphClient, url: str, prompt: str, schema: Optional[Ba
2533 """
2634 endpoint = client .get_endpoint ("smartscraper" )
2735 headers = client .get_headers ()
28-
36+
2937 payload = {
3038 "website_url" : url ,
31- "user_prompt" : prompt
39+ "user_prompt" : prompt ,
40+ "output_schema" : {}
3241 }
33-
42+
3443 if schema :
3544 schema_json = schema .model_json_schema ()
3645 payload ["output_schema" ] = {
@@ -39,10 +48,10 @@ def scrape(client: ScrapeGraphClient, url: str, prompt: str, schema: Optional[Ba
3948 "properties" : schema_json .get ("properties" , {}),
4049 "required" : schema_json .get ("required" , [])
4150 }
42-
51+
4352 try :
44- response = requests .post (endpoint , headers = headers , json = payload )
53+ response = requests .post (endpoint , headers = headers , json = payload , timeout = 10 )
4554 raise_for_status_code (response .status_code , response )
4655 return response .text
4756 except requests .exceptions .RequestException as e :
48- raise APIError (f"Request failed: { str (e )} " , response = None )
57+ raise APIError (f"Request failed: { str (e )} " , response = None )
0 commit comments