Skip to content

Commit b9a17d5

Browse files
committed
feat: add integration for local_scraper
1 parent d874b16 commit b9a17d5

File tree

1 file changed

+53
-0
lines changed

1 file changed

+53
-0
lines changed

scrapegraph_py/local_scraper.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
from pydantic import BaseModel
2+
import requests
3+
from typing import Optional
4+
import json
5+
6+
def scrape_text(
    api_key: str,
    website_text: str,
    prompt: str,
    schema: Optional[BaseModel] = None,
    timeout: Optional[float] = 120.0,
) -> str:
    """Scrape and extract structured data from website text using ScrapeGraph AI.

    Sends ``website_text`` and ``prompt`` to the smartscraper endpoint in a
    single POST request and returns the raw response body.

    Args:
        api_key (str): Your ScrapeGraph AI API key, sent in the
            ``SGAI-API-KEY`` header.
        website_text (str): The text content to analyze.
        prompt (str): Natural language prompt describing what data to extract.
        schema (Optional[BaseModel]): Pydantic model defining the output
            structure, if provided. Its JSON schema is generated with
            ``model_json_schema()`` and only the ``title``, ``properties``
            and ``required`` entries are forwarded; any other keys of the
            generated schema (for example ``$defs``) are not sent.
        timeout (Optional[float]): Seconds to wait for the server before
            giving up. Pass ``None`` to wait indefinitely (the behaviour
            before this parameter existed).

    Returns:
        str: The response body as text on success. On failure no exception
        is raised for request errors; instead a JSON-encoded object with an
        ``"error"`` and a ``"message"`` key is returned (plus
        ``"status_code"`` for HTTP errors other than 403).
    """
    endpoint = "https://sgai-api.onrender.com/api/v1/smartscraper"
    headers = {
        "accept": "application/json",
        "SGAI-API-KEY": api_key,
        "Content-Type": "application/json",
    }

    payload = {
        "website_text": website_text,
        "user_prompt": prompt,
    }

    # Compare against None explicitly: the intent is "was a schema given",
    # not "is the schema object truthy".
    if schema is not None:
        schema_json = schema.model_json_schema()
        schema_title = schema_json.get("title", "Schema")
        payload["output_schema"] = {
            "description": schema_title,
            "name": schema_title,
            "properties": schema_json.get("properties", {}),
            "required": schema_json.get("required", []),
        }

    try:
        # Without a timeout a stalled server would block the caller forever;
        # a Timeout is a RequestException and is reported below.
        response = requests.post(
            endpoint, headers=headers, json=payload, timeout=timeout
        )
        response.raise_for_status()
    except requests.exceptions.HTTPError as http_err:
        # Read the status from the exception itself so this handler never
        # depends on the local ``response`` name having been bound.
        failed = http_err.response
        status_code = failed.status_code if failed is not None else None
        if status_code == 403:
            return json.dumps({"error": "Access forbidden (403)", "message": "You do not have permission to access this resource."})
        return json.dumps({"error": "HTTP error occurred", "message": str(http_err), "status_code": status_code})
    except requests.exceptions.RequestException as e:
        # Handle other request exceptions (e.g., connection errors, timeouts)
        return json.dumps({"error": "An error occurred", "message": str(e)})

    return response.text

0 commit comments

Comments
 (0)