Commit 6961e16

Merge pull request #1 from ScrapeGraphAI/py-development

Py development

2 parents: 9617b60 + 3c72586

16 files changed: +342 −147 lines

scrapegraph-py/README.md

Lines changed: 45 additions & 0 deletions

```diff
@@ -34,6 +34,51 @@ The SDK provides four main functionalities:
 
 ## Usage
 
+### Basic Web Scraping
+
+```python
+import os
+
+from scrapegraph_py import ScrapeGraphClient, scrape
+from dotenv import load_dotenv
+
+load_dotenv()
+api_key = os.getenv("SCRAPEGRAPH_API_KEY")
+client = ScrapeGraphClient(api_key)
+
+url = "https://scrapegraphai.com/"
+prompt = "What does the company do?"
+
+result = scrape(client, url, prompt)
+print(result)
+```
+
+### Local HTML Scraping
+
+You can also scrape content from local HTML files:
+
+```python
+import os
+
+from scrapegraph_py import ScrapeGraphClient, scrape_text
+from bs4 import BeautifulSoup
+from dotenv import load_dotenv
+
+def scrape_local_html(client: ScrapeGraphClient, file_path: str, prompt: str):
+    with open(file_path, 'r', encoding='utf-8') as file:
+        html_content = file.read()
+
+    # Use BeautifulSoup to extract text content
+    soup = BeautifulSoup(html_content, 'html.parser')
+    text_content = soup.get_text(separator='\n', strip=True)
+
+    # Use ScrapeGraph AI to analyze the text
+    return scrape_text(client, text_content, prompt)
+
+# Usage
+load_dotenv()
+api_key = os.getenv("SCRAPEGRAPH_API_KEY")
+client = ScrapeGraphClient(api_key)
+result = scrape_local_html(
+    client,
+    'sample.html',
+    "Extract main content and important information"
+)
+print("Extracted Data:", result)
+```
 
 
 ### Structured Data Extraction
```

scrapegraph-py/examples/credits_example.py

Lines changed: 3 additions & 2 deletions

```diff
@@ -5,15 +5,16 @@
 
 import os
 from dotenv import load_dotenv
-from scrapegraph_py import credits
+from scrapegraph_py import ScrapeGraphClient, credits
 
 # Load environment variables from a .env file
 load_dotenv()
 
 def main():
     api_key = os.getenv("SCRAPEGRAPH_API_KEY")
+    client = ScrapeGraphClient(api_key)
 
-    response = credits(api_key)
+    response = credits(client)
     print("Response from the API:")
     print(response)
```

Lines changed: 5 additions & 5 deletions

```diff
@@ -1,18 +1,18 @@
 import os
 from dotenv import load_dotenv
-from scrapegraph_py import status
-from scrapegraph_py import feedback
+from scrapegraph_py import ScrapeGraphClient, feedback, status
 
 # Load environment variables from .env file
 load_dotenv()
 
 def main():
     # Get API key from environment variables
     api_key = os.getenv("SCRAPEGRAPH_API_KEY")
+    client = ScrapeGraphClient(api_key)
 
     # Check API status
     try:
-        result = status(api_key)
+        result = status(client)
         print(f"API Status: {result}")
     except Exception as e:
         print(f"Error occurred: {e}")
@@ -21,8 +21,8 @@ def main():
     request_id = "3fa85f64-5717-4562-b3fc-2c963f66afa6"
     rating = 5
     feedback_message = "This is a test feedback message."
-    feedback_response = feedback(api_key, request_id, rating, feedback_message)  # Call the feedback function
-    print(f"Feedback Response: {feedback_response}")  # Print the response
+    feedback_response = feedback(client, request_id, rating, feedback_message)
+    print(f"Feedback Response: {feedback_response}")
 
 if __name__ == "__main__":
     main()
```

scrapegraph-py/examples/local_scraper_example.py

Lines changed: 21 additions & 32 deletions

```diff
@@ -1,56 +1,45 @@
 from bs4 import BeautifulSoup
 import os
+from scrapegraph_py import ScrapeGraphClient, scrape_text
+from dotenv import load_dotenv
 
-def scrape_local_html(file_path):
+def scrape_local_html(client: ScrapeGraphClient, file_path: str, prompt: str):
     """
-    Scrape content from a local HTML file.
+    Scrape content from a local HTML file using ScrapeGraph AI.
 
     Args:
+        client (ScrapeGraphClient): Initialized ScrapeGraph client
         file_path (str): Path to the local HTML file
+        prompt (str): Natural language prompt describing what to extract
 
     Returns:
-        dict: Extracted data from the HTML file
+        str: Extracted data in JSON format
     """
-    # Check if file exists
     if not os.path.exists(file_path):
         raise FileNotFoundError(f"HTML file not found at: {file_path}")
 
-    # Read the HTML file
     with open(file_path, 'r', encoding='utf-8') as file:
         html_content = file.read()
 
-    # Parse HTML with BeautifulSoup
+    # Use BeautifulSoup to extract text content
     soup = BeautifulSoup(html_content, 'html.parser')
+    text_content = soup.get_text(separator='\n', strip=True)
 
-    # Example extraction - modify based on your HTML structure
-    data = {
-        'title': soup.title.string if soup.title else None,
-        'paragraphs': [p.text for p in soup.find_all('p')],
-        'links': [{'text': a.text, 'href': a.get('href')} for a in soup.find_all('a')],
-        'headers': [h.text for h in soup.find_all(['h1', 'h2', 'h3'])]
-    }
-
-    return data
+    # Use ScrapeGraph AI to analyze the text
+    return scrape_text(client, text_content, prompt)
 
 def main():
-    # Example usage
+    load_dotenv()
+    api_key = os.getenv("SCRAPEGRAPH_API_KEY")
+    client = ScrapeGraphClient(api_key)
+
     try:
-        # Assuming you have a sample.html file in the same directory
-        result = scrape_local_html('sample.html')
-
-        # Print extracted data
-        print("Title:", result['title'])
-        print("\nParagraphs:")
-        for p in result['paragraphs']:
-            print(f"- {p}")
-
-        print("\nLinks:")
-        for link in result['links']:
-            print(f"- {link['text']}: {link['href']}")
-
-        print("\nHeaders:")
-        for header in result['headers']:
-            print(f"- {header}")
+        result = scrape_local_html(
+            client,
+            'sample.html',
+            "Extract main content and important information"
+        )
+        print("Extracted Data:", result)
 
     except FileNotFoundError as e:
         print(f"Error: {e}")
```
Lines changed: 4 additions & 3 deletions

```diff
@@ -1,12 +1,13 @@
 import os
-from scrapegraph_py import scrape
+from scrapegraph_py import ScrapeGraphClient, scrape
 from dotenv import load_dotenv
 
-
 load_dotenv()
 api_key = os.getenv("SCRAPEGRAPH_API_KEY")
+client = ScrapeGraphClient(api_key)
+
 url = "https://scrapegraphai.com/"
 prompt = "What does the company do?"
 
-result = scrape(api_key, url, prompt)
+result = scrape(client, url, prompt)
 print(result)
```
Lines changed: 5 additions & 4 deletions

```diff
@@ -1,6 +1,6 @@
 import os
 from pydantic import BaseModel, Field
-from scrapegraph_py import scrape
+from scrapegraph_py import ScrapeGraphClient, scrape
 from dotenv import load_dotenv
 
 load_dotenv()
@@ -11,12 +11,13 @@ class CompanyInfoSchema(BaseModel):
     description: str = Field(description="A description of the company")
     main_products: list[str] = Field(description="The main products of the company")
 
-# Example usage
+# Initialize client
 api_key = os.getenv("SCRAPEGRAPH_API_KEY")
+client = ScrapeGraphClient(api_key)
+
 url = "https://scrapegraphai.com/"
 prompt = "What does the company do?"
 
 # Call the scrape function with the schema
-result = scrape(api_key=api_key, url=url, prompt=prompt, schema=CompanyInfoSchema)
-
+result = scrape(client=client, url=url, prompt=prompt, schema=CompanyInfoSchema)
 print(result)
```
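Since the README documents the return value as extracted data in JSON format, the result can plausibly be parsed and validated back against the declared schema. A minimal sketch, assuming Pydantic v2 (`model_validate`) and a JSON-object response with the schema's fields; neither assumption is confirmed by this diff:

```python
import json

# Hypothetical follow-up to the example above: parse the JSON string returned
# by scrape() and validate it against CompanyInfoSchema. Assumes Pydantic v2
# and that the response body is a JSON object matching the schema.
info = CompanyInfoSchema.model_validate(json.loads(result))
print(info.description)
print(info.main_products)
```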
Lines changed: 1 addition & 0 deletions

```diff
@@ -1,3 +1,4 @@
+from .client import ScrapeGraphClient
 from .scrape import scrape
 from .credits import credits
 from .feedback import feedback
```
Lines changed: 65 additions & 0 deletions

```diff
@@ -0,0 +1,65 @@
+"""
+Module for ScrapeGraph Client
+
+This module contains the ScrapeGraphClient class, which provides methods to interact
+with the ScrapeGraph AI API. It allows users to initialize the client with an API key,
+retrieve necessary headers for API requests, and construct full endpoint URLs for
+making requests to the ScrapeGraph API. This facilitates seamless integration with
+ScrapeGraph AI services.
+"""
+
+class ScrapeGraphClient:
+    """Client for interacting with the ScrapeGraph AI API.
+
+    This class provides methods to initialize the client with an API key and base URL,
+    retrieve headers for API requests, and construct full endpoint URLs for making
+    requests to the ScrapeGraph API. It is designed to facilitate seamless interaction
+    with the ScrapeGraph AI services.
+
+    Attributes:
+        api_key (str): Your ScrapeGraph AI API key.
+        base_url (str): Base URL for the API, defaulting to "https://api.scrapegraphai.com/v1".
+    """
+
+    def __init__(self, api_key: str, base_url: str = "https://api.scrapegraphai.com/v1"):
+        """Initialize the ScrapeGraph client.
+
+        Args:
+            api_key (str): Your ScrapeGraph AI API key.
+            base_url (str): Base URL for the API (optional, defaults
+                to "https://api.scrapegraphai.com/v1").
+        """
+        self.api_key = api_key
+        self.base_url = base_url.rstrip('/')
+
+    def get_headers(self, include_content_type: bool = True) -> dict:
+        """Get the headers for API requests.
+
+        Args:
+            include_content_type (bool): Whether to include the Content-Type header
+                (default is True).
+
+        Returns:
+            dict: A dictionary containing the headers for the API request, including
+                the API key and optionally the Content-Type.
+        """
+        headers = {
+            "accept": "application/json",
+            "SGAI-APIKEY": self.api_key
+        }
+
+        if include_content_type:
+            headers["Content-Type"] = "application/json"
+
+        return headers
+
+    def get_endpoint(self, path: str) -> str:
+        """Get the full endpoint URL.
+
+        Args:
+            path (str): The API endpoint path to be appended to the base URL.
+
+        Returns:
+            str: The full endpoint URL constructed from the base URL and the provided path.
+        """
+        return f"{self.base_url}/{path}"
```
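To see how these helpers fit together: `get_endpoint` joins the base URL with an API path, and `get_headers` supplies the `SGAI-APIKEY` authentication header. A minimal sketch of a raw call built from them, mirroring what the refactored `credits.py` below does internally:

```python
import os

import requests
from scrapegraph_py import ScrapeGraphClient

client = ScrapeGraphClient(api_key=os.getenv("SCRAPEGRAPH_API_KEY"))

# "credits" is the endpoint path used by credits.py below,
# yielding "https://api.scrapegraphai.com/v1/credits"
endpoint = client.get_endpoint("credits")

# Content-Type is skipped because this GET request sends no JSON body
headers = client.get_headers(include_content_type=False)

response = requests.get(endpoint, headers=headers)
print(response.status_code, response.text)
```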
Lines changed: 11 additions & 15 deletions

```diff
@@ -1,33 +1,29 @@
 """
 This module provides functionality to interact with the ScrapeGraph AI API.
 
-It includes functions to retrieve credits and send feedback, handling responses and errors appropriately.
+It includes functions to retrieve credits and send feedback,
+handling responses and errors appropriately.
 """
 
 import requests
-import json
+from .client import ScrapeGraphClient
+from .exceptions import raise_for_status_code, APIError
 
-def credits(api_key: str) -> str:
+def credits(client: ScrapeGraphClient) -> str:
     """Retrieve credits from the API.
 
     Args:
-        api_key (str): Your ScrapeGraph AI API key.
+        client (ScrapeGraphClient): Initialized ScrapeGraph client
 
     Returns:
         str: Response from the API in JSON format.
     """
-    endpoint = "https://sgai-api.onrender.com/api/v1/credits"
-    headers = {
-        "accept": "application/json",
-        "SGAI-API-KEY": api_key
-    }
+    endpoint = client.get_endpoint("credits")
+    headers = client.get_headers(include_content_type=False)
 
     try:
         response = requests.get(endpoint, headers=headers)
-        response.raise_for_status()
-    except requests.exceptions.HTTPError as http_err:
-        return json.dumps({"error": "HTTP error occurred", "message": str(http_err), "status_code": response.status_code})
+        raise_for_status_code(response.status_code, response)
+        return response.text
     except requests.exceptions.RequestException as e:
-        return json.dumps({"error": "An error occurred", "message": str(e)})
-
-    return response.text
+        raise APIError(f"Request failed: {str(e)}", response=None)
```
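The `.exceptions` module imported here is among the 16 changed files but does not appear in this excerpt. Purely as a hypothetical sketch, inferred from the two call sites in `credits` (names and signatures are assumptions, not the committed code):

```python
# Hypothetical sketch of scrapegraph_py/exceptions.py -- the real file is not
# shown in this diff; everything here is inferred from how credits() calls it.
from typing import Optional

import requests


class APIError(Exception):
    """Raised when a request to the ScrapeGraph AI API fails."""

    def __init__(self, message: str, response: Optional[requests.Response] = None):
        super().__init__(message)
        self.response = response


def raise_for_status_code(status_code: int, response: requests.Response) -> None:
    """Raise APIError for non-2xx responses, keeping the response attached."""
    if not 200 <= status_code < 300:
        raise APIError(f"API returned status code {status_code}", response=response)
```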
