from bs4 import BeautifulSoup
import os
+from scrapegraph_py import ScrapeGraphClient, scrape_text
+from dotenv import load_dotenv

-def scrape_local_html(file_path):
+def scrape_local_html(client: ScrapeGraphClient, file_path: str, prompt: str):
    """
-    Scrape content from a local HTML file.
+    Scrape content from a local HTML file using ScrapeGraph AI.

    Args:
+        client (ScrapeGraphClient): Initialized ScrapeGraph client
        file_path (str): Path to the local HTML file
+        prompt (str): Natural language prompt describing what to extract

    Returns:
-        dict: Extracted data from the HTML file
+        str: Extracted data in JSON format
    """
-    # Check if file exists
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"HTML file not found at: {file_path}")

-    # Read the HTML file
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

-    # Parse HTML with BeautifulSoup
+    # Use BeautifulSoup to extract text content
    soup = BeautifulSoup(html_content, 'html.parser')
+    text_content = soup.get_text(separator='\n', strip=True)

-    # Example extraction - modify based on your HTML structure
-    data = {
-        'title': soup.title.string if soup.title else None,
-        'paragraphs': [p.text for p in soup.find_all('p')],
-        'links': [{'text': a.text, 'href': a.get('href')} for a in soup.find_all('a')],
-        'headers': [h.text for h in soup.find_all(['h1', 'h2', 'h3'])]
-    }
-
-    return data
+    # Use ScrapeGraph AI to analyze the text
+    return scrape_text(client, text_content, prompt)

def main():
-    # Example usage
+    load_dotenv()
+    api_key = os.getenv("SCRAPEGRAPH_API_KEY")
+    client = ScrapeGraphClient(api_key)
+
    try:
-        # Assuming you have a sample.html file in the same directory
-        result = scrape_local_html('sample.html')
-
-        # Print extracted data
-        print("Title:", result['title'])
-        print("\nParagraphs:")
-        for p in result['paragraphs']:
-            print(f"- {p}")
-
-        print("\nLinks:")
-        for link in result['links']:
-            print(f"- {link['text']}: {link['href']}")
-
-        print("\nHeaders:")
-        for header in result['headers']:
-            print(f"- {header}")
+        result = scrape_local_html(
+            client,
+            'sample.html',
+            "Extract main content and important information"
+        )
+        print("Extracted Data:", result)

    except FileNotFoundError as e:
        print(f"Error: {e}")
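A note on running the new version: load_dotenv() reads SCRAPEGRAPH_API_KEY from a local .env file, and ScrapeGraphClient(api_key) receives None if the variable is missing. Below is a minimal sketch of a fail-fast guard, assuming scrapegraph_py exposes ScrapeGraphClient exactly as imported in the diff; the guard itself is illustrative and not part of this commit:

    import os
    from dotenv import load_dotenv
    from scrapegraph_py import ScrapeGraphClient

    load_dotenv()  # loads variables from a .env file in the working directory
    api_key = os.getenv("SCRAPEGRAPH_API_KEY")
    if not api_key:
        # Illustrative check (not in the commit): fail with a clear message
        # instead of passing None into the client.
        raise RuntimeError("SCRAPEGRAPH_API_KEY is not set; add it to your .env file")
    client = ScrapeGraphClient(api_key)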