Skip to content

Commit 788107b

Browse files
committed
feat: add steps integration
1 parent eb9bc78 commit 788107b

File tree

11 files changed

+2204
-6
lines changed

11 files changed

+2204
-6
lines changed

scrapegraph-py/examples/async/async_markdownify_steps_example.py

Lines changed: 409 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 320 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,320 @@
1+
"""
2+
Async example demonstrating how to use the Smart Scraper API with interactive steps.
3+
This example shows how to:
4+
1. Set up interactive steps for website navigation asynchronously
5+
2. Use the AsyncClient with custom steps
6+
3. Handle concurrent requests with different step configurations
7+
4. Display comprehensive results with timing
8+
9+
Interactive steps allow you to:
10+
- Click on elements
11+
- Fill input fields
12+
- Wait for page loads
13+
- Navigate through multiple pages
14+
- Perform complex user interactions
15+
16+
Requirements:
17+
- Python 3.7+
18+
- scrapegraph-py
19+
- A .env file with your SGAI_API_KEY
20+
21+
Example .env file:
22+
SGAI_API_KEY=your_api_key_here
23+
"""
24+
25+
import asyncio
26+
import json
27+
import os
28+
import time
29+
from dotenv import load_dotenv
30+
from scrapegraph_py import AsyncClient
31+
32+
# Load environment variables from .env file
33+
load_dotenv()
34+
35+
36+
async def async_smartscraper_with_steps():
    """
    Run a single Smart Scraper request that is driven by interactive steps.

    The coroutine checks that SGAI_API_KEY is present, echoes the request
    configuration and the step list, awaits one ``client.smartscraper`` call
    and then prints timing figures followed by the extracted data. Any
    exception raised inside the client context is caught and reported
    together with the time spent before the failure.

    Raises:
        ValueError: when SGAI_API_KEY is missing from the environment.
    """

    def _clock(timestamp):
        # Wall-clock HH:MM:SS for an epoch timestamp, used by the timer lines.
        return time.strftime('%H:%M:%S', time.localtime(timestamp))

    # The key is only validated here so the user gets a helpful message; the
    # client below is built with from_env() rather than from this value.
    if not os.getenv("SGAI_API_KEY"):
        raise ValueError(
            "API key must be provided or set in .env file as SGAI_API_KEY. "
            "Create a .env file with: SGAI_API_KEY=your_api_key_here"
        )

    banner = "\n" + "=" * 60

    # Request target and extraction prompt.
    website_url = "https://github.com/"
    user_prompt = "Extract user profile information"

    # Natural-language actions passed to the request as ``steps``.
    # NOTE(review): "[email protected]" looks like an obfuscated placeholder
    # address -- confirm the intended value.
    steps = [
        "click on search bar",
        "wait for 500ms",
        "fill email input box with [email protected]",
        "wait a sec",
        "click on the first result of search",
        "wait for 2 seconds to load the result of search",
    ]

    print("🚀 Starting Async Smart Scraper with Interactive Steps...")
    print(f"🌐 Website URL: {website_url}")
    print(f"🎯 User Prompt: {user_prompt}")
    print(f"📋 Interactive Steps: {len(steps)} steps configured")
    print(banner)

    # Numbered listing of the steps that will be sent.
    print("🎯 Interactive Steps to Execute:")
    for number, action in enumerate(steps, start=1):
        print(f" {number}. {action}")
    print(banner)

    started_at = time.time()
    print(f"⏱️ Timer started at: {_clock(started_at)}")
    print("🔄 Processing async request with interactive steps...")

    try:
        async with AsyncClient.from_env() as client:
            response = await client.smartscraper(
                user_prompt=user_prompt,
                website_url=website_url,
                steps=steps,
            )

            # Timing is taken as soon as the response is available.
            finished_at = time.time()
            elapsed = finished_at - started_at
            elapsed_min = elapsed / 60

            print(f"⏱️ Timer stopped at: {_clock(finished_at)}")
            print(f"⚡ Total execution time: {elapsed:.2f} seconds ({elapsed_min:.2f} minutes)")
            print(f"📊 Performance: {elapsed:.1f}s ({elapsed_min:.1f}m) for {len(steps)} interactive steps")

            print("✅ Request completed successfully!")
            print(f"📊 Request ID: {response.get('request_id', 'N/A')}")
            print(f"🔄 Status: {response.get('status', 'N/A')}")

            # An error reported in the response body ends the report here.
            if response.get("error"):
                print(f"❌ Error: {response['error']}")
                return

            print("\n📋 EXTRACTED DATA:")
            print("=" * 60)

            if "result" not in response:
                print("No result data found")
                return

            payload = response["result"]
            print(json.dumps(payload, indent=2, ensure_ascii=False))

            # Size and throughput figures derived from the serialized result.
            print("\n📊 EXTRACTION STATISTICS:")
            print("-" * 50)
            serialized = json.dumps(payload)
            key_count = len(payload) if isinstance(payload, dict) else 'N/A'
            print(f"📝 Data size: {len(serialized)} characters")
            print(f"🔗 JSON keys: {key_count}")
            print(f"⚡ Processing speed: {len(serialized)/elapsed:.0f} chars/second")
            print(f"🎯 Steps efficiency: {elapsed/len(steps):.2f}s per step")

    except Exception as exc:
        failed_at = time.time()
        elapsed = failed_at - started_at
        elapsed_min = elapsed / 60

        print(f"⏱️ Timer stopped at: {_clock(failed_at)}")
        print(f"⚡ Execution time before error: {elapsed:.2f} seconds ({elapsed_min:.2f} minutes)")
        print(f"💥 Error occurred: {str(exc)}")
135+
136+
137+
async def async_smartscraper_concurrent_steps():
    """
    Demonstrate concurrent Smart Scraper requests with different step configurations.

    Three request configurations (name, URL, prompt, steps) are submitted
    through one ``AsyncClient`` and awaited together with ``asyncio.gather``.
    The total and average elapsed time are printed, followed by a short
    per-request summary. A request that failed or was cancelled is reported
    on its own entry without aborting the rest of the report; any other
    exception raised inside the client context is caught and printed.
    """
    print("\n🎯 CONCURRENT REQUESTS WITH DIFFERENT STEPS")
    print("=" * 60)

    # One entry per request that will be issued concurrently.
    configs = [
        {
            "name": "GitHub Search",
            "url": "https://github.com/",
            "prompt": "Extract repository information",
            "steps": [
                "click on search bar",
                "wait for 300ms",
                "fill search with 'python'",
                "wait for 1 second",
                "click first result",
            ],
        },
        {
            "name": "Profile Navigation",
            "url": "https://github.com/",
            "prompt": "Extract user profile details",
            "steps": [
                "click on profile menu",
                "wait for 1 second",
                "click on settings",
                "wait for 2 seconds",
                "scroll to profile section",
            ],
        },
        {
            "name": "Repository Details",
            "url": "https://github.com/",
            "prompt": "Extract repository details",
            "steps": [
                "click on repositories tab",
                "wait for 500ms",
                "click on first repository",
                "wait for 1 second",
                "scroll to readme section",
            ],
        },
    ]

    print(f"🔄 Executing {len(configs)} concurrent requests...")
    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # a system clock adjustment while the requests are in flight.
    start_time = time.perf_counter()

    try:
        async with AsyncClient.from_env() as client:
            # Build the awaitables first; nothing runs until gather() below.
            tasks = [
                client.smartscraper(
                    user_prompt=config["prompt"],
                    website_url=config["url"],
                    steps=config["steps"],
                )
                for config in configs
            ]

            # return_exceptions=True delivers failures as result values, in
            # the same order as the awaitables, so zip() below stays aligned.
            results = await asyncio.gather(*tasks, return_exceptions=True)

            execution_time = time.perf_counter() - start_time

            print(f"⚡ Total concurrent execution time: {execution_time:.2f} seconds")
            print(f"📊 Average per request: {execution_time/len(configs):.2f} seconds")

            print("\n📋 CONCURRENT RESULTS:")
            print("=" * 60)

            for i, (config, result) in enumerate(zip(configs, results), 1):
                print(f"\n{i}. {config['name']}:")
                print(f" 🎯 Prompt: {config['prompt']}")
                print(f" 📝 Steps: {len(config['steps'])}")

                # BaseException rather than Exception: a cancelled request is
                # returned as asyncio.CancelledError, which does not derive
                # from Exception on Python 3.8+ and has no .get().
                if isinstance(result, BaseException):
                    print(f" ❌ Error: {str(result)}")
                else:
                    print(f" ✅ Status: {result.get('status', 'N/A')}")
                    print(f" 📊 Request ID: {result.get('request_id', 'N/A')}")
                    if "result" in result:
                        data_size = len(json.dumps(result["result"]))
                        print(f" 📝 Data size: {data_size} characters")
                print("-" * 40)

    except Exception as e:
        print(f"💥 Error in concurrent execution: {str(e)}")
230+
231+
232+
# Keyword -> display label, checked in this order; the first keyword found in
# a step decides its label (so "click" wins over "wait", and so on).
_STEP_TYPE_KEYWORDS = (
    ("click", "Navigation"),
    ("wait", "Wait"),
    ("fill", "Input"),
    ("scroll", "Action"),
)


def _classify_step(step):
    """Return the display label for a natural-language step, or "Other"."""
    # Match case-insensitively so "Click ..." is labelled like "click ...".
    lowered = step.lower()
    for keyword, label in _STEP_TYPE_KEYWORDS:
        if keyword in lowered:
            return label
    return "Other"


async def async_smartscraper_step_patterns():
    """
    Demonstrate different step patterns for various use cases.

    Prints three illustrative step lists (authentication, form submission,
    dynamic content loading), labelling each step with a coarse type derived
    from the keywords it contains. Nothing is sent to the API and nothing is
    awaited; the function is a coroutine only so it can be awaited alongside
    the other examples.
    """
    print("\n🎯 DIFFERENT STEP PATTERNS DEMONSTRATION")
    print("=" * 60)

    # NOTE(review): "[email protected]" below looks like an obfuscated
    # placeholder address -- confirm the intended value.
    patterns = [
        {
            "name": "Authentication Flow",
            "description": "Steps for logging into a website",
            "steps": [
                "click on login button",
                "wait for 1 second",
                "fill username field with [email protected]",
                "wait for 200ms",
                "fill password field with password123",
                "wait for 300ms",
                "click submit button",
                "wait for 3 seconds",
            ],
        },
        {
            "name": "Form Submission",
            "description": "Steps for filling and submitting a form",
            "steps": [
                "scroll to contact form",
                "wait for 500ms",
                "fill name field with John Doe",
                "wait for 200ms",
                "fill email field with [email protected]",
                "wait for 200ms",
                "fill message field with Hello World",
                "wait for 300ms",
                "click submit button",
            ],
        },
        {
            "name": "Dynamic Content Loading",
            "description": "Steps for loading more content dynamically",
            "steps": [
                "scroll to bottom of page",
                "wait for 1 second",
                "click load more button",
                "wait for 2 seconds",
                "scroll down again",
                "wait for 1 second",
                "click show details button",
            ],
        },
    ]

    for i, pattern in enumerate(patterns, 1):
        print(f"\n📋 Pattern {i}: {pattern['name']}")
        print(f"📝 Description: {pattern['description']}")
        print(f"🎯 Steps ({len(pattern['steps'])}):")
        for j, step in enumerate(pattern['steps'], 1):
            print(f" {j}. {step} [{_classify_step(step)}]")
        print("-" * 40)
292+
293+
294+
async def main():
    """
    Entry coroutine: print the intro banner, then run every example in turn.

    The single-request, concurrent and step-pattern examples are awaited
    sequentially. Any exception that escapes them is printed together with a
    short troubleshooting checklist instead of propagating to the caller.
    """
    intro_lines = (
        "🎯 ASYNC SMART SCRAPER INTERACTIVE STEPS EXAMPLE",
        "=" * 60,
        "This example demonstrates how to use interactive steps with Async Smart Scraper.",
        "Interactive steps allow you to navigate websites like a human user asynchronously.",
        "This enables faster processing through concurrent requests.",
        "",
    )
    troubleshooting_lines = (
        "\n🛠️ Troubleshooting:",
        "1. Make sure your .env file contains SGAI_API_KEY",
        "2. Check your internet connection",
        "3. Verify the target website is accessible",
        "4. Ensure you have sufficient credits in your account",
    )

    try:
        for line in intro_lines:
            print(line)

        # Run the demos one after another, in this fixed order.
        await async_smartscraper_with_steps()
        await async_smartscraper_concurrent_steps()
        await async_smartscraper_step_patterns()

    except Exception as exc:
        print(f"💥 Error occurred: {exc!s}")
        for line in troubleshooting_lines:
            print(line)
317+
318+
319+
if __name__ == "__main__":
    # Only when executed as a script: run the whole demo on a fresh event loop.
    asyncio.run(main())

0 commit comments

Comments
 (0)