Skip to content

Commit 458bb28

Browse files
committed
feat: add infinite scroll
1 parent 6c2806d commit 458bb28

File tree

3 files changed

+131
-1
lines changed

3 files changed

+131
-1
lines changed
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
"""
2+
Example of using SmartScraper with infinite scrolling in asynchronous mode.
3+
This example demonstrates how to scrape content from multiple webpages concurrently using infinite scrolling.
4+
"""
5+
6+
import asyncio
7+
from scrapegraph_py import AsyncClient
8+
from scrapegraph_py.logger import sgai_logger
9+
10+
# Set up logging
11+
sgai_logger.set_logging(level="INFO")
12+
13+
async def scrape_with_infinite_scroll(client: AsyncClient, url: str, prompt: str, max_pages: int = 10):
    """Run a single SmartScraper request with infinite scrolling enabled.

    Args:
        client: An open AsyncClient used to issue the request.
        url: Address of the page to scrape.
        prompt: Natural-language description of the data to extract.
        max_pages: Upper bound on the number of pages to scroll (default 10).

    Returns:
        Whatever ``client.smartscraper`` returns for this request
        (callers read ``request_id`` and ``result`` from it).
    """
    # No intermediate variable needed — hand the awaited response straight back.
    return await client.smartscraper(
        website_url=url,
        user_prompt=prompt,
        infinite_scrolling=True,
        max_pages=max_pages,
    )
22+
23+
async def main():
    """Run three infinite-scroll scrapes concurrently, then one static scrape.

    Demonstrates fan-out with ``asyncio.gather`` over the shared async client,
    followed by a plain (non-scrolling) request for comparison.
    """
    # Initialize the async client with your API key; the async context
    # manager guarantees the client is closed when the block exits.
    async with AsyncClient(api_key="your-api-key-here") as sgai_client:
        # Examples 1-3: scrape multiple pages concurrently.
        tasks = [
            scrape_with_infinite_scroll(
                sgai_client,
                "https://example.com/products",
                "Extract all product names and prices",
                max_pages=20,
            ),
            scrape_with_infinite_scroll(
                sgai_client,
                "https://example.com/articles",
                "Extract all article titles and authors",
                max_pages=15,
            ),
            scrape_with_infinite_scroll(
                sgai_client,
                "https://example.com/news",
                "Extract all news headlines and dates",
                max_pages=10,
            ),
        ]

        # Wait for all scraping tasks to complete; gather preserves order,
        # so results[0] corresponds to tasks[0], and so on.
        results = await asyncio.gather(*tasks)

        # Process and print results (labelled Example 1..3).
        for i, result in enumerate(results, 1):
            print(f"\nExample {i} Results:")
            print(f"Request ID: {result['request_id']}")
            print(f"Result: {result['result']}")

        # Example 4: single page without infinite scrolling, for comparison.
        # (Comment relabelled from "Example 2" to match the printed label and
        # the 1..3 sequence printed above.)
        response = await sgai_client.smartscraper(
            website_url="https://example.com/static-page",
            user_prompt="Extract the main heading and first paragraph",
            infinite_scrolling=False,
        )
        print("\nExample 4 - Without infinite scrolling:")
        print(f"Request ID: {response['request_id']}")
        print(f"Result: {response['result']}")


if __name__ == "__main__":
    asyncio.run(main())
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
"""
2+
Example of using SmartScraper with infinite scrolling in synchronous mode.
3+
This example demonstrates how to scrape content from a webpage that requires scrolling to load more content.
4+
"""
5+
6+
from scrapegraph_py import Client
7+
from scrapegraph_py.logger import sgai_logger
8+
import time
9+
10+
# Set up logging
11+
sgai_logger.set_logging(level="INFO")
12+
13+
def main():
    """Demonstrate SmartScraper infinite scrolling against three sample pages."""

    def _show(header: str, payload) -> None:
        # Shared output formatting for every example below.
        print(header)
        print(f"Request ID: {payload['request_id']}")
        print(f"Result: {payload['result']}")

    # Initialize the client with your API key
    sgai_client = Client(api_key="your-api-key-here")

    try:
        # Example 1: infinite scrolling with the default page limit
        _show(
            "\nExample 1 - Basic infinite scrolling:",
            sgai_client.smartscraper(
                website_url="https://example.com/infinite-scroll",
                user_prompt="Extract all product names and prices from the page",
                infinite_scrolling=True,  # Uses default max_pages=10
            ),
        )

        # Example 2: infinite scrolling with an explicit page cap
        _show(
            "\nExample 2 - Custom max pages:",
            sgai_client.smartscraper(
                website_url="https://example.com/long-list",
                user_prompt="Extract all article titles and their publication dates",
                infinite_scrolling=True,
                max_pages=50,  # Custom maximum number of pages to scroll
            ),
        )

        # Example 3: no scrolling at all, for comparison
        _show(
            "\nExample 3 - Without infinite scrolling:",
            sgai_client.smartscraper(
                website_url="https://example.com/static-page",
                user_prompt="Extract the main heading and first paragraph",
                infinite_scrolling=False,
            ),
        )
    finally:
        # Always close the client, even if one of the requests raised.
        sgai_client.close()


if __name__ == "__main__":
    main()

scrapegraph-py/scrapegraph_py/models/smartscraper.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from uuid import UUID
55

66
from bs4 import BeautifulSoup
7-
from pydantic import BaseModel, Field, model_validator
7+
from pydantic import BaseModel, Field, model_validator, conint
88

99

1010
class SmartScraperRequest(BaseModel):
@@ -28,6 +28,14 @@ class SmartScraperRequest(BaseModel):
2828
},
2929
description="Optional headers to send with the request, including cookies and user agent",
3030
)
31+
infinite_scrolling: bool = Field(
32+
default=False,
33+
description="Enable infinite scrolling to load more content dynamically",
34+
)
35+
max_pages: conint(ge=1, le=1000) = Field(
36+
default=10,
37+
description="Maximum number of pages to scroll when infinite_scrolling is enabled",
38+
)
3139
output_schema: Optional[Type[BaseModel]] = None
3240

3341
@model_validator(mode="after")

0 commit comments

Comments
 (0)