
Commit c611070

עידן וילנסקי authored and committed
v1.1.0: Add web crawling, content parsing, and browser automation support
- Add crawl() function for website discovery and multi-page scraping
- Add parse_content() function for extracting text, links, and structured data
- Add connect_browser() function for Playwright/Selenium integration
- Improve download_snapshot() with better 202 status handling
- Fix zone creation error handling with proper retry logic
- Update CI to support Python 3.8+ and remove 3.7 compatibility
- Add BeautifulSoup4 dependency for content parsing
1 parent cb4e7fd commit c611070

19 files changed: +845 −80 lines

.github/workflows/test.yml

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ['3.7', '3.8', '3.9', '3.10', '3.11', '3.12']
+        python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
 
     steps:
       - uses: actions/checkout@v4

brightdata/__init__.py

Lines changed: 18 additions & 2 deletions
@@ -29,10 +29,22 @@
 - Saves the scraped content to local files in various formats (JSON, CSV, etc.)
 - syntax: `client.download_content(results)`
 - syntax: `client.download_snapshot(results)`
+#### connect_browser()
+- Get WebSocket endpoint for connecting to Bright Data's scraping browser with Playwright/Selenium
+- syntax: `endpoint_url = client.connect_browser()` then use with browser automation tools
+#### crawl()
+- Crawl websites to discover and scrape multiple pages using Bright Data's Web Crawl API
+- syntax: `result = client.crawl(url, filter, exclude_filter, depth, ...)`
+#### parse_content()
+- Parse and extract useful information from API responses (JSON or HTML)
+- syntax: `parsed = client.parse_content(data, extract_text=True, extract_links=True)`
 
 ### Features:
 - Web Scraping: Scrape websites using Bright Data Web Unlocker API with proxy support
 - Search Engine Results: Perform web searches using Bright Data SERP API
+- Web Crawling: Discover and scrape multiple pages from websites with advanced filtering
+- Content Parsing: Extract text, links, images, and structured data from API responses
+- Browser Automation: Simple authentication for Bright Data's scraping browser with Playwright/Selenium
 - Multiple Search Engines: Support for Google, Bing, and Yandex
 - Parallel Processing: Concurrent processing for multiple URLs or queries
 - Robust Error Handling: Comprehensive error handling with retry logic
@@ -50,8 +62,9 @@
     NetworkError,
     APIError
 )
+from .utils import parse_content, parse_multiple, extract_structured_data
 
-__version__ = "1.0.8"
+__version__ = "1.1.0"
 __author__ = "Bright Data"
 __email__ = "[email protected]"
 
@@ -62,5 +75,8 @@
     'AuthenticationError',
     'ZoneError',
     'NetworkError',
-    'APIError'
+    'APIError',
+    'parse_content',
+    'parse_multiple',
+    'extract_structured_data'
 ]
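Taken together, the documentation added above maps to the minimal usage sketch below. Only the method names and keyword arguments come from the syntax lines in this diff; the client constructor name (`bdclient`) and its `api_token` keyword are assumptions for illustration.

```python
# Minimal sketch of the v1.1.0 additions documented above.
# Assumption: the SDK client is constructed as bdclient(api_token=...);
# substitute the package's real client class if it differs.
from brightdata import bdclient

client = bdclient(api_token="YOUR_API_TOKEN")

# Start a crawl; the response carries a snapshot_id for later download
result = client.crawl("https://example.com/", filter="/product/", depth=2)

# Parse an API response (JSON or HTML) into text and links
parsed = client.parse_content(result, extract_text=True, extract_links=True)

# Get a WebSocket endpoint for Playwright/Selenium automation
endpoint_url = client.connect_browser()
```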

brightdata/api/__init__.py

Lines changed: 3 additions & 1 deletion
@@ -2,10 +2,12 @@
 from .search import SearchAPI
 from .chatgpt import ChatGPTAPI
 from .linkedin import LinkedInAPI
+from .crawl import CrawlAPI
 
 __all__ = [
     'WebScraper',
     'SearchAPI',
     'ChatGPTAPI',
-    'LinkedInAPI'
+    'LinkedInAPI',
+    'CrawlAPI'
 ]

brightdata/api/crawl.py

Lines changed: 175 additions & 0 deletions
@@ -0,0 +1,175 @@
+import json
+from typing import Union, Dict, Any, List, Optional
+from ..utils import get_logger, validate_url
+from ..exceptions import ValidationError, APIError, AuthenticationError
+
+logger = get_logger('api.crawl')
+
+
+class CrawlAPI:
+    """Handles crawl operations using Bright Data's Web Crawl API"""
+
+    CRAWL_DATASET_ID = "gd_m6gjtfmeh43we6cqc"
+
+    AVAILABLE_OUTPUT_FIELDS = [
+        "markdown", "url", "html2text", "page_html", "ld_json",
+        "page_title", "timestamp", "input", "discovery_input",
+        "error", "error_code", "warning", "warning_code"
+    ]
+
+    def __init__(self, session, api_token, default_timeout=30, max_retries=3, retry_backoff=1.5):
+        self.session = session
+        self.api_token = api_token
+        self.default_timeout = default_timeout
+        self.max_retries = max_retries
+        self.retry_backoff = retry_backoff
+
+    def crawl(
+        self,
+        url: Union[str, List[str]],
+        ignore_sitemap: Optional[bool] = None,
+        depth: Optional[int] = None,
+        filter: Optional[str] = None,
+        exclude_filter: Optional[str] = None,
+        custom_output_fields: Optional[List[str]] = None,
+        include_errors: bool = True
+    ) -> Dict[str, Any]:
+        """
+        ## Crawl websites using Bright Data's Web Crawl API
+
+        Performs web crawling to discover and scrape multiple pages from a website
+        starting from the specified URL(s).
+
+        ### Parameters:
+        - `url` (str | List[str]): Domain URL(s) to crawl (required)
+        - `ignore_sitemap` (bool, optional): Ignore sitemap when crawling
+        - `depth` (int, optional): Maximum depth to crawl relative to the entered URL
+        - `filter` (str, optional): Regular expression to include only certain URLs (e.g. "/product/")
+        - `exclude_filter` (str, optional): Regular expression to exclude certain URLs (e.g. "/ads/")
+        - `custom_output_fields` (List[str], optional): Custom output schema fields to include
+        - `include_errors` (bool, optional): Include errors in response (default: True)
+
+        ### Returns:
+        - `Dict[str, Any]`: Crawl response with snapshot_id for tracking
+
+        ### Example Usage:
+        ```python
+        # Single URL crawl
+        result = client.crawl("https://example.com/")
+
+        # Multiple URLs with filters
+        urls = ["https://example.com/", "https://example2.com/"]
+        result = client.crawl(
+            url=urls,
+            filter="/product/",
+            exclude_filter="/ads/",
+            depth=2,
+            ignore_sitemap=True
+        )
+
+        # Custom output schema
+        result = client.crawl(
+            url="https://example.com/",
+            custom_output_fields=["markdown", "url", "page_title"]
+        )
+        ```
+
+        ### Raises:
+        - `ValidationError`: Invalid URL or parameters
+        - `AuthenticationError`: Invalid API token or insufficient permissions
+        - `APIError`: Request failed or server error
+        """
+        if isinstance(url, str):
+            urls = [url]
+        elif isinstance(url, list):
+            urls = url
+        else:
+            raise ValidationError("URL must be a string or list of strings")
+
+        if not urls:
+            raise ValidationError("At least one URL is required")
+
+        for u in urls:
+            if not isinstance(u, str) or not u.strip():
+                raise ValidationError("All URLs must be non-empty strings")
+            validate_url(u)
+
+        if custom_output_fields is not None:
+            if not isinstance(custom_output_fields, list):
+                raise ValidationError("custom_output_fields must be a list")
+
+            invalid_fields = [field for field in custom_output_fields if field not in self.AVAILABLE_OUTPUT_FIELDS]
+            if invalid_fields:
+                raise ValidationError(f"Invalid output fields: {invalid_fields}. Available fields: {self.AVAILABLE_OUTPUT_FIELDS}")
+
+        crawl_inputs = []
+        for u in urls:
+            crawl_input = {"url": u}
+
+            if ignore_sitemap is not None:
+                crawl_input["ignore_sitemap"] = ignore_sitemap
+            if depth is not None:
+                crawl_input["depth"] = depth
+            if filter is not None:
+                crawl_input["filter"] = filter
+            if exclude_filter is not None:
+                crawl_input["exclude_filter"] = exclude_filter
+
+            crawl_inputs.append(crawl_input)
+
+        api_url = "https://api.brightdata.com/datasets/v3/trigger"
+
+        params = {
+            "dataset_id": self.CRAWL_DATASET_ID,
+            "include_errors": str(include_errors).lower(),
+            "type": "discover_new",
+            "discover_by": "domain_url"
+        }
+
+        if custom_output_fields:
+            payload = {
+                "input": crawl_inputs,
+                "custom_output_fields": custom_output_fields
+            }
+        else:
+            payload = crawl_inputs
+
+        logger.info(f"Starting crawl for {len(urls)} URL(s)")
+        logger.debug(f"Crawl parameters: depth={depth}, filter={filter}, exclude_filter={exclude_filter}")
+
+        try:
+            response = self.session.post(
+                api_url,
+                params=params,
+                json=payload,
+                timeout=self.default_timeout
+            )
+
+            if response.status_code == 200:
+                result = response.json()
+                snapshot_id = result.get('snapshot_id')
+                logger.info(f"Crawl initiated successfully. Snapshot ID: {snapshot_id}")
+                return result
+
+            elif response.status_code == 401:
+                logger.error("Unauthorized (401): Check API token")
+                raise AuthenticationError(f"Unauthorized (401): Check your API token. {response.text}")
+            elif response.status_code == 403:
+                logger.error("Forbidden (403): Insufficient permissions")
+                raise AuthenticationError(f"Forbidden (403): Insufficient permissions. {response.text}")
+            elif response.status_code == 400:
+                logger.error(f"Bad request (400): {response.text}")
+                raise APIError(f"Bad request (400): {response.text}")
+            else:
+                logger.error(f"Crawl request failed ({response.status_code}): {response.text}")
+                raise APIError(
+                    f"Crawl request failed ({response.status_code}): {response.text}",
+                    status_code=response.status_code,
+                    response_text=response.text
+                )
+
+        except Exception as e:
+            if isinstance(e, (ValidationError, AuthenticationError, APIError)):
+                raise
+            logger.error(f"Unexpected error during crawl: {e}")
+            raise APIError(f"Unexpected error during crawl: {str(e)}")

brightdata/api/download.py

Lines changed: 13 additions & 2 deletions
@@ -152,11 +152,22 @@ def download_snapshot(
             timeout=self.default_timeout
         )
 
-        if response.status_code == 401:
+        if response.status_code == 200:
+            pass
+        elif response.status_code == 202:
+            try:
+                response_data = response.json()
+                message = response_data.get('message', 'Snapshot is not ready yet')
+                print("Snapshot is not ready yet, try again soon")
+                return {"status": "not_ready", "message": message, "snapshot_id": snapshot_id}
+            except json.JSONDecodeError:
+                print("Snapshot is not ready yet, try again soon")
+                return {"status": "not_ready", "message": "Snapshot is not ready yet, check again soon", "snapshot_id": snapshot_id}
+        elif response.status_code == 401:
             raise AuthenticationError("Invalid API token or insufficient permissions")
         elif response.status_code == 404:
             raise APIError(f"Snapshot '{snapshot_id}' not found")
-        elif response.status_code != 200:
+        else:
             raise APIError(f"Download request failed with status {response.status_code}: {response.text}")
 
         if format == "csv":
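Because the 202 branch now returns a `{"status": "not_ready", ...}` dict instead of raising, callers can poll until the snapshot is ready. A minimal polling loop might look like the sketch below; the `client` object and the exact `download_snapshot()` signature beyond `snapshot_id` are assumptions.

```python
# Hedged sketch: poll download_snapshot() until the 202 "not_ready" marker goes away.
# Assumption: `client` is an already-constructed SDK client exposing download_snapshot(snapshot_id).
import time


def wait_for_snapshot(client, snapshot_id, interval=10, max_attempts=30):
    """Return the snapshot data once ready, retrying while the status is not_ready."""
    for _ in range(max_attempts):
        result = client.download_snapshot(snapshot_id)
        # The 202 branch added in this commit returns {"status": "not_ready", ...}
        if isinstance(result, dict) and result.get("status") == "not_ready":
            time.sleep(interval)  # snapshot still being generated; wait and retry
            continue
        return result
    raise TimeoutError(f"Snapshot {snapshot_id} not ready after {max_attempts} attempts")
```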
