from typing import Union, Dict, Any, List, Optional
from ..utils import get_logger, validate_url
from ..exceptions import ValidationError, APIError, AuthenticationError

logger = get_logger('api.crawl')


class CrawlAPI:
    """Handles crawl operations using Bright Data's Web Crawl API."""

    CRAWL_DATASET_ID = "gd_m6gjtfmeh43we6cqc"

    AVAILABLE_OUTPUT_FIELDS = [
        "markdown", "url", "html2text", "page_html", "ld_json",
        "page_title", "timestamp", "input", "discovery_input",
        "error", "error_code", "warning", "warning_code"
    ]

    def __init__(self, session, api_token, default_timeout=30, max_retries=3, retry_backoff=1.5):
        self.session = session
        self.api_token = api_token
        self.default_timeout = default_timeout
        # Retry settings are stored for request-level retry handling;
        # crawl() itself currently issues a single request.
        self.max_retries = max_retries
        self.retry_backoff = retry_backoff

    def crawl(
        self,
        url: Union[str, List[str]],
        ignore_sitemap: Optional[bool] = None,
        depth: Optional[int] = None,
        filter: Optional[str] = None,
        exclude_filter: Optional[str] = None,
        custom_output_fields: Optional[List[str]] = None,
        include_errors: bool = True
    ) -> Dict[str, Any]:
        """
        ## Crawl websites using Bright Data's Web Crawl API

        Performs web crawling to discover and scrape multiple pages of a
        website, starting from the specified URL(s).

        ### Parameters:
        - `url` (str | List[str]): Domain URL(s) to crawl (required)
        - `ignore_sitemap` (bool, optional): Ignore the sitemap when crawling
        - `depth` (int, optional): Maximum crawl depth relative to the entered URL
        - `filter` (str, optional): Regular expression to include only matching URLs (e.g. "/product/")
        - `exclude_filter` (str, optional): Regular expression to exclude matching URLs (e.g. "/ads/")
        - `custom_output_fields` (List[str], optional): Custom output schema fields to include
        - `include_errors` (bool, optional): Include errors in the response (default: True)

        ### Returns:
        - `Dict[str, Any]`: Crawl response containing a snapshot_id for tracking

        ### Example Usage:
        ```python
        # Single URL crawl
        result = client.crawl("https://example.com/")

        # Multiple URLs with filters
        urls = ["https://example.com/", "https://example2.com/"]
        result = client.crawl(
            url=urls,
            filter="/product/",
            exclude_filter="/ads/",
            depth=2,
            ignore_sitemap=True
        )

        # Custom output schema
        result = client.crawl(
            url="https://example.com/",
            custom_output_fields=["markdown", "url", "page_title"]
        )
        ```

        ### Raises:
        - `ValidationError`: Invalid URL or parameters
        - `AuthenticationError`: Invalid API token or insufficient permissions
        - `APIError`: Request failed or server error
        """
        if isinstance(url, str):
            urls = [url]
        elif isinstance(url, list):
            urls = url
        else:
            raise ValidationError("URL must be a string or list of strings")

        if not urls:
            raise ValidationError("At least one URL is required")

        for u in urls:
            if not isinstance(u, str) or not u.strip():
                raise ValidationError("All URLs must be non-empty strings")
            validate_url(u)
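        # validate_url (from ..utils) is expected to raise on malformed URLs,
        # so past this point every entry is a non-empty, well-formed URL string.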

        if custom_output_fields is not None:
            if not isinstance(custom_output_fields, list):
                raise ValidationError("custom_output_fields must be a list")

            invalid_fields = [field for field in custom_output_fields if field not in self.AVAILABLE_OUTPUT_FIELDS]
            if invalid_fields:
                raise ValidationError(
                    f"Invalid output fields: {invalid_fields}. "
                    f"Available fields: {self.AVAILABLE_OUTPUT_FIELDS}"
                )

        crawl_inputs = []
        for u in urls:
            crawl_input = {"url": u}

            if ignore_sitemap is not None:
                crawl_input["ignore_sitemap"] = ignore_sitemap
            if depth is not None:
                crawl_input["depth"] = depth
            if filter is not None:
                crawl_input["filter"] = filter
            if exclude_filter is not None:
                crawl_input["exclude_filter"] = exclude_filter

            crawl_inputs.append(crawl_input)
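        # Each entry carries the seed URL plus only the options that were
        # actually set, e.g. {"url": "https://example.com/", "depth": 2}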

        api_url = "https://api.brightdata.com/datasets/v3/trigger"

        params = {
            "dataset_id": self.CRAWL_DATASET_ID,
            "include_errors": str(include_errors).lower(),
            "type": "discover_new",
            "discover_by": "domain_url"
        }

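        # Two payload shapes are sent: a bare list of inputs, or a wrapper
        # object when a custom output schema is requested.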
        if custom_output_fields:
            payload = {
                "input": crawl_inputs,
                "custom_output_fields": custom_output_fields
            }
        else:
            payload = crawl_inputs

        logger.info(f"Starting crawl for {len(urls)} URL(s)")
        logger.debug(f"Crawl parameters: depth={depth}, filter={filter}, exclude_filter={exclude_filter}")

        try:
            response = self.session.post(
                api_url,
                params=params,
                json=payload,
                timeout=self.default_timeout
            )

            if response.status_code == 200:
                result = response.json()
                snapshot_id = result.get('snapshot_id')
                logger.info(f"Crawl initiated successfully. Snapshot ID: {snapshot_id}")
                return result

            elif response.status_code == 401:
                logger.error("Unauthorized (401): Check API token")
                raise AuthenticationError(f"Unauthorized (401): Check your API token. {response.text}")
            elif response.status_code == 403:
                logger.error("Forbidden (403): Insufficient permissions")
                raise AuthenticationError(f"Forbidden (403): Insufficient permissions. {response.text}")
            elif response.status_code == 400:
                logger.error(f"Bad request (400): {response.text}")
                raise APIError(f"Bad request (400): {response.text}")
            else:
                logger.error(f"Crawl request failed ({response.status_code}): {response.text}")
                raise APIError(
                    f"Crawl request failed ({response.status_code}): {response.text}",
                    status_code=response.status_code,
                    response_text=response.text
                )

        except Exception as e:
            if isinstance(e, (ValidationError, AuthenticationError, APIError)):
                raise
            logger.error(f"Unexpected error during crawl: {e}")
            raise APIError(f"Unexpected error during crawl: {str(e)}")
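

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the module). Assumes a
# requests.Session with the Authorization header already set; the token and
# URLs below are placeholders.
#
#     import requests
#
#     session = requests.Session()
#     session.headers["Authorization"] = "Bearer <API_TOKEN>"
#
#     api = CrawlAPI(session, api_token="<API_TOKEN>")
#     result = api.crawl("https://example.com/", depth=2, filter="/product/")
#     snapshot_id = result.get("snapshot_id")
#
# The returned snapshot_id identifies the crawl job; polling its progress and
# downloading results go through the datasets snapshot endpoints, which are
# outside the scope of this class.
# ---------------------------------------------------------------------------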