
Commit c611070

עידן וילנסקי authored and committed
v1.1.0: Add web crawling, content parsing, and browser automation support
- Add crawl() function for website discovery and multi-page scraping
- Add parse_content() function for extracting text, links, and structured data
- Add connect_browser() function for Playwright/Selenium integration
- Improve download_snapshot() with better 202 status handling
- Fix zone creation error handling with proper retry logic
- Update CI to support Python 3.8+ and remove 3.7 compatibility
- Add BeautifulSoup4 dependency for content parsing
1 parent cb4e7fd commit c611070

19 files changed: +845 −80 lines

.github/workflows/test.yml

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ['3.7', '3.8', '3.9', '3.10', '3.11', '3.12']
+        python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
 
     steps:
       - uses: actions/checkout@v4

brightdata/__init__.py

Lines changed: 18 additions & 2 deletions
@@ -29,10 +29,22 @@
 - Saves the scraped content to local files in various formats (JSON, CSV, etc.)
 - syntax: `client.download_content(results)`
 - syntax: `client.download_snapshot(results)`
+#### connect_browser()
+- Get WebSocket endpoint for connecting to Bright Data's scraping browser with Playwright/Selenium
+- syntax: `endpoint_url = client.connect_browser()` then use with browser automation tools
+#### crawl()
+- Crawl websites to discover and scrape multiple pages using Bright Data's Web Crawl API
+- syntax: `result = client.crawl(url, filter, exclude_filter, depth, ...)`
+#### parse_content()
+- Parse and extract useful information from API responses (JSON or HTML)
+- syntax: `parsed = client.parse_content(data, extract_text=True, extract_links=True)`
 
 ### Features:
 - Web Scraping: Scrape websites using Bright Data Web Unlocker API with proxy support
 - Search Engine Results: Perform web searches using Bright Data SERP API
+- Web Crawling: Discover and scrape multiple pages from websites with advanced filtering
+- Content Parsing: Extract text, links, images, and structured data from API responses
+- Browser Automation: Simple authentication for Bright Data's scraping browser with Playwright/Selenium
 - Multiple Search Engines: Support for Google, Bing, and Yandex
 - Parallel Processing: Concurrent processing for multiple URLs or queries
 - Robust Error Handling: Comprehensive error handling with retry logic
@@ -50,8 +62,9 @@
     NetworkError,
     APIError
 )
+from .utils import parse_content, parse_multiple, extract_structured_data
 
-__version__ = "1.0.8"
+__version__ = "1.1.0"
 __author__ = "Bright Data"
 __email__ = "[email protected]"
 
@@ -62,5 +75,8 @@
     'AuthenticationError',
     'ZoneError',
     'NetworkError',
-    'APIError'
+    'APIError',
+    'parse_content',
+    'parse_multiple',
+    'extract_structured_data'
 ]
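Taken together, the documentation added above maps to the minimal usage sketch below. Only the method names and keyword arguments come from the syntax lines in this diff; the client constructor name (`bdclient`) and its `api_token` keyword are assumptions for illustration.

```python
# Minimal sketch of the v1.1.0 additions documented above.
# Assumption: the SDK client is constructed as bdclient(api_token=...);
# substitute the package's real client class if it differs.
from brightdata import bdclient

client = bdclient(api_token="YOUR_API_TOKEN")

# Start a crawl; the response carries a snapshot_id for later download
result = client.crawl("https://example.com/", filter="/product/", depth=2)

# Parse an API response (JSON or HTML) into text and links
parsed = client.parse_content(result, extract_text=True, extract_links=True)

# Get a WebSocket endpoint for Playwright/Selenium automation
endpoint_url = client.connect_browser()
```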

brightdata/api/__init__.py

Lines changed: 3 additions & 1 deletion
@@ -2,10 +2,12 @@
 from .search import SearchAPI
 from .chatgpt import ChatGPTAPI
 from .linkedin import LinkedInAPI
+from .crawl import CrawlAPI
 
 __all__ = [
     'WebScraper',
     'SearchAPI',
     'ChatGPTAPI',
-    'LinkedInAPI'
+    'LinkedInAPI',
+    'CrawlAPI'
 ]

brightdata/api/crawl.py

Lines changed: 175 additions & 0 deletions
@@ -0,0 +1,175 @@
+import json
+from typing import Union, Dict, Any, List, Optional
+from ..utils import get_logger, validate_url
+from ..exceptions import ValidationError, APIError, AuthenticationError
+
+logger = get_logger('api.crawl')
+
+
+class CrawlAPI:
+    """Handles crawl operations using Bright Data's Web Crawl API"""
+
+    CRAWL_DATASET_ID = "gd_m6gjtfmeh43we6cqc"
+
+    AVAILABLE_OUTPUT_FIELDS = [
+        "markdown", "url", "html2text", "page_html", "ld_json",
+        "page_title", "timestamp", "input", "discovery_input",
+        "error", "error_code", "warning", "warning_code"
+    ]
+
+    def __init__(self, session, api_token, default_timeout=30, max_retries=3, retry_backoff=1.5):
+        self.session = session
+        self.api_token = api_token
+        self.default_timeout = default_timeout
+        self.max_retries = max_retries
+        self.retry_backoff = retry_backoff
+
+    def crawl(
+        self,
+        url: Union[str, List[str]],
+        ignore_sitemap: Optional[bool] = None,
+        depth: Optional[int] = None,
+        filter: Optional[str] = None,
+        exclude_filter: Optional[str] = None,
+        custom_output_fields: Optional[List[str]] = None,
+        include_errors: bool = True
+    ) -> Dict[str, Any]:
+        """
+        ## Crawl websites using Bright Data's Web Crawl API
+
+        Performs web crawling to discover and scrape multiple pages from a website
+        starting from the specified URL(s).
+
+        ### Parameters:
+        - `url` (str | List[str]): Domain URL(s) to crawl (required)
+        - `ignore_sitemap` (bool, optional): Ignore sitemap when crawling
+        - `depth` (int, optional): Maximum depth to crawl relative to the entered URL
+        - `filter` (str, optional): Regular expression to include only certain URLs (e.g. "/product/")
+        - `exclude_filter` (str, optional): Regular expression to exclude certain URLs (e.g. "/ads/")
+        - `custom_output_fields` (List[str], optional): Custom output schema fields to include
+        - `include_errors` (bool, optional): Include errors in response (default: True)
+
+        ### Returns:
+        - `Dict[str, Any]`: Crawl response with snapshot_id for tracking
+
+        ### Example Usage:
+        ```python
+        # Single URL crawl
+        result = client.crawl("https://example.com/")
+
+        # Multiple URLs with filters
+        urls = ["https://example.com/", "https://example2.com/"]
+        result = client.crawl(
+            url=urls,
+            filter="/product/",
+            exclude_filter="/ads/",
+            depth=2,
+            ignore_sitemap=True
+        )
+
+        # Custom output schema
+        result = client.crawl(
+            url="https://example.com/",
+            custom_output_fields=["markdown", "url", "page_title"]
+        )
+        ```
+
+        ### Raises:
+        - `ValidationError`: Invalid URL or parameters
+        - `AuthenticationError`: Invalid API token or insufficient permissions
+        - `APIError`: Request failed or server error
+        """
+        if isinstance(url, str):
+            urls = [url]
+        elif isinstance(url, list):
+            urls = url
+        else:
+            raise ValidationError("URL must be a string or list of strings")
+
+        if not urls:
+            raise ValidationError("At least one URL is required")
+
+        for u in urls:
+            if not isinstance(u, str) or not u.strip():
+                raise ValidationError("All URLs must be non-empty strings")
+            validate_url(u)
+
+        if custom_output_fields is not None:
+            if not isinstance(custom_output_fields, list):
+                raise ValidationError("custom_output_fields must be a list")
+
+            invalid_fields = [field for field in custom_output_fields if field not in self.AVAILABLE_OUTPUT_FIELDS]
+            if invalid_fields:
+                raise ValidationError(f"Invalid output fields: {invalid_fields}. Available fields: {self.AVAILABLE_OUTPUT_FIELDS}")
+
+        crawl_inputs = []
+        for u in urls:
+            crawl_input = {"url": u}
+
+            if ignore_sitemap is not None:
+                crawl_input["ignore_sitemap"] = ignore_sitemap
+            if depth is not None:
+                crawl_input["depth"] = depth
+            if filter is not None:
+                crawl_input["filter"] = filter
+            if exclude_filter is not None:
+                crawl_input["exclude_filter"] = exclude_filter
+
+            crawl_inputs.append(crawl_input)
+
+        api_url = "https://api.brightdata.com/datasets/v3/trigger"
+
+        params = {
+            "dataset_id": self.CRAWL_DATASET_ID,
+            "include_errors": str(include_errors).lower(),
+            "type": "discover_new",
+            "discover_by": "domain_url"
+        }
+
+        if custom_output_fields:
+            payload = {
+                "input": crawl_inputs,
+                "custom_output_fields": custom_output_fields
+            }
+        else:
+            payload = crawl_inputs
+
+        logger.info(f"Starting crawl for {len(urls)} URL(s)")
+        logger.debug(f"Crawl parameters: depth={depth}, filter={filter}, exclude_filter={exclude_filter}")
+
+        try:
+            response = self.session.post(
+                api_url,
+                params=params,
+                json=payload,
+                timeout=self.default_timeout
+            )
+
+            if response.status_code == 200:
+                result = response.json()
+                snapshot_id = result.get('snapshot_id')
+                logger.info(f"Crawl initiated successfully. Snapshot ID: {snapshot_id}")
+                return result
+
+            elif response.status_code == 401:
+                logger.error("Unauthorized (401): Check API token")
+                raise AuthenticationError(f"Unauthorized (401): Check your API token. {response.text}")
+            elif response.status_code == 403:
+                logger.error("Forbidden (403): Insufficient permissions")
+                raise AuthenticationError(f"Forbidden (403): Insufficient permissions. {response.text}")
+            elif response.status_code == 400:
+                logger.error(f"Bad request (400): {response.text}")
+                raise APIError(f"Bad request (400): {response.text}")
+            else:
+                logger.error(f"Crawl request failed ({response.status_code}): {response.text}")
+                raise APIError(
+                    f"Crawl request failed ({response.status_code}): {response.text}",
+                    status_code=response.status_code,
+                    response_text=response.text
+                )
+
+        except Exception as e:
+            if isinstance(e, (ValidationError, AuthenticationError, APIError)):
+                raise
+            logger.error(f"Unexpected error during crawl: {e}")
+            raise APIError(f"Unexpected error during crawl: {str(e)}")

brightdata/api/download.py

Lines changed: 13 additions & 2 deletions
@@ -152,11 +152,22 @@ def download_snapshot(
             timeout=self.default_timeout
         )
 
-        if response.status_code == 401:
+        if response.status_code == 200:
+            pass
+        elif response.status_code == 202:
+            try:
+                response_data = response.json()
+                message = response_data.get('message', 'Snapshot is not ready yet')
+                print("Snapshot is not ready yet, try again soon")
+                return {"status": "not_ready", "message": message, "snapshot_id": snapshot_id}
+            except json.JSONDecodeError:
+                print("Snapshot is not ready yet, try again soon")
+                return {"status": "not_ready", "message": "Snapshot is not ready yet, check again soon", "snapshot_id": snapshot_id}
+        elif response.status_code == 401:
             raise AuthenticationError("Invalid API token or insufficient permissions")
         elif response.status_code == 404:
             raise APIError(f"Snapshot '{snapshot_id}' not found")
-        elif response.status_code != 200:
+        else:
             raise APIError(f"Download request failed with status {response.status_code}: {response.text}")
 
         if format == "csv":
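Because the 202 branch now returns a `{"status": "not_ready", ...}` dict instead of raising, callers can poll until the snapshot is ready. A minimal polling loop might look like the sketch below; the `client` object and the exact `download_snapshot()` signature beyond `snapshot_id` are assumptions.

```python
# Hedged sketch: poll download_snapshot() until the 202 "not_ready" marker goes away.
# Assumption: `client` is an already-constructed SDK client exposing download_snapshot(snapshot_id).
import time


def wait_for_snapshot(client, snapshot_id, interval=10, max_attempts=30):
    """Return the snapshot data once ready, retrying while the status is not_ready."""
    for _ in range(max_attempts):
        result = client.download_snapshot(snapshot_id)
        # The 202 branch added in this commit returns {"status": "not_ready", ...}
        if isinstance(result, dict) and result.get("status") == "not_ready":
            time.sleep(interval)  # snapshot still being generated; wait and retry
            continue
        return result
    raise TimeoutError(f"Snapshot {snapshot_id} not ready after {max_attempts} attempts")
```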
