Skip to content

Commit 9f66214

Browse files
authored
Merge pull request #7 from vzucher/Code-Improvements
refactor: improve code quality with constants and best practices
2 parents 9f36969 + 2e655a7 commit 9f66214

File tree

23 files changed

+196
-80
lines changed

23 files changed

+196
-80
lines changed

src/brightdata/api/base.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Base API class for all API implementations."""
22

3+
import asyncio
34
from abc import ABC, abstractmethod
45
from typing import Any
56
from ..core.engine import AsyncEngine
@@ -38,8 +39,6 @@ def _execute_sync(self, *args: Any, **kwargs: Any) -> Any:
3839
3940
Wraps async method using asyncio.run() for sync compatibility.
4041
"""
41-
import asyncio
42-
4342
try:
4443
loop = asyncio.get_running_loop()
4544
raise RuntimeError(

src/brightdata/api/serp/base.py

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from ...core.engine import AsyncEngine
1212
from ...models import SearchResult
1313
from ...types import NormalizedSERPData
14+
from ...constants import HTTP_OK
1415
from ...exceptions import ValidationError, APIError
1516
from ...utils.validation import validate_zone_name
1617
from ...utils.retry import retry_with_backoff
@@ -132,10 +133,14 @@ async def _search_single_async(
132133
**kwargs
133134
)
134135

136+
# Use "json" format when brd_json=1 is in URL (enables Bright Data parsing)
137+
# Otherwise use "raw" to get HTML response
138+
response_format = "json" if "brd_json=1" in search_url else "raw"
139+
135140
payload = {
136141
"zone": zone,
137142
"url": search_url,
138-
"format": "raw",
143+
"format": response_format,
139144
"method": "GET",
140145
}
141146

@@ -151,14 +156,33 @@ async def _make_request():
151156
) as response:
152157
data_fetched_at = datetime.now(timezone.utc)
153158

154-
if response.status == 200:
155-
# With brd_json=1, response is JSON text (not wrapped in status_code/body)
159+
if response.status == HTTP_OK:
160+
# Try to parse response - could be direct JSON or wrapped in status_code/body
156161
text = await response.text()
157162
try:
158163
data = json.loads(text)
159164
except json.JSONDecodeError:
160165
# Fallback to regular JSON response
161-
data = await response.json()
166+
try:
167+
data = await response.json()
168+
except Exception:
169+
# If all else fails, treat as raw text/HTML
170+
data = {"raw_html": text}
171+
172+
# Handle wrapped response format (status_code/headers/body)
173+
if isinstance(data, dict) and "body" in data and "status_code" in data:
174+
# This is a wrapped HTTP response - extract body
175+
body = data.get("body", "")
176+
if isinstance(body, str) and body.strip().startswith("<"):
177+
# Body is HTML - pass to normalizer which will handle it
178+
data = {"body": body, "status_code": data.get("status_code")}
179+
else:
180+
# Body might be JSON string - try to parse it
181+
try:
182+
data = json.loads(body) if isinstance(body, str) else body
183+
except (json.JSONDecodeError, TypeError):
184+
data = {"body": body, "status_code": data.get("status_code")}
185+
162186
normalized_data = self.data_normalizer.normalize(data)
163187

164188
return SearchResult(

src/brightdata/api/serp/data_normalizer.py

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Data normalization for SERP responses."""
22

3+
import warnings
34
from abc import ABC, abstractmethod
45
from typing import Any, Dict, List
56
from ...types import NormalizedSERPData
@@ -16,6 +17,9 @@ def normalize(self, data: Any) -> NormalizedSERPData:
1617

1718
class GoogleDataNormalizer(BaseDataNormalizer):
1819
"""Data normalizer for Google SERP responses."""
20+
21+
# Length of prefix to check for HTML detection
22+
HTML_DETECTION_PREFIX_LENGTH = 200
1923

2024
def normalize(self, data: Any) -> NormalizedSERPData:
2125
"""Normalize Google SERP data."""
@@ -30,11 +34,31 @@ def normalize(self, data: Any) -> NormalizedSERPData:
3034

3135
# Handle raw HTML response (body field)
3236
if "body" in data and isinstance(data.get("body"), str):
33-
return {
34-
"results": [],
35-
"raw_html": data["body"],
36-
"status_code": data.get("status_code"),
37-
}
37+
body = data["body"]
38+
# Check if body is HTML with improved detection
39+
body_lower = body.strip().lower()
40+
is_html = (
41+
body_lower.startswith(("<html", "<!doctype", "<!DOCTYPE")) or
42+
"<html" in body_lower[:self.HTML_DETECTION_PREFIX_LENGTH]
43+
)
44+
45+
if is_html:
46+
warnings.warn(
47+
"SERP API returned raw HTML instead of parsed JSON. "
48+
"This usually means:\n"
49+
"1. The zone doesn't support automatic parsing\n"
50+
"2. The brd_json=1 parameter didn't work as expected\n"
51+
"3. You may need to use a different zone type or endpoint\n\n"
52+
"The raw HTML is available in the 'raw_html' field of the response. "
53+
"Consider using an HTML parser (e.g., BeautifulSoup) to extract results.",
54+
UserWarning,
55+
stacklevel=3
56+
)
57+
return {
58+
"results": [],
59+
"raw_html": body,
60+
"status_code": data.get("status_code"),
61+
}
3862

3963
results = []
4064
organic = data.get("organic", [])

src/brightdata/api/web_unlocker.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
)
1818
from ..utils.url import extract_root_domain
1919
from ..utils.function_detection import get_caller_function_name
20+
from ..constants import HTTP_OK
2021
from ..exceptions import ValidationError, APIError
2122

2223

@@ -130,11 +131,11 @@ async def _scrape_single_async(
130131
) as response:
131132
data_fetched_at = datetime.now(timezone.utc)
132133

133-
if response.status == 200:
134+
if response.status == HTTP_OK:
134135
if response_format == "json":
135136
try:
136137
data = await response.json()
137-
except Exception as e:
138+
except (ValueError, TypeError) as e:
138139
raise APIError(f"Failed to parse JSON response: {str(e)}")
139140
else:
140141
data = await response.text()

src/brightdata/client.py

Lines changed: 31 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
import os
1212
import asyncio
13+
import warnings
1314
from typing import Optional, Dict, Any, Union, List
1415
from datetime import datetime, timezone
1516

@@ -27,6 +28,11 @@
2728
from .api.crawler_service import CrawlerService
2829
from .models import ScrapeResult, SearchResult
2930
from .types import AccountInfo, URLParam, OptionalURLParam
31+
from .constants import (
32+
HTTP_OK,
33+
HTTP_UNAUTHORIZED,
34+
HTTP_FORBIDDEN,
35+
)
3036
from .exceptions import (
3137
ValidationError,
3238
AuthenticationError,
@@ -62,9 +68,9 @@ class BrightDataClient:
6268

6369
# Default configuration
6470
DEFAULT_TIMEOUT = 30
65-
DEFAULT_WEB_UNLOCKER_ZONE = "sdk_unlocker"
66-
DEFAULT_SERP_ZONE = "sdk_serp"
67-
DEFAULT_BROWSER_ZONE = "sdk_browser"
71+
DEFAULT_WEB_UNLOCKER_ZONE = "web_unlocker1"
72+
DEFAULT_SERP_ZONE = "serp_api1"
73+
DEFAULT_BROWSER_ZONE = "browser_api1"
6874

6975
# Environment variable name for API token
7076
TOKEN_ENV_VAR = "BRIGHTDATA_API_TOKEN"
@@ -93,9 +99,9 @@ def __init__(
9399
(supports .env files via python-dotenv)
94100
customer_id: Customer ID (optional, can also be set via BRIGHTDATA_CUSTOMER_ID)
95101
timeout: Default timeout in seconds for all requests (default: 30)
96-
web_unlocker_zone: Zone name for web unlocker (default: "sdk_unlocker")
97-
serp_zone: Zone name for SERP API (default: "sdk_serp")
98-
browser_zone: Zone name for browser API (default: "sdk_browser")
102+
web_unlocker_zone: Zone name for web unlocker (default: "web_unlocker1")
103+
serp_zone: Zone name for SERP API (default: "serp_api1")
104+
browser_zone: Zone name for browser API (default: "browser_api1")
99105
auto_create_zones: Automatically create zones if they don't exist (default: False)
100106
validate_token: Validate token by testing connection on init (default: False)
101107
rate_limit: Maximum requests per rate_period (default: 10). Set to None to disable.
@@ -324,14 +330,14 @@ async def test_connection(self) -> bool:
324330
async with self.engine.get_from_url(
325331
f"{self.engine.BASE_URL}/zone/get_active_zones"
326332
) as response:
327-
if response.status == 200:
333+
if response.status == HTTP_OK:
328334
self._is_connected = True
329335
return True
330336
else:
331337
self._is_connected = False
332338
return False
333339

334-
except Exception:
340+
except (asyncio.TimeoutError, OSError, Exception):
335341
self._is_connected = False
336342
return False
337343

@@ -366,21 +372,34 @@ async def get_account_info(self) -> AccountInfo:
366372
async with self.engine.get_from_url(
367373
f"{self.engine.BASE_URL}/zone/get_active_zones"
368374
) as zones_response:
369-
if zones_response.status == 200:
375+
if zones_response.status == HTTP_OK:
370376
zones = await zones_response.json()
377+
zones = zones or []
378+
379+
# Warn user if no active zones found (they might be inactive)
380+
if not zones:
381+
warnings.warn(
382+
"No active zones found. This could mean:\n"
383+
"1. Your zones might be inactive - activate them in the Bright Data dashboard\n"
384+
"2. You might need to create zones first\n"
385+
"3. Check your dashboard at https://brightdata.com for zone status\n\n"
386+
"Note: The API only returns active zones. Inactive zones won't appear here.",
387+
UserWarning,
388+
stacklevel=2
389+
)
371390

372391
account_info = {
373392
"customer_id": self.customer_id,
374-
"zones": zones or [],
375-
"zone_count": len(zones or []),
393+
"zones": zones,
394+
"zone_count": len(zones),
376395
"token_valid": True,
377396
"retrieved_at": datetime.now(timezone.utc).isoformat(),
378397
}
379398

380399
self._account_info = account_info
381400
return account_info
382401

383-
elif zones_response.status in (401, 403):
402+
elif zones_response.status in (HTTP_UNAUTHORIZED, HTTP_FORBIDDEN):
384403
error_text = await zones_response.text()
385404
raise AuthenticationError(
386405
f"Invalid token (HTTP {zones_response.status}): {error_text}"

src/brightdata/constants.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,3 +23,38 @@
2323

2424
DEFAULT_COST_PER_RECORD: float = 0.001
2525
"""Default cost per record for base scrapers."""
26+
27+
# Platform-specific costs (when different from default)
28+
COST_PER_RECORD_LINKEDIN: float = 0.002
29+
"""Cost per record for LinkedIn scrapers."""
30+
31+
COST_PER_RECORD_FACEBOOK: float = 0.002
32+
"""Cost per record for Facebook scrapers."""
33+
34+
COST_PER_RECORD_INSTAGRAM: float = 0.002
35+
"""Cost per record for Instagram scrapers."""
36+
37+
COST_PER_RECORD_CHATGPT: float = 0.005
38+
"""Cost per record for ChatGPT scrapers (higher due to AI processing)."""
39+
40+
# HTTP Status Codes
41+
HTTP_OK: int = 200
42+
"""HTTP 200 OK - Request succeeded."""
43+
44+
HTTP_CREATED: int = 201
45+
"""HTTP 201 Created - Resource created successfully."""
46+
47+
HTTP_BAD_REQUEST: int = 400
48+
"""HTTP 400 Bad Request - Invalid request parameters."""
49+
50+
HTTP_UNAUTHORIZED: int = 401
51+
"""HTTP 401 Unauthorized - Authentication required or failed."""
52+
53+
HTTP_FORBIDDEN: int = 403
54+
"""HTTP 403 Forbidden - Access denied."""
55+
56+
HTTP_CONFLICT: int = 409
57+
"""HTTP 409 Conflict - Resource conflict (e.g., duplicate)."""
58+
59+
HTTP_INTERNAL_SERVER_ERROR: int = 500
60+
"""HTTP 500 Internal Server Error - Server error."""

src/brightdata/core/engine.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from typing import Optional, Dict, Any
77
from datetime import datetime, timezone
88
from ..exceptions import APIError, AuthenticationError, NetworkError, TimeoutError, SSLError
9+
from ..constants import HTTP_UNAUTHORIZED, HTTP_FORBIDDEN
910
from ..utils.ssl_helpers import is_ssl_certificate_error, get_ssl_error_message
1011

1112
# Rate limiting support
@@ -304,14 +305,14 @@ async def __aenter__(self):
304305
timeout=self._timeout,
305306
)
306307
# Check status codes that should raise exceptions
307-
if self._response.status == 401:
308+
if self._response.status == HTTP_UNAUTHORIZED:
308309
text = await self._response.text()
309310
await self._response.release()
310-
raise AuthenticationError(f"Unauthorized (401): {text}")
311-
elif self._response.status == 403:
311+
raise AuthenticationError(f"Unauthorized ({HTTP_UNAUTHORIZED}): {text}")
312+
elif self._response.status == HTTP_FORBIDDEN:
312313
text = await self._response.text()
313314
await self._response.release()
314-
raise AuthenticationError(f"Forbidden (403): {text}")
315+
raise AuthenticationError(f"Forbidden ({HTTP_FORBIDDEN}): {text}")
315316

316317
return self._response
317318
except (aiohttp.ClientError, ssl.SSLError, OSError) as e:

0 commit comments

Comments (0)