
Commit 5a81b1c

Idan Vilenski authored and committed
v1.1.3: Enhanced extract function and fixed parse_content auto-detection
- Added url and output_scheme parameters to extract function
- Implemented OpenAI Structured Outputs support with schema validation
- Fixed parse_content to auto-detect multiple results
- Added user-agent headers to all dataset API requests
- Updated examples with proper schema formatting
1 parent 33bfb66 commit 5a81b1c

11 files changed: +457 −112 lines
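
The Structured Outputs item above maps onto OpenAI's `json_schema` response format. Below is a minimal sketch of that mapping using the official `openai` client directly; the model name, prompt, and wiring are illustrative assumptions, not the SDK's internals:

```python
# Sketch only: how an output_scheme can drive OpenAI Structured Outputs.
# Uses the official `openai` package; the SDK's internal wiring may differ.
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

schema = {
    "type": "object",
    "properties": {"headlines": {"type": "array", "items": {"type": "string"}}},
    "required": ["headlines"],
    "additionalProperties": False,  # strict mode requires this on objects
}

response = client.chat.completions.create(
    model="gpt-4o-mini",  # illustrative model choice
    messages=[{"role": "user", "content": "Extract the main headlines from: ..."}],
    response_format={
        "type": "json_schema",
        "json_schema": {"name": "extraction", "schema": schema, "strict": True},
    },
)
print(response.choices[0].message.content)  # JSON text matching the schema
```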

README.md

Lines changed: 27 additions & 16 deletions

````diff
@@ -152,19 +152,28 @@ print(f"Found {len(parsed['links'])} links")
 
 #### `extract()`
 ```python
-# Simple AI-powered extraction using natural language
-result = client.extract("extract the latest news headlines from bbc.com")
-print(result)  # Prints extracted headlines directly
-
-# Extract specific information with custom query
-result = client.extract("get product name and price from amazon.com/dp/B079QHML21")
-print(f"Product info: {result}")
-print(f"Source: {result.url}")
-print(f"Tokens used: {result.token_usage['total_tokens']}")
-
-# Extract structured data
-result = client.extract("find contact information and business hours from company-website.com")
-print(result)  # AI-formatted contact details
+# Basic extraction (URL in query)
+result = client.extract("Extract news headlines from CNN.com")
+print(result)
+
+# Using URL parameter with structured output
+schema = {
+    "type": "object",
+    "properties": {
+        "headlines": {
+            "type": "array",
+            "items": {"type": "string"}
+        }
+    },
+    "required": ["headlines"]
+}
+
+result = client.extract(
+    query="Extract main headlines",
+    url="https://cnn.com",
+    output_scheme=schema
+)
+print(result)  # Returns structured JSON matching the schema
 ```
 
 #### `connect_browser()`
@@ -265,13 +274,15 @@ Extract and parse useful information from API responses.
 <details>
 <summary>🤖 <strong>extract(...)</strong></summary>
 
-Extract specific information from websites using AI-powered natural language processing.
+Extract specific information from websites using AI-powered natural language processing with OpenAI.
 
 ```python
-- `query`: Natural language query containing what to extract and from which URL (required)
+- `query`: Natural language query describing what to extract (required)
+- `url`: Single URL or list of URLs to extract from (optional - if not provided, extracts URL from query)
+- `output_scheme`: JSON Schema for OpenAI Structured Outputs (optional - enables reliable JSON responses)
 - `llm_key`: OpenAI API key (optional - uses OPENAI_API_KEY env variable if not provided)
 
-# Returns: Extracted content as string with metadata attributes
+# Returns: ExtractResult object (string-like with metadata attributes)
 # Available attributes: .url, .query, .source_title, .token_usage, .content_length
 ```
````

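The README now documents that `url` also accepts a list of URLs. A combined usage sketch follows; the `bdclient` constructor and the return shape for multiple URLs are assumptions, only the parameter names and result attributes come from the diff above:

```python
# Sketch based on the documented parameters above. The `bdclient` import and
# the behavior for multiple URLs are assumptions, not confirmed by this diff.
from brightdata import bdclient

client = bdclient(api_token="your-api-token")

schema = {
    "type": "object",
    "properties": {"headlines": {"type": "array", "items": {"type": "string"}}},
    "required": ["headlines"],
}

result = client.extract(
    query="Extract the main headlines",
    url=["https://cnn.com", "https://bbc.com"],  # list form per the docs above
    output_scheme=schema,
)
print(result)              # string-like ExtractResult
print(result.url)          # documented metadata attribute
print(result.token_usage)  # e.g. {'total_tokens': ...}
```
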
brightdata/__init__.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -64,7 +64,7 @@
 )
 from .utils import parse_content, parse_multiple, extract_structured_data
 
-__version__ = "1.1.2"
+__version__ = "1.1.3"
 __author__ = "Bright Data"
 __email__ = "[email protected]"
 
```
brightdata/api/chatgpt.py

Lines changed: 8 additions & 1 deletion

```diff
@@ -42,9 +42,16 @@ def scrape_chatgpt(
     - Dict containing response with snapshot_id or direct data (if sync=True)
     """
     url = "https://api.brightdata.com/datasets/v3/scrape" if sync else "https://api.brightdata.com/datasets/v3/trigger"
+    try:
+        from .. import __version__
+        user_agent = f"brightdata-sdk/{__version__}"
+    except ImportError:
+        user_agent = "brightdata-sdk/unknown"
+
     headers = {
         "Authorization": f"Bearer {self.api_token}",
-        "Content-Type": "application/json"
+        "Content-Type": "application/json",
+        "User-Agent": user_agent
     }
     params = {
         "dataset_id": "gd_m7aof0k82r803d5bjm",
```

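The identical version-lookup block is added to both `chatgpt.py` and `download.py`. A shared helper would remove the duplication; the sketch below invents the name `_user_agent`, which does not appear in this commit:

```python
# Hypothetical refactor (not part of this commit): build the User-Agent
# string in one place instead of repeating the try/except in every module.
def _user_agent() -> str:
    try:
        from brightdata import __version__
        return f"brightdata-sdk/{__version__}"
    except ImportError:
        # Fallback when package metadata cannot be imported
        return "brightdata-sdk/unknown"
```

Each call site would then set `"User-Agent": _user_agent()` in its headers dict.
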
brightdata/api/download.py

Lines changed: 8 additions & 1 deletion

```diff
@@ -125,9 +125,16 @@ def download_snapshot(
         raise ValidationError("Part parameter requires batch_size to be specified")
 
     url = f"https://api.brightdata.com/datasets/v3/snapshot/{snapshot_id}"
+    try:
+        from .. import __version__
+        user_agent = f"brightdata-sdk/{__version__}"
+    except ImportError:
+        user_agent = "brightdata-sdk/unknown"
+
     headers = {
         "Authorization": f"Bearer {self.api_token}",
-        "Accept": "application/json"
+        "Accept": "application/json",
+        "User-Agent": user_agent
     }
     params = {
         "format": format
```
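Pieced together from the hunk above, the snapshot download request now looks roughly like this `requests` call; the snapshot id, token, and format value are placeholders:

```python
# Sketch of the resulting HTTP request, assembled from the diff above.
import requests

snapshot_id = "s_example"     # placeholder
api_token = "your-api-token"  # placeholder

response = requests.get(
    f"https://api.brightdata.com/datasets/v3/snapshot/{snapshot_id}",
    headers={
        "Authorization": f"Bearer {api_token}",
        "Accept": "application/json",
        "User-Agent": "brightdata-sdk/1.1.3",
    },
    params={"format": "json"},
)
response.raise_for_status()
print(response.json())
```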