Commit a5354f2

Merge branch 'develop' into release/v0.8.0
2 parents: 177e298 + 6090629

2 files changed: +104 -52 lines

crawl4ai/extraction_strategy.py

Lines changed: 100 additions & 52 deletions
````diff
@@ -1277,44 +1277,18 @@ def _get_element_attribute(self, element, attribute: str):
         }
 
     @staticmethod
-    def generate_schema(
-        html: str,
-        schema_type: str = "CSS", # or XPATH
-        query: str = None,
-        target_json_example: str = None,
-        llm_config: 'LLMConfig' = create_llm_config(),
-        provider: str = None,
-        api_token: str = None,
-        **kwargs
-    ) -> dict:
+    def _build_schema_prompt(html: str, schema_type: str, query: str = None, target_json_example: str = None) -> str:
         """
-        Generate extraction schema from HTML content and optional query.
-
-        Args:
-            html (str): The HTML content to analyze
-            query (str, optional): Natural language description of what data to extract
-            provider (str): Legacy Parameter. LLM provider to use
-            api_token (str): Legacy Parameter. API token for LLM provider
-            llm_config (LLMConfig): LLM configuration object
-            prompt (str, optional): Custom prompt template to use
-            **kwargs: Additional args passed to LLM processor
-
+        Build the prompt for schema generation. Shared by sync and async methods.
+
         Returns:
-            dict: Generated schema following the JsonElementExtractionStrategy format
+            str: Combined system and user prompt
         """
         from .prompts import JSON_SCHEMA_BUILDER
-        from .utils import perform_completion_with_backoff
-        for name, message in JsonElementExtractionStrategy._GENERATE_SCHEMA_UNWANTED_PROPS.items():
-            if locals()[name] is not None:
-                raise AttributeError(f"Setting '{name}' is deprecated. {message}")
-
-        # Use default or custom prompt
+
         prompt_template = JSON_SCHEMA_BUILDER if schema_type == "CSS" else JSON_SCHEMA_BUILDER_XPATH
-
-        # Build the prompt
-        system_message = {
-            "role": "system",
-            "content": f"""You specialize in generating special JSON schemas for web scraping. This schema uses CSS or XPATH selectors to present a repetitive pattern in crawled HTML, such as a product in a product list or a search result item in a list of search results. We use this JSON schema to pass to a language model along with the HTML content to extract structured data from the HTML. The language model uses the JSON schema to extract data from the HTML and retrieve values for fields in the JSON schema, following the schema.
+
+        system_content = f"""You specialize in generating special JSON schemas for web scraping. This schema uses CSS or XPATH selectors to present a repetitive pattern in crawled HTML, such as a product in a product list or a search result item in a list of search results. We use this JSON schema to pass to a language model along with the HTML content to extract structured data from the HTML. The language model uses the JSON schema to extract data from the HTML and retrieve values for fields in the JSON schema, following the schema.
 
 Generating this HTML manually is not feasible, so you need to generate the JSON schema using the HTML content. The HTML copied from the crawled website is provided below, which we believe contains the repetitive pattern.
 
@@ -1335,31 +1309,27 @@ def generate_schema(
 
 # What are the instructions and details for this schema generation?
 {prompt_template}"""
-        }
-
-        user_message = {
-            "role": "user",
-            "content": f"""
+
+        user_content = f"""
 HTML to analyze:
 ```html
 {html}
 ```
 """
-        }
 
         if query:
-            user_message["content"] += f"\n\n## Query or explanation of target/goal data item:\n{query}"
+            user_content += f"\n\n## Query or explanation of target/goal data item:\n{query}"
         if target_json_example:
-            user_message["content"] += f"\n\n## Example of target JSON object:\n```json\n{target_json_example}\n```"
+            user_content += f"\n\n## Example of target JSON object:\n```json\n{target_json_example}\n```"
 
         if query and not target_json_example:
-            user_message["content"] += """IMPORTANT: To remind you, in this process, we are not providing a rigid example of the adjacent objects we seek. We rely on your understanding of the explanation provided in the above section. Make sure to grasp what we are looking for and, based on that, create the best schema.."""
+            user_content += """IMPORTANT: To remind you, in this process, we are not providing a rigid example of the adjacent objects we seek. We rely on your understanding of the explanation provided in the above section. Make sure to grasp what we are looking for and, based on that, create the best schema.."""
         elif not query and target_json_example:
-            user_message["content"] += """IMPORTANT: Please remember that in this process, we provided a proper example of a target JSON object. Make sure to adhere to the structure and create a schema that exactly fits this example. If you find that some elements on the page do not match completely, vote for the majority."""
+            user_content += """IMPORTANT: Please remember that in this process, we provided a proper example of a target JSON object. Make sure to adhere to the structure and create a schema that exactly fits this example. If you find that some elements on the page do not match completely, vote for the majority."""
         elif not query and not target_json_example:
-            user_message["content"] += """IMPORTANT: Since we neither have a query nor an example, it is crucial to rely solely on the HTML content provided. Leverage your expertise to determine the schema based on the repetitive patterns observed in the content."""
-
-        user_message["content"] += """IMPORTANT:
+            user_content += """IMPORTANT: Since we neither have a query nor an example, it is crucial to rely solely on the HTML content provided. Leverage your expertise to determine the schema based on the repetitive patterns observed in the content."""
+
+        user_content += """IMPORTANT:
 0/ Ensure your schema remains reliable by avoiding selectors that appear to generate dynamically and are not dependable. You want a reliable schema, as it consistently returns the same data even after many page reloads.
 1/ DO NOT USE use base64 kind of classes, they are temporary and not reliable.
 2/ Every selector must refer to only one unique element. You should ensure your selector points to a single element and is unique to the place that contains the information. You have to use available techniques based on CSS or XPATH requested schema to make sure your selector is unique and also not fragile, meaning if we reload the page now or in the future, the selector should remain reliable.
@@ -1368,20 +1338,98 @@ def generate_schema(
 Analyze the HTML and generate a JSON schema that follows the specified format. Only output valid JSON schema, nothing else.
 """
 
+        return "\n\n".join([system_content, user_content])
+
+    @staticmethod
+    def generate_schema(
+        html: str,
+        schema_type: str = "CSS",
+        query: str = None,
+        target_json_example: str = None,
+        llm_config: 'LLMConfig' = create_llm_config(),
+        provider: str = None,
+        api_token: str = None,
+        **kwargs
+    ) -> dict:
+        """
+        Generate extraction schema from HTML content and optional query (sync version).
+
+        Args:
+            html (str): The HTML content to analyze
+            query (str, optional): Natural language description of what data to extract
+            provider (str): Legacy Parameter. LLM provider to use
+            api_token (str): Legacy Parameter. API token for LLM provider
+            llm_config (LLMConfig): LLM configuration object
+            **kwargs: Additional args passed to LLM processor
+
+        Returns:
+            dict: Generated schema following the JsonElementExtractionStrategy format
+        """
+        from .utils import perform_completion_with_backoff
+
+        for name, message in JsonElementExtractionStrategy._GENERATE_SCHEMA_UNWANTED_PROPS.items():
+            if locals()[name] is not None:
+                raise AttributeError(f"Setting '{name}' is deprecated. {message}")
+
+        prompt = JsonElementExtractionStrategy._build_schema_prompt(html, schema_type, query, target_json_example)
+
         try:
-            # Call LLM with backoff handling
             response = perform_completion_with_backoff(
                 provider=llm_config.provider,
-                prompt_with_variables="\n\n".join([system_message["content"], user_message["content"]]),
-                json_response = True,
+                prompt_with_variables=prompt,
+                json_response=True,
+                api_token=llm_config.api_token,
+                base_url=llm_config.base_url,
+                extra_args=kwargs
+            )
+            return json.loads(response.choices[0].message.content)
+        except Exception as e:
+            raise Exception(f"Failed to generate schema: {str(e)}")
+
+    @staticmethod
+    async def agenerate_schema(
+        html: str,
+        schema_type: str = "CSS",
+        query: str = None,
+        target_json_example: str = None,
+        llm_config: 'LLMConfig' = None,
+        **kwargs
+    ) -> dict:
+        """
+        Generate extraction schema from HTML content (async version).
+
+        Use this method when calling from async contexts (e.g., FastAPI) to avoid
+        issues with certain LLM providers (e.g., Gemini/Vertex AI) that require
+        async execution.
+
+        Args:
+            html (str): The HTML content to analyze
+            schema_type (str): "CSS" or "XPATH"
+            query (str, optional): Natural language description of what data to extract
+            target_json_example (str, optional): Example of desired JSON output
+            llm_config (LLMConfig): LLM configuration object
+            **kwargs: Additional args passed to LLM processor
+
+        Returns:
+            dict: Generated schema following the JsonElementExtractionStrategy format
+        """
+        from .utils import aperform_completion_with_backoff
+
+        if llm_config is None:
+            llm_config = create_llm_config()
+
+        prompt = JsonElementExtractionStrategy._build_schema_prompt(html, schema_type, query, target_json_example)
+
+        try:
+            response = await aperform_completion_with_backoff(
+                provider=llm_config.provider,
+                prompt_with_variables=prompt,
+                json_response=True,
                 api_token=llm_config.api_token,
                 base_url=llm_config.base_url,
                 extra_args=kwargs
             )
-
-            # Extract and return schema
             return json.loads(response.choices[0].message.content)
-
         except Exception as e:
             raise Exception(f"Failed to generate schema: {str(e)}")
````

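Note: a minimal usage sketch of the new async entry point, not part of this diff. It assumes `JsonElementExtractionStrategy` is imported from `crawl4ai.extraction_strategy`, that the default `LLMConfig` from `create_llm_config()` can reach a configured provider, and that the sample HTML and query below are purely illustrative.

```python
# Hypothetical usage sketch (not from this commit): calling agenerate_schema
# from an async context, which the docstring recommends for providers such as
# Gemini/Vertex AI that require async execution.
import asyncio

from crawl4ai.extraction_strategy import JsonElementExtractionStrategy

SAMPLE_HTML = """
<ul>
  <li class="product"><h2>Widget</h2><span class="price">$9.99</span></li>
  <li class="product"><h2>Gadget</h2><span class="price">$19.99</span></li>
</ul>
"""

async def main() -> None:
    # agenerate_schema builds the same prompt as the sync generate_schema via
    # _build_schema_prompt, but awaits aperform_completion_with_backoff.
    schema = await JsonElementExtractionStrategy.agenerate_schema(
        html=SAMPLE_HTML,
        schema_type="CSS",
        query="Extract the product name and price for each list item",
    )
    print(schema)  # dict in the JsonElementExtractionStrategy schema format

if __name__ == "__main__":
    asyncio.run(main())
```

The sync `generate_schema` keeps its original signature, including the deprecated `provider`/`api_token` parameters that still raise `AttributeError` when set, so existing callers are unaffected.
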
crawl4ai/utils.py

Lines changed: 4 additions & 0 deletions
````diff
@@ -1775,6 +1775,8 @@ def perform_completion_with_backoff(
 
     from litellm import completion
     from litellm.exceptions import RateLimitError
+    import litellm
+    litellm.drop_params = True  # Auto-drop unsupported params (e.g., temperature for O-series/GPT-5)
 
     extra_args = {"temperature": 0.01, "api_key": api_token, "base_url": base_url}
     if json_response:
@@ -1864,7 +1866,9 @@ async def aperform_completion_with_backoff(
 
     from litellm import acompletion
     from litellm.exceptions import RateLimitError
+    import litellm
     import asyncio
+    litellm.drop_params = True  # Auto-drop unsupported params (e.g., temperature for O-series/GPT-5)
 
     extra_args = {"temperature": 0.01, "api_key": api_token, "base_url": base_url}
     if json_response:
````

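Note: a small illustrative sketch of the `litellm.drop_params` behavior these two additions rely on, not part of this diff. The model name is a placeholder; the point is that a model which rejects a parameter such as `temperature` should no longer make the hard-coded `extra_args` value fail the request.

```python
# Illustrative sketch (not from this commit) of litellm's drop_params flag,
# which perform_completion_with_backoff / aperform_completion_with_backoff now enable.
import litellm

litellm.drop_params = True  # drop params the target model does not support instead of raising

response = litellm.completion(
    model="o3-mini",  # placeholder; some reasoning models reject a custom temperature
    messages=[{"role": "user", "content": "Reply with a short greeting."}],
    temperature=0.01,  # mirrors the hard-coded extra_args value; dropped where unsupported
)
print(response.choices[0].message.content)
```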