Skip to content

Commit 77faf82

Browse files
author
Thordata
committed
code optimization
1 parent e312c7b commit 77faf82

File tree

2 files changed

+19
-7
lines changed

2 files changed

+19
-7
lines changed

thordata_langchain_tools/scrape_tool.py

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,9 @@
1414

1515
from .serp_tool import _build_client_from_env
1616

17-
MAX_HTML_CHARS = 10_000 # maximum number of characters to return to the LLM
17+
# Limit the amount of HTML we pass back to the LLM to avoid huge prompts
18+
MAX_HTML_CHARS = 10_000 # ~2–3k tokens, safe for most models
19+
1820

1921
class ThordataScrapeInput(BaseModel):
2022
"""Input schema for ThordataScrapeTool."""
@@ -56,7 +58,7 @@ class ThordataScrapeTool(BaseTool):
5658
a single web page behind anti-bot protections.
5759
5860
The tool returns either:
59-
* Raw HTML string (for output_format='HTML'), or
61+
* A (possibly truncated) HTML string (for output_format='HTML'), or
6062
* A JSON dict with a base64-encoded PNG (for output_format='PNG').
6163
6264
Example:
@@ -70,7 +72,7 @@ class ThordataScrapeTool(BaseTool):
7072
description: str = (
7173
"Use Thordata Universal Scraper to fetch the content of a single web page. "
7274
"Supports optional JavaScript rendering and basic geo-targeting. "
73-
"Returns HTML (or a base64-encoded PNG screenshot)."
75+
"Returns HTML (truncated to a safe length) or a base64-encoded PNG screenshot."
7476
)
7577
args_schema: Type[BaseModel] = ThordataScrapeInput
7678

@@ -98,16 +100,27 @@ def _run(
98100
block_resources=block_resources,
99101
)
100102

101-
# `universal_scrape` may return text (HTML) or bytes (PNG).
103+
# PNG / binary: return a base64-encoded payload in a small JSON wrapper.
102104
if isinstance(result, bytes):
103105
encoded = base64.b64encode(result).decode("utf-8")
104106
return {
105107
"output_format": "PNG",
106108
"data_base64": encoded,
107109
}
108110

109-
# HTML (or text) is returned as-is.
110-
return result
111+
# HTML / text: truncate to avoid huge LLM inputs.
112+
if isinstance(result, str):
113+
if len(result) > MAX_HTML_CHARS:
114+
truncated = result[:MAX_HTML_CHARS]
115+
truncated += (
116+
"\n\n[Truncated to first "
117+
f"{MAX_HTML_CHARS} characters by ThordataScrapeTool]"
118+
)
119+
return truncated
120+
return result
121+
122+
# Fallback: convert unexpected types to string.
123+
return str(result)
111124

112125
async def _arun(self, *args: Any, **kwargs: Any) -> Any:
113126
"""Async interface is not implemented in this initial version."""

thordata_langchain_tools/serp_tool.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55

66
from dotenv import load_dotenv
77
from langchain_core.tools import BaseTool
8-
98
from pydantic import BaseModel, Field
109

1110
# Try top-level imports first (future SDK versions),

0 commit comments

Comments
 (0)