1414
1515from .serp_tool import _build_client_from_env
1616
17- MAX_HTML_CHARS = 10_000 # maximum number of characters to return to the LLM
17+ # Limit the amount of HTML we pass back to the LLM to avoid huge prompts
18+ MAX_HTML_CHARS = 10_000 # ~2–3k tokens, safe for most models
19+
1820
1921class ThordataScrapeInput (BaseModel ):
2022 """Input schema for ThordataScrapeTool."""
@@ -56,7 +58,7 @@ class ThordataScrapeTool(BaseTool):
5658 a single web page behind anti-bot protections.
5759
5860 The tool returns either:
59- * Raw HTML string (for output_format='HTML'), or
61+ * A (possibly truncated) HTML string (for output_format='HTML'), or
6062 * A JSON dict with a base64-encoded PNG (for output_format='PNG').
6163
6264 Example:
@@ -70,7 +72,7 @@ class ThordataScrapeTool(BaseTool):
7072 description : str = (
7173 "Use Thordata Universal Scraper to fetch the content of a single web page. "
7274 "Supports optional JavaScript rendering and basic geo-targeting. "
73- "Returns HTML (or a base64-encoded PNG screenshot) ."
75+ "Returns HTML (truncated to a safe length) or a base64-encoded PNG screenshot."
7476 )
7577 args_schema : Type [BaseModel ] = ThordataScrapeInput
7678
@@ -98,16 +100,27 @@ def _run(
98100 block_resources = block_resources ,
99101 )
100102
101- # `universal_scrape` may return text (HTML) or bytes (PNG) .
103+ # PNG / binary: return the base64-encoded payload wrapped in a small dict.
102104 if isinstance (result , bytes ):
103105 encoded = base64 .b64encode (result ).decode ("utf-8" )
104106 return {
105107 "output_format" : "PNG" ,
106108 "data_base64" : encoded ,
107109 }
108110
109- # HTML (or text) is returned as-is.
110- return result
111+ # HTML / text: truncate to avoid huge LLM inputs.
112+ if isinstance (result , str ):
113+ if len (result ) > MAX_HTML_CHARS :
114+ truncated = result [:MAX_HTML_CHARS ]
115+ truncated += (
116+ "\n \n [Truncated to first "
117+ f"{ MAX_HTML_CHARS } characters by ThordataScrapeTool]"
118+ )
119+ return truncated
120+ return result
121+
122+ # Fallback: convert unexpected types to string.
123+ return str (result )
111124
112125 async def _arun (self , * args : Any , ** kwargs : Any ) -> Any :
113126 """Async interface is not implemented in this initial version."""
0 commit comments