@@ -43,9 +43,6 @@ async def scrape_and_extract_info(
4343 - url (str): The original URL
4444 - extracted_info (str): The extracted information
4545 - error (str): Error message if the operation failed
46- - scrape_stats (Dict): Statistics about the scraped content
47- - model_used (str): The model used for summarization
48- - tokens_used (int): Number of tokens used (if available)
4946 """
5047 if _is_huggingface_dataset_or_space_url (url ):
5148 return json .dumps (
@@ -54,8 +51,6 @@ async def scrape_and_extract_info(
5451 "url" : url ,
5552 "extracted_info" : "" ,
5653 "error" : "You are trying to scrape a Hugging Face dataset for answers, please do not use the scrape tool for this purpose." ,
57- "scrape_stats" : {},
58- "tokens_used" : 0 ,
5954 },
6055 ensure_ascii = False ,
6156 )
@@ -80,8 +75,6 @@ async def scrape_and_extract_info(
8075 "url" : url ,
8176 "extracted_info" : "" ,
8277 "error" : f"Scraping failed (both Jina and Python): { scrape_result ['error' ]} " ,
83- "scrape_stats" : {},
84- "tokens_used" : 0 ,
8578 },
8679 ensure_ascii = False ,
8780 )
@@ -106,14 +99,6 @@ async def scrape_and_extract_info(
10699 "url" : url ,
107100 "extracted_info" : extracted_result ["extracted_info" ],
108101 "error" : extracted_result ["error" ],
109- "scrape_stats" : {
110- "line_count" : scrape_result ["line_count" ],
111- "char_count" : scrape_result ["char_count" ],
112- "last_char_line" : scrape_result ["last_char_line" ],
113- "all_content_displayed" : scrape_result ["all_content_displayed" ],
114- },
115- "model_used" : extracted_result ["model_used" ],
116- "tokens_used" : extracted_result ["tokens_used" ],
117102 },
118103 ensure_ascii = False ,
119104 )
@@ -146,12 +131,8 @@ async def scrape_url_with_jina(
146131 Dict[str, Any]: A dictionary containing:
147132 - success (bool): Whether the operation was successful
148133 - filename (str): Absolute path to the temporary file containing the scraped content
149- - content (str): The scraped content of the first 40k characters
134+ - content (str): The scraped content (truncated to max_chars if necessary)
150135 - error (str): Error message if the operation failed
151- - line_count (int): Number of lines in the scraped content
152- - char_count (int): Number of characters in the scraped content
153- - last_char_line (int): Line number where the last displayed character is located
154- - all_content_displayed (bool): Signal indicating if all content was displayed (True if content <= 40k chars)
155136 """
156137
157138 # Validate input
@@ -161,10 +142,6 @@ async def scrape_url_with_jina(
161142 "filename" : "" ,
162143 "content" : "" ,
163144 "error" : "URL cannot be empty" ,
164- "line_count" : 0 ,
165- "char_count" : 0 ,
166- "last_char_line" : 0 ,
167- "all_content_displayed" : False ,
168145 }
169146
170147 # Get API key from environment
@@ -174,10 +151,6 @@ async def scrape_url_with_jina(
174151 "filename" : "" ,
175152 "content" : "" ,
176153 "error" : "JINA_API_KEY environment variable is not set" ,
177- "line_count" : 0 ,
178- "char_count" : 0 ,
179- "last_char_line" : 0 ,
180- "all_content_displayed" : False ,
181154 }
182155
183156 # Avoid duplicate Jina URL prefix
@@ -301,10 +274,6 @@ async def scrape_url_with_jina(
301274 "filename" : "" ,
302275 "content" : "" ,
303276 "error" : error_msg ,
304- "line_count" : 0 ,
305- "char_count" : 0 ,
306- "last_char_line" : 0 ,
307- "all_content_displayed" : False ,
308277 }
309278
310279 # Get the scraped content
@@ -316,10 +285,6 @@ async def scrape_url_with_jina(
316285 "filename" : "" ,
317286 "content" : "" ,
318287 "error" : "No content returned from Jina.ai API" ,
319- "line_count" : 0 ,
320- "char_count" : 0 ,
321- "last_char_line" : 0 ,
322- "all_content_displayed" : False ,
323288 }
324289
325290 # handle insufficient balance error
@@ -336,35 +301,15 @@ async def scrape_url_with_jina(
336301 "filename" : "" ,
337302 "content" : "" ,
338303 "error" : "Insufficient balance" ,
339- "line_count" : 0 ,
340- "char_count" : 0 ,
341- "last_char_line" : 0 ,
342- "all_content_displayed" : False ,
343304 }
344305
345- # Get content statistics
346- total_char_count = len (content )
347- total_line_count = content .count ("\n " ) + 1 if content else 0
348-
349306 # Extract first max_chars characters
350307 displayed_content = content [:max_chars ]
351- all_content_displayed = total_char_count <= max_chars
352-
353- # Calculate the line number of the last character displayed
354- if displayed_content :
355- # Count newlines up to the last displayed character
356- last_char_line = displayed_content .count ("\n " ) + 1
357- else :
358- last_char_line = 0
359308
360309 return {
361310 "success" : True ,
362311 "content" : displayed_content ,
363312 "error" : "" ,
364- "line_count" : total_line_count ,
365- "char_count" : total_char_count ,
366- "last_char_line" : last_char_line ,
367- "all_content_displayed" : all_content_displayed ,
368313 }
369314
370315
@@ -382,23 +327,15 @@ async def scrape_url_with_python(
382327 Returns:
383328 Dict[str, Any]: A dictionary containing:
384329 - success (bool): Whether the operation was successful
385- - content (str): The scraped content
330+ - content (str): The scraped content (truncated to max_chars if necessary)
386331 - error (str): Error message if the operation failed
387- - line_count (int): Number of lines in the scraped content
388- - char_count (int): Number of characters in the scraped content
389- - last_char_line (int): Line number where the last displayed character is located
390- - all_content_displayed (bool): Signal indicating if all content was displayed
391332 """
392333 # Validate input
393334 if not url or not url .strip ():
394335 return {
395336 "success" : False ,
396337 "content" : "" ,
397338 "error" : "URL cannot be empty" ,
398- "line_count" : 0 ,
399- "char_count" : 0 ,
400- "last_char_line" : 0 ,
401- "all_content_displayed" : False ,
402339 }
403340
404341 try :
@@ -511,10 +448,6 @@ async def scrape_url_with_python(
511448 "success" : False ,
512449 "content" : "" ,
513450 "error" : error_msg ,
514- "line_count" : 0 ,
515- "char_count" : 0 ,
516- "last_char_line" : 0 ,
517- "all_content_displayed" : False ,
518451 }
519452
520453 # Get the scraped content
@@ -525,34 +458,15 @@ async def scrape_url_with_python(
525458 "success" : False ,
526459 "content" : "" ,
527460 "error" : "No content returned from URL" ,
528- "line_count" : 0 ,
529- "char_count" : 0 ,
530- "last_char_line" : 0 ,
531- "all_content_displayed" : False ,
532461 }
533462
534- # Get content statistics
535- total_char_count = len (content )
536- total_line_count = content .count ("\n " ) + 1 if content else 0
537-
538463 # Extract first max_chars characters
539464 displayed_content = content [:max_chars ]
540- all_content_displayed = total_char_count <= max_chars
541-
542- # Calculate the line number of the last character displayed
543- if displayed_content :
544- last_char_line = displayed_content .count ("\n " ) + 1
545- else :
546- last_char_line = 0
547465
548466 return {
549467 "success" : True ,
550468 "content" : displayed_content ,
551469 "error" : "" ,
552- "line_count" : total_line_count ,
553- "char_count" : total_char_count ,
554- "last_char_line" : last_char_line ,
555- "all_content_displayed" : all_content_displayed ,
556470 }
557471
558472
0 commit comments