Commit c90acad
deep research
1 parent 6e90106

6 files changed: 569 additions & 3 deletions

src/agent/custom_agent.py

Lines changed: 9 additions & 0 deletions

@@ -111,6 +111,8 @@ def __init__(

         # record last actions
         self._last_actions = None
+        # record extracted content
+        self.extracted_content = ""
         # custom new info
         self.add_infos = add_infos
         # agent_state for Stop
@@ -261,9 +263,15 @@ async def step(self, step_info: Optional[CustomAgentStepInfo] = None) -> None:
             if len(actions) == 0:
                 # TODO: fix no action case
                 result = [ActionResult(is_done=True, extracted_content=step_info.memory, include_in_memory=True)]
+            for ret_ in result:
+                if "Extracted page as" in ret_.extracted_content:
+                    # record every extracted page
+                    self.extracted_content += ret_.extracted_content
             self._last_result = result
             self._last_actions = actions
             if len(result) > 0 and result[-1].is_done:
+                self.extracted_content += step_info.memory
+                result[-1].extracted_content = self.extracted_content
                 logger.info(f"📄 Result: {result[-1].extracted_content}")

             self.consecutive_failures = 0
@@ -338,6 +346,7 @@ async def run(self, max_steps: int = 100) -> AgentHistoryList:
                     break
             else:
                 logger.info("❌ Failed to complete task in maximum steps")
+                self.history.history[-1].result[-1].extracted_content = self.extracted_content

             return self.history
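
With this change, every "Extracted page as ..." chunk is accumulated in self.extracted_content and copied into the final ActionResult, so the full extraction survives into the agent history. A minimal usage sketch (the task string and the llm/browser objects are placeholders, not part of this commit):

import asyncio
from src.agent.custom_agent import CustomAgent

async def run_and_collect(llm, browser):
    # run the agent as usual; extracted pages are accumulated internally
    agent = CustomAgent(task="Find the NASA image of the day and describe it", llm=llm, browser=browser)
    history = await agent.run(max_steps=10)
    # final_result() returns the extracted_content of the last ActionResult,
    # which now also carries every recorded page extraction plus the step memory
    return history.final_result()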

src/controller/custom_controller.py

Lines changed: 32 additions & 0 deletions

@@ -4,7 +4,22 @@
 from browser_use.agent.views import ActionResult
 from browser_use.browser.context import BrowserContext
 from browser_use.controller.service import Controller, DoneAction
+from main_content_extractor import MainContentExtractor
+from browser_use.controller.views import (
+    ClickElementAction,
+    DoneAction,
+    ExtractPageContentAction,
+    GoToUrlAction,
+    InputTextAction,
+    OpenTabAction,
+    ScrollAction,
+    SearchGoogleAction,
+    SendKeysAction,
+    SwitchTabAction,
+)
+import logging

+logger = logging.getLogger(__name__)

 class CustomController(Controller):
     def __init__(self, exclude_actions: list[str] = [],
@@ -29,3 +44,20 @@ async def paste_from_clipboard(browser: BrowserContext):
             await page.keyboard.type(text)

             return ActionResult(extracted_content=text)
+
+        @self.registry.action(
+            'Extract page content to get the pure text or markdown with links if include_links is set to true',
+            param_model=ExtractPageContentAction,
+            requires_browser=True,
+        )
+        async def extract_content(params: ExtractPageContentAction, browser: BrowserContext):
+            page = await browser.get_current_page()
+            output_format = 'markdown' if params.include_links else 'text'
+            content = MainContentExtractor.extract(  # type: ignore
+                html=await page.content(),
+                output_format=output_format,
+            )
+            title = await page.title()
+            msg = f'📄 Page url: {page.url}, Page title: {title}, Extracted page content as {output_format}\n: {content}\n'
+            logger.info(msg)
+            return ActionResult(extracted_content=msg)
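
The new extract_content action delegates the actual extraction to MainContentExtractor, called with the same keyword arguments as above. A standalone sketch of that call, on a made-up HTML snippet:

from main_content_extractor import MainContentExtractor

html = "<html><body><article><h1>Title</h1><p>Body text with a <a href='https://example.com'>link</a>.</p></article></body></html>"
# plain text when include_links would be False
text_only = MainContentExtractor.extract(html=html, output_format="text")
# markdown (keeps links) when include_links would be True
as_markdown = MainContentExtractor.extract(html=html, output_format="markdown")
print(text_only)
print(as_markdown)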

src/utils/deep_research.py

Lines changed: 187 additions & 0 deletions

@@ -0,0 +1,187 @@ (new file)

import pdb

from dotenv import load_dotenv

load_dotenv()
import asyncio
import os
import sys
from pprint import pprint
from uuid import uuid4
from src.utils import utils
from src.agent.custom_agent import CustomAgent
import json
from browser_use.agent.service import Agent
from browser_use.browser.browser import BrowserConfig, Browser
from langchain.schema import SystemMessage, HumanMessage
from json_repair import repair_json
from src.agent.custom_prompts import CustomSystemPrompt, CustomAgentMessagePrompt


async def deep_research(task, llm, **kwargs):
    save_dir = kwargs.get("save_dir", os.path.join(f"./tmp/deep_research/{uuid4()}"))
    os.makedirs(save_dir, exist_ok=True)

    # accumulated search information
    search_infos = ""
    # LLM conversation history for the search planner
    max_query_num = 3
    search_system_prompt = f"""
You are an expert task planner for an AI agent that uses a web browser with **automated execution capabilities**. Your goal is to analyze user instructions and, based on available information,
determine what further search queries are necessary to fulfill the user's request. You will output a JSON array with the following structure:

```json
[
    "search query 1",
    "search query 2",
    //... up to a maximum of {max_query_num} search queries
]
```

Here's an example of the type of `search` tasks we are expecting:
```json
[
    "weather in Tokyo",
    "cheap flights to Paris"
]
```

**Important:**

* Your output should *only* include search queries as strings in a JSON array. Do not include other task types like navigate, click, extract, etc.
* Limit your output to a **maximum of {max_query_num}** search queries.
* Craft search queries that help the automated agent find the needed information. Consider what keywords are most likely to lead to useful results.
* If you have gathered all the information you need and no further search queries are required, output an empty list: `[]`
* Make sure your search queries are different from the previous queries.

**Inputs:**

1. **User Instruction:** The original instruction given by the user.
2. **Previous Search Results:** Textual data gathered from prior search queries. If there are no previous search results this string will be empty.
"""
    search_messages = [SystemMessage(content=search_system_prompt)]
    # history of recorded and summarized information, saved into raw_infos
    record_system_prompt = """
You are an expert information recorder. Your role is to process user instructions, current search results, and previously recorded information to extract, summarize, and record new, useful information that helps fulfill the user's request. Your output will be a concise textual summary of new information.

**Important Considerations:**

1. **Avoid Redundancy:** Do not record information that is already present in the `Previous Recorded Information`. Check for semantic similarity, not just exact matches.

2. **Utility Focus:** Only record information that is likely to be useful for completing the user's original instruction. Ask yourself: "Will this help the AI agent achieve its goal?" Discard irrelevant details.

3. **Include Source Information:** When summarizing information extracted from a specific source (like a webpage or article), always include the source title and URL if available. This helps in verifying the information and providing context.

4. **Format:** Provide your output as a textual summary. When source information is available, use the format: `[title](url): summarized content`. If no specific source is identified, just provide the concise summary. No JSON or other structured output is needed beyond this format.

**Inputs:**

1. **User Instruction:** The original instruction given by the user. This helps you determine what kind of information will be useful.
2. **Current Search Results:** Textual data gathered from the most recent search query.
3. **Previous Recorded Information:** Textual data gathered and recorded from previous searches and processing, represented as a single text string. This string might be empty if no information has been recorded yet.
"""
    record_messages = [SystemMessage(content=record_system_prompt)]

    browser = Browser(
        config=BrowserConfig(
            disable_security=True,
            headless=False,  # Set to False to see browser actions
        )
    )
    search_iteration = 0
    max_search_iterations = 5  # Limit search iterations to prevent infinite loop
    max_history_len = 2
    use_vision = True

    try:
        while search_iteration < max_search_iterations:
            search_iteration += 1
            print(f"Starting search iteration {search_iteration}...")

            query_prompt = f"User Instruction:{task} \n Previous Search Results:\n {search_infos}"
            search_messages.append(HumanMessage(content=query_prompt))
            ai_query_msg = llm.invoke(search_messages[:1] + search_messages[1:][-max_history_len:])
            if hasattr(ai_query_msg, "reasoning_content"):
                print("🤯 Start Search Deep Thinking: ")
                print(ai_query_msg.reasoning_content)
                print("🤯 End Search Deep Thinking")
            ai_content = ai_query_msg.content.replace("```json", "").replace("```", "")
            ai_content = repair_json(ai_content)
            query_tasks = json.loads(ai_content)
            if not query_tasks:
                break
            else:
                search_messages.append(ai_query_msg)
                print(f"Search keywords/questions: {query_tasks}")

            # 2. Perform Web Search and Auto exec
            agents = [CustomAgent(task=task + ". Please click on the most relevant link to get information and go deeper, instead of just staying on the search page.",
                                  llm=llm,
                                  browser=browser,
                                  use_vision=use_vision,
                                  system_prompt_class=CustomSystemPrompt,
                                  agent_prompt_class=CustomAgentMessagePrompt,
                                  max_actions_per_step=5
                                  ) for task in query_tasks]
            query_results = await asyncio.gather(*[agent.run(max_steps=10) for agent in agents])

            # 3. Summarize Search Result
            cur_search_rets = ""
            for i in range(len(query_tasks)):
                cur_search_rets += f"{i+1}. {query_tasks[i]}\n {query_results[i].final_result()}\n"
            record_prompt = f"User Instruction:{task}. \n Current Search Results: {cur_search_rets}\n Previous Search Results:\n {search_infos}"
            record_messages.append(HumanMessage(content=record_prompt))
            ai_record_msg = llm.invoke(record_messages[:1] + record_messages[-1:])
            if hasattr(ai_record_msg, "reasoning_content"):
                print("🤯 Start Record Deep Thinking: ")
                print(ai_record_msg.reasoning_content)
                print("🤯 End Record Deep Thinking")
            record_content = ai_record_msg.content
            search_infos += record_content + "\n"
            record_messages.append(ai_record_msg)
            print(search_infos)

        print("\nSearch complete, generating report...")

        # 5. Report Generation in Markdown (or JSON if you prefer)
        writer_system_prompt = """
Create polished, high-quality reports that fully meet the user's needs, based on the user's instructions and the relevant information provided. Please write the report using Markdown format, ensuring it is both informative and visually appealing.

Specific Instructions:
* **Structure for Impact:** The report must have a clear, logical, and impactful structure. Begin with a compelling introduction that immediately grabs the reader's attention. Develop well-structured body paragraphs that flow smoothly and logically, and conclude with a concise and memorable conclusion that summarizes key takeaways and leaves a lasting impression.
* **Engaging and Vivid Language:** Employ precise, vivid, and descriptive language to make the report captivating and enjoyable to read. Use stylistic techniques to enhance engagement. Tailor your tone, vocabulary, and writing style to perfectly suit the subject matter and the intended audience to maximize impact and readability.
* **Accuracy and Credibility:** Ensure that all information presented is meticulously accurate, rigorously truthful, and robustly supported by the available data. Cite sources professionally and appropriately to enhance credibility and allow for verification.
* **Publication-Ready Formatting:** Adhere strictly to Markdown formatting for excellent readability and a clean, highly professional visual appearance. Pay close attention to formatting details like headings, lists, emphasis, and spacing to optimize the visual presentation and reader experience. The report should be ready for immediate publication upon completion, requiring minimal to no further editing for style or format.
* **Conciseness and Clarity (Unless Specified Otherwise):** When the user does not provide a specific length, prioritize concise and to-the-point writing, maximizing information density while maintaining clarity.
* **Length Adherence:** When the user specifies a length constraint, meticulously stay within reasonable bounds of that specification, ensuring the content is appropriately scaled without sacrificing quality or completeness.
* **Comprehensive Instruction Following:** Pay meticulous attention to all details and nuances provided in the user instructions. Strive to fulfill every aspect of the user's request with the highest degree of accuracy and attention to detail, creating a report that not only meets but exceeds expectations for quality and professionalism.
* **Output the Final Report Only:** Output *only* the final report content. Do not include any preamble, commentary, or other elements.
"""
        report_prompt = f"User Instruction:{task} \n Search Information:\n {search_infos}"
        report_messages = [SystemMessage(content=writer_system_prompt), HumanMessage(content=report_prompt)]  # New context for report generation
        ai_report_msg = llm.invoke(report_messages)
        if hasattr(ai_report_msg, "reasoning_content"):
            print("🤯 Start Report Deep Thinking: ")
            print(ai_report_msg.reasoning_content)
            print("🤯 End Report Deep Thinking")
        report_content = ai_report_msg.content

        if report_content:
            report_file_path = os.path.join(save_dir, "result.md")
            with open(report_file_path, "w", encoding="utf-8") as f:
                f.write(report_content)
            print(f"Report generated and saved to: {report_file_path}")

            print("\nFinal Result: (Report Content)")
            pprint(report_content, indent=4)  # Print the final report content

        else:
            print("Failed to generate report content.")

    except Exception as e:
        print(f"Error during deep research: {e}")
    finally:
        if browser:
            await browser.close()
        print("Browser closed.")

tests/test_browser_use.py

Lines changed: 123 additions & 1 deletion

@@ -233,7 +233,129 @@ async def test_browser_use_custom():
             await playwright.stop()
         if browser:
             await browser.close()
+
+async def test_browser_use_parallel():
+    from browser_use.browser.context import BrowserContextWindowSize
+    from browser_use.browser.browser import BrowserConfig
+    from playwright.async_api import async_playwright
+    from browser_use.browser.browser import Browser
+    from src.agent.custom_agent import CustomAgent
+    from src.agent.custom_prompts import CustomSystemPrompt, CustomAgentMessagePrompt
+    from src.browser.custom_browser import CustomBrowser
+    from src.browser.custom_context import BrowserContextConfig
+    from src.controller.custom_controller import CustomController
+
+    window_w, window_h = 1920, 1080
+
+    # llm = utils.get_llm_model(
+    #     provider="openai",
+    #     model_name="gpt-4o",
+    #     temperature=0.8,
+    #     base_url=os.getenv("OPENAI_ENDPOINT", ""),
+    #     api_key=os.getenv("OPENAI_API_KEY", ""),
+    # )
+
+    # llm = utils.get_llm_model(
+    #     provider="azure_openai",
+    #     model_name="gpt-4o",
+    #     temperature=0.8,
+    #     base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
+    #     api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
+    # )
+
+    llm = utils.get_llm_model(
+        provider="gemini",
+        model_name="gemini-2.0-flash-exp",
+        temperature=1.0,
+        api_key=os.getenv("GOOGLE_API_KEY", "")
+    )
+
+    # llm = utils.get_llm_model(
+    #     provider="deepseek",
+    #     model_name="deepseek-reasoner",
+    #     temperature=0.8
+    # )
+
+    # llm = utils.get_llm_model(
+    #     provider="deepseek",
+    #     model_name="deepseek-chat",
+    #     temperature=0.8
+    # )
+
+    # llm = utils.get_llm_model(
+    #     provider="ollama", model_name="qwen2.5:7b", temperature=0.5
+    # )
+
+    # llm = utils.get_llm_model(
+    #     provider="ollama", model_name="deepseek-r1:14b", temperature=0.5
+    # )
+
+    controller = CustomController()
+    use_own_browser = True
+    disable_security = True
+    use_vision = True  # Set to False when using DeepSeek
+
+    max_actions_per_step = 1
+    playwright = None
+    browser = None
+    browser_context = None
+
+    browser = Browser(
+        config=BrowserConfig(
+            disable_security=True,
+            headless=False,
+            new_context_config=BrowserContextConfig(save_recording_path='./tmp/recordings'),
+        )
+    )
+
+    try:
+        agents = [
+            Agent(task=task, llm=llm, browser=browser)
+            for task in [
+                'Search Google for weather in Tokyo',
+                'Check Reddit front page title',
+                '大S去世',  # Chinese query: "Barbie Hsu passed away"
+                'Find NASA image of the day',
+                # 'Check top story on CNN',
+                # 'Search latest SpaceX launch date',
+                # 'Look up population of Paris',
+                # 'Find current time in Sydney',
+                # 'Check who won last Super Bowl',
+                # 'Search trending topics on Twitter',
+            ]
+        ]
+
+        histories = await asyncio.gather(*[agent.run() for agent in agents])
+        pdb.set_trace()
+        print("Final Result:")
+        pprint([history.final_result() for history in histories], indent=4)
+
+        print("\nErrors:")
+        pprint([history.errors() for history in histories], indent=4)
+
+        # e.g. xPaths the model clicked on
+        print("\nModel Outputs:")
+        pprint([history.model_actions() for history in histories], indent=4)
+
+        print("\nThoughts:")
+        pprint([history.model_thoughts() for history in histories], indent=4)
+        # close browser
+    except Exception:
+        import traceback
+
+        traceback.print_exc()
+    finally:
+        # explicitly close the persistent context
+        if browser_context:
+            await browser_context.close()
+
+        # close the Playwright object
+        if playwright:
+            await playwright.stop()
+        if browser:
+            await browser.close()

 if __name__ == "__main__":
     # asyncio.run(test_browser_use_org())
-    asyncio.run(test_browser_use_custom())
+    asyncio.run(test_browser_use_parallel())
+    # asyncio.run(test_browser_use_custom())
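
The pattern this new test exercises: several agents share one Browser and run concurrently with asyncio.gather, and each agent.run() returns its own history, so results are read per agent. Distilled into a small sketch (the task strings and the headless setting are illustrative, not taken from the commit):

import asyncio
from browser_use.agent.service import Agent
from browser_use.browser.browser import Browser, BrowserConfig

async def run_parallel(llm, tasks):
    # one shared Browser; each Agent performs its own run against it
    browser = Browser(config=BrowserConfig(headless=True, disable_security=True))
    try:
        agents = [Agent(task=t, llm=llm, browser=browser) for t in tasks]
        histories = await asyncio.gather(*[agent.run() for agent in agents])
        # map each task to the final extracted result of its run
        return {t: h.final_result() for t, h in zip(tasks, histories)}
    finally:
        await browser.close()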
