Skip to content

Commit c0a160b

Browse files
authored
Add support for deep research bench (#154)
* Testing with deep research bench examples
* Update prompts and evaluation
* Remove print
1 parent 69c0a33 commit c0a160b

File tree

5 files changed

+82
-39
lines changed

5 files changed

+82
-39
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ readme = "README.md"
99
license = { text = "MIT" }
1010
requires-python = ">=3.10"
1111
dependencies = [
12-
"langgraph>=0.2.55",
12+
"langgraph>=0.5.3",
1313
"langchain-community>=0.3.9",
1414
"langchain-openai>=0.3.7",
1515
"langchain-anthropic>=0.3.15",

src/open_deep_research/deep_researcher.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -141,11 +141,9 @@ async def supervisor_tools(state: SupervisorState, config: RunnableConfig) -> Co
141141
all_conduct_research_calls = [tool_call for tool_call in most_recent_message.tool_calls if tool_call["name"] == "ConductResearch"]
142142
conduct_research_calls = all_conduct_research_calls[:configurable.max_concurrent_research_units]
143143
overflow_conduct_research_calls = all_conduct_research_calls[configurable.max_concurrent_research_units:]
144-
researcher_system_prompt = research_system_prompt.format(mcp_prompt=configurable.mcp_prompt or "", date=get_today_str())
145144
coros = [
146145
researcher_subgraph.ainvoke({
147146
"researcher_messages": [
148-
SystemMessage(content=researcher_system_prompt),
149147
HumanMessage(content=tool_call["args"]["research_topic"])
150148
],
151149
"research_topic": tool_call["args"]["research_topic"]
@@ -206,9 +204,9 @@ async def researcher(state: ResearcherState, config: RunnableConfig) -> Command[
206204
"api_key": get_api_key_for_model(configurable.research_model, config),
207205
"tags": ["langsmith:nostream"]
208206
}
207+
researcher_system_prompt = research_system_prompt.format(mcp_prompt=configurable.mcp_prompt or "", date=get_today_str())
209208
research_model = configurable_model.bind_tools(tools).with_retry(stop_after_attempt=configurable.max_structured_output_retries).with_config(research_model_config)
210-
# NOTE: Need to add fault tolerance here.
211-
response = await research_model.ainvoke(researcher_messages)
209+
response = await research_model.ainvoke([SystemMessage(content=researcher_system_prompt)] + researcher_messages)
212210
return Command(
213211
goto="researcher_tools",
214212
update={
@@ -274,11 +272,10 @@ async def compress_research(state: ResearcherState, config: RunnableConfig):
274272
})
275273
researcher_messages = state.get("researcher_messages", [])
276274
# Update the system prompt to now focus on compression rather than research.
277-
researcher_messages[0] = SystemMessage(content=compress_research_system_prompt.format(date=get_today_str()))
278275
researcher_messages.append(HumanMessage(content=compress_research_simple_human_message))
279276
while synthesis_attempts < 3:
280277
try:
281-
response = await synthesizer_model.ainvoke(researcher_messages)
278+
response = await synthesizer_model.ainvoke([SystemMessage(content=compress_research_system_prompt.format(date=get_today_str()))] + researcher_messages)
282279
return {
283280
"compressed_research": str(response.content),
284281
"raw_notes": ["\n".join([str(m.content) for m in filter_messages(researcher_messages, include_types=["tool", "ai"])])]
@@ -321,6 +318,7 @@ async def final_report_generation(state: AgentState, config: RunnableConfig):
321318
while current_retry <= max_retries:
322319
final_report_prompt = final_report_generation_prompt.format(
323320
research_brief=state.get("research_brief", ""),
321+
messages=get_buffer_string(state.get("messages", [])),
324322
findings=findings,
325323
date=get_today_str()
326324
)

src/open_deep_research/prompts.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,14 @@
222222
{research_brief}
223223
</Research Brief>
224224
225+
For more context, here is all of the messages so far. Focus on the research brief above, but consider these messages as well for more context.
226+
<Messages>
227+
{messages}
228+
</Messages>
229+
CRITICAL: Make sure the answer is written in the same language as the human messages!
230+
For example, if the user's messages are in English, then MAKE SURE you write your response in English. If the user's messages are in Chinese, then MAKE SURE you write your entire response in Chinese.
231+
This is critical. The user will only understand the answer if it is written in the same language as their input message.
232+
225233
Today's date is {date}.
226234
227235
Here are the findings from the research that you conducted:
@@ -270,6 +278,12 @@
270278
- Use ## for section title (Markdown format) for each section of the report
271279
- Do NOT ever refer to yourself as the writer of the report. This should be a professional report without any self-referential language.
272280
- Do not say what you are doing in the report. Just write the report without any commentary from yourself.
281+
- Each section should be as long as necessary to deeply answer the question with the information you have gathered. It is expected that sections will be fairly long and verbose. You are writing a deep research report, and users will expect a thorough answer.
282+
- Use bullet points to list out information when appropriate, but by default, write in paragraph form.
283+
284+
REMEMBER:
285+
The brief and research may be in English, but you need to translate this information to the right language when writing the final answer.
286+
Make sure the final answer report is in the SAME language as the human messages in the message history.
273287
274288
Format the report in clear markdown with proper structure and include source references where appropriate.
275289

tests/run_evaluate.py

Lines changed: 51 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -11,34 +11,49 @@
1111
client = Client()
1212

1313
# NOTE: Configure the right dataset and evaluators
14-
dataset_name = "ODR: Comprehensive Test"
15-
evaluators = [eval_groundedness, eval_completeness, eval_structure]
14+
dataset_name = "Deep Research Bench"
15+
evaluators = []
16+
# NOTE: Configure the right parameters for the experiment, these will be logged in the metadata
17+
max_structured_output_retries = 3
18+
allow_clarification = False
19+
max_concurrent_research_units = 10
20+
search_api = "tavily" # NOTE: We use Tavily to stay consistent
21+
max_researcher_iterations = 5
22+
max_react_tool_calls = 10
23+
summarization_model = "openai:gpt-4.1-nano"
24+
summarization_model_max_tokens = 8192
25+
research_model = "openai:gpt-4.1"
26+
research_model_max_tokens = 10000
27+
compression_model = "openai:gpt-4.1"
28+
compression_model_max_tokens = 10000
29+
final_report_model = "openai:gpt-4.1"
30+
final_report_model_max_tokens = 10000
31+
1632

1733
async def target(
1834
inputs: dict,
1935
):
20-
"""Generate a report using the open deep research general researcher"""
2136
graph = deep_researcher_builder.compile(checkpointer=MemorySaver())
2237
config = {
2338
"configurable": {
2439
"thread_id": str(uuid.uuid4()),
2540
}
2641
}
2742
# NOTE: Configure the right dataset and evaluators
28-
config["configurable"]["max_structured_output_retries"] = 3
29-
config["configurable"]["allow_clarification"] = False
30-
config["configurable"]["max_concurrent_research_units"] = 10
31-
config["configurable"]["search_api"] = "tavily" # NOTE: We use Tavily to stay consistent
32-
config["configurable"]["max_researcher_iterations"] = 3
33-
config["configurable"]["max_react_tool_calls"] = 10
34-
config["configurable"]["summarization_model"] = "openai:gpt-4.1-nano"
35-
config["configurable"]["summarization_model_max_tokens"] = 8192
36-
config["configurable"]["research_model"] = "openai:gpt-4.1"
37-
config["configurable"]["research_model_max_tokens"] = 10000
38-
config["configurable"]["compression_model"] = "openai:gpt-4.1-mini"
39-
config["configurable"]["compression_model_max_tokens"] = 10000
40-
config["configurable"]["final_report_model"] = "openai:gpt-4.1"
41-
config["configurable"]["final_report_model_max_tokens"] = 10000
43+
config["configurable"]["max_structured_output_retries"] = max_structured_output_retries
44+
config["configurable"]["allow_clarification"] = allow_clarification
45+
config["configurable"]["max_concurrent_research_units"] = max_concurrent_research_units
46+
config["configurable"]["search_api"] = search_api
47+
config["configurable"]["max_researcher_iterations"] = max_researcher_iterations
48+
config["configurable"]["max_react_tool_calls"] = max_react_tool_calls
49+
config["configurable"]["summarization_model"] = summarization_model
50+
config["configurable"]["summarization_model_max_tokens"] = summarization_model_max_tokens
51+
config["configurable"]["research_model"] = research_model
52+
config["configurable"]["research_model_max_tokens"] = research_model_max_tokens
53+
config["configurable"]["compression_model"] = compression_model
54+
config["configurable"]["compression_model_max_tokens"] = compression_model_max_tokens
55+
config["configurable"]["final_report_model"] = final_report_model
56+
config["configurable"]["final_report_model_max_tokens"] = final_report_model_max_tokens
4257
# NOTE: We do not use MCP tools to stay consistent
4358
final_state = await graph.ainvoke(
4459
{"messages": [{"role": "user", "content": inputs["messages"][0]["content"]}]},
@@ -49,10 +64,26 @@ async def target(
4964
async def main():
5065
return await client.aevaluate(
5166
target,
52-
data=client.list_examples(dataset_name=dataset_name, splits=["test2"]),
67+
data=dataset_name,
5368
evaluators=evaluators,
54-
experiment_prefix=f"DR Supervisor: Multi Agent (v3) - Tavily #",
55-
max_concurrency=1,
69+
experiment_prefix=f"ODR GPT-4.1, Tavily Search #",
70+
max_concurrency=10,
71+
metadata={
72+
"max_structured_output_retries": max_structured_output_retries,
73+
"allow_clarification": allow_clarification,
74+
"max_concurrent_research_units": max_concurrent_research_units,
75+
"search_api": search_api,
76+
"max_researcher_iterations": max_researcher_iterations,
77+
"max_react_tool_calls": max_react_tool_calls,
78+
"summarization_model": summarization_model,
79+
"summarization_model_max_tokens": summarization_model_max_tokens,
80+
"research_model": research_model,
81+
"research_model_max_tokens": research_model_max_tokens,
82+
"compression_model": compression_model,
83+
"compression_model_max_tokens": compression_model_max_tokens,
84+
"final_report_model": final_report_model,
85+
"final_report_model_max_tokens": final_report_model_max_tokens,
86+
}
5687
)
5788

5889
if __name__ == "__main__":

uv.lock

Lines changed: 12 additions & 12 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments (0)