Skip to content

Commit c0a160b

Browse files
authored
Add support for deep research bench (#154)
* Testing with deep research bench examples
* Update prompts and evaluation
* Remove print
1 parent 69c0a33 commit c0a160b

File tree

5 files changed

+82
-39
lines changed

5 files changed

+82
-39
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ readme = "README.md"
99
license = { text = "MIT" }
1010
requires-python = ">=3.10"
1111
dependencies = [
12-
"langgraph>=0.2.55",
12+
"langgraph>=0.5.3",
1313
"langchain-community>=0.3.9",
1414
"langchain-openai>=0.3.7",
1515
"langchain-anthropic>=0.3.15",

src/open_deep_research/deep_researcher.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -141,11 +141,9 @@ async def supervisor_tools(state: SupervisorState, config: RunnableConfig) -> Co
141141
all_conduct_research_calls = [tool_call for tool_call in most_recent_message.tool_calls if tool_call["name"] == "ConductResearch"]
142142
conduct_research_calls = all_conduct_research_calls[:configurable.max_concurrent_research_units]
143143
overflow_conduct_research_calls = all_conduct_research_calls[configurable.max_concurrent_research_units:]
144-
researcher_system_prompt = research_system_prompt.format(mcp_prompt=configurable.mcp_prompt or "", date=get_today_str())
145144
coros = [
146145
researcher_subgraph.ainvoke({
147146
"researcher_messages": [
148-
SystemMessage(content=researcher_system_prompt),
149147
HumanMessage(content=tool_call["args"]["research_topic"])
150148
],
151149
"research_topic": tool_call["args"]["research_topic"]
@@ -206,9 +204,9 @@ async def researcher(state: ResearcherState, config: RunnableConfig) -> Command[
206204
"api_key": get_api_key_for_model(configurable.research_model, config),
207205
"tags": ["langsmith:nostream"]
208206
}
207+
researcher_system_prompt = research_system_prompt.format(mcp_prompt=configurable.mcp_prompt or "", date=get_today_str())
209208
research_model = configurable_model.bind_tools(tools).with_retry(stop_after_attempt=configurable.max_structured_output_retries).with_config(research_model_config)
210-
# NOTE: Need to add fault tolerance here.
211-
response = await research_model.ainvoke(researcher_messages)
209+
response = await research_model.ainvoke([SystemMessage(content=researcher_system_prompt)] + researcher_messages)
212210
return Command(
213211
goto="researcher_tools",
214212
update={
@@ -274,11 +272,10 @@ async def compress_research(state: ResearcherState, config: RunnableConfig):
274272
})
275273
researcher_messages = state.get("researcher_messages", [])
276274
# Update the system prompt to now focus on compression rather than research.
277-
researcher_messages[0] = SystemMessage(content=compress_research_system_prompt.format(date=get_today_str()))
278275
researcher_messages.append(HumanMessage(content=compress_research_simple_human_message))
279276
while synthesis_attempts < 3:
280277
try:
281-
response = await synthesizer_model.ainvoke(researcher_messages)
278+
response = await synthesizer_model.ainvoke([SystemMessage(content=compress_research_system_prompt.format(date=get_today_str()))] + researcher_messages)
282279
return {
283280
"compressed_research": str(response.content),
284281
"raw_notes": ["\n".join([str(m.content) for m in filter_messages(researcher_messages, include_types=["tool", "ai"])])]
@@ -321,6 +318,7 @@ async def final_report_generation(state: AgentState, config: RunnableConfig):
321318
while current_retry <= max_retries:
322319
final_report_prompt = final_report_generation_prompt.format(
323320
research_brief=state.get("research_brief", ""),
321+
messages=get_buffer_string(state.get("messages", [])),
324322
findings=findings,
325323
date=get_today_str()
326324
)

src/open_deep_research/prompts.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,14 @@
222222
{research_brief}
223223
</Research Brief>
224224
225+
For more context, here is all of the messages so far. Focus on the research brief above, but consider these messages as well for more context.
226+
<Messages>
227+
{messages}
228+
</Messages>
229+
CRITICAL: Make sure the answer is written in the same language as the human messages!
230+
For example, if the user's messages are in English, then MAKE SURE you write your response in English. If the user's messages are in Chinese, then MAKE SURE you write your entire response in Chinese.
231+
This is critical. The user will only understand the answer if it is written in the same language as their input message.
232+
225233
Today's date is {date}.
226234
227235
Here are the findings from the research that you conducted:
@@ -270,6 +278,12 @@
270278
- Use ## for section title (Markdown format) for each section of the report
271279
- Do NOT ever refer to yourself as the writer of the report. This should be a professional report without any self-referential language.
272280
- Do not say what you are doing in the report. Just write the report without any commentary from yourself.
281+
- Each section should be as long as necessary to deeply answer the question with the information you have gathered. It is expected that sections will be fairly long and verbose. You are writing a deep research report, and users will expect a thorough answer.
282+
- Use bullet points to list out information when appropriate, but by default, write in paragraph form.
283+
284+
REMEMBER:
285+
The brief and research may be in English, but you need to translate this information to the right language when writing the final answer.
286+
Make sure the final answer report is in the SAME language as the human messages in the message history.
273287
274288
Format the report in clear markdown with proper structure and include source references where appropriate.
275289

tests/run_evaluate.py

Lines changed: 51 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -11,34 +11,49 @@
1111
client = Client()
1212

1313
# NOTE: Configure the right dataset and evaluators
14-
dataset_name = "ODR: Comprehensive Test"
15-
evaluators = [eval_groundedness, eval_completeness, eval_structure]
14+
dataset_name = "Deep Research Bench"
15+
evaluators = []
16+
# NOTE: Configure the right parameters for the experiment, these will be logged in the metadata
17+
max_structured_output_retries = 3
18+
allow_clarification = False
19+
max_concurrent_research_units = 10
20+
search_api = "tavily" # NOTE: We use Tavily to stay consistent
21+
max_researcher_iterations = 5
22+
max_react_tool_calls = 10
23+
summarization_model = "openai:gpt-4.1-nano"
24+
summarization_model_max_tokens = 8192
25+
research_model = "openai:gpt-4.1"
26+
research_model_max_tokens = 10000
27+
compression_model = "openai:gpt-4.1"
28+
compression_model_max_tokens = 10000
29+
final_report_model = "openai:gpt-4.1"
30+
final_report_model_max_tokens = 10000
31+
1632

1733
async def target(
1834
inputs: dict,
1935
):
20-
"""Generate a report using the open deep research general researcher"""
2136
graph = deep_researcher_builder.compile(checkpointer=MemorySaver())
2237
config = {
2338
"configurable": {
2439
"thread_id": str(uuid.uuid4()),
2540
}
2641
}
2742
# NOTE: Configure the right dataset and evaluators
28-
config["configurable"]["max_structured_output_retries"] = 3
29-
config["configurable"]["allow_clarification"] = False
30-
config["configurable"]["max_concurrent_research_units"] = 10
31-
config["configurable"]["search_api"] = "tavily" # NOTE: We use Tavily to stay consistent
32-
config["configurable"]["max_researcher_iterations"] = 3
33-
config["configurable"]["max_react_tool_calls"] = 10
34-
config["configurable"]["summarization_model"] = "openai:gpt-4.1-nano"
35-
config["configurable"]["summarization_model_max_tokens"] = 8192
36-
config["configurable"]["research_model"] = "openai:gpt-4.1"
37-
config["configurable"]["research_model_max_tokens"] = 10000
38-
config["configurable"]["compression_model"] = "openai:gpt-4.1-mini"
39-
config["configurable"]["compression_model_max_tokens"] = 10000
40-
config["configurable"]["final_report_model"] = "openai:gpt-4.1"
41-
config["configurable"]["final_report_model_max_tokens"] = 10000
43+
config["configurable"]["max_structured_output_retries"] = max_structured_output_retries
44+
config["configurable"]["allow_clarification"] = allow_clarification
45+
config["configurable"]["max_concurrent_research_units"] = max_concurrent_research_units
46+
config["configurable"]["search_api"] = search_api
47+
config["configurable"]["max_researcher_iterations"] = max_researcher_iterations
48+
config["configurable"]["max_react_tool_calls"] = max_react_tool_calls
49+
config["configurable"]["summarization_model"] = summarization_model
50+
config["configurable"]["summarization_model_max_tokens"] = summarization_model_max_tokens
51+
config["configurable"]["research_model"] = research_model
52+
config["configurable"]["research_model_max_tokens"] = research_model_max_tokens
53+
config["configurable"]["compression_model"] = compression_model
54+
config["configurable"]["compression_model_max_tokens"] = compression_model_max_tokens
55+
config["configurable"]["final_report_model"] = final_report_model
56+
config["configurable"]["final_report_model_max_tokens"] = final_report_model_max_tokens
4257
# NOTE: We do not use MCP tools to stay consistent
4358
final_state = await graph.ainvoke(
4459
{"messages": [{"role": "user", "content": inputs["messages"][0]["content"]}]},
@@ -49,10 +64,26 @@ async def target(
4964
async def main():
5065
return await client.aevaluate(
5166
target,
52-
data=client.list_examples(dataset_name=dataset_name, splits=["test2"]),
67+
data=dataset_name,
5368
evaluators=evaluators,
54-
experiment_prefix=f"DR Supervisor: Multi Agent (v3) - Tavily #",
55-
max_concurrency=1,
69+
experiment_prefix=f"ODR GPT-4.1, Tavily Search #",
70+
max_concurrency=10,
71+
metadata={
72+
"max_structured_output_retries": max_structured_output_retries,
73+
"allow_clarification": allow_clarification,
74+
"max_concurrent_research_units": max_concurrent_research_units,
75+
"search_api": search_api,
76+
"max_researcher_iterations": max_researcher_iterations,
77+
"max_react_tool_calls": max_react_tool_calls,
78+
"summarization_model": summarization_model,
79+
"summarization_model_max_tokens": summarization_model_max_tokens,
80+
"research_model": research_model,
81+
"research_model_max_tokens": research_model_max_tokens,
82+
"compression_model": compression_model,
83+
"compression_model_max_tokens": compression_model_max_tokens,
84+
"final_report_model": final_report_model,
85+
"final_report_model_max_tokens": final_report_model_max_tokens,
86+
}
5687
)
5788

5889
if __name__ == "__main__":

uv.lock

Lines changed: 12 additions & 12 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments (0)