Draft
Changes from all commits
65 commits
4fee7e6
add prompt in pipeline_eval, parameter to control and more required …
Hoder-zyf Jul 29, 2025
cdb4902
init commit, enable mcp first try(not done yet)
Hoder-zyf Jul 29, 2025
d117c66
remove unused things
Hoder-zyf Jul 29, 2025
d24ad8a
update config and keys to include error message(prompt for error_mess…
Hoder-zyf Jul 30, 2025
13bb813
update error message prompt
Hoder-zyf Jul 31, 2025
c52553a
solve error_message; config load
Hoder-zyf Jul 31, 2025
0bbdee8
make lint and translate; TODOS:
Hoder-zyf Jul 31, 2025
ca64f3e
feat: add CoSTEERSingleFeedback.merge and refactor multi evaluator
you-n-g Aug 1, 2025
c17e31c
test version not done yet
Hoder-zyf Aug 1, 2025
d1e6687
complete call logic
Hoder-zyf Aug 1, 2025
31813c7
clear unnecessary codes
Hoder-zyf Aug 1, 2025
521e517
add special case for timm library
Hoder-zyf Aug 1, 2025
0133ef1
Merge branch 'main' into yifei/context7
Hoder-zyf Aug 2, 2025
233e81e
Merge branch 'main' into yifei/context7
Hoder-zyf Aug 2, 2025
4fdc054
make auto-lint
Hoder-zyf Aug 2, 2025
b3ee31c
add ui config
Hoder-zyf Aug 3, 2025
329ed42
lint
Hoder-zyf Aug 3, 2025
b0ef22a
Merge branch 'main' into yifei/context7
Hoder-zyf Aug 4, 2025
4a0a29f
lint
Hoder-zyf Aug 4, 2025
6bcd6da
add full code and refine prompt
Hoder-zyf Aug 5, 2025
09f44d8
Merge branch 'main' into yifei/context7
Hoder-zyf Aug 5, 2025
aba3f2b
change displayed packages(experimental)
Hoder-zyf Aug 5, 2025
494c52e
fix a small bug
Hoder-zyf Aug 6, 2025
fd00fb2
refactor to the origin package_info
Hoder-zyf Aug 7, 2025
fd897b6
Merge branch 'main' into yifei/context7
Hoder-zyf Aug 7, 2025
222eefe
make lint and merge main 0807
Hoder-zyf Aug 7, 2025
75fcb6a
Merge branch 'main' into yifei/context7
Hoder-zyf Aug 7, 2025
a3da121
Merge branch 'main' into yifei/context7
Hoder-zyf Aug 8, 2025
6a21e13
Merge branch 'main' into yifei/context7
Hoder-zyf Aug 8, 2025
5a4424c
Merge branch 'main' into yifei/context7
Hoder-zyf Aug 13, 2025
c24c535
feat: another way to use mcp via http: transform to tool calling
Hoder-zyf Aug 20, 2025
938105a
refactor and replace llama_index calling with default calling
Hoder-zyf Aug 20, 2025
f3ccd2b
refactor mcp folder
Hoder-zyf Aug 21, 2025
9171f49
remove duplicate configs and add README.md
Hoder-zyf Aug 21, 2025
b79fc9c
remove util.py
Hoder-zyf Aug 21, 2025
a94099d
fix: improve wording in pipeline_eval prompts
Hoder-zyf Aug 21, 2025
30eb10e
fix: reuse SQliteLazyCache and add specific cache for mcp
Hoder-zyf Aug 21, 2025
51b9b93
Merge branch 'main' into yifei/context7 2025-08-22
Hoder-zyf Aug 22, 2025
22bb955
refactor: standardize Context7 MCP configuration and improve error ha…
Hoder-zyf Aug 23, 2025
26308ee
refactor: version 2.0 not done yet but runnable
Hoder-zyf Aug 28, 2025
25ab75d
rm duplicate files and refine example.py
Hoder-zyf Aug 28, 2025
236baca
feat: use litellm backend for mcp call
Hoder-zyf Aug 28, 2025
9acd21a
Merge branch 'main' into yifei/context7
Hoder-zyf Aug 28, 2025
b524cf4
feat: Add litellm-based calling support for MCP extra_config
Hoder-zyf Aug 28, 2025
eaa9404
refactor: remove duplicate code and refine logic (not done yet)
Hoder-zyf Aug 29, 2025
c31c52c
refactor: deduplicate MCP handler registration and error handling
Hoder-zyf Aug 29, 2025
74db563
feat: implement checkpoint-based retry with round-level resumption fo…
Hoder-zyf Aug 29, 2025
9730ed6
refactor: simplify MCP handler architecture and remove redundant abst…
Hoder-zyf Aug 30, 2025
db32c9e
feat: unify MCP interface with parallel multi-service processing
Hoder-zyf Aug 30, 2025
23336e0
refactor: improve MCP logging control and error detection
Hoder-zyf Aug 30, 2025
5c470b6
fix: fix some bugs for error handling
Hoder-zyf Aug 30, 2025
4f97d39
fix(experimental): fix bug for multiservice mcp
Hoder-zyf Aug 30, 2025
2b7b68f
fix: add readme and solve calling methods in eval.py
Hoder-zyf Sep 1, 2025
581ae54
fix: fix bug for not using extra_config
Hoder-zyf Sep 1, 2025
3264eb9
fix: remove useless code round1
Hoder-zyf Sep 2, 2025
5ff5ad4
fix: reuse rdagent/utils/init.py:get_module_by_module_path
Hoder-zyf Sep 3, 2025
ec392ee
refactor: add tool_call logic in _create_chat_completion_inner_function
Hoder-zyf Sep 4, 2025
54dc5cf
refactor(mcp): unify single/multi-service architecture and
Hoder-zyf Sep 5, 2025
caa5f6a
fix: add query_mcp_sync for non-async mcp call
Hoder-zyf Sep 5, 2025
d2c25e9
refactor to MCPAgent
Hoder-zyf Sep 6, 2025
171f9dc
refactor and remove verbose
Hoder-zyf Sep 7, 2025
e702afa
lint
Hoder-zyf Sep 8, 2025
c767203
refactor and remove duplicate code
Hoder-zyf Sep 8, 2025
b919f8d
lint
Hoder-zyf Sep 8, 2025
8bfc09c
refactor code and rename
Hoder-zyf Sep 8, 2025
2 changes: 1 addition & 1 deletion .devcontainer/devcontainer.json
@@ -26,4 +26,4 @@
"--gpus=all"
],
"postCreateCommand": "make dev"
}
}
3 changes: 2 additions & 1 deletion .gitignore
@@ -119,6 +119,7 @@ venv/
ENV/
env.bak/
venv.bak/
mcp_config.json

# Spyder project settings
.spyderproject
@@ -177,4 +178,4 @@ rdagent/app/benchmark/factor/example.json

# UI Server resources
videos/
static/
static/
24 changes: 24 additions & 0 deletions mcp_config.json.example
@@ -0,0 +1,24 @@
{
"mcpServices": {
"context7": {
"url": "http://localhost:8123/mcp",
"handler": "rdagent.components.mcp.context7.client:Context7Client",
"enabled": true,
"timeout": 60,
"extra_config": {
"model": "gpt-4",
"api_key": "your-api-key-here",
"temperature": 0.7
}
},
"simple_code_search": {
"url": "http://localhost:9001/mcp",
"handler": "rdagent.components.mcp.client:MCPClient",
"timeout": 60,
"enabled": true,
"extra_config": {
"model": "gpt-3.5-turbo"
}
}
}
}
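Reviewer note: each `handler` value follows a `module.path:ClassName` convention, and `extra_config` carries per-service LLM settings. A minimal sketch of how an entry could be resolved from this file — illustrative only; the real loading lives in the MCP components, and `load_mcp_services` below is a hypothetical helper:

```python
import importlib
import json


def load_mcp_services(path: str = "mcp_config.json") -> dict:
    """Resolve the handler class for every enabled MCP service."""
    with open(path) as f:
        config = json.load(f)

    services = {}
    for name, svc in config["mcpServices"].items():
        if not svc.get("enabled", False):
            continue  # skip disabled services
        module_path, _, class_name = svc["handler"].partition(":")
        handler_cls = getattr(importlib.import_module(module_path), class_name)
        services[name] = (handler_cls, svc.get("extra_config", {}))
    return services
```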
6 changes: 5 additions & 1 deletion rdagent/app/data_science/conf.py
@@ -165,6 +165,10 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
"""Enable hypothesis critique and rewrite stages for improving hypothesis quality"""
enable_scale_check: bool = False

#### MCP documentation search integration
enable_mcp_documentation_search: bool = True
"""Enable MCP documentation search for error resolution. Requires MCP_ENABLED=true and MCP_CONTEXT7_ENABLED=true in environment."""

##### select related
ratio_merge_or_ensemble: int = 70
"""The ratio of merge or ensemble to be considered as a valid solution"""
@@ -177,7 +181,7 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):

DS_RD_SETTING = DataScienceBasePropSetting()

# enable_cross_trace_diversity llm_select_hypothesis should not be true at the same time
# enable_cross_trace_diversity and llm_select_hypothesis should not be true at the same time
assert not (
DS_RD_SETTING.enable_cross_trace_diversity and DS_RD_SETTING.llm_select_hypothesis
), "enable_cross_trace_diversity and llm_select_hypothesis cannot be true at the same time"
63 changes: 45 additions & 18 deletions rdagent/components/coder/CoSTEER/evaluators.py
@@ -76,6 +76,27 @@ def val_and_update_init_dict(data: dict) -> dict:
raise ValueError(f"'{attr}' must be a string, not {type(data[attr])}")
return data

@classmethod
def merge(cls, feedback_li: list["CoSTEERSingleFeedback"]) -> "CoSTEERSingleFeedback":
# NOTE:
# We don't know the detailed design of each feedback here; we only know they are CoSTEERSingleFeedback.
# So we merge them based only on CoSTEERSingleFeedback's attributes.
# **Some information may therefore be lost when the feedbacks are of different subtypes.**
# If you have a more sophisticated subclass of CoSTEERSingleFeedback, override this method
# to avoid losing information.

fb = deepcopy(feedback_li[0])

# aggregate final_decision across all the evaluators' feedbacks
fb.final_decision = all(fb.final_decision for fb in feedback_li)
for attr in "execution", "return_checking", "code":
setattr(
fb,
attr,
"\n\n".join([getattr(_fb, attr) for _fb in feedback_li if getattr(_fb, attr) is not None]),
)
return fb

def __str__(self) -> str:
return f"""------------------Execution------------------
{self.execution}
@@ -230,7 +251,18 @@ def evaluate(
**kwargs,
) -> CoSTEERMultiFeedback:
eval_l = self.single_evaluator if isinstance(self.single_evaluator, list) else [self.single_evaluator]

# 1) Evaluate each sub_task
task_li_feedback_li = []
# task_li_feedback_li: List[List[CoSTEERSingleFeedback]]
# Example:
# If there are 2 evaluators and 3 sub_tasks in evo, and each evaluator's evaluate returns a list of 3 CoSTEERSingleFeedbacks,
# Then task_li_feedback_li will be:
# [
# [feedback_1_1, feedback_1_2, feedback_1_3], # results from the 1st evaluator for all sub_tasks
# [feedback_2_1, feedback_2_2, feedback_2_3], # results from the 2nd evaluator for all sub_tasks
# ]
# Where feedback_i_j is the feedback from the i-th evaluator for the j-th sub_task.
for ev in eval_l:
multi_implementation_feedback = multiprocessing_wrapper(
[
@@ -248,27 +280,22 @@
n=RD_AGENT_SETTINGS.multi_proc_n,
)
task_li_feedback_li.append(multi_implementation_feedback)
# merge the feedbacks

# 2) merge the feedbacks along the sub_tasks to aggregate the multiple evaluation feedbacks
merged_task_feedback = []
# task_li_feedback_li[0] is a list of feedbacks of different tasks for the 1st evaluator
for task_id, fb in enumerate(task_li_feedback_li[0]):
fb = deepcopy(fb) # deep copy to make it more robust

fb.final_decision = all(
task_li_feedback[task_id].final_decision for task_li_feedback in task_li_feedback_li
)
for attr in "execution", "return_checking", "code":
setattr(
fb,
attr,
"\n\n".join(
[
getattr(task_li_feedback[task_id], attr)
for task_li_feedback in task_li_feedback_li
if getattr(task_li_feedback[task_id], attr) is not None
]
),
)
fb = fb.merge([fb_li[task_id] for fb_li in task_li_feedback_li])
merged_task_feedback.append(fb)
# merged_task_feedback: List[CoSTEERSingleFeedback]
# Example:
# [
# CoSTEERSingleFeedback(final_decision=True, execution="...", return_checking="...", code="..."),
# CoSTEERSingleFeedback(final_decision=False, execution="...", return_checking="...", code="..."),
# ...
# ]
# Each element corresponds to the merged feedback for one sub-task across all evaluators.
# merged_task_feedback[i] is the merged feedback for the i-th sub_task

final_decision = [
None if single_feedback is None else single_feedback.final_decision
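Reviewer note: to make the new merge semantics concrete — `final_decision` is AND-ed across evaluators, and each text field is joined with blank lines, skipping `None`. A minimal sketch with made-up feedback values, assuming the constructor accepts these fields as keyword arguments (as the existing call sites suggest):

```python
from rdagent.components.coder.CoSTEER.evaluators import CoSTEERSingleFeedback

fb_a = CoSTEERSingleFeedback(
    execution="ran in 12s",
    return_checking="submission.csv present",
    code="[Code analysis] approach looks reasonable",
    final_decision=True,
)
fb_b = CoSTEERSingleFeedback(
    execution="ran in 11s",
    return_checking=None,  # None fields are skipped during the join
    code="[Code analysis] minor issues",
    final_decision=False,
)

merged = CoSTEERSingleFeedback.merge([fb_a, fb_b])
assert merged.final_decision is False        # all() over the decisions
assert "minor issues" in merged.code         # "\n\n"-joined text fields
```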
129 changes: 127 additions & 2 deletions rdagent/components/coder/data_science/pipeline/eval.py
@@ -1,7 +1,8 @@
# test successfully running.
# (GPT) if it aligns with the spec & rationality of the spec.
import json
import concurrent.futures
import re
from dataclasses import dataclass
from pathlib import Path

import pandas as pd
@@ -18,14 +19,104 @@
from rdagent.components.coder.data_science.conf import get_clear_ws_cmd, get_ds_env
from rdagent.components.coder.data_science.share.notebook import NotebookConverter
from rdagent.components.coder.data_science.utils import remove_eda_part
from rdagent.components.mcp import MCPAgent
from rdagent.core.experiment import FBWorkspace, Task
from rdagent.log import rdagent_logger as logger
from rdagent.scenarios.data_science.test_eval import get_test_eval
from rdagent.utils.agent.tpl import T
from rdagent.utils.agent.workflow import build_cls_from_json_with_retry

DIRNAME = Path(__file__).absolute().resolve().parent

PipelineSingleFeedback = CoSTEERSingleFeedback

@dataclass
class DSCoderFeedback(CoSTEERSingleFeedback):
"""
Feedback for Data Science CoSTEER evaluation.
This feedback is used to evaluate the code and execution of the Data Science CoSTEER task.
"""

requires_documentation_search: bool | None = None
error_message: str | None = None

@staticmethod
def val_and_update_init_dict(data: dict) -> dict:
# First call parent class validation method to handle base fields
data = CoSTEERSingleFeedback.val_and_update_init_dict(data)

# Validate new fields
if "requires_documentation_search" in data:
if isinstance(data["requires_documentation_search"], str):
if data["requires_documentation_search"] == "false" or data["requires_documentation_search"] == "False":
data["requires_documentation_search"] = False
elif data["requires_documentation_search"] == "true" or data["requires_documentation_search"] == "True":
data["requires_documentation_search"] = True
else:
raise ValueError(
f"'requires_documentation_search' string value must be 'true', 'True', 'false', or 'False', not '{data['requires_documentation_search']}'"
)
elif data["requires_documentation_search"] is not None and not isinstance(
data["requires_documentation_search"], bool
):
raise ValueError(
f"'requires_documentation_search' must be a boolean, string, or None, not {type(data['requires_documentation_search'])}"
)

if "error_message" in data:
if data["error_message"] is not None and not isinstance(data["error_message"], str):
raise ValueError(f"'error_message' must be a string or None, not {type(data['error_message'])}")

return data

def __str__(self) -> str:
base_str = super().__str__()

if self.requires_documentation_search is not None:
base_str += f"-------------------Documentation Search Required------------------\n{self.requires_documentation_search}\n"

if self.error_message is not None:
# Check if error_message contains Context7 documentation results
if "### API Documentation Reference:" in self.error_message:
base_str += f"-------------------Error Analysis & Documentation Search Results ------------------\n{self.error_message}\n"
else:
base_str += f"-------------------Error Message------------------\n{self.error_message}\n"

return base_str

@classmethod
def merge(cls, feedback_li: list[CoSTEERSingleFeedback]) -> "DSCoderFeedback":
# Call parent class merge method to handle base fields
merged_fb = super().merge(feedback_li)

# Convert to DSCoderFeedback type if needed
if not isinstance(merged_fb, DSCoderFeedback):
merged_fb = DSCoderFeedback(
execution=merged_fb.execution,
return_checking=merged_fb.return_checking,
code=merged_fb.code,
final_decision=merged_fb.final_decision,
)

# Merge error_message fields
error_messages = [
fb.error_message for fb in feedback_li if isinstance(fb, DSCoderFeedback) and fb.error_message is not None
]
if error_messages:
merged_fb.error_message = "\n\n".join(error_messages)

# Merge requires_documentation_search fields (True if any is True)
requires_search = [
fb.requires_documentation_search
for fb in feedback_li
if isinstance(fb, DSCoderFeedback) and fb.requires_documentation_search is not None
]
if requires_search:
merged_fb.requires_documentation_search = any(requires_search)

return merged_fb


PipelineSingleFeedback = DSCoderFeedback
PipelineMultiFeedback = CoSTEERMultiFeedback


@@ -51,6 +142,8 @@ def evaluate(
execution="This task has failed too many times, skip implementation.",
return_checking="This task has failed too many times, skip implementation.",
code="This task has failed too many times, skip implementation.",
error_message="This task has failed too many times, skip implementation.",
requires_documentation_search=False,
final_decision=False,
)

@@ -176,6 +269,9 @@ def evaluate(
else:
eda_output = implementation.file_dict.get("EDA.md", None)

# extract enable_mcp_documentation_search from data science configuration
enable_mcp_documentation_search = DS_RD_SETTING.enable_mcp_documentation_search

queried_similar_successful_knowledge = (
queried_knowledge.task_to_similar_task_successful_knowledge[target_task.get_task_information()]
if queried_knowledge is not None
@@ -185,6 +281,7 @@
system_prompt = T(".prompts:pipeline_eval.system").r(
is_sub_enabled=test_eval.is_sub_enabled(self.scen.competition),
debug_mode=DS_RD_SETTING.sample_data_by_LLM,
enable_mcp_documentation_search=enable_mcp_documentation_search,
mle_check=DS_RD_SETTING.sample_data_by_LLM,
queried_similar_successful_knowledge=queried_similar_successful_knowledge,
)
@@ -204,6 +301,34 @@
user_prompt=user_prompt,
init_kwargs_update_func=PipelineSingleFeedback.val_and_update_init_dict,
)

# judge whether we should perform documentation search
do_documentation_search = enable_mcp_documentation_search and wfb.requires_documentation_search

if do_documentation_search:
# Use MCPAgent for clean, user-friendly interface
try:
# Create agent targeting Context7 service - model config comes from mcp_config.json
doc_agent = MCPAgent(toolsets="context7")

# Synchronous query - perfect for evaluation context
if wfb.error_message: # Type safety check
context7_result = doc_agent.run_sync(query=wfb.error_message, timeout=180)

if context7_result:
logger.info("Context7: Documentation search completed successfully")
wfb.error_message += f"\n\n### API Documentation Reference:\nThe following API documentation was retrieved based on the error. This provides factual information about API changes or parameter specifications only:\n\n{context7_result}"
else:
logger.warning("Context7: Documentation search failed or no results found")
else:
logger.warning("Context7: No error message to search for")

except concurrent.futures.TimeoutError:
logger.error("Context7: Query timed out after 180 seconds")
except Exception as e:
error_msg = str(e) if str(e) else type(e).__name__
logger.error(f"Context7: Query failed - {error_msg}")

if score_ret_code != 0 and wfb.final_decision is True:
wfb.final_decision = False
wfb.return_checking += "\n" + score_check_text
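Reviewer note: the subclass wiring can be exercised end to end — `val_and_update_init_dict` normalizes the string booleans an LLM tends to emit, and `DSCoderFeedback.merge` joins `error_message` fields and OR-s `requires_documentation_search`. A sketch with fabricated field values:

```python
from rdagent.components.coder.data_science.pipeline.eval import DSCoderFeedback

# LLMs often return booleans as strings; the validator normalizes them.
raw = {
    "execution": "Failed with ImportError (full traceback appended).",
    "return_checking": "No submission file was produced.",
    "code": "[Code analysis] relies on a removed timm API.",
    "final_decision": False,
    "requires_documentation_search": "True",  # string -> bool
    "error_message": "### TRACEBACK: ImportError: cannot import name ...",
}
data = DSCoderFeedback.val_and_update_init_dict(raw)
assert data["requires_documentation_search"] is True

fb = DSCoderFeedback(**data)
merged = DSCoderFeedback.merge([fb, fb])
assert merged.requires_documentation_search is True  # any() across feedbacks
```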
15 changes: 12 additions & 3 deletions rdagent/components/coder/data_science/pipeline/prompts.yaml
@@ -232,10 +232,15 @@ pipeline_eval:
- Notes:
- Model performance is not evaluated in this step; focus solely on successful execution.
- Warnings are acceptable if they do not interfere with successful code execution.
- **Environment Constraint**: The coding environment is fixed and pre-configured. No package installation or modification is allowed. Code must use only existing pre-installed packages.
- If the code execute successfully:
- Proceed to Step 2.
- Proceed to the next aspect and skip the remaining steps in this aspect.
- If the code does not execute successfully:
- Set the "final_decision" to false and write complete analysis in the "execution" field.
- Set the "final_decision" to false.
{% if enable_mcp_documentation_search %}
- Given that my package/environment is fixed and unchangeable, first go through the code and the execution output; if the problem could be solved by looking up the official documentation to confirm feature/API availability, compatible usage, or official alternatives in the fixed environment, set "requires_documentation_search" to true.
{% endif %}
- Write complete analysis in the "execution" field.

### Competition Alignment
- Goal: Confirm strict adherence to the competition's evaluation rules and experimental setup.
@@ -294,7 +299,7 @@ pipeline_eval:
{% endif %}

{% if queried_similar_successful_knowledge|length != 0 %}
### Step 6: Similar Successful Implementations to help Code Improvement
### Similar Successful Implementations to help Code Improvement
The user has completed several similar tasks and collected some successful implementations. This code might not have been written for the same task, but it is similar to your task and might work well on your dataset.
Please refer to these successful implementations and provide suggestions in your response on how to correct your current code based on them.
## Successful Implementations for Similar Tasks
@@ -309,9 +314,13 @@
Please respond with your feedback in the following JSON format without anything else.
```json
{
{% if enable_mcp_documentation_search %}
"requires_documentation_search": <true/false>,
{% endif %}
"execution": "Describe whether the code executed successfully. Include any errors or issues encountered, and append all error messages and full traceback details without summarizing or omitting any information. If errors occurred, analyze the root causes: (1) Are they fundamental algorithmic/approach issues, or (2) Implementation details that can be easily fixed, or (3) Environment/dependency problems?",
"return_checking": "Examine the generated files by cross-referencing the code logic and stdout output. Verify: (1) Format matches required submission format (index, column names, CSV content); (2) **File generation authenticity**: Is the file genuinely produced by successful model execution, or is it a result of exception handling/fallback mechanisms? Cite specific code sections and stdout evidence.",
"code": "Begin explicitly with [Code analysis] or [Evaluation error]. Provide structured analysis: (1) **Technical Appropriateness**: Does the chosen approach (algorithms, data processing, validation strategy) match this problem's data characteristics and competition requirements? (2) **Effective Components**: What specific parts work well and why are they effective for this problem type? (3) **Issues & Improvements**: Identify concrete problems and suggest actionable improvement directions (without providing actual code). (4) **Code Quality**: Assess readability, structure, and adherence to specifications.",
"error_message": "If the code execution has problems, extract the error information in the following format, otherwise set to empty string: ### TRACEBACK: <full relevant traceback extracted from execution output> ### SUPPLEMENTARY_INFO: <only if TRACEBACK is unclear - copy exact code fragments: import statements, variable=value assignments, function calls with parameters as they appear in code>",
"final_decision": <true/false>
}
```
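Reviewer note: the `enable_mcp_documentation_search` flag gates both the extra instruction bullet and the `requires_documentation_search` key in the JSON schema above. A sketch of the render call, mirroring the one in `eval.py` (argument values here are placeholders):

```python
from rdagent.utils.agent.tpl import T

system_prompt = T(".prompts:pipeline_eval.system").r(
    is_sub_enabled=True,
    debug_mode=False,
    enable_mcp_documentation_search=True,  # toggles the MCP-related sections
    mle_check=False,
    queried_similar_successful_knowledge=[],
)
assert "requires_documentation_search" in system_prompt
```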