feat: add evolving history and status to runner #1084
@@ -51,6 +51,13 @@ class EvoStep:
     evolvable_subjects: EvolvableSubjects
     queried_knowledge: QueriedKnowledge | None = None
     feedback: Feedback | None = None
+    code_change_summary: str | None = None  # TODO: minrui
+
+    def __str__(self) -> str:
+        return f"""{str(self.feedback)}
+### Summary of Code Change
+{self.code_change_summary}
+"""


[Review comment on the `code_change_summary` line] I think this is a good place to add the summary.

 class EvolvingStrategy(ABC):
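The added `__str__` can be exercised with a minimal standalone sketch. The `FeedbackStub` and `EvoStepSketch` names below are assumptions for illustration only; the real rdagent `Feedback` and `EvoStep` classes are not reproduced here.

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class FeedbackStub:
    """Hypothetical stand-in for rdagent's Feedback type."""
    text: str

    def __str__(self) -> str:
        return self.text

@dataclass
class EvoStepSketch:
    """Mirrors only the two fields the diff touches."""
    feedback: Optional[FeedbackStub] = None
    code_change_summary: Optional[str] = None

    def __str__(self) -> str:
        # Same layout as the diff: feedback first, then the change summary.
        return f"""{str(self.feedback)}
### Summary of Code Change
{self.code_change_summary}
"""

step = EvoStepSketch(FeedbackStub("Execution succeeded."),
                     "Changed batch_size from 32 to 128.")
print(str(step))
```

Rendering feedback and summary together this way lets a whole `EvoStep` be dropped into a prompt as one history entry.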
@@ -11,7 +11,7 @@
 )
 from rdagent.components.coder.data_science.conf import get_clear_ws_cmd, get_ds_env
 from rdagent.components.coder.data_science.utils import remove_eda_part
-from rdagent.core.evolving_framework import QueriedKnowledge
+from rdagent.core.evolving_framework import EvoStep, QueriedKnowledge
 from rdagent.core.experiment import FBWorkspace, Task
 from rdagent.log import rdagent_logger as logger
 from rdagent.scenarios.data_science.test_eval import (
@@ -39,6 +39,22 @@ def __init__(
         self.hyperparameter_tuning_decision = hyperparameter_tuning_decision
         self.hyperparameter_tuning_suggestion = hyperparameter_tuning_suggestion
 
+    def __str__(self) -> str:
+        parts = [
+            "### Execution",
+            str(self.execution),
+            "### Return Check",
+            self.return_checking if self.return_checking is not None else "No return checking",
+            "### Code",
+            str(self.code),
+            "### Final Decision",
+            f"This implementation is {'SUCCESS' if self.final_decision else 'FAIL'}.",
+        ]
+        if self.hyperparameter_tuning_decision:
+            parts.append("### Hyperparameter Tuning Suggestion")
+            parts.append(str(self.hyperparameter_tuning_suggestion))
+        return "\n".join(parts)
+
 
 class DSCoSTEERCoSTEEREvaluator(CoSTEEREvaluator):
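The new feedback `__str__` can be sketched in isolation. The `FeedbackSketch` dataclass below is an assumption that carries only the attributes the diff references; the real `DSCoSTEEREvalFeedback` class and its base are not reproduced here.

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class FeedbackSketch:
    """Hypothetical stand-in carrying only the attributes used by __str__."""
    execution: str
    return_checking: Optional[str]
    code: str
    final_decision: bool
    hyperparameter_tuning_decision: bool = False
    hyperparameter_tuning_suggestion: Optional[str] = None

    def __str__(self) -> str:
        # Same section layout as the diff.
        parts = [
            "### Execution",
            str(self.execution),
            "### Return Check",
            self.return_checking if self.return_checking is not None else "No return checking",
            "### Code",
            str(self.code),
            "### Final Decision",
            f"This implementation is {'SUCCESS' if self.final_decision else 'FAIL'}.",
        ]
        # The tuning suggestion is appended only when tuning was recommended.
        if self.hyperparameter_tuning_decision:
            parts.append("### Hyperparameter Tuning Suggestion")
            parts.append(str(self.hyperparameter_tuning_suggestion))
        return "\n".join(parts)

fb = FeedbackSketch("ran fine", None, "print('hi')", True,
                    hyperparameter_tuning_decision=True,
                    hyperparameter_tuning_suggestion="Increase epochs")
print(str(fb))
```

A string representation on the feedback object is what lets each loop's result be concatenated into the evolving history passed back to the prompts.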
@@ -48,6 +64,7 @@ def evaluate(
         implementation: FBWorkspace,
         gt_implementation: FBWorkspace,
         queried_knowledge: QueriedKnowledge = None,
+        evolving_history: tuple = None,
         **kwargs,
     ) -> DSCoSTEEREvalFeedback:

[Review comment on the new parameter] evolving_trace
@@ -130,6 +147,8 @@ def evaluate(
             stdout += f"\nSubmission check:\n{submission_check_out}\nIf Submission check returns a 'Submission is valid' or similar message, despite some warning messages, you should still consider the submission as valid and give a positive final decision. "
 
         system_prompt = T(".prompts:DSCoSTEER_eval.system").r(
+            max_loop=DS_RD_SETTING.runner_max_loop,
+            cur_loop=evolving_history[0],
             scenario=self.scen.get_scenario_all_desc(eda_output=implementation.file_dict.get("EDA.md", None)),
             is_sub_enabled=test_eval.is_sub_enabled(self.scen.competition),
             task_desc=target_task.get_task_information(),

@@ -141,6 +160,7 @@ def evaluate(
             time_spent=f"{implementation.running_info.running_time:.2f} seconds",
             timeout=f"{env.conf.running_timeout_period} seconds",
             percent_of_timeout_used=f"{(implementation.running_info.running_time / env.conf.running_timeout_period) * 100:.2f}%",
+            evolving_history=evolving_history[1],
         )
 
         feedback = build_cls_from_json_with_retry(
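From its uses in the diff, `evolving_history` appears to be a 2-tuple: index 0 carries the current loop number (rendered as `cur_loop`) and index 1 the rendered history text. A hedged sketch of that convention follows; the helper name `build_evolving_history` and the per-step heading format are invented for illustration and are not part of the PR.

```python
def build_evolving_history(steps: list) -> tuple:
    """Sketch: pack the current loop index and a rendered history string,
    matching how the diff indexes evolving_history[0] and evolving_history[1]."""
    cur_loop = len(steps)  # number of completed [coding]/[feedback] loops
    rendered = "\n\n".join(
        f"## Step {i + 1}\n{s}" for i, s in enumerate(steps)
    )
    return cur_loop, rendered

history = build_evolving_history(["Fixed import error", "Tuned batch_size"])
print(history[0])  # current loop number
print(history[1])  # rendered history text for the prompt
```

Packing both values into one tuple keeps the `evaluate(...)` signature change down to a single new parameter.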
@@ -2,21 +2,25 @@ DSCoSTEER_eval:
   system: |-
     {% include "scenarios.data_science.share:scen.role" %}
     {% if is_sub_enabled %}
+    You have successfully implemented the workflow on a sampled dataset and are now transitioning to the full dataset.
+    The code base will be iteratively improved through a series of [coding] and [feedback] steps.
+    The maximum number of evolution steps is {{ max_loop }}, and you are currently on [feedback] step {{ cur_loop }}.
+
     You will be provided with:
-    1. `Code base`: The code base of the solution
-    2. `The stdout of code execution and testing`: The generated stdout when executing the code base and corresponding testing
-    3. `The time spent on code execution`: The time spent on the code execution
-    4. `The timeout of code execution`: the time limitation of the code execution
-    5. `The percent of timeout used`: the percentage of the time limitation used
-    Your task is to perform the following evaluation(s):
-
-    # Evalution 1: Code Correctness
+    1. The current code base you need to evaluate.
+    2. The stdout of the current code execution and testing.
+    3. The time spent on the current code execution, along with the total timeout and the percent of timeout used for current code execution.
+    4. The evolving history, which includes summaries of previous [coding] and [feedback] steps.
+
+    Your task is to perform the following evaluations:
+
+    # Evaluation 1: Code Correctness
     ## Scenario
     The code is focusing on the following scenario:
     {{ scenario }}
 
     ## Target Task Description
-    The code is focusing on the following task
+    The code is targeting the following task
     {{ task_desc }}
 
     ## Runtime Environment
@@ -39,12 +43,13 @@ DSCoSTEER_eval:
     The user will provide you the time spent on the whole code execution and the timeout of the code execution. You should decide whether the hyperparameter is reasonable based on the time.
     For example, if the code uses only a very small portion of the allowed time, and hyperparameters like `n_estimators` or `epochs` have low values, with early stopping not being triggered and possible signs of underfitting, you should suggest increasing these hyperparameters.
     You should also notice other resources utilization hyper-parameters,
-    For example, if you are using a GPU with large memory, and the batch size is set very low, you should suggest increasing the batch size if it is not reasonable. 
+    For example, if you are using a GPU with large memory, and the batch size is set very low, you should suggest increasing the batch size if it is not reasonable.
 
     ## Evaluation Guidelines
     1. The code execution time or resource utilization suggest that there is room for improvement in the hyperparameters.
-    2. The code must apply early stopping strategy already (in order to prevent overfitting).
-    3. Your suggestion should have a strong chance of improving the model's performance. Focus on the most obvious and impactful opportunities for quick improvement by leveraging more training time. Don't explore hyperparameters with low confidence. If there are no obvious and impactful opportunities and the code runs well, please accept it.
+    2. The code must apply early stopping strategy already (in order to prevent overfitting).
+    3. Carefully review the entire evolving history to avoid repeating the same mistakes.
+    4. Your suggestion should have a strong chance of improving the model's performance. Focus on the most obvious and impactful opportunities for quick improvement by leveraging more training time. Don't explore hyperparameters with low confidence. If there are no obvious and impactful opportunities and the code runs well, please accept it.
     If the code satisfy the requirements:
     - Set "hyperparameter_tuning_decision" to true.
     - Set "final_decision" to false.

@@ -81,7 +86,6 @@ DSCoSTEER_eval:
     }
     ```
     {% endif %}
-    # NOTE: when is_sub_enabled == False, we don't have any checking about the return. So it is just placeholder currently
 
   user: |-
     # Code base
@@ -90,27 +94,32 @@ DSCoSTEER_eval:
 
     ## Stdout of code execution and testing
     {{ stdout }}
 
-    # The time spend on code execution and timeout
-    {{ time_spent }}
+    ## Execution time and timeout
+    The execution time for current code base: {{ time_spent }}.
+    The total timeout: {{ timeout }}.
+    The percent of timeout used: {{ percent_of_timeout_used }}.
 
-    ## The timeout of code execution
-    {{ timeout }}
-
-    ## The percent of timeout used
-    {{ percent_of_timeout_used }}
+    ## Evolving History
+    {{ evolving_history }}
 
 DSCoSTEER:
   system_debugger: |-
     {% include "scenarios.data_science.share:scen.role" %}
-    You have finished the implementation of the whole workflow which has executed well on a sampled dataset. Now we are working on the full dataset.
-    The user has reported that the workflow failed to execute on the full dataset.
-    Your will be provided with:
-    1. Code base.
-    2. Task description, which is the task the code is trying to solve.
-    3. Feedback generated during the execution of the whole workflow.
-    4. Suggestions for hyperparameter tuning.
-    Your job is to debug the whole code base, try to correct the errors, and ensure that the workflow can execute successfully on the full dataset.
+    You have successfully implemented the workflow on a sampled dataset and are now transitioning to the full dataset.
+    The code base will be iteratively improved through a series of [coding] and [feedback] steps.
+    The maximum number of evolution steps is {{ max_loop }}, and you are currently on [coding] step {{ cur_loop }}.
+    The previous [feedback] step indicates that the code failed to execute successfully on the full dataset.
+
+    You will be provided with:
+    1. The current code base you need to refine.
+    2. The task description, which is the task the code is trying to solve.
+    3. The feedback after executing the code base.
+    4. The evolving history, which includes summaries of previous [coding] and [feedback] steps.
+
+    Your job is to:
+    1. Debug the whole code base, try to correct the errors, and ensure that the workflow can execute successfully on the full dataset.
+    2. Summarize the changes you made to the original code base.
     ## Task description
     {{ task_desc }}
@@ -121,42 +130,53 @@ DSCoSTEER:
     {% else %}
     2. You must output the COMPLETE and FULL code. Do not truncate, summarize, or omit any parts of the code. Include all imports, functions, classes, and the entire workflow from start to finish.
     {% endif %}
+    3. Write a concise and structured code change summary. Clearly describe what was changed, specifying exactly what was changed from what to what (e.g., "Changed batch_size from 32 to 128"). Briefly explain the reasoning behind each modification.
 
     ## Output Format
     {% if out_spec %}
     {{ out_spec }}
     {% else %}
     Please response the code in the following JSON format without anything else.
     {
-        "code": "The Python code as a string."
+        "code": "The refined Python code as a string."
+        "code_change_summary": "The structured summary to briefly summarize the changes made to the original code base in two to three sentences."
     }
     {% endif %}

[Review comment on the `code_change_summary` line] remember change
   system_refine: |-
     {% include "scenarios.data_science.share:scen.role" %}
-    You have finished the implementation of the whole workflow which has executed well on a sampled dataset. Now we are working on the full dataset.
-    The user has reported that the hyperparameters are not reasonable and the code didn't make the best use of the time limit.
+    You have successfully implemented the workflow on a sampled dataset and are now transitioning to the full dataset.
+    The code base will be iteratively improved through a series of [coding] and [feedback] steps.
+    The maximum number of evolution steps is {{ max_loop }}, and you are currently on [coding] step {{ cur_loop }}.
+    The previous [feedback] step indicates that the code executed successfully, but there are opportunities to improve performance through hyperparameter tuning.
 
     You will be provided with:
-    1. Code base.
-    2. Feedback generated during the execution of the whole workflow.
-    3. Suggestions for hyperparameter tuning.
-    Your task is to refine the code base and modify the hyperparameters based on the feedback and suggestions.
+    1. The current code base you need to refine.
+    2. The feedback after executing the code base.
+    3. The suggestions for hyperparameter tuning.
+    4. The evolving history, which includes summaries of previous [coding] and [feedback] steps.
+
+    Your task is to:
+    1. Refine the code base and modify the hyperparameters based on the feedback, suggestions, and evolving history.
+    2. Summarize the changes you made to the original code base.
 
     ## Instructions
-    1. Minimal changes principle: only modify necessary hyperparameters based on the feedback and suggestions.
+    1. Minimal changes principle: only modify necessary hyperparameters based on the feedback, suggestions, and evolving history.
     {% if diff_mode %}
-    2. You must output in Code Diff format. The detailed format specification is as follows.
+    2. You must output the code in V4A diff format. The detailed format specification is as follows.
     {% else %}
     2. You must output the COMPLETE and FULL code. Do not truncate, summarize, or omit any parts of the code. Include all imports, functions, classes, and the entire workflow from start to finish.
     {% endif %}
+    3. Write a concise and structured code change summary. Clearly describe what was changed, specifying exactly what was changed from what to what (e.g., "Changed batch_size from 32 to 128"). Briefly explain the reasoning behind each modification.
 
     ## Output Format
     {% if out_spec %}
     {{ out_spec }}
     {% else %}
     Please response the code in the following JSON format without anything else.
     {
-        "code": "The Python code as a string."
+        "code": "The refined Python code as a string."
+        "code_change_summary": "The structured summary to briefly summarize the changes made to the original code base in two to three sentences."
     }
     {% endif %}
@@ -167,7 +187,10 @@ DSCoSTEER:
     ## Feedback
     {{ feedback }}
 
+    ## Evolving History
+    {{ evolving_history }}
+
     {% if hyperparameter_tuning_suggestion is not none %}
     ## Hyperparameter Tuning Suggestion
     {{ hyperparameter_tuning_suggestion }}
     {% endif %}
     {% endif %}
[Review comment] I think we defer the render to the last step.
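The output format specified in the prompts above asks for a JSON object with exactly the keys `code` and `code_change_summary`. A short sketch of checking such a response is below; the raw response string is invented for illustration, and rdagent's actual parsing goes through `build_cls_from_json_with_retry`, which is not reproduced here.

```python
import json

# Hypothetical model response following the prompt's output format.
raw = (
    '{"code": "print(\\"hello\\")", '
    '"code_change_summary": "Changed batch_size from 32 to 128 to use more of the time budget."}'
)

resp = json.loads(raw)

# The prompt asks for exactly these two keys.
assert set(resp) == {"code", "code_change_summary"}
print(resp["code_change_summary"])
```

Requiring the summary alongside the code is what lets each loop's change description be stored on `EvoStep.code_change_summary` and fed back as evolving history.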