feat: add previous runner loops to runner history (#1142)

RolandMinrui · Xu · peteryang1 · web-flow · commit 8de9f757ea13 · 2025-08-04T17:38:27.000+08:00
* add prev loops to runner history

* fix evolving history

* fix bug on initializing feedback without final decision

* reformat

* refine

* add comments

* fix ci

* a little refinement

* fix CI

---------

Co-authored-by: Xu <v-xuminrui@microsoft.com>
Co-authored-by: Xu Yang <peteryang@vip.qq.com>
diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py
@@ -133,5 +133,8 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
     """Enable hypothesis critique and rewrite stages for improving hypothesis quality"""
     enable_scale_check: bool = False
 
+    #### enable runner code change summary
+    runner_enable_code_change_summary: bool = True
+
 
 DS_RD_SETTING = DataScienceBasePropSetting()
diff --git a/rdagent/components/coder/CoSTEER/evaluators.py b/rdagent/components/coder/CoSTEER/evaluators.py
@@ -42,7 +42,7 @@ class CoSTEERSingleFeedback(Feedback):
     return_checking: str | None  # including every check in the testing (constraints about the generated value)
     # value_feedback, shape_feedback, value_generated_flag
     code: str
-    final_decision: bool
+    final_decision: bool | None = None
 
     @staticmethod
     def val_and_update_init_dict(data: dict) -> dict:
diff --git a/rdagent/components/coder/CoSTEER/evolving_strategy.py b/rdagent/components/coder/CoSTEER/evolving_strategy.py
@@ -19,6 +19,8 @@
 
 
 class MultiProcessEvolvingStrategy(EvolvingStrategy):
+    KEY_CHANGE_SUMMARY = "__change_summary__"  # Optional key for the summary of the change of evolving subjects
+
     def __init__(self, scen: Scenario, settings: CoSTEERSettings):
         super().__init__(scen)
         self.settings = settings
@@ -51,6 +53,7 @@ def implement_one_task(
         Return
         ------
         The new files {<filename>: <content>} to update the workspace.
+        - Special Keys: self.KEY_CHANGE_SUMMARY;
         """
         raise NotImplementedError
 
diff --git a/rdagent/components/coder/factor_coder/config.py b/rdagent/components/coder/factor_coder/config.py
@@ -4,11 +4,7 @@
 from pydantic_settings import SettingsConfigDict
 
 from rdagent.components.coder.CoSTEER.config import CoSTEERSettings
-from rdagent.utils.env import (
-    CondaConf,
-    Env,
-    LocalEnv,
-)
+from rdagent.utils.env import CondaConf, Env, LocalEnv
 
 
 class FactorCoSTEERSettings(CoSTEERSettings):
diff --git a/rdagent/components/coder/model_coder/conf.py b/rdagent/components/coder/model_coder/conf.py
@@ -3,12 +3,7 @@
 from pydantic_settings import SettingsConfigDict
 
 from rdagent.components.coder.CoSTEER.config import CoSTEERSettings
-from rdagent.utils.env import (
-    Env,
-    QlibCondaConf,
-    QlibCondaEnv,
-    QTDockerEnv,
-)
+from rdagent.utils.env import Env, QlibCondaConf, QlibCondaEnv, QTDockerEnv
 
 
 class ModelCoSTEERSettings(CoSTEERSettings):
diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py
@@ -153,8 +153,8 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
             {}
         )  # The code injected into the folder, store them in the variable to reproduce the former result
         self.workspace_path: Path = RD_AGENT_SETTINGS.workspace_path / uuid.uuid4().hex
-        # In-memory checkpoint data created by ``create_ws_ckp``.
-        self.ws_ckp: bytes | None = None
+        self.ws_ckp: bytes | None = None  # In-memory checkpoint data created by ``create_ws_ckp``.
+        self.change_summary: str | None = None  # The change from the previous version of workspace
 
     @staticmethod
     def _format_code_dict(code_dict: dict[str, str]) -> str:
@@ -343,13 +343,12 @@ def recover_ws_ckp(self) -> None:
                     dest_path.parent.mkdir(parents=True, exist_ok=True)
                     link_target = zf.read(info).decode()
                     os.symlink(link_target, dest_path)
+                elif info.is_dir():
+                    dest_path.mkdir(parents=True, exist_ok=True)
                 else:
-                    if info.is_dir():
-                        dest_path.mkdir(parents=True, exist_ok=True)
-                    else:
-                        dest_path.parent.mkdir(parents=True, exist_ok=True)
-                        with dest_path.open("wb") as f:
-                            f.write(zf.read(info))
+                    dest_path.parent.mkdir(parents=True, exist_ok=True)
+                    with dest_path.open("wb") as f:
+                        f.write(zf.read(info))
         # NOTE: very important to reduce the size of the object
         self.ws_ckp = None
 
diff --git a/rdagent/scenarios/data_science/dev/runner/__init__.py b/rdagent/scenarios/data_science/dev/runner/__init__.py
@@ -51,15 +51,19 @@ def implement_one_task(
             # if no prev_task_feedback, it is the first loop; we do not make any changes and goto evaluators directly.
             return {}
 
-        # Output Agent Map
-        output_map = {
-            True: (PythonBatchPatchOut.get_spec(), PythonBatchPatchOut.extract_output),
-            False: (
-                PythonBatchEditOut.get_spec(with_del=False),
-                PythonBatchEditOut.extract_output,
-            ),
-        }
-        output_spec, extract_output_fn = output_map[self.settings.diff_mode]
+        # Get evolving history
+        task_info = target_task.get_task_information()
+        queried_former_failed_knowledge = (
+            queried_knowledge.task_to_former_failed_traces[task_info] if queried_knowledge is not None else []
+        )[0]
+
+        # Set output agent
+        if self.settings.diff_mode:
+            output_spec = PythonBatchPatchOut.get_spec()
+            extract_output_fn = PythonBatchPatchOut.extract_output
+        else:
+            output_spec = PythonBatchEditOut.get_spec(with_del=False)
+            extract_output_fn = PythonBatchEditOut.extract_output
 
         if prev_task_feedback.acceptable is False:
             task_information_str = target_task.get_task_information()
@@ -76,32 +80,41 @@ def implement_one_task(
                 diff_mode=self.settings.diff_mode,
             )
 
-        # Generate user prompt for both cases
+        # Start multi-turn chat session
+        session = APIBackend().build_chat_session(
+            session_system_prompt=system_prompt,
+        )
+
+        # Code
         user_prompt = T(".prompts:DSCoSTEER.user").r(
             code=workspace.all_codes,
+            change_summary=workspace.change_summary,
             feedback=prev_task_feedback,
-            hyperparameter_tuning_suggestion=prev_task_feedback.hyperparameter_tuning_suggestion,
+            hyperparameter_tuning_suggestion=(
+                prev_task_feedback.hyperparameter_tuning_suggestion if prev_task_feedback.acceptable else None
+            ),
+            queried_former_failed_knowledge=queried_former_failed_knowledge,
         )
 
+        code = session.build_chat_completion(user_prompt=user_prompt)
         if self.settings.diff_mode:
-            batch_edit = extract_output_fn(
-                APIBackend().build_messages_and_create_chat_completion(
-                    user_prompt=user_prompt,
-                    system_prompt=system_prompt,
-                ),
-                prefix=workspace.workspace_path,
-            )
+            code_batch_edit = extract_output_fn(code, prefix=workspace.workspace_path)
         else:
-            batch_edit = extract_output_fn(
-                APIBackend().build_messages_and_create_chat_completion(
-                    user_prompt=user_prompt,
-                    system_prompt=system_prompt,
-                )
+            code_batch_edit = extract_output_fn(code)
+        code_batch_edit = {k: v for k, v in code_batch_edit.items() if k in workspace.file_dict.keys()}
+
+        if DS_RD_SETTING.runner_enable_code_change_summary:
+            # Change Summary
+            user_prompt = (
+                "Based on the previous conversation and your latest code modifications, "
+                "please provide a concise and structured summary of the changes you made to the original code. "
+                "Clearly specify what was changed and how, focusing on key modifications. "
+                "Limit your summary to plain text, no more than three sentences."
             )
+            change_summary = session.build_chat_completion(user_prompt=user_prompt)
+            code_batch_edit.update({"__change_summary__": change_summary})
 
-        batch_edit = {k: v for k, v in batch_edit.items() if k in workspace.file_dict.keys()}
-
-        return batch_edit
+        return code_batch_edit
 
     def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo):
         """
@@ -116,6 +129,8 @@ def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo):
             if evo.sub_workspace_list[index] is None:
                 # evo.sub_workspace_list[index] = FBWorkspace(target_task=evo.sub_tasks[index])
                 evo.sub_workspace_list[index] = evo.experiment_workspace
+            if self.KEY_CHANGE_SUMMARY in code_list[index]:
+                evo.sub_workspace_list[index].change_summary = code_list[index].pop(self.KEY_CHANGE_SUMMARY)
             evo.sub_workspace_list[index].inject_files(**code_list[index])
         return evo
 
diff --git a/rdagent/scenarios/data_science/dev/runner/eval.py b/rdagent/scenarios/data_science/dev/runner/eval.py
@@ -37,12 +37,31 @@ class DSRunnerFeedback(CoSTEERSingleFeedback):
     acceptable: bool | None = None
     hyperparameter_tuning_decision: bool | None = None
     hyperparameter_tuning_suggestion: str | None = None
+    score: str | None = None
 
     def is_acceptable(self) -> bool:
         if self.acceptable is not None:
             return self.acceptable
         return super().is_acceptable()
 
+    def __str__(self) -> str:
+        parts = [
+            "### Execution",
+            str(self.execution),
+            "### Return Check",
+            self.return_checking if self.return_checking is not None else "No return checking",
+            "### Code",
+            str(self.code),
+            "### Validation Score",
+            f"{self.score}" if self.score else "Not available",
+            "### Final Decision",
+            f"This implementation is {'PASSED' if self.acceptable else 'FAILED'}.",
+        ]
+        if self.hyperparameter_tuning_decision:
+            parts.append("### Hyperparameter Tuning Suggestion")
+            parts.append(str(self.hyperparameter_tuning_suggestion))
+        return "\n".join(parts)
+
 
 DSCoSTEEREvalFeedback = DSRunnerFeedback  # FIXME: Alias for backward compatibility
 
@@ -77,6 +96,12 @@ def evaluate(
             env=env, entry=get_clear_ws_cmd()
         )  # Remove previous submission and scores files generated by workflow.
 
+        # get previous runner loops
+        task_info = target_task.get_task_information()
+        queried_former_failed_knowledge = (
+            queried_knowledge.task_to_former_failed_traces[task_info] if queried_knowledge is not None else []
+        )[0]
+
         # execute workflow
         result = implementation.run(env=env, entry="python -m coverage run main.py")
         stdout = result.stdout
@@ -164,14 +189,19 @@ def evaluate(
             time_spent=f"{implementation.running_info.running_time:.2f} seconds",
             timeout=f"{env.conf.running_timeout_period} seconds",
             percent_of_timeout_used=f"{time_spent_ratio * 100:.2f}%",
+            queried_former_failed_knowledge=queried_former_failed_knowledge,
         )
 
         feedback = build_cls_from_json_with_retry(
             DSRunnerFeedback,
             system_prompt=system_prompt,
             user_prompt=user_prompt,
-            init_kwargs_update_func=DSRunnerFeedback.val_and_update_init_dict,
+            # init_kwargs_update_func=DSRunnerFeedback.val_and_update_init_dict,
         )
+        feedback.score = score_df.to_string() if score_ret_code == 0 else None
+        feedback.final_decision = feedback.acceptable and (
+            not feedback.hyperparameter_tuning_decision
+        )  # If hyperparameter_tuning_decision is None, it is treated as False, so final_decision depends only on acceptable
 
         if feedback and not DS_RD_SETTING.coder_on_whole_pipeline:
             # remove unused files
diff --git a/rdagent/scenarios/data_science/dev/runner/prompts.yaml b/rdagent/scenarios/data_science/dev/runner/prompts.yaml
@@ -25,13 +25,10 @@ DSCoSTEER_eval:
     3. Confirm that the prediction file (`submission.csv`) is generated using only the test dataset, and its format matches the sample submission.
     If the code does not satisfy the requirements:
     - Set "acceptable" to false.
-    - Set "final_decision" to false.
-    {% if enable_hyperparameter_tuning_check %}- set "hyperparameter_tuning_decision" to false.
-    - Set "hyperparameter_tuning_suggestion" to an empty string.
     If the code satisfy the requirements:
     - Set "acceptable" to true.
-    - Proceed to the next evaluation.
 
+    {% if enable_hyperparameter_tuning_check %}
     # Evaluation 2: Hyperparameter
     ## Evaluation Description
     The user will provide you the time spent on the whole code execution and the timeout of the code execution. You should decide whether the hyperparameter is reasonable based on the time.
@@ -45,8 +42,7 @@ DSCoSTEER_eval:
     3. Your suggestion should have a strong chance of improving the model's performance. Focus on the most obvious and impactful opportunities for quick improvement by leveraging more training time. Don't explore hyperparameters with low confidence.  If there are no obvious and impactful opportunities and the code runs well, please accept it.
     If the code satisfy the requirements:
     - Set "hyperparameter_tuning_decision" to true.
-    - Set "final_decision" to false.
-    - Provide a reasonable suggestion in "hyperparameter_tuning_suggestion". The "hyperparameter_tuning_suggestion" should begin with a clear observation, followed by your suggestion. For example: "[Observation] The maximum number of epochs was reached, but the validation loss is still going down and early stopping was not activated. Only 15% of the allowed time was used. [Suggestion] We recommend increasing epochs to 100 to avoid underfitting and further improve model performance."
+    - In "hyperparameter_tuning_suggestion", provide a clear, specific, and actionable suggestion. Begin with a concrete observation, then state a direct action to take. Do not use vague language, options, or uncertainty (avoid words like "A or B"). For example: "[Observation] The maximum number of epochs was reached, but the validation loss is still decreasing and early stopping was not activated. Only 15% of the allowed time was used. [Suggestion] Increase epochs to 100 to avoid underfitting and further improve model performance."
     If the code does not satisfy the requirements:
     - Set "hyperparameter_tuning_decision" to false.
     - Set "hyperparameter_tuning_suggestion" to an empty string.
@@ -59,10 +55,11 @@ DSCoSTEER_eval:
         "execution": "Describe whether the whole code base executed successfully and generating the final submission. Include any errors or issues encountered, and retain all error messages and traceback details.",
         "return_checking": "Verify the generated files, particularly the submission file. Ensure that its format matches the sample submission",
         "code": "Provide feedback on code quality, readability, and adherence to the given specifications.",
-        "acceptable": <true/false: if the solution has paased execution, return_checking, and code verification, then it is a valid solution and acceptable. Otherwise it is not acceptable.>,{% if enable_hyperparameter_tuning_check %}
+        "acceptable": <true/false: if the solution has passed execution, return_checking, and code verification, then it is a valid solution and acceptable. Otherwise it is not acceptable.>,
+        {% if enable_hyperparameter_tuning_check %}
         "hyperparameter_tuning_decision": <true/false>,
-        "hyperparameter_tuning_suggestion": <suggestion in plain text for hyperparameter tuning>,{% endif %}
-        "final_decision": <true/false>,
+        "hyperparameter_tuning_suggestion": <suggestion in plain text for hyperparameter tuning>,
+        {% endif %}
     }
     ```
     {% else %}
@@ -101,28 +98,35 @@ DSCoSTEER_eval:
         "acceptable": <true/false: if the solution has paased execution, return_checking, and code verification, then it is a valid solution and acceptable. Otherwise it is not acceptable.>,
         {% if enable_hyperparameter_tuning_check %}"hyperparameter_tuning_decision": <true/false>,
         "hyperparameter_tuning_suggestion": <suggestion in plain text for hyperparameter tuning>,{% endif %}
-        "final_decision": <true/false>,
     }
     ```
     {% endif %}
 # NOTE: when is_sub_enabled == False, we don't have any checking about the return. So it is just placeholder currently
 
   user: |-
-    # Code base
+    # Current Code base
     {{ code }}
+    {% if change_summary is not none %}
+    # Current Code Change Summary
+    {{ change_summary }}{% endif %}
 
     ## Stdout of code execution and testing
     {{ stdout }}
 
-    # The time spend on code execution and timeout
-    {{ time_spent }}
-
-    ## The timeout of code execution
-    {{ timeout }}
-
-    ## The percent of timeout used
-    {{ percent_of_timeout_used }}
-
+    ## Execution time and timeout
+    The execution time for current code base: {{ time_spent }}.
+    The total timeout: {{ timeout }}.
+    The percent of timeout used: {{ percent_of_timeout_used }}.
+    
+    {% if queried_former_failed_knowledge|length != 0 %}
+    # Evolving History
+    {% for former_failed_knowledge in queried_former_failed_knowledge %}## Attempt {{ loop.index }}:
+    ### Summary of Changes
+    {{ former_failed_knowledge.implementation.change_summary }}
+    {{ former_failed_knowledge.feedback }}
+    {% endfor %}
+    {% endif %}
+    
 DSCoSTEER:
   system_debugger: |-
     {% include "scenarios.data_science.share:scen.role" %}
@@ -132,7 +136,6 @@ DSCoSTEER:
     1. Code base.
     2. Task description, which is the task the code is trying to solve.
     3. Feedback generated during the execution of the whole workflow.
-    4. Suggestions for hyperparameter tuning.
     Your job is to debug the whole code base, try to correct the errors, and ensure that the workflow can execute successfully on the full dataset.
 
     ## Task description
@@ -185,13 +188,23 @@ DSCoSTEER:
     {% endif %}
 
   user: |-
-    # Code Base
+    # Current Code Base
     {{ code }}
 
-    ## Feedback
+    ## Feedback of Current Code Base
     {{ feedback }}
 
     {% if hyperparameter_tuning_suggestion is not none %}
     ## Hyperparameter Tuning Suggestion
     {{ hyperparameter_tuning_suggestion }}
     {% endif %}
+
+    {% if queried_former_failed_knowledge|length != 0 %}
+    # Evolving History
+    {% for former_failed_knowledge in queried_former_failed_knowledge %}## Attempt {{ loop.index }}:
+    ### Summary of Changes
+    {{ former_failed_knowledge.implementation.change_summary }}
+    ### Validation Scores
+    {{ former_failed_knowledge.feedback.score }}
+    {% endfor %}
+    {% endif %}