Skip to content

Commit 8de9f75

Browse files
RolandMinruiXupeteryang1
authored
feat: add previous runner loops to runner history (#1142)
* add prev loops to runner history * fix evolving history * fix bug on initializing feedback without final decision * reformat * refine * add comments * fix ci * a little refinement * fix CI --------- Co-authored-by: Xu <[email protected]> Co-authored-by: Xu Yang <[email protected]>
1 parent 234cd58 commit 8de9f75

File tree

9 files changed

+124
-70
lines changed

9 files changed

+124
-70
lines changed

rdagent/app/data_science/conf.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,5 +133,8 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
133133
"""Enable hypothesis critique and rewrite stages for improving hypothesis quality"""
134134
enable_scale_check: bool = False
135135

136+
#### enable runner code change summary
137+
runner_enable_code_change_summary: bool = True
138+
136139

137140
DS_RD_SETTING = DataScienceBasePropSetting()

rdagent/components/coder/CoSTEER/evaluators.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ class CoSTEERSingleFeedback(Feedback):
4242
return_checking: str | None # including every check in the testing (constraints about the generated value)
4343
# value_feedback, shape_feedback, value_generated_flag
4444
code: str
45-
final_decision: bool
45+
final_decision: bool | None = None
4646

4747
@staticmethod
4848
def val_and_update_init_dict(data: dict) -> dict:

rdagent/components/coder/CoSTEER/evolving_strategy.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919

2020

2121
class MultiProcessEvolvingStrategy(EvolvingStrategy):
22+
KEY_CHANGE_SUMMARY = "__change_summary__" # Optional key for the summary of the change of evolving subjects
23+
2224
def __init__(self, scen: Scenario, settings: CoSTEERSettings):
2325
super().__init__(scen)
2426
self.settings = settings
@@ -51,6 +53,7 @@ def implement_one_task(
5153
Return
5254
------
5355
The new files {<filename>: <content>} to update the workspace.
56+
- Special keys: self.KEY_CHANGE_SUMMARY (optional; holds the summary of the change of the evolving subjects).
5457
"""
5558
raise NotImplementedError
5659

rdagent/components/coder/factor_coder/config.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,7 @@
44
from pydantic_settings import SettingsConfigDict
55

66
from rdagent.components.coder.CoSTEER.config import CoSTEERSettings
7-
from rdagent.utils.env import (
8-
CondaConf,
9-
Env,
10-
LocalEnv,
11-
)
7+
from rdagent.utils.env import CondaConf, Env, LocalEnv
128

139

1410
class FactorCoSTEERSettings(CoSTEERSettings):

rdagent/components/coder/model_coder/conf.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,7 @@
33
from pydantic_settings import SettingsConfigDict
44

55
from rdagent.components.coder.CoSTEER.config import CoSTEERSettings
6-
from rdagent.utils.env import (
7-
Env,
8-
QlibCondaConf,
9-
QlibCondaEnv,
10-
QTDockerEnv,
11-
)
6+
from rdagent.utils.env import Env, QlibCondaConf, QlibCondaEnv, QTDockerEnv
127

138

149
class ModelCoSTEERSettings(CoSTEERSettings):

rdagent/core/experiment.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -153,8 +153,8 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
153153
{}
154154
) # The code injected into the folder, store them in the variable to reproduce the former result
155155
self.workspace_path: Path = RD_AGENT_SETTINGS.workspace_path / uuid.uuid4().hex
156-
# In-memory checkpoint data created by ``create_ws_ckp``.
157-
self.ws_ckp: bytes | None = None
156+
self.ws_ckp: bytes | None = None # In-memory checkpoint data created by ``create_ws_ckp``.
157+
self.change_summary: str | None = None # The change from the previous version of workspace
158158

159159
@staticmethod
160160
def _format_code_dict(code_dict: dict[str, str]) -> str:
@@ -343,13 +343,12 @@ def recover_ws_ckp(self) -> None:
343343
dest_path.parent.mkdir(parents=True, exist_ok=True)
344344
link_target = zf.read(info).decode()
345345
os.symlink(link_target, dest_path)
346+
elif info.is_dir():
347+
dest_path.mkdir(parents=True, exist_ok=True)
346348
else:
347-
if info.is_dir():
348-
dest_path.mkdir(parents=True, exist_ok=True)
349-
else:
350-
dest_path.parent.mkdir(parents=True, exist_ok=True)
351-
with dest_path.open("wb") as f:
352-
f.write(zf.read(info))
349+
dest_path.parent.mkdir(parents=True, exist_ok=True)
350+
with dest_path.open("wb") as f:
351+
f.write(zf.read(info))
353352
# NOTE: very important to reduce the size of the object
354353
self.ws_ckp = None
355354

rdagent/scenarios/data_science/dev/runner/__init__.py

Lines changed: 41 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -51,15 +51,19 @@ def implement_one_task(
5151
# If there is no prev_task_feedback, it is the first loop; we do not make any changes and go directly to the evaluators.
5252
return {}
5353

54-
# Output Agent Map
55-
output_map = {
56-
True: (PythonBatchPatchOut.get_spec(), PythonBatchPatchOut.extract_output),
57-
False: (
58-
PythonBatchEditOut.get_spec(with_del=False),
59-
PythonBatchEditOut.extract_output,
60-
),
61-
}
62-
output_spec, extract_output_fn = output_map[self.settings.diff_mode]
54+
# Get evolving history
55+
task_info = target_task.get_task_information()
56+
queried_former_failed_knowledge = (
57+
queried_knowledge.task_to_former_failed_traces[task_info] if queried_knowledge is not None else []
58+
)[0]
59+
60+
# Set output agent
61+
if self.settings.diff_mode:
62+
output_spec = PythonBatchPatchOut.get_spec()
63+
extract_output_fn = PythonBatchPatchOut.extract_output
64+
else:
65+
output_spec = PythonBatchEditOut.get_spec(with_del=False)
66+
extract_output_fn = PythonBatchEditOut.extract_output
6367

6468
if prev_task_feedback.acceptable is False:
6569
task_information_str = target_task.get_task_information()
@@ -76,32 +80,41 @@ def implement_one_task(
7680
diff_mode=self.settings.diff_mode,
7781
)
7882

79-
# Generate user prompt for both cases
83+
# Start multi-turn chat session
84+
session = APIBackend().build_chat_session(
85+
session_system_prompt=system_prompt,
86+
)
87+
88+
# Code
8089
user_prompt = T(".prompts:DSCoSTEER.user").r(
8190
code=workspace.all_codes,
91+
change_summary=workspace.change_summary,
8292
feedback=prev_task_feedback,
83-
hyperparameter_tuning_suggestion=prev_task_feedback.hyperparameter_tuning_suggestion,
93+
hyperparameter_tuning_suggestion=(
94+
prev_task_feedback.hyperparameter_tuning_suggestion if prev_task_feedback.acceptable else None
95+
),
96+
queried_former_failed_knowledge=queried_former_failed_knowledge,
8497
)
8598

99+
code = session.build_chat_completion(user_prompt=user_prompt)
86100
if self.settings.diff_mode:
87-
batch_edit = extract_output_fn(
88-
APIBackend().build_messages_and_create_chat_completion(
89-
user_prompt=user_prompt,
90-
system_prompt=system_prompt,
91-
),
92-
prefix=workspace.workspace_path,
93-
)
101+
code_batch_edit = extract_output_fn(code, prefix=workspace.workspace_path)
94102
else:
95-
batch_edit = extract_output_fn(
96-
APIBackend().build_messages_and_create_chat_completion(
97-
user_prompt=user_prompt,
98-
system_prompt=system_prompt,
99-
)
103+
code_batch_edit = extract_output_fn(code)
104+
code_batch_edit = {k: v for k, v in code_batch_edit.items() if k in workspace.file_dict.keys()}
105+
106+
if DS_RD_SETTING.runner_enable_code_change_summary:
107+
# Change Summary
108+
user_prompt = (
109+
"Based on the previous conversation and your latest code modifications, "
110+
"please provide a concise and structured summary of the changes you made to the original code. "
111+
"Clearly specify what was changed and how, focusing on key modifications. "
112+
"Limit your summary to plain text, no more than three sentences."
100113
)
114+
change_summary = session.build_chat_completion(user_prompt=user_prompt)
115+
code_batch_edit.update({"__change_summary__": change_summary})
101116

102-
batch_edit = {k: v for k, v in batch_edit.items() if k in workspace.file_dict.keys()}
103-
104-
return batch_edit
117+
return code_batch_edit
105118

106119
def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo):
107120
"""
@@ -116,6 +129,8 @@ def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo):
116129
if evo.sub_workspace_list[index] is None:
117130
# evo.sub_workspace_list[index] = FBWorkspace(target_task=evo.sub_tasks[index])
118131
evo.sub_workspace_list[index] = evo.experiment_workspace
132+
if self.KEY_CHANGE_SUMMARY in code_list[index]:
133+
evo.sub_workspace_list[index].change_summary = code_list[index].pop(self.KEY_CHANGE_SUMMARY)
119134
evo.sub_workspace_list[index].inject_files(**code_list[index])
120135
return evo
121136

rdagent/scenarios/data_science/dev/runner/eval.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,12 +37,31 @@ class DSRunnerFeedback(CoSTEERSingleFeedback):
3737
acceptable: bool | None = None
3838
hyperparameter_tuning_decision: bool | None = None
3939
hyperparameter_tuning_suggestion: str | None = None
40+
score: str | None = None
4041

4142
def is_acceptable(self) -> bool:
4243
if self.acceptable is not None:
4344
return self.acceptable
4445
return super().is_acceptable()
4546

47+
def __str__(self) -> str:
48+
parts = [
49+
"### Execution",
50+
str(self.execution),
51+
"### Return Check",
52+
self.return_checking if self.return_checking is not None else "No return checking",
53+
"### Code",
54+
str(self.code),
55+
"### Validation Score",
56+
f"{self.score}" if self.score else "Not available",
57+
"### Final Decision",
58+
f"This implementation is {'PASSED' if self.acceptable else 'FAILED'}.",
59+
]
60+
if self.hyperparameter_tuning_decision:
61+
parts.append("### Hyperparameter Tuning Suggestion")
62+
parts.append(str(self.hyperparameter_tuning_suggestion))
63+
return "\n".join(parts)
64+
4665

4766
DSCoSTEEREvalFeedback = DSRunnerFeedback # FIXME: Alias for backward compatibility
4867

@@ -77,6 +96,12 @@ def evaluate(
7796
env=env, entry=get_clear_ws_cmd()
7897
) # Remove previous submission and scores files generated by the workflow.
7998

99+
# get previous runner loops
100+
task_info = target_task.get_task_information()
101+
queried_former_failed_knowledge = (
102+
queried_knowledge.task_to_former_failed_traces[task_info] if queried_knowledge is not None else []
103+
)[0]
104+
80105
# execute workflow
81106
result = implementation.run(env=env, entry="python -m coverage run main.py")
82107
stdout = result.stdout
@@ -164,14 +189,19 @@ def evaluate(
164189
time_spent=f"{implementation.running_info.running_time:.2f} seconds",
165190
timeout=f"{env.conf.running_timeout_period} seconds",
166191
percent_of_timeout_used=f"{time_spent_ratio * 100:.2f}%",
192+
queried_former_failed_knowledge=queried_former_failed_knowledge,
167193
)
168194

169195
feedback = build_cls_from_json_with_retry(
170196
DSRunnerFeedback,
171197
system_prompt=system_prompt,
172198
user_prompt=user_prompt,
173-
init_kwargs_update_func=DSRunnerFeedback.val_and_update_init_dict,
199+
# init_kwargs_update_func=DSRunnerFeedback.val_and_update_init_dict,
174200
)
201+
feedback.score = score_df.to_string() if score_ret_code == 0 else None
202+
feedback.final_decision = feedback.acceptable and (
203+
not feedback.hyperparameter_tuning_decision
204+
) # If hyperparameter_tuning_decision is None, it is treated as False, so final_decision depends only on acceptable
175205

176206
if feedback and not DS_RD_SETTING.coder_on_whole_pipeline:
177207
# remove unused files

rdagent/scenarios/data_science/dev/runner/prompts.yaml

Lines changed: 36 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,10 @@ DSCoSTEER_eval:
2525
3. Confirm that the prediction file (`submission.csv`) is generated using only the test dataset, and its format matches the sample submission.
2626
If the code does not satisfy the requirements:
2727
- Set "acceptable" to false.
28-
- Set "final_decision" to false.
29-
{% if enable_hyperparameter_tuning_check %}- set "hyperparameter_tuning_decision" to false.
30-
- Set "hyperparameter_tuning_suggestion" to an empty string.
3128
If the code satisfies the requirements:
3229
- Set "acceptable" to true.
33-
- Proceed to the next evaluation.
3430
31+
{% if enable_hyperparameter_tuning_check %}
3532
# Evaluation 2: Hyperparameter
3633
## Evaluation Description
3734
The user will provide you with the time spent on the whole code execution and the timeout of the code execution. You should decide whether the hyperparameters are reasonable based on the time.
@@ -45,8 +42,7 @@ DSCoSTEER_eval:
4542
3. Your suggestion should have a strong chance of improving the model's performance. Focus on the most obvious and impactful opportunities for quick improvement by leveraging more training time. Don't explore hyperparameters with low confidence. If there are no obvious and impactful opportunities and the code runs well, please accept it.
4643
If the code satisfies the requirements:
4744
- Set "hyperparameter_tuning_decision" to true.
48-
- Set "final_decision" to false.
49-
- Provide a reasonable suggestion in "hyperparameter_tuning_suggestion". The "hyperparameter_tuning_suggestion" should begin with a clear observation, followed by your suggestion. For example: "[Observation] The maximum number of epochs was reached, but the validation loss is still going down and early stopping was not activated. Only 15% of the allowed time was used. [Suggestion] We recommend increasing epochs to 100 to avoid underfitting and further improve model performance."
45+
- In "hyperparameter_tuning_suggestion", provide a clear, specific, and actionable suggestion. Begin with a concrete observation, then state a direct action to take. Do not use vague language, options, or uncertainty (avoid words like "A or B"). For example: "[Observation] The maximum number of epochs was reached, but the validation loss is still decreasing and early stopping was not activated. Only 15% of the allowed time was used. [Suggestion] Increase epochs to 100 to avoid underfitting and further improve model performance."
5046
If the code does not satisfy the requirements:
5147
- Set "hyperparameter_tuning_decision" to false.
5248
- Set "hyperparameter_tuning_suggestion" to an empty string.
@@ -59,10 +55,11 @@ DSCoSTEER_eval:
5955
"execution": "Describe whether the whole code base executed successfully and generated the final submission. Include any errors or issues encountered, and retain all error messages and traceback details.",
6056
"return_checking": "Verify the generated files, particularly the submission file. Ensure that its format matches the sample submission",
6157
"code": "Provide feedback on code quality, readability, and adherence to the given specifications.",
62-
"acceptable": <true/false: if the solution has paased execution, return_checking, and code verification, then it is a valid solution and acceptable. Otherwise it is not acceptable.>,{% if enable_hyperparameter_tuning_check %}
58+
"acceptable": <true/false: if the solution has passed execution, return_checking, and code verification, then it is a valid solution and acceptable. Otherwise it is not acceptable.>,
59+
{% if enable_hyperparameter_tuning_check %}
6360
"hyperparameter_tuning_decision": <true/false>,
64-
"hyperparameter_tuning_suggestion": <suggestion in plain text for hyperparameter tuning>,{% endif %}
65-
"final_decision": <true/false>,
61+
"hyperparameter_tuning_suggestion": <suggestion in plain text for hyperparameter tuning>,
62+
{% endif %}
6663
}
6764
```
6865
{% else %}
@@ -101,28 +98,35 @@ DSCoSTEER_eval:
10198
"acceptable": <true/false: if the solution has passed execution, return_checking, and code verification, then it is a valid solution and acceptable. Otherwise it is not acceptable.>,
10299
{% if enable_hyperparameter_tuning_check %}"hyperparameter_tuning_decision": <true/false>,
103100
"hyperparameter_tuning_suggestion": <suggestion in plain text for hyperparameter tuning>,{% endif %}
104-
"final_decision": <true/false>,
105101
}
106102
```
107103
{% endif %}
108104
# NOTE: when is_sub_enabled == False, we don't have any checking about the return. So it is just placeholder currently
109105

110106
user: |-
111-
# Code base
107+
# Current Code base
112108
{{ code }}
109+
{% if change_summary is not none %}
110+
# Current Code Change Summary
111+
{{ change_summary }}{% endif %}
113112
114113
## Stdout of code execution and testing
115114
{{ stdout }}
116115
117-
# The time spend on code execution and timeout
118-
{{ time_spent }}
119-
120-
## The timeout of code execution
121-
{{ timeout }}
122-
123-
## The percent of timeout used
124-
{{ percent_of_timeout_used }}
125-
116+
## Execution time and timeout
117+
The execution time for current code base: {{ time_spent }}.
118+
The total timeout: {{ timeout }}.
119+
The percent of timeout used: {{ percent_of_timeout_used }}.
120+
121+
{% if queried_former_failed_knowledge|length != 0 %}
122+
# Evolving History
123+
{% for former_failed_knowledge in queried_former_failed_knowledge %}## Attempt {{ loop.index }}:
124+
### Summary of Changes
125+
{{ former_failed_knowledge.implementation.change_summary }}
126+
{{ former_failed_knowledge.feedback }}
127+
{% endfor %}
128+
{% endif %}
129+
126130
DSCoSTEER:
127131
system_debugger: |-
128132
{% include "scenarios.data_science.share:scen.role" %}
@@ -132,7 +136,6 @@ DSCoSTEER:
132136
1. Code base.
133137
2. Task description, which is the task the code is trying to solve.
134138
3. Feedback generated during the execution of the whole workflow.
135-
4. Suggestions for hyperparameter tuning.
136139
Your job is to debug the whole code base, try to correct the errors, and ensure that the workflow can execute successfully on the full dataset.
137140
138141
## Task description
@@ -185,13 +188,23 @@ DSCoSTEER:
185188
{% endif %}
186189
187190
user: |-
188-
# Code Base
191+
# Current Code Base
189192
{{ code }}
190193
191-
## Feedback
194+
## Feedback of Current Code Base
192195
{{ feedback }}
193196
194197
{% if hyperparameter_tuning_suggestion is not none %}
195198
## Hyperparameter Tuning Suggestion
196199
{{ hyperparameter_tuning_suggestion }}
197200
{% endif %}
201+
202+
{% if queried_former_failed_knowledge|length != 0 %}
203+
# Evolving History
204+
{% for former_failed_knowledge in queried_former_failed_knowledge %}## Attempt {{ loop.index }}:
205+
### Summary of Changes
206+
{{ former_failed_knowledge.implementation.change_summary }}
207+
### Validation Scores
208+
{{ former_failed_knowledge.feedback.score }}
209+
{% endfor %}
210+
{% endif %}

0 commit comments

Comments
 (0)