Skip to content

Commit c62e5fc

Browse files
RolandMinruijingyuanlmXuJensen246peteryang1
authored
fix: refine DSCoSTEER_eval prompts (#1157)
* change DSCoSTEER_eval prompts * fallback to better exp only * fix fallback * fix and reformat * fix bug when base_fb is None * add reasoning to hyperparameter evaluation * feat: add acceptable assessment in exp_feedback (#1159) * add time * refine eval prompt and make the logic of tuning check more clear * some refinement * fix CI * fix a small bug, only consider score in runner * refine comment * simplify compare function --------- Co-authored-by: jingyuanlm <[email protected]> Co-authored-by: Xu <[email protected]> Co-authored-by: Jensen Lee <[email protected]> Co-authored-by: Xu Yang <[email protected]>
1 parent fc0df6e commit c62e5fc

File tree

8 files changed

+121
-34
lines changed

8 files changed

+121
-34
lines changed

rdagent/app/kaggle/conf.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,5 +78,11 @@ class KaggleBasePropSetting(ExtendedBaseSettings):
7878
time_ratio_limit_to_enable_hyperparameter_tuning: float = 1
7979
"""Time ratio limit to enable hyperparameter tuning, if not change, hyperparameter tuning is always enabled in the first evolution."""
8080

81+
overall_time_ratio_limit_to_enable_hyperparameter_tuning: float = 0
82+
"""Overall remaining-time ratio limit for hyperparameter tuning: tuning is enabled only while (remaining time / total time) is at least this value. The default of 0 leaves tuning always enabled."""
83+
84+
only_enable_tuning_in_merge: bool = False
85+
"""Whether to only enable hyperparameter tuning during the merge stage"""
86+
8187

8288
KAGGLE_IMPLEMENT_SETTING = KaggleBasePropSetting()

rdagent/components/coder/CoSTEER/__init__.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,17 @@ def _get_last_fb(self) -> CoSTEERMultiFeedback:
7777
assert isinstance(fb, CoSTEERMultiFeedback), "feedback must be of type CoSTEERMultiFeedback"
7878
return fb
7979

80+
def compare_and_pick_fb(self, base_fb: CoSTEERMultiFeedback | None, new_fb: CoSTEERMultiFeedback | None) -> bool:
81+
"""
82+
Compare new feedback with the fallback feedback.
83+
84+
Returns:
85+
bool: True if the new feedback is better, and False if it is worse or invalid.
86+
"""
87+
if new_fb is not None and new_fb.is_acceptable():
88+
return True
89+
return False
90+
8091
def develop(self, exp: Experiment) -> Experiment:
8192

8293
# init intermediate items
@@ -97,11 +108,18 @@ def develop(self, exp: Experiment) -> Experiment:
97108
# Evolving the solution
98109
start_datetime = datetime.now()
99110
fallback_evo_exp = None
111+
fallback_evo_fb = None
100112
reached_max_seconds = False
101113
for evo_exp in self.evolve_agent.multistep_evolve(evo_exp, self.evaluator):
102114
assert isinstance(evo_exp, Experiment) # multiple inheritance
103-
if self._get_last_fb().is_acceptable():
115+
evo_fb = self._get_last_fb()
116+
fallback_decision = self.compare_and_pick_fb(
117+
base_fb=fallback_evo_fb,
118+
new_fb=evo_fb,
119+
)
120+
if fallback_decision:
104121
fallback_evo_exp = deepcopy(evo_exp)
122+
fallback_evo_fb = deepcopy(evo_fb)
105123
fallback_evo_exp.create_ws_ckp() # NOTE: creating checkpoints for saving files in the workspace to prevent inplace mutation.
106124

107125
logger.log_object(evo_exp.sub_workspace_list, tag="evolving code")

rdagent/core/proposal.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ def __init__(
104104
code_change_summary: str | None = None,
105105
decision: bool,
106106
eda_improvement: str | None = None,
107+
acceptable: bool | None = None,
107108
) -> None:
108109
super().__init__(
109110
reason,
@@ -114,6 +115,7 @@ def __init__(
114115
self.observations = observations
115116
self.hypothesis_evaluation = hypothesis_evaluation
116117
self.new_hypothesis = new_hypothesis
118+
self.acceptable = acceptable
117119

118120
def __str__(self) -> str:
119121
return f"""{super().__str__()}

rdagent/scenarios/data_science/dev/feedback.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ def generate_feedback(self, exp: DSExperiment, trace: DSTrace) -> ExperimentFeed
109109
else convert2bool(dict_get_with_warning(resp_dict, "Replace Best Result", "no"))
110110
),
111111
eda_improvement=dict_get_with_warning(resp_dict, "EDA Improvement", "no"), # EDA improvement suggestion
112+
acceptable=convert2bool(dict_get_with_warning(resp_dict, "Acceptable", "no")),
112113
)
113114

114115
if hypothesis_feedback and DS_RD_SETTING.enable_knowledge_base:

rdagent/scenarios/data_science/dev/prompts.yaml

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,15 +64,26 @@ exp_feedback:
6464
- You should provide your feedback based on the current code and SOTA code. Especially focus on the feature engineering part.
6565
- For example, if the code truncate the line with N words, you can suggest to print the mean, median or quantile of the length of the line for better understanding of the data in the next rounds of experiments.
6666
67+
Step 6: Overall Acceptability Assessment
68+
69+
- Determine the overall acceptability of the experiment based on the comprehensive evaluation from previous steps:
70+
- Set `"Acceptable": "yes"` ONLY if ALL of the following conditions are met:
71+
* Step 1: Submission format is valid
72+
* Step 2: Evaluation methodology is aligned with competition requirements
73+
* Step 4: Current code demonstrates clear improvements over SOTA (better practices, efficiency, or interpretability)
74+
- Set `"Acceptable": "no"` if ANY of the above conditions fail
75+
- This acceptability assessment serves as a final quality gate to ensure only truly valuable experiments are accepted
76+
6777
Provide detailed and constructive feedback structured as follows without anything else:
6878
{
6979
"Submission Format Check": "yes or no",
7080
"First Valid Submission": "yes or no",
7181
"Code Change Summary": "Clearly summarize the changes made to the code (please cover the most important changes while being concise); during development, extra modifications may be made beyond the intent of the hypothesis, so these changes should also be included to provide complete information",
7282
"Observations": "Clearly summarize current and SOTA ensemble results with exact scores and notable patterns. Limit to no more than three concise, data-focused sentences. Your observation must be grounded by explicit evidence from scenario description or code implementation, not just validation scores.",
73-
"Feedback for Hypothesis": Explicitly confirm or refute the hypothesis based on specific data points or performance trends. Limit to two sentences.",
83+
"Feedback for Hypothesis": "Explicitly confirm or refute the hypothesis based on specific data points or performance trends. Limit to two sentences.",
7484
"Evaluation Aligned With Task": "yes or no",
7585
"Replace Best Result": "yes or no",
86+
"Acceptable": "yes or no",
7687
"Reasoning": "Clearly explain the reason for success or failure of the experiment. Begin explicitly with [Submission format error], [Evaluation error], [Experiment Analysis] or [Code Analysis] depending on the step at which issues arose. Reference specific scores and methodological differences with SOTA. Limit to three sentences.",
7788
"EDA Improvement": "improvement suggestion for EDA code, if needed, otherwise set to 'no'. If there is no EDA code, set to 'no'."
7889
}

rdagent/scenarios/data_science/dev/runner/__init__.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
import pandas as pd
22

33
from rdagent.app.data_science.conf import DS_RD_SETTING
4-
from rdagent.components.coder import CoSTEER
54
from rdagent.components.coder.CoSTEER import CoSTEER
65
from rdagent.components.coder.CoSTEER.config import CoSTEERSettings
76
from rdagent.components.coder.CoSTEER.evaluators import (
87
CoSTEERMultiEvaluator,
8+
CoSTEERMultiFeedback,
99
CoSTEERSingleFeedback,
1010
)
1111
from rdagent.components.coder.CoSTEER.evolvable_subjects import FBWorkspace
@@ -171,6 +171,24 @@ def get_develop_max_seconds(self) -> int | None:
171171
"""
172172
return int(self.scen.real_full_timeout() * self.settings.max_seconds_multiplier)
173173

174+
def compare_and_pick_fb(self, base_fb: CoSTEERMultiFeedback | None, new_fb: CoSTEERMultiFeedback | None) -> bool:
175+
# In data science, we only have a single feedback.
176+
# Note: new_fb should always exist, as guaranteed by the _get_last_fb() function.
177+
if base_fb is None:
178+
return True
179+
180+
base_fb = base_fb[0]
181+
new_fb = new_fb[0]
182+
183+
def compare_scores(s1, s2) -> bool:
184+
if s2 is None:
185+
return False
186+
if s1 is None:
187+
return True
188+
return (s2 > s1) == self.scen.metric_direction
189+
190+
return compare_scores(base_fb.score, new_fb.score)
191+
174192
def develop(self, exp):
175193
bak_sub_tasks = exp.pending_tasks_list
176194
exp.sub_tasks = [

rdagent/scenarios/data_science/dev/runner/eval.py

Lines changed: 29 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import json
22
import re
33
from dataclasses import dataclass
4+
from datetime import timedelta
45
from pathlib import Path
56

67
import pandas as pd
@@ -15,6 +16,7 @@
1516
from rdagent.core.evolving_framework import QueriedKnowledge
1617
from rdagent.core.experiment import FBWorkspace, Task
1718
from rdagent.log import rdagent_logger as logger
19+
from rdagent.log.timer import RD_Agent_TIMER_wrapper
1820
from rdagent.scenarios.data_science.test_eval import (
1921
MLETestEval,
2022
NoTestEvalError,
@@ -155,28 +157,33 @@ def evaluate(
155157
submission_check_out = ""
156158
submission_ret_code = 0
157159
test_eval = get_test_eval()
158-
160+
timer = RD_Agent_TIMER_wrapper.timer
159161
if test_eval.enabled(self.scen.competition):
160162
submission_check_out, submission_ret_code = test_eval.valid(self.scen.competition, implementation)
161163
stdout += f"\n### Submission check:\n{submission_check_out}\nIf Submission check returns a 'Submission is valid' or similar message, despite some warning messages, you should still consider the submission as valid and give a positive final decision. "
162164

165+
# Whether to enable hyperparameter tuning check
166+
# 1. This is the first evaluation.
167+
c1 = len(queried_knowledge.task_to_former_failed_traces[target_task.get_task_information()][0]) == 0
168+
169+
# 2. The current time spent on the runner is less than the time_ratio_limit_to_enable_hyperparameter_tuning.
163170
time_spent_ratio = implementation.running_info.running_time / env.conf.running_timeout_period
164-
# Only enable hyperparameter tuning on the first evaluation.
165-
# Avoid too much time consuming.
166-
enable_hyperparameter_tuning_check = False
167-
if len(queried_knowledge.task_to_former_failed_traces[target_task.get_task_information()][0]) == 0 and (
168-
time_spent_ratio < DS_RD_SETTING.time_ratio_limit_to_enable_hyperparameter_tuning
169-
):
170-
enable_hyperparameter_tuning_check = True
171-
172-
if (
173-
DS_RD_SETTING.time_ratio_limit_to_enable_hyperparameter_tuning is not None
174-
and time_spent_ratio > DS_RD_SETTING.time_ratio_limit_to_enable_hyperparameter_tuning
175-
):
176-
enable_hyperparameter_tuning_check = False
177-
logger.info(
178-
f"Time spent ratio {time_spent_ratio:.2f} exceeds the limit {DS_RD_SETTING.time_ratio_limit_to_enable_hyperparameter_tuning}, hyperparameter tuning is disabled."
179-
)
171+
c2 = time_spent_ratio < DS_RD_SETTING.time_ratio_limit_to_enable_hyperparameter_tuning
172+
173+
# 3. Only enable hyperparameter tuning during the merge stage if configured.
174+
if DS_RD_SETTING.only_enable_tuning_in_merge:
175+
c3 = timer.remain_time() >= timedelta(hours=DS_RD_SETTING.merge_hours)
176+
else:
177+
c3 = True
178+
179+
# 4. If we set an overall hyperparameter tuning time ratio limit, only enable tuning if enough overall time remains.
180+
res_time = RD_Agent_TIMER_wrapper.timer.remain_time()
181+
total_time = RD_Agent_TIMER_wrapper.timer.all_duration
182+
res_ratio = res_time / total_time
183+
c4 = res_ratio >= DS_RD_SETTING.overall_time_ratio_limit_to_enable_hyperparameter_tuning
184+
185+
# Only enable hyperparameter tuning check if all 4 criteria are met.
186+
enable_hyperparameter_tuning_check = c1 and c2 and c3 and c4
180187

181188
system_prompt = T(".prompts:DSCoSTEER_eval.system").r(
182189
scenario=self.scen.get_scenario_all_desc(eda_output=implementation.file_dict.get("EDA.md", None)),
@@ -199,7 +206,11 @@ def evaluate(
199206
user_prompt=user_prompt,
200207
# init_kwargs_update_func=DSRunnerFeedback.val_and_update_init_dict,
201208
)
202-
feedback.score = score_df.to_string() if score_ret_code == 0 else None
209+
try:
210+
feedback.score = score_df.loc["ensemble"].iloc[0] if score_ret_code == 0 else None
211+
except:
212+
logger.error("Failed to get the score from scores.csv.")
213+
feedback.score = None
203214
feedback.final_decision = feedback.acceptable and (
204215
not feedback.hyperparameter_tuning_decision
205216
) # If hyperparameter_tuning_decision is None, it's considered as False, so the final_decision depends on the acceptable

rdagent/scenarios/data_science/dev/runner/prompts.yaml

Lines changed: 33 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -18,35 +18,52 @@ DSCoSTEER_eval:
1818
The code is focusing on the following task
1919
{{ task_desc }}
2020
21-
## Evaluation Guidelines
21+
## Evaluation Criteria
2222
1. Evaluate the code base based on several aspects, including execution correctness, return checking, and code quality.
2323
2. Ensure the code does not contain any incorrect, fabricated, or deceptive operations, such as mocking data, scores, or results.
2424
3. Confirm that the prediction file (`submission.csv`) is generated using only the test dataset, and its format matches the sample submission. Please refer to Submission check section including the format check to the submission.
25-
If the code does not satisfy the requirements:
25+
If the code fails to satisfy any of the criteria:
2626
- Set "acceptable" to false.
27-
If the code satisfy the requirements:
27+
If the code satisfies all the criteria:
2828
- Set "acceptable" to true.
2929
3030
{% if enable_hyperparameter_tuning_check %}
3131
# Evaluation 2: Hyperparameter
32-
## Evaluation Description
3332
The user will provide you the time spent on the whole code execution and the timeout of the code execution. You should decide whether the hyperparameter is reasonable based on the time.
3433
For example, if the code uses only a very small portion of the allowed time, and hyperparameters like `n_estimators` or `epochs` have low values, with early stopping not being triggered and possible signs of underfitting, you should suggest increasing these hyperparameters.
3534
You should also notice other resources utilization hyper-parameters.
3635
For example, if you are using a GPU with large memory, and the batch size is set very low, you should suggest increasing the batch size if it is not reasonable.
3736
38-
## Evaluation Guidelines
39-
1. The code execution time or resource utilization suggest that there is room for improvement in the hyperparameters.
40-
2. The code must apply early stopping strategy already (in order to prevent overfitting).
37+
## Evaluation Criteria
38+
1. The code execution time or resource utilization is under-utilized, which suggests that there is room for improvement in the hyperparameter
39+
2. The code must have already applied an early stopping strategy to prevent overfitting, and the early stopping must not have been triggered (otherwise, increasing epochs will be wasted).
4140
3. Your suggestion should have a strong chance of improving the model's performance. Focus on the most obvious and impactful opportunities for quick improvement by leveraging more training time. Don't explore hyperparameters with low confidence. If there are no obvious and impactful opportunities and the code runs well, please accept it.
4241
4. Only include the suggestions in your response without leak any time limit information because the user might over-fit the model to the time limit.
4342
5. Never make your judgment only based on the time spent, you should also consider the code and the stdout.
44-
If the code satisfy the requirements:
45-
- Set "hyperparameter_tuning_decision" to true.
46-
- In "hyperparameter_tuning_suggestion", provide a clear, specific, and actionable suggestion. Begin with a concrete observation, then state a direct action to take. Do not use vague language, options, or uncertainty (avoid words like "A or B"). For example: "[Observation] The maximum number of epochs was reached, but the validation loss is still decreasing and early stopping was not activated. Only small portion of the allowed time was used. [Suggestion] Increase epochs to 100 to avoid underfitting and further improve model performance."
47-
If the code does not satisfy the requirements:
43+
44+
In the "reasoning", provide clear, step-by-step reasoning for your hyperparameter tuning evaluation. Explicitly reference the code, stdout, and resource usage to justify your assessment. Ensure your reasoning checks whether all evaluation criteria are satisfied, and highlight any specific observations that support your decision.
45+
If the code fails to satisfy any of the criteria:
4846
- Set "hyperparameter_tuning_decision" to false.
4947
- Set "hyperparameter_tuning_suggestion" to an empty string.
48+
If the code satisfies all the criteria:
49+
- Set "hyperparameter_tuning_decision" to true.
50+
- In "hyperparameter_tuning_suggestion", provide a clear, specific, and actionable suggestion. Begin with a concrete observation, then state a direct action to take. Do not use vague language, options, or uncertainty (avoid words like "A or B"). For example: "[Observation] The maximum number of epochs was reached, but the validation loss is still decreasing and early stopping was not activated. Only small portion of the allowed time was used. [Suggestion] Increase epochs to 100 to avoid underfitting and further improve model performance."
51+
52+
## Hyperparameter Tuning Guidelines
53+
1. Task-specific Hyperparameters
54+
- NLP: Check `max_len`, model size, learning rate, batch size. Suggest increases only if underfitting or low resource usage.
55+
- CV: Check `image_size`, backbone size, batch size, learning rate, augmentation. Suggest increases if results are poor and resources under-used.
56+
- Tabular: Check tree depth, leaves, embedding, preprocessing, learning rate, regularization.
57+
2. Model Capacity
58+
- If validation accuracy is low or loss is high, suggest increasing model size or layers if resources allow. Add regularization if overfitting.
59+
3. Epochs
60+
- If early stopping triggered, do not increase epochs. If not triggered and validation improves, suggest more epochs.
61+
4. Batch Size
62+
- If memory allows and batch size is low, suggest increasing. If OOM errors, suggest reducing.
63+
5. Learning Rate
64+
- If training is slow/underfitting, suggest increasing. If unstable, suggest decreasing.
65+
6. Data Augmentation
66+
- For CV/NLP, suggest tuning augmentation if overfitting or poor generalization.
5067
{% endif %}
5168
5269
## Output format
@@ -57,8 +74,11 @@ DSCoSTEER_eval:
5774
"return_checking": "Verify the generated files, particularly the submission file. Ensure that its format is valid",
5875
"code": "Provide feedback on code quality, readability, and adherence to the given specifications.",
5976
"acceptable": <true/false: if the solution has passed execution, return_checking, and code verification, then it is a valid solution and acceptable. Otherwise it is not acceptable.>,
60-
{% if enable_hyperparameter_tuning_check %}"hyperparameter_tuning_decision": <true/false>,
61-
"hyperparameter_tuning_suggestion": <suggestion in plain text for hyperparameter tuning>,{% endif %}
77+
{% if enable_hyperparameter_tuning_check %}
78+
"reasoning": "Provide step-by-step reasoning for hyperparameter tuning evaluation.",
79+
"hyperparameter_tuning_suggestion": <suggestion in plain text for hyperparameter tuning>,
80+
"hyperparameter_tuning_decision": <true/false>,
81+
{% endif %}
6282
}
6383
```
6484

0 commit comments

Comments
 (0)