Skip to content

Commit c62e5fc

Browse files
RolandMinruijingyuanlmXuJensen246peteryang1
authored
fix: refine DSCoSTEER_eval prompts (#1157)
* change DSCoSTEER_eval prompts * fallback to better exp only * fix fallback * fix and reformat * fix bug when base_fb is None * add reasoning to hyperparameter evaluation * feat: add acceptable assessment in exp_feedback (#1159) * add time * refine eval prompt and make the logic of tuning check more clear * some refinement * fix CI * fix a small bug, only consider score in runner * refine comment * simplify compare function --------- Co-authored-by: jingyuanlm <[email protected]> Co-authored-by: Xu <[email protected]> Co-authored-by: Jensen Lee <[email protected]> Co-authored-by: Xu Yang <[email protected]>
1 parent fc0df6e commit c62e5fc

File tree

8 files changed

+121
-34
lines changed

8 files changed

+121
-34
lines changed

rdagent/app/kaggle/conf.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,5 +78,11 @@ class KaggleBasePropSetting(ExtendedBaseSettings):
7878
time_ratio_limit_to_enable_hyperparameter_tuning: float = 1
7979
"""Time ratio limit to enable hyperparameter tuning, if not change, hyperparameter tuning is always enabled in the first evolution."""
8080

81+
overall_time_ratio_limit_to_enable_hyperparameter_tuning: float = 0
82+
"""Overall remaining-time ratio limit for hyperparameter tuning: tuning is enabled only while (remaining time / total time) is at least this value. The default of 0 leaves tuning always enabled."""
83+
84+
only_enable_tuning_in_merge: bool = False
85+
"""Whether to only enable hyperparameter tuning during the merge stage"""
86+
8187

8288
KAGGLE_IMPLEMENT_SETTING = KaggleBasePropSetting()

rdagent/components/coder/CoSTEER/__init__.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,17 @@ def _get_last_fb(self) -> CoSTEERMultiFeedback:
7777
assert isinstance(fb, CoSTEERMultiFeedback), "feedback must be of type CoSTEERMultiFeedback"
7878
return fb
7979

80+
def compare_and_pick_fb(self, base_fb: CoSTEERMultiFeedback | None, new_fb: CoSTEERMultiFeedback | None) -> bool:
81+
"""
82+
Compare new feedback with the fallback feedback.
83+
84+
Returns:
85+
bool: True if the new feedback is better, and False if it is worse or invalid.
86+
"""
87+
if new_fb is not None and new_fb.is_acceptable():
88+
return True
89+
return False
90+
8091
def develop(self, exp: Experiment) -> Experiment:
8192

8293
# init intermediate items
@@ -97,11 +108,18 @@ def develop(self, exp: Experiment) -> Experiment:
97108
# Evolving the solution
98109
start_datetime = datetime.now()
99110
fallback_evo_exp = None
111+
fallback_evo_fb = None
100112
reached_max_seconds = False
101113
for evo_exp in self.evolve_agent.multistep_evolve(evo_exp, self.evaluator):
102114
assert isinstance(evo_exp, Experiment) # multiple inheritance
103-
if self._get_last_fb().is_acceptable():
115+
evo_fb = self._get_last_fb()
116+
fallback_decision = self.compare_and_pick_fb(
117+
base_fb=fallback_evo_fb,
118+
new_fb=evo_fb,
119+
)
120+
if fallback_decision:
104121
fallback_evo_exp = deepcopy(evo_exp)
122+
fallback_evo_fb = deepcopy(evo_fb)
105123
fallback_evo_exp.create_ws_ckp() # NOTE: creating checkpoints for saving files in the workspace to prevent inplace mutation.
106124

107125
logger.log_object(evo_exp.sub_workspace_list, tag="evolving code")

rdagent/core/proposal.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ def __init__(
104104
code_change_summary: str | None = None,
105105
decision: bool,
106106
eda_improvement: str | None = None,
107+
acceptable: bool | None = None,
107108
) -> None:
108109
super().__init__(
109110
reason,
@@ -114,6 +115,7 @@ def __init__(
114115
self.observations = observations
115116
self.hypothesis_evaluation = hypothesis_evaluation
116117
self.new_hypothesis = new_hypothesis
118+
self.acceptable = acceptable
117119

118120
def __str__(self) -> str:
119121
return f"""{super().__str__()}

rdagent/scenarios/data_science/dev/feedback.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ def generate_feedback(self, exp: DSExperiment, trace: DSTrace) -> ExperimentFeed
109109
else convert2bool(dict_get_with_warning(resp_dict, "Replace Best Result", "no"))
110110
),
111111
eda_improvement=dict_get_with_warning(resp_dict, "EDA Improvement", "no"), # EDA improvement suggestion
112+
acceptable=convert2bool(dict_get_with_warning(resp_dict, "Acceptable", "no")),
112113
)
113114

114115
if hypothesis_feedback and DS_RD_SETTING.enable_knowledge_base:

rdagent/scenarios/data_science/dev/prompts.yaml

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,15 +64,26 @@ exp_feedback:
6464
- You should provide your feedback based on the current code and SOTA code. Especially focus on the feature engineering part.
6565
- For example, if the code truncate the line with N words, you can suggest to print the mean, median or quantile of the length of the line for better understanding of the data in the next rounds of experiments.
6666
67+
Step 6: Overall Acceptability Assessment
68+
69+
- Determine the overall acceptability of the experiment based on the comprehensive evaluation from previous steps:
70+
- Set `"Acceptable": "yes"` ONLY if ALL of the following conditions are met:
71+
* Step 1: Submission format is valid
72+
* Step 2: Evaluation methodology is aligned with competition requirements
73+
* Step 4: Current code demonstrates clear improvements over SOTA (better practices, efficiency, or interpretability)
74+
- Set `"Acceptable": "no"` if ANY of the above conditions fail
75+
- This acceptability assessment serves as a final quality gate to ensure only truly valuable experiments are accepted
76+
6777
Provide detailed and constructive feedback structured as follows without anything else:
6878
{
6979
"Submission Format Check": "yes or no",
7080
"First Valid Submission": "yes or no",
7181
"Code Change Summary": "Clearly summarize the changes made to the code (please cover the most important changes while being concise); during development, extra modifications may be made beyond the intent of the hypothesis, so these changes should also be included to provide complete information",
7282
"Observations": "Clearly summarize current and SOTA ensemble results with exact scores and notable patterns. Limit to no more than three concise, data-focused sentences. Your observation must be grounded by explicit evidence from scenario description or code implementation, not just validation scores.",
73-
"Feedback for Hypothesis": Explicitly confirm or refute the hypothesis based on specific data points or performance trends. Limit to two sentences.",
83+
"Feedback for Hypothesis": "Explicitly confirm or refute the hypothesis based on specific data points or performance trends. Limit to two sentences.",
7484
"Evaluation Aligned With Task": "yes or no",
7585
"Replace Best Result": "yes or no",
86+
"Acceptable": "yes or no",
7687
"Reasoning": "Clearly explain the reason for success or failure of the experiment. Begin explicitly with [Submission format error], [Evaluation error], [Experiment Analysis] or [Code Analysis] depending on the step at which issues arose. Reference specific scores and methodological differences with SOTA. Limit to three sentences.",
7788
"EDA Improvement": "improvement suggestion for EDA code, if needed, otherwise set to 'no'. If there is no EDA code, set to 'no'."
7889
}

rdagent/scenarios/data_science/dev/runner/__init__.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
import pandas as pd
22

33
from rdagent.app.data_science.conf import DS_RD_SETTING
4-
from rdagent.components.coder import CoSTEER
54
from rdagent.components.coder.CoSTEER import CoSTEER
65
from rdagent.components.coder.CoSTEER.config import CoSTEERSettings
76
from rdagent.components.coder.CoSTEER.evaluators import (
87
CoSTEERMultiEvaluator,
8+
CoSTEERMultiFeedback,
99
CoSTEERSingleFeedback,
1010
)
1111
from rdagent.components.coder.CoSTEER.evolvable_subjects import FBWorkspace
@@ -171,6 +171,24 @@ def get_develop_max_seconds(self) -> int | None:
171171
"""
172172
return int(self.scen.real_full_timeout() * self.settings.max_seconds_multiplier)
173173

174+
def compare_and_pick_fb(self, base_fb: CoSTEERMultiFeedback | None, new_fb: CoSTEERMultiFeedback | None) -> bool:
175+
# In data science, we only have a single feedback.
176+
# Note: new_fb should always exist, as guaranteed by the _get_last_fb() function.
177+
if base_fb is None:
178+
return True
179+
180+
base_fb = base_fb[0]
181+
new_fb = new_fb[0]
182+
183+
def compare_scores(s1, s2) -> bool:
184+
if s2 is None:
185+
return False
186+
if s1 is None:
187+
return True
188+
return (s2 > s1) == self.scen.metric_direction
189+
190+
return compare_scores(base_fb.score, new_fb.score)
191+
174192
def develop(self, exp):
175193
bak_sub_tasks = exp.pending_tasks_list
176194
exp.sub_tasks = [

rdagent/scenarios/data_science/dev/runner/eval.py

Lines changed: 29 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import json
22
import re
33
from dataclasses import dataclass
4+
from datetime import timedelta
45
from pathlib import Path
56

67
import pandas as pd
@@ -15,6 +16,7 @@
1516
from rdagent.core.evolving_framework import QueriedKnowledge
1617
from rdagent.core.experiment import FBWorkspace, Task
1718
from rdagent.log import rdagent_logger as logger
19+
from rdagent.log.timer import RD_Agent_TIMER_wrapper
1820
from rdagent.scenarios.data_science.test_eval import (
1921
MLETestEval,
2022
NoTestEvalError,
@@ -155,28 +157,33 @@ def evaluate(
155157
submission_check_out = ""
156158
submission_ret_code = 0
157159
test_eval = get_test_eval()
158-
160+
timer = RD_Agent_TIMER_wrapper.timer
159161
if test_eval.enabled(self.scen.competition):
160162
submission_check_out, submission_ret_code = test_eval.valid(self.scen.competition, implementation)
161163
stdout += f"\n### Submission check:\n{submission_check_out}\nIf Submission check returns a 'Submission is valid' or similar message, despite some warning messages, you should still consider the submission as valid and give a positive final decision. "
162164

165+
# Whether to enable hyperparameter tuning check
166+
# 1. This is the first evaluation.
167+
c1 = len(queried_knowledge.task_to_former_failed_traces[target_task.get_task_information()][0]) == 0
168+
169+
# 2. The current time spent on the runner is less than the time_ratio_limit_to_enable_hyperparameter_tuning.
163170
time_spent_ratio = implementation.running_info.running_time / env.conf.running_timeout_period
164-
# Only enable hyperparameter tuning on the first evaluation.
165-
# Avoid too much time consuming.
166-
enable_hyperparameter_tuning_check = False
167-
if len(queried_knowledge.task_to_former_failed_traces[target_task.get_task_information()][0]) == 0 and (
168-
time_spent_ratio < DS_RD_SETTING.time_ratio_limit_to_enable_hyperparameter_tuning
169-
):
170-
enable_hyperparameter_tuning_check = True
171-
172-
if (
173-
DS_RD_SETTING.time_ratio_limit_to_enable_hyperparameter_tuning is not None
174-
and time_spent_ratio > DS_RD_SETTING.time_ratio_limit_to_enable_hyperparameter_tuning
175-
):
176-
enable_hyperparameter_tuning_check = False
177-
logger.info(
178-
f"Time spent ratio {time_spent_ratio:.2f} exceeds the limit {DS_RD_SETTING.time_ratio_limit_to_enable_hyperparameter_tuning}, hyperparameter tuning is disabled."
179-
)
171+
c2 = time_spent_ratio < DS_RD_SETTING.time_ratio_limit_to_enable_hyperparameter_tuning
172+
173+
# 3. Only enable hyperparameter tuning during the merge stage if configured.
174+
if DS_RD_SETTING.only_enable_tuning_in_merge:
175+
c3 = timer.remain_time() >= timedelta(hours=DS_RD_SETTING.merge_hours)
176+
else:
177+
c3 = True
178+
179+
# 4. If we set an overall hyperparameter tuning time ratio limit, only enable tuning if enough overall time remains.
180+
res_time = RD_Agent_TIMER_wrapper.timer.remain_time()
181+
total_time = RD_Agent_TIMER_wrapper.timer.all_duration
182+
res_ratio = res_time / total_time
183+
c4 = res_ratio >= DS_RD_SETTING.overall_time_ratio_limit_to_enable_hyperparameter_tuning
184+
185+
# Only enable hyperparameter tuning check if all 4 criteria are met.
186+
enable_hyperparameter_tuning_check = c1 and c2 and c3 and c4
180187

181188
system_prompt = T(".prompts:DSCoSTEER_eval.system").r(
182189
scenario=self.scen.get_scenario_all_desc(eda_output=implementation.file_dict.get("EDA.md", None)),
@@ -199,7 +206,11 @@ def evaluate(
199206
user_prompt=user_prompt,
200207
# init_kwargs_update_func=DSRunnerFeedback.val_and_update_init_dict,
201208
)
202-
feedback.score = score_df.to_string() if score_ret_code == 0 else None
209+
try:
210+
feedback.score = score_df.loc["ensemble"].iloc[0] if score_ret_code == 0 else None
211+
except:
212+
logger.error("Failed to get the score from scores.csv.")
213+
feedback.score = None
203214
feedback.final_decision = feedback.acceptable and (
204215
not feedback.hyperparameter_tuning_decision
205216
) # If hyperparameter_tuning_decision is None, it's considered as False, so the final_decision depends on the acceptable

rdagent/scenarios/data_science/dev/runner/prompts.yaml

Lines changed: 33 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -18,35 +18,52 @@ DSCoSTEER_eval:
1818
The code is focusing on the following task
1919
{{ task_desc }}
2020
21-
## Evaluation Guidelines
21+
## Evaluation Criteria
2222
1. Evaluate the code base based on several aspects, including execution correctness, return checking, and code quality.
2323
2. Ensure the code does not contain any incorrect, fabricated, or deceptive operations, such as mocking data, scores, or results.
2424
3. Confirm that the prediction file (`submission.csv`) is generated using only the test dataset, and its format matches the sample submission. Please refer to Submission check section including the format check to the submission.
25-
If the code does not satisfy the requirements:
25+
If the code fails to satisfy any of the criteria:
2626
- Set "acceptable" to false.
27-
If the code satisfy the requirements:
27+
If the code satisfies all the criteria:
2828
- Set "acceptable" to true.
2929
3030
{% if enable_hyperparameter_tuning_check %}
3131
# Evaluation 2: Hyperparameter
32-
## Evaluation Description
3332
The user will provide you the time spent on the whole code execution and the timeout of the code execution. You should decide whether the hyperparameter is reasonable based on the time.
3433
For example, if the code uses only a very small portion of the allowed time, and hyperparameters like `n_estimators` or `epochs` have low values, with early stopping not being triggered and possible signs of underfitting, you should suggest increasing these hyperparameters.
3534
You should also notice other resources utilization hyper-parameters.
3635
For example, if you are using a GPU with large memory, and the batch size is set very low, you should suggest increasing the batch size if it is not reasonable.
3736
38-
## Evaluation Guidelines
39-
1. The code execution time or resource utilization suggest that there is room for improvement in the hyperparameters.
40-
2. The code must apply early stopping strategy already (in order to prevent overfitting).
37+
## Evaluation Criteria
38+
1. The code execution time or resource utilization is under-utilized, which suggests that there is room for improvement in the hyperparameter
39+
2. The code must have already applied an early stopping strategy to prevent overfitting, and the early stopping must not have been triggered (otherwise, increasing epochs will be wasted).
4140
3. Your suggestion should have a strong chance of improving the model's performance. Focus on the most obvious and impactful opportunities for quick improvement by leveraging more training time. Don't explore hyperparameters with low confidence. If there are no obvious and impactful opportunities and the code runs well, please accept it.
4241
4. Only include the suggestions in your response without leak any time limit information because the user might over-fit the model to the time limit.
4342
5. Never make your judgment only based on the time spent, you should also consider the code and the stdout.
44-
If the code satisfy the requirements:
45-
- Set "hyperparameter_tuning_decision" to true.
46-
- In "hyperparameter_tuning_suggestion", provide a clear, specific, and actionable suggestion. Begin with a concrete observation, then state a direct action to take. Do not use vague language, options, or uncertainty (avoid words like "A or B"). For example: "[Observation] The maximum number of epochs was reached, but the validation loss is still decreasing and early stopping was not activated. Only small portion of the allowed time was used. [Suggestion] Increase epochs to 100 to avoid underfitting and further improve model performance."
47-
If the code does not satisfy the requirements:
43+
44+
In the "reasoning", provide clear, step-by-step reasoning for your hyperparameter tuning evaluation. Explicitly reference the code, stdout, and resource usage to justify your assessment. Ensure your reasoning checks whether all evaluation criteria are satisfied, and highlight any specific observations that support your decision.
45+
If the code fails to satisfy any of the criteria:
4846
- Set "hyperparameter_tuning_decision" to false.
4947
- Set "hyperparameter_tuning_suggestion" to an empty string.
48+
If the code satisfies all the criteria:
49+
- Set "hyperparameter_tuning_decision" to true.
50+
- In "hyperparameter_tuning_suggestion", provide a clear, specific, and actionable suggestion. Begin with a concrete observation, then state a direct action to take. Do not use vague language, options, or uncertainty (avoid words like "A or B"). For example: "[Observation] The maximum number of epochs was reached, but the validation loss is still decreasing and early stopping was not activated. Only small portion of the allowed time was used. [Suggestion] Increase epochs to 100 to avoid underfitting and further improve model performance."
51+
52+
## Hyperparameter Tuning Guidelines
53+
1. Task-specific Hyperparameters
54+
- NLP: Check `max_len`, model size, learning rate, batch size. Suggest increases only if underfitting or low resource usage.
55+
- CV: Check `image_size`, backbone size, batch size, learning rate, augmentation. Suggest increases if results are poor and resources under-used.
56+
- Tabular: Check tree depth, leaves, embedding, preprocessing, learning rate, regularization.
57+
2. Model Capacity
58+
- If validation accuracy is low or loss is high, suggest increasing model size or layers if resources allow. Add regularization if overfitting.
59+
3. Epochs
60+
- If early stopping triggered, do not increase epochs. If not triggered and validation improves, suggest more epochs.
61+
4. Batch Size
62+
- If memory allows and batch size is low, suggest increasing. If OOM errors, suggest reducing.
63+
5. Learning Rate
64+
- If training is slow/underfitting, suggest increasing. If unstable, suggest decreasing.
65+
6. Data Augmentation
66+
- For CV/NLP, suggest tuning augmentation if overfitting or poor generalization.
5067
{% endif %}
5168
5269
## Output format
@@ -57,8 +74,11 @@ DSCoSTEER_eval:
5774
"return_checking": "Verify the generated files, particularly the submission file. Ensure that its format is valid",
5875
"code": "Provide feedback on code quality, readability, and adherence to the given specifications.",
5976
"acceptable": <true/false: if the solution has passed execution, return_checking, and code verification, then it is a valid solution and acceptable. Otherwise it is not acceptable.>,
60-
{% if enable_hyperparameter_tuning_check %}"hyperparameter_tuning_decision": <true/false>,
61-
"hyperparameter_tuning_suggestion": <suggestion in plain text for hyperparameter tuning>,{% endif %}
77+
{% if enable_hyperparameter_tuning_check %}
78+
"reasoning": "Provide step-by-step reasoning for hyperparameter tuning evaluation.",
79+
"hyperparameter_tuning_suggestion": <suggestion in plain text for hyperparameter tuning>,
80+
"hyperparameter_tuning_decision": <true/false>,
81+
{% endif %}
6282
}
6383
```
6484

0 commit comments

Comments
 (0)