
Commit 383e5ed

feat: streamline hyperparameter tuning checks and update evaluation guidelines (#1167)

* feat: streamline hyperparameter tuning checks and update evaluation guidelines
* fix task_gen json check
1 parent cec4240 commit 383e5ed

File tree

4 files changed, +21 -63 lines changed

rdagent/app/kaggle/conf.py

Lines changed: 0 additions & 6 deletions
@@ -78,11 +78,5 @@ class KaggleBasePropSetting(ExtendedBaseSettings):
     time_ratio_limit_to_enable_hyperparameter_tuning: float = 1
     """Time ratio limit to enable hyperparameter tuning, if not change, hyperparameter tuning is always enabled in the first evolution."""
 
-    overall_time_ratio_limit_to_enable_hyperparameter_tuning: float = 0
-    """Overall time ratio limit to enable hyperparameter tuning, if not change, hyperparameter tuning is always enabled in the first evolution."""
-
-    only_enable_tuning_in_merge: bool = False
-    """Whether to enable hyperparameter tuning in the merge stage"""
-
 
 KAGGLE_IMPLEMENT_SETTING = KaggleBasePropSetting()
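
With the merge-stage and overall-time knobs removed, time_ratio_limit_to_enable_hyperparameter_tuning is the only setting left that gates the tuning check. A minimal sketch of how such a gate reads, assuming the setting is consumed the way the runner change below suggests (the helper tuning_allowed is illustrative, not part of the codebase):

from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING

def tuning_allowed(running_time: float, timeout: float) -> bool:
    # With the default limit of 1.0, any run that finishes within its timeout qualifies.
    limit = KAGGLE_IMPLEMENT_SETTING.time_ratio_limit_to_enable_hyperparameter_tuning
    return (running_time / timeout) < limit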

rdagent/scenarios/data_science/dev/runner/eval.py

Lines changed: 8 additions & 25 deletions
@@ -1,7 +1,6 @@
 import json
 import re
 from dataclasses import dataclass
-from datetime import timedelta
 from pathlib import Path
 
 import pandas as pd
@@ -16,7 +15,6 @@
 from rdagent.core.evolving_framework import QueriedKnowledge
 from rdagent.core.experiment import FBWorkspace, Task
 from rdagent.log import rdagent_logger as logger
-from rdagent.log.timer import RD_Agent_TIMER_wrapper
 from rdagent.scenarios.data_science.test_eval import (
     MLETestEval,
     NoTestEvalError,
@@ -37,7 +35,6 @@ class DSRunnerFeedback(CoSTEERSingleFeedback):
     """
 
     acceptable: bool | None = None
-    reasoning: str | None = None
     hyperparameter_tuning_decision: bool | None = None
     hyperparameter_tuning_suggestion: str | None = None
     score: str | None = None
@@ -158,33 +155,19 @@ def evaluate(
         submission_check_out = ""
         submission_ret_code = 0
         test_eval = get_test_eval()
-        timer = RD_Agent_TIMER_wrapper.timer
+
         if test_eval.enabled(self.scen.competition):
             submission_check_out, submission_ret_code = test_eval.valid(self.scen.competition, implementation)
             stdout += f"\n### Submission check:\n{submission_check_out}\nIf Submission check returns a 'Submission is valid' or similar message, despite some warning messages, you should still consider the submission as valid and give a positive final decision. "
 
-        # Whether to enable hyperparameter tuning check
-        # 1. This is the first evaluation.
-        c1 = len(queried_knowledge.task_to_former_failed_traces[target_task.get_task_information()][0]) == 0
-
-        # 2. The current time spent on runner is less that the time_ratio_limit_to_enable_hyperparameter_tuning.
         time_spent_ratio = implementation.running_info.running_time / env.conf.running_timeout_period
-        c2 = time_spent_ratio < DS_RD_SETTING.time_ratio_limit_to_enable_hyperparameter_tuning
-
-        # 3. Only enable hyperparameter tuning during the merge stage if configured.
-        if DS_RD_SETTING.only_enable_tuning_in_merge:
-            c3 = timer.remain_time() >= timedelta(hours=DS_RD_SETTING.merge_hours)
-        else:
-            c3 = True
-
-        # 4. If we set an overall hyperparameter tuning time ratio limit, only enable tuning if enough overall time remains.
-        res_time = RD_Agent_TIMER_wrapper.timer.remain_time()
-        total_time = RD_Agent_TIMER_wrapper.timer.all_duration
-        res_ratio = res_time / total_time
-        c4 = res_ratio >= DS_RD_SETTING.overall_time_ratio_limit_to_enable_hyperparameter_tuning
-
-        # Only enable hyperparameter tuning check if all 4 criteria are met.
-        enable_hyperparameter_tuning_check = c1 and c2 and c3 and c4
+        # Only enable hyperparameter tuning on the first evaluation,
+        # to avoid consuming too much time.
+        enable_hyperparameter_tuning_check = False
+        if len(queried_knowledge.task_to_former_failed_traces[target_task.get_task_information()][0]) == 0 and (
+            time_spent_ratio < DS_RD_SETTING.time_ratio_limit_to_enable_hyperparameter_tuning
+        ):
+            enable_hyperparameter_tuning_check = True
 
         system_prompt = T(".prompts:DSCoSTEER_eval.system").r(
             scenario=self.scen.get_scenario_all_desc(eda_output=implementation.file_dict.get("EDA.md", None)),
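
The four separate criteria (c1-c4) thus collapse to two. Restated as a self-contained sketch with the inputs passed in explicitly (the function and parameter names here are illustrative, not from the codebase):

def enable_tuning_check(former_failed_traces: list, running_time: float,
                        timeout: float, ratio_limit: float) -> bool:
    # Criterion 1: this is the first evaluation, i.e. no failed traces recorded for the task.
    first_evaluation = len(former_failed_traces) == 0
    # Criterion 2: the run used less than the configured fraction of its timeout.
    within_budget = (running_time / timeout) < ratio_limit
    return first_evaluation and within_budget

# A first run that used 600s of a 3600s budget passes at the default limit of 1.0:
assert enable_tuning_check([], 600.0, 3600.0, 1.0)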

rdagent/scenarios/data_science/dev/runner/prompts.yaml

Lines changed: 12 additions & 31 deletions
@@ -18,52 +18,35 @@ DSCoSTEER_eval:
     The code is focusing on the following task
     {{ task_desc }}
 
-    ## Evaluation Criteria
+    ## Evaluation Guidelines
     1. Evaluate the code base based on several aspects, including execution correctness, return checking, and code quality.
     2. Ensure the code does not contain any incorrect, fabricated, or deceptive operations, such as mocking data, scores, or results.
     3. Confirm that the prediction file (`submission.csv`) is generated using only the test dataset, and its format matches the sample submission. Please refer to Submission check section including the format check to the submission.
-    If the code does not satisfy any of the criteria:
+    If the code does not satisfy the requirements:
     - Set "acceptable" to false.
-    If the code satisfy all the criteria:
+    If the code satisfies the requirements:
     - Set "acceptable" to true.
 
     {% if enable_hyperparameter_tuning_check %}
     # Evaluation 2: Hyperparameter
+    ## Evaluation Description
     The user will provide you the time spent on the whole code execution and the timeout of the code execution. You should decide whether the hyperparameter is reasonable based on the time.
     For example, if the code uses only a very small portion of the allowed time, and hyperparameters like `n_estimators` or `epochs` have low values, with early stopping not being triggered and possible signs of underfitting, you should suggest increasing these hyperparameters.
     You should also notice other resources utilization hyper-parameters.
     For example, if you are using a GPU with large memory, and the batch size is set very low, you should suggest increasing the batch size if it is not reasonable.
 
-    ## Evaluation Criteria
-    1. The code execution time or resource utilization is under-utilized, which suggests that there is room for improvement in the hyperparameter
-    2. The code must already applied early stopping strategy to prevent overfitting and the early stopping was not triggered (otherwise, increasing epochs will be wasted).
+    ## Evaluation Guidelines
+    1. The code execution time or resource utilization suggests that there is room for improvement in the hyperparameters.
+    2. The code must already apply an early stopping strategy (in order to prevent overfitting).
     3. Your suggestion should have a strong chance of improving the model's performance. Focus on the most obvious and impactful opportunities for quick improvement by leveraging more training time. Don't explore hyperparameters with low confidence. If there are no obvious and impactful opportunities and the code runs well, please accept it.
     4. Only include the suggestions in your response without leak any time limit information because the user might over-fit the model to the time limit.
     5. Never make your judgment only based on the time spent, you should also consider the code and the stdout.
-
-    In the "reasoning", provide clear, step-by-step reasoning for your hyperparameter tuning evaluation. Explicitly reference the code, stdout, and resource usage to justify your assessment. Ensure your reasoning checks whether all evaluation criteria are satisfied, and highlight any specific observations that support your decision.
-    If the code does not satisfy any of the criteria:
-    - Set "hyperparameter_tuning_decision" to false.
-    - Set "hyperparameter_tuning_suggestion" to an empty string.
-    If the code satisfy all the criteria:
+    If the code satisfies the requirements:
     - Set "hyperparameter_tuning_decision" to true.
     - In "hyperparameter_tuning_suggestion", provide a clear, specific, and actionable suggestion. Begin with a concrete observation, then state a direct action to take. Do not use vague language, options, or uncertainty (avoid words like "A or B"). For example: "[Observation] The maximum number of epochs was reached, but the validation loss is still decreasing and early stopping was not activated. Only small portion of the allowed time was used. [Suggestion] Increase epochs to 100 to avoid underfitting and further improve model performance."
-
-    ## Hyperparameter Tuning Guidelines
-    1. Task-specific Hyperparameters
-    - NLP: Check `max_len`, model size, learning rate, batch size. Suggest increases only if underfitting or low resource usage.
-    - CV: Check `image_size`, backbone size, batch size, learning rate, augmentation. Suggest increases if results are poor and resources under-used.
-    - Tabular: Check tree depth, leaves, embedding, preprocessing, learning rate, regularization.
-    2. Model Capacity
-    - If validation accuracy is low or loss is high, suggest increasing model size or layers if resources allow. Add regularization if overfitting.
-    3. Epochs
-    - If early stopping triggered, do not increase epochs. If not triggered and validation improves, suggest more epochs.
-    4. Batch Size
-    - If memory allows and batch size is low, suggest increasing. If OOM errors, suggest reducing.
-    5. Learning Rate
-    - If training is slow/underfitting, suggest increasing. If unstable, suggest decreasing.
-    6. Data Augmentation
-    - For CV/NLP, suggest tuning augmentation if overfitting or poor generalization.
+    If the code does not satisfy the requirements:
+    - Set "hyperparameter_tuning_decision" to false.
+    - Set "hyperparameter_tuning_suggestion" to an empty string.
     {% endif %}
 
     ## Output format
@@ -74,9 +57,7 @@ DSCoSTEER_eval:
         "return_checking": "Verify the generated files, particularly the submission file. Ensure that its format is valid",
         "code": "Provide feedback on code quality, readability, and adherence to the given specifications.",
         "acceptable": <true/false: if the solution has passed execution, return_checking, and code verification, then it is a valid solution and acceptable. Otherwise it is not acceptable.>,
-        {% if enable_hyperparameter_tuning_check %}
-        "reasoning": "Provide step-by-step reasoning for hyperparameter tuning evaluation.",
-        "hyperparameter_tuning_suggestion": <suggestion in plain text for hyperparameter tuning>,
+        {% if enable_hyperparameter_tuning_check %}"hyperparameter_tuning_suggestion": <suggestion in plain text for hyperparameter tuning>,
         "hyperparameter_tuning_decision": <true/false>,
         {% endif %}
     }
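
Putting the two prompt changes together, a feedback payload matching the updated output format could look like the following (a hypothetical example limited to the fields visible in this hunk; values are invented, and note that no "reasoning" key is requested anymore):

import json

feedback = json.loads("""
{
  "return_checking": "submission.csv exists and matches the sample submission format",
  "code": "Code follows the given specifications; no fabricated results",
  "acceptable": true,
  "hyperparameter_tuning_suggestion": "[Observation] Early stopping was never triggered and validation loss was still decreasing. [Suggestion] Increase epochs to 100.",
  "hyperparameter_tuning_decision": true
}
""")
assert "reasoning" not in feedback  # the field was removed in this commit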

rdagent/scenarios/data_science/proposal/exp_gen/proposal.py

Lines changed: 1 addition & 1 deletion
@@ -930,7 +930,7 @@ def task_gen(
             user_prompt=user_prompt,
             system_prompt=sys_prompt,
             response_format=CodingSketch if self.supports_response_schema else {"type": "json_object"},
-            json_target_type=Dict[str, str | Dict[str, str]] if not self.supports_response_schema else None,
+            json_target_type=Dict[str, str | List[str] | Dict[str, str]] if not self.supports_response_schema else None,
         )
 
         task_dict = json.loads(response)
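
The widened json_target_type only matters on the fallback path, where the backend lacks response-schema support and the parsed JSON is type-checked against that target. A sketch of why List[str] is needed, using a made-up sketch payload (the keys below are hypothetical, not the actual CodingSketch schema):

from typing import Dict, List

# Old target: Dict[str, str | Dict[str, str]] -- a list-valued field would fail validation.
NewTarget = Dict[str, str | List[str] | Dict[str, str]]

sketch: NewTarget = {
    "plan": "train a gradient-boosting baseline",
    "steps": ["load data", "fit model", "write submission.csv"],  # list value needs List[str]
    "files": {"main.py": "entry point"},
}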
