
Commit 383e5ed

feat: streamline hyperparameter tuning checks and update evaluation guidelines (#1167)

* feat: streamline hyperparameter tuning checks and update evaluation guidelines
* fix task_gen json check
1 parent cec4240 commit 383e5ed

File tree

4 files changed, +21 -63 lines changed

rdagent/app/kaggle/conf.py

Lines changed: 0 additions & 6 deletions
@@ -78,11 +78,5 @@ class KaggleBasePropSetting(ExtendedBaseSettings):
     time_ratio_limit_to_enable_hyperparameter_tuning: float = 1
     """Time ratio limit to enable hyperparameter tuning, if not change, hyperparameter tuning is always enabled in the first evolution."""
 
-    overall_time_ratio_limit_to_enable_hyperparameter_tuning: float = 0
-    """Overall time ratio limit to enable hyperparameter tuning, if not change, hyperparameter tuning is always enabled in the first evolution."""
-
-    only_enable_tuning_in_merge: bool = False
-    """Whether to enable hyperparameter tuning in the merge stage"""
-
 
 KAGGLE_IMPLEMENT_SETTING = KaggleBasePropSetting()
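
With the merge-stage and overall-time knobs removed, time_ratio_limit_to_enable_hyperparameter_tuning is the only setting left that gates the tuning check. A minimal sketch of how such a gate reads, assuming the setting is consumed the way the runner change below suggests (the helper tuning_allowed is illustrative, not part of the codebase):

from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING

def tuning_allowed(running_time: float, timeout: float) -> bool:
    # With the default limit of 1.0, any run that finishes within its timeout qualifies.
    limit = KAGGLE_IMPLEMENT_SETTING.time_ratio_limit_to_enable_hyperparameter_tuning
    return (running_time / timeout) < limit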

rdagent/scenarios/data_science/dev/runner/eval.py

Lines changed: 8 additions & 25 deletions
@@ -1,7 +1,6 @@
 import json
 import re
 from dataclasses import dataclass
-from datetime import timedelta
 from pathlib import Path
 
 import pandas as pd
@@ -16,7 +15,6 @@
 from rdagent.core.evolving_framework import QueriedKnowledge
 from rdagent.core.experiment import FBWorkspace, Task
 from rdagent.log import rdagent_logger as logger
-from rdagent.log.timer import RD_Agent_TIMER_wrapper
 from rdagent.scenarios.data_science.test_eval import (
     MLETestEval,
     NoTestEvalError,
@@ -37,7 +35,6 @@ class DSRunnerFeedback(CoSTEERSingleFeedback):
     """
 
     acceptable: bool | None = None
-    reasoning: str | None = None
     hyperparameter_tuning_decision: bool | None = None
     hyperparameter_tuning_suggestion: str | None = None
     score: str | None = None
@@ -158,33 +155,19 @@ def evaluate(
         submission_check_out = ""
         submission_ret_code = 0
         test_eval = get_test_eval()
-        timer = RD_Agent_TIMER_wrapper.timer
+
         if test_eval.enabled(self.scen.competition):
             submission_check_out, submission_ret_code = test_eval.valid(self.scen.competition, implementation)
             stdout += f"\n### Submission check:\n{submission_check_out}\nIf Submission check returns a 'Submission is valid' or similar message, despite some warning messages, you should still consider the submission as valid and give a positive final decision. "
 
-        # Whether to enable hyperparameter tuning check
-        # 1. This is the first evaluation.
-        c1 = len(queried_knowledge.task_to_former_failed_traces[target_task.get_task_information()][0]) == 0
-
-        # 2. The current time spent on runner is less that the time_ratio_limit_to_enable_hyperparameter_tuning.
         time_spent_ratio = implementation.running_info.running_time / env.conf.running_timeout_period
-        c2 = time_spent_ratio < DS_RD_SETTING.time_ratio_limit_to_enable_hyperparameter_tuning
-
-        # 3. Only enable hyperparameter tuning during the merge stage if configured.
-        if DS_RD_SETTING.only_enable_tuning_in_merge:
-            c3 = timer.remain_time() >= timedelta(hours=DS_RD_SETTING.merge_hours)
-        else:
-            c3 = True
-
-        # 4. If we set an overall hyperparameter tuning time ratio limit, only enable tuning if enough overall time remains.
-        res_time = RD_Agent_TIMER_wrapper.timer.remain_time()
-        total_time = RD_Agent_TIMER_wrapper.timer.all_duration
-        res_ratio = res_time / total_time
-        c4 = res_ratio >= DS_RD_SETTING.overall_time_ratio_limit_to_enable_hyperparameter_tuning
-
-        # Only enable hyperparameter tuning check if all 4 criteria are met.
-        enable_hyperparameter_tuning_check = c1 and c2 and c3 and c4
+        # Only enable hyperparameter tuning on the first evaluation,
+        # to avoid consuming too much time.
+        enable_hyperparameter_tuning_check = False
+        if len(queried_knowledge.task_to_former_failed_traces[target_task.get_task_information()][0]) == 0 and (
+            time_spent_ratio < DS_RD_SETTING.time_ratio_limit_to_enable_hyperparameter_tuning
+        ):
+            enable_hyperparameter_tuning_check = True
 
         system_prompt = T(".prompts:DSCoSTEER_eval.system").r(
             scenario=self.scen.get_scenario_all_desc(eda_output=implementation.file_dict.get("EDA.md", None)),
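
The four separate criteria (c1-c4) thus collapse to two. Restated as a self-contained sketch with the inputs passed in explicitly (the function and parameter names here are illustrative, not from the codebase):

def enable_tuning_check(former_failed_traces: list, running_time: float,
                        timeout: float, ratio_limit: float) -> bool:
    # Criterion 1: this is the first evaluation, i.e. no failed traces recorded for the task.
    first_evaluation = len(former_failed_traces) == 0
    # Criterion 2: the run used less than the configured fraction of its timeout.
    within_budget = (running_time / timeout) < ratio_limit
    return first_evaluation and within_budget

# A first run that used 600s of a 3600s budget passes at the default limit of 1.0:
assert enable_tuning_check([], 600.0, 3600.0, 1.0)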

rdagent/scenarios/data_science/dev/runner/prompts.yaml

Lines changed: 12 additions & 31 deletions
@@ -18,52 +18,35 @@ DSCoSTEER_eval:
     The code is focusing on the following task
     {{ task_desc }}
 
-    ## Evaluation Criteria
+    ## Evaluation Guidelines
     1. Evaluate the code base based on several aspects, including execution correctness, return checking, and code quality.
     2. Ensure the code does not contain any incorrect, fabricated, or deceptive operations, such as mocking data, scores, or results.
     3. Confirm that the prediction file (`submission.csv`) is generated using only the test dataset, and its format matches the sample submission. Please refer to Submission check section including the format check to the submission.
-    If the code does not satisfy any of the criteria:
+    If the code does not satisfy the requirements:
     - Set "acceptable" to false.
-    If the code satisfy all the criteria:
+    If the code satisfies the requirements:
     - Set "acceptable" to true.
 
     {% if enable_hyperparameter_tuning_check %}
     # Evaluation 2: Hyperparameter
+    ## Evaluation Description
     The user will provide you the time spent on the whole code execution and the timeout of the code execution. You should decide whether the hyperparameter is reasonable based on the time.
     For example, if the code uses only a very small portion of the allowed time, and hyperparameters like `n_estimators` or `epochs` have low values, with early stopping not being triggered and possible signs of underfitting, you should suggest increasing these hyperparameters.
     You should also notice other resources utilization hyper-parameters.
     For example, if you are using a GPU with large memory, and the batch size is set very low, you should suggest increasing the batch size if it is not reasonable.
 
-    ## Evaluation Criteria
-    1. The code execution time or resource utilization is under-utilized, which suggests that there is room for improvement in the hyperparameter
-    2. The code must already applied early stopping strategy to prevent overfitting and the early stopping was not triggered (otherwise, increasing epochs will be wasted).
+    ## Evaluation Guidelines
+    1. The code execution time or resource utilization suggests that there is room for improvement in the hyperparameters.
+    2. The code must already apply an early stopping strategy (in order to prevent overfitting).
     3. Your suggestion should have a strong chance of improving the model's performance. Focus on the most obvious and impactful opportunities for quick improvement by leveraging more training time. Don't explore hyperparameters with low confidence. If there are no obvious and impactful opportunities and the code runs well, please accept it.
     4. Only include the suggestions in your response without leak any time limit information because the user might over-fit the model to the time limit.
     5. Never make your judgment only based on the time spent, you should also consider the code and the stdout.
-
-    In the "reasoning", provide clear, step-by-step reasoning for your hyperparameter tuning evaluation. Explicitly reference the code, stdout, and resource usage to justify your assessment. Ensure your reasoning checks whether all evaluation criteria are satisfied, and highlight any specific observations that support your decision.
-    If the code does not satisfy any of the criteria:
-    - Set "hyperparameter_tuning_decision" to false.
-    - Set "hyperparameter_tuning_suggestion" to an empty string.
-    If the code satisfy all the criteria:
+    If the code satisfies the requirements:
     - Set "hyperparameter_tuning_decision" to true.
     - In "hyperparameter_tuning_suggestion", provide a clear, specific, and actionable suggestion. Begin with a concrete observation, then state a direct action to take. Do not use vague language, options, or uncertainty (avoid words like "A or B"). For example: "[Observation] The maximum number of epochs was reached, but the validation loss is still decreasing and early stopping was not activated. Only small portion of the allowed time was used. [Suggestion] Increase epochs to 100 to avoid underfitting and further improve model performance."
-
-    ## Hyperparameter Tuning Guidelines
-    1. Task-specific Hyperparameters
-    - NLP: Check `max_len`, model size, learning rate, batch size. Suggest increases only if underfitting or low resource usage.
-    - CV: Check `image_size`, backbone size, batch size, learning rate, augmentation. Suggest increases if results are poor and resources under-used.
-    - Tabular: Check tree depth, leaves, embedding, preprocessing, learning rate, regularization.
-    2. Model Capacity
-    - If validation accuracy is low or loss is high, suggest increasing model size or layers if resources allow. Add regularization if overfitting.
-    3. Epochs
-    - If early stopping triggered, do not increase epochs. If not triggered and validation improves, suggest more epochs.
-    4. Batch Size
-    - If memory allows and batch size is low, suggest increasing. If OOM errors, suggest reducing.
-    5. Learning Rate
-    - If training is slow/underfitting, suggest increasing. If unstable, suggest decreasing.
-    6. Data Augmentation
-    - For CV/NLP, suggest tuning augmentation if overfitting or poor generalization.
+    If the code does not satisfy the requirements:
+    - Set "hyperparameter_tuning_decision" to false.
+    - Set "hyperparameter_tuning_suggestion" to an empty string.
     {% endif %}
 
     ## Output format
@@ -74,9 +57,7 @@ DSCoSTEER_eval:
         "return_checking": "Verify the generated files, particularly the submission file. Ensure that its format is valid",
         "code": "Provide feedback on code quality, readability, and adherence to the given specifications.",
         "acceptable": <true/false: if the solution has passed execution, return_checking, and code verification, then it is a valid solution and acceptable. Otherwise it is not acceptable.>,
-        {% if enable_hyperparameter_tuning_check %}
-        "reasoning": "Provide step-by-step reasoning for hyperparameter tuning evaluation.",
-        "hyperparameter_tuning_suggestion": <suggestion in plain text for hyperparameter tuning>,
+        {% if enable_hyperparameter_tuning_check %}"hyperparameter_tuning_suggestion": <suggestion in plain text for hyperparameter tuning>,
         "hyperparameter_tuning_decision": <true/false>,
         {% endif %}
     }
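
Putting the two prompt changes together, a feedback payload matching the updated output format could look like the following (a hypothetical example limited to the fields visible in this hunk; values are invented, and note that no "reasoning" key is requested anymore):

import json

feedback = json.loads("""
{
  "return_checking": "submission.csv exists and matches the sample submission format",
  "code": "Code follows the given specifications; no fabricated results",
  "acceptable": true,
  "hyperparameter_tuning_suggestion": "[Observation] Early stopping was never triggered and validation loss was still decreasing. [Suggestion] Increase epochs to 100.",
  "hyperparameter_tuning_decision": true
}
""")
assert "reasoning" not in feedback  # the field was removed in this commit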

rdagent/scenarios/data_science/proposal/exp_gen/proposal.py

Lines changed: 1 addition & 1 deletion
@@ -930,7 +930,7 @@ def task_gen(
             user_prompt=user_prompt,
             system_prompt=sys_prompt,
             response_format=CodingSketch if self.supports_response_schema else {"type": "json_object"},
-            json_target_type=Dict[str, str | Dict[str, str]] if not self.supports_response_schema else None,
+            json_target_type=Dict[str, str | List[str] | Dict[str, str]] if not self.supports_response_schema else None,
         )
 
         task_dict = json.loads(response)
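
The widened json_target_type only matters on the fallback path, where the backend lacks response-schema support and the parsed JSON is type-checked against that target. A sketch of why List[str] is needed, using a made-up sketch payload (the keys below are hypothetical, not the actual CodingSketch schema):

from typing import Dict, List

# Old target: Dict[str, str | Dict[str, str]] -- a list-valued field would fail validation.
NewTarget = Dict[str, str | List[str] | Dict[str, str]]

sketch: NewTarget = {
    "plan": "train a gradient-boosting baseline",
    "steps": ["load data", "fit model", "write submission.csv"],  # list value needs List[str]
    "files": {"main.py": "entry point"},
}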
