
Commit fd039f1

fix: add metric in scores.csv and avoid reading sample_submission.csv (#1152)
* add scores.csv metric name in both task_gen and coder
* a little fix to column names
* small fix
* avoid sample submission read in task_gen
* avoid sample_submission reading in coding
* code change summary bug fix
* little update
* little refinement to eval
* refine coder and runner eval prompts

Co-authored-by: Xu Yang <[email protected]>
1 parent 06233cb commit fd039f1

File tree: 9 files changed, +68 -122 lines changed


rdagent/components/coder/data_science/pipeline/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -83,6 +83,7 @@ def implement_one_task(
             package_info=target_task.package_info,
             enable_model_dump=DS_RD_SETTING.enable_model_dump,
             enable_debug_mode=DS_RD_SETTING.sample_data_by_LLM,
+            spec=T("scenarios.data_science.share:component_spec.Pipeline").r(metric_name=self.scen.metric_name),
         )
         user_prompt = T(".prompts:pipeline_coder.user").r(
             competition_info=competition_info,
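
For context, a minimal sketch of how a rendered spec can consume the injected `metric_name`. The real `component_spec.Pipeline` template lives under `scenarios/data_science/share` and its wording is not shown in this diff, so the template text below is an assumption for illustration only.

```python
# Hypothetical sketch: how a Jinja2 spec template might use the injected metric name.
# The actual component_spec.Pipeline template content is not part of this diff.
from jinja2 import Template

spec_template = Template(
    "Write the evaluation results to scores.csv and use the exact competition "
    "metric name '{{ metric_name }}' for the score column."
)
print(spec_template.render(metric_name="AUC"))
```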

rdagent/components/coder/data_science/pipeline/eval.py

Lines changed: 7 additions & 8 deletions
@@ -100,11 +100,6 @@ def evaluate(
             else:
                 stdout += "Debug mode did not provide debug_time or estimated_time, it's a buggy implementation.\n"

-        test_eval = get_test_eval()
-        if test_eval.enabled(self.scen.competition):
-            submission_check_out, submission_ret_code = test_eval.valid(self.scen.competition, implementation)
-            stdout += f"\n### Submission check:\n{submission_check_out}\nIf Submission check returns a 'Submission is valid' or similar message, despite some warning messages, you should still consider the submission as valid and give a positive final decision. "
-
         score_fp = implementation.workspace_path / "scores.csv"
         score_ret_code = 0
         score_check_text = ""
@@ -141,7 +136,11 @@ def evaluate(
                 score_check_text += f"\n[Error] in checking the scores.csv file: {e}\nscores.csv's content:\n-----\n{score_fp.read_text()}\n-----"
                 score_ret_code = 1

-        if not test_eval.is_sub_enabled(self.scen.competition):
+        test_eval = get_test_eval()
+        if DS_RD_SETTING.sample_data_by_LLM and test_eval.enabled(self.scen.competition):
+            submission_check_out, submission_ret_code = test_eval.valid(self.scen.competition, implementation)
+            stdout += f"\n### Submission check:\n{submission_check_out}\nIf Submission check returns a 'Submission is valid' or similar message, despite some warning messages, you should still consider the submission as valid and give a positive final decision. "
+        elif not test_eval.is_sub_enabled(self.scen.competition):
             submission_ret_code = 0
         else:
             # Check submission file
@@ -167,14 +166,14 @@ def evaluate(
         system_prompt = T(".prompts:pipeline_eval.system").r(
             is_sub_enabled=test_eval.is_sub_enabled(self.scen.competition),
             debug_mode=DS_RD_SETTING.sample_data_by_LLM,
-            mle_check=(DS_RD_SETTING.sample_data_by_LLM and test_eval.is_sub_enabled(self.scen.competition)),
+            mle_check=DS_RD_SETTING.sample_data_by_LLM,
             queried_similar_successful_knowledge=queried_similar_successful_knowledge,
         )
         user_prompt = T(".prompts:pipeline_eval.user").r(
             scenario=self.scen.get_scenario_all_desc(eda_output=eda_output),
             task_desc=target_task.get_task_information(),
             stdout=stdout.strip(),
-            spec=T("scenarios.data_science.share:component_spec.Pipeline").r(),
+            spec=T("scenarios.data_science.share:component_spec.Pipeline").r(metric_name=self.scen.metric_name),
             code=implementation.file_dict["main.py"],
         )
         wfb = build_cls_from_json_with_retry(
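
As a rough illustration of what the scores.csv check around `score_check_text` guards against, the hedged sketch below shows one way such a check could be written. The function name, column layout, and error strings are assumptions for illustration, not the code in this commit.

```python
# Hypothetical sketch of a scores.csv metric-name check; names and layout are illustrative only.
from pathlib import Path

import pandas as pd


def check_scores_csv(workspace_path: Path, metric_name: str) -> tuple[int, str]:
    """Return (ret_code, message); a non-zero ret_code means the check failed."""
    score_fp = workspace_path / "scores.csv"
    if not score_fp.exists():
        return 1, "[Error] scores.csv was not generated."
    scores = pd.read_csv(score_fp)
    if metric_name not in scores.columns:
        return 1, f"[Error] scores.csv does not contain the competition metric '{metric_name}'."
    return 0, ""
```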

rdagent/components/coder/data_science/pipeline/prompts.yaml

Lines changed: 28 additions & 47 deletions
@@ -26,7 +26,7 @@ pipeline_coder:
    {% include "scenarios.data_science.share:spec.hyperparameter" %}

    # Specification your code should follow
-    {% include "scenarios.data_science.share:component_spec.Pipeline" %}
+    {{ spec }}

    {% if queried_former_failed_knowledge|length != 0 %}
    ## Previous Failed Attempts
@@ -112,10 +112,10 @@ pipeline_coder:
    ```
    In debug mode, your code should run faster, so the environment will set a shorter time limit than the standard time limit for your code.
    For example, you can sample ten percent of the training data and run for one epoch, then the full run with ten epochs will take one hundred times the time taken for the debug run. The scale is calculated by yourself depending on the data sampling and epoch number you choose. If your full run enables early stopping, the scale should be smaller considering the early stopping will stop the training earlier than the full epochs.
-    Be careful about the train-valid split strategy. StratifiedShuffleSplit is highly risk since the data has some categories with only one sample. If you use StratifiedShuffleSplit, you should consider using a try-except block to catch the error and use a different split strategy if the error occurs. Example code:
+    Be careful about the train-valid split strategy. Stratified related split is highly risk since the data has some categories with only one sample. If you use Stratified related split, you should consider using a try-except block to catch the error and use a different split strategy if the error occurs. Example code:
    ```python
    try:
-        fold_indices = StratifiedKFold(...).split(train_X, train_y) or StratifiedShuffleSplit(...).split(train_X, train_y)
+        fold_indices = StratifiedKFold(...).split(train_X, train_y) or StratifiedShuffleSplit or StratifiedSubsetSampler etc.
    except Exception as e:
        fold_indices = KFold(...).split(train_X, train_y) or other split strategy
    ```
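
Since the prompt's inline snippet is pseudo-code, here is a runnable sketch of the try/except fallback pattern it describes, assuming scikit-learn and a classification target; the helper name and parameters are illustrative, not taken from the repository.

```python
# Runnable sketch of the split-fallback pattern described above (illustrative only).
from sklearn.model_selection import KFold, StratifiedKFold


def make_fold_indices(train_X, train_y, n_splits: int = 5):
    try:
        # Stratified splitting raises ValueError when a class has fewer samples than n_splits.
        splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
        return list(splitter.split(train_X, train_y))
    except ValueError:
        # Fall back to a plain KFold (or another strategy) when stratification is impossible.
        splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)
        return list(splitter.split(train_X))
```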
@@ -206,10 +206,9 @@ pipeline_eval:
    3. A code implementation and its execution output.
    Your task is to rigorously evaluate the code implementation against the provided scenario and task description, ensuring it meets all requirements, adheres to the specified structure, and executes successfully.

-    {% if is_sub_enabled %}
-    ## Evaluation Steps
+    ## Evaluation Aspects

-    ### Step 1: Execution Success
+    ### Execution Success
    - Goal: Ensure the code executes successfully without any errors.
    - Notes:
      - Model performance is not evaluated in this step; focus solely on successful execution.
@@ -219,22 +218,7 @@ pipeline_eval:
    - If the code does not execute successfully:
      - Set the "final_decision" to false and write complete analysis in the "execution" field.

-    ### Step 2: Submission File Authenticity and Format
-    - Goal: Verify that the code correctly generates the final submission in the expected format and that the submission is authentic.
-    - Guidelines:
-      - The submission file must strictly match the required structure (correct columns, index format, data types). The index names and column names must be identical to the sample submission.
-      - Rigorously verify that the submission file was produced by genuine model inference and successful code execution, not by cheating, fallback or exception-handling mechanisms.
-      - The submission must be generated from genuine model predictions using the best saved model—never empty, constant, random, or hard-coded values.
-      - Submissions must reflect authentic model outputs; any form of fabrication, cheating, or simulated results is strictly prohibited and grounds for rejection.
-      - Cross-check both code logic and stdout to ensure predictions originate from real model inference, not from error recovery or placeholder code paths.
-      - Only check the format of the submission since only part of the data is provided; the submission might have a different index than the sample submission data.
-      - Verify honest failure reporting if training issues occur.
-    - If the code passes this step:
-      - Proceed to Step 3.
-    - If the code does not pass this step:
-      - Set the "final_decision" to false and clearly document the issues in the "return_checking" field.
-
-    ### Step 3: Competition Alignment
+    ### Competition Alignment
    - Goal: Confirm strict adherence to the competition's evaluation rules and experimental setup.
    - Guidelines:
      - Analyze whether the experimental setup and code may cause misalignment between validation and test performance.
@@ -251,7 +235,7 @@ pipeline_eval:
      - Begin the "code" with `[Evaluation error]`, explicitly document any evaluation alignment issues causing experiment failure.

    {% if debug_mode %}
-    ### Step 4: Debug Mode Compliance
+    ### Debug Mode Compliance
    - Goal: Ensure the code follows debug mode requirements.
    - Guidelines:
      - Sufficient debugging information (print statements, clear error messages) should be included to facilitate automatic improvement processes.
@@ -263,15 +247,31 @@ pipeline_eval:
      - Debug time should be reasonable and the estimated time should be reasonable based on the debug time.
      - Data sampling should only be applied in debug mode. Always use the full data in the full run.
      - The label classes number should be the same as the full run even in debug mode.
-    - If the code passes this step: Finalize evaluation.
+    - If the code passes this step: Proceed to Next Aspects.
    - If the code does not pass this step: Clearly document the debug mode compliance issues and reject the implementation.{% endif %}

+
+    ### Submission File Format Check
    {% if mle_check %}
-    ### Step 5: Test format check
    - The user has done a format check for your submission. Since you didn't sample any test data, your debug mode output should be the same format as the full run.
    - The user will put the check result in the "Submission check" section of the execution output.
    - If the submission check returns a 'Submission is valid' or similar message, despite some warning messages, you should give the conclusion that the code executed successfully. If no other code related issues are found, set the "final_decision" to true.
    - If the submission check returns an error message, you should set the "final_decision" to false and clearly document the issues in the "return_checking" field.
+    {% elif is_sub_enabled %}
+    - Goal: Verify that the code correctly generates the final submission in the expected format and that the submission is authentic.
+    - Guidelines:
+      - The submission file must strictly match the required structure (correct columns, index format, data types). The index names and column names must be identical to the format specified in the Competition Information's '====== Submission Format ======' section.
+      - Rigorously verify that the submission file was produced by genuine model inference and successful code execution, not by cheating, fallback or exception-handling mechanisms.
+      - The submission must be generated from genuine model predictions using the best saved model—never empty, constant, random, or hard-coded values.
+      - Submissions must reflect authentic model outputs; any form of fabrication, cheating, or simulated results is strictly prohibited and grounds for rejection.
+      - Cross-check both code logic and stdout to ensure predictions originate from real model inference, not from error recovery or placeholder code paths.
+      - Only check the format of the submission since only part of the data is provided; the submission might have a different index than expected due to data sampling.
+      - Verify honest failure reporting if training issues occur.
+    - If the code passes this step, Finalize evaluation.
+    - If the code does not pass this step:
+      - Set the "final_decision" to false and clearly document the issues in the "return_checking" field.
+    {% else %}
+    Submission File Format Check is not conducted since no target submission format is provided. You should consider this submission file is valid.
    {% endif %}

    {% if queried_similar_successful_knowledge|length != 0 %}
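
To make the three-way branching above concrete, here is a small sketch of how the `mle_check` and `is_sub_enabled` flags select which check text gets rendered; the template string is a toy stand-in, not the actual prompts.yaml content.

```python
# Toy sketch (assumption): how mle_check / is_sub_enabled pick one of the three branches above.
from jinja2 import Template

toy = Template(
    "{% if mle_check %}Use the user's submission check result."
    "{% elif is_sub_enabled %}Check submission format and authenticity yourself."
    "{% else %}No submission format check is conducted.{% endif %}"
)
print(toy.render(mle_check=False, is_sub_enabled=True))
```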
@@ -290,35 +290,16 @@ pipeline_eval:
    Please respond with your feedback in the following JSON format without anything else.
    ```json
    {
-        "execution": "Describe whether the code executed successfully, correctly integrating all components and generating the final submission. Include any errors or issues encountered, and append all error messages and full traceback details without summarizing or omitting any information. If errors occurred, analyze the root causes: (1) Are they fundamental algorithmic/approach issues, or (2) Implementation details that can be easily fixed, or (3) Environment/dependency problems?",
-        "return_checking": "Examine the generated files by cross-referencing the code logic and stdout output. Verify: (1) Format matches sample submission (index, column names, CSV content); (2) **File generation authenticity**: Is the file genuinely produced by successful model execution, or is it a result of exception handling/fallback mechanisms? Cite specific code sections and stdout evidence.",
+        "execution": "Describe whether the code executed successfully. Include any errors or issues encountered, and append all error messages and full traceback details without summarizing or omitting any information. If errors occurred, analyze the root causes: (1) Are they fundamental algorithmic/approach issues, or (2) Implementation details that can be easily fixed, or (3) Environment/dependency problems?",
+        "return_checking": "Examine the generated files by cross-referencing the code logic and stdout output. Verify: (1) Format matches required submission format (index, column names, CSV content); (2) **File generation authenticity**: Is the file genuinely produced by successful model execution, or is it a result of exception handling/fallback mechanisms? Cite specific code sections and stdout evidence.",
        "code": "Begin explicitly with [Code analysis] or [Evaluation error]. Provide structured analysis: (1) **Technical Appropriateness**: Does the chosen approach (algorithms, data processing, validation strategy) match this problem's data characteristics and competition requirements? (2) **Effective Components**: What specific parts work well and why are they effective for this problem type? (3) **Issues & Improvements**: Identify concrete problems and suggest actionable improvement directions (without providing actual code). (4) **Code Quality**: Assess readability, structure, and adherence to specifications.",
        "final_decision": <true/false>
    }
    ```
-    {% else %}
-    ## Evaluation Scope
-    Your focus is to check whether the workflow code executes successfully.

-    You will be given the execution output (`stdout`) to determine correctness.
-
-    [Note]
-    1. Model performance is NOT a concern in this evaluation—only correct execution and formatting matter.
-
-    Please respond with your feedback in the following JSON format and order
-    ```json
-    {
-        "execution": "Describe whether the code executed successfully. Include any errors or issues encountered, and append all error messages and full traceback details without summarizing or omitting any information. If errors occurred, analyze the root causes: (1) Are they fundamental algorithmic/approach issues, or (2) Implementation details that can be easily fixed, or (3) Environment/dependency problems?",
-        "return_checking": "Describe the expected file to be generated.",
-        "code": "Provide structured analysis: (1) **Technical Appropriateness**: Does the chosen approach (algorithms, data processing, validation strategy) match this problem's data characteristics and requirements? (2) **Effective Components**: What specific parts work well and why are they effective for this problem type? (3) **Issues & Improvements**: Identify concrete problems and suggest actionable improvement directions (without providing actual code). (4) **Code Quality**: Assess readability, structure, and adherence to specifications.",
-        "final_decision": <true/false>
-    }
-    ```
-    {% endif %}
-    # NOTE: when is_sub_enabled == False, we don't have any checking about the return. So it is just placeholder currently

  user: |-
-    # Competition Scenario
+    # Competition Information
    {{ scenario }}

    # Task Description

rdagent/scenarios/data_science/dev/runner/eval.py

Lines changed: 2 additions & 2 deletions
@@ -165,7 +165,7 @@ def evaluate(

        if test_eval.enabled(self.scen.competition):
            submission_check_out, submission_ret_code = test_eval.valid(self.scen.competition, implementation)
-            stdout += f"\nSubmission check:\n{submission_check_out}\nIf Submission check returns a 'Submission is valid' or similar message, despite some warning messages, you should still consider the submission as valid and give a positive final decision. "
+            stdout += f"\n### Submission check:\n{submission_check_out}\nIf Submission check returns a 'Submission is valid' or similar message, despite some warning messages, you should still consider the submission as valid and give a positive final decision. "

        time_spent_ratio = implementation.running_info.running_time / env.conf.running_timeout_period
        if (
@@ -179,12 +179,12 @@ def evaluate(

        system_prompt = T(".prompts:DSCoSTEER_eval.system").r(
            scenario=self.scen.get_scenario_all_desc(eda_output=implementation.file_dict.get("EDA.md", None)),
-            is_sub_enabled=test_eval.is_sub_enabled(self.scen.competition),
            task_desc=target_task.get_task_information(),
            enable_hyperparameter_tuning_check=enable_hyperparameter_tuning_check,
        )
        user_prompt = T(".prompts:DSCoSTEER_eval.user").r(
            code=implementation.all_codes,
+            change_summary=implementation.change_summary,
            stdout=shrink_text(stdout),
            time_spent=f"{implementation.running_info.running_time:.2f} seconds",
            timeout=f"{env.conf.running_timeout_period} seconds",
