fix: split then sample & remove simple model guide in ds proposal (#1034)

peteryang1 · peteryangms · jingyuanlm · web-flow · commit 2dde8b84a1d0 · 2025-07-09T15:47:37.000+08:00
* fix code timeout & split_then_sample

* change code

* change-prompts_v2

* remove more simple guidance in proposal

---------

Co-authored-by: Xu Yang <xuyang1@microsoft.com>
Co-authored-by: jingyuanlm <842442862@qq.com>
diff --git a/rdagent/components/coder/data_science/conf.py b/rdagent/components/coder/data_science/conf.py
@@ -27,9 +27,7 @@ class Config:
 def get_ds_env(
     conf_type: Literal["kaggle", "mlebench"] = "kaggle",
     extra_volumes: dict = {},
-    running_timeout_period: int = (
-        DS_RD_SETTING.debug_timeout if not DS_RD_SETTING.sample_data_by_LLM else DS_RD_SETTING.full_timeout
-    ),
+    running_timeout_period: int = DS_RD_SETTING.debug_timeout,
 ) -> Env:
     """
     Retrieve the appropriate environment configuration based on the env_type setting.
diff --git a/rdagent/components/coder/data_science/pipeline/__init__.py b/rdagent/components/coder/data_science/pipeline/__init__.py
@@ -22,10 +22,7 @@
     - Each coder could be tested.
 """
 
-import json
-import re
 from pathlib import Path
-from typing import Dict
 
 from rdagent.app.data_science.conf import DS_RD_SETTING
 from rdagent.components.coder.CoSTEER import CoSTEER
@@ -39,14 +36,8 @@
 from rdagent.components.coder.CoSTEER.knowledge_management import (
     CoSTEERQueriedKnowledge,
 )
-from rdagent.components.coder.data_science.conf import (
-    DSCoderCoSTEERSettings,
-    get_ds_env,
-)
+from rdagent.components.coder.data_science.conf import DSCoderCoSTEERSettings
 from rdagent.components.coder.data_science.pipeline.eval import PipelineCoSTEEREvaluator
-from rdagent.components.coder.data_science.raw_data_loader.eval import (
-    DataLoaderCoSTEEREvaluator,
-)
 from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask
 from rdagent.components.coder.data_science.share.eval import ModelDumpEvaluator
 from rdagent.core.exception import CoderError
diff --git a/rdagent/components/coder/data_science/pipeline/prompts.yaml b/rdagent/components/coder/data_science/pipeline/prompts.yaml
@@ -76,9 +76,10 @@ pipeline_coder:
     ```bash
     python main.py --debug
     ```
-    In debug mode, you should only sample ten percent of the data and run the minimum epochs to quickly test the correctness of the code.
+    In debug mode, you should only sample ten percent of the training data and run the minimum epochs to quickly test the correctness of the code.
     In debug mode, you should implement a timer to measure the time taken for your debug configuration and estimate the time required for the full run.
-    For example, you can sample ten percent of the data and run for one epoch, then the full run with ten epochs will take one hundred times the time taken for the debug run. The scale is calculated by yourself depending on the data sampling and epoch number you choose. If your full run enables early stopping, the scale should be smaller considering the early stopping will stop the training earlier than the full epochs.
+    For example, you can sample ten percent of the training data and run for one epoch, then the full run with ten epochs will take one hundred times the time taken for the debug run. The scale is calculated by yourself depending on the data sampling and epoch number you choose. If your full run enables early stopping, the scale should be smaller considering the early stopping will stop the training earlier than the full epochs.
+    You should sample the data after train valid split. When you split the data after sampling, you might get a class with only one sample which might cause the split strategy to fail.
     Your debug code should run exactly the same as the full run, except for the data sampling and epoch number, to ensure the correctness of the code.
     You should print total time and estimated time in standard output using print function in the following schema:
     === Start of Debug Information ===
diff --git a/rdagent/scenarios/data_science/dev/runner/__init__.py b/rdagent/scenarios/data_science/dev/runner/__init__.py
@@ -1,6 +1,3 @@
-from pathlib import Path
-from typing import Dict
-
 import pandas as pd
 
 from rdagent.app.data_science.conf import DS_RD_SETTING
@@ -17,7 +14,6 @@
     MultiProcessEvolvingStrategy,
 )
 from rdagent.components.coder.CoSTEER.task import CoSTEERTask
-from rdagent.components.coder.data_science.conf import get_ds_env
 from rdagent.components.coder.data_science.share.eval import ModelDumpEvaluator
 from rdagent.core.exception import RunnerError
 from rdagent.core.scenario import Scenario
@@ -26,7 +22,6 @@
 from rdagent.scenarios.data_science.dev.runner.eval import DSCoSTEERCoSTEEREvaluator
 from rdagent.utils.agent.ret import PythonBatchEditOut
 from rdagent.utils.agent.tpl import T
-from rdagent.utils.env import DockerEnv, MLEBDockerConf
 
 
 class DSRunnerMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy):
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml b/rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml