
Commit 56ba15a

fix: refine prompt, equal lightgbm, discourage over hypertuning (#1072)
* feat: add mount_path parameter to run command
* feat: add runtime environment info and dynamic timeouts to DS runners
1 parent 9554f40 commit 56ba15a

9 files changed, +31 −5 lines changed


rdagent/app/utils/ws.py

Lines changed: 4 additions & 1 deletion
@@ -10,7 +10,7 @@
 
 
 @app.command()
-def run(competition: str, cmd: str, local_path: str = "./"):
+def run(competition: str, cmd: str, local_path: str = "./", mount_path: str | None = None):
     """
     Launch the data-science environment for a specific competition and run the
     provided command.
@@ -44,6 +44,9 @@ def run(competition: str, cmd: str, local_path: str = "./"):
         enable_cache=False,
     )
 
+    if mount_path is not None:
+        env.conf.mount_path = mount_path
+
     env.run(entry=cmd, local_path=local_path)
 
 

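The new `mount_path` argument is optional and only overrides `env.conf.mount_path` when given. A minimal sketch of calling the updated command programmatically; the competition name, command, and mount point below are illustrative, not from the commit:

```python
from rdagent.app.utils.ws import run

# Without mount_path, the environment keeps its configured default mount point.
run(competition="some-competition", cmd="python main.py", local_path="./workspace")

# With mount_path, the container mount point is overridden via env.conf.mount_path.
run(
    competition="some-competition",
    cmd="python main.py",
    local_path="./workspace",
    mount_path="/kaggle/input",
)
```
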
rdagent/components/coder/data_science/conf.py

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ class DSCoderCoSTEERSettings(CoSTEERSettings):
     class Config:
         env_prefix = "DS_Coder_CoSTEER_"
 
-    max_seconds: int = 2400
+    max_seconds: int = DS_RD_SETTING.debug_timeout * 4
     env_type: str = "docker"
     # TODO: extract a function for env and conf.
 

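The fixed 2400-second cap becomes a default derived from `DS_RD_SETTING.debug_timeout`, so the coder timeout scales with the scenario configuration. A minimal sketch (not the repo's actual classes) of how such a derived default behaves alongside the `DS_Coder_CoSTEER_` env prefix, assuming pydantic v1-style `BaseSettings` and a hypothetical `debug_timeout` of 600 seconds:

```python
import os

from pydantic import BaseSettings  # pydantic v1-style settings assumed


class FakeDSRDSetting(BaseSettings):
    debug_timeout: int = 600  # hypothetical value, in seconds


DS_RD_SETTING = FakeDSRDSetting()


class FakeCoderSettings(BaseSettings):
    class Config:
        env_prefix = "DS_Coder_CoSTEER_"

    # Derived default: 4x the debug timeout instead of a hard-coded 2400.
    max_seconds: int = DS_RD_SETTING.debug_timeout * 4


print(FakeCoderSettings().max_seconds)  # 2400 when debug_timeout == 600

# An env-prefixed variable still overrides the derived default.
os.environ["DS_Coder_CoSTEER_max_seconds"] = "900"
print(FakeCoderSettings().max_seconds)  # 900
```
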
rdagent/components/coder/data_science/pipeline/prompts.yaml

Lines changed: 7 additions & 0 deletions
@@ -82,6 +82,7 @@ pipeline_coder:
     ```
     In debug mode, you should only sample ten percent of the training data and run the minimum epochs to quickly test the correctness of the code.
     In debug mode, you should implement a timer to measure the time taken for your debug configuration and estimate the time required for the full run.
+    In debug mode, your code should run faster, so the environment will set a shorter time limit than the standard time limit for your code.
     For example, you can sample ten percent of the training data and run for one epoch, then the full run with ten epochs will take one hundred times the time taken for the debug run. The scale is calculated by yourself depending on the data sampling and epoch number you choose. If your full run enables early stopping, the scale should be smaller considering the early stopping will stop the training earlier than the full epochs.
     You should sample the data after train valid split. When you split the data after sampling, you might get a class with only one sample which might cause the split strategy to fail.
     Your debug code should run exactly the same as the full run, except for the data sampling and epoch number, to ensure the correctness of the code.
@@ -133,7 +134,9 @@ pipeline_coder:
 
     {% if latest_code %}
     # Former code
+    ```
     {{ latest_code }}
+    ```
     {% if latest_code_feedback is not none %}
     ## Feedback to former code
     {{ latest_code_feedback }}
@@ -270,7 +273,11 @@ pipeline_eval:
     {{ spec }}
 
     # Code
+    ```
     {{ code }}
+    ```
 
     ## Execution Output
+    ```
     {{ stdout }}
+    ```

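The added line tells the coder that debug runs get a tighter time limit, which makes the timer-and-extrapolation step mandatory. A minimal sketch of that estimate, using the prompt's own example of a ten percent sample and one epoch versus ten (the training call is a placeholder):

```python
import time

SAMPLE_FRACTION = 0.1  # debug run trains on 10% of the data
DEBUG_EPOCHS = 1
FULL_EPOCHS = 10

start = time.time()
# ... train on the 10% sample for DEBUG_EPOCHS epochs (placeholder) ...
debug_seconds = time.time() - start

# Scale factor from the prompt's example: (1 / 0.1) * (10 / 1) = 100.
# With early stopping enabled in the full run, this is only an upper bound.
scale = (1.0 / SAMPLE_FRACTION) * (FULL_EPOCHS / DEBUG_EPOCHS)
estimated_full_seconds = debug_seconds * scale
print(f"debug: {debug_seconds:.1f}s, estimated full run: {estimated_full_seconds:.1f}s")
```
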
rdagent/core/scenario.py

Lines changed: 6 additions & 0 deletions
@@ -52,6 +52,12 @@ def get_scenario_all_desc(
         The scenario description varies based on the task being performed.
         """
 
+    @abstractmethod
+    def get_runtime_environment(self) -> str:
+        """
+        Get the runtime environment information
+        """
+
     @property
     def experiment_setting(self) -> str | None:
         """Get experiment setting and return as rich text string"""

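Every `Scenario` subclass now has to provide `get_runtime_environment`. A hedged sketch of what a concrete implementation might return; the fields are illustrative and this is not the repository's actual `runtime_info` code:

```python
import os
import platform


class MyScenario:  # would subclass rdagent.core.scenario.Scenario in the real code
    def get_runtime_environment(self) -> str:
        # Report a few basic facts about the machine the generated code will run on.
        return (
            f"OS: {platform.platform()}\n"
            f"Python: {platform.python_version()}\n"
            f"CPU cores: {os.cpu_count()}"
        )
```
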
rdagent/scenarios/data_science/dev/runner/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@ class DSRunnerCoSTEERSettings(DSCoderCoSTEERSettings):
     class Config:
         env_prefix = "DS_Runner_CoSTEER_"
 
-    max_seconds: int = 3600
+    max_seconds: int = DS_RD_SETTING.full_timeout
     env_type: str = "docker"
     # TODO: extract a function for env and conf.
 

rdagent/scenarios/data_science/dev/runner/eval.py

Lines changed: 1 addition & 0 deletions
@@ -133,6 +133,7 @@ def evaluate(
             scenario=self.scen.get_scenario_all_desc(eda_output=implementation.file_dict.get("EDA.md", None)),
             is_sub_enabled=test_eval.is_sub_enabled(self.scen.competition),
             task_desc=target_task.get_task_information(),
+            runtime_environment=self.scen.get_runtime_environment(),
         )
         user_prompt = T(".prompts:DSCoSTEER_eval.user").r(
             code=implementation.all_codes,

rdagent/scenarios/data_science/dev/runner/prompts.yaml

Lines changed: 8 additions & 1 deletion
@@ -9,14 +9,21 @@ DSCoSTEER_eval:
     The task is as follows:
     {{ task_desc }}
 
+    You have following environment to run the code:
+    {{ runtime_environment }}
+
     The whole workflow includes multiple stages, such as:
     - Data loading
     - Feature engineering
     - Model training
     - Ensembling
 
     The user will provide you the time spent on the whole code execution and the timeout of the code execution. You should decide whether the hyperparameter is reasonable based on the time.
-    For example, if the code only spent ten percent of the timeout and the hyperparameter like `n_estimators` or 'epochs' is very small or batch size is small you should suggest to increase these hyperparameter.
+    For example, if the code uses only a small portion of the allowed time, and hyperparameters like `n_estimators` or `epochs` have low values, with early stopping not being triggered and possible signs of underfitting, you should suggest increasing these hyperparameters.
+
+    You should also notice other resources utilization hyper-parameters,
+    For example, if you are using a GPU with large memory, and the batch size is set very low, you should suggest increasing the batch size if it is not reasonable.
+
     Please provide your feedback in two key-value pairs:
     "hyperparameter_tuning_decision": <true/false>
     "hyperparameter_tuning_suggestion": <suggestion in plain text for hyperparameter tuning, e.g., increase n_estimators to 1000, increase epochs to 100, increase batch size to 64, give an empty string if decide not to tune the hyperparameter>

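For reference, an illustrative example (not taken from the repository) of the two key-value pairs this prompt asks for, one for a run that finished far under the time limit and one where no tuning is needed:

```python
# Run used only a small fraction of the timeout and early stopping never fired.
tune = {
    "hyperparameter_tuning_decision": True,
    "hyperparameter_tuning_suggestion": (
        "Only ~10% of the timeout was used and early stopping was not triggered; "
        "increase n_estimators to 1000 and raise the batch size to 64."
    ),
}

# Hyperparameters already look reasonable for the available time and hardware.
no_tune = {
    "hyperparameter_tuning_decision": False,
    "hyperparameter_tuning_suggestion": "",
}
```
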
rdagent/scenarios/data_science/scen/runtime_info.py

Lines changed: 2 additions & 0 deletions
@@ -62,6 +62,8 @@ def get_gpu_info():
     "numpy",
     "scikit-learn",
     "scipy",
+    "xgboost",
+    "sklearn",
     "lightgbm",
     "vtk",
     "opencv-python",

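The diff extends a list of package names, presumably the ones whose installed versions are reported as part of the runtime environment. A hedged sketch of how such versions could be collected with `importlib.metadata`; it mirrors the apparent intent of `runtime_info.py` but is not its actual code:

```python
from importlib import metadata

PACKAGES = ["numpy", "scikit-learn", "scipy", "xgboost", "sklearn", "lightgbm", "vtk", "opencv-python"]


def package_versions(names: list[str]) -> dict[str, str]:
    """Map each distribution name to its installed version, or a marker if missing."""
    versions: dict[str, str] = {}
    for name in names:
        try:
            versions[name] = metadata.version(name)
        except metadata.PackageNotFoundError:
            # e.g. "sklearn" is usually not a real distribution name; scikit-learn is.
            versions[name] = "not installed"
    return versions


print(package_versions(PACKAGES))
```
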
rdagent/scenarios/data_science/test_eval.py

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@ def valid(self, competition: str, workspace: FBWorkspace) -> tuple[str, int]:
 
     @abstractmethod
     def enabled(self, competition) -> bool:
-        """able to eval or not"""
+        """support `eval` & `valid` or not"""
 
     @abstractmethod
     def get_sample_submission_name(self, competition: str) -> str:
