
Commit 06233cb

peteryang1 and Xu authored
feat: enhance timeout handling in CoSTEER and DataScience scenarios (#1150)
* add prev loops to runner history
* fix evolving history
* fix bug on initializing feedback without final decision
* reformat
* refine
* add comments
* feat: enhance timeout handling in CoSTEER and DataScience scenarios

---------

Co-authored-by: Xu <[email protected]>
1 parent 8de9f75 commit 06233cb

File tree

6 files changed: +24 -6 lines


rdagent/components/coder/CoSTEER/__init__.py

Lines changed: 5 additions & 2 deletions
@@ -92,6 +92,7 @@ def develop(self, exp: Experiment) -> Experiment:
         # Evolving the solution
         start_datetime = datetime.now()
         fallback_evo_exp = None
+        reached_max_seconds = False
         for evo_exp in self.evolve_agent.multistep_evolve(evo_exp, self.evaluator):
             assert isinstance(evo_exp, Experiment)  # multiple inheritance
             if self._get_last_fb().is_acceptable():
@@ -103,6 +104,7 @@ def develop(self, exp: Experiment) -> Experiment:
                 logger.info(f"evolving workspace: {sw}")
             if self.max_seconds is not None and (datetime.now() - start_datetime).seconds > self.max_seconds:
                 logger.info(f"Reached max time limit {self.max_seconds} seconds, stop evolving")
+                reached_max_seconds = True
                 break
             if RD_Agent_TIMER_wrapper.timer.started and RD_Agent_TIMER_wrapper.timer.is_timeout():
                 logger.info("Global timer is timeout, stop evolving")
@@ -111,13 +113,14 @@ def develop(self, exp: Experiment) -> Experiment:
         # if the final feedback is not finished(therefore acceptable), we will use the fallback solution.
         try:
             evo_exp = self._exp_postprocess_by_feedback(evo_exp, self._get_last_fb())
-        except CoderError:
+        except CoderError as e:
             if fallback_evo_exp is not None:
                 logger.info("Fallback to the fallback solution.")
                 evo_exp = fallback_evo_exp
                 evo_exp.recover_ws_ckp()  # NOTE: recovering checkpoints for restoring files in the workspace to prevent inplace mutation.
             else:
-                raise
+                e.caused_by_timeout = reached_max_seconds
+                raise e

         exp.sub_workspace_list = evo_exp.sub_workspace_list
         exp.experiment_workspace = evo_exp.experiment_workspace
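The pattern this hunk introduces: remember whether the evolving loop stopped because it exhausted its per-develop time budget, and, if no acceptable solution or fallback exists, tag the raised `CoderError` so callers can tell a timeout-driven failure from an ordinary coding failure. A minimal standalone sketch of the same pattern with simplified names (this is not the actual `CoSTEER.develop` body):

```python
from datetime import datetime


class CoderError(Exception):
    # Mirrors rdagent.core.exception.CoderError after this commit.
    caused_by_timeout: bool = False


def develop_with_deadline(steps, max_seconds=None):
    """Run evolution steps until one succeeds or the time budget runs out."""
    start = datetime.now()
    reached_max_seconds = False
    for step in steps:
        result = step()
        if result is not None:          # an acceptable solution was produced
            return result
        if max_seconds is not None and (datetime.now() - start).total_seconds() > max_seconds:
            reached_max_seconds = True  # remember *why* we stopped evolving
            break
    err = CoderError("no acceptable solution produced")
    err.caused_by_timeout = reached_max_seconds  # let callers branch on the cause
    raise err
```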

rdagent/components/coder/data_science/pipeline/prompts.yaml

Lines changed: 7 additions & 1 deletion
@@ -112,7 +112,13 @@ pipeline_coder:
    ```
    In debug mode, your code should run faster, so the environment will set a shorter time limit than the standard time limit for your code.
    For example, you can sample ten percent of the training data and run for one epoch, then the full run with ten epochs will take one hundred times the time taken for the debug run. The scale is calculated by yourself depending on the data sampling and epoch number you choose. If your full run enables early stopping, the scale should be smaller considering the early stopping will stop the training earlier than the full epochs.
-    Be careful about the train-valid split strategy. StratifiedShuffleSplit is highly risk since the data has some categories with only one sample. If you use StratifiedShuffleSplit, you should consider using a try-except block to catch the error and use a different split strategy if the error occurs.
+    Be careful about the train-valid split strategy. StratifiedShuffleSplit is highly risk since the data has some categories with only one sample. If you use StratifiedShuffleSplit, you should consider using a try-except block to catch the error and use a different split strategy if the error occurs. Example code:
+    ```python
+    try:
+        fold_indices = StratifiedKFold(...).split(train_X, train_y) or StratifiedShuffleSplit(...).split(train_X, train_y)
+    except Exception as e:
+        fold_indices = KFold(...).split(train_X, train_y) or other split strategy
+    ```
    You should sample the data after train valid split. When you split the data after sampling, you might get a class with only one sample which might cause the split strategy to fail.
    Your debug code should run exactly the same as the full run, except for the data sampling and epoch number, to ensure the correctness of the code.
    You should print total time and estimated time in standard output using print function in the following schema:
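The snippet added to the prompt above is deliberately pseudocode. A runnable version of the same fallback, assuming scikit-learn and a toy dataset with a singleton class (the `train_X`/`train_y` values here are placeholders, not from the repository):

```python
import numpy as np
from sklearn.model_selection import KFold, StratifiedShuffleSplit

train_X = np.random.rand(20, 3)
train_y = np.array([0] * 10 + [1] * 9 + [2])  # class "2" has only one sample

try:
    # Stratified splitting raises ValueError when a class has fewer than 2 members.
    splitter = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    fold_indices = list(splitter.split(train_X, train_y))
except ValueError:
    # Fall back to a plain KFold split that ignores class balance.
    fold_indices = list(KFold(n_splits=5, shuffle=True, random_state=0).split(train_X, train_y))

for train_idx, valid_idx in fold_indices:
    pass  # train / validate on each fold here
```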

rdagent/core/exception.py

Lines changed: 1 addition & 0 deletions
@@ -20,6 +20,7 @@ class CoderError(WorkflowError):
     """

     # NOTE: it corresponds to the error of **component**
+    caused_by_timeout: bool = False  # whether the error is caused by timeout


 class CodeFormatError(CoderError):
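`caused_by_timeout` is declared as a class-level default, so every existing `raise CoderError(...)` site keeps its current behaviour (the flag reads as `False`); only callers that know the failure came from a deadline overwrite it on the instance, as the CoSTEER change above does. A small illustrative sketch, not taken from the repository:

```python
class WorkflowError(Exception):
    pass


class CoderError(WorkflowError):
    # NOTE: it corresponds to the error of **component**
    caused_by_timeout: bool = False  # class-level default; whether the error is caused by timeout


err = CoderError("develop failed")
assert err.caused_by_timeout is False   # plain raise sites see the class default
err.caused_by_timeout = True            # a caller that hit its deadline tags the instance
assert err.caused_by_timeout is True
```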

rdagent/scenarios/data_science/loop.py

Lines changed: 9 additions & 1 deletion
@@ -221,7 +221,15 @@ def record(self, prev_out: dict[str, Any]):
         else:
             exp: DSExperiment = prev_out["direct_exp_gen"] if isinstance(e, CoderError) else prev_out["coding"]
             # TODO: distinguish timeout error & other exception.
-            if isinstance(self.trace.scen, DataScienceScen) and DS_RD_SETTING.allow_longer_timeout:
+            if (
+                isinstance(self.trace.scen, DataScienceScen)
+                and DS_RD_SETTING.allow_longer_timeout
+                and isinstance(e, CoderError)
+                and e.caused_by_timeout
+            ):
+                logger.info(
+                    f"Timeout error occurred: {e}. Increasing timeout for the current scenario from {self.trace.scen.timeout_increase_count} to {self.trace.scen.timeout_increase_count + 1}."
+                )
                 self.trace.scen.increase_timeout()

         # set the local selection to the trace as global selection, then set the DAG parent for the trace
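Read together with the CoSTEER change, the tightened guard means the scenario's time budget only grows when the coder actually ran out of time; other `CoderError`s fall through unchanged, since more time would not fix a logic bug. A simplified sketch of that decision (not the actual `record` implementation; `scen` and `allow_longer_timeout` stand in for `self.trace.scen` and `DS_RD_SETTING.allow_longer_timeout`):

```python
from rdagent.core.exception import CoderError


def maybe_extend_timeout(e: Exception, scen, allow_longer_timeout: bool) -> None:
    """Grow the scenario's time budget only for timeout-driven coder failures."""
    if allow_longer_timeout and isinstance(e, CoderError) and getattr(e, "caused_by_timeout", False):
        scen.increase_timeout()  # the next attempt gets a longer budget
    # any other failure: leave the timeout alone
```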

rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml

Lines changed: 1 addition & 1 deletion
@@ -430,7 +430,7 @@ task_gen:
    9. **Preferred Packages Notes**:
      - You can choose the most proper packages for the task to best achieve the hypothesis.
      - When facing a choice between two packages which both can achieve the same goal, you should choose the one which is more commonly used and less likely to cause bugs in coding. Especially those you are not familiar with.
-      - For GBDT models, prefer XGBoost or RandomForest over LightGBM unless the SOTA or hypothesis dictates otherwise.
+      - For GBDT models, prefer XGBoost or RandomForest over LightGBM unless the SOTA or hypothesis dictates otherwise. Prefer not using GPU for GBDT models unless the SOTA or hypothesis dictates otherwise.
      - For neural networks, prefer PyTorch or PyTorch based library (over TensorFlow) unless the SOTA or hypothesis dictates otherwise.
      - For neural networks, prefer fine-tuning pre-trained models over training from scratch.
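For illustration of the added GBDT guidance only, a minimal CPU-only XGBoost setup (assuming xgboost >= 2.0, where `device` is a constructor parameter; this example is not part of the commit):

```python
import xgboost as xgb

# Histogram-based training on CPU; no GPU requested, per the updated guideline.
model = xgb.XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    tree_method="hist",
    device="cpu",
    random_state=0,
)
# model.fit(train_X, train_y)  # train_X / train_y come from the chosen split strategy
```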

rdagent/scenarios/data_science/scen/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -60,6 +60,7 @@ def __init__(self, competition: str) -> None:
         self.metric_direction: bool = (
             self._get_direction()
         )  # True indicates higher is better, False indicates lower is better
+        self.timeout_increase_count = 0

     def reanalyze_competition_description(self):
         self.raw_description = self._get_description()
@@ -114,7 +115,6 @@ def _analysis_competition_description(self):
         self.longer_time_limit_required = response_json_analysis.get(
             "Longer time limit required", False
         )  # True or False, whether the competition scenario requires a longer time limit to the code.
-        self.timeout_increase_count = 0

     def real_debug_timeout(self):
         return (
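Moving `timeout_increase_count = 0` from `_analysis_competition_description` into `__init__` makes the counter part of the scenario's initial state, so re-analyzing the competition description no longer resets it and `increase_timeout()` can always assume it exists. The `increase_timeout()` and `real_debug_timeout()` bodies are not shown in this diff; the following is a purely hypothetical sketch of how such a counter is typically consumed, with made-up constants:

```python
# Hypothetical sketch only: BASE_TIMEOUT and INCREASE_FACTOR are invented here,
# not taken from DataScienceScen.
BASE_TIMEOUT = 3600.0     # seconds
INCREASE_FACTOR = 1.5     # growth per recorded timeout


class TimeoutBudget:
    def __init__(self) -> None:
        self.timeout_increase_count = 0  # initialised with the object, as in this commit

    def increase_timeout(self) -> None:
        self.timeout_increase_count += 1

    def effective_timeout(self) -> float:
        # each recorded timeout multiplies the budget by INCREASE_FACTOR
        return BASE_TIMEOUT * (INCREASE_FACTOR ** self.timeout_increase_count)
```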
