microsoft · jingyuanlm · Aug 27, 2025 · Jul 25, 2025 · Jul 25, 2025 · Jul 30, 2025
diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py
@@ -42,6 +42,11 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
     full_timeout: int = 3600
     """The timeout limit for running on full data"""
 
+    #### model dump
+    enable_model_dump: bool = False
+    enable_doc_dev: bool = False
+    model_dump_check_level: Literal["medium", "high"] = "medium"
+
     ### specific feature
 
     ### notebook integration
@@ -63,11 +68,6 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
     use_raw_description: bool = False
     show_nan_columns: bool = False
 
-    #### model dump
-    enable_model_dump: bool = False
-    enable_doc_dev: bool = False
-    model_dump_check_level: Literal["medium", "high"] = "medium"
-
     ### knowledge base
     enable_knowledge_base: bool = False
     knowledge_base_version: str = "v1"
@@ -126,7 +126,7 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
     exp_gen_version_list: str = "v3,v2"
 
     #### multi-trace: time for final multi-trace merge
-    merge_hours: int = 0
+    merge_hours: float = 0
     """The time for merge"""
 
     #### multi-trace: max SOTA-retrieved number, used in AutoSOTAexpSelector
@@ -140,19 +140,44 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
 
     model_architecture_suggestion_time_percent: float = 0.75
     allow_longer_timeout: bool = False
-    longer_timeout_by_llm: bool = False
+    coder_enable_llm_decide_longer_timeout: bool = False
+    runner_enable_llm_decide_longer_timeout: bool = False
     coder_longer_timeout_multiplier_upper: int = 3
     runner_longer_timeout_multiplier_upper: int = 2
-    timeout_increase_stage: float = 0.3
+    coder_timeout_increase_stage: float = 0.3
+    runner_timeout_increase_stage: float = 0.15
     show_hard_limit: bool = True
 
+    #### enable runner code change summary
+    runner_enable_code_change_summary: bool = True
+
+    ### Proposal workflow related
+
+    #### Hypothesis Generate related
+    enable_simple_hypothesis: bool = False
+    """If true, generate simple hypotheses, no more than 2 sentences each."""
+
+    enable_generate_unique_hypothesis: bool = False
+    """Enable unique hypothesis generation. If True, generate a unique hypothesis for each component. If False, no uniqueness requirement is applied (TODO: confirm the exact fallback behavior)."""
+
     #### hypothesis critique and rewrite
-    enable_hypo_critique_rewrite: bool = True
+    enable_hypo_critique_rewrite: bool = False
     """Enable hypothesis critique and rewrite stages for improving hypothesis quality"""
     enable_scale_check: bool = False
 
-    #### enable runner code change summary
-    runner_enable_code_change_summary: bool = True
+    ##### select related
+    ratio_merge_or_ensemble: int = 70
+    """The ratio of merge or ensemble to be considered as a valid solution"""
+    llm_select_hypothesis: bool = False
+    """Whether to use LLM to select hypothesis. If True, use LLM selection; if False, use the existing ranking method."""
+
+    #### Task Generate related
+    fix_seed_and_data_split: bool = False
 
 
 DS_RD_SETTING = DataScienceBasePropSetting()
+
+# enable_cross_trace_diversity and llm_select_hypothesis must not both be True at the same time
+assert not (
+    DS_RD_SETTING.enable_cross_trace_diversity and DS_RD_SETTING.llm_select_hypothesis
+), "enable_cross_trace_diversity and llm_select_hypothesis cannot be true at the same time"
diff --git a/rdagent/scenarios/data_science/dev/prompts.yaml b/rdagent/scenarios/data_science/dev/prompts.yaml
@@ -31,9 +31,13 @@ exp_feedback:
       - If overfitting is detected, provide a detailed analysis explaining how and why it occurs, referencing scenario description, code implementation, and validation scores to support your findings.
     - If such discrepancies or risks are found:
       - Clearly document these issues in `Reasoning`, referencing both scenario description and code implementation—not just validation scores.
-      - Set `"Evaluation Aligned With Task": "no"` and `"Replace Best Result": "no"`.
-      - Begin your `reasoning` with `[Evaluation error]`, explicitly stating the evaluation alignment issues causing experiment failure.
-    - If evaluation alignment passes, set `"Evaluation Aligned With Task": "yes"`, and then proceed to Step 3.
+        - Severity-based handling:
+         - Severe risk — likely to invert or invalidate the performance trend between validation and test (e.g., strong overfitting, label leakage, test distribution shift):
+           - Set "Evaluation Aligned With Task": "no" and "Replace Best Result": "no".
+           - Begin your reasoning with [Evaluation error], explicitly stating the evaluation alignment issues causing experiment failure.
+         - Mild/moderate risk — may cause slightly optimistic or biased validation scores but is unlikely to change the relative performance trend (e.g., scaling or PCA fit on full training data that’s also applied consistently to test):
+           - Set "Evaluation Aligned With Task": "yes" but note the potential bias in Reasoning.
+           - Proceed to Step 3 for result comparison.
 
     Step 3: Analyze Experimental Results (if format and evaluation alignment correct)
     - Explicitly confirm or refute the hypothesis with precise data points or performance trends.

diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/base.py b/rdagent/scenarios/data_science/proposal/exp_gen/base.py
@@ -227,15 +227,21 @@ def experiment_and_feedback_list_after_init(
     ) -> list[tuple[DSExperiment, ExperimentFeedback]]:
         """
         Retrieve a list of experiments and feedbacks based on the return_type.
+
+        return_type:
+            - "sota": experiments that have true decision feedback
         """
+        # TODO: SOTA is a very confusing name
+
         search_list = self.retrieve_search_list(search_type, selection=selection)
         final_component = self.COMPLETE_ORDER[-1]
         has_final_component = True if DS_RD_SETTING.coder_on_whole_pipeline else False
         SOTA_exp_and_feedback_list = []
         failed_exp_and_feedback_list_after_sota = []
         for exp, fb in search_list:
             if has_final_component:
-                if fb.decision:
+                # FIXME: fb should not be None, but there is a potential bug in the code.
+                if getattr(fb, "decision", False):
                     SOTA_exp_and_feedback_list.append((exp, fb))
                     failed_exp_and_feedback_list_after_sota = []
                 else: