Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
780469c
add hypo select by llm (without time)
Hoder-zyf Jul 25, 2025
99f6f52
add time_info and log color
Hoder-zyf Jul 25, 2025
3229058
Merge remote-tracking branch 'origin/main' into hypo_select
jingyuanlm Jul 30, 2025
a41f917
select no smooth
jingyuanlm Jul 30, 2025
dc4d4fa
2 hypo
jingyuanlm Jul 31, 2025
b9a3259
Merge remote-tracking branch 'origin/main' into hypo_select
jingyuanlm Aug 1, 2025
837a1d5
merge mian 0806
Hoder-zyf Aug 6, 2025
4365802
Merge remote-tracking branch 'origin/main' into hypo_select
jingyuanlm Aug 13, 2025
268e05d
change select
jingyuanlm Aug 13, 2025
c41cf05
small change
jingyuanlm Aug 13, 2025
285686a
fix bug
jingyuanlm Aug 13, 2025
afe334f
fix bug and add hypothesis router and begin flag
jingyuanlm Aug 14, 2025
45fd7f0
fix bug v1
jingyuanlm Aug 14, 2025
0f4fd55
fix bug v2
jingyuanlm Aug 14, 2025
69bc380
fix feedback
jingyuanlm Aug 15, 2025
401f04b
add new model
jingyuanlm Aug 15, 2025
a952c95
add filter
jingyuanlm Aug 15, 2025
417a757
fix bug v3
jingyuanlm Aug 15, 2025
2ccbe1c
fix bug v4
jingyuanlm Aug 15, 2025
10277b4
change prompts v2
jingyuanlm Aug 15, 2025
a1f5581
fix bug v5
jingyuanlm Aug 15, 2025
98f7c5b
fix bug v6
jingyuanlm Aug 16, 2025
b308a47
fix hypo
jingyuanlm Aug 16, 2025
c080772
fix some bug(sota socre, prompts, ensemble prompts ) and add path le…
jingyuanlm Aug 18, 2025
5a473b2
fix bug v7
jingyuanlm Aug 18, 2025
2b9bbe7
fix bug v8
jingyuanlm Aug 19, 2025
ce9e85a
fix bug v9
jingyuanlm Aug 19, 2025
1547da8
fix bug v10
jingyuanlm Aug 19, 2025
1bb475d
reset to v10 and refine
jingyuanlm Aug 22, 2025
c7427ec
fix: fix conflicts with main
Hoder-zyf Aug 22, 2025
832277c
fix: translate to english
Hoder-zyf Aug 22, 2025
328e119
fix bug
jingyuanlm Aug 25, 2025
d0db93d
fix: use differnet increase_stage for coder & runner
Hoder-zyf Aug 25, 2025
830aacc
fix: use different timeout_increase_stage for coder and runner.
Hoder-zyf Aug 25, 2025
6cb5411
fix: revert logger
Hoder-zyf Aug 25, 2025
63cd255
fix: remove duplicate content
Hoder-zyf Aug 25, 2025
002aefc
feat: implement LLM-driven extra hypothesis selection and adjust logic
you-n-g Aug 26, 2025
0e3d450
refactor: relocate Hypothesis models below TraceChallenges
you-n-g Aug 26, 2025
1bcdb18
remove torch fix bug
jingyuanlm Aug 26, 2025
9bc1335
fix small bug
jingyuanlm Aug 26, 2025
a621f62
refactor: prefix internal methods with underscore for llm-based hypot…
you-n-g Aug 27, 2025
4ed5c1f
add fix_seed_and_data_split and enable_simple_hypothesis
jingyuanlm Aug 27, 2025
769a93a
Merge branch 'hypo_select' of https://github.com/microsoft/RD-Agent i…
jingyuanlm Aug 27, 2025
6bda110
lint
you-n-g Aug 27, 2025
ab4da1f
refine proposal config order
you-n-g Aug 27, 2025
5839623
code review; comments
you-n-g Aug 27, 2025
ffaf9c5
lint
you-n-g Aug 27, 2025
7864323
merge int to float
jingyuanlm Aug 27, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 36 additions & 11 deletions rdagent/app/data_science/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
full_timeout: int = 3600
"""The timeout limit for running on full data"""

#### model dump
enable_model_dump: bool = False
enable_doc_dev: bool = False
model_dump_check_level: Literal["medium", "high"] = "medium"

### specific feature

### notebook integration
Expand All @@ -63,11 +68,6 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
use_raw_description: bool = False
show_nan_columns: bool = False

#### model dump
enable_model_dump: bool = False
enable_doc_dev: bool = False
model_dump_check_level: Literal["medium", "high"] = "medium"

### knowledge base
enable_knowledge_base: bool = False
knowledge_base_version: str = "v1"
Expand Down Expand Up @@ -126,7 +126,7 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
exp_gen_version_list: str = "v3,v2"

#### multi-trace: time for final multi-trace merge
merge_hours: int = 0
merge_hours: float = 0
"""The time for merge"""

#### multi-trace: max SOTA-retrieved number, used in AutoSOTAexpSelector
Expand All @@ -140,19 +140,44 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):

model_architecture_suggestion_time_percent: float = 0.75
allow_longer_timeout: bool = False
longer_timeout_by_llm: bool = False
coder_enable_llm_decide_longer_timeout: bool = False
runner_enable_llm_decide_longer_timeout: bool = False
coder_longer_timeout_multiplier_upper: int = 3
runner_longer_timeout_multiplier_upper: int = 2
timeout_increase_stage: float = 0.3
coder_timeout_increase_stage: float = 0.3
runner_timeout_increase_stage: float = 0.15
show_hard_limit: bool = True

#### enable runner code change summary
runner_enable_code_change_summary: bool = True

### Proposal workflow related

#### Hypothesis Generate related
enable_simple_hypothesis: bool = False
"""If true, generate simple hypothesis, no more than 2 sentences each."""

enable_generate_unique_hypothesis: bool = False
"""Enable generation of unique hypotheses. If True, generate a unique hypothesis for each component. NOTE(review): the previous text described the False case identically to the True case; confirm and document the intended behavior when this is False."""

#### hypothesis critique and rewrite
enable_hypo_critique_rewrite: bool = True
enable_hypo_critique_rewrite: bool = False
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't change the default value unless the change has been verified with high confidence.

"""Enable hypothesis critique and rewrite stages for improving hypothesis quality"""
enable_scale_check: bool = False

#### enable runner code change summary
runner_enable_code_change_summary: bool = True
##### select related
ratio_merge_or_ensemble: int = 70
"""The ratio of merge or ensemble to be considered as a valid solution"""
llm_select_hypothesis: bool = False
"""Whether to use LLM to select hypothesis. If True, use LLM selection; if False, use the existing ranking method."""

#### Task Generate related
fix_seed_and_data_split: bool = False


DS_RD_SETTING = DataScienceBasePropSetting()

# enable_cross_trace_diversity and llm_select_hypothesis should not be true at the same time
assert not (
DS_RD_SETTING.enable_cross_trace_diversity and DS_RD_SETTING.llm_select_hypothesis
), "enable_cross_trace_diversity and llm_select_hypothesis cannot be true at the same time"
10 changes: 7 additions & 3 deletions rdagent/scenarios/data_science/dev/prompts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,13 @@ exp_feedback:
- If overfitting is detected, provide a detailed analysis explaining how and why it occurs, referencing scenario description, code implementation, and validation scores to support your findings.
- If such discrepancies or risks are found:
- Clearly document these issues in `Reasoning`, referencing both scenario description and code implementation—not just validation scores.
- Set `"Evaluation Aligned With Task": "no"` and `"Replace Best Result": "no"`.
- Begin your `reasoning` with `[Evaluation error]`, explicitly stating the evaluation alignment issues causing experiment failure.
- If evaluation alignment passes, set `"Evaluation Aligned With Task": "yes"`, and then proceed to Step 3.
- Severity-based handling:
- Severe risk — likely to invert or invalidate the performance trend between validation and test (e.g., strong overfitting, label leakage, test distribution shift):
- Set "Evaluation Aligned With Task": "no" and "Replace Best Result": "no".
- Begin your reasoning with [Evaluation error], explicitly stating the evaluation alignment issues causing experiment failure.
- Mild/moderate risk — may cause slightly optimistic or biased validation scores but is unlikely to change the relative performance trend (e.g., scaling or PCA fit on full training data that’s also applied consistently to test):
- Set "Evaluation Aligned With Task": "yes" but note the potential bias in Reasoning.
- Proceed to Step 3 for result comparison.

Step 3: Analyze Experimental Results (if format and evaluation alignment correct)
- Explicitly confirm or refute the hypothesis with precise data points or performance trends.
Expand Down
8 changes: 7 additions & 1 deletion rdagent/scenarios/data_science/proposal/exp_gen/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,15 +227,21 @@ def experiment_and_feedback_list_after_init(
) -> list[tuple[DSExperiment, ExperimentFeedback]]:
"""
Retrieve a list of experiments and feedbacks based on the return_type.

return_type:
- "sota": experiments that have true decision feedback
"""
# TODO: SOTA is a very confusing name

search_list = self.retrieve_search_list(search_type, selection=selection)
final_component = self.COMPLETE_ORDER[-1]
has_final_component = True if DS_RD_SETTING.coder_on_whole_pipeline else False
SOTA_exp_and_feedback_list = []
failed_exp_and_feedback_list_after_sota = []
for exp, fb in search_list:
if has_final_component:
if fb.decision:
# FIXME: fb should not be None, but there is a potential bug in the code.
if getattr(fb, "decision", False):
SOTA_exp_and_feedback_list.append((exp, fb))
failed_exp_and_feedback_list_after_sota = []
else:
Expand Down
Loading