7 changes: 4 additions & 3 deletions rdagent/components/coder/data_science/pipeline/prompts.yaml
@@ -13,9 +13,10 @@ pipeline_coder:
{{ runtime_environment }}

{% if package_info is not none %}
To help you write the runnable code, the user has provided the package information which contains the package names and versions.
You should be careful about the package versions, as the code will be executed in the environment with the specified version and the api might be different from the latest version.
The user might provide the packages the environment doesn't have, you should avoid using any of them.
- To help you write runnable code, the user has provided package information that contains the package names and versions.
- You should be careful about the package versions, as the code will be executed in an environment with the specified versions and the API might differ from the latest version.
- While the environment is fixed, you should not limit yourself to only the provided packages; feel free to explore other libraries that might better suit the task. However, prioritize using the available packages first, and only suggest alternatives when they would provide significant improvements or are more appropriate for the specific problem.

## Package Information
{{ package_info }}
{% endif %}
2 changes: 1 addition & 1 deletion rdagent/scenarios/data_science/proposal/exp_gen/base.py
@@ -21,7 +21,7 @@ def __init__(
concise_knowledge: str | None = None,
problem_name: str | None = None,
problem_desc: str | None = None,
problem_label: Literal["SCENARIO_PROBLEM", "FEEDBACK_PROBLEM"] = "FEEDBACK_PROBLEM",
problem_label: Literal["SCENARIO_PROBLEM", "FEEDBACK_PROBLEM", "PERSISTENT_PROBLEM"] = "FEEDBACK_PROBLEM",
appendix: str | None = None,
) -> None:
super().__init__(
134 changes: 115 additions & 19 deletions rdagent/scenarios/data_science/proposal/exp_gen/package_info.py
@@ -6,6 +6,108 @@ def get_installed_packages():
return {dist.metadata["Name"].lower(): dist.version for dist in distributions()}
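For reference, `get_installed_packages` maps lowercased distribution names to version strings; a minimal sketch of how the detection code below consumes it (the versions shown are hypothetical):

installed = get_installed_packages()
# e.g. {"numpy": "1.26.4", "pandas": "2.2.2", "scikit-learn": "1.5.0", ...}
print("torch" in installed)  # True only if PyTorch is installed in this environment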


# Kaggle competition packages - based on usage frequency
PYTHON_BASE_PACKAGES = ["catboost", "lightgbm", "numpy", "optuna", "pandas", "scikit-learn", "scipy", "shap", "xgboost"]

PYTHON_ADVANCED_PACKAGES = [
"accelerate",
"albumentations",
"bayesian-optimization",
"category_encoders",
"datasets",
"featuretools",
"imbalanced-learn",
"nltk",
"opencv-python",
"pillow",
"polars",
"sentence-transformers",
"spacy",
"tensorflow",
"timm",
"tokenizers",
"torch",
"torchvision",
"transformers",
]


def get_available_packages_prompt():
"""Generate prompt template for dynamically detected available packages"""
installed_packages = get_installed_packages()

# Check which packages are actually installed
base_available = [pkg for pkg in PYTHON_BASE_PACKAGES if pkg.lower() in installed_packages]
advanced_available = [pkg for pkg in PYTHON_ADVANCED_PACKAGES if pkg.lower() in installed_packages]

# Build prompt
prompt_parts = ["# Available packages in environment:\n"]

if base_available:
prompt_parts.append("## 【Basic Libraries】(general tools for data science tasks):")
prompt_parts.append(f"- {', '.join(base_available)}")
prompt_parts.append("")

if advanced_available:
prompt_parts.append("## 【Advanced Tools】(specialized for specific domains):")
prompt_parts.append(f"- {', '.join(advanced_available)}")
prompt_parts.append("")

prompt_parts.append(
"You should choose appropriate tool combinations based on the specific context and current situation. Feel free to use any other packages you think are necessary to achieve the best performance."
)

return "\n".join(prompt_parts).strip()


def get_persistent_problem_guidelines():
"""Generate guidelines for PERSISTENT_PROBLEM scenarios - focusing on model architecture"""
guidelines = [
"## Model Architecture Selection Guidelines",
"",
"Focus on **model architecture** - choosing the right model type and structure for your specific problem.",
"",
"### **History-Aware Architecture Selection**",
"",
"1. **Learn from Experiment History**",
" - **Check what's been tried**: Review previous experiments to understand current baseline status",
" - **Identify gaps**: What architectures haven't been properly tested yet?",
" - **Build on success**: If baseline exists and works, focus on targeted improvements",
"",
"2. **Context-Driven Strategy**",
" - **No baseline yet**: Start with reliable methods (XGBoost, RandomForest) to establish foundation",
" - **Baseline established**: Explore modern alternatives (LightGBM, CatBoost) for potential gains",
" - **Modern methods tested**: Consider advanced techniques (ensembles, custom) if justified by results",
"",
"3. ** Single Focus Per Hypothesis**",
" - **One goal at a time**: Each hypothesis should focus on either establishing baseline OR testing innovation, not both",
" - **Avoid feature creep**: Don't try to implement multiple improvements in one hypothesis",
" - **Clear hypothesis scope**: Define exactly what this hypothesis is testing before proposing",
" - **Iterative approach**: Build incrementally - baseline first, then one innovation at a time",
"",
"4. **Timely Fallback Principle**",
" - **Monitor performance closely**: If advanced methods show no clear improvement, retreat quickly",
" - **Don't chase complexity**: Advanced doesn't always mean better - simple often wins",
" - **Fallback triggers**: Performance drop, training instability, or unclear benefits = immediate retreat",
" - **Preserve what works**: Always maintain access to your best-performing solution",
"",
"5. **Computational Constraints**",
" - Training time limitations: Choose models that converge quickly",
" - Inference requirements: Balance accuracy with prediction speed",
" - Memory constraints: Consider model size and batch processing needs",
"",
"### 💡 **Key Reminder**",
"**One hypothesis, one goal**: Each hypothesis should test exactly one architectural change - either establish baseline OR test one specific innovation. Keep scope focused for clear results.",
]
return "\n".join(guidelines)


def get_all_available_packages():
"""Get flattened list of all packages"""
all_packages = PYTHON_BASE_PACKAGES + PYTHON_ADVANCED_PACKAGES
return sorted(set(all_packages))


def print_filtered_packages(installed_packages, filtered_packages):
to_print = []
for package_name in filtered_packages:
@@ -26,24 +128,8 @@ def get_python_packages():
# Example: `python package_info.py pandas torch scikit-learn`
# If no extra arguments are provided we fall back to the default
# Kaggle-oriented list defined above.
packages_list = [ # default packages
"transformers",
"accelerate",
"torch",
"tensorflow",
"pandas",
"numpy",
"scikit-learn",
"scipy",
"xgboost",
"sklearn",
"lightgbm",
"vtk",
"opencv-python",
"keras",
"matplotlib",
"pydicom",
]
# Use our Kaggle-optimized package list as default
packages_list = get_all_available_packages()
if len(sys.argv) > 1:
packages_list = list(set(packages_list) | set(sys.argv[1:]))

@@ -60,5 +146,15 @@ def get_python_packages():
print(pkg)


def print_available_packages_prompt():
"""Print the available packages prompt to stdout for external consumption"""
prompt = get_available_packages_prompt()
print(prompt)


if __name__ == "__main__":
get_python_packages()
# Check if we should print available packages prompt
if len(sys.argv) > 1 and sys.argv[1] == "--packages-prompt":
print_available_packages_prompt()
else:
get_python_packages()
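A minimal usage sketch for the updated entry point (run from the directory containing package_info.py; output depends on the local environment):

# python package_info.py                    -> package report for the default Kaggle-oriented list
# python package_info.py pandas torch       -> default list plus the extra names from the command line
# python package_info.py --packages-prompt  -> prints the available-packages prompt instead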
47 changes: 39 additions & 8 deletions rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml
@@ -193,6 +193,10 @@ hypothesis_gen:
- **Risk-Reward Balance (Score: 1-10):** Considering the potential for significant improvement (reward) versus the probability of failure, negative side-effects, or excessive resource consumption (risk), how optimal is this balance? A high score indicates a favorable balance.
- **Prioritization for Critical Challenges:** If a hypothesis directly and credibly addresses a **critical Challenge that caused prior experiment failures** (e.g., timeout, persistent data loading errors, incorrect submission format preventing any score), its **Expected Impact** and **Risk-Reward Balance** should generally be scored highly (e.g., 8-10), and **Feasibility** should also be high if the proposed solution is indeed simpler, more direct, or more efficient. This ensures such critical hypotheses are prioritized.

{% if additional_guidelines %}
{{ additional_guidelines }}
{% endif %}

{% if inject_diverse %}
# Focus on Diversity!!
Diversity is very critical in the analysis of scenario problems. You should closely check the history of previous experiments and feedback, and try to explore the problems/hypotheses that are not covered by the previous experiments.
@@ -250,6 +254,7 @@ hypothesis_critique:
- **Metric Impact**: Will this meaningfully improve the competition's evaluation metric?
- **Historical Context**: Has similar approaches been tried? Key learnings from past attempts?
- **Innovation vs History Balance**: Distinguish between implementation failures (worth retrying with improvements) vs fundamental approach failures (multiple attempts failed due to core unsuitability - should avoid)
- **Tool Selection Appropriateness**: Are the suggested tools/packages well-suited for the problem? Consider both modern capabilities and traditional reliability

### 3. Improvement Direction
- **Clarity Issues**: If vague, identify specific methods or strategies that address the core problem
@@ -268,11 +273,13 @@ hypothesis_critique:
**Good Critiques:**
- "The hypothesis lacks specificity about which ensemble method to use. Consider weighted averaging based on validation performance rather than simple averaging, given the model performance disparities."
- "This hypothesis proposes LSTM for tabular data. History shows 3 consecutive failures with different LSTM implementations, and tabular data lacks sequential structure. Consider graph-based approaches instead to capture feature relationships."
- "The hypothesis jumps to LightGBM without establishing a baseline. Consider starting with XGBoost to ensure a working solution, then explore LightGBM for potential improvements if the baseline performs adequately."

**Poor Critiques:**
- "Set max_depth=10, learning_rate=0.05, and use 500 trees." (too specific)
- "This might not work." (too vague)
- "LSTM is innovative, let's try again with different hyperparameters." (ignores fundamental mismatch)
- "Use the latest deep learning model because it's new." (ignores problem-solution fit)

{% if critique_output_format is not none %}
## Output Format
@@ -300,13 +307,16 @@ hypothesis_rewrite:
## Task
Transform each **original hypothesis and its critique** into a **single, specific, testable technical hypothesis** that can be implemented immediately.

**You have the authority to delete hypotheses that you judge to be completely infeasible or unsuitable, but ensure at least one hypothesis remains in your output.**

## Core Principles
1. **Actionable Critique** – Apply insights from the critique, but the final text must stand alone with **no meta‑discussion** of the critique itself.
2. **Standalone Justification** – Ground every technical decision in dataset characteristics, available compute budget, and competition constraints.
3. **Decisive Specificity** – Remove all ambiguity; propose one clear action.
4. **Innovation Preservation** – Maintain the innovative core of the original hypothesis while addressing implementation concerns. Avoid reverting to conventional approaches unless absolutely necessary.
5. **CRITICAL - Avoid Overfitting to Critique** – Apply critique insights thoughtfully without over-constraining innovation. Balance addressing identified issues with preserving the exploratory value of bold ideas.
{% if enable_scale_check %}6. The user is currently working on a continuous exploration on the task. It's typical that we first try in small scale and in some certain point we will scale up the solution.
6. **Hypothesis Deletion Authority** – You have the authority to delete hypotheses that you judge to be completely infeasible or unsuitable. Use your judgment, but ensure at least one hypothesis remains.
{% if enable_scale_check %}7. The user is currently working on a continuous exploration of the task. It's typical to first try at a small scale and, at a certain point, scale up the solution.
The user will tell you how much time they have spent on the task so far and all the former trials. You should consider whether to scale up the solution based on the current situation. You should put this conclusion in each hypothesis's appendix section.
Typical scaling methods include:
- Increasing the model architecture complexity.
@@ -321,6 +331,12 @@ hypothesis_rewrite:

## Guidelines for Writing Rewritten Hypotheses

### Available Tools Consideration
- When rewriting, consider whether the hypothesis leverages appropriate tools from the available packages
- Balance innovation with practical tool selection - prefer modern packages when they offer clear advantages
- Ensure tool choices align with the problem requirements and constraints
- Be pragmatic: use whatever works best for the task - whether it's a cutting-edge transformer or traditional logistic regression

1. **Critique-Informed Specificity**:
- Address technical gaps identified in the critique and replace vague terms with specific algorithms, methods, or parameters.
- Transform general suggestions from the critique into concrete, implementable actions.
@@ -379,6 +395,10 @@ hypothesis_rewrite:
{{ time_status }}
{% endif %}

{% if packages_prompt is not none %}
{{ packages_prompt }}
{% endif %}


task_gen:
system: |-
@@ -429,11 +449,22 @@ task_gen:
- Ensure validation metrics and processes are consistent across all parts of the pipeline. Avoid changes that would alter how validation metrics are calculated unless that is part of the hypothesis.
8. **Submission File (`submission.csv`)**: Generate `submission.csv` in the **exact format** required (column names, order, data types), as detailed in the '====== Submission Format ======' section of the Competition Scenario Description (DO NOT read the sample_submission.csv file directly in the code). This is a critical step.
9. **Preferred Packages Notes**:
- You can choose the most proper packages for the task to best achieve the hypothesis.
- When facing a choice between two packages which both can achieve the same goal, you should choose the one which is more commonly used and less likely to cause bugs in coding. Especially those you are not familiar with.
- For GBDT models, prefer XGBoost or RandomForest over LightGBM unless the SOTA or hypothesis dictates otherwise. Prefer not using GPU for GBDT models unless the SOTA or hypothesis dictates otherwise.
- For neural networks, prefer PyTorch or PyTorch based library (over TensorFlow) unless the SOTA or hypothesis dictates otherwise.
- For neural networks, prefer fine-tuning pre-trained models over training from scratch.
- The `Available Packages` section in the Competition Scenario Description includes general and specific recommendations. Choose packages that best support the hypothesis and constraints; you may deviate with clear justification grounded in data, code reuse, and efficiency.
- Do not select packages solely because you are familiar with them.
- When choosing between two packages that can both achieve the same goal, pick the one that is more commonly used and less likely to cause bugs in coding.
- For GBDT, default to CPU. Enable GPU only if profiling on this environment shows clear, reproducible speedups without stability regressions; document the versions and settings used.
- For deep learning frameworks, align with the current codebase and available pretrained assets. If unconstrained, default to PyTorch or PyTorch-based libraries given ecosystem and template support; prioritize consistency and reuse over brand preferences.
- For deep learning, favor fine-tuning well-validated pretrained models over training from scratch when applicable to the task.

10. **Adaptive Architecture Selection Strategy**:
- **History assessment**: Review what architectures/approaches have been tried in previous experiments and their outcomes.
- **Context-driven decisions**: Base architecture choice on experiment maturity rather than fixed preferences.
- **IF no working solution exists**: Focus on establishing reliable baseline first.
- **IF baseline established**: Consider exploring alternative architectures for potential improvements.
- **IF alternatives tested**: Evaluate advanced techniques only if previous approaches show measurable benefits.
- **Avoid repetition**: Don't retry failed approaches unless addressing specific implementation issues identified in feedback.
- **Performance-guided progression**: Let actual results from previous experiments guide complexity level choice.

## Package Declaration
At the end of your design, **you MUST** provide a key `packages` in the final JSON output.
@@ -648,8 +679,8 @@ output_format:
}
}
rewrite: |-
For each original hypothesis, rewrite it to address critique feedback, strictly following the JSON schema below.
Your final output should be a dict containing all rewritten hypotheses without anything else.
For each original hypothesis, either rewrite it to address critique feedback or delete it if it's completely infeasible, strictly following the JSON schema below.
Your final output should be a dict containing the rewritten hypotheses (excluding any deleted ones) without anything else.
{
"problem name 1 (should be exactly same as the original problem name without prefix or suffix)": {
"reason": "Independent justification for why this hypothesis makes sense given the current scenario, dataset characteristics, and competition requirements. DO NOT reference critique feedback or suggestions. Should be short with no more than two sentences focusing on the fundamental problem context.",