
Commit fc0df6e

Jensen246 and Hoder-zyf authored

fix: ignore case when checking metric name (#1160)

* fix: ignore case when checking metric name
* add case-sensitive to prompts

Co-authored-by: amstrongzyf <[email protected]>
1 parent ed04ba6 commit fc0df6e

File tree:

rdagent/components/coder/data_science/pipeline/eval.py
rdagent/components/coder/data_science/workflow/eval.py
rdagent/scenarios/data_science/dev/runner/eval.py
rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml
rdagent/scenarios/data_science/scen/prompts.yaml
rdagent/scenarios/data_science/share.yaml

6 files changed: 11 additions & 12 deletions


rdagent/components/coder/data_science/pipeline/eval.py

Lines changed: 2 additions & 2 deletions
@@ -140,8 +140,8 @@ def evaluate(
         if score_ret_code != 0:
             score_check_text += f"The dataframe in file 'scores.csv' is:\n{score_df}"

-        # Check metric name (columns)
-        if score_df.columns.tolist() != [self.scen.metric_name]:
+        # Check metric name (columns) - case insensitive
+        if [col.lower() for col in score_df.columns.tolist()] != [self.scen.metric_name.lower()]:
             score_check_text += f"\n[Error] The scores dataframe does not contain the correct column names.\nCorrect columns is: ['{self.scen.metric_name}']\nBut got: {score_df.columns.tolist()}"
             score_ret_code = 1
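The fix swaps an exact string comparison for a case-insensitive one: both the DataFrame's column names and the scenario's metric name are lowercased before comparing, so a `scores.csv` whose column differs from `self.scen.metric_name` only in casing no longer trips the error. The same two-line change is applied in the two eval modules below. A minimal sketch of the before/after behavior, using a made-up DataFrame and metric name:

    import pandas as pd

    # Hypothetical inputs for illustration only.
    score_df = pd.DataFrame({"AUC": [0.91, 0.93]}, index=["model_a", "ensemble"])
    metric_name = "auc"  # stand-in for self.scen.metric_name

    # Old check: exact match, fails on a casing mismatch.
    print(score_df.columns.tolist() == [metric_name])  # False

    # New check: lowercase both sides, so "AUC" vs "auc" passes.
    print([col.lower() for col in score_df.columns.tolist()] == [metric_name.lower()])  # True

Note that the error message still reports the names in their original casing, so failure feedback stays faithful to the actual file contents.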

rdagent/components/coder/data_science/workflow/eval.py

Lines changed: 2 additions & 2 deletions
@@ -105,8 +105,8 @@ def evaluate(
             score_check_text += f"\n[Error] The scores dataframe does not contain the correct model names as index.\ncorrect model names are: {model_set_in_folder.union({'ensemble'})}\nscore_df is:\n{score_df}"
             score_ret_code = 1

-        # Check metric name (columns)
-        if score_df.columns.tolist() != [self.scen.metric_name]:
+        # Check metric name (columns) - case insensitive
+        if [col.lower() for col in score_df.columns.tolist()] != [self.scen.metric_name.lower()]:
             score_check_text += f"\n[Error] The scores dataframe does not contain the correct column names.\nCorrect columns is: ['{self.scen.metric_name}']\nBut got: {score_df.columns.tolist()}"
             score_ret_code = 1

rdagent/scenarios/data_science/dev/runner/eval.py

Lines changed: 2 additions & 2 deletions
@@ -141,8 +141,8 @@ def evaluate(
             score_check_text += f"\n[Error] The scores dataframe does not contain the correct model names as index.\ncorrect model names are: {model_set_in_folder.union({'ensemble'})}\nscore_df is:\n{score_df}"
             score_ret_code = 1

-        # Check metric name (columns)
-        if score_df.columns.tolist() != [self.scen.metric_name]:
+        # Check metric name (columns) - case insensitive
+        if [col.lower() for col in score_df.columns.tolist()] != [self.scen.metric_name.lower()]:
             score_check_text += f"\n[Error] The scores dataframe does not contain the correct column names.\nCorrect columns is: ['{self.scen.metric_name}']\nBut got: {score_df.columns.tolist()}"
             score_ret_code = 1

rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml

Lines changed: 2 additions & 2 deletions
@@ -424,7 +424,7 @@ task_gen:
     7. **Metric Calculation and Storage (`scores.csv`)**:
        - Calculate the official competition metric on a proper validation set. Save results to `scores.csv`.
        - The sketch must ensure this step is included. A successful run should always produce scores.
-       - `scores.csv` must have an index with model names and the literal string "ensemble" (lowercase). **Columns should be a single column with exact metric name: "{{ metric_name }}".**
+       - `scores.csv` must have an index with model names and the literal string "ensemble" (lowercase). **Columns should be a single column with exact metric name: "{{ metric_name }}".** (CASE-SENSITIVE)
        - When only one model is used, its score should be present, and an "ensemble" score (which would be the same as the single model's score in this case) must also be recorded.
        - Ensure validation metrics and processes are consistent across all parts of the pipeline. Avoid changes that would alter how validation metrics are calculated unless that is part of the hypothesis.
     8. **Submission File (`submission.csv`)**: Generate `submission.csv` in the **exact format** required (column names, order, data types), as detailed in the '====== Submission Format ======' section of the Competition Scenario Description (DO NOT read the sample_submission.csv file directly in the code). This is a critical step.
@@ -446,7 +446,7 @@ task_gen:

    ## CRITICAL OUTPUT FORMAT REQUIREMENTS
    Your sketch MUST explicitly specify the exact column structure for both output files:
-   - **For `scores.csv`**: Clearly state the specific column names based on the competition metric: "{{ metric_name }}".
+   - **For `scores.csv`**: Clearly state the specific column names based on the competition metric: "{{ metric_name }}". (CASE-SENSITIVE)
    - **For `submission.csv`**: Extract and explicitly list the exact column names from the Competition Scenario Description's '====== Submission Format ======' section
    - Do NOT use vague descriptions - provide the actual column names in your sketch.

rdagent/scenarios/data_science/scen/prompts.yaml

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ scenario_description: |-
   - Do not manipulate data or return values solely to pass preliminary tests, as this will not lead to successful final evaluation.

   ====== Evaluation ======
-  {% if metric_name %}The primary evaluation metric for this task is: **{{ metric_name }}**, **which should be the column name in `scores.csv`**.{% endif %}
+  {% if metric_name %}The primary evaluation metric for this task is: **{{ metric_name }}**, **which should be the column name in `scores.csv` and the column name should be exactly the same as "{{ metric_name }}" (CASE-SENSITIVE)**.{% endif %}
   This metric is considered better when it is **{% if metric_direction %}larger{% else %}smaller{% endif %}**.

   {% if evaluation is not none %}

rdagent/scenarios/data_science/share.yaml

Lines changed: 2 additions & 3 deletions
@@ -340,9 +340,8 @@ component_spec:
       - Calculate the metric (mentioned in the evaluation section of the competition information) for each model and ensemble strategy on valid, and save the results in `scores.csv`
       - The evaluation should be based on k-fold cross-validation but only if that's an appropriate evaluation for the task at hand. Store the mean validation score of k-fold cross-validation in `scores.csv` on each model. Refer to the hyperparameter specification for rules to set the CV folds.
       - Even if only one model is present, compute the ensemble score and store it under `"ensemble"`.
-      - The index of `scores.csv` should include the model name and the "ensemble" strategy. "ensemble" should be exactly in the index with all lower case letters. Ensemble is the result from several models. If only one model is present, the ensemble score should be the same as the model score.
-      - The column names in `scores.csv` should be ["{{ metric_name }}"] where metric_name is the name of the metric used for evaluation. Only one column is required.
-      - The column name should be exactly the same to "{{ metric_name }}" since user will use it to pick the result.
+      - The index of `scores.csv` should include the model name and the "ensemble" strategy. "ensemble" should be exactly in the index with all lower case letters (CASE-SENSITIVE). Ensemble is the result from several models. If only one model is present, the ensemble score should be the same as the model score.
+      - The column names in `scores.csv` should be ["{{ metric_name }}"] where metric_name is the name of the metric used for evaluation. Only one column is required. The column name should be exactly the same as "{{ metric_name }}" (CASE-SENSITIVE) since user will use it to pick the result.
       - Validation metrics should be aligned across all ideas and implementations. Avoid proposing ideas that might affect the validation metrics and modifying the related code.

    9. Submission File:
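Taken together, these spec rules pin down a very small file: one row per model plus an all-lowercase "ensemble" row, and a single column named after the metric. A sketch of a conforming `scores.csv`, assuming a hypothetical metric name "AUC" and two made-up models:

    import pandas as pd

    metric_name = "AUC"  # hypothetical; in practice taken from the competition's evaluation section

    # One row per model plus the mandatory all-lowercase "ensemble" row,
    # and exactly one column named like the metric.
    scores = pd.DataFrame(
        {metric_name: [0.91, 0.89, 0.93]},
        index=["lightgbm", "xgboost", "ensemble"],
    )
    scores.to_csv("scores.csv")

A file written this way passes the index check and, after this commit, also passes the metric-name check in the evaluators above even if the column's casing drifts from `self.scen.metric_name`.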
