microsoft · XianBW · Jul 31, 2025
diff --git a/rdagent/scenarios/data_science/dev/runner/eval.py b/rdagent/scenarios/data_science/dev/runner/eval.py
@@ -158,6 +158,7 @@ def evaluate(
         user_prompt = T(".prompts:DSCoSTEER_eval.user").r(
             code=implementation.all_codes,
             stdout=shrink_text(stdout),
+            score=score_df.to_string() if score_ret_code == 0 else None,
             time_spent=f"{implementation.running_info.running_time:.2f} seconds",
             timeout=f"{env.conf.running_timeout_period} seconds",
             percent_of_timeout_used=f"{time_spent_ratio * 100:.2f}%",

diff --git a/rdagent/scenarios/data_science/dev/runner/prompts.yaml b/rdagent/scenarios/data_science/dev/runner/prompts.yaml
@@ -114,6 +114,9 @@ DSCoSTEER_eval:
     ## Stdout of code execution and testing
     {{ stdout }}
 
+    ## The valid score of the generated submission.csv
+    {{ score }}
+
     # The time spent on code execution and timeout
     {{ time_spent }}