
Commit 02dbe30

add detailed_stats
1 parent 7037ce6 commit 02dbe30

File tree (5 files changed, +66 -42 lines changed)

- pyproject.toml
- tests/explorer/explorer_test.py
- tests/trainer/trainer_test.py
- trinity/explorer/explorer.py
- trinity/utils/monitor.py

pyproject.toml (1 addition, 1 deletion)

@@ -56,7 +56,7 @@ data = [
     "py-data-juicer>=1.4.3"
 ]
 agent = [
-    "agentscope>=1.0.9"
+    "agentscope>=1.0.12"
 ]
 rm_gallery = [
     "rm-gallery>=0.1.5"

tests/explorer/explorer_test.py (40 additions, 12 deletions)

@@ -43,6 +43,7 @@ def setUp(self):
         self.config.checkpoint_root_dir = get_checkpoint_path()
         self.config.synchronizer.sync_interval = 2
         self.config.explorer.eval_interval = 4
+        self.config.monitor.detailed_stats = False


 class TestExplorerCountdownEval(BaseExplorerCase):
@@ -70,21 +71,48 @@ def test_explorer(self):
         self.assertEqual(parser.metric_max_step(eval_metrics[0]), 8)
         for eval_taskset, k_list in zip(eval_tasksets, [[1], [2, 4, 6], [2, 4, 8, 10]]):
             metric_name = "score" if eval_taskset.name == "countdown" else "accuracy"
-            for eval_stats in ["mean", "std"]:
-                k = k_list[-1]
+            repeat_times = k_list[-1]
+            expected_stat_suffixes = [f"mean@{repeat_times}", f"std@{repeat_times}"]
+            for k in k_list:
+                if k == 1:
+                    continue
+                expected_stat_suffixes.extend([f"best@{k}", f"worst@{k}"])
+            # only return the mean of the column
+            for stat_suffix in expected_stat_suffixes:
                 self.assertIn(
-                    f"eval/{eval_taskset.name}/{metric_name}/{eval_stats}@{k}",
+                    f"eval/{eval_taskset.name}/{metric_name}/{stat_suffix}",
+                    eval_metrics,
+                )
+
+
+class TestExplorerEvalDetailedStats(BaseExplorerCase):
+    def test_explorer(self):
+        self.config.buffer.explorer_input.taskset = get_unittest_dataset_config("countdown")
+        self.config.monitor.detailed_stats = True
+        eval_taskset = get_unittest_dataset_config("eval_short")
+        eval_taskset.repeat_times = 6
+        self.config.buffer.explorer_input.eval_tasksets = [eval_taskset]
+        self.config.name = f"explore-eval-{datetime.now().strftime('%Y%m%d%H%M%S')}"
+        self.config.check_and_update()
+        explore(self.config)
+        parser = TensorBoardParser(os.path.join(self.config.monitor.cache_dir, "tensorboard"))
+        rollout_metrics = parser.metric_list("rollout")
+        self.assertTrue(len(rollout_metrics) > 0)
+        eval_metrics = parser.metric_list("eval")
+        self.assertTrue(len(eval_metrics) > 0)
+        self.assertEqual(parser.metric_max_step(rollout_metrics[0]), 8)
+        self.assertEqual(parser.metric_max_step(eval_metrics[0]), 8)
+        metric_name, repeat_times, k_list = "accuracy", 6, [2, 4, 6]
+        expected_stat_suffixes = [f"mean@{repeat_times}", f"std@{repeat_times}"]
+        for k in k_list:  # k_list does not include 1
+            expected_stat_suffixes.extend([f"best@{k}", f"worst@{k}"])
+        # test detailed stats
+        for stat_suffix in expected_stat_suffixes:
+            for stats in ["mean", "std", "max", "min"]:
+                self.assertIn(
+                    f"eval/{eval_taskset.name}/{metric_name}/{stat_suffix}/{stats}",
                     eval_metrics,
                 )
-            for eval_stats in ["best", "worst"]:
-                for k in k_list:
-                    if k == 1:
-                        continue
-                    for stats in ["mean", "std"]:
-                        self.assertIn(
-                            f"eval/{eval_taskset.name}/{metric_name}/{eval_stats}@{k}/{stats}",
-                            eval_metrics,
-                        )


 class TestExplorerGSM8KRULERNoEval(BaseExplorerCase):
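
For reference, a small sketch (not part of the commit) of the eval metric keys the two tests above expect, using the detailed-stats test's settings (taskset eval_short, metric accuracy, repeat_times 6, k_list [2, 4, 6]):

# Sketch only: key naming scheme asserted by the tests above.
repeat_times, k_list = 6, [2, 4, 6]
suffixes = [f"mean@{repeat_times}", f"std@{repeat_times}"]
for k in k_list:
    suffixes.extend([f"best@{k}", f"worst@{k}"])

# monitor.detailed_stats = False (default): one key per suffix.
flat_keys = [f"eval/eval_short/accuracy/{s}" for s in suffixes]

# monitor.detailed_stats = True: each suffix is further reduced across eval
# tasks into mean/std/max/min, as TestExplorerEvalDetailedStats asserts.
detailed_keys = [
    f"eval/eval_short/accuracy/{s}/{stat}"
    for s in suffixes
    for stat in ["mean", "std", "max", "min"]
]
print("\n".join(flat_keys + detailed_keys))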

tests/trainer/trainer_test.py (12 additions, 22 deletions)

@@ -172,19 +172,14 @@ def test_trainer(self):
        for taskset_name in ["countdown", "copy_countdown"]:
            metrics = parser.metric_list(f"{prefix}/{taskset_name}")
            self.assertGreater(len(metrics), 0, f"{prefix}/{taskset_name} metrics not found")
-            # mean@k, std@k
-            for eval_stats in ["mean", "std"]:
-                k = 4
-                metric_name = f"{prefix}/{taskset_name}/score/{eval_stats}@{k}"
+            repeat_times, k_list = 4, [2, 4]
+            expected_stat_suffixes = [f"mean@{repeat_times}", f"std@{repeat_times}"]
+            for k in k_list:
+                expected_stat_suffixes.extend([f"best@{k}", f"worst@{k}"])
+            for stat_suffix in expected_stat_suffixes:
+                metric_name = f"{prefix}/{taskset_name}/score/{stat_suffix}"
                metric_steps = parser.metric_steps(metric_name)
                self.assertEqual(metric_steps, [0, 4, 8])
-            # best@k/mean, best@k/std, worst@k/mean, worst@k/std
-            for eval_stats in ["best", "worst"]:
-                for k in [2, 4]:
-                    for stats in ["mean", "std"]:
-                        metric_name = f"{prefix}/{taskset_name}/score/{eval_stats}@{k}/{stats}"
-                        metric_steps = parser.metric_steps(metric_name)
-                        self.assertEqual(metric_steps, [0, 4, 8])

    def tearDown(self):
        # remove dir only when the test passed
@@ -1345,19 +1340,14 @@ def test_trainer(self):
        for prefix in ["eval", "bench"]:
            gsm8k_metrics = parser.metric_list(f"{prefix}/gsm8k")
            self.assertGreater(len(gsm8k_metrics), 0, f"{prefix}/gsm8k metrics not found")
-            # mean@k, std@k
-            for eval_stats in ["mean", "std"]:
-                k = 8
-                metric_name = f"{prefix}/gsm8k/accuracy/{eval_stats}@{k}"
+            repeat_times, k_list = 8, [2, 4, 8]
+            expected_stat_suffixes = [f"mean@{repeat_times}", f"std@{repeat_times}"]
+            for k in k_list:
+                expected_stat_suffixes.extend([f"best@{k}", f"worst@{k}"])
+            for stat_suffix in expected_stat_suffixes:
+                metric_name = f"{prefix}/gsm8k/accuracy/{stat_suffix}"
                metric_steps = parser.metric_steps(metric_name)
                self.assertEqual(metric_steps, [0, 2])
-            # best@k/mean, best@k/std, worst@k/mean, worst@k/std
-            for eval_stats in ["best", "worst"]:
-                for k in [2, 4, 8]:
-                    for stats in ["mean", "std"]:
-                        metric_name = f"{prefix}/gsm8k/accuracy/{eval_stats}@{k}/{stats}"
-                        metric_steps = parser.metric_steps(metric_name)
-                        self.assertEqual(metric_steps, [0, 2])

    def tearDown(self):
        shutil.rmtree(self.config.checkpoint_job_dir, ignore_errors=True)

trinity/explorer/explorer.py (4 additions, 1 deletion)

@@ -66,6 +66,7 @@ def __init__(self, config: Config):
            role=self.config.explorer.name,
            config=config,
        )
+        self.detailed_stats = config.monitor.detailed_stats
        if config.explorer.over_rollout.ratio > 0.0:
            self.min_wait_num = math.ceil(
                config.buffer.batch_size * (1 - config.explorer.over_rollout.ratio)
@@ -432,7 +433,9 @@ async def _finish_eval_step(self, step: Optional[int] = None, prefix: str = "eva
            metric[f"{prefix}/{eval_task_name}/finished_task_count"] = len(statuses)
            metric.update(
                gather_eval_metrics(
-                    [status.metrics[0] for status in statuses], f"{prefix}/{eval_task_name}"
+                    [status.metrics[0] for status in statuses],
+                    f"{prefix}/{eval_task_name}",
+                    detailed_stats=self.detailed_stats,
                )
            )
            if self.eval_start_time is not None:

trinity/utils/monitor.py (9 additions, 6 deletions)

@@ -58,7 +58,10 @@ def gather_metrics(


 def gather_eval_metrics(
-    metric_list: List[Dict], prefix: str, output_stats: List[str] = ["mean", "max", "min"]
+    metric_list: List[Dict],
+    prefix: str,
+    output_stats: List[str] = ["mean", "max", "min", "std"],
+    detailed_stats: bool = False,
 ) -> Dict:
     if not metric_list:
         return {}
@@ -67,14 +70,14 @@ def gather_eval_metrics(
        numeric_df = df.select_dtypes(include=[np.number])
        metric = {}
        for col in numeric_df.columns:
-            # Skip the columns that are already aggregated
-            key_words = ["std", "mean", "min", "max"]
-            if any(key_word in col.lower() for key_word in key_words):
-                metric[f"{prefix}/{col}"] = numeric_df[col].mean()
-            else:
+            if detailed_stats:
                stats_df = numeric_df[[col]].agg(output_stats)
                for stats in output_stats:
                    metric[f"{prefix}/{col}/{stats}"] = stats_df.loc[stats, col].item()
+            else:
+                # only return the mean of the column
+                metric[f"{prefix}/{col}"] = numeric_df[col].mean()
+
        return metric
    except Exception as e:
        raise ValueError(f"Failed to gather eval metrics: {e}") from e
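
A rough usage sketch of the updated helper (assuming the repository's trinity package is importable; the input metrics below are invented, shaped roughly like what the explorer passes in):

# Rough usage sketch, not part of the commit; input values are invented.
from trinity.utils.monitor import gather_eval_metrics

# One dict of per-task eval results, one entry per finished eval task.
task_metrics = [
    {"accuracy/mean@6": 0.50, "accuracy/best@4": 0.75},
    {"accuracy/mean@6": 0.70, "accuracy/best@4": 1.00},
]

# Default (detailed_stats=False): one value per column, its mean across tasks,
# e.g. {"eval/gsm8k/accuracy/mean@6": 0.6, "eval/gsm8k/accuracy/best@4": 0.875}.
print(gather_eval_metrics(task_metrics, "eval/gsm8k"))

# detailed_stats=True: mean/max/min/std across tasks for each column, producing
# keys such as "eval/gsm8k/accuracy/mean@6/mean", ".../max", ".../min", ".../std".
print(gather_eval_metrics(task_metrics, "eval/gsm8k", detailed_stats=True))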
