
Commit f245b40

Merge branch 'main' into dev/fix_microbatch_loss_scale

2 parents: a914078 + 1120aed

19 files changed: +193 -29 lines

Binary asset changes (previews not shown): -147 KB, -15.8 KB, 464 KB, -50.4 KB

docs/sphinx_doc/source/tutorial/example_search_email.md

Lines changed: 1 addition & 0 deletions
@@ -48,5 +48,6 @@ The results are shown in the following figure (the accuracy ranges from -0.1 to

 ![](../../assets/email_rollout_accuracy.png)

+![](../../assets/email_reward_mean.png)

 ![](../../assets/email_eval_accuracy.png)

docs/sphinx_doc/source/tutorial/trinity_installation.md

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@ For installing Trinity-RFT, you have three options: from source (recommended), v
 Before installing, ensure your system meets the following requirements:

 - **Python**: Version 3.10 to 3.12 (inclusive)
-- **CUDA**: Version 12.4 to 12.8 (inclusive)
+- **CUDA**: Version >= 12.6
 - **GPUs**: At least 2 GPUs

 ---
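For readers who want to sanity-check the updated requirements from an existing PyTorch environment, a minimal sketch is shown below. It is illustrative only and not part of the Trinity-RFT installer; note that `torch.version.cuda` reports the CUDA version the PyTorch build was compiled against, not the system toolkit.

```python
import sys

import torch

# Illustrative check mirroring the updated requirements
# (Python 3.10-3.12, CUDA >= 12.6, at least 2 GPUs). Not part of Trinity-RFT.
assert (3, 10) <= sys.version_info[:2] <= (3, 12), "Python 3.10-3.12 required"

cuda = torch.version.cuda  # CUDA version this PyTorch build was compiled with, e.g. "12.6"
assert cuda is not None and tuple(map(int, cuda.split("."))) >= (12, 6), "CUDA >= 12.6 required"

assert torch.cuda.device_count() >= 2, "at least 2 GPUs required"
print(f"Python {sys.version.split()[0]}, CUDA {cuda}, {torch.cuda.device_count()} GPUs: OK")
```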

docs/sphinx_doc/source_zh/tutorial/example_search_email.md

Lines changed: 2 additions & 0 deletions
@@ -44,4 +44,6 @@ trinity run --config examples/grpo_email_search/email_search.yaml

 ![](../../assets/email_rollout_accuracy.png)

+![](../../assets/email_reward_mean.png)
+
 ![](../../assets/email_eval_accuracy.png)

docs/sphinx_doc/source_zh/tutorial/trinity_installation.md

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 Before installing, please ensure your system meets the following requirements:

 - **Python**: 3.10 to 3.12 (inclusive)
-- **CUDA**: 12.4 to 12.8 (inclusive)
+- **CUDA**: >= 12.6
 - **GPU**: At least 2 GPUs

 ---

examples/grpo_email_search/email_search.yaml

Lines changed: 24 additions & 2 deletions
@@ -6,6 +6,20 @@ algorithm:
   repeat_times: 8
   optimizer:
     lr: 1e-6
+  policy_loss_fn: "rec"
+  policy_loss_fn_args:
+    epsilon_low: 0.2
+    epsilon_high: 0.2
+    clip_mode: "one-side"
+    weight: "none"
+    temp: 1.0
+    regularizer: "none"
+    regularizer_coef: 0.0
+  kl_loss_fn: 'k2'
+  kl_loss_fn_args:
+    kl_coef: 0.0
+  advantage_fn_args:
+    std_cal_level: 'batch'
 model:
   model_path: ${oc.env:TRINITY_MODEL_PATH,Qwen/Qwen3-4B-Instruct-2507}
   max_response_tokens: 4096
@@ -15,8 +29,8 @@ cluster:
   gpu_per_node: 8
 buffer:
   total_epochs: 1
-  batch_size: 16
-  train_batch_size: 640 # 16*8*5
+  batch_size: 64
+  train_batch_size: 2560 # 64*8*5
   explorer_input:
     taskset:
       name: enron_train
@@ -56,6 +70,12 @@ buffer:
      storage_type: queue
      replay_buffer:
        enable: true
+       # reuse_cooldown_time is None
+       priority_fn: 'decay_limit_randomization'
+       priority_fn_args:
+         decay: 2.0
+         use_count_limit: 3
+         sigma: 2.0
 explorer:
   eval_interval: 10
   max_repeat_times_per_runner: 1
@@ -93,3 +113,5 @@ trainer:
   use_dynamic_bsz: true
   max_token_len_per_gpu: 16384
   ulysses_sequence_parallel_size: 1
+monitor:
+  monitor_type: wandb
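The new batch sizes keep the relationship spelled out in the inline comment: one explorer batch of 64 tasks, each rolled out 8 times (`repeat_times`), with what the comment implies is 5 steps per rollout, yields 2560 training experiences. A minimal sketch of that arithmetic follows; the factor of 5 is taken only from the `# 64*8*5` comment, not from any documented setting.

```python
# Batch-size bookkeeping implied by the inline comments in email_search.yaml.
# The steps-per-rollout factor of 5 is an assumption read off the "# 64*8*5" comment.
batch_size = 64         # tasks explored per batch
repeat_times = 8        # rollouts per task (algorithm.repeat_times)
steps_per_rollout = 5   # assumed from the comment

train_batch_size = batch_size * repeat_times * steps_per_rollout
assert train_batch_size == 2560

# The previous values followed the same rule: 16 * 8 * 5 == 640.
assert 16 * repeat_times * steps_per_rollout == 640
```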

tests/algorithm/advantage_fn_test.py

Lines changed: 68 additions & 0 deletions
@@ -326,3 +326,71 @@ def test_batch_level_step_wise_grpo_advantage(self):
         expected_advantages = expected_advantage_value * target_exp.action_mask
         self.assertTrue(torch.allclose(target_exp.advantages, expected_advantages, atol=1e-6))
         self.assertTrue(torch.allclose(target_exp.returns, expected_advantages, atol=1e-6))
+
+    def test_step_wise_grpo_with_std_threshold(self):
+        advantage_fn_cls = ADVANTAGE_FN.get("step_wise_grpo")
+        self.assertIsNotNone(advantage_fn_cls)
+        advantage_fn = advantage_fn_cls(epsilon=1e-6, std_threshold=0.0001)
+        repeat_times = 5
+        step_num = 4
+
+        # Create experiences with mixed reward patterns:
+        # - task 0: all runs have same reward (0.5) -> should be filtered
+        # - task 1: all runs have same reward (1.0) -> should be filtered
+        # - task 2: runs have different rewards (0, 1, 2, 3, 4) -> should NOT be filtered
+        exps = []
+
+        # Task 0: constant reward 0.5
+        for k in range(step_num):
+            for i in range(repeat_times):
+                exps.append(
+                    Experience(
+                        eid=EID(batch=0, task=0, run=i, step=k),
+                        tokens=torch.zeros(5),
+                        prompt_length=2,
+                        reward=0.5,
+                    )
+                )
+
+        # Task 1: constant reward 1.0
+        for k in range(step_num):
+            for i in range(repeat_times):
+                exps.append(
+                    Experience(
+                        eid=EID(batch=0, task=1, run=i, step=k),
+                        tokens=torch.zeros(5),
+                        prompt_length=2,
+                        reward=1.0,
+                    )
+                )
+
+        # Task 2: varying rewards
+        for k in range(step_num):
+            for i in range(repeat_times):
+                exps.append(
+                    Experience(
+                        eid=EID(batch=0, task=2, run=i, step=k),
+                        tokens=torch.zeros(5),
+                        prompt_length=2,
+                        reward=float(i),
+                    )
+                )
+
+        processed_exps, metrics = advantage_fn(exps)
+
+        # Only task 2 should remain (task 0 and task 1 filtered due to zero std)
+        expected_remaining = repeat_times * step_num  # task 2 only
+        expected_filtered = 2 * repeat_times * step_num  # task 0 and task 1
+
+        self.assertEqual(len(processed_exps), expected_remaining)
+        self.assertIn("filtered_count", metrics)
+        self.assertEqual(metrics["filtered_count"], expected_filtered)
+
+        # Verify skipped group ratio: 2 out of 3 tasks were skipped
+        self.assertIn("skipped_group_ratio", metrics)
+        expected_ratio = 2.0 / 3.0  # task 0 and task 1 skipped out of 3 total tasks
+        self.assertAlmostEqual(metrics["skipped_group_ratio"], expected_ratio, places=6)
+
+        # Verify that all remaining experiences are from task 2
+        for exp in processed_exps:
+            self.assertEqual(exp.eid.task, 2)
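The assertions pin down what `std_threshold` is meant to do: experiences are grouped per task, groups whose rewards have (near-)zero standard deviation are dropped before advantages are computed, and the metrics report the number of dropped experiences (`filtered_count`) and the fraction of skipped groups (`skipped_group_ratio`). Below is a self-contained sketch of just that filtering step, assuming a plain list-of-`Experience` interface; it is not the Trinity-RFT implementation, which also computes the step-wise GRPO advantages themselves.

```python
from collections import defaultdict
from statistics import pstdev


def filter_low_std_groups(exps, std_threshold=1e-4):
    """Drop per-task groups with (near-)constant rewards.

    Illustrative sketch of the behaviour asserted by the test above,
    not the Trinity-RFT implementation.
    """
    groups = defaultdict(list)
    for exp in exps:
        groups[(exp.eid.batch, exp.eid.task)].append(exp)

    kept, filtered_count, skipped_groups = [], 0, 0
    for group in groups.values():
        if pstdev(e.reward for e in group) <= std_threshold:
            skipped_groups += 1
            filtered_count += len(group)  # 20 per constant-reward task in the test
        else:
            kept.extend(group)

    metrics = {
        "filtered_count": filtered_count,                      # 40 for tasks 0 and 1 above
        "skipped_group_ratio": skipped_groups / len(groups),   # 2/3 above
    }
    return kept, metrics
```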
