
Commit e8be774

[Example] Clip_B and Clip_V from entropy dynamics (#509)
1 parent 02c7c8e commit e8be774

File tree: 14 files changed, +1496 −9 lines

examples/entropy/README.md

Lines changed: 80 additions & 0 deletions
# Entropy dynamics of RL training

This example implements the two algorithms **Clip_B** and **Clip_V** from [On the Entropy Dynamics in Reinforcement Fine-Tuning of Large Language Models](https://arxiv.org/pdf/2602.03392).

NOTE: This example has only been tested with trinity==0.5.1 and verl==0.7.0. The experiments below require `synchronizer.sync_interval=1` and `trainer.trainer_config.algorithm.rollout_correction.bypass_mode=false`.

We also provide a runnable branch in the [Trinity-RFT](https://github.com/hiyuchang/Trinity-RFT/tree/example/entropy) repository that already includes all patches for this example.

## Data Preparation

We use the [DAPO-Math-17k](https://huggingface.co/datasets/open-r1/DAPO-Math-17k-Processed) dataset as our training set, holding out 500 questions from it as the validation set (denoted dapo-validation-500).
We then filter the training set by removing samples with excessively high (≥ 15/16) or low (≤ 1/16) pass rates, as evaluated by Qwen2.5-7B-Instruct.
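The pass-rate filter described above can be sketched as follows (the function name and data layout are ours, not the repo's API; pass counts assume 16 rollouts per question, matching the 1/16 and 15/16 thresholds):

```python
def filter_by_pass_rate(pass_counts, n_rollouts=16):
    """Keep only questions whose pass rate over n_rollouts attempts
    is strictly between 1/16 and 15/16 (i.e. neither trivially easy
    nor essentially unsolvable for the grading model)."""
    kept = []
    for qid, n_passed in pass_counts.items():
        rate = n_passed / n_rollouts
        if 1 / 16 < rate < 15 / 16:
            kept.append(qid)
    return kept

# Example: only q3 survives; q2 is at the low cutoff, q4 at the high one.
kept = filter_by_pass_rate({"q1": 0, "q2": 1, "q3": 8, "q4": 15, "q5": 16})
```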
## Clip_B Experiment

1. Apply the patch to keep entropy information in the trainer batch:

    ```bash
    cd /path/to/Trinity-RFT
    git apply examples/entropy/clipb_trainer.patch
    # if not successful, try:
    # git apply --3way --ignore-whitespace examples/entropy/clipb_trainer.patch
    ```

2. Update the dataset paths and other configurations in [`clipb.yaml`](./clipb.yaml) to point to your local data.

3. Run the experiment:

    ```bash
    trinity run examples/entropy/clipb.yaml
    ```

## Clip_V Experiment

1. Apply the patch to keep entropy information in the trainer batch:

    ```bash
    cd /path/to/Trinity-RFT
    git apply examples/entropy/clipv_trainer.patch
    # if not successful, try:
    # git apply --3way --ignore-whitespace examples/entropy/clipv_trainer.patch
    ```

2. Update the dataset paths and other configurations in [`clipv.yaml`](./clipv.yaml) to point to your local data.

3. Run the experiment:

    ```bash
    trinity run examples/entropy/clipv.yaml
    ```

### Logic of Clip_V

As shown in the following flowchart, the forward pass in [examples/entropy/clipv_dp_actor.py](./clipv_dp_actor.py) outputs `log_probs`, `entropy`, and `nec`.
These signals are then used by the [Clip_V advantage function](../../trinity/algorithm/advantage_fn/clipv_advantage.py) to compute `xD` and clip only negative-advantage tokens, returning the revised `advantages`.

```mermaid
flowchart TD
    A["data"]
    B["forward pass"]
    C1["log_probs"]
    C2["entropy (additional)"]
    C3["nec (additional)"]
    subgraph D["advantage computation"]
        direction TB
        F["xD = nec - exp(log_probs) * (entropy + log_probs)"]
        G["only clip negative-advantage tokens"]
        F --> G
    end
    E["advantages"]

    A --> B
    B --> C1
    B --> C2
    B --> C3
    C1 --> D
    C2 --> D
    C3 --> D
    D --> E
```
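The computation in the flowchart can be sketched per token as below. The `xD` formula is taken from the flowchart; the exact clipping rule is an assumption on our part (we zero out negative-advantage tokens whose `xD` exceeds the `mu` threshold from `clipv.yaml`), so treat this as an illustration rather than the repo's implementation:

```python
import math

def clipv_advantages(log_probs, entropy, nec, advantages, mu=8.5):
    """Illustrative sketch of Clip_V (helper name and clipping rule are ours).
    For each token: xD = nec - exp(log_prob) * (entropy + log_prob).
    Only negative-advantage tokens are eligible for clipping."""
    revised = []
    for lp, h, n, adv in zip(log_probs, entropy, nec, advantages):
        xD = n - math.exp(lp) * (h + lp)
        if adv < 0 and xD > mu:
            revised.append(0.0)  # clip: drop this token's negative advantage
        else:
            revised.append(adv)  # positive-advantage tokens are never clipped
    return revised
```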

examples/entropy/clipb.yaml

Lines changed: 100 additions & 0 deletions
project: math_dapo
name: clipb_example
checkpoint_root_dir: ${oc.env:TRINITY_CHECKPOINT_ROOT_DIR,./checkpoints}
model:
  model_path: ${oc.env:TRINITY_MODEL_PATH,Qwen/Qwen2.5-7B-Instruct}
  max_prompt_tokens: 1024
  max_response_tokens: 7168
algorithm:
  algorithm_type: grpo_verl
  advantage_fn: clipb
  advantage_fn_args:
    mu: 2.5
  repeat_times: 16
  kl_loss_fn_args:
    kl_coef: 0.0
cluster:
  node_num: 1
  gpu_per_node: 8
buffer:
  total_epochs: 20
  batch_size: 64
  explorer_input:
    taskset:
      name: dapo_235
      storage_type: file
      path: ${oc.env:TRINITY_TASKSET_PATH} # processed DAPO-Math-17k
      format:
        prompt_key: 'question'
        response_key: 'ground_truth'
      rollout_args:
        temperature: 1.0
        logprobs: 20
    eval_tasksets:
      - name: dapo-validation-500
        storage_type: file
        path: '/path/to/dapo-validation' # validation samples from DAPO-Math-17k
        split: 'test'
        repeat_times: 32
        format:
          prompt_key: 'question'
          response_key: 'ground_truth'
        rollout_args:
          temperature: 0.7
      - name: amc23
        storage_type: file
        path: math-ai/amc23 # Path to the AMC23 dataset
        split: 'test'
        repeat_times: 32
        format:
          prompt_key: 'question'
          response_key: 'answer'
        rollout_args:
          temperature: 0.7
      - name: aime24
        storage_type: file
        path: HuggingFaceH4/aime_2024 # Path to the AIME2024 dataset
        split: 'train'
        repeat_times: 32
        format:
          prompt_key: 'problem'
          response_key: 'answer'
        rollout_args:
          temperature: 0.7
      - name: aime25
        storage_type: file
        path: math-ai/aime25 # Path to the AIME2025 dataset
        split: 'test'
        repeat_times: 32
        format:
          prompt_key: 'problem'
          response_key: 'answer'
        rollout_args:
          temperature: 0.7
    default_workflow_type: 'async_math_workflow'
    default_reward_fn_type: 'math_boxed_reward'
  trainer_input:
    experience_buffer:
      name: math_buffer
      storage_type: queue
      max_read_timeout: 7200
explorer:
  eval_interval: 20
  eval_on_startup: true
  runner_per_model: 8
  rollout_model:
    engine_type: vllm_async
    engine_num: 4
    tensor_parallel_size: 1
seed: 42
trainer:
  trainer_type: 'verl'
  save_interval: 200
  trainer_config:
    algorithm:
      rollout_correction:
        bypass_mode: false
synchronizer:
  sync_method: 'nccl'
  sync_interval: 1
  sync_timeout: 3200
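The `${oc.env:VAR,default}` entries in the config are OmegaConf environment-variable interpolations, so the model, checkpoint, and taskset locations can be supplied from the shell before launching (the paths below are placeholders for your own local data):

```shell
# Placeholders: point these at your own checkpoint dir, model, and processed dataset.
export TRINITY_CHECKPOINT_ROOT_DIR=/data/checkpoints
export TRINITY_MODEL_PATH=Qwen/Qwen2.5-7B-Instruct
export TRINITY_TASKSET_PATH=/data/dapo-math-17k-processed
```

With these set, `trinity run examples/entropy/clipb.yaml` picks them up; unset variables with a default (e.g. `TRINITY_MODEL_PATH`) fall back to the value after the comma.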
Lines changed: 11 additions & 0 deletions
--- a/trinity/trainer/verl_trainer.py
+++ b/trinity/trainer/verl_trainer.py
@@ -501,7 +501,8 @@ class VerlPPOTrainerWrapper(RayPPOTrainer, TrainEngineWrapper):
         }
         metrics.update(old_log_prob_metrics)
-        old_log_prob.batch.pop("entropys")
+        # Keep entropys in batch so advantage_fn (e.g. Clip_B) can use it
+        # old_log_prob.batch.pop("entropys")
         batch = batch.union(old_log_prob)
         if "rollout_log_probs" in batch.batch.keys():
             # TODO: we may want to add diff of probs too.
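The point of the patch is that `entropys` now survives the `batch.union(old_log_prob)` merge and is visible to the advantage function. A minimal dict-based sketch of that data flow (a plain-dict stand-in for verl's batch object; the `union` helper and merge semantics here are ours, purely for illustration):

```python
def union(batch, other):
    """Dict-based stand-in for batch.union(): merge keys from both sides."""
    merged = dict(other)
    merged.update(batch)
    return merged

old_log_prob = {"old_log_probs": [-0.2, -1.3], "entropys": [0.7, 1.1]}
batch = {"responses": ["...", "..."]}

# Without the patch, "entropys" would have been popped before this merge
# and the advantage function could never see it.
batch = union(batch, old_log_prob)
```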

examples/entropy/clipv.yaml

Lines changed: 100 additions & 0 deletions
project: math_dapo
name: clipv_example
checkpoint_root_dir: ${oc.env:TRINITY_CHECKPOINT_ROOT_DIR,./checkpoints}
model:
  model_path: ${oc.env:TRINITY_MODEL_PATH,Qwen/Qwen2.5-7B-Instruct}
  max_prompt_tokens: 1024
  max_response_tokens: 7168
algorithm:
  algorithm_type: grpo_verl
  advantage_fn: clipv
  advantage_fn_args:
    mu: 8.5
  repeat_times: 8
  kl_loss_fn_args:
    kl_coef: 0.0
cluster:
  node_num: 1
  gpu_per_node: 8
buffer:
  total_epochs: 20
  batch_size: 64
  explorer_input:
    taskset:
      name: dapo_235
      storage_type: file
      path: ${oc.env:TRINITY_TASKSET_PATH} # processed DAPO-Math-17k
      format:
        prompt_key: 'question'
        response_key: 'ground_truth'
      rollout_args:
        temperature: 1.0
        logprobs: 20
    eval_tasksets:
      - name: dapo-validation-500
        storage_type: file
        path: '/path/to/dapo-validation' # validation samples from DAPO-Math-17k
        split: 'test'
        repeat_times: 32
        format:
          prompt_key: 'question'
          response_key: 'ground_truth'
        rollout_args:
          temperature: 0.7
      - name: amc23
        storage_type: file
        path: math-ai/amc23 # Path to the AMC23 dataset
        split: 'test'
        repeat_times: 32
        format:
          prompt_key: 'question'
          response_key: 'answer'
        rollout_args:
          temperature: 0.7
      - name: aime24
        storage_type: file
        path: HuggingFaceH4/aime_2024 # Path to the AIME2024 dataset
        split: 'train'
        repeat_times: 32
        format:
          prompt_key: 'problem'
          response_key: 'answer'
        rollout_args:
          temperature: 0.7
      - name: aime25
        storage_type: file
        path: math-ai/aime25 # Path to the AIME2025 dataset
        split: 'test'
        repeat_times: 32
        format:
          prompt_key: 'problem'
          response_key: 'answer'
        rollout_args:
          temperature: 0.7
    default_workflow_type: 'async_math_workflow'
    default_reward_fn_type: 'math_boxed_reward'
  trainer_input:
    experience_buffer:
      name: math_buffer
      storage_type: queue
      max_read_timeout: 7200
explorer:
  eval_interval: 20
  eval_on_startup: true
  runner_per_model: 8
  rollout_model:
    engine_type: vllm_async
    engine_num: 4
    tensor_parallel_size: 1
seed: 42
trainer:
  trainer_type: 'verl'
  save_interval: 100
  trainer_config:
    algorithm:
      rollout_correction:
        bypass_mode: false
synchronizer:
  sync_method: 'nccl'
  sync_interval: 1
  sync_timeout: 3600
