5 changes: 3 additions & 2 deletions docs/sphinx_doc/source/tutorial/example_dpo.md
@@ -48,6 +48,9 @@ name: <experiment_name>
mode: train
algorithm:
algorithm_type: dpo
kl_loss_fn: k1
kl_loss_fn_args:
kl_coef: 0.1 # value of beta in DPO
checkpoint_root_dir: /PATH/TO/CHECKPOINT/
model:
model_path: /PATH/TO/MODEL/
@@ -70,8 +73,6 @@ buffer:
trainer:
trainer_config_path: 'examples/dpo_humanlike/train_dpo.yaml'
save_interval: 30
actor_use_kl_loss: True
actor_kl_loss_coef: 0.1 # value of beta in DPO
```
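
For context, the `kl_coef` introduced above plays the role of beta in the DPO objective. A minimal reference sketch of that standard objective, assuming sequence-level log-probabilities for the chosen and rejected responses (illustrative names and shapes, not the trainer's actual implementation):

```python
import torch
import torch.nn.functional as F


def dpo_loss(
    policy_chosen_logps: torch.Tensor,    # log pi(y_w | x), shape (batch,)
    policy_rejected_logps: torch.Tensor,  # log pi(y_l | x), shape (batch,)
    ref_chosen_logps: torch.Tensor,       # log pi_ref(y_w | x), shape (batch,)
    ref_rejected_logps: torch.Tensor,     # log pi_ref(y_l | x), shape (batch,)
    beta: float = 0.1,                    # corresponds to kl_coef above
) -> torch.Tensor:
    chosen_rewards = beta * (policy_chosen_logps - ref_chosen_logps)
    rejected_rewards = beta * (policy_rejected_logps - ref_rejected_logps)
    # Maximize the margin between chosen and rejected implicit rewards.
    return -F.logsigmoid(chosen_rewards - rejected_rewards).mean()
```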

### Run the Experiment
5 changes: 3 additions & 2 deletions examples/dpo_humanlike/dpo.yaml
@@ -3,6 +3,9 @@ name: "trinity_dpo"
mode: train
algorithm:
algorithm_type: dpo
kl_loss_fn: k1
kl_loss_fn_args:
kl_coef: 0.1
checkpoint_root_dir: /PATH/TO/CHECKPOINT/
model:
model_path: /PATH/TO/MODEL
@@ -34,5 +37,3 @@ trainer:
trainer_type: 'verl'
trainer_config_path: 'examples/dpo_humanlike/train_dpo.yaml'
save_interval: 30
actor_use_kl_loss: True
actor_kl_loss_coef: 0.1
4 changes: 3 additions & 1 deletion tests/template/config.yaml
@@ -8,10 +8,12 @@ algorithm:
policy_loss_fn: ppo
policy_loss_fn_args:
clip_range: 0.2
advantage_fn_type: ppo_adv_fn
advantage_fn: ppo
advantage_fn_args:
gamma: 1.0
lam: 1.0
kl_penalty_fn: k3
kl_loss_fn: k2

model:
model_path: ''
16 changes: 7 additions & 9 deletions tests/trainer/trainer_test.py
@@ -67,6 +67,10 @@ def test_trainer(self):
actor_metrics = parser.metric_list("actor")
self.assertTrue(len(actor_metrics) > 0)
self.assertEqual(parser.metric_max_step(actor_metrics[0]), 8)
actor_kl_metrics = parser.metric_list("actor/kl")
self.assertTrue(len(actor_kl_metrics) > 0)
critic_kl_metrics = parser.metric_list("critic/kl")
self.assertTrue(len(critic_kl_metrics) > 0)
response_metrics = parser.metric_list("response_length")
self.assertTrue(len(response_metrics) > 0)
self.assertEqual(parser.metric_max_step(response_metrics[0]), 8)
@@ -86,7 +90,7 @@ def test_trainer(self):
)
self.assertTrue(os.path.exists(checkpoint_step_4))
self.assertTrue(os.path.exists(checkpoint_step_8))

# TODO: Reinit will fail when using v1 engine, find a way to fix it
ray.init(ignore_reinit_error=True)
# test bench mode
self.config.mode = "bench"
@@ -118,7 +122,7 @@ def test_trainer(self):
self.config.algorithm.algorithm_type = AlgorithmType.GRPO
self.config.algorithm.repeat_times = 4
# self.config.algorithm.repeat_times = 8 # TODO: used for real testing
self.config.algorithm.advantage_fn_type = "grpo_adv_fn"
self.config.algorithm.advantage_fn = "grpo"
self.config.algorithm.advantage_fn_args = {}
# self.config.buffer.batch_size = 96 # TODO: used for real testing
self.config.buffer.explorer_input.taskset = get_unittest_dataset_config("gsm8k")
@@ -143,8 +147,6 @@ def test_trainer(self):
# self.assertTrue(0.4 < rewards[1] < 0.55)
# self.assertTrue(0.6 < rewards[2] < 0.7)
# self.assertTrue(0.6 < rewards[3] < 0.7)
ray.shutdown(_exiting_interpreter=True)
# check checkpoint

def tearDown(self):
# remove dir only when the test passed
@@ -157,7 +159,7 @@ def test_trainer(self):
# test both mode
self.config.algorithm.algorithm_type = AlgorithmType.GRPO
self.config.algorithm.repeat_times = 4
self.config.algorithm.advantage_fn_type = "grpo_adv_fn"
self.config.algorithm.advantage_fn = "grpo"
self.config.algorithm.advantage_fn_args = {}
self.config.buffer.explorer_input.taskset = get_unittest_dataset_config("gsm8k")
self.config.buffer.trainer_input.sft_warmup_steps = 2
@@ -180,8 +182,6 @@ def test_trainer(self):
response_metrics = parser.metric_list("response_length")
self.assertTrue(len(response_metrics) > 0)
self.assertEqual(parser.metric_max_step(response_metrics[0]), 4)
ray.shutdown(_exiting_interpreter=True)
# check checkpoint

def tearDown(self):
# remove dir only when the test passed
@@ -207,8 +207,6 @@ def test_trainer(self):
actor_metrics = parser.metric_list("actor")
self.assertTrue(len(actor_metrics) > 0)
self.assertEqual(parser.metric_max_step(actor_metrics[0]), 4)
ray.shutdown(_exiting_interpreter=True)
# check checkpoint

def tearDown(self):
# remove dir only when the test passed
6 changes: 6 additions & 0 deletions trinity/algorithm/__init__.py
@@ -1,9 +1,15 @@
from trinity.algorithm.advantage_fn import ADVANTAGE_FN, AdvantageFn
from trinity.algorithm.entropy_loss_fn import ENTROPY_LOSS_FN, EntropyLossFn
from trinity.algorithm.kl_fn import KL_FN, KLFn
from trinity.algorithm.policy_loss_fn import POLICY_LOSS_FN, PolicyLossFn

__all__ = [
"AdvantageFn",
"ADVANTAGE_FN",
"PolicyLossFn",
"POLICY_LOSS_FN",
"KLFn",
"KL_FN",
"EntropyLossFn",
"ENTROPY_LOSS_FN",
]
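
Illustrative only: with these re-exports, algorithm components can be resolved by name from a single import point. The `get`-style lookup below is an assumption about `trinity.utils.registry.Registry`, not something shown in this diff:

```python
from trinity.algorithm import ADVANTAGE_FN, ENTROPY_LOSS_FN, KL_FN

# Resolve component classes by their registered names (lookup method assumed).
advantage_cls = ADVANTAGE_FN.get("grpo")      # renamed from "grpo_adv_fn" in this change
entropy_cls = ENTROPY_LOSS_FN.get("basic")
kl_cls = KL_FN.get("k1")

# Instantiate with the class-provided defaults where available.
entropy_fn = entropy_cls(**entropy_cls.default_args())
```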
2 changes: 1 addition & 1 deletion trinity/algorithm/advantage_fn/grpo_advantage.py
@@ -11,7 +11,7 @@
from trinity.trainer.verl import core_algos


@ADVANTAGE_FN.register_module("grpo_adv_fn")
@ADVANTAGE_FN.register_module("grpo")
class GRPOAdvantageFn(AdvantageFn):
"""GRPO advantage computation"""

2 changes: 1 addition & 1 deletion trinity/algorithm/advantage_fn/opmd_advantage.py
@@ -11,7 +11,7 @@
from trinity.trainer.verl import core_algos


@ADVANTAGE_FN.register_module("opmd_adv_fn")
@ADVANTAGE_FN.register_module("opmd")
class OPMDAdvantageFn(AdvantageFn):
"""OPMD advantage computation"""

2 changes: 1 addition & 1 deletion trinity/algorithm/advantage_fn/ppo_advantage.py
@@ -11,7 +11,7 @@
from trinity.trainer.verl import core_algos


@ADVANTAGE_FN.register_module("ppo_adv_fn")
@ADVANTAGE_FN.register_module("ppo")
class PPOAdvantageFn(AdvantageFn):
def __init__(
self,
2 changes: 1 addition & 1 deletion trinity/algorithm/advantage_fn/reinforceplusplus_advantage.py
@@ -11,7 +11,7 @@
from trinity.trainer.verl import core_algos


@ADVANTAGE_FN.register_module("reinforceplusplus_adv_fn")
@ADVANTAGE_FN.register_module("reinforceplusplus")
class REINFORCEPLUSPLUSAdvantageFn(AdvantageFn):
def __init__(self, gamma: float = 1.0) -> None:
self.gamma = gamma
2 changes: 1 addition & 1 deletion trinity/algorithm/advantage_fn/remax_advantage.py
@@ -11,7 +11,7 @@
from trinity.trainer.verl import core_algos


@ADVANTAGE_FN.register_module("remax_adv_fn")
@ADVANTAGE_FN.register_module("remax")
class REMAXAdvantageFn(AdvantageFn):
def __init__(self) -> None:
pass
2 changes: 1 addition & 1 deletion trinity/algorithm/advantage_fn/rloo_advantage.py
@@ -11,7 +11,7 @@
from trinity.trainer.verl import core_algos


@ADVANTAGE_FN.register_module("rloo_adv_fn")
@ADVANTAGE_FN.register_module("rloo")
class RLOOAdvantageFn(AdvantageFn):
def __init__(self) -> None:
pass
Empty file.
9 changes: 9 additions & 0 deletions trinity/algorithm/entropy_loss_fn/__init__.py
@@ -0,0 +1,9 @@
from trinity.algorithm.entropy_loss_fn.entropy_loss_fn import (
ENTROPY_LOSS_FN,
EntropyLossFn,
)

__all__ = [
"EntropyLossFn",
"ENTROPY_LOSS_FN",
]
63 changes: 63 additions & 0 deletions trinity/algorithm/entropy_loss_fn/entropy_loss_fn.py
@@ -0,0 +1,63 @@
from abc import ABC, abstractmethod
from typing import Dict, Tuple

import torch

from trinity.algorithm.utils import masked_mean
from trinity.utils.registry import Registry

ENTROPY_LOSS_FN = Registry("entropy_loss_fn")


class EntropyLossFn(ABC):
"""
Entropy loss function.
"""

@abstractmethod
def __call__(
self,
entropy: torch.Tensor,
action_mask: torch.Tensor,
**kwargs,
) -> Tuple[torch.Tensor, Dict]:
"""
Args:
entropy (`torch.Tensor`): The entropy generated by the policy model.
action_mask (`torch.Tensor`): The action mask.

Returns:
`torch.Tensor`: The calculated entropy loss.
`Dict`: The metrics for logging
"""

@classmethod
@abstractmethod
def default_args(cls) -> Dict:
"""
Returns:
`Dict`: The default arguments for the entropy loss function.
"""


@ENTROPY_LOSS_FN.register_module("basic")
class BasicEntropyLossFn(EntropyLossFn):
"""
Basic entropy loss function.
"""

def __init__(self, entropy_coef: float):
self.entropy_coef = entropy_coef

def __call__(
self,
entropy: torch.Tensor,
action_mask: torch.Tensor,
**kwargs,
) -> Tuple[torch.Tensor, Dict]:
entropy_loss = masked_mean(entropy, action_mask)
return entropy_loss * self.entropy_coef, {"entropy_loss": entropy_loss.detach().item()}

@classmethod
def default_args(cls) -> Dict:
return {"entropy_coef": 0.0}
3 changes: 3 additions & 0 deletions trinity/algorithm/kl_fn/__init__.py
@@ -0,0 +1,3 @@
from trinity.algorithm.kl_fn.kl_fn import KL_FN, KLFn

__all__ = ["KLFn", "KL_FN"]
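
For reference, the `k1`/`k2`/`k3` names used in the configs above match the commonly used per-token KL estimators; a sketch of what each typically computes (the actual signatures in `trinity.algorithm.kl_fn` may differ):

```python
import torch


def kl_estimate(logprob: torch.Tensor, ref_logprob: torch.Tensor, kind: str = "k1") -> torch.Tensor:
    # log_ratio = log pi(a|s) - log pi_ref(a|s)
    log_ratio = logprob - ref_logprob
    if kind == "k1":
        return log_ratio                               # unbiased, higher variance
    if kind == "k2":
        return 0.5 * log_ratio.pow(2)                  # biased, lower variance
    if kind == "k3":
        return torch.exp(-log_ratio) - 1 + log_ratio   # unbiased, lower variance
    raise ValueError(f"unknown KL estimator: {kind}")
```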