
Commit 732d801

Add KL/Entropy Fn (#64)
1 parent 9d582e8 commit 732d801

File tree: 23 files changed, +361 / -75 lines

docs/sphinx_doc/source/tutorial/example_dpo.md

Lines changed: 3 additions & 2 deletions

@@ -48,6 +48,9 @@ name: <experiment_name>
 mode: train
 algorithm:
   algorithm_type: dpo
+  kl_loss_fn: k1
+  kl_loss_fn_args:
+    kl_coef: 0.1 # value of beta in DPO
 checkpoint_root_dir: /PATH/TO/CHECKPOINT/
 model:
   model_path: /PATH/TO/MODEL/
@@ -70,8 +73,6 @@ buffer:
 trainer:
   trainer_config_path: 'examples/dpo_humanlike/train_dpo.yaml'
   save_interval: 30
-  actor_use_kl_loss: True
-  actor_kl_loss_coef: 0.1 # value of beta in DPO
 ```

 ### Run the Experiment
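The DPO example now routes its beta through the generic KL interface: `kl_loss_fn: k1` with `kl_loss_fn_args.kl_coef` replaces the old `actor_use_kl_loss` / `actor_kl_loss_coef` trainer keys. For reference, the sketch below shows where `kl_coef` (DPO's beta) conventionally enters the DPO objective; the function and variable names are illustrative, not Trinity's actual implementation.

```python
import torch.nn.functional as F

def dpo_loss(policy_chosen_logps, policy_rejected_logps,
             ref_chosen_logps, ref_rejected_logps, kl_coef=0.1):
    """Standard DPO objective; kl_coef plays the role of beta."""
    # Log-ratios of the trained policy against the frozen reference policy.
    chosen_logratio = policy_chosen_logps - ref_chosen_logps
    rejected_logratio = policy_rejected_logps - ref_rejected_logps
    # beta scales how strongly the policy is kept close to the reference.
    logits = kl_coef * (chosen_logratio - rejected_logratio)
    return -F.logsigmoid(logits).mean()
```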

examples/dpo_humanlike/dpo.yaml

Lines changed: 3 additions & 2 deletions

@@ -3,6 +3,9 @@ name: "trinity_dpo"
 mode: train
 algorithm:
   algorithm_type: dpo
+  kl_loss_fn: k1
+  kl_loss_fn_args:
+    kl_coef: 0.1
 checkpoint_root_dir: /PATH/TO/CHECKPOINT/
 model:
   model_path: /PATH/TO/MODEL
@@ -34,5 +37,3 @@ trainer:
   trainer_type: 'verl'
   trainer_config_path: 'examples/dpo_humanlike/train_dpo.yaml'
   save_interval: 30
-  actor_use_kl_loss: True
-  actor_kl_loss_coef: 0.1

tests/template/config.yaml

Lines changed: 3 additions & 1 deletion

@@ -8,10 +8,12 @@ algorithm:
   policy_loss_fn: ppo
   policy_loss_fn_args:
     clip_range: 0.2
-  advantage_fn_type: ppo_adv_fn
+  advantage_fn: ppo
   advantage_fn_args:
     gamma: 1.0
     lam: 1.0
+  kl_penalty_fn: k3
+  kl_loss_fn: k2

 model:
   model_path: ''
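The test template now exercises two KL settings at once: `kl_penalty_fn: k3` and `kl_loss_fn: k2`. The names `k1`/`k2`/`k3` conventionally refer to Schulman's per-token KL estimators; the sketch below shows what they usually compute. The commit's exact formulas and signatures are not visible here, so treat this as an assumption.

```python
import torch

def kl_estimate(logprob: torch.Tensor, ref_logprob: torch.Tensor, kind: str = "k1") -> torch.Tensor:
    """Per-token KL(policy || ref) estimators, following the usual k1/k2/k3 naming."""
    log_ratio = logprob - ref_logprob  # log pi(a|s) - log pi_ref(a|s)
    if kind == "k1":   # plain log-ratio: unbiased, high variance
        return log_ratio
    if kind == "k2":   # squared log-ratio: biased, low variance
        return 0.5 * log_ratio.pow(2)
    if kind == "k3":   # exp(-x) + x - 1: unbiased, low variance
        return torch.exp(-log_ratio) + log_ratio - 1
    raise ValueError(f"unknown KL estimator: {kind}")
```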

tests/trainer/trainer_test.py

Lines changed: 7 additions & 9 deletions

@@ -67,6 +67,10 @@ def test_trainer(self):
         actor_metrics = parser.metric_list("actor")
         self.assertTrue(len(actor_metrics) > 0)
         self.assertEqual(parser.metric_max_step(actor_metrics[0]), 8)
+        actor_kl_metrics = parser.metric_list("actor/kl")
+        self.assertTrue(len(actor_kl_metrics) > 0)
+        critic_kl_metrics = parser.metric_list("critic/kl")
+        self.assertTrue(len(critic_kl_metrics) > 0)
         response_metrics = parser.metric_list("response_length")
         self.assertTrue(len(response_metrics) > 0)
         self.assertEqual(parser.metric_max_step(response_metrics[0]), 8)
@@ -86,7 +90,7 @@ def test_trainer(self):
         )
         self.assertTrue(os.path.exists(checkpoint_step_4))
         self.assertTrue(os.path.exists(checkpoint_step_8))
-
+        # TODO: Reinit will fail when using v1 engine, find a way to fix it
         ray.init(ignore_reinit_error=True)
         # test bench mode
         self.config.mode = "bench"
@@ -118,7 +122,7 @@ def test_trainer(self):
         self.config.algorithm.algorithm_type = AlgorithmType.GRPO
         self.config.algorithm.repeat_times = 4
         # self.config.algorithm.repeat_times = 8 # TODO: used for real testing
-        self.config.algorithm.advantage_fn_type = "grpo_adv_fn"
+        self.config.algorithm.advantage_fn = "grpo"
         self.config.algorithm.advantage_fn_args = {}
         # self.config.buffer.batch_size = 96 # TODO: used for real testing
         self.config.buffer.explorer_input.taskset = get_unittest_dataset_config("gsm8k")
@@ -143,8 +147,6 @@ def test_trainer(self):
         # self.assertTrue(0.4 < rewards[1] < 0.55)
         # self.assertTrue(0.6 < rewards[2] < 0.7)
         # self.assertTrue(0.6 < rewards[3] < 0.7)
-        ray.shutdown(_exiting_interpreter=True)
-        # check checkpoint

     def tearDown(self):
         # remove dir only when the test passed
@@ -157,7 +159,7 @@ def test_trainer(self):
         # test both mode
         self.config.algorithm.algorithm_type = AlgorithmType.GRPO
         self.config.algorithm.repeat_times = 4
-        self.config.algorithm.advantage_fn_type = "grpo_adv_fn"
+        self.config.algorithm.advantage_fn = "grpo"
         self.config.algorithm.advantage_fn_args = {}
         self.config.buffer.explorer_input.taskset = get_unittest_dataset_config("gsm8k")
         self.config.buffer.trainer_input.sft_warmup_steps = 2
@@ -180,8 +182,6 @@ def test_trainer(self):
         response_metrics = parser.metric_list("response_length")
         self.assertTrue(len(response_metrics) > 0)
         self.assertEqual(parser.metric_max_step(response_metrics[0]), 4)
-        ray.shutdown(_exiting_interpreter=True)
-        # check checkpoint

     def tearDown(self):
         # remove dir only when the test passed
@@ -207,8 +207,6 @@ def test_trainer(self):
         actor_metrics = parser.metric_list("actor")
         self.assertTrue(len(actor_metrics) > 0)
         self.assertEqual(parser.metric_max_step(actor_metrics[0]), 4)
-        ray.shutdown(_exiting_interpreter=True)
-        # check checkpoint

     def tearDown(self):
         # remove dir only when the test passed

trinity/algorithm/__init__.py

Lines changed: 6 additions & 0 deletions

@@ -1,9 +1,15 @@
 from trinity.algorithm.advantage_fn import ADVANTAGE_FN, AdvantageFn
+from trinity.algorithm.entropy_loss_fn import ENTROPY_LOSS_FN, EntropyLossFn
+from trinity.algorithm.kl_fn import KL_FN, KLFn
 from trinity.algorithm.policy_loss_fn import POLICY_LOSS_FN, PolicyLossFn

 __all__ = [
     "AdvantageFn",
     "ADVANTAGE_FN",
     "PolicyLossFn",
     "POLICY_LOSS_FN",
+    "KLFn",
+    "KL_FN",
+    "EntropyLossFn",
+    "ENTROPY_LOSS_FN",
 ]
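With these exports, KL and entropy loss functions can be resolved from `trinity.algorithm` by the string names used in the configs above (`k1`, `k2`, `k3`). A hypothetical usage sketch follows, assuming the new registries use the same `register_module` pattern as `ADVANTAGE_FN` in the files below; the lookup method name and constructor signature are assumptions, since only the exports are visible in this commit.

```python
from trinity.algorithm import KL_FN, ENTROPY_LOSS_FN  # new in this commit

# Hypothetical: resolve the KL fn registered as "k1" and instantiate it with
# the kl_coef from kl_loss_fn_args in the yaml configs above.
kl_fn = KL_FN.get("k1")(kl_coef=0.1)
# ENTROPY_LOSS_FN is expected to follow the same name-based lookup pattern.
```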

trinity/algorithm/advantage_fn/grpo_advantage.py

Lines changed: 1 addition & 1 deletion

@@ -11,7 +11,7 @@
 from trinity.trainer.verl import core_algos


-@ADVANTAGE_FN.register_module("grpo_adv_fn")
+@ADVANTAGE_FN.register_module("grpo")
 class GRPOAdvantageFn(AdvantageFn):
     """GRPO advantage computation"""

trinity/algorithm/advantage_fn/opmd_advantage.py

Lines changed: 1 addition & 1 deletion

@@ -11,7 +11,7 @@
 from trinity.trainer.verl import core_algos


-@ADVANTAGE_FN.register_module("opmd_adv_fn")
+@ADVANTAGE_FN.register_module("opmd")
 class OPMDAdvantageFn(AdvantageFn):
     """OPMD advantage computation"""

trinity/algorithm/advantage_fn/ppo_advantage.py

Lines changed: 1 addition & 1 deletion

@@ -11,7 +11,7 @@
 from trinity.trainer.verl import core_algos


-@ADVANTAGE_FN.register_module("ppo_adv_fn")
+@ADVANTAGE_FN.register_module("ppo")
 class PPOAdvantageFn(AdvantageFn):
     def __init__(
         self,

trinity/algorithm/advantage_fn/reinforce_plus_plus_advantage.py

Lines changed: 1 addition & 1 deletion

@@ -11,7 +11,7 @@
 from trinity.trainer.verl import core_algos


-@ADVANTAGE_FN.register_module("reinforceplusplus_adv_fn")
+@ADVANTAGE_FN.register_module("reinforceplusplus")
 class REINFORCEPLUSPLUSAdvantageFn(AdvantageFn):
     def __init__(self, gamma: float = 1.0) -> None:
         self.gamma = gamma

trinity/algorithm/advantage_fn/remax_advantage.py

Lines changed: 1 addition & 1 deletion

@@ -11,7 +11,7 @@
 from trinity.trainer.verl import core_algos


-@ADVANTAGE_FN.register_module("remax_adv_fn")
+@ADVANTAGE_FN.register_module("remax")
 class REMAXAdvantageFn(AdvantageFn):
     def __init__(self) -> None:
         pass
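Taken together, the five hunks above drop the `_adv_fn` suffix from the registered names (`ppo_adv_fn` -> `ppo`, `grpo_adv_fn` -> `grpo`, `opmd_adv_fn` -> `opmd`, `reinforceplusplus_adv_fn` -> `reinforceplusplus`, `remax_adv_fn` -> `remax`), and the config key changes from `advantage_fn_type` to `advantage_fn`. A sketch of an updated `algorithm` section, modeled on the test config above; the values are illustrative, and the `kl_penalty_fn`/`kl_loss_fn` split is assumed to distinguish a reward-side penalty from an actor-loss term.

```yaml
algorithm:
  algorithm_type: grpo
  advantage_fn: grpo          # was: advantage_fn_type: grpo_adv_fn
  advantage_fn_args: {}
  kl_loss_fn: k2              # replaces actor_use_kl_loss / actor_kl_loss_coef
  kl_loss_fn_args:
    kl_coef: 0.1
  kl_penalty_fn: k3
```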
