Commit 70c21ac
Add unittest && bug fix
1 parent fe217aa commit 70c21ac

13 files changed (+305 −89 lines)


pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@ requires-python = ">=3.10"
 dependencies = [
     "verl==0.3.0.post1",
     "ray[default]>=2.45.0",
-    "vllm>=0.8.5",
+    "vllm==0.8.5.post1",
     "tensordict==0.6.2",
     "wandb",
     "omegaconf",

tests/template/data/sft_for_gsm8k/sft.jsonl

Lines changed: 32 additions & 0 deletions
Large diffs are not rendered by default.
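The warmup data itself is not rendered, but the sft_for_gsm8k entry added to tests/tools.py below reads it as plaintext with prompt_key="prompt" and response_key="response", so each of the 32 added lines is presumably a JSON object with those two fields. The record below is a hypothetical illustration of that shape, not content taken from the actual file:

import json

# Hypothetical sft.jsonl record; only the field names ("prompt", "response") come
# from the FormatConfig below, the question/answer text is illustrative.
record = {
    "prompt": "Natalia sold clips to 48 of her friends in April, and then half as many in May. How many clips did she sell altogether?",
    "response": "She sold 48 / 2 = 24 clips in May, so 48 + 24 = 72 clips altogether. The answer is 72.",
}
print(json.dumps(record))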

tests/tools.py

Lines changed: 47 additions & 0 deletions
@@ -13,6 +13,7 @@
     StorageConfig,
     load_config,
 )
+from trinity.common.constants import PromptType


 def get_template_config() -> Config:
@@ -59,6 +60,47 @@ def get_unittest_dataset_config(
             default_workflow_type="math_workflow",
             default_reward_fn_type="countdown_reward",
         )
+    elif dataset_name == "gsm8k":
+        return StorageConfig(
+            name=dataset_name,
+            path="openai/gsm8k",
+            split=split,
+            subset_name="main",
+            format=FormatConfig(
+                prompt_key="question",
+                response_key="answer",
+            ),
+            rollout_args=GenerationConfig(
+                n=1,
+                temperature=1.0,
+                logprobs=0,
+            ),
+            default_workflow_type="math_workflow",
+            default_reward_fn_type="math_reward",
+        )
+    elif dataset_name == "sft_for_gsm8k":
+        return StorageConfig(
+            name=dataset_name,
+            path=os.path.join(os.path.dirname(__file__), "template", "data", "sft_for_gsm8k"),
+            split="train",
+            format=FormatConfig(
+                prompt_type=PromptType.PLAINTEXT,
+                prompt_key="prompt",
+                response_key="response",
+            ),
+        )
+    elif dataset_name == "dpo":
+        return StorageConfig(
+            name=dataset_name,
+            path="HumanLLMs/Human-Like-DPO-Dataset",
+            split="train",
+            format=FormatConfig(
+                prompt_type=PromptType.PLAINTEXT,
+                prompt_key="prompt",
+                chosen_key="chosen",
+                rejected_key="rejected",
+            ),
+        )
     else:
         raise ValueError(f"Unknown dataset name: {dataset_name}")

@@ -104,6 +146,11 @@ def metric_steps(self, metric_name: str) -> List[int]:
             raise ValueError(f"Metric '{metric_name}' does not exist.")
         return list(self._metrics[metric_name].keys())

+    def metric_values(self, metric_name: str) -> List:
+        if not self.metric_exist(metric_name):
+            raise ValueError(f"Metric '{metric_name}' does not exist.")
+        return list(self._metrics[metric_name].values())
+
     def metric_list(self, metric_prefix: str) -> List[str]:
         return [name for name in self._metrics if name.startswith(metric_prefix)]
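Taken together, these additions give the trainer tests a dataset factory plus a way to read metric values (not just step indices) back out of TensorBoard; the commented-out reward checks in trainer_test.py below rely on metric_values. A short usage sketch, assuming the helpers stay importable from tests/tools.py and using a hypothetical log directory:

# Illustrative usage of the new helpers; the path and metric name are assumptions.
from tests.tools import TensorBoardParser, get_unittest_dataset_config

taskset = get_unittest_dataset_config("gsm8k")         # StorageConfig pointing at openai/gsm8k
parser = TensorBoardParser("/tmp/checkpoints/tensorboard")

steps = parser.metric_steps("critic/rewards/mean")     # step indices, existing API
rewards = parser.metric_values("critic/rewards/mean")  # per-step values, added in this commit
assert len(steps) == len(rewards)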

tests/trainer/trainer_test.py

Lines changed: 105 additions & 2 deletions
@@ -14,8 +14,8 @@
     get_template_config,
     get_unittest_dataset_config,
 )
-from trinity.cli.launcher import bench, both
-from trinity.common.constants import MonitorType, SyncMethod
+from trinity.cli.launcher import bench, both, train
+from trinity.common.constants import AlgorithmType, MonitorType, SyncMethod


 class BaseTrainerCase(RayUnittestBase):
@@ -109,3 +109,106 @@ def test_trainer(self):
     def tearDown(self):
         # remove dir only when the test passed
         shutil.rmtree(self.config.checkpoint_job_dir)
+
+
+class TestTrainerGSM8K(BaseTrainerCase):
+    def test_trainer(self):
+        """Test GSM8K."""
+        # test both mode
+        self.config.algorithm.algorithm_type = AlgorithmType.GRPO
+        self.config.algorithm.repeat_times = 8
+        self.config.algorithm.advantage_fn_type = "grpo_adv_fn"
+        self.config.algorithm.advantage_fn_args = {}
+        # self.config.buffer.batch_size = 96  # TODO: used for real testing
+        self.config.buffer.explorer_input.taskset = get_unittest_dataset_config("gsm8k")
+        self.config.check_and_update()
+        self.config.trainer.trainer_config.trainer.total_training_steps = 4
+        self.config.trainer.trainer_config.trainer.max_actor_ckpt_to_keep = 2
+        self.config.trainer.trainer_config.actor_rollout_ref.actor.optim.lr = 1e-5
+        both(self.config)
+        parser = TensorBoardParser(os.path.join(self.config.monitor.cache_dir, "tensorboard"))
+        rollout_metrics = parser.metric_list("rollout")
+        self.assertTrue(len(rollout_metrics) > 0)
+        self.assertEqual(parser.metric_max_step(rollout_metrics[0]), 4)
+        actor_metrics = parser.metric_list("actor")
+        self.assertTrue(len(actor_metrics) > 0)
+        self.assertEqual(parser.metric_max_step(actor_metrics[0]), 4)
+        response_metrics = parser.metric_list("response_length")
+        self.assertTrue(len(response_metrics) > 0)
+        self.assertEqual(parser.metric_max_step(response_metrics[0]), 4)
+        # TODO: used for real testing
+        # rewards = parser.metric_values("critic/rewards/mean")
+        # self.assertTrue(0.4 < rewards[0] < 0.55)
+        # self.assertTrue(0.4 < rewards[1] < 0.55)
+        # self.assertTrue(0.6 < rewards[2] < 0.7)
+        # self.assertTrue(0.6 < rewards[3] < 0.7)
+        ray.shutdown(_exiting_interpreter=True)
+        # check checkpoint
+
+    def tearDown(self):
+        # remove dir only when the test passed
+        shutil.rmtree(self.config.checkpoint_job_dir)
+
+
+class TestTrainerGSM8KWithSFT(BaseTrainerCase):
+    def test_trainer(self):
+        """Test GSM8K With SFT."""
+        # test both mode
+        self.config.algorithm.algorithm_type = AlgorithmType.GRPO
+        self.config.algorithm.repeat_times = 8
+        self.config.algorithm.advantage_fn_type = "grpo_adv_fn"
+        self.config.algorithm.advantage_fn_args = {}
+        self.config.buffer.explorer_input.taskset = get_unittest_dataset_config("gsm8k")
+        self.config.buffer.trainer_input.sft_warmup_steps = 2
+        self.config.buffer.trainer_input.sft_warmup_dataset = get_unittest_dataset_config(
+            "sft_for_gsm8k"
+        )
+        self.config.check_and_update()
+        self.config.trainer.trainer_config.trainer.total_training_steps = 4
+        self.config.trainer.trainer_config.trainer.max_actor_ckpt_to_keep = 2
+        self.config.trainer.trainer_config.actor_rollout_ref.actor.optim.lr = 1e-5
+        both(self.config)
+        parser = TensorBoardParser(os.path.join(self.config.monitor.cache_dir, "tensorboard"))
+        rollout_metrics = parser.metric_list("rollout")
+        self.assertTrue(len(rollout_metrics) > 0)
+        self.assertEqual(parser.metric_max_step(rollout_metrics[0]), 2)
+        actor_metrics = parser.metric_list("actor")
+        self.assertTrue(len(actor_metrics) > 0)
+        self.assertEqual(parser.metric_max_step(actor_metrics[0]), 2)  # SFT
+        self.assertEqual(parser.metric_max_step(actor_metrics[-1]), 4)  # RFT
+        response_metrics = parser.metric_list("response_length")
+        self.assertTrue(len(response_metrics) > 0)
+        self.assertEqual(parser.metric_max_step(response_metrics[0]), 4)
+        ray.shutdown(_exiting_interpreter=True)
+        # check checkpoint
+
+    def tearDown(self):
+        # remove dir only when the test passed
+        shutil.rmtree(self.config.checkpoint_job_dir)
+
+
+class TestTrainerDPO(BaseTrainerCase):
+    def test_trainer(self):
+        """Test DPO."""
+        # test train mode
+        self.config.mode = "train"
+        self.config.algorithm.algorithm_type = AlgorithmType.DPO
+        self.config.algorithm.policy_loss_fn = "dpo"
+        self.config.algorithm.policy_loss_fn_args = {}
+        # self.config.buffer.batch_size = 32
+        self.config.buffer.trainer_input.experience_buffer = get_unittest_dataset_config("dpo")
+        self.config.check_and_update()
+        self.config.trainer.trainer_config.trainer.total_training_steps = 4
+        self.config.trainer.trainer_config.trainer.max_actor_ckpt_to_keep = 2
+        self.config.trainer.trainer_config.actor_rollout_ref.actor.optim.lr = 5e-7
+        train(self.config)
+        parser = TensorBoardParser(os.path.join(self.config.monitor.cache_dir, "tensorboard"))
+        actor_metrics = parser.metric_list("actor")
+        self.assertTrue(len(actor_metrics) > 0)
+        self.assertEqual(parser.metric_max_step(actor_metrics[0]), 4)
+        ray.shutdown(_exiting_interpreter=True)
+        # check checkpoint
+
+    def tearDown(self):
+        # remove dir only when the test passed
+        shutil.rmtree(self.config.checkpoint_job_dir)

trinity/algorithm/policy_loss_fn/dpo_loss.py

Lines changed: 18 additions & 9 deletions
@@ -1,6 +1,6 @@
 """DPO loss function."""

-from typing import Any, Dict, Tuple
+from typing import Dict, List, Tuple

 import torch
 import torch.nn.functional as F
@@ -22,21 +22,19 @@ def __init__(
     def __call__(
         self,
         logprob: torch.Tensor,
-        old_logprob: torch.Tensor,
-        action_mask: torch.Tensor,
-        advantages: torch.Tensor,
-        experiences: Any,
+        ref_log_prob: torch.Tensor,
+        response_mask: torch.Tensor,
         **kwargs,
     ) -> Tuple[torch.Tensor, Dict]:
         chosen_logprob = logprob[::2]
         rejected_logprob = logprob[1::2]
-        chosen_mask = action_mask[::2]
-        rejected_mask = action_mask[1::2]
+        chosen_mask = response_mask[::2]
+        rejected_mask = response_mask[1::2]
         chosen_logprob_sum = masked_sum(chosen_logprob, chosen_mask)
         rejected_logprob_sum = masked_sum(rejected_logprob, rejected_mask)

-        chosen_ref_logprob = old_logprob[::2]
-        rejected_ref_logprob = old_logprob[1::2]
+        chosen_ref_logprob = ref_log_prob[::2]
+        rejected_ref_logprob = ref_log_prob[1::2]
         chosen_ref_logprob_sum = masked_sum(chosen_ref_logprob, chosen_mask)
         rejected_ref_logprob_sum = masked_sum(rejected_ref_logprob, rejected_mask)

@@ -65,3 +63,14 @@ def default_args(cls) -> Dict:
             "beta": 0.1,
             "label_smoothing": 0.0,
         }
+
+    @property
+    def select_keys(self) -> List[str]:
+        return [
+            "attention_mask",
+            "input_ids",
+            "position_ids",
+            "response_mask",
+            "responses",
+            "ref_log_prob",
+        ]
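The renamed arguments do not change the batch layout this loss assumes: chosen and rejected responses arrive interleaved along the batch dimension (chosen at even indices, rejected at odd), which is what the [::2] / [1::2] slicing encodes. Below is a self-contained sketch of the same pattern with the standard sigmoid DPO objective, using a locally defined masked_sum with the semantics the repo's helper is assumed to have; it is an illustration, not the file's exact code.

import torch
import torch.nn.functional as F


def masked_sum(values: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
    # Assumed semantics: per-sequence sum of token log-probs over response tokens.
    return (values * mask).sum(dim=-1)


def dpo_loss_sketch(logprob, ref_log_prob, response_mask, beta=0.1):
    # Even rows are chosen responses, odd rows are the paired rejected responses.
    chosen_lp = masked_sum(logprob[::2], response_mask[::2])
    rejected_lp = masked_sum(logprob[1::2], response_mask[1::2])
    chosen_ref = masked_sum(ref_log_prob[::2], response_mask[::2])
    rejected_ref = masked_sum(ref_log_prob[1::2], response_mask[1::2])
    # Standard DPO: -log sigmoid(beta * (policy margin - reference margin)).
    logits = beta * ((chosen_lp - rejected_lp) - (chosen_ref - rejected_ref))
    return -F.logsigmoid(logits).mean()


# Two preference pairs, four tokens each.
lp, ref, mask = torch.randn(4, 4), torch.randn(4, 4), torch.ones(4, 4)
print(dpo_loss_sketch(lp, ref, mask))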

trinity/algorithm/policy_loss_fn/opmd_policy_loss.py

Lines changed: 16 additions & 5 deletions
@@ -3,7 +3,7 @@
 Modified from https://github.com/volcengine/verl/blob/main/verl/trainer/ppo/core_algos.py
 """

-from typing import Any, Dict, Tuple
+from typing import Dict, List, Tuple

 import torch

@@ -19,17 +19,28 @@ def __init__(self, tau: float = 1.0) -> None:
     def __call__(
         self,
         logprob: torch.Tensor,
-        old_logprob: torch.Tensor,
-        action_mask: torch.Tensor,
+        old_log_probs: torch.Tensor,
+        response_mask: torch.Tensor,
         advantages: torch.Tensor,
-        experiences: Any,
         **kwargs,
     ) -> Tuple[torch.Tensor, Dict]:
         pg_losses = -advantages * logprob
-        opmd_loss = masked_mean(pg_losses, action_mask)
+        opmd_loss = masked_mean(pg_losses, response_mask)
         opmd_loss = opmd_loss / (1.0 + self.tau)  # for regularization (w.r.t. current pi_theta)
         return opmd_loss, {"opmd_loss": opmd_loss.detach().item()}

     @classmethod
     def default_args(cls) -> Dict:
         return {"tau": 1.0}
+
+    @property
+    def select_keys(self) -> List[str]:
+        return [
+            "responses",
+            "input_ids",
+            "attention_mask",
+            "position_ids",
+            "old_log_probs",
+            "advantages",
+            "response_mask",
+        ]
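Unlike the PPO loss below, this objective multiplies advantages by the log-probability directly rather than by an importance ratio, and then shrinks the result by 1/(1 + tau). A self-contained sketch with a locally defined masked_mean (assumed semantics: average over unmasked response tokens), for illustration only:

import torch


def masked_mean(values: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
    # Assumed semantics: mean of values over positions where mask == 1.
    return (values * mask).sum() / mask.sum().clamp(min=1)


def opmd_loss_sketch(logprob, advantages, response_mask, tau=1.0):
    pg_losses = -advantages * logprob             # no exp(logprob - old_log_probs) ratio here
    loss = masked_mean(pg_losses, response_mask)
    return loss / (1.0 + tau)                     # regularization w.r.t. the current policy


logprob = torch.randn(2, 5)
advantages = torch.randn(2, 5)
mask = torch.ones(2, 5)
print(opmd_loss_sketch(logprob, advantages, mask))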

trinity/algorithm/policy_loss_fn/policy_loss_fn.py

Lines changed: 10 additions & 9 deletions
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Any, Dict, Tuple
+from typing import Dict, List, Tuple

 import torch

@@ -16,20 +16,14 @@ class PolicyLossFn(ABC):
     @abstractmethod
     def __call__(
         self,
-        logprob: torch.Tensor,
-        old_logprob: torch.Tensor,
-        action_mask: torch.Tensor,
-        advantages: torch.Tensor,
-        experiences: Any,
         **kwargs,
     ) -> Tuple[torch.Tensor, Dict]:
         """
         Args:
             logprob (`torch.Tensor`): The log probability generated by the policy model.
-            old_logprob (`torch.Tensor`): The log probability generated by the reference model.
-            action_mask (`torch.Tensor`): The action mask.
+            old_log_probs (`torch.Tensor`): The log probability generated by the reference model.
+            response_mask (`torch.Tensor`): The response mask.
             advantages (`torch.Tensor`): The advantages.
-            experiences (`DataProto`): The input experiences.
             kwargs (`Dict`): The step-level parameters for calculating the policy loss.

         Returns:
@@ -44,3 +38,10 @@ def default_args(cls) -> Dict:
         Returns:
             `Dict`: The default init arguments for the policy loss function.
         """
+
+    @property
+    def select_keys(self) -> List[str]:
+        """
+        Returns:
+            `List[str]`: The keys to select from input data.
+        """

trinity/algorithm/policy_loss_fn/ppo_policy_loss.py

Lines changed: 19 additions & 8 deletions
@@ -3,7 +3,7 @@
 Modified from https://github.com/volcengine/verl/blob/main/verl/trainer/ppo/core_algos.py
 """

-from typing import Any, Dict, Optional, Tuple
+from typing import Dict, List, Optional, Tuple

 import torch

@@ -33,23 +33,22 @@ def __init__(
     def __call__(
         self,
         logprob: torch.Tensor,
-        old_logprob: torch.Tensor,
-        action_mask: torch.Tensor,
+        old_log_probs: torch.Tensor,
+        response_mask: torch.Tensor,
         advantages: torch.Tensor,
-        experiences: Any,
         **kwargs,
     ) -> Tuple[torch.Tensor, Dict]:
-        negative_approx_kl = logprob - old_logprob
+        negative_approx_kl = logprob - old_log_probs
         ratio = torch.exp(negative_approx_kl)
-        ppo_kl = masked_mean(-negative_approx_kl, action_mask)
+        ppo_kl = masked_mean(-negative_approx_kl, response_mask)

         pg_losses = -advantages * ratio
         pg_losses2 = -advantages * torch.clamp(
             ratio, 1.0 - self.clip_range_low, 1.0 + self.clip_range_high  # type: ignore
         )

-        pg_loss = masked_mean(torch.max(pg_losses, pg_losses2), action_mask)
-        pg_clipfrac = masked_mean(torch.gt(pg_losses2, pg_losses).float(), action_mask)
+        pg_loss = masked_mean(torch.max(pg_losses, pg_losses2), response_mask)
+        pg_clipfrac = masked_mean(torch.gt(pg_losses2, pg_losses).float(), response_mask)
         metrics = {
             "pg_clipfrac": pg_clipfrac.detach().item(),
             "ppo_kl": ppo_kl.detach().item(),
@@ -62,3 +61,15 @@ def default_args(cls) -> Dict:
         return {
             "clip_range": 0.2,
         }
+
+    @property
+    def select_keys(self) -> List[str]:
+        return [
+            "responses",
+            "input_ids",
+            "attention_mask",
+            "position_ids",
+            "old_log_probs",
+            "advantages",
+            "response_mask",
+        ]
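As a quick sanity check on the renamed inputs, here is the clipped objective worked through on made-up numbers, with the clip range fixed at the 0.2 default and an all-ones mask so masked_mean reduces to a plain mean; illustrative only.

import torch

# One response of three tokens, advantage +1 everywhere, policy drifting upward.
logprob = torch.tensor([[-0.9, -1.0, -0.2]])
old_log_probs = torch.tensor([[-1.2, -1.0, -1.0]])
advantages = torch.ones(1, 3)

ratio = torch.exp(logprob - old_log_probs)          # ~[1.35, 1.00, 2.23]
clipped = torch.clamp(ratio, 1.0 - 0.2, 1.0 + 0.2)  # [1.20, 1.00, 1.20]
pg_losses = -advantages * ratio
pg_losses2 = -advantages * clipped
pg_loss = torch.max(pg_losses, pg_losses2).mean()   # ~-1.13: clipping caps the payoff of drifting
pg_clipfrac = torch.gt(pg_losses2, pg_losses).float().mean()  # ~0.67: two of three tokens clipped
print(pg_loss.item(), pg_clipfrac.item())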
