chenyushuo
diff --git a/‎trinity/algorithm/algorithm.py‎
Lines changed: 127 additions & 0 deletions b/‎trinity/algorithm/algorithm.py‎
Lines changed: 127 additions & 0 deletions
diff --git a/‎trinity/algorithm/algorithm_manager.py‎
Lines changed: 22 additions & 0 deletions b/‎trinity/algorithm/algorithm_manager.py‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎trinity/common/config.py‎
Lines changed: 24 additions & 7 deletions b/‎trinity/common/config.py‎
Lines changed: 24 additions & 7 deletions
diff --git a/‎trinity/common/constants.py‎
Lines changed: 58 additions & 58 deletions b/‎trinity/common/constants.py‎
Lines changed: 58 additions & 58 deletions
@@ -0,0 +1,127 @@
+from abc import ABC, abstractmethod
+from typing import Dict
+
+from trinity.common.experience import Experience, Experiences
+from trinity.utils.registry import Registry
+
+ALGORITHM = Registry("algorithm")
+
+
+class Algorithm(ABC):
+    """Abstract base class for the algorithm descriptors defined in this module.
+
+    A concrete subclass assigns the four class-level flags below and
+    implements :meth:`get_default_config`. The flags are class attributes and
+    the methods are classmethods, so a descriptor can be used without
+    creating an instance.
+    """
+
+    # Declared without values here; every concrete subclass assigns them.
+    use_critic: bool
+    use_reference: bool
+    use_advantage: bool
+    can_balance_batch: bool
+
+    @classmethod
+    def gather_experience(cls, exps: list[Experience], pad_token_id: int = 0) -> Experiences:
+        """Combine ``exps`` into a single ``Experiences`` object.
+
+        Delegates to ``Experiences.gather_experiences``, forwarding
+        ``pad_token_id`` unchanged. Subclasses may override this to use a
+        different gathering routine.
+        """
+        return Experiences.gather_experiences(exps, pad_token_id)
+
+    # ``abstractmethod`` has to be the innermost decorator. Applying it on top
+    # of an already-built ``classmethod`` object fails, because that object's
+    # ``__isabstractmethod__`` attribute cannot be assigned.
+    @classmethod
+    @abstractmethod
+    def get_default_config(cls) -> Dict:
+        """Return the default settings of this algorithm as a dict."""
+        pass
+
+
+@ALGORITHM.register_module("sft")
+class SFTAlgorithm(Algorithm):
+    """Descriptor for the algorithm registered under the name ``"sft"``."""
+
+    # Class-level flags: critic, reference and advantage are all off;
+    # batch balancing is allowed.
+    use_critic: bool = False
+    use_reference: bool = False
+    use_advantage: bool = False
+    can_balance_batch: bool = True
+
+    @classmethod
+    def get_default_config(cls) -> Dict:
+        """Build and return a new dict holding the SFT default settings."""
+        defaults: Dict = {
+            "policy_loss_fn": "sft",
+            "kl_loss_fn": "none",
+            "entropy_loss_fn": "basic",
+        }
+        return defaults
+
+
+@ALGORITHM.register_module("ppo")
+class PPOAlgorithm(Algorithm):
+    """Descriptor for the algorithm registered under the name ``"ppo"``."""
+
+    # Class-level flags: critic, reference and advantage are all on;
+    # batch balancing is allowed.
+    use_critic: bool = True
+    use_reference: bool = True
+    use_advantage: bool = True
+    can_balance_batch: bool = True
+
+    @classmethod
+    def get_default_config(cls) -> Dict:
+        """Build and return a new dict holding the PPO default settings."""
+        defaults: Dict = {
+            "repeat_times": 1,
+            "policy_loss_fn": "ppo",
+            "advantage_fn": "ppo",
+            "kl_penalty_fn": "k3",
+            "kl_loss_fn": "k2",
+            "entropy_loss_fn": "basic",
+        }
+        return defaults
+
+
+@ALGORITHM.register_module("grpo")
+class GRPOAlgorithm(Algorithm):
+    """Descriptor for the algorithm registered under the name ``"grpo"``."""
+
+    # Class-level flags: no critic; reference and advantage are on;
+    # batch balancing is allowed.
+    use_critic: bool = False
+    use_reference: bool = True
+    use_advantage: bool = True
+    can_balance_batch: bool = True
+
+    @classmethod
+    def get_default_config(cls) -> Dict:
+        """Build and return a new dict holding the GRPO default settings."""
+        defaults: Dict = {
+            "repeat_times": 2,
+            "policy_loss_fn": "ppo",
+            "advantage_fn": "grpo",
+            "kl_penalty_fn": "k3",
+            "kl_loss_fn": "k2",
+            "entropy_loss_fn": "basic",
+        }
+        return defaults
+
+
+@ALGORITHM.register_module("opmd")
+class OPMDAlgorithm(Algorithm):
+    """Descriptor for the algorithm registered under the name ``"opmd"``."""
+
+    # Class-level flags: no critic; reference and advantage are on;
+    # batch balancing is allowed.
+    use_critic: bool = False
+    use_reference: bool = True
+    use_advantage: bool = True
+    can_balance_batch: bool = True
+
+    @classmethod
+    def get_default_config(cls) -> Dict:
+        """Build and return a new dict holding the OPMD default settings."""
+        defaults: Dict = {
+            "repeat_times": 2,
+            "policy_loss_fn": "opmd",
+            "advantage_fn": "opmd",
+            "kl_penalty_fn": "k3",
+            "kl_loss_fn": "k2",
+            "entropy_loss_fn": "basic",
+        }
+        return defaults
+
+
+@ALGORITHM.register_module("dpo")
+class DPOAlgorithm(Algorithm):
+    """Descriptor for the algorithm registered under the name ``"dpo"``."""
+
+    # Class-level flags: only the reference flag is on; batch balancing is
+    # not allowed.
+    use_critic: bool = False
+    use_reference: bool = True
+    use_advantage: bool = False
+    can_balance_batch: bool = False
+
+    @classmethod
+    def gather_experience(cls, exps: list[Experience], pad_token_id: int = 0) -> Experiences:
+        """Combine ``exps`` via ``Experiences.gather_dpo_experiences``.
+
+        Overrides the base-class version, which calls
+        ``Experiences.gather_experiences`` instead.
+        """
+        gathered = Experiences.gather_dpo_experiences(exps, pad_token_id)
+        return gathered
+
+    @classmethod
+    def get_default_config(cls) -> Dict:
+        """Build and return a new dict holding the DPO default settings."""
+        defaults: Dict = {
+            "repeat_times": 2,  # fake repeat times
+            "policy_loss_fn": "dpo",
+            "kl_loss_fn": "k2",
+            "entropy_loss_fn": "basic",
+        }
+        return defaults
@@ -0,0 +1,22 @@
+from typing import TYPE_CHECKING
+
+from trinity.algorithm.algorithm import ALGORITHM
+
+if TYPE_CHECKING:
+    # Needed for annotations only. ``trinity.common.config`` imports this
+    # module while it is loading, so importing it back eagerly from here
+    # would create a circular import.
+    from trinity.common.config import Config
+
+
+class AlgorithmManager:
+    """Select the algorithm config to use at a given training step.
+
+    Holds an SFT ``AlgorithmConfig`` built from the defaults of the
+    registered ``"sft"`` algorithm, and switches between it and
+    ``config.algorithm`` based on
+    ``config.buffer.trainer_input.sft_warmup_steps``.
+    """
+
+    def __init__(self, config: "Config"):
+        """Store ``config`` and prepare the SFT warmup algorithm config."""
+        # Deferred to call time so that importing this module never requires
+        # ``trinity.common.config`` to be fully initialised.
+        from trinity.common.config import AlgorithmConfig
+
+        self.config = config
+        sft_type = ALGORITHM.get("sft")
+        sft_default_config = sft_type.get_default_config()
+        self.sft_algorithm_config = AlgorithmConfig(
+            algorithm_type=sft_type,
+            **sft_default_config,
+        )
+
+    def get_current_algorithm_config(self, global_steps: int):
+        """Return the ``AlgorithmConfig`` that applies at ``global_steps``.
+
+        The SFT config is returned while ``global_steps`` is less than or
+        equal to ``sft_warmup_steps``; afterwards ``config.algorithm`` is
+        returned, so both branches yield the same kind of object.
+        """
+        if global_steps <= self.config.buffer.trainer_input.sft_warmup_steps:
+            return self.sft_algorithm_config
+        return self.config.algorithm
+
+    def need_save(self, global_steps: int):
+        """Return True exactly when ``global_steps`` equals ``sft_warmup_steps``."""
+        return global_steps == self.config.buffer.trainer_input.sft_warmup_steps
@@ -2,10 +2,12 @@
 """Configs for RFT."""
 import os
 from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Union
 
 from omegaconf import OmegaConf
 
+from trinity.algorithm.algorithm import ALGORITHM, Algorithm
+from trinity.algorithm.algorithm_manager import AlgorithmManager
 from trinity.common.constants import (
     AlgorithmType,
     MonitorType,
@@ -170,34 +172,43 @@ class InferenceModelConfig:
 class AlgorithmConfig:
     """Config for algorithm."""
 
-    algorithm_type: AlgorithmType = AlgorithmType.PPO
+    algorithm_type: Union[str, Algorithm] = "ppo"
     # for GRPO-like algorithms, repeat each task for `repeat_times` times
     repeat_times: int = 1
 
-    policy_loss_fn: str = "ppo"
+    policy_loss_fn: Optional[str] = None  # None: filled from the algorithm type's defaults
     # If not set, use PolicyLossFn.default_args()
     policy_loss_fn_args: Optional[dict] = None
 
-    advantage_fn: str = "ppo"
+    advantage_fn: Optional[str] = None  # None: filled from the algorithm type's defaults, if present
     # If not set, use AdvantageFn.default_args()
     advantage_fn_args: Optional[dict] = None
 
-    kl_penalty_fn: str = "none"  # set to "none" to disable kl penalty in reward
+    kl_penalty_fn: Optional[str] = None  # None: algorithm default; "none" disables kl penalty in reward
     # If not set, use kl_penalty_fn.default_args()
     kl_penalty_fn_args: Optional[dict] = None
 
-    kl_loss_fn: str = "k2"  # set to "none" to disable kl loss
+    kl_loss_fn: Optional[str] = None  # None: algorithm default; set to "none" to disable kl loss
     # If not set, use kl_loss_fn.default_args()
     kl_loss_fn_args: Optional[dict] = None
 
-    entropy_loss_fn: str = "basic"
+    entropy_loss_fn: Optional[str] = None  # None: filled from the algorithm type's defaults
     # If not set, use entropy_loss_fn.default_args()
     entropy_loss_fn_args: Optional[dict] = None
 
     # used for SFT warmup
     # TODO: move this to SFT warmup
     use_token_level_loss: bool = True
 
+    # do not set manually; assigned in `_check_algorithm`
+    algorithm_manager: Optional[AlgorithmManager] = None
+
+    def get_current_algorithm_config(self, global_steps: int):
+        return self.algorithm_manager.get_current_algorithm_config(global_steps)
+
+    def need_save(self, global_steps: int):
+        return self.algorithm_manager.need_save(global_steps)
+
 
 @dataclass
 class ClusterConfig:
@@ -492,6 +503,12 @@ def _check_algorithm(self) -> None:
             POLICY_LOSS_FN,
         )
 
+        self.algorithm.algorithm_manager = AlgorithmManager(self)
+        self.algorithm.algorithm_type = ALGORITHM.get(self.algorithm.algorithm_type)
+        for key, value in self.algorithm.algorithm_type.get_default_config().items():
+            if getattr(self.algorithm, key, None) is None:
+                setattr(self.algorithm, key, value)
+
         policy_fn_cls = POLICY_LOSS_FN.get(self.algorithm.policy_loss_fn)
         if policy_fn_cls is None:
             raise ValueError(f"Invalid policy_loss_fn: {self.algorithm.policy_loss_fn}")
 
@@ -62,64 +62,64 @@ class StorageType(CaseInsensitiveEnum):
     FILE = "file"
 
 
-class AlgorithmType(CaseInsensitiveEnum):
-    """Algorithm Type."""
-
-    SFT = "sft"
-    PPO = "ppo"
-    GRPO = "grpo"
-    OPMD = "opmd"
-    DPO = "dpo"
-
-    def is_rft(self) -> bool:
-        """Check if the algorithm is RFT."""
-        return self in [
-            AlgorithmType.PPO,
-            AlgorithmType.GRPO,
-            AlgorithmType.OPMD,
-        ]
-
-    def is_sft(self) -> bool:
-        """Check if the algorithm is SFT."""
-        return self == AlgorithmType.SFT
-
-    def is_dpo(self) -> bool:
-        """Check if the algorithm is DPO."""
-        return self == AlgorithmType.DPO
-
-    @property
-    def use_critic(self) -> bool:
-        """Check if the algorithm uses critic."""
-        return self == AlgorithmType.PPO
-
-    @property
-    def use_reference(self) -> bool:
-        """Check if the algorithm uses reference."""
-        return self in {
-            AlgorithmType.PPO,
-            AlgorithmType.GRPO,
-            AlgorithmType.OPMD,
-            AlgorithmType.DPO,
-        }
-
-    @property
-    def use_advantage(self) -> bool:
-        """Check if the algorithm uses advantage."""
-        return self in {
-            AlgorithmType.PPO,
-            AlgorithmType.GRPO,
-            AlgorithmType.OPMD,
-        }
-
-    @property
-    def can_balance_batch(self) -> bool:
-        """Check if the algorithm can balance batch."""
-        return self in {
-            AlgorithmType.SFT,
-            AlgorithmType.PPO,
-            AlgorithmType.GRPO,
-            AlgorithmType.OPMD,
-        }
+# class AlgorithmType(CaseInsensitiveEnum):
+#     """Algorithm Type."""
+
+#     SFT = "sft"
+#     PPO = "ppo"
+#     GRPO = "grpo"
+#     OPMD = "opmd"
+#     DPO = "dpo"
+
+#     def is_rft(self) -> bool:
+#         """Check if the algorithm is RFT."""
+#         return self in [
+#             AlgorithmType.PPO,
+#             AlgorithmType.GRPO,
+#             AlgorithmType.OPMD,
+#         ]
+
+#     def is_sft(self) -> bool:
+#         """Check if the algorithm is SFT."""
+#         return self == AlgorithmType.SFT
+
+#     def is_dpo(self) -> bool:
+#         """Check if the algorithm is DPO."""
+#         return self == AlgorithmType.DPO
+
+#     @property
+#     def use_critic(self) -> bool:
+#         """Check if the algorithm uses critic."""
+#         return self == AlgorithmType.PPO
+
+#     @property
+#     def use_reference(self) -> bool:
+#         """Check if the algorithm uses reference."""
+#         return self in {
+#             AlgorithmType.PPO,
+#             AlgorithmType.GRPO,
+#             AlgorithmType.OPMD,
+#             AlgorithmType.DPO,
+#         }
+
+#     @property
+#     def use_advantage(self) -> bool:
+#         """Check if the algorithm uses advantage."""
+#         return self in {
+#             AlgorithmType.PPO,
+#             AlgorithmType.GRPO,
+#             AlgorithmType.OPMD,
+#         }
+
+#     @property
+#     def can_balance_batch(self) -> bool:
+#         """Check if the algorithm can balance batch."""
+#         return self in {
+#             AlgorithmType.SFT,
+#             AlgorithmType.PPO,
+#             AlgorithmType.GRPO,
+#             AlgorithmType.OPMD,
+#         }
 
 
 class MonitorType(CaseInsensitiveEnum):