From 3dbeb9f5fe740a947df0434d506c73f8b9b322b4 Mon Sep 17 00:00:00 2001
From: chenyushuo <297086016@qq.com>
Date: Tue, 20 May 2025 15:18:50 +0800
Subject: [PATCH 1/3] mv `algorithm_type` from `trainer` to `global_config`

---
 .../sphinx_doc/source/tutorial/example_dpo.md |  2 +-
 .../source/tutorial/trinity_configs.md        |  4 +--
 examples/async_gsm8k/explorer.yaml            |  2 +-
 examples/async_gsm8k/trainer.yaml             |  2 +-
 examples/dpo_humanlike/dpo.yaml               |  2 +-
 examples/grpo_alfworld/alfworld.yaml          |  2 +-
 examples/grpo_gsm8k/gsm8k.yaml                |  2 +-
 examples/grpo_math/math.yaml                  |  2 +-
 examples/grpo_sciworld/sciworld.yaml          |  2 +-
 examples/grpo_webshop/webshop.yaml            |  2 +-
 examples/opmd_gsm8k/opmd_gsm8k.yaml           |  2 +-
 examples/ppo_countdown/countdown.yaml         |  2 +-
 trinity/cli/launcher.py                       |  4 +--
 trinity/common/config.py                      | 28 +++++++++----------
 trinity/common/verl_config.py                 |  6 ++--
 trinity/manager/config_manager.py             | 27 +++++++-----------
 16 files changed, 42 insertions(+), 49 deletions(-)

diff --git a/docs/sphinx_doc/source/tutorial/example_dpo.md b/docs/sphinx_doc/source/tutorial/example_dpo.md
index 5d274fbb47..8bbf3d9199 100644
--- a/docs/sphinx_doc/source/tutorial/example_dpo.md
+++ b/docs/sphinx_doc/source/tutorial/example_dpo.md
@@ -56,7 +56,7 @@ buffer:
   prompt_key:
   chosen_key:
   rejected_key:
-trainer:
+global_config:
   algorithm_type: dpo
 
 # In train_dpo.yaml
diff --git a/docs/sphinx_doc/source/tutorial/trinity_configs.md b/docs/sphinx_doc/source/tutorial/trinity_configs.md
index 430f872489..e02b0a252b 100644
--- a/docs/sphinx_doc/source/tutorial/trinity_configs.md
+++ b/docs/sphinx_doc/source/tutorial/trinity_configs.md
@@ -11,6 +11,7 @@ global_config:
   batch_size: 96
   eval_interval: 1000
   eval_on_latest_ckp: true
+  algorithm_type: ppo
 ```
 
 - `mode`: The mode of the experiment, chosen from `both`, `train`, `explore` or `bench`. `both` means both trainer and explorer are launched; `train` means only trainer is launched; `explore` means only explorer is launched; `bench` conducts benchmark evaluation. Default is `both`.
@@ -18,6 +19,7 @@ global_config:
 - `global_config.batch_size`: The batch size used for training. It should be checked manually.
 - `global_config.eval_interval`: The interval steps between two evaluations. Default is `1000`.
 - `global_config.eval_on_latest_ckp`: Whether to evaluate on only the latest checkpoint or all the checkpoints in the path. Only valid in `bench` mode. Default is `true`.
+- `global_config.algorithm_type`: The type of the algorithm, Support `ppo`, `grpo`, `opmd` and `dpo`.
 
 ## Monitor
 
@@ -192,7 +194,6 @@ Support `nccl` and `checkpoint`, `nccl` represents that model weights in `explor
 ```yaml
 trainer:
   trainer_type: 'verl'
-  algorithm_type: ppo
   trainer_config_path: 'examples/ppo_countdown/train_countdown.yaml'
   sft_warmup_steps: 0
   eval_interval: 1000
@@ -200,7 +201,6 @@ trainer:
 ```
 
 - `trainer.trainer_type`: The backend of the trainer, Only `verl` is supported.
-- `trainer.algorithm_type`: The type of the algorithm, Support `ppo`, `grpo`, `opmd` and `dpo`.
 - `trainer.trainer_config_path`: The path to the trainer configuration file. It must be set manually.
 - `trainer.sft_warmup_steps`: The number of steps to warm up the model. Default is `0`.
 - `trainer.eval_interval`: The interval steps between two evaluations. Default is `1000`.
diff --git a/examples/async_gsm8k/explorer.yaml b/examples/async_gsm8k/explorer.yaml
index 673da76a59..8402e4ced2 100644
--- a/examples/async_gsm8k/explorer.yaml
+++ b/examples/async_gsm8k/explorer.yaml
@@ -3,6 +3,7 @@ global_config:
   total_epochs: 20
   batch_size: 96
   eval_interval: 10
+  algorithm_type: grpo
 model:
   model_path: /PATH/TO/MODEL/
   max_prompt_tokens: 256
@@ -51,7 +52,6 @@ synchronizer:
   sync_iteration_interval: 10
 trainer:
   trainer_type: 'verl'
-  algorithm_type: grpo
   trainer_config_path: examples/async_gsm8k/verl_config.yaml
   sft_warmup_steps: 0 # Set to integer to enable sft warmup
 monitor:
diff --git a/examples/async_gsm8k/trainer.yaml b/examples/async_gsm8k/trainer.yaml
index df193c3f37..79a50337d2 100644
--- a/examples/async_gsm8k/trainer.yaml
+++ b/examples/async_gsm8k/trainer.yaml
@@ -3,6 +3,7 @@ global_config:
   total_epochs: 20
   batch_size: 96
   eval_interval: 10
+  algorithm_type: grpo
 model:
   model_path: /PATH/TO/MODEL/
   max_prompt_tokens: 256
@@ -50,7 +51,6 @@ synchronizer:
   sync_iteration_interval: 10
 trainer:
   trainer_type: 'verl'
-  algorithm_type: grpo
   trainer_config_path: examples/async_gsm8k/verl_config.yaml
   sft_warmup_steps: 0 # Set to integer to enable sft warmup
 monitor:
diff --git a/examples/dpo_humanlike/dpo.yaml b/examples/dpo_humanlike/dpo.yaml
index de459f9230..825788a792 100644
--- a/examples/dpo_humanlike/dpo.yaml
+++ b/examples/dpo_humanlike/dpo.yaml
@@ -2,6 +2,7 @@ mode: train
 global_config:
   total_epochs: 20
   batch_size: 32 # NOTE
+  algorithm_type: dpo
 model:
   model_path: '/PATH/TO/MODEL/CHECKPOINT/' # NOTE
   max_prompt_tokens: 1792
@@ -29,7 +30,6 @@ synchronizer:
   sync_timeout: 1200
 trainer:
   trainer_type: 'verl'
-  algorithm_type: dpo
   trainer_config_path: 'examples/dpo_humanlike/train_dpo.yaml'
   save_interval: 30
 monitor:
diff --git a/examples/grpo_alfworld/alfworld.yaml b/examples/grpo_alfworld/alfworld.yaml
index 18dc2595e6..08a9fd8e42 100644
--- a/examples/grpo_alfworld/alfworld.yaml
+++ b/examples/grpo_alfworld/alfworld.yaml
@@ -1,6 +1,7 @@
 global_config:
   total_epochs: 20
   batch_size: 4
+  algorithm_type: grpo
 model:
   model_path: '/PATH/TO/MODEL/CHECKPOINT/'
   max_prompt_tokens: 4096
@@ -50,7 +51,6 @@ synchronizer:
   sync_timeout: 1200
 trainer:
   trainer_type: 'verl'
-  algorithm_type: grpo
   trainer_config_path: 'examples/grpo_alfworld/train_alfworld.yaml'
   save_interval: 10
 monitor:
diff --git a/examples/grpo_gsm8k/gsm8k.yaml b/examples/grpo_gsm8k/gsm8k.yaml
index 748a5ac0e5..71630adaac 100644
--- a/examples/grpo_gsm8k/gsm8k.yaml
+++ b/examples/grpo_gsm8k/gsm8k.yaml
@@ -15,6 +15,7 @@ global_config:
   total_epochs: 1
   batch_size: 96
   eval_interval: 50
+  algorithm_type: grpo
 model:
   model_path: '/PATH/TO/MODEL/'
   max_prompt_tokens: 256
@@ -80,7 +81,6 @@ synchronizer:
   sync_timeout: 1200
 trainer:
   trainer_type: 'verl'
-  algorithm_type: grpo
   trainer_config_path: 'examples/grpo_gsm8k/train_gsm8k.yaml'
   sft_warmup_steps: 0 # Set to integer to enable sft warmup
   save_interval: 100
diff --git a/examples/grpo_math/math.yaml b/examples/grpo_math/math.yaml
index b22291b09a..a1527d9255 100644
--- a/examples/grpo_math/math.yaml
+++ b/examples/grpo_math/math.yaml
@@ -2,6 +2,7 @@ global_config:
   total_epochs: 20
   batch_size: 288
   eval_interval: 10
+  algorithm_type: grpo
 model:
   model_path: /PATH/TO/MODEL/
   max_prompt_tokens: 1024
@@ -50,7 +51,6 @@ synchronizer:
   sync_timeout: 1200
 trainer:
   trainer_type: 'verl'
-  algorithm_type: grpo
   trainer_config_path: 'examples/grpo_math/train_math.yaml'
   sft_warmup_steps: 0 # Set to integer to enable sft warmup
   save_interval: 100
diff --git a/examples/grpo_sciworld/sciworld.yaml b/examples/grpo_sciworld/sciworld.yaml
index 6ba88d51e2..350b37ba80 100644
--- a/examples/grpo_sciworld/sciworld.yaml
+++ b/examples/grpo_sciworld/sciworld.yaml
@@ -1,6 +1,7 @@
 global_config:
   total_epochs: 20
   batch_size: 4
+  algorithm_type: grpo
 model:
   model_path: '/PATH/TO/MODEL/CHECKPOINT/'
   max_prompt_tokens: 4096
@@ -50,7 +51,6 @@ synchronizer:
   sync_timeout: 1200
 trainer:
   trainer_type: 'verl'
-  algorithm_type: grpo
   trainer_config_path: 'examples/grpo_sciworld/train_sciworld.yaml'
   save_interval: 10
 monitor:
diff --git a/examples/grpo_webshop/webshop.yaml b/examples/grpo_webshop/webshop.yaml
index d5b59d67b0..0ae7563db2 100644
--- a/examples/grpo_webshop/webshop.yaml
+++ b/examples/grpo_webshop/webshop.yaml
@@ -1,6 +1,7 @@
 global_config:
   total_epochs: 20
   batch_size: 4
+  algorithm_type: grpo
 model:
   model_path: '/PATH/TO/MODEL/CHECKPOINT/'
   max_prompt_tokens: 4096
@@ -50,7 +51,6 @@ synchronizer:
   sync_timeout: 1200
 trainer:
   trainer_type: 'verl'
-  algorithm_type: grpo
   trainer_config_path: 'examples/grpo_webshop/train_webshop.yaml'
   save_interval: 10
 monitor:
diff --git a/examples/opmd_gsm8k/opmd_gsm8k.yaml b/examples/opmd_gsm8k/opmd_gsm8k.yaml
index 4739400f1a..7cc502eff2 100644
--- a/examples/opmd_gsm8k/opmd_gsm8k.yaml
+++ b/examples/opmd_gsm8k/opmd_gsm8k.yaml
@@ -1,6 +1,7 @@
 global_config:
   total_epochs: 1
   batch_size: 96
+  algorithm_type: opmd
 model:
   model_path: '{path to models}/Qwen2.5-1.5B-Inst'
   max_prompt_tokens: 256
@@ -49,7 +50,6 @@ synchronizer:
   sync_timeout: 1200
 trainer:
   trainer_type: 'verl'
-  algorithm_type: opmd
   trainer_config_path: 'examples/opmd_gsm8k/train_opmd_gsm8k.yaml'
   sft_warmup_steps: 0
   save_interval: 100
diff --git a/examples/ppo_countdown/countdown.yaml b/examples/ppo_countdown/countdown.yaml
index f7ad9c4362..c428a167b4 100644
--- a/examples/ppo_countdown/countdown.yaml
+++ b/examples/ppo_countdown/countdown.yaml
@@ -2,6 +2,7 @@ global_config:
   total_epochs: 20
   batch_size: 96
   eval_interval: 1000
+  algorithm_type: ppo
 model:
   model_path: '/PATH/TO/MODEL/CHECKPOINT/'
   max_prompt_tokens: 256
@@ -51,7 +52,6 @@ synchronizer:
   sync_timeout: 1200
 trainer:
   trainer_type: 'verl'
-  algorithm_type: ppo
   trainer_config_path: 'examples/ppo_countdown/train_countdown.yaml'
   sft_warmup_steps: 0
   save_interval: 100
diff --git a/trinity/cli/launcher.py b/trinity/cli/launcher.py
index 5dbf53abb4..ce36299e84 100644
--- a/trinity/cli/launcher.py
+++ b/trinity/cli/launcher.py
@@ -56,7 +56,7 @@ def train(config: Config) -> None:
             logger.info("SFT warmup finished.")
             break
 
-    algo_type = config.trainer.algorithm_type
+    algo_type = config.global_config.algorithm_type
     try:
         ray.get(trainer.train.remote(algo_type))
         logger.info("Train finished.")
@@ -100,7 +100,7 @@ def both(config: Config) -> None:
                 break
         ray.get([explorer.sync_weight.remote(), trainer.sync_weight.remote()])
 
-    algo_type = config.trainer.algorithm_type
+    algo_type = config.global_config.algorithm_type
     while True:
         try:
             ref_explore = explorer.explore_one_period.remote()
diff --git a/trinity/common/config.py b/trinity/common/config.py
index 2e8e830007..d311e57b3a 100644
--- a/trinity/common/config.py
+++ b/trinity/common/config.py
@@ -120,6 +120,7 @@ class GlobalConfig:
     batch_size: int = 1
     eval_interval: int = 100
     eval_on_latest_ckp: bool = True
+    algorithm_type: AlgorithmType = AlgorithmType.PPO
 
 
 @dataclass
@@ -227,7 +228,6 @@ class TrainerConfig:
     trainer_config: Any = field(default_factory=dict)
 
     # train algorithm
-    algorithm_type: AlgorithmType = AlgorithmType.PPO
     get_exp_strategy: Optional[str] = None
 
     # warmup config
@@ -309,7 +309,7 @@ def _check_interval(self) -> None:
         # check eval_interval
         if (
             self.mode != "bench"
-            and self.trainer.algorithm_type != AlgorithmType.DPO
+            and self.global_config.algorithm_type != AlgorithmType.DPO
             and self.global_config.eval_interval % self.synchronizer.sync_interval != 0
         ):
             self.global_config.eval_interval = (
@@ -322,12 +322,12 @@ def _check_interval(self) -> None:
         # check save_interval
         if (
             self.mode != "bench"
-            and self.trainer.algorithm_type != AlgorithmType.DPO
+            and self.global_config.algorithm_type != AlgorithmType.DPO
             and self.synchronizer.sync_method == SyncMethod.CHECKPOINT
         ):
             if self.trainer.save_interval != self.synchronizer.sync_interval:
                 logger.warning(
-                    f"When `trainer.algorithm_type` != `DPO` and `synchronizer.sync_method` == `checkpoint`, "
+                    f"When `global_config.algorithm_type` != `DPO` and `synchronizer.sync_method` == `checkpoint`, "
                     f"`trainer.save_interval` will be set to "
                     f"`synchronizer.sync_interval = {self.synchronizer.sync_interval}`."
                 )
@@ -390,20 +390,22 @@ def _check_buffer(self) -> None:  # noqa: C901
                 f"Auto set `buffer.trainer_input.experience_buffer` to {self.buffer.trainer_input.experience_buffer}"
             )
         elif self.mode == "train":  # TODO: to be check
-            if self.trainer.algorithm_type.is_dpo():
+            if self.global_config.algorithm_type.is_dpo():
                 if (
                     self.buffer.trainer_input.experience_buffer is None
                     or not self.buffer.trainer_input.experience_buffer.path
                 ):
                     raise ValueError(
-                        "`buffer.trainer_input.experience_buffer.path` is required when `trainer.algorithm_type == AlgorithmType.DPO`"
+                        "`buffer.trainer_input.experience_buffer.path` is required when `global_config.algorithm_type == AlgorithmType.DPO`"
                     )
-        if self.mode in ["both", "train"]:
-            self.buffer.trainer_input.experience_buffer.algorithm_type = self.trainer.algorithm_type
+        if self.buffer.trainer_input.experience_buffer is not None:
+            self.buffer.trainer_input.experience_buffer.algorithm_type = self.global_config.algorithm_type
 
         # set buffer.explorer_output
         if self.buffer.explorer_output is None:
             self.buffer.explorer_output = self.buffer.trainer_input.experience_buffer
+        else:
+            self.buffer.explorer_output.algorithm_type = self.global_config.algorithm_type
 
         # check trainer_input.sft_warmup_dataset
         if (
@@ -440,7 +442,7 @@ def check_and_update(self) -> None:  # noqa: C901
         # check mode
         if self.mode not in ["explore", "train", "both", "bench"]:
             raise ValueError(f"Invalid mode: {self.mode}")
-        if self.trainer.algorithm_type == AlgorithmType.DPO and self.mode == "both":
+        if self.global_config.algorithm_type == AlgorithmType.DPO and self.mode == "both":
             raise ValueError("DPO does not support `both` mode")
 
         # check model path
@@ -454,21 +456,19 @@ def check_and_update(self) -> None:  # noqa: C901
                 self.explorer.engine_num * self.explorer.tensor_parallel_size
             )
         self.synchronizer.backend = self.explorer.backend
-        if self.mode == "bench" and self.synchronizer.sync_method != SyncMethod.CHECKPOINT:
+        if self.mode in ["train", "explore", "bench"] and self.synchronizer.sync_method != SyncMethod.CHECKPOINT:
             self.synchronizer.sync_method = SyncMethod.CHECKPOINT
             logger.warning(
-                "Bench mode only supports checkpoint synchronization, set `synchronizer.sync_method` to `checkpoint`."
+                f"`{self.mode}` mode only supports checkpoint synchronization, set `synchronizer.sync_method` to `checkpoint`."
             )
         if (
-            self.trainer.algorithm_type == AlgorithmType.DPO
+            self.global_config.algorithm_type == AlgorithmType.DPO
             and self.synchronizer.sync_method != SyncMethod.CHECKPOINT
         ):
             self.synchronizer.sync_method = SyncMethod.CHECKPOINT
             logger.warning(
                 "DPO only supports checkpoint synchronization, set `synchronizer.sync_method` to `checkpoint`."
             )
 
-        if self.synchronizer.sync_method == SyncMethod.NCCL and self.mode != "both":
-            raise ValueError("`nccl` synchronization is only supported in both mode.")
 
         self._check_interval()
diff --git a/trinity/common/verl_config.py b/trinity/common/verl_config.py
index 944037ac2e..f0f323918b 100644
--- a/trinity/common/verl_config.py
+++ b/trinity/common/verl_config.py
@@ -310,11 +310,11 @@ def synchronize_config(self, config: Config) -> None:
         self.critic.ppo_mini_batch_size = config.global_config.batch_size
         self.critic.rollout_n = self.actor_rollout_ref.rollout.n
 
-        self.actor_rollout_ref.actor.algorithm_type = config.trainer.algorithm_type
-        if config.trainer.algorithm_type == AlgorithmType.PPO:
+        self.actor_rollout_ref.actor.algorithm_type = config.global_config.algorithm_type
+        if config.global_config.algorithm_type == AlgorithmType.PPO:
             logger.info("Using GAE `adv_estimator` for PPO")
             self.algorithm.adv_estimator = AdvantageEstimator.GAE.value
-        elif config.trainer.algorithm_type == AlgorithmType.GRPO:
+        elif config.global_config.algorithm_type == AlgorithmType.GRPO:
             logger.info("Using GRPO `adv_estimator` for GRPO")
             self.algorithm.adv_estimator = AdvantageEstimator.GRPO.value
 
diff --git a/trinity/manager/config_manager.py b/trinity/manager/config_manager.py
index 1f1e48d61b..f56b8eb363 100644
--- a/trinity/manager/config_manager.py
+++ b/trinity/manager/config_manager.py
@@ -66,6 +66,7 @@ def _init_default_config(self):
             "_train_batch_size_per_gpu": 16,
             "train_batch_size": 96,
             "eval_interval": 1000,
+            "algorithm_type": AlgorithmType.PPO.value,
             # Taskset Configs
             "taskset_path": "",
             "taskset_subset_name": None,
@@ -131,7 +132,6 @@ def _init_default_config(self):
             "sync_timeout": 1200,
             # Trainer Configs
             "trainer_type": "verl",
-            "algorithm_type": AlgorithmType.PPO.value,
             "sft_warmup_steps": 0,
             "_nccl_save_interval": 100,
             "save_interval": 100,
@@ -1600,20 +1600,7 @@ def generate_config(self):
         else:
             trainer_n_gpus_per_node = st.session_state["gpu_per_node"]
 
-        critic_model_path = (
-            st.session_state["critic_model_path"].strip()
-            if st.session_state["critic_model_path"].strip()
-            else st.session_state["model_path"]
-        )
-
-        if st.session_state["algorithm_type"] == AlgorithmType.DPO.value:
-            pass
-            # experience_buffer_path = (
-            #     st.session_state["experience_buffer_path"].strip()
-            #     if st.session_state["experience_buffer_path"].strip()
-            #     else st.session_state["dataset_path"].strip()
-            # )
-        else:  # not dpo algorithms
+        if st.session_state["algorithm_type"] != AlgorithmType.DPO.value:
             experience_buffer_path = st.session_state["experience_buffer_path"].strip()
             if (
                 not experience_buffer_path
@@ -1657,10 +1644,10 @@ def generate_config(self):
                 "total_epochs": st.session_state["total_epochs"],
                 "batch_size": st.session_state["train_batch_size"],
                 "eval_interval": st.session_state["eval_interval"],
+                "algorithm_type": st.session_state["algorithm_type"],
             },
             "model": {
                 "model_path": st.session_state["model_path"],
-                "critic_model_path": critic_model_path,
                 "max_prompt_tokens": st.session_state["max_prompt_tokens"],
                 "max_response_tokens": st.session_state["max_response_tokens"],
                 "checkpoint_path": st.session_state["checkpoint_path"],
@@ -1732,7 +1719,6 @@ def generate_config(self):
             },
             "trainer": {
                 "trainer_type": st.session_state["trainer_type"],
-                "algorithm_type": st.session_state["algorithm_type"],
                 "trainer_config": trainer_config,
                 "sft_warmup_steps": st.session_state["sft_warmup_steps"],
                 "save_interval": st.session_state["save_interval"],
@@ -1744,6 +1730,13 @@ def generate_config(self):
             },
         }
 
+        if st.session_state["adv_estimator"] == AdvantageEstimator.GAE.value:
+            config["model"]["critic_model_path"] = (
+                st.session_state["critic_model_path"].strip()
+                if st.session_state["critic_model_path"].strip()
+                else st.session_state["model_path"]
+            )
+
         for idx in range(st.session_state["_eval_tasksets_num"]):
             if st.session_state[f"eval_taskset_{idx}_path"].strip():
                 config["buffer"]["explorer_input"]["eval_tasksets"].append(

From 351fd2a0efb61651a542696125669b2793596e42 Mon Sep 17 00:00:00 2001
From: chenyushuo <297086016@qq.com>
Date: Tue, 20 May 2025 15:34:01 +0800
Subject: [PATCH 2/3] doc fix

---
 trinity/common/config.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/trinity/common/config.py b/trinity/common/config.py
index d311e57b3a..2508081cfb 100644
--- a/trinity/common/config.py
+++ b/trinity/common/config.py
@@ -399,7 +399,9 @@ def _check_buffer(self) -> None:  # noqa: C901
                         "`buffer.trainer_input.experience_buffer.path` is required when `global_config.algorithm_type == AlgorithmType.DPO`"
                     )
         if self.buffer.trainer_input.experience_buffer is not None:
-            self.buffer.trainer_input.experience_buffer.algorithm_type = self.global_config.algorithm_type
+            self.buffer.trainer_input.experience_buffer.algorithm_type = (
+                self.global_config.algorithm_type
+            )
 
         # set buffer.explorer_output
         if self.buffer.explorer_output is None:
@@ -456,7 +458,10 @@ def check_and_update(self) -> None:  # noqa: C901
                 self.explorer.engine_num * self.explorer.tensor_parallel_size
             )
         self.synchronizer.backend = self.explorer.backend
-        if self.mode in ["train", "explore", "bench"] and self.synchronizer.sync_method != SyncMethod.CHECKPOINT:
+        if (
+            self.mode in ["train", "explore", "bench"]
+            and self.synchronizer.sync_method != SyncMethod.CHECKPOINT
+        ):
             self.synchronizer.sync_method = SyncMethod.CHECKPOINT
             logger.warning(
                 f"`{self.mode}` mode only supports checkpoint synchronization, set `synchronizer.sync_method` to `checkpoint`."

From 891f62d678dcc33ab565ee8ca14be846f7382926 Mon Sep 17 00:00:00 2001
From: chenyushuo <297086016@qq.com>
Date: Tue, 20 May 2025 16:32:39 +0800
Subject: [PATCH 3/3] doc fix

---
 docs/sphinx_doc/source/tutorial/trinity_configs.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/sphinx_doc/source/tutorial/trinity_configs.md b/docs/sphinx_doc/source/tutorial/trinity_configs.md
index e02b0a252b..ec9372cc34 100644
--- a/docs/sphinx_doc/source/tutorial/trinity_configs.md
+++ b/docs/sphinx_doc/source/tutorial/trinity_configs.md
@@ -7,19 +7,19 @@ The following is the main config file for Trinity-RFT. Take `countdown.yaml` as
 ```yaml
 mode: both
 global_config:
+  algorithm_type: ppo
   total_epochs: 1
   batch_size: 96
   eval_interval: 1000
   eval_on_latest_ckp: true
-  algorithm_type: ppo
 ```
 
 - `mode`: The mode of the experiment, chosen from `both`, `train`, `explore` or `bench`. `both` means both trainer and explorer are launched; `train` means only trainer is launched; `explore` means only explorer is launched; `bench` conducts benchmark evaluation. Default is `both`.
+- `global_config.algorithm_type`: The type of the algorithm, Support `ppo`, `grpo`, `opmd` and `dpo`.
 - `global_config.total_epochs`: The total number of epochs. It should be checked manually.
 - `global_config.batch_size`: The batch size used for training. It should be checked manually.
 - `global_config.eval_interval`: The interval steps between two evaluations. Default is `1000`.
 - `global_config.eval_on_latest_ckp`: Whether to evaluate on only the latest checkpoint or all the checkpoints in the path. Only valid in `bench` mode. Default is `true`.
-- `global_config.algorithm_type`: The type of the algorithm, Support `ppo`, `grpo`, `opmd` and `dpo`.
 
 ## Monitor
 