From 3dbeb9f5fe740a947df0434d506c73f8b9b322b4 Mon Sep 17 00:00:00 2001
From: chenyushuo <297086016@qq.com>
Date: Tue, 20 May 2025 15:18:50 +0800
Subject: [PATCH 1/3] mv `algorithm_type` from `trainer` to `global_config`

---
 .../sphinx_doc/source/tutorial/example_dpo.md |  2 +-
 .../source/tutorial/trinity_configs.md        |  4 +--
 examples/async_gsm8k/explorer.yaml            |  2 +-
 examples/async_gsm8k/trainer.yaml             |  2 +-
 examples/dpo_humanlike/dpo.yaml               |  2 +-
 examples/grpo_alfworld/alfworld.yaml          |  2 +-
 examples/grpo_gsm8k/gsm8k.yaml                |  2 +-
 examples/grpo_math/math.yaml                  |  2 +-
 examples/grpo_sciworld/sciworld.yaml          |  2 +-
 examples/grpo_webshop/webshop.yaml            |  2 +-
 examples/opmd_gsm8k/opmd_gsm8k.yaml           |  2 +-
 examples/ppo_countdown/countdown.yaml         |  2 +-
 trinity/cli/launcher.py                       |  4 +--
 trinity/common/config.py                      | 28 +++++++++----------
 trinity/common/verl_config.py                 |  6 ++--
 trinity/manager/config_manager.py             | 27 +++++++-----------
 16 files changed, 42 insertions(+), 49 deletions(-)

diff --git a/docs/sphinx_doc/source/tutorial/example_dpo.md b/docs/sphinx_doc/source/tutorial/example_dpo.md
index 5d274fbb47..8bbf3d9199 100644
--- a/docs/sphinx_doc/source/tutorial/example_dpo.md
+++ b/docs/sphinx_doc/source/tutorial/example_dpo.md
@@ -56,7 +56,7 @@ buffer:
   prompt_key:
   chosen_key:
   rejected_key:
-trainer:
+global_config:
   algorithm_type: dpo
 
 # In train_dpo.yaml
diff --git a/docs/sphinx_doc/source/tutorial/trinity_configs.md b/docs/sphinx_doc/source/tutorial/trinity_configs.md
index 430f872489..e02b0a252b 100644
--- a/docs/sphinx_doc/source/tutorial/trinity_configs.md
+++ b/docs/sphinx_doc/source/tutorial/trinity_configs.md
@@ -11,6 +11,7 @@ global_config:
   batch_size: 96
   eval_interval: 1000
   eval_on_latest_ckp: true
+  algorithm_type: ppo
 ```
 
 - `mode`: The mode of the experiment, chosen from `both`, `train`, `explore` or `bench`. `both` means both trainer and explorer are launched; `train` means only trainer is launched; `explore` means only explorer is launched; `bench` conducts benchmark evaluation. Default is `both`.
@@ -18,6 +19,7 @@ global_config:
 - `global_config.batch_size`: The batch size used for training. It should be checked manually.
 - `global_config.eval_interval`: The interval steps between two evaluations. Default is `1000`.
 - `global_config.eval_on_latest_ckp`: Whether to evaluate on only the latest checkpoint or all the checkpoints in the path. Only valid in `bench` mode. Default is `true`.
+- `global_config.algorithm_type`: The type of the algorithm, Support `ppo`, `grpo`, `opmd` and `dpo`.
 
 ## Monitor
 
@@ -192,7 +194,6 @@ Support `nccl` and `checkpoint`, `nccl` represents that model weights in `explor
 ```yaml
 trainer:
   trainer_type: 'verl'
-  algorithm_type: ppo
   trainer_config_path: 'examples/ppo_countdown/train_countdown.yaml'
   sft_warmup_steps: 0
   eval_interval: 1000
@@ -200,7 +201,6 @@ trainer:
 ```
 
 - `trainer.trainer_type`: The backend of the trainer, Only `verl` is supported.
-- `trainer.algorithm_type`: The type of the algorithm, Support `ppo`, `grpo`, `opmd` and `dpo`.
 - `trainer.trainer_config_path`: The path to the trainer configuration file. It must be set manually.
 - `trainer.sft_warmup_steps`: The number of steps to warm up the model. Default is `0`.
 - `trainer.eval_interval`: The interval steps between two evaluations. Default is `1000`.
diff --git a/examples/async_gsm8k/explorer.yaml b/examples/async_gsm8k/explorer.yaml
index 673da76a59..8402e4ced2 100644
--- a/examples/async_gsm8k/explorer.yaml
+++ b/examples/async_gsm8k/explorer.yaml
@@ -3,6 +3,7 @@ global_config:
   total_epochs: 20
   batch_size: 96
   eval_interval: 10
+  algorithm_type: grpo
 model:
   model_path: /PATH/TO/MODEL/
   max_prompt_tokens: 256
@@ -51,7 +52,6 @@ synchronizer:
   sync_iteration_interval: 10
 trainer:
   trainer_type: 'verl'
-  algorithm_type: grpo
   trainer_config_path: examples/async_gsm8k/verl_config.yaml
   sft_warmup_steps: 0 # Set to integer to enable sft warmup
 monitor:
diff --git a/examples/async_gsm8k/trainer.yaml b/examples/async_gsm8k/trainer.yaml
index df193c3f37..79a50337d2 100644
--- a/examples/async_gsm8k/trainer.yaml
+++ b/examples/async_gsm8k/trainer.yaml
@@ -3,6 +3,7 @@ global_config:
   total_epochs: 20
   batch_size: 96
   eval_interval: 10
+  algorithm_type: grpo
 model:
   model_path: /PATH/TO/MODEL/
   max_prompt_tokens: 256
@@ -50,7 +51,6 @@ synchronizer:
   sync_iteration_interval: 10
 trainer:
   trainer_type: 'verl'
-  algorithm_type: grpo
   trainer_config_path: examples/async_gsm8k/verl_config.yaml
   sft_warmup_steps: 0 # Set to integer to enable sft warmup
 monitor:
diff --git a/examples/dpo_humanlike/dpo.yaml b/examples/dpo_humanlike/dpo.yaml
index de459f9230..825788a792 100644
--- a/examples/dpo_humanlike/dpo.yaml
+++ b/examples/dpo_humanlike/dpo.yaml
@@ -2,6 +2,7 @@ mode: train
 global_config:
   total_epochs: 20
   batch_size: 32 # NOTE
+  algorithm_type: dpo
 model:
   model_path: '/PATH/TO/MODEL/CHECKPOINT/' # NOTE
   max_prompt_tokens: 1792
@@ -29,7 +30,6 @@ synchronizer:
   sync_timeout: 1200
 trainer:
   trainer_type: 'verl'
-  algorithm_type: dpo
   trainer_config_path: 'examples/dpo_humanlike/train_dpo.yaml'
   save_interval: 30
 monitor:
diff --git a/examples/grpo_alfworld/alfworld.yaml b/examples/grpo_alfworld/alfworld.yaml
index 18dc2595e6..08a9fd8e42 100644
--- a/examples/grpo_alfworld/alfworld.yaml
+++ b/examples/grpo_alfworld/alfworld.yaml
@@ -1,6 +1,7 @@
 global_config:
   total_epochs: 20
   batch_size: 4
+  algorithm_type: grpo
 model:
   model_path: '/PATH/TO/MODEL/CHECKPOINT/'
   max_prompt_tokens: 4096
@@ -50,7 +51,6 @@ synchronizer:
   sync_timeout: 1200
 trainer:
   trainer_type: 'verl'
-  algorithm_type: grpo
   trainer_config_path: 'examples/grpo_alfworld/train_alfworld.yaml'
   save_interval: 10
 monitor:
diff --git a/examples/grpo_gsm8k/gsm8k.yaml b/examples/grpo_gsm8k/gsm8k.yaml
index 748a5ac0e5..71630adaac 100644
--- a/examples/grpo_gsm8k/gsm8k.yaml
+++ b/examples/grpo_gsm8k/gsm8k.yaml
@@ -15,6 +15,7 @@ global_config:
   total_epochs: 1
   batch_size: 96
   eval_interval: 50
+  algorithm_type: grpo
 model:
   model_path: '/PATH/TO/MODEL/'
   max_prompt_tokens: 256
@@ -80,7 +81,6 @@ synchronizer:
   sync_timeout: 1200
 trainer:
   trainer_type: 'verl'
-  algorithm_type: grpo
   trainer_config_path: 'examples/grpo_gsm8k/train_gsm8k.yaml'
   sft_warmup_steps: 0 # Set to integer to enable sft warmup
   save_interval: 100
diff --git a/examples/grpo_math/math.yaml b/examples/grpo_math/math.yaml
index b22291b09a..a1527d9255 100644
--- a/examples/grpo_math/math.yaml
+++ b/examples/grpo_math/math.yaml
@@ -2,6 +2,7 @@ global_config:
   total_epochs: 20
   batch_size: 288
   eval_interval: 10
+  algorithm_type: grpo
 model:
   model_path: /PATH/TO/MODEL/
   max_prompt_tokens: 1024
@@ -50,7 +51,6 @@ synchronizer:
   sync_timeout: 1200
 trainer:
   trainer_type: 'verl'
-  algorithm_type: grpo
   trainer_config_path: 'examples/grpo_math/train_math.yaml'
   sft_warmup_steps: 0 # Set to integer to enable sft warmup
   save_interval: 100
diff --git a/examples/grpo_sciworld/sciworld.yaml b/examples/grpo_sciworld/sciworld.yaml
index 6ba88d51e2..350b37ba80 100644
--- a/examples/grpo_sciworld/sciworld.yaml
+++ b/examples/grpo_sciworld/sciworld.yaml
@@ -1,6 +1,7 @@
 global_config:
   total_epochs: 20
   batch_size: 4
+  algorithm_type: grpo
 model:
   model_path: '/PATH/TO/MODEL/CHECKPOINT/'
   max_prompt_tokens: 4096
@@ -50,7 +51,6 @@ synchronizer:
   sync_timeout: 1200
 trainer:
   trainer_type: 'verl'
-  algorithm_type: grpo
   trainer_config_path: 'examples/grpo_sciworld/train_sciworld.yaml'
   save_interval: 10
 monitor:
diff --git a/examples/grpo_webshop/webshop.yaml b/examples/grpo_webshop/webshop.yaml
index d5b59d67b0..0ae7563db2 100644
--- a/examples/grpo_webshop/webshop.yaml
+++ b/examples/grpo_webshop/webshop.yaml
@@ -1,6 +1,7 @@
 global_config:
   total_epochs: 20
   batch_size: 4
+  algorithm_type: grpo
 model:
   model_path: '/PATH/TO/MODEL/CHECKPOINT/'
   max_prompt_tokens: 4096
@@ -50,7 +51,6 @@ synchronizer:
   sync_timeout: 1200
 trainer:
   trainer_type: 'verl'
-  algorithm_type: grpo
   trainer_config_path: 'examples/grpo_webshop/train_webshop.yaml'
   save_interval: 10
 monitor:
diff --git a/examples/opmd_gsm8k/opmd_gsm8k.yaml b/examples/opmd_gsm8k/opmd_gsm8k.yaml
index 4739400f1a..7cc502eff2 100644
--- a/examples/opmd_gsm8k/opmd_gsm8k.yaml
+++ b/examples/opmd_gsm8k/opmd_gsm8k.yaml
@@ -1,6 +1,7 @@
 global_config:
   total_epochs: 1
   batch_size: 96
+  algorithm_type: opmd
 model:
   model_path: '{path to models}/Qwen2.5-1.5B-Inst'
   max_prompt_tokens: 256
@@ -49,7 +50,6 @@ synchronizer:
   sync_timeout: 1200
 trainer:
   trainer_type: 'verl'
-  algorithm_type: opmd
   trainer_config_path: 'examples/opmd_gsm8k/train_opmd_gsm8k.yaml'
   sft_warmup_steps: 0
   save_interval: 100
diff --git a/examples/ppo_countdown/countdown.yaml b/examples/ppo_countdown/countdown.yaml
index f7ad9c4362..c428a167b4 100644
--- a/examples/ppo_countdown/countdown.yaml
+++ b/examples/ppo_countdown/countdown.yaml
@@ -2,6 +2,7 @@ global_config:
   total_epochs: 20
   batch_size: 96
   eval_interval: 1000
+  algorithm_type: ppo
 model:
   model_path: '/PATH/TO/MODEL/CHECKPOINT/'
   max_prompt_tokens: 256
@@ -51,7 +52,6 @@ synchronizer:
   sync_timeout: 1200
 trainer:
   trainer_type: 'verl'
-  algorithm_type: ppo
   trainer_config_path: 'examples/ppo_countdown/train_countdown.yaml'
   sft_warmup_steps: 0
   save_interval: 100
diff --git a/trinity/cli/launcher.py b/trinity/cli/launcher.py
index 5dbf53abb4..ce36299e84 100644
--- a/trinity/cli/launcher.py
+++ b/trinity/cli/launcher.py
@@ -56,7 +56,7 @@ def train(config: Config) -> None:
             logger.info("SFT warmup finished.")
             break
 
-    algo_type = config.trainer.algorithm_type
+    algo_type = config.global_config.algorithm_type
     try:
         ray.get(trainer.train.remote(algo_type))
         logger.info("Train finished.")
@@ -100,7 +100,7 @@ def both(config: Config) -> None:
                 break
         ray.get([explorer.sync_weight.remote(), trainer.sync_weight.remote()])
 
-    algo_type = config.trainer.algorithm_type
+    algo_type = config.global_config.algorithm_type
     while True:
         try:
             ref_explore = explorer.explore_one_period.remote()
diff --git a/trinity/common/config.py b/trinity/common/config.py
index 2e8e830007..d311e57b3a 100644
--- a/trinity/common/config.py
+++ b/trinity/common/config.py
@@ -120,6 +120,7 @@ class GlobalConfig:
     batch_size: int = 1
     eval_interval: int = 100
     eval_on_latest_ckp: bool = True
+    algorithm_type: AlgorithmType = AlgorithmType.PPO
 
 
 @dataclass
@@ -227,7 +228,6 @@ class TrainerConfig:
     trainer_config: Any = field(default_factory=dict)
 
     # train algorithm
-    algorithm_type: AlgorithmType = AlgorithmType.PPO
     get_exp_strategy: Optional[str] = None
 
     # warmup config
@@ -309,7 +309,7 @@ def _check_interval(self) -> None:
         # check eval_interval
         if (
             self.mode != "bench"
-            and self.trainer.algorithm_type != AlgorithmType.DPO
+            and self.global_config.algorithm_type != AlgorithmType.DPO
             and self.global_config.eval_interval % self.synchronizer.sync_interval != 0
         ):
             self.global_config.eval_interval = (
@@ -322,12 +322,12 @@ def _check_interval(self) -> None:
         # check save_interval
         if (
             self.mode != "bench"
-            and self.trainer.algorithm_type != AlgorithmType.DPO
+            and self.global_config.algorithm_type != AlgorithmType.DPO
             and self.synchronizer.sync_method == SyncMethod.CHECKPOINT
         ):
             if self.trainer.save_interval != self.synchronizer.sync_interval:
                 logger.warning(
-                    f"When `trainer.algorithm_type` != `DPO` and `synchronizer.sync_method` == `checkpoint`, "
+                    f"When `global_config.algorithm_type` != `DPO` and `synchronizer.sync_method` == `checkpoint`, "
                     f"`trainer.save_interval` will be set to "
                     f"`synchronizer.sync_interval = {self.synchronizer.sync_interval}`."
                 )
@@ -390,20 +390,22 @@ def _check_buffer(self) -> None:  # noqa: C901
                 f"Auto set `buffer.trainer_input.experience_buffer` to {self.buffer.trainer_input.experience_buffer}"
             )
         elif self.mode == "train":  # TODO: to be check
-            if self.trainer.algorithm_type.is_dpo():
+            if self.global_config.algorithm_type.is_dpo():
                 if (
                     self.buffer.trainer_input.experience_buffer is None
                     or not self.buffer.trainer_input.experience_buffer.path
                 ):
                     raise ValueError(
-                        "`buffer.trainer_input.experience_buffer.path` is required when `trainer.algorithm_type == AlgorithmType.DPO`"
+                        "`buffer.trainer_input.experience_buffer.path` is required when `global_config.algorithm_type == AlgorithmType.DPO`"
                     )
-        if self.mode in ["both", "train"]:
-            self.buffer.trainer_input.experience_buffer.algorithm_type = self.trainer.algorithm_type
+        if self.buffer.trainer_input.experience_buffer is not None:
+            self.buffer.trainer_input.experience_buffer.algorithm_type = self.global_config.algorithm_type
 
         # set buffer.explorer_output
         if self.buffer.explorer_output is None:
             self.buffer.explorer_output = self.buffer.trainer_input.experience_buffer
+        else:
+            self.buffer.explorer_output.algorithm_type = self.global_config.algorithm_type
 
         # check trainer_input.sft_warmup_dataset
         if (
@@ -440,7 +442,7 @@ def check_and_update(self) -> None:  # noqa: C901
         # check mode
         if self.mode not in ["explore", "train", "both", "bench"]:
             raise ValueError(f"Invalid mode: {self.mode}")
-        if self.trainer.algorithm_type == AlgorithmType.DPO and self.mode == "both":
+        if self.global_config.algorithm_type == AlgorithmType.DPO and self.mode == "both":
             raise ValueError("DPO does not support `both` mode")
 
         # check model path
@@ -454,21 +456,19 @@ def check_and_update(self) -> None:  # noqa: C901
                 self.explorer.engine_num * self.explorer.tensor_parallel_size
             )
         self.synchronizer.backend = self.explorer.backend
-        if self.mode == "bench" and self.synchronizer.sync_method != SyncMethod.CHECKPOINT:
+        if self.mode in ["train", "explore", "bench"] and self.synchronizer.sync_method != SyncMethod.CHECKPOINT:
             self.synchronizer.sync_method = SyncMethod.CHECKPOINT
             logger.warning(
-                "Bench mode only supports checkpoint synchronization, set `synchronizer.sync_method` to `checkpoint`."
+                f"`{self.mode}` mode only supports checkpoint synchronization, set `synchronizer.sync_method` to `checkpoint`."
             )
         if (
-            self.trainer.algorithm_type == AlgorithmType.DPO
+            self.global_config.algorithm_type == AlgorithmType.DPO
             and self.synchronizer.sync_method != SyncMethod.CHECKPOINT
         ):
             self.synchronizer.sync_method = SyncMethod.CHECKPOINT
             logger.warning(
                 "DPO only supports checkpoint synchronization, set `synchronizer.sync_method` to `checkpoint`."
             )
 
-        if self.synchronizer.sync_method == SyncMethod.NCCL and self.mode != "both":
-            raise ValueError("`nccl` synchronization is only supported in both mode.")
 
         self._check_interval()
diff --git a/trinity/common/verl_config.py b/trinity/common/verl_config.py
index 944037ac2e..f0f323918b 100644
--- a/trinity/common/verl_config.py
+++ b/trinity/common/verl_config.py
@@ -310,11 +310,11 @@ def synchronize_config(self, config: Config) -> None:
         self.critic.ppo_mini_batch_size = config.global_config.batch_size
         self.critic.rollout_n = self.actor_rollout_ref.rollout.n
 
-        self.actor_rollout_ref.actor.algorithm_type = config.trainer.algorithm_type
-        if config.trainer.algorithm_type == AlgorithmType.PPO:
+        self.actor_rollout_ref.actor.algorithm_type = config.global_config.algorithm_type
+        if config.global_config.algorithm_type == AlgorithmType.PPO:
             logger.info("Using GAE `adv_estimator` for PPO")
             self.algorithm.adv_estimator = AdvantageEstimator.GAE.value
-        elif config.trainer.algorithm_type == AlgorithmType.GRPO:
+        elif config.global_config.algorithm_type == AlgorithmType.GRPO:
             logger.info("Using GRPO `adv_estimator` for GRPO")
             self.algorithm.adv_estimator = AdvantageEstimator.GRPO.value
 
diff --git a/trinity/manager/config_manager.py b/trinity/manager/config_manager.py
index 1f1e48d61b..f56b8eb363 100644
--- a/trinity/manager/config_manager.py
+++ b/trinity/manager/config_manager.py
@@ -66,6 +66,7 @@ def _init_default_config(self):
             "_train_batch_size_per_gpu": 16,
             "train_batch_size": 96,
             "eval_interval": 1000,
+            "algorithm_type": AlgorithmType.PPO.value,
             # Taskset Configs
             "taskset_path": "",
             "taskset_subset_name": None,
@@ -131,7 +132,6 @@ def _init_default_config(self):
             "sync_timeout": 1200,
             # Trainer Configs
             "trainer_type": "verl",
-            "algorithm_type": AlgorithmType.PPO.value,
             "sft_warmup_steps": 0,
             "_nccl_save_interval": 100,
             "save_interval": 100,
@@ -1600,20 +1600,7 @@ def generate_config(self):
         else:
             trainer_n_gpus_per_node = st.session_state["gpu_per_node"]
 
-        critic_model_path = (
-            st.session_state["critic_model_path"].strip()
-            if st.session_state["critic_model_path"].strip()
-            else st.session_state["model_path"]
-        )
-
-        if st.session_state["algorithm_type"] == AlgorithmType.DPO.value:
-            pass
-            # experience_buffer_path = (
-            #     st.session_state["experience_buffer_path"].strip()
-            #     if st.session_state["experience_buffer_path"].strip()
-            #     else st.session_state["dataset_path"].strip()
-            # )
-        else:  # not dpo algorithms
+        if st.session_state["algorithm_type"] != AlgorithmType.DPO.value:
             experience_buffer_path = st.session_state["experience_buffer_path"].strip()
             if (
                 not experience_buffer_path
@@ -1657,10 +1644,10 @@ def generate_config(self):
                 "total_epochs": st.session_state["total_epochs"],
                 "batch_size": st.session_state["train_batch_size"],
                 "eval_interval": st.session_state["eval_interval"],
+                "algorithm_type": st.session_state["algorithm_type"],
             },
             "model": {
                 "model_path": st.session_state["model_path"],
-                "critic_model_path": critic_model_path,
                 "max_prompt_tokens": st.session_state["max_prompt_tokens"],
                 "max_response_tokens": st.session_state["max_response_tokens"],
                 "checkpoint_path": st.session_state["checkpoint_path"],
@@ -1732,7 +1719,6 @@ def generate_config(self):
             },
             "trainer": {
                 "trainer_type": st.session_state["trainer_type"],
-                "algorithm_type": st.session_state["algorithm_type"],
                 "trainer_config": trainer_config,
                 "sft_warmup_steps": st.session_state["sft_warmup_steps"],
                 "save_interval": st.session_state["save_interval"],
@@ -1744,6 +1730,13 @@ def generate_config(self):
             },
         }
 
+        if st.session_state["adv_estimator"] == AdvantageEstimator.GAE.value:
+            config["model"]["critic_model_path"] = (
+                st.session_state["critic_model_path"].strip()
+                if st.session_state["critic_model_path"].strip()
+                else st.session_state["model_path"]
+            )
+
         for idx in range(st.session_state["_eval_tasksets_num"]):
             if st.session_state[f"eval_taskset_{idx}_path"].strip():
                 config["buffer"]["explorer_input"]["eval_tasksets"].append(

From 351fd2a0efb61651a542696125669b2793596e42 Mon Sep 17 00:00:00 2001
From: chenyushuo <297086016@qq.com>
Date: Tue, 20 May 2025 15:34:01 +0800
Subject: [PATCH 2/3] doc fix

---
 trinity/common/config.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/trinity/common/config.py b/trinity/common/config.py
index d311e57b3a..2508081cfb 100644
--- a/trinity/common/config.py
+++ b/trinity/common/config.py
@@ -399,7 +399,9 @@ def _check_buffer(self) -> None:  # noqa: C901
                         "`buffer.trainer_input.experience_buffer.path` is required when `global_config.algorithm_type == AlgorithmType.DPO`"
                     )
         if self.buffer.trainer_input.experience_buffer is not None:
-            self.buffer.trainer_input.experience_buffer.algorithm_type = self.global_config.algorithm_type
+            self.buffer.trainer_input.experience_buffer.algorithm_type = (
+                self.global_config.algorithm_type
+            )
 
         # set buffer.explorer_output
         if self.buffer.explorer_output is None:
@@ -456,7 +458,10 @@ def check_and_update(self) -> None:  # noqa: C901
                 self.explorer.engine_num * self.explorer.tensor_parallel_size
             )
         self.synchronizer.backend = self.explorer.backend
-        if self.mode in ["train", "explore", "bench"] and self.synchronizer.sync_method != SyncMethod.CHECKPOINT:
+        if (
+            self.mode in ["train", "explore", "bench"]
+            and self.synchronizer.sync_method != SyncMethod.CHECKPOINT
+        ):
             self.synchronizer.sync_method = SyncMethod.CHECKPOINT
             logger.warning(
                 f"`{self.mode}` mode only supports checkpoint synchronization, set `synchronizer.sync_method` to `checkpoint`."

From 891f62d678dcc33ab565ee8ca14be846f7382926 Mon Sep 17 00:00:00 2001
From: chenyushuo <297086016@qq.com>
Date: Tue, 20 May 2025 16:32:39 +0800
Subject: [PATCH 3/3] doc fix

---
 docs/sphinx_doc/source/tutorial/trinity_configs.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/sphinx_doc/source/tutorial/trinity_configs.md b/docs/sphinx_doc/source/tutorial/trinity_configs.md
index e02b0a252b..ec9372cc34 100644
--- a/docs/sphinx_doc/source/tutorial/trinity_configs.md
+++ b/docs/sphinx_doc/source/tutorial/trinity_configs.md
@@ -7,19 +7,19 @@ The following is the main config file for Trinity-RFT. Take `countdown.yaml` as
 ```yaml
 mode: both
 global_config:
+  algorithm_type: ppo
   total_epochs: 1
   batch_size: 96
   eval_interval: 1000
   eval_on_latest_ckp: true
-  algorithm_type: ppo
 ```
 
 - `mode`: The mode of the experiment, chosen from `both`, `train`, `explore` or `bench`. `both` means both trainer and explorer are launched; `train` means only trainer is launched; `explore` means only explorer is launched; `bench` conducts benchmark evaluation. Default is `both`.
+- `global_config.algorithm_type`: The type of the algorithm, Support `ppo`, `grpo`, `opmd` and `dpo`.
 - `global_config.total_epochs`: The total number of epochs. It should be checked manually.
 - `global_config.batch_size`: The batch size used for training. It should be checked manually.
 - `global_config.eval_interval`: The interval steps between two evaluations. Default is `1000`.
 - `global_config.eval_on_latest_ckp`: Whether to evaluate on only the latest checkpoint or all the checkpoints in the path. Only valid in `bench` mode. Default is `true`.
-- `global_config.algorithm_type`: The type of the algorithm, Support `ppo`, `grpo`, `opmd` and `dpo`.
 
 ## Monitor
 