rename online to nccl

chenyushuo · chenyushuo · commit a0beb728e4d0 · 2025-04-28T16:09:21.000+08:00
rename `offline` to `checkpoint`
add `sync_timeout`
add `save_interval` in trainer config
delete `steps_per_epoch` and `reset_consumed`
diff --git a/docs/sphinx_doc/source/tutorial/example_dpo.md b/docs/sphinx_doc/source/tutorial/example_dpo.md
@@ -40,13 +40,13 @@ Note that the dataset has the keys `prompt`, `chosen` and `rejected`. If not, pa
 
 We use the configurations in [`dpo.yaml`](https://github.com/modelscope/Trinity-RFT/tree/main/examples/dpo_humanlike/dpo.yaml) and [`train_dpo.yaml`](https://github.com/modelscope/Trinity-RFT/tree/main/examples/dpo_humanlike/train_dpo.yaml) for this experiment. Some important setups are listed in the following:
 
-We run the experiment in a train mode, as there is no Explorer. To enable this mode, we config `mode` to `train` and set `sync_method` to `offline`. The value of `sync_iteration_interval` can be set as same of the value of `save_freq`.
+We run the experiment in train mode, as there is no Explorer. To enable this mode, we set `mode` to `train` and `sync_method` to `checkpoint`. The value of `sync_iteration_interval` can be set to the same value as `save_freq`.
 
 ```yaml
 # In dpo.yaml
 mode: train
 synchronizer:
-  sync_method: 'offline'
+  sync_method: 'checkpoint'
 buffer:
   train_dataset:
     storage_type: file
diff --git a/docs/sphinx_doc/source/tutorial/example_reasoning_basic.md b/docs/sphinx_doc/source/tutorial/example_reasoning_basic.md
@@ -42,7 +42,7 @@ We run the experiment in a synchronous mode where the Explorer and Trainer opera
 ```yaml
 mode: both
 synchronizer:
-  sync_method: 'online'
+  sync_method: 'nccl'
   sync_iteration_interval: 2
 ```
 
diff --git a/docs/sphinx_doc/source/tutorial/trinity_configs.md b/docs/sphinx_doc/source/tutorial/trinity_configs.md
@@ -164,12 +164,16 @@ explorer:
 
 ```yaml
 synchronizer:
-  sync_method: 'online'
+  sync_method: 'nccl'
   sync_iteration_interval: 10
+  sync_timeout: 1200
 ```
 
-- `synchronizer.sync_method`: The synchronization method, Support `online` and `offline`. Default is `online`.
+- `synchronizer.sync_method`: The synchronization method between `trainer` and `explorer`.
+Supports `nccl` and `checkpoint`. `nccl` means that the model weights in `explorer` are synchronized from `trainer` through NCCL;
+`checkpoint` means that `explorer` loads the newest checkpoint saved by `trainer` and then updates its model weights. Default is `nccl`.
 - `synchronizer.sync_iteration_interval`: The interval between two synchronizations. Default is `10`. It should be set manually.
+- `synchronizer.sync_timeout`: The timeout of the synchronization, in seconds. Default is `1200`.
 
 ## Trainer
 
diff --git a/examples/dpo_humanlike/dpo.yaml b/examples/dpo_humanlike/dpo.yaml
@@ -47,7 +47,7 @@ explorer:
   max_pending_requests: 32
   max_waiting_steps: 4
 synchronizer:
-  sync_method: 'offline'
+  sync_method: 'checkpoint'
   sync_iteration_interval: 30
 trainer:
   trainer_type: 'verl'
diff --git a/examples/grpo_alfworld/alfworld.yaml b/examples/grpo_alfworld/alfworld.yaml
@@ -43,7 +43,7 @@ explorer:
   gpu_memory_utilization: 0.7
   enable_chunked_prefil: true
 synchronizer:
-  sync_method: 'online'
+  sync_method: 'nccl'
   sync_iteration_interval: 8
 trainer:
   trainer_type: 'verl'
diff --git a/examples/grpo_gsm8k/gsm8k.yaml b/examples/grpo_gsm8k/gsm8k.yaml
@@ -61,7 +61,7 @@ explorer:
   max_pending_requests: 32
   max_waiting_steps: 4
 synchronizer:
-  sync_method: 'online'
+  sync_method: 'nccl'
   sync_iteration_interval: 2
 trainer:
   trainer_type: 'verl'
diff --git a/examples/grpo_math/math.yaml b/examples/grpo_math/math.yaml
@@ -47,7 +47,7 @@ explorer:
   max_pending_requests: 32
   max_waiting_steps: 4
 synchronizer:
-  sync_method: 'online'
+  sync_method: 'nccl'
   sync_iteration_interval: 2
 trainer:
   trainer_type: 'verl'
diff --git a/examples/grpo_sciworld/sciworld.yaml b/examples/grpo_sciworld/sciworld.yaml
@@ -43,7 +43,7 @@ explorer:
   gpu_memory_utilization: 0.7
   enable_chunked_prefil: true
 synchronizer:
-  sync_method: 'online'
+  sync_method: 'nccl'
   sync_iteration_interval: 8
 trainer:
   trainer_type: 'verl'
diff --git a/examples/grpo_webshop/webshop.yaml b/examples/grpo_webshop/webshop.yaml
@@ -43,7 +43,7 @@ explorer:
   gpu_memory_utilization: 0.7
   enable_chunked_prefil: true
 synchronizer:
-  sync_method: 'online'
+  sync_method: 'nccl'
   sync_iteration_interval: 8
 trainer:
   trainer_type: 'verl'
diff --git a/examples/opmd_gsm8k/opmd_gsm8k.yaml b/examples/opmd_gsm8k/opmd_gsm8k.yaml
@@ -40,7 +40,7 @@ explorer:
   max_pending_requests: 32
   max_waiting_steps: 4
 synchronizer:
-  sync_method: 'online'
+  sync_method: 'nccl'
   sync_iteration_interval: 10
 trainer:
   trainer_type: 'verl'
diff --git a/examples/ppo_countdown/countdown.yaml b/examples/ppo_countdown/countdown.yaml
@@ -43,7 +43,7 @@ explorer:
   max_pending_requests: 32
   max_waiting_steps: 4
 synchronizer:
-  sync_method: 'online'
+  sync_method: 'nccl'
   sync_iteration_interval: 10
 trainer:
   trainer_type: 'verl'
diff --git a/tests/common/tmp/template_config.yaml b/tests/common/tmp/template_config.yaml
@@ -44,6 +44,6 @@ monitor:
   project: unittest
   name: test
 synchronizer:
-  sync_method: offline
+  sync_method: checkpoint
   sync_iteration_interval: 10
   wait_for_checkpoint: false
diff --git a/tests/explorer/runner_pool_test.py b/tests/explorer/runner_pool_test.py
@@ -53,7 +53,8 @@ def init_process_group(
         world_size: int,
         group_name: str,
         backend: str = "nccl",
-        offline_update: bool = True,
+        timeout: int = 1200,
+        update_with_checkpoint: bool = True,
     ) -> None:
         pass
 
diff --git a/trinity/common/config.py b/trinity/common/config.py
@@ -6,7 +6,13 @@
 
 from omegaconf import OmegaConf
 
-from trinity.common.constants import AlgorithmType, MonitorType, PromptType, StorageType
+from trinity.common.constants import (
+    AlgorithmType,
+    MonitorType,
+    PromptType,
+    StorageType,
+    SyncMethod,
+)
 from trinity.utils.log import get_logger
 
 logger = get_logger(__name__)
@@ -116,7 +122,6 @@ class BufferConfig:
     max_retry_interval: int = 1
     tokenizer_path: Optional[str] = None
     pad_token_id: Optional[int] = None
-    reset_consumed: Optional[bool] = False
 
     train_dataset: Optional[DatasetConfig] = None
     sft_warmup_dataset: Optional[DatasetConfig] = None
@@ -176,6 +181,7 @@ class TrainerConfig:
     trainer_type: str = "verl"
     trainer_config_path: str = ""
     eval_interval: int = 100
+    save_interval: int = 0
     enable_preview: bool = True  # enable rollout preview in wandb
     trainer_config: Any = field(default_factory=dict)
 
@@ -204,11 +210,11 @@ class MonitorConfig:
 class SynchronizerConfig:
     """Configs for model weight synchronization"""
 
-    # only support `offline` for now
     # TODO: rename to "checkpoint", "nccl", "ipc"
-    sync_method: str = "offline"
+    sync_method: SyncMethod = SyncMethod.NCCL
     # sync weights every `sync_iteration_interval` iterations
     sync_iteration_interval: int = 1
+    sync_timeout: int = 1200
     # wait for the lastest checkpoint to be ready
     wait_for_checkpoint: bool = False
     master_address: Optional[str] = None
@@ -273,28 +279,11 @@ def _check_buffer(self) -> None:
 
     def check_and_update(self) -> None:
         """Check and update the config."""
-        if self.trainer.trainer_type == "verl":
-            if self.trainer.trainer_config:
-                from trinity.common.verl_config import veRLConfig
-
-                trainer_config_schema = OmegaConf.structured(veRLConfig)
-                trainer_config = OmegaConf.merge(trainer_config_schema, self.trainer.trainer_config)
-                self.trainer.trainer_config = OmegaConf.to_object(trainer_config)
-            else:
-                if os.path.isfile(self.trainer.trainer_config_path):
-                    from trinity.common.verl_config import load_config
-
-                    self.trainer.trainer_config = load_config(self.trainer.trainer_config_path)
-                else:
-                    raise ValueError(
-                        f"Invalid trainer config path: {self.trainer.trainer_config_path}"
-                    )
-        else:
-            raise ValueError(f"Invalid trainer type: {self.trainer_type}")
-
         # check mode
         if self.mode not in ["explore", "train", "both"]:
             raise ValueError(f"Invalid mode: {self.mode}")
+        if self.trainer.algorithm_type == AlgorithmType.DPO and self.mode == "both":
+            raise ValueError("DPO does not support `both` mode")
 
         # check model path
         if not os.path.isabs(self.model.model_path):
@@ -310,8 +299,8 @@ def check_and_update(self) -> None:
             self.explorer.engine_num * self.explorer.tensor_parallel_size
         )
         self.synchronizer.backend = self.explorer.backend
-        if self.synchronizer.sync_method == "online" and self.mode != "both":
-            raise ValueError("Online synchronization is only supported in both mode")
+        if self.synchronizer.sync_method == SyncMethod.NCCL and self.mode != "both":
+            raise ValueError("`nccl` synchronization is only supported in both mode.")
 
         # check eval_interval
         if self.trainer.eval_interval % self.synchronizer.sync_iteration_interval != 0:
@@ -342,6 +331,26 @@ def check_and_update(self) -> None:
         self._check_buffer()
         # check and update trainer
         if self.mode != "explore":
+            if self.trainer.trainer_type == "verl":
+                if self.trainer.trainer_config:
+                    from trinity.common.verl_config import veRLConfig
+
+                    trainer_config_schema = OmegaConf.structured(veRLConfig)
+                    trainer_config = OmegaConf.merge(
+                        trainer_config_schema, self.trainer.trainer_config
+                    )
+                    self.trainer.trainer_config = OmegaConf.to_object(trainer_config)
+                else:
+                    if os.path.isfile(self.trainer.trainer_config_path):
+                        from trinity.common.verl_config import load_config
+
+                        self.trainer.trainer_config = load_config(self.trainer.trainer_config_path)
+                    else:
+                        raise ValueError(
+                            f"Invalid trainer config path: {self.trainer.trainer_config_path}"
+                        )
+            else:
+                raise ValueError(f"Invalid trainer type: {self.trainer_type}")
             self.trainer.trainer_config.synchronize_config(self)
         else:
             self.trainer.trainer_config = None
diff --git a/trinity/common/constants.py b/trinity/common/constants.py
@@ -94,3 +94,10 @@ class MonitorType(CaseInsensitiveEnum):
 
     WANDB = "wandb"
     TENSORBOARD = "tensorboard"
+
+
+class SyncMethod(CaseInsensitiveEnum):
+    """Sync Method."""
+
+    NCCL = "nccl"
+    CHECKPOINT = "checkpoint"
diff --git a/trinity/common/models/model.py b/trinity/common/models/model.py
@@ -63,7 +63,8 @@ def init_process_group(
         world_size: int,
         group_name: str,
         backend: str = "nccl",
-        offline_update: bool = True,
+        timeout: int = 1200,
+        update_with_checkpoint: bool = True,
     ) -> None:
         """Init the process group for model weights sync."""
 
diff --git a/trinity/common/models/vllm_async_model.py b/trinity/common/models/vllm_async_model.py
@@ -263,7 +263,8 @@ def init_process_group(
         world_size: int,
         group_name: str,
         backend: str = "nccl",
-        offline_update: bool = True,
+        timeout: int = 1200,
+        update_with_checkpoint: bool = True,
     ):
         return self.async_llm.engine.model_executor.collective_rpc(
             "init_process_group",
@@ -274,7 +275,8 @@ def init_process_group(
                 world_size,
                 group_name,
                 backend,
-                offline_update,
+                timeout,
+                update_with_checkpoint,
             ),
         )
 
diff --git a/trinity/common/models/vllm_model.py b/trinity/common/models/vllm_model.py
@@ -89,7 +89,8 @@ def init_process_group(
         world_size: int,
         group_name: str,
         backend: str = "nccl",
-        offline_update: bool = True,
+        timeout: int = 1200,
+        update_with_checkpoint: bool = True,
     ):
         return self.llm.collective_rpc(
             "init_process_group",
@@ -100,7 +101,8 @@ def init_process_group(
                 world_size,
                 group_name,
                 backend,
-                offline_update,
+                timeout,
+                update_with_checkpoint,
             ),
         )
 
diff --git a/trinity/common/models/vllm_worker.py b/trinity/common/models/vllm_worker.py
@@ -1,5 +1,7 @@
 # -*- coding: utf-8 -*-
 """Custom vLLM Worker."""
+from datetime import timedelta
+
 import ray
 import torch
 import torch.distributed
@@ -26,20 +28,21 @@ def init_process_group(
         world_size: int,
         group_name: str,
         backend: str = "nccl",
-        offline_update: bool = True,
+        timeout: int = 1200,
+        update_with_checkpoint: bool = True,
     ):
         """Init torch process group for model weights update"""
         assert torch.distributed.is_initialized(), "default torch process group must be initialized"
         assert group_name != "", "group name must not be empty"
-        self._offline_update = offline_update
-        if self._offline_update:
+        self._update_with_checkpoint = update_with_checkpoint
+        if self._update_with_checkpoint:
             logger.info(
-                f"init_process_group (offline): address={master_address}:{master_port}, rank={torch.distributed.get_rank()}, rank_offset={rank_offset}, world_size={world_size}"
+                f"init_process_group (checkpoint): address={master_address}:{master_port}, rank={torch.distributed.get_rank()}, rank_offset={rank_offset}, world_size={world_size}"
             )
             self._weight_update_rank = torch.distributed.get_rank() + rank_offset
         else:
             logger.info(
-                f"init_process_group (online): rank={torch.distributed.get_rank()}, rank_offset={rank_offset}, world_size={world_size}"
+                f"init_process_group (nccl): rank={torch.distributed.get_rank()}, rank_offset={rank_offset}, world_size={world_size}"
             )
             self._weight_update_rank = torch.distributed.get_rank() + rank_offset
 
@@ -52,6 +55,7 @@ def init_process_group(
         self._model_update_group = init_process_group(
             backend=backend,
             init_method=init_method,
+            timeout=timedelta(seconds=timeout),
             world_size=world_size,
             rank=self._weight_update_rank,
             group_name=group_name,
diff --git a/trinity/common/verl_config.py b/trinity/common/verl_config.py
@@ -4,6 +4,7 @@
 from omegaconf import OmegaConf
 
 from trinity.common.config import BufferConfig, Config, SynchronizerConfig
+from trinity.common.constants import SyncMethod
 
 
 @dataclass
@@ -243,8 +244,7 @@ class Trainer:
     val_before_train: bool = False
     training_rollout_mode: str = "parallel"
     enable_exp_buffer: bool = True
-    steps_per_epoch: int = 1280
-    get_exp_strategy: Optional[str] = None
+    get_exp_strategy: Optional[str] = None  # TODO
     sync_freq: int = 0
     sft_warmup_iteration: int = 0
     max_actor_ckpt_to_keep: Optional[int] = None
@@ -280,10 +280,12 @@ def synchronize_config(self, config: Config) -> None:
             # for multi-node scenarios, some nodes for rollout, others for training
             self.trainer.n_gpus_per_node = config.cluster.gpu_per_node
         self.trainer.sync_freq = config.synchronizer.sync_iteration_interval
-        if config.synchronizer.sync_method == "offline":
+        if config.synchronizer.sync_method == SyncMethod.CHECKPOINT:
             self.trainer.save_freq = (
                 config.synchronizer.sync_iteration_interval
             )  # TODO: not proper for DPO
+        else:
+            self.trainer.save_freq = config.trainer.save_interval
         self.synchronizer = config.synchronizer
         self.actor_rollout_ref.synchronizer = config.synchronizer
         self.buffer = config.buffer
diff --git a/trinity/explorer/explorer.py b/trinity/explorer/explorer.py
diff --git a/trinity/manager/config_manager.py b/trinity/manager/config_manager.py
diff --git a/trinity/trainer/trainer.py b/trinity/trainer/trainer.py
diff --git a/trinity/trainer/verl/fsdp_workers.py b/trinity/trainer/verl/fsdp_workers.py
diff --git a/trinity/trainer/verl_trainer.py b/trinity/trainer/verl_trainer.py