1 change: 0 additions & 1 deletion docs/sphinx_doc/source/conf.py
@@ -40,7 +40,6 @@

templates_path = ["_templates"]
exclude_patterns = ["build"]
autodoc_mock_imports = ["ray"]

autodoc_default_options = {
"members": True,
5 changes: 2 additions & 3 deletions docs/sphinx_doc/source/tutorial/example_dpo.md
@@ -40,13 +40,13 @@ Note that the dataset has the keys `prompt`, `chosen` and `rejected`. If not, pa

We use the configurations in [`dpo.yaml`](https://github.com/modelscope/Trinity-RFT/tree/main/examples/dpo_humanlike/dpo.yaml) and [`train_dpo.yaml`](https://github.com/modelscope/Trinity-RFT/tree/main/examples/dpo_humanlike/train_dpo.yaml) for this experiment. Some important setups are listed in the following:

We run the experiment in a train mode, as there is no Explorer. To enable this mode, we config `mode` to `train` and set `sync_method` to `offline`. The value of `sync_iteration_interval` can be set as same of the value of `save_freq`.
We run the experiment in train mode, as there is no Explorer. To enable this mode, set `mode` to `train` and `sync_method` to `checkpoint`. The value of `sync_iteration_interval` can be set to the same value as `save_interval`.

```yaml
# In dpo.yaml
mode: train
synchronizer:
sync_method: 'offline'
sync_method: 'checkpoint'
buffer:
train_dataset:
storage_type: file
@@ -63,7 +63,6 @@ trainer:
# In train_dpo.yaml
actor_rollout_ref:
actor:
alg_type: dpo
use_kl_loss: True
kl_loss_coef: 0.1 # value of beta in DPO
```
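
For reference, the two intervals are paired directly in the example configs; below is a minimal sketch of the relevant keys, with values taken from `examples/dpo_humanlike/dpo.yaml` in this change:

```yaml
# In dpo.yaml (relevant keys only)
synchronizer:
  sync_method: 'checkpoint'
  sync_iteration_interval: 30  # matches trainer.save_interval below
trainer:
  save_interval: 30
```
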
2 changes: 1 addition & 1 deletion docs/sphinx_doc/source/tutorial/example_reasoning_basic.md
@@ -42,7 +42,7 @@ We run the experiment in a synchronous mode where the Explorer and Trainer operate
```yaml
mode: both
synchronizer:
sync_method: 'online'
sync_method: 'nccl'
sync_iteration_interval: 2
```

28 changes: 8 additions & 20 deletions docs/sphinx_doc/source/tutorial/trinity_configs.md
@@ -15,17 +15,6 @@ monitor:
- `monitor.name`: The name of the experiment. It must be set manually.


## Monitor

```yaml
monitor:
project: "Trinity-RFT-countdown"
name: "qwen2.5-1.5B-countdown"
```

- `monitor.project`: The project name. It must be set manually.
- `monitor.name`: The name of the experiment. It must be set manually.

## Data

<!-- The `data` configuration specifies the data used for training. It includes the total number of epochs, the batch size, the path to the dataset, the default workflow type, the default reward function type, and the format configuration. -->
@@ -131,8 +120,6 @@ explorer:
enforce_eager: true
dtype: bfloat16
temperature: 1.0
top_p: 1.0
top_k: -1
seed: 42
logprobs: 0
repeat_times: 5
@@ -150,8 +137,6 @@
- `explorer.enforce_eager`: Whether to enforce eager mode. Default is `True`.
- `explorer.dtype`: The data type used in vLLM. Default is `bfloat16`.
- `explorer.temperature`: The temperature used in vLLM. Default is `1.0`.
- `explorer.top_p`: The top-p used in vLLM. Default is `1.0`.
- `explorer.top_k`: The top-k used in vLLM. Default is `-1`.
- `explorer.seed`: The seed used in vLLM. Default is `42`.
- `explorer.logprobs`: The logprobs used in vLLM. Default is `0`.
- `explorer.repeat_times`: The number of times to repeat each task, used for GRPO-like algorithms. Default is `5`.
@@ -164,12 +149,16 @@

```yaml
synchronizer:
sync_method: 'online'
sync_method: 'nccl'
sync_iteration_interval: 10
sync_timeout: 1200
```

- `synchronizer.sync_method`: The synchronization method, Support `online` and `offline`. Default is `online`.
- `synchronizer.sync_method`: The synchronization method between `trainer` and `explorer`. Supported values are `nccl` and `checkpoint`: with `nccl`, the model weights in `explorer` are synchronized from `trainer` through NCCL; with `checkpoint`, `explorer` loads the newest checkpoint saved by `trainer` and then updates its model weights (see the sketch after this list). Default is `nccl`.
- `synchronizer.sync_iteration_interval`: The interval between two synchronizations. Default is `10`. It should be set manually.
- `synchronizer.sync_timeout`: The timeout of the synchronization. Default is `1200`.
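
For example, a checkpoint-based setup would look like the following (a minimal sketch; the interval is illustrative and is typically set to the same value as `trainer.save_interval`):

```yaml
synchronizer:
  sync_method: 'checkpoint'    # explorer reloads the newest checkpoint saved by trainer
  sync_iteration_interval: 30
  sync_timeout: 1200
```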

## Trainer

@@ -180,13 +169,15 @@
trainer_config_path: 'examples/ppo_countdown/train_countdown.yaml'
sft_warmup_iteration: 0
eval_interval: 1000
save_interval: 100
```

- `trainer.trainer_type`: The backend of the trainer. Only `verl` is supported.
- `trainer.algorithm_type`: The type of the algorithm. Supported values are `ppo`, `grpo`, `opmd` and `dpo`.
- `trainer.trainer_config_path`: The path to the trainer configuration file. It must be set manually.
- `trainer.sft_warmup_iteration`: The number of iterations to warm up the model. Default is `0`.
- `trainer.eval_interval`: The interval between two evaluations. Default is `1000`.
- `trainer.save_interval`: The interval between two checkpoints. Default is `100`.

### veRL Trainer Configuration

@@ -249,7 +240,6 @@ actor_rollout_ref:
optimizer_offload: False
fsdp_size: -1
# --- below: opmd ---
alg_type: ppo # ppo / opmd / pairwise_opmd
tau: 0.000 # strength of regularization w.r.t. old / ref policy
opmd_baseline: mean # mean / logavgexp, applicable to opmd
use_uid: False # True / False, applicable to pairwise_opmd
@@ -403,7 +393,6 @@ trainer:
- `actor_rollout_ref.actor.kl_loss_coef`: The coefficient of the KL loss.
- `actor_rollout_ref.actor.kl_loss_type`: How to compute the KL loss; optional values are `kl`, `abs`, `mse` or `low_var_kl`.
- `actor_rollout_ref.actor.ulysses_sequence_parallel_size`: Ulysses sequence parallel size.
- `actor_rollout_ref.actor.alg_type`: Used for OPMD, optional value is `ppo`, `opmd` or `pairwise_opmd`.
- `actor_rollout_ref.actor.tau`: Strength of regularization w.r.t. the old / reference policy.
- `actor_rollout_ref.actor.opmd_baseline`: The baseline used by `opmd`, either `mean` or `logavgexp`.
- `actor_rollout_ref.actor.use_uid`: `True` or `False`; applicable to `pairwise_opmd`.
@@ -427,7 +416,6 @@
- `algorithm`: Training algorithm settings.

- `trainer.balance_batch`: Whether to balance batch size between GPUs during training.
- `trainer.save_freq`: Frequency of saving checkpoints.
- `trainer.resume_mode`: Resume mode for training. Supported values are `disable`, `auto` and `resume_path`.
- `trainer.resume_from_path`: Path to resume from.
- `trainer.critic_warmup`: The number of iterations to train the critic model before actual policy learning.
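
Putting these together, the `trainer` block of a veRL config might look like the sketch below (`nnodes` and `n_gpus_per_node` follow the example configs in this change; the remaining values are assumed defaults):

```yaml
trainer:
  balance_batch: True   # balance batch sizes across GPUs
  critic_warmup: 0      # iterations of critic-only training before policy learning
  resume_mode: auto     # disable / auto / resume_path
  nnodes: 1
  n_gpus_per_node: 2
```
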
6 changes: 3 additions & 3 deletions examples/dpo_humanlike/dpo.yaml
@@ -37,8 +37,6 @@ explorer:
enforce_eager: true
dtype: bfloat16
temperature: 1.0
top_p: 1.0
top_k: -1
seed: 42
logprobs: 0
repeat_times: 1 # NOTE
@@ -47,12 +45,14 @@
max_pending_requests: 32
max_waiting_steps: 4
synchronizer:
sync_method: 'offline'
sync_method: 'checkpoint'
sync_iteration_interval: 30
sync_timeout: 1200
trainer:
trainer_type: 'verl'
algorithm_type: dpo
trainer_config_path: 'examples/dpo_humanlike/train_dpo.yaml'
save_interval: 30
monitor:
cache_root_dir: ""
project: "dpo_example"
2 changes: 0 additions & 2 deletions examples/dpo_humanlike/train_dpo.yaml
@@ -23,7 +23,6 @@ actor_rollout_ref:
enable_gradient_checkpointing: True
use_remove_padding: False
actor:
alg_type: dpo
strategy: fsdp # This is for backward-compatibility
ppo_mini_batch_size: 32
# ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu
@@ -170,7 +169,6 @@ trainer:
val_generations_to_log_to_wandb: 0
nnodes: 1
n_gpus_per_node: 2
save_freq: 30
# auto: find the last ckpt to resume. If can't find, start from scratch
resume_mode: auto # or auto or resume_path if
test_freq: 5
6 changes: 3 additions & 3 deletions examples/grpo_alfworld/alfworld.yaml
@@ -31,8 +31,6 @@ explorer:
enforce_eager: true
dtype: bfloat16
temperature: 1.0
top_p: 1.0
top_k: -1
seed: 42
logprobs: 0
repeat_times: 8
@@ -43,12 +41,14 @@
gpu_memory_utilization: 0.7
enable_chunked_prefil: true
synchronizer:
sync_method: 'online'
sync_method: 'nccl'
sync_iteration_interval: 8
sync_timeout: 1200
trainer:
trainer_type: 'verl'
algorithm_type: ppo
trainer_config_path: 'examples/grpo_alfworld/train_alfworld.yaml'
save_interval: 10
monitor:
cache_root_dir: ""
project: "ALFWORLD"
1 change: 0 additions & 1 deletion examples/grpo_alfworld/train_alfworld.yaml
@@ -169,7 +169,6 @@ trainer:
val_generations_to_log_to_wandb: 0
nnodes: 1
n_gpus_per_node: 2
save_freq: 1
# auto: find the last ckpt to resume. If can't find, start from scratch
resume_mode: auto # or auto or resume_path if
test_freq: 100
6 changes: 3 additions & 3 deletions examples/grpo_gsm8k/gsm8k.yaml
@@ -51,8 +51,6 @@ explorer:
enforce_eager: true
dtype: bfloat16
temperature: 1.0
top_p: 1.0
top_k: -1
seed: 42
logprobs: 0
repeat_times: 8
@@ -61,14 +59,16 @@
max_pending_requests: 32
max_waiting_steps: 4
synchronizer:
sync_method: 'online'
sync_method: 'nccl'
sync_iteration_interval: 2
sync_timeout: 1200
trainer:
trainer_type: 'verl'
algorithm_type: ppo
trainer_config_path: 'examples/grpo_gsm8k/train_gsm8k.yaml'
sft_warmup_iteration: 0 # Set to integer to enable sft warmup
eval_interval: 50
save_interval: 100
# get_exp_strategy: 'LFU'
monitor:
cache_root_dir: ""
2 changes: 0 additions & 2 deletions examples/grpo_gsm8k/train_gsm8k.yaml
@@ -52,7 +52,6 @@ actor_rollout_ref:
optimizer_offload: False
fsdp_size: -1
# --- below: opmd ---
alg_type: ppo # ppo / opmd / pairwise_opmd
tau: 0.000 # strength of regularization w.r.t. old / ref policy
opmd_baseline: mean # mean / logavgexp, applicable to opmd
use_uid: False # True / False, applicable to pairwise_opmd
@@ -174,7 +173,6 @@ trainer:
val_generations_to_log_to_wandb: 0
nnodes: 1
n_gpus_per_node: 2
save_freq: 100
# auto: find the last ckpt to resume. If can't find, start from scratch
resume_mode: auto # or auto or resume_path if
test_freq: 5
6 changes: 3 additions & 3 deletions examples/grpo_math/math.yaml
@@ -37,8 +37,6 @@ explorer:
enforce_eager: true
dtype: bfloat16
temperature: 1.0
top_p: 1.0
top_k: -1
seed: 42
logprobs: 0
repeat_times: 8
@@ -47,14 +45,16 @@
max_pending_requests: 32
max_waiting_steps: 4
synchronizer:
sync_method: 'online'
sync_method: 'nccl'
sync_iteration_interval: 2
sync_timeout: 1200
trainer:
trainer_type: 'verl'
algorithm_type: ppo
trainer_config_path: 'examples/grpo_math/train_math.yaml'
sft_warmup_iteration: 0 # Set to integer to enable sft warmup
eval_interval: 10
save_interval: 100
monitor:
cache_root_dir: ""
project: grpo_math
2 changes: 0 additions & 2 deletions examples/grpo_math/train_math.yaml
@@ -51,7 +51,6 @@ actor_rollout_ref:
optimizer_offload: False
fsdp_size: -1
# --- below: opmd ---
alg_type: ppo # ppo / opmd / pairwise_opmd
tau: 0.000 # strength of regularization w.r.t. old / ref policy
opmd_baseline: mean # mean / logavgexp, applicable to opmd
use_uid: False # True / False, applicable to pairwise_opmd
@@ -166,7 +165,6 @@ trainer:
val_generations_to_log_to_wandb: 0
nnodes: 1
n_gpus_per_node: 2
save_freq: 100
# auto: find the last ckpt to resume. If can't find, start from scratch
resume_mode: auto # or auto or resume_path if
test_freq: 5
6 changes: 3 additions & 3 deletions examples/grpo_sciworld/sciworld.yaml
@@ -31,8 +31,6 @@ explorer:
enforce_eager: true
dtype: bfloat16
temperature: 1.0
top_p: 1.0
top_k: -1
seed: 42
logprobs: 0
repeat_times: 8
@@ -43,12 +41,14 @@
gpu_memory_utilization: 0.7
enable_chunked_prefil: true
synchronizer:
sync_method: 'online'
sync_method: 'nccl'
sync_iteration_interval: 8
sync_timeout: 1200
trainer:
trainer_type: 'verl'
algorithm_type: ppo
trainer_config_path: 'examples/grpo_sciworld/train_sciworld.yaml'
save_interval: 10
monitor:
cache_root_dir: ""
project: "sciworld"
1 change: 0 additions & 1 deletion examples/grpo_sciworld/train_sciworld.yaml
@@ -164,7 +164,6 @@ trainer:
val_generations_to_log_to_wandb: 0
nnodes: 1
n_gpus_per_node: 2
save_freq: 1
# auto: find the last ckpt to resume. If can't find, start from scratch
resume_mode: auto # or auto or resume_path if
test_freq: 100
1 change: 0 additions & 1 deletion examples/grpo_webshop/train_webshop.yaml
@@ -169,7 +169,6 @@ trainer:
val_generations_to_log_to_wandb: 0
nnodes: 1
n_gpus_per_node: 2
save_freq: 1
# auto: find the last ckpt to resume. If can't find, start from scratch
resume_mode: auto # or auto or resume_path if
test_freq: 100
6 changes: 3 additions & 3 deletions examples/grpo_webshop/webshop.yaml
@@ -31,8 +31,6 @@ explorer:
enforce_eager: true
dtype: bfloat16
temperature: 1.0
top_p: 1.0
top_k: -1
seed: 42
logprobs: 0
repeat_times: 8
@@ -43,12 +41,14 @@
gpu_memory_utilization: 0.7
enable_chunked_prefil: true
synchronizer:
sync_method: 'online'
sync_method: 'nccl'
sync_iteration_interval: 8
sync_timeout: 1200
trainer:
trainer_type: 'verl'
algorithm_type: ppo
trainer_config_path: 'examples/grpo_webshop/train_webshop.yaml'
save_interval: 10
monitor:
cache_root_dir: ""
project: "WEBSHOP"
6 changes: 3 additions & 3 deletions examples/opmd_gsm8k/opmd_gsm8k.yaml
@@ -30,8 +30,6 @@ explorer:
enforce_eager: true
dtype: bfloat16
temperature: 1.0
top_p: 1.0
top_k: -1
seed: 42
logprobs: 0
repeat_times: 8
@@ -40,13 +38,15 @@
max_pending_requests: 32
max_waiting_steps: 4
synchronizer:
sync_method: 'online'
sync_method: 'nccl'
sync_iteration_interval: 10
sync_timeout: 1200
trainer:
trainer_type: 'verl'
algorithm_type: opmd
trainer_config_path: 'examples/opmd_gsm8k/train_opmd_gsm8k.yaml'
sft_warmup_iteration: 0
save_interval: 100
monitor:
cache_root_dir: ""
project: "Trinity-RFT-gsm8k-test-opmd"