
Commit cf7cbad

Merge branch 'main' into dev/async

2 parents: 241d8bd + 50aba82


65 files changed: +1129 −395 lines

.github/workflows/docker/docker-compose.yaml

4 additions, 2 deletions

````diff
@@ -8,7 +8,8 @@ services:
       - RAY_ADDRESS=auto
       - CHECKPOINT_ROOT_DIR=/mnt/checkpoints
       - DATA_ROOT_DIR=/mnt/data
-      - MODEL_PATH=/mnt/checkpoints/Qwen2.5-1.5B-Instruct
+      - MODEL_PATH=/mnt/models/Qwen3-1.7B
+      - CHECKPOINT_PATH=/mnt/checkpoints
     working_dir: /workspace
     networks:
       - trinity-network
@@ -32,7 +33,8 @@ services:
       - HF_ENDPOINT=https://hf-mirror.com
       - CHECKPOINT_ROOT_DIR=/mnt/checkpoints
       - DATA_ROOT_DIR=/mnt/data
-      - MODEL_PATH=/mnt/checkpoints/Qwen2.5-1.5B-Instruct
+      - MODEL_PATH=/mnt/models/Qwen3-1.7B
+      - CHECKPOINT_PATH=/mnt/checkpoints
     working_dir: /workspace
     volumes:
       - trinity-volume:/mnt
````
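The net effect on each service is easy to miss in the delta: the model now lives under `/mnt/models` while `CHECKPOINT_ROOT_DIR` stays in place and a new `CHECKPOINT_PATH` points at the checkpoint mount. Below is a rough sketch of one service's resulting environment block; the service name `trinity-node-1` is only assumed here (borrowed from the unittest workflow), and other keys are elided.

```yaml
# Sketch of the post-change environment block, not the full compose file.
# Service name is assumed from the unittest workflow; other settings elided.
services:
  trinity-node-1:
    environment:
      - RAY_ADDRESS=auto
      - CHECKPOINT_ROOT_DIR=/mnt/checkpoints
      - DATA_ROOT_DIR=/mnt/data
      - MODEL_PATH=/mnt/models/Qwen3-1.7B   # model weights moved under /mnt/models
      - CHECKPOINT_PATH=/mnt/checkpoints    # new: explicit checkpoint location
    working_dir: /workspace
```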

.github/workflows/unittest.yaml

1 addition, 1 deletion

````diff
@@ -36,7 +36,7 @@ jobs:
       - name: Run unittest
         working-directory: trinity-${{ github.run_id }}/.github/workflows/docker
         run: |
-          docker compose exec trinity-node-1 pytest tests --ignore=tests/data --ctrf report.json
+          docker compose exec trinity-node-1 pytest tests -v -s --ignore=tests/data --ctrf report.json

       - name: Upload test results
         uses: actions/upload-artifact@v4
````

.gitignore

3 additions, 0 deletions

````diff
@@ -94,3 +94,6 @@ modules.rst

 # wandb
 wandb/
+
+# checkpoints
+checkpoints/
````

docs/sphinx_doc/source/conf.py

0 additions, 1 deletion

````diff
@@ -40,7 +40,6 @@

 templates_path = ["_templates"]
 exclude_patterns = ["build"]
-autodoc_mock_imports = ["ray"]

 autodoc_default_options = {
     "members": True,
````

docs/sphinx_doc/source/tutorial/example_data_functionalities.md

1 addition, 1 deletion

````diff
@@ -244,7 +244,7 @@ You can set more config items for this OP (e.g. notification when annotation is

 When you start running with the RFT config, the data module will start the OP `human_preference_annotation_mapper`, and then you can find a new project on the "Projects" page of the label-studio server.

-![]("../../assets/data-projects.png")
+![](../../assets/data-projects.png)

 You can click and enter into this project, and all the samples that need to be annotated are listed on the page.

````

docs/sphinx_doc/source/tutorial/example_dpo.md

2 additions, 3 deletions

````diff
@@ -40,13 +40,13 @@ Note that the dataset has the keys `prompt`, `chosen` and `rejected`. If not, pa

 We use the configurations in [`dpo.yaml`](https://github.com/modelscope/Trinity-RFT/tree/main/examples/dpo_humanlike/dpo.yaml) and [`train_dpo.yaml`](https://github.com/modelscope/Trinity-RFT/tree/main/examples/dpo_humanlike/train_dpo.yaml) for this experiment. Some important setups are listed in the following:

-We run the experiment in a train mode, as there is no Explorer. To enable this mode, we config `mode` to `train` and set `sync_method` to `offline`. The value of `sync_iteration_interval` can be set as same of the value of `save_freq`.
+We run the experiment in a train mode, as there is no Explorer. To enable this mode, we config `mode` to `train` and set `sync_method` to `checkpoint`. The value of `sync_iteration_interval` can be set as same of the value of `save_interval`.

 ```yaml
 # In dpo.yaml
 mode: train
 synchronizer:
-  sync_method: 'offline'
+  sync_method: 'checkpoint'
 buffer:
   train_dataset:
     storage_type: file
@@ -63,7 +63,6 @@ trainer:
 # In train_dpo.yaml
 actor_rollout_ref:
   actor:
-    alg_type: dpo
     use_kl_loss: True
     kl_loss_coef: 0.1 # value of beta in DPO
 ```
````
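This documentation edit pairs with the config changes elsewhere in the commit: in train mode there is no Explorer, so the trainer's saved checkpoints are the only channel for weight updates, and `sync_iteration_interval` should line up with how often they are written. A minimal sketch of that pairing, using the 30-step values from `examples/dpo_humanlike/dpo.yaml` in this same commit (other keys elided):

```yaml
# Sketch only: the keys relevant to checkpoint-based sync in train mode.
# Values taken from examples/dpo_humanlike/dpo.yaml in this commit.
mode: train
synchronizer:
  sync_method: 'checkpoint'     # reload trainer checkpoints instead of syncing over nccl
  sync_iteration_interval: 30   # keep equal to trainer.save_interval
  sync_timeout: 1200
trainer:
  algorithm_type: dpo
  save_interval: 30             # checkpoints written every 30 iterations
```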

docs/sphinx_doc/source/tutorial/example_multi_turn.md

2 additions, 2 deletions

````diff
@@ -122,5 +122,5 @@ and include them in the init files in `trinity/common/workflows/__init__.py`

 Then you are all set! It should be pretty simple😄, and both environments converge.

-![]("../../assets/alfworld_reward_curve.png")
-![]("../../assets/webshop_reward_curve.png")
+![](../../assets/alfworld_reward_curve.png)
+![](../../assets/webshop_reward_curve.png)
````

docs/sphinx_doc/source/tutorial/example_reasoning_basic.md

1 addition, 1 deletion

````diff
@@ -42,7 +42,7 @@ We run the experiment in a synchronous mode where the Explorer and Trainer opera
 ```yaml
 mode: both
 synchronizer:
-  sync_method: 'online'
+  sync_method: 'nccl'
   sync_iteration_interval: 2
 ```

````

docs/sphinx_doc/source/tutorial/trinity_configs.md

8 additions, 20 deletions

````diff
@@ -15,17 +15,6 @@ monitor:
 - `monitor.name`: The name of the experiment. It must be set manually.


-## Monitor
-
-```yaml
-monitor:
-  project: "Trinity-RFT-countdown"
-  name: "qwen2.5-1.5B-countdown"
-```
-
-- `monitor.project`: The project name. It must be set manually.
-- `monitor.name`: The name of the experiment. It must be set manually.
-
 ## Data

 <!-- The `data` configuration specifies the data used for training. It includes the total number of epochs, the batch size, the path to the dataset, the default workflow type, the default reward function type, and the format configuration. -->
@@ -131,8 +120,6 @@ explorer:
   enforce_eager: true
   dtype: bfloat16
   temperature: 1.0
-  top_p: 1.0
-  top_k: -1
   seed: 42
   logprobs: 0
   repeat_times: 5
@@ -150,8 +137,6 @@ explorer:
 - `explorer.enforce_eager`: Whether to enforce eager mode. Default is `True`.
 - `explorer.dtype`: The data type used in vLLM. Default is `bfloat16`.
 - `explorer.temperature`: The temperature used in vLLM. Default is `1.0`.
-- `explorer.top_p`: The top-p used in vLLM. Default is `1.0`.
-- `explorer.top_k`: The top-k used in vLLM. Default is `-1`.
 - `explorer.seed`: The seed used in vLLM. Default is `42`.
 - `explorer.logprobs`: The logprobs used in vLLM. Default is `0`.
 - `explorer.repeat_times`: The number of times to repeat each task, used for GRPO-like algorithms. Default is `5`.
@@ -164,12 +149,16 @@ explorer:

 ```yaml
 synchronizer:
-  sync_method: 'online'
+  sync_method: 'nccl'
   sync_iteration_interval: 10
+  sync_timeout: 1200
 ```

-- `synchronizer.sync_method`: The synchronization method, Support `online` and `offline`. Default is `online`.
+- `synchronizer.sync_method`: The synchronization method between `trainer` and `explorer`.
+Support `nccl` and `checkpoint`, `nccl` represents that model weights in `explorer` will be synchronized from `trainer` through `nccl`,
+`checkpoint` represents that `explorer` will load the newest checkpoints saved by `trainer` then update its model weights. Default is `nccl`.
 - `synchronizer.sync_iteration_interval`: The interval between two synchronizations. Default is `10`. It should be set manually.
+- `synchronizer.sync_timeout`: The timeout of the synchronization. Default is `1200`.

 ## Trainer

@@ -180,13 +169,15 @@ trainer:
   trainer_config_path: 'examples/ppo_countdown/train_countdown.yaml'
   sft_warmup_iteration: 0
   eval_interval: 1000
+  save_interval: 100
 ```

 - `trainer.trainer_type`: The backend of the trainer, Only `verl` is supported.
 - `trainer.algorithm_type`: The type of the algorithm, Support `ppo`, `grpo`, `opmd` and `dpo`.
 - `trainer.trainer_config_path`: The path to the trainer configuration file. It must be set manually.
 - `trainer.sft_warmup_iteration`: The number of iterations to warm up the model. Default is `0`.
 - `trainer.eval_interval`: The interval between two evaluations. Default is `1000`.
+- `trainer.save_interval`: The interval between two checkpoints. Default is `100`.

 ### veRL Trainer Configuration

@@ -249,7 +240,6 @@ actor_rollout_ref:
     optimizer_offload: False
     fsdp_size: -1
     # --- below: opmd ---
-    alg_type: ppo # ppo / opmd / pairwise_opmd
     tau: 0.000 # strength of regularization w.r.t. old / ref policy
     opmd_baseline: mean # mean / logavgexp, applicable to opmd
     use_uid: False # True / False, applicable to pairwise_opmd
@@ -403,7 +393,6 @@ trainer:
 - `actor_rollout_ref.actor.kl_loss_coef`: The coefficient of kl loss.
 - `actor_rollout_ref.actor.kl_loss_type`: How to compute kl loss, optional value is `kl`, `abs`, `mse` or `low_var_kl`.
 - `actor_rollout_ref.actor.ulysses_sequence_parallel_size`: Ulysses sequence parallel size.
-- `actor_rollout_ref.actor.alg_type`: Used for OPMD, optional value is `ppo`, `opmd` or `pairwise_opmd`.
 - `actor_rollout_ref.actor.tau`: strength of regularization w.r.t. old / ref policy.
 - `actor_rollout_ref.actor.opmd_baseline`: mean / logavgexp, applicable to opmd.
 - `actor_rollout_ref.actor.use_uid`: True / False, applicable to pairwise_opmd.
@@ -427,7 +416,6 @@ trainer:
 - `algorithm`: Training algorithm settings.

 - `trainer.balance_batch`: Whether to balance batch size between GPUs during training.
-- `trainer.save_freq`: Frequency of saving checkpoints.
 - `trainer.resume_mode`: Resume mode for training. Support `disable`, `auto` and `resume_path`.
 - `trainer.resume_from_path`: Path to resume from.
 - `trainer.critic_warmup`: The number of iteration to train the critic model before actual policy learning.
````
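Taken together, these documentation edits track a rename of the synchronizer options (`online` → `nccl`, `offline` → `checkpoint`), a new `sync_timeout` setting, and a `trainer.save_interval` that appears to take over checkpointing from the now-removed `trainer.save_freq` in the veRL section. A minimal sketch of the new-style keys side by side, using the defaults documented above (other keys elided):

```yaml
# Sketch of the renamed/added keys only; values are the documented defaults.
synchronizer:
  sync_method: 'nccl'           # 'nccl': trainer pushes weights; 'checkpoint': explorer reloads saved checkpoints
  sync_iteration_interval: 10   # iterations between two synchronizations
  sync_timeout: 1200            # timeout of a synchronization (default 1200)
trainer:
  save_interval: 100            # iterations between two checkpoints
```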

examples/dpo_humanlike/dpo.yaml

3 additions, 3 deletions

````diff
@@ -37,8 +37,6 @@ explorer:
   enforce_eager: true
   dtype: bfloat16
   temperature: 1.0
-  top_p: 1.0
-  top_k: -1
   seed: 42
   logprobs: 0
   repeat_times: 1 # NOTE
@@ -47,12 +45,14 @@ explorer:
   max_pending_requests: 32
   max_waiting_steps: 4
 synchronizer:
-  sync_method: 'offline'
+  sync_method: 'checkpoint'
   sync_iteration_interval: 30
+  sync_timeout: 1200
 trainer:
   trainer_type: 'verl'
   algorithm_type: dpo
   trainer_config_path: 'examples/dpo_humanlike/train_dpo.yaml'
+  save_interval: 30
 monitor:
   cache_root_dir: ""
   project: "dpo_example"
````
