
Commit 1362f35

update config files
1 parent a04ca48 commit 1362f35

File tree: 23 files changed, +201 −215 lines changed

docs/sphinx_doc/source/tutorial/example_async_mode.md

Lines changed: 3 additions & 3 deletions

@@ -10,10 +10,10 @@ In addition, we need to configure the following parameters in both files.
 The model weights of the explorer and trainer are synchronized once every `sync_iteration_interval * batch_size` tasks.
 
 ```yaml
-model:
-  checkpoint_path: /PATH/TO/CHECKPOINT
+project: tutorial
+name: async_mode_example
+checkpoint_root_dir: /PATH/TO/CHECKPOINT
 
-# The same data_base path
 buffer:
   batch_size: <batch_size>
   trainer_input:
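For concreteness, a minimal sketch of the shared section after this change; the `batch_size` and `sync_iteration_interval` values below are illustrative, not taken from this file:

```yaml
# Sketch only: the concrete values are assumptions used for the arithmetic in the comment.
project: tutorial
name: async_mode_example
checkpoint_root_dir: /PATH/TO/CHECKPOINT

buffer:
  batch_size: 32  # with sync_iteration_interval: 10, weights sync every 10 * 32 = 320 tasks
```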

docs/sphinx_doc/source/tutorial/example_reasoning_basic.md

Lines changed: 10 additions & 9 deletions

@@ -79,19 +79,20 @@ trinity run --config examples/grpo_gsm8k/gsm8k.yaml
 
 ## Optional: RFT with SFT Warmup
 
-Before RFT, we may use SFT as a warmup step. We need to set `trainer.sft_warmup_steps > 0` and prepare the SFT data to `buffer.train_dataset.path=$DATASET_PATH/{sft_data}`.
+Before RFT, we may use SFT as a warmup step. We need to set `buffer.trainer_input.sft_warmup_steps > 0` and prepare the SFT data to `buffer.trainer_input.sft_warmup_dataset.path=$DATASET_PATH/{sft_data}`.
 
 ```yaml
 # Properly set the following configs in gsm8k.yaml
 buffer:
-  sft_warmup_dataset:
-    storage_type: file
-    path: <$DATASET_PATH/{sft_data}>
-    format:
-      prompt_type: <prompt_type> # messages/plaintext/chatpair
-      prompt_key: <prompt_key>
-      response_key: <response_key>
-  sft_warmup_steps: 10
+  trainer_input:
+    sft_warmup_dataset:
+      storage_type: file
+      path: <$DATASET_PATH/{sft_data}>
+      format:
+        prompt_type: <prompt_type> # messages/plaintext/chatpair
+        prompt_key: <prompt_key>
+        response_key: <response_key>
+    sft_warmup_steps: 10
 ```
 
 The following command runs SFT and RFT in sequence:
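A filled-in version of the relocated warmup block might look like the sketch below; the path and key names are hypothetical placeholders, not values from this commit:

```yaml
# All concrete values here are hypothetical placeholders.
buffer:
  trainer_input:
    sft_warmup_dataset:
      storage_type: file
      path: /data/gsm8k/sft_warmup.jsonl  # hypothetical path
      format:
        prompt_type: messages   # or plaintext / chatpair
        prompt_key: prompt      # hypothetical key name
        response_key: response  # hypothetical key name
    sft_warmup_steps: 10
```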

docs/sphinx_doc/source/tutorial/trinity_configs.md

Lines changed: 11 additions & 18 deletions

@@ -29,7 +29,6 @@ monitor:
 - `monitor.monitor_type`: The type of the monitor. For now, `MonitorType.WANDB` and `MonitorType.TENSORBOARD` are supported.
 
 
-
 ## Data Processing
 
 <!-- The `data` configuration specifies the data used for training. It includes the total number of epochs, the batch size, the path to the dataset, the default workflow type, the default reward function type, and the format configuration. -->
@@ -65,16 +64,11 @@ The `model` configuration specifies the model used for training. It includes the
 model:
   model_path: '/PATH/TO/MODEL/CHECKPOINT/'
   critic_model_path: ''
-  max_prompt_tokens: 256
-  max_response_tokens: 1024
-  checkpoint_path: 'checkpoints/qwen2.5-1.5B-countdown'
 ```
 
 - `model.model_path`: The path to the model checkpoint. It must be set manually.
 - `model.critic_model_path`: The path to the critic model checkpoint. If not set, the `model.critic_model_path` will be set to `model.model_path`.
-- `model.max_prompt_tokens`: The maximum number of tokens in the prompt. Default is `2048`. It should be set manually.
-- `model.max_response_tokens`: The maximum number of tokens in the response. Default is `2048`. It should be set manually.
-- `model.checkpoint_path`: The path to the checkpoint of the model. It must be set manually.
+
 
 ## Cluster
 
@@ -143,14 +137,15 @@ The `explorer` configuration specifies the explorer configuration. It includes t
 
 ```yaml
 explorer:
-  engine_type: vllm_async
-  engine_num: 2
   runner_num: 32
-  tensor_parallel_size: 1
-  enable_prefix_caching: false
-  enforce_eager: true
-  dtype: bfloat16
-  seed: 42
+  rollout_model:
+    engine_type: vllm_async
+    engine_num: 2
+    tensor_parallel_size: 1
+    enable_prefix_caching: false
+    enforce_eager: true
+    dtype: bfloat16
+    seed: 42
 ```
 
 - `explorer.engine_type`: The type of the engine, Support `vllm_async` and `vllm_sync`. Default is `vllm_async`.
@@ -161,6 +156,8 @@ explorer:
 - `explorer.enforce_eager`: Whether to enforce eager mode. Default is `True`.
 - `explorer.dtype`: The data type used in vLLM. Default is `bfloat16`.
 - `explorer.seed`: The seed used in vLLM. Default is `42`.
+- `explorer.rollout_model.max_prompt_tokens`: The maximum number of tokens in the prompt. Default is `2048`. It should be set manually.
+- `explorer.rollout_model.max_response_tokens`: The maximum number of tokens in the response. Default is `2048`. It should be set manually.
 
 ## Synchronizer
 
@@ -183,15 +180,11 @@ Support `nccl` and `checkpoint`, `nccl` represents that model weights in `explor
 trainer:
   trainer_type: 'verl'
   trainer_config_path: 'examples/ppo_countdown/train_countdown.yaml'
-  sft_warmup_steps: 0
-  eval_interval: 1000
   save_interval: 100
 ```
 
 - `trainer.trainer_type`: The backend of the trainer, Only `verl` is supported.
 - `trainer.trainer_config_path`: The path to the trainer configuration file. It must be set manually.
-- `trainer.sft_warmup_steps`: The number of steps to warm up the model. Default is `0`.
-- `trainer.eval_interval`: The interval steps between two evaluations. Default is `1000`.
 - `trainer.save_interval`: The interval steps between two checkpoints. Default is `100`.
 
 ### veRL Trainer Configuration
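Taken together, this file documents the schema change the rest of the commit applies: per-engine rollout settings move from flat keys under `explorer` into a nested `explorer.rollout_model`, `model.checkpoint_path` is replaced by a top-level `checkpoint_root_dir`, and SFT warmup moves from `trainer` to `buffer.trainer_input`. A minimal sketch of the new layout, with placeholder values:

```yaml
checkpoint_root_dir: /PATH/TO/CHECKPOINT  # replaces model.checkpoint_path
model:
  model_path: /PATH/TO/MODEL
explorer:
  runner_num: 32
  rollout_model:               # previously flat keys under explorer
    engine_type: vllm_async
    engine_num: 2
    max_prompt_tokens: 2048    # previously model.max_prompt_tokens
    max_response_tokens: 2048  # previously model.max_response_tokens
buffer:
  trainer_input:
    sft_warmup_steps: 0        # previously trainer.sft_warmup_steps
```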

examples/async_gsm8k/explorer.yaml

Lines changed: 9 additions & 11 deletions

@@ -2,11 +2,11 @@ project: "Trinity-RFT-gsm8k"
 name: "async-qwen2.5-1.5B-gsm8k"
 mode: explore
 algorithm_type: grpo
+checkpoint_root_dir: 'checkpoints/qwen2.5-1.5B-gsm8k'
 model:
   model_path: /PATH/TO/MODEL/
   max_prompt_tokens: 256
   max_response_tokens: 1024
-  checkpoint_path: 'checkpoints/qwen2.5-1.5B-gsm8k'
 cluster:
   node_num: 1
   gpu_per_node: 8
@@ -36,20 +36,18 @@ buffer:
   path: 'sqlite:///gsm8k.db'
 explorer:
   eval_interval: 10
-  engine_type: vllm_async
-  engine_num: 2
   runner_num: 32
-  tensor_parallel_size: 1
-  enable_prefix_caching: false
-  enforce_eager: true
-  dtype: bfloat16
-  seed: 42
+  rollout_model:
+    engine_type: vllm_async
+    engine_num: 2
+    tensor_parallel_size: 1
+    enable_prefix_caching: false
+    enforce_eager: true
+    dtype: bfloat16
+    seed: 42
 synchronizer:
   sync_method: 'checkpoint'
   sync_iteration_interval: 10
 trainer:
   trainer_type: 'verl'
   trainer_config_path: examples/async_gsm8k/verl_config.yaml
-  sft_warmup_steps: 0 # Set to integer to enable sft warmup
-monitor:
-  cache_root_dir: ""

examples/async_gsm8k/trainer.yaml

Lines changed: 10 additions & 12 deletions

@@ -2,11 +2,11 @@ project: "Trinity-RFT-gsm8k"
 name: "async-qwen2.5-1.5B-gsm8k"
 mode: train
 algorithm_type: grpo
+checkpoint_root_dir: /PATH/TO/CHECKPOINT
 model:
-  model_path: /PATH/TO/MODEL/
+  model_path: /PATH/TO/MODEL
   max_prompt_tokens: 256
   max_response_tokens: 1024
-  checkpoint_path: ""
 cluster:
   node_num: 1
   gpu_per_node: 8
@@ -35,20 +35,18 @@ buffer:
   path: 'sqlite:///gsm8k.db'
 explorer:
   eval_interval: 10
-  engine_type: vllm_async
-  engine_num: 2
   runner_num: 32
-  tensor_parallel_size: 1
-  enable_prefix_caching: false
-  enforce_eager: true
-  dtype: bfloat16
-  seed: 42
+  rollout_model:
+    engine_type: vllm_async
+    engine_num: 2
+    tensor_parallel_size: 1
+    enable_prefix_caching: false
+    enforce_eager: true
+    dtype: bfloat16
+    seed: 42
 synchronizer:
   sync_method: 'checkpoint'
   sync_iteration_interval: 10
 trainer:
   trainer_type: 'verl'
   trainer_config_path: examples/async_gsm8k/verl_config.yaml
-  sft_warmup_steps: 0 # Set to integer to enable sft warmup
-monitor:
-  cache_root_dir: ""
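Since `sync_method` is `checkpoint` here, the trainer presumably writes checkpoints that the explorer reloads, and the tutorial above says to configure the shared parameters "in both files". A sketch of the keys that, under that reading, should agree between `explorer.yaml` and `trainer.yaml`:

```yaml
# Assumption: these keys are meant to be identical in explorer.yaml and trainer.yaml.
project: "Trinity-RFT-gsm8k"
name: "async-qwen2.5-1.5B-gsm8k"
checkpoint_root_dir: /PATH/TO/CHECKPOINT  # trainer writes here, explorer reads from here
buffer:
  # nesting abbreviated; both files point the queue at path: 'sqlite:///gsm8k.db'
synchronizer:
  sync_method: 'checkpoint'
  sync_iteration_interval: 10
```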

examples/dpo_humanlike/dpo.yaml

Lines changed: 2 additions & 4 deletions

@@ -2,11 +2,11 @@ project: "dpo_example"
 name: "trinity_dpo"
 mode: train
 algorithm_type: dpo
+checkpoint_root_dir: /PATH/TO/CHECKPOINT
 model:
-  model_path: '/PATH/TO/MODEL/CHECKPOINT/' # NOTE
+  model_path: '/PATH/TO/MODEL' # NOTE
   max_prompt_tokens: 1792
   max_response_tokens: 256
-  checkpoint_path: 'checkpoints/trinity_dpo'
 cluster:
   node_num: 1
   gpu_per_node: 8
@@ -33,5 +33,3 @@ trainer:
   trainer_type: 'verl'
   trainer_config_path: 'examples/dpo_humanlike/train_dpo.yaml'
   save_interval: 30
-monitor:
-  cache_root_dir: ""

examples/grpo_alfworld/alfworld.yaml

Lines changed: 14 additions & 15 deletions

@@ -1,11 +1,9 @@
 project: "ALFWORLD"
 name: "ALFWORLD_RFT"
 algorithm_type: grpo
+checkpoint_root_dir: /PATH/TO/CHECKPOINT/ALFWORLD_RFT/
 model:
-  model_path: '/PATH/TO/MODEL/CHECKPOINT/'
-  max_prompt_tokens: 4096
-  max_response_tokens: 16384
-  checkpoint_path: 'checkpoints/ALFWORLD_RFT'
+  model_path: /PATH/TO/MODEL/
 cluster:
   node_num: 1
   gpu_per_node: 8
@@ -32,16 +30,19 @@ buffer:
   storage_type: queue
   path: 'sqlite:///alfworld.db'
 explorer:
-  engine_type: vllm_async
-  engine_num: 2
   runner_num: 32
-  tensor_parallel_size: 2
-  enable_prefix_caching: false
-  enforce_eager: true
-  dtype: bfloat16
-  seed: 42
-  gpu_memory_utilization: 0.7
-  enable_chunked_prefill: true
+  rollout_model:
+    engine_type: vllm_async
+    engine_num: 2
+    tensor_parallel_size: 2
+    enable_prefix_caching: false
+    enforce_eager: true
+    max_prompt_tokens: 4096
+    max_response_tokens: 16384
+    dtype: bfloat16
+    seed: 42
+    gpu_memory_utilization: 0.7
+    enable_chunked_prefill: true
 synchronizer:
   sync_method: 'nccl'
   sync_interval: 8
@@ -50,5 +51,3 @@ trainer:
   trainer_type: 'verl'
   trainer_config_path: 'examples/grpo_alfworld/train_alfworld.yaml'
   save_interval: 10
-monitor:
-  cache_root_dir: ""
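This example raises `tensor_parallel_size` to 2 and uses a much longer context than the other configs. Under the (assumed) accounting that each vLLM engine occupies `tensor_parallel_size` GPUs, rollout takes 2 * 2 = 4 of the node's 8 GPUs:

```yaml
# GPU accounting sketch; the per-engine GPU model is an assumption.
cluster:
  gpu_per_node: 8
explorer:
  rollout_model:
    engine_num: 2
    tensor_parallel_size: 2      # assumed: 2 engines * TP 2 = 4 GPUs for rollout
    gpu_memory_utilization: 0.7  # leaves headroom for the 4096 + 16384 token context
```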

examples/grpo_gsm8k/gsm8k.yaml

Lines changed: 11 additions & 16 deletions

@@ -1,6 +1,7 @@
 project: "Trinity-RFT-gsm8k"
 name: "qwen2.5-1.5B-gsm8k"
 algorithm_type: grpo
+checkpoint_root_dir: /PATH/TO/CHECKPOINT/
 data_processor:
   # basic info
   source_data_path: 'openai/gsm8k'
@@ -17,9 +18,6 @@ data_processor:
 
 model:
   model_path: '/PATH/TO/MODEL/'
-  max_prompt_tokens: 256
-  max_response_tokens: 1024
-  checkpoint_path: ""
 cluster:
   node_num: 1
   gpu_per_node: 8
@@ -61,27 +59,24 @@ buffer:
   # name: warmup_data
   # storage_type: file
   # path: '/PATH/TO/WARMUP_DATA/'
-  # kwargs:
-  #   prompt_type: plaintext
 explorer:
   eval_interval: 50
-  engine_type: vllm_async
-  engine_num: 2
   runner_num: 32
-  tensor_parallel_size: 1
-  enable_prefix_caching: false
-  enforce_eager: true
-  dtype: bfloat16
-  seed: 42
+  rollout_model:
+    engine_type: vllm_async
+    engine_num: 2
+    tensor_parallel_size: 1
+    enable_prefix_caching: false
+    enforce_eager: true
+    dtype: bfloat16
+    max_prompt_tokens: 256
+    max_response_tokens: 1024
+    seed: 42
 synchronizer:
   sync_method: 'nccl'
   sync_interval: 2
   sync_timeout: 1200
 trainer:
   trainer_type: 'verl'
   trainer_config_path: 'examples/grpo_gsm8k/train_gsm8k.yaml'
-  sft_warmup_steps: 0 # Set to integer to enable sft warmup
   save_interval: 100
-  # get_exp_strategy: 'LFU'
-monitor:
-  cache_root_dir: ""
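This is the config that the reasoning tutorial above launches; per the hunk header in `example_reasoning_basic.md`, the command is:

```
trinity run --config examples/grpo_gsm8k/gsm8k.yaml
```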

examples/grpo_math/math.yaml

Lines changed: 11 additions & 13 deletions

@@ -1,11 +1,9 @@
 project: grpo_math
 name: grpo_math_example
 algorithm_type: grpo
+checkpoint_root_dir: /PATH/TO/CHECKPOINT/
 model:
   model_path: /PATH/TO/MODEL/
-  max_prompt_tokens: 1024
-  max_response_tokens: 3072
-  checkpoint_path: /PATH/TO/CHECKPOINT/
 cluster:
   node_num: 1
   gpu_per_node: 8
@@ -34,22 +32,22 @@ buffer:
   path: 'sqlite:///math.db'
 explorer:
   eval_interval: 10
-  engine_type: vllm_async
-  engine_num: 2
   runner_num: 32
-  tensor_parallel_size: 1
-  enable_prefix_caching: false
-  enforce_eager: true
-  dtype: bfloat16
-  seed: 42
+  rollout_model:
+    engine_type: vllm_async
+    engine_num: 2
+    tensor_parallel_size: 1
+    enable_prefix_caching: false
+    enforce_eager: true
+    dtype: bfloat16
+    max_prompt_tokens: 1024
+    max_response_tokens: 3072
+    seed: 42
 synchronizer:
   sync_method: 'nccl'
   sync_interval: 2
   sync_timeout: 1200
 trainer:
   trainer_type: 'verl'
   trainer_config_path: 'examples/grpo_math/train_math.yaml'
-  sft_warmup_steps: 0 # Set to integer to enable sft warmup
   save_interval: 100
-monitor:
-  cache_root_dir: ""
