Binary file added docs/sphinx_doc/assets/toolace_reward_curve.png
17 changes: 17 additions & 0 deletions examples/grpo_toolcall/README.md
@@ -0,0 +1,17 @@
# GRPO on the ToolACE dataset

This example shows how to run GRPO on the [ToolACE](https://huggingface.co/datasets/Team-ACE/ToolACE) dataset.

The data preprocessing script and the workflow construction are adapted from [Tool-N1](https://github.com/NVlabs/Tool-N1).

The config files are located in [`toolace.yaml`](toolace.yaml) and [`train_toolace.yaml`](train_toolace.yaml).


## How to run
To preprocess the data into the format required by our `toolcall_workflow`, run the following command: `python scripts/data_prepare/get_toolace_data.py`.
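If you want to sanity-check the preprocessed output before training, a minimal sketch like the one below can help. It assumes the script writes JSON-lines files under `scripts/data_prepare/toolace_data`; the exact file layout and field names are defined by `get_toolace_data.py`, so treat this as illustrative only.

```python
import glob
import json

# Assumption: the preprocessing script emits *.jsonl files in this directory.
data_files = sorted(glob.glob("scripts/data_prepare/toolace_data/*.jsonl"))
print(f"found {len(data_files)} file(s)")

if data_files:
    with open(data_files[0], "r", encoding="utf-8") as f:
        first_record = json.loads(f.readline())
    # Print the top-level keys to confirm the schema the toolcall_workflow expects
    # (the key names depend on get_toolace_data.py).
    print(list(first_record.keys()))
```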

Then fill in the placeholder paths in `toolace.yaml` (e.g. `model_path` and `checkpoint_root_dir`) and run the following command: `trinity run --config examples/grpo_toolcall/toolace.yaml`.
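During training, each task is rolled out `repeat_times: 8` times (see `toolace.yaml`), and GRPO turns the rewards of each rollout group into group-relative advantages. A minimal sketch of that computation, for intuition only (not Trinity-RFT's actual implementation):

```python
from statistics import mean, stdev

def grpo_advantages(group_rewards, eps=1e-6):
    """Normalize each rollout's reward by the mean and std of its group
    (one group = all rollouts of the same prompt)."""
    mu = mean(group_rewards)
    sigma = stdev(group_rewards) if len(group_rewards) > 1 else 0.0
    return [(r - mu) / (sigma + eps) for r in group_rewards]

# Example: 8 rollouts of one prompt, rewarded 1.0 for a correct tool call.
print(grpo_advantages([1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0]))
```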

## Reward curve results

![ToolACE reward curve](../../docs/sphinx_doc/assets/toolace_reward_curve.png)
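The reward plotted above comes from the `toolcall_workflow`. As a rough, hypothetical illustration of a rule-based tool-call reward in the spirit of Tool-N1 (the function name, reward values, and call format below are assumptions, not the actual implementation):

```python
import json

def toolcall_reward(model_output: str, ground_truth_calls: list) -> float:
    """Hypothetical rule-based reward: 1.0 if the model emits valid JSON
    whose tool calls exactly match the reference calls, else 0.0."""
    try:
        predicted = json.loads(model_output)
    except json.JSONDecodeError:
        return 0.0  # malformed output gets no reward

    def normalize(calls):
        # Compare calls as an order-insensitive set of (name, arguments) pairs.
        return {(c["name"], json.dumps(c.get("arguments", {}), sort_keys=True)) for c in calls}

    try:
        return 1.0 if normalize(predicted) == normalize(ground_truth_calls) else 0.0
    except (TypeError, KeyError):
        return 0.0  # unexpected structure
```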
55 changes: 55 additions & 0 deletions examples/grpo_toolcall/toolace.yaml
@@ -0,0 +1,55 @@
project: "Trinity-RFT-toolace"
name: "qwen2.5-7B-toolace"
checkpoint_root_dir: /PATH/TO/CHECKPOINT/
algorithm:
  algorithm_type: grpo
  repeat_times: 8

model:
  model_path: /PATH/TO/MODEL/
  max_prompt_tokens: 4096
  max_response_tokens: 8192
cluster:
  node_num: 1
  gpu_per_node: 8
buffer:
  total_epochs: 1
  batch_size: 128
  max_retry_times: 3
  max_retry_interval: 1
  explorer_input:
    taskset:
      name: toolace_data
      storage_type: file
      path: scripts/data_prepare/toolace_data
      # format: []
      rollout_args:
        n: 8
        temperature: 1.0
        logprobs: 0
    eval_tasksets: []
    default_workflow_type: 'toolcall_workflow'
  trainer_input:
    experience_buffer:
      name: toolace_buffer
      storage_type: queue
      path: 'sqlite:///toolace.db'
explorer:
  eval_interval: 50
  runner_num: 32
  rollout_model:
    engine_type: vllm_async
    engine_num: 4
    tensor_parallel_size: 1
    enable_prefix_caching: false
    enforce_eager: true
    dtype: bfloat16
    seed: 42
synchronizer:
  sync_method: 'nccl'
  sync_interval: 1
  sync_timeout: 3600
trainer:
  trainer_type: 'verl'
  trainer_config_path: 'examples/grpo_toolcall/train_toolace.yaml'
  save_interval: 100
49 changes: 49 additions & 0 deletions examples/grpo_toolcall/train_toolace.yaml
@@ -0,0 +1,49 @@
actor_rollout_ref:
  hybrid_engine: True
  model:
    external_lib: null
    override_config: { }
    enable_gradient_checkpointing: True
    use_remove_padding: True # False
  actor:
    strategy: fsdp # This is for backward-compatibility
    ppo_micro_batch_size_per_gpu: 1
    use_dynamic_bsz: True # False
    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
    grad_clip: 1.0
    ppo_epochs: 1
    shuffle: False
    ulysses_sequence_parallel_size: 2 # sp size
    optim:
      lr: 1e-6
      lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
      # min_lr_ratio: null # only useful for warmup with cosine
      warmup_style: constant # select from constant/cosine
      total_training_steps: -1 # must be overridden by the program
    fsdp_config:
      wrap_policy:
        # transformer_layer_cls_to_wrap: None
        min_num_params: 0
      param_offload: False
      optimizer_offload: False
      fsdp_size: -1
  ref:
    fsdp_config:
      param_offload: False
      wrap_policy:
        # transformer_layer_cls_to_wrap: None
        min_num_params: 0
    log_prob_micro_batch_size_per_gpu: 1
    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
    ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size

trainer:
  balance_batch: True
  # total_training_steps: null
  # auto: find the last ckpt to resume. If can't find, start from scratch
  resume_mode: auto # or disable or resume_path
  default_hdfs_dir: null
  remove_previous_ckpt_in_save: False
  del_local_ckpt_after_load: False
  val_before_train: False
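A note on the dynamic batch size settings above: with `use_dynamic_bsz: True`, micro-batches are formed by packing sequences up to `ppo_max_token_len_per_gpu` (16384) tokens per GPU, so a full-length sample (4096 prompt + 8192 response = 12288 tokens) still fits while shorter samples can share a micro-batch. A toy sketch of greedy token-budget packing, for intuition only (not verl's actual implementation):

```python
def pack_by_token_budget(seq_lengths, max_tokens_per_gpu=16384):
    """Greedily pack sequences into micro-batches whose total token count
    stays within the per-GPU budget (toy version of dynamic batch sizing)."""
    batches, current, used = [], [], 0
    for n_tokens in seq_lengths:
        if current and used + n_tokens > max_tokens_per_gpu:
            batches.append(current)
            current, used = [], 0
        current.append(n_tokens)
        used += n_tokens
    if current:
        batches.append(current)
    return batches

# Example: one full-length sample (4096 + 8192 = 12288 tokens) plus shorter ones.
print(pack_by_token_budget([12288, 3000, 5000, 9000, 2000]))
# -> [[12288, 3000], [5000, 9000, 2000]]
```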