Skip to content

Commit b7a89f8

Browse files
authored
Add workflow and example for toolcall training using ToolAce dataset (#134)
1 parent 8a1d316 commit b7a89f8

File tree

7 files changed

+670
-0
lines changed

7 files changed

+670
-0
lines changed
468 KB
Loading

examples/grpo_toolcall/README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
# GRPO on the ToolAce dataset

This example shows how to use GRPO on the [ToolAce](https://huggingface.co/datasets/Team-ACE/ToolACE) dataset.

We reference code from [Tool-N1](https://github.com/NVlabs/Tool-N1) for the data-preprocessing script and the workflow construction.

The config files are located in [`toolace.yaml`](toolace.yaml) and [`train_toolace.yaml`](train_toolace.yaml).

## How to run

To preprocess the data into the format required by our `toolcall_workflow`, run the following command: `python scripts/data_prepare/get_toolace_data.py`.

Then fill in the config file `toolace.yaml` and run the following command: `trinity run --config examples/grpo_toolcall/toolace.yaml`.

## Reward curve results

![](../../docs/sphinx_doc/assets/toolace_reward_curve.png)
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
---
# Trinity-RFT run config: GRPO training of Qwen2.5-7B on the ToolAce dataset.
# Nesting reconstructed from the Trinity-RFT config schema — the scraped diff
# had stripped all indentation; verify against the upstream example before use.
project: "Trinity-RFT-toolace"
name: "qwen2.5-7B-toolace"
checkpoint_root_dir: /PATH/TO/CHECKPOINT/  # fill in before running
algorithm:
  algorithm_type: grpo
  repeat_times: 8  # rollouts per prompt used for the GRPO group

model:
  model_path: /PATH/TO/MODEL/  # fill in before running
  max_prompt_tokens: 4096
  max_response_tokens: 8192
cluster:
  node_num: 1
  gpu_per_node: 8
buffer:
  total_epochs: 1
  batch_size: 128
  max_retry_times: 3
  max_retry_interval: 1
  explorer_input:
    taskset:
      name: toolace_data
      # Reads the files produced by scripts/data_prepare/get_toolace_data.py.
      storage_type: file
      path: scripts/data_prepare/toolace_data
      # format: []
      rollout_args:
        n: 8  # should match algorithm.repeat_times
        temperature: 1.0
        logprobs: 0
    eval_tasksets: []
    default_workflow_type: 'toolcall_workflow'
  trainer_input:
    experience_buffer:
      name: toolace_buffer
      storage_type: queue
      path: 'sqlite:///toolace.db'
explorer:
  eval_interval: 50
  runner_num: 32
  rollout_model:
    engine_type: vllm_async
    # 4 engines x TP 1 = 4 GPUs for rollout; remaining GPUs go to the trainer.
    engine_num: 4
    tensor_parallel_size: 1
    enable_prefix_caching: false
    enforce_eager: true
    dtype: bfloat16
    seed: 42
synchronizer:
  sync_method: 'nccl'
  sync_interval: 1  # sync explorer weights from trainer every step
  sync_timeout: 3600
trainer:
  trainer_type: 'verl'
  trainer_config_path: 'examples/grpo_toolcall/train_toolace.yaml'
  save_interval: 100
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
---
# verl trainer overrides for the ToolAce GRPO example.
# Nesting reconstructed from verl's ppo_trainer config schema — the scraped
# diff had stripped all indentation; verify against upstream before use.
actor_rollout_ref:
  hybrid_engine: True
  model:
    external_lib: null
    override_config: { }
    enable_gradient_checkpointing: True
    use_remove_padding: True # False
  actor:
    strategy: fsdp # This is for backward-compatibility
    ppo_micro_batch_size_per_gpu: 1
    use_dynamic_bsz: True # False
    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
    grad_clip: 1.0
    ppo_epochs: 1
    shuffle: False
    ulysses_sequence_parallel_size: 2 # sp size
    optim:
      lr: 1e-6
      lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
      # min_lr_ratio: null # only useful for warmup with cosine
      warmup_style: constant # select from constant/cosine
      total_training_steps: -1 # must be overridden by the program at runtime
    fsdp_config:
      wrap_policy:
        # transformer_layer_cls_to_wrap: None
        min_num_params: 0
      param_offload: False
      optimizer_offload: False
      fsdp_size: -1
  ref:
    fsdp_config:
      param_offload: False
      wrap_policy:
        # transformer_layer_cls_to_wrap: None
        min_num_params: 0
    log_prob_micro_batch_size_per_gpu: 1
    # Keep ref log-prob batching consistent with the actor's settings.
    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
    ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size

trainer:
  balance_batch: True
  # total_training_steps: null
  # auto: find the last ckpt to resume. If can't find, start from scratch
  resume_mode: auto # select from auto / disable / resume_path
  default_hdfs_dir: null
  remove_previous_ckpt_in_save: False
  del_local_ckpt_after_load: False
  val_before_train: False

0 commit comments

Comments
 (0)