Binary file added docs/sphinx_doc/assets/toolace_reward_curve.png
17 changes: 17 additions & 0 deletions examples/grpo_toolcall/README.md
@@ -0,0 +1,17 @@
# GRPO on the ToolACE dataset

This example shows how to run GRPO on the [ToolACE](https://huggingface.co/datasets/Team-ACE/ToolACE) dataset.

The data preprocessing script and the workflow construction are adapted from [Tool-N1](https://github.com/NVlabs/Tool-N1).

The config files are located in [`toolace.yaml`](toolace.yaml) and [`train_toolace.yaml`](train_toolace.yaml).


## How to run
To preprocess the data into the format required by our `toolcall_workflow`, run the following command: `python scripts/data_prepare/get_toolace_data.py`.
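If you want to sanity-check the preprocessed output before training, a minimal sketch like the one below can help. It assumes the script writes JSON-lines files under `scripts/data_prepare/toolace_data`; the exact file layout and field names are defined by `get_toolace_data.py`, so treat this as illustrative only.

```python
import glob
import json

# Assumption: the preprocessing script emits *.jsonl files in this directory.
data_files = sorted(glob.glob("scripts/data_prepare/toolace_data/*.jsonl"))
print(f"found {len(data_files)} file(s)")

if data_files:
    with open(data_files[0], "r", encoding="utf-8") as f:
        first_record = json.loads(f.readline())
    # Print the top-level keys to confirm the schema the toolcall_workflow expects
    # (the key names depend on get_toolace_data.py).
    print(list(first_record.keys()))
```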

Then fill in the placeholder paths in `toolace.yaml` (e.g. `model_path` and `checkpoint_root_dir`) and run the following command: `trinity run --config examples/grpo_toolcall/toolace.yaml`.
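During training, each task is rolled out `repeat_times: 8` times (see `toolace.yaml`), and GRPO turns the rewards of each rollout group into group-relative advantages. A minimal sketch of that computation, for intuition only (not Trinity-RFT's actual implementation):

```python
from statistics import mean, stdev

def grpo_advantages(group_rewards, eps=1e-6):
    """Normalize each rollout's reward by the mean and std of its group
    (one group = all rollouts of the same prompt)."""
    mu = mean(group_rewards)
    sigma = stdev(group_rewards) if len(group_rewards) > 1 else 0.0
    return [(r - mu) / (sigma + eps) for r in group_rewards]

# Example: 8 rollouts of one prompt, rewarded 1.0 for a correct tool call.
print(grpo_advantages([1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0]))
```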

## Reward curve results

![ToolACE reward curve](../../docs/sphinx_doc/assets/toolace_reward_curve.png)
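The reward plotted above comes from the `toolcall_workflow`. As a rough, hypothetical illustration of a rule-based tool-call reward in the spirit of Tool-N1 (the function name, reward values, and call format below are assumptions, not the actual implementation):

```python
import json

def toolcall_reward(model_output: str, ground_truth_calls: list) -> float:
    """Hypothetical rule-based reward: 1.0 if the model emits valid JSON
    whose tool calls exactly match the reference calls, else 0.0."""
    try:
        predicted = json.loads(model_output)
    except json.JSONDecodeError:
        return 0.0  # malformed output gets no reward

    def normalize(calls):
        # Compare calls as an order-insensitive set of (name, arguments) pairs.
        return {(c["name"], json.dumps(c.get("arguments", {}), sort_keys=True)) for c in calls}

    try:
        return 1.0 if normalize(predicted) == normalize(ground_truth_calls) else 0.0
    except (TypeError, KeyError):
        return 0.0  # unexpected structure
```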
55 changes: 55 additions & 0 deletions examples/grpo_toolcall/toolace.yaml
@@ -0,0 +1,55 @@
project: "Trinity-RFT-toolace"
name: "qwen2.5-7B-toolace"
checkpoint_root_dir: /PATH/TO/CHECKPOINT/
algorithm:
  algorithm_type: grpo
  repeat_times: 8

model:
  model_path: /PATH/TO/MODEL/
  max_prompt_tokens: 4096
  max_response_tokens: 8192
cluster:
  node_num: 1
  gpu_per_node: 8
buffer:
  total_epochs: 1
  batch_size: 128
  max_retry_times: 3
  max_retry_interval: 1
  explorer_input:
    taskset:
      name: toolace_data
      storage_type: file
      path: scripts/data_prepare/toolace_data
      # format: []
      rollout_args:
        n: 8
        temperature: 1.0
        logprobs: 0
    eval_tasksets: []
    default_workflow_type: 'toolcall_workflow'
  trainer_input:
    experience_buffer:
      name: toolace_buffer
      storage_type: queue
      path: 'sqlite:///toolace.db'
explorer:
  eval_interval: 50
  runner_num: 32
  rollout_model:
    engine_type: vllm_async
    engine_num: 4
    tensor_parallel_size: 1
    enable_prefix_caching: false
    enforce_eager: true
    dtype: bfloat16
    seed: 42
synchronizer:
  sync_method: 'nccl'
  sync_interval: 1
  sync_timeout: 3600
trainer:
  trainer_type: 'verl'
  trainer_config_path: 'examples/grpo_toolcall/train_toolace.yaml'
  save_interval: 100
49 changes: 49 additions & 0 deletions examples/grpo_toolcall/train_toolace.yaml
@@ -0,0 +1,49 @@
actor_rollout_ref:
  hybrid_engine: True
  model:
    external_lib: null
    override_config: { }
    enable_gradient_checkpointing: True
    use_remove_padding: True # False
  actor:
    strategy: fsdp # This is for backward-compatibility
    ppo_micro_batch_size_per_gpu: 1
    use_dynamic_bsz: True # False
    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
    grad_clip: 1.0
    ppo_epochs: 1
    shuffle: False
    ulysses_sequence_parallel_size: 2 # sp size
    optim:
      lr: 1e-6
      lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
      # min_lr_ratio: null # only useful for warmup with cosine
      warmup_style: constant # select from constant/cosine
      total_training_steps: -1 # must be overridden by the program
    fsdp_config:
      wrap_policy:
        # transformer_layer_cls_to_wrap: None
        min_num_params: 0
      param_offload: False
      optimizer_offload: False
      fsdp_size: -1
  ref:
    fsdp_config:
      param_offload: False
      wrap_policy:
        # transformer_layer_cls_to_wrap: None
        min_num_params: 0
    log_prob_micro_batch_size_per_gpu: 1
    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
    ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size

trainer:
  balance_batch: True
  # total_training_steps: null
  # auto: find the last ckpt to resume. If can't find, start from scratch
  resume_mode: auto # or disable or resume_path
  default_hdfs_dir: null
  remove_previous_ckpt_in_save: False
  del_local_ckpt_after_load: False
  val_before_train: False
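A note on the dynamic batch size settings above: with `use_dynamic_bsz: True`, micro-batches are formed by packing sequences up to `ppo_max_token_len_per_gpu` (16384) tokens per GPU, so a full-length sample (4096 prompt + 8192 response = 12288 tokens) still fits while shorter samples can share a micro-batch. A toy sketch of greedy token-budget packing, for intuition only (not verl's actual implementation):

```python
def pack_by_token_budget(seq_lengths, max_tokens_per_gpu=16384):
    """Greedily pack sequences into micro-batches whose total token count
    stays within the per-GPU budget (toy version of dynamic batch sizing)."""
    batches, current, used = [], [], 0
    for n_tokens in seq_lengths:
        if current and used + n_tokens > max_tokens_per_gpu:
            batches.append(current)
            current, used = [], 0
        current.append(n_tokens)
        used += n_tokens
    if current:
        batches.append(current)
    return batches

# Example: one full-length sample (4096 + 8192 = 12288 tokens) plus shorter ones.
print(pack_by_token_budget([12288, 3000, 5000, 9000, 2000]))
# -> [[12288, 3000], [5000, 9000, 2000]]
```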