
Commit efff1b7

Fix config manager (#49)
1 parent 24f8baa commit efff1b7

17 files changed: +358 −1,517 lines


docs/sphinx_doc/source/tutorial/trinity_configs.md

Lines changed: 10 additions & 87 deletions
@@ -187,39 +187,30 @@ Support `nccl` and `checkpoint`, `nccl` represents that model weights in `explor
 ```yaml
 trainer:
   trainer_type: 'verl'
-  trainer_config_path: 'examples/ppo_countdown/train_countdown.yaml'
   save_interval: 100
+  trainer_config_path: 'examples/ppo_countdown/train_countdown.yaml'
 ```

 - `trainer.trainer_type`: The backend of the trainer, Only `verl` is supported.
-- `trainer.trainer_config_path`: The path to the trainer configuration file. It must be set manually.
 - `trainer.save_interval`: The interval steps between two checkpoints. Default is `100`.

+- `trainer.actor_grad_clip`: Gradient clip for actor model training.
+- `trainer.actor_clip_ratio`: Used for compute policy loss.
+- `trainer.actor_entropy_coeff`: Used for compute policy loss.
+- `trainer.actor_use_kl_loss`: Whether to enable kl loss.
+- `trainer.actor_kl_loss_coef`: The coefficient of kl loss.
+
+- `trainer.train_config`: The configuration of the trainer. Only one needs to be set for `trainer.trainer_config` and `trainer.trainer_config_path`
+- `trainer.trainer_config_path`: The path to the trainer configuration file. It must be set manually.
+
 ### veRL Trainer Configuration

 Here we mainly introduce the parameters that can be set in veRL. For the specific meaning of the parameters, please refer to the official document of [veRL](https://github.com/volcengine/verl/blob/0bdf7f469854815177e73dcfe9e420836c952e6e/docs/examples/config.rst).

 ```yaml
-data:
-  tokenizer: null
-  train_files: train_example.parquet
-  val_files: test_example.parquet
-  prompt_key: prompt
-  max_prompt_length: 256
-  max_response_length: 1024
-  train_batch_size: 256
-  val_batch_size: null
-  return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs
-  return_raw_chat: False
-  shuffle: True
-  filter_overlong_prompts: False # for large-scale dataset, filtering overlong prompts could be timeconsuming. You should disable this and set `truncation='left'
-  truncation: error
-  image_key: images
-
 actor_rollout_ref:
   hybrid_engine: True
   model:
-    path: /PATH/TO/MODEL/CHECKPOINT/
     external_lib: null
     override_config: { }
     enable_gradient_checkpointing: True
@@ -270,35 +261,6 @@ actor_rollout_ref:
     log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
     log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
     ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
-  rollout:
-    name: vllm
-    temperature: 1.0
-    top_k: -1 # 0 for hf rollout, -1 for vllm rollout
-    top_p: 1
-    use_fire_sampling: False # https://arxiv.org/abs/2410.21236
-    prompt_length: ${data.max_prompt_length} # not use for opensource
-    response_length: ${data.max_response_length}
-    # for vllm rollout
-    dtype: bfloat16 # should align with FSDP
-    gpu_memory_utilization: 0.4
-    ignore_eos: False
-    enforce_eager: True
-    free_cache_engine: True
-    load_format: dummy_dtensor
-    tensor_model_parallel_size: 2
-    max_num_batched_tokens: 8192
-    max_model_len: null
-    max_num_seqs: 1024
-    # log_prob_micro_batch_size: 8 # will be deprecated, use log_prob_micro_batch_size_per_gpu
-    log_prob_micro_batch_size_per_gpu: 4
-    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
-    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
-    disable_log_stats: True
-    enable_chunked_prefill: True # could get higher throughput
-    # for hf rollout
-    do_sample: True
-    # number of responses (i.e. num sample times)
-    n: 1 # > 1 for grpo

 critic:
   strategy: fsdp
@@ -309,8 +271,6 @@ critic:
     warmup_style: constant # select from constant/cosine
     total_training_steps: -1 # must be override by program
   model:
-    path: /PATH/TO/MODEL/CHECKPOINT/
-    tokenizer_path: ${actor_rollout_ref.model.path}
     override_config: { }
     external_lib: ${actor_rollout_ref.model.external_lib}
     enable_gradient_checkpointing: True
@@ -323,7 +283,6 @@ critic:
       min_num_params: 0
     fsdp_size: -1
   ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
-  # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu
   ppo_micro_batch_size_per_gpu: 8
   forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
   use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
@@ -335,34 +294,13 @@ critic:
   grad_clip: 1.0
   cliprange_value: 0.5

-reward_model:
-  enable: False
-  strategy: fsdp
-  model:
-    input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical
-    path: ~/models/FsfairX-LLaMA3-RM-v0.1
-    external_lib: ${actor_rollout_ref.model.external_lib}
-    use_remove_padding: False
-    fsdp_config:
-      min_num_params: 0
-      param_offload: False
-      fsdp_size: -1
-  # micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu
-  # micro_batch_size_per_gpu: 2 # set a number
-  # max_length: null
-  ulysses_sequence_parallel_size: 1 # sp size
-  use_dynamic_bsz: ${critic.use_dynamic_bsz}
-  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
-  reward_manager: tinyzero
-
 custom_reward_function:
   path: null
   name: compute_score

 algorithm:
   gamma: 1.0
   lam: 1.0
-  adv_estimator: gae
   norm_adv_by_std_in_grpo: True
   use_kl_in_reward: False
   kl_penalty: kl # how to estimate kl divergence
@@ -374,24 +312,14 @@ algorithm:

 trainer:
   balance_batch: True
-  total_epochs: 15
   # total_training_steps: null
-  project_name: TinyZero
-  experiment_name: trinity-qwen2.5-1.5b
-  logger: [ 'wandb' ]
-  val_generations_to_log_to_wandb: 0
-  nnodes: 1
-  n_gpus_per_node: 2
-  save_freq: 100
   # auto: find the last ckpt to resume. If can't find, start from scratch
   resume_mode: auto # or auto or resume_path if
   resume_from_path: ""
-  test_freq: 100
   critic_warmup: 0
   default_hdfs_dir: null
   remove_previous_ckpt_in_save: False
   del_local_ckpt_after_load: False
-  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
   val_before_train: False
   max_actor_ckpt_to_keep: 5
   max_critic_ckpt_to_keep: 5
@@ -402,11 +330,6 @@ trainer:
 - `actor_rollout_ref.model.use_remove_padding`: Whether to remove pad tokens, which will reduce training time.
 - `actor_rollout_ref.actor.use_dynamic_bsz`: Whether to reorganize the batch data, specifically to splice the shorter data to reduce the batch size in the actual training process.
 - `actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu`: Batch size for one GPU in one forward pass.
-- `actor_rollout_ref.actor.grad_clip`: Gradient clip for actor model training.
-- `actor_rollout_ref.actor.clip_ratio`: Used for compute policy loss.
-- `actor_rollout_ref.actor.entropy_coeff`: Used for compute policy loss.
-- `actor_rollout_ref.actor.use_kl_loss`: Whether to enable kl loss.
-- `actor_rollout_ref.actor.kl_loss_coef`: The coefficient of kl loss.
 - `actor_rollout_ref.actor.kl_loss_type`: How to compute kl loss, optional value is `kl`, `abs`, `mse` or `low_var_kl`.
 - `actor_rollout_ref.actor.ulysses_sequence_parallel_size`: Ulysses sequence parallel size.
 - `actor_rollout_ref.actor.tau`: strength of regularization w.r.t. old / ref policy.
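
Taken together, the hunks above move `trainer_config_path` below `save_interval`, document a set of actor-related options at the Trinity level (`trainer.actor_grad_clip`, `trainer.actor_clip_ratio`, `trainer.actor_entropy_coeff`, `trainer.actor_use_kl_loss`, `trainer.actor_kl_loss_coef`), and note that only one of `trainer.trainer_config` and `trainer.trainer_config_path` needs to be set. A minimal sketch of a resulting `trainer` section is shown below; the key names come from this diff, but the numeric values are illustrative assumptions, not defaults taken from the commit.

```yaml
trainer:
  trainer_type: 'verl'
  save_interval: 100
  # Either point at a veRL YAML file, or inline the settings under
  # trainer_config -- only one of the two needs to be set.
  trainer_config_path: 'examples/ppo_countdown/train_countdown.yaml'
  # Actor options documented by this commit (values below are assumptions):
  actor_grad_clip: 1.0
  actor_clip_ratio: 0.2
  actor_entropy_coeff: 0.001
  actor_use_kl_loss: false
  actor_kl_loss_coef: 0.001
```
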
Lines changed: 0 additions & 112 deletions
@@ -1,31 +1,13 @@
-data:
-  tokenizer: null
-  train_files: placeholder
-  val_files: placeholder
-  prompt_key: prompt
-  max_prompt_length: 256
-  max_response_length: 1024
-  train_batch_size: 256
-  val_batch_size: null
-  return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs
-  return_raw_chat: False
-  shuffle: True
-  filter_overlong_prompts: False # for large-scale dataset, filtering overlong prompts could be timeconsuming. You should disable this and set `truncation='left'
-  truncation: error
-  image_key: images
-
 actor_rollout_ref:
   hybrid_engine: True
   model:
-    path: /PATH/TO/MODEL/
     external_lib: null
     override_config: { }
     enable_gradient_checkpointing: True
     use_remove_padding: True # False
   actor:
     strategy: fsdp # This is for backward-compatibility
     ppo_mini_batch_size: 128
-    # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu
     ppo_micro_batch_size_per_gpu: 4
     use_dynamic_bsz: True # False
     ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
@@ -61,92 +43,10 @@ actor_rollout_ref:
       wrap_policy:
         # transformer_layer_cls_to_wrap: None
         min_num_params: 0
-    # log_prob_micro_batch_size: 4 # will be deprecated, use log_prob_micro_batch_size_per_gpu
     log_prob_micro_batch_size_per_gpu: 16
     log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
     log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
     ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
-  rollout:
-    name: vllm
-    temperature: 1.0
-    use_fire_sampling: False # https://arxiv.org/abs/2410.21236
-    prompt_length: ${data.max_prompt_length} # not use for opensource
-    response_length: ${data.max_response_length}
-    # for vllm rollout
-    dtype: bfloat16 # should align with FSDP
-    gpu_memory_utilization: 0.4
-    ignore_eos: False
-    enforce_eager: True
-    free_cache_engine: True
-    load_format: dummy_dtensor
-    tensor_model_parallel_size: 2
-    max_num_batched_tokens: 8192
-    max_model_len: null
-    max_num_seqs: 1024
-    # log_prob_micro_batch_size: 8 # will be deprecated, use log_prob_micro_batch_size_per_gpu
-    log_prob_micro_batch_size_per_gpu: 4
-    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
-    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
-    disable_log_stats: True
-    enable_chunked_prefill: True # could get higher throughput
-    # for hf rollout
-    do_sample: True
-    # number of responses (i.e. num sample times)
-    n: 8 # > 1 for grpo
-
-critic:
-  strategy: fsdp
-  optim:
-    lr: 1e-5
-    lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
-    # min_lr_ratio: null # only useful for warmup with cosine
-    warmup_style: constant # select from constant/cosine
-    total_training_steps: -1 # must be override by program
-  model:
-    path: /PATH/TO/MODEL/
-    tokenizer_path: ${actor_rollout_ref.model.path}
-    override_config: { }
-    external_lib: ${actor_rollout_ref.model.external_lib}
-    enable_gradient_checkpointing: True
-    use_remove_padding: False
-  fsdp_config:
-    param_offload: False
-    optimizer_offload: False
-    wrap_policy:
-      # transformer_layer_cls_to_wrap: None
-      min_num_params: 0
-    fsdp_size: -1
-  ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
-  # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu
-  ppo_micro_batch_size_per_gpu: 64
-  forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
-  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
-  ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
-  forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
-  ulysses_sequence_parallel_size: 1 # sp size
-  ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
-  shuffle: ${actor_rollout_ref.actor.shuffle}
-  grad_clip: 1.0
-  cliprange_value: 0.5
-
-reward_model:
-  enable: False
-  strategy: fsdp
-  model:
-    input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical
-    path: ~/models/FsfairX-LLaMA3-RM-v0.1
-    external_lib: ${actor_rollout_ref.model.external_lib}
-    use_remove_padding: False
-    fsdp_config:
-      min_num_params: 0
-      param_offload: False
-      fsdp_size: -1
-  # micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu
-  # micro_batch_size_per_gpu: 2 # set a number
-  # max_length: null
-  ulysses_sequence_parallel_size: 1 # sp size
-  use_dynamic_bsz: ${critic.use_dynamic_bsz}
-  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}

 custom_reward_function:
   path: null
@@ -155,29 +55,17 @@ custom_reward_function:
 algorithm:
   gamma: 1.0
   lam: 1.0
-  adv_estimator: grpo
   kl_penalty: kl # how to estimate kl divergence
   kl_ctrl:
     type: fixed
     kl_coef: 0.001

 trainer:
   balance_batch: True
-  total_epochs: 10
   # total_training_steps: null
-  project_name: rft_example_gsm8k
-  experiment_name: cys-qwen2_1.5b_rollout8_grpo_kl0.001_lr1e-5
-  logger: [ 'console','wandb' ]
-  val_generations_to_log_to_wandb: 0
-  nnodes: 1
-  n_gpus_per_node: 2
-  save_freq: 100
   # auto: find the last ckpt to resume. If can't find, start from scratch
   resume_mode: auto # or auto or resume_path if
-  test_freq: 5
-  critic_warmup: 0
   default_hdfs_dir: null
   remove_previous_ckpt_in_save: False
   del_local_ckpt_after_load: False
-  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
   val_before_train: False
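
After these deletions, the example veRL training YAML keeps only what the diff shows as context: the `actor_rollout_ref` scaffolding, `custom_reward_function`, a trimmed `algorithm` block, and a trimmed `trainer` block, while the `data`, `rollout`, and `reward_model` sections, model paths, and run-naming fields disappear, presumably now supplied by the config manager at runtime. A condensed sketch of the remaining skeleton, reconstructed from the context lines only (indentation and the elided keys are assumptions):

```yaml
actor_rollout_ref:
  hybrid_engine: True
  model:
    external_lib: null
    override_config: { }
    enable_gradient_checkpointing: True
    use_remove_padding: True
  actor:
    strategy: fsdp
    ppo_mini_batch_size: 128
    ppo_micro_batch_size_per_gpu: 4
    use_dynamic_bsz: True
    ppo_max_token_len_per_gpu: 16384
  # ... further actor/ref FSDP settings are kept unchanged by this diff

custom_reward_function:
  path: null
  name: compute_score

algorithm:
  gamma: 1.0
  lam: 1.0
  kl_penalty: kl
  kl_ctrl:
    type: fixed
    kl_coef: 0.001

trainer:
  balance_batch: True
  resume_mode: auto
  default_hdfs_dir: null
  remove_previous_ckpt_in_save: False
  del_local_ckpt_after_load: False
  val_before_train: False
```
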
