diff --git a/docs/sphinx_doc/source/tutorial/trinity_configs.md b/docs/sphinx_doc/source/tutorial/trinity_configs.md index 255bb58018..fc599e206b 100644 --- a/docs/sphinx_doc/source/tutorial/trinity_configs.md +++ b/docs/sphinx_doc/source/tutorial/trinity_configs.md @@ -187,39 +187,30 @@ Support `nccl` and `checkpoint`, `nccl` represents that model weights in `explor ```yaml trainer: trainer_type: 'verl' - trainer_config_path: 'examples/ppo_countdown/train_countdown.yaml' save_interval: 100 + trainer_config_path: 'examples/ppo_countdown/train_countdown.yaml' ``` - `trainer.trainer_type`: The backend of the trainer, Only `verl` is supported. -- `trainer.trainer_config_path`: The path to the trainer configuration file. It must be set manually. - `trainer.save_interval`: The interval steps between two checkpoints. Default is `100`. +- `trainer.actor_grad_clip`: Gradient clipping value for actor model training. +- `trainer.actor_clip_ratio`: The clipping ratio used to compute the PPO policy loss. +- `trainer.actor_entropy_coef`: The entropy coefficient used to compute the policy loss. +- `trainer.actor_use_kl_loss`: Whether to enable the KL loss. +- `trainer.actor_kl_loss_coef`: The coefficient of the KL loss. + +- `trainer.trainer_config`: The configuration of the trainer. Only one of `trainer.trainer_config` and `trainer.trainer_config_path` needs to be set. +- `trainer.trainer_config_path`: The path to the trainer configuration file. + ### veRL Trainer Configuration Here we mainly introduce the parameters that can be set in veRL. For the specific meaning of the parameters, please refer to the official document of [veRL](https://github.com/volcengine/verl/blob/0bdf7f469854815177e73dcfe9e420836c952e6e/docs/examples/config.rst). ```yaml -data: - tokenizer: null - train_files: train_example.parquet - val_files: test_example.parquet - prompt_key: prompt - max_prompt_length: 256 - max_response_length: 1024 - train_batch_size: 256 - val_batch_size: null - return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs - return_raw_chat: False - shuffle: True - filter_overlong_prompts: False # for large-scale dataset, filtering overlong prompts could be timeconsuming.
You should disable this and set `truncation='left' - truncation: error - image_key: images - actor_rollout_ref: hybrid_engine: True model: - path: /PATH/TO/MODEL/CHECKPOINT/ external_lib: null override_config: { } enable_gradient_checkpointing: True @@ -270,35 +261,6 @@ actor_rollout_ref: log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size - rollout: - name: vllm - temperature: 1.0 - top_k: -1 # 0 for hf rollout, -1 for vllm rollout - top_p: 1 - use_fire_sampling: False # https://arxiv.org/abs/2410.21236 - prompt_length: ${data.max_prompt_length} # not use for opensource - response_length: ${data.max_response_length} - # for vllm rollout - dtype: bfloat16 # should align with FSDP - gpu_memory_utilization: 0.4 - ignore_eos: False - enforce_eager: True - free_cache_engine: True - load_format: dummy_dtensor - tensor_model_parallel_size: 2 - max_num_batched_tokens: 8192 - max_model_len: null - max_num_seqs: 1024 - # log_prob_micro_batch_size: 8 # will be deprecated, use log_prob_micro_batch_size_per_gpu - log_prob_micro_batch_size_per_gpu: 4 - log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} - disable_log_stats: True - enable_chunked_prefill: True # could get higher throughput - # for hf rollout - do_sample: True - # number of responses (i.e. num sample times) - n: 1 # > 1 for grpo critic: strategy: fsdp @@ -309,8 +271,6 @@ critic: warmup_style: constant # select from constant/cosine total_training_steps: -1 # must be override by program model: - path: /PATH/TO/MODEL/CHECKPOINT/ - tokenizer_path: ${actor_rollout_ref.model.path} override_config: { } external_lib: ${actor_rollout_ref.model.external_lib} enable_gradient_checkpointing: True @@ -323,7 +283,6 @@ critic: min_num_params: 0 fsdp_size: -1 ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} - # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu ppo_micro_batch_size_per_gpu: 8 forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} @@ -335,26 +294,6 @@ critic: grad_clip: 1.0 cliprange_value: 0.5 -reward_model: - enable: False - strategy: fsdp - model: - input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical - path: ~/models/FsfairX-LLaMA3-RM-v0.1 - external_lib: ${actor_rollout_ref.model.external_lib} - use_remove_padding: False - fsdp_config: - min_num_params: 0 - param_offload: False - fsdp_size: -1 - # micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu - # micro_batch_size_per_gpu: 2 # set a number - # max_length: null - ulysses_sequence_parallel_size: 1 # sp size - use_dynamic_bsz: ${critic.use_dynamic_bsz} - forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} - reward_manager: tinyzero - custom_reward_function: path: null name: compute_score @@ -362,7 +301,6 @@ custom_reward_function: algorithm: gamma: 1.0 lam: 1.0 - adv_estimator: gae norm_adv_by_std_in_grpo: True use_kl_in_reward: False kl_penalty: kl # how to estimate kl divergence @@ -374,24 +312,14 @@ algorithm: trainer: balance_batch: True - total_epochs: 15 # total_training_steps: null - project_name: TinyZero - experiment_name: trinity-qwen2.5-1.5b - logger: [ 
'wandb' ] - val_generations_to_log_to_wandb: 0 - nnodes: 1 - n_gpus_per_node: 2 - save_freq: 100 # auto: find the last ckpt to resume. If can't find, start from scratch resume_mode: auto # or auto or resume_path if resume_from_path: "" - test_freq: 100 critic_warmup: 0 default_hdfs_dir: null remove_previous_ckpt_in_save: False del_local_ckpt_after_load: False - default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} val_before_train: False max_actor_ckpt_to_keep: 5 max_critic_ckpt_to_keep: 5 @@ -402,11 +330,6 @@ trainer: - `actor_rollout_ref.model.use_remove_padding`: Whether to remove pad tokens, which will reduce training time. - `actor_rollout_ref.actor.use_dynamic_bsz`: Whether to reorganize the batch data, specifically to splice the shorter data to reduce the batch size in the actual training process. - `actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu`: Batch size for one GPU in one forward pass. -- `actor_rollout_ref.actor.grad_clip`: Gradient clip for actor model training. -- `actor_rollout_ref.actor.clip_ratio`: Used for compute policy loss. -- `actor_rollout_ref.actor.entropy_coeff`: Used for compute policy loss. -- `actor_rollout_ref.actor.use_kl_loss`: Whether to enable kl loss. -- `actor_rollout_ref.actor.kl_loss_coef`: The coefficient of kl loss. - `actor_rollout_ref.actor.kl_loss_type`: How to compute kl loss, optional value is `kl`, `abs`, `mse` or `low_var_kl`. - `actor_rollout_ref.actor.ulysses_sequence_parallel_size`: Ulysses sequence parallel size. - `actor_rollout_ref.actor.tau`: strength of regularization w.r.t. old / ref policy. diff --git a/examples/async_gsm8k/verl_config.yaml b/examples/async_gsm8k/verl_config.yaml index 268d61e0e5..de1b08f590 100644 --- a/examples/async_gsm8k/verl_config.yaml +++ b/examples/async_gsm8k/verl_config.yaml @@ -1,23 +1,6 @@ -data: - tokenizer: null - train_files: placeholder - val_files: placeholder - prompt_key: prompt - max_prompt_length: 256 - max_response_length: 1024 - train_batch_size: 256 - val_batch_size: null - return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs - return_raw_chat: False - shuffle: True - filter_overlong_prompts: False # for large-scale dataset, filtering overlong prompts could be timeconsuming. 
You should disable this and set `truncation='left' - truncation: error - image_key: images - actor_rollout_ref: hybrid_engine: True model: - path: /PATH/TO/MODEL/ external_lib: null override_config: { } enable_gradient_checkpointing: True @@ -25,7 +8,6 @@ actor_rollout_ref: actor: strategy: fsdp # This is for backward-compatibility ppo_mini_batch_size: 128 - # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu ppo_micro_batch_size_per_gpu: 4 use_dynamic_bsz: True # False ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length} @@ -61,92 +43,10 @@ actor_rollout_ref: wrap_policy: # transformer_layer_cls_to_wrap: None min_num_params: 0 - # log_prob_micro_batch_size: 4 # will be deprecated, use log_prob_micro_batch_size_per_gpu log_prob_micro_batch_size_per_gpu: 16 log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size - rollout: - name: vllm - temperature: 1.0 - use_fire_sampling: False # https://arxiv.org/abs/2410.21236 - prompt_length: ${data.max_prompt_length} # not use for opensource - response_length: ${data.max_response_length} - # for vllm rollout - dtype: bfloat16 # should align with FSDP - gpu_memory_utilization: 0.4 - ignore_eos: False - enforce_eager: True - free_cache_engine: True - load_format: dummy_dtensor - tensor_model_parallel_size: 2 - max_num_batched_tokens: 8192 - max_model_len: null - max_num_seqs: 1024 - # log_prob_micro_batch_size: 8 # will be deprecated, use log_prob_micro_batch_size_per_gpu - log_prob_micro_batch_size_per_gpu: 4 - log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} - disable_log_stats: True - enable_chunked_prefill: True # could get higher throughput - # for hf rollout - do_sample: True - # number of responses (i.e. num sample times) - n: 8 # > 1 for grpo - -critic: - strategy: fsdp - optim: - lr: 1e-5 - lr_warmup_steps_ratio: 0. 
# the total steps will be injected during runtime - # min_lr_ratio: null # only useful for warmup with cosine - warmup_style: constant # select from constant/cosine - total_training_steps: -1 # must be override by program - model: - path: /PATH/TO/MODEL/ - tokenizer_path: ${actor_rollout_ref.model.path} - override_config: { } - external_lib: ${actor_rollout_ref.model.external_lib} - enable_gradient_checkpointing: True - use_remove_padding: False - fsdp_config: - param_offload: False - optimizer_offload: False - wrap_policy: - # transformer_layer_cls_to_wrap: None - min_num_params: 0 - fsdp_size: -1 - ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} - # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu - ppo_micro_batch_size_per_gpu: 64 - forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} - use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 - forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} - ulysses_sequence_parallel_size: 1 # sp size - ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} - shuffle: ${actor_rollout_ref.actor.shuffle} - grad_clip: 1.0 - cliprange_value: 0.5 - -reward_model: - enable: False - strategy: fsdp - model: - input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical - path: ~/models/FsfairX-LLaMA3-RM-v0.1 - external_lib: ${actor_rollout_ref.model.external_lib} - use_remove_padding: False - fsdp_config: - min_num_params: 0 - param_offload: False - fsdp_size: -1 - # micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu - # micro_batch_size_per_gpu: 2 # set a number - # max_length: null - ulysses_sequence_parallel_size: 1 # sp size - use_dynamic_bsz: ${critic.use_dynamic_bsz} - forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} custom_reward_function: path: null @@ -155,7 +55,6 @@ custom_reward_function: algorithm: gamma: 1.0 lam: 1.0 - adv_estimator: grpo kl_penalty: kl # how to estimate kl divergence kl_ctrl: type: fixed @@ -163,21 +62,10 @@ algorithm: trainer: balance_batch: True - total_epochs: 10 # total_training_steps: null - project_name: rft_example_gsm8k - experiment_name: cys-qwen2_1.5b_rollout8_grpo_kl0.001_lr1e-5 - logger: [ 'console','wandb' ] - val_generations_to_log_to_wandb: 0 - nnodes: 1 - n_gpus_per_node: 2 - save_freq: 100 # auto: find the last ckpt to resume. If can't find, start from scratch resume_mode: auto # or auto or resume_path if - test_freq: 5 - critic_warmup: 0 default_hdfs_dir: null remove_previous_ckpt_in_save: False del_local_ckpt_after_load: False - default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} val_before_train: False diff --git a/examples/dpo_humanlike/train_dpo.yaml b/examples/dpo_humanlike/train_dpo.yaml index 09327877f9..8ffc68b397 100644 --- a/examples/dpo_humanlike/train_dpo.yaml +++ b/examples/dpo_humanlike/train_dpo.yaml @@ -1,23 +1,6 @@ -data: - tokenizer: null - train_files: /train.parquet # useless - val_files: /test.parquet # useless - prompt_key: prompt - max_prompt_length: 1792 - max_response_length: 256 - train_batch_size: 32 - val_batch_size: null - return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs - return_raw_chat: False - shuffle: True - filter_overlong_prompts: False # for large-scale dataset, filtering overlong prompts could be timeconsuming. 
You should disable this and set `truncation='left' - truncation: error - image_key: images - actor_rollout_ref: hybrid_engine: True model: - path: /PATH/TO/MODEL/CHECKPOINT/ external_lib: null override_config: { } enable_gradient_checkpointing: True @@ -25,7 +8,6 @@ actor_rollout_ref: actor: strategy: fsdp # This is for backward-compatibility ppo_mini_batch_size: 32 - # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu ppo_micro_batch_size_per_gpu: 2 # NOTE use_dynamic_bsz: False ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length} @@ -64,87 +46,6 @@ actor_rollout_ref: log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size - rollout: - name: vllm - temperature: 1.0 - top_k: -1 # 0 for hf rollout, -1 for vllm rollout - top_p: 1 - use_fire_sampling: False # https://arxiv.org/abs/2410.21236 - prompt_length: ${data.max_prompt_length} # not use for opensource - response_length: ${data.max_response_length} - # for vllm rollout - dtype: bfloat16 # should align with FSDP - gpu_memory_utilization: 0.4 - ignore_eos: False - enforce_eager: True - free_cache_engine: True - load_format: dummy_dtensor - tensor_model_parallel_size: 2 - max_num_batched_tokens: 8192 - max_model_len: null - max_num_seqs: 1024 - # log_prob_micro_batch_size: 8 # will be deprecated, use log_prob_micro_batch_size_per_gpu - log_prob_micro_batch_size_per_gpu: 4 - log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} - disable_log_stats: True - enable_chunked_prefill: True # could get higher throughput - # for hf rollout - do_sample: True - -critic: - strategy: fsdp - optim: - lr: 1e-5 - lr_warmup_steps_ratio: 0. 
# the total steps will be injected during runtime - # min_lr_ratio: null # only useful for warmup with cosine - warmup_style: constant # select from constant/cosine - total_training_steps: 783 # must be override by program - model: - path: /PATH/TO/MODEL/CHECKPOINT/ - tokenizer_path: ${actor_rollout_ref.model.path} - override_config: { } - external_lib: ${actor_rollout_ref.model.external_lib} - enable_gradient_checkpointing: True - use_remove_padding: False - fsdp_config: - param_offload: False - optimizer_offload: False - wrap_policy: - # transformer_layer_cls_to_wrap: None - min_num_params: 0 - fsdp_size: -1 - ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} - # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu - ppo_micro_batch_size_per_gpu: 1 - forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} - use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 - forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} - ulysses_sequence_parallel_size: 1 # sp size - ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} - shuffle: ${actor_rollout_ref.actor.shuffle} - grad_clip: 1.0 - cliprange_value: 0.5 - -reward_model: - enable: False - strategy: fsdp - model: - input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical - path: ~/models/FsfairX-LLaMA3-RM-v0.1 - external_lib: ${actor_rollout_ref.model.external_lib} - use_remove_padding: False - fsdp_config: - min_num_params: 0 - param_offload: False - fsdp_size: -1 - # micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu - # micro_batch_size_per_gpu: 2 # set a number - # max_length: null - ulysses_sequence_parallel_size: 1 # sp size - use_dynamic_bsz: ${critic.use_dynamic_bsz} - forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} custom_reward_function: path: null @@ -153,7 +54,6 @@ custom_reward_function: algorithm: gamma: 1.0 lam: 1.0 - adv_estimator: grpo kl_penalty: kl kl_ctrl: type: fixed @@ -161,20 +61,10 @@ algorithm: trainer: balance_batch: False - total_epochs: 1 # total_training_steps: 783 # - project_name: dpo_example - experiment_name: trinity_dpo - logger: [ 'console','wandb' ] - val_generations_to_log_to_wandb: 0 - nnodes: 1 - n_gpus_per_node: 2 # auto: find the last ckpt to resume. If can't find, start from scratch resume_mode: auto # or auto or resume_path if - test_freq: 5 - critic_warmup: 0 default_hdfs_dir: null remove_previous_ckpt_in_save: False del_local_ckpt_after_load: False - default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} val_before_train: False diff --git a/examples/grpo_alfworld/train_alfworld.yaml b/examples/grpo_alfworld/train_alfworld.yaml index a210c39916..215b1817ab 100644 --- a/examples/grpo_alfworld/train_alfworld.yaml +++ b/examples/grpo_alfworld/train_alfworld.yaml @@ -1,23 +1,6 @@ -data: - tokenizer: null - train_files: train_example.parquet - val_files: test_example.parquet - prompt_key: prompt - max_prompt_length: 4096 - max_response_length: 16384 - train_batch_size: 96 - val_batch_size: null - return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs - return_raw_chat: False - shuffle: True - filter_overlong_prompts: False # for large-scale dataset, filtering overlong prompts could be timeconsuming. 
You should disable this and set `truncation='left' - truncation: error - image_key: images - actor_rollout_ref: hybrid_engine: True model: - path: /PATH/TO/MODEL/CHECKPOINT/ external_lib: null override_config: { } enable_gradient_checkpointing: True @@ -25,7 +8,6 @@ actor_rollout_ref: actor: strategy: fsdp # This is for backward-compatibility ppo_mini_batch_size: 1536 - # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu ppo_micro_batch_size_per_gpu: 1 use_dynamic_bsz: False ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length} @@ -57,94 +39,10 @@ actor_rollout_ref: wrap_policy: # transformer_layer_cls_to_wrap: None min_num_params: 0 - # log_prob_micro_batch_size: 4 # will be deprecated, use log_prob_micro_batch_size_per_gpu log_prob_micro_batch_size_per_gpu: 1 log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size - rollout: - name: vllm - temperature: 1.0 - top_k: -1 # 0 for hf rollout, -1 for vllm rollout - top_p: 1 - use_fire_sampling: False # https://arxiv.org/abs/2410.21236 - prompt_length: ${data.max_prompt_length} # not use for opensource - response_length: ${data.max_response_length} - # for vllm rollout - dtype: bfloat16 # should align with FSDP - gpu_memory_utilization: 0.4 - ignore_eos: False - enforce_eager: True - free_cache_engine: True - load_format: dummy_dtensor - tensor_model_parallel_size: 1 - max_num_batched_tokens: 8192 - max_model_len: null - max_num_seqs: 1024 - # log_prob_micro_batch_size: 8 # will be deprecated, use log_prob_micro_batch_size_per_gpu - log_prob_micro_batch_size_per_gpu: 1 - log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} - disable_log_stats: True - enable_chunked_prefill: True # could get higher throughput - # for hf rollout - do_sample: True - # number of responses (i.e. num sample times) - n: 8 # should be > 1 for grpo; Currently is unused parameter - -critic: - strategy: fsdp - optim: - lr: 1e-5 - lr_warmup_steps_ratio: 0. 
# the total steps will be injected during runtime - # min_lr_ratio: null # only useful for warmup with cosine - warmup_style: constant # select from constant/cosine - total_training_steps: -1 # must be override by program - model: - path: /PATH/TO/MODEL/CHECKPOINT/ - tokenizer_path: ${actor_rollout_ref.model.path} - override_config: { } - external_lib: ${actor_rollout_ref.model.external_lib} - enable_gradient_checkpointing: True - use_remove_padding: False - fsdp_config: - param_offload: False - optimizer_offload: False - wrap_policy: - # transformer_layer_cls_to_wrap: None - min_num_params: 0 - fsdp_size: -1 - ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} - # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu - ppo_micro_batch_size_per_gpu: 1 - forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} - use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - ppo_max_token_len_per_gpu: 16384 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 - forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} - ulysses_sequence_parallel_size: 1 # sp size - ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} - shuffle: ${actor_rollout_ref.actor.shuffle} - grad_clip: 1.0 - cliprange_value: 0.5 - -reward_model: - enable: False - strategy: fsdp - model: - input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical - path: ~/models/FsfairX-LLaMA3-RM-v0.1 - external_lib: ${actor_rollout_ref.model.external_lib} - use_remove_padding: False - fsdp_config: - min_num_params: 0 - param_offload: False - fsdp_size: -1 - # micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu - # micro_batch_size_per_gpu: 2 # set a number - # max_length: null - ulysses_sequence_parallel_size: 1 # sp size - use_dynamic_bsz: ${critic.use_dynamic_bsz} - forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} custom_reward_function: path: null @@ -153,7 +51,6 @@ custom_reward_function: algorithm: gamma: 1.0 lam: 1.0 - adv_estimator: grpo kl_penalty: kl # how to estimate kl divergence kl_ctrl: type: fixed @@ -161,20 +58,10 @@ algorithm: trainer: balance_batch: True - total_epochs: 15 # total_training_steps: null - project_name: ALFWORLD - experiment_name: ALFWORLD_RFT - logger: [ 'wandb' ] - val_generations_to_log_to_wandb: 0 - nnodes: 1 - n_gpus_per_node: 2 # auto: find the last ckpt to resume. If can't find, start from scratch resume_mode: auto # or auto or resume_path if - test_freq: 100 - critic_warmup: 0 default_hdfs_dir: null remove_previous_ckpt_in_save: False del_local_ckpt_after_load: False - default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} val_before_train: False diff --git a/examples/grpo_gsm8k/train_gsm8k.yaml b/examples/grpo_gsm8k/train_gsm8k.yaml index 13b195f557..de1b08f590 100644 --- a/examples/grpo_gsm8k/train_gsm8k.yaml +++ b/examples/grpo_gsm8k/train_gsm8k.yaml @@ -1,23 +1,6 @@ -data: - tokenizer: null - train_files: train_example.parquet - val_files: test_example.parquet - prompt_key: prompt - max_prompt_length: 256 - max_response_length: 1024 - train_batch_size: 256 - val_batch_size: null - return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs - return_raw_chat: False - shuffle: True - filter_overlong_prompts: False # for large-scale dataset, filtering overlong prompts could be timeconsuming. 
You should disable this and set `truncation='left' - truncation: error - image_key: images - actor_rollout_ref: hybrid_engine: True model: - path: /PATH/TO/MODEL/CHECKPOINT/ external_lib: null override_config: { } enable_gradient_checkpointing: True @@ -25,7 +8,6 @@ actor_rollout_ref: actor: strategy: fsdp # This is for backward-compatibility ppo_mini_batch_size: 128 - # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu ppo_micro_batch_size_per_gpu: 4 use_dynamic_bsz: True # False ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length} @@ -61,94 +43,10 @@ actor_rollout_ref: wrap_policy: # transformer_layer_cls_to_wrap: None min_num_params: 0 - # log_prob_micro_batch_size: 4 # will be deprecated, use log_prob_micro_batch_size_per_gpu log_prob_micro_batch_size_per_gpu: 16 log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size - rollout: - name: vllm - temperature: 1.0 - top_k: -1 # 0 for hf rollout, -1 for vllm rollout - top_p: 1 - use_fire_sampling: False # https://arxiv.org/abs/2410.21236 - prompt_length: ${data.max_prompt_length} # not use for opensource - response_length: ${data.max_response_length} - # for vllm rollout - dtype: bfloat16 # should align with FSDP - gpu_memory_utilization: 0.4 - ignore_eos: False - enforce_eager: True - free_cache_engine: True - load_format: dummy_dtensor - tensor_model_parallel_size: 2 - max_num_batched_tokens: 8192 - max_model_len: null - max_num_seqs: 1024 - # log_prob_micro_batch_size: 8 # will be deprecated, use log_prob_micro_batch_size_per_gpu - log_prob_micro_batch_size_per_gpu: 4 - log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} - disable_log_stats: True - enable_chunked_prefill: True # could get higher throughput - # for hf rollout - do_sample: True - # number of responses (i.e. num sample times) - n: 8 # > 1 for grpo - -critic: - strategy: fsdp - optim: - lr: 1e-5 - lr_warmup_steps_ratio: 0. 
# the total steps will be injected during runtime - # min_lr_ratio: null # only useful for warmup with cosine - warmup_style: constant # select from constant/cosine - total_training_steps: -1 # must be override by program - model: - path: /PATH/TO/MODEL/CHECKPOINT/ - tokenizer_path: ${actor_rollout_ref.model.path} - override_config: { } - external_lib: ${actor_rollout_ref.model.external_lib} - enable_gradient_checkpointing: True - use_remove_padding: False - fsdp_config: - param_offload: False - optimizer_offload: False - wrap_policy: - # transformer_layer_cls_to_wrap: None - min_num_params: 0 - fsdp_size: -1 - ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} - # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu - ppo_micro_batch_size_per_gpu: 64 - forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} - use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 - forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} - ulysses_sequence_parallel_size: 1 # sp size - ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} - shuffle: ${actor_rollout_ref.actor.shuffle} - grad_clip: 1.0 - cliprange_value: 0.5 - -reward_model: - enable: False - strategy: fsdp - model: - input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical - path: ~/models/FsfairX-LLaMA3-RM-v0.1 - external_lib: ${actor_rollout_ref.model.external_lib} - use_remove_padding: False - fsdp_config: - min_num_params: 0 - param_offload: False - fsdp_size: -1 - # micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu - # micro_batch_size_per_gpu: 2 # set a number - # max_length: null - ulysses_sequence_parallel_size: 1 # sp size - use_dynamic_bsz: ${critic.use_dynamic_bsz} - forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} custom_reward_function: path: null @@ -157,7 +55,6 @@ custom_reward_function: algorithm: gamma: 1.0 lam: 1.0 - adv_estimator: grpo kl_penalty: kl # how to estimate kl divergence kl_ctrl: type: fixed @@ -165,20 +62,10 @@ algorithm: trainer: balance_batch: True - total_epochs: 10 # total_training_steps: null - project_name: rft_example_gsm8k - experiment_name: cys-qwen2_1.5b_rollout8_grpo_kl0.001_lr1e-5 - logger: [ 'console','wandb' ] - val_generations_to_log_to_wandb: 0 - nnodes: 1 - n_gpus_per_node: 2 # auto: find the last ckpt to resume. If can't find, start from scratch resume_mode: auto # or auto or resume_path if - test_freq: 5 - critic_warmup: 0 default_hdfs_dir: null remove_previous_ckpt_in_save: False del_local_ckpt_after_load: False - default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} val_before_train: False diff --git a/examples/grpo_math/train_math.yaml b/examples/grpo_math/train_math.yaml index 2482ccc785..78bcb862c6 100644 --- a/examples/grpo_math/train_math.yaml +++ b/examples/grpo_math/train_math.yaml @@ -1,23 +1,6 @@ -data: - tokenizer: null - train_files: train_example.parquet - val_files: test_example.parquet - prompt_key: prompt - max_prompt_length: 1024 - max_response_length: 2048 - # train_batch_size: 256 - val_batch_size: null - return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs - return_raw_chat: False - shuffle: True - filter_overlong_prompts: False # for large-scale dataset, filtering overlong prompts could be timeconsuming. 
You should disable this and set `truncation='left' - truncation: error - image_key: images - actor_rollout_ref: hybrid_engine: True model: - path: /PATH/TO/MODEL/ external_lib: null override_config: { } enable_gradient_checkpointing: True @@ -64,84 +47,6 @@ actor_rollout_ref: log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size - rollout: - name: vllm - temperature: 1.0 - top_k: -1 # 0 for hf rollout, -1 for vllm rollout - top_p: 1 - use_fire_sampling: False # https://arxiv.org/abs/2410.21236 - prompt_length: ${data.max_prompt_length} # not use for opensource - response_length: ${data.max_response_length} - # for vllm rollout - dtype: bfloat16 # should align with FSDP - gpu_memory_utilization: 0.4 - ignore_eos: False - enforce_eager: True - free_cache_engine: True - load_format: dummy_dtensor - tensor_model_parallel_size: 2 - max_num_batched_tokens: 8192 - max_model_len: null - max_num_seqs: 1024 - log_prob_micro_batch_size_per_gpu: 4 - log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} - disable_log_stats: True - enable_chunked_prefill: True # could get higher throughput - # for hf rollout - do_sample: True - # number of responses (i.e. num sample times) - n: 8 # > 1 for grpo - -critic: - strategy: fsdp - optim: - lr: 1e-5 - lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime - # min_lr_ratio: null # only useful for warmup with cosine - warmup_style: constant # select from constant/cosine - total_training_steps: -1 # must be override by program - model: - path: /PATH/TO/MODEL/ - tokenizer_path: ${actor_rollout_ref.model.path} - override_config: { } - external_lib: ${actor_rollout_ref.model.external_lib} - enable_gradient_checkpointing: True - use_remove_padding: False - fsdp_config: - param_offload: False - optimizer_offload: False - wrap_policy: - # transformer_layer_cls_to_wrap: None - min_num_params: 0 - fsdp_size: -1 - ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} - ppo_micro_batch_size_per_gpu: 64 - forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} - use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 - forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} - ulysses_sequence_parallel_size: 1 # sp size - ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} - shuffle: ${actor_rollout_ref.actor.shuffle} - grad_clip: 1.0 - cliprange_value: 0.5 - -reward_model: - enable: False - strategy: fsdp - model: - input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical - path: ~/models/FsfairX-LLaMA3-RM-v0.1 - external_lib: ${actor_rollout_ref.model.external_lib} - use_remove_padding: False - fsdp_config: - min_num_params: 0 - param_offload: False - fsdp_size: -1 - ulysses_sequence_parallel_size: 1 # sp size - use_dynamic_bsz: ${critic.use_dynamic_bsz} - forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} custom_reward_function: path: null @@ -150,7 +55,6 @@ custom_reward_function: algorithm: gamma: 1.0 lam: 1.0 - adv_estimator: grpo kl_penalty: kl # how to estimate kl divergence kl_ctrl: type: fixed @@ -158,19 +62,9 @@ algorithm: trainer: balance_batch: 
True - total_epochs: 20 - project_name: grpo_math - experiment_name: grpo_math_example - logger: [ 'console','wandb' ] - val_generations_to_log_to_wandb: 0 - nnodes: 1 - n_gpus_per_node: 2 # auto: find the last ckpt to resume. If can't find, start from scratch resume_mode: auto # or auto or resume_path if - test_freq: 5 - critic_warmup: 0 default_hdfs_dir: null remove_previous_ckpt_in_save: False del_local_ckpt_after_load: False - default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} val_before_train: False diff --git a/examples/grpo_sciworld/train_sciworld.yaml b/examples/grpo_sciworld/train_sciworld.yaml index 833441142c..215b1817ab 100644 --- a/examples/grpo_sciworld/train_sciworld.yaml +++ b/examples/grpo_sciworld/train_sciworld.yaml @@ -1,23 +1,6 @@ -data: - tokenizer: null - train_files: train_example.parquet - val_files: test_example.parquet - prompt_key: prompt - max_prompt_length: 4096 - max_response_length: 16384 - train_batch_size: 96 - val_batch_size: null - return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs - return_raw_chat: False - shuffle: True - filter_overlong_prompts: False # for large-scale dataset, filtering overlong prompts could be timeconsuming. You should disable this and set `truncation='left' - truncation: error - image_key: images - actor_rollout_ref: hybrid_engine: True model: - path: /PATH/TO/MODEL/CHECKPOINT/ external_lib: null override_config: { } enable_gradient_checkpointing: True @@ -60,86 +43,6 @@ actor_rollout_ref: log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size - rollout: - name: vllm - temperature: 1.0 - top_k: -1 # 0 for hf rollout, -1 for vllm rollout - top_p: 1 - use_fire_sampling: False # https://arxiv.org/abs/2410.21236 - prompt_length: ${data.max_prompt_length} # not use for opensource - response_length: ${data.max_response_length} - # for vllm rollout - dtype: bfloat16 # should align with FSDP - gpu_memory_utilization: 0.4 - ignore_eos: False - enforce_eager: True - free_cache_engine: True - load_format: dummy_dtensor - tensor_model_parallel_size: 1 - max_num_batched_tokens: 8192 - max_model_len: null - max_num_seqs: 1024 - log_prob_micro_batch_size_per_gpu: 1 - log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} - disable_log_stats: True - enable_chunked_prefill: True # could get higher throughput - # for hf rollout - do_sample: True - # number of responses (i.e. num sample times) - n: 8 # should be > 1 for grpo; Currently is unused parameter - -critic: - strategy: fsdp - optim: - lr: 1e-5 - lr_warmup_steps_ratio: 0. 
# the total steps will be injected during runtime - # min_lr_ratio: null # only useful for warmup with cosine - warmup_style: constant # select from constant/cosine - total_training_steps: -1 # must be override by program - model: - path: /PATH/TO/MODEL/CHECKPOINT/ - tokenizer_path: ${actor_rollout_ref.model.path} - override_config: { } - external_lib: ${actor_rollout_ref.model.external_lib} - enable_gradient_checkpointing: True - use_remove_padding: False - fsdp_config: - param_offload: False - optimizer_offload: False - wrap_policy: - # transformer_layer_cls_to_wrap: None - min_num_params: 0 - fsdp_size: -1 - ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} - ppo_micro_batch_size_per_gpu: 1 - forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} - use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - ppo_max_token_len_per_gpu: 16384 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 - forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} - ulysses_sequence_parallel_size: 1 # sp size - ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} - shuffle: ${actor_rollout_ref.actor.shuffle} - grad_clip: 1.0 - cliprange_value: 0.5 - -reward_model: - enable: False - strategy: fsdp - model: - input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical - path: ~/models/FsfairX-LLaMA3-RM-v0.1 - external_lib: ${actor_rollout_ref.model.external_lib} - use_remove_padding: False - fsdp_config: - min_num_params: 0 - param_offload: False - fsdp_size: -1 - # micro_batch_size_per_gpu: 2 # set a number - # max_length: null - ulysses_sequence_parallel_size: 1 # sp size - use_dynamic_bsz: ${critic.use_dynamic_bsz} - forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} custom_reward_function: path: null @@ -148,7 +51,6 @@ custom_reward_function: algorithm: gamma: 1.0 lam: 1.0 - adv_estimator: grpo kl_penalty: kl # how to estimate kl divergence kl_ctrl: type: fixed @@ -156,20 +58,10 @@ algorithm: trainer: balance_batch: True - total_epochs: 15 # total_training_steps: null - project_name: sciworld - experiment_name: sciworld_RFT - logger: [ 'wandb' ] - val_generations_to_log_to_wandb: 0 - nnodes: 1 - n_gpus_per_node: 2 # auto: find the last ckpt to resume. If can't find, start from scratch resume_mode: auto # or auto or resume_path if - test_freq: 100 - critic_warmup: 0 default_hdfs_dir: null remove_previous_ckpt_in_save: False del_local_ckpt_after_load: False - default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} val_before_train: False diff --git a/examples/grpo_webshop/train_webshop.yaml b/examples/grpo_webshop/train_webshop.yaml index ac502fec3f..215b1817ab 100644 --- a/examples/grpo_webshop/train_webshop.yaml +++ b/examples/grpo_webshop/train_webshop.yaml @@ -1,23 +1,6 @@ -data: - tokenizer: null - train_files: train_example.parquet - val_files: test_example.parquet - prompt_key: prompt - max_prompt_length: 4096 - max_response_length: 16384 - train_batch_size: 96 - val_batch_size: null - return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs - return_raw_chat: False - shuffle: True - filter_overlong_prompts: False # for large-scale dataset, filtering overlong prompts could be timeconsuming. 
You should disable this and set `truncation='left' - truncation: error - image_key: images - actor_rollout_ref: hybrid_engine: True model: - path: /PATH/TO/MODEL/CHECKPOINT/ external_lib: null override_config: { } enable_gradient_checkpointing: True @@ -25,7 +8,6 @@ actor_rollout_ref: actor: strategy: fsdp # This is for backward-compatibility ppo_mini_batch_size: 1536 - # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu ppo_micro_batch_size_per_gpu: 1 use_dynamic_bsz: False ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length} @@ -57,94 +39,10 @@ actor_rollout_ref: wrap_policy: # transformer_layer_cls_to_wrap: None min_num_params: 0 - # log_prob_micro_batch_size: 4 # will be deprecated, use log_prob_micro_batch_size_per_gpu log_prob_micro_batch_size_per_gpu: 1 log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size - rollout: - name: vllm - temperature: 1.0 - top_k: -1 # 0 for hf rollout, -1 for vllm rollout - top_p: 1 - use_fire_sampling: False # https://arxiv.org/abs/2410.21236 - prompt_length: ${data.max_prompt_length} # not use for opensource - response_length: ${data.max_response_length} - # for vllm rollout - dtype: bfloat16 # should align with FSDP - gpu_memory_utilization: 0.4 - ignore_eos: False - enforce_eager: True - free_cache_engine: True - load_format: dummy_dtensor - tensor_model_parallel_size: 1 - max_num_batched_tokens: 8192 - max_model_len: null - max_num_seqs: 1024 - # log_prob_micro_batch_size: 8 # will be deprecated, use log_prob_micro_batch_size_per_gpu - log_prob_micro_batch_size_per_gpu: 1 - log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} - disable_log_stats: True - enable_chunked_prefill: True # could get higher throughput - # for hf rollout - do_sample: True - # number of responses (i.e. num sample times) - n: 8 # should be > 1 for grpo; Currently is unused parameter - -critic: - strategy: fsdp - optim: - lr: 1e-5 - lr_warmup_steps_ratio: 0. 
# the total steps will be injected during runtime - # min_lr_ratio: null # only useful for warmup with cosine - warmup_style: constant # select from constant/cosine - total_training_steps: -1 # must be override by program - model: - path: /PATH/TO/MODEL/CHECKPOINT/ - tokenizer_path: ${actor_rollout_ref.model.path} - override_config: { } - external_lib: ${actor_rollout_ref.model.external_lib} - enable_gradient_checkpointing: True - use_remove_padding: False - fsdp_config: - param_offload: False - optimizer_offload: False - wrap_policy: - # transformer_layer_cls_to_wrap: None - min_num_params: 0 - fsdp_size: -1 - ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} - # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu - ppo_micro_batch_size_per_gpu: 1 - forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} - use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - ppo_max_token_len_per_gpu: 16384 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 - forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} - ulysses_sequence_parallel_size: 1 # sp size - ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} - shuffle: ${actor_rollout_ref.actor.shuffle} - grad_clip: 1.0 - cliprange_value: 0.5 - -reward_model: - enable: False - strategy: fsdp - model: - input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical - path: ~/models/FsfairX-LLaMA3-RM-v0.1 - external_lib: ${actor_rollout_ref.model.external_lib} - use_remove_padding: False - fsdp_config: - min_num_params: 0 - param_offload: False - fsdp_size: -1 - # micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu - # micro_batch_size_per_gpu: 2 # set a number - # max_length: null - ulysses_sequence_parallel_size: 1 # sp size - use_dynamic_bsz: ${critic.use_dynamic_bsz} - forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} custom_reward_function: path: null @@ -153,7 +51,6 @@ custom_reward_function: algorithm: gamma: 1.0 lam: 1.0 - adv_estimator: grpo kl_penalty: kl # how to estimate kl divergence kl_ctrl: type: fixed @@ -161,20 +58,10 @@ algorithm: trainer: balance_batch: True - total_epochs: 15 # total_training_steps: null - project_name: WEBSHOP - experiment_name: WEBSHOP_RFT - logger: [ 'wandb' ] - val_generations_to_log_to_wandb: 0 - nnodes: 1 - n_gpus_per_node: 2 # auto: find the last ckpt to resume. 
If can't find, start from scratch resume_mode: auto # or auto or resume_path if - test_freq: 100 - critic_warmup: 0 default_hdfs_dir: null remove_previous_ckpt_in_save: False del_local_ckpt_after_load: False - default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} val_before_train: False diff --git a/examples/opmd_gsm8k/train_opmd_gsm8k.yaml b/examples/opmd_gsm8k/train_opmd_gsm8k.yaml index 88f92fb461..326904d987 100644 --- a/examples/opmd_gsm8k/train_opmd_gsm8k.yaml +++ b/examples/opmd_gsm8k/train_opmd_gsm8k.yaml @@ -22,26 +22,9 @@ # adv_estimator: grpo # merely to disable critic model, doesn't affect adv compute when algorithm_type is opmd -data: - tokenizer: null - train_files: /train.jsonl - val_files: /test.jsonl - prompt_key: prompt - max_prompt_length: 256 - max_response_length: 1024 - train_batch_size: 256 - val_batch_size: null - return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs - return_raw_chat: False - shuffle: True - filter_overlong_prompts: False # for large-scale dataset, filtering overlong prompts could be timeconsuming. You should disable this and set `truncation='left' - truncation: error - image_key: images - actor_rollout_ref: hybrid_engine: True model: - path: path_to_models/Qwen2.5-1.5B-Inst external_lib: null override_config: { } enable_gradient_checkpointing: True @@ -49,7 +32,6 @@ actor_rollout_ref: actor: strategy: fsdp # This is for backward-compatibility ppo_mini_batch_size: 128 - # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu ppo_micro_batch_size_per_gpu: 4 use_dynamic_bsz: True ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length} @@ -87,94 +69,10 @@ actor_rollout_ref: wrap_policy: # transformer_layer_cls_to_wrap: None min_num_params: 0 - # log_prob_micro_batch_size: 4 # will be deprecated, use log_prob_micro_batch_size_per_gpu log_prob_micro_batch_size_per_gpu: 16 log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size - rollout: - name: vllm - temperature: 1.0 - top_k: -1 # 0 for hf rollout, -1 for vllm rollout - top_p: 1 - use_fire_sampling: False # https://arxiv.org/abs/2410.21236 - prompt_length: ${data.max_prompt_length} # not use for opensource - response_length: ${data.max_response_length} - # for vllm rollout - dtype: bfloat16 # should align with FSDP - gpu_memory_utilization: 0.4 - ignore_eos: False - enforce_eager: True - free_cache_engine: True - load_format: dummy_dtensor - tensor_model_parallel_size: 2 - max_num_batched_tokens: 8192 - max_model_len: null - max_num_seqs: 1024 - # log_prob_micro_batch_size: 8 # will be deprecated, use log_prob_micro_batch_size_per_gpu - log_prob_micro_batch_size_per_gpu: 4 - log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} - disable_log_stats: True - enable_chunked_prefill: True # could get higher throughput - # for hf rollout - do_sample: True - # number of responses (i.e. num sample times) - n: 8 # > 1 for grpo - -critic: - strategy: fsdp - optim: - lr: 1e-5 - lr_warmup_steps_ratio: 0. 
# the total steps will be injected during runtime - # min_lr_ratio: null # only useful for warmup with cosine - warmup_style: constant # select from constant/cosine - total_training_steps: -1 # must be override by program - model: - path: path_to_models/Qwen2.5-1.5B-Inst - tokenizer_path: ${actor_rollout_ref.model.path} - override_config: { } - external_lib: ${actor_rollout_ref.model.external_lib} - enable_gradient_checkpointing: True - use_remove_padding: False - fsdp_config: - param_offload: False - optimizer_offload: False - wrap_policy: - # transformer_layer_cls_to_wrap: None - min_num_params: 0 - fsdp_size: -1 - ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} - # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu - ppo_micro_batch_size_per_gpu: 64 - forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} - use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 - forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} - ulysses_sequence_parallel_size: 1 # sp size - ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} - shuffle: ${actor_rollout_ref.actor.shuffle} - grad_clip: 1.0 - cliprange_value: 0.5 - -reward_model: - enable: False - strategy: fsdp - model: - input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical - path: ~/models/FsfairX-LLaMA3-RM-v0.1 - external_lib: ${actor_rollout_ref.model.external_lib} - use_remove_padding: False - fsdp_config: - min_num_params: 0 - param_offload: False - fsdp_size: -1 - # micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu - # micro_batch_size_per_gpu: 2 # set a number - # max_length: null - ulysses_sequence_parallel_size: 1 # sp size - use_dynamic_bsz: ${critic.use_dynamic_bsz} - forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} custom_reward_function: path: null @@ -183,7 +81,6 @@ custom_reward_function: algorithm: gamma: 1.0 lam: 1.0 - adv_estimator: grpo kl_penalty: kl # how to estimate kl divergence kl_ctrl: type: fixed @@ -191,22 +88,12 @@ algorithm: trainer: balance_batch: True - total_epochs: 10 # total_training_steps: null - project_name: Trinity-RFT-gsm8k-test-opmd - experiment_name: qwen2.5-1.5B-gsm8k-opmd-kl_0.001-entropy_0-tau_4-beta1_0.0-beta2_0.95-lr_2e-6-sync10 - logger: [ 'console','wandb' ] - val_generations_to_log_to_wandb: 0 - nnodes: 1 - n_gpus_per_node: 2 # auto: find the last ckpt to resume. 
If can't find, start from scratch resume_mode: auto # or auto or resume_path if - test_freq: 100 - critic_warmup: 0 default_hdfs_dir: null remove_previous_ckpt_in_save: False del_local_ckpt_after_load: False - default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} val_before_train: False max_actor_ckpt_to_keep: 5 max_critic_ckpt_to_keep: 5 diff --git a/examples/ppo_countdown/train_countdown.yaml b/examples/ppo_countdown/train_countdown.yaml index 291afe452f..ae16122ef7 100644 --- a/examples/ppo_countdown/train_countdown.yaml +++ b/examples/ppo_countdown/train_countdown.yaml @@ -1,23 +1,6 @@ -data: - tokenizer: null - train_files: train_example.parquet - val_files: test_example.parquet - prompt_key: prompt - max_prompt_length: 256 - max_response_length: 1024 - train_batch_size: 256 - val_batch_size: null - return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs - return_raw_chat: False - shuffle: True - filter_overlong_prompts: False # for large-scale dataset, filtering overlong prompts could be timeconsuming. You should disable this and set `truncation='left' - truncation: error - image_key: images - actor_rollout_ref: hybrid_engine: True model: - path: /PATH/TO/MODEL/CHECKPOINT/ external_lib: null override_config: { } enable_gradient_checkpointing: True @@ -25,7 +8,6 @@ actor_rollout_ref: actor: strategy: fsdp # This is for backward-compatibility ppo_mini_batch_size: 128 - # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu ppo_micro_batch_size_per_gpu: 4 use_dynamic_bsz: True ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length} @@ -63,40 +45,10 @@ actor_rollout_ref: wrap_policy: # transformer_layer_cls_to_wrap: None min_num_params: 0 - # log_prob_micro_batch_size: 4 # will be deprecated, use log_prob_micro_batch_size_per_gpu log_prob_micro_batch_size_per_gpu: 8 log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size - rollout: - name: vllm - temperature: 1.0 - top_k: -1 # 0 for hf rollout, -1 for vllm rollout - top_p: 1 - use_fire_sampling: False # https://arxiv.org/abs/2410.21236 - prompt_length: ${data.max_prompt_length} # not use for opensource - response_length: ${data.max_response_length} - # for vllm rollout - dtype: bfloat16 # should align with FSDP - gpu_memory_utilization: 0.4 - ignore_eos: False - enforce_eager: True - free_cache_engine: True - load_format: dummy_dtensor - tensor_model_parallel_size: 2 - max_num_batched_tokens: 8192 - max_model_len: null - max_num_seqs: 1024 - # log_prob_micro_batch_size: 8 # will be deprecated, use log_prob_micro_batch_size_per_gpu - log_prob_micro_batch_size_per_gpu: 4 - log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} - disable_log_stats: True - enable_chunked_prefill: True # could get higher throughput - # for hf rollout - do_sample: True - # number of responses (i.e. 
num sample times) - n: 1 # > 1 for grpo critic: strategy: fsdp @@ -107,8 +59,6 @@ critic: warmup_style: constant # select from constant/cosine total_training_steps: -1 # must be override by program model: - path: /PATH/TO/MODEL/CHECKPOINT/ - tokenizer_path: ${actor_rollout_ref.model.path} override_config: { } external_lib: ${actor_rollout_ref.model.external_lib} enable_gradient_checkpointing: True @@ -121,7 +71,6 @@ critic: min_num_params: 0 fsdp_size: -1 ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} - # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu ppo_micro_batch_size_per_gpu: 8 forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} @@ -133,25 +82,6 @@ critic: grad_clip: 1.0 cliprange_value: 0.5 -reward_model: - enable: False - strategy: fsdp - model: - input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical - path: ~/models/FsfairX-LLaMA3-RM-v0.1 - external_lib: ${actor_rollout_ref.model.external_lib} - use_remove_padding: False - fsdp_config: - min_num_params: 0 - param_offload: False - fsdp_size: -1 - # micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu - # micro_batch_size_per_gpu: 2 # set a number - # max_length: null - ulysses_sequence_parallel_size: 1 # sp size - use_dynamic_bsz: ${critic.use_dynamic_bsz} - forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} - custom_reward_function: path: null name: compute_score @@ -159,7 +89,6 @@ custom_reward_function: algorithm: gamma: 1.0 lam: 1.0 - adv_estimator: gae kl_penalty: kl # how to estimate kl divergence kl_ctrl: type: fixed @@ -167,22 +96,13 @@ algorithm: trainer: balance_batch: True - total_epochs: 15 # total_training_steps: null - project_name: TinyZero - experiment_name: trinity-qwen2.5-1.5b - logger: [ 'wandb' ] - val_generations_to_log_to_wandb: 0 - nnodes: 1 - n_gpus_per_node: 2 # auto: find the last ckpt to resume. 
If can't find, start from scratch resume_mode: auto # or auto or resume_path if - test_freq: 100 critic_warmup: 0 default_hdfs_dir: null remove_previous_ckpt_in_save: False del_local_ckpt_after_load: False - default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} val_before_train: False max_actor_ckpt_to_keep: 5 max_critic_ckpt_to_keep: 5 diff --git a/tests/common/config_test.py b/tests/common/config_test.py index 35b9a4f9c7..e1ac0aa7d4 100644 --- a/tests/common/config_test.py +++ b/tests/common/config_test.py @@ -46,7 +46,8 @@ def test_all_examples_are_valid(self): print(f"Checking config: {filename}") config_path = os.path.join(example_dir, example_name, filename) try: - load_config(config_path) + config = load_config(config_path) + config.check_and_update() except Exception as e: print(f"Error loading config {config_path}: {e}") raise e diff --git a/tests/template/verl_config.yaml b/tests/template/verl_config.yaml index d1e84cb455..b17fc87958 100644 --- a/tests/template/verl_config.yaml +++ b/tests/template/verl_config.yaml @@ -8,7 +8,6 @@ actor_rollout_ref: actor: strategy: fsdp # This is for backward-compatibility ppo_mini_batch_size: 4 - # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu ppo_micro_batch_size_per_gpu: 1 use_dynamic_bsz: True ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length} @@ -46,37 +45,10 @@ actor_rollout_ref: wrap_policy: # transformer_layer_cls_to_wrap: None min_num_params: 0 - # log_prob_micro_batch_size: 4 # will be deprecated, use log_prob_micro_batch_size_per_gpu log_prob_micro_batch_size_per_gpu: 1 log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size - rollout: - name: vllm - use_fire_sampling: False # https://arxiv.org/abs/2410.21236 - prompt_length: ${data.max_prompt_length} # not use for opensource - response_length: ${data.max_response_length} - # for vllm rollout - dtype: bfloat16 # should align with FSDP - gpu_memory_utilization: 0.4 - ignore_eos: False - enforce_eager: True - free_cache_engine: True - load_format: dummy_dtensor - tensor_model_parallel_size: 2 - max_num_batched_tokens: 8192 - max_model_len: null - max_num_seqs: 1024 - # log_prob_micro_batch_size: 8 # will be deprecated, use log_prob_micro_batch_size_per_gpu - log_prob_micro_batch_size_per_gpu: 1 - log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} - disable_log_stats: True - enable_chunked_prefill: True # could get higher throughput - # for hf rollout - do_sample: True - # number of responses (i.e. 
-    n: 1 # > 1 for grpo
critic:
  strategy: fsdp
@@ -87,7 +59,6 @@ critic:
    warmup_style: constant # select from constant/cosine
    total_training_steps: -1 # must be override by program
  model:
-    tokenizer_path: ${actor_rollout_ref.model.path}
    override_config: { }
    external_lib: ${actor_rollout_ref.model.external_lib}
    enable_gradient_checkpointing: True
@@ -100,7 +71,6 @@ critic:
        min_num_params: 0
      fsdp_size: -1
  ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
-  # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu
  ppo_micro_batch_size_per_gpu: 1
  forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
@@ -115,7 +85,6 @@ critic:
algorithm:
  gamma: 1.0
  lam: 1.0
-  adv_estimator: gae
  kl_penalty: kl # how to estimate kl divergence
  kl_ctrl:
    type: fixed
@@ -123,22 +92,13 @@ algorithm:

trainer:
  balance_batch: True
-  total_epochs: 10
  # total_training_steps: null
-  project_name: TinyZero
-  experiment_name: trinity-qwen2.5-1.5b
-  logger: [ 'wandb' ]
-  val_generations_to_log_to_wandb: 0
-  nnodes: 1
-  n_gpus_per_node: 2
-  save_freq: 20
  # auto: find the last ckpt to resume. If can't find, start from scratch
  resume_mode: auto # or auto or resume_path if
  critic_warmup: 0
  default_hdfs_dir: null
  remove_previous_ckpt_in_save: False
  del_local_ckpt_after_load: False
-  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
  val_before_train: False
  max_actor_ckpt_to_keep: 1
  max_critic_ckpt_to_keep: 1
diff --git a/trinity/common/config.py b/trinity/common/config.py
index b2703d4d2d..e0660ab03a 100644
--- a/trinity/common/config.py
+++ b/trinity/common/config.py
@@ -173,8 +173,8 @@ class AlgorithmConfig:
    algorithm_type: AlgorithmType = AlgorithmType.PPO
    # for GRPO-like algorithms, repeat each task for `repeat_times` times
    repeat_times: int = 1
-    gamma: float = 1.0
-    lam: float = 1.0
+    gamma: Optional[float] = None
+    lam: Optional[float] = None
    # TODO: add more algorithm params here
@@ -259,19 +259,20 @@ class ExplorerConfig:
@dataclass
class TrainerConfig:
    trainer_type: str = "verl"
-    trainer_config_path: str = ""
    save_interval: int = 0
    enable_preview: bool = True # enable rollout preview in wandb
    # trainer configs
-    actor_use_kl_loss: bool = False
-    actor_kl_loss_coef: float = 0.001
-    actor_entropy_coef: float = 0.001
-    actor_grad_clip: float = 1.0
-    actor_clip_ratio: float = 0.2
+    actor_use_kl_loss: Optional[bool] = None
+    actor_kl_loss_coef: Optional[float] = None
+    actor_entropy_coef: Optional[float] = None
+    actor_grad_clip: Optional[float] = None
+    actor_clip_ratio: Optional[float] = None
    # TODO: extract more train-related params from underlying trainer engine
+    # Only one needs to be set for `trainer_config` and `trainer_config_path`
    trainer_config: Any = field(default_factory=dict)
+    trainer_config_path: str = ""


@dataclass
@@ -292,7 +293,7 @@ class SynchronizerConfig:
    sync_interval: int = 1
    # waiting for `sync_timeout` seconds before timeout in `nccl` method
    sync_timeout: int = 1200
-    # wait for the lastest checkpoint to be ready
+    # wait for the lastest checkpoint to be ready # TODO: to be used
    wait_for_checkpoint: bool = False
    # ! DO NOT SET, automatically calculated
@@ -338,7 +339,7 @@ def _check_interval(self) -> None:
            and self.algorithm.algorithm_type != AlgorithmType.DPO
            and self.explorer.eval_interval % self.synchronizer.sync_interval != 0
        ):
-            self.buffer.eval_interval = (
+            self.explorer.eval_interval = (
                max(self.explorer.eval_interval // self.synchronizer.sync_interval, 1)
            ) * self.synchronizer.sync_interval
            logger.warning(
diff --git a/trinity/common/verl_config.py b/trinity/common/verl_config.py
index dd896a23f1..e5d0d9d55f 100644
--- a/trinity/common/verl_config.py
+++ b/trinity/common/verl_config.py
@@ -1,3 +1,4 @@
+import math
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
@@ -13,20 +14,7 @@
@dataclass
class Data:
-    tokenizer: Optional[str] = None
-    train_files: str = ""
-    val_files: str = ""
-    prompt_key: str = "prompt"
-    max_prompt_length: int = 512
-    max_response_length: int = 512
    train_batch_size: int = 1024
-    val_batch_size: Optional[int] = None
-    return_raw_input_ids: bool = False
-    return_raw_chat: bool = False
-    shuffle: bool = True
-    filter_overlong_prompts: bool = False
-    truncation: str = "error"
-    image_key: str = "images"


@dataclass
@@ -109,30 +97,7 @@ class Ref:
@dataclass
class Rollout:
-    name: str = "vllm"
    temperature: float = 1.0
-    top_k: int = -1
-    top_p: float = 1.0
-    use_fire_sampling: bool = False
-    prompt_length: int = 0
-    response_length: int = 0
-    dtype: str = "bfloat16"
-    gpu_memory_utilization: float = 0.5
-    ignore_eos: bool = False
-    enforce_eager: bool = True
-    free_cache_engine: bool = True
-    load_format: str = "dummy_dtensor"
-    tensor_model_parallel_size: int = 2
-    max_num_batched_tokens: int = 8192
-    max_model_len: Optional[int] = None
-    max_num_seqs: int = 1024
-    log_prob_micro_batch_size: Optional[int] = None
-    log_prob_micro_batch_size_per_gpu: int = 1
-    log_prob_use_dynamic_bsz: bool = False
-    log_prob_max_token_len_per_gpu: int = 0
-    disable_log_stats: bool = True
-    enable_chunked_prefill: bool = True
-    do_sample: bool = True
    n: int = 1 # > 1 for grpo
@@ -268,7 +233,7 @@ class veRLConfig:
    synchronizer: Optional[SynchronizerConfig] = None
    enable_preview: bool = True

-    def synchronize_config(self, config: Config) -> None:
+    def synchronize_config(self, config: Config) -> None: # noqa: C901
        """Synchronize config."""
        if config.mode != "train":
            rollout_gpu_num = (
@@ -283,36 +248,50 @@ def synchronize_config(self, config: Config) -> None:
            )
        else:
            rollout_gpu_num = 0
-        rollout_node_num = rollout_gpu_num // config.cluster.gpu_per_node
-        self.trainer.nnodes = config.cluster.node_num - rollout_node_num
-        self.actor_rollout_ref.model.path = config.model.model_path
-        self.critic.model.path = config.model.critic_model_path
-        self.critic.model.tokenizer_path = config.model.critic_model_path
        if config.cluster.node_num == 1:
            # for single node scenarios, rollout and training are on the same node
+            self.trainer.nnodes = config.cluster.node_num
            self.trainer.n_gpus_per_node = config.cluster.gpu_per_node - rollout_gpu_num
        else:
            # for multi-node scenarios, some nodes for rollout, others for training
+            assert (
+                rollout_gpu_num % config.cluster.gpu_per_node == 0
+            ), "rollout_gpu_num must be divisible by `gpu_per_node`"
+            rollout_node_num = math.ceil(rollout_gpu_num / config.cluster.gpu_per_node)
+            self.trainer.nnodes = config.cluster.node_num - rollout_node_num
+            if self.trainer.nnodes < 1:
+                raise ValueError("The number of training nodes must be greater than 0")
            self.trainer.n_gpus_per_node = config.cluster.gpu_per_node
-        self.trainer.sync_freq = config.synchronizer.sync_interval
-        self.trainer.save_freq = config.trainer.save_interval
-        self.synchronizer = config.synchronizer
-        self.actor_rollout_ref.synchronizer = config.synchronizer
-        self.buffer = config.buffer
+
        world_size = self.trainer.nnodes * self.trainer.n_gpus_per_node
        if config.buffer.batch_size % world_size != 0:
            raise ValueError(
                f"batch_size ({config.buffer.batch_size}) must be divisible by ({world_size})"
            )
-        # TODO: use dynamic read_batch_size to support multi-round scenarios
-        # Get the experiences of one explore step
+
+        self.trainer.sync_freq = config.synchronizer.sync_interval
+        self.trainer.save_freq = config.trainer.save_interval
        self.trainer.project_name = config.project
        self.trainer.experiment_name = config.name
-        self.data.train_batch_size = config.buffer.batch_size
        self.trainer.default_local_dir = config.checkpoint_job_dir
        self.trainer.sft_warmup_steps = config.buffer.trainer_input.sft_warmup_steps
-        self.actor_rollout_ref.actor.ppo_mini_batch_size = config.buffer.batch_size
+
+        self.buffer = config.buffer
+        # TODO: use dynamic read_batch_size to support multi-round scenarios
+        # Get the experiences of one explore step
+        self.data.train_batch_size = config.buffer.batch_size
+
+        self.synchronizer = config.synchronizer
+        self.actor_rollout_ref.synchronizer = config.synchronizer
+
+        # Actor / Critic config
+        self.actor_rollout_ref.model.path = config.model.model_path
+        self.critic.model.path = config.model.critic_model_path
+        self.critic.model.tokenizer_path = config.model.critic_model_path
+        self.actor_rollout_ref.actor.ppo_mini_batch_size = (
+            config.buffer.batch_size
+        ) # TODO: may allow user to change
        self.actor_rollout_ref.rollout.temperature = (
            config.buffer.explorer_input.taskset.rollout_args.temperature
        )
@@ -320,6 +299,22 @@ def synchronize_config(self, config: Config) -> None:
        self.critic.ppo_mini_batch_size = config.buffer.batch_size
        self.critic.rollout_n = self.actor_rollout_ref.rollout.n

+        if config.trainer.actor_use_kl_loss is not None:
+            self.actor_rollout_ref.actor.use_kl_loss = config.trainer.actor_use_kl_loss
+        if config.trainer.actor_kl_loss_coef is not None:
+            self.actor_rollout_ref.actor.kl_loss_coef = config.trainer.actor_kl_loss_coef
+        if config.trainer.actor_entropy_coef is not None:
+            self.actor_rollout_ref.actor.entropy_coeff = config.trainer.actor_entropy_coef
+        if config.trainer.actor_grad_clip is not None:
+            self.actor_rollout_ref.actor.grad_clip = config.trainer.actor_grad_clip
+        if config.trainer.actor_clip_ratio is not None:
+            self.actor_rollout_ref.actor.clip_ratio = config.trainer.actor_clip_ratio
+
+        # Algorithm related config
+        if config.algorithm.gamma is not None:
+            self.algorithm.gamma = config.algorithm.gamma
+        if config.algorithm.lam is not None:
+            self.algorithm.lam = config.algorithm.lam
        self.actor_rollout_ref.actor.algorithm_type = config.algorithm.algorithm_type
        if config.algorithm.algorithm_type == AlgorithmType.PPO:
            logger.info("Using GAE `adv_estimator` for PPO")
@@ -328,15 +323,6 @@ def synchronize_config(self, config: Config) -> None:
            logger.info("Using GRPO `adv_estimator` for GRPO")
            self.algorithm.adv_estimator = AdvantageEstimator.GRPO.value
-        # copy trainer related config from global config
-        self.algorithm.gamma = config.algorithm.gamma
-        self.algorithm.lam = config.algorithm.lam
-        self.actor_rollout_ref.actor.use_kl_loss = config.trainer.actor_use_kl_loss
-        self.actor_rollout_ref.actor.kl_loss_coef = config.trainer.actor_kl_loss_coef
-        self.actor_rollout_ref.actor.entropy_coeff = config.trainer.actor_entropy_coef
-        self.actor_rollout_ref.actor.grad_clip = config.trainer.actor_grad_clip
-        self.actor_rollout_ref.actor.clip_ratio = config.trainer.actor_clip_ratio
-
        if self.actor_rollout_ref.actor.algorithm_type.is_dpo():
            # for DPO
            if not self.actor_rollout_ref.actor.use_kl_loss:
                self.actor_rollout_ref.actor.use_kl_loss = True
diff --git a/trinity/common/workflows/workflow.py b/trinity/common/workflows/workflow.py
index 1a0daadb2b..9786bd6b77 100644
--- a/trinity/common/workflows/workflow.py
+++ b/trinity/common/workflows/workflow.py
@@ -153,12 +153,12 @@ def __init__(
        task: Task,
        auxiliary_models: Optional[List[openai.OpenAI]] = None,
    ):
+        self.reset(task)
        super().__init__(
            model=model,
            task=task,
            auxiliary_models=auxiliary_models,
        )
-        self.reset(task)

    @property
    def resettable(self):
@@ -226,14 +226,12 @@ def __init__(
        task: Task,
        auxiliary_models: Optional[List[openai.OpenAI]] = None,
    ):
-        if task.reward_fn is None:
-            task.reward_fn = MathRewardFn
-        if task.reward_fn == MathRewardFn and task.format_args.system_prompt is None:
-            task.format_args.system_prompt = """A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e.,
-<think> reasoning process here </think>
-<answer> answer here </answer>.
-"""
-        super().__init__(model=model, task=task, auxiliary_models=auxiliary_models)
+        self.reset(task)
+        super().__init__(
+            model=model,
+            task=task,
+            auxiliary_models=auxiliary_models,
+        )

    def reset(self, task: Task):
        if task.reward_fn is None:
diff --git a/trinity/manager/config_manager.py b/trinity/manager/config_manager.py
index 21b0e57348..9ac2d36f16 100644
--- a/trinity/manager/config_manager.py
+++ b/trinity/manager/config_manager.py
@@ -50,29 +50,41 @@ def _init_default_config(self):
            "mode": "both",
            "project": "Trinity-RFT",
            "exp_name": "qwen2.5-1.5B",
+            "checkpoint_root_dir": "",
            "monitor_type": MonitorType.TENSORBOARD.value,
+            # Algorithm Configs
+            "algorithm_type": AlgorithmType.PPO.value,
+            "_grouped_adv_repeat_times": 2,
+            "_not_grouped_adv_repeat_times": 1,
+            "repeat_times": 1,
+            "gamma": 1.0,
+            "lam": 1.0,
            # Model Configs
            "model_path": "",
            "critic_model_path": "",
-            "checkpoint_path": "",
+            "max_prompt_tokens": 1024,
+            "max_response_tokens": 1024,
+            # Cluster Config
            "node_num": 1,
            "gpu_per_node": 8,
            "total_gpu_num": 8,
            "trainer_gpu_num": 6,
-            "max_prompt_tokens": 1024,
-            "max_response_tokens": 1024,
-            # Global Configs
+            # Buffer Configs
            "total_epochs": 20,
            "_train_batch_size_per_gpu": 16,
            "train_batch_size": 96,
-            "eval_interval": 1000,
-            "algorithm_type": AlgorithmType.PPO.value,
+            "buffer_max_retry_times": 3,
+            "max_retry_interval": 1,
            # Taskset Configs
            "taskset_path": "",
            "taskset_subset_name": None,
            "taskset_split": "train",
            "taskset_prompt_key": "question",
            "taskset_response_key": "answer",
+            "temperature": 1.0,
+            "top_p": 1.0, # TODO: to be used
+            "top_k": -1, # TODO: to be used
+            "logprobs": 0,
            # Eval Taskset Configs
            "_eval_tasksets_num": 0,
            # Explorer Input Configs
@@ -80,15 +92,13 @@ def _init_default_config(self):
            "default_reward_fn_type": "math_reward",
            "system_prompt": None,
            "reply_prefix": None,
-            # Experience Buffer Configs
+            # Experience Buffer / DPO Dataset Configs
            "_dpo_storage_type": StorageType.FILE.value,
            "_not_dpo_storage_type": StorageType.QUEUE.value,
            "storage_type": StorageType.QUEUE.value,
            "_dpo_experience_buffer_path": "",
"_not_dpo_experience_buffer_path": "", "experience_buffer_path": "", - "buffer_max_retry_times": 3, - "max_retry_interval": 1, "dpo_dataset_train_split": "train", "dpo_dataset_prompt_type": PromptType.MESSAGES.value, "dpo_dataset_prompt_key": "prompt", @@ -101,26 +111,32 @@ def _init_default_config(self): "sft_warmup_messages_key": "messages", "sft_warmup_prompt_key": "prompt", "sft_warmup_response_key": "response", + # TrainerInput Configs + # TODO: read_experience_strategy + "sft_warmup_steps": 0, # Explorer and Sync Configs + "runner_num": 32, + "max_timeout": 900, + "explorer_max_retry_times": 2, + "eval_interval": 1000, + "eval_on_latest_checkpoint": True, + # Rollout Model Configs "engine_type": "vllm_async", "engine_num": 2, - "runner_num": 32, - "_grouped_adv_repeat_times": 2, - "_not_grouped_adv_repeat_times": 1, - "repeat_times": 1, "tensor_parallel_size": 1, - "enable_prefix_caching": False, + "use_v1": True, "enforce_eager": True, + "enable_prefix_caching": False, + "enable_chunked_prefill": False, + "gpu_memory_utilization": 0.9, "dtype": "bfloat16", - "temperature": 1.0, - "top_p": 1.0, - "top_k": -1, "seed": 42, - "logprobs": 0, - "gpu_memory_utilization": 0.9, - "enable_chunked_prefill": False, - "max_timeout": 900, - "explorer_max_retry_times": 2, + # TODO: max_prompt_tokens + # TODO: max_response_tokens + # TODO: chat_template + "enable_thinking": False, + "enable_openai_api": False, + # TODO: Auxiliary Models Configs # Synchronizer Configs "_not_dpo_sync_method": SyncMethod.NCCL.value, "sync_method": SyncMethod.NCCL.value, @@ -128,9 +144,15 @@ def _init_default_config(self): "sync_timeout": 1200, # Trainer Configs "trainer_type": "verl", - "sft_warmup_steps": 0, "_nccl_save_interval": 100, "save_interval": 100, + # TODO: enable_preview + "_not_dpo_actor_use_kl_loss": True, + "actor_use_kl_loss": True, + "actor_kl_loss_coef": 0.001, + "actor_entropy_coef": 0.001, + "actor_grad_clip": 1.0, + "actor_clip_ratio": 0.2, # veRL Trainer Configs "training_args": [ "balance_batch", @@ -151,8 +173,6 @@ def _init_default_config(self): "del_local_ckpt_after_load": False, "max_actor_ckpt_to_keep": None, "max_critic_ckpt_to_keep": None, - "gamma": 1.0, - "lam": 1.0, "adv_estimator": "gae", "norm_adv_by_std_in_grpo": True, "use_kl_in_reward": False, @@ -170,12 +190,6 @@ def _init_default_config(self): "actor_tau": 0.0, "actor_opmd_baseline": "mean", "actor_use_uid": False, - "actor_grad_clip": 1.0, - "actor_clip_ratio": 0.2, - "actor_entropy_coef": 0.001, - "_not_dpo_actor_use_kl_loss": True, - "actor_use_kl_loss": True, - "actor_kl_loss_coef": 0.001, "actor_kl_loss_type": "low_var_kl", "actor_checkpoint": ["model", "hf_model", "optimizer", "extra"], "critic_lr": 1e-6, @@ -204,7 +218,7 @@ def maintain_session_state(self): def _set_project(self): st.text_input("Project", key="project") - def _set_name(self): + def _set_exp_name(self): st.text_input("Experiment Name", key="exp_name") def _set_monitor_type(self): @@ -221,18 +235,19 @@ def _set_model_path(self): st.warning("Please input model path.") def _set_critic_model_path(self): - st.text_input( - "Critic Model Path (defaults to `model_path`)", - key="critic_model_path", - ) + if st.session_state["adv_estimator"] == AdvantageEstimator.GAE.value: + st.text_input( + "Critic Model Path (defaults to `model_path`)", + key="critic_model_path", + ) - def _set_checkpoint_path(self): - st.text_input("Checkpoint Path", key="checkpoint_path") - if not st.session_state["checkpoint_path"].strip(): # TODO: may auto generate - 
self.unfinished_fields.add("checkpoint_path") - st.warning("Please input checkpoint path.") - elif not os.path.isabs(st.session_state["checkpoint_path"].strip()): - self.unfinished_fields.add("checkpoint_path") + def _set_checkpoint_root_dir(self): + st.text_input("Checkpoint Root Dir", key="checkpoint_root_dir") + if not st.session_state["checkpoint_root_dir"].strip(): # TODO: may auto generate + self.unfinished_fields.add("checkpoint_root_dir") + st.warning("Please input checkpoint root dir.") + elif not os.path.isabs(st.session_state["checkpoint_root_dir"].strip()): + self.unfinished_fields.add("checkpoint_root_dir") st.warning("Please input an absolute path.") def _set_node_num(self): @@ -346,8 +361,9 @@ def _set_taskset_args(self): response_key_col.text_input( "Response Key :orange-badge[(Needs review)]", key="taskset_response_key" ) + self._set_configs_with_st_columns(["temperature", "logprobs"]) - def _set_eval_taskset_idx(self, idx): + def _set_eval_taskset_idx(self, idx): # TODO: add delete st.text_input( "Taskset Name", key=f"eval_taskset_{idx}_name", @@ -457,7 +473,7 @@ def _set_experience_buffer_path(self): # TODO if `storage_type == StorageType.QUEUE`, default to `None`, -if `storage_type == StorageType.SQL`, default to `sqlite:///{os.path.join(checkpoint_path, '.cache', project_name, experiment_name)}/data.db`.""" +if `storage_type == StorageType.SQL`, default to `sqlite:///{os.path.join(checkpoint_root_dir, '.cache', project_name, experiment_name)}/data.db`.""" def on_change(): if st.session_state["algorithm_type"] == AlgorithmType.DPO.value: @@ -545,7 +561,9 @@ def _set_sft_warmup_dataset_args(self): sft_warmup_messages_key_col, sft_warmup_prompt_key_col, sft_warmup_response_key_col, - ) = st.columns(3) + ) = st.columns( + 3 + ) # TODO: select by prompt type sft_warmup_messages_key_col.text_input( "SFT Dataset Messages Key :orange-badge[(Needs review)]", key="sft_warmup_messages_key", @@ -572,7 +590,7 @@ def _str_for_engine_num_and_tp_size(self): ```""" def _set_engine_num(self): - total_gpu_num = st.session_state["gpu_per_node"] * st.session_state["node_num"] + total_gpu_num = st.session_state["total_gpu_num"] max_engine_num = (total_gpu_num - 1) // st.session_state["tensor_parallel_size"] if st.session_state["engine_num"] > max_engine_num: st.session_state["engine_num"] = max_engine_num @@ -588,7 +606,7 @@ def _set_engine_num(self): ) def _set_tensor_parallel_size(self): - total_gpu_num = st.session_state["gpu_per_node"] * st.session_state["node_num"] + total_gpu_num = st.session_state["total_gpu_num"] max_tensor_parallel_size = (total_gpu_num - 1) // st.session_state["engine_num"] if st.session_state["tensor_parallel_size"] > max_tensor_parallel_size: st.session_state["tensor_parallel_size"] = max_tensor_parallel_size @@ -620,6 +638,33 @@ def _check_engine_num_and_tp_size(self): "Please ensure that `engine_num * tensor_parallel_size` can be divided by `gpu_per_node` when `node_num > 1`." 
            )

+    def _set_repeat_times(self): # TODO
+        grouped_adv_algorithms = [
+            AlgorithmType.GRPO.value,
+            AlgorithmType.OPMD.value, # TODO: may add rloo
+        ]
+        if st.session_state["algorithm_type"] in grouped_adv_algorithms:
+            min_repeat_times = 2
+            st.session_state["repeat_times"] = st.session_state["_grouped_adv_repeat_times"]
+        else:
+            min_repeat_times = 1
+            st.session_state["repeat_times"] = st.session_state["_not_grouped_adv_repeat_times"]
+
+        def on_change():
+            if st.session_state["algorithm_type"] in grouped_adv_algorithms:
+                st.session_state["_grouped_adv_repeat_times"] = st.session_state["repeat_times"]
+            else:
+                st.session_state["_not_grouped_adv_repeat_times"] = st.session_state["repeat_times"]
+
+        st.number_input(
+            "Repeat Times",
+            key="repeat_times",
+            min_value=min_repeat_times,
+            help="`repeat_times` is used to set how many experiences each task can generate, "
+            "and it must be greater than `1` when `algorithm_type` is `opmd` or `grpo`.",
+            on_change=on_change,
+        )
+
    def _set_sync_method(self):
        if st.session_state["algorithm_type"] == AlgorithmType.DPO.value:
            st.session_state["sync_method"] = SyncMethod.CHECKPOINT.value
@@ -686,6 +731,9 @@ def _set_seed(self):
    def _set_logprobs(self):
        st.number_input("Logprobs", key="logprobs", min_value=0, max_value=20)

+    def _set_use_v1(self):
+        st.checkbox("Use V1 Engine", key="use_v1")
+
    def _set_enable_prefix_caching(self):
        st.checkbox("Prefix Caching", key="enable_prefix_caching")
@@ -700,6 +748,12 @@ def _set_gpu_memory_utilization(self):
    def _set_enable_chunked_prefill(self):
        st.checkbox("Chunked Prefill", key="enable_chunked_prefill")

+    def _set_enable_thinking(self):
+        st.checkbox("Enable Thinking For Qwen3", key="enable_thinking")
+
+    def _set_enable_openai_api(self):
+        st.checkbox("Enable OpenAI API", key="enable_openai_api")
+
    def _set_max_timeout(self):
        st.number_input("Max Timeout", key="max_timeout", min_value=0)
@@ -745,6 +799,9 @@ def _set_sft_warmup_steps(self):
    def _set_eval_interval(self):
        st.number_input("Eval Interval", key="eval_interval", min_value=1)

+    def _set_eval_on_latest_checkpoint(self):
+        st.checkbox("Eval on Latest Checkpoint", key="eval_on_latest_ckp")
+
    def _set_training_args(self):
        st.multiselect(
            "Training Args",
@@ -787,33 +844,6 @@ def on_change():
    def _set_ppo_epochs(self):
        st.number_input("PPO Epochs", key="ppo_epochs", min_value=1)

-    def _set_repeat_times(self): # TODO
-        grouped_adv_algorithms = [
-            AlgorithmType.GRPO.value,
-            AlgorithmType.OPMD.value, # TODO: may add rloo
-        ]
-        if st.session_state["algorithm_type"] in grouped_adv_algorithms:
-            min_repeat_times = 2
-            st.session_state["repeat_times"] = st.session_state["_grouped_adv_repeat_times"]
-        else:
-            min_repeat_times = 1
-            st.session_state["repeat_times"] = st.session_state["_not_grouped_adv_repeat_times"]
-
-        def on_change():
-            if st.session_state["algorithm_type"] in grouped_adv_algorithms:
-                st.session_state["_grouped_adv_repeat_times"] = st.session_state["repeat_times"]
-            else:
-                st.session_state["_not_grouped_adv_repeat_times"] = st.session_state["repeat_times"]
-
-        st.number_input(
-            "Repeat Times",
-            key="repeat_times",
-            min_value=min_repeat_times,
-            help="`repeat_times` is used to set how many experiences each task can generate, "
-            "and it must be greater than `1` when `algorithm_type` is `opmd` or `grpo`.",
-            on_change=on_change,
-        )
-
    def _set_training_strategy(self):
        st.selectbox(
            "Training Strategy",
@@ -1105,11 +1135,11 @@ def _set_configs_with_st_columns(
    def beginner_mode(self):
        st.header("Essential Configs")

-        self._set_configs_with_st_columns(["project", "name"], columns_config=[1, 3])
+        self._set_configs_with_st_columns(["project", "exp_name"], columns_config=[1, 3])

        self._set_model_path()

-        self._set_checkpoint_path()
+        self._set_checkpoint_root_dir()

        self._set_taskset_path()
@@ -1169,12 +1199,12 @@ def beginner_mode(self):
        self._set_configs_with_st_columns(["critic_ppo_micro_batch_size_per_gpu", "critic_lr"])

    def _expert_model_part(self):
-        self._set_configs_with_st_columns(["project", "name"], columns_config=[1, 3])
+        self._set_configs_with_st_columns(["project", "exp_name"], columns_config=[1, 3])

        self._set_model_path()
        self._set_critic_model_path()

-        self._set_checkpoint_path()
+        self._set_checkpoint_root_dir()

        self._set_configs_with_st_columns(["monitor_type", "node_num", "gpu_per_node"])
        self._set_configs_with_st_columns(["max_prompt_tokens", "max_response_tokens"])
@@ -1213,34 +1243,36 @@ def _expert_buffer_part(self):
        self._set_configs_with_st_columns(["buffer_max_retry_times", "max_retry_interval"])

    def _expert_explorer_part(self):
+        self._set_configs_with_st_columns(["sync_method", "sync_interval", "sync_timeout"])
+
        self._set_configs_with_st_columns(
-            ["engine_type", "engine_num", "tensor_parallel_size", "repeat_times"]
+            [
+                "runner_num",
+                "max_timeout",
+                "explorer_max_retry_times",
+            ]
        )
-        self._check_engine_num_and_tp_size()
-        self._set_configs_with_st_columns(["sync_method", "sync_interval", "sync_timeout"])
+        self._set_configs_with_st_columns(["eval_interval", "eval_on_latest_checkpoint"])

-        with st.expander("Advanced Config"):
-            self._set_configs_with_st_columns(
-                ["runner_num", "temperature", "top_p", "top_k", "seed", "logprobs"]
-            )
+        with st.expander("Rollout Model Config", expanded=True):
+            self._set_configs_with_st_columns(["engine_type", "engine_num", "tensor_parallel_size"])
+            self._check_engine_num_and_tp_size()

-            self._set_configs_with_st_columns(["dtype", "gpu_memory_utilization"])
-            self._set_configs_with_st_columns(
-                [
-                    "max_timeout",
-                    "explorer_max_retry_times",
-                ]
-            )
+            self._set_configs_with_st_columns(["gpu_memory_utilization", "dtype", "seed"])

            self._set_configs_with_st_columns(
-                ["enable_prefix_caching", "enforce_eager", "enable_chunked_prefill"]
+                ["use_v1", "enforce_eager", "enable_prefix_caching", "enable_chunked_prefill"]
            )
+            self._set_configs_with_st_columns(["enable_thinking", "enable_openai_api"])
+
+        with st.expander("Auxiliary Models", expanded=True): # TODO
+            pass
+
    def _expert_trainer_part(self):
-        self._set_configs_with_st_columns( # TODO: may add `trainer_type`
-            ["algorithm_type", "sft_warmup_steps", "eval_interval", "save_interval"]
-        )
+        self._set_configs_with_st_columns(["algorithm_type", "gamma", "lam"])
+        self._set_configs_with_st_columns(["repeat_times", "save_interval"])

        self._check_sft_warmup_dataset_path()
        if st.session_state["trainer_type"] == "verl":
@@ -1280,7 +1312,6 @@ def _expert_verl_trainer_part(self):
        with rl_algorithm_tab:
            st.subheader("RL Algorithm Config")

-            self._set_configs_with_st_columns(["gamma", "lam"])
            self._set_configs_with_st_columns(["norm_adv_by_std_in_grpo", "use_kl_in_reward"])
            self._set_configs_with_st_columns(["kl_penalty", "kl_ctrl_type", "kl_ctrl_coef"])
            self._set_configs_with_st_columns(["horizon", "target_kl"])
@@ -1341,7 +1372,7 @@ def expert_mode(self):
            with tab:
                func()

-    def _generate_verl_config(self, trainer_nnodes: int = 1, trainer_n_gpus_per_node: int = 8):
+    def _generate_verl_config(self):
        balance_batch = "balance_batch" in st.session_state["training_args"]
        enable_gradient_checkpointing = (
            "gradient_checkpointing" in st.session_state["training_args"]
@@ -1363,33 +1394,10 @@ def _generate_verl_config(self, trainer_nnodes: int = 1, trainer_n_gpus_per_node
            st.session_state["max_prompt_tokens"] + st.session_state["max_response_tokens"]
        )
-        critic_model_path = (
-            st.session_state["critic_model_path"].strip()
-            if st.session_state["critic_model_path"].strip()
-            else st.session_state["model_path"]
-        )
        trainer_config = {
-            "data": {
-                "tokenizer": None,
-                "train_files": "placeholder",
-                "val_files": "placeholder",
-                "prompt_key": "placeholder",
-                "max_prompt_length": st.session_state["max_prompt_tokens"],
-                "max_response_length": st.session_state["max_response_tokens"],
-                "train_batch_size": st.session_state["train_batch_size"]
-                * st.session_state["repeat_times"],
-                "val_batch_size": None,
-                "return_raw_input_ids": False,
-                "return_raw_chat": False,
-                "shuffle": True,
-                "filter_overlong_prompts": False,
-                "truncation": "error",
-                "image_key": "images",
-            },
            "actor_rollout_ref": {
                "hybrid_engine": True,
                "model": {
-                    "path": st.session_state["model_path"],
                    "external_lib": None,
                    "override_config": {},
                    "enable_gradient_checkpointing": enable_gradient_checkpointing,
@@ -1403,11 +1411,6 @@ def _generate_verl_config(self, trainer_nnodes: int = 1, trainer_n_gpus_per_node
                    ],
                    "use_dynamic_bsz": use_dynamic_bsz,
                    "ppo_max_token_len_per_gpu": ppo_max_token_len_per_gpu,
-                    "grad_clip": st.session_state["actor_grad_clip"],
-                    "clip_ratio": st.session_state["actor_clip_ratio"],
-                    "entropy_coeff": st.session_state["actor_entropy_coef"],
-                    "use_kl_loss": st.session_state["actor_use_kl_loss"],
-                    "kl_loss_coef": st.session_state["actor_kl_loss_coef"],
                    "kl_loss_type": st.session_state["actor_kl_loss_type"],
                    "ppo_epochs": st.session_state["ppo_epochs"],
                    "shuffle": False,
@@ -1441,34 +1444,32 @@ def _generate_verl_config(self, trainer_nnodes: int = 1, trainer_n_gpus_per_node
                        "actor_ulysses_sequence_parallel_size"
                    ],
                },
-                "rollout": {
-                    "name": "vllm",
-                    "temperature": st.session_state["temperature"],
-                    "top_k": -1,
-                    "top_p": 1,
-                    "use_fire_sampling": False,
-                    "prompt_length": st.session_state["max_prompt_tokens"],
-                    "response_length": st.session_state["max_response_tokens"],
-                    "dtype": "bfloat16",
-                    "gpu_memory_utilization": 0.4,
-                    "ignore_eos": False,
-                    "enforce_eager": True,
-                    "free_cache_engine": True,
-                    "load_format": "dummy_dtensor",
-                    "tensor_model_parallel_size": 2,
-                    "max_num_batched_tokens": 8192,
-                    "max_model_len": None,
-                    "max_num_seqs": 1024,
-                    "log_prob_micro_batch_size_per_gpu": 4,
-                    "log_prob_use_dynamic_bsz": use_dynamic_bsz,
-                    "log_prob_max_token_len_per_gpu": ppo_max_token_len_per_gpu,
-                    "disable_log_stats": True,
-                    "enable_chunked_prefill": True,
-                    "do_sample": True,
-                    "n": st.session_state["repeat_times"],
+            },
+            "custom_reward_function": {"path": None, "name": "compute_score"},
+            "algorithm": {
+                "kl_penalty": st.session_state["kl_penalty"],
+                "kl_ctrl": {
+                    "type": st.session_state["kl_ctrl_type"],
+                    "kl_coef": st.session_state["kl_ctrl_coef"],
                },
            },
-            "critic": {
+            "trainer": {
+                "balance_batch": balance_batch,
+                "logger": ["tensorboard"],
+                "resume_mode": st.session_state["resume_mode"],
+                "resume_from_path": st.session_state["resume_from_path"],
+                "default_hdfs_dir": st.session_state["default_hdfs_dir"],
+                "remove_previous_ckpt_in_save": st.session_state["remove_previous_ckpt_in_save"],
+                "del_local_ckpt_after_load": st.session_state["del_local_ckpt_after_load"],
+                "val_before_train": False,
+                "max_actor_ckpt_to_keep": st.session_state["max_actor_ckpt_to_keep"],
+                "max_critic_ckpt_to_keep": st.session_state["max_critic_ckpt_to_keep"],
+            },
+        }
+
+        if st.session_state["adv_estimator"] == AdvantageEstimator.GAE.value:
+            trainer_config["trainer"]["critic_warmup"] = st.session_state["critic_warmup"]
+            trainer_config["critic"] = {
                "strategy": st.session_state["training_strategy"],
                "optim": {
                    "lr": st.session_state["critic_lr"],
@@ -1481,8 +1482,6 @@ def _generate_verl_config(self, trainer_nnodes: int = 1, trainer_n_gpus_per_node
                    ),
                },
                "model": {
-                    "path": critic_model_path,
-                    "tokenizer_path": critic_model_path,
                    "override_config": {},
                    "external_lib": None,
                    "enable_gradient_checkpointing": enable_gradient_checkpointing,
@@ -1507,98 +1506,129 @@ def _generate_verl_config(self, trainer_nnodes: int = 1, trainer_n_gpus_per_node
                "grad_clip": st.session_state["critic_grad_clip"],
                "cliprange_value": st.session_state["critic_cliprange_value"],
                "checkpoint": {"contents": st.session_state["critic_checkpoint"]},
-            },
-            "reward_model": {
-                "enable": False,
-                "strategy": "fsdp",
-                "model": {
-                    "input_tokenizer": st.session_state["model_path"],
-                    "path": "~/models/FsfairX-LLaMA3-RM-v0.1",
-                    "external_lib": None,
-                    "use_remove_padding": False,
-                    "fsdp_config": {
-                        "min_num_params": 0,
-                        "param_offload": False,
-                        "fsdp_size": -1,
-                    },
-                },
-                "ulysses_sequence_parallel_size": 1,
-                "use_dynamic_bsz": use_dynamic_bsz,
-                "forward_max_token_len_per_gpu": ppo_max_token_len_per_gpu * 2,
-                "reward_manager": "naive",
-            },
-            "custom_reward_function": {"path": None, "name": "compute_score"},
-            "algorithm": {
-                "gamma": st.session_state["gamma"],
-                "lam": st.session_state["lam"],
-                "adv_estimator": st.session_state["adv_estimator"],
-                "kl_penalty": st.session_state["kl_penalty"],
-                "kl_ctrl": {
-                    "type": st.session_state["kl_ctrl_type"],
-                    "kl_coef": st.session_state["kl_ctrl_coef"],
-                },
-            },
-            "trainer": {
-                "balance_batch": balance_batch,
-                "total_epochs": st.session_state["total_epochs"],
-                "project_name": st.session_state["project"],
-                "experiment_name": st.session_state["exp_name"],
-                "logger": ["tensorboard"],
-                "val_generations_to_log_to_wandb": 0,
-                "nnodes": trainer_nnodes,
-                "n_gpus_per_node": trainer_n_gpus_per_node,
-                "save_freq": st.session_state["save_interval"],
-                "resume_mode": st.session_state["resume_mode"],
-                "resume_from_path": st.session_state["resume_from_path"],
-                "test_freq": 100,
-                "critic_warmup": st.session_state["critic_warmup"],
-                "default_hdfs_dir": st.session_state["default_hdfs_dir"],
-                "remove_previous_ckpt_in_save": st.session_state["remove_previous_ckpt_in_save"],
-                "del_local_ckpt_after_load": st.session_state["del_local_ckpt_after_load"],
-                "default_local_dir": st.session_state["checkpoint_path"],
-                "val_before_train": False,
-                "sync_freq": st.session_state["sync_interval"],
-                "max_actor_ckpt_to_keep": st.session_state["max_actor_ckpt_to_keep"],
-                "max_critic_ckpt_to_keep": st.session_state["max_critic_ckpt_to_keep"],
-            },
-        }
+            }

        return trainer_config

-    def generate_config(self):
-        if st.session_state["mode"] == "both":
-            trainer_nnodes = (
-                st.session_state["node_num"]
-                - st.session_state["engine_num"]
-                * st.session_state["tensor_parallel_size"]
-                // st.session_state["gpu_per_node"]
-            )
-        else:
-            trainer_nnodes = st.session_state["node_num"]
-        if st.session_state["node_num"] == 1 and st.session_state["mode"] == "both":
-            trainer_n_gpus_per_node = (
-                st.session_state["gpu_per_node"]
-                - st.session_state["engine_num"] * st.session_state["tensor_parallel_size"]
-            )
-        else:
-            trainer_n_gpus_per_node = st.session_state["gpu_per_node"]
-
+    def _gen_buffer_config(self):
        if st.session_state["algorithm_type"] != AlgorithmType.DPO.value:
            experience_buffer_path = st.session_state["experience_buffer_path"].strip()
            if (
                not experience_buffer_path
                and st.session_state["storage_type"] == StorageType.SQL.value
            ):
-                experience_buffer_path = f"sqlite:///{os.path.join(st.session_state['checkpoint_path'], '.cache', st.session_state['project'], st.session_state['exp_name'])}/data.db"
+                experience_buffer_path = f"sqlite:///{os.path.join(st.session_state['checkpoint_root_dir'], '.cache', st.session_state['project'], st.session_state['exp_name'])}/data.db"
        sft_storage_type = (
            StorageType.SQL.value
            if "://" in st.session_state["sft_warmup_dataset_path"]
            else StorageType.FILE.value
        ) # TODO
+
+        buffer_config = {
+            "batch_size": st.session_state["train_batch_size"],
+            "total_epochs": st.session_state["total_epochs"],
+            "explorer_input": {
+                "taskset": {
+                    "name": "taskset",
+                    "storage_type": StorageType.FILE.value,
+                    "path": st.session_state["taskset_path"],
+                    "split": st.session_state["taskset_split"],
+                    "subset_name": st.session_state["taskset_subset_name"],
+                    "format": {
+                        "prompt_key": st.session_state["taskset_prompt_key"],
+                        "response_key": st.session_state["taskset_response_key"],
+                    },
+                    "rollout_args": {
+                        "temperature": st.session_state["temperature"],
+                        "logprobs": st.session_state["logprobs"],
+                    },
+                },
+                "eval_tasksets": [],
+                "default_workflow_type": st.session_state["default_workflow_type"],
+                "default_reward_fn_type": st.session_state["default_reward_fn_type"],
+                "system_prompt": st.session_state["system_prompt"],
+                "reply_prefix": st.session_state["reply_prefix"],
+            },
+            "trainer_input": {
+                "experience_buffer": {
+                    "name": "experience_buffer",
+                    "storage_type": st.session_state["storage_type"],
+                    "path": experience_buffer_path,
+                },
+                "sft_warmup_steps": st.session_state["sft_warmup_steps"],
+            },
+            "max_retry_times": st.session_state["buffer_max_retry_times"],
+            "max_retry_interval": st.session_state["max_retry_interval"],
+        }
+
+        for idx in range(st.session_state["_eval_tasksets_num"]):
+            if st.session_state[f"eval_taskset_{idx}_path"].strip():
+                buffer_config["explorer_input"]["eval_tasksets"].append(
+                    {
+                        "name": st.session_state[f"eval_taskset_{idx}_name"],
+                        "path": st.session_state[f"eval_taskset_{idx}_path"],
+                        "subset_name": st.session_state[f"eval_taskset_{idx}_subset_name"],
+                        "split": st.session_state[f"eval_taskset_{idx}_split"],
+                        "prompt_key": st.session_state[f"eval_taskset_{idx}_prompt_key"],
+                        "response_key": st.session_state[f"eval_taskset_{idx}_response_key"],
+                    }
+                )
+        if st.session_state["algorithm_type"] == AlgorithmType.DPO.value:
+            experience_buffer = buffer_config["trainer_input"]["experience_buffer"]
+            experience_buffer["split"] = st.session_state["dpo_dataset_train_split"]
+            experience_buffer["format"] = {
+                "prompt_type": st.session_state["dpo_dataset_prompt_type"],
+                "prompt_key": st.session_state["dpo_dataset_prompt_key"],
+                "chosen_key": st.session_state["dpo_dataset_chosen_key"],
+                "rejected_key": st.session_state["dpo_dataset_rejected_key"],
+            }
+        if st.session_state["sft_warmup_dataset_path"].strip():
+            buffer_config["trainer_input"]["sft_warmup_dataset"] = {
+                "name": "sft_warmup_dataset",
+                "storage_type": sft_storage_type,
+                "path": st.session_state["sft_warmup_dataset_path"],
+                "split": st.session_state["sft_warmup_train_split"],
+                "format": {
+                    "prompt_type": st.session_state["sft_warmup_prompt_type"],
+                    "messages_key": st.session_state["sft_warmup_messages_key"],
+                    "prompt_key": st.session_state["sft_warmup_prompt_key"],
+                    "response_key": st.session_state["sft_warmup_response_key"],
+                },
+            }
+
+        return buffer_config
+
+    def _gen_explorer_config(self):
+        explorer_config = {
+            "runner_num": st.session_state["runner_num"],
+            "max_timeout": st.session_state["max_timeout"],
+            "max_retry_times": st.session_state["explorer_max_retry_times"],
+            "rollout_model": {
+                "engine_type": st.session_state["engine_type"],
+                "engine_num": st.session_state["engine_num"],
+                "tensor_parallel_size": st.session_state["tensor_parallel_size"],
+                "use_v1": st.session_state["use_v1"],
+                "enforce_eager": st.session_state["enforce_eager"],
+                "enable_prefix_caching": st.session_state["enable_prefix_caching"],
+                "enable_chunked_prefill": st.session_state["enable_chunked_prefill"],
+                "gpu_memory_utilization": st.session_state["gpu_memory_utilization"],
+                "dtype": st.session_state["dtype"],
+                "seed": st.session_state["seed"],
+                # "max_prompt_tokens": None, # TODO
+                # "max_response_tokens": None, # TODO
+                # "chat_template": None, # TODO: add chat template
+                "enable_thinking": st.session_state["enable_thinking"],
+                "enable_openai_api": st.session_state["enable_openai_api"],
+            },
+            "auxiliary_models": [],
+            "eval_interval": st.session_state["eval_interval"],
+            "eval_on_latest_checkpoint": st.session_state["eval_on_latest_checkpoint"],
+        }
+        return explorer_config
+
+    def generate_config(self):
        if st.session_state["trainer_type"] == "verl":
-            trainer_config = self._generate_verl_config(
-                trainer_nnodes=trainer_nnodes, trainer_n_gpus_per_node=trainer_n_gpus_per_node
-            )
+            trainer_config = self._generate_verl_config()
        else:
            raise ValueError(f"Invalid trainer type: {st.session_state['trainer_type']}")
@@ -1623,12 +1653,15 @@ def generate_config(self):
        config = {
            "mode": st.session_state["mode"],
            "project": st.session_state["project"],
-            "name": st.session_state["name"],
-            "checkpoint_root_dir": st.session_state["checkpoint_path"],
+            "name": st.session_state["exp_name"],
+            "checkpoint_root_dir": st.session_state["checkpoint_root_dir"],
            "algorithm": {
                "algorithm_type": st.session_state["algorithm_type"],
                "repeat_times": st.session_state["repeat_times"],
+                "gamma": st.session_state["gamma"],
+                "lam": st.session_state["lam"],
            },
+            "data_processor": {}, # TODO: Add data processor config
            "model": {
                "model_path": st.session_state["model_path"],
                "max_prompt_tokens": st.session_state["max_prompt_tokens"],
@@ -1638,75 +1671,27 @@ def generate_config(self):
                "node_num": st.session_state["node_num"],
                "gpu_per_node": st.session_state["gpu_per_node"],
            },
-            "buffer": {
-                "total_epochs": st.session_state["total_epochs"],
-                "batch_size": st.session_state["train_batch_size"],
-                "max_retry_times": st.session_state["buffer_max_retry_times"],
-                "max_retry_interval": st.session_state["max_retry_interval"],
-                "explorer_input": {
-                    "taskset": {
-                        "name": "taskset",
-                        "storage_type": StorageType.FILE.value,
-                        "path": st.session_state["taskset_path"],
-                        "split": st.session_state["taskset_split"],
-                        "subset_name": st.session_state["taskset_subset_name"],
-                        "format": {
-                            "prompt_key": st.session_state["taskset_prompt_key"],
-                            "response_key": st.session_state["taskset_response_key"],
-                        },
-                        "rollout_args": {
-                            "n": st.session_state["repeat_times"],
-                            "temperature": st.session_state["temperature"],
-                            "top_p": st.session_state["top_p"],
-                            "top_k": st.session_state["top_k"],
-                            "logprobs": st.session_state["logprobs"],
-                        },
-                    },
-                    "eval_tasksets": [], # TODO: add eval tasksets
-                    "default_workflow_type": st.session_state["default_workflow_type"],
st.session_state["default_workflow_type"], - "default_reward_fn_type": st.session_state["default_reward_fn_type"], - "system_prompt": st.session_state["system_prompt"], - "reply_prefix": st.session_state["reply_prefix"], - }, - "trainer_input": { - "experience_buffer": { - "name": "experience_buffer", - "storage_type": st.session_state["storage_type"], - "path": experience_buffer_path, - }, - "sft_warmup_steps": st.session_state["sft_warmup_steps"], - }, - }, - "explorer": { - "eval_interval": st.session_state["eval_interval"], - "engine_type": st.session_state["engine_type"], - "engine_num": st.session_state["engine_num"], - "runner_num": st.session_state["runner_num"], - # "chat_template": None, # TODO: add chat template - "tensor_parallel_size": st.session_state["tensor_parallel_size"], - "enable_prefix_caching": st.session_state["enable_prefix_caching"], - "enforce_eager": st.session_state["enforce_eager"], - "dtype": st.session_state["dtype"], - "seed": st.session_state["seed"], - "gpu_memory_utilization": st.session_state["gpu_memory_utilization"], - "enable_chunked_prefill": st.session_state["enable_chunked_prefill"], - "use_v1": True, - "max_timeout": st.session_state["max_timeout"], - "max_retry_times": st.session_state["explorer_max_retry_times"], - }, - "synchronizer": { - "sync_method": st.session_state["sync_method"], - "sync_interval": st.session_state["sync_interval"], - "sync_timeout": st.session_state["sync_timeout"], - }, + "buffer": self._gen_buffer_config(), + "explorer": self._gen_explorer_config(), "trainer": { "trainer_type": st.session_state["trainer_type"], - "trainer_config": trainer_config, "save_interval": st.session_state["save_interval"], + "enable_preview": True, # TODO + "actor_use_kl_loss": st.session_state["actor_use_kl_loss"], + "actor_kl_loss_coef": st.session_state["actor_kl_loss_coef"], + "actor_entropy_coef": st.session_state["actor_entropy_coef"], + "actor_grad_clip": st.session_state["actor_grad_clip"], + "actor_clip_ratio": st.session_state["actor_clip_ratio"], + "trainer_config": trainer_config, }, "monitor": { "monitor_type": st.session_state["monitor_type"], }, + "synchronizer": { + "sync_method": st.session_state["sync_method"], + "sync_interval": st.session_state["sync_interval"], + "sync_timeout": st.session_state["sync_timeout"], + }, } if st.session_state["adv_estimator"] == AdvantageEstimator.GAE.value: @@ -1716,40 +1701,6 @@ def generate_config(self): else st.session_state["model_path"] ) - for idx in range(st.session_state["_eval_tasksets_num"]): - if st.session_state[f"eval_taskset_{idx}_path"].strip(): - config["buffer"]["explorer_input"]["eval_tasksets"].append( - { - "name": st.session_state[f"eval_taskset_{idx}_name"], - "path": st.session_state[f"eval_taskset_{idx}_path"], - "subset_name": st.session_state[f"eval_taskset_{idx}_subset_name"], - "split": st.session_state[f"eval_taskset_{idx}_split"], - "prompt_key": st.session_state[f"eval_taskset_{idx}_prompt_key"], - "response_key": st.session_state[f"eval_taskset_{idx}_response_key"], - } - ) - if st.session_state["algorithm_type"] == AlgorithmType.DPO.value: - experience_buffer = config["buffer"]["trainer_input"]["experience_buffer"] - experience_buffer["split"] = st.session_state["dpo_dataset_train_split"] - experience_buffer["format"] = { - "prompt_type": st.session_state["dpo_dataset_prompt_type"], - "prompt_key": st.session_state["dpo_dataset_prompt_key"], - "chosen_key": st.session_state["dpo_dataset_chosen_key"], - "rejected_key": 
st.session_state["dpo_dataset_rejected_key"], - } - if st.session_state["sft_warmup_dataset_path"].strip(): - config["buffer"]["trainer_input"]["sft_warmup_dataset"] = { - "name": "sft_warmup_dataset", - "storage_type": sft_storage_type, - "path": st.session_state["sft_warmup_dataset_path"], - "split": st.session_state["sft_warmup_train_split"], - "format": { - "prompt_type": st.session_state["sft_warmup_prompt_type"], - "messages_key": st.session_state["sft_warmup_messages_key"], - "prompt_key": st.session_state["sft_warmup_prompt_key"], - "response_key": st.session_state["sft_warmup_response_key"], - }, - } st.session_state.config_generated = True st.header("Generated Config File") buttons = st.container() @@ -1758,7 +1709,7 @@ def generate_config(self): save_btn.download_button( "Save", data=yaml_config, - file_name=f"{config['monitor']['project']}-{config['monitor']['name']}.yaml", + file_name=f"{config['project']}-{config['name']}.yaml", mime="text/plain", icon=":material/download:", use_container_width=True, diff --git a/trinity/trainer/verl_trainer.py b/trinity/trainer/verl_trainer.py index 090a5ff881..7590d6075b 100644 --- a/trinity/trainer/verl_trainer.py +++ b/trinity/trainer/verl_trainer.py @@ -416,17 +416,6 @@ def train_rft_step(self, experiences: Experiences) -> Tuple[bool, int]: actor_output_metrics = reduce_metrics(actor_output.meta_info["metrics"]) metrics.update(actor_output_metrics) - # validate - if ( - self.val_reward_fn is not None - and self.config.trainer.test_freq > 0 - and self.global_steps % self.config.trainer.test_freq == 0 - ): - pass # TODO: may add validation - # with _timer("testing", timing_raw): - # val_metrics: dict = self._validate() - # metrics.update(val_metrics) - if ( self.config.trainer.save_freq > 0 and self.global_steps % self.config.trainer.save_freq == 0