data:
  tokenizer: null
  train_files: train_example.parquet
  val_files: test_example.parquet
  prompt_key: prompt
  max_prompt_length: 4096
  max_response_length: 16384
  train_batch_size: 96
  val_batch_size: null
  return_raw_input_ids: False  # set to True when the policy and the reward model use different tokenizers
  return_raw_chat: False
  shuffle: True
  filter_overlong_prompts: False  # for large-scale datasets, filtering overlong prompts can be time-consuming; disable this and set `truncation: left` instead
  truncation: error
  image_key: images

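# Rough batch accounting (a sketch of how verl typically consumes these values;
# exact semantics vary across versions): each step draws data.train_batch_size
# prompts and the rollout samples rollout.n responses per prompt, so if n is
# honored (see the note on `n` below), 96 prompts * 8 samples = 768 trajectories
# per step; PPO updates then split the batch into actor.ppo_mini_batch_size
# chunks, processed ppo_micro_batch_size_per_gpu sequences at a time per GPU
# via gradient accumulation.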
actor_rollout_ref:
  hybrid_engine: True
  model:
    path: /PATH/TO/MODEL/CHECKPOINT/
    external_lib: null
    override_config: { }
    enable_gradient_checkpointing: True
    use_remove_padding: False
  actor:
    strategy: fsdp  # kept for backward compatibility
    ppo_mini_batch_size: 1536
    # ppo_micro_batch_size: 8  # will be deprecated, use ppo_micro_batch_size_per_gpu
    ppo_micro_batch_size_per_gpu: 1
    use_dynamic_bsz: False
    ppo_max_token_len_per_gpu: 16384  # n * ${data.max_prompt_length} + ${data.max_response_length}
    grad_clip: 1.0
    clip_ratio: 0.2
    entropy_coeff: 0.001
    use_kl_loss: True  # True for GRPO
    kl_loss_coef: 0.001  # for GRPO
    kl_loss_type: low_var_kl  # for GRPO
    ppo_epochs: 1
    shuffle: False
    ulysses_sequence_parallel_size: 1  # sequence-parallel (SP) size
    optim:
      lr: 1e-6
      lr_warmup_steps_ratio: 0.  # the total number of steps is injected at runtime
      # min_lr_ratio: null  # only used for cosine warmup
      warmup_style: constant  # select from constant/cosine
      total_training_steps: -1  # must be overridden by the program
    fsdp_config:
      wrap_policy:
        # transformer_layer_cls_to_wrap: None
        min_num_params: 0
      param_offload: False
      optimizer_offload: False
      fsdp_size: -1
  ref:
    fsdp_config:
      param_offload: False
      wrap_policy:
        # transformer_layer_cls_to_wrap: None
        min_num_params: 0
    # log_prob_micro_batch_size: 4  # will be deprecated, use log_prob_micro_batch_size_per_gpu
    log_prob_micro_batch_size_per_gpu: 1
    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
    ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size}  # SP size
  rollout:
    name: vllm
    temperature: 1.0
    top_k: -1  # 0 for HF rollout, -1 for vLLM rollout
    top_p: 1
    use_fire_sampling: False  # https://arxiv.org/abs/2410.21236
    prompt_length: ${data.max_prompt_length}  # not used by the open-source rollout
    response_length: ${data.max_response_length}
    # for vLLM rollout
    dtype: bfloat16  # should align with the FSDP dtype
    gpu_memory_utilization: 0.4
    ignore_eos: False
    enforce_eager: True
    free_cache_engine: True
    load_format: dummy_dtensor
    tensor_model_parallel_size: 1
    max_num_batched_tokens: 8192
    max_model_len: null
    max_num_seqs: 1024
    # log_prob_micro_batch_size: 8  # will be deprecated, use log_prob_micro_batch_size_per_gpu
    log_prob_micro_batch_size_per_gpu: 1
    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
    disable_log_stats: True
    enable_chunked_prefill: True  # can yield higher throughput
    # for HF rollout
    do_sample: True
    # number of responses sampled per prompt
    n: 8  # should be > 1 for GRPO; currently an unused parameter in this setup

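# NOTE (an assumption based on verl's GRPO recipe, not stated in this file):
# with algorithm.adv_estimator set to grpo below, no value function is trained,
# so this critic section is effectively inert and is kept only to satisfy the
# config schema.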
critic:
  strategy: fsdp
  optim:
    lr: 1e-5
    lr_warmup_steps_ratio: 0.  # the total number of steps is injected at runtime
    # min_lr_ratio: null  # only used for cosine warmup
    warmup_style: constant  # select from constant/cosine
    total_training_steps: -1  # must be overridden by the program
  model:
    path: /PATH/TO/MODEL/CHECKPOINT/
    tokenizer_path: ${actor_rollout_ref.model.path}
    override_config: { }
    external_lib: ${actor_rollout_ref.model.external_lib}
    enable_gradient_checkpointing: True
    use_remove_padding: False
    fsdp_config:
      param_offload: False
      optimizer_offload: False
      wrap_policy:
        # transformer_layer_cls_to_wrap: None
        min_num_params: 0
      fsdp_size: -1
  ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
  # ppo_micro_batch_size: 8  # will be deprecated, use ppo_micro_batch_size_per_gpu
  ppo_micro_batch_size_per_gpu: 1
  forward_micro_batch_size: ${critic.ppo_micro_batch_size}
  forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
  ppo_max_token_len_per_gpu: 16384  # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
  forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
  ulysses_sequence_parallel_size: 1  # SP size
  ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
  shuffle: ${actor_rollout_ref.actor.shuffle}
  grad_clip: 1.0
  cliprange_value: 0.5

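# NOTE: with enable: False, no reward-model worker is launched; rewards are
# instead expected to come from rule-based scoring or from the
# custom_reward_function section below (exact dispatch is version-dependent).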
reward_model:
  enable: False
  strategy: fsdp
  model:
    input_tokenizer: ${actor_rollout_ref.model.path}  # set to null if the chat templates are identical
    path: ~/models/FsfairX-LLaMA3-RM-v0.1
    external_lib: ${actor_rollout_ref.model.external_lib}
    use_remove_padding: False
    fsdp_config:
      min_num_params: 0
      param_offload: False
      fsdp_size: -1
  # micro_batch_size: null  # will be deprecated, use micro_batch_size_per_gpu
  # micro_batch_size_per_gpu: 2  # set a number
  # max_length: null
  ulysses_sequence_parallel_size: 1  # SP size
  use_dynamic_bsz: ${critic.use_dynamic_bsz}
  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}

custom_reward_function:
  path: null
  name: compute_score

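# GRPO advantage, a sketch of the standard estimator (verl's implementation may
# differ in details such as the epsilon term): for the n responses sampled from
# one prompt with rewards r_1..r_n,
#   A_i = (r_i - mean(r)) / (std(r) + eps)
# Under this estimator, gamma and lam below only matter for GAE-style
# estimation and should be inert here.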
algorithm:
  gamma: 1.0
  lam: 1.0
  adv_estimator: grpo
  kl_penalty: kl  # how the KL divergence is estimated
  kl_ctrl:
    type: fixed
    kl_coef: 0.001

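# NOTE (a reading of verl's GRPO setup; behavior may vary across versions):
# two KL knobs appear in this file. With use_kl_loss: True, the actor adds the
# KL term directly to its loss (weighted by kl_loss_coef above), while
# algorithm.kl_ctrl configures the in-reward KL penalty used by PPO-style runs;
# with the settings here, the actor-side loss is the operative one.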
trainer:
  balance_batch: True
  total_epochs: 15
  # total_training_steps: null
  project_name: sciworld
  experiment_name: sciworld_RFT
  logger: [ 'wandb' ]
  val_generations_to_log_to_wandb: 0
  nnodes: 1
  n_gpus_per_node: 2
  save_freq: 1
  # auto: resume from the last checkpoint if one exists; otherwise start from scratch
  resume_mode: auto  # select from disable/auto/resume_path (resume_path requires resume_from_path to be set)
  resume_from_path: False
  test_freq: 100
  critic_warmup: 0
  default_hdfs_dir: null
  remove_previous_ckpt_in_save: False
  del_local_ckpt_after_load: False
  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
  val_before_train: False
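# Example launch (a sketch assuming verl's standard Hydra entrypoint; the
# module path and override syntax are the common verl invocation, not taken
# from this file):
#   python3 -m verl.trainer.main_ppo \
#       data.train_files=train_example.parquet \
#       data.val_files=test_example.parquet \
#       actor_rollout_ref.model.path=/PATH/TO/MODEL/CHECKPOINT/ \
#       trainer.nnodes=1 \
#       trainer.n_gpus_per_node=2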