# GRPO Algorithm Configuration
grpo:
  num_prompts_per_step: 128
  num_generations_per_prompt: 16
  max_rollout_turns: 1 # for multi-turn rollouts; math environments have a single turn (answering the question)
  max_num_epochs: 1
  max_num_steps: 1000000
  normalize_rewards: true
  use_leave_one_out_baseline: true
  val_period: 10
  val_at_start: false
  overlong_filtering: false
  max_val_samples: 256
  val_batch_size: 256
  seed: 42
  async_grpo:
    enabled: false
    max_trajectory_age_steps: 1
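  # Worked example (arithmetic from the values above, not an extra knob): each GRPO
  # step samples num_prompts_per_step * num_generations_per_prompt = 128 * 16 = 2048
  # rollouts, and the leave-one-out baseline is computed within each group of 16
  # generations for the same prompt.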

loss_fn:
  reference_policy_kl_penalty: 0.01
  ratio_clip_min: 0.2
  ratio_clip_max: 0.2
  ratio_clip_c: null
  # (default off) loss formulation improvements (docs/guides/grpo.md#loss)
  use_on_policy_kl_approximation: false
  use_importance_sampling_correction: false
  sequence_level_importance_ratios: false
  token_level_loss: true
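  # Sketch of the objective these knobs shape (standard PPO-style clipping inferred
  # from the field names, not copied from NeMo-RL source): with importance ratio
  # r = pi_theta / pi_old and advantage A, the per-token loss is
  #   -min(r * A, clip(r, 1 - ratio_clip_min, 1 + ratio_clip_max) * A)
  # plus reference_policy_kl_penalty * KL(pi_theta || pi_ref); ratio_clip_c, when
  # set, enables dual clipping for negative advantages.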

checkpointing:
  enabled: true
  checkpoint_dir: "results/grpo"
  metric_name: "val_reward"
  higher_is_better: true
  keep_top_k: 3
  save_period: 10
  checkpoint_must_save_by: null
  model_save_format: "safetensors"
  save_consolidated: false
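  # Net effect of the settings above: a checkpoint is written every 10 steps
  # (save_period) and only the 3 best by val_reward (keep_top_k, higher_is_better)
  # are retained under results/grpo.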

policy:
  model_name: "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5"
  tokenizer:
    name: "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5"
  max_total_sequence_length: 1024
  precision: "bfloat16"
  train_global_batch_size: 128
  train_micro_batch_size: 4
  logprob_batch_size: 4
  logprob_chunk_size: null
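  # Batch-size arithmetic (assuming all 32 cluster GPUs train, since generation is
  # colocated, giving data-parallel size = 32 / tensor_parallel_size(8) = 4): each
  # optimizer step consumes train_global_batch_size = 128 sequences, i.e.
  # 128 / (4 DP ranks * 4 per microbatch) = 8 gradient-accumulation steps per rank.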

  dtensor_cfg:
    _v2: true
    activation_checkpointing: true
    context_parallel_size: 1
    cpu_offload: false
    enabled: true
    sequence_parallel: false
    tensor_parallel_size: 8
    custom_parallel_plan: examples.configs.recipes.llm.llama_nemotron_super_49b_custom_plan.custom_parallel_plan
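    # Note (our reading of the dotted path, not documented here):
    # custom_parallel_plan is a Python import path resolved at runtime to a
    # model-specific sharding plan for the tensor-parallel ranks above.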

  megatron_cfg:
    enabled: false

  # See docs/design-docs/sequence-packing-and-dynamic-batching.md
  # for more details on dynamic batching and sequence packing.
  dynamic_batching:
    enabled: true
    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
    sequence_length_round: 64
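    # Token-budget arithmetic from the interpolations above:
    # train_mb_tokens   = 1024 * 4 = 4096 tokens per training microbatch,
    # logprob_mb_tokens = 1024 * 4 = 4096 tokens per logprob microbatch,
    # with batched lengths rounded up to multiples of sequence_length_round (64).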

  sequence_packing:
    enabled: false
    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
    algorithm: "modified_first_fit_decreasing"
    sequence_length_round: 64
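    # "modified_first_fit_decreasing" names a variant of the classic greedy
    # bin-packing heuristic: sequences are sorted longest-first and each is placed
    # into the first microbatch whose 4096-token budget still fits it; see the
    # design doc linked above for the exact variant.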

  make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size}
  max_grad_norm: 1.0

  optimizer:
    name: "torch.optim.AdamW"
    kwargs:
      lr: 3.0e-7
      weight_decay: 0.01
      betas: [0.9, 0.999]
      eps: 1.0e-8

  scheduler:
    - name: "torch.optim.lr_scheduler.LinearLR"
      kwargs:
        start_factor: 0.1
        end_factor: 1.0
        # The scheduler iteration is per GRPO step and is decoupled from the optimizer step (there may be >=1 optimizer steps per GRPO step)
        total_iters: 13
    - name: "torch.optim.lr_scheduler.ConstantLR"
      kwargs:
        factor: 1.0
        total_iters: 10000000000
    - milestones: [13]
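  # Worked example of the resulting LR curve (from the kwargs above): LinearLR warms
  # the AdamW lr from 0.1 * 3.0e-7 = 3.0e-8 up to 3.0e-7 over the first 13 scheduler
  # iterations; at milestone 13, ConstantLR takes over and holds 3.0e-7 thereafter.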

  generation:
    backend: "vllm"
    max_new_tokens: ${policy.max_total_sequence_length}
    temperature: 1.0
    top_p: 1.0
    top_k: null
    stop_token_ids: null
    stop_strings: null
    vllm_cfg:
      async_engine: false
      precision: ${policy.precision}
      tensor_parallel_size: 4
      pipeline_parallel_size: 1
      expert_parallel_size: 1 # When EP > 1, EP must be a multiple of TP since vLLM's EP = DP * TP
      gpu_memory_utilization: 0.6
      max_model_len: ${policy.max_total_sequence_length}
      # When enforce_eager is false, you can optionally set ++policy.generation.vllm_kwargs.compilation_config.use_inductor=False for better accuracy:
      # with that flag, vLLM uses its custom CUDA kernels instead of the Triton kernels generated by torch.compile.
      # For more details, see the convergence issue https://github.com/NVIDIA-NeMo/RL/issues/998
      enforce_eager: false
      use_deep_gemm: false
      num_last_layers_in_bf16: 0
      num_first_layers_in_bf16: 0
      vllm_kwargs: {}
    colocated:
      # true: generation shares training GPUs
      # false: uses dedicated generation resources
      enabled: true
      # only relevant when enabled is false
      resources:
        gpus_per_node: null # Number of GPUs dedicated to generation when the cluster has a single node, i.e., cluster.num_nodes == 1
        num_nodes: null # Number of nodes dedicated to generation
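    # Illustrative CLI override to run generation on dedicated GPUs instead
    # (same ++ override syntax as in the vllm_cfg comment above; values are examples):
    #   ++policy.generation.colocated.enabled=false \
    #   ++policy.generation.colocated.resources.num_nodes=1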

data:
  max_input_seq_length: ${policy.max_total_sequence_length} # upper bound; real truncation occurs at vllm.max_model_len
  prompt_file: "examples/prompts/cot.txt"
  system_prompt_file: null
  shuffle: true

  dataset_name: "OpenMathInstruct-2"
  # You can use custom response datasets for training and validation. For example:
  # data:
  #   dataset_name: ResponseDataset
  #   train_data_path: <PathToTrainingDataset> # e.g., /path/to/local/dataset.jsonl or hf_org/hf_dataset_name (HuggingFace)
  #   val_data_path: <PathToValidationDataset>
  #   input_key: <QuestionKey>, default is "input"
  #   output_key: <AnswerKey>, default is "output"
  #   train_split: <TrainSplit>, default is None # used for HuggingFace datasets
  #   val_split: <ValSplit>, default is None # used for HuggingFace datasets
  # See https://github.com/NVIDIA-NeMo/RL/blob/main/docs/guides/grpo.md#datasets for more details.
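  # Illustrative JSONL record for a local ResponseDataset with the default keys
  # (made-up content; "input"/"output" as documented above):
  #   {"input": "What is 2 + 2?", "output": "4"}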

env:
  math:
    num_workers: 8

logger:
  log_dir: "logs" # Base directory for all logs
  num_val_samples_to_print: 0
  wandb_enabled: true # Make sure you run `wandb login [Your API key]` before launching
  tensorboard_enabled: false
  mlflow_enabled: false
  monitor_gpus: false # If true, monitors GPU usage and logs to wandb and/or tensorboard
  wandb:
    project: "grpo-nemotron-super-49b"
    name: "grpo-${data.dataset_name}-nemotron-super-49b-tp${policy.dtensor_cfg.tensor_parallel_size}"
  tensorboard: {}
  mlflow:
    experiment_name: "grpo-dev"
    run_name: "grpo-nemotron-super-49b"
  gpu_monitoring:
    collection_interval: 10 # How often to collect GPU usage metrics (in seconds)
    flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds)
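  # Illustrative CLI override to swap loggers (same ++ override syntax referenced
  # in the vllm_cfg comment above):
  #   ++logger.wandb_enabled=false ++logger.tensorboard_enabled=true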

cluster:
  gpus_per_node: 8
  num_nodes: 4
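  # Resulting topology: 4 nodes * 8 GPUs = 32 GPUs total. With colocated generation,
  # training shards each replica across tensor_parallel_size=8 GPUs (4 data-parallel
  # replicas), while vLLM runs inference with tensor_parallel_size=4 on the same GPUs.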