# Grouped Relative Policy Optimization (GRPO)
# >>> python -m apps.grpo.main --config apps/grpo/qwen32b.yaml
# NOTE - This has not been tested for correctness yet! All testing so far has covered infrastructure stability only.

# Global configuration
group_size: 2
batch_size: 8
max_req_tokens: 512
max_res_tokens: 512
model: "Qwen/Qwen3-32B"
off_by_n: 1 # Off by one by default
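# Interpretation of the global knobs (descriptive notes, not verified against the app code):
# - group_size: completions sampled per prompt; GRPO scores each completion relative to its group's mean reward.
# - max_req_tokens / max_res_tokens: prompt and response budgets, i.e. up to 512 + 512 = 1024 tokens per sample.
# - off_by_n: maximum allowed policy staleness; rollouts may lag the trainer by up to n policy versions
#   (it feeds replay_buffer.max_policy_age below).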

# Main loop configuration
rollout_threads: 1   # Recommended to set equal to policy.num_replicas

# Observability configuration
metric_logging:
  wandb:
    project: "grpo-training"
    group: "grpo_exp_${oc.env:USER}"
    reduce_across_ranks: True
  console:
    reduce_across_ranks: True
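# ${oc.env:USER} is an OmegaConf resolver that reads the USER environment variable,
# so each user's runs land in their own W&B group.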

# Dataset configuration
dataset:
  path: "openai/gsm8k"
  revision: "main"
  data_split: "train"
  streaming: true
  model: ${model}

# Policy configuration
policy:
  engine_config:
    model: ${model}
    tensor_parallel_size: 4
    pipeline_parallel_size: 1
    enforce_eager: false
  sampling_config:
    n: ${group_size}
    max_tokens: ${max_res_tokens}
    temperature: 1.0
    top_p: 1.0
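# engine_config mirrors vLLM-style engine arguments (tensor_parallel_size, enforce_eager, ...).
# sampling_config requests n = group_size completions per prompt; temperature = 1.0 and top_p = 1.0
# sample from the unmodified policy distribution, keeping rollouts stochastic for the group-relative
# advantage estimate.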

# Trainer configuration
trainer:
  model:
    name: qwen3
    flavor: 32B
    hf_assets_path: hf://${model}
  optimizer:
    name: AdamW
    lr: 1e-5
    eps: 1e-8
  lr_scheduler:
    warmup_steps: 1
  training:
    local_batch_size: ${batch_size}
    seq_len: 2048
    max_norm: 1.0
    steps: 1000000
    dtype: bfloat16
    gc_freq: 1
  compile:
    enable: false
  parallelism:
    data_parallel_replicate_degree: 1
    data_parallel_shard_degree: -1
    tensor_parallel_degree: 1
    pipeline_parallel_degree: 1
    context_parallel_degree: 1
    expert_parallel_degree: 1
    disable_loss_parallel: true
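  # Assuming torchtitan-style semantics, data_parallel_shard_degree: -1 shards over all remaining
  # trainer procs; with 8 trainer procs (see actors.trainer below) and every other degree set to 1,
  # this resolves to an FSDP degree of 8.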
  checkpoint:
    enable: true
    initial_load_path: hf://${model}
    initial_load_in_hf: true
    last_save_in_hf: true
    interval: 500
    async_mode: "disabled"
  activation_checkpoint:
    mode: full
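  # Initial weights come from the Hugging Face checkpoint at hf://${model}; a training checkpoint is
  # presumably written every 500 steps (interval). activation_checkpoint mode "full" recomputes
  # activations during the backward pass, trading compute for memory on the 32B model.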

# Replay buffer configuration
replay_buffer:
  batch_size: ${batch_size}
  max_policy_age: ${off_by_n}
  # dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree
  dp_size: 8
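  # dp_size is hardcoded because the interpolation above would resolve to -1 (the trainer's shard
  # degree is left at "use all remaining procs"); with 8 trainer procs the effective DP degree is 8,
  # presumably so the buffer can split each batch across the trainer's 8 data-parallel ranks.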

# Reference model configuration
ref_model:
  model:
    name: qwen3
    flavor: 32B
    hf_assets_path: hf://${model}
  training:
    dtype: bfloat16
    gc_freq: 1
  compile:
    enable: false
  parallelism:
    data_parallel_replicate_degree: 1
    data_parallel_shard_degree: 1
    tensor_parallel_degree: 4
    pipeline_parallel_degree: 1
    context_parallel_degree: 1
    expert_parallel_degree: 1
  checkpoint:
    enable: true
    initial_load_path: hf://${model}
    initial_load_in_hf: true
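  # The reference model uses the same tensor-parallel degree (4) as the policy engine, so each
  # replica fits the same 4-GPU footprint; per standard GRPO it is loaded from hf://${model} and
  # kept frozen as the KL reference rather than trained.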

# All resource allocations
services:
  policy:
    procs: ${policy.engine_config.tensor_parallel_size}
    num_replicas: 1
    hosts: 1
    with_gpus: true
  ref_model:
    procs: ${ref_model.parallelism.tensor_parallel_degree}
    num_replicas: 1
    with_gpus: true
  reward_actor:
    procs: 1
    num_replicas: 1
    with_gpus: false

actors:
  dataset:
    procs: 1
    with_gpus: false
  trainer:
    procs: 8
    hosts: 1
    with_gpus: true
  replay_buffer:
    procs: 1
    with_gpus: false
  compute_advantages:
    procs: 1
    with_gpus: false
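# Rough GPU accounting, assuming one GPU per proc: policy 4 (TP=4) + ref_model 4 (TP=4) + trainer 8
# = 16 GPUs total; the CPU-only actors (dataset, replay_buffer, compute_advantages, reward_actor)
# add no GPU demand.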