diff --git a/apps/grpo/qwen3_32b.yaml b/apps/grpo/qwen3_32b.yaml
new file mode 100644
index 000000000..3d1b80852
--- /dev/null
+++ b/apps/grpo/qwen3_32b.yaml
@@ -0,0 +1,144 @@
+# Grouped Relative Policy Optimization (GRPO)
+# >>> python -m apps.grpo.main --config apps/grpo/qwen3_32b.yaml
+# NOTE - This has not been tested for correctness yet! All testing so far has been only for infrastructure stability.
+
+# Global configuration
+group_size: 2
+batch_size: 8
+max_req_tokens: 512
+max_res_tokens: 512
+model: "Qwen/Qwen3-32B"
+off_by_n: 1 # Off by one by default
+
+# Main loop configuration
+rollout_threads: 1 # Recommended to set equal to policy.num_replicas
+
+# Observability configuration
+metric_logging:
+  wandb:
+    project: "grpo-training"
+    group: "grpo_exp_${oc.env:USER}"
+    reduce_across_ranks: True
+  console:
+    reduce_across_ranks: True
+
+# Dataset configuration
+dataset:
+  path: "openai/gsm8k"
+  revision: "main"
+  data_split: "train"
+  streaming: true
+  model: ${model}
+
+# Policy configuration
+policy:
+  engine_config:
+    model: ${model}
+    tensor_parallel_size: 4
+    pipeline_parallel_size: 1
+    enforce_eager: false
+  sampling_config:
+    n: ${group_size}
+    max_tokens: ${max_res_tokens}
+    temperature: 1.0
+    top_p: 1.0
+
+# Trainer configuration
+trainer:
+  model:
+    name: qwen3
+    flavor: 32B
+    hf_assets_path: hf://${model}
+  optimizer:
+    name: AdamW
+    lr: 1e-5
+    eps: 1e-8
+  lr_scheduler:
+    warmup_steps: 1
+  training:
+    local_batch_size: ${batch_size}
+    seq_len: 2048
+    max_norm: 1.0
+    steps: 1000000
+    dtype: bfloat16
+    gc_freq: 1
+  compile:
+    enable: false
+  parallelism:
+    data_parallel_replicate_degree: 1
+    data_parallel_shard_degree: -1
+    tensor_parallel_degree: 1
+    pipeline_parallel_degree: 1
+    context_parallel_degree: 1
+    expert_parallel_degree: 1
+    disable_loss_parallel: true
+  checkpoint:
+    enable: true
+    initial_load_path: hf://${model}
+    initial_load_in_hf: true
+    last_save_in_hf: true
+    interval: 500
+    async_mode: "disabled"
+  activation_checkpoint:
+    mode: full
+
+# Replay buffer configuration
+replay_buffer:
+  batch_size: ${batch_size}
+  max_policy_age: ${off_by_n}
+  # dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree
+  dp_size: 8
+
+# Reference model configuration
+ref_model:
+  model:
+    name: qwen3
+    flavor: 32B
+    hf_assets_path: hf://${model}
+  training:
+    dtype: bfloat16
+    gc_freq: 1
+  compile:
+    enable: false
+  parallelism:
+    data_parallel_replicate_degree: 1
+    data_parallel_shard_degree: 1
+    tensor_parallel_degree: 4
+    pipeline_parallel_degree: 1
+    context_parallel_degree: 1
+    expert_parallel_degree: 1
+  checkpoint:
+    enable: true
+    initial_load_path: hf://${model}
+    initial_load_in_hf: true
+
+# All resource allocations
+services:
+  policy:
+    procs: ${policy.engine_config.tensor_parallel_size}
+    num_replicas: 1
+    hosts: 1
+    with_gpus: true
+  ref_model:
+    procs: ${ref_model.parallelism.tensor_parallel_degree}
+    num_replicas: 1
+    with_gpus: true
+  reward_actor:
+    procs: 1
+    num_replicas: 1
+    with_gpus: false
+
+actors:
+  dataset:
+    procs: 1
+    with_gpus: false
+  trainer:
+    procs: 8
+    hosts: 1
+    with_gpus: true
+  replay_buffer:
+    procs: 1
+    with_gpus: false
+  compute_advantages:
+    procs: 1
+    with_gpus: false
diff --git a/apps/grpo/qwen3_multinode.yaml b/apps/grpo/qwen3_multinode.yaml
deleted file mode 100644
index 47c8cdd0e..000000000
--- a/apps/grpo/qwen3_multinode.yaml
+++ /dev/null
@@ -1,86 +0,0 @@
-# GRPO Training Configuration
-# Currently a fork of the main yaml, this just shows
-# placement of trainer and inference servers on separate hosts.
-# >>> python -m apps.grpo.main --config apps/grpo/qwen3_multinode.yaml
-
-# Global configuration
-group_size: 8
-batch_size: 16
-max_req_tokens: 512
-max_res_tokens: 512
-model: "Qwen/Qwen3-1.7B"
-
-# Observability configuration
-metric_logging:
-  wandb:
-    project: "grpo-training"
-    group: "grpo_exp_${oc.env:USER}"
-    reduce_across_ranks: True
-  console:
-    reduce_across_ranks: True
-
-# Dataset configuration
-dataset:
-  path: "openai/gsm8k"
-  revision: "main"
-  data_split: "train"
-  streaming: true
-  model: ${model}
-
-# Policy configuration
-policy:
-  engine_config:
-    model: ${model}
-    tensor_parallel_size: 1
-    pipeline_parallel_size: 1
-    enforce_eager: false
-  sampling_config:
-    n: ${group_size}
-    max_tokens: ${max_res_tokens}
-    temperature: 1.0
-    top_p: 1.0
-
-# Trainer configuration
-trainer:
-  model_name: ${model}
-  learning_rate: 1e-5
-
-# Replay buffer configuration
-replay_buffer:
-  batch_size: ${batch_size}
-  max_policy_age: 1 # Async by 1
-  dp_size: 1
-
-# Reference model configuration
-ref_model:
-  model_name: ${model}
-
-services:
-  policy:
-    procs: 1
-    hosts: 1
-    num_replicas: 1
-    with_gpus: true
-  ref_model:
-    procs: 1
-    num_replicas: 1
-    with_gpus: true
-  reward_actor:
-    procs: 1
-    num_replicas: 1
-    with_gpus: false
-
-actors:
-  dataset:
-    procs: 1
-    with_gpus: false
-  compute_advantages:
-    procs: 1
-    with_gpus: false
-  trainer:
-    procs: 1
-    hosts: 1
-    with_gpus: true
-  replay_buffer:
-    procs: 1
-    with_gpus: false