131 changes: 131 additions & 0 deletions apps/grpo/qwen3_8b.yaml
@@ -0,0 +1,131 @@
# Grouped Relative Policy Optimization (GRPO)
# >>> python -m apps.grpo.main --config apps/grpo/qwen3_8b.yaml

# Global configuration
group_size: 8
batch_size: 16
max_req_tokens: 512
max_res_tokens: 512
model: "Qwen/Qwen3-8B"
off_by_n: 1 # Off by one by default
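# Note on how these knobs are reused below: group_size becomes
# policy.sampling_config.n, so each prompt yields group_size completions;
# batch_size becomes trainer.training.local_batch_size and
# replay_buffer.batch_size; off_by_n feeds replay_buffer.max_policy_age,
# bounding how many policy versions old an episode may be when it is trained on.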

# Dataset configuration
dataset:
path: "openai/gsm8k"
revision: "main"
data_split: "train"
streaming: true
model: ${model}

# Policy configuration
policy:
  engine_config:
    model: ${model}
    tensor_parallel_size: 2
    pipeline_parallel_size: 1
    enforce_eager: false
  sampling_config:
    n: ${group_size}
    max_tokens: ${max_res_tokens}
    temperature: 1.0
    top_p: 1.0
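    # With n equal to group_size, a single generation request returns the whole
    # GRPO group for one prompt; temperature and top_p of 1.0 leave the policy's
    # sampling distribution unchanged.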

# Trainer configuration
trainer:
  model:
    name: qwen3
    flavor: 8B
    hf_assets_path: hf://${model}
  optimizer:
    name: AdamW
    lr: 1e-5
    eps: 1e-8
  lr_scheduler:
    warmup_steps: 1
  training:
    local_batch_size: ${batch_size}
    seq_len: 2048
    max_norm: 1.0
    steps: 1000000
    dtype: bfloat16
  compile:
    enable: false
  parallelism:
    data_parallel_replicate_degree: 1
    data_parallel_shard_degree: -1
    tensor_parallel_degree: 1
    pipeline_parallel_degree: 1
    context_parallel_degree: 1
    expert_parallel_degree: 1
    disable_loss_parallel: true
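  # data_parallel_shard_degree: -1 shards over whatever ranks remain after the
  # other parallel dims; with services.trainer.procs: 2 and every other degree
  # at 1, that is full FSDP over 2 GPUs (the dp_size the replay buffer expects).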
  checkpoint:
    enable: true
    initial_load_path: hf://${model}
    initial_load_in_hf: true
    last_save_in_hf: true
    interval: 500
    async_mode: "disabled"
  activation_checkpoint:
    mode: selective
    selective_ac_option: op

# Replay buffer configuration
replay_buffer:
  batch_size: ${batch_size}
  max_policy_age: ${off_by_n}
  # This should match the dp_size of TorchTitan
  # Here it's set explicitly to 2, because we've set
  # 2 GPUs for the trainer and we're using full FSDP.
  dp_size: 2

# Reference model configuration
ref_model:
  model:
    name: qwen3
    flavor: 8B
    hf_assets_path: hf://${model}
  training:
    dtype: bfloat16
  compile:
    enable: false
  parallelism:
    data_parallel_replicate_degree: 1
    data_parallel_shard_degree: 1
    tensor_parallel_degree: 1
    pipeline_parallel_degree: 1
    context_parallel_degree: 1
    expert_parallel_degree: 1
  checkpoint:
    initial_load_path: hf://${model}
    initial_load_in_hf: true

# All resource allocations
services:
  dataset:
    procs: 1
    num_replicas: 1
    with_gpus: false
  policy:
    procs: ${policy.engine_config.tensor_parallel_size}
    num_replicas: 1
    with_gpus: true
  trainer:
    procs: 2
    num_replicas: 1
    with_gpus: true
  replay_buffer:
    procs: 1
    num_replicas: 1
    with_gpus: false
  ref_model:
    procs: 1
    num_replicas: 1
    with_gpus: true
  compute_advantages:
    procs: 1
    num_replicas: 1
    with_gpus: false
  reward_actor:
    procs: 1
    num_replicas: 1
    with_gpus: false
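# GPU budget implied by the services above (one replica each): 2 for the policy
# (tensor_parallel_size) + 2 for the trainer + 1 for the ref model = 5 GPUs;
# dataset, replay_buffer, compute_advantages and reward_actor run without GPUs.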
4 changes: 3 additions & 1 deletion src/forge/actors/trainer.py
@@ -223,7 +223,9 @@ async def push_weights(self, policy_version: int) -> None:
         )
         hf_state_dict = self.engine.checkpointer.sd_adapter.to_hf(flattened_state_dict)
         # TODO: Figure out how to gracefully handle which model to-vLLM conversion is needed
-        vllm_ready_hf_sd = _qwen3_hf_to_vllm(sd=hf_state_dict, num_layers=28)
+        vllm_ready_hf_sd = _qwen3_hf_to_vllm(
+            sd=hf_state_dict, num_layers=self.engine.model_args.n_layers
+        )

         key = f"{self.state_dict_key}{DELIM}{policy_version}"
         start_time = time.time()
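The hardcoded num_layers=28 fits the smaller Qwen3 variants, but Qwen3-8B has 36 decoder layers, so the layer count is now read from the trainer's own model args. To see why the count matters, here is a minimal, illustrative sketch of this style of HF-to-vLLM remapping (hypothetical helper and key names, not the actual _qwen3_hf_to_vllm implementation): it walks the state dict once per layer, so an understated num_layers silently leaves layers unconverted and an overstated one fails on missing keys.

import torch

def fuse_qkv_per_layer(sd: dict[str, torch.Tensor], num_layers: int) -> dict[str, torch.Tensor]:
    """Sketch: fuse separate q/k/v projections into the single qkv_proj tensor
    that vLLM-style Qwen checkpoints expect, one pass per decoder layer."""
    out = dict(sd)
    for i in range(num_layers):  # wrong num_layers => skipped layers or KeyError
        prefix = f"model.layers.{i}.self_attn"
        q = out.pop(f"{prefix}.q_proj.weight")
        k = out.pop(f"{prefix}.k_proj.weight")
        v = out.pop(f"{prefix}.v_proj.weight")
        out[f"{prefix}.qkv_proj.weight"] = torch.cat([q, k, v], dim=0)
    return out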