diff --git a/apps/grpo/qwen3_32b.yaml b/apps/grpo/qwen3_32b.yaml
new file mode 100644
index 000000000..3d1b80852
--- /dev/null
+++ b/apps/grpo/qwen3_32b.yaml
@@ -0,0 +1,144 @@
+# Grouped Relative Policy Optimization (GRPO)
+# >>> python -m apps.grpo.main --config apps/grpo/qwen3_32b.yaml
+# NOTE - This has not been tested for correctness yet! All testing so far has been only for infrastructure stability.
+
+# Global configuration
+group_size: 2
+batch_size: 8
+max_req_tokens: 512
+max_res_tokens: 512
+model: "Qwen/Qwen3-32B"
+off_by_n: 1 # Off by one by default
+
+# Main loop configuration
+rollout_threads: 1 # Recommended to set equal to policy.num_replicas
+
+# Observability configuration
+metric_logging:
+  wandb:
+    project: "grpo-training"
+    group: "grpo_exp_${oc.env:USER}"
+    reduce_across_ranks: True
+  console:
+    reduce_across_ranks: True
+
+# Dataset configuration
+dataset:
+  path: "openai/gsm8k"
+  revision: "main"
+  data_split: "train"
+  streaming: true
+  model: ${model}
+
+# Policy configuration
+policy:
+  engine_config:
+    model: ${model}
+    tensor_parallel_size: 4
+    pipeline_parallel_size: 1
+    enforce_eager: false
+  sampling_config:
+    n: ${group_size}
+    max_tokens: ${max_res_tokens}
+    temperature: 1.0
+    top_p: 1.0
+
+# Trainer configuration
+trainer:
+  model:
+    name: qwen3
+    flavor: 32B
+    hf_assets_path: hf://${model}
+  optimizer:
+    name: AdamW
+    lr: 1e-5
+    eps: 1e-8
+  lr_scheduler:
+    warmup_steps: 1
+  training:
+    local_batch_size: ${batch_size}
+    seq_len: 2048
+    max_norm: 1.0
+    steps: 1000000
+    dtype: bfloat16
+    gc_freq: 1
+  compile:
+    enable: false
+  parallelism:
+    data_parallel_replicate_degree: 1
+    data_parallel_shard_degree: -1
+    tensor_parallel_degree: 1
+    pipeline_parallel_degree: 1
+    context_parallel_degree: 1
+    expert_parallel_degree: 1
+    disable_loss_parallel: true
+  checkpoint:
+    enable: true
+    initial_load_path: hf://${model}
+    initial_load_in_hf: true
+    last_save_in_hf: true
+    interval: 500
+    async_mode: "disabled"
+  activation_checkpoint:
+    mode: full
+
+# Replay buffer configuration
+replay_buffer:
+  batch_size: ${batch_size}
+  max_policy_age: ${off_by_n}
+  # dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree
+  dp_size: 8
+
+# Reference model configuration
+ref_model:
+  model:
+    name: qwen3
+    flavor: 32B
+    hf_assets_path: hf://${model}
+  training:
+    dtype: bfloat16
+    gc_freq: 1
+  compile:
+    enable: false
+  parallelism:
+    data_parallel_replicate_degree: 1
+    data_parallel_shard_degree: 1
+    tensor_parallel_degree: 4
+    pipeline_parallel_degree: 1
+    context_parallel_degree: 1
+    expert_parallel_degree: 1
+  checkpoint:
+    enable: true
+    initial_load_path: hf://${model}
+    initial_load_in_hf: true
+
+# All resource allocations
+services:
+  policy:
+    procs: ${policy.engine_config.tensor_parallel_size}
+    num_replicas: 1
+    hosts: 1
+    with_gpus: true
+  ref_model:
+    procs: ${ref_model.parallelism.tensor_parallel_degree}
+    num_replicas: 1
+    with_gpus: true
+  reward_actor:
+    procs: 1
+    num_replicas: 1
+    with_gpus: false
+
+actors:
+  dataset:
+    procs: 1
+    with_gpus: false
+  trainer:
+    procs: 8
+    hosts: 1
+    with_gpus: true
+  replay_buffer:
+    procs: 1
+    with_gpus: false
+  compute_advantages:
+    procs: 1
+    with_gpus: false
diff --git a/apps/grpo/qwen3_multinode.yaml b/apps/grpo/qwen3_multinode.yaml
deleted file mode 100644
index 47c8cdd0e..000000000
--- a/apps/grpo/qwen3_multinode.yaml
+++ /dev/null
@@ -1,86 +0,0 @@
-# GRPO Training Configuration
-# Currently a fork of the main yaml, this just shows
-# placement of trainer and inference servers on separate hosts.
-# >>> python -m apps.grpo.main --config apps/grpo/qwen3_multinode.yaml
-
-# Global configuration
-group_size: 8
-batch_size: 16
-max_req_tokens: 512
-max_res_tokens: 512
-model: "Qwen/Qwen3-1.7B"
-
-# Observability configuration
-metric_logging:
-  wandb:
-    project: "grpo-training"
-    group: "grpo_exp_${oc.env:USER}"
-    reduce_across_ranks: True
-  console:
-    reduce_across_ranks: True
-
-# Dataset configuration
-dataset:
-  path: "openai/gsm8k"
-  revision: "main"
-  data_split: "train"
-  streaming: true
-  model: ${model}
-
-# Policy configuration
-policy:
-  engine_config:
-    model: ${model}
-    tensor_parallel_size: 1
-    pipeline_parallel_size: 1
-    enforce_eager: false
-  sampling_config:
-    n: ${group_size}
-    max_tokens: ${max_res_tokens}
-    temperature: 1.0
-    top_p: 1.0
-
-# Trainer configuration
-trainer:
-  model_name: ${model}
-  learning_rate: 1e-5
-
-# Replay buffer configuration
-replay_buffer:
-  batch_size: ${batch_size}
-  max_policy_age: 1 # Async by 1
-  dp_size: 1
-
-# Reference model configuration
-ref_model:
-  model_name: ${model}
-
-services:
-  policy:
-    procs: 1
-    hosts: 1
-    num_replicas: 1
-    with_gpus: true
-  ref_model:
-    procs: 1
-    num_replicas: 1
-    with_gpus: true
-  reward_actor:
-    procs: 1
-    num_replicas: 1
-    with_gpus: false
-
-actors:
-  dataset:
-    procs: 1
-    with_gpus: false
-  compute_advantages:
-    procs: 1
-    with_gpus: false
-  trainer:
-    procs: 1
-    hosts: 1
-    with_gpus: true
-  replay_buffer:
-    procs: 1
-    with_gpus: false