Skip to content

Commit 6ffa280

Browse files
committed
add 32b config
1 parent f16d0af commit 6ffa280

File tree

2 files changed

+144
-86
lines changed

2 files changed

+144
-86
lines changed

apps/grpo/qwen3_32b.yaml

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
---
# Grouped Relative Policy Optimization (GRPO)
# >>> python -m apps.grpo.main --config apps/grpo/qwen3_32b.yaml
# NOTE - This has not been tested for correctness yet! All testing so far has been only for infrastructure stability

# Global configuration
group_size: 2
batch_size: 8
max_req_tokens: 512
max_res_tokens: 512
model: "Qwen/Qwen3-32B"
off_by_n: 1  # Off by one by default

# Main loop configuration
rollout_threads: 1  # Recommended to set equal to policy.num_replicas

# Observability configuration
metric_logging:
  wandb:
    project: "grpo-training"
    group: "grpo_exp_${oc.env:USER}"
    reduce_across_ranks: true
  console:
    reduce_across_ranks: true

# Dataset configuration
dataset:
  path: "openai/gsm8k"
  revision: "main"
  data_split: "train"
  streaming: true
  model: ${model}

# Policy configuration
policy:
  engine_config:
    model: ${model}
    tensor_parallel_size: 4
    pipeline_parallel_size: 1
    enforce_eager: false
  sampling_config:
    n: ${group_size}
    max_tokens: ${max_res_tokens}
    temperature: 1.0
    top_p: 1.0

# Trainer configuration
trainer:
  model:
    name: qwen3
    flavor: 32B
    hf_assets_path: hf://${model}
  optimizer:
    name: AdamW
    # Mantissa dot keeps these floats under YAML 1.1 resolvers
    # (dotless "1e-5" parses as a string on e.g. PyYAML).
    lr: 1.0e-5
    eps: 1.0e-8
  lr_scheduler:
    warmup_steps: 1
  training:
    local_batch_size: ${batch_size}
    seq_len: 2048
    max_norm: 1.0
    steps: 1000000
    dtype: bfloat16
    gc_freq: 1
  compile:
    enable: false
  parallelism:
    data_parallel_replicate_degree: 1
    data_parallel_shard_degree: -1  # -1 = infer from available trainer procs
    tensor_parallel_degree: 1
    pipeline_parallel_degree: 1
    context_parallel_degree: 1
    expert_parallel_degree: 1
    disable_loss_parallel: true
  checkpoint:
    enable: true
    initial_load_path: hf://${model}
    initial_load_in_hf: true
    last_save_in_hf: true
    interval: 500
    async_mode: "disabled"
  activation_checkpoint:
    mode: full

# Replay buffer configuration
replay_buffer:
  batch_size: ${batch_size}
  max_policy_age: ${off_by_n}
  # dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree
  # NOTE(review): hard-coded because shard degree above is -1 (auto); 8 matches
  # actors.trainer.procs with TP=1 — keep in sync if trainer topology changes.
  dp_size: 8

# Reference model configuration
ref_model:
  model:
    name: qwen3
    flavor: 32B
    hf_assets_path: hf://${model}
  training:
    dtype: bfloat16
    gc_freq: 1
  compile:
    enable: false
  parallelism:
    data_parallel_replicate_degree: 1
    data_parallel_shard_degree: 1
    tensor_parallel_degree: 4
    pipeline_parallel_degree: 1
    context_parallel_degree: 1
    expert_parallel_degree: 1
  checkpoint:
    enable: true
    initial_load_path: hf://${model}
    initial_load_in_hf: true

# All resource allocations
services:
  policy:
    procs: ${policy.engine_config.tensor_parallel_size}
    num_replicas: 1
    hosts: 1
    with_gpus: true
  ref_model:
    procs: ${ref_model.parallelism.tensor_parallel_degree}
    num_replicas: 1
    with_gpus: true
  reward_actor:
    procs: 1
    num_replicas: 1
    with_gpus: false

actors:
  dataset:
    procs: 1
    with_gpus: false
  trainer:
    procs: 8
    hosts: 1
    with_gpus: true
  replay_buffer:
    procs: 1
    with_gpus: false
  compute_advantages:
    procs: 1
    with_gpus: false

apps/grpo/qwen3_multinode.yaml

Lines changed: 0 additions & 86 deletions
This file was deleted.

0 commit comments

Comments
 (0)