Skip to content

Commit 1f66a59

Browse files
committed
(feat): add qwen3 8B&235B config.
1 parent 708c582 commit 1f66a59

File tree

5 files changed

+512
-0
lines changed

5 files changed

+512
-0
lines changed
Lines changed: 261 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,261 @@
# RLVR (Reinforcement Learning with Verifiable Rewards) pipeline config for
# Qwen3-235B-A22B. Trains with Megatron, generates rollouts with vLLM, and
# scores responses with rule-based / sandbox / LLM-judge reward workers.
hydra:
  run:
    dir: .
  output_subdir: null

exp_name: "qwen3-235BA22B-rlvr-config"
seed: 42
logging_dir: ./output/logs
output_dir: ./output
system_envs:
  USE_MODELSCOPE: '1'

checkpoint_config:
  type: file_system
  output_dir: ./rl_examples/models/${exp_name}

track_with: tensorboard
tracker_kwargs:
  log_dir: ./rl_examples/llm/tensorboard/roll_exp/rlvr

num_gpus_per_node: 8

max_steps: 500
save_steps: 100
logging_steps: 1
eval_steps: 10
resume_from_checkpoint: false

rollout_batch_size: 64  # number of prompts per rollout batch
prompt_length: 2048
response_length: 4096

num_return_sequences_in_group: 8
ppo_epochs: 1
adv_estimator: "reinforce"

# clip
value_clip: 0.5
reward_clip: 10
advantage_clip: 2.0
dual_clip_loss: true

# normalize
reward_norm: null
reward_shift: false
reward_scale: false

# data mask
max_len_mask: true
difficulty_mask: true
difficulty_low_threshold: 0.1
difficulty_high_threshold: 0.95
error_max_len_clip: false

# data weight
difficulty_loss_weight: false
length_loss_weight: false

# reward
add_token_level_kl: false

# advantage
whiten_advantages: true

# dynamic sampling scheduler
# use_additional_prompts: true
# max_running_requests: 256
# is_num_return_sequences_expand: false

pretrain: Qwen/Qwen3-235B-A22B
reward_pretrain: Qwen/Qwen3-235B-A22B

validation:
  data_args:
    template: qwen3
    file_name:
      - data/math_benchmarks.jsonl
  generating_args:
    top_p: 0.6
    top_k: 50
    num_beams: 1
    temperature: 0.6
    num_return_sequences: 1
  eval_steps: 10

actor_train:
  model_args:
    disable_gradient_checkpointing: false
    dtype: bf16
    model_type: ~
  training_args:
    learning_rate: 1.0e-6
    weight_decay: 0
    per_device_train_batch_size: 1
    gradient_accumulation_steps: 64
    warmup_steps: 20
    num_train_epochs: 50
  data_args:
    template: qwen3
    file_name:
      - data/code_KodCode_data.jsonl
      # - data/llm_judge_Multi-subject-RLVR_deal_new.jsonl
      - data/math_deepmath_deal.jsonl
      - data/general_ifeval_train_deal.jsonl
      - data/general_CrossThink-QA_deal.jsonl
    domain_interleave_probs:
      math_rule: 0.4
      code_sandbox: 0.3
      # llm_judge: 0.1
      crossthinkqa: 0.1
      ifeval: 0.1
    dataset_dir: data
    messages: messages
    interleave_probs: "1.0"
    preprocessing_num_workers: 16
  strategy_args:
    strategy_name: megatron_train
    strategy_config:
      tensor_model_parallel_size: 4
      pipeline_model_parallel_size: 8
      virtual_pipeline_model_parallel_size: 6
      expert_model_parallel_size: 8
      context_parallel_size: 1
      account_for_loss_in_pipeline_split: true
      account_for_embedding_in_pipeline_split: true
      use_distributed_optimizer: true
      sequence_parallel: true
      overlap_grad_reduce: true
      bias_activation_fusion: true
      apply_rope_fusion: true
      moe_grouped_gemm: true
      moe_layer_recompute: true
      moe_token_dispatcher_type: "alltoall"
  device_mapping: list(range(0,256))
  infer_batch_size: 2

actor_infer:
  model_args:
    disable_gradient_checkpointing: true
    dtype: bf16
  generating_args:
    max_new_tokens: ${response_length}
    top_p: 0.99
    top_k: 100
    num_beams: 1
    temperature: 0.99
    num_return_sequences: ${num_return_sequences_in_group}
  data_args:
    template: qwen3
  strategy_args:
    strategy_name: vllm
    strategy_config:
      gpu_memory_utilization: 0.75
      load_format: dummy
      tensor_parallel_size: 8
    num_gpus_per_worker: 8
  device_mapping: list(range(0,200))  # devices shared with the LLM-judge reward worker
  infer_batch_size: 1

reference:
  model_args:
    dtype: bf16
    model_type: ~
  data_args:
    template: qwen3
  strategy_args:
    strategy_name: megatron_infer
    strategy_config:
      tensor_model_parallel_size: 1
      pipeline_model_parallel_size: 8
      virtual_pipeline_model_parallel_size: 6
      expert_model_parallel_size: 8
      account_for_loss_in_pipeline_split: true
      account_for_embedding_in_pipeline_split: true
      use_distributed_optimizer: true
      sequence_parallel: true
      bias_activation_fusion: true
      apply_rope_fusion: true
      moe_grouped_gemm: true
      moe_token_dispatcher_type: "alltoall"
  device_mapping: list(range(0,256))
  infer_batch_size: 2

rewards:
  crossthinkqa:
    worker_cls: roll.pipeline.rlvr.rewards.crossthinkqa_rule_reward_worker.CrossThinkQARuleRewardWorker
    reward_type: soft
    response_length_penalty_coef: 0.0
    model_args:
      model_name_or_path: ${reward_pretrain}
    data_args:
      template: qwen3
    tag_included: [crossthinkqa]
    world_size: 8
    infer_batch_size: 4
  ifeval:
    worker_cls: roll.pipeline.rlvr.rewards.ifeval_rule_reward_worker.GeneralRuleRewardWorker
    reward_type: soft
    model_args:
      model_name_or_path: ${reward_pretrain}
    data_args:
      template: qwen3
    tag_included: [ifeval]
    world_size: 8
    infer_batch_size: 4
  math_rule:
    worker_cls: roll.pipeline.rlvr.rewards.math_rule_reward_worker.MathRuleRewardWorker
    model_args:
      model_name_or_path: ${reward_pretrain}
    data_args:
      template: qwen3
    tag_included: [deepmath_103k, aime]
    world_size: 8
    infer_batch_size: 1
    # dynamic filter config
    # query_filter_config:
    #   type: mean_filter
    #   filter_args:
    #     threshold_up: 0.9
    #     threshold_down: 0.1
  code_sandbox:
    use_local: true
    worker_cls: roll.pipeline.rlvr.rewards.code_sandbox_reward_worker.CodeSandboxRewardWorker
    tag_included: [KodCode]
    model_args:
      model_name_or_path: ${reward_pretrain}
    data_args:
      template: qwen3
    world_size: 8
    infer_batch_size: 1
    # query_filter_config:
    #   type: std_filter
    #   filter_args:
    #     std_threshold: 0
  llm_judge:
    # NOTE: LLM-as-judge also needs GPUs and must not share them with actor_infer.
    worker_cls: roll.pipeline.rlvr.rewards.llm_judge_reward_worker.LLMJudgeRewardWorker
    judge_prompt: Qwen2.5-7B-Instruct-RLVR-prompt
    judge_model_type: inference
    tag_included: [RLVR]
    model_args:
      model_name_or_path: virtuoussy/Qwen2.5-7B-Instruct-RLVR
      attn_implementation: fa2
      disable_gradient_checkpointing: true
      dtype: bf16
      model_type: trl
    generating_args:
      max_new_tokens: 100
      top_p: 0.8
      top_k: 50
      num_beams: 1
      temperature: 0.8
      num_return_sequences: 1
    data_args:
      template: qwen3
    strategy_args:
      strategy_name: hf_infer
      strategy_config: null
    device_mapping: list(range(200,256))
    infer_batch_size: 4
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
#!/bin/bash
set +x

# Launch the RLVR pipeline, using the name of the directory containing this
# script as the Hydra config search path. Expansions are quoted so the script
# still works when its path contains spaces (ShellCheck SC2086/SC2046).
CONFIG_PATH=$(basename "$(dirname "$0")")
python examples/start_rlvr_pipeline.py --config_path "$CONFIG_PATH" --config_name rlvr_config

0 commit comments

Comments
 (0)