# Hydra runtime settings: run from the current directory and do not create
# the .hydra output subdirectory.
hydra:
  run:
    dir: .
  output_subdir: null
5+
# Experiment identity and output locations.
exp_name: "qwen3-235BA22B-rlvr-config"
seed: 42
logging_dir: ./output/logs
output_dir: ./output
# Environment variables exported to worker processes; kept as a quoted string
# so the consumer receives "1", not the integer 1.
system_envs:
  USE_MODELSCOPE: '1'
12+
# Checkpoints are written to the local file system under the experiment name.
checkpoint_config:
  type: file_system
  output_dir: ./rl_examples/models/${exp_name}
16+
# Metric tracking backend and its arguments.
track_with: tensorboard
tracker_kwargs:
  log_dir: ./rl_examples/llm/tensorboard/roll_exp/rlvr
20+
num_gpus_per_node: 8

# Training schedule: total optimizer steps and the cadence of saving,
# logging, and evaluation.
max_steps: 500
save_steps: 100
logging_steps: 1
eval_steps: 10
resume_from_checkpoint: false
28+
29+
rollout_batch_size: 64  # prompts per rollout batch
prompt_length: 2048
response_length: 4096

num_return_sequences_in_group: 8
ppo_epochs: 1
adv_estimator: "reinforce"

# clip
value_clip: 0.5
reward_clip: 10
advantage_clip: 2.0
dual_clip_loss: true

# normalize
reward_norm: null
reward_shift: false
reward_scale: false

# data mask: drop over-length samples and filter prompts outside the
# configured difficulty band.
max_len_mask: true
difficulty_mask: true
difficulty_low_threshold: 0.1
difficulty_high_threshold: 0.95
error_max_len_clip: false

# data weight
difficulty_loss_weight: false
length_loss_weight: false

# reward
add_token_level_kl: false

# advantage
whiten_advantages: true

# dynamic sampling scheduler (disabled)
# use_additional_prompts: true
# max_running_requests: 256
# is_num_return_sequences_expand: false
70+
# Base model for the actor and for the reward workers (referenced below
# via ${reward_pretrain}).
pretrain: Qwen/Qwen3-235B-A22B
reward_pretrain: Qwen/Qwen3-235B-A22B
73+
# Periodic evaluation on held-out math benchmarks with low-temperature
# single-sample decoding.
validation:
  data_args:
    template: qwen3
    file_name:
      - data/math_benchmarks.jsonl
  generating_args:
    top_p: 0.6
    top_k: 50
    num_beams: 1
    temperature: 0.6
    num_return_sequences: 1
  eval_steps: 10
86+
# Actor training worker: Megatron training strategy across 256 GPUs with
# tensor/pipeline/expert parallelism sized for the 235B MoE model.
actor_train:
  model_args:
    disable_gradient_checkpointing: false
    dtype: bf16
    model_type: null
  training_args:
    learning_rate: 1.0e-6
    weight_decay: 0
    per_device_train_batch_size: 1
    gradient_accumulation_steps: 64
    warmup_steps: 20
    num_train_epochs: 50
  data_args:
    template: qwen3
    file_name:
      - data/code_KodCode_data.jsonl
      # - data/llm_judge_Multi-subject-RLVR_deal_new.jsonl
      - data/math_deepmath_deal.jsonl
      - data/general_ifeval_train_deal.jsonl
      - data/general_CrossThink-QA_deal.jsonl
    # NOTE(review): with llm_judge commented out these probabilities sum to
    # 0.9, not 1.0 — confirm the data loader renormalizes.
    domain_interleave_probs:
      math_rule: 0.4
      code_sandbox: 0.3
      # llm_judge: 0.1
      crossthinkqa: 0.1
      ifeval: 0.1
    dataset_dir: data
    messages: messages
    interleave_probs: "1.0"
    preprocessing_num_workers: 16
  strategy_args:
    strategy_name: megatron_train
    strategy_config:
      tensor_model_parallel_size: 4
      pipeline_model_parallel_size: 8
      virtual_pipeline_model_parallel_size: 6
      expert_model_parallel_size: 8
      context_parallel_size: 1
      account_for_loss_in_pipeline_split: true
      account_for_embedding_in_pipeline_split: true
      use_distributed_optimizer: true
      sequence_parallel: true
      overlap_grad_reduce: true
      bias_activation_fusion: true
      apply_rope_fusion: true
      moe_grouped_gemm: true
      moe_layer_recompute: true
      moe_token_dispatcher_type: "alltoall"
  device_mapping: list(range(0,256))
  infer_batch_size: 2
137+
# Actor rollout worker: vLLM generation with high-temperature sampling,
# ${num_return_sequences_in_group} responses per prompt.
actor_infer:
  model_args:
    disable_gradient_checkpointing: true
    dtype: bf16
  generating_args:
    max_new_tokens: ${response_length}
    top_p: 0.99
    top_k: 100
    num_beams: 1
    temperature: 0.99
    num_return_sequences: ${num_return_sequences_in_group}
  data_args:
    template: qwen3
  strategy_args:
    strategy_name: vllm
    strategy_config:
      gpu_memory_utilization: 0.75
      load_format: dummy
      tensor_parallel_size: 8
    # NOTE(review): placement of num_gpus_per_worker under strategy_args
    # reconstructed from key order — confirm against the worker schema.
    num_gpus_per_worker: 8
  device_mapping: list(range(0,200))  # devices shared with the LLM reward worker
  infer_batch_size: 1
160+
# Frozen reference model for KL/log-prob computation, served with the
# Megatron inference strategy on the same 256-GPU mapping as actor_train.
reference:
  model_args:
    dtype: bf16
    model_type: null
  data_args:
    template: qwen3
  strategy_args:
    strategy_name: megatron_infer
    strategy_config:
      tensor_model_parallel_size: 1
      pipeline_model_parallel_size: 8
      virtual_pipeline_model_parallel_size: 6
      expert_model_parallel_size: 8
      account_for_loss_in_pipeline_split: true
      account_for_embedding_in_pipeline_split: true
      use_distributed_optimizer: true
      sequence_parallel: true
      bias_activation_fusion: true
      apply_rope_fusion: true
      moe_grouped_gemm: true
      moe_token_dispatcher_type: "alltoall"
  device_mapping: list(range(0,256))
  infer_batch_size: 2
184+
# Reward workers, one per data domain; samples are routed by tag_included.
rewards:
  crossthinkqa:
    worker_cls: roll.pipeline.rlvr.rewards.crossthinkqa_rule_reward_worker.CrossThinkQARuleRewardWorker
    reward_type: soft
    response_length_penalty_coef: 0.0
    model_args:
      model_name_or_path: ${reward_pretrain}
    data_args:
      template: qwen3
    tag_included: [crossthinkqa]
    world_size: 8
    infer_batch_size: 4
  ifeval:
    worker_cls: roll.pipeline.rlvr.rewards.ifeval_rule_reward_worker.GeneralRuleRewardWorker
    reward_type: soft
    model_args:
      model_name_or_path: ${reward_pretrain}
    data_args:
      template: qwen3
    tag_included: [ifeval]
    world_size: 8
    infer_batch_size: 4
  math_rule:
    worker_cls: roll.pipeline.rlvr.rewards.math_rule_reward_worker.MathRuleRewardWorker
    model_args:
      model_name_or_path: ${reward_pretrain}
    data_args:
      template: qwen3
    tag_included: [deepmath_103k, aime]
    world_size: 8
    infer_batch_size: 1
    # dynamic filter config (disabled)
    # query_filter_config:
    #   type: mean_filter
    #   filter_args:
    #     threshold_up: 0.9
    #     threshold_down: 0.1
  code_sandbox:
    use_local: true
    worker_cls: roll.pipeline.rlvr.rewards.code_sandbox_reward_worker.CodeSandboxRewardWorker
    tag_included: [KodCode]
    model_args:
      model_name_or_path: ${reward_pretrain}
    data_args:
      template: qwen3
    world_size: 8
    infer_batch_size: 1
    # query_filter_config:
    #   type: std_filter
    #   filter_args:
    #     std_threshold: 0
  llm_judge:
    # NOTE: llm-as-judge also needs GPUs and must not share GPUs with actor_infer
    # (hence device_mapping starts at 200, past actor_infer's range).
    worker_cls: roll.pipeline.rlvr.rewards.llm_judge_reward_worker.LLMJudgeRewardWorker
    judge_prompt: Qwen2.5-7B-Instruct-RLVR-prompt
    judge_model_type: inference
    tag_included: [RLVR]
    model_args:
      model_name_or_path: virtuoussy/Qwen2.5-7B-Instruct-RLVR
      attn_implementation: fa2
      disable_gradient_checkpointing: true
      dtype: bf16
      model_type: trl
    generating_args:
      max_new_tokens: 100
      top_p: 0.8
      top_k: 50
      num_beams: 1
      temperature: 0.8
      num_return_sequences: 1
    data_args:
      template: qwen3
    strategy_args:
      strategy_name: hf_infer
      strategy_config: null
    device_mapping: list(range(200,256))
    infer_batch_size: 4