-
Notifications
You must be signed in to change notification settings - Fork 309
Expand file tree
/
Copy pathgdpo_math_1B.yaml
More file actions
62 lines (52 loc) · 1.32 KB
/
gdpo_math_1B.yaml
File metadata and controls
62 lines (52 loc) · 1.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# GDPO recipe. Everything not set in this file is inherited from the parent
# config named in `defaults`; only the values that differ are overridden here.
defaults: "grpo_math_1B.yaml"
grpo:
  # Advantage estimation: switch the estimator to GDPO.
  adv_estimator:
    name: "gdpo"
    # NOTE(review): the two flags below are assumed to be adv_estimator
    # options (not direct children of grpo) - confirm against the parent
    # grpo_math_1B.yaml.
    normalize_rewards: true
    use_leave_one_out_baseline: false
checkpointing:
  # GDPO-specific checkpoint directory.
  checkpoint_dir: "results/gdpo"
policy:
  model_name: "Qwen/Qwen2.5-1.5B-Instruct"
  logprob_batch_size: 4
  # Also consumed by data.max_input_seq_length through interpolation.
  max_total_sequence_length: 1024
  # Overrides for the Megatron optimizer and learning-rate scheduler.
  megatron_cfg:
    optimizer:
      weight_decay: 0.0
    scheduler:
      lr_decay_style: "cosine"
      lr_warmup_iters: 10
# GDPO uses a single flat data config (GSM8K + math_gdpo_data_processor);
# replace parent's train/validation/default.
data:
  # Presumably tells the config loader to replace the inherited `data`
  # section instead of merging into it - TODO confirm loader semantics.
  _override_: true
  # Interpolated from the policy section so the two limits stay in sync.
  max_input_seq_length: ${policy.max_total_sequence_length}
  shuffle: true
  num_workers: 1
  use_multiple_dataloader: false
  # Training and validation both use GSM8K, on different splits.
  train:
    dataset_name: "gsm8k"
    split: "train"
  validation:
    dataset_name: "gsm8k"
    split: "test"
  # Prompt, processor and environment settings under the `default` key.
  default:
    prompt_file: null
    system_prompt_file: "examples/prompts/gsm8k.txt"
    processor: "math_gdpo_data_processor"
    env_name: "math_multi_reward"
env:
  # Key matches data.default.env_name ("math_multi_reward").
  math_multi_reward:
    num_workers: 8
    math_verify_impl: "hf_math_verify"
logger:
  wandb_enabled: true
  # Per-backend run identification; each backend keeps its own sub-map so the
  # `project` / `name` keys do not collide.
  wandb:
    project: "gdpo-dev"
    name: "gdpo-dev-logger"
  swanlab:
    project: "gdpo-dev"
    name: "gdpo-dev-logger"
  mlflow:
    experiment_name: "gdpo-dev"
    run_name: "gdpo-dev-logger"