Skip to content

Commit fac93cc

Browse files
committed
more configs
1 parent 08499b4 commit fac93cc

File tree

3 files changed

+148
-4
lines changed

3 files changed

+148
-4
lines changed

slime/code_golf_harbor_modal/configs/qwen_8b_multi.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,15 +34,15 @@ def get_config() -> RLConfig:
3434
--input-key messages
3535
--label-key label
3636
--apply-chat-template
37-
--apply-chat-template-kwargs '{{"enable_thinking": false}}'
37+
--apply-chat-template-kwargs '{{"enable_thinking": true}}'
3838
--prompt-data {{data_path}}/mbpp_harbor/slime/train.parquet
3939
--eval-prompt-data mbpp {{data_path}}/mbpp_harbor/slime/test.parquet
4040
4141
# Rollout / batching
4242
--num-rollout 2000
43-
--rollout-batch-size 32
43+
--rollout-batch-size 128
4444
--n-samples-per-prompt 8
45-
--global-batch-size 256
45+
--global-batch-size 1024
4646
--rollout-max-response-len 1024
4747
--rollout-temperature 0.9
4848
--eval-max-response-len 1024
@@ -66,7 +66,7 @@ def get_config() -> RLConfig:
6666
--eval-top-p 1
6767
6868
# Save checkpoints to volume
69-
--save {{checkpoints_path}}/qwen8b_code_golf
69+
--save {{checkpoints_path}}/qwen8b_code_golf_thinking
7070
--save-interval 20
7171
""",
7272
)
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
from __future__ import annotations
2+
3+
from .base import (
4+
DEFAULT_GRPO_ARGS,
5+
DEFAULT_OPTIMIZER_ARGS,
6+
DEFAULT_TRAINING_ARGS,
7+
QWEN3_8B_MODEL_ARGS,
8+
RLConfig,
9+
)
10+
11+
12+
def get_config() -> RLConfig:
    """Build the Qwen3-8B MBPP/Harbor code-golf run with thinking enabled.

    Variant notes (vs. the base multi config): ``enable_thinking`` is true,
    rollout/eval responses are capped at 2048 tokens, and checkpoints go to
    the ``qwen8b_code_golf_thinking_4k`` directory.

    Returns:
        RLConfig: a fully populated run description consumed by the launcher.
    """
    # CLI-style argument blob handed to slime. ``{data_path}`` and
    # ``{checkpoints_path}`` are left as literal placeholders (escaped braces)
    # — presumably substituted later by the launcher; verify against caller.
    # NOTE(review): leading whitespace inside this string is assumed
    # insignificant to the downstream argument parser — TODO confirm.
    slime_args = f"""
        # Model
        {QWEN3_8B_MODEL_ARGS}

        # Training + optimizer + GRPO
        {DEFAULT_TRAINING_ARGS}
        {DEFAULT_OPTIMIZER_ARGS}
        {DEFAULT_GRPO_ARGS}

        # Dataset format
        --input-key messages
        --label-key label
        --apply-chat-template
        --apply-chat-template-kwargs '{{"enable_thinking": true}}'
        --prompt-data {{data_path}}/mbpp_harbor/slime/train.parquet
        --eval-prompt-data mbpp {{data_path}}/mbpp_harbor/slime/test.parquet

        # Rollout / batching
        --num-rollout 2000
        --rollout-batch-size 128
        --n-samples-per-prompt 8
        --global-batch-size 1024
        --rollout-max-response-len 2048
        --rollout-temperature 0.9
        --eval-max-response-len 2048
        --n-samples-per-eval-prompt 8

        # Custom reward model (Harbor + Modal sandbox scoring)
        --rm-type math
        --custom-rm-path custom_rm.custom_rm

        # SGLang rollout engines
        --rollout-num-gpus-per-engine 2
        --sglang-mem-fraction-static 0.7

        # Distributed orchestration
        --actor-num-nodes 4
        --actor-num-gpus-per-node 8
        --colocate

        # Eval cadence
        --eval-interval 20
        --eval-top-p 1

        # Save checkpoints to volume
        --save {{checkpoints_path}}/qwen8b_code_golf_thinking_4k
        --save-interval 20
    """
    return RLConfig(
        # Model identity and Modal app naming.
        model_name="Qwen3-8B",
        model_id="Qwen/Qwen3-8B",
        app_name="slime-qwen8b-code-golf",
        # Cluster shape: 4 nodes x 8 H100s, synchronous training.
        n_nodes=4,
        gpu="H100:8",
        sync=True,
        # Experiment tracking.
        wandb_project="slime-code-golf",
        wandb_run_name_prefix="qwen8b-mbpp-harbor-thinking-4k",
        # Harbor reward-model instrumentation.
        harbor_rm_profile=True,
        harbor_rm_log_samples=True,
        slime_args=slime_args,
    )
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
from __future__ import annotations
2+
3+
from .base import (
4+
DEFAULT_GRPO_ARGS,
5+
DEFAULT_OPTIMIZER_ARGS,
6+
DEFAULT_TRAINING_ARGS,
7+
QWEN3_8B_MODEL_ARGS,
8+
RLConfig,
9+
)
10+
11+
12+
def get_config() -> RLConfig:
    """Build the Qwen3-8B MBPP/Harbor code-golf run: thinking, 4k, 512 batch.

    Variant notes: ``enable_thinking`` is true, rollout/eval responses are
    capped at 4096 tokens, and the effective rollout batch is 64 prompts x 8
    samples = 512; checkpoints land in ``qwen8b_code_golf_thinking_4k_512``.

    Returns:
        RLConfig: a fully populated run description consumed by the launcher.
    """
    # CLI-style argument blob handed to slime. ``{data_path}`` and
    # ``{checkpoints_path}`` are left as literal placeholders (escaped braces)
    # — presumably substituted later by the launcher; verify against caller.
    # NOTE(review): leading whitespace inside this string is assumed
    # insignificant to the downstream argument parser — TODO confirm.
    slime_args = f"""
        # Model
        {QWEN3_8B_MODEL_ARGS}

        # Training + optimizer + GRPO
        {DEFAULT_TRAINING_ARGS}
        {DEFAULT_OPTIMIZER_ARGS}
        {DEFAULT_GRPO_ARGS}

        # Dataset format
        --input-key messages
        --label-key label
        --apply-chat-template
        --apply-chat-template-kwargs '{{"enable_thinking": true}}'
        --prompt-data {{data_path}}/mbpp_harbor/slime/train.parquet
        --eval-prompt-data mbpp {{data_path}}/mbpp_harbor/slime/test.parquet

        # Rollout / batching (64 * 8 = 512 samples)
        --num-rollout 2000
        --rollout-batch-size 64
        --n-samples-per-prompt 8
        --global-batch-size 512
        --rollout-max-response-len 4096
        --rollout-temperature 0.9
        --eval-max-response-len 4096
        --n-samples-per-eval-prompt 8

        # Custom reward model (Harbor + Modal sandbox scoring)
        --rm-type math
        --custom-rm-path custom_rm.custom_rm

        # SGLang rollout engines
        --rollout-num-gpus-per-engine 2
        --sglang-mem-fraction-static 0.7

        # Distributed orchestration
        --actor-num-nodes 4
        --actor-num-gpus-per-node 8
        --colocate

        # Eval cadence
        --eval-interval 20
        --eval-top-p 1

        # Save checkpoints to volume
        --save {{checkpoints_path}}/qwen8b_code_golf_thinking_4k_512
        --save-interval 20
    """
    return RLConfig(
        # Model identity and Modal app naming.
        model_name="Qwen3-8B",
        model_id="Qwen/Qwen3-8B",
        app_name="slime-qwen8b-code-golf",
        # Cluster shape: 4 nodes x 8 H100s, synchronous training.
        n_nodes=4,
        gpu="H100:8",
        sync=True,
        # Experiment tracking.
        wandb_project="slime-code-golf",
        wandb_run_name_prefix="qwen8b-mbpp-harbor-thinking-4k-512",
        # Harbor reward-model instrumentation.
        harbor_rm_profile=True,
        harbor_rm_log_samples=True,
        slime_args=slime_args,
    )

0 commit comments

Comments
 (0)