Skip to content

Commit 9d92ce7

Browse files
feat: support Qwen3 VL MoE (#1171)
Co-authored-by: Chenhe Gu <email redacted>
1 parent ab2d4d8 commit 9d92ce7

File tree

2 files changed

+52
-42
lines changed

2 files changed

+52
-42
lines changed

examples/geo3k_vlm/README.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,10 +36,14 @@ SLIME_SCRIPT_MODEL_NAME=Qwen3-VL-4B-Instruct ./examples/geo3k_vlm/run_geo3k_vlm.sh
3636
- `Qwen3-VL-2B-Instruct`
3737
- `Qwen3-VL-4B-Instruct`
3838
- `Qwen3-VL-8B-Instruct`
39+
- `Qwen3-VL-30B-A3B-Instruct`
40+
- `Qwen3-VL-235B-A22B-Instruct`
3941
- `Qwen3-VL-2B-Thinking`
4042
- `Qwen3-VL-4B-Thinking`
4143
- `Qwen3-VL-8B-Thinking`
42-
-
44+
- `Qwen3-VL-30B-A3B-Thinking`
45+
- `Qwen3-VL-235B-A22B-Thinking`
46+
4347
## Notes
4448

4549
### Reward Model Configuration

examples/geo3k_vlm/run_geo3k_vlm.sh

Lines changed: 47 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,13 @@ VALID_MODELS="
1818
Qwen3-VL-2B-Instruct
1919
Qwen3-VL-4B-Instruct
2020
Qwen3-VL-8B-Instruct
21+
Qwen3-VL-30B-A3B-Instruct
22+
Qwen3-VL-235B-A22B-Instruct
2123
Qwen3-VL-2B-Thinking
2224
Qwen3-VL-4B-Thinking
2325
Qwen3-VL-8B-Thinking
26+
Qwen3-VL-30B-A3B-Thinking
27+
Qwen3-VL-235B-A22B-Thinking
2428
"
2529
if ! echo "$VALID_MODELS" | grep -qw "$MODEL_NAME"; then
2630
echo "Error: MODEL_NAME must be one of: $VALID_MODELS"
@@ -29,10 +33,6 @@ fi
2933

3034
MODEL_NAME_LOWER=$(echo "$MODEL_NAME" | tr '[:upper:]' '[:lower:]')
3135

32-
SLIME_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." &>/dev/null && pwd)"
33-
MODEL_ARGS_FILE=$(echo "$MODEL_NAME" | sed 's/-Instruct//g; s/-Thinking//g; s/Qwen3-VL-/qwen3-/g; s/-2B/-1.7B/g')
34-
source "${SLIME_DIR}/scripts/models/${MODEL_ARGS_FILE}.sh"
35-
3636
# External Ray flag
3737
if [ -z "$SLIME_SCRIPT_EXTERNAL_RAY" ] || [ "$SLIME_SCRIPT_EXTERNAL_RAY" = "0" ]; then
3838
USE_EXTERNAL_RAY=0
@@ -134,15 +134,15 @@ SGLANG_ARGS=(
134134

135135
# Wandb args (only if WANDB_API_KEY is set)
136136
if [ -n "$WANDB_API_KEY" ]; then
137-
WANDB_ARGS=(
138-
--use-wandb
139-
--wandb-project slime-geo3k-vlm
140-
--wandb-group ${MODEL_NAME_LOWER}-${TRAIN_BACKEND}
141-
--wandb-key ${WANDB_API_KEY}
142-
--disable-wandb-random-suffix
143-
)
137+
WANDB_ARGS=(
138+
--use-wandb
139+
--wandb-project slime-geo3k-vlm
140+
--wandb-group ${MODEL_NAME_LOWER}-${TRAIN_BACKEND}
141+
--wandb-key ${WANDB_API_KEY}
142+
--disable-wandb-random-suffix
143+
)
144144
else
145-
WANDB_ARGS=()
145+
WANDB_ARGS=()
146146
fi
147147

148148
MISC_ARGS=(
@@ -151,36 +151,42 @@ MISC_ARGS=(
151151

152152
# Backend-specific args
153153
if [ "$TRAIN_BACKEND" = "fsdp" ]; then
154-
BACKEND_ARGS=(
155-
--train-backend fsdp
156-
--gradient-checkpointing
157-
--sglang-attention-backend fa3
158-
--attn-implementation flash_attention_3
159-
--update-weight-buffer-size 536870912
160-
)
154+
BACKEND_ARGS=(
155+
--train-backend fsdp
156+
--gradient-checkpointing
157+
--sglang-attention-backend fa3
158+
--attn-implementation flash_attention_3
159+
--update-weight-buffer-size 536870912
160+
)
161+
MODEL_ARGS=()
161162
else
162-
# megatron backend (default)
163-
BACKEND_ARGS=(
164-
--train-backend megatron
165-
--load /root/models/${MODEL_NAME}
166-
--tensor-model-parallel-size 4
167-
--sequence-parallel
168-
--pipeline-model-parallel-size 1
169-
--context-parallel-size 1
170-
--expert-model-parallel-size 1
171-
--expert-tensor-parallel-size 1
172-
--recompute-granularity full
173-
--recompute-method uniform
174-
--recompute-num-layers 1
175-
--use-dynamic-batch-size
176-
--max-tokens-per-gpu 4096
177-
--attention-dropout 0.0
178-
--hidden-dropout 0.0
179-
--accumulate-allreduce-grads-in-fp32
180-
--attention-softmax-in-fp32
181-
--attention-backend flash
182-
--megatron-to-hf-mode bridge
183-
)
163+
# megatron backend (default)
164+
BACKEND_ARGS=(
165+
--train-backend megatron
166+
--load /root/models/${MODEL_NAME}
167+
--tensor-model-parallel-size 4
168+
--sequence-parallel
169+
--pipeline-model-parallel-size 1
170+
--context-parallel-size 1
171+
--expert-model-parallel-size 1
172+
--expert-tensor-parallel-size 1
173+
--recompute-granularity full
174+
--recompute-method uniform
175+
--recompute-num-layers 1
176+
--use-dynamic-batch-size
177+
--max-tokens-per-gpu 4096
178+
--attention-dropout 0.0
179+
--hidden-dropout 0.0
180+
--accumulate-allreduce-grads-in-fp32
181+
--attention-softmax-in-fp32
182+
--attention-backend flash
183+
--megatron-to-hf-mode bridge
184+
)
185+
186+
# get MODEL_ARGS from scripts/models for megatron backend
187+
SLIME_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." &>/dev/null && pwd)"
188+
MODEL_ARGS_FILE=$(echo "$MODEL_NAME" | sed 's/-Instruct//g; s/-Thinking//g; s/Qwen3-VL-/qwen3-/g; s/-2B/-1.7B/g')
189+
source "${SLIME_DIR}/scripts/models/${MODEL_ARGS_FILE}.sh"
184190
fi
185191

186192
# Start Ray if not using external Ray

0 commit comments

Comments
 (0)