@@ -18,9 +18,13 @@ VALID_MODELS="
 Qwen3-VL-2B-Instruct
 Qwen3-VL-4B-Instruct
 Qwen3-VL-8B-Instruct
+Qwen3-VL-30B-A3B-Instruct
+Qwen3-VL-235B-A22B-Instruct
 Qwen3-VL-2B-Thinking
 Qwen3-VL-4B-Thinking
 Qwen3-VL-8B-Thinking
+Qwen3-VL-30B-A3B-Thinking
+Qwen3-VL-235B-A22B-Thinking
 "
 if ! echo "$VALID_MODELS" | grep -qw "$MODEL_NAME"; then
     echo "Error: MODEL_NAME must be one of: $VALID_MODELS"
@@ -29,10 +33,6 @@
 
 MODEL_NAME_LOWER=$(echo "$MODEL_NAME" | tr '[:upper:]' '[:lower:]')
 
-SLIME_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." &> /dev/null && pwd)"
-MODEL_ARGS_FILE=$(echo "$MODEL_NAME" | sed 's/-Instruct//g; s/-Thinking//g; s/Qwen3-VL-/qwen3-/g; s/-2B/-1.7B/g')
-source "${SLIME_DIR}/scripts/models/${MODEL_ARGS_FILE}.sh"
-
 # External Ray flag
 if [ -z "$SLIME_SCRIPT_EXTERNAL_RAY" ] || [ "$SLIME_SCRIPT_EXTERNAL_RAY" = "0" ]; then
     USE_EXTERNAL_RAY=0
@@ -134,15 +134,15 @@ SGLANG_ARGS=(
 
 # Wandb args (only if WANDB_API_KEY is set)
 if [ -n "$WANDB_API_KEY" ]; then
-  WANDB_ARGS=(
-    --use-wandb
-    --wandb-project slime-geo3k-vlm
-    --wandb-group ${MODEL_NAME_LOWER}-${TRAIN_BACKEND}
-    --wandb-key ${WANDB_API_KEY}
-    --disable-wandb-random-suffix
-  )
+    WANDB_ARGS=(
+        --use-wandb
+        --wandb-project slime-geo3k-vlm
+        --wandb-group ${MODEL_NAME_LOWER}-${TRAIN_BACKEND}
+        --wandb-key ${WANDB_API_KEY}
+        --disable-wandb-random-suffix
+    )
 else
-  WANDB_ARGS=()
+    WANDB_ARGS=()
 fi
 
 MISC_ARGS=(
@@ -151,36 +151,42 @@ MISC_ARGS=(
 
 # Backend-specific args
 if [ "$TRAIN_BACKEND" = "fsdp" ]; then
-  BACKEND_ARGS=(
-    --train-backend fsdp
-    --gradient-checkpointing
-    --sglang-attention-backend fa3
-    --attn-implementation flash_attention_3
-    --update-weight-buffer-size 536870912
-  )
+    BACKEND_ARGS=(
+        --train-backend fsdp
+        --gradient-checkpointing
+        --sglang-attention-backend fa3
+        --attn-implementation flash_attention_3
+        --update-weight-buffer-size 536870912
+    )
+    MODEL_ARGS=()
 else
-  # megatron backend (default)
-  BACKEND_ARGS=(
-    --train-backend megatron
-    --load /root/models/${MODEL_NAME}
-    --tensor-model-parallel-size 4
-    --sequence-parallel
-    --pipeline-model-parallel-size 1
-    --context-parallel-size 1
-    --expert-model-parallel-size 1
-    --expert-tensor-parallel-size 1
-    --recompute-granularity full
-    --recompute-method uniform
-    --recompute-num-layers 1
-    --use-dynamic-batch-size
-    --max-tokens-per-gpu 4096
-    --attention-dropout 0.0
-    --hidden-dropout 0.0
-    --accumulate-allreduce-grads-in-fp32
-    --attention-softmax-in-fp32
-    --attention-backend flash
-    --megatron-to-hf-mode bridge
-  )
+    # megatron backend (default)
+    BACKEND_ARGS=(
+        --train-backend megatron
+        --load /root/models/${MODEL_NAME}
+        --tensor-model-parallel-size 4
+        --sequence-parallel
+        --pipeline-model-parallel-size 1
+        --context-parallel-size 1
+        --expert-model-parallel-size 1
+        --expert-tensor-parallel-size 1
+        --recompute-granularity full
+        --recompute-method uniform
+        --recompute-num-layers 1
+        --use-dynamic-batch-size
+        --max-tokens-per-gpu 4096
+        --attention-dropout 0.0
+        --hidden-dropout 0.0
+        --accumulate-allreduce-grads-in-fp32
+        --attention-softmax-in-fp32
+        --attention-backend flash
+        --megatron-to-hf-mode bridge
+    )
+
+    # get MODEL_ARGS from scripts/models for megatron backend
+    SLIME_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." &> /dev/null && pwd)"
+    MODEL_ARGS_FILE=$(echo "$MODEL_NAME" | sed 's/-Instruct//g; s/-Thinking//g; s/Qwen3-VL-/qwen3-/g; s/-2B/-1.7B/g')
+    source "${SLIME_DIR}/scripts/models/${MODEL_ARGS_FILE}.sh"
 fi
 
 # Start Ray if not using external Ray
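
As a sanity check on the new MoE entries, the `sed` pipeline moved into the megatron branch can be run standalone to see which args file each model name resolves to; a minimal sketch, assuming only the substitutions shown in the diff:

```bash
# Reproduce the MODEL_ARGS_FILE derivation for a few valid model names.
for m in Qwen3-VL-2B-Instruct Qwen3-VL-30B-A3B-Instruct Qwen3-VL-235B-A22B-Thinking; do
    echo "$m" | sed 's/-Instruct//g; s/-Thinking//g; s/Qwen3-VL-/qwen3-/g; s/-2B/-1.7B/g'
done
# Output:
#   qwen3-1.7B
#   qwen3-30B-A3B
#   qwen3-235B-A22B
```

So the newly added 30B-A3B and 235B-A22B variants resolve to `qwen3-30B-A3B.sh` and `qwen3-235B-A22B.sh`, which this change implicitly assumes exist under `scripts/models/`.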
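The new `MODEL_ARGS=()` in the fsdp branch presumably keeps a later `"${MODEL_ARGS[@]}"` expansion (outside this hunk) well defined now that only the megatron branch sources a model file to populate it; a minimal sketch of the failure mode it guards against, assuming the script runs under `set -u` on bash older than 4.4:

```bash
set -u
unset MODEL_ARGS
# echo "${MODEL_ARGS[@]}"    # bash < 4.4 would abort: "MODEL_ARGS[@]: unbound variable"
MODEL_ARGS=()                # what the fsdp branch now does
echo ok "${MODEL_ARGS[@]}"   # prints "ok"; the empty array contributes no words
```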