Skip to content

Commit 63e63ef

Browse files
authored
[VLM] optimize VLM processing (#1234)
1 parent 2e40a48 commit 63e63ef

File tree

21 files changed

+188
-80
lines changed

21 files changed

+188
-80
lines changed

.github/workflows/pr-test.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -48,7 +48,7 @@ jobs:
4848
strategy:
4949
fail-fast: false
5050
matrix:
51-
info: [{"num_gpus": 8, "test_file": "test_quick_start_glm4_9B.py"}, {"num_gpus": 8, "test_file": "test_qwen3_30B_A3B.py"}, {"num_gpus": 8, "test_file": "test_qwen3_4B_ppo.py"}, {"num_gpus": 8, "test_file": "test_moonlight_16B_A3B.py"}, {"num_gpus": 2, "test_file": "test_qwen3_4B_fsdp_true_on_policy.py"}, {"num_gpus": 8, "test_file": "test_mimo_7B_mtp_only_grad.py"}]
51+
info: [{"num_gpus": 8, "test_file": "test_quick_start_glm4_9B.py"}, {"num_gpus": 8, "test_file": "test_qwen3_30B_A3B.py"}, {"num_gpus": 8, "test_file": "test_qwen3_4B_ppo.py"}, {"num_gpus": 8, "test_file": "test_moonlight_16B_A3B.py"}, {"num_gpus": 2, "test_file": "test_qwen3_4B_fsdp_true_on_policy.py"}, {"num_gpus": 8, "test_file": "test_mimo_7B_mtp_only_grad.py"}, {"num_gpus": 8, "test_file": "test_qwen3_vl_4B_fsdp.py"}]
5252
defaults:
5353
run:
5454
working-directory: ${{ github.workspace }}

.github/workflows/pr-test.yml.j2

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@
88
{'test_file': 'test_moonlight_16B_A3B.py', 'num_gpus': 8},
99
{'test_file': 'test_qwen3_4B_fsdp_true_on_policy.py', 'num_gpus': 2},
1010
{'test_file': 'test_mimo_7B_mtp_only_grad.py', 'num_gpus': 8},
11+
{'test_file': 'test_qwen3_vl_4B_fsdp.py', 'num_gpus': 8},
1112
],
1213
},
1314
'e2e-test-long': {

docs/en/get_started/customization.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -270,7 +270,7 @@ dict: {
270270
"rollout_log_probs": list, # Log probs (for off-policy correction)
271271
"rollout_routed_experts": list, # Routed experts (for MoE)
272272
"metadata": list, # Train metadata
273-
"multimodal_inputs": list, # Multimodal inputs (for VLM)
273+
"multimodal_train_inputs": list, # Multimodal tensors (for VLM)
274274
"teacher_log_probs": list, # Teacher log probs (for distillation)
275275
}
276276
```

docs/zh/get_started/customization.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -270,7 +270,7 @@ dict: {
270270
"rollout_log_probs": list, # log 概率(用于离策略校正)
271271
"rollout_routed_experts": list, # 路由专家(用于 MoE)
272272
"metadata": list, # 训练元数据
273-
"multimodal_inputs": list, # 多模态输入(用于 VLM)
273+
"multimodal_train_inputs": list, # 多模态张量(用于 VLM)
274274
"teacher_log_probs": list, # 教师 log 概率(用于蒸馏)
275275
}
276276
```

examples/geo3k_vlm/run_geo3k_vlm.sh

Lines changed: 27 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -80,8 +80,6 @@ fi
8080
# Common args
8181
CKPT_ARGS=(
8282
--hf-checkpoint /root/models/${MODEL_NAME}
83-
# vl model has rotary base 5000000
84-
--rotary-base 5000000
8583
)
8684

8785
ROLLOUT_ARGS=(
@@ -154,41 +152,43 @@ MISC_ARGS=(
154152
# Backend-specific args
155153
if [ "$TRAIN_BACKEND" = "fsdp" ]; then
156154
BACKEND_ARGS=(
157-
--train-backend fsdp
158-
--gradient-checkpointing
159-
--sglang-attention-backend fa3
160-
--attn-implementation flash_attention_3
161-
--update-weight-buffer-size 536870912
155+
--train-backend fsdp
156+
--gradient-checkpointing
157+
--sglang-attention-backend fa3
158+
--attn-implementation flash_attention_3
159+
--update-weight-buffer-size 536870912
162160
)
163161
MODEL_ARGS=()
164162
else
165163
# megatron backend (default)
166164
BACKEND_ARGS=(
167-
--train-backend megatron
168-
--load /root/models/${MODEL_NAME}
169-
--tensor-model-parallel-size 4
170-
--sequence-parallel
171-
--pipeline-model-parallel-size 1
172-
--context-parallel-size 1
173-
--expert-model-parallel-size 1
174-
--expert-tensor-parallel-size 1
175-
--recompute-granularity full
176-
--recompute-method uniform
177-
--recompute-num-layers 1
178-
--use-dynamic-batch-size
179-
--max-tokens-per-gpu 4096
180-
--attention-dropout 0.0
181-
--hidden-dropout 0.0
182-
--accumulate-allreduce-grads-in-fp32
183-
--attention-softmax-in-fp32
184-
--attention-backend flash
185-
--megatron-to-hf-mode bridge
165+
--train-backend megatron
166+
--load /root/models/${MODEL_NAME}
167+
--tensor-model-parallel-size 4
168+
--sequence-parallel
169+
--pipeline-model-parallel-size 1
170+
--context-parallel-size 1
171+
--expert-model-parallel-size 1
172+
--expert-tensor-parallel-size 1
173+
--recompute-granularity full
174+
--recompute-method uniform
175+
--recompute-num-layers 1
176+
--use-dynamic-batch-size
177+
--max-tokens-per-gpu 4096
178+
--attention-dropout 0.0
179+
--hidden-dropout 0.0
180+
--accumulate-allreduce-grads-in-fp32
181+
--attention-softmax-in-fp32
182+
--attention-backend flash
183+
--megatron-to-hf-mode bridge
186184
)
187185

188186
# get MODEL_ARGS from scripts/models for megatron backend
189187
SLIME_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." &>/dev/null && pwd)"
190188
MODEL_ARGS_FILE=$(echo "$MODEL_NAME" | sed 's/-Instruct//g; s/-Thinking//g; s/Qwen3-VL-/qwen3-/g; s/-2B/-1.7B/g')
191-
source "${SLIME_DIR}/scripts/models/${MODEL_ARGS_FILE}.sh"
189+
# VL models require rotary-base 5000000
190+
MODEL_ARGS_ROTARY_BASE=5000000 source "${SLIME_DIR}/scripts/models/${MODEL_ARGS_FILE}.sh"
191+
192192
fi
193193

194194
# Start Ray if not using external Ray

examples/geo3k_vlm/run_geo3k_vlm_sft.sh

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -72,7 +72,6 @@ fi
7272
CKPT_ARGS=(
7373
--hf-checkpoint /root/models/${MODEL_NAME}
7474
--load /root/models/${MODEL_NAME}
75-
--rotary-base 5000000
7675
)
7776

7877
SFT_ARGS=(
@@ -152,7 +151,8 @@ else
152151
# get MODEL_ARGS from scripts/models for megatron backend
153152
SLIME_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." &>/dev/null && pwd)"
154153
MODEL_ARGS_FILE=$(echo "$MODEL_NAME" | sed 's/-Instruct//g; s/-Thinking//g; s/Qwen3-VL-/qwen3-/g; s/-2B/-1.7B/g')
155-
source "${SLIME_DIR}/scripts/models/${MODEL_ARGS_FILE}.sh"
154+
# VL models require rotary-base 5000000
155+
MODEL_ARGS_ROTARY_BASE=5000000 source "${SLIME_DIR}/scripts/models/${MODEL_ARGS_FILE}.sh"
156156
fi
157157

158158
# Start Ray if not using external Ray

scripts/models/qwen3-1.7B.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@ MODEL_ARGS=(
1010
--disable-bias-linear
1111
--normalization "RMSNorm"
1212
--norm-epsilon 1e-6
13-
--rotary-base 1000000
13+
--rotary-base "${MODEL_ARGS_ROTARY_BASE:-1000000}"
1414
--vocab-size 151936
1515
--kv-channels 128
1616
--qk-layernorm

scripts/models/qwen3-235B-A22B.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,7 @@ MODEL_ARGS=(
3232
--untie-embeddings-and-output-weights
3333
--vocab-size 151936
3434

35-
--rotary-base 1000000
35+
--rotary-base "${MODEL_ARGS_ROTARY_BASE:-1000000}"
3636

3737
# moe
3838
--moe-ffn-hidden-size 1536

scripts/models/qwen3-30B-A3B.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,7 @@ MODEL_ARGS=(
3232
--untie-embeddings-and-output-weights
3333
--vocab-size 151936
3434

35-
--rotary-base 1000000
35+
--rotary-base "${MODEL_ARGS_ROTARY_BASE:-1000000}"
3636

3737
# moe
3838
--moe-ffn-hidden-size 768

scripts/models/qwen3-8B.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@ MODEL_ARGS=(
1010
--disable-bias-linear
1111
--normalization "RMSNorm"
1212
--norm-epsilon 1e-6
13-
--rotary-base 1000000
13+
--rotary-base "${MODEL_ARGS_ROTARY_BASE:-1000000}"
1414
--vocab-size 151936
1515
--kv-channels 128
1616
--qk-layernorm

0 commit comments

Comments (0)