PaddlePaddle · xuxinyi389 · Mar 10, 2026 · Mar 10, 2026
diff --git a/tests/config/benchmark/config/sft/Qwen3-VL-30B-A3B-Instruct.yaml b/tests/config/benchmark/config/sft/Qwen3-VL-30B-A3B-Instruct.yaml
@@ -6,7 +6,7 @@ train_dataset_prob: "1.0"
 eval_dataset_path: /root/paddlejob/share-storage/gpfs/system-public/efficient_benchmark/dataset/coco_grounding/val.jsonl
 eval_dataset_prob: "1.0"
 max_seq_len: 8192
-packing: false
+packing: true
 mix_strategy: concat
 template_backend: custom
 template: qwen3_vl
@@ -31,7 +31,7 @@ evaluation_strategy: steps
 save_steps: 500
 save_strategy: steps
 logging_steps: 1
-gradient_accumulation_steps: 8
+gradient_accumulation_steps: 16
 logging_dir: ./vdl_log
 output_dir: ./checkpoints/qwen3-vl-sft-full
 disable_tqdm: true
@@ -61,7 +61,7 @@ moe_grouped_gemm: true
 moe_deep_gemm: true
 
 apply_rope_fusion: False
-# moe_router_force_load_balancing: true
+moe_router_force_load_balancing: false
 
 # sharding
 split_param: true

diff --git a/tests/config/benchmark/config/sft/Qwen3-VL-8B-Instruct.yaml b/tests/config/benchmark/config/sft/Qwen3-VL-8B-Instruct.yaml
@@ -32,7 +32,7 @@ save_steps: 500
 save_strategy: steps
 logging_steps: 1
 save_total_limit: 1
-gradient_accumulation_steps: 8
+gradient_accumulation_steps: 32
 logging_dir: ./vdl_log_sft_full_tp_8B_coco
 output_dir: ./checkpoints/qwen3-vl-sft-full-tp-8B_coco
 disable_tqdm: true
@@ -44,7 +44,7 @@ warmup_steps: 20
 learning_rate: 1.0e-5
 
 # performance
-tensor_model_parallel_size: 4
+tensor_model_parallel_size: 2
 pipeline_model_parallel_size: 1
 sharding: stage1
 recompute_granularity: full
@@ -57,4 +57,6 @@ save_checkpoint_format: "flex_checkpoint"
 load_checkpoint_format: "flex_checkpoint"
 freeze_config: freeze_vision freeze_aligner
 
-benchmark: true
+benchmark: true
+dataloader_num_workers: 8
+prefetch_factor: 8