bugfix: improve Qwen2.5-VL accuracy when tensor parallelism is enabled.

wly-115 · web-flow · commit b77b20e54412 · 2025-09-05T12:05:53.000+08:00
diff --git a/xllm/core/layers/npu/qwen2_5_vision_encoder_layer.cpp b/xllm/core/layers/npu/qwen2_5_vision_encoder_layer.cpp
@@ -158,15 +158,8 @@ void Qwen2_5VisionEncoderImpl::pad_qkv_weights() {
   auto qkv_proj_weight_reshaped =
       qkv_proj_weight.reshape({num_heads_pre_rank, 3, 80, hidden_size});
 
-  auto first_half =
-      qkv_proj_weight_reshaped.index({torch::indexing::Slice(),
-                                      torch::indexing::Slice(),
-                                      torch::indexing::Slice(0, 40),
-                                      torch::indexing::Slice()});
-  auto second_half = qkv_proj_weight_reshaped.index({torch::indexing::Slice(),
-                                                     torch::indexing::Slice(),
-                                                     torch::indexing::Slice(40),
-                                                     torch::indexing::Slice()});
+  auto first_half = qkv_proj_weight_reshaped.slice(2, 0, 40);
+  auto second_half = qkv_proj_weight_reshaped.slice(2, 40, 80);
 
   auto first_half_padded = torch::nn::functional::pad(
       first_half, torch::nn::functional::PadFuncOptions({0, 0, 0, 24}));
@@ -182,12 +175,9 @@ void Qwen2_5VisionEncoderImpl::pad_qkv_weights() {
 
   auto qkv_proj_bias_reshaped =
       qkv_proj_bias.reshape({num_heads_pre_rank, 3, 80});
-  first_half = qkv_proj_bias_reshaped.index({torch::indexing::Slice(),
-                                             torch::indexing::Slice(),
-                                             torch::indexing::Slice(0, 40)});
-  second_half = qkv_proj_bias_reshaped.index({torch::indexing::Slice(),
-                                              torch::indexing::Slice(),
-                                              torch::indexing::Slice(40)});
+  first_half = qkv_proj_bias_reshaped.slice(2, 0, 40);
+  second_half = qkv_proj_bias_reshaped.slice(2, 40, 80);
+
   first_half_padded = torch::nn::functional::pad(
       first_half, torch::nn::functional::PadFuncOptions({0, 24}));
   second_half_padded = torch::nn::functional::pad(
@@ -202,31 +192,11 @@ void Qwen2_5VisionEncoderImpl::pad_qkv_weights() {
 
   auto out_proj_weight = at_weight_tensors_[IN_WATTENTION_OUT_WEIGHT];
 
-  if (encode_param_.worldSize == 1) {
-    out_proj_weight =
-        torch::nn::functional::pad(
-            out_proj_weight.reshape({hidden_size, num_heads_pre_rank * 2, 40}),
-            torch::nn::functional::PadFuncOptions({0, 24, 0, 0}))
-            .reshape({hidden_size, num_heads_pre_rank * 128});
-  } else if (encode_param_.worldSize > 1) {
-    auto reshaped =
-        out_proj_weight.reshape({num_heads_pre_rank, 80, hidden_size});
-
-    auto first_half = reshaped.slice(1, 0, 40);
-    auto second_half = reshaped.slice(1, 40, 80);
-
-    auto first_half_padded = torch::nn::functional::pad(
-        first_half, torch::nn::functional::PadFuncOptions({0, 0, 0, 24}));
-
-    auto second_half_padded = torch::nn::functional::pad(
-        second_half, torch::nn::functional::PadFuncOptions({0, 0, 0, 24}));
-
-    auto out_proj_weight_padded =
-        torch::cat({first_half_padded, second_half_padded}, 1);
-
-    out_proj_weight =
-        out_proj_weight_padded.reshape({num_heads_pre_rank * 128, hidden_size});
-  }
+  out_proj_weight =
+      torch::nn::functional::pad(
+          out_proj_weight.reshape({hidden_size, num_heads_pre_rank * 2, 40}),
+          torch::nn::functional::PadFuncOptions({0, 24, 0, 0}))
+          .reshape({hidden_size, num_heads_pre_rank * 128});
   at_weight_tensors_[IN_WATTENTION_OUT_WEIGHT] = out_proj_weight;
 }
 void Qwen2_5VisionEncoderImpl::merge_loaded_weights() {
diff --git a/xllm/models/qwen2_5_vl.h b/xllm/models/qwen2_5_vl.h
@@ -695,12 +695,7 @@ class Qwen2_5_VLForConditionalGenerationImpl : public torch::nn::Module {
   Qwen2_5_VLForConditionalGenerationImpl(const Context& context)
       : model_args_(context.get_model_args()),
         options_(context.get_tensor_options()) {
-    Context vision_context(ParallelArgs(0, 1, nullptr));
-    vision_context.set_model_args(model_args_);
-    vision_context.set_quant_args(context.get_quant_args());
-    vision_context.set_tensor_options(options_);
-    visual_ =
-        register_module("visual", Qwen2_5_VisionTransformer(vision_context));
+    visual_ = register_module("visual", Qwen2_5_VisionTransformer(context));
 
     language_model_ =
         register_module("language_model", QWen2ForCausalLM(context));