
Commit 49804aa

Cherry pick vLLM related bug fixes for 2.8 release branch (#5732)
1) FP8 GEMM: support an input of shape [M, B, K] with the output shape kept as [M, B, N], and keep the output stride consistent with the input. In some scenarios the input has shape [M, B, K] with stride [K, M*K, 1]; the output stride must then be [N, M*N, 1] to keep the layouts consistent.

2) Fix the QWEN-32B int4 TP=8 bug. When running the QWEN-32B int4 model with TP=8, the weight shape is [80, 5120] while the int4 GEMM group_size is 128, which is larger than the GEMM in_features (80). The group_size was therefore rewritten to 80, which is erroneous.
1 parent 4d90735 commit 49804aa
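
As a rough, standalone illustration of fix 1) (not code from this repository; shapes chosen to match the new test, and assuming the usual innermost stride of 1): scaling every stride of the [M, B, K] input except the innermost by N/K yields the [N, M*N, 1] output stride, so the output keeps the input's memory layout.

import torch

# Illustrative sizes: M rows, B batch, K in-features, N out-features.
M, B, K, N = 8, 1, 2, 3

# An [M, B, K] view with stride [K, M*K, 1], as produced by transposing a
# contiguous [B, M, K] tensor.
a = torch.randn(B, M, K).transpose(0, 1)
assert a.stride() == (K, M * K, 1)

# Mirror of the stride rule: divide by K, multiply by N, keep the innermost stride.
out_stride = [s // K * N for s in a.stride()[:-1]] + [a.stride(-1)]
assert out_stride == [N, M * N, 1]

# Allocate the output with the matching layout (cf. at::empty_strided in the C++
# change); transposing it back gives a contiguous [B, M, N] tensor.
out = torch.empty_strided((M, B, N), out_stride)
assert out.transpose(0, 1).is_contiguous()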

File tree

4 files changed: 32 additions, 12 deletions

csrc/gpu/aten/operators/fp8/FP8Linear.cpp

Lines changed: 9 additions & 1 deletion
@@ -139,7 +139,15 @@ Tensor fp8_gemm_w8a16(
     TORCH_CHECK(false, "linear only support for 2D and 3D tensors!\n");
   }
 
-  at::Tensor result = at::empty(result_shape, A.options());
+  // deal with input shape [m, b, k] stride [k, m * k, 1]
+  auto k = A.size(A.dim() - 1);
+  auto n = result_shape.back();
+  auto res_stride = A.strides().vec();
+  for (int i = 0; i < res_stride.size() - 1; i++) {
+    res_stride[i] = res_stride[i] / k * n;
+  }
+
+  at::Tensor result = at::empty_strided(result_shape, res_stride, A.options());
 
   // check if nt format
   bool is_nt = true;

csrc/gpu/oneDNN/DnnlMatmulQuant.h

Lines changed: 14 additions & 3 deletions
@@ -738,7 +738,7 @@ static inline void dnnl_matmul_w8a16_fp8(
 
   const int m = std::reduce(
       src_sz.begin(), src_sz.end() - 1, 1, std::multiplies<int64_t>());
-  const int n = o_sz[1]; // presume channel last format
+  const int n = o_sz.back(); // presume channel last format
   const int k = *(src_sz.end() - 1);
 
   // get device, engine, stream
@@ -791,11 +791,22 @@ static inline void dnnl_matmul_w8a16_fp8(
     tt = dnnl::trans_type_t::nt;
   }
 
-  int64_t lda = mat1.strides()[mat1.dim() - 2];
+  // get lda ldb and ldc
+  auto mat1_strides = mat1.strides();
+  int64_t leading_dim = -1;
+  if (mat1.dim() == 2) {
+    leading_dim = 0;
+  } else if (mat1.dim() == 3) {
+    leading_dim = mat1_strides[0] < mat1_strides[1] ? 0 : 1;
+  } else {
+    TORCH_CHECK(
+        false, "Unsupported input dimension for fp8 matmul: ", mat1.dim());
+  }
+  int64_t lda = mat1_strides[leading_dim];
   int64_t ldb = mat2.strides()[mat2.dim() - 1] == 1
       ? mat2.strides()[mat2.dim() - 2]
       : mat2.strides()[mat2.dim() - 1];
-  int64_t ldc = result.strides()[result.dim() - 2];
+  int64_t ldc = result.strides()[leading_dim];
 
   auto f_attr = [&](primitive_attr& pattr) {
 #ifdef USE_SCRATCHPAD_MODE
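
A small Python sketch of the leading-dimension choice introduced above (pick_leading_dim is a hypothetical name used only for illustration): for a 2D input the first stride is taken, while for a 3D input the smaller of the two outer strides is used, so both a contiguous [B, M, K] input and a transposed [M, B, K] view resolve lda (and ldc) to the same value.

# Sketch mirroring the leading-dimension selection in the C++ change.
def pick_leading_dim(strides, ndim):
    if ndim == 2:
        return 0
    if ndim == 3:
        # Pick whichever of the two outer dimensions has the smaller stride;
        # both contiguous [B, M, K] and transposed [M, B, K] inputs then
        # yield the same leading dimension value.
        return 0 if strides[0] < strides[1] else 1
    raise ValueError(f"Unsupported input dimension for fp8 matmul: {ndim}")

# Contiguous [B, M, K] with B=1, M=8, K=2 -> strides (16, 2, 1) -> lda = 2.
assert [16, 2, 1][pick_leading_dim([16, 2, 1], 3)] == 2
# Transposed [M, B, K] view -> strides (2, 16, 1) -> lda = 2 as well.
assert [2, 16, 1][pick_leading_dim([2, 16, 1], 3)] == 2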

intel_extension_for_pytorch/nn/utils/_quantize_convert.py

Lines changed: 1 addition & 5 deletions
@@ -316,11 +316,7 @@ def __init__(
         self.double_quant_scale_dtype = double_quant_scale_dtype
         self.compute_dtype = compute_dtype
         self.compress_statistics = compress_statistics
-        self.blocksize = (
-            blocksize
-            if blocksize != -1 and blocksize < self.in_features
-            else self.in_features
-        )
+        self.blocksize = blocksize if blocksize != -1 else self.in_features
         self.scheme = scheme
         self.weight_dtype = weight_dtype
         self.device = device
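
This is the change behind fix 2): the old expression also clamped the block size whenever it exceeded in_features, which is how the 128-element group size became 80 for QWEN-32B with TP=8. A before/after sketch (old_blocksize and new_blocksize are hypothetical names, illustration only):

def old_blocksize(blocksize, in_features):
    # Previous behavior: also clamps when blocksize exceeds in_features.
    return blocksize if blocksize != -1 and blocksize < in_features else in_features

def new_blocksize(blocksize, in_features):
    # Fixed behavior: only the -1 sentinel falls back to in_features.
    return blocksize if blocksize != -1 else in_features

# QWEN-32B int4 with TP=8 (per the commit message): in_features = 80, group_size = 128.
assert old_blocksize(128, 80) == 80   # erroneous rewrite
assert new_blocksize(128, 80) == 128  # group_size is preserved
assert new_blocksize(-1, 80) == 80    # -1 still defaults to in_features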

tests/gpu/examples/test_fp8_linear_v2.py

Lines changed: 8 additions & 3 deletions
@@ -66,11 +66,12 @@ def test_fp8_linear_v2(fp8_dtype, dtype, is_input_fp8, is_bias):
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
 @pytest.mark.parametrize("is_bias", [True, False])
 @pytest.mark.parametrize("is_nt", [True, False])
-def test_fp8_linear_w8a16(fp8_dtype, dtype, is_bias, is_nt):
+@pytest.mark.parametrize("is_mbk", [True, False])
+def test_fp8_linear_w8a16(fp8_dtype, dtype, is_bias, is_nt, is_mbk):
     seed = 1234
     torch.manual_seed(seed)
 
-    input = torch.randn([8, 2], dtype=dtype, device=torch.device("xpu")) / 10.0
+    input = torch.randn([1, 8, 2], dtype=dtype, device=torch.device("xpu")) / 10.0
     weight = torch.rand([3, 2], dtype=dtype).xpu() / 10.0
 
     gemm_ref = torch.nn.Linear(2, 3, bias=is_bias).xpu().to(dtype)
@@ -105,10 +106,14 @@ def test_fp8_linear_w8a16(fp8_dtype, dtype, is_bias, is_nt):
         ),
     )
 
+    if is_mbk:
+        input = input.transpose(0, 1)
+
     output_fp8 = fp8_linear(input, gemm_ref.bias.data.clone() if is_bias else None)
+    output_fp8 = output_fp8.transpose(0, 1) if is_mbk else output_fp8
 
     torch.testing.assert_close(output_fp8, output_ref, atol=1e-2, rtol=1e-2)
 
 
 if __name__ == "__main__":
-    test_fp8_linear_w8a16(torch.float8_e5m2, torch.float16, True)
+    test_fp8_linear_w8a16(torch.float8_e5m2, torch.float16, True, True, True)
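
For reference, the new is_mbk parameter exercises exactly the strided layout targeted by the kernel fix: transposing the contiguous [1, 8, 2] test input yields an [M, B, K] view with stride [K, M*K, 1]. A minimal standalone check (CPU tensors here, only to show the layout):

import torch

# Same shapes as the test: B=1, M=8, K=2.
x = torch.randn(1, 8, 2)      # contiguous [B, M, K], stride (16, 2, 1)
x_mbk = x.transpose(0, 1)     # [M, B, K] view, stride (2, 16, 1) == (K, M*K, 1)
print(x_mbk.shape, x_mbk.stride())  # torch.Size([8, 1, 2]) (2, 16, 1)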
