
Commit 9d489a8

rogerxfeng8, ganyi1996ppo, zhuwei, and wincent8 authored
add dequant+gemm for compute bound int4 gemm scenario (#4958) (#4964)
Co-authored-by: Pleaplusone <[email protected]>
Co-authored-by: zhuwei <[email protected]>
Co-authored-by: wincent8 <[email protected]>
1 parent c1ad0e7 · commit 9d489a8

File tree

1 file changed: +6 -2 lines


intel_extension_for_pytorch/nn/utils/_quantize_convert.py

Lines changed: 6 additions & 2 deletions
@@ -462,7 +462,11 @@ def forward(self, input: Tensor) -> Tensor:

         if xpu_gemm_use_xetla(self.force_xetla):
             # TODO input.shape[1] > 1 seems not work on gidx scenario, need to fix this bug
-            if input.shape[1] > 1 and not self.force_xetla:
+            if input.dim() == 3:
+                m = input.size(1)
+            else:
+                m = input.size(0)
+            if m > 1:
                 return dequant_gemm_block(input, self)
             return torch.ops.torch_ipex.mm_low_bits(
                 input,
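
The new dispatch deserves a word. The removed check read `input.shape[1]`, which is the sequence length only for a 3D `[batch, seq, hidden]` activation; for a 2D `[tokens, hidden]` activation it is the hidden size, which appears to be why the patch derives the GEMM M dimension per rank instead. Below is a minimal sketch of the rank-aware rule the added lines encode; `gemm_m_dim` and the shapes are illustrative, not part of the patch:

```python
import torch

def gemm_m_dim(input: torch.Tensor) -> int:
    # Hypothetical helper mirroring the added dispatch logic: for a 3D
    # activation [batch, seq, hidden] the GEMM M dimension is the sequence
    # length; for a 2D activation [tokens, hidden] it is the token count.
    return input.size(1) if input.dim() == 3 else input.size(0)

# m > 1 marks the compute-bound (prefill-like) case routed to dequant+gemm;
# m == 1 is the memory-bound decode case kept on mm_low_bits.
assert gemm_m_dim(torch.empty(4, 128, 4096)) == 128  # 3D, many tokens
assert gemm_m_dim(torch.empty(7, 4096)) == 7         # 2D, many tokens
assert gemm_m_dim(torch.empty(1, 4096)) == 1         # 2D, single token
```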
@@ -578,7 +582,7 @@ def convert_qmodel_recursive(module):

 def dequant_gemm_block(input, quant_layer, output=None):
     if quant_layer.g_idx is not None:
-        input = input[:, :, quant_layer.g_idx]
+        input = input[..., quant_layer.g_idx]
     if output is None:
         output = torch.ops.torch_ipex.mm_common(
             input,
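
The second hunk is the companion fix: once 2D activations can reach `dequant_gemm_block`, the `g_idx` gather must not assume three dimensions. A small sketch of why `[..., quant_layer.g_idx]` is the rank-agnostic form; the 4-element `g_idx` permutation and shapes are made up for illustration:

```python
import torch

# An ellipsis index permutes the last dimension for any input rank,
# while [:, :, g_idx] hard-codes exactly three dimensions and raises
# IndexError for the 2D activations the new m-dispatch now admits.
g_idx = torch.tensor([2, 0, 1, 3])  # illustrative group-index permutation

x2d = torch.arange(8.0).reshape(2, 4)      # [tokens, hidden]
x3d = torch.arange(24.0).reshape(2, 3, 4)  # [batch, seq, hidden]

print(x2d[..., g_idx].shape)  # torch.Size([2, 4])
print(x3d[..., g_idx].shape)  # torch.Size([2, 3, 4])
# x2d[:, :, g_idx] would raise: too many indices for a 2-D tensor
```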
