Commit 98e1601

Xia-Weiwen authored and pytorchmergebot committed
[Quant][CPU] add a wrapper op for _weight_int4pack_mm_for_cpu with tensor args (pytorch#145245)

**Summary**
This is part of the task to enable max-autotune with the GEMM template for WoQ INT4 GEMM on CPU. This PR adds a wrapper op in the `quantized` namespace for `torch.ops.aten._weight_int4pack_mm_for_cpu`, whose arguments are all tensors. It will be used in Inductor lowering with max-autotune, where scalar arguments are difficult to handle. The new op is not registered to
- `aten`, because that would require changing `native_functions.yaml`, which is not recommended.
- `quantized_decomposed`, because it would only have a Python implementation, which cannot be used with the cpp wrapper in Inductor.

**Test plan**
```
python test/test_linalg.py -k test__int4_mm
```

Pull Request resolved: pytorch#145245
Approved by: https://github.com/leslie-fang-intel, https://github.com/jgong5, https://github.com/jerryzh168
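
For reference, here is a minimal usage sketch of the new wrapper op, mirroring the test added in `test/test_linalg.py` below. It is not part of the commit; the preparation of `a`, `b_int4pack`, `q_group`, and `b_scales_and_zeros` is assumed to be the same as for the existing `torch._weight_int4pack_mm_for_cpu`, and the shapes in the comments are my reading of the checks in this PR.

```python
import torch

def int4_mm_via_wrapper(a, b_int4pack, q_group, b_scales_and_zeros):
    # Assumed inputs (prepared elsewhere, as for torch._weight_int4pack_mm_for_cpu):
    #   a:                  [M, K] activation, fp32/fp16/bf16
    #   b_int4pack:         2D uint8 packed INT4 weight
    #   q_group:            Python int group size (32/64/128/256)
    #   b_scales_and_zeros: [K // q_group, N, 2] scales/zero points, same dtype as a

    # Existing aten op: the group size is a Python scalar.
    c = torch._weight_int4pack_mm_for_cpu(a, b_int4pack, q_group, b_scales_and_zeros)

    # New wrapper op: all arguments are tensors, so the group size is passed as an
    # int64 tensor. This is the form Inductor's max-autotune lowering can handle.
    q_group_t = torch.tensor(q_group, dtype=torch.int64)
    c2 = torch.ops.quantized.int4mm_packed_weight_cpu(a, b_int4pack, q_group_t, b_scales_and_zeros)

    assert torch.equal(c, c2)  # both paths dispatch to the same CPU kernel
    return c
```
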
1 parent ac0f206 commit 98e1601

6 files changed, +49 -1 lines changed

aten/src/ATen/native/LinearAlgebra.cpp
Lines changed: 2 additions & 0 deletions

@@ -3489,6 +3489,8 @@ Tensor _weight_int4pack_mm_cpu(
   TORCH_CHECK(qGroupSize == 32 || qGroupSize == 64 || qGroupSize == 128
       || qGroupSize == 256,
       __func__, ": expect qGroupSize to be 32, 64, 128 or 256, got ", qGroupSize);
+  TORCH_CHECK(K % qGroupSize == 0,
+      __func__, ": expect K to be divisible by qGroupSize, got K:", K, ", qGroupSize:", qGroupSize);
 
   TORCH_CHECK(qScaleAndZeros.dim() == 3 && qScaleAndZeros.size(1) == N
       && qScaleAndZeros.size(2) == 2,

aten/src/ATen/native/quantized/cpu/qlinear.cpp
Lines changed: 13 additions & 0 deletions

@@ -25,6 +25,7 @@
 #include <ATen/ops/quantize_per_channel_native.h> // for quantize_per_ch...
 #include <ATen/ops/quantize_per_tensor_native.h> // for quantize_per_te...
 #include <ATen/ops/zeros.h>
+#include <ATen/ops/_weight_int4pack_mm_for_cpu.h>
 #endif
 
 #include <c10/util/irange.h>
@@ -1179,6 +1180,17 @@ namespace at::native {
   TORCH_CHECK(false, "Unimplemented (int8 linear with packed weight and bias)");
 }
 
+Tensor _weight_int4pack_mm_cpu_tensor(
+    const Tensor& A,
+    const Tensor& B,
+    const Tensor& qGroupSize,
+    const Tensor& qScaleAndZeros) {
+  TORCH_CHECK(qGroupSize.numel() == 1, __func__, ": group size must be a scalar.");
+  TORCH_CHECK(qGroupSize.scalar_type() == c10::kLong, __func__, ": group size must be int64.");
+  int group_size = qGroupSize.item<int64_t>();
+  return at::_weight_int4pack_mm_for_cpu(A, B, group_size, qScaleAndZeros);
+}
+
 
 namespace {
 
@@ -1346,6 +1358,7 @@ TORCH_LIBRARY_IMPL(_quantized, QuantizedCPU, m) {
 TORCH_LIBRARY_IMPL(quantized, CPU, m) {
   m.impl(TORCH_SELECTIVE_NAME("quantized::linear_with_input_q_dq_qweight_dq_output_fp32"), TORCH_FN(QLinearInt8FusedQDQ<false>::run));
   m.impl(TORCH_SELECTIVE_NAME("quantized::linear_with_input_q_dq_qweight_dq_relu_output_fp32"), TORCH_FN(QLinearInt8FusedQDQ<true>::run));
+  m.impl(TORCH_SELECTIVE_NAME("quantized::int4mm_packed_weight_cpu"), TORCH_FN(at::native::_weight_int4pack_mm_cpu_tensor));
 }
 
 TORCH_LIBRARY_IMPL(onednn, MkldnnCPU, m) {

aten/src/ATen/native/quantized/cpu/qlinear.h
Lines changed: 6 additions & 0 deletions

@@ -42,4 +42,10 @@ C10_API static Tensor run_pointwise_binary_tensor(
       std::string_view unary_post_op_algorithm);
 };
 
+C10_API Tensor _weight_int4pack_mm_cpu_tensor(
+    const Tensor& A,
+    const Tensor& B,
+    const Tensor& qGroupSize,
+    const Tensor& qScaleAndZeros);
+
 } // namespace at::native

aten/src/ATen/native/quantized/library.cpp
Lines changed: 1 addition & 0 deletions

@@ -216,6 +216,7 @@ TORCH_LIBRARY(quantized, m) {
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::prelu(Tensor qx, Tensor weight, float output_scale, int output_zero_point) -> Tensor"), {at::Tag::pt2_compliant_tag});
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::sigmoid(Tensor qx, float output_scale, int output_zero_point) -> Tensor"), {at::Tag::pt2_compliant_tag});
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::softmax(Tensor qx, int dim, float output_scale, int output_zero_point) -> Tensor"), {at::Tag::pt2_compliant_tag});
+  m.def(TORCH_SELECTIVE_SCHEMA("quantized::int4mm_packed_weight_cpu(Tensor self, Tensor mat2, Tensor qGroupSize, Tensor qScaleAndZeros) -> Tensor"));
 }
 
 // According to #33294: The "_" prefix registration will be

test/test_linalg.py
Lines changed: 8 additions & 1 deletion

@@ -6644,9 +6644,16 @@ def weight_int4pack_mm(a, b_int4pack, b_scales_and_zeros):
             if self.device_type == 'cpu':
                 self.assertTrue(b_int4pack.dtype is torch.uint8)
                 self.assertTrue(b_int4pack.dim() == 2)
-                return torch._weight_int4pack_mm_for_cpu(
+                c = torch._weight_int4pack_mm_for_cpu(
                     a, b_int4pack, q_group, b_scales_and_zeros
                 )
+                # test wrapper
+                q_group_t = torch.tensor(q_group, dtype=torch.int64, device=device)
+                c_2 = torch.ops.quantized.int4mm_packed_weight_cpu(
+                    a, b_int4pack, q_group_t, b_scales_and_zeros
+                )
+                assert torch.equal(c, c_2)
+                return c
             else:
                 self.assertTrue(b_int4pack.dtype is torch.int32)
                 self.assertTrue(b_int4pack.dim() == 4)

torch/_meta_registrations.py
Lines changed: 19 additions & 0 deletions

@@ -2658,6 +2658,25 @@ def meta_quantized_max_pool2d(
         memory_format=memory_format,
     )
 
+@register_meta(torch.ops.quantized.int4mm_packed_weight_cpu)
+def meta_int4mm_packed_weight_cpu(x, w, q_group_size, q_scale_and_zeros):
+    torch._check(x.dim() == 2, f"x must be a 2D tensor, got {x.dim()}D")
+    torch._check(w.dim() == 2, f"w must be a 2D tensor, got {w.dim()}D")
+    torch._check(
+        x.dtype in [torch.float32, torch.float16, torch.bfloat16],
+        f"expected x to be f32/f16/bf16, got {x.dtype}",
+    )
+    torch._check(w.dtype == torch.uint8, f"expected w to be uint8, got {w.dtype}")
+    torch._check(
+        q_group_size.dtype == torch.int64,
+        f"q_group_size must be int64, got {q_group_size.dtype}",
+    )
+    torch._check(
+        q_scale_and_zeros.dtype == x.dtype,
+        f"q_scale_and_zeros must have the same dtype as x, got {q_scale_and_zeros.dtype}",
+    )
+    return x.new_empty(x.size(0), w.size(0), dtype=x.dtype)
+
 
 # from check_dim_size() in aten/src/ATen/TensorUtils.cpp.
 def check_dim_size(tensor, dim, dim_size, size):
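
As an aside (not part of the commit), the meta registration above is what allows the op to be traced without real data. Below is a minimal sketch under `FakeTensorMode`, which Inductor tracing relies on; the shapes are illustrative assumptions (M=4, K=64, N=32, group size 32), and the output shape/dtype should come from `meta_int4mm_packed_weight_cpu` via `x.new_empty(x.size(0), w.size(0))`.

```python
import torch
from torch._subclasses.fake_tensor import FakeTensorMode

with FakeTensorMode():
    # Illustrative, assumed shapes; no real data is allocated under fake mode.
    x = torch.empty(4, 64, dtype=torch.bfloat16)              # activation [M, K]
    w = torch.empty(32, 32, dtype=torch.uint8)                 # packed INT4 weight (2D, uint8)
    group_size = torch.empty((), dtype=torch.int64)            # tensor-valued group size
    scales_and_zeros = torch.empty(2, 32, 2, dtype=torch.bfloat16)

    out = torch.ops.quantized.int4mm_packed_weight_cpu(x, w, group_size, scales_and_zeros)
    print(out.shape, out.dtype)  # expected: torch.Size([4, 32]) torch.bfloat16
```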
