Merge pull request #476 from InfiniTensor/issue/474

PanZezhong1725 · web-flow · commit 6b903fd94caa · 2025-09-24T09:42:07.000+08:00
issue/474: rename Dequantize to DequantizeAWQ in nvidia gpu
diff --git a/include/infiniop.h b/include/infiniop.h
@@ -7,7 +7,7 @@
 #include "infiniop/ops/causal_softmax.h"
 #include "infiniop/ops/clip.h"
 #include "infiniop/ops/conv.h"
-#include "infiniop/ops/dequantize.h"
+#include "infiniop/ops/dequantize_awq.h"
 #include "infiniop/ops/gemm.h"
 #include "infiniop/ops/mul.h"
 #include "infiniop/ops/random_sample.h"
diff --git a/include/infiniop/ops/dequantize.h b/include/infiniop/ops/dequantize.h
diff --git a/include/infiniop/ops/dequantize_awq.h b/include/infiniop/ops/dequantize_awq.h
@@ -0,0 +1,28 @@
+#ifndef __INFINIOP_DEQUANTIZE_AWQ_API_H__
+#define __INFINIOP_DEQUANTIZE_AWQ_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopDequantizeAWQDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateDequantizeAWQDescriptor(infiniopHandle_t handle,
+                                                                  infiniopDequantizeAWQDescriptor_t *desc_ptr,
+                                                                  infiniopTensorDescriptor_t out_desc,
+                                                                  infiniopTensorDescriptor_t qweight_desc,
+                                                                  infiniopTensorDescriptor_t scales_desc,
+                                                                  infiniopTensorDescriptor_t zeros_desc);
+
+__C __export infiniStatus_t infiniopGetDequantizeAWQWorkspaceSize(infiniopDequantizeAWQDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopDequantizeAWQ(infiniopDequantizeAWQDescriptor_t desc,
+                                                  void *workspace,
+                                                  size_t workspace_size,
+                                                  void *out,
+                                                  const void *qweight,
+                                                  const void *scales,
+                                                  const void *zeros,
+                                                  void *stream);
+
+__C __export infiniStatus_t infiniopDestroyDequantizeAWQDescriptor(infiniopDequantizeAWQDescriptor_t desc);
+
+#endif
diff --git a/src/infiniop/ops/dequantize/nvidia/dequantize_w42f16_nvidia.cuh b/src/infiniop/ops/dequantize/nvidia/dequantize_w42f16_nvidia.cuh
diff --git a/src/infiniop/ops/dequantize_awq/dequantize_awq.h b/src/infiniop/ops/dequantize_awq/dequantize_awq.h
@@ -1,5 +1,5 @@
-#ifndef __DEQUANTIZE_H__
-#define __DEQUANTIZE_H__
+#ifndef __DEQUANTIZE_AWQ_H__
+#define __DEQUANTIZE_AWQ_H__
 
 #include "../../../utils.h"
 #include "../../operator.h"
@@ -8,17 +8,17 @@
 
 #define DESCRIPTOR(NAMESPACE)                                    \
                                                                  \
-    namespace op::dequantize::NAMESPACE {                        \
+    namespace op::dequantize_awq::NAMESPACE {                    \
     class Descriptor final : public InfiniopDescriptor {         \
         struct Opaque;                                           \
         Opaque *_opaque;                                         \
-        DequantizeInfo _info;                                    \
+        DequantizeAWQInfo _info;                                 \
         size_t _workspace_size;                                  \
                                                                  \
         Descriptor(                                              \
             size_t workspace_size_,                              \
             Opaque *opaque,                                      \
-            DequantizeInfo info,                                 \
+            DequantizeAWQInfo info,                              \
             infiniDevice_t device_type,                          \
             int device_id)                                       \
             : InfiniopDescriptor{device_type, device_id},        \
@@ -49,4 +49,5 @@
             void *stream) const;                                 \
     };                                                           \
     }
-#endif
+
+#endif // __DEQUANTIZE_AWQ_H__
diff --git a/src/infiniop/ops/dequantize_awq/info.h b/src/infiniop/ops/dequantize_awq/info.h
@@ -1,14 +1,14 @@
-#ifndef __DEQUANTIZE_INFO_H__
-#define __DEQUANTIZE_INFO_H__
+#ifndef __DEQUANTIZE_AWQ_INFO_H__
+#define __DEQUANTIZE_AWQ_INFO_H__
 
 #include "../../../utils.h"
 #include "../../tensor.h"
 #include <vector>
 
-namespace op::dequantize {
+namespace op::dequantize_awq {
 
-class DequantizeInfo {
-    DequantizeInfo() = default;
+class DequantizeAWQInfo {
+    DequantizeAWQInfo() = default;
 
 public:
     int _in_features, _out_features, _num_groups;
@@ -17,7 +17,7 @@ class DequantizeInfo {
     int out_features() const { return _out_features; }
     int num_groups() const { return _num_groups; }
 
-    static utils::Result<DequantizeInfo> create(
+    static utils::Result<DequantizeAWQInfo> create(
         infiniopTensorDescriptor_t out_desc,
         infiniopTensorDescriptor_t qweight_desc,
         infiniopTensorDescriptor_t scales_desc,
@@ -27,13 +27,13 @@ class DequantizeInfo {
         int _out_features = qweight_desc->dim(1);
         int _num_groups = scales_desc->dim(0);
 
-        return utils::Result<DequantizeInfo>(DequantizeInfo{
+        return utils::Result<DequantizeAWQInfo>(DequantizeAWQInfo{
             _in_features,
             _out_features,
             _num_groups});
     }
 };
 
-} // namespace op::dequantize
+} // namespace op::dequantize_awq
 
-#endif // __DEQUANTIZE_INFO_H__
+#endif // __DEQUANTIZE_AWQ_INFO_H__
diff --git a/src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_kernel.cuh b/src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_kernel.cuh
diff --git a/src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_nvidia.cu b/src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_nvidia.cu
@@ -5,7 +5,7 @@
 #include "dequantize_w42f16_kernel.cuh"
 #include "dequantize_w42f16_nvidia.cuh"
 
-#include "../dequantize.h"
+#include "../dequantize_awq.h"
 #include <cuda_fp16.h>
 
 __global__ void __launch_bounds__(64)
@@ -68,7 +68,7 @@ __global__ void __launch_bounds__(64)
     }
 }
 
-namespace op::dequantize::nvidia {
+namespace op::dequantize_awq::nvidia {
 
 struct Descriptor::Opaque {
     std::shared_ptr<device::nvidia::Handle::Internal> internal;
@@ -87,7 +87,7 @@ infiniStatus_t Descriptor::create(
     infiniopTensorDescriptor_t zeros_desc) {
 
     auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
-    auto result = DequantizeInfo::create(out_desc, qweight_desc, scales_desc, zeros_desc);
+    auto result = DequantizeAWQInfo::create(out_desc, qweight_desc, scales_desc, zeros_desc);
 
     *desc_ptr = new Descriptor(
         0,
@@ -133,6 +133,6 @@ Descriptor::calculate(
     return INFINI_STATUS_SUCCESS;
 }
 
-} // namespace op::dequantize::nvidia
+} // namespace op::dequantize_awq::nvidia
 
 #endif
diff --git a/src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_nvidia.cuh b/src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __DEQUANTIZE_AWQ_CUDA_CUH__
+#define __DEQUANTIZE_AWQ_CUDA_CUH__
+
+#include "../dequantize_awq.h"
+
+DESCRIPTOR(nvidia)
+
+#endif // __DEQUANTIZE_AWQ_CUDA_CUH__
diff --git a/src/infiniop/ops/dequantize_awq/operator.cc b/src/infiniop/ops/dequantize_awq/operator.cc
@@ -1,27 +1,27 @@
 #include "../../operator.h"
 #include "../../handle.h"
-#include "infiniop/ops/dequantize.h"
+#include "infiniop/ops/dequantize_awq.h"
 
 #ifdef ENABLE_NVIDIA_API
 #include "nvidia/dequantize_w42f16_nvidia.cuh"
 #endif
 
-__C infiniStatus_t infiniopCreateDequantizeDescriptor(
+__C infiniStatus_t infiniopCreateDequantizeAWQDescriptor(
     infiniopHandle_t handle,
-    infiniopDequantizeDescriptor_t *desc_ptr,
+    infiniopDequantizeAWQDescriptor_t *desc_ptr,
     infiniopTensorDescriptor_t out_desc,
     infiniopTensorDescriptor_t qweight_desc,
     infiniopTensorDescriptor_t scales_desc,
     infiniopTensorDescriptor_t zeros_desc) {
 
-#define CREATE(CASE, NAMESPACE)                                                   \
-    case CASE:                                                                    \
-        return op::dequantize::NAMESPACE::Descriptor::create(                     \
-            handle,                                                               \
-            reinterpret_cast<op::dequantize::NAMESPACE::Descriptor **>(desc_ptr), \
-            out_desc,                                                             \
-            qweight_desc,                                                         \
-            scales_desc,                                                          \
+#define CREATE(CASE, NAMESPACE)                                                       \
+    case CASE:                                                                        \
+        return op::dequantize_awq::NAMESPACE::Descriptor::create(                     \
+            handle,                                                                   \
+            reinterpret_cast<op::dequantize_awq::NAMESPACE::Descriptor **>(desc_ptr), \
+            out_desc,                                                                 \
+            qweight_desc,                                                             \
+            scales_desc,                                                              \
             zeros_desc)
 
     switch (handle->device) {
@@ -35,11 +35,11 @@ __C infiniStatus_t infiniopCreateDequantizeDescriptor(
 #undef CREATE
 }
 
-__C infiniStatus_t infiniopGetDequantizeWorkspaceSize(infiniopDequantizeDescriptor_t desc,
-                                                      size_t *size) {
-#define GET(CASE, NAMESPACE)                                                                            \
-    case CASE:                                                                                          \
-        *size = reinterpret_cast<const op::dequantize::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
+__C infiniStatus_t infiniopGetDequantizeAWQWorkspaceSize(infiniopDequantizeAWQDescriptor_t desc,
+                                                         size_t *size) {
+#define GET(CASE, NAMESPACE)                                                                                \
+    case CASE:                                                                                              \
+        *size = reinterpret_cast<const op::dequantize_awq::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
         return INFINI_STATUS_SUCCESS
 
     switch (desc->device_type) {
@@ -52,8 +52,8 @@ __C infiniStatus_t infiniopGetDequantizeWorkspaceSize(infiniopDequantizeDescript
 #undef GET
 }
 
-__C infiniStatus_t infiniopDequantize(
-    infiniopDequantizeDescriptor_t desc,
+__C infiniStatus_t infiniopDequantizeAWQ(
+    infiniopDequantizeAWQDescriptor_t desc,
     void *workspace,
     size_t workspace_size,
     void *out,
@@ -62,9 +62,9 @@ __C infiniStatus_t infiniopDequantize(
     const void *zeros,
     void *stream) {
 
-#define CALCULATE(CASE, NAMESPACE)                                                   \
-    case CASE:                                                                       \
-        return reinterpret_cast<const op::dequantize::NAMESPACE::Descriptor *>(desc) \
+#define CALCULATE(CASE, NAMESPACE)                                                       \
+    case CASE:                                                                           \
+        return reinterpret_cast<const op::dequantize_awq::NAMESPACE::Descriptor *>(desc) \
             ->calculate(workspace, workspace_size, out, qweight, scales, zeros, stream)
 
     switch (desc->device_type) {
@@ -79,11 +79,11 @@ __C infiniStatus_t infiniopDequantize(
 }
 
 __C infiniStatus_t
-infiniopDestroyDequantizeDescriptor(infiniopDequantizeDescriptor_t desc) {
+infiniopDestroyDequantizeAWQDescriptor(infiniopDequantizeAWQDescriptor_t desc) {
 
-#define DELETE(CASE, NAMESPACE)                                                       \
-    case CASE:                                                                        \
-        delete reinterpret_cast<const op::dequantize::NAMESPACE::Descriptor *>(desc); \
+#define DELETE(CASE, NAMESPACE)                                                           \
+    case CASE:                                                                            \
+        delete reinterpret_cast<const op::dequantize_awq::NAMESPACE::Descriptor *>(desc); \
         return INFINI_STATUS_SUCCESS;
 
     switch (desc->device_type) {
diff --git a/test/infiniop/dequantize_awq.py b/test/infiniop/dequantize_awq.py
@@ -140,7 +140,7 @@
 AWQ_REVERSE_ORDER = [0, 4, 1, 5, 2, 6, 3, 7]
 
 
-def dequantize(
+def dequantize_awq(
     qweight: torch.Tensor,
     qzeros: torch.Tensor,
     qscales: torch.Tensor,
@@ -216,7 +216,7 @@ def test(
     sync=None,
 ):
     print(
-        f"Testing Dequantize on {InfiniDeviceNames[device]} with bits:{bits}, group_size:{group_size},"
+        f"Testing Dequantize AWQ on {InfiniDeviceNames[device]} with bits:{bits}, group_size:{group_size},"
         f" qweights_shape:{qweights_shape}, qzeros_shape:{qzeros_shape}, qscales_shape:{qscales_shape},"
         f" qweights_stride:{qweights_stride}, qzeros_stride:{qzeros_stride}, qscales_stride:{qscales_stride},"
         f" qweights_dtype:{InfiniDtypeNames[qweights_dtype]}, qzeros_dtype:{InfiniDtypeNames[qzeros_dtype]}, qscales_dtype:{InfiniDtypeNames[qscales_dtype]}"
@@ -225,29 +225,31 @@ def test(
     qweights = TestTensor(
         qweights_shape, qweights_stride, qweights_dtype, device, mode="randint"
     )
-    qzeros = TestTensor(qzeros_shape, qzeros_stride, qzeros_dtype, device, mode="randint")
+    qzeros = TestTensor(
+        qzeros_shape, qzeros_stride, qzeros_dtype, device, mode="randint"
+    )
     qscales = TestTensor(qscales_shape, qscales_stride, qscales_dtype, device)
     out = TestTensor(out_shape, out_stride, out_dtype, device, mode="zeros")
     ans = TestTensor(out_shape, out_stride, out_dtype, device, mode="ones")
 
     # Compute the PyTorch reference result
-    def torch_dequantize():
-        return dequantize(
+    def torch_dequantize_awq():
+        return dequantize_awq(
             qweights.torch_tensor(),
             qzeros.torch_tensor(),
             qscales.torch_tensor(),
             bits,
             group_size,
         )
 
-    ans = torch_dequantize()
+    ans = torch_dequantize_awq()
 
     if sync is not None:
         sync()
 
     descriptor = infiniopOperatorDescriptor_t()
     check_error(
-        LIBINFINIOP.infiniopCreateDequantizeDescriptor(
+        LIBINFINIOP.infiniopCreateDequantizeAWQDescriptor(
             handle,
             ctypes.byref(descriptor),
             out.descriptor,
@@ -264,16 +266,16 @@ def torch_dequantize():
     # Get workspace size and create workspace
     workspace_size = c_uint64(0)
     check_error(
-        LIBINFINIOP.infiniopGetDequantizeWorkspaceSize(
+        LIBINFINIOP.infiniopGetDequantizeAWQWorkspaceSize(
             descriptor, ctypes.byref(workspace_size)
         )
     )
     workspace = TestWorkspace(workspace_size.value, device)
 
-    # Execute infiniop gemm operator
+    # Execute infiniop dequantize_awq operator
-    def lib_dequantize():
+    def lib_dequantize_awq():
         check_error(
-            LIBINFINIOP.infiniopDequantize(
+            LIBINFINIOP.infiniopDequantizeAWQ(
                 descriptor,
                 workspace.data(),
                 workspace_size.value,
@@ -285,7 +287,7 @@ def lib_dequantize():
             )
         )
 
-    lib_dequantize()
+    lib_dequantize_awq()
 
     # Validate results
     atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
@@ -298,10 +300,10 @@ def lib_dequantize():
     # Profiling workflow
     if PROFILE:
         # fmt: off
-        profile_operation("PyTorch", lambda: torch_dequantize(), device, NUM_PRERUN, NUM_ITERATIONS)
-        profile_operation("    lib", lambda: lib_dequantize(), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("PyTorch", lambda: torch_dequantize_awq(), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_dequantize_awq(), device, NUM_PRERUN, NUM_ITERATIONS)
         # fmt: on
-    check_error(LIBINFINIOP.infiniopDestroyDequantizeDescriptor(descriptor))
+    check_error(LIBINFINIOP.infiniopDestroyDequantizeAWQDescriptor(descriptor))
 
 
 # ==============================================================================
diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py