From 05aafb21cc9c18e0d82829018cbdaa8224f7d22e Mon Sep 17 00:00:00 2001
From: xgqdut2016 <kenan_gewei@163.com>
Date: Fri, 26 Dec 2025 13:12:26 +0800
Subject: [PATCH] =?UTF-8?q?issue/843:=20=E5=A2=9E=E5=8A=A0per=5Fchannel=5F?=
 =?UTF-8?q?quant=5Fint8=E7=AE=97=E5=AD=90?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../ops/quant/per_channel_quant_int8.h        |  28 ++
 .../per_channel_quant_int8/cuda/kernel.cuh    | 316 ++++++++++++++++
 .../ops/quant/per_channel_quant_int8/info.h   |  59 +++
 .../nvidia/per_channel_quant_int8_nvidia.cu   | 118 ++++++
 .../nvidia/per_channel_quant_int8_nvidia.cuh  |   7 +
 .../quant/per_channel_quant_int8/operator.cc  |  98 +++++
 .../per_channel_quant_int8.h                  |  40 ++
 test/infiniop/libinfiniop/op_register.py      |  37 ++
 test/infiniop/per_channel_quant_int8.py       | 193 ++++++++++
 test/infiniop/w8a8int8.py                     | 348 ++++++++++++++++++
 xmake.lua                                     |   3 +-
 xmake/nvidia.lua                              |   2 +-
 xmake/qy.lua                                  |   2 +-
 13 files changed, 1248 insertions(+), 3 deletions(-)
 create mode 100644 include/infiniop/ops/quant/per_channel_quant_int8.h
 create mode 100644 src/infiniop/ops/quant/per_channel_quant_int8/cuda/kernel.cuh
 create mode 100644 src/infiniop/ops/quant/per_channel_quant_int8/info.h
 create mode 100644 src/infiniop/ops/quant/per_channel_quant_int8/nvidia/per_channel_quant_int8_nvidia.cu
 create mode 100644 src/infiniop/ops/quant/per_channel_quant_int8/nvidia/per_channel_quant_int8_nvidia.cuh
 create mode 100644 src/infiniop/ops/quant/per_channel_quant_int8/operator.cc
 create mode 100644 src/infiniop/ops/quant/per_channel_quant_int8/per_channel_quant_int8.h
 create mode 100644 test/infiniop/per_channel_quant_int8.py
 create mode 100644 test/infiniop/w8a8int8.py

diff --git a/include/infiniop/ops/quant/per_channel_quant_int8.h b/include/infiniop/ops/quant/per_channel_quant_int8.h
new file mode 100644
index 000000000..ce21f4556
--- /dev/null
+++ b/include/infiniop/ops/quant/per_channel_quant_int8.h
@@ -0,0 +1,28 @@
+#ifndef __INFINIOP_PER_CHANNEL_QUANT_INT8_API_H__
+#define __INFINIOP_PER_CHANNEL_QUANT_INT8_API_H__
+
+#include "../../operator_descriptor.h"
+
+typedef InfiniopDescriptor *infiniopPerChannelQuantI8Descriptor_t;
+// Creates a descriptor from the tensor descriptors of the int8 output, the scales, the zero points and the input x.
+__C __export infiniStatus_t infiniopCreatePerChannelQuantI8Descriptor(infiniopHandle_t handle,
+                                                                      infiniopPerChannelQuantI8Descriptor_t *desc_ptr,
+                                                                      infiniopTensorDescriptor_t x_packed_desc,
+                                                                      infiniopTensorDescriptor_t x_scale_desc,
+                                                                      infiniopTensorDescriptor_t x_zero_desc,
+                                                                      infiniopTensorDescriptor_t x_desc);
+// Stores the descriptor's minimum workspace size in *size.
+__C __export infiniStatus_t infiniopGetPerChannelQuantI8WorkspaceSize(infiniopPerChannelQuantI8Descriptor_t desc, size_t *size);
+// Quantizes x into x_packed and writes x_scale / x_zero; the NVIDIA backend treats a null x_zero as symmetric mode.
+__C __export infiniStatus_t infiniopPerChannelQuantI8(infiniopPerChannelQuantI8Descriptor_t desc,
+                                                      void *workspace,
+                                                      size_t workspace_size,
+                                                      void *x_packed,
+                                                      void *x_scale,
+                                                      void *x_zero,
+                                                      const void *x,
+                                                      void *stream);
+// Destroys a descriptor obtained from infiniopCreatePerChannelQuantI8Descriptor.
+__C __export infiniStatus_t infiniopDestroyPerChannelQuantI8Descriptor(infiniopPerChannelQuantI8Descriptor_t desc);
+
+#endif
diff --git a/src/infiniop/ops/quant/per_channel_quant_int8/cuda/kernel.cuh b/src/infiniop/ops/quant/per_channel_quant_int8/cuda/kernel.cuh
new file mode 100644
index 000000000..629cc3b5f
--- /dev/null
+++ b/src/infiniop/ops/quant/per_channel_quant_int8/cuda/kernel.cuh
@@ -0,0 +1,316 @@
+#ifndef __PERCHANNEL_QUANTINT8_KERNEL_CUH__
+#define __PERCHANNEL_QUANTINT8_KERNEL_CUH__
+
+#include <cub/block/block_reduce.cuh>
+/**
+ * Rounds a floating-point value to the nearest integer using
+ * the "half away from zero" tie-breaking rule.
+ *
+ * Ties (values exactly halfway between two integers) are rounded away
+ * from zero, for either sign:
+ *   positive: 1.5 -> 2, 2.5 -> 3
+ *   negative: -1.5 -> -2, -2.5 -> -3
+ * This differs from "round to nearest, ties to even" (banker's rounding).
+ *
+ * @param x The floating-point value to round.
+ * @return The rounded value converted to int.
+ *
+ * @note Device-only helper (declared __device__).
+ * @note Implemented as floorf(fabsf(x) + 0.5f) with the sign of x restored.
+ */
+__device__ inline int round_half_away_from_zero(float x) {
+    float ax = fabsf(x);
+    float r = floorf(ax + 0.5f);
+    return (x >= 0.0f) ? (int)r : -(int)r;
+}
+
+/**
+ * Asymmetric int8 quantization of one row of x per thread block.
+ *
+ * x is read as a dense row-major M x K matrix. Block `blockIdx.x` owns row
+ * `blockIdx.x`, reduces that row's max and min, and derives
+ *
+ *     scale = max((row_max - row_min) / 255, 1e-8)
+ *     zero  = -row_min / scale - 128
+ *     q     = clamp(round_half_away_from_zero(x / scale + zero), -128, 127)
+ *
+ * so x can be recovered approximately as (q - zero) * scale.
+ *
+ * @tparam Tdata Element type of x (cast to float for all arithmetic)
+ * @tparam BLOCK_SIZE Threads per block; must equal blockDim.x
+ *
+ * @param x_packed Output, M x K int8 quantized values
+ * @param x_scale Output, one float scale per row (M entries)
+ * @param x_zero Output, one float zero point per row (M entries)
+ * @param x Input matrix (M rows, K columns)
+ * @param M Number of rows (unused here; the grid is expected to have M blocks)
+ * @param K Number of columns
+ *
+ * @note The host launcher selects this kernel when K >= 1024.
+ */
+template <typename Tdata, unsigned int BLOCK_SIZE>
+__device__ void blockPerChannelQuantI8Kernel(
+    int8_t *x_packed, float *x_scale, float *x_zero, const Tdata *x,
+    int M, int K) {
+    int row = blockIdx.x;
+    size_t tid = (size_t)row * K; // 64-bit row offset: M * K may exceed INT_MAX
+
+    // ---- 1. reduce max ----
+    float local_max = op::common_cuda::reduce_op::max<BLOCK_SIZE, Tdata>(
+        x + tid, K);
+
+    __shared__ float global_max_f;
+    if (threadIdx.x == 0) {
+        global_max_f = local_max;
+    }
+    __syncthreads();
+
+    typedef cub::BlockReduce<float, BLOCK_SIZE> BlockReduce;
+    __shared__ typename BlockReduce::TempStorage temp_storage;
+
+    // ---- 2. reduce min ----
+    float thread_min = __FLT_MAX__;
+    for (int ind = threadIdx.x; ind < K; ind += BLOCK_SIZE) {
+        thread_min = fminf(thread_min, (float)x[tid + ind]);
+    }
+    float local_min = BlockReduce(temp_storage).Reduce(thread_min, cub::Min());
+
+    __shared__ float global_min_f;
+    if (threadIdx.x == 0) {
+        global_min_f = local_min;
+    }
+    __syncthreads();
+
+    float global_max = global_max_f;
+    float global_min = global_min_f;
+
+    float scale = (global_max - global_min) / 255.0f;
+    if (scale < 1e-8f) {
+        scale = 1e-8f;
+    }
+
+    float inv_scale = 1.0f / scale;
+    float zero = -global_min * inv_scale - 128.0f;
+    // Store the float values as computed: a round-trip through Tdata (half/bf16) would lose precision.
+    if (threadIdx.x == 0) {
+        x_scale[row] = scale;
+        x_zero[row] = zero;
+    }
+    for (int ind = threadIdx.x; ind < K; ind += BLOCK_SIZE) {
+        float v = (float)x[tid + ind];
+        float qf = v * inv_scale + zero;
+
+        int q = round_half_away_from_zero(qf);
+
+        if (q > 127) {
+            q = 127;
+        }
+        if (q < -128) {
+            q = -128;
+        }
+
+        x_packed[tid + ind] = (int8_t)q;
+    }
+}
+/**
+ * Symmetric int8 quantization of one row of x per thread block (the host launcher
+ * uses this for K >= 1024): scale = max(absmax / 127, 1e-8), q in [-127, 127], no zero point.
+ */
+template <typename Tdata, unsigned int BLOCK_SIZE>
+__device__ void blockPerChannelQuantI8SymKernel(
+    int8_t *x_packed, float *x_scale, const Tdata *x,
+    int M, int K) {
+    int row = blockIdx.x;
+    size_t tid = (size_t)row * K; // 64-bit row offset: M * K may exceed INT_MAX
+
+    typedef cub::BlockReduce<float, BLOCK_SIZE> BlockReduce;
+    __shared__ typename BlockReduce::TempStorage temp_storage;
+
+    // ---- 1. reduce max(|x|) over the row ----
+    float thread_max = -__FLT_MAX__;
+    for (int ind = threadIdx.x; ind < K; ind += BLOCK_SIZE) {
+        thread_max = fmaxf(thread_max, fabsf((float)x[tid + ind]));
+    }
+    float local_max = BlockReduce(temp_storage).Reduce(thread_max, cub::Max());
+
+    __shared__ float global_max_f;
+    if (threadIdx.x == 0) {
+        global_max_f = local_max;
+    }
+    __syncthreads();
+
+    float global_max = global_max_f;
+
+    float scale = global_max / 127.0f;
+    if (scale < 1e-8f) {
+        scale = 1e-8f;
+    }
+
+    float inv_scale = 1.0f / scale;
+    // Store the float scale as computed: a round-trip through Tdata (half/bf16) would lose precision.
+    if (threadIdx.x == 0) {
+        x_scale[row] = scale;
+    }
+    for (int ind = threadIdx.x; ind < K; ind += BLOCK_SIZE) {
+        float v = (float)x[tid + ind];
+        float qf = v * inv_scale;
+
+        int q = round_half_away_from_zero(qf);
+
+        if (q > 127) {
+            q = 127;
+        }
+        if (q < -127) {
+            q = -127;
+        }
+
+        x_packed[tid + ind] = (int8_t)q;
+    }
+}
+
+template <typename T>
+struct MaxOp { // binary functor returning the larger operand; used as ReductionOp in WarpAllReduce
+    __device__ __forceinline__ T operator()(const T &a, const T &b) const {
+        return max(a, b);
+    }
+};
+template <typename T>
+struct MinOp { // binary functor returning the smaller operand; used as ReductionOp in WarpAllReduce
+    __device__ __forceinline__ T operator()(const T &a, const T &b) const {
+        return min(a, b);
+    }
+};
+template <template <typename> class ReductionOp, typename T,
+          int thread_group_width> // number of lanes combined together; halved each step below
+__inline__ __device__ T WarpAllReduce(T val) { // folds val across the lane group with ReductionOp
+    for (int mask = thread_group_width / 2; mask > 0; mask /= 2) { // XOR-shuffle butterfly
+        val = ReductionOp<T>()(val, __shfl_xor_sync(0xffffffff, val, mask)); // member mask names all 32 lanes
+    }
+    return val; // after the last step every lane of the group holds the combined value
+}
+/**
+ * Asymmetric int8 quantization with one group of BLOCK_SIZE_x lanes per row of x.
+ * The host launcher selects this kernel when K < 1024.
+ * Per row: scale = max((max - min) / 255, 1e-8), zero = -min / scale - 128,
+ * q = clamp(round_half_away_from_zero(x / scale + zero), -128, 127).
+ */
+template <typename Tdata, unsigned int BLOCK_SIZE_x, unsigned int BLOCK_SIZE_y>
+__device__ void warpPerChannelQuantI8Kernel(
+    int8_t *x_packed, float *x_scale, float *x_zero, const Tdata *x,
+    int M, int K) {
+    int otherIdx = blockIdx.x * blockDim.y + threadIdx.y;
+    size_t tid = (size_t)otherIdx * K; // 64-bit row offset: M * K may exceed INT_MAX
+
+    if (otherIdx < M) {
+
+        // Rows past M skip this branch, so it must not contain __syncthreads().
+        // None is needed: WarpAllReduce leaves the reduced value in every lane
+        // of the group, so the row's min/max never has to go through shared memory.
+        float max_data = -__FLT_MAX__;
+        float min_data = __FLT_MAX__;
+
+        // ---- reduce max/min ----
+        for (int ind = threadIdx.x; ind < K; ind += BLOCK_SIZE_x) {
+            float v = (float)x[tid + ind];
+            max_data = fmaxf(max_data, v);
+            min_data = fminf(min_data, v);
+        }
+
+        max_data = WarpAllReduce<MaxOp, float, BLOCK_SIZE_x>(max_data);
+        min_data = WarpAllReduce<MinOp, float, BLOCK_SIZE_x>(min_data);
+
+        const float max_f = max_data;
+        const float min_f = min_data;
+
+        float scale = (max_f - min_f) / 255.0f;
+        if (scale < 1e-8f) {
+            scale = 1e-8f;
+        }
+
+        float inv_scale = 1.0f / scale;
+        float zero = -min_f * inv_scale - 128.0f;
+
+        // One lane per row publishes the quantization parameters.
+        if (threadIdx.x == 0) {
+            x_scale[otherIdx] = scale;
+            x_zero[otherIdx] = zero;
+        }
+
+        // Each lane quantizes a strided share of the row.
+        for (int ind = threadIdx.x; ind < K; ind += BLOCK_SIZE_x) {
+            float v = (float)x[tid + ind];
+            float qf = v * inv_scale + zero;
+
+            int q = round_half_away_from_zero(qf);
+
+            if (q > 127) {
+                q = 127;
+            }
+            if (q < -128) {
+                q = -128;
+            }
+
+            x_packed[tid + ind] = (int8_t)q;
+        }
+    }
+}
+/**
+ * Symmetric int8 quantization with one group of BLOCK_SIZE_x lanes per row of x
+ * (the host launcher uses this for K < 1024).
+ * Per row: scale = max(absmax / 127, 1e-8), q in [-127, 127]; no zero point is produced.
+ */
+template <typename Tdata, unsigned int BLOCK_SIZE_x, unsigned int BLOCK_SIZE_y>
+__device__ void warpPerChannelQuantI8SymKernel(
+    int8_t *x_packed, float *x_scale, const Tdata *x,
+    int M, int K) {
+    int otherIdx = blockIdx.x * blockDim.y + threadIdx.y;
+    size_t tid = (size_t)otherIdx * K; // 64-bit row offset: M * K may exceed INT_MAX
+
+    if (otherIdx < M) {
+
+        // Rows past M skip this branch, so it must not contain __syncthreads().
+        // None is needed: WarpAllReduce leaves the reduced value in every lane
+        // of the group, so the row's abs-max never has to go through shared memory.
+        float max_data = -__FLT_MAX__;
+
+        // ---- reduce max(|x|) ----
+        for (int ind = threadIdx.x; ind < K; ind += BLOCK_SIZE_x) {
+            float v = fabsf((float)x[tid + ind]);
+            max_data = fmaxf(max_data, v);
+        }
+
+        max_data = WarpAllReduce<MaxOp, float, BLOCK_SIZE_x>(max_data);
+
+        const float max_f = max_data;
+
+        float scale = max_f / 127.0f;
+        if (scale < 1e-8f) {
+            scale = 1e-8f;
+        }
+
+        float inv_scale = 1.0f / scale;
+
+        // One lane per row publishes the scale.
+        if (threadIdx.x == 0) {
+            x_scale[otherIdx] = scale;
+        }
+
+        for (int ind = threadIdx.x; ind < K; ind += BLOCK_SIZE_x) {
+            float v = (float)x[tid + ind];
+            float qf = v * inv_scale;
+
+            int q = round_half_away_from_zero(qf);
+
+            if (q > 127) {
+                q = 127;
+            }
+            if (q < -127) {
+                q = -127;
+            }
+
+            x_packed[tid + ind] = (int8_t)q;
+        }
+    }
+}
+
+#endif // __PERCHANNEL_QUANTINT8_KERNEL_CUH__
diff --git a/src/infiniop/ops/quant/per_channel_quant_int8/info.h b/src/infiniop/ops/quant/per_channel_quant_int8/info.h
new file mode 100644
index 000000000..6a8295753
--- /dev/null
+++ b/src/infiniop/ops/quant/per_channel_quant_int8/info.h
@@ -0,0 +1,59 @@
+#ifndef __PER_CHANNEL_QUANT_INT8_INFO_H__
+#define __PER_CHANNEL_QUANT_INT8_INFO_H__
+
+#include "../../../../utils.h"
+#include "../../../operator.h"
+#include "../../../tensor.h"
+
+namespace op::per_channel_quant_int8 {
+
+class PerChannelQuantI8Info {
+private:
+    PerChannelQuantI8Info() = default;
+
+public:
+    infiniDtype_t dtype, packed_type; // element types of x and of x_packed
+    size_t M, K;                      // x is an M x K matrix
+    // Validates the descriptors and captures the dtypes and the shape of x.
+    static utils::Result<PerChannelQuantI8Info> createPerChannelQuantI8Info(
+        infiniopTensorDescriptor_t x_packed_desc,
+        infiniopTensorDescriptor_t x_scale_desc,
+        infiniopTensorDescriptor_t x_zero_desc,
+        infiniopTensorDescriptor_t x_desc) {
+        // x_zero_desc is not required: a null x_zero selects the symmetric kernels in the NVIDIA backend.
+        CHECK_OR_RETURN(
+            x_packed_desc != nullptr && x_scale_desc != nullptr && x_desc != nullptr,
+            INFINI_STATUS_NULL_POINTER);
+
+        const infiniDtype_t dtype = x_desc->dtype();
+        const infiniDtype_t packed_type = x_packed_desc->dtype();
+
+        CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32);
+        CHECK_DTYPE(packed_type, INFINI_DTYPE_I8);
+
+        CHECK_OR_RETURN(x_desc->ndim() == 2
+                            && x_packed_desc->ndim() == 2
+                            && x_scale_desc->ndim() == 2,
+                        INFINI_STATUS_BAD_TENSOR_SHAPE);
+
+        size_t M = x_desc->dim(0);
+        size_t K = x_desc->dim(1);
+        // All constraints must hold together: x_packed is M x K and x_scale is M x 1.
+        CHECK_OR_RETURN(M == x_packed_desc->dim(0)
+                            && K == x_packed_desc->dim(1)
+                            && M == x_scale_desc->dim(0)
+                            && 1 == x_scale_desc->dim(1),
+                        INFINI_STATUS_BAD_TENSOR_SHAPE);
+
+        return utils::Result<PerChannelQuantI8Info>(PerChannelQuantI8Info{
+            dtype,
+            packed_type,
+            M,
+            K,
+        });
+    }
+};
+
+} // namespace op::per_channel_quant_int8
+
+#endif //  __PER_CHANNEL_QUANT_INT8_INFO_H__
diff --git a/src/infiniop/ops/quant/per_channel_quant_int8/nvidia/per_channel_quant_int8_nvidia.cu b/src/infiniop/ops/quant/per_channel_quant_int8/nvidia/per_channel_quant_int8_nvidia.cu
new file mode 100644
index 000000000..24fa80c6e
--- /dev/null
+++ b/src/infiniop/ops/quant/per_channel_quant_int8/nvidia/per_channel_quant_int8_nvidia.cu
@@ -0,0 +1,118 @@
+#include "../../../../devices/nvidia/nvidia_common.cuh"
+#include "per_channel_quant_int8_nvidia.cuh"
+
+#include "../../../../devices/nvidia/nvidia_kernel_common.cuh"
+#include "../../../../reduce/cuda/reduce.cuh"
+#include <cub/block/block_reduce.cuh>
+
+#include "../cuda/kernel.cuh"
+
+template <typename Tdata, unsigned int BLOCK_SIZE>
+INFINIOP_CUDA_KERNEL blockPerChannelQuantI8( // kernel entry: asymmetric variant, launched with one block per row
+    int8_t *x_packed, float *x_scale, float *x_zero, const Tdata *x, int M, int K) {
+    blockPerChannelQuantI8Kernel<Tdata, BLOCK_SIZE>(x_packed, x_scale, x_zero, x, M, K); // implementation is in ../cuda/kernel.cuh
+}
+template <typename Tdata, unsigned int BLOCK_SIZE>
+INFINIOP_CUDA_KERNEL blockPerChannelQuantI8Sym( // kernel entry: symmetric variant (no x_zero), one block per row
+    int8_t *x_packed, float *x_scale, const Tdata *x, int M, int K) {
+    blockPerChannelQuantI8SymKernel<Tdata, BLOCK_SIZE>(x_packed, x_scale, x, M, K); // implementation is in ../cuda/kernel.cuh
+}
+
+template <typename Tdata, unsigned int BLOCK_SIZE_x, unsigned int BLOCK_SIZE_y>
+INFINIOP_CUDA_KERNEL warpPerChannelQuantI8( // kernel entry: asymmetric variant, BLOCK_SIZE_y rows per block
+    int8_t *x_packed, float *x_scale, float *x_zero, const Tdata *x, int M, int K) {
+    warpPerChannelQuantI8Kernel<Tdata, BLOCK_SIZE_x, BLOCK_SIZE_y>(x_packed, x_scale, x_zero, x, M, K); // implementation is in ../cuda/kernel.cuh
+}
+template <typename Tdata, unsigned int BLOCK_SIZE_x, unsigned int BLOCK_SIZE_y>
+INFINIOP_CUDA_KERNEL warpPerChannelQuantI8Sym( // kernel entry: symmetric variant (no x_zero), BLOCK_SIZE_y rows per block
+    int8_t *x_packed, float *x_scale, const Tdata *x, int M, int K) {
+    warpPerChannelQuantI8SymKernel<Tdata, BLOCK_SIZE_x, BLOCK_SIZE_y>(x_packed, x_scale, x, M, K); // implementation is in ../cuda/kernel.cuh
+}
+
+namespace op::per_channel_quant_int8::nvidia {
+
+struct Descriptor::Opaque { // backend-private state owned by the Descriptor
+    std::shared_ptr<device::nvidia::Handle::Internal> internal; // handle internals; calculate() queries maxThreadsPerBlock() on it
+};
+
+Descriptor::~Descriptor() { // frees the Opaque allocated in create()
+    delete _opaque;
+}
+// Validates the tensor descriptors and allocates a Descriptor in *desc_ptr; this operator needs no workspace.
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle, Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t x_packed_desc,
+    infiniopTensorDescriptor_t x_scale_desc,
+    infiniopTensorDescriptor_t x_zero_desc,
+    infiniopTensorDescriptor_t x_desc) {
+    auto info = PerChannelQuantI8Info::createPerChannelQuantI8Info(x_packed_desc, x_scale_desc, x_zero_desc, x_desc);
+    CHECK_RESULT(info); // propagate dtype / shape / null-pointer validation failures
+
+    *desc_ptr = new Descriptor(
+        new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
+        info.take(), 0, handle->device, handle->device_id); // 0 = workspace size
+    return INFINI_STATUS_SUCCESS;
+}
+// Host-side launcher: picks the block- or warp-per-row kernel from K, and the symmetric variant when x_zero is null.
+template <unsigned int BLOCK_SIZE, typename Tdata>
+infiniStatus_t per_channel_quant_int8Kernel(const PerChannelQuantI8Info &info, int8_t *x_packed, float *x_scale, float *x_zero, const Tdata *x, cudaStream_t stream) {
+    int M = (int)info.M;
+    int K = (int)info.K;
+    // Long rows: one block of BLOCK_SIZE threads per row. Short rows: 32 x 32 blocks, one 32-lane group per row.
+    if (K >= 1024) {
+        if (x_zero == nullptr) {
+            blockPerChannelQuantI8Sym<Tdata, BLOCK_SIZE>
+                <<<M, BLOCK_SIZE, 0, stream>>>(x_packed, x_scale, x, M, K);
+        } else {
+            blockPerChannelQuantI8<Tdata, BLOCK_SIZE>
+                <<<M, BLOCK_SIZE, 0, stream>>>(x_packed, x_scale, x_zero, x, M, K);
+        }
+
+    } else {
+        constexpr unsigned int BLOCK_SIZE_x = 32; // lanes cooperating on one row
+        constexpr unsigned int BLOCK_SIZE_y = 32; // rows handled per block
+        int num_block_x = (M + BLOCK_SIZE_y - 1) / BLOCK_SIZE_y; // ceil(M / BLOCK_SIZE_y)
+        dim3 block_dim(BLOCK_SIZE_x, BLOCK_SIZE_y, 1);
+        dim3 grid_dim(num_block_x, 1, 1);
+        if (x_zero == nullptr) {
+            warpPerChannelQuantI8Sym<Tdata, BLOCK_SIZE_x, BLOCK_SIZE_y>
+                <<<grid_dim, block_dim, 0, stream>>>(x_packed, x_scale, x, M, K);
+        } else {
+            warpPerChannelQuantI8<Tdata, BLOCK_SIZE_x, BLOCK_SIZE_y>
+                <<<grid_dim, block_dim, 0, stream>>>(x_packed, x_scale, x_zero, x, M, K);
+        }
+    }
+
+    return INFINI_STATUS_SUCCESS; // NOTE(review): launch errors are not checked here
+}
+// Runs the quantization on the given stream; dispatches on the device's max threads per block, then on the dtype of x.
+infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
+                                     void *x_packed, void *x_scale, void *x_zero, const void *x,
+                                     void *stream_) const { // workspace and workspace_size are not used
+    cudaStream_t stream = (cudaStream_t)stream_;
+#define QUANT(BLOCK_SIZE, TDATA) \
+    per_channel_quant_int8Kernel<BLOCK_SIZE, TDATA>(_info, (int8_t *)x_packed, (float *)x_scale, (float *)x_zero, (const TDATA *)x, stream)
+#define QUANT_WITH_BLOCK_SIZE(BLOCK_SIZE)            \
+    {                                                \
+        if (_info.dtype == INFINI_DTYPE_F16)         \
+            return QUANT(BLOCK_SIZE, half);          \
+        else if (_info.dtype == INFINI_DTYPE_F32)    \
+            return QUANT(BLOCK_SIZE, float);         \
+        else if (_info.dtype == INFINI_DTYPE_BF16)   \
+            return QUANT(BLOCK_SIZE, __nv_bfloat16); \
+        else                                         \
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;   \
+    }
+    if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { // BLOCK_SIZE is a template argument, hence the ladder
+        QUANT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024)
+    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) {
+        QUANT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512)
+    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
+        QUANT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096)
+    } else {
+        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
+    }
+    return INFINI_STATUS_SUCCESS; // not reached: every branch above returns
+}
+
+} // namespace op::per_channel_quant_int8::nvidia
diff --git a/src/infiniop/ops/quant/per_channel_quant_int8/nvidia/per_channel_quant_int8_nvidia.cuh b/src/infiniop/ops/quant/per_channel_quant_int8/nvidia/per_channel_quant_int8_nvidia.cuh
new file mode 100644
index 000000000..9a7a2872d
--- /dev/null
+++ b/src/infiniop/ops/quant/per_channel_quant_int8/nvidia/per_channel_quant_int8_nvidia.cuh
@@ -0,0 +1,7 @@
+#ifndef __PER_CHANNEL_QUANT_INT8_NVIDIA_API_H__
+#define __PER_CHANNEL_QUANT_INT8_NVIDIA_API_H__
+#include "../per_channel_quant_int8.h"
+
+DESCRIPTOR(nvidia)
+
+#endif // __PER_CHANNEL_QUANT_INT8_NVIDIA_API_H__
diff --git a/src/infiniop/ops/quant/per_channel_quant_int8/operator.cc b/src/infiniop/ops/quant/per_channel_quant_int8/operator.cc
new file mode 100644
index 000000000..dade91c88
--- /dev/null
+++ b/src/infiniop/ops/quant/per_channel_quant_int8/operator.cc
@@ -0,0 +1,98 @@
+#include "../../../operator.h"
+#include "../../../handle.h"
+#include "infiniop/ops/quant/per_channel_quant_int8.h"
+
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
+#include "nvidia/per_channel_quant_int8_nvidia.cuh"
+#endif
+// C API: creates the backend descriptor matching handle->device.
+__C infiniStatus_t infiniopCreatePerChannelQuantI8Descriptor(infiniopHandle_t handle,
+                                                             infiniopPerChannelQuantI8Descriptor_t *desc_ptr,
+                                                             infiniopTensorDescriptor_t x_packed_desc,
+                                                             infiniopTensorDescriptor_t x_scale_desc,
+                                                             infiniopTensorDescriptor_t x_zero_desc,
+                                                             infiniopTensorDescriptor_t x_desc) {
+#define CREATE(CASE, NAMESPACE)                                                               \
+    case CASE:                                                                                \
+        return op::per_channel_quant_int8::NAMESPACE::Descriptor::create(                     \
+            handle,                                                                           \
+            reinterpret_cast<op::per_channel_quant_int8::NAMESPACE::Descriptor **>(desc_ptr), \
+            x_packed_desc,                                                                    \
+            x_scale_desc,                                                                     \
+            x_zero_desc,                                                                      \
+            x_desc);
+    switch (handle->device) { // one case per backend compiled in
+#ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia)
+#endif
+#ifdef ENABLE_QY_API
+        CREATE(INFINI_DEVICE_QY, nvidia) // QY devices reuse the nvidia implementation
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef CREATE
+}
+// C API: writes the descriptor's minimum workspace size to *size.
+__C infiniStatus_t infiniopGetPerChannelQuantI8WorkspaceSize(infiniopPerChannelQuantI8Descriptor_t desc, size_t *size) {
+    switch (desc->device_type) { // one case per backend compiled in
+#define GET(CASE, NAMESPACE)                                                                                     \
+    case CASE:                                                                                                   \
+        *size = reinterpret_cast<op::per_channel_quant_int8::NAMESPACE::Descriptor *>(desc)->minWorkspaceSize(); \
+        return INFINI_STATUS_SUCCESS;
+#ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia)
+#endif
+#ifdef ENABLE_QY_API
+        GET(INFINI_DEVICE_QY, nvidia) // QY devices reuse the nvidia implementation
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef GET
+}
+// C API: runs the quantization by forwarding every argument to the backend descriptor's calculate().
+__C infiniStatus_t infiniopPerChannelQuantI8(infiniopPerChannelQuantI8Descriptor_t desc,
+                                             void *workspace,
+                                             size_t workspace_size,
+                                             void *x_packed,
+                                             void *x_scale,
+                                             void *x_zero,
+                                             const void *x,
+                                             void *stream) {
+#define QUANT(CASE, NAMESPACE)                                                                         \
+    case CASE:                                                                                         \
+        return reinterpret_cast<op::per_channel_quant_int8::NAMESPACE::Descriptor *>(desc)->calculate( \
+            workspace, workspace_size, x_packed, x_scale, x_zero, x, stream);
+
+    switch (desc->device_type) { // one case per backend compiled in
+#ifdef ENABLE_NVIDIA_API
+        QUANT(INFINI_DEVICE_NVIDIA, nvidia)
+#endif
+#ifdef ENABLE_QY_API
+        QUANT(INFINI_DEVICE_QY, nvidia) // QY devices reuse the nvidia implementation
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef QUANT
+}
+// C API: deletes the descriptor through its concrete backend type.
+__C infiniStatus_t infiniopDestroyPerChannelQuantI8Descriptor(infiniopPerChannelQuantI8Descriptor_t desc) {
+#define DESTROY(CASE, NAMESPACE)                                                            \
+    case CASE:                                                                              \
+        delete reinterpret_cast<op::per_channel_quant_int8::NAMESPACE::Descriptor *>(desc); \
+        return INFINI_STATUS_SUCCESS;
+
+    switch (desc->device_type) { // one case per backend compiled in
+#ifdef ENABLE_NVIDIA_API
+        DESTROY(INFINI_DEVICE_NVIDIA, nvidia)
+#endif
+#ifdef ENABLE_QY_API
+        DESTROY(INFINI_DEVICE_QY, nvidia) // QY devices reuse the nvidia implementation
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef DESTROY
+}
diff --git a/src/infiniop/ops/quant/per_channel_quant_int8/per_channel_quant_int8.h b/src/infiniop/ops/quant/per_channel_quant_int8/per_channel_quant_int8.h
new file mode 100644
index 000000000..4d1675c8c
--- /dev/null
+++ b/src/infiniop/ops/quant/per_channel_quant_int8/per_channel_quant_int8.h
@@ -0,0 +1,40 @@
+#ifndef __PER_CHANNEL_QUANT_INT8_H__
+#define __PER_CHANNEL_QUANT_INT8_H__
+
+#include "../../../operator.h"
+#include "info.h"
+// Declares op::per_channel_quant_int8::NAMESPACE::Descriptor for one backend; the backend source defines Opaque and the methods.
+#define DESCRIPTOR(NAMESPACE)                                                                \
+                                                                                             \
+    namespace op::per_channel_quant_int8::NAMESPACE {                                        \
+    class Descriptor final : public InfiniopDescriptor {                                     \
+        struct Opaque;                                                                       \
+        Opaque *_opaque;                                                                     \
+        PerChannelQuantI8Info _info;                                                         \
+        size_t _workspace_size;                                                              \
+                                                                                             \
+        Descriptor(Opaque *opaque, PerChannelQuantI8Info info,                               \
+                   size_t workspace_size,                                                    \
+                   infiniDevice_t device_type, int device_id)                                \
+            : InfiniopDescriptor{device_type, device_id},                                    \
+              _opaque(opaque), _info(info), _workspace_size(workspace_size) {}               \
+                                                                                             \
+    public:                                                                                  \
+        ~Descriptor();                                                                       \
+                                                                                             \
+        size_t minWorkspaceSize() const { return _workspace_size; }                          \
+                                                                                             \
+        static infiniStatus_t create(                                                        \
+            infiniopHandle_t handle, Descriptor **desc_ptr,                                  \
+            infiniopTensorDescriptor_t x_packed_desc,                                        \
+            infiniopTensorDescriptor_t x_scale_desc,                                         \
+            infiniopTensorDescriptor_t x_zero_desc,                                          \
+            infiniopTensorDescriptor_t x_desc);                                              \
+                                                                                             \
+        infiniStatus_t calculate(                                                            \
+            void *workspace, size_t workspace_size,                                          \
+            void *x_packed, void *x_scale, void *x_zero, const void *x, void *stream) const; \
+    };                                                                                       \
+    }
+
+#endif // __PER_CHANNEL_QUANT_INT8_H__
diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py
index 89f0dacd3..bdc61d33a 100644
--- a/test/infiniop/libinfiniop/op_register.py
+++ b/test/infiniop/libinfiniop/op_register.py
@@ -686,6 +686,42 @@ def dequantize_(lib):
     ]
 
 
+@OpRegister.operator
+def per_channel_quant_int8_(lib):  # declares ctypes signatures for the PerChannelQuantI8 C API
+    lib.infiniopCreatePerChannelQuantI8Descriptor.restype = c_int32
+    lib.infiniopCreatePerChannelQuantI8Descriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,  # x_packed_desc
+        infiniopTensorDescriptor_t,  # x_scale_desc
+        infiniopTensorDescriptor_t,  # x_zero_desc
+        infiniopTensorDescriptor_t,  # x_desc
+    ]
+
+    lib.infiniopGetPerChannelQuantI8WorkspaceSize.restype = c_int32
+    lib.infiniopGetPerChannelQuantI8WorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+
+    lib.infiniopPerChannelQuantI8.restype = c_int32
+    lib.infiniopPerChannelQuantI8.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,  # workspace
+        c_size_t,  # workspace_size
+        c_void_p,  # x_packed
+        c_void_p,  # x_scale
+        c_void_p,  # x_zero
+        c_void_p,  # x
+        c_void_p,  # stream
+    ]
+
+    lib.infiniopDestroyPerChannelQuantI8Descriptor.restype = c_int32
+    lib.infiniopDestroyPerChannelQuantI8Descriptor.argtypes = [
+        infiniopOperatorDescriptor_t,
+    ]
+
+
 @OpRegister.operator
 def softplus_(lib):
     lib.infiniopCreateSoftplusDescriptor.restype = c_int32
@@ -939,6 +975,7 @@ def tanh_(lib):
         infiniopOperatorDescriptor_t,
     ]
 
+
 @OpRegister.operator
 def scaled_mm_int8_(lib):
     lib.infiniopCreateI8GemmDescriptor.restype = c_int32
diff --git a/test/infiniop/per_channel_quant_int8.py b/test/infiniop/per_channel_quant_int8.py
new file mode 100644
index 000000000..55f493f1c
--- /dev/null
+++ b/test/infiniop/per_channel_quant_int8.py
@@ -0,0 +1,193 @@
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    get_tolerance,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+from enum import Enum, auto
+
+# ==============================================================================
+#  Configuration (Internal Use Only)
+# ==============================================================================
+# These are not meant to be imported from other modules
+_TEST_CASES = [
+    # x_shape, symmetric
+    ((8, 8), True),
+    ((128, 512), True),
+    ((128, 128), True),
+    ((256, 1024), False),
+    ((256, 2048), True),
+    ((1024, 2048), False),
+]
+
+
+# Data types used for testing
+_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32]
+
+# Tolerances keyed by input dtype; test() applies them to the x_scale and x_zero outputs
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 5e-2},
+    InfiniDtype.BF16: {"atol": 1e-3, "rtol": 5e-2},
+    InfiniDtype.F32: {"atol": 3e-5, "rtol": 5e-3},
+}
+
+DEBUG = False  # overridden from args.debug in __main__
+PROFILE = False  # overridden from args.profile in __main__
+NUM_PRERUN = 10  # warm-up count handed to profile_operation
+NUM_ITERATIONS = 1000  # iteration count handed to profile_operation
+
+
+def per_token_quant_int8_torch(x, symmetric):
+    """Reference int8 quantization of `x` along its last dimension.
+
+    Args:
+        x: input tensor; it is converted to float32 before quantizing.
+        symmetric: True scales each row by its absolute maximum (no zero
+            point); False maps each row's [min, max] range onto [-128, 127].
+
+    Returns:
+        (quantized int8 tensor, scale, zero point); the zero point is None
+        in the symmetric case. Scale and zero point keep a trailing size-1 dim.
+    """
+    values = x.float()
+    if not symmetric:
+        # Asymmetric: affine mapping with a per-row zero point.
+        row_min = values.min(dim=-1, keepdim=True)[0]
+        row_max = values.max(dim=-1, keepdim=True)[0]
+        # Keep the step strictly positive so constant rows do not divide by zero.
+        step = torch.clamp((row_max - row_min) / 255.0, min=1e-8)
+        offset = -row_min / step - 128.0
+        quantized = torch.clamp(torch.round(values / step + offset), -128, 127)
+        return quantized.to(torch.int8), step, offset
+
+    # Clamp the row maximum away from zero so all-zero rows stay finite.
+    peak = values.abs().max(dim=-1).values.clamp_min(1e-10).unsqueeze(-1)
+    levels = torch.round(values.mul(127 / peak)).to(torch.int8)
+    return levels, peak / 127, None
+
+
+def test(
+    handle,
+    device,
+    x_shape,
+    symmetric,
+    dtype=InfiniDtype.F16,
+    sync=None,
+):
+    """Run PerChannelQuantI8 on one case and compare it with the torch reference.
+
+    With `symmetric` set, no zero-point tensor is created and null is passed for
+    `x_zero`; otherwise an (M, 1) float32 zero-point output is allocated and checked.
+    `sync`, when given, is called before descriptor creation and after the launch.
+    Fails with AssertionError when an output differs from the reference beyond tolerance.
+    """
+    print(
+        f"Testing Per Channel Quant Int8 on {InfiniDeviceNames[device]} with x_shape:{x_shape}, symmetric:{symmetric} , dtype:{InfiniDtypeNames[dtype]}"
+    )
+    M, _ = x_shape  # unpacking also rejects shapes that are not 2-D
+
+    x = TestTensor(x_shape, None, dtype, device)
+    x_p, x_s, x_z = per_token_quant_int8_torch(x.torch_tensor(), symmetric)
+    x_packed = TestTensor(x_shape, None, InfiniDtype.I8, device, mode="zeros")
+    x_scale = TestTensor((M, 1), None, InfiniDtype.F32, device)
+    # The zero-point output only exists for asymmetric quantization.
+    x_zero = None if symmetric else TestTensor((M, 1), None, InfiniDtype.F32, device)
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreatePerChannelQuantI8Descriptor(
+            handle,
+            ctypes.byref(descriptor),
+            x_packed.descriptor,
+            x_scale.descriptor,
+            None if x_zero is None else x_zero.descriptor,
+            x.descriptor,
+        )
+    )
+
+    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
+    x_packed.destroy_desc()
+    x_scale.destroy_desc()
+    if x_zero is not None:
+        x_zero.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetPerChannelQuantI8WorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, x.device)
+
+    def lib_per_channel_quant_int8():
+        check_error(
+            LIBINFINIOP.infiniopPerChannelQuantI8(
+                descriptor,
+                workspace.data(),
+                workspace_size.value,
+                x_packed.data(),
+                x_scale.data(),
+                None if x_zero is None else x_zero.data(),
+                x.data(),
+                None,
+            )
+        )
+
+    # Run once for the correctness check; the profiling step reuses the same closure.
+    lib_per_channel_quant_int8()
+
+    if sync is not None:
+        sync()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(x_packed.actual_tensor(), x_p, atol=atol, rtol=rtol)
+        debug(x_scale.actual_tensor(), x_s, atol=atol, rtol=rtol)
+        if x_zero is not None:
+            debug(x_zero.actual_tensor(), x_z, atol=atol, rtol=rtol)
+
+    # One assert per output so a failure names the tensor that diverged.
+    # NOTE(review): rtol=2 tolerates an error of 2 + 2*|expected| on the int8 output; confirm this is intended.
+    assert torch.allclose(x_packed.actual_tensor(), x_p, atol=2, rtol=2)
+    assert torch.allclose(x_scale.actual_tensor(), x_s, atol=atol, rtol=rtol)
+    if x_zero is not None:
+        assert torch.allclose(x_zero.actual_tensor(), x_z, atol=atol, rtol=rtol)
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: per_token_quant_int8_torch(x.torch_tensor(), symmetric), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_per_channel_quant_int8(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+
+    check_error(LIBINFINIOP.infiniopDestroyPerChannelQuantI8Descriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Override the module-level switches read by test() with the command-line options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")
diff --git a/test/infiniop/w8a8int8.py b/test/infiniop/w8a8int8.py
new file mode 100644
index 000000000..c35ab9c09
--- /dev/null
+++ b/test/infiniop/w8a8int8.py
@@ -0,0 +1,348 @@
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    get_tolerance,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+from enum import Enum, auto
+
+# ==============================================================================
+#  Configuration (Internal Use Only)
+# ==============================================================================
+# These are not meant to be imported from other modules
+_TEST_CASES_ = [
+    # x_shape (M, K), w_shape (K, N), sym, y_shape (M, N)
+    ((128, 512), (512, 1024), True, (128, 1024)),
+    ((5000, 5120), (5120, 8192), True, (5000, 8192)),
+    ((2000, 5120), (5120, 8192), True, (2000, 8192)),
+    ((1000, 5120), (5120, 8192), True, (1000, 8192)),
+    ((2048, 4096), (4096, 2048), True, (2048, 2048)),
+    ((4096, 4096), (4096, 4096), True, (4096, 4096)),
+    ((2560, 10240), (10240, 20480), True, (2560, 20480)),
+    ((1024, 2048), (2048, 1024), True, (1024, 1024)),
+]
+
+
+class Inplace(Enum):  # in-place / out-of-place tag appended to each test case
+    OUT_OF_PLACE = auto()
+    INPLACE = auto()
+
+
+# Inplace options applied for each test case in _TEST_CASES_
+_INPLACE = [
+    Inplace.INPLACE,
+]
+
+_TEST_CASES = [  # every base case crossed with every _INPLACE option, appended as the last field
+    test_case + (inplace_item,)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+]
+
+# Data types used for testing
+_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16]
+
+# Tolerance map for different data types (test() only forwards it to debug(); nothing asserts with it)
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 3e-1, "rtol": 1e-2},
+    InfiniDtype.BF16: {"atol": 3e-1, "rtol": 1e-2},
+}
+
+DEBUG = False  # overridden from args.debug in __main__
+PROFILE = False  # overridden from args.profile in __main__
+NUM_PRERUN = 10  # warm-up iterations used when profiling
+NUM_ITERATIONS = 1000  # timed iterations used when profiling
+
+
+def mm(x, w, bias, out_dtype):  # reference linear layer: x @ w + bias, cast to out_dtype
+    return (torch.matmul(x, w) + bias).to(out_dtype)
+
+
+def scaled_mm(x, w_p, w_s, bias, out_dtype):
+    """Float32 reference: (x @ w_p) scaled per output column by w_s, plus bias."""
+    product = torch.matmul(x.to(torch.float32), w_p.to(torch.float32))
+    scaled = product * w_s.view(1, -1)
+    return (scaled + bias).to(out_dtype)
+
+
+def to_int8(tensor: torch.Tensor) -> torch.Tensor:  # saturate to [-128, 127], round, cast
+    return tensor.clamp(min=-128, max=127).round().to(dtype=torch.int8)
+
+
+def torch_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias):
+    """Reference scaled GEMM: (a @ b) * scale_a per row * scale_b per column, plus optional bias."""
+    acc = torch.matmul(a.to(torch.float32), b.to(torch.float32))
+    result = acc * scale_a.view(-1, 1) * scale_b.view(1, -1)
+    if bias is not None:
+        result = result + bias
+    return result.to(out_dtype)
+
+
+def per_token_quant_int8_torch(x):
+    """Symmetric int8 reference quantization along the last dim; returns (x_q, scale_x)."""
+    as_float = x.float()
+
+    # Clamp the row maximum away from zero so all-zero rows stay finite.
+    row_absmax = as_float.abs().max(dim=-1).values.clamp_min(1e-10).unsqueeze(-1)
+    quantized = torch.round(as_float.mul(127 / row_absmax)).to(torch.int8)
+
+    return quantized, row_absmax / 127
+
+
+def test(
+    handle,
+    device,
+    x_shape,
+    w_shape,
+    symmetric,
+    y_shape,
+    inplace=Inplace.OUT_OF_PLACE,
+    dtype=InfiniDtype.BF16,
+    sync=None,
+):
+    """Quantize activations with PerChannelQuantI8, then feed them to the int8 GEMM.
+
+    The parameters after `device` follow one `_TEST_CASES` tuple
+    (x_shape, w_shape, symmetric, y_shape, inplace); dtype and sync are presumably
+    appended by `test_operator`. Earlier `alpha`/`beta` parameters sat between
+    `y_shape` and `inplace` without matching tuple fields, which shifted every later
+    argument by two, so they were removed.
+
+    No accuracy assertion is made (see the note before the profiling helper); the
+    float reference is only shown through `debug` when DEBUG is set.
+    """
+    print(
+        f"Testing Linear on {InfiniDeviceNames[device]} with x_shape:{x_shape}, w_shape:{w_shape}, symmetric:{symmetric}, inplace:{inplace} dtype:{InfiniDtypeNames[dtype]}"
+    )
+    M, K = x_shape
+    N = w_shape[1]
+    # torch dtype matching the dtype under test: F16 maps to float16, anything else to bfloat16.
+    torch_dtype = torch.float16 if dtype == InfiniDtype.F16 else torch.bfloat16
+
+    # Weights are drawn as (N, K) and transposed to (K, N); their strides are forwarded below.
+    weights_packed = to_int8(torch.randn((N, K), device="cuda").t() * 5)
+    weights_scale = torch.randn((N,), device="cuda", dtype=torch.float32)
+    bias = torch.randn((N,), device="cuda", dtype=torch_dtype) * 10
+
+    x = TestTensor(x_shape, None, dtype, device)
+    x_packed = TestTensor(x_shape, None, InfiniDtype.I8, device, mode="zeros")
+    x_scale = TestTensor((M, 1), None, InfiniDtype.F32, device)
+
+    weights_packed = TestTensor(
+        (K, N),
+        weights_packed.stride(),
+        InfiniDtype.I8,
+        device,
+        mode="manual",
+        set_tensor=weights_packed,
+    )
+    weights_scale = TestTensor(
+        (N,),
+        weights_scale.stride(),
+        InfiniDtype.F32,
+        device,
+        mode="manual",
+        set_tensor=weights_scale,
+    )
+
+    weights = weights_packed.torch_tensor() * weights_scale.torch_tensor().view(1, -1)
+
+    y = TestTensor(y_shape, None, dtype, device)
+    bias = TestTensor(
+        (N,), bias.stride(), dtype, device, mode="manual", set_tensor=bias
+    )
+
+    x_mm = x.torch_tensor().to(torch_dtype)
+    w_mm = weights.to(torch_dtype)
+
+    quant_descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreatePerChannelQuantI8Descriptor(
+            handle,
+            ctypes.byref(quant_descriptor),
+            x_packed.descriptor,
+            x_scale.descriptor,
+            None,
+            x.descriptor,
+        )
+    )
+
+    quant_workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetPerChannelQuantI8WorkspaceSize(
+            quant_descriptor, ctypes.byref(quant_workspace_size)
+        )
+    )
+    quant_workspace = TestWorkspace(quant_workspace_size.value, x.device)
+
+    def lib_per_channel_quant_int8():
+        check_error(
+            LIBINFINIOP.infiniopPerChannelQuantI8(
+                quant_descriptor,
+                quant_workspace.data(),
+                quant_workspace_size.value,
+                x_packed.data(),
+                x_scale.data(),
+                None,
+                x.data(),
+                None,
+            )
+        )
+
+    scaled_mm_descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateI8GemmDescriptor(
+            handle,
+            ctypes.byref(scaled_mm_descriptor),
+            y.descriptor,
+            bias.descriptor,
+            x_packed.descriptor,
+            x_scale.descriptor,
+            weights_packed.descriptor,
+            weights_scale.descriptor,
+        )
+    )
+
+    scaled_mm_workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetI8GemmWorkspaceSize(
+            scaled_mm_descriptor, ctypes.byref(scaled_mm_workspace_size)
+        )
+    )
+    scaled_mm_workspace = TestWorkspace(scaled_mm_workspace_size.value, x_packed.device)
+
+    def lib_linear():
+        check_error(
+            LIBINFINIOP.infiniopI8Gemm(
+                scaled_mm_descriptor,
+                scaled_mm_workspace.data(),
+                scaled_mm_workspace_size.value,
+                y.data(),
+                bias.data(),
+                x_packed.data(),
+                x_scale.data(),
+                weights_packed.data(),
+                weights_scale.data(),
+                None,
+            )
+        )
+
+    def lib_w8a8int8_linearFunction():
+        lib_per_channel_quant_int8()
+        lib_linear()
+
+    def lib_torch_mm():
+        mm(x_mm, w_mm, bias.torch_tensor(), out_dtype=torch_dtype)
+
+    x_p, x_s = per_token_quant_int8_torch(x.torch_tensor())
+    lib_w8a8int8_linearFunction()
+
+    # Reference results; with the accuracy check disabled only mm_torch is consumed (by debug).
+    scaled_mm_torch = torch_scaled_mm(
+        x_p,
+        weights_packed.torch_tensor(),
+        x_s,
+        weights_scale.torch_tensor(),
+        torch_dtype,
+        bias=bias.torch_tensor(),
+    )
+    mm_torch = scaled_mm(
+        x.torch_tensor(),
+        weights_packed.torch_tensor(),
+        weights_scale.torch_tensor(),
+        bias.torch_tensor(),
+        out_dtype=torch_dtype,
+    )
+
+    if sync is not None:
+        sync()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(y.actual_tensor(), mm_torch, atol=atol, rtol=rtol)
+
+    # The quantization test did not normalize the test data, leading to large errors; the error check has been temporarily removed.
+
+    def profile_operation(name, func, device, num_prerun, num_iterations):
+        """Time `func` with CUDA events and print the mean latency per iteration."""
+        # Warm up
+        for _ in range(num_prerun):
+            func()
+
+        torch.cuda.synchronize()
+        start = torch.cuda.Event(enable_timing=True)
+        end = torch.cuda.Event(enable_timing=True)
+
+        start.record()
+        for _ in range(num_iterations):
+            func()
+        end.record()
+
+        torch.cuda.synchronize()
+        elapsed = start.elapsed_time(end)
+        print(
+            f"{name} took {elapsed / num_iterations:.6f} ms over {num_iterations} iterations"
+        )
+
+    # Profiling workflow
+    if PROFILE:
+        profile_operation(
+            "PyTorch mm       ",
+            lib_torch_mm,
+            device,
+            NUM_PRERUN,
+            NUM_ITERATIONS,
+        )
+        profile_operation(
+            "lib total        ",
+            lib_w8a8int8_linearFunction,
+            device,
+            NUM_PRERUN,
+            NUM_ITERATIONS,
+        )
+        profile_operation(
+            "lib quant        ",
+            lib_per_channel_quant_int8,
+            device,
+            NUM_PRERUN,
+            NUM_ITERATIONS,
+        )
+        profile_operation(
+            "lib scaled mm    ",
+            lib_linear,
+            device,
+            NUM_PRERUN,
+            NUM_ITERATIONS,
+        )
+
+    check_error(LIBINFINIOP.infiniopDestroyI8GemmDescriptor(scaled_mm_descriptor))
+    check_error(
+        LIBINFINIOP.infiniopDestroyPerChannelQuantI8Descriptor(quant_descriptor)
+    )
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Override the module-level switches read by test() with the command-line options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+    # NOTE: test() makes no accuracy assertion, so this only reports that no library call failed
+    print("\033[92mTest passed!\033[0m")
diff --git a/xmake.lua b/xmake.lua
index 0a5b2d473..9fe77add5 100644
--- a/xmake.lua
+++ b/xmake.lua
@@ -289,6 +289,7 @@ target("infiniop")
     if has_config("qy-gpu") then
         add_deps("infiniop-qy")
         add_files("build/.objs/infiniop-qy/rules/qy.cuda/src/infiniop/ops/*/nvidia/*.cu.o", {public = true})
+        add_files("build/.objs/infiniop-qy/rules/qy.cuda/src/infiniop/ops/*/*/nvidia/*.cu.o", {public = true})
         add_files("build/.objs/infiniop-qy/rules/qy.cuda/src/infiniop/devices/nvidia/*.cu.o", {public = true})
     end
 
@@ -312,7 +313,7 @@ target("infiniop")
     end
     set_languages("cxx17")
     add_files("src/infiniop/devices/handle.cc")
-    add_files("src/infiniop/ops/*/operator.cc")
+    add_files("src/infiniop/ops/*/operator.cc", "src/infiniop/ops/*/*/operator.cc")
     add_files("src/infiniop/*.cc")
 
     set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini"))
diff --git a/xmake/nvidia.lua b/xmake/nvidia.lua
index db575969b..635ee96d1 100644
--- a/xmake/nvidia.lua
+++ b/xmake/nvidia.lua
@@ -69,7 +69,7 @@ target("infiniop-nvidia")
     end
 
     set_languages("cxx17")
-    add_files("../src/infiniop/devices/nvidia/*.cu", "../src/infiniop/ops/*/nvidia/*.cu")
+    add_files("../src/infiniop/devices/nvidia/*.cu", "../src/infiniop/ops/*/nvidia/*.cu", "../src/infiniop/ops/*/*/nvidia/*.cu")
 
     if has_config("ninetoothed") then
         add_files("../build/ninetoothed/*.c")
diff --git a/xmake/qy.lua b/xmake/qy.lua
index ecef359a8..bbaa37e0a 100644
--- a/xmake/qy.lua
+++ b/xmake/qy.lua
@@ -98,7 +98,7 @@ target("infiniop-qy")
     add_cuflags("-Xcompiler=-Wno-error=deprecated-declarations")
 
     set_languages("cxx17")
-    add_files("../src/infiniop/devices/nvidia/*.cu", "../src/infiniop/ops/*/nvidia/*.cu")
+    add_files("../src/infiniop/devices/nvidia/*.cu", "../src/infiniop/ops/*/nvidia/*.cu", "../src/infiniop/ops/*/*/nvidia/*.cu")
 
     if has_config("ninetoothed") then
         add_files("../build/ninetoothed/*.c")