From 05aafb21cc9c18e0d82829018cbdaa8224f7d22e Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Fri, 26 Dec 2025 13:12:26 +0800 Subject: [PATCH] =?UTF-8?q?issue/843:=20=E5=A2=9E=E5=8A=A0per=5Fchannel=5F?= =?UTF-8?q?quant=5Fint8=E7=AE=97=E5=AD=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ops/quant/per_channel_quant_int8.h | 28 ++ .../per_channel_quant_int8/cuda/kernel.cuh | 316 ++++++++++++++++ .../ops/quant/per_channel_quant_int8/info.h | 59 +++ .../nvidia/per_channel_quant_int8_nvidia.cu | 118 ++++++ .../nvidia/per_channel_quant_int8_nvidia.cuh | 7 + .../quant/per_channel_quant_int8/operator.cc | 98 +++++ .../per_channel_quant_int8.h | 40 ++ test/infiniop/libinfiniop/op_register.py | 37 ++ test/infiniop/per_channel_quant_int8.py | 193 ++++++++++ test/infiniop/w8a8int8.py | 348 ++++++++++++++++++ xmake.lua | 3 +- xmake/nvidia.lua | 2 +- xmake/qy.lua | 2 +- 13 files changed, 1248 insertions(+), 3 deletions(-) create mode 100644 include/infiniop/ops/quant/per_channel_quant_int8.h create mode 100644 src/infiniop/ops/quant/per_channel_quant_int8/cuda/kernel.cuh create mode 100644 src/infiniop/ops/quant/per_channel_quant_int8/info.h create mode 100644 src/infiniop/ops/quant/per_channel_quant_int8/nvidia/per_channel_quant_int8_nvidia.cu create mode 100644 src/infiniop/ops/quant/per_channel_quant_int8/nvidia/per_channel_quant_int8_nvidia.cuh create mode 100644 src/infiniop/ops/quant/per_channel_quant_int8/operator.cc create mode 100644 src/infiniop/ops/quant/per_channel_quant_int8/per_channel_quant_int8.h create mode 100644 test/infiniop/per_channel_quant_int8.py create mode 100644 test/infiniop/w8a8int8.py diff --git a/include/infiniop/ops/quant/per_channel_quant_int8.h b/include/infiniop/ops/quant/per_channel_quant_int8.h new file mode 100644 index 000000000..ce21f4556 --- /dev/null +++ b/include/infiniop/ops/quant/per_channel_quant_int8.h @@ -0,0 +1,28 @@ +#ifndef __INFINIOP_PER_CHANNEL_QUANT_INT8_API_H__ +#define __INFINIOP_PER_CHANNEL_QUANT_INT8_API_H__ + +#include "../../operator_descriptor.h" + +typedef InfiniopDescriptor *infiniopPerChannelQuantI8Descriptor_t; + +__C __export infiniStatus_t infiniopCreatePerChannelQuantI8Descriptor(infiniopHandle_t handle, + infiniopPerChannelQuantI8Descriptor_t *desc_ptr, + infiniopTensorDescriptor_t x_packed_desc, + infiniopTensorDescriptor_t x_scale_desc, + infiniopTensorDescriptor_t x_zero_desc, + infiniopTensorDescriptor_t x_desc); + +__C __export infiniStatus_t infiniopGetPerChannelQuantI8WorkspaceSize(infiniopPerChannelQuantI8Descriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopPerChannelQuantI8(infiniopPerChannelQuantI8Descriptor_t desc, + void *workspace, + size_t workspace_size, + void *x_packed, + void *x_scale, + void *x_zero, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyPerChannelQuantI8Descriptor(infiniopPerChannelQuantI8Descriptor_t desc); + +#endif diff --git a/src/infiniop/ops/quant/per_channel_quant_int8/cuda/kernel.cuh b/src/infiniop/ops/quant/per_channel_quant_int8/cuda/kernel.cuh new file mode 100644 index 000000000..629cc3b5f --- /dev/null +++ b/src/infiniop/ops/quant/per_channel_quant_int8/cuda/kernel.cuh @@ -0,0 +1,316 @@ +#ifndef __PERCHANNEL_QUANTINT8_KERNEL_CUH__ +#define __PERCHANNEL_QUANTINT8_KERNEL_CUH__ + +#include +/** + * Rounds a floating-point value to the nearest integer using + * the "half away from zero" tie-breaking rule. + * + * This rounding mode rounds to the nearest whole number, with ties + * (values exactly halfway between integers) rounded away from zero. + * For positive numbers: 1.5 rounds to 2, 2.5 rounds to 3 + * For negative numbers: -1.5 rounds to -2, -2.5 rounds to -3 + * This differs from standard "round to nearest, ties to even" banking rounding. + * + * @param x The floating-point value to round. + * @return The rounded integer value as an int. + * + * @note This is a CUDA device function designed to execute on GPU hardware. + * @note Uses floorf() and fabsf() from the CUDA math library. + */ +__device__ inline int round_half_away_from_zero(float x) { + float ax = fabsf(x); + float r = floorf(ax + 0.5f); + return (x >= 0.0f) ? (int)r : -(int)r; +} + +/** + * Performs per-channel asymmetric quantization to int8 precision for large matrices. + * + * This kernel quantizes input matrix x (M x K) to int8 using channel-wise (column-wise) + * quantization parameters, optimized for cases where K >= 1024. Each channel (column) + * has independently computed scale and zero point to minimize quantization error. + * + * The quantization follows: x_quantized = round((x - zero) / scale) + * where zero points shift the range and scales normalize to int8 range [-128, 127]. + * + * @tparam Tdata Input data type (typically float or half) + * @tparam BLOCK_SIZE CUDA block size for thread cooperation + * + * @param x_packed Output buffer for packed int8 quantized values + * @param x_scale Output buffer for per-channel scale factors + * @param x_zero Output buffer for per-channel zero points + * @param x Input matrix in row-major layout (M rows, K columns) + * @param M Number of rows in input matrix + * @param K Number of columns in input matrix (channels) + * + * @note This is a CUDA device function optimized for GPU execution + * @note Designed for large channel dimensions (K >= 1024) to maximize parallelization + * @note Uses block-level reductions for efficient min/max computation per channel + */ +template +__device__ void blockPerChannelQuantI8Kernel( + int8_t *x_packed, float *x_scale, float *x_zero, const Tdata *x, + int M, int K) { + int row = blockIdx.x; + int tid = row * K; + + // ---- 1. reduce max ---- + float local_max = op::common_cuda::reduce_op::max( + x + tid, K); + + __shared__ float global_max_f; + if (threadIdx.x == 0) { + global_max_f = local_max; + } + __syncthreads(); + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + // ---- 2. reduce min ---- + float thread_min = __FLT_MAX__; + for (int ind = threadIdx.x; ind < K; ind += BLOCK_SIZE) { + thread_min = fminf(thread_min, (float)x[tid + ind]); + } + float local_min = BlockReduce(temp_storage).Reduce(thread_min, cub::Min()); + + __shared__ float global_min_f; + if (threadIdx.x == 0) { + global_min_f = local_min; + } + __syncthreads(); + + float global_max = global_max_f; + float global_min = global_min_f; + + float scale = (global_max - global_min) / 255.0f; + if (scale < 1e-8f) { + scale = 1e-8f; + } + + float inv_scale = 1.0f / scale; + float zero = -global_min * inv_scale - 128.0f; + + x_scale[row] = (Tdata)scale; + x_zero[row] = (Tdata)zero; + + for (int ind = threadIdx.x; ind < K; ind += BLOCK_SIZE) { + + float v = (float)x[tid + ind]; + float qf = v * inv_scale + zero; + + int q = round_half_away_from_zero(qf); + + if (q > 127) { + q = 127; + } + if (q < -128) { + q = -128; + } + + x_packed[tid + ind] = (int8_t)q; + } +} +/** + * Performs per-channel symmetric quantization to int8 for large matrices (K >= 1024). + * Uses zero-centered scaling only, no zero point, and packs quantized data. + */ +template +__device__ void blockPerChannelQuantI8SymKernel( + int8_t *x_packed, float *x_scale, const Tdata *x, + int M, int K) { + int row = blockIdx.x; + int tid = row * K; + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + // ---- 2. reduce min ---- + float thread_max = -__FLT_MAX__; + for (int ind = threadIdx.x; ind < K; ind += BLOCK_SIZE) { + thread_max = fmaxf(thread_max, fabs((float)x[tid + ind])); + } + float local_max = BlockReduce(temp_storage).Reduce(thread_max, cub::Max()); + + __shared__ float global_max_f; + if (threadIdx.x == 0) { + global_max_f = local_max; + } + __syncthreads(); + + float global_max = global_max_f; + + float scale = global_max / 127.0f; + if (scale < 1e-8f) { + scale = 1e-8f; + } + + float inv_scale = 1.0f / scale; + + x_scale[row] = (Tdata)scale; + + for (int ind = threadIdx.x; ind < K; ind += BLOCK_SIZE) { + + float v = (float)x[tid + ind]; + float qf = v * inv_scale; + + int q = round_half_away_from_zero(qf); + + if (q > 127) { + q = 127; + } + if (q < -127) { + q = -127; + } + + x_packed[tid + ind] = (int8_t)q; + } +} + +template +struct MaxOp { + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return max(a, b); + } +}; +template +struct MinOp { + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return min(a, b); + } +}; +template