Commit 7f45c2a

Quantize: Use FP16 costs, use shmem cost buffer down to 2bpw
1 parent 6fecf02 commit 7f45c2a
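Switching the cost buffers from FP32 to FP16 halves the per-tile trellis cost storage (2 × (65536 >> K) entries), which is what lets the cost buffer stay in shared memory down to K = 2 instead of K = 4. A rough budget check, as a standalone host-side sketch mirroring the sizing formula in quantize.cu (the loop bounds and the 48 KB default cap are illustrative, not part of the commit):

    // Hedged sketch, not part of the commit: dynamic shared memory needed per tile
    // by quantize_tiles_kernel after this change, as a function of K (bits per weight).
    #include <cstdio>
    #include <cstddef>

    int main()
    {
        for (int K = 1; K <= 8; ++K)
        {
            size_t edges = 65536u >> K;          // trellis states per step
            size_t shmem = 2 * edges * 2         // two FP16 cost buffers (ping-pong)
                         + 256 * 2               // 256-value input tile as FP16
                         + 32 * 2 + 32 * 4;      // sh_min / sh_idx reduction scratch
            printf("K=%d: %zu bytes %s\n", K, shmem,
                   shmem <= 48 * 1024 ? "(fits the 48 KB default)" : "(needs opt-in)");
        }
        return 0;
    }

For K = 2 this comes to roughly 65 KB, above the 48 KB default, which is why the launch path now raises cudaFuncAttributeMaxDynamicSharedMemorySize before launching; for K = 1 (≈129 KB) the kernel keeps the global temp buffer fallback (constexpr int mshk = 2).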

File tree

2 files changed: +69 −50 lines

exllamav3/exllamav3_ext/quant/quantize.cu
exllamav3/modules/quant/exl3_lib/quantize.py


exllamav3/exllamav3_ext/quant/quantize.cu

Lines changed: 68 additions & 49 deletions
@@ -5,9 +5,12 @@
 #include "../util.h"
 #include "../util.cuh"
 #include "codebook.cuh"
+#include "exl3_devctx.cuh"
 #include <cmath>

 #define NUM_THREADS 1024
+#define H_INF __ushort_as_half(0x7c00)
+#define H_NINF __ushort_as_half(0xfc00)

 template <int K, int cb>
 __global__ __launch_bounds__(MIN(NUM_THREADS, 65536 >> K))
@@ -16,10 +19,13 @@ void quantize_tiles_kernel
     const float* __restrict__ input_tiles_ptr,
     float* __restrict__ output_tiles_ptr,
     uint16_t* __restrict__ output_indices_ptr,
-    float* __restrict__ temp_costs_ptr,
+    half* __restrict__ temp_costs_ptr,
     uint16_t* __restrict__ temp_edges_ptr
 )
 {
+    extern __shared__ uint8_t shbuf[];
+    uint8_t* sh = shbuf;
+
     int tile_idx = blockIdx.x;
     int thread = threadIdx.x;

@@ -32,46 +38,53 @@ void quantize_tiles_kernel
     uint16_t* output_indices = output_indices_ptr + 256 * tile_idx;
     uint16_t* temp_edges = temp_edges_ptr + 256 * edges * tile_idx;

-    // K >= 4 lets temp_costs fit in shmem, otherwise fall back to global temp buffer
-    __shared__ float sh_temp_costs[K >= 4 ? 2 * 65536 >> K : 1];
-    float* temp_costs = K >= 4 ? sh_temp_costs : temp_costs_ptr + 2 * edges * tile_idx;
-    float* temp_costs_inc = temp_costs + edges;
+    // Tile buffer
+    half* sh_input_tile = (half*) sh; sh += 256 * sizeof(half);
+
+    half* sh_min = (half*) sh; sh += 32 * sizeof(half);
+    int* sh_idx = (int*) sh; sh += 32 * sizeof(int);
+
+    // K >= mshk lets temp_costs fit in shmem, otherwise fall back to global temp buffer
+    constexpr int mshk = 2;
+    half* sh_temp_costs = (half*) sh;
+    half* temp_costs = K >= mshk ? sh_temp_costs : temp_costs_ptr + 2 * edges * tile_idx;
+    half* temp_costs_inc = temp_costs + edges;

     // Fetch input tile to shmem
-    __shared__ float sh_input_tile[256];
-    if (thread < 256) sh_input_tile[thread] = input_tile[thread];
+    if (thread < 256) sh_input_tile[thread] = __float2half_rn(input_tile[thread]);
     __syncthreads();

     auto forward = [&](int roll, int pre_state)
     {
         int ri = roll % 256;
+        half dh, err, min_err, w;

         // temp_costs_inc[z] is the cost/cumulative error of an incoming edge from state (z & edge_mask)
-        float* t = temp_costs;
+        half* t = temp_costs;
         temp_costs = temp_costs_inc;
         temp_costs_inc = t;

         for (int out_edge_idx = thread; out_edge_idx < edges; out_edge_idx += NUM_THREADS)
         {
-            float w = sh_input_tile[ri];
+            w = sh_input_tile[ri];

             int state = out_edge_idx;
             int in_edge_idx = state >> K;
-            float err = decode_3inst_f_diff<cb>(state, w);
-            err = err * err;
-            if (pre_state >= 0 && in_edge_idx != pre_state) err = 1e30f;
-            float min_err = err;
+            dh = __hsub(decode_3inst<cb>(state), w);
+            err = __hmul(dh, dh);
+            if (pre_state >= 0 && in_edge_idx != pre_state) err = H_INF;
+            min_err = err;
             int min_in_edge = in_edge_idx;

             #pragma unroll
             for (int k = 1; k < max_q; ++k)
             {
                 state = (k << Kr) | out_edge_idx;
                 in_edge_idx = state >> K;
-                err = decode_3inst_f_diff<cb>(state, w);
-                err = err * err;
-                if (pre_state >= 0 && in_edge_idx != pre_state) err = 1e30f;
-                if (err < min_err) { min_err = err; min_in_edge = in_edge_idx; }
+                dh = __hsub(decode_3inst<cb>(state), w);
+                err = __hmul(dh, dh);
+                if (pre_state >= 0 && in_edge_idx != pre_state) err = H_INF;
+                if (__hlt(err, min_err)) { min_err = err; min_in_edge = in_edge_idx; }
             }

             temp_costs[out_edge_idx] = min_err;
@@ -93,23 +106,23 @@ void quantize_tiles_kernel

         for (int out_edge_idx = thread; out_edge_idx < edges; out_edge_idx += NUM_THREADS)
         {
-            float w = sh_input_tile[ri];
+            w = sh_input_tile[ri];

             int state = out_edge_idx;
             int in_edge_idx = state >> K;
-            float err = decode_3inst_f_diff<cb>(state, w);
-            err = err * err + temp_costs_inc[in_edge_idx];
-            float min_err = err;
+            dh = __hsub(decode_3inst<cb>(state), w);
+            err = __hfma(dh, dh, temp_costs_inc[in_edge_idx]);
+            min_err = err;
             int min_in_edge = in_edge_idx;

             #pragma unroll
             for (int k = 1; k < max_q; ++k)
             {
                 state = (k << Kr) | out_edge_idx;
                 in_edge_idx = state >> K;
-                err = decode_3inst_f_diff<cb>(state, w);
-                err = err * err + temp_costs_inc[in_edge_idx];
-                if (err < min_err) { min_err = err; min_in_edge = in_edge_idx; }
+                dh = __hsub(decode_3inst<cb>(state), w);
+                err = __hfma(dh, dh, temp_costs_inc[in_edge_idx]);
+                if (__hlt(err, min_err)) { min_err = err; min_in_edge = in_edge_idx; }
             }

             temp_costs[out_edge_idx] = min_err;
@@ -125,16 +138,13 @@ void quantize_tiles_kernel
     {
         // Find the final state with the lowest total cost. Return value is only valid in thread 0

-        float local_min = 1e30f;
+        half local_min = H_INF;
         int local_idx = -1;
+        #pragma unroll
         for (int e = threadIdx.x; e < edges; e += NUM_THREADS)
         {
-            float v = temp_costs_inc[e];
-            if (v < local_min)
-            {
-                local_min = v;
-                local_idx = e;
-            }
+            half v = temp_costs_inc[e];
+            if (__hlt(v, local_min)) { local_min = v; local_idx = e; }
         }

         // Shuffle reduction
@@ -144,33 +154,30 @@ void quantize_tiles_kernel
         #pragma unroll
         for (int offset = 16; offset > 0; offset >>= 1)
         {
-            float other_min = __shfl_down_sync(0xffffffff, local_min, offset, 32);
+            half other_min = __shfl_down_sync(0xffffffff, local_min, offset, 32);
             int other_idx = __shfl_down_sync(0xffffffff, local_idx, offset, 32);
-            if (other_min < local_min)
+            if (__hlt(other_min, local_min))
             {
                 local_min = other_min;
                 local_idx = other_idx;
             }
         }

-        __shared__ float s_min[32];
-        __shared__ int s_idx[32];
-
-        s_min[warp_id] = local_min;
-        s_idx[warp_id] = local_idx;
+        sh_min[warp_id] = local_min;
+        sh_idx[warp_id] = local_idx;
         __syncthreads();

         if (warp_id == 0)
         {
-            local_min = lane_id * 32 < edges && thread < NUM_THREADS / 32 ? s_min[lane_id] : 1e31f;
-            local_idx = thread < NUM_THREADS ? s_idx[lane_id] : 0;
+            local_min = lane_id * 32 < edges && thread < NUM_THREADS / 32 ? sh_min[lane_id] : H_INF;
+            local_idx = thread < NUM_THREADS ? sh_idx[lane_id] : 0;

             #pragma unroll
             for (int offset = 16; offset > 0; offset >>= 1)
             {
-                float other_min = __shfl_down_sync(0xffffffff, local_min, offset, 32);
+                half other_min = __shfl_down_sync(0xffffffff, local_min, offset, 32);
                 int other_idx = __shfl_down_sync(0xffffffff, local_idx, offset, 32);
-                if (other_min < local_min)
+                if (__hlt(other_min, local_min))
                 {
                     local_min = other_min;
                     local_idx = other_idx;
@@ -206,10 +213,9 @@ void quantize_tiles_kernel
         }

         // Broadcast to block
-        __shared__ int broadcast;
-        if (thread == 0) broadcast = edge;
+        if (thread == 0) sh_idx[0] = edge;
         __syncthreads();
-        edge = broadcast;
+        edge = sh_idx[0];

         return edge;
     };
@@ -270,8 +276,9 @@ void quantize_tiles
     int threads = MIN(NUM_THREADS, edges);

     int num_tiles = input_tiles.size(0);
+    if (!num_tiles) return;

-    TORCH_CHECK_DTYPE(temp_costs, kFloat);
+    TORCH_CHECK_DTYPE(temp_costs, kHalf);
     TORCH_CHECK_DIM(temp_costs, 3);
     TORCH_CHECK_SIZE(temp_costs, 1, 2);
     TORCH_CHECK_SIZE(temp_costs, 2, edges);
@@ -281,7 +288,10 @@ void quantize_tiles
     TORCH_CHECK_SIZE(temp_edges, 1, 256);
     TORCH_CHECK_SIZE(temp_edges, 2, edges);

-    int max_batch_size = temp_costs.size(0);
+    int device;
+    cudaGetDevice(&device);
+    int num_sms = DevCtx::instance().get_num_sms(device);
+    int max_batch_size = MIN(temp_costs.size(0), num_sms);

     int cb = 0;
     if (mcg) cb = 1;
@@ -295,13 +305,22 @@ void quantize_tiles
         const float* input_tiles_ptr = ((const float*) input_tiles.data_ptr()) + 256 * batch_i;
         float* output_tiles_ptr = ((float*) output_tiles.data_ptr()) + 256 * batch_i;
         uint16_t* output_indices_ptr = ((uint16_t*) output_indices.data_ptr()) + 256 * batch_i;
-        float* temp_costs_ptr = (float*) temp_costs.data_ptr();
+        half* temp_costs_ptr = (half*) temp_costs.data_ptr();
         uint16_t* temp_edges_ptr = (uint16_t*) temp_edges.data_ptr();

         int bsz = batch_j - batch_i;
         int kernel_idx = K - 1 + 8 * cb;
+        int shmem = 2 * (65536 >> K) * sizeof(half) + 512 + 64 + 128;
+
+        cudaFuncSetAttribute
+        (
+            quantize_tiles_kernel_instances[kernel_idx],
+            cudaFuncAttributeMaxDynamicSharedMemorySize,
+            shmem
+        );
+        cuda_check(cudaPeekAtLastError());

-        quantize_tiles_kernel_instances[kernel_idx]<<<bsz, threads, 0, stream>>>
+        quantize_tiles_kernel_instances[kernel_idx]<<<bsz, threads, shmem, stream>>>
         (
             input_tiles_ptr,
             output_tiles_ptr,

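Launching with more than 48 KB of dynamic shared memory requires a per-kernel opt-in, which is what the new cudaFuncSetAttribute call above provides before the launch passes the larger shmem argument. A minimal standalone sketch of that pattern, with a placeholder kernel name (the commit applies it to quantize_tiles_kernel_instances[kernel_idx]):

    #include <cuda_runtime.h>

    __global__ void example_kernel()
    {
        extern __shared__ unsigned char shbuf[];   // dynamic shared memory
        // ... kernel body ...
    }

    // Launch helper: raise the per-kernel dynamic shared memory cap, then launch.
    // Without the attribute, launches requesting more than 48 KB fail.
    cudaError_t launch_large_shmem(dim3 grid, dim3 block, int shmem, cudaStream_t stream)
    {
        cudaError_t err = cudaFuncSetAttribute(
            example_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem);
        if (err != cudaSuccess) return err;
        example_kernel<<<grid, block, shmem, stream>>>();
        return cudaPeekAtLastError();
    }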
exllamav3/modules/quant/exl3_lib/quantize.py

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@ def tensor_core_perm_i(device):
 @lru_cache
 def get_temp_buffers(device, K: int):
     max_batch_size = 256
-    temp_costs = torch.zeros((max_batch_size, 2, 65536 >> K), dtype = torch.float, device = device)
+    temp_costs = torch.zeros((max_batch_size, 2, 65536 >> K), dtype = torch.half, device = device)
     temp_edges = torch.zeros((max_batch_size, 256, 65536 >> K), dtype = torch.short, device = device)
     return temp_costs, temp_edges


0 commit comments