
Commit a13736a

Reused code for quant/dequant
1 parent c9e1908 commit a13736a

File tree: 2 files changed, +46 -89 lines

bitsandbytes/backends/triton/kernels_8bit_quant.py

Lines changed: 30 additions & 73 deletions
```diff
@@ -27,35 +27,19 @@
 @triton.jit
 def dequant_8bit_kernel(
     a_ptr,
-    c_ptr,
-    quant_ptr,
+    out_ptr,
+    code_ptr,
     absmax_ptr,
-    num_paired_elements,
+    n,
     QUANT_BLOCK: tl.constexpr,
     SPLIT_SIZE: tl.constexpr,
 ):
     pid = tl.program_id(axis=0)
     block_start = pid * SPLIT_SIZE
     offsets = block_start + tl.arange(0, SPLIT_SIZE)
-    mask = offsets < num_paired_elements
-
-    a = tl.load(a_ptr + offsets, mask)
-    a = a.to(tl.uint8)
-
-    # apply conversion
-    scaled_int8 = tl.load(quant_ptr + a, mask)
-
-    abs_blocks_lim = (num_paired_elements // QUANT_BLOCK) * QUANT_BLOCK + num_paired_elements % QUANT_BLOCK
-    abs_offsets = offsets // QUANT_BLOCK
-    mask_blocked = offsets < abs_blocks_lim
-
-    absmax = tl.load(absmax_ptr + abs_offsets, mask_blocked)
-    # apply scales
-    out_dq = scaled_int8 * absmax
-
-    offs = block_start + tl.arange(0, SPLIT_SIZE)
-    mask = offs < num_paired_elements
-    tl.store(c_ptr + offs, out_dq, mask)
+    mask = offsets < n
+    out_dq = dequant_8bit_blockwise_kernel_util(a_ptr, offsets, code_ptr, absmax_ptr, mask, QUANT_BLOCK)
+    tl.store(out_ptr + offsets, out_dq, mask)
 
 
 def dequant_8bit_blockwise(
@@ -66,21 +50,21 @@ def dequant_8bit_blockwise(
     dtype: torch.dtype = None,
     out: torch.Tensor = None,
 ):
-    number_of_paired_elements = a.numel()
+    n = a.numel()
     if out is None:
         if dtype is None:
             raise ValueError("If out is None, dtype must be specified")
         out = torch.empty_like(a, dtype=dtype, device=a.device)
 
     SPLIT_SIZE = 256
     # grid = lambda META: (triton.cdiv(number_of_paired_elements, META["SPLIT_SIZE"]),)
-    grid = (triton.cdiv(number_of_paired_elements, SPLIT_SIZE),)
+    grid = (triton.cdiv(n, SPLIT_SIZE),)
     dequant_8bit_kernel[grid](
         a,
         out,
         quant_state_code,
         absmax,
-        number_of_paired_elements,
+        n,
         quant_blocksize,
         SPLIT_SIZE,
     )
```
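For orientation, a call to the refactored wrapper might look like the sketch below. The hunk above elides the leading parameters of `dequant_8bit_blockwise`, so the keyword names (`a`, `quant_state_code`, `absmax`, `quant_blocksize`) are inferred from the wrapper body and should be read as assumptions, not the confirmed signature.

```python
import torch

# Hypothetical usage sketch -- keyword names are inferred from the wrapper
# body above, since the hunk elides the full parameter list.
codes = torch.randint(0, 256, (4096,), dtype=torch.uint8, device="cuda")  # 8-bit codes
code = torch.linspace(-1.0, 1.0, 256, device="cuda")                      # 256-entry codebook
absmax = torch.rand(4096 // 256, device="cuda")                           # one scale per block

out = dequant_8bit_blockwise(
    a=codes,
    quant_state_code=code,
    absmax=absmax,
    quant_blocksize=256,
    dtype=torch.float32,  # per the wrapper, dtype is required when out is None
)
```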
```diff
@@ -115,39 +99,9 @@ def quantize_8bit_blockwise_kernel(
 
     A = tl.load(A_ptr + offsets, mask=mask, other=0.0)
 
-    # To be able process several blocks -> (BLOCK_SIZE, SPLIT_NUM_BLOCKS)
-    A_reshaped = tl.reshape(A, (SPLIT_NUM_BLOCKS, BLOCK_SIZE))
-
-    # Calculating absamax for each block
-    absmax = tl.max(tl.abs(A_reshaped), axis=1)
+    quantized, absmax = quantize_8bit_blockwise_kernel_util(A, code_ptr, CODE_SIZE, BLOCK_SIZE, SPLIT_NUM_BLOCKS)
     tl.store(absmax_ptr + block_start_idx + tl.arange(0, SPLIT_NUM_BLOCKS), absmax)
-
-    A_normalized = A_reshaped / absmax[:, None]
-    A_normalized = tl.clamp(A_normalized, -1.0, 1.0)
-
-    lower_pivot = tl.zeros((SPLIT_NUM_BLOCKS, BLOCK_SIZE), dtype=tl.int32)
-    upper_pivot = tl.full((SPLIT_NUM_BLOCKS, BLOCK_SIZE), CODE_SIZE - 1, dtype=tl.int32)
-
-    for _ in range(8):  # ceil(log2(code_size)) = 8, actually, in general case should be input parameter
-        pivot = (lower_pivot + upper_pivot) // 2
-        val = tl.load(code_ptr + pivot)
-        is_higher = A_normalized > val  # code[pivot]
-        lower_pivot = tl.where(is_higher, pivot, lower_pivot)
-        upper_pivot = tl.where(is_higher, upper_pivot, pivot)
-
-    # Choose closest level
-    lower_val = tl.load(code_ptr + lower_pivot)
-    upper_val = tl.load(code_ptr + upper_pivot)
-    lower_dist = tl.abs(A_normalized - lower_val)
-    upper_dist = tl.abs(A_normalized - upper_val)
-    quantized = tl.where(lower_dist <= upper_dist, lower_pivot, upper_pivot).to(tl.uint8)
-
-    # too slow approach
-    # diff = tl.abs(A_normalized[:, :, None] - code[None, None, :])
-    # quantized = tl.argmin(diff, axis=2).to(tl.uint8)
-
-    quantized_flat = tl.reshape(quantized, (BLOCK_SIZE * SPLIT_NUM_BLOCKS,))
-    tl.store(out_ptr + offsets, quantized_flat, mask=mask)
+    tl.store(out_ptr + offsets, quantized, mask=mask)
 
 
 def quantize_blockwise_triton(A, code, blocksize, absmax=None, out=None):
@@ -180,17 +134,17 @@ def quantize_blockwise_triton(A, code, blocksize, absmax=None, out=None):
 
 
 @triton.jit
-def quantize_8bit_blockwise_core(
+def quantize_8bit_blockwise_kernel_util(
     a,
-    qmap_ptr,
+    code_ptr,
     CODE_SIZE: tl.constexpr,
     BLOCK_SIZE: tl.constexpr,
     N_PER_TH: tl.constexpr,
 ):
     # To be able process several blocks -> (BLOCK_SIZE, SPLIT_NUM_BLOCKS)
     a_reshaped = tl.reshape(a, (N_PER_TH, BLOCK_SIZE))
 
-    # Calculating absamax for each block
+    # Calculating absmax for each block
     absmax = tl.max(tl.abs(a_reshaped), axis=1)
 
     a_normalized = a_reshaped / absmax[:, None]
@@ -202,37 +156,40 @@ def quantize_8bit_blockwise_core(
     # ceil(log2(code_size)) = 8, actually, in general case should be input parameter
     for _ in range(8):
         pivot = (lower_pivot + upper_pivot) // 2
-        val = tl.load(qmap_ptr + pivot)
+        val = tl.load(code_ptr + pivot)
         is_higher = a_normalized > val  # code[pivot]
         lower_pivot = tl.where(is_higher, pivot, lower_pivot)
         upper_pivot = tl.where(is_higher, upper_pivot, pivot)
 
     # Choose closest level
-    lower_val = tl.load(qmap_ptr + lower_pivot)
-    upper_val = tl.load(qmap_ptr + upper_pivot)
+    lower_val = tl.load(code_ptr + lower_pivot)
+    upper_val = tl.load(code_ptr + upper_pivot)
     lower_dist = tl.abs(a_normalized - lower_val)
     upper_dist = tl.abs(a_normalized - upper_val)
     quantized = tl.where(lower_dist <= upper_dist, lower_pivot, upper_pivot).to(tl.uint8)
 
+    # too slow approach
+    # diff = tl.abs(A_normalized[:, :, None] - code[None, None, :])
+    # quantized = tl.argmin(diff, axis=2).to(tl.uint8)
+
     quantized_flat = tl.reshape(quantized, (BLOCK_SIZE * N_PER_TH,))
     return quantized_flat, absmax
 
 
 @triton.jit
-def dequant_8bit_kernel_util(
-    codes_ptr,
+def dequant_8bit_blockwise_kernel_util(
+    a_ptr,
     offsets,
-    qmap_ptr,
+    code_ptr,
     absmax_ptr,
     mask,
     BLOCK_SIZE: tl.constexpr,
 ):
-    codes = tl.load(codes_ptr + offsets, mask, other=0).to(tl.uint8)
-    abs_offsets = offsets // BLOCK_SIZE
-    absmax = tl.load(absmax_ptr + abs_offsets, mask=mask, other=0.0, eviction_policy="evict_last")
-
-    # apply conversion
-    scaled_int8 = tl.load(qmap_ptr + codes, mask)
-    # apply scales
+    a = tl.load(a_ptr + offsets, mask, other=0).to(tl.uint8)
+    scaled_int8 = tl.load(code_ptr + a, mask)
+    # Load scales
+    absmax_offsets = offsets // BLOCK_SIZE
+    absmax = tl.load(absmax_ptr + absmax_offsets, mask=mask, other=0.0, eviction_policy="evict_last")
+    # Apply scales
     out_dq = scaled_int8 * absmax
     return out_dq
```
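Since these two shared utilities now carry the whole quant/dequant logic, a plain-PyTorch mirror is a handy cross-check. The sketch below is an illustration, not part of the commit; it assumes a sorted 256-entry codebook and an input length that is a multiple of the block size.

```python
import torch

def quantize_8bit_blockwise_ref(a: torch.Tensor, code: torch.Tensor, block_size: int):
    """Pure-PyTorch mirror of quantize_8bit_blockwise_kernel_util (illustration only)."""
    blocks = a.reshape(-1, block_size)
    absmax = blocks.abs().amax(dim=1)
    normalized = (blocks / absmax[:, None]).clamp(-1.0, 1.0)

    # Same 8-step binary search over the sorted codebook as the Triton util
    lower = torch.zeros_like(blocks, dtype=torch.long)
    upper = torch.full_like(lower, code.numel() - 1)
    for _ in range(8):  # ceil(log2(256)) = 8
        pivot = (lower + upper) // 2
        is_higher = normalized > code[pivot]
        lower = torch.where(is_higher, pivot, lower)
        upper = torch.where(is_higher, upper, pivot)

    # Choose the closer of the two bracketing levels
    use_lower = (normalized - code[lower]).abs() <= (normalized - code[upper]).abs()
    quantized = torch.where(use_lower, lower, upper).to(torch.uint8)
    return quantized.reshape(-1), absmax

def dequant_8bit_blockwise_ref(codes, code, absmax, block_size: int):
    """Pure-PyTorch mirror of dequant_8bit_blockwise_kernel_util."""
    scaled = code[codes.long()]  # codebook lookup, values in [-1, 1]
    block_ids = torch.arange(codes.numel(), device=codes.device) // block_size
    return scaled * absmax[block_ids]

# Round trip: dequantized values should sit within one codebook step of the input
x = torch.randn(1024)
code = torch.linspace(-1.0, 1.0, 256)
q, absmax = quantize_8bit_blockwise_ref(x, code, block_size=256)
x_dq = dequant_8bit_blockwise_ref(q, code, absmax, block_size=256)
assert (x - x_dq).abs().max() <= absmax.max() * (code[1] - code[0])
```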

bitsandbytes/backends/triton/kernels_optim.py

Lines changed: 16 additions & 16 deletions
```diff
@@ -9,8 +9,8 @@
 # from triton.language.extra import libdevice
 from .kernels_8bit_quant import (
     dequant_8bit_blockwise,
-    dequant_8bit_kernel_util,
-    quantize_8bit_blockwise_core,
+    dequant_8bit_blockwise_kernel_util,
+    quantize_8bit_blockwise_kernel_util,
     quantize_blockwise_triton,
 )
 
@@ -445,7 +445,7 @@ def _optimizer_update_1state_8bit_blockwise_triton_kernel(
     # 2. Load and dequantize tensors
     g = tl.load(g_ptr + offsets, mask=mask, other=0.0).to(tl.float32) * gnorm_scale
     p = tl.load(p_ptr + offsets, mask=mask, other=0.0).to(tl.float32)
-    s1 = dequant_8bit_kernel_util(state1_ptr, offsets, qmap1_ptr, absmax1_ptr, mask, BLOCK_SIZE_N)
+    s1 = dequant_8bit_blockwise_kernel_util(state1_ptr, offsets, qmap1_ptr, absmax1_ptr, mask, BLOCK_SIZE_N)
 
     # 3. Optimizer-specific updates
     # LION
@@ -482,7 +482,7 @@ def _optimizer_update_1state_8bit_blockwise_triton_kernel(
 
     # 4. Store updated parameter and requantized state
     tl.store(p_ptr + offsets, p.to(p_ptr.dtype.element_ty), mask=mask)
-    s1_codes, new_absmax1 = quantize_8bit_blockwise_core(s1, qmap1_ptr, 256, BLOCK_SIZE_N, N_PER_TH)
+    s1_codes, new_absmax1 = quantize_8bit_blockwise_kernel_util(s1, qmap1_ptr, 256, BLOCK_SIZE_N, N_PER_TH)
     tl.store(state1_ptr + offsets, s1_codes, mask=mask)
     tl.store(absmax1_ptr + block_start_idx + tl.arange(0, N_PER_TH), new_absmax1)
 
@@ -533,8 +533,8 @@ def _optimizer_update_2state_8bit_blockwise_triton_kernel(
 
     # 3. Optimizer-specific updates
     if OPTIMIZER_ID == 3:  # ADAM
-        s1 = dequant_8bit_kernel_util(state1_ptr, offsets, qmap1_ptr, absmax1_ptr, mask, BLOCK_SIZE_N)
-        s2 = dequant_8bit_kernel_util(state2_ptr, offsets, qmap2_ptr, absmax2_ptr, mask, BLOCK_SIZE_N)
+        s1 = dequant_8bit_blockwise_kernel_util(state1_ptr, offsets, qmap1_ptr, absmax1_ptr, mask, BLOCK_SIZE_N)
+        s2 = dequant_8bit_blockwise_kernel_util(state2_ptr, offsets, qmap2_ptr, absmax2_ptr, mask, BLOCK_SIZE_N)
 
         s1 = s1 * beta1 + (1.0 - beta1) * g
         s2 = s2 * beta2 + (1.0 - beta2) * g * g
@@ -556,26 +556,26 @@ def _optimizer_update_2state_8bit_blockwise_triton_kernel(
         tl.store(p_ptr + offsets, p.to(p_ptr.dtype.element_ty), mask=mask)
 
         # Requantize and store states
-        s1_codes, new_absmax1 = quantize_8bit_blockwise_core(s1, qmap1_ptr, 256, BLOCK_SIZE_N, N_PER_TH)
+        s1_codes, new_absmax1 = quantize_8bit_blockwise_kernel_util(s1, qmap1_ptr, 256, BLOCK_SIZE_N, N_PER_TH)
         tl.store(state1_ptr + offsets, s1_codes, mask=mask)
         tl.store(absmax1_ptr + block_start_idx + tl.arange(0, N_PER_TH), new_absmax1)
 
-        s2_codes, new_absmax2 = quantize_8bit_blockwise_core(s2, qmap2_ptr, 256, BLOCK_SIZE_N, N_PER_TH)
+        s2_codes, new_absmax2 = quantize_8bit_blockwise_kernel_util(s2, qmap2_ptr, 256, BLOCK_SIZE_N, N_PER_TH)
         tl.store(state2_ptr + offsets, s2_codes, mask=mask)
         tl.store(absmax2_ptr + block_start_idx + tl.arange(0, N_PER_TH), new_absmax2)
 
     elif OPTIMIZER_ID == 5:  # ADEMAMIX
         # AdEMAMix has a stacked state1 (m1, m2) and state2 (nu)
-        m1 = dequant_8bit_kernel_util(state1_ptr, offsets, qmap1_ptr, absmax1_ptr, mask, BLOCK_SIZE_N)
-        m2 = dequant_8bit_kernel_util(
+        m1 = dequant_8bit_blockwise_kernel_util(state1_ptr, offsets, qmap1_ptr, absmax1_ptr, mask, BLOCK_SIZE_N)
+        m2 = dequant_8bit_blockwise_kernel_util(
             state1_ptr + n_elements,
             offsets,
             qmap1_ptr,
             absmax1_ptr + n_elements // BLOCK_SIZE_N,
             mask,
             BLOCK_SIZE_N,
         )
-        nu = dequant_8bit_kernel_util(state2_ptr, offsets, qmap2_ptr, absmax2_ptr, mask, BLOCK_SIZE_N)
+        nu = dequant_8bit_blockwise_kernel_util(state2_ptr, offsets, qmap2_ptr, absmax2_ptr, mask, BLOCK_SIZE_N)
 
         m1 = m1 * beta1 + (1.0 - beta1) * g
         m2 = m2 * beta3 + (1.0 - beta3) * g
@@ -599,18 +599,18 @@ def _optimizer_update_2state_8bit_blockwise_triton_kernel(
         tl.store(p_ptr + offsets, p.to(p_ptr.dtype.element_ty), mask=mask)
 
         # Requantize and store all three states
-        m1_codes, new_absmax_m1 = quantize_8bit_blockwise_core(m1, qmap1_ptr, 256, BLOCK_SIZE_N, N_PER_TH)
+        m1_codes, new_absmax_m1 = quantize_8bit_blockwise_kernel_util(m1, qmap1_ptr, 256, BLOCK_SIZE_N, N_PER_TH)
         tl.store(state1_ptr + offsets, m1_codes, mask=mask)
         tl.store(absmax1_ptr + block_start_idx + tl.arange(0, N_PER_TH), new_absmax_m1)
 
-        m2_codes, new_absmax_m2 = quantize_8bit_blockwise_core(m2, qmap1_ptr, 256, BLOCK_SIZE_N, N_PER_TH)
+        m2_codes, new_absmax_m2 = quantize_8bit_blockwise_kernel_util(m2, qmap1_ptr, 256, BLOCK_SIZE_N, N_PER_TH)
         tl.store(state1_ptr + n_elements + offsets, m2_codes, mask=mask)
         tl.store(
             absmax1_ptr + block_start_idx + tl.arange(0, N_PER_TH) + n_elements // BLOCK_SIZE_N,
             new_absmax_m2,
         )
 
-        nu_codes, new_absmax_nu = quantize_8bit_blockwise_core(nu, qmap2_ptr, 256, BLOCK_SIZE_N, N_PER_TH)
+        nu_codes, new_absmax_nu = quantize_8bit_blockwise_kernel_util(nu, qmap2_ptr, 256, BLOCK_SIZE_N, N_PER_TH)
         tl.store(state2_ptr + offsets, nu_codes, mask=mask)
         tl.store(absmax2_ptr + block_start_idx + tl.arange(0, N_PER_TH), new_absmax_nu)
 
@@ -625,7 +625,7 @@ def _optimizer_update_2state_8bit_blockwise_triton_kernel(
 }
 
 
-def optimizer_update_8bit_blockwise_triton_impl(
+def optimizer_update_8bit_blockwise_impl(
     optimizer_name: str,
     g: torch.Tensor,
     p: torch.Tensor,
@@ -703,4 +703,4 @@ def optimizer_update_8bit_blockwise_triton_impl(
 # optimizer_update_8bit_blockwise_impl = torch.compile(optimizer_update_8bit_blockwise_pytorch_impl)
 # optimizer_update_8bit_blockwise_impl = optimizer_update_8bit_blockwise_triton_quant
 # optimizer_update_8bit_blockwise_impl = torch.compile(optimizer_update_8bit_blockwise_triton_quant)
-optimizer_update_8bit_blockwise_impl = optimizer_update_8bit_blockwise_triton_impl
+optimizer_update_8bit_blockwise_impl = optimizer_update_8bit_blockwise_impl
```
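After this change, the optimizer kernels all follow one pattern: dequantize state with the shared util, update in float32, requantize with the shared util. Below is a rough pure-PyTorch analogue of the ADAM branch (`OPTIMIZER_ID == 3`), reusing the reference helpers sketched earlier; the moment updates match the diff, while the parameter step is the textbook Adam rule and only an assumed stand-in for the kernel's exact arithmetic, which this commit does not show.

```python
def adam_8bit_blockwise_step_ref(p, g, s1_codes, s2_codes, absmax1, absmax2,
                                 code1, code2, beta1, beta2, eps, lr,
                                 block_size=256):
    """Rough analogue of the OPTIMIZER_ID == 3 (ADAM) path, built on the
    quantize_8bit_blockwise_ref / dequant_8bit_blockwise_ref helpers above.
    The parameter step is an assumption, not the kernel's exact arithmetic.
    """
    # 1. Dequantize both optimizer states (shared dequant util in the kernel)
    s1 = dequant_8bit_blockwise_ref(s1_codes, code1, absmax1, block_size)
    s2 = dequant_8bit_blockwise_ref(s2_codes, code2, absmax2, block_size)

    # 2. Moment updates, exactly as in the diff
    s1 = s1 * beta1 + (1.0 - beta1) * g
    s2 = s2 * beta2 + (1.0 - beta2) * g * g

    # 3. Parameter update (assumed textbook Adam step, no bias correction)
    p = p - lr * s1 / (s2.sqrt() + eps)

    # 4. Requantize both states (shared quantize util in the kernel)
    s1_codes, absmax1 = quantize_8bit_blockwise_ref(s1, code1, block_size)
    s2_codes, absmax2 = quantize_8bit_blockwise_ref(s2, code2, block_size)
    return p, s1_codes, s2_codes, absmax1, absmax2
```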
