Add int8 dequant function; misc improvements

matthewdouglas · matthewdouglas · commit df941ec723bc · 2024-11-25T10:32:46.000-05:00
diff --git a/benchmarking/int8/training_benchmark.py b/benchmarking/int8/training_benchmark.py
@@ -13,6 +13,8 @@
 
 k = 20
 
+torch.set_printoptions(precision=5, sci_mode=False, linewidth=120, edgeitems=20, threshold=10000)
+
 
 @pytest.mark.parametrize(
     ("batch", "seq", "model", "hidden"),
diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
@@ -350,18 +350,17 @@ def forward(
                 CAt[:, state.idx] = 0
 
             # Extract the input outliers in original precision
-            subA = A[:, state.idx]
+            subA = A[:, state.idx].contiguous()
 
             # Extract the corresponding weights
             if state.has_fp16_weights:
                 state.subB = B[:, state.idx].t()
             else:
-                outliers = state.CB[:, state.idx]
-
                 # To dequantize our weights associated with the input outliers,
                 # we want to divide by 127. It's however more performant to multiply
                 # by the reciprocal.
-                state.subB = (7.874016e-3 * outliers * state.SCB.view(-1, 1)).t().to(A.dtype)
+                outliers = state.CB[:, state.idx]
+                state.subB = (outliers.t() * state.SCB * 7.874015718698502e-3).to(A.dtype)
         else:
             subA = None
 
@@ -378,7 +377,7 @@ def forward(
 
         # 4. Mixed-precision decomposition matmul
         if subA is not None and state.subB is not None:
-            output += torch.matmul(subA, state.subB)
+            output = output.addmm(subA, state.subB)
 
         # 5. Save state
         ctx.state = state
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
@@ -2722,6 +2722,20 @@ def int8_double_quant(
     return quant_row, quant_col, row_stats, col_stats.flatten().float(), outlier_cols
 
 
+def int8_vectorwise_dequant(A: torch.Tensor, stats: torch.Tensor):
+    """Dequantizes a tensor with dtype `torch.int8` to `torch.float32`.
+
+    Args:
+        A (`torch.Tensor` with dtype `torch.int8`): The quantized int8 tensor.
+        stats (`torch.Tensor` with dtype `torch.float32`): The row-wise quantization statistics.
+
+    Returns:
+        `torch.Tensor` with dtype `torch.float32`: The dequantized tensor.
+    """
+    # Scale each row by its stat / 127. The literal is 1/127 rounded to float32, so we multiply rather than divide.
+    return A * stats.view(-1, 1) * 7.874015718698502e-3
+
+
 def int8_vectorwise_quant(A: torch.Tensor, threshold=0.0):
     """Quantizes a tensor with dtype `torch.float16` to `torch.int8` in accordance to the `LLM.int8()` algorithm.
 
@@ -3026,7 +3040,10 @@ def vectorwise_quant(x, dim=1, quant_type="vector"):
         return None
 
 
-@deprecated("This function is deprecated and will be removed in a future release.", category=FutureWarning)
+@deprecated(
+    "This function is deprecated and will be removed in a future release. Consider using `int8_vectorwise_dequant` instead.",
+    category=FutureWarning,
+)
 def vectorwise_dequant(xq, max1, quant_type="vector"):
     if quant_type == "vector":
         x = (xq / C * max1).to(torch.float32)
diff --git a/csrc/kernels.cu b/csrc/kernels.cu
@@ -2159,7 +2159,7 @@ __global__ void kInt8VectorQuant(T * __restrict__ A, int8_t* out, float* rowStat
   // Threads will read the row values in a striped access pattern and find a local absmax.
   float row_local_absmax = -FLT_MIN;
   for (int i = threadIdx.x; i < cols; i += THREADS) {
-    const float absval = fabsf(__ldg(&(row_data[i])));
+    const float absval = fabsf(__ldcs(&(row_data[i])));
 
     // For sparse decomposition, values outside of the threshold are not to be
     // included when calculating the row's absmax.
@@ -2171,7 +2171,6 @@ __global__ void kInt8VectorQuant(T * __restrict__ A, int8_t* out, float* rowStat
   }
 
   // Reduce thread-local absmax across the block.
-  // TODO: Consider algorithm BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY
   const float row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, cub::Max(), cols);
   if (threadIdx.x == 0) {
     // Save our block's absmax to shared memory for the quantization step.
diff --git a/tests/test_functional.py b/tests/test_functional.py
@@ -695,38 +695,6 @@ def test_igemmlt_row_scale(dim1, dim4, inner):
     print(sum(err3) / len(err3))
 
 
-@pytest.mark.parametrize("dim1", get_test_dims(2, 1024, n=2), ids=id_formatter("dim1"))
-@pytest.mark.parametrize("dim2", get_test_dims(2, 1024, n=2), ids=id_formatter("dim2"))
-@pytest.mark.parametrize("dim3", [0], ids=id_formatter("dim3"))
-@pytest.mark.parametrize("dims", [2], ids=id_formatter("dims"))
-@pytest.mark.parametrize("dtype", [torch.int8], ids=describe_dtype)
-@pytest.mark.parametrize("orderA", ["row"], ids=id_formatter("orderA"))
-@pytest.mark.parametrize("orderOut", ["col32", "col_turing", "col_ampere"], ids=id_formatter("orderOut"))
-@pytest.mark.parametrize("transpose", TRUE_FALSE, ids=id_formatter("transpose"))
-@pytest.mark.deprecated
-def test_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose):
-    for i in range(k):
-        if dims == 2:
-            A = torch.randint(10, 99, size=(dim1, dim2), device="cuda").to(dtype)
-        elif dims == 3:
-            A = torch.randint(10, 99, size=(dim1, dim2, dim3), device="cuda").to(dtype)
-
-        A.view(-1)[-1] = -1
-        if transpose:
-            At = A.t().contiguous()
-            out1, S1 = F.nvidia_transform(At, to_order=orderOut)
-        else:
-            out1, S1 = F.nvidia_transform(A, to_order=orderOut)
-        out2, S2 = F.transform(A, to_order=orderOut, transpose=transpose)
-
-        assert S1[0][0] == S2[0][0]
-        assert S1[0][1] == S2[0][1]
-        # print(out1)
-        # print(out2)
-
-        torch.testing.assert_close(out1, out2)
-
-
 @pytest.mark.parametrize("dim1", [512, 2048], ids=id_formatter("dim1"))
 @pytest.mark.parametrize("dim2", [1024, 4096], ids=id_formatter("dim2"))
 def test_coo_double_quant(dim1, dim2):
@@ -1782,6 +1750,38 @@ def test_percentile_clipping(gtype):
         torch.testing.assert_close(gnorm1, gnorm2)
 
 
+@pytest.mark.parametrize("dim1", get_test_dims(2, 1024, n=2), ids=id_formatter("dim1"))
+@pytest.mark.parametrize("dim2", get_test_dims(2, 1024, n=2), ids=id_formatter("dim2"))
+@pytest.mark.parametrize("dim3", [0], ids=id_formatter("dim3"))
+@pytest.mark.parametrize("dims", [2], ids=id_formatter("dims"))
+@pytest.mark.parametrize("dtype", [torch.int8], ids=describe_dtype)
+@pytest.mark.parametrize("orderA", ["row"], ids=id_formatter("orderA"))
+@pytest.mark.parametrize("orderOut", ["col32", "col_turing", "col_ampere"], ids=id_formatter("orderOut"))
+@pytest.mark.parametrize("transpose", TRUE_FALSE, ids=id_formatter("transpose"))
+@pytest.mark.deprecated
+def test_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose):
+    for i in range(k):
+        if dims == 2:
+            A = torch.randint(10, 99, size=(dim1, dim2), device="cuda").to(dtype)
+        elif dims == 3:
+            A = torch.randint(10, 99, size=(dim1, dim2, dim3), device="cuda").to(dtype)
+
+        A.view(-1)[-1] = -1
+        if transpose:
+            At = A.t().contiguous()
+            out1, S1 = F.nvidia_transform(At, to_order=orderOut)
+        else:
+            out1, S1 = F.nvidia_transform(A, to_order=orderOut)
+        out2, S2 = F.transform(A, to_order=orderOut, transpose=transpose)
+
+        assert S1[0][0] == S2[0][0]
+        assert S1[0][1] == S2[0][1]
+        # Beyond the state entries compared above, the tensors returned by
+        # F.nvidia_transform and F.transform must match as well.
+
+        torch.testing.assert_close(out1, out2)
+
+
 @pytest.mark.parametrize("dim1", get_test_dims(2, 256, n=2), ids=id_formatter("dim1"))
 @pytest.mark.parametrize("dim2", get_test_dims(2, 256, n=2), ids=id_formatter("dim2"))
 @pytest.mark.parametrize("dim3", get_test_dims(2, 256, n=2), ids=id_formatter("dim3"))