Type annotations, cleanup

matthewdouglas · matthewdouglas · commit b1c4adc4cf89 · 2024-11-05T15:44:22.000-05:00
diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
@@ -244,25 +244,26 @@ def get_tile_inds(format, device):
 @dataclass
 class MatmulLtState:
     _tile_indices: Optional[torch.Tensor] = None
+
     force_no_igemmlt: bool = False
-    CB = None
-    CxB = None  # TODO: Deprecate/remove
-    SB = None
-    SCB = None
 
-    CxBt = None  # TODO: Deprecate/remove
-    SBt = None
-    CBt = None
+    CB: Optional[torch.Tensor] = None
+    CxB: Optional[torch.Tensor] = None  # TODO: Deprecate/remove
+    SB: Optional[torch.Tensor] = None
+    SCB: Optional[torch.Tensor] = None
+
+    CxBt: Optional[torch.Tensor] = None  # TODO: Deprecate/remove
+    SBt: Optional[torch.Tensor] = None
+    CBt: Optional[torch.Tensor] = None
 
-    subB = None
+    subB: Optional[torch.Tensor] = None
 
-    outlier_pool = None
+    outlier_pool: Optional[GlobalOutlierPooler] = None
     has_accumulated_gradients = False
     threshold = 0.0
-    idx = None
+    idx: Optional[torch.Tensor] = None
     is_training = True
     has_fp16_weights = True
-    memory_efficient_backward = False
     use_pool = False
     formatB = "row"  # TODO: Deprecate/remove
 
@@ -313,10 +314,10 @@ def forward(
         if A.dtype != torch.float16:
             warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")
 
-        # 1. Quantize A. Note that as a side-effect, outliers are suppressed.
         if len(A.shape) == 3:
             A = A.reshape(-1, A.shape[-1])
 
+        # 1. Quantize A. Note that as a side-effect, outliers are suppressed in CA/CAt.
         if ctx.needs_input_grad[1]:
             # Slower path
             CA, CAt, SCA, SCAt, outlier_cols = F.double_quant(A.to(torch.float16), threshold=state.threshold)
@@ -366,6 +367,8 @@ def forward(
 
         # 3. Int8 Matmul
         out32 = F.int8_linear_matmul(CA, state.CB)
+
+        # Dequantize matmul result
         if bias is None or bias.dtype == torch.float16:
             # we apply the fused bias here
             output = F.int8_mm_dequant(out32, SCA, state.SCB, bias=bias).to(A.dtype)
@@ -375,7 +378,7 @@ def forward(
 
         # 4. Mixed-precision decomposition matmul
         if subA is not None and state.subB is not None:
-            output += torch.matmul(subA, state.subB.to(subA.dtype))
+            output += torch.matmul(subA, state.subB)
 
         # 5. Save state
         ctx.state = state
@@ -399,15 +402,15 @@ def forward(
         return output
 
     @staticmethod
-    def backward(ctx, grad_output):
+    def backward(ctx: torch.autograd.function.FunctionCtx, grad_output: torch.Tensor):
         if ctx.is_empty:
             bias_grad = None if ctx.bias is None else torch.zeros_like(ctx.bias)
             return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, bias_grad, None
 
         req_gradA, req_gradB, _, req_gradBias, _ = ctx.needs_input_grad
         CAt, subA, A = ctx.tensors
         SCAt, idx = ctx.tensor_states
-        state = ctx.state
+        state: MatmulLtState = ctx.state
         grad_A = grad_B = grad_bias = None
 
         if req_gradBias:
@@ -499,7 +502,7 @@ def matmul(
     out: Optional[torch.Tensor] = None,
     state: Optional[MatmulLtState] = None,
     threshold=0.0,
-    bias=None,
+    bias: Optional[torch.Tensor] = None,
 ):
     state = state or MatmulLtState()
     if threshold > 0.0:
@@ -512,7 +515,7 @@ def matmul_4bit(
     B: torch.Tensor,
     quant_state: F.QuantState,
     out: Optional[torch.Tensor] = None,
-    bias=None,
+    bias: Optional[torch.Tensor] = None,
 ):
     assert quant_state is not None
 
diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py
@@ -1,21 +1,3 @@
-"""
-extract factors the build is dependent on:
-[X] compute capability
-    [ ] TODO: Q - What if we have multiple GPUs of different makes?
-- CUDA version
-- Software:
-    - CPU-only: only CPU quantization functions (no optimizer, no matrix multiple)
-    - CuBLAS-LT: full-build 8-bit optimizer
-    - no CuBLAS-LT: no 8-bit matrix multiplication (`nomatmul`)
-
-evaluation:
-    - if paths faulty, return meaningful error
-    - else:
-        - determine CUDA version
-        - determine capabilities
-        - based on that set the default path
-"""
-
 import ctypes as ct
 import logging
 import os
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
@@ -2279,7 +2279,9 @@ def int8_linear_matmul(A: torch.Tensor, B: torch.Tensor, out: Optional[torch.Ten
     ldb = shapeB[-1]  # Activations (batch, tokens, inputs)
     ldc = shapeC[-1]  # Output (batch, tokens, outputs)
 
-    assert lda == ldb, f"igemmlt only supports B^T @ A. Inner dimensions do not match: B @ A = {shapeB} @ {shapeA}"
+    assert (
+        lda == ldb
+    ), f"int8_linear_matmul only supports B^T @ A. Inner dimensions do not match: B @ A = {shapeB} @ {shapeA}"
 
     is_on_gpu([A, B, out])
 
@@ -2288,7 +2290,7 @@ def int8_linear_matmul(A: torch.Tensor, B: torch.Tensor, out: Optional[torch.Ten
         ptrA = get_ptr(A)
         ptrB = get_ptr(B)
         ptrC = get_ptr(out)
-        ptrRowScale = get_ptr(None)
+        ptrRowScale = None
         m = ct.c_int32(m)
         n = ct.c_int32(n)
         k = ct.c_int32(k)
@@ -2303,7 +2305,7 @@ def int8_linear_matmul(A: torch.Tensor, B: torch.Tensor, out: Optional[torch.Ten
             has_error = lib.cigemmlt_8(ctx, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc, stream)
 
     if has_error == 100:  # `ERR_NOT_IMPLEMENTED` is defined as 100 in `ops.cu`
-        raise NotImplementedError("igemmlt not implemented!")
+        raise NotImplementedError("int8_linear_matmul not implemented!")
 
     if has_error:
         raise RuntimeError(
@@ -2369,7 +2371,7 @@ def get_colrow_absmax(
     col_stats: Optional[torch.Tensor] = None,
     nnz_block_ptr: Optional[torch.Tensor] = None,
     threshold=0.0,
-):
+) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
     # Note: prior impl only works with fp16
     assert A.is_floating_point()
 
@@ -2395,7 +2397,7 @@ def get_colrow_absmax(
     return row_stats, col_stats, outlier_mask
 
 
-def get_row_absmax(A, threshold=0.0):
+def get_row_absmax(A: torch.Tensor, threshold=0.0):
     assert A.dtype == torch.float16
 
     rows = prod(A.shape[:-1])
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
@@ -566,11 +566,11 @@ def __init__(
 class Int8Params(torch.nn.Parameter):
     def __new__(
         cls,
-        data=None,
+        data: Optional[torch.Tensor] = None,
         requires_grad=True,
         has_fp16_weights=False,
-        CB=None,
-        SCB=None,
+        CB: Optional[torch.Tensor] = None,
+        SCB: Optional[torch.Tensor] = None,
     ):
         if data is None:
             data = torch.empty(0)
@@ -881,7 +881,6 @@ def __init__(
         output_features: int,
         bias=True,
         has_fp16_weights=True,
-        memory_efficient_backward=False,
         threshold=0.0,
         index=None,
         device=None,
@@ -898,13 +897,12 @@ def __init__(
                 Whether the linear class uses the bias term as well.
         """
         super().__init__(input_features, output_features, bias, device)
-        assert not memory_efficient_backward, "memory_efficient_backward is no longer required and the argument is deprecated in 0.37.0 and will be removed in 0.39.0"
         self.state = bnb.MatmulLtState()
         self.index = index
 
         self.state.threshold = threshold
         self.state.has_fp16_weights = has_fp16_weights
-        self.state.memory_efficient_backward = memory_efficient_backward
+
         if threshold > 0.0 and not has_fp16_weights:
             self.state.use_pool = True
 
diff --git a/bitsandbytes/research/autograd/_functions.py b/bitsandbytes/research/autograd/_functions.py
@@ -328,9 +328,6 @@ def backward(ctx, grad_output):
             grad_B = torch.matmul(grad_output.t(), A)
 
         if req_gradA:
-            # if state.CBt is not None:
-            #    gradA32, SgradA32 = F.igemmlt(Cgrad, state.CBt.t())
-            #    grad_A = F.mm_dequant(gradA32, SgradA32, SCgrad, state.SCBt).view(ctx.grad_shape).to(ctx.dtype_A)
             if state.CB is not None:
                 CB = state.CB.to(ctx.dtype_A, copy=True).mul_(state.SCB.unsqueeze(1).mul(1.0 / 127.0))
                 grad_A = torch.matmul(grad_output, CB).view(ctx.grad_shape).to(ctx.dtype_A)