Commit ad6eef9

Merge pull request #753 from poedator/save4

Save and load in NF4 / FP4 formats

2 parents e812136 + 851806e

File tree

6 files changed: +376 −95 lines changed

.gitignore

Lines changed: 1 addition & 0 deletions

@@ -133,3 +133,4 @@ dmypy.json
 
 dependencies
 cuda_build
+.vscode/*

bitsandbytes/autograd/_functions.py

Lines changed: 7 additions & 9 deletions
@@ -496,15 +496,15 @@ class MatMul4Bit(torch.autograd.Function):
     # backward is mostly the same, but adds one extra clause (see "elif state.CxB is not None")
 
     @staticmethod
-    def forward(ctx, A, B, out=None, bias=None, state=None):
+    def forward(ctx, A, B, out=None, bias=None, quant_state: F.QuantState = None):
         # default of pytorch behavior if inputs are empty
         ctx.is_empty = False
         if prod(A.shape) == 0:
             ctx.is_empty = True
             ctx.A = A
             ctx.B = B
             ctx.bias = bias
-            B_shape = state[1]
+            B_shape = quant_state.shape
             if A.shape[-1] == B_shape[0]:
                 return torch.empty(A.shape[:-1] + B_shape[1:], dtype=A.dtype, device=A.device)
             else:
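
This hunk replaces the positional quant_state list with an attribute-style F.QuantState object (state[1] becomes quant_state.shape). Below is a minimal, hypothetical sketch of such a container, inferred only from the seven fields the old tuple unpacking exposed in the matmul_4bit hunk further down; the actual QuantState class this PR adds to bitsandbytes.functional may carry more logic, e.g. for the NF4/FP4 save/load path.

# Hypothetical sketch -- NOT the actual bitsandbytes class. Field names come
# from the tuple the old code unpacked:
#   absmax, shape, dtype, blocksize, compressed_stats, quant_type, data_type
from dataclasses import dataclass
from typing import Optional

import torch

@dataclass
class QuantStateSketch:
    absmax: torch.Tensor                # per-block scaling statistics
    shape: torch.Size                   # shape of the original weight (replaces state[1])
    dtype: torch.dtype                  # dtype to dequantize back to
    blocksize: int                      # elements per quantization block
    compressed_stats: Optional[object]  # nested state when absmax is itself quantized
    quant_type: str                     # "nf4" or "fp4"
    data_type: torch.Tensor             # 4-bit code values (lookup table)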
@@ -513,10 +513,10 @@ def forward(ctx, A, B, out=None, bias=None, state=None):
 
         # 1. Dequantize
         # 2. MatmulnN
-        output = torch.nn.functional.linear(A, F.dequantize_4bit(B, state).to(A.dtype).t(), bias)
+        output = torch.nn.functional.linear(A, F.dequantize_4bit(B, quant_state).to(A.dtype).t(), bias)
 
         # 3. Save state
-        ctx.state = state
+        ctx.state = quant_state
         ctx.dtype_A, ctx.dtype_B, ctx.dtype_bias = A.dtype, B.dtype, None if bias is None else bias.dtype
 
         if any(ctx.needs_input_grad[:2]):
@@ -534,7 +534,6 @@ def backward(ctx, grad_output):
 
         req_gradA, _, _, req_gradBias, _= ctx.needs_input_grad
         A, B = ctx.tensors
-        state = ctx.state
 
         grad_A, grad_B, grad_bias = None, None, None
 
@@ -563,12 +562,11 @@ def matmul(
     return MatMul8bitLt.apply(A, B, out, bias, state)
 
 
-def matmul_4bit(A: tensor, B: tensor, quant_state: List, out: tensor = None, bias=None):
+def matmul_4bit(A: tensor, B: tensor, quant_state: F.QuantState, out: tensor = None, bias=None):
     assert quant_state is not None
     if A.numel() == A.shape[-1] and A.requires_grad == False:
-        absmax, shape, dtype, blocksize, compressed_stats, quant_type, data_type = quant_state
-        if A.shape[-1] % blocksize != 0:
-            warn(f'Some matrices hidden dimension is not a multiple of {blocksize} and efficient inference kernels are not supported for these (slow). Matrix input size found: {A.shape}')
+        if A.shape[-1] % quant_state.blocksize != 0:
+            warn(f'Some matrices hidden dimension is not a multiple of {quant_state.blocksize} and efficient inference kernels are not supported for these (slow). Matrix input size found: {A.shape}')
         return MatMul4Bit.apply(A, B, out, bias, quant_state)
     else:
         out = F.gemv_4bit(A, B.t(), out, state=quant_state)
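
With quant_state carrying named attributes, the single-token inference fast path reads blocksize directly off the object instead of unpacking a seven-element tuple. A usage sketch follows, mirroring how bnb.nn.Linear4bit invokes matmul_4bit; it assumes a CUDA device and a bitsandbytes build from around this commit, and defaults may have shifted since.

# Usage sketch under stated assumptions: CUDA available, bitsandbytes
# at roughly this commit.
import torch
import bitsandbytes as bnb
import bitsandbytes.functional as F

# Quantize a weight matrix to NF4; quantize_4bit returns the packed
# 4-bit tensor plus the quant_state used throughout the diff above.
W = torch.randn(4096, 4096, dtype=torch.float16, device="cuda")
W4, quant_state = F.quantize_4bit(W, blocksize=64, quant_type="nf4")

# Single-token input: A.numel() == A.shape[-1] and no grad, so
# matmul_4bit takes the fused F.gemv_4bit fast path.
x = torch.randn(1, 4096, dtype=torch.float16, device="cuda")
y = bnb.matmul_4bit(x, W4.t(), quant_state=quant_state)

# The warning in the hunk above only fires when the hidden dimension is
# not a multiple of quant_state.blocksize; 4096 % 64 == 0, so it stays quiet.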
