
Commit b4370b8

Try removing extra transpose ops in 4bit
1 parent: 4e41a4b

File tree

2 files changed: +3 -3 lines

bitsandbytes/autograd/_functions.py

2 additions & 2 deletions

@@ -327,7 +327,7 @@ def forward(ctx, A, B, out=None, bias=None, quant_state: Optional[F.QuantState]

         # 1. Dequantize
         # 2. MatmulnN
-        output = torch.nn.functional.linear(A, F.dequantize_4bit(B, quant_state).to(A.dtype).t(), bias)
+        output = torch.nn.functional.linear(A, F.dequantize_4bit(B, quant_state).to(A.dtype), bias)

         # 3. Save state
         ctx.state = quant_state

@@ -393,7 +393,7 @@ def matmul_4bit(
             )
             return MatMul4Bit.apply(A, B, out, bias, quant_state)
         else:
-            out = F.gemv_4bit(A, B.t(), out, state=quant_state)
+            out = F.gemv_4bit(A, B, out, state=quant_state)
             if bias is not None:
                 out += bias
             return out
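
Both edits in _functions.py drop a .t() on the weight path: dequantize_4bit reconstructs the weight in the shape recorded in quant_state, here (out_features, in_features), which is exactly the layout torch.nn.functional.linear expects, so the extra transpose was redundant. A minimal sketch of that round trip (hypothetical shapes; assumes a CUDA device with bitsandbytes installed):

import torch
import bitsandbytes.functional as F

out_features, in_features = 64, 128
W = torch.randn(out_features, in_features, dtype=torch.float16, device="cuda")
x = torch.randn(4, in_features, dtype=torch.float16, device="cuda")

# Quantize to NF4; quant_state remembers the original (out_features, in_features) shape.
W_q, quant_state = F.quantize_4bit(W, quant_type="nf4")

# Dequantize and apply linear directly -- no .t(), matching the updated code path.
W_dq = F.dequantize_4bit(W_q, quant_state).to(x.dtype)
reference = torch.nn.functional.linear(x, W)
approx = torch.nn.functional.linear(x, W_dq)

print((reference - approx).abs().max())  # small; bounded by the 4-bit quantization error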

bitsandbytes/nn/modules.py

1 addition & 1 deletion

@@ -480,7 +480,7 @@ def forward(self, x: torch.Tensor):

         bias = None if self.bias is None else self.bias.to(self.compute_dtype)

-        return bnb.matmul_4bit(x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state).to(inp_dtype)
+        return bnb.matmul_4bit(x, self.weight.data, bias=bias, quant_state=self.weight.quant_state).to(inp_dtype)


 class LinearFP4(Linear4bit):
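
For the module-level change, Linear4bit.forward now hands self.weight.data straight to bnb.matmul_4bit instead of transposing the packed weight first. A quick end-to-end check of that path (a sketch, assuming a CUDA device; the weights are quantized when the layer is moved to the GPU):

import torch
import bitsandbytes as bnb
import bitsandbytes.functional as F

# Build a 4-bit linear layer; moving it to CUDA packs and quantizes the weight.
layer = bnb.nn.Linear4bit(128, 64, bias=False, compute_dtype=torch.float16, quant_type="nf4").cuda()
x = torch.randn(4, 128, dtype=torch.float16, device="cuda")

# Reference: dequantize the packed weight and run a plain linear, again without .t().
W_dq = F.dequantize_4bit(layer.weight.data, layer.weight.quant_state).to(torch.float16)
reference = torch.nn.functional.linear(x, W_dq)

# The module forward (bnb.matmul_4bit under the hood) should agree with the reference,
# since both paths dequantize the same 4-bit weight.
print((layer(x) - reference).abs().max())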
