Commit 1bc28db

Merge conflict
2 parents f74995b + 70bbbb9

8 files changed: 98 additions & 25 deletions

8 files changed

+98
-25
lines changed

.clang-format

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
+---
+BasedOnStyle: LLVM
+AlignAfterOpenBracket: BlockIndent
+BinPackArguments: true
+BinPackParameters: true
+BracedInitializerIndentWidth: 4
+ColumnLimit: 120
+Cpp11BracedListStyle: true
+IndentWidth: 4
+IndentWrappedFunctionNames: true
+PointerAlignment: Left
+SeparateDefinitionBlocks: Always
+Standard: c++17
+StatementMacros:
+  - 'MAKE_PreconditionOptimizer32bit1State'
+  - 'MAKE_PreconditionOptimizer32bit2State'
+  - 'MAKE_PreconditionStatic8bit1State'
+  - 'MAKE_PreconditionStatic8bit2State'
+  - 'MAKE_Optimizer32bit1State'
+  - 'MAKE_optimizerStatic8bit1State'
+  - 'MAKE_optimizerStatic8bit2State'
+  - 'MAKE_OptimizerStatic8bit1StateBlockwise'
+  - 'MAKE_OptimizerStatic8bit2StateBlockwise'
+  - 'MAKE_kQuantizeBlockwise'
+  - 'MAKE_BLOCKWISE8'
+  - 'MAKE_ELEMENTWISE_FUNC'
+  - 'CMAKE_ELEMENTWISE_FUNC'
+  - 'MAKE_FUNC8'
+  - 'MAKE_FUNC32'
+  - 'MAKE_CBLOCKWISE8'
+  - 'MAKE_CFUNC8'
+  - 'MAKE_CFUNC32'
+
+UseTab: Never
+
+...

.pre-commit-config.yaml

Lines changed: 6 additions & 0 deletions
@@ -21,3 +21,9 @@ repos:
     rev: v1.26.0
     hooks:
      - id: typos
+  - repo: https://github.com/pre-commit/mirrors-clang-format
+    rev: v20.1.6
+    hooks:
+      - id: clang-format
+        types_or: [c++, c, cuda]
+        files: ^csrc/
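
The hook only formats files under csrc/ with C, C++, or CUDA types. A rough local equivalent, as a sketch assuming clang-format is on PATH and the repo root is the working directory (so the new .clang-format is picked up automatically):

import pathlib
import subprocess

# Format the sources that the hook's `files: ^csrc/` filter would match.
for path in pathlib.Path("csrc").rglob("*"):
    if path.suffix in {".c", ".h", ".cpp", ".cu", ".cuh"}:
        subprocess.run(["clang-format", "-i", str(path)], check=True)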

bitsandbytes/backends/hpu/ops.py

Lines changed: 0 additions & 5 deletions
@@ -29,8 +29,6 @@ def _(
     if A.dtype != torch.uint8:
         A = A.view(torch.uint8)
 
-    transpose = False if len(A.shape) == 2 and A.shape[0] == 1 else True
-
     A = A.reshape(-1)
 
     if GAUDI_SW_VER and (GAUDI_SW_VER.major < 1 or GAUDI_SW_VER.minor < 22):
@@ -47,7 +45,4 @@ def _(
 
     output = out_dq.reshape(shape)
 
-    if transpose:
-        output = output.t()
-
     return output
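
With the conditional transpose removed, the op's output keeps the shape recorded at quantize time. A minimal sketch of the new tail of the op (dequantize_tail and the zero-filled input are illustrative stand-ins, not the real HPU kernel):

import torch

def dequantize_tail(out_dq: torch.Tensor, shape: tuple) -> torch.Tensor:
    # After this commit: restore the recorded shape directly, with no trailing .t().
    return out_dq.reshape(shape)

out = dequantize_tail(torch.zeros(12, dtype=torch.bfloat16), (3, 4))
assert out.shape == (3, 4)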

tests/helpers.py

Lines changed: 11 additions & 0 deletions
@@ -98,3 +98,14 @@ def id_formatter(label: str):
 
 def describe_dtype(dtype: torch.dtype) -> str:
     return DTYPE_NAMES.get(dtype) or str(dtype).rpartition(".")[2]
+
+
+def is_supported_on_hpu(
+    quant_type: str = "nf4", dtype: torch.dtype = torch.bfloat16, quant_storage: torch.dtype = torch.uint8
+) -> bool:
+    """
+    Check if the given quant_type, dtype and quant_storage are supported on HPU.
+    """
+    if quant_type == "fp4" or dtype == torch.float16 or quant_storage not in (torch.uint8, torch.bfloat16):
+        return False
+    return True
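
For reference, how the helper evaluates under its own rules. These assertions are illustrative, not part of the commit; they assume the script runs from the repo root so tests.helpers is importable:

import torch
from tests.helpers import is_supported_on_hpu

assert is_supported_on_hpu("nf4", torch.bfloat16, torch.uint8)        # the supported HPU path
assert not is_supported_on_hpu("fp4")                                 # fp4 is rejected
assert not is_supported_on_hpu("nf4", torch.float16)                  # float16 compute is rejected
assert not is_supported_on_hpu("nf4", torch.bfloat16, torch.float32)  # storage must be uint8 or bfloat16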

tests/test_autograd.py

Lines changed: 3 additions & 2 deletions
@@ -8,6 +8,7 @@
     describe_dtype,
     get_available_devices,
     id_formatter,
+    is_supported_on_hpu,
 )
 
 TRANSPOSE_VALS = [(False, True), (False, False)]
@@ -189,8 +190,8 @@ def test_matmul_4bit(
     if device == "cpu" and dtype != torch.float32 and any(req_grad) and torch.__version__ < (2, 6):
         pytest.xfail("mse_loss fp16 on CPU is not supported in torch < 2.6")
 
-    if device == "hpu" and quant_type != "nf4":
-        pytest.skip("HPU only supports nf4")
+    if device == "hpu" and not is_supported_on_hpu(quant_type, dtype):
+        pytest.skip("This configuration is not supported on HPU.")
 
     for i in range(3):
         # normal multiply

tests/test_functional.py

Lines changed: 12 additions & 10 deletions
@@ -16,6 +16,7 @@
     get_available_devices,
     get_test_dims,
     id_formatter,
+    is_supported_on_hpu,
 )
 
 torch.set_printoptions(precision=5, sci_mode=False, linewidth=120, edgeitems=20, threshold=10000)
@@ -1101,8 +1102,8 @@ class TestQuantize4BitFunctional:
     @pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
     @pytest.mark.parametrize("blocksize", [64, 128, 256, 512, 1024, 2048, 4096])
     def test_4bit_quant(self, device, dtype, quant_type, blocksize):
-        if device == "hpu" and quant_type != "nf4":
-            pytest.skip("fp4 dequantization is not supported on HPU")
+        if device == "hpu" and not is_supported_on_hpu(quant_type, dtype):
+            pytest.skip("This configuration is not supported on HPU.")
 
         A1 = torch.randn(1024, 1024, device=device, dtype=dtype)
         qa, SA = F.quantize_4bit(A1, blocksize=blocksize, quant_type=quant_type)
@@ -1135,14 +1136,15 @@ def test_4bit_quant(self, device, dtype, quant_type, blocksize):
     @pytest.mark.parametrize("device", get_available_devices())
     @pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
     @pytest.mark.parametrize("blocksize", [64, 128], ids=id_formatter("blocksize"))
-    def test_4bit_compressed_stats(self, device, quant_type, blocksize):
-        if device == "hpu" and quant_type != "nf4":
-            pytest.skip("fp4 dequantization is not supported on HPU")
+    @pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=describe_dtype)
+    def test_4bit_compressed_stats(self, device, quant_type, blocksize, dtype):
+        if device == "hpu" and not is_supported_on_hpu(quant_type, dtype):
+            pytest.skip("This configuration is not supported on HPU.")
 
         errs1 = []
         errs2 = []
         for i in range(10):
-            A1 = torch.randn(1024, 1024, device=device).half()
+            A1 = torch.randn(1024, 1024, device=device, dtype=dtype)
             q2, SA2 = F.quantize_4bit(A1, blocksize=blocksize, quant_type=quant_type)
             q3, SA3 = F.quantize_4bit(A1, blocksize=blocksize, compress_statistics=True, quant_type=quant_type)
             A2 = F.dequantize_4bit(q2, SA2, quant_type=quant_type)
@@ -1211,8 +1213,8 @@ def test_bench_4bit_dequant(self, quant_type):
     )
     @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim"))
     def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind):
-        if device == "hpu":
-            pytest.skip("gemv not supported on HPU")
+        if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage):
+            pytest.skip("This configuration is not supported on HPU.")
 
         errs1 = []
         errs2 = []
@@ -1363,8 +1365,8 @@ def test_gemv_eye_4bit(self, device, storage_type, dtype, double_quant):
         if device == "cpu" and dtype == torch.bfloat16 and torch.__version__ < (2, 3):
             pytest.skip("eye does not support bfloat16 on CPU in torch < 2.3")
 
-        if device == "hpu" and storage_type != "nf4":
-            pytest.skip("fp4 dequantization is not supported on HPU")
+        if device == "hpu" and not is_supported_on_hpu(storage_type, dtype):
+            pytest.skip("This configuration is not supported on HPU.")
 
         dims = 10
         torch.random.manual_seed(np.random.randint(0, 412424242))

tests/test_linear4bit.py

Lines changed: 22 additions & 3 deletions
@@ -13,6 +13,7 @@
     describe_dtype,
     get_available_devices,
     id_formatter,
+    is_supported_on_hpu,
     torch_load_from_buffer,
     torch_save_to_buffer,
 )
@@ -27,12 +28,17 @@
 
 @pytest.mark.parametrize("device", get_available_devices())
 @pytest.mark.parametrize("quant_storage", ["uint8", "float16", "bfloat16", "float32"])
+@pytest.mark.parametrize("original_dtype", [torch.float16, torch.bfloat16])
 @pytest.mark.parametrize("bias", TRUE_FALSE, ids=id_formatter("bias"))
 @pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
 @pytest.mark.parametrize("quant_type", ["nf4", "fp4"])
 @pytest.mark.parametrize("save_before_forward", TRUE_FALSE, ids=id_formatter("save_before_forward"))
-def test_linear_serialization(device, quant_type, compress_statistics, bias, quant_storage, save_before_forward):
-    original_dtype = torch.float16
+def test_linear_serialization(
+    device, quant_type, original_dtype, compress_statistics, bias, quant_storage, save_before_forward
+):
+    if device == "hpu" and not is_supported_on_hpu(quant_type, original_dtype, storage[quant_storage]):
+        pytest.skip("This configuration is not supported on HPU.")
+
     compute_dtype = None
     layer_shape = (300, 400)
 
@@ -188,6 +194,9 @@ def test_linear_serialization(device, quant_type, compress_statistics, bias, qua
 @pytest.mark.parametrize("blocksize", [64, 128])
 @pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
 def test_copy_param(device, quant_type, blocksize, compress_statistics):
+    if device == "hpu" and not is_supported_on_hpu(quant_type):
+        pytest.skip("This configuration is not supported on HPU.")
+
     tensor = torch.randn(300, 400)
     param = bnb.nn.Params4bit(
         data=tensor,
@@ -207,6 +216,9 @@ def test_copy_param(device, quant_type, blocksize, compress_statistics):
 @pytest.mark.parametrize("blocksize", [64, 128])
 @pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
 def test_deepcopy_param(device, quant_type, blocksize, compress_statistics):
+    if device == "hpu" and not is_supported_on_hpu(quant_type):
+        pytest.skip("This configuration is not supported on HPU.")
+
     tensor = torch.randn(300, 400)
     param = bnb.nn.Params4bit(
         data=tensor,
@@ -233,6 +245,9 @@ def test_deepcopy_param(device, quant_type, blocksize, compress_statistics):
 @pytest.mark.parametrize("blocksize", [64, 128])
 @pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
 def test_params4bit_real_serialization(device, quant_type, blocksize, compress_statistics):
+    if device == "hpu" and not is_supported_on_hpu(quant_type):
+        pytest.skip("This configuration is not supported on HPU.")
+
     original_tensor = torch.randn(300, 400)
     original_param = bnb.nn.Params4bit(
         data=original_tensor,
@@ -270,6 +285,9 @@ def test_params4bit_real_serialization(device, quant_type, blocksize, compress_s
 @pytest.mark.parametrize("mode", ["default", "reduce-overhead"], ids=id_formatter("mode"))
 @pytest.mark.skipif(torch.__version__ < (2, 4), reason="Not supported in torch < 2.4")
 def test_linear4bit_torch_compile(device, quant_type, compute_dtype, compress_statistics, bias, fullgraph, mode):
+    if device == "hpu" and not is_supported_on_hpu(quant_type):
+        pytest.skip("This configuration is not supported on HPU.")
+
     if fullgraph and torch.__version__ < (2, 8, 0, "dev"):
         pytest.skip("fullgraph mode requires torch 2.8 or higher")
 
@@ -317,7 +335,8 @@ def test_linear4bit_torch_compile(device, quant_type, compute_dtype, compress_st
     ref_output = net(x)
 
     # Compile the model
-    compiled_net = torch.compile(net, fullgraph=fullgraph, mode=mode)
+    compile_backend = "hpu_backend" if device == "hpu" else "inductor"
+    compiled_net = torch.compile(net, fullgraph=fullgraph, mode=mode, backend=compile_backend)
 
     # Get output from compiled model
     with torch.no_grad():
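
A self-contained sketch of the backend switch introduced above; compile_for_device is a hypothetical helper, and "hpu_backend" is only registered by Habana's PyTorch bridge, so everywhere else this falls back to inductor:

import torch

def compile_for_device(net: torch.nn.Module, device: str) -> torch.nn.Module:
    # Mirrors the test change: use Habana's compile backend only when targeting HPU.
    backend = "hpu_backend" if device == "hpu" else "inductor"
    return torch.compile(net, backend=backend)

net = torch.nn.Linear(4, 4)
compiled = compile_for_device(net, device="cpu")
_ = compiled(torch.randn(2, 4))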

tests/test_ops.py

Lines changed: 8 additions & 5 deletions
@@ -5,7 +5,7 @@
 
 import bitsandbytes
 from bitsandbytes.functional import ipex_xpu
-from tests.helpers import TRUE_FALSE, get_available_devices, id_formatter
+from tests.helpers import TRUE_FALSE, get_available_devices, id_formatter, is_supported_on_hpu
 
 # torch.library.opcheck is only available in torch 2.4 and later.
 # When testing with older versions, we will skip it as a no-op.
@@ -158,6 +158,9 @@ class Test4bitBlockwiseQuantOps:
     @pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
     @pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
     def test_quantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize):
+        if device == "hpu" and not is_supported_on_hpu(quant_type, dtype, storage_dtype):
+            pytest.skip("This configuration is not supported on HPU.")
+
         A = torch.randn(1024, 1024, dtype=dtype, device=device)
 
         out, absmax = torch.ops.bitsandbytes.quantize_4bit.default(A, blocksize, quant_type, storage_dtype)
@@ -179,8 +182,8 @@ def test_quantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize
     @pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
     @pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
     def test_dequantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize):
-        if device == "hpu" and quant_type != "nf4":
-            pytest.skip("fp4 dequantization is not supported on HPU")
+        if device == "hpu" and not is_supported_on_hpu(quant_type, dtype, storage_dtype):
+            pytest.skip("This configuration is not supported on HPU.")
 
         shape = (128, 128)
 
@@ -213,8 +216,8 @@ def test_dequantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksi
     @pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
     @pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
     def test_gemv_4bit(self, device, dtype, storage_dtype, quant_type, blocksize):
-        if device == "hpu":
-            pytest.skip("gemv not supported on HPU")
+        if device == "hpu" and not is_supported_on_hpu(quant_type, dtype, storage_dtype):
+            pytest.skip("This configuration is not supported on HPU.")
 
         out_features = 1024
         in_features = 256
