 import torch
 
 import bitsandbytes as bnb
-from tests.helpers import TRUE_FALSE, torch_load_from_buffer, torch_save_to_buffer
+from tests.helpers import TRUE_FALSE, get_available_devices, id_formatter, torch_load_from_buffer, torch_save_to_buffer
 
 storage = {
     "uint8": torch.uint8,
     "float16": torch.float16,
     "bfloat16": torch.bfloat16,
     "float32": torch.float32,
 }
 
 
+@pytest.mark.parametrize("device", get_available_devices())
 @pytest.mark.parametrize("quant_storage", ["uint8", "float16", "bfloat16", "float32"])
-@pytest.mark.parametrize("bias", TRUE_FALSE)
-@pytest.mark.parametrize("compress_statistics", TRUE_FALSE)
+@pytest.mark.parametrize("bias", TRUE_FALSE, ids=id_formatter("bias"))
+@pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
 @pytest.mark.parametrize("quant_type", ["nf4", "fp4"])
-@pytest.mark.parametrize("save_before_forward", TRUE_FALSE)
-def test_linear_serialization(quant_type, compress_statistics, bias, quant_storage, save_before_forward):
+@pytest.mark.parametrize("save_before_forward", TRUE_FALSE, ids=id_formatter("save_before_forward"))
+def test_linear_serialization(device, quant_type, compress_statistics, bias, quant_storage, save_before_forward):
+    if device == "cpu":
+        pytest.xfail("Dequantization is not yet implemented for CPU")
+
     original_dtype = torch.float16
     compute_dtype = None
-    device = "cuda"
     layer_shape = (300, 400)
 
     linear = torch.nn.Linear(*layer_shape, dtype=original_dtype, device="cpu")  # original layer
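For orientation, the tests.helpers utilities imported above are assumed to behave roughly like this sketch (the real definitions live in tests/helpers.py):

    import torch

    TRUE_FALSE = (True, False)

    def get_available_devices():
        # "cpu" is always offered; accelerator devices are appended when visible.
        devices = ["cpu"]
        if torch.cuda.is_available():
            devices.append("cuda")
        return devices

    def id_formatter(label):
        # Produces readable pytest ids such as "bias=True" / "bias=False".
        return lambda value: f"{label}={value}"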
@@ -52,7 +55,7 @@ def test_linear_serialization(quant_type, compress_statistics, bias, quant_stora
     # restoring from state_dict:
     bias_data2 = sd.pop("bias", None)
     weight_data2 = sd.pop("weight")
-    weight2 = bnb.nn.Params4bit.from_prequantized(quantized_stats=sd, data=weight_data2)
+    weight2 = bnb.nn.Params4bit.from_prequantized(quantized_stats=sd, data=weight_data2, device=device)
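+    # device= restores the prequantized weights directly onto the parametrized test device.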
 
     # creating new layer with same params:
     linear_q2 = bnb.nn.Linear4bit(
@@ -174,18 +177,50 @@ def test_linear_serialization(quant_type, compress_statistics, bias, quant_stora
     assert size_ratio < target_compression, ratio_error_msg
 
 
-def test_copy_param():
-    tensor = torch.tensor([1.0, 2.0, 3.0, 4.0])
-    param = bnb.nn.Params4bit(data=tensor, requires_grad=False).cuda(0)
+@pytest.mark.parametrize("device", get_available_devices())
+@pytest.mark.parametrize("quant_type", ["nf4", "fp4"])
+@pytest.mark.parametrize("blocksize", [64, 128])
+@pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
+def test_copy_param(device, quant_type, blocksize, compress_statistics):
+    if device == "cpu":
+        if compress_statistics:
+            pytest.skip("Currently segfaults on CPU")
+        if quant_type == "fp4":
+            pytest.xfail("FP4 not supported on CPU")
+
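+    # linspace fills exactly one quantization block with distinct values, exercising each blocksize end to end.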
+    tensor = torch.linspace(1, blocksize, blocksize)
+    param = bnb.nn.Params4bit(
+        data=tensor,
+        quant_type=quant_type,
+        blocksize=blocksize,
+        compress_statistics=compress_statistics,
+        requires_grad=False,
+    ).to(device)
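+    # The .to(device) call above is what triggers the actual 4-bit quantization of the data.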
 
     shallow_copy_param = copy.copy(param)
     assert param.quant_state is shallow_copy_param.quant_state
     assert param.data.data_ptr() == shallow_copy_param.data.data_ptr()
 
 
-def test_deepcopy_param():
-    tensor = torch.tensor([1.0, 2.0, 3.0, 4.0])
-    param = bnb.nn.Params4bit(data=tensor, requires_grad=False).cuda(0)
+@pytest.mark.parametrize("device", get_available_devices())
+@pytest.mark.parametrize("quant_type", ["nf4", "fp4"])
+@pytest.mark.parametrize("blocksize", [64, 128])
+@pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
+def test_deepcopy_param(device, quant_type, blocksize, compress_statistics):
+    if device == "cpu":
+        if compress_statistics:
+            pytest.skip("Currently segfaults on CPU")
+        if quant_type == "fp4":
+            pytest.xfail("FP4 not supported on CPU")
+
+    tensor = torch.linspace(1, blocksize, blocksize)
+    param = bnb.nn.Params4bit(
+        data=tensor,
+        quant_type=quant_type,
+        blocksize=blocksize,
+        compress_statistics=compress_statistics,
+        requires_grad=False,
+    ).to(device)
     dict_keys_before = set(param.__dict__.keys())
     copy_param = copy.deepcopy(param)
     dict_keys_after = set(param.__dict__.keys())
@@ -199,12 +234,27 @@ def test_deepcopy_param():
     assert dict_keys_before == dict_keys_copy
 
 
-def test_params4bit_real_serialization():
-    original_tensor = torch.tensor([1.0, 2.0, 3.0, 4.0], dtype=torch.float32)
-    original_param = bnb.nn.Params4bit(data=original_tensor, quant_type="fp4")
+@pytest.mark.parametrize("device", get_available_devices())
+@pytest.mark.parametrize("quant_type", ["nf4", "fp4"])
+@pytest.mark.parametrize("blocksize", [64, 128])
+@pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
+def test_params4bit_real_serialization(device, quant_type, blocksize, compress_statistics):
+    if device == "cpu":
+        if compress_statistics:
+            pytest.skip("Currently segfaults on CPU")
+        if quant_type == "fp4":
+            pytest.xfail("FP4 not supported on CPU")
+
+    original_tensor = torch.linspace(1, blocksize, blocksize, dtype=torch.float32)
+    original_param = bnb.nn.Params4bit(
+        data=original_tensor,
+        quant_type=quant_type,
+        blocksize=blocksize,
+        compress_statistics=compress_statistics,
+    )
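+    # Constructed on CPU, the data stays unquantized until the device move below.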
     dict_keys_before = set(original_param.__dict__.keys())
 
-    original_param.cuda(0)  # move to CUDA to trigger quantization
+    original_param.to(device)  # change device to trigger quantization
 
     serialized_param = pickle.dumps(original_param)
     deserialized_param = pickle.loads(serialized_param)
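Taken together, these tests boil down to a quantize/serialize round trip along these lines (a minimal sketch, assuming a CUDA device and illustrative sizes):

    import pickle
    import torch
    import bitsandbytes as bnb

    # One quantization block of distinct values, as in the tests above.
    tensor = torch.linspace(1, 64, 64, dtype=torch.float32)
    param = bnb.nn.Params4bit(data=tensor, quant_type="nf4", blocksize=64, requires_grad=False)
    param = param.to("cuda")  # the device move triggers the 4-bit quantization

    # The pickle round trip should preserve both the packed data and the quant state.
    restored = pickle.loads(pickle.dumps(param))
    assert restored.quant_state is not None
    assert torch.equal(restored.data.cpu(), param.data.cpu())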