Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions bitsandbytes/backends/hpu/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,6 @@ def _(
if A.dtype != torch.uint8:
A = A.view(torch.uint8)

transpose = False if len(A.shape) == 2 and A.shape[0] == 1 else True

A = A.reshape(-1)

if GAUDI_SW_VER and (GAUDI_SW_VER.major < 1 or GAUDI_SW_VER.minor < 22):
Expand All @@ -47,7 +45,4 @@ def _(

output = out_dq.reshape(shape)

if transpose:
output = output.t()

return output
11 changes: 11 additions & 0 deletions tests/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,3 +98,14 @@ def id_formatter(label: str):

def describe_dtype(dtype: torch.dtype) -> str:
return DTYPE_NAMES.get(dtype) or str(dtype).rpartition(".")[2]


def is_supported_on_hpu(
    quant_type: str = "nf4", dtype: torch.dtype = torch.bfloat16, quant_storage: torch.dtype = torch.uint8
) -> bool:
    """
    Check if the given quant_type, dtype and quant_storage are supported on HPU.

    A configuration is reported as unsupported when any of the following holds:

    * ``quant_type`` is ``"fp4"``
    * ``dtype`` is ``torch.float16``
    * ``quant_storage`` is neither ``torch.uint8`` nor ``torch.bfloat16``

    Args:
        quant_type: Name of the 4-bit quantization format, e.g. ``"nf4"`` or ``"fp4"``.
        dtype: The dtype the test runs with.
        quant_storage: The dtype used to store the quantized data.

    Returns:
        ``True`` when none of the unsupported conditions applies, ``False`` otherwise.
    """
    # Single boolean expression instead of `if ...: return False / return True`;
    # this is the De Morgan complement of the "unsupported" condition.
    return quant_type != "fp4" and dtype != torch.float16 and quant_storage in (torch.uint8, torch.bfloat16)
4 changes: 4 additions & 0 deletions tests/test_autograd.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
describe_dtype,
get_available_devices,
id_formatter,
is_supported_on_hpu,
)

TRANSPOSE_VALS = [(False, True), (False, False)]
Expand Down Expand Up @@ -189,6 +190,9 @@ def test_matmul_4bit(
if device == "cpu" and dtype != torch.float32 and any(req_grad) and torch.__version__ < (2, 6):
pytest.xfail("mse_loss fp16 on CPU is not supported in torch < 2.6")

if device == "hpu" and not is_supported_on_hpu(quant_type, dtype):
pytest.skip("This configuration is not supported on HPU.")

for i in range(3):
# normal multiply
if funcs[0] in [torch.mm, torch.matmul]:
Expand Down
18 changes: 16 additions & 2 deletions tests/test_functional.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
get_available_devices,
get_test_dims,
id_formatter,
is_supported_on_hpu,
)

torch.set_printoptions(precision=5, sci_mode=False, linewidth=120, edgeitems=20, threshold=10000)
Expand Down Expand Up @@ -1101,6 +1102,9 @@ class TestQuantize4BitFunctional:
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
@pytest.mark.parametrize("blocksize", [64, 128, 256, 512, 1024, 2048, 4096])
def test_4bit_quant(self, device, dtype, quant_type, blocksize):
if device == "hpu" and not is_supported_on_hpu(quant_type, dtype):
pytest.skip("This configuration is not supported on HPU.")

A1 = torch.randn(1024, 1024, device=device, dtype=dtype)
qa, SA = F.quantize_4bit(A1, blocksize=blocksize, quant_type=quant_type)
A2 = F.dequantize_4bit(qa, SA, blocksize=blocksize, quant_type=quant_type)
Expand Down Expand Up @@ -1132,11 +1136,15 @@ def test_4bit_quant(self, device, dtype, quant_type, blocksize):
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
@pytest.mark.parametrize("blocksize", [64, 128], ids=id_formatter("blocksize"))
def test_4bit_compressed_stats(self, device, quant_type, blocksize):
@pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=describe_dtype)
def test_4bit_compressed_stats(self, device, quant_type, blocksize, dtype):
if device == "hpu" and not is_supported_on_hpu(quant_type, dtype):
pytest.skip("FP4 quantization is not supported on HPU.")

errs1 = []
errs2 = []
for i in range(10):
A1 = torch.randn(1024, 1024, device=device).half()
A1 = torch.randn(1024, 1024, device=device, dtype=dtype)
q2, SA2 = F.quantize_4bit(A1, blocksize=blocksize, quant_type=quant_type)
q3, SA3 = F.quantize_4bit(A1, blocksize=blocksize, compress_statistics=True, quant_type=quant_type)
A2 = F.dequantize_4bit(q2, SA2, quant_type=quant_type)
Expand Down Expand Up @@ -1205,6 +1213,9 @@ def test_bench_4bit_dequant(self, quant_type):
)
@pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim"))
def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind):
if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage):
pytest.skip("This configuration is not supported on HPU.")

errs1 = []
errs2 = []
errs3 = []
Expand Down Expand Up @@ -1354,6 +1365,9 @@ def test_gemv_eye_4bit(self, device, storage_type, dtype, double_quant):
if device == "cpu" and dtype == torch.bfloat16 and torch.__version__ < (2, 3):
pytest.skip("eye doe not support bfloat16 on CPU in torch < 2.3")

if device == "hpu" and not is_supported_on_hpu(storage_type, dtype):
pytest.skip("This configuration is not supported on HPU.")

dims = 10
torch.random.manual_seed(np.random.randint(0, 412424242))
dims = get_test_dims(0, 8192, n=dims)
Expand Down
25 changes: 22 additions & 3 deletions tests/test_linear4bit.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
describe_dtype,
get_available_devices,
id_formatter,
is_supported_on_hpu,
torch_load_from_buffer,
torch_save_to_buffer,
)
Expand All @@ -27,12 +28,17 @@

@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("quant_storage", ["uint8", "float16", "bfloat16", "float32"])
@pytest.mark.parametrize("original_dtype", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("bias", TRUE_FALSE, ids=id_formatter("bias"))
@pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
@pytest.mark.parametrize("quant_type", ["nf4", "fp4"])
@pytest.mark.parametrize("save_before_forward", TRUE_FALSE, ids=id_formatter("save_before_forward"))
def test_linear_serialization(device, quant_type, compress_statistics, bias, quant_storage, save_before_forward):
original_dtype = torch.float16
def test_linear_serialization(
device, quant_type, original_dtype, compress_statistics, bias, quant_storage, save_before_forward
):
if device == "hpu" and not is_supported_on_hpu(quant_type, original_dtype, storage[quant_storage]):
pytest.skip("This configuration is not supported on HPU.")

compute_dtype = None
layer_shape = (300, 400)

Expand Down Expand Up @@ -188,6 +194,9 @@ def test_linear_serialization(device, quant_type, compress_statistics, bias, qua
@pytest.mark.parametrize("blocksize", [64, 128])
@pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
def test_copy_param(device, quant_type, blocksize, compress_statistics):
if device == "hpu" and not is_supported_on_hpu(quant_type):
pytest.skip("This configuration is not supported on HPU.")

tensor = torch.randn(300, 400)
param = bnb.nn.Params4bit(
data=tensor,
Expand All @@ -207,6 +216,9 @@ def test_copy_param(device, quant_type, blocksize, compress_statistics):
@pytest.mark.parametrize("blocksize", [64, 128])
@pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
def test_deepcopy_param(device, quant_type, blocksize, compress_statistics):
if device == "hpu" and not is_supported_on_hpu(quant_type):
pytest.skip("This configuration is not supported on HPU.")

tensor = torch.randn(300, 400)
param = bnb.nn.Params4bit(
data=tensor,
Expand All @@ -233,6 +245,9 @@ def test_deepcopy_param(device, quant_type, blocksize, compress_statistics):
@pytest.mark.parametrize("blocksize", [64, 128])
@pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
def test_params4bit_real_serialization(device, quant_type, blocksize, compress_statistics):
if device == "hpu" and not is_supported_on_hpu(quant_type):
pytest.skip("This configuration is not supported on HPU.")

original_tensor = torch.randn(300, 400)
original_param = bnb.nn.Params4bit(
data=original_tensor,
Expand Down Expand Up @@ -270,6 +285,9 @@ def test_params4bit_real_serialization(device, quant_type, blocksize, compress_s
@pytest.mark.parametrize("mode", ["default", "reduce-overhead"], ids=id_formatter("mode"))
@pytest.mark.skipif(torch.__version__ < (2, 4), reason="Not supported in torch < 2.4")
def test_linear4bit_torch_compile(device, quant_type, compute_dtype, compress_statistics, bias, fullgraph, mode):
if device == "hpu" and not is_supported_on_hpu(quant_type):
pytest.skip("This configuration is not supported on HPU.")

if fullgraph and torch.__version__ < (2, 8, 0, "dev"):
pytest.skip("fullgraph mode requires torch 2.8 or higher")

Expand Down Expand Up @@ -314,7 +332,8 @@ def test_linear4bit_torch_compile(device, quant_type, compute_dtype, compress_st
ref_output = net(x)

# Compile the model
compiled_net = torch.compile(net, fullgraph=fullgraph, mode=mode)
compile_backend = "hpu_backend" if device == "hpu" else "inductor"
compiled_net = torch.compile(net, fullgraph=fullgraph, mode=mode, backend=compile_backend)

# Get output from compiled model
with torch.no_grad():
Expand Down
11 changes: 10 additions & 1 deletion tests/test_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import bitsandbytes
from bitsandbytes.functional import ipex_xpu
from tests.helpers import TRUE_FALSE, get_available_devices, id_formatter
from tests.helpers import TRUE_FALSE, get_available_devices, id_formatter, is_supported_on_hpu

# torch.library.opcheck is only available in torch 2.4 and later.
# When testing with older versions, we will skip it as a no-op.
Expand Down Expand Up @@ -158,6 +158,9 @@ class Test4bitBlockwiseQuantOps:
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
@pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
def test_quantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize):
if device == "hpu" and not is_supported_on_hpu(quant_type, dtype, storage_dtype):
pytest.skip("This configuration is not supported on HPU.")

A = torch.randn(1024, 1024, dtype=dtype, device=device)

out, absmax = torch.ops.bitsandbytes.quantize_4bit.default(A, blocksize, quant_type, storage_dtype)
Expand All @@ -179,6 +182,9 @@ def test_quantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
@pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
def test_dequantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize):
if device == "hpu" and not is_supported_on_hpu(quant_type, dtype, storage_dtype):
pytest.skip("This configuration is not supported on HPU.")

shape = (128, 128)

n = prod(shape)
Expand Down Expand Up @@ -210,6 +216,9 @@ def test_dequantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksi
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
@pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
def test_gemv_4bit(self, device, dtype, storage_dtype, quant_type, blocksize):
if device == "hpu" and not is_supported_on_hpu(quant_type, dtype, storage_dtype):
pytest.skip("This configuration is not supported on HPU.")

out_features = 1024
in_features = 256

Expand Down