Commit 24150d0
TST Enable BNB tests on XPU (#2396)
1 parent 461f642 commit 24150d0

8 files changed: 97 additions, 105 deletions

src/peft/utils/integrations.py

Lines changed: 7 additions & 2 deletions

@@ -23,6 +23,8 @@
 import transformers
 from torch import nn
 
+from peft.import_utils import is_xpu_available
+
 
 @contextmanager
 def gather_params_ctx(param, modifier_rank: int = 0, fwd_module: torch.nn.Module = None):
@@ -90,8 +92,11 @@ def dequantize_bnb_weight(weight: torch.nn.Parameter, state=None):
     # BNB requires CUDA weights
     device = weight.device
     is_cpu = device.type == torch.device("cpu").type
-    if is_cpu and torch.cuda.is_available():
-        weight = weight.to(torch.device("cuda"))
+    if is_cpu:
+        if torch.cuda.is_available():
+            weight = weight.to(torch.device("cuda"))
+        elif is_xpu_available():
+            weight = weight.to(torch.device("xpu"))
 
     cls_name = weight.__class__.__name__
     if cls_name == "Params4bit":
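
The change to dequantize_bnb_weight moves CPU-resident weights onto whichever accelerator is present before bitsandbytes dequantizes them ("BNB requires CUDA weights", per the existing comment). A minimal sketch of that fallback pattern in isolation; the helper name _move_to_accelerator is illustrative and not part of the commit:

import torch

from peft.import_utils import is_xpu_available


def _move_to_accelerator(weight: torch.Tensor) -> torch.Tensor:
    # bitsandbytes kernels expect the weight on an accelerator, not on CPU.
    if weight.device.type == "cpu":
        if torch.cuda.is_available():
            weight = weight.to(torch.device("cuda"))
        elif is_xpu_available():
            weight = weight.to(torch.device("xpu"))
    return weight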

src/peft/utils/loftq_utils.py

Lines changed: 6 additions & 6 deletions

@@ -22,13 +22,14 @@
 from typing import Callable, Optional, Union
 
 import torch
+from accelerate.utils.memory import clear_device_cache
 from huggingface_hub import snapshot_download
 from huggingface_hub.errors import HFValidationError, LocalEntryNotFoundError
 from safetensors import SafetensorError, safe_open
 from transformers.utils import cached_file
 from transformers.utils.hub import get_checkpoint_shard_files
 
-from peft.import_utils import is_bnb_4bit_available, is_bnb_available
+from peft.import_utils import is_bnb_4bit_available, is_bnb_available, is_xpu_available
 
 
 class NFQuantizer:
@@ -201,20 +202,19 @@ def loftq_init(weight: Union[torch.Tensor, torch.nn.Parameter], num_bits: int, r
     out_feature, in_feature = weight.size()
     device = weight.device
     dtype = weight.dtype
-
     logging.info(
         f"Weight: ({out_feature}, {in_feature}) | Rank: {reduced_rank} | Num Iter: {num_iter} | Num Bits: {num_bits}"
     )
     if not is_bnb_4bit_available() or num_bits in [2, 8]:
         quantizer = NFQuantizer(num_bits=num_bits, device=device, method="normal", block_size=64)
         compute_device = device
     else:
-        compute_device = "cuda"
+        compute_device = "xpu" if is_xpu_available() else "cuda"
 
     weight = weight.to(device=compute_device, dtype=torch.float32)
     res = weight.clone()
     for i in range(num_iter):
-        torch.cuda.empty_cache()
+        clear_device_cache()
         # Quantization
         if num_bits == 4 and is_bnb_4bit_available():
             qweight = bnb.nn.Params4bit(
@@ -246,12 +246,12 @@ def _loftq_init_new(qweight, weight, num_bits: int, reduced_rank: int):
     if not is_bnb_4bit_available():
         raise ValueError("bitsandbytes 4bit quantization is not available.")
 
-    compute_device = "cuda"
+    compute_device = "xpu" if is_xpu_available() else "cuda"
     dequantized_weight = bnb.functional.dequantize_4bit(qweight.data, qweight.quant_state)
 
     weight = weight.to(device=compute_device, dtype=torch.float32)
     residual = weight - dequantized_weight
-    torch.cuda.empty_cache()
+    clear_device_cache()
     # Decompose the residual by SVD
     output = _low_rank_decomposition(residual, reduced_rank=reduced_rank)
     L, R, reduced_rank = output["L"], output["R"], output["reduced_rank"]
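
Two device-agnostic substitutions carry this file: the LoftQ compute device falls back to "xpu" when an XPU is available, and torch.cuda.empty_cache() is replaced by Accelerate's clear_device_cache(), which frees cached memory on whichever backend is active. A small sketch of the pattern, assuming torch, accelerate, and peft are installed; pick_compute_device is an illustrative name, not from the commit:

import torch

from accelerate.utils.memory import clear_device_cache
from peft.import_utils import is_xpu_available


def pick_compute_device() -> str:
    # Mirror the selection in loftq_init / _loftq_init_new above:
    # prefer XPU when present, otherwise keep the old CUDA default.
    return "xpu" if is_xpu_available() else "cuda"


# Between heavy iterations, free cached blocks on the active accelerator
# instead of hard-coding torch.cuda.empty_cache():
clear_device_cache()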

tests/bnb/test_bnb_regression.py

Lines changed: 12 additions & 10 deletions

@@ -27,10 +27,12 @@
 import torch
 from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, BitsAndBytesConfig
 
+from peft.import_utils import is_xpu_available
+
 
 bnb = pytest.importorskip("bitsandbytes")
 
-device = torch.device("cuda")
+device = torch.device("xpu") if is_xpu_available() else torch.device("cuda")
 
 
 def bytes_from_tensor(x):
@@ -47,7 +49,7 @@ def bytes_from_tensor(x):
 ############
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device available.")
+@pytest.mark.skipif(not (torch.cuda.is_available() or is_xpu_available()), reason="No CUDA device available.")
 def test_opt_350m_4bit():
     torch.manual_seed(0)
     bnb_config = BitsAndBytesConfig(
@@ -70,7 +72,7 @@ def test_opt_350m_4bit():
     torch.testing.assert_allclose(output, expected)
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device available.")
+@pytest.mark.skipif(not (torch.cuda.is_available() or is_xpu_available()), reason="No CUDA device available.")
 def test_opt_350m_8bit():
     torch.manual_seed(0)
     bnb_config = BitsAndBytesConfig(load_in_8bit=True)
@@ -89,7 +91,7 @@ def test_opt_350m_8bit():
     torch.testing.assert_allclose(output, expected)
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device available.")
+@pytest.mark.skipif(not (torch.cuda.is_available() or is_xpu_available()), reason="No CUDA device available.")
 def test_opt_350m_4bit_double_quant():
     torch.manual_seed(0)
     bnb_config = BitsAndBytesConfig(
@@ -112,7 +114,7 @@ def test_opt_350m_4bit_double_quant():
     torch.testing.assert_allclose(output, expected)
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device available.")
+@pytest.mark.skipif(not (torch.cuda.is_available() or is_xpu_available()), reason="No CUDA device available.")
 def test_opt_350m_4bit_compute_dtype_float16():
     torch.manual_seed(0)
     bnb_config = BitsAndBytesConfig(
@@ -135,7 +137,7 @@ def test_opt_350m_4bit_compute_dtype_float16():
     torch.testing.assert_allclose(output, expected)
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device available.")
+@pytest.mark.skipif(not (torch.cuda.is_available() or is_xpu_available()), reason="No CUDA device available.")
 def test_opt_350m_4bit_quant_type_nf4():
     torch.manual_seed(0)
     bnb_config = BitsAndBytesConfig(
@@ -159,7 +161,7 @@ def test_opt_350m_4bit_quant_type_nf4():
     torch.testing.assert_allclose(output, expected)
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device available.")
+@pytest.mark.skipif(not (torch.cuda.is_available() or is_xpu_available()), reason="No CUDA device available.")
 def test_opt_350m_4bit_quant_storage():
     # note: using torch.float32 instead of the default torch.uint8 does not seem to affect the result
     torch.manual_seed(0)
@@ -184,7 +186,7 @@ def test_opt_350m_4bit_quant_storage():
     torch.testing.assert_allclose(output, expected)
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device available.")
+@pytest.mark.skipif(not (torch.cuda.is_available() or is_xpu_available()), reason="No CUDA device available.")
 def test_opt_350m_8bit_threshold():
     torch.manual_seed(0)
     bnb_config = BitsAndBytesConfig(
@@ -211,7 +213,7 @@ def test_opt_350m_8bit_threshold():
 ###########
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device available.")
+@pytest.mark.skipif(not (torch.cuda.is_available() or is_xpu_available()), reason="No CUDA device available.")
 def test_flan_t5_4bit():
     torch.manual_seed(0)
     bnb_config = BitsAndBytesConfig(
@@ -235,7 +237,7 @@ def test_flan_t5_4bit():
     torch.testing.assert_allclose(output, expected)
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device available.")
+@pytest.mark.skipif(not (torch.cuda.is_available() or is_xpu_available()), reason="No CUDA device available.")
 @pytest.mark.xfail  # might not be reproducible depending on hardware
 def test_flan_t5_8bit():
     torch.manual_seed(0)
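
The regression tests now share a single module-level device and gate every test on either backend being present. A minimal sketch of that gating pattern, assuming a pytest module; the reusable marker and the test below are illustrative, while the commit keeps the skipif condition inline on each test:

import pytest
import torch

from peft.import_utils import is_xpu_available


# One source of truth for the accelerator used by every test in the module.
device = torch.device("xpu") if is_xpu_available() else torch.device("cuda")

# Factoring the condition into a reusable marker avoids repeating it per test.
requires_accelerator = pytest.mark.skipif(
    not (torch.cuda.is_available() or is_xpu_available()),
    reason="No CUDA or XPU device available.",
)


@requires_accelerator
def test_device_accepts_tensors():
    # Placeholder body: the selected device can hold a tensor.
    x = torch.ones(2, 2, device=device)
    assert x.device.type in ("cuda", "xpu")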

tests/test_common_gpu.py

Lines changed: 14 additions & 15 deletions

@@ -63,7 +63,6 @@
     require_multi_accelerator,
     require_non_cpu,
     require_torch_gpu,
-    require_torch_multi_gpu,
 )
 
 
@@ -594,7 +593,7 @@ def test_lora_causal_lm_multi_gpu_inference(self):
         # this should work without any problem
         _ = model.generate(input_ids=input_ids)
 
-    @require_torch_multi_gpu
+    @require_multi_accelerator
     @pytest.mark.multi_gpu_tests
     @require_bitsandbytes
     def test_lora_seq2seq_lm_multi_gpu_inference(self):
@@ -622,7 +621,7 @@ def test_lora_seq2seq_lm_multi_gpu_inference(self):
         # this should work without any problem
         _ = model.generate(input_ids=input_ids)
 
-    @require_torch_multi_gpu
+    @require_multi_accelerator
     @pytest.mark.multi_gpu_tests
     @require_bitsandbytes
     def test_adaption_prompt_8bit(self):
@@ -645,7 +644,7 @@ def test_adaption_prompt_8bit(self):
         random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device)
         _ = model(random_input)
 
-    @require_torch_multi_gpu
+    @require_multi_accelerator
     @pytest.mark.multi_gpu_tests
     @require_bitsandbytes
     def test_adaption_prompt_4bit(self):
@@ -668,7 +667,7 @@ def test_adaption_prompt_4bit(self):
         random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device)
         _ = model(random_input)
 
-    @require_torch_gpu
+    @require_non_cpu
     @pytest.mark.single_gpu_tests
     @require_bitsandbytes
     def test_print_4bit_expected(self):
@@ -778,7 +777,7 @@ def test_8bit_merge_lora(self):
         assert isinstance(model.base_model.model.model.decoder.layers[0].self_attn.q_proj, bnb.nn.Linear8bitLt)
         assert isinstance(model.base_model.model.model.decoder.layers[0].self_attn.v_proj, bnb.nn.Linear8bitLt)
 
-    @require_torch_gpu
+    @require_non_cpu
     @pytest.mark.single_gpu_tests
     @require_bitsandbytes
     def test_8bit_merge_and_disable_lora(self):
@@ -814,7 +813,7 @@ def test_8bit_merge_and_disable_lora(self):
         assert isinstance(model.base_model.model.model.decoder.layers[0].self_attn.q_proj, LoraLinear8bitLt)
         assert isinstance(model.base_model.model.model.decoder.layers[0].self_attn.v_proj, LoraLinear8bitLt)
 
-    @require_torch_gpu
+    @require_non_cpu
     @pytest.mark.single_gpu_tests
     @require_bitsandbytes
     def test_8bit_merge_lora_with_bias(self):
@@ -846,7 +845,7 @@ def test_8bit_merge_lora_with_bias(self):
         assert not torch.allclose(out_base, out_before_merge, atol=atol, rtol=rtol)
         assert torch.allclose(out_before_merge, out_after_merge, atol=atol, rtol=rtol)
 
-    @require_torch_gpu
+    @require_non_cpu
     @pytest.mark.single_gpu_tests
     @require_bitsandbytes
     def test_4bit_merge_lora(self):
@@ -888,7 +887,7 @@ def test_4bit_merge_lora(self):
         assert isinstance(model.base_model.model.model.decoder.layers[0].self_attn.q_proj, bnb.nn.Linear4bit)
         assert isinstance(model.base_model.model.model.decoder.layers[0].self_attn.v_proj, bnb.nn.Linear4bit)
 
-    @require_torch_gpu
+    @require_non_cpu
     @pytest.mark.single_gpu_tests
     @require_bitsandbytes
     def test_4bit_merge_and_disable_lora(self):
@@ -930,7 +929,7 @@ def test_4bit_merge_and_disable_lora(self):
         assert isinstance(model.base_model.model.model.decoder.layers[0].self_attn.q_proj, LoraLinear4bit)
         assert isinstance(model.base_model.model.model.decoder.layers[0].self_attn.v_proj, LoraLinear4bit)
 
-    @require_torch_gpu
+    @require_non_cpu
     @pytest.mark.single_gpu_tests
     @require_bitsandbytes
     def test_4bit_merge_lora_with_bias(self):
@@ -971,7 +970,7 @@ def test_4bit_merge_lora_with_bias(self):
         assert not torch.allclose(out_base, out_before_merge, atol=atol, rtol=rtol)
         assert torch.allclose(out_before_merge, out_after_merge, atol=atol, rtol=rtol)
 
-    @require_torch_gpu
+    @require_non_cpu
     @pytest.mark.single_gpu_tests
     @require_bitsandbytes
     def test_4bit_lora_mixed_adapter_batches_lora(self):
@@ -1042,7 +1041,7 @@ def test_4bit_lora_mixed_adapter_batches_lora(self):
         assert torch.allclose(out_adapter0[1::3], out_mixed[1::3], atol=atol, rtol=rtol)
         assert torch.allclose(out_adapter1[2::3], out_mixed[2::3], atol=atol, rtol=rtol)
 
-    @require_torch_gpu
+    @require_non_cpu
     @pytest.mark.single_gpu_tests
     @require_bitsandbytes
     def test_8bit_lora_mixed_adapter_batches_lora(self):
@@ -1124,7 +1123,7 @@ def test_serialization_shared_tensors(self):
         with tempfile.TemporaryDirectory() as tmp_dir:
             model.save_pretrained(tmp_dir, safe_serialization=True)
 
-    @require_torch_gpu
+    @require_non_cpu
     @pytest.mark.single_gpu_tests
     @require_bitsandbytes
     def test_4bit_dora_inference(self):
@@ -1163,7 +1162,7 @@ def test_4bit_dora_inference(self):
         assert isinstance(model.base_model.model.model.decoder.layers[0].self_attn.q_proj, LoraLinear4bit)
         assert isinstance(model.base_model.model.model.decoder.layers[0].self_attn.v_proj, LoraLinear4bit)
 
-    @require_torch_gpu
+    @require_non_cpu
     @pytest.mark.single_gpu_tests
     @require_bitsandbytes
     def test_8bit_dora_inference(self):
@@ -1197,7 +1196,7 @@ def test_8bit_dora_inference(self):
         assert isinstance(model.base_model.model.model.decoder.layers[0].self_attn.q_proj, LoraLinear8bitLt)
         assert isinstance(model.base_model.model.model.decoder.layers[0].self_attn.v_proj, LoraLinear8bitLt)
 
-    @require_torch_gpu
+    @require_non_cpu
     @pytest.mark.single_gpu_tests
     @require_bitsandbytes
     def test_4bit_dora_merging(self):
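
Here the CUDA-specific decorators give way to the accelerator-neutral require_non_cpu and require_multi_accelerator, which this file already imports from the test utilities. The following is only a hedged sketch of how such markers can be built with pytest; it is not the actual definition in peft's testing utilities, and it assumes only CUDA and XPU backends:

import pytest
import torch

from peft.import_utils import is_xpu_available


def _accelerator_count() -> int:
    # Assumption for this sketch: only CUDA and XPU backends are considered.
    if torch.cuda.is_available():
        return torch.cuda.device_count()
    if is_xpu_available():
        return torch.xpu.device_count()
    return 0


# Skip unless at least one non-CPU device exists (GPU or XPU).
require_non_cpu_sketch = pytest.mark.skipif(
    _accelerator_count() < 1, reason="test requires an accelerator"
)

# Skip unless at least two accelerators exist, for multi-device tests.
require_multi_accelerator_sketch = pytest.mark.skipif(
    _accelerator_count() < 2, reason="test requires multiple accelerators"
)

Applied as @require_non_cpu_sketch on a test, a marker like this skips cleanly on CPU-only runners while letting the same test run unchanged on both CUDA and XPU machines, which is the point of the decorator swap in this file.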
