 )
 from diffusers.models.attention_processor import Attention
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
+    backend_synchronize,
     enable_full_determinism,
     is_torch_available,
     is_torchao_available,
     nightly,
     numpy_cosine_similarity_distance,
     require_torch,
+    require_torch_accelerator,
     require_torch_gpu,
     require_torchao_version_greater_or_equal,
     slow,
@@ -61,7 +64,7 @@


 @require_torch
-@require_torch_gpu
+@require_torch_accelerator
 @require_torchao_version_greater_or_equal("0.7.0")
 class TorchAoConfigTest(unittest.TestCase):
     def test_to_dict(self):
@@ -119,12 +122,12 @@ def test_repr(self):

 # Slices for these tests have been obtained on our aws-g6e-xlarge-plus runners
 @require_torch
-@require_torch_gpu
+@require_torch_accelerator
 @require_torchao_version_greater_or_equal("0.7.0")
 class TorchAoTest(unittest.TestCase):
     def tearDown(self):
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_dummy_components(
         self, quantization_config: TorchAoConfig, model_id: str = "hf-internal-testing/tiny-flux-pipe"
@@ -518,14 +521,14 @@ def test_sequential_cpu_offload(self):

 # Slices for these tests have been obtained on our aws-g6e-xlarge-plus runners
 @require_torch
-@require_torch_gpu
+@require_torch_accelerator
 @require_torchao_version_greater_or_equal("0.7.0")
 class TorchAoSerializationTest(unittest.TestCase):
     model_name = "hf-internal-testing/tiny-flux-pipe"

     def tearDown(self):
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_dummy_model(self, quant_method, quant_method_kwargs, device=None):
         quantization_config = TorchAoConfig(quant_method, **quant_method_kwargs)
@@ -596,14 +599,14 @@ def _check_serialization_expected_slice(self, quant_method, quant_method_kwargs,
     def test_int_a8w8_cuda(self):
         quant_method, quant_method_kwargs = "int8_dynamic_activation_int8_weight", {}
         expected_slice = np.array([0.3633, -0.1357, -0.0188, -0.249, -0.4688, 0.5078, -0.1289, -0.6914, 0.4551])
-        device = "cuda"
+        device = torch_device
         self._test_original_model_expected_slice(quant_method, quant_method_kwargs, expected_slice)
         self._check_serialization_expected_slice(quant_method, quant_method_kwargs, expected_slice, device)

     def test_int_a16w8_cuda(self):
         quant_method, quant_method_kwargs = "int8_weight_only", {}
         expected_slice = np.array([0.3613, -0.127, -0.0223, -0.2539, -0.459, 0.4961, -0.1357, -0.6992, 0.4551])
-        device = "cuda"
+        device = torch_device
         self._test_original_model_expected_slice(quant_method, quant_method_kwargs, expected_slice)
         self._check_serialization_expected_slice(quant_method, quant_method_kwargs, expected_slice, device)

@@ -624,14 +627,14 @@ def test_int_a16w8_cpu(self):

 # Slices for these tests have been obtained on our aws-g6e-xlarge-plus runners
 @require_torch
-@require_torch_gpu
+@require_torch_accelerator
 @require_torchao_version_greater_or_equal("0.7.0")
 @slow
 @nightly
 class SlowTorchAoTests(unittest.TestCase):
     def tearDown(self):
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_dummy_components(self, quantization_config: TorchAoConfig):
         # This is just for convenience, so that we can modify it at one place for custom environments and locally testing
@@ -713,8 +716,8 @@ def test_quantization(self):
             quantization_config = TorchAoConfig(quant_type=quantization_name, modules_to_not_convert=["x_embedder"])
             self._test_quant_type(quantization_config, expected_slice)
             gc.collect()
-            torch.cuda.empty_cache()
-            torch.cuda.synchronize()
+            backend_empty_cache(torch_device)
+            backend_synchronize(torch_device)

     def test_serialization_int8wo(self):
         quantization_config = TorchAoConfig("int8wo")
@@ -733,8 +736,8 @@ def test_serialization_int8wo(self):
             pipe.remove_all_hooks()
             del pipe.transformer
             gc.collect()
-            torch.cuda.empty_cache()
-            torch.cuda.synchronize()
+            backend_empty_cache(torch_device)
+            backend_synchronize(torch_device)
             transformer = FluxTransformer2DModel.from_pretrained(
                 tmp_dir, torch_dtype=torch.bfloat16, use_safetensors=False
             )
@@ -783,14 +786,14 @@ def test_memory_footprint_int8wo(self):


 @require_torch
-@require_torch_gpu
+@require_torch_accelerator
 @require_torchao_version_greater_or_equal("0.7.0")
 @slow
 @nightly
 class SlowTorchAoPreserializedModelTests(unittest.TestCase):
     def tearDown(self):
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_dummy_inputs(self, device: torch.device, seed: int = 0):
         if str(device).startswith("mps"):
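Every hunk above applies the same migration: CUDA-only plumbing (torch.cuda.empty_cache(), torch.cuda.synchronize(), a hard-coded device = "cuda", the @require_torch_gpu decorator) becomes device-agnostic, driven by the detected torch_device string. backend_empty_cache and backend_synchronize are imported from diffusers.utils.testing_utils; as a rough sketch of what such helpers do internally (an illustrative assumption, not the actual diffusers implementation), they dispatch on the device prefix:

import torch


def backend_empty_cache(device: str) -> None:
    # Sketch only, not the diffusers implementation: release cached
    # allocator memory on whichever accelerator backend is active.
    if device.startswith("cuda"):
        torch.cuda.empty_cache()
    elif device.startswith("xpu"):
        torch.xpu.empty_cache()
    elif device.startswith("mps"):
        torch.mps.empty_cache()
    # On "cpu" there is no allocator cache to clear, so this is a no-op.


def backend_synchronize(device: str) -> None:
    # Sketch only: block until all queued kernels on the active
    # accelerator finish, so memory stats are read at a stable point.
    if device.startswith("cuda"):
        torch.cuda.synchronize()
    elif device.startswith("xpu"):
        torch.xpu.synchronize()
    elif device.startswith("mps"):
        torch.mps.synchronize()

The decorator swap mirrors this: @require_torch_gpu skips a test unless CUDA is available, while @require_torch_accelerator only requires that some supported accelerator is present, so the same suite can run on non-CUDA runners.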