Skip to content

Commit d2f015e

Browse files
Chendi Xue authored
[0.16.0] remove cuda hard-code for Hunyuan Image3 (#1402)
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
1 parent 0b577a7 commit d2f015e

File tree

3 files changed

+6
-6
lines changed

3 files changed

+6
-6
lines changed

vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_image_3_transformer.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@
6464

6565
from vllm_omni.diffusion.attention.layer import Attention
6666
from vllm_omni.diffusion.distributed.parallel_state import get_pp_group
67+
from vllm_omni.diffusion.distributed.utils import get_local_device
6768
from vllm_omni.diffusion.layers.rope import RotaryEmbedding
6869

6970
logger = logging.getLogger(__name__)
@@ -1737,6 +1738,7 @@ def __init__(self, config: HunyuanImage3Config, prefix: str = ""):
17371738
lora_config = None
17381739
self.num_redundant_experts = 0
17391740
self.config = config
1741+
self.device = get_local_device()
17401742
self.quant_config = quant_config
17411743
self.padding_idx = config.pad_token_id
17421744
lora_vocab = (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) if lora_config else 0
@@ -2430,7 +2432,7 @@ def __call__(
24302432
**model_kwargs,
24312433
)
24322434

2433-
with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True):
2435+
with torch.autocast(device_type=self.device.type, dtype=torch.bfloat16, enabled=True):
24342436
model_output = self.model.forward_call(**model_inputs, first_step=(i == 0))
24352437
pred = model_output["diffusion_prediction"]
24362438
pred = pred.to(dtype=torch.float32)
@@ -2477,7 +2479,7 @@ def __call__(
24772479
if hasattr(self.vae, "ffactor_temporal"):
24782480
latents = latents.unsqueeze(2)
24792481

2480-
with torch.autocast(device_type="cuda", dtype=torch.float16, enabled=True):
2482+
with torch.autocast(device_type=self.device.type, dtype=torch.float16, enabled=True):
24812483
image = self.vae.decode(latents, return_dict=False, generator=generator)[0]
24822484

24832485
if hasattr(self.vae, "ffactor_temporal"):

vllm_omni/diffusion/models/hunyuan_image_3/pipeline_hunyuan_image_3.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ def pre_load(self):
140140
if hasattr(self, prefix.split(".")[0]):
141141
module = dict(self.named_modules()).get(prefix)
142142
if module:
143-
module.to(f"cuda:{tp_rank}")
143+
module.to(f"{self.model.device.type}:{tp_rank}")
144144

145145
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
146146
self.pre_load()
@@ -369,7 +369,7 @@ def build_batch_rope_image_info(output, sections):
369369
def vae_encode(self, image, cfg_factor=1):
370370
config = self.vae.config
371371

372-
with torch.autocast(device_type="cuda", dtype=torch.float16, enabled=True):
372+
with torch.autocast(device_type=self.model.device.type, dtype=torch.float16, enabled=True):
373373
vae_encode_result = self.vae.encode(image)
374374
if isinstance(vae_encode_result, torch.Tensor):
375375
latents = vae_encode_result

vllm_omni/platforms/__init__.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,6 @@ def xpu_omni_platform_plugin() -> str | None:
8484
is_xpu = False
8585
logger.debug("Checking if XPU OmniPlatform is available.")
8686
try:
87-
# installed IPEX if the machine has XPUs.
88-
import intel_extension_for_pytorch # noqa: F401
8987
import torch
9088

9189
if supports_xccl():

0 commit comments

Comments (0)