[XPU] Add xpu torch.compile support #22609

Open · wants to merge 2 commits into main
1 change: 1 addition & 0 deletions .buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -27,6 +27,7 @@ docker run \
 "${image_name}" \
 sh -c '
 VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -O3
 VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
 VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
 cd tests
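For context, the new -O3 flag exercises vLLM compilation level 3 (torch.compile) on XPU. A rough Python-API equivalent of that invocation is sketched below as an editor's illustration, not part of this PR; it assumes the LLM constructor accepts an integer compilation_config, as in recent vLLM releases.

import os
os.environ["VLLM_USE_V1"] = "1"  # match the CI script's V1 engine selection

from vllm import LLM, SamplingParams

# compilation_config=3 mirrors the CLI's -O3 (piecewise torch.compile).
llm = LLM(model="facebook/opt-125m", block_size=64,
          enforce_eager=True, compilation_config=3)
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)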
2 changes: 1 addition & 1 deletion vllm/attention/layer.py
@@ -182,7 +182,7 @@ def __init__(
         # opaque custom op. For other platforms, we directly call them
         # and let torch.compile handle them.
         self.use_direct_call = not current_platform.is_cuda_alike(
-        ) and not current_platform.is_cpu()
+        ) and not current_platform.is_cpu() and not current_platform.is_xpu()

         self.use_output = self.attn_backend.accept_output_buffer
         compilation_config = get_current_vllm_config().compilation_config
3 changes: 2 additions & 1 deletion vllm/compilation/backends.py
@@ -413,8 +413,9 @@ def __init__(
         # them, e.g. backbone (default), eagle_head, etc.
         self.prefix = prefix or model_tag

+        # XPU does not support graph currently.
         global global_graph_pool
-        if global_graph_pool is None:
+        if global_graph_pool is None and not current_platform.is_xpu():
             global_graph_pool = current_platform.graph_pool_handle()

         # TODO: in the future, if we want to use multiple
5 changes: 5 additions & 0 deletions vllm/compilation/fix_functionalization.py
@@ -9,6 +9,7 @@
 from torch._higher_order_ops.auto_functionalize import auto_functionalized

 from vllm.logger import init_logger
+from vllm.platforms import current_platform

 from .fx_utils import is_func
 from .vllm_inductor_pass import VllmInductorPass
@@ -32,6 +33,10 @@ def __call__(self, graph: torch.fx.Graph):
         self.nodes_to_remove: list[torch.fx.Node] = []
         count = 0
         for node in graph.nodes:
+            # XPU does not support auto-functionalization yet.
+            # Will enable this when switch to vllm-xpu-kernels.
+            if current_platform.is_xpu():
+                continue
             if not is_func(node, auto_functionalized):
                 continue  # Avoid deep if-elif nesting
4 changes: 4 additions & 0 deletions vllm/platforms/xpu.py
@@ -78,6 +78,10 @@ def get_device_total_memory(cls, device_id: int = 0) -> int:
     def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
         return True

+    @classmethod
+    def get_piecewise_backend_cls(cls) -> str:
+        return "vllm.compilation.cuda_piecewise_backend.CUDAPiecewiseBackend"  # noqa
+
Contributor comment on lines +81 to +83 (critical):
Using CUDAPiecewiseBackend for XPU is problematic as it contains CUDA-specific code (e.g., torch.cuda.CUDAGraph) that will fail on XPU platforms.

The PR description mentions that XPU does not support graph mode yet, which suggests that graph capture should be disabled. However, compilation_config.use_cudagraph is enabled by default for the V1 engine and is not disabled for the XPU platform. This will cause CUDAPiecewiseBackend to attempt CUDA graph capture, leading to a runtime error.

To fix this, you should disable CUDA graph capture for the XPU platform within torch.compile. A possible fix is to add vllm_config.compilation_config.use_cudagraph = False to the XPUPlatform.check_and_update_config method. Alternatively, you could create a new XPUPiecewiseBackend that does not contain CUDA-specific graph capture logic and use it here.


     @classmethod
     def inference_mode(cls):
         return torch.no_grad()
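A minimal sketch of the reviewer's first suggestion above, assuming XPUPlatform.check_and_update_config keeps the usual Platform signature in vllm/platforms/xpu.py; this is an editor's illustration, not part of this PR's diff.

# Sketch only: would live inside XPUPlatform in vllm/platforms/xpu.py.
@classmethod
def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
    compilation_config = vllm_config.compilation_config
    # XPU cannot capture graphs yet, so keep piecewise torch.compile
    # active but never attempt CUDA graph capture.
    compilation_config.use_cudagraph = False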