diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
index deb61a9bafab..98a3a4a45cf5 100644
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -27,6 +27,7 @@ docker run \
   "${image_name}" \
   sh -c '
   VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+  VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -O3
   VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
   VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
   cd tests
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index b4c3cbd7c9d6..1795d384b90d 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -182,7 +182,7 @@ def __init__(
         # opaque custom op. For other platforms, we directly call them
         # and let torch.compile handle them.
         self.use_direct_call = not current_platform.is_cuda_alike(
-        ) and not current_platform.is_cpu()
+        ) and not current_platform.is_cpu() and not current_platform.is_xpu()
         self.use_output = self.attn_backend.accept_output_buffer
 
         compilation_config = get_current_vllm_config().compilation_config
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 673fb5866234..186f7eff627d 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -413,8 +413,9 @@ def __init__(
         # them, e.g. backbone (default), eagle_head, etc.
         self.prefix = prefix or model_tag
 
+        # XPU does not support graph pools currently, so skip creating one.
         global global_graph_pool
-        if global_graph_pool is None:
+        if global_graph_pool is None and not current_platform.is_xpu():
            global_graph_pool = current_platform.graph_pool_handle()
 
         # TODO: in the future, if we want to use multiple
diff --git a/vllm/compilation/fix_functionalization.py b/vllm/compilation/fix_functionalization.py
index 286221d32c1e..e9f57ecf2ee2 100644
--- a/vllm/compilation/fix_functionalization.py
+++ b/vllm/compilation/fix_functionalization.py
@@ -9,6 +9,7 @@
 from torch._higher_order_ops.auto_functionalize import auto_functionalized
 
 from vllm.logger import init_logger
+from vllm.platforms import current_platform
 
 from .fx_utils import is_func
 from .vllm_inductor_pass import VllmInductorPass
@@ -32,6 +33,10 @@ def __call__(self, graph: torch.fx.Graph):
         self.nodes_to_remove: list[torch.fx.Node] = []
         count = 0
         for node in graph.nodes:
+            # XPU does not support auto-functionalization yet.
+            # Will be enabled once we switch to vllm-xpu-kernels.
+            if current_platform.is_xpu():
+                continue
             if not is_func(node, auto_functionalized):
                 continue
             # Avoid deep if-elif nesting
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index d8a663f2f0c4..ef434c6aa91e 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -78,6 +78,10 @@ def get_device_total_memory(cls, device_id: int = 0) -> int:
     def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
         return True
 
+    @classmethod
+    def get_piecewise_backend_cls(cls) -> str:
+        return "vllm.compilation.cuda_piecewise_backend.CUDAPiecewiseBackend"  # noqa
+
     @classmethod
     def inference_mode(cls):
         return torch.no_grad()
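
Note on the vllm/platforms/xpu.py change: get_piecewise_backend_cls() returns a dotted class path as a string, which lets the XPU platform reuse the existing CUDA piecewise backend implementation. Below is a minimal sketch of how such a qualified name is typically resolved at runtime; the resolve_qualname helper is illustrative only, under the assumption that the path is loaded via a standard dynamic import, and is not vLLM's actual utility.

# Minimal sketch (assumption: the dotted path returned by
# get_piecewise_backend_cls() is resolved with a standard dynamic import;
# resolve_qualname below is illustrative, not vLLM's actual helper).
import importlib


def resolve_qualname(qualname: str):
    """Import 'pkg.module.ClassName' and return the class object."""
    module_name, _, attr = qualname.rpartition(".")
    module = importlib.import_module(module_name)
    return getattr(module, attr)


# Self-contained usage example with a stdlib class, so the sketch runs
# without vLLM installed:
ordered_dict_cls = resolve_qualname("collections.OrderedDict")
assert ordered_dict_cls.__name__ == "OrderedDict"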