Skip to content

Commit ce90e9f

Browse files
authored
Support vLLM IR on XPU (#148)
* Support vLLM IR on XPU
  Signed-off-by: Xinyu Chen <xinyu1.chen@intel.com>
* test layernorm on xpu
  Signed-off-by: Xinyu Chen <xinyu1.chen@intel.com>
---------
Signed-off-by: Xinyu Chen <xinyu1.chen@intel.com>
1 parent c5a04b8 commit ce90e9f

File tree

4 files changed

+63
-10
lines changed

4 files changed

+63
-10
lines changed

tests/kernels/ir/test_layernorm.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,16 @@ def rms_norm_inputs(n_tokens: int, hidden_size: int, dtype: torch.dtype):
2020

2121

2222
@pytest.mark.skipif(
23-
not current_platform.is_cuda_alike(),
24-
reason="Currently only kernels on CUDA and ROCm",
23+
not current_platform.is_cuda_alike() and not current_platform.is_xpu(),
24+
reason="Currently only kernels on CUDA, ROCm and XPU",
2525
)
2626
def test_rms_norm_registration():
2727
expected = {
2828
"native": True,
29-
"vllm_c": True,
29+
"vllm_c": current_platform.is_cuda_alike(),
3030
"aiter": current_platform.is_rocm(),
3131
"oink": False,
32+
"xpu_kernels": current_platform.is_xpu(),
3233
}
3334

3435
actual = {
@@ -43,13 +44,13 @@ def test_rms_norm_registration():
4344
@pytest.mark.parametrize("hidden_size", [16, 4096, 8192])
4445
@pytest.mark.parametrize("epsilon", [1e-6, 1e-5])
4546
@pytest.mark.skipif(
46-
not current_platform.is_cuda_alike(),
47-
reason="Currently only kernels on CUDA and ROCm",
47+
not current_platform.is_cuda_alike() and not current_platform.is_xpu(),
48+
reason="Currently only kernels on CUDA, ROCm and XPU",
4849
)
4950
class TestRMSNorm:
5051
@classmethod
5152
def setup_class(cls, **kwargs):
52-
torch.set_default_device("cuda")
53+
torch.set_default_device(current_platform.device_name)
5354

5455
def test_native_semantics(self, dtype, n_tokens, hidden_size, epsilon):
5556
x, weight = rms_norm_inputs(4, 8, dtype)
@@ -70,7 +71,7 @@ def test_native_semantics(self, dtype, n_tokens, hidden_size, epsilon):
7071
out4 = rms_norm_native(x, None, epsilon=epsilon)
7172
torch.testing.assert_close(out3, out4)
7273

73-
@pytest.mark.parametrize("provider", ["vllm_c", "aiter"])
74+
@pytest.mark.parametrize("provider", ["vllm_c", "aiter", "xpu_kernels"])
7475
def test_impls(self, dtype, n_tokens, hidden_size, epsilon, provider):
7576
impl = ir.ops.rms_norm.impls[provider]
7677
if not impl.supported:
@@ -115,7 +116,7 @@ def test_impls(self, dtype, n_tokens, hidden_size, epsilon, provider):
115116
atol=2e-4,
116117
)
117118

118-
@pytest.mark.parametrize("provider", ["vllm_c", "aiter", "native"])
119+
@pytest.mark.parametrize("provider", ["vllm_c", "aiter", "xpu_kernels", "native"])
119120
def test_torch_opcheck(self, dtype, n_tokens, hidden_size, epsilon, provider):
120121
if not ir.ops.rms_norm.impls[provider].supported:
121122
pytest.skip(f"{provider} impl not supported on this platform")

vllm/kernels/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,6 @@
22
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
33
"""Kernel implementations for vLLM."""
44

5-
from . import aiter_ops, oink_ops, vllm_c
5+
from . import aiter_ops, oink_ops, vllm_c, xpu_ops
66

7-
__all__ = ["vllm_c", "aiter_ops", "oink_ops"]
7+
__all__ = ["vllm_c", "aiter_ops", "oink_ops", "xpu_ops"]

vllm/kernels/xpu_ops.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3+
import torch
4+
from torch import Tensor
5+
6+
from vllm import ir
7+
from vllm.platforms import current_platform
8+
9+
current_platform.import_kernels()
10+
11+
def is_xpu_kernels_found() -> bool:
    """Return True when the ``vllm_xpu_kernels`` package is importable."""
    import importlib.util

    return importlib.util.find_spec("vllm_xpu_kernels") is not None
XPU_KERNELS_SUPPORTED = is_xpu_kernels_found()
"""Kernels in this file are supported if vLLM XPU kernels are installed."""


def rms_no_var(x, weight, epsilon, variance_size=None):
    # Argument filter for register_impl: the XPU kernel has no
    # partial-variance support, so only variance_size=None is accepted.
    # (def instead of a named lambda, per PEP 8 / E731.)
    return variance_size is None


@ir.ops.rms_norm.register_impl(
    "xpu_kernels", supports_args=rms_no_var, supported=XPU_KERNELS_SUPPORTED
)
def rms_norm(
    x: Tensor, weight: Tensor | None, epsilon: float, variance_size: int | None = None
) -> Tensor:
    """RMS-normalize ``x`` using the vLLM XPU kernel (``torch.ops._C.rms_norm``).

    Args:
        x: Input tensor; the kernel normalizes over the last dimension
           (weight is sized to ``x.shape[-1]``).
        weight: Optional scale tensor. The kernel requires a weight, so ones
            (an identity scale) are substituted when ``None``.
        epsilon: Numerical-stability term for the variance.
        variance_size: Not supported by this kernel; must be ``None``
            (filtered by ``rms_no_var`` at registration, re-checked here).

    Returns:
        A new tensor with the same shape, dtype, and device as ``x``.
    """
    # Validate before allocating anything; raise instead of assert so the
    # check survives python -O.
    if variance_size is not None:
        raise ValueError("xpu_kernels rms_norm does not support variance_size")
    if weight is None:
        # Kernel requires a weight tensor; pass ones (identity scale).
        weight = torch.ones(x.shape[-1], device=x.device, dtype=x.dtype)
    output = torch.empty_like(x)
    torch.ops._C.rms_norm(output, x, weight, epsilon)
    return output

vllm/platforms/xpu.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
if TYPE_CHECKING:
2222
from vllm.config import VllmConfig
23+
from vllm.config.kernel import IrOpPriorityConfig
2324
from vllm.v1.attention.selector import AttentionSelectorConfig
2425
else:
2526
VllmConfig = None
@@ -273,6 +274,21 @@ def get_device_communicator_cls(cls) -> str:
273274
)
274275
return "vllm.distributed.device_communicators.xpu_communicator.XpuCommunicator" # noqa
275276

277+
@classmethod
278+
def get_default_ir_op_priority(
279+
cls, vllm_config: "VllmConfig"
280+
) -> "IrOpPriorityConfig":
281+
from vllm.config.compilation import CompilationMode
282+
from vllm.config.kernel import IrOpPriorityConfig
283+
284+
# Native used by default when compiling,
285+
# use fused kernels where available when no codegen
286+
cc = vllm_config.compilation_config
287+
using_inductor = cc.backend == "inductor" and cc.mode != CompilationMode.NONE
288+
default = ["native"] if using_inductor else ["xpu_kernels", "vllm_c", "native"]
289+
290+
return IrOpPriorityConfig.with_default(default)
291+
276292
@classmethod
277293
def device_count(cls) -> int:
278294
return torch.xpu.device_count()

0 commit comments

Comments
 (0)