
Commit 6f1df4f

Add silu_and_mul kernel (#8)
* base of silu_and_mul
* refine tests
* rm redundant cast
* ut pass
* fix acc issue
* fix format
* fix format2

Signed-off-by: Ma, Liangliang <[email protected]>
1 parent 5d63fd4 commit 6f1df4f

File tree

8 files changed: +165 −0 lines


CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -148,6 +148,7 @@ if(VLLM_GPU_LANG STREQUAL "SYCL")
   set(VLLM_EXT_SRC
     "csrc/xpu/cache.cpp"
     "csrc/xpu/layernorm.cpp"
+    "csrc/xpu/activation.cpp"
     "csrc/xpu/pos_encoding_kernels.cpp"
     "csrc/xpu/torch_bindings.cpp"
   )

csrc/xpu/activation.cpp

Lines changed: 69 additions & 0 deletions
#include <sycl/sycl.hpp>

#include <algorithm>
#include "utils.h"
#include "dispatch_utils.h"

namespace vllm {

template <typename T>
inline T silu_kernel(const T& x) {
  // x * sigmoid(x)
  return (T)(((float)x) / (1.0f + sycl::exp((float)-x)));
}

template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&),
          bool act_first>
inline scalar_t compute(const scalar_t& x, const scalar_t& y) {
  return act_first ? ACT_FN(x) * y : x * ACT_FN(y);
}

template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&),
          bool act_first>
void act_and_mul_kernel(scalar_t* __restrict__ out,          // [..., d]
                        const scalar_t* __restrict__ input,  // [..., 2, d]
                        const int d, const sycl::nd_item<3>& item_ct1) {
  const int64_t token_idx = item_ct1.get_group(2);
  for (int64_t idx = item_ct1.get_local_id(2); idx < d;
       idx += item_ct1.get_local_range(2)) {
    const scalar_t x = input[token_idx * 2 * d + idx];
    const scalar_t y = input[token_idx * 2 * d + d + idx];
    out[token_idx * d + idx] = compute<scalar_t, ACT_FN, act_first>(x, y);
  }
}

template <typename scalar_t>
void call_silu_and_mul_kernel(torch::Tensor& out, torch::Tensor& input) {
  using sycl_t = vllm::xpu::SyclTypeTrait<scalar_t>::Type;
  int d = input.size(-1) / 2;
  int64_t num_tokens = input.numel() / input.size(-1);
  // dpct::dim3 grid(num_tokens);
  // dpct::dim3 block(std::min(d, 1024));
  sycl::range<3> grid(1, 1, num_tokens);
  sycl::range<3> block(1, 1, std::min(d, 1024));
  if (num_tokens == 0) {
    return;
  }
  auto out_ptr = out.data_ptr<scalar_t>();
  auto input_ptr = input.data_ptr<scalar_t>();
  at::DeviceGuard device_guard(input.device());
  auto& queue = vllm::xpu::vllmGetQueue();
  queue.submit([&](sycl::handler& cgh) {
    cgh.parallel_for(
        sycl::nd_range<3>(grid * block, block),
        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
          act_and_mul_kernel<sycl_t, silu_kernel, true>(
              (sycl_t*)out_ptr, (sycl_t*)input_ptr, d, item_ct1);
        });
  });
}

}  // namespace vllm

void silu_and_mul(torch::Tensor& out,    // [..., d]
                  torch::Tensor& input)  // [..., 2 * d]
{
  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "call_silu_and_mul_kernel",
      [&] { vllm::call_silu_and_mul_kernel<scalar_t>(out, input); });
}
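For orientation: the kernel launches one work-group per token (a grid of num_tokens groups, each with up to 1024 work-items) and strides work-items across the hidden size d, writing silu(x) * y for the two halves of the last input dimension (act_first=true). A minimal PyTorch sketch of the same math; the function name here is illustrative, not part of the commit:

import torch
import torch.nn.functional as F

def silu_and_mul_reference(x: torch.Tensor) -> torch.Tensor:
    # x: [..., 2 * d] -> out: [..., d]; silu on the first half, multiplied by the second half
    d = x.shape[-1] // 2
    return F.silu(x[..., :d]) * x[..., d:]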

csrc/xpu/ops.h

Lines changed: 2 additions & 0 deletions
@@ -8,6 +8,8 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
 void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
                         torch::Tensor& weight, double epsilon);
 
+void silu_and_mul(torch::Tensor& out, torch::Tensor& input);
+
 void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
                       std::optional<torch::Tensor> key, int64_t head_size,
                       torch::Tensor& cos_sin_cache, bool is_neox);

csrc/xpu/torch_bindings.cpp

Lines changed: 4 additions & 0 deletions
@@ -32,6 +32,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "float epsilon) -> ()");
   ops.impl("fused_add_rms_norm", torch::kXPU, &fused_add_rms_norm);
 
+  // activation ops
+  ops.def("silu_and_mul(Tensor! out, Tensor! input) -> ()");
+  ops.impl("silu_and_mul", torch::kXPU, &silu_and_mul);
+
   // pos_embedding
   ops.def(
       "rotary_embedding(Tensor positions, Tensor! query,"

tests/ops/activation_op.py

Lines changed: 33 additions & 0 deletions
# SPDX-License-Identifier: Apache-2.0
import torch
import torch.nn.functional as F

import tests.register_ops as ops
from tests.ops.custom_ops import CustomOp


class SiluAndMul(CustomOp):
    """An activation function for SwiGLU.

    The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2.

    Shapes:
        x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
        return: (num_tokens, d) or (batch_size, seq_len, d)
    """

    def __init__(self):
        super().__init__()
        self.op = ops.silu_and_mul

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        d = x.shape[-1] // 2
        return F.silu(x[..., :d]) * x[..., d:]

    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
        d = x.shape[-1] // 2
        output_shape = (x.shape[:-1] + (d, ))
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
        self.op(out, x)
        return out

tests/register_ops.py

Lines changed: 4 additions & 0 deletions
@@ -20,6 +20,10 @@ def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor,
     torch.ops._C.fused_add_rms_norm(input, residual, weight, epsilon)
 
 
+def silu_and_mul(out: torch.Tensor, input: torch.Tensor) -> None:
+    torch.ops._C.silu_and_mul(out, input)
+
+
 def rotary_embedding(
     positions: torch.Tensor,
     query: torch.Tensor,

tests/test_activation.py

Lines changed: 46 additions & 0 deletions
# SPDX-License-Identifier: Apache-2.0
import pytest
import torch

from tests.ops.activation_op import SiluAndMul
from tests.utils import opcheck, seed_everything

DTYPES = [torch.half, torch.bfloat16, torch.float]
NUM_TOKENS = [7, 83, 2048]  # Arbitrary values for testing
D = [512, 13824]  # Arbitrary values for testing
SEEDS = [0]
XPU_DEVICES = [
    f"xpu:{i}" for i in range(1 if torch.xpu.device_count() == 1 else 2)
]


@pytest.mark.parametrize("activation", ["silu_and_mul"])
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("d", D)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", XPU_DEVICES)
@torch.inference_mode()
def test_act_and_mul(
    activation: str,
    num_tokens: int,
    d: int,
    dtype: torch.dtype,
    seed: int,
    device: str,
) -> None:
    seed_everything(seed)
    torch.set_default_device(device)
    x = torch.randn(num_tokens, 2 * d, dtype=dtype)
    if activation == "silu_and_mul":
        layer = SiluAndMul()
        fn = torch.ops._C.silu_and_mul
    out = layer(x)
    ref_out = layer.forward_native(x)

    torch.testing.assert_close(out, ref_out, atol=1e-3, rtol=1e-3)

    d = x.shape[-1] // 2
    output_shape = (x.shape[:-1] + (d, ))
    out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
    opcheck(fn, (out, x))

tests/utils.py

Lines changed: 6 additions & 0 deletions
@@ -78,6 +78,12 @@ def opcheck(
 }
 
 
+def seed_everything(seed) -> None:
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+
+
 def _convert_from_fp8(
     tensor: torch.Tensor,
     scale: float = 1.0,
