
Commit 5d63fd4

Add rotary_embedding kernel (#7)
* add rope
* fix potential acc issue
* fix format
* fix format of rebase

Signed-off-by: Ma, Liangliang <[email protected]>
1 parent ccc900b

7 files changed: +452 −1 lines

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -148,6 +148,7 @@ if(VLLM_GPU_LANG STREQUAL "SYCL")
   set(VLLM_EXT_SRC
     "csrc/xpu/cache.cpp"
     "csrc/xpu/layernorm.cpp"
+    "csrc/xpu/pos_encoding_kernels.cpp"
     "csrc/xpu/torch_bindings.cpp"
   )
   include_directories("/usr/include")

csrc/xpu/ops.h

Lines changed: 4 additions & 0 deletions
@@ -8,6 +8,10 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
 void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
                         torch::Tensor& weight, double epsilon);
 
+void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
+                      std::optional<torch::Tensor> key, int64_t head_size,
+                      torch::Tensor& cos_sin_cache, bool is_neox);
+
 void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
                        torch::Tensor& key_cache, torch::Tensor& value_cache,
                        torch::Tensor& slot_mapping,

csrc/xpu/pos_encoding_kernels.cpp

Lines changed: 212 additions & 0 deletions
@@ -0,0 +1,212 @@
#include <sycl/sycl.hpp>
#include "utils.h"
#include "dispatch_utils.h"
#include <cmath>
#include <c10/macros/Macros.h>

namespace vllm {

template <typename scalar_t, bool IS_NEOX>
inline void apply_token_rotary_embedding(scalar_t* __restrict__ arr,
                                         const scalar_t* __restrict__ cos_ptr,
                                         const scalar_t* __restrict__ sin_ptr,
                                         int rot_offset, int embed_dim) {
  int x_index, y_index;
  scalar_t cos, sin;
  if (IS_NEOX) {
    // GPT-NeoX style rotary embedding.
    x_index = rot_offset;
    y_index = embed_dim + rot_offset;
    cos = cos_ptr[x_index];
    sin = sin_ptr[x_index];
  } else {
    // GPT-J style rotary embedding.
    x_index = 2 * rot_offset;
    y_index = 2 * rot_offset + 1;
    cos = cos_ptr[x_index / 2];
    sin = sin_ptr[x_index / 2];
  }

  const scalar_t x = arr[x_index];
  const scalar_t y = arr[y_index];
  arr[x_index] = x * cos - y * sin;
  arr[y_index] = y * cos + x * sin;
}

template <typename scalar_t, bool IS_NEOX>
inline void apply_rotary_embedding(
    scalar_t* __restrict__ query,  // [batch_size, seq_len, num_heads,
                                   // head_size] or [num_tokens, num_heads,
                                   // head_size]
    scalar_t* __restrict__ key,    // nullptr or
                                   // [batch_size, seq_len, num_kv_heads,
                                   // head_size] or [num_tokens, num_kv_heads,
                                   // head_size]
    const scalar_t* cache_ptr, const int head_size, const int num_heads,
    const int num_kv_heads, const int rot_dim, const int token_idx,
    const int64_t query_stride, const int64_t key_stride,
    const int64_t head_stride, const sycl::nd_item<3>& item_ct1) {
  const int embed_dim = rot_dim / 2;
  const scalar_t* cos_ptr = cache_ptr;
  const scalar_t* sin_ptr = cache_ptr + embed_dim;

  const int nq = num_heads * embed_dim;
  for (int i = item_ct1.get_local_id(2); i < nq;
       i += item_ct1.get_local_range(2)) {
    const int head_idx = i / embed_dim;
    const int64_t token_head =
        token_idx * query_stride + head_idx * head_stride;
    const int rot_offset = i % embed_dim;
    apply_token_rotary_embedding<scalar_t, IS_NEOX>(
        query + token_head, cos_ptr, sin_ptr, rot_offset, embed_dim);
  }

  if (key != nullptr) {
    const int nk = num_kv_heads * embed_dim;
    for (int i = item_ct1.get_local_id(2); i < nk;
         i += item_ct1.get_local_range(2)) {
      const int head_idx = i / embed_dim;
      const int64_t token_head =
          token_idx * key_stride + head_idx * head_stride;
      const int rot_offset = i % embed_dim;
      apply_token_rotary_embedding<scalar_t, IS_NEOX>(
          key + token_head, cos_ptr, sin_ptr, rot_offset, embed_dim);
    }
  }
}

template <typename scalar_t, bool IS_NEOX>
void rotary_embedding_kernel(
    const int64_t* __restrict__ positions,  // [batch_size, seq_len] or
                                            // [num_tokens]
    scalar_t* __restrict__ query,  // [batch_size, seq_len, num_heads,
                                   // head_size] or [num_tokens, num_heads,
                                   // head_size]
    scalar_t* __restrict__ key,    // nullptr or
                                   // [batch_size, seq_len, num_kv_heads,
                                   // head_size] or [num_tokens, num_kv_heads,
                                   // head_size]
    const scalar_t* __restrict__ cos_sin_cache,  // [max_position, 2, rot_dim //
                                                 // 2]
    const int rot_dim, const int64_t query_stride, const int64_t key_stride,
    const int64_t head_stride, const int num_heads, const int num_kv_heads,
    const int head_size, const sycl::nd_item<3>& item_ct1) {
  // Each thread block is responsible for one token.
  const int token_idx = item_ct1.get_group(2);
  int64_t pos = positions[token_idx];
  const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim;

  apply_rotary_embedding<scalar_t, IS_NEOX>(
      query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim,
      token_idx, query_stride, key_stride, head_stride, item_ct1);
}

}  // namespace vllm

template <typename scalar_t>
void call_rotary_embedding_kernel(
    torch::Tensor& positions, torch::Tensor& query,
    std::optional<torch::Tensor> key, int64_t head_size,
    torch::Tensor& cos_sin_cache,  // [max_position, rot_dim]
    bool is_neox) {
  using sycl_t = vllm::xpu::SyclTypeTrait<scalar_t>::Type;
  // num_tokens = batch_size * seq_len
  int64_t num_tokens = positions.numel();
  int positions_ndim = positions.dim();

  // Make sure num_tokens dim is consistent across positions, query, and key
  TORCH_CHECK(
      positions_ndim == 1 || positions_ndim == 2,
      "positions must have shape [num_tokens] or [batch_size, seq_len]");
  if (positions_ndim == 1) {
    TORCH_CHECK(query.size(0) == positions.size(0) &&
                    (!key.has_value() || key->size(0) == positions.size(0)),
                "query, key and positions must have the same number of tokens");
  }
  if (positions_ndim == 2) {
    TORCH_CHECK(
        query.size(0) == positions.size(0) &&
            (!key.has_value() || key->size(0) == positions.size(0)) &&
            query.size(1) == positions.size(1) &&
            (!key.has_value() || key->size(1) == positions.size(1)),
        "query, key and positions must have the same batch_size and seq_len");
  }

  // Make sure head_size is valid for query and key
  // hidden_size = num_heads * head_size
  int query_hidden_size = query.numel() / num_tokens;
  int key_hidden_size = key.has_value() ? key->numel() / num_tokens : 0;
  TORCH_CHECK(query_hidden_size % head_size == 0);
  TORCH_CHECK(key_hidden_size % head_size == 0);

  // Make sure query and key have consistent number of heads
  int num_heads = query_hidden_size / head_size;
  int num_kv_heads = key.has_value() ? key_hidden_size / head_size : num_heads;
  TORCH_CHECK(num_heads % num_kv_heads == 0);

  int rot_dim = cos_sin_cache.size(1);
  int seq_dim_idx = positions_ndim - 1;
  int64_t query_stride = query.stride(seq_dim_idx);
  int64_t key_stride = key.has_value() ? key->stride(seq_dim_idx) : 0;
  // Determine head stride: for [*, heads, head_size] use stride of last dim;
  // for flat [*, heads*head_size], heads blocks are contiguous of size
  // head_size
  int query_ndim = query.dim();
  int64_t head_stride =
      (query_ndim == positions_ndim + 2) ? query.stride(-2) : head_size;

  auto positions_ptr = positions.data_ptr<int64_t>();
  auto query_ptr = query.data_ptr<scalar_t>();
  auto key_ptr = key.has_value() ? key->data_ptr<scalar_t>() : nullptr;
  auto cos_sin_cache_ptr = cos_sin_cache.data_ptr<scalar_t>();

  sycl::range<3> grid(1, 1, num_tokens);
  sycl::range<3> block(1, 1, std::min<int64_t>(num_heads * rot_dim / 2, 512));

  at::DeviceGuard device_guard(query.device());
  auto& queue = vllm::xpu::vllmGetQueue();
  if (is_neox) {
    queue.submit([&](sycl::handler& cgh) {
      cgh.parallel_for(
          sycl::nd_range<3>(grid * block, block),
          [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
            vllm::rotary_embedding_kernel<sycl_t, true>(
                positions_ptr, (sycl_t*)query_ptr, (sycl_t*)key_ptr,
                (sycl_t*)cos_sin_cache_ptr, rot_dim, query_stride, key_stride,
                head_stride, num_heads, num_kv_heads, head_size, item_ct1);
          });
    });
  } else {
    queue.submit([&](sycl::handler& cgh) {
      cgh.parallel_for(
          sycl::nd_range<3>(grid * block, block),
          [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
            vllm::rotary_embedding_kernel<sycl_t, false>(
                positions_ptr, (sycl_t*)query_ptr, (sycl_t*)key_ptr,
                (sycl_t*)cos_sin_cache_ptr, rot_dim, query_stride, key_stride,
                head_stride, num_heads, num_kv_heads, head_size, item_ct1);
          });
    });
  }
}

void rotary_embedding(
    torch::Tensor& positions,  // [batch_size, seq_len] or [num_tokens]
    torch::Tensor& query,  // [batch_size, seq_len, num_heads * head_size] or
                           // [num_tokens, num_heads * head_size] or
                           // [batch_size, seq_len, num_heads, head_size] or
                           // [num_tokens, num_heads, head_size]
    std::optional<torch::Tensor> key,
    // null or
    // [batch_size, seq_len, num_kv_heads * head_size] or
    // [num_tokens, num_kv_heads * head_size] or
    // [batch_size, seq_len, num_heads, head_size] or
    // [num_tokens, num_heads, head_size]
    int64_t head_size,
    torch::Tensor& cos_sin_cache,  // [max_position, rot_dim]
    bool is_neox) {
  VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "rotary_embedding", [&] {
    call_rotary_embedding_kernel<scalar_t>(positions, query, key, head_size,
                                           cos_sin_cache, is_neox);
  });
}
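
For reference, the per-token, per-head rotation performed by apply_token_rotary_embedding above can be reproduced on the host in a few lines of PyTorch. This is only an illustrative sketch (the helper name rotate_one_head is hypothetical, not part of the commit); it mirrors the kernel's index arithmetic, assuming cache_row holds the rot_dim // 2 cosines followed by the rot_dim // 2 sines for one position, as cos_sin_cache + pos * rot_dim does in the kernel.

import torch

def rotate_one_head(head: torch.Tensor, cache_row: torch.Tensor,
                    is_neox: bool) -> torch.Tensor:
    # head: [head_size] values for one (token, head) pair; only the first
    # rot_dim entries are rotated, the rest pass through unchanged.
    # cache_row: [rot_dim] = [cos_0 .. cos_{d-1}, sin_0 .. sin_{d-1}], d = rot_dim // 2.
    rot_dim = cache_row.numel()
    embed_dim = rot_dim // 2
    cos, sin = cache_row[:embed_dim], cache_row[embed_dim:]
    out = head.clone()
    for rot_offset in range(embed_dim):
        if is_neox:
            # GPT-NeoX style: rotate the (i, d + i) pair.
            x_index, y_index = rot_offset, embed_dim + rot_offset
        else:
            # GPT-J style: rotate the (2i, 2i + 1) pair.
            x_index, y_index = 2 * rot_offset, 2 * rot_offset + 1
        c, s = cos[rot_offset], sin[rot_offset]
        x, y = head[x_index], head[y_index]
        out[x_index] = x * c - y * s
        out[y_index] = y * c + x * s
    return out

In the kernel itself, one work-group handles one token and its work-items stride over all (head, rot_offset) pairs, so each pair above corresponds to one iteration of the kernel's inner loop.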

csrc/xpu/torch_bindings.cpp

Lines changed: 7 additions & 0 deletions
@@ -31,6 +31,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "fused_add_rms_norm(Tensor! input, Tensor! residual, Tensor weight, "
       "float epsilon) -> ()");
   ops.impl("fused_add_rms_norm", torch::kXPU, &fused_add_rms_norm);
+
+  // pos_embedding
+  ops.def(
+      "rotary_embedding(Tensor positions, Tensor! query,"
+      " Tensor!? key, int head_size,"
+      " Tensor cos_sin_cache, bool is_neox) -> ()");
+  ops.impl("rotary_embedding", torch::kXPU, &rotary_embedding);
 }
 
 TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
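
The schema marks query as Tensor! and key as Tensor!?, so the op mutates both in place (key may be omitted) and returns nothing. A minimal call sketch, assuming the built XPU extension is importable through the test-side wrapper tests.register_ops (which the test file below uses) and that an XPU device is available; sizes and the random cos_sin_cache are stand-ins chosen only to show the expected shapes and dtypes:

import torch
import tests.register_ops as ops  # assumes the XPU extension is built and importable

num_tokens, num_heads, head_size = 4, 8, 64
rot_dim, max_pos = head_size, 1024

positions = torch.randint(0, max_pos, (num_tokens,), device="xpu")  # int64 token positions
query = torch.randn(num_tokens, num_heads * head_size, device="xpu", dtype=torch.float16)
key = torch.randn(num_tokens, num_heads * head_size, device="xpu", dtype=torch.float16)
cos_sin_cache = torch.randn(max_pos, rot_dim, device="xpu", dtype=torch.float16)  # random stand-in

# In-place: query and key are rotated, nothing is returned.
ops.rotary_embedding(positions, query, key, head_size, cos_sin_cache, True)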

tests/ops/rotary_embedding_op.py

Lines changed: 142 additions & 0 deletions
@@ -0,0 +1,142 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Rotary Positional Embeddings Base Class."""
from typing import Optional

import torch

import tests.register_ops as ops
from tests.ops.custom_ops import CustomOp


def apply_rotary_emb_torch(
    x: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
    is_neox_style: bool,
) -> torch.Tensor:
    cos = cos.unsqueeze(-2).to(x.dtype)
    sin = sin.unsqueeze(-2).to(x.dtype)
    if is_neox_style:
        x1, x2 = torch.chunk(x, 2, dim=-1)
    else:
        x1 = x[..., ::2]
        x2 = x[..., 1::2]
    o1 = x1 * cos - x2 * sin
    o2 = x2 * cos + x1 * sin
    if is_neox_style:
        return torch.cat((o1, o2), dim=-1)
    else:
        return torch.stack((o1, o2), dim=-1).flatten(-2)


class RotaryEmbedding(CustomOp):
    """Original rotary positional embedding."""

    def __init__(
        self,
        head_size: int,
        rotary_dim: int,
        max_position_embeddings: int,
        base: float,
        is_neox_style: bool,
        dtype: torch.dtype,
    ) -> None:
        super().__init__()
        self.head_size = head_size
        self.rotary_dim = rotary_dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        self.is_neox_style = is_neox_style
        self.dtype = dtype

        cache = self._compute_cos_sin_cache()
        cache = cache.to(dtype)
        self.cos_sin_cache: torch.Tensor
        self.register_buffer("cos_sin_cache", cache, persistent=False)

    def _compute_inv_freq(self, base: float) -> torch.Tensor:
        """Compute the inverse frequency."""
        # NOTE(woosuk): To exactly match the HF implementation, we need to
        # use CPU to compute the cache and then move it to GPU. However, we
        # create the cache on GPU for faster initialization. This may cause
        # a slight numerical difference between the HF implementation and ours.
        inv_freq = 1.0 / (base**(torch.arange(
            0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim))
        return inv_freq

    def _compute_cos_sin_cache(self) -> torch.Tensor:
        """Compute the cos and sin cache."""
        inv_freq = self._compute_inv_freq(self.base)
        t = torch.arange(self.max_position_embeddings, dtype=torch.float)

        freqs = torch.einsum("i,j -> ij", t, inv_freq)
        cos = freqs.cos()
        sin = freqs.sin()
        cache = torch.cat((cos, sin), dim=-1)
        return cache

    def forward_native(
        self,
        positions: torch.Tensor,
        query: torch.Tensor,
        key: Optional[torch.Tensor] = None,
        offsets: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """A PyTorch-native implementation of forward()."""
        if offsets is not None:
            positions = positions + offsets
        positions = positions.flatten()
        num_tokens = positions.shape[0]
        cos_sin = self.cos_sin_cache.index_select(0, positions)
        cos, sin = cos_sin.chunk(2, dim=-1)

        query_shape = query.shape
        query = query.view(num_tokens, -1, self.head_size)
        query_rot = query[..., :self.rotary_dim]
        query_pass = query[..., self.rotary_dim:]
        query_rot = apply_rotary_emb_torch(query_rot, cos, sin,
                                           self.is_neox_style)
        query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)

        # key may be None in some cases, e.g. cross-layer KV sharing
        if key is not None:
            key_shape = key.shape
            key = key.view(num_tokens, -1, self.head_size)
            key_rot = key[..., :self.rotary_dim]
            key_pass = key[..., self.rotary_dim:]
            key_rot = apply_rotary_emb_torch(key_rot, cos, sin,
                                             self.is_neox_style)
            key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
        return query, key

    def forward_xpu(
        self,
        positions: torch.Tensor,
        query: torch.Tensor,
        key: Optional[torch.Tensor] = None,
        offsets: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:

        # __setattr__ in nn.Module (called by `self.cos_sin_cache = ...`)
        # is expensive, so avoid calling it if possible
        if self.cos_sin_cache.device != query.device or \
                self.cos_sin_cache.dtype != query.dtype:
            self.cos_sin_cache = self.cos_sin_cache.to(query.device,
                                                       dtype=query.dtype)

        # ops.rotary_embedding()/batched_rotary_embedding()
        # are in-place operations that update the query and key tensors.
        if offsets is not None:
            raise NotImplementedError(
                "batched_rotary_embedding is not implemented yet.")
        else:
            ops.rotary_embedding(positions, query, key, self.head_size,
                                 self.cos_sin_cache, self.is_neox_style)
        return query, key

    def extra_repr(self) -> str:
        s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}"
        s += f", max_position_embeddings={self.max_position_embeddings}"
        s += f", base={self.base}, is_neox_style={self.is_neox_style}"
        return s
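
A sketch of how this class might be exercised in a test, assuming an XPU device and the built extension are available and that CustomOp subclasses behave as plain nn.Modules (so .to("xpu") moves the cos_sin_cache buffer). The pure-PyTorch forward_native path serves as the reference for the SYCL kernel; the sizes and tolerances below are illustrative, not taken from the commit.

import torch
from tests.ops.rotary_embedding_op import RotaryEmbedding

head_size, rotary_dim, max_pos, base = 64, 64, 2048, 10000.0
num_tokens, num_heads = 16, 8
dtype = torch.float16

rope = RotaryEmbedding(head_size, rotary_dim, max_pos, base,
                       is_neox_style=True, dtype=dtype).to("xpu")

positions = torch.randint(0, max_pos, (num_tokens,), device="xpu")
query = torch.randn(num_tokens, num_heads * head_size, device="xpu", dtype=dtype)
key = torch.randn(num_tokens, num_heads * head_size, device="xpu", dtype=dtype)

# forward_native is the pure-PyTorch reference; forward_xpu rotates its inputs
# in place via the SYCL kernel, so each path gets its own copies.
ref_q, ref_k = rope.forward_native(positions, query.clone(), key.clone())
out_q, out_k = rope.forward_xpu(positions, query.clone(), key.clone())

torch.testing.assert_close(out_q, ref_q, atol=1e-2, rtol=1e-2)
torch.testing.assert_close(out_k, ref_k, atol=1e-2, rtol=1e-2)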
