@@ -13,177 +13,162 @@ namespace op::reshape_and_cache::metax {
1313
1414using Fp8KVCacheDataType = op::paged_attention_v2::vllm::Fp8KVCacheDataType;
1515
16-
17-
// Element-wise copy/convert functor used by the vectorization utilities.
// Copies one InT element into an OutT slot; the fp8 scaled-conversion path is
// not implemented yet on this backend (the scale member is reserved for it).
template <typename OutT, typename InT, Fp8KVCacheDataType kv_dt>
struct CopyWithScaleOp {
    float scale;  // conversion scale; only meaningful for the (disabled) fp8 path

    __device__ __forceinline__ void operator()(OutT &dst, const InT src) const {
        if constexpr (kv_dt != Fp8KVCacheDataType::kAuto) {
            // fp8 scaled conversion is not wired up on this backend yet.
            // dst = fp8::scaled_convert<OutT, InT, kv_dt>(src, scale);
            assert(false);
        } else {
            dst = static_cast<OutT>(src);
        }
    }
};
3230
33-
34-
35-
// Vectorization container: a fixed-size array of vec_size scalars whose
// alignment matches its total byte size, so a single aligned load/store can
// move the whole vector (e.g. 16 floats -> one 64-byte transaction).
template <typename scalar_t, size_t vec_size>
struct __align__(vec_size * sizeof(scalar_t)) vec_n_t {
    scalar_t val[vec_size];
};
4136
42-
// Adapter that lifts a scalar functor (InT -> OutT) to a vector functor
// (vec_n_t<InT, VEC_SIZE> -> vec_n_t<OutT, VEC_SIZE>) by applying it lane by
// lane. Used when the caller supplies only an element-wise operation.
template <int VEC_SIZE, typename InT, typename OutT, typename ScaOp>
struct DefaultVecOp {
    ScaOp scalar_op;  // element-wise operation applied to each lane

    __device__ __forceinline__ void operator()(
        vec_n_t<OutT, VEC_SIZE> &dst, const vec_n_t<InT, VEC_SIZE> &src) const {
#pragma unroll
        for (int lane = 0; lane < VEC_SIZE; ++lane) {
            scalar_op(dst.val[lane], src.val[lane]);
        }
    }
};
5549
// Grid/block-strided copy of `len` elements from `in` to `out`, vectorized in
// groups of VEC_SIZE wherever alignment allows. Threads are identified by
// (tid, stride) and cooperatively cover the range; each thread touches a
// disjoint set of indices, so no synchronization is required.
//
// vec_op:    vec_n_t<InT, VEC_SIZE> -> vec_n_t<OutT, VEC_SIZE>
// scalar_op: InT -> OutT
template <int VEC_SIZE, typename InT, typename OutT, typename VecOp,
          typename ScaOp>
__device__ inline void vectorize_with_alignment(
    const InT *in, OutT *out, int len, int tid, int stride,
    VecOp &&vec_op, ScaOp &&scalar_op) {
    static_assert(VEC_SIZE > 0 && (VEC_SIZE & (VEC_SIZE - 1)) == 0,
                  "VEC_SIZE must be a positive power-of-two");
    // Bytes covered by one vector transaction (e.g. 16 * 4 B = 64 B).
    constexpr int WIDTH = VEC_SIZE * sizeof(InT);
    const uintptr_t base_addr = reinterpret_cast<uintptr_t>(in);

    using vin_t = vec_n_t<InT, VEC_SIZE>;
    using vout_t = vec_n_t<OutT, VEC_SIZE>;

    // Fast path: input pointer already WIDTH-aligned and len is an exact
    // multiple of VEC_SIZE, so the entire range can go through the vector op.
    // NOTE(review): only the input alignment is checked; the output is assumed
    // to share it (currently guaranteed by the callers) — revisit if that
    // guarantee ever changes.
    const bool fully_vectorizable =
        ((base_addr & (WIDTH - 1)) == 0) && ((len & (VEC_SIZE - 1)) == 0);
    if (fully_vectorizable) {
        const int num_vec = len / VEC_SIZE;
        auto *v_in = reinterpret_cast<const vin_t *>(in);
        auto *v_out = reinterpret_cast<vout_t *>(out);

        for (int i = tid; i < num_vec; i += stride) {
            vout_t chunk;
            vec_op(chunk, v_in[i]);
            v_out[i] = chunk;
        }
        return;
    }

    // How many leading scalar elements are needed to reach the next WIDTH
    // boundary (0 when the pointer is already aligned).
    const int misalignment_offset = base_addr & (WIDTH - 1);  // addr % WIDTH
    const int alignment_bytes = WIDTH - misalignment_offset;  // bytes to boundary
    int prefix_elems = (alignment_bytes & (WIDTH - 1)) / static_cast<int>(sizeof(InT));
    prefix_elems = min(prefix_elems, len);  // clamp for very short ranges

    // 1. scalar prefix: elements before the first aligned boundary, where it
    //    is unsafe to vectorize.
    for (int i = tid; i < prefix_elems; i += stride) {
        scalar_op(out[i], in[i]);
    }

    in += prefix_elems;
    out += prefix_elems;
    len -= prefix_elems;

    const int num_vec = len / VEC_SIZE;
    auto *v_in = reinterpret_cast<const vin_t *>(in);
    auto *v_out = reinterpret_cast<vout_t *>(out);

    // 2. vectorized main body over the aligned middle section.
    for (int i = tid; i < num_vec; i += stride) {
        vout_t chunk;
        vec_op(chunk, v_in[i]);
        v_out[i] = chunk;
    }

    // 3. scalar tail: leftover elements after the last full vector.
    const int tail_start = num_vec * VEC_SIZE;
    for (int i = tid + tail_start; i < len; i += stride) {
        scalar_op(out[i], in[i]);
    }
}
123115
// Convenience overload: the caller supplies only an element-wise functor.
// Wrap it in DefaultVecOp so the aligned middle section still runs VEC_SIZE
// elements per iteration, and forward the scalar functor for the unaligned
// prefix/tail.
template <int VEC_SIZE, typename InT, typename OutT, typename ScaOp>
__device__ __forceinline__ void vectorize_with_alignment(const InT *in,
                                                         OutT *out, int len,
                                                         int tid, int stride,
                                                         ScaOp &&scalar_op) {
    using Vec = DefaultVecOp<VEC_SIZE, InT, OutT, std::decay_t<ScaOp>>;
    // Copy the scalar op into the vector adapter first, then forward the
    // original to the full overload.
    Vec vec_wrapper{scalar_op};
    vectorize_with_alignment<VEC_SIZE>(in, out, len, tid, stride, vec_wrapper,
                                       std::forward<ScaOp>(scalar_op));
}
133125
134-
// Scatter one token's K/V projections into the paged KV cache.
//
// Launch layout: one thread block per token (token_idx = blockIdx.x); threads
// stride over the num_heads * head_size elements of that token.
//
// key_cache layout:   [num_blocks, num_heads, head_size/x, block_size, x]
// value_cache layout: [num_blocks, num_heads, head_size, block_size]
// slot_mapping[token] < 0 marks a padding token that must be skipped.
// k_scale/v_scale are only used by the (currently disabled) fp8 path.
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
__global__ void reshape_and_cache_kernel(
    const scalar_t *__restrict__ key,    // [num_tokens, num_heads, head_size]
    const scalar_t *__restrict__ value,  // [num_tokens, num_heads, head_size]
    cache_t *__restrict__ key_cache,     // [num_blocks, num_heads, head_size/x,
                                         //  block_size, x]
    cache_t *__restrict__ value_cache,   // [num_blocks, num_heads, head_size,
                                         //  block_size]
    const int64_t *__restrict__ slot_mapping,  // [num_tokens]
    const int key_stride, const int value_stride, const int num_heads,
    const int head_size, const int block_size, const int x,
    const float *k_scale, const float *v_scale) {
    const int64_t token_idx = blockIdx.x;
    const int64_t slot_idx = slot_mapping[token_idx];
    if (slot_idx < 0) {
        // Padding token that should be ignored.
        return;
    }

    const int64_t block_idx = slot_idx / block_size;
    const int64_t block_offset = slot_idx % block_size;

    // Hoist loop-invariant integer work out of the per-element loop: integer
    // division is expensive on GPU and `head_size / x` plus the per-block base
    // offsets do not depend on the loop index. The arithmetic (including the
    // int -> int64 promotion via block_idx) matches the unhoisted form exactly.
    const int x_chunks = head_size / x;
    const int64_t key_block_base =
        block_idx * num_heads * x_chunks * block_size * x;
    const int64_t value_block_base =
        block_idx * num_heads * head_size * block_size;

    const int n = num_heads * head_size;
    for (int i = threadIdx.x; i < n; i += blockDim.x) {
        const int64_t src_key_idx = token_idx * key_stride + i;
        const int64_t src_value_idx = token_idx * value_stride + i;

        // Decompose flat element index i into (head, offset-within-head) and
        // the key cache's (x_idx, x_offset) sub-tiling.
        const int head_idx = i / head_size;
        const int head_offset = i % head_size;
        const int x_idx = head_offset / x;
        const int x_offset = head_offset % x;

        const int64_t tgt_key_idx =
            key_block_base + head_idx * x_chunks * block_size * x +
            x_idx * block_size * x + block_offset * x + x_offset;
        const int64_t tgt_value_idx =
            value_block_base + head_idx * head_size * block_size +
            head_offset * block_size + block_offset;
        scalar_t tgt_key = key[src_key_idx];
        scalar_t tgt_value = value[src_value_idx];
        if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
            key_cache[tgt_key_idx] = tgt_key;
            value_cache[tgt_value_idx] = tgt_value;
        } else {
            // fp8 scaled conversion is not wired up on this backend yet.
            // key_cache[tgt_key_idx] =
            //     fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_key, *k_scale);
            // value_cache[tgt_value_idx] =
            //     fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_value, *v_scale);
            assert(false);
        }
    }
}
189174} // namespace op::reshape_and_cache::metax
0 commit comments