
Commit 7db2675

issue/923 - ninetoothed kv_caching
1 parent 1f4550c commit 7db2675

13 files changed (+456 -49 lines)

include/infinicore/ops/kv_caching.hpp

Lines changed: 1 addition & 6 deletions
@@ -1,4 +1,4 @@
-#pragma
+#pragma once
 
 #include "../device.hpp"
 #include "common/op.hpp"
@@ -15,11 +15,6 @@ class KVCaching {
     static common::OpDispatcher<schema> &dispatcher();
 };
 
-Tensor kv_caching(Tensor k_cache,
-                  Tensor v_cache,
-                  Tensor k,
-                  Tensor v,
-                  Tensor past_kv_lengths);
 void kv_caching_(Tensor k_cache,
                  Tensor v_cache,
                  Tensor k,

python/infinicore/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -45,6 +45,7 @@
 from infinicore.ops.add import add
 from infinicore.ops.add_rms_norm import add_rms_norm, add_rms_norm_
 from infinicore.ops.attention import attention
+from infinicore.ops.kv_caching import kv_caching
 from infinicore.ops.matmul import matmul
 from infinicore.ops.mul import mul
 from infinicore.ops.narrow import narrow
@@ -115,6 +116,7 @@
     "add_rms_norm",
     "add_rms_norm_",
     "attention",
+    "kv_caching",
     "matmul",
     "mul",
     "narrow",
Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+from infinicore.lib import _infinicore
+
+
+def kv_caching(k_cache, v_cache, k, v, past_kv_lengths):
+    _infinicore.kv_caching_(
+        k_cache._underlying,
+        v_cache._underlying,
+        k._underlying,
+        v._underlying,
+        past_kv_lengths._underlying,
+    )
+
+    return k_cache, v_cache
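
The wrapper above is the new public entry point exported from infinicore. A minimal usage sketch (illustrative only; the five arguments are assumed to be already-constructed infinicore tensors with the meanings described in the pybind11 docstring further down):

import infinicore

# k_cache, v_cache: preallocated cache tensors, updated in place
# k, v:             new key/value tensors for the current step
# past_kv_lengths:  int64 tensor holding each sequence's current length
k_cache, v_cache = infinicore.kv_caching(k_cache, v_cache, k, v, past_kv_lengths)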

src/infinicore/ops/kv_caching/kv_caching.cc

Lines changed: 0 additions & 9 deletions
@@ -28,15 +28,6 @@ void KVCaching::execute(Tensor k_cache,
     func(k_cache, v_cache, k, v, past_kv_lengths);
 }
 
-Tensor kv_caching(Tensor k_cache,
-                  Tensor v_cache,
-                  Tensor k,
-                  Tensor v,
-                  Tensor past_kv_lengths) {
-    KVCaching::execute(k_cache, v_cache, k, v, past_kv_lengths);
-    return k_cache; // or v_cache, depending on the intended use
-}
-
 void kv_caching_(Tensor k_cache,
                  Tensor v_cache,
                  Tensor k,

src/infinicore/pybind11/ops.hpp

Lines changed: 5 additions & 3 deletions
@@ -8,6 +8,7 @@
 #include "ops/causal_softmax.hpp"
 #include "ops/embedding.hpp"
 #include "ops/flash_attention.hpp"
+#include "ops/kv_caching.hpp"
 #include "ops/linear.hpp"
 #include "ops/matmul.hpp"
 #include "ops/mul.hpp"
@@ -30,20 +31,21 @@ inline void bind(py::module &m) {
     bind_add_rms_norm(m);
     bind_attention(m);
     bind_causal_softmax(m);
+    bind_embedding(m);
     bind_flash_attention(m);
-    bind_random_sample(m);
+    bind_kv_caching(m);
     bind_linear(m);
     bind_matmul(m);
     bind_mul(m);
     bind_paged_attention(m);
     bind_paged_attention_prefill(m);
     bind_paged_caching(m);
+    bind_random_sample(m);
     bind_rearrange(m);
     bind_rms_norm(m);
+    bind_rope(m);
     bind_silu(m);
     bind_swiglu(m);
-    bind_rope(m);
-    bind_embedding(m);
 }
 
 } // namespace infinicore::ops
Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/kv_caching.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_kv_caching(py::module &m) {
+    m.def("kv_caching_",
+          &op::kv_caching_,
+          py::arg("k_cache"),
+          py::arg("v_cache"),
+          py::arg("k"),
+          py::arg("v"),
+          py::arg("past_kv_lengths"),
+          R"doc(In-place Key-Value Caching.
+
+Updates the KV cache in-place with new key and value tensors.
+
+Args:
+    k_cache: Key cache tensor to update in-place
+    v_cache: Value cache tensor to update in-place
+    k: New key tensor to append
+    v: New value tensor to append
+    past_kv_lengths: Tensor containing current sequence lengths for each batch
+)doc");
+}
+
+} // namespace infinicore::ops
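
As a rough, framework-free sketch of the semantics described in the docstring (NumPy stands in for the real tensors, and a [batch, seq, ...] layout is assumed purely for illustration; the actual layout is fixed by the tensor descriptors):

import numpy as np

def kv_caching_reference(k_cache, v_cache, k, v, past_kv_lengths):
    # Illustration only: append each batch's new k/v rows at its current length.
    num_new = k.shape[1]  # new tokens per sequence on the assumed seq axis
    for b, pos in enumerate(past_kv_lengths):
        k_cache[b, pos:pos + num_new] = k[b]
        v_cache[b, pos:pos + num_new] = v[b]
    return k_cache, v_cache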

src/infiniop/ops/flash_attention/ninetoothed/descriptor.h

Lines changed: 20 additions & 18 deletions
@@ -67,24 +67,26 @@ class Descriptor final : public InfiniopDescriptor {
         constexpr auto block_size_m_{64};
         constexpr auto block_size_n_{64};
 
-        launch_flash_attention(stream,
-                               query,
-                               key,
-                               value,
-                               attn_mask,
-                               is_causal,
-                               scale,
-                               output,
-                               with_attn_mask,
-                               causal_variant,
-                               with_kv_cache_,
-                               emb_dim_,
-                               is_causal_,
-                               with_attn_mask_,
-                               causal_variant_,
-                               dtype_,
-                               block_size_m_,
-                               block_size_n_);
+        if (launch_flash_attention(stream,
+                                   query,
+                                   key,
+                                   value,
+                                   attn_mask,
+                                   is_causal,
+                                   scale,
+                                   output,
+                                   with_attn_mask,
+                                   causal_variant,
+                                   with_kv_cache_,
+                                   emb_dim_,
+                                   is_causal_,
+                                   with_attn_mask_,
+                                   causal_variant_,
+                                   dtype_,
+                                   block_size_m_,
+                                   block_size_n_)) {
+            return INFINI_STATUS_NOT_IMPLEMENTED;
+        }
 
         return INFINI_STATUS_SUCCESS;
     }
Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+import ninetoothed
+from . import kv_caching
+
+import infiniop.ninetoothed.build
+
+
+def build():
+    dtype_values = (
+        ninetoothed.float16,
+        ninetoothed.bfloat16,
+        ninetoothed.float32,
+    )
+
+    constexpr_param_grid = {
+        "emb_dim": (1, 16, 32, 64, 128, 256),
+        "dtype": dtype_values,
+        "block_size_m": (64,),
+        "block_size_n": (64,),
+    }
+
+    infiniop.ninetoothed.build.build(
+        kv_caching.premake,
+        constexpr_param_grid,
+        caller="cuda",
+        op_name="kv_caching",
+        output_dir=infiniop.ninetoothed.build.BUILD_DIRECTORY_PATH,
+    )
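
The build step presumably expands constexpr_param_grid into one kernel specialization per grid point; a single point would reach the kernel's premake (defined in the module further down) roughly like this:

# One point of the grid, passed through to premake (sketch only).
arrangement_, application, tensors = kv_caching.premake(
    emb_dim=64,
    dtype=ninetoothed.float16,
    block_size_m=64,
    block_size_n=64,
)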
Lines changed: 101 additions & 0 deletions
@@ -0,0 +1,101 @@
+#ifndef KV_CACHING_H
+#define KV_CACHING_H
+
+#include "../../../handle.h"
+#include "../../../operator.h"
+#include "../../../tensor.h"
+
+#include "../../../../../build/ninetoothed/kv_caching.h"
+#include "../../../ninetoothed/utils.h"
+
+namespace op::kv_caching::ninetoothed {
+class Descriptor final : public InfiniopDescriptor {
+
+public:
+    Descriptor(
+        infiniopHandle_t handle,
+        infiniopTensorDescriptor_t k_cache_desc,
+        infiniopTensorDescriptor_t v_cache_desc,
+        infiniopTensorDescriptor_t k_desc,
+        infiniopTensorDescriptor_t v_desc,
+        infiniopTensorDescriptor_t past_kv_lengths_desc) : InfiniopDescriptor{handle->device, handle->device_id},
+                                                           k_cache_shape_{k_cache_desc->shape()},
+                                                           k_cache_strides_{k_cache_desc->strides()},
+                                                           v_cache_shape_{v_cache_desc->shape()},
+                                                           v_cache_strides_{v_cache_desc->strides()},
+                                                           k_shape_{k_desc->shape()},
+                                                           k_strides_{k_desc->strides()},
+                                                           v_shape_{v_desc->shape()},
+                                                           v_strides_{v_desc->strides()},
+                                                           past_kv_lengths_shape_{past_kv_lengths_desc->shape()},
+                                                           past_kv_lengths_strides_{past_kv_lengths_desc->strides()},
+                                                           dtype_{k_desc->dtype()} {}
+
+    ~Descriptor() = default;
+
+    size_t get_workspace_size() const { return 0; };
+
+    static infiniStatus_t create(
+        infiniopHandle_t handle,
+        Descriptor **desc_ptr,
+        infiniopTensorDescriptor_t k_cache,
+        infiniopTensorDescriptor_t v_cache,
+        infiniopTensorDescriptor_t k,
+        infiniopTensorDescriptor_t v,
+        infiniopTensorDescriptor_t past_kv_lengths) {
+        *desc_ptr = new Descriptor{handle, k_cache, v_cache, k, v, past_kv_lengths};
+        return INFINI_STATUS_SUCCESS;
+    }
+
+    infiniStatus_t calculate(
+        void *workspace, size_t workspace_size,
+        void *k_cache,
+        void *v_cache,
+        const void *k,
+        const void *v,
+        const void *past_kv_lengths,
+        void *stream) const {
+        auto k_cache_nt{::ninetoothed::Tensor{k_cache, k_cache_shape_, k_cache_strides_}};
+        auto v_cache_nt{::ninetoothed::Tensor{v_cache, v_cache_shape_, v_cache_strides_}};
+        auto k_nt{::ninetoothed::Tensor{k, k_shape_, k_strides_}};
+        auto v_nt{::ninetoothed::Tensor{v, v_shape_, v_strides_}};
+        auto past_kv_lengths_nt{::ninetoothed::Tensor{past_kv_lengths, past_kv_lengths_shape_, past_kv_lengths_strides_}};
+
+        if (launch_kv_caching(stream,
+                              k_cache_nt,
+                              v_cache_nt,
+                              k_nt,
+                              v_nt,
+                              past_kv_lengths_nt,
+                              k_shape_[3],
+                              dtype_,
+                              64, 64)) {
+            return INFINI_STATUS_NOT_IMPLEMENTED;
+        }
+
+        return INFINI_STATUS_SUCCESS;
+    }
+
+private:
+    using Size = ::ninetoothed::Tensor<>::Size;
+    using Stride = ::ninetoothed::Tensor<>::Stride;
+
+    std::vector<Size> k_cache_shape_;
+    std::vector<Stride> k_cache_strides_;
+
+    std::vector<Size> v_cache_shape_;
+    std::vector<Stride> v_cache_strides_;
+
+    std::vector<Size> k_shape_;
+    std::vector<Stride> k_strides_;
+    std::vector<Size> v_shape_;
+    std::vector<Stride> v_strides_;
+
+    std::vector<Size> past_kv_lengths_shape_;
+    std::vector<Stride> past_kv_lengths_strides_;
+
+    infiniDtype_t dtype_;
+};
+} // namespace op::kv_caching::ninetoothed
+
+#endif // KV_CACHING_H
Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
+import functools
+import ninetoothed
+from ninetoothed import Tensor
+
+
+def arrangement(
+    k_cache,
+    v_cache,
+    k,
+    v,
+    past_lengths,
+    block_size_m=ninetoothed.block_size(),
+    block_size_n=ninetoothed.block_size(),
+):
+    k_cache_arranged = k_cache.tile((1, block_size_m, 1, -1)).tile((1, 1, -1, 1))
+    v_cache_arranged = v_cache.tile((1, block_size_m, 1, -1)).tile((1, 1, -1, 1))
+
+    k_arranged = k.tile((1, block_size_m, 1, -1)).tile((1, 1, -1, 1))
+    v_arranged = v.tile((1, block_size_m, 1, -1)).tile((1, 1, -1, 1))
+
+    past_lengths_arranged = (
+        past_lengths.tile((1,))
+        .unsqueeze(1)
+        .unsqueeze(2)
+        .unsqueeze(3)
+        .unsqueeze(4)
+        .expand((-1, *k_arranged.shape))
+    )
+
+    return (
+        k_cache_arranged,
+        v_cache_arranged,
+        k_arranged,
+        v_arranged,
+        past_lengths_arranged,
+    )
+
+
+def application(k_cache, v_cache, k, v, past_lengths):
+    pos = past_lengths
+
+    for i in range(k.shape[-2]):
+        k_cache[0, 0, pos + i, 0] = k[0, 0, i, 0]
+        v_cache[0, 0, pos + i, 0] = v[0, 0, i, 0]
+
+
+def premake(emb_dim=None, dtype=None, block_size_m=None, block_size_n=None):
+    arrangement_ = functools.partial(
+        arrangement, block_size_m=block_size_m, block_size_n=block_size_n
+    )
+
+    shape_options = (None, None, None, {"constexpr": True, "upper_bound": 256})
+
+    tensors = (
+        Tensor(4, dtype=dtype, shape_options=shape_options),
+        Tensor(4, dtype=dtype, shape_options=shape_options),
+        Tensor(4, dtype=dtype, shape_options=shape_options),
+        Tensor(4, dtype=dtype, shape_options=shape_options),
+        Tensor(1, dtype=ninetoothed.int64),
+    )
+
+    if emb_dim is not None:
+        for tensor in tensors:
+            tensor.shape = tensor.shape[:-1] + (emb_dim,)
+
+    return arrangement_, application, tensors
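
In application, each program instance walks the new tokens of its tile and writes them into the cache at an offset of past_lengths. A plain-Python sketch of that indexing on a single sequence slice (NumPy arrays standing in for ninetoothed tiles; shapes are made up for illustration):

import numpy as np

def apply_slice(cache, new, pos):
    # Mirrors the loop over k.shape[-2]: shift every new row by pos.
    for i in range(new.shape[0]):
        cache[pos + i] = new[i]

cache = np.zeros((16, 8), dtype=np.float16)  # max_seq x emb_dim (toy sizes)
new = np.ones((3, 8), dtype=np.float16)      # three new tokens
apply_slice(cache, new, pos=5)
assert np.array_equal(cache[5:8], new)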
