@@ -9,20 +9,31 @@ common::OpDispatcher<PagedAttentionPrefill::schema> &PagedAttentionPrefill::disp
     return dispatcher_;
 };
 
-void PagedAttentionPrefill::execute(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, Tensor seq_lens, Tensor seq_offsets, std::optional<Tensor> alibi_slopes, float scale) {
-    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, q, k_cache, v_cache, block_tables, cache_lens);
+void PagedAttentionPrefill::execute(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache,
+                                    Tensor block_tables, Tensor kv_lens, Tensor cum_seqlens_q,
+                                    std::optional<Tensor> alibi_slopes, float scale) {
+    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, q, k_cache, v_cache, block_tables, kv_lens, cum_seqlens_q);
+
     infinicore::context::setDevice(out->device());
-    dispatcher().lookup(out->device().getType())(out, q, k_cache, v_cache, block_tables, cache_lens, seq_lens, seq_offsets, alibi_slopes, scale);
+
+    dispatcher().lookup(out->device().getType())(out, q, k_cache, v_cache, block_tables,
+                                                 kv_lens, cum_seqlens_q, alibi_slopes, scale);
 }
 
-Tensor paged_attention_prefill(Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, Tensor seq_lens, Tensor seq_offsets, std::optional<Tensor> alibi_slopes, float scale) {
+Tensor paged_attention_prefill(Tensor q, Tensor k_cache, Tensor v_cache,
+                               Tensor block_tables, Tensor kv_lens, Tensor cum_seqlens_q,
+                               std::optional<Tensor> alibi_slopes, float scale) {
+
     auto out = Tensor::empty(q->shape(), q->dtype(), q->device());
-    paged_attention_prefill_(out, q, k_cache, v_cache, block_tables, cache_lens, seq_lens, seq_offsets, alibi_slopes, scale);
+    paged_attention_prefill_(out, q, k_cache, v_cache, block_tables, kv_lens, cum_seqlens_q, alibi_slopes, scale);
     return out;
 }
 
-void paged_attention_prefill_(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, Tensor seq_lens, Tensor seq_offsets, std::optional<Tensor> alibi_slopes, float scale) {
-    PagedAttentionPrefill::execute(out, q, k_cache, v_cache, block_tables, cache_lens, seq_lens, seq_offsets, alibi_slopes, scale);
+void paged_attention_prefill_(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache,
+                              Tensor block_tables, Tensor kv_lens, Tensor cum_seqlens_q,
+                              std::optional<Tensor> alibi_slopes, float scale) {
+
+    PagedAttentionPrefill::execute(out, q, k_cache, v_cache, block_tables, kv_lens, cum_seqlens_q, alibi_slopes, scale);
 }
 
 } // namespace infinicore::op
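
Caller-side note (not part of the commit): the diff replaces the old cache_lens/seq_lens/seq_offsets arguments with kv_lens and cum_seqlens_q. The sketch below shows how a call site might look against the new signature; only paged_attention_prefill and its parameter list come from this diff, while the run_prefill helper name and the shape comments are assumptions about the layout, not something this commit specifies.

#include <optional>

using namespace infinicore;

// Hypothetical wrapper: forwards already-constructed tensors to the updated operator.
Tensor run_prefill(Tensor q,             // assumed [total_q_tokens, num_q_heads, head_dim]
                   Tensor k_cache,       // assumed [num_blocks, block_size, num_kv_heads, head_dim]
                   Tensor v_cache,       // assumed same layout as k_cache
                   Tensor block_tables,  // assumed [num_seqs, max_blocks_per_seq]
                   Tensor kv_lens,       // assumed [num_seqs]: KV-cache length per sequence
                   Tensor cum_seqlens_q, // assumed [num_seqs + 1]: cumulative query lengths
                   float scale) {
    // No ALiBi bias in this sketch, so alibi_slopes is std::nullopt.
    return op::paged_attention_prefill(q, k_cache, v_cache, block_tables,
                                       kv_lens, cum_seqlens_q,
                                       std::nullopt, scale);
}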