Commit 499b1dc

issue/867 pass total kv lens as paged attn args
1 parent 0a2839a commit 499b1dc

14 files changed, +136 −122 lines

include/infinicore/ops/paged_attention_prefill.hpp

Lines changed: 6 additions & 6 deletions
@@ -16,15 +16,15 @@ class PagedAttentionPrefill {
 * 3. k_cache: Physical Key cache (Paged format)
 * 4. v_cache: Physical Value cache (Paged format)
 * 5. block_tables: Mapping table from logical blocks to physical blocks
-* 6. history_lens: Historical KV lengths (existing length of each sequence in cache)
+* 6. total_kv_lens: lengths of Complete Key/Value for each request
 * 7. cu_seqlens_q: Cumulative sequence lengths of Query (prefix sum for variable-length batch)
 * 8. alibi_slopes: ALiBi bias slopes (optional)
 * 9. scale: Scaling factor (typically 1/sqrt(head_size))
 */
 using schema = void (*)(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, std::optional<Tensor>, float);

 static void execute(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache,
-Tensor block_tables, Tensor history_lens, Tensor cu_seqlens_q,
+Tensor block_tables, Tensor total_kv_lens, Tensor cum_seqlens_q,
 std::optional<Tensor> alibi_slopes, float scale);

 static common::OpDispatcher<schema> &dispatcher();
@@ -34,8 +34,8 @@ Tensor paged_attention_prefill(Tensor q,
 Tensor k_cache,
 Tensor v_cache,
 Tensor block_tables,
-Tensor history_lens,
-Tensor cu_seqlens_q,
+Tensor total_kv_lens,
+Tensor cum_seqlens_q,
 std::optional<Tensor> alibi_slopes,
 float scale);

@@ -44,8 +44,8 @@ void paged_attention_prefill_(Tensor out,
 Tensor k_cache,
 Tensor v_cache,
 Tensor block_tables,
-Tensor history_lens,
-Tensor cu_seqlens_q,
+Tensor total_kv_lens,
+Tensor cum_seqlens_q,
 std::optional<Tensor> alibi_slopes,
 float scale);

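For orientation, a minimal call sketch against the renamed entry point follows. It is illustrative only: the wrapper function run_prefill, the assumption that Tensor lives in the infinicore namespace, and the layout comments are not part of this commit; only the argument order and the new names total_kv_lens / cum_seqlens_q follow the header above.

// Hypothetical sketch, assuming the relevant infinicore headers are included
// and the input tensors were already built with the documented layouts.
#include <cmath>
#include <cstddef>
#include <optional>

infinicore::Tensor run_prefill(infinicore::Tensor q,             // query tokens for the whole batch
                               infinicore::Tensor k_cache,       // paged key cache
                               infinicore::Tensor v_cache,       // paged value cache
                               infinicore::Tensor block_tables,  // logical-to-physical block mapping, int64
                               infinicore::Tensor total_kv_lens, // total KV length per request, int64
                               infinicore::Tensor cum_seqlens_q, // prefix sum of query lengths, int64
                               size_t head_size) {
    const float scale = 1.0f / std::sqrt(static_cast<float>(head_size)); // 1/sqrt(head_size), as documented
    return infinicore::op::paged_attention_prefill(q, k_cache, v_cache, block_tables,
                                                   total_kv_lens, cum_seqlens_q,
                                                   /*alibi_slopes=*/std::nullopt, scale);
}
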
include/infiniop/ops/paged_attention_prefill.h

Lines changed: 4 additions & 4 deletions
@@ -20,7 +20,7 @@ typedef struct InfiniopDescriptor *infiniopPagedAttentionPrefillDescriptor_t;
 * Shape: [max_num_blocks, num_kv_heads, block_size, head_size]
 * @param block_tables_desc Descriptor for the block tables mapping logic to physical blocks.
 * Shape: [batch_size, max_blocks_per_seq]
-* @param history_lens_desc Descriptor for the KV history lengths of each sequence.
+* @param seq_lens_desc Descriptor for the total KV lengths of each sequence.
 * Shape: [batch_size]
 * @param cum_seq_lens_q_desc Descriptor for the cumulative start position (prefix sum) of each Q sequence.
 * Shape: [batch_size + 1]
@@ -37,7 +37,7 @@ __C __export infiniStatus_t infiniopCreatePagedAttentionPrefillDescriptor(
 infiniopTensorDescriptor_t k_cache_desc,
 infiniopTensorDescriptor_t v_cache_desc,
 infiniopTensorDescriptor_t block_tables_desc,
-infiniopTensorDescriptor_t history_lens_desc,
+infiniopTensorDescriptor_t seq_lens_desc,
 infiniopTensorDescriptor_t cum_seq_lens_q_desc,
 infiniopTensorDescriptor_t alibi_slopes_desc,
 float scale);
@@ -58,7 +58,7 @@ __C __export infiniStatus_t infiniopGetPagedAttentionPrefillWorkspaceSize(
 * @param k_cache Pointer to the global key cache data.
 * @param v_cache Pointer to the global value cache data.
 * @param block_tables Pointer to the block tables data.
-* @param history_lens Pointer to the KV history lengths data.
+* @param seq_lens Pointer to the KV lengths data.
 * @param cum_seq_lens_q Pointer to the Q cumulative sequence lengths data (prefix sum).
 * @param alibi_slopes Pointer to the ALiBi slopes data. Can be NULL.
 * @param stream The device stream (e.g., cudaStream_t) for the operation.
@@ -73,7 +73,7 @@ __C __export infiniStatus_t infiniopPagedAttentionPrefill(
 const void *k_cache,
 const void *v_cache,
 const void *block_tables,
-const void *history_lens,
+const void *seq_lens,
 const void *cum_seq_lens_q,
 const void *alibi_slopes,
 void *stream);

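The renamed seq_lens tensor and cum_seq_lens_q are related through their shapes, [batch_size] versus [batch_size + 1]. A hypothetical host-side layout for a two-request batch is sketched below; the numbers are invented for illustration and are not part of this commit.

// Hypothetical values: request 0 has 4 cached tokens plus 3 new query tokens,
// request 1 has 0 cached tokens plus 5 new query tokens.
int64_t seq_lens[2]       = {7, 5};    // total KV length per request, shape [batch_size]
int64_t cum_seq_lens_q[3] = {0, 3, 8}; // prefix sum of query lengths, shape [batch_size + 1]
// With block_size = 4, block_tables would need ceil(7/4) = 2 physical blocks for
// request 0 and ceil(5/4) = 2 for request 1, within [batch_size, max_blocks_per_seq].
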
src/infinicore/ops/paged_attention/paged_attention.cc

Lines changed: 7 additions & 7 deletions
@@ -9,20 +9,20 @@ common::OpDispatcher<PagedAttention::schema> &PagedAttention::dispatcher() {
 return dispatcher_;
 };

-void PagedAttention::execute(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, std::optional<Tensor> alibi_slopes, float scale) {
-INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, q, k_cache, v_cache, block_tables, cache_lens);
+void PagedAttention::execute(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor kv_lens, std::optional<Tensor> alibi_slopes, float scale) {
+INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, q, k_cache, v_cache, block_tables, kv_lens);
 infinicore::context::setDevice(out->device());
-dispatcher().lookup(out->device().getType())(out, q, k_cache, v_cache, block_tables, cache_lens, alibi_slopes, scale);
+dispatcher().lookup(out->device().getType())(out, q, k_cache, v_cache, block_tables, kv_lens, alibi_slopes, scale);
 }

-Tensor paged_attention(Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, std::optional<Tensor> alibi_slopes, float scale) {
+Tensor paged_attention(Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor kv_lens, std::optional<Tensor> alibi_slopes, float scale) {
 auto out = Tensor::empty(q->shape(), q->dtype(), q->device());
-paged_attention_(out, q, k_cache, v_cache, block_tables, cache_lens, alibi_slopes, scale);
+paged_attention_(out, q, k_cache, v_cache, block_tables, kv_lens, alibi_slopes, scale);
 return out;
 }

-void paged_attention_(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, std::optional<Tensor> alibi_slopes, float scale) {
-PagedAttention::execute(out, q, k_cache, v_cache, block_tables, cache_lens, alibi_slopes, scale);
+void paged_attention_(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor kv_lens, std::optional<Tensor> alibi_slopes, float scale) {
+PagedAttention::execute(out, q, k_cache, v_cache, block_tables, kv_lens, alibi_slopes, scale);
 }

 } // namespace infinicore::op

src/infinicore/ops/paged_attention/paged_attention_infiniop.cc

Lines changed: 4 additions & 4 deletions
@@ -15,8 +15,8 @@ thread_local common::OpCache<size_t, infiniopPagedAttentionDescriptor_t> caches(
 }
 });

-void calculate(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, std::optional<Tensor> alibi_slopes, float scale) {
-size_t seed = hash_combine(out, q, k_cache, v_cache, block_tables, cache_lens, alibi_slopes, scale);
+void calculate(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor kv_lens, std::optional<Tensor> alibi_slopes, float scale) {
+size_t seed = hash_combine(out, q, k_cache, v_cache, block_tables, kv_lens, alibi_slopes, scale);

 auto device = context::getDevice();
 auto &cache = caches.getCache(device);
@@ -27,7 +27,7 @@ void calculate(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor bloc
 if (!desc_opt) {
 INFINICORE_CHECK_ERROR(infiniopCreatePagedAttentionDescriptor(
 context::getInfiniopHandle(device), &desc,
-out->desc(), q->desc(), k_cache->desc(), v_cache->desc(), block_tables->desc(), cache_lens->desc(),
+out->desc(), q->desc(), k_cache->desc(), v_cache->desc(), block_tables->desc(), kv_lens->desc(),
 alibi_slopes.has_value() ? alibi_slopes.value()->desc() : nullptr,
 scale));
 cache.put(seed, desc);
@@ -41,7 +41,7 @@ void calculate(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor bloc

 INFINICORE_CHECK_ERROR(infiniopPagedAttention(
 desc, workspace->data(), workspace_size,
-out->data(), q->data(), k_cache->data(), v_cache->data(), block_tables->data(), cache_lens->data(),
+out->data(), q->data(), k_cache->data(), v_cache->data(), block_tables->data(), kv_lens->data(),
 alibi_slopes.has_value() ? alibi_slopes.value()->data() : nullptr,
 context::getStream()));
 }

src/infinicore/ops/paged_attention_prefill/paged_attention_prefill.cc

Lines changed: 7 additions & 8 deletions
@@ -10,31 +10,30 @@ common::OpDispatcher<PagedAttentionPrefill::schema> &PagedAttentionPrefill::disp
 };

 void PagedAttentionPrefill::execute(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache,
-Tensor block_tables, Tensor history_lens, Tensor cu_seqlens_q,
+Tensor block_tables, Tensor kv_lens, Tensor cum_seqlens_q,
 std::optional<Tensor> alibi_slopes, float scale) {
-
-INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, q, k_cache, v_cache, block_tables, history_lens, cu_seqlens_q);
+INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, q, k_cache, v_cache, block_tables, kv_lens, cum_seqlens_q);

 infinicore::context::setDevice(out->device());

 dispatcher().lookup(out->device().getType())(out, q, k_cache, v_cache, block_tables,
-history_lens, cu_seqlens_q, alibi_slopes, scale);
+kv_lens, cum_seqlens_q, alibi_slopes, scale);
 }

 Tensor paged_attention_prefill(Tensor q, Tensor k_cache, Tensor v_cache,
-Tensor block_tables, Tensor history_lens, Tensor cu_seqlens_q,
+Tensor block_tables, Tensor kv_lens, Tensor cum_seqlens_q,
 std::optional<Tensor> alibi_slopes, float scale) {

 auto out = Tensor::empty(q->shape(), q->dtype(), q->device());
-paged_attention_prefill_(out, q, k_cache, v_cache, block_tables, history_lens, cu_seqlens_q, alibi_slopes, scale);
+paged_attention_prefill_(out, q, k_cache, v_cache, block_tables, kv_lens, cum_seqlens_q, alibi_slopes, scale);
 return out;
 }

 void paged_attention_prefill_(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache,
-Tensor block_tables, Tensor history_lens, Tensor cu_seqlens_q,
+Tensor block_tables, Tensor kv_lens, Tensor cum_seqlens_q,
 std::optional<Tensor> alibi_slopes, float scale) {

-PagedAttentionPrefill::execute(out, q, k_cache, v_cache, block_tables, history_lens, cu_seqlens_q, alibi_slopes, scale);
+PagedAttentionPrefill::execute(out, q, k_cache, v_cache, block_tables, kv_lens, cum_seqlens_q, alibi_slopes, scale);
 }

 } // namespace infinicore::op

src/infinicore/ops/paged_attention_prefill/paged_attention_prefill_infiniop.cc

Lines changed: 6 additions & 7 deletions
@@ -16,10 +16,9 @@ thread_local common::OpCache<size_t, infiniopPagedAttentionPrefillDescriptor_t>
 });

 void calculate(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache,
-Tensor block_tables, Tensor history_lens, Tensor cu_seqlens_q,
+Tensor block_tables, Tensor kv_lens, Tensor cum_seqlens_q,
 std::optional<Tensor> alibi_slopes, float scale) {
-
-size_t seed = hash_combine(out, q, k_cache, v_cache, block_tables, history_lens, cu_seqlens_q, alibi_slopes, scale);
+size_t seed = hash_combine(out, q, k_cache, v_cache, block_tables, kv_lens, cum_seqlens_q, alibi_slopes, scale);

 auto device = context::getDevice();
 auto &cache = caches.getCache(device);
@@ -35,8 +34,8 @@ void calculate(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache,
 k_cache->desc(),
 v_cache->desc(),
 block_tables->desc(),
-history_lens->desc(),
-cu_seqlens_q->desc(),
+kv_lens->desc(),
+cum_seqlens_q->desc(),
 alibi_slopes.has_value() ? alibi_slopes.value()->desc() : nullptr,
 scale));
 cache.put(seed, desc);
@@ -57,8 +56,8 @@ void calculate(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache,
 k_cache->data(),
 v_cache->data(),
 block_tables->data(),
-history_lens->data(),
-cu_seqlens_q->data(),
+kv_lens->data(),
+cum_seqlens_q->data(),
 alibi_slopes.has_value() ? alibi_slopes.value()->data() : nullptr,
 context::getStream()));
 }

src/infinicore/pybind11/ops/paged_attention_prefill.hpp

Lines changed: 2 additions & 1 deletion
@@ -19,7 +19,8 @@ Tensor py_paged_attention_prefill(Tensor q,
 if (!alibi_slopes.is_none()) {
 alibi_slopes_tensor = alibi_slopes.cast<Tensor>();
 }
-return op::paged_attention_prefill(q, k_cache, v_cache, block_tables, history_lens, cu_seqlens_q, alibi_slopes_tensor, scale);
+return op::paged_attention_prefill(
+q, k_cache, v_cache, block_tables, history_lens, cu_seqlens_q, alibi_slopes_tensor, scale);
 }

 void py_paged_attention_prefill_(Tensor out,

src/infiniop/ops/paged_attention_prefill/cuda/kernel.cuh

Lines changed: 19 additions & 16 deletions
@@ -22,12 +22,13 @@ template <typename Tdata, typename Tcompute>
 __global__ void pagedAttentionPrefillKernel(
 Tdata *out_, const Tdata *q_, const Tdata *k_cache_, const Tdata *v_cache_,
 const int64_t *block_tables_,
-const int64_t *history_lens_,
+const int64_t *total_kv_lens_,
 const int64_t *cum_seq_lens_q_,
 const float *alibi_slopes_,
 const size_t num_heads, const size_t num_kv_heads, const float scale,
 const size_t max_num_blocks_per_seq, const size_t block_size,
 const ptrdiff_t kv_block_stride, const ptrdiff_t kv_head_stride,
+const ptrdiff_t q_stride, const ptrdiff_t q_head_stride,
 const size_t head_size,
 const size_t num_seqs) {

@@ -44,10 +45,12 @@ __global__ void pagedAttentionPrefillKernel(

 size_t q_token_idx = global_token_idx - cum_seq_lens_q_[seq_idx];

-const int64_t history_len = history_lens_[seq_idx];
-const int64_t causal_limit = history_len + q_token_idx;
+const size_t total_kv_len = total_kv_lens_[seq_idx];
+const size_t q_len = cum_seq_lens_q_[seq_idx + 1] - cum_seq_lens_q_[seq_idx];
+const size_t history_len = total_kv_len - q_len;
+const size_t causal_limit = history_len + q_token_idx;

-const Tdata *q_vec = q_ + global_token_idx * num_heads * head_size + head_idx * head_size;
+const Tdata *q_vec = q_ + global_token_idx * q_stride + head_idx * q_head_stride;
 Tdata *out_ptr = out_ + global_token_idx * num_heads * head_size + head_idx * head_size;

 const size_t num_queries_per_kv = num_heads / num_kv_heads;
@@ -57,10 +60,10 @@ __global__ void pagedAttentionPrefillKernel(
 const float alibi_slope = (alibi_slopes_ == nullptr) ? 0.0f : alibi_slopes_[head_idx];

 Tcompute max_score = -FLT_MAX;
-for (int64_t t = 0; t <= causal_limit; ++t) {
-const int64_t b_idx = t / block_size;
-const int64_t t_off = t % block_size;
-const int64_t physical_block_id = block_table[b_idx];
+for (size_t t = 0; t <= causal_limit; ++t) {
+const size_t b_idx = t / block_size;
+const size_t t_off = t % block_size;
+const ptrdiff_t physical_block_id = block_table[b_idx];
 const Tdata *k_vec = k_cache_ + physical_block_id * kv_block_stride + kv_head_idx * kv_head_stride + t_off * head_size;

 Tcompute score = 0.0f;
@@ -77,10 +80,10 @@ __global__ void pagedAttentionPrefillKernel(
 }

 Tcompute sum_exp = 0.0f;
-for (int64_t t = 0; t <= causal_limit; ++t) {
-const int64_t b_idx = t / block_size;
-const int64_t t_off = t % block_size;
-const int64_t physical_block_id = block_table[b_idx];
+for (size_t t = 0; t <= causal_limit; ++t) {
+const size_t b_idx = t / block_size;
+const size_t t_off = t % block_size;
+const ptrdiff_t physical_block_id = block_table[b_idx];
 const Tdata *k_vec = k_cache_ + physical_block_id * kv_block_stride + kv_head_idx * kv_head_stride + t_off * head_size;

 Tcompute score = 0.0f;
@@ -96,10 +99,10 @@ __global__ void pagedAttentionPrefillKernel(

 Tcompute acc = 0.0f;
 Tcompute inv_sum = 1.0f / (sum_exp + 1e-6f);
-for (int64_t t = 0; t <= causal_limit; ++t) {
-const int64_t b_idx = t / block_size;
-const int64_t t_off = t % block_size;
-const int64_t physical_block_id = block_table[b_idx];
+for (size_t t = 0; t <= causal_limit; ++t) {
+const size_t b_idx = t / block_size;
+const size_t t_off = t % block_size;
+const ptrdiff_t physical_block_id = block_table[b_idx];

 const Tdata *k_vec = k_cache_ + physical_block_id * kv_block_stride + kv_head_idx * kv_head_stride + t_off * head_size;
 Tcompute score = 0.0f;

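The behavioural change is in the second hunk: the kernel no longer receives the history length directly, it derives it from the total KV length and the query prefix sums. A worked example of that arithmetic, with made-up numbers:

// Illustrative numbers only. Suppose total_kv_lens_[seq_idx] = 7 and
// cum_seq_lens_q_ = {0, 3, 8}, so sequence 0 contributes 3 new query tokens.
//   q_len        = cum_seq_lens_q_[1] - cum_seq_lens_q_[0]  = 3
//   history_len  = total_kv_len - q_len                     = 7 - 3 = 4
//   causal_limit = history_len + q_token_idx                = 4, 5, 6 for q_token_idx = 0, 1, 2
// Each new token attends to all cached tokens plus the new tokens up to and
// including itself (the loops run t = 0 .. causal_limit), which matches the
// behaviour previously obtained by passing history_lens_ directly.
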
src/infiniop/ops/paged_attention_prefill/info.h

Lines changed: 10 additions & 7 deletions
@@ -25,6 +25,7 @@ class PagedAttentionPrefillInfo {
 size_t total_q_tokens;

 ptrdiff_t q_stride;
+ptrdiff_t q_head_stride;
 ptrdiff_t kv_block_stride;
 ptrdiff_t kv_head_stride;
 ptrdiff_t o_stride;
@@ -35,7 +36,7 @@ class PagedAttentionPrefillInfo {
 infiniopTensorDescriptor_t k_cache_desc,
 infiniopTensorDescriptor_t v_cache_desc,
 infiniopTensorDescriptor_t block_tables_desc,
-infiniopTensorDescriptor_t history_lens_desc,
+infiniopTensorDescriptor_t seq_lens_desc,
 infiniopTensorDescriptor_t cum_seq_lens_q_desc,
 const std::optional<infiniopTensorDescriptor_t> &alibi_slopes_desc,
 float scale) {
@@ -47,7 +48,7 @@ class PagedAttentionPrefillInfo {
 return INFINI_STATUS_BAD_TENSOR_DTYPE;
 }

-if (cum_seq_lens_q_desc->dtype() != INFINI_DTYPE_I64 || history_lens_desc->dtype() != INFINI_DTYPE_I64) {
+if (cum_seq_lens_q_desc->dtype() != INFINI_DTYPE_I64 || seq_lens_desc->dtype() != INFINI_DTYPE_I64) {
 return INFINI_STATUS_BAD_TENSOR_DTYPE;
 }

@@ -57,7 +58,7 @@ class PagedAttentionPrefillInfo {
 auto k_shape = k_cache_desc->shape();
 auto v_shape = v_cache_desc->shape();
 auto block_tables_shape = block_tables_desc->shape();
-auto history_lens_shape = history_lens_desc->shape();
+auto seq_lens_shape = seq_lens_desc->shape();
 auto cum_seq_lens_q_shape = cum_seq_lens_q_desc->shape();

 if (k_shape.size() != 4 || v_shape.size() != 4) {
@@ -68,10 +69,11 @@ class PagedAttentionPrefillInfo {
 return INFINI_STATUS_BAD_TENSOR_SHAPE;
 }

-if (history_lens_shape.size() != 1 || cum_seq_lens_q_shape.size() != 1) {
+if (seq_lens_shape.size() != 1 || cum_seq_lens_q_shape.size() != 1) {
 return INFINI_STATUS_BAD_TENSOR_SHAPE;
 }
-if (cum_seq_lens_q_shape[0] != history_lens_shape[0] + 1) {
+
+if (cum_seq_lens_q_shape[0] != seq_lens_shape[0] + 1) {
 return INFINI_STATUS_BAD_PARAM;
 }

@@ -88,13 +90,13 @@ class PagedAttentionPrefillInfo {
 return INFINI_STATUS_BAD_PARAM;
 }

-size_t num_seqs = history_lens_shape[0];
-
+size_t num_seqs = seq_lens_shape[0];
 size_t num_kv_heads = k_shape[1];
 size_t block_size = k_shape[2];
 size_t max_num_blocks_per_seq = block_tables_shape[1];

 ptrdiff_t q_stride = q_desc->stride(0);
+ptrdiff_t q_head_stride = q_desc->stride(1);
 ptrdiff_t kv_block_stride = k_cache_desc->stride(0);
 ptrdiff_t kv_head_stride = k_cache_desc->stride(1);
 ptrdiff_t o_stride = out_desc->stride(0);
@@ -110,6 +112,7 @@ class PagedAttentionPrefillInfo {
 max_num_blocks_per_seq,
 total_q_tokens,
 q_stride,
+q_head_stride,
 kv_block_stride,
 kv_head_stride,
 o_stride});
