Commit 3883f32

Merge pull request #883 from InfiniTensor/issue/867
Issue/867: adjust paged_attention_prefill interface naming
2 parents 3b5afff + 499b1dc commit 3883f32

File tree

17 files changed: +640 / -256 lines

include/infinicore/ops/paged_attention_prefill.hpp

Lines changed: 38 additions & 4 deletions
@@ -8,11 +8,45 @@ namespace infinicore::op {
 
 class PagedAttentionPrefill {
 public:
-    using schema = void (*)(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, std::optional<Tensor>, float);
-    static void execute(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, Tensor seq_lens, Tensor seq_offsets, std::optional<Tensor> alibi_slopes, float);
+    /**
+     * @brief PagedAttentionPrefill operator signature
+     * Argument order:
+     * 1. out: Output tensor (Packed format)
+     * 2. q: Current Query tensor (Packed format)
+     * 3. k_cache: Physical Key cache (Paged format)
+     * 4. v_cache: Physical Value cache (Paged format)
+     * 5. block_tables: Mapping table from logical blocks to physical blocks
+     * 6. total_kv_lens: Lengths of the complete Key/Value for each request
+     * 7. cu_seqlens_q: Cumulative sequence lengths of Query (prefix sum for variable-length batch)
+     * 8. alibi_slopes: ALiBi bias slopes (optional)
+     * 9. scale: Scaling factor (typically 1/sqrt(head_size))
+     */
+    using schema = void (*)(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, std::optional<Tensor>, float);
+
+    static void execute(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache,
+                        Tensor block_tables, Tensor total_kv_lens, Tensor cum_seqlens_q,
+                        std::optional<Tensor> alibi_slopes, float scale);
+
     static common::OpDispatcher<schema> &dispatcher();
 };
 
-Tensor paged_attention_prefill(Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, Tensor seq_lens, Tensor seq_offsets, std::optional<Tensor> alibi_slopes, float scale);
-void paged_attention_prefill_(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, Tensor seq_lens, Tensor seq_offsets, std::optional<Tensor> alibi_slopes, float scale);
+Tensor paged_attention_prefill(Tensor q,
+                               Tensor k_cache,
+                               Tensor v_cache,
+                               Tensor block_tables,
+                               Tensor total_kv_lens,
+                               Tensor cum_seqlens_q,
+                               std::optional<Tensor> alibi_slopes,
+                               float scale);
+
+void paged_attention_prefill_(Tensor out,
+                              Tensor q,
+                              Tensor k_cache,
+                              Tensor v_cache,
+                              Tensor block_tables,
+                              Tensor total_kv_lens,
+                              Tensor cum_seqlens_q,
+                              std::optional<Tensor> alibi_slopes,
+                              float scale);
+
 } // namespace infinicore::op
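
The new doc comment replaces the old cache_lens/seq_lens/seq_offsets triple with two bookkeeping tensors: total_kv_lens (the complete key/value length of each request) and a cumulative query-length vector, so that request i's query rows occupy the packed q tensor between consecutive prefix-sum entries. A minimal illustration of that bookkeeping in plain Python; the values and variable names are hypothetical and nothing below is part of this commit:

    from itertools import accumulate

    # Hypothetical per-request lengths for a batch of three prefill requests.
    history_lens = [128, 0, 512]   # tokens already resident in the paged KV cache
    new_q_lens = [16, 64, 1]       # query tokens being prefilled now

    # Complete KV length per request: history plus the tokens being added now.
    total_kv_lens = [h + n for h, n in zip(history_lens, new_q_lens)]

    # Exclusive prefix sum over query lengths; length is batch_size + 1.
    cu_seqlens_q = [0] + list(accumulate(new_q_lens))

    # Request i's queries are rows cu_seqlens_q[i]:cu_seqlens_q[i+1] of the packed q tensor.
    assert cu_seqlens_q[-1] == sum(new_q_lens)   # == total_q_tokens
    print(total_kv_lens)   # [144, 64, 513]
    print(cu_seqlens_q)    # [0, 16, 80, 81]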

include/infiniop/ops/paged_attention_prefill.h

Lines changed: 16 additions & 12 deletions
@@ -11,15 +11,22 @@ typedef struct InfiniopDescriptor *infiniopPagedAttentionPrefillDescriptor_t;
  * @param handle The handle to the InfiniOP library context.
  * @param desc_ptr A pointer to store the created descriptor.
  * @param out_desc Descriptor for the output tensor.
+ *                 Shape: [total_q_tokens, num_heads, head_size]
  * @param q_desc Descriptor for the query tensor (packed/flattened).
+ *               Shape: [total_q_tokens, num_heads, head_size]
  * @param k_cache_desc Descriptor for the global physical key cache.
+ *                     Shape: [max_num_blocks, num_kv_heads, block_size, head_size]
  * @param v_cache_desc Descriptor for the global physical value cache.
+ *                     Shape: [max_num_blocks, num_kv_heads, block_size, head_size]
  * @param block_tables_desc Descriptor for the block tables mapping logical blocks to physical blocks.
- * @param cache_lens_desc Descriptor for the total sequence lengths (history + current).
- * @param seq_lens_desc Descriptor for the current prefill sequence lengths.
- * @param offset_desc Descriptor for the start position of each sequence in the packed Q tensor.
+ *                          Shape: [batch_size, max_blocks_per_seq]
+ * @param seq_lens_desc Descriptor for the total KV lengths of each sequence.
+ *                      Shape: [batch_size]
+ * @param cum_seq_lens_q_desc Descriptor for the cumulative start position (prefix sum) of each Q sequence.
+ *                            Shape: [batch_size + 1]
  * @param alibi_slopes_desc Optional descriptor for the ALiBi slopes tensor. Can be NULL.
- * @param scale The attention scaling factor.
+ *                          Shape: [num_heads]
+ * @param scale The attention scaling factor (typically 1.0 / sqrt(head_size)).
  * @return infiniStatus_t Status code of the operation.
  */
 __C __export infiniStatus_t infiniopCreatePagedAttentionPrefillDescriptor(
@@ -30,9 +37,8 @@ __C __export infiniStatus_t infiniopCreatePagedAttentionPrefillDescriptor(
     infiniopTensorDescriptor_t k_cache_desc,
     infiniopTensorDescriptor_t v_cache_desc,
     infiniopTensorDescriptor_t block_tables_desc,
-    infiniopTensorDescriptor_t cache_lens_desc,
     infiniopTensorDescriptor_t seq_lens_desc,
-    infiniopTensorDescriptor_t offset_desc,
+    infiniopTensorDescriptor_t cum_seq_lens_q_desc,
     infiniopTensorDescriptor_t alibi_slopes_desc,
     float scale);
 
@@ -52,11 +58,10 @@ __C __export infiniStatus_t infiniopGetPagedAttentionPrefillWorkspaceSize(
  * @param k_cache Pointer to the global key cache data.
  * @param v_cache Pointer to the global value cache data.
  * @param block_tables Pointer to the block tables data.
- * @param cache_lens Pointer to the total sequence lengths data.
- * @param seq_lens Pointer to the current prefill sequence lengths data.
- * @param offset Pointer to the sequence start offsets data.
+ * @param seq_lens Pointer to the KV lengths data.
+ * @param cum_seq_lens_q Pointer to the Q cumulative sequence lengths data (prefix sum).
  * @param alibi_slopes Pointer to the ALiBi slopes data. Can be NULL.
- * @param stream The CUDA/device stream for the operation.
+ * @param stream The device stream (e.g., cudaStream_t) for the operation.
  * @return infiniStatus_t Status code of the operation.
  */
 __C __export infiniStatus_t infiniopPagedAttentionPrefill(
@@ -68,9 +73,8 @@ __C __export infiniStatus_t infiniopPagedAttentionPrefill(
     const void *k_cache,
     const void *v_cache,
    const void *block_tables,
-    const void *cache_lens,
     const void *seq_lens,
-    const void *offset,
+    const void *cum_seq_lens_q,
     const void *alibi_slopes,
     void *stream);
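
The updated doxygen comments pin down tensor shapes explicitly. As a quick sanity check of how those shapes fit together, here is a hedged NumPy sketch that only allocates dummy arrays with the documented layouts; the sizes are invented for illustration and nothing below calls the C API:

    import numpy as np

    batch_size, num_heads, num_kv_heads, head_size = 3, 8, 8, 64
    block_size, max_num_blocks, max_blocks_per_seq = 16, 256, 64

    new_q_lens = np.array([16, 64, 1], dtype=np.int32)
    total_q_tokens = int(new_q_lens.sum())

    out = np.zeros((total_q_tokens, num_heads, head_size), np.float16)             # out_desc
    q = np.zeros((total_q_tokens, num_heads, head_size), np.float16)               # q_desc
    k_cache = np.zeros((max_num_blocks, num_kv_heads, block_size, head_size), np.float16)
    v_cache = np.zeros_like(k_cache)
    block_tables = np.zeros((batch_size, max_blocks_per_seq), np.int32)            # logical -> physical block ids
    seq_lens = np.array([144, 64, 513], dtype=np.int32)                            # total KV length per sequence
    cum_seq_lens_q = np.concatenate(([0], np.cumsum(new_q_lens))).astype(np.int32)  # [batch_size + 1]
    alibi_slopes = np.zeros(num_heads, np.float32)                                 # optional, [num_heads]
    scale = 1.0 / np.sqrt(float(head_size))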

python/infinicore/ops/paged_attention_prefill.py

Lines changed: 10 additions & 11 deletions
@@ -7,25 +7,25 @@ def paged_attention_prefill(
     k_cache: Tensor,
     v_cache: Tensor,
     block_tables: Tensor,
-    cache_lens: Tensor,
-    seq_lens: Tensor,
-    seq_offsets: Tensor,
+    history_lens: Tensor,
+    cu_seqlens_q: Tensor,
     alibi_slopes: Tensor | None = None,
     scale: float = 1.0,
     *,
     out: Tensor | None = None,
 ):
+    alibi_ptr = alibi_slopes._underlying if alibi_slopes is not None else None
+
     if out is None:
         return Tensor(
             _infinicore.paged_attention_prefill(
                 q._underlying,
                 k_cache._underlying,
                 v_cache._underlying,
                 block_tables._underlying,
-                cache_lens._underlying,
-                seq_lens._underlying,
-                seq_offsets._underlying,
-                alibi_slopes._underlying if alibi_slopes is not None else None,
+                history_lens._underlying,
+                cu_seqlens_q._underlying,
+                alibi_ptr,
                 scale,
             )
         )
@@ -36,10 +36,9 @@ def paged_attention_prefill(
         k_cache._underlying,
         v_cache._underlying,
         block_tables._underlying,
-        cache_lens._underlying,
-        seq_lens._underlying,
-        seq_offsets._underlying,
-        alibi_slopes._underlying if alibi_slopes is not None else None,
+        history_lens._underlying,
+        cu_seqlens_q._underlying,
+        alibi_ptr,
         scale,
     )
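
For Python callers only the parameter names change; the calling pattern stays the same. A hedged usage sketch, assuming the wrapper is importable from its path under python/infinicore and that q, k_cache, v_cache, block_tables, history_lens, and cu_seqlens_q are already infinicore Tensor objects on the same device (their construction is outside this wrapper):

    from infinicore.ops.paged_attention_prefill import paged_attention_prefill

    head_size = 64  # assumed value, only used for the scale below
    scale = 1.0 / (head_size ** 0.5)

    # Allocating form: returns a new output tensor shaped like q.
    out = paged_attention_prefill(
        q, k_cache, v_cache, block_tables,
        history_lens=history_lens,
        cu_seqlens_q=cu_seqlens_q,
        alibi_slopes=None,
        scale=scale,
    )

    # In-place form: writes into a preallocated output via the keyword-only out=.
    paged_attention_prefill(
        q, k_cache, v_cache, block_tables,
        history_lens=history_lens,
        cu_seqlens_q=cu_seqlens_q,
        scale=scale,
        out=out,
    )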

src/infinicore/ops/paged_attention/paged_attention.cc

Lines changed: 7 additions & 7 deletions
@@ -9,20 +9,20 @@ common::OpDispatcher<PagedAttention::schema> &PagedAttention::dispatcher() {
     return dispatcher_;
 };
 
-void PagedAttention::execute(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, std::optional<Tensor> alibi_slopes, float scale) {
-    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, q, k_cache, v_cache, block_tables, cache_lens);
+void PagedAttention::execute(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor kv_lens, std::optional<Tensor> alibi_slopes, float scale) {
+    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, q, k_cache, v_cache, block_tables, kv_lens);
     infinicore::context::setDevice(out->device());
-    dispatcher().lookup(out->device().getType())(out, q, k_cache, v_cache, block_tables, cache_lens, alibi_slopes, scale);
+    dispatcher().lookup(out->device().getType())(out, q, k_cache, v_cache, block_tables, kv_lens, alibi_slopes, scale);
 }
 
-Tensor paged_attention(Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, std::optional<Tensor> alibi_slopes, float scale) {
+Tensor paged_attention(Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor kv_lens, std::optional<Tensor> alibi_slopes, float scale) {
     auto out = Tensor::empty(q->shape(), q->dtype(), q->device());
-    paged_attention_(out, q, k_cache, v_cache, block_tables, cache_lens, alibi_slopes, scale);
+    paged_attention_(out, q, k_cache, v_cache, block_tables, kv_lens, alibi_slopes, scale);
     return out;
 }
 
-void paged_attention_(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, std::optional<Tensor> alibi_slopes, float scale) {
-    PagedAttention::execute(out, q, k_cache, v_cache, block_tables, cache_lens, alibi_slopes, scale);
+void paged_attention_(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor kv_lens, std::optional<Tensor> alibi_slopes, float scale) {
+    PagedAttention::execute(out, q, k_cache, v_cache, block_tables, kv_lens, alibi_slopes, scale);
 }
 
 } // namespace infinicore::op

src/infinicore/ops/paged_attention/paged_attention_infiniop.cc

Lines changed: 4 additions & 4 deletions
@@ -15,8 +15,8 @@ thread_local common::OpCache<size_t, infiniopPagedAttentionDescriptor_t> caches(
     }
 });
 
-void calculate(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, std::optional<Tensor> alibi_slopes, float scale) {
-    size_t seed = hash_combine(out, q, k_cache, v_cache, block_tables, cache_lens, alibi_slopes, scale);
+void calculate(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor kv_lens, std::optional<Tensor> alibi_slopes, float scale) {
+    size_t seed = hash_combine(out, q, k_cache, v_cache, block_tables, kv_lens, alibi_slopes, scale);
 
     auto device = context::getDevice();
     auto &cache = caches.getCache(device);
@@ -27,7 +27,7 @@ void calculate(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor bloc
     if (!desc_opt) {
         INFINICORE_CHECK_ERROR(infiniopCreatePagedAttentionDescriptor(
             context::getInfiniopHandle(device), &desc,
-            out->desc(), q->desc(), k_cache->desc(), v_cache->desc(), block_tables->desc(), cache_lens->desc(),
+            out->desc(), q->desc(), k_cache->desc(), v_cache->desc(), block_tables->desc(), kv_lens->desc(),
            alibi_slopes.has_value() ? alibi_slopes.value()->desc() : nullptr,
             scale));
         cache.put(seed, desc);
@@ -41,7 +41,7 @@ void calculate(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor bloc
 
     INFINICORE_CHECK_ERROR(infiniopPagedAttention(
         desc, workspace->data(), workspace_size,
-        out->data(), q->data(), k_cache->data(), v_cache->data(), block_tables->data(), cache_lens->data(),
+        out->data(), q->data(), k_cache->data(), v_cache->data(), block_tables->data(), kv_lens->data(),
         alibi_slopes.has_value() ? alibi_slopes.value()->data() : nullptr,
         context::getStream()));
 }

src/infinicore/ops/paged_attention_prefill/paged_attention_prefill.cc

Lines changed: 18 additions & 7 deletions
@@ -9,20 +9,31 @@ common::OpDispatcher<PagedAttentionPrefill::schema> &PagedAttentionPrefill::disp
     return dispatcher_;
 };
 
-void PagedAttentionPrefill::execute(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, Tensor seq_lens, Tensor seq_offsets, std::optional<Tensor> alibi_slopes, float scale) {
-    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, q, k_cache, v_cache, block_tables, cache_lens);
+void PagedAttentionPrefill::execute(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache,
+                                    Tensor block_tables, Tensor kv_lens, Tensor cum_seqlens_q,
+                                    std::optional<Tensor> alibi_slopes, float scale) {
+    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, q, k_cache, v_cache, block_tables, kv_lens, cum_seqlens_q);
+
     infinicore::context::setDevice(out->device());
-    dispatcher().lookup(out->device().getType())(out, q, k_cache, v_cache, block_tables, cache_lens, seq_lens, seq_offsets, alibi_slopes, scale);
+
+    dispatcher().lookup(out->device().getType())(out, q, k_cache, v_cache, block_tables,
+                                                 kv_lens, cum_seqlens_q, alibi_slopes, scale);
 }
 
-Tensor paged_attention_prefill(Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, Tensor seq_lens, Tensor seq_offsets, std::optional<Tensor> alibi_slopes, float scale) {
+Tensor paged_attention_prefill(Tensor q, Tensor k_cache, Tensor v_cache,
+                               Tensor block_tables, Tensor kv_lens, Tensor cum_seqlens_q,
+                               std::optional<Tensor> alibi_slopes, float scale) {
+
     auto out = Tensor::empty(q->shape(), q->dtype(), q->device());
-    paged_attention_prefill_(out, q, k_cache, v_cache, block_tables, cache_lens, seq_lens, seq_offsets, alibi_slopes, scale);
+    paged_attention_prefill_(out, q, k_cache, v_cache, block_tables, kv_lens, cum_seqlens_q, alibi_slopes, scale);
     return out;
 }
 
-void paged_attention_prefill_(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, Tensor seq_lens, Tensor seq_offsets, std::optional<Tensor> alibi_slopes, float scale) {
-    PagedAttentionPrefill::execute(out, q, k_cache, v_cache, block_tables, cache_lens, seq_lens, seq_offsets, alibi_slopes, scale);
+void paged_attention_prefill_(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache,
+                              Tensor block_tables, Tensor kv_lens, Tensor cum_seqlens_q,
+                              std::optional<Tensor> alibi_slopes, float scale) {
+
+    PagedAttentionPrefill::execute(out, q, k_cache, v_cache, block_tables, kv_lens, cum_seqlens_q, alibi_slopes, scale);
 }
 
 } // namespace infinicore::op

src/infinicore/ops/paged_attention_prefill/paged_attention_prefill_infiniop.cc

Lines changed: 21 additions & 6 deletions
@@ -15,8 +15,10 @@ thread_local common::OpCache<size_t, infiniopPagedAttentionPrefillDescriptor_t>
     }
 });
 
-void calculate(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, Tensor seq_lens, Tensor seq_offsets, std::optional<Tensor> alibi_slopes, float scale) {
-    size_t seed = hash_combine(out, q, k_cache, v_cache, block_tables, cache_lens, seq_lens, seq_offsets, alibi_slopes, scale);
+void calculate(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache,
+               Tensor block_tables, Tensor kv_lens, Tensor cum_seqlens_q,
+               std::optional<Tensor> alibi_slopes, float scale) {
+    size_t seed = hash_combine(out, q, k_cache, v_cache, block_tables, kv_lens, cum_seqlens_q, alibi_slopes, scale);
 
     auto device = context::getDevice();
     auto &cache = caches.getCache(device);
@@ -27,8 +29,13 @@ void calculate(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor bloc
     if (!desc_opt) {
         INFINICORE_CHECK_ERROR(infiniopCreatePagedAttentionPrefillDescriptor(
             context::getInfiniopHandle(device), &desc,
-            out->desc(), q->desc(), k_cache->desc(), v_cache->desc(), block_tables->desc(),
-            cache_lens->desc(), seq_lens->desc(), seq_offsets->desc(),
+            out->desc(),
+            q->desc(),
+            k_cache->desc(),
+            v_cache->desc(),
+            block_tables->desc(),
+            kv_lens->desc(),
+            cum_seqlens_q->desc(),
             alibi_slopes.has_value() ? alibi_slopes.value()->desc() : nullptr,
             scale));
         cache.put(seed, desc);
@@ -41,8 +48,16 @@ void calculate(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor bloc
     std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
 
     INFINICORE_CHECK_ERROR(infiniopPagedAttentionPrefill(
-        desc, workspace->data(), workspace_size,
-        out->data(), q->data(), k_cache->data(), v_cache->data(), block_tables->data(), cache_lens->data(), seq_lens->data(), seq_offsets->data(),
+        desc,
+        workspace->data(),
+        workspace_size,
+        out->data(),
+        q->data(),
+        k_cache->data(),
+        v_cache->data(),
+        block_tables->data(),
+        kv_lens->data(),
+        cum_seqlens_q->data(),
         alibi_slopes.has_value() ? alibi_slopes.value()->data() : nullptr,
         context::getStream()));
 }

src/infinicore/pybind11/ops.hpp

Lines changed: 2 additions & 0 deletions
@@ -11,6 +11,7 @@
 #include "ops/matmul.hpp"
 #include "ops/mul.hpp"
 #include "ops/paged_attention.hpp"
+#include "ops/paged_attention_prefill.hpp"
 #include "ops/paged_caching.hpp"
 #include "ops/random_sample.hpp"
 #include "ops/rearrange.hpp"
@@ -33,6 +34,7 @@ inline void bind(py::module &m) {
     bind_matmul(m);
     bind_mul(m);
     bind_paged_attention(m);
+    bind_paged_attention_prefill(m);
     bind_paged_caching(m);
     bind_rearrange(m);
     bind_rms_norm(m);
src/infinicore/pybind11/ops/paged_attention_prefill.hpp

Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
+#pragma once
+
+#include "infinicore/ops/paged_attention_prefill.hpp"
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+Tensor py_paged_attention_prefill(Tensor q,
+                                  Tensor k_cache,
+                                  Tensor v_cache,
+                                  Tensor block_tables,
+                                  Tensor history_lens,
+                                  Tensor cu_seqlens_q,
+                                  py::object alibi_slopes,
+                                  float scale) {
+    std::optional<Tensor> alibi_slopes_tensor = std::nullopt;
+    if (!alibi_slopes.is_none()) {
+        alibi_slopes_tensor = alibi_slopes.cast<Tensor>();
+    }
+    return op::paged_attention_prefill(
+        q, k_cache, v_cache, block_tables, history_lens, cu_seqlens_q, alibi_slopes_tensor, scale);
+}
+
+void py_paged_attention_prefill_(Tensor out,
+                                 Tensor q,
+                                 Tensor k_cache,
+                                 Tensor v_cache,
+                                 Tensor block_tables,
+                                 Tensor history_lens,
+                                 Tensor cu_seqlens_q,
+                                 py::object alibi_slopes,
+                                 float scale) {
+    std::optional<Tensor> alibi_slopes_tensor = std::nullopt;
+    if (!alibi_slopes.is_none()) {
+        alibi_slopes_tensor = alibi_slopes.cast<Tensor>();
+    }
+    op::paged_attention_prefill_(out, q, k_cache, v_cache, block_tables, history_lens, cu_seqlens_q, alibi_slopes_tensor, scale);
+}
+
+inline void bind_paged_attention_prefill(py::module &m) {
+    m.def("paged_attention_prefill",
+          &ops::py_paged_attention_prefill,
+          py::arg("q"),
+          py::arg("k_cache"),
+          py::arg("v_cache"),
+          py::arg("block_tables"),
+          py::arg("history_lens"),
+          py::arg("cu_seqlens_q"),
+          py::arg("alibi_slopes") = py::none(),
+          py::arg("scale") = 1.0,
+          R"doc(Paged attention prefill for packed variable-length queries.)doc");
+
+    m.def("paged_attention_prefill_",
+          &ops::py_paged_attention_prefill_,
+          py::arg("out"),
+          py::arg("q"),
+          py::arg("k_cache"),
+          py::arg("v_cache"),
+          py::arg("block_tables"),
+          py::arg("history_lens"),
+          py::arg("cu_seqlens_q"),
+          py::arg("alibi_slopes") = py::none(),
+          py::arg("scale") = 1.0,
+          R"doc(In-place paged attention prefill for packed variable-length queries.)doc");
+}
+
+} // namespace infinicore::ops
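
At the binding level, alibi_slopes defaults to py::none() and scale to 1.0; when ALiBi is used, the infiniop header above expects a [num_heads] tensor of per-head slopes. For reference, a hedged sketch of the conventional ALiBi slope schedule in plain Python (converting the list into an infinicore Tensor is not shown, since that API does not appear in this diff):

    def alibi_slopes_for(num_heads: int) -> list[float]:
        # Conventional ALiBi schedule for a power-of-two head count:
        # slope_i = 2 ** (-8 * (i + 1) / num_heads).
        return [2.0 ** (-8.0 * (i + 1) / num_heads) for i in range(num_heads)]

    print(alibi_slopes_for(8))  # [0.5, 0.25, 0.125, ..., 0.00390625]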
