Skip to content

Commit 5eeb857

Browse files
committed
update tradeoff desc; resolve unused var warning in concatKV test
1 parent a20d326 commit 5eeb857

File tree

1 file changed

+4
-3
lines changed

1 file changed

+4
-3
lines changed

candle-nn/src/kv_cache.rs

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -694,7 +694,8 @@ impl ScatteredCacheBuilder {
694694
/// The trade-off:
695695
/// - More allocations (one per token in autoregressive generation)
696696
/// - But each allocation uses a faster kernel path
697-
/// - Net result: 40-56% faster on GPU for typical LLM inference
697+
/// - Net result: 2-5x faster on GPU for autoregressive inference
698+
/// (speedup increases with sequence length: ~2x at 300 tokens, ~5x at 2000 tokens)
698699
#[derive(Debug, Clone)]
699700
pub struct ConcatKvCache {
700701
k: Option<Tensor>,
@@ -1008,13 +1009,13 @@ mod concat_cache_tests {
10081009

10091010
let k1 = Tensor::zeros((1, 3, 8, 64), DType::F32, &device)?;
10101011
let v1 = Tensor::zeros((1, 3, 8, 64), DType::F32, &device)?;
1011-
let (k, v) = cache.append(&k1, &v1)?;
1012+
let (k, _v) = cache.append(&k1, &v1)?;
10121013

10131014
assert_eq!(k.dims(), &[1, 3, 8, 64]);
10141015

10151016
let k2 = Tensor::zeros((1, 2, 8, 64), DType::F32, &device)?;
10161017
let v2 = Tensor::zeros((1, 2, 8, 64), DType::F32, &device)?;
1017-
let (k, v) = cache.append(&k2, &v2)?;
1018+
let (k, _v) = cache.append(&k2, &v2)?;
10181019

10191020
assert_eq!(k.dims(), &[1, 5, 8, 64]); // Concatenated on dim 1
10201021
assert_eq!(cache.current_seq_len(), 5);

0 commit comments

Comments
 (0)