
Commit 8b9e6b2

update kv-cache concat method description

1 parent 5eeb857


candle-nn/src/kv_cache.rs

Lines changed: 21 additions & 36 deletions
@@ -634,33 +634,32 @@ impl ScatteredCacheBuilder {
 /// KV-Cache using concatenation for append operations
 ///
 /// This implementation uses `Tensor::cat` instead of `slice_set` for updates,
-/// which provides better GPU performance due to optimized concatenation kernels.
+/// providing significant GPU performance improvements for autoregressive generation.
 ///
 /// # Performance Characteristics
 ///
-/// Benchmark results on NVIDIA A100 (SmolLM2-135M, Llama-3.2-1B):
-/// - **GPU**: 1.4-1.6x faster than `KvCache` (70 tok/s vs 42 tok/s)
-/// - **CPU**: ~10% slower than `KvCache` (due to repeated allocations)
-/// - **Memory**: Dynamic growth, no pre-allocation
+/// **GPU:**
+/// - 2-5x faster than `KvCache` (speedup increases with sequence length)
+/// - Works on both full-precision and quantized models
 ///
-/// The performance advantage on GPU comes from:
-/// - Optimized CUDA concatenation kernels (fused allocation + copy)
-/// - Coalesced memory writes (all threads write adjacent addresses)
-/// - Single kernel launch (vs multiple for slice_set: indexing + bounds + copy)
-/// - Better memory bandwidth utilization (75% vs 25% on A100)
+/// **CPU:**
+/// - Essentially neutral (~1% difference)
+///
+/// The GPU performance advantage comes from:
+/// - Tight memory layouts (sequential access patterns)
+/// - Optimized concatenation kernels (coalesced memory writes)
+/// - Better memory bandwidth utilization
 ///
 /// # When to Use
 ///
 /// **Recommended for:**
-/// - GPU inference (CUDA, Metal) where performance is critical
+/// - GPU inference (CUDA, Metal)
 /// - Autoregressive generation (token-by-token decoding)
-/// - When memory for dynamic growth is acceptable
 /// - Production inference servers prioritizing throughput
 ///
 /// **Use `KvCache` instead for:**
-/// - CPU-only inference (pre-allocation is faster)
-/// - Memory-constrained environments (pre-allocation uses less memory for short sequences)
-/// - When you need precise memory control
+/// - CPU-only inference
+/// - When you need fixed memory allocation upfront
 ///
 /// # Example
 ///
@@ -673,29 +672,19 @@ impl ScatteredCacheBuilder {
 /// let k1 = Tensor::randn(0f32, 1., (1, 8, 10, 64), &device)?;
 /// let v1 = Tensor::randn(0f32, 1., (1, 8, 10, 64), &device)?;
 /// let (k, v) = cache.append(&k1, &v1)?;
-/// assert_eq!(k.dims()[2], 10); // sequence length = 10
 ///
 /// // Subsequent tokens (decode)
-/// for _ in 0..5 {
-///     let k_new = Tensor::randn(0f32, 1., (1, 8, 1, 64), &device)?;
-///     let v_new = Tensor::randn(0f32, 1., (1, 8, 1, 64), &device)?;
-///     let (k, v) = cache.append(&k_new, &v_new)?;
-/// }
-/// assert_eq!(cache.current_seq_len(), 15); // 10 + 5
+/// let k_new = Tensor::randn(0f32, 1., (1, 8, 1, 64), &device)?;
+/// let v_new = Tensor::randn(0f32, 1., (1, 8, 1, 64), &device)?;
+/// let (k, v) = cache.append(&k_new, &v_new)?;
 /// ```
 ///
 /// # Implementation Details
 ///
-/// Unlike `KvCache` which pre-allocates a fixed-size buffer and uses `slice_set`,
-/// this implementation grows dynamically using `Tensor::cat`. While this uses more
-/// memory allocations, the GPU kernel for concatenation is significantly more
-/// optimized than the general-purpose `slice_set` operation.
-///
-/// The trade-off:
-/// - More allocations (one per token in autoregressive generation)
-/// - But each allocation uses a faster kernel path
-/// - Net result: 2-5x faster on GPU for autoregressive inference
-///   (speedup increases with sequence length: ~2x at 300 tokens, ~5x at 2000 tokens)
+/// Unlike `KvCache` which pre-allocates a fixed buffer, this implementation
+/// grows dynamically using `Tensor::cat`. The GPU concatenation kernels are
+/// highly optimized for sequential append patterns, resulting in better
+/// performance despite the dynamic allocation.
 #[derive(Debug, Clone)]
 pub struct ConcatKvCache {
     k: Option<Tensor>,
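
To make the `Tensor::cat` vs `slice_set` contrast in the implementation notes concrete, here is a standalone sketch using only `candle_core` tensor ops. The shapes mirror the doc example above; the 512-step buffer size is illustrative, not taken from the crate.

```rust
use candle_core::{Device, Result, Tensor};

fn main() -> Result<()> {
    let device = Device::Cpu;
    let dim = 2; // sequence-length axis in (batch, heads, seq, head_dim)

    // Concat-based update (the `ConcatKvCache` strategy): every append
    // allocates a new tensor holding old + new keys.
    let prev_k = Tensor::randn(0f32, 1., (1, 8, 10, 64), &device)?;
    let new_k = Tensor::randn(0f32, 1., (1, 8, 1, 64), &device)?;
    let k_cat = Tensor::cat(&[&prev_k, &new_k], dim)?;
    assert_eq!(k_cat.dims()[dim], 11);

    // slice_set-based update (the `KvCache` strategy): write the new keys
    // into a pre-allocated buffer at the current offset, no reallocation.
    let buffer = Tensor::zeros((1, 8, 512, 64), new_k.dtype(), &device)?;
    buffer.slice_set(&new_k, dim, 10)?;

    Ok(())
}
```
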
@@ -755,10 +744,6 @@ impl ConcatKvCache {
     /// # Returns
     /// Tuple of `(full_k, full_v)` containing all cached keys and values,
     /// including the newly appended data.
-    ///
-    /// # Performance Note
-    /// On GPU, this operation is highly optimized and faster than equivalent
-    /// `slice_set` operations despite allocating a new tensor.
     pub fn append(&mut self, k: &Tensor, v: &Tensor) -> Result<(Tensor, Tensor)> {
         // Update K cache using concatenation
         self.k = Some(match &self.k {
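
The hunk above ends mid-expression. For readers following along, a self-contained sketch of how a concat-based `append` can be written in full; the struct name and fields here are illustrative (a `dim` field for the concatenation axis is assumed), not the committed body of `ConcatKvCache`.

```rust
use candle_core::{Result, Tensor};

/// Illustrative re-implementation of the concat-based append described in the
/// doc comment; not the committed `ConcatKvCache` code.
#[derive(Debug, Clone)]
struct MiniConcatCache {
    k: Option<Tensor>,
    v: Option<Tensor>,
    dim: usize, // concatenation axis, e.g. 2 for (batch, heads, seq, head_dim)
}

impl MiniConcatCache {
    fn new(dim: usize) -> Self {
        Self { k: None, v: None, dim }
    }

    fn append(&mut self, k: &Tensor, v: &Tensor) -> Result<(Tensor, Tensor)> {
        // First call stores the tensors as-is; later calls concatenate along
        // the sequence axis, producing a freshly allocated tensor each time.
        self.k = Some(match &self.k {
            None => k.clone(),
            Some(prev) => Tensor::cat(&[prev, k], self.dim)?,
        });
        self.v = Some(match &self.v {
            None => v.clone(),
            Some(prev) => Tensor::cat(&[prev, v], self.dim)?,
        });
        Ok((self.k.clone().unwrap(), self.v.clone().unwrap()))
    }
}
```
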
