@@ -634,33 +634,32 @@ impl ScatteredCacheBuilder {
 /// KV-Cache using concatenation for append operations
 ///
 /// This implementation uses `Tensor::cat` instead of `slice_set` for updates,
-/// which provides better GPU performance due to optimized concatenation kernels.
+/// providing significant GPU performance improvements for autoregressive generation.
 ///
 /// # Performance Characteristics
 ///
-/// Benchmark results on NVIDIA A100 (SmolLM2-135M, Llama-3.2-1B):
-/// - **GPU**: 1.4-1.6x faster than `KvCache` (70 tok/s vs 42 tok/s)
-/// - **CPU**: ~10% slower than `KvCache` (due to repeated allocations)
-/// - **Memory**: Dynamic growth, no pre-allocation
+/// **GPU:**
+/// - 2-5x faster than `KvCache` (speedup increases with sequence length)
+/// - Works on both full-precision and quantized models
 ///
-/// The performance advantage on GPU comes from:
-/// - Optimized CUDA concatenation kernels (fused allocation + copy)
-/// - Coalesced memory writes (all threads write adjacent addresses)
-/// - Single kernel launch (vs multiple for slice_set: indexing + bounds + copy)
-/// - Better memory bandwidth utilization (75% vs 25% on A100)
+/// **CPU:**
+/// - Essentially neutral (~1% difference)
+///
+/// The GPU performance advantage comes from:
+/// - Tight memory layouts (sequential access patterns)
+/// - Optimized concatenation kernels (coalesced memory writes)
+/// - Better memory bandwidth utilization
 ///
 /// # When to Use
 ///
 /// **Recommended for:**
-/// - GPU inference (CUDA, Metal) where performance is critical
+/// - GPU inference (CUDA, Metal)
 /// - Autoregressive generation (token-by-token decoding)
-/// - When memory for dynamic growth is acceptable
 /// - Production inference servers prioritizing throughput
 ///
 /// **Use `KvCache` instead for:**
-/// - CPU-only inference (pre-allocation is faster)
-/// - Memory-constrained environments (pre-allocation uses less memory for short sequences)
-/// - When you need precise memory control
+/// - CPU-only inference
+/// - When you need fixed memory allocation upfront
 ///
 /// # Example
 ///
@@ -673,29 +672,19 @@ impl ScatteredCacheBuilder {
 /// let k1 = Tensor::randn(0f32, 1., (1, 8, 10, 64), &device)?;
 /// let v1 = Tensor::randn(0f32, 1., (1, 8, 10, 64), &device)?;
 /// let (k, v) = cache.append(&k1, &v1)?;
-/// assert_eq!(k.dims()[2], 10); // sequence length = 10
 ///
 /// // Subsequent tokens (decode)
-/// for _ in 0..5 {
-///     let k_new = Tensor::randn(0f32, 1., (1, 8, 1, 64), &device)?;
-///     let v_new = Tensor::randn(0f32, 1., (1, 8, 1, 64), &device)?;
-///     let (k, v) = cache.append(&k_new, &v_new)?;
-/// }
-/// assert_eq!(cache.current_seq_len(), 15); // 10 + 5
+/// let k_new = Tensor::randn(0f32, 1., (1, 8, 1, 64), &device)?;
+/// let v_new = Tensor::randn(0f32, 1., (1, 8, 1, 64), &device)?;
+/// let (k, v) = cache.append(&k_new, &v_new)?;
 /// ```
 ///
 /// # Implementation Details
 ///
-/// Unlike `KvCache` which pre-allocates a fixed-size buffer and uses `slice_set`,
-/// this implementation grows dynamically using `Tensor::cat`. While this uses more
-/// memory allocations, the GPU kernel for concatenation is significantly more
-/// optimized than the general-purpose `slice_set` operation.
-///
-/// The trade-off:
-/// - More allocations (one per token in autoregressive generation)
-/// - But each allocation uses a faster kernel path
-/// - Net result: 2-5x faster on GPU for autoregressive inference
-///   (speedup increases with sequence length: ~2x at 300 tokens, ~5x at 2000 tokens)
+/// Unlike `KvCache` which pre-allocates a fixed buffer, this implementation
+/// grows dynamically using `Tensor::cat`. The GPU concatenation kernels are
+/// highly optimized for sequential append patterns, resulting in better
+/// performance despite the dynamic allocation.
 #[derive(Debug, Clone)]
 pub struct ConcatKvCache {
     k: Option<Tensor>,
@@ -755,10 +744,6 @@ impl ConcatKvCache {
     /// # Returns
     /// Tuple of `(full_k, full_v)` containing all cached keys and values,
     /// including the newly appended data.
-    ///
-    /// # Performance Note
-    /// On GPU, this operation is highly optimized and faster than equivalent
-    /// `slice_set` operations despite allocating a new tensor.
     pub fn append(&mut self, k: &Tensor, v: &Tensor) -> Result<(Tensor, Tensor)> {
        // Update K cache using concatenation
        self.k = Some(match &self.k {
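The hunk above ends midway through `append`. For reference, here is a minimal self-contained sketch of the concat-based append pattern the doc comment describes, assuming candle_core's `Tensor::cat` API and the `(batch, heads, seq, head_dim)` layout from the example (dim 2 is the sequence axis). The struct name is hypothetical, and this illustrates the pattern rather than reproducing the exact body merged in this PR:

```rust
use candle_core::{Result, Tensor};

// Hypothetical stand-in for the PR's ConcatKvCache, for illustration only.
#[derive(Debug, Clone, Default)]
struct ConcatCacheSketch {
    k: Option<Tensor>,
    v: Option<Tensor>,
}

impl ConcatCacheSketch {
    /// Append new K/V along the sequence dimension (dim 2) and
    /// return the full cached tensors.
    fn append(&mut self, k: &Tensor, v: &Tensor) -> Result<(Tensor, Tensor)> {
        // The first call stores the prompt's K/V as-is; later calls
        // concatenate, which on GPU hits the optimized cat kernel
        // discussed in the doc comment above.
        self.k = Some(match self.k.take() {
            None => k.clone(),
            Some(prev) => Tensor::cat(&[&prev, k], 2)?,
        });
        self.v = Some(match self.v.take() {
            None => v.clone(),
            Some(prev) => Tensor::cat(&[&prev, v], 2)?,
        });
        Ok((
            self.k.clone().expect("k was just set"),
            self.v.clone().expect("v was just set"),
        ))
    }
}
```

Usage mirrors the doc example: call `append` once with the prompt's K/V during prefill, then once per generated token during decode.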