@@ -694,7 +694,8 @@ impl ScatteredCacheBuilder {
 /// The trade-off:
 /// - More allocations (one per token in autoregressive generation)
 /// - But each allocation uses a faster kernel path
-/// - Net result: 40-56% faster on GPU for typical LLM inference
+/// - Net result: 2-5x faster on GPU for autoregressive inference
+///   (speedup increases with sequence length: ~2x at 300 tokens, ~5x at 2000 tokens)
 #[derive(Debug, Clone)]
 pub struct ConcatKvCache {
     k: Option<Tensor>,
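
A minimal sketch of what the concat-based append path looks like, assuming candle's `Tensor::cat` and the field layout above; the concat dimension is hardcoded to 1 here to match the tests below, and the exact body in this commit may differ:

use candle_core::{Result, Tensor};

impl ConcatKvCache {
    /// Append new K/V chunks along the sequence dimension (dim 1 here).
    /// Each call allocates a fresh contiguous tensor, but the concat maps
    /// to a fast contiguous-copy kernel rather than a scattered write.
    pub fn append(&mut self, k: &Tensor, v: &Tensor) -> Result<(Tensor, Tensor)> {
        self.k = Some(match self.k.take() {
            // First chunk: keep it as-is.
            None => k.clone(),
            // Later chunks: one contiguous concat on the sequence dim.
            Some(prev) => Tensor::cat(&[&prev, k], 1)?,
        });
        self.v = Some(match self.v.take() {
            None => v.clone(),
            Some(prev) => Tensor::cat(&[&prev, v], 1)?,
        });
        // Tensor clones are cheap (reference-counted storage).
        let k_all = self.k.as_ref().unwrap().clone();
        let v_all = self.v.as_ref().unwrap().clone();
        Ok((k_all, v_all))
    }
}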
@@ -1008,13 +1009,13 @@ mod concat_cache_tests {

         let k1 = Tensor::zeros((1, 3, 8, 64), DType::F32, &device)?;
         let v1 = Tensor::zeros((1, 3, 8, 64), DType::F32, &device)?;
-        let (k, v) = cache.append(&k1, &v1)?;
+        let (k, _v) = cache.append(&k1, &v1)?;

         assert_eq!(k.dims(), &[1, 3, 8, 64]);

         let k2 = Tensor::zeros((1, 2, 8, 64), DType::F32, &device)?;
         let v2 = Tensor::zeros((1, 2, 8, 64), DType::F32, &device)?;
-        let (k, v) = cache.append(&k2, &v2)?;
+        let (k, _v) = cache.append(&k2, &v2)?;

         assert_eq!(k.dims(), &[1, 5, 8, 64]); // Concatenated on dim 1
         assert_eq!(cache.current_seq_len(), 5);
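
The `_v` bindings simply silence Rust's unused-variable lint, since the assertions only inspect `k`. For context, a hedged sketch of how the cache is driven in autoregressive decoding; `ConcatKvCache::new(1)` and `decode_loop` are illustrative assumptions, not part of this commit:

use candle_core::{DType, Device, Result, Tensor};

fn decode_loop(device: &Device) -> Result<()> {
    // Hypothetical constructor; 1 = the sequence axis used in the tests above.
    let mut cache = ConcatKvCache::new(1);

    // Prefill: append the whole prompt's K/V in one call.
    let k_prompt = Tensor::zeros((1, 16, 8, 64), DType::F32, device)?;
    let v_prompt = Tensor::zeros((1, 16, 8, 64), DType::F32, device)?;
    cache.append(&k_prompt, &v_prompt)?;

    // Decode: one token (seq_len = 1) per step. Each append re-concatenates,
    // trading an allocation per token for the faster kernel path.
    for _step in 0..4 {
        let k_tok = Tensor::zeros((1, 1, 8, 64), DType::F32, device)?;
        let v_tok = Tensor::zeros((1, 1, 8, 64), DType::F32, device)?;
        let (_k, _v) = cache.append(&k_tok, &v_tok)?;
    }

    assert_eq!(cache.current_seq_len(), 20); // 16 prompt + 4 generated
    Ok(())
}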