
Commit 16526f3

Pass token indices to KV cache
1 parent dc66754 commit 16526f3

File tree

4 files changed: +93 -16 lines changed


litgpt/kvcache/base.py

Lines changed: 43 additions & 7 deletions
@@ -158,7 +158,12 @@ def max_tokens_forward(self) -> int:
         """
         raise NotImplementedError()
 
-    def forward(self, key: torch.Tensor, value: torch.Tensor) -> KeysAndValues:
+    def forward(
+        self,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        token_idx: torch.Tensor,
+    ) -> KeysAndValues:
         """
         Accepts key and value tensors for `1 <= num <= max_tokens_forward`
         new token positions. These are written into the cache. If the cache
@@ -176,6 +181,7 @@ def forward(self, key: torch.Tensor, value: torch.Tensor) -> KeysAndValues:
             key: New keys, `(eff_batch_size, n_query_groups, num, head_size)`,
                 where `1 <= num <= max_tokens_forward`
             value: New values, `(eff_batch_size, n_query_groups, num, head_size)`
+            token_idx: Token indices of input sequence, `(eff_batch_size, num)`
 
         Returns:
             key_cached, value_cached, `(eff_batch_size, n_query_groups, T,
@@ -203,6 +209,7 @@ def update(self, *args, **kwargs):
         Args:
             *args: Depends on subclass
             **kwargs: Depends on subclass
+
         """
         raise NotImplementedError()
 
@@ -228,7 +235,12 @@ def max_prefill_length(self) -> Optional[int]:
         """
         raise NotImplementedError()
 
-    def prefill(self, key: torch.Tensor, value: torch.Tensor):
+    def prefill(
+        self,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        token_idx: torch.Tensor,
+    ):
         """
         Starts a generation loop by passing key and value tensors coming from
         a prefill with embeddings coming from the prompts. The length must be
@@ -239,6 +251,8 @@ def prefill(self, key: torch.Tensor, value: torch.Tensor):
         Args:
             key: Prefill keys, `(eff_batch_size, n_query_groups, T, head_size)`
             value: Prefill values, `(eff_batch_size, n_query_groups, T, head_size)`
+            token_idx: Token indices of input sequence, `(eff_batch_size, T)`
+
         """
         raise NotImplementedError()
 
@@ -271,6 +285,7 @@ def size_estimate(self) -> Tuple[int, Dict[str, int]]:
 
         Returns:
             num_bits_total, bits_by_part (unit is bit)
+
         """
         raise NotImplementedError()
 
@@ -287,6 +302,7 @@ def size_estimate_apriori(cls, params: KVCacheParams, **kwargs) -> Tuple[int, Di
 
         Returns:
             num_bits_total, bits_by_part (unit is bit)
+
         """
         raise NotImplementedError()
 
@@ -326,6 +342,7 @@ class DenseKVCache(KVCache):
 
     Note: If the cache is full, :meth:`forward` raises an exception. The cache
     buffers are allocated up front and are not enlarged later on.
+
     """
     def __init__(
         self,
@@ -344,7 +361,6 @@ def __init__(
            dtype: Data type for buffers
            max_sequence_length: Cache length. If not given, we use
                `config.block_size`
-           max_tokens_forward: See parent class
            head_size: Size of final dimension of buffers. Defaults to head
                size of model
 
@@ -380,7 +396,12 @@ def max_prefill_length(self) -> Optional[int]:
     def current_length(self) -> int:
         return self.next_position
 
-    def forward(self, key: torch.Tensor, value: torch.Tensor) -> KeysAndValues:
+    def forward(
+        self,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        token_idx: torch.Tensor,
+    ) -> KeysAndValues:
         if self.next_position is None:
             raise IndexError("Cache needs to be initialized with 'prefill' before being used")
         num = key.shape[2]
@@ -416,7 +437,12 @@ def forward(self, key: torch.Tensor, value: torch.Tensor) -> KeysAndValues:
     def update(self, *args, **kwargs):
         pass
 
-    def prefill(self, key: torch.Tensor, value: torch.Tensor):
+    def prefill(
+        self,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        token_idx: torch.Tensor,
+    ):
         if key.dim() != 4:
             raise ValueError("key must have 4 dimensions")
         init_length = key.shape[2]
@@ -517,7 +543,12 @@ def max_tokens_forward(self) -> int:
     def max_prefill_length(self) -> Optional[int]:
         return None
 
-    def forward(self, key: torch.Tensor, value: torch.Tensor) -> KeysAndValues:
+    def forward(
+        self,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        token_idx: torch.Tensor,
+    ) -> KeysAndValues:
         if self.next_position is None:
             raise IndexError("Cache needs to be initialized with 'prefill' before being used")
         if key.ndim != 4:
@@ -563,7 +594,12 @@ def forward(self, key: torch.Tensor, value: torch.Tensor) -> KeysAndValues:
     def update(self, *args, **kwargs):
         pass
 
-    def prefill(self, key: torch.Tensor, value: torch.Tensor):
+    def prefill(
+        self,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        token_idx: torch.Tensor,
+    ):
         if key.dim() != 4:
             raise ValueError("key must have 4 dimensions")
         init_length = key.shape[2]
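
With this change, every cache interaction carries the token ids that produced the keys and values. A minimal sketch of the new calling convention follows; `ToyCache` and all shapes are illustrative stand-ins (the real cache classes live in `litgpt.kvcache`), not the actual litgpt implementation.

# Sketch of the updated (key, value, token_idx) calling convention.
# ToyCache is a hypothetical stand-in, not part of litgpt.
import torch


class ToyCache:
    """Stores keys/values densely; mirrors the new three-argument API."""

    def prefill(self, key, value, token_idx):
        # key/value: (eff_batch_size, n_query_groups, T, head_size)
        # token_idx: (eff_batch_size, T), ids of the tokens in the cache slots
        self.k, self.v, self.tokens = key, value, token_idx

    def __call__(self, key, value, token_idx):
        # key/value: (eff_batch_size, n_query_groups, num, head_size)
        # token_idx: (eff_batch_size, num), with 1 <= num <= max_tokens_forward
        self.k = torch.cat([self.k, key], dim=2)
        self.v = torch.cat([self.v, value], dim=2)
        self.tokens = torch.cat([self.tokens, token_idx], dim=1)
        return self.k, self.v


eff_batch_size, n_query_groups, head_size = 2, 4, 8
prompt_len, vocab_size = 16, 128
cache = ToyCache()

# Prefill with the prompt, passing the prompt's token indices alongside K/V
k = torch.randn(eff_batch_size, n_query_groups, prompt_len, head_size)
v = torch.randn(eff_batch_size, n_query_groups, prompt_len, head_size)
idx = torch.randint(0, vocab_size, (eff_batch_size, prompt_len))
cache.prefill(key=k, value=v, token_idx=idx)

# One decode step: a single new position plus its token index
k1 = torch.randn(eff_batch_size, n_query_groups, 1, head_size)
v1 = torch.randn(eff_batch_size, n_query_groups, 1, head_size)
idx1 = torch.randint(0, vocab_size, (eff_batch_size, 1))
k_all, v_all = cache(k1, v1, token_idx=idx1)  # each (2, 4, 17, 8)

The shapes match the docstrings above: keys and values are `(eff_batch_size, n_query_groups, num, head_size)` while `token_idx` is `(eff_batch_size, num)`.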

litgpt/model.py

Lines changed: 6 additions & 3 deletions
@@ -309,7 +309,7 @@ def forward(
             attn = block.attn
             if attn.kv_cache.batch_size < eff_batch_size:
                 raise ValueError(f"Batch size {eff_batch_size} is too large for KV cache layer {l_ix} (batch size {attn.kv_cache.batch_size}). Use 'assign_kv_caches' or `set_kv_cache'")
-            x = block(x, cos, sin, input_pos, self.mask_cache)
+            x = block(x, cos, sin, idx, input_pos, self.mask_cache)
 
         x = self.transformer.ln_f(x)
         clamp_head = partial(
@@ -428,6 +428,7 @@ def forward(
         x: torch.Tensor,
         cos: torch.Tensor,
         sin: torch.Tensor,
+        token_idx: torch.Tensor,
         input_pos: Optional[int] = None,
         mask_cache: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
@@ -457,6 +458,7 @@ def forward(
             x_normed,
             cos=cos,
             sin=sin,
+            token_idx=token_idx,
             input_pos=input_pos,
             mask_cache=mask_cache,
         )
@@ -511,6 +513,7 @@ def forward(
         x: torch.Tensor,
         cos: torch.Tensor,
         sin: torch.Tensor,
+        token_idx: torch.Tensor,
         input_pos: Optional[int] = None,
         mask_cache: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
@@ -596,12 +599,12 @@ def forward(
             # Instead of asking for the key and value tensors as such,
             # `k_and_v` allows access to them. Since they are never needed at
             # the same time, this can save memory.
-            k_and_v = self.kv_cache(k, v)
+            k_and_v = self.kv_cache(k, v, token_idx=token_idx)
             # k, v: (B, nh_k, cache_length, hs)
         else:
             if for_prefill:
                 # Prefill KV cache
-                self.kv_cache.prefill(key=k, value=v)
+                self.kv_cache.prefill(key=k, value=v, token_idx=token_idx)
             # In this case, `k_and_v` can vend both keys and values at the same
             # time.
             k_and_v = DefaultKeysAndValues(k, v)
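
The model-side edits are pure plumbing: `GPT.forward` hands its input ids `idx` to each block, `Block.forward` relays them as `token_idx`, and the attention layer passes them on to the cache, untouched. A runnable toy of that pattern is below; `ToyBlock`/`ToyAttention`/`toy_gpt_forward` are hypothetical stand-ins (the real methods also take `cos`, `sin`, masks, and positions).

# Runnable toy of the token-index plumbing; stand-in classes, not litgpt code.
import torch


class ToyAttention:
    def __init__(self):
        self.seen_tokens = []  # records what would reach the KV cache

    def forward(self, x, token_idx):
        # In litgpt this is where kv_cache(k, v, token_idx=...) or
        # kv_cache.prefill(key=k, value=v, token_idx=...) is called.
        self.seen_tokens.append(token_idx)
        return x


class ToyBlock:
    def __init__(self):
        self.attn = ToyAttention()

    def __call__(self, x, token_idx):
        # Block.forward: token_idx is relayed to attention unchanged
        return self.attn.forward(x, token_idx=token_idx)


def toy_gpt_forward(blocks, idx):
    # GPT.forward: the input token ids `idx` go to every block
    x = torch.zeros(idx.shape[0], idx.shape[1], 16)  # stand-in embeddings
    for block in blocks:
        x = block(x, token_idx=idx)
    return x


blocks = [ToyBlock() for _ in range(2)]
idx = torch.randint(0, 128, (2, 5))  # (batch, seq_len) token ids
toy_gpt_forward(blocks, idx)
assert all(b.attn.seen_tokens[0] is idx for b in blocks)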

tests/kvcache/test_base.py

Lines changed: 16 additions & 2 deletions
@@ -15,6 +15,7 @@ def test_most_recent():
     seed = 31415927
     random.seed(seed)
     torch.random.manual_seed(seed)
+    vocab_size = 128
 
     params = KVCacheParams(
         batch_size=3,
@@ -34,9 +35,22 @@ def test_most_recent():
         num_prefill = max_prefill_length
 
     keys, values = random_keys_values(params, num=num_insert)
-    kv_cache.prefill(keys[:, :, :num_prefill, :], values[:, :, :num_prefill, :])
+    token_idx = torch.randint(
+        low=0,
+        high=vocab_size,
+        size=(params.batch_size, num_insert),
+    )
+    kv_cache.prefill(
+        key=keys[:, :, :num_prefill, :],
+        value=values[:, :, :num_prefill, :],
+        token_idx=token_idx[:, :num_prefill],
+    )
     for pos in range(num_prefill, num_insert):
-        kv_cache(keys[:, :, pos:(pos + 1), :], values[:, :, pos:(pos + 1), :])
+        kv_cache(
+            keys[:, :, pos:(pos + 1), :],
+            values[:, :, pos:(pos + 1), :],
+            token_idx=token_idx[:, pos:(pos + 1)],
+        )
         if kv_cache.update_requires_attn_weights():
             attn_weights = random_attn_weights(params, num=kv_cache.current_length)
             kv_cache.update(attn_weights=attn_weights)

tests/kvcache/test_generic.py

Lines changed: 28 additions & 4 deletions
@@ -18,6 +18,7 @@ def test_store_retrieve(name):
     seed = 31415927
     random.seed(seed)
     torch.random.manual_seed(seed)
+    vocab_size = 128
 
     params = KVCacheParams(
         batch_size=3,
@@ -40,11 +41,22 @@ def test_store_retrieve(name):
         num_prefill = max_prefill_length
 
     keys, values = random_keys_values(params, num=num_insert)
-    kv_cache.prefill(keys[:, :, :num_prefill, :], values[:, :, :num_prefill, :])
+    token_idx = torch.randint(
+        low=0,
+        high=vocab_size,
+        size=(params.batch_size, num_insert),
+    )
+    kv_cache.prefill(
+        key=keys[:, :, :num_prefill, :],
+        value=values[:, :, :num_prefill, :],
+        token_idx=token_idx[:, :num_prefill],
+    )
     keys_and_values = None
     for pos in range(num_prefill, num_insert):
         keys_and_values = kv_cache(
-            keys[:, :, pos:(pos + 1), :], values[:, :, pos:(pos + 1), :]
+            keys[:, :, pos:(pos + 1), :],
+            values[:, :, pos:(pos + 1), :],
+            token_idx=token_idx[:, pos:(pos + 1)],
         )
         if kv_cache.update_requires_attn_weights():
             attn_weights = random_attn_weights(params, num=kv_cache.current_length)
@@ -80,6 +92,7 @@ def test_prefill(name):
     seed = 31415927
     random.seed(seed)
     torch.random.manual_seed(seed)
+    vocab_size = 128
     num_compares = 3
 
     params = KVCacheParams(
@@ -95,18 +108,29 @@ def test_prefill(name):
     kv_cache = create_kv_cache(name, params)
 
     keys, values = random_keys_values(params, num=cache_length)
+    token_idx = torch.randint(
+        low=0,
+        high=vocab_size,
+        size=(params.batch_size, cache_length),
+    )
     keys_cached = []
     values_cached = []
     max_prefill_length = kv_cache.max_prefill_length
     for _ in range(num_compares):
         num_prefill = random.randint(cache_length // 8, cache_length)
         if max_prefill_length is not None and num_prefill > max_prefill_length:
             num_prefill = max_prefill_length
-        kv_cache.prefill(keys[:, :, :num_prefill, :], values[:, :, :num_prefill, :])
+        kv_cache.prefill(
+            key=keys[:, :, :num_prefill, :],
+            value=values[:, :, :num_prefill, :],
+            token_idx=token_idx[:, :num_prefill],
+        )
         keys_and_values = None
         for pos in range(num_prefill, cache_length):
             keys_and_values = kv_cache(
-                keys[:, :, pos:(pos + 1), :], values[:, :, pos:(pos + 1), :]
+                keys[:, :, pos:(pos + 1), :],
+                values[:, :, pos:(pos + 1), :],
+                token_idx=token_idx[:, pos:(pos + 1)],
            )
            if kv_cache.update_requires_attn_weights():
                attn_weights = random_attn_weights(params, num=kv_cache.current_length)
