Commit 212c7a7

Fixed tests
1 parent 16526f3 commit 212c7a7

7 files changed (+35, -19 lines changed)

litgpt/generate/adapter.py

Lines changed: 1 addition & 1 deletion

@@ -147,7 +147,7 @@ def main(
         temperature=temperature,
         top_k=top_k,
         top_p=top_p,
-        eos_id=tokenizer.eos_id,
+        eos_id=int(tokenizer.eos_id),
     )[0]
     t = time.perf_counter() - t0
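The only change here is the int(...) cast on eos_id; the same one-line change recurs in adapter_v2.py, base.py, full.py, sequentially.py, and tp.py below. The commit message does not say why, but a plausible reading is that tokenizer.eos_id is not always a plain Python int. A minimal, hypothetical sketch of what such a cast normalizes (the alternative scalar types shown are assumptions, not something this diff states):

    import numpy as np
    import torch

    # int(...) collapses common scalar types to a plain Python int, so the
    # eos_id handed to generate() always compares and serializes predictably.
    for eos_id in (2, np.int64(2), torch.tensor(2)):
        assert isinstance(int(eos_id), int)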

litgpt/generate/adapter_v2.py

Lines changed: 1 addition & 1 deletion

@@ -146,7 +146,7 @@ def main(
         temperature=temperature,
         top_k=top_k,
         top_p=top_p,
-        eos_id=tokenizer.eos_id,
+        eos_id=int(tokenizer.eos_id),
     )[0]
     t = time.perf_counter() - t0

litgpt/generate/base.py

Lines changed: 11 additions & 1 deletion

@@ -415,6 +415,14 @@ def generate(
     However, KV cache eviction is done in a more coarse-grained manner,
     which can lead to worse performance.

+    Key-value caching:
+
+    KV caches must have been assigned in `model`, in that
+    `model.are_kv_caches_assigned() == True`. This is done by either
+    assigning KV caches with `model.assign_kv_caches(...)`, or by creating
+    default (dense) KV caches with `model.set_kv_cache(...)`. The latter does
+    not allow to control memory being used.
+
     Args:
         model: The model to use.
         prompts: List of batch_size 1D tensors, each being a prompt sequence

@@ -570,6 +578,8 @@ def main(
     with fabric.init_tensor():
         # set the max_seq_length to limit the memory usage to what we need
         model.max_seq_length = max_returned_tokens
+        # enable the kv cache
+        model.set_kv_cache(batch_size=1)
     model.eval()

     if compile:

@@ -594,7 +604,7 @@ def main(
         temperature=temperature,
         top_k=top_k,
         top_p=top_p,
-        eos_id=tokenizer.eos_id,
+        eos_id=int(tokenizer.eos_id),
     )[0]
     t = time.perf_counter() - t0
     fabric.print(tokenizer.decode(y))
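The docstring added above states a precondition: generate only runs once KV caches are attached to the model, either via model.assign_kv_caches(...) or via the default dense caches from model.set_kv_cache(...). A minimal sketch of that setup, assuming the interfaces visible in this diff; the config name, prompt tokens, sampling values, and the positional max-returned-tokens argument are illustrative assumptions, not part of the commit:

    import torch
    from litgpt.config import Config
    from litgpt.generate.base import generate
    from litgpt.model import GPT

    model = GPT(Config.from_name("pythia-14m"))  # placeholder: any small config
    model.max_seq_length = 128                   # bounds the default cache size
    model.eval()

    # Path 1: default (dense) caches; no control over the memory they use.
    model.set_kv_cache(batch_size=1)
    assert model.are_kv_caches_assigned()

    # Path 2 (instead of path 1): caller-built caches, one KVCache per layer,
    # e.g. an eviction-capable implementation. Schematic only:
    #   model.assign_kv_caches([make_cache(i) for i in range(model.config.n_layer)])

    prompt = torch.tensor([1, 2, 3])             # placeholder token ids
    with torch.inference_mode():
        y = generate(
            model,
            [prompt],      # per the docstring: a list of 1D prompt tensors
            32,            # assumed positional arg: max returned tokens
            temperature=0.8,
            top_k=50,
            top_p=1.0,
            eos_id=0,      # a plain int, matching the int(...) casts in this commit
        )[0]
    model.clear_kv_cache()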

litgpt/generate/full.py

Lines changed: 1 addition & 1 deletion

@@ -141,7 +141,7 @@ def main(
         temperature=temperature,
         top_k=top_k,
         top_p=top_p,
-        eos_id=tokenizer.eos_id,
+        eos_id=int(tokenizer.eos_id),
     )[0]
     t = time.perf_counter() - t0

litgpt/generate/sequentially.py

Lines changed: 3 additions & 2 deletions

@@ -238,6 +238,7 @@ def main(
     # still, use init_tensor for the precision
     with fabric.init_tensor(), torch.device("meta"):
         model = GPT(config)
+        model.set_kv_cache(batch_size=1)
     print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr)

     t0 = time.perf_counter()

@@ -276,13 +277,13 @@ def main(
             temperature=temperature,
             top_k=top_k,
             top_p=top_p,
-            eos_id=tokenizer.eos_id,
+            eos_id=int(tokenizer.eos_id),
         )[0]
         t = time.perf_counter() - t0
-        model.clear_kv_cache()
         print(tokenizer.decode(y))
         tokens_generated = y.size(0) - prompt_length
         print(
             f"Time for inference {i + 1}: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec", file=sys.stderr
         )
+        model.clear_kv_cache()
     print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB", file=sys.stderr)

litgpt/generate/tp.py

Lines changed: 3 additions & 2 deletions

@@ -202,6 +202,7 @@ def main(
     # still, use init_tensor for the precision
     with fabric.init_tensor(), torch.device("meta"):
         model = GPT(config)
+        model.set_kv_cache(batch_size=1)
     fabric.print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr)

     # sequentially do: load the checkpoint on CPU -> quantize -> apply tp -> move to device

@@ -253,14 +254,14 @@ def main(
             temperature=temperature,
             top_k=top_k,
             top_p=top_p,
-            eos_id=tokenizer.eos_id,
+            eos_id=int(tokenizer.eos_id),
         )[0]
         t = time.perf_counter() - t0
-        model.clear_kv_cache()
         fabric.print(tokenizer.decode(y))
         tokens_generated = y.size(0) - prompt_length
         fabric.print(
             f"Time for inference {i + 1}: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec", file=sys.stderr
         )
+        model.clear_kv_cache()
     if fabric.device.type == "cuda":
         fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB", file=sys.stderr)

litgpt/model.py

Lines changed: 15 additions & 11 deletions

@@ -101,8 +101,12 @@ def max_seq_length(self, value: int) -> None:
                 device=self.cos.device,
             )

-    def _are_kv_caches_assigned(self) -> bool:
-        return any(block.attn.kv_cache is not None for block in self.transformer.h)
+    def are_kv_caches_assigned(self) -> bool:
+        status = [block.attn.kv_cache is not None for block in self.transformer.h]
+        result = any(status)
+        if result and not all(status):
+            raise IndexError("Some layers have KV caches assigned, but not all")
+        return result

     def assign_kv_caches(
         self, kv_caches: List[KVCache]

@@ -120,7 +124,7 @@ def assign_kv_caches(
             kv_caches: KV caches, one for each layer of the model

         """
-        if self._are_kv_caches_assigned():
+        if self.are_kv_caches_assigned():
             raise ValueError("Model has KV caches assigned already")
         if len(kv_caches) != self.config.n_layer:
             raise ValueError(f"kv_caches must have one entry per layer, so {self.config.n_layer} entries ")

@@ -154,7 +158,7 @@ def set_kv_cache(
             `self.max_seq_length`

         """
-        if self._are_kv_caches_assigned() and not self._default_kv_cache:
+        if self.are_kv_caches_assigned() and not self._default_kv_cache:
             raise ValueError("Model has KV caches assigned already")
         if max_seq_length is None:
             max_seq_length = self.max_seq_length

@@ -269,15 +273,14 @@ def forward(
             raise ValueError(f"Cannot forward sequence of length {T}, max seq length is only {self.max_seq_length}.")
         for_prefill = False
         if input_pos is not None:
-            for_prefill = (input_pos == 0)
             # Few tokens generation. This needs a KV cache. If none is assigned,
             # the call fails
-            msg_suffix = f"."
-            for l_ix, block in enumerate(self.transformer.h):
-                kv_cache = block.attn.kv_cache
-                if kv_cache is None:
-                    raise ValueError("KV caches are not assigned. Assign KV caches with 'assign_kv_caches' or create default caches with 'set_kv_cache'")
-                if not for_prefill:
+            if not self.are_kv_caches_assigned():
+                raise ValueError("KV caches are not assigned. Assign KV caches with 'assign_kv_caches' or create default caches with 'set_kv_cache'")
+            for_prefill = (input_pos == 0)
+            if not for_prefill:
+                for l_ix, block in enumerate(self.transformer.h):
+                    kv_cache = block.attn.kv_cache
                     if kv_cache.next_token_pos is None:
                         raise ValueError("Inference calls need to start with pre-fill, i.e. 'input_pos=0'")
                     if kv_cache.next_token_pos != input_pos:

@@ -373,6 +376,7 @@ def clear_kv_cache(self) -> None:
         if self._default_kv_cache:
             for block in self.transformer.h:
                 block.attn.kv_cache = None
+            self._default_kv_cache = False

     def get_kv_cache_params(self) -> Optional[KVCacheParams]:
         kv_cache = self.transformer.h[0].attn.kv_cache
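Taken together, the model.py changes make are_kv_caches_assigned a public method that also flags partially assigned caches, and have clear_kv_cache reset the _default_kv_cache flag so a default cache can be rebuilt later. A minimal sketch of that lifecycle, assuming the interfaces shown in this diff (the "pythia-14m" config name is a placeholder):

    from litgpt.config import Config
    from litgpt.model import GPT

    model = GPT(Config.from_name("pythia-14m"))   # placeholder: any small config
    model.max_seq_length = 128
    assert not model.are_kv_caches_assigned()     # no caches yet

    model.set_kv_cache(batch_size=1)              # default dense caches
    assert model.are_kv_caches_assigned()

    model.clear_kv_cache()                        # now also resets _default_kv_cache
    assert not model.are_kv_caches_assigned()
    model.set_kv_cache(batch_size=1)              # caches can be recreated afterwards

    # The new consistency check: caches on some layers but not all is an error.
    model.transformer.h[0].attn.kv_cache = None   # simulate a partially assigned state
    try:
        model.are_kv_caches_assigned()
    except IndexError as err:
        print(err)  # "Some layers have KV caches assigned, but not all"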
