2 files changed, +11 -8 lines changed

@@ -275,16 +275,16 @@ def batched_generate_fn(
         )
         # We may need the last time slice of `all_logits` below:
         all_logits = model(inputs, input_pos=start)
-        if start == 0:
-            max_tokens_forward = model.kv_cache_max_tokens_forward()
-            if prompt_chunksize > max_tokens_forward:
-                print(
-                    f"prompt_chunksize = {prompt_chunksize} > {max_tokens_forward} = max_tokens_forward. Lowering it to the latter.")
-                prompt_chunksize = max_tokens_forward
-        start = token_pos
         if token_pos == min_prompt_size:
             break
-        chunksize = min(prompt_chunksize, min_prompt_size - token_pos)
+        start = token_pos
+        # Note that `max_tokens_forward` can change during the course of
+        # prompt processing:
+        chunksize = min((
+            prompt_chunksize,
+            model.kv_cache_max_tokens_forward(),
+            min_prompt_size - token_pos
+        ))
         token_pos += chunksize

         # Generation loop: One token per iteration
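
For context, here is a minimal, self-contained sketch (not the library code) of the prompt-processing loop after this change. The DummyModel class, the process_prompt helper, and the toy numbers are made up for illustration; only the names prompt_chunksize, min_prompt_size, token_pos, chunksize, and kv_cache_max_tokens_forward() come from the diff above. The point is that the forward limit is re-queried for every chunk rather than only once at start == 0.

# Minimal sketch of chunked prompt processing with a per-chunk limit.
# `DummyModel` is a made-up stand-in whose KV-cache forward limit shrinks
# as the cache fills up; the real model and logits handling are elided.

class DummyModel:
    def __init__(self, cache_length: int = 10):
        self.cache_length = cache_length
        self.filled = 0  # token positions already written to the cache

    def kv_cache_max_tokens_forward(self) -> int:
        # May return a different value on every call, which is why the new
        # code recomputes `chunksize` in every loop iteration.
        return max(1, self.cache_length - self.filled)

    def __call__(self, num_tokens: int) -> None:
        self.filled = min(self.cache_length, self.filled + num_tokens)


def process_prompt(model: DummyModel, min_prompt_size: int, prompt_chunksize: int) -> list:
    token_pos = 0
    chunks = []
    while token_pos < min_prompt_size:
        # Cap the chunk by the requested chunk size, the current forward
        # limit of the KV cache, and the remaining prompt length:
        chunksize = min(
            prompt_chunksize,
            model.kv_cache_max_tokens_forward(),
            min_prompt_size - token_pos,
        )
        model(chunksize)
        chunks.append(chunksize)
        token_pos += chunksize
    return chunks


print(process_prompt(DummyModel(cache_length=10), min_prompt_size=12, prompt_chunksize=8))
# Prints [8, 2, 1, 1]: later chunks are capped by the shrinking cache limit.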

@@ -151,6 +151,9 @@ def next_token_pos(self) -> Optional[int]:
     @property
     def max_tokens_forward(self) -> int:
         """
+        Note that this limit may change during the course of the generation
+        for certain caches.
+
         Returns:
             Maximum number of token positions which can be treated in
             :meth:`forward`. Depends on cache, but is `<= cache_length`
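
To make the added docstring note concrete, here is a hypothetical cache whose max_tokens_forward limit shrinks as positions are written, which is the kind of behavior the note warns about. Only the property name and the `<= cache_length` invariant come from the diff; the ToyKVCache class and its internals are invented for illustration.

# Hypothetical example: a cache whose `max_tokens_forward` changes during
# generation. `ToyKVCache` is not part of the library; it only illustrates
# why callers should re-read the property instead of caching it once.

class ToyKVCache:
    def __init__(self, cache_length: int):
        self.cache_length = cache_length
        self.next_pos = 0  # number of token positions already written

    @property
    def max_tokens_forward(self) -> int:
        # Always <= cache_length, but shrinks as the cache fills up:
        return max(1, self.cache_length - self.next_pos)

    def forward(self, num_tokens: int) -> None:
        assert num_tokens <= self.max_tokens_forward
        self.next_pos = min(self.cache_length, self.next_pos + num_tokens)


cache = ToyKVCache(cache_length=8)
print(cache.max_tokens_forward)  # 8
cache.forward(6)
print(cache.max_tokens_forward)  # 2: the limit changed mid-generation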