@@ -31,6 +31,7 @@ def generate(
     prompt: torch.Tensor,
     max_returned_tokens: int,
     *,
+    prompt_chunksize: int = 1,
     temperature: float = 1.0,
     top_k: Optional[int] = None,
     top_p: float = 1.0,
@@ -60,35 +61,60 @@ def generate(
             or https://huyenchip.com/2024/01/16/sampling.html#top_p
         stop_tokens: If specified, stop generating any more token once one of this list is generated.
     """
-    from litgpt.generate.base import generate_fn
-    return generate_fn(
-        include_prompt=False,
-        include_eos=False,
-        model=model,
-        prompt=prompt,
-        max_returned_tokens=max_returned_tokens,
-        temperature=temperature,
-        top_k=top_k,
-        top_p=top_p,
-        stop_tokens=stop_tokens
+    from litgpt.generate.base import batched_generate_fn
+
+    return map(
+        lambda lst: lst[0],
+        batched_generate_fn(
+            model=model,
+            prompts=[prompt],
+            max_returned_tokens=max_returned_tokens,
+            prompt_chunksize=prompt_chunksize,
+            sample_args=dict(
+                temperature=temperature,
+                top_k=top_k,
+                top_p=top_p,
+            ),
+            stop_tokens=stop_tokens,
+            include_prompt=False,
+            include_eos=False,
+        )
     )
 
 
-def process_prompt(prompt, model, tokenizer, prompt_style, fabric, temperature, max_new_tokens, top_k, top_p, stop_tokens):
+def process_prompt(
+    prompt: str,
+    model: GPT,
+    tokenizer,
+    prompt_style,
+    fabric,
+    max_new_tokens: int,
+    prompt_chunksize: int,
+    temperature: float,
+    top_k: Optional[int],
+    top_p: float,
+    stop_tokens: Tuple[List[int], ...],
+):
     prompt = prompt_style.apply(prompt=prompt)
     encoded_prompt = tokenizer.encode(prompt, device=fabric.device)
 
     if max_new_tokens is None:
         max_returned_tokens = model.max_seq_length
     else:
-        first_turn = model.mask_cache is None
         max_returned_tokens = encoded_prompt.size(0) + max_new_tokens
-        if first_turn or max_returned_tokens > model.max_seq_length:
+        msl = model.max_seq_length
+        if max_returned_tokens > msl or model.config.block_size == msl:
             model.max_seq_length = max_returned_tokens
-            model.set_kv_cache(batch_size=1, device=fabric.device)
 
     y: Iterator[torch.Tensor] = generate(
-        model, encoded_prompt, max_returned_tokens, temperature=temperature, top_k=top_k, top_p=top_p, stop_tokens=stop_tokens
+        model=model,
+        prompt=encoded_prompt,
+        max_returned_tokens=max_returned_tokens,
+        prompt_chunksize=prompt_chunksize,
+        temperature=temperature,
+        top_k=top_k,
+        top_p=top_p,
+        stop_tokens=stop_tokens,
     )
     token_generator: Iterator[str] = tokenizer.decode_stream(y, device=fabric.device)
 
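Note: `generate` now returns a lazy `map` over `batched_generate_fn` applied to a single-element batch, so each yielded item is the sole entry of that step's batch. A minimal sketch of calling it with the new `prompt_chunksize` argument, assuming `model`, `tokenizer`, and `fabric` are already set up as in `main()`; the chunk size, sampling values, and EOS stop token below are illustrative and not part of this diff:

```python
# Sketch only: `model`, `tokenizer`, and `fabric` are assumed to be loaded already.
encoded_prompt = tokenizer.encode("What is a KV cache?", device=fabric.device)

stream = generate(
    model=model,
    prompt=encoded_prompt,
    max_returned_tokens=encoded_prompt.size(0) + 64,
    prompt_chunksize=16,                # prefill the prompt 16 tokens at a time
    temperature=0.8,
    top_k=50,
    stop_tokens=([tokenizer.eos_id],),  # stop once EOS is sampled
)

# As in process_prompt() below, the token stream can be decoded incrementally:
for piece in tokenizer.decode_stream(stream, device=fabric.device):
    print(piece, end="", flush=True)
```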
@@ -103,8 +129,7 @@ def process_prompt(prompt, model, tokenizer, prompt_style, fabric, temperature,
 
     t = time.perf_counter() - t0
 
-    for block in model.transformer.h:
-        block.attn.kv_cache.reset_parameters()
+    model.clear_kv_cache()
     fabric.print(
         f"\nTime for inference: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec,"
         f" {tokens_generated} tokens",
@@ -113,7 +138,19 @@ def process_prompt(prompt, model, tokenizer, prompt_style, fabric, temperature,
     fabric.print()
 
 
-def interact(multiline, model, tokenizer, prompt_style, fabric, temperature, max_new_tokens, top_k, top_p, stop_tokens):
+def interact(
+    multiline: bool,
+    model: GPT,
+    tokenizer,
+    prompt_style,
+    fabric,
+    max_new_tokens: int,
+    prompt_chunksize: int,
+    temperature: float,
+    top_k: Optional[int],
+    top_p: float,
+    stop_tokens: Tuple[List[int], ...],
+):
     while True:
         try:
             if not multiline:
@@ -135,14 +172,27 @@ def interact(multiline, model, tokenizer, prompt_style, fabric, temperature, max
         if not prompt or prompt in ("!quit", "!exit"):
             break
 
-        process_prompt(prompt, model, tokenizer, prompt_style, fabric, temperature, max_new_tokens, top_k, top_p, stop_tokens)
+        process_prompt(
+            prompt=prompt,
+            model=model,
+            tokenizer=tokenizer,
+            prompt_style=prompt_style,
+            fabric=fabric,
+            temperature=temperature,
+            max_new_tokens=max_new_tokens,
+            prompt_chunksize=prompt_chunksize,
+            top_k=top_k,
+            top_p=top_p,
+            stop_tokens=stop_tokens,
+        )
 
 
 @torch.inference_mode()
 def main(
     checkpoint_dir: Path,
     *,
     max_new_tokens: int = 50,
+    prompt_chunksize: int = 1,
     top_k: Optional[int] = 50,
     top_p: float = 1.0,
     temperature: float = 0.8,
@@ -158,6 +208,11 @@ def main(
         checkpoint_dir: A local path to a directory containing the model weights or a valid model name.
             You can get a list of valid model names via the `litgpt download list` command line argument.
         max_new_tokens: The number of generation steps to take.
+        prompt_chunksize: If even the shortest prompt is longer than the KV
+            cache, prompts are processed in chunks of this size in the
+            prefill phase. Once the shortest has been processed to the
+            end, we proceed with chunk size 1.
+            Defaults to 1, but larger values are recommended for long prompts.
         top_k: The number of top most probable tokens to consider in the sampling process.
         top_p: If specified, it represents the cumulative probability threshold to consider in the sampling process.
             In top-p sampling, the next token is sampled from the highest probability tokens
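For intuition, the chunked prefill described by `prompt_chunksize` can be pictured with the standalone sketch below. This is not the litgpt implementation (that lives in `batched_generate_fn`); the helper name `chunked_prefill` is hypothetical, and the `model(chunk, input_pos=...)` call assumes a litgpt-style `GPT.forward` with a preallocated KV cache:

```python
import torch

def chunked_prefill(model, prompt: torch.Tensor, prompt_chunksize: int) -> torch.Tensor:
    """Hypothetical helper: run the prefill phase over `prompt` in chunks.

    Each forward pass writes the chunk's keys/values into the model's KV cache,
    so peak activation memory scales with the chunk size rather than with the
    full prompt length.
    """
    logits = None
    pos = 0
    while pos < prompt.size(0):
        chunk = prompt[pos : pos + prompt_chunksize]
        input_pos = torch.arange(pos, pos + chunk.size(0), device=prompt.device)
        logits = model(chunk.unsqueeze(0), input_pos=input_pos)
        pos += chunk.size(0)
    # The logits at the last position of the final chunk are what the sampler
    # uses to draw the first newly generated token.
    return logits
```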
@@ -252,8 +307,9 @@ def main(
         tokenizer=tokenizer,
         prompt_style=prompt_style,
         fabric=fabric,
-        temperature=temperature,
         max_new_tokens=(None if compile else max_new_tokens),
+        prompt_chunksize=prompt_chunksize,
+        temperature=temperature,
         top_k=top_k,
         top_p=top_p,
         stop_tokens=stop_tokens