import torch

-from executorch.examples.models.llama.llama_transformer import ModelArgs
from executorch.extension.llm.tokenizer.utils import get_tokenizer

@@ -47,11 +46,35 @@ def next_token(logits: torch.Tensor, temperature: float, top_p: float) -> int:


class LlamaRunner(ABC):
-    def __init__(self, tokenizer_path: str, model_args: ModelArgs, device: str = "cpu"):
-        self.params = model_args
+    def __init__(
+        self,
+        tokenizer_path: str,
+        max_seq_len: int,
+        max_batch_size: int,
+        use_kv_cache: bool,
+        vocab_size: int,
+        has_full_logits: bool = False,
+        device: str = "cpu",
+    ):
+        """
+        Constructor.
+
+        Args:
+            tokenizer_path: path to the tokenizer.model file.
+            max_seq_len: max length of the output sequence, after which the output will be clipped.
+            max_batch_size: max batch size.
+            use_kv_cache: whether to use a KV cache.
+            vocab_size: number of items in the vocab.
+            has_full_logits: whether the model returns the full logits or only the last logit.
+            device: device to run the runner on.
+        """
+        self.max_seq_len = max_seq_len
+        self.max_batch_size = max_batch_size
+        self.use_kv_cache = use_kv_cache
        self.tokenizer = get_tokenizer(tokenizer_path)
-        assert model_args.vocab_size == self.tokenizer.n_words
+        self.has_full_logits = has_full_logits
        self.device = device
+        assert vocab_size == self.tokenizer.n_words

    @abstractmethod
    def forward(
@@ -75,17 +98,20 @@ def generate( # noqa: C901
            tokens=torch.tensor([prompt_tokens], dtype=torch.long, device=self.device),
            input_pos=(
                torch.tensor([pos_base], dtype=torch.long, device=self.device)
-                if self.params.use_kv_cache
+                if self.use_kv_cache
                else None
            ),
        )

-        current_token = next_token(logits, temperature, top_p)
+        if self.has_full_logits:
+            current_token = next_token(logits[:, -1, :], temperature, top_p)
+        else:
+            current_token = next_token(logits, temperature, top_p)
        print(f"{self.tokenizer.decode_token(current_token)}", end="", flush=True)
        tokens = prompt_tokens + [current_token]

        while len(tokens) < max_seq_len:
-            if self.params.use_kv_cache:
+            if self.use_kv_cache:
                logits = self.forward(
                    tokens=torch.tensor(
                        [[current_token]], dtype=torch.long, device=self.device
@@ -100,13 +126,20 @@ def generate( # noqa: C901
                logits = self.forward(
                    tokens=torch.tensor([tokens], dtype=torch.long, device=self.device),
                )
-            current_token = next_token(logits, temperature, top_p)
+
+            # If the logits aren't already clipped to only contain the last logit, clip them.
+            if self.has_full_logits:
+                current_token = next_token(logits[:, -1, :], temperature, top_p)
+            else:
+                current_token = next_token(logits, temperature, top_p)
            tokens.append(current_token)
+
            if current_token == self.tokenizer.eos_id or (
                hasattr(self.tokenizer, "stop_tokens")
                and current_token in self.tokenizer.stop_tokens
            ):
                break
+
            print(f"{self.tokenizer.decode_token(current_token)}", end="", flush=True)
        print("\n")

@@ -136,7 +169,7 @@ def text_completion(
        """
        return self.generate(
            prompt_tokens=self.tokenizer.encode(prompt, bos=True, eos=False),
-            max_seq_len=self.params.max_seq_len,
+            max_seq_len=self.max_seq_len,
            temperature=temperature,
            top_p=top_p,
            echo=echo,
@@ -171,7 +204,7 @@ def chat_completion(
            prompt_tokens=self.tokenizer.encode(
                self._format_prompt(prompt), bos=True, eos=False
            ),
-            max_seq_len=self.params.max_seq_len,
+            max_seq_len=self.max_seq_len,
            temperature=temperature,
            top_p=top_p,
            echo=True,
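
For orientation, here is a minimal sketch of how a concrete runner might construct the base class after this change, passing the individual model parameters instead of a ModelArgs object. The subclass name, parameter values, paths, and the forward() body are assumptions for illustration only, not part of this PR.

# Hypothetical subclass; names, values, and the forward() body are illustrative assumptions.
class MyRunner(LlamaRunner):
    def __init__(self, tokenizer_path: str):
        super().__init__(
            tokenizer_path=tokenizer_path,
            max_seq_len=128,
            max_batch_size=1,
            use_kv_cache=True,
            vocab_size=32000,          # must equal tokenizer.n_words (asserted in __init__)
            has_full_logits=False,     # this model already returns only the last-position logit
        )

    def forward(self, tokens, input_pos=None):
        ...  # invoke the exported model here and return its logits

# runner = MyRunner("tokenizer.model")
# runner.text_completion(prompt="Hello", temperature=0.6, top_p=0.9)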
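A small sketch of the has_full_logits handling in generate(): when a model emits logits for every sequence position, only the final position is needed to sample the next token, hence the logits[:, -1, :] slice in the hunks above. The tensor shapes below are assumptions for illustration.

import torch

full_logits = torch.randn(1, 5, 32000)   # assumed [batch, seq_len, vocab_size] output
last_logits = full_logits[:, -1, :]      # [1, 32000]: logits for the final position only
# next_token(last_logits, temperature, top_p) then samples a single token id from this slice.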