Add runner

jackzhxng · jackzhxng · commit eeeeb8a4a022 · 2024-11-13T14:07:13.000-08:00
diff --git a/examples/models/llama3_2_vision/runner/eager.py b/examples/models/llama3_2_vision/runner/eager.py
@@ -0,0 +1,85 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import json
+from typing import Optional
+
+import torch
+
+from executorch.examples.models.llama.export_llama_lib import (
+    _prepare_for_llama_export,
+    build_args_parser as _build_args_parser,
+    TORCHTUNE_DEFINED_MODELS,
+)
+from executorch.examples.models.llama3_2_vision.runner.generation import TorchTuneLlamaRunner
+from executorch.extension.llm.export import LLMEdgeManager
+
+
+class EagerLlamaRunner(TorchTuneLlamaRunner):
+    """
+    Runs llama in eager mode with provided checkpoint file.
+
+    The model is obtained from ``_prepare_for_llama_export(args)``, switched
+    to eval mode and moved to ``self.device``. The device requested from the
+    base runner is CUDA when available, otherwise CPU.
+    """
+
+    def __init__(self, args):
+        """
+        Set up the base runner and load the eager model.
+
+        Args:
+            args: Parsed command-line namespace. This constructor reads
+                ``params`` (path to a JSON file that must contain
+                ``vocab_size``), ``tokenizer_path``, ``max_seq_length`` and
+                ``use_kv_cache``, and forwards the whole namespace to
+                ``_prepare_for_llama_export``.
+
+        Raises:
+            KeyError: If the params JSON has no ``vocab_size`` entry.
+        """
+        with open(args.params, "r") as f:
+            params = json.load(f)
+        super().__init__(
+            tokenizer_path=args.tokenizer_path,
+            max_seq_len=args.max_seq_length,
+            max_batch_size=1,
+            use_kv_cache=args.use_kv_cache,
+            vocab_size=params["vocab_size"],
+            device="cuda" if torch.cuda.is_available() else "cpu",
+        )
+        manager: LLMEdgeManager = _prepare_for_llama_export(args)
+        self.model = manager.model.eval().to(device=self.device)
+
+    def forward(
+        self,
+        tokens: Optional[torch.LongTensor] = None,
+        input_pos: Optional[torch.LongTensor] = None,
+        mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """
+        Run the eager model once and return its output.
+
+        Args:
+            tokens: Token ids to feed to the model.
+            input_pos: Positions of ``tokens`` within the sequence, if used.
+            mask: Attention mask, if used. ``TorchTuneLlamaRunner.generate``
+                passes slices of a boolean causal mask here.
+
+        Returns:
+            The result of ``self.model.forward`` (presumably logits; the
+            generation loop indexes the last position along dim 1).
+        """
+        # Inference only: eval() does not disable autograd, so without this
+        # every generation step would record a graph and retain activations.
+        with torch.no_grad():
+            return self.model.forward(tokens=tokens, input_pos=input_pos, mask=mask)
+
+
+def build_args_parser() -> argparse.ArgumentParser:
+    """
+    Return the llama export argument parser extended with runner options.
+
+    Adds ``--prompt`` (str, default ``"Hello"``) and ``--temperature``
+    (float, default ``0``) on top of the parser built by the export library.
+    """
+    cli = _build_args_parser()
+
+    # (flag, value type, default) for each generation-specific option.
+    runner_options = (
+        ("--prompt", str, "Hello"),
+        ("--temperature", float, 0),
+    )
+    for flag, value_type, fallback in runner_options:
+        cli.add_argument(flag, type=value_type, default=fallback)
+
+    return cli
+
+
+def main() -> None:
+    """
+    Parse the command line, run one eager text completion and print it.
+
+    The prompt and sampling temperature come from ``--prompt`` and
+    ``--temperature``; all other arguments configure the runner itself.
+    """
+    cli_args = build_args_parser().parse_args()
+
+    completion = EagerLlamaRunner(cli_args).text_completion(
+        prompt=cli_args.prompt,
+        temperature=cli_args.temperature,
+    )
+
+    response = completion["generation"]
+    tokens = completion["tokens"]
+    print(f"Response: \n{response}\n Tokens:\n {tokens}")
+
+
+# Script entry point: run the eager completion demo only when this file is
+# executed directly, not when it is imported.
+if __name__ == "__main__":
+    main()  # pragma: no cover
diff --git a/examples/models/llama3_2_vision/runner/generation.py b/examples/models/llama3_2_vision/runner/generation.py
@@ -0,0 +1,101 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from abc import ABC, abstractmethod
+from typing import List, Optional, TypedDict
+
+import torch
+
+from executorch.extension.llm.tokenizer.utils import get_tokenizer
+from executorch.examples.models.llama.runner.generation import LlamaRunner, next_token, sample_top_p
+
+
+class TorchTuneLlamaRunner(LlamaRunner):
+    """
+    ``LlamaRunner`` whose generation loop passes an explicit causal mask and
+    input positions to ``forward`` when the KV cache is enabled.
+    """
+
+    def __init__(
+        self,
+        tokenizer_path: str,
+        max_seq_len: int,
+        max_batch_size: int,
+        use_kv_cache: bool,
+        vocab_size: int,
+        device: str = "cpu",
+    ):
+        """
+        Initialise the base runner and precompute the mask and positions.
+
+        All arguments are forwarded positionally, in the order declared, to
+        ``LlamaRunner.__init__``. In addition:
+
+        Args:
+            max_seq_len: Side length of the precomputed causal mask and the
+                number of precomputed input positions.
+            device: Device on which the mask and positions are stored.
+        """
+        super().__init__(
+            tokenizer_path,
+            max_seq_len,
+            max_batch_size,
+            use_kv_cache,
+            vocab_size,
+            device,
+        )
+
+        # Lower-triangular (max_seq_len, max_seq_len) boolean mask: row i is
+        # True for columns 0..i. Built on CPU and then moved to the runner's
+        # device, because generate() creates its token tensors on that device
+        # and mixing devices in one forward call fails on CUDA.
+        self.causal_mask = torch.tril(
+            torch.ones(
+                size=(max_seq_len, max_seq_len),
+                dtype=torch.bool,
+            )
+        ).to(device)
+        self.input_pos = torch.arange(max_seq_len).to(device)
+
+    def generate(  # noqa: C901
+        self,
+        prompt_tokens: List[int],
+        max_seq_len: int,
+        temperature: float = 0.8,
+        top_p: float = 0.9,
+        echo: bool = False,
+    ) -> List[int]:
+        """
+        Generate tokens after ``prompt_tokens``, printing each as it arrives.
+
+        The prompt is processed in one prefill call. Tokens are then produced
+        one at a time until a stop token is sampled or the prompt plus the
+        generated tokens reach ``max_seq_len``. At least one token is always
+        sampled. A sampled stop token is kept in the result but not printed.
+
+        Args:
+            prompt_tokens: Token ids of the prompt.
+            max_seq_len: Upper bound on prompt plus generated tokens. It must
+                not exceed the ``max_seq_len`` given to the constructor,
+                which sizes the precomputed mask and positions.
+            temperature: Passed to ``next_token``.
+            top_p: Passed to ``next_token``.
+            echo: If True, the returned list starts with the prompt tokens.
+
+        Returns:
+            The generated token ids, preceded by the prompt when ``echo``.
+        """
+        # Prefill: mask slice is (1, seq_len, N) and positions are
+        # (1, seq_len), where N is the constructor's max_seq_len.
+        seq_len = len(prompt_tokens)
+        logits = self._forward_step(
+            prompt_tokens,
+            input_pos=self.input_pos[None, :seq_len],
+            mask=self.causal_mask[None, :seq_len],
+        )
+
+        # Only need the last logit.
+        current_token = next_token(logits[:, -1, :], temperature, top_p)
+        tokens = prompt_tokens + [current_token]
+
+        # The very first sampled token may already end the sequence; check it
+        # the same way as later tokens instead of decoding past it.
+        finished = self._is_stop_token(current_token)
+        if not finished:
+            print(f"{self.tokenizer.decode_token(current_token)}", end="", flush=True)
+
+        while not finished and len(tokens) < max_seq_len:
+            # With a KV cache only the newest token is fed; without one the
+            # whole sequence so far is re-run.
+            step_tokens = [current_token] if self.use_kv_cache else tokens
+            logits = self._forward_step(
+                step_tokens,
+                # Single position seq_len: positions are (1, 1) and the mask
+                # row is (1, 1, N).
+                input_pos=self.input_pos[None, seq_len, None],
+                mask=self.causal_mask[None, seq_len, None, :],
+            )
+
+            # Only need the last logit.
+            current_token = next_token(logits[:, -1, :], temperature, top_p)
+            tokens.append(current_token)
+
+            finished = self._is_stop_token(current_token)
+            if not finished:
+                print(
+                    f"{self.tokenizer.decode_token(current_token)}",
+                    end="",
+                    flush=True,
+                )
+                seq_len += 1
+
+        return tokens if echo else tokens[len(prompt_tokens) :]
+
+    def _forward_step(
+        self,
+        step_tokens: List[int],
+        input_pos: torch.Tensor,
+        mask: torch.Tensor,
+    ) -> torch.Tensor:
+        """Run ``forward`` on a batch of one sequence; mask and positions are used only with the KV cache."""
+        batch = torch.tensor([step_tokens], dtype=torch.long, device=self.device)
+        if self.use_kv_cache:
+            return self.forward(tokens=batch, input_pos=input_pos, mask=mask)
+        return self.forward(tokens=batch)
+
+    def _is_stop_token(self, token: int) -> bool:
+        """Return True if ``token`` is the tokenizer's EOS id or one of its ``stop_tokens`` (when it defines any)."""
+        return token == self.tokenizer.eos_id or (
+            hasattr(self.tokenizer, "stop_tokens")
+            and token in self.tokenizer.stop_tokens
+        )
+