
Commit b7c8315

Add Llama3.2 1B as a new example model
1 parent e44b259 commit b7c8315

File tree

7 files changed, +447 −1 lines changed


examples/models/llama/export_llama_lib.py

Lines changed: 4 additions & 1 deletion
@@ -78,7 +78,7 @@


 EXECUTORCH_DEFINED_MODELS = ["llama2", "llama3", "llama3_1", "llama3_2"]
-TORCHTUNE_DEFINED_MODELS = ["llama3_2_vision"]
+TORCHTUNE_DEFINED_MODELS = ["llama3_2_vision", "llama3_2_tt"]


 class WeightType(Enum):

@@ -811,6 +811,9 @@ def _load_llama_model(
     elif modelname in TORCHTUNE_DEFINED_MODELS:
         if modelname == "llama3_2_vision":
             model_class_name = "Llama3_2Decoder"
+        if modelname == "llama3_2_tt":
+            modelname = "llama3_2"
+            model_class_name = "Llama3_2"
     else:
         raise ValueError(f"{modelname} is not a valid Llama model.")
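The new entry registers "llama3_2_tt" as a user-facing model name while reusing the existing llama3_2 example directory: when that name is selected, _load_llama_model rewrites modelname to "llama3_2" (the directory of the new files added below) and sets model_class_name to the new Llama3_2 class. A minimal sketch of that remapping, using a hypothetical resolve_model helper that is not part of the commit:

# Hypothetical sketch of the name -> (directory, class) remapping that the
# _load_llama_model change performs for TorchTune-defined models. Not part of the commit.
def resolve_model(modelname: str) -> tuple[str, str]:
    if modelname == "llama3_2_vision":
        return "llama3_2_vision", "Llama3_2Decoder"
    if modelname == "llama3_2_tt":
        # The user-facing name maps onto the examples/models/llama3_2 directory.
        return "llama3_2", "Llama3_2"
    raise ValueError(f"{modelname} is not a valid Llama model.")

assert resolve_model("llama3_2_tt") == ("llama3_2", "Llama3_2")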

examples/models/llama/runner/generation.py

Lines changed: 3 additions & 0 deletions
@@ -146,12 +146,15 @@ def text_completion(
         This method generates text completion for the provided prompt, employing nucleus sampling to introduce controlled randomness.
         """
         prompt_tokens = self.tokenizer.encode(prompt, bos=True, eos=False)
+        print(f"Encoded prompt: {prompt_tokens}")
+        print("Generating")
         generation_tokens = self.generate(
             prompt_tokens=prompt_tokens,
             temperature=temperature,
             top_p=top_p,
             echo=echo,
         )
+        print("Generated")
         return {
             "generation": self.tokenizer.decode(generation_tokens),
             "tokens": generation_tokens,
Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from .model import Llama3_2

__all__ = ["Llama3_2"]

examples/models/llama3_2/model.py

Lines changed: 150 additions & 0 deletions
@@ -0,0 +1,150 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-unsafe

import json
from typing import Any, Dict

import torch
from executorch.examples.models.checkpoint import (
    get_checkpoint_dtype,
    get_default_model_resource_dir,
)

from executorch.examples.models.model_base import EagerModelBase
from torchtune.models.llama3_2._model_builders import llama3_2_1b
from torchtune.models.convert_weights import meta_to_tune


class Llama3_2(EagerModelBase):
    """
    Llama3.2 model as defined in TorchTune.
    """

    def __init__(self, **kwargs):
        # Set member vars from kwargs.
        self.max_seq_len = kwargs.get(
            "max_seq_len", 8192
        )  # Trained to be a lot larger, but this value is kept small because of static kv cache at the moment.
        self.encoder_max_seq_len = kwargs.get(
            "encoder_max_seq_len", int(4 * (448 / 14) ** 2 + 1)
        )  # Same as above.
        self.output_prune_map_path = kwargs.get("output_prune_map_path", None)
        self.use_kv_cache = kwargs.get("use_kv_cache", False)
        self.verbose = kwargs.get("verbose", False)
        self.args = kwargs.get("args", None)

        ckpt_dir = get_default_model_resource_dir(__file__)
        # Single checkpoint file.
        checkpoint_path = kwargs.get("checkpoint", ckpt_dir / "demo_rand_params.pth")
        # Sharded checkpoint.
        checkpoint_dir = kwargs.get("checkpoint_dir", None)
        params_path = kwargs.get("params", ckpt_dir / "demo_config.json")

        self.causal_mask = torch.tril(
            torch.ones(
                size=(self.max_seq_len, self.max_seq_len),
                dtype=torch.bool,
            )
        )
        self.input_pos = torch.arange(self.max_seq_len)

        # Load checkpoint and params.
        device = "cpu"
        if checkpoint_dir is not None:
            raise NotImplementedError(
                "Sharded checkpoint not yet supported for Llama3_2."
            )
        else:
            checkpoint = torch.load(checkpoint_path, map_location=device, mmap=True)
        checkpoint = meta_to_tune(checkpoint)
        with open(params_path, "r") as f:
            params = json.loads(f.read())

        # Find dtype from checkpoint. (skip for now)
        self.dtype = get_checkpoint_dtype(checkpoint)

        # Load model.
        self.model_ = llama3_2_1b()

        # Save params for future use.
        for param_name, param_val in params.items():
            setattr(self.model_, param_name, param_val)

        # Quantize. (skip for now)

        # Load checkpoint.
        missing, unexpected = self.model_.load_state_dict(
            checkpoint,
            strict=False,
            assign=True,
        )
        if kwargs.get("verbose", False):
            print("============= missing keys ================")
            print(missing)
            print("============= /missing ================")
            print("============= unexpected keys ================")
            print(unexpected)
            print("============= /unexpected ================")

        # Prune the output layer if output_prune_map is provided.
        output_prune_map = None
        if self.output_prune_map_path is not None:
            from executorch.examples.models.llama2.source_transformation.prune_output import (
                prune_output_vocab,
            )

            with open(self.output_prune_map_path, "r") as f:
                output_prune_map = json.load(f)
            # Change keys from string to int (json only supports string keys).
            output_prune_map = {int(k): v for (k, v) in output_prune_map.items()}

            self.model_ = prune_output_vocab(self.model_, output_prune_map)

        if self.use_kv_cache:
            print("Setting up KV cache on the model...")
            self.model_.setup_caches(
                batch_size=1,
                dtype=self.dtype,
                decoder_max_seq_len=self.max_seq_len,
            )

    def get_eager_model(self) -> torch.nn.Module:
        if self.dtype:
            return self.model_.to(self.dtype)
        else:
            return self.model_.to(torch.float16)

    def get_example_inputs(self):
        return (torch.ones(1, 32, dtype=torch.long),)

    def get_example_kwarg_inputs(self):
        # For export we must use the prefill versions of the
        # causal mask and input_pos.
        if self.use_kv_cache:
            return {
                "input_pos": self.input_pos[None, :32],
                "mask": self.causal_mask[None, :32],
            }
        else:
            return None

    def get_dynamic_shapes(self):
        batch_size = 1
        dim_seq_len = torch.export.Dim("token_dim", min=1, max=self.max_seq_len)
        if self.use_kv_cache:
            dynamic_shapes = {
                "tokens": {0: batch_size, 1: dim_seq_len},
                "input_pos": {0: batch_size, 1: dim_seq_len},
                "mask": {0: batch_size, 1: dim_seq_len, 2: None},
            }
        else:
            dynamic_shapes = {
                "tokens": {0: batch_size, 1: dim_seq_len},
            }
        return dynamic_shapes
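The class implements ExecuTorch's EagerModelBase interface, so the example inputs, kwarg inputs, and dynamic shapes it returns are shaped for torch.export. Below is a minimal sketch of how those pieces might fit together; the checkpoint and params paths are placeholders, and the output file name is only an assumption matching the .pt2 runner added later in this commit.

import torch
from executorch.examples.models.llama3_2 import Llama3_2

# Placeholder paths: point them at a real Llama 3.2 1B checkpoint (Meta format)
# and its params JSON.
model_wrapper = Llama3_2(
    checkpoint="/path/to/consolidated.00.pth",
    params="/path/to/params.json",
    use_kv_cache=True,
    max_seq_len=8192,
)

eager_model = model_wrapper.get_eager_model()
example_args = model_wrapper.get_example_inputs()          # (tokens,) of shape [1, 32]
example_kwargs = model_wrapper.get_example_kwarg_inputs()  # prefill input_pos and mask slices
dynamic_shapes = model_wrapper.get_dynamic_shapes()        # token dim dynamic up to max_seq_len

# Export with the prefill example inputs and save as a .pt2 artifact.
exported = torch.export.export(
    eager_model,
    example_args,
    kwargs=example_kwargs,
    dynamic_shapes=dynamic_shapes,
)
torch.export.save(exported, "llama3_2_1b.pt2")  # assumed name, consumable via --pt2 below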
Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import argparse
import json
from typing import Optional

import torch

from executorch.examples.models.llama.export_llama_lib import (
    _prepare_for_llama_export,
    build_args_parser as _build_args_parser,
    TORCHTUNE_DEFINED_MODELS,
)
from executorch.examples.models.llama3_2_vision.runner.generation import TorchTuneLlamaRunner
from executorch.extension.llm.export import LLMEdgeManager


class EagerLlamaRunner(TorchTuneLlamaRunner):
    """
    Runs llama in eager mode with provided checkpoint file.
    """

    def __init__(self, args):
        with open(args.params, "r") as f:
            params = json.loads(f.read())
        super().__init__(
            tokenizer_path=args.tokenizer_path,
            max_seq_len=args.max_seq_length,
            max_batch_size=1,
            use_kv_cache=args.use_kv_cache,
            vocab_size=params["vocab_size"],
            device="cuda" if torch.cuda.is_available() else "cpu",
        )
        manager: LLMEdgeManager = _prepare_for_llama_export(args)
        self.model = manager.model.eval().to(device=self.device)

    def forward(
        self,
        tokens: Optional[torch.LongTensor] = None,
        input_pos: Optional[torch.LongTensor] = None,
        mask: Optional[torch.LongTensor] = None,
    ) -> torch.Tensor:
        return self.model.forward(tokens=tokens, input_pos=input_pos, mask=mask)


def build_args_parser() -> argparse.ArgumentParser:
    parser = _build_args_parser()

    parser.add_argument(
        "--prompt",
        type=str,
        default="Hello",
    )

    parser.add_argument(
        "--temperature",
        type=float,
        default=0,
    )

    return parser


def main() -> None:
    parser = build_args_parser()
    args = parser.parse_args()

    runner = EagerLlamaRunner(args)
    result = runner.text_completion(
        prompt=args.prompt,
        temperature=args.temperature,
    )
    print(
        "Response: \n{response}\n Tokens:\n {tokens}".format(
            response=result["generation"], tokens=result["tokens"]
        )
    )


if __name__ == "__main__":
    main()  # pragma: no cover
Lines changed: 95 additions & 0 deletions
@@ -0,0 +1,95 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import argparse
import json
from typing import Optional

import torch

from executorch.examples.models.llama.export_llama_lib import (
    _prepare_for_llama_export,
    build_args_parser as _build_args_parser,
    TORCHTUNE_DEFINED_MODELS,
)
from executorch.examples.models.llama3_2_vision.runner.generation import TorchTuneLlamaRunner
from executorch.extension.llm.export import LLMEdgeManager


class ExportedLlamaRunner(TorchTuneLlamaRunner):
    """
    Runs a torch-exported .pt2 Llama.
    """

    def __init__(self, args):
        with open(args.params, "r") as f:
            params = json.loads(f.read())
        super().__init__(
            tokenizer_path=args.tokenizer_path,
            max_seq_len=args.max_seq_length,
            max_batch_size=1,
            use_kv_cache=args.use_kv_cache,
            vocab_size=params["vocab_size"],
            device="cuda" if torch.cuda.is_available() else "cpu",
        )
        print(f"Loading model from {args.pt2}")
        self.model = torch.export.load(args.pt2).module()
        print("Model loaded")

    def forward(
        self,
        tokens: Optional[torch.LongTensor] = None,
        input_pos: Optional[torch.LongTensor] = None,
        mask: Optional[torch.LongTensor] = None,
    ) -> torch.Tensor:
        print("Forward")
        if self.use_kv_cache:
            return self.model(tokens, input_pos=input_pos, mask=mask)
        else:
            return self.model(tokens)


def build_args_parser() -> argparse.ArgumentParser:
    parser = _build_args_parser()

    parser.add_argument(
        "--prompt",
        type=str,
        default="Hello",
    )

    parser.add_argument(
        "--pt2",
        type=str,
        required=True,
    )

    parser.add_argument(
        "--temperature",
        type=float,
        default=0,
    )

    return parser


def main() -> None:
    parser = build_args_parser()
    args = parser.parse_args()

    runner = ExportedLlamaRunner(args)
    result = runner.text_completion(
        prompt=args.prompt,
        temperature=args.temperature,
    )
    print(
        "Response: \n{response}\n Tokens:\n {tokens}".format(
            response=result["generation"], tokens=result["tokens"]
        )
    )


if __name__ == "__main__":
    main()  # pragma: no cover
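ExportedLlamaRunner.forward feeds tokens (plus input_pos and mask when a KV cache was exported) straight into the loaded torch.export program. A minimal sketch of that call pattern outside the runner, assuming a .pt2 produced as in the export sketch above with the default max_seq_len of 8192; the file name is a placeholder:

import torch

# Load the exported program and grab its callable module,
# mirroring what ExportedLlamaRunner does in __init__.
program = torch.export.load("llama3_2_1b.pt2")  # placeholder path
module = program.module()

# A 32-token prefill call, matching the example inputs the model class exports with.
tokens = torch.ones(1, 32, dtype=torch.long)
input_pos = torch.arange(8192)[None, :32]
mask = torch.tril(torch.ones(8192, 8192, dtype=torch.bool))[None, :32]
logits = module(tokens, input_pos=input_pos, mask=mask)
print(logits.shape)  # expected: [1, 32, vocab_size]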
