From 7f81e00b6a6d6e3ee8650b1bc988b721a2b17e50 Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Wed, 9 Oct 2024 15:22:08 -0700 Subject: [PATCH 01/44] Changes to native runner to run tt --- examples/models/llama2/runner/native.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/examples/models/llama2/runner/native.py b/examples/models/llama2/runner/native.py index b0d6c20e961..c457762d71f 100644 --- a/examples/models/llama2/runner/native.py +++ b/examples/models/llama2/runner/native.py @@ -10,17 +10,17 @@ import torch -from examples.models.llama2.llama_transformer import ModelArgs +from executorch.examples.models.llama2.llama_transformer import ModelArgs from executorch.extension.pybindings.portable_lib import _load_for_executorch # Load custom ops and quantized ops. from executorch.extension.pybindings import portable_lib # noqa # usort: skip # Note: import this after portable_lib -from executorch.extension.llm.custom_ops import sdpa_with_kv_cache # noqa # usort: skip +# from executorch.extension.llm.custom_ops import sdpa_with_kv_cache # noqa # usort: skip from executorch.kernels import quantized # noqa -from .generation import LlamaRunner +from executorch.examples.models.llama2.runner.generation import LlamaRunner class NativeLlamaRunner(LlamaRunner): @@ -35,7 +35,7 @@ def __init__(self, args): max_seq_len=args.max_len, max_batch_size=1, use_kv_cache=args.kv_cache, - **params, + vocab_size=params["vocab_size"], ) super().__init__(tokenizer_path=args.tokenizer, model_args=model_args) self.model = _load_for_executorch(args.pte) @@ -45,11 +45,17 @@ def forward( tokens: Optional[torch.LongTensor] = None, input_pos: Optional[torch.LongTensor] = None, ) -> torch.Tensor: - return ( - self.model.forward((tokens, input_pos)) - if input_pos is not None - else self.model.forward((tokens,)) - )[0] + # TODO: in LlamaRunner there is a generate function that automatically generates + # input_pos tensor and inputs it into the model. Atm TorchTune models use + # kwargs for the input_pos, so we will need to make some changes. At least + # for the time being, we can run the non-kv cache version of the Torchtune + # model with just the tokens like below. + return (self.model.forward((tokens,)))[0] + # return ( + # self.model.forward((tokens, input_pos)) + # if input_pos is not None + # else self.model.forward((tokens,)) + # )[0] def build_args_parser() -> argparse.ArgumentParser: From 0b5a9a709a410dca334670a697e737e302a8eb2a Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Mon, 30 Sep 2024 12:53:03 -0700 Subject: [PATCH 02/44] Add kwarg example inputs to eager model base --- examples/models/llama2/model.py | 13 ++++++++----- examples/models/model_base.py | 5 +++-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/examples/models/llama2/model.py b/examples/models/llama2/model.py index 23f1c1b4898..a3532a41ca1 100644 --- a/examples/models/llama2/model.py +++ b/examples/models/llama2/model.py @@ -224,25 +224,27 @@ def get_eager_model(self) -> torch.nn.Module: # switch all to FP32 return self.model_.to(torch.float32) - def get_example_inputs(self): + def get_example_inputs(self) -> Tuple[Tuple, Dict]: if self.use_kv_cache: return self.get_example_inputs_kvcache_sdpa() else: - return ( + positional_inputs = ( torch.tensor( [[1, 2, 3]], dtype=torch.long ), # tokens, with kv cache our input token length is always just 1 token. ) + return (positional_inputs, {}) # assumption is the custom op doesnt support dynamic shape right now. 
It might but its untested so lets first get static shape working - def get_example_inputs_kvcache_sdpa(self): + def get_example_inputs_kvcache_sdpa(self) -> Tuple[Tuple, Dict]: if self.enable_dynamic_shape: - return ( + positional_inputs = ( torch.tensor([[2, 3, 4]], dtype=torch.long), torch.tensor([0], dtype=torch.long), ) + return (positional_inputs, {}) else: - return ( + positional_inputs = ( torch.tensor( [[1]], dtype=torch.long ), # tokens, with kv cache our input token length is always just 1 token. @@ -250,6 +252,7 @@ def get_example_inputs_kvcache_sdpa(self): [0], dtype=torch.long ), # start_pos, what token of output are we on. ) + return (positional_inputs, {}) def _transform_for_pre_quantization(self, checkpoint): assert hasattr(self.args, "preq_mode"), "preq_mode must be specified" diff --git a/examples/models/model_base.py b/examples/models/model_base.py index a1e639cf323..478f1e2d65f 100644 --- a/examples/models/model_base.py +++ b/examples/models/model_base.py @@ -5,6 +5,7 @@ # LICENSE file in the root directory of this source tree. from abc import ABC, abstractmethod +from typing import Dict, Tuple import torch @@ -37,11 +38,11 @@ def get_eager_model(self) -> torch.nn.Module: raise NotImplementedError("get_eager_model") @abstractmethod - def get_example_inputs(self): + def get_example_inputs(self) -> Tuple[Tuple, Dict]: """ Abstract method to provide example inputs for the model. Returns: - Any: Example inputs that can be used for testing and tracing. + Tuple[Tuple, Dict]: The positional inputs (Tuple) and the kwarg inputs (Dict). """ raise NotImplementedError("get_example_inputs") From a9647d2068a539173f1203bee7cbfca5572db421 Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Mon, 7 Oct 2024 15:41:23 -0700 Subject: [PATCH 03/44] Create create new method for example kwarg inputs instead --- examples/models/llama2/model.py | 13 +++++-------- examples/models/model_base.py | 5 ++--- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/examples/models/llama2/model.py b/examples/models/llama2/model.py index a3532a41ca1..23f1c1b4898 100644 --- a/examples/models/llama2/model.py +++ b/examples/models/llama2/model.py @@ -224,27 +224,25 @@ def get_eager_model(self) -> torch.nn.Module: # switch all to FP32 return self.model_.to(torch.float32) - def get_example_inputs(self) -> Tuple[Tuple, Dict]: + def get_example_inputs(self): if self.use_kv_cache: return self.get_example_inputs_kvcache_sdpa() else: - positional_inputs = ( + return ( torch.tensor( [[1, 2, 3]], dtype=torch.long ), # tokens, with kv cache our input token length is always just 1 token. ) - return (positional_inputs, {}) # assumption is the custom op doesnt support dynamic shape right now. It might but its untested so lets first get static shape working - def get_example_inputs_kvcache_sdpa(self) -> Tuple[Tuple, Dict]: + def get_example_inputs_kvcache_sdpa(self): if self.enable_dynamic_shape: - positional_inputs = ( + return ( torch.tensor([[2, 3, 4]], dtype=torch.long), torch.tensor([0], dtype=torch.long), ) - return (positional_inputs, {}) else: - positional_inputs = ( + return ( torch.tensor( [[1]], dtype=torch.long ), # tokens, with kv cache our input token length is always just 1 token. @@ -252,7 +250,6 @@ def get_example_inputs_kvcache_sdpa(self) -> Tuple[Tuple, Dict]: [0], dtype=torch.long ), # start_pos, what token of output are we on. 
) - return (positional_inputs, {}) def _transform_for_pre_quantization(self, checkpoint): assert hasattr(self.args, "preq_mode"), "preq_mode must be specified" diff --git a/examples/models/model_base.py b/examples/models/model_base.py index 478f1e2d65f..a1e639cf323 100644 --- a/examples/models/model_base.py +++ b/examples/models/model_base.py @@ -5,7 +5,6 @@ # LICENSE file in the root directory of this source tree. from abc import ABC, abstractmethod -from typing import Dict, Tuple import torch @@ -38,11 +37,11 @@ def get_eager_model(self) -> torch.nn.Module: raise NotImplementedError("get_eager_model") @abstractmethod - def get_example_inputs(self) -> Tuple[Tuple, Dict]: + def get_example_inputs(self): """ Abstract method to provide example inputs for the model. Returns: - Tuple[Tuple, Dict]: The positional inputs (Tuple) and the kwarg inputs (Dict). + Any: Example inputs that can be used for testing and tracing. """ raise NotImplementedError("get_example_inputs") From fa3b1d253796ce12f957d103a55650884f53e99c Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Mon, 30 Sep 2024 12:53:03 -0700 Subject: [PATCH 04/44] Add kwarg example inputs to eager model base --- examples/models/llama2/model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/models/llama2/model.py b/examples/models/llama2/model.py index 23f1c1b4898..8ea42ae98c6 100644 --- a/examples/models/llama2/model.py +++ b/examples/models/llama2/model.py @@ -315,3 +315,4 @@ def _transform_for_pre_quantization(self, checkpoint): int(embedding_bit_width), embedding_group_size, ) + From e8715ba9e3c7d67f81889b8ddbfa2a401657a01e Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Tue, 8 Oct 2024 00:20:44 -0700 Subject: [PATCH 05/44] Lint --- examples/models/llama2/model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/models/llama2/model.py b/examples/models/llama2/model.py index 8ea42ae98c6..23f1c1b4898 100644 --- a/examples/models/llama2/model.py +++ b/examples/models/llama2/model.py @@ -315,4 +315,3 @@ def _transform_for_pre_quantization(self, checkpoint): int(embedding_bit_width), embedding_group_size, ) - From a6f96a2fd9f050177ac9f7307428e17bbb7c0c97 Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Fri, 4 Oct 2024 20:37:08 -0700 Subject: [PATCH 06/44] Accept model type parameter in export_llama --- examples/models/llama2/README.md | 26 ++++++-- examples/models/llama2/export_llama.py | 3 +- examples/models/llama2/export_llama_lib.py | 75 ++++++++++++++-------- 3 files changed, 71 insertions(+), 33 deletions(-) diff --git a/examples/models/llama2/README.md b/examples/models/llama2/README.md index 1a6fe99fc41..8686b87c2a1 100644 --- a/examples/models/llama2/README.md +++ b/examples/models/llama2/README.md @@ -142,6 +142,7 @@ LLAMA_CHECKPOINT=path/to/checkpoint.pth LLAMA_PARAMS=path/to/params.json python -m examples.models.llama2.export_llama \ + --model llama3_2 --checkpoint "${LLAMA_CHECKPOINT:?}" \ --params "${LLAMA_PARAMS:?}" \ -kv \ @@ -162,6 +163,7 @@ LLAMA_QUANTIZED_CHECKPOINT=path/to/spinquant/checkpoint.pth LLAMA_PARAMS=path/to/params.json python -m examples.models.llama2.export_llama \ + --model llama3_2 --checkpoint "${LLAMA_QUANTIZED_CHECKPOINT:?}" \ --params "${LLAMA_PARAMS:?}" \ --use_sdpa_with_kv_cache \ @@ -185,7 +187,19 @@ You can export and run the original Llama 3 8B instruct model. 2. 
Export model and generate `.pte` file ``` - python -m examples.models.llama2.export_llama --checkpoint -p -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --embedding-quantize 4,32 --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" + python -m examples.models.llama2.export_llama + --model llama3 + --checkpoint + -p + -kv + --use_sdpa_with_kv_cache + -X + -qmode 8da4w + --group_size 128 + -d fp32 + --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' + --embedding-quantize 4,32 + --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" ``` Due to the larger vocabulary size of Llama 3, we recommend quantizing the embeddings with `--embedding-quantize 4,32` as shown above to further reduce the model size. @@ -205,7 +219,7 @@ If you want to deploy and run a smaller model for educational purposes. From `ex ``` 3. Export model and generate `.pte` file. ``` - python -m examples.models.llama2.export_llama -c stories110M.pt -p params.json -X -kv + python -m examples.models.llama2.export_llama --model llama2 --checkpoint stories110M.pt --params params.json -X -kv ``` ### Option D: Download and export Llama 2 7B model @@ -218,7 +232,7 @@ You can export and run the original Llama 2 7B model. 3. Export model and generate `.pte` file: ``` - python -m examples.models.llama2.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 + python -m examples.models.llama2.export_llama --model llama2 --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 ``` 4. Create tokenizer.bin. ``` @@ -432,9 +446,9 @@ Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-de Currently we supported lowering the stories model to other backends, including, CoreML, MPS and QNN. Please refer to the instruction for each backend ([CoreML](https://pytorch.org/executorch/main/build-run-coreml.html), [MPS](https://pytorch.org/executorch/main/build-run-mps.html), [QNN](https://pytorch.org/executorch/main/build-run-qualcomm-ai-engine-direct-backend.html)) before trying to lower them. After the backend library is installed, the script to export a lowered model is -- Lower to CoreML: `python -m examples.models.llama2.export_llama -kv --disable_dynamic_shape --coreml -c stories110M.pt -p params.json ` -- MPS: `python -m examples.models.llama2.export_llama -kv --disable_dynamic_shape --mps -c stories110M.pt -p params.json ` -- QNN: `python -m examples.models.llama2.export_llama -kv --disable_dynamic_shape --qnn -c stories110M.pt -p params.json ` +- Lower to CoreML: `python -m examples.models.llama2.export_llama --model llama3 -kv --disable_dynamic_shape --coreml -c stories110M.pt -p params.json ` +- MPS: `python -m examples.models.llama2.export_llama --model llama3 -kv --disable_dynamic_shape --mps -c stories110M.pt -p params.json ` +- QNN: `python -m examples.models.llama2.export_llama --model llama3 -kv --disable_dynamic_shape --qnn -c stories110M.pt -p params.json ` The iOS LLAMA app supports the CoreML and MPS model and the Android LLAMA app supports the QNN model. On Android, it also allow to cross compiler the llama runner binary, push to the device and run. 
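As a rough illustration of how the new `--model` flag flows into the export entry point, here is a minimal sketch (assumptions: the module path and the `build_args_parser()` / `export_llama(args)` signatures are taken from the diffs in this series; the checkpoint and params paths are placeholders):

```python
# Sketch only: drive the export programmatically using the parser and the
# simplified export_llama(args) signature introduced in this patch series.
from executorch.examples.models.llama2.export_llama_lib import (
    build_args_parser,
    export_llama,
)

parser = build_args_parser()
args = parser.parse_args(
    [
        "--model", "llama3",                            # new model-type flag
        "--checkpoint", "path/to/consolidated.00.pth",  # placeholder path
        "--params", "path/to/params.json",              # placeholder path
        "-kv",                                          # enable KV cache
        "-X",                                           # lower to XNNPACK
    ]
)
pte_filename = export_llama(args)  # returns the saved .pte file name
print(pte_filename)
```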
diff --git a/examples/models/llama2/export_llama.py b/examples/models/llama2/export_llama.py index 3d0d1b7bcfb..5f382bf50cf 100644 --- a/examples/models/llama2/export_llama.py +++ b/examples/models/llama2/export_llama.py @@ -20,10 +20,9 @@ def main() -> None: seed = 42 torch.manual_seed(seed) - modelname = "llama2" parser = build_args_parser() args = parser.parse_args() - export_llama(modelname, args) + export_llama(args) if __name__ == "__main__": diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index 0d292b11e7b..5735b4cc30c 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -78,6 +78,10 @@ verbosity_setting = None +EXECUTORCH_DEFINED_MODELS = ["llama2", "llama3", "llama3_1", "llama3_2"] +TORCHTUNE_DEFINED_MODELS = ["llama3_2_vision"] + + class WeightType(Enum): LLAMA = "LLAMA" FAIRSEQ2 = "FAIRSEQ2" @@ -113,11 +117,11 @@ def build_model( else: output_dir_path = "." - argString = f"--checkpoint par:{modelname}_ckpt.pt --params par:{modelname}_params.json {extra_opts} --output-dir {output_dir_path}" + argString = f"--model {modelname} --checkpoint par:{modelname}_ckpt.pt --params par:{modelname}_params.json {extra_opts} --output-dir {output_dir_path}" parser = build_args_parser() args = parser.parse_args(shlex.split(argString)) # pkg_name = resource_pkg_name - return export_llama(modelname, args) + return export_llama(args) def build_args_parser() -> argparse.ArgumentParser: @@ -127,6 +131,12 @@ def build_args_parser() -> argparse.ArgumentParser: # parser.add_argument( # "-q", "--quantized_ckpt", default=None, help="quantized checkpoint file" # ) + parser.add_argument( + "--model", + default="llama2", + choices=EXECUTORCH_DEFINED_MODELS + TORCHTUNE_DEFINED_MODELS, + help="The Lllama model to export. llama2, llama3, llama3_1, llama3_2 share the same architecture, so they are technically interchangeable, given you provide the checkpoint file for the desired version.", + ) parser.add_argument( "-E", "--embedding-quantize", @@ -458,13 +468,13 @@ def canonical_path(path: Union[str, Path], *, dir: bool = False) -> str: return return_val -def export_llama(modelname, args) -> str: +def export_llama(args) -> str: if args.profile_path is not None: try: from executorch.util.python_profiler import CProfilerFlameGraph with CProfilerFlameGraph(args.profile_path): - builder = _export_llama(modelname, args) + builder = _export_llama(args) assert ( filename := builder.get_saved_pte_filename() ) is not None, "Fail to get file name from builder" @@ -475,14 +485,14 @@ def export_llama(modelname, args) -> str: ) return "" else: - builder = _export_llama(modelname, args) + builder = _export_llama(args) assert ( filename := builder.get_saved_pte_filename() ) is not None, "Fail to get file name from builder" return filename -def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager: +def _prepare_for_llama_export(args) -> LLMEdgeManager: """ Helper function for export_llama. Loads the model from checkpoint and params, and sets up a LLMEdgeManager with initial transforms and dtype conversion. 
@@ -508,7 +518,7 @@ def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager: return ( _load_llama_model( - modelname=modelname, + args.model, checkpoint=checkpoint_path, checkpoint_dir=checkpoint_dir, params_path=params_path, @@ -530,7 +540,7 @@ def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager: args=args, ) .set_output_dir(output_dir_path) - .source_transform(_get_source_transforms(modelname, dtype_override, args)) + .source_transform(_get_source_transforms(args.model, dtype_override, args)) ) @@ -574,13 +584,13 @@ def _validate_args(args): raise ValueError("Model shard is only supported with qnn backend now.") -def _export_llama(modelname, args) -> LLMEdgeManager: # noqa: C901 +def _export_llama(args) -> LLMEdgeManager: # noqa: C901 _validate_args(args) pt2e_quant_params, quantizers, quant_dtype = get_quantizer_and_quant_params(args) # export_to_edge builder_exported_to_edge = ( - _prepare_for_llama_export(modelname, args) + _prepare_for_llama_export(args) .capture_pre_autograd_graph() .pt2e_quantize(quantizers) .export_to_edge() @@ -748,8 +758,8 @@ def _load_llama_model_metadata( def _load_llama_model( + modelname: str, *, - modelname: str = "llama2", checkpoint: Optional[str] = None, checkpoint_dir: Optional[str] = None, params_path: str, @@ -776,26 +786,41 @@ def _load_llama_model( Returns: An instance of LLMEdgeManager which contains the eager mode model. """ + assert ( checkpoint or checkpoint_dir ) and params_path, "Both checkpoint/checkpoint_dir and params can't be empty" logging.info( f"Loading model with checkpoint={checkpoint}, params={params_path}, use_kv_cache={use_kv_cache}, weight_type={weight_type}" ) - model, example_inputs, example_kwarg_inputs, _ = EagerModelFactory.create_model( - "llama2", - "Llama2Model", - checkpoint=checkpoint, - checkpoint_dir=checkpoint_dir, - params=params_path, - use_kv_cache=use_kv_cache, - use_sdpa_with_kv_cache=use_sdpa_with_kv_cache, - generate_full_logits=generate_full_logits, - fairseq2=weight_type == WeightType.FAIRSEQ2, - max_seq_len=max_seq_len, - enable_dynamic_shape=enable_dynamic_shape, - output_prune_map_path=output_prune_map_path, - args=args, + + if modelname in EXECUTORCH_DEFINED_MODELS: + # Set to llama2 because all models in EXECUTORCH_DEFINED_MODELS share the same archteciture as + # defined in example/models/llama2. 
+ modelname = "llama2" + model_class_name = "Llama2Model" + elif modelname in TORCHTUNE_DEFINED_MODELS: + if modelname == "llama3_2_vision": + model_class_name = "Llama3_2Decoder" + else: + raise ValueError(f"{modelname} is not a valid Llama model.") + + model, example_inputs, example_kwarg_inputs, _ = ( + EagerModelFactory.create_model( + modelname, + model_class_name, + checkpoint=checkpoint, + checkpoint_dir=checkpoint_dir, + params=params_path, + use_kv_cache=use_kv_cache, + use_sdpa_with_kv_cache=use_sdpa_with_kv_cache, + generate_full_logits=generate_full_logits, + fairseq2=weight_type == WeightType.FAIRSEQ2, + max_seq_len=max_seq_len, + enable_dynamic_shape=enable_dynamic_shape, + output_prune_map_path=output_prune_map_path, + args=args, + ) ) if dtype_override: assert isinstance( From 328c72c8cf7554184cee927178acfb33bf43338d Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Fri, 4 Oct 2024 20:49:42 -0700 Subject: [PATCH 07/44] Remove future implementation --- examples/models/llama2/export_llama_lib.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index 5735b4cc30c..49bf5d8da83 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -79,7 +79,7 @@ EXECUTORCH_DEFINED_MODELS = ["llama2", "llama3", "llama3_1", "llama3_2"] -TORCHTUNE_DEFINED_MODELS = ["llama3_2_vision"] +TORCHTUNE_DEFINED_MODELS = [] class WeightType(Enum): @@ -800,8 +800,7 @@ def _load_llama_model( modelname = "llama2" model_class_name = "Llama2Model" elif modelname in TORCHTUNE_DEFINED_MODELS: - if modelname == "llama3_2_vision": - model_class_name = "Llama3_2Decoder" + raise NotImplementedError("Torchtune Llama models are not yet supported in ExecuTorch export.") else: raise ValueError(f"{modelname} is not a valid Llama model.") From ec80bba43acd0553062d4485a3897acd2b156b45 Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Tue, 15 Oct 2024 14:59:41 -0700 Subject: [PATCH 08/44] Lint --- examples/models/llama2/export_llama_lib.py | 34 +++++++++++----------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index 49bf5d8da83..40378f0b73a 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -800,26 +800,26 @@ def _load_llama_model( modelname = "llama2" model_class_name = "Llama2Model" elif modelname in TORCHTUNE_DEFINED_MODELS: - raise NotImplementedError("Torchtune Llama models are not yet supported in ExecuTorch export.") + raise NotImplementedError( + "Torchtune Llama models are not yet supported in ExecuTorch export." 
+ ) else: raise ValueError(f"{modelname} is not a valid Llama model.") - model, example_inputs, example_kwarg_inputs, _ = ( - EagerModelFactory.create_model( - modelname, - model_class_name, - checkpoint=checkpoint, - checkpoint_dir=checkpoint_dir, - params=params_path, - use_kv_cache=use_kv_cache, - use_sdpa_with_kv_cache=use_sdpa_with_kv_cache, - generate_full_logits=generate_full_logits, - fairseq2=weight_type == WeightType.FAIRSEQ2, - max_seq_len=max_seq_len, - enable_dynamic_shape=enable_dynamic_shape, - output_prune_map_path=output_prune_map_path, - args=args, - ) + model, example_inputs, example_kwarg_inputs, _ = EagerModelFactory.create_model( + modelname, + model_class_name, + checkpoint=checkpoint, + checkpoint_dir=checkpoint_dir, + params=params_path, + use_kv_cache=use_kv_cache, + use_sdpa_with_kv_cache=use_sdpa_with_kv_cache, + generate_full_logits=generate_full_logits, + fairseq2=weight_type == WeightType.FAIRSEQ2, + max_seq_len=max_seq_len, + enable_dynamic_shape=enable_dynamic_shape, + output_prune_map_path=output_prune_map_path, + args=args, ) if dtype_override: assert isinstance( From c9bbe12e761d469fe2df6ff4dfd91b816abc98ad Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Mon, 7 Oct 2024 15:41:23 -0700 Subject: [PATCH 09/44] Create create new method for example kwarg inputs instead --- examples/models/llama2/export_llama_lib.py | 30 ++++++++++++---------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index 40378f0b73a..f9477097060 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -806,20 +806,22 @@ def _load_llama_model( else: raise ValueError(f"{modelname} is not a valid Llama model.") - model, example_inputs, example_kwarg_inputs, _ = EagerModelFactory.create_model( - modelname, - model_class_name, - checkpoint=checkpoint, - checkpoint_dir=checkpoint_dir, - params=params_path, - use_kv_cache=use_kv_cache, - use_sdpa_with_kv_cache=use_sdpa_with_kv_cache, - generate_full_logits=generate_full_logits, - fairseq2=weight_type == WeightType.FAIRSEQ2, - max_seq_len=max_seq_len, - enable_dynamic_shape=enable_dynamic_shape, - output_prune_map_path=output_prune_map_path, - args=args, + model, example_inputs, example_kwarg_inputs, _ = ( + EagerModelFactory.create_model( + modelname, + model_class_name, + checkpoint=checkpoint, + checkpoint_dir=checkpoint_dir, + params=params_path, + use_kv_cache=use_kv_cache, + use_sdpa_with_kv_cache=use_sdpa_with_kv_cache, + generate_full_logits=generate_full_logits, + fairseq2=weight_type == WeightType.FAIRSEQ2, + max_seq_len=max_seq_len, + enable_dynamic_shape=enable_dynamic_shape , + output_prune_map_path=output_prune_map_path, + args=args, + ) ) if dtype_override: assert isinstance( From 99d5bfb728ef18105bab2577ff7da2c263f73db5 Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Fri, 4 Oct 2024 20:37:08 -0700 Subject: [PATCH 10/44] Accept model type parameter in export_llama --- examples/models/llama2/export_llama_lib.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index f9477097060..995eefd925a 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -79,7 +79,7 @@ EXECUTORCH_DEFINED_MODELS = ["llama2", "llama3", "llama3_1", "llama3_2"] -TORCHTUNE_DEFINED_MODELS = [] +TORCHTUNE_DEFINED_MODELS = ["llama3_2_vision"] class 
WeightType(Enum): @@ -800,13 +800,12 @@ def _load_llama_model( modelname = "llama2" model_class_name = "Llama2Model" elif modelname in TORCHTUNE_DEFINED_MODELS: - raise NotImplementedError( - "Torchtune Llama models are not yet supported in ExecuTorch export." - ) + if modelname == "llama3_2_vision": + model_class_name = "Llama3_2Decoder" else: raise ValueError(f"{modelname} is not a valid Llama model.") - model, example_inputs, example_kwarg_inputs, _ = ( + model, example_inputs, example_kwarg_inputs, dynamic_shapes = ( EagerModelFactory.create_model( modelname, model_class_name, From 1fb2236f6087cb2250ae6c172af4403eea65676e Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Fri, 4 Oct 2024 20:39:49 -0700 Subject: [PATCH 11/44] Torchtune llama3_2_vision model in ET, no quantization --- examples/models/llama2/export_llama_lib.py | 19 ++- examples/models/llama3_2_vision/__init__.py | 9 ++ examples/models/llama3_2_vision/model.py | 146 ++++++++++++++++++ .../llama3_2_vision/params/demo_config.json | 18 +++ 4 files changed, 184 insertions(+), 8 deletions(-) create mode 100644 examples/models/llama3_2_vision/model.py create mode 100644 examples/models/llama3_2_vision/params/demo_config.json diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index 995eefd925a..a34362e32a8 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -24,8 +24,6 @@ from executorch.devtools.etrecord import generate_etrecord -from executorch.examples.models.llama2.llama_transformer import ModelArgs - from executorch.extension.llm.export.builder import DType, LLMEdgeManager from executorch.extension.llm.export.partitioner_lib import ( @@ -733,16 +731,18 @@ def _load_llama_model_metadata( use_kv_cache: bool, use_sdpa_with_kv_cache: bool, enable_dynamic_shape: bool, - model_args: ModelArgs, + max_seq_len: int, + n_layers: int, + vocab_size: int, metadata_str: Optional[str] = None, ): is_fairseq2 = weight_type == WeightType.FAIRSEQ2 metadata = { "get_bos_id": 3 if is_fairseq2 else 1, "get_eos_ids": [3] if is_fairseq2 else [2], - "get_max_seq_len": model_args.max_seq_len, - "get_n_layers": model_args.n_layers, - "get_vocab_size": model_args.vocab_size, + "get_max_seq_len": max_seq_len, + "get_n_layers": n_layers, + "get_vocab_size": vocab_size, "use_kv_cache": use_kv_cache, "use_sdpa_with_kv_cache": use_sdpa_with_kv_cache, "enable_dynamic_shape": enable_dynamic_shape, @@ -852,12 +852,13 @@ def _load_llama_model( return LLMEdgeManager( model=model, modelname=modelname, - max_seq_len=model.params.max_seq_len, + max_seq_len=model.max_seq_len, dtype=dtype, use_kv_cache=use_kv_cache, generate_full_logits=generate_full_logits, example_inputs=example_inputs, example_kwarg_inputs=example_kwarg_inputs, + dynamic_shapes=dynamic_shapes, enable_dynamic_shape=enable_dynamic_shape, calibration_tasks=calibration_tasks, calibration_limit=calibration_limit, @@ -870,7 +871,9 @@ def _load_llama_model( use_kv_cache, use_sdpa_with_kv_cache, enable_dynamic_shape, - model.params, + model.max_seq_len, + model.n_layers, + model.vocab_size, metadata_str, ), args=args, diff --git a/examples/models/llama3_2_vision/__init__.py b/examples/models/llama3_2_vision/__init__.py index e69de29bb2d..3c385703d72 100644 --- a/examples/models/llama3_2_vision/__init__.py +++ b/examples/models/llama3_2_vision/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from .model import Llama3_2Decoder + +__all__ = [Llama3_2Decoder] diff --git a/examples/models/llama3_2_vision/model.py b/examples/models/llama3_2_vision/model.py new file mode 100644 index 00000000000..6809c4c346a --- /dev/null +++ b/examples/models/llama3_2_vision/model.py @@ -0,0 +1,146 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +import json +from typing import Any, Dict, Tuple + +import torch + +from executorch.examples.models.model_base import EagerModelBase +from torchtune.models.llama3_2_vision._convert_weights import llama3_vision_meta_to_tune +from torchtune.models.llama3_2_vision._component_builders import llama3_2_vision_decoder +from executorch.examples.models.checkpoint import ( + get_default_model_resource_dir, + get_checkpoint_dtype, +) + + +def to_decoder_checkpoint(checkpoint: Dict[str, Any]) -> Dict[str, Any]: + """ + Extracts and formats the decoder-related weights from the checkpoint. The checkpoint contains + weight names prefixed with "encoder"/"decoder", such as "encoder.layer.etc" or "decoder.norm.scale". + To load the text decoder on its own, the "decoder" prefix needs to be removed. + """ + return {".".join(weight.split(".")[1:]): value for weight, value in checkpoint.items() if weight.startswith("decoder")} + +class Llama3_2Decoder(EagerModelBase): + """ + Just the text decoder portions of the Llama3.2 multimodal model. + """ + + def __init__(self, **kwargs): + # Set member vars from kwargs. + self.max_seq_len = kwargs.get("max_seq_len", 8192) + self.encoder_max_seq_len = kwargs.get("encoder_max_seq_len", int(4 * (448 / 14) ** 2 + 1)) + self.generate_full_logits = kwargs.get("generate_full_logits", False) + self.enable_dynamic_shape = kwargs.get("enable_dynamic_shape", False) + self.output_prune_map_path = kwargs.get("output_prune_map_path", None) + # TODO: enable kv cache with TransformerDecoder's setup_cache(). + self.use_kv_cache = kwargs.get("use_kv_cache", False) + self.use_sdpa_with_kv_cache = kwargs.get("use_sdpa_with_kv_cache", False) + self.verbose = kwargs.get("verbose", False) + self.args = kwargs.get("args", None) + + + ckpt_dir = get_default_model_resource_dir() + # Single checkpoint file. + checkpoint_path = kwargs.get("checkpoint", ckpt_dir / "demo_rand_params.pth") + # Sharded checkpoint. + checkpoint_dir = kwargs.get("checkpoint_dir", None) + params_path = kwargs.get("params", ckpt_dir / "demo_config.json") + + # Load checkpoint and params. + device = "cpu" + if checkpoint_dir is not None: + raise NotImplementedError("Sharded checkpoint not yet supported for Llama3_2Decoder.") + else: + checkpoint = torch.load(checkpoint_path, map_location=device, mmap=True) + checkpoint = llama3_vision_meta_to_tune(checkpoint) + checkpoint = to_decoder_checkpoint(checkpoint) + with open(params_path, "r") as f: + params = json.loads(f.read()) + + # Find dtype from checkpoint. (skip for now) + self.dtype = get_checkpoint_dtype(checkpoint) + + # Load model. + # Cannot use "with torch.device("meta"):" because it causes some exceptions during export, + # i.e. the model isn't fully initialized or something. 
+ self.model_ = llama3_2_vision_decoder( + vocab_size=params["vocab_size"], + num_layers=params["n_layers"], + fusion_interval=params["fusion_interval"], + num_special_tokens=params["n_special_tokens"], + num_heads=params["n_heads"], + num_kv_heads=params["n_kv_heads"], + embed_dim=params["dim"], + max_seq_len=self.max_seq_len, + encoder_max_seq_len=self.encoder_max_seq_len, + rope_base=params["rope_theta"], + intermediate_dim=params["intermediate_dim"], + ) + # Save params for future use. + for param_name, param_val in params.items(): + setattr(self.model_, param_name, param_val) + + # Quantize. (skip for now) + + # Load checkpoint. + missing, unexpected = self.model_.load_state_dict( + checkpoint, + strict=False, + assign=True, + ) + if kwargs.get("verbose", False): + print("============= missing keys ================") + print(missing) + print("============= /missing ================") + print("============= unexpected keys ================") + print(unexpected) + print("============= /unexpected ================") + + # Prune the output layer if output_prune_map is provided. + output_prune_map = None + if self.output_prune_map_path is not None: + from executorch.examples.models.llama2.source_transformation.prune_output import prune_output_vocab + + with open(self.output_prune_map_path, "r") as f: + output_prune_map = json.load(f) + # Change keys from string to int (json only supports string keys) + output_prune_map = {int(k): v for (k, v) in output_prune_map.items()} + + self.model_ = prune_output_vocab(self.model_, output_prune_map) + + def get_eager_model(self) -> torch.nn.Module: + if self.dtype: + return self.model_.to(self.dtype) + else: + return self.model_.to(torch.float16) + + def get_example_inputs(self) -> Tuple[Tuple, Dict]: + return ( + (torch.ones(1, 64, dtype=torch.long),), # positional inputs + { + # "mask": None, + # "encoder_input": None, + # "encoder_mask": None, + # "input_pos": torch.ones(64, dtype=torch.long), + } # kwarg inputs + ) + + def get_dynamic_shapes(self): + dim = torch.export.Dim("token_dim", min=1,max=self.max_seq_len) + dynamic_shapes = { + "tokens": {0: 1, 1: dim}, + # "encoder_input": {0:1, 1:dim_enc, 2:4096}, + # "encoder_mask": {0:1, 1:dim, 2:dim_enc}, + # "mask": None, + # "input_pos" : {0: dim}, + } + return dynamic_shapes + diff --git a/examples/models/llama3_2_vision/params/demo_config.json b/examples/models/llama3_2_vision/params/demo_config.json new file mode 100644 index 00000000000..625524ad4c8 --- /dev/null +++ b/examples/models/llama3_2_vision/params/demo_config.json @@ -0,0 +1,18 @@ +{ + "dim": 4096, + "ffn_dim_multiplier": 1.3, + "fusion_interval": 4, + "intermediate_dim": 14336, + "multiple_of": 1024, + "n_heads": 32, + "n_kv_heads": 8, + "n_layers": 32, + "n_special_tokens": 8, + "norm_eps": 1e-05, + "rope_theta": 500000.0, + "use_scaled_rope": true, + "vision_chunk_size": 560, + "vision_max_num_chunks": 4, + "vocab_size": 128256, + "vision_num_cross_attention_layers": 8 +} \ No newline at end of file From e0c4b8ab261bf0fd6075053998aa05a83b124c0c Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Tue, 8 Oct 2024 13:02:09 -0700 Subject: [PATCH 12/44] Fix vision model example input --- examples/models/llama3_2_vision/model.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/examples/models/llama3_2_vision/model.py b/examples/models/llama3_2_vision/model.py index 6809c4c346a..e02916b838f 100644 --- a/examples/models/llama3_2_vision/model.py +++ b/examples/models/llama3_2_vision/model.py @@ -7,7 
+7,7 @@ # pyre-unsafe import json -from typing import Any, Dict, Tuple +from typing import Any, Dict import torch @@ -47,7 +47,7 @@ def __init__(self, **kwargs): self.args = kwargs.get("args", None) - ckpt_dir = get_default_model_resource_dir() + ckpt_dir = get_default_model_resource_dir(__file__) # Single checkpoint file. checkpoint_path = kwargs.get("checkpoint", ckpt_dir / "demo_rand_params.pth") # Sharded checkpoint. @@ -122,17 +122,20 @@ def get_eager_model(self) -> torch.nn.Module: else: return self.model_.to(torch.float16) - def get_example_inputs(self) -> Tuple[Tuple, Dict]: + def get_example_inputs(self): return ( - (torch.ones(1, 64, dtype=torch.long),), # positional inputs - { - # "mask": None, - # "encoder_input": None, - # "encoder_mask": None, - # "input_pos": torch.ones(64, dtype=torch.long), - } # kwarg inputs + torch.ones(1, 64, dtype=torch.long), # positional inputs ) + def get_example_kwarg_inputs(self): + # TODO: add input_pos and mask when after making cache work. + return { + # "mask": None, + # "encoder_input": None, + # "encoder_mask": None, + # "input_pos": torch.ones(64, dtype=torch.long), + } + def get_dynamic_shapes(self): dim = torch.export.Dim("token_dim", min=1,max=self.max_seq_len) dynamic_shapes = { From e145bd106538f8457af7833aba96d40583739b11 Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Tue, 22 Oct 2024 02:40:54 -0700 Subject: [PATCH 13/44] Lint --- examples/models/llama2/export_llama_lib.py | 2 +- examples/models/llama3_2_vision/model.py | 37 +++++++++++++--------- 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index a34362e32a8..29518b641f4 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -817,7 +817,7 @@ def _load_llama_model( generate_full_logits=generate_full_logits, fairseq2=weight_type == WeightType.FAIRSEQ2, max_seq_len=max_seq_len, - enable_dynamic_shape=enable_dynamic_shape , + enable_dynamic_shape=enable_dynamic_shape, output_prune_map_path=output_prune_map_path, args=args, ) diff --git a/examples/models/llama3_2_vision/model.py b/examples/models/llama3_2_vision/model.py index e02916b838f..5e1dfea18d9 100644 --- a/examples/models/llama3_2_vision/model.py +++ b/examples/models/llama3_2_vision/model.py @@ -10,15 +10,15 @@ from typing import Any, Dict import torch - -from executorch.examples.models.model_base import EagerModelBase -from torchtune.models.llama3_2_vision._convert_weights import llama3_vision_meta_to_tune -from torchtune.models.llama3_2_vision._component_builders import llama3_2_vision_decoder from executorch.examples.models.checkpoint import ( - get_default_model_resource_dir, get_checkpoint_dtype, + get_default_model_resource_dir, ) +from executorch.examples.models.model_base import EagerModelBase +from torchtune.models.llama3_2_vision._component_builders import llama3_2_vision_decoder +from torchtune.models.llama3_2_vision._convert_weights import llama3_vision_meta_to_tune + def to_decoder_checkpoint(checkpoint: Dict[str, Any]) -> Dict[str, Any]: """ @@ -26,7 +26,12 @@ def to_decoder_checkpoint(checkpoint: Dict[str, Any]) -> Dict[str, Any]: weight names prefixed with "encoder"/"decoder", such as "encoder.layer.etc" or "decoder.norm.scale". To load the text decoder on its own, the "decoder" prefix needs to be removed. 
""" - return {".".join(weight.split(".")[1:]): value for weight, value in checkpoint.items() if weight.startswith("decoder")} + return { + ".".join(weight.split(".")[1:]): value + for weight, value in checkpoint.items() + if weight.startswith("decoder") + } + class Llama3_2Decoder(EagerModelBase): """ @@ -36,7 +41,9 @@ class Llama3_2Decoder(EagerModelBase): def __init__(self, **kwargs): # Set member vars from kwargs. self.max_seq_len = kwargs.get("max_seq_len", 8192) - self.encoder_max_seq_len = kwargs.get("encoder_max_seq_len", int(4 * (448 / 14) ** 2 + 1)) + self.encoder_max_seq_len = kwargs.get( + "encoder_max_seq_len", int(4 * (448 / 14) ** 2 + 1) + ) self.generate_full_logits = kwargs.get("generate_full_logits", False) self.enable_dynamic_shape = kwargs.get("enable_dynamic_shape", False) self.output_prune_map_path = kwargs.get("output_prune_map_path", None) @@ -46,7 +53,6 @@ def __init__(self, **kwargs): self.verbose = kwargs.get("verbose", False) self.args = kwargs.get("args", None) - ckpt_dir = get_default_model_resource_dir(__file__) # Single checkpoint file. checkpoint_path = kwargs.get("checkpoint", ckpt_dir / "demo_rand_params.pth") @@ -57,7 +63,9 @@ def __init__(self, **kwargs): # Load checkpoint and params. device = "cpu" if checkpoint_dir is not None: - raise NotImplementedError("Sharded checkpoint not yet supported for Llama3_2Decoder.") + raise NotImplementedError( + "Sharded checkpoint not yet supported for Llama3_2Decoder." + ) else: checkpoint = torch.load(checkpoint_path, map_location=device, mmap=True) checkpoint = llama3_vision_meta_to_tune(checkpoint) @@ -107,7 +115,9 @@ def __init__(self, **kwargs): # Prune the output layer if output_prune_map is provided. output_prune_map = None if self.output_prune_map_path is not None: - from executorch.examples.models.llama2.source_transformation.prune_output import prune_output_vocab + from executorch.examples.models.llama2.source_transformation.prune_output import ( + prune_output_vocab, + ) with open(self.output_prune_map_path, "r") as f: output_prune_map = json.load(f) @@ -123,9 +133,7 @@ def get_eager_model(self) -> torch.nn.Module: return self.model_.to(torch.float16) def get_example_inputs(self): - return ( - torch.ones(1, 64, dtype=torch.long), # positional inputs - ) + return (torch.ones(1, 64, dtype=torch.long),) # positional inputs def get_example_kwarg_inputs(self): # TODO: add input_pos and mask when after making cache work. @@ -137,7 +145,7 @@ def get_example_kwarg_inputs(self): } def get_dynamic_shapes(self): - dim = torch.export.Dim("token_dim", min=1,max=self.max_seq_len) + dim = torch.export.Dim("token_dim", min=1, max=self.max_seq_len) dynamic_shapes = { "tokens": {0: 1, 1: dim}, # "encoder_input": {0:1, 1:dim_enc, 2:4096}, @@ -146,4 +154,3 @@ def get_dynamic_shapes(self): # "input_pos" : {0: dim}, } return dynamic_shapes - From ed906cbb7f048b09cd602029dea8b72b0bbb8dbb Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Fri, 25 Oct 2024 13:28:46 -0700 Subject: [PATCH 14/44] Kv cache --- examples/models/llama3_2_vision/model.py | 26 ++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/examples/models/llama3_2_vision/model.py b/examples/models/llama3_2_vision/model.py index 5e1dfea18d9..1a51ffeaf08 100644 --- a/examples/models/llama3_2_vision/model.py +++ b/examples/models/llama3_2_vision/model.py @@ -40,16 +40,14 @@ class Llama3_2Decoder(EagerModelBase): def __init__(self, **kwargs): # Set member vars from kwargs. 
- self.max_seq_len = kwargs.get("max_seq_len", 8192) + self.max_seq_len = kwargs.get("max_seq_len", 8192) # Trained to be a lot larger, but this value is kept small because of static kv cache at the moment. self.encoder_max_seq_len = kwargs.get( "encoder_max_seq_len", int(4 * (448 / 14) ** 2 + 1) - ) + ) # Same as above. self.generate_full_logits = kwargs.get("generate_full_logits", False) self.enable_dynamic_shape = kwargs.get("enable_dynamic_shape", False) self.output_prune_map_path = kwargs.get("output_prune_map_path", None) - # TODO: enable kv cache with TransformerDecoder's setup_cache(). self.use_kv_cache = kwargs.get("use_kv_cache", False) - self.use_sdpa_with_kv_cache = kwargs.get("use_sdpa_with_kv_cache", False) self.verbose = kwargs.get("verbose", False) self.args = kwargs.get("args", None) @@ -60,6 +58,14 @@ def __init__(self, **kwargs): checkpoint_dir = kwargs.get("checkpoint_dir", None) params_path = kwargs.get("params", ckpt_dir / "demo_config.json") + self.causal_mask = torch.tril( + torch.ones( + size=(self.max_seq_len, self.max_seq_len), + dtype=torch.bool, + ) + ) + self.input_pos = torch.arange(self.max_seq_len) + # Load checkpoint and params. device = "cpu" if checkpoint_dir is not None: @@ -126,6 +132,13 @@ def __init__(self, **kwargs): self.model_ = prune_output_vocab(self.model_, output_prune_map) + # if self.use_kv_cache: + # print("Setting up KV cache on the model...") + # self.model_.setup_caches( + # batch_size=1, + # dtype=self.dtype, + # ) + def get_eager_model(self) -> torch.nn.Module: if self.dtype: return self.model_.to(self.dtype) @@ -133,15 +146,16 @@ def get_eager_model(self) -> torch.nn.Module: return self.model_.to(torch.float16) def get_example_inputs(self): - return (torch.ones(1, 64, dtype=torch.long),) # positional inputs + return (torch.ones(1, 64, dtype=torch.long),) def get_example_kwarg_inputs(self): # TODO: add input_pos and mask when after making cache work. return { - # "mask": None, + # "mask": self.causal_mask[None, 64, None, :], # "encoder_input": None, # "encoder_mask": None, # "input_pos": torch.ones(64, dtype=torch.long), + # input_pos: self.input_pos[None, 64] } def get_dynamic_shapes(self): From 1825972af1fa6fc49e8ffe38e06cb26112d3667b Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Fri, 25 Oct 2024 15:27:08 -0700 Subject: [PATCH 15/44] Update READMEs --- backends/vulkan/docs/android_demo.md | 1 + ...llama3-qualcomm-ai-engine-direct-backend.md | 2 +- .../docs/delegates/qualcomm_README.md | 6 +++--- .../LlamaDemo/docs/delegates/xnnpack_README.md | 10 +++++----- .../LLaMA/docs/delegates/mps_README.md | 4 ++-- .../LLaMA/docs/delegates/xnnpack_README.md | 8 ++++---- examples/models/llama/README.md | 18 ++++++++++++++++-- examples/models/llama/UTILS.md | 8 ++++---- examples/models/llama2/README.md | 2 +- 9 files changed, 37 insertions(+), 22 deletions(-) diff --git a/backends/vulkan/docs/android_demo.md b/backends/vulkan/docs/android_demo.md index 2a4faacc0c8..1314a6503aa 100644 --- a/backends/vulkan/docs/android_demo.md +++ b/backends/vulkan/docs/android_demo.md @@ -58,6 +58,7 @@ partially lower the Llama model to Vulkan. 
```shell # The files will usually be downloaded to ~/.llama python -m examples.models.llama.export_llama \ + --model llama3_2 --disable_dynamic_shape --vulkan -kv --use_sdpa_with_kv_cache -d fp32 \ -c ~/.llama/checkpoints/Llama3.2-1B/consolidated.00.pth \ -p ~/.llama/checkpoints/Llama3.2-1B/params.json \ diff --git a/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md b/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md index d928377ff28..90dc7dd0ad8 100644 --- a/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md +++ b/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md @@ -39,7 +39,7 @@ To export Llama 3 8B instruct with the Qualcomm AI Engine Direct Backend, ensure ```bash # Please note that calibration_data must include the prompt template for special tokens. -python -m examples.models.llama.export_llama -t +python -m examples.models.llama.export_llama --model llama3 -t llama3/Meta-Llama-3-8B-Instruct/tokenizer.model -p -c --use_kv_cache --qnn --pt2e_quantize qnn_16a4w --disable_dynamic_shape --num_sharding 8 --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --optimized_rotation_path --calibration_data "<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" ``` diff --git a/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md b/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md index 8308da6d840..7d28288bfed 100644 --- a/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md +++ b/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md @@ -101,12 +101,12 @@ We support PTQ by default. The entire export may take ~20 minutes (Llama 3.1 8B) Examples: ``` # 4 bits weight only quantize -python -m examples.models.llama.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" -p "${MODEL_DIR}/params.json" -kv --disable_dynamic_shape --qnn --pt2e_quantize qnn_16a4w -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="test.pte” +python -m examples.models.llama.export_llama --model llama3 --checkpoint "${MODEL_DIR}/consolidated.00.pth" -p "${MODEL_DIR}/params.json" -kv --disable_dynamic_shape --qnn --pt2e_quantize qnn_16a4w -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="test.pte” ``` If the model is really big, it may require model sharding because the Qualcomm DSP is a 32bit system and has a 4GB size limit . For example for Llama 3 8B models, we need to shard the model into 4, but ExecuTorch still packages it into one PTE file. 
Here is an example: ``` # 8 bits quantization with 4 shards -python -m examples.models.llama.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" -p "${MODEL_DIR}/params.json" -kv --disable_dynamic_shape --qnn --pt2e_quantize qnn_8a8w -d fp32 --num_sharding 4 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="test.pte” +python -m examples.models.llama.export_llama --model llama3 --checkpoint "${MODEL_DIR}/consolidated.00.pth" -p "${MODEL_DIR}/params.json" -kv --disable_dynamic_shape --qnn --pt2e_quantize qnn_8a8w -d fp32 --num_sharding 4 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="test.pte” ``` Note: if you encountered issues below ``` @@ -158,7 +158,7 @@ To export Llama 3 8B instruct with the Qualcomm AI Engine Direct Backend, ensure * 8B models might need 16GB RAM on the device to run. ``` # Please note that calibration_data must include the prompt template for special tokens. -python -m examples.models.llama.export_llama -t -p -c --use_kv_cache --qnn --pt2e_quantize qnn_16a4w --disable_dynamic_shape --num_sharding 8 --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --optimized_rotation_path --calibration_data "<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" +python -m examples.models.llama.export_llama --model llama3 -t -p -c --use_kv_cache --qnn --pt2e_quantize qnn_16a4w --disable_dynamic_shape --num_sharding 8 --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --optimized_rotation_path --calibration_data "<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" ``` ## Pushing Model and Tokenizer diff --git a/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md b/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md index 2a6ddbbfe09..4ee52bd1b99 100644 --- a/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md +++ b/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md @@ -56,14 +56,14 @@ In this demo app, we support text-only inference with up-to-date Llama models an Meta has released prequantized INT4 SpinQuant Llama 3.2 models that ExecuTorch supports on the XNNPACK backend. * Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --use_spin_quant native --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_spinquant.pte" +python -m examples.models.llama.export_llama --model llama3_2 --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --use_spin_quant native --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_spinquant.pte" ``` ### For Llama 3.2 1B and 3B QAT+LoRA models Meta has released prequantized INT4 QAT+LoRA Llama 3.2 models that ExecuTorch supports on the XNNPACK backend. 
* Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama.export_llama --checkpoint --params -qat -lora 16 -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_qat_lora.pte" +python -m examples.models.llama.export_llama --model llama3_2 --checkpoint --params -qat -lora 16 -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_qat_lora.pte" ``` ### For Llama 3.2 1B and 3B BF16 models @@ -72,7 +72,7 @@ We have supported BF16 as a data type on the XNNPACK backend for Llama 3.2 1B/3B * Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2_bf16.pte" +python -m examples.models.llama.export_llama --model llama3_2 --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2_bf16.pte" ``` For more detail using Llama 3.2 lightweight models including prompt template, please go to our official [website](https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_2#-llama-3.2-lightweight-models-(1b/3b)-). @@ -87,7 +87,7 @@ To safeguard your application, you can use our Llama Guard models for prompt cla * We prepared this model using the following command ``` -python -m examples.models.llama.export_llama --checkpoint --params -d fp32 -kv --use_sdpa_with_kv_cache --quantization_mode 8da4w --group_size 256 --xnnpack --max_seq_length 8193 --embedding-quantize 4,32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_prune_map --output_name="llama_guard_3_1b_pruned_xnnpack.pte" +python -m examples.models.llama.export_llama --model llama3_2 --checkpoint --params -d fp32 -kv --use_sdpa_with_kv_cache --quantization_mode 8da4w --group_size 256 --xnnpack --max_seq_length 8193 --embedding-quantize 4,32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_prune_map --output_name="llama_guard_3_1b_pruned_xnnpack.pte" ``` @@ -97,7 +97,7 @@ python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama.pte" +python -m examples.models.llama.export_llama --model llama3_2 --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama.pte" ``` You may wonder what the ‘--metadata’ flag is doing. This flag helps export the model with proper special tokens added that the runner can detect EOS tokens easily. 
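For a concrete sense of what the `--metadata` override carries, here is a small sketch (assumption: the JSON string is merged over the defaults assembled by `_load_llama_model_metadata` shown earlier in this series; the exact merge logic is not part of this excerpt):

```python
import json

# Value passed on the command line, e.g.
# --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
metadata_str = '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'

# Defaults mirroring the keys built by _load_llama_model_metadata.
metadata = {
    "get_bos_id": 1,
    "get_eos_ids": [2],
    "get_max_seq_len": 2048,
    "use_kv_cache": True,
}

# Assumed behavior: values from the CLI override the defaults so the runner
# can detect the BOS/EOS tokens of the specific Llama version being exported.
metadata.update(json.loads(metadata_str))
print(metadata["get_eos_ids"])  # [128009, 128001]
```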
diff --git a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md index eb3c244dee7..8aeed59cab9 100644 --- a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md +++ b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md @@ -45,9 +45,9 @@ Install the required packages to export the model sh examples/models/llama/install_requirements.sh ``` -Export the model +Export the model (Llama 3 in this case) ``` -python -m examples.models.llama.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" --params "${MODEL_DIR}/params.json" -kv --use_sdpa_with_kv_cache --mps -d fp32 --disable_dynamic_shape -qmode 8da4w -G 32 +python -m examples.models.llama.export_llama --model llama3 --checkpoint "${MODEL_DIR}/consolidated.00.pth" --params "${MODEL_DIR}/params.json" -kv --use_sdpa_with_kv_cache --mps -d fp32 --disable_dynamic_shape -qmode 8da4w -G 32 ``` ## Pushing Model and Tokenizer diff --git a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md index 201a2934470..63dfd334a10 100644 --- a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md +++ b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md @@ -48,14 +48,14 @@ sh examples/models/llama/install_requirements.sh Meta has released prequantized INT4 SpinQuant Llama 3.2 models that ExecuTorch supports on the XNNPACK backend. * Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --use_spin_quant native --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_spinquant.pte" +python -m examples.models.llama.export_llama --model llama3_2 --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --use_spin_quant native --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_spinquant.pte" ``` ### For Llama 3.2 1B and 3B QAT+LoRA models Meta has released prequantized INT4 QAT+LoRA Llama 3.2 models that ExecuTorch supports on the XNNPACK backend. 
* Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama.export_llama --checkpoint --params -qat -lora 16 -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_qat_lora.pte" +python -m examples.models.llama.export_llama --model llama3_2 --checkpoint --params -qat -lora 16 -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_qat_lora.pte" ``` ### For Llama 3.2 1B and 3B BF16 models @@ -64,7 +64,7 @@ We have supported BF16 as a data type on the XNNPACK backend for Llama 3.2 1B/3B * Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2_bf16.pte" +python -m examples.models.llama.export_llama --model llama3_2 --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2_bf16.pte" ``` For more detail using Llama 3.2 lightweight models including prompt template, please go to our official [website](https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_2#-llama-3.2-lightweight-models-(1b/3b)-). @@ -73,7 +73,7 @@ For more detail using Llama 3.2 lightweight models including prompt template, pl Export the model ``` -python -m examples.models.llama.export_llama --checkpoint -p -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --embedding-quantize 4,32 --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" +python -m examples.models.llama.export_llama --model llama3_2 --checkpoint -p -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --embedding-quantize 4,32 --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" ``` ### For LLaVA model diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md index 1ae6796b575..d06c0c031d2 100644 --- a/examples/models/llama/README.md +++ b/examples/models/llama/README.md @@ -166,6 +166,7 @@ LLAMA_CHECKPOINT=path/to/checkpoint.pth LLAMA_PARAMS=path/to/params.json python -m examples.models.llama.export_llama \ + --model llama3_2 --checkpoint "${LLAMA_CHECKPOINT:?}" \ --params "${LLAMA_PARAMS:?}" \ -kv \ @@ -187,6 +188,7 @@ LLAMA_QUANTIZED_CHECKPOINT=path/to/spinquant/checkpoint.pth LLAMA_PARAMS=path/to/spinquant/params.json python -m examples.models.llama.export_llama \ + --model llama3_2 --checkpoint "${LLAMA_QUANTIZED_CHECKPOINT:?}" \ --params "${LLAMA_PARAMS:?}" \ --use_sdpa_with_kv_cache \ @@ -212,6 +214,7 @@ LLAMA_QUANTIZED_CHECKPOINT=path/to/qlora/checkpoint.pth LLAMA_PARAMS=path/to/qlora/params.json python -m examples.models.llama.export_llama \ + --model llama3_2 --checkpoint "${LLAMA_QUANTIZED_CHECKPOINT:?}" \ --params "${LLAMA_PARAMS:?}" \ -qat \ @@ -237,9 +240,20 @@ You can export and run the original Llama 3 8B instruct model. 2. 
Export model and generate `.pte` file ``` - python -m examples.models.llama.export_llama --checkpoint -p -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --embedding-quantize 4,32 --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" + python -m examples.models.llama.export_llama + --model llama3 + --checkpoint + -p + -kv + --use_sdpa_with_kv_cache + -X + -qmode 8da4w + --group_size 128 + -d fp32 + --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' + --embedding-quantize 4,32 + --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" ``` - Due to the larger vocabulary size of Llama 3, we recommend quantizing the embeddings with `--embedding-quantize 4,32` as shown above to further reduce the model size. ## Step 3: Run on your computer to validate diff --git a/examples/models/llama/UTILS.md b/examples/models/llama/UTILS.md index c2ae26e4835..d26362e3853 100644 --- a/examples/models/llama/UTILS.md +++ b/examples/models/llama/UTILS.md @@ -19,7 +19,7 @@ From `executorch` root: ``` 3. Export model and generate `.pte` file. ``` - python -m examples.models.llama.export_llama -c stories110M.pt -p params.json -X -kv + python -m examples.models.llama.export_llama --model llama3 -c stories110M.pt -p params.json -X -kv ``` ## Smaller model delegated to other backends @@ -27,9 +27,9 @@ From `executorch` root: Currently we supported lowering the stories model to other backends, including, CoreML, MPS and QNN. Please refer to the instruction for each backend ([CoreML](https://pytorch.org/executorch/main/build-run-coreml.html), [MPS](https://pytorch.org/executorch/main/build-run-mps.html), [QNN](https://pytorch.org/executorch/main/build-run-qualcomm-ai-engine-direct-backend.html)) before trying to lower them. After the backend library is installed, the script to export a lowered model is -- Lower to CoreML: `python -m examples.models.llama.export_llama -kv --disable_dynamic_shape --coreml -c stories110M.pt -p params.json ` -- MPS: `python -m examples.models.llama.export_llama -kv --disable_dynamic_shape --mps -c stories110M.pt -p params.json ` -- QNN: `python -m examples.models.llama.export_llama -kv --disable_dynamic_shape --qnn -c stories110M.pt -p params.json ` +- Lower to CoreML: `python -m examples.models.llama.export_llama --model llama3 -kv --disable_dynamic_shape --coreml -c stories110M.pt -p params.json ` +- MPS: `python -m examples.models.llama.export_llama --model llama3 -kv --disable_dynamic_shape --mps -c stories110M.pt -p params.json ` +- QNN: `python -m examples.models.llama.export_llama --model llama3 -kv --disable_dynamic_shape --qnn -c stories110M.pt -p params.json ` The iOS LLAMA app supports the CoreML and MPS model and the Android LLAMA app supports the QNN model. On Android, it also allow to cross compiler the llama runner binary, push to the device and run. diff --git a/examples/models/llama2/README.md b/examples/models/llama2/README.md index f7e308a4321..6e0b3794a74 100644 --- a/examples/models/llama2/README.md +++ b/examples/models/llama2/README.md @@ -37,7 +37,7 @@ You can export and run the original Llama 2 7B model. 3. Export model and generate `.pte` file: ``` - python -m examples.models.llama.export_llama --model llama3_2 --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 + python -m examples.models.llama.export_llama --model llama2 --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 ``` 4. 
Create tokenizer.bin. ``` From 196499af8535bc900e993339a97ce68ceb95a6a6 Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Fri, 25 Oct 2024 15:29:03 -0700 Subject: [PATCH 16/44] Change model default arg --- examples/models/llama/export_llama_lib.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 39cb169be34..12598eac365 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -134,9 +134,9 @@ def build_args_parser() -> argparse.ArgumentParser: # ) parser.add_argument( "--model", - default="llama2", + default="llama3", choices=EXECUTORCH_DEFINED_MODELS + TORCHTUNE_DEFINED_MODELS, - help="The Lllama model to export. llama2, llama3, llama3_1, llama3_2 share the same architecture, so they are technically interchangeable, given you provide the checkpoint file for the desired version.", + help="The Lllama model to export. llama2, llama3, llama3_1, llama3_2 share the same architecture, so they are technically interchangeable given you provide the checkpoint file for the desired version.", ) parser.add_argument( "-E", From 96ba40b01a6003512ba3427a49075d2995bafa14 Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Fri, 25 Oct 2024 15:32:39 -0700 Subject: [PATCH 17/44] Update eager runner and eval llama --- examples/models/llama/eval_llama_lib.py | 2 +- examples/models/llama/runner/eager.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/models/llama/eval_llama_lib.py b/examples/models/llama/eval_llama_lib.py index 285d2f874df..d9591d4ed1e 100644 --- a/examples/models/llama/eval_llama_lib.py +++ b/examples/models/llama/eval_llama_lib.py @@ -191,7 +191,7 @@ def gen_eval_wrapper( pt2e_quant_params, quantizers, quant_dtype = get_quantizer_and_quant_params(args) # GPTFastEvalWrapper: Create a wrapper around a pre-exported model - manager: LLMEdgeManager = _prepare_for_llama_export(model_name, args) + manager: LLMEdgeManager = _prepare_for_llama_export(args) if len(quantizers) != 0: manager = manager.export().pt2e_quantize(quantizers) diff --git a/examples/models/llama/runner/eager.py b/examples/models/llama/runner/eager.py index e116e08a099..e68b85fac2a 100644 --- a/examples/models/llama/runner/eager.py +++ b/examples/models/llama/runner/eager.py @@ -38,7 +38,7 @@ def __init__(self, args): model_args=model_args, device="cuda" if torch.cuda.is_available() else "cpu", ) - manager: LLMEdgeManager = _prepare_for_llama_export("llama", args) + manager: LLMEdgeManager = _prepare_for_llama_export(args) self.model = manager.model.eval().to(device=self.device) def forward( From 0f3035df8312fde89b2a0926fe8a5f81efe090ba Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Fri, 25 Oct 2024 16:36:26 -0700 Subject: [PATCH 18/44] Fix tests --- .ci/scripts/test_eval_llama_mmlu.sh | 1 + .ci/scripts/test_eval_llama_wikitext.sh | 1 + .ci/scripts/test_llama.sh | 2 +- .ci/scripts/test_llama_runner_eager.sh | 1 + .ci/scripts/test_model.sh | 2 +- 5 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.ci/scripts/test_eval_llama_mmlu.sh b/.ci/scripts/test_eval_llama_mmlu.sh index c3c0a3d1a69..3a7777c7b12 100644 --- a/.ci/scripts/test_eval_llama_mmlu.sh +++ b/.ci/scripts/test_eval_llama_mmlu.sh @@ -35,6 +35,7 @@ run_and_verify() { exit 1 fi $PYTHON_EXECUTABLE -m examples.models.llama.eval_llama \ + --model llama2 -c stories110M.pt \ -p params.json \ -t tokenizer.model \ diff --git a/.ci/scripts/test_eval_llama_wikitext.sh 
b/.ci/scripts/test_eval_llama_wikitext.sh index 77af12270ca..ba2b2ec6b30 100644 --- a/.ci/scripts/test_eval_llama_wikitext.sh +++ b/.ci/scripts/test_eval_llama_wikitext.sh @@ -35,6 +35,7 @@ run_and_verify() { exit 1 fi $PYTHON_EXECUTABLE -m examples.models.llama.eval_llama \ + --model llama2 \ -c stories110M.pt \ -p params.json \ -t tokenizer.model \ diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh index ed2a9c2558b..49650364a7e 100644 --- a/.ci/scripts/test_llama.sh +++ b/.ci/scripts/test_llama.sh @@ -206,7 +206,7 @@ if [[ "${QNN}" == "ON" ]]; then EXPORT_ARGS="${EXPORT_ARGS} -kv -v --qnn --disable_dynamic_shape" fi # Add dynamically linked library location -$PYTHON_EXECUTABLE -m examples.models.llama.export_llama ${EXPORT_ARGS} +$PYTHON_EXECUTABLE -m examples.models.llama.export_llama --model llama3 ${EXPORT_ARGS} # Create tokenizer.bin. echo "Creating tokenizer.bin" diff --git a/.ci/scripts/test_llama_runner_eager.sh b/.ci/scripts/test_llama_runner_eager.sh index 537d835ba1c..daa0a386ded 100644 --- a/.ci/scripts/test_llama_runner_eager.sh +++ b/.ci/scripts/test_llama_runner_eager.sh @@ -35,6 +35,7 @@ run_and_verify() { exit 1 fi $PYTHON_EXECUTABLE -m examples.models.llama.runner.eager \ + --model llama2 -c stories110M.pt \ -p params.json \ -t tokenizer.model \ diff --git a/.ci/scripts/test_model.sh b/.ci/scripts/test_model.sh index 4e37d0ebaa3..a2608a03f0c 100755 --- a/.ci/scripts/test_model.sh +++ b/.ci/scripts/test_model.sh @@ -77,7 +77,7 @@ test_model() { # Install requirements for export_llama bash examples/models/llama/install_requirements.sh # Test export_llama script: python3 -m examples.models.llama.export_llama - "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama -c examples/models/llama/params/demo_rand_params.pth -p examples/models/llama/params/demo_config.json + "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama --model llama2 -c examples/models/llama/params/demo_rand_params.pth -p examples/models/llama/params/demo_config.json run_portable_executor_runner rm "./${MODEL_NAME}.pte" fi From b1f66785c4ecc21eb922fa5025b6941e78e399f0 Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Mon, 28 Oct 2024 08:11:06 -0700 Subject: [PATCH 19/44] Fix tests again --- .ci/scripts/test_eval_llama_mmlu.sh | 2 +- .ci/scripts/test_llama_runner_eager.sh | 2 +- examples/models/llama/export_llama_lib.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.ci/scripts/test_eval_llama_mmlu.sh b/.ci/scripts/test_eval_llama_mmlu.sh index 3a7777c7b12..b8af5fe609f 100644 --- a/.ci/scripts/test_eval_llama_mmlu.sh +++ b/.ci/scripts/test_eval_llama_mmlu.sh @@ -35,7 +35,7 @@ run_and_verify() { exit 1 fi $PYTHON_EXECUTABLE -m examples.models.llama.eval_llama \ - --model llama2 + --model llama2 \ -c stories110M.pt \ -p params.json \ -t tokenizer.model \ diff --git a/.ci/scripts/test_llama_runner_eager.sh b/.ci/scripts/test_llama_runner_eager.sh index daa0a386ded..769eb60142a 100644 --- a/.ci/scripts/test_llama_runner_eager.sh +++ b/.ci/scripts/test_llama_runner_eager.sh @@ -35,7 +35,7 @@ run_and_verify() { exit 1 fi $PYTHON_EXECUTABLE -m examples.models.llama.runner.eager \ - --model llama2 + --model llama2 \ -c stories110M.pt \ -p params.json \ -t tokenizer.model \ diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 12598eac365..b3eb2b8ad6a 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -806,7 +806,7 @@ def _load_llama_model( if modelname in 
EXECUTORCH_DEFINED_MODELS: # Set to llama2 because all models in EXECUTORCH_DEFINED_MODELS share the same archteciture as # defined in example/models/llama2. - modelname = "llama2" + modelname = "llama" model_class_name = "Llama2Model" elif modelname in TORCHTUNE_DEFINED_MODELS: raise NotImplementedError( From c79b77365c1a416104fcc7d902dd031591cf91d7 Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Thu, 31 Oct 2024 12:46:27 -0700 Subject: [PATCH 20/44] Strict = True --- examples/models/llama3_2_vision/model.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/models/llama3_2_vision/model.py b/examples/models/llama3_2_vision/model.py index 1a51ffeaf08..8b056680315 100644 --- a/examples/models/llama3_2_vision/model.py +++ b/examples/models/llama3_2_vision/model.py @@ -107,7 +107,7 @@ def __init__(self, **kwargs): # Load checkpoint. missing, unexpected = self.model_.load_state_dict( checkpoint, - strict=False, + strict=True, assign=True, ) if kwargs.get("verbose", False): @@ -154,17 +154,17 @@ def get_example_kwarg_inputs(self): # "mask": self.causal_mask[None, 64, None, :], # "encoder_input": None, # "encoder_mask": None, - # "input_pos": torch.ones(64, dtype=torch.long), - # input_pos: self.input_pos[None, 64] + # "input_pos": self.input_pos[None, 64] } def get_dynamic_shapes(self): - dim = torch.export.Dim("token_dim", min=1, max=self.max_seq_len) + batch_size = 1 + dim_seq_len = torch.export.Dim("token_dim", min=1, max=self.max_seq_len) dynamic_shapes = { - "tokens": {0: 1, 1: dim}, - # "encoder_input": {0:1, 1:dim_enc, 2:4096}, - # "encoder_mask": {0:1, 1:dim, 2:dim_enc}, - # "mask": None, - # "input_pos" : {0: dim}, + "tokens": {0: batch_size, 1: dim_seq_len}, + # "encoder_input": {0: 1, 1: dim_enc, 2: 4096}, + # "encoder_mask": {0: 1, 1: dim, 2: dim_enc}, + # "mask": {0: batch_size, 1: dim_seq_len, 2: self.max_seq_len}, + # "input_pos" : {0: batch_size, 1: dim_seq_len}, } return dynamic_shapes From b8ff8e2ac5f2ae53bfe4da10ff8ffaeae4b34ed5 Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Thu, 31 Oct 2024 13:06:43 -0700 Subject: [PATCH 21/44] Things work --- examples/models/llama2/runner/eager.py | 9 ++--- examples/models/llama2/runner/generation.py | 41 +++++++++++++++------ examples/models/llama2/runner/native.py | 9 ++--- extension/llm/export/builder.py | 11 +++++- 4 files changed, 46 insertions(+), 24 deletions(-) diff --git a/examples/models/llama2/runner/eager.py b/examples/models/llama2/runner/eager.py index 42357d6e55c..65c99a0f590 100644 --- a/examples/models/llama2/runner/eager.py +++ b/examples/models/llama2/runner/eager.py @@ -10,7 +10,6 @@ import torch -from examples.models.llama2.llama_transformer import ModelArgs from executorch.examples.models.model_factory import EagerModelFactory from .generation import LlamaRunner @@ -24,13 +23,13 @@ class EagerLlamaRunner(LlamaRunner): def __init__(self, args): with open(args.params, "r") as f: params = json.loads(f.read()) - model_args: ModelArgs = ModelArgs( - max_seq_len=args.max_len, + super().__init__( + tokenizer_path=args.tokenizer, + max_seq_len=args.max_seq_len, max_batch_size=1, use_kv_cache=True, - **params, + vocab_size=params["vocab_size"], ) - super().__init__(tokenizer_path=args.tokenizer, model_args=model_args) self.model, _, _, _ = EagerModelFactory.create_model( "llama2", "Llama2Model", diff --git a/examples/models/llama2/runner/generation.py b/examples/models/llama2/runner/generation.py index 6d43c84932f..f255a14c187 100644 --- a/examples/models/llama2/runner/generation.py 
+++ b/examples/models/llama2/runner/generation.py @@ -51,10 +51,19 @@ def next_token(logits: torch.Tensor, temperature: float, top_p: float) -> int: class LlamaRunner(ABC): - def __init__(self, tokenizer_path: str, model_args: ModelArgs): - self.params = model_args + def __init__( + self, + tokenizer_path: str, + max_seq_len: int, + max_batch_size: int, + use_kv_cache: bool, + vocab_size: int, + ): + self.max_seq_len = max_seq_len + self.max_batch_size = max_batch_size + self.use_kv_cache = use_kv_cache self.tokenizer = Tokenizer(tokenizer_path) - assert model_args.vocab_size == self.tokenizer.n_words + assert vocab_size == self.tokenizer.n_words @abstractmethod def forward( @@ -75,27 +84,35 @@ def generate( # noqa: C901 logits = self.forward( tokens=torch.tensor([prompt_tokens], dtype=torch.long), input_pos=( - torch.tensor([0], dtype=torch.long) - if self.params.use_kv_cache - else None + torch.tensor([0], dtype=torch.long) if self.use_kv_cache else None ), ) - current_token = next_token(logits, temperature, top_p) + # TODO: accomodate TorchTune model, which doesn't + # make an optimization of dropping all logits but the last. + current_token = next_token(logits[:, -1, :], temperature, top_p) tokens = prompt_tokens + [current_token] - while len(tokens) < self.params.max_seq_len: - if self.params.use_kv_cache: + i = 0 + while len(tokens) < self.max_seq_len: + print(f"{i} out of {self.max_seq_len} max tokens generated") + if self.use_kv_cache: logits = self.forward( tokens=torch.tensor([[current_token]], dtype=torch.long), input_pos=torch.tensor([len(tokens) - 1], dtype=torch.long), ) else: - logits = self.forward(tokens=torch.tensor([tokens], dtype=torch.long)) - current_token = next_token(logits, temperature, top_p) - if current_token in self.tokenizer.stop_tokens: + logits = self.forward( + tokens=torch.tensor([tokens], dtype=torch.long, device=self.device), + ) + current_token = next_token(logits[:, -1, :], temperature, top_p) + if current_token == self.tokenizer.eos_id or ( + hasattr(self.tokenizer, "stop_tokens") + and current_token in self.tokenizer.stop_tokens + ): break tokens.append(current_token) + i += 1 return tokens if echo else tokens[len(prompt_tokens) :] diff --git a/examples/models/llama2/runner/native.py b/examples/models/llama2/runner/native.py index c457762d71f..afc775fad01 100644 --- a/examples/models/llama2/runner/native.py +++ b/examples/models/llama2/runner/native.py @@ -10,18 +10,17 @@ import torch -from executorch.examples.models.llama2.llama_transformer import ModelArgs from executorch.extension.pybindings.portable_lib import _load_for_executorch # Load custom ops and quantized ops. 
from executorch.extension.pybindings import portable_lib # noqa # usort: skip +from executorch.examples.models.llama2.runner.generation import LlamaRunner + # Note: import this after portable_lib # from executorch.extension.llm.custom_ops import sdpa_with_kv_cache # noqa # usort: skip from executorch.kernels import quantized # noqa -from executorch.examples.models.llama2.runner.generation import LlamaRunner - class NativeLlamaRunner(LlamaRunner): """ @@ -31,13 +30,13 @@ class NativeLlamaRunner(LlamaRunner): def __init__(self, args): with open(args.params, "r") as f: params = json.loads(f.read()) - model_args: ModelArgs = ModelArgs( + super().__init__( + tokenizer_path=args.tokenizer, max_seq_len=args.max_len, max_batch_size=1, use_kv_cache=args.kv_cache, vocab_size=params["vocab_size"], ) - super().__init__(tokenizer_path=args.tokenizer, model_args=model_args) self.model = _load_for_executorch(args.pte) def forward( diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index 11d92f32f56..e2308143611 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -193,12 +193,19 @@ def capture_pre_autograd_graph(self) -> "LLMEdgeManager": strict=True, ).module() else: - self.pre_autograd_graph_module = capture_pre_autograd_graph( + # pyre-fixme[8]: Attribute has type `Optional[GraphModule]`; used as + # `Module`. + print("Exporting with:") + print(f"inputs: {self.example_inputs}") + print(f"kwargs: {self.example_kwarg_inputs}") + print(f"dynamic shapes: {dynamic_shape}") + + self.pre_autograd_graph_module = export_for_training( self.model, self.example_inputs, kwargs=self.example_kwarg_inputs, dynamic_shapes=dynamic_shape, - ) + ).module() return self From 6e38763ed57d79de0c044805266606e0fba0184a Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Thu, 31 Oct 2024 13:27:10 -0700 Subject: [PATCH 22/44] Clip logits if torchtune --- examples/models/llama/runner/generation.py | 11 +++++++---- extension/llm/export/builder.py | 4 ++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/examples/models/llama/runner/generation.py b/examples/models/llama/runner/generation.py index aeb758b4341..09aaa6088b3 100644 --- a/examples/models/llama/runner/generation.py +++ b/examples/models/llama/runner/generation.py @@ -127,15 +127,18 @@ def generate( # noqa: C901 tokens=torch.tensor([tokens], dtype=torch.long, device=self.device), ) - if self.has_full_logits: - current_token = next_token(logits[:, -1, :], temperature, top_p) - else: - current_token = next_token(logits, temperature, top_p) + # If the logits aren't already clipped to only contain the last logit, clip them. + if self.has_full_logits: + current_token = next_token(logits[:, -1, :], temperature, top_p) + else: + current_token = next_token(logits, temperature, top_p) + if current_token == self.tokenizer.eos_id or ( hasattr(self.tokenizer, "stop_tokens") and current_token in self.tokenizer.stop_tokens ): break + tokens.append(current_token) i += 1 diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index 621a0ec36b9..9244c7dd797 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -194,13 +194,13 @@ def export(self) -> "LLMEdgeManager": strict=True, ).module() else: - # pyre-fixme[8]: Attribute has type `Optional[GraphModule]`; used as - # `Module`. 
print("Exporting with:") print(f"inputs: {self.example_inputs}") print(f"kwargs: {self.example_kwarg_inputs}") print(f"dynamic shapes: {dynamic_shape}") + # pyre-fixme[8]: Attribute has type `Optional[GraphModule]`; used as + # `Module`. self.pre_autograd_graph_module = export_for_training( self.model, self.example_inputs, From 96d579858d942d23752e1203a1ad912c22cd7d13 Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Thu, 31 Oct 2024 15:01:55 -0700 Subject: [PATCH 23/44] Fix --- examples/models/llama/runner/eager.py | 2 +- examples/models/llama/runner/generation.py | 1 - examples/models/llama/runner/native.py | 12 +++++++++++- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/examples/models/llama/runner/eager.py b/examples/models/llama/runner/eager.py index 23423b9b677..1a7c4fd157a 100644 --- a/examples/models/llama/runner/eager.py +++ b/examples/models/llama/runner/eager.py @@ -34,7 +34,7 @@ def __init__(self, args): max_batch_size=1, use_kv_cache=args.use_kv_cache, vocab_size=params["vocab_size"], - has_full_logits=args.model in TORCHTUNE_DEFINED_MODELS + has_full_logits=args.model in TORCHTUNE_DEFINED_MODELS, device="cuda" if torch.cuda.is_available() else "cpu", ) manager: LLMEdgeManager = _prepare_for_llama_export(args) diff --git a/examples/models/llama/runner/generation.py b/examples/models/llama/runner/generation.py index 09aaa6088b3..60b8b9d8b2b 100644 --- a/examples/models/llama/runner/generation.py +++ b/examples/models/llama/runner/generation.py @@ -73,7 +73,6 @@ def __init__( has_full_logits: whether the model returns the full logits or only returns the last logit. device: device to run the runner on. """ - self.model_name = model self.max_seq_len = max_seq_len self.max_batch_size = max_batch_size self.use_kv_cache = use_kv_cache diff --git a/examples/models/llama/runner/native.py b/examples/models/llama/runner/native.py index afc775fad01..ae09bda532d 100644 --- a/examples/models/llama/runner/native.py +++ b/examples/models/llama/runner/native.py @@ -10,12 +10,14 @@ import torch +from executorch.examples.models.llama.export_llama_lib import EXECUTORCH_DEFINED_MODELS, TORCHTUNE_DEFINED_MODELS + from executorch.extension.pybindings.portable_lib import _load_for_executorch # Load custom ops and quantized ops. from executorch.extension.pybindings import portable_lib # noqa # usort: skip -from executorch.examples.models.llama2.runner.generation import LlamaRunner +from executorch.examples.models.llama.runner.generation import LlamaRunner # Note: import this after portable_lib # from executorch.extension.llm.custom_ops import sdpa_with_kv_cache # noqa # usort: skip @@ -36,6 +38,7 @@ def __init__(self, args): max_batch_size=1, use_kv_cache=args.kv_cache, vocab_size=params["vocab_size"], + has_full_logits=args.model in TORCHTUNE_DEFINED_MODELS, ) self.model = _load_for_executorch(args.pte) @@ -58,8 +61,15 @@ def forward( def build_args_parser() -> argparse.ArgumentParser: + # TODO: merge these with build_args_parser from export_llama_lib. 
parser = argparse.ArgumentParser() + parser.add_argument( + "--model", + default="llama", + choices=EXECUTORCH_DEFINED_MODELS + TORCHTUNE_DEFINED_MODELS, + ) + parser.add_argument( "-f", "--pte", From f275e2eebb7bf69e3be06711b030d13222f543c2 Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Fri, 1 Nov 2024 08:06:31 -0700 Subject: [PATCH 24/44] Kv cache by default is false --- examples/models/llama/runner/native.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/models/llama/runner/native.py b/examples/models/llama/runner/native.py index ae09bda532d..1228219715d 100644 --- a/examples/models/llama/runner/native.py +++ b/examples/models/llama/runner/native.py @@ -104,7 +104,6 @@ def build_args_parser() -> argparse.ArgumentParser: parser.add_argument( "-kv", "--kv_cache", - default=True, action="store_true", ) From 37011d3de2ebab83fb2475660495a1ade9ce9abc Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Fri, 1 Nov 2024 08:11:47 -0700 Subject: [PATCH 25/44] Clean up --- examples/models/llama/runner/eager.py | 1 - examples/models/llama/runner/generation.py | 13 ++++++++----- examples/models/llama/runner/native.py | 5 ++++- examples/models/llama3_2_vision/model.py | 4 +++- 4 files changed, 15 insertions(+), 8 deletions(-) diff --git a/examples/models/llama/runner/eager.py b/examples/models/llama/runner/eager.py index 1a7c4fd157a..5fdc2fae665 100644 --- a/examples/models/llama/runner/eager.py +++ b/examples/models/llama/runner/eager.py @@ -10,7 +10,6 @@ import torch -from examples.models.llama.llama_transformer import ModelArgs from executorch.examples.models.llama.export_llama_lib import ( _prepare_for_llama_export, build_args_parser as _build_args_parser, diff --git a/examples/models/llama/runner/generation.py b/examples/models/llama/runner/generation.py index 60b8b9d8b2b..3ab4c8e1ede 100644 --- a/examples/models/llama/runner/generation.py +++ b/examples/models/llama/runner/generation.py @@ -9,7 +9,6 @@ import torch -from executorch.examples.models.llama.llama_transformer import ModelArgs from executorch.extension.llm.tokenizer.utils import get_tokenizer @@ -63,7 +62,7 @@ def __init__( ): """ Constructor. - + Args: tokenizer_path: path to tokenizer.model file. max_seq_len: max length of the output sequence, after which the output will be clipped. @@ -100,13 +99,17 @@ def generate( # noqa: C901 logits = self.forward( tokens=torch.tensor([prompt_tokens], dtype=torch.long, device=self.device), input_pos=( - torch.tensor([0], dtype=torch.long, device=self.device) if self.use_kv_cache else None + torch.tensor([0], dtype=torch.long, device=self.device) + if self.use_kv_cache + else None ), ) - # TODO: accomodate TorchTune model, which doesn't - # make an optimization of dropping all logits but the last. 
current_token = next_token(logits[:, -1, :], temperature, top_p) + if self.has_full_logits: + current_token = next_token(logits[:, -1, :], temperature, top_p) + else: + current_token = next_token(logits, temperature, top_p) tokens = prompt_tokens + [current_token] i = 0 diff --git a/examples/models/llama/runner/native.py b/examples/models/llama/runner/native.py index 1228219715d..5b3117674d7 100644 --- a/examples/models/llama/runner/native.py +++ b/examples/models/llama/runner/native.py @@ -10,7 +10,10 @@ import torch -from executorch.examples.models.llama.export_llama_lib import EXECUTORCH_DEFINED_MODELS, TORCHTUNE_DEFINED_MODELS +from executorch.examples.models.llama.export_llama_lib import ( + EXECUTORCH_DEFINED_MODELS, + TORCHTUNE_DEFINED_MODELS, +) from executorch.extension.pybindings.portable_lib import _load_for_executorch diff --git a/examples/models/llama3_2_vision/model.py b/examples/models/llama3_2_vision/model.py index 8b056680315..f7321373d30 100644 --- a/examples/models/llama3_2_vision/model.py +++ b/examples/models/llama3_2_vision/model.py @@ -40,7 +40,9 @@ class Llama3_2Decoder(EagerModelBase): def __init__(self, **kwargs): # Set member vars from kwargs. - self.max_seq_len = kwargs.get("max_seq_len", 8192) # Trained to be a lot larger, but this value is kept small because of static kv cache at the moment. + self.max_seq_len = kwargs.get( + "max_seq_len", 8192 + ) # Trained to be a lot larger, but this value is kept small because of static kv cache at the moment. self.encoder_max_seq_len = kwargs.get( "encoder_max_seq_len", int(4 * (448 / 14) ** 2 + 1) ) # Same as above. From 7d520029e1a5fcd6ceeb59babff30edd230ea021 Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Mon, 4 Nov 2024 13:24:50 -0800 Subject: [PATCH 26/44] Export model with KV cache + runner for Torchtune models --- examples/models/llama/runner/generation.py | 29 +++------------------- examples/models/llama3_2_vision/model.py | 25 ++++++++++--------- examples/models/model_factory.py | 2 +- 3 files changed, 17 insertions(+), 39 deletions(-) diff --git a/examples/models/llama/runner/generation.py b/examples/models/llama/runner/generation.py index 3ab4c8e1ede..2b55141d2e1 100644 --- a/examples/models/llama/runner/generation.py +++ b/examples/models/llama/runner/generation.py @@ -57,26 +57,12 @@ def __init__( max_batch_size: int, use_kv_cache: bool, vocab_size: int, - has_full_logits: bool = False, device: str = "cpu", ): - """ - Constructor. - - Args: - tokenizer_path: path to tokenizer.model file. - max_seq_len: max length of the output sequence, after which the output will be clipped. - max_batch_size: max batch size. - use_kv_cache: whether to use a KV cache. - vocab_size: number of items in the vocab. - has_full_logits: whether the model returns the full logits or only returns the last logit. - device: device to run the runner on. 
- """ self.max_seq_len = max_seq_len self.max_batch_size = max_batch_size self.use_kv_cache = use_kv_cache self.tokenizer = get_tokenizer(tokenizer_path) - self.has_full_logits = has_full_logits self.device = device assert vocab_size == self.tokenizer.n_words @@ -95,7 +81,7 @@ def generate( # noqa: C901 top_p: float = 0.9, echo: bool = False, ) -> List[int]: - # prefill + # Prefill logits = self.forward( tokens=torch.tensor([prompt_tokens], dtype=torch.long, device=self.device), input_pos=( @@ -105,11 +91,7 @@ def generate( # noqa: C901 ), ) - current_token = next_token(logits[:, -1, :], temperature, top_p) - if self.has_full_logits: - current_token = next_token(logits[:, -1, :], temperature, top_p) - else: - current_token = next_token(logits, temperature, top_p) + current_token = next_token(logits, temperature, top_p) tokens = prompt_tokens + [current_token] i = 0 @@ -129,12 +111,7 @@ def generate( # noqa: C901 tokens=torch.tensor([tokens], dtype=torch.long, device=self.device), ) - # If the logits aren't already clipped to only contain the last logit, clip them. - if self.has_full_logits: - current_token = next_token(logits[:, -1, :], temperature, top_p) - else: - current_token = next_token(logits, temperature, top_p) - + current_token = next_token(logits, temperature, top_p) if current_token == self.tokenizer.eos_id or ( hasattr(self.tokenizer, "stop_tokens") and current_token in self.tokenizer.stop_tokens diff --git a/examples/models/llama3_2_vision/model.py b/examples/models/llama3_2_vision/model.py index f7321373d30..3e0cdbaf684 100644 --- a/examples/models/llama3_2_vision/model.py +++ b/examples/models/llama3_2_vision/model.py @@ -134,12 +134,12 @@ def __init__(self, **kwargs): self.model_ = prune_output_vocab(self.model_, output_prune_map) - # if self.use_kv_cache: - # print("Setting up KV cache on the model...") - # self.model_.setup_caches( - # batch_size=1, - # dtype=self.dtype, - # ) + if self.use_kv_cache: + print("Setting up KV cache on the model...") + self.model_.setup_caches( + batch_size=1, + dtype=self.dtype, + ) def get_eager_model(self) -> torch.nn.Module: if self.dtype: @@ -148,15 +148,16 @@ def get_eager_model(self) -> torch.nn.Module: return self.model_.to(torch.float16) def get_example_inputs(self): - return (torch.ones(1, 64, dtype=torch.long),) + return (torch.ones(1, 32, dtype=torch.long),) def get_example_kwarg_inputs(self): - # TODO: add input_pos and mask when after making cache work. + # For export we must use the prefill versions of the + # causal mask and input_pos. 
return { - # "mask": self.causal_mask[None, 64, None, :], + "mask": self.causal_mask[None, :32], # "encoder_input": None, # "encoder_mask": None, - # "input_pos": self.input_pos[None, 64] + "input_pos": self.input_pos[None, :32] } def get_dynamic_shapes(self): @@ -166,7 +167,7 @@ def get_dynamic_shapes(self): "tokens": {0: batch_size, 1: dim_seq_len}, # "encoder_input": {0: 1, 1: dim_enc, 2: 4096}, # "encoder_mask": {0: 1, 1: dim, 2: dim_enc}, - # "mask": {0: batch_size, 1: dim_seq_len, 2: self.max_seq_len}, - # "input_pos" : {0: batch_size, 1: dim_seq_len}, + "mask": {0: batch_size, 1: dim_seq_len, 2: dim_seq_len}, + "input_pos" : {0: batch_size, 1: dim_seq_len}, } return dynamic_shapes diff --git a/examples/models/model_factory.py b/examples/models/model_factory.py index 5abe5efe462..5b66aef8de7 100644 --- a/examples/models/model_factory.py +++ b/examples/models/model_factory.py @@ -44,7 +44,7 @@ def create_model( model = model_class(**kwargs) example_kwarg_inputs = None dynamic_shapes = None - if hasattr(model, "get_example_kwarg_inputs()"): + if hasattr(model, "get_example_kwarg_inputs"): example_kwarg_inputs = model.get_example_kwarg_inputs() if hasattr(model, "get_dynamic_shapes"): dynamic_shapes = model.get_dynamic_shapes() From e44b259265317b9880da79d2859b80cbdc75361f Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Wed, 6 Nov 2024 10:18:19 -0800 Subject: [PATCH 27/44] Export with no kv cache + non-strict load checkpoint --- examples/models/llama3_2_vision/model.py | 37 +++++++++++++++--------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/examples/models/llama3_2_vision/model.py b/examples/models/llama3_2_vision/model.py index 3e0cdbaf684..f735b3a3aee 100644 --- a/examples/models/llama3_2_vision/model.py +++ b/examples/models/llama3_2_vision/model.py @@ -109,7 +109,7 @@ def __init__(self, **kwargs): # Load checkpoint. missing, unexpected = self.model_.load_state_dict( checkpoint, - strict=True, + strict=False, assign=True, ) if kwargs.get("verbose", False): @@ -139,6 +139,7 @@ def __init__(self, **kwargs): self.model_.setup_caches( batch_size=1, dtype=self.dtype, + decoder_max_seq_len=self.max_seq_len, ) def get_eager_model(self) -> torch.nn.Module: @@ -153,21 +154,29 @@ def get_example_inputs(self): def get_example_kwarg_inputs(self): # For export we must use the prefill versions of the # causal mask and input_pos. 
- return { - "mask": self.causal_mask[None, :32], - # "encoder_input": None, - # "encoder_mask": None, - "input_pos": self.input_pos[None, :32] - } + if self.use_kv_cache: + return { + "input_pos": self.input_pos[None, :32], + "mask": self.causal_mask[None, :32], + # "encoder_input": None, + # "encoder_mask": None, + } + else: + return None def get_dynamic_shapes(self): batch_size = 1 dim_seq_len = torch.export.Dim("token_dim", min=1, max=self.max_seq_len) - dynamic_shapes = { - "tokens": {0: batch_size, 1: dim_seq_len}, - # "encoder_input": {0: 1, 1: dim_enc, 2: 4096}, - # "encoder_mask": {0: 1, 1: dim, 2: dim_enc}, - "mask": {0: batch_size, 1: dim_seq_len, 2: dim_seq_len}, - "input_pos" : {0: batch_size, 1: dim_seq_len}, - } + if self.use_kv_cache: + dynamic_shapes = { + "tokens": {0: batch_size, 1: dim_seq_len}, + # "encoder_input": {0: 1, 1: dim_enc, 2: 4096}, + # "encoder_mask": {0: 1, 1: dim, 2: dim_enc}, + "mask": {0: batch_size, 1: dim_seq_len, 2: None}, + "input_pos" : {0: batch_size, 1: dim_seq_len}, + } + else: + dynamic_shapes = { + "tokens": {0: batch_size, 1: dim_seq_len}, + } return dynamic_shapes From de45c48ad51299648f9cf1d929b3d752a886baed Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Thu, 31 Oct 2024 12:46:27 -0700 Subject: [PATCH 28/44] Strict = True --- examples/models/llama3_2_vision/model.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/models/llama3_2_vision/model.py b/examples/models/llama3_2_vision/model.py index 1a51ffeaf08..f4cc881b721 100644 --- a/examples/models/llama3_2_vision/model.py +++ b/examples/models/llama3_2_vision/model.py @@ -154,17 +154,17 @@ def get_example_kwarg_inputs(self): # "mask": self.causal_mask[None, 64, None, :], # "encoder_input": None, # "encoder_mask": None, - # "input_pos": torch.ones(64, dtype=torch.long), - # input_pos: self.input_pos[None, 64] + # "input_pos": self.input_pos[None, 64] } def get_dynamic_shapes(self): - dim = torch.export.Dim("token_dim", min=1, max=self.max_seq_len) + batch_size = 1 + dim_seq_len = torch.export.Dim("token_dim", min=1, max=self.max_seq_len) dynamic_shapes = { - "tokens": {0: 1, 1: dim}, - # "encoder_input": {0:1, 1:dim_enc, 2:4096}, - # "encoder_mask": {0:1, 1:dim, 2:dim_enc}, - # "mask": None, - # "input_pos" : {0: dim}, + "tokens": {0: batch_size, 1: dim_seq_len}, + # "encoder_input": {0: 1, 1: dim_enc, 2: 4096}, + # "encoder_mask": {0: 1, 1: dim, 2: dim_enc}, + # "mask": {0: batch_size, 1: dim_seq_len, 2: self.max_seq_len}, + # "input_pos" : {0: batch_size, 1: dim_seq_len}, } return dynamic_shapes From 64dcbda3e972b5202a8ad6f88ec80780f97a1ecc Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Wed, 13 Nov 2024 07:08:38 -0800 Subject: [PATCH 29/44] Lint --- examples/models/llama/export_llama_lib.py | 34 ++++++++++--------- examples/models/llama3_2_vision/model.py | 8 +++-- .../llama3_2_vision/params/demo_config.json | 2 +- 3 files changed, 25 insertions(+), 19 deletions(-) diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index ceab435ba0b..c9f74aa2e46 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -80,7 +80,7 @@ EXECUTORCH_DEFINED_MODELS = ["stories110m", "llama2", "llama3", "llama3_1", "llama3_2"] -TORCHTUNE_DEFINED_MODELS = ["llama3_2_vision] +TORCHTUNE_DEFINED_MODELS = ["llama3_2_vision"] class WeightType(Enum): @@ -887,21 +887,23 @@ def _load_llama_model( else: raise ValueError(f"{modelname} is not a valid Llama model.") - model, 
example_inputs, example_kwarg_inputs, dynamic_shapes = EagerModelFactory.create_model( - module_name, - model_class_name, - checkpoint=checkpoint, - checkpoint_dir=checkpoint_dir, - params=params_path, - use_kv_cache=use_kv_cache, - use_sdpa_with_kv_cache=use_sdpa_with_kv_cache, - generate_full_logits=generate_full_logits, - fairseq2=weight_type == WeightType.FAIRSEQ2, - max_seq_len=max_seq_len, - enable_dynamic_shape=enable_dynamic_shape, - input_prune_map_path=input_prune_map_path, - output_prune_map_path=output_prune_map_path, - args=args, + model, example_inputs, example_kwarg_inputs, dynamic_shapes = ( + EagerModelFactory.create_model( + module_name, + model_class_name, + checkpoint=checkpoint, + checkpoint_dir=checkpoint_dir, + params=params_path, + use_kv_cache=use_kv_cache, + use_sdpa_with_kv_cache=use_sdpa_with_kv_cache, + generate_full_logits=generate_full_logits, + fairseq2=weight_type == WeightType.FAIRSEQ2, + max_seq_len=max_seq_len, + enable_dynamic_shape=enable_dynamic_shape, + input_prune_map_path=input_prune_map_path, + output_prune_map_path=output_prune_map_path, + args=args, + ) ) if dtype_override: assert isinstance( diff --git a/examples/models/llama3_2_vision/model.py b/examples/models/llama3_2_vision/model.py index f4cc881b721..73d79bb08e0 100644 --- a/examples/models/llama3_2_vision/model.py +++ b/examples/models/llama3_2_vision/model.py @@ -40,7 +40,9 @@ class Llama3_2Decoder(EagerModelBase): def __init__(self, **kwargs): # Set member vars from kwargs. - self.max_seq_len = kwargs.get("max_seq_len", 8192) # Trained to be a lot larger, but this value is kept small because of static kv cache at the moment. + self.max_seq_len = kwargs.get( + "max_seq_len", 8192 + ) # Trained to be a lot larger, but this value is kept small because of static kv cache at the moment. self.encoder_max_seq_len = kwargs.get( "encoder_max_seq_len", int(4 * (448 / 14) ** 2 + 1) ) # Same as above. @@ -73,7 +75,9 @@ def __init__(self, **kwargs): "Sharded checkpoint not yet supported for Llama3_2Decoder." 
) else: - checkpoint = torch.load(checkpoint_path, map_location=device, mmap=True) + checkpoint = torch.load( + checkpoint_path, map_location=device, weights_only=False, mmap=True + ) checkpoint = llama3_vision_meta_to_tune(checkpoint) checkpoint = to_decoder_checkpoint(checkpoint) with open(params_path, "r") as f: diff --git a/examples/models/llama3_2_vision/params/demo_config.json b/examples/models/llama3_2_vision/params/demo_config.json index 625524ad4c8..694df17d945 100644 --- a/examples/models/llama3_2_vision/params/demo_config.json +++ b/examples/models/llama3_2_vision/params/demo_config.json @@ -15,4 +15,4 @@ "vision_max_num_chunks": 4, "vocab_size": 128256, "vision_num_cross_attention_layers": 8 -} \ No newline at end of file +} From a89d6b2c20ff08394c78e9504caffe43cc1e7342 Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Wed, 13 Nov 2024 07:23:56 -0800 Subject: [PATCH 30/44] Fix merge --- .ci/scripts/test_eval_llama_mmlu.sh | 1 - .ci/scripts/test_eval_llama_wikitext.sh | 1 - .ci/scripts/test_llama.sh | 2 +- .ci/scripts/test_llama_runner_eager.sh | 1 - .ci/scripts/test_model.sh | 2 +- backends/vulkan/docs/android_demo.md | 1 - .../LlamaDemo/docs/delegates/qualcomm_README.md | 4 ++-- .../android/LlamaDemo/docs/delegates/xnnpack_README.md | 10 +++++----- .../apple_ios/LLaMA/docs/delegates/mps_README.md | 4 ++-- .../apple_ios/LLaMA/docs/delegates/xnnpack_README.md | 8 ++++---- examples/models/llama/README.md | 3 --- examples/models/llama/UTILS.md | 8 ++++---- examples/models/llama2/README.md | 2 +- 13 files changed, 20 insertions(+), 27 deletions(-) diff --git a/.ci/scripts/test_eval_llama_mmlu.sh b/.ci/scripts/test_eval_llama_mmlu.sh index b8af5fe609f..c3c0a3d1a69 100644 --- a/.ci/scripts/test_eval_llama_mmlu.sh +++ b/.ci/scripts/test_eval_llama_mmlu.sh @@ -35,7 +35,6 @@ run_and_verify() { exit 1 fi $PYTHON_EXECUTABLE -m examples.models.llama.eval_llama \ - --model llama2 \ -c stories110M.pt \ -p params.json \ -t tokenizer.model \ diff --git a/.ci/scripts/test_eval_llama_wikitext.sh b/.ci/scripts/test_eval_llama_wikitext.sh index ba2b2ec6b30..77af12270ca 100644 --- a/.ci/scripts/test_eval_llama_wikitext.sh +++ b/.ci/scripts/test_eval_llama_wikitext.sh @@ -35,7 +35,6 @@ run_and_verify() { exit 1 fi $PYTHON_EXECUTABLE -m examples.models.llama.eval_llama \ - --model llama2 \ -c stories110M.pt \ -p params.json \ -t tokenizer.model \ diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh index 49650364a7e..ed2a9c2558b 100644 --- a/.ci/scripts/test_llama.sh +++ b/.ci/scripts/test_llama.sh @@ -206,7 +206,7 @@ if [[ "${QNN}" == "ON" ]]; then EXPORT_ARGS="${EXPORT_ARGS} -kv -v --qnn --disable_dynamic_shape" fi # Add dynamically linked library location -$PYTHON_EXECUTABLE -m examples.models.llama.export_llama --model llama3 ${EXPORT_ARGS} +$PYTHON_EXECUTABLE -m examples.models.llama.export_llama ${EXPORT_ARGS} # Create tokenizer.bin. 
echo "Creating tokenizer.bin" diff --git a/.ci/scripts/test_llama_runner_eager.sh b/.ci/scripts/test_llama_runner_eager.sh index 1fb44ee957a..0f2cb7b3769 100644 --- a/.ci/scripts/test_llama_runner_eager.sh +++ b/.ci/scripts/test_llama_runner_eager.sh @@ -35,7 +35,6 @@ run_and_verify() { exit 1 fi $PYTHON_EXECUTABLE -m examples.models.llama.runner.eager \ - --model llama2 \ -c stories110M.pt \ -p params.json \ -t tokenizer.model \ diff --git a/.ci/scripts/test_model.sh b/.ci/scripts/test_model.sh index a2608a03f0c..4e37d0ebaa3 100755 --- a/.ci/scripts/test_model.sh +++ b/.ci/scripts/test_model.sh @@ -77,7 +77,7 @@ test_model() { # Install requirements for export_llama bash examples/models/llama/install_requirements.sh # Test export_llama script: python3 -m examples.models.llama.export_llama - "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama --model llama2 -c examples/models/llama/params/demo_rand_params.pth -p examples/models/llama/params/demo_config.json + "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama -c examples/models/llama/params/demo_rand_params.pth -p examples/models/llama/params/demo_config.json run_portable_executor_runner rm "./${MODEL_NAME}.pte" fi diff --git a/backends/vulkan/docs/android_demo.md b/backends/vulkan/docs/android_demo.md index 1314a6503aa..2a4faacc0c8 100644 --- a/backends/vulkan/docs/android_demo.md +++ b/backends/vulkan/docs/android_demo.md @@ -58,7 +58,6 @@ partially lower the Llama model to Vulkan. ```shell # The files will usually be downloaded to ~/.llama python -m examples.models.llama.export_llama \ - --model llama3_2 --disable_dynamic_shape --vulkan -kv --use_sdpa_with_kv_cache -d fp32 \ -c ~/.llama/checkpoints/Llama3.2-1B/consolidated.00.pth \ -p ~/.llama/checkpoints/Llama3.2-1B/params.json \ diff --git a/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md b/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md index 85e5b63c72e..7790f66923c 100644 --- a/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md +++ b/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md @@ -101,12 +101,12 @@ We support PTQ by default. The entire export may take ~20 minutes (Llama 3.1 8B) Examples: ``` # 4 bits weight only quantize -python -m examples.models.llama.export_llama --model llama3 --checkpoint "${MODEL_DIR}/consolidated.00.pth" -p "${MODEL_DIR}/params.json" -kv --disable_dynamic_shape --qnn --pt2e_quantize qnn_16a4w -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="test.pte” +python -m examples.models.llama.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" -p "${MODEL_DIR}/params.json" -kv --disable_dynamic_shape --qnn --pt2e_quantize qnn_16a4w -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="test.pte” ``` If the model is really big, it may require model sharding because the Qualcomm DSP is a 32bit system and has a 4GB size limit . For example for Llama 3 8B models, we need to shard the model into 4, but ExecuTorch still packages it into one PTE file. 
Here is an example: ``` # 8 bits quantization with 4 shards -python -m examples.models.llama.export_llama --model llama3 --checkpoint "${MODEL_DIR}/consolidated.00.pth" -p "${MODEL_DIR}/params.json" -kv --disable_dynamic_shape --qnn --pt2e_quantize qnn_8a8w -d fp32 --num_sharding 4 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="test.pte” +python -m examples.models.llama.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" -p "${MODEL_DIR}/params.json" -kv --disable_dynamic_shape --qnn --pt2e_quantize qnn_8a8w -d fp32 --num_sharding 4 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="test.pte” ``` Note: if you encountered issues below ``` diff --git a/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md b/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md index 4ee52bd1b99..2a6ddbbfe09 100644 --- a/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md +++ b/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md @@ -56,14 +56,14 @@ In this demo app, we support text-only inference with up-to-date Llama models an Meta has released prequantized INT4 SpinQuant Llama 3.2 models that ExecuTorch supports on the XNNPACK backend. * Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama.export_llama --model llama3_2 --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --use_spin_quant native --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_spinquant.pte" +python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --use_spin_quant native --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_spinquant.pte" ``` ### For Llama 3.2 1B and 3B QAT+LoRA models Meta has released prequantized INT4 QAT+LoRA Llama 3.2 models that ExecuTorch supports on the XNNPACK backend. 
* Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama.export_llama --model llama3_2 --checkpoint --params -qat -lora 16 -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_qat_lora.pte" +python -m examples.models.llama.export_llama --checkpoint --params -qat -lora 16 -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_qat_lora.pte" ``` ### For Llama 3.2 1B and 3B BF16 models @@ -72,7 +72,7 @@ We have supported BF16 as a data type on the XNNPACK backend for Llama 3.2 1B/3B * Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama.export_llama --model llama3_2 --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2_bf16.pte" +python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2_bf16.pte" ``` For more detail using Llama 3.2 lightweight models including prompt template, please go to our official [website](https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_2#-llama-3.2-lightweight-models-(1b/3b)-). @@ -87,7 +87,7 @@ To safeguard your application, you can use our Llama Guard models for prompt cla * We prepared this model using the following command ``` -python -m examples.models.llama.export_llama --model llama3_2 --checkpoint --params -d fp32 -kv --use_sdpa_with_kv_cache --quantization_mode 8da4w --group_size 256 --xnnpack --max_seq_length 8193 --embedding-quantize 4,32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_prune_map --output_name="llama_guard_3_1b_pruned_xnnpack.pte" +python -m examples.models.llama.export_llama --checkpoint --params -d fp32 -kv --use_sdpa_with_kv_cache --quantization_mode 8da4w --group_size 256 --xnnpack --max_seq_length 8193 --embedding-quantize 4,32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_prune_map --output_name="llama_guard_3_1b_pruned_xnnpack.pte" ``` @@ -97,7 +97,7 @@ python -m examples.models.llama.export_llama --model llama3_2 --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama.pte" +python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama.pte" ``` You may wonder what the ‘--metadata’ flag is doing. This flag helps export the model with proper special tokens added that the runner can detect EOS tokens easily. 
diff --git a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md index 8aeed59cab9..eb3c244dee7 100644 --- a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md +++ b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md @@ -45,9 +45,9 @@ Install the required packages to export the model sh examples/models/llama/install_requirements.sh ``` -Export the model (Llama 3 in this case) +Export the model ``` -python -m examples.models.llama.export_llama --model llama3 --checkpoint "${MODEL_DIR}/consolidated.00.pth" --params "${MODEL_DIR}/params.json" -kv --use_sdpa_with_kv_cache --mps -d fp32 --disable_dynamic_shape -qmode 8da4w -G 32 +python -m examples.models.llama.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" --params "${MODEL_DIR}/params.json" -kv --use_sdpa_with_kv_cache --mps -d fp32 --disable_dynamic_shape -qmode 8da4w -G 32 ``` ## Pushing Model and Tokenizer diff --git a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md index 63dfd334a10..201a2934470 100644 --- a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md +++ b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md @@ -48,14 +48,14 @@ sh examples/models/llama/install_requirements.sh Meta has released prequantized INT4 SpinQuant Llama 3.2 models that ExecuTorch supports on the XNNPACK backend. * Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama.export_llama --model llama3_2 --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --use_spin_quant native --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_spinquant.pte" +python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --use_spin_quant native --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_spinquant.pte" ``` ### For Llama 3.2 1B and 3B QAT+LoRA models Meta has released prequantized INT4 QAT+LoRA Llama 3.2 models that ExecuTorch supports on the XNNPACK backend. 
* Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama.export_llama --model llama3_2 --checkpoint --params -qat -lora 16 -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_qat_lora.pte" +python -m examples.models.llama.export_llama --checkpoint --params -qat -lora 16 -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_qat_lora.pte" ``` ### For Llama 3.2 1B and 3B BF16 models @@ -64,7 +64,7 @@ We have supported BF16 as a data type on the XNNPACK backend for Llama 3.2 1B/3B * Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama.export_llama --model llama3_2 --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2_bf16.pte" +python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2_bf16.pte" ``` For more detail using Llama 3.2 lightweight models including prompt template, please go to our official [website](https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_2#-llama-3.2-lightweight-models-(1b/3b)-). @@ -73,7 +73,7 @@ For more detail using Llama 3.2 lightweight models including prompt template, pl Export the model ``` -python -m examples.models.llama.export_llama --model llama3_2 --checkpoint -p -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --embedding-quantize 4,32 --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" +python -m examples.models.llama.export_llama --checkpoint -p -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --embedding-quantize 4,32 --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" ``` ### For LLaVA model diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md index d10ddcdc61d..cfa0fe04b1b 100644 --- a/examples/models/llama/README.md +++ b/examples/models/llama/README.md @@ -168,7 +168,6 @@ LLAMA_CHECKPOINT=path/to/checkpoint.pth LLAMA_PARAMS=path/to/params.json python -m examples.models.llama.export_llama \ - --model llama3_2 --checkpoint "${LLAMA_CHECKPOINT:?}" \ --params "${LLAMA_PARAMS:?}" \ -kv \ @@ -190,7 +189,6 @@ LLAMA_QUANTIZED_CHECKPOINT=path/to/spinquant/checkpoint.pth LLAMA_PARAMS=path/to/spinquant/params.json python -m examples.models.llama.export_llama \ - --model llama3_2 --checkpoint "${LLAMA_QUANTIZED_CHECKPOINT:?}" \ --params "${LLAMA_PARAMS:?}" \ --use_sdpa_with_kv_cache \ @@ -216,7 +214,6 @@ LLAMA_QUANTIZED_CHECKPOINT=path/to/qlora/checkpoint.pth LLAMA_PARAMS=path/to/qlora/params.json python -m examples.models.llama.export_llama \ - --model llama3_2 --checkpoint "${LLAMA_QUANTIZED_CHECKPOINT:?}" \ --params "${LLAMA_PARAMS:?}" \ -qat \ diff --git a/examples/models/llama/UTILS.md b/examples/models/llama/UTILS.md index f28da9c5a18..27a7a5832d3 100644 --- a/examples/models/llama/UTILS.md +++ b/examples/models/llama/UTILS.md @@ -19,7 +19,7 @@ From 
`executorch` root: ``` 3. Export model and generate `.pte` file. ``` - python -m examples.models.llama.export_llama --model llama3 -c stories110M.pt -p params.json -X -kv + python -m examples.models.llama.export_llama -c stories110M.pt -p params.json -X -kv ``` ## Smaller model delegated to other backends @@ -27,9 +27,9 @@ From `executorch` root: Currently we supported lowering the stories model to other backends, including, CoreML, MPS and QNN. Please refer to the instruction for each backend ([CoreML](https://pytorch.org/executorch/main/build-run-coreml.html), [MPS](https://pytorch.org/executorch/main/build-run-mps.html), [QNN](https://pytorch.org/executorch/main/build-run-qualcomm-ai-engine-direct-backend.html)) before trying to lower them. After the backend library is installed, the script to export a lowered model is -- Lower to CoreML: `python -m examples.models.llama.export_llama --model llama3 -kv --disable_dynamic_shape --coreml -c stories110M.pt -p params.json ` -- MPS: `python -m examples.models.llama.export_llama --model llama3 -kv --disable_dynamic_shape --mps -c stories110M.pt -p params.json ` -- QNN: `python -m examples.models.llama.export_llama --model llama3 -kv --disable_dynamic_shape --qnn -c stories110M.pt -p params.json ` +- Lower to CoreML: `python -m examples.models.llama.export_llama -kv --disable_dynamic_shape --coreml -c stories110M.pt -p params.json ` +- MPS: `python -m examples.models.llama.export_llama -kv --disable_dynamic_shape --mps -c stories110M.pt -p params.json ` +- QNN: `python -m examples.models.llama.export_llama -kv --disable_dynamic_shape --qnn -c stories110M.pt -p params.json ` The iOS LLAMA app supports the CoreML and MPS model and the Android LLAMA app supports the QNN model. On Android, it also allow to cross compiler the llama runner binary, push to the device and run. diff --git a/examples/models/llama2/README.md b/examples/models/llama2/README.md index 6e0b3794a74..92ddbf74d94 100644 --- a/examples/models/llama2/README.md +++ b/examples/models/llama2/README.md @@ -37,7 +37,7 @@ You can export and run the original Llama 2 7B model. 3. Export model and generate `.pte` file: ``` - python -m examples.models.llama.export_llama --model llama2 --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 + python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 ``` 4. Create tokenizer.bin. 
``` From 84422d9c4e3ae6b1f10ef4d8fbc129afd85bf9ee Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Wed, 13 Nov 2024 07:43:02 -0800 Subject: [PATCH 31/44] Fixes --- examples/models/llama/runner/native.py | 20 +++++++------------- examples/models/llama3_2_vision/model.py | 2 +- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/examples/models/llama/runner/native.py b/examples/models/llama/runner/native.py index e871cfc77fd..06ee8e3e713 100644 --- a/examples/models/llama/runner/native.py +++ b/examples/models/llama/runner/native.py @@ -23,7 +23,7 @@ from executorch.examples.models.llama.runner.generation import LlamaRunner # Note: import this after portable_lib -# from executorch.extension.llm.custom_ops import sdpa_with_kv_cache # noqa # usort: skip +from executorch.extension.llm.custom_ops import sdpa_with_kv_cache # noqa # usort: skip from executorch.kernels import quantized # noqa @@ -50,17 +50,11 @@ def forward( tokens: torch.Tensor, input_pos: Optional[torch.Tensor] = None, ) -> torch.Tensor: - # TODO: in LlamaRunner there is a generate function that automatically generates - # input_pos tensor and inputs it into the model. Atm TorchTune models use - # kwargs for the input_pos, so we will need to make some changes. At least - # for the time being, we can run the non-kv cache version of the Torchtune - # model with just the tokens like below. - return (self.model.forward((tokens,)))[0] - # return ( - # self.model.forward((tokens, input_pos)) - # if input_pos is not None - # else self.model.forward((tokens,)) - # )[0] + return ( + self.model.forward((tokens, input_pos)) + if input_pos is not None + else self.model.forward((tokens,)) + )[0] def build_args_parser() -> argparse.ArgumentParser: @@ -69,7 +63,7 @@ def build_args_parser() -> argparse.ArgumentParser: parser.add_argument( "--model", - default="llama", + default="llama3", choices=EXECUTORCH_DEFINED_MODELS + TORCHTUNE_DEFINED_MODELS, ) diff --git a/examples/models/llama3_2_vision/model.py b/examples/models/llama3_2_vision/model.py index e858ad5f4af..73d79bb08e0 100644 --- a/examples/models/llama3_2_vision/model.py +++ b/examples/models/llama3_2_vision/model.py @@ -111,7 +111,7 @@ def __init__(self, **kwargs): # Load checkpoint. 
missing, unexpected = self.model_.load_state_dict( checkpoint, - strict=True, + strict=False, assign=True, ) if kwargs.get("verbose", False): From 116376973e2aeec33d60bd149cc793825fe0aab6 Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Wed, 13 Nov 2024 13:12:21 -0800 Subject: [PATCH 32/44] Remove token count printing --- examples/models/llama/runner/generation.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/models/llama/runner/generation.py b/examples/models/llama/runner/generation.py index 9eb8f79633a..34c8d2f893a 100644 --- a/examples/models/llama/runner/generation.py +++ b/examples/models/llama/runner/generation.py @@ -110,9 +110,7 @@ def generate( # noqa: C901 print(f"{self.tokenizer.decode_token(current_token)}", end="", flush=True) tokens = prompt_tokens + [current_token] - i = 0 while len(tokens) < max_seq_len: - print(f"{i} out of {self.max_seq_len} max tokens generated") if self.use_kv_cache: logits = self.forward( tokens=torch.tensor( @@ -142,7 +140,6 @@ def generate( # noqa: C901 ): break - i += 1 print(f"{self.tokenizer.decode_token(current_token)}", end="", flush=True) print("\n") From aa289eafba9b30d1c8df46ce701c443dd0a9c452 Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Wed, 13 Nov 2024 13:44:07 -0800 Subject: [PATCH 33/44] Fix faulty merge --- examples/models/llama/runner/generation.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/examples/models/llama/runner/generation.py b/examples/models/llama/runner/generation.py index 6090c8ec715..b75b67ab857 100644 --- a/examples/models/llama/runner/generation.py +++ b/examples/models/llama/runner/generation.py @@ -53,12 +53,25 @@ def __init__( max_batch_size: int, use_kv_cache: bool, vocab_size: int, + has_full_logits: bool = False, device: str = "cpu", ): + """ + Constructor. + Args: + tokenizer_path: path to tokenizer.model file. + max_seq_len: max length of the output sequence, after which the output will be clipped. + max_batch_size: max batch size. + use_kv_cache: whether to use a KV cache. + vocab_size: number of items in the vocab. + has_full_logits: whether the model returns the full logits or only returns the last logit. + device: device to run the runner on. + """ self.max_seq_len = max_seq_len self.max_batch_size = max_batch_size self.use_kv_cache = use_kv_cache self.tokenizer = get_tokenizer(tokenizer_path) + self.has_full_logits = has_full_logits self.device = device assert vocab_size == self.tokenizer.n_words From eeeeb8a4a02270a24e2d5b707b97373772bde73e Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Wed, 13 Nov 2024 14:07:13 -0800 Subject: [PATCH 34/44] Add runner --- .../models/llama3_2_vision/runner/eager.py | 85 +++++++++++++++ .../llama3_2_vision/runner/generation.py | 101 ++++++++++++++++++ 2 files changed, 186 insertions(+) create mode 100644 examples/models/llama3_2_vision/runner/eager.py create mode 100644 examples/models/llama3_2_vision/runner/generation.py diff --git a/examples/models/llama3_2_vision/runner/eager.py b/examples/models/llama3_2_vision/runner/eager.py new file mode 100644 index 00000000000..ea327ad6cc1 --- /dev/null +++ b/examples/models/llama3_2_vision/runner/eager.py @@ -0,0 +1,85 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import argparse +import json +from typing import Optional + +import torch + +from executorch.examples.models.llama.export_llama_lib import ( + _prepare_for_llama_export, + build_args_parser as _build_args_parser, + TORCHTUNE_DEFINED_MODELS, +) +from executorch.examples.models.llama3_2_vision.runner.generation import TorchTuneLlamaRunner +from executorch.extension.llm.export import LLMEdgeManager + + +class EagerLlamaRunner(TorchTuneLlamaRunner): + """ + Runs llama in eager mode with provided checkpoint file. + """ + + def __init__(self, args): + with open(args.params, "r") as f: + params = json.loads(f.read()) + super().__init__( + tokenizer_path=args.tokenizer_path, + max_seq_len=args.max_seq_length, + max_batch_size=1, + use_kv_cache=args.use_kv_cache, + vocab_size=params["vocab_size"], + device="cuda" if torch.cuda.is_available() else "cpu", + ) + manager: LLMEdgeManager = _prepare_for_llama_export(args) + self.model = manager.model.eval().to(device=self.device) + + def forward( + self, + tokens: Optional[torch.LongTensor] = None, + input_pos: Optional[torch.LongTensor] = None, + mask: Optional[torch.LongTensor] = None, + ) -> torch.Tensor: + return self.model.forward(tokens=tokens, input_pos=input_pos, mask=mask) + + +def build_args_parser() -> argparse.ArgumentParser: + parser = _build_args_parser() + + parser.add_argument( + "--prompt", + type=str, + default="Hello", + ) + + parser.add_argument( + "--temperature", + type=float, + default=0, + ) + + return parser + + +def main() -> None: + parser = build_args_parser() + args = parser.parse_args() + + runner = EagerLlamaRunner(args) + result = runner.text_completion( + prompt=args.prompt, + temperature=args.temperature, + ) + print( + "Response: \n{response}\n Tokens:\n {tokens}".format( + response=result["generation"], tokens=result["tokens"] + ) + ) + + +if __name__ == "__main__": + main() # pragma: no cover diff --git a/examples/models/llama3_2_vision/runner/generation.py b/examples/models/llama3_2_vision/runner/generation.py new file mode 100644 index 00000000000..c8e24aecb4b --- /dev/null +++ b/examples/models/llama3_2_vision/runner/generation.py @@ -0,0 +1,101 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from abc import ABC, abstractmethod +from typing import List, Optional, TypedDict + +import torch + +from executorch.extension.llm.tokenizer.utils import get_tokenizer +from executorch.examples.models.llama.runner.generation import LlamaRunner, next_token, sample_top_p + + +class TorchTuneLlamaRunner(LlamaRunner): + def __init__( + self, + tokenizer_path: str, + max_seq_len: int, + max_batch_size: int, + use_kv_cache: bool, + vocab_size: int, + device: str = "cpu", + ): + super().__init__( + tokenizer_path, + max_seq_len, + max_batch_size, + use_kv_cache, + vocab_size, + device, + ) + + self.causal_mask = torch.tril( + torch.ones( + size=(max_seq_len, max_seq_len), + dtype=torch.bool, + ) + ) + self.input_pos = torch.arange(max_seq_len) + + def generate( # noqa: C901 + self, + prompt_tokens: List[int], + max_seq_len: int, + temperature: float = 0.8, + top_p: float = 0.9, + echo: bool = False, + ) -> List[int]: + # Prefill + seq_len = len(prompt_tokens) + input_pos = self.input_pos[None, :seq_len] + mask = self.causal_mask[None, :seq_len] + if self.use_kv_cache: + logits = self.forward( + tokens=torch.tensor([prompt_tokens], dtype=torch.long, device=self.device), + input_pos=input_pos, + mask=mask, + ) + else: + logits = self.forward( + tokens=torch.tensor([prompt_tokens], dtype=torch.long, device=self.device), + ) + + # Only need the last logit. + current_token = next_token(logits[:, -1, :], temperature, top_p) + print(f"{self.tokenizer.decode_token(current_token)}", end="", flush=True) + tokens = prompt_tokens + [current_token] + + while len(tokens) < max_seq_len: + mask = self.causal_mask[None, seq_len, None, :] + input_pos = self.input_pos[None, seq_len, None] + if self.use_kv_cache: + logits = self.forward( + tokens=torch.tensor( + [[current_token]], dtype=torch.long, device=self.device + ), + input_pos=input_pos, + mask=mask, + ) + else: + logits = self.forward( + tokens=torch.tensor([tokens], dtype=torch.long, device=self.device), + ) + + # Only need the last logit. 
+ current_token = next_token(logits[:, -1, :], temperature, top_p) + tokens.append(current_token) + + if current_token == self.tokenizer.eos_id or ( + hasattr(self.tokenizer, "stop_tokens") + and current_token in self.tokenizer.stop_tokens + ): + break + + print(f"{self.tokenizer.decode_token(current_token)}", end="", flush=True) + seq_len += 1 + + return tokens if echo else tokens[len(prompt_tokens) :] + From c80ce1c69def0421ac475d458edf8855296a97ef Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Wed, 13 Nov 2024 14:11:16 -0800 Subject: [PATCH 35/44] Remove has_full_logits from llama runner --- examples/models/llama/runner/eager.py | 2 -- examples/models/llama/runner/generation.py | 13 ++----------- examples/models/llama/runner/native.py | 1 - 3 files changed, 2 insertions(+), 14 deletions(-) diff --git a/examples/models/llama/runner/eager.py b/examples/models/llama/runner/eager.py index b2f29a8f6bb..1dedb80324a 100644 --- a/examples/models/llama/runner/eager.py +++ b/examples/models/llama/runner/eager.py @@ -13,7 +13,6 @@ from executorch.examples.models.llama.export_llama_lib import ( _prepare_for_llama_export, build_args_parser as _build_args_parser, - TORCHTUNE_DEFINED_MODELS, ) from executorch.examples.models.llama.runner.generation import LlamaRunner from executorch.extension.llm.export.builder import LLMEdgeManager @@ -33,7 +32,6 @@ def __init__(self, args): max_batch_size=1, use_kv_cache=args.use_kv_cache, vocab_size=params["vocab_size"], - has_full_logits=args.model in TORCHTUNE_DEFINED_MODELS, device="cuda" if torch.cuda.is_available() else "cpu", ) manager: LLMEdgeManager = _prepare_for_llama_export(args) diff --git a/examples/models/llama/runner/generation.py b/examples/models/llama/runner/generation.py index b75b67ab857..83107abd449 100644 --- a/examples/models/llama/runner/generation.py +++ b/examples/models/llama/runner/generation.py @@ -53,7 +53,6 @@ def __init__( max_batch_size: int, use_kv_cache: bool, vocab_size: int, - has_full_logits: bool = False, device: str = "cpu", ): """ @@ -64,14 +63,12 @@ def __init__( max_batch_size: max batch size. use_kv_cache: whether to use a KV cache. vocab_size: number of items in the vocab. - has_full_logits: whether the model returns the full logits or only returns the last logit. device: device to run the runner on. """ self.max_seq_len = max_seq_len self.max_batch_size = max_batch_size self.use_kv_cache = use_kv_cache self.tokenizer = get_tokenizer(tokenizer_path) - self.has_full_logits = has_full_logits self.device = device assert vocab_size == self.tokenizer.n_words @@ -102,10 +99,7 @@ def generate( # noqa: C901 ), ) - if self.has_full_logits: - current_token = next_token(logits[:, -1, :], temperature, top_p) - else: - current_token = next_token(logits, temperature, top_p) + current_token = next_token(logits, temperature, top_p) print(f"{self.tokenizer.decode_token(current_token)}", end="", flush=True) tokens = prompt_tokens + [current_token] @@ -127,10 +121,7 @@ def generate( # noqa: C901 ) # If the logits aren't already clipped to only contain the last logit, clip them. 
- if self.has_full_logits: - current_token = next_token(logits[:, -1, :], temperature, top_p) - else: - current_token = next_token(logits, temperature, top_p) + current_token = next_token(logits, temperature, top_p) tokens.append(current_token) if current_token == self.tokenizer.eos_id or ( diff --git a/examples/models/llama/runner/native.py b/examples/models/llama/runner/native.py index 06ee8e3e713..62757506f3b 100644 --- a/examples/models/llama/runner/native.py +++ b/examples/models/llama/runner/native.py @@ -41,7 +41,6 @@ def __init__(self, args): max_batch_size=1, use_kv_cache=args.kv_cache, vocab_size=params["vocab_size"], - has_full_logits=args.model in TORCHTUNE_DEFINED_MODELS, ) self.model = _load_for_executorch(args.pte) From 9bd405fa3652de3807d12505bebdd6734fce58a4 Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Wed, 13 Nov 2024 14:11:36 -0800 Subject: [PATCH 36/44] Lint --- examples/models/llama3_2_vision/model.py | 2 +- examples/models/llama3_2_vision/runner/eager.py | 5 +++-- .../models/llama3_2_vision/runner/generation.py | 16 ++++++++-------- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/examples/models/llama3_2_vision/model.py b/examples/models/llama3_2_vision/model.py index c66c1ecfc49..65c11b20322 100644 --- a/examples/models/llama3_2_vision/model.py +++ b/examples/models/llama3_2_vision/model.py @@ -175,7 +175,7 @@ def get_dynamic_shapes(self): # "encoder_input": {0: 1, 1: dim_enc, 2: 4096}, # "encoder_mask": {0: 1, 1: dim, 2: dim_enc}, "mask": {0: batch_size, 1: dim_seq_len, 2: None}, - "input_pos" : {0: batch_size, 1: dim_seq_len}, + "input_pos": {0: batch_size, 1: dim_seq_len}, } else: dynamic_shapes = { diff --git a/examples/models/llama3_2_vision/runner/eager.py b/examples/models/llama3_2_vision/runner/eager.py index ea327ad6cc1..36cc7349f23 100644 --- a/examples/models/llama3_2_vision/runner/eager.py +++ b/examples/models/llama3_2_vision/runner/eager.py @@ -13,9 +13,10 @@ from executorch.examples.models.llama.export_llama_lib import ( _prepare_for_llama_export, build_args_parser as _build_args_parser, - TORCHTUNE_DEFINED_MODELS, ) -from executorch.examples.models.llama3_2_vision.runner.generation import TorchTuneLlamaRunner +from executorch.examples.models.llama3_2_vision.runner.generation import ( + TorchTuneLlamaRunner, +) from executorch.extension.llm.export import LLMEdgeManager diff --git a/examples/models/llama3_2_vision/runner/generation.py b/examples/models/llama3_2_vision/runner/generation.py index c8e24aecb4b..e17760fd852 100644 --- a/examples/models/llama3_2_vision/runner/generation.py +++ b/examples/models/llama3_2_vision/runner/generation.py @@ -4,13 +4,10 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-from abc import ABC, abstractmethod -from typing import List, Optional, TypedDict +from typing import List import torch - -from executorch.extension.llm.tokenizer.utils import get_tokenizer -from executorch.examples.models.llama.runner.generation import LlamaRunner, next_token, sample_top_p +from executorch.examples.models.llama.runner.generation import LlamaRunner, next_token class TorchTuneLlamaRunner(LlamaRunner): @@ -54,13 +51,17 @@ def generate( # noqa: C901 mask = self.causal_mask[None, :seq_len] if self.use_kv_cache: logits = self.forward( - tokens=torch.tensor([prompt_tokens], dtype=torch.long, device=self.device), + tokens=torch.tensor( + [prompt_tokens], dtype=torch.long, device=self.device + ), input_pos=input_pos, mask=mask, ) else: logits = self.forward( - tokens=torch.tensor([prompt_tokens], dtype=torch.long, device=self.device), + tokens=torch.tensor( + [prompt_tokens], dtype=torch.long, device=self.device + ), ) # Only need the last logit. @@ -98,4 +99,3 @@ def generate( # noqa: C901 seq_len += 1 return tokens if echo else tokens[len(prompt_tokens) :] - From 7507002a619d06a4c581ad77db22fb1bb60dcefb Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Wed, 13 Nov 2024 14:50:15 -0800 Subject: [PATCH 37/44] Modularize and update base eager runner --- examples/models/llama/runner/eager.py | 11 ++++-- .../models/llama3_2_vision/runner/eager.py | 39 ++----------------- 2 files changed, 10 insertions(+), 40 deletions(-) diff --git a/examples/models/llama/runner/eager.py b/examples/models/llama/runner/eager.py index 1dedb80324a..89ae44635e6 100644 --- a/examples/models/llama/runner/eager.py +++ b/examples/models/llama/runner/eager.py @@ -6,7 +6,7 @@ import argparse import json -from typing import Optional +from typing import Optional, Type import torch @@ -77,11 +77,10 @@ def build_args_parser() -> argparse.ArgumentParser: return parser -def main() -> None: +def execute_runner(runner_class: Type[LlamaRunner]) -> None: parser = build_args_parser() args = parser.parse_args() - - runner = EagerLlamaRunner(args) + runner = runner_class(args) generated_tokens = ( runner.chat_completion(temperature=args.temperature) if args.chat @@ -95,5 +94,9 @@ def main() -> None: print(f"Generated {len(generated_tokens)} tokens: {generated_tokens}") +def main() -> None: + execute_runner(EagerLlamaRunner) + + if __name__ == "__main__": main() # pragma: no cover diff --git a/examples/models/llama3_2_vision/runner/eager.py b/examples/models/llama3_2_vision/runner/eager.py index 36cc7349f23..c5d91013077 100644 --- a/examples/models/llama3_2_vision/runner/eager.py +++ b/examples/models/llama3_2_vision/runner/eager.py @@ -4,16 +4,13 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-import argparse import json from typing import Optional import torch -from executorch.examples.models.llama.export_llama_lib import ( - _prepare_for_llama_export, - build_args_parser as _build_args_parser, -) +from executorch.examples.models.llama.export_llama_lib import _prepare_for_llama_export +from executorch.examples.models.llama.runner.eager import execute_runner from executorch.examples.models.llama3_2_vision.runner.generation import ( TorchTuneLlamaRunner, ) @@ -48,38 +45,8 @@ def forward( return self.model.forward(tokens=tokens, input_pos=input_pos, mask=mask) -def build_args_parser() -> argparse.ArgumentParser: - parser = _build_args_parser() - - parser.add_argument( - "--prompt", - type=str, - default="Hello", - ) - - parser.add_argument( - "--temperature", - type=float, - default=0, - ) - - return parser - - def main() -> None: - parser = build_args_parser() - args = parser.parse_args() - - runner = EagerLlamaRunner(args) - result = runner.text_completion( - prompt=args.prompt, - temperature=args.temperature, - ) - print( - "Response: \n{response}\n Tokens:\n {tokens}".format( - response=result["generation"], tokens=result["tokens"] - ) - ) + execute_runner(EagerLlamaRunner) if __name__ == "__main__": From e5428de267a07b0e927ebc9c2b4f9129aafdddc0 Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Thu, 14 Nov 2024 09:45:52 -0800 Subject: [PATCH 38/44] Move to subdir --- examples/models/llama3_2_vision/__init__.py | 2 +- examples/models/llama3_2_vision/{ => text_decoder}/model.py | 0 .../llama3_2_vision/{ => text_decoder}/params/demo_config.json | 0 3 files changed, 1 insertion(+), 1 deletion(-) rename examples/models/llama3_2_vision/{ => text_decoder}/model.py (100%) rename examples/models/llama3_2_vision/{ => text_decoder}/params/demo_config.json (100%) diff --git a/examples/models/llama3_2_vision/__init__.py b/examples/models/llama3_2_vision/__init__.py index 3c385703d72..0629abbb35b 100644 --- a/examples/models/llama3_2_vision/__init__.py +++ b/examples/models/llama3_2_vision/__init__.py @@ -4,6 +4,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-from .model import Llama3_2Decoder +from .text_decoder.model import Llama3_2Decoder __all__ = [Llama3_2Decoder] diff --git a/examples/models/llama3_2_vision/model.py b/examples/models/llama3_2_vision/text_decoder/model.py similarity index 100% rename from examples/models/llama3_2_vision/model.py rename to examples/models/llama3_2_vision/text_decoder/model.py diff --git a/examples/models/llama3_2_vision/params/demo_config.json b/examples/models/llama3_2_vision/text_decoder/params/demo_config.json similarity index 100% rename from examples/models/llama3_2_vision/params/demo_config.json rename to examples/models/llama3_2_vision/text_decoder/params/demo_config.json From f61a34701b831746a13a7dfb2118c185b0b28889 Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Thu, 14 Nov 2024 10:38:38 -0800 Subject: [PATCH 39/44] Tarun rev --- extension/llm/export/builder.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index 385daa8ee87..ebc7f02ee1a 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -194,10 +194,10 @@ def export(self) -> "LLMEdgeManager": strict=True, ) else: - print("Exporting with:") - print(f"inputs: {self.example_inputs}") - print(f"kwargs: {self.example_kwarg_inputs}") - print(f"dynamic shapes: {dynamic_shape}") + logging.info("Exporting with:") + logging.info(f"inputs: {self.example_inputs}") + logging.info(f"kwargs: {self.example_kwarg_inputs}") + logging.info(f"dynamic shapes: {dynamic_shape}") exported_module = export_for_training( self.model, self.example_inputs, From 7a0101f413adbf89d46862b8d03743a704dc8b58 Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Thu, 14 Nov 2024 12:49:41 -0800 Subject: [PATCH 40/44] Add automatically generated export tests --- .ci/scripts/gather_test_models.py | 1 + examples/models/__init__.py | 1 + 2 files changed, 2 insertions(+) diff --git a/.ci/scripts/gather_test_models.py b/.ci/scripts/gather_test_models.py index e22e1965678..078561c9d85 100755 --- a/.ci/scripts/gather_test_models.py +++ b/.ci/scripts/gather_test_models.py @@ -25,6 +25,7 @@ "resnet50": "linux.12xlarge", "llava": "linux.12xlarge", "llama3_2_vision_encoder": "linux.12xlarge", + "llama3_2_text_decoder": "linux.12xlarge", # This one causes timeout on smaller runner, the root cause is unclear (T161064121) "dl3": "linux.12xlarge", "emformer_join": "linux.12xlarge", diff --git a/examples/models/__init__.py b/examples/models/__init__.py index d3f2a74f4d9..842b87241cc 100644 --- a/examples/models/__init__.py +++ b/examples/models/__init__.py @@ -19,6 +19,7 @@ "llama2": ("llama", "Llama2Model"), "llama": ("llama", "Llama2Model"), "llama3_2_vision_encoder": ("llama3_2_vision", "FlamingoVisionEncoderModel"), + "llama3_2_text_decoder": ("llama3_2_vision", "Llama3_2Decoder"), "lstm": ("lstm", "LSTMModel"), "mobilebert": ("mobilebert", "MobileBertModelExample"), "mv2": ("mobilenet_v2", "MV2Model"), From 9777e233c77b38d69022459c5c1db36d4a6a69bd Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Thu, 14 Nov 2024 12:52:13 -0800 Subject: [PATCH 41/44] Fix internal pyre warning --- examples/models/llama/export_llama_lib.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index feeacf2bb4a..7ebdf95418d 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -888,6 +888,8 @@ def _load_llama_model( if modelname == "llama3_2_vision": module_name = 
"llama3_2_vision" model_class_name = "Llama3_2Decoder" + else: + raise ValueError(f"{modelname} is not a valid Llama model.") else: raise ValueError(f"{modelname} is not a valid Llama model.") From 1e26f6097995571391178a1b4d8aaa4647325eea Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Thu, 14 Nov 2024 16:13:37 -0800 Subject: [PATCH 42/44] Add executorch runner --- .../models/llama3_2_vision/runner/native.py | 131 ++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 examples/models/llama3_2_vision/runner/native.py diff --git a/examples/models/llama3_2_vision/runner/native.py b/examples/models/llama3_2_vision/runner/native.py new file mode 100644 index 00000000000..9a28c94f9c2 --- /dev/null +++ b/examples/models/llama3_2_vision/runner/native.py @@ -0,0 +1,131 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import json +from typing import Optional + +import torch + +from executorch.examples.models.llama.export_llama_lib import ( + EXECUTORCH_DEFINED_MODELS, + TORCHTUNE_DEFINED_MODELS, +) +from executorch.examples.models.llama3_2_vision.runner.generation import ( + TorchTuneLlamaRunner, +) + +from executorch.extension.pybindings.portable_lib import _load_for_executorch + +# Load custom ops and quantized ops. +from executorch.extension.pybindings import portable_lib # noqa # usort: skip + +# Note: import this after portable_lib +from executorch.extension.llm.custom_ops import sdpa_with_kv_cache # noqa # usort: skip +from executorch.kernels import quantized # noqa + + +class NativeLlamaRunner(TorchTuneLlamaRunner): + """ + Runs llama via ExecuTorch with provided pte file. + """ + + def __init__(self, args): + with open(args.params, "r") as f: + params = json.loads(f.read()) + super().__init__( + tokenizer_path=args.tokenizer, + max_seq_len=args.max_len, + max_batch_size=1, + use_kv_cache=args.kv_cache, + vocab_size=params["vocab_size"], + ) + self.model = _load_for_executorch(args.pte) + self.use_kv_cache = args.kv_cache + + def forward( + self, + tokens: torch.Tensor, + input_pos: Optional[torch.Tensor] = None, + mask: Optional[torch.LongTensor] = None, + ) -> torch.Tensor: + return ( + self.model.forward((tokens, input_pos, mask)) + if self.use_kv_cache + else self.model.forward((tokens,)) + )[0] + + +def build_args_parser() -> argparse.ArgumentParser: + # TODO: merge these with build_args_parser from export_llama_lib. 
+ parser = argparse.ArgumentParser() + + parser.add_argument( + "--model", + default="llama3", + choices=EXECUTORCH_DEFINED_MODELS + TORCHTUNE_DEFINED_MODELS, + ) + + parser.add_argument( + "-f", + "--pte", + type=str, + default=None, + help="path to exported executorch .pte file", + ) + + parser.add_argument( + "-p", "--params", type=str, default=None, help="model params file" + ) + + parser.add_argument( + "-t", + "--tokenizer", + type=str, + default=None, + ) + + parser.add_argument( + "--prompt", + type=str, + default="Hello", + ) + + parser.add_argument( + "--temperature", + type=float, + default=0.6, + ) + + parser.add_argument( + "-kv", + "--kv_cache", + action="store_true", + ) + + parser.add_argument( + "--max_len", + type=int, + default=128, + help="Maximum length of the generated response sequence.", + ) + + return parser + + +def main() -> None: + parser = build_args_parser() + args = parser.parse_args() + runner = NativeLlamaRunner(args) + generated_tokens = runner.text_completion( + prompt=args.prompt, + temperature=args.temperature, + ) + print(f"Response: {generated_tokens}") + + +if __name__ == "__main__": + main() # pragma: no cover From f8f8f06f6f4173c0380ad1879719a7d0080b8733 Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Thu, 14 Nov 2024 17:32:38 -0800 Subject: [PATCH 43/44] Fix test --- examples/models/llama/runner/generation.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/examples/models/llama/runner/generation.py b/examples/models/llama/runner/generation.py index 46033705126..6d0da4bc1e5 100644 --- a/examples/models/llama/runner/generation.py +++ b/examples/models/llama/runner/generation.py @@ -100,10 +100,7 @@ def generate( # noqa: C901 ), ) - if self.has_full_logits: - current_token = next_token(logits[:, -1, :], temperature, top_p) - else: - current_token = next_token(logits, temperature, top_p) + current_token = next_token(logits, temperature, top_p) print(f"{self.tokenizer.decode_token(current_token)}", end="", flush=True) tokens = prompt_tokens + [current_token] From 09e96752a631278a84905e1dc43ecc37d69384bd Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Thu, 14 Nov 2024 18:41:57 -0800 Subject: [PATCH 44/44] Lint --- examples/models/llama/runner/eager.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/models/llama/runner/eager.py b/examples/models/llama/runner/eager.py index 2918e7b0503..89ae44635e6 100644 --- a/examples/models/llama/runner/eager.py +++ b/examples/models/llama/runner/eager.py @@ -13,7 +13,6 @@ from executorch.examples.models.llama.export_llama_lib import ( _prepare_for_llama_export, build_args_parser as _build_args_parser, - TORCHTUNE_DEFINED_MODELS, ) from executorch.examples.models.llama.runner.generation import LlamaRunner from executorch.extension.llm.export.builder import LLMEdgeManager
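
The series above ends with two TorchTune-oriented runners that pass `tokens`, `input_pos`, and `mask` to the model as keyword arguments and pre-build a causal mask up front. The snippet below is a minimal, self-contained sketch of that input bookkeeping only; `prefill_inputs` and `decode_inputs` are hypothetical helper names, and a real run would feed these dicts to the eager or exported model rather than printing shapes.

```python
# Standalone sketch of the mask/input_pos slicing used by the new
# TorchTune-style runner during prefill and per-token decode.
# The helper names here are illustrative, not part of the patches.
import torch

max_seq_len = 8
causal_mask = torch.tril(torch.ones(max_seq_len, max_seq_len, dtype=torch.bool))
input_pos = torch.arange(max_seq_len)

def prefill_inputs(prompt_tokens):
    seq_len = len(prompt_tokens)
    return {
        "tokens": torch.tensor([prompt_tokens], dtype=torch.long),
        "input_pos": input_pos[None, :seq_len],   # positions 0 .. seq_len-1
        "mask": causal_mask[None, :seq_len],      # one mask row per prompt token
    }

def decode_inputs(current_token, seq_len):
    # One new token per step: a single position index and a single mask row.
    return {
        "tokens": torch.tensor([[current_token]], dtype=torch.long),
        "input_pos": input_pos[None, seq_len, None],
        "mask": causal_mask[None, seq_len, None, :],
    }

prompt = [1, 2, 3]
print(prefill_inputs(prompt)["mask"].shape)        # torch.Size([1, 3, 8])
print(decode_inputs(4, len(prompt))["input_pos"])  # tensor([[3]])
```

In the runners themselves these inputs map onto `forward(tokens=..., input_pos=..., mask=...)`: the eager runner calls the TorchTune module directly with keyword arguments, while the native runner passes a positional tuple to the loaded `.pte` module.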
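
A related detail: patch 35 drops the `has_full_logits` switch from the base runner, so the base path assumes the model already returns only the last-position logits, while the TorchTune runner keeps the full `[batch, seq_len, vocab]` output and slices the final step itself with `logits[:, -1, :]`. The illustration below uses random numbers in place of real model output; the vocab size and the greedy `argmax` standing in for the repo's `next_token` helper are assumptions made for the example.

```python
import torch

torch.manual_seed(0)
vocab_size = 16                                # made-up vocab size
full_logits = torch.randn(1, 5, vocab_size)    # [batch, seq_len, vocab]

# TorchTune-style runner: model returns full logits; slice the last position.
last_position = full_logits[:, -1, :]          # [batch, vocab]

# Base runner after patch 35: the model itself is expected to emit only the
# last-position logits, so there is nothing left to slice.
already_clipped = last_position.clone()

# Greedy pick as a stand-in for next_token(...) at temperature 0.
assert torch.argmax(last_position, dim=-1) == torch.argmax(already_clipped, dim=-1)
print(int(torch.argmax(last_position, dim=-1)))
```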