diff --git a/backends/qualcomm/serialization/qc_compiler_spec.fbs b/backends/qualcomm/serialization/qc_compiler_spec.fbs index 8aeaa060a50..2e5c3ce9204 100644 --- a/backends/qualcomm/serialization/qc_compiler_spec.fbs +++ b/backends/qualcomm/serialization/qc_compiler_spec.fbs @@ -18,6 +18,7 @@ enum HtpArch: int { V73 = 73, V75 = 75, V79 = 79, + V81 = 81, } table HtpInfo { @@ -43,6 +44,8 @@ enum QcomChipset: int { SXR1230P = 45, SXR2230P = 53, SXR2330P = 75, + SM8850 = 87, + SM8735 = 85, } /// Indicate the information of the specified SoC. diff --git a/backends/qualcomm/serialization/qc_schema.py b/backends/qualcomm/serialization/qc_schema.py index f3b9e2cc1a5..7907aa46a2b 100644 --- a/backends/qualcomm/serialization/qc_schema.py +++ b/backends/qualcomm/serialization/qc_schema.py @@ -27,6 +27,7 @@ class HtpArch(IntEnum): V73 = 73 V75 = 75 V79 = 79 + V81 = 81 @dataclass @@ -49,6 +50,8 @@ class QcomChipset(IntEnum): SXR1230P = 45 # v73 SXR2230P = 53 # v69 SXR2330P = 75 # v79 + SM8850 = 87, # v81 + SM8735 = 85, # v73 @dataclass @@ -69,6 +72,8 @@ class SocInfo: QcomChipset.SXR1230P: SocInfo(QcomChipset.SXR1230P, HtpInfo(HtpArch.V73, 2)), QcomChipset.SXR2230P: SocInfo(QcomChipset.SXR2230P, HtpInfo(HtpArch.V69, 8)), QcomChipset.SXR2330P: SocInfo(QcomChipset.SXR2330P, HtpInfo(HtpArch.V79, 8)), + QcomChipset.SM8850: SocInfo(QcomChipset.SM8850, HtpInfo(HtpArch.V81, 8)), + QcomChipset.SM8735: SocInfo(QcomChipset.SM8735, HtpInfo(HtpArch.V73, 8)), } diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py index be4e86de50f..5d2ef263a47 100644 --- a/backends/qualcomm/utils/utils.py +++ b/backends/qualcomm/utils/utils.py @@ -1099,6 +1099,8 @@ def get_soc_to_arch_map(): "SXR1230P": HtpArch.V73, "SXR2230P": HtpArch.V69, "SXR2330P": HtpArch.V79, + "SM8850": HtpArch.V81, + "SM8735": HtpArch.V73, } @@ -1115,6 +1117,8 @@ def get_soc_to_chipset_map(): "SXR1230P": QcomChipset.SXR1230P, "SXR2230P": QcomChipset.SXR2230P, "SXR2330P": QcomChipset.SXR2330P, + "SM8850": QcomChipset.SM8850, + "SM8735": QcomChipset.SM8735, } diff --git a/benchmark.py b/benchmark.py new file mode 100644 index 00000000000..24c7eede835 --- /dev/null +++ b/benchmark.py @@ -0,0 +1,1257 @@ +import argparse +import numpy as np +import os +import random +import subprocess +import torch + +from executorch.examples.models.llama.evaluate.eager_eval import EagerEvalWrapper + + +### GLOBALS +qnn_sdk = os.getenv("QNN_SDK_ROOT") +workspace = "/data/local/tmp/et_ga_benchmark" +memory_script_file = "peak_memory.sh" +perf_file = "statistics.txt" +seed = 1126 +### + + +def image_classification_eval( + backend, + soc_model, + device, + host, + pte_path, + module, + inputs, + targets, + artifact_dir, +): + from executorch.examples.qualcomm.utils import ( + make_output_dir, + SimpleADB, + topk_accuracy, + ) + import numpy as np + from pathlib import Path + + adb = SimpleADB( + qnn_sdk=qnn_sdk, + build_path="build-android", + pte_path=pte_path, + workspace=f"/data/local/tmp/executorch/{Path(pte_path).stem}", + device_id=device, + host_id=host, + soc_model=soc_model, + ) + files = ["build-xnnpack/executor_runner"] if backend == "xnn" else None + custom_commands = ( + f"cd {adb.workspace} && ./executor_runner --model_path " + f"{os.path.basename(adb.pte_path[0])} --input_list_path input_list.txt" + if backend == "xnn" else None + ) + adb.push(inputs=inputs, files=files) + adb.execute(custom_runner_cmd=custom_commands) + + # collect output data + output_data_folder = f"{artifact_dir}/outputs" + make_output_dir(output_data_folder) + + 
adb.pull(output_path=artifact_dir) + + # top-k analysis + predictions, goldens = [], [] + for input in inputs: + goldens.append(module(*input).logits.detach().numpy()) + + for i in range(len(inputs)): + predictions.append( + np.fromfile( + os.path.join(output_data_folder, f"output_{i}_0.raw"), dtype=np.float32 + ) + ) + + k_val = [1, 5] + topk = [topk_accuracy(goldens, targets, k).item() for k in k_val] + print("cpu:") + for i, k in enumerate(k_val): + print(f"top_{k}->{topk[i]}%") + topk = [topk_accuracy(predictions, targets, k).item() for k in k_val] + print("device:") + for i, k in enumerate(k_val): + print(f"top_{k}->{topk[i]}%") + + +def masked_lm_eval( + backend, + soc_model, + device, + host, + pte_path, + module, + inputs, + targets, + artifact_dir, +): + from executorch.examples.qualcomm.utils import SimpleADB, make_output_dir + import numpy as np + from pathlib import Path + import evaluate + + adb = SimpleADB( + qnn_sdk=qnn_sdk, + build_path="build-android", + pte_path=pte_path, + workspace=f"/data/local/tmp/executorch/{Path(pte_path).stem}", + device_id=device, + host_id=host, + soc_model=soc_model, + ) + files = ["build-xnnpack/executor_runner"] if backend == "xnn" else None + custom_commands = ( + f"cd {adb.workspace} && ./executor_runner --model_path" + f" {os.path.basename(adb.pte_path[0])} --input_list_path input_list.txt" + if backend == "xnn" else None + ) + if backend == "xnn": + for i, input in enumerate(inputs): + inputs[i] = tuple(inp.to(torch.long) for inp in input) + + adb.push(inputs=inputs, files=files) + adb.execute(custom_runner_cmd=custom_commands) + + # collect output data + output_data_folder = f"{artifact_dir}/outputs" + make_output_dir(output_data_folder) + + adb.pull(output_path=artifact_dir) + + labels, goldens, predictions = [], [], [] + for i in range(len(inputs)): + indice = [i for i, x in enumerate(targets[i]) if x != -100] + labels.extend(targets[i][indice].tolist()) + golden = module(*inputs[i]).logits.detach().numpy().argmax(axis=-1) + goldens.extend(golden[0, indice].tolist()) + prediction = ( + np.fromfile( + os.path.join(output_data_folder, f"output_{i}_0.raw"), dtype=np.float32 + ) + .reshape([1, inputs[0][0].shape[1], -1]) + .argmax(axis=-1) + ) + predictions.extend(prediction[0, indice].tolist()) + + metric = evaluate.load("accuracy") + results = metric.compute(predictions=goldens, references=labels) + print(f"cpu accuracy: {results['accuracy']}") + results = metric.compute(predictions=predictions, references=labels) + print(f"device accuracy: {results['accuracy']}") + + +def t5_eval( + backend, + soc_model, + device, + host, + pte_path, + module, + inputs, + targets, + artifact_dir, +): + from executorch.examples.qualcomm.utils import ( + evaluate_squad, make_output_dir, SimpleADB + ) + from executorch.examples.qualcomm.oss_scripts.t5.t5 import Seq2SeqLMExportableModulePipeline + from pathlib import Path + from transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small") + _, _, spiece_model, _, _ = tokenizer.save_pretrained(artifact_dir) + max_seq_len = module.decoder.max_static_cache_length + + workspace = f"/data/local/tmp/executorch/{Path(pte_path).stem}" + adb = SimpleADB( + qnn_sdk=qnn_sdk, + build_path="build-android", + pte_path=pte_path, + workspace=workspace, + device_id=device, + host_id=host, + soc_model=soc_model, + runner="examples/qualcomm/oss_scripts/t5/qnn_t5_runner", + ) + runner_args = " ".join( + [ + f"--tokenizer_model_path {os.path.basename(spiece_model)}", + f"--model_path 
{os.path.basename(pte_path)}", + f"--seq_len {max_seq_len}", + "--output_folder_path outputs", + ] + ) + runner_cmd = " ".join( + [ + f"cd {workspace} &&", + f"./{'qnn_t5_runner' if backend == 'qnn' else 'xnn_t5_runner'}", + runner_args, + ] + ) + files = [spiece_model] + if backend == "xnn": + files.append("build-xnnpack/xnn_t5_runner") + + adb.push(inputs=inputs, files=files) + adb.execute(custom_runner_cmd=runner_cmd) + + # collect output data + output_data_folder = f"{artifact_dir}/outputs" + make_output_dir(output_data_folder) + + outputs = [] + def post_process(): + for i in range(len(inputs)): + with open(f"{artifact_dir}/outputs/output_{i}.txt", "r") as f: + outputs.append(f.read()) + adb.pull(output_path=artifact_dir, callback=post_process) + + # cpu inference + goldens = [] + with torch.no_grad(): + for input in inputs: + # run encoder + hidden_state = module.encoder(*input[:-1]) + _, attn_mask, _, _, pos = module.decoder.get_example_inputs() + tokens = [input[-1].item()] + # generate tokens one by one + for _ in range(max_seq_len - 1): + # run decoder for next token prediction + logits = module.decoder( + torch.tensor([[tokens[-1]]], dtype=torch.long), + attn_mask, + hidden_state, + input[1], + pos, + ) + + # get next token + tokens.append(torch.argmax(logits, dim=-1).item()) + pos += 1 + attn_mask[..., pos] = 0 + + # Check if EOS token + if tokens[-1] == module.decoder.config.eos_token_id: + break + goldens.append(tokenizer.decode(tokens[1:-1])) + + print("cpu accuracy >") + Seq2SeqLMExportableModulePipeline.evaluate_with_ground_truth( + tokenizer, goldens, targets, evaluate_squad + ) + print("device accuracy >") + Seq2SeqLMExportableModulePipeline.evaluate_with_ground_truth( + tokenizer, outputs, targets, evaluate_squad + ) + + +def whisper_eval( + backend, + soc_model, + device, + host, + pte_path, + module, + inputs, + targets, + artifact_dir, +): + from executorch.examples.qualcomm.utils import make_output_dir, SimpleADB + from executorch.examples.qualcomm.oss_scripts.whisper.whisper import eval_metric + from executorch.examples.qualcomm.oss_scripts.whisper.whisper_model import EncoderDecoderCache, DynamicCache + from pathlib import Path + from transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained("openai/whisper-tiny") + tokenizer_json = tokenizer.save_pretrained(args.artifact)[-1] + max_seq_len = module.max_seq_length + + workspace = f"/data/local/tmp/executorch/{Path(pte_path).stem}" + adb = SimpleADB( + qnn_sdk=qnn_sdk, + build_path="build-android", + pte_path=pte_path, + workspace=workspace, + device_id=device, + host_id=host, + soc_model=soc_model, + runner="examples/qualcomm/oss_scripts/whisper/qnn_whisper_runner", + ) + runner_args = " ".join( + [ + f"--model_path {os.path.basename(pte_path)}", + f"--tokenizer_json_path {os.path.basename(tokenizer_json)}", + "--input_list_path input_list.txt", + f"--seq_len {max_seq_len}", + "--output_folder_path outputs", + ] + ) + runner_cmd = " ".join( + [ + f"cd {workspace} &&", + f"./{'qnn_whisper_runner' if backend == 'qnn' else 'xnn_whisper_runner'}", + runner_args, + ] + ) + files = [tokenizer_json] + if backend == "xnn": + files.append("build-xnnpack/xnn_whisper_runner") + + adb.push(inputs=inputs, files=files) + adb.execute(custom_runner_cmd=runner_cmd) + + # collect output data + output_data_folder = f"{artifact_dir}/outputs" + make_output_dir(output_data_folder) + + outputs = [] + def post_process(): + for i in range(len(inputs)): + with open(f"{artifact_dir}/outputs/output_{i}.txt", "r") as f: 
+ outputs.append(f.read()) + adb.pull(output_path=artifact_dir, callback=post_process) + + # cpu inference + decoder_start_token_id = getattr(module.config, "decoder_start_token_id", 50258) + eos_token_id = getattr(module.config, "eos_token_id", 50257) + goldens = [] + with torch.no_grad(): + for input in inputs: + # run encoder + hidden_state = module.whisper_encoder(*input) + _, attn_mask, _, pos = module.whisper_decoder.get_example_inputs() + tokens = [decoder_start_token_id] + # generate tokens one by one + for _ in range(max_seq_len - 1): + # run decoder for next token prediction + logits = module.whisper_decoder( + torch.tensor([[tokens[-1]]], dtype=torch.long), + attn_mask, + hidden_state, + pos, + ) + + # get next token + tokens.append(torch.argmax(logits, dim=-1).item()) + pos += 1 + attn_mask[..., pos] = 0 + + # Check if EOS token + if tokens[-1] == eos_token_id: + break + + module.whisper_decoder.static_cache.reset() + module.whisper_decoder.cache = EncoderDecoderCache( + module.whisper_decoder.static_cache, DynamicCache() + ) + goldens.append(tokenizer.decode(tokens[1:])) + + print(f"cpu accuracy >\n{eval_metric(goldens, targets)}") + print(f"device accuracy >\n{eval_metric(outputs, targets)}") + + +class RunnerEvalWrapper(EagerEvalWrapper): + """ + A wrapper class to run PPL scores on device. + """ + + def __init__( + self, + backend, + soc_model, + device, + host, + pte_path, + artifact_dir, + decoder_model, + tokenizer, + runtime_tokenizer_path, + ): + from pathlib import Path + from executorch.exir._serialize._program import deserialize_pte_binary + from executorch.examples.qualcomm.utils import SimpleADB + + self.pte_path = pte_path + with open(pte_path, "rb") as f: + program_data = f.read() + program = deserialize_pte_binary(program_data) + # Retrieve vocab_size from get_metadata under static_llama that is passed to edge manager + self.output_vocab_size = None + pte_max_seq_len = None + self.logits_scale = None + self.logits_zero_point = None + self.kv_io_bit_width = 32 + self.et_backend = backend + for method in program.execution_plan: + # Don't use tokenizer.n_words, the numbers are off once calling get_tokenizer() + if method.name == "get_vocab_size": + # pyre-ignore + self.output_vocab_size = method.values[0].val.int_val + if method.name == "get_max_seq_len": + # pyre-ignore + pte_max_seq_len = method.values[0].val.int_val + if method.name == "get_logits_scale": + self.logits_scale = method.values[0].val.double_val + if method.name == "get_logits_zero_point": + self.logits_zero_point = method.values[0].val.int_val + if method.name == "get_kv_io_bit_width": + self.kv_io_bit_width = method.values[0].val.int_val + + # FP has no scale/zero_point, use following values, which is equivalent to not performing dequantize. + if self.kv_io_bit_width == 32: + self.logits_scale = 1 + self.logits_zero_point = 0 + elif self.logits_scale is None or self.logits_zero_point is None: + raise RuntimeError( + "Unable to find scale/offset. The .pte file might be deprecated. 
Please generate a new .pte file" + ) + + assert self.output_vocab_size is not None, "Couldn't find the vocab size" + assert pte_max_seq_len is not None, "Couldn't find the max_seq_len from pte" + self.decoder_model = decoder_model + self.max_seq_length = pte_max_seq_len + self.runtime_tokenizer_path = runtime_tokenizer_path + self.artifact_dir = artifact_dir + self.output_dir = args.artifact + self.workspace = f"/data/local/tmp/executorch/{decoder_model}" + self.adb = SimpleADB( + qnn_sdk=os.getenv("QNN_SDK_ROOT"), + build_path="build-android", + pte_path=pte_path, + workspace=self.workspace, + device_id=device, + host_id=host, + soc_model=soc_model, + runner="examples/qualcomm/oss_scripts/llama/qnn_llama_runner", + ) + files = [self.runtime_tokenizer_path] + if backend == "xnn": + files.append("build-xnnpack/xnn_llama_runner") + self.adb.push(inputs=[], files=files) + # n seq len = n-1 cache len, so we len(inps) = n-1 during _model_call + # pyre-ignore + super().__init__(None, tokenizer, self.max_seq_length - 1) + + def _model_call(self, inps): + from executorch.examples.qualcomm.oss_scripts.llama import DECODER_MODEL_VERSION + from executorch.examples.qualcomm.utils import make_output_dir + + input_file_name = f"{self.artifact_dir}/input_tokens.raw" + inps = inps.to(torch.uint64).numpy() + inps.tofile(input_file_name) + + outputs_path = "outputs/outputs.txt" + dump_logits_path = "outputs/all_logit.raw" + performance_output_path = "outputs/inference_speed.txt" + runner_cmd = " ".join( + [ + f"cd {self.workspace} &&", + f"./{self.et_backend}_llama_runner", + f"--decoder_model_version {DECODER_MODEL_VERSION[self.decoder_model]}", + f"--tokenizer_path {os.path.basename(self.runtime_tokenizer_path)}", + f"--model_path {os.path.basename(self.pte_path)}", + f"--seq_len {self.max_seq_length}", + f"--output_path {outputs_path}", + f"--performance_output_path {performance_output_path}", + f"--kv_updater SmartMask", + f"--eval_mode 0", + "--temperature 0", + f"--dump_logits_path {dump_logits_path}", + f"--tokenized_prompt {os.path.basename(input_file_name)}", + ] + ) + + self.adb.push(inputs=[], files=[input_file_name], init_env=False) + self.adb.execute(custom_runner_cmd=runner_cmd) + output_data_folder = f"{self.output_dir}/outputs" + make_output_dir(output_data_folder) + output_tensor_list = [] + + def post_process(): + with open(f"{self.artifact_dir}/{dump_logits_path}", "r") as f: + if self.kv_io_bit_width == 32: + output_tensor = torch.from_numpy( + np.fromfile(f.name, dtype=np.float32).reshape( + 1, -1, self.output_vocab_size + ) + ) + output_tensor_list.append(output_tensor) + else: + output_tensor = torch.from_numpy( + np.fromfile(f.name, dtype=np.uint16).reshape( + 1, -1, self.output_vocab_size + ) + ) + output_tensor = ( + output_tensor.to(torch.float32) - self.logits_zero_point + ) * self.logits_scale + output_tensor_list.append(output_tensor) + + # simple_eval will run multiple rounds, use last run for inference speed + with open(f"{self.artifact_dir}/{performance_output_path}", "r") as f: + self.inference_speed = float(f.read()) + + self.adb.pull(output_path=self.output_dir, callback=post_process) + return output_tensor_list[0] + + +def llm_eval( + backend, + soc_model, + device, + host, + pte_path, + module, + decoder_model, + decoder_model_config, + artifact_dir, + **kwargs, +): + import json + from executorch.examples.qualcomm.oss_scripts.llama.decoder_utils import ( + GraphModuleCalibrationWrapper, smart_mask_updater + ) + from pytorch_tokenizers import get_tokenizer, 
TiktokenTokenizer + from transformers import AutoTokenizer + try: + from lm_eval.evaluator import simple_evaluate + except ImportError: + raise ImportError( + "Please install the llm eval dependency via examples/models/llama/install_requirements.sh" + ) + + # Tokenizer related + if "llama3_2" in decoder_model: + tokenizer = get_tokenizer(kwargs["tokenizer_model"]) + assert isinstance( + tokenizer, TiktokenTokenizer + ), f"Wrong tokenizer provided for llama3_2." + runtime_tokenizer_path = args.tokenizer_model + else: + model_id = decoder_model_config.repo_id + tokenizer = AutoTokenizer.from_pretrained(model_id) + tokenizer_artifacts = tokenizer.save_pretrained(artifact_dir) + tokenizer_config = tokenizer_artifacts[0] + runtime_tokenizer_path = tokenizer_artifacts[-1] + tokenizer = get_tokenizer(runtime_tokenizer_path, tokenizer_config) + + if decoder_model == "phi_4_mini": + with open(runtime_tokenizer_path, "r+") as file: + data = json.load(file) + data["pre_tokenizer"]["pretokenizers"][-2]["invert"] = False + file.seek(0) + json.dump(data, file, indent=4) + file.truncate() + + # on device + # Generate the eval wrapper + device_eval_wrapper = RunnerEvalWrapper( + backend=backend, + soc_model=soc_model, + device=device, + host=host, + pte_path=pte_path, + artifact_dir=artifact_dir, + decoder_model=decoder_model, + tokenizer=tokenizer, + runtime_tokenizer_path=runtime_tokenizer_path, + ) + # Evaluate the model on device + with torch.no_grad(): + device_eval_results = simple_evaluate( + model=device_eval_wrapper, + tasks=["wikitext"], + num_fewshot=None, + limit=1, + ) + + # on host + # Generate the eval wrapper + cpu_eval_wrapper = GraphModuleCalibrationWrapper( + model=module, + tokenizer=tokenizer, + max_seq_length=1024, + ar_len=1, + use_kv_cache=True, + get_example_inputs=module.get_example_inputs, + kv_updater=smart_mask_updater, + use_i64_token=False, + seq_mse_candidates=0, + ) + # Evaluate the model on device + with torch.no_grad(): + cpu_eval_results = simple_evaluate( + model=cpu_eval_wrapper, + tasks=["wikitext"], + num_fewshot=None, + limit=1, + ) + + print("cpu accuracy >") + print(cpu_eval_results["results"]["wikitext"]["word_perplexity,none"]) + print("device accuracy >") + print(device_eval_results["results"]["wikitext"]["word_perplexity,none"]) + + +def get_model_dispatcher(dataset_path, **kwargs): + from transformers import AutoModelForMaskedLM, AutoModelForImageClassification + from executorch.examples.qualcomm.utils import ( + get_imagenet_dataset, get_masked_language_model_dataset + ) + + def get_masked_lm_sample_input(pretrained, data_size=100): + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(pretrained) + return get_masked_language_model_dataset(dataset_path, tokenizer, data_size) + + def get_albert(): + pretrained = "albert/albert-base-v2" + inputs, targets = get_masked_lm_sample_input(pretrained) + module = AutoModelForMaskedLM.from_pretrained(pretrained).eval() + return module, inputs, targets, masked_lm_eval + + def get_bert(): + pretrained = "google-bert/bert-base-uncased" + inputs, targets = get_masked_lm_sample_input(pretrained) + module = AutoModelForMaskedLM.from_pretrained(pretrained).eval() + return module, inputs, targets, masked_lm_eval + + def get_cvt(): + inputs, targets = get_imagenet_dataset(dataset_path, 100, (224, 224)) + module = AutoModelForImageClassification.from_pretrained("microsoft/cvt-13").eval() + return module, inputs, targets, image_classification_eval + + def get_deit(): + inputs, targets = 
get_imagenet_dataset(dataset_path, 100, (224, 224)) + module = AutoModelForImageClassification.from_pretrained("facebook/deit-base-distilled-patch16-224").eval() + return module, inputs, targets, image_classification_eval + + def get_efficientnet(): + inputs, targets = get_imagenet_dataset(dataset_path, 100, (224, 224)) + module = AutoModelForImageClassification.from_pretrained("google/efficientnet-b0").eval() + return module, inputs, targets, image_classification_eval + + def get_eurobert(): + pretrained = "EuroBERT/EuroBERT-210m" + inputs, targets = get_masked_lm_sample_input(pretrained) + module = AutoModelForMaskedLM.from_pretrained(pretrained, trust_remote_code=True).eval() + return module, inputs, targets, masked_lm_eval + + def get_distilbert(): + pretrained = "distilbert/distilbert-base-uncased" + inputs, targets = get_masked_lm_sample_input(pretrained) + module = AutoModelForMaskedLM.from_pretrained(pretrained).eval() + return module, inputs, targets, masked_lm_eval + + def get_dit(): + from executorch.examples.qualcomm.oss_scripts.dit import get_rvlcdip_dataset + inputs, targets = get_rvlcdip_dataset(100) + module = AutoModelForImageClassification.from_pretrained("microsoft/dit-base-finetuned-rvlcdip").eval() + return module, inputs, targets, image_classification_eval + + def get_focalnet(): + inputs, targets = get_imagenet_dataset(dataset_path, 100, (224, 224)) + module = AutoModelForImageClassification.from_pretrained("microsoft/focalnet-tiny").eval() + return module, inputs, targets, image_classification_eval + + def get_mobilevit_v1(): + import executorch.examples.qualcomm.oss_scripts.mobilevit_v1 as mvit1 + inputs, targets = mvit1.get_imagenet_dataset(dataset_path, 100) + module = AutoModelForImageClassification.from_pretrained("apple/mobilevit-xx-small").eval() + return module, inputs, targets, image_classification_eval + + def get_mobilevit_v2(): + import executorch.examples.qualcomm.oss_scripts.mobilevit_v2 as mvit2 + inputs, targets = mvit2.get_imagenet_dataset(dataset_path, 100) + module = AutoModelForImageClassification.from_pretrained("apple/mobilevitv2-1.0-imagenet1k-256").eval() + return module, inputs, targets, image_classification_eval + + def get_pvt(): + inputs, targets = get_imagenet_dataset(dataset_path, 100, (224, 224)) + module = AutoModelForImageClassification.from_pretrained("Zetatech/pvt-tiny-224").eval() + return module, inputs, targets, image_classification_eval + + def get_roberta(): + pretrained = "xlm-roberta-base" + inputs, targets = get_masked_lm_sample_input(pretrained) + module = AutoModelForMaskedLM.from_pretrained(pretrained).eval() + return module, inputs, targets, masked_lm_eval + + def get_swin(): + inputs, targets = get_imagenet_dataset(dataset_path, 100, (224, 224)) + module = AutoModelForImageClassification.from_pretrained("microsoft/swin-tiny-patch4-window7-224").eval() + return module, inputs, targets, image_classification_eval + + def get_t5(): + from executorch.examples.qualcomm.utils import get_seq2seq_dataset_from_squad_csv + from executorch.examples.qualcomm.oss_scripts.t5.t5 import T5 + from transformers import AutoModelForSeq2SeqLM, AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small") + model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small").eval() + max_hidden_seq_length = 384 + max_cache_length = 512 + module = T5( + model, + tokenizer, + max_hidden_seq_length=max_hidden_seq_length, + max_cache_length=max_cache_length, + ) + inputs, targets = get_seq2seq_dataset_from_squad_csv( + 
args.dataset, + tokenizer, + 100, + max_hidden_seq_length=max_hidden_seq_length, + ) + return module, inputs, targets, t5_eval + + def get_whisper(): + from executorch.examples.qualcomm.oss_scripts.whisper.whisper import ( + get_dataset, Whisper + ) + from transformers import AutoModelForSpeechSeq2Seq + module = ( + AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-tiny") + .to("cpu") + .eval() + ) + max_cache_length = 1024 + max_seq_length = 1024 + batch_size = 1 + module = Whisper( + module, + batch_size=batch_size, + max_cache_length=max_cache_length, + max_seq_length=max_seq_length, + ) + inputs, targets = get_dataset(100) + return module, inputs, targets, whisper_eval + + def get_static_llama(decoder_model, decoder_model_config, **kwargs): + import json + from executorch.examples.qualcomm.oss_scripts.llama import LLM_VARIANT_ARCHS + from executorch.examples.qualcomm.oss_scripts.llama.llama import ( + download_and_convert_hf_checkpoint, + ) + from executorch.examples.qualcomm.oss_scripts.llama.model.static_llama import ( + LlamaModel, + ModelArgs, + ) + if "params" in kwargs: + params_path = kwargs["params"] + else: + params_path = decoder_model_config.params_path + with open(params_path) as f: + kv_config = ModelArgs(**json.load(f)) + + kv_config.max_batch_size = 1 + kv_config.max_seq_len = 1024 + kv_config.use_kv_cache = True + kv_config.enable_r3 = decoder_model_config.r3 + kv_config.kv_io_bit_width = decoder_model_config.get_kv_io_bit_width() + kv_config.enable_masked_softmax = decoder_model_config.masked_softmax + + extra_kwargs = {} + if decoder_model == "gemma3-1b": + from transformers import Gemma3Config + + hf_config = Gemma3Config.from_pretrained(decoder_model_config.repo_id) + extra_kwargs["layer_types"] = hf_config.text_config.layer_types + extra_kwargs["rope_local_base_freq"] = ( + hf_config.text_config.rope_local_base_freq + ) + extra_kwargs["sliding_window"] = hf_config.sliding_window + + with torch.device("meta"): + llama_instance = LLM_VARIANT_ARCHS.get( + decoder_model, LlamaModel)( + kv_config, + ar_len=1, + output_new_cache_only=True, + output_cache=True, + use_i64_token=False, + **extra_kwargs, + ) + if "checkpoint" not in kwargs: # HF models + checkpoint = download_and_convert_hf_checkpoint( + decoder_model_config.repo_id, + decoder_model_config.convert_weights.__func__, + ) + state_dict = torch.load( + checkpoint, weights_only=True, map_location="cpu", mmap=True + ) + if decoder_model == "gemma3-1b": + for k, v in state_dict.items(): + if "norm" not in k: + continue + # Llama does x.to(float16) * w whilst Gemma3 is (x * w).to(float16) + # See https://github.com/huggingface/transformers/pull/29402 + state_dict[k] = v.float() + torch.ones(v.shape, dtype=torch.float32) + else: + state_dict = torch.load( + kwargs["checkpoint"], weights_only=True, map_location="cpu", mmap=True + ) + + if decoder_model_config.transform_weight: + # Change to HuggingFace weight to improve the performance of RoPE in HTP backend. 
+ def permute(w, heads): + dim_0 = w.size(0) + dim_1 = w.size(1) + return ( + w.view(heads, dim_0 // heads // 2, 2, dim_1) + .transpose(1, 2) + .reshape(dim_0, dim_1) + ) + + for layer_i in range(llama_instance.n_layers): + state_dict[f"layers.{layer_i}.attention.wq.weight"] = permute( + state_dict[f"layers.{layer_i}.attention.wq.weight"], llama_instance.n_heads + ) + state_dict[f"layers.{layer_i}.attention.wk.weight"] = permute( + state_dict[f"layers.{layer_i}.attention.wk.weight"], llama_instance.n_kv_heads + ) + + llama_instance.load_state_dict(state_dict, strict=True, assign=True) + for layer in llama_instance.layers: + if getattr(layer.attention, "prepare_sha", None): + layer.attention.prepare_sha() + if getattr(layer.feed_forward, "prepare_feedfoward_conv", None): + layer.feed_forward.prepare_feedfoward_conv() + + return llama_instance.to(torch.float32) + + def get_decoder_model(model_name, **kwargs): + from executorch.examples.qualcomm.oss_scripts.llama import SUPPORTED_LLM_MODELS + decoder_model_config = SUPPORTED_LLM_MODELS[model_name] + llama_instance = get_static_llama(model_name, decoder_model_config, **kwargs) + return llama_instance, model_name, decoder_model_config, llm_eval + + def get_qwen2_5_0_5b(): + return get_decoder_model("qwen2_5-0_5b") + + def get_qwen2_5_1_5b(): + return get_decoder_model("qwen2_5-1_5b") + + def get_qwen3_0_6b(): + return get_decoder_model("qwen3-0_6b") + + def get_qwen3_1_7b(): + return get_decoder_model("qwen3-1_7b") + + def get_smollm2_135m(): + return get_decoder_model("smollm2_135m") + + def get_smollm3_3b(): + return get_decoder_model("smollm3-3b") + + def get_phi_4_mini(): + return get_decoder_model("phi_4_mini") + + def get_llama3_2_1b_instruct(params, tokenizer_model, checkpoint): + return get_decoder_model( + "llama3_2-1b_instruct", + params=params, + tokenizer_model=tokenizer_model, + checkpoint=checkpoint + ) + + def get_llama3_2_3b_instruct(params, tokenizer_model, checkpoint): + return get_decoder_model( + "llama3_2-3b_instruct", + params=params, + tokenizer_model=tokenizer_model, + checkpoint=checkpoint + ) + + def get_gemma3_1b(): + return get_decoder_model("gemma3-1b") + + model_dict = { + "albert": get_albert, + "bert": get_bert, + "cvt": get_cvt, + "deit": get_deit, + "dit": get_dit, + "distilbert": get_distilbert, + "efficientnet": get_efficientnet, + "eurobert": get_eurobert, + "focalnet": get_focalnet, + "gemma3-1b": get_gemma3_1b, + "llama3_2-1b_instruct": get_llama3_2_1b_instruct, + "llama3_2-3b_instruct": get_llama3_2_3b_instruct, + "mobilevit_v1": get_mobilevit_v1, + "mobilevit_v2": get_mobilevit_v2, + "phi_4_mini": get_phi_4_mini, + "pvt": get_pvt, + "qwen2_5-0_5b": get_qwen2_5_0_5b, + "qwen2_5-1_5b": get_qwen2_5_1_5b, + "qwen3-0_6b": get_qwen3_0_6b, + "qwen3-1_7b": get_qwen3_1_7b, + "roberta": get_roberta, + "smollm2_135m": get_smollm2_135m, + "smollm3-3b": get_smollm3_3b, + "swin": get_swin, + "t5": get_t5, + "whisper": get_whisper, + } + return model_dict + + +def get_artifacts(backend, pte_path, soc_model, target_model, **kwargs): + from executorch.examples.qualcomm.oss_scripts.llama.decoder_constants import ( + DECODER_MODEL_VERSION, + ) + from executorch.backends.qualcomm.utils.utils import get_soc_to_arch_map + + htp_arch = get_soc_to_arch_map()[soc_model] + + def get_build_dir(backend): + build_dir = { + "qnn": "build-android", + "xnn": "build-xnnpack", + } + return build_dir[backend] + + memory_script = """$@ 2> /dev/null & +PROCESS=$(echo $1 | sed -e 's/^\.\///g') +PEAK_MEM=0 +SAMPLES=0 +TOTAL=0 +while true; do 
+ PID=$(pidof $PROCESS) + if [ "$PID" != "" ]; then + DMA=$(dmabuf_dump $PID | grep "PROCESS TOTAL" | awk '{ print $3 }') + PSS=$(dumpsys meminfo -s $PID | grep "TOTAL PSS" | awk '{ print $3 }') + if [ "$PSS" == "" ]; then + continue + fi + CURRENT=$(($DMA+$PSS)) + if [ CURRENT -gt PEAK_MEM ]; then + PEAK_MEM=$CURRENT + fi + SAMPLES=$(awk -v s="$SAMPLES" 'BEGIN { print s + 1 }') + TOTAL=$(awk -v t="$TOTAL" -v c="$CURRENT" 'BEGIN { print t + c }') + else + break + fi +done +echo "peak_mem: $PEAK_MEM" >> statistics.txt +AVG_MEM=$(awk -v total="$TOTAL" -v samples="$SAMPLES" 'BEGIN { printf "%.3f", total / samples }') +echo "avg_mem: $AVG_MEM" >> statistics.txt + """ + with open(memory_script_file, "w") as f: + f.write(memory_script) + + runner = { + "qnn": f"{get_build_dir(backend)}/examples/qualcomm/executor_runner/qnn_executor_runner", + "xnn": f"{get_build_dir(backend)}/executor_runner", + } + + artifacts = { + "qnn": [ + pte_path, + f"{qnn_sdk}/lib/aarch64-android/libQnnHtp.so", + ( + f"{qnn_sdk}/lib/hexagon-v{htp_arch}/" + f"unsigned/libQnnHtpV{htp_arch}Skel.so" + ), + (f"{qnn_sdk}/lib/aarch64-android/" f"libQnnHtpV{htp_arch}Stub.so"), + f"{qnn_sdk}/lib/aarch64-android/libQnnHtpPrepare.so", + f"{qnn_sdk}/lib/aarch64-android/libQnnSystem.so", + f"{get_build_dir(backend)}/backends/qualcomm/libqnn_executorch_backend.so", + f"{qnn_sdk}/lib/aarch64-android/libQnnModelDlc.so", + runner[backend], + memory_script_file, + ], + "xnn": [ + pte_path, + runner[backend], + memory_script_file, + ], + } + + if target_model in DECODER_MODEL_VERSION: + llm_tokenizer = kwargs.get("tokenizer_model", f"{os.path.dirname(pte_path)}/tokenizer.json") + if backend == "qnn": + artifacts[backend].append(f"{get_build_dir(backend)}/examples/qualcomm/oss_scripts/llama/{backend}_llama_runner") + elif backend == "xnn": + artifacts[backend].append(f"{get_build_dir(backend)}/{backend}_llama_runner") + artifacts[backend].append(llm_tokenizer) + + return artifacts[backend] + + +def get_llm_cmds(backend, pte_path, decoder_model, **kwargs): + common_cmd_args = [ + f"--model_path {os.path.basename(pte_path)}", + "--seq_len 1024", + f"--decoder_model_version {decoder_model}", + f"--tokenizer_path {'tokenizer.model' if 'tokenizer_model' in kwargs else 'tokenizer.json'}", + "--prompt 'I would like to learn python, could you teach me with a simple example?'", + ] + for k, v in kwargs.items(): + common_cmd_args.append(f"{k} {v}") + + cmds_for_inference = ( + " ".join( + [ + f"cd {workspace} &&", + f"./{backend}_llama_runner {' '.join(common_cmd_args)}", + ] + ) + ) + + if backend == "xnn": + common_cmd_args[1] = "--seq_len 100" + cmds_for_memory = ( + " ".join( + [ + f"cd {workspace} &&", + f"chmod +x {memory_script_file} &&", + f"./{memory_script_file} ./{backend}_llama_runner {' '.join(common_cmd_args)}", + ] + ) + ) + return [cmds_for_inference, cmds_for_memory] + + +def get_cmds(backend, pte_path, iteration, method_index, target_model, **kwargs): + from executorch.examples.qualcomm.oss_scripts.llama.decoder_constants import ( + DECODER_MODEL_VERSION, + ) + + if target_model in DECODER_MODEL_VERSION: + return get_llm_cmds( + backend, pte_path, DECODER_MODEL_VERSION[target_model], **kwargs + ) + + cmd_args = { + "qnn": ( + [ + f"--model_path {os.path.basename(pte_path)}", + f"--iteration {iteration}", + f"--method_index {method_index}", + "--dump_statistics", + ] + ), + "xnn": ( + [ + f"--model_path {os.path.basename(pte_path)}", + f"--num_executions {iteration}", + f"--method_index {method_index}", + "--dump_statistics", + ] + 
), + } + cmds_for_inference = { + "qnn": ( + " ".join( + [ + f"cd {workspace} &&", + "chmod +x ./qnn_executor_runner &&", + f"./qnn_executor_runner {' '.join(cmd_args[backend])}", + ] + ) + ), + "xnn": ( + " ".join( + [ + f"cd {workspace} &&", + "chmod +x ./executor_runner &&", + f"./executor_runner {' '.join(cmd_args[backend])}", + ] + ) + ), + } + # do not dump inference metrics during profiling memory + for _, v in cmd_args.items(): + v.pop() + cmds_for_memory = { + "qnn": ( + " ".join( + [ + f"cd {workspace} &&", + "chmod +x ./qnn_executor_runner &&", + f"chmod +x {memory_script_file} &&", + f"./{memory_script_file} ./qnn_executor_runner {' '.join(cmd_args[backend])}", + ] + ) + ), + "xnn": ( + " ".join( + [ + f"cd {workspace} &&", + "chmod +x ./executor_runner &&", + f"chmod +x {memory_script_file} &&", + f"./{memory_script_file} ./executor_runner {' '.join(cmd_args[backend])}", + ] + ) + ), + } + return [cmds_for_inference[backend], cmds_for_memory[backend]] + + +def start_benchmark(artifacts, cmds, device, host): + import tempfile + + def adb(action): + if not host: + actions = ["adb", "-s", device] + else: + actions = ["adb", "-H", host, "-s", device] + actions.extend(action) + subprocess.run(actions, stdout=subprocess.DEVNULL) + + def post_process(): + subprocess.run(["rm", "-rf", perf_file], stdout=subprocess.DEVNULL) + with tempfile.TemporaryDirectory() as tmp_dir: + for file_name in [perf_file]: + adb(["pull", f"{workspace}/{file_name}", tmp_dir]) + with open(f"{tmp_dir}/{file_name}", "r") as f: + print(f.read()) + + adb(["shell", "rm", "-rf", workspace]) + adb(["shell", "mkdir", "-p", workspace]) + for artifact in artifacts: + adb(["push", artifact, workspace]) + for cmd in cmds: + adb(["shell", cmd]) + post_process() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-b", + "--backend", + help="either 'qnn' or 'xnn'", + required=True, + ) + parser.add_argument( + "-p", + "--pte", + help="path to .pte", + required=True, + ) + parser.add_argument( + "-a", + "--artifact", + help="path to generated intermediate artifacts", + ) + parser.add_argument( + "-t", + "--target_model", + help=f"supported targets: {get_model_dispatcher('').keys()}", + required=True, + ) + parser.add_argument( + "-H", + "--host", + help="hostname for adb gateway", + required=False, + ) + parser.add_argument( + "-s", + "--device", + help="serial number for adb device", + required=True, + ) + parser.add_argument( + "-m", + "--soc_model", + help="model name of SoC", + required=True, + ) + parser.add_argument( + "-i", + "--iteration", + help="total number of inferences", + default=100, + ) + parser.add_argument( + "-e", + "--eval", + help="perform e2e evaluation for checking accuracy metrics", + action="store_true", + ) + parser.add_argument( + "-d", + "--dataset", + help="specify dataset path for evaluation", + ) + parser.add_argument( + "--method_index", + help="specify which method to be executed", + default=0, + ) + parser.add_argument( + "--checkpoint", + help="Pass llama checkpoint.", + required=False, + type=str, + ) + parser.add_argument( + "--params", + help="Pass llama params json file.", + required=False, + type=str, + ) + parser.add_argument( + "--tokenizer_model", + help="Pass llama tokenizer model.", + type=str, + default=None, + ) + args = parser.parse_args() + + torch.manual_seed(seed) + np.random.seed(seed) + random.seed(seed) + kwargs = {} + if all([args.params, args.tokenizer_model, args.checkpoint]): + kwargs = { + "params": args.params, + 
"tokenizer_model": args.tokenizer_model, + "checkpoint": args.checkpoint, + } + + if args.eval: + module, inputs, targets, eval_func = get_model_dispatcher( + args.dataset, **kwargs + )[args.target_model](**kwargs) + eval_func( + args.backend, + args.soc_model, + args.device, + args.host, + args.pte, + module, + inputs, + targets, + args.artifact, + **kwargs, + ) + else: + start_benchmark( + artifacts=get_artifacts( + args.backend, args.pte, args.soc_model, args.target_model, **kwargs + ), + cmds=get_cmds( + args.backend, + args.pte, + args.iteration, + args.method_index, + args.target_model, + **kwargs, + ), + device=args.device, + host=args.host, + ) diff --git a/build_xnn.sh b/build_xnn.sh new file mode 100755 index 00000000000..78d0e71d594 --- /dev/null +++ b/build_xnn.sh @@ -0,0 +1,46 @@ +#!/bin/bash + + +if [[ -z $ANDROID_NDK_ROOT ]]; then + echo "Please export ANDROID_NDK_ROOT=/path/to/ndk" + exit -1 +fi + +CLEAN_BUILD="false" +BUILD_FOLDER="build-xnnpack" +BUILD_TYPE="release" + +while [[ "$#" -gt 0 ]]; do + case "$1" in + -c|--clean_build) CLEAN_BUILD="true"; shift;; + -d|--debug) BUILD_TYPE="Debug"; shift;; + *) echo "unknow arg passed: $1"; exit 1;; + esac + shift +done + +if [ "$CLEAN_BUILD" = true ]; then + rm -rf $BUILD_FOLDER +fi + +cmake \ + -DCMAKE_INSTALL_PREFIX=$BUILD_FOLDER \ + -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI='arm64-v8a' \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \ + -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \ + -DSUPPORT_REGEX_LOOKAHEAD=ON \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + -DPYTHON_EXECUTABLE=python \ + -B$BUILD_FOLDER . + +cmake --build $BUILD_FOLDER -j9 --target install --config $BUILD_TYPE diff --git a/examples/portable/executor_runner/executor_runner.cpp b/examples/portable/executor_runner/executor_runner.cpp index 0974e751203..609530d4daa 100644 --- a/examples/portable/executor_runner/executor_runner.cpp +++ b/examples/portable/executor_runner/executor_runner.cpp @@ -18,6 +18,7 @@ * all fp32 tensors. */ +#include #include #include #include @@ -57,7 +58,13 @@ DEFINE_string( output_file, "", "Base name of output file. If not empty output will be written to the file(s)."); +DEFINE_string(input_list_path, "input_list.txt", "Model input list path."); +DEFINE_string( + output_folder_path, + "outputs", + "Executorch inference data output path."); +DEFINE_bool(dump_statistics, false, "Dump inference statistics."); DEFINE_bool( print_all_output, false, @@ -70,6 +77,11 @@ DEFINE_int32( cpu_threads, -1, "Number of CPU threads for inference. Defaults to -1, which implies we'll use a heuristic to derive the # of performant cores for a specific device."); +DEFINE_bool( + shared_buffer, + false, + "Specifies to use shared buffers for zero-copy usecase between the application and device/co-processor associated with the backend."); +DEFINE_uint32(method_index, 0, "Index of methods to be specified."); using executorch::aten::ScalarType; using executorch::aten::Tensor; @@ -251,7 +263,7 @@ int main(int argc, char** argv) { // Use the first method in the program. 
const char* method_name = nullptr; { - const auto method_name_result = program->get_method_name(0); + const auto method_name_result = program->get_method_name(FLAGS_method_index); ET_CHECK_MSG(method_name_result.ok(), "Program has no methods"); method_name = *method_name_result; } @@ -324,11 +336,19 @@ int main(int argc, char** argv) { // be used by a single thread at at time, but it can be reused. // EventTraceManager tracer; + auto before_load = std::chrono::high_resolution_clock::now(); Result method = program->load_method( method_name, &memory_manager, tracer.get_event_tracer(), ptd_data_map.get()); + + auto after_load = std::chrono::high_resolution_clock::now(); + double interval_load = + std::chrono::duration_cast( + after_load - before_load) + .count() / + 1000.0; ET_CHECK_MSG( method.ok(), "Loading of method %s failed with status 0x%" PRIx32, @@ -336,6 +356,148 @@ int main(int argc, char** argv) { (uint32_t)method.error()); ET_LOG(Info, "Method loaded."); + // QCOM change + std::ifstream input_list(FLAGS_input_list_path); + if (input_list.is_open()) { + auto inputs = executorch::extension::prepare_input_tensors(*method); + ET_LOG(Debug, "Preparing inputs."); + ET_CHECK_MSG( + inputs.ok(), + "Could not prepare inputs: 0x%" PRIx32, + (uint32_t)inputs.error()); + ET_LOG(Debug, "Inputs prepared."); + + size_t num_inputs = method->inputs_size(); + ET_LOG(Info, "Number of inputs: %zu", num_inputs); + + auto split = [](std::string s, std::string delimiter) { + size_t pos_start = 0, pos_end, delim_len = delimiter.length(); + std::string token; + std::vector res; + + while ((pos_end = s.find(delimiter, pos_start)) != std::string::npos) { + token = s.substr(pos_start, pos_end - pos_start); + pos_start = pos_end + delim_len; + res.push_back(token); + } + res.push_back(s.substr(pos_start)); + return res; + }; + + std::string file_path; + int inference_index = 0; + double elapsed_time = 0; + while (std::getline(input_list, file_path)) { + auto input_files = split(file_path, " "); + if (input_files.size() == 0) { + break; + } + ET_CHECK_MSG( + input_files.size() == num_inputs, + "Number of inputs (%zu) mismatch with input files (%zu)", + num_inputs, + input_files.size()); + + std::vector> input_buf(num_inputs); + for (int input_index = 0; input_index < num_inputs; ++input_index) { + MethodMeta method_meta = method->method_meta(); + Result tensor_meta = + method_meta.input_tensor_meta(input_index); + + std::ifstream fin(input_files[input_index], std::ios::binary); + fin.seekg(0, fin.end); + size_t file_size = fin.tellg(); + + input_buf[input_index].resize(file_size); + fin.seekg(0, fin.beg); + fin.read( + static_cast(input_buf[input_index].data()), + file_size); + fin.close(); + + ET_CHECK_MSG( + file_size == tensor_meta->nbytes(), + "Input(%d) size mismatch. 
file bytes: %zu, tensor bytes: %zu", + input_index, + file_size, + tensor_meta->nbytes()); + + auto impl = executorch::aten::TensorImpl( + tensor_meta->scalar_type(), + /*dim=*/tensor_meta->sizes().size(), + const_cast(tensor_meta->sizes().data()), + input_buf[input_index].data(), + const_cast( + tensor_meta->dim_order().data())); + Error ret = method->set_input(executorch::aten::Tensor(&impl), input_index); + ET_CHECK_MSG( + ret == Error::Ok, "Failed to set input tensor: %d", (int)ret); + } + Error status = method->execute(); + std::vector outputs(method->outputs_size()); + status = method->get_outputs(outputs.data(), method->outputs_size()); + ET_CHECK(status == Error::Ok); + for (size_t output_index = 0; output_index < method->outputs_size(); + output_index++) { + auto output_tensor = outputs[output_index].toTensor(); + size_t nbytes = output_tensor.nbytes(); + auto output_file_name = FLAGS_output_folder_path + "/output_" + + std::to_string(inference_index) + "_" + + std::to_string(output_index) + ".raw"; + std::ofstream fout(output_file_name.c_str(), std::ios::binary); + fout.write(output_tensor.const_data_ptr(), nbytes); + fout.close(); + } + ++inference_index; + } + return 0; + } else { + et_timestamp_t time_spent_executing = 0, time_spent_executing_1st = 0; + auto inputs = executorch::extension::prepare_input_tensors(*method); + ET_LOG(Info, "Preparing inputs."); + ET_CHECK_MSG( + inputs.ok(), + "Could not prepare inputs: 0x%" PRIx32, + (uint32_t)inputs.error()); + ET_LOG(Info, "Inputs prepared."); + + auto before_exec = std::chrono::high_resolution_clock::now(); + Error status = method->execute(); + auto after_exec = std::chrono::high_resolution_clock::now(); + double interval_1st_infs = + std::chrono::duration_cast( + after_exec - before_exec) + .count() / + 1000.0; + + before_exec = std::chrono::high_resolution_clock::now(); + for (uint32_t i = 0; i < FLAGS_num_executions; i++) { + status = method->execute(); + ET_CHECK_MSG( + status == Error::Ok, + "Execution of method %s failed with status 0x%" PRIx32, + method_name, + (uint32_t)status); + } + after_exec = std::chrono::high_resolution_clock::now(); + double interval_infs = std::chrono::duration_cast( + after_exec - before_exec) + .count() / + 1000.0 / FLAGS_num_executions; + + if (FLAGS_dump_statistics) { + auto output_file_name = "statistics.txt"; + std::ofstream fout(output_file_name); + fout << "load: " + std::to_string(interval_load) + << "\n1st: " + std::to_string(interval_1st_infs) + << "\navg: " + std::to_string(interval_infs) << std::endl; + fout.close(); + } + ET_LOG(Info, "Model executed successfully."); + return 0; + } + // QCOM change end + et_timestamp_t time_spent_executing = 0; // Run the model. for (uint32_t i = 0; i < FLAGS_num_executions; i++) { diff --git a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp index 47f9f0cfb38..50d81129651 100644 --- a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp +++ b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp @@ -64,6 +64,8 @@ DEFINE_bool( false, "Dump intermediate outputs to etdump file."); +DEFINE_bool(dump_statistics, false, "Dump inference statistics."); + DEFINE_string( debug_output_path, "debug_output.bin", @@ -303,6 +305,7 @@ int main(int argc, char** argv) { // be used by a single thread at at time, but it can be reused. 
// ETDumpGen etdump_gen; + auto before_load = std::chrono::high_resolution_clock::now(); Result method = program->load_method(method_name, &memory_manager, &etdump_gen); ET_CHECK_MSG( @@ -310,6 +313,12 @@ int main(int argc, char** argv) { "Loading of method %s failed with status 0x%" PRIx32, method_name, (int)method.error()); + auto after_load = std::chrono::high_resolution_clock::now(); + double interval_load = + std::chrono::duration_cast( + after_load - before_load) + .count() / + 1000.0; ET_LOG(Info, "Method loaded."); void* debug_buffer; @@ -570,12 +579,19 @@ int main(int argc, char** argv) { "Input list not provided. Inputs prepared with default values set."); // Run the method + auto before_exec = std::chrono::high_resolution_clock::now(); Error status = method->execute(); ET_CHECK_MSG( status == Error::Ok, "Execution of method %s failed with status 0x%" PRIx32, method_name, (int)status); + auto after_exec = std::chrono::high_resolution_clock::now(); + double interval_1st_infs = + std::chrono::duration_cast( + after_exec - before_exec) + .count() / + 1000.0; ET_LOG(Info, "Model executed successfully."); // Warm up @@ -585,23 +601,33 @@ int main(int argc, char** argv) { } // Inference with designated iterations - auto before_exec = std::chrono::high_resolution_clock::now(); + before_exec = std::chrono::high_resolution_clock::now(); for (int i = 0; i < FLAGS_iteration; ++i) { status = method->execute(); } - auto after_exec = std::chrono::high_resolution_clock::now(); + after_exec = std::chrono::high_resolution_clock::now(); double interval_infs = std::chrono::duration_cast( after_exec - before_exec) .count() / 1000.0; + auto avg_infs = interval_infs / (float)FLAGS_iteration; ET_LOG( Info, "%d inferences took %f ms, avg %f ms", FLAGS_iteration, interval_infs, - interval_infs / (float)FLAGS_iteration); + avg_infs); + + if (FLAGS_dump_statistics) { + auto output_file_name = "statistics.txt"; + std::ofstream fout(output_file_name); + fout << "load: " + std::to_string(interval_load) + << "\n1st: " + std::to_string(interval_1st_infs) + << "\navg: " + std::to_string(avg_infs) << std::endl; + fout.close(); + } } // Dump the etdump data containing profiling/debugging data to the specified diff --git a/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.cpp b/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.cpp index f0cc6d9a7a2..65d560c52c5 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.cpp @@ -9,9 +9,11 @@ #include #include #include + using executorch::runtime::MemoryAllocator; using executorch::runtime::TensorInfo; + namespace example { RpcMem::RpcMem( const size_t total_cache_size, @@ -20,11 +22,20 @@ RpcMem::RpcMem( : calculated_offsets_(0) { size_t total_bytes = total_cache_size + total_prompt_processor_io_size + total_token_generator_io_size; +# ifndef XNNPACK shared_buffer_base_ptr_ = QnnExecuTorchAllocCustomMem( total_bytes, MemoryAllocator::kDefaultAlignment); +# else + shared_buffer_base_ptr_ = + new char[total_bytes + MemoryAllocator::kDefaultAlignment]; +# endif } RpcMem::~RpcMem() { +# ifndef XNNPACK QnnExecuTorchFreeCustomMem(shared_buffer_base_ptr_); +# else + delete shared_buffer_base_ptr_; +# endif } std::byte* RpcMem::allocate(size_t data_size) { @@ -57,7 +68,9 @@ void RpcMem::add_memory_info( shape, rank, scalar_type}; +# ifndef XNNPACK QnnExecuTorchAddCustomMemTensorInfo(info); +# endif }; } // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp 
b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp index 0c4884bbccf..c4145a23cf0 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp @@ -163,8 +163,9 @@ Error Runner::load() { std::vector method_names; switch (eval_mode_) { case EvalMode::kKVCached: - prompt_processor_method_name = "forward"; - token_generator_method_name = "forward"; + // workaround for benchmark + prompt_processor_method_name = "kv_forward"; + token_generator_method_name = "kv_forward"; method_names.emplace_back(token_generator_method_name); break; case EvalMode::kHybrid: diff --git a/examples/qualcomm/oss_scripts/t5/t5.py b/examples/qualcomm/oss_scripts/t5/t5.py index 093572f032a..7c126285968 100644 --- a/examples/qualcomm/oss_scripts/t5/t5.py +++ b/examples/qualcomm/oss_scripts/t5/t5.py @@ -217,7 +217,6 @@ def main(args): tokenizer, data_size, max_hidden_seq_length=max_hidden_seq_length, - shuffle=False, ) if not args.pre_gen_pte: diff --git a/examples/qualcomm/oss_scripts/t5/t5_model.py b/examples/qualcomm/oss_scripts/t5/t5_model.py index 0593feaa8b8..620e8f2cbb0 100644 --- a/examples/qualcomm/oss_scripts/t5/t5_model.py +++ b/examples/qualcomm/oss_scripts/t5/t5_model.py @@ -620,12 +620,12 @@ def evaluate_with_ground_truth( predicted_texts = [] target_texts = [] for i, (pred, tar) in tqdm(enumerate(zip(predicts, targets))): - predicted_texts.append(pred) target_texts.append(tokenizer.decode(tar, skip_special_tokens=True)) - print(f"Show {i}/{len(predicts)} result:") - print(f"\tPrediction: {pred}") - print(f"\tTarget: {target_texts[i]}") + #print(f"Show {i}/{len(predicts)} result:") + #print(f"\tPrediction: {pred}") + #print(f"\tTarget: {target_texts[i]}") + results = metrics(predicted_texts, target_texts) print("F1 Score:", results["f1"]) diff --git a/xnn_llama_runner.patch b/xnn_llama_runner.patch new file mode 100644 index 00000000000..7ddb4972642 --- /dev/null +++ b/xnn_llama_runner.patch @@ -0,0 +1,625 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 7012ec641..29dea224a 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -1049,6 +1049,102 @@ if(EXECUTORCH_BUILD_EXECUTOR_RUNNER) + target_link_libraries(executor_runner ${_executor_runner_libs}) + target_compile_options(executor_runner PUBLIC ${_common_compile_options}) + ++ set(QC_EXAMPLE_SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/examples/qualcomm/) ++ set(_xnn_t5_runner__srcs ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/t5/qnn_t5_runner.cpp ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/t5/runner/decoder.cpp ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/t5/runner/decoder.h ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/t5/runner/encoder.cpp ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/t5/runner/encoder.h ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/t5/runner/runner.cpp ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/t5/runner/runner.h ++ ${CMAKE_CURRENT_LIST_DIR}/extension/llm/sampler/sampler.cpp ++ ) ++ add_executable(xnn_t5_runner ${_xnn_t5_runner__srcs}) ++ ++ set(_xnn_whisper_runner__srcs ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/whisper/qnn_whisper_runner.cpp ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/whisper/runner/decoder.cpp ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/whisper/runner/decoder.h ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/whisper/runner/encoder.cpp ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/whisper/runner/encoder.h ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/whisper/runner/runner.cpp ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/whisper/runner/runner.h ++ ${CMAKE_CURRENT_LIST_DIR}/extension/llm/sampler/sampler.cpp ++ ) ++ 
add_executable(xnn_whisper_runner ${_xnn_whisper_runner__srcs}) ++ ++ set(_xnn_llama_runner__srcs ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/llama/qnn_llama_runner.cpp ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/llama/runner/runner.cpp ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/llama/runner/runner.h ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/llama/runner/cache_utils.h ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/llama/runner/decoder_runner.cpp ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/llama/runner/decoder_runner.h ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/llama/runner/prompt_processor.cpp ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/llama/runner/prompt_processor.h ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/llama/runner/token_generator.cpp ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/llama/runner/token_generator.h ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/llama/runner/imem_alloc.h ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/llama/runner/client_mem.h ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/llama/runner/lhd_token_generator.cpp ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/llama/runner/lhd_token_generator.h ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/llama/runner/rpc_mem.cpp ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/llama/runner/rpc_mem.h ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/llama/runner/kv_manager.cpp ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/llama/runner/kv_manager.h ++ ${CMAKE_CURRENT_LIST_DIR}/examples/models/llama/runner/runner.cpp ++ ${CMAKE_CURRENT_LIST_DIR}/examples/models/llama/runner/runner.h ++ ) ++ ++ target_link_libraries(xnn_t5_runner ++ ${_executor_runner_libs} ++ extension_data_loader ++ extension_flat_tensor ++ extension_llm_runner ++ extension_module ++ extension_tensor ++ gflags ++ tokenizers::tokenizers ++ ) ++ target_compile_options(xnn_t5_runner PUBLIC ${_common_compile_options}) ++ ++ target_link_libraries(xnn_whisper_runner ++ ${_executor_runner_libs} ++ extension_data_loader ++ extension_flat_tensor ++ extension_llm_runner ++ extension_module ++ extension_tensor ++ gflags ++ tokenizers::tokenizers ++ ) ++ target_compile_options(xnn_whisper_runner PUBLIC ${_common_compile_options}) ++ ++ add_definitions(-DXNNPACK) ++ add_executable(xnn_llama_runner ${_xnn_llama_runner__srcs}) ++ target_include_directories( ++ xnn_llama_runner PUBLIC ${_common_include_directories} ++ ) ++ executorch_target_link_options_shared_lib(quantized_ops_lib) ++ target_link_libraries(xnn_llama_runner ++ ${_executor_runner_libs} ++ executorch_core ++ extension_data_loader ++ extension_flat_tensor ++ extension_llm_runner ++ extension_module ++ extension_tensor ++ gflags ++ quantized_ops_lib ++ quantized_kernels ++ tokenizers::tokenizers ++ ) ++ target_include_directories( ++ xnn_llama_runner PUBLIC ${EXECUTORCH_ROOT}/extension/llm/tokenizers/include ++ ) ++ target_compile_options(xnn_llama_runner PUBLIC ${_common_compile_options}) ++ + # Automatically set when using `emcmake cmake` for Wasm build. + if(EMSCRIPTEN) + # Directory of model pte files to embed in the wasm binary. 
+diff --git a/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp b/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp +index 71eaea2b8..bab8664a5 100644 +--- a/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp ++++ b/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp +@@ -266,6 +266,8 @@ int main(int argc, char** argv) { + start_runner(std::move(module), prompts); + } else if (kv_bitwidth == example::KvBitWidth::kWidth16) { + start_runner(std::move(module), prompts); ++ } else if (kv_bitwidth == example::KvBitWidth::kWidth32) { ++ start_runner(std::move(module), prompts); + } else { + ET_CHECK_MSG( + false, +diff --git a/examples/qualcomm/oss_scripts/llama/runner/decoder_runner.h b/examples/qualcomm/oss_scripts/llama/runner/decoder_runner.h +index 888e9acd4..5d9384512 100644 +--- a/examples/qualcomm/oss_scripts/llama/runner/decoder_runner.h ++++ b/examples/qualcomm/oss_scripts/llama/runner/decoder_runner.h +@@ -56,7 +56,7 @@ class DecoderRunner { + inline int32_t logits_to_token( + const executorch::aten::Tensor& logits_tensor, + int64_t pos) { +- auto* logits = logits_tensor.mutable_data_ptr(); ++ auto* logits = logits_tensor.mutable_data_ptr(); + auto num_tokens = logits_tensor.size(1); + auto vocab_size = logits_tensor.size(2); + static std::vector logits_f(vocab_size); +diff --git a/examples/qualcomm/oss_scripts/llama/runner/kv_manager.cpp b/examples/qualcomm/oss_scripts/llama/runner/kv_manager.cpp +index bd6d27d4b..72781139c 100644 +--- a/examples/qualcomm/oss_scripts/llama/runner/kv_manager.cpp ++++ b/examples/qualcomm/oss_scripts/llama/runner/kv_manager.cpp +@@ -48,7 +48,7 @@ KVManager::KVManager(KVManagerMode kv_updater, Metadata metadata) + + template + void KVManager::init_attention_mask( +- uint16_t* attention_mask, ++ float* attention_mask, + const std::vector& attention_map, + int32_t ar_len, + int32_t n_past) { +@@ -57,16 +57,16 @@ void KVManager::init_attention_mask( + "The size of attention_map (%zu) doesn't match with ar_len (%d)", + attention_map.size(), + ar_len); +- uint16_t neg_val = 0; +- uint16_t pos_val = 65535; ++ float neg_val = -1e9f; ++ float pos_val = 0.0f; + // Clear the attention mask + std::fill_n(attention_mask, ar_len * metadata_.context_len, neg_val); + + // SMART_MASK requires special handling of attention mask + switch (kv_updater_) { + case KVManagerMode::SMART_MASK: { +- uint16_t* past_ptr = attention_mask; +- uint16_t* new_ptr = attention_mask + (metadata_.context_len - ar_len); ++ float* past_ptr = attention_mask; ++ float* new_ptr = attention_mask + (metadata_.context_len - ar_len); + // All inputs will necessarily attend to n_past and itself + for (int i = 0; i < ar_len; i++) { + // Iterate across ar_len +@@ -77,9 +77,9 @@ void KVManager::init_attention_mask( + // If positive, copy attention map from (relative to 0th input) parent + // Parent token index + const int32_t pidx = attention_map[i]; +- uint16_t* parent_ptr = attention_mask + pidx * metadata_.context_len; ++ float* parent_ptr = attention_mask + pidx * metadata_.context_len; + std::memcpy( +- past_ptr, parent_ptr, metadata_.context_len * sizeof(uint16_t)); ++ past_ptr, parent_ptr, metadata_.context_len * sizeof(float)); + } + // Attend to itself + new_ptr[i] = pos_val; +@@ -92,7 +92,7 @@ void KVManager::init_attention_mask( + // Only fill in ar_len. 
Rest will be padding + const size_t attn_row_start = metadata_.context_len - n_past - ar_len; + for (int i = 0; i < ar_len; i++) { +- uint16_t* cur_ptr = ++ float* cur_ptr = + attention_mask + i * metadata_.context_len + attn_row_start; + // Attend to itself + cur_ptr[n_past + i] = pos_val; +@@ -103,10 +103,10 @@ void KVManager::init_attention_mask( + // If positive, copy attention map from (relative to 0th input) parent + // Parent token index + const int32_t pidx = attention_map[i]; +- uint16_t* parent_ptr = ++ float* parent_ptr = + attention_mask + pidx * metadata_.context_len + attn_row_start; + std::memcpy( +- cur_ptr, parent_ptr, (n_past + pidx + 1) * sizeof(uint16_t)); ++ cur_ptr, parent_ptr, (n_past + pidx + 1) * sizeof(float)); + } + } + break; +@@ -118,7 +118,7 @@ void KVManager::init_attention_mask( + + template + void KVManager::init_attention_mask( +- uint16_t* attention_mask, ++ float* attention_mask, + const std::vector& attention_map, + int32_t ar_len, + int32_t n_past, +@@ -129,16 +129,16 @@ void KVManager::init_attention_mask( + "The size of attention_map (%zu) doesn't match with ar_len (%d)", + attention_map.size(), + ar_len); +- uint16_t neg_val = 0; +- uint16_t pos_val = 65535; ++ float neg_val = -1e9f; ++ float pos_val = 0.0f; + // Clear the attention mask + std::fill_n(attention_mask, ar_len * metadata_.context_len, neg_val); + + // SMART_MASK requires special handling of attention mask + switch (kv_updater_) { + case KVManagerMode::SMART_MASK: { +- uint16_t* past_ptr = attention_mask; +- uint16_t* new_ptr = attention_mask + (metadata_.context_len - ar_len); ++ float* past_ptr = attention_mask; ++ float* new_ptr = attention_mask + (metadata_.context_len - ar_len); + // All inputs will necessarily attend to n_past and itself + for (int i = 0; i < ar_len; i++) { + // Iterate across ar_len +@@ -149,9 +149,9 @@ void KVManager::init_attention_mask( + // If positive, copy attention map from (relative to 0th input) parent + // Parent token index + const int32_t pidx = attention_map[i]; +- uint16_t* parent_ptr = attention_mask + pidx * metadata_.context_len; ++ float* parent_ptr = attention_mask + pidx * metadata_.context_len; + std::memcpy( +- past_ptr, parent_ptr, metadata_.context_len * sizeof(uint16_t)); ++ past_ptr, parent_ptr, metadata_.context_len * sizeof(float)); + } + // Attend to itself + new_ptr[i] = pos_val; +@@ -172,7 +172,7 @@ void KVManager::init_attention_mask( + // Only fill in ar_len. 
Rest will be padding + const size_t attn_row_start = metadata_.context_len - n_past - ar_len; + for (int i = 0; i < ar_len; i++) { +- uint16_t* cur_ptr = ++ float* cur_ptr = + attention_mask + i * metadata_.context_len + attn_row_start; + // Attend to itself + cur_ptr[n_past + i] = pos_val; +@@ -183,10 +183,10 @@ void KVManager::init_attention_mask( + // If positive, copy attention map from (relative to 0th input) parent + // Parent token index + const int32_t pidx = attention_map[i]; +- uint16_t* parent_ptr = ++ float* parent_ptr = + attention_mask + pidx * metadata_.context_len + attn_row_start; + std::memcpy( +- cur_ptr, parent_ptr, (n_past + pidx + 1) * sizeof(uint16_t)); ++ cur_ptr, parent_ptr, (n_past + pidx + 1) * sizeof(float)); + } + } + break; +@@ -198,12 +198,12 @@ void KVManager::init_attention_mask( + + template + void KVManager::update_attention_mask( +- uint16_t* attention_mask, ++ float* attention_mask, + int32_t ar_len, + int32_t n_past, + int32_t n_update) { +- uint16_t pos_val = 65535; +- uint16_t* cur_ptr = attention_mask; ++ float pos_val = 0.0f; ++ float* cur_ptr = attention_mask; + if (kv_updater_ == KVManagerMode::SMART_MASK) + cur_ptr += n_past; + if (kv_updater_ == KVManagerMode::SHIFT_POINTER) +@@ -217,15 +217,15 @@ void KVManager::update_attention_mask( + + template + void KVManager::update_attention_mask( +- uint16_t* attention_mask, ++ float* attention_mask, + int32_t ar_len, + int32_t n_past, + int32_t n_update, + int32_t sliding_window, + const std::vector& position_offset) { +- uint16_t pos_val = 65535; +- uint16_t neg_val = 0; +- uint16_t* cur_ptr = attention_mask; ++ float pos_val = 0.0f; ++ float neg_val = -1e9f; ++ float* cur_ptr = attention_mask; + if (kv_updater_ == KVManagerMode::SMART_MASK) + cur_ptr += n_past; + if (kv_updater_ == KVManagerMode::SHIFT_POINTER) +@@ -544,6 +544,7 @@ void KVManager::update_value( + } + + // Explicit instantiations ++template class KVManager; + template class KVManager; + template class KVManager; + +diff --git a/examples/qualcomm/oss_scripts/llama/runner/kv_manager.h b/examples/qualcomm/oss_scripts/llama/runner/kv_manager.h +index af9cf49a3..2b2563b8e 100644 +--- a/examples/qualcomm/oss_scripts/llama/runner/kv_manager.h ++++ b/examples/qualcomm/oss_scripts/llama/runner/kv_manager.h +@@ -73,7 +73,7 @@ class KVManager { + * @param n_past Number of past elements in the cache. + */ + void init_attention_mask( +- uint16_t* attention_mask, ++ float* attention_mask, + const std::vector& attention_map, + int32_t ar_len, + int32_t n_past); +@@ -100,7 +100,7 @@ class KVManager { + * @param position_offset (optional) attention mask position offset of + */ + void init_attention_mask( +- uint16_t* attention_mask, ++ float* attention_mask, + const std::vector& attention_map, + int32_t ar_len, + int32_t n_past, +@@ -116,7 +116,7 @@ class KVManager { + * @param n_update Number of elements to be updated. 
+ */ + void update_attention_mask( +- uint16_t* attention_mask, ++ float* attention_mask, + int32_t ar_len, + int32_t n_past, + int32_t n_update); +@@ -134,7 +134,7 @@ class KVManager { + * lookahead decoder + */ + void update_attention_mask( +- uint16_t* attention_mask, ++ float* attention_mask, + int32_t ar_len, + int32_t n_past, + int32_t n_update, +diff --git a/examples/qualcomm/oss_scripts/llama/runner/lhd_token_generator.cpp b/examples/qualcomm/oss_scripts/llama/runner/lhd_token_generator.cpp +index 1692caa27..2f594eb16 100644 +--- a/examples/qualcomm/oss_scripts/llama/runner/lhd_token_generator.cpp ++++ b/examples/qualcomm/oss_scripts/llama/runner/lhd_token_generator.cpp +@@ -398,5 +398,6 @@ Result LhdTokenGenerator::generate( + // Explicit instantiations + template class LhdTokenGenerator; + template class LhdTokenGenerator; ++template class LhdTokenGenerator; + + } // namespace example +diff --git a/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.cpp b/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.cpp +index 73da764b5..ab5731e89 100644 +--- a/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.cpp ++++ b/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.cpp +@@ -39,21 +39,21 @@ PromptProcessor::PromptProcessor( + switch (metadata_.cache_mode) { + case CacheMode::StaticCahce: + attention_mask_.size = +- metadata_.ar_len * metadata_.context_len * sizeof(uint16_t); ++ metadata_.ar_len * metadata_.context_len * sizeof(float); + window_attention_mask_.size = 0; + break; + case CacheMode::HybridCache: + attention_mask_.size = +- metadata_.ar_len * metadata_.context_len * sizeof(uint16_t); ++ metadata_.ar_len * metadata_.context_len * sizeof(float); + window_attention_mask_.size = +- metadata_.ar_len * metadata_.context_len * sizeof(uint16_t); ++ metadata_.ar_len * metadata_.context_len * sizeof(float); + break; + default: + ET_CHECK_MSG(false, "Unsupported llama cache mode"); + break; + } + +- logits_.size = metadata_.ar_len * metadata_.vocab_size * sizeof(uint16_t); ++ logits_.size = metadata_.ar_len * metadata_.vocab_size * sizeof(float); + }; + template + void PromptProcessor::init_io( +@@ -78,7 +78,7 @@ void PromptProcessor::init_io( + + // [I]: attention_mask + Result attention_mask = method_meta->input_tensor_meta(idx++); +- attention_mask_.data = reinterpret_cast( ++ attention_mask_.data = reinterpret_cast( + buffer_manager->allocate(attention_mask_.size)); + attention_mask_.tensor = std::make_unique( + attention_mask->scalar_type(), +@@ -95,7 +95,7 @@ void PromptProcessor::init_io( + if (metadata_.cache_mode == CacheMode::HybridCache) { + Result window_attention_mask = + method_meta->input_tensor_meta(idx++); +- window_attention_mask_.data = reinterpret_cast( ++ window_attention_mask_.data = reinterpret_cast( + buffer_manager->allocate(window_attention_mask_.size)); + window_attention_mask_.tensor = std::make_unique( + window_attention_mask->scalar_type(), +@@ -159,7 +159,7 @@ void PromptProcessor::init_io( + // [O]: logits + Result logits = method_meta->output_tensor_meta(0); + logits_.data = +- reinterpret_cast(buffer_manager->allocate(logits_.size)); ++ reinterpret_cast(buffer_manager->allocate(logits_.size)); + logits_.tensor = std::make_unique( + logits->scalar_type(), + logits->sizes().size(), +@@ -202,7 +202,7 @@ void PromptProcessor::init_io( + } + + template +-const std::vector& PromptProcessor::get_all_logits() { ++const std::vector& PromptProcessor::get_all_logits() { + return prompt_all_logits_; + } + +@@ -347,5 +347,6 @@ 
Result PromptProcessor::prefill( + // Explicit instantiations + template class PromptProcessor; + template class PromptProcessor; ++template class PromptProcessor; + + } // namespace example +diff --git a/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h b/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h +index a3dd20794..c375d0a6f 100644 +--- a/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h ++++ b/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h +@@ -54,7 +54,7 @@ class PromptProcessor { + * + * @return std::vector& all the logits generated + */ +- virtual const std::vector& get_all_logits(); ++ virtual const std::vector& get_all_logits(); + + /** + * Prefill an LLM Module with the given text input. +@@ -110,9 +110,9 @@ class PromptProcessor { + // inputs and outputs + TensorStruct input_toks_; + TensorStruct input_pos_; +- TensorStruct attention_mask_; +- TensorStruct window_attention_mask_; +- TensorStruct logits_; ++ TensorStruct attention_mask_; ++ TensorStruct window_attention_mask_; ++ TensorStruct logits_; + + // layer -> head -> TensorImpl + std::vector>> +@@ -129,6 +129,6 @@ class PromptProcessor { + std::vector output_tensors_; + + // Unused by default, only used when dump_logits_path is provided. +- std::vector prompt_all_logits_; ++ std::vector prompt_all_logits_; + }; + } // namespace example +diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp +index 709ad3cfa..31b3b1afd 100644 +--- a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp ++++ b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp +@@ -65,17 +65,17 @@ void print_performance_report( + + void save_logits( + const std::string& dump_logits_path, +- const std::vector& prefill_logits, +- const std::vector& decode_logits) { ++ const std::vector& prefill_logits, ++ const std::vector& decode_logits) { + std::ofstream outFile(dump_logits_path.c_str(), std::ios::binary); + if (outFile.is_open()) { + outFile.write( + reinterpret_cast(prefill_logits.data()), +- prefill_logits.size() * sizeof(uint16_t)); ++ prefill_logits.size() * sizeof(float)); + + outFile.write( + reinterpret_cast(decode_logits.data()), +- decode_logits.size() * sizeof(uint16_t)); ++ decode_logits.size() * sizeof(float)); + outFile.close(); + } else { + ET_CHECK_MSG(false, "Error saving the dump logits file"); +@@ -478,5 +478,6 @@ Result Runner::get_decoder_model_version() { + // Explicit instantiations + template class Runner; + template class Runner; ++template class Runner; + + } // namespace example +diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.h b/examples/qualcomm/oss_scripts/llama/runner/runner.h +index 9f290d79c..160529f44 100644 +--- a/examples/qualcomm/oss_scripts/llama/runner/runner.h ++++ b/examples/qualcomm/oss_scripts/llama/runner/runner.h +@@ -43,6 +43,7 @@ enum DecoderModelVersion { + enum KvBitWidth { + kWidth8 = 8, + kWidth16 = 16, ++ kWidth32 = 32, + }; + + template +diff --git a/examples/qualcomm/oss_scripts/llama/runner/token_generator.cpp b/examples/qualcomm/oss_scripts/llama/runner/token_generator.cpp +index 6775c08bd..c7b786a0d 100644 +--- a/examples/qualcomm/oss_scripts/llama/runner/token_generator.cpp ++++ b/examples/qualcomm/oss_scripts/llama/runner/token_generator.cpp +@@ -39,26 +39,26 @@ TokenGenerator::TokenGenerator( + input_toks_.size = metadata_.ar_len * sizeof(int64_t); + input_pos_.size = metadata_.ar_len * sizeof(int32_t); + attention_mask_.size = +- metadata_.ar_len * 
metadata_.context_len * sizeof(uint16_t); ++ metadata_.ar_len * metadata_.context_len * sizeof(float); + + switch (metadata_.cache_mode) { + case CacheMode::StaticCahce: + attention_mask_.size = +- metadata_.ar_len * metadata_.context_len * sizeof(uint16_t); ++ metadata_.ar_len * metadata_.context_len * sizeof(float); + window_attention_mask_.size = 0; + break; + case CacheMode::HybridCache: + attention_mask_.size = +- metadata_.ar_len * metadata_.context_len * sizeof(uint16_t); ++ metadata_.ar_len * metadata_.context_len * sizeof(float); + window_attention_mask_.size = +- metadata_.ar_len * metadata_.context_len * sizeof(uint16_t); ++ metadata_.ar_len * metadata_.context_len * sizeof(float); + break; + default: + ET_CHECK_MSG(false, "Unsupported llama cache mode"); + break; + } + +- logits_.size = metadata_.ar_len * metadata_.vocab_size * sizeof(uint16_t); ++ logits_.size = metadata_.ar_len * metadata_.vocab_size * sizeof(float); + } + template + void TokenGenerator::init_io( +@@ -83,7 +83,7 @@ void TokenGenerator::init_io( + + // [I]: attention_mask + Result attention_mask = method_meta->input_tensor_meta(idx++); +- attention_mask_.data = reinterpret_cast( ++ attention_mask_.data = reinterpret_cast( + buffer_manager->allocate(attention_mask_.size)); + attention_mask_.tensor = std::make_unique( + attention_mask->scalar_type(), +@@ -100,7 +100,7 @@ void TokenGenerator::init_io( + if (metadata_.cache_mode == CacheMode::HybridCache) { + Result window_attention_mask = + method_meta->input_tensor_meta(idx++); +- window_attention_mask_.data = reinterpret_cast( ++ window_attention_mask_.data = reinterpret_cast( + buffer_manager->allocate(window_attention_mask_.size)); + window_attention_mask_.tensor = std::make_unique( + window_attention_mask->scalar_type(), +@@ -162,7 +162,7 @@ void TokenGenerator::init_io( + // [O]: logits + Result logits = method_meta->output_tensor_meta(0); + logits_.data = +- reinterpret_cast(buffer_manager->allocate(logits_.size)); ++ reinterpret_cast(buffer_manager->allocate(logits_.size)); + logits_.tensor = std::make_unique( + logits->scalar_type(), + logits->sizes().size(), +@@ -205,7 +205,7 @@ void TokenGenerator::init_io( + } + + template +-const std::vector& TokenGenerator::get_all_logits() { ++const std::vector& TokenGenerator::get_all_logits() { + return token_all_logits_; + } + +@@ -328,5 +328,6 @@ Result TokenGenerator::generate( + // Explicit instantiations + template class TokenGenerator; + template class TokenGenerator; ++template class TokenGenerator; + + } // namespace example +diff --git a/examples/qualcomm/oss_scripts/llama/runner/token_generator.h b/examples/qualcomm/oss_scripts/llama/runner/token_generator.h +index 9f0198f30..10b9f832d 100644 +--- a/examples/qualcomm/oss_scripts/llama/runner/token_generator.h ++++ b/examples/qualcomm/oss_scripts/llama/runner/token_generator.h +@@ -59,7 +59,7 @@ class TokenGenerator { + * + * @return std::vector& all the logits generated + */ +- virtual const std::vector& get_all_logits(); ++ virtual const std::vector& get_all_logits(); + + /** +    * @brief Generate tokens. 
+@@ -95,9 +95,9 @@ class TokenGenerator { + // inputs and outputs + TensorStruct input_toks_; + TensorStruct input_pos_; +- TensorStruct attention_mask_; +- TensorStruct window_attention_mask_; +- TensorStruct logits_; ++ TensorStruct attention_mask_; ++ TensorStruct window_attention_mask_; ++ TensorStruct logits_; + + // layer -> head -> TensorImpl + std::vector>> +@@ -128,6 +128,6 @@ class TokenGenerator { + Metadata metadata_; + + // Unused by default, only used when dump_logits_path is provided. +- std::vector token_all_logits_; ++ std::vector token_all_logits_; + }; + } // namespace example
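Note on the mask change in the runner patch above: the quantized HTP path encodes the attention mask as uint16 (0 = masked, 65535 = attended), while the fp32/XNNPACK path switches to an additive float mask (-1e9f = masked, 0.0f = attended) that can be added directly to the pre-softmax attention scores. That is why every sizeof(uint16_t) for the mask and logits buffers becomes sizeof(float), and why a KvBitWidth::kWidth32 path with extra float instantiations is registered for KVManager, PromptProcessor, TokenGenerator, LhdTokenGenerator, and Runner. The following is a minimal standalone sketch of the additive-mask convention only, roughly mirroring the SMART_MASK layout in kv_manager.cpp; the helper name and exact layout are hypothetical and are not part of the runner's API.

// Illustrative sketch, not the runner code: builds an additive float mask
// where 0.0f marks positions a query token may attend to and -1e9f masks
// everything else. New tokens occupy the last ar_len slots of each row.
#include <algorithm>
#include <cstdint>
#include <vector>

std::vector<float> build_float_attention_mask(
    int32_t ar_len,        // tokens processed in this step
    int32_t context_len,   // total KV context length
    int32_t n_past) {      // tokens already in the cache
  const float neg_val = -1e9f;  // masked (uint16 path would use 0)
  const float pos_val = 0.0f;   // attended (uint16 path would use 65535)
  std::vector<float> mask(
      static_cast<size_t>(ar_len) * context_len, neg_val);
  const int32_t new_start = context_len - ar_len;
  for (int32_t i = 0; i < ar_len; ++i) {
    float* row = mask.data() + static_cast<size_t>(i) * context_len;
    // Attend to all cached tokens.
    std::fill_n(row, n_past, pos_val);
    // Attend to itself and to earlier tokens of the current step (causal).
    std::fill_n(row + new_start, i + 1, pos_val);
  }
  return mask;
}

Because the mask is additive, masked positions contribute a large negative bias before softmax and end up with effectively zero attention weight, which is the float-path equivalent of the 0/65535 encoding consumed by the quantized HTP graph.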