diff --git a/backends/qualcomm/serialization/qc_compiler_spec.fbs b/backends/qualcomm/serialization/qc_compiler_spec.fbs index 8aeaa060a50..2e5c3ce9204 100644 --- a/backends/qualcomm/serialization/qc_compiler_spec.fbs +++ b/backends/qualcomm/serialization/qc_compiler_spec.fbs @@ -18,6 +18,7 @@ enum HtpArch: int { V73 = 73, V75 = 75, V79 = 79, + V81 = 81, } table HtpInfo { @@ -43,6 +44,8 @@ enum QcomChipset: int { SXR1230P = 45, SXR2230P = 53, SXR2330P = 75, + SM8850 = 87, + SM8735 = 85, } /// Indicate the information of the specified SoC. diff --git a/backends/qualcomm/serialization/qc_schema.py b/backends/qualcomm/serialization/qc_schema.py index f3b9e2cc1a5..7907aa46a2b 100644 --- a/backends/qualcomm/serialization/qc_schema.py +++ b/backends/qualcomm/serialization/qc_schema.py @@ -27,6 +27,7 @@ class HtpArch(IntEnum): V73 = 73 V75 = 75 V79 = 79 + V81 = 81 @dataclass @@ -49,6 +50,8 @@ class QcomChipset(IntEnum): SXR1230P = 45 # v73 SXR2230P = 53 # v69 SXR2330P = 75 # v79 + SM8850 = 87, # v81 + SM8735 = 85, # v73 @dataclass @@ -69,6 +72,8 @@ class SocInfo: QcomChipset.SXR1230P: SocInfo(QcomChipset.SXR1230P, HtpInfo(HtpArch.V73, 2)), QcomChipset.SXR2230P: SocInfo(QcomChipset.SXR2230P, HtpInfo(HtpArch.V69, 8)), QcomChipset.SXR2330P: SocInfo(QcomChipset.SXR2330P, HtpInfo(HtpArch.V79, 8)), + QcomChipset.SM8850: SocInfo(QcomChipset.SM8850, HtpInfo(HtpArch.V81, 8)), + QcomChipset.SM8735: SocInfo(QcomChipset.SM8735, HtpInfo(HtpArch.V73, 8)), } diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py index be4e86de50f..5d2ef263a47 100644 --- a/backends/qualcomm/utils/utils.py +++ b/backends/qualcomm/utils/utils.py @@ -1099,6 +1099,8 @@ def get_soc_to_arch_map(): "SXR1230P": HtpArch.V73, "SXR2230P": HtpArch.V69, "SXR2330P": HtpArch.V79, + "SM8850": HtpArch.V81, + "SM8735": HtpArch.V73, } @@ -1115,6 +1117,8 @@ def get_soc_to_chipset_map(): "SXR1230P": QcomChipset.SXR1230P, "SXR2230P": QcomChipset.SXR2230P, "SXR2330P": QcomChipset.SXR2330P, + "SM8850": QcomChipset.SM8850, + "SM8735": QcomChipset.SM8735, } diff --git a/benchmark.py b/benchmark.py new file mode 100644 index 00000000000..24c7eede835 --- /dev/null +++ b/benchmark.py @@ -0,0 +1,1257 @@ +import argparse +import numpy as np +import os +import random +import subprocess +import torch + +from executorch.examples.models.llama.evaluate.eager_eval import EagerEvalWrapper + + +### GLOBALS +qnn_sdk = os.getenv("QNN_SDK_ROOT") +workspace = "/data/local/tmp/et_ga_benchmark" +memory_script_file = "peak_memory.sh" +perf_file = "statistics.txt" +seed = 1126 +### + + +def image_classification_eval( + backend, + soc_model, + device, + host, + pte_path, + module, + inputs, + targets, + artifact_dir, +): + from executorch.examples.qualcomm.utils import ( + make_output_dir, + SimpleADB, + topk_accuracy, + ) + import numpy as np + from pathlib import Path + + adb = SimpleADB( + qnn_sdk=qnn_sdk, + build_path="build-android", + pte_path=pte_path, + workspace=f"/data/local/tmp/executorch/{Path(pte_path).stem}", + device_id=device, + host_id=host, + soc_model=soc_model, + ) + files = ["build-xnnpack/executor_runner"] if backend == "xnn" else None + custom_commands = ( + f"cd {adb.workspace} && ./executor_runner --model_path " + f"{os.path.basename(adb.pte_path[0])} --input_list_path input_list.txt" + if backend == "xnn" else None + ) + adb.push(inputs=inputs, files=files) + adb.execute(custom_runner_cmd=custom_commands) + + # collect output data + output_data_folder = f"{artifact_dir}/outputs" + make_output_dir(output_data_folder) + + 
adb.pull(output_path=artifact_dir) + + # top-k analysis + predictions, goldens = [], [] + for input in inputs: + goldens.append(module(*input).logits.detach().numpy()) + + for i in range(len(inputs)): + predictions.append( + np.fromfile( + os.path.join(output_data_folder, f"output_{i}_0.raw"), dtype=np.float32 + ) + ) + + k_val = [1, 5] + topk = [topk_accuracy(goldens, targets, k).item() for k in k_val] + print("cpu:") + for i, k in enumerate(k_val): + print(f"top_{k}->{topk[i]}%") + topk = [topk_accuracy(predictions, targets, k).item() for k in k_val] + print("device:") + for i, k in enumerate(k_val): + print(f"top_{k}->{topk[i]}%") + + +def masked_lm_eval( + backend, + soc_model, + device, + host, + pte_path, + module, + inputs, + targets, + artifact_dir, +): + from executorch.examples.qualcomm.utils import SimpleADB, make_output_dir + import numpy as np + from pathlib import Path + import evaluate + + adb = SimpleADB( + qnn_sdk=qnn_sdk, + build_path="build-android", + pte_path=pte_path, + workspace=f"/data/local/tmp/executorch/{Path(pte_path).stem}", + device_id=device, + host_id=host, + soc_model=soc_model, + ) + files = ["build-xnnpack/executor_runner"] if backend == "xnn" else None + custom_commands = ( + f"cd {adb.workspace} && ./executor_runner --model_path" + f" {os.path.basename(adb.pte_path[0])} --input_list_path input_list.txt" + if backend == "xnn" else None + ) + if backend == "xnn": + for i, input in enumerate(inputs): + inputs[i] = tuple(inp.to(torch.long) for inp in input) + + adb.push(inputs=inputs, files=files) + adb.execute(custom_runner_cmd=custom_commands) + + # collect output data + output_data_folder = f"{artifact_dir}/outputs" + make_output_dir(output_data_folder) + + adb.pull(output_path=artifact_dir) + + labels, goldens, predictions = [], [], [] + for i in range(len(inputs)): + indice = [i for i, x in enumerate(targets[i]) if x != -100] + labels.extend(targets[i][indice].tolist()) + golden = module(*inputs[i]).logits.detach().numpy().argmax(axis=-1) + goldens.extend(golden[0, indice].tolist()) + prediction = ( + np.fromfile( + os.path.join(output_data_folder, f"output_{i}_0.raw"), dtype=np.float32 + ) + .reshape([1, inputs[0][0].shape[1], -1]) + .argmax(axis=-1) + ) + predictions.extend(prediction[0, indice].tolist()) + + metric = evaluate.load("accuracy") + results = metric.compute(predictions=goldens, references=labels) + print(f"cpu accuracy: {results['accuracy']}") + results = metric.compute(predictions=predictions, references=labels) + print(f"device accuracy: {results['accuracy']}") + + +def t5_eval( + backend, + soc_model, + device, + host, + pte_path, + module, + inputs, + targets, + artifact_dir, +): + from executorch.examples.qualcomm.utils import ( + evaluate_squad, make_output_dir, SimpleADB + ) + from executorch.examples.qualcomm.oss_scripts.t5.t5 import Seq2SeqLMExportableModulePipeline + from pathlib import Path + from transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small") + _, _, spiece_model, _, _ = tokenizer.save_pretrained(artifact_dir) + max_seq_len = module.decoder.max_static_cache_length + + workspace = f"/data/local/tmp/executorch/{Path(pte_path).stem}" + adb = SimpleADB( + qnn_sdk=qnn_sdk, + build_path="build-android", + pte_path=pte_path, + workspace=workspace, + device_id=device, + host_id=host, + soc_model=soc_model, + runner="examples/qualcomm/oss_scripts/t5/qnn_t5_runner", + ) + runner_args = " ".join( + [ + f"--tokenizer_model_path {os.path.basename(spiece_model)}", + f"--model_path 
{os.path.basename(pte_path)}", + f"--seq_len {max_seq_len}", + "--output_folder_path outputs", + ] + ) + runner_cmd = " ".join( + [ + f"cd {workspace} &&", + f"./{'qnn_t5_runner' if backend == 'qnn' else 'xnn_t5_runner'}", + runner_args, + ] + ) + files = [spiece_model] + if backend == "xnn": + files.append("build-xnnpack/xnn_t5_runner") + + adb.push(inputs=inputs, files=files) + adb.execute(custom_runner_cmd=runner_cmd) + + # collect output data + output_data_folder = f"{artifact_dir}/outputs" + make_output_dir(output_data_folder) + + outputs = [] + def post_process(): + for i in range(len(inputs)): + with open(f"{artifact_dir}/outputs/output_{i}.txt", "r") as f: + outputs.append(f.read()) + adb.pull(output_path=artifact_dir, callback=post_process) + + # cpu inference + goldens = [] + with torch.no_grad(): + for input in inputs: + # run encoder + hidden_state = module.encoder(*input[:-1]) + _, attn_mask, _, _, pos = module.decoder.get_example_inputs() + tokens = [input[-1].item()] + # generate tokens one by one + for _ in range(max_seq_len - 1): + # run decoder for next token prediction + logits = module.decoder( + torch.tensor([[tokens[-1]]], dtype=torch.long), + attn_mask, + hidden_state, + input[1], + pos, + ) + + # get next token + tokens.append(torch.argmax(logits, dim=-1).item()) + pos += 1 + attn_mask[..., pos] = 0 + + # Check if EOS token + if tokens[-1] == module.decoder.config.eos_token_id: + break + goldens.append(tokenizer.decode(tokens[1:-1])) + + print("cpu accuracy >") + Seq2SeqLMExportableModulePipeline.evaluate_with_ground_truth( + tokenizer, goldens, targets, evaluate_squad + ) + print("device accuracy >") + Seq2SeqLMExportableModulePipeline.evaluate_with_ground_truth( + tokenizer, outputs, targets, evaluate_squad + ) + + +def whisper_eval( + backend, + soc_model, + device, + host, + pte_path, + module, + inputs, + targets, + artifact_dir, +): + from executorch.examples.qualcomm.utils import make_output_dir, SimpleADB + from executorch.examples.qualcomm.oss_scripts.whisper.whisper import eval_metric + from executorch.examples.qualcomm.oss_scripts.whisper.whisper_model import EncoderDecoderCache, DynamicCache + from pathlib import Path + from transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained("openai/whisper-tiny") + tokenizer_json = tokenizer.save_pretrained(args.artifact)[-1] + max_seq_len = module.max_seq_length + + workspace = f"/data/local/tmp/executorch/{Path(pte_path).stem}" + adb = SimpleADB( + qnn_sdk=qnn_sdk, + build_path="build-android", + pte_path=pte_path, + workspace=workspace, + device_id=device, + host_id=host, + soc_model=soc_model, + runner="examples/qualcomm/oss_scripts/whisper/qnn_whisper_runner", + ) + runner_args = " ".join( + [ + f"--model_path {os.path.basename(pte_path)}", + f"--tokenizer_json_path {os.path.basename(tokenizer_json)}", + "--input_list_path input_list.txt", + f"--seq_len {max_seq_len}", + "--output_folder_path outputs", + ] + ) + runner_cmd = " ".join( + [ + f"cd {workspace} &&", + f"./{'qnn_whisper_runner' if backend == 'qnn' else 'xnn_whisper_runner'}", + runner_args, + ] + ) + files = [tokenizer_json] + if backend == "xnn": + files.append("build-xnnpack/xnn_whisper_runner") + + adb.push(inputs=inputs, files=files) + adb.execute(custom_runner_cmd=runner_cmd) + + # collect output data + output_data_folder = f"{artifact_dir}/outputs" + make_output_dir(output_data_folder) + + outputs = [] + def post_process(): + for i in range(len(inputs)): + with open(f"{artifact_dir}/outputs/output_{i}.txt", "r") as f: 
+ outputs.append(f.read()) + adb.pull(output_path=artifact_dir, callback=post_process) + + # cpu inference + decoder_start_token_id = getattr(module.config, "decoder_start_token_id", 50258) + eos_token_id = getattr(module.config, "eos_token_id", 50257) + goldens = [] + with torch.no_grad(): + for input in inputs: + # run encoder + hidden_state = module.whisper_encoder(*input) + _, attn_mask, _, pos = module.whisper_decoder.get_example_inputs() + tokens = [decoder_start_token_id] + # generate tokens one by one + for _ in range(max_seq_len - 1): + # run decoder for next token prediction + logits = module.whisper_decoder( + torch.tensor([[tokens[-1]]], dtype=torch.long), + attn_mask, + hidden_state, + pos, + ) + + # get next token + tokens.append(torch.argmax(logits, dim=-1).item()) + pos += 1 + attn_mask[..., pos] = 0 + + # Check if EOS token + if tokens[-1] == eos_token_id: + break + + module.whisper_decoder.static_cache.reset() + module.whisper_decoder.cache = EncoderDecoderCache( + module.whisper_decoder.static_cache, DynamicCache() + ) + goldens.append(tokenizer.decode(tokens[1:])) + + print(f"cpu accuracy >\n{eval_metric(goldens, targets)}") + print(f"device accuracy >\n{eval_metric(outputs, targets)}") + + +class RunnerEvalWrapper(EagerEvalWrapper): + """ + A wrapper class to run PPL scores on device. + """ + + def __init__( + self, + backend, + soc_model, + device, + host, + pte_path, + artifact_dir, + decoder_model, + tokenizer, + runtime_tokenizer_path, + ): + from pathlib import Path + from executorch.exir._serialize._program import deserialize_pte_binary + from executorch.examples.qualcomm.utils import SimpleADB + + self.pte_path = pte_path + with open(pte_path, "rb") as f: + program_data = f.read() + program = deserialize_pte_binary(program_data) + # Retrieve vocab_size from get_metadata under static_llama that is passed to edge manager + self.output_vocab_size = None + pte_max_seq_len = None + self.logits_scale = None + self.logits_zero_point = None + self.kv_io_bit_width = 32 + self.et_backend = backend + for method in program.execution_plan: + # Don't use tokenizer.n_words, the numbers are off once calling get_tokenizer() + if method.name == "get_vocab_size": + # pyre-ignore + self.output_vocab_size = method.values[0].val.int_val + if method.name == "get_max_seq_len": + # pyre-ignore + pte_max_seq_len = method.values[0].val.int_val + if method.name == "get_logits_scale": + self.logits_scale = method.values[0].val.double_val + if method.name == "get_logits_zero_point": + self.logits_zero_point = method.values[0].val.int_val + if method.name == "get_kv_io_bit_width": + self.kv_io_bit_width = method.values[0].val.int_val + + # FP has no scale/zero_point, use following values, which is equivalent to not performing dequantize. + if self.kv_io_bit_width == 32: + self.logits_scale = 1 + self.logits_zero_point = 0 + elif self.logits_scale is None or self.logits_zero_point is None: + raise RuntimeError( + "Unable to find scale/offset. The .pte file might be deprecated. 
Please generate a new .pte file" + ) + + assert self.output_vocab_size is not None, "Couldn't find the vocab size" + assert pte_max_seq_len is not None, "Couldn't find the max_seq_len from pte" + self.decoder_model = decoder_model + self.max_seq_length = pte_max_seq_len + self.runtime_tokenizer_path = runtime_tokenizer_path + self.artifact_dir = artifact_dir + self.output_dir = args.artifact + self.workspace = f"/data/local/tmp/executorch/{decoder_model}" + self.adb = SimpleADB( + qnn_sdk=os.getenv("QNN_SDK_ROOT"), + build_path="build-android", + pte_path=pte_path, + workspace=self.workspace, + device_id=device, + host_id=host, + soc_model=soc_model, + runner="examples/qualcomm/oss_scripts/llama/qnn_llama_runner", + ) + files = [self.runtime_tokenizer_path] + if backend == "xnn": + files.append("build-xnnpack/xnn_llama_runner") + self.adb.push(inputs=[], files=files) + # n seq len = n-1 cache len, so we len(inps) = n-1 during _model_call + # pyre-ignore + super().__init__(None, tokenizer, self.max_seq_length - 1) + + def _model_call(self, inps): + from executorch.examples.qualcomm.oss_scripts.llama import DECODER_MODEL_VERSION + from executorch.examples.qualcomm.utils import make_output_dir + + input_file_name = f"{self.artifact_dir}/input_tokens.raw" + inps = inps.to(torch.uint64).numpy() + inps.tofile(input_file_name) + + outputs_path = "outputs/outputs.txt" + dump_logits_path = "outputs/all_logit.raw" + performance_output_path = "outputs/inference_speed.txt" + runner_cmd = " ".join( + [ + f"cd {self.workspace} &&", + f"./{self.et_backend}_llama_runner", + f"--decoder_model_version {DECODER_MODEL_VERSION[self.decoder_model]}", + f"--tokenizer_path {os.path.basename(self.runtime_tokenizer_path)}", + f"--model_path {os.path.basename(self.pte_path)}", + f"--seq_len {self.max_seq_length}", + f"--output_path {outputs_path}", + f"--performance_output_path {performance_output_path}", + f"--kv_updater SmartMask", + f"--eval_mode 0", + "--temperature 0", + f"--dump_logits_path {dump_logits_path}", + f"--tokenized_prompt {os.path.basename(input_file_name)}", + ] + ) + + self.adb.push(inputs=[], files=[input_file_name], init_env=False) + self.adb.execute(custom_runner_cmd=runner_cmd) + output_data_folder = f"{self.output_dir}/outputs" + make_output_dir(output_data_folder) + output_tensor_list = [] + + def post_process(): + with open(f"{self.artifact_dir}/{dump_logits_path}", "r") as f: + if self.kv_io_bit_width == 32: + output_tensor = torch.from_numpy( + np.fromfile(f.name, dtype=np.float32).reshape( + 1, -1, self.output_vocab_size + ) + ) + output_tensor_list.append(output_tensor) + else: + output_tensor = torch.from_numpy( + np.fromfile(f.name, dtype=np.uint16).reshape( + 1, -1, self.output_vocab_size + ) + ) + output_tensor = ( + output_tensor.to(torch.float32) - self.logits_zero_point + ) * self.logits_scale + output_tensor_list.append(output_tensor) + + # simple_eval will run multiple rounds, use last run for inference speed + with open(f"{self.artifact_dir}/{performance_output_path}", "r") as f: + self.inference_speed = float(f.read()) + + self.adb.pull(output_path=self.output_dir, callback=post_process) + return output_tensor_list[0] + + +def llm_eval( + backend, + soc_model, + device, + host, + pte_path, + module, + decoder_model, + decoder_model_config, + artifact_dir, + **kwargs, +): + import json + from executorch.examples.qualcomm.oss_scripts.llama.decoder_utils import ( + GraphModuleCalibrationWrapper, smart_mask_updater + ) + from pytorch_tokenizers import get_tokenizer, 
TiktokenTokenizer + from transformers import AutoTokenizer + try: + from lm_eval.evaluator import simple_evaluate + except ImportError: + raise ImportError( + "Please install the llm eval dependency via examples/models/llama/install_requirements.sh" + ) + + # Tokenizer related + if "llama3_2" in decoder_model: + tokenizer = get_tokenizer(kwargs["tokenizer_model"]) + assert isinstance( + tokenizer, TiktokenTokenizer + ), f"Wrong tokenizer provided for llama3_2." + runtime_tokenizer_path = args.tokenizer_model + else: + model_id = decoder_model_config.repo_id + tokenizer = AutoTokenizer.from_pretrained(model_id) + tokenizer_artifacts = tokenizer.save_pretrained(artifact_dir) + tokenizer_config = tokenizer_artifacts[0] + runtime_tokenizer_path = tokenizer_artifacts[-1] + tokenizer = get_tokenizer(runtime_tokenizer_path, tokenizer_config) + + if decoder_model == "phi_4_mini": + with open(runtime_tokenizer_path, "r+") as file: + data = json.load(file) + data["pre_tokenizer"]["pretokenizers"][-2]["invert"] = False + file.seek(0) + json.dump(data, file, indent=4) + file.truncate() + + # on device + # Generate the eval wrapper + device_eval_wrapper = RunnerEvalWrapper( + backend=backend, + soc_model=soc_model, + device=device, + host=host, + pte_path=pte_path, + artifact_dir=artifact_dir, + decoder_model=decoder_model, + tokenizer=tokenizer, + runtime_tokenizer_path=runtime_tokenizer_path, + ) + # Evaluate the model on device + with torch.no_grad(): + device_eval_results = simple_evaluate( + model=device_eval_wrapper, + tasks=["wikitext"], + num_fewshot=None, + limit=1, + ) + + # on host + # Generate the eval wrapper + cpu_eval_wrapper = GraphModuleCalibrationWrapper( + model=module, + tokenizer=tokenizer, + max_seq_length=1024, + ar_len=1, + use_kv_cache=True, + get_example_inputs=module.get_example_inputs, + kv_updater=smart_mask_updater, + use_i64_token=False, + seq_mse_candidates=0, + ) + # Evaluate the model on device + with torch.no_grad(): + cpu_eval_results = simple_evaluate( + model=cpu_eval_wrapper, + tasks=["wikitext"], + num_fewshot=None, + limit=1, + ) + + print("cpu accuracy >") + print(cpu_eval_results["results"]["wikitext"]["word_perplexity,none"]) + print("device accuracy >") + print(device_eval_results["results"]["wikitext"]["word_perplexity,none"]) + + +def get_model_dispatcher(dataset_path, **kwargs): + from transformers import AutoModelForMaskedLM, AutoModelForImageClassification + from executorch.examples.qualcomm.utils import ( + get_imagenet_dataset, get_masked_language_model_dataset + ) + + def get_masked_lm_sample_input(pretrained, data_size=100): + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(pretrained) + return get_masked_language_model_dataset(dataset_path, tokenizer, data_size) + + def get_albert(): + pretrained = "albert/albert-base-v2" + inputs, targets = get_masked_lm_sample_input(pretrained) + module = AutoModelForMaskedLM.from_pretrained(pretrained).eval() + return module, inputs, targets, masked_lm_eval + + def get_bert(): + pretrained = "google-bert/bert-base-uncased" + inputs, targets = get_masked_lm_sample_input(pretrained) + module = AutoModelForMaskedLM.from_pretrained(pretrained).eval() + return module, inputs, targets, masked_lm_eval + + def get_cvt(): + inputs, targets = get_imagenet_dataset(dataset_path, 100, (224, 224)) + module = AutoModelForImageClassification.from_pretrained("microsoft/cvt-13").eval() + return module, inputs, targets, image_classification_eval + + def get_deit(): + inputs, targets = 
get_imagenet_dataset(dataset_path, 100, (224, 224)) + module = AutoModelForImageClassification.from_pretrained("facebook/deit-base-distilled-patch16-224").eval() + return module, inputs, targets, image_classification_eval + + def get_efficientnet(): + inputs, targets = get_imagenet_dataset(dataset_path, 100, (224, 224)) + module = AutoModelForImageClassification.from_pretrained("google/efficientnet-b0").eval() + return module, inputs, targets, image_classification_eval + + def get_eurobert(): + pretrained = "EuroBERT/EuroBERT-210m" + inputs, targets = get_masked_lm_sample_input(pretrained) + module = AutoModelForMaskedLM.from_pretrained(pretrained, trust_remote_code=True).eval() + return module, inputs, targets, masked_lm_eval + + def get_distilbert(): + pretrained = "distilbert/distilbert-base-uncased" + inputs, targets = get_masked_lm_sample_input(pretrained) + module = AutoModelForMaskedLM.from_pretrained(pretrained).eval() + return module, inputs, targets, masked_lm_eval + + def get_dit(): + from executorch.examples.qualcomm.oss_scripts.dit import get_rvlcdip_dataset + inputs, targets = get_rvlcdip_dataset(100) + module = AutoModelForImageClassification.from_pretrained("microsoft/dit-base-finetuned-rvlcdip").eval() + return module, inputs, targets, image_classification_eval + + def get_focalnet(): + inputs, targets = get_imagenet_dataset(dataset_path, 100, (224, 224)) + module = AutoModelForImageClassification.from_pretrained("microsoft/focalnet-tiny").eval() + return module, inputs, targets, image_classification_eval + + def get_mobilevit_v1(): + import executorch.examples.qualcomm.oss_scripts.mobilevit_v1 as mvit1 + inputs, targets = mvit1.get_imagenet_dataset(dataset_path, 100) + module = AutoModelForImageClassification.from_pretrained("apple/mobilevit-xx-small").eval() + return module, inputs, targets, image_classification_eval + + def get_mobilevit_v2(): + import executorch.examples.qualcomm.oss_scripts.mobilevit_v2 as mvit2 + inputs, targets = mvit2.get_imagenet_dataset(dataset_path, 100) + module = AutoModelForImageClassification.from_pretrained("apple/mobilevitv2-1.0-imagenet1k-256").eval() + return module, inputs, targets, image_classification_eval + + def get_pvt(): + inputs, targets = get_imagenet_dataset(dataset_path, 100, (224, 224)) + module = AutoModelForImageClassification.from_pretrained("Zetatech/pvt-tiny-224").eval() + return module, inputs, targets, image_classification_eval + + def get_roberta(): + pretrained = "xlm-roberta-base" + inputs, targets = get_masked_lm_sample_input(pretrained) + module = AutoModelForMaskedLM.from_pretrained(pretrained).eval() + return module, inputs, targets, masked_lm_eval + + def get_swin(): + inputs, targets = get_imagenet_dataset(dataset_path, 100, (224, 224)) + module = AutoModelForImageClassification.from_pretrained("microsoft/swin-tiny-patch4-window7-224").eval() + return module, inputs, targets, image_classification_eval + + def get_t5(): + from executorch.examples.qualcomm.utils import get_seq2seq_dataset_from_squad_csv + from executorch.examples.qualcomm.oss_scripts.t5.t5 import T5 + from transformers import AutoModelForSeq2SeqLM, AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small") + model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small").eval() + max_hidden_seq_length = 384 + max_cache_length = 512 + module = T5( + model, + tokenizer, + max_hidden_seq_length=max_hidden_seq_length, + max_cache_length=max_cache_length, + ) + inputs, targets = get_seq2seq_dataset_from_squad_csv( + 
args.dataset, + tokenizer, + 100, + max_hidden_seq_length=max_hidden_seq_length, + ) + return module, inputs, targets, t5_eval + + def get_whisper(): + from executorch.examples.qualcomm.oss_scripts.whisper.whisper import ( + get_dataset, Whisper + ) + from transformers import AutoModelForSpeechSeq2Seq + module = ( + AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-tiny") + .to("cpu") + .eval() + ) + max_cache_length = 1024 + max_seq_length = 1024 + batch_size = 1 + module = Whisper( + module, + batch_size=batch_size, + max_cache_length=max_cache_length, + max_seq_length=max_seq_length, + ) + inputs, targets = get_dataset(100) + return module, inputs, targets, whisper_eval + + def get_static_llama(decoder_model, decoder_model_config, **kwargs): + import json + from executorch.examples.qualcomm.oss_scripts.llama import LLM_VARIANT_ARCHS + from executorch.examples.qualcomm.oss_scripts.llama.llama import ( + download_and_convert_hf_checkpoint, + ) + from executorch.examples.qualcomm.oss_scripts.llama.model.static_llama import ( + LlamaModel, + ModelArgs, + ) + if "params" in kwargs: + params_path = kwargs["params"] + else: + params_path = decoder_model_config.params_path + with open(params_path) as f: + kv_config = ModelArgs(**json.load(f)) + + kv_config.max_batch_size = 1 + kv_config.max_seq_len = 1024 + kv_config.use_kv_cache = True + kv_config.enable_r3 = decoder_model_config.r3 + kv_config.kv_io_bit_width = decoder_model_config.get_kv_io_bit_width() + kv_config.enable_masked_softmax = decoder_model_config.masked_softmax + + extra_kwargs = {} + if decoder_model == "gemma3-1b": + from transformers import Gemma3Config + + hf_config = Gemma3Config.from_pretrained(decoder_model_config.repo_id) + extra_kwargs["layer_types"] = hf_config.text_config.layer_types + extra_kwargs["rope_local_base_freq"] = ( + hf_config.text_config.rope_local_base_freq + ) + extra_kwargs["sliding_window"] = hf_config.sliding_window + + with torch.device("meta"): + llama_instance = LLM_VARIANT_ARCHS.get( + decoder_model, LlamaModel)( + kv_config, + ar_len=1, + output_new_cache_only=True, + output_cache=True, + use_i64_token=False, + **extra_kwargs, + ) + if "checkpoint" not in kwargs: # HF models + checkpoint = download_and_convert_hf_checkpoint( + decoder_model_config.repo_id, + decoder_model_config.convert_weights.__func__, + ) + state_dict = torch.load( + checkpoint, weights_only=True, map_location="cpu", mmap=True + ) + if decoder_model == "gemma3-1b": + for k, v in state_dict.items(): + if "norm" not in k: + continue + # Llama does x.to(float16) * w whilst Gemma3 is (x * w).to(float16) + # See https://github.com/huggingface/transformers/pull/29402 + state_dict[k] = v.float() + torch.ones(v.shape, dtype=torch.float32) + else: + state_dict = torch.load( + kwargs["checkpoint"], weights_only=True, map_location="cpu", mmap=True + ) + + if decoder_model_config.transform_weight: + # Change to HuggingFace weight to improve the performance of RoPE in HTP backend. 
+ def permute(w, heads): + dim_0 = w.size(0) + dim_1 = w.size(1) + return ( + w.view(heads, dim_0 // heads // 2, 2, dim_1) + .transpose(1, 2) + .reshape(dim_0, dim_1) + ) + + for layer_i in range(llama_instance.n_layers): + state_dict[f"layers.{layer_i}.attention.wq.weight"] = permute( + state_dict[f"layers.{layer_i}.attention.wq.weight"], llama_instance.n_heads + ) + state_dict[f"layers.{layer_i}.attention.wk.weight"] = permute( + state_dict[f"layers.{layer_i}.attention.wk.weight"], llama_instance.n_kv_heads + ) + + llama_instance.load_state_dict(state_dict, strict=True, assign=True) + for layer in llama_instance.layers: + if getattr(layer.attention, "prepare_sha", None): + layer.attention.prepare_sha() + if getattr(layer.feed_forward, "prepare_feedfoward_conv", None): + layer.feed_forward.prepare_feedfoward_conv() + + return llama_instance.to(torch.float32) + + def get_decoder_model(model_name, **kwargs): + from executorch.examples.qualcomm.oss_scripts.llama import SUPPORTED_LLM_MODELS + decoder_model_config = SUPPORTED_LLM_MODELS[model_name] + llama_instance = get_static_llama(model_name, decoder_model_config, **kwargs) + return llama_instance, model_name, decoder_model_config, llm_eval + + def get_qwen2_5_0_5b(): + return get_decoder_model("qwen2_5-0_5b") + + def get_qwen2_5_1_5b(): + return get_decoder_model("qwen2_5-1_5b") + + def get_qwen3_0_6b(): + return get_decoder_model("qwen3-0_6b") + + def get_qwen3_1_7b(): + return get_decoder_model("qwen3-1_7b") + + def get_smollm2_135m(): + return get_decoder_model("smollm2_135m") + + def get_smollm3_3b(): + return get_decoder_model("smollm3-3b") + + def get_phi_4_mini(): + return get_decoder_model("phi_4_mini") + + def get_llama3_2_1b_instruct(params, tokenizer_model, checkpoint): + return get_decoder_model( + "llama3_2-1b_instruct", + params=params, + tokenizer_model=tokenizer_model, + checkpoint=checkpoint + ) + + def get_llama3_2_3b_instruct(params, tokenizer_model, checkpoint): + return get_decoder_model( + "llama3_2-3b_instruct", + params=params, + tokenizer_model=tokenizer_model, + checkpoint=checkpoint + ) + + def get_gemma3_1b(): + return get_decoder_model("gemma3-1b") + + model_dict = { + "albert": get_albert, + "bert": get_bert, + "cvt": get_cvt, + "deit": get_deit, + "dit": get_dit, + "distilbert": get_distilbert, + "efficientnet": get_efficientnet, + "eurobert": get_eurobert, + "focalnet": get_focalnet, + "gemma3-1b": get_gemma3_1b, + "llama3_2-1b_instruct": get_llama3_2_1b_instruct, + "llama3_2-3b_instruct": get_llama3_2_3b_instruct, + "mobilevit_v1": get_mobilevit_v1, + "mobilevit_v2": get_mobilevit_v2, + "phi_4_mini": get_phi_4_mini, + "pvt": get_pvt, + "qwen2_5-0_5b": get_qwen2_5_0_5b, + "qwen2_5-1_5b": get_qwen2_5_1_5b, + "qwen3-0_6b": get_qwen3_0_6b, + "qwen3-1_7b": get_qwen3_1_7b, + "roberta": get_roberta, + "smollm2_135m": get_smollm2_135m, + "smollm3-3b": get_smollm3_3b, + "swin": get_swin, + "t5": get_t5, + "whisper": get_whisper, + } + return model_dict + + +def get_artifacts(backend, pte_path, soc_model, target_model, **kwargs): + from executorch.examples.qualcomm.oss_scripts.llama.decoder_constants import ( + DECODER_MODEL_VERSION, + ) + from executorch.backends.qualcomm.utils.utils import get_soc_to_arch_map + + htp_arch = get_soc_to_arch_map()[soc_model] + + def get_build_dir(backend): + build_dir = { + "qnn": "build-android", + "xnn": "build-xnnpack", + } + return build_dir[backend] + + memory_script = """$@ 2> /dev/null & +PROCESS=$(echo $1 | sed -e 's/^\.\///g') +PEAK_MEM=0 +SAMPLES=0 +TOTAL=0 +while true; do 
+ PID=$(pidof $PROCESS) + if [ "$PID" != "" ]; then + DMA=$(dmabuf_dump $PID | grep "PROCESS TOTAL" | awk '{ print $3 }') + PSS=$(dumpsys meminfo -s $PID | grep "TOTAL PSS" | awk '{ print $3 }') + if [ "$PSS" == "" ]; then + continue + fi + CURRENT=$(($DMA+$PSS)) + if [ CURRENT -gt PEAK_MEM ]; then + PEAK_MEM=$CURRENT + fi + SAMPLES=$(awk -v s="$SAMPLES" 'BEGIN { print s + 1 }') + TOTAL=$(awk -v t="$TOTAL" -v c="$CURRENT" 'BEGIN { print t + c }') + else + break + fi +done +echo "peak_mem: $PEAK_MEM" >> statistics.txt +AVG_MEM=$(awk -v total="$TOTAL" -v samples="$SAMPLES" 'BEGIN { printf "%.3f", total / samples }') +echo "avg_mem: $AVG_MEM" >> statistics.txt + """ + with open(memory_script_file, "w") as f: + f.write(memory_script) + + runner = { + "qnn": f"{get_build_dir(backend)}/examples/qualcomm/executor_runner/qnn_executor_runner", + "xnn": f"{get_build_dir(backend)}/executor_runner", + } + + artifacts = { + "qnn": [ + pte_path, + f"{qnn_sdk}/lib/aarch64-android/libQnnHtp.so", + ( + f"{qnn_sdk}/lib/hexagon-v{htp_arch}/" + f"unsigned/libQnnHtpV{htp_arch}Skel.so" + ), + (f"{qnn_sdk}/lib/aarch64-android/" f"libQnnHtpV{htp_arch}Stub.so"), + f"{qnn_sdk}/lib/aarch64-android/libQnnHtpPrepare.so", + f"{qnn_sdk}/lib/aarch64-android/libQnnSystem.so", + f"{get_build_dir(backend)}/backends/qualcomm/libqnn_executorch_backend.so", + f"{qnn_sdk}/lib/aarch64-android/libQnnModelDlc.so", + runner[backend], + memory_script_file, + ], + "xnn": [ + pte_path, + runner[backend], + memory_script_file, + ], + } + + if target_model in DECODER_MODEL_VERSION: + llm_tokenizer = kwargs.get("tokenizer_model", f"{os.path.dirname(pte_path)}/tokenizer.json") + if backend == "qnn": + artifacts[backend].append(f"{get_build_dir(backend)}/examples/qualcomm/oss_scripts/llama/{backend}_llama_runner") + elif backend == "xnn": + artifacts[backend].append(f"{get_build_dir(backend)}/{backend}_llama_runner") + artifacts[backend].append(llm_tokenizer) + + return artifacts[backend] + + +def get_llm_cmds(backend, pte_path, decoder_model, **kwargs): + common_cmd_args = [ + f"--model_path {os.path.basename(pte_path)}", + "--seq_len 1024", + f"--decoder_model_version {decoder_model}", + f"--tokenizer_path {'tokenizer.model' if 'tokenizer_model' in kwargs else 'tokenizer.json'}", + "--prompt 'I would like to learn python, could you teach me with a simple example?'", + ] + for k, v in kwargs.items(): + common_cmd_args.append(f"{k} {v}") + + cmds_for_inference = ( + " ".join( + [ + f"cd {workspace} &&", + f"./{backend}_llama_runner {' '.join(common_cmd_args)}", + ] + ) + ) + + if backend == "xnn": + common_cmd_args[1] = "--seq_len 100" + cmds_for_memory = ( + " ".join( + [ + f"cd {workspace} &&", + f"chmod +x {memory_script_file} &&", + f"./{memory_script_file} ./{backend}_llama_runner {' '.join(common_cmd_args)}", + ] + ) + ) + return [cmds_for_inference, cmds_for_memory] + + +def get_cmds(backend, pte_path, iteration, method_index, target_model, **kwargs): + from executorch.examples.qualcomm.oss_scripts.llama.decoder_constants import ( + DECODER_MODEL_VERSION, + ) + + if target_model in DECODER_MODEL_VERSION: + return get_llm_cmds( + backend, pte_path, DECODER_MODEL_VERSION[target_model], **kwargs + ) + + cmd_args = { + "qnn": ( + [ + f"--model_path {os.path.basename(pte_path)}", + f"--iteration {iteration}", + f"--method_index {method_index}", + "--dump_statistics", + ] + ), + "xnn": ( + [ + f"--model_path {os.path.basename(pte_path)}", + f"--num_executions {iteration}", + f"--method_index {method_index}", + "--dump_statistics", + ] + 
), + } + cmds_for_inference = { + "qnn": ( + " ".join( + [ + f"cd {workspace} &&", + "chmod +x ./qnn_executor_runner &&", + f"./qnn_executor_runner {' '.join(cmd_args[backend])}", + ] + ) + ), + "xnn": ( + " ".join( + [ + f"cd {workspace} &&", + "chmod +x ./executor_runner &&", + f"./executor_runner {' '.join(cmd_args[backend])}", + ] + ) + ), + } + # do not dump inference metrics during profiling memory + for _, v in cmd_args.items(): + v.pop() + cmds_for_memory = { + "qnn": ( + " ".join( + [ + f"cd {workspace} &&", + "chmod +x ./qnn_executor_runner &&", + f"chmod +x {memory_script_file} &&", + f"./{memory_script_file} ./qnn_executor_runner {' '.join(cmd_args[backend])}", + ] + ) + ), + "xnn": ( + " ".join( + [ + f"cd {workspace} &&", + "chmod +x ./executor_runner &&", + f"chmod +x {memory_script_file} &&", + f"./{memory_script_file} ./executor_runner {' '.join(cmd_args[backend])}", + ] + ) + ), + } + return [cmds_for_inference[backend], cmds_for_memory[backend]] + + +def start_benchmark(artifacts, cmds, device, host): + import tempfile + + def adb(action): + if not host: + actions = ["adb", "-s", device] + else: + actions = ["adb", "-H", host, "-s", device] + actions.extend(action) + subprocess.run(actions, stdout=subprocess.DEVNULL) + + def post_process(): + subprocess.run(["rm", "-rf", perf_file], stdout=subprocess.DEVNULL) + with tempfile.TemporaryDirectory() as tmp_dir: + for file_name in [perf_file]: + adb(["pull", f"{workspace}/{file_name}", tmp_dir]) + with open(f"{tmp_dir}/{file_name}", "r") as f: + print(f.read()) + + adb(["shell", "rm", "-rf", workspace]) + adb(["shell", "mkdir", "-p", workspace]) + for artifact in artifacts: + adb(["push", artifact, workspace]) + for cmd in cmds: + adb(["shell", cmd]) + post_process() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-b", + "--backend", + help="either 'qnn' or 'xnn'", + required=True, + ) + parser.add_argument( + "-p", + "--pte", + help="path to .pte", + required=True, + ) + parser.add_argument( + "-a", + "--artifact", + help="path to generated intermediate artifacts", + ) + parser.add_argument( + "-t", + "--target_model", + help=f"supported targets: {get_model_dispatcher('').keys()}", + required=True, + ) + parser.add_argument( + "-H", + "--host", + help="hostname for adb gateway", + required=False, + ) + parser.add_argument( + "-s", + "--device", + help="serial number for adb device", + required=True, + ) + parser.add_argument( + "-m", + "--soc_model", + help="model name of SoC", + required=True, + ) + parser.add_argument( + "-i", + "--iteration", + help="total number of inferences", + default=100, + ) + parser.add_argument( + "-e", + "--eval", + help="perform e2e evaluation for checking accuracy metrics", + action="store_true", + ) + parser.add_argument( + "-d", + "--dataset", + help="specify dataset path for evaluation", + ) + parser.add_argument( + "--method_index", + help="specify which method to be executed", + default=0, + ) + parser.add_argument( + "--checkpoint", + help="Pass llama checkpoint.", + required=False, + type=str, + ) + parser.add_argument( + "--params", + help="Pass llama params json file.", + required=False, + type=str, + ) + parser.add_argument( + "--tokenizer_model", + help="Pass llama tokenizer model.", + type=str, + default=None, + ) + args = parser.parse_args() + + torch.manual_seed(seed) + np.random.seed(seed) + random.seed(seed) + kwargs = {} + if all([args.params, args.tokenizer_model, args.checkpoint]): + kwargs = { + "params": args.params, + 
"tokenizer_model": args.tokenizer_model, + "checkpoint": args.checkpoint, + } + + if args.eval: + module, inputs, targets, eval_func = get_model_dispatcher( + args.dataset, **kwargs + )[args.target_model](**kwargs) + eval_func( + args.backend, + args.soc_model, + args.device, + args.host, + args.pte, + module, + inputs, + targets, + args.artifact, + **kwargs, + ) + else: + start_benchmark( + artifacts=get_artifacts( + args.backend, args.pte, args.soc_model, args.target_model, **kwargs + ), + cmds=get_cmds( + args.backend, + args.pte, + args.iteration, + args.method_index, + args.target_model, + **kwargs, + ), + device=args.device, + host=args.host, + ) diff --git a/build_xnn.sh b/build_xnn.sh new file mode 100755 index 00000000000..78d0e71d594 --- /dev/null +++ b/build_xnn.sh @@ -0,0 +1,46 @@ +#!/bin/bash + + +if [[ -z $ANDROID_NDK_ROOT ]]; then + echo "Please export ANDROID_NDK_ROOT=/path/to/ndk" + exit -1 +fi + +CLEAN_BUILD="false" +BUILD_FOLDER="build-xnnpack" +BUILD_TYPE="release" + +while [[ "$#" -gt 0 ]]; do + case "$1" in + -c|--clean_build) CLEAN_BUILD="true"; shift;; + -d|--debug) BUILD_TYPE="Debug"; shift;; + *) echo "unknow arg passed: $1"; exit 1;; + esac + shift +done + +if [ "$CLEAN_BUILD" = true ]; then + rm -rf $BUILD_FOLDER +fi + +cmake \ + -DCMAKE_INSTALL_PREFIX=$BUILD_FOLDER \ + -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI='arm64-v8a' \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \ + -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \ + -DSUPPORT_REGEX_LOOKAHEAD=ON \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + -DPYTHON_EXECUTABLE=python \ + -B$BUILD_FOLDER . + +cmake --build $BUILD_FOLDER -j9 --target install --config $BUILD_TYPE diff --git a/examples/portable/executor_runner/executor_runner.cpp b/examples/portable/executor_runner/executor_runner.cpp index 0974e751203..609530d4daa 100644 --- a/examples/portable/executor_runner/executor_runner.cpp +++ b/examples/portable/executor_runner/executor_runner.cpp @@ -18,6 +18,7 @@ * all fp32 tensors. */ +#include #include #include #include @@ -57,7 +58,13 @@ DEFINE_string( output_file, "", "Base name of output file. If not empty output will be written to the file(s)."); +DEFINE_string(input_list_path, "input_list.txt", "Model input list path."); +DEFINE_string( + output_folder_path, + "outputs", + "Executorch inference data output path."); +DEFINE_bool(dump_statistics, false, "Dump inference statistics."); DEFINE_bool( print_all_output, false, @@ -70,6 +77,11 @@ DEFINE_int32( cpu_threads, -1, "Number of CPU threads for inference. Defaults to -1, which implies we'll use a heuristic to derive the # of performant cores for a specific device."); +DEFINE_bool( + shared_buffer, + false, + "Specifies to use shared buffers for zero-copy usecase between the application and device/co-processor associated with the backend."); +DEFINE_uint32(method_index, 0, "Index of methods to be specified."); using executorch::aten::ScalarType; using executorch::aten::Tensor; @@ -251,7 +263,7 @@ int main(int argc, char** argv) { // Use the first method in the program. 
const char* method_name = nullptr; { - const auto method_name_result = program->get_method_name(0); + const auto method_name_result = program->get_method_name(FLAGS_method_index); ET_CHECK_MSG(method_name_result.ok(), "Program has no methods"); method_name = *method_name_result; } @@ -324,11 +336,19 @@ int main(int argc, char** argv) { // be used by a single thread at at time, but it can be reused. // EventTraceManager tracer; + auto before_load = std::chrono::high_resolution_clock::now(); Result method = program->load_method( method_name, &memory_manager, tracer.get_event_tracer(), ptd_data_map.get()); + + auto after_load = std::chrono::high_resolution_clock::now(); + double interval_load = + std::chrono::duration_cast( + after_load - before_load) + .count() / + 1000.0; ET_CHECK_MSG( method.ok(), "Loading of method %s failed with status 0x%" PRIx32, @@ -336,6 +356,148 @@ int main(int argc, char** argv) { (uint32_t)method.error()); ET_LOG(Info, "Method loaded."); + // QCOM change + std::ifstream input_list(FLAGS_input_list_path); + if (input_list.is_open()) { + auto inputs = executorch::extension::prepare_input_tensors(*method); + ET_LOG(Debug, "Preparing inputs."); + ET_CHECK_MSG( + inputs.ok(), + "Could not prepare inputs: 0x%" PRIx32, + (uint32_t)inputs.error()); + ET_LOG(Debug, "Inputs prepared."); + + size_t num_inputs = method->inputs_size(); + ET_LOG(Info, "Number of inputs: %zu", num_inputs); + + auto split = [](std::string s, std::string delimiter) { + size_t pos_start = 0, pos_end, delim_len = delimiter.length(); + std::string token; + std::vector res; + + while ((pos_end = s.find(delimiter, pos_start)) != std::string::npos) { + token = s.substr(pos_start, pos_end - pos_start); + pos_start = pos_end + delim_len; + res.push_back(token); + } + res.push_back(s.substr(pos_start)); + return res; + }; + + std::string file_path; + int inference_index = 0; + double elapsed_time = 0; + while (std::getline(input_list, file_path)) { + auto input_files = split(file_path, " "); + if (input_files.size() == 0) { + break; + } + ET_CHECK_MSG( + input_files.size() == num_inputs, + "Number of inputs (%zu) mismatch with input files (%zu)", + num_inputs, + input_files.size()); + + std::vector> input_buf(num_inputs); + for (int input_index = 0; input_index < num_inputs; ++input_index) { + MethodMeta method_meta = method->method_meta(); + Result tensor_meta = + method_meta.input_tensor_meta(input_index); + + std::ifstream fin(input_files[input_index], std::ios::binary); + fin.seekg(0, fin.end); + size_t file_size = fin.tellg(); + + input_buf[input_index].resize(file_size); + fin.seekg(0, fin.beg); + fin.read( + static_cast(input_buf[input_index].data()), + file_size); + fin.close(); + + ET_CHECK_MSG( + file_size == tensor_meta->nbytes(), + "Input(%d) size mismatch. 
file bytes: %zu, tensor bytes: %zu", + input_index, + file_size, + tensor_meta->nbytes()); + + auto impl = executorch::aten::TensorImpl( + tensor_meta->scalar_type(), + /*dim=*/tensor_meta->sizes().size(), + const_cast(tensor_meta->sizes().data()), + input_buf[input_index].data(), + const_cast( + tensor_meta->dim_order().data())); + Error ret = method->set_input(executorch::aten::Tensor(&impl), input_index); + ET_CHECK_MSG( + ret == Error::Ok, "Failed to set input tensor: %d", (int)ret); + } + Error status = method->execute(); + std::vector outputs(method->outputs_size()); + status = method->get_outputs(outputs.data(), method->outputs_size()); + ET_CHECK(status == Error::Ok); + for (size_t output_index = 0; output_index < method->outputs_size(); + output_index++) { + auto output_tensor = outputs[output_index].toTensor(); + size_t nbytes = output_tensor.nbytes(); + auto output_file_name = FLAGS_output_folder_path + "/output_" + + std::to_string(inference_index) + "_" + + std::to_string(output_index) + ".raw"; + std::ofstream fout(output_file_name.c_str(), std::ios::binary); + fout.write(output_tensor.const_data_ptr(), nbytes); + fout.close(); + } + ++inference_index; + } + return 0; + } else { + et_timestamp_t time_spent_executing = 0, time_spent_executing_1st = 0; + auto inputs = executorch::extension::prepare_input_tensors(*method); + ET_LOG(Info, "Preparing inputs."); + ET_CHECK_MSG( + inputs.ok(), + "Could not prepare inputs: 0x%" PRIx32, + (uint32_t)inputs.error()); + ET_LOG(Info, "Inputs prepared."); + + auto before_exec = std::chrono::high_resolution_clock::now(); + Error status = method->execute(); + auto after_exec = std::chrono::high_resolution_clock::now(); + double interval_1st_infs = + std::chrono::duration_cast( + after_exec - before_exec) + .count() / + 1000.0; + + before_exec = std::chrono::high_resolution_clock::now(); + for (uint32_t i = 0; i < FLAGS_num_executions; i++) { + status = method->execute(); + ET_CHECK_MSG( + status == Error::Ok, + "Execution of method %s failed with status 0x%" PRIx32, + method_name, + (uint32_t)status); + } + after_exec = std::chrono::high_resolution_clock::now(); + double interval_infs = std::chrono::duration_cast( + after_exec - before_exec) + .count() / + 1000.0 / FLAGS_num_executions; + + if (FLAGS_dump_statistics) { + auto output_file_name = "statistics.txt"; + std::ofstream fout(output_file_name); + fout << "load: " + std::to_string(interval_load) + << "\n1st: " + std::to_string(interval_1st_infs) + << "\navg: " + std::to_string(interval_infs) << std::endl; + fout.close(); + } + ET_LOG(Info, "Model executed successfully."); + return 0; + } + // QCOM change end + et_timestamp_t time_spent_executing = 0; // Run the model. for (uint32_t i = 0; i < FLAGS_num_executions; i++) { diff --git a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp index 47f9f0cfb38..50d81129651 100644 --- a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp +++ b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp @@ -64,6 +64,8 @@ DEFINE_bool( false, "Dump intermediate outputs to etdump file."); +DEFINE_bool(dump_statistics, false, "Dump inference statistics."); + DEFINE_string( debug_output_path, "debug_output.bin", @@ -303,6 +305,7 @@ int main(int argc, char** argv) { // be used by a single thread at at time, but it can be reused. 
// ETDumpGen etdump_gen; + auto before_load = std::chrono::high_resolution_clock::now(); Result method = program->load_method(method_name, &memory_manager, &etdump_gen); ET_CHECK_MSG( @@ -310,6 +313,12 @@ int main(int argc, char** argv) { "Loading of method %s failed with status 0x%" PRIx32, method_name, (int)method.error()); + auto after_load = std::chrono::high_resolution_clock::now(); + double interval_load = + std::chrono::duration_cast( + after_load - before_load) + .count() / + 1000.0; ET_LOG(Info, "Method loaded."); void* debug_buffer; @@ -570,12 +579,19 @@ int main(int argc, char** argv) { "Input list not provided. Inputs prepared with default values set."); // Run the method + auto before_exec = std::chrono::high_resolution_clock::now(); Error status = method->execute(); ET_CHECK_MSG( status == Error::Ok, "Execution of method %s failed with status 0x%" PRIx32, method_name, (int)status); + auto after_exec = std::chrono::high_resolution_clock::now(); + double interval_1st_infs = + std::chrono::duration_cast( + after_exec - before_exec) + .count() / + 1000.0; ET_LOG(Info, "Model executed successfully."); // Warm up @@ -585,23 +601,33 @@ int main(int argc, char** argv) { } // Inference with designated iterations - auto before_exec = std::chrono::high_resolution_clock::now(); + before_exec = std::chrono::high_resolution_clock::now(); for (int i = 0; i < FLAGS_iteration; ++i) { status = method->execute(); } - auto after_exec = std::chrono::high_resolution_clock::now(); + after_exec = std::chrono::high_resolution_clock::now(); double interval_infs = std::chrono::duration_cast( after_exec - before_exec) .count() / 1000.0; + auto avg_infs = interval_infs / (float)FLAGS_iteration; ET_LOG( Info, "%d inferences took %f ms, avg %f ms", FLAGS_iteration, interval_infs, - interval_infs / (float)FLAGS_iteration); + avg_infs); + + if (FLAGS_dump_statistics) { + auto output_file_name = "statistics.txt"; + std::ofstream fout(output_file_name); + fout << "load: " + std::to_string(interval_load) + << "\n1st: " + std::to_string(interval_1st_infs) + << "\navg: " + std::to_string(avg_infs) << std::endl; + fout.close(); + } } // Dump the etdump data containing profiling/debugging data to the specified diff --git a/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.cpp b/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.cpp index f0cc6d9a7a2..65d560c52c5 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.cpp @@ -9,9 +9,11 @@ #include #include #include + using executorch::runtime::MemoryAllocator; using executorch::runtime::TensorInfo; + namespace example { RpcMem::RpcMem( const size_t total_cache_size, @@ -20,11 +22,20 @@ RpcMem::RpcMem( : calculated_offsets_(0) { size_t total_bytes = total_cache_size + total_prompt_processor_io_size + total_token_generator_io_size; +# ifndef XNNPACK shared_buffer_base_ptr_ = QnnExecuTorchAllocCustomMem( total_bytes, MemoryAllocator::kDefaultAlignment); +# else + shared_buffer_base_ptr_ = + new char[total_bytes + MemoryAllocator::kDefaultAlignment]; +# endif } RpcMem::~RpcMem() { +# ifndef XNNPACK QnnExecuTorchFreeCustomMem(shared_buffer_base_ptr_); +# else + delete shared_buffer_base_ptr_; +# endif } std::byte* RpcMem::allocate(size_t data_size) { @@ -57,7 +68,9 @@ void RpcMem::add_memory_info( shape, rank, scalar_type}; +# ifndef XNNPACK QnnExecuTorchAddCustomMemTensorInfo(info); +# endif }; } // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp 
b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp index 0c4884bbccf..c4145a23cf0 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp @@ -163,8 +163,9 @@ Error Runner::load() { std::vector method_names; switch (eval_mode_) { case EvalMode::kKVCached: - prompt_processor_method_name = "forward"; - token_generator_method_name = "forward"; + // workaround for benchmark + prompt_processor_method_name = "kv_forward"; + token_generator_method_name = "kv_forward"; method_names.emplace_back(token_generator_method_name); break; case EvalMode::kHybrid: diff --git a/examples/qualcomm/oss_scripts/t5/t5.py b/examples/qualcomm/oss_scripts/t5/t5.py index 093572f032a..7c126285968 100644 --- a/examples/qualcomm/oss_scripts/t5/t5.py +++ b/examples/qualcomm/oss_scripts/t5/t5.py @@ -217,7 +217,6 @@ def main(args): tokenizer, data_size, max_hidden_seq_length=max_hidden_seq_length, - shuffle=False, ) if not args.pre_gen_pte: diff --git a/examples/qualcomm/oss_scripts/t5/t5_model.py b/examples/qualcomm/oss_scripts/t5/t5_model.py index 0593feaa8b8..620e8f2cbb0 100644 --- a/examples/qualcomm/oss_scripts/t5/t5_model.py +++ b/examples/qualcomm/oss_scripts/t5/t5_model.py @@ -620,12 +620,12 @@ def evaluate_with_ground_truth( predicted_texts = [] target_texts = [] for i, (pred, tar) in tqdm(enumerate(zip(predicts, targets))): - predicted_texts.append(pred) target_texts.append(tokenizer.decode(tar, skip_special_tokens=True)) - print(f"Show {i}/{len(predicts)} result:") - print(f"\tPrediction: {pred}") - print(f"\tTarget: {target_texts[i]}") + #print(f"Show {i}/{len(predicts)} result:") + #print(f"\tPrediction: {pred}") + #print(f"\tTarget: {target_texts[i]}") + results = metrics(predicted_texts, target_texts) print("F1 Score:", results["f1"]) diff --git a/xnn_llama_runner.patch b/xnn_llama_runner.patch new file mode 100644 index 00000000000..7ddb4972642 --- /dev/null +++ b/xnn_llama_runner.patch @@ -0,0 +1,625 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 7012ec641..29dea224a 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -1049,6 +1049,102 @@ if(EXECUTORCH_BUILD_EXECUTOR_RUNNER) + target_link_libraries(executor_runner ${_executor_runner_libs}) + target_compile_options(executor_runner PUBLIC ${_common_compile_options}) + ++ set(QC_EXAMPLE_SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/examples/qualcomm/) ++ set(_xnn_t5_runner__srcs ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/t5/qnn_t5_runner.cpp ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/t5/runner/decoder.cpp ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/t5/runner/decoder.h ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/t5/runner/encoder.cpp ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/t5/runner/encoder.h ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/t5/runner/runner.cpp ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/t5/runner/runner.h ++ ${CMAKE_CURRENT_LIST_DIR}/extension/llm/sampler/sampler.cpp ++ ) ++ add_executable(xnn_t5_runner ${_xnn_t5_runner__srcs}) ++ ++ set(_xnn_whisper_runner__srcs ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/whisper/qnn_whisper_runner.cpp ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/whisper/runner/decoder.cpp ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/whisper/runner/decoder.h ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/whisper/runner/encoder.cpp ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/whisper/runner/encoder.h ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/whisper/runner/runner.cpp ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/whisper/runner/runner.h ++ ${CMAKE_CURRENT_LIST_DIR}/extension/llm/sampler/sampler.cpp ++ ) ++ 
add_executable(xnn_whisper_runner ${_xnn_whisper_runner__srcs}) ++ ++ set(_xnn_llama_runner__srcs ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/llama/qnn_llama_runner.cpp ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/llama/runner/runner.cpp ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/llama/runner/runner.h ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/llama/runner/cache_utils.h ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/llama/runner/decoder_runner.cpp ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/llama/runner/decoder_runner.h ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/llama/runner/prompt_processor.cpp ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/llama/runner/prompt_processor.h ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/llama/runner/token_generator.cpp ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/llama/runner/token_generator.h ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/llama/runner/imem_alloc.h ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/llama/runner/client_mem.h ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/llama/runner/lhd_token_generator.cpp ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/llama/runner/lhd_token_generator.h ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/llama/runner/rpc_mem.cpp ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/llama/runner/rpc_mem.h ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/llama/runner/kv_manager.cpp ++ ${QC_EXAMPLE_SOURCE_DIR}/oss_scripts/llama/runner/kv_manager.h ++ ${CMAKE_CURRENT_LIST_DIR}/examples/models/llama/runner/runner.cpp ++ ${CMAKE_CURRENT_LIST_DIR}/examples/models/llama/runner/runner.h ++ ) ++ ++ target_link_libraries(xnn_t5_runner ++ ${_executor_runner_libs} ++ extension_data_loader ++ extension_flat_tensor ++ extension_llm_runner ++ extension_module ++ extension_tensor ++ gflags ++ tokenizers::tokenizers ++ ) ++ target_compile_options(xnn_t5_runner PUBLIC ${_common_compile_options}) ++ ++ target_link_libraries(xnn_whisper_runner ++ ${_executor_runner_libs} ++ extension_data_loader ++ extension_flat_tensor ++ extension_llm_runner ++ extension_module ++ extension_tensor ++ gflags ++ tokenizers::tokenizers ++ ) ++ target_compile_options(xnn_whisper_runner PUBLIC ${_common_compile_options}) ++ ++ add_definitions(-DXNNPACK) ++ add_executable(xnn_llama_runner ${_xnn_llama_runner__srcs}) ++ target_include_directories( ++ xnn_llama_runner PUBLIC ${_common_include_directories} ++ ) ++ executorch_target_link_options_shared_lib(quantized_ops_lib) ++ target_link_libraries(xnn_llama_runner ++ ${_executor_runner_libs} ++ executorch_core ++ extension_data_loader ++ extension_flat_tensor ++ extension_llm_runner ++ extension_module ++ extension_tensor ++ gflags ++ quantized_ops_lib ++ quantized_kernels ++ tokenizers::tokenizers ++ ) ++ target_include_directories( ++ xnn_llama_runner PUBLIC ${EXECUTORCH_ROOT}/extension/llm/tokenizers/include ++ ) ++ target_compile_options(xnn_llama_runner PUBLIC ${_common_compile_options}) ++ + # Automatically set when using `emcmake cmake` for Wasm build. + if(EMSCRIPTEN) + # Directory of model pte files to embed in the wasm binary. 
+diff --git a/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp b/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp +index 71eaea2b8..bab8664a5 100644 +--- a/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp ++++ b/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp +@@ -266,6 +266,8 @@ int main(int argc, char** argv) { + start_runner(std::move(module), prompts); + } else if (kv_bitwidth == example::KvBitWidth::kWidth16) { + start_runner(std::move(module), prompts); ++ } else if (kv_bitwidth == example::KvBitWidth::kWidth32) { ++ start_runner(std::move(module), prompts); + } else { + ET_CHECK_MSG( + false, +diff --git a/examples/qualcomm/oss_scripts/llama/runner/decoder_runner.h b/examples/qualcomm/oss_scripts/llama/runner/decoder_runner.h +index 888e9acd4..5d9384512 100644 +--- a/examples/qualcomm/oss_scripts/llama/runner/decoder_runner.h ++++ b/examples/qualcomm/oss_scripts/llama/runner/decoder_runner.h +@@ -56,7 +56,7 @@ class DecoderRunner { + inline int32_t logits_to_token( + const executorch::aten::Tensor& logits_tensor, + int64_t pos) { +- auto* logits = logits_tensor.mutable_data_ptr(); ++ auto* logits = logits_tensor.mutable_data_ptr(); + auto num_tokens = logits_tensor.size(1); + auto vocab_size = logits_tensor.size(2); + static std::vector logits_f(vocab_size); +diff --git a/examples/qualcomm/oss_scripts/llama/runner/kv_manager.cpp b/examples/qualcomm/oss_scripts/llama/runner/kv_manager.cpp +index bd6d27d4b..72781139c 100644 +--- a/examples/qualcomm/oss_scripts/llama/runner/kv_manager.cpp ++++ b/examples/qualcomm/oss_scripts/llama/runner/kv_manager.cpp +@@ -48,7 +48,7 @@ KVManager::KVManager(KVManagerMode kv_updater, Metadata metadata) + + template + void KVManager::init_attention_mask( +- uint16_t* attention_mask, ++ float* attention_mask, + const std::vector& attention_map, + int32_t ar_len, + int32_t n_past) { +@@ -57,16 +57,16 @@ void KVManager::init_attention_mask( + "The size of attention_map (%zu) doesn't match with ar_len (%d)", + attention_map.size(), + ar_len); +- uint16_t neg_val = 0; +- uint16_t pos_val = 65535; ++ float neg_val = -1e9f; ++ float pos_val = 0.0f; + // Clear the attention mask + std::fill_n(attention_mask, ar_len * metadata_.context_len, neg_val); + + // SMART_MASK requires special handling of attention mask + switch (kv_updater_) { + case KVManagerMode::SMART_MASK: { +- uint16_t* past_ptr = attention_mask; +- uint16_t* new_ptr = attention_mask + (metadata_.context_len - ar_len); ++ float* past_ptr = attention_mask; ++ float* new_ptr = attention_mask + (metadata_.context_len - ar_len); + // All inputs will necessarily attend to n_past and itself + for (int i = 0; i < ar_len; i++) { + // Iterate across ar_len +@@ -77,9 +77,9 @@ void KVManager::init_attention_mask( + // If positive, copy attention map from (relative to 0th input) parent + // Parent token index + const int32_t pidx = attention_map[i]; +- uint16_t* parent_ptr = attention_mask + pidx * metadata_.context_len; ++ float* parent_ptr = attention_mask + pidx * metadata_.context_len; + std::memcpy( +- past_ptr, parent_ptr, metadata_.context_len * sizeof(uint16_t)); ++ past_ptr, parent_ptr, metadata_.context_len * sizeof(float)); + } + // Attend to itself + new_ptr[i] = pos_val; +@@ -92,7 +92,7 @@ void KVManager::init_attention_mask( + // Only fill in ar_len. 
Rest will be padding + const size_t attn_row_start = metadata_.context_len - n_past - ar_len; + for (int i = 0; i < ar_len; i++) { +- uint16_t* cur_ptr = ++ float* cur_ptr = + attention_mask + i * metadata_.context_len + attn_row_start; + // Attend to itself + cur_ptr[n_past + i] = pos_val; +@@ -103,10 +103,10 @@ void KVManager::init_attention_mask( + // If positive, copy attention map from (relative to 0th input) parent + // Parent token index + const int32_t pidx = attention_map[i]; +- uint16_t* parent_ptr = ++ float* parent_ptr = + attention_mask + pidx * metadata_.context_len + attn_row_start; + std::memcpy( +- cur_ptr, parent_ptr, (n_past + pidx + 1) * sizeof(uint16_t)); ++ cur_ptr, parent_ptr, (n_past + pidx + 1) * sizeof(float)); + } + } + break; +@@ -118,7 +118,7 @@ void KVManager::init_attention_mask( + + template + void KVManager::init_attention_mask( +- uint16_t* attention_mask, ++ float* attention_mask, + const std::vector& attention_map, + int32_t ar_len, + int32_t n_past, +@@ -129,16 +129,16 @@ void KVManager::init_attention_mask( + "The size of attention_map (%zu) doesn't match with ar_len (%d)", + attention_map.size(), + ar_len); +- uint16_t neg_val = 0; +- uint16_t pos_val = 65535; ++ float neg_val = -1e9f; ++ float pos_val = 0.0f; + // Clear the attention mask + std::fill_n(attention_mask, ar_len * metadata_.context_len, neg_val); + + // SMART_MASK requires special handling of attention mask + switch (kv_updater_) { + case KVManagerMode::SMART_MASK: { +- uint16_t* past_ptr = attention_mask; +- uint16_t* new_ptr = attention_mask + (metadata_.context_len - ar_len); ++ float* past_ptr = attention_mask; ++ float* new_ptr = attention_mask + (metadata_.context_len - ar_len); + // All inputs will necessarily attend to n_past and itself + for (int i = 0; i < ar_len; i++) { + // Iterate across ar_len +@@ -149,9 +149,9 @@ void KVManager::init_attention_mask( + // If positive, copy attention map from (relative to 0th input) parent + // Parent token index + const int32_t pidx = attention_map[i]; +- uint16_t* parent_ptr = attention_mask + pidx * metadata_.context_len; ++ float* parent_ptr = attention_mask + pidx * metadata_.context_len; + std::memcpy( +- past_ptr, parent_ptr, metadata_.context_len * sizeof(uint16_t)); ++ past_ptr, parent_ptr, metadata_.context_len * sizeof(float)); + } + // Attend to itself + new_ptr[i] = pos_val; +@@ -172,7 +172,7 @@ void KVManager::init_attention_mask( + // Only fill in ar_len. 
Rest will be padding + const size_t attn_row_start = metadata_.context_len - n_past - ar_len; + for (int i = 0; i < ar_len; i++) { +- uint16_t* cur_ptr = ++ float* cur_ptr = + attention_mask + i * metadata_.context_len + attn_row_start; + // Attend to itself + cur_ptr[n_past + i] = pos_val; +@@ -183,10 +183,10 @@ void KVManager::init_attention_mask( + // If positive, copy attention map from (relative to 0th input) parent + // Parent token index + const int32_t pidx = attention_map[i]; +- uint16_t* parent_ptr = ++ float* parent_ptr = + attention_mask + pidx * metadata_.context_len + attn_row_start; + std::memcpy( +- cur_ptr, parent_ptr, (n_past + pidx + 1) * sizeof(uint16_t)); ++ cur_ptr, parent_ptr, (n_past + pidx + 1) * sizeof(float)); + } + } + break; +@@ -198,12 +198,12 @@ void KVManager::init_attention_mask( + + template + void KVManager::update_attention_mask( +- uint16_t* attention_mask, ++ float* attention_mask, + int32_t ar_len, + int32_t n_past, + int32_t n_update) { +- uint16_t pos_val = 65535; +- uint16_t* cur_ptr = attention_mask; ++ float pos_val = 0.0f; ++ float* cur_ptr = attention_mask; + if (kv_updater_ == KVManagerMode::SMART_MASK) + cur_ptr += n_past; + if (kv_updater_ == KVManagerMode::SHIFT_POINTER) +@@ -217,15 +217,15 @@ void KVManager::update_attention_mask( + + template + void KVManager::update_attention_mask( +- uint16_t* attention_mask, ++ float* attention_mask, + int32_t ar_len, + int32_t n_past, + int32_t n_update, + int32_t sliding_window, + const std::vector& position_offset) { +- uint16_t pos_val = 65535; +- uint16_t neg_val = 0; +- uint16_t* cur_ptr = attention_mask; ++ float pos_val = 0.0f; ++ float neg_val = -1e9f; ++ float* cur_ptr = attention_mask; + if (kv_updater_ == KVManagerMode::SMART_MASK) + cur_ptr += n_past; + if (kv_updater_ == KVManagerMode::SHIFT_POINTER) +@@ -544,6 +544,7 @@ void KVManager::update_value( + } + + // Explicit instantiations ++template class KVManager; + template class KVManager; + template class KVManager; + +diff --git a/examples/qualcomm/oss_scripts/llama/runner/kv_manager.h b/examples/qualcomm/oss_scripts/llama/runner/kv_manager.h +index af9cf49a3..2b2563b8e 100644 +--- a/examples/qualcomm/oss_scripts/llama/runner/kv_manager.h ++++ b/examples/qualcomm/oss_scripts/llama/runner/kv_manager.h +@@ -73,7 +73,7 @@ class KVManager { + * @param n_past Number of past elements in the cache. + */ + void init_attention_mask( +- uint16_t* attention_mask, ++ float* attention_mask, + const std::vector& attention_map, + int32_t ar_len, + int32_t n_past); +@@ -100,7 +100,7 @@ class KVManager { + * @param position_offset (optional) attention mask position offset of + */ + void init_attention_mask( +- uint16_t* attention_mask, ++ float* attention_mask, + const std::vector& attention_map, + int32_t ar_len, + int32_t n_past, +@@ -116,7 +116,7 @@ class KVManager { + * @param n_update Number of elements to be updated. 
+ */ + void update_attention_mask( +- uint16_t* attention_mask, ++ float* attention_mask, + int32_t ar_len, + int32_t n_past, + int32_t n_update); +@@ -134,7 +134,7 @@ class KVManager { + * lookahead decoder + */ + void update_attention_mask( +- uint16_t* attention_mask, ++ float* attention_mask, + int32_t ar_len, + int32_t n_past, + int32_t n_update, +diff --git a/examples/qualcomm/oss_scripts/llama/runner/lhd_token_generator.cpp b/examples/qualcomm/oss_scripts/llama/runner/lhd_token_generator.cpp +index 1692caa27..2f594eb16 100644 +--- a/examples/qualcomm/oss_scripts/llama/runner/lhd_token_generator.cpp ++++ b/examples/qualcomm/oss_scripts/llama/runner/lhd_token_generator.cpp +@@ -398,5 +398,6 @@ Result LhdTokenGenerator::generate( + // Explicit instantiations + template class LhdTokenGenerator; + template class LhdTokenGenerator; ++template class LhdTokenGenerator; + + } // namespace example +diff --git a/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.cpp b/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.cpp +index 73da764b5..ab5731e89 100644 +--- a/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.cpp ++++ b/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.cpp +@@ -39,21 +39,21 @@ PromptProcessor::PromptProcessor( + switch (metadata_.cache_mode) { + case CacheMode::StaticCahce: + attention_mask_.size = +- metadata_.ar_len * metadata_.context_len * sizeof(uint16_t); ++ metadata_.ar_len * metadata_.context_len * sizeof(float); + window_attention_mask_.size = 0; + break; + case CacheMode::HybridCache: + attention_mask_.size = +- metadata_.ar_len * metadata_.context_len * sizeof(uint16_t); ++ metadata_.ar_len * metadata_.context_len * sizeof(float); + window_attention_mask_.size = +- metadata_.ar_len * metadata_.context_len * sizeof(uint16_t); ++ metadata_.ar_len * metadata_.context_len * sizeof(float); + break; + default: + ET_CHECK_MSG(false, "Unsupported llama cache mode"); + break; + } + +- logits_.size = metadata_.ar_len * metadata_.vocab_size * sizeof(uint16_t); ++ logits_.size = metadata_.ar_len * metadata_.vocab_size * sizeof(float); + }; + template + void PromptProcessor::init_io( +@@ -78,7 +78,7 @@ void PromptProcessor::init_io( + + // [I]: attention_mask + Result attention_mask = method_meta->input_tensor_meta(idx++); +- attention_mask_.data = reinterpret_cast( ++ attention_mask_.data = reinterpret_cast( + buffer_manager->allocate(attention_mask_.size)); + attention_mask_.tensor = std::make_unique( + attention_mask->scalar_type(), +@@ -95,7 +95,7 @@ void PromptProcessor::init_io( + if (metadata_.cache_mode == CacheMode::HybridCache) { + Result window_attention_mask = + method_meta->input_tensor_meta(idx++); +- window_attention_mask_.data = reinterpret_cast( ++ window_attention_mask_.data = reinterpret_cast( + buffer_manager->allocate(window_attention_mask_.size)); + window_attention_mask_.tensor = std::make_unique( + window_attention_mask->scalar_type(), +@@ -159,7 +159,7 @@ void PromptProcessor::init_io( + // [O]: logits + Result logits = method_meta->output_tensor_meta(0); + logits_.data = +- reinterpret_cast(buffer_manager->allocate(logits_.size)); ++ reinterpret_cast(buffer_manager->allocate(logits_.size)); + logits_.tensor = std::make_unique( + logits->scalar_type(), + logits->sizes().size(), +@@ -202,7 +202,7 @@ void PromptProcessor::init_io( + } + + template +-const std::vector& PromptProcessor::get_all_logits() { ++const std::vector& PromptProcessor::get_all_logits() { + return prompt_all_logits_; + } + +@@ -347,5 +347,6 @@ 
Result PromptProcessor::prefill( + // Explicit instantiations + template class PromptProcessor; + template class PromptProcessor; ++template class PromptProcessor; + + } // namespace example +diff --git a/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h b/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h +index a3dd20794..c375d0a6f 100644 +--- a/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h ++++ b/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h +@@ -54,7 +54,7 @@ class PromptProcessor { + * + * @return std::vector& all the logits generated + */ +- virtual const std::vector& get_all_logits(); ++ virtual const std::vector& get_all_logits(); + + /** + * Prefill an LLM Module with the given text input. +@@ -110,9 +110,9 @@ class PromptProcessor { + // inputs and outputs + TensorStruct input_toks_; + TensorStruct input_pos_; +- TensorStruct attention_mask_; +- TensorStruct window_attention_mask_; +- TensorStruct logits_; ++ TensorStruct attention_mask_; ++ TensorStruct window_attention_mask_; ++ TensorStruct logits_; + + // layer -> head -> TensorImpl + std::vector>> +@@ -129,6 +129,6 @@ class PromptProcessor { + std::vector output_tensors_; + + // Unused by default, only used when dump_logits_path is provided. +- std::vector prompt_all_logits_; ++ std::vector prompt_all_logits_; + }; + } // namespace example +diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp +index 709ad3cfa..31b3b1afd 100644 +--- a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp ++++ b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp +@@ -65,17 +65,17 @@ void print_performance_report( + + void save_logits( + const std::string& dump_logits_path, +- const std::vector& prefill_logits, +- const std::vector& decode_logits) { ++ const std::vector& prefill_logits, ++ const std::vector& decode_logits) { + std::ofstream outFile(dump_logits_path.c_str(), std::ios::binary); + if (outFile.is_open()) { + outFile.write( + reinterpret_cast(prefill_logits.data()), +- prefill_logits.size() * sizeof(uint16_t)); ++ prefill_logits.size() * sizeof(float)); + + outFile.write( + reinterpret_cast(decode_logits.data()), +- decode_logits.size() * sizeof(uint16_t)); ++ decode_logits.size() * sizeof(float)); + outFile.close(); + } else { + ET_CHECK_MSG(false, "Error saving the dump logits file"); +@@ -478,5 +478,6 @@ Result Runner::get_decoder_model_version() { + // Explicit instantiations + template class Runner; + template class Runner; ++template class Runner; + + } // namespace example +diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.h b/examples/qualcomm/oss_scripts/llama/runner/runner.h +index 9f290d79c..160529f44 100644 +--- a/examples/qualcomm/oss_scripts/llama/runner/runner.h ++++ b/examples/qualcomm/oss_scripts/llama/runner/runner.h +@@ -43,6 +43,7 @@ enum DecoderModelVersion { + enum KvBitWidth { + kWidth8 = 8, + kWidth16 = 16, ++ kWidth32 = 32, + }; + + template +diff --git a/examples/qualcomm/oss_scripts/llama/runner/token_generator.cpp b/examples/qualcomm/oss_scripts/llama/runner/token_generator.cpp +index 6775c08bd..c7b786a0d 100644 +--- a/examples/qualcomm/oss_scripts/llama/runner/token_generator.cpp ++++ b/examples/qualcomm/oss_scripts/llama/runner/token_generator.cpp +@@ -39,26 +39,26 @@ TokenGenerator::TokenGenerator( + input_toks_.size = metadata_.ar_len * sizeof(int64_t); + input_pos_.size = metadata_.ar_len * sizeof(int32_t); + attention_mask_.size = +- metadata_.ar_len * 
metadata_.context_len * sizeof(uint16_t); ++ metadata_.ar_len * metadata_.context_len * sizeof(float); + + switch (metadata_.cache_mode) { + case CacheMode::StaticCahce: + attention_mask_.size = +- metadata_.ar_len * metadata_.context_len * sizeof(uint16_t); ++ metadata_.ar_len * metadata_.context_len * sizeof(float); + window_attention_mask_.size = 0; + break; + case CacheMode::HybridCache: + attention_mask_.size = +- metadata_.ar_len * metadata_.context_len * sizeof(uint16_t); ++ metadata_.ar_len * metadata_.context_len * sizeof(float); + window_attention_mask_.size = +- metadata_.ar_len * metadata_.context_len * sizeof(uint16_t); ++ metadata_.ar_len * metadata_.context_len * sizeof(float); + break; + default: + ET_CHECK_MSG(false, "Unsupported llama cache mode"); + break; + } + +- logits_.size = metadata_.ar_len * metadata_.vocab_size * sizeof(uint16_t); ++ logits_.size = metadata_.ar_len * metadata_.vocab_size * sizeof(float); + } + template + void TokenGenerator::init_io( +@@ -83,7 +83,7 @@ void TokenGenerator::init_io( + + // [I]: attention_mask + Result attention_mask = method_meta->input_tensor_meta(idx++); +- attention_mask_.data = reinterpret_cast( ++ attention_mask_.data = reinterpret_cast( + buffer_manager->allocate(attention_mask_.size)); + attention_mask_.tensor = std::make_unique( + attention_mask->scalar_type(), +@@ -100,7 +100,7 @@ void TokenGenerator::init_io( + if (metadata_.cache_mode == CacheMode::HybridCache) { + Result window_attention_mask = + method_meta->input_tensor_meta(idx++); +- window_attention_mask_.data = reinterpret_cast( ++ window_attention_mask_.data = reinterpret_cast( + buffer_manager->allocate(window_attention_mask_.size)); + window_attention_mask_.tensor = std::make_unique( + window_attention_mask->scalar_type(), +@@ -162,7 +162,7 @@ void TokenGenerator::init_io( + // [O]: logits + Result logits = method_meta->output_tensor_meta(0); + logits_.data = +- reinterpret_cast(buffer_manager->allocate(logits_.size)); ++ reinterpret_cast(buffer_manager->allocate(logits_.size)); + logits_.tensor = std::make_unique( + logits->scalar_type(), + logits->sizes().size(), +@@ -205,7 +205,7 @@ void TokenGenerator::init_io( + } + + template +-const std::vector& TokenGenerator::get_all_logits() { ++const std::vector& TokenGenerator::get_all_logits() { + return token_all_logits_; + } + +@@ -328,5 +328,6 @@ Result TokenGenerator::generate( + // Explicit instantiations + template class TokenGenerator; + template class TokenGenerator; ++template class TokenGenerator; + + } // namespace example +diff --git a/examples/qualcomm/oss_scripts/llama/runner/token_generator.h b/examples/qualcomm/oss_scripts/llama/runner/token_generator.h +index 9f0198f30..10b9f832d 100644 +--- a/examples/qualcomm/oss_scripts/llama/runner/token_generator.h ++++ b/examples/qualcomm/oss_scripts/llama/runner/token_generator.h +@@ -59,7 +59,7 @@ class TokenGenerator { + * + * @return std::vector& all the logits generated + */ +- virtual const std::vector& get_all_logits(); ++ virtual const std::vector& get_all_logits(); + + /** +    * @brief Generate tokens. 
+@@ -95,9 +95,9 @@ class TokenGenerator { + // inputs and outputs + TensorStruct input_toks_; + TensorStruct input_pos_; +- TensorStruct attention_mask_; +- TensorStruct window_attention_mask_; +- TensorStruct logits_; ++ TensorStruct attention_mask_; ++ TensorStruct window_attention_mask_; ++ TensorStruct logits_; + + // layer -> head -> TensorImpl + std::vector>> +@@ -128,6 +128,6 @@ class TokenGenerator { + Metadata metadata_; + + // Unused by default, only used when dump_logits_path is provided. +- std::vector token_all_logits_; ++ std::vector token_all_logits_; + }; + } // namespace example
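Note on the mask change in the runner patch above: the quantized HTP path encodes the attention mask as uint16 (0 = masked, 65535 = attended), while the fp32/XNNPACK path switches to an additive float mask (-1e9f = masked, 0.0f = attended) that can be added directly to the pre-softmax attention scores. That is why every sizeof(uint16_t) for the mask and logits buffers becomes sizeof(float), and why a KvBitWidth::kWidth32 path with extra float instantiations is registered for KVManager, PromptProcessor, TokenGenerator, LhdTokenGenerator, and Runner. The following is a minimal standalone sketch of the additive-mask convention only, roughly mirroring the SMART_MASK layout in kv_manager.cpp; the helper name and exact layout are hypothetical and are not part of the runner's API.

// Illustrative sketch, not the runner code: builds an additive float mask
// where 0.0f marks positions a query token may attend to and -1e9f masks
// everything else. New tokens occupy the last ar_len slots of each row.
#include <algorithm>
#include <cstdint>
#include <vector>

std::vector<float> build_float_attention_mask(
    int32_t ar_len,        // tokens processed in this step
    int32_t context_len,   // total KV context length
    int32_t n_past) {      // tokens already in the cache
  const float neg_val = -1e9f;  // masked (uint16 path would use 0)
  const float pos_val = 0.0f;   // attended (uint16 path would use 65535)
  std::vector<float> mask(
      static_cast<size_t>(ar_len) * context_len, neg_val);
  const int32_t new_start = context_len - ar_len;
  for (int32_t i = 0; i < ar_len; ++i) {
    float* row = mask.data() + static_cast<size_t>(i) * context_len;
    // Attend to all cached tokens.
    std::fill_n(row, n_past, pos_val);
    // Attend to itself and to earlier tokens of the current step (causal).
    std::fill_n(row + new_start, i + 1, pos_val);
  }
  return mask;
}

Because the mask is additive, masked positions contribute a large negative bias before softmax and end up with effectively zero attention weight, which is the float-path equivalent of the 0/65535 encoding consumed by the quantized HTP graph.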