diff --git a/.ci/scripts/test_qnn_static_llm.sh b/.ci/scripts/test_qnn_static_llm.sh index 46923f52127..91e37aae7c8 100644 --- a/.ci/scripts/test_qnn_static_llm.sh +++ b/.ci/scripts/test_qnn_static_llm.sh @@ -39,6 +39,7 @@ pip install graphviz set +e echo "Executing task: $TASK_NAME" +EXTRA_FLAGS="" if [[ "${TASK_NAME}" == "stories_110m" ]]; then # Download stories llama110m artifacts download_stories_model_artifacts @@ -80,7 +81,10 @@ elif [[ "${TASK_NAME}" == "stories_260k_bc" ]]; then fi elif [[ "${TASK_NAME}" == "smollm2_135m" ]]; then - $PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_static_llm_model --model_name smollm2_135m --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir ./static_smollm2 --enable_x86_64 + if [ -n "$2" ]; then + EXTRA_FLAGS="$EXTRA_FLAGS --static_llm_eval_method $2" + fi + $PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_static_llm_model --model_name smollm2_135m --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir ./static_smollm2 --enable_x86_64 $EXTRA_FLAGS exit_code1=$? if [ $exit_code1 -ne 0 ]; then exit 1 diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index f2aa4a3511e..a26e0f0cc2e 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -69,4 +69,4 @@ jobs: # Setup install_requirements for llama PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh - PYTHON_EXECUTABLE=python bash .ci/scripts/test_qnn_static_llm.sh ${{ matrix.task }} + PYTHON_EXECUTABLE=python bash .ci/scripts/test_qnn_static_llm.sh ${{ matrix.task }} wikitext_ppl diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index f25d7ede3b3..14ffe9d3064 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -598,6 +598,40 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/test_qnn_static_llm.sh ${{ matrix.task }} + test-sqnr-static-llm-qnn-linux: + name: test-sqnr-static-llm-qnn-linux + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + strategy: + matrix: + task: [smollm2_135m] + fail-fast: false + with: + runner: linux.2xlarge + docker-image: ci-image:executorch-ubuntu-22.04-qnn-sdk + submodules: 'recursive' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 180 + script: | + # The generic Linux job chooses to use base env, not the one setup by the image + CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + conda activate "${CONDA_ENV}" + + BUILD_TOOL="cmake" + + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh + PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh + + # Setup executorch + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "${BUILD_TOOL}" + + # Setup install_requirements for llama + PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh + + PYTHON_EXECUTABLE=python bash .ci/scripts/test_qnn_static_llm.sh ${{ matrix.task }} sqnr + test-qnn-models-linux: name: test-qnn-models-linux uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index f22b41f2cf0..8044b3f325a 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -7,6 +7,7 @@ import io import itertools import json +import 
logging import subprocess import sys import tempfile @@ -6062,51 +6063,122 @@ class TestExampleLLMScript(TestQNN): class LlmSpecs: SM8650: float SM8750: float - ppl: float pte_size: float + wikitext_ppl: float + hellaswag_acc_norm: float + sqnr: float # TODO: refactor to support different backends def setUp(self): + # TODO: add SQNR for all models self.llm_specs = { "gemma-2b": TestExampleLLMScript.LlmSpecs( - SM8650=32, SM8750=36, ppl=35, pte_size=2_700_000_000 - ), # 2.7 GB + SM8650=32, + SM8750=36, + pte_size=2_700_000_000, # 2.7 GB + wikitext_ppl=17, + hellaswag_acc_norm=None, + sqnr=27, + ), "gemma3-1b": TestExampleLLMScript.LlmSpecs( - SM8650=70, SM8750=100, ppl=23, pte_size=1_200_000_000 - ), # 1.2 GB + SM8650=70, + SM8750=100, + pte_size=1_200_000_000, # 1.2 GB + wikitext_ppl=23, + hellaswag_acc_norm=None, + sqnr=10, + ), "glm-1_5b": TestExampleLLMScript.LlmSpecs( - SM8650=42, SM8750=52, ppl=21, pte_size=1_100_000_000 - ), # 1.1 GB + SM8650=42, + SM8750=52, + pte_size=1_100_000_000, # 1.1 GB + wikitext_ppl=21, + hellaswag_acc_norm=None, + sqnr=14, + ), + "granite_3_3-2b_instruct": TestExampleLLMScript.LlmSpecs( + SM8650=20, + SM8750=22, + pte_size=1_600_000_000, # 1.6 GB + wikitext_ppl=None, + hellaswag_acc_norm=0.2, + sqnr=None, + ), "phi_4_mini": TestExampleLLMScript.LlmSpecs( - SM8650=14, SM8750=19, ppl=12, pte_size=4_000_000_000 - ), # 4GB + SM8650=14, + SM8750=19, + pte_size=4_000_000_000, # 4GB + wikitext_ppl=14, + hellaswag_acc_norm=None, + sqnr=20, + ), "llama3_2-1b_instruct": TestExampleLLMScript.LlmSpecs( - SM8650=37, SM8750=45, ppl=16, pte_size=1_500_000_000 - ), # 1.5 GB + SM8650=37, + SM8750=45, + pte_size=1_500_000_000, # 1.5 GB + wikitext_ppl=16, + hellaswag_acc_norm=None, + sqnr=15, + ), "llama3_2-3b_instruct": TestExampleLLMScript.LlmSpecs( - SM8650=21, SM8750=26, ppl=11, pte_size=2_800_000_000 - ), # 2.8 GB + SM8650=21, + SM8750=26, + pte_size=2_800_000_000, # 2.8 GB + wikitext_ppl=11, + hellaswag_acc_norm=None, + sqnr=14, + ), "qwen2_5-0_5b": TestExampleLLMScript.LlmSpecs( - SM8650=115, SM8750=155, ppl=15, pte_size=600_000_000 - ), # 600 MB + SM8650=115, + SM8750=155, + pte_size=600_000_000, # 600 MB + wikitext_ppl=15, + hellaswag_acc_norm=None, + sqnr=8, + ), "qwen2_5-1_5b": TestExampleLLMScript.LlmSpecs( - SM8650=38, SM8750=47, ppl=10, pte_size=1_500_000_000 - ), # 1.5 GB + SM8650=38, + SM8750=47, + pte_size=1_500_000_000, # 1.5 GB + wikitext_ppl=10, + hellaswag_acc_norm=None, + sqnr=10, + ), "qwen3-0_6b": TestExampleLLMScript.LlmSpecs( - SM8650=47, SM8750=68, ppl=21, pte_size=700_000_000 - ), # 700 MB + SM8650=47, + SM8750=68, + pte_size=700_000_000, # 700 MB + wikitext_ppl=21, + hellaswag_acc_norm=None, + sqnr=8, + ), "qwen3-1_7b": TestExampleLLMScript.LlmSpecs( - SM8650=28, SM8750=34, ppl=15, pte_size=1_800_000_000 - ), # 1.8 GB + SM8650=28, + SM8750=34, + pte_size=1_800_000_000, # 1.8 GB + wikitext_ppl=15, + hellaswag_acc_norm=None, + sqnr=12, + ), "smollm2_135m": TestExampleLLMScript.LlmSpecs( - SM8650=214, SM8750=260, ppl=23, pte_size=210_000_000 - ), # 210 MB + SM8650=214, + SM8750=260, + pte_size=210_000_000, # 210 MB + wikitext_ppl=23, + hellaswag_acc_norm=None, + sqnr=20, + ), "smollm3-3b": TestExampleLLMScript.LlmSpecs( - SM8650=23, SM8750=28, ppl=10, pte_size=2_600_000_000 - ), # 2.6 GB + SM8650=23, + SM8750=28, + pte_size=2_600_000_000, # 2.6 GB + wikitext_ppl=10, + hellaswag_acc_norm=None, + sqnr=6, + ), } - def test_static_llm_model(self): + def test_static_llm_model(self): # noqa: C901 if not self.required_envs([self.model_name]): 
self.skipTest("missing required envs") assert ( @@ -6142,13 +6214,43 @@ def test_static_llm_model(self): "kv", "--max_seq_len", "1024", - "--run_lm_eval", - "--tasks", - "wikitext", - "--limit", - "1", ] + match self.static_llm_eval_method: + case "wikitext_ppl": + cmds.extend( + [ + "--eval_methods", + "tasks_eval", + "--tasks", + "wikitext", + "--limit", + "1", + ] + ) + case "hellaswag_acc_norm": + cmds.extend( + [ + "--eval_methods", + "tasks_eval", + "--tasks", + "hellaswag", + "--limit", + "10", + ] + ) + case "sqnr": + cmds.extend( + [ + "--eval_methods", + "sqnr_eval", + ] + ) + case _: + logging.warning( + "No llm eval method chosen. Only generate model output." + ) + if is_llama_model: cmds.extend( [ @@ -6168,24 +6270,36 @@ def test_static_llm_model(self): conn = listener.accept() p.communicate() msg = json.loads(conn.recv()) + logging.info(f"Model Name: {self.model_name}\nTarget Device: {self.model}") + logging.info(f"Eval Result: {msg}") if "Error" in msg: self.fail(msg["Error"]) else: llm_spec = self.llm_specs[self.model_name] pte_size = msg["pte_size"] self.assertLessEqual(pte_size, llm_spec.pte_size) - print(f"Model Name: {self.model_name}\nTarget Device: {self.model}") - print(f"PTE Size: {pte_size} bytes") if not self.compile_only: - ppl = msg["wiki_ppl"] - print(f"PPL: {ppl}") - self.assertLessEqual(ppl, llm_spec.ppl) + if self.static_llm_eval_method: + # Use "is not None" in case any eval_score is 0. + assert ( + getattr(llm_spec, self.static_llm_eval_method) is not None + ), f"{self.model_name} currently does not support {self.static_llm_eval_method}. Please choose other methods." + match self.static_llm_eval_method: + case "wikitext_ppl": + ppl = msg["wiki_ppl"] + self.assertLessEqual(ppl, llm_spec.wikitext_ppl) + case "hellaswag_acc_norm": + acc_norm = msg["acc_norm"] + self.assertGreaterEqual( + acc_norm, llm_spec.hellaswag_acc_norm + ) + case "sqnr": + sqnr = msg["sqnr"] + self.assertGreaterEqual(sqnr, llm_spec.sqnr) + if not self.enable_x86_64 and hasattr(llm_spec, self.model): device_inference_speed = msg["inference_speed"] expected_inference_speed = getattr(llm_spec, self.model) - print( - f"Prompt Evaluation: {device_inference_speed} tokens/second" - ) self.assertGreaterEqual( device_inference_speed, expected_inference_speed ) @@ -6236,73 +6350,6 @@ def test_codegen2_1b(self): if not self.compile_only and not self.enable_x86_64: self.assertGreaterEqual(msg["inference_speed"], 60) - def test_granite_3_3_2b_instruct(self): - if not self.required_envs(): - self.skipTest("missing required envs") - - prompt = "What is the meaning of life?" 
- cmds = [ - "python", - f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py", - "--artifact", - self.artifact_dir, - "--build_folder", - self.build_folder, - "--model", - self.model, - "--ip", - self.ip, - "--port", - str(self.port), - "--prompt", - f"{prompt}", - "--temperature", - "0", - "--decoder_model", - "granite_3_3-2b_instruct", - "--model_mode", - "kv", - "--max_seq_len", - "1024", - "--run_lm_eval", - "--tasks", - "hellaswag", - "--limit", - "10", - "--kv_updater", - "shift_pointer", - ] - if self.compile_only: - cmds.extend(["--compile_only"]) - elif self.device: - cmds.extend(["--device", self.device]) - if self.host: - cmds.extend(["--host", self.host]) - elif self.enable_x86_64: - cmds.extend(["--enable_x86_64"]) - if self.pre_gen_pte: - cmds.extend(["--pre_gen_pte", self.pre_gen_pte]) - - p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) - with Listener((self.ip, self.port)) as listener: - conn = listener.accept() - p.communicate() - msg = json.loads(conn.recv()) - if "Error" in msg: - self.fail(msg["Error"]) - else: - inference_speed_ref = {"SM8650": 20, "SM8750": 22} - if ( - not self.compile_only - and not self.enable_x86_64 - and self.model in inference_speed_ref - ): - self.assertLessEqual(msg["pte_size"], 1_600_000_000) - self.assertGreaterEqual(msg["acc_norm"], 0.2) - self.assertGreaterEqual( - msg["inference_speed"], inference_speed_ref[self.model] - ) - def test_llama_stories_260k(self): if not self.required_envs(): self.skipTest("missing required envs") @@ -8350,6 +8397,12 @@ def setup_environment(): help="A folder that contains: weight, tokenizer, and params.", type=str, ) + parser.add_argument( + "--static_llm_eval_method", + help="Methods for Static LLM evaluation.", + choices=["wikitext_ppl", "hellaswag_acc_norm", "sqnr"], + type=str, + ) args, ns_args = parser.parse_known_args(namespace=unittest) TestQNN.host = args.host @@ -8376,6 +8429,7 @@ def setup_environment(): TestQNN.op_package_dir = args.op_package_dir TestQNN.target = args.target TestQNN.backend = args.backend + TestQNN.static_llm_eval_method = args.static_llm_eval_method return sys.argv[:1] + ns_args diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py index 3dd0a1cc1aa..7f57bbe8519 100644 --- a/backends/qualcomm/tests/utils.py +++ b/backends/qualcomm/tests/utils.py @@ -184,6 +184,7 @@ class TestQNN(unittest.TestCase): dump_intermediate_outputs: bool = False inference_speed: float = 0.0 inference_speed_output_path = "outputs/inference_speed.txt" + static_llm_eval_method = "" def _assert_outputs_equal(self, model_output, ref_output): self.assertTrue(len(ref_output) == len(model_output)) @@ -496,7 +497,7 @@ def validate_intermediate_tensor(): f" --performance_output_path {self.inference_speed_output_path}" ) adb.execute(method_index=method_index, output_callback=output_callback) - adb.pull(output_path=tmp_dir, callback=post_process) + adb.pull(host_output_path=tmp_dir, callback=post_process) self._assert_outputs_equal(outputs, ref_outputs) if expected_profile_events != -1: diff --git a/examples/models/gemma3/config/1b_config.json b/examples/models/gemma3/config/1b_config.json index 3a9e673716b..a846bdda711 100644 --- a/examples/models/gemma3/config/1b_config.json +++ b/examples/models/gemma3/config/1b_config.json @@ -12,6 +12,7 @@ "post_attention_norm": true, "post_ffn_norm": true, "rope_theta": 1000000.0, + "local_rope_theta": 10000.0, "use_scaled_rope": false, "apply_embedding": true, "embedding_scale_factor": 33.941125497, diff --git 
a/examples/models/llama/model_args.py b/examples/models/llama/model_args.py index a0e9eb70498..043bd06f2f6 100644 --- a/examples/models/llama/model_args.py +++ b/examples/models/llama/model_args.py @@ -102,6 +102,9 @@ class ModelArgs: rope_theta: Optional[float] = ( None # The official name to override self.rope_freq_base. ) + local_rope_theta: Optional[float] = ( + None # For sliding window attention. e.g., gemma3-1b + ) rope_freq_base: float = 10000.0 # The base frequency for RoPE. Keep it for BC. use_scaled_rope: bool = False # Use scaled RoPE, introduced in llama3.1. rope_scale_factor: int = 8 diff --git a/examples/qualcomm/custom_op/custom_ops_1.py b/examples/qualcomm/custom_op/custom_ops_1.py index 1745e2df7fa..0962e44ee6c 100644 --- a/examples/qualcomm/custom_op/custom_ops_1.py +++ b/examples/qualcomm/custom_op/custom_ops_1.py @@ -261,7 +261,7 @@ def main(args): ) adb.push(inputs=sample_input, files=op_package_paths) adb.execute() - adb.pull(output_path=args.artifact) + adb.pull(host_output_path=args.artifact) x86_golden = instance(*sample_input) device_output = torch.from_numpy( diff --git a/examples/qualcomm/oss_scripts/albert.py b/examples/qualcomm/oss_scripts/albert.py index 16c0d8b5027..d3dace76cb7 100644 --- a/examples/qualcomm/oss_scripts/albert.py +++ b/examples/qualcomm/oss_scripts/albert.py @@ -119,7 +119,7 @@ def main(args): # accuracy analysis adb.push(inputs=inputs) adb.execute() - adb.pull(output_path=args.artifact) + adb.pull(host_output_path=args.artifact) # since the original nn.Module could not perform well on this task either # we only measure the relative accuracy here goldens, predictions, nominal_predictions = [], [], [] diff --git a/examples/qualcomm/oss_scripts/bert.py b/examples/qualcomm/oss_scripts/bert.py index 2b2fc08605c..e9d68616ccf 100644 --- a/examples/qualcomm/oss_scripts/bert.py +++ b/examples/qualcomm/oss_scripts/bert.py @@ -116,7 +116,7 @@ def main(args): # accuracy analysis adb.push(inputs=inputs) adb.execute() - adb.pull(output_path=args.artifact) + adb.pull(host_output_path=args.artifact) goldens, predictions = [], [] for i in range(len(inputs)): indices = [i for i, x in enumerate(targets[i]) if x != -100] diff --git a/examples/qualcomm/oss_scripts/conv_former.py b/examples/qualcomm/oss_scripts/conv_former.py index a7150701796..dd7321de123 100644 --- a/examples/qualcomm/oss_scripts/conv_former.py +++ b/examples/qualcomm/oss_scripts/conv_former.py @@ -97,7 +97,7 @@ def main(args): output_data_folder = f"{args.artifact}/outputs" make_output_dir(output_data_folder) - adb.pull(output_path=args.artifact) + adb.pull(host_output_path=args.artifact) # top-k analysis predictions = [] diff --git a/examples/qualcomm/oss_scripts/convnext_small.py b/examples/qualcomm/oss_scripts/convnext_small.py index 35f4a4905ec..d7045bf55d2 100755 --- a/examples/qualcomm/oss_scripts/convnext_small.py +++ b/examples/qualcomm/oss_scripts/convnext_small.py @@ -92,7 +92,7 @@ def main(args): output_data_folder = f"{args.artifact}/outputs" make_output_dir(output_data_folder) - adb.pull(output_path=args.artifact) + adb.pull(host_output_path=args.artifact) # top-k analysis predictions = [] diff --git a/examples/qualcomm/oss_scripts/cvt.py b/examples/qualcomm/oss_scripts/cvt.py index f4e5ddeed12..891a441e5b1 100644 --- a/examples/qualcomm/oss_scripts/cvt.py +++ b/examples/qualcomm/oss_scripts/cvt.py @@ -161,7 +161,7 @@ def main(args): output_data_folder = f"{args.artifact}/outputs" make_output_dir(output_data_folder) - adb.pull(output_path=args.artifact) + 
adb.pull(host_output_path=args.artifact) # top-k analysis predictions = [] diff --git a/examples/qualcomm/oss_scripts/deit.py b/examples/qualcomm/oss_scripts/deit.py index 6c9bf70733d..3d93b9cfbc7 100644 --- a/examples/qualcomm/oss_scripts/deit.py +++ b/examples/qualcomm/oss_scripts/deit.py @@ -118,7 +118,7 @@ def main(args): output_data_folder = f"{args.artifact}/outputs" make_output_dir(output_data_folder) - adb.pull(output_path=args.artifact) + adb.pull(host_output_path=args.artifact) # top-k analysis predictions = [] diff --git a/examples/qualcomm/oss_scripts/dino_v2.py b/examples/qualcomm/oss_scripts/dino_v2.py index 745da8694cf..65a39873851 100644 --- a/examples/qualcomm/oss_scripts/dino_v2.py +++ b/examples/qualcomm/oss_scripts/dino_v2.py @@ -101,7 +101,7 @@ def main(args): output_data_folder = f"{args.artifact}/outputs" make_output_dir(output_data_folder) - adb.pull(output_path=args.artifact) + adb.pull(host_output_path=args.artifact) # top-k analysis predictions = [] diff --git a/examples/qualcomm/oss_scripts/distilbert.py b/examples/qualcomm/oss_scripts/distilbert.py index a93a3ac7a73..2c13b46ab4a 100644 --- a/examples/qualcomm/oss_scripts/distilbert.py +++ b/examples/qualcomm/oss_scripts/distilbert.py @@ -117,7 +117,7 @@ def main(args): # accuracy analysis adb.push(inputs=inputs) adb.execute() - adb.pull(output_path=args.artifact) + adb.pull(host_output_path=args.artifact) goldens, predictions = [], [] for i in range(len(inputs)): indices = [i for i, x in enumerate(targets[i]) if x != -100] diff --git a/examples/qualcomm/oss_scripts/dit.py b/examples/qualcomm/oss_scripts/dit.py index df4142e1079..230c759aed9 100644 --- a/examples/qualcomm/oss_scripts/dit.py +++ b/examples/qualcomm/oss_scripts/dit.py @@ -125,7 +125,7 @@ def main(args): output_data_folder = f"{args.artifact}/outputs" make_output_dir(output_data_folder) - adb.pull(output_path=args.artifact) + adb.pull(host_output_path=args.artifact) # top-k analysis predictions = [] diff --git a/examples/qualcomm/oss_scripts/efficientSAM/efficientSAM.py b/examples/qualcomm/oss_scripts/efficientSAM/efficientSAM.py index b5f0d6a72dc..df1579566d0 100644 --- a/examples/qualcomm/oss_scripts/efficientSAM/efficientSAM.py +++ b/examples/qualcomm/oss_scripts/efficientSAM/efficientSAM.py @@ -280,7 +280,7 @@ def post_process(): output = torch.from_numpy(output).reshape(output_shape) outputs.append(output) - adb.pull(output_path=args.artifact, callback=post_process) + adb.pull(host_output_path=args.artifact, callback=post_process) # MIoU analysis miou = 0 diff --git a/examples/qualcomm/oss_scripts/efficientnet.py b/examples/qualcomm/oss_scripts/efficientnet.py index 83b3ecccbb7..4ef522fcb3a 100644 --- a/examples/qualcomm/oss_scripts/efficientnet.py +++ b/examples/qualcomm/oss_scripts/efficientnet.py @@ -97,7 +97,7 @@ def main(args): output_data_folder = f"{args.artifact}/outputs" make_output_dir(output_data_folder) - adb.pull(output_path=args.artifact) + adb.pull(host_output_path=args.artifact) # top-k analysis predictions = [] diff --git a/examples/qualcomm/oss_scripts/esrgan.py b/examples/qualcomm/oss_scripts/esrgan.py index a45e4543174..58daf2721ec 100644 --- a/examples/qualcomm/oss_scripts/esrgan.py +++ b/examples/qualcomm/oss_scripts/esrgan.py @@ -118,7 +118,7 @@ def post_process(): ) cnt += 1 - adb.pull(output_path=args.artifact, callback=post_process) + adb.pull(host_output_path=args.artifact, callback=post_process) psnr_list = [] ssim_list = [] diff --git a/examples/qualcomm/oss_scripts/eurobert.py 
b/examples/qualcomm/oss_scripts/eurobert.py index 7706afc9863..6820dcce063 100644 --- a/examples/qualcomm/oss_scripts/eurobert.py +++ b/examples/qualcomm/oss_scripts/eurobert.py @@ -154,7 +154,7 @@ def get_custom_quantizer(): # accuracy analysis adb.push(inputs=inputs) adb.execute() - adb.pull(output_path=args.artifact) + adb.pull(host_output_path=args.artifact) goldens, predictions = [], [] for i in range(len(inputs)): indices = [i for i, x in enumerate(targets[i]) if x != -100] diff --git a/examples/qualcomm/oss_scripts/fastvit.py b/examples/qualcomm/oss_scripts/fastvit.py index a363a7bada2..37062853b5c 100644 --- a/examples/qualcomm/oss_scripts/fastvit.py +++ b/examples/qualcomm/oss_scripts/fastvit.py @@ -154,7 +154,7 @@ def get_custom_quantizer(): output_data_folder = f"{args.artifact}/outputs" make_output_dir(output_data_folder) - adb.pull(output_path=args.artifact) + adb.pull(host_output_path=args.artifact) # top-k analysis predictions = [] diff --git a/examples/qualcomm/oss_scripts/fbnet.py b/examples/qualcomm/oss_scripts/fbnet.py index 6c8a452c924..6a372f920fb 100755 --- a/examples/qualcomm/oss_scripts/fbnet.py +++ b/examples/qualcomm/oss_scripts/fbnet.py @@ -93,7 +93,7 @@ def post_process(): output = np.fromfile(filename, dtype=np.float32) output_raws.append(output) - adb.pull(output_path=args.artifact, callback=post_process) + adb.pull(host_output_path=args.artifact, callback=post_process) # top-k analysis predictions = [] diff --git a/examples/qualcomm/oss_scripts/focalnet.py b/examples/qualcomm/oss_scripts/focalnet.py index 36ee4171b8c..06dd2fda145 100644 --- a/examples/qualcomm/oss_scripts/focalnet.py +++ b/examples/qualcomm/oss_scripts/focalnet.py @@ -97,7 +97,7 @@ def main(args): output_data_folder = f"{args.artifact}/outputs" make_output_dir(output_data_folder) - adb.pull(output_path=args.artifact) + adb.pull(host_output_path=args.artifact) # top-k analysis predictions = [] diff --git a/examples/qualcomm/oss_scripts/gMLP_image_classification.py b/examples/qualcomm/oss_scripts/gMLP_image_classification.py index 0a718602b15..82e11a59a85 100644 --- a/examples/qualcomm/oss_scripts/gMLP_image_classification.py +++ b/examples/qualcomm/oss_scripts/gMLP_image_classification.py @@ -88,7 +88,7 @@ def main(args): output_data_folder = f"{args.artifact}/outputs" make_output_dir(output_data_folder) - adb.pull(output_path=args.artifact) + adb.pull(host_output_path=args.artifact) # top-k analysis predictions = [] diff --git a/examples/qualcomm/oss_scripts/llama/README.md b/examples/qualcomm/oss_scripts/llama/README.md index 33868eda6d1..360d2bed47f 100644 --- a/examples/qualcomm/oss_scripts/llama/README.md +++ b/examples/qualcomm/oss_scripts/llama/README.md @@ -152,7 +152,7 @@ python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL #### Granite3.3 2B Default example using hybrid mode ```bash -python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --temperature 0 --model_mode hybrid --max_seq_len 1024 --prefill_ar_len 128 --decoder_model granite_3_3-2b_instruct --prompt "I would like to learn python, could you teach me with a simple example?" --run_lm_eval --task hellaswag --limit 10 +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --temperature 0 --model_mode hybrid --max_seq_len 1024 --prefill_ar_len 128 --decoder_model granite_3_3-2b_instruct --prompt "I would like to learn python, could you teach me with a simple example?" 
--eval_methods tasks_eval --task hellaswag --limit 10 ``` #### Phi4-mini-instruct @@ -234,13 +234,13 @@ We use Smart Mask mechanisms for updating the key-value (KV) cache. #### Compile Only If you would like to compile the model only, we have provided the flag `--compile_only`. Taking LLAMA3.2 as an example: ```bash -python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -m ${SOC_MODEL} --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --decoder_model llama3_2 --model_mode hybrid --prefill_ar_len 32 --max_seq_len 128 --prompt "what is 1+1" --compile_only +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -m ${SOC_MODEL} --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --decoder_model llama3_2 --model_mode hybrid --prefill_ar_len 32 --max_seq_len 128 --prompt "I would like to learn python, could you teach me with a simple example?" --compile_only ``` #### Pre Generated PTE On the other hand, if you already have a pre-compiled .pte model, you can perform inference by providing the flag `--pre_gen_pte` and specifying the folder that contains the .pte model. Taking LLAMA3.2 as an example: ```bash -python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --decoder_model llama3_2 --model_mode hybrid --prefill_ar_len 32 --max_seq_len 128 --prompt "what is 1+1" --pre_gen_pte ${FOLDER_TO_PRE_GEN_PTE} +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --decoder_model llama3_2 --model_mode hybrid --prefill_ar_len 32 --max_seq_len 128 --prompt "I would like to learn python, could you teach me with a simple example?" --pre_gen_pte ${FOLDER_TO_PRE_GEN_PTE} ``` #### Lookahead Decoding Mode @@ -253,30 +253,37 @@ You can choose the lookahead mode to enhance decoding speed. To use this mode, y For more details, please refer to the paper ["Break the Sequential Dependency of LLM Inference Using Lookahead Decoding"](https://arxiv.org/abs/2402.02057) ```bash -python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --decoder_model llama3_2 --model_mode lookahead --prefill_ar_len 32 --max_seq_len 128 --prompt "what is 1+1" --ngram 3 --window 2 --gcap 2 +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --decoder_model llama3_2 --model_mode lookahead --prefill_ar_len 32 --max_seq_len 128 --prompt "I would like to learn python, could you teach me with a simple example?" --ngram 3 --window 2 --gcap 2 ``` -#### Perplexity Evaluation -This script supports perplexity evaluation and is capable of assessing perplexity scores across 3 phases: prepare_pt2e(CPU FP), convert_pt2e(CPU QDQ), QNN on device. +#### Tasks Evaluation +This script supports task evaluation and is capable of assessing evaluation scores across 3 phases: prepare_pt2e(CPU FP), convert_pt2e(CPU QDQ), QNN on device. -To evaluate the perplexity across all 3 phases, users should provide the `--run_lm_eval` flag and specify the evaluation task. 
Please notice when this flag is provided, the `--prompt ${PROMPT}` will be ignored. +To evaluate the perplexity across all 3 phases, users should provide the `--eval_methods tasks_eval` flag and specify the evaluation task. Please notice when this flag is provided, the `--prompt ${PROMPT}` will be ignored. For example, using the Qwen model and 1 wikitext sample as the evaluation task, users can assess all 3 phases perplexity score in a single run by including the appropriate configuration: ```bash -python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --prompt "What is 1+1?" --temperature 0 --model_mode kv --max_seq_len 1024 --decoder_model qwen2_5-0_5b --run_lm_eval --tasks wikitext --limit 1 +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --prompt "I would like to learn python, could you teach me with a simple example?" --temperature 0 --model_mode kv --max_seq_len 1024 --decoder_model qwen2_5-0_5b --eval_methods tasks_eval --tasks wikitext --limit 1 --verbose ``` -For the example script above, 1 wikitext sample is used to evaluate all 3 phases. However, there are cases where a user may want to use one sample for quantization calibration and multiple samples for perplexity evaluation. In this case, the process should be split into two runs. In the 1st run, the model is compiled using one sample. In the 2nd run, the user can provide a different configuration for QNN device execution. +From the example script above, 1 wikitext sample is used to evaluate all 3 phases. However, there are cases where a user may want to use one sample for quantization calibration and multiple samples for perplexity evaluation. In this case, the process should be split into two runs. In the 1st run, the model is compiled using one sample. In the 2nd run, the user can provide a different configuration for QNN device execution. Example: ```bash # 1st run to compile with --limit 1 -python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --prompt "What is 1+1?" --temperature 0 --model_mode kv --max_seq_len 1024 --decoder_model qwen2_5-0_5b --run_lm_eval --tasks wikitext --limit 1 --compile_only +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --prompt "I would like to learn python, could you teach me with a simple example?" --temperature 0 --model_mode kv --max_seq_len 1024 --decoder_model qwen2_5-0_5b --eval_methods tasks_eval --tasks wikitext --limit 1 --compile_only ``` ```bash # 2nd run to perform QNN device execution with --limit 3 -python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --prompt "What is 1+1?" --temperature 0 --model_mode kv --max_seq_len 1024 --decoder_model qwen2_5-0_5b --run_lm_eval --tasks wikitext --limit 3 --pre_gen_pte ${PATH_TO_ARTIFACT_IN_1ST_RUN} --quant_attrs_path ${PATH_TO_ARTIFACT_IN_1ST_RUN}/kv_llama_qnn_quant_attrs.json +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --prompt "I would like to learn python, could you teach me with a simple example?" 
--temperature 0 --model_mode kv --max_seq_len 1024 --decoder_model qwen2_5-0_5b --eval_methods tasks_eval --tasks wikitext --limit 3 --pre_gen_pte ${PATH_TO_ARTIFACT_IN_1ST_RUN} --quant_attrs_path ${PATH_TO_ARTIFACT_IN_1ST_RUN}/kv_llama_qnn_quant_attrs.json ``` #### Tasks quantization calibration If `--tasks ${TASK}` is not provided, the program will use `--prompt ${PROMPT}` as the dataset for quantization calibration. -Regardless of whether `--run_lm_eval` is provided, as long as `--tasks ${TASK}` is specified, the specified tasks will be used for model quantization calibration instead of the prompt. +Regardless of whether `--eval_methods tasks_eval` is provided, as long as `--tasks ${TASK}` is specified, the specified tasks will be used for model quantization calibration instead of the prompt. + +#### SQNR Evaluation +To evaluate QNN's output logits against the golden logits from `nn.Module`, users can provide `--eval_methods sqnr_eval`. Please note that SQNR evaluation will only compare the logits of the user's prompt and will not compare the new tokens generated by the model. +Example: +```bash +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --prompt "I would like to learn python, could you teach me with a simple example?" --temperature 0 --model_mode kv --max_seq_len 1024 --decoder_model qwen2_5-0_5b --eval_methods sqnr_eval +``` diff --git a/examples/qualcomm/oss_scripts/llama/artifacts/stories260k_hybrid_llama_qnn.pte b/examples/qualcomm/oss_scripts/llama/artifacts/stories260k_hybrid_llama_qnn.pte index 01e15984812..0817a5edda9 100644 Binary files a/examples/qualcomm/oss_scripts/llama/artifacts/stories260k_hybrid_llama_qnn.pte and b/examples/qualcomm/oss_scripts/llama/artifacts/stories260k_hybrid_llama_qnn.pte differ diff --git a/examples/qualcomm/oss_scripts/llama/decoder_constants.py b/examples/qualcomm/oss_scripts/llama/decoder_constants.py index 3baa4b94ed6..c52bfd0f690 100644 --- a/examples/qualcomm/oss_scripts/llama/decoder_constants.py +++ b/examples/qualcomm/oss_scripts/llama/decoder_constants.py @@ -4,6 +4,11 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# eval methods +PROMPT_EVAL = "prompt_eval" +TASKS_EVAL = "tasks_eval" +SQNR_EVAL = "sqnr_eval" + # filenames for vision model VISION_ENCODER_INPUT_FILENAME = "vision_encoder_input" diff --git a/examples/qualcomm/oss_scripts/llama/decoder_runtime_evaluator.py b/examples/qualcomm/oss_scripts/llama/decoder_runtime_evaluator.py new file mode 100644 index 00000000000..158627d5a43 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/decoder_runtime_evaluator.py @@ -0,0 +1,569 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree.
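+#
+# Overview of this module: llama.py picks one of the evaluators below based on
+# --eval_methods.
+#   * DefaultEval - runs qnn_llama_runner on the given prompt(s) and collects
+#     the generated text plus inference speed.
+#   * SqnrEval    - dumps the on-device logits for the prompt tokens and
+#     compares them against the golden logits from the source nn.Module.
+#   * TaskEval    - wraps the runner in an lm_eval-compatible EagerEvalWrapper
+#     so simple_evaluate can score tasks such as wikitext or hellaswag.
+# All evaluators derive from EvalBase, which owns a single shared SimpleADB
+# instance and the common runner command line.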
+ +import getpass +import logging +import os +import subprocess +from abc import ABC, abstractmethod +from functools import partial +from typing import Any, final, List, Optional, Union + +import numpy as np +import torch +from executorch.examples.models.llama.evaluate.eager_eval import EagerEvalWrapper +from executorch.examples.qualcomm.oss_scripts.llama.decoder_constants import ( + DECODER_MODEL_VERSION, + EVAL_MODE, +) +from executorch.examples.qualcomm.oss_scripts.llama.decoder_utils import ( + INFERENCE_REGISTRY, + retrieve_info_from_pte, +) +from executorch.examples.qualcomm.utils import make_output_dir, SimpleADB +from pytorch_tokenizers.hf_tokenizer import HuggingFaceTokenizer +from pytorch_tokenizers.llama2c import Llama2cTokenizer as SentencePieceTokenizer +from pytorch_tokenizers.tiktoken import TiktokenTokenizer +from torchao.quantization.utils import compute_error + +try: + from lm_eval.evaluator import simple_evaluate +except ImportError: + raise ImportError( + "Please install the llm eval dependency via examples/models/llama/install_requirements.sh" + ) + + +def post_process_model_output(output_holder: List, host_output_response_path: str): + with open(host_output_response_path, "r") as f: + output_holder.append(f.read()) + + +def post_process_inference_speed(output_holder: List, host_performance_path: str): + with open(host_performance_path, "r") as f: + output_holder.append(float(f.read())) + + +def post_process_logits( + output_holder: List, + host_logits_path: str, + kv_io_bit_width: int, + num_input_tokens: int, + output_vocab_size: int, + logits_scale: float, + logits_zero_point: int, +): + with open(host_logits_path, "r") as f: + logits_dtype = np.float32 if kv_io_bit_width == 32 else np.uint16 + output_tensor = torch.from_numpy( + np.fromfile(f.name, dtype=logits_dtype).reshape(1, -1, output_vocab_size) + ) + output_tensor = output_tensor[:, :num_input_tokens, :] + output_tensor = ( + output_tensor.to(torch.float32) - logits_zero_point + ) * logits_scale + output_holder.append(output_tensor) + + +class EvalBase(ABC): + _adb: Optional[SimpleADB] = None # ADB shared across all instances + + def __init__(self, args, pte_path, runtime_tokenizer_path): + self.args = args + self.pte_path = pte_path + self.runtime_tokenizer_path = runtime_tokenizer_path + self.qnn_sdk = os.getenv("QNN_SDK_ROOT") + + self.device_workspace = ( + f"/data/local/tmp/{getpass.getuser()}/executorch/static_llm" + ) + + device_output_path = self._get_adb().output_folder + if args.enable_x86_64: + logging.warning( + "x86 emulator is NOT recommended as it is for CI purpose, expect significance drop in performance." + ) + device_output_path = f"{args.artifact}/outputs" + + self.device_output_response_path = f"{device_output_path}/outputs.txt" + self.device_performance_path = f"{device_output_path}/inference_speed.txt" + self.device_logits_path = f"{device_output_path}/all_logits.raw" + self.host_output_response_path = f"{args.artifact}/outputs/outputs.txt" + self.host_performance_path = f"{args.artifact}/outputs/inference_speed.txt" + self.host_logits_path = f"{args.artifact}/outputs/all_logits.raw" + make_output_dir(f"{args.artifact}/outputs") + self.runner_base_cmd = self._init_runner_base_cmd() + + def _init_runner_base_cmd(self): + """ + If a runner cmd could be shared across all EvalBase class, place it here to reduce redundant code. 
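+
+        The base command pins the runner binary, decoder model version,
+        tokenizer path, model path, and the output/performance paths. Each
+        evaluator then appends its own flags on top of it (e.g. --seq_len,
+        --eval_mode, --dump_logits_path).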
+ """ + args = self.args + base_cmd = "" + if args.enable_x86_64: + base_cmd = " ".join( + [ + f"export LD_LIBRARY_PATH={self.qnn_sdk}/lib/x86_64-linux-clang/:{args.build_folder}/lib &&", + f"./{args.build_folder}/examples/qualcomm/oss_scripts/llama/qnn_llama_runner", + f"--decoder_model_version {DECODER_MODEL_VERSION[args.decoder_model]}", + f"--tokenizer_path {self.runtime_tokenizer_path}", + f"--model_path {self.pte_path}", + f"--output_path {self.device_output_response_path}", + f"--performance_output_path {self.device_performance_path}", + ] + ) + else: + base_cmd = " ".join( + [ + f"cd {self.device_workspace} &&", + "./qnn_llama_runner", + f"--decoder_model_version {DECODER_MODEL_VERSION[args.decoder_model]}", + f"--tokenizer_path {os.path.basename(self.runtime_tokenizer_path)}", + f"--model_path {os.path.basename(self.pte_path)}", + f"--output_path {self.device_output_response_path}", + f"--performance_output_path {self.device_performance_path}", + "--shared_buffer", + ] + ) + return base_cmd + + @final + def _get_adb(self): + args = self.args + if EvalBase._adb is None: + EvalBase._adb = SimpleADB( + qnn_sdk=self.qnn_sdk, + build_path=f"{args.build_folder}", + pte_path=self.pte_path, + workspace=self.device_workspace, + device_id=args.device, + host_id=args.host, + soc_model=args.model, + runner="examples/qualcomm/oss_scripts/llama/qnn_llama_runner", + target=args.target, + ) + return EvalBase._adb + + @abstractmethod + def run(self) -> Any: + # Performs execution and comparison. + # Returns: + # Any: The result of execution. + # The type of the return value are determined by the implementation. + # The caller can return anything and handle it themselves. + pass + + +class DefaultEval(EvalBase): + def __init__(self, args, pte_path, runtime_tokenizer_path): + super().__init__(args, pte_path, runtime_tokenizer_path) + self.adb = self._get_adb() + self.inference_speed = 0 + + lookahead_args = " ".join( + [ + f"--window {args.window}", + f"--gcap {args.gcap}", + f"--ngram {args.ngram}", + ] + ) + runner_args = " ".join( + [ + f"--eval_mode {EVAL_MODE[args.model_mode]}", + f"--temperature {args.temperature}", + f"--system_prompt '{args.system_prompt}'", + lookahead_args if args.model_mode == "lookahead" else "", + ] + ) + self.runner_cmd = " ".join( + [ + self.runner_base_cmd, + runner_args, + f"--seq_len {args.max_seq_len}", + ] + ) + + def run(self, prompt): + multi_prompts = " ".join([f'--prompt "{p}"' for p in prompt]) + + model_output_holder = [] + performance_holder = [] + + if self.args.enable_x86_64: + # x86 emulator is intended for CI and not performance. Check only the first few tokens. 
+ seq_len = min(self.args.max_seq_len, 32) + runner_cmd = " ".join( + [ + self.runner_cmd, + multi_prompts, + f"--seq_len {seq_len}", + ] + ) + subprocess.run( + runner_cmd, + shell=True, + executable="/bin/bash", + capture_output=True, + ) + post_process_model_output( + output_holder=model_output_holder, + host_output_response_path=self.host_output_response_path, + ) + post_process_inference_speed( + output_holder=performance_holder, + host_performance_path=self.host_performance_path, + ) + else: + runner_cmd = " ".join( + [self.runner_cmd, multi_prompts, f"--seq_len {self.args.max_seq_len}"] + ) + self.adb.push(inputs=[], files=[self.runtime_tokenizer_path]) + self.adb.execute(custom_runner_cmd=runner_cmd) + self.adb.pull( + host_output_path=self.host_output_response_path, + device_output_path=self.device_output_response_path, + callback=partial( + post_process_model_output, + output_holder=model_output_holder, + host_output_response_path=self.host_output_response_path, + ), + ) + self.adb.pull( + host_output_path=self.host_performance_path, + device_output_path=self.device_performance_path, + callback=partial( + post_process_inference_speed, + output_holder=performance_holder, + host_performance_path=self.host_performance_path, + ), + ) + self.inference_speed = performance_holder[0] + return model_output_holder + + +class SqnrEval(EvalBase): + """ + SQNR Evaluator will only evaluate the given prompt's logit output. + It won't evaluate the generated token's logit. + """ + + def __init__( + self, + source_model, + example_input, + args, + pte_path, + tokenizer, + runtime_tokenizer_path, + ): + super().__init__(args, pte_path, runtime_tokenizer_path) + self.inference_speed = 0 + self.source_model = source_model + self.example_input = example_input + self.adb = self._get_adb() + self.tokenizer = tokenizer + self.enable_x86_64 = args.enable_x86_64 + self.max_seq_length = args.max_seq_len + + pte_meta_info = retrieve_info_from_pte(pte_path=pte_path) + self.output_vocab_size = pte_meta_info["output_vocab_size"] + pte_max_seq_len = pte_meta_info["pte_max_seq_len"] + self.logits_scale = pte_meta_info["logits_scale"] + self.logits_zero_point = pte_meta_info["logits_zero_point"] + self.kv_io_bit_width = pte_meta_info["kv_io_bit_width"] + + if args.model_mode != "kv": + logging.warning( + f"Current SqnrEval does not support {args.model_mode}, switching to kv mode." + ) + + if pte_max_seq_len != self.max_seq_length: + logging.warning( + f"The pte provided has a max_seq_len {pte_max_seq_len}, which is different from --max_seq_len {self.max_seq_length} provided to the script, please ensure this is desired." + ) + if pte_max_seq_len < self.max_seq_length: + logging.warning( + f"The pte max_seq_len {pte_max_seq_len} is used since it is shorter than --max_seq_len {self.max_seq_length}" + ) + self.max_seq_length = pte_max_seq_len + + def run(self, prompt): + golden_logits = INFERENCE_REGISTRY[True]( + example_input=self.example_input, + prompt=prompt, + module=self.source_model, + tokenizer=self.tokenizer, + max_seq_len=self.max_seq_length, + use_i64_token=self.args.embedding_quantize is not None, + collect_logits=True, + ) + + input_file_name = f"{self.args.artifact}/input_tokens.raw" + + # Make sure the encode param aligns with encode param under all register_inference function. + # This ensures the token used for device is same as token used for nn.Module. 
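+        # What follows: write the prompt token ids to a .raw file, run
+        # qnn_llama_runner with --dump_logits_path, dequantize the dumped
+        # logits with the scale/zero-point read from the .pte, and score them
+        # against golden_logits via torchao's compute_error, which reports the
+        # signal-to-quantization-noise ratio in dB (higher means the device
+        # logits track the golden output more closely).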
+ inps = torch.tensor(self.tokenizer.encode(prompt, bos=True, eos=False)) + inps = inps.to(torch.uint64).numpy() + inps.tofile(input_file_name) + + assert ( + inps.size < self.max_seq_length + ), f"Number of input token is longer than max_seq_len, please shorten the input token length. input_token length: {inps.size}. max_seq_len: {self.max_seq_length}" + + output_logits_holder = [] + output_performance_holder = [] + + if self.enable_x86_64: + runner_cmd = " ".join( + [ + self.runner_base_cmd, + f"--seq_len {inps.size + 1}", + f"--eval_mode {EVAL_MODE['kv']}", + "--temperature 0", + f"--dump_logits_path {self.device_logits_path}", + f"--tokenized_prompt {input_file_name}", + ] + ) + + subprocess.run( + runner_cmd, + shell=True, + executable="/bin/bash", + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) + post_process_logits( + output_holder=output_logits_holder, + host_logits_path=self.host_logits_path, + kv_io_bit_width=self.kv_io_bit_width, + num_input_tokens=inps.size, + output_vocab_size=self.output_vocab_size, + logits_scale=self.logits_scale, + logits_zero_point=self.logits_zero_point, + ) + post_process_inference_speed( + output_holder=output_performance_holder, + host_performance_path=self.host_performance_path, + ) + else: + runner_cmd = " ".join( + [ + self.runner_base_cmd, + f"--seq_len {inps.size + 1}", + f"--eval_mode {EVAL_MODE['kv']}", + "--temperature 0", + f"--dump_logits_path {self.device_logits_path}", + f"--tokenized_prompt {os.path.basename(input_file_name)}", + ] + ) + self.adb.push( + inputs=[], files=[input_file_name, self.runtime_tokenizer_path] + ) + self.adb.execute(custom_runner_cmd=runner_cmd) + self.adb.pull( + host_output_path=self.host_logits_path, + device_output_path=self.device_logits_path, + callback=partial( + post_process_logits, + output_holder=output_logits_holder, + host_logits_path=self.host_logits_path, + kv_io_bit_width=self.kv_io_bit_width, + num_input_tokens=inps.size, + output_vocab_size=self.output_vocab_size, + logits_scale=self.logits_scale, + logits_zero_point=self.logits_zero_point, + ), + ) + self.adb.pull( + host_output_path=self.host_performance_path, + device_output_path=self.device_performance_path, + callback=partial( + post_process_inference_speed, + output_holder=output_performance_holder, + host_performance_path=self.host_performance_path, + ), + ) + self.inference_speed = output_performance_holder[0] + + sqnr = compute_error(golden_logits, output_logits_holder[0]) + return sqnr.item() + + +class TaskEval(EvalBase): + class QnnRunnerEvalWrapper(EagerEvalWrapper): + """ + A wrapper class to run tasks with QNN on device. 
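+
+        lm_eval drives this wrapper through _model_call: each batch of token
+        ids is written to a raw file, fed to qnn_llama_runner in logits-dump
+        mode, and the dequantized logits are returned so simple_evaluate can
+        compute the task metrics (e.g. wikitext perplexity or hellaswag
+        acc_norm).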
+ """ + + def __init__( # noqa: C901 + self, + args, + runner_base_cmd: str, + adb: SimpleADB, + pte_path: str, + device_performance_path: str, + device_logits_path: str, + host_performance_path: str, + host_logits_path: str, + tokenizer: Union[ + SentencePieceTokenizer, TiktokenTokenizer, HuggingFaceTokenizer + ], + runtime_tokenizer_path, + ): + self.inference_speed = None + + self.args = args + self.runner_base_cmd = runner_base_cmd + self.adb = adb + self.pte_path = pte_path + self.runtime_tokenizer_path = runtime_tokenizer_path + + self.device_performance_path = device_performance_path + self.device_logits_path = device_logits_path + self.host_performance_path = host_performance_path + self.host_logits_path = host_logits_path + + self.enable_x86_64 = args.enable_x86_64 + self.max_seq_length = args.max_seq_len + pte_meta_info = retrieve_info_from_pte(pte_path=pte_path) + self.output_vocab_size = pte_meta_info["output_vocab_size"] + pte_max_seq_len = pte_meta_info["pte_max_seq_len"] + self.logits_scale = pte_meta_info["logits_scale"] + self.logits_zero_point = pte_meta_info["logits_zero_point"] + self.kv_io_bit_width = pte_meta_info["kv_io_bit_width"] + + if args.model_mode != "kv": + logging.warning( + f"Current QnnRunnerEvalWrapper does not support {args.model_mode}, switching to kv mode." + ) + + if pte_max_seq_len != self.max_seq_length: + logging.warning( + f"The pte provided has a max_seq_len {pte_max_seq_len}, which is different from --max_seq_len {self.max_seq_length} provided to the script, please ensure this is desired." + ) + if pte_max_seq_len < self.max_seq_length: + logging.warning( + f"The pte max_seq_len {pte_max_seq_len} is used since it is shorter than --max_seq_len {self.max_seq_length}" + ) + self.max_seq_length = pte_max_seq_len + + if not self.enable_x86_64: + self.adb.push(inputs=[], files=[self.runtime_tokenizer_path]) + # n seq len = n-1 cache len, so we set len(inps) = n-1 during _model_call + # pyre-ignore + super().__init__(None, tokenizer, self.max_seq_length - 1) + + def _model_call(self, inps): + input_file_name = f"{self.args.artifact}/input_tokens.raw" + # This is the dtype required by runtime tokenizer. 
+ inps = inps.to(torch.uint64).numpy() + inps.tofile(input_file_name) + output_logits_holder = [] + output_performance_holder = [] + + if self.enable_x86_64: + runner_cmd = " ".join( + [ + self.runner_base_cmd, + f"--seq_len {self.max_seq_length}", + f"--eval_mode {EVAL_MODE['kv']}", + "--temperature 0", + f"--dump_logits_path {self.device_logits_path}", + f"--tokenized_prompt {input_file_name}", + ] + ) + subprocess.run( + runner_cmd, + shell=True, + executable="/bin/bash", + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) + post_process_logits( + output_holder=output_logits_holder, + host_logits_path=self.host_logits_path, + kv_io_bit_width=self.kv_io_bit_width, + num_input_tokens=inps.size, + output_vocab_size=self.output_vocab_size, + logits_scale=self.logits_scale, + logits_zero_point=self.logits_zero_point, + ) + post_process_inference_speed( + output_holder=output_performance_holder, + host_performance_path=self.host_performance_path, + ) + + else: + runner_cmd = " ".join( + [ + self.runner_base_cmd, + f"--seq_len {self.max_seq_length}", + f"--eval_mode {EVAL_MODE['kv']}", + "--temperature 0", + f"--dump_logits_path {self.device_logits_path}", + f"--tokenized_prompt {os.path.basename(input_file_name)}", + ] + ) + self.adb.push(inputs=[], files=[input_file_name], init_env=False) + self.adb.execute(custom_runner_cmd=runner_cmd) + self.adb.pull( + host_output_path=self.host_logits_path, + device_output_path=self.device_logits_path, + callback=partial( + post_process_logits, + output_holder=output_logits_holder, + host_logits_path=self.host_logits_path, + kv_io_bit_width=self.kv_io_bit_width, + num_input_tokens=inps.size, + output_vocab_size=self.output_vocab_size, + logits_scale=self.logits_scale, + logits_zero_point=self.logits_zero_point, + ), + ) + self.adb.pull( + host_output_path=self.host_performance_path, + device_output_path=self.device_performance_path, + callback=partial( + post_process_inference_speed, + output_holder=output_performance_holder, + host_performance_path=self.host_performance_path, + ), + ) + self.inference_speed = output_performance_holder[0] + return output_logits_holder[0] + + def __init__(self, args, pte_path, tokenizer, runtime_tokenizer_path): + super().__init__( + args=args, pte_path=pte_path, runtime_tokenizer_path=runtime_tokenizer_path + ) + self.inference_speed = None + self.tasks = args.tasks + self.num_fewshot = args.num_fewshot + self.limit = args.limit + adb = self._get_adb() + self.eval_wrapper = TaskEval.QnnRunnerEvalWrapper( + args=args, + runner_base_cmd=self.runner_base_cmd, + adb=adb, + pte_path=self.pte_path, + device_performance_path=self.device_performance_path, + device_logits_path=self.device_logits_path, + host_performance_path=self.host_performance_path, + host_logits_path=self.host_logits_path, + tokenizer=tokenizer, + runtime_tokenizer_path=self.runtime_tokenizer_path, + ) + + def run(self): + with torch.no_grad(): + eval_results = simple_evaluate( + model=self.eval_wrapper, + tasks=self.tasks, + num_fewshot=self.num_fewshot, + limit=self.limit, + ) + self.inference_speed = self.eval_wrapper.inference_speed + return eval_results diff --git a/examples/qualcomm/oss_scripts/llama/decoder_utils.py b/examples/qualcomm/oss_scripts/llama/decoder_utils.py index 3d28ca2186b..9165c8900ce 100644 --- a/examples/qualcomm/oss_scripts/llama/decoder_utils.py +++ b/examples/qualcomm/oss_scripts/llama/decoder_utils.py @@ -5,25 +5,16 @@ # LICENSE file in the root directory of this source tree. 
import copy -import getpass import logging -import os -import subprocess from collections import defaultdict, OrderedDict from dataclasses import dataclass from typing import List, Optional, Tuple, Union -import numpy as np import torch from executorch.backends.qualcomm._passes import SeqMSE from executorch.examples.models.llama.evaluate.eager_eval import EagerEvalWrapper -from executorch.examples.qualcomm.oss_scripts.llama.decoder_constants import ( - DECODER_MODEL_VERSION, - EVAL_MODE, -) from executorch.examples.qualcomm.oss_scripts.llama.masking_utils import AttentionMask -from executorch.examples.qualcomm.utils import make_output_dir, SimpleADB from executorch.exir._serialize._program import deserialize_pte_binary from pytorch_tokenizers.hf_tokenizer import HuggingFaceTokenizer from pytorch_tokenizers.llama2c import Llama2cTokenizer as SentencePieceTokenizer @@ -291,184 +282,51 @@ def verify( return best_match, branch -class QnnRunnerEvalWrapper(EagerEvalWrapper): - """ - A wrapper class to run PPL scores with QNN on device. - """ - - def __init__( # noqa: C901 - self, - args, - pte_path: str, - tokenizer: Union[ - SentencePieceTokenizer, TiktokenTokenizer, HuggingFaceTokenizer - ], - runtime_tokenizer_path, - ): - self.args = args - self.pte_path = pte_path - self.enable_x86_64 = args.enable_x86_64 - self.max_seq_length = args.max_seq_len - - if self.enable_x86_64: - logging.warning( - "Using x86_64 emulator is NOT recommended as it is for CI purpose." - ) +def retrieve_info_from_pte(pte_path: str) -> dict: + # Retrieve vocab_size from get_metadata under static_llama that is passed to edge manager + output_vocab_size = None + pte_max_seq_len = None + logits_scale = None + logits_zero_point = None + kv_io_bit_width = 32 - with open(pte_path, "rb") as f: - program_data = f.read() + with open(pte_path, "rb") as f: + program_data = f.read() program = deserialize_pte_binary(program_data).program - # Retrieve vocab_size from get_metadata under static_llama that is passed to edge manager - self.output_vocab_size = None - pte_max_seq_len = None - self.logits_scale = None - self.logits_zero_point = None - self.kv_io_bit_width = 32 - for method in program.execution_plan: - # Don't use tokenizer.n_words, the numbers are off once calling get_tokenizer() - if method.name == "get_vocab_size": - # pyre-ignore - self.output_vocab_size = method.values[0].val.int_val - if method.name == "get_max_seq_len": - # pyre-ignore - pte_max_seq_len = method.values[0].val.int_val - if method.name == "get_logits_scale": - self.logits_scale = method.values[0].val.double_val - if method.name == "get_logits_zero_point": - self.logits_zero_point = method.values[0].val.int_val - if method.name == "get_kv_io_bit_width": - self.kv_io_bit_width = method.values[0].val.int_val - - # FP has no scale/zero_point, use following values, which is equivalent to not performing dequantize. - if self.kv_io_bit_width == 32: - self.logits_scale = 1 - self.logits_zero_point = 0 - elif self.logits_scale is None or self.logits_zero_point is None: - raise RuntimeError( - "Unable to find scale/offset. The .pte file might be deprecated. 
Please generate a new .pte file" - ) - - assert self.output_vocab_size is not None, "Couldn't find the vocab size" - assert pte_max_seq_len is not None, "Couldn't find the max_seq_len from pte" - if pte_max_seq_len != self.max_seq_length: - logging.warning( - f"The pte provided has a max_seq_len {pte_max_seq_len}, which is different from --max_seq_len {self.max_seq_length} provided to the script, please ensure this is desired." - ) - if pte_max_seq_len < self.max_seq_length: - logging.warning( - f"The pte max_seq_len {pte_max_seq_len} is used since it is shorter than --max_seq_len {self.max_seq_length}" - ) - self.max_seq_length = pte_max_seq_len - self.runtime_tokenizer_path = runtime_tokenizer_path - - self.output_dir = args.artifact - - self.workspace = f"/data/local/tmp/{getpass.getuser()}/executorch/single_llama" - self.adb = SimpleADB( - qnn_sdk=os.getenv("QNN_SDK_ROOT"), - build_path=args.build_folder, - pte_path=pte_path, - workspace=self.workspace, - device_id=args.device, - host_id=args.host, - soc_model=args.model, - runner="examples/qualcomm/oss_scripts/llama/qnn_llama_runner", - target=args.target, + for method in program.execution_plan: + # Don't use tokenizer.n_words, the numbers are off once calling get_tokenizer() + if method.name == "get_vocab_size": + # pyre-ignore + output_vocab_size = method.values[0].val.int_val + if method.name == "get_max_seq_len": + # pyre-ignore + pte_max_seq_len = method.values[0].val.int_val + if method.name == "get_logits_scale": + logits_scale = method.values[0].val.double_val + if method.name == "get_logits_zero_point": + logits_zero_point = method.values[0].val.int_val + if method.name == "get_kv_io_bit_width": + kv_io_bit_width = method.values[0].val.int_val + + # FP has no scale/zero_point, use following values, which is equivalent to not performing dequantize. + if kv_io_bit_width == 32: + logits_scale = 1 + logits_zero_point = 0 + elif logits_scale is None or logits_zero_point is None: + raise RuntimeError( + "Unable to find scale/offset. The .pte file might be deprecated. 
Please generate a new .pte file" ) - - # collect output data - output_data_folder = f"{self.args.artifact}/outputs" - make_output_dir(output_data_folder) - - if not self.enable_x86_64: - self.adb.push(inputs=[], files=[self.runtime_tokenizer_path]) - # n seq len = n-1 cache len, so we len(inps) = n-1 during _model_call - # pyre-ignore - super().__init__(None, tokenizer, self.max_seq_length - 1) - - def _model_call(self, inps): - - input_file_name = f"{self.args.artifact}/input_tokens.raw" - inps = inps.to(torch.uint64).numpy() - inps.tofile(input_file_name) - - outputs_path = "outputs/outputs.txt" - dump_logits_path = "outputs/all_logit.raw" - performance_output_path = "outputs/inference_speed.txt" - output_tensor_list = [] - - def post_process(): - with open(f"{self.args.artifact}/{dump_logits_path}", "r") as f: - logits_dtype = np.float32 if self.kv_io_bit_width == 32 else np.uint16 - output_tensor = torch.from_numpy( - np.fromfile(f.name, dtype=logits_dtype).reshape( - 1, -1, self.output_vocab_size - ) - ) - output_tensor = ( - output_tensor.to(torch.float32) - self.logits_zero_point - ) * self.logits_scale - output_tensor_list.append(output_tensor) - - # simple_eval will run multiple rounds, use last run for inference speed - with open(f"{self.args.artifact}/{performance_output_path}", "r") as f: - self.inference_speed = float(f.read()) - - if self.enable_x86_64: - qnn_sdk = os.getenv("QNN_SDK_ROOT") - target = "x86_64-linux-clang" - runner_cmd = " ".join( - [ - f"export LD_LIBRARY_PATH={qnn_sdk}/lib/{target}/:{self.args.build_folder}/lib &&", - f"./{self.args.build_folder}/examples/qualcomm/oss_scripts/llama/qnn_llama_runner", - f"--decoder_model_version {DECODER_MODEL_VERSION[self.args.decoder_model]}", - f"--tokenizer_path {self.runtime_tokenizer_path}", - f"--model_path {self.pte_path}", - f"--seq_len {self.max_seq_length}", - f"--output_path {self.args.artifact}/outputs/outputs.txt", - f"--performance_output_path {self.args.artifact}/{performance_output_path}", - f"--eval_mode {EVAL_MODE[self.args.model_mode]}", - "--temperature 0", - f"--dump_logits_path {self.args.artifact}/{dump_logits_path}", - f"--tokenized_prompt {input_file_name}", - ] - ) - subprocess.run( - runner_cmd, - shell=True, - executable="/bin/bash", - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - ) - post_process() - - else: - runner_cmd = " ".join( - [ - f"cd {self.workspace} &&", - "./qnn_llama_runner", - f"--decoder_model_version {DECODER_MODEL_VERSION[self.args.decoder_model]}", - f"--tokenizer_path {os.path.basename(self.runtime_tokenizer_path)}", - f"--model_path {os.path.basename(self.pte_path)}", - f"--seq_len {self.max_seq_length}", - f"--output_path {outputs_path}", - f"--performance_output_path {performance_output_path}", - f"--window {self.args.window}", - f"--gcap {self.args.gcap}", - f"--ngram {self.args.ngram}", - f"--eval_mode {EVAL_MODE[self.args.model_mode]}", - "--temperature 0", - f"--dump_logits_path {dump_logits_path}", - f"--tokenized_prompt {os.path.basename(input_file_name)}", - "--shared_buffer", - ] - ) - - self.adb.push(inputs=[], files=[input_file_name], init_env=False) - self.adb.execute(custom_runner_cmd=runner_cmd) - self.adb.pull(output_path=self.output_dir, callback=post_process) - return output_tensor_list[0] + assert output_vocab_size is not None, "Couldn't find the vocab size" + assert pte_max_seq_len is not None, "Couldn't find the max_seq_len from pte" + meta_info = { + "output_vocab_size": output_vocab_size, + "pte_max_seq_len": pte_max_seq_len, + "logits_scale": 
logits_scale, + "logits_zero_point": logits_zero_point, + "kv_io_bit_width": kv_io_bit_width, + } + return meta_info def smart_mask_updater( diff --git a/examples/qualcomm/oss_scripts/llama/eval_llama_qnn.py b/examples/qualcomm/oss_scripts/llama/eval_llama_qnn.py index 05592b791fd..764033b2789 100644 --- a/examples/qualcomm/oss_scripts/llama/eval_llama_qnn.py +++ b/examples/qualcomm/oss_scripts/llama/eval_llama_qnn.py @@ -212,7 +212,7 @@ def permute(w, heads): def prequant_algorithm(model, prefill_config, args): # TODO: use dtype of model checkpoint model = model.to(device=args.device, dtype=torch.float) - inputs = model.get_example_inputs(use_kv_cache=False) + inputs = model.get_example_inputs() tokens, atten_mask = inputs tokens.to(args.device) for mask in atten_mask.masks: @@ -306,6 +306,7 @@ def eager_eval_quanty( def eval_llm(args): tokenizer = prepare_tokenizer(args) model, prefill_config = prepare_model(args) + get_example_inputs = model.get_example_inputs model, config, inputs, scales_state_dict = prequant_algorithm( model, prefill_config, args ) @@ -319,7 +320,7 @@ def eval_llm(args): ) quantizer = make_custom_quantizer( - quant_dtype, args.range_setting, custom_annotations, args.quant_linear_only + quant_dtype, custom_annotations, args.quant_linear_only ) with torch.no_grad(): @@ -337,7 +338,7 @@ def eval_llm(args): logging.info("Observers added, starting calibration...") graph_module_inference( use_kv_cache=False, - get_example_inputs=lambda use_kv_cache=False: inputs, + example_input=get_example_inputs(), module=model, tokenizer=tokenizer, ar_len=args.max_seq_len, @@ -371,7 +372,7 @@ def eval_llm(args): logging.info("Evaluation of QDQ model:") graph_module_inference( use_kv_cache=False, - get_example_inputs=lambda use_kv_cache=False: inputs, + example_input=get_example_inputs(), module=model, tokenizer=tokenizer, ar_len=args.max_seq_len, diff --git a/examples/qualcomm/oss_scripts/llama/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py index 1267b1ac78b..74bb038dc29 100755 --- a/examples/qualcomm/oss_scripts/llama/llama.py +++ b/examples/qualcomm/oss_scripts/llama/llama.py @@ -7,16 +7,36 @@ # TODO: reenable pyre after fixing the issues # pyre-ignore-all-errors +import copy import getpass import json import logging import os -import subprocess import sys from multiprocessing.connection import Client from typing import Dict import torch +from executorch.backends.qualcomm._passes import FoldQDQ, TagQuantIO +from executorch.backends.qualcomm._passes.i64_to_i32 import I64toI32 +from executorch.backends.qualcomm._passes.qnn_pass_manager import ( + get_capture_program_passes, +) +from executorch.backends.qualcomm._passes.utils import ( + get_passes_dependency_for_capture_program, +) + +from executorch.backends.qualcomm.builders.utils import is_graph_output +from executorch.backends.qualcomm.quantizer.custom_annotation import ( + annotate_prefill_kv_output, +) + +from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset + +from executorch.backends.qualcomm.utils.constants import ( + QCOM_PASS_ACTIVATE_KEY, + QCOM_PASS_ARGS_KWARGS_DEFAULTS_KEY, +) from executorch.backends.qualcomm.utils.utils import ( generate_htp_compiler_spec, generate_qnn_executorch_compiler_spec, @@ -31,14 +51,32 @@ AUDIO_ENCODER, DECODER_GRAPH_NAMES, EVAL_MODE, + PROMPT_EVAL, + SQNR_EVAL, + TASKS_EVAL, TEXT_DECODER, TEXT_EMBEDDING, TEXT_EMBEDDING_GRAPH_NAMES, TEXT_ENCODER, VISION_ENCODER, ) +from executorch.examples.qualcomm.oss_scripts.llama.decoder_runtime_evaluator import ( + DefaultEval, 
+ SqnrEval, + TaskEval, +) from executorch.examples.qualcomm.oss_scripts.llama.decoder_utils import ( - QnnRunnerEvalWrapper, + graph_module_inference, +) +from executorch.examples.qualcomm.oss_scripts.llama.model.static_llama import ( + LlamaModel, + ModelArgs, +) +from executorch.examples.qualcomm.oss_scripts.llama.range_setting_pt2e import ( + make_custom_quantizer, +) +from executorch.examples.qualcomm.oss_scripts.llama.static_llm_quant_recipe import ( + StaticLLMQuantRecipe, ) from executorch.examples.qualcomm.oss_scripts.llama.tokenizer import TokenizerWrapper from executorch.examples.qualcomm.oss_scripts.llama.wrappers import ( @@ -46,20 +84,16 @@ next_power_of_two, ) -from executorch.examples.qualcomm.utils import ( - make_output_dir, - setup_common_args_and_variables, - SimpleADB, -) +from executorch.examples.qualcomm.utils import setup_common_args_and_variables +from executorch.exir.capture._config import ExecutorchBackendConfig +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass +from executorch.extension.llm.custom_ops import model_sharding +from executorch.extension.llm.export.builder import DType +from pytorch_tokenizers import get_tokenizer, TiktokenTokenizer +from pytorch_tokenizers.llama2c import Llama2cTokenizer as SentencePieceTokenizer -try: - from lm_eval.evaluator import simple_evaluate -except ImportError: - raise ImportError( - "Please install the llm eval dependency via examples/models/llama/install_requirements.sh" - ) - sys.setrecursionlimit(4096) FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" logging.basicConfig(level=logging.INFO, format=FORMAT) @@ -93,7 +127,7 @@ def compile( TEXT_DECODER: None, } is_modality = False - # compile spec for multimodlity encoder + # compile spec for multimodality encoder for modality in compile_specs: if not hasattr(decoder_model_config, modality): continue @@ -149,6 +183,7 @@ def inference( pte_filenames: Dict[str, str], runtime_tokenizer_path, tokenizer, + chat_template, ): assert args.model_mode in EVAL_MODE, f"Unknown model_mode: {args.model_mode}." 
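As a reference for how the metadata dictionary assembled above in decoder_utils.py is typically consumed, here is a minimal sketch (the function name and file layout are hypothetical) of dequantizing the raw logits the runner dumps to disk, mirroring the affine dequantization that the removed QnnRunnerEvalWrapper performed:

import numpy as np
import torch


def dequantize_runner_logits(raw_logits_path: str, meta_info: dict) -> torch.Tensor:
    # A 32-bit KV IO width means the runner already emits float logits;
    # otherwise they are dumped as uint16 quantized values.
    dtype = np.float32 if meta_info["kv_io_bit_width"] == 32 else np.uint16
    logits = torch.from_numpy(
        np.fromfile(raw_logits_path, dtype=dtype).reshape(
            1, -1, meta_info["output_vocab_size"]
        )
    )
    # Affine dequantization; for FP models the helper above returns scale=1 and
    # zero_point=0, so this step is a no-op.
    return (
        logits.to(torch.float32) - meta_info["logits_zero_point"]
    ) * meta_info["logits_scale"]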
@@ -156,7 +191,6 @@ def inference( is_modality = hasattr(decoder_model_config, VISION_ENCODER) or hasattr( decoder_model_config, AUDIO_ENCODER ) - pte_path = ( f"{args.pre_gen_pte}/{pte_filenames[TEXT_DECODER]}.pte" if args.pre_gen_pte @@ -192,215 +226,89 @@ def inference( exit(0) return None - # For decoder-only models, enable accuracy evaluation using perplexity - # TODO: Add support for multimodal accuracy evaluation (e.g., VLM) - if args.run_lm_eval: - # Generate the eval wrapper - eval_wrapper = QnnRunnerEvalWrapper( + eval_results = { + "pte_size": os.path.getsize(pte_path), + } + if PROMPT_EVAL in args.eval_methods: + prompt_evaluator = DefaultEval( + args=args, pte_path=pte_path, runtime_tokenizer_path=runtime_tokenizer_path + ) + output_prompt = prompt_evaluator.run(prompt=args.prompt) + eval_results.update( + { + "inference_speed": prompt_evaluator.inference_speed, + "result": output_prompt, + } + ) + for idx, output in enumerate(output_prompt): + logging.info(f"Device Inference Results[{idx}]:\n{output}") + + if SQNR_EVAL in args.eval_methods: + tokenizer_wrapper = TokenizerWrapper( + args, + decoder_model_config, + ) + prompt = ( + tokenizer_wrapper.apply_prompt_template( + chat_template, args.prompt[0], args.system_prompt + ) + if chat_template is not None + else args.prompt[0] + ) + multi_modal_mgr = MultiModalManager( + control_args=args, config=decoder_model_config + ) + source_model = multi_modal_mgr.text_decoder.decode.decoder + sqnr_evaluator = SqnrEval( + source_model=source_model, + example_input=source_model.get_example_inputs(), args=args, pte_path=pte_path, tokenizer=tokenizer, runtime_tokenizer_path=runtime_tokenizer_path, ) + sqnr = sqnr_evaluator.run(prompt=prompt) + logging.info(f"SQNR Eval Score: {sqnr}") + eval_results.update( + { + "sqnr": sqnr, + "inference_speed": sqnr_evaluator.inference_speed, + } + ) - # Evaluate the model - with torch.no_grad(): - eval_results = simple_evaluate( - model=eval_wrapper, - tasks=args.tasks, - num_fewshot=args.num_fewshot, - limit=args.limit, - ) + if TASKS_EVAL in args.eval_methods: + # Generate the eval wrapper + ppl_evaluator = TaskEval( + args=args, + pte_path=pte_path, + tokenizer=tokenizer, + runtime_tokenizer_path=runtime_tokenizer_path, + ) + ppl_eval_result = ppl_evaluator.run() + eval_results["inference_speed"] = ppl_evaluator.inference_speed - if args.ip and args.port != -1: - assert len(args.tasks) == 1, "CI currently supports 1 lm_eval task only." - match args.tasks[0]: + for task, res in ppl_eval_result["results"].items(): + match task: case "wikitext": - wiki_ppl = eval_results["results"][args.tasks[0]][ - "word_perplexity,none" - ] - pte_size = os.path.getsize(pte_path) - with Client((args.ip, args.port)) as conn: - conn.send( - json.dumps( - { - "wiki_ppl": wiki_ppl, - "pte_size": pte_size, - "inference_speed": eval_wrapper.inference_speed, - } - ) - ) + wiki_ppl = ppl_eval_result["results"][task]["word_perplexity,none"] + eval_results["wiki_ppl"] = wiki_ppl case "hellaswag": - acc_norm = eval_results["results"][args.tasks[0]]["acc_norm,none"] - pte_size = os.path.getsize(pte_path) - with Client((args.ip, args.port)) as conn: - conn.send( - json.dumps( - { - "acc_norm": acc_norm, - "pte_size": pte_size, - "inference_speed": eval_wrapper.inference_speed, - } - ) - ) + acc_norm = ppl_eval_result["results"][task]["acc_norm,none"] + eval_results["acc_norm"] = acc_norm case _: - raise RuntimeError( - "CI currently supports [wikitext, hellaswag] only." 
- ) - - else: - for task, res in eval_results["results"].items(): - logging.info(f"{task}: {res}") - return - - workspace = f"/data/local/tmp/{getpass.getuser()}/executorch/single_llama" - - # collect output data - output_data_folder = f"{args.artifact}/outputs" - make_output_dir(output_data_folder) - outputs = [] - - def post_process(): - with open(f"{args.artifact}/outputs/outputs.txt", "r") as f: - outputs.append(f.read()) - - seq_len = args.max_seq_len - multi_prompts = " ".join([f'--prompt "{prompt}"' for prompt in args.prompt]) - lookahead_args = " ".join( - [ - f"--window {args.window}", - f"--gcap {args.gcap}", - f"--ngram {args.ngram}", - ] - ) - runner_args = " ".join( - [ - multi_prompts, - f"--eval_mode {EVAL_MODE[args.model_mode]}", - f"--temperature {args.temperature}", - f"--system_prompt '{args.system_prompt}'", - lookahead_args if args.model_mode == "lookahead" else "", - ] - ) - - runner_cmd = "" - performance_output_path = "outputs/inference_speed.txt" - if args.enable_x86_64: - # x86 emulator is intended for CI and not performance. Check only the first few tokens. - seq_len = min(seq_len, 16) - - qnn_sdk = os.getenv("QNN_SDK_ROOT") - target = "x86_64-linux-clang" - runner_cmd = " ".join( - [ - f"export LD_LIBRARY_PATH={qnn_sdk}/lib/{target}/:{args.build_folder}/lib &&", - f"./{args.build_folder}/examples/qualcomm/oss_scripts/llama/qnn_llama_runner", - f"--decoder_model_version {decoder_model_config.decoder_model_version}", - f"--tokenizer_path {runtime_tokenizer_path}", - f"--model_path {pte_path}", - f"--seq_len {seq_len}", - f"--output_path {args.artifact}/outputs/outputs.txt", - f"--performance_output_path {args.artifact}/{performance_output_path}", - runner_args, - ] - ) - subprocess.run( - runner_cmd, - shell=True, - executable="/bin/bash", - capture_output=True, - ) - post_process() - else: - runner_cmd = " ".join( - [ - f"cd {workspace} &&", - f"./qnn_llama_runner", - f"--decoder_model_version {decoder_model_config.decoder_model_version}", - f"--tokenizer_path {os.path.basename(runtime_tokenizer_path)}", - f"--model_path {pte_filenames[TEXT_DECODER]}.pte", - f"--seq_len {seq_len}", - "--output_path outputs/outputs.txt", - f"--performance_output_path {performance_output_path}", - "--shared_buffer", - runner_args, - ] - ) - adb = SimpleADB( - qnn_sdk=os.getenv("QNN_SDK_ROOT"), - build_path=f"{args.build_folder}", - pte_path=pte_path, - workspace=workspace, - device_id=args.device, - host_id=args.host, - soc_model=args.model, - shared_buffer=True, - target=args.target, - runner=f"examples/qualcomm/oss_scripts/llama/qnn_llama_runner", - ) - - # No pregen inputs, input_list is not required - if not args.skip_push: - adb.push(inputs=[], files=[runtime_tokenizer_path]) - adb.execute(custom_runner_cmd=runner_cmd) - adb.pull(output_path=args.artifact, callback=post_process) + if args.ip and args.port != -1: + raise RuntimeError( + "CI currently supports [wikitext, hellaswag] only." 
+ ) + logging.info(f"{task}: {res}") if args.ip and args.port != -1: - inference_speed = 0 - with open( - f"{os.path.abspath(args.artifact)}/{performance_output_path}", "r" - ) as f: - inference_speed = float(f.read()) - - # Prepare validation results for CI system - validation_results = { - "result": outputs, - "inference_speed": inference_speed, - "pte_size": os.path.getsize(pte_path), - } with Client((args.ip, args.port)) as conn: - conn.send(json.dumps(validation_results)) - else: - for idx, output in enumerate(outputs): - logging.info(f"Results[{idx}]:\n{output}") - - -def _build_tasks_parser(parser): - parser.add_argument( - "--run_lm_eval", - help="If enabled, this will use the tasks provided under args.tasks to calibrate the model", - action="store_true", - default=False, - ) - - parser.add_argument( - "--tasks", - nargs="+", - type=str, - default=None, - help="list of lm-eluther tasks to evaluate usage: --tasks task1 task2", - ) - - parser.add_argument( - "--limit", - type=int, - default=1, - help="number of samples to evalulate. If not set, evaluate all samples", - ) - parser.add_argument( - "--num_fewshot", - type=int, - default=None, - metavar="N", - help="Number of examples in few-shot context", - ) - - return parser + conn.send(json.dumps(eval_results)) def _build_parser(): parser = setup_common_args_and_variables() - parser = _build_tasks_parser(parser) parser.add_argument( "-a", "--artifact", @@ -526,6 +434,40 @@ def _build_parser(): type=int, ) + parser.add_argument( + "--eval_methods", + choices=[PROMPT_EVAL, TASKS_EVAL, SQNR_EVAL], + nargs="+", + default=[PROMPT_EVAL], + help="Choose eval methods(default: prompt_eval). Users can provide more than 1 eval methods. For example: --eval_methods tasks_eval sqnr_eval." + "Following eval methods are supported:" + "1) prompt_eval: Model will generate the output response based on the provided prompt through the flag --prompt." + "2) tasks_eval: This will eval the tasks provided through the flag --tasks." + "3) sqnr_eval: This will eval the sqnr between between QNN's output logit V.S. Static Llama nn.Module's output logit. Eval is based on the provided prompt through the --prompt flag. Please note that sqnr will only eval the prompt's logit but not the new generated token's logit.", + ) + + parser.add_argument( + "--tasks", + nargs="+", + type=str, + default=None, + help="list of lm-eluther tasks to evaluate usage: --tasks task1 task2", + ) + + parser.add_argument( + "--limit", + type=int, + default=1, + help="number of samples to evalulate. If not set, evaluate all samples", + ) + parser.add_argument( + "--num_fewshot", + type=int, + default=None, + metavar="N", + help="Number of examples in few-shot context", + ) + parser.add_argument("-v", "--verbose", action="store_true") return parser @@ -534,9 +476,14 @@ def _build_parser(): def export_llama(args) -> None: if args.compile_only and args.pre_gen_pte: raise RuntimeError("Cannot set both compile_only and pre_gen_pte as true") - if args.run_lm_eval and args.model_mode != "kv": - raise RuntimeError("Eval device perplexity is only supported for KV mode") - if args.run_lm_eval and args.tasks is None: + if (TASKS_EVAL or SQNR_EVAL) in args.eval_methods and args.model_mode not in { + "kv", + "hybrid", + }: + raise RuntimeError( + "Eval device perplexity is only supported for KV mode. Hybrid mode will only use KV mode when evaluating tasks/sqnr." 
+ ) + if TASKS_EVAL in args.eval_methods and args.tasks is None: raise RuntimeError("Please provide --tasks to eval perplexity") assert ( args.decoder_model in SUPPORTED_LLM_MODELS @@ -590,7 +537,12 @@ def export_llama(args) -> None: if args.pre_gen_pte: inference( - args, decoder_model_config, pte_filenames, runtime_tokenizer_path, tokenizer + args, + decoder_model_config, + pte_filenames, + runtime_tokenizer_path, + tokenizer, + chat_template, ) print(f"Finish the running pre_gen_pte from {args.pre_gen_pte}") return @@ -626,7 +578,12 @@ def export_llama(args) -> None: calibration_data, ) inference( - args, decoder_model_config, pte_filenames, runtime_tokenizer_path, tokenizer + args, + decoder_model_config, + pte_filenames, + runtime_tokenizer_path, + tokenizer, + chat_template, ) diff --git a/examples/qualcomm/oss_scripts/llama/masking_utils.py b/examples/qualcomm/oss_scripts/llama/masking_utils.py index ea68f89276a..232b885247f 100644 --- a/examples/qualcomm/oss_scripts/llama/masking_utils.py +++ b/examples/qualcomm/oss_scripts/llama/masking_utils.py @@ -194,7 +194,7 @@ def smart_mask_update(self, pos, n_updates, lade_pos_offset): After 2nd update (e.g., pos=5, n_updates=5, sliding_window=3): - Sliding window shifts again, masking older positions and activate new postion. + Sliding window shifts again, masking older positions and activate new position. 0 ○ ○ ○ ○ ○ ○ ○ ○ ● ● ● ○ ○ ○ ○ 1 ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ● ○ ○ ○ diff --git a/examples/qualcomm/oss_scripts/llama/range_setting_pt2e.py b/examples/qualcomm/oss_scripts/llama/range_setting_pt2e.py index d5c68606f60..02c312bc89e 100644 --- a/examples/qualcomm/oss_scripts/llama/range_setting_pt2e.py +++ b/examples/qualcomm/oss_scripts/llama/range_setting_pt2e.py @@ -22,12 +22,6 @@ PerChannelParamObserver, ) -from executorch.backends.qualcomm.quantizer.qconfig import ( - _derived_bias_quant_spec, - QuantizationConfig, -) -from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype - from executorch.examples.qualcomm.utils import make_quantizer from torchao.prototype.quantization.module_swap import ( @@ -46,7 +40,6 @@ ) from torchao.quantization.pt2e import MinMaxObserver, PerChannelMinMaxObserver -from torchao.quantization.pt2e.quantizer import QuantizationSpec class WrappedLlamaModel(nn.Module): @@ -201,49 +194,14 @@ def compute_scales(model, data, weight_bits, act_bits, num_points=1600): return scales_state_dict -def make_custom_quantizer( - quant_dtype, range_setting=None, custom_annotations=(), linear_only=False -): +def make_custom_quantizer(quant_dtype, custom_annotations=(), linear_only=False): quantizer = make_quantizer( quant_dtype=quant_dtype, per_channel_conv=True, per_channel_linear=True, act_observer=MinMaxObserver, ) - if range_setting in ("mse_weight_only", "mse_with_act_loss"): - assert ( - quant_dtype != QuantDtype.use_16a4w_block - ), "Range setting only supported for per-channel quantization" - if range_setting == "mse_weight_only": - observer = PerChannelMSEObserver.with_args( - **{"steps": 1600, "use_mse": True} - ) - else: - observer = PerChannelFixedQParamsObserver.with_args(**{"eps": 2**-12}) - weight_dtype = ( - torch.int4 - if quant_dtype in (QuantDtype.use_16a4w, QuantDtype.use_16a4w_block) - else torch.int8 - ) - per_channel_q_config = quantizer.default_quant_config.quant_config - weight_qspec = QuantizationSpec( - dtype=torch.int8 if weight_dtype == torch.int4 else weight_dtype, - quant_min=( - -7 if weight_dtype == torch.int4 else torch.iinfo(weight_dtype).min + 1 - ), - quant_max=( - 7 if 
weight_dtype == torch.int4 else torch.iinfo(weight_dtype).max - ), - qscheme=torch.per_channel_symmetric, - ch_axis=0, - observer_or_fake_quant_ctr=observer, - ) - quantizer.default_quant_config.per_channel_quant_config = QuantizationConfig( - input_activation=per_channel_q_config.input_activation, - output_activation=per_channel_q_config.output_activation, - weight=weight_qspec, - bias=_derived_bias_quant_spec, - ) + if linear_only: all_keys = set(OP_ANNOTATOR.keys()) conv_keys = { diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp index 440b56f67a0..5794069d335 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp @@ -162,8 +162,8 @@ Error Runner::load() { std::vector method_names; switch (eval_mode_) { case EvalMode::kKVCached: - prompt_processor_method_name = "forward"; - token_generator_method_name = "forward"; + prompt_processor_method_name = "kv_forward"; + token_generator_method_name = "kv_forward"; method_names.emplace_back(token_generator_method_name); break; case EvalMode::kHybrid: diff --git a/examples/qualcomm/oss_scripts/llama/tokenizer.py b/examples/qualcomm/oss_scripts/llama/tokenizer.py index 05aee95b402..b4113f0b32f 100644 --- a/examples/qualcomm/oss_scripts/llama/tokenizer.py +++ b/examples/qualcomm/oss_scripts/llama/tokenizer.py @@ -83,36 +83,22 @@ def _from_hf(self): ) tokenizer_artifacts = tokenizer.save_pretrained(self.artifact) tokenizer_config = tokenizer_artifacts[0] - if self.decoder_model == "gemma-2b": - # For Gemma, use tokenizer.model as it doesn't provide pre_tokenizer in tokenizer.json. - runtime_tokenizer_path = tokenizer_artifacts[-3] - else: - if self.decoder_model == "glm-1_5b": - with open(tokenizer_config, "r+") as file: - data = json.load(file) - # Verified with HF flow and it uses <|user|> as eos condition - data["bos_token"] = "<|user|>" - data["eos_token"] = "<|user|>" - file.seek(0) - json.dump(data, file, indent=4) - file.truncate() - runtime_tokenizer_path = tokenizer_artifacts[-1] - + if self.decoder_model == "glm-1_5b": + with open(tokenizer_config, "r+") as file: + data = json.load(file) + # Verified with HF flow and it uses <|user|> as eos condition + data["bos_token"] = "<|user|>" + data["eos_token"] = "<|user|>" + file.seek(0) + json.dump(data, file, indent=4) + file.truncate() + runtime_tokenizer_path = tokenizer_artifacts[-1] tokenizer = get_tokenizer(runtime_tokenizer_path, tokenizer_config) if self.decoder_model == "codegen2_1b": # Override the default BOS and EOS token IDs for codegen2_1b tokenizer.bos_id = 1 tokenizer.eos_id = 2 - elif self.decoder_model == "phi_4_mini": - with open(runtime_tokenizer_path, "r+") as file: - data = json.load(file) - # TODO: Encountered the following error during runtime, so switched behavior for now. - # Error: libc++abi: terminating due to uncaught exception of type std::runtime_error: invert=true is not supported for Split PreTokenizer. Only invert=false is supported. 
- data["pre_tokenizer"]["pretokenizers"][-2]["invert"] = False - file.seek(0) - json.dump(data, file, indent=4) - file.truncate() return runtime_tokenizer_path, tokenizer, chat_template diff --git a/examples/qualcomm/oss_scripts/llama/wrappers.py b/examples/qualcomm/oss_scripts/llama/wrappers.py index a6ba5a5116e..c54a5e4b3a0 100644 --- a/examples/qualcomm/oss_scripts/llama/wrappers.py +++ b/examples/qualcomm/oss_scripts/llama/wrappers.py @@ -173,7 +173,7 @@ def __init__( self.mode = mode self.passes_job = get_capture_program_passes() self.dep_table = get_passes_dependency_for_capture_program() - self.meta = None + self.quant_recipe: StaticLLMQuantRecipe = ( self.config.quant_recipe(True) if self.config.quant_recipe else None ) @@ -188,16 +188,6 @@ def __init__( get_passes_dependency_for_capture_program() if apply_embedding else None ) - # check if sharding required - if self.config.num_sharding > 1: - SplitGraph, setting = model_sharding.get_split_graph_pass( - self.meta["get_n_layers"], - shares=self.config.num_sharding, - ) - self.passes_job[SplitGraph] = setting - self.dep_table[SplitGraph] = [FoldQDQ] - self.dep_table[TagQuantIO] = [SplitGraph] - # load static llama model args params_path = ( config.params_path if control_args.params is None else control_args.params @@ -210,6 +200,17 @@ def __init__( self.decoder = None if (instance := self._prepare_model()) is not None: self.tok_embedding, self.decoder = instance + self.meta = self.decoder.get_metadata() + + # check if sharding required + if self.decoder and self.config.num_sharding > 1: + SplitGraph, setting = model_sharding.get_split_graph_pass( + self.meta["get_n_layers"], + shares=self.config.num_sharding, + ) + self.passes_job[SplitGraph] = setting + self.dep_table[SplitGraph] = [FoldQDQ] + self.dep_table[TagQuantIO] = [SplitGraph] def _process_model_args(self, model_args: ModelArgs): # TODO: support batch inputs if necessary @@ -391,9 +392,7 @@ def _get_model_specific_kwargs(self): hf_config = Gemma3Config.from_pretrained(self.config.repo_id) kwargs["layer_types"] = hf_config.text_config.layer_types - kwargs["rope_local_base_freq"] = ( - hf_config.text_config.rope_local_base_freq - ) + kwargs["rope_local_base_freq"] = self.model_args.local_rope_theta kwargs["sliding_window"] = hf_config.sliding_window return kwargs @@ -451,7 +450,6 @@ def _get_model_instance(self) -> LlamaModel: **self._get_model_specific_kwargs(), ) # get example input - self.meta = decoder.get_metadata() self.example_input = decoder.get_example_inputs() self.export_input = ( self.example_input[0], # tokens or hidden_states @@ -834,8 +832,7 @@ def compile(self, request: Request): # noqa: C901 data = request.method_data[TEXT_DECODER] models = [d for d in [self.decode, self.prefill] if d.decoder is not None] example_inputs = [m.export_input for m in models if m is not None] - # For backward compatibility, we keep the graph name as forward if we use kv mode for evaluation LLM models - graph_names = ["forward"] if len(models) == 1 else DECODER_GRAPH_NAMES + graph_names = DECODER_GRAPH_NAMES[: len(models)] # start lowering if self.apply_embedding: @@ -899,7 +896,7 @@ def compile(self, request: Request): # noqa: C901 if self.config.num_sharding > 1 and self.control_args.model_mode == "kv": # weight-sharing based context binaries cannot be opened in x86 host - update_spill_fill_size(edge_prog_mgr.exported_program()) + update_spill_fill_size(edge_prog_mgr.exported_program("kv_forward")) if self.control_args.verbose: for ep in edge_prog_mgr._edge_programs.values(): 
diff --git a/examples/qualcomm/oss_scripts/llm_utils/eval_decoder_model_qnn.py b/examples/qualcomm/oss_scripts/llm_utils/eval_decoder_model_qnn.py index df19519e847..461224e1ccf 100644 --- a/examples/qualcomm/oss_scripts/llm_utils/eval_decoder_model_qnn.py +++ b/examples/qualcomm/oss_scripts/llm_utils/eval_decoder_model_qnn.py @@ -171,7 +171,7 @@ def post_process(): result_logits.append(output_tensor) - self.adb.pull(output_path=self.output_dir, callback=post_process) + self.adb.pull(host_output_path=self.output_dir, callback=post_process) return torch.cat(result_logits, dim=1) diff --git a/examples/qualcomm/oss_scripts/maxvit_t.py b/examples/qualcomm/oss_scripts/maxvit_t.py index e4bf9cd9127..b1d6e8b8a74 100755 --- a/examples/qualcomm/oss_scripts/maxvit_t.py +++ b/examples/qualcomm/oss_scripts/maxvit_t.py @@ -202,7 +202,7 @@ def main(args): output_data_folder = f"{args.artifact}/outputs" make_output_dir(output_data_folder) - adb.pull(output_path=args.artifact) + adb.pull(host_output_path=args.artifact) # top-k analysis predictions = [] diff --git a/examples/qualcomm/oss_scripts/mobilevit_v1.py b/examples/qualcomm/oss_scripts/mobilevit_v1.py index af7170a34bf..c8e84ebeeb1 100644 --- a/examples/qualcomm/oss_scripts/mobilevit_v1.py +++ b/examples/qualcomm/oss_scripts/mobilevit_v1.py @@ -122,7 +122,7 @@ def main(args): output_data_folder = f"{args.artifact}/outputs" make_output_dir(output_data_folder) - adb.pull(output_path=args.artifact) + adb.pull(host_output_path=args.artifact) # top-k analysis predictions = [] diff --git a/examples/qualcomm/oss_scripts/mobilevit_v2.py b/examples/qualcomm/oss_scripts/mobilevit_v2.py index d5f43e7bd6c..fe5245d4607 100644 --- a/examples/qualcomm/oss_scripts/mobilevit_v2.py +++ b/examples/qualcomm/oss_scripts/mobilevit_v2.py @@ -130,7 +130,7 @@ def main(args): output_data_folder = f"{args.artifact}/outputs" make_output_dir(output_data_folder) - adb.pull(output_path=args.artifact) + adb.pull(host_output_path=args.artifact) # top-k analysis predictions = [] diff --git a/examples/qualcomm/oss_scripts/moshi/mimi.py b/examples/qualcomm/oss_scripts/moshi/mimi.py index ebb4519fe10..6e0742094a1 100644 --- a/examples/qualcomm/oss_scripts/moshi/mimi.py +++ b/examples/qualcomm/oss_scripts/moshi/mimi.py @@ -195,7 +195,7 @@ def inference_mimi_encoder(args, encoder_inputs, encoder_pte_filename): output_data_folder = f"{args.artifact}/outputs" make_output_dir(output_data_folder) - adb.pull(output_path=args.artifact) + adb.pull(host_output_path=args.artifact) encoder_predictions = [] for i in range(len(encoder_inputs)): @@ -371,7 +371,7 @@ def inference_static_mimi_decoder( output_data_folder = f"{args.artifact}/outputs" make_output_dir(output_data_folder) - adb.pull(output_path=args.artifact) + adb.pull(host_output_path=args.artifact) num_chunks = len(encoded_results) shape = num_chunks * pcm_chunk_size diff --git a/examples/qualcomm/oss_scripts/pvt.py b/examples/qualcomm/oss_scripts/pvt.py index dd564f7c043..22a7d78d61f 100644 --- a/examples/qualcomm/oss_scripts/pvt.py +++ b/examples/qualcomm/oss_scripts/pvt.py @@ -97,7 +97,7 @@ def main(args): output_data_folder = f"{args.artifact}/outputs" make_output_dir(output_data_folder) - adb.pull(output_path=args.artifact) + adb.pull(host_output_path=args.artifact) # top-k analysis predictions = [] diff --git a/examples/qualcomm/oss_scripts/qwen2_5/qwen2_5.py b/examples/qualcomm/oss_scripts/qwen2_5/qwen2_5.py index 2b4e8dd0b01..cb6f3430235 100644 --- a/examples/qualcomm/oss_scripts/qwen2_5/qwen2_5.py +++ 
b/examples/qualcomm/oss_scripts/qwen2_5/qwen2_5.py @@ -162,7 +162,7 @@ def post_process(): adb.push(inputs=[], input_list="", files=[tokenizer_json_path]) adb.execute(custom_runner_cmd=runner_cmd) - adb.pull(output_path=args.artifact, callback=post_process) + adb.pull(host_output_path=args.artifact, callback=post_process) if args.ip and args.port != -1: with Client((args.ip, args.port)) as conn: diff --git a/examples/qualcomm/oss_scripts/regnet.py b/examples/qualcomm/oss_scripts/regnet.py index b8844a5dc7a..da34052a2ce 100644 --- a/examples/qualcomm/oss_scripts/regnet.py +++ b/examples/qualcomm/oss_scripts/regnet.py @@ -97,7 +97,7 @@ def main(args): output_data_folder = f"{args.artifact}/outputs" make_output_dir(output_data_folder) - adb.pull(output_path=args.artifact) + adb.pull(host_output_path=args.artifact) # top-k analysis predictions = [] diff --git a/examples/qualcomm/oss_scripts/retinanet.py b/examples/qualcomm/oss_scripts/retinanet.py index c087a9cbef8..dd17d2f61ab 100644 --- a/examples/qualcomm/oss_scripts/retinanet.py +++ b/examples/qualcomm/oss_scripts/retinanet.py @@ -268,7 +268,7 @@ def main(args): # collect output data output_data_folder = f"{args.artifact}/outputs" make_output_dir(output_data_folder) - adb.pull(output_path=args.artifact) + adb.pull(host_output_path=args.artifact) predictions, classes = [], [n_classes, n_coord_of_bbox] for i in range(data_num): diff --git a/examples/qualcomm/oss_scripts/roberta.py b/examples/qualcomm/oss_scripts/roberta.py index 403133d0aa4..bbc5645ed2c 100644 --- a/examples/qualcomm/oss_scripts/roberta.py +++ b/examples/qualcomm/oss_scripts/roberta.py @@ -125,7 +125,7 @@ def main(args): golden = model(*sample_input)[0] adb.push(inputs=[sample_input]) adb.execute() - adb.pull(output_path=args.artifact) + adb.pull(host_output_path=args.artifact) print(f"input: {tokenizer.batch_decode(sample_input[0])}") print(f"golden output: {tokenizer.batch_decode(golden.argmax(axis=2))}") @@ -137,7 +137,7 @@ def main(args): # accuracy analysis adb.push(inputs=inputs) adb.execute() - adb.pull(output_path=args.artifact) + adb.pull(host_output_path=args.artifact) goldens, predictions = [], [] for i in range(len(inputs)): indice = [i for i, x in enumerate(targets[i]) if x != -100] diff --git a/examples/qualcomm/oss_scripts/squeezenet.py b/examples/qualcomm/oss_scripts/squeezenet.py index 453b3d146eb..80a7afbde66 100644 --- a/examples/qualcomm/oss_scripts/squeezenet.py +++ b/examples/qualcomm/oss_scripts/squeezenet.py @@ -88,7 +88,7 @@ def main(args): output_data_folder = f"{args.artifact}/outputs" make_output_dir(output_data_folder) - adb.pull(output_path=args.artifact) + adb.pull(host_output_path=args.artifact) # top-k analysis predictions = [] diff --git a/examples/qualcomm/oss_scripts/ssd300_vgg16.py b/examples/qualcomm/oss_scripts/ssd300_vgg16.py index 89127a8ce6a..20b1ef5a502 100644 --- a/examples/qualcomm/oss_scripts/ssd300_vgg16.py +++ b/examples/qualcomm/oss_scripts/ssd300_vgg16.py @@ -237,7 +237,7 @@ def post_process(): print("\nMean Average Precision (mAP): %.3f" % mAP) pp.pprint(APs) - adb.pull(output_path=args.artifact, callback=post_process) + adb.pull(host_output_path=args.artifact, callback=post_process) if __name__ == "__main__": diff --git a/examples/qualcomm/oss_scripts/swin_transformer.py b/examples/qualcomm/oss_scripts/swin_transformer.py index aca8d706691..a08de41691c 100644 --- a/examples/qualcomm/oss_scripts/swin_transformer.py +++ b/examples/qualcomm/oss_scripts/swin_transformer.py @@ -150,7 +150,7 @@ def main(args): output_data_folder 
= f"{args.artifact}/outputs" make_output_dir(output_data_folder) - adb.pull(output_path=args.artifact) + adb.pull(host_output_path=args.artifact) # top-k analysis predictions = [] diff --git a/examples/qualcomm/oss_scripts/swin_v2_t.py b/examples/qualcomm/oss_scripts/swin_v2_t.py index 7307237fbd5..af08d0cb739 100755 --- a/examples/qualcomm/oss_scripts/swin_v2_t.py +++ b/examples/qualcomm/oss_scripts/swin_v2_t.py @@ -143,7 +143,7 @@ def main(args): output_data_folder = f"{args.artifact}/outputs" make_output_dir(output_data_folder) - adb.pull(output_path=args.artifact) + adb.pull(host_output_path=args.artifact) # top-k analysis predictions = [] diff --git a/examples/qualcomm/oss_scripts/t5/t5.py b/examples/qualcomm/oss_scripts/t5/t5.py index c8128772f27..4962e9e8bc0 100644 --- a/examples/qualcomm/oss_scripts/t5/t5.py +++ b/examples/qualcomm/oss_scripts/t5/t5.py @@ -338,7 +338,7 @@ def post_process(): files=[spiece_model], ) adb.execute(custom_runner_cmd=runner_cmd) - adb.pull(output_path=args.artifact, callback=post_process) + adb.pull(host_output_path=args.artifact, callback=post_process) result = Seq2SeqLMExportableModulePipeline.evaluate_with_ground_truth( tokenizer, outputs, targets, evaluate_squad diff --git a/examples/qualcomm/oss_scripts/vit_b_16.py b/examples/qualcomm/oss_scripts/vit_b_16.py index a8d361daab5..988d5e662bd 100755 --- a/examples/qualcomm/oss_scripts/vit_b_16.py +++ b/examples/qualcomm/oss_scripts/vit_b_16.py @@ -93,7 +93,7 @@ def main(args): output_data_folder = f"{args.artifact}/outputs" make_output_dir(output_data_folder) - adb.pull(output_path=args.artifact) + adb.pull(host_output_path=args.artifact) # top-k analysis predictions = [] diff --git a/examples/qualcomm/oss_scripts/whisper/whisper.py b/examples/qualcomm/oss_scripts/whisper/whisper.py index 6d94c823d4a..ec62012cacb 100644 --- a/examples/qualcomm/oss_scripts/whisper/whisper.py +++ b/examples/qualcomm/oss_scripts/whisper/whisper.py @@ -475,7 +475,7 @@ def post_process(): adb.push(inputs=inputs, files=[tokenizer_json]) adb.execute(custom_runner_cmd=runner_cmd) - adb.pull(output_path=args.artifact, callback=post_process) + adb.pull(host_output_path=args.artifact, callback=post_process) wer = eval_metric(outputs, target) if args.ip and args.port != -1: diff --git a/examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion.py b/examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion.py index 9efdc34fd12..a53c9678575 100644 --- a/examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion.py +++ b/examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion.py @@ -336,7 +336,7 @@ def post_process_vae(): np.fromfile(f, dtype=np.float32).reshape(1, 512, 512, 3) ) - adb.pull(output_path=args.artifact, callback=post_process_vae) + adb.pull(host_output_path=args.artifact, callback=post_process_vae) if args.fix_latents: broadcast_ut_result(output_image, seed) diff --git a/examples/qualcomm/scripts/deeplab_v3.py b/examples/qualcomm/scripts/deeplab_v3.py index 54fa2e69e94..541efb7b402 100755 --- a/examples/qualcomm/scripts/deeplab_v3.py +++ b/examples/qualcomm/scripts/deeplab_v3.py @@ -161,7 +161,7 @@ def post_process(): output = output.reshape(output_shape) output.argmax(0).astype(np.uint8).tofile(filename) - adb.pull(output_path=args.artifact, callback=post_process) + adb.pull(host_output_path=args.artifact, callback=post_process) # segmentation metrics predictions = [] diff --git a/examples/qualcomm/scripts/edsr.py b/examples/qualcomm/scripts/edsr.py index 
0f80666a7fb..efb6c36e92d 100755 --- a/examples/qualcomm/scripts/edsr.py +++ b/examples/qualcomm/scripts/edsr.py @@ -183,7 +183,7 @@ def post_process(): ) cnt += 1 - adb.pull(output_path=args.artifact, callback=post_process) + adb.pull(host_output_path=args.artifact, callback=post_process) psnr_list = [] ssim_list = [] diff --git a/examples/qualcomm/scripts/inception_v3.py b/examples/qualcomm/scripts/inception_v3.py index 599c0602237..28ae35bed25 100755 --- a/examples/qualcomm/scripts/inception_v3.py +++ b/examples/qualcomm/scripts/inception_v3.py @@ -92,7 +92,7 @@ def main(args): output_data_folder = f"{args.artifact}/outputs" make_output_dir(output_data_folder) - adb.pull(output_path=args.artifact) + adb.pull(host_output_path=args.artifact) # top-k analysis predictions = [] diff --git a/examples/qualcomm/scripts/inception_v4.py b/examples/qualcomm/scripts/inception_v4.py index aa3da09407a..ac29e5af408 100755 --- a/examples/qualcomm/scripts/inception_v4.py +++ b/examples/qualcomm/scripts/inception_v4.py @@ -90,7 +90,7 @@ def main(args): output_data_folder = f"{args.artifact}/outputs" make_output_dir(output_data_folder) - adb.pull(output_path=args.artifact) + adb.pull(host_output_path=args.artifact) # top-k analysis predictions = [] diff --git a/examples/qualcomm/scripts/mobilebert_fine_tune.py b/examples/qualcomm/scripts/mobilebert_fine_tune.py index 19a0f50ef0d..97e6fad3cbc 100755 --- a/examples/qualcomm/scripts/mobilebert_fine_tune.py +++ b/examples/qualcomm/scripts/mobilebert_fine_tune.py @@ -301,7 +301,7 @@ def calibrator(gm): output_data_folder = f"{args.artifact}/outputs" make_output_dir(output_data_folder) - adb.pull(output_path=args.artifact) + adb.pull(host_output_path=args.artifact) # get torch cpu result cpu_preds, true_vals = evaluate(model, data_val) diff --git a/examples/qualcomm/scripts/mobilenet_v2.py b/examples/qualcomm/scripts/mobilenet_v2.py index 2d86f4de5d2..93885b5768a 100755 --- a/examples/qualcomm/scripts/mobilenet_v2.py +++ b/examples/qualcomm/scripts/mobilenet_v2.py @@ -91,7 +91,7 @@ def main(args): output_data_folder = f"{args.artifact}/outputs" make_output_dir(output_data_folder) - adb.pull(output_path=args.artifact) + adb.pull(host_output_path=args.artifact) # top-k analysis predictions = [] diff --git a/examples/qualcomm/scripts/mobilenet_v3.py b/examples/qualcomm/scripts/mobilenet_v3.py index a1148383973..13a9b875694 100644 --- a/examples/qualcomm/scripts/mobilenet_v3.py +++ b/examples/qualcomm/scripts/mobilenet_v3.py @@ -83,7 +83,7 @@ def main(args): output_data_folder = f"{args.artifact}/outputs" make_output_dir(output_data_folder) - adb.pull(output_path=args.artifact) + adb.pull(host_output_path=args.artifact) # top-k analysis predictions = [] diff --git a/examples/qualcomm/scripts/torchvision_vit.py b/examples/qualcomm/scripts/torchvision_vit.py index ed8dbb792c4..c26ac5d7d75 100755 --- a/examples/qualcomm/scripts/torchvision_vit.py +++ b/examples/qualcomm/scripts/torchvision_vit.py @@ -130,7 +130,7 @@ def main(args): output_data_folder = f"{args.artifact}/outputs" make_output_dir(output_data_folder) - adb.pull(output_path=args.artifact) + adb.pull(host_output_path=args.artifact) # top-k analysis predictions = [] diff --git a/examples/qualcomm/scripts/wav2letter.py b/examples/qualcomm/scripts/wav2letter.py index 7fddb48bfb7..8bf22bae266 100644 --- a/examples/qualcomm/scripts/wav2letter.py +++ b/examples/qualcomm/scripts/wav2letter.py @@ -168,7 +168,7 @@ def main(args): # collect output data output_data_folder = f"{args.artifact}/outputs" 
make_output_dir(output_data_folder) - adb.pull(output_path=args.artifact) + adb.pull(host_output_path=args.artifact) predictions = [] for i in range(data_num): diff --git a/examples/qualcomm/util_scripts/qnn_intermediate_debugger_demo.py b/examples/qualcomm/util_scripts/qnn_intermediate_debugger_demo.py index 727c94900ca..a02ed60cf83 100644 --- a/examples/qualcomm/util_scripts/qnn_intermediate_debugger_demo.py +++ b/examples/qualcomm/util_scripts/qnn_intermediate_debugger_demo.py @@ -170,7 +170,7 @@ def validate_intermediate_tensor(): args.artifact, args.artifact, callback=validate_intermediate_tensor ) - adb.pull(output_path=args.artifact) + adb.pull(host_output_path=args.artifact) # top-k analysis predictions = [] diff --git a/examples/qualcomm/utils.py b/examples/qualcomm/utils.py index b9a452c6fe5..7b8dd9ecf00 100755 --- a/examples/qualcomm/utils.py +++ b/examples/qualcomm/utils.py @@ -240,8 +240,10 @@ def execute( ["shell", f"{qnn_executor_runner_cmds}"], output_callback=output_callback ) - def pull(self, output_path, callback=None): - self._adb(["pull", "-a", self.output_folder, output_path]) + def pull(self, host_output_path, device_output_path=None, callback=None): + if device_output_path is None: + device_output_path = self.output_folder + self._adb(["pull", "-a", device_output_path, host_output_path]) if callback: callback()
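For call sites outside this patch, migrating to the renamed `pull()` keyword is mechanical; a hedged usage sketch, assuming `adb` is an already-configured `SimpleADB` instance and that the device path shown is hypothetical:

# Old form: adb.pull(output_path=args.artifact, callback=post_process)
# New form: the host destination is named explicitly.
adb.pull(host_output_path=args.artifact, callback=post_process)

# Optionally pull from a non-default on-device folder instead of the wrapper's
# configured output folder (path below is hypothetical).
adb.pull(
    host_output_path=args.artifact,
    device_output_path="/data/local/tmp/executorch/custom_outputs",
)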