diff --git a/.ci/scripts/test_qnn_static_llm.sh b/.ci/scripts/test_qnn_static_llm.sh index 9d1c82f12d5..6b105d1c6f2 100644 --- a/.ci/scripts/test_qnn_static_llm.sh +++ b/.ci/scripts/test_qnn_static_llm.sh @@ -81,7 +81,7 @@ elif [[ "${TASK_NAME}" == "stories_260k_bc" ]]; then fi elif [[ "${TASK_NAME}" == "smollm2_135m" ]]; then - $PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_static_smollm2 --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir ./static_smollm2 --enable_x86_64 + $PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_static_llm_model --model_name smollm2_135m --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir ./static_smollm2 --enable_x86_64 exit_code1=$? if [ $exit_code1 -ne 0 ]; then exit 1 diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 082c1ea5a08..903cfd1f6c0 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -10,6 +10,7 @@ import sys import tempfile import unittest +from dataclasses import dataclass from functools import partial from multiprocessing.connection import Listener from pathlib import Path @@ -5762,131 +5763,71 @@ def test_qnn_backend_seq_mse(self): class TestExampleLLMScript(TestQNN): - def test_codegen2_1b(self): - if not self.required_envs(): - self.skipTest("missing required envs") - prompt = "def hello_world():" - cmds = [ - "python", - f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py", - "--artifact", - self.artifact_dir, - "--build_folder", - self.build_folder, - "--model", - self.model, - "--ip", - self.ip, - "--port", - str(self.port), - "--prompt", - prompt, - "--temperature", - "0", - "--decoder_model", - "codegen2_1b", - "--model_mode", - "kv", - "--max_seq_len", - "128", - ] - if self.compile_only: - cmds.extend(["--compile_only"]) - elif self.device: - cmds.extend(["--device", self.device]) - if self.host: - cmds.extend(["--host", self.host]) - elif self.enable_x86_64: - cmds.extend(["--enable_x86_64"]) - if self.pre_gen_pte: - cmds.extend(["--pre_gen_pte", self.pre_gen_pte]) + @dataclass(frozen=True) + class LlmSpecs: + SM8650: float + SM8750: float + ppl: float + pte_size: float - golden_start_with = "def hello_world():" - p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) - with Listener((self.ip, self.port)) as listener: - conn = listener.accept() - p.communicate() - msg = json.loads(conn.recv()) - if "Error" in msg: - self.fail(msg["Error"]) - else: - if not self.compile_only: - model_out = msg["result"][0] - self.assertTrue( - model_out.startswith(golden_start_with), - f"Expected Output: {golden_start_with}. 
Actual Output: {model_out}", - ) - if not self.enable_x86_64: - pte_size = msg["pte_size"] - self.assertLessEqual(pte_size, 1_200_000_000) # 1200MB - if not self.compile_only and not self.enable_x86_64: - self.assertGreaterEqual(msg["inference_speed"], 60) + # TODO: refactor to support different backends + def setUp(self): + self.llm_specs = { + "gemma-2b": TestExampleLLMScript.LlmSpecs( + SM8650=32, SM8750=36, ppl=35, pte_size=2_700_000_000 + ), # 2.7 GB + "gemma3-1b": TestExampleLLMScript.LlmSpecs( + SM8650=70, SM8750=100, ppl=23, pte_size=1_200_000_000 + ), # 1.2 GB + "phi_4_mini": TestExampleLLMScript.LlmSpecs( + SM8650=14, SM8750=19, ppl=12, pte_size=4_000_000_000 + ), # 4GB + "llama3_2-1b_instruct": TestExampleLLMScript.LlmSpecs( + SM8650=37, SM8750=45, ppl=16, pte_size=1_500_000_000 + ), # 1.5 GB + "llama3_2-3b_instruct": TestExampleLLMScript.LlmSpecs( + SM8650=21, SM8750=26, ppl=11, pte_size=2_800_000_000 + ), # 2.8 GB + "qwen2_5-0_5b": TestExampleLLMScript.LlmSpecs( + SM8650=115, SM8750=155, ppl=15, pte_size=600_000_000 + ), # 600 MB + "qwen2_5-1_5b": TestExampleLLMScript.LlmSpecs( + SM8650=38, SM8750=47, ppl=10, pte_size=1_500_000_000 + ), # 1.5 GB + "qwen3-0_6b": TestExampleLLMScript.LlmSpecs( + SM8650=47, SM8750=68, ppl=21, pte_size=700_000_000 + ), # 700 MB + "qwen3-1_7b": TestExampleLLMScript.LlmSpecs( + SM8650=28, SM8750=34, ppl=15, pte_size=1_800_000_000 + ), # 1.8 GB + "smollm2_135m": TestExampleLLMScript.LlmSpecs( + SM8650=214, SM8750=260, ppl=23, pte_size=210_000_000 + ), # 210 MB + "smollm3-3b": TestExampleLLMScript.LlmSpecs( + SM8650=23, SM8750=28, ppl=10, pte_size=2_600_000_000 + ), # 2.6 GB + } - def test_static_gemma_2b(self): - if not self.required_envs(): + def test_static_llm_model(self): + if not self.required_envs([self.model_name]): self.skipTest("missing required envs") + assert ( + self.model_name in self.llm_specs + ), f"Unable to find {self.model_name} under model_specs." 
- prompt = "My favourite condiment is " - cmds = [ - "python", - f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py", - "--artifact", - self.artifact_dir, - "--build_folder", - self.build_folder, - "--model", - self.model, - "--ip", - self.ip, - "--port", - str(self.port), - "--prompt", - f"{prompt}", - "--decoder_model", - "gemma-2b", - "--model_mode", - "kv", - "--max_seq_len", - "1024", - "--eval_perplexity", - "--tasks", - "wikitext", - "--limit", - "1", - ] - if self.compile_only: - cmds.extend(["--compile_only"]) - elif self.device: - cmds.extend(["--device", self.device]) - if self.host: - cmds.extend(["--host", self.host]) - elif self.enable_x86_64: - cmds.extend(["--enable_x86_64"]) - if self.pre_gen_pte: - cmds.extend(["--pre_gen_pte", self.pre_gen_pte]) - - p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) - with Listener((self.ip, self.port)) as listener: - conn = listener.accept() - p.communicate() - msg = json.loads(conn.recv()) - if "Error" in msg: - self.fail(msg["Error"]) - else: - inference_speed_ref = {"SM8650": 32, "SM8750": 36} - self.assertLessEqual(msg["wiki_ppl"], 35) - self.assertLessEqual(msg["pte_size"], 2_700_000_000) # 2.7GB - if self.model in inference_speed_ref: - self.assertGreaterEqual( - msg["inference_speed"], inference_speed_ref[self.model] - ) - - def test_static_gemma3_1b(self): - if not self.required_envs(): - self.skipTest("missing required envs") + is_llama_model = self.model_name in { + "llama3_2-1b_instruct", + "llama3_2-3b_instruct", + } + if is_llama_model: + assert ( + self.llama_artifacts is not None + ), "Please provide path to llama artifacts" - prompt = "My favourite condiment is " + prompt = ( + "I would like to learn python, could you teach me with a simple example?" + ) cmds = [ "python", f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py", @@ -5902,12 +5843,10 @@ def test_static_gemma3_1b(self): str(self.port), "--prompt", f"{prompt}", - "--ptq", - "16a4w_block", "--temperature", "0", "--decoder_model", - "gemma3-1b", + self.model_name, "--model_mode", "kv", "--max_seq_len", @@ -5917,8 +5856,20 @@ def test_static_gemma3_1b(self): "wikitext", "--limit", "1", - "--enable_masked_softmax", ] + + if is_llama_model: + cmds.extend( + [ + "--checkpoint", + f"{self.llama_artifacts}/consolidated.00.pth", + "--params", + f"{self.llama_artifacts}/params.json", + "--tokenizer_model", + f"{self.llama_artifacts}/tokenizer.model", + ] + ) + if self.compile_only: cmds.extend(["--compile_only"]) elif self.device: @@ -5938,29 +5889,30 @@ def test_static_gemma3_1b(self): if "Error" in msg: self.fail(msg["Error"]) else: + llm_spec = self.llm_specs[self.model_name] + pte_size = msg["pte_size"] + self.assertLessEqual(pte_size, llm_spec.pte_size) + print(f"Model Name: {self.model_name}\nTarget Device: {self.model}") + print(f"PTE Size: {pte_size} bytes") if not self.compile_only: - self.assertLessEqual(msg["wiki_ppl"], 23) - if not self.enable_x86_64: - pte_size = msg["pte_size"] - self.assertLessEqual(pte_size, 1_200_000_000) # 1.2GB - inference_speed_ref = {"SM8650": 70, "SM8750": 100} - if ( - not self.compile_only - and not self.enable_x86_64 - and self.model in inference_speed_ref - ): - self.assertGreaterEqual( - msg["inference_speed"], inference_speed_ref[self.model] - ) + ppl = msg["wiki_ppl"] + print(f"PPL: {ppl}") + self.assertLessEqual(ppl, llm_spec.ppl) + if not self.enable_x86_64 and hasattr(llm_spec, self.model): + device_inference_speed = msg["inference_speed"] + expected_inference_speed = 
getattr(llm_spec, self.model) + print( + f"Prompt Evaluation: {device_inference_speed} tokens/second" + ) + self.assertGreaterEqual( + device_inference_speed, expected_inference_speed + ) - def test_llama3_2_instruct(self): + def test_codegen2_1b(self): if not self.required_envs(): self.skipTest("missing required envs") - assert ( - self.llama_artifacts is not None - ), "Please provide path to llama artifacts" - prompt = "What is the meaning of life?" + prompt = "def hello_world():" cmds = [ "python", f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py", @@ -5970,33 +5922,20 @@ def test_llama3_2_instruct(self): self.build_folder, "--model", self.model, - "--target", - self.target, - "--checkpoint", - f"{self.llama_artifacts}/consolidated.00.pth", - "--params", - f"{self.llama_artifacts}/params.json", - "--tokenizer_model", - f"{self.llama_artifacts}/tokenizer.model", "--ip", self.ip, "--port", str(self.port), "--prompt", - f"{prompt}", + prompt, "--temperature", "0", "--decoder_model", - "llama3_2-1b_instruct", + "codegen2_1b", "--model_mode", "kv", "--max_seq_len", - "1024", - "--eval_perplexity", - "--tasks", - "wikitext", - "--limit", - "1", + "128", ] if self.compile_only: cmds.extend(["--compile_only"]) @@ -6009,6 +5948,7 @@ def test_llama3_2_instruct(self): if self.pre_gen_pte: cmds.extend(["--pre_gen_pte", self.pre_gen_pte]) + golden_start_with = "def hello_world():" p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) with Listener((self.ip, self.port)) as listener: conn = listener.accept() @@ -6017,17 +5957,17 @@ def test_llama3_2_instruct(self): if "Error" in msg: self.fail(msg["Error"]) else: - inference_speed_ref = {"SM8650": 37, "SM8750": 49} - if ( - not self.compile_only - and not self.enable_x86_64 - and self.model in inference_speed_ref - ): - self.assertLessEqual(msg["pte_size"], 1_500_000_000) - self.assertLessEqual(msg["wiki_ppl"], 15) - self.assertGreaterEqual( - msg["inference_speed"], inference_speed_ref[self.model] + if not self.compile_only: + model_out = msg["result"][0] + self.assertTrue( + model_out.startswith(golden_start_with), + f"Expected Output: {golden_start_with}. 
Actual Output: {model_out}", ) + if not self.enable_x86_64: + pte_size = msg["pte_size"] + self.assertLessEqual(pte_size, 1_200_000_000) # 1200MB + if not self.compile_only and not self.enable_x86_64: + self.assertGreaterEqual(msg["inference_speed"], 60) def test_llama_stories_260k(self): if not self.required_envs(): @@ -6182,184 +6122,8 @@ def test_llama_stories_110m(self): if not self.compile_only and not self.enable_x86_64: self.assertGreaterEqual(msg["inference_speed"], 220) # Lanai - def test_static_phi4(self): - if not self.required_envs(): - self.skipTest("missing required envs") - - prompt = "My favourite condiment is " - cmds = [ - "python", - f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py", - "--artifact", - self.artifact_dir, - "--build_folder", - self.build_folder, - "--model", - self.model, - "--ip", - self.ip, - "--port", - str(self.port), - "--prompt", - f"{prompt}", - "--decoder_model", - "phi_4_mini", - "--model_mode", - "kv", - "--max_seq_len", - "1024", - "--eval_perplexity", - "--tasks", - "wikitext", - "--limit", - "1", - ] - if self.compile_only: - cmds.extend(["--compile_only"]) - elif self.device: - cmds.extend(["--device", self.device]) - if self.host: - cmds.extend(["--host", self.host]) - elif self.enable_x86_64: - cmds.extend(["--enable_x86_64"]) - if self.pre_gen_pte: - cmds.extend(["--pre_gen_pte", self.pre_gen_pte]) - - p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) - with Listener((self.ip, self.port)) as listener: - conn = listener.accept() - p.communicate() - msg = json.loads(conn.recv()) - if "Error" in msg: - self.fail(msg["Error"]) - else: - inference_speed_ref = {"SM8650": 14, "SM8750": 19} - self.assertLessEqual(msg["wiki_ppl"], 12) - self.assertLessEqual(msg["pte_size"], 4_000_000_000) # 4GB - if self.model in inference_speed_ref: - self.assertGreaterEqual( - msg["inference_speed"], inference_speed_ref[self.model] - ) - - def test_static_qwen2_5(self): - if not self.required_envs(): - self.skipTest("missing required envs") - - prompt = "My favourite condiment is " - cmds = [ - "python", - f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py", - "--artifact", - self.artifact_dir, - "--build_folder", - self.build_folder, - "--model", - self.model, - "--ip", - self.ip, - "--port", - str(self.port), - "--prompt", - f"{prompt}", - "--decoder_model", - "qwen2_5-0_5b", - "--model_mode", - "kv", - "--max_seq_len", - "1024", - "--eval_perplexity", - "--tasks", - "wikitext", - "--limit", - "1", - ] - if self.compile_only: - cmds.extend(["--compile_only"]) - elif self.device: - cmds.extend(["--device", self.device]) - if self.host: - cmds.extend(["--host", self.host]) - elif self.enable_x86_64: - cmds.extend(["--enable_x86_64"]) - if self.pre_gen_pte: - cmds.extend(["--pre_gen_pte", self.pre_gen_pte]) - - p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) - with Listener((self.ip, self.port)) as listener: - conn = listener.accept() - p.communicate() - msg = json.loads(conn.recv()) - if "Error" in msg: - self.fail(msg["Error"]) - else: - inference_speed_ref = {"SM8650": 115, "SM8750": 155} - self.assertLessEqual(msg["wiki_ppl"], 15) - self.assertLessEqual(msg["pte_size"], 600_000_000) # 600MB - if self.model in inference_speed_ref: - self.assertGreaterEqual( - msg["inference_speed"], inference_speed_ref[self.model] - ) - - def test_static_qwen3(self): - if not self.required_envs(): - self.skipTest("missing required envs") - - prompt = "My favourite condiment is " - cmds = [ - "python", - 
f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py", - "--artifact", - self.artifact_dir, - "--build_folder", - self.build_folder, - "--model", - self.model, - "--ip", - self.ip, - "--port", - str(self.port), - "--prompt", - f"{prompt}", - "--decoder_model", - "qwen3-0_6b", - "--model_mode", - "kv", - "--max_seq_len", - "1024", - "--eval_perplexity", - "--tasks", - "wikitext", - "--limit", - "1", - ] - if self.compile_only: - cmds.extend(["--compile_only"]) - elif self.device: - cmds.extend(["--device", self.device]) - if self.host: - cmds.extend(["--host", self.host]) - elif self.enable_x86_64: - cmds.extend(["--enable_x86_64"]) - if self.pre_gen_pte: - cmds.extend(["--pre_gen_pte", self.pre_gen_pte]) - - p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) - with Listener((self.ip, self.port)) as listener: - conn = listener.accept() - p.communicate() - msg = json.loads(conn.recv()) - if "Error" in msg: - self.fail(msg["Error"]) - else: - inference_speed_ref = {"SM8650": 38, "SM8750": 56} - self.assertLessEqual(msg["wiki_ppl"], 18) - self.assertLessEqual(msg["pte_size"], 950_000_000) # 950MB - if self.model in inference_speed_ref: - self.assertGreaterEqual( - msg["inference_speed"], inference_speed_ref[self.model] - ) - def test_qwen2_5(self): + # This is not testing static llm flow. if not self.required_envs([]): self.skipTest("missing required envs") prompt = "My favourite condiment is " @@ -6413,125 +6177,6 @@ def test_qwen2_5(self): f"Expected Output: '{golden_start_with}' Actual Output: '{model_out}'", ) - def test_static_smollm2(self): - if not self.required_envs(): - self.skipTest("missing required envs") - - prompt = "My favourite condiment is " - cmds = [ - "python", - f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py", - "--artifact", - self.artifact_dir, - "--build_folder", - self.build_folder, - "--model", - self.model, - "--ip", - self.ip, - "--port", - str(self.port), - "--prompt", - f"{prompt}", - "--decoder_model", - "smollm2_135m", - "--model_mode", - "kv", - "--temperature", - "0", - "--max_seq_len", - "1024", - "--eval_perplexity", - "--task", - "wikitext", - "--limit", - "1", - ] - if self.compile_only: - cmds.extend(["--compile_only"]) - elif self.device: - cmds.extend(["--device", self.device]) - if self.host: - cmds.extend(["--host", self.host]) - elif self.enable_x86_64: - cmds.extend(["--enable_x86_64"]) - if self.pre_gen_pte: - cmds.extend(["--pre_gen_pte", self.pre_gen_pte]) - - p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) - with Listener((self.ip, self.port)) as listener: - conn = listener.accept() - p.communicate() - msg = json.loads(conn.recv()) - if "Error" in msg: - self.fail(msg["Error"]) - else: - print("Perplexity score: ", msg["wiki_ppl"]) - self.assertLessEqual(msg["wiki_ppl"], 25) - if not self.enable_x86_64: - self.assertGreaterEqual(msg["inference_speed"], 200) - - def test_static_smollm3(self): - if not self.required_envs(): - self.skipTest("missing required envs") - - prompt = "My favourite condiment is " - cmds = [ - "python", - f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py", - "--artifact", - self.artifact_dir, - "--build_folder", - self.build_folder, - "--model", - self.model, - "--ip", - self.ip, - "--port", - str(self.port), - "--prompt", - f"{prompt}", - "--decoder_model", - "smollm3-3b", - "--model_mode", - "kv", - "--temperature", - "0", - "--max_seq_len", - "1024", - "--eval_perplexity", - "--task", - "wikitext", - "--limit", - "1", - ] - if self.compile_only: - 
cmds.extend(["--compile_only"]) - elif self.device: - cmds.extend(["--device", self.device]) - if self.host: - cmds.extend(["--host", self.host]) - elif self.enable_x86_64: - cmds.extend(["--enable_x86_64"]) - if self.pre_gen_pte: - cmds.extend(["--pre_gen_pte", self.pre_gen_pte]) - - p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) - with Listener((self.ip, self.port)) as listener: - conn = listener.accept() - p.communicate() - msg = json.loads(conn.recv()) - if "Error" in msg: - self.fail(msg["Error"]) - else: - inference_speed_ref = {"SM8650": 23, "SM8750": 28} - self.assertLessEqual(msg["wiki_ppl"], 10) - self.assertLessEqual(msg["pte_size"], 2_600_000_000) # 2.6GB - if self.model in inference_speed_ref: - self.assertGreaterEqual( - msg["inference_speed"], inference_speed_ref[self.model] - ) - class TestExampleOssScript(TestQNN): def test_albert(self): diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py index 0f0c237a9e1..c0cc8daab03 100644 --- a/backends/qualcomm/tests/utils.py +++ b/backends/qualcomm/tests/utils.py @@ -178,6 +178,8 @@ class TestQNN(unittest.TestCase): dump_intermediate_outputs: bool = False inference_speed: float = 0.0 inference_speed_output_path = "outputs/inference_speed.txt" + model_name: str = "" + oss_repo: str = "" def _assert_outputs_equal(self, model_output, ref_output): self.assertTrue(len(ref_output) == len(model_output)) diff --git a/docs/source/compiler-delegate-and-partitioner.md b/docs/source/compiler-delegate-and-partitioner.md index b057f3afa2e..c0449e7366b 100644 --- a/docs/source/compiler-delegate-and-partitioner.md +++ b/docs/source/compiler-delegate-and-partitioner.md @@ -131,7 +131,7 @@ static auto success_with_compiler = register_backend(backend); Providing consistent debugging experience, be it for runtime failures or performance profiling, is important. ExecuTorch employs native Developer Tools for this purpose, which enables correlating program instructions to original PyTorch code, via debug handles. You can read more about it [here](etrecord.rst). -Delegated program or subgraphs are opaque to ExecuTorch runtime and appear as a special `call_delegate` instruction, which asks corresponding backend to handle the execution of the subgraph or program. Due to the opaque nature of backend delgates, native Developer Tools does not have visibility into delegated program. Thus the debugging, functional or performance, experiences of delegated execution suffers significantly as compared to it's non-delegated counterpart. +Delegated program or subgraphs are opaque to ExecuTorch runtime and appear as a special `call_delegate` instruction, which asks corresponding backend to handle the execution of the subgraph or program. Due to the opaque nature of backend delegates, native Developer Tools does not have visibility into delegated program. Thus the debugging, functional or performance, experiences of delegated execution suffers significantly as compared to it's non-delegated counterpart. In order to provide consistent debugging experience to users, regardless of the use of delegation for a model, Developer Tools provide an interface to correlate delegated (sub)graph to original (sub)graph. The Developer Tools do so via debug handles map which allows delegates to generate internal handles that can be associated with the original (sub)graph consumed by the delegate. 
Then at runtime, backend developer can report error or profiling information using the internal handle, which will be mapped to original (sub)graph using the debug handle map. For more information, please refer to [Delegate Debugging](delegate-debugging.md).
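# Illustrative sketch (not part of the patch). The utils.py hunk adds
# model_name and oss_repo as class attributes on TestQNN, and the CI script
# now passes --model_name to test_qnn_delegate.py, but the argument-parsing
# change itself falls outside this section. The wiring below is assumed, not
# taken from the PR; it only shows how such a flag is typically routed onto a
# unittest-based test class before unittest takes over the remaining argv.
import argparse
import sys
import unittest


class TestQNN(unittest.TestCase):
    model_name: str = ""
    oss_repo: str = ""


def main() -> None:
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument("--model_name", type=str, default="")
    parser.add_argument("--oss_repo", type=str, default="")
    # Leave unknown flags (-k, --model, --build_folder, ...) for unittest.
    args, remaining = parser.parse_known_args()

    TestQNN.model_name = args.model_name
    TestQNN.oss_repo = args.oss_repo

    unittest.main(argv=sys.argv[:1] + remaining)


if __name__ == "__main__":
    main()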