Skip to content

Commit e70fc97

Browse files
committed
Enable x86 runner for static llama
1 parent 0bccf60 commit e70fc97

File tree

5 files changed

+141
-50
lines changed

5 files changed

+141
-50
lines changed

.github/workflows/pull.yml

Lines changed: 37 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -440,7 +440,8 @@ jobs:
440440
# Test llama2
441441
PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -mode "${MODE}" -dtype "${DTYPE}" -pt2e_quantize "${PT2E_QUANTIZE}"
442442
443-
test-static-llama-runner-qnn-linux:
443+
# Compile only as weight sharing is not applicable on x86
444+
test-static-llama-size-qnn-linux:
444445
name: test-static-llama-size-qnn-linux
445446
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
446447
strategy:
@@ -459,13 +460,46 @@ jobs:
459460
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
460461
PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
461462
463+
# Setup executorch
464+
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}"
465+
462466
# Retrieve 110M Stories Llama Artifacts
463-
PYTHON_EXECUTABLE=python bash .ci/scripts/utils.sh
464467
PYTHON_EXECUTABLE=python download_stories_model_artifacts
468+
$PYTHON_EXECUTABLE -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin
465469
466-
# Test static llama stories110m
470+
# Test static llama stories110m pte size
467471
PYTHON_EXECUTABLE=python backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleScript.test_stories_single_llama --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir . --compile_only"
468472
473+
# Checks accuracy with weight sharing disabled since x86 does not support weight sharing.
474+
test-static-llama-accuracy-qnn-linux:
475+
name: test-static-llama-accuracy-qnn-linux
476+
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
477+
strategy:
478+
fail-fast: false
479+
with:
480+
runner: linux.2xlarge
481+
docker-image: executorch-ubuntu-22.04-qnn-sdk
482+
submodules: 'true'
483+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
484+
timeout: 900
485+
script: |
486+
# The generic Linux job chooses to use base env, not the one setup by the image
487+
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
488+
conda activate "${CONDA_ENV}"
489+
490+
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
491+
PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
492+
493+
# Setup executorch
494+
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}"
495+
496+
# Retrieve 110M Stories Llama Artifacts
497+
PYTHON_EXECUTABLE=python download_stories_model_artifacts
498+
$PYTHON_EXECUTABLE -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin
499+
500+
# Test static llama stories110m accuracy
501+
PYTHON_EXECUTABLE=python backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleScript.test_stories_single_llama --model SM8650 --build_folder build-x86_64/ --executorch_root . --artifact_dir . --enable_x86_64"
502+
469503
test-qnn-models-linux:
470504
name: test-qnn-models-linux
471505
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main

backends/qualcomm/tests/test_qnn_delegate.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1930,6 +1930,7 @@ def test_qnn_backend_multi_graphs(self):
19301930
soc_model=self.chipset_table[TestQNN.model],
19311931
backend_options=backend_options,
19321932
multiple_graphs=True,
1933+
weight_sharing=True,
19331934
graph_name=graph_name,
19341935
)
19351936
for graph_name in graph_names
@@ -2418,6 +2419,7 @@ def test_qnn_backend_multi_graphs(self):
24182419
soc_model=self.chipset_table[TestQNN.model],
24192420
backend_options=backend_options,
24202421
multiple_graphs=True,
2422+
weight_sharing=True,
24212423
graph_name=graph_name,
24222424
)
24232425
for graph_name in graph_names
@@ -3621,6 +3623,8 @@ def test_stories_single_llama(self):
36213623
cmds.extend(["--device", self.device])
36223624
if self.host:
36233625
cmds.extend(["--host", self.host])
3626+
if self.enable_x86_64:
3627+
cmds.extend(["--enable_x86_64"])
36243628

36253629
golden_start_with = "Once upon a time,"
36263630
p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
@@ -3634,8 +3638,10 @@ def test_stories_single_llama(self):
36343638
if not self.compile_only:
36353639
model_out = msg["result"][0]
36363640
self.assertTrue(model_out.startswith(golden_start_with))
3637-
pte_size = msg["pte_size"]
3638-
self.assertLessEqual(pte_size, 130000000)
3641+
# x86 does not allow weight sharing, so we don't check pte size
3642+
if not self.enable_x86_64:
3643+
pte_size = msg["pte_size"]
3644+
self.assertLessEqual(pte_size, 130000000)
36393645

36403646
@unittest.skip("dynamic shape inputs appear in recent torch.export.export")
36413647
def test_mobilebert(self):
@@ -3840,12 +3846,6 @@ def setup_environment():
38403846
help="Path to open source software model repository",
38413847
type=str,
38423848
)
3843-
parser.add_argument(
3844-
"-x",
3845-
"--enable_x86_64",
3846-
help="Enable unittest to be executed on x86_64 platform",
3847-
action="store_true",
3848-
)
38493849

38503850
args, ns_args = parser.parse_known_args(namespace=unittest)
38513851
TestQNN.host = args.host

backends/qualcomm/utils/utils.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1122,6 +1122,7 @@ def generate_qnn_executorch_compiler_spec(
11221122
shared_buffer: bool = False,
11231123
is_from_context_binary: bool = False,
11241124
multiple_graphs: bool = False,
1125+
weight_sharing: bool = False,
11251126
graph_name: str = "forward",
11261127
) -> List[CompileSpec]:
11271128
"""
@@ -1152,6 +1153,7 @@ def generate_qnn_executorch_compiler_spec(
11521153
is_from_context_binary: True if current graph comes from pre-built context binary.
11531154
multiple_graphs: True if multiple methods are expected to have in single .pte file.
11541155
Please see test cases for post-processing example.
1156+
weight_sharing: Used with multiple_graphs, where model size will be reduced when operations have the same weights across multiple graphs.
11551157
graph_name: Assign unique graph name if 'multiple_graphs' is used.
11561158
11571159
Returns:
@@ -1172,6 +1174,12 @@ def generate_qnn_executorch_compiler_spec(
11721174
stacklevel=1,
11731175
)
11741176

1177+
if weight_sharing and not multiple_graphs:
1178+
warnings.warn(
1179+
"Weight sharing is intended for multiple graphs scenario, please ensure if there are multiple graphs",
1180+
stacklevel=1,
1181+
)
1182+
11751183
qnn_executorch_options = QnnExecuTorchOptions(
11761184
_soc_info_table[soc_model], backend_options
11771185
)
@@ -1213,7 +1221,10 @@ def generate_qnn_executorch_compiler_spec(
12131221

12141222
if multiple_graphs:
12151223
# enable weight sharing mechanism if multiple graphs appear
1216-
if backend_options.backend_type == QnnExecuTorchBackendType.kHtpBackend:
1224+
if (
1225+
backend_options.backend_type == QnnExecuTorchBackendType.kHtpBackend
1226+
and weight_sharing
1227+
):
12171228
backend_options.htp_options.use_weight_sharing = True
12181229

12191230
return [

examples/qualcomm/oss_scripts/llama/llama.py

Lines changed: 77 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import json
1313
import logging
1414
import os
15+
import subprocess
1516
import sys
1617
import time
1718
from collections import OrderedDict
@@ -647,6 +648,9 @@ def compile(args, pte_filename, tokenizer):
647648
backend_options=backend_options,
648649
shared_buffer=args.shared_buffer,
649650
multiple_graphs=True,
651+
weight_sharing=(
652+
False if args.enable_x86_64 else True
653+
), # x86 emulator does not support weight sharing
650654
graph_name=graph_name,
651655
)
652656
for graph_name in graph_names
@@ -779,48 +783,11 @@ def inference(args, quant_attrs, pte_filename, runtime_tokenizer_path, pre_gen_p
779783
else:
780784
raise RuntimeError(f"Unknown model_mode: {args.model_mode}.")
781785

782-
seq_len = args.prefill_seq_len if args.model_mode == "prefill" else args.kv_seq_len
783-
runner_args = " ".join(
784-
[
785-
f"--model_path {pte_filename}.pte",
786-
"--output_path outputs/outputs.txt",
787-
f"--tokenizer_path {os.path.basename(runtime_tokenizer_path)}",
788-
f'--prompt "{args.prompt}"',
789-
f"--seq_len {seq_len}",
790-
f"--eval_mode {eval_mode}",
791-
f"--temperature {args.temperature}",
792-
f"--system_prompt '{args.system_prompt}'",
793-
f"--logits_scale {quant_attrs['scale']}",
794-
f"--logits_offset {quant_attrs['zero_point']}",
795-
f"--kv_updator {'SmartMask' if args.kv_updator == smart_mask_updator else 'ShiftPointer'}",
796-
]
797-
)
798-
runner_cmd = " ".join(
799-
[
800-
f"cd {workspace} &&",
801-
f"./qnn_llama_runner {runner_args}",
802-
]
803-
)
804-
805786
pte_path = (
806787
f"{pre_gen_pte}/{pte_filename}.pte"
807788
if pre_gen_pte
808789
else f"{args.artifact}/{pte_filename}.pte"
809790
)
810-
adb = SimpleADB(
811-
qnn_sdk=os.getenv("QNN_SDK_ROOT"),
812-
build_path=f"{args.build_folder}",
813-
pte_path=pte_path,
814-
workspace=workspace,
815-
device_id=args.device,
816-
host_id=args.host,
817-
soc_model=args.model,
818-
shared_buffer=args.shared_buffer,
819-
runner=f"examples/qualcomm/oss_scripts/llama/qnn_llama_runner",
820-
)
821-
# No pregen inputs, input_list is not required
822-
adb.push(inputs=[], input_list="", files=[runtime_tokenizer_path])
823-
adb.execute(custom_runner_cmd=runner_cmd)
824791

825792
# collect output data
826793
output_data_folder = f"{args.artifact}/outputs"
@@ -831,7 +798,79 @@ def post_process():
831798
with open(f"{args.artifact}/outputs/outputs.txt", "r") as f:
832799
outputs.append(f.read())
833800

834-
adb.pull(output_path=args.artifact, callback=post_process)
801+
seq_len = args.prefill_seq_len if args.model_mode == "prefill" else args.kv_seq_len
802+
runner_args = " ".join(
803+
[
804+
f'--prompt "{args.prompt}"',
805+
f"--eval_mode {eval_mode}",
806+
f"--temperature {args.temperature}",
807+
f"--system_prompt '{args.system_prompt}'",
808+
f"--logits_scale {quant_attrs['scale']}",
809+
f"--logits_offset {quant_attrs['zero_point']}",
810+
]
811+
)
812+
813+
runner_cmd = ""
814+
if args.enable_x86_64:
815+
# x86 emulator is intended for CI and not performance. Check only the first few tokens.
816+
seq_len = min(seq_len, 16)
817+
818+
if args.kv_updator == smart_mask_updator:
819+
logging.warning(
820+
"x86 only support ShiftPointer, overwrite kv_updator to ShiftPointer"
821+
)
822+
823+
qnn_sdk = os.getenv("QNN_SDK_ROOT")
824+
target = "x86_64-linux-clang"
825+
runner_cmd = " ".join(
826+
[
827+
f"export LD_LIBRARY_PATH={qnn_sdk}/lib/{target}/:{args.build_folder}/lib &&",
828+
f"./{args.build_folder}/examples/qualcomm/oss_scripts/llama/qnn_llama_runner",
829+
f"--tokenizer_path {runtime_tokenizer_path}",
830+
f"--model_path {pte_path}",
831+
f"--seq_len {seq_len}",
832+
f"--output_path {args.artifact}/outputs/outputs.txt",
833+
f"--kv_updator ShiftPointer",
834+
runner_args,
835+
]
836+
)
837+
subprocess.run(
838+
runner_cmd,
839+
shell=True,
840+
executable="/bin/bash",
841+
capture_output=True,
842+
)
843+
post_process()
844+
else:
845+
runner_cmd = " ".join(
846+
[
847+
f"cd {workspace} &&",
848+
f"./qnn_llama_runner",
849+
f"--tokenizer_path {os.path.basename(runtime_tokenizer_path)}",
850+
f"--model_path {pte_filename}.pte",
851+
f"--seq_len {seq_len}",
852+
"--output_path outputs/outputs.txt",
853+
f"--kv_updator {'SmartMask' if args.kv_updator == smart_mask_updator else 'ShiftPointer'}",
854+
runner_args,
855+
]
856+
)
857+
858+
adb = SimpleADB(
859+
qnn_sdk=os.getenv("QNN_SDK_ROOT"),
860+
build_path=f"{args.build_folder}",
861+
pte_path=pte_path,
862+
workspace=workspace,
863+
device_id=args.device,
864+
host_id=args.host,
865+
soc_model=args.model,
866+
shared_buffer=args.shared_buffer,
867+
runner=f"examples/qualcomm/oss_scripts/llama/qnn_llama_runner",
868+
)
869+
# No pregen inputs, input_list is not required
870+
adb.push(inputs=[], input_list="", files=[runtime_tokenizer_path])
871+
adb.execute(custom_runner_cmd=runner_cmd)
872+
873+
adb.pull(output_path=args.artifact, callback=post_process)
835874
if args.ip and args.port != -1:
836875
pte_size = os.path.getsize(pte_path)
837876
with Client((args.ip, args.port)) as conn:

examples/qualcomm/utils.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -524,6 +524,13 @@ def setup_common_args_and_variables():
524524
default=False,
525525
)
526526

527+
parser.add_argument(
528+
"-x",
529+
"--enable_x86_64",
530+
help="Enable unittest to be executed on x86_64 platform",
531+
action="store_true",
532+
)
533+
527534
# QNN_SDK_ROOT might also be an argument, but it is used in various places.
528535
# So maybe it's fine to just use the environment.
529536
if "QNN_SDK_ROOT" not in os.environ:

0 commit comments

Comments (0)