Skip to content

Commit 9d73dd8

Browse files
committed
Enable x86 runner for static llama, create a script for static llama ci
1 parent b3e52aa commit 9d73dd8

File tree

7 files changed

+178
-74
lines changed

7 files changed

+178
-74
lines changed

.ci/scripts/setup-stories-llama.sh

Lines changed: 0 additions & 18 deletions
This file was deleted.

.ci/scripts/test_qnn_static_llama.sh (new file)

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
#!/bin/bash
# Copyright (c) Qualcomm Innovation Center, Inc.
# All rights reserved
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# CI driver for the QNN static-llama tests on an x86 host:
# 1) compile-only run with weight sharing (build-android artifacts), and
# 2) accuracy run with weight sharing disabled (build-x86, --enable_x86_64),
# since the x86 emulator does not support weight sharing.
# Exits non-zero if either run fails.

set -exu

source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"

export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)"
export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
export LD_LIBRARY_PATH="${QNN_SDK_ROOT}/lib/x86_64-linux-clang"
export PYTHONPATH=".."

# Stage the flatbuffer schemas and the prebuilt QNN python adaptors where the
# python code expects to find them.
cp schema/program.fbs exir/_serialize/program.fbs
cp schema/scalar_type.fbs exir/_serialize/scalar_type.fbs
cp -f build-x86/backends/qualcomm/PyQnnManagerAdaptor.cpython-310-x86_64-linux-gnu.so backends/qualcomm/python
cp -f build-x86/backends/qualcomm/PyQnnWrapperAdaptor.cpython-310-x86_64-linux-gnu.so backends/qualcomm/python

if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
  PYTHON_EXECUTABLE=python3
fi

which "${PYTHON_EXECUTABLE}"

# Although static llama CI does not require graphviz, it is required by test_qnn_delegate.py
pip install graphviz

# Download stories llama110m artifacts
download_stories_model_artifacts
echo "Creating tokenizer.bin"
$PYTHON_EXECUTABLE -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin

# Run both tests even if the first fails, so CI reports both outcomes;
# capture each exit status instead of aborting under 'set -e'.
set +e

# Compile only as weight sharing is not applicable on x86
$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleScript.test_stories_single_llama --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir . --compile_only
exit_code1=$?

# Checks accuracy with weight sharing disabled since x86 does not support weight sharing.
$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleScript.test_stories_single_llama --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir . --enable_x86_64
exit_code2=$?

# Check the exit codes and print messages
if [ "$exit_code1" -ne 0 ]; then
  echo "Static Llama compile only with weight sharing test failed. $exit_code1."
fi

if [ "$exit_code2" -ne 0 ]; then
  echo "Static Llama accuracy test failed. $exit_code2."
fi

# Return failure if either program failed.
# NOTE: the original script had a trailing 'set -e' after this if/else; it was
# unreachable dead code (both branches exit) and has been removed.
if [ "$exit_code1" -ne 0 ] || [ "$exit_code2" -ne 0 ]; then
  exit 1
else
  exit 0
fi

.github/workflows/pull.yml

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -437,30 +437,38 @@ jobs:
437437
# Test llama2
438438
PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -mode "${MODE}" -dtype "${DTYPE}" -pt2e_quantize "${PT2E_QUANTIZE}"
439439
440-
test-static-llama-runner-qnn-linux:
441-
name: test-static-llama-runner-qnn-linux
442-
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
440+
test-static-llama-qnn-linux:
441+
name: test-static-llama-qnn-linux
442+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
443+
permissions:
444+
id-token: write
445+
contents: read
443446
strategy:
444447
fail-fast: false
445448
with:
446449
runner: linux.2xlarge
447450
docker-image: executorch-ubuntu-22.04-qnn-sdk
448451
submodules: 'true'
449452
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
450-
timeout: 900
453+
timeout: 180
451454
script: |
452455
# The generic Linux job chooses to use base env, not the one setup by the image
453456
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
454457
conda activate "${CONDA_ENV}"
455458
459+
BUILD_TOOL="cmake"
460+
456461
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
457462
PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
458463
459-
# Retrieve 110M Stories Llama Artifacts
460-
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-stories-llama.sh
464+
# Setup executorch
465+
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}"
466+
467+
# Setup install_requirements for llama
468+
PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
461469
462-
# Test static llama stories110m
463-
PYTHON_EXECUTABLE=python backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleScript.test_stories_single_llama --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir . --compile_only"
470+
# Test static llama weight sharing and accuracy
471+
PYTHON_EXECUTABLE=python bash .ci/scripts/test_qnn_static_llama.sh
464472
465473
test-qnn-models-linux:
466474
name: test-qnn-models-linux

backends/qualcomm/tests/test_qnn_delegate.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1956,6 +1956,7 @@ def test_qnn_backend_multi_graphs(self):
19561956
soc_model=self.chipset_table[TestQNN.model],
19571957
backend_options=backend_options,
19581958
multiple_graphs=True,
1959+
weight_sharing=True,
19591960
graph_name=graph_name,
19601961
)
19611962
for graph_name in graph_names
@@ -2519,6 +2520,7 @@ def test_qnn_backend_multi_graphs(self):
25192520
soc_model=self.chipset_table[TestQNN.model],
25202521
backend_options=backend_options,
25212522
multiple_graphs=True,
2523+
weight_sharing=True,
25222524
graph_name=graph_name,
25232525
)
25242526
for graph_name in graph_names
@@ -3795,10 +3797,12 @@ def test_stories_single_llama(self):
37953797
]
37963798
if self.compile_only:
37973799
cmds.extend(["--compile_only"])
3798-
else:
3800+
elif self.device:
37993801
cmds.extend(["--device", self.device])
38003802
if self.host:
38013803
cmds.extend(["--host", self.host])
3804+
elif self.enable_x86_64:
3805+
cmds.extend(["--enable_x86_64"])
38023806

38033807
golden_start_with = "Once upon a time,"
38043808
p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
@@ -3812,8 +3816,10 @@ def test_stories_single_llama(self):
38123816
if not self.compile_only:
38133817
model_out = msg["result"][0]
38143818
self.assertTrue(model_out.startswith(golden_start_with))
3815-
pte_size = msg["pte_size"]
3816-
self.assertLessEqual(pte_size, 130000000)
3819+
# x86 does not allow weight sharing, so we don't check pte size
3820+
if not self.enable_x86_64:
3821+
pte_size = msg["pte_size"]
3822+
self.assertLessEqual(pte_size, 130000000)
38173823

38183824
@unittest.skip("dynamic shape inputs appear in recent torch.export.export")
38193825
def test_mobilebert(self):
@@ -4018,12 +4024,6 @@ def setup_environment():
40184024
help="Path to open source software model repository",
40194025
type=str,
40204026
)
4021-
parser.add_argument(
4022-
"-x",
4023-
"--enable_x86_64",
4024-
help="Enable unittest to be executed on x86_64 platform",
4025-
action="store_true",
4026-
)
40274027

40284028
args, ns_args = parser.parse_known_args(namespace=unittest)
40294029
TestQNN.host = args.host

backends/qualcomm/utils/utils.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1163,6 +1163,7 @@ def generate_qnn_executorch_compiler_spec(
11631163
shared_buffer: bool = False,
11641164
is_from_context_binary: bool = False,
11651165
multiple_graphs: bool = False,
1166+
weight_sharing: bool = False,
11661167
graph_name: str = "forward",
11671168
) -> List[CompileSpec]:
11681169
"""
@@ -1193,6 +1194,7 @@ def generate_qnn_executorch_compiler_spec(
11931194
is_from_context_binary: True if current graph comes from pre-built context binary.
11941195
multiple_graphs: True if multiple methods are expected to have in single .pte file.
11951196
Please see test cases for post-processing example.
1197+
weight_sharing: Used with multiple_graphs, where model size will be reduced when operations have the same weights across multiple graphs.
11961198
graph_name: Assign unique graph name if 'multiple_graphs' is used.
11971199
11981200
Returns:
@@ -1213,6 +1215,12 @@ def generate_qnn_executorch_compiler_spec(
12131215
stacklevel=1,
12141216
)
12151217

1218+
if weight_sharing and not multiple_graphs:
1219+
warnings.warn(
1220+
"Weight sharing is intended for multiple graphs scenario, please ensure if there are multiple graphs",
1221+
stacklevel=1,
1222+
)
1223+
12161224
qnn_executorch_options = QnnExecuTorchOptions(
12171225
_soc_info_table[soc_model], backend_options
12181226
)
@@ -1254,7 +1262,10 @@ def generate_qnn_executorch_compiler_spec(
12541262

12551263
if multiple_graphs:
12561264
# enable weight sharing mechanism if multiple graphs appear
1257-
if backend_options.backend_type == QnnExecuTorchBackendType.kHtpBackend:
1265+
if (
1266+
backend_options.backend_type == QnnExecuTorchBackendType.kHtpBackend
1267+
and weight_sharing
1268+
):
12581269
backend_options.htp_options.use_weight_sharing = True
12591270

12601271
return [

examples/qualcomm/oss_scripts/llama/llama.py

Lines changed: 75 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import json
1313
import logging
1414
import os
15+
import subprocess
1516
import sys
1617
import time
1718
from collections import OrderedDict
@@ -654,6 +655,7 @@ def compile(args, pte_filename, tokenizer):
654655
backend_options=backend_options,
655656
shared_buffer=args.shared_buffer,
656657
multiple_graphs=True,
658+
weight_sharing=not args.enable_x86_64, # x86 emulator does not support weight sharing
657659
graph_name=graph_name,
658660
)
659661
for graph_name in graph_names
@@ -790,48 +792,11 @@ def inference(args, quant_attrs, pte_filename, runtime_tokenizer_path, pre_gen_p
790792
else:
791793
raise RuntimeError(f"Unknown model_mode: {args.model_mode}.")
792794

793-
seq_len = args.prefill_seq_len if args.model_mode == "prefill" else args.kv_seq_len
794-
runner_args = " ".join(
795-
[
796-
f"--model_path {pte_filename}.pte",
797-
"--output_path outputs/outputs.txt",
798-
f"--tokenizer_path {os.path.basename(runtime_tokenizer_path)}",
799-
f'--prompt "{args.prompt}"',
800-
f"--seq_len {seq_len}",
801-
f"--eval_mode {eval_mode}",
802-
f"--temperature {args.temperature}",
803-
f"--system_prompt '{args.system_prompt}'",
804-
f"--logits_scale {quant_attrs['scale']}",
805-
f"--logits_offset {quant_attrs['zero_point']}",
806-
f"--kv_updator {'SmartMask' if args.kv_updator == smart_mask_updator else 'ShiftPointer'}",
807-
]
808-
)
809-
runner_cmd = " ".join(
810-
[
811-
f"cd {workspace} &&",
812-
f"./qnn_llama_runner {runner_args}",
813-
]
814-
)
815-
816795
pte_path = (
817796
f"{pre_gen_pte}/{pte_filename}.pte"
818797
if pre_gen_pte
819798
else f"{args.artifact}/{pte_filename}.pte"
820799
)
821-
adb = SimpleADB(
822-
qnn_sdk=os.getenv("QNN_SDK_ROOT"),
823-
build_path=f"{args.build_folder}",
824-
pte_path=pte_path,
825-
workspace=workspace,
826-
device_id=args.device,
827-
host_id=args.host,
828-
soc_model=args.model,
829-
shared_buffer=args.shared_buffer,
830-
runner=f"examples/qualcomm/oss_scripts/llama/qnn_llama_runner",
831-
)
832-
# No pregen inputs, input_list is not required
833-
adb.push(inputs=[], input_list="", files=[runtime_tokenizer_path])
834-
adb.execute(custom_runner_cmd=runner_cmd)
835800

836801
# collect output data
837802
output_data_folder = f"{args.artifact}/outputs"
@@ -842,7 +807,79 @@ def post_process():
842807
with open(f"{args.artifact}/outputs/outputs.txt", "r") as f:
843808
outputs.append(f.read())
844809

845-
adb.pull(output_path=args.artifact, callback=post_process)
810+
seq_len = args.prefill_seq_len if args.model_mode == "prefill" else args.kv_seq_len
811+
runner_args = " ".join(
812+
[
813+
f'--prompt "{args.prompt}"',
814+
f"--eval_mode {eval_mode}",
815+
f"--temperature {args.temperature}",
816+
f"--system_prompt '{args.system_prompt}'",
817+
f"--logits_scale {quant_attrs['scale']}",
818+
f"--logits_offset {quant_attrs['zero_point']}",
819+
]
820+
)
821+
822+
runner_cmd = ""
823+
if args.enable_x86_64:
824+
# x86 emulator is intended for CI and not performance. Check only the first few tokens.
825+
seq_len = min(seq_len, 16)
826+
827+
if args.kv_updator == smart_mask_updator:
828+
logging.warning(
829+
"x86 only support ShiftPointer, overwrite kv_updator to ShiftPointer"
830+
)
831+
832+
qnn_sdk = os.getenv("QNN_SDK_ROOT")
833+
target = "x86_64-linux-clang"
834+
runner_cmd = " ".join(
835+
[
836+
f"export LD_LIBRARY_PATH={qnn_sdk}/lib/{target}/:{args.build_folder}/lib &&",
837+
f"./{args.build_folder}/examples/qualcomm/oss_scripts/llama/qnn_llama_runner",
838+
f"--tokenizer_path {runtime_tokenizer_path}",
839+
f"--model_path {pte_path}",
840+
f"--seq_len {seq_len}",
841+
f"--output_path {args.artifact}/outputs/outputs.txt",
842+
f"--kv_updator ShiftPointer",
843+
runner_args,
844+
]
845+
)
846+
subprocess.run(
847+
runner_cmd,
848+
shell=True,
849+
executable="/bin/bash",
850+
capture_output=True,
851+
)
852+
post_process()
853+
else:
854+
runner_cmd = " ".join(
855+
[
856+
f"cd {workspace} &&",
857+
f"./qnn_llama_runner",
858+
f"--tokenizer_path {os.path.basename(runtime_tokenizer_path)}",
859+
f"--model_path {pte_filename}.pte",
860+
f"--seq_len {seq_len}",
861+
"--output_path outputs/outputs.txt",
862+
f"--kv_updator {'SmartMask' if args.kv_updator == smart_mask_updator else 'ShiftPointer'}",
863+
runner_args,
864+
]
865+
)
866+
867+
adb = SimpleADB(
868+
qnn_sdk=os.getenv("QNN_SDK_ROOT"),
869+
build_path=f"{args.build_folder}",
870+
pte_path=pte_path,
871+
workspace=workspace,
872+
device_id=args.device,
873+
host_id=args.host,
874+
soc_model=args.model,
875+
shared_buffer=args.shared_buffer,
876+
runner=f"examples/qualcomm/oss_scripts/llama/qnn_llama_runner",
877+
)
878+
# No pregen inputs, input_list is not required
879+
adb.push(inputs=[], input_list="", files=[runtime_tokenizer_path])
880+
adb.execute(custom_runner_cmd=runner_cmd)
881+
882+
adb.pull(output_path=args.artifact, callback=post_process)
846883
if args.ip and args.port != -1:
847884
pte_size = os.path.getsize(pte_path)
848885
with Client((args.ip, args.port)) as conn:

examples/qualcomm/utils.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -524,6 +524,13 @@ def setup_common_args_and_variables():
524524
default=False,
525525
)
526526

527+
parser.add_argument(
528+
"-x",
529+
"--enable_x86_64",
530+
help="Enable unittest to be executed on x86_64 platform",
531+
action="store_true",
532+
)
533+
527534
# QNN_SDK_ROOT might also be an argument, but it is used in various places.
528535
# So maybe it's fine to just use the environment.
529536
if "QNN_SDK_ROOT" not in os.environ:

0 commit comments

Comments
 (0)