Skip to content

Commit e70fc97

Browse files
committed
Enable x86 runner for static llama
1 parent 0bccf60 commit e70fc97

File tree

5 files changed

+141
-50
lines changed

5 files changed

+141
-50
lines changed

.github/workflows/pull.yml

Lines changed: 37 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -440,7 +440,8 @@ jobs:
440440
# Test llama2
441441
PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -mode "${MODE}" -dtype "${DTYPE}" -pt2e_quantize "${PT2E_QUANTIZE}"
442442
443-
test-static-llama-runner-qnn-linux:
443+
# Compile only as weight sharing is not applicable on x86
444+
test-static-llama-size-qnn-linux:
444445
name: test-static-llama-size-qnn-linux
445446
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
446447
strategy:
@@ -459,13 +460,46 @@ jobs:
459460
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
460461
PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
461462
463+
# Setup executorch
464+
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}"
465+
462466
# Retrieve 110M Stories Llama Artifacts
463-
PYTHON_EXECUTABLE=python bash .ci/scripts/utils.sh
464467
PYTHON_EXECUTABLE=python download_stories_model_artifacts
468+
$PYTHON_EXECUTABLE -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin
465469
466-
# Test static llama stories110m
470+
# Test static llama stories110m pte size
467471
PYTHON_EXECUTABLE=python backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleScript.test_stories_single_llama --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir . --compile_only"
468472
473+
# Checks accuracy with weight sharing disabled since x86 does not support weight sharing.
474+
test-static-llama-accuracy-qnn-linux:
475+
name: test-static-llama-accuracy-qnn-linux
476+
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
477+
strategy:
478+
fail-fast: false
479+
with:
480+
runner: linux.2xlarge
481+
docker-image: executorch-ubuntu-22.04-qnn-sdk
482+
submodules: 'true'
483+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
484+
timeout: 900
485+
script: |
486+
# The generic Linux job chooses to use base env, not the one setup by the image
487+
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
488+
conda activate "${CONDA_ENV}"
489+
490+
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
491+
PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
492+
493+
# Setup executorch
494+
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}"
495+
496+
# Retrieve 110M Stories Llama Artifacts
497+
PYTHON_EXECUTABLE=python download_stories_model_artifacts
498+
$PYTHON_EXECUTABLE -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin
499+
500+
# Test static llama stories110m accuracy
501+
PYTHON_EXECUTABLE=python backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleScript.test_stories_single_llama --model SM8650 --build_folder build-x86_64/ --executorch_root . --artifact_dir . --enable_x86_64"
502+
469503
test-qnn-models-linux:
470504
name: test-qnn-models-linux
471505
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main

backends/qualcomm/tests/test_qnn_delegate.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1930,6 +1930,7 @@ def test_qnn_backend_multi_graphs(self):
19301930
soc_model=self.chipset_table[TestQNN.model],
19311931
backend_options=backend_options,
19321932
multiple_graphs=True,
1933+
weight_sharing=True,
19331934
graph_name=graph_name,
19341935
)
19351936
for graph_name in graph_names
@@ -2418,6 +2419,7 @@ def test_qnn_backend_multi_graphs(self):
24182419
soc_model=self.chipset_table[TestQNN.model],
24192420
backend_options=backend_options,
24202421
multiple_graphs=True,
2422+
weight_sharing=True,
24212423
graph_name=graph_name,
24222424
)
24232425
for graph_name in graph_names
@@ -3621,6 +3623,8 @@ def test_stories_single_llama(self):
36213623
cmds.extend(["--device", self.device])
36223624
if self.host:
36233625
cmds.extend(["--host", self.host])
3626+
if self.enable_x86_64:
3627+
cmds.extend(["--enable_x86_64"])
36243628

36253629
golden_start_with = "Once upon a time,"
36263630
p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
@@ -3634,8 +3638,10 @@ def test_stories_single_llama(self):
36343638
if not self.compile_only:
36353639
model_out = msg["result"][0]
36363640
self.assertTrue(model_out.startswith(golden_start_with))
3637-
pte_size = msg["pte_size"]
3638-
self.assertLessEqual(pte_size, 130000000)
3641+
# x86 does not allow weight sharing, so we don't check pte size
3642+
if not self.enable_x86_64:
3643+
pte_size = msg["pte_size"]
3644+
self.assertLessEqual(pte_size, 130000000)
36393645

36403646
@unittest.skip("dynamic shape inputs appear in recent torch.export.export")
36413647
def test_mobilebert(self):
@@ -3840,12 +3846,6 @@ def setup_environment():
38403846
help="Path to open source software model repository",
38413847
type=str,
38423848
)
3843-
parser.add_argument(
3844-
"-x",
3845-
"--enable_x86_64",
3846-
help="Enable unittest to be executed on x86_64 platform",
3847-
action="store_true",
3848-
)
38493849

38503850
args, ns_args = parser.parse_known_args(namespace=unittest)
38513851
TestQNN.host = args.host

backends/qualcomm/utils/utils.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1122,6 +1122,7 @@ def generate_qnn_executorch_compiler_spec(
11221122
shared_buffer: bool = False,
11231123
is_from_context_binary: bool = False,
11241124
multiple_graphs: bool = False,
1125+
weight_sharing: bool = False,
11251126
graph_name: str = "forward",
11261127
) -> List[CompileSpec]:
11271128
"""
@@ -1152,6 +1153,7 @@ def generate_qnn_executorch_compiler_spec(
11521153
is_from_context_binary: True if current graph comes from pre-built context binary.
11531154
multiple_graphs: True if multiple methods are expected to have in single .pte file.
11541155
Please see test cases for post-processing example.
1156+
weight_sharing: Used with multiple_graphs, where model size will be reduced when operations have the same weights across multiple graphs.
11551157
graph_name: Assign unique graph name if 'multiple_graphs' is used.
11561158
11571159
Returns:
@@ -1172,6 +1174,12 @@ def generate_qnn_executorch_compiler_spec(
11721174
stacklevel=1,
11731175
)
11741176

1177+
if weight_sharing and not multiple_graphs:
1178+
warnings.warn(
1179+
"Weight sharing is intended for multiple graphs scenario, please ensure if there are multiple graphs",
1180+
stacklevel=1,
1181+
)
1182+
11751183
qnn_executorch_options = QnnExecuTorchOptions(
11761184
_soc_info_table[soc_model], backend_options
11771185
)
@@ -1213,7 +1221,10 @@ def generate_qnn_executorch_compiler_spec(
12131221

12141222
if multiple_graphs:
12151223
# enable weight sharing mechanism if multiple graphs appear
1216-
if backend_options.backend_type == QnnExecuTorchBackendType.kHtpBackend:
1224+
if (
1225+
backend_options.backend_type == QnnExecuTorchBackendType.kHtpBackend
1226+
and weight_sharing
1227+
):
12171228
backend_options.htp_options.use_weight_sharing = True
12181229

12191230
return [

examples/qualcomm/oss_scripts/llama/llama.py

Lines changed: 77 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import json
1313
import logging
1414
import os
15+
import subprocess
1516
import sys
1617
import time
1718
from collections import OrderedDict
@@ -647,6 +648,9 @@ def compile(args, pte_filename, tokenizer):
647648
backend_options=backend_options,
648649
shared_buffer=args.shared_buffer,
649650
multiple_graphs=True,
651+
weight_sharing=(
652+
False if args.enable_x86_64 else True
653+
), # x86 emulator does not support weight sharing
650654
graph_name=graph_name,
651655
)
652656
for graph_name in graph_names
@@ -779,48 +783,11 @@ def inference(args, quant_attrs, pte_filename, runtime_tokenizer_path, pre_gen_p
779783
else:
780784
raise RuntimeError(f"Unknown model_mode: {args.model_mode}.")
781785

782-
seq_len = args.prefill_seq_len if args.model_mode == "prefill" else args.kv_seq_len
783-
runner_args = " ".join(
784-
[
785-
f"--model_path {pte_filename}.pte",
786-
"--output_path outputs/outputs.txt",
787-
f"--tokenizer_path {os.path.basename(runtime_tokenizer_path)}",
788-
f'--prompt "{args.prompt}"',
789-
f"--seq_len {seq_len}",
790-
f"--eval_mode {eval_mode}",
791-
f"--temperature {args.temperature}",
792-
f"--system_prompt '{args.system_prompt}'",
793-
f"--logits_scale {quant_attrs['scale']}",
794-
f"--logits_offset {quant_attrs['zero_point']}",
795-
f"--kv_updator {'SmartMask' if args.kv_updator == smart_mask_updator else 'ShiftPointer'}",
796-
]
797-
)
798-
runner_cmd = " ".join(
799-
[
800-
f"cd {workspace} &&",
801-
f"./qnn_llama_runner {runner_args}",
802-
]
803-
)
804-
805786
pte_path = (
806787
f"{pre_gen_pte}/{pte_filename}.pte"
807788
if pre_gen_pte
808789
else f"{args.artifact}/{pte_filename}.pte"
809790
)
810-
adb = SimpleADB(
811-
qnn_sdk=os.getenv("QNN_SDK_ROOT"),
812-
build_path=f"{args.build_folder}",
813-
pte_path=pte_path,
814-
workspace=workspace,
815-
device_id=args.device,
816-
host_id=args.host,
817-
soc_model=args.model,
818-
shared_buffer=args.shared_buffer,
819-
runner=f"examples/qualcomm/oss_scripts/llama/qnn_llama_runner",
820-
)
821-
# No pregen inputs, input_list is not required
822-
adb.push(inputs=[], input_list="", files=[runtime_tokenizer_path])
823-
adb.execute(custom_runner_cmd=runner_cmd)
824791

825792
# collect output data
826793
output_data_folder = f"{args.artifact}/outputs"
@@ -831,7 +798,79 @@ def post_process():
831798
with open(f"{args.artifact}/outputs/outputs.txt", "r") as f:
832799
outputs.append(f.read())
833800

834-
adb.pull(output_path=args.artifact, callback=post_process)
801+
seq_len = args.prefill_seq_len if args.model_mode == "prefill" else args.kv_seq_len
802+
runner_args = " ".join(
803+
[
804+
f'--prompt "{args.prompt}"',
805+
f"--eval_mode {eval_mode}",
806+
f"--temperature {args.temperature}",
807+
f"--system_prompt '{args.system_prompt}'",
808+
f"--logits_scale {quant_attrs['scale']}",
809+
f"--logits_offset {quant_attrs['zero_point']}",
810+
]
811+
)
812+
813+
runner_cmd = ""
814+
if args.enable_x86_64:
815+
# x86 emulator is intended for CI and not performance. Check only the first few tokens.
816+
seq_len = min(seq_len, 16)
817+
818+
if args.kv_updator == smart_mask_updator:
819+
logging.warning(
820+
"x86 only support ShiftPointer, overwrite kv_updator to ShiftPointer"
821+
)
822+
823+
qnn_sdk = os.getenv("QNN_SDK_ROOT")
824+
target = "x86_64-linux-clang"
825+
runner_cmd = " ".join(
826+
[
827+
f"export LD_LIBRARY_PATH={qnn_sdk}/lib/{target}/:{args.build_folder}/lib &&",
828+
f"./{args.build_folder}/examples/qualcomm/oss_scripts/llama/qnn_llama_runner",
829+
f"--tokenizer_path {runtime_tokenizer_path}",
830+
f"--model_path {pte_path}",
831+
f"--seq_len {seq_len}",
832+
f"--output_path {args.artifact}/outputs/outputs.txt",
833+
f"--kv_updator ShiftPointer",
834+
runner_args,
835+
]
836+
)
837+
subprocess.run(
838+
runner_cmd,
839+
shell=True,
840+
executable="/bin/bash",
841+
capture_output=True,
842+
)
843+
post_process()
844+
else:
845+
runner_cmd = " ".join(
846+
[
847+
f"cd {workspace} &&",
848+
f"./qnn_llama_runner",
849+
f"--tokenizer_path {os.path.basename(runtime_tokenizer_path)}",
850+
f"--model_path {pte_filename}.pte",
851+
f"--seq_len {seq_len}",
852+
"--output_path outputs/outputs.txt",
853+
f"--kv_updator {'SmartMask' if args.kv_updator == smart_mask_updator else 'ShiftPointer'}",
854+
runner_args,
855+
]
856+
)
857+
858+
adb = SimpleADB(
859+
qnn_sdk=os.getenv("QNN_SDK_ROOT"),
860+
build_path=f"{args.build_folder}",
861+
pte_path=pte_path,
862+
workspace=workspace,
863+
device_id=args.device,
864+
host_id=args.host,
865+
soc_model=args.model,
866+
shared_buffer=args.shared_buffer,
867+
runner=f"examples/qualcomm/oss_scripts/llama/qnn_llama_runner",
868+
)
869+
# No pregen inputs, input_list is not required
870+
adb.push(inputs=[], input_list="", files=[runtime_tokenizer_path])
871+
adb.execute(custom_runner_cmd=runner_cmd)
872+
873+
adb.pull(output_path=args.artifact, callback=post_process)
835874
if args.ip and args.port != -1:
836875
pte_size = os.path.getsize(pte_path)
837876
with Client((args.ip, args.port)) as conn:

examples/qualcomm/utils.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -524,6 +524,13 @@ def setup_common_args_and_variables():
524524
default=False,
525525
)
526526

527+
parser.add_argument(
528+
"-x",
529+
"--enable_x86_64",
530+
help="Enable unittest to be executed on x86_64 platform",
531+
action="store_true",
532+
)
533+
527534
# QNN_SDK_ROOT might also be an argument, but it is used in various places.
528535
# So maybe it's fine to just use the environment.
529536
if "QNN_SDK_ROOT" not in os.environ:

0 commit comments

Comments (0)