Commit f592d85

Qualcomm AI Engine Direct - Reland GA Static QWEN2.5 0.5B (#12582)
### Summary
The previous PR (#12054) was merged unintentionally. On top of that PR, this change also includes a new commit addressing code review feedback.

### Test plan
Adds `test_qwen2_5` to backends/qualcomm/tests/test_qnn_delegate.py, which exports Qwen2.5 0.5B with `--ptq 16a8w` in hybrid mode and checks the returned output and inference speed.
1 parent 0134e88 commit f592d85

14 files changed: +336 additions, −137 deletions

backends/qualcomm/_passes/layout_transform.py
Lines changed: 1 addition & 1 deletion

@@ -103,8 +103,8 @@ class LayoutTransform(ExportPass):
         exir_ops.edge.aten.pow.Tensor_Scalar,
         exir_ops.edge.aten.prelu.default,
         exir_ops.edge.aten.repeat.default,
-        exir_ops.edge.aten.round.default,
         exir_ops.edge.aten.relu.default,
+        exir_ops.edge.aten.round.default,
         exir_ops.edge.aten.sigmoid.default,
         exir_ops.edge.aten.split_with_sizes.default,
         exir_ops.edge.aten.split_with_sizes_copy.default,

backends/qualcomm/quantizer/annotators.py
Lines changed: 4 additions & 2 deletions

@@ -278,7 +278,9 @@ def annotate_masked_fill(node: Node, quantization_config: QuantizationConfig) ->
     )


-@register_annotator([torch.ops.aten.mul, torch.ops.aten.mul.Tensor])
+@register_annotator(
+    [torch.ops.aten.mul, torch.ops.aten.mul.Tensor, torch.ops.aten.mul_.Tensor]
+)
 def annotate_mul(node: Node, quantization_config: QuantizationConfig) -> None:
     annotate_binary(node, quantization_config)

@@ -1311,7 +1313,7 @@ def annotate_where(node: Node, quantization_config: QuantizationConfig) -> None:
     )


-@register_annotator([torch.ops.aten.zeros.default])
+@register_annotator([torch.ops.aten.zeros.default, torch.ops.aten.zeros_like.default])
 def annotate_zeros(node: Node, quantization_config: QuantizationConfig) -> None:
     if _is_annotated([node]) or not _is_float_tensor(node):
         return
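The practical effect of the expanded target lists is that in-place multiplies (`torch.ops.aten.mul_.Tensor`) and `zeros_like` nodes now resolve to the existing `annotate_mul` / `annotate_zeros` handlers when the quantizer walks the graph. Below is a minimal sketch of the registry pattern the decorator implies; the registry and decorator bodies are illustrative assumptions, not the actual implementation in annotators.py.

```python
from typing import Callable, Dict, List

# Illustrative registry mapping each aten op target to its annotation handler.
# This sketches the decorator pattern implied by @register_annotator; the real
# implementation in annotators.py may differ.
_ANNOTATORS: Dict[object, Callable] = {}


def register_annotator(ops: List[object]):
    def decorator(fn: Callable) -> Callable:
        for op in ops:
            _ANNOTATORS[op] = fn
        return fn

    return decorator


@register_annotator(["aten.mul", "aten.mul.Tensor", "aten.mul_.Tensor"])
def annotate_mul(node, quantization_config) -> None:
    ...  # the same handler now also covers the in-place mul_ variant
```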

backends/qualcomm/quantizer/custom_annotation.py
Lines changed: 7 additions & 4 deletions

@@ -153,7 +153,9 @@ def annotate_prefill_kv_output(gm: torch.fx.GraphModule, kv_quant_attrs: dict):
     )


-def annotate_matmul_16a8w(gm: torch.fx.GraphModule) -> None:  # noqa: C901
+def annotate_matmul_16a8w(  # noqa: C901
+    gm: torch.fx.GraphModule, annotate_conv=True
+) -> None:
     """
     This function is specific for matmul op 16a8w.
     For k, we will tag such as the below, and

@@ -317,9 +319,10 @@ def annotate_matmul_input1(node: Node):
                 # The arguments of cat op: (the past kv cache, the new kv cache)
                 node = node.args[0][1]
             elif node.target == torch.ops.aten.conv2d.default:
-                annotate_conv2d(
-                    node, quantization_config=quantization_config_8a4w_per_channel
-                )
+                if annotate_conv:
+                    annotate_conv2d(
+                        node, quantization_config=quantization_config_8a4w_per_channel
+                    )
                 break
             elif node.target in [torch.ops.aten.add.Tensor, torch.ops.aten.sub.Tensor]:
                 break
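For callers, the new keyword argument makes conv2d annotation along the k/v path optional. A hedged usage sketch, assuming the module path below is importable from an ExecuTorch checkout:

```python
# Hedged usage sketch: gm is an exported torch.fx.GraphModule for the decoder.
# annotate_conv=False keeps matmul inputs tagged 16a8w but skips the 8a4w
# per-channel annotation of conv2d nodes encountered along the k/v path.
from executorch.backends.qualcomm.quantizer.custom_annotation import (
    annotate_matmul_16a8w,
)


def apply_matmul_annotation(gm, include_conv: bool = True) -> None:
    annotate_matmul_16a8w(gm, annotate_conv=include_conv)
```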

backends/qualcomm/scripts/build.sh
Lines changed: 8 additions & 0 deletions

@@ -85,6 +85,7 @@ if [ "$BUILD_AARCH64" = true ]; then
         -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
         -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
         -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
+        -DEXECUTORCH_ENABLE_LOGGING=ON \
         -DQNN_SDK_ROOT=$QNN_SDK_ROOT \
         -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \
         -DANDROID_ABI='arm64-v8a' \

@@ -104,6 +105,9 @@ if [ "$BUILD_AARCH64" = true ]; then
         -DANDROID_ABI='arm64-v8a' \
         -DANDROID_PLATFORM=android-30 \
         -DCMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH \
+        -DSUPPORT_REGEX_LOOKAHEAD=ON \
+        -DBUILD_TESTING=OFF \
+        -DEXECUTORCH_ENABLE_LOGGING=ON \
         -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
         -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \
         -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \

@@ -134,6 +138,7 @@ if [ "$BUILD_X86_64" = true ]; then
         -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
         -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
         -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
+        -DEXECUTORCH_ENABLE_LOGGING=ON \
         -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
         -S $PRJ_ROOT \
         -B $BUILD_ROOT \

@@ -157,6 +162,9 @@ if [ "$BUILD_X86_64" = true ]; then
         -DCMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH \
         -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \
         -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
+        -DSUPPORT_REGEX_LOOKAHEAD=ON \
+        -DBUILD_TESTING=OFF \
+        -DEXECUTORCH_ENABLE_LOGGING=ON \
         -B$EXAMPLE_ROOT

     cmake --build $EXAMPLE_ROOT -j$BUILD_JOB_NUMBER

backends/qualcomm/tests/test_qnn_delegate.py
Lines changed: 61 additions & 2 deletions

@@ -4049,7 +4049,7 @@ def test_llama3_2_1b(self):
             "16a4w",
             "--temperature",
             "0",
-            "--llama_model",
+            "--decoder_model",
             "llama3_2",
             "--model_mode",
             "hybrid",

@@ -4129,7 +4129,7 @@ def test_llama_stories_110m(self):
             "16a4w",
             "--temperature",
             "0",
-            "--llama_model",
+            "--decoder_model",
             "stories110m",
             "--model_mode",
             "hybrid",

@@ -4171,6 +4171,65 @@ def test_llama_stories_110m(self):
         if not self.compile_only and not self.enable_x86_64:
             self.assertGreaterEqual(msg["inference_speed"], 220)  # Lanai
 
+    def test_qwen2_5(self):
+        if not self.required_envs():
+            self.skipTest("missing required envs")
+
+        prompt = "My favourite condiment is "
+        cmds = [
+            "python",
+            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
+            "--artifact",
+            self.artifact_dir,
+            "--build_folder",
+            self.build_folder,
+            "--model",
+            self.model,
+            "--ip",
+            self.ip,
+            "--port",
+            str(self.port),
+            "--prompt",
+            f"{prompt}",
+            "--ptq",
+            "16a8w",
+            "--decoder_model",
+            "qwen2_5",
+            "--model_mode",
+            "hybrid",
+            "--prefill_ar_len",
+            "32",
+            "--max_seq_len",
+            "128",
+        ]
+        if self.compile_only:
+            cmds.extend(["--compile_only"])
+        elif self.device:
+            cmds.extend(["--device", self.device])
+            if self.host:
+                cmds.extend(["--host", self.host])
+        elif self.enable_x86_64:
+            cmds.extend(["--enable_x86_64"])
+        if self.pre_gen_pte:
+            cmds.extend(["--pre_gen_pte", self.pre_gen_pte])
+
+        # Accuracy is bad for now. Just check user's prompt is returned.
+        golden_start_with = "My favourite condiment is "
+        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
+        with Listener((self.ip, self.port)) as listener:
+            conn = listener.accept()
+            p.communicate()
+            msg = json.loads(conn.recv())
+            if "Error" in msg:
+                self.fail(msg["Error"])
+            else:
+                model_out = msg["result"][0]
+                self.assertTrue(
+                    model_out.startswith(golden_start_with),
+                    f"Expected Output: {golden_start_with}. Actual Output: {model_out}",
+                )
+                self.assertGreaterEqual(msg["inference_speed"], 95)  # Lanai
+
 
 class TestExampleOssScript(TestQNN):
     def test_albert(self):
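Outside the CI harness, the same flow the new test exercises can be launched directly against the example script. A rough sketch follows; the artifact path, build folder, SoC name, and device serial are placeholders for your own setup, not values from this PR.

```python
# Hedged sketch of launching the Qwen2.5 0.5B example the way test_qwen2_5
# does; every path, SoC name, and device value below is a placeholder.
import subprocess

cmd = [
    "python",
    "examples/qualcomm/oss_scripts/llama/llama.py",
    "--decoder_model", "qwen2_5",
    "--ptq", "16a8w",
    "--model_mode", "hybrid",
    "--prefill_ar_len", "32",
    "--max_seq_len", "128",
    "--prompt", "My favourite condiment is ",
    "--artifact", "./qwen2_5_artifacts",  # placeholder output directory
    "--build_folder", "build-android",    # placeholder build folder
    "--model", "SM8650",                  # placeholder SoC model
    "--device", "<adb-serial>",           # placeholder device serial
]
subprocess.run(cmd, check=True)
```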
Lines changed: 14 additions & 0 deletions

@@ -0,0 +1,14 @@
+{
+    "dim": 896,
+    "ffn_dim_multiplier": 1,
+    "hidden_dim": 4864,
+    "n_heads": 14,
+    "n_kv_heads": 2,
+    "n_layers": 24,
+    "norm_eps": 1e-06,
+    "rope_theta": 1000000.0,
+    "use_scaled_rope": false,
+    "vocab_size": 151936,
+    "use_hf_rope": true,
+    "attention_qkv_bias": true
+}
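These fields follow the Llama-style model-args layout (dim, n_heads, n_kv_heads, RoPE settings), filled in here for Qwen2.5 0.5B. A small sketch of reading the file and deriving the per-head values a static decoder typically needs; the filename below is a placeholder.

```python
# Hedged sketch: read the new params JSON and derive per-head sizes; the
# filename is a placeholder, the arithmetic uses the values shown above.
import json

with open("qwen2_5_0_5b_params.json") as f:
    params = json.load(f)

head_dim = params["dim"] // params["n_heads"]           # 896 // 14 = 64
gqa_groups = params["n_heads"] // params["n_kv_heads"]  # 14 // 2 = 7
print(f"head_dim={head_dim}, gqa_groups={gqa_groups}")
```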

examples/qualcomm/CMakeLists.txt
Lines changed: 2 additions & 2 deletions

@@ -77,8 +77,8 @@ target_include_directories(
 
 # add tokenizers
 add_subdirectory(
-  ${EXECUTORCH_ROOT}/extension/llm/tokenizers
-  ${CMAKE_CURRENT_BINARY_DIR}/../../extension/llm/tokenizers
+  ${EXECUTORCH_ROOT}/extension/llm/runner
+  ${CMAKE_CURRENT_BINARY_DIR}/../../extension/llm/runner
 )
 
 # build qnn_executor_runner

examples/qualcomm/oss_scripts/llama/CMakeLists.txt
Lines changed: 9 additions & 0 deletions

@@ -4,6 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+
 # model sharding with custom op
 set(CUSTOM_OP_SRCS_FILE
   "${EXECUTORCH_SOURCE_DIR}/extension/llm/custom_ops/op_fallback.cpp"

@@ -63,14 +64,22 @@ target_link_libraries(
   executorch_core
   extension_data_loader
   extension_flat_tensor
+  extension_llm_runner
   extension_module
   extension_tensor
+  tokenizers
   gflags
   custom_ops
   quantized_ops_lib
   quantized_kernels
   tokenizers
 )
+
+target_include_directories(
+  qnn_llama_runner
+  PUBLIC ${EXECUTORCH_ROOT}/extension/llm/tokenizers/include
+)
+
 target_compile_options(qnn_llama_runner PUBLIC ${_common_compile_options})
 set_target_properties(
   qnn_llama_runner PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'"

examples/qualcomm/oss_scripts/llama/README.md
Lines changed: 2 additions & 1 deletion

@@ -1,10 +1,11 @@
 # Summary
 
 ## Overview
-This file provides you the instructions to run LLAMA model with different parameters via Qualcomm HTP backend. We currently support the following models:
+This file provides you the instructions to run LLM Decoder model with different parameters via Qualcomm HTP backend. We currently support the following models:
 1. LLAMA2 Stories 110M
 2. LLAMA3.2 1B
 3. LLAMA3.2 3B
+4. QWEN2.5 0.5B
 
 We offer the following modes to execute the model:
