
Commit 0e7a016

bump arm baremetal size test thresholds again for EXPORT diff

[ghstack-poisoned]

2 parents: a436fbe + 1bb2b3d

16 files changed (+368, -163 lines)

.github/workflows/trunk.yml

Lines changed: 2 additions & 2 deletions
@@ -242,12 +242,12 @@ jobs:
           setup_script_args=""
           if [[ ${{ matrix.os}} == "bare_metal" ]]; then
             toolchain_prefix=arm-none-eabi-
-            threshold="108260"
+            threshold="109000"
             toolchain_cmake=examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake
           elif [[ ${{ matrix.os}} == "zephyr-preset" ]]; then
             setup_script_args="--target-toolchain zephyr"
             toolchain_prefix=arm-zephyr-eabi-
-            threshold="133120" # should be ~125KB, set threshold to 130KB
+            threshold="135000"
             toolchain_cmake=examples/zephyr/x86_64-linux-arm-zephyr-eabi-gcc.cmake
           else
             echo "Fail unsupport OS selection ${{ matrix.os }}"

backends/qualcomm/_passes/layout_transform.py

Lines changed: 1 addition & 1 deletion
@@ -103,8 +103,8 @@ class LayoutTransform(ExportPass):
         exir_ops.edge.aten.pow.Tensor_Scalar,
         exir_ops.edge.aten.prelu.default,
         exir_ops.edge.aten.repeat.default,
-        exir_ops.edge.aten.round.default,
         exir_ops.edge.aten.relu.default,
+        exir_ops.edge.aten.round.default,
         exir_ops.edge.aten.sigmoid.default,
         exir_ops.edge.aten.split_with_sizes.default,
         exir_ops.edge.aten.split_with_sizes_copy.default,

backends/qualcomm/quantizer/annotators.py

Lines changed: 4 additions & 2 deletions
@@ -278,7 +278,9 @@ def annotate_masked_fill(node: Node, quantization_config: QuantizationConfig) ->
     )


-@register_annotator([torch.ops.aten.mul, torch.ops.aten.mul.Tensor])
+@register_annotator(
+    [torch.ops.aten.mul, torch.ops.aten.mul.Tensor, torch.ops.aten.mul_.Tensor]
+)
 def annotate_mul(node: Node, quantization_config: QuantizationConfig) -> None:
     annotate_binary(node, quantization_config)


@@ -1311,7 +1313,7 @@ def annotate_where(node: Node, quantization_config: QuantizationConfig) -> None:
     )


-@register_annotator([torch.ops.aten.zeros.default])
+@register_annotator([torch.ops.aten.zeros.default, torch.ops.aten.zeros_like.default])
 def annotate_zeros(node: Node, quantization_config: QuantizationConfig) -> None:
     if _is_annotated([node]) or not _is_float_tensor(node):
         return
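The two registrations above extend existing annotators to the in-place mul variant and to zeros_like. A toy module, not from this commit, that exercises both ops is sketched below; whether the in-place variant survives into the captured graph as aten.mul_.Tensor depends on the capture and quantization flow used.

import torch

class Gate(torch.nn.Module):
    # Toy module touching the newly registered ops.
    def forward(self, x):
        mask = torch.zeros_like(x)    # aten.zeros_like.default -> annotate_zeros
        y = x.clone()
        y.mul_(torch.sigmoid(x))      # may lower to aten.mul_.Tensor -> annotate_mul
        return y + mask

print(Gate()(torch.randn(2, 4)))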

backends/qualcomm/quantizer/custom_annotation.py

Lines changed: 7 additions & 4 deletions
@@ -153,7 +153,9 @@ def annotate_prefill_kv_output(gm: torch.fx.GraphModule, kv_quant_attrs: dict):
     )


-def annotate_matmul_16a8w(gm: torch.fx.GraphModule) -> None:  # noqa: C901
+def annotate_matmul_16a8w(  # noqa: C901
+    gm: torch.fx.GraphModule, annotate_conv=True
+) -> None:
     """
     This function is specific for matmul op 16a8w.
     For k, we will tag such as the below, and
@@ -317,9 +319,10 @@ def annotate_matmul_input1(node: Node):
                 # The arguments of cat op: (the past kv cache, the new kv cache)
                 node = node.args[0][1]
             elif node.target == torch.ops.aten.conv2d.default:
-                annotate_conv2d(
-                    node, quantization_config=quantization_config_8a4w_per_channel
-                )
+                if annotate_conv:
+                    annotate_conv2d(
+                        node, quantization_config=quantization_config_8a4w_per_channel
+                    )
                 break
             elif node.target in [torch.ops.aten.add.Tensor, torch.ops.aten.sub.Tensor]:
                 break
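A usage sketch of the new annotate_conv keyword. The GraphModule is assumed to come from the Qualcomm llama export flow and is not constructed here; the import path assumes the usual executorch package layout.

from executorch.backends.qualcomm.quantizer.custom_annotation import (
    annotate_matmul_16a8w,
)

def apply_custom_annotations(gm):
    # `gm` is assumed to be the exported llama GraphModule from the example flow.
    # New in this commit: annotate_conv=False leaves conv2d nodes to the base
    # quantizer config while matmul still gets the 16a8w treatment.
    annotate_matmul_16a8w(gm, annotate_conv=False)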

backends/qualcomm/scripts/build.sh

Lines changed: 8 additions & 0 deletions
@@ -87,6 +87,7 @@ if [ "$BUILD_AARCH64" = true ]; then
     -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
     -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
     -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
+    -DEXECUTORCH_ENABLE_LOGGING=ON \
     -DQNN_SDK_ROOT=$QNN_SDK_ROOT \
     -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \
     -DANDROID_ABI='arm64-v8a' \
@@ -106,6 +107,9 @@ if [ "$BUILD_AARCH64" = true ]; then
     -DANDROID_ABI='arm64-v8a' \
     -DANDROID_PLATFORM=android-30 \
     -DCMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH \
+    -DSUPPORT_REGEX_LOOKAHEAD=ON \
+    -DBUILD_TESTING=OFF \
+    -DEXECUTORCH_ENABLE_LOGGING=ON \
     -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
     -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \
     -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
@@ -138,6 +142,7 @@ if [ "$BUILD_X86_64" = true ]; then
     -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
     -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
     -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
+    -DEXECUTORCH_ENABLE_LOGGING=ON \
     -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
     -S $PRJ_ROOT \
     -B $BUILD_ROOT \
@@ -161,6 +166,9 @@ if [ "$BUILD_X86_64" = true ]; then
     -DCMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH \
     -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \
     -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
+    -DSUPPORT_REGEX_LOOKAHEAD=ON \
+    -DBUILD_TESTING=OFF \
+    -DEXECUTORCH_ENABLE_LOGGING=ON \
     -B$EXAMPLE_ROOT

   cmake --build $EXAMPLE_ROOT -j$BUILD_JOB_NUMBER

backends/qualcomm/tests/test_qnn_delegate.py

Lines changed: 61 additions & 2 deletions
@@ -4049,7 +4049,7 @@ def test_llama3_2_1b(self):
             "16a4w",
             "--temperature",
             "0",
-            "--llama_model",
+            "--decoder_model",
             "llama3_2",
             "--model_mode",
             "hybrid",
@@ -4129,7 +4129,7 @@ def test_llama_stories_110m(self):
             "16a4w",
             "--temperature",
             "0",
-            "--llama_model",
+            "--decoder_model",
             "stories110m",
             "--model_mode",
             "hybrid",
@@ -4171,6 +4171,65 @@
         if not self.compile_only and not self.enable_x86_64:
             self.assertGreaterEqual(msg["inference_speed"], 220)  # Lanai

+    def test_qwen2_5(self):
+        if not self.required_envs():
+            self.skipTest("missing required envs")
+
+        prompt = "My favourite condiment is "
+        cmds = [
+            "python",
+            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
+            "--artifact",
+            self.artifact_dir,
+            "--build_folder",
+            self.build_folder,
+            "--model",
+            self.model,
+            "--ip",
+            self.ip,
+            "--port",
+            str(self.port),
+            "--prompt",
+            f"{prompt}",
+            "--ptq",
+            "16a8w",
+            "--decoder_model",
+            "qwen2_5",
+            "--model_mode",
+            "hybrid",
+            "--prefill_ar_len",
+            "32",
+            "--max_seq_len",
+            "128",
+        ]
+        if self.compile_only:
+            cmds.extend(["--compile_only"])
+        elif self.device:
+            cmds.extend(["--device", self.device])
+            if self.host:
+                cmds.extend(["--host", self.host])
+        elif self.enable_x86_64:
+            cmds.extend(["--enable_x86_64"])
+        if self.pre_gen_pte:
+            cmds.extend(["--pre_gen_pte", self.pre_gen_pte])
+
+        # Accuracy is bad for now. Just check user's prompt is returned.
+        golden_start_with = "My favourite condiment is "
+        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
+        with Listener((self.ip, self.port)) as listener:
+            conn = listener.accept()
+            p.communicate()
+            msg = json.loads(conn.recv())
+            if "Error" in msg:
+                self.fail(msg["Error"])
+            else:
+                model_out = msg["result"][0]
+                self.assertTrue(
+                    model_out.startswith(golden_start_with),
+                    f"Expected Output: {golden_start_with}. Actual Output: {model_out}",
+                )
+                self.assertGreaterEqual(msg["inference_speed"], 95)  # Lanai
+

 class TestExampleOssScript(TestQNN):
     def test_albert(self):
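Outside the CI harness, the same flags drive the example script directly. The sketch below mirrors the arguments from the new test above; the artifact directory, build folder, SoC model, and device serial are placeholders, and the script accepts further options not shown here.

import subprocess

cmds = [
    "python",
    "examples/qualcomm/oss_scripts/llama/llama.py",  # run from an executorch checkout
    "--artifact", "./qwen2_5_artifacts",             # placeholder output dir
    "--build_folder", "build-android",               # placeholder build folder
    "--model", "SM8650",                             # placeholder SoC model
    "--device", "<adb-serial>",                      # placeholder device serial
    "--prompt", "My favourite condiment is ",
    "--ptq", "16a8w",
    "--decoder_model", "qwen2_5",
    "--model_mode", "hybrid",
    "--prefill_ar_len", "32",
    "--max_seq_len", "128",
]
subprocess.run(cmds, check=True)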
Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+{
+  "dim": 896,
+  "ffn_dim_multiplier": 1,
+  "hidden_dim": 4864,
+  "n_heads": 14,
+  "n_kv_heads": 2,
+  "n_layers": 24,
+  "norm_eps": 1e-06,
+  "rope_theta": 1000000.0,
+  "use_scaled_rope": false,
+  "vocab_size": 151936,
+  "use_hf_rope": true,
+  "attention_qkv_bias": true
+}
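The new 14-line JSON above (its path is not shown in this capture) is the Qwen2.5 0.5B model config. A small sketch, not from this commit, of loading it and deriving the per-head dimension and GQA ratio implied by these fields; the file name below is a placeholder.

import json

# Placeholder file name; use the path added by this commit.
with open("qwen2_5_0_5b_config.json") as f:
    cfg = json.load(f)

head_dim = cfg["dim"] // cfg["n_heads"]          # 896 / 14 = 64
kv_groups = cfg["n_heads"] // cfg["n_kv_heads"]  # 14 / 2 = 7 query heads per KV head
print(head_dim, kv_groups, cfg["vocab_size"])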

examples/qualcomm/CMakeLists.txt

Lines changed: 5 additions & 0 deletions
@@ -60,6 +60,11 @@ target_compile_options(
   full_portable_ops_lib INTERFACE -DET_EVENT_TRACER_ENABLED
 )

+add_subdirectory(
+  ${EXECUTORCH_ROOT}/extension/llm/runner
+  ${CMAKE_CURRENT_BINARY_DIR}/../../extension/llm/runner
+)
+
 # Let files say "include <executorch/path/to/header.h>".
 set(_common_include_directories
   ${EXECUTORCH_ROOT}/..

examples/qualcomm/oss_scripts/llama/CMakeLists.txt

Lines changed: 8 additions & 0 deletions
@@ -4,6 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

+
 # model sharding with custom op
 set(CUSTOM_OP_SRCS_FILE
   "${EXECUTORCH_SOURCE_DIR}/extension/llm/custom_ops/op_fallback.cpp"
@@ -65,12 +66,19 @@ target_link_libraries(
   extension_llm_runner
   extension_module
   extension_tensor
+  tokenizers
   gflags
   custom_ops
   quantized_ops_lib
   quantized_kernels
   tokenizers::tokenizers
 )
+
+target_include_directories(
+  qnn_llama_runner
+  PUBLIC ${EXECUTORCH_ROOT}/extension/llm/tokenizers/include
+)
+
 target_compile_options(qnn_llama_runner PUBLIC ${_common_compile_options})
 set_target_properties(
   qnn_llama_runner PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'"

examples/qualcomm/oss_scripts/llama/README.md

Lines changed: 2 additions & 1 deletion
@@ -1,10 +1,11 @@
 # Summary

 ## Overview
-This file provides you the instructions to run LLAMA model with different parameters via Qualcomm HTP backend. We currently support the following models:
+This file provides you the instructions to run LLM Decoder model with different parameters via Qualcomm HTP backend. We currently support the following models:
 1. LLAMA2 Stories 110M
 2. LLAMA3.2 1B
 3. LLAMA3.2 3B
+4. QWEN2.5 0.5B

 We offer the following modes to execute the model: