Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion backends/qualcomm/_passes/layout_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,8 @@ class LayoutTransform(ExportPass):
exir_ops.edge.aten.pow.Tensor_Scalar,
exir_ops.edge.aten.prelu.default,
exir_ops.edge.aten.repeat.default,
exir_ops.edge.aten.round.default,
exir_ops.edge.aten.relu.default,
exir_ops.edge.aten.round.default,
exir_ops.edge.aten.sigmoid.default,
exir_ops.edge.aten.split_with_sizes.default,
exir_ops.edge.aten.split_with_sizes_copy.default,
Expand Down
6 changes: 4 additions & 2 deletions backends/qualcomm/quantizer/annotators.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,9 @@ def annotate_masked_fill(node: Node, quantization_config: QuantizationConfig) ->
)


@register_annotator([torch.ops.aten.mul, torch.ops.aten.mul.Tensor])
@register_annotator(
[torch.ops.aten.mul, torch.ops.aten.mul.Tensor, torch.ops.aten.mul_.Tensor]
)
def annotate_mul(node: Node, quantization_config: QuantizationConfig) -> None:
annotate_binary(node, quantization_config)

Expand Down Expand Up @@ -1298,7 +1300,7 @@ def annotate_where(node: Node, quantization_config: QuantizationConfig) -> None:
)


@register_annotator([torch.ops.aten.zeros.default])
@register_annotator([torch.ops.aten.zeros.default, torch.ops.aten.zeros_like.default])
def annotate_zeros(node: Node, quantization_config: QuantizationConfig) -> None:
if _is_annotated([node]) or not _is_float_tensor(node):
return
Expand Down
11 changes: 7 additions & 4 deletions backends/qualcomm/quantizer/custom_annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,9 @@ def annotate_prefill_kv_output(gm: torch.fx.GraphModule, kv_quant_attrs: dict):
)


def annotate_matmul_16a8w(gm: torch.fx.GraphModule) -> None: # noqa: C901
def annotate_matmul_16a8w( # noqa: C901
gm: torch.fx.GraphModule, annotate_conv=True
) -> None:
"""
This function is specific for matmul op 16a8w.
For k, we will tag such as the below, and
Expand Down Expand Up @@ -317,9 +319,10 @@ def annotate_matmul_input1(node: Node):
# The arguments of cat op: (the past kv cache, the new kv cache)
node = node.args[0][1]
elif node.target == torch.ops.aten.conv2d.default:
annotate_conv2d(
node, quantization_config=quantization_config_8a4w_per_channel
)
if annotate_conv:
annotate_conv2d(
node, quantization_config=quantization_config_8a4w_per_channel
)
break
elif node.target in [torch.ops.aten.add.Tensor, torch.ops.aten.sub.Tensor]:
break
Expand Down
8 changes: 8 additions & 0 deletions backends/qualcomm/scripts/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ if [ "$BUILD_AARCH64" = true ]; then
-DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
-DEXECUTORCH_ENABLE_LOGGING=ON \
-DQNN_SDK_ROOT=$QNN_SDK_ROOT \
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \
-DANDROID_ABI='arm64-v8a' \
Expand All @@ -104,6 +105,9 @@ if [ "$BUILD_AARCH64" = true ]; then
-DANDROID_ABI='arm64-v8a' \
-DANDROID_PLATFORM=android-30 \
-DCMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH \
-DSUPPORT_REGEX_LOOKAHEAD=ON \
-DBUILD_TESTING=OFF \
-DEXECUTORCH_ENABLE_LOGGING=ON \
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \
-DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
Expand Down Expand Up @@ -134,6 +138,7 @@ if [ "$BUILD_X86_64" = true ]; then
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
-DEXECUTORCH_ENABLE_LOGGING=ON \
-DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
-S $PRJ_ROOT \
-B $BUILD_ROOT \
Expand All @@ -157,6 +162,9 @@ if [ "$BUILD_X86_64" = true ]; then
-DCMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH \
-DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \
-DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
-DSUPPORT_REGEX_LOOKAHEAD=ON \
-DBUILD_TESTING=OFF \
-DEXECUTORCH_ENABLE_LOGGING=ON \
-B$EXAMPLE_ROOT

cmake --build $EXAMPLE_ROOT -j$BUILD_JOB_NUMBER
Expand Down
63 changes: 61 additions & 2 deletions backends/qualcomm/tests/test_qnn_delegate.py
Original file line number Diff line number Diff line change
Expand Up @@ -3999,7 +3999,7 @@ def test_llama3_2_1b(self):
"16a4w",
"--temperature",
"0",
"--llama_model",
"--decoder_model",
"llama3_2",
"--model_mode",
"hybrid",
Expand Down Expand Up @@ -4079,7 +4079,7 @@ def test_llama_stories_110m(self):
"16a4w",
"--temperature",
"0",
"--llama_model",
"--decoder_model",
"stories110m",
"--model_mode",
"hybrid",
Expand Down Expand Up @@ -4121,6 +4121,65 @@ def test_llama_stories_110m(self):
if not self.compile_only and not self.enable_x86_64:
self.assertGreaterEqual(msg["inference_speed"], 220) # Lanai

def test_qwen2_5(self):
    """Run the Qwen2.5 decoder through the Qualcomm ``llama.py`` example.

    The example script is launched as a subprocess in hybrid mode with
    16a8w PTQ, and reports back a JSON message over a ``Listener`` bound to
    ``(self.ip, self.port)``. The test fails if the message carries an
    ``"Error"`` key; otherwise it checks that the generated text starts
    with the prompt.

    The throughput threshold is only enforced when the model actually ran
    on device (neither ``compile_only`` nor ``enable_x86_64``), matching
    the guard used by the other decoder tests in this file.
    """
    if not self.required_envs():
        self.skipTest("missing required envs")

    prompt = "My favourite condiment is "
    cmds = [
        "python",
        f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
        "--artifact",
        self.artifact_dir,
        "--build_folder",
        self.build_folder,
        "--model",
        self.model,
        "--ip",
        self.ip,
        "--port",
        str(self.port),
        "--prompt",
        prompt,
        "--ptq",
        "16a8w",
        "--decoder_model",
        "qwen2_5",
        "--model_mode",
        "hybrid",
        "--prefill_ar_len",
        "32",
        "--max_seq_len",
        "128",
    ]
    if self.compile_only:
        cmds.extend(["--compile_only"])
    elif self.device:
        cmds.extend(["--device", self.device])
    if self.host:
        cmds.extend(["--host", self.host])
    elif self.enable_x86_64:
        cmds.extend(["--enable_x86_64"])
    if self.pre_gen_pte:
        cmds.extend(["--pre_gen_pte", self.pre_gen_pte])

    # Accuracy is not good enough yet to compare against a golden answer,
    # so only verify that the output echoes the user's prompt.
    golden_start_with = prompt
    p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
    with Listener((self.ip, self.port)) as listener:
        conn = listener.accept()
        p.communicate()
        msg = json.loads(conn.recv())
        if "Error" in msg:
            self.fail(msg["Error"])
        else:
            # NOTE(review): the "result" check may also need a
            # `compile_only` guard if the script sends no generated text
            # in that mode - confirm against llama.py.
            model_out = msg["result"][0]
            self.assertTrue(
                model_out.startswith(golden_start_with),
                f"Expected Output: {golden_start_with}. Actual Output: {model_out}",
            )
            # The speed threshold is calibrated for on-device runs only.
            if not self.compile_only and not self.enable_x86_64:
                self.assertGreaterEqual(msg["inference_speed"], 95)  # Lanai


class TestExampleOssScript(TestQNN):
def test_albert(self):
Expand Down
4 changes: 2 additions & 2 deletions examples/qualcomm/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,8 @@ target_include_directories(

# add tokenizers
add_subdirectory(
${EXECUTORCH_ROOT}/extension/llm/tokenizers
${CMAKE_CURRENT_BINARY_DIR}/../../extension/llm/tokenizers
${EXECUTORCH_ROOT}/extension/llm/runner
${CMAKE_CURRENT_BINARY_DIR}/../../extension/llm/runner
)

# build qnn_executor_runner
Expand Down
9 changes: 9 additions & 0 deletions examples/qualcomm/oss_scripts/llama/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.


# model sharding with custom op
set(CUSTOM_OP_SRCS_FILE
"${EXECUTORCH_SOURCE_DIR}/extension/llm/custom_ops/op_fallback.cpp"
Expand Down Expand Up @@ -63,14 +64,22 @@ target_link_libraries(
executorch_core
extension_data_loader
extension_flat_tensor
extension_llm_runner
extension_module
extension_tensor
tokenizers
gflags
custom_ops
quantized_ops_lib
quantized_kernels
tokenizers
)

target_include_directories(
qnn_llama_runner
PUBLIC ${EXECUTORCH_ROOT}/extension/llm/tokenizers/include
)

target_compile_options(qnn_llama_runner PUBLIC ${_common_compile_options})
set_target_properties(
qnn_llama_runner PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'"
Expand Down
3 changes: 2 additions & 1 deletion examples/qualcomm/oss_scripts/llama/README.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
# Summary

## Overview
This file provides you the instructions to run LLAMA model with different parameters via Qualcomm HTP backend. We currently support the following models:
This file provides instructions for running LLM decoder models with different parameters via the Qualcomm HTP backend. We currently support the following models:
1. LLAMA2 Stories 110M
2. LLAMA3.2 1B
3. LLAMA3.2 3B
4. QWEN2.5 0.5B

We offer the following modes to execute the model:

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Copyright (c) Qualcomm Innovation Center, Inc.
# All rights reserved
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.


def convert_configs(config):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you share where this config comes from, and how we can scale this to more models?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The config can be found here:
https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2/configuration_qwen2.py.
So far I have checked Qwen and Gemma, and most configs appear to follow the same naming, since they all use PretrainedConfig as the base class. Ideally, this function should be able to support most HF decoder models, but I will need to test them one by one to confirm that they can all be supported using this function.

# HF config keys are different from Llama configs.
# Convert the config keys to align with Llama.
# Each rename below copies the value onto the Llama-style attribute and then
# deletes the HF-style attribute, so only the new name remains on `config`.
# Attributes that are not present on `config` are skipped.
# hidden_size -> dim
if hasattr(config, "hidden_size"):
config.dim = config.hidden_size
delattr(config, "hidden_size")

# num_attention_heads -> n_heads
if hasattr(config, "num_attention_heads"):
config.n_heads = config.num_attention_heads
delattr(config, "num_attention_heads")

# num_key_value_heads -> n_kv_heads
if hasattr(config, "num_key_value_heads"):
config.n_kv_heads = config.num_key_value_heads
delattr(config, "num_key_value_heads")

# rms_norm_eps -> norm_eps
if hasattr(config, "rms_norm_eps"):
config.norm_eps = config.rms_norm_eps
delattr(config, "rms_norm_eps")

# rope_theta -> rope_freq_base
if hasattr(config, "rope_theta"):
config.rope_freq_base = config.rope_theta
delattr(config, "rope_theta")

# num_hidden_layers -> n_layers
if hasattr(config, "num_hidden_layers"):
config.n_layers = config.num_hidden_layers
delattr(config, "num_hidden_layers")

# intermediate_size -> hidden_dim
if hasattr(config, "intermediate_size"):
config.hidden_dim = config.intermediate_size
delattr(config, "intermediate_size")

# Unlike the renames above, `rope_scaling` is left on the config; its raw
# value (not coerced to bool) is stored in `use_scaled_rope`.
if hasattr(config, "rope_scaling"):
config.use_scaled_rope = config.rope_scaling
# Use default value of precompute_freq_cis
# Only filled in when the config does not already carry `rope_scale_factor`.
if not hasattr(config, "rope_scale_factor"):
config.rope_scale_factor = 4

# The same `config` object is mutated in place and returned.
return config
Loading
Loading