Commit 0c4785b

Qualcomm AI Engine Direct - GA QWEN2.5 0.5B
1 parent 695c7d5 commit 0c4785b

File tree

12 files changed (+1170, -64 lines)

backends/qualcomm/quantizer/annotators.py

Lines changed: 40 additions & 11 deletions

@@ -250,7 +250,9 @@ def annotate_masked_fill(node: Node, quantization_config: QuantizationConfig) -> None:
     )


-@register_annotator([torch.ops.aten.mul, torch.ops.aten.mul.Tensor])
+@register_annotator(
+    [torch.ops.aten.mul, torch.ops.aten.mul.Tensor, torch.ops.aten.mul_.Tensor]
+)
 def annotate_mul(node: Node, quantization_config: QuantizationConfig) -> None:
     annotate_binary(node, quantization_config)

@@ -606,9 +608,35 @@ def annotate_select(node: Node, quantization_config: QuantizationConfig) -> None:

 @register_annotator([torch.ops.aten.slice.Tensor])
 def annotate_slice(node: Node, quantization_config: QuantizationConfig) -> None:
+    if _is_annotated([node]) or not _is_float_tensor(node):
+        return
     annotate_single_in_single_out(node, quantization_config)


+@register_annotator([torch.ops.aten.slice_scatter.default])
+def annotate_slice_scatter(node: Node, quantization_config: QuantizationConfig) -> None:
+    if _is_annotated([node]):
+        return
+
+    input_act_qspec = quantization_config.input_activation
+    output_act_qspec = quantization_config.output_activation
+
+    input_qspec_map = {}
+    input_act0 = node.args[0]
+    if isinstance(input_act0, Node):
+        input_qspec_map[input_act0] = input_act_qspec
+
+    input_act1 = node.args[1]
+    if isinstance(input_act1, Node):
+        input_qspec_map[input_act1] = input_act_qspec
+
+    node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation(
+        input_qspec_map=input_qspec_map,
+        output_qspec=output_act_qspec,
+        _annotated=True,
+    )
+
+
 @register_annotator([torch.ops.aten.sqrt.default])
 def annotate_sqrt(node: Node, quantization_config: QuantizationConfig) -> None:
     annotate_single_in_single_out(node, quantization_config)

@@ -801,16 +829,17 @@ def annotate_embedding(node: Node, quantization_config: QuantizationConfig) -> None:

 @register_annotator([torch.ops.aten.index.Tensor])
 def annotate_index(node: Node, quantization_config: QuantizationConfig) -> None:
+    if _is_annotated([node]) or not _is_float_tensor(node):
+        return
     annotate_in_out_obs_sharing_op(node, quantization_config)
-    if not _is_annotated([node]):
-        input_qspec_map = {}
-        input = node.args[0]
-        input_qspec_map[input] = quantization_config.input_activation
-        node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation(
-            input_qspec_map=input_qspec_map,
-            output_qspec=SharedQuantizationSpec((input, node)),
-            _annotated=True,
-        )
+    input_qspec_map = {}
+    input = node.args[0]
+    input_qspec_map[input] = quantization_config.input_activation
+    node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation(
+        input_qspec_map=input_qspec_map,
+        output_qspec=SharedQuantizationSpec((input, node)),
+        _annotated=True,
+    )


 @register_annotator(
@@ -1270,7 +1299,7 @@ def annotate_where(node: Node, quantization_config: QuantizationConfig) -> None:
     )


-@register_annotator([torch.ops.aten.zeros.default])
+@register_annotator([torch.ops.aten.zeros.default, torch.ops.aten.zeros_like.default])
 def annotate_zeros(node: Node, quantization_config: QuantizationConfig) -> None:
     if _is_annotated([node]) or not _is_float_tensor(node):
         return
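
The new annotate_slice_scatter annotator matters for decoder models because KV-cache writes typically lower to aten.slice_scatter.default. Below is a minimal, hypothetical sketch (not part of the commit) of a module that produces the op via torch.export, so the quantizer has a node for the annotator to tag; the module and shapes are illustrative only.

import torch


class KVCacheWrite(torch.nn.Module):
    # Toy stand-in for a KV-cache update: overwrite one slot along the
    # sequence dimension. torch.slice_scatter lowers to
    # torch.ops.aten.slice_scatter.default in the exported graph.
    def forward(self, cache: torch.Tensor, new_kv: torch.Tensor) -> torch.Tensor:
        return torch.slice_scatter(cache, new_kv, dim=2, start=0, end=1)


ep = torch.export.export(
    KVCacheWrite(),
    (torch.zeros(1, 8, 128, 64), torch.randn(1, 8, 1, 64)),
)
# The printed graph contains a call to torch.ops.aten.slice_scatter.default,
# which the Qualcomm quantizer can now tag with input/output activation qspecs.
print(ep.graph_module.code)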

backends/qualcomm/quantizer/custom_annotation.py

Lines changed: 7 additions & 4 deletions

@@ -151,7 +151,9 @@ def annotate_prefill_kv_output(gm: torch.fx.GraphModule, kv_quant_attrs: dict):
     )


-def annotate_matmul_16a8w(gm: torch.fx.GraphModule) -> None:  # noqa: C901
+def annotate_matmul_16a8w(  # noqa: C901
+    gm: torch.fx.GraphModule, annotate_conv=True
+) -> None:
     """
     This function is specific for matmul op 16a8w.
     For k, we will tag such as the below, and
@@ -254,9 +256,10 @@ def annotate_matmul_input1(node: Node):
                 # The arguments of cat op: (the past kv cache, the new kv cache)
                 node = node.args[0][1]
             elif node.target == torch.ops.aten.conv2d.default:
-                annotate_conv2d(
-                    node, quantization_config=quantization_config_8a4w_per_channel
-                )
+                if annotate_conv:
+                    annotate_conv2d(
+                        node, quantization_config=quantization_config_8a4w_per_channel
+                    )
                 break
             elif node.target in [torch.ops.aten.add.Tensor, torch.ops.aten.sub.Tensor]:
                 break
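
The new annotate_conv flag lets callers keep the 16a8w matmul tagging while skipping the 8a4w per-channel conv2d annotation. A hedged usage sketch; the import path and the gm variable (an fx.GraphModule captured for the decoder) are assumptions, not shown in this commit:

from executorch.backends.qualcomm.quantizer.custom_annotation import (
    annotate_matmul_16a8w,
)

# Default behaviour (annotate_conv=True) also tags conv2d at 8a4w per-channel.
annotate_matmul_16a8w(gm)

# Skip the conv2d tagging, e.g. when another custom annotation already
# handles the convolutions.
annotate_matmul_16a8w(gm, annotate_conv=False)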

backends/qualcomm/scripts/build.sh

Lines changed: 4 additions & 0 deletions

@@ -104,6 +104,8 @@ if [ "$BUILD_AARCH64" = true ]; then
     -DANDROID_ABI='arm64-v8a' \
     -DANDROID_PLATFORM=android-30 \
     -DCMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH \
+    -DSUPPORT_REGEX_LOOKAHEAD=ON \
+    -DBUILD_TESTING=OFF \
     -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
     -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \
     -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
@@ -157,6 +159,8 @@ if [ "$BUILD_X86_64" = true ]; then
     -DCMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH \
     -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \
     -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
+    -DSUPPORT_REGEX_LOOKAHEAD=ON \
+    -DBUILD_TESTING=OFF \
     -B$EXAMPLE_ROOT

   cmake --build $EXAMPLE_ROOT -j$BUILD_JOB_NUMBER

examples/qualcomm/CMakeLists.txt

Lines changed: 2 additions & 2 deletions

@@ -77,8 +77,8 @@ target_include_directories(

 # add tokenizers
 add_subdirectory(
-  ${EXECUTORCH_ROOT}/extension/llm/tokenizers
-  ${CMAKE_CURRENT_BINARY_DIR}/../../extension/llm/tokenizers
+  ${EXECUTORCH_ROOT}/extension/llm/runner
+  ${CMAKE_CURRENT_BINARY_DIR}/../../extension/llm/runner
 )

 # build qnn_executor_runner

examples/qualcomm/oss_scripts/llama/CMakeLists.txt

Lines changed: 9 additions & 0 deletions

@@ -4,6 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

+
 # model sharding with custom op
 set(CUSTOM_OP_SRCS_FILE
   "${EXECUTORCH_SOURCE_DIR}/extension/llm/custom_ops/op_fallback.cpp"
@@ -63,14 +64,22 @@ target_link_libraries(
   executorch_core
   extension_data_loader
   extension_flat_tensor
+  extension_llm_runner
   extension_module
   extension_tensor
+  tokenizers
   gflags
   custom_ops
   quantized_ops_lib
   quantized_kernels
   tokenizers
 )
+
+target_include_directories(
+  qnn_llama_runner
+  PUBLIC ${EXECUTORCH_ROOT}/extension/llm/tokenizers/include
+)
+
 target_compile_options(qnn_llama_runner PUBLIC ${_common_compile_options})
 set_target_properties(
   qnn_llama_runner PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'"

examples/qualcomm/oss_scripts/llama/llama.py

Lines changed: 9 additions & 5 deletions

@@ -781,7 +781,7 @@ def permute(w, heads):
     return quant_attrs


-def inference(args, pte_filename, runtime_tokenizer_path, pre_gen_pte=""):
+def inference(args, pte_filename, runtime_tokenizer_path, llama_version):
     workspace = f"/data/local/tmp/{getpass.getuser()}/executorch/single_llama"

     if args.model_mode == "kv":
@@ -794,8 +794,8 @@ def inference(args, pte_filename, runtime_tokenizer_path, pre_gen_pte=""):
         raise RuntimeError(f"Unknown model_mode: {args.model_mode}.")

     pte_path = (
-        f"{pre_gen_pte}/{pte_filename}.pte"
-        if pre_gen_pte
+        f"{args.pre_gen_pte}/{pte_filename}.pte"
+        if args.pre_gen_pte
         else f"{args.artifact}/{pte_filename}.pte"
     )

@@ -836,6 +836,7 @@ def post_process():
         [
             f"export LD_LIBRARY_PATH={qnn_sdk}/lib/{target}/:{args.build_folder}/lib &&",
             f"./{args.build_folder}/examples/qualcomm/oss_scripts/llama/qnn_llama_runner",
+            f"--decoder_model_version {llama_version}",
             f"--tokenizer_path {runtime_tokenizer_path}",
             f"--model_path {pte_path}",
             f"--seq_len {seq_len}",
@@ -857,6 +858,7 @@ def post_process():
         [
             f"cd {workspace} &&",
             f"./qnn_llama_runner",
+            f"--decoder_model_version {llama_version}",
             f"--tokenizer_path {os.path.basename(runtime_tokenizer_path)}",
             f"--model_path {pte_filename}.pte",
             f"--seq_len {seq_len}",
@@ -1090,7 +1092,7 @@ def export_llama(args) -> None:
         raise RuntimeError(f"Unknown model_mode: {args.model_mode}.")

     tokenizer = get_tokenizer(args.tokenizer_model)
-    runtime_tokenizer_path = ""
+    runtime_tokenizer_path, llama_version = "", ""
     if args.llama_model == "stories110m":
         assert isinstance(
             tokenizer, SentencePieceTokenizer
@@ -1099,11 +1101,13 @@ def export_llama(args) -> None:
             args.tokenizer_bin is not None
         ), "Please provide tokenizer_bin for stories110m."
         runtime_tokenizer_path = args.tokenizer_bin
+        llama_version = "llama2"
     elif args.llama_model == "llama3_2":
         assert isinstance(
             tokenizer, TiktokenTokenizer
         ), f"Wrong tokenizer provided for llama3_2."
         runtime_tokenizer_path = args.tokenizer_model
+        llama_version = "llama3"
     else:
         raise RuntimeError(f"Unknown llama_model: {args.llama_model}.")

@@ -1116,7 +1120,7 @@ def export_llama(args) -> None:
         raise RuntimeError(f"Using an unknown kv update {args.kv_updater}")

     if args.pre_gen_pte:
-        inference(args, pte_filename, runtime_tokenizer_path, args.pre_gen_pte)
+        inference(args, pte_filename, runtime_tokenizer_path, llama_version)
         print(f"Finish the running pre_gen_pte from {args.pre_gen_pte}")
         return
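
For reference, a hypothetical sketch of how a Qwen2.5 branch would slot into the version-selection logic above; the actual CLI value, tokenizer type, and assertion used by the commit are not visible in this hunk, so every specific below is an assumption:

    elif args.llama_model == "qwen2_5":  # assumed CLI value for the new decoder
        # Assumption: Qwen2.5 ships a tokenizer file the runtime consumes
        # directly, mirroring the llama3_2 branch above.
        runtime_tokenizer_path = args.tokenizer_model
        llama_version = "qwen2_5"  # assumed string passed to --decoder_model_version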

examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp

Lines changed: 11 additions & 8 deletions

@@ -9,8 +9,8 @@
 /**
  * @file
  *
- * This tool can run Llama2 110M, Llama3.2 1B / 3B(WIP) with Qualcomm AI Engine
- * Direct.
+ * This tool can run Llama2 110M, Llama3.2 1B / 3B, Qwen2.5 0.5B with Qualcomm
+ * AI Engine Direct.
  *
  */

@@ -21,6 +21,7 @@
 #include <fstream>
 #include <vector>

+DEFINE_string(decoder_model_version, "llama2", "The decoder model to execute.");
 DEFINE_string(
     model_path,
     "kv_llama_qnn.pte",
@@ -88,13 +89,14 @@ std::vector<std::string> CollectPrompts(int argc, char** argv) {
 std::string get_formatted_prompt(
     const std::string& prompt,
     const std::string& system_prompt,
-    example::LlamaVersion llama_version) {
+    example::DecoderModelVersion decoder_model_version) {
   std::string formatted_prompt;
-  switch (llama_version) {
-    case example::LlamaVersion::kLlama2:
+  switch (decoder_model_version) {
+    case example::DecoderModelVersion::kLlama2:
+    case example::DecoderModelVersion::kQwen2_5:
       formatted_prompt.append(prompt);
       break;
-    case example::LlamaVersion::kLlama3:
+    case example::DecoderModelVersion::kLlama3:
       if (!system_prompt.empty()) {
         formatted_prompt.append(
             "<|start_header_id|>system<|end_header_id|>\n\n");
@@ -118,6 +120,7 @@ int main(int argc, char** argv) {
   gflags::ParseCommandLineFlags(&argc, &argv, true);
   // create llama runner
   example::Runner runner(
+      FLAGS_decoder_model_version.c_str(),
       FLAGS_model_path.c_str(),
       FLAGS_tokenizer_path.c_str(),
       FLAGS_performance_output_path.c_str(),
@@ -127,7 +130,7 @@ int main(int argc, char** argv) {
       FLAGS_ngram,
       FLAGS_window,
       FLAGS_gcap);
-  auto llama_version = runner.get_llama_version();
+  auto decoder_model_version = runner.get_decoder_model_version();
   std::vector<char> buf;
   buf.reserve(5 * FLAGS_seq_len); // assume each token is around 5 char
   std::ofstream fout(FLAGS_output_path.c_str());
@@ -141,7 +144,7 @@ int main(int argc, char** argv) {
   for (const auto& prompt : prompts) {
     std::string formatted_prompt;
     formatted_prompt = get_formatted_prompt(
-        prompt, FLAGS_system_prompt, llama_version.get());
+        prompt, FLAGS_system_prompt, decoder_model_version.get());
     runner.generate(formatted_prompt.c_str(), FLAGS_seq_len, callback);
   }
 }
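
Putting it together, llama.py assembles the on-device invocation for this runner; below is a minimal sketch of the resulting command, written in the same f-string-list style llama.py uses. The paths, seq_len value, and the "qwen2_5" string for DecoderModelVersion::kQwen2_5 are assumptions for illustration only:

runner_cmd = " ".join(
    [
        "cd /data/local/tmp/executorch/single_llama &&",
        "./qnn_llama_runner",
        "--decoder_model_version qwen2_5",  # assumed value mapped to kQwen2_5
        "--tokenizer_path tokenizer.json",  # assumed tokenizer artifact name
        "--model_path kv_llama_qnn.pte",
        "--seq_len 128",
    ]
)
print(runner_cmd)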

examples/qualcomm/oss_scripts/llama/runner/kv_manager.cpp

Lines changed: 0 additions & 1 deletion

@@ -15,7 +15,6 @@ KVManager::KVManager(KVManagerMode kv_updater, Metadata metadata)
       metadata_.num_layers, std::vector<KVCache>(metadata_.num_heads));
   v_cache_.resize(
       metadata_.num_layers, std::vector<KVCache>(metadata_.num_heads));
-
   // Calculate cache size
   switch (kv_updater_) {
     case KVManagerMode::SMART_MASK: {

0 commit comments
