Commit 0c4785b

Qualcomm AI Engine Direct - GA QWEN2.5 0.5B
1 parent 695c7d5 commit 0c4785b

File tree

12 files changed (+1170, -64 lines)

backends/qualcomm/quantizer/annotators.py

Lines changed: 40 additions & 11 deletions

@@ -250,7 +250,9 @@ def annotate_masked_fill(node: Node, quantization_config: QuantizationConfig) -> None:
     )


-@register_annotator([torch.ops.aten.mul, torch.ops.aten.mul.Tensor])
+@register_annotator(
+    [torch.ops.aten.mul, torch.ops.aten.mul.Tensor, torch.ops.aten.mul_.Tensor]
+)
 def annotate_mul(node: Node, quantization_config: QuantizationConfig) -> None:
     annotate_binary(node, quantization_config)

@@ -606,9 +608,35 @@ def annotate_select(node: Node, quantization_config: QuantizationConfig) -> None:

 @register_annotator([torch.ops.aten.slice.Tensor])
 def annotate_slice(node: Node, quantization_config: QuantizationConfig) -> None:
+    if _is_annotated([node]) or not _is_float_tensor(node):
+        return
     annotate_single_in_single_out(node, quantization_config)


+@register_annotator([torch.ops.aten.slice_scatter.default])
+def annotate_slice_scatter(node: Node, quantization_config: QuantizationConfig) -> None:
+    if _is_annotated([node]):
+        return
+
+    input_act_qspec = quantization_config.input_activation
+    output_act_qspec = quantization_config.output_activation
+
+    input_qspec_map = {}
+    input_act0 = node.args[0]
+    if isinstance(input_act0, Node):
+        input_qspec_map[input_act0] = input_act_qspec
+
+    input_act1 = node.args[1]
+    if isinstance(input_act1, Node):
+        input_qspec_map[input_act1] = input_act_qspec
+
+    node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation(
+        input_qspec_map=input_qspec_map,
+        output_qspec=output_act_qspec,
+        _annotated=True,
+    )
+
+
 @register_annotator([torch.ops.aten.sqrt.default])
 def annotate_sqrt(node: Node, quantization_config: QuantizationConfig) -> None:
     annotate_single_in_single_out(node, quantization_config)

@@ -801,16 +829,17 @@ def annotate_embedding(node: Node, quantization_config: QuantizationConfig) -> None:

 @register_annotator([torch.ops.aten.index.Tensor])
 def annotate_index(node: Node, quantization_config: QuantizationConfig) -> None:
+    if _is_annotated([node]) or not _is_float_tensor(node):
+        return
     annotate_in_out_obs_sharing_op(node, quantization_config)
-    if not _is_annotated([node]):
-        input_qspec_map = {}
-        input = node.args[0]
-        input_qspec_map[input] = quantization_config.input_activation
-        node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation(
-            input_qspec_map=input_qspec_map,
-            output_qspec=SharedQuantizationSpec((input, node)),
-            _annotated=True,
-        )
+    input_qspec_map = {}
+    input = node.args[0]
+    input_qspec_map[input] = quantization_config.input_activation
+    node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation(
+        input_qspec_map=input_qspec_map,
+        output_qspec=SharedQuantizationSpec((input, node)),
+        _annotated=True,
+    )


 @register_annotator(
@@ -1270,7 +1299,7 @@ def annotate_where(node: Node, quantization_config: QuantizationConfig) -> None:
     )


-@register_annotator([torch.ops.aten.zeros.default])
+@register_annotator([torch.ops.aten.zeros.default, torch.ops.aten.zeros_like.default])
 def annotate_zeros(node: Node, quantization_config: QuantizationConfig) -> None:
     if _is_annotated([node]) or not _is_float_tensor(node):
         return
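
The new annotate_slice_scatter annotator matters for decoder models because KV-cache writes typically lower to aten.slice_scatter.default. Below is a minimal, hypothetical sketch (not part of the commit) of a module that produces the op via torch.export, so the quantizer has a node for the annotator to tag; the module and shapes are illustrative only.

import torch


class KVCacheWrite(torch.nn.Module):
    # Toy stand-in for a KV-cache update: overwrite one slot along the
    # sequence dimension. torch.slice_scatter lowers to
    # torch.ops.aten.slice_scatter.default in the exported graph.
    def forward(self, cache: torch.Tensor, new_kv: torch.Tensor) -> torch.Tensor:
        return torch.slice_scatter(cache, new_kv, dim=2, start=0, end=1)


ep = torch.export.export(
    KVCacheWrite(),
    (torch.zeros(1, 8, 128, 64), torch.randn(1, 8, 1, 64)),
)
# The printed graph contains a call to torch.ops.aten.slice_scatter.default,
# which the Qualcomm quantizer can now tag with input/output activation qspecs.
print(ep.graph_module.code)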

backends/qualcomm/quantizer/custom_annotation.py

Lines changed: 7 additions & 4 deletions

@@ -151,7 +151,9 @@ def annotate_prefill_kv_output(gm: torch.fx.GraphModule, kv_quant_attrs: dict):
     )


-def annotate_matmul_16a8w(gm: torch.fx.GraphModule) -> None:  # noqa: C901
+def annotate_matmul_16a8w(  # noqa: C901
+    gm: torch.fx.GraphModule, annotate_conv=True
+) -> None:
     """
     This function is specific for matmul op 16a8w.
     For k, we will tag such as the below, and
@@ -254,9 +256,10 @@ def annotate_matmul_input1(node: Node):
                 # The arguments of cat op: (the past kv cache, the new kv cache)
                 node = node.args[0][1]
             elif node.target == torch.ops.aten.conv2d.default:
-                annotate_conv2d(
-                    node, quantization_config=quantization_config_8a4w_per_channel
-                )
+                if annotate_conv:
+                    annotate_conv2d(
+                        node, quantization_config=quantization_config_8a4w_per_channel
+                    )
                 break
             elif node.target in [torch.ops.aten.add.Tensor, torch.ops.aten.sub.Tensor]:
                 break
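
The new annotate_conv flag lets callers keep the 16a8w matmul tagging while skipping the 8a4w per-channel conv2d annotation. A hedged usage sketch; the import path and the gm variable (an fx.GraphModule captured for the decoder) are assumptions, not shown in this commit:

from executorch.backends.qualcomm.quantizer.custom_annotation import (
    annotate_matmul_16a8w,
)

# Default behaviour (annotate_conv=True) also tags conv2d at 8a4w per-channel.
annotate_matmul_16a8w(gm)

# Skip the conv2d tagging, e.g. when another custom annotation already
# handles the convolutions.
annotate_matmul_16a8w(gm, annotate_conv=False)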

backends/qualcomm/scripts/build.sh

Lines changed: 4 additions & 0 deletions

@@ -104,6 +104,8 @@ if [ "$BUILD_AARCH64" = true ]; then
     -DANDROID_ABI='arm64-v8a' \
     -DANDROID_PLATFORM=android-30 \
     -DCMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH \
+    -DSUPPORT_REGEX_LOOKAHEAD=ON \
+    -DBUILD_TESTING=OFF \
     -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
     -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \
     -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
@@ -157,6 +159,8 @@ if [ "$BUILD_X86_64" = true ]; then
     -DCMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH \
     -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \
     -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
+    -DSUPPORT_REGEX_LOOKAHEAD=ON \
+    -DBUILD_TESTING=OFF \
     -B$EXAMPLE_ROOT

   cmake --build $EXAMPLE_ROOT -j$BUILD_JOB_NUMBER

examples/qualcomm/CMakeLists.txt

Lines changed: 2 additions & 2 deletions

@@ -77,8 +77,8 @@ target_include_directories(

 # add tokenizers
 add_subdirectory(
-  ${EXECUTORCH_ROOT}/extension/llm/tokenizers
-  ${CMAKE_CURRENT_BINARY_DIR}/../../extension/llm/tokenizers
+  ${EXECUTORCH_ROOT}/extension/llm/runner
+  ${CMAKE_CURRENT_BINARY_DIR}/../../extension/llm/runner
 )

 # build qnn_executor_runner

examples/qualcomm/oss_scripts/llama/CMakeLists.txt

Lines changed: 9 additions & 0 deletions

@@ -4,6 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

+
 # model sharding with custom op
 set(CUSTOM_OP_SRCS_FILE
   "${EXECUTORCH_SOURCE_DIR}/extension/llm/custom_ops/op_fallback.cpp"
@@ -63,14 +64,22 @@ target_link_libraries(
   executorch_core
   extension_data_loader
   extension_flat_tensor
+  extension_llm_runner
   extension_module
   extension_tensor
+  tokenizers
   gflags
   custom_ops
   quantized_ops_lib
   quantized_kernels
   tokenizers
 )
+
+target_include_directories(
+  qnn_llama_runner
+  PUBLIC ${EXECUTORCH_ROOT}/extension/llm/tokenizers/include
+)
+
 target_compile_options(qnn_llama_runner PUBLIC ${_common_compile_options})
 set_target_properties(
   qnn_llama_runner PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'"

examples/qualcomm/oss_scripts/llama/llama.py

Lines changed: 9 additions & 5 deletions

@@ -781,7 +781,7 @@ def permute(w, heads):
     return quant_attrs


-def inference(args, pte_filename, runtime_tokenizer_path, pre_gen_pte=""):
+def inference(args, pte_filename, runtime_tokenizer_path, llama_version):
     workspace = f"/data/local/tmp/{getpass.getuser()}/executorch/single_llama"

     if args.model_mode == "kv":
@@ -794,8 +794,8 @@ def inference(args, pte_filename, runtime_tokenizer_path, pre_gen_pte=""):
         raise RuntimeError(f"Unknown model_mode: {args.model_mode}.")

     pte_path = (
-        f"{pre_gen_pte}/{pte_filename}.pte"
-        if pre_gen_pte
+        f"{args.pre_gen_pte}/{pte_filename}.pte"
+        if args.pre_gen_pte
         else f"{args.artifact}/{pte_filename}.pte"
     )

@@ -836,6 +836,7 @@ def post_process():
         [
             f"export LD_LIBRARY_PATH={qnn_sdk}/lib/{target}/:{args.build_folder}/lib &&",
             f"./{args.build_folder}/examples/qualcomm/oss_scripts/llama/qnn_llama_runner",
+            f"--decoder_model_version {llama_version}",
             f"--tokenizer_path {runtime_tokenizer_path}",
             f"--model_path {pte_path}",
             f"--seq_len {seq_len}",
@@ -857,6 +858,7 @@ def post_process():
         [
             f"cd {workspace} &&",
             f"./qnn_llama_runner",
+            f"--decoder_model_version {llama_version}",
             f"--tokenizer_path {os.path.basename(runtime_tokenizer_path)}",
             f"--model_path {pte_filename}.pte",
             f"--seq_len {seq_len}",
@@ -1090,7 +1092,7 @@ def export_llama(args) -> None:
         raise RuntimeError(f"Unknown model_mode: {args.model_mode}.")

     tokenizer = get_tokenizer(args.tokenizer_model)
-    runtime_tokenizer_path = ""
+    runtime_tokenizer_path, llama_version = "", ""
     if args.llama_model == "stories110m":
         assert isinstance(
             tokenizer, SentencePieceTokenizer
@@ -1099,11 +1101,13 @@ def export_llama(args) -> None:
             args.tokenizer_bin is not None
         ), "Please provide tokenizer_bin for stories110m."
         runtime_tokenizer_path = args.tokenizer_bin
+        llama_version = "llama2"
     elif args.llama_model == "llama3_2":
         assert isinstance(
             tokenizer, TiktokenTokenizer
         ), f"Wrong tokenizer provided for llama3_2."
         runtime_tokenizer_path = args.tokenizer_model
+        llama_version = "llama3"
     else:
         raise RuntimeError(f"Unknown llama_model: {args.llama_model}.")

@@ -1116,7 +1120,7 @@ def export_llama(args) -> None:
         raise RuntimeError(f"Using an unknown kv update {args.kv_updater}")

     if args.pre_gen_pte:
-        inference(args, pte_filename, runtime_tokenizer_path, args.pre_gen_pte)
+        inference(args, pte_filename, runtime_tokenizer_path, llama_version)
         print(f"Finish the running pre_gen_pte from {args.pre_gen_pte}")
         return
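
For reference, a hypothetical sketch of how a Qwen2.5 branch would slot into the version-selection logic above; the actual CLI value, tokenizer type, and assertion used by the commit are not visible in this hunk, so every specific below is an assumption:

    elif args.llama_model == "qwen2_5":  # assumed CLI value for the new decoder
        # Assumption: Qwen2.5 ships a tokenizer file the runtime consumes
        # directly, mirroring the llama3_2 branch above.
        runtime_tokenizer_path = args.tokenizer_model
        llama_version = "qwen2_5"  # assumed string passed to --decoder_model_version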

examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp

Lines changed: 11 additions & 8 deletions

@@ -9,8 +9,8 @@
 /**
  * @file
  *
- * This tool can run Llama2 110M, Llama3.2 1B / 3B(WIP) with Qualcomm AI Engine
- * Direct.
+ * This tool can run Llama2 110M, Llama3.2 1B / 3B, Qwen2.5 0.5B with Qualcomm
+ * AI Engine Direct.
  *
  */

@@ -21,6 +21,7 @@
 #include <fstream>
 #include <vector>

+DEFINE_string(decoder_model_version, "llama2", "The decoder model to execute.");
 DEFINE_string(
     model_path,
     "kv_llama_qnn.pte",
@@ -88,13 +89,14 @@ std::vector<std::string> CollectPrompts(int argc, char** argv) {
 std::string get_formatted_prompt(
     const std::string& prompt,
     const std::string& system_prompt,
-    example::LlamaVersion llama_version) {
+    example::DecoderModelVersion decoder_model_version) {
   std::string formatted_prompt;
-  switch (llama_version) {
-    case example::LlamaVersion::kLlama2:
+  switch (decoder_model_version) {
+    case example::DecoderModelVersion::kLlama2:
+    case example::DecoderModelVersion::kQwen2_5:
       formatted_prompt.append(prompt);
       break;
-    case example::LlamaVersion::kLlama3:
+    case example::DecoderModelVersion::kLlama3:
       if (!system_prompt.empty()) {
         formatted_prompt.append(
             "<|start_header_id|>system<|end_header_id|>\n\n");
@@ -118,6 +120,7 @@ int main(int argc, char** argv) {
   gflags::ParseCommandLineFlags(&argc, &argv, true);
   // create llama runner
   example::Runner runner(
+      FLAGS_decoder_model_version.c_str(),
       FLAGS_model_path.c_str(),
       FLAGS_tokenizer_path.c_str(),
       FLAGS_performance_output_path.c_str(),
@@ -127,7 +130,7 @@ int main(int argc, char** argv) {
       FLAGS_ngram,
       FLAGS_window,
       FLAGS_gcap);
-  auto llama_version = runner.get_llama_version();
+  auto decoder_model_version = runner.get_decoder_model_version();
   std::vector<char> buf;
   buf.reserve(5 * FLAGS_seq_len); // assume each token is around 5 char
   std::ofstream fout(FLAGS_output_path.c_str());
@@ -141,7 +144,7 @@ int main(int argc, char** argv) {
   for (const auto& prompt : prompts) {
     std::string formatted_prompt;
     formatted_prompt = get_formatted_prompt(
-        prompt, FLAGS_system_prompt, llama_version.get());
+        prompt, FLAGS_system_prompt, decoder_model_version.get());
     runner.generate(formatted_prompt.c_str(), FLAGS_seq_len, callback);
   }
 }
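
Putting it together, llama.py assembles the on-device invocation for this runner; below is a minimal sketch of the resulting command, written in the same f-string-list style llama.py uses. The paths, seq_len value, and the "qwen2_5" string for DecoderModelVersion::kQwen2_5 are assumptions for illustration only:

runner_cmd = " ".join(
    [
        "cd /data/local/tmp/executorch/single_llama &&",
        "./qnn_llama_runner",
        "--decoder_model_version qwen2_5",  # assumed value mapped to kQwen2_5
        "--tokenizer_path tokenizer.json",  # assumed tokenizer artifact name
        "--model_path kv_llama_qnn.pte",
        "--seq_len 128",
    ]
)
print(runner_cmd)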

examples/qualcomm/oss_scripts/llama/runner/kv_manager.cpp

Lines changed: 0 additions & 1 deletion

@@ -15,7 +15,6 @@ KVManager::KVManager(KVManagerMode kv_updater, Metadata metadata)
       metadata_.num_layers, std::vector<KVCache>(metadata_.num_heads));
   v_cache_.resize(
       metadata_.num_layers, std::vector<KVCache>(metadata_.num_heads));
-
   // Calculate cache size
   switch (kv_updater_) {
     case KVManagerMode::SMART_MASK: {

0 commit comments
