diff --git a/backends/qualcomm/_passes/layout_transform.py b/backends/qualcomm/_passes/layout_transform.py index 67073afc042..13175fe41bd 100644 --- a/backends/qualcomm/_passes/layout_transform.py +++ b/backends/qualcomm/_passes/layout_transform.py @@ -63,6 +63,7 @@ class LayoutTransform(ExportPass): exir_ops.edge.aten.abs.default, exir_ops.edge.aten.add.Tensor, exir_ops.edge.aten.amax.default, + exir_ops.edge.aten.amin.default, exir_ops.edge.aten.atan.default, exir_ops.edge.aten.bitwise_or.Tensor, exir_ops.edge.aten.bmm.default, diff --git a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp index 67e6775f451..2511cd96636 100644 --- a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp +++ b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp @@ -7,6 +7,7 @@ */ #include #include +#include "QnnSdkBuildId.h" namespace py = pybind11; namespace executorch { @@ -15,10 +16,27 @@ namespace qnn { using executorch::runtime::Error; +std::string GetQnnSdkBuildId(std::string library_path) { + QnnImplementation qnn_loaded_backend = QnnImplementation(library_path); + ET_CHECK_MSG( + qnn_loaded_backend.Load(nullptr) == Error::Ok, + "Fail to load Qnn library"); + const char* id = nullptr; + // Safe to call any time, backend does not have to be created. + Qnn_ErrorHandle_t err = + qnn_loaded_backend.GetQnnInterface().qnn_backend_get_build_id(&id); + if (err != QNN_SUCCESS || id == nullptr) { + throw std::runtime_error("Failed to get QNN backend build ID"); + } + qnn_loaded_backend.TerminateAllBackends(); + return std::string(id); +} + PYBIND11_MODULE(PyQnnManagerAdaptor, m) { // TODO: Add related documents for configurations listed below using namespace qnn_delegate; + m.def("GetQnnSdkBuildId", &GetQnnSdkBuildId); py::class_(m, "QnnExecuTorchContextBinary") .def(py::init<>()); diff --git a/backends/qualcomm/builders/__init__.py b/backends/qualcomm/builders/__init__.py index 709df7006f8..62e8e476257 100644 --- a/backends/qualcomm/builders/__init__.py +++ b/backends/qualcomm/builders/__init__.py @@ -10,6 +10,7 @@ op_adaptive_avg_pool2d, op_add, op_amax, + op_amin, op_and, op_arange, op_argmax, @@ -106,6 +107,7 @@ op_adaptive_avg_pool2d, op_add, op_amax, + op_amin, op_and, op_arange, op_argmax, diff --git a/backends/qualcomm/builders/op_amin.py b/backends/qualcomm/builders/op_amin.py new file mode 100644 index 00000000000..9f8f17b4e37 --- /dev/null +++ b/backends/qualcomm/builders/op_amin.py @@ -0,0 +1,85 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+from typing import cast, Dict, List + +import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper + +import numpy as np + +import torch +from executorch.backends.qualcomm.utils.constants import QCOM_AXIS_ORDER, QCOM_DATA + +from .node_visitor import NodeVisitor +from .node_visitor_manager import register_node_visitor +from .qnn_constants import OpReduceMin, QNN_OP_PACKAGE_NAME_QTI_AISW + + +@register_node_visitor +class AMin(NodeVisitor): + target = ["aten.amin.default"] + + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper], + ) -> PyQnnWrapper.PyQnnOpWrapper: + input_node = self.get_node(node.args[0]) + input_tensor = self.get_tensor(input_node, node) + input_tensor_wrapper = self.define_tensor( + input_node, + node, + input_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + ) + + # mean dims and keep dims + mean_dims = cast(List[int], node.args[1]) + mean_dims = [ + mean_dim % len(input_node.meta["val"].shape) for mean_dim in mean_dims + ] + if QCOM_AXIS_ORDER in node.meta: + mean_dims = [ + node.meta[QCOM_AXIS_ORDER].index(mean_dim) for mean_dim in mean_dims + ] + mean_dims_shape = [len(mean_dims)] + + output_tensor = self.get_tensor(node, node) + output_tensor_wrapper = self.define_tensor( + node, + node, + output_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + ) + + reduce_min_op = PyQnnWrapper.PyQnnOpWrapper( + node.name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + OpReduceMin.op_name, + ) + reduce_min_op.AddInputTensors([input_tensor_wrapper]) + reduce_min_op.AddOutputTensors([output_tensor_wrapper]) + reduce_min_op.AddTensorParam( + OpReduceMin.param_axes, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32, + len(mean_dims_shape), + mean_dims_shape, + np.array(mean_dims, dtype=np.uint32), + True, + ) + if len(node.args) > 2: + keep_dims = cast(bool, node.args[2]) + reduce_min_op.AddScalarParam( + OpReduceMin.param_keep_dims, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_BOOL_8, + {QCOM_DATA: keep_dims}, + ) + + return reduce_min_op diff --git a/backends/qualcomm/quantizer/annotators.py b/backends/qualcomm/quantizer/annotators.py index 21084a19eff..38a8bc6ebe6 100644 --- a/backends/qualcomm/quantizer/annotators.py +++ b/backends/qualcomm/quantizer/annotators.py @@ -217,6 +217,11 @@ def annotate_argmax(node: Node, quantization_config: QuantizationConfig) -> None annotate_single_in(node, quantization_config) +@register_annotator([torch.ops.aten.amin.default]) +def annotate_amin(node: Node, quantization_config: QuantizationConfig) -> None: + annotate_binary(node, quantization_config) + + @register_annotator([torch.ops.aten.argmin.default]) def annotate_argmin(node: Node, quantization_config: QuantizationConfig) -> None: annotate_single_in(node, quantization_config) diff --git a/backends/qualcomm/runtime/backends/QnnFunctionInterface.h b/backends/qualcomm/runtime/backends/QnnFunctionInterface.h index 12a6be36b64..548c363f388 100644 --- a/backends/qualcomm/runtime/backends/QnnFunctionInterface.h +++ b/backends/qualcomm/runtime/backends/QnnFunctionInterface.h @@ -32,6 +32,7 @@ class QnnInterface { // --------- QnnBackend --------- DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate); + DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_build_id, backendGetBuildId); DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree); DEFINE_SHIM_FUNCTION_INTERFACE( 
backend_register_op_package, diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py index b091819f0ff..988665c6583 100644 --- a/backends/qualcomm/tests/models.py +++ b/backends/qualcomm/tests/models.py @@ -102,6 +102,16 @@ def forward(self, x): return torch.amax(x, dim=self.dim, keepdim=self.keepdim) +class AMin(torch.nn.Module): + def __init__(self, dim=None, keepdim=False): + super().__init__() + self.dim = dim + self.keepdim = keepdim + + def forward(self, x): + return torch.amin(x, dim=self.dim, keepdim=self.keepdim) + + class Arange(torch.nn.Module): def __init__(self, start, end, step, dtype): super().__init__() @@ -1155,6 +1165,17 @@ def forward(self, attn_mask): ) +class MaskedSoftmax(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, attention_mask, input): + attn_weights = torch.where( + attention_mask == 0, input, torch.amin(input, dim=3, keepdim=True) + (-20) + ) + return torch.nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32) + + class MaxDim(torch.nn.Module): def __init__(self): super().__init__() diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 51480661bd3..1f06a39343b 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -46,6 +46,7 @@ from_context_binary, generate_htp_compiler_spec, generate_qnn_executorch_compiler_spec, + is_qnn_sdk_version_less_than, PyQnnManagerAdaptor, rewrite_prepared_observer, skip_annotation, @@ -53,10 +54,6 @@ update_spill_fill_size, ) -from executorch.examples.models.llama.llama_transformer import MOEFeedForward - -from executorch.examples.models.llama.model_args import ModelArgs - from executorch.examples.qualcomm.utils import ( make_quantizer, setup_common_args_and_variables, @@ -136,6 +133,13 @@ def test_qnn_backend_amax(self): with self.subTest(i=i): self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_amin(self): + modules = [AMin(dim=1, keepdim=False), AMin(dim=1, keepdim=True)] # noqa: F405 + sample_input = (torch.randn(4, 4),) + for i, module in enumerate(modules): + with self.subTest(i=i): + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_any(self): modules = [Any(), Any(dim=[0, 1]), Any(dim=1, keepdim=True)] # noqa: F405 sample_input = (torch.randn(3, 3, 3) > 0,) @@ -1227,6 +1231,9 @@ def test_qnn_backend_lift_add_tensor(self): @unittest.skip("Fail because of bad accuracy") def test_qnn_backend_moe_feed_forward(self): + from executorch.examples.models.llama.llama_transformer import MOEFeedForward + from executorch.examples.models.llama.model_args import ModelArgs + args = ModelArgs() args.dim = 32 args.n_heads = 8 @@ -1421,6 +1428,14 @@ def test_qnn_backend_amax(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_amin(self): + modules = [AMin(dim=1, keepdim=False), AMin(dim=1, keepdim=True)] # noqa: F405 + sample_input = (torch.randn(4, 4),) + for i, module in enumerate(modules): + with self.subTest(i=i): + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_any(self): modules = [Any(), Any(dim=[0, 1]), Any(dim=1, keepdim=True)] # noqa: F405 sample_input = (torch.randn(3, 3, 3) > 0,) @@ -2643,8 +2658,57 @@ def test_qnn_backend_einsum_outer_product_relu(self): module = self.get_qdq_module(module, sample_input) 
self.lower_module_and_test_output(module, sample_input) + @unittest.skipIf(is_qnn_sdk_version_less_than("2.35"), "UT pass after QNN 2.35") + def test_qnn_backend_masked_softmax(self): + if self.enable_x86_64: + self.skipTest( + "At the moment, testing is only being conducted on the device." + ) + module = MaskedSoftmax() # noqa: F405 + kv_arange = torch.arange(128) + reshaped_cache_position = torch.tensor([[0]]) + + # Simplest and most efficient way to obtain a causal mask + causal_mask = kv_arange <= reshaped_cache_position + atten_mask = torch.full((1, 128), torch.tensor(-65535.0)) + atten_mask = atten_mask.masked_fill(causal_mask, 0) + atten_mask = atten_mask[None, None, :, :].expand(1, -1, -1, -1) + sample_input = (atten_mask, torch.randn([1, 1, 1, 128])) + # Masked softmax is only supported in quantized models + module = self.get_qdq_module( + module, sample_input, quant_dtype=QuantDtype.use_16a8w + ) + backend_options = generate_htp_compiler_spec(use_fp16=False) + compiler_spec = generate_qnn_executorch_compiler_spec( + soc_model=self.chipset_table[TestQNN.model], + backend_options=backend_options, + optrace=True, + ) + with tempfile.TemporaryDirectory() as tmp_dir: + edge_prog_mgr = to_edge_transform_and_lower_to_qnn( + module, sample_input, compiler_spec + ).to_executorch() + pte_path = f"{tmp_dir}/model.pte" + with open(pte_path, "wb") as f: + edge_prog_mgr.write_to_file(f) + adb = self.get_adb_tool(pte_path) + binaries_trace = generate_optrace( + tmp_dir, self.chipset_table[self.model], adb, pte_path, sample_input + ) + has_masked_softmax = False + for _, (_, qhas) in binaries_trace.items(): + with open(qhas, "r") as qhas_file: + qhas_data = json.load(qhas_file) + for row in qhas_data["data"]["htp_op_types"]["data"]: + if "MaskedSoftmax" in row["op"]: + has_masked_softmax = True + self.assertTrue(has_masked_softmax) + @unittest.skip("UT pass before QNN 2.26, segfault during partitioner") def test_qnn_backend_moe_feed_forward(self): + from executorch.examples.models.llama.llama_transformer import MOEFeedForward + from executorch.examples.models.llama.model_args import ModelArgs + args = ModelArgs() args.dim = 32 args.n_heads = 8 diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py index 3471b0155bd..ff611385de5 100644 --- a/backends/qualcomm/utils/utils.py +++ b/backends/qualcomm/utils/utils.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. import operator +import os +import re import warnings from collections import defaultdict, OrderedDict from typing import Any, Callable, Dict, List, Optional, Tuple, Union @@ -11,7 +13,6 @@ import executorch.backends.qualcomm.python.PyQnnManagerAdaptor as PyQnnManagerAdaptor import executorch.exir as exir - import torch from executorch.backends.qualcomm._passes import AnnotateStack, AnnotateUnbind @@ -1167,3 +1168,28 @@ def rewrite_prepared_observer( continue for target_name in module_name_list[old_module]: setattr(graph_module, target_name, new_observer) + + +def get_sdk_build_id(): + htp_library_path = ( + os.environ.get("QNN_SDK_ROOT", None) + "/lib/x86_64-linux-clang/libQnnHtp.so" + ) + # The GetQnnSdkBuildId API can be used without needing to create a backend first, so it works regardless of which backend is used.
+ sdk_build_id = PyQnnManagerAdaptor.GetQnnSdkBuildId(htp_library_path) + return sdk_build_id + + +def is_qnn_sdk_version_less_than(target_version): + current_version = get_sdk_build_id() + + match = re.search(r"v(\d+)\.(\d+)", current_version) + if match: + current_major, current_minor = map(int, match.groups()[:2]) + else: + raise ValueError( + f"Failed to parse major and minor version from QNN SDK build ID {current_version}" + ) + + target_major, target_minor = map(int, target_version.split(".")[:2]) + + return (current_major, current_minor) < (target_major, target_minor) diff --git a/examples/qualcomm/oss_scripts/llama/README.md b/examples/qualcomm/oss_scripts/llama/README.md index 6d10a935863..c42c22ea7db 100644 --- a/examples/qualcomm/oss_scripts/llama/README.md +++ b/examples/qualcomm/oss_scripts/llama/README.md @@ -124,12 +124,16 @@ On the other hand, if you already have a pre-compiled .pte model, you can perfor python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --llama_model llama3_2 --model_mode hybrid --prefill_ar_len 32 --max_seq_len 128 --prompt "what is 1+1" --pre_gen_pte ${FOLDER_TO_PRE_GEN_PTE} ``` +#### KV Cache Updater + You can select the KV Cache update mechanism at runtime by setting the `KV_UPDATER` variable to either "shift_pointer" or "smart_mask". By default, it is set to "smart_mask". `KV_UPDATER` = "shift_pointer" ```bash python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --llama_model llama3_2 --model_mode hybrid --prefill_ar_len 32 --max_seq_len 128 --prompt "what is 1+1" --kv_updator ${KV_UPDATER} ``` +#### Lookahead Decoding Mode + You can choose the lookahead mode to enhance decoding speed. To use this mode, you need to specify the following parameters: - `--ngram` (N-gram size): Represents the size of the n-grams used in the lookahead process. - `--window` (window size): Determines how many future tokens the algorithm attempts to predict in each step. @@ -140,3 +144,8 @@ For more details, please refer to the paper ["Break the Sequential Dependency of ```bash python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --llama_model llama3_2 --model_mode lookahead --prefill_ar_len 32 --max_seq_len 128 --prompt "what is 1+1" --ngram 3 --window 2 --gcap 2 ``` + +#### Masked Softmax + +You can enable the MaskedSoftmax feature by passing the `--enable_masked_softmax` flag. It is designed to improve the accuracy and performance of LLMs executed on the HTP backend. During backend optimization, MaskedSoftmax replaces the Softmax(Add(In, Mask)) structure in the attention blocks of LLMs. For more details, please refer to the QNN documentation. +Note that it is only supported starting from QNN 2.35.
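To make the replaced pattern concrete, here is a rough PyTorch sketch of the two computations (a minimal illustration mirroring the `--enable_masked_softmax` path in `static_llama.py`; the `-20` offset comes from this patch, and the function names are purely illustrative, not QNN APIs):

```python
import torch


def additive_mask_softmax(attn: torch.Tensor, atten_mask: torch.Tensor) -> torch.Tensor:
    # Conventional Softmax(Add(In, Mask)): masked positions carry a large negative bias
    # (e.g. -65535 in the unit test above).
    return torch.softmax(attn + atten_mask, dim=-1)


def masked_softmax_friendly(attn: torch.Tensor, atten_mask: torch.Tensor) -> torch.Tensor:
    # MaskedSoftmax-style rewrite: pin masked positions a fixed offset below the row
    # minimum instead of adding a huge constant, which keeps the quantization range tight.
    attn_min = torch.amin(attn, dim=-1, keepdim=True)
    masked = torch.where(atten_mask == 0, attn, attn_min - 20)
    return torch.softmax(masked, dim=-1)
```

Both paths drive the masked positions to near-zero probability after softmax; the second form is the shape the HTP backend is expected to recognize as MaskedSoftmax (the unit test above checks the optrace output for a MaskedSoftmax op).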
diff --git a/examples/qualcomm/oss_scripts/llama/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py index 388662940f2..f37263ee179 100755 --- a/examples/qualcomm/oss_scripts/llama/llama.py +++ b/examples/qualcomm/oss_scripts/llama/llama.py @@ -50,7 +50,9 @@ convert_linear_to_conv2d, generate_htp_compiler_spec, generate_qnn_executorch_compiler_spec, + get_sdk_build_id, get_soc_to_chipset_map, + is_qnn_sdk_version_less_than, to_edge_transform_and_lower_to_qnn, update_spill_fill_size, ) @@ -534,6 +536,7 @@ def compile(args, pte_filename, tokenizer): kv_config.max_batch_size = 1 kv_config.max_seq_len = args.max_seq_len kv_config.use_kv_cache = True + kv_config.enable_masked_softmax = args.enable_masked_softmax prefill_config = copy.copy(kv_config) prefill_config.use_kv_cache = ( @@ -1186,6 +1189,12 @@ def _build_parser(): action="store_true", ) + parser.add_argument( + "--enable_masked_softmax", + help="The MaskedSoftmax feature is designed to improve the accuracy and performance of LLMs executed on the HTP backend. Note that it is only supported starting from QNN 2.35.", + action="store_true", + ) + parser.add_argument("-v", "--verbose", action="store_true") return parser @@ -1261,6 +1270,11 @@ def export_llama(args) -> None: else: raise RuntimeError(f"Using an unknown kv update {args.kv_updater}") + if args.enable_masked_softmax and is_qnn_sdk_version_less_than("2.35"): + raise RuntimeError( + f"Masked softmax is only supported starting from QNN SDK 2.35. The given SDK version {get_sdk_build_id()} is lower than the target version" + ) + if args.pre_gen_pte: inference(args, pte_filename, runtime_tokenizer_path, decoder_model_version) print(f"Finish the running pre_gen_pte from {args.pre_gen_pte}") diff --git a/examples/qualcomm/oss_scripts/llama/model/static_llama.py b/examples/qualcomm/oss_scripts/llama/model/static_llama.py index 5c1f17abe47..dcc33c0537a 100755 --- a/examples/qualcomm/oss_scripts/llama/model/static_llama.py +++ b/examples/qualcomm/oss_scripts/llama/model/static_llama.py @@ -45,6 +45,7 @@ def __init__(self, config: ModelArgs, output_new_cache_only=False): self.num_key_value_groups = config.n_heads // self.n_kv_heads self.max_seq_len = config.max_seq_len self.output_new_cache_only = output_new_cache_only + self.enable_masked_softmax = getattr(config, "enable_masked_softmax", False) self.wq = nn.Linear( self.dim, @@ -189,7 +190,13 @@ def forward_sha( for i, _ in enumerate(q): cache_idx = i // self.num_key_value_groups attn = q[i] @ kh[cache_idx] - attn = attn / self.scale + atten_mask + attn = attn / self.scale + if self.enable_masked_softmax: + attn_min = torch.amin(attn, dim=-1, keepdim=True) + minus_value = -20 + attn = torch.where(atten_mask == 0, attn, attn_min + minus_value) + else: + attn = attn + atten_mask attn = self.attn_softmax(attn) y = attn @ vh[cache_idx]
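For reference, a small standalone sketch of how the `is_qnn_sdk_version_less_than("2.35")` gate above is expected to behave (the build-ID strings below are made up for illustration only; real IDs come from `PyQnnManagerAdaptor.GetQnnSdkBuildId`):

```python
import re


def parse_major_minor(build_id: str) -> tuple:
    # Same regex as is_qnn_sdk_version_less_than in backends/qualcomm/utils/utils.py:
    # pick the first "v<major>.<minor>" occurrence from the SDK build ID.
    match = re.search(r"v(\d+)\.(\d+)", build_id)
    if match is None:
        raise ValueError(f"Failed to parse version from build ID: {build_id}")
    return tuple(map(int, match.groups()))


# Hypothetical build-ID strings, used only to exercise the parsing logic.
assert parse_major_minor("QNN v2.35.0.250530") == (2, 35)  # not less than 2.35 -> flag allowed
assert parse_major_minor("QNN v2.34.1.241217") < (2, 35)   # less than 2.35 -> RuntimeError path
```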