diff --git a/.ci/scripts/test_model.sh b/.ci/scripts/test_model.sh
index 38a354eddf0..4f8dc7a30e5 100755
--- a/.ci/scripts/test_model.sh
+++ b/.ci/scripts/test_model.sh
@@ -188,6 +188,14 @@ test_model_with_qnn() {
     EXPORT_SCRIPT=edsr
     # Additional deps for edsr
     pip install piq
+  elif [[ "${MODEL_NAME}" == "albert" ]]; then
+    EXPORT_SCRIPT=albert
+  elif [[ "${MODEL_NAME}" == "bert" ]]; then
+    EXPORT_SCRIPT=bert
+  elif [[ "${MODEL_NAME}" == "distilbert" ]]; then
+    EXPORT_SCRIPT=distilbert
+  elif [[ "${MODEL_NAME}" == "eurobert" ]]; then
+    EXPORT_SCRIPT=eurobert
   else
     echo "Unsupported model $MODEL_NAME"
     exit 1
@@ -197,7 +205,25 @@ test_model_with_qnn() {
   # TODO(guangyang): Make QNN chipset matches the target device
   QNN_CHIPSET=SM8450

-  "${PYTHON_EXECUTABLE}" -m examples.qualcomm.scripts.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m ${QNN_CHIPSET} --ci --compile_only $EXTRA_FLAGS
+  SCRIPT_FOLDER=""
+  case "${MODEL_NAME}" in
+    "dl3"|"mv3"|"mv2"|"ic4"|"ic3"|"vit"|"mb"|"w2l")
+      SCRIPT_FOLDER=scripts
+      ;;
+    "albert"|"bert"|"distilbert")
+      pip install evaluate
+      SCRIPT_FOLDER=oss_scripts
+      # Bert models running in 16-bit will encounter op validation failures on some operations,
+      # which requires CHIPSET >= SM8550.
+      QNN_CHIPSET=SM8550
+      ;;
+    *)
+      echo "Unsupported model $MODEL_NAME"
+      exit 1
+      ;;
+  esac
+
+  "${PYTHON_EXECUTABLE}" -m examples.qualcomm.${SCRIPT_FOLDER}.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m ${QNN_CHIPSET} --ci --compile_only $EXTRA_FLAGS

   EXPORTED_MODEL=$(find "./${EXPORT_SCRIPT}" -type f -name "${MODEL_NAME}*.pte" -print -quit)
 }
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index 43da4e4b9b0..a4996459f8a 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -480,6 +480,32 @@ jobs:
         PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
         PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh ${{ matrix.model }} "cmake" "qnn"

+  test-qnn-optimum-model:
+    name: test-qnn-optimum-model
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      matrix:
+        dtype: [fp32]
+        model: [albert, bert, distilbert] # eurobert requires transformers >= 4.48.0, skip for now
+      fail-fast: false
+    with:
+      runner: linux.2xlarge
+      docker-image: executorch-ubuntu-22.04-qnn-sdk
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 900
+      script: |
+        # The generic Linux job chooses to use base env, not the one set up by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
+        PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
+        PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh ${{ matrix.model }} "cmake" "qnn"
+
   test-apple-model:
     name: test-apple-model
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
diff --git a/backends/qualcomm/_passes/__init__.py b/backends/qualcomm/_passes/__init__.py
index ca1aa78ef17..307756bff7f 100644
--- a/backends/qualcomm/_passes/__init__.py
+++ b/backends/qualcomm/_passes/__init__.py
@@ -19,6 +19,7 @@
 from .decompose_linalg_vector_norm import DecomposeLinalgVectorNorm
 from .decompose_roll import DecomposeRoll
 from .decompose_silu import DecomposeSilu
+from .decompose_wrap_with_autocast import DecomposeWrapWithAutocast
 from .expand_broadcast_tensor_shape import ExpandBroadcastTensorShape
 from .fixed_linear_keep_dim import FixedLinearKeepDim
 from .fold_qdq import FoldQDQ
@@ -56,6 +57,7 @@
     DecomposeLinalgVectorNorm,
     DecomposeRoll,
     DecomposeSilu,
+    DecomposeWrapWithAutocast,
     ExpandBroadcastTensorShape,
     FixedLinearKeepDim,
     FoldQDQ,
diff --git a/backends/qualcomm/_passes/decompose_wrap_with_autocast.py b/backends/qualcomm/_passes/decompose_wrap_with_autocast.py
new file mode 100644
index 00000000000..6c073bd309c
--- /dev/null
+++ b/backends/qualcomm/_passes/decompose_wrap_with_autocast.py
@@ -0,0 +1,88 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import _operator
+from typing import Dict, Tuple
+
+import torch
+from executorch.exir.pass_base import ExportPass, PassResult
+
+from .utils import copy_nn_module_stack
+
+
+class DecomposeWrapWithAutocast(ExportPass):
+    """
+    Decompose the _higher_order_ops WrapWithAutocast
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def _get_submod(
+        self, gm: torch.fx.GraphModule, node: torch.fx.Node
+    ) -> Tuple[torch.fx.GraphModule, str]:
+        for a in node.args:
+            if isinstance(a, torch.fx.Node) and "submod" in a.target:
+                return getattr(gm, a.target), a.target
+
+    def _replace_output(
+        self, wwac_node: torch.fx.Node, output_node: torch.fx.Node, remap: Dict
+    ):
+        for user in wwac_node.users.copy():
+            arg_idx = 0
+            is_user_getitem = False
+
+            if user.target == _operator.getitem:
+                arg_idx = user.args[1]
+                is_user_getitem = True
+
+            user.replace_input_with(
+                wwac_node,
+                remap[output_node.args[0][arg_idx]],
+            )
+
+            if is_user_getitem:
+                for user_user in user.users.copy():
+                    user_user.replace_input_with(user, user.args[0])
+
+    def _replace(self, gm: torch.fx.GraphModule) -> None:
+        graph = gm.graph
+        for node in graph.nodes:
+            if isinstance(node.target, torch._higher_order_ops.wrap.WrapWithAutocast):
+                submod, submod_name = self._get_submod(gm, node)
+                n_args = node.args
+                input_submod = n_args[4]
+                decomposed_module = submod
+                with graph.inserting_before(node):
+                    # remap is used to map original node values to new node values,
+                    # which ensures that references to nodes are correctly updated in the new graph
+                    # remap = {"expand_1": node.args[5], "to_4": node.args[6]}
+                    remap = {n_args[i].name: n_args[i] for i in range(5, len(n_args))}
+
+                    for decomposed_node in decomposed_module.graph.nodes:
+                        copy_nn_module_stack(node, decomposed_node)
+                        # no need to copy the existing 'output'
+                        if decomposed_node.op == "output":
+                            self._replace_output(node, decomposed_node, remap)
+                        # no need to copy existing placeholders
+                        elif decomposed_node.op == "placeholder":
+                            # re-key the remap entry from placeholder name to graph node
+                            remap[decomposed_node] = remap.pop(decomposed_node.name)
+                        else:
+                            remap[decomposed_node] = graph.node_copy(
+                                decomposed_node,
+                                arg_transform=lambda x, remap=remap: remap[x],
+                            )
+
+                graph.erase_node(node)
+
+                graph.erase_node(input_submod)
+
+    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        self._replace(graph_module)
+        graph_module.graph.eliminate_dead_code()
+        graph_module.recompile()
+        return PassResult(graph_module, True)
diff --git a/backends/qualcomm/_passes/qnn_pass_manager.py b/backends/qualcomm/_passes/qnn_pass_manager.py
index bb6a4dd0a67..9364baccc7a 100644
--- a/backends/qualcomm/_passes/qnn_pass_manager.py
+++ b/backends/qualcomm/_passes/qnn_pass_manager.py
@@ -24,6 +24,7 @@
     DecomposeLinalgVectorNorm,
     DecomposeRoll,
     DecomposeSilu,
+    DecomposeWrapWithAutocast,
     ExpandBroadcastTensorShape,
     FixedLinearKeepDim,
     FoldQDQ,
@@ -194,6 +195,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_pass(DecomposeScaledDotProductAttention())
         self.add_pass(DecomposeRoll())
         self.add_pass(DecomposeSilu())
+        self.add_pass(DecomposeWrapWithAutocast())
         self.add_pass(DecomposeEinsum())
         self.add_pass(DecomposeExpM1())
         self.add_pass(DecomposeLinalgVectorNorm(quantization_capture=True))
@@ -207,6 +209,7 @@ def transform_for_export_pipeline(self, exported_program: ExportedProgram):
         self.add_pass(DecomposeRoll())
         self.add_pass(DecomposeLinalgVectorNorm(quantization_capture=True))
         self.add_pass(DecomposeExpM1())
+        self.add_pass(DecomposeWrapWithAutocast())
         # this pass will rewrite state_dict, it needs to be accomplished before
         # to_edge_transform_and_lower
         self.add_pass(ConvertConv1dToConv2d(exported_program))
diff --git a/backends/qualcomm/_passes/remove_redundancy.py b/backends/qualcomm/_passes/remove_redundancy.py
index bff917be3da..22d476ef21b 100644
--- a/backends/qualcomm/_passes/remove_redundancy.py
+++ b/backends/qualcomm/_passes/remove_redundancy.py
@@ -43,6 +43,8 @@ def _dim_order_op_condition(self, node):
         dim_order = node.kwargs.get("dim_order")
         # skip if there contains layout hint
         # e.g. (0, 2, 3, 1) != (0, 1, 2, 3)
+        if node.meta["val"].dtype != node.args[0].meta["val"].dtype:
+            return False
         return dim_order != list(range(len(dim_order)))

     def _to_copy_op_condition(self, node):
@@ -53,19 +55,15 @@ def _default_condition(self, ndoe):

     def _remove(self, graph_module: torch.fx.GraphModule) -> torch.fx.GraphModule:
         for n in graph_module.graph.nodes:
-            if n.target not in self.redundant_ops or not self.redundant_ops[n.target](
-                n
-            ):
-                continue
-
-            to_be_remove = n
-            # assert_tensor_metadata op has no user
-            if len(n.users.keys()) == 0:
-                n.args = ()
-            # normal case
-            for user_n in list(n.users.keys()):
-                user_n.replace_input_with(n, n.args[0])
-            graph_module.graph.erase_node(to_be_remove)
+            if n.target in self.redundant_ops and self.redundant_ops[n.target](n):
+                to_be_remove = n
+                # assert_tensor_metadata op has no user
+                if len(n.users.keys()) == 0:
+                    n.args = ()
+                # normal case
+                for user_n in list(n.users.keys()):
+                    user_n.replace_input_with(n, n.args[0])
+                graph_module.graph.erase_node(to_be_remove)

     def call(self, graph_module: torch.fx.GraphModule):
         self._remove(graph_module)
diff --git a/backends/qualcomm/_passes/replace_inf_values.py b/backends/qualcomm/_passes/replace_inf_values.py
index c7e475f54f2..bffcea03a72 100644
--- a/backends/qualcomm/_passes/replace_inf_values.py
+++ b/backends/qualcomm/_passes/replace_inf_values.py
@@ -9,13 +9,13 @@

 class ReplaceInfValues(ExportPass):
     """
-    Due to limitation in Qnn, we need to change inf or -inf to arbitrary value in quantization.
+    Due to a QNN limitation, change inf or -inf to an arbitrary value during quantization.
     """

     def __init__(self):
         super(ReplaceInfValues, self).__init__()

-    def call(self, graph_module: torch.fx.GraphModule):
+    def call(self, graph_module: torch.fx.GraphModule):  # noqa: C901
         for buf_name, tensor in graph_module.named_buffers():
             if tensor.is_floating_point():
                 # 255 here is mainly for attention_mask in Llama for reasonable quant scale
@@ -38,5 +38,23 @@ def call(self, graph_module: torch.fx.GraphModule):
                     arg_list[2] = -255
                 node.args = tuple(arg_list)

+            if node.target in [
+                torch.ops.aten.masked_fill.Tensor,
+                torch.ops.aten.masked_fill.Scalar,
+            ]:
+                assert (
+                    len(node.args) == 3
+                ), f"Expecting {node.name} to have 3 arguments."
+                val = node.args[2]
+                if node.args[2] > torch.finfo(torch.float16).max:
+                    val = 255
+                elif node.args[2] < torch.finfo(torch.float16).min:
+                    val = -255
+                node.args = (
+                    node.args[0],
+                    node.args[1],
+                    val,
+                )
+
         graph_module.recompile()
         return PassResult(graph_module, True)
diff --git a/backends/qualcomm/quantizer/annotators.py b/backends/qualcomm/quantizer/annotators.py
index ecce4ee3ef0..eca889a1610 100644
--- a/backends/qualcomm/quantizer/annotators.py
+++ b/backends/qualcomm/quantizer/annotators.py
@@ -462,7 +462,7 @@ def annotate_hardtanh(node: Node, quantization_config: QuantizationConfig) -> No
     annotate_single_in_single_out(node, quantization_config)


-@register_annotator([torch.ops.aten.mean.default])
+@register_annotator([torch.ops.aten.mean.default, torch.ops.aten.mean.dim])
 def annotate_mean(node: Node, quantization_config: QuantizationConfig) -> None:
     annotate_single_in_single_out(node, quantization_config)

@@ -604,11 +604,6 @@ def annotate_select(node: Node, quantization_config: QuantizationConfig) -> None
     annotate_single_in_single_out(node, quantization_config)


-@register_annotator([torch.ops.aten.mean.dim])
-def annotate_mean_dim(node: Node, quantization_config: QuantizationConfig) -> None:
-    annotate_single_in_single_out(node, quantization_config)
-
-
 @register_annotator([torch.ops.aten.slice.Tensor])
 def annotate_slice(node: Node, quantization_config: QuantizationConfig) -> None:
     annotate_single_in_single_out(node, quantization_config)
diff --git a/backends/qualcomm/quantizer/custom_annotation.py b/backends/qualcomm/quantizer/custom_annotation.py
index 0e06015ed91..b5531cd492f 100644
--- a/backends/qualcomm/quantizer/custom_annotation.py
+++ b/backends/qualcomm/quantizer/custom_annotation.py
@@ -26,6 +26,35 @@
 )


+def annotate_eurobert(gm: torch.fx.GraphModule):
+    """
+    QNN does not support int32 -> signed 16-bit quantization.
+    We need to first annotate this to_fp node as 8-bit quant, so it will perform a requantize.
+    The final graph should look like: int32 -> convert -> cast -> matmul.args[1]
+
+    """
+    quantization_config_8a8w = get_8a8w_qnn_ptq_config()
+    for node in gm.graph.nodes:
+        # A little tricky here. This matmul node is wrapped inside a submodule after the 1st torch.export.
+        # There are actually 2 'to' ops that are redundant.
+        # It will look like: int64 -> to_fp -> to_fp -> matmul.args[1]
+        # Drawing out the graph after the 1st export will help visualize the submodule.
+ + if node.target == torch.ops.aten.matmul.default and node.args[1].args[0].args[ + 0 + ].meta["val"].dtype in [torch.int64, torch.int32]: + to_node = node.args[1] + input_qspec_map = {} + assert isinstance(to_node, Node) + input_spec = quantization_config_8a8w.input_activation + input_qspec_map[to_node] = input_spec + to_node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=quantization_config_8a8w.output_activation, + _annotated=True, + ) + + def annotate_mimi_decoder(gm: torch.fx.GraphModule): """ The 1st transpose conv in mimi decoder is really sensitive to scale/offset in 16a8w, which causes execution failure. diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 4f101db8e6e..7666e36b985 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -3891,6 +3891,74 @@ def test_llama_stories_110m(self): class TestExampleOssScript(TestQNN): + def test_albert(self): + if not self.required_envs([self.sentence_dataset]): + self.skipTest("missing required envs") + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/albert.py", + "--dataset", + self.sentence_dataset, + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--device", + self.device, + "--model", + self.model, + "--ip", + self.ip, + "--port", + str(self.port), + ] + if self.host: + cmds.extend(["--host", self.host]) + + p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["accuracy"], 0.8) + + def test_bert(self): + if not self.required_envs([self.sentence_dataset]): + self.skipTest("missing required envs") + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/bert.py", + "--dataset", + self.sentence_dataset, + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--device", + self.device, + "--model", + self.model, + "--ip", + self.ip, + "--port", + str(self.port), + ] + if self.host: + cmds.extend(["--host", self.host]) + + p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["accuracy"], 0.6) + def test_conv_former(self): if not self.required_envs([self.image_dataset]): self.skipTest("missing required envs") @@ -4033,6 +4101,40 @@ def test_dino_v2(self): self.assertGreaterEqual(msg["top_1"], 70) self.assertGreaterEqual(msg["top_5"], 85) + def test_distilbert(self): + if not self.required_envs([self.sentence_dataset]): + self.skipTest("missing required envs") + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/distilbert.py", + "--dataset", + self.sentence_dataset, + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--device", + self.device, + "--model", + self.model, + "--ip", + self.ip, + "--port", + str(self.port), + ] + if self.host: + cmds.extend(["--host", self.host]) + + p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + 
self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["accuracy"], 0.45) + def test_dit(self): if not self.required_envs(): self.skipTest("missing required envs") @@ -4142,14 +4244,13 @@ def test_efficientSAM(self): else: self.assertGreaterEqual(msg["MIoU"], 0.55) - def test_swin_transformer(self): - if not self.required_envs([self.image_dataset]): + def test_esrgan(self): + if not self.required_envs(): self.skipTest("missing required envs") + cmds = [ "python", - f"{self.executorch_root}/examples/qualcomm/oss_scripts/swin_transformer.py", - "--dataset", - self.image_dataset, + f"{self.executorch_root}/examples/qualcomm/oss_scripts/esrgan.py", "--artifact", self.artifact_dir, "--build_folder", @@ -4158,6 +4259,9 @@ def test_swin_transformer(self): self.device, "--model", self.model, + "--default_dataset", + "--oss_repo", + self.oss_repo, "--ip", self.ip, "--port", @@ -4174,16 +4278,17 @@ def test_swin_transformer(self): if "Error" in msg: self.fail(msg["Error"]) else: - self.assertGreaterEqual(msg["top_1"], 60) - self.assertGreaterEqual(msg["top_5"], 80) + self.assertGreaterEqual(msg["PSNR"], 24) + self.assertGreaterEqual(msg["SSIM"], 0.8) - def test_esrgan(self): - if not self.required_envs(): + def test_eurobert(self): + if not self.required_envs([self.sentence_dataset]): self.skipTest("missing required envs") - cmds = [ "python", - f"{self.executorch_root}/examples/qualcomm/oss_scripts/esrgan.py", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/eurobert.py", + "--dataset", + self.sentence_dataset, "--artifact", self.artifact_dir, "--build_folder", @@ -4192,9 +4297,6 @@ def test_esrgan(self): self.device, "--model", self.model, - "--default_dataset", - "--oss_repo", - self.oss_repo, "--ip", self.ip, "--port", @@ -4211,8 +4313,7 @@ def test_esrgan(self): if "Error" in msg: self.fail(msg["Error"]) else: - self.assertGreaterEqual(msg["PSNR"], 24) - self.assertGreaterEqual(msg["SSIM"], 0.8) + self.assertGreaterEqual(msg["accuracy"], 0.5) def test_fastvit(self): if not self.required_envs( @@ -4363,7 +4464,7 @@ def test_gMLP(self): self.fail(msg["Error"]) else: self.assertGreaterEqual(msg["top_1"], 60) - self.assertGreaterEqual(msg["top_5"], 90) + self.assertGreaterEqual(msg["top_5"], 85) @unittest.skip("Only outputs good accuracy in QNN 2.29") def test_mobilevit_v2(self): @@ -4654,6 +4755,41 @@ def test_ssd300_vgg16(self): else: self.assertGreaterEqual(msg["mAP"], 0.70) + def test_swin_transformer(self): + if not self.required_envs([self.image_dataset]): + self.skipTest("missing required envs") + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/swin_transformer.py", + "--dataset", + self.image_dataset, + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--device", + self.device, + "--model", + self.model, + "--ip", + self.ip, + "--port", + str(self.port), + ] + if self.host: + cmds.extend(["--host", self.host]) + + p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["top_1"], 60) + self.assertGreaterEqual(msg["top_5"], 80) + class TestExampleQaihubScript(TestQNN): def test_utils_export(self): diff --git a/examples/qualcomm/oss_scripts/albert.py b/examples/qualcomm/oss_scripts/albert.py new file mode 100644 index 00000000000..6af554655f1 --- /dev/null +++ b/examples/qualcomm/oss_scripts/albert.py @@ -0,0 
+1,162 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import getpass
+import json
+import logging
+import os
+from multiprocessing.connection import Client
+
+import evaluate
+import numpy as np
+import torch
+from executorch.backends.qualcomm._passes.qnn_pass_manager import (
+    get_capture_program_passes,
+)
+from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype
+
+from executorch.examples.qualcomm.utils import (
+    build_executorch_binary,
+    get_masked_language_model_dataset,
+    make_output_dir,
+    parse_skip_delegation_node,
+    setup_common_args_and_variables,
+    SimpleADB,
+)
+from transformers import AlbertConfig, AutoModelForMaskedLM, AutoTokenizer
+
+
+def main(args):
+    skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args)
+
+    os.makedirs(args.artifact, exist_ok=True)
+    data_size = 100
+
+    model_name = "albert/albert-base-v2"
+    tokenizer = AutoTokenizer.from_pretrained(model_name, hidden_act="gelu")
+
+    if args.ci:
+        random_ids = torch.randint(low=0, high=100, size=(1, 100), dtype=torch.int32)
+        attention_mask = torch.ones((1, 100), dtype=torch.float32)
+        inputs = [
+            (
+                random_ids,
+                attention_mask,
+            )
+        ]
+        logging.warning(
+            "This option is for CI to verify the export flow. It uses random input and will result in poor accuracy."
+        )
+    else:
+        inputs, targets, input_list = get_masked_language_model_dataset(
+            args.dataset, tokenizer, data_size
+        )
+
+    config = AlbertConfig.from_pretrained(model_name)
+    config.hidden_act = "gelu"
+    module = AutoModelForMaskedLM.from_pretrained(model_name, config=config).eval()
+    pte_filename = "albert_qnn_q16"
+
+    # lower to QNN
+    passes_job = get_capture_program_passes()
+    build_executorch_binary(
+        module,
+        inputs[0],
+        args.model,
+        f"{args.artifact}/{pte_filename}",
+        dataset=inputs,
+        skip_node_id_set=skip_node_id_set,
+        skip_node_op_set=skip_node_op_set,
+        quant_dtype=QuantDtype.use_16a16w,
+        passes_job=passes_job,
+        shared_buffer=args.shared_buffer,
+    )
+
+    if args.compile_only:
+        return
+
+    workspace = f"/data/local/tmp/{getpass.getuser()}/executorch/{pte_filename}"
+    pte_path = f"{args.artifact}/{pte_filename}.pte"
+
+    adb = SimpleADB(
+        qnn_sdk=os.getenv("QNN_SDK_ROOT"),
+        build_path=f"{args.build_folder}",
+        pte_path=pte_path,
+        workspace=workspace,
+        device_id=args.device,
+        host_id=args.host,
+        soc_model=args.model,
+    )
+    output_data_folder = f"{args.artifact}/outputs"
+    make_output_dir(output_data_folder)
+
+    # accuracy analysis
+    adb.push(inputs=inputs, input_list=input_list)
+    adb.execute()
+    adb.pull(output_path=args.artifact)
+    # since the original nn.Module does not perform well on this task either,
+    # we only measure the relative accuracy here
+    goldens, predictions, nominal_predictions = [], [], []
+    for i in range(len(inputs)):
+        indices = [i for i, x in enumerate(targets[i]) if x != -100]
+        goldens.extend(targets[i][indices].tolist())
+        nominal_prediction = module(*inputs[i])
+        nominal_predictions.extend(
+            nominal_prediction.logits.argmax(axis=-1)[0, indices].tolist()
+        )
+        prediction = (
+            np.fromfile(
+                os.path.join(output_data_folder, f"output_{i}_0.raw"), dtype=np.float32
+            )
+            .reshape([1, inputs[0][0].shape[1], -1])
+            .argmax(axis=-1)
+        )
+        predictions.extend(prediction[0, indices].tolist())
+
+    metric = evaluate.load("accuracy")
+    nominal_results = metric.compute(
+        predictions=nominal_predictions, references=goldens
+    )
+ device_results = metric.compute(predictions=predictions, references=goldens) + result = device_results["accuracy"] / nominal_results["accuracy"] + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"accuracy": result})) + else: + print(f"accuracy: {device_results}") + print(f"accuracy with nn.Module as golden: {result}") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts and output by this example. Default ./albert", + default="./albert", + type=str, + ) + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation text. " + "e.g. --dataset wikisent2.txt " + "for https://www.kaggle.com/datasets/mikeortman/wikipedia-sentences" + ), + type=str, + required=False, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/oss_scripts/bert.py b/examples/qualcomm/oss_scripts/bert.py new file mode 100644 index 00000000000..96c7826d89c --- /dev/null +++ b/examples/qualcomm/oss_scripts/bert.py @@ -0,0 +1,149 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import getpass +import json +import logging +import os +from multiprocessing.connection import Client + +import evaluate +import numpy as np +import torch + +from executorch.backends.qualcomm._passes.qnn_pass_manager import ( + get_capture_program_passes, +) +from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype + +from executorch.examples.qualcomm.utils import ( + build_executorch_binary, + get_masked_language_model_dataset, + make_output_dir, + parse_skip_delegation_node, + setup_common_args_and_variables, + SimpleADB, +) +from transformers import AutoModelForMaskedLM, AutoTokenizer + + +def main(args): + skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) + + os.makedirs(args.artifact, exist_ok=True) + data_size = 100 + + tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") + if args.ci: + random_ids = torch.randint(low=0, high=100, size=(1, 100), dtype=torch.int32) + attention_mask = torch.ones((1, 100), dtype=torch.float32) + inputs = [ + ( + random_ids, + attention_mask, + ) + ] + logging.warning( + "This option is for CI to verify the export flow. It uses random input and will result in poor accuracy." 
+ ) + else: + inputs, targets, input_list = get_masked_language_model_dataset( + args.dataset, tokenizer, data_size + ) + module = AutoModelForMaskedLM.from_pretrained( + "google-bert/bert-base-uncased" + ).eval() + pte_filename = "bert_qnn_q16" + + # lower to QNN + passes_job = get_capture_program_passes() + build_executorch_binary( + module, + inputs[0], + args.model, + f"{args.artifact}/{pte_filename}", + dataset=inputs, + skip_node_id_set=skip_node_id_set, + skip_node_op_set=skip_node_op_set, + quant_dtype=QuantDtype.use_16a8w, + passes_job=passes_job, + shared_buffer=args.shared_buffer, + ) + + if args.compile_only: + return + + workspace = f"/data/local/tmp/{getpass.getuser()}/executorch/{pte_filename}" + pte_path = f"{args.artifact}/{pte_filename}.pte" + + adb = SimpleADB( + qnn_sdk=os.getenv("QNN_SDK_ROOT"), + build_path=f"{args.build_folder}", + pte_path=pte_path, + workspace=workspace, + device_id=args.device, + host_id=args.host, + soc_model=args.model, + ) + output_data_folder = f"{args.artifact}/outputs" + make_output_dir(output_data_folder) + + # accuracy analysis + adb.push(inputs=inputs, input_list=input_list) + adb.execute() + adb.pull(output_path=args.artifact) + goldens, predictions = [], [] + for i in range(len(inputs)): + indices = [i for i, x in enumerate(targets[i]) if x != -100] + goldens.extend(targets[i][indices].tolist()) + prediction = ( + np.fromfile( + os.path.join(output_data_folder, f"output_{i}_0.raw"), dtype=np.float32 + ) + .reshape([1, inputs[0][0].shape[1], -1]) + .argmax(axis=-1) + ) + predictions.extend(prediction[0, indices].tolist()) + + metric = evaluate.load("accuracy") + results = metric.compute(predictions=predictions, references=goldens) + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"accuracy": results["accuracy"]})) + else: + print(f"accuracy: {results['accuracy']}") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts and output by this example. Default ./bert", + default="./bert", + type=str, + ) + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation text. " + "e.g. --dataset wikisent2.txt " + "for https://www.kaggle.com/datasets/mikeortman/wikipedia-sentences" + ), + type=str, + required=False, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/oss_scripts/distilbert.py b/examples/qualcomm/oss_scripts/distilbert.py new file mode 100644 index 00000000000..2863a653200 --- /dev/null +++ b/examples/qualcomm/oss_scripts/distilbert.py @@ -0,0 +1,149 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import getpass +import json +import logging +import os +from multiprocessing.connection import Client + +import evaluate +import numpy as np +import torch + +from executorch.backends.qualcomm._passes.qnn_pass_manager import ( + get_capture_program_passes, +) +from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype + +from executorch.examples.qualcomm.utils import ( + build_executorch_binary, + get_masked_language_model_dataset, + make_output_dir, + parse_skip_delegation_node, + setup_common_args_and_variables, + SimpleADB, +) +from transformers import AutoModelForMaskedLM, AutoTokenizer + + +def main(args): + skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) + + os.makedirs(args.artifact, exist_ok=True) + data_size = 100 + + tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased") + if args.ci: + random_ids = torch.randint(low=0, high=100, size=(1, 100), dtype=torch.int32) + attention_mask = torch.ones((1, 100), dtype=torch.float32) + inputs = [ + ( + random_ids, + attention_mask, + ) + ] + logging.warning( + "This option is for CI to verify the export flow. It uses random input and will result in poor accuracy." + ) + else: + inputs, targets, input_list = get_masked_language_model_dataset( + args.dataset, tokenizer, data_size + ) + module = AutoModelForMaskedLM.from_pretrained( + "distilbert/distilbert-base-uncased" + ).eval() + pte_filename = "distilbert_qnn_q16" + + # lower to QNN + passes_job = get_capture_program_passes() + build_executorch_binary( + module, + inputs[0], + args.model, + f"{args.artifact}/{pte_filename}", + dataset=inputs, + skip_node_id_set=skip_node_id_set, + skip_node_op_set=skip_node_op_set, + quant_dtype=QuantDtype.use_16a8w, + passes_job=passes_job, + shared_buffer=args.shared_buffer, + ) + + if args.compile_only: + return + + workspace = f"/data/local/tmp/{getpass.getuser()}/executorch/{pte_filename}" + pte_path = f"{args.artifact}/{pte_filename}.pte" + + adb = SimpleADB( + qnn_sdk=os.getenv("QNN_SDK_ROOT"), + build_path=f"{args.build_folder}", + pte_path=pte_path, + workspace=workspace, + device_id=args.device, + host_id=args.host, + soc_model=args.model, + ) + output_data_folder = f"{args.artifact}/outputs" + make_output_dir(output_data_folder) + + # accuracy analysis + adb.push(inputs=inputs, input_list=input_list) + adb.execute() + adb.pull(output_path=args.artifact) + goldens, predictions = [], [] + for i in range(len(inputs)): + indices = [i for i, x in enumerate(targets[i]) if x != -100] + goldens.extend(targets[i][indices].tolist()) + prediction = ( + np.fromfile( + os.path.join(output_data_folder, f"output_{i}_0.raw"), dtype=np.float32 + ) + .reshape([1, inputs[0][0].shape[1], -1]) + .argmax(axis=-1) + ) + predictions.extend(prediction[0, indices].tolist()) + + metric = evaluate.load("accuracy") + results = metric.compute(predictions=predictions, references=goldens) + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"accuracy": results["accuracy"]})) + else: + print(f"accuracy: {results['accuracy']}") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts and output by this example. Default ./distilbert", + default="./distilbert", + type=str, + ) + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation text. " + "e.g. 
--dataset wikisent2.txt " + "for https://www.kaggle.com/datasets/mikeortman/wikipedia-sentences" + ), + type=str, + required=False, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/oss_scripts/eurobert.py b/examples/qualcomm/oss_scripts/eurobert.py new file mode 100644 index 00000000000..97e70428e01 --- /dev/null +++ b/examples/qualcomm/oss_scripts/eurobert.py @@ -0,0 +1,187 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import json +import logging +import os +from multiprocessing.connection import Client + +import evaluate +import numpy as np +import torch +import transformers +from executorch.backends.qualcomm._passes.qnn_pass_manager import ( + get_capture_program_passes, +) + +from executorch.backends.qualcomm.quantizer.custom_annotation import annotate_eurobert +from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype + +from executorch.examples.qualcomm.utils import ( + build_executorch_binary, + get_masked_language_model_dataset, + make_output_dir, + make_quantizer, + parse_skip_delegation_node, + setup_common_args_and_variables, + SimpleADB, +) +from transformers import AutoConfig, AutoModelForMaskedLM, AutoTokenizer + +TRANSFORMERS_VERSION = "4.48.0" + + +def main(args): + assert ( + transformers.__version__ >= TRANSFORMERS_VERSION + ), f"Please ensure transformers version >= {TRANSFORMERS_VERSION}, current version is {transformers.__version__}" + + skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) + + os.makedirs(args.artifact, exist_ok=True) + + if not args.compile_only and args.device is None: + raise RuntimeError( + "device serial is required if not compile only. " + "Please specify a device serial by -s/--device argument." + ) + + module_id = "EuroBERT/EuroBERT-210m" + tokenizer = AutoTokenizer.from_pretrained(module_id) + model = AutoModelForMaskedLM.from_pretrained( + module_id, trust_remote_code=True + ).eval() + config = AutoConfig.from_pretrained(module_id, trust_remote_code=True) + + def replace_rms_norm_with_native_rms_norm(module: torch.nn.Module): + for name, child in module.named_children(): + if child._get_name() == "EuroBertRMSNorm": + rms_norm = torch.nn.RMSNorm( + [config.hidden_size], eps=child.variance_epsilon + ) + rms_norm.weight = child.weight + setattr( + module, + name, + rms_norm, + ) + else: + replace_rms_norm_with_native_rms_norm(child) + return module + + replace_rms_norm_with_native_rms_norm(model) + + data_size = 100 + if args.ci: + random_ids = torch.randint(low=0, high=100, size=(1, 100), dtype=torch.int32) + attention_mask = torch.ones((1, 100), dtype=torch.float32) + inputs = [ + ( + random_ids, + attention_mask, + ) + ] + logging.warning( + "This option is for CI to verify the export flow. It uses random input and will result in poor accuracy." 
+ ) + else: + inputs, targets, input_list = get_masked_language_model_dataset( + args.dataset, tokenizer, data_size + ) + + pte_filename = "eurobert_qnn_q16" + + # lower to QNN + passes_job = get_capture_program_passes() + quantizer = make_quantizer( + quant_dtype=QuantDtype.use_16a16w, + ) + quantizer.add_custom_quant_annotations((annotate_eurobert,)) + with torch.no_grad(): + build_executorch_binary( + model, + inputs[0], + args.model, + f"{args.artifact}/{pte_filename}", + dataset=inputs, + skip_node_id_set=skip_node_id_set, + skip_node_op_set=skip_node_op_set, + custom_quantizer=quantizer, + passes_job=passes_job, + shared_buffer=args.shared_buffer, + ) + + if args.compile_only: + return + + adb = SimpleADB( + qnn_sdk=os.getenv("QNN_SDK_ROOT"), + build_path=f"{args.build_folder}", + pte_path=f"{args.artifact}/{pte_filename}.pte", + workspace=f"/data/local/tmp/executorch/{pte_filename}", + device_id=args.device, + host_id=args.host, + soc_model=args.model, + ) + output_data_folder = f"{args.artifact}/outputs" + make_output_dir(output_data_folder) + + # accuracy analysis + adb.push(inputs=inputs, input_list=input_list) + adb.execute() + adb.pull(output_path=args.artifact) + goldens, predictions = [], [] + for i in range(len(inputs)): + indices = [i for i, x in enumerate(targets[i]) if x != -100] + goldens.extend(targets[i][indices].tolist()) + + prediction = ( + np.fromfile( + os.path.join(output_data_folder, f"output_{i}_0.raw"), dtype=np.float32 + ) + .reshape([1, inputs[0][0].shape[1], -1]) + .argmax(axis=-1) + ) + predictions.extend(prediction[0, indices].tolist()) + metric = evaluate.load("accuracy") + results = metric.compute(predictions=predictions, references=goldens) + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"accuracy": results["accuracy"]})) + else: + print(f"accuracy: {results['accuracy']}") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts and output by this example. Default ./eurobert", + default="./eurobert", + type=str, + ) + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation text. " + "e.g. --dataset wikisent2.txt " + "for https://www.kaggle.com/datasets/mikeortman/wikipedia-sentences" + ), + type=str, + required=False, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e)
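
Appendix (reader note, not part of the patch): the DecomposeWrapWithAutocast pass above inlines the submodule that torch.export wraps around autocast regions, by copying the submodule's nodes into the parent graph via graph.node_copy() with a remap from placeholders to call-site arguments. A minimal, self-contained torch.fx sketch of that same inlining idea, assuming toy Inner/Outer modules (a simplified illustration, not the pass itself):

import torch
import torch.fx

class Inner(torch.nn.Module):
    def forward(self, x):
        return x + 1

class Outer(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.inner = Inner()

    def forward(self, x):
        return self.inner(x) * 2

gm = torch.fx.symbolic_trace(Outer())
for node in list(gm.graph.nodes):
    if node.op == "call_module" and node.target == "inner":
        sub_gm = torch.fx.symbolic_trace(gm.get_submodule(node.target))
        remap = {}  # submodule node -> parent-graph node
        call_args = iter(node.args)
        with gm.graph.inserting_before(node):
            for sub_node in sub_gm.graph.nodes:
                if sub_node.op == "placeholder":
                    # placeholders map to the call-site arguments
                    remap[sub_node] = next(call_args)
                elif sub_node.op == "output":
                    # rewire users of the call to the inlined result
                    node.replace_all_uses_with(remap[sub_node.args[0]])
                else:
                    remap[sub_node] = gm.graph.node_copy(
                        sub_node, arg_transform=lambda n: remap[n]
                    )
        gm.graph.erase_node(node)

gm.graph.eliminate_dead_code()
gm.recompile()
print(gm(torch.tensor(3.0)))  # (3 + 1) * 2 = tensor(8.)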
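On the evaluation side, the albert/bert/distilbert/eurobert examples all score only the masked token positions: get_masked_language_model_dataset marks unmasked positions with the ignore index -100, and on-device logits are read back from raw float32 dumps (output_{i}_0.raw). A minimal sketch of that accuracy computation, with a made-up 4-token sample and random logits standing in for the device output:

import evaluate  # Hugging Face evaluate library, as used by the example scripts
import numpy as np
import torch

vocab_size = 30522  # assumption: bert-base-uncased vocabulary
targets = [torch.tensor([-100, 2054, -100, 3007])]  # -100 = position not masked
logits = [np.random.rand(1, 4, vocab_size).astype(np.float32)]  # stand-in for output_0_0.raw

goldens, predictions = [], []
for i, target in enumerate(targets):
    indices = [j for j, x in enumerate(target) if x != -100]  # masked positions only
    goldens.extend(target[indices].tolist())
    predictions.extend(logits[i].argmax(axis=-1)[0, indices].tolist())

metric = evaluate.load("accuracy")
print(metric.compute(predictions=predictions, references=goldens))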