Commit ea7c42e

Merge branch 'main' into use-quantize_
2 parents: 348224c + 7503bb3

27 files changed: +1498 −785 lines


.ci/scripts/test_model.sh

Lines changed: 27 additions & 1 deletion

@@ -188,6 +188,14 @@ test_model_with_qnn() {
     EXPORT_SCRIPT=edsr
     # Additional deps for edsr
     pip install piq
+  elif [[ "${MODEL_NAME}" == "albert" ]]; then
+    EXPORT_SCRIPT=albert
+  elif [[ "${MODEL_NAME}" == "bert" ]]; then
+    EXPORT_SCRIPT=bert
+  elif [[ "${MODEL_NAME}" == "distilbert" ]]; then
+    EXPORT_SCRIPT=distilbert
+  elif [[ "${MODEL_NAME}" == "eurobert" ]]; then
+    EXPORT_SCRIPT=eurobert
   else
     echo "Unsupported model $MODEL_NAME"
     exit 1
@@ -197,7 +205,25 @@ test_model_with_qnn() {
   # TODO(guangyang): Make QNN chipset matches the target device
   QNN_CHIPSET=SM8450

-  "${PYTHON_EXECUTABLE}" -m examples.qualcomm.scripts.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m ${QNN_CHIPSET} --ci --compile_only $EXTRA_FLAGS
+  SCRIPT_FOLDER=""
+  case "${MODEL_NAME}" in
+    "dl3"|"mv3"|"mv2"|"ic4"|"ic3"|"vit"|"mb"|"w2l")
+      SCRIPT_FOLDER=scripts
+      ;;
+    "albert"|"bert"|"distilbert")
+      pip install evaluate
+      SCRIPT_FOLDER=oss_scripts
+      # Bert models running in 16bit will encounter op validation fail on some operations,
+      # which requires CHIPSET >= SM8550.
+      QNN_CHIPSET=SM8550
+      ;;
+    *)
+      echo "Unsupported model $MODEL_NAME"
+      exit 1
+      ;;
+  esac
+
+  "${PYTHON_EXECUTABLE}" -m examples.qualcomm.${SCRIPT_FOLDER}.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m ${QNN_CHIPSET} --ci --compile_only $EXTRA_FLAGS
   EXPORTED_MODEL=$(find "./${EXPORT_SCRIPT}" -type f -name "${MODEL_NAME}*.pte" -print -quit)
 }

.github/workflows/trunk.yml

Lines changed: 26 additions & 0 deletions

@@ -480,6 +480,32 @@ jobs:
       PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
       PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh ${{ matrix.model }} "cmake" "qnn"

+  test-qnn-optimum-model:
+    name: test-qnn-optimum-model
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      matrix:
+        dtype: [fp32]
+        model: [albert, bert, distilbert] # eurobert requires transfomer >= 4.48.0, skip for now
+      fail-fast: false
+    with:
+      runner: linux.2xlarge
+      docker-image: executorch-ubuntu-22.04-qnn-sdk
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 900
+      script: |
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
+        PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
+        PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh ${{ matrix.model }} "cmake" "qnn"
+
   test-apple-model:
     name: test-apple-model
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main

.gitignore

Lines changed: 3 additions & 0 deletions

@@ -42,6 +42,9 @@ xcuserdata/
 *.xcworkspace/
 *.xcframework/

+# clangd
+.cache/
+
 # misc
 /.vscode/
 *.so

backends/qualcomm/_passes/__init__.py

Lines changed: 2 additions & 2 deletions

@@ -8,7 +8,6 @@
 from .annotate_quant_attrs import AnnotateQuantAttrs
 from .annotate_stack import AnnotateStack
 from .annotate_unbind import AnnotateUnbind
-from .convert_bmm_to_matmul import ConvertBmmToMatmul
 from .convert_conv1d_to_conv2d import ConvertConv1dToConv2d
 from .convert_square_to_pow import ConvertSquareToPow
 from .decompose_any import DecomposeAny
@@ -19,6 +18,7 @@
 from .decompose_linalg_vector_norm import DecomposeLinalgVectorNorm
 from .decompose_roll import DecomposeRoll
 from .decompose_silu import DecomposeSilu
+from .decompose_wrap_with_autocast import DecomposeWrapWithAutocast
 from .expand_broadcast_tensor_shape import ExpandBroadcastTensorShape
 from .fixed_linear_keep_dim import FixedLinearKeepDim
 from .fold_qdq import FoldQDQ
@@ -45,7 +45,6 @@
     AnnotateQuantAttrs,
     AnnotateStack,
     AnnotateUnbind,
-    ConvertBmmToMatmul,
     ConvertConv1dToConv2d,
     ConvertSquareToPow,
     DecomposeAny,
@@ -56,6 +55,7 @@
     DecomposeLinalgVectorNorm,
     DecomposeRoll,
     DecomposeSilu,
+    DecomposeWrapWithAutocast,
     ExpandBroadcastTensorShape,
     FixedLinearKeepDim,
     FoldQDQ,

backends/qualcomm/_passes/convert_bmm_to_matmul.py

Lines changed: 0 additions & 76 deletions
This file was deleted.

backends/qualcomm/_passes/decompose_wrap_with_autocast.py

Lines changed: 88 additions & 0 deletions

@@ -0,0 +1,88 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import _operator
+from typing import Dict, Tuple
+
+import torch
+from executorch.exir.pass_base import ExportPass, PassResult
+
+from .utils import copy_nn_module_stack
+
+
+class DecomposeWrapWithAutocast(ExportPass):
+    """
+    Decompose the _higher_order_ops WrapWithAutocast
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def _get_submod(
+        self, gm: torch.fx.GraphModule, node: torch.fx.Node
+    ) -> Tuple[torch.fx.GraphModule, str]:
+        for a in node.args:
+            if isinstance(a, torch.fx.Node) and "submod" in a.target:
+                return getattr(gm, a.target), a.target
+
+    def _replace_output(
+        self, wwac_node: torch.fx.Node, output_node: torch.fx.Node, remap: Dict
+    ):
+        for user in wwac_node.users.copy():
+            arg_idx = 0
+            is_user_getitem = False
+
+            if user.target == _operator.getitem:
+                arg_idx = user.args[1]
+                is_user_getitem = True
+
+            user.replace_input_with(
+                wwac_node,
+                remap[output_node.args[0][arg_idx]],
+            )
+
+            if is_user_getitem:
+                for user_user in user.users.copy():
+                    user_user.replace_input_with(user, user.args[0])
+
+    def _replace(self, gm: torch.fx.GraphModule) -> None:
+        graph = gm.graph
+        for node in graph.nodes:
+            if isinstance(node.target, torch._higher_order_ops.wrap.WrapWithAutocast):
+                submod, submod_name = self._get_submod(gm, node)
+                n_args = node.args
+                input_submod = n_args[4]
+                decomposed_module = submod
+                with graph.inserting_before(node):
+                    # remap is used to map original node values to new node values,
+                    # which ensures that reference to nodes are correctly updated in the new graph
+                    # remap = {"expand_1": node.args[5], "to_4": node.args[6]}
+                    remap = {n_args[i].name: n_args[i] for i in range(5, len(n_args))}
+
+                    for decomposed_node in decomposed_module.graph.nodes:
+                        copy_nn_module_stack(node, decomposed_node)
+                        # no need to copy existent 'output'
+                        if decomposed_node.op == "output":
+                            self._replace_output(node, decomposed_node, remap)
+                        # no need to copy existent placeholders
+                        elif decomposed_node.op == "placeholder":
+                            # replace node map from string to graph node
+                            remap[decomposed_node] = remap.pop(decomposed_node.name)
+                        else:
+                            remap[decomposed_node] = graph.node_copy(
+                                decomposed_node,
+                                arg_transform=lambda x, remap=remap: remap[x],
+                            )
+
+                    graph.erase_node(node)
+
+                graph.erase_node(input_submod)
+
+    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        self._replace(graph_module)
+        graph_module.graph.eliminate_dead_code()
+        graph_module.recompile()
+        return PassResult(graph_module, True)
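
The new pass inlines the wrapped submodule by copying its FX nodes into the parent graph via graph.node_copy with an arg_transform remap, rerouting any getitem users of the higher-order op to the copied outputs, and finally erasing the WrapWithAutocast node and its get_attr input. Below is a minimal, self-contained sketch of that same node_copy plus remap inlining pattern applied to a plain call_module node. Inner, Outer, and KeepInnerTracer are hypothetical modules invented for illustration, only public torch.fx APIs are used, and this is not the commit's pass itself.

import operator

import torch
import torch.fx


class Inner(torch.nn.Module):
    def forward(self, x):
        return x + 1, x * 2


class Outer(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.inner = Inner()

    def forward(self, x):
        y = self.inner(x)
        return y[0] + y[1]


class KeepInnerTracer(torch.fx.Tracer):
    # Treat Inner as a leaf so it shows up as a single call_module node,
    # analogous to the wrapped submodule behind WrapWithAutocast.
    def is_leaf_module(self, m, qualified_name):
        return isinstance(m, Inner) or super().is_leaf_module(m, qualified_name)


outer = Outer()
gm = torch.fx.GraphModule(outer, KeepInnerTracer().trace(outer))
inner_gm = torch.fx.symbolic_trace(outer.inner)

for node in list(gm.graph.nodes):
    if node.op != "call_module" or node.target != "inner":
        continue
    # Map the submodule's placeholders onto the call site's arguments.
    placeholders = [n for n in inner_gm.graph.nodes if n.op == "placeholder"]
    remap = dict(zip(placeholders, node.args))
    outputs = []
    with gm.graph.inserting_before(node):
        for inner_node in inner_gm.graph.nodes:
            if inner_node.op == "placeholder":
                continue
            if inner_node.op == "output":
                outputs = [remap[a] for a in inner_node.args[0]]
                continue
            # Copy the node, rewriting its inputs through the remap table.
            remap[inner_node] = gm.graph.node_copy(
                inner_node, arg_transform=lambda n: remap[n]
            )
    # Re-route getitem users of the call to the copied output values.
    for user in list(node.users):
        if user.op == "call_function" and user.target == operator.getitem:
            user.replace_all_uses_with(outputs[user.args[1]])
            gm.graph.erase_node(user)
    gm.graph.erase_node(node)

gm.graph.eliminate_dead_code()
gm.recompile()
assert torch.equal(gm(torch.ones(2)), Outer()(torch.ones(2)))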

backends/qualcomm/_passes/qnn_pass_manager.py

Lines changed: 3 additions & 2 deletions

@@ -13,7 +13,6 @@
     AnnotateQuantAttrs,
     AnnotateStack,
     AnnotateUnbind,
-    ConvertBmmToMatmul,
     ConvertConv1dToConv2d,
     ConvertSquareToPow,
     DecomposeAny,
@@ -24,6 +23,7 @@
     DecomposeLinalgVectorNorm,
     DecomposeRoll,
     DecomposeSilu,
+    DecomposeWrapWithAutocast,
     ExpandBroadcastTensorShape,
     FixedLinearKeepDim,
     FoldQDQ,
@@ -80,7 +80,6 @@ def get_capture_program_passes():
         (AnnotateQuantAttrs, True),
         (AnnotateStack, True),
         (AnnotateUnbind, True),
-        (ConvertBmmToMatmul, True),
         (ConvertConv1dToConv2d, True),
         (DecomposeAny, True),
         (DecomposeColIm, True),
@@ -194,6 +193,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_pass(DecomposeScaledDotProductAttention())
         self.add_pass(DecomposeRoll())
         self.add_pass(DecomposeSilu())
+        self.add_pass(DecomposeWrapWithAutocast())
         self.add_pass(DecomposeEinsum())
         self.add_pass(DecomposeExpM1())
         self.add_pass(DecomposeLinalgVectorNorm(quantization_capture=True))
@@ -207,6 +207,7 @@ def transform_for_export_pipeline(self, exported_program: ExportedProgram):
         self.add_pass(DecomposeRoll())
         self.add_pass(DecomposeLinalgVectorNorm(quantization_capture=True))
         self.add_pass(DecomposeExpM1())
+        self.add_pass(DecomposeWrapWithAutocast())
         # this pass will rewrite state_dict, it needs to be accomplished before
         # to_edge_transform_and_lower
         self.add_pass(ConvertConv1dToConv2d(exported_program))
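
With the registration above, DecomposeWrapWithAutocast runs in both the annotation and export pipelines, while ConvertBmmToMatmul is dropped from the default capture-program passes. For completeness, here is a short sketch of invoking the new pass on its own; it assumes ExecuTorch's ExportPass keeps the usual torch.fx PassBase convention of being callable on a GraphModule and returning a PassResult, and the helper function name is hypothetical.

import torch

from executorch.backends.qualcomm._passes import DecomposeWrapWithAutocast


def decompose_autocast_wrappers(graph_module: torch.fx.GraphModule) -> torch.fx.GraphModule:
    # The pass mutates the graph, eliminates dead code, recompiles,
    # and reports the module back in a PassResult.
    result = DecomposeWrapWithAutocast()(graph_module)
    return result.graph_module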

backends/qualcomm/_passes/remove_redundancy.py

Lines changed: 11 additions & 13 deletions

@@ -43,6 +43,8 @@ def _dim_order_op_condition(self, node):
         dim_order = node.kwargs.get("dim_order")
         # skip if there contains layout hint
         # e.g. (0, 2, 3, 1) != (0, 1, 2, 3)
+        if node.meta["val"].dtype != node.args[0].meta["val"].dtype:
+            return False
         return dim_order != list(range(len(dim_order)))

     def _to_copy_op_condition(self, node):
@@ -53,19 +55,15 @@ def _default_condition(self, ndoe):

     def _remove(self, graph_module: torch.fx.GraphModule) -> torch.fx.GraphModule:
         for n in graph_module.graph.nodes:
-            if n.target not in self.redundant_ops or not self.redundant_ops[n.target](
-                n
-            ):
-                continue
-
-            to_be_remove = n
-            # assert_tensor_metadata op has no user
-            if len(n.users.keys()) == 0:
-                n.args = ()
-            # normal case
-            for user_n in list(n.users.keys()):
-                user_n.replace_input_with(n, n.args[0])
-            graph_module.graph.erase_node(to_be_remove)
+            if n.target in self.redundant_ops and self.redundant_ops[n.target](n):
+                to_be_remove = n
+                # assert_tensor_metadata op has no user
+                if len(n.users.keys()) == 0:
+                    n.args = ()
+                # normal case
+                for user_n in list(n.users.keys()):
+                    user_n.replace_input_with(n, n.args[0])
+                graph_module.graph.erase_node(to_be_remove)

     def call(self, graph_module: torch.fx.GraphModule):
         self._remove(graph_module)
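
The new dtype check keeps any dim-order copy whose output dtype differs from its input, since such a copy performs a real cast; only same-dtype copies can still be flagged by the condition and erased in _remove. A small illustrative sketch of that predicate follows; the standalone function and the SimpleNamespace stand-ins for FX nodes are hypothetical, invented here only to make the check runnable in isolation.

from types import SimpleNamespace

import torch


def is_flagged_redundant(node) -> bool:
    # Mirrors the updated condition: a copy that changes dtype is a real cast, keep it.
    dim_order = node.kwargs.get("dim_order")
    if node.meta["val"].dtype != node.args[0].meta["val"].dtype:
        return False
    return dim_order != list(range(len(dim_order)))


# Stand-in for a node that casts fp32 -> fp16 with an identity dim_order.
src = SimpleNamespace(meta={"val": torch.empty(1, 3, 4, 4, dtype=torch.float32)})
cast = SimpleNamespace(
    kwargs={"dim_order": [0, 1, 2, 3]},
    meta={"val": torch.empty(1, 3, 4, 4, dtype=torch.float16)},
    args=(src,),
)
assert not is_flagged_redundant(cast)  # the cast survives RemoveRedundancy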
