
Commit 70e72ea

Merge branch 'main' into toupstream/notebook

2 parents 792dee2 + 0cca3ca

File tree: 99 files changed, +3063 additions, -1127 deletions


.ci/scripts/build-qnn-sdk.sh

Lines changed: 4 additions & 2 deletions
@@ -11,8 +11,10 @@ set -o xtrace
 
 build_qnn_backend() {
   echo "Start building qnn backend."
-  export ANDROID_NDK_ROOT=${ANDROID_NDK_ROOT:-/opt/ndk}
-  export QNN_SDK_ROOT=${QNN_SDK_ROOT:-/tmp/qnn/2.28.0.241029}
+  # Source QNN configuration
+  source "$(dirname "${BASH_SOURCE[0]}")/../../backends/qualcomm/scripts/install_qnn_sdk.sh"
+  setup_android_ndk
+  install_qnn
   export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." && pwd)"
 
   parallelism=$(( $(nproc) - 1 ))

.ci/scripts/setup-qnn-deps.sh

Lines changed: 1 addition & 0 deletions
@@ -10,4 +10,5 @@ set -ex
 source "$(dirname "${BASH_SOURCE[0]}")/../../backends/qualcomm/scripts/install_qnn_sdk.sh"
 
 setup_libcpp 12
+setup_android_ndk
 install_qnn

.ci/scripts/test_llama.sh

Lines changed: 7 additions & 1 deletion
@@ -119,8 +119,12 @@ echo "COREML option ${COREML}"
 
 if [[ "${MODE}" =~ .*qnn.* ]]; then
   QNN=ON
+
+  # Download QNN_SDK. If already downloaded, export environment path
+  source "$(dirname "${BASH_SOURCE[0]}")/../../backends/qualcomm/scripts/install_qnn_sdk.sh"
+  install_qnn
+
   export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)"
-  export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
   export LD_LIBRARY_PATH="${QNN_SDK_ROOT}/lib/x86_64-linux-clang"
   export PYTHONPATH=".."
   cp schema/program.fbs exir/_serialize/program.fbs

@@ -150,6 +154,7 @@ cmake_install_executorch_libraries() {
   echo "Installing libexecutorch.a, libextension_module.so, libportable_ops_lib.a"
   rm -rf cmake-out
   retry cmake --preset llm \
+    -DEXECUTORCH_BUILD_TESTS=ON \
     -DBUILD_TESTING=OFF \
     -DCMAKE_INSTALL_PREFIX=cmake-out \
     -DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \

@@ -166,6 +171,7 @@ cmake_build_llama_runner() {
   popd
   dir="examples/models/llama"
   retry cmake \
+    -DEXECUTORCH_BUILD_TESTS=ON \
     -DBUILD_TESTING=OFF \
     -DCMAKE_INSTALL_PREFIX=cmake-out \
     -DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \

.ci/scripts/test_qnn_static_llama.sh

Lines changed: 6 additions & 1 deletion
@@ -9,8 +9,13 @@ set -euxo pipefail
 
 source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
 
+# Source QNN configuration
+source "$(dirname "${BASH_SOURCE[0]}")/../../backends/qualcomm/scripts/qnn_config.sh"
+# Download QNN_SDK. If already downloaded, export environment path
+source "$(dirname "${BASH_SOURCE[0]}")/../../backends/qualcomm/scripts/install_qnn_sdk.sh"
+install_qnn
+
 export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)"
-export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
 export LD_LIBRARY_PATH="${QNN_SDK_ROOT}/lib/x86_64-linux-clang"
 export PYTHONPATH=".."
 cp schema/program.fbs exir/_serialize/program.fbs

.github/workflows/android-perf.yml

Lines changed: 2 additions & 2 deletions
@@ -292,7 +292,7 @@ jobs:
   export.output_name="${OUT_ET_MODEL_NAME}.pte"
   ls -lh "${OUT_ET_MODEL_NAME}.pte"
 elif [[ ${{ matrix.config }} == "llama3_qnn_htp" ]]; then
-  export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
+  export QNN_SDK_ROOT=/tmp/qnn/2.37.0.25072
   export LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang/
   export PYTHONPATH=$(pwd)/..

@@ -432,7 +432,7 @@ jobs:
 PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
 
 mkdir -p aar-out
-PYTHON_EXECUTABLE=python ANDROID_ABIS="arm64-v8a" BUILD_AAR_DIR=aar-out EXECUTORCH_BUILD_QNN=ON QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029 EXECUTORCH_ANDROID_PROFILING=ON bash scripts/build_android_library.sh
+PYTHON_EXECUTABLE=python ANDROID_ABIS="arm64-v8a" BUILD_AAR_DIR=aar-out EXECUTORCH_BUILD_QNN=ON QNN_SDK_ROOT=/tmp/qnn/2.37.0.25072 EXECUTORCH_ANDROID_PROFILING=ON bash scripts/build_android_library.sh
 mkdir -p extension/benchmark/android/benchmark/app/libs
 cp aar-out/executorch.aar extension/benchmark/android/benchmark/app/libs
 pushd extension/benchmark/android/benchmark

.github/workflows/android-release-artifacts.yml

Lines changed: 1 addition & 1 deletion
@@ -104,7 +104,7 @@ jobs:
   source backends/qualcomm/scripts/qnn_config.sh
   export QNN_SDK_ROOT="/tmp/qnn/${QNN_VERSION}"
   export ANDROID_ABIS=arm64-v8a
-  GRADLE_ARGS+=" -DqnnVersion=2.28.0"
+  GRADLE_ARGS+=" -DqnnVersion=2.37.0"
 fi
 
 # Build AAR Package

.github/workflows/apple-perf.yml

Lines changed: 2 additions & 2 deletions
@@ -230,7 +230,7 @@ jobs:
   model.use_sdpa_with_kv_cache=true \
   backend.xnnpack.enabled=true \
   backend.xnnpack.extended_ops=true \
-  base.preq_mode="8da4w_output_8da8w" \
+  base.preq_mode="preq_8da4w_out_8da8w" \
   base.preq_group_size=32 \
   export.max_seq_length=2048 \
   export.max_context_length=2048 \

@@ -256,7 +256,7 @@ jobs:
   base.params="${DOWNLOADED_PATH}/params.json" \
   quantization.use_qat=true \
   base.use_lora=16 \
-  base.preq_mode="8da4w_output_8da8w" \
+  base.preq_mode="preq_8da4w_out_8da8w" \
   base.preq_group_size=32 \
   base.preq_embedding_quantize=\'8,0\' \
   model.use_sdpa_with_kv_cache=true \

backends/arm/_passes/to_tosa_memory_format_pass.py

Lines changed: 14 additions & 2 deletions
@@ -12,7 +12,6 @@
     get_first_fake_tensor,
     is_param_node,
 )
-from executorch.backends.arm.tosa_utils import is_consumer_node_depthwise_conv2d
 from executorch.exir import ExportedProgram
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult

@@ -43,6 +42,19 @@ def __init__(self, exported_program: ExportedProgram) -> None:
         self.exported_program = exported_program
         super().__init__()
 
+    @staticmethod
+    def _is_consumer_node_depthwise_conv2d(node: torch.fx.Node):
+        consumer_node = list(node.users)[0]
+        if consumer_node.target == exir_ops.edge.aten.convolution.default:
+            consumer_node_inputs = consumer_node.all_input_nodes
+            groups = consumer_node.args[-1]
+            in_channels = consumer_node_inputs[0].meta["val"].shape[1]
+            out_channels = consumer_node_inputs[1].meta["val"].shape[0]
+            if (in_channels == groups) and (out_channels % in_channels) == 0:
+                return True
+
+        return False
+
     def is_weight_node_for_depthwise_conv2d(self, node: torch.fx.Node):
         """
         returns True for w in the following sequence;

@@ -53,7 +65,7 @@ def is_weight_node_for_depthwise_conv2d(self, node: torch.fx.Node):
         consumer_node = list(node.users)[0]
         if self.is_weight_node_for_depthwise_conv2d(consumer_node):
             return True
-        if is_consumer_node_depthwise_conv2d(node):
+        if self._is_consumer_node_depthwise_conv2d(node):
            # Check that node is the weight-argument and not input or bias
            return consumer_node.args[1] == node
 
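The helper inlined above encodes the standard depthwise test: a convolution is depthwise when groups equals the input channel count and the output channel count is an integer multiple of it (the channel multiplier). Below is a minimal, runnable sketch of the same shape rule on ordinary torch.nn.Conv2d modules; the is_depthwise helper and the example layers are illustrative, not part of the commit.

import torch

def is_depthwise(conv: torch.nn.Conv2d) -> bool:
    # Depthwise: one group per input channel, and each group produces a
    # whole number of output channels. This mirrors the check the pass
    # applies to the weight's consumer node.
    return conv.in_channels == conv.groups and conv.out_channels % conv.in_channels == 0

dw = torch.nn.Conv2d(16, 32, kernel_size=3, groups=16)  # channel multiplier 2
regular = torch.nn.Conv2d(16, 32, kernel_size=3)        # groups == 1

assert is_depthwise(dw)
assert not is_depthwise(regular)

The distinction matters to this pass because TOSA orders depthwise conv2d weights differently from regular conv2d weights, so the memory format assigned to a weight node depends on what consumes it.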

backends/arm/debug/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.

backends/arm/debug/schema.py

Lines changed: 133 additions & 0 deletions
@@ -0,0 +1,133 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from __future__ import annotations
+
+import json
+
+from dataclasses import asdict, dataclass
+from typing import Any
+
+import serializer.tosa_serializer as ts  # type: ignore
+import torch
+
+from torch.fx.traceback import NodeSource
+
+
+@dataclass
+class TosaDebugSchema:
+    node_name: str
+    operator_name: str
+    operator_id: int
+
+
+@dataclass
+class ATenDebugSchema:
+    node_name: str
+    operator_name: str
+
+    @staticmethod
+    def from_node(node: torch.fx.Node) -> ATenDebugSchema:
+        # node.target is Union[Callable[..., Any], str], so we need to access this correctly depending on the type
+        if callable(node.target):
+            operator_name = node.target.__name__
+        else:
+            operator_name = node.target
+
+        return ATenDebugSchema(node_name=node.name, operator_name=operator_name)
+
+
+@dataclass
+class TorchDebugSchema:
+    stack_trace: list[str]
+    node_trace: list[dict[str, Any]] | str
+    nn_module_stack: dict[str, Any] | str
+    torch_fn: tuple[str, str] | str
+
+    @staticmethod
+    def serialize_node_trace(node_trace: list[NodeSource]) -> list[dict[str, Any]]:
+        """Flatten the from_node dictionary to remove nesting."""
+        flattened = []
+        node_stack = []
+
+        for n in node_trace:
+            node_stack.append((n, -1))
+
+        while len(node_stack) > 0:
+            node, parent_id = node_stack.pop()
+            flattened.append(
+                {
+                    "name": node.name,
+                    "target": node.target,
+                    "graph_id": node.graph_id,
+                    "pass_name": node.pass_name,
+                    "action": node._get_action_string(),
+                    "parent_graph_id": parent_id,
+                }
+            )
+
+            for n in node.from_node:
+                node_stack.append((n, node.graph_id))
+
+        return flattened
+
+    @staticmethod
+    def from_node(node: torch.fx.Node) -> TorchDebugSchema:
+        node_trace: str | list[dict[str, Any]] = "No node trace available."
+
+        if "from_node" in node.meta:
+            # Flatten the node_trace dictionary, so there is no nesting
+            node_trace = TorchDebugSchema.serialize_node_trace(node.meta["from_node"])
+
+        return TorchDebugSchema(
+            stack_trace=node.meta.get("stack_trace", "No stack trace available").split(
+                "\n"
+            ),
+            node_trace=node_trace,
+            nn_module_stack=node.meta.get(
+                "nn_module_stack", "No module stack trace available"
+            ),
+            torch_fn=node.meta.get("torch_fn", "No torch_fn available"),
+        )
+
+
+@dataclass
+class DebugSchema:
+    event_id: int
+    aten_info: ATenDebugSchema
+    tosa_info: TosaDebugSchema
+    torch_info: TorchDebugSchema
+
+
+class DebugHook:
+    def __init__(self) -> None:
+        self._debug_events: list[DebugSchema] = []
+        self.__op_id_to_name = {}
+
+        # Build up a mapping from TOSA 1.0 operator IDs to their names
+        for name, val in vars(ts.Op).items():
+            self.__op_id_to_name[val] = name
+
+    def add(self, node: torch.fx.Node, tosa_op: Any, tosa_op_id: int) -> None:
+        tosa_debug_info = TosaDebugSchema(
+            node_name=str(tosa_op),
+            operator_name=self.__op_id_to_name[tosa_op_id],
+            operator_id=tosa_op_id,
+        )
+
+        aten_debug_info = ATenDebugSchema.from_node(node)
+        torch_debug_info = TorchDebugSchema.from_node(node)
+
+        self._debug_events.append(
+            DebugSchema(
+                event_id=len(self._debug_events),
+                aten_info=aten_debug_info,
+                tosa_info=tosa_debug_info,
+                torch_info=torch_debug_info,
+            )
+        )
+
+    def serialize(self) -> str:
+        return json.dumps([asdict(event) for event in self._debug_events], indent=4)
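serialize_node_trace flattens the nested from_node provenance iteratively with an explicit stack, linking each record to its parent through parent_graph_id rather than nesting dictionaries. A minimal, runnable sketch of that traversal, using a hypothetical FakeSource stand-in for torch.fx.traceback.NodeSource that carries only the fields the traversal reads:

from dataclasses import dataclass, field
from typing import Any

@dataclass
class FakeSource:
    # Stand-in for torch.fx.traceback.NodeSource (illustrative only).
    name: str
    graph_id: int
    from_node: list["FakeSource"] = field(default_factory=list)

def flatten(trace: list[FakeSource]) -> list[dict[str, Any]]:
    flattened: list[dict[str, Any]] = []
    stack = [(n, -1) for n in trace]  # roots have no parent graph
    while stack:
        node, parent_id = stack.pop()
        flattened.append(
            {"name": node.name, "graph_id": node.graph_id, "parent_graph_id": parent_id}
        )
        # Children are linked back to this node via its graph_id.
        stack.extend((child, node.graph_id) for child in node.from_node)
    return flattened

leaf = FakeSource("x", graph_id=2)
root = FakeSource("add", graph_id=1, from_node=[leaf])
print(flatten([root]))
# [{'name': 'add', 'graph_id': 1, 'parent_graph_id': -1},
#  {'name': 'x', 'graph_id': 2, 'parent_graph_id': 1}]

Presumably a DebugHook is created once per TOSA lowering, add(...) is called as each FX node is serialized, and serialize() emits the collected events as JSON; that wiring lives outside this file.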
