pytorch
diff --git a/‎.ci/scripts/build-qnn-sdk.sh‎
Lines changed: 4 additions & 2 deletions b/‎.ci/scripts/build-qnn-sdk.sh‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎.ci/scripts/setup-qnn-deps.sh‎
Lines changed: 1 addition & 0 deletions b/‎.ci/scripts/setup-qnn-deps.sh‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.ci/scripts/test_llama.sh‎
Lines changed: 5 additions & 1 deletion b/‎.ci/scripts/test_llama.sh‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎.ci/scripts/test_qnn_static_llama.sh‎
Lines changed: 6 additions & 1 deletion b/‎.ci/scripts/test_qnn_static_llama.sh‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎.github/workflows/android-perf.yml‎
Lines changed: 2 additions & 2 deletions b/‎.github/workflows/android-perf.yml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎.github/workflows/android-release-artifacts.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/android-release-artifacts.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/apple-perf.yml‎
Lines changed: 2 additions & 2 deletions b/‎.github/workflows/apple-perf.yml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎backends/apple/mps/serialization/mps_graph_serialize.py‎
Lines changed: 6 additions & 2 deletions b/‎backends/apple/mps/serialization/mps_graph_serialize.py‎
Lines changed: 6 additions & 2 deletions
diff --git a/‎backends/arm/CMakeLists.txt‎
Lines changed: 3 additions & 2 deletions b/‎backends/arm/CMakeLists.txt‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎backends/arm/README.md‎
Lines changed: 48 additions & 2 deletions b/‎backends/arm/README.md‎
Lines changed: 48 additions & 2 deletions
@@ -11,8 +11,10 @@ set -o xtrace
 
 build_qnn_backend() {
   echo "Start building qnn backend."
-  export ANDROID_NDK_ROOT=${ANDROID_NDK_ROOT:-/opt/ndk}
-  export QNN_SDK_ROOT=${QNN_SDK_ROOT:-/tmp/qnn/2.28.0.241029}
+  # Source the QNN SDK install script (expected to provide setup_android_ndk and install_qnn)
+  source "$(dirname "${BASH_SOURCE[0]}")/../../backends/qualcomm/scripts/install_qnn_sdk.sh"
+  setup_android_ndk
+  install_qnn
   export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." && pwd)"
 
   parallelism=$(( $(nproc) - 1 ))
 
@@ -10,4 +10,5 @@ set -ex
 source "$(dirname "${BASH_SOURCE[0]}")/../../backends/qualcomm/scripts/install_qnn_sdk.sh"
 
 setup_libcpp 12
+setup_android_ndk
 install_qnn
@@ -119,8 +119,12 @@ echo "COREML option ${COREML}"
 
 if [[ "${MODE}" =~ .*qnn.* ]]; then
   QNN=ON
+
+  # Download QNN_SDK. If already downloaded, export environment path
+  source "$(dirname "${BASH_SOURCE[0]}")/../../backends/qualcomm/scripts/install_qnn_sdk.sh"
+  install_qnn
+
   export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)"
-  export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
   export LD_LIBRARY_PATH="${QNN_SDK_ROOT}/lib/x86_64-linux-clang"
   export PYTHONPATH=".."
   cp schema/program.fbs exir/_serialize/program.fbs
 
@@ -9,8 +9,13 @@ set -euxo pipefail
 
 source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
 
+# Source QNN configuration
+source "$(dirname "${BASH_SOURCE[0]}")/../../backends/qualcomm/scripts/qnn_config.sh"
+# Download QNN_SDK. If already downloaded, export environment path
+source "$(dirname "${BASH_SOURCE[0]}")/../../backends/qualcomm/scripts/install_qnn_sdk.sh"
+install_qnn
+
 export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)"
-export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
 export LD_LIBRARY_PATH="${QNN_SDK_ROOT}/lib/x86_64-linux-clang"
 export PYTHONPATH=".."
 cp schema/program.fbs exir/_serialize/program.fbs
 
@@ -292,7 +292,7 @@ jobs:
                       export.output_name="${OUT_ET_MODEL_NAME}.pte"
                     ls -lh "${OUT_ET_MODEL_NAME}.pte"
                 elif [[ ${{ matrix.config }} == "llama3_qnn_htp" ]]; then
-                    export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
+                    export QNN_SDK_ROOT=/tmp/qnn/2.37.0.25072
                     export LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang/
                     export PYTHONPATH=$(pwd)/..
 
@@ -432,7 +432,7 @@ jobs:
         PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
 
         mkdir -p aar-out
-        PYTHON_EXECUTABLE=python ANDROID_ABIS="arm64-v8a" BUILD_AAR_DIR=aar-out EXECUTORCH_BUILD_QNN=ON QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029 EXECUTORCH_ANDROID_PROFILING=ON bash scripts/build_android_library.sh
+        PYTHON_EXECUTABLE=python ANDROID_ABIS="arm64-v8a" BUILD_AAR_DIR=aar-out EXECUTORCH_BUILD_QNN=ON QNN_SDK_ROOT=/tmp/qnn/2.37.0.25072 EXECUTORCH_ANDROID_PROFILING=ON bash scripts/build_android_library.sh
         mkdir -p extension/benchmark/android/benchmark/app/libs
         cp aar-out/executorch.aar extension/benchmark/android/benchmark/app/libs
         pushd extension/benchmark/android/benchmark
 
@@ -104,7 +104,7 @@ jobs:
           source backends/qualcomm/scripts/qnn_config.sh
           export QNN_SDK_ROOT="/tmp/qnn/${QNN_VERSION}"
           export ANDROID_ABIS=arm64-v8a
-          GRADLE_ARGS+=" -DqnnVersion=2.28.0"
+          GRADLE_ARGS+=" -DqnnVersion=2.37.0"
         fi
 
         # Build AAR Package
 
@@ -230,7 +230,7 @@ jobs:
                 model.use_sdpa_with_kv_cache=true \
                 backend.xnnpack.enabled=true \
                 backend.xnnpack.extended_ops=true \
-                base.preq_mode="8da4w_output_8da8w" \
+                base.preq_mode="preq_8da4w_out_8da8w" \
                 base.preq_group_size=32 \
                 export.max_seq_length=2048 \
                 export.max_context_length=2048 \
@@ -256,7 +256,7 @@ jobs:
                 base.params="${DOWNLOADED_PATH}/params.json" \
                 quantization.use_qat=true \
                 base.use_lora=16 \
-                base.preq_mode="8da4w_output_8da8w" \
+                base.preq_mode="preq_8da4w_out_8da8w" \
                 base.preq_group_size=32 \
                 base.preq_embedding_quantize=\'8,0\' \
                 model.use_sdpa_with_kv_cache=true \
 
@@ -1,14 +1,16 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
+# Copyright 2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import importlib.resources as _resources
 import json
 import os
 import tempfile
 
-import pkg_resources
+import executorch.backends.apple.mps.serialization as serialization_package
 from executorch.backends.apple.mps.serialization.mps_graph_schema import MPSGraph
 from executorch.exir._serialize._dataclass import _DataclassEncoder
 from executorch.exir._serialize._flatbuffer import _flatc_compile
@@ -19,7 +21,9 @@ def convert_to_flatbuffer(mps_graph: MPSGraph) -> bytes:
     with tempfile.TemporaryDirectory() as d:
         schema_path = os.path.join(d, "schema.fbs")
         with open(schema_path, "wb") as schema_file:
-            schema_file.write(pkg_resources.resource_string(__name__, "schema.fbs"))
+            schema_file.write(
+                _resources.read_binary(serialization_package, "schema.fbs")
+            )
         json_path = os.path.join(d, "schema.json")
         with open(json_path, "wb") as json_file:
             json_file.write(mps_graph_json.encode("ascii"))
 
@@ -73,9 +73,10 @@ if(EXECUTORCH_BUILD_VGF)
   # vgf backend
   list(TRANSFORM _vgf_backend_sources PREPEND "${EXECUTORCH_ROOT}/")
   add_library(vgf_backend ${_vgf_backend_sources})
+  install(TARGETS vgf_backend EXPORT ExecuTorchTargets)
   target_include_directories(
-    vgf_backend PUBLIC ${_common_include_directories} ${VULKAN_HEADERS_PATH}
-                       ${VOLK_HEADERS_PATH}
+    vgf_backend PRIVATE ${_common_include_directories} ${VULKAN_HEADERS_PATH}
+                        ${VOLK_HEADERS_PATH}
   )
   target_compile_options(
     vgf_backend PRIVATE -DUSE_VULKAN_WRAPPER -DUSE_VULKAN_VOLK
 
@@ -88,6 +88,19 @@ You can test to run some models with the full fvp test flow
 backends/arm/test/test_arm_baremetal.sh test_full_ethosu_fvp
 ```
 
+To run the unit test suite with VKML, use the following. Note that the Vulkan SDK needs to be installed.
+See install_vulkan_sdk() in .ci/scripts/setup-vulkan-linux-deps.sh for how to install the Vulkan SDK.
+
+```
+backends/arm/test/test_arm_baremetal.sh test_pytest_vkml
+```
+
+You can also run some models through the full VKML flow:
+
+```
+backends/arm/test/test_arm_baremetal.sh test_full_vkml
+```
+
 ## Unit tests
 
 This is the structure of the test directory
@@ -102,6 +115,7 @@ test                            #  Root test folder
 ├── tosautil                    #  Utility functions for TOSA artifacts
 ├ common.py                     #  Common functions and definitions used by many tests
 ├ setup_testing.sh              #  Script to prepare testing for using the Corstone 3x0 FVP
+├ setup_testing_vkml.sh         #  Script to prepare testing for using VKML
 ├ test_arm_baremetal.sh         #  Help script to trigger testing
 ```
 
@@ -123,7 +137,7 @@ first you need to build and prepare some used target libs
 
 ```
 examples/arm/run.sh --model_name=add --build_only
-backends/arm/test/setup_testing.sh
+backends/arm/test/setup_testing.sh and/or backends/arm/test/setup_testing_vkml.sh
 ```
 
 Then you can run the tests with
@@ -195,6 +209,38 @@ List of model specific and optional passes:
 - InsertCastForOpsWithInt64InputPass
     - Functionality:
         - For LLMs such as Llama, some operators like aten.embedding have int64 input. In order to lower these operators to TOSA, this pass will insert a casting node that converts the input from int64 to int32.
-        - Example usage: backends/arm/test/models/test_llama.py
     - Supported Ops:
         - aten.embedding.default, aten.slice_copy.Tensor
+    - Example usage:
+        - backends/arm/test/models/test_llama.py
+
+- ConvertInt64ConstOpsToInt32Pass
+    - Functionalities:
+      - Rewrites constant-producing ops that output int64 to instead output int32, when values are within int32 bounds.
+    - Supported Ops:
+      - `torch.full`, `torch.arange`, `torch.eye`, `torch.linspace`, `torch.tensor`
+    - Example usage:
+        - backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py
+        - backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py
+
+- ConvertInt64OutputOpsToInt32Pass
+    - Overview:
+      - Rewrites or removes operations that produce int64 outputs, converting them to int32 where possible.
+      - Overflow checks are applied selectively; for ops without such checks, users need to ensure values fit within the int32 range.
+    - Functionalities:
+        1. Handling casting to int64:
+            - (1) int32 -> int64:
+                - Removes the cast and redirects uses of the int64 value to the int32 value
+            - (2) other types -> int64:
+                - Rewrites the cast to other types -> int32
+            - Supported Ops:
+              - torch.ops.aten.to.\[dtype|dtype_layout\]
+              - exir_ops.edge.dim_order_ops._to_dim_order_copy.default
+        2. Post-process argmax outputs:
+            - Inserts an int64->int32 cast after argmax operations that produce int64 outputs.
+            - Supported Ops:
+              - torch.ops.aten.argmax.default
+              - exir_ops.edge.aten.argmax.default
+    - Example usage:
+      - (Functionality 1) backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py
+      - (Functionality 2) backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py