@@ -555,11 +555,11 @@ jobs:
     strategy:
       matrix:
         hf_model_id: [
-          google/gemma-2-2b,
-          Qwen/Qwen2.5-0.5B,
+          google/gemma-3-1b-it,
+          Qwen/Qwen3-0.6B,
           HuggingFaceTB/SmolLM2-135M,
           meta-llama/Llama-3.2-1B,
-          allenai/OLMo-1B-hf
+          allenai/OLMo-1B-hf,
         ]
       fail-fast: false
     with:
@@ -569,44 +569,102 @@ jobs:
       submodules: 'recursive'
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       timeout: 90
+      upload-artifact: profiling-artifacts-${{ strategy.job-index }}
       script: |
         echo "::group::Set up ExecuTorch"
         # The generic Linux job chooses to use base env, not the one setup by the image
         CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
         conda activate "${CONDA_ENV}"
         PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake
+        # Build executor_runner with ETDump enabled
+        PYTHON_EXECUTABLE=python cmake -DPYTHON_EXECUTABLE=python \
+          -DCMAKE_INSTALL_PREFIX=cmake-out \
+          -DEXECUTORCH_ENABLE_LOGGING=1 \
+          -DCMAKE_BUILD_TYPE=Release \
+          -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+          -DEXECUTORCH_BUILD_XNNPACK=ON \
+          -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
+          -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
+          -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
+          -DEXECUTORCH_BUILD_DEVTOOLS=ON \
+          -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
+          -Bcmake-out .
+        cmake --build cmake-out -j16 --target install --config Release
         echo "::endgroup::"

         echo "::group::Set up Hugging Face"
         pip install -U "huggingface_hub[cli]"
         huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
         git clone https://github.com/huggingface/optimum-executorch
-        cd optimum-executorch
+        pushd optimum-executorch
         # There is no release yet; for CI stability, always test from the same commit on main
-        git checkout 577a2b19670e4c643a5c6ecb09bf47b9a699e7c6
+        git checkout da80c9e35b3db5c7eea8731b7d660482fb4870a8
         pip install .[tests]
+        popd
+
+        if [ "${{ matrix.hf_model_id }}" == "google/gemma-3-1b-it" ]; then
+          # The fixes for gemma-3 are not yet in a released transformers version
+          git clone https://github.com/huggingface/transformers.git
+          pushd transformers
+          git checkout a57274466f7f72efaa2662d1738cdaf28ae8071f
+          pip install -e .
+          popd
+        fi
         pip list
         echo "::endgroup::"

-        echo "::group::Export and Run ${{ matrix.hf_model_id }}"
+        echo "::group::Export to ExecuTorch"
         # Pass matrix variable as environment variable
         export MODEL_ID="${{ matrix.hf_model_id }}"
+        export OUTPUT_DIR="$(pwd)/${MODEL_ID}_custom_sdpa_8da4w"
+        pushd optimum-executorch
+
+        optimum-cli export executorch \
+          --model ${MODEL_ID} \
+          --task text-generation \
+          --recipe xnnpack \
+          --use_custom_sdpa \
+          --output_dir ${OUTPUT_DIR} \
+          --qlinear
+
+        ls -FlAGhp ${OUTPUT_DIR}
+        popd
+        echo "::endgroup::"
+
+        echo "::group::Inference using python API"
+        pushd optimum-executorch
         python -c "
         import os
         from optimum.executorch import ExecuTorchModelForCausalLM
         from transformers import AutoTokenizer

         model_id = os.getenv('MODEL_ID')
-        print(f'Loading model: {model_id}')
-        model = ExecuTorchModelForCausalLM.from_pretrained(model_id, recipe='xnnpack')
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        pte_dir = os.getenv('OUTPUT_DIR')
+        print(f'Loading model {model_id} from {pte_dir}.')
+        model = ExecuTorchModelForCausalLM.from_pretrained(pte_dir)
         generated_text = model.text_generation(
-          tokenizer=tokenizer,
+          tokenizer=AutoTokenizer.from_pretrained(model_id),
           prompt='Simply put, the theory of relativity states that',
           max_seq_len=64
         )
         print(generated_text)
         "
+        popd
+        echo "::endgroup::"
+
+        echo "::group::Inference using executor_runner with ETDump"
+        ./cmake-out/executor_runner \
+          --model_path ${OUTPUT_DIR}/model.pte \
+          --etdump_path ${OUTPUT_DIR}/etdump.etdp
+
+        export TSV_PATH=artifacts-to-be-uploaded/${MODEL_ID}_op_prof.tsv
+        mkdir -p $(dirname "$TSV_PATH")
+        python3 -m devtools.inspector.inspector_cli \
+          --etdump_path ${OUTPUT_DIR}/etdump.etdp \
+          --tsv_path ${TSV_PATH}
+
         echo "::endgroup::"

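For local debugging, the export step of this matrix can be reproduced outside CI. The sketch below is illustrative only, not part of the workflow; it assumes optimum-executorch is installed, optimum-cli is on PATH, and you are logged in to Hugging Face for the gated checkpoints. It fans out over the same hf_model_id values and passes the same flags as the workflow:

    # Sketch: reproduce the CI export matrix locally (not part of the workflow).
    import os
    import subprocess

    # Mirrors the workflow's hf_model_id matrix.
    MODEL_IDS = [
        "google/gemma-3-1b-it",
        "Qwen/Qwen3-0.6B",
        "HuggingFaceTB/SmolLM2-135M",
        "meta-llama/Llama-3.2-1B",
        "allenai/OLMo-1B-hf",
    ]

    for model_id in MODEL_IDS:
        # Same naming scheme as OUTPUT_DIR in the workflow.
        output_dir = os.path.join(os.getcwd(), f"{model_id}_custom_sdpa_8da4w")
        subprocess.run(
            [
                "optimum-cli", "export", "executorch",
                "--model", model_id,
                "--task", "text-generation",
                "--recipe", "xnnpack",
                "--use_custom_sdpa",
                "--output_dir", output_dir,
                "--qlinear",
            ],
            check=True,
        )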
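The inline python -c inference block can likewise be kept as a standalone script. With MODEL_ID and OUTPUT_DIR set as in the workflow, it is equivalent to:

    # Standalone equivalent of the workflow's inline inference step.
    import os

    from optimum.executorch import ExecuTorchModelForCausalLM
    from transformers import AutoTokenizer

    model_id = os.environ["MODEL_ID"]   # e.g. "HuggingFaceTB/SmolLM2-135M"
    pte_dir = os.environ["OUTPUT_DIR"]  # directory containing model.pte

    # Load the already-exported program instead of re-exporting from the Hub.
    model = ExecuTorchModelForCausalLM.from_pretrained(pte_dir)
    generated_text = model.text_generation(
        tokenizer=AutoTokenizer.from_pretrained(model_id),
        prompt="Simply put, the theory of relativity states that",
        max_seq_len=64,
    )
    print(generated_text)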
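Finally, the ETDump written by executor_runner does not have to go through inspector_cli; the devtools Inspector API exposes the same per-operator timings programmatically. A minimal sketch, assuming the executorch Python package is installed and OUTPUT_DIR matches the export above:

    # Sketch: inspect the ETDump produced by executor_runner programmatically.
    import os

    from executorch.devtools import Inspector

    etdump_path = os.path.join(os.environ["OUTPUT_DIR"], "etdump.etdp")

    # Without an ETRecord, only runtime event data (op names, timings) is shown.
    inspector = Inspector(etdump_path=etdump_path)

    # Roughly the same per-operator profile that the TSV artifact contains.
    inspector.print_data_tabular()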