
Commit a07d815

Merge branch 'main' into SuppportUnaryLogInXNNPACKDelegate
2 parents: 6240d2f + 851b373

162 files changed (+4274 additions, -1737 deletions)


.buckconfig

Lines changed: 1 addition & 0 deletions
@@ -39,6 +39,7 @@
 
 [buck2]
 restarter=true
+file_watcher=notify
 
 [oss]
 folly_cxx_tests = False

.ci/scripts/build-qnn-sdk.sh

File mode changed: 100644 -> 100755
Lines changed: 4 additions & 9 deletions
@@ -11,17 +11,12 @@ set -o xtrace
 
 build_qnn_backend() {
   echo "Start building qnn backend."
-  export ANDROID_NDK_ROOT=/opt/ndk
-  export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
+  export ANDROID_NDK_ROOT=${ANDROID_NDK_ROOT:-/opt/ndk}
+  export QNN_SDK_ROOT=${QNN_SDK_ROOT:-/tmp/qnn/2.28.0.241029}
   export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." && pwd)"
 
-  # Workaround to avoid issues around missing flatccrt library (depending on the
-  # number of jobs used), see issue #7300:
-  # Build twice (second time with `--no_clean`) to make sure libflatccrt.a is
-  # available.
-  # TODO: Remove this workaround once the underlying issue is fixed.
-  bash backends/qualcomm/scripts/build.sh --skip_aarch64 --job_number 2 --release || \
-    bash backends/qualcomm/scripts/build.sh --skip_aarch64 --job_number 2 --release --no_clean
+  parallelism=$(( $(nproc) - 1 ))
+  bash backends/qualcomm/scripts/build.sh --skip_aarch64 --job_number ${parallelism} --release
 }
 
 set_up_aot() {
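
For context on the two shell idioms used above, here is a minimal Python sketch of the same behavior — environment variables with overridable defaults, and a leave-one-core-free job count. The snippet is illustrative only and not part of the commit; the paths mirror the script's defaults.

import os

# Python analogue of ${ANDROID_NDK_ROOT:-/opt/ndk}: use the environment value
# if set, otherwise fall back to the default.
android_ndk_root = os.environ.get("ANDROID_NDK_ROOT", "/opt/ndk")
qnn_sdk_root = os.environ.get("QNN_SDK_ROOT", "/tmp/qnn/2.28.0.241029")

# Analogue of parallelism=$(( $(nproc) - 1 )): leave one core free,
# clamped so at least one job always runs.
parallelism = max(1, (os.cpu_count() or 2) - 1)
print(f"--job_number {parallelism}")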

CMakeLists.txt

Lines changed: 14 additions & 10 deletions
@@ -48,21 +48,33 @@ project(executorch)
 # MARK: - Start EXECUTORCH_H12025_BUILD_MIGRATION --------------------------------------------------
 
 include(${PROJECT_SOURCE_DIR}/tools/cmake/common/preset.cmake)
+include(${PROJECT_SOURCE_DIR}/tools/cmake/Utils.cmake)
+include(CMakeDependentOption)
+include(ExternalProject)
 
 if(NOT CMAKE_CXX_STANDARD)
   set(CMAKE_CXX_STANDARD 17)
 endif()
 announce_configured_options(CMAKE_CXX_STANDARD)
 
+if(NOT CMAKE_SYSTEM_PROCESSOR)
+  set(CMAKE_SYSTEM_PROCESSOR ${CMAKE_HOST_SYSTEM_PROCESSOR})
+endif()
+announce_configured_options(CMAKE_SYSTEM_PROCESSOR)
+
 if(NOT CMAKE_BUILD_TYPE)
   set(CMAKE_BUILD_TYPE Debug)
 endif()
 announce_configured_options(CMAKE_BUILD_TYPE)
 
+if(NOT PYTHON_EXECUTABLE)
+  resolve_python_executable()
+endif()
+announce_configured_options(PYTHON_EXECUTABLE)
+
 announce_configured_options(CMAKE_CXX_COMPILER_ID)
 announce_configured_options(CMAKE_TOOLCHAIN_FILE)
 announce_configured_options(BUCK2)
-announce_configured_options(PYTHON_EXECUTABLE)
 
 load_build_preset()
 include(${PROJECT_SOURCE_DIR}/tools/cmake/preset/default.cmake)
@@ -72,10 +84,6 @@ print_configured_options()
 
 # MARK: - End EXECUTORCH_H12025_BUILD_MIGRATION ----------------------------------------------------
 
-include(tools/cmake/Utils.cmake)
-include(CMakeDependentOption)
-include(ExternalProject)
-
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
 # Setup RPATH.
@@ -251,11 +259,6 @@ if(EXECUTORCH_BUILD_TESTS)
   include(CTest)
 endif()
 
-if(NOT PYTHON_EXECUTABLE)
-  resolve_python_executable()
-endif()
-message(STATUS "Using python executable '${PYTHON_EXECUTABLE}'")
-
 # TODO(dbort): Fix these warnings and remove this flag.
 set(_common_compile_options -Wno-deprecated-declarations -fPIC)
@@ -579,6 +582,7 @@ if(EXECUTORCH_BUILD_PYBIND)
     ${TORCH_PYTHON_LIBRARY}
     bundled_program
     etdump
+    flatccrt
     executorch
     extension_data_loader
     util

CMakePresets.json

Lines changed: 15 additions & 0 deletions
@@ -15,6 +15,7 @@
     },
     {
       "name": "macos-arm64",
+      "displayName": "Build everything buildable on macOS arm64",
       "inherits": ["common"],
       "generator": "Xcode",
       "cacheVariables": {
@@ -28,6 +29,20 @@
         "type": "equals",
         "rhs": "Darwin"
       }
+    },
+    {
+      "name": "pybind",
+      "displayName": "Build pybindings exported in the wheel",
+      "inherits": ["common"],
+      "cacheVariables": {
+        "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/pybind.cmake",
+        "CMAKE_OSX_DEPLOYMENT_TARGET": "10.15"
+      },
+      "condition": {
+        "type": "inList",
+        "string": "${hostSystemName}",
+        "list": ["Darwin", "Linux", "Windows"]
+      }
     }
   ]
 }

backends/apple/mps/CMakeLists.txt

Lines changed: 0 additions & 4 deletions
@@ -18,10 +18,6 @@ endif()
 
 include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
 
-if(NOT PYTHON_EXECUTABLE)
-  resolve_python_executable()
-endif()
-
 set(_common_compile_options -Wno-deprecated-declarations)
 set(_common_include_directories ${EXECUTORCH_ROOT}/..)
 

backends/arm/_passes/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -8,6 +8,7 @@
 from .annotate_channels_last_dim_order_pass import AnnotateChannelsLastDimOrder  # noqa
 from .annotate_decomposed_matmul import AnnotateDecomposedMatmulPass  # noqa
 from .arm_pass import ArmPass  # noqa
+from .broadcast_args_pass import BroadcastArgsPass  # noqa
 from .cast_int64_pass import CastInt64BuffersToInt32Pass  # noqa
 from .cast_to_int32_pass import CastToInt32Pass  # noqa
 from .conv1d_unsqueeze_pass import Conv1dUnsqueezePass  # noqa
@@ -24,6 +25,7 @@
 from .decompose_gelu_pass import DecomposeGeluPass  # noqa
 from .decompose_layernorm_pass import DecomposeLayerNormPass  # noqa
 from .decompose_leaky_relu_pass import DecomposeLeakyReLUPass  # noqa
+from .decompose_linalg_vector_norm_pass import DecomposeLinearVectorNormPass  # noqa
 from .decompose_linear_pass import DecomposeLinearPass  # noqa
 from .decompose_meandim_pass import DecomposeMeanDimPass  # noqa
 from .decompose_ne_pass import DecomposeNotEqualPass  # noqa

backends/arm/_passes/arm_pass_manager.py

Lines changed: 7 additions & 0 deletions
@@ -10,6 +10,7 @@
 from executorch.backends.arm._passes import (
     AnnotateChannelsLastDimOrder,
     AnnotateDecomposedMatmulPass,
+    BroadcastArgsPass,
     CastInt64BuffersToInt32Pass,
     CastToInt32Pass,
     ComputeConstantOpsAOT,
@@ -29,6 +30,7 @@
     DecomposeLayerNormPass,
     DecomposeLeakyReLUPass,
     DecomposeLinearPass,
+    DecomposeLinearVectorNormPass,
     DecomposeMeanDimPass,
     DecomposeNotEqualPass,
     DecomposeSelectPass,
@@ -86,6 +88,7 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(ConvertSplitToSlicePass())
         self.add_pass(ConvertMmToBmmPass())
         self.add_pass(DecomposeLinearPass())
+        self.add_pass(DecomposeLinearVectorNormPass())
         self.add_pass(DecomposeMeanDimPass())
         self.add_pass(ConvertFullLikeToFullPass())
         self.add_pass(ConvertToClampPass())
@@ -102,6 +105,8 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(RetraceFoldedDtypesPass())
         self.add_pass(UnsqueezeScalarPlaceholdersPass(exported_program))
         self.add_pass(MatchArgRanksPass(exported_program))
+        if self.tosa_spec.is_U55_subset:
+            self.add_pass(BroadcastArgsPass())
         self.add_pass(ComputeConstantOpsAOT(exported_program))
 
         self.add_pass(RemoveClonePass())
@@ -133,6 +138,7 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(FuseBatchnorm2DPass(exported_program))
         self.add_pass(ConvertMmToBmmPass())
         self.add_pass(DecomposeLinearPass())
+        self.add_pass(DecomposeLinearVectorNormPass())
         self.add_pass(DecomposeLeakyReLUPass())
         self.add_pass(DecomposeBatchNormPass())
         self.add_pass(DecomposeLayerNormPass())
@@ -207,6 +213,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_pass(DecomposeCosineSimilarityPass())
         self.add_pass(DecomposeDivPass())
         self.add_pass(DecomposeLeakyReLUPass())
+        self.add_pass(DecomposeLinearVectorNormPass())
         self.add_pass(DecomposeSqrtPass())
         self.add_pass(DecomposeSiluPass())
 

backends/arm/_passes/broadcast_args_pass.py (new file)

Lines changed: 63 additions & 0 deletions

@@ -0,0 +1,63 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from executorch.backends.arm._passes import ArmPass
+
+from executorch.backends.arm._passes.arm_pass_utils import (
+    create_node,
+    get_first_fake_tensor,
+)
+
+from executorch.exir.dialects._ops import ops as exir_ops
+
+from executorch.exir.pass_base import PassResult
+from torch.fx import GraphModule, Node
+
+
+class BroadcastArgsPass(ArmPass):
+    """
+    Pass to manually broadcast arguments by inserting repeats.
+    This is done when more than one arg needs broadcasting.
+    """
+
+    targeted_ops = {
+        exir_ops.edge.aten.add.Tensor,
+        exir_ops.edge.aten.sub.Tensor,
+        # mul indirectly targets div as well, since div is decomposed into
+        # reciprocal + mul
+        exir_ops.edge.aten.mul.Tensor,
+    }
+
+    def call(self, graph_module: GraphModule) -> PassResult:
+        for node in graph_module.graph.nodes:
+            if node.op != "call_function" or node.target not in self.targeted_ops:
+                continue
+
+            output_shape = get_first_fake_tensor(node).shape
+            nbr_of_broadcasts = 0
+            for arg in node.args:
+                if not isinstance(arg, Node):
+                    continue
+
+                shape = get_first_fake_tensor(arg).shape
+                if shape != output_shape:
+                    nbr_of_broadcasts += 1
+                    # Only broadcasts beyond the first one are materialized
+                    # with an explicit repeat.
+                    if nbr_of_broadcasts > 1:
+                        multiples = [
+                            int(output_shape[d] / shape[d])
+                            for d in range(len(output_shape))
+                        ]
+                        with graph_module.graph.inserting_before(node):
+                            repeat = create_node(
+                                graph_module.graph,
+                                exir_ops.edge.aten.repeat.default,
+                                args=(arg, multiples),
+                                kwargs={},
+                                from_node=node,
+                            )
+                            node.replace_input_with(arg, repeat)
+
+        graph_module.recompile()
+        graph_module = super().call(graph_module).graph_module
+        return PassResult(graph_module, True)
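
A small sketch in plain torch of the arithmetic the pass performs, using hypothetical shapes (illustration only, not part of the commit): repeating a (1, 3, 1) tensor by the computed multiples materializes the same values that implicit broadcasting to (2, 3, 4) would produce.

import torch

# Hypothetical arg/output shapes, as the pass might see on an add node.
arg = torch.randn(1, 3, 1)
output_shape = (2, 3, 4)

# Same multiples computation as BroadcastArgsPass: copies needed per dim.
multiples = [output_shape[d] // arg.shape[d] for d in range(len(output_shape))]
assert multiples == [2, 1, 4]

# An explicit repeat produces exactly what implicit broadcasting would.
repeated = arg.repeat(*multiples)
assert repeated.shape == torch.Size(output_shape)
assert torch.equal(repeated, arg.expand(output_shape))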
backends/arm/_passes/decompose_linalg_vector_norm_pass.py (new file)

Lines changed: 78 additions & 0 deletions

@@ -0,0 +1,78 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.exir.pass_base import ExportPass
+
+
+class DecomposeLinearVectorNormPass(ExportPass):
+    """
+    This pass decomposes aten.linalg_vector_norm.default into more primitive ops.
+    We need to add this pass before quantization for graph annotation.
+    By default, the aten.linalg_vector_norm op is decomposed during legalization
+    to Edge IR.
+
+    The decomposition is as follows:
+
+    For p == 1:
+        out = REDUCE_SUM(ABS(x), dims, keepdim)
+
+    For p == 2:
+        out = SQRT(REDUCE_SUM(MUL(x, x), dims, keepdim))
+
+    For arbitrary p:
+        We don't support arbitrary p, because the decomposition would look like
+            out = POW(REDUCE_SUM(POW(ABS(x), p), dims, keepdim), 1/p)
+        In that case we would need to wrap p in a Tensor and know its dtype
+        up front, which we cannot tell from the FX graph.
+    """
+
+    torch_linalg_vector_norm = (torch.ops.aten.linalg_vector_norm.default,)
+
+    def call_operator(self, op, args, kwargs, meta):
+        if op not in self.torch_linalg_vector_norm:
+            return super().call_operator(op, args, kwargs, meta)
+
+        # Extract inputs and optional arguments.
+        # Expected args:
+        #   args[0]: input tensor
+        #   args[1]: norm order 'p' (optional, default: 2.0)
+        #   args[2]: dimensions to reduce (must be provided)
+        #   args[3]: keepdim flag (optional, default: False)
+        input_tensor = args[0]
+        norm_order = args[1] if len(args) > 1 else 2.0
+        norm_dim = args[2] if len(args) > 2 else None
+        keepdim = args[3] if len(args) > 3 else False
+
+        if norm_order not in (1, 2):
+            raise ValueError(
+                f"Order {norm_order} is not supported for the "
+                "linalg_vector_norm operator"
+            )
+
+        if norm_dim is None:
+            raise ValueError("The norm_dim for linalg_vector_norm is None.")
+
+        dims = [norm_dim] if isinstance(norm_dim, int) else list(norm_dim)
+
+        # Decomposition based on norm order.
+        if norm_order == 1:
+            # For p == 1, decomposition is sum(abs(x), dims, keepdim).
+            op1 = super().call_operator(
+                torch.ops.aten.abs.default, (input_tensor,), {}, meta
+            )
+            op2 = super().call_operator(
+                torch.ops.aten.sum.dim_IntList, (op1, dims, keepdim), {}, meta
+            )
+            return op2
+
+        elif norm_order == 2:
+            # For p == 2, decomposition is sqrt(sum(x * x, dims, keepdim)).
+            op1 = super().call_operator(
+                torch.ops.aten.mul.Tensor, (input_tensor, input_tensor), {}, meta
+            )
+            op2 = super().call_operator(
+                torch.ops.aten.sum.dim_IntList, (op1, dims, keepdim), {}, meta
+            )
+            op3 = super().call_operator(torch.ops.aten.sqrt.default, (op2,), {}, meta)
+            return op3
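
The two supported decompositions can be sanity-checked numerically against the original operator; a minimal sketch in plain torch with arbitrary shapes (illustration only, not part of the commit):

import torch

x = torch.randn(4, 5)

# p == 1: out = REDUCE_SUM(ABS(x), dims, keepdim)
ref1 = torch.linalg.vector_norm(x, ord=1, dim=1, keepdim=True)
dec1 = torch.sum(torch.abs(x), dim=1, keepdim=True)
assert torch.allclose(ref1, dec1)

# p == 2: out = SQRT(REDUCE_SUM(MUL(x, x), dims, keepdim))
ref2 = torch.linalg.vector_norm(x, ord=2, dim=1, keepdim=True)
dec2 = torch.sqrt(torch.sum(x * x, dim=1, keepdim=True))
assert torch.allclose(ref2, dec2)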

backends/arm/arm_vela.py

Lines changed: 2 additions & 2 deletions
@@ -73,8 +73,8 @@ def vela_compile(tosa_flatbuffer: bytes, args: List[str], verbose: bool = False)
         np_path = os.path.join(tmpdir, "output", "out_vela.npz")
     else:
         np_path = os.path.join(tmpdir, "output", "out_sg0_vela.npz")
-    blocks = b""
 
+    blocks = b""
     with np.load(np_path, allow_pickle=False) as data:
         # Construct our modified output_blocks with data in a form easily
         # digested on the device side
@@ -92,7 +92,7 @@ def vela_compile(tosa_flatbuffer: bytes, args: List[str], verbose: bool = False)
         if not isinstance(data["scratch_shape"][0], np.int64):
            raise RuntimeError("Expected scratch to be int64")
         block_length = int(data["scratch_shape"][0])
-        bin_blocks["scratch_data"] = b"\x00" * block_length
+        bin_blocks["scratch_size"] = struct.pack("<I", block_length)
 
         # Capture inputs and outputs
         bin_blocks["inputs"] = vela_bin_pack_io("input", data)
