Commit 00af94d

Update base for Update on "[Executorch][llm] Add ring buffer based kv cache and mask calculation to MHA"
Leveraging previous work, we now allow MHA to use a ring buffer based KV cache. When the ring buffer cache is used, the attention mask is queried from the KV cache and used for SDPA instead of a precalculated mask. In the process, the ring buffer implementation had to be adjusted so that it keeps the context of the full sliding window; see the code comments.

Differential Revision: [D73891425](https://our.internmc.facebook.com/intern/diff/D73891425/)

[ghstack-poisoned]
2 parents 59663be + 7e1f3e3 commit 00af94d
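To make the idea concrete, here is a minimal sketch of a ring buffer KV cache that derives the SDPA mask from its own bookkeeping instead of relying on a precomputed mask. The names (`RingKVCache`, `update`, `get_attn_mask`) are illustrative only and are not the ExecuTorch API, and the sketch glosses over the full-sliding-window context handling mentioned above.

```python
import torch


class RingKVCache:
    """Illustrative ring-buffer KV cache; not the ExecuTorch implementation."""

    def __init__(self, window: int, n_heads: int, head_dim: int):
        self.window = window
        self.k = torch.zeros(1, n_heads, window, head_dim)
        self.v = torch.zeros(1, n_heads, window, head_dim)
        # Original sequence position held by each slot; -1 means the slot is empty.
        self.slot_pos = torch.full((window,), -1, dtype=torch.long)

    def update(self, pos: int, k: torch.Tensor, v: torch.Tensor) -> None:
        # New tokens overwrite the oldest slot once the window is full.
        slot = pos % self.window
        self.k[:, :, slot] = k
        self.v[:, :, slot] = v
        self.slot_pos[slot] = pos

    def get_attn_mask(self, pos: int) -> torch.Tensor:
        # A slot may be attended to if it is filled and not in the future of the
        # current position; the mask is derived from cache state, not precalculated.
        valid = (self.slot_pos >= 0) & (self.slot_pos <= pos)
        mask = torch.zeros(1, 1, 1, self.window)
        mask[..., ~valid] = float("-inf")
        return mask


cache = RingKVCache(window=4, n_heads=1, head_dim=8)
q = torch.randn(1, 1, 1, 8)  # (batch, heads, q_len, head_dim)
for pos in range(6):  # positions 4 and 5 wrap around and reuse slots 0 and 1
    cache.update(pos, torch.randn(1, 1, 8), torch.randn(1, 1, 8))
    out = torch.nn.functional.scaled_dot_product_attention(
        q, cache.k, cache.v, attn_mask=cache.get_attn_mask(pos)
    )
```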

File tree

195 files changed (+9325, -8028 lines)

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-2024-12-16
+2025-05-06

.github/workflows/apple.yml

Lines changed: 2 additions & 0 deletions
@@ -5,6 +5,8 @@ on:
     branches:
       - main
       - release/*
+    tags:
+      - ciflow/trunk/*
   pull_request:
     paths:
       - .ci/scripts/setup-ios.sh

.github/workflows/build-presets.yml

Lines changed: 17 additions & 0 deletions
@@ -11,3 +11,20 @@ on:
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
   cancel-in-progress: true
+
+jobs:
+  apple:
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    strategy:
+      matrix:
+        preset: [macos-arm64]
+    with:
+      job-name: build
+      runner: macos-latest-xlarge
+      python-version: 3.12
+      submodules: recursive
+      script: |
+        set -eux
+        ${CONDA_RUN} ./install_requirements.sh > /dev/null
+        ${CONDA_RUN} cmake --preset ${{ matrix.preset }}
+        ${CONDA_RUN} cmake --build cmake-out --parallel

.github/workflows/pull.yml

Lines changed: 1 addition & 3 deletions
@@ -434,9 +434,7 @@ jobs:
         output=$(ls -la cmake-out/test/size_test)
         arr=($output)
         size=${arr[4]}
-        # threshold=48120 on devserver with gcc11.4
-        # todo(lfq): update once binary size is below 50kb.
-        threshold="47552"
+        threshold="47560"
         if [[ "$size" -le "$threshold" ]]; then
           echo "Success $size <= $threshold"
         else

CMakeLists.txt

Lines changed: 13 additions & 5 deletions
@@ -44,6 +44,19 @@
 
 cmake_minimum_required(VERSION 3.24)
 project(executorch)
+
+# MARK: - Start EXECUTORCH_H12025_BUILD_MIGRATION --------------------------------------------------
+
+include(${PROJECT_SOURCE_DIR}/tools/cmake/common/preset.cmake)
+
+load_build_preset()
+include(${PROJECT_SOURCE_DIR}/tools/cmake/preset/default.cmake)
+
+# Print all the configs that were called with announce_configured_options.
+print_configured_options()
+
+# MARK: - End EXECUTORCH_H12025_BUILD_MIGRATION ----------------------------------------------------
+
 include(tools/cmake/Utils.cmake)
 include(CMakeDependentOption)
 
@@ -96,9 +109,6 @@ set(EXECUTORCH_PAL_DEFAULT
   "Which PAL default implementation to use: one of {posix, minimal}"
 )
 
-option(EXECUTORCH_ENABLE_LOGGING "Build with ET_LOG_ENABLED"
-       ${_default_release_disabled_options}
-)
 if(NOT EXECUTORCH_ENABLE_LOGGING)
   # Avoid pulling in the logging strings, which can be large. Note that this
   # will set the compiler flag for all targets in this directory, and for all
@@ -170,8 +180,6 @@ option(EXECUTORCH_BUILD_ARM_BAREMETAL
   "Build the Arm Baremetal flow for Cortex-M and Ethos-U" OFF
 )
 
-option(EXECUTORCH_BUILD_COREML "Build the Core ML backend" OFF)
-
 option(EXECUTORCH_BUILD_KERNELS_CUSTOM "Build the custom kernels" OFF)
 
 option(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT "Build the custom ops lib for AOT"

CMakePresets.json

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
+{
+  "version": 10,
+  "cmakeMinimumRequired": {
+    "major": 3,
+    "minor": 31,
+    "patch": 0
+  },
+  "$comment": "On-device AI across mobile, embedded and edge for PyTorch.",
+  "configurePresets": [
+    {
+      "name": "common",
+      "hidden": true,
+      "binaryDir": "${sourceDir}/cmake-out",
+      "generator": "Unix Makefiles"
+    },
+    {
+      "name": "macos-arm64",
+      "inherits": ["common"],
+      "generator": "Xcode",
+      "cacheVariables": {
+        "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/third-party/ios-cmake/ios.toolchain.cmake",
+        "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/macos-arm64.cmake",
+        "PLATFORM": "MAC_ARM64",
+        "DEPLOYMENT_TARGET": "10.15"
+      },
+      "condition": {
+        "lhs": "${hostSystemName}",
+        "type": "equals",
+        "rhs": "Darwin"
+      }
+    }
+  ]
+}

backends/arm/_passes/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -19,6 +19,7 @@
 from .convert_squeezes_to_view import ConvertSqueezesToViewPass  # noqa
 from .convert_to_clamp import ConvertToClampPass  # noqa
 from .decompose_batchnorm_pass import DecomposeBatchNormPass  # noqa
+from .decompose_cosine_similarity_pass import DecomposeCosineSimilarityPass  # noqa
 from .decompose_div_pass import DecomposeDivPass  # noqa
 from .decompose_gelu_pass import DecomposeGeluPass  # noqa
 from .decompose_layernorm_pass import DecomposeLayerNormPass  # noqa

backends/arm/_passes/annotate_decomposed_matmul.py

Lines changed: 3 additions & 3 deletions
@@ -1,13 +1,12 @@
 # Copyright 2024-2025 Arm Limited and/or its affiliates.
-# All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
 # pyre-unsafe
 
 import itertools
-
+import operator
 from typing import List
 
 import torch
@@ -22,7 +21,7 @@
 
 class AnnotateDecomposedMatmulPass(ExportPass):
     """
-    torch.matmul can be decomposed in many ways, for instance:
+    torch.matmul and its equivalent operator @ can be decomposed in many ways, for instance:
     dq -> matmul -> q can become
     dq -> repeat -> view -> bmm -> view -> dq which makes quantization folding
     difficult. This helper function find all matmul partitions and annotate its
@@ -50,6 +49,7 @@ def call(self, graph_module: GraphModule) -> PassResult:
             graph_module.graph,
             [
                 torch.matmul,
+                operator.matmul,
             ],
             None,
         )
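For context on why `operator.matmul` is added to the partition search above: when a model uses the `@` operator, the traced graph records `operator.matmul` rather than `torch.matmul` as the node's source function, so a search keyed only on `torch.matmul` misses those partitions. A rough sketch of the distinction, assuming a recent PyTorch with `torch.export.export_for_training` (not part of this commit):

```python
import operator

import torch
from torch.export import export_for_training
from torch.fx.passes.utils.source_matcher_utils import get_source_partitions


class MatMulModule(torch.nn.Module):
    def forward(self, x, y):
        return x @ y  # traced with source function operator.matmul, not torch.matmul


ep = export_for_training(MatMulModule(), (torch.randn(2, 3), torch.randn(3, 4)))
# Searching for both sources, as the pass now does, also finds the `@` partition.
partitions = get_source_partitions(ep.module().graph, [torch.matmul, operator.matmul])
print(partitions.keys())
```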

backends/arm/_passes/arm_pass_manager.py

Lines changed: 2 additions & 0 deletions
@@ -24,6 +24,7 @@
     ConvertSqueezesToViewPass,
     ConvertToClampPass,
     DecomposeBatchNormPass,
+    DecomposeCosineSimilarityPass,
     DecomposeDivPass,
     DecomposeGeluPass,
     DecomposeLayerNormPass,
@@ -205,6 +206,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_pass(DecomposeVarPass())
         self.add_pass(DecomposeMeanDimPass())
         self.add_pass(DecomposeNotEqualPass())
+        self.add_pass(DecomposeCosineSimilarityPass())
         self.add_pass(DecomposeDivPass())
         self.add_pass(DecomposeLeakyReLUPass())
         self.add_pass(DecomposeSqrtPass())
backends/arm/_passes/decompose_cosine_similarity_pass.py

Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.exir.pass_base import ExportPass
+
+torch_cosine_similarity = (torch.ops.aten.cosine_similarity.default,)
+
+
+class DecomposeCosineSimilarityPass(ExportPass):
+    """
+    Decomposition of aten.cosine_similarity:
+
+        dot   = sum(mul(x1, x2), dims, keepdim=False)
+        norm  = pow( sum(mul(x, x), dims, keepdim=False), 0.5 )
+        eps   = full( (), eps_scalar )
+        n1c   = max(norm1, eps)
+        n2c   = max(norm2, eps)
+        denom = mul(n1c, n2c)
+        out   = div(dot, denom)
+    """
+
+    def call_operator(self, op, args, kwargs, meta):
+        if op not in torch_cosine_similarity:
+            return super().call_operator(op, args, kwargs, meta)
+
+        x1, x2 = args[0], args[1]
+        dim = kwargs.get("dim", 1)
+        eps = kwargs.get("eps", 1e-8)
+        dims = [dim] if isinstance(dim, int) else list(dim)
+
+        # 1) dot
+        prod = super().call_operator(torch.ops.aten.mul.Tensor, (x1, x2), {}, meta)
+        dot = super().call_operator(
+            torch.ops.aten.sum.dim_IntList, (prod, dims, False), {}, meta
+        )
+
+        # 2a) norm1 = pow(sum(x1*x1), 0.5)
+        x1_sq = super().call_operator(torch.ops.aten.mul.Tensor, (x1, x1), {}, meta)
+        s1 = super().call_operator(
+            torch.ops.aten.sum.dim_IntList, (x1_sq, dims, False), {}, meta
+        )
+        norm1 = super().call_operator(
+            torch.ops.aten.pow.Tensor_Scalar, (s1, 0.5), {}, meta
+        )
+
+        # 2b) norm2 = pow(sum(x2*x2), 0.5)
+        x2_sq = super().call_operator(torch.ops.aten.mul.Tensor, (x2, x2), {}, meta)
+        s2 = super().call_operator(
+            torch.ops.aten.sum.dim_IntList, (x2_sq, dims, False), {}, meta
+        )
+        norm2 = super().call_operator(
+            torch.ops.aten.pow.Tensor_Scalar, (s2, 0.5), {}, meta
+        )
+
+        # 3) eps scalar - we need to broadcast ourselves as TOSA doesn't do this for scalars
+        eps_t = super().call_operator(
+            torch.ops.aten.full_like.default, (norm1, eps), {}, meta
+        )
+
+        # 4) clamp to avoid zero division
+        n1c = super().call_operator(
+            torch.ops.aten.maximum.default, (norm1, eps_t), {}, meta
+        )
+        n2c = super().call_operator(
+            torch.ops.aten.maximum.default, (norm2, eps_t), {}, meta
+        )
+
+        # 5) denom and divide
+        denom = super().call_operator(torch.ops.aten.mul.Tensor, (n1c, n2c), {}, meta)
+        out = super().call_operator(torch.ops.aten.div.Tensor, (dot, denom), {}, meta)
+
+        return out
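As a sanity check on the decomposition spelled out in the docstring above, the same op sequence can be written in eager PyTorch and compared against `torch.nn.functional.cosine_similarity`. This sketch is for verification only and is not part of the commit:

```python
import torch


def decomposed_cosine_similarity(x1, x2, dim=1, eps=1e-8):
    # Mirrors the pass: dot / (max(norm1, eps) * max(norm2, eps))
    dot = (x1 * x2).sum(dim)
    norm1 = (x1 * x1).sum(dim).pow(0.5)
    norm2 = (x2 * x2).sum(dim).pow(0.5)
    eps_t = torch.full_like(norm1, eps)
    return dot / (torch.maximum(norm1, eps_t) * torch.maximum(norm2, eps_t))


x1, x2 = torch.randn(4, 16), torch.randn(4, 16)
# For inputs whose norms are well above eps the decomposition matches the
# reference; the eps clamping only matters for near-zero vectors.
assert torch.allclose(
    decomposed_cosine_similarity(x1, x2),
    torch.nn.functional.cosine_similarity(x1, x2, dim=1),
    atol=1e-6,
)
```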
