Skip to content

Commit 3f6b5bf

Browse files
committed
Merge remote-tracking branch 'origin/main' into toupstream/tanh_op
2 parents 22d4c68 + 8673567 commit 3f6b5bf

File tree

104 files changed

+2641
-709
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

104 files changed

+2641
-709
lines changed

.ci/docker/build.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ case "${IMAGE_NAME}" in
4141
LINTRUNNER=""
4242
CLANG_VERSION=12
4343
# From https://developer.android.com/ndk/downloads
44-
ANDROID_NDK_VERSION=r26c
44+
ANDROID_NDK_VERSION=r27b
4545
;;
4646
*)
4747
echo "Invalid image name ${IMAGE_NAME}"

.github/workflows/android-perf.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,7 @@ jobs:
205205
206206
# Let's see how expensive this job is, we might want to tone it down by running it periodically
207207
benchmark-on-device:
208+
if: always()
208209
permissions:
209210
id-token: write
210211
contents: read

.github/workflows/android-release-artifacts.yml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,24 @@ concurrency:
1313
cancel-in-progress: true
1414

1515
jobs:
16+
check-if-aar-exists:
17+
name: check-if-aar-exists
18+
runs-on: ubuntu-22.04
19+
timeout-minutes: 10
20+
steps:
21+
- name: Check if this RC version is already in S3
22+
shell: bash
23+
run: |
24+
VERSION="${{ inputs.version }}"
25+
if curl -I "https://ossci-android.s3.amazonaws.com/executorch/release/${VERSION}/executorch.aar" | grep "200 OK"; then
26+
echo "AAR already exists at https://ossci-android.s3.amazonaws.com/executorch/release/${VERSION}/executorch.aar"
27+
echo "Will skip build/upload"
28+
exit 1
29+
fi
30+
1631
build-aar:
1732
name: build-aar
33+
needs: check-if-aar-exists
1834
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
1935
with:
2036
runner: linux.2xlarge

.github/workflows/android.yml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ jobs:
5353
# NB: Use metal install for KVM support to run the emulator faster
5454
runs-on: linux.24xl.spr-metal
5555
env:
56-
ANDROID_NDK_VERSION: r26c
56+
ANDROID_NDK_VERSION: r27b
5757
API_LEVEL: 34
5858
steps:
5959
- name: Setup SSH (Click me for login details)
@@ -80,6 +80,11 @@ jobs:
8080
# Reuse the script that install Android on ET Docker image
8181
sudo -E bash .ci/docker/common/install_android.sh
8282
83+
# After https://github.com/ReactiveCircus/android-emulator-runner/releases/tag/v2.33.0 release,
84+
# it seems that we need to chown the Android setup to the current user instead of root to
85+
# avoid permission issue
86+
sudo chown -R "${USER}" /opt/android
87+
8388
- name: Gradle cache
8489
uses: gradle/actions/setup-gradle@v3
8590

.github/workflows/apple-perf.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,7 @@ jobs:
279279
path: ${{ runner.temp }}/artifacts/
280280

281281
benchmark-on-device:
282+
if: always()
282283
needs:
283284
- set-parameters
284285
- upload-benchmark-app

.lintrunner.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,8 @@ command = [
151151
'lintrunner_adapters',
152152
'run',
153153
'grep_linter',
154-
'--pattern= Executorch\W+',
154+
# Exclude "ExecuTorch" pattern within URLs
155+
'--pattern= Executorch(?!\\W*(://|\\.[a-z]{2,}))\\W+',
155156
'--linter-name=ExecuTorchCapitalization',
156157
'--error-name=Incorrect capitalization for ExecuTorch',
157158
"""--error-description=

backends/arm/_passes/arm_pass_manager.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@
1919
ConvertSplitToSlicePass,
2020
)
2121
from executorch.backends.arm._passes.decompose_div_pass import DecomposeDivPass
22+
from executorch.backends.arm._passes.insert_squeeze_after_sum_pass import (
23+
InsertSqueezeAfterSumPass,
24+
)
2225
from executorch.backends.arm._passes.meandim_to_averagepool_pass import (
2326
ConvertMeanDimToAveragePool,
2427
)
@@ -47,6 +50,7 @@ def transform_to_backend_pipeline(
4750
self.add_pass(ConvertExpandCopyToRepeatPass())
4851
self.add_pass(ConvertMeanDimToAveragePool())
4952
self.add_pass(DecomposeDivPass())
53+
self.add_pass(InsertSqueezeAfterSumPass())
5054
self.add_pass(ConvertSplitToSlicePass())
5155
for spec in compile_spec:
5256
if spec.key == "permute_memory_format":
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
# Copyright 2024 Arm Limited and/or its affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
from typing import cast
8+
9+
import torch
10+
import torch.fx
11+
from executorch.backends.arm._passes.arm_pass_utils import create_node, insert_q_dq_pair
12+
13+
from executorch.backends.arm.tosa_quant_utils import get_quant_node_args, is_quant_node
14+
from executorch.exir.dialects._ops import ops as exir_ops
15+
from executorch.exir.pass_base import ExportPass, PassResult
16+
17+
18+
class InsertSqueezeAfterSumPass(ExportPass):
    """
    In PyTorch, the default behaviour of Tensor.sum is to squeeze
    the dimension that is summed (keep_dim = False).
    However, in TOSA, REDUCE_SUM always preserves the
    rank of the input (keep_dim = True).
    To get a 1-1 mapping in the sum lowering, normalize the
    keep_dim = False case to keep_dim = True and add squeeze ops.

    Original:
        sum(dims, keep_dim = False)
    After pass:
        sum(dims, keep_dim = True)
        (q)
        (dq)
        squeeze(dim = dims)
    """

    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
        """Rewrite every keep_dim=False sum in the graph as a
        keep_dim=True sum followed by a squeeze over the same dims.

        For quantized sums, a quantize/dequantize pair is inserted
        between the sum and the squeeze so the squeeze consumes
        dequantized values (see the class docstring diagram).
        """
        for node in graph_module.graph.nodes:
            # Only rewrite edge-dialect aten.sum.dim_IntList call nodes.
            if node.op != "call_function":
                continue
            if node.target != exir_ops.edge.aten.sum.dim_IntList:
                continue
            sum_node = cast(torch.fx.Node, node)
            # args layout is (input, dim_list, keep_dim); keep_dim is
            # optional and defaults to False when absent.
            keep_dim = cast(bool, sum_node.args[2] if len(sum_node.args) > 2 else False)
            if keep_dim:
                # Already rank-preserving; nothing to normalize.
                continue

            dim_list = cast(list[int], sum_node.args[1])
            quantized = is_quant_node(sum_node)
            if quantized:
                # Reuse the input's quantization parameters for the new
                # q/dq pair; output dtype is fixed to int8.
                # NOTE(review): assumes get_quant_node_args returns a
                # tuple that insert_q_dq_pair accepts with a dtype
                # appended — confirm against arm_pass_utils.
                qparams = get_quant_node_args(sum_node.all_input_nodes[0])
                qparams = qparams + (torch.int8,)
            else:
                qparams = None

            # Add keep_dim = True arg to sum node.
            sum_node.args = sum_node.args[0:2] + (True,)

            with graph_module.graph.inserting_after(sum_node):
                squeeze_node = create_node(
                    graph_module.graph, exir_ops.edge.aten.squeeze_copy.dims, ()
                )
                # Order matters: redirect all users of sum to the new
                # squeeze FIRST, then wire squeeze's inputs — otherwise
                # the (sum_node, dim_list) reference below would itself
                # have been rewritten to point at squeeze_node.
                sum_node.replace_all_uses_with(squeeze_node)
                squeeze_node.args = (sum_node, dim_list)
                if quantized:
                    # Insert q/dq between sum and squeeze; the returned
                    # node replaces sum_node only for this iteration's
                    # local bookkeeping.
                    sum_node = insert_q_dq_pair(graph_module.graph, sum_node, qparams)
        graph_module.graph.eliminate_dead_code()
        graph_module.recompile()
        # Re-run the parent pass machinery so graph metadata (e.g. node
        # meta/faketensor info) is retraced after the mutation.
        graph_module = super().call(graph_module).graph_module
        return PassResult(graph_module, True)

backends/arm/arm_partitioner.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
6363
exir_ops.edge.aten._softmax.default,
6464
exir_ops.edge.aten.slice_copy.Tensor,
6565
exir_ops.edge.aten.sub.Tensor,
66+
exir_ops.edge.aten.sum.dim_IntList,
6667
exir_ops.edge.aten.tanh.default,
6768
exir_ops.edge.aten.view_copy.default,
6869
exir_ops.edge.aten.clone.default,

backends/arm/operators/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
op_softmax,
3535
op_squeeze,
3636
op_sub,
37+
op_sum,
3738
op_tanh,
3839
op_unsqueeze,
3940
op_view,

0 commit comments

Comments
 (0)