pytorch
diff --git a/‎.ci/docker/requirements-ci.txt‎
Lines changed: 1 addition & 0 deletions b/‎.ci/docker/requirements-ci.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.github/workflows/_android.yml‎
Lines changed: 4 additions & 0 deletions b/‎.github/workflows/_android.yml‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎.github/workflows/android-perf-private-device-experiment.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/android-perf-private-device-experiment.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/android-perf.yml‎
Lines changed: 5 additions & 1 deletion b/‎.github/workflows/android-perf.yml‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎.github/workflows/android-release-artifacts.yml‎
Lines changed: 4 additions & 0 deletions b/‎.github/workflows/android-release-artifacts.yml‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎CMakeLists.txt‎
Lines changed: 4 additions & 0 deletions b/‎CMakeLists.txt‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎backends/arm/test/ops/test_sigmoid.py‎
Lines changed: 19 additions & 5 deletions b/‎backends/arm/test/ops/test_sigmoid.py‎
Lines changed: 19 additions & 5 deletions
diff --git a/‎backends/arm/test/targets.bzl‎
Lines changed: 5 additions & 2 deletions b/‎backends/arm/test/targets.bzl‎
Lines changed: 5 additions & 2 deletions
diff --git a/‎backends/qualcomm/_passes/__init__.py‎
Lines changed: 2 additions & 0 deletions b/‎backends/qualcomm/_passes/__init__.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎backends/qualcomm/_passes/convert_square_to_pow.py‎
Lines changed: 38 additions & 0 deletions b/‎backends/qualcomm/_passes/convert_square_to_pow.py‎
Lines changed: 38 additions & 0 deletions
@@ -17,6 +17,7 @@ parameterized==0.9.0
 
 # Doc build requirements, same as https://github.com/pytorch/pytorch/blob/main/.ci/docker/requirements-docs.txt
 sphinx==5.3.0
+sphinx-reredirects==0.1.4
 sphinx-gallery==0.14.0
 breathe==4.34.0
 exhale==0.2.3
 
@@ -22,6 +22,10 @@ jobs:
       script: |
         set -eux
 
+        # Use sccache for NDK compiler as well
+        export CMAKE_CXX_COMPILER_LAUNCHER=sccache
+        export CMAKE_C_COMPILER_LAUNCHER=sccache
+
         # The generic Linux job chooses to use base env, not the one setup by the image
         CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
         conda activate "${CONDA_ENV}"
 
@@ -57,6 +57,6 @@ jobs:
       id-token: write
       contents: read
     with:
-      models: ${{ inputs.models }}
+      models: ${{ inputs.models || 'mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8' }}
       devices: google_pixel_3_private_rooted
       benchmark_configs: ${{ inputs.benchmark_configs }}
@@ -353,6 +353,10 @@ jobs:
       script: |
         set -eux
 
+        # Use sccache for NDK compiler as well
+        export CMAKE_CXX_COMPILER_LAUNCHER=sccache
+        export CMAKE_C_COMPILER_LAUNCHER=sccache
+
         # The generic Linux job chooses to use base env, not the one setup by the image
         CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
         conda activate "${CONDA_ENV}"
@@ -392,7 +396,7 @@ jobs:
       fail-fast: false
     with:
       # Due to scheduling a job may be pushed beyond the default 60m threshold
-      timeout: 120
+      timeout: 240
       device-type: android
       runner: linux.2xlarge
       test-infra-ref: ''
 
@@ -60,6 +60,10 @@ jobs:
       script: |
         set -eux
 
+        # Use sccache for NDK compiler as well
+        export CMAKE_CXX_COMPILER_LAUNCHER=sccache
+        export CMAKE_C_COMPILER_LAUNCHER=sccache
+
         # The generic Linux job chooses to use base env, not the one setup by the image
         CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
         conda activate "${CONDA_ENV}"
 
@@ -810,6 +810,10 @@ if(EXECUTORCH_BUILD_PYBIND)
       torch
   )
 
+  if(EXECUTORCH_BUILD_TESTS)
+    list(APPEND _dep_libs test_backend_compiler_lib)
+  endif()
+
   if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
     list(APPEND _dep_libs optimized_native_cpu_ops_lib)
   else()
 
@@ -9,8 +9,10 @@
 
 from typing import Tuple
 
+import pytest
+
 import torch
-from executorch.backends.arm.test import common
+from executorch.backends.arm.test import common, conftest
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 from parameterized import parameterized
@@ -63,7 +65,7 @@ def forward(self, x, y):
     def _test_sigmoid_tosa_MI_pipeline(
         self, module: torch.nn.Module, test_data: Tuple[torch.tensor]
     ):
-        (
+        tester = (
             ArmTester(
                 module,
                 example_inputs=test_data,
@@ -77,11 +79,13 @@ def _test_sigmoid_tosa_MI_pipeline(
             .check_not(["executorch_exir_dialects_edge__ops_aten_sigmoid_default"])
             .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
             .to_executorch()
-            .run_method_and_compare_outputs(inputs=test_data)
         )
 
+        if conftest.is_option_enabled("tosa_ref_model"):
+            tester.run_method_and_compare_outputs(inputs=test_data)
+
     def _test_sigmoid_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tuple):
-        (
+        tester = (
             ArmTester(
                 module,
                 example_inputs=test_data,
@@ -96,9 +100,11 @@ def _test_sigmoid_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tup
             .check_not(["executorch_exir_dialects_edge__ops_aten_sigmoid_default"])
             .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
             .to_executorch()
-            .run_method_and_compare_outputs(inputs=test_data)
         )
 
+        if conftest.is_option_enabled("tosa_ref_model"):
+            tester.run_method_and_compare_outputs(inputs=test_data)
+
     def _test_sigmoid_tosa_ethos_BI_pipeline(
         self,
         compile_spec: list[CompileSpec],
@@ -137,6 +143,7 @@ def _test_sigmoid_tosa_u85_BI_pipeline(
         )
 
     @parameterized.expand(test_data_suite)
+    @pytest.mark.tosa_ref_model
     def test_sigmoid_tosa_MI(
         self,
         test_name: str,
@@ -145,26 +152,33 @@ def test_sigmoid_tosa_MI(
         self._test_sigmoid_tosa_MI_pipeline(self.Sigmoid(), (test_data,))
 
     @parameterized.expand(test_data_suite)
+    @pytest.mark.tosa_ref_model
     def test_sigmoid_tosa_BI(self, test_name: str, test_data: torch.Tensor):
         self._test_sigmoid_tosa_BI_pipeline(self.Sigmoid(), (test_data,))
 
+    @pytest.mark.tosa_ref_model
     def test_add_sigmoid_tosa_MI(self):
         self._test_sigmoid_tosa_MI_pipeline(self.AddSigmoid(), (test_data_suite[0][1],))
 
+    @pytest.mark.tosa_ref_model
     def test_add_sigmoid_tosa_BI(self):
         self._test_sigmoid_tosa_BI_pipeline(self.AddSigmoid(), (test_data_suite[5][1],))
 
+    @pytest.mark.tosa_ref_model
     def test_sigmoid_add_tosa_MI(self):
         self._test_sigmoid_tosa_MI_pipeline(self.SigmoidAdd(), (test_data_suite[0][1],))
 
+    @pytest.mark.tosa_ref_model
     def test_sigmoid_add_tosa_BI(self):
         self._test_sigmoid_tosa_BI_pipeline(self.SigmoidAdd(), (test_data_suite[0][1],))
 
+    @pytest.mark.tosa_ref_model
     def test_sigmoid_add_sigmoid_tosa_MI(self):
         self._test_sigmoid_tosa_MI_pipeline(
             self.SigmoidAddSigmoid(), (test_data_suite[4][1], test_data_suite[3][1])
         )
 
+    @pytest.mark.tosa_ref_model
     def test_sigmoid_add_sigmoid_tosa_BI(self):
         self._test_sigmoid_tosa_BI_pipeline(
             self.SigmoidAddSigmoid(), (test_data_suite[4][1], test_data_suite[3][1])
 
@@ -12,8 +12,11 @@ def define_arm_tests():
     test_files.remove("passes/test_ioquantization_pass.py")
 
     # Operators
-    test_files += ["ops/test_linear.py"]
-    test_files += ["ops/test_slice.py"]
+    test_files += [
+        "ops/test_linear.py", 
+        "ops/test_slice.py",
+        "ops/test_sigmoid.py",
+    ]
 
     TESTS = {}
 
 
@@ -9,6 +9,7 @@
 from .annotate_unbind import AnnotateUnbind
 from .convert_bmm_to_matmul import ConvertBmmToMatmul
 from .convert_conv1d_to_conv2d import ConvertConv1dToConv2d
+from .convert_square_to_pow import ConvertSquareToPow
 from .convert_upsample_bicubic2d import ConvertUpsampleBicubicWithBilinear
 from .decompose_any import DecomposeAny
 from .decompose_cdist import DecomposeCDist
@@ -42,6 +43,7 @@
     AnnotateUnbind,
     ConvertBmmToMatmul,
     ConvertConv1dToConv2d,
+    ConvertSquareToPow,
     ConvertUpsampleBicubicWithBilinear,
     DecomposeAny,
     DecomposeCDist,
 
@@ -0,0 +1,38 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import torch
+from executorch.exir.pass_base import ExportPass, PassResult
+
+from .utils import copy_meta
+
+
+class ConvertSquareToPow(ExportPass):
+    """
+    Convert square to pow with a scalar value of 2.
+    This allows LiftConstantScalarOperands to lift the scalar into a tensor.
+    Otherwise, the square op will be converted to pow.Tensor_Scalar after to_edge.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        graph = graph_module.graph
+        for node in graph.nodes:
+            if node.target == torch.ops.aten.square.default:
+                input_node = node.args[0]
+                with graph_module.graph.inserting_after(input_node):
+                    pow_op = torch.ops.aten.pow.Tensor_Scalar
+                    pow_node = graph.create_node(
+                        "call_function", pow_op, (input_node, 2)
+                    )
+                    pow_node.meta = copy_meta(node.meta)
+                for user in node.users.copy():
+                    user.replace_input_with(node, pow_node)
+
+        graph.eliminate_dead_code()
+        graph_module.recompile()
+        return PassResult(graph_module, True)