Skip to content

Commit 8a69060

Browse files
committed
Update base for Update on "[llm] Add a generic text only LLM runner"
Introducing `text_llm_runner`. This can be used to run all text-only, decoder-only LLM models supported by ExecuTorch. * Metadata is read out from the .pte file and used to construct the runner object. * examples/models/llama/runner.h[.cpp] only contains a simple wrapper around `text_llm_runner.h[.cpp]`. In the next PRs I will move examples/models/phi-3-mini/runner to use the generic runner. Will look into QNN and MediaTek runners as well. Differential Revision: [D75910889](https://our.internmc.facebook.com/intern/diff/D75910889/) [ghstack-poisoned]
2 parents a05c2e9 + b2c02fe commit 8a69060

File tree

121 files changed

+4793
-681
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

121 files changed

+4793
-681
lines changed

.ci/scripts/analyze_benchmark_stability.py

Lines changed: 1523 additions & 0 deletions
Large diffs are not rendered by default.

.github/workflows/pull.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -371,7 +371,7 @@ jobs:
371371
size=${arr[4]}
372372
# threshold=48120 on devserver with gcc11.4
373373
# todo(lfq): update once binary size is below 50kb.
374-
threshold="51408"
374+
threshold="55584"
375375
if [[ "$size" -le "$threshold" ]]; then
376376
echo "Success $size <= $threshold"
377377
else
@@ -406,7 +406,7 @@ jobs:
406406
output=$(ls -la cmake-out/test/size_test)
407407
arr=($output)
408408
size=${arr[4]}
409-
threshold="47560"
409+
threshold="51728"
410410
if [[ "$size" -le "$threshold" ]]; then
411411
echo "Success $size <= $threshold"
412412
else

backends/arm/test/tester/analyze_output_utils.py

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,13 @@ def print_error_diffs(
154154
output_str += f"BATCH {n}\n"
155155
result_batch = result[n, :, :, :]
156156
reference_batch = reference[n, :, :, :]
157+
158+
if reference_batch.dtype == torch.bool or result_batch.dtype == torch.bool:
159+
mismatches = (reference_batch != result_batch).sum().item()
160+
total = reference_batch.numel()
161+
output_str += f"(BOOLEAN tensor) {mismatches} / {total} elements differ ({mismatches / total:.2%})\n"
162+
continue
163+
157164
is_close = torch.allclose(result_batch, reference_batch, rtol, atol)
158165
if is_close:
159166
output_str += ".\n"
@@ -180,14 +187,15 @@ def print_error_diffs(
180187
output_str += _print_elements(
181188
result[n, :, :, :], reference[n, :, :, :], C, H, W, rtol, atol
182189
)
183-
184-
reference_range = torch.max(reference) - torch.min(reference)
185-
diff = torch.abs(reference - result).flatten()
186-
diff = diff[diff.nonzero()]
187-
if not len(diff) == 0:
188-
diff_percent = diff / reference_range
189-
output_str += "\nMEAN MEDIAN MAX MIN (error as % of reference output range)\n"
190-
output_str += f"{torch.mean(diff_percent):<8.2%} {torch.median(diff_percent):<8.2%} {torch.max(diff_percent):<8.2%} {torch.min(diff_percent):<8.2%}\n"
190+
# Only compute numeric error metrics if tensor is not boolean
191+
if reference.dtype != torch.bool and result.dtype != torch.bool:
192+
reference_range = torch.max(reference) - torch.min(reference)
193+
diff = torch.abs(reference - result).flatten()
194+
diff = diff[diff.nonzero()]
195+
if not len(diff) == 0:
196+
diff_percent = diff / reference_range
197+
output_str += "\nMEAN MEDIAN MAX MIN (error as % of reference output range)\n"
198+
output_str += f"{torch.mean(diff_percent):<8.2%} {torch.median(diff_percent):<8.2%} {torch.max(diff_percent):<8.2%} {torch.min(diff_percent):<8.2%}\n"
191199

192200
# Over-engineer separators to match output width
193201
lines = output_str.split("\n")

backends/cadence/aot/TARGETS

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,24 @@ python_library(
276276
],
277277
)
278278

279+
python_library(
280+
name = "decompose_ops",
281+
srcs = [
282+
"decompose_ops.py",
283+
],
284+
typing = True,
285+
deps = [
286+
":pass_utils",
287+
"//caffe2:torch",
288+
"//executorch/backends/cadence/aot:pass_utils",
289+
"//executorch/exir:pass_base",
290+
"//executorch/exir/dialects:lib",
291+
"//executorch/exir/dialects/edge:lib",
292+
"//executorch/exir/passes:spec_prop_pass",
293+
],
294+
)
295+
296+
279297
python_unittest(
280298
name = "test_graph_builder",
281299
srcs = [
@@ -314,6 +332,27 @@ python_unittest(
314332
],
315333
)
316334

335+
python_unittest(
336+
name = "test_decompose_ops_passes",
337+
srcs = [
338+
"tests/test_decompose_ops_passes.py",
339+
],
340+
supports_static_listing = False,
341+
typing = True,
342+
deps = [
343+
"fbsource//third-party/pypi/parameterized:parameterized",
344+
":compiler",
345+
":decompose_ops",
346+
"//caffe2:torch",
347+
"//executorch/backends/cadence/aot:compiler",
348+
"//executorch/backends/cadence/aot:graph_builder",
349+
"//executorch/backends/cadence/aot:pass_utils",
350+
"//executorch/exir:pass_base",
351+
"//executorch/exir/dialects:lib",
352+
"//executorch/exir/passes:lib",
353+
],
354+
)
355+
317356
python_unittest(
318357
name = "test_fusion_ops_passes",
319358
srcs = [
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
# Copyright 2025 Arm Limited and/or its affiliates.
4+
#
5+
# This source code is licensed under the BSD-style license found in the
6+
# LICENSE file in the root directory of this source tree.
7+
8+
9+
# This file contains all the functions that decompose one op into simpler ops in the
10+
# graph. The functions decomposing ops for models deployed with Jarvis are grouped
11+
# together in class 'DecomposeOpsInGraph'. Some examples of functions in the class are
12+
# 1. functions that decompose an ATen gelu op into an equivalent series of simpler ops
13+
14+
# pyre-strict
15+
16+
from typing import Dict
17+
18+
from executorch.backends.cadence.aot.pass_utils import (
19+
CadencePassAttribute,
20+
register_cadence_pass,
21+
)
22+
from executorch.exir.dialects._ops import ops as exir_ops
23+
from executorch.exir.dialects.edge._ops import EdgeOpOverload
24+
from executorch.exir.pass_base import ExportPass, NodeMetadata, ProxyValue
25+
from torch.fx.node import Argument
26+
27+
28+
@register_cadence_pass(CadencePassAttribute(opt_level=0))
29+
class DecomposeAtenApproxGeluPass(ExportPass):
30+
"""
31+
Decompose the aten gelu op with an approximate arg to a series of simpler ops
32+
"""
33+
34+
def call_operator(
35+
self,
36+
op: EdgeOpOverload,
37+
args: tuple[Argument, ...],
38+
kwargs: Dict[str, Argument],
39+
meta: NodeMetadata,
40+
) -> ProxyValue:
41+
# compute the approximate gelu (0.7978845608028654 is sqrt(2 / pi))
42+
# as 0.5 * x * (1 + torch.tanh(0.7978845608028654 * ( x + 0.044715 * x^3)))
43+
44+
# Get 0.5 * x
45+
half = super().call_operator(
46+
exir_ops.edge.aten.mul.Tensor,
47+
(args[0], 0.5),
48+
{},
49+
meta,
50+
)
51+
52+
scaled = super().call_operator(
53+
exir_ops.edge.aten.mul.Tensor,
54+
(args[0], 0.044715),
55+
{},
56+
meta,
57+
)
58+
59+
# Get x^2 (note that we use mul.Tensor twice instead of pow.Tensor because
60+
# it is much more efficient on DSP backends)
61+
scaled_square = super().call_operator(
62+
exir_ops.edge.aten.mul.Tensor,
63+
(scaled, args[0]),
64+
{},
65+
meta,
66+
)
67+
68+
# Get x^3
69+
scaled_cubed = super().call_operator(
70+
exir_ops.edge.aten.mul.Tensor,
71+
(scaled_square, args[0]),
72+
{},
73+
meta,
74+
)
75+
76+
# Get x + 0.044715 * x^3
77+
inner_sum = super().call_operator(
78+
exir_ops.edge.aten.add.Tensor,
79+
(scaled_cubed, args[0]),
80+
{},
81+
meta,
82+
)
83+
84+
# Get 0.7978845608028654 * ( x + 0.044715 * x^3)
85+
scaled_sum = super().call_operator(
86+
exir_ops.edge.aten.mul.Tensor,
87+
(inner_sum, 0.7978845608028654),
88+
{},
89+
meta,
90+
)
91+
92+
# Get torch.tanh(0.7978845608028654 * ( x + 0.044715 * x^3))
93+
tanh = super().call_operator(
94+
exir_ops.edge.aten.tanh.default,
95+
(scaled_sum,),
96+
{},
97+
meta,
98+
)
99+
100+
# Get 1 + torch.tanh(0.79788456 * ( x + 0.044715 * x^3))
101+
# TODO(): Check why this is not working properly with integer values (e.g. 1 instead of 1.)
102+
outer_sum = super().call_operator(
103+
exir_ops.edge.aten.add.Tensor,
104+
(tanh, 1.0),
105+
{},
106+
meta,
107+
)
108+
109+
# Return the final result
110+
return super().call_operator(
111+
exir_ops.edge.aten.mul.Tensor,
112+
(half, outer_sum),
113+
{},
114+
meta,
115+
)
116+
117+
118+
# This class encapsulates all the functions that decompose one op in the graph.
119+
class CadenceDecomposeOpsInGraph:
120+
passes = [
121+
DecomposeAtenApproxGeluPass,
122+
]

backends/cadence/aot/replace_ops.py

Lines changed: 1 addition & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -2078,89 +2078,11 @@ def call_operator(
20782078
kwargs: Dict[str, Argument],
20792079
meta: NodeMetadata,
20802080
) -> ProxyValue:
2081-
if "approximate" not in kwargs:
2082-
return super().call_operator(op, args, kwargs, meta)
2083-
20842081
if op not in {
20852082
exir_ops.edge.aten.gelu.default,
20862083
}:
20872084
return super().call_operator(op, args, kwargs, meta)
2088-
2089-
# compute the approximate gelu (0.7978845608028654 is sqrt(2 / pi))
2090-
# as 0.5 * x * (1 + torch.tanh(0.7978845608028654 * ( x + 0.044715 * x^3)))
2091-
2092-
# Get 0.5 * x
2093-
half = super().call_operator(
2094-
exir_ops.edge.aten.mul.Tensor,
2095-
(args[0], 0.5),
2096-
{},
2097-
meta,
2098-
)
2099-
2100-
scaled = super().call_operator(
2101-
exir_ops.edge.aten.mul.Tensor,
2102-
(args[0], 0.044715),
2103-
{},
2104-
meta,
2105-
)
2106-
2107-
# Get x^2 (note that we use mul.Tensor twice instead of pow.Tensor because
2108-
# it is much more efficient on DSP backends)
2109-
scaled_square = super().call_operator(
2110-
exir_ops.edge.aten.mul.Tensor,
2111-
(scaled, args[0]),
2112-
{},
2113-
meta,
2114-
)
2115-
2116-
# Get x^3
2117-
scaled_cubed = super().call_operator(
2118-
exir_ops.edge.aten.mul.Tensor,
2119-
(scaled_square, args[0]),
2120-
{},
2121-
meta,
2122-
)
2123-
2124-
# Get x + 0.044715 * x^3
2125-
inner_sum = super().call_operator(
2126-
exir_ops.edge.aten.add.Tensor,
2127-
(scaled_cubed, args[0]),
2128-
{},
2129-
meta,
2130-
)
2131-
2132-
# Get 0.7978845608028654 * ( x + 0.044715 * x^3)
2133-
scaled_sum = super().call_operator(
2134-
exir_ops.edge.aten.mul.Tensor,
2135-
(inner_sum, 0.7978845608028654),
2136-
{},
2137-
meta,
2138-
)
2139-
2140-
# Get torch.tanh(0.7978845608028654 * ( x + 0.044715 * x^3))
2141-
tanh = super().call_operator(
2142-
exir_ops.edge.aten.tanh.default,
2143-
(scaled_sum,),
2144-
{},
2145-
meta,
2146-
)
2147-
2148-
# Get 1 + torch.tanh(0.79788456 * ( x + 0.044715 * x^3))
2149-
# TODO(): Check why this is not working properly with integer values (e.g. 1 instead of 1.)
2150-
outer_sum = super().call_operator(
2151-
exir_ops.edge.aten.add.Tensor,
2152-
(tanh, 1.0),
2153-
{},
2154-
meta,
2155-
)
2156-
2157-
# Return the final result
2158-
return super().call_operator(
2159-
exir_ops.edge.aten.mul.Tensor,
2160-
(half, outer_sum),
2161-
{},
2162-
meta,
2163-
)
2085+
return super().call_operator(op, args, kwargs, meta)
21642086

21652087

21662088
# Adapted from fbcode/pyspeech/opt_passes/replace_ops.py

0 commit comments

Comments
 (0)