Commit a58a23c

Merge branch 'pytorch:main' into pt/py313
2 parents 9f1a282 + 488d761 commit a58a23c

File tree

128 files changed: +5706 additions, -830 deletions


backends/arm/_passes/__init__.py

Lines changed: 4 additions & 1 deletion
@@ -67,6 +67,7 @@
 from .decompose_round_pass import DecomposeRoundPass  # noqa
 from .decompose_sdpa_pass import DecomposeScaledDotProductAttentionPass  # noqa
 from .decompose_select import DecomposeSelectPass  # noqa
+from .decompose_select_scatter_pass import DecomposeSelectScatterPass  # noqa
 from .decompose_sign_pass import DecomposeSignPass  # noqa
 from .decompose_silu_pass import DecomposeSiluPass  # noqa
 from .decompose_sinh_pass import DecomposeSinhPass  # noqa
@@ -116,5 +117,7 @@
 from .to_tosa_memory_format_pass import ToTosaMemoryFormatPass  # noqa
 from .unsqueeze_before_repeat_pass import UnsqueezeBeforeRepeatPass  # noqa
 from .unsqueeze_scalar_placeholders_pass import UnsqueezeScalarPlaceholdersPass  # noqa
-from .replace_inf_values_pass import ReplaceInfValuesPass  # noqa  # usort: skip
+from .replace_inf_and_limit_values_pass import (  # noqa  # usort: skip
+    ReplaceInfAndLimitValuesPass,
+)
 from .arm_pass_manager import ArmPassManager  # noqa  # usort: skip

backends/arm/_passes/arm_pass_manager.py

Lines changed: 9 additions & 7 deletions
@@ -70,6 +70,7 @@
     DecomposeRoundPass,
     DecomposeScaledDotProductAttentionPass,
     DecomposeSelectPass,
+    DecomposeSelectScatterPass,
     DecomposeSignPass,
     DecomposeSiluPass,
     DecomposeSinhPass,
@@ -98,7 +99,7 @@
     RemoveGetItemPass,
     RemoveGraphAssertsPass,
     RemoveNoopPass,
-    ReplaceInfValuesPass,
+    ReplaceInfAndLimitValuesPass,
     ReplaceScalarWithTensorByProfilePass,
     RewriteConv2dPass,
     RewriteMatmulPass,
@@ -174,18 +175,14 @@ def _tosa_pipeline(
         self.add_passes(
             [
                 FuseQuantizedActivationPass(),
-                RemoveGetItemPass(),
                 ConvertToClampPass(),
                 DecomposeInt32ClampPass(),
                 DecomposeGroupNormPass(),
                 DecomposeLayerNormPass(),
-                DecomposeBatchNormNoStatsPass(),
                 DecomposeVarPass(),
                 DecomposeMeanDimPass(exported_program.graph_module, self.tosa_spec),
                 AnnotateDecomposedMatmulPass(),
                 ConvertELUParamsPass(),
-                ConvertSplitToSlicePass(),
-                QuantizeClampArgumentsPass(),
             ]
         )

@@ -207,6 +204,10 @@ def _tosa_pipeline(
         # Node transformation passes (post q/dq folding)
         self.add_passes(
             [
+                ConvertSplitToSlicePass(),
+                QuantizeClampArgumentsPass(),
+                RemoveGetItemPass(),
+                DecomposeBatchNormNoStatsPass(),
                 DecomposeLogitPass(),
                 DecomposeMaskedFillPass(),
                 DecomposeRoundPass(),
@@ -243,7 +244,6 @@ def _tosa_pipeline(
                 # passes. Ticket: MLETORCH-1540
                 DecomposeNotEqualPass(),
                 MatchArgRanksPass(exported_program),
-                FuseConstantArgsPass(exported_program),
             ]
         )

@@ -265,6 +265,7 @@ def _tosa_pipeline(
                 DecomposeAvgPool2dPass(),
                 DecorateFp32toInt32CastingPass(),
                 ComputeConstantOpsAOTPass(exported_program),
+                FuseConstantArgsPass(exported_program),
                 ConvertExpandCopyToRepeatPass(),
                 UnsqueezeBeforeRepeatPass(),
                 DecomposeCumsumPass(exported_program),
@@ -330,6 +331,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         # Transformation passes (pre scalar -> tensor)
         self.add_passes(
             [
+                DecomposeSelectScatterPass(),
                 ConvertInt64ConstOpsToInt32Pass(),
                 ConvertInt64OutputOpsToInt32Pass(),
                 InsertInt32CastsAfterInt64PlaceholdersPass(),
@@ -383,7 +385,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         # Postprocessing passes
         self.add_passes(
             [
-                ReplaceInfValuesPass(),
+                ReplaceInfAndLimitValuesPass(),
                 DecomposeMaskedFillPass() if not self.tosa_spec.is_U55_subset else None,
             ]
         )

backends/arm/_passes/convert_split_to_slice.py

Lines changed: 39 additions & 2 deletions
@@ -85,11 +85,48 @@ def call(self, graph_module: torch.fx.GraphModule):
                 graph,
                 self.slice,
                 (input_node, dim, starts[index], ends[index]),
+                from_node=node,
+            )
+            slice_node.meta = _copy_user_node_qparams(
+                split_node, output_node, index
             )
-            slice_node.meta = split_node.meta.copy()
-            slice_node.meta["val"] = slice_node.meta["val"][index]
             output_node.replace_all_uses_with(slice_node)
         graph.eliminate_dead_code()
         graph_module.recompile()
         graph_module = super().call(graph_module).graph_module
         return PassResult(graph_module, True)
+
+
+def _copy_user_node_qparams(
+    split_node: torch.fx.Node, output_node: torch.fx.Node, index: int
+) -> dict:
+    """
+    Construct metadata for the slice node that will replace the split output.
+
+    Note that output quantization parameters are copied from the user nodes
+    of the split node. The split node itself does not have output quantization
+    parameters.
+
+    Args:
+        split_node: The split node being replaced.
+        output_node: The getitem node that is a user of the split node.
+        index: The index of the output being processed.
+    Returns:
+        Updated metadata dictionary for the slice node.
+    """
+
+    def _select_index(value):
+        if isinstance(value, (list, tuple)):
+            return value[index]
+        return value
+
+    meta = split_node.meta.copy()
+    if "val" in meta:
+        meta["val"] = _select_index(meta["val"])
+    if "tensor_meta" in meta:
+        meta["tensor_meta"] = _select_index(meta["tensor_meta"])
+    if "input_qparams" in meta:
+        meta["input_qparams"] = dict(meta["input_qparams"])
+    if "output_qparams" in meta:
+        meta["output_qparams"] = dict(output_node.meta["output_qparams"])
+    return meta
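
For reference, the rewrite performed by ConvertSplitToSlicePass amounts to replacing each output of a split with a single slice along the same dimension, with start/end offsets accumulated from the split sizes. A standalone sketch in eager PyTorch (not part of the commit; torch.narrow stands in for the slice op created in the graph):

import torch

x = torch.arange(10).reshape(2, 5)
split_sizes = [2, 3]
dim = 1

outputs = torch.split(x, split_sizes, dim=dim)

start = 0
for index, size in enumerate(split_sizes):
    # One slice per split output: slice(input, dim, start, end) in the graph,
    # which torch.narrow(x, dim, start, length) reproduces in eager mode.
    sliced = torch.narrow(x, dim, start, size)
    assert torch.equal(sliced, outputs[index])
    start += size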
backends/arm/_passes/decompose_select_scatter_pass.py

Lines changed: 143 additions & 0 deletions
@@ -0,0 +1,143 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Set, Type
+
+import torch
+
+from executorch.backends.arm._passes import ArmPass
+from executorch.backends.arm._passes.convert_int64_const_ops_to_int32 import (
+    ConvertInt64ConstOpsToInt32Pass,
+)
+from executorch.backends.arm._passes.replace_scalar_with_tensor_pass import (
+    ReplaceScalarWithTensorByProfilePass,
+)
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass
+
+edge_scatter_ops = (exir_ops.edge.aten.select_scatter.default,)
+aten_scatter_ops = (torch.ops.aten.select_scatter.default,)
+
+
+def get_select_scatter_decomposition(op) -> tuple:
+    if op in edge_scatter_ops:
+        return (
+            exir_ops.edge.aten.arange.start_step,
+            exir_ops.edge.aten.eq.Scalar,
+            exir_ops.edge.aten.where.self,
+            exir_ops.edge.aten.expand_copy.default,
+            exir_ops.edge.aten.unsqueeze_copy.default,
+            exir_ops.edge.aten.view_copy.default,
+        )
+    if op in aten_scatter_ops:
+        return (
+            torch.ops.aten.arange.start_step,
+            torch.ops.aten.eq.Scalar,
+            torch.ops.aten.where.self,
+            torch.ops.aten.expand_copy.default,
+            torch.ops.aten.unsqueeze_copy.default,
+            torch.ops.aten.view_copy.default,
+        )
+
+    raise RuntimeError(f"Can't get select_scatter decomposition for op {op}")
+
+
+class DecomposeSelectScatterPass(ArmPass):
+    """select_scatter is decomposed into other ops during export; however, this is only
+    supported for the fp profile, so for the int profile we need to decompose it here.
+
+    The decomposition is as follows:
+    - Build a boolean mask the size of x:
+        eq(view(arange(0, dim_size), mask_shape), index)
+    - Broadcast source to x:
+        expand(unsqueeze(source, dim), shape)
+    - Route the updated slice while keeping the untouched lanes:
+        where(mask, expanded_source, x)
+
+    This reflects the decomposition for the fp profile implemented in torch._refs.
+    """
+
+    _passes_required_after: Set[Type[ExportPass]] = {
+        ReplaceScalarWithTensorByProfilePass,
+        ConvertInt64ConstOpsToInt32Pass,
+    }
+
+    def call_operator(self, op, args, kwargs, meta):
+        if op not in (edge_scatter_ops + aten_scatter_ops):
+            return super().call_operator(op, args, kwargs, meta, updated=False)
+
+        (
+            arange_op,
+            eq_op,
+            where_op,
+            expand_op,
+            unsqueeze_op,
+            view_op,
+        ) = get_select_scatter_decomposition(op)
+
+        input_tensor = args[0]
+        src_tensor = args[1]
+        dim = int(args[2])
+        index = int(args[3])
+
+        shape = input_tensor.data.size()
+        rank = len(shape)
+        dim = dim % rank if dim < 0 else dim
+        dim_size = shape[dim]
+        if index < 0:
+            index = index + dim_size
+
+        mask_shape = [1] * rank
+        mask_shape[dim] = -1
+
+        arange_node = super().call_operator(
+            arange_op,
+            (0, dim_size, 1),
+            {},
+            meta,
+            updated=False,
+        )
+
+        view_node = super().call_operator(
+            view_op,
+            (arange_node, mask_shape),
+            {},
+            meta,
+            updated=False,
+        )
+
+        mask_node = super().call_operator(
+            eq_op,
+            (view_node, index),
+            {},
+            meta,
+            updated=False,
+        )
+
+        unsqueeze_node = super().call_operator(
+            unsqueeze_op,
+            (src_tensor, dim),
+            {},
+            meta,
+            updated=False,
+        )
+
+        expand_node = super().call_operator(
+            expand_op,
+            (unsqueeze_node, shape),
+            {},
+            meta,
+            updated=False,
+        )
+
+        where_node = super().call_operator(
+            where_op,
+            (mask_node, expand_node, input_tensor),
+            {},
+            meta,
+            updated=True,
+        )
+
+        return where_node
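
The mask/expand/where decomposition above can be sanity-checked against eager PyTorch, since the same sequence should reproduce torch.select_scatter. A standalone sketch under that assumption (not part of the commit; the helper name is illustrative):

import torch

def select_scatter_decomposed(x, src, dim, index):
    # Normalize negative dim/index the same way the pass does.
    dim = dim % x.dim()
    index = index % x.size(dim)
    # Boolean mask that is True only at `index` along `dim`.
    mask_shape = [1] * x.dim()
    mask_shape[dim] = -1
    mask = torch.arange(x.size(dim)).view(mask_shape) == index
    # Broadcast the source slice back to the full shape of x.
    expanded_src = src.unsqueeze(dim).expand(x.shape)
    # Take the updated slice where the mask is set, keep x elsewhere.
    return torch.where(mask, expanded_src, x)

x = torch.zeros(2, 3, 4)
src = torch.ones(2, 4)
assert torch.equal(
    select_scatter_decomposed(x, src, dim=1, index=2),
    torch.select_scatter(x, src, 1, 2),
)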

backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py

Lines changed: 13 additions & 5 deletions
@@ -334,7 +334,7 @@ class QuantizeClampArgumentsPass(ArmPass):
     - Makes sure the min and max values to clamp.default are quantized, if it's a quantized operator.
     """

-    _passes_required_after: Set[Type[ExportPass]] = {FoldAndAnnotateQParamsPass}
+    _passes_required_after: Set[Type[ExportPass]] = set()

     def call(self, graph_module: GraphModule) -> PassResult:
         modified = False
@@ -346,12 +346,15 @@ def call(self, graph_module: GraphModule) -> PassResult:
             }:
                 continue

-            # Make sure we have a quantized operator
-            user = list(n.users)[0]
-            if user.target not in Q_OPS:
+            try:
+                output_qparams = get_output_qparams(n)
+            except ValueError:
+                continue
+            if len(output_qparams) == 0:
                 continue

-            qargs = QuantArgs.from_operator(user.target, user.args)
+            # Qparams are stored per user index; use the first entry.
+            qargs = next(iter(output_qparams.values()))

             if n.target == exir_ops.edge.aten.clamp.default:
                 # Quantize the min and max arguments of clamp, if they are not None
@@ -368,4 +371,9 @@ def call(self, graph_module: GraphModule) -> PassResult:

                 modified = True

+        if modified:
+            # Retrace to refresh fake tensor metadata after updating clamp min/max.
+            graph_module = super().call(graph_module).graph_module
+            graph_module.recompile()
+
         return PassResult(graph_module, modified)
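
QuantizeClampArgumentsPass now reads the output quantization parameters recorded by q/dq folding (get_output_qparams) instead of peeking at a following Q op. The numeric effect on a clamp bound is ordinary affine quantization; a minimal sketch of that arithmetic (illustrative only — scale, zero_point, qmin and qmax here are assumed field names standing in for the commit's QuantArgs, not a verified API):

def quantize_clamp_bound(value, scale, zero_point, qmin, qmax):
    # Affine quantization: q = round(value / scale) + zero_point,
    # clipped to the representable integer range.
    q = round(value / scale) + zero_point
    return max(qmin, min(qmax, q))

# e.g. clamp(max=6.0) on an int8 output quantized with scale=0.25, zero_point=0
assert quantize_clamp_bound(6.0, scale=0.25, zero_point=0, qmin=-128, qmax=127) == 24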

backends/arm/_passes/fuse_constant_ops_pass.py

Lines changed: 4 additions & 1 deletion
@@ -178,7 +178,10 @@ def f(node_name_pre_computed):
         return node_name_pre_computed
     """

-    _passes_required_after: Set[Type[ExportPass]] = {FuseEqualPlaceholdersPass}
+    _passes_required_after: Set[Type[ExportPass]] = {
+        FuseEqualPlaceholdersPass,
+        FuseConstantArgsPass,
+    }

     targeted_ops = [
         exir_ops.edge.aten.full.default,

backends/arm/_passes/replace_inf_values_pass.py renamed to backends/arm/_passes/replace_inf_and_limit_values_pass.py

Lines changed: 8 additions & 6 deletions
@@ -14,9 +14,11 @@
 from executorch.exir.pass_base import ExportPass, PassResult


-class ReplaceInfValuesPass(ArmPass):
+class ReplaceInfAndLimitValuesPass(ArmPass):
     """
-    Due to limitation in Quantizer, we need to change inf/-inf to more quantizable values.
+    Rewrites +inf/-inf and floating-point limit values (e.g., torch.finfo(...).min/max)
+    to quantization-friendly values (±255 by default), improving quantizer stability
+    (notably for attention mask paths).
     """

     _passes_required_after: Set[Type[ExportPass]] = set()
@@ -34,12 +36,12 @@ def call(self, graph_module: torch.fx.GraphModule):
         for node in graph_module.graph.nodes:
             arg_list = list(node.args)
             for index, arg in enumerate(arg_list):
-                if arg == float("-inf"):
+                if arg == float("-inf") or arg == torch.finfo(torch.float32).min:
                     modified = True
-                    arg_list[index] = -255
-                elif arg == float("inf"):
+                    arg_list[index] = -255.0
+                elif arg == float("inf") or arg == torch.finfo(torch.float32).max:
                     modified = True
-                    arg_list[index] = +255
+                    arg_list[index] = +255.0
             node.args = tuple(arg_list)

         if modified:
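
The renamed pass additionally matches the float32 finfo limits, which often appear as attention-mask fill values in place of literal infinities. A standalone sketch of the value rewrite on plain Python floats (the FX pass applies the same comparisons to node arguments):

import torch

F32_MIN = torch.finfo(torch.float32).min
F32_MAX = torch.finfo(torch.float32).max

def replace_inf_or_limit(arg):
    # Map -inf / float32 min to -255.0 and +inf / float32 max to +255.0,
    # mirroring the replacements made by ReplaceInfAndLimitValuesPass.
    if arg == float("-inf") or arg == F32_MIN:
        return -255.0
    if arg == float("inf") or arg == F32_MAX:
        return 255.0
    return arg

assert replace_inf_or_limit(float("-inf")) == -255.0
assert replace_inf_or_limit(F32_MAX) == 255.0
assert replace_inf_or_limit(0.5) == 0.5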
