Arm backend: Add partial support for aten.gather (pytorch#16561)

YufengShi-dudu · SS-JIA · web-flow · commit 5b3d9fcdc1e2 · 2026-01-22T17:35:41.000+01:00
- Canonicalize edge.aten.gather to backend dialect tosa.GATHER - Register TOSA dialect op for GATHER - Add GatherVisitor lowering for tosa.GATHER - Add GatherSupported check for the restricted 2D gather pattern Change-Id: I0c31079a46bd3a2309ac337eff7824b7a8c0c661 cc @freddan80 @per @zingo @oscarandersson8218 @digantdesai Signed-off-by: Yufeng Shi <yufeng.shi@arm.com> Co-authored-by: Sicheng Stephen Jia <ssjia@meta.com>
diff --git a/backends/arm/_passes/__init__.py b/backends/arm/_passes/__init__.py
@@ -10,6 +10,7 @@
 from .annotate_decomposed_matmul import AnnotateDecomposedMatmulPass  # noqa
 from .annotate_output_dim_order_pass import AnnotateOutputDimOrderPass  # noqa
 from .broadcast_args_pass import BroadcastArgsPass  # noqa
+from .canonicalize_gather_pass import CanonicalizeGatherPass  # noqa
 from .cast_int64_pass import CastInt64BuffersToInt32Pass  # noqa
 from .cast_to_int32_pass import CastToInt32Pass  # noqa
 from .conv1d_unsqueeze_pass import Conv1dUnsqueezePass  # noqa
diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py
@@ -15,6 +15,7 @@
     AnnotateDecomposedMatmulPass,
     AnnotateOutputDimOrderPass,
     BroadcastArgsPass,
+    CanonicalizeGatherPass,
     CastInt64BuffersToInt32Pass,
     CastToInt32Pass,
     ComputeConstantOpsAOTPass,
@@ -228,6 +229,7 @@ def _tosa_pipeline(
                 FuseQuantizedActivationPass(),
                 RewriteBoolBitwiseNotToLogicalNotPass(),
                 RewriteBoolToFp32CastViaInt8Pass(),
+                CanonicalizeGatherPass(),
                 ConvertToClampPass(),
                 DecomposeTOSAUnsupportedClampPass(),
                 DecomposeGroupNormPass(),
diff --git a/backends/arm/_passes/canonicalize_gather_pass.py b/backends/arm/_passes/canonicalize_gather_pass.py
@@ -0,0 +1,122 @@
+# Copyright 2026 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import logging
+from typing import Set, Type
+
+import torch
+from executorch.backends.arm._passes import ArmPass
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass
+
+logger = logging.getLogger(__name__)
+
+
+class CanonicalizeGatherPass(ArmPass):
+    """
+    Canonicalize gather so it can be lowered to TOSA.GATHER via the backend dialect.
+
+    Expects nodes gated by GatherSupported; any other pattern raises RuntimeError.
+
+    Behavior:
+      - Reshape x from [N,K] to [N,K,1] to match TOSA gather's values layout [N,K,C].
+      - Keep indices as [N,W].
+      - Lower using tosa.GATHER.default, producing [N,W,1].
+      - Reshape output to [N,W].
+      - Only insert bool<->int8 casts when x is bool:
+        * If x is bool: gather runs on int8 and output is cast back to bool.
+        * If x is not bool: gather runs on original dtype and output keeps dtype.
+    """
+
+    _passes_required_after: Set[Type[ExportPass]] = set()
+
+    _TARGET_OPS = {exir_ops.edge.aten.gather.default}  # ops rewritten by this pass
+
+    def call_operator(self, op, args, kwargs, meta):
+        """Lower a supported gather to view -> tosa.GATHER -> view; pass others on."""
+        if op not in self._TARGET_OPS:
+            return super().call_operator(op, args, kwargs, meta)
+
+        # edge.aten.gather.default: (x, dim, index); kw-only sparse_grad is not forwarded
+        x, dim, index = args
+
+        # GatherSupported should have gated this already; treat violations as errors.
+        x_shape = x.data.shape
+        index_shape = index.data.shape
+        if not (
+            dim in (1, -1)
+            and len(x_shape) == 2
+            and len(index_shape) == 2
+            and index_shape[0] == x_shape[0]
+        ):
+            raise RuntimeError(
+                f"[{op}] Unexpected gather pattern; expected "
+                f"x:[N,K], index:[N,W], dim in {{1,-1}}, matching N. "
+                f"Got dim={dim}, x.shape={x_shape}, index.shape={index_shape}."
+            )
+
+        N, K = x_shape[0], x_shape[1]
+        W = index_shape[1]
+
+        view_op = exir_ops.edge.aten.view_copy.default
+        to_copy_op = exir_ops.edge.dim_order_ops._to_dim_order_copy.default
+
+        # Use backend dialect gather (C == 1 after the reshape below):
+        # values:  [N,K,C], indices: [N,W]
+        # output:  [N,W,C]
+        tosa_gather_op = exir_ops.backend.tosa.GATHER.default
+
+        needs_bool_cast = x.data.dtype == torch.bool
+
+        # bool -> int8 (only if needed); new nodes reuse the original node's meta
+        values_in = x
+        if needs_bool_cast:
+            values_in = super().call_operator(
+                to_copy_op,
+                (x,),
+                {"dtype": torch.int8},
+                meta,
+                updated=True,
+            )
+
+        # [N,K] -> [N,K,1]
+        values_3d = super().call_operator(
+            view_op,
+            (values_in, [N, K, 1]),
+            {},
+            meta,
+            updated=True,
+        )
+
+        # indices stay [N,W]; the gather result is [N,W,1]
+        gathered_3d = super().call_operator(
+            tosa_gather_op,
+            (values_3d, index),
+            {},
+            meta,
+            updated=True,
+        )
+
+        # [N,W,1] -> [N,W]
+        gathered_2d = super().call_operator(
+            view_op,
+            (gathered_3d, [N, W]),
+            {},
+            meta,
+            updated=True,
+        )
+
+        # int8 -> bool (only if needed), restoring the dtype of the original x
+        if needs_bool_cast:
+            return super().call_operator(
+                to_copy_op,
+                (gathered_2d,),
+                {"dtype": torch.bool},
+                meta,
+                updated=True,
+            )
+
+        return gathered_2d
diff --git a/backends/arm/operator_support/__init__.py b/backends/arm/operator_support/__init__.py
@@ -1,4 +1,4 @@
-# Copyright 2024-2025 Arm Limited and/or its affiliates.
+# Copyright 2024-2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -10,6 +10,7 @@
     convolution_support,
     embedding_support,
     ethos_u55_support,
+    gather_support,
     index_select_support,
     index_tensor_support,
     minmax_support,
diff --git a/backends/arm/operator_support/ethos_u55_support.py b/backends/arm/operator_support/ethos_u55_support.py
@@ -204,6 +204,7 @@ class EthosU55NotSupported(OperatorSupportBase):
         exir_ops.edge.aten.ne.Tensor,
         exir_ops.edge.aten.ne.Scalar,
         exir_ops.edge.aten.flip.default,  # REVERSE
+        exir_ops.edge.aten.gather.default,  # GATHER
         exir_ops.edge.aten.grid_sampler_2d,  # GATHER
         exir_ops.edge.aten.index.Tensor,  # GATHER
         exir_ops.edge.aten.index_select.default,  # GATHER
diff --git a/backends/arm/operator_support/gather_support.py b/backends/arm/operator_support/gather_support.py
@@ -0,0 +1,126 @@
+# Copyright 2026 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Declare operator support for ``edge.aten.gather`` in TOSA.
+
+This support check matches the subset accepted by CanonicalizeGatherPass:
+
+- target: exir_ops.edge.aten.gather.default
+- args: exactly (x, dim, index)  (i.e. len(node.args) == 3)
+- dim must be 1 or -1
+- x must be rank-2
+- index must be rank-2
+- index dtype must be int32
+- batch dim must match: x.shape[0] == index.shape[0]
+
+Dtype gating is capability-based:
+
+- int8/int16/int32 values require INT profile.
+- bool values require INT profile (handled via casts: bool -> int8 -> bool).
+- fp16/fp32 values are supported via FP profile directly, or via quantization
+  when running under an INT profile.
+
+Note:
+- CanonicalizeGatherPass reshapes values to [N, K, 1] and keeps indices as [N, W],
+  then lowers via the TOSA gather dialect.
+"""
+
+import torch
+import torch.fx as fx
+
+from executorch.backends.arm.operator_support.tosa_supported_operators import (
+    register_tosa_support_check,
+    SupportedTOSAOperatorCheck,
+)
+from executorch.backends.arm.tosa import TosaSpecification
+from executorch.exir.dialects._ops import ops as exir_ops
+
+
+@register_tosa_support_check
+class GatherSupported(SupportedTOSAOperatorCheck):
+    """Provide TOSA support check for ``edge.aten.gather``."""
+
+    targets = [exir_ops.edge.aten.gather.default]
+
+    tosa_specs = [
+        TosaSpecification.create_from_string("TOSA-1.0+INT"),
+        TosaSpecification.create_from_string("TOSA-1.0+FP"),
+    ]
+
+    def is_node_tosa_supported(
+        self, node: fx.Node, tosa_spec: TosaSpecification
+    ) -> bool:  # type: ignore[override, misc]
+        """Check ``node`` against the supported gather subset; report any reject."""
+        if len(node.args) != 3:
+            self.reporter.report_reject(
+                node,
+                f"{node.target}: expected 3 args (x, dim, index), got "
+                f"{len(node.args)}.",
+            )
+            return False
+
+        x_arg, dim, index_arg = node.args[0], node.args[1], node.args[2]
+        x_val = x_arg.meta["val"]  # type: ignore[union-attr]
+        index_val = index_arg.meta["val"]  # type: ignore[union-attr]
+        x_shape = tuple(x_val.shape)
+        index_shape = tuple(index_val.shape)
+
+        # ---- index dtype: only int32 indices are accepted ----
+        if index_val.dtype != torch.int32:
+            self.reporter.report_reject(
+                node,
+                f"{node.target}: index dtype {index_val.dtype} not supported; "
+                "expected int32.",
+            )
+            return False
+
+        # ---- dim + rank: gather along the last axis of rank-2 x and index ----
+        if not (
+            (dim == 1 or dim == -1) and len(x_shape) == 2 and len(index_shape) == 2
+        ):
+            self.reporter.report_reject(
+                node,
+                f"{node.target}: unsupported dim/rank; got {dim=}, "
+                f"x_rank={len(x_shape)}, index_rank={len(index_shape)}; "
+                "supported: dim in {1, -1} with rank-2 x and rank-2 index.",
+            )
+            return False
+
+        # ---- batch dim compatibility: x and index must agree on dim 0 ----
+        if x_shape[0] != index_shape[0]:
+            self.reporter.report_reject(
+                node,
+                f"{node.target}: batch mismatch {x_shape[0]=} vs {index_shape[0]=}.",
+            )
+            return False
+
+        # ---- values dtype: gated by the capabilities of tosa_spec ----
+        values_dtype = x_val.dtype
+        # ints (and bool via casts) require INT profile
+        if values_dtype in (torch.bool, torch.int8, torch.int16, torch.int32):
+            if not tosa_spec.support_integer():
+                self.reporter.report_reject(
+                    node,
+                    f"{node.target}: dtype {values_dtype} requires INT profile.",
+                )
+                return False
+        # fp16/fp32: either FP profile, or INT profile (via quantization)
+        elif values_dtype in (torch.float16, torch.float32):
+            if not (tosa_spec.support_float() or tosa_spec.support_integer()):
+                self.reporter.report_reject(
+                    node,
+                    f"{node.target}: dtype {values_dtype} requires FP profile or "
+                    "INT profile (with quantization).",
+                )
+                return False
+        else:
+            self.reporter.report_reject(
+                node,
+                f"{node.target}: unsupported values dtype {values_dtype}; "
+                "expected bool/int8/int16/int32/float16/float32.",
+            )
+            return False
+
+        return True
diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py
@@ -58,6 +58,7 @@
     op_tosa_conv2d,
     op_tosa_conv3d,
     op_tosa_depthwise_conv2d,
+    op_tosa_gather,
     op_tosa_matmul,
     op_tosa_rescale,
     op_tosa_resize,
diff --git a/backends/arm/operators/op_tosa_gather.py b/backends/arm/operators/op_tosa_gather.py
@@ -0,0 +1,80 @@
+# Copyright 2026 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Any, List
+
+import tosa_serializer as ts
+
+from executorch.backends.arm.operators.node_visitor import (
+    NodeVisitor,
+    register_node_visitor,
+)
+from executorch.backends.arm.operators.operator_validation_utils import (
+    validate_num_inputs,
+    validate_same_dtype,
+    validate_valid_dtype,
+)
+from executorch.backends.arm.tosa.mapping import TosaArg
+from torch.fx import Node
+
+
+@register_node_visitor
+class GatherVisitor(NodeVisitor):
+    """
+    Lowers backend TOSA dialect `tosa.GATHER.default`.
+
+    Expected signature (per TOSA):
+      values:  [N, K, C]  (rank 3)
+      indices: [N, W]     (rank 2, int32)
+      output:  [N, W, C]  (rank 3)
+    """
+
+    target = "tosa.GATHER.default"
+    tosa_specs = NodeVisitor.tosa_specs
+
+    def define_node(
+        self,
+        node: Node,
+        tosa_graph: Any,
+        inputs: List[TosaArg],
+        output: TosaArg,
+    ) -> None:
+        """Validate dtypes and serialize one TOSA GATHER op; shapes are not checked."""
+        validate_num_inputs(self.target, inputs, 2)
+        values = inputs[0]
+        indices = inputs[1]
+
+        validate_same_dtype(self.target, [values, output], ts)
+        # Indices must be int32 for TOSA GATHER
+        validate_valid_dtype(
+            self.target,
+            [indices],
+            [ts.DType.INT32],
+            output.tosa_spec,
+        )
+        validate_valid_dtype(  # no BOOL: CanonicalizeGatherPass casts bool to int8
+            self.target,
+            [values, output],
+            [
+                ts.DType.INT8,
+                ts.DType.INT16,
+                ts.DType.INT32,
+                ts.DType.FP16,
+                ts.DType.FP32,
+            ],
+            output.tosa_spec,
+        )
+
+        attr = ts.TosaSerializerAttribute()
+        attr.GatherAttribute()
+
+        self._serialize_operator(
+            node,
+            tosa_graph,
+            ts.Op.GATHER,
+            [values.name, indices.name],
+            [output.name],
+            attr,
+        )
diff --git a/backends/arm/quantizer/quantization_annotator.py b/backends/arm/quantizer/quantization_annotator.py
@@ -428,6 +428,7 @@ def _match_pattern(
     torch.ops.aten.clamp.default,
     torch.ops.aten.clamp.Tensor,
     torch.ops.aten.unflatten.int,
+    torch.ops.aten.gather.default,
     torch.ops.aten.index_select.default,
     torch.ops.aten.index.Tensor,
     # Neg operator flips the range, but keps the magnitude the same.
diff --git a/backends/arm/scripts/collect_testname_resources.py b/backends/arm/scripts/collect_testname_resources.py
diff --git a/backends/arm/test/ops/test_gather.py b/backends/arm/test/ops/test_gather.py
diff --git a/backends/arm/tosa/dialect/__init__.py b/backends/arm/tosa/dialect/__init__.py
diff --git a/backends/arm/tosa/dialect/ops/gather.py b/backends/arm/tosa/dialect/ops/gather.py