Commit b2a8550

Qualcomm AI Engine Direct - Support Flip & Index_Select (#13906)

### Summary
- Support Flip
- Support Index_Select

### Test plan
- `python backends/qualcomm/tests/test_qnn_delegate.py -k TestQNNQuantizedOperator.test_qnn_backend_flip --model SM8750 --device $DEVICE --build_folder build-android`
- `python backends/qualcomm/tests/test_qnn_delegate.py -k TestQNNQuantizedOperator.test_qnn_backend_index_select --model SM8750 --device $DEVICE --build_folder build-android`
- `python backends/qualcomm/tests/test_qnn_delegate.py -k TestQNNQuantizedModel.test_qnn_backend_conv2d_flip --model SM8750 --device $DEVICE --build_folder build-android`
- `python backends/qualcomm/tests/test_qnn_delegate.py -k TestQNNQuantizedModel.test_qnn_backend_conv2d_slice_copy --model SM8750 --device $DEVICE --build_folder build-android`
1 parent 33c8f76 commit b2a8550

File tree

9 files changed: +284 -5 lines

backends/qualcomm/_passes/layout_transform.py

Lines changed: 2 additions & 0 deletions

@@ -79,6 +79,7 @@ class LayoutTransform(ExportPass):
         exir_ops.edge.aten.elu.default,
         exir_ops.edge.aten.eq.Tensor,
         exir_ops.edge.aten.exp.default,
+        exir_ops.edge.aten.flip.default,
         exir_ops.edge.aten.floor.default,
         exir_ops.edge.aten.floor_divide.default,
         exir_ops.edge.aten.full.default,
@@ -111,6 +112,7 @@ class LayoutTransform(ExportPass):
         exir_ops.edge.aten.round.default,
         exir_ops.edge.aten.sigmoid.default,
         exir_ops.edge.aten.sign.default,
+        exir_ops.edge.aten.slice_copy.Tensor,
         exir_ops.edge.aten.split_with_sizes.default,
         exir_ops.edge.aten.split_with_sizes_copy.default,
         exir_ops.edge.aten.sqrt.default,

backends/qualcomm/builders/__init__.py

Lines changed: 4 additions & 0 deletions

@@ -36,6 +36,7 @@
     op_eq,
     op_exp,
     op_expand,
+    op_flip,
     op_floor,
     op_full,
     op_full_like,
@@ -49,6 +50,7 @@
     op_hardtanh,
     op_index,
     op_index_put,
+    op_index_select,
     op_instance_norm,
     op_layer_norm,
     op_le,
@@ -139,6 +141,7 @@
     op_eq,
     op_exp,
     op_expand,
+    op_flip,
     op_floor,
     op_full,
     op_full_like,
@@ -152,6 +155,7 @@
     op_hardsigmoid,
     op_index,
     op_index_put,
+    op_index_select,
     op_instance_norm,
     op_layer_norm,
     op_le,
backends/qualcomm/builders/op_flip.py

Lines changed: 81 additions & 0 deletions

@@ -0,0 +1,81 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import Dict
+
+import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper
+
+import numpy as np
+import torch
+
+from executorch.backends.qualcomm.utils.constants import QCOM_AXIS_ORDER
+
+from .node_visitor import NodeVisitor
+from .node_visitor_manager import register_node_visitor
+from .qnn_constants import OpStridedSlice, QNN_OP_PACKAGE_NAME_QTI_AISW
+
+
+@register_node_visitor
+class Flip(NodeVisitor):
+    target = ["aten.flip.default"]
+
+    def __init__(self, *args) -> None:
+        super().__init__(*args)
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper],
+    ) -> PyQnnWrapper.PyQnnOpWrapper:
+        input_node = self.get_node(node.args[0])
+        input_tensor = self.get_tensor(input_node, node)
+        tensor_type = PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE
+
+        input_tensor_wrapper = self.define_tensor(
+            input_node,
+            node,
+            input_tensor,
+            tensor_type,
+            nodes_to_wrappers,
+        )
+
+        output_tensor = self.get_tensor(node, node)
+        output_tensor_wrapper = self.define_tensor(
+            node,
+            node,
+            output_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+        )
+        ranges = []
+
+        dims = node.args[1]
+        if QCOM_AXIS_ORDER in node.meta:
+            dims = [node.meta[QCOM_AXIS_ORDER].index(dim) for dim in dims]
+
+        for dim, size in enumerate(output_tensor.shape):
+            if dim in dims:
+                ranges.extend([size - 1, -1, -1])
+            else:
+                ranges.extend([0, size, 1])
+
+        range_shape = [input_tensor.dim(), 3]
+        stride_slice_op = PyQnnWrapper.PyQnnOpWrapper(
+            node.name,
+            QNN_OP_PACKAGE_NAME_QTI_AISW,
+            OpStridedSlice.op_name,
+        )
+        stride_slice_op.AddInputTensors([input_tensor_wrapper])
+        stride_slice_op.AddOutputTensors([output_tensor_wrapper])
+        stride_slice_op.AddTensorParam(
+            OpStridedSlice.param_ranges,
+            PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_INT_32,
+            len(range_shape),
+            range_shape,
+            np.array(ranges, dtype=np.int32),
+            True,
+        )
+
+        return stride_slice_op
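
Note: the `ranges` parameter packs one `(begin, end, stride)` triple per dimension; a flipped dimension walks backward from `size - 1` down to (exclusive) `-1` with stride `-1`, while the remaining dimensions keep the identity range `(0, size, 1)`. A minimal eager-mode sketch of that equivalence (the helper name is illustrative, not part of the builder):

```python
import torch

def flip_via_ranges(x: torch.Tensor, dims: list) -> torch.Tensor:
    # Mirror the (begin, end, stride) triples passed to QNN StridedSlice:
    # flipped dims enumerate indices size-1 .. 0, others stay in order.
    out = x
    for dim, size in enumerate(x.shape):
        if dim in dims:
            idx = torch.arange(size - 1, -1, -1)  # backward walk, stride -1
        else:
            idx = torch.arange(0, size, 1)        # identity range
        out = out.index_select(dim, idx)
    return out

x = torch.arange(24).reshape(2, 3, 4)
assert torch.equal(flip_via_ranges(x, [0, 2]), torch.flip(x, [0, 2]))
```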
backends/qualcomm/builders/op_index_select.py

Lines changed: 81 additions & 0 deletions

@@ -0,0 +1,81 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import Dict
+
+import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper
+
+import numpy as np
+import torch
+from executorch.backends.qualcomm.utils.constants import QCOM_DATA
+
+from .node_visitor import NodeVisitor
+from .node_visitor_manager import register_node_visitor
+from .qnn_constants import OpGather, QNN_OP_PACKAGE_NAME_QTI_AISW
+
+
+@register_node_visitor
+class IndexSelect(NodeVisitor):
+    target = ["aten.index_select.default"]
+
+    def __init__(self, *args) -> None:
+        super().__init__(*args)
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper],
+    ) -> PyQnnWrapper.PyQnnOpWrapper:
+        input_node = self.get_node(node.args[0])
+        input_tensor = self.get_tensor(input_node, node)
+        input_tensor_wrapper = self.define_tensor(
+            input_node,
+            node,
+            input_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+        )
+
+        axis = node.args[1]
+        indices_node = node.args[2]
+        indices_tensor = self.get_tensor(indices_node, node).to(torch.int32)
+        assert indices_tensor.size(0) != 0, "Empty indices list is not supported"
+
+        indices_tensor_wrapper = self.define_tensor(
+            indices_node,
+            node,
+            indices_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+        )
+
+        gather_input_tensors = [input_tensor_wrapper, indices_tensor_wrapper]
+
+        output_tensor = self.get_tensor(node, node)
+        output_tensor_wrapper = self.define_tensor(
+            node,
+            node,
+            output_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+        )
+        gather_output_tensors = [output_tensor_wrapper]
+
+        gather_op = PyQnnWrapper.PyQnnOpWrapper(
+            node.name,
+            QNN_OP_PACKAGE_NAME_QTI_AISW,
+            OpGather.op_name,
+        )
+        gather_op.AddInputTensors(gather_input_tensors)
+        gather_op.AddOutputTensors(gather_output_tensors)
+
+        # If tuples of tensors become supported, refine this based on their length
+        gather_op.AddScalarParam(
+            OpGather.param_axis,
+            PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_INT_32,
+            {QCOM_DATA: np.int32(axis)},
+        )
+
+        return gather_op
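
Note: a gather with a scalar `axis` parameter has the same semantics as `torch.index_select` / `numpy.take` along that axis. A quick eager-mode sanity check (illustrative only, not part of the builder):

```python
import numpy as np
import torch

x = torch.randn(3, 5, 2)
indices = torch.tensor([4, 0, 2], dtype=torch.int32)  # int32, as cast above

# index_select along dim 1 matches a gather (numpy.take) on axis 1.
ref = torch.index_select(x, 1, indices)
gathered = torch.from_numpy(np.take(x.numpy(), indices.numpy(), axis=1))
assert torch.equal(ref, gathered)
```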

backends/qualcomm/builders/op_slice_copy.py

Lines changed: 3 additions & 3 deletions

@@ -6,9 +6,9 @@
 from typing import cast, Dict

 import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper
-
 import numpy as np
 import torch
+from executorch.backends.qualcomm.utils.constants import QCOM_AXIS_ORDER

 from .node_visitor import NodeVisitor
 from .node_visitor_manager import register_node_visitor
@@ -47,8 +47,9 @@ def define_node(
             PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
             nodes_to_wrappers,
         )
-
         dim = cast(int, node.args[1])
+        if QCOM_AXIS_ORDER in node.meta:
+            dim = node.meta[QCOM_AXIS_ORDER].index(dim)
         if dim < 0:
             dim = dim % len(input_tensor.shape)

@@ -62,7 +63,6 @@ def define_node(
             end = end % input_tensor.shape[dim]
         else:
             end = input_tensor.shape[dim]
-
         input_tensor_rank = len(input_tensor.shape)
         ranges = []
         for i in range(input_tensor_rank):
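
Note: `QCOM_AXIS_ORDER` records the permutation applied by the layout transform, i.e. `axis_order[new_pos] == old_dim`, so `.index(dim)` recovers where an original axis landed. A small sketch, assuming the usual NCHW-to-NHWC order `(0, 2, 3, 1)`:

```python
# Hypothetical axis order produced by an NCHW -> NHWC layout transform.
axis_order = (0, 2, 3, 1)  # axis_order[new_pos] == old_dim

# A slice along channels (old dim 1) must target the last NHWC axis.
old_dim = 1
new_dim = axis_order.index(old_dim)
assert new_dim == 3
```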

backends/qualcomm/partition/common_defs.py

Lines changed: 0 additions & 2 deletions

@@ -19,11 +19,9 @@
     exir_ops.edge.aten.adaptive_max_pool2d.default,
     exir_ops.edge.aten.avg_pool3d.default,
     exir_ops.edge.aten.div.Tensor_mode,
-    exir_ops.edge.aten.index_select.default,
     exir_ops.edge.aten.log10.default,
     exir_ops.edge.aten.log1p.default,
     exir_ops.edge.aten.log2.default,
-    exir_ops.edge.aten.flip.default,
     exir_ops.edge.aten.max_pool3d_with_indices.default,
     exir_ops.edge.aten.median.default,
     exir_ops.edge.aten.median.dim,

backends/qualcomm/quantizer/annotators.py

Lines changed: 11 additions & 0 deletions

@@ -433,6 +433,17 @@ def annotate_clamp(node: Node, quantization_config: QuantizationConfig) -> None:
     annotate_single_in_single_out(node, quantization_config)


+@register_annotator([torch.ops.aten.index_select.default])
+def annotate_index_select(node: Node, quantization_config: QuantizationConfig) -> None:
+    # args[2] = indices, which should be int
+    annotate_single_in_single_out(node, quantization_config)
+
+
+@register_annotator([torch.ops.aten.flip.default])
+def annotate_flip(node: Node, quantization_config: QuantizationConfig) -> None:
+    annotate_single_in_single_out(node, quantization_config)
+
+
 @register_annotator([torch.ops.aten.floor.default])
 def annotate_floor(node: Node, quantization_config: QuantizationConfig) -> None:
     annotate_single_in_single_out(node, quantization_config)
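
Note: both ops reuse `annotate_single_in_single_out`, so only the first tensor argument and the output receive quantization annotations; the indices of `index_select` stay an integer tensor and are not quantized. For reference, eager `index_select` likewise requires integer indices:

```python
import torch

x = torch.randn(4, 8)
idx = torch.tensor([1, 3], dtype=torch.int32)  # must be an integer tensor
print(torch.index_select(x, 0, idx).shape)     # torch.Size([2, 8])
```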

backends/qualcomm/tests/models.py

Lines changed: 52 additions & 0 deletions

@@ -646,6 +646,40 @@ def forward(self, x):
         return self.conv_transpose(self.conv(x))


+class Conv2dFlip(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv = torch.nn.Conv2d(
+            in_channels=16,
+            out_channels=16,
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            bias=False,
+        )
+        self.dims = [1, 3]
+
+    def forward(self, x):
+        x = self.conv(x)
+        return torch.flip(x, self.dims)
+
+
+class Conv2dSliceCopy(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv = torch.nn.Conv2d(
+            in_channels=1,
+            out_channels=4,
+            kernel_size=(3, 3),
+            padding=1,
+            bias=True,
+        )
+
+    def forward(self, x):
+        x = self.conv(x)
+        return x[:, 2:, :, :]
+
+
 class Conv2dSumReduceDim(torch.nn.Module):
     def __init__(self):
         super().__init__()
@@ -814,6 +848,15 @@ def forward(self, x):
         return torch.special.expm1(x)


+class Flip(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.dims = [0, 2]
+
+    def forward(self, x):
+        return torch.flip(x, self.dims)
+
+
 class Floor(torch.nn.Module):
     def __init__(self):
         super().__init__()
@@ -1039,6 +1082,15 @@ def forward(self, input_pos, k_val):
         return k_out + 0


+class IndexSelect(torch.nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, x, indices):
+        return torch.index_select(x, self.dim, indices)
+
+
 class InstanceNorm2d(torch.nn.Module):
     def __init__(self, n_features, affine=True):
         super().__init__()
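
Note: the new test modules can be smoke-tested eagerly before delegation; the input shapes below are illustrative:

```python
import torch
from executorch.backends.qualcomm.tests.models import Flip, IndexSelect

flip = Flip()  # flips dims [0, 2]
print(flip(torch.randn(2, 3, 4)).shape)  # torch.Size([2, 3, 4])

sel = IndexSelect(dim=1)
print(sel(torch.randn(3, 5), torch.tensor([4, 0])).shape)  # torch.Size([3, 2])
```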
