Qualcomm AI Engine Direct - Support Flip & Index_Select

winskuo-quic · winskuo-quic · commit f4540f1882ce · 2025-09-03T09:13:02.000+08:00
diff --git a/backends/qualcomm/builders/__init__.py b/backends/qualcomm/builders/__init__.py
@@ -36,6 +36,7 @@
     op_eq,
     op_exp,
     op_expand,
+    op_flip,
     op_floor,
     op_full,
     op_full_like,
@@ -49,6 +50,7 @@
     op_hardtanh,
     op_index,
     op_index_put,
+    op_index_select,
     op_instance_norm,
     op_layer_norm,
     op_le,
@@ -139,6 +141,7 @@
     op_eq,
     op_exp,
     op_expand,
+    op_flip,
     op_floor,
     op_full,
     op_full_like,
@@ -152,6 +155,7 @@
     op_hardsigmoid,
     op_index,
     op_index_put,
+    op_index_select,
     op_instance_norm,
     op_layer_norm,
     op_le,
diff --git a/backends/qualcomm/builders/op_flip.py b/backends/qualcomm/builders/op_flip.py
@@ -0,0 +1,96 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import Dict
+
+import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper
+
+import numpy as np
+import torch
+
+from .node_visitor import NodeVisitor
+from .node_visitor_manager import register_node_visitor
+from .qnn_constants import OpStridedSlice, QNN_OP_PACKAGE_NAME_QTI_AISW
+
+
+@register_node_visitor
+class Flip(NodeVisitor):
+    """Lower ``aten.flip.default`` to a single QNN StridedSlice op."""
+
+    target = ["aten.flip.default"]
+
+    def __init__(self, *args) -> None:
+        super().__init__(*args)
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper],
+    ) -> PyQnnWrapper.PyQnnOpWrapper:
+        """Build the StridedSlice op that reverses the requested dims.
+
+        Args:
+            node: The flip node; ``args[0]`` is the input and ``args[1]``
+                the dims to reverse (negative dims are accepted).
+            nodes_to_wrappers: Tensor-wrapper cache forwarded to
+                ``define_tensor``.
+
+        Returns:
+            The op wrapper with its input, output and ``ranges`` param set.
+        """
+        input_node = self.get_node(node.args[0])
+        input_tensor = self.get_tensor(input_node, node)
+        input_tensor_wrapper = self.define_tensor(
+            input_node,
+            node,
+            input_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+        )
+
+        output_tensor = self.get_tensor(node, node)
+        output_tensor_wrapper = self.define_tensor(
+            node,
+            node,
+            output_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+        )
+
+        rank = input_tensor.dim()
+        # torch.flip accepts negative dims (e.g. -1 for the last axis); wrap
+        # them so they match the non-negative indices enumerated below.
+        # A rank-0 tensor has no axis to reverse, and `% 0` would raise.
+        flip_dims = {dim % rank for dim in node.args[1]} if rank else set()
+        # NOTE(review): dims are matched against the shape returned by
+        # get_tensor as-is; confirm no axis-order remapping is needed here.
+
+        # One [begin, end, stride] triple per axis: flipped axes walk from
+        # the last element backwards, all other axes are copied unchanged.
+        ranges = []
+        for dim, size in enumerate(output_tensor.shape):
+            if dim in flip_dims:
+                ranges.extend([size - 1, -1, -1])
+            else:
+                ranges.extend([0, size, 1])
+
+        range_shape = [rank, 3]
+        stride_slice_op = PyQnnWrapper.PyQnnOpWrapper(
+            node.name,
+            QNN_OP_PACKAGE_NAME_QTI_AISW,
+            OpStridedSlice.op_name,
+        )
+        stride_slice_op.AddInputTensors([input_tensor_wrapper])
+        stride_slice_op.AddOutputTensors([output_tensor_wrapper])
+        stride_slice_op.AddTensorParam(
+            OpStridedSlice.param_ranges,
+            PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_INT_32,
+            len(range_shape),
+            range_shape,
+            np.array(ranges, dtype=np.int32),
+            True,
+        )
+
+        return stride_slice_op
diff --git a/backends/qualcomm/builders/op_index_select.py b/backends/qualcomm/builders/op_index_select.py
@@ -0,0 +1,100 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import Dict
+
+import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper
+
+import numpy as np
+import torch
+from executorch.backends.qualcomm.utils.constants import QCOM_DATA
+
+from .node_visitor import NodeVisitor
+from .node_visitor_manager import register_node_visitor
+from .qnn_constants import OpGather, QNN_OP_PACKAGE_NAME_QTI_AISW
+
+
+@register_node_visitor
+class IndexSelect(NodeVisitor):
+    """Lower ``aten.index_select.default`` to a QNN Gather op."""
+
+    target = ["aten.index_select.default"]
+
+    def __init__(self, *args) -> None:
+        super().__init__(*args)
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper],
+    ) -> PyQnnWrapper.PyQnnOpWrapper:
+        """Build the Gather op for an index_select node.
+
+        Args:
+            node: The index_select node; ``args`` are ``(input, dim, index)``.
+            nodes_to_wrappers: Tensor-wrapper cache forwarded to
+                ``define_tensor``.
+
+        Returns:
+            The op wrapper with its inputs, output and ``axis`` param set.
+
+        Raises:
+            AssertionError: If the index tensor is empty.
+        """
+        input_node = self.get_node(node.args[0])
+        input_tensor = self.get_tensor(input_node, node)
+        input_tensor_wrapper = self.define_tensor(
+            input_node,
+            node,
+            input_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+        )
+
+        axis = node.args[1]
+        # index_select accepts a negative dim (counted from the last axis);
+        # hand the backend the equivalent non-negative axis.
+        if axis < 0:
+            axis += input_tensor.dim()
+        # NOTE(review): axis is applied to the tensor returned by get_tensor
+        # as-is; confirm no axis-order remapping is needed here.
+
+        indices_node = node.args[2]
+        # The indices wrapper is defined from an int32 copy of the index tensor.
+        indices_tensor = self.get_tensor(indices_node, node).to(torch.int32)
+        assert indices_tensor.size(0) != 0, "Not support empty indices list"
+
+        indices_tensor_wrapper = self.define_tensor(
+            indices_node,
+            node,
+            indices_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+        )
+
+        output_tensor = self.get_tensor(node, node)
+        output_tensor_wrapper = self.define_tensor(
+            node,
+            node,
+            output_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+        )
+
+        gather_op = PyQnnWrapper.PyQnnOpWrapper(
+            node.name,
+            QNN_OP_PACKAGE_NAME_QTI_AISW,
+            OpGather.op_name,
+        )
+        gather_op.AddInputTensors([input_tensor_wrapper, indices_tensor_wrapper])
+        gather_op.AddOutputTensors([output_tensor_wrapper])
+
+        gather_op.AddScalarParam(
+            OpGather.param_axis,
+            PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_INT_32,
+            {QCOM_DATA: np.int32(axis)},
+        )
+
+        return gather_op
diff --git a/backends/qualcomm/partition/common_defs.py b/backends/qualcomm/partition/common_defs.py
@@ -19,11 +19,9 @@
     exir_ops.edge.aten.adaptive_max_pool2d.default,
     exir_ops.edge.aten.avg_pool3d.default,
     exir_ops.edge.aten.div.Tensor_mode,
-    exir_ops.edge.aten.index_select.default,
     exir_ops.edge.aten.log10.default,
     exir_ops.edge.aten.log1p.default,
     exir_ops.edge.aten.log2.default,
-    exir_ops.edge.aten.flip.default,
     exir_ops.edge.aten.max_pool3d_with_indices.default,
     exir_ops.edge.aten.median.default,
     exir_ops.edge.aten.median.dim,
diff --git a/backends/qualcomm/quantizer/annotators.py b/backends/qualcomm/quantizer/annotators.py
@@ -432,11 +432,18 @@ def annotate_ceil(node: Node, quantization_config: QuantizationConfig) -> None:
 def annotate_clamp(node: Node, quantization_config: QuantizationConfig) -> None:
     annotate_single_in_single_out(node, quantization_config)
 
+
 @register_annotator([torch.ops.aten.index_select.default])
 def annotate_index_select(node: Node, quantization_config: QuantizationConfig) -> None:
-    import pdb; pdb.set_trace()
+    # args[2] = indices, which should be int
     annotate_single_in_single_out(node, quantization_config)
 
+
+@register_annotator([torch.ops.aten.flip.default])
+def annotate_flip(node: Node, quantization_config: QuantizationConfig) -> None:
+    annotate_single_in_single_out(node, quantization_config)
+
+
 @register_annotator([torch.ops.aten.floor.default])
 def annotate_floor(node: Node, quantization_config: QuantizationConfig) -> None:
     annotate_single_in_single_out(node, quantization_config)
diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py
@@ -813,23 +813,29 @@ def __init__(self):
     def forward(self, x):
         return torch.special.expm1(x)
 
+
 class Flip(torch.nn.Module):
     def __init__(self):
         super().__init__()
-        self.dims = [0,2]
+        self.dims = [0, 2]
 
     def forward(self, x):
         return torch.flip(x, self.dims)
 
+
 class FlipDecomp(torch.nn.Module):
     def __init__(self):
         super().__init__()
-        self.dims = [0,2]
+        self.dims = [0, 2]
+
     def forward(self, x):
         for dim in self.dims:
-            idx = torch.arange(x.size(dim) - 1, -1, -1, device=x.device)
+            idx = torch.arange(start=x.size(dim) - 1, end=-1, step=-1)
+            # Select using reverse index, equivalent to flipping.
             x = torch.index_select(x, dim, idx)
         return x
+
+
 class Floor(torch.nn.Module):
     def __init__(self):
         super().__init__()
@@ -1055,6 +1061,15 @@ def forward(self, input_pos, k_val):
         return k_out + 0
 
 
+class IndexSelect(torch.nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, x, indices):
+        return torch.index_select(x, self.dim, indices)
+
+
 class InstanceNorm2d(torch.nn.Module):
     def __init__(self, n_features, affine=True):
         super().__init__()
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -555,6 +555,11 @@ def test_qnn_backend_expm1(self):
         module = ExpM1()  # noqa: F405
         self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_flip(self):
+        sample_input = (torch.randn(3, 4, 5, 6),)
+        module = Flip()  # noqa: F405
+        self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_floor(self):
         sample_input = (torch.randn(3, 4),)
         module = Floor()  # noqa: F405
@@ -778,6 +783,14 @@ def test_qnn_backend_index_put(self):
                     skip_mutable_buffer=test[QCOM_MODULE].skip_mutable_buffer,
                 )
 
+    def test_qnn_backend_index_select(self):
+        module = IndexSelect(dim=1)  # noqa: F405
+        sample_input = (
+            torch.randn(2, 3, 4, 5),
+            torch.tensor([0, 2]),
+        )
+        self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_instance_norm_2d(self):
         modules = [InstanceNorm2d(32), InstanceNorm2d(32, affine=False)]  # noqa: F405
         sample_input = (torch.randn([4, 32, 16, 16]),)
@@ -2031,17 +2044,11 @@ def test_qnn_backend_expm1(self):
         self.lower_module_and_test_output(module, sample_input)
 
     def test_qnn_backend_flip(self):
-        sample_input = (torch.randn(3, 4, 5,6),)
-        # golden_module = Flip()
-        decomp_module = FlipDecomp()
-        decomp_module = self.get_qdq_module(decomp_module, sample_input)
-        self.lower_module_and_test_output(decomp_module, sample_input)
-        # golden_out = golden_module(sample_input)
-        # decomp_out = decomp_module(sample_input)
-        # torch.testing.assert_close(golden_out, decomp_out)
-        
-        
-    
+        sample_input = (torch.randn(3, 4, 5, 6),)
+        module = Flip()  # noqa: F405
+        module = self.get_qdq_module(module, sample_input)
+        self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_floor(self):
         sample_input = (torch.randn(3, 4),)
         module = Floor()  # noqa: F405
@@ -2285,6 +2292,15 @@ def test_qnn_backend_index_put(self):
                     skip_mutable_buffer=test[QCOM_MODULE].skip_mutable_buffer,
                 )
 
+    def test_qnn_backend_index_select(self):
+        module = IndexSelect(dim=1)  # noqa: F405
+        sample_input = (
+            torch.randn(2, 3, 4, 5),
+            torch.tensor([0, 2]),
+        )
+        module = self.get_qdq_module(module, sample_input)
+        self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_instance_norm_2d(self):
         modules = [InstanceNorm2d(32), InstanceNorm2d(32, affine=False)]  # noqa: F405
         sample_input = (torch.randn([4, 32, 16, 16]),)