From 9953890714067ab66e9175fb6109029c716c1ca5 Mon Sep 17 00:00:00 2001
From: winskuo-quic
Date: Mon, 13 Oct 2025 16:40:53 +0800
Subject: [PATCH] Qualcomm AI Engine Direct - Suite operator fix part 3

---
 backends/qualcomm/builders/node_visitor.py   |  18 +-
 backends/qualcomm/builders/op_conv.py        |  52 ++++-
 backends/qualcomm/builders/op_elu.py         |   5 +-
 backends/qualcomm/quantizer/qconfig.py       |  28 ++-
 backends/qualcomm/quantizer/quantizer.py     |  85 +++++--
 backends/qualcomm/tests/models.py            |  23 +-
 backends/qualcomm/tests/test_qnn_delegate.py | 226 +++++++++++++++++--
 backends/test/suite/operators/test_div.py    |   6 +
 backends/test/suite/operators/test_log.py    |  28 ++-
 backends/test/suite/operators/test_rsqrt.py  |   7 +-
 backends/test/suite/operators/test_sqrt.py   |   7 +-
 11 files changed, 409 insertions(+), 76 deletions(-)

diff --git a/backends/qualcomm/builders/node_visitor.py b/backends/qualcomm/builders/node_visitor.py
index 8cbf3a50e22..f3dadb99129 100644
--- a/backends/qualcomm/builders/node_visitor.py
+++ b/backends/qualcomm/builders/node_visitor.py
@@ -153,6 +153,13 @@ def make_qnn_per_block_config(self, node: torch.fx.Node, quant_attrs: Dict):
         scales, scale_offset, quantized_scales = quant_attrs[QCOM_SCALE], [], []
         # channel in observers defaults to zero
         num_channels = node.meta["val"].shape[0]
+        user_0 = self.get_first_user(node)
+
+        ch_axis = 0
+        # args[6] to check if it is transpose conv
+        if user_0.target == exir_ops.edge.aten.convolution.default and user_0.args[6]:
+            num_channels = node.meta["val"].shape[1]
+            ch_axis = 1
         # TODO: expand this when QNN starts to support more configurations
         bitwidth_of_scale = 4
         quant_scales_dtype = torch.uint8
@@ -162,9 +169,10 @@ def make_qnn_per_block_config(self, node: torch.fx.Node, quant_attrs: Dict):
         )
 
         for ch in range(num_channels):
-            max_scale = scales[ch].reshape(1, -1).amax(dim=-1) / num_steps
+            candidates = scales[ch] if ch_axis == 0 else scales[:, ch, ...]
+            max_scale = candidates.reshape(1, -1).amax(dim=-1) / num_steps
             q_scales = torch.clamp(
-                input=torch.round(input=scales[ch] / max_scale),
+                input=torch.round(input=candidates / max_scale),
                 min=1,
                 max=2**bitwidth_of_scale,
             ).to(quant_scales_dtype)
@@ -174,11 +182,11 @@ def make_qnn_per_block_config(self, node: torch.fx.Node, quant_attrs: Dict):
 
         # skip dequantize op, e.g. frozen_param -> dq -> conv2d
         user_0 = self.get_first_user(node)
-        if "convolution" in user_0.target.__name__:
+        if user_0.target == exir_ops.edge.aten.convolution.default:
             # OIHW (pytorch) -> HWIO (QNN)
             quant_config[QCOM_AXIS] = node.meta["val"].dim() - 1
             quant_config[QCOM_AXIS_ORDER] = (2, 3, 1, 0)
-        elif "linear" in user_0.target.__name__:
+        elif user_0.target == exir_ops.edge.aten.linear.default:
             # OI (pytorch) -> OI (QNN)
             quant_config[QCOM_AXIS] = 0
             quant_config[QCOM_AXIS_ORDER] = (0, 1)
@@ -217,7 +225,7 @@ def make_qnn_per_channel_config(self, node: torch.fx.Node, quant_attrs: Dict):
         # skip dequantize op, e.g. frozen_param -> dq -> conv2d
         user_0 = self.get_first_user(node)
         # Memory layout of QNN conv weight always ends in Output. Like conv2d is HWIO
-        if "convolution" in user_0.target.__name__:
+        if user_0.target == exir_ops.edge.aten.convolution.default:
             quant_config[QCOM_AXIS] = node.meta["val"].dim() - 1
         else:
             quant_config[QCOM_AXIS] = quant_attrs[QCOM_AXIS]
diff --git a/backends/qualcomm/builders/op_conv.py b/backends/qualcomm/builders/op_conv.py
index 2bc0b41524d..317a3269ede 100644
--- a/backends/qualcomm/builders/op_conv.py
+++ b/backends/qualcomm/builders/op_conv.py
@@ -9,9 +9,9 @@
 import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper
 import numpy as np
 import torch
-from executorch.backends.qualcomm.utils.constants import QCOM_DATA
+from executorch.backends.qualcomm.utils.constants import QCOM_DATA, QCOM_QUANT_ATTRS
 
-from .node_visitor import NodeVisitor
+from .node_visitor import NodeVisitor, PER_CHANNEL_ENCODING
 from .node_visitor_manager import register_node_visitor
 from .qnn_constants import (
     OpConv2d,
@@ -101,6 +101,29 @@ def _add_conv_op_parameter(
 
         return conv_op
 
+    def _reduce_bias_scales(
+        self,
+        node: torch.fx.Node,
+        filter_node: torch.fx.Node,
+        bias_node: torch.fx.Node,
+        groups: int,
+    ):
+        """Reduce the bias node's per-channel quant params for grouped transpose conv.
+        When a transpose conv has groups, the bias node's per-channel quantization needs special handling.
+        See _derived_bias_quant_spec under backends/qualcomm/quantizer/qconfig.py for more info.
+        """
+
+        filter_scales = filter_node.meta[QCOM_QUANT_ATTRS]["scales"]
+        bias_scales = bias_node.meta[QCOM_QUANT_ATTRS]["scales"]
+        bias_zero_points = bias_node.meta[QCOM_QUANT_ATTRS]["zero_points"]
+
+        # Guard against reducing twice, since this path runs in both op_validation and qnn_preprocess.
+        if filter_scales.numel() != bias_scales.numel():
+            bias_scales = bias_scales.view(-1, groups)[:, 0]
+            bias_zero_points = bias_zero_points.view(-1, groups)[:, 0]
+            bias_node.meta[QCOM_QUANT_ATTRS]["scales"] = bias_scales
+            bias_node.meta[QCOM_QUANT_ATTRS]["zero_points"] = bias_zero_points
+
     def define_node(
         self,
         node: torch.fx.Node,
@@ -127,8 +150,15 @@ def define_node(
         filter_node = self.get_node(node.args[1])
         filter_tensor = get_parameter(filter_node, self.edge_program)
+
+        stride = cast(List[int], node.args[3])
+        padding = cast(List[int], node.args[4])
+        dilation = cast(List[int], node.args[5])
+        output_padding = cast(List[int], node.args[7])
+        groups = cast(int, node.args[8])
+
         # weight of pytorch OIHW(conv2d) / OIDHW(conv3d) or IOHW(conv_transpose2d) / IODHW(conv_transpose3d),
-        # yet QNN is HWIO or DHWIO
+        # yet QNN is HWIO or DHWIO for both conv and conv_transpose.
         is_transpose_conv = cast(bool, node.args[6])
         if is_conv2d:
             filter_axis_order = (2, 3, 0, 1) if is_transpose_conv else (2, 3, 1, 0)
@@ -147,6 +177,16 @@ def define_node(
         conv_input_tensors = [input_tensor_wrapper, filter_tensor_wrapper]
         if node.args[2] is not None:
             bias_node = self.get_node(node.args[2])
+            # TODO: Double-check the condition below once QNN supports transpose_conv with block_quant.
+            # Checking node.args[1].target lets only per_channel_quant go through and bypasses block_quant.
+            if (
+                is_transpose_conv
+                and groups != 1
+                and bias_node.meta.get(QCOM_QUANT_ATTRS) is not None
+                and node.args[1].target in PER_CHANNEL_ENCODING
+            ):
+                self._reduce_bias_scales(node, filter_node, bias_node, groups)
+
             bias_tensor = get_parameter(bias_node, self.edge_program)
             bias_tensor_wrapper = self.define_tensor(
                 bias_node,
@@ -156,7 +196,6 @@ def define_node(
                 nodes_to_wrappers,
             )
             conv_input_tensors.append(bias_tensor_wrapper)
-
         output_tensor = self.get_tensor(node, node)
         output_tensor_wrapper = self.define_tensor(
             node,
@@ -167,11 +206,6 @@ def define_node(
         )
         conv_output_tensors = [output_tensor_wrapper]
 
-        stride = cast(List[int], node.args[3])
-        padding = cast(List[int], node.args[4])
-        dilation = cast(List[int], node.args[5])
-        output_padding = cast(List[int], node.args[7])
-        groups = cast(int, node.args[8])
         # Qnn filter tensor is (H, W, Cin, Cout) or (D, H, W, Cin, Cout)
         group_input_channels = filter_tensor.shape[-2]
         group_output_channels = int(filter_tensor.shape[-1] / groups)
diff --git a/backends/qualcomm/builders/op_elu.py b/backends/qualcomm/builders/op_elu.py
index 65e8d93f414..215fe654948 100644
--- a/backends/qualcomm/builders/op_elu.py
+++ b/backends/qualcomm/builders/op_elu.py
@@ -58,12 +58,11 @@ def define_node(
         )
         elu_op.AddInputTensors(elu_input_tensors)
         elu_op.AddOutputTensors(elu_output_tensors)
-
-        if len(node.args) == 2:
+        if len(node.args) > 1:
             elu_op.AddScalarParam(
                 OpElu.param_alpha,
                 PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_FLOAT_32,
-                {QCOM_DATA: np.uint32(node.args[1])},
+                {QCOM_DATA: np.float32(node.args[1])},
             )
 
         return elu_op
diff --git a/backends/qualcomm/quantizer/qconfig.py b/backends/qualcomm/quantizer/qconfig.py
index 3c8191dc57d..e22d5b30fa7 100644
--- a/backends/qualcomm/quantizer/qconfig.py
+++ b/backends/qualcomm/quantizer/qconfig.py
@@ -52,6 +52,22 @@ def _derive_bias_qparams_fn(
             act_scale, weight_scale
         )
         derived_scale = (broadcast_act_scale * broadcast_weight_scale).to(torch.float32)
+        # TransposeConv uses per-channel axis=1, and weight_shape[1] = out_channel / groups.
+        # E.g., out_channel = 6, groups = 2, weight_shape[1] = 3, which means there are 3 pairs of scale/offset.
+        # However, bias still has 6 values, so derived_scale must be repeat_interleaved by groups (2 here) to
+        # generate 6 pairs of scale/offset for per-channel quantization. For the bias node, the Conv op builder
+        # will later pass only 3 pairs of scale/offset to QNN.
+        if (
+            node.target
+            in {
+                torch.ops.aten.conv_transpose2d.input,
+                torch.ops.aten.conv_transpose3d.input,
+            }
+            and len(node.args) > 6
+            and node.args[6] != 1
+        ):
+            groups = node.args[6]
+            derived_scale = derived_scale.repeat_interleave(groups)
         derived_zero = torch.zeros(derived_scale.size(), device=weight_zp.device).to(
             torch.int32
         )
@@ -68,7 +84,6 @@ def _derive_bias_qparams_fn(
     assert isinstance(input_act, Node)
     weight = node.args[1]
     assert isinstance(weight, Node)
-
     return DerivedQuantizationSpec(
         derived_from=[(input_act, node), (weight, node)],
         derive_qparams_fn=_derive_bias_qparams_fn,
@@ -300,6 +315,7 @@ def get_ptq_per_channel_quant_config(
     weight_dtype=torch.int8,
     act_observer=MovingAverageMinMaxObserver,
     act_symmetric: bool = False,
+    ch_axis: int = 0,
 ) -> QuantizationConfig:
     extra_args: Dict[str, Any] = {"eps": 2**-12}
 
@@ -349,7 +365,7 @@ def get_ptq_per_channel_quant_config(
         ),
         quant_max=7 if weight_dtype == torch.int4 else torch.iinfo(weight_dtype).max,
         qscheme=torch.per_channel_symmetric,
-        ch_axis=0,
+        ch_axis=ch_axis,
         observer_or_fake_quant_ctr=PerChannelMinMaxObserver.with_args(**extra_args),
     )
 
@@ -370,6 +386,7 @@ def get_ptq_per_block_quant_config(
     weight_dtype=torch.int8,
     act_observer=MovingAverageMinMaxObserver,
     act_symmetric: bool = False,
+    ch_axis: int = 0,
 ) -> QuantizationConfig:
     extra_args: Dict[str, Any] = {"eps": 2**-12}
     quantization_config = get_ptq_per_channel_quant_config(
@@ -385,7 +402,7 @@ def get_ptq_per_block_quant_config(
         ),
         quant_max=7 if weight_dtype == torch.int4 else torch.iinfo(weight_dtype).max,
         qscheme=torch.per_channel_symmetric,
-        ch_axis=0,
+        ch_axis=ch_axis,
         observer_or_fake_quant_ctr=PerBlockParamObserver.with_args(**extra_args),
     )
     return QuantizationConfig(
@@ -522,6 +539,7 @@ def get_qat_per_channel_quant_config(
     weight_dtype=torch.int8,
     act_observer=MovingAverageMinMaxObserver,
     act_symmetric=False,
+    ch_axis: int = 0,
 ) -> QuantizationConfig:
     supported_act_types = {
         torch.uint8,
@@ -577,7 +595,7 @@ def get_qat_per_channel_quant_config(
         ),
         quant_max=7 if weight_dtype == torch.int4 else torch.iinfo(weight_dtype).max,
         qscheme=torch.per_channel_symmetric,
-        ch_axis=0,
+        ch_axis=ch_axis,
         observer=MovingAveragePerChannelMinMaxObserver,
     )
     weight_quantization_spec = QuantizationSpec(
@@ -587,7 +605,7 @@ def get_qat_per_channel_quant_config(
         ),
         quant_max=7 if weight_dtype == torch.int4 else torch.iinfo(weight_dtype).max,
         qscheme=torch.per_channel_symmetric,
-        ch_axis=0,
+        ch_axis=ch_axis,
         observer_or_fake_quant_ctr=weight_fake_quant_ctr,
     )
diff --git a/backends/qualcomm/quantizer/quantizer.py b/backends/qualcomm/quantizer/quantizer.py
index 44d129d5544..4d0f1098a62 100644
--- a/backends/qualcomm/quantizer/quantizer.py
+++ b/backends/qualcomm/quantizer/quantizer.py
@@ -150,33 +150,62 @@ def __post_init__(self):
             if self.act_observer
             else quant_config_func()
         )
-        self.per_channel_quant_config = (
-            per_channel_quant_config_func(act_observer=self.act_observer)
-            if self.act_observer
-            else per_channel_quant_config_func()
-        )
-        self.use_per_channel_weight_quant_ops = set()
+
+        # Assume per_channel_quant/per_block_quant only happens on axis 0 or axis 1; increase this range if needed.
+        potential_axis = 2
+
+        self.per_channel_quant_config_list = []
+        for i in range(potential_axis):
+            self.per_channel_quant_config_list.append(
+                (
+                    per_channel_quant_config_func(
+                        act_observer=self.act_observer, ch_axis=i
+                    )
+                    if self.act_observer
+                    else per_channel_quant_config_func(ch_axis=i)
+                )
+            )
+
+        # Key is the node target, and the value is the axis along which to perform per-channel quantization.
+        self.op_axis_dict = {
+            torch.ops.aten.conv1d.default: 0,
+            torch.ops.aten.conv2d.default: 0,
+            torch.ops.aten.conv3d.default: 0,
+            torch.ops.aten.conv_transpose2d.input: 1,
+            torch.ops.aten.conv_transpose3d.input: 1,
+            torch.ops.aten.linear.default: 0,
+        }
+
+        self.use_per_channel_weight_quant_ops = {}
         if self.is_conv_per_channel:
+            conv_ops = [
+                torch.ops.aten.conv1d.default,
+                torch.ops.aten.conv2d.default,
+                torch.ops.aten.conv3d.default,
+                torch.ops.aten.conv_transpose2d.input,
+                torch.ops.aten.conv_transpose3d.input,
+            ]
             self.use_per_channel_weight_quant_ops.update(
-                {
-                    torch.ops.aten.conv1d.default,
-                    torch.ops.aten.conv2d.default,
-                    torch.ops.aten.conv3d.default,
-                    torch.ops.aten.conv_transpose2d.input,
-                }
+                {k: self.op_axis_dict[k] for k in conv_ops if k in self.op_axis_dict}
             )
 
         if self.is_linear_per_channel:
+            linear_ops = [torch.ops.aten.linear.default]
             self.use_per_channel_weight_quant_ops.update(
-                {
-                    torch.ops.aten.linear.default,
-                }
+                {k: self.op_axis_dict[k] for k in linear_ops if k in self.op_axis_dict}
            )
+
         if per_block_quant_config_func:
-            self.per_block_quant_config = (
-                per_block_quant_config_func(act_observer=self.act_observer)
-                if self.act_observer
-                else per_block_quant_config_func()
-            )
+            self.per_block_quant_config_list = []
+            for i in range(potential_axis):
+                self.per_block_quant_config_list.append(
+                    (
+                        per_block_quant_config_func(
+                            act_observer=self.act_observer, ch_axis=i
+                        )
+                        if self.act_observer
+                        else per_block_quant_config_func(ch_axis=i)
+                    )
+                )
 
 
 class QnnQuantizer(Quantizer):
@@ -269,16 +298,22 @@ def _get_quant_config(self, node: torch.fx.Node) -> Optional[QuantizationConfig]
         op = node.target
         if isinstance(op, str):
             return
-
+        config = self._get_submodule_qconfig(node)
         if block_size := self.block_size_map.get(node.name):
-            config = self.default_quant_config.per_block_quant_config
+            ch_axis = config.op_axis_dict.get(node.target, 0)
+            assert (
+                len(config.per_block_quant_config_list) > ch_axis
+            ), f"Unsupported per block quantization axis: {ch_axis}, please increase the range of per_block_quant_config_list"
+            config = config.per_block_quant_config_list[ch_axis]
             config.block_size = block_size
             return config
 
-        config = self._get_submodule_qconfig(node)
-
         if op in config.use_per_channel_weight_quant_ops:
-            return config.per_channel_quant_config
+            ch_axis = config.use_per_channel_weight_quant_ops[op]
+            assert (
+                len(config.per_channel_quant_config_list) > ch_axis
+            ), f"Unsupported per channel quantization axis: {ch_axis}, please increase the range of per_channel_quant_config_list"
+            return config.per_channel_quant_config_list[ch_axis]
 
         if op in self.quant_ops:
             return config.quant_config
diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py
index 5ea6caf54ad..58647441210 100644
--- a/backends/qualcomm/tests/models.py
+++ b/backends/qualcomm/tests/models.py
@@ -746,15 +746,26 @@ def forward(self, x):
 
 
 class ConvTranspose2dSingle(torch.nn.Module):
-    def __init__(self, bias=True, dilation=1):
+    def __init__(
+        self,
+        bias=True,
+        in_channels=1,
+        out_channels=3,
+        kernel_size=1,
+        stride=1,
+        padding=1,
+        dilation=1,
+        groups=1,
+    ):
         super().__init__()
         self.conv_transpose = torch.nn.ConvTranspose2d(
-            in_channels=1,
-            out_channels=3,
-            kernel_size=3,
-            stride=2,
-            padding=1,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
             dilation=dilation,
+            groups=groups,
             bias=bias,
         )
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
index 2641acc5a2d..b1882a7deca 100644
--- a/backends/qualcomm/tests/test_qnn_delegate.py
+++ b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -352,17 +352,98 @@ def test_qnn_backend_conv_transpose1d(self):
         self.lower_module_and_test_output(module, sample_input)
 
     def test_qnn_backend_conv_transpose2d(self):
-        modules = [
-            ConvTranspose2dSingle(),  # noqa: F405
-            ConvTranspose2dSingle(bias=False),  # noqa: F405
-            ConvTranspose2dSingle(dilation=2),  # noqa: F405
-            ConvTranspose2dSingle(dilation=(2, 3)),  # noqa: F405
-            ConvTranspose2dSingle(dilation=(2, 1)),  # noqa: F405
+        test_comb = [
+            {
+                QCOM_MODULE: [ConvTranspose2dSingle()],  # noqa: F405
+                QCOM_SAMPLE_INPUTS: [
+                    (torch.randn(1, 1, 16, 16),),
+                ],
+            },
+            {
+                QCOM_MODULE: [ConvTranspose2dSingle(bias=False)],  # noqa: F405
+                QCOM_SAMPLE_INPUTS: [
+                    (torch.randn(1, 1, 16, 16),),
+                ],
+            },
+            {
+                QCOM_MODULE: [
+                    ConvTranspose2dSingle(  # noqa: F405
+                        in_channels=2,
+                        out_channels=3,
+                        dilation=2,
+                        kernel_size=3,
+                        stride=2,
+                    )
+                ],
+                QCOM_SAMPLE_INPUTS: [
+                    (torch.randn(1, 2, 16, 16),),
+                ],
+            },
+            {
+                QCOM_MODULE: [
+                    ConvTranspose2dSingle(  # noqa: F405
+                        in_channels=2,
+                        out_channels=3,
+                        dilation=(2, 3),
+                        kernel_size=3,
+                        stride=2,
+                    )
+                ],
+                QCOM_SAMPLE_INPUTS: [
+                    (torch.randn(1, 2, 16, 16),),
+                ],
+            },
+            {
+                QCOM_MODULE: [
+                    ConvTranspose2dSingle(  # noqa: F405
+                        in_channels=2,
+                        out_channels=3,
+                        dilation=(2, 1),
+                        kernel_size=3,
+                        stride=2,
+                    )
+                ],
+                QCOM_SAMPLE_INPUTS: [
+                    (torch.randn(1, 2, 16, 16),),
+                ],
+            },
+            {
+                QCOM_MODULE: [
+                    ConvTranspose2dSingle(  # noqa: F405
+                        in_channels=2,
+                        out_channels=3,
+                        dilation=(2, 1),
+                        kernel_size=3,
+                        stride=2,
+                    )
+                ],
+                QCOM_SAMPLE_INPUTS: [
+                    (torch.randn(1, 2, 16, 16),),
+                ],
+            },
+            {
+                QCOM_MODULE: [
+                    ConvTranspose2dSingle(  # noqa: F405
+                        in_channels=6,
+                        out_channels=6,
+                        kernel_size=3,
+                        padding=0,
+                        groups=2,
+                    )
+                ],
+                QCOM_SAMPLE_INPUTS: [
+                    (torch.randn(4, 6, 16, 16),),
+                ],
+            },
         ]
-        sample_input = (torch.randn([1, 1, 33, 33]),)
-        for i, module in enumerate(modules):
-            with self.subTest(i=i):
-                self.lower_module_and_test_output(module, sample_input)
+
+        index = 0
+        for comb in test_comb:
+            for module in comb[QCOM_MODULE]:
+                for sample_input in comb[QCOM_SAMPLE_INPUTS]:
+                    with self.subTest(i=index):
+                        index += 1
+                        self.lower_module_and_test_output(module, sample_input)
 
     def test_qnn_backend_conv_transpose3d(self):
         modules = [
@@ -611,7 +692,6 @@ def test_qnn_backend_element_wise_sub(self):
                     index += 1
                     self.lower_module_and_test_output(module, sample_input)
 
-    @unittest.expectedFailure
     def test_qnn_backend_elu(self):
         module = Elu()  # noqa: F405
         sample_input = (torch.randn(2, 5, 1, 3),)
@@ -2248,16 +2328,128 @@ def test_qnn_backend_conv_transpose1d(self):
         self.lower_module_and_test_output(module, sample_input)
 
     def test_qnn_backend_conv_transpose2d(self):
+        test_comb = [
+            {
+                QCOM_MODULE: [ConvTranspose2dSingle()],  # noqa: F405
+                QCOM_SAMPLE_INPUTS: [
+                    (torch.randn(1, 1, 16, 16),),
+                ],
+            },
+            {
+                QCOM_MODULE: [ConvTranspose2dSingle(bias=False)],  # noqa: F405
+                QCOM_SAMPLE_INPUTS: [
+                    (torch.randn(1, 1, 16, 16),),
+                ],
+            },
+            {
+                QCOM_MODULE: [
+                    ConvTranspose2dSingle(  # noqa: F405
+                        in_channels=2,
+                        out_channels=3,
+                        dilation=2,
+                        kernel_size=3,
+                        stride=2,
+                    )
+                ],
+                QCOM_SAMPLE_INPUTS: [
+                    (torch.randn(1, 2, 16, 16),),
+                ],
+            },
+            {
+                QCOM_MODULE: [
+                    ConvTranspose2dSingle(  # noqa: F405
+                        in_channels=2,
+                        out_channels=3,
+                        dilation=(2, 3),
+                        kernel_size=3,
+                        stride=2,
+                    )
+                ],
+                QCOM_SAMPLE_INPUTS: [
+                    (torch.randn(1, 2, 16, 16),),
+                ],
+            },
+            {
+                QCOM_MODULE: [
+                    ConvTranspose2dSingle(  # noqa: F405
+                        in_channels=2,
+                        out_channels=3,
+                        dilation=(2, 1),
+                        kernel_size=3,
+                        stride=2,
+                    )
+                ],
+                QCOM_SAMPLE_INPUTS: [
+                    (torch.randn(1, 2, 16, 16),),
+                ],
+            },
+            {
+                QCOM_MODULE: [
+                    ConvTranspose2dSingle(  # noqa: F405
+                        in_channels=2,
+                        out_channels=3,
+                        dilation=(2, 1),
+                        kernel_size=3,
+                        stride=2,
+                    )
+                ],
+                QCOM_SAMPLE_INPUTS: [
+                    (torch.randn(1, 2, 16, 16),),
+                ],
+            },
+            {
+                QCOM_MODULE: [
+                    ConvTranspose2dSingle(  # noqa: F405
+                        in_channels=6,
+                        out_channels=6,
+                        kernel_size=3,
+                        padding=0,
+                        groups=2,
+                    )
+                ],
+                QCOM_SAMPLE_INPUTS: [
+                    (torch.randn(4, 6, 16, 16),),
+                ],
+            },
+        ]
+
+        index = 0
+        for comb in test_comb:
+            for module in comb[QCOM_MODULE]:
+                for sample_input in comb[QCOM_SAMPLE_INPUTS]:
+                    with self.subTest(i=index):
+                        index += 1
+                        gm = self.get_qdq_module(module, sample_input)
+                        self.lower_module_and_test_output(gm, sample_input)
+
+    @unittest.skip("As of QNN 2.37, transpose conv block quant is not supported")
+    def test_qnn_backend_conv_transpose2d_block(self):
+        i_ch, o_ch, kernel, padding = 128, 32, (1, 1), 0
         modules = [
-            ConvTranspose2dSingle(),  # noqa: F405
-            ConvTranspose2dSingle(bias=False),  # noqa: F405
-            ConvTranspose2dSingle(dilation=(2, 3)),  # noqa: F405
-            ConvTranspose2dSingle(dilation=(2, 1)),  # noqa: F405
+            ConvTranspose2dSingle(  # noqa: F405
+                bias=False,
+                in_channels=i_ch,
+                out_channels=o_ch,
+                kernel_size=kernel,
+                padding=padding,
+            ),
+            ConvTranspose2dSingle(  # noqa: F405
+                in_channels=i_ch,
+                out_channels=o_ch,
+                kernel_size=kernel,
+                padding=padding,
+            ),
         ]
-        sample_input = (torch.randn([1, 1, 3, 3]),)
+
+        sample_input = (torch.randn(1, 128, 16, 16),)
         for i, module in enumerate(modules):
             with self.subTest(i=i):
-                module = self.get_qdq_module(module, sample_input)
+                module = self.get_qdq_module(
+                    module,
+                    sample_input,
+                    quant_dtype=QuantDtype.use_16a4w_block,
+                    block_size_map={"conv_transpose2d": (16, 1, 1, 1)},
+                )
                 self.lower_module_and_test_output(module, sample_input)
 
     def test_qnn_backend_conv_transpose3d(self):
diff --git a/backends/test/suite/operators/test_div.py b/backends/test/suite/operators/test_div.py
index 656d350585d..d493c97a20d 100644
--- a/backends/test/suite/operators/test_div.py
+++ b/backends/test/suite/operators/test_div.py
@@ -46,6 +46,7 @@ def test_divide_dtype(self, flow: TestFlow, dtype) -> None:
                 ),  # Adding 0.1 to avoid division by zero
             ),
             flow,
+            generate_random_test_inputs=False,
         )
 
     def test_divide_f32_bcast_first(self, flow: TestFlow) -> None:
@@ -57,6 +58,7 @@ def test_divide_f32_bcast_first(self, flow: TestFlow) -> None:
                 + 0.1,  # Using abs and adding 0.1 to avoid division by zero
             ),
             flow,
+            generate_random_test_inputs=False,
         )
 
     def test_divide_f32_bcast_second(self, flow: TestFlow) -> None:
@@ -68,6 +70,7 @@ def test_divide_f32_bcast_second(self, flow: TestFlow) -> None:
                 + 0.1,  # Using abs and adding 0.1 to avoid division by zero
             ),
             flow,
+            generate_random_test_inputs=False,
         )
 
     def test_divide_f32_bcast_unary(self, flow: TestFlow) -> None:
@@ -79,6 +82,7 @@ def test_divide_f32_bcast_unary(self, flow: TestFlow) -> None:
                 + 0.1,  # Using abs and adding 0.1 to avoid division by zero
             ),
             flow,
+            generate_random_test_inputs=False,
         )
 
     def test_divide_f32_trunc(self, flow: TestFlow) -> None:
@@ -90,6 +94,7 @@ def test_divide_f32_trunc(self, flow: TestFlow) -> None:
                 + 0.1,  # Using abs and adding 0.1 to avoid division by zero
             ),
             flow,
+            generate_random_test_inputs=False,
         )
 
     def test_divide_f32_floor(self, flow: TestFlow) -> None:
@@ -101,4 +106,5 @@ def test_divide_f32_floor(self, flow: TestFlow) -> None:
                 + 0.1,  # Using abs and adding 0.1 to avoid division by zero
             ),
             flow,
+            generate_random_test_inputs=False,
         )
diff --git a/backends/test/suite/operators/test_log.py b/backends/test/suite/operators/test_log.py
index c4af1fe442b..320f4fe463b 100644
--- a/backends/test/suite/operators/test_log.py
+++ b/backends/test/suite/operators/test_log.py
@@ -34,19 +34,39 @@ def test_log_dtype(self, flow: TestFlow, dtype) -> None:
         # Test with different dtypes
         model = LogModel().to(dtype)
         # Use positive values only for log
-        self._test_op(model, (torch.rand(10, 10).to(dtype) + 0.01,), flow)
+        self._test_op(
+            model,
+            (torch.rand(10, 10).to(dtype) + 0.01,),
+            flow,
+            generate_random_test_inputs=False,
+        )
 
     def test_log_shapes(self, flow: TestFlow) -> None:
         # Test with different tensor shapes
 
         # 1D tensor
-        self._test_op(LogModel(), (torch.rand(20) + 0.01,), flow)
+        self._test_op(
+            LogModel(),
+            (torch.rand(20) + 0.01,),
+            flow,
+            generate_random_test_inputs=False,
+        )
 
         # 2D tensor
-        self._test_op(LogModel(), (torch.rand(5, 10) + 0.01,), flow)
+        self._test_op(
+            LogModel(),
+            (torch.rand(5, 10) + 0.01,),
+            flow,
+            generate_random_test_inputs=False,
+        )
 
         # 3D tensor
-        self._test_op(LogModel(), (torch.rand(3, 4, 5) + 0.01,), flow)
+        self._test_op(
+            LogModel(),
+            (torch.rand(3, 4, 5) + 0.01,),
+            flow,
+            generate_random_test_inputs=False,
+        )
 
     @unittest.skip("NaN and Inf are not enforced for backends.")
     def test_log_edge_cases(self, flow: TestFlow) -> None:
diff --git a/backends/test/suite/operators/test_rsqrt.py b/backends/test/suite/operators/test_rsqrt.py
index bb51b213dd4..0b7c9739cf7 100644
--- a/backends/test/suite/operators/test_rsqrt.py
+++ b/backends/test/suite/operators/test_rsqrt.py
@@ -33,7 +33,12 @@ def test_rsqrt_dtype(self, flow: TestFlow, dtype) -> None:
         # Test with different dtypes
         model = RsqrtModel().to(dtype)
         # Use positive values only for rsqrt to avoid division by zero
-        self._test_op(model, (torch.rand(10, 10).to(dtype) + 0.01,), flow)
+        self._test_op(
+            model,
+            (torch.rand(10, 10).to(dtype) + 0.01,),
+            flow,
+            generate_random_test_inputs=False,
+        )
 
     def test_rsqrt_shapes(self, flow: TestFlow) -> None:
         # Test with different tensor shapes
diff --git a/backends/test/suite/operators/test_sqrt.py b/backends/test/suite/operators/test_sqrt.py
index 92fbc64878e..4a3f931204d 100644
--- a/backends/test/suite/operators/test_sqrt.py
+++ b/backends/test/suite/operators/test_sqrt.py
@@ -33,7 +33,12 @@ def test_sqrt_dtype(self, flow: TestFlow, dtype) -> None:
         # Test with different dtypes
         model = SqrtModel().to(dtype)
         # Use non-negative values only for sqrt
-        self._test_op(model, (torch.rand(10, 10).to(dtype),), flow)
+        self._test_op(
+            model,
+            (torch.rand(10, 10).to(dtype),),
+            flow,
+            generate_random_test_inputs=False,
+        )
 
     def test_sqrt_shapes(self, flow: TestFlow) -> None:
         # Test with different tensor shapes