
Commit d452e60

[Backend Tester] Add test flows for QNN quantization
ghstack-source-id: 0b94375
ghstack-comment-id: 3195495418
Pull-Request: #13469
1 parent 787f5ed commit d452e60

File tree

4 files changed (+96, -13 lines)


.github/workflows/nightly.yml

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        flow: [vulkan, xnnpack, xnnpack_static_int8_per_channel]
+        flow: [qualcomm, qualcomm_16a16w, qualcomm_16a8w, qualcomm_16a4w, qualcomm_16a4w_block, qualcomm_8a8w, vulkan, xnnpack, xnnpack_static_int8_per_channel]
       suite: [models, operators]
     with:
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
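The new matrix entries name Qualcomm test flows registered by the suite. As a quick sanity check, a minimal sketch, assuming the dict returned by all_flows() is keyed by flow name (the check itself is illustrative and not part of this commit):

from executorch.backends.test.suite.flow import all_flows

# Each nightly matrix entry (e.g. "qualcomm_16a4w_block") is expected to match the
# name of a registered TestFlow. Registration only happens when the Qualcomm backend
# imports successfully; otherwise the flow is skipped with a log message.
flows = all_flows()
assert "qualcomm_16a4w_block" in flows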

backends/qualcomm/tests/tester.py

Lines changed: 36 additions & 9 deletions
@@ -4,14 +4,15 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from typing import Any, List, Optional, Tuple
+from typing import Any, List, Optional, Sequence, Tuple
 
 import executorch
 import executorch.backends.test.harness.stages as BaseStages
 
 import torch
 from executorch.backends.qualcomm._passes.qnn_pass_manager import QnnPassManager
 from executorch.backends.qualcomm.partition.qnn_partitioner import QnnPartitioner
+from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer
 from executorch.backends.qualcomm.utils.utils import (
     generate_htp_compiler_spec,
     generate_qnn_executorch_compiler_spec,
@@ -21,9 +22,32 @@
 from executorch.backends.test.harness.stages import StageType
 from executorch.exir import EdgeCompileConfig, to_edge_transform_and_lower
 from executorch.exir.backend.partitioner import Partitioner
+from torch.ao.quantization.quantize_pt2e import (
+    convert_pt2e,
+    prepare_pt2e,
+    prepare_qat_pt2e,
+)
 from torch.export import ExportedProgram
 
 
+class Quantize(BaseStages.Quantize):
+    def __init__(
+        self,
+        quantizer: QnnQuantizer,
+        quantization_config: Optional[Any] = None,
+        calibrate: bool = True,
+        calibration_samples: Optional[Sequence[Any]] = None,
+        is_qat: Optional[bool] = False,
+    ):
+        super().__init__(
+            quantizer=quantizer,
+            calibrate=calibrate,
+            calibration_samples=calibration_samples,
+            is_qat=is_qat,
+            set_global=False,
+        )
+
+
 class Partition(BaseStages.Partition):
     def __init__(self, partitioner: Optional[Partitioner] = None):
         super().__init__(
@@ -37,8 +61,9 @@ def __init__(
         partitioners: Optional[List[Partitioner]] = None,
         edge_compile_config: Optional[EdgeCompileConfig] = None,
         soc_model: str = "SM8650",
+        use_fp16: bool = True,
     ):
-        backend_options = generate_htp_compiler_spec(use_fp16=True)
+        backend_options = generate_htp_compiler_spec(use_fp16=use_fp16)
         self.chipset = get_soc_to_chipset_map()[soc_model]
         self.compiler_specs = generate_qnn_executorch_compiler_spec(
             soc_model=self.chipset,
@@ -73,15 +98,17 @@ def __init__(
         module: torch.nn.Module,
         example_inputs: Tuple[torch.Tensor],
         dynamic_shapes: Optional[Tuple[Any]] = None,
+        use_fp16: bool = True,
     ):
+        def create_to_edge_transform_and_lower(*args, **kwargs):
+            kwargs["use_fp16"] = use_fp16
+            return ToEdgeTransformAndLower(*args, **kwargs)
+
         # Specialize for Qualcomm
-        stage_classes = (
-            executorch.backends.test.harness.Tester.default_stage_classes()
-            | {
-                StageType.PARTITION: Partition,
-                StageType.TO_EDGE_TRANSFORM_AND_LOWER: ToEdgeTransformAndLower,
-            }
-        )
+        stage_classes = executorch.backends.test.harness.Tester.default_stage_classes() | {
+            StageType.PARTITION: Partition,
+            StageType.TO_EDGE_TRANSFORM_AND_LOWER: create_to_edge_transform_and_lower,
+        }
 
         super().__init__(
             module=module,
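For context, a minimal sketch of how the new use_fp16 flag and Quantize stage could be exercised directly. The SmallModel module is an illustrative assumption, and the quantizer configuration mirrors create_quantize_stage() in backends/test/suite/flows/qualcomm.py below; this is not code from the commit itself.

import torch

from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer, QuantDtype
from executorch.backends.qualcomm.tests.tester import QualcommTester, Quantize
from torchao.quantization.pt2e import MovingAverageMinMaxObserver


# Illustrative toy module (not part of this commit).
class SmallModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(8, 4)

    def forward(self, x):
        return torch.relu(self.linear(x))


# Quantized flows run the HTP backend without fp16, via the new use_fp16 flag.
tester = QualcommTester(SmallModel(), (torch.randn(1, 8),), use_fp16=False)

# Configure a QnnQuantizer and wrap it in the new Quantize stage, which forwards
# to the harness base stage with set_global=False.
quantizer = QnnQuantizer()
quantizer.set_default_quant_config(
    QuantDtype.use_8a8w,
    is_qat=False,
    is_conv_per_channel=True,
    is_linear_per_channel=False,
    act_observer=MovingAverageMinMaxObserver,
)
quantize_stage = Quantize(quantizer=quantizer)

In the test suite this wiring is done by the flow definitions in the files below rather than by hand.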

backends/test/suite/flow.py

Lines changed: 13 additions & 1 deletion
@@ -81,10 +81,22 @@ def all_flows() -> dict[str, TestFlow]:
         logger.info(f"Skipping Vulkan flow registration: {e}")
 
     try:
-        from executorch.backends.test.suite.flows.qualcomm import QUALCOMM_TEST_FLOW
+        from executorch.backends.test.suite.flows.qualcomm import (
+            QUALCOMM_16A16W_TEST_FLOW,
+            QUALCOMM_16A4W_BLOCK_TEST_FLOW,
+            QUALCOMM_16A4W_TEST_FLOW,
+            QUALCOMM_16A8W_TEST_FLOW,
+            QUALCOMM_8A8W_TEST_FLOW,
+            QUALCOMM_TEST_FLOW,
+        )
 
         flows += [
             QUALCOMM_TEST_FLOW,
+            QUALCOMM_16A16W_TEST_FLOW,
+            QUALCOMM_16A8W_TEST_FLOW,
+            QUALCOMM_16A4W_TEST_FLOW,
+            QUALCOMM_16A4W_BLOCK_TEST_FLOW,
+            QUALCOMM_8A8W_TEST_FLOW,
         ]
     except Exception as e:
         logger.info(f"Skipping Qualcomm flow registration: {e}")

backends/test/suite/flows/qualcomm.py

Lines changed: 46 additions & 2 deletions
@@ -1,17 +1,61 @@
-from executorch.backends.qualcomm.tests.tester import QualcommTester
+from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer, QuantDtype
+from executorch.backends.qualcomm.tests.tester import QualcommTester, Quantize
 from executorch.backends.test.suite.flow import TestFlow
+from torchao.quantization.pt2e import MovingAverageMinMaxObserver
 
 
 def _create_qualcomm_flow(
     name: str,
     quantize: bool = False,
+    quant_dtype: QuantDtype | None = None,
+    per_channel_conv=True,
+    per_channel_linear=False,
+    is_qat=False,
+    use_fp16=True,
 ) -> TestFlow:
+    if quantize and quant_dtype is None:
+        raise RuntimeError("Quant dtype must be provided when quantize is true.")
+
+    def create_tester(*args, **kwargs) -> QualcommTester:
+        kwargs["use_fp16"] = use_fp16
+        return QualcommTester(*args, **kwargs)
+
+    def create_quantize_stage() -> Quantize:
+        quantizer = QnnQuantizer()
+        quantizer.set_default_quant_config(
+            quant_dtype,
+            is_qat=is_qat,
+            is_conv_per_channel=per_channel_conv,
+            is_linear_per_channel=per_channel_linear,
+            act_observer=MovingAverageMinMaxObserver,
+        )
+        return Quantize(quantizer=quantizer)
+
     return TestFlow(
         name,
         backend="qualcomm",
-        tester_factory=QualcommTester,
+        tester_factory=create_tester,
         quantize=quantize,
+        quantize_stage_factory=create_quantize_stage if quantize else None,
     )
 
 
 QUALCOMM_TEST_FLOW = _create_qualcomm_flow("qualcomm")
+QUALCOMM_16A16W_TEST_FLOW = _create_qualcomm_flow(
+    "qualcomm_16a16w", quantize=True, quant_dtype=QuantDtype.use_16a16w, use_fp16=False
+)
+QUALCOMM_16A8W_TEST_FLOW = _create_qualcomm_flow(
+    "qualcomm_16a8w", quantize=True, quant_dtype=QuantDtype.use_16a8w, use_fp16=False
+)
+QUALCOMM_16A4W_TEST_FLOW = _create_qualcomm_flow(
+    "qualcomm_16a4w", quantize=True, quant_dtype=QuantDtype.use_16a4w, use_fp16=False
+)
+QUALCOMM_16A4W_BLOCK_TEST_FLOW = _create_qualcomm_flow(
+    "qualcomm_16a4w_block",
+    quantize=True,
+    quant_dtype=QuantDtype.use_16a4w_block,
+    use_fp16=False,
+)
+QUALCOMM_8A8W_TEST_FLOW = _create_qualcomm_flow(
+    "qualcomm_8a8w", quantize=True, quant_dtype=QuantDtype.use_8a8w, use_fp16=False
+)
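None of the flows above toggle the helper's per-channel or QAT knobs. As a hedged sketch of what an additional variant could look like with the same helper (the qualcomm_16a8w_per_channel_linear name and the flow itself are hypothetical, not part of this commit):

# Hypothetical extra flow, shown only to illustrate _create_qualcomm_flow's knobs;
# it is not defined or registered anywhere in this commit.
QUALCOMM_16A8W_PER_CHANNEL_LINEAR_TEST_FLOW = _create_qualcomm_flow(
    "qualcomm_16a8w_per_channel_linear",
    quantize=True,
    quant_dtype=QuantDtype.use_16a8w,
    per_channel_linear=True,
    use_fp16=False,
)

A flow declared this way would also need to be imported in backends/test/suite/flow.py and added to the nightly matrix to run in CI.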
