[Backend Tester] Add test flows for QNN quantization

GregoryComer · GregoryComer · commit 7ba1ff0a8025 · 2025-08-20T14:22:16.000-07:00
ghstack-source-id: c5bb389 ghstack-comment-id: 3195495418 Pull-Request: #13469
diff --git a/.ci/scripts/test_backend_linux.sh b/.ci/scripts/test_backend_linux.sh
@@ -18,11 +18,32 @@ eval "$(conda shell.bash hook)"
 CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
 conda activate "${CONDA_ENV}"
 
-# Setup swiftshader and Vulkan SDK which are required to build the Vulkan delegate
-source .ci/scripts/setup-vulkan-linux-deps.sh
+export PYTHON_EXECUTABLE=python
 
 # CMake options to use, in addition to the defaults.
-EXTRA_BUILD_ARGS="-DEXECUTORCH_BUILD_VULKAN=ON"
+EXTRA_BUILD_ARGS=""
+
+if [[ "$FLOW" == *qnn* ]]; then
+    # Setup QNN sdk and deps - note that this is a bit hacky due to the nature of the
+    # Qualcomm build. TODO (gjcomer) Clean this up once the QNN pybinding integration is
+    # cleaned up.
+    PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake
+    PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
+    PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
+    QNN_X86_LIB_DIR="$(realpath build-x86/lib/)"
+    QNN_SDK_ROOT="/tmp/qnn/2.28.0.241029"
+    export LD_LIBRARY_PATH="$QNN_X86_LIB_DIR:$QNN_SDK_ROOT/lib/x86_64-linux-clang/:${LD_LIBRARY_PATH:-}"
+
+    # TODO Get SDK root from install scripts
+    EXTRA_BUILD_ARGS+=" -DEXECUTORCH_BUILD_QNN=ON -DQNN_SDK_ROOT=$QNN_SDK_ROOT"
+fi
+
+if [[ "$FLOW" == *vulkan* ]]; then
+    # Setup swiftshader and Vulkan SDK which are required to build the Vulkan delegate
+    source .ci/scripts/setup-vulkan-linux-deps.sh
+
+    EXTRA_BUILD_ARGS+=" -DEXECUTORCH_BUILD_VULKAN=ON"
+fi
 
 # We need the runner to test the built library.
 PYTHON_EXECUTABLE=python CMAKE_ARGS="$EXTRA_BUILD_ARGS" .ci/scripts/setup-linux.sh --build-tool cmake --build-mode Release --editable true
diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
@@ -42,7 +42,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        flow: [vulkan, xnnpack, xnnpack_static_int8_per_channel]
+        flow: [qnn, qnn_16a16w, qnn_16a8w, qnn_16a4w, qnn_16a4w_block, qnn_8a8w, vulkan, xnnpack, xnnpack_static_int8_per_channel]
         suite: [models, operators]
     with:
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
diff --git a/backends/qualcomm/scripts/install_qnn_sdk.sh b/backends/qualcomm/scripts/install_qnn_sdk.sh
@@ -9,7 +9,7 @@ source "${SCRIPT_DIR}/qnn_config.sh"
 # Function to install Android NDK (only if not already set)
 setup_android_ndk() {
     # Check if ANDROID_NDK_ROOT is already set and valid
-    if [ -n "${ANDROID_NDK_ROOT}" ] && [ -d "${ANDROID_NDK_ROOT}" ]; then
+    if [ -n "${ANDROID_NDK_ROOT:-}" ] && [ -d "${ANDROID_NDK_ROOT:-}" ]; then
         echo "Android NDK already set to ${ANDROID_NDK_ROOT} - skipping installation"
         return
     fi
@@ -41,7 +41,7 @@ verify_pkg_installed() {
 
 install_qnn() {
   # Check if QNN_SDK_ROOT is already set and valid
-  if [ -n "${QNN_SDK_ROOT}" ] && [ -d "${QNN_SDK_ROOT}" ]; then
+  if [ -n "${QNN_SDK_ROOT:-}" ] && [ -d "${QNN_SDK_ROOT:-}" ]; then
     echo "QNN SDK already set to ${QNN_SDK_ROOT} - skipping installation"
     return
   fi
@@ -141,9 +141,9 @@ setup_libcpp() {
   popd >/dev/null
 
   # Set environment variables
-  export CPLUS_INCLUDE_PATH="${INSTALL_DIR}/include:$CPLUS_INCLUDE_PATH"
-  export LD_LIBRARY_PATH="${INSTALL_DIR}/lib:$LD_LIBRARY_PATH"
-  export LIBRARY_PATH="${INSTALL_DIR}/lib:$LIBRARY_PATH"
+  export CPLUS_INCLUDE_PATH="${INSTALL_DIR}/include:${CPLUS_INCLUDE_PATH:-}"
+  export LD_LIBRARY_PATH="${INSTALL_DIR}/lib:${LD_LIBRARY_PATH:-}"
+  export LIBRARY_PATH="${INSTALL_DIR}/lib:${LIBRARY_PATH:-}"
 
   echo "libc++ installed to ${INSTALL_DIR}"
 }
diff --git a/backends/qualcomm/tests/tester.py b/backends/qualcomm/tests/tester.py
@@ -4,14 +4,15 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from typing import Any, List, Optional, Tuple
+from typing import Any, List, Optional, Sequence, Tuple
 
 import executorch
 import executorch.backends.test.harness.stages as BaseStages
 
 import torch
 from executorch.backends.qualcomm._passes.qnn_pass_manager import QnnPassManager
 from executorch.backends.qualcomm.partition.qnn_partitioner import QnnPartitioner
+from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer
 from executorch.backends.qualcomm.utils.utils import (
     generate_htp_compiler_spec,
     generate_qnn_executorch_compiler_spec,
@@ -21,9 +22,32 @@
 from executorch.backends.test.harness.stages import StageType
 from executorch.exir import EdgeCompileConfig, to_edge_transform_and_lower
 from executorch.exir.backend.partitioner import Partitioner
+from torch.ao.quantization.quantize_pt2e import (
+    convert_pt2e,
+    prepare_pt2e,
+    prepare_qat_pt2e,
+)
 from torch.export import ExportedProgram
 
 
+class Quantize(BaseStages.Quantize):  # Quantize stage whose quantizer is a QnnQuantizer.
+    def __init__(
+        self,
+        quantizer: QnnQuantizer,
+        quantization_config: Optional[Any] = None,  # Accepted but not forwarded below.
+        calibrate: bool = True,
+        calibration_samples: Optional[Sequence[Any]] = None,
+        is_qat: Optional[bool] = False,
+    ):
+        super().__init__(
+            quantizer=quantizer,
+            calibrate=calibrate,
+            calibration_samples=calibration_samples,
+            is_qat=is_qat,
+            set_global=False,  # NOTE(review): presumably keeps the caller's quantizer config as-is - confirm in BaseStages.Quantize.
+        )
+
+
 class Partition(BaseStages.Partition):
     def __init__(self, partitioner: Optional[Partitioner] = None):
         super().__init__(
@@ -37,8 +61,9 @@ def __init__(
         partitioners: Optional[List[Partitioner]] = None,
         edge_compile_config: Optional[EdgeCompileConfig] = None,
         soc_model: str = "SM8650",
+        use_fp16: bool = True,
     ):
-        backend_options = generate_htp_compiler_spec(use_fp16=True)
+        backend_options = generate_htp_compiler_spec(use_fp16=use_fp16)
         self.chipset = get_soc_to_chipset_map()[soc_model]
         self.compiler_specs = generate_qnn_executorch_compiler_spec(
             soc_model=self.chipset,
@@ -73,15 +98,17 @@ def __init__(
         module: torch.nn.Module,
         example_inputs: Tuple[torch.Tensor],
         dynamic_shapes: Optional[Tuple[Any]] = None,
+        use_fp16: bool = True,
     ):
+        def create_to_edge_transform_and_lower(*args, **kwargs):
+            kwargs["use_fp16"] = use_fp16
+            return ToEdgeTransformAndLower(*args, **kwargs)
+
         # Specialize for Qualcomm
-        stage_classes = (
-            executorch.backends.test.harness.Tester.default_stage_classes()
-            | {
-                StageType.PARTITION: Partition,
-                StageType.TO_EDGE_TRANSFORM_AND_LOWER: ToEdgeTransformAndLower,
-            }
-        )
+        stage_classes = executorch.backends.test.harness.Tester.default_stage_classes() | {
+            StageType.PARTITION: Partition,
+            StageType.TO_EDGE_TRANSFORM_AND_LOWER: create_to_edge_transform_and_lower,
+        }
 
         super().__init__(
             module=module,
diff --git a/backends/test/suite/flow.py b/backends/test/suite/flow.py
@@ -81,12 +81,24 @@ def all_flows() -> dict[str, TestFlow]:
         logger.info(f"Skipping Vulkan flow registration: {e}")
 
     try:
-        from executorch.backends.test.suite.flows.qualcomm import QUALCOMM_TEST_FLOW
+        from executorch.backends.test.suite.flows.qualcomm import (
+            QNN_16A16W_TEST_FLOW,
+            QNN_16A4W_BLOCK_TEST_FLOW,
+            QNN_16A4W_TEST_FLOW,
+            QNN_16A8W_TEST_FLOW,
+            QNN_8A8W_TEST_FLOW,
+            QNN_TEST_FLOW,
+        )
 
         flows += [
-            QUALCOMM_TEST_FLOW,
+            QNN_TEST_FLOW,
+            QNN_16A16W_TEST_FLOW,
+            QNN_16A8W_TEST_FLOW,
+            QNN_16A4W_TEST_FLOW,
+            QNN_16A4W_BLOCK_TEST_FLOW,
+            QNN_8A8W_TEST_FLOW,
         ]
     except Exception as e:
-        logger.info(f"Skipping Qualcomm flow registration: {e}")
+        logger.info(f"Skipping QNN flow registration: {e}")
 
     return {f.name: f for f in flows if f is not None}
diff --git a/backends/test/suite/flows/qualcomm.py b/backends/test/suite/flows/qualcomm.py
@@ -1,17 +1,61 @@
-from executorch.backends.qualcomm.tests.tester import QualcommTester
+from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer, QuantDtype
+from executorch.backends.qualcomm.tests.tester import QualcommTester, Quantize
 from executorch.backends.test.suite.flow import TestFlow
+from torchao.quantization.pt2e import MovingAverageMinMaxObserver
 
 
-def _create_qualcomm_flow(
+def _create_qnn_flow(
     name: str,
     quantize: bool = False,
+    quant_dtype: QuantDtype | None = None,
+    per_channel_conv=True,
+    per_channel_linear=False,
+    is_qat=False,
+    use_fp16=True,
 ) -> TestFlow:
+    if quantize and quant_dtype is None:
+        raise RuntimeError("Quant dtype must be provided when quantize is true.")
+
+    def create_tester(*args, **kwargs) -> QualcommTester:
+        kwargs["use_fp16"] = use_fp16  # Pass the bool itself; a 1-tuple is always truthy.
+        return QualcommTester(*args, **kwargs)
+
+    def create_quantize_stage() -> Quantize:
+        quantizer = QnnQuantizer()
+        quantizer.set_default_quant_config(
+            quant_dtype,
+            is_qat=is_qat,
+            is_conv_per_channel=per_channel_conv,
+            is_linear_per_channel=per_channel_linear,
+            act_observer=MovingAverageMinMaxObserver,
+        )
+        return Quantize(quantizer=quantizer)
+
     return TestFlow(
         name,
         backend="qualcomm",
-        tester_factory=QualcommTester,
+        tester_factory=create_tester,
         quantize=quantize,
+        quantize_stage_factory=create_quantize_stage if quantize else None,
     )
 
 
-QUALCOMM_TEST_FLOW = _create_qualcomm_flow("qualcomm")
+QNN_TEST_FLOW = _create_qnn_flow("qnn")
+QNN_16A16W_TEST_FLOW = _create_qnn_flow(
+    "qnn_16a16w", quantize=True, quant_dtype=QuantDtype.use_16a16w, use_fp16=False
+)
+QNN_16A8W_TEST_FLOW = _create_qnn_flow(
+    "qnn_16a8w", quantize=True, quant_dtype=QuantDtype.use_16a8w, use_fp16=False
+)
+QNN_16A4W_TEST_FLOW = _create_qnn_flow(
+    "qnn_16a4w", quantize=True, quant_dtype=QuantDtype.use_16a4w, use_fp16=False
+)
+QNN_16A4W_BLOCK_TEST_FLOW = _create_qnn_flow(
+    "qnn_16a4w_block",
+    quantize=True,
+    quant_dtype=QuantDtype.use_16a4w_block,  # Block-wise 16a4w, matching the flow name.
+    use_fp16=False,
+)
+QNN_8A8W_TEST_FLOW = _create_qnn_flow(
+    "qnn_8a8w", quantize=True, quant_dtype=QuantDtype.use_8a8w, use_fp16=False
+)