
Commit c2e3ba5

Merge branch 'main' into dev1/winskuo/custom_annotation_fix
2 parents: baa8c8f + be221c6

File tree

  .github/workflows/trunk.yml
  backends/qualcomm/quantizer/custom_annotation.py
  backends/qualcomm/quantizer/qconfig.py
  backends/qualcomm/quantizer/quantizer.py

4 files changed: 174 additions, 40 deletions

.github/workflows/trunk.yml

Lines changed: 29 additions & 9 deletions
@@ -60,7 +60,7 @@ jobs:
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
     strategy:
       matrix:
-        model: [add]
+        model: [add, softmax, mv2]
       fail-fast: false
     with:
       runner: linux.2xlarge
@@ -72,6 +72,16 @@ jobs:
       MODEL_NAME=${{ matrix.model }}
       CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
       conda activate "${CONDA_ENV}"
+      if [[ ${{ matrix.model}} == "add" ]]; then
+        SIM_LIMIT_SEC=60
+      elif [[ ${{ matrix.model}} == "softmax" ]]; then
+        SIM_LIMIT_SEC=60
+      elif [[ ${{ matrix.model}} == "mv2" ]]; then
+        SIM_LIMIT_SEC=5000
+      else
+        echo "Failed unsupported model selection ${{ matrix.model }}"
+        exit 1
+      fi

       source .ci/scripts/utils.sh
       source .ci/scripts/zephyr-utils.sh
@@ -80,20 +90,23 @@ jobs:
       export ZEPHYR_PROJ_ROOT=$(realpath $(pwd))
       export ARM_FVP_TUTORIALS_ROOT=$ZEPHYR_PROJ_ROOT/zephyr/samples/modules/executorch/arm-fvp-tutorials

+      # TODO @Bujji: Should see if this can be moved into the docker image itself
       download_arm_zephyr_sdk
       ./zephyr-sdk-0.16.0/setup.sh -c -t arm-zephyr-eabi
-
       cd $ZEPHYR_PROJ_ROOT
       setup_zephyr_et_module

+      # Run setup scripts for Arm FVP and Arm AOT Compilation
       cd $ZEPHYR_PROJ_ROOT/modules/lib/executorch
       install_executorch "--use-pt-pinned-commit"
       .ci/scripts/setup-arm-baremetal-tools.sh --target-toolchain zephyr
       source examples/arm/ethos-u-scratch/setup_path.sh
       source $ZEPHYR_PROJ_ROOT/zephyr/zephyr-env.sh

       # Get the model as PTE
-      python -m examples.arm.aot_arm_compiler --model_name="${MODEL_NAME}" --output="${MODEL_NAME}.pte"
+      python -m examples.arm.aot_arm_compiler \
+        --model_name="${MODEL_NAME}" \
+        --output="${MODEL_NAME}.pte"

       # Generate the C-style header
       cd $ARM_FVP_TUTORIALS_ROOT
@@ -105,7 +118,8 @@ jobs:
       cd $ARM_FVP_TUTORIALS_ROOT/models/${MODEL_NAME}/

       # Build the zephyr elf
-      west build -p always -b mps3/corstone300/fvp
+      west build -p always -b mps3/corstone300/fvp -- \
+        -DET_PTE_FILE_PATH_FOR_SELECTIVE_BUILD=$ZEPHYR_PROJ_ROOT/modules/lib/executorch/${MODEL_NAME}.pte

       # Run the simulation
       FVP_Corstone_SSE-300_Ethos-U55 -a build/zephyr/zephyr.elf \
@@ -114,23 +128,29 @@ jobs:
         -C mps3_board.uart0.out_file='sim.out' \
         -C cpu0.CFGITCMSZ=15 \
         -C cpu0.CFGDTCMSZ=15 \
-        --simlimit 120
+        --simlimit ${SIM_LIMIT_SEC}

+      # Disable exit on error
+      set +e
       # Report failure if any of the ouptut verification checks fail
       grep -qF "ERROR" sim.out
       exit_status=$? #store 0 if found (failure), 1 if not (success)
       if [[ "$exit_status" -eq "0" ]]; then
-        cat sim.out
-        exit 1
+        cat sim.out
+        set -e
+        exit 1
       fi

       # Report fail if simulation does not complete successfully
       grep -qF "SUCCESS: Program complete, exiting." sim.out
       exit_status=$? #store 0 if found (success), 1 if not (failure)
       if [[ "$exit_status" -eq "1" ]]; then
-        cat sim.out
-        exit 1
+        cat sim.out
+        set -e
+        exit 1
       fi
+      # Re-enable exit on error
+      set -e

   test-models-linux-aarch64:
     name: test-models-linux-aarch64

backends/qualcomm/quantizer/custom_annotation.py

Lines changed: 36 additions & 15 deletions
@@ -12,8 +12,11 @@
 )
 from executorch.backends.qualcomm.quantizer.quantizer import (
     get_16a8w_qnn_ptq_config,
+    get_16a8w_qnn_qat_config,
     get_8a8w_qnn_ptq_config,
+    get_8a8w_qnn_qat_config,
     get_ptq_per_channel_quant_config,
+    get_qat_per_channel_quant_config,
     QuantizationConfig,
 )
 from executorch.exir.dialects._ops import ops as exir_ops
@@ -154,7 +157,9 @@ def annotate_prefill_kv_output(gm: torch.fx.GraphModule, kv_quant_attrs: dict):


 def annotate_matmul_16a8w(  # noqa: C901
-    gm: torch.fx.GraphModule, annotate_conv=True
+    gm: torch.fx.GraphModule,
+    annotate_conv=True,
+    is_qat=False,
 ) -> None:
     """
     This function is specific for matmul op 16a8w.
@@ -242,7 +247,6 @@ def annotate_rms_norm(node: Node, quantization_config: QuantizationConfig) -> None:
     def annotate_single_in_single_out(
         node: Node, quantization_config: QuantizationConfig
     ) -> None:
-
         input_qspec_map = {}
         input_act = node.args[0]
         input_qspec_map[input_act] = quantization_config.input_activation
@@ -256,7 +260,6 @@ def annotate_single_in_single_out(
     def annotate_single_in_share_out(
         node: Node, quantization_config: QuantizationConfig
     ) -> None:
-
         input_qspec_map = {}
         input_act = node.args[0]
         input_qspec_map[input_act] = quantization_config.input_activation
@@ -287,16 +290,27 @@ def annotate_stack(node: Node, quantization_config: QuantizationConfig) -> None:
             _annotated=True,
         )

-    def annotate_matmul_input1(node: Node):
-        quantization_config_8a8w = get_8a8w_qnn_ptq_config(
-            act_symmetric=True, act_observer=MinMaxObserver
-        )
-        quantization_config_8a4w_per_channel = get_ptq_per_channel_quant_config(
-            act_dtype=torch.uint8,
-            weight_dtype=torch.int4,
-            act_observer=MinMaxObserver,
-            act_symmetric=True,
-        )
+    def annotate_matmul_input1(node: Node, is_qat: str):
+        if is_qat:
+            quantization_config_8a8w = get_8a8w_qnn_qat_config(
+                act_symmetric=True, act_observer=MinMaxObserver
+            )
+            quantization_config_8a4w_per_channel = get_qat_per_channel_quant_config(
+                act_dtype=torch.uint8,
+                weight_dtype=torch.int4,
+                act_observer=MinMaxObserver,
+                act_symmetric=True,
+            )
+        else:
+            quantization_config_8a8w = get_8a8w_qnn_ptq_config(
+                act_symmetric=True, act_observer=MinMaxObserver
+            )
+            quantization_config_8a4w_per_channel = get_ptq_per_channel_quant_config(
+                act_dtype=torch.uint8,
+                weight_dtype=torch.int4,
+                act_observer=MinMaxObserver,
+                act_symmetric=True,
+            )
         while isinstance(node, Node) and node.op == "call_function":
             if node.target in [
                 torch.ops.aten.permute.default,
@@ -334,12 +348,19 @@ def annotate_matmul_input1(node: Node):
                 print(f"The node ({node}) is not expected in the input1 of the matmul")
             node = node.args[0]

-    quantization_config_16a8w = get_16a8w_qnn_ptq_config(act_observer=MinMaxObserver)
+    if is_qat:
+        quantization_config_16a8w = get_16a8w_qnn_qat_config(
+            act_observer=MinMaxObserver
+        )
+    else:
+        quantization_config_16a8w = get_16a8w_qnn_ptq_config(
+            act_observer=MinMaxObserver
+        )

     for node in gm.graph.nodes:
         if node.op == "call_function" and node.target == torch.ops.aten.matmul.default:
             annotate_matmul(node, quantization_config_16a8w)
-            annotate_matmul_input1(node.args[1])
+            annotate_matmul_input1(node.args[1], is_qat=is_qat)


 def custom_annotate_llama_matmul_16a8w(gm: torch.fx.GraphModule) -> None:  # noqa: C901
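Note: the sketch below is illustrative only. The toy module and the export step are assumptions; only the annotate_matmul_16a8w signature (gm, annotate_conv, is_qat) comes from this diff, and in a real flow the graph module would normally come from the Qualcomm quantization pipeline rather than a bare torch.export.

# Sketch only: TinyMatmul and the export call are hypothetical; the keyword
# arguments mirror the signature introduced in this commit.
import torch

from executorch.backends.qualcomm.quantizer.custom_annotation import (
    annotate_matmul_16a8w,
)


class TinyMatmul(torch.nn.Module):
    def forward(self, x, y):
        return torch.matmul(x, y)


example_inputs = (torch.randn(1, 8, 16), torch.randn(1, 16, 8))
gm = torch.export.export(TinyMatmul(), example_inputs).module()

# is_qat=False (the default) keeps the previous PTQ annotation path;
# is_qat=True routes matmul and its input1 chain through the QAT configs.
annotate_matmul_16a8w(gm, annotate_conv=True, is_qat=True)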

backends/qualcomm/quantizer/qconfig.py

Lines changed: 107 additions & 16 deletions
@@ -187,6 +187,65 @@ def get_16a8w_qnn_ptq_config(
     return quantization_config


+def get_16a8w_qnn_qat_config(
+    act_observer=MovingAverageMinMaxObserver,
+) -> QuantizationConfig:
+    extra_args: Dict[str, Any] = {"eps": 2**-20}
+    act_fake_quant_ctr = FakeQuantize.with_args(
+        dtype=torch.int32,
+        quant_min=torch.iinfo(torch.uint16).min,
+        quant_max=torch.iinfo(torch.uint16).max,
+        qscheme=torch.per_tensor_affine,
+        reduce_range=True,
+        observer=act_observer.with_args(**extra_args),
+    )
+    act_quantization_spec = QuantizationSpec(
+        dtype=torch.int32,
+        quant_min=torch.iinfo(torch.uint16).min,
+        quant_max=torch.iinfo(torch.uint16).max,
+        qscheme=torch.per_tensor_affine,
+        observer_or_fake_quant_ctr=act_fake_quant_ctr,
+    )
+    weight_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args(
+        dtype=torch.int8,
+        quant_min=torch.iinfo(torch.int8).min + 1,
+        quant_max=torch.iinfo(torch.int8).max,
+        qscheme=torch.per_tensor_symmetric,
+        reduce_range=True,
+        observer=MovingAverageMinMaxObserver,
+    )
+    weight_quantization_spec = QuantizationSpec(
+        dtype=torch.int8,
+        quant_min=torch.iinfo(torch.int8).min + 1,
+        quant_max=torch.iinfo(torch.int8).max,
+        qscheme=torch.per_tensor_symmetric,
+        ch_axis=0,
+        observer_or_fake_quant_ctr=weight_fake_quant_ctr,
+    )
+    bias_fake_quant_ctr = FakeQuantize.with_args(
+        dtype=torch.int32,
+        quant_min=torch.iinfo(torch.int32).min,
+        quant_max=torch.iinfo(torch.int32).max,
+        qscheme=torch.per_tensor_symmetric,
+        observer=MovingAverageMinMaxObserver,
+    )
+    bias_quantization_spec = QuantizationSpec(
+        dtype=torch.int32,
+        quant_min=torch.iinfo(torch.int32).min,
+        quant_max=torch.iinfo(torch.int32).max,
+        qscheme=torch.per_tensor_symmetric,
+        observer_or_fake_quant_ctr=bias_fake_quant_ctr,
+    )
+    quantization_config = QuantizationConfig(
+        input_activation=act_quantization_spec,
+        output_activation=act_quantization_spec,
+        weight=weight_quantization_spec,
+        bias=bias_quantization_spec,
+    )
+
+    return quantization_config
+
+
 def get_16a16w_qnn_ptq_config(
     act_observer=MovingAverageMinMaxObserver,
 ) -> QuantizationConfig:
@@ -459,6 +518,7 @@ def get_qat_per_channel_quant_config(
     act_dtype=torch.uint8,
     weight_dtype=torch.int8,
     act_observer=MovingAverageMinMaxObserver,
+    act_symmetric=False,
 ) -> QuantizationConfig:
     supported_act_types = {
         torch.uint8,
@@ -476,21 +536,38 @@ def get_qat_per_channel_quant_config(
     ), f"weight_dtype, {weight_dtype} is not one of supported types, {supported_weight_dtypes}"

     # torch does not support uint16 quantization, use int32 to bypass
-    act_fake_quant_ctr = FakeQuantize.with_args(
-        dtype=torch.int32 if act_dtype == torch.uint16 else act_dtype,
-        quant_min=torch.iinfo(act_dtype).min,
-        quant_max=torch.iinfo(act_dtype).max,
-        qscheme=torch.per_tensor_affine,
-        reduce_range=True,
-        observer=act_observer,
-    )
-    act_quantization_spec = QuantizationSpec(
-        dtype=torch.int32 if act_dtype == torch.uint16 else act_dtype,
-        quant_min=torch.iinfo(act_dtype).min,
-        quant_max=torch.iinfo(act_dtype).max,
-        qscheme=torch.per_tensor_affine,
-        observer_or_fake_quant_ctr=act_fake_quant_ctr,
-    )
+    if act_symmetric:
+        # If zero_point is 128, htp can do optimizations.
+        # If we keep quant_min and quant_max none, observer will default use 128 as zero_point.
+        # If we provide uint8 quant_min/max, it will use 127 as zero_point, which is undesired.
+        act_fake_quant_ctr = FakeQuantize.with_args(
+            dtype=torch.int32 if act_dtype == torch.uint16 else act_dtype,
+            qscheme=torch.per_tensor_symmetric,
+            reduce_range=True,
+            observer=act_observer,
+        )
+        act_quantization_spec = QuantizationSpec(
+            dtype=torch.int32 if act_dtype == torch.uint16 else act_dtype,
+            qscheme=torch.per_tensor_symmetric,
+            ch_axis=0,
+            observer_or_fake_quant_ctr=act_fake_quant_ctr,
+        )
+    else:
+        act_fake_quant_ctr = FakeQuantize.with_args(
+            dtype=torch.int32 if act_dtype == torch.uint16 else act_dtype,
+            quant_min=torch.iinfo(act_dtype).min,
+            quant_max=torch.iinfo(act_dtype).max,
+            qscheme=torch.per_tensor_affine,
+            reduce_range=True,
+            observer=act_observer,
+        )
+        act_quantization_spec = QuantizationSpec(
+            dtype=torch.int32 if act_dtype == torch.uint16 else act_dtype,
+            quant_min=torch.iinfo(act_dtype).min,
+            quant_max=torch.iinfo(act_dtype).max,
+            qscheme=torch.per_tensor_affine,
+            observer_or_fake_quant_ctr=act_fake_quant_ctr,
+        )

     weight_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args(
         dtype=torch.int8 if weight_dtype == torch.int4 else weight_dtype,
@@ -513,7 +590,21 @@ def get_qat_per_channel_quant_config(
         observer_or_fake_quant_ctr=weight_fake_quant_ctr,
     )

-    bias_quantization_spec = _derived_bias_quant_spec
+    bias_fake_quant_ctr = FakeQuantize.with_args(
+        dtype=torch.int32,
+        quant_min=torch.iinfo(torch.int32).min,
+        quant_max=torch.iinfo(torch.int32).max,
+        qscheme=torch.per_tensor_symmetric,
+        reduce_range=True,
+        observer=MovingAverageMinMaxObserver,
+    )
+    bias_quantization_spec = QuantizationSpec(
+        dtype=torch.int32,
+        quant_min=torch.iinfo(torch.int32).min,
+        quant_max=torch.iinfo(torch.int32).max,
+        qscheme=torch.per_tensor_symmetric,
+        observer_or_fake_quant_ctr=bias_fake_quant_ctr,
+    )

     quantization_config = QuantizationConfig(
         input_activation=act_quantization_spec,
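Note: a minimal sketch of invoking the helpers this file adds or extends. The import path (via the quantizer module's re-exports) and the MinMaxObserver / 8a4w pairing mirror the custom_annotation.py call sites in this commit; the surrounding script itself is illustrative, not part of the change.

# Illustrative only: names come from this commit's diffs; nothing else is implied.
import torch
from torch.ao.quantization.observer import MinMaxObserver

from executorch.backends.qualcomm.quantizer.quantizer import (
    get_16a8w_qnn_qat_config,
    get_qat_per_channel_quant_config,
)

# New 16-bit-activation / 8-bit-weight QAT config. uint16 activations are
# modeled with torch.int32 specs bounded by torch.iinfo(torch.uint16), since
# torch's fake quantization has no native uint16 dtype.
qat_16a8w = get_16a8w_qnn_qat_config(act_observer=MinMaxObserver)

# get_qat_per_channel_quant_config now accepts act_symmetric. With
# act_symmetric=True, quant_min/quant_max are left unset so the observer
# defaults to a zero_point of 128, which HTP can optimize (per the comment
# added in this diff).
qat_8a4w_per_channel = get_qat_per_channel_quant_config(
    act_dtype=torch.uint8,
    weight_dtype=torch.int4,
    act_observer=MinMaxObserver,
    act_symmetric=True,
)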

backends/qualcomm/quantizer/quantizer.py

Lines changed: 2 additions & 0 deletions
@@ -23,6 +23,7 @@
     get_16a4w_qnn_ptq_config,
     get_16a4w_qnn_qat_config,
     get_16a8w_qnn_ptq_config,
+    get_16a8w_qnn_qat_config,
     get_8a8w_qnn_ptq_config,
     get_8a8w_qnn_qat_config,
     get_ptq_per_block_quant_config,
@@ -39,6 +40,7 @@
     "QuantDtype",
     "get_16a4w_qnn_ptq_config",
     "get_16a8w_qnn_ptq_config",
+    "get_16a8w_qnn_qat_config",
     "get_16a16w_qnn_ptq_config",
     "get_8a8w_qnn_ptq_config",
     "get_8a8w_qnn_qat_config",

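Note: a minimal import check for the new re-export, assuming an environment with the ExecuTorch Qualcomm backend installed; it only uses names that appear in this file's import list and __all__.

# The QAT variant now sits next to the existing PTQ helper in the public API.
from executorch.backends.qualcomm.quantizer.quantizer import (
    get_16a8w_qnn_ptq_config,
    get_16a8w_qnn_qat_config,
)

ptq_cfg = get_16a8w_qnn_ptq_config()  # existing PTQ config
qat_cfg = get_16a8w_qnn_qat_config()  # added in this commit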