Qualcomm AI Engine Direct - Observer Fix and remove unused passes (pytorch#6225)

winskuo-quic · facebook-github-bot · commit dc4be7c74bb9 · 2024-10-16T11:25:13.000-07:00
Summary: - `ConvertToLinear()` is redundant in `qnn_preprocess.py` since this pass is already called in `executorch/backends/qualcomm/utils/utils.py`. - Some models are experiencing a significant drop in accuracy, with a few models having 0% accuracy. This change adds new conditions for performing requantization and changes `ptq_per_channel_quant_config`'s IO from MinMaxObserver to MovingAverageMinMaxObserver to resolve the issue. 1. Why add new conditions for requantization? We noticed this change in a PyTorch PR (pytorch/pytorch@b8eef50#diff-976c3b0c6f85048d3db01a0c394ce8eb16e2f7541f0983d0f4ef549baa4be822L152). Before that PR, the quantization spec only checked whether 2 qspecs were the same by comparing `dtype` and `is_dynamic`. After that change, it checks more attributes such as `scale`, `zero_point`, etc. This causes some nodes to have an extra pair of QDQ nodes. As shown in the image below, there are 2 pairs of QDQ nodes after the PyTorch PR, and these 2 pairs of QDQ nodes have different scale and offset. In the QNN lowering process, a node only saves the quant info right after the node output. For example, the `cat` op below will use `quantize_per_tensor_default_18`'s scale and offset as the node's quant attribute, and all other quant and dequant nodes will be ignored. This causes an accuracy drop, but by inserting a requantize node, we can see an improvement in accuracy for most models. Taking inceptionv3 as an example, the average top1 accuracy goes from 0% to ~75%. I have checked a couple of other models and see that accuracy either stays the same or improves. I have also provided an option for users to skip this requant optimization if they prefer not to use it. **Before:** ![image](https://github.com/user-attachments/assets/e6048b24-347c-4a5b-8406-c11dc14d33ae) ___ **After** ![image](https://github.com/user-attachments/assets/200cca57-f4f7-48bc-83fb-fc1595935569) 2. Why change `ptq_per_channel_quant_config`'s IO from MinMaxObserver to MovingAverageMinMaxObserver? 
After the above change, it seems like there is an inference speed drop due to requantization. By switching to MovingAverageMinMaxObserver, I observed an improvement in inference speed for some models such as inceptionv3. Pull Request resolved: pytorch#6225 Reviewed By: kirklandsign Differential Revision: D64413835 Pulled By: cccclai fbshipit-source-id: a8be66b034c69ff403f9f2985f2b584695f3798b
diff --git a/backends/qualcomm/_passes/annotate_quant_attrs.py b/backends/qualcomm/_passes/annotate_quant_attrs.py
@@ -27,9 +27,12 @@ class AnnotateQuantAttrs(ExportPass):
     generated after quatization process.
     """
 
-    def __init__(self, edge_program: torch.export.ExportedProgram):
+    def __init__(
+        self, edge_program: torch.export.ExportedProgram, skip_advanced_requat: bool
+    ):
         super(AnnotateQuantAttrs, self).__init__()
         self.edge_program = edge_program
+        self.skip_advanced_requant = skip_advanced_requat
 
     def _annotate_source_nodes(
         self, quant_node: torch.fx.Node, quant_attrs: Dict[str, Any]
@@ -68,9 +71,26 @@ def _annotate_requant(self, n):
 
             # TODO: Store multiple pairs of requantize attributes when we have an op builder
             # that has multiple outputs that requires quant attributes.
-            if q_attrs["dtype"] != dq_attrs["dtype"]:
-                dq_attrs[QCOM_ENCODING] = q_attrs[QCOM_ENCODING]
-                n.args[0].meta[QCOM_REQUANTIZE] = dq_attrs
+            if self.skip_advanced_requant:
+                if q_attrs["dtype"] != dq_attrs["dtype"]:
+                    dq_attrs[QCOM_ENCODING] = q_attrs[QCOM_ENCODING]
+                    n.args[0].meta[QCOM_REQUANTIZE] = dq_attrs
+            else:
+                # When dtype is the same but other specs such as scale and offset are different,
+                # insert requant to improve accuracy.
+                # Users can turn this feature off if any inference speed drop is observed.
+                if any(
+                    q_attrs[attr] != dq_attrs[attr]
+                    for attr in [
+                        "scale",
+                        "zero_point",
+                        "quant_min",
+                        "quant_max",
+                        "dtype",
+                    ]
+                ):
+                    dq_attrs[QCOM_ENCODING] = q_attrs[QCOM_ENCODING]
+                    n.args[0].meta[QCOM_REQUANTIZE] = dq_attrs
 
     # Dequant all the fold_quant parameters back to fp32.
     # If an operation is not supported by QNN and got fallback, it will expect a fp32 param.
diff --git a/backends/qualcomm/qnn_preprocess.py b/backends/qualcomm/qnn_preprocess.py
@@ -11,7 +11,6 @@
 import executorch.backends.qualcomm.python.PyQnnManagerAdaptor as PyQnnManager
 
 import torch  # noqa: F401
-from executorch.backends.qualcomm._passes.convert_to_linear import ConvertToLinear
 from executorch.backends.qualcomm._passes.fuse_consecutive_transpose import (
     FuseConsecutiveTranspose,
 )
@@ -49,7 +48,6 @@ def preprocess(
         # QNN Delegate Specific Passes
         qnn_compiler_passes = PassManager(
             passes=[
-                ConvertToLinear(),
                 InsertRequantize(edge_program),
                 InsertIOQDQ(edge_program),
                 LayoutTransform(edge_program, insert_permute=True),
diff --git a/backends/qualcomm/quantizer/utils.py b/backends/qualcomm/quantizer/utils.py
@@ -364,7 +364,7 @@ def get_ptq_per_channel_quant_config(
         quant_min=torch.iinfo(act_dtype).min,
         quant_max=torch.iinfo(act_dtype).max,
         qscheme=torch.per_tensor_affine,
-        observer_or_fake_quant_ctr=MinMaxObserver.with_args(**extra_args),
+        observer_or_fake_quant_ctr=MovingAverageMinMaxObserver.with_args(**extra_args),
     )
 
     weight_quantization_spec = QuantizationSpec(
diff --git a/backends/qualcomm/utils/constants.py b/backends/qualcomm/utils/constants.py
@@ -26,6 +26,7 @@
 QCOM_ZERO_POINT = "zero_point"
 QCOM_ZERO_POINTS = "zero_points"
 QCOM_PASS_EXPAND_BROADCAST_SHAPE = "expand_broadcast_shape"
+QCOM_PASS_SKIP_ADVANCED_REQUANT = "skip_advanced_requant"
 
 # constants in backends/qualcomm/tests
 QCOM_ANNOTATION = "annotation"
diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py
@@ -69,6 +69,7 @@
 )
 from executorch.backends.qualcomm.utils.constants import (
     QCOM_PASS_EXPAND_BROADCAST_SHAPE,
+    QCOM_PASS_SKIP_ADVANCED_REQUANT,
     QCOM_QNN_COMPILE_SPEC,
 )
 
@@ -305,7 +306,9 @@ def _transform(
     ConvertBmmToMatmul()(graph_module)
     ConvertInterpolateWithUpsample2D()(graph_module)
     I64toI32(edge_program)(graph_module)
-    AnnotateQuantAttrs(edge_program)(graph_module)
+    AnnotateQuantAttrs(
+        edge_program, QCOM_PASS_SKIP_ADVANCED_REQUANT in custom_pass_config
+    )(graph_module)
     AnnotateAndQuantScalar(edge_program)(graph_module)
     AnnotateDecomposed(edge_program)(graph_module)
     FoldQDQ()(graph_module)

Original file line number	Diff line number	Diff line change
`@@ -364,7 +364,7 @@ def get_ptq_per_channel_quant_config(`
`364`	`364`	`quant_min=torch.iinfo(act_dtype).min,`
`365`	`365`	`quant_max=torch.iinfo(act_dtype).max,`
`366`	`366`	`qscheme=torch.per_tensor_affine,`
`367`		`- observer_or_fake_quant_ctr=MinMaxObserver.with_args(**extra_args),`
	`367`	`+ observer_or_fake_quant_ctr=MovingAverageMinMaxObserver.with_args(**extra_args),`
`368`	`368`	`)`
`369`	`369`
`370`	`370`	`weight_quantization_spec = QuantizationSpec(`