
Commit 1247545

Qualcomm AI Engine Direct - qat proto
* [qat proto]
  - Add qat proto
  - Add unit test test_qnn_backend_linear_qat
  - Test command:

    ```bash
    python backends/qualcomm/tests/test_qnn_delegate.py -H $HOST -s $DEVICE -b $build-android/ -m "SM8650" -r $EXECUTORCH_ROOT -k TestQNNQuantizedOperator.test_qnn_backend_linear_qat
    ```

* [Fix lint]

Co-authored-by: Joey Tsai <[email protected]>
Pull Request resolved: #6222
1 parent 3ea8538 commit 1247545

File tree

4 files changed: +97 −1 lines changed

- backends/qualcomm/quantizer/quantizer.py
- backends/qualcomm/quantizer/utils.py
- backends/qualcomm/tests/test_qnn_delegate.py
- backends/qualcomm/tests/utils.py

backends/qualcomm/quantizer/quantizer.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -26,6 +26,7 @@
     get_16a4w_qnn_ptq_config,
     get_16a8w_qnn_ptq_config,
     get_default_16bit_qnn_ptq_config,
+    get_default_8bit_qat_proto,
     get_default_8bit_qnn_ptq_config,
     get_ptq_per_channel_quant_config,
     OP_ANNOTATOR,
@@ -39,6 +40,7 @@
     "get_16a8w_qnn_ptq_config",
     "get_default_16bit_qnn_ptq_config",
     "get_default_8bit_qnn_ptq_config",
+    "get_default_8bit_qat_proto",
 ]
```
backends/qualcomm/quantizer/utils.py

Lines changed: 45 additions & 0 deletions
```diff
@@ -15,6 +15,11 @@
 from torch._ops import OpOverload
 from torch._subclasses import FakeTensor
 
+from torch.ao.quantization.fake_quantize import (
+    default_fake_quant,
+    FusedMovingAvgObsFakeQuantize,
+)
+
 from torch.ao.quantization.observer import (
     FixedQParamsObserver,
     MinMaxObserver,
@@ -179,6 +184,46 @@ def _derive_bias_qparams_fn(
     )
 
 
+def get_default_8bit_qat_proto(act_symmetric: bool = False) -> QuantizationConfig:
+
+    act_quantization_spec = QuantizationSpec(
+        dtype=torch.uint8,
+        qscheme=(
+            torch.per_tensor_symmetric if act_symmetric else torch.per_tensor_affine
+        ),
+        ch_axis=0,
+        observer_or_fake_quant_ctr=default_fake_quant,
+    )
+
+    weight_quantization_spec = QuantizationSpec(
+        dtype=torch.int8,
+        quant_min=torch.iinfo(torch.int8).min + 1,
+        quant_max=torch.iinfo(torch.int8).max,
+        qscheme=torch.per_tensor_symmetric,
+        ch_axis=0,
+        observer_or_fake_quant_ctr=FusedMovingAvgObsFakeQuantize.with_args(
+            observer=MovingAverageMinMaxObserver
+        ),
+    )
+
+    bias_quantization_spec = QuantizationSpec(
+        dtype=torch.int32,
+        quant_min=torch.iinfo(torch.int32).min,
+        quant_max=torch.iinfo(torch.int32).max,
+        qscheme=torch.per_tensor_symmetric,
+        observer_or_fake_quant_ctr=default_fake_quant,
+    )
+
+    quantization_config = QuantizationConfig(
+        input_activation=act_quantization_spec,
+        output_activation=act_quantization_spec,
+        weight=weight_quantization_spec,
+        bias=bias_quantization_spec,
+    )
+
+    return quantization_config
+
+
 def get_default_8bit_qnn_ptq_config(
     act_symmetric: bool = False, act_observer=MovingAverageMinMaxObserver
 ) -> QuantizationConfig:
```
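Note on the config above: unlike the PTQ configs, `observer_or_fake_quant_ctr` points at fake-quantize factories rather than plain observers, so quantization error is simulated in the forward pass during training. Below is a minimal sketch, not part of the diff, of what those factories produce; the explicit quant_min/quant_max/dtype/qscheme arguments are normally derived from the QuantizationSpec by the PT2E flow and are spelled out here only for illustration.

```python
import torch
from torch.ao.quantization.fake_quantize import (
    default_fake_quant,
    FusedMovingAvgObsFakeQuantize,
)
from torch.ao.quantization.observer import MovingAverageMinMaxObserver

# Activation path: default_fake_quant is a FakeQuantize factory
# (uint8 affine, MovingAverageMinMaxObserver underneath).
act_fq = default_fake_quant()

# Weight path: fused observe-and-fake-quantize, mirroring the weight spec above.
# The keyword arguments are illustrative assumptions matching an 8-bit symmetric spec.
weight_fq = FusedMovingAvgObsFakeQuantize.with_args(
    observer=MovingAverageMinMaxObserver,
    quant_min=-127,
    quant_max=127,
    dtype=torch.qint8,
    qscheme=torch.per_tensor_symmetric,
)()

x = torch.randn(3, 4)
y = act_fq(x)                     # observes running min/max, then quantize-dequantizes in float
w = weight_fq(torch.randn(4, 4))  # same idea for weights, via the fused kernel
```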

backends/qualcomm/tests/test_qnn_delegate.py

Lines changed: 20 additions & 0 deletions
```diff
@@ -1042,6 +1042,26 @@ def test_qnn_backend_linear(self):
         module = self.get_qdq_module(module, sample_input)
         self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_linear_qat(self):
+        """
+        Prototype to test qat model
+        """
+        module = Linear()  # noqa: F405
+        sample_input = (torch.randn([3, 4]),)
+
+        module = self.get_prepared_qat_module(module, sample_input)
+
+        optimizer = torch.optim.SGD(module.parameters(), lr=0.1)
+        criterion = torch.nn.CrossEntropyLoss()
+        output = module(*sample_input)
+        loss = criterion(output, module(*sample_input))
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+
+        module = torch.ao.quantization.quantize_pt2e.convert_pt2e(module)
+        self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_log_softmax(self):
         module = LogSoftmax()  # noqa: F405
         sample_input = (torch.randn([1, 4, 8, 8]),)
```
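The training step in this test is only a smoke test: the loss target is simply a second forward pass on the same input. In a real QAT flow the prepared module would be fine-tuned on labeled data before conversion. A minimal sketch under that assumption, with `prepared`, `inputs`, and `targets` as hypothetical placeholders:

```python
import torch


def qat_finetune_step(prepared, optimizer, inputs, targets):
    """One hypothetical fine-tuning step on a prepared (fake-quantized) module."""
    criterion = torch.nn.CrossEntropyLoss()
    optimizer.zero_grad()
    logits = prepared(*inputs)         # forward pass runs through the fake-quant modules
    loss = criterion(logits, targets)  # targets: integer class labels
    loss.backward()                    # gradients flow via straight-through estimators
    optimizer.step()
    return loss.item()
```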

backends/qualcomm/tests/utils.py

Lines changed: 30 additions & 1 deletion
```diff
@@ -20,6 +20,7 @@
 from executorch.backends.qualcomm.quantizer.quantizer import (
     get_16a4w_qnn_ptq_config,
     get_default_16bit_qnn_ptq_config,
+    get_default_8bit_qat_proto,
     QnnQuantizer,
     QuantDtype,
 )
@@ -44,7 +45,11 @@
 from executorch.exir.pass_base import ExportPass
 from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass
 from executorch.exir.program import ExecutorchProgram, ExecutorchProgramManager
-from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
+from torch.ao.quantization.quantize_pt2e import (
+    convert_pt2e,
+    prepare_pt2e,
+    prepare_qat_pt2e,
+)
 
 
 def generate_context_binary(
@@ -426,6 +431,30 @@ def get_qdq_module(
         self.assertTrue(nodes.intersection(q_and_dq))
         return quantized_module
 
+    def get_prepared_qat_module(
+        self,
+        module: torch.nn.Module,
+        inputs: Tuple[torch.Tensor],
+        is_conv_per_channel: Optional[bool] = True,
+        is_linear_per_channel: Optional[bool] = False,
+        custom_quant_annotations: Tuple[Callable] = (),
+        quant_dtype: QuantDtype = QuantDtype.use_8a8w,
+    ) -> torch.fx.GraphModule:
+        m = torch.export.export_for_training(module, inputs).module()
+
+        quantizer = QnnQuantizer()
+        quantizer.add_custom_quant_annotations(custom_quant_annotations)
+        quantizer.set_per_channel_conv_quant(is_conv_per_channel)
+        quantizer.set_per_channel_linear_quant(is_linear_per_channel)
+
+        if quant_dtype == QuantDtype.use_8a8w:
+            quantizer.set_bit8_op_quant_config(get_default_8bit_qat_proto())
+        else:
+            raise RuntimeError("Should not be here")
+
+        prepared = prepare_qat_pt2e(m, quantizer)
+        return torch.ao.quantization.move_exported_model_to_train(prepared)
+
     def split_graph(self, graph_module: torch.fx.GraphModule, division: int):
         class SplitGraph(ExportPass):
             """
```
