@@ -91,6 +91,10 @@ class QuantizationAwareTraining(Callback):
     .. warning:: ``QuantizationAwareTraining`` is in beta and subject to change.
 
+        The ``LightningModule`` is prepared for quantization-aware training in the ``on_fit_start`` hook. Checkpoints
+        saved during training include the stats collected so far for the quantization conversion, but they do not
+        contain the quantized or fused model/layers. The quantization itself is performed in the ``on_fit_end`` hook,
+        so the model needs to be saved after training finishes if the quantized model is desired.
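+
+        A minimal sketch of the intended workflow (``LitModel`` is a placeholder for a user-defined
+        ``LightningModule``):
+
+        .. code-block:: python
+
+            model = LitModel()
+            trainer = Trainer(callbacks=[QuantizationAwareTraining()])
+            trainer.fit(model)  # quantization/fusion is applied to ``model`` in ``on_fit_end``
+            torch.save(model.state_dict(), "model.pt")  # save *after* ``fit`` to keep the quantized weights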
 
     Args:
@@ -178,7 +182,7 @@ def __init__(
         )
         self._collect_quantization = collect_quantization
 
-        self.modules_to_fuse = modules_to_fuse
+        self._modules_to_fuse = modules_to_fuse
         self._input_compatible = input_compatible
         self._convert_on_fit_end = quantize_on_fit_end
@@ -193,11 +197,12 @@ def __init__(
         self._forward_calls = 0
         self._fake_quant_to_initial_state_dict = {}
         self._last_fake_quant_to_observer_enabled = {}
+        self._module_prepared = False
 
     def _check_feasible_fuse(self, model: "pl.LightningModule") -> bool:
-        if not self.modules_to_fuse:
+        if not self._modules_to_fuse:
             return False
-        for group in self.modules_to_fuse:
+        for group in self._modules_to_fuse:
             if not all(_recursive_hasattr(model, m) for m in group):
                 raise MisconfigurationException(
                     f"You have requested to fuse {group} but one or more of them is not an attribute of your model"
@@ -217,44 +222,50 @@ def _restore_last_observer_enabled(self) -> None:
         for fake_quant, observer_enabled in self._last_fake_quant_to_observer_enabled.items():
             fake_quant.observer_enabled.copy_(observer_enabled)
 
-    def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
+    def _prepare_model(self, model: torch.nn.Module) -> None:
+        if self._module_prepared:
+            return
         # QuantStub converts tensors from floating point to quantized
-        pl_module.quant = torch.quantization.QuantStub()
+        model.quant = torch.quantization.QuantStub()
         # DeQuantStub converts tensors from quantized to floating point
-        pl_module.dequant = torch.quantization.DeQuantStub()
+        model.dequant = torch.quantization.DeQuantStub()
         # manually specify where tensors will be converted from quantized
         # to floating point in the quantized model
-        self.__module_forward = pl_module.forward
-        pl_module.forward = wrap_qat_forward_context(
-            quant_cb=self, model=pl_module, func=pl_module.forward, trigger_condition=self._collect_quantization
+        self.__module_forward = model.forward
+        model.forward = wrap_qat_forward_context(
+            quant_cb=self, model=model, func=model.forward, trigger_condition=self._collect_quantization
         )
 
         # attach a global qconfig, which contains information about what kind
         # of observers to attach. Use 'fbgemm' for server inference
         if isinstance(self._qconfig, str):
             if self._observer_type == "histogram":
-                pl_module.qconfig = torch.quantization.get_default_qconfig(self._qconfig)
+                model.qconfig = torch.quantization.get_default_qconfig(self._qconfig)
             elif self._observer_type == "average":
                 # version=None corresponds to using FakeQuantize rather than
                 # FusedMovingAvgObsFakeQuantize which was introduced in PT1.10
                 # details in https://github.com/pytorch/pytorch/issues/64564
                 extra_kwargs = dict(version=None) if _TORCH_GREATER_EQUAL_1_10 else {}
-                pl_module.qconfig = torch.quantization.get_default_qat_qconfig(self._qconfig, **extra_kwargs)
+                model.qconfig = torch.quantization.get_default_qat_qconfig(self._qconfig, **extra_kwargs)
 
         elif isinstance(self._qconfig, QConfig):
-            pl_module.qconfig = self._qconfig
+            model.qconfig = self._qconfig
 
-        if self._check_feasible_fuse(pl_module):
-            torch.quantization.fuse_modules(pl_module, self.modules_to_fuse, inplace=True)
+        if self._check_feasible_fuse(model):
+            torch.quantization.fuse_modules(model, self._modules_to_fuse, inplace=True)
 
         # Prepare the model for QAT. This inserts observers and fake_quants in
         # the model that will observe weight and activation tensors during calibration.
-        torch.quantization.prepare_qat(pl_module, inplace=True)
+        torch.quantization.prepare_qat(model, inplace=True)
 
-        fake_quants = tuple(module for module in pl_module.modules() if isinstance(module, FakeQuantizeBase))
+        fake_quants = tuple(module for module in model.modules() if isinstance(module, FakeQuantizeBase))
         self._fake_quant_to_initial_state_dict = {
             fake_quant: copy.deepcopy(fake_quant.state_dict()) for fake_quant in fake_quants
         }
+        self._module_prepared = True
+
+    def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule"):
+        self._prepare_model(pl_module)
 
     def on_fit_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
         if not self._convert_on_fit_end:
@@ -311,3 +322,18 @@ def on_predict_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule
     def on_predict_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
         if "predict" in self._observer_disabled_stages:
             self._restore_last_observer_enabled()
+
+    def state_dict(self) -> Dict[str, Any]:
+        keys = {"_qconfig", "_observer_type", "_collect_quantization", "_modules_to_fuse", "_input_compatible"}
+        return {n: getattr(self, n) for n in keys}
+
+    def _load_before_model(self, model: torch.nn.Module, state_dict: Dict[str, Any]) -> None:
+        """Special hook that gets called by the CheckpointConnector *before* the model gets loaded.
+
+        This hook replaces the :meth:`on_load_checkpoint` and :meth:`load_state_dict` callback methods, which get
+        called after the model has already loaded the weights. For quantization, we need to convert the model first,
+        before that happens, assuming the previous training used quantization.
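+
+        A rough sketch of the resume flow this enables (assuming the ``ckpt_path`` argument of ``Trainer.fit`` and a
+        placeholder checkpoint path):
+
+        .. code-block:: python
+
+            trainer = Trainer(callbacks=[QuantizationAwareTraining()])
+            # the callback re-prepares the model before the checkpoint weights are restored
+            trainer.fit(model, ckpt_path="qat.ckpt")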
+        """
+        for k, v in state_dict.items():
+            setattr(self, k, v)
+        self._prepare_model(model)