Skip to content

Commit 9501b3d

Browse files
committed
Add QAT (quantization-aware training) model converter
ghstack-source-id: b139735 Pull Request resolved: #2488
1 parent 6b775ca commit 9501b3d

File tree

3 files changed

+121
-0
lines changed

3 files changed

+121
-0
lines changed

tests/unit_tests/test_model_converter.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
from torchtitan.components.lora import LoRAConverter
1111
from torchtitan.components.quantization.float8 import Float8LinearConverter
12+
from torchtitan.components.quantization.qat import QATConverter
1213
from torchtitan.config import ConfigManager
1314
from torchtitan.distributed import ParallelDims
1415
from torchtitan.protocols.model_converter import ModelConvertersContainer
@@ -198,3 +199,19 @@ def test_qlora_base_weights_quantized_adapters_full_precision():
198199
assert (
199200
layer.lora_b.weight.dtype == torch.float32
200201
), f"{name}.lora_b.weight should be float32"
202+
203+
204+
def test_qat_preserves_weight_dtype():
    """QAT converter should not change weight dtype (fake quantization happens in forward)."""
    pytest.importorskip("torchao")

    model = SimpleModel()
    # Snapshot each parameter's dtype before conversion.
    original_dtypes = {}
    for name, param in model.named_parameters():
        original_dtypes[name] = param.dtype

    qat = QATConverter(QATConverter.Config(group_size=64))
    qat.convert(model)

    # Every parameter must keep its pre-conversion dtype.
    for name, param in model.named_parameters():
        expected = original_dtypes[name]
        assert (
            param.dtype == expected
        ), f"'{name}' dtype changed from {original_dtypes[name]} to {param.dtype}"
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
from dataclasses import dataclass
8+
from typing import Literal
9+
10+
import torch
11+
import torch.nn as nn
12+
from torchtitan.config import Configurable
13+
from torchtitan.tools.logging import logger
14+
15+
16+
class QATConverter(Configurable):
    """Replace nn.Linear with FakeQuantizedLinear for quantization-aware training.

    Uses torchao's FakeQuantizedLinear to simulate int4 weight quantization during
    training. The fake quantization is applied in the forward pass so the model
    learns to compensate for quantization error.

    When composed with LoRA (QATConverter listed before LoRAConverter in converters),
    LoRA will inherit from FakeQuantizedLinear so base weights are fake-quantized
    while LoRA adapters stay full-precision.
    """

    @dataclass(kw_only=True, slots=True)
    class Config(Configurable.Config):
        dtype: Literal["int4", "int8"] = "int4"
        """Data type for fake quantization. Supported: 'int4', 'int8'."""

        group_size: int = 256
        """Group size for per-group weight quantization.
        Must divide in_features of all Linear layers in the model."""

    def __init__(self, config: Config, **kwargs):
        """Validate and record the QAT settings.

        Raises:
            ValueError: if ``config.dtype`` is not one of the supported values,
                or ``config.group_size`` is not positive. Failing fast here
                gives a clear error instead of a bare ``KeyError`` later in
                :meth:`convert` (the Literal annotation is not enforced at
                runtime, e.g. when the value comes from a config file).
        """
        if config.dtype not in ("int4", "int8"):
            raise ValueError(
                f"Unsupported QAT dtype '{config.dtype}'; expected 'int4' or 'int8'"
            )
        if config.group_size <= 0:
            raise ValueError(
                f"group_size must be a positive integer, got {config.group_size}"
            )
        self.dtype = config.dtype
        self.group_size = config.group_size
        logger.info(
            f"QAT training active (dtype={self.dtype}, group_size={self.group_size})"
        )

    def convert(self, model: nn.Module) -> None:
        """Swap every nn.Linear in ``model`` (in place) for FakeQuantizedLinear.

        torchao is imported lazily so it stays an optional dependency for
        runs that never enable QAT.
        """
        from torchao.quantization.qat import FakeQuantizedLinear, IntxFakeQuantizeConfig
        from torchao.quantization.quant_primitives import TorchAODType

        # int4 has no native torch dtype, so torchao's TorchAODType.INT4
        # stands in for it; int8 maps to the regular torch dtype.
        dtype_map = {
            "int4": TorchAODType.INT4,
            "int8": torch.int8,
        }
        torch_dtype = dtype_map[self.dtype]

        weight_config = IntxFakeQuantizeConfig(
            dtype=torch_dtype,
            group_size=self.group_size,
            is_symmetric=True,
        )

        def _replace_recursive(parent: nn.Module) -> None:
            # Replace direct nn.Linear children; recurse into everything else.
            # NOTE: isinstance also matches nn.Linear subclasses, which is why
            # QATConverter must run before LoRAConverter (see class docstring).
            # list() snapshots children so replacement during iteration is safe.
            for name, child in list(parent.named_children()):
                if isinstance(child, nn.Linear):
                    fq = FakeQuantizedLinear.from_linear(
                        child, weight_config=weight_config
                    )
                    setattr(parent, name, fq)
                else:
                    _replace_recursive(child)

        _replace_recursive(model)
        logger.info(
            "Swapped to FakeQuantizedLinear layers "
            f"(dtype={self.dtype}, group_size={self.group_size})"
        )

    def post_optimizer_hook(self, model: nn.Module | list[nn.Module]) -> None:
        """No-op: fake quantization needs no post-optimizer weight sync."""
        pass

torchtitan/models/llama3/config_registry.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
OptimizersInBackwardContainer,
1414
)
1515
from torchtitan.components.quantization.float8 import Float8LinearConverter
16+
from torchtitan.components.quantization.qat import QATConverter
1617
from torchtitan.components.validate import Validator
1718
from torchtitan.config import (
1819
ActivationCheckpointConfig,
@@ -144,6 +145,32 @@ def llama3_debugmodel_qlora() -> Trainer.Config:
144145
return config
145146

146147

148+
def llama3_debugmodel_qat() -> Trainer.Config:
    """Debug-model trainer config with QAT enabled (default converter settings)."""
    cfg = llama3_debugmodel()
    qat_config = QATConverter.Config()
    cfg.model_converters = ModelConvertersContainer.Config(converters=[qat_config])
    return cfg
156+
157+
158+
def llama3_debugmodel_qat_lora() -> Trainer.Config:
    """Debug-model trainer config combining QAT with LoRA adapters."""
    cfg = llama3_debugmodel()
    # Ordering matters: QATConverter must come before LoRAConverter so that
    # LoRA inherits from FakeQuantizedLinear, giving fake-quantized base
    # weights + full-precision adapters.
    converter_configs = [
        QATConverter.Config(),
        LoRAConverter.Config(rank=8, alpha=16.0),
    ]
    cfg.model_converters = ModelConvertersContainer.Config(
        converters=converter_configs,
    )
    return cfg
172+
173+
147174
def llama3_8b() -> Trainer.Config:
148175
return Trainer.Config(
149176
hf_assets_path="./assets/hf/Llama-3.1-8B",

0 commit comments

Comments
 (0)