Skip to content

Commit 1e5030a

Browse files
committed
Add QAT (quantization-aware training) model converter
ghstack-source-id: 29c6878 Pull Request resolved: #2488
1 parent 32fc92b commit 1e5030a

File tree

3 files changed

+112
-3
lines changed

3 files changed

+112
-3
lines changed

torchtitan/components/lora.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ class LoRAConverter(Configurable):
8484
"""Apply LoRA adapters to all Linear layers in a model."""
8585

8686
@dataclass(kw_only=True, slots=True)
87-
class Config(Configurable.Config):
87+
class LoRAConfig(Configurable.Config):
8888
rank: int = 8
8989
"""Rank of the LoRA matrices (lora_a: in_features x rank, lora_b: rank x out_features)."""
9090

@@ -104,7 +104,10 @@ class Config(Configurable.Config):
104104
"""Scaler block size for NF4 quantization. Default 128 works with debugmodel on 8 GPUs.
105105
The default torchao value (256) may be too large for sharded tensors."""
106106

107-
def __init__(self, config: Config, **kwargs):
107+
# Alias for backwards compatibility
108+
Config = LoRAConfig
109+
110+
def __init__(self, config: LoRAConfig, **kwargs):
108111
self.rank = config.rank
109112
self.alpha = config.alpha
110113
self.save_adapter_only = config.save_adapter_only
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
from dataclasses import dataclass
8+
from typing import Literal
9+
10+
import torch
11+
import torch.nn as nn
12+
from torchtitan.config import Configurable
13+
from torchtitan.tools.logging import logger
14+
15+
16+
class QATConverter(Configurable):
    """Replace nn.Linear with FakeQuantizedLinear for quantization-aware training.

    Uses torchao's FakeQuantizedLinear to simulate int4 weight quantization during
    training. The fake quantization is applied in the forward pass so the model
    learns to compensate for quantization error.

    When composed with LoRA (QATConverter listed before LoRAConverter in converters),
    LoRA will inherit from FakeQuantizedLinear so base weights are fake-quantized
    while LoRA adapters stay full-precision.
    """

    @dataclass(kw_only=True, slots=True)
    class QATConfig(Configurable.Config):
        dtype: Literal["int4", "int8"] = "int4"
        """Data type for fake quantization. Supported: 'int4', 'int8'."""

        group_size: int = 256
        """Group size for per-group weight quantization.
        Must divide in_features of all Linear layers in the model."""

    # Alias for backwards compatibility
    Config = QATConfig

    def __init__(self, config: QATConfig, **kwargs):
        """Store validated QAT settings from ``config``.

        Raises:
            ValueError: if ``config.dtype`` is not a supported dtype string, or
                ``config.group_size`` is not positive.
        """
        # Fail fast with a clear message here instead of a bare KeyError later
        # inside convert() when the dtype string is looked up.
        if config.dtype not in ("int4", "int8"):
            raise ValueError(
                f"Unsupported QAT dtype: {config.dtype!r} (expected 'int4' or 'int8')"
            )
        if config.group_size <= 0:
            raise ValueError(
                f"QAT group_size must be positive, got {config.group_size}"
            )
        self.dtype = config.dtype
        self.group_size = config.group_size
        logger.info(
            f"QAT training active (dtype={self.dtype}, group_size={self.group_size})"
        )

    def convert(self, model: nn.Module) -> None:
        """Recursively swap every nn.Linear in ``model`` for FakeQuantizedLinear.

        The replacement modules fake-quantize weights per-group (symmetric) in
        the forward pass; parameters stay full-precision.
        """
        # Deferred import: torchao is only needed when this converter is used.
        from torchao.quantization.qat import FakeQuantizedLinear, IntxFakeQuantizeConfig

        dtype_map = {
            "int4": torch.int4,
            "int8": torch.int8,
        }
        torch_dtype = dtype_map[self.dtype]

        weight_config = IntxFakeQuantizeConfig(
            dtype=torch_dtype,
            group_size=self.group_size,
            is_symmetric=True,
        )

        def _replace_recursive(parent: nn.Module) -> None:
            # Depth-first walk: Linear leaves are swapped on their parent;
            # every other module is descended into. list() snapshots children
            # so replacement during iteration is safe.
            for name, child in list(parent.named_children()):
                if isinstance(child, nn.Linear):
                    fq = FakeQuantizedLinear.from_linear(
                        child, weight_config=weight_config
                    )
                    setattr(parent, name, fq)
                else:
                    _replace_recursive(child)

        _replace_recursive(model)
        logger.info(
            "Swapped to FakeQuantizedLinear layers "
            f"(dtype={self.dtype}, group_size={self.group_size})"
        )

    def post_optimizer_hook(self, model: nn.Module | list[nn.Module]) -> None:
        """No-op: fake quantization requires no work after the optimizer step."""
        pass

torchtitan/models/llama3/config_registry.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
OptimizersInBackwardContainer,
1414
)
1515
from torchtitan.components.quantization.float8 import Float8LinearConverter
16+
from torchtitan.components.quantization.qat import QATConverter
1617
from torchtitan.components.validate import Validator
1718
from torchtitan.config import (
1819
ActivationCheckpointConfig,
@@ -131,7 +132,7 @@ def llama3_debugmodel_lora() -> Trainer.Config:
131132

132133

133134
def llama3_debugmodel_qlora() -> Trainer.Config:
134-
config = llama3_debugmodel_lora()
135+
config = llama3_debugmodel()
135136
config.model_converters = ModelConvertersContainer.Config(
136137
converters=[
137138
LoRAConverter.Config(
@@ -144,6 +145,32 @@ def llama3_debugmodel_qlora() -> Trainer.Config:
144145
return config
145146

146147

148+
def llama3_debugmodel_qat() -> Trainer.Config:
    """Debug-model trainer config with quantization-aware training enabled."""
    config = llama3_debugmodel()
    qat = QATConverter.Config()
    config.model_converters = ModelConvertersContainer.Config(converters=[qat])
    return config
156+
157+
158+
def llama3_debugmodel_qat_lora() -> Trainer.Config:
    """Debug-model trainer config combining QAT with LoRA adapters."""
    config = llama3_debugmodel()
    # QATConverter must come before LoRAConverter so that LoRA inherits from
    # FakeQuantizedLinear, giving fake-quantized base weights + full-precision adapters.
    converters = [
        QATConverter.Config(),
        LoRAConverter.Config(rank=8, alpha=16.0),
    ]
    config.model_converters = ModelConvertersContainer.Config(converters=converters)
    return config
172+
173+
147174
def llama3_8b() -> Trainer.Config:
148175
return Trainer.Config(
149176
hf_assets_path="./assets/hf/Llama-3.1-8B",

0 commit comments

Comments
 (0)