Skip to content

Commit 34ef778

Browse files
committed
Add QAT (quantization-aware training) model converter
ghstack-source-id: 84aff39 Pull Request resolved: #2488
1 parent 0a1f1aa commit 34ef778

File tree

5 files changed

+339
-6
lines changed

5 files changed

+339
-6
lines changed

tests/unit_tests/test_model_converter.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
from torchtitan.components.lora import LoRAConverter
1313
from torchtitan.components.quantization.float8 import Float8LinearConverter
14+
from torchtitan.components.quantization.qat import QATConverter
1415
from torchtitan.config import ConfigManager
1516
from torchtitan.distributed import ParallelDims
1617
from torchtitan.protocols.model_converter import ModelConvertersContainer
@@ -202,3 +203,88 @@ def test_lora_key_remap_roundtrip():
202203
assert set(rt_sd.keys()) == set(tt_sd.keys())
203204
for k in tt_sd:
204205
assert torch.equal(rt_sd[k], tt_sd[k])
206+
207+
208+
def test_qat_preserves_weight_dtype():
    """Converting must leave parameter dtypes alone — fake quantization lives in forward()."""
    pytest.importorskip("torchao")

    # Small two-layer MLP; 64 features so group_size=64 divides in_features evenly.
    layers = [
        ("fc1", nn.Linear(64, 64)),
        ("relu", nn.ReLU()),
        ("fc2", nn.Linear(64, 64)),
    ]
    model = nn.Sequential(OrderedDict(layers))
    dtypes_before = {n: p.dtype for n, p in model.named_parameters()}

    QATConverter(QATConverter.Config(group_size=64)).convert(model)

    for name, param in model.named_parameters():
        expected = dtypes_before[name]
        assert (
            param.dtype == expected
        ), f"'{name}' dtype changed from {expected} to {param.dtype}"
230+
231+
232+
@pytest.mark.parametrize(
    "scheme, group_size, expected_linear_cls",
    [
        ("int4_weight_only", 64, "FakeQuantizedLinear"),
        ("intx_weight_only", 64, "FakeQuantizedLinear"),
        ("int8_dynamic_act_intx_weight", 64, "FakeQuantizedLinear"),
        ("float8_dynamic_act_float8_weight", None, "FakeQuantizedLinear"),
        ("float8_dynamic_act_int4_weight", None, "FakeQuantizedLinear"),
        ("nvfp4", None, "NVFP4FakeQuantizedLinear"),
        ("mx", None, "MXFakeQuantizedLinear"),
    ],
)
def test_qat_all_schemes(scheme, group_size, expected_linear_cls):
    """Every supported scheme should swap nn.Linear for its fake-quantized counterpart."""
    pytest.importorskip("torchao")

    model = nn.Sequential(
        OrderedDict(
            [
                ("fc1", nn.Linear(64, 64)),
                ("relu", nn.ReLU()),
                ("fc2", nn.Linear(64, 64)),
            ]
        )
    )

    # group_size is omitted for schemes that don't take one, exercising the default.
    kwargs = {"scheme": scheme}
    if group_size is not None:
        kwargs["group_size"] = group_size
    QATConverter(QATConverter.Config(**kwargs)).convert(model)

    # Both Linear layers must now be the expected fake-quantized class.
    for layer in (model.fc1, model.fc2):
        got = type(layer).__name__
        assert (
            got == expected_linear_cls
        ), f"scheme={scheme}: expected {expected_linear_cls}, got {got}"
271+
272+
273+
def test_qat_unknown_scheme_raises():
    """An unrecognized scheme name should fail fast at construction time."""
    bad_config = QATConverter.Config(scheme="not_a_real_scheme")
    with pytest.raises(ValueError, match="Unknown QAT scheme"):
        QATConverter(bad_config)
277+
278+
279+
def test_qat_group_size_warning_for_unsupported_scheme(caplog):
    """Setting group_size for a scheme that ignores it should emit a warning."""
    pytest.importorskip("torchao")
    import logging

    # float8 schemes have no per-group granularity, so group_size is ignored.
    cfg = QATConverter.Config(
        scheme="float8_dynamic_act_float8_weight", group_size=64
    )
    with caplog.at_level(logging.WARNING):
        QATConverter(cfg)
    assert "does not use group_size" in caplog.text

torchtitan/components/lora.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,15 @@ class Config(Configurable.Config):
9898
"merged" folds adapters into base weights (base + alpha/rank * B @ A)
9999
and saves a standard checkpoint with no LoRA keys."""
100100

101+
adapter_qat_scheme: str = ""
102+
"""QAT scheme for adapter weights. Empty = no adapter QAT.
103+
Must match a supported QATConverter scheme."""
104+
105+
adapter_qat_group_size: int = 8
106+
"""Group size for adapter weight quantization.
107+
Must divide rank (i.e. rank % group_size == 0).
108+
Only used by schemes that support per-group granularity."""
109+
101110
def __init__(self, config: Config, **kwargs):
102111
self.rank = config.rank
103112
self.alpha = config.alpha
@@ -107,6 +116,33 @@ def __init__(self, config: Config, **kwargs):
107116
f"LoRA save_format must be 'dcp', 'peft', or 'merged', "
108117
f"got '{self.save_format}'"
109118
)
119+
120+
self.adapter_qat_scheme = config.adapter_qat_scheme
121+
self.adapter_qat_group_size = config.adapter_qat_group_size
122+
if self.adapter_qat_scheme:
123+
from torchtitan.components.quantization.qat import (
124+
_SCHEMES_WITH_GROUP_SIZE,
125+
_SUPPORTED_SCHEMES,
126+
)
127+
128+
if self.adapter_qat_scheme not in _SUPPORTED_SCHEMES:
129+
raise ValueError(
130+
f"Unknown adapter QAT scheme '{self.adapter_qat_scheme}'. "
131+
f"Supported: {_SUPPORTED_SCHEMES}"
132+
)
133+
if self.adapter_qat_scheme in _SCHEMES_WITH_GROUP_SIZE:
134+
if self.rank % self.adapter_qat_group_size != 0:
135+
raise ValueError(
136+
f"LoRA rank ({self.rank}) must be divisible by "
137+
f"adapter_qat_group_size ({self.adapter_qat_group_size})"
138+
)
139+
else:
140+
logger.warning(
141+
f"Adapter QAT scheme '{self.adapter_qat_scheme}' does not use "
142+
f"group_size, ignoring adapter_qat_group_size="
143+
f"{self.adapter_qat_group_size}"
144+
)
145+
110146
logger.info(f"LoRA training active with rank={self.rank}, alpha={self.alpha}")
111147

112148
@staticmethod
@@ -148,6 +184,9 @@ def convert(self, model: nn.Module) -> None:
148184
model.requires_grad_(False)
149185
self._replace_linears_with_lora(model)
150186

187+
if self.adapter_qat_scheme:
188+
self._apply_adapter_qat(model)
189+
151190
# Wire up checkpoint filtering so ModelWrapper knows which keys
152191
# are adapter keys and how to save them.
153192
model.converter_key_filter = self._is_lora_key # type: ignore[attr-defined]
@@ -160,6 +199,33 @@ def convert(self, model: nn.Module) -> None:
160199
if self.save_format == "merged":
161200
model.converter_export_sd_fn = self._make_merge_fn() # type: ignore[attr-defined]
162201

202+
def _apply_adapter_qat(self, model: nn.Module) -> None:
    """Insert fake quantization into the LoRA adapter linears only.

    torchao's prepare step wraps just the modules selected by the filter
    below; base weights are left untouched.
    """
    from torchao.quantization import quantize_
    from torchao.quantization.qat import QATConfig
    from torchao.quantization.qat.api import QATStep

    from torchtitan.components.quantization.qat import _build_base_config

    def adapters_only(mod: nn.Module, fqn: str) -> bool:
        # Select only Linear modules whose qualified name marks them as adapters.
        if not isinstance(mod, nn.Linear):
            return False
        return fqn.endswith(".lora_a") or fqn.endswith(".lora_b")

    prepare_cfg = QATConfig(
        _build_base_config(self.adapter_qat_scheme, self.adapter_qat_group_size),
        step=QATStep.PREPARE,
    )
    quantize_(model, prepare_cfg, filter_fn=adapters_only)
    logger.info(
        f"Applied adapter QAT fake quantization "
        f"(scheme={self.adapter_qat_scheme}, "
        f"group_size={self.adapter_qat_group_size})"
    )
228+
163229
def _replace_linears_with_lora(self, module: nn.Module) -> None:
164230
for _, child in list(module.named_modules()):
165231
if isinstance(child, nn.Linear):
Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
from dataclasses import dataclass
8+
9+
import torch.nn as nn
10+
from torchtitan.config import Configurable
11+
from torchtitan.tools.logging import logger
12+
13+
# Supported scheme names.
14+
_SUPPORTED_SCHEMES = (
15+
"int4_weight_only",
16+
"intx_weight_only",
17+
"int8_dynamic_act_intx_weight",
18+
"float8_dynamic_act_float8_weight",
19+
"float8_dynamic_act_int4_weight",
20+
"nvfp4",
21+
"mx",
22+
)
23+
24+
# Schemes that accept a group_size parameter.
25+
_SCHEMES_WITH_GROUP_SIZE = (
26+
"int4_weight_only",
27+
"intx_weight_only",
28+
"int8_dynamic_act_intx_weight",
29+
)
30+
31+
32+
def _build_base_config(scheme: str, group_size: int):
33+
"""Return a torchao PTQ base config for the given scheme name."""
34+
if scheme == "int4_weight_only":
35+
from torchao.quantization import Int4WeightOnlyConfig
36+
37+
return Int4WeightOnlyConfig(group_size=group_size)
38+
39+
elif scheme == "intx_weight_only":
40+
import torch
41+
from torchao.quantization import IntxWeightOnlyConfig
42+
from torchao.quantization.granularity import PerGroup
43+
44+
int4_dtype = torch.int4 # pyrefly: ignore[missing-attribute]
45+
return IntxWeightOnlyConfig(
46+
weight_dtype=int4_dtype,
47+
granularity=PerGroup(group_size),
48+
)
49+
50+
elif scheme == "int8_dynamic_act_intx_weight":
51+
import torch
52+
from torchao.quantization import Int8DynamicActivationIntxWeightConfig
53+
from torchao.quantization.granularity import PerGroup
54+
55+
int4_dtype = torch.int4 # pyrefly: ignore[missing-attribute]
56+
return Int8DynamicActivationIntxWeightConfig(
57+
weight_dtype=int4_dtype,
58+
weight_granularity=PerGroup(group_size),
59+
)
60+
61+
elif scheme == "float8_dynamic_act_float8_weight":
62+
from torchao.quantization import Float8DynamicActivationFloat8WeightConfig
63+
64+
return Float8DynamicActivationFloat8WeightConfig()
65+
66+
elif scheme == "float8_dynamic_act_int4_weight":
67+
from torchao.quantization import Float8DynamicActivationInt4WeightConfig
68+
69+
return Float8DynamicActivationInt4WeightConfig()
70+
71+
elif scheme == "nvfp4":
72+
from torchao.prototype.mx_formats import NVFP4DynamicActivationNVFP4WeightConfig
73+
74+
return NVFP4DynamicActivationNVFP4WeightConfig()
75+
76+
elif scheme == "mx":
77+
from torchao.prototype.mx_formats import MXDynamicActivationMXWeightConfig
78+
79+
return MXDynamicActivationMXWeightConfig()
80+
81+
else:
82+
raise ValueError(
83+
f"Unknown QAT scheme '{scheme}'. Supported: {_SUPPORTED_SCHEMES}"
84+
)
85+
86+
87+
class QATConverter(Configurable):
    """Model converter that enables quantization-aware training via torchao.

    ``convert`` calls ``torchao.quantize_(model, QATConfig(base_config,
    step="prepare"))``, which swaps each ``nn.Linear`` for a fake-quantized
    variant. The ``scheme`` config field picks the torchao PTQ base config;
    QATConfig derives the matching fake quantization for weights and
    activations from it.

    Supported schemes:
    - ``"int4_weight_only"`` — int4 weight-only fake quantization
    - ``"intx_weight_only"`` — intx weight-only fake quantization
    - ``"int8_dynamic_act_intx_weight"`` — int8 activation + int4 weight
    - ``"float8_dynamic_act_float8_weight"`` — float8 activation + float8 weight
    - ``"float8_dynamic_act_int4_weight"`` — float8 activation + int4 weight
    - ``"nvfp4"`` — NVFP4 dynamic activation + NVFP4 weight
    - ``"mx"`` — MX dynamic activation + MX weight

    When composed with LoRA (QATConverter listed before LoRAConverter in
    converters), LoRA will inherit from FakeQuantizedLinear so base weights
    are fake-quantized while LoRA adapters stay full-precision.
    """

    @dataclass(kw_only=True, slots=True)
    class Config(Configurable.Config):
        scheme: str = "int4_weight_only"
        """QAT scheme name. Maps to a torchao PTQ base config.
        Supported: 'int4_weight_only', 'intx_weight_only',
        'int8_dynamic_act_intx_weight', 'float8_dynamic_act_float8_weight',
        'float8_dynamic_act_int4_weight', 'nvfp4', 'mx'."""

        group_size: int = 256
        """Group size for per-group weight quantization.
        Used by schemes that support per-group granularity
        (int4_weight_only, intx_weight_only, int8_dynamic_act_intx_weight).
        Must divide in_features of all Linear layers in the model."""

    def __init__(self, config: Config, **kwargs):
        # Validate the scheme before recording anything.
        if config.scheme not in _SUPPORTED_SCHEMES:
            raise ValueError(
                f"Unknown QAT scheme '{config.scheme}'. "
                f"Supported: {_SUPPORTED_SCHEMES}"
            )
        if config.scheme not in _SCHEMES_WITH_GROUP_SIZE:
            # group_size is meaningless for this scheme; tell the user it is dropped.
            logger.warning(
                f"QAT scheme '{config.scheme}' does not use group_size, "
                f"ignoring group_size={config.group_size}"
            )
        self.scheme = config.scheme
        self.group_size = config.group_size
        logger.info(
            f"QAT training active (scheme={self.scheme}, group_size={self.group_size})"
        )

    def convert(self, model: nn.Module) -> None:
        """Insert fake-quantization wrappers into the model's Linear modules in place."""
        # torchao is imported lazily so the converter is only a hard dependency
        # when QAT is actually enabled.
        from torchao.quantization import quantize_
        from torchao.quantization.qat import QATConfig
        from torchao.quantization.qat.api import QATStep

        qat_config = QATConfig(
            _build_base_config(self.scheme, self.group_size),
            step=QATStep.PREPARE,
        )
        quantize_(model, qat_config)
        logger.info(
            f"Applied QAT fake quantization (scheme={self.scheme}, "
            f"group_size={self.group_size})"
        )

    def post_optimizer_hook(self, model: nn.Module | list[nn.Module]) -> None:
        """No-op: QAT needs no per-step parameter post-processing."""
        pass

torchtitan/models/llama3/config_registry.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
OptimizersInBackwardContainer,
1414
)
1515
from torchtitan.components.quantization.float8 import Float8LinearConverter
16+
from torchtitan.components.quantization.qat import QATConverter
1617
from torchtitan.components.validate import Validator
1718
from torchtitan.config import (
1819
ActivationCheckpointConfig,
@@ -130,6 +131,32 @@ def llama3_debugmodel_lora() -> Trainer.Config:
130131
return config
131132

132133

134+
def llama3_debugmodel_qat() -> Trainer.Config:
    """Debug-model recipe with QAT fake quantization enabled (default scheme)."""
    config = llama3_debugmodel()
    config.model_converters = ModelConvertersContainer.Config(
        converters=[QATConverter.Config()],
    )
    return config
142+
143+
144+
def llama3_debugmodel_qat_lora() -> Trainer.Config:
    """Debug-model recipe combining QAT base weights with LoRA adapters."""
    config = llama3_debugmodel()
    # QATConverter must come before LoRAConverter so that LoRA inherits from
    # FakeQuantizedLinear, giving fake-quantized base weights + full-precision
    # adapters.
    qat_cfg = QATConverter.Config()
    lora_cfg = LoRAConverter.Config(rank=8, alpha=16.0)
    config.model_converters = ModelConvertersContainer.Config(
        converters=[qat_cfg, lora_cfg],
    )
    return config
158+
159+
133160
def llama3_8b() -> Trainer.Config:
134161
return Trainer.Config(
135162
hf_assets_path="./assets/hf/Llama-3.1-8B",

0 commit comments

Comments
 (0)