Skip to content

Commit 0f5cb10

Browse files
committed
Add QAT (quantization-aware training) model converter
ghstack-source-id: 0cc296d Pull Request resolved: #2488
1 parent 695a256 commit 0f5cb10

File tree

4 files changed

+273
-6
lines changed

4 files changed

+273
-6
lines changed

tests/unit_tests/test_model_converter.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
from torchtitan.components.lora import LoRAConverter
1313
from torchtitan.components.quantization.float8 import Float8LinearConverter
14+
from torchtitan.components.quantization.qat import QATConverter
1415
from torchtitan.config import ConfigManager
1516
from torchtitan.distributed import ParallelDims
1617
from torchtitan.protocols.model_converter import ModelConvertersContainer
@@ -214,3 +215,88 @@ def test_qlora_base_weights_quantized_adapters_full_precision():
214215
assert (
215216
layer.lora_b.weight.dtype == torch.float32
216217
), f"{name}.lora_b.weight should be float32"
218+
219+
220+
def test_qat_preserves_weight_dtype():
    """QAT converter should not change weight dtype (fake quantization happens in forward)."""
    pytest.importorskip("torchao")

    layers = OrderedDict()
    layers["fc1"] = nn.Linear(64, 64)
    layers["relu"] = nn.ReLU()
    layers["fc2"] = nn.Linear(64, 64)
    model = nn.Sequential(layers)
    dtypes_before = {n: p.dtype for n, p in model.named_parameters()}

    qat = QATConverter(QATConverter.Config(group_size=64))
    qat.convert(model)

    # Fake quantization is applied in forward; parameter storage stays untouched.
    for n, p in model.named_parameters():
        assert (
            p.dtype == dtypes_before[n]
        ), f"'{n}' dtype changed from {dtypes_before[n]} to {p.dtype}"
242+
243+
244+
@pytest.mark.parametrize(
    "scheme, group_size, expected_linear_cls",
    [
        ("int4_weight_only", 64, "FakeQuantizedLinear"),
        ("intx_weight_only", 64, "FakeQuantizedLinear"),
        ("int8_dynamic_act_intx_weight", 64, "FakeQuantizedLinear"),
        ("float8_dynamic_act_float8_weight", None, "FakeQuantizedLinear"),
        ("float8_dynamic_act_int4_weight", None, "FakeQuantizedLinear"),
        ("nvfp4", None, "NVFP4FakeQuantizedLinear"),
        ("mx", None, "MXFakeQuantizedLinear"),
    ],
)
def test_qat_all_schemes(scheme, group_size, expected_linear_cls):
    """Each QAT scheme should replace nn.Linear with the correct fake-quantized class."""
    pytest.importorskip("torchao")

    model = nn.Sequential(
        OrderedDict(
            [
                ("fc1", nn.Linear(64, 64)),
                ("relu", nn.ReLU()),
                ("fc2", nn.Linear(64, 64)),
            ]
        )
    )

    # group_size is only passed for schemes that accept it.
    cfg_kwargs = {"scheme": scheme}
    if group_size is not None:
        cfg_kwargs["group_size"] = group_size
    QATConverter(QATConverter.Config(**cfg_kwargs)).convert(model)

    # Both linear layers should have been swapped for the expected class.
    for layer in (model.fc1, model.fc2):
        got = type(layer).__name__
        assert (
            got == expected_linear_cls
        ), f"scheme={scheme}: expected {expected_linear_cls}, got {got}"
283+
284+
285+
def test_qat_unknown_scheme_raises():
    """QATConverter should raise ValueError for unknown schemes."""
    bad_config = QATConverter.Config(scheme="not_a_real_scheme")
    with pytest.raises(ValueError, match="Unknown QAT scheme"):
        QATConverter(bad_config)
289+
290+
291+
def test_qat_group_size_warning_for_unsupported_scheme(caplog):
    """QATConverter should warn when group_size is set for a scheme that ignores it."""
    pytest.importorskip("torchao")
    import logging

    cfg = QATConverter.Config(
        scheme="float8_dynamic_act_float8_weight", group_size=64
    )
    with caplog.at_level(logging.WARNING):
        QATConverter(cfg)
    assert "does not use group_size" in caplog.text
Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
from dataclasses import dataclass
8+
9+
import torch.nn as nn
10+
from torchtitan.config import Configurable
11+
from torchtitan.tools.logging import logger
12+
13+
# Supported scheme names.
14+
_SUPPORTED_SCHEMES = (
15+
"int4_weight_only",
16+
"intx_weight_only",
17+
"int8_dynamic_act_intx_weight",
18+
"float8_dynamic_act_float8_weight",
19+
"float8_dynamic_act_int4_weight",
20+
"nvfp4",
21+
"mx",
22+
)
23+
24+
# Schemes that accept a group_size parameter.
25+
_SCHEMES_WITH_GROUP_SIZE = (
26+
"int4_weight_only",
27+
"intx_weight_only",
28+
"int8_dynamic_act_intx_weight",
29+
)
30+
31+
32+
def _build_base_config(scheme: str, group_size: int):
33+
"""Return a torchao PTQ base config for the given scheme name."""
34+
if scheme == "int4_weight_only":
35+
from torchao.quantization import Int4WeightOnlyConfig
36+
37+
return Int4WeightOnlyConfig(group_size=group_size)
38+
39+
elif scheme == "intx_weight_only":
40+
import torch
41+
from torchao.quantization import IntxWeightOnlyConfig
42+
from torchao.quantization.granularity import PerGroup
43+
44+
int4_dtype = torch.int4 # pyrefly: ignore[missing-attribute]
45+
return IntxWeightOnlyConfig(
46+
weight_dtype=int4_dtype,
47+
granularity=PerGroup(group_size),
48+
)
49+
50+
elif scheme == "int8_dynamic_act_intx_weight":
51+
import torch
52+
from torchao.quantization import Int8DynamicActivationIntxWeightConfig
53+
from torchao.quantization.granularity import PerGroup
54+
55+
int4_dtype = torch.int4 # pyrefly: ignore[missing-attribute]
56+
return Int8DynamicActivationIntxWeightConfig(
57+
weight_dtype=int4_dtype,
58+
weight_granularity=PerGroup(group_size),
59+
)
60+
61+
elif scheme == "float8_dynamic_act_float8_weight":
62+
from torchao.quantization import Float8DynamicActivationFloat8WeightConfig
63+
64+
return Float8DynamicActivationFloat8WeightConfig()
65+
66+
elif scheme == "float8_dynamic_act_int4_weight":
67+
from torchao.quantization import Float8DynamicActivationInt4WeightConfig
68+
69+
return Float8DynamicActivationInt4WeightConfig()
70+
71+
elif scheme == "nvfp4":
72+
from torchao.prototype.mx_formats import NVFP4DynamicActivationNVFP4WeightConfig
73+
74+
return NVFP4DynamicActivationNVFP4WeightConfig()
75+
76+
elif scheme == "mx":
77+
from torchao.prototype.mx_formats import MXDynamicActivationMXWeightConfig
78+
79+
return MXDynamicActivationMXWeightConfig()
80+
81+
else:
82+
raise ValueError(
83+
f"Unknown QAT scheme '{scheme}'. Supported: {_SUPPORTED_SCHEMES}"
84+
)
85+
86+
87+
class QATConverter(Configurable):
    """Apply quantization-aware training via torchao's QATConfig.

    Uses ``torchao.quantize_(model, QATConfig(base_config, step="prepare"))``
    to insert fake quantization into ``nn.Linear`` modules. The ``scheme``
    config field selects a torchao PTQ base config, which QATConfig uses to
    infer the appropriate fake quantization for both weights and activations.

    Supported schemes:
    - ``"int4_weight_only"`` — int4 weight-only fake quantization
    - ``"intx_weight_only"`` — intx weight-only fake quantization
    - ``"int8_dynamic_act_intx_weight"`` — int8 activation + int4 weight
    - ``"float8_dynamic_act_float8_weight"`` — float8 activation + float8 weight
    - ``"float8_dynamic_act_int4_weight"`` — float8 activation + int4 weight
    - ``"nvfp4"`` — NVFP4 dynamic activation + NVFP4 weight
    - ``"mx"`` — MX dynamic activation + MX weight

    When composed with LoRA (QATConverter listed before LoRAConverter in converters),
    LoRA will inherit from FakeQuantizedLinear so base weights are fake-quantized
    while LoRA adapters stay full-precision.
    """

    @dataclass(kw_only=True, slots=True)
    class Config(Configurable.Config):
        scheme: str = "int4_weight_only"
        """QAT scheme name. Maps to a torchao PTQ base config.
        Supported: 'int4_weight_only', 'intx_weight_only',
        'int8_dynamic_act_intx_weight', 'float8_dynamic_act_float8_weight',
        'float8_dynamic_act_int4_weight', 'nvfp4', 'mx'."""

        group_size: int = 256
        """Group size for per-group weight quantization.
        Used by schemes that support per-group granularity
        (int4_weight_only, intx_weight_only, int8_dynamic_act_intx_weight).
        Must divide in_features of all Linear layers in the model."""

    def __init__(self, config: Config, **kwargs):
        # Validate the scheme eagerly so a misconfiguration fails at
        # construction time, before any model conversion happens. This check
        # does not import torchao, so it works even when torchao is absent.
        if config.scheme not in _SUPPORTED_SCHEMES:
            raise ValueError(
                f"Unknown QAT scheme '{config.scheme}'. "
                f"Supported: {_SUPPORTED_SCHEMES}"
            )
        self.scheme = config.scheme
        self.group_size = config.group_size
        # NOTE(review): this warning fires whenever the scheme ignores
        # group_size, even if group_size was left at its default (256) and
        # never explicitly set by the user — confirm this is intended.
        if config.scheme not in _SCHEMES_WITH_GROUP_SIZE:
            logger.warning(
                f"QAT scheme '{config.scheme}' does not use group_size, "
                f"ignoring group_size={config.group_size}"
            )
        logger.info(
            f"QAT training active (scheme={self.scheme}, group_size={self.group_size})"
        )

    def convert(self, model: nn.Module) -> None:
        """Replace nn.Linear modules in *model* (in place) with torchao
        fake-quantized equivalents for the configured scheme.

        torchao imports are deferred to keep it an optional dependency:
        only converters that are actually used require it at runtime.
        """
        from torchao.quantization import quantize_
        from torchao.quantization.qat import QATConfig
        from torchao.quantization.qat.api import QATStep

        base_config = _build_base_config(self.scheme, self.group_size)
        # "prepare" step inserts fake quantization for training; the
        # corresponding "convert" step (real quantization) is not done here.
        quantize_(model, QATConfig(base_config, step=QATStep.PREPARE))
        logger.info(
            f"Applied QAT fake quantization (scheme={self.scheme}, "
            f"group_size={self.group_size})"
        )

    def post_optimizer_hook(self, model: nn.Module | list[nn.Module]) -> None:
        # Intentional no-op: fake quantization needs no per-step maintenance
        # after the optimizer step (unlike e.g. float8 amax recomputation).
        pass

torchtitan/models/llama3/config_registry.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
OptimizersInBackwardContainer,
1414
)
1515
from torchtitan.components.quantization.float8 import Float8LinearConverter
16+
from torchtitan.components.quantization.qat import QATConverter
1617
from torchtitan.components.validate import Validator
1718
from torchtitan.config import (
1819
ActivationCheckpointConfig,
@@ -144,6 +145,32 @@ def llama3_debugmodel_qlora() -> Trainer.Config:
144145
return config
145146

146147

148+
def llama3_debugmodel_qat() -> Trainer.Config:
    """Debug-model trainer config with the QAT converter enabled (defaults)."""
    cfg = llama3_debugmodel()
    cfg.model_converters = ModelConvertersContainer.Config(
        converters=[QATConverter.Config()],
    )
    return cfg
156+
157+
158+
def llama3_debugmodel_qat_lora() -> Trainer.Config:
    """Debug-model trainer config composing QAT with LoRA adapters."""
    cfg = llama3_debugmodel()
    # QATConverter must come before LoRAConverter so that LoRA inherits from
    # FakeQuantizedLinear, giving fake-quantized base weights + full-precision adapters.
    qat = QATConverter.Config()
    lora = LoRAConverter.Config(rank=8, alpha=16.0)
    cfg.model_converters = ModelConvertersContainer.Config(
        converters=[qat, lora],
    )
    return cfg
172+
173+
147174
def llama3_8b() -> Trainer.Config:
148175
return Trainer.Config(
149176
hf_assets_path="./assets/hf/Llama-3.1-8B",

torchtitan/protocols/model_converter.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -87,20 +87,21 @@ def post_optimizer_hook(self, model: nn.Module | list[nn.Module]):
8787
def _validate_converter_ordering(converters: list[Configurable.Config]):
    """Validates that converters are in the correct order.

    LoRA must come after quantization and QAT because both replace nn.Linear
    with specialized subclasses (e.g. Float8Linear, FakeQuantizedLinear), and
    LoRA dynamically inherits from whatever linear class it wraps.
    """
    # Local imports avoid a circular dependency between the protocol module
    # and the converter implementations.
    from torchtitan.components.lora import LoRAConverter
    from torchtitan.components.quantization.qat import QATConverter

    seen_lora = False
    for cfg in converters:
        if isinstance(cfg, LoRAConverter.Config):
            seen_lora = True
            continue
        is_linear_swapping = isinstance(
            cfg, (QuantizationConverter.Config, QATConverter.Config)
        )
        if is_linear_swapping and seen_lora:
            raise ValueError(
                "LoRA converter must come after quantization and QAT converters. "
                "Quantization/QAT replaces nn.Linear with specialized subclasses, "
                "and LoRA must wrap the final linear class."
            )
106107

0 commit comments

Comments
 (0)