
Commit 11f1a76

Clean up QAT API surface + add separate API ref (#2567)
This commit does a few things:

1. Make AffineFakeQuantizedTensor and associated functions private. These are not meant to be exposed to users yet.
2. Expose some commonly used APIs at the top level (e.g. FakeQuantizer).
3. Deprecate some QAT APIs.
4. Add a separate API ref to better categorize QAT APIs.

As of this commit, all APIs under `torchao.quantization.qat` should be either public and documented, deprecated, or private.

To preview the docs: https://docs-preview.pytorch.org/pytorch/ao/2567/api_ref_qat.html
1 parent 460aaed commit 11f1a76
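In practical terms, the import surface after this commit looks like the sketch below. The module paths and names are taken from the diffs that follow; the snippet only illustrates which names are public and which are now underscore-private:

```python
# Public: commonly used QAT classes are now re-exported at the top level.
from torchao.quantization.qat import (
    FakeQuantizeConfig,
    FakeQuantizedEmbedding,
    FakeQuantizedLinear,
    FakeQuantizer,
)

# Private: renamed with a leading underscore, not meant for users yet.
from torchao.quantization.qat.affine_fake_quantized_tensor import (
    _AffineFakeQuantizedTensor,
    _to_affine_fake_quantized,
)
```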

File tree: 12 files changed (+118, −101 lines)

docs/source/api_ref_qat.rst

Lines changed: 58 additions & 0 deletions

@@ -0,0 +1,58 @@
+.. _api_qat:
+
+========================
+torchao.quantization.qat
+========================
+
+.. currentmodule:: torchao.quantization.qat
+
+QAT Configs for quantize_
+---------------------------------------
+For a full example of how to use QAT with our main `quantize_` API,
+please refer to the `QAT README <https://github.com/pytorch/ao/blob/main/torchao/quantization/qat/README.md#quantize_-api-recommended>`__.
+
+.. autosummary::
+    :toctree: generated/
+    :nosignatures:
+
+    IntXQuantizationAwareTrainingConfig
+    FromIntXQuantizationAwareTrainingConfig
+
+Custom QAT APIs
+---------------
+.. autosummary::
+    :toctree: generated/
+    :nosignatures:
+
+    FakeQuantizeConfig
+    FakeQuantizedLinear
+    FakeQuantizedEmbedding
+    FakeQuantizer
+    linear.enable_linear_fake_quant
+    linear.disable_linear_fake_quant
+
+Legacy QAT Quantizers
+---------------------
+
+.. autosummary::
+    :toctree: generated/
+    :nosignatures:
+
+    Int4WeightOnlyQATQuantizer
+    linear.Int4WeightOnlyQATLinear
+    Int8DynActInt4WeightQATQuantizer
+    linear.Int8DynActInt4WeightQATLinear
+    Int4WeightOnlyEmbeddingQATQuantizer
+    embedding.Int4WeightOnlyQATEmbedding
+    embedding.Int4WeightOnlyEmbedding
+    Float8ActInt4WeightQATQuantizer
+    ComposableQATQuantizer
+
+Prototype
+---------
+
+.. autosummary::
+    :toctree: generated/
+    :nosignatures:
+
+    initialize_fake_quantizers
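For orientation alongside the new doc page, here is a condensed sketch of the `quantize_` flow these configs plug into, modeled on the QAT README linked in the file above. The toy model and the config values are illustrative, not prescriptive:

```python
import torch

from torchao.quantization import quantize_
from torchao.quantization.qat import (
    FakeQuantizeConfig,
    FromIntXQuantizationAwareTrainingConfig,
    IntXQuantizationAwareTrainingConfig,
)

model = torch.nn.Sequential(torch.nn.Linear(256, 128))  # toy model

# Prepare: insert fake quantization ops (int8 per-token asymmetric
# activations, int4 per-group symmetric weights, as in the README example).
activation_config = FakeQuantizeConfig(torch.int8, "per_token", is_symmetric=False)
weight_config = FakeQuantizeConfig(torch.int4, group_size=32)
quantize_(model, IntXQuantizationAwareTrainingConfig(activation_config, weight_config))

# ... train the model with fake quantization in the loop ...

# Convert: strip the fake quantization ops again, so a real post-training
# quantization config can be applied afterwards.
quantize_(model, FromIntXQuantizationAwareTrainingConfig())
```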

docs/source/api_ref_quantization.rst

Lines changed: 0 additions & 18 deletions

@@ -34,24 +34,6 @@ Inference APIs for quantize\_
     UIntXWeightOnlyConfig
     FPXWeightOnlyConfig

-.. currentmodule:: torchao.quantization.qat
-
-QAT APIs
-----------------------
-
-.. autosummary::
-    :toctree: generated/
-    :nosignatures:
-
-    IntXQuantizationAwareTrainingConfig
-    FromIntXQuantizationAwareTrainingConfig
-    FakeQuantizeConfig
-    Int4WeightOnlyQATQuantizer
-    Int8DynActInt4WeightQATQuantizer
-    Int4WeightOnlyEmbeddingQATQuantizer
-    ComposableQATQuantizer
-    initialize_fake_quantizers
-
 .. currentmodule:: torchao.quantization

 Quantization Primitives

docs/source/index.rst

Lines changed: 1 addition & 0 deletions

@@ -31,6 +31,7 @@ for an overall introduction to the library and recent highlight and updates.

    api_ref_dtypes
    api_ref_quantization
+   api_ref_qat
    api_ref_sparsity
    api_ref_float8
test/quantization/test_qat.py

Lines changed: 2 additions & 2 deletions

@@ -1223,8 +1223,8 @@ def test_qat_prototype_bc(self):
             Int8DynActInt4WeightQATQuantizerModuleSwap,
         )
         from torchao.quantization.prototype.qat.affine_fake_quantized_tensor import (  # noqa: F401, F811
-            AffineFakeQuantizedTensor,
-            to_affine_fake_quantized,
+            _AffineFakeQuantizedTensor,
+            _to_affine_fake_quantized,
         )
         from torchao.quantization.prototype.qat.api import (  # noqa: F401, F811
             ComposableQATQuantizer,

torchao/prototype/quantization/autoquant_v2.py

Lines changed: 2 additions & 2 deletions

@@ -74,7 +74,7 @@
 def _is_linear(mod, *args):
     # avoid circular dependencies
     from torchao.quantization.qat.affine_fake_quantized_tensor import (
-        AffineFakeQuantizedTensor,
+        _AffineFakeQuantizedTensor,
     )

     # adding weight tensor subclass isinstance check to make sure the weight is only quantized once
@@ -86,7 +86,7 @@ def _is_linear(mod, *args):
         and not isinstance(mod.weight, AutoQuantizableLinearWeightV1)
         and not isinstance(mod.weight, AffineQuantizedTensor)
         and not isinstance(mod.weight, LinearActivationQuantizedTensor)
-        and not isinstance(mod.weight, AffineFakeQuantizedTensor)
+        and not isinstance(mod.weight, _AffineFakeQuantizedTensor)
         and not isinstance(mod, torch.nn.modules.linear.NonDynamicallyQuantizableLinear)
     )

Lines changed: 4 additions & 4 deletions

@@ -1,9 +1,9 @@
 from torchao.quantization.qat.affine_fake_quantized_tensor import (
-    AffineFakeQuantizedTensor,
-    to_affine_fake_quantized,
+    _AffineFakeQuantizedTensor,
+    _to_affine_fake_quantized,
 )

 __all__ = [
-    "AffineFakeQuantizedTensor",
-    "to_affine_fake_quantized",
+    "_AffineFakeQuantizedTensor",
+    "_to_affine_fake_quantized",
 ]

torchao/quantization/qat/__init__.py

Lines changed: 6 additions & 0 deletions

@@ -8,9 +8,12 @@
     intx_quantization_aware_training,
 )
 from .embedding import (
+    FakeQuantizedEmbedding,
     Int4WeightOnlyEmbeddingQATQuantizer,
 )
+from .fake_quantizer import FakeQuantizer
 from .linear import (
+    FakeQuantizedLinear,
     Float8ActInt4WeightQATQuantizer,
     Int4WeightOnlyQATQuantizer,
     Int8DynActInt4WeightQATQuantizer,
@@ -19,6 +22,9 @@
 __all__ = [
     "ComposableQATQuantizer",
     "FakeQuantizeConfig",
+    "FakeQuantizedLinear",
+    "FakeQuantizedEmbedding",
+    "FakeQuantizer",
     "Float8ActInt4WeightQATQuantizer",
     "FromIntXQuantizationAwareTrainingConfig",
     "Int4WeightOnlyEmbeddingQATQuantizer",

torchao/quantization/qat/affine_fake_quantized_tensor.py

Lines changed: 18 additions & 22 deletions

@@ -20,16 +20,12 @@
 )
 from torchao.utils import TorchAOBaseTensor

-from .utils import (
-    _UnwrapAffineFakeQuantizedTensor,
-)
-
 aten = torch.ops.aten


 class _ToAffineFakeQuantized(torch.autograd.Function):
     """
-    Differentiable constructor for `AffineFakeQuantizedTensor`,
+    Differentiable constructor for `_AffineFakeQuantizedTensor`,
     needed for input activation fake quantization.
     """

@@ -47,12 +43,12 @@ def forward(
         zero_point_dtype: Optional[torch.dtype] = None,
         preserve_zero: bool = True,
         zero_point_domain: ZeroPointDomain = ZeroPointDomain.INT,
-    ) -> "AffineFakeQuantizedTensor":
+    ) -> "_AffineFakeQuantizedTensor":
         if zero_point_domain is None:
             raise ValueError("Please use ZeroPointDomain.NONE instead of None")

         def apply_fake_quant_fn(t: torch.Tensor):
-            assert isinstance(t, AffineFakeQuantizedTensor)
+            assert isinstance(t, _AffineFakeQuantizedTensor)
             qmin, qmax = _get_and_check_qmin_qmax(target_dtype, quant_min, quant_max)
             if zero_point_domain == ZeroPointDomain.FLOAT and not preserve_zero:
                 scale, zero_point = _choose_qparams_affine_tinygemm(
@@ -102,7 +98,7 @@ def apply_fake_quant_fn(t: torch.Tensor):
             )
             return fq

-        return AffineFakeQuantizedTensor(
+        return _AffineFakeQuantizedTensor(
             original_tensor,
             apply_fake_quant_fn,
             fake_quant_enabled=True,
@@ -113,7 +109,7 @@ def backward(ctx, gy):
         return gy, None, None, None, None, None, None, None, None, None, None


-class AffineFakeQuantizedTensor(TorchAOBaseTensor):
+class _AffineFakeQuantizedTensor(TorchAOBaseTensor):
     """
     Affine fake quantized tensor subclass. Affine quantization means we quantize the floating point tensor
     with an affine transformation:
@@ -212,7 +208,7 @@ def get_value(self) -> torch.Tensor:
         if self.fake_quant_enabled:
             return self.apply_fake_quant_fn(self)
         else:
-            return _UnwrapAffineFakeQuantizedTensor.apply(self)
+            return self.original_tensor

     def _get_to_kwargs(self, *args, **kwargs):
         device, dtype, _, memory_format = torch._C._nn._parse_to(*args, **kwargs)
@@ -243,14 +239,14 @@ def to(self, *args, **kwargs):

     def _apply_fn_to_data(self, fn: Callable):
         """
-        Create a new `AffineFakeQuantizedTensor` with `fn` applied to the
+        Create a new `_AffineFakeQuantizedTensor` with `fn` applied to the
         original tensor, to be called within __torch_dispatch__.
         """
         return self._create_new(fn(self.original_tensor))

     def _create_new(self, new_value: torch.Tensor):
         """
-        Create a new `AffineFakeQuantizedTensor` with a new value,
+        Create a new `_AffineFakeQuantizedTensor` with a new value,
         to be called within __torch_dispatch__.

         Note: `requires_grad` must be False here because tensors created
@@ -267,7 +263,7 @@ def _create_new(self, new_value: torch.Tensor):
         )


-implements = AffineFakeQuantizedTensor.implements
+implements = _AffineFakeQuantizedTensor.implements


 @implements(torch.nn.functional.linear)
@@ -277,9 +273,9 @@ def _(func, types, args, kwargs):
         args[1],
         args[2] if len(args) > 2 else None,
     )
-    if isinstance(input_tensor, AffineFakeQuantizedTensor):
+    if isinstance(input_tensor, _AffineFakeQuantizedTensor):
         input_tensor = input_tensor.get_value()
-    if isinstance(weight_tensor, AffineFakeQuantizedTensor):
+    if isinstance(weight_tensor, _AffineFakeQuantizedTensor):
         weight_tensor = weight_tensor.get_value()
     return torch.nn.functional.linear(input_tensor, weight_tensor, bias)

@@ -288,9 +284,9 @@
 def _(func, types, args, kwargs):
     input_tensor = args[0]
     weight_tensor = args[1]
-    if isinstance(input_tensor, AffineFakeQuantizedTensor):
+    if isinstance(input_tensor, _AffineFakeQuantizedTensor):
         input_tensor = input_tensor.get_value()
-    if isinstance(weight_tensor, AffineFakeQuantizedTensor):
+    if isinstance(weight_tensor, _AffineFakeQuantizedTensor):
         weight_tensor = weight_tensor.get_value()
     return func(input_tensor, weight_tensor)

@@ -300,9 +296,9 @@ def _(func, types, args, kwargs):
     bias = args[0]
     input_tensor = args[1]
     weight_tensor = args[2]
-    if isinstance(input_tensor, AffineFakeQuantizedTensor):
+    if isinstance(input_tensor, _AffineFakeQuantizedTensor):
         input_tensor = input_tensor.get_value()
-    if isinstance(weight_tensor, AffineFakeQuantizedTensor):
+    if isinstance(weight_tensor, _AffineFakeQuantizedTensor):
         weight_tensor = weight_tensor.get_value()
     return func(bias, input_tensor, weight_tensor)

@@ -348,10 +344,10 @@ def _(func, types, args, kwargs):
 def _(func, types, args, kwargs):
     assert len(args) == 2, f"dispatched the wrong op to the binary handler: {func}"
     new_args = pytree.tree_map_only(
-        AffineFakeQuantizedTensor, lambda x: x.original_tensor, args
+        _AffineFakeQuantizedTensor, lambda x: x.original_tensor, args
     )
     first_afq_tensor = (
-        args[0] if isinstance(args[0], AffineFakeQuantizedTensor) else args[1]
+        args[0] if isinstance(args[0], _AffineFakeQuantizedTensor) else args[1]
     )
     new_value = func(*new_args, **kwargs)
     out = first_afq_tensor._create_new(new_value)

@@ -384,4 +380,4 @@ def _(func, types, args, kwargs):
     return return_and_correct_aliasing(func, args, kwargs, out)


-to_affine_fake_quantized = AffineFakeQuantizedTensor.from_float
+_to_affine_fake_quantized = _AffineFakeQuantizedTensor.from_float
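Besides the renames, the one behavioral edit in this file is `get_value`: with fake quantization disabled, the wrapped high-precision tensor is now returned directly rather than routed through the removed `_UnwrapAffineFakeQuantizedTensor` autograd function. A sketch of the two resulting paths (illustrative only, mirroring the diff above):

```python
# Illustrative only: how the dispatch handlers above resolve the subclass.
def resolve_value(afq_tensor):
    if afq_tensor.fake_quant_enabled:
        # Run the stored fake quantization function (quantize + dequantize).
        return afq_tensor.apply_fake_quant_fn(afq_tensor)
    # New path after this commit: hand back the wrapped tensor as-is.
    return afq_tensor.original_tensor
```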

torchao/quantization/qat/api.py

Lines changed: 2 additions & 2 deletions

@@ -34,7 +34,7 @@ class FakeQuantizeConfig:
     """
     Config for how to fake quantize weights or activations.

-    args:
+    Args:
         dtype: dtype to simulate during fake quantization, e.g. torch.int8.
             For PyTorch versions older than 2.6, you may use `TorchAODType` to represent
             torch.int1 to torch.int7 instead, e.g. TorchAODType.INT4.
@@ -54,7 +54,7 @@ class FakeQuantizeConfig:
         range_learning (prototype): whether to learn scale and zero points during training
             (default false), not compatible with `is_dynamic`.

-    kwargs (optional):
+    Keyword args:
         group_size: size of each group in per group fake quantization,
             can be set instead of `granularity`
         is_symmetric: whether to use symmetric or asymmetric quantization,
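Per the corrected docstring, the keyword arguments are shorthands for explicit granularity and mapping settings. A small sketch, assuming the `PerGroup` granularity class from `torchao.quantization.granularity`:

```python
import torch

from torchao.quantization.granularity import PerGroup
from torchao.quantization.qat import FakeQuantizeConfig

# Explicit granularity object...
config_a = FakeQuantizeConfig(torch.int4, PerGroup(32))

# ...or the equivalent `group_size` keyword shorthand from the docstring.
config_b = FakeQuantizeConfig(torch.int4, group_size=32)
```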
