Commit 344d201

delete outdated MX inference code (#2615)
* Update [ghstack-poisoned]
* Update [ghstack-poisoned]
1 parent bf5bd5f commit 344d201

5 files changed (+1, -190 lines)

test/prototype/mx_formats/test_mx_linear.py

Lines changed: 0 additions & 71 deletions
@@ -14,17 +14,14 @@
 from torchao.prototype.mx_formats.config import (
     MXFP8Dim1CastKernelChoice,
     MXGemmKernelChoice,
-    MXInferenceLinearConfig,
     MXLinearConfig,
     MXLinearRecipeName,
 )
 from torchao.prototype.mx_formats.constants import (
     DTYPE_FP6_E2M3,
     DTYPE_FP6_E3M2,
-    SUPPORTED_ELEM_DTYPES,
 )
 from torchao.prototype.mx_formats.mx_linear import (
-    MXInferenceLinear,
     MXLinear,
 )
 from torchao.prototype.mx_formats.mx_subclass import (
@@ -313,77 +310,18 @@ def test_linear_compile(hp_dtype, recipe_name, bias, mxfp8_cast_kernel_choice):
     torch.testing.assert_close(x_g_ref, x_g, atol=0.02, rtol=0.02)
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
-@pytest.mark.parametrize("elem_dtype", SUPPORTED_ELEM_DTYPES)
-@pytest.mark.parametrize("bias", [True, False])
-@pytest.mark.parametrize("input_shape", [(2, 4), (1, 2, 4), (1, 1, 2, 4)])
-def test_inference_linear(elem_dtype, bias, input_shape):
-    """
-    Smoke test for inference linear module with mx weight
-    """
-    m = nn.Sequential(nn.Linear(4, 8, bias=bias, dtype=torch.bfloat16))
-    m = m.cuda()
-    m_mx = copy.deepcopy(m)
-    config = MXInferenceLinearConfig(block_size=4, elem_dtype=elem_dtype)
-    quantize_(m_mx, config=config)
-
-    x = torch.randn(*input_shape, device="cuda", dtype=torch.bfloat16)
-    y_ref = m(x)
-    y_mx = m_mx(x)
-    sqnr = compute_error(y_ref, y_mx)
-    if elem_dtype is torch.float8_e4m3fn:
-        assert sqnr >= 20.0
-    else:
-        assert sqnr >= 11.0
-
-
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
-@pytest.mark.skipif(
-    not TORCH_VERSION_AT_LEAST_2_8, reason="torch.compile requires PyTorch 2.8+"
-)
-@pytest.mark.parametrize("elem_dtype", SUPPORTED_ELEM_DTYPES)
-def test_inference_compile_simple(elem_dtype):
-    """
-    Smoke test for inference compile
-    """
-    if elem_dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
-        if not is_sm_at_least_89():
-            pytest.skip("CUDA capability >= 8.9 required for float8 in triton")
-    m = nn.Sequential(nn.Linear(4, 8, bias=False, dtype=torch.bfloat16))
-    m = m.cuda()
-    m_mx = copy.deepcopy(m)
-    config = MXInferenceLinearConfig(block_size=4, elem_dtype=elem_dtype)
-    quantize_(m_mx, config=config)
-    m_mx = torch.compile(m_mx, fullgraph="true")
-
-    x = torch.randn(2, 4, device="cuda", dtype=torch.bfloat16)
-    y_ref = m(x)
-    y_mx = m_mx(x)
-    sqnr = compute_error(y_ref, y_mx)
-    if elem_dtype is torch.float8_e4m3fn:
-        assert sqnr >= 20.0
-    else:
-        assert sqnr >= 11.5
-
-
 def test_filter_fn():
     m1 = nn.Sequential(
         nn.Linear(32, 32),
         nn.Linear(32, 32),
     )
-    m2 = copy.deepcopy(m1)
     filter_fn = lambda mod, fqn: isinstance(mod, torch.nn.Linear) and fqn != "1"  # noqa: E731
 
     config = MXLinearConfig(block_size=32)
     quantize_(m1, config=config, filter_fn=filter_fn)
     assert type(m1[0]) == MXLinear
     assert type(m1[1]) == torch.nn.Linear
 
-    config2 = MXInferenceLinearConfig(block_size=32)
-    quantize_(m2, config=config2, filter_fn=filter_fn)  # noqa: E501
-    assert type(m2[0]) == MXInferenceLinear
-    assert type(m2[1]) == torch.nn.Linear
-
 
 def test_training_print_str():
     m = nn.Sequential(nn.Linear(32, 32))
@@ -394,15 +332,6 @@ def test_training_print_str():
     assert "kernel=emulated" in s
 
 
-def test_inference_print_str():
-    m = nn.Sequential(nn.Linear(32, 32))
-    config = MXInferenceLinearConfig()
-    quantize_(m, config=config)
-    s = str(m)
-    assert "bl_sz=32" in s
-    assert "kernel=emulated" in s
-
-
 test_dtypes = (
     [torch.float8_e4m3fn, torch.float4_e2m1fn_x2]
     if TORCH_VERSION_AT_LEAST_2_8
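
The deleted inference tests gate on SQNR between the reference bfloat16 output and the MX output (the `assert sqnr >= ...` lines above). As a reminder of what those thresholds measure, here is a minimal sketch of an SQNR helper in the spirit of torchao's `compute_error`; the exact implementation in torchao may differ:

```python
import torch


def sqnr_db(ref: torch.Tensor, actual: torch.Tensor) -> torch.Tensor:
    # Signal-to-quantization-noise ratio in dB: higher means the quantized
    # output tracks the high-precision reference more closely.
    noise = ref - actual
    return 20 * torch.log10(torch.linalg.norm(ref) / torch.linalg.norm(noise))


# e.g. an SQNR of 20 dB corresponds to a relative error norm of about 10%.
```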

torchao/prototype/mx_formats/README.md

Lines changed: 1 addition & 17 deletions
@@ -45,24 +45,8 @@ quantize_(m, config)
 
 ## MX inference
 
-Note: currently only weight-only quantization is supported.
-
-```python
-import torch
-from torchao.quantization import quantize_
-from torchao.prototype.mx_formats import MXInferenceLinearConfig, MXGemmKernelChoice
-
-m = torch.nn.Sequential(torch.nn.Linear(32, 32)).cuda()
-gemm_kernel_choice = MXGemmKernelChoice.CUBLAS
-config = MXInferenceLinearConfig(
-    elem_dtype=torch.float8_e4m3fn,
-    block_size=32,
-    gemm_kernel_choice=gemm_kernel_choice,
-)
-quantize_(m, config=config)
+Coming soon!
 
-# do inference (not shown)
-```
 ## MXTensor
 
 This is casts between high precision and MX formats implemented in native PyTorch. Currently
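
The snippet removed from the README above relied on `MXInferenceLinearConfig`, which this commit deletes. The `__init__.py` hunk below shows that `MXFPInferenceConfig` stays exported as the inference entry point, so a rough stand-in for the deleted example could look like the following sketch, which assumes the config's defaults are usable on the target GPU (its fields are not shown in this diff):

```python
import torch
from torchao.quantization import quantize_
from torchao.prototype.mx_formats import MXFPInferenceConfig

m = torch.nn.Sequential(torch.nn.Linear(32, 32)).cuda().bfloat16()
# Construct with defaults only, since this diff does not show the config's fields.
quantize_(m, config=MXFPInferenceConfig())

with torch.no_grad():
    y = m(torch.randn(2, 32, device="cuda", dtype=torch.bfloat16))
```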

torchao/prototype/mx_formats/__init__.py

Lines changed: 0 additions & 2 deletions
@@ -1,6 +1,5 @@
 from torchao.prototype.mx_formats.config import (
     MXGemmKernelChoice,
-    MXInferenceLinearConfig,
     MXLinearConfig,
     MXLinearRecipeName,
 )
@@ -18,7 +17,6 @@
 
 __all__ = [
     "MXGemmKernelChoice",
-    "MXInferenceLinearConfig",
     "MXLinearConfig",
     "MXLinearRecipeName",
     "MXFPInferenceConfig",

torchao/prototype/mx_formats/config.py

Lines changed: 0 additions & 45 deletions
@@ -12,8 +12,6 @@
 
 from torchao.core.config import AOBaseConfig
 from torchao.prototype.mx_formats.constants import (
-    DTYPE_FP6_E2M3,
-    DTYPE_FP6_E3M2,
     DTYPE_TO_SHORT_STR,
     SUPPORTED_ELEM_DTYPES,
 )
@@ -163,46 +161,3 @@ def short_str(self) -> str:
         if self.use_fp4_custom_triton_dequant_kernel:
             s += ", use_fp4_custom_triton_dequant_kernel=True"
         return s
-
-
-@dataclass
-class MXInferenceLinearConfig(AOBaseConfig):
-    # block size for scaling, default is 32 to match
-    # https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf,
-    # section 5.2
-    block_size: int = 32
-
-    # element dtype, used for activations, weights and gradients
-    elem_dtype: Any = torch.float8_e4m3fn
-    # TODO(future PR): support different elem_dtype for activations vs weights
-
-    # defines the gemm kernel choice, if the chosen kernel is not supported
-    # on the given hardware an exception will be thrown
-    gemm_kernel_choice: MXGemmKernelChoice = MXGemmKernelChoice.EMULATED
-
-    # If True, uses a custom triton kernel for fp4 dequantize
-    use_fp4_custom_triton_dequant_kernel: bool = False
-
-    # If True, packs 4xFP6 into 3xuint8 containers for inference, using custom triton
-    # kernels (fused unpack/dequantize).
-    pack_fp6: bool = True
-
-    def __post_init__(self):
-        _validate_elem_dtype(self.elem_dtype)
-        _validate_gemm_kernel_choice(
-            self.gemm_kernel_choice, self.block_size, self.elem_dtype
-        )
-
-    def short_str(self) -> str:
-        """
-        Returns a concise representation of the current config.
-        """
-        s = f"bl_sz={self.block_size}, lp_dtype={DTYPE_TO_SHORT_STR[self.elem_dtype]}"
-        s += f", kernel={self.gemm_kernel_choice.value}"
-        if self.use_fp4_custom_triton_dequant_kernel:
-            s += ", use_fp4_custom_triton_dequant_kernel=True"
-        if self.elem_dtype in (DTYPE_FP6_E2M3, DTYPE_FP6_E3M2) and self.pack_fp6:
-            s += ", pack_fp6=True"
-        return s
-
-# TODO(future PR): add a recipe to config API for inference

torchao/prototype/mx_formats/mx_linear.py

Lines changed: 0 additions & 55 deletions
@@ -11,13 +11,11 @@
 from typing import Any, Optional
 
 import torch
-import torch.nn.functional as F
 from torch.distributed._tensor import DTensor
 
 from torchao.prototype.mx_formats.config import (
     MXFP8Dim1CastKernelChoice,
     MXGemmKernelChoice,
-    MXInferenceLinearConfig,
     MXLinearConfig,
 )
 from torchao.prototype.mx_formats.kernels import (
@@ -270,59 +268,6 @@ def extra_repr(self):
         return s
 
 
-class MXInferenceLinear(torch.nn.Linear):
-    """
-    Inference version of MXLinear, with the weight pre-quantized to MX.
-
-    Note: this is weight-only quantization, with the gemm being executed
-    in high precision.
-    """
-
-    @classmethod
-    @torch.no_grad()
-    def from_float(
-        cls,
-        mod,
-        config: Optional[MXInferenceLinearConfig] = MXInferenceLinearConfig(),
-    ):
-        with torch.device("meta"):
-            super_kwargs = {
-                "in_features": mod.in_features,
-                "out_features": mod.out_features,
-                "bias": False,
-            }
-            new_mod = cls(**super_kwargs)
-        # TODO(future PR): set to new_mod.weight directly, will need to work
-        # through some errors
-        new_mod.weight_mx = MXTensor.to_mx(
-            mod.weight,
-            config.elem_dtype,
-            block_size=config.block_size,
-            gemm_kernel_choice=config.gemm_kernel_choice,
-            pack_fp6=config.pack_fp6,
-        )
-        new_mod.bias = mod.bias
-        new_mod.config = config
-        return new_mod
-
-    @torch.no_grad()
-    def forward(self, x):
-        w_hp = self.weight_mx.to_dtype(x.dtype)
-        y = F.linear(x, w_hp, self.bias)
-        return y
-
-    def extra_repr(self):
-        s = f"{super().extra_repr()}, {self.config.short_str()}"
-        return s
-
-
 @register_quantize_module_handler(MXLinearConfig)
 def _mx_linear_transform(module: torch.nn.Module, config: MXLinearConfig):
     return MXLinear.from_float(module, config=config)
-
-
-@register_quantize_module_handler(MXInferenceLinearConfig)
-def _mx_inference_linear_transform(
-    module: torch.nn.Module, config: MXInferenceLinearConfig
-):
-    return MXInferenceLinear.from_float(module, config=config)
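
For reference, the deleted `MXInferenceLinear.forward` is dequantize-then-matmul: the MX weight is expanded back to the activation dtype and a regular high-precision gemm runs. A standalone sketch of the same numerics, using only the `MXTensor` methods visible in the removed code (`to_mx` and `to_dtype`) and hypothetical toy shapes, is:

```python
import torch
import torch.nn.functional as F

from torchao.prototype.mx_formats.mx_tensor import MXTensor

# Hypothetical shapes; block_size must divide the reduction dimension (32 here).
w = torch.randn(8, 32, device="cuda", dtype=torch.bfloat16)
x = torch.randn(2, 32, device="cuda", dtype=torch.bfloat16)

# Quantize the weight to MX fp8, then dequantize and run the gemm in bfloat16,
# mirroring what MXInferenceLinear.from_float + forward did.
w_mx = MXTensor.to_mx(w, torch.float8_e4m3fn, block_size=32)
y = F.linear(x, w_mx.to_dtype(x.dtype))
```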
