Commit 520c78f

fix mxfp exporting (#806)
* enable act quantization for mxfp datatype
  Signed-off-by: Zhang, Weiwei1 <[email protected]>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  (for more information, see https://pre-commit.ci)
* add ut, fix typo
  Signed-off-by: Zhang, Weiwei1 <[email protected]>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  (for more information, see https://pre-commit.ci)
* fix typo
  Signed-off-by: Zhang, Weiwei1 <[email protected]>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  (for more information, see https://pre-commit.ci)

Signed-off-by: Zhang, Weiwei1 <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 9534d2a commit 520c78f
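
For reference, a minimal end-to-end sketch of what this commit enables, based on the updated unit tests below (test_mxfp4_llmcompressor_format): quantizing a model with an MXFP scheme and exporting the activation-quantized result to the llm_compressor format. The calibration dataset and layer_config arguments used in the tests are omitted here and assumed to fall back to the library defaults; treat this as an illustrative sketch, not documented API.

    # Sketch derived from test/test_cpu/test_export.py in this commit; relying on
    # the default dataset/layer_config is an assumption.
    from transformers import AutoModelForCausalLM, AutoTokenizer

    from auto_round import AutoRound

    model_name = "facebook/opt-125m"
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # "MXFP4" selects 4-bit mx_fp weights and activations; after this commit mx_fp
    # activation quantization passes _check_configs and _check_supported_format.
    autoround = AutoRound(model, tokenizer, scheme="MXFP4", iters=2, seqlen=2)
    autoround.quantize()

    # Exporting mx_fp activation-quantized models to llm_compressor is now allowed;
    # exporting to auto_round only warns that loading is not yet supported.
    autoround.save_quantized(output_dir="./saved", inplace=True, format="llm_compressor")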

File tree

8 files changed: +172 / -97 lines


auto_round/autoround.py

Lines changed: 28 additions & 15 deletions
@@ -686,7 +686,11 @@ def _check_configs(self) -> None:
         if self.gradient_accumulate_steps <= 0:
             raise ValueError("`gradient_accumulate_steps` must be positive")

-        if self.act_bits <= 8 and (not is_nv_fp(self.act_data_type) or "static_gs" not in self.act_data_type):
+        if (
+            self.act_bits <= 8
+            and (not is_nv_fp(self.act_data_type) or "static_gs" not in self.act_data_type)
+            and not is_mx_fp(self.act_data_type)
+        ):
             logger.warning(
                 "activation quantization is an experimental feature with limited support and a complex API. "
                 "And please save the quantized model to fake format as real deployment is not supported currently"
@@ -897,25 +901,22 @@ def _check_supported_format(self, format: str) -> bool:
         # format check for fp8
         w_fp8 = self.data_type == "fp" and self.bits == 8
         act_fp8 = self.act_data_type == "fp" and self.act_bits == 8
-        if (w_fp8 or act_fp8) and re.search("^auto_round|^llmcompressor", format) is None:
+        if (w_fp8 or act_fp8) and re.search("^auto_round|^llm_compressor", format) is None:
             error_msg = (
-                f"is only supported to export auto_round or llmcompressor format," f" but got {format}, please check."
+                f"is only supported to export auto_round or llm_compressor format," f" but got {format}, please check."
             )
             error_msg = ("act_data_type<fp8> " + error_msg) if act_fp8 else error_msg
             error_msg = ("data_type<fp8> " + error_msg) if w_fp8 else error_msg
             logger.error(error_msg)
             sys.exit(-1)

-        # Only support to export afp8/nv_fp
+        # Only support to export afp8/nv_fp/mx_fp
         if self.act_bits <= 8:
             if not is_standard_fp(self.act_data_type) or self.act_dynamic:
                 if "llm_compressor" in format:
-                    if is_nv_fp(self.act_data_type) and "static_gs" in self.act_data_type:
-                        logger.warning(
-                            f"AutoRound supports exporting to format '{format}', "
-                            "but loading quantized models in this format is not yet supported. "
-                            "It is currently recommended to export to the 'llm_compressor' format."
-                        )
+                    if (is_nv_fp(self.act_data_type) and "static_gs" in self.act_data_type) or (
+                        is_mx_fp(self.act_data_type)
+                    ):
                         return format
                     bits, group_size, sym, act_bits = 8, -1, True, 8
                     assert (
@@ -925,15 +926,23 @@ def _check_supported_format(self, format: str) -> bool:
                         and self.act_bits == act_bits
                         and self.act_dynamic
                     ), (
-                        f"Currently only support to export llmcompressor format for sym dynamic quantized"
+                        f"Currently only support to export llm_compressor format for sym dynamic quantized"
                         f" W{self.bits}A{self.act_bits} model with group_size={group_size},"
                         f" but got bits={self.bits}, group_size={self.group_size}, sym={self.sym},"
                         f" act_bits={self.act_bits}"
                     )
-                elif format != "fake" and (not is_nv_fp(format) or "static_gs" not in self.act_data_type):
+                elif "auto_round" in format and (
+                    is_mx_fp(self.act_data_type) or (is_nv_fp(format) and "static_gs" in self.act_data_type)
+                ):
+                    logger.warning(
+                        f"AutoRound supports exporting to format '{format}', "
+                        "but loading quantized models in this format is not yet supported. "
+                        "It is currently recommended to export to the 'llm_compressor' format."
+                    )
+                elif format != "fake":
                     logger.warning(
                         "Currently only support to export auto_round format quantized model"
-                        " with fp8 or nv_fp4 dtype activation for activation quantization."
+                        " with fp8, mx_fp and nv_fp4 dtype activation for activation quantization."
                         " Change format to fake and save."
                     )
                     format = "fake"
@@ -2159,7 +2168,7 @@ def calib(self, nsamples, bs):
             exit(-1)
         elif total_cnt < nsamples:
             logger.warning(
-                f"An insufficient number of samples likely reduces the accuracy of the quantized model."
+                f"An insufficient number of samples likely reduces the accuracy of the quantized model. "
                 f"Target samples count is {nsamples}, while valid samples count is {total_cnt}"
             )

@@ -2838,7 +2847,11 @@ def _quantize_block(
         with torch.no_grad():
             unwrapper_block(block, best_params)

-        if is_nv_fp(self.act_data_type) and any("nv_fp" in format_ for format_ in self.formats):
+        if (
+            is_nv_fp(self.act_data_type)
+            and hasattr(self, "formats")
+            and any("nv_fp" in format_ for format_ in self.formats)
+        ):
             # enable moe experts act_max automatic generation for WrapperWALayer
             set_amax_for_all_moe_layers(block, attr_name="orig_layer.act_max")

auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py

Lines changed: 13 additions & 8 deletions
@@ -166,16 +166,21 @@ def save_quantized_as_fp(output_dir, inplace=True, **kwargs):
     for n, m in model.named_modules():
         if isinstance(m, WrapperWALayer):
             orig_layer = m.orig_layer
-            if not getattr(orig_layer, "input_global_scale", None):
-                assert hasattr(orig_layer, "act_max")
-                from auto_round.data_type.nvfp import calculate_gparam
-
-                input_global_scale = calculate_gparam(orig_layer.act_max, orig_layer.group_size, model.device)
-                setattr(orig_layer, "input_global_scale", input_global_scale)
-                delattr(orig_layer, "act_max")
             set_module(model, n, orig_layer)

-    # update input_global_scale
+    if is_nv_fp(act_data_type) and "static_gs" in str(act_data_type).lower():
+        # generate static input_global_scale
+        for n, m in model.named_modules():
+            if isinstance(m, SUPPORTED_LAYER_TYPES):
+                layer = m
+                if layer.act_bits < 8 and not getattr(layer, "input_global_scale", None):
+                    assert hasattr(layer, "act_max")
+                    from auto_round.data_type.nvfp import calculate_gparam
+
+                    input_global_scale = calculate_gparam(layer.act_max, layer.group_size, model.device)
+                    setattr(layer, "input_global_scale", input_global_scale)
+                    delattr(layer, "act_max")
+    # update fused input_global_scale
     from auto_round.data_type.utils import update_fused_layer_global_scales

     modules = list(model.modules())

auto_round/export/export_to_autoround/qlinear_fp.py

Lines changed: 0 additions & 12 deletions
@@ -95,18 +95,6 @@ def __init__(
             weight_name,
             torch.zeros((infeatures // 32 * self.bits, outfeatures), dtype=torch.int32),
         )
-        if not self.sym:
-            ## TODO Currently only sym quant is supported for mxfp dtype. Is weight_zero_point param necessary?
-            self.register_buffer(
-                "weight_zero_point",
-                torch.zeros(
-                    (
-                        math.ceil(infeatures / self.group_size),
-                        outfeatures // 32 * self.bits,
-                    ),
-                    dtype=torch.int32,
-                ),
-            )
         self.register_buffer(
             "weight_scale",
             torch.zeros(

auto_round/export/export_to_llmcompressor/export_to_fp.py

Lines changed: 13 additions & 8 deletions
@@ -161,16 +161,21 @@ def save_quantized_as_fp(output_dir, inplace=True, **kwargs):
     for n, m in model.named_modules():
         if isinstance(m, WrapperWALayer):
             orig_layer = m.orig_layer
-            if not getattr(orig_layer, "input_global_scale", None):
-                assert hasattr(orig_layer, "act_max")
-                from auto_round.data_type.nvfp import calculate_gparam
-
-                input_global_scale = calculate_gparam(orig_layer.act_max, orig_layer.group_size, model.device)
-                setattr(orig_layer, "input_global_scale", input_global_scale)
-                delattr(orig_layer, "act_max")
             set_module(model, n, orig_layer)

-    # update input_global_scale
+    if is_nv_fp(act_data_type) and "static_gs" in str(act_data_type).lower():
+        # generate static input_global_scale
+        for n, m in model.named_modules():
+            if isinstance(m, SUPPORTED_LAYER_TYPES):
+                layer = m
+                if layer.act_bits < 8 and not getattr(layer, "input_global_scale", None):
+                    assert hasattr(layer, "act_max")
+                    from auto_round.data_type.nvfp import calculate_gparam
+
+                    input_global_scale = calculate_gparam(layer.act_max, layer.group_size, model.device)
+                    setattr(layer, "input_global_scale", input_global_scale)
+                    delattr(layer, "act_max")
+    # update fused input_global_scale
     from auto_round.data_type.utils import update_fused_layer_global_scales

     modules = list(model.modules())

auto_round/utils.py

Lines changed: 3 additions & 0 deletions
@@ -2538,14 +2538,17 @@ class BackendDataType(str, Enum):


 def is_standard_fp(backend):
+    backend = backend.lower()
     return BackendDataType.STANDARD_FP in backend and not is_mx_fp(backend) and not is_nv_fp(backend)


 def is_mx_fp(backend):
+    backend = backend.lower()
     return BackendDataType.MX_FP in backend


 def is_nv_fp(backend):
+    backend = backend.lower()
     return BackendDataType.NV_FP in backend
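
The lower-casing added above makes the backend data-type checks case-insensitive. A rough standalone approximation of the three helpers follows; the substrings "fp", "mx_fp", and "nv_fp" are assumed here to be the values behind BackendDataType.STANDARD_FP, MX_FP, and NV_FP, so check the enum in auto_round/utils.py for the authoritative values.

    # Approximation only; the enum values are assumptions, not copied from the repo.
    def is_mx_fp(backend: str) -> bool:
        return "mx_fp" in backend.lower()

    def is_nv_fp(backend: str) -> bool:
        return "nv_fp" in backend.lower()

    def is_standard_fp(backend: str) -> bool:
        backend = backend.lower()
        return "fp" in backend and not is_mx_fp(backend) and not is_nv_fp(backend)

    # With .lower() applied first, upper- and mixed-case data-type names match too:
    assert is_mx_fp("MX_FP4") and is_mx_fp("mx_fp_rceil")
    assert is_nv_fp("NV_FP4_WITH_STATIC_GS")
    assert is_standard_fp("fp8") and not is_standard_fp("mx_fp8")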

test/test_cpu/test_export.py

Lines changed: 80 additions & 41 deletions
@@ -1,3 +1,4 @@
+import os
 import shutil
 import sys
 import unittest
@@ -11,6 +12,17 @@
 from auto_round import AutoRound


+def _get_folder_size(path: str) -> float:
+    """Return folder size in GB."""
+    total_size = 0
+    for dirpath, _, filenames in os.walk(path):
+        for f in filenames:
+            fp = os.path.join(dirpath, f)
+            if os.path.isfile(fp):
+                total_size += os.path.getsize(fp)
+    return total_size / (1024**3)  # convert to GB
+
+
 class LLMDataLoader:
     def __init__(self):
         self.batch_size = 1
@@ -25,7 +37,7 @@ class TestAutoRound(unittest.TestCase):
     def setUpClass(self):
         model_name = "facebook/opt-125m"
         self.save_dir = "./saved"
-        self.model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True)
+        self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
         self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
         self.llm_dataloader = LLMDataLoader()
@@ -268,10 +280,7 @@ def test_mxfp4_llmcompressor_format(self):
         model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
         from transformers import AutoConfig

-        bits = 4
-        data_type = "mx_fp"
-        group_size = 32
-        sym = True
+        scheme = "MXFP4"
         layer_config = {}
         fp_layers_str = "k_proj"
         from auto_round.utils import get_fp_layer_names
@@ -282,12 +291,58 @@ def test_mxfp4_llmcompressor_format(self):
         autoround = AutoRound(
             model,
             self.tokenizer,
-            bits=bits,
-            group_size=group_size,
-            sym=sym,
+            scheme=scheme,
             iters=2,
             seqlen=2,
-            data_type=data_type,
+            layer_config=layer_config,
+            dataset=self.llm_dataloader,
+        )
+        quantized_model_path = self.save_dir
+        autoround.quantize()
+        compressed_model = autoround.save_quantized(
+            output_dir=quantized_model_path, inplace=True, format="llm_compressor"
+        )
+        tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj
+        skip_layer = compressed_model.model.decoder.layers[3].self_attn.k_proj
+        assert (
+            hasattr(tmp_layer, "weight_scale")
+            and hasattr(tmp_layer, "weight_packed")
+            and tmp_layer.weight_scale.dtype is torch.uint8
+            and tmp_layer.weight_scale.shape[0] == 768
+        ), "Illegal MXFP4 packing name or data_type or shape"
+        assert not hasattr(skip_layer, "weight_scale") and not hasattr(  ## check skipped layers
+            skip_layer, "weight_packed"
+        ), "Illegal MXFP4 quantization for fp_layers"
+        quantization_config = AutoConfig.from_pretrained(
+            quantized_model_path, trust_remote_code=True
+        ).quantization_config
+        assert (
+            quantization_config["format"] == "float-quantized"
+            and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True
+            and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 4
+        ), f"Invalid MXFP4 quantization configuration: {quantization_config}"
+
+        shutil.rmtree("./saved", ignore_errors=True)
+
+    def test_rtn_mxfp4_llmcompressor_format(self):
+        model_name = "facebook/opt-125m"
+        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
+        from transformers import AutoConfig
+
+        scheme = "MXFP4"
+        layer_config = {}
+        fp_layers_str = "k_proj"
+        from auto_round.utils import get_fp_layer_names
+
+        not_quantize_layer_names = get_fp_layer_names(model, fp_layers_str)
+        for name in not_quantize_layer_names:
+            layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float"}
+        autoround = AutoRound(
+            model,
+            self.tokenizer,
+            scheme=scheme,
+            iters=0,
+            seqlen=2,
             layer_config=layer_config,
             dataset=self.llm_dataloader,
         )
@@ -322,19 +377,13 @@ def test_mxfp8_llmcompressor_format(self):
         model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
         from transformers import AutoConfig

-        bits = 8
-        data_type = "mx_fp_rceil"
-        group_size = 32
-        sym = True
+        scheme = "MXFP8"
         autoround = AutoRound(
             model,
             self.tokenizer,
-            bits=bits,
-            group_size=group_size,
-            sym=sym,
+            scheme=scheme,
             iters=2,
             seqlen=2,
-            data_type=data_type,
             dataset=self.llm_dataloader,
         )
         quantized_model_path = self.save_dir
@@ -355,28 +404,23 @@ def test_mxfp8_llmcompressor_format(self):
             and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True
             and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 8
         ), f"Invalid MXFP8 quantization configuration: {quantization_config}"
+        folder_size_gb = _get_folder_size(quantized_model_path)
+        # Original opt-125m is < 0.5GB -> quantized mxfp8 model should be smaller but not empty
+        assert (
+            0.15 < folder_size_gb < 0.2
+        ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.2 GB)"
         shutil.rmtree("./saved", ignore_errors=True)

     def test_nvfp4_llmcompressor_format(self):
         model_name = "facebook/opt-125m"
         model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
         from transformers import AutoConfig

-        bits = 4
-        act_bits = 4
-        data_type = "nv_fp"
-        act_data_type = "nv_fp4_with_static_gs"
-        group_size = 16
-        sym = True
+        scheme = "NVFP4"
         autoround = AutoRound(
             model,
             self.tokenizer,
-            bits=bits,
-            act_bits=act_bits,
-            data_type=data_type,
-            act_data_type=act_data_type,
-            group_size=group_size,
-            sym=sym,
+            scheme=scheme,
             iters=2,
             seqlen=2,
             dataset=self.llm_dataloader,
@@ -399,28 +443,23 @@ def test_nvfp4_llmcompressor_format(self):
             quantization_config["format"] == "nvfp4-pack-quantized"
             and quantization_config["config_groups"]["group_0"]["input_activations"]["num_bits"] == 4
         ), f"Invalid NVFP4 quantization configuration: {quantization_config}"
+        folder_size_gb = _get_folder_size(quantized_model_path)
+        # Original opt-125m is < 0.5GB -> quantized nvfp4 model should be smaller but not empty
+        assert (
+            0.1 < folder_size_gb < 0.15
+        ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.15 GB)"
         shutil.rmtree("./saved", ignore_errors=True)

     def test_nvfp4_autoround_format(self):
         model_name = "facebook/opt-125m"
         model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
         from transformers import AutoConfig

-        bits = 4
-        act_bits = 4
-        data_type = "nv_fp"
-        act_data_type = "nv_fp4_with_static_gs"
-        group_size = 16
-        sym = True
+        scheme = "NVFP4"
         autoround = AutoRound(
             model,
             self.tokenizer,
-            bits=bits,
-            act_bits=act_bits,
-            data_type=data_type,
-            act_data_type=act_data_type,
-            group_size=group_size,
-            sym=sym,
+            scheme="NVFP4",
             iters=2,
             seqlen=2,
             dataset=self.llm_dataloader,
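
As a sanity check on the folder-size bounds asserted in the new tests, a rough back-of-envelope lands inside both ranges. The numbers below are assumptions, not values from the repo: roughly 85M quantizable linear-layer parameters and roughly 40M embedding parameters kept in 16-bit for opt-125m, one scale byte per 32-element group, and safetensors overhead ignored.

    # Back-of-envelope only; the 85M/40M parameter split and scale layout are assumptions.
    linear_params = 85e6   # opt-125m transformer linear weights (assumed quantized)
    embed_params = 40e6    # token + positional embeddings (assumed kept in bf16)
    group_size = 32        # mx block size: one shared scale per 32 values

    def estimated_size_gb(bits_per_weight: float) -> float:
        packed = linear_params * bits_per_weight / 8   # packed quantized weights
        scales = linear_params / group_size            # ~1 byte per group scale
        kept_bf16 = embed_params * 2                   # 2 bytes per bf16 parameter
        return (packed + scales + kept_bf16) / (1024**3)

    print(f"MXFP8 estimate: {estimated_size_gb(8):.2f} GB")        # ~0.16 GB, inside (0.15, 0.2)
    print(f"MXFP4/NVFP4 estimate: {estimated_size_gb(4):.2f} GB")  # ~0.12 GB, inside (0.1, 0.15)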
