Commit c4a1479

Support mxfp nvfp lmhead quant (#1051)
* fp8 exporting bugfix
* refine exllama backend cuda UT
* add lm_head layer act_max hook, enable mxfp/nvfp lm_head export
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)
* fixtypo
* fixtypo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
* fix ut typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
* refine logs, fix pack_layer for awq&gptq
* refine log, fix pack_layer for awq&gptq
* [pre-commit.ci] auto fixes from pre-commit.com hooks
* add awq&gptq lm_head UT
* [pre-commit.ci] auto fixes from pre-commit.com hooks
* fix local path

Signed-off-by: Zhang, Weiwei1 <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 7907c32 commit c4a1479
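
For orientation: this commit lets lm_head be quantized and packed like any other layer (the early returns for "lm_head" in the various pack_layer functions are removed, and an act_max hook is now registered for layers outside the transformer blocks). A minimal usage sketch, mirroring the tests added in test/test_cpu/test_export.py; the model path and export format below are placeholders, not taken from this commit:

    from auto_round import AutoRound

    # Opt the lm_head layer into quantization via layer_config.
    autoround = AutoRound(
        model="facebook/opt-125m",  # placeholder model
        bits=4,
        group_size=128,
        sym=True,
        iters=2,
        seqlen=2,
        layer_config={"lm_head": {"bits": 4}},
    )
    # "auto_gptq"/"auto_awq" are exercised by the new tests; mxfp/nvfp targets
    # also export lm_head now that pack_layer no longer skips it.
    autoround.quantize_and_save(output_dir="./saved", format="auto_gptq")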

File tree: 8 files changed (+298, -90 lines)

auto_round/compressors/base.py

Lines changed: 91 additions & 52 deletions

@@ -509,7 +509,7 @@ def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None:
         if len(tmp_devices) > 1:
             logger.warning(
                 f"there are multiple device types in the device_map, "
-                f"please make sure they are correct,use the first device {tmp_devices[0]} as the core device "
+                f"please make sure they are correct,use the first device {tmp_devices[0]} as the core device."
             )

         self.device = tmp_devices[0]
@@ -526,7 +526,7 @@ def _parse_and_set(scheme, kwargs):
             if "bits" not in kwargs:
                 data_type = kwargs["data_type"]
                 raise KeyError(
-                    f"please set bits when setting data_type={data_type}, or using scheme as an alternative.."
+                    f"please set bits when setting data_type={data_type}, or using scheme as an alternative."
                 )
             bits = kwargs["bits"]
             scheme = f"gguf:q{bits}_k" if bits == 6 else f"gguf:q{bits}_k_s"
@@ -1469,8 +1469,8 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str])
             raise ValueError("Could not find any blocks. Check the model or quant_block_list.")

         all_first_block_names = [block[0] for block in all_blocks]
-        if self.act_bits < 16 and not self.act_dynamic:
-            layer_names = self._get_quantized_layer_names_outside_blocks()
+        layer_names = self._get_quantized_layer_names_outside_blocks()
+        if self.act_bits < 16 and (not self.act_dynamic or len(layer_names) > 0):
             if len(layer_names) > 0:
                 logger.warning(
                     "quantize layers outside blocks for static activation quantizaiton"
@@ -1783,6 +1783,21 @@ def _quantize_layers(self, layer_names: list, layer_inputs: dict) -> None:

         for layer_name in copy.deepcopy(layer_names):
             if layer_name not in layer_inputs:
+                if self.act_bits < 16 and not self.act_dynamic:
+                    # Activation quantization requires collected inputs
+                    msg_prefix = (
+                        f"Activation max hook for layer '{layer_name}' is unavailable due to "
+                        f"insufficient collected inputs. "
+                    )
+                    if "fp8_e5m2" in self.act_data_type:
+                        logger.warning(msg_prefix + "Please notes that unit scale is used for this layer.")
+                    else:
+                        logger.warning(
+                            msg_prefix + "Static activation quantization is not supported or ineffective, "
+                            "Skipping quantization for this layer."
+                        )
+                        layer_names.remove(layer_name)
+                        continue
                 logger.info(f"using rtn to quantize {layer_name}")
                 from auto_round.data_type import QUANT_FUNC_WITH_DTYPE

@@ -1813,6 +1828,7 @@ def _quantize_layers(self, layer_names: list, layer_inputs: dict) -> None:
         q_layer_inputs = None
         enable_quanted_input = self.enable_quanted_input
         has_gguf = False
+
         if hasattr(self, "formats"):
             has_gguf = any("gguf" in format_ for format_ in self.formats)
         if has_gguf and self.immediate_packing:
@@ -2334,6 +2350,64 @@ def _replace_forward(self):
             hook_handle = m.register_forward_hook(hook_func)
             self.hook_handles.append(hook_handle)

+    def _register_act_max_hook(self, model):
+        def get_act_max_hook(module, input, output):
+            if isinstance(input, (tuple, list)):
+                input = input[0]
+            if input.numel() == 0:
+                return  # as no needs for act_max update
+            input, _, _ = reshape_pad_tensor_by_group_size(input, self.act_group_size)
+            act_max = torch.max(torch.abs(input), dim=-1).values
+            if not hasattr(module, "act_max") or module.act_max.numel() == 0:
+                module.act_max = act_max
+            else:
+                act_max = act_max.to(module.act_max.device)
+                if is_nv_fp(self.act_data_type):  ## for nvfp per-tensor input_global_scale calculation usage
+                    module.act_max = torch.max(
+                        torch.tensor([act_max.max(), module.act_max.max()], device=act_max.device)
+                    )
+                else:
+                    module.act_max = torch.max(act_max, module.act_max)
+
+        hook_handles = []
+        # for single layers out of blocks, like lm_head
+        if isinstance(model, SUPPORTED_LAYER_TYPES):
+            m = model
+            if (
+                hasattr(m, "act_dynamic")
+                and check_need_act_calibration(m.act_dynamic, m.act_data_type, m.act_bits)
+                and check_to_quantized(m)
+            ):
+                hook = m.register_forward_hook(get_act_max_hook)
+                hook_handles.append(hook)
+            return hook_handles
+
+        for n, m in model.named_modules():
+            if (
+                hasattr(m, "act_dynamic")
+                and check_need_act_calibration(m.act_dynamic, m.act_data_type, m.act_bits)
+                and check_to_quantized(m)
+            ):
+                hook = m.register_forward_hook(get_act_max_hook)
+                hook_handles.append(hook)
+                continue
+
+            # for whole model, RTN
+            if n in self.layer_config:
+                config = self.layer_config[n]
+                act_dynamic = config.get("act_dynamic", True)
+                act_data_type = config.get("act_data_type", None)
+                act_bits = config.get("act_bits", 16)
+                if (
+                    config["bits"] <= 8
+                    and check_need_act_calibration(act_dynamic, act_data_type, act_bits)
+                    and check_to_quantized(config)
+                ):
+                    hook = m.register_forward_hook(get_act_max_hook)
+                    hook_handles.append(hook)
+                    continue
+        return hook_handles
+
     def _quantize_layer(
         self, layer_name: str, inputs: torch.Tensor, q_inputs: torch.Tensor = None, device: str = "cpu"
     ):
@@ -2359,6 +2433,19 @@ def _quantize_layer(
             if q_inputs is not None:
                 q_inputs[i] = q_inputs[i].to(layer.weight.dtype)

+        if q_inputs is None:
+            hook_handles = self._register_act_max_hook(layer)
+            with torch.no_grad():
+                layer(torch.cat(inputs, dim=0))
+            for handle in hook_handles:
+                handle.remove()
+        else:
+            hook_handles = self._register_act_max_hook(layer)
+            if hook_handles:
+                layer(torch.cat(q_inputs, dim=0))
+            for handle in hook_handles:
+                handle.remove()
+
         wrapper_linear = WrapperLinear(
             layer,
             enable_minmax_tuning=self.enable_minmax_tuning,
@@ -2495,54 +2582,6 @@ def _quantize_layer(
         dump_info = f"quantized {layer_name}, loss iter 0: {init_loss:.6f} -> iter {best_iter}: {last_loss:.6f}"
         logger.info(dump_info)

-    def _register_act_max_hook(self, model):
-
-        def get_act_max_hook(module, input, output):
-            if isinstance(input, (tuple, list)):
-                input = input[0]
-            if input.numel() == 0:
-                return  # as no needs for act_max update
-            input, _, _ = reshape_pad_tensor_by_group_size(input, self.act_group_size)
-            act_max = torch.max(torch.abs(input), dim=-1).values
-            if not hasattr(module, "act_max") or module.act_max.numel() == 0:
-                module.act_max = act_max
-            else:
-                act_max = act_max.to(module.act_max.device)
-                if is_nv_fp(self.act_data_type):  ## for nvfp per-tensor input_global_scale calculation usage
-                    module.act_max = torch.max(
-                        torch.tensor([act_max.max(), module.act_max.max()], device=act_max.device)
-                    )
-                else:
-                    module.act_max = torch.max(act_max, module.act_max)
-
-        hook_handles = []
-
-        for n, m in model.named_modules():
-            if (
-                hasattr(m, "act_dynamic")
-                and check_need_act_calibration(m.act_dynamic, m.act_data_type, m.act_bits)
-                and check_to_quantized(m)
-            ):
-                hook = m.register_forward_hook(get_act_max_hook)
-                hook_handles.append(hook)
-                continue
-
-            # for whole model, RTN
-            if n in self.layer_config:
-                config = self.layer_config[n]
-                act_dynamic = config.get("act_dynamic", True)
-                act_data_type = config.get("act_data_type", None)
-                act_bits = config.get("act_bits", 16)
-                if (
-                    config["bits"] <= 8
-                    and check_need_act_calibration(act_dynamic, act_data_type, act_bits)
-                    and check_to_quantized(config)
-                ):
-                    hook = m.register_forward_hook(get_act_max_hook)
-                    hook_handles.append(hook)
-                    continue
-        return hook_handles
-
     def _get_current_output(self, output: list[torch.Tensor], indices: list[int]) -> torch.Tensor:
         current_output = [output[x] for x in indices]
         current_output = torch.cat(current_output, dim=self.batch_dim)
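
The relocated _register_act_max_hook above reuses the standard PyTorch forward-hook pattern to record per-layer activation maxima during calibration. A self-contained sketch of that pattern, independent of AutoRound's helpers (the function names and the toy layer are illustrative only, not from this commit):

    import torch
    import torch.nn as nn

    def make_act_max_hook():
        # Track a running absolute maximum of the layer's input activations.
        def hook(module, inputs, output):
            x = inputs[0] if isinstance(inputs, (tuple, list)) else inputs
            if x.numel() == 0:
                return
            amax = x.detach().abs().amax()
            prev = getattr(module, "act_max", None)
            module.act_max = amax if prev is None else torch.maximum(prev, amax)
        return hook

    layer = nn.Linear(16, 16)
    handle = layer.register_forward_hook(make_act_max_hook())
    layer(torch.randn(4, 16))  # calibration pass
    handle.remove()
    print(layer.act_max)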

auto_round/export/export_to_autogptq/export.py

Lines changed: 0 additions & 2 deletions

@@ -130,8 +130,6 @@ def convert_from_autogptq_dynamic(dynamic_config: dict) -> dict:


 def pack_layer(name, model, backend, device=None):
-    if name == "lm_head":  ##dese not support lm-head
-        return
     layer = get_module(model, name)

     if type(layer) not in SUPPORTED_LAYER_TYPES:  # already packed

auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py

Lines changed: 0 additions & 4 deletions

@@ -52,8 +52,6 @@


 def pack_layer(name, model, backend, device=None):
-    if name == "lm_head":  # TODO: Check vLLM inference status to determine whether to enable this feature
-        return
     layer = get_module(model, name)
     if type(layer) not in SUPPORTED_LAYER_TYPES and not isinstance(layer, WrapperWALayer):  ##already packed
         return
@@ -82,8 +80,6 @@ def pack_layer(name, model, backend, device=None):
         setattr(layer, "input_global_scale", input_global_scale)
         delattr(layer, "act_max")

-    # QuantLinear = get_fp_qlinear(backend, bits, group_size, sym)
-
     if type(layer) == nn.Linear:
         in_features = layer.in_features
         out_features = layer.out_features
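
For the nvfp path, the act_max collected by the new hook is what feeds the per-tensor input_global_scale set just above before act_max is deleted. The exact formula lives elsewhere in the repo; the widely used NVFP4 convention (an assumption here, not shown in this diff) scales so that the observed activation amax maps onto the largest representable FP8-scaled FP4 value:

    import torch

    # Assumed NVFP4-style convention: FP8-E4M3 max (448.0) * FP4-E2M1 max (6.0) / amax.
    def nvfp4_input_global_scale(act_max: torch.Tensor) -> torch.Tensor:
        return (448.0 * 6.0) / act_max.float().max()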

auto_round/export/export_to_awq/export.py

Lines changed: 0 additions & 2 deletions

@@ -46,8 +46,6 @@


 def pack_layer(name, model, backend, device=None):
-    if name == "lm_head":  ##dese not support lm-head
-        return
     layer = get_module(model, name)

     if type(layer) not in SUPPORTED_LAYER_TYPES:  ##already packed

auto_round/export/export_to_llmcompressor/export_to_fp.py

Lines changed: 0 additions & 2 deletions

@@ -51,8 +51,6 @@


 def pack_layer(name, model, backend, device=None):
-    if name == "lm_head":  # TODO: Check vLLM inference status to determine whether to enable this feature
-        return
     layer = get_module(model, name)
     if type(layer) not in SUPPORTED_LAYER_TYPES and not isinstance(layer, WrapperWALayer):  ##already packed
         return

test/test_cpu/test_export.py

Lines changed: 71 additions & 8 deletions

@@ -35,10 +35,10 @@ def __iter__(self):
 class TestAutoRound(unittest.TestCase):
     @classmethod
     def setUpClass(self):
-        model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
+        self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
         self.save_dir = "./saved"
-        self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
-        self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+        self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True)
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True)
         self.llm_dataloader = LLMDataLoader()

     @classmethod
@@ -49,7 +49,7 @@ def tearDownClass(self):
     def test_autogptq_format(self):
         for group_size in [-1, 32, 128]:
             bits, sym = 4, False
-            model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
+            model_name = self.model_name
             autoround = AutoRound(
                 model=model_name,
                 bits=bits,
@@ -79,7 +79,7 @@ def test_autogptq_format(self):
     def test_autoround_format(self):
         for group_size in [-1, 32, 128]:
             bits, sym = 4, True
-            model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
+            model_name = self.model_name
             autoround = AutoRound(
                 model=model_name,
                 bits=bits,
@@ -105,7 +105,7 @@ def test_autoround_format(self):
     def test_autoround_awq_format(self):
         for group_size in [-1, 32, 128]:
             bits, sym = 4, False
-            model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
+            model_name = self.model_name
             autoround = AutoRound(
                 model=model_name,
                 bits=bits,
@@ -217,7 +217,7 @@ def test_static_afp8_export(self, static_kv_dtype):

         from safetensors import safe_open

-        model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
+        model_name = self.model_name
         model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
         autoround = AutoRound(
             model,
@@ -307,7 +307,7 @@ def test_static_fp8_attn(self):

         from safetensors import safe_open

-        model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
+        model_name = self.model_name
         model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
         autoround = AutoRound(
             model,
@@ -334,6 +334,69 @@ def test_static_fp8_attn(self):

         shutil.rmtree(quantized_model_path, ignore_errors=True)

+    def test_awq_lmhead_export(self):
+        bits, sym, group_size = 4, False, 128
+        model_name = "/tf_dataset/auto_round/models/microsoft/phi-2"
+        layer_config = {
+            "lm_head": {"bits": 4},  # set lm_head quant
+        }
+        autoround = AutoRound(
+            model=model_name,
+            bits=bits,
+            group_size=group_size,
+            sym=sym,
+            iters=2,
+            seqlen=2,
+            layer_config=layer_config,
+            dataset=self.llm_dataloader,
+        )
+        quantized_model_path = "./saved"
+        compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq")
+        lm_head = compressed_model.lm_head
+        from auto_round.export.export_to_awq.utils import WQLinear_GEMM
+
+        assert isinstance(lm_head, WQLinear_GEMM), "Illegal GPTQ quantization for lm_head layer"
+        quantization_config = AutoRoundConfig()
+        model = AutoModelForCausalLM.from_pretrained(
+            quantized_model_path, device_map="cpu", quantization_config=quantization_config
+        )
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
+        text = "There is a girl who likes adventure,"
+        inputs = tokenizer(text, return_tensors="pt").to(model.device)
+        print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]))
+        shutil.rmtree(quantized_model_path, ignore_errors=True)
+
+    def test_gptq_lmhead_export(self):
+        bits, sym, group_size = 4, True, 128
+        model_name = "/tf_dataset/auto_round/models/microsoft/phi-2"
+        layer_config = {
+            "lm_head": {"bits": 4},  # set lm_head quant
+        }
+        autoround = AutoRound(
+            model=model_name,
+            bits=bits,
+            group_size=group_size,
+            sym=sym,
+            iters=2,
+            seqlen=2,
+            layer_config=layer_config,
+            dataset=self.llm_dataloader,
+        )
+        quantized_model_path = "./saved"
+        compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq")
+        lm_head = compressed_model.lm_head
+        assert hasattr(lm_head, "bits") and lm_head.bits == 4, "Illegal GPTQ quantization for lm_head layer"
+        quantization_config = AutoRoundConfig()
+        model = AutoModelForCausalLM.from_pretrained(
+            quantized_model_path, device_map="cpu", quantization_config=quantization_config
+        )
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
+        text = "There is a girl who likes adventure,"
+        inputs = tokenizer(text, return_tensors="pt").to(model.device)
+        res = tokenizer.decode(model.generate(**inputs, max_new_tokens=5)[0])
+        print(res)
+        shutil.rmtree(quantized_model_path, ignore_errors=True)
+

 if __name__ == "__main__":
     unittest.main()
