From b34fb322f869007a51002a4b6e10fd0dcfecf920 Mon Sep 17 00:00:00 2001 From: lkk Date: Thu, 25 Sep 2025 07:16:46 +0000 Subject: [PATCH 01/15] add mxfp8 qat code, mxfp8fwd-bf16bwd. --- .../torch/algorithms/qat/__init__.py | 16 ++ .../torch/algorithms/qat/quant_linear.py | 81 ++++++++ .../torch/algorithms/qat/quant_utils.py | 123 ++++++++++++ .../torch/algorithms/qat/tensor_quantizer.py | 181 ++++++++++++++++++ neural_compressor/torch/export/export_hf.py | 112 +++++++++++ 5 files changed, 513 insertions(+) create mode 100644 neural_compressor/torch/algorithms/qat/__init__.py create mode 100644 neural_compressor/torch/algorithms/qat/quant_linear.py create mode 100644 neural_compressor/torch/algorithms/qat/quant_utils.py create mode 100644 neural_compressor/torch/algorithms/qat/tensor_quantizer.py create mode 100644 neural_compressor/torch/export/export_hf.py diff --git a/neural_compressor/torch/algorithms/qat/__init__.py b/neural_compressor/torch/algorithms/qat/__init__.py new file mode 100644 index 00000000000..d3bdaf8e760 --- /dev/null +++ b/neural_compressor/torch/algorithms/qat/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# pylint:disable=import-error +"""QAT (Quantization Aware Tuning).""" diff --git a/neural_compressor/torch/algorithms/qat/quant_linear.py b/neural_compressor/torch/algorithms/qat/quant_linear.py new file mode 100644 index 00000000000..911d082db55 --- /dev/null +++ b/neural_compressor/torch/algorithms/qat/quant_linear.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Quantized Linear.""" + + +import torch +import torch.nn as nn + +from .tensor_quantizer import TensorQuantizer + +class QuantLinear(nn.Module): + """Quantized version of nn.Linear.""" + + def forward(self, input: torch.Tensor): + """Add weight/input/output of quantization for the original forward method.""" + qw = self.weight_quantizer(self.weight) + qi = self.input_quantizer(input) + out = F.linear(qi, qw, self.bias) + out = self.output_quantizer(out) + return out + + def _setup(self, quant_cfg: "QuantizationSchem"): + """Init quantizer""" + self.weight_quantizer = TensorQuantizer( + data_type=quant_cfg.data_type, + block_size=quant_cfg.group_size, + bits=quant_cfg.bits, + sym=quant_cfg.sym, + if_quant=True, + learn_exponent=False, + ) + self.input_quantizer = TensorQuantizer( + data_type=quant_cfg.act_data_type, + block_size=quant_cfg.act_group_size, + bits=quant_cfg.act_bits, + sym=quant_cfg.act_sym, + if_quant=True, + learn_exponent=False, + ) + self.output_quantizer = TensorQuantizer( + data_type=quant_cfg.act_data_type, + block_size=quant_cfg.act_group_size, + bits=quant_cfg.act_bits, + sym=quant_cfg.act_sym, + if_quant=False, + ) + # Currently don't quant output + self.output_quantizer.disable() + + # TODO: remove + self.original_weight_dtype = None if self.weight is None else self.weight.dtype + + def extra_repr(self) -> str: + """Generate extra_repr making sure import keys exist in self.__dict__.""" + return f"in_features={self.in_features}, out_features={self.out_features}, bias={self.bias is not None}" + + def __repr__(self): + """Overriding the __repr__ method, makes the output more concise and meaningful.""" + return f"QuantLinear(\n" \ + f" in_features={self.in_features}, out_features={self.out_features}, bias={self.bias is not None}\n" \ + f" (input_quantizer): {self.input_quantizer}\n" \ + f" (output_quantizer): {self.output_quantizer}\n" \ + f" (weight_quantizer): {self.weight_quantizer}\n" \ + f")" diff --git a/neural_compressor/torch/algorithms/qat/quant_utils.py b/neural_compressor/torch/algorithms/qat/quant_utils.py new file mode 100644 index 00000000000..5bebdbbf3c5 --- /dev/null +++ b/neural_compressor/torch/algorithms/qat/quant_utils.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Utils for quantization""" + +import types +import torch +import torch.nn as nn +from typing import Any +from .quant_linear import QuantLinear + + +def convert(module: nn.Module, quant_cfg=None, quant_module=None): + """Convert the model to a quantized one with quant config""" + + # update class + original_cls = type(module) + module.__class__ = quant_module + module.forward = types.MethodType(quant_module.forward, module) + + # setup quantizers + module._setup(quant_cfg) + + return module + + +def replace_with_quant_linear(model, quant_cfg=None): + """Recursively replace the module with quantized module.""" + + # TODO: support more modules, like kv. + for name, child in model.named_children(): + if isinstance(child, nn.Linear): + if "lm_head" in name: + continue + # REPLACE on the parent (model), not on child + quantized = convert(child, quant_cfg, QuantLinear) + setattr(model, name, quantized) + + # now recurse into whichever module is now at `model.name` + replace_with_quant_linear(getattr(model, name), quant_cfg=quant_cfg) + + return model + + +def get_quant_config(scheme: str) -> dict[str, Any]: + """Generate quantization config for a torch model. + + Args: + model: The PyTorch model to analyze + + Returns: + Dictionary containing the quantization configuration + """ + + # TODO: support more quant config + try: + from auto_round.export.export_to_llmcompressor.config import initialize_quantization + quantization_config = initialize_quantization(scheme=scheme) + quantization_config = quantization_config.to_dict() + quantization_config["provider"] = "auto-round" + quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] = True + quantization_config["config_groups"]["group_0"]["input_activations"]["is_mx"] = True + + except ImportError: + quantization_config = None + + return quantization_config + + +def get_quantization_format(module) -> str | None: + """Gets the quantization string. + + Gets the quantization string by iterating through the module and its children. + The first non-None quantization string is returned. + """ + + def _get_quantization_from_layer(layer): + weight_quantizer = getattr(layer, "weight_quantizer", None) + input_quantizer = getattr(layer, "input_quantizer", None) + + if weight_quantizer is None or weight_quantizer._disabled: + return None + + # TODO: support more quant format + if weight_quantizer.num_bits == 8 and weight_quantizer.data_type == "mx_fp8": + return "MXFP8" + + # Raise error for unsupported num_bits + raise NotImplementedError( + f"Unsupported quantizer with num_bits: {weight_quantizer.num_bits}" + ) + + quantization = _get_quantization_from_layer(module) + if quantization != None: + return quantization + + for _, layer in module.named_children(): + format = get_quantization_format(layer) + if format != None: + return format + + return None + + +def is_quantlinear(module: nn.Module) -> bool: + """Returns whether the module is a quantized linear layer.""" + return "QuantLinear" in type(module).__name__ diff --git a/neural_compressor/torch/algorithms/qat/tensor_quantizer.py b/neural_compressor/torch/algorithms/qat/tensor_quantizer.py new file mode 100644 index 00000000000..668786bffa8 --- /dev/null +++ b/neural_compressor/torch/algorithms/qat/tensor_quantizer.py @@ -0,0 +1,181 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+# +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""TensorQuantizer Module.""" + +import torch +from torch import nn + +try: + from auto_round.data_type import get_quant_func +except ImportError: + get_quant_func = None + +class TensorQuantizer(nn.Module): + """Tensor quantizer module.""" + + def __init__( + self, + data_type="mx_fp8", + bits=8, + block_size=32, + sym=True, + if_quant=True, + learn_exponent=False, + amax=None, + scale_shape=None, + device=None, + ): + """Initialize quantizer and set up required variables.""" + super().__init__() + self.amax = amax + self.data_type = data_type + self.num_bits = bits + self.block_size = block_size + self.sym = sym + self._if_quant = if_quant + self.learn_exponent = learn_exponent + self._dequantize = False + self._input_dtype = None + self._fake_quant = True + + # enable quantizer + self.enable() + + assert get_quant_func is not None, ( + f"The quantization function is imported from AutoRound, please intall it. 'pip install auto-round'" + ) + + # self.data_type will be overided 'mx_fp' -> 'mx_fp8' + self.quant_func, self.data_type = get_quant_func(self.data_type, self.num_bits, self.sym) + + if scale_shape is not None: + # E8M0 scales (exponent) + self.register_buffer( + "scale", + torch.empty(scale_shape[0], scale_shape.shape[1] // self.block_size, dtype=torch.uint8, device=device), + ) + self.save_scale = True + else: + self.save_scale = False + + def forward(self, inputs: torch.Tensor): + """Apply tensor_quant function to inputs. + + Args: + inputs: A Tensor of type float32/float16/bfloat16. 
+ + Returns: + outputs: A Tensor of type output_dtype + """ + + if self._disabled or (not self._if_quant): + self._input_dtype = inputs.dtype + return inputs + + x = inputs + if not x.is_contiguous(): + x = x.contiguous() + + if self.fake_quant: + q = self._fake_quantize(x)[0] + else: + # TODO: add implementation + q = self._real_quantize(x) + + return q.to(inputs.dtype) + + def _fake_quantize(self, inputs: torch.Tensor): + """Fake quantization.""" + + # the shared_exp can be trainable + if self.learn_exponent: + q, shared_exp, _ = self.quant_func( + inputs, + bits=self.num_bits, + group_size=self.block_size, + data_type=self.data_type, + ) + else: + # wrapper no_grad, because the function includes extra trainable variables + with torch.no_grad(): + q, shared_exp, _ = self.quant_func( + inputs, + bits=self.num_bits, + group_size=self.block_size, + data_type=self.data_type, + ) + + # simple STE, since we add no_grad in the quant function + q = q.detach() + (inputs - inputs.detach()) + + if self.save_scale: + # TODO: PACK uint8 + self.scale.data.copy_(shared_exp.detach()) + + return q, shared_exp + + @property + def fake_quant(self): + """Return True if fake quantization is used.""" + return self._fake_quant + + def disable(self): + """Bypass the module.""" + self._disabled = True + + def enable(self): + """Enable the module.""" + self._disabled = False + + def weight_pack(self, weight, scale): + """pack weight and scale when saving.""" + original_shape = weight.shape + + # TODO: support more quantization format + if self.data_type == "mx_fp8": + qweight = (weight.reshape(-1, self.block_size) \ + / torch.exp2(scale.float()).reshape(-1, 1)).to(torch.float8_e4m3fn) + + e8m0_scale = (scale + 127).to(torch.uint8) + return qweight.reshape(original_shape), e8m0_scale + + def __repr__(self): + if self._disabled or not self._if_quant: + return "TensorQuantizer(disabled)" + + qformat_str = f"({self.data_type}) format" + bits_str = f"({self.num_bits}) bit" + + if self.block_size: + bs_str = f"block_size={self.block_size}" + else: + bs_str = "block_size=None" + + # amax + amax_str = f"amax={self.amax}" if self.amax is not None else "amax=?" + # fake / real + mode_str = "fake" if self._fake_quant else "real" + # sym + sym_str = "sym" if self.sym else "asym" + # quant enable + qflag = "quant" if self._if_quant else "no-quant" + + return f"TensorQuantizer({qformat_str} {bits_str} {mode_str} {bs_str}, {amax_str} {qflag})" diff --git a/neural_compressor/torch/export/export_hf.py b/neural_compressor/torch/export/export_hf.py new file mode 100644 index 00000000000..d78187bb2b0 --- /dev/null +++ b/neural_compressor/torch/export/export_hf.py @@ -0,0 +1,112 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Export quantized hf model to compatible formats""" + +import tempfile +from pathlib import Path +import warnings +from typing import Any +import torch +import torch.nn as nn + +def _export_quantized_weight( + sub_module: nn.Module, quantization_format: str = None, weight_name: str = "weight" +): + """For the given weight attr of the sub_module, export the quantization info of it. + + The export includes converting weight tensor to correct quantized values and quantized dtype, + and registering scaling factors. + """ + if quantization_format == None: + return + + weight: nn.Parameter = getattr(sub_module, weight_name) + weight_quantizer = getattr( + sub_module, "weight_quantizer" + ) + + qdq_weight, scale = weight_quantizer._fake_quantize(weight) + + # TODO: support more scale dtype when there are other quantization format except mxfp8/mxfp4 + quantized_weight, e8m0_scale = weight_quantizer.weight_pack(qdq_weight, scale) + + sub_module.register_buffer("weight_scale", e8m0_scale.reshape(*weight.shape[:-1], -1)) + + setattr(sub_module, weight_name, nn.Parameter(quantized_weight, requires_grad=False)) + +def _export_hf_checkpoint( + model: nn.Module, scheme: str | None = None +) -> tuple[dict[str, Any], dict[str, Any]]: + """Exports the torch model to the packed checkpoint with original HF naming. + + The packed checkpoint will be consumed by the TensorRT-LLM unified converter. + + Args: + model: the torch model. + dtype: the weights data type to export the unquantized layers or the default model data type if None. + + Returns: + post_state_dict: Dict containing quantized weights + quant_config: config information to export hf_quant_cfg.json + """ + + # Create a model layer pool + # If `model.model` exists use that, otherwise use `model` itself, e.g., Nemotron-H + root = getattr(model, "model", model) + # If that has a `.layers`, use it, otherwise fall back to the object itself + root = getattr(root, "layers", root) + layer_pool = {f"model.layers.{name}": sub_module for name, sub_module in root.named_modules()} + + from ..algorithms.qat.quant_utils import get_quant_config, get_quantization_format, is_quantlinear + # compressored config + quant_config = get_quant_config(scheme=scheme) + + for name, sub_module in layer_pool.items(): + quantization_format = get_quantization_format(sub_module) + if quantization_format != None: + if is_quantlinear(sub_module): + _export_quantized_weight(sub_module, quantization_format) + + quantized_state_dict = model.state_dict() + + + return quantized_state_dict, quant_config + + +def export_hf2compressored_model( + model: nn.Module, + export_dir: Path | str = tempfile.gettempdir(), + scheme: str = None +): + """Exports the torch model to the packed checkpoint with original HF naming. + + The packed checkpoint will be consumed by the VLLM. + """ + + export_dir = Path(export_dir) + export_dir.mkdir(parents=True, exist_ok=True) + + try: + _, quant_config = _export_hf_checkpoint(model, scheme) + model.save_pretrained(export_dir) + model.config.quantization_config = quant_config + model.config.save_pretrained(export_dir) + + except Exception as e: + warnings.warn( + "Cannot export model and config, the state" + " can be saved with torch.save for further inspection." 
+ ) + raise e + From 7f99561fb9da7718269a8200a18143722ee8f9c6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 25 Sep 2025 07:23:00 +0000 Subject: [PATCH 02/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../torch/algorithms/qat/quant_linear.py | 17 +++++----- .../torch/algorithms/qat/quant_utils.py | 13 ++++---- .../torch/algorithms/qat/tensor_quantizer.py | 15 ++++----- neural_compressor/torch/export/export_hf.py | 31 +++++++------------ 4 files changed, 36 insertions(+), 40 deletions(-) diff --git a/neural_compressor/torch/algorithms/qat/quant_linear.py b/neural_compressor/torch/algorithms/qat/quant_linear.py index 911d082db55..070d9085403 100644 --- a/neural_compressor/torch/algorithms/qat/quant_linear.py +++ b/neural_compressor/torch/algorithms/qat/quant_linear.py @@ -25,6 +25,7 @@ from .tensor_quantizer import TensorQuantizer + class QuantLinear(nn.Module): """Quantized version of nn.Linear.""" @@ -37,7 +38,7 @@ def forward(self, input: torch.Tensor): return out def _setup(self, quant_cfg: "QuantizationSchem"): - """Init quantizer""" + """Init quantizer.""" self.weight_quantizer = TensorQuantizer( data_type=quant_cfg.data_type, block_size=quant_cfg.group_size, @@ -73,9 +74,11 @@ def extra_repr(self) -> str: def __repr__(self): """Overriding the __repr__ method, makes the output more concise and meaningful.""" - return f"QuantLinear(\n" \ - f" in_features={self.in_features}, out_features={self.out_features}, bias={self.bias is not None}\n" \ - f" (input_quantizer): {self.input_quantizer}\n" \ - f" (output_quantizer): {self.output_quantizer}\n" \ - f" (weight_quantizer): {self.weight_quantizer}\n" \ - f")" + return ( + f"QuantLinear(\n" + f" in_features={self.in_features}, out_features={self.out_features}, bias={self.bias is not None}\n" + f" (input_quantizer): {self.input_quantizer}\n" + f" (output_quantizer): {self.output_quantizer}\n" + f" (weight_quantizer): {self.weight_quantizer}\n" + f")" + ) diff --git a/neural_compressor/torch/algorithms/qat/quant_utils.py b/neural_compressor/torch/algorithms/qat/quant_utils.py index 5bebdbbf3c5..ce5d8395c73 100644 --- a/neural_compressor/torch/algorithms/qat/quant_utils.py +++ b/neural_compressor/torch/algorithms/qat/quant_utils.py @@ -17,17 +17,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""Utils for quantization""" +"""Utils for quantization.""" import types +from typing import Any + import torch import torch.nn as nn -from typing import Any + from .quant_linear import QuantLinear def convert(module: nn.Module, quant_cfg=None, quant_module=None): - """Convert the model to a quantized one with quant config""" + """Convert the model to a quantized one with quant config.""" # update class original_cls = type(module) @@ -71,6 +73,7 @@ def get_quant_config(scheme: str) -> dict[str, Any]: # TODO: support more quant config try: from auto_round.export.export_to_llmcompressor.config import initialize_quantization + quantization_config = initialize_quantization(scheme=scheme) quantization_config = quantization_config.to_dict() quantization_config["provider"] = "auto-round" @@ -102,9 +105,7 @@ def _get_quantization_from_layer(layer): return "MXFP8" # Raise error for unsupported num_bits - raise NotImplementedError( - f"Unsupported quantizer with num_bits: {weight_quantizer.num_bits}" - ) + raise NotImplementedError(f"Unsupported quantizer with num_bits: {weight_quantizer.num_bits}") quantization = _get_quantization_from_layer(module) if quantization != None: diff --git a/neural_compressor/torch/algorithms/qat/tensor_quantizer.py b/neural_compressor/torch/algorithms/qat/tensor_quantizer.py index 668786bffa8..b4d51f2eae5 100644 --- a/neural_compressor/torch/algorithms/qat/tensor_quantizer.py +++ b/neural_compressor/torch/algorithms/qat/tensor_quantizer.py @@ -17,7 +17,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - """TensorQuantizer Module.""" import torch @@ -28,6 +27,7 @@ except ImportError: get_quant_func = None + class TensorQuantizer(nn.Module): """Tensor quantizer module.""" @@ -59,9 +59,9 @@ def __init__( # enable quantizer self.enable() - assert get_quant_func is not None, ( - f"The quantization function is imported from AutoRound, please intall it. 'pip install auto-round'" - ) + assert ( + get_quant_func is not None + ), "The quantization function is imported from AutoRound, please install it. 'pip install auto-round'" # self.data_type will be overided 'mx_fp' -> 'mx_fp8' self.quant_func, self.data_type = get_quant_func(self.data_type, self.num_bits, self.sym) @@ -146,13 +146,14 @@ def enable(self): self._disabled = False def weight_pack(self, weight, scale): - """pack weight and scale when saving.""" + """Pack weight and scale when saving.""" original_shape = weight.shape # TODO: support more quantization format if self.data_type == "mx_fp8": - qweight = (weight.reshape(-1, self.block_size) \ - / torch.exp2(scale.float()).reshape(-1, 1)).to(torch.float8_e4m3fn) + qweight = (weight.reshape(-1, self.block_size) / torch.exp2(scale.float()).reshape(-1, 1)).to( + torch.float8_e4m3fn + ) e8m0_scale = (scale + 127).to(torch.uint8) return qweight.reshape(original_shape), e8m0_scale diff --git a/neural_compressor/torch/export/export_hf.py b/neural_compressor/torch/export/export_hf.py index d78187bb2b0..4ed61b22859 100644 --- a/neural_compressor/torch/export/export_hf.py +++ b/neural_compressor/torch/export/export_hf.py @@ -11,18 +11,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""Export quantized hf model to compatible formats""" +"""Export quantized hf model to compatible formats.""" import tempfile -from pathlib import Path import warnings +from pathlib import Path from typing import Any + import torch import torch.nn as nn -def _export_quantized_weight( - sub_module: nn.Module, quantization_format: str = None, weight_name: str = "weight" -): + +def _export_quantized_weight(sub_module: nn.Module, quantization_format: str = None, weight_name: str = "weight"): """For the given weight attr of the sub_module, export the quantization info of it. The export includes converting weight tensor to correct quantized values and quantized dtype, @@ -32,9 +32,7 @@ def _export_quantized_weight( return weight: nn.Parameter = getattr(sub_module, weight_name) - weight_quantizer = getattr( - sub_module, "weight_quantizer" - ) + weight_quantizer = getattr(sub_module, "weight_quantizer") qdq_weight, scale = weight_quantizer._fake_quantize(weight) @@ -45,9 +43,8 @@ def _export_quantized_weight( setattr(sub_module, weight_name, nn.Parameter(quantized_weight, requires_grad=False)) -def _export_hf_checkpoint( - model: nn.Module, scheme: str | None = None -) -> tuple[dict[str, Any], dict[str, Any]]: + +def _export_hf_checkpoint(model: nn.Module, scheme: str | None = None) -> tuple[dict[str, Any], dict[str, Any]]: """Exports the torch model to the packed checkpoint with original HF naming. The packed checkpoint will be consumed by the TensorRT-LLM unified converter. @@ -69,6 +66,7 @@ def _export_hf_checkpoint( layer_pool = {f"model.layers.{name}": sub_module for name, sub_module in root.named_modules()} from ..algorithms.qat.quant_utils import get_quant_config, get_quantization_format, is_quantlinear + # compressored config quant_config = get_quant_config(scheme=scheme) @@ -80,15 +78,10 @@ def _export_hf_checkpoint( quantized_state_dict = model.state_dict() - return quantized_state_dict, quant_config -def export_hf2compressored_model( - model: nn.Module, - export_dir: Path | str = tempfile.gettempdir(), - scheme: str = None -): +def export_hf2compressored_model(model: nn.Module, export_dir: Path | str = tempfile.gettempdir(), scheme: str = None): """Exports the torch model to the packed checkpoint with original HF naming. The packed checkpoint will be consumed by the VLLM. @@ -105,8 +98,6 @@ def export_hf2compressored_model( except Exception as e: warnings.warn( - "Cannot export model and config, the state" - " can be saved with torch.save for further inspection." + "Cannot export model and config, the state" " can be saved with torch.save for further inspection." ) raise e - From b6d74ae6ce6ea25397d66a59d3711889d085edfb Mon Sep 17 00:00:00 2001 From: lkk Date: Thu, 25 Sep 2025 08:56:50 +0000 Subject: [PATCH 03/15] fix comments. 
--- .../torch/algorithms/qat/__init__.py | 2 +- .../torch/algorithms/qat/quant_linear.py | 3 +- .../torch/algorithms/qat/quant_utils.py | 6 +-- .../torch/algorithms/qat/tensor_quantizer.py | 9 ++-- neural_compressor/torch/export/export_hf.py | 10 ++--- .../torch/quantization/quantize.py | 43 +++++++++++-------- 6 files changed, 41 insertions(+), 32 deletions(-) diff --git a/neural_compressor/torch/algorithms/qat/__init__.py b/neural_compressor/torch/algorithms/qat/__init__.py index d3bdaf8e760..e4c8a62c491 100644 --- a/neural_compressor/torch/algorithms/qat/__init__.py +++ b/neural_compressor/torch/algorithms/qat/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024 Intel Corporation +# Copyright (c) 2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/neural_compressor/torch/algorithms/qat/quant_linear.py b/neural_compressor/torch/algorithms/qat/quant_linear.py index 070d9085403..50107b9db70 100644 --- a/neural_compressor/torch/algorithms/qat/quant_linear.py +++ b/neural_compressor/torch/algorithms/qat/quant_linear.py @@ -4,7 +4,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. # -# Copyright (c) 2024 Intel Corporation +# Copyright (c) 2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ import torch import torch.nn as nn +import torch.nn.functional as F from .tensor_quantizer import TensorQuantizer diff --git a/neural_compressor/torch/algorithms/qat/quant_utils.py b/neural_compressor/torch/algorithms/qat/quant_utils.py index ce5d8395c73..0f76650e8ab 100644 --- a/neural_compressor/torch/algorithms/qat/quant_utils.py +++ b/neural_compressor/torch/algorithms/qat/quant_utils.py @@ -4,7 +4,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. # -# Copyright (c) 2024 Intel Corporation +# Copyright (c) 2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -108,12 +108,12 @@ def _get_quantization_from_layer(layer): raise NotImplementedError(f"Unsupported quantizer with num_bits: {weight_quantizer.num_bits}") quantization = _get_quantization_from_layer(module) - if quantization != None: + if quantization is not None: return quantization for _, layer in module.named_children(): format = get_quantization_format(layer) - if format != None: + if format is not None: return format return None diff --git a/neural_compressor/torch/algorithms/qat/tensor_quantizer.py b/neural_compressor/torch/algorithms/qat/tensor_quantizer.py index b4d51f2eae5..e8c0badad28 100644 --- a/neural_compressor/torch/algorithms/qat/tensor_quantizer.py +++ b/neural_compressor/torch/algorithms/qat/tensor_quantizer.py @@ -4,7 +4,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. # -# Copyright (c) 2024 Intel Corporation +# Copyright (c) 2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -70,7 +70,7 @@ def __init__( # E8M0 scales (exponent) self.register_buffer( "scale", - torch.empty(scale_shape[0], scale_shape.shape[1] // self.block_size, dtype=torch.uint8, device=device), + torch.empty(scale_shape[0], scale_shape[1] // self.block_size, dtype=torch.uint8, device=device), ) self.save_scale = True else: @@ -132,6 +132,9 @@ def _fake_quantize(self, inputs: torch.Tensor): return q, shared_exp + def _real_quantize(self, inputs: torch.Tensor): + raise NotImplementedError("This method hasn't be implemented.") + @property def fake_quant(self): """Return True if fake quantization is used.""" @@ -156,7 +159,7 @@ def weight_pack(self, weight, scale): ) e8m0_scale = (scale + 127).to(torch.uint8) - return qweight.reshape(original_shape), e8m0_scale + return qweight.reshape(original_shape), e8m0_scale.reshape(original_shape[0], -1) def __repr__(self): if self._disabled or not self._if_quant: diff --git a/neural_compressor/torch/export/export_hf.py b/neural_compressor/torch/export/export_hf.py index 4ed61b22859..d3ce4bdde09 100644 --- a/neural_compressor/torch/export/export_hf.py +++ b/neural_compressor/torch/export/export_hf.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024 Intel Corporation +# Copyright (c) 2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -28,7 +28,7 @@ def _export_quantized_weight(sub_module: nn.Module, quantization_format: str = N The export includes converting weight tensor to correct quantized values and quantized dtype, and registering scaling factors. """ - if quantization_format == None: + if quantization_format is None: return weight: nn.Parameter = getattr(sub_module, weight_name) @@ -39,7 +39,7 @@ def _export_quantized_weight(sub_module: nn.Module, quantization_format: str = N # TODO: support more scale dtype when there are other quantization format except mxfp8/mxfp4 quantized_weight, e8m0_scale = weight_quantizer.weight_pack(qdq_weight, scale) - sub_module.register_buffer("weight_scale", e8m0_scale.reshape(*weight.shape[:-1], -1)) + sub_module.register_buffer("weight_scale", e8m0_scale) setattr(sub_module, weight_name, nn.Parameter(quantized_weight, requires_grad=False)) @@ -72,7 +72,7 @@ def _export_hf_checkpoint(model: nn.Module, scheme: str | None = None) -> tuple[ for name, sub_module in layer_pool.items(): quantization_format = get_quantization_format(sub_module) - if quantization_format != None: + if quantization_format is not None: if is_quantlinear(sub_module): _export_quantized_weight(sub_module, quantization_format) @@ -98,6 +98,6 @@ def export_hf2compressored_model(model: nn.Module, export_dir: Path | str = temp except Exception as e: warnings.warn( - "Cannot export model and config, the state" " can be saved with torch.save for further inspection." + "Cannot export model and config, the state can be saved with torch.save for further inspection." ) raise e diff --git a/neural_compressor/torch/quantization/quantize.py b/neural_compressor/torch/quantization/quantize.py index a313220c43e..5311bdc318a 100644 --- a/neural_compressor/torch/quantization/quantize.py +++ b/neural_compressor/torch/quantization/quantize.py @@ -148,9 +148,10 @@ def quantize( @log_process(mode=Mode.PREPARE) def prepare( model: torch.nn.Module, - quant_config: BaseConfig, + quant_config: BaseConfig | dict | None = None, inplace: bool = True, example_inputs: Any = None, + qat: bool = False ): """Prepare the model for calibration. 
@@ -165,24 +166,28 @@ def prepare( Returns: prepared and calibrated module. """ - prepared_model = model if inplace else copy.deepcopy(model) - prepared_model, configs_mapping = preprocess_quant_config( - prepared_model, quant_config, mode="prepare", example_inputs=example_inputs - ) - for algo_name, algo_func in algos_mapping.items(): - # select quantization algo according to config - if need_apply(configs_mapping, algo_name): - logger.info(f"Start to prepare model with {algo_name}.") - prepared_model = algo_func( - prepared_model, - configs_mapping, - example_inputs=example_inputs, - mode=Mode.PREPARE, - ) - setattr(prepared_model, "is_prepared", True) - setattr(prepared_model, "quant_config", quant_config) - setattr(prepared_model, "example_inputs", example_inputs) - return prepared_model + if not qat: + prepared_model = model if inplace else copy.deepcopy(model) + prepared_model, configs_mapping = preprocess_quant_config( + prepared_model, quant_config, mode="prepare", example_inputs=example_inputs + ) + for algo_name, algo_func in algos_mapping.items(): + # select quantization algo according to config + if need_apply(configs_mapping, algo_name): + logger.info(f"Start to prepare model with {algo_name}.") + prepared_model = algo_func( + prepared_model, + configs_mapping, + example_inputs=example_inputs, + mode=Mode.PREPARE, + ) + setattr(prepared_model, "is_prepared", True) + setattr(prepared_model, "quant_config", quant_config) + setattr(prepared_model, "example_inputs", example_inputs) + return prepared_model + else: + from ..algorithms.qat.quant_utils import replace_with_quant_linear + return replace_with_quant_linear(model, quant_config) @log_process(mode=Mode.CONVERT) From c9a002642f2554bc19486514dcb663ce449f1c15 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 25 Sep 2025 08:59:13 +0000 Subject: [PATCH 04/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/torch/export/export_hf.py | 4 +--- neural_compressor/torch/quantization/quantize.py | 3 ++- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/neural_compressor/torch/export/export_hf.py b/neural_compressor/torch/export/export_hf.py index d3ce4bdde09..609a4b534e5 100644 --- a/neural_compressor/torch/export/export_hf.py +++ b/neural_compressor/torch/export/export_hf.py @@ -97,7 +97,5 @@ def export_hf2compressored_model(model: nn.Module, export_dir: Path | str = temp model.config.save_pretrained(export_dir) except Exception as e: - warnings.warn( - "Cannot export model and config, the state can be saved with torch.save for further inspection." - ) + warnings.warn("Cannot export model and config, the state can be saved with torch.save for further inspection.") raise e diff --git a/neural_compressor/torch/quantization/quantize.py b/neural_compressor/torch/quantization/quantize.py index 5311bdc318a..dc86d210255 100644 --- a/neural_compressor/torch/quantization/quantize.py +++ b/neural_compressor/torch/quantization/quantize.py @@ -151,7 +151,7 @@ def prepare( quant_config: BaseConfig | dict | None = None, inplace: bool = True, example_inputs: Any = None, - qat: bool = False + qat: bool = False, ): """Prepare the model for calibration. 
@@ -187,6 +187,7 @@ def prepare( return prepared_model else: from ..algorithms.qat.quant_utils import replace_with_quant_linear + return replace_with_quant_linear(model, quant_config) From 1651d71b5cd3a1e82d0bfbde1223b12952911037 Mon Sep 17 00:00:00 2001 From: lkk Date: Fri, 26 Sep 2025 02:29:10 +0000 Subject: [PATCH 05/15] fix code style. --- neural_compressor/torch/algorithms/qat/quant_linear.py | 2 +- neural_compressor/torch/export/export_hf.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/neural_compressor/torch/algorithms/qat/quant_linear.py b/neural_compressor/torch/algorithms/qat/quant_linear.py index 50107b9db70..2858c0f9420 100644 --- a/neural_compressor/torch/algorithms/qat/quant_linear.py +++ b/neural_compressor/torch/algorithms/qat/quant_linear.py @@ -38,7 +38,7 @@ def forward(self, input: torch.Tensor): out = self.output_quantizer(out) return out - def _setup(self, quant_cfg: "QuantizationSchem"): + def _setup(self, quant_cfg): """Init quantizer.""" self.weight_quantizer = TensorQuantizer( data_type=quant_cfg.data_type, diff --git a/neural_compressor/torch/export/export_hf.py b/neural_compressor/torch/export/export_hf.py index 609a4b534e5..e617ae122a9 100644 --- a/neural_compressor/torch/export/export_hf.py +++ b/neural_compressor/torch/export/export_hf.py @@ -57,7 +57,6 @@ def _export_hf_checkpoint(model: nn.Module, scheme: str | None = None) -> tuple[ post_state_dict: Dict containing quantized weights quant_config: config information to export hf_quant_cfg.json """ - # Create a model layer pool # If `model.model` exists use that, otherwise use `model` itself, e.g., Nemotron-H root = getattr(model, "model", model) @@ -86,7 +85,6 @@ def export_hf2compressored_model(model: nn.Module, export_dir: Path | str = temp The packed checkpoint will be consumed by the VLLM. """ - export_dir = Path(export_dir) export_dir.mkdir(parents=True, exist_ok=True) From fcf4b86368b3f5569eda007dde67c29bced3cc93 Mon Sep 17 00:00:00 2001 From: lkk Date: Fri, 26 Sep 2025 03:20:25 +0000 Subject: [PATCH 06/15] add unit tests. 
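
The tests cover the module swap and the quantizer math in isolation; the end-to-end flow they exercise looks roughly like the sketch below (illustrative only: the model id is a placeholder, auto-round must be installed, the scheme string passed to the exporter is an assumption, and quant_cfg is any object exposing the fields QuantLinear._setup reads, the same SimpleNamespace shape the tests use):

    import types
    import torch
    from transformers import AutoModelForCausalLM

    from neural_compressor.torch.quantization.quantize import prepare
    from neural_compressor.torch.export.export_hf import export_hf2compressored_model

    quant_cfg = types.SimpleNamespace(
        data_type="mx_fp8", bits=8, group_size=32, sym=True,
        act_data_type="mx_fp8", act_bits=8, act_group_size=32, act_sym=True,
    )

    model = AutoModelForCausalLM.from_pretrained("<model-id>", torch_dtype=torch.bfloat16)

    # Swap nn.Linear (except lm_head) for QuantLinear via the qat entry added earlier in this series.
    model = prepare(model, quant_cfg, qat=True)

    # ... fine-tune as usual: the forward pass sees MXFP8 fake-quantized weights and
    # activations, while gradients flow in bf16 through the straight-through estimator ...

    # Pack weights to float8_e4m3fn plus E8M0 scales and save an HF-style checkpoint.
    export_hf2compressored_model(model, export_dir="qat_mxfp8_ckpt", scheme="MXFP8")
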
--- .../torch/algorithms/qat/test_quant_utils.py | 208 ++++++++++++++++++ .../qat/test_quantizer_and_linear.py | 166 ++++++++++++++ 2 files changed, 374 insertions(+) create mode 100644 test/3x/torch/algorithms/qat/test_quant_utils.py create mode 100644 test/3x/torch/algorithms/qat/test_quantizer_and_linear.py diff --git a/test/3x/torch/algorithms/qat/test_quant_utils.py b/test/3x/torch/algorithms/qat/test_quant_utils.py new file mode 100644 index 00000000000..cca51126caf --- /dev/null +++ b/test/3x/torch/algorithms/qat/test_quant_utils.py @@ -0,0 +1,208 @@ +# -*- coding: utf-8 -*- + +import sys +import types +import importlib +from types import SimpleNamespace +from pathlib import Path + +import pytest +import torch +import torch.nn as nn + +from neural_compressor.torch.algorithms.qat import quant_utils + + +from neural_compressor.torch.algorithms.qat.tensor_quantizer import TensorQuantizer # type: ignore +from neural_compressor.torch.algorithms.qat.quant_linear import QuantLinear + + +class TinyModel(nn.Module): + """Simple hierarchical model for recursive replacement tests.""" + + def __init__(self): + super().__init__() + self.fc1 = nn.Linear(16, 8) + self.block = nn.Sequential( + nn.Linear(8, 8), + nn.ReLU(), + nn.Linear(8, 4), + ) + self.lm_head = nn.Linear(4, 2) + + def forward(self, x): + x = self.fc1(x) + x = self.block(x) + return self.lm_head(x) + + +@pytest.fixture +def sample_input(): + return torch.randn(2, 16) + +def make_quant_cfg( + *, + data_type="mx_fp8", + bits=8, + group_size=32, + sym=True, + act_data_type="mx_fp8", + act_bits=8, + act_group_size=32, + act_sym=True, +): + """ + Build a lightweight namespace mimicking the attributes QuantLinear._setup expects. + """ + return types.SimpleNamespace( + data_type=data_type, + bits=bits, + group_size=group_size, + sym=sym, + act_data_type=act_data_type, + act_bits=act_bits, + act_group_size=act_group_size, + act_sym=act_sym, + ) + +@pytest.fixture +def quant_cfg(): + return make_quant_cfg() + + +def test_convert_replaces_class_and_calls_setup(monkeypatch, quant_cfg): + linear = nn.Linear(4, 3) + + original_forward_id = id(QuantLinear.forward) + + quant_utils.convert(linear, quant_cfg=quant_cfg, quant_module=QuantLinear) + + assert isinstance(linear, QuantLinear) + assert hasattr(linear.forward, "__self__") and linear.forward.__self__ is linear + assert linear.forward.__func__ is QuantLinear.forward or id(linear.forward.__func__) == original_forward_id + + +def test_replace_with_quant_linear_recursive(monkeypatch, quant_cfg): + model = TinyModel() + + + quant_utils.replace_with_quant_linear(model, quant_cfg=quant_cfg) + + assert isinstance(model.fc1, QuantLinear) + assert isinstance(model.block[0], QuantLinear) + assert isinstance(model.block[2], QuantLinear) + assert isinstance(model.lm_head, nn.Linear) + + +def test_is_quantlinear_positive_and_negative(): + q = QuantLinear() + plain = nn.Linear(4, 2) + assert quant_utils.is_quantlinear(q) is True + assert quant_utils.is_quantlinear(plain) is False + + +def test_get_quantization_format_positive(monkeypatch): + layer = QuantLinear() + + layer.weight_quantizer = TensorQuantizer(bits=8, data_type="mx_fp8") + layer.weight_quantizer._disabled = False + layer.input_quantizer = TensorQuantizer(bits=8, data_type="mx_fp8") + layer.input_quantizer._disabled = False + + layer.weight = None + fmt = quant_utils.get_quantization_format(layer) + assert fmt == "MXFP8" + + +def test_get_quantization_format_none(): + layer = nn.Linear(4, 2) + fmt = quant_utils.get_quantization_format(layer) 
+ assert fmt is None + + +def test_get_quantization_format_unsupported_bits_raises(): + layer = QuantLinear() + layer.weight_quantizer = TensorQuantizer(bits=4, data_type="mx_fp8") + layer.weight_quantizer._disabled = False + layer.input_quantizer = TensorQuantizer(bits=4, data_type="mx_fp8") + layer.input_quantizer._disabled = False + + with pytest.raises(NotImplementedError): + quant_utils.get_quantization_format(layer) + + +def test_get_quant_config_success(monkeypatch): + # dynamic fake module: auto_round.export.export_to_llmcompressor.config + module_name = "auto_round.export.export_to_llmcompressor.config" + + class DummyQuantCfg: + def __init__(self): + self.data = { + "provider": "dummy", + "config_groups": { + "group_0": { + "weights": {}, + "input_activations": {}, + } + }, + } + + def to_dict(self): + return self.data + + def initialize_quantization(scheme: str): + return DummyQuantCfg() + + # auto_round + auto_round = types.ModuleType("auto_round") + export = types.ModuleType("auto_round.export") + export_to = types.ModuleType("auto_round.export.export_to_llmcompressor") + config_mod = types.ModuleType(module_name) + config_mod.initialize_quantization = initialize_quantization + + sys.modules["auto_round"] = auto_round + sys.modules["auto_round.export"] = export + sys.modules["auto_round.export.export_to_llmcompressor"] = export_to + sys.modules[module_name] = config_mod + + cfg = quant_utils.get_quant_config(scheme="mxfp8") + assert isinstance(cfg, dict) + assert cfg["provider"] == "auto-round" + assert cfg["config_groups"]["group_0"]["weights"]["is_mx"] is True + assert cfg["config_groups"]["group_0"]["input_activations"]["is_mx"] is True + + +def test_convert_forward_executes(monkeypatch): + linear = nn.Linear(5, 3) + + def fake_forward(self, x): + return torch.zeros(x.shape[0], 3) + + monkeypatch.setattr(QuantLinear, "forward", fake_forward, raising=True) + + quant_utils.convert(linear, quant_cfg=make_quant_cfg(), quant_module=QuantLinear) + out = linear(torch.randn(2, 5)) + assert out.shape == (2, 3) + assert torch.all(out == 0) + + +def test_replace_with_quant_linear_idempotent(quant_cfg): + model = TinyModel() + quant_utils.replace_with_quant_linear(model, quant_cfg=quant_cfg) + quant_utils.replace_with_quant_linear(model, quant_cfg=quant_cfg) + assert isinstance(model.fc1, QuantLinear) + + +@pytest.mark.parametrize("disabled", [True, False]) +def test_get_quantization_format_disabled_returns_none(disabled): + layer = QuantLinear() + layer.weight_quantizer = TensorQuantizer(bits=8, data_type="mx_fp8") + layer.weight_quantizer._disabled = disabled + layer.input_quantizer = TensorQuantizer(bits=8, data_type="mx_fp8") + layer.input_quantizer._disabled = disabled + + fmt = quant_utils.get_quantization_format(layer) + if disabled: + assert fmt is None + else: + assert fmt == "MXFP8" diff --git a/test/3x/torch/algorithms/qat/test_quantizer_and_linear.py b/test/3x/torch/algorithms/qat/test_quantizer_and_linear.py new file mode 100644 index 00000000000..7fced0768bf --- /dev/null +++ b/test/3x/torch/algorithms/qat/test_quantizer_and_linear.py @@ -0,0 +1,166 @@ +import math +import types +import torch +import pytest +import torch.nn as nn + +# Skip the whole module if auto_round (needed for get_quant_func inside TensorQuantizer) is not available +auto_round = pytest.importorskip("auto_round") + +from neural_compressor.torch.algorithms.qat.quant_linear import QuantLinear +from neural_compressor.torch.algorithms.qat.tensor_quantizer import TensorQuantizer + +def make_quant_cfg( + *, 
+ data_type="mx_fp8", + bits=8, + group_size=32, + sym=True, + act_data_type="mx_fp8", + act_bits=8, + act_group_size=32, + act_sym=True, +): + """ + Build a lightweight namespace mimicking the attributes QuantLinear._setup expects. + """ + return types.SimpleNamespace( + data_type=data_type, + bits=bits, + group_size=group_size, + sym=sym, + act_data_type=act_data_type, + act_bits=act_bits, + act_group_size=act_group_size, + act_sym=act_sym, + ) + + +def build_quant_linear(in_features=32, out_features=16, bias=True, quant_cfg=None, device="cpu", dtype=torch.float32): + """ + Manually construct a QuantLinear since the class does not define an __init__. + + Steps: + 1. Instantiate the module + 2. Register parameter tensors (weight, bias) + 3. Add metadata attributes used by extra_repr / repr + 4. Call internal _setup with provided quant config + """ + if quant_cfg is None: + quant_cfg = make_quant_cfg(group_size=32, act_group_size=32) + + ql = QuantLinear() + ql.in_features = in_features + ql.out_features = out_features + + weight = torch.randn(out_features, in_features, device=device, dtype=dtype) + ql.register_parameter("weight", nn.Parameter(weight)) + + if bias: + b = torch.randn(out_features, device=device, dtype=dtype) + ql.register_parameter("bias", nn.Parameter(b)) + else: + ql.bias = None # make sure attribute exists + + ql._setup(quant_cfg) + return ql + + +@pytest.mark.parametrize("bias", [True, False]) +def test_quant_linear_forward_and_backward(bias): + torch.manual_seed(42) + + in_features = 32 + out_features = 16 + batch_size = 3 + + ql = build_quant_linear(in_features=in_features, out_features=out_features, bias=bias) + + # Create a deliberately non-contiguous input (transpose trick) + base = torch.randn(in_features, batch_size) + x = base.t() # shape (batch_size, in_features) but non-contiguous + assert not x.is_contiguous() + + x.requires_grad_(True) + out = ql(x) + + # Shape & dtype checks + assert out.shape == (batch_size, out_features) + assert out.dtype == x.dtype + + # Backward pass + loss = out.sum() + loss.backward() + + assert ql.weight.grad is not None, "Weight should receive gradient through fake quant path" + if bias: + assert ql.bias.grad is not None, "Bias should receive gradient" + else: + assert ql.bias is None + + # Ensure original weight dtype tracked + assert ql.original_weight_dtype == ql.weight.dtype + + # Output quantizer is explicitly disabled in _setup + assert "TensorQuantizer(disabled)" in repr(ql.output_quantizer) + + # Input/weight quantizers should be enabled (not containing 'disabled') + assert "disabled" not in repr(ql.input_quantizer) + assert "disabled" not in repr(ql.weight_quantizer) + + +def test_quant_linear_repr_and_extra_repr(): + ql = build_quant_linear(in_features=8, out_features=4, bias=True) + r = repr(ql) + # Basic structural checks + assert "QuantLinear(" in r + assert "(input_quantizer):" in r + assert "(weight_quantizer):" in r + assert "(output_quantizer):" in r + # extra_repr path + er = ql.extra_repr() + assert "in_features=8" in er + assert "out_features=4" in er + assert "bias=True" in er + + +def test_tensor_quantizer_disable_and_no_quant_path(): + tq = TensorQuantizer(if_quant=False) # constructed with quantization turned off + x = torch.randn(5, 7) + out = tq(x) + # When disabled (not quant) it should return the identical object (same memory) + assert out.data_ptr() == x.data_ptr() + assert repr(tq) == "TensorQuantizer(disabled)" + + +def test_tensor_quantizer_enable_disable_cycle(): + tq = TensorQuantizer() + x = 
torch.randn(4, 32) # group size default 32, matches last dim + y1 = tq(x) + assert y1.shape == x.shape + # Disable and ensure passthrough (pointer equality) + tq.disable() + y2 = tq(x) + assert y2.data_ptr() == x.data_ptr() + assert "disabled" in repr(tq) + # Re-enable + tq.enable() + y3 = tq(x) + assert y3.shape == x.shape + assert "disabled" not in repr(tq) + + +def test_tensor_quantizer_scale_persistence(): + # Provide scale_shape so internal buffer is registered & updated + tq = TensorQuantizer(scale_shape=(4, 32), block_size=32) + x = torch.randn(4, 32) + # Use internal fake quant function to generate scale + q, shared_exp = tq._fake_quantize(x) + # scale buffer should have been updated (shape (4, 1)) + assert hasattr(tq, "scale") + assert tq.scale.shape == (4, 1) + # We cannot be certain of values, but at least ensure it is uint8 and not all zeros (likely) + assert tq.scale.dtype == torch.uint8 + # Heuristic: at least one non-zero (if all zero it may still be valid, but improbable) + assert (tq.scale != 0).any() or (shared_exp == 0).all() + From 089c2478962336ad5c8592b9627a99c2a73726ba Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 26 Sep 2025 03:22:04 +0000 Subject: [PATCH 07/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- test/3x/torch/algorithms/qat/test_quantizer_and_linear.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/3x/torch/algorithms/qat/test_quantizer_and_linear.py b/test/3x/torch/algorithms/qat/test_quantizer_and_linear.py index 7fced0768bf..8f5c6108ba8 100644 --- a/test/3x/torch/algorithms/qat/test_quantizer_and_linear.py +++ b/test/3x/torch/algorithms/qat/test_quantizer_and_linear.py @@ -163,4 +163,3 @@ def test_tensor_quantizer_scale_persistence(): assert tq.scale.dtype == torch.uint8 # Heuristic: at least one non-zero (if all zero it may still be valid, but improbable) assert (tq.scale != 0).any() or (shared_exp == 0).all() - From 6c0621d43f19e9a0419a9b62f0fa11205bd58d32 Mon Sep 17 00:00:00 2001 From: lkk Date: Fri, 26 Sep 2025 03:40:40 +0000 Subject: [PATCH 08/15] update `prepare_qat` entry. --- .../torch/quantization/quantize.py | 67 ++++++++++++------- 1 file changed, 43 insertions(+), 24 deletions(-) diff --git a/neural_compressor/torch/quantization/quantize.py b/neural_compressor/torch/quantization/quantize.py index dc86d210255..3600869c096 100644 --- a/neural_compressor/torch/quantization/quantize.py +++ b/neural_compressor/torch/quantization/quantize.py @@ -148,10 +148,9 @@ def quantize( @log_process(mode=Mode.PREPARE) def prepare( model: torch.nn.Module, - quant_config: BaseConfig | dict | None = None, + quant_config: BaseConfig, inplace: bool = True, example_inputs: Any = None, - qat: bool = False, ): """Prepare the model for calibration. @@ -166,29 +165,49 @@ def prepare( Returns: prepared and calibrated module. 
""" - if not qat: - prepared_model = model if inplace else copy.deepcopy(model) - prepared_model, configs_mapping = preprocess_quant_config( - prepared_model, quant_config, mode="prepare", example_inputs=example_inputs - ) - for algo_name, algo_func in algos_mapping.items(): - # select quantization algo according to config - if need_apply(configs_mapping, algo_name): - logger.info(f"Start to prepare model with {algo_name}.") - prepared_model = algo_func( - prepared_model, - configs_mapping, - example_inputs=example_inputs, - mode=Mode.PREPARE, - ) - setattr(prepared_model, "is_prepared", True) - setattr(prepared_model, "quant_config", quant_config) - setattr(prepared_model, "example_inputs", example_inputs) - return prepared_model - else: - from ..algorithms.qat.quant_utils import replace_with_quant_linear + prepared_model = model if inplace else copy.deepcopy(model) + prepared_model, configs_mapping = preprocess_quant_config( + prepared_model, quant_config, mode="prepare", example_inputs=example_inputs + ) + for algo_name, algo_func in algos_mapping.items(): + # select quantization algo according to config + if need_apply(configs_mapping, algo_name): + logger.info(f"Start to prepare model with {algo_name}.") + prepared_model = algo_func( + prepared_model, + configs_mapping, + example_inputs=example_inputs, + mode=Mode.PREPARE, + ) + setattr(prepared_model, "is_prepared", True) + setattr(prepared_model, "quant_config", quant_config) + setattr(prepared_model, "example_inputs", example_inputs) + return prepared_model + + +@log_process(mode=Mode.PREPARE) +def prepare_qat( + model: torch.nn.Module, + quant_config: dict, + inplace: bool = True, +): + r""" + Prepares a copy of the model for quantization calibration or + quantization-aware training and converts it to quantized version. + + Quantization configuration should be assigned preemptively + to individual submodules in `.qconfig` attribute. + + Args: + model: input model to be modified in-place + quant_config: quantization config that maps float modules to quantized modules to be + replaced. + inplace: carry out model transformations in-place, the original module + is mutated + """ + from ..algorithms.qat.quant_utils import replace_with_quant_linear - return replace_with_quant_linear(model, quant_config) + return replace_with_quant_linear(model, quant_config) @log_process(mode=Mode.CONVERT) From a1f8c3adda9a4e2e3db3174b39fc4487b27d72fb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 26 Sep 2025 03:42:41 +0000 Subject: [PATCH 09/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/torch/quantization/quantize.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/neural_compressor/torch/quantization/quantize.py b/neural_compressor/torch/quantization/quantize.py index 3600869c096..241bac9c196 100644 --- a/neural_compressor/torch/quantization/quantize.py +++ b/neural_compressor/torch/quantization/quantize.py @@ -191,8 +191,7 @@ def prepare_qat( quant_config: dict, inplace: bool = True, ): - r""" - Prepares a copy of the model for quantization calibration or + r"""Prepares a copy of the model for quantization calibration or quantization-aware training and converts it to quantized version. 
Quantization configuration should be assigned preemptively From fbe0918fb2041137b9100c7822542126a9e22931 Mon Sep 17 00:00:00 2001 From: lkk Date: Fri, 26 Sep 2025 05:26:37 +0000 Subject: [PATCH 10/15] update `prepare_qat` code style to align with torchao. --- .../torch/algorithms/qat/quant_utils.py | 24 +++++++++++++++++++ .../torch/quantization/config.py | 12 ++++++++++ .../torch/quantization/quantize.py | 12 +++++++--- 3 files changed, 45 insertions(+), 3 deletions(-) diff --git a/neural_compressor/torch/algorithms/qat/quant_utils.py b/neural_compressor/torch/algorithms/qat/quant_utils.py index 0f76650e8ab..0f156338f7a 100644 --- a/neural_compressor/torch/algorithms/qat/quant_utils.py +++ b/neural_compressor/torch/algorithms/qat/quant_utils.py @@ -60,6 +60,30 @@ def replace_with_quant_linear(model, quant_cfg=None): return model +def get_quant_config_with_scheme(scheme: str): + """get quantization config.""" + + try: + # use scheme definitions from AutoRound since we utilize the quantization functions now + from auto_round.schemes import preset_name_to_scheme + quant_cfg = preset_name_to_scheme(scheme) + return quant_cfg + except ImportError: + return None + + +def convert_model_with_mapping(model, mapping=None): + """process mapping to quant config.""" + # key is torch module, TODO: support more key format, like layer name. + for key in mapping: + # TODO: support more torch modules + if isinstance(key, nn.Linear): + quant_cfg = get_quant_config_with_scheme(mapping[key]) + if quant_cfg is None: + continue + replace_with_quant_linear(model, quant_cfg) + + def get_quant_config(scheme: str) -> dict[str, Any]: """Generate quantization config for a torch model. diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index 27e5a85551e..b6b78d3f62d 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -24,6 +24,7 @@ from typing import Callable, Dict, List, NamedTuple, Optional from typing import OrderedDict as OrderedDictType from typing import Tuple, Union +import copy import torch @@ -2167,3 +2168,14 @@ def get_config_set_for_tuning(cls, dtype="int8"): return cls._model_mapping[STATIC_QUANT].get_config_set_for_tuning() else: raise ValueError(f"Unsupported dtype: {dtype}, allowed values are 'fp8' and 'int8'.") + +# TODO: support more mappings configurations. 
+# Default map for swapping float module to qat modules +DEFAULT_QAT_MODULE_MAPPINGS: dict[Callable, Any] = { + torch.nn.Linear: "MXFP8", +} + +def get_default_qat_module_mappings() -> dict[Callable, Any]: + """Get default module mapping for quantization aware training""" + return copy.deepcopy(DEFAULT_QAT_MODULE_MAPPINGS) + diff --git a/neural_compressor/torch/quantization/quantize.py b/neural_compressor/torch/quantization/quantize.py index 241bac9c196..2d92bd63596 100644 --- a/neural_compressor/torch/quantization/quantize.py +++ b/neural_compressor/torch/quantization/quantize.py @@ -188,7 +188,7 @@ def prepare( @log_process(mode=Mode.PREPARE) def prepare_qat( model: torch.nn.Module, - quant_config: dict, + mapping=None, inplace: bool = True, ): r"""Prepares a copy of the model for quantization calibration or @@ -204,9 +204,15 @@ def prepare_qat( inplace: carry out model transformations in-place, the original module is mutated """ - from ..algorithms.qat.quant_utils import replace_with_quant_linear + assert model.training, "prepare_qat only works on models in training mode" - return replace_with_quant_linear(model, quant_config) + from .config import get_default_qat_module_mappings + if mapping is None: + mapping = get_default_qat_module_mappings() + + from ..algorithms.qat.quant_utils import convert_model_with_mapping + + return convert_model_with_mapping(model, mapping) @log_process(mode=Mode.CONVERT) From 4d7508f6d6b5b0d4bae5d39db79b73d8da84052f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 26 Sep 2025 05:28:18 +0000 Subject: [PATCH 11/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/torch/algorithms/qat/quant_utils.py | 9 +++++---- neural_compressor/torch/quantization/config.py | 7 ++++--- neural_compressor/torch/quantization/quantize.py | 1 + 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/neural_compressor/torch/algorithms/qat/quant_utils.py b/neural_compressor/torch/algorithms/qat/quant_utils.py index 0f156338f7a..b065f3605fb 100644 --- a/neural_compressor/torch/algorithms/qat/quant_utils.py +++ b/neural_compressor/torch/algorithms/qat/quant_utils.py @@ -61,19 +61,20 @@ def replace_with_quant_linear(model, quant_cfg=None): def get_quant_config_with_scheme(scheme: str): - """get quantization config.""" + """Get quantization config.""" try: # use scheme definitions from AutoRound since we utilize the quantization functions now from auto_round.schemes import preset_name_to_scheme + quant_cfg = preset_name_to_scheme(scheme) return quant_cfg except ImportError: return None - + def convert_model_with_mapping(model, mapping=None): - """process mapping to quant config.""" + """Process mapping to quant config.""" # key is torch module, TODO: support more key format, like layer name. for key in mapping: # TODO: support more torch modules @@ -83,7 +84,7 @@ def convert_model_with_mapping(model, mapping=None): continue replace_with_quant_linear(model, quant_cfg) - + def get_quant_config(scheme: str) -> dict[str, Any]: """Generate quantization config for a torch model. 
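For illustration, a minimal usage sketch of the mapping-based `prepare_qat` API introduced in PATCH 10 above. The toy model and the explicit mapping are placeholders, not part of the patch; the default mapping sends every torch.nn.Linear to the "MXFP8" scheme, which get_quant_config_with_scheme resolves through AutoRound's preset_name_to_scheme, so auto_round must be installed for any replacement to take effect.

    import torch.nn as nn

    from neural_compressor.torch.quantization.quantize import prepare_qat

    # Placeholder two-layer model; prepare_qat asserts model.training, and
    # freshly constructed nn.Module objects are already in training mode.
    model = nn.Sequential(nn.Linear(32, 64), nn.Linear(64, 2))

    # Equivalent to the default mapping: swap nn.Linear for QuantLinear
    # configured with the MXFP8 scheme. The model is modified in place.
    prepare_qat(model, mapping={nn.Linear: "MXFP8"})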
diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index b6b78d3f62d..fd79864a467 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -18,13 +18,13 @@ """Intel Neural Compressor Pytorch quantization config API.""" +import copy import importlib import json from collections import OrderedDict from typing import Callable, Dict, List, NamedTuple, Optional from typing import OrderedDict as OrderedDictType from typing import Tuple, Union -import copy import torch @@ -2169,13 +2169,14 @@ def get_config_set_for_tuning(cls, dtype="int8"): else: raise ValueError(f"Unsupported dtype: {dtype}, allowed values are 'fp8' and 'int8'.") + # TODO: support more mappings configurations. # Default map for swapping float module to qat modules DEFAULT_QAT_MODULE_MAPPINGS: dict[Callable, Any] = { torch.nn.Linear: "MXFP8", } + def get_default_qat_module_mappings() -> dict[Callable, Any]: - """Get default module mapping for quantization aware training""" + """Get default module mapping for quantization aware training.""" return copy.deepcopy(DEFAULT_QAT_MODULE_MAPPINGS) - diff --git a/neural_compressor/torch/quantization/quantize.py b/neural_compressor/torch/quantization/quantize.py index 2d92bd63596..84f770a4a71 100644 --- a/neural_compressor/torch/quantization/quantize.py +++ b/neural_compressor/torch/quantization/quantize.py @@ -207,6 +207,7 @@ def prepare_qat( assert model.training, "prepare_qat only works on models in training mode" from .config import get_default_qat_module_mappings + if mapping is None: mapping = get_default_qat_module_mappings() From 6d89e55154838e8e9cd25b9d615e6f4944c7222e Mon Sep 17 00:00:00 2001 From: lkk Date: Mon, 29 Sep 2025 02:37:38 +0000 Subject: [PATCH 12/15] add qat test ut. --- .../torch/algorithms/qat/quant_utils.py | 7 ++- .../torch/quantization/config.py | 2 +- test/3x/torch/algorithms/qat/test_qat.py | 62 +++++++++++++++++++ 3 files changed, 68 insertions(+), 3 deletions(-) create mode 100644 test/3x/torch/algorithms/qat/test_qat.py diff --git a/neural_compressor/torch/algorithms/qat/quant_utils.py b/neural_compressor/torch/algorithms/qat/quant_utils.py index b065f3605fb..ad32ee76806 100644 --- a/neural_compressor/torch/algorithms/qat/quant_utils.py +++ b/neural_compressor/torch/algorithms/qat/quant_utils.py @@ -26,6 +26,7 @@ import torch.nn as nn from .quant_linear import QuantLinear +from .tensor_quantizer import TensorQuantizer def convert(module: nn.Module, quant_cfg=None, quant_module=None): @@ -66,7 +67,6 @@ def get_quant_config_with_scheme(scheme: str): try: # use scheme definitions from AutoRound since we utilize the quantization functions now from auto_round.schemes import preset_name_to_scheme - quant_cfg = preset_name_to_scheme(scheme) return quant_cfg except ImportError: @@ -78,12 +78,15 @@ def convert_model_with_mapping(model, mapping=None): # key is torch module, TODO: support more key format, like layer name. for key in mapping: # TODO: support more torch modules - if isinstance(key, nn.Linear): + if key == nn.Linear: quant_cfg = get_quant_config_with_scheme(mapping[key]) if quant_cfg is None: continue replace_with_quant_linear(model, quant_cfg) + replaced_modules = sum(isinstance(m, TensorQuantizer) for _, m in model.named_modules()) + print(f"Inserted {replaced_modules} quantizers") + def get_quant_config(scheme: str) -> dict[str, Any]: """Generate quantization config for a torch model. 
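The `isinstance(key, nn.Linear)` to `key == nn.Linear` change in PATCH 12 above matters because the mapping keys are module classes, not module instances, so the old instance check could never match. A small sketch of the distinction, with illustrative names only:

    import torch.nn as nn

    mapping = {nn.Linear: "MXFP8"}
    layer = nn.Linear(8, 8)

    for key, scheme in mapping.items():
        # key is the class object nn.Linear itself ...
        assert key == nn.Linear
        # ... so isinstance(key, nn.Linear) is False: a class is not an instance of itself.
        assert not isinstance(key, nn.Linear)
        # Module instances inside the model are matched the other way around,
        # against the key class.
        assert isinstance(layer, key)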
diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index fd79864a467..bc7cd91e172 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -24,7 +24,7 @@ from collections import OrderedDict from typing import Callable, Dict, List, NamedTuple, Optional from typing import OrderedDict as OrderedDictType -from typing import Tuple, Union +from typing import Tuple, Union, Any import torch diff --git a/test/3x/torch/algorithms/qat/test_qat.py b/test/3x/torch/algorithms/qat/test_qat.py new file mode 100644 index 00000000000..5279c808bef --- /dev/null +++ b/test/3x/torch/algorithms/qat/test_qat.py @@ -0,0 +1,62 @@ +import math +import types +import torch +import torch.nn as nn +import pytest + +# Skip the whole module if auto_round (needed for get_quant_func inside TensorQuantizer) is not available +auto_round = pytest.importorskip("auto_round") + +from neural_compressor.torch.quantization.quantize import prepare_qat + + +def setup_seed(seed): + import numpy as np + import random + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + np.random.seed(seed) + random.seed(seed) + torch.backends.cudnn.deterministic = True + +class TinyModel(nn.Module): + """Simple hierarchical model for recursive replacement tests.""" + + def __init__(self): + super().__init__() + self.fc1 = nn.Linear(32, 64) + self.lm_head = nn.Linear(64, 2) + + def forward(self, x): + x = self.fc1(x) + return self.lm_head(x) + +def test_replace_quant_layer(): + """Check the inserted quant linear.""" + model = TinyModel() + + prepare_qat(model) + + replaced_modules = sum(isinstance(m, TensorQuantizer) for _, m in model.named_modules()) + + assert replaced_modules == 3 + + +def test_train(): + """QAT test.""" + setup_seed(20) + + model = TinyModel() + prepare_qat(model) + + inp = torch.randn([2, 32]) + + optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5) + + with torch.autocast(device_type="cpu", dtype=torch.bfloat16): + output = model(inp) + loss = output.mean() + + optimizer.zero_grad() + loss.backward() + optimizer.step() From 0551717ac25b047fd6987c40da7c77e93f4b08c6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 29 Sep 2025 02:39:29 +0000 Subject: [PATCH 13/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/torch/algorithms/qat/quant_utils.py | 3 ++- neural_compressor/torch/quantization/config.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/neural_compressor/torch/algorithms/qat/quant_utils.py b/neural_compressor/torch/algorithms/qat/quant_utils.py index ad32ee76806..bf99a36d5b3 100644 --- a/neural_compressor/torch/algorithms/qat/quant_utils.py +++ b/neural_compressor/torch/algorithms/qat/quant_utils.py @@ -26,7 +26,7 @@ import torch.nn as nn from .quant_linear import QuantLinear -from .tensor_quantizer import TensorQuantizer +from .tensor_quantizer import TensorQuantizer def convert(module: nn.Module, quant_cfg=None, quant_module=None): @@ -67,6 +67,7 @@ def get_quant_config_with_scheme(scheme: str): try: # use scheme definitions from AutoRound since we utilize the quantization functions now from auto_round.schemes import preset_name_to_scheme + quant_cfg = preset_name_to_scheme(scheme) return quant_cfg except ImportError: diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py 
index bc7cd91e172..883edc60f60 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -22,9 +22,9 @@ import importlib import json from collections import OrderedDict -from typing import Callable, Dict, List, NamedTuple, Optional +from typing import Any, Callable, Dict, List, NamedTuple, Optional from typing import OrderedDict as OrderedDictType -from typing import Tuple, Union, Any +from typing import Tuple, Union import torch From ece99c341a7f68c1a358992b03a73a6e5d912093 Mon Sep 17 00:00:00 2001 From: lkk Date: Mon, 29 Sep 2025 03:35:37 +0000 Subject: [PATCH 14/15] fix ut. --- test/3x/torch/algorithms/qat/test_qat.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/3x/torch/algorithms/qat/test_qat.py b/test/3x/torch/algorithms/qat/test_qat.py index 5279c808bef..4e2d270bef9 100644 --- a/test/3x/torch/algorithms/qat/test_qat.py +++ b/test/3x/torch/algorithms/qat/test_qat.py @@ -8,6 +8,7 @@ auto_round = pytest.importorskip("auto_round") from neural_compressor.torch.quantization.quantize import prepare_qat +from neural_compressor.torch.algorithms.qat.tensor_quantizer import TensorQuantizer def setup_seed(seed): From 1addd32c5c847b658e6d5b9d1734f51159fc6f43 Mon Sep 17 00:00:00 2001 From: lkk <33276950+lkk12014402@users.noreply.github.com> Date: Mon, 29 Sep 2025 15:41:11 +0800 Subject: [PATCH 15/15] update qat ut assert. --- test/3x/torch/algorithms/qat/test_qat.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/3x/torch/algorithms/qat/test_qat.py b/test/3x/torch/algorithms/qat/test_qat.py index 4e2d270bef9..83fc1dd4348 100644 --- a/test/3x/torch/algorithms/qat/test_qat.py +++ b/test/3x/torch/algorithms/qat/test_qat.py @@ -60,4 +60,8 @@ def test_train(): optimizer.zero_grad() loss.backward() + + # check the grad + for name, param in model.named_parameters(): + assert param.grad is not None optimizer.step()
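Taken together, the series supports a flow like the sketch below: swap in the QAT modules, run one bf16-autocast training step, and check that gradients reach the parameters, mirroring `test_train` above. The toy model and data are placeholders, auto_round is assumed to be installed, and the HF export path added earlier in the series is not shown.

    import torch
    import torch.nn as nn

    from neural_compressor.torch.quantization.quantize import prepare_qat
    from neural_compressor.torch.algorithms.qat.tensor_quantizer import TensorQuantizer

    model = nn.Sequential(nn.Linear(32, 64), nn.Linear(64, 2))  # placeholder, training mode by default
    prepare_qat(model)  # in place: nn.Linear is replaced by QuantLinear with MXFP8 quantizers

    # Sanity check that quantizers were actually inserted.
    assert any(isinstance(m, TensorQuantizer) for m in model.modules())

    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
    with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
        loss = model(torch.randn(2, 32)).mean()

    optimizer.zero_grad()
    loss.backward()
    assert all(p.grad is not None for p in model.parameters() if p.requires_grad)
    optimizer.step()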