From b34fb322f869007a51002a4b6e10fd0dcfecf920 Mon Sep 17 00:00:00 2001 From: lkk Date: Thu, 25 Sep 2025 07:16:46 +0000 Subject: [PATCH 01/15] add mxfp8 qat code, mxfp8fwd-bf16bwd. --- .../torch/algorithms/qat/__init__.py | 16 ++ .../torch/algorithms/qat/quant_linear.py | 81 ++++++++ .../torch/algorithms/qat/quant_utils.py | 123 ++++++++++++ .../torch/algorithms/qat/tensor_quantizer.py | 181 ++++++++++++++++++ neural_compressor/torch/export/export_hf.py | 112 +++++++++++ 5 files changed, 513 insertions(+) create mode 100644 neural_compressor/torch/algorithms/qat/__init__.py create mode 100644 neural_compressor/torch/algorithms/qat/quant_linear.py create mode 100644 neural_compressor/torch/algorithms/qat/quant_utils.py create mode 100644 neural_compressor/torch/algorithms/qat/tensor_quantizer.py create mode 100644 neural_compressor/torch/export/export_hf.py diff --git a/neural_compressor/torch/algorithms/qat/__init__.py b/neural_compressor/torch/algorithms/qat/__init__.py new file mode 100644 index 00000000000..d3bdaf8e760 --- /dev/null +++ b/neural_compressor/torch/algorithms/qat/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# pylint:disable=import-error +"""QAT (Quantization Aware Tuning).""" diff --git a/neural_compressor/torch/algorithms/qat/quant_linear.py b/neural_compressor/torch/algorithms/qat/quant_linear.py new file mode 100644 index 00000000000..911d082db55 --- /dev/null +++ b/neural_compressor/torch/algorithms/qat/quant_linear.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Quantized Linear.""" + + +import torch +import torch.nn as nn + +from .tensor_quantizer import TensorQuantizer + +class QuantLinear(nn.Module): + """Quantized version of nn.Linear.""" + + def forward(self, input: torch.Tensor): + """Add weight/input/output of quantization for the original forward method.""" + qw = self.weight_quantizer(self.weight) + qi = self.input_quantizer(input) + out = F.linear(qi, qw, self.bias) + out = self.output_quantizer(out) + return out + + def _setup(self, quant_cfg: "QuantizationSchem"): + """Init quantizer""" + self.weight_quantizer = TensorQuantizer( + data_type=quant_cfg.data_type, + block_size=quant_cfg.group_size, + bits=quant_cfg.bits, + sym=quant_cfg.sym, + if_quant=True, + learn_exponent=False, + ) + self.input_quantizer = TensorQuantizer( + data_type=quant_cfg.act_data_type, + block_size=quant_cfg.act_group_size, + bits=quant_cfg.act_bits, + sym=quant_cfg.act_sym, + if_quant=True, + learn_exponent=False, + ) + self.output_quantizer = TensorQuantizer( + data_type=quant_cfg.act_data_type, + block_size=quant_cfg.act_group_size, + bits=quant_cfg.act_bits, + sym=quant_cfg.act_sym, + if_quant=False, + ) + # Currently don't quant output + self.output_quantizer.disable() + + # TODO: remove + self.original_weight_dtype = None if self.weight is None else self.weight.dtype + + def extra_repr(self) -> str: + """Generate extra_repr making sure import keys exist in self.__dict__.""" + return f"in_features={self.in_features}, out_features={self.out_features}, bias={self.bias is not None}" + + def __repr__(self): + """Overriding the __repr__ method, makes the output more concise and meaningful.""" + return f"QuantLinear(\n" \ + f" in_features={self.in_features}, out_features={self.out_features}, bias={self.bias is not None}\n" \ + f" (input_quantizer): {self.input_quantizer}\n" \ + f" (output_quantizer): {self.output_quantizer}\n" \ + f" (weight_quantizer): {self.weight_quantizer}\n" \ + f")" diff --git a/neural_compressor/torch/algorithms/qat/quant_utils.py b/neural_compressor/torch/algorithms/qat/quant_utils.py new file mode 100644 index 00000000000..5bebdbbf3c5 --- /dev/null +++ b/neural_compressor/torch/algorithms/qat/quant_utils.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Utils for quantization""" + +import types +import torch +import torch.nn as nn +from typing import Any +from .quant_linear import QuantLinear + + +def convert(module: nn.Module, quant_cfg=None, quant_module=None): + """Convert the model to a quantized one with quant config""" + + # update class + original_cls = type(module) + module.__class__ = quant_module + module.forward = types.MethodType(quant_module.forward, module) + + # setup quantizers + module._setup(quant_cfg) + + return module + + +def replace_with_quant_linear(model, quant_cfg=None): + """Recursively replace the module with quantized module.""" + + # TODO: support more modules, like kv. + for name, child in model.named_children(): + if isinstance(child, nn.Linear): + if "lm_head" in name: + continue + # REPLACE on the parent (model), not on child + quantized = convert(child, quant_cfg, QuantLinear) + setattr(model, name, quantized) + + # now recurse into whichever module is now at `model.name` + replace_with_quant_linear(getattr(model, name), quant_cfg=quant_cfg) + + return model + + +def get_quant_config(scheme: str) -> dict[str, Any]: + """Generate quantization config for a torch model. + + Args: + model: The PyTorch model to analyze + + Returns: + Dictionary containing the quantization configuration + """ + + # TODO: support more quant config + try: + from auto_round.export.export_to_llmcompressor.config import initialize_quantization + quantization_config = initialize_quantization(scheme=scheme) + quantization_config = quantization_config.to_dict() + quantization_config["provider"] = "auto-round" + quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] = True + quantization_config["config_groups"]["group_0"]["input_activations"]["is_mx"] = True + + except ImportError: + quantization_config = None + + return quantization_config + + +def get_quantization_format(module) -> str | None: + """Gets the quantization string. + + Gets the quantization string by iterating through the module and its children. + The first non-None quantization string is returned. + """ + + def _get_quantization_from_layer(layer): + weight_quantizer = getattr(layer, "weight_quantizer", None) + input_quantizer = getattr(layer, "input_quantizer", None) + + if weight_quantizer is None or weight_quantizer._disabled: + return None + + # TODO: support more quant format + if weight_quantizer.num_bits == 8 and weight_quantizer.data_type == "mx_fp8": + return "MXFP8" + + # Raise error for unsupported num_bits + raise NotImplementedError( + f"Unsupported quantizer with num_bits: {weight_quantizer.num_bits}" + ) + + quantization = _get_quantization_from_layer(module) + if quantization != None: + return quantization + + for _, layer in module.named_children(): + format = get_quantization_format(layer) + if format != None: + return format + + return None + + +def is_quantlinear(module: nn.Module) -> bool: + """Returns whether the module is a quantized linear layer.""" + return "QuantLinear" in type(module).__name__ diff --git a/neural_compressor/torch/algorithms/qat/tensor_quantizer.py b/neural_compressor/torch/algorithms/qat/tensor_quantizer.py new file mode 100644 index 00000000000..668786bffa8 --- /dev/null +++ b/neural_compressor/torch/algorithms/qat/tensor_quantizer.py @@ -0,0 +1,181 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+# +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""TensorQuantizer Module.""" + +import torch +from torch import nn + +try: + from auto_round.data_type import get_quant_func +except ImportError: + get_quant_func = None + +class TensorQuantizer(nn.Module): + """Tensor quantizer module.""" + + def __init__( + self, + data_type="mx_fp8", + bits=8, + block_size=32, + sym=True, + if_quant=True, + learn_exponent=False, + amax=None, + scale_shape=None, + device=None, + ): + """Initialize quantizer and set up required variables.""" + super().__init__() + self.amax = amax + self.data_type = data_type + self.num_bits = bits + self.block_size = block_size + self.sym = sym + self._if_quant = if_quant + self.learn_exponent = learn_exponent + self._dequantize = False + self._input_dtype = None + self._fake_quant = True + + # enable quantizer + self.enable() + + assert get_quant_func is not None, ( + f"The quantization function is imported from AutoRound, please intall it. 'pip install auto-round'" + ) + + # self.data_type will be overided 'mx_fp' -> 'mx_fp8' + self.quant_func, self.data_type = get_quant_func(self.data_type, self.num_bits, self.sym) + + if scale_shape is not None: + # E8M0 scales (exponent) + self.register_buffer( + "scale", + torch.empty(scale_shape[0], scale_shape.shape[1] // self.block_size, dtype=torch.uint8, device=device), + ) + self.save_scale = True + else: + self.save_scale = False + + def forward(self, inputs: torch.Tensor): + """Apply tensor_quant function to inputs. + + Args: + inputs: A Tensor of type float32/float16/bfloat16. 
+ + Returns: + outputs: A Tensor of type output_dtype + """ + + if self._disabled or (not self._if_quant): + self._input_dtype = inputs.dtype + return inputs + + x = inputs + if not x.is_contiguous(): + x = x.contiguous() + + if self.fake_quant: + q = self._fake_quantize(x)[0] + else: + # TODO: add implementation + q = self._real_quantize(x) + + return q.to(inputs.dtype) + + def _fake_quantize(self, inputs: torch.Tensor): + """Fake quantization.""" + + # the shared_exp can be trainable + if self.learn_exponent: + q, shared_exp, _ = self.quant_func( + inputs, + bits=self.num_bits, + group_size=self.block_size, + data_type=self.data_type, + ) + else: + # wrapper no_grad, because the function includes extra trainable variables + with torch.no_grad(): + q, shared_exp, _ = self.quant_func( + inputs, + bits=self.num_bits, + group_size=self.block_size, + data_type=self.data_type, + ) + + # simple STE, since we add no_grad in the quant function + q = q.detach() + (inputs - inputs.detach()) + + if self.save_scale: + # TODO: PACK uint8 + self.scale.data.copy_(shared_exp.detach()) + + return q, shared_exp + + @property + def fake_quant(self): + """Return True if fake quantization is used.""" + return self._fake_quant + + def disable(self): + """Bypass the module.""" + self._disabled = True + + def enable(self): + """Enable the module.""" + self._disabled = False + + def weight_pack(self, weight, scale): + """pack weight and scale when saving.""" + original_shape = weight.shape + + # TODO: support more quantization format + if self.data_type == "mx_fp8": + qweight = (weight.reshape(-1, self.block_size) \ + / torch.exp2(scale.float()).reshape(-1, 1)).to(torch.float8_e4m3fn) + + e8m0_scale = (scale + 127).to(torch.uint8) + return qweight.reshape(original_shape), e8m0_scale + + def __repr__(self): + if self._disabled or not self._if_quant: + return "TensorQuantizer(disabled)" + + qformat_str = f"({self.data_type}) format" + bits_str = f"({self.num_bits}) bit" + + if self.block_size: + bs_str = f"block_size={self.block_size}" + else: + bs_str = "block_size=None" + + # amax + amax_str = f"amax={self.amax}" if self.amax is not None else "amax=?" + # fake / real + mode_str = "fake" if self._fake_quant else "real" + # sym + sym_str = "sym" if self.sym else "asym" + # quant enable + qflag = "quant" if self._if_quant else "no-quant" + + return f"TensorQuantizer({qformat_str} {bits_str} {mode_str} {bs_str}, {amax_str} {qflag})" diff --git a/neural_compressor/torch/export/export_hf.py b/neural_compressor/torch/export/export_hf.py new file mode 100644 index 00000000000..d78187bb2b0 --- /dev/null +++ b/neural_compressor/torch/export/export_hf.py @@ -0,0 +1,112 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Export quantized hf model to compatible formats""" + +import tempfile +from pathlib import Path +import warnings +from typing import Any +import torch +import torch.nn as nn + +def _export_quantized_weight( + sub_module: nn.Module, quantization_format: str = None, weight_name: str = "weight" +): + """For the given weight attr of the sub_module, export the quantization info of it. + + The export includes converting weight tensor to correct quantized values and quantized dtype, + and registering scaling factors. + """ + if quantization_format == None: + return + + weight: nn.Parameter = getattr(sub_module, weight_name) + weight_quantizer = getattr( + sub_module, "weight_quantizer" + ) + + qdq_weight, scale = weight_quantizer._fake_quantize(weight) + + # TODO: support more scale dtype when there are other quantization format except mxfp8/mxfp4 + quantized_weight, e8m0_scale = weight_quantizer.weight_pack(qdq_weight, scale) + + sub_module.register_buffer("weight_scale", e8m0_scale.reshape(*weight.shape[:-1], -1)) + + setattr(sub_module, weight_name, nn.Parameter(quantized_weight, requires_grad=False)) + +def _export_hf_checkpoint( + model: nn.Module, scheme: str | None = None +) -> tuple[dict[str, Any], dict[str, Any]]: + """Exports the torch model to the packed checkpoint with original HF naming. + + The packed checkpoint will be consumed by the TensorRT-LLM unified converter. + + Args: + model: the torch model. + dtype: the weights data type to export the unquantized layers or the default model data type if None. + + Returns: + post_state_dict: Dict containing quantized weights + quant_config: config information to export hf_quant_cfg.json + """ + + # Create a model layer pool + # If `model.model` exists use that, otherwise use `model` itself, e.g., Nemotron-H + root = getattr(model, "model", model) + # If that has a `.layers`, use it, otherwise fall back to the object itself + root = getattr(root, "layers", root) + layer_pool = {f"model.layers.{name}": sub_module for name, sub_module in root.named_modules()} + + from ..algorithms.qat.quant_utils import get_quant_config, get_quantization_format, is_quantlinear + # compressored config + quant_config = get_quant_config(scheme=scheme) + + for name, sub_module in layer_pool.items(): + quantization_format = get_quantization_format(sub_module) + if quantization_format != None: + if is_quantlinear(sub_module): + _export_quantized_weight(sub_module, quantization_format) + + quantized_state_dict = model.state_dict() + + + return quantized_state_dict, quant_config + + +def export_hf2compressored_model( + model: nn.Module, + export_dir: Path | str = tempfile.gettempdir(), + scheme: str = None +): + """Exports the torch model to the packed checkpoint with original HF naming. + + The packed checkpoint will be consumed by the VLLM. + """ + + export_dir = Path(export_dir) + export_dir.mkdir(parents=True, exist_ok=True) + + try: + _, quant_config = _export_hf_checkpoint(model, scheme) + model.save_pretrained(export_dir) + model.config.quantization_config = quant_config + model.config.save_pretrained(export_dir) + + except Exception as e: + warnings.warn( + "Cannot export model and config, the state" + " can be saved with torch.save for further inspection." 
+ ) + raise e + From 7f99561fb9da7718269a8200a18143722ee8f9c6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 25 Sep 2025 07:23:00 +0000 Subject: [PATCH 02/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../torch/algorithms/qat/quant_linear.py | 17 +++++----- .../torch/algorithms/qat/quant_utils.py | 13 ++++---- .../torch/algorithms/qat/tensor_quantizer.py | 15 ++++----- neural_compressor/torch/export/export_hf.py | 31 +++++++------------ 4 files changed, 36 insertions(+), 40 deletions(-) diff --git a/neural_compressor/torch/algorithms/qat/quant_linear.py b/neural_compressor/torch/algorithms/qat/quant_linear.py index 911d082db55..070d9085403 100644 --- a/neural_compressor/torch/algorithms/qat/quant_linear.py +++ b/neural_compressor/torch/algorithms/qat/quant_linear.py @@ -25,6 +25,7 @@ from .tensor_quantizer import TensorQuantizer + class QuantLinear(nn.Module): """Quantized version of nn.Linear.""" @@ -37,7 +38,7 @@ def forward(self, input: torch.Tensor): return out def _setup(self, quant_cfg: "QuantizationSchem"): - """Init quantizer""" + """Init quantizer.""" self.weight_quantizer = TensorQuantizer( data_type=quant_cfg.data_type, block_size=quant_cfg.group_size, @@ -73,9 +74,11 @@ def extra_repr(self) -> str: def __repr__(self): """Overriding the __repr__ method, makes the output more concise and meaningful.""" - return f"QuantLinear(\n" \ - f" in_features={self.in_features}, out_features={self.out_features}, bias={self.bias is not None}\n" \ - f" (input_quantizer): {self.input_quantizer}\n" \ - f" (output_quantizer): {self.output_quantizer}\n" \ - f" (weight_quantizer): {self.weight_quantizer}\n" \ - f")" + return ( + f"QuantLinear(\n" + f" in_features={self.in_features}, out_features={self.out_features}, bias={self.bias is not None}\n" + f" (input_quantizer): {self.input_quantizer}\n" + f" (output_quantizer): {self.output_quantizer}\n" + f" (weight_quantizer): {self.weight_quantizer}\n" + f")" + ) diff --git a/neural_compressor/torch/algorithms/qat/quant_utils.py b/neural_compressor/torch/algorithms/qat/quant_utils.py index 5bebdbbf3c5..ce5d8395c73 100644 --- a/neural_compressor/torch/algorithms/qat/quant_utils.py +++ b/neural_compressor/torch/algorithms/qat/quant_utils.py @@ -17,17 +17,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""Utils for quantization""" +"""Utils for quantization.""" import types +from typing import Any + import torch import torch.nn as nn -from typing import Any + from .quant_linear import QuantLinear def convert(module: nn.Module, quant_cfg=None, quant_module=None): - """Convert the model to a quantized one with quant config""" + """Convert the model to a quantized one with quant config.""" # update class original_cls = type(module) @@ -71,6 +73,7 @@ def get_quant_config(scheme: str) -> dict[str, Any]: # TODO: support more quant config try: from auto_round.export.export_to_llmcompressor.config import initialize_quantization + quantization_config = initialize_quantization(scheme=scheme) quantization_config = quantization_config.to_dict() quantization_config["provider"] = "auto-round" @@ -102,9 +105,7 @@ def _get_quantization_from_layer(layer): return "MXFP8" # Raise error for unsupported num_bits - raise NotImplementedError( - f"Unsupported quantizer with num_bits: {weight_quantizer.num_bits}" - ) + raise NotImplementedError(f"Unsupported quantizer with num_bits: {weight_quantizer.num_bits}") quantization = _get_quantization_from_layer(module) if quantization != None: diff --git a/neural_compressor/torch/algorithms/qat/tensor_quantizer.py b/neural_compressor/torch/algorithms/qat/tensor_quantizer.py index 668786bffa8..b4d51f2eae5 100644 --- a/neural_compressor/torch/algorithms/qat/tensor_quantizer.py +++ b/neural_compressor/torch/algorithms/qat/tensor_quantizer.py @@ -17,7 +17,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - """TensorQuantizer Module.""" import torch @@ -28,6 +27,7 @@ except ImportError: get_quant_func = None + class TensorQuantizer(nn.Module): """Tensor quantizer module.""" @@ -59,9 +59,9 @@ def __init__( # enable quantizer self.enable() - assert get_quant_func is not None, ( - f"The quantization function is imported from AutoRound, please intall it. 'pip install auto-round'" - ) + assert ( + get_quant_func is not None + ), "The quantization function is imported from AutoRound, please install it. 'pip install auto-round'" # self.data_type will be overided 'mx_fp' -> 'mx_fp8' self.quant_func, self.data_type = get_quant_func(self.data_type, self.num_bits, self.sym) @@ -146,13 +146,14 @@ def enable(self): self._disabled = False def weight_pack(self, weight, scale): - """pack weight and scale when saving.""" + """Pack weight and scale when saving.""" original_shape = weight.shape # TODO: support more quantization format if self.data_type == "mx_fp8": - qweight = (weight.reshape(-1, self.block_size) \ - / torch.exp2(scale.float()).reshape(-1, 1)).to(torch.float8_e4m3fn) + qweight = (weight.reshape(-1, self.block_size) / torch.exp2(scale.float()).reshape(-1, 1)).to( + torch.float8_e4m3fn + ) e8m0_scale = (scale + 127).to(torch.uint8) return qweight.reshape(original_shape), e8m0_scale diff --git a/neural_compressor/torch/export/export_hf.py b/neural_compressor/torch/export/export_hf.py index d78187bb2b0..4ed61b22859 100644 --- a/neural_compressor/torch/export/export_hf.py +++ b/neural_compressor/torch/export/export_hf.py @@ -11,18 +11,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""Export quantized hf model to compatible formats""" +"""Export quantized hf model to compatible formats.""" import tempfile -from pathlib import Path import warnings +from pathlib import Path from typing import Any + import torch import torch.nn as nn -def _export_quantized_weight( - sub_module: nn.Module, quantization_format: str = None, weight_name: str = "weight" -): + +def _export_quantized_weight(sub_module: nn.Module, quantization_format: str = None, weight_name: str = "weight"): """For the given weight attr of the sub_module, export the quantization info of it. The export includes converting weight tensor to correct quantized values and quantized dtype, @@ -32,9 +32,7 @@ def _export_quantized_weight( return weight: nn.Parameter = getattr(sub_module, weight_name) - weight_quantizer = getattr( - sub_module, "weight_quantizer" - ) + weight_quantizer = getattr(sub_module, "weight_quantizer") qdq_weight, scale = weight_quantizer._fake_quantize(weight) @@ -45,9 +43,8 @@ def _export_quantized_weight( setattr(sub_module, weight_name, nn.Parameter(quantized_weight, requires_grad=False)) -def _export_hf_checkpoint( - model: nn.Module, scheme: str | None = None -) -> tuple[dict[str, Any], dict[str, Any]]: + +def _export_hf_checkpoint(model: nn.Module, scheme: str | None = None) -> tuple[dict[str, Any], dict[str, Any]]: """Exports the torch model to the packed checkpoint with original HF naming. The packed checkpoint will be consumed by the TensorRT-LLM unified converter. @@ -69,6 +66,7 @@ def _export_hf_checkpoint( layer_pool = {f"model.layers.{name}": sub_module for name, sub_module in root.named_modules()} from ..algorithms.qat.quant_utils import get_quant_config, get_quantization_format, is_quantlinear + # compressored config quant_config = get_quant_config(scheme=scheme) @@ -80,15 +78,10 @@ def _export_hf_checkpoint( quantized_state_dict = model.state_dict() - return quantized_state_dict, quant_config -def export_hf2compressored_model( - model: nn.Module, - export_dir: Path | str = tempfile.gettempdir(), - scheme: str = None -): +def export_hf2compressored_model(model: nn.Module, export_dir: Path | str = tempfile.gettempdir(), scheme: str = None): """Exports the torch model to the packed checkpoint with original HF naming. The packed checkpoint will be consumed by the VLLM. @@ -105,8 +98,6 @@ def export_hf2compressored_model( except Exception as e: warnings.warn( - "Cannot export model and config, the state" - " can be saved with torch.save for further inspection." + "Cannot export model and config, the state" " can be saved with torch.save for further inspection." ) raise e - From b6d74ae6ce6ea25397d66a59d3711889d085edfb Mon Sep 17 00:00:00 2001 From: lkk Date: Thu, 25 Sep 2025 08:56:50 +0000 Subject: [PATCH 03/15] fix comments. 
--- .../torch/algorithms/qat/__init__.py | 2 +- .../torch/algorithms/qat/quant_linear.py | 3 +- .../torch/algorithms/qat/quant_utils.py | 6 +-- .../torch/algorithms/qat/tensor_quantizer.py | 9 ++-- neural_compressor/torch/export/export_hf.py | 10 ++--- .../torch/quantization/quantize.py | 43 +++++++++++-------- 6 files changed, 41 insertions(+), 32 deletions(-) diff --git a/neural_compressor/torch/algorithms/qat/__init__.py b/neural_compressor/torch/algorithms/qat/__init__.py index d3bdaf8e760..e4c8a62c491 100644 --- a/neural_compressor/torch/algorithms/qat/__init__.py +++ b/neural_compressor/torch/algorithms/qat/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024 Intel Corporation +# Copyright (c) 2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/neural_compressor/torch/algorithms/qat/quant_linear.py b/neural_compressor/torch/algorithms/qat/quant_linear.py index 070d9085403..50107b9db70 100644 --- a/neural_compressor/torch/algorithms/qat/quant_linear.py +++ b/neural_compressor/torch/algorithms/qat/quant_linear.py @@ -4,7 +4,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. # -# Copyright (c) 2024 Intel Corporation +# Copyright (c) 2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ import torch import torch.nn as nn +import torch.nn.functional as F from .tensor_quantizer import TensorQuantizer diff --git a/neural_compressor/torch/algorithms/qat/quant_utils.py b/neural_compressor/torch/algorithms/qat/quant_utils.py index ce5d8395c73..0f76650e8ab 100644 --- a/neural_compressor/torch/algorithms/qat/quant_utils.py +++ b/neural_compressor/torch/algorithms/qat/quant_utils.py @@ -4,7 +4,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. # -# Copyright (c) 2024 Intel Corporation +# Copyright (c) 2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -108,12 +108,12 @@ def _get_quantization_from_layer(layer): raise NotImplementedError(f"Unsupported quantizer with num_bits: {weight_quantizer.num_bits}") quantization = _get_quantization_from_layer(module) - if quantization != None: + if quantization is not None: return quantization for _, layer in module.named_children(): format = get_quantization_format(layer) - if format != None: + if format is not None: return format return None diff --git a/neural_compressor/torch/algorithms/qat/tensor_quantizer.py b/neural_compressor/torch/algorithms/qat/tensor_quantizer.py index b4d51f2eae5..e8c0badad28 100644 --- a/neural_compressor/torch/algorithms/qat/tensor_quantizer.py +++ b/neural_compressor/torch/algorithms/qat/tensor_quantizer.py @@ -4,7 +4,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. # -# Copyright (c) 2024 Intel Corporation +# Copyright (c) 2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -70,7 +70,7 @@ def __init__( # E8M0 scales (exponent) self.register_buffer( "scale", - torch.empty(scale_shape[0], scale_shape.shape[1] // self.block_size, dtype=torch.uint8, device=device), + torch.empty(scale_shape[0], scale_shape[1] // self.block_size, dtype=torch.uint8, device=device), ) self.save_scale = True else: @@ -132,6 +132,9 @@ def _fake_quantize(self, inputs: torch.Tensor): return q, shared_exp + def _real_quantize(self, inputs: torch.Tensor): + raise NotImplementedError("This method hasn't be implemented.") + @property def fake_quant(self): """Return True if fake quantization is used.""" @@ -156,7 +159,7 @@ def weight_pack(self, weight, scale): ) e8m0_scale = (scale + 127).to(torch.uint8) - return qweight.reshape(original_shape), e8m0_scale + return qweight.reshape(original_shape), e8m0_scale.reshape(original_shape[0], -1) def __repr__(self): if self._disabled or not self._if_quant: diff --git a/neural_compressor/torch/export/export_hf.py b/neural_compressor/torch/export/export_hf.py index 4ed61b22859..d3ce4bdde09 100644 --- a/neural_compressor/torch/export/export_hf.py +++ b/neural_compressor/torch/export/export_hf.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024 Intel Corporation +# Copyright (c) 2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -28,7 +28,7 @@ def _export_quantized_weight(sub_module: nn.Module, quantization_format: str = N The export includes converting weight tensor to correct quantized values and quantized dtype, and registering scaling factors. """ - if quantization_format == None: + if quantization_format is None: return weight: nn.Parameter = getattr(sub_module, weight_name) @@ -39,7 +39,7 @@ def _export_quantized_weight(sub_module: nn.Module, quantization_format: str = N # TODO: support more scale dtype when there are other quantization format except mxfp8/mxfp4 quantized_weight, e8m0_scale = weight_quantizer.weight_pack(qdq_weight, scale) - sub_module.register_buffer("weight_scale", e8m0_scale.reshape(*weight.shape[:-1], -1)) + sub_module.register_buffer("weight_scale", e8m0_scale) setattr(sub_module, weight_name, nn.Parameter(quantized_weight, requires_grad=False)) @@ -72,7 +72,7 @@ def _export_hf_checkpoint(model: nn.Module, scheme: str | None = None) -> tuple[ for name, sub_module in layer_pool.items(): quantization_format = get_quantization_format(sub_module) - if quantization_format != None: + if quantization_format is not None: if is_quantlinear(sub_module): _export_quantized_weight(sub_module, quantization_format) @@ -98,6 +98,6 @@ def export_hf2compressored_model(model: nn.Module, export_dir: Path | str = temp except Exception as e: warnings.warn( - "Cannot export model and config, the state" " can be saved with torch.save for further inspection." + "Cannot export model and config, the state can be saved with torch.save for further inspection." ) raise e diff --git a/neural_compressor/torch/quantization/quantize.py b/neural_compressor/torch/quantization/quantize.py index a313220c43e..5311bdc318a 100644 --- a/neural_compressor/torch/quantization/quantize.py +++ b/neural_compressor/torch/quantization/quantize.py @@ -148,9 +148,10 @@ def quantize( @log_process(mode=Mode.PREPARE) def prepare( model: torch.nn.Module, - quant_config: BaseConfig, + quant_config: BaseConfig | dict | None = None, inplace: bool = True, example_inputs: Any = None, + qat: bool = False ): """Prepare the model for calibration. 
@@ -165,24 +166,28 @@ def prepare( Returns: prepared and calibrated module. """ - prepared_model = model if inplace else copy.deepcopy(model) - prepared_model, configs_mapping = preprocess_quant_config( - prepared_model, quant_config, mode="prepare", example_inputs=example_inputs - ) - for algo_name, algo_func in algos_mapping.items(): - # select quantization algo according to config - if need_apply(configs_mapping, algo_name): - logger.info(f"Start to prepare model with {algo_name}.") - prepared_model = algo_func( - prepared_model, - configs_mapping, - example_inputs=example_inputs, - mode=Mode.PREPARE, - ) - setattr(prepared_model, "is_prepared", True) - setattr(prepared_model, "quant_config", quant_config) - setattr(prepared_model, "example_inputs", example_inputs) - return prepared_model + if not qat: + prepared_model = model if inplace else copy.deepcopy(model) + prepared_model, configs_mapping = preprocess_quant_config( + prepared_model, quant_config, mode="prepare", example_inputs=example_inputs + ) + for algo_name, algo_func in algos_mapping.items(): + # select quantization algo according to config + if need_apply(configs_mapping, algo_name): + logger.info(f"Start to prepare model with {algo_name}.") + prepared_model = algo_func( + prepared_model, + configs_mapping, + example_inputs=example_inputs, + mode=Mode.PREPARE, + ) + setattr(prepared_model, "is_prepared", True) + setattr(prepared_model, "quant_config", quant_config) + setattr(prepared_model, "example_inputs", example_inputs) + return prepared_model + else: + from ..algorithms.qat.quant_utils import replace_with_quant_linear + return replace_with_quant_linear(model, quant_config) @log_process(mode=Mode.CONVERT) From c9a002642f2554bc19486514dcb663ce449f1c15 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 25 Sep 2025 08:59:13 +0000 Subject: [PATCH 04/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/torch/export/export_hf.py | 4 +--- neural_compressor/torch/quantization/quantize.py | 3 ++- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/neural_compressor/torch/export/export_hf.py b/neural_compressor/torch/export/export_hf.py index d3ce4bdde09..609a4b534e5 100644 --- a/neural_compressor/torch/export/export_hf.py +++ b/neural_compressor/torch/export/export_hf.py @@ -97,7 +97,5 @@ def export_hf2compressored_model(model: nn.Module, export_dir: Path | str = temp model.config.save_pretrained(export_dir) except Exception as e: - warnings.warn( - "Cannot export model and config, the state can be saved with torch.save for further inspection." - ) + warnings.warn("Cannot export model and config, the state can be saved with torch.save for further inspection.") raise e diff --git a/neural_compressor/torch/quantization/quantize.py b/neural_compressor/torch/quantization/quantize.py index 5311bdc318a..dc86d210255 100644 --- a/neural_compressor/torch/quantization/quantize.py +++ b/neural_compressor/torch/quantization/quantize.py @@ -151,7 +151,7 @@ def prepare( quant_config: BaseConfig | dict | None = None, inplace: bool = True, example_inputs: Any = None, - qat: bool = False + qat: bool = False, ): """Prepare the model for calibration. 
@@ -187,6 +187,7 @@ def prepare( return prepared_model else: from ..algorithms.qat.quant_utils import replace_with_quant_linear + return replace_with_quant_linear(model, quant_config) From 1651d71b5cd3a1e82d0bfbde1223b12952911037 Mon Sep 17 00:00:00 2001 From: lkk Date: Fri, 26 Sep 2025 02:29:10 +0000 Subject: [PATCH 05/15] fix code style. --- neural_compressor/torch/algorithms/qat/quant_linear.py | 2 +- neural_compressor/torch/export/export_hf.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/neural_compressor/torch/algorithms/qat/quant_linear.py b/neural_compressor/torch/algorithms/qat/quant_linear.py index 50107b9db70..2858c0f9420 100644 --- a/neural_compressor/torch/algorithms/qat/quant_linear.py +++ b/neural_compressor/torch/algorithms/qat/quant_linear.py @@ -38,7 +38,7 @@ def forward(self, input: torch.Tensor): out = self.output_quantizer(out) return out - def _setup(self, quant_cfg: "QuantizationSchem"): + def _setup(self, quant_cfg): """Init quantizer.""" self.weight_quantizer = TensorQuantizer( data_type=quant_cfg.data_type, diff --git a/neural_compressor/torch/export/export_hf.py b/neural_compressor/torch/export/export_hf.py index 609a4b534e5..e617ae122a9 100644 --- a/neural_compressor/torch/export/export_hf.py +++ b/neural_compressor/torch/export/export_hf.py @@ -57,7 +57,6 @@ def _export_hf_checkpoint(model: nn.Module, scheme: str | None = None) -> tuple[ post_state_dict: Dict containing quantized weights quant_config: config information to export hf_quant_cfg.json """ - # Create a model layer pool # If `model.model` exists use that, otherwise use `model` itself, e.g., Nemotron-H root = getattr(model, "model", model) @@ -86,7 +85,6 @@ def export_hf2compressored_model(model: nn.Module, export_dir: Path | str = temp The packed checkpoint will be consumed by the VLLM. """ - export_dir = Path(export_dir) export_dir.mkdir(parents=True, exist_ok=True) From fcf4b86368b3f5569eda007dde67c29bced3cc93 Mon Sep 17 00:00:00 2001 From: lkk Date: Fri, 26 Sep 2025 03:20:25 +0000 Subject: [PATCH 06/15] add unit tests. 
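
The tests cover the module swap and the quantizer math in isolation; the end-to-end flow they exercise looks roughly like the sketch below (illustrative only: the model id is a placeholder, auto-round must be installed, the scheme string passed to the exporter is an assumption, and quant_cfg is any object exposing the fields QuantLinear._setup reads, the same SimpleNamespace shape the tests use):

    import types
    import torch
    from transformers import AutoModelForCausalLM

    from neural_compressor.torch.quantization.quantize import prepare
    from neural_compressor.torch.export.export_hf import export_hf2compressored_model

    quant_cfg = types.SimpleNamespace(
        data_type="mx_fp8", bits=8, group_size=32, sym=True,
        act_data_type="mx_fp8", act_bits=8, act_group_size=32, act_sym=True,
    )

    model = AutoModelForCausalLM.from_pretrained("<model-id>", torch_dtype=torch.bfloat16)

    # Swap nn.Linear (except lm_head) for QuantLinear via the qat entry added earlier in this series.
    model = prepare(model, quant_cfg, qat=True)

    # ... fine-tune as usual: the forward pass sees MXFP8 fake-quantized weights and
    # activations, while gradients flow in bf16 through the straight-through estimator ...

    # Pack weights to float8_e4m3fn plus E8M0 scales and save an HF-style checkpoint.
    export_hf2compressored_model(model, export_dir="qat_mxfp8_ckpt", scheme="MXFP8")
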
--- .../torch/algorithms/qat/test_quant_utils.py | 208 ++++++++++++++++++ .../qat/test_quantizer_and_linear.py | 166 ++++++++++++++ 2 files changed, 374 insertions(+) create mode 100644 test/3x/torch/algorithms/qat/test_quant_utils.py create mode 100644 test/3x/torch/algorithms/qat/test_quantizer_and_linear.py diff --git a/test/3x/torch/algorithms/qat/test_quant_utils.py b/test/3x/torch/algorithms/qat/test_quant_utils.py new file mode 100644 index 00000000000..cca51126caf --- /dev/null +++ b/test/3x/torch/algorithms/qat/test_quant_utils.py @@ -0,0 +1,208 @@ +# -*- coding: utf-8 -*- + +import sys +import types +import importlib +from types import SimpleNamespace +from pathlib import Path + +import pytest +import torch +import torch.nn as nn + +from neural_compressor.torch.algorithms.qat import quant_utils + + +from neural_compressor.torch.algorithms.qat.tensor_quantizer import TensorQuantizer # type: ignore +from neural_compressor.torch.algorithms.qat.quant_linear import QuantLinear + + +class TinyModel(nn.Module): + """Simple hierarchical model for recursive replacement tests.""" + + def __init__(self): + super().__init__() + self.fc1 = nn.Linear(16, 8) + self.block = nn.Sequential( + nn.Linear(8, 8), + nn.ReLU(), + nn.Linear(8, 4), + ) + self.lm_head = nn.Linear(4, 2) + + def forward(self, x): + x = self.fc1(x) + x = self.block(x) + return self.lm_head(x) + + +@pytest.fixture +def sample_input(): + return torch.randn(2, 16) + +def make_quant_cfg( + *, + data_type="mx_fp8", + bits=8, + group_size=32, + sym=True, + act_data_type="mx_fp8", + act_bits=8, + act_group_size=32, + act_sym=True, +): + """ + Build a lightweight namespace mimicking the attributes QuantLinear._setup expects. + """ + return types.SimpleNamespace( + data_type=data_type, + bits=bits, + group_size=group_size, + sym=sym, + act_data_type=act_data_type, + act_bits=act_bits, + act_group_size=act_group_size, + act_sym=act_sym, + ) + +@pytest.fixture +def quant_cfg(): + return make_quant_cfg() + + +def test_convert_replaces_class_and_calls_setup(monkeypatch, quant_cfg): + linear = nn.Linear(4, 3) + + original_forward_id = id(QuantLinear.forward) + + quant_utils.convert(linear, quant_cfg=quant_cfg, quant_module=QuantLinear) + + assert isinstance(linear, QuantLinear) + assert hasattr(linear.forward, "__self__") and linear.forward.__self__ is linear + assert linear.forward.__func__ is QuantLinear.forward or id(linear.forward.__func__) == original_forward_id + + +def test_replace_with_quant_linear_recursive(monkeypatch, quant_cfg): + model = TinyModel() + + + quant_utils.replace_with_quant_linear(model, quant_cfg=quant_cfg) + + assert isinstance(model.fc1, QuantLinear) + assert isinstance(model.block[0], QuantLinear) + assert isinstance(model.block[2], QuantLinear) + assert isinstance(model.lm_head, nn.Linear) + + +def test_is_quantlinear_positive_and_negative(): + q = QuantLinear() + plain = nn.Linear(4, 2) + assert quant_utils.is_quantlinear(q) is True + assert quant_utils.is_quantlinear(plain) is False + + +def test_get_quantization_format_positive(monkeypatch): + layer = QuantLinear() + + layer.weight_quantizer = TensorQuantizer(bits=8, data_type="mx_fp8") + layer.weight_quantizer._disabled = False + layer.input_quantizer = TensorQuantizer(bits=8, data_type="mx_fp8") + layer.input_quantizer._disabled = False + + layer.weight = None + fmt = quant_utils.get_quantization_format(layer) + assert fmt == "MXFP8" + + +def test_get_quantization_format_none(): + layer = nn.Linear(4, 2) + fmt = quant_utils.get_quantization_format(layer) 
+ assert fmt is None + + +def test_get_quantization_format_unsupported_bits_raises(): + layer = QuantLinear() + layer.weight_quantizer = TensorQuantizer(bits=4, data_type="mx_fp8") + layer.weight_quantizer._disabled = False + layer.input_quantizer = TensorQuantizer(bits=4, data_type="mx_fp8") + layer.input_quantizer._disabled = False + + with pytest.raises(NotImplementedError): + quant_utils.get_quantization_format(layer) + + +def test_get_quant_config_success(monkeypatch): + # dynamic fake module: auto_round.export.export_to_llmcompressor.config + module_name = "auto_round.export.export_to_llmcompressor.config" + + class DummyQuantCfg: + def __init__(self): + self.data = { + "provider": "dummy", + "config_groups": { + "group_0": { + "weights": {}, + "input_activations": {}, + } + }, + } + + def to_dict(self): + return self.data + + def initialize_quantization(scheme: str): + return DummyQuantCfg() + + # auto_round + auto_round = types.ModuleType("auto_round") + export = types.ModuleType("auto_round.export") + export_to = types.ModuleType("auto_round.export.export_to_llmcompressor") + config_mod = types.ModuleType(module_name) + config_mod.initialize_quantization = initialize_quantization + + sys.modules["auto_round"] = auto_round + sys.modules["auto_round.export"] = export + sys.modules["auto_round.export.export_to_llmcompressor"] = export_to + sys.modules[module_name] = config_mod + + cfg = quant_utils.get_quant_config(scheme="mxfp8") + assert isinstance(cfg, dict) + assert cfg["provider"] == "auto-round" + assert cfg["config_groups"]["group_0"]["weights"]["is_mx"] is True + assert cfg["config_groups"]["group_0"]["input_activations"]["is_mx"] is True + + +def test_convert_forward_executes(monkeypatch): + linear = nn.Linear(5, 3) + + def fake_forward(self, x): + return torch.zeros(x.shape[0], 3) + + monkeypatch.setattr(QuantLinear, "forward", fake_forward, raising=True) + + quant_utils.convert(linear, quant_cfg=make_quant_cfg(), quant_module=QuantLinear) + out = linear(torch.randn(2, 5)) + assert out.shape == (2, 3) + assert torch.all(out == 0) + + +def test_replace_with_quant_linear_idempotent(quant_cfg): + model = TinyModel() + quant_utils.replace_with_quant_linear(model, quant_cfg=quant_cfg) + quant_utils.replace_with_quant_linear(model, quant_cfg=quant_cfg) + assert isinstance(model.fc1, QuantLinear) + + +@pytest.mark.parametrize("disabled", [True, False]) +def test_get_quantization_format_disabled_returns_none(disabled): + layer = QuantLinear() + layer.weight_quantizer = TensorQuantizer(bits=8, data_type="mx_fp8") + layer.weight_quantizer._disabled = disabled + layer.input_quantizer = TensorQuantizer(bits=8, data_type="mx_fp8") + layer.input_quantizer._disabled = disabled + + fmt = quant_utils.get_quantization_format(layer) + if disabled: + assert fmt is None + else: + assert fmt == "MXFP8" diff --git a/test/3x/torch/algorithms/qat/test_quantizer_and_linear.py b/test/3x/torch/algorithms/qat/test_quantizer_and_linear.py new file mode 100644 index 00000000000..7fced0768bf --- /dev/null +++ b/test/3x/torch/algorithms/qat/test_quantizer_and_linear.py @@ -0,0 +1,166 @@ +import math +import types +import torch +import pytest +import torch.nn as nn + +# Skip the whole module if auto_round (needed for get_quant_func inside TensorQuantizer) is not available +auto_round = pytest.importorskip("auto_round") + +from neural_compressor.torch.algorithms.qat.quant_linear import QuantLinear +from neural_compressor.torch.algorithms.qat.tensor_quantizer import TensorQuantizer + +def make_quant_cfg( + *, 
+ data_type="mx_fp8", + bits=8, + group_size=32, + sym=True, + act_data_type="mx_fp8", + act_bits=8, + act_group_size=32, + act_sym=True, +): + """ + Build a lightweight namespace mimicking the attributes QuantLinear._setup expects. + """ + return types.SimpleNamespace( + data_type=data_type, + bits=bits, + group_size=group_size, + sym=sym, + act_data_type=act_data_type, + act_bits=act_bits, + act_group_size=act_group_size, + act_sym=act_sym, + ) + + +def build_quant_linear(in_features=32, out_features=16, bias=True, quant_cfg=None, device="cpu", dtype=torch.float32): + """ + Manually construct a QuantLinear since the class does not define an __init__. + + Steps: + 1. Instantiate the module + 2. Register parameter tensors (weight, bias) + 3. Add metadata attributes used by extra_repr / repr + 4. Call internal _setup with provided quant config + """ + if quant_cfg is None: + quant_cfg = make_quant_cfg(group_size=32, act_group_size=32) + + ql = QuantLinear() + ql.in_features = in_features + ql.out_features = out_features + + weight = torch.randn(out_features, in_features, device=device, dtype=dtype) + ql.register_parameter("weight", nn.Parameter(weight)) + + if bias: + b = torch.randn(out_features, device=device, dtype=dtype) + ql.register_parameter("bias", nn.Parameter(b)) + else: + ql.bias = None # make sure attribute exists + + ql._setup(quant_cfg) + return ql + + +@pytest.mark.parametrize("bias", [True, False]) +def test_quant_linear_forward_and_backward(bias): + torch.manual_seed(42) + + in_features = 32 + out_features = 16 + batch_size = 3 + + ql = build_quant_linear(in_features=in_features, out_features=out_features, bias=bias) + + # Create a deliberately non-contiguous input (transpose trick) + base = torch.randn(in_features, batch_size) + x = base.t() # shape (batch_size, in_features) but non-contiguous + assert not x.is_contiguous() + + x.requires_grad_(True) + out = ql(x) + + # Shape & dtype checks + assert out.shape == (batch_size, out_features) + assert out.dtype == x.dtype + + # Backward pass + loss = out.sum() + loss.backward() + + assert ql.weight.grad is not None, "Weight should receive gradient through fake quant path" + if bias: + assert ql.bias.grad is not None, "Bias should receive gradient" + else: + assert ql.bias is None + + # Ensure original weight dtype tracked + assert ql.original_weight_dtype == ql.weight.dtype + + # Output quantizer is explicitly disabled in _setup + assert "TensorQuantizer(disabled)" in repr(ql.output_quantizer) + + # Input/weight quantizers should be enabled (not containing 'disabled') + assert "disabled" not in repr(ql.input_quantizer) + assert "disabled" not in repr(ql.weight_quantizer) + + +def test_quant_linear_repr_and_extra_repr(): + ql = build_quant_linear(in_features=8, out_features=4, bias=True) + r = repr(ql) + # Basic structural checks + assert "QuantLinear(" in r + assert "(input_quantizer):" in r + assert "(weight_quantizer):" in r + assert "(output_quantizer):" in r + # extra_repr path + er = ql.extra_repr() + assert "in_features=8" in er + assert "out_features=4" in er + assert "bias=True" in er + + +def test_tensor_quantizer_disable_and_no_quant_path(): + tq = TensorQuantizer(if_quant=False) # constructed with quantization turned off + x = torch.randn(5, 7) + out = tq(x) + # When disabled (not quant) it should return the identical object (same memory) + assert out.data_ptr() == x.data_ptr() + assert repr(tq) == "TensorQuantizer(disabled)" + + +def test_tensor_quantizer_enable_disable_cycle(): + tq = TensorQuantizer() + x = 
torch.randn(4, 32) # group size default 32, matches last dim + y1 = tq(x) + assert y1.shape == x.shape + # Disable and ensure passthrough (pointer equality) + tq.disable() + y2 = tq(x) + assert y2.data_ptr() == x.data_ptr() + assert "disabled" in repr(tq) + # Re-enable + tq.enable() + y3 = tq(x) + assert y3.shape == x.shape + assert "disabled" not in repr(tq) + + +def test_tensor_quantizer_scale_persistence(): + # Provide scale_shape so internal buffer is registered & updated + tq = TensorQuantizer(scale_shape=(4, 32), block_size=32) + x = torch.randn(4, 32) + # Use internal fake quant function to generate scale + q, shared_exp = tq._fake_quantize(x) + # scale buffer should have been updated (shape (4, 1)) + assert hasattr(tq, "scale") + assert tq.scale.shape == (4, 1) + # We cannot be certain of values, but at least ensure it is uint8 and not all zeros (likely) + assert tq.scale.dtype == torch.uint8 + # Heuristic: at least one non-zero (if all zero it may still be valid, but improbable) + assert (tq.scale != 0).any() or (shared_exp == 0).all() + From 089c2478962336ad5c8592b9627a99c2a73726ba Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 26 Sep 2025 03:22:04 +0000 Subject: [PATCH 07/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- test/3x/torch/algorithms/qat/test_quantizer_and_linear.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/3x/torch/algorithms/qat/test_quantizer_and_linear.py b/test/3x/torch/algorithms/qat/test_quantizer_and_linear.py index 7fced0768bf..8f5c6108ba8 100644 --- a/test/3x/torch/algorithms/qat/test_quantizer_and_linear.py +++ b/test/3x/torch/algorithms/qat/test_quantizer_and_linear.py @@ -163,4 +163,3 @@ def test_tensor_quantizer_scale_persistence(): assert tq.scale.dtype == torch.uint8 # Heuristic: at least one non-zero (if all zero it may still be valid, but improbable) assert (tq.scale != 0).any() or (shared_exp == 0).all() - From 6c0621d43f19e9a0419a9b62f0fa11205bd58d32 Mon Sep 17 00:00:00 2001 From: lkk Date: Fri, 26 Sep 2025 03:40:40 +0000 Subject: [PATCH 08/15] update `prepare_qat` entry. --- .../torch/quantization/quantize.py | 67 ++++++++++++------- 1 file changed, 43 insertions(+), 24 deletions(-) diff --git a/neural_compressor/torch/quantization/quantize.py b/neural_compressor/torch/quantization/quantize.py index dc86d210255..3600869c096 100644 --- a/neural_compressor/torch/quantization/quantize.py +++ b/neural_compressor/torch/quantization/quantize.py @@ -148,10 +148,9 @@ def quantize( @log_process(mode=Mode.PREPARE) def prepare( model: torch.nn.Module, - quant_config: BaseConfig | dict | None = None, + quant_config: BaseConfig, inplace: bool = True, example_inputs: Any = None, - qat: bool = False, ): """Prepare the model for calibration. @@ -166,29 +165,49 @@ def prepare( Returns: prepared and calibrated module. 
""" - if not qat: - prepared_model = model if inplace else copy.deepcopy(model) - prepared_model, configs_mapping = preprocess_quant_config( - prepared_model, quant_config, mode="prepare", example_inputs=example_inputs - ) - for algo_name, algo_func in algos_mapping.items(): - # select quantization algo according to config - if need_apply(configs_mapping, algo_name): - logger.info(f"Start to prepare model with {algo_name}.") - prepared_model = algo_func( - prepared_model, - configs_mapping, - example_inputs=example_inputs, - mode=Mode.PREPARE, - ) - setattr(prepared_model, "is_prepared", True) - setattr(prepared_model, "quant_config", quant_config) - setattr(prepared_model, "example_inputs", example_inputs) - return prepared_model - else: - from ..algorithms.qat.quant_utils import replace_with_quant_linear + prepared_model = model if inplace else copy.deepcopy(model) + prepared_model, configs_mapping = preprocess_quant_config( + prepared_model, quant_config, mode="prepare", example_inputs=example_inputs + ) + for algo_name, algo_func in algos_mapping.items(): + # select quantization algo according to config + if need_apply(configs_mapping, algo_name): + logger.info(f"Start to prepare model with {algo_name}.") + prepared_model = algo_func( + prepared_model, + configs_mapping, + example_inputs=example_inputs, + mode=Mode.PREPARE, + ) + setattr(prepared_model, "is_prepared", True) + setattr(prepared_model, "quant_config", quant_config) + setattr(prepared_model, "example_inputs", example_inputs) + return prepared_model + + +@log_process(mode=Mode.PREPARE) +def prepare_qat( + model: torch.nn.Module, + quant_config: dict, + inplace: bool = True, +): + r""" + Prepares a copy of the model for quantization calibration or + quantization-aware training and converts it to quantized version. + + Quantization configuration should be assigned preemptively + to individual submodules in `.qconfig` attribute. + + Args: + model: input model to be modified in-place + quant_config: quantization config that maps float modules to quantized modules to be + replaced. + inplace: carry out model transformations in-place, the original module + is mutated + """ + from ..algorithms.qat.quant_utils import replace_with_quant_linear - return replace_with_quant_linear(model, quant_config) + return replace_with_quant_linear(model, quant_config) @log_process(mode=Mode.CONVERT) From a1f8c3adda9a4e2e3db3174b39fc4487b27d72fb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 26 Sep 2025 03:42:41 +0000 Subject: [PATCH 09/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/torch/quantization/quantize.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/neural_compressor/torch/quantization/quantize.py b/neural_compressor/torch/quantization/quantize.py index 3600869c096..241bac9c196 100644 --- a/neural_compressor/torch/quantization/quantize.py +++ b/neural_compressor/torch/quantization/quantize.py @@ -191,8 +191,7 @@ def prepare_qat( quant_config: dict, inplace: bool = True, ): - r""" - Prepares a copy of the model for quantization calibration or + r"""Prepares a copy of the model for quantization calibration or quantization-aware training and converts it to quantized version. 
Quantization configuration should be assigned preemptively From fbe0918fb2041137b9100c7822542126a9e22931 Mon Sep 17 00:00:00 2001 From: lkk Date: Fri, 26 Sep 2025 05:26:37 +0000 Subject: [PATCH 10/15] update `prepare_qat` code style to align with torchao. --- .../torch/algorithms/qat/quant_utils.py | 24 +++++++++++++++++++ .../torch/quantization/config.py | 12 ++++++++++ .../torch/quantization/quantize.py | 12 +++++++--- 3 files changed, 45 insertions(+), 3 deletions(-) diff --git a/neural_compressor/torch/algorithms/qat/quant_utils.py b/neural_compressor/torch/algorithms/qat/quant_utils.py index 0f76650e8ab..0f156338f7a 100644 --- a/neural_compressor/torch/algorithms/qat/quant_utils.py +++ b/neural_compressor/torch/algorithms/qat/quant_utils.py @@ -60,6 +60,30 @@ def replace_with_quant_linear(model, quant_cfg=None): return model +def get_quant_config_with_scheme(scheme: str): + """get quantization config.""" + + try: + # use scheme definitions from AutoRound since we utilize the quantization functions now + from auto_round.schemes import preset_name_to_scheme + quant_cfg = preset_name_to_scheme(scheme) + return quant_cfg + except ImportError: + return None + + +def convert_model_with_mapping(model, mapping=None): + """process mapping to quant config.""" + # key is torch module, TODO: support more key format, like layer name. + for key in mapping: + # TODO: support more torch modules + if isinstance(key, nn.Linear): + quant_cfg = get_quant_config_with_scheme(mapping[key]) + if quant_cfg is None: + continue + replace_with_quant_linear(model, quant_cfg) + + def get_quant_config(scheme: str) -> dict[str, Any]: """Generate quantization config for a torch model. diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index 27e5a85551e..b6b78d3f62d 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -24,6 +24,7 @@ from typing import Callable, Dict, List, NamedTuple, Optional from typing import OrderedDict as OrderedDictType from typing import Tuple, Union +import copy import torch @@ -2167,3 +2168,14 @@ def get_config_set_for_tuning(cls, dtype="int8"): return cls._model_mapping[STATIC_QUANT].get_config_set_for_tuning() else: raise ValueError(f"Unsupported dtype: {dtype}, allowed values are 'fp8' and 'int8'.") + +# TODO: support more mappings configurations. 
+# Default map for swapping float module to qat modules +DEFAULT_QAT_MODULE_MAPPINGS: dict[Callable, Any] = { + torch.nn.Linear: "MXFP8", +} + +def get_default_qat_module_mappings() -> dict[Callable, Any]: + """Get default module mapping for quantization aware training""" + return copy.deepcopy(DEFAULT_QAT_MODULE_MAPPINGS) + diff --git a/neural_compressor/torch/quantization/quantize.py b/neural_compressor/torch/quantization/quantize.py index 241bac9c196..2d92bd63596 100644 --- a/neural_compressor/torch/quantization/quantize.py +++ b/neural_compressor/torch/quantization/quantize.py @@ -188,7 +188,7 @@ def prepare( @log_process(mode=Mode.PREPARE) def prepare_qat( model: torch.nn.Module, - quant_config: dict, + mapping=None, inplace: bool = True, ): r"""Prepares a copy of the model for quantization calibration or @@ -204,9 +204,15 @@ def prepare_qat( inplace: carry out model transformations in-place, the original module is mutated """ - from ..algorithms.qat.quant_utils import replace_with_quant_linear + assert model.training, "prepare_qat only works on models in training mode" - return replace_with_quant_linear(model, quant_config) + from .config import get_default_qat_module_mappings + if mapping is None: + mapping = get_default_qat_module_mappings() + + from ..algorithms.qat.quant_utils import convert_model_with_mapping + + return convert_model_with_mapping(model, mapping) @log_process(mode=Mode.CONVERT) From 4d7508f6d6b5b0d4bae5d39db79b73d8da84052f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 26 Sep 2025 05:28:18 +0000 Subject: [PATCH 11/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/torch/algorithms/qat/quant_utils.py | 9 +++++---- neural_compressor/torch/quantization/config.py | 7 ++++--- neural_compressor/torch/quantization/quantize.py | 1 + 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/neural_compressor/torch/algorithms/qat/quant_utils.py b/neural_compressor/torch/algorithms/qat/quant_utils.py index 0f156338f7a..b065f3605fb 100644 --- a/neural_compressor/torch/algorithms/qat/quant_utils.py +++ b/neural_compressor/torch/algorithms/qat/quant_utils.py @@ -61,19 +61,20 @@ def replace_with_quant_linear(model, quant_cfg=None): def get_quant_config_with_scheme(scheme: str): - """get quantization config.""" + """Get quantization config.""" try: # use scheme definitions from AutoRound since we utilize the quantization functions now from auto_round.schemes import preset_name_to_scheme + quant_cfg = preset_name_to_scheme(scheme) return quant_cfg except ImportError: return None - + def convert_model_with_mapping(model, mapping=None): - """process mapping to quant config.""" + """Process mapping to quant config.""" # key is torch module, TODO: support more key format, like layer name. for key in mapping: # TODO: support more torch modules @@ -83,7 +84,7 @@ def convert_model_with_mapping(model, mapping=None): continue replace_with_quant_linear(model, quant_cfg) - + def get_quant_config(scheme: str) -> dict[str, Any]: """Generate quantization config for a torch model. 
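For illustration, a minimal usage sketch of the mapping-based `prepare_qat` API introduced in PATCH 10 above. The toy model and the explicit mapping are placeholders, not part of the patch; the default mapping sends every torch.nn.Linear to the "MXFP8" scheme, which get_quant_config_with_scheme resolves through AutoRound's preset_name_to_scheme, so auto_round must be installed for any replacement to take effect.

    import torch.nn as nn

    from neural_compressor.torch.quantization.quantize import prepare_qat

    # Placeholder two-layer model; prepare_qat asserts model.training, and
    # freshly constructed nn.Module objects are already in training mode.
    model = nn.Sequential(nn.Linear(32, 64), nn.Linear(64, 2))

    # Equivalent to the default mapping: swap nn.Linear for QuantLinear
    # configured with the MXFP8 scheme. The model is modified in place.
    prepare_qat(model, mapping={nn.Linear: "MXFP8"})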
diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index b6b78d3f62d..fd79864a467 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -18,13 +18,13 @@ """Intel Neural Compressor Pytorch quantization config API.""" +import copy import importlib import json from collections import OrderedDict from typing import Callable, Dict, List, NamedTuple, Optional from typing import OrderedDict as OrderedDictType from typing import Tuple, Union -import copy import torch @@ -2169,13 +2169,14 @@ def get_config_set_for_tuning(cls, dtype="int8"): else: raise ValueError(f"Unsupported dtype: {dtype}, allowed values are 'fp8' and 'int8'.") + # TODO: support more mappings configurations. # Default map for swapping float module to qat modules DEFAULT_QAT_MODULE_MAPPINGS: dict[Callable, Any] = { torch.nn.Linear: "MXFP8", } + def get_default_qat_module_mappings() -> dict[Callable, Any]: - """Get default module mapping for quantization aware training""" + """Get default module mapping for quantization aware training.""" return copy.deepcopy(DEFAULT_QAT_MODULE_MAPPINGS) - diff --git a/neural_compressor/torch/quantization/quantize.py b/neural_compressor/torch/quantization/quantize.py index 2d92bd63596..84f770a4a71 100644 --- a/neural_compressor/torch/quantization/quantize.py +++ b/neural_compressor/torch/quantization/quantize.py @@ -207,6 +207,7 @@ def prepare_qat( assert model.training, "prepare_qat only works on models in training mode" from .config import get_default_qat_module_mappings + if mapping is None: mapping = get_default_qat_module_mappings() From 6d89e55154838e8e9cd25b9d615e6f4944c7222e Mon Sep 17 00:00:00 2001 From: lkk Date: Mon, 29 Sep 2025 02:37:38 +0000 Subject: [PATCH 12/15] add qat test ut. --- .../torch/algorithms/qat/quant_utils.py | 7 ++- .../torch/quantization/config.py | 2 +- test/3x/torch/algorithms/qat/test_qat.py | 62 +++++++++++++++++++ 3 files changed, 68 insertions(+), 3 deletions(-) create mode 100644 test/3x/torch/algorithms/qat/test_qat.py diff --git a/neural_compressor/torch/algorithms/qat/quant_utils.py b/neural_compressor/torch/algorithms/qat/quant_utils.py index b065f3605fb..ad32ee76806 100644 --- a/neural_compressor/torch/algorithms/qat/quant_utils.py +++ b/neural_compressor/torch/algorithms/qat/quant_utils.py @@ -26,6 +26,7 @@ import torch.nn as nn from .quant_linear import QuantLinear +from .tensor_quantizer import TensorQuantizer def convert(module: nn.Module, quant_cfg=None, quant_module=None): @@ -66,7 +67,6 @@ def get_quant_config_with_scheme(scheme: str): try: # use scheme definitions from AutoRound since we utilize the quantization functions now from auto_round.schemes import preset_name_to_scheme - quant_cfg = preset_name_to_scheme(scheme) return quant_cfg except ImportError: @@ -78,12 +78,15 @@ def convert_model_with_mapping(model, mapping=None): # key is torch module, TODO: support more key format, like layer name. for key in mapping: # TODO: support more torch modules - if isinstance(key, nn.Linear): + if key == nn.Linear: quant_cfg = get_quant_config_with_scheme(mapping[key]) if quant_cfg is None: continue replace_with_quant_linear(model, quant_cfg) + replaced_modules = sum(isinstance(m, TensorQuantizer) for _, m in model.named_modules()) + print(f"Inserted {replaced_modules} quantizers") + def get_quant_config(scheme: str) -> dict[str, Any]: """Generate quantization config for a torch model. 
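The `isinstance(key, nn.Linear)` to `key == nn.Linear` change in PATCH 12 above matters because the mapping keys are module classes, not module instances, so the old instance check could never match. A small sketch of the distinction, with illustrative names only:

    import torch.nn as nn

    mapping = {nn.Linear: "MXFP8"}
    layer = nn.Linear(8, 8)

    for key, scheme in mapping.items():
        # key is the class object nn.Linear itself ...
        assert key == nn.Linear
        # ... so isinstance(key, nn.Linear) is False: a class is not an instance of itself.
        assert not isinstance(key, nn.Linear)
        # Module instances inside the model are matched the other way around,
        # against the key class.
        assert isinstance(layer, key)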
diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index fd79864a467..bc7cd91e172 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -24,7 +24,7 @@ from collections import OrderedDict from typing import Callable, Dict, List, NamedTuple, Optional from typing import OrderedDict as OrderedDictType -from typing import Tuple, Union +from typing import Tuple, Union, Any import torch diff --git a/test/3x/torch/algorithms/qat/test_qat.py b/test/3x/torch/algorithms/qat/test_qat.py new file mode 100644 index 00000000000..5279c808bef --- /dev/null +++ b/test/3x/torch/algorithms/qat/test_qat.py @@ -0,0 +1,62 @@ +import math +import types +import torch +import torch.nn as nn +import pytest + +# Skip the whole module if auto_round (needed for get_quant_func inside TensorQuantizer) is not available +auto_round = pytest.importorskip("auto_round") + +from neural_compressor.torch.quantization.quantize import prepare_qat + + +def setup_seed(seed): + import numpy as np + import random + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + np.random.seed(seed) + random.seed(seed) + torch.backends.cudnn.deterministic = True + +class TinyModel(nn.Module): + """Simple hierarchical model for recursive replacement tests.""" + + def __init__(self): + super().__init__() + self.fc1 = nn.Linear(32, 64) + self.lm_head = nn.Linear(64, 2) + + def forward(self, x): + x = self.fc1(x) + return self.lm_head(x) + +def test_replace_quant_layer(): + """Check the inserted quant linear.""" + model = TinyModel() + + prepare_qat(model) + + replaced_modules = sum(isinstance(m, TensorQuantizer) for _, m in model.named_modules()) + + assert replaced_modules == 3 + + +def test_train(): + """QAT test.""" + setup_seed(20) + + model = TinyModel() + prepare_qat(model) + + inp = torch.randn([2, 32]) + + optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5) + + with torch.autocast(device_type="cpu", dtype=torch.bfloat16): + output = model(inp) + loss = output.mean() + + optimizer.zero_grad() + loss.backward() + optimizer.step() From 0551717ac25b047fd6987c40da7c77e93f4b08c6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 29 Sep 2025 02:39:29 +0000 Subject: [PATCH 13/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/torch/algorithms/qat/quant_utils.py | 3 ++- neural_compressor/torch/quantization/config.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/neural_compressor/torch/algorithms/qat/quant_utils.py b/neural_compressor/torch/algorithms/qat/quant_utils.py index ad32ee76806..bf99a36d5b3 100644 --- a/neural_compressor/torch/algorithms/qat/quant_utils.py +++ b/neural_compressor/torch/algorithms/qat/quant_utils.py @@ -26,7 +26,7 @@ import torch.nn as nn from .quant_linear import QuantLinear -from .tensor_quantizer import TensorQuantizer +from .tensor_quantizer import TensorQuantizer def convert(module: nn.Module, quant_cfg=None, quant_module=None): @@ -67,6 +67,7 @@ def get_quant_config_with_scheme(scheme: str): try: # use scheme definitions from AutoRound since we utilize the quantization functions now from auto_round.schemes import preset_name_to_scheme + quant_cfg = preset_name_to_scheme(scheme) return quant_cfg except ImportError: diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py 
index bc7cd91e172..883edc60f60 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -22,9 +22,9 @@ import importlib import json from collections import OrderedDict -from typing import Callable, Dict, List, NamedTuple, Optional +from typing import Any, Callable, Dict, List, NamedTuple, Optional from typing import OrderedDict as OrderedDictType -from typing import Tuple, Union, Any +from typing import Tuple, Union import torch From ece99c341a7f68c1a358992b03a73a6e5d912093 Mon Sep 17 00:00:00 2001 From: lkk Date: Mon, 29 Sep 2025 03:35:37 +0000 Subject: [PATCH 14/15] fix ut. --- test/3x/torch/algorithms/qat/test_qat.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/3x/torch/algorithms/qat/test_qat.py b/test/3x/torch/algorithms/qat/test_qat.py index 5279c808bef..4e2d270bef9 100644 --- a/test/3x/torch/algorithms/qat/test_qat.py +++ b/test/3x/torch/algorithms/qat/test_qat.py @@ -8,6 +8,7 @@ auto_round = pytest.importorskip("auto_round") from neural_compressor.torch.quantization.quantize import prepare_qat +from neural_compressor.torch.algorithms.qat.tensor_quantizer import TensorQuantizer def setup_seed(seed): From 1addd32c5c847b658e6d5b9d1734f51159fc6f43 Mon Sep 17 00:00:00 2001 From: lkk <33276950+lkk12014402@users.noreply.github.com> Date: Mon, 29 Sep 2025 15:41:11 +0800 Subject: [PATCH 15/15] update qat ut assert. --- test/3x/torch/algorithms/qat/test_qat.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/3x/torch/algorithms/qat/test_qat.py b/test/3x/torch/algorithms/qat/test_qat.py index 4e2d270bef9..83fc1dd4348 100644 --- a/test/3x/torch/algorithms/qat/test_qat.py +++ b/test/3x/torch/algorithms/qat/test_qat.py @@ -60,4 +60,8 @@ def test_train(): optimizer.zero_grad() loss.backward() + + # check the grad + for name, param in model.named_parameters(): + assert param.grad is not None optimizer.step()
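Taken together, the series supports a flow like the sketch below: swap in the QAT modules, run one bf16-autocast training step, and check that gradients reach the parameters, mirroring `test_train` above. The toy model and data are placeholders, auto_round is assumed to be installed, and the HF export path added earlier in the series is not shown.

    import torch
    import torch.nn as nn

    from neural_compressor.torch.quantization.quantize import prepare_qat
    from neural_compressor.torch.algorithms.qat.tensor_quantizer import TensorQuantizer

    model = nn.Sequential(nn.Linear(32, 64), nn.Linear(64, 2))  # placeholder, training mode by default
    prepare_qat(model)  # in place: nn.Linear is replaced by QuantLinear with MXFP8 quantizers

    # Sanity check that quantizers were actually inserted.
    assert any(isinstance(m, TensorQuantizer) for m in model.modules())

    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
    with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
        loss = model(torch.randn(2, 32)).mean()

    optimizer.zero_grad()
    loss.backward()
    assert all(p.grad is not None for p in model.parameters() if p.requires_grad)
    optimizer.step()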