diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 5feafa5c8..83a566775 100755 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -20,6 +20,7 @@ NVIDIA Model Optimizer Changelog (Linux) - Add support for PyTorch Geometric quantization. - Add per tensor and per channel MSE calibrator support. - Added support for PTQ/QAT checkpoint export and loading for running fakequant evaluation in vLLM. See `examples/vllm_serve/README.md `_ for more details. +- Add support for Transformer Engine quantization for Megatron Core models. **Documentation** diff --git a/modelopt/torch/quantization/model_quant.py b/modelopt/torch/quantization/model_quant.py index 4a2b74a30..c5c2424b6 100644 --- a/modelopt/torch/quantization/model_quant.py +++ b/modelopt/torch/quantization/model_quant.py @@ -30,6 +30,7 @@ from modelopt.torch.opt.utils import forward_with_reshard from modelopt.torch.quantization.config import QuantizeConfig from modelopt.torch.quantization.conversion import set_quantizer_by_cfg +from modelopt.torch.utils import atomic_print from .algorithms import AutoQuantizeGradientSearcher, AutoQuantizeKLDivSearcher, QuantRecipe from .config import QuantizeAlgoCfgType @@ -506,6 +507,7 @@ def enable_quantizer(model: nn.Module, wildcard_or_filter_func: str | Callable): set_quantizer_attribute(model, wildcard_or_filter_func, {"enable": True}) +@atomic_print def print_quant_summary(model: nn.Module): """Print summary of all quantizer modules in the model.""" count = 0 diff --git a/modelopt/torch/quantization/plugins/custom.py b/modelopt/torch/quantization/plugins/custom.py index 4227f3c49..4200aadc7 100644 --- a/modelopt/torch/quantization/plugins/custom.py +++ b/modelopt/torch/quantization/plugins/custom.py @@ -59,7 +59,7 @@ class _QuantFunctionalMixin(QuantModule): def functionals_to_replace(self) -> Iterator[tuple[ModuleType, str, Callable]]: return ( (package, func_name, quantized_func) - for package, func_name, quantized_func in self._functionals_to_replace + for package, func_name, quantized_func in getattr(self, "_functionals_to_replace", []) if hasattr(package, func_name) ) diff --git a/modelopt/torch/quantization/plugins/megatron.py b/modelopt/torch/quantization/plugins/megatron.py index a33f715cf..95e8651aa 100644 --- a/modelopt/torch/quantization/plugins/megatron.py +++ b/modelopt/torch/quantization/plugins/megatron.py @@ -47,11 +47,14 @@ try: from megatron.core.extensions.transformer_engine import ( TEColumnParallelGroupedLinear, + TEColumnParallelLinear, TEDotProductAttention, + TELayerNormColumnParallelLinear, TERowParallelGroupedLinear, + TERowParallelLinear, ) - from .transformer_engine import _QuantTEGroupedLinear + from .transformer_engine import _QuantTEGroupedLinear, _QuantTELayerNormLinear, _QuantTELinear HAS_TE = True except ImportError: @@ -549,6 +552,23 @@ def sync_moe_local_experts_amax(self): if HAS_TE: + + @QuantModuleRegistry.register({TERowParallelLinear: "te_mcore_RowParallelLinear"}) + class _QuantTEMCoreRowParallelLinear(_QuantTELinear, _MegatronRowParallelLinear): + pass + + @QuantModuleRegistry.register({TEColumnParallelLinear: "te_mcore_ColumnParallelLinear"}) + class _QuantTEMCoreColumnParallelLinear(_QuantTELinear, _MegatronColumnParallelLinear): + pass + + @QuantModuleRegistry.register( + {TELayerNormColumnParallelLinear: "te_mcore_LayerNormColumnParallelLinear"} + ) + class _QuantTELayerNormColumnParallelLinear( + _QuantTELayerNormLinear, _MegatronColumnParallelLinear + ): + pass + # Quantized subclasses to support TEGroupedMLP quantization class 
_QuantMegatronTEGroupedLinear(_QuantTEGroupedLinear, _MegatronParallelLinear): def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs): diff --git a/modelopt/torch/quantization/plugins/transformer_engine.py b/modelopt/torch/quantization/plugins/transformer_engine.py index 5199bbf34..848908657 100644 --- a/modelopt/torch/quantization/plugins/transformer_engine.py +++ b/modelopt/torch/quantization/plugins/transformer_engine.py @@ -15,44 +15,87 @@ """Support quantization for Transformer Engine layers.""" +import warnings + import torch import transformer_engine as te import transformer_engine.pytorch.module.grouped_linear as te_grouped_linear +import transformer_engine.pytorch.module.layernorm_linear as te_layernorm_linear import transformer_engine.pytorch.module.linear as te_linear +from packaging.version import Version + +from modelopt.torch.quantization.utils import replace_function from ..nn import QuantModuleRegistry from .custom import _ParallelLinear +_TE_VERSION = Version(te.__version__) + + +def _assert_te_fp8_enabled(): + """Check if Transformer Engine FP8 autocast is enabled and raise error if so.""" + try: + from transformer_engine.pytorch.fp8 import FP8GlobalStateManager + + if FP8GlobalStateManager.is_fp8_enabled(): + raise RuntimeError( + "Transformer Engine FP8 training (fp8_autocast) is enabled, which conflicts with " + "ModelOpt quantization. Please disable TE FP8 autocast when using ModelOpt " + "quantization, or use ModelOpt's FP8 quantization instead." + ) + except ImportError: + pass # Older TE versions may not have this API + @QuantModuleRegistry.register({te.pytorch.Linear: "te_Linear"}) class _QuantTELinear(_ParallelLinear): - _functionals_to_replace = [ - ( - te_linear._Linear, - "apply" if torch.is_grad_enabled() else "forward", - ), - ] + @property + def _functionals_to_replace(self): + return ( + [(te_linear._Linear, "apply")] + if torch.is_grad_enabled() + else [(te_linear._Linear, "forward")] + ) + + @_functionals_to_replace.setter + def _functionals_to_replace(self, value): + self._functionals_to_replace = value + + def _setup(self): + super()._setup() + if getattr(self, "fuse_wgrad_accumulation", False): + warnings.warn( + "fuse_wgrad_accumulation is not supported with ModelOpt quantization. " + "Setting fuse_wgrad_accumulation to False." 
+ ) + self.fuse_wgrad_accumulation = False @staticmethod def te_quantized_linear_fn(package, func_name, self, *args, **kwargs): """Quantized version specifically for TE with weight first, then input.""" - if te.__version__ >= "2.0": - weight, inputs = args[0], args[1] - remaining_args = args[2:] + _assert_te_fp8_enabled() + if Version("2.0") <= _TE_VERSION: + idx = 1 if func_name == "_forward" else 0 + weight, inputs = args[idx], args[idx + 1] + remaining_args = args[idx + 2 :] + weight = self.weight_quantizer(weight) + inputs = self.input_quantizer(inputs) + new_args = (weight, inputs, *remaining_args) + new_args = (args[0], *new_args) if func_name == "_forward" else new_args output = getattr(package, func_name)( - self.weight_quantizer(weight), - self.input_quantizer(inputs), - *remaining_args, + *new_args, **kwargs, ) else: - weight, weight_fp8, inputs = args[0], args[1], args[2] - remaining_args = args[3:] + idx = 1 if func_name == "_forward" else 0 + weight, weight_fp8, inputs = args[idx], args[idx + 1], args[idx + 2] + remaining_args = args[idx + 3 :] + weight = self.weight_quantizer(weight) + inputs = self.input_quantizer(inputs) + new_args = (weight, weight_fp8, inputs, *remaining_args) + new_args = (args[0], *new_args) if func_name == "_forward" else new_args output = getattr(package, func_name)( - self.weight_quantizer(weight), - weight_fp8, - self.input_quantizer(inputs), - *remaining_args, + *new_args, **kwargs, ) return self.output_quantizer(output) @@ -64,10 +107,17 @@ def te_quantized_linear_fn(package, func_name, self, *args, **kwargs): # Register the public te.pytorch.GroupedLinear class @QuantModuleRegistry.register({te_grouped_linear.GroupedLinear: "te_GroupedLinear"}) class _QuantTEGroupedLinear(_ParallelLinear): - _functionals_to_replace = [ - (te_grouped_linear._GroupedLinear, "forward"), - (te_grouped_linear._GroupedLinear, "apply"), - ] + @property + def _functionals_to_replace(self): + return ( + [(te_grouped_linear._GroupedLinear, "apply")] + if torch.is_grad_enabled() + else [(te_grouped_linear._GroupedLinear, "forward")] + ) + + @_functionals_to_replace.setter + def _functionals_to_replace(self, value): + self._functionals_to_replace = value def _setup(self): # GroupedMLP stores the weights as weight0, weight1, etc. 
To run setup in order to @@ -93,6 +143,7 @@ def modelopt_post_restore(self, prefix: str = ""): @staticmethod def te_grouped_quantized_linear_fn(package, func_name, self, *args): + _assert_te_fp8_enabled() idx = 1 if func_name == "_forward" else 0 inp = args[idx] num_gemms = len(args[idx + 1]) @@ -116,3 +167,120 @@ def te_grouped_quantized_linear_fn(package, func_name, self, *args): # Override the quantized linear function _quantized_linear_fn = te_grouped_quantized_linear_fn + + +class _QuantLayerNormLinearFunc(torch.autograd.Function): + """Patched version of _LayerNormLinear to quantize the input to the GEMM operation.""" + + @staticmethod + def _get_original_gemm(): + if Version("2.0") <= _TE_VERSION: + return te_layernorm_linear.general_gemm + else: + return te_layernorm_linear.tex.gemm + + @staticmethod + def _gemm_replace_args(): + if Version("2.0") <= _TE_VERSION: + return (te_layernorm_linear, "general_gemm") + else: + return (te_layernorm_linear.tex, "gemm") + + @staticmethod + def forward(ctx, inp, ln_weight, ln_bias, weight, *args, **kwargs): + input_quantizer, weight_quantizer = _QuantLayerNormLinearFunc.modelopt_quantizers + + qweight = weight_quantizer(weight) + qweight.requires_grad = weight.requires_grad + if ctx is not None: + # We need to recompute the quantized input for the backward pass, so we save the input_quantizer + ctx.modelopt_input_quantizer = input_quantizer + + original_gemm = _QuantLayerNormLinearFunc._get_original_gemm() + + def _patched_general_gemm(weight, input, *gemm_args, **gemm_kwargs): + qinput = input_quantizer(input) + return original_gemm(weight, qinput, *gemm_args, **gemm_kwargs) + + with replace_function( + *_QuantLayerNormLinearFunc._gemm_replace_args(), + _patched_general_gemm, # type: ignore[call-arg] + ): + outputs = te_layernorm_linear._og_LayerNormLinear.forward( + ctx, inp, ln_weight, ln_bias, qweight, *args, **kwargs + ) + return outputs + + # TODO: Support non-pass-through backward behavior for activation quantization + @staticmethod + def backward(ctx, *grad_outputs): + """Backward pass for _QuantLayerNormLinearFunc functional. + + The backward pass input and weight gradient estimation uses straight through estimator (STE). + We should add support for advanced gradient estimation techniques like STE with clipping. + However this is a low priority item. + """ + gemm_call_counter = {"count": 0} + + original_gemm = _QuantLayerNormLinearFunc._get_original_gemm() + + def _patched_general_gemm(a, b, *gemm_args, **gemm_kwargs): + # The first time, gemm is used for dgrad calculation + # dgrad GEMM; dx = dy * qw; Called as gemm(qw, dy, ...) 
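+            # qw here is already quantized: forward() handed the quantized weight to
+            # _og_LayerNormLinear.forward, so TE replays that tensor for the dgrad GEMM
+            # and it can pass through unpatched; only the wgrad GEMM (the second call
+            # below) has to re-quantize the saved input.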
+ if gemm_call_counter["count"] == 0: + gemm_call_counter["count"] += 1 + return original_gemm(a, b, *gemm_args, **gemm_kwargs) + + # The second time, gemm is used for wgrad calculation + # wgrad GEMM; dqw = dy^T * x; Called as gemm(x, dy, ..); + + # x should be quantized input (qinput) for the backward pass as per chain rule, + # but gemm is called with the unquantized input (a) + # So lets first get the quantized input (qinput) and then call the gemm + qinput = ctx.modelopt_input_quantizer(a) + return original_gemm(qinput, b, *gemm_args, **gemm_kwargs) + + with replace_function( + *_QuantLayerNormLinearFunc._gemm_replace_args(), + _patched_general_gemm, # type: ignore[call-arg] + ): + # During backward, the patch does not exist; autograd will automatically use + # _QuantLayerNormLinearFunc.backward + outputs = te_layernorm_linear._LayerNormLinear.backward(ctx, *grad_outputs) + + delattr(ctx, "modelopt_input_quantizer") + return outputs + + +@QuantModuleRegistry.register({te.pytorch.LayerNormLinear: "te_LayerNormLinear"}) +class _QuantTELayerNormLinear(_ParallelLinear): + _functionals_to_replace = [] + + def _setup(self): + super()._setup() + if getattr(self, "fuse_wgrad_accumulation", False): + warnings.warn( + "fuse_wgrad_accumulation is not supported with ModelOpt quantization. " + "Setting fuse_wgrad_accumulation to False." + ) + self.fuse_wgrad_accumulation = False + + def forward(self, *args, **kwargs): + """Call ModelOpt patch for _LayerNormLinear functional.""" + _assert_te_fp8_enabled() + # This is multi-process safe (such as in torch distributed jobs), not multi-thread safe + _QuantLayerNormLinearFunc.modelopt_quantizers = ( + self.input_quantizer, + self.weight_quantizer, + ) + with replace_function( + te_layernorm_linear, + "_LayerNormLinear", + _QuantLayerNormLinearFunc, + "_og_LayerNormLinear", + ): + outputs = super().forward(*args, **kwargs) + delattr(_QuantLayerNormLinearFunc, "modelopt_quantizers") + if isinstance(outputs, tuple): + return (self.output_quantizer(outputs[0]), *outputs[1:]) + return self.output_quantizer(outputs) diff --git a/modelopt/torch/quantization/utils.py b/modelopt/torch/quantization/utils.py index c8be1d014..3155c5b7d 100644 --- a/modelopt/torch/quantization/utils.py +++ b/modelopt/torch/quantization/utils.py @@ -337,14 +337,16 @@ def disable_lora_quantizers_in_config(config, layers): @contextmanager -def replace_function(package, name, new_func): +def replace_function(package, name, new_func, og_func_cache_name=None): """Replace a function with a new one within a context.""" + if og_func_cache_name is None: + og_func_cache_name = "_" + name old_func = getattr(package, name) setattr(package, name, new_func) - setattr(package, "_" + name, old_func) + setattr(package, og_func_cache_name, old_func) yield setattr(package, name, old_func) - delattr(package, "_" + name) + delattr(package, og_func_cache_name) @contextmanager diff --git a/modelopt/torch/utils/logging.py b/modelopt/torch/utils/logging.py index 15a7f750e..c8b5297b8 100644 --- a/modelopt/torch/utils/logging.py +++ b/modelopt/torch/utils/logging.py @@ -18,15 +18,28 @@ import contextlib import os import re +import sys +import threading import warnings -from contextlib import contextmanager +from collections.abc import Iterator +from contextlib import contextmanager, redirect_stderr, redirect_stdout +from functools import wraps from inspect import signature +from io import StringIO import tqdm from . 
import distributed as dist -__all__ = ["DeprecatedError", "no_stdout", "num2hrb", "print_rank_0", "silence_matched_warnings"] +__all__ = [ + "DeprecatedError", + "atomic_print", + "capture_io", + "no_stdout", + "num2hrb", + "print_rank_0", + "silence_matched_warnings", +] def num2hrb(num: float, suffix="") -> str: @@ -95,6 +108,61 @@ def print_rank_0(*args, **kwargs): print(*args, **kwargs, flush=True) +# Global reentrant lock for thread-safe printing (allows nested @atomic_print decorators) +_print_lock = threading.RLock() + + +def atomic_print(func): + """Decorator to prevent interleaved output in distributed/multi-threaded environments.""" + + @wraps(func) + def wrapper(*args, **kwargs): + # Capture stdout to prevent interleaved printing + with _print_lock: + old_stdout = sys.stdout + sys.stdout = captured_output = StringIO() + + try: + result = func(*args, **kwargs) + output = captured_output.getvalue() + finally: + # Always restore stdout, even if there's an exception + sys.stdout = old_stdout + + # Print all at once atomically + if output: + print(output, end="", flush=True) + + return result + + return wrapper + + +@contextmanager +def capture_io(capture_stderr: bool = True) -> Iterator[StringIO]: + """Capture stdout and stderr within the invoked context. + + Args: + capture_stderr (bool): Whether to capture stderr. Defaults to True. + + Returns: + Iterator[StringIO]: An iterator that yields a StringIO object that contains the captured output. + + Example:: + + with capture_io() as buf: + print("Hello, world!") + print(buf.getvalue()) + """ + buf = StringIO() + if capture_stderr: + with redirect_stdout(buf), redirect_stderr(buf): + yield buf + else: + with redirect_stdout(buf): + yield buf + + @contextlib.contextmanager def silence_matched_warnings(pattern=None): """Silences warnings that match a given pattern. diff --git a/tests/_test_utils/torch/megatron/utils.py b/tests/_test_utils/torch/megatron/utils.py index 695189f6c..5ca0cf14c 100644 --- a/tests/_test_utils/torch/megatron/utils.py +++ b/tests/_test_utils/torch/megatron/utils.py @@ -81,7 +81,10 @@ def run_mcore_inference( if HAS_MAMBA and isinstance(model.decoder.layers[0], MambaLayer): active_hidden_size = model.decoder.layers[0].mixer.d_model elif isinstance(model.decoder.layers[0].self_attention, SelfAttention): - active_hidden_size = model.decoder.layers[0].self_attention.linear_qkv.input_size + if hasattr(model.decoder.layers[0].self_attention.linear_qkv, "in_features"): + active_hidden_size = model.decoder.layers[0].self_attention.linear_qkv.in_features + else: + active_hidden_size = model.decoder.layers[0].self_attention.linear_qkv.input_size elif isinstance(model.decoder.layers[0].mlp, MLP): active_hidden_size = model.decoder.layers[0].mlp.linear_fc1.input_size else: diff --git a/tests/gpu/torch/quantization/plugins/test_megatron.py b/tests/gpu/torch/quantization/plugins/test_megatron.py index 2993749b1..5b2a8cc0a 100644 --- a/tests/gpu/torch/quantization/plugins/test_megatron.py +++ b/tests/gpu/torch/quantization/plugins/test_megatron.py @@ -12,6 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
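+# The TE-backed tests below require Transformer Engine (see the HAS_TE guard near the
+# imports); the flow they exercise is, roughly:
+#
+#   model = get_mcore_gpt_model(tensor_model_parallel_size=1, transformer_impl="transformer_engine")
+#   model = mtq.quantize(model, mtq.INT8_DEFAULT_CFG, forward_fn)
+#
+# where forward_fn runs a calibration forward pass (see test_convert_mcore_te_gpt_model).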
+ import copy from functools import partial @@ -50,11 +51,36 @@ import modelopt.torch.opt as mto import modelopt.torch.quantization as mtq from modelopt.torch.quantization.nn import QuantModuleRegistry +from modelopt.torch.quantization.plugins.megatron import _QuantTEMCoreRowParallelLinear from modelopt.torch.utils.plugins import megatron_prefill +try: + from megatron.core.extensions.transformer_engine import TERowParallelLinear + + HAS_TE = True +except ImportError: + HAS_TE = False + SEED = 1234 +def get_batch(model, batch_size=2): + seq_length = model.max_sequence_length + vocab_size = model.vocab_size + + input_ids = torch.randint(0, vocab_size, (batch_size, seq_length)).cuda() + labels = torch.randint(0, vocab_size, (batch_size, seq_length)).cuda() + position_ids = ( + torch.arange(seq_length, dtype=torch.int64).unsqueeze(0).repeat(batch_size, 1).cuda() + ) + attention_mask = torch.tril( + torch.ones((batch_size, 1, seq_length, seq_length), dtype=torch.bool) + ).cuda() + loss_mask = torch.ones((batch_size, seq_length), dtype=torch.float32).cuda() + + return input_ids, labels, position_ids, attention_mask, loss_mask + + def test_convert_megatron_parallel_linear(distributed_setup_size_1): initialize_for_megatron(seed=SEED) set_seed(SEED) @@ -535,6 +561,7 @@ def test_fp8_real_quantize(): spawn_multiprocess_job(size=size, job=_test_fp8_real_quantize_helper, backend="nccl") +@pytest.mark.skip(reason="TODO: etp requires sequence parallelism now in Megatron due to a bug;") @pytest.mark.parametrize( "config", [mtq.FP8_DEFAULT_CFG, mtq.NVFP4_DEFAULT_CFG, mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG], @@ -727,6 +754,7 @@ def forward_fn(model): assert final_sync, f"Inconsistent amax for expert {quantizer_type} across ranks: {rank_values}" +@pytest.mark.skip(reason="TODO: etp requires sequence parallelism now in Megatron due to a bug;") @pytest.mark.parametrize("config", [mtq.FP8_DEFAULT_CFG, mtq.INT8_DEFAULT_CFG]) @pytest.mark.parametrize(("ep_size", "etp_size"), [(1, 2), (2, 1), (2, 2)]) @pytest.mark.parametrize("moe_grouped_gemm", [True, False]) @@ -932,3 +960,79 @@ def test_kv_cache_sharded_state_dict(tmp_path, config): job=partial(_test_kv_cache_sharded_state_dict_helper, tmp_path, config), backend="nccl", ) + + +def test_convert_mcore_te_gpt_model(distributed_setup_size_1): + if not HAS_TE: + pytest.skip("Transformer Engine is not installed") + initialize_for_megatron(tensor_model_parallel_size=1, seed=SEED) + model = get_mcore_gpt_model(tensor_model_parallel_size=1, transformer_impl="transformer_engine") + + input_ids, labels, position_ids, attention_mask, loss_mask = get_batch(model) + + def forward(model): + return model.forward( + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + labels=labels, + loss_mask=loss_mask, + ) + + for name, param in model.named_parameters(): + param.requires_grad = True + + # Set to eval mode to disable dropout for deterministic outputs + model.eval() + ref_output = forward(model) + + model = mtq.quantize(model, mtq.INT8_DEFAULT_CFG, forward) + + for n, m in model.named_modules(): + if isinstance(m, TERowParallelLinear): + assert isinstance(m, _QuantTEMCoreRowParallelLinear) + assert m.input_quantizer.amax is not None + assert m.weight_quantizer.amax is not None + + # Save which quantizers are enabled before disabling + enabled_quantizers = { + name + for name, m in model.named_modules() + if isinstance(m, mtq.nn.TensorQuantizer) and m.is_enabled + } + + mtq.disable_quantizer(model, "*") + disabled_output = forward(model) + assert 
torch.allclose(ref_output, disabled_output, atol=1e-5), ( + "Output with quantizers disabled should match reference output" + ) + + mtq.enable_quantizer(model, lambda name: name in enabled_quantizers) + enabled_output = forward(model) + assert not torch.allclose(ref_output, enabled_output, atol=1e-5), ( + "Output with quantizers enabled should differ from reference output" + ) + # enable model for training to test backward pass + model.train() + loss = forward(model).sum() + loss.backward() + + destroy_model_parallel() + + +def test_homogeneous_sharded_state_dict_te_spec(tmp_path): + pytest.skip("The test is temporarily disabled to avoid CI timeout") + spawn_multiprocess_job( + size=2, + job=partial( + _test_sharded_state_dict, + tmp_path, + mtq.INT8_DEFAULT_CFG, + 256, + None, + False, + False, + {"transformer_impl": "transformer_engine"}, + ), + backend="nccl", + ) diff --git a/tests/gpu/torch/quantization/plugins/test_transformer_engine.py b/tests/gpu/torch/quantization/plugins/test_transformer_engine.py index ae84fe95b..288cc7519 100644 --- a/tests/gpu/torch/quantization/plugins/test_transformer_engine.py +++ b/tests/gpu/torch/quantization/plugins/test_transformer_engine.py @@ -16,11 +16,13 @@ import pytest import torch import torch.nn as nn +from _test_utils.torch.misc import set_seed from _test_utils.torch.quantization.quantize_common import quantize_model_and_forward import modelopt.torch.opt as mto import modelopt.torch.quantization as mtq from modelopt.torch.quantization.extensions import get_cuda_ext_mx +from modelopt.torch.quantization.nn import QuantModule te = pytest.importorskip("transformer_engine") @@ -29,7 +31,7 @@ class TELinear(nn.Module): def __init__(self): super().__init__() self.net = torch.nn.Sequential( - te.pytorch.Linear(16, 32), te.pytorch.Linear(32, 64), te.pytorch.Linear(64, 16) + te.pytorch.Linear(16, 32), te.pytorch.LayerNormLinear(32, 16, normalization="RMSNorm") ) def forward(self, x): @@ -77,6 +79,35 @@ def test_quantize(model_cls, config): quantize_model_and_forward(model, config, calib_data) +def test_quantize_forward_backward(): + set_seed() + model = TELinear().cuda() + with torch.no_grad(): + for name, param in model.named_parameters(): + param.data.copy_(param.data.abs() + 0.1) # hack to get non-zero gradient + + # hack to get non-zero gradient + calib_data = [model.get_input().cuda().abs() + 0.1 for _ in range(1)] + + quantize_model_and_forward(model, mtq.INT8_DEFAULT_CFG, calib_data) + + model.train() + for name, param in model.named_parameters(): + param.grad = None + + loss = model(calib_data[0]).sum() + loss.backward() + + for i, linear in enumerate(model.net): + assert isinstance(linear, QuantModule) + # In-directly tests that data was passed to the quantizers + assert linear.input_quantizer.amax is not None + assert linear.weight_quantizer.amax is not None + + # In-directly tests that gradients were computed correctly + assert linear.weight.grad is not None and linear.weight.grad.abs().sum() > 0.0 + + @pytest.mark.parametrize( ("model_cls", "quant_config"), [ diff --git a/tests/unit/torch/quantization/test_quantize_cpu.py b/tests/unit/torch/quantization/test_quantize_cpu.py index 85fa07fa4..5bc39a517 100644 --- a/tests/unit/torch/quantization/test_quantize_cpu.py +++ b/tests/unit/torch/quantization/test_quantize_cpu.py @@ -109,6 +109,10 @@ def test_quantize(model_cls, config): calib_data = [model.get_input() for _ in range(2)] quantize_model_and_forward(model, config, calib_data) + # For fast testing, lets just test one config + if config == 
mtq.INT8_DEFAULT_CFG: + mtq.print_quant_summary(model) + @pytest.mark.parametrize( ("model_cls", "quant_config"),