
Commit 9021845

export working, cleanup needed
Signed-off-by: Suguna Velury <[email protected]>
1 parent d48aaf2 commit 9021845

File tree: 6 files changed, +435 -107 lines


modelopt/torch/export/layer_utils.py

Lines changed: 3 additions & 1 deletion
@@ -345,7 +345,9 @@ def is_moe(module: nn.Module) -> bool:
 
 def is_quantlinear(module: nn.Module) -> bool:
     """Returns whether the module is a quantized linear layer."""
-    return "QuantLinear" in type(module).__name__ and "lora" not in type(module).__name__.lower()
+    return (
+        "QuantLinear" in type(module).__name__ and "lora" not in type(module).__name__.lower()
+    ) or ("Quant" in type(module).__name__ and "Linear" in type(module).__name__)
 
 
 def dup_kv_weight(v: torch.Tensor, head_size: int, num_head: int, tp_size: int) -> torch.Tensor:
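
Note: the widened predicate now also accepts quantized linear classes whose names contain "Quant" and "Linear" without the literal "QuantLinear" substring. A minimal sketch with stand-in class names (hypothetical classes, not real modelopt types; the import path follows the file path above):

import torch.nn as nn

from modelopt.torch.export.layer_utils import is_quantlinear  # path assumed from the file above


class FP8QuantLinear(nn.Linear):  # hypothetical: matched by the original "QuantLinear" clause
    pass


class QuantColumnParallelLinear(nn.Linear):  # hypothetical: matched only by the new clause
    pass


print(is_quantlinear(FP8QuantLinear(4, 4)))             # True
print(is_quantlinear(QuantColumnParallelLinear(4, 4)))  # True (newly covered)
print(is_quantlinear(nn.Linear(4, 4)))                  # False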

modelopt/torch/export/unified_export_hf.py

Lines changed: 6 additions & 6 deletions
@@ -150,14 +150,11 @@ def _output_hook(module, input, output):
             # For encoder-decoder models, we need to pass both the encoder and decoder input ids
             model(fake_input, decoder_input_ids=decoder_fake_input)
         else:
-            print("DEBUG LOG: Calling model(fake_input)")
             model(fake_input)
 
     for handle in handles:
         handle.remove()
 
-    print(f"DEBUG LOG: input_to_linear: {input_to_linear}")
-
     for tensor, modules in input_to_linear.items():
         quantization_format = get_quantization_format(modules[0])
         if len(modules) > 1 and quantization_format not in [
@@ -177,7 +174,8 @@ def _output_hook(module, input, output):
             and tensor in output_to_layernorm
         ):
             # Pre quant scale of modules is already updated to avg_pre_quant_scale
-            fuse_prequant_layernorm(output_to_layernorm[tensor], modules)
+            with fsdp2_aware_weight_update(model, output_to_layernorm[tensor]):
+                fuse_prequant_layernorm(output_to_layernorm[tensor], modules)
 
     # The dummy forward may not be able to activate all the experts.
     # Process experts by naming rules like experts.0, experts.1, etc.
@@ -470,7 +468,8 @@ def _export_hf_checkpoint(
         if get_quantization_format(sub_module) != QUANTIZATION_NONE:
             has_quantized_layers = True
         if is_quantlinear(sub_module):
-            _export_quantized_weight(sub_module, dtype)
+            with fsdp2_aware_weight_update(model, sub_module):
+                _export_quantized_weight(sub_module, dtype)
         elif (
             "Llama4TextExperts" in type(sub_module).__name__
             or "GptOssExperts" in type(sub_module).__name__
@@ -488,7 +487,8 @@
             )
             # Export the quantized weights
             for weight_name in ["gate_up_proj", "down_proj"]:
-                _export_quantized_weight(sub_module, dtype, weight_name)
+                with fsdp2_aware_weight_update(model, sub_module):
+                    _export_quantized_weight(sub_module, dtype, weight_name)
 
     quantized_state_dict = model.state_dict()
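
Taken together, these hunks route every in-place weight rewrite during export through the new context manager. A condensed caller-side sketch of the pattern (helper names follow the diff above; the import paths are assumed from the touched files, and export_fn stands in for the private _export_quantized_weight helper):

import torch

from modelopt.torch.export.layer_utils import is_quantlinear
from modelopt.torch.quantization.qtensor.base_qtensor import fsdp2_aware_weight_update


def export_quantized_weights(model: torch.nn.Module, dtype: torch.dtype, export_fn) -> None:
    """Sketch only: export_fn is a stand-in for _export_quantized_weight."""
    for _, sub_module in model.named_modules():
        if is_quantlinear(sub_module):
            # Unshard (if FSDP2-sharded), rewrite the weight, then rebuild the
            # FSDPParam bookkeeping and reshard on exit.
            with fsdp2_aware_weight_update(model, sub_module):
                export_fn(sub_module, dtype)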

modelopt/torch/quantization/qtensor/base_qtensor.py

Lines changed: 74 additions & 93 deletions
@@ -274,76 +274,88 @@ def fsdp2_aware_weight_update(root_model, modules_to_update):
 
         from modelopt.torch.quantization.utils import _get_enclosing_fsdp_module, _get_module_name
 
-        breakpoint()
-        # Get FSDP root module, if none is returned, then the update is not made to a submodule of an FSDPModule
-        if not isinstance(modules_to_update, list):
-            modules_to_update = [modules_to_update]
-
-        root_modules = set()
-        for module in modules_to_update:
-            root_module = _get_enclosing_fsdp_module(module, root_model)
-            root_modules.add(root_module)
-
-        # Ensure all modules in root_modules are the same
-        assert len(root_modules) == 1, "All modules must be in the same root FSDPModule"
-        root_module = next(iter(root_modules))
-
-        # Check if root module state is sharded and unshard if needed
-        if fully_shard.state(root_module)._fsdp_param_group.is_sharded:
-            with enable_fake_quant(root_module):
-                root_module.unshard()
-
-        # Get FSDPParam list
-        fsdp_param_group = fully_shard.state(root_module)._fsdp_param_group
-        fsdp_param_mapping = _create_fsdp_param_mapping(fsdp_param_group.fsdp_params, root_module)
-
-        # Assert that all the modules in the module list are present in this fsdp_param_group
-        for module in modules_to_update:
-            name = _get_module_name(module, root_module)
-            assert name in fsdp_param_mapping, f"Module {module} not found in fsdp_param_mapping"
+        if isinstance(root_model, FSDPModule):
+            # Get FSDP root module, if none is returned, then the update is not made to a submodule of an FSDPModule
+            if not isinstance(modules_to_update, list):
+                modules_to_update = [modules_to_update]
+
+            root_modules = set()
+            for module in modules_to_update:
+                root_module = _get_enclosing_fsdp_module(module, root_model)
+                root_modules.add(root_module)
+
+            # Ensure all modules in root_modules are the same
+            assert len(root_modules) == 1, "All modules must be in the same root FSDPModule"
+            root_module = next(iter(root_modules))
+
+            # Check if root module state is sharded and unshard if needed
+            if fully_shard.state(root_module)._fsdp_param_group.is_sharded:
+                with enable_fake_quant(root_module):
+                    root_module.unshard()
+
+            # Get FSDPParam list
+            fsdp_param_group = fully_shard.state(root_module)._fsdp_param_group
+            fsdp_param_mapping = _create_fsdp_param_mapping(
+                fsdp_param_group.fsdp_params, root_model
+            )
 
+            # Assert that all the modules in the module list are present in this fsdp_param_group
+            for module in modules_to_update:
+                name = _get_module_name(module, root_model)
+                assert name in fsdp_param_mapping, (
+                    f"Module {module} not found in fsdp_param_mapping"
+                )
         # Yields for necessary weight updates/processing
         yield
     finally:
-        # Update FSDPParam list
-        for module in modules_to_update:
-            name = _get_module_name(module, root_module)
-            old_fsdp_param = fsdp_param_mapping[name]
-
-            # Update mp policy to reflect the new dtype
-            new_mp_policy = MixedPrecisionPolicy(
-                param_dtype=module.weight.dtype,
-                reduce_dtype=None,
-                output_dtype=None,
-                cast_forward_inputs=False,
-            )
+        from torch.distributed.fsdp import fully_shard
 
-            with no_requires_grad():
-                # Create a new QFSDPParam or FSDPParam based on weight type
-                param_class = QFSDPParam if isinstance(module.weight, QTensorWrapper) else FSDPParam
-                new_param = param_class(
-                    module.weight,
-                    old_fsdp_param._module_info,
-                    old_fsdp_param.mesh_info,
-                    old_fsdp_param.post_forward_mesh_info,
-                    old_fsdp_param.device,
-                    None,
-                    new_mp_policy,
-                    None,
+        from modelopt.torch.quantization.utils import _get_enclosing_fsdp_module, _get_module_name
+
+        if isinstance(root_model, FSDPModule):
+            # Update FSDPParam list
+            for module in modules_to_update:
+                name = _get_module_name(module, root_model)
+                old_fsdp_param = fsdp_param_mapping[name]
+
+                # Update mp policy to reflect the new dtype
+                new_mp_policy = MixedPrecisionPolicy(
+                    param_dtype=module.weight.dtype,
+                    reduce_dtype=None,
+                    output_dtype=None,
+                    cast_forward_inputs=False,
                 )
 
-            # Update the FSDPParam mapping to keep track of the new FSDPParam
-            fsdp_param_mapping[name] = new_param
+                with no_requires_grad():
+                    # Create a new QFSDPParam or FSDPParam based on weight type
+                    param_class = (
+                        QFSDPParam if isinstance(module.weight, QTensorWrapper) else FSDPParam
+                    )
+                    new_param = param_class(
+                        module.weight,
+                        old_fsdp_param._module_info,
+                        old_fsdp_param.mesh_info,
+                        old_fsdp_param.post_forward_mesh_info,
+                        old_fsdp_param.device,
+                        None,
+                        new_mp_policy,
+                        None,
+                    )
+                    if not isinstance(new_param, QFSDPParam):
+                        new_param.init_dtype_attrs(new_mp_policy)
+
+                # Update the FSDPParam mapping to keep track of the new FSDPParam
+                fsdp_param_mapping[name] = new_param
 
-            # Remove the post_load_hook_handle to allow gc to collect the old FSDPParam
-            old_fsdp_param._post_load_hook_handle.remove()
+                # Remove the post_load_hook_handle to allow gc to collect the old FSDPParam
+                old_fsdp_param._post_load_hook_handle.remove()
 
-        # Update FSDPParam list with new compressed weights
-        fsdp_param_group.fsdp_params = list(fsdp_param_mapping.values())
+            # Update FSDPParam list with new compressed weights
+            fsdp_param_group.fsdp_params = list(fsdp_param_mapping.values())
 
-        # Reshard FSDP root module
-        # TODO: Check if reshard is needed or not
-        root_module.reshard()
+            # Reshard FSDP root module
+            # TODO: Check if reshard is needed or not
+            root_module.reshard()
 
 
 def pack_real_quantize_weight(module, force_quantize: bool = False):
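
One behavioral consequence of the new isinstance(root_model, FSDPModule) guards is that the context manager degrades to a plain passthrough for non-FSDP models, so the same export code works with and without FSDP2 sharding. A minimal single-process sketch of that no-op path (import path assumed from this file; no distributed setup required):

import torch

from modelopt.torch.quantization.qtensor.base_qtensor import fsdp2_aware_weight_update

# A plain module is not an FSDPModule, so both the setup and the finally block
# above are skipped and the context manager simply yields.
model = torch.nn.Linear(8, 8, bias=False)
with fsdp2_aware_weight_update(model, model):
    model.weight.data = model.weight.data.to(torch.bfloat16)  # arbitrary in-place rewrite
print(model.weight.dtype)  # torch.bfloat16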
@@ -422,39 +434,8 @@ def _compress_fsdp_module(fsdp_module):
             if name not in fsdp_param_mapping:
                 continue
 
-            if _compress_and_update_module_weight(submodule):
-                old_fsdp_param = fsdp_param_mapping[name]
-
-                # Update mp policy to reflect the new dtype
-                new_mp_policy = MixedPrecisionPolicy(
-                    param_dtype=submodule.weight.dtype,
-                    reduce_dtype=None,
-                    output_dtype=None,
-                    cast_forward_inputs=False,
-                )
-                with no_requires_grad():
-                    # Create a new QFSDPParam parameter
-                    new_param = QFSDPParam(
-                        submodule.weight,
-                        old_fsdp_param._module_info,
-                        old_fsdp_param.mesh_info,
-                        old_fsdp_param.post_forward_mesh_info,
-                        old_fsdp_param.device,
-                        None,
-                        new_mp_policy,
-                        None,
-                    )
-
-                # Update the FSDPParam mapping to keep track of the new FSDPParam
-                fsdp_param_mapping[name] = new_param
-                # Remove the post_load_hook_handle to allow gc to collect the old FSDPParam
-                old_fsdp_param._post_load_hook_handle.remove()
-
-        # Update FSDPParam list with new compressed weights
-        fsdp_param_group.fsdp_params = list(fsdp_param_mapping.values())
-
-        # Reshard FSDP root module
-        fsdp_module.reshard()
+            with fsdp2_aware_weight_update(fsdp_module, submodule):
+                _compress_and_update_module_weight(submodule)
 
     with SequentialQuantizer.convert_to_single_quantizer(module), torch.no_grad():
         for _, m in module.named_modules():

tests/_test_utils/torch_export/export_utils.py

Lines changed: 35 additions & 4 deletions
@@ -18,20 +18,22 @@
 
 # Models
 class ToyModel(torch.nn.Module):
-    def __init__(self, dims=[10, 10, 10, 10]):
+    def __init__(self, dims=[10, 10, 10, 10], bias=True):
         super().__init__()
         assert len(dims) >= 2
         if len(dims) == 2:
-            self.linears = torch.nn.Linear(dims[0], dims[1])
+            self.linears = torch.nn.Linear(dims[0], dims[1], bias=bias)
         else:
-            linears = [torch.nn.Linear(dims[i], dims[i + 1]) for i in range(len(dims) - 1)]
+            linears = [
+                torch.nn.Linear(dims[i], dims[i + 1], bias=bias) for i in range(len(dims) - 1)
+            ]
             self.linears = torch.nn.Sequential(*linears)
 
     def forward(self, x):
         return self.linears(x)
 
 
-class SmallQKVModel(torch.nn.Module):
+class SmallLinearModelwithCustomWeight(torch.nn.Module):
     def __init__(self, weights):
         super().__init__()
         self.q_proj = torch.nn.Linear(weights[0].shape[1], weights[0].shape[0], bias=False)
@@ -52,6 +54,35 @@ def forward(self, x):
         return x
 
 
+class SmallQKVModel(torch.nn.Module):
+    def __init__(self, dim=4, device="cuda", apply_embed=False):
+        super().__init__()
+        self.embedding = torch.nn.Embedding(2, dim)
+        self.q_proj = torch.nn.Linear(dim, dim, bias=False)
+        self.k_proj = torch.nn.Linear(dim, dim, bias=False)
+        self.v_proj = torch.nn.Linear(dim, dim, bias=False)
+        self.o_proj = torch.nn.Linear(dim, dim, bias=False)
+        self.device = device
+        self.config = None
+        self.apply_embed = apply_embed
+        # TODO: Debug why fsdp2 modifies bias of layernorm for awq
+        self.input_layernorm = torch.nn.LayerNorm(dim, bias=False)
+
+    def forward(self, x):
+        if self.apply_embed:
+            x = self.embedding(x)
+
+        x = self.input_layernorm(x)
+        q_proj = self.q_proj(x)
+        k_proj = self.k_proj(x)
+        v_proj = self.v_proj(x)
+        scores = torch.matmul(q_proj, k_proj.transpose(-2, -1))
+        attn = torch.nn.functional.softmax(scores, dim=-1)
+        x = torch.matmul(attn, v_proj)
+        o_proj = self.o_proj(x)
+        return o_proj
+
+
 # Quantization configs
 partial_fp8_config = {
     "quant_cfg": {

tests/gpu/torch/export/test_export.py

Lines changed: 4 additions & 3 deletions
@@ -16,7 +16,7 @@
 import pytest
 import torch
 from _test_utils.torch_export.export_utils import (
-    SmallQKVModel,
+    SmallLinearModelwithCustomWeight,
     ToyModel,
     only_input_quantizer_fp8_config,
     only_output_quantizer_fp8_config,
@@ -306,7 +306,7 @@ def test_adjust_attn_amax_values(
     q_weight, k_weight, v_weight, o_weight, expected_qkv_amax, expected_o_amax, config
 ):
     # Initialize model and quantize to insert quantizers
-    model = SmallQKVModel([q_weight, k_weight, v_weight, o_weight]).to("cuda")
+    model = SmallLinearModelwithCustomWeight([q_weight, k_weight, v_weight, o_weight]).to("cuda")
     mtq.quantize(model, config, lambda x: x(torch.randn(1, 4, q_weight.shape[1], device="cuda")))
     adjust_attn_amax_values(model)
     # Weight quantizer amax must remain unchanged for non qkv layers
@@ -375,11 +375,12 @@ def test_get_scaling_factor(
     q_weight, k_weight, v_weight, o_weight, config, expected_amax, maxbound
 ):
     # Initialize model and quantize to insert quantizers
-    model = SmallQKVModel([q_weight, k_weight, v_weight, o_weight]).to("cuda")
+    model = SmallLinearModelwithCustomWeight([q_weight, k_weight, v_weight, o_weight]).to("cuda")
     mtq.quantize(model, config, lambda x: x(torch.ones(1, 2, q_weight.shape[1], device="cuda")))
     for name, module in model.named_modules():
         if isinstance(module, TensorQuantizer) and module.is_enabled:
             scale = get_scaling_factor(module)
+            print(f"DEBUG LOG: Scale: {scale}, Expected: {expected_amax[0] / maxbound}")
             assert torch.allclose(
                 scale,
                 torch.tensor((expected_amax[0] / maxbound), dtype=scale.dtype),
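
The quantity this assertion checks is simply amax divided by the quantizer's maxbound. A one-line sketch with illustrative numbers (not taken from the test's parametrization); for an FP8 E4M3 quantizer the maxbound is 448:

amax, maxbound = 0.5, 448.0  # illustrative values; E4M3 finite max is 448
scale = amax / maxbound      # ~0.001116, the value get_scaling_factor is expected to return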

0 commit comments
