updated unit tests

sugunav14 · sugunav14 · commit fc0d6e81b8af · 2025-12-07T21:33:00.000Z
Signed-off-by: Suguna Velury <178320438+sugunav14@users.noreply.github.com>
diff --git a/modelopt/torch/quantization/utils.py b/modelopt/torch/quantization/utils.py
@@ -712,9 +712,15 @@ def fsdp2_aware_weight_update(root_model, modules_to_update, reshard=True):
             # Assert that all the modules in the module list are present in this fsdp_param_group
             if len(modules_to_update) > 1:
                 for module in modules_to_update:
-                    name = _get_module_name(module, root_model)
-                    assert name in fsdp_param_mapping, (
-                        f"Module {module} not found in fsdp_param_mapping"
+                    module_name = _get_module_name(module, root_model)
+                    # Check if any parameter from this module is in the mapping
+                    module_params_in_mapping = any(
+                        f"{module_name}.{n}" in fsdp_param_mapping
+                        for n, _ in module.named_parameters()
+                    )
+                    assert module_params_in_mapping, (
+                        f"Module {module} with name '{module_name}' not found in fsdp_param_mapping. "
+                        f"Available keys: {list(fsdp_param_mapping.keys())}"
                     )
         # Yields for necessary weight updates/processing
         yield
diff --git a/tests/_test_utils/torch/export/utils.py b/tests/_test_utils/torch/export/utils.py
@@ -55,18 +55,18 @@ def forward(self, x):
 
 
 class SmallQKVModel(torch.nn.Module):
-    def __init__(self, dim=4, device="cuda", apply_embed=False):
+    def __init__(self, dim=4, device="cuda", apply_embed=False, bias=False):
         super().__init__()
         self.embedding = torch.nn.Embedding(2, dim)
-        self.q_proj = torch.nn.Linear(dim, dim, bias=False)
-        self.k_proj = torch.nn.Linear(dim, dim, bias=False)
-        self.v_proj = torch.nn.Linear(dim, dim, bias=False)
-        self.o_proj = torch.nn.Linear(dim, dim, bias=False)
+        self.q_proj = torch.nn.Linear(dim, dim, bias=bias)
+        self.k_proj = torch.nn.Linear(dim, dim, bias=bias)
+        self.v_proj = torch.nn.Linear(dim, dim, bias=bias)
+        self.o_proj = torch.nn.Linear(dim, dim, bias=bias)
         self.device = device
         self.config = None
         self.apply_embed = apply_embed
         # TODO: Debug why fsdp2 modifies bias of layernorm for awq
-        self.input_layernorm = torch.nn.LayerNorm(dim, bias=False)
+        self.input_layernorm = torch.nn.LayerNorm(dim, bias=bias)
 
     def forward(self, x):
         if self.apply_embed:
diff --git a/tests/gpu/torch/export/test_fsdp2_export.py b/tests/gpu/torch/export/test_fsdp2_export.py
@@ -118,11 +118,11 @@ def _compare_parameters_and_buffers(model1, model2):
         )
 
 
-def _fuse_layers(rank, size, quant_config):
+def _fuse_layers(rank, size, quant_config, bias):
     with patch_fsdp_mp_dtypes():
         # Initialize model
-        model = SmallQKVModel(dim=32).to("cuda")
-        non_fsdp_model = SmallQKVModel(dim=32).to("cuda")
+        model = SmallQKVModel(dim=32, bias=bias).to("cuda")
+        non_fsdp_model = SmallQKVModel(dim=32, bias=bias).to("cuda")
         non_fsdp_model.load_state_dict(copy.deepcopy(model.state_dict()))
         model.eval()
         non_fsdp_model.eval()
@@ -159,15 +159,15 @@ def calib_fn(x):
         _compare_parameters_and_buffers(model, non_fsdp_model)
 
 
-def _export_quantized_weight_test(rank, size, quant_config):
+def _export_quantized_weight_test(rank, size, quant_config, bias):
     import copy
 
     from torch.distributed._composable.fsdp import fully_shard
 
     with patch_fsdp_mp_dtypes():
         # Initialize model
-        model = SmallQKVModel(dim=32).to("cuda")
-        non_fsdp_model = SmallQKVModel(dim=32).to("cuda")
+        model = SmallQKVModel(dim=32, bias=bias).to("cuda")
+        non_fsdp_model = SmallQKVModel(dim=32, bias=bias).to("cuda")
         non_fsdp_model.load_state_dict(copy.deepcopy(model.state_dict()))
         model.eval()
         non_fsdp_model.eval()
@@ -247,10 +247,11 @@ def test_fsdp2_weight_update_context_for_export(device_count):
     ],
 )
 @pytest.mark.parametrize("device_count", get_device_counts())
-def test_fsdp2_weight_update_context_for_fuse_layers(device_count, quant_config):
+@pytest.mark.parametrize("bias", [True, False])
+def test_fsdp2_weight_update_context_for_fuse_layers(device_count, quant_config, bias):
     spawn_multiprocess_job(
         size=device_count,
-        job=partial(_fuse_layers, quant_config=quant_config),
+        job=partial(_fuse_layers, quant_config=quant_config, bias=bias),
         backend="nccl",
     )
 
@@ -270,9 +271,10 @@ def test_fsdp2_weight_update_context_for_fuse_layers(device_count, quant_config)
     ],
 )
 @pytest.mark.parametrize("device_count", get_device_counts())
-def test_fsdp2_weight_update_context_for_export_quantized_weight(device_count, quant_config):
+@pytest.mark.parametrize("bias", [True, False])
+def test_fsdp2_weight_update_context_for_export_quantized_weight(device_count, quant_config, bias):
     spawn_multiprocess_job(
         size=device_count,
-        job=partial(_export_quantized_weight_test, quant_config=quant_config),
+        job=partial(_export_quantized_weight_test, quant_config=quant_config, bias=bias),
         backend="nccl",
     )