
Commit 0ccb1d6

Fix according to coderabbitai
Signed-off-by: James Shen <[email protected]>
1 parent cf8cdf0 commit 0ccb1d6

10 files changed (+203, -103 lines)


examples/quantization/ptq_generate_vlm.py

Lines changed: 17 additions & 32 deletions
@@ -80,28 +80,12 @@ def _validate_quantized_model(model: torch.nn.Module, is_rank_0: bool) -> None:
     """
     model_str = str(model)
 
-    # DEBUG: Print full model structure to diagnose CI vs local differences
-    if is_rank_0:
-        console.print(f"\n{'=' * 80}")
-        console.print("[yellow]DEBUG: Full model structure:[/yellow]")
-        console.print(f"{'=' * 80}")
-        console.print(model_str)
-        console.print(f"{'=' * 80}\n")
-
     # TE spec quantized layers (VLM models always use TE spec)
     te_spec_layers = [
        "QuantTERowParallelLinear",
        "QuantTELayerNormColumnParallelLinear",
     ]
 
-    # DEBUG: Check each layer individually
-    if is_rank_0:
-        console.print("[yellow]DEBUG: Checking for quantized layers:[/yellow]")
-        for layer in te_spec_layers:
-            found = layer in model_str
-            status = "[green]FOUND[/green]" if found else "[red]NOT FOUND[/red]"
-            console.print(f" {layer}: {status}")
-
     # Check if model has TE spec quantized layers
     has_te_spec = all(layer in model_str for layer in te_spec_layers)
 
@@ -264,21 +248,22 @@ def main(
         default=DEFAULT_IMAGE_PATH,
         help="Path to the image file for VLM generation.",
     )
-    parser.add_argument("--trust-remote-code", action="store_true", default=True, help="if trust_remote_code")
+    parser.add_argument("--trust-remote-code", action="store_true", help="if trust_remote_code")
 
     args = parser.parse_args()
-    main(
-        args.hf_model_id,
-        args.tp,
-        args.pp,
-        args.ep,
-        args.etp,
-        args.megatron_load_path,
-        args.prompts,
-        args.osl,
-        args.image_path,
-        args.trust_remote_code,
-    )
-
-    if torch.distributed.is_initialized():
-        torch.distributed.destroy_process_group()
+    try:
+        main(
+            args.hf_model_id,
+            args.tp,
+            args.pp,
+            args.ep,
+            args.etp,
+            args.megatron_load_path,
+            args.prompts,
+            args.osl,
+            args.image_path,
+            args.trust_remote_code,
+        )
+    finally:
+        if torch.distributed.is_initialized():
+            torch.distributed.destroy_process_group()
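
The --trust-remote-code change above is worth a second look: with argparse, action="store_true" already implies a default of False, so combining it with default=True made the flag a no-op that could never be turned off. A minimal standalone sketch of the behavior (plain argparse, independent of the example script):

import argparse

# Old pattern: the option is True whether or not the flag is passed.
parser_old = argparse.ArgumentParser()
parser_old.add_argument("--trust-remote-code", action="store_true", default=True)
print(parser_old.parse_args([]).trust_remote_code)                       # True
print(parser_old.parse_args(["--trust-remote-code"]).trust_remote_code)  # True

# Fixed pattern: False unless the flag is explicitly passed.
parser_new = argparse.ArgumentParser()
parser_new.add_argument("--trust-remote-code", action="store_true")
print(parser_new.parse_args([]).trust_remote_code)                       # False
print(parser_new.parse_args(["--trust-remote-code"]).trust_remote_code)  # True

The try/finally around main() in the same hunk serves a related robustness goal: the process group is destroyed even if generation raises.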

examples/quantization/quantize_utils.py

Lines changed: 6 additions & 2 deletions
@@ -20,6 +20,7 @@
 """
 
 import argparse
+import copy
 
 import modelopt.torch.quantization as mtq
 from rich.console import Console
@@ -40,7 +41,9 @@
 }
 
 
-def get_modelopt_torch_quantization_config(export_quant_cfg, export_kv_cache_quant=False, weight_only=False):
+def get_modelopt_torch_quantization_config(
+    export_quant_cfg: str, export_kv_cache_quant: bool = False, weight_only: bool = False
+) -> dict:
     """Return a quantization config based on the specified configuration.
 
     Args:
@@ -54,7 +57,8 @@ def get_modelopt_torch_quantization_config(export_quant_cfg, export_kv_cache_qua
     Raises:
         KeyError: If export_quant_cfg is not a valid configuration name.
     """
-    mtq_config = QUANT_CFG_CHOICES[export_quant_cfg]
+    # Use deepcopy to avoid mutating the original config in QUANT_CFG_CHOICES
+    mtq_config = copy.deepcopy(QUANT_CFG_CHOICES[export_quant_cfg])
 
     fp8_config = {"enable": True, "num_bits": (4, 3), "axis": None}
     fp4_config = {
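
The copy.deepcopy change above prevents a subtle shared-state bug: the entries in QUANT_CFG_CHOICES are nested dicts, so returning one by reference and then editing it (for example when applying the KV-cache or weight-only options) would silently rewrite the module-level table for every later caller. A small generic sketch of the aliasing problem, using placeholder dicts rather than the real modelopt configs:

import copy

# Hypothetical stand-in for a module-level table of shared quantization configs.
CFG_TABLE = {"fp8": {"quant_cfg": {"*weight_quantizer": {"enable": True}}}}

def get_cfg_by_reference(name):
    cfg = CFG_TABLE[name]  # aliases the shared dict
    cfg["quant_cfg"]["*weight_quantizer"]["enable"] = False
    return cfg

def get_cfg_by_copy(name):
    cfg = copy.deepcopy(CFG_TABLE[name])  # private copy, safe to mutate
    cfg["quant_cfg"]["*weight_quantizer"]["enable"] = False
    return cfg

get_cfg_by_reference("fp8")
print(CFG_TABLE["fp8"]["quant_cfg"]["*weight_quantizer"]["enable"])  # False: the shared table was mutated

CFG_TABLE["fp8"]["quant_cfg"]["*weight_quantizer"]["enable"] = True  # reset for the second demo
get_cfg_by_copy("fp8")
print(CFG_TABLE["fp8"]["quant_cfg"]["*weight_quantizer"]["enable"])  # True: the shared table is untouched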

examples/quantization/quantize_vlm.py

Lines changed: 34 additions & 37 deletions
@@ -209,10 +209,11 @@ def _hf_dataset_forward_loop_func(
         disable_tqdm=True,
     )
 
-    if force_all_expert_routing:
-        for name, module in model.named_modules():
-            if isinstance(module, TopKRouter):
-                module.topk = module.config.moe_router_topk
+    # Restore original topk after calibration is complete
+    if force_all_expert_routing:
+        for name, module in model.named_modules():
+            if isinstance(module, TopKRouter):
+                module.topk = module.config.moe_router_topk
 
 
 def _custom_prompt_forward_loop_func(
@@ -393,6 +394,10 @@ def ptq_forward_loop_func(model):
     if megatron_save_path is None:
         model_name = hf_model_id.replace("/", "_")
         megatron_save_path = f"./{model_name}_quantized_{export_quant_cfg}"
+        if is_rank_0:
+            console.print(
+                f"[yellow]No --megatron-save-path specified. Using default path: {megatron_save_path}[/yellow]"
+            )
 
     if is_rank_0:
         console.print("[green]Testing model AFTER quantization...[/green]")
@@ -406,18 +411,9 @@ def ptq_forward_loop_func(model):
     _custom_prompt_forward_loop_func(unwrapped_model, processor, is_rank_0, prompts)
 
     # Save quantized model in Megatron format
-    if megatron_save_path:
-        save_path = megatron_save_path
-    else:
-        # Create default save path using model name and quantization config
-        model_name = hf_model_id.split("/")[-1]
-        save_path = f"{model_name}_quantized_{export_quant_cfg}"
-        if is_rank_0:
-            console.print(f"[yellow]No --megatron-save-path specified. Using default path: {save_path}[/yellow]")
-
     if is_rank_0:
-        console.print(f"Saving quantized Megatron checkpoint in {save_path}...")
-        bridge.save_megatron_model(megatron_model, save_path)
+        console.print(f"Saving quantized Megatron checkpoint in {megatron_save_path}...")
+        bridge.save_megatron_model(megatron_model, megatron_save_path)
 
 
 if __name__ == "__main__":
@@ -459,25 +455,26 @@ def ptq_forward_loop_func(model):
         "Useful for offline CI environments.",
     )
     args = parser.parse_args()
-    main(
-        args.hf_model_id,
-        args.tp,
-        args.pp,
-        args.ep,
-        args.etp,
-        args.megatron_save_path,
-        args.export_quant_cfg,
-        args.calib_size,
-        args.compress,
-        args.weight_only,
-        args.export_kv_cache_quant,
-        args.force_all_expert_routing,
-        args.trust_remote_code,
-        args.prompts,
-        args.skip_quantization,
-        args.test_image_path,
-        args.use_random_calib,
-    )
-
-    if torch.distributed.is_initialized():
-        torch.distributed.destroy_process_group()
+    try:
+        main(
+            args.hf_model_id,
+            args.tp,
+            args.pp,
+            args.ep,
+            args.etp,
+            args.megatron_save_path,
+            args.export_quant_cfg,
+            args.calib_size,
+            args.compress,
+            args.weight_only,
+            args.export_kv_cache_quant,
+            args.force_all_expert_routing,
+            args.trust_remote_code,
+            args.prompts,
+            args.skip_quantization,
+            args.test_image_path,
+            args.use_random_calib,
+        )
+    finally:
+        if torch.distributed.is_initialized():
+            torch.distributed.destroy_process_group()
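
The re-indented block in the first hunk only restores each TopKRouter's topk once calibration finishes; the corresponding setup (forcing every expert to receive tokens so all expert weights get calibrated) happens earlier in the script and is not shown in this diff. A hedged sketch of the overall pattern as a context manager; topk and config.moe_router_topk come from the diff, while num_experts and the router discovery by class name are illustrative assumptions:

from contextlib import contextmanager

@contextmanager
def force_all_expert_routing_ctx(model, num_experts):
    """Temporarily route every token to all experts, then restore the configured top-k."""
    # Illustrative: find router modules by class name instead of importing TopKRouter.
    routers = [m for m in model.modules() if type(m).__name__ == "TopKRouter"]
    saved_topk = [r.topk for r in routers]
    try:
        for r in routers:
            r.topk = num_experts  # assumption: route to all experts during calibration
        yield model
    finally:
        for r, topk in zip(routers, saved_topk):
            r.topk = topk  # restore the original top-k, as the diff does via config.moe_router_topk

Wrapped around the calibration forward loop, the finally branch plays the same role as the restoration block added in the hunk above: the routers return to their configured top-k even if calibration fails partway through.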

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -120,7 +120,7 @@ override-dependencies = [
 transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "6a34b6574fa6c29d9d07fdcddf9812cbb1488878" }
 megatron-core = { path = "3rdparty/Megatron-LM/" }
 nvidia-resiliency-ext = { git = "https://github.com/NVIDIA/nvidia-resiliency-ext.git", rev = "54f85fe422d296cf04ea524130014bd3a2c3add1" }
-nvidia-modelopt = { git = "https://github.com/NVIDIA/TensorRT-Model-Optimizer.git", rev = "0a4f0a8b933121f7af080261a0a5a7717f2c5d49" }
+nvidia-modelopt = { git = "https://github.com/NVIDIA/TensorRT-Model-Optimizer.git", rev = "aafd3883942a564f1ac08a1e5f363abd61d383cf" }
 # mamba-ssm = { git = "https://github.com/yfw/mamba", branch = "general_stride_fix" }
 
 [project.optional-dependencies]

src/megatron/bridge/models/qwen_vl/modelling_qwen3_vl/model.py

Lines changed: 3 additions & 2 deletions
@@ -189,8 +189,8 @@ def forward(
         video_grid_thw: torch.Tensor = None,
         # cat set at dataset
         image_input_mask: torch.Tensor = None,
-        inference_context=None,
-        runtime_gather_output=None,
+        inference_context: object | None = None,
+        runtime_gather_output: bool | None = None,
     ) -> torch.Tensor:
         """Forward function of the Qwen3VL model.
 
@@ -212,6 +212,7 @@ def forward(
             output (torch.Tensor): Loss of shape [b, s] if labels are provided, otherwise logits of shape
                 [b, s, vocab_size].
         """
+        del inference_context, runtime_gather_output  # Unused, kept for API compatibility
         assert pixel_values_videos is None and video_grid_thw is None, "not support video now"
         assert inference_params is None, "not support inference"
 
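
The added del statement is a small but deliberate idiom: the two parameters exist only so the forward signature matches what callers pass, and deleting them up front documents that they are intentionally ignored while keeping unused-argument linters quiet. A generic sketch of the idiom, unrelated to the Qwen3VL model itself:

def forward_compatible(x: float, inference_context: object | None = None, runtime_gather_output: bool | None = None) -> float:
    # Accepted only for interface compatibility with callers that pass them; not used here.
    del inference_context, runtime_gather_output
    return x * 2.0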

src/megatron/bridge/models/qwen_vl/modelling_qwen3_vl/utils.py

Lines changed: 10 additions & 6 deletions
@@ -111,8 +111,11 @@ def get_rope_index(
             attention_mask = torch.ones_like(total_input_ids)
         # Handle multi-dimensional attention masks
         elif attention_mask.dim() > 2:
-            # For causal mask, create a simple 2D mask [batch, seq]
-            attention_mask = torch.ones_like(total_input_ids)
+            # Collapse to [batch, seq] while preserving padding information
+            attention_mask = attention_mask.any(dim=-1)
+            if attention_mask.dim() == 3:
+                attention_mask = attention_mask.squeeze(1)
+            attention_mask = attention_mask.to(dtype=total_input_ids.dtype)
         position_ids = torch.ones(
             3,
             input_ids.shape[0],
@@ -192,10 +195,11 @@ def get_rope_index(
         if attention_mask is not None:
             # Handle multi-dimensional attention mask
             if attention_mask.dim() > 2:
-                # For causal mask, create a simple 2D mask [batch, seq]
-                attention_mask = torch.ones(
-                    (input_ids.shape[0], input_ids.shape[1]), dtype=torch.long, device=input_ids.device
-                )
+                # Collapse to [batch, seq] while preserving padding information
+                attention_mask = attention_mask.any(dim=-1)
+                if attention_mask.dim() == 3:
+                    attention_mask = attention_mask.squeeze(1)
+                attention_mask = attention_mask.to(dtype=torch.long)
             position_ids = attention_mask.long().cumsum(-1) - 1
             position_ids.masked_fill_(attention_mask == 0, 1)
             position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device)
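
The behavioral point of the new mask handling is that attention_mask.any(dim=-1) keeps padding information that the old torch.ones_like / torch.ones fallback threw away: a position stays marked as valid only if its row in the expanded mask attends to at least one key. A small self-contained PyTorch check of the collapse on a boolean 4D mask; the toy construction below (causal mask combined with a padding mask on both queries and keys) is an assumption for illustration, in the same 0/1 style as the masks used in the new unit tests:

import torch

# Toy setup: batch of 1, sequence of 5, last two positions are padding.
pad = torch.tensor([[1, 1, 1, 0, 0]], dtype=torch.bool)   # [batch, seq] padding mask
causal = torch.tril(torch.ones(5, 5, dtype=torch.bool))   # [seq, seq] causal mask

# Combined 4D mask [batch, 1, seq_q, seq_k] with padding applied to queries and keys.
mask_4d = causal[None, None] & pad[:, None, :, None] & pad[:, None, None, :]

# Collapse back to [batch, seq] the same way the updated get_rope_index does.
collapsed = mask_4d.any(dim=-1)   # [batch, 1, seq]
collapsed = collapsed.squeeze(1)  # [batch, seq]

print(collapsed.long())             # tensor([[1, 1, 1, 0, 0]]) -- padding preserved
print(torch.equal(collapsed, pad))  # True

# The old fallback produced all ones, treating padded positions as real tokens.
print(torch.ones_like(pad).long())  # tensor([[1, 1, 1, 1, 1]])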

tests/functional_tests/quantization/models/qwen_vl/test_qwen3_vl_quantization_workflow.py

Lines changed: 4 additions & 4 deletions
@@ -326,7 +326,7 @@ def test_qwen3_vl_quantization_and_generation(self, qwen3_vl_toy_model_path, tmp
         if quantize_result.returncode != 0:
             print(f"Quantization STDOUT: {quantize_result.stdout}")
             print(f"Quantization STDERR: {quantize_result.stderr}")
-            assert False, f"Quantization step failed with return code {quantize_result.returncode}"
+            pytest.fail(f"Quantization step failed with return code {quantize_result.returncode}")
 
         # Verify quantization succeeded
         assert "Quantizing the model with fp8 configuration" in quantize_result.stdout, (
@@ -350,7 +350,7 @@ def test_qwen3_vl_quantization_and_generation(self, qwen3_vl_toy_model_path, tmp
         if generation_result.returncode != 0:
             print(f"Generation STDOUT: {generation_result.stdout}")
             print(f"Generation STDERR: {generation_result.stderr}")
-            assert False, f"Generation step failed with return code {generation_result.returncode}"
+            pytest.fail(f"Generation step failed with return code {generation_result.returncode}")
 
         # Verify generation succeeded
         stdout_normalized = generation_result.stdout.replace("\n", "")
@@ -408,7 +408,7 @@ def test_qwen3_vl_quantization_and_generation_parallelism(
         if quantize_result.returncode != 0:
             print(f"Quantization STDOUT: {quantize_result.stdout}")
             print(f"Quantization STDERR: {quantize_result.stderr}")
-            assert False, f"Quantization step for {test_name} failed with return code {quantize_result.returncode}"
+            pytest.fail(f"Quantization step for {test_name} failed with return code {quantize_result.returncode}")
 
         # Verify quantization succeeded with correct parallelism
         assert "Quantizing the model with fp8 configuration" in quantize_result.stdout, (
@@ -438,7 +438,7 @@ def test_qwen3_vl_quantization_and_generation_parallelism(
         if generation_result.returncode != 0:
             print(f"Generation STDOUT: {generation_result.stdout}")
             print(f"Generation STDERR: {generation_result.stderr}")
-            assert False, f"Generation step for {test_name} failed with return code {generation_result.returncode}"
+            pytest.fail(f"Generation step for {test_name} failed with return code {generation_result.returncode}")
 
         # Verify generation succeeded with correct parallelism
         stdout_normalized = generation_result.stdout.replace("\n", "")
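
Replacing assert False with pytest.fail is more than a style preference: a bare assert is stripped when the interpreter runs with -O (assertions disabled), so the guarded failure branch would silently pass, and pytest.fail also states the intent explicitly. A tiny generic sketch of the pattern with a hypothetical subprocess call, not the actual workflow test:

import subprocess

import pytest

def test_tool_exits_cleanly():
    result = subprocess.run(["true"], capture_output=True, text=True)
    if result.returncode != 0:
        print(f"STDOUT: {result.stdout}")
        print(f"STDERR: {result.stderr}")
        # pytest.fail always raises, even under `python -O`, unlike a bare assert False.
        pytest.fail(f"Tool failed with return code {result.returncode}")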

tests/unit_tests/models/qwen_vl/modelling_qwen3_vl/test_utils.py

Lines changed: 62 additions & 0 deletions
@@ -144,3 +144,65 @@ def test_get_rope_index_packed_seq_params_fallback_dense_mask(self):
 
         assert torch.equal(position_ids, expected_positions)
         assert torch.equal(deltas, expected_deltas)
+
+    def test_get_rope_index_with_3d_attention_mask(self):
+        """Test get_rope_index with 3D attention mask (batch, seq, seq)."""
+        batch_size, seq_len = 2, 8
+        input_ids = torch.randint(0, 1000, (batch_size, seq_len))
+        # Create a 3D causal attention mask [batch, seq, seq]
+        attention_mask = torch.tril(torch.ones((batch_size, seq_len, seq_len)))
+
+        position_ids, deltas = get_rope_index(
+            spatial_merge_size=2,
+            image_token_id=151655,
+            video_token_id=151656,
+            vision_start_token_id=151652,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+        )
+
+        assert position_ids.shape == (3, batch_size, seq_len)
+        assert deltas.shape == (batch_size, 1)
+
+    def test_get_rope_index_with_4d_attention_mask(self):
+        """Test get_rope_index with 4D attention mask (batch, 1, seq, seq)."""
+        batch_size, seq_len = 2, 8
+        input_ids = torch.randint(0, 1000, (batch_size, seq_len))
+        # Create a 4D attention mask [batch, 1, seq, seq] - singleton head dimension
+        attention_mask = torch.tril(torch.ones((batch_size, 1, seq_len, seq_len)))
+
+        position_ids, deltas = get_rope_index(
+            spatial_merge_size=2,
+            image_token_id=151655,
+            video_token_id=151656,
+            vision_start_token_id=151652,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+        )
+
+        assert position_ids.shape == (3, batch_size, seq_len)
+        assert deltas.shape == (batch_size, 1)
+
+    def test_get_rope_index_with_3d_attention_mask_and_image(self):
+        """Test get_rope_index with 3D attention mask and image grid."""
+        batch_size, seq_len = 1, 16
+        input_ids = torch.randint(0, 1000, (batch_size, seq_len))
+        # Insert vision tokens
+        input_ids[0, 4] = 151652  # vision_start_token_id
+        input_ids[0, 5] = 151655  # image_token_id
+        image_grid_thw = torch.tensor([[1, 4, 4]])  # t=1, h=4, w=4
+        # Create a 3D attention mask [batch, seq, seq]
+        attention_mask = torch.tril(torch.ones((batch_size, seq_len, seq_len)))
+
+        position_ids, deltas = get_rope_index(
+            spatial_merge_size=2,
+            image_token_id=151655,
+            video_token_id=151656,
+            vision_start_token_id=151652,
+            input_ids=input_ids,
+            image_grid_thw=image_grid_thw,
+            attention_mask=attention_mask,
+        )
+
+        assert position_ids.shape == (3, batch_size, seq_len)
+        assert deltas.shape == (batch_size, 1)
