Commit 1ef040f

Merge branch 'main' into jennifchen/qat_slurm
2 parents 0eac8f6 + 8a07376 commit 1ef040f

File tree: 10 files changed (+379, -153 lines)


.gitlab/tests.yml

Lines changed: 4 additions & 4 deletions
@@ -2,10 +2,10 @@
 .tests-default:
   stage: tests
   rules:
-    - if: $JET_ONLY != null
-      when: never
-    - if: $CI_COMMIT_TAG =~ /^\d+\.\d+\.\d+$/
-    - if: $CI_PIPELINE_SOURCE == "web" || $CI_PIPELINE_SOURCE == "schedule"
+    - if: $CI_PIPELINE_SOURCE == "schedule"
+      when: always
+    - if: $CI_PIPELINE_SOURCE != "schedule"
+      when: manual
 
 ##### Unit Tests #####
 unit:

examples/llm_ptq/hf_ptq.py

Lines changed: 17 additions & 7 deletions
@@ -25,7 +25,9 @@
 from accelerate.hooks import remove_hook_from_module
 from example_utils import apply_kv_cache_quant, get_model, get_processor, get_tokenizer, is_enc_dec
 from transformers import (
+    AutoConfig,
     AutoModelForCausalLM,
+    AutoProcessor,
     PreTrainedTokenizer,
     PreTrainedTokenizerFast,
     WhisperProcessor,
@@ -39,6 +41,7 @@
     export_tensorrt_llm_checkpoint,
     get_model_type,
 )
+from modelopt.torch.export.model_utils import is_multimodal_model
 from modelopt.torch.quantization.config import need_calibration
 from modelopt.torch.quantization.plugins.accelerate import init_quantized_weights
 from modelopt.torch.quantization.utils import is_quantized
@@ -567,19 +570,26 @@ def output_decode(generated_ids, input_shape):
 
     export_path = args.export_path
 
-    if hasattr(full_model, "language_model"):
-        # Save original model config and the preprocessor config to the export path for VLMs.
-        from transformers import AutoConfig, AutoProcessor
+    # Check if the model is a multimodal/VLM model
+    is_vlm = is_multimodal_model(full_model)
 
-        print(f"Saving original model and processor configs to {export_path}")
+    if is_vlm:
+        # Save original model config and the processor config to the export path for VLMs.
+        print(f"Saving original model config to {export_path}")
 
         AutoConfig.from_pretrained(
             args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code
         ).save_pretrained(export_path)
 
-        AutoProcessor.from_pretrained(
-            args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code
-        ).save_pretrained(export_path)
+        # Try to save processor config if available
+        try:
+            print(f"Saving processor config to {export_path}")
+            AutoProcessor.from_pretrained(
+                args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code
+            ).save_pretrained(export_path)
+        except Exception as e:
+            print(f"Warning: Could not save processor config: {e}")
+            print("This is normal for some VLM architectures that don't use AutoProcessor")
 
     if model_type == "mllama":
         full_model_config = model.config
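
For reference, the config/processor export fallback introduced above boils down to the pattern below. This is a minimal standalone sketch, not part of the commit; save_vlm_configs and the example checkpoint/export paths are illustrative placeholders.

from transformers import AutoConfig, AutoProcessor


def save_vlm_configs(ckpt_path: str, export_path: str, trust_remote_code: bool = False) -> None:
    # Always export the original model config.
    AutoConfig.from_pretrained(ckpt_path, trust_remote_code=trust_remote_code).save_pretrained(
        export_path
    )
    try:
        # Export the processor config only if an AutoProcessor mapping exists for this architecture.
        AutoProcessor.from_pretrained(ckpt_path, trust_remote_code=trust_remote_code).save_pretrained(
            export_path
        )
    except Exception as e:
        print(f"Warning: Could not save processor config: {e}")


save_vlm_configs("Qwen/Qwen2.5-VL-7B-Instruct", "./exported_vlm")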

modelopt/onnx/utils.py

Lines changed: 52 additions & 3 deletions
@@ -25,7 +25,6 @@
 import numpy as np
 import onnx
 import onnx_graphsurgeon as gs
-from onnx import TensorProto, ValueInfoProto, numpy_helper
 from onnx.helper import get_attribute_value
 from onnx_graphsurgeon import Constant, Node, Variable
 
@@ -289,7 +288,7 @@ def _convert_types_to_np(types: dict[str, int] | list[int] | int) -> Any:
 
 def get_tensor_by_name(
     onnx_model: onnx.ModelProto, tensor_name: str
-) -> ValueInfoProto | TensorProto | None:
+) -> onnx.ValueInfoProto | onnx.TensorProto | None:
     """This function returns a tensor from its name.
 
     This function searches for a tensor in the model's:
@@ -438,7 +437,7 @@ def randomize_weights_onnx_bytes(onnx_bytes: bytes, seed: int = 0) -> bytes:
             numpy_array = np.random.normal(float(avg), float(var), size=init.dims).astype(
                 dtype
            )
-            tensor = numpy_helper.from_array(numpy_array, init.name)
+            tensor = onnx.numpy_helper.from_array(numpy_array, init.name)
             model.graph.initializer[idx].CopyFrom(tensor)
 
     buffer = io.BytesIO()
@@ -751,3 +750,53 @@ def onnx_type_str_to_enum(dtype: str) -> int:
     dtype = dtype.split("tensor(")[-1].split(")")[0]
     dtype = "FLOAT" if dtype == "float32" else dtype.upper()
     return getattr(onnx.TensorProto, dtype)
+
+
+def remove_node_training_mode(onnx_model: onnx.ModelProto, node_op_type: str) -> onnx.ModelProto:
+    """Remove `training_mode` attribute and extra training outputs from nodes of a given op type.
+
+    This also removes the unused outputs from the training_mode nodes.
+
+    Args:
+        onnx_model: The onnx model.
+        node_op_type: The node type to remove training_mode attribute from.
+
+    Returns:
+        The onnx model with the training_mode attribute removed.
+    """
+    removed_output_names = set()
+    all_inputs = {inp for n in onnx_model.graph.node for inp in n.input}
+    graph_outputs = {o.name for o in onnx_model.graph.output}
+    keep = all_inputs | graph_outputs
+
+    for node in onnx_model.graph.node:
+        if node.op_type != node_op_type:
+            continue
+
+        is_training_mode = False
+        # Drop the 'training_mode' attribute if present
+        for idx, attr in enumerate(list(node.attribute)):
+            if attr.name == "training_mode":
+                del node.attribute[idx]
+                if attr.i == 1:
+                    is_training_mode = True
+                break
+
+        # If the node has extra outputs, remove them all including the training outputs
+        if is_training_mode:
+            to_remove = []
+            for name in node.output:
+                if name not in keep:
+                    removed_output_names.add(name)
+                    to_remove.append(name)
+
+            for name in to_remove:
+                node.output.remove(name)
+
+    if removed_output_names:
+        # Clean up corresponding value_info entries
+        keep = [vi for vi in onnx_model.graph.value_info if vi.name not in removed_output_names]
+        del onnx_model.graph.value_info[:]
+        onnx_model.graph.value_info.extend(keep)
+
+    return onnx_model
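
A short usage sketch of the relocated helper. It assumes an ONNX model whose nodes still carry training_mode attributes (e.g. BatchNormalization exported in training mode); the file names are placeholders.

import onnx

from modelopt.onnx.utils import remove_node_training_mode

# Strip training_mode attributes and the now-unused training outputs, then save.
model = onnx.load("model_training.onnx")
model = remove_node_training_mode(model, "BatchNormalization")
onnx.save(model, "model_inference.onnx")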

modelopt/torch/_deploy/utils/torch_onnx.py

Lines changed: 1 addition & 22 deletions
@@ -45,6 +45,7 @@
     get_node_names,
     get_output_names,
     get_output_shapes,
+    remove_node_training_mode,
 )
 from modelopt.torch.quantization.export_onnx import configure_linear_module_onnx_quantizers
 from modelopt.torch.utils import flatten_tree, standardize_named_model_args
@@ -569,25 +570,3 @@ def get_onnx_bytes(*args, **kwargs) -> bytes:
     onnx_bytes = get_onnx_bytes_and_metadata(*args, **kwargs)[0]
     onnx_bytes_obj = OnnxBytes.from_bytes(onnx_bytes)
     return onnx_bytes_obj.get_onnx_model_file_bytes()
-
-
-def remove_node_training_mode(onnx_model: ModelProto, node_op_type: str) -> ModelProto:
-    """Remove training_mode attribute from selected node type.
-
-    Args:
-        onnx_model: The onnx model.
-        node_op_type: The node type to remove training_mode attribute from.
-
-    Returns:
-        The onnx model with the training_mode attribute removed.
-    """
-    for node in onnx_model.graph.node:
-        if node.op_type == node_op_type:
-            for attribute in node.attribute:
-                if attribute.name == "training_mode":
-                    if attribute.i == 1:
-                        node.output.remove(node.output[1])
-                        node.output.remove(node.output[1])
-                    attribute.i = 0
-
-    return onnx_model

modelopt/torch/export/model_utils.py

Lines changed: 41 additions & 1 deletion
@@ -60,7 +60,7 @@
 {MODEL_NAME_TO_TYPE=}
 """
 
-__all__ = ["get_model_type"]
+__all__ = ["get_model_type", "is_multimodal_model"]
 
 
 def get_model_type(model):
@@ -69,3 +69,43 @@ def get_model_type(model):
         if k.lower() in type(model).__name__.lower():
             return v
     return None
+
+
+def is_multimodal_model(model):
+    """Check if a model is a Vision-Language Model (VLM) or multimodal model.
+
+    This function detects various multimodal model architectures by checking for:
+    - Standard vision configurations (vision_config)
+    - Language model attributes (language_model)
+    - Specific multimodal model types (phi4mm)
+    - Vision LoRA configurations
+    - Audio processing capabilities
+    - Image embedding layers
+
+    Args:
+        model: The HuggingFace model instance to check
+
+    Returns:
+        bool: True if the model is detected as multimodal, False otherwise
+
+    Examples:
+        >>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
+        >>> is_multimodal_model(model)
+        True
+
+        >>> model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-4-multimodal-instruct")
+        >>> is_multimodal_model(model)
+        True
+    """
+    config = model.config
+
+    return (
+        hasattr(config, "vision_config")  # Standard vision config (e.g., Qwen2.5-VL)
+        or hasattr(model, "language_model")  # Language model attribute (e.g., LLaVA)
+        or getattr(config, "model_type", "") == "phi4mm"  # Phi-4 multimodal
+        or hasattr(config, "vision_lora")  # Vision LoRA configurations
+        or hasattr(config, "audio_processor")  # Audio processing capabilities
+        or (
+            hasattr(config, "embd_layer") and hasattr(config.embd_layer, "image_embd_layer")
+        )  # Image embedding layers
+    )
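
Since the check is plain duck typing on model and config attributes, it can be exercised without downloading a checkpoint. A quick illustrative check with stand-in objects; real callers pass a loaded HuggingFace model.

from types import SimpleNamespace

from modelopt.torch.export.model_utils import is_multimodal_model

# Stand-ins that only mimic the attributes the helper inspects.
fake_vlm = SimpleNamespace(config=SimpleNamespace(vision_config={}))
fake_llm = SimpleNamespace(config=SimpleNamespace(model_type="llama"))

print(is_multimodal_model(fake_vlm))  # True: config exposes vision_config
print(is_multimodal_model(fake_llm))  # False: no multimodal markers present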

modelopt/torch/prune/plugins/mcore_minitron.py

Lines changed: 28 additions & 19 deletions
@@ -58,29 +58,37 @@
     "num_layers",
 }
 
-SUPPORTED_MODELS = set()
 
-try:
-    from megatron.core.models.gpt import GPTModel
+def get_supported_models():
+    """Get the supported models for Minitron pruning.
 
-    SUPPORTED_MODELS.add(GPTModel)
-except Exception:
-    pass
+    NOTE: Keep inside function to avoid circular import issues.
+    """
+    supported_models = set()
+
+    try:
+        from megatron.core.models.gpt import GPTModel
+
+        supported_models.add(GPTModel)
+    except Exception:
+        pass
+
+    try:
+        from megatron.core.models.mamba import MambaModel
 
-try:
-    from megatron.core.models.mamba import MambaModel
+        supported_models.add(MambaModel)
+    except Exception:
+        pass
 
-    SUPPORTED_MODELS.add(MambaModel)
-except Exception:
-    pass
+    try:
+        from nemo.collections import llm
 
-try:
-    from nemo.collections import llm
+        # NOTE: llm.MambaModel is a subclass of llm.GPTModel
+        supported_models.add(llm.GPTModel)
+    except Exception:
+        pass
 
-    # NOTE: llm.MambaModel is a subclass of llm.GPTModel
-    SUPPORTED_MODELS.add(llm.GPTModel)
-except Exception:
-    pass
+    return supported_models
 
 
 class MCoreMinitronSearcher(BaseSearcher):
@@ -151,13 +159,14 @@ def run_search(self) -> None:
         """Run actual search."""
         # Run forward loop to collect activations and sort parameters
         model_cfg = None
-        for m_type in SUPPORTED_MODELS:
+        supported_models = get_supported_models()
+        for m_type in supported_models:
            if isinstance(self.model, m_type):
                 model_cfg = self.model.config
                 break
         if model_cfg is None:
             raise NotImplementedError(
-                f"Only {SUPPORTED_MODELS} models are supported! Got: {type(self.model)}"
+                f"Only {supported_models} models are supported! Got: {type(self.model)}"
             )
 
         assert self.forward_loop is not None

modelopt/torch/speculative/plugins/megatron_eagle.py

Lines changed: 1 addition & 1 deletion
@@ -90,7 +90,7 @@ def dict_to_config(
         fp16=fp16,
         bf16=bf16,
         params_dtype=getattr(torch, architecture_config["torch_dtype"]),
-        pipeline_dtype=None,
+        pipeline_dtype=getattr(torch, architecture_config["torch_dtype"]),
         num_layers=architecture_config.get("num_hidden_layers"),
         hidden_size=architecture_config.get("hidden_size"),
         ffn_hidden_size=architecture_config.get("intermediate_size"),

tests/gpu/torch/quantization/backends/test_gemm_common.py

Lines changed: 10 additions & 4 deletions
@@ -29,6 +29,12 @@
 set_seed()
 
 
+@pytest.fixture(autouse=True)
+def setup_seed():
+    """Set seed before each test function."""
+    set_seed()
+
+
 @pytest.mark.parametrize(
     ("config", "gemm_forward", "atol", "rtol"),
     [
@@ -257,9 +263,9 @@ def forward_loop(model, run_backward=False):
 
     # The way the compression of the weights and inputs might be different.
     # E.g. we may use torch.compile in the gemms.
-    assert torch.allclose(output_dynamic_quant_gemm, output_dynamic_quant, atol=atol / 3)
-    assert torch.allclose(output_calib_quant_gemm, output_calib_quant, atol=atol / 3)
+    assert torch.allclose(output_dynamic_quant_gemm, output_dynamic_quant, atol=atol / 2)
+    assert torch.allclose(output_calib_quant_gemm, output_calib_quant, atol=atol / 2)
     assert torch.allclose(
-        output_dynamic_quant_gemm, output_dynamic_quant_compressed, atol=atol / 3
+        output_dynamic_quant_gemm, output_dynamic_quant_compressed, atol=atol / 2
     )
-    assert torch.allclose(output_calib_quant_gemm, output_calib_quant_compressed, atol=atol / 3)
+    assert torch.allclose(output_calib_quant_gemm, output_calib_quant_compressed, atol=atol / 2)
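
The autouse fixture added above re-seeds the RNGs before every test, whereas the pre-existing module-level set_seed() call only runs once at import time. The same pattern in isolation, as a generic pytest sketch with a stand-in set_seed rather than project code:

import random

import pytest


def set_seed(seed: int = 0) -> None:
    # Stand-in for the project's seeding helper.
    random.seed(seed)


@pytest.fixture(autouse=True)
def setup_seed():
    """Re-seed before each test so results do not depend on test execution order."""
    set_seed()


def test_first_draw_is_deterministic():
    # The fixture ran just before this test, so the first draw matches a freshly seeded generator.
    assert random.randint(0, 10**6) == random.Random(0).randint(0, 10**6)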
