Commit 1ef040f

Merge branch 'main' into jennifchen/qat_slurm
2 parents 0eac8f6 + 8a07376 commit 1ef040f

File tree: 10 files changed (+379, -153 lines)


.gitlab/tests.yml

Lines changed: 4 additions & 4 deletions
@@ -2,10 +2,10 @@
 .tests-default:
   stage: tests
   rules:
-    - if: $JET_ONLY != null
-      when: never
-    - if: $CI_COMMIT_TAG =~ /^\d+\.\d+\.\d+$/
-    - if: $CI_PIPELINE_SOURCE == "web" || $CI_PIPELINE_SOURCE == "schedule"
+    - if: $CI_PIPELINE_SOURCE == "schedule"
+      when: always
+    - if: $CI_PIPELINE_SOURCE != "schedule"
+      when: manual
 
 ##### Unit Tests #####
 unit:

examples/llm_ptq/hf_ptq.py

Lines changed: 17 additions & 7 deletions
@@ -25,7 +25,9 @@
 from accelerate.hooks import remove_hook_from_module
 from example_utils import apply_kv_cache_quant, get_model, get_processor, get_tokenizer, is_enc_dec
 from transformers import (
+    AutoConfig,
     AutoModelForCausalLM,
+    AutoProcessor,
     PreTrainedTokenizer,
     PreTrainedTokenizerFast,
     WhisperProcessor,
@@ -39,6 +41,7 @@
     export_tensorrt_llm_checkpoint,
     get_model_type,
 )
+from modelopt.torch.export.model_utils import is_multimodal_model
 from modelopt.torch.quantization.config import need_calibration
 from modelopt.torch.quantization.plugins.accelerate import init_quantized_weights
 from modelopt.torch.quantization.utils import is_quantized
@@ -567,19 +570,26 @@ def output_decode(generated_ids, input_shape):
 
     export_path = args.export_path
 
-    if hasattr(full_model, "language_model"):
-        # Save original model config and the preprocessor config to the export path for VLMs.
-        from transformers import AutoConfig, AutoProcessor
+    # Check if the model is a multimodal/VLM model
+    is_vlm = is_multimodal_model(full_model)
 
-        print(f"Saving original model and processor configs to {export_path}")
+    if is_vlm:
+        # Save original model config and the processor config to the export path for VLMs.
+        print(f"Saving original model config to {export_path}")
 
         AutoConfig.from_pretrained(
             args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code
         ).save_pretrained(export_path)
 
-        AutoProcessor.from_pretrained(
-            args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code
-        ).save_pretrained(export_path)
+        # Try to save processor config if available
+        try:
+            print(f"Saving processor config to {export_path}")
+            AutoProcessor.from_pretrained(
+                args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code
+            ).save_pretrained(export_path)
+        except Exception as e:
+            print(f"Warning: Could not save processor config: {e}")
+            print("This is normal for some VLM architectures that don't use AutoProcessor")
 
     if model_type == "mllama":
         full_model_config = model.config
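
For reference, the config/processor export fallback introduced above boils down to the pattern below. This is a minimal standalone sketch, not part of the commit; save_vlm_configs and the example checkpoint/export paths are illustrative placeholders.

from transformers import AutoConfig, AutoProcessor


def save_vlm_configs(ckpt_path: str, export_path: str, trust_remote_code: bool = False) -> None:
    # Always export the original model config.
    AutoConfig.from_pretrained(ckpt_path, trust_remote_code=trust_remote_code).save_pretrained(
        export_path
    )
    try:
        # Export the processor config only if an AutoProcessor mapping exists for this architecture.
        AutoProcessor.from_pretrained(ckpt_path, trust_remote_code=trust_remote_code).save_pretrained(
            export_path
        )
    except Exception as e:
        print(f"Warning: Could not save processor config: {e}")


save_vlm_configs("Qwen/Qwen2.5-VL-7B-Instruct", "./exported_vlm")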

modelopt/onnx/utils.py

Lines changed: 52 additions & 3 deletions
@@ -25,7 +25,6 @@
 import numpy as np
 import onnx
 import onnx_graphsurgeon as gs
-from onnx import TensorProto, ValueInfoProto, numpy_helper
 from onnx.helper import get_attribute_value
 from onnx_graphsurgeon import Constant, Node, Variable
 
@@ -289,7 +288,7 @@ def _convert_types_to_np(types: dict[str, int] | list[int] | int) -> Any:
 
 def get_tensor_by_name(
     onnx_model: onnx.ModelProto, tensor_name: str
-) -> ValueInfoProto | TensorProto | None:
+) -> onnx.ValueInfoProto | onnx.TensorProto | None:
     """This function returns a tensor from its name.
 
     This function searches for a tensor in the model's:
@@ -438,7 +437,7 @@ def randomize_weights_onnx_bytes(onnx_bytes: bytes, seed: int = 0) -> bytes:
             numpy_array = np.random.normal(float(avg), float(var), size=init.dims).astype(
                 dtype
            )
-            tensor = numpy_helper.from_array(numpy_array, init.name)
+            tensor = onnx.numpy_helper.from_array(numpy_array, init.name)
             model.graph.initializer[idx].CopyFrom(tensor)
 
     buffer = io.BytesIO()
@@ -751,3 +750,53 @@ def onnx_type_str_to_enum(dtype: str) -> int:
     dtype = dtype.split("tensor(")[-1].split(")")[0]
     dtype = "FLOAT" if dtype == "float32" else dtype.upper()
     return getattr(onnx.TensorProto, dtype)
+
+
+def remove_node_training_mode(onnx_model: onnx.ModelProto, node_op_type: str) -> onnx.ModelProto:
+    """Remove `training_mode` attribute and extra training outputs from nodes of a given op type.
+
+    This also removes the unused outputs from the training_mode nodes.
+
+    Args:
+        onnx_model: The onnx model.
+        node_op_type: The node type to remove training_mode attribute from.
+
+    Returns:
+        The onnx model with the training_mode attribute removed.
+    """
+    removed_output_names = set()
+    all_inputs = {inp for n in onnx_model.graph.node for inp in n.input}
+    graph_outputs = {o.name for o in onnx_model.graph.output}
+    keep = all_inputs | graph_outputs
+
+    for node in onnx_model.graph.node:
+        if node.op_type != node_op_type:
+            continue
+
+        is_training_mode = False
+        # Drop the 'training_mode' attribute if present
+        for idx, attr in enumerate(list(node.attribute)):
+            if attr.name == "training_mode":
+                del node.attribute[idx]
+                if attr.i == 1:
+                    is_training_mode = True
+                break
+
+        # If the node has extra outputs, remove them all including the training outputs
+        if is_training_mode:
+            to_remove = []
+            for name in node.output:
+                if name not in keep:
+                    removed_output_names.add(name)
+                    to_remove.append(name)
+
+            for name in to_remove:
+                node.output.remove(name)
+
+    if removed_output_names:
+        # Clean up corresponding value_info entries
+        keep = [vi for vi in onnx_model.graph.value_info if vi.name not in removed_output_names]
+        del onnx_model.graph.value_info[:]
+        onnx_model.graph.value_info.extend(keep)
+
+    return onnx_model
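
A short usage sketch of the relocated helper. It assumes an ONNX model whose nodes still carry training_mode attributes (e.g. BatchNormalization exported in training mode); the file names are placeholders.

import onnx

from modelopt.onnx.utils import remove_node_training_mode

# Strip training_mode attributes and the now-unused training outputs, then save.
model = onnx.load("model_training.onnx")
model = remove_node_training_mode(model, "BatchNormalization")
onnx.save(model, "model_inference.onnx")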

modelopt/torch/_deploy/utils/torch_onnx.py

Lines changed: 1 addition & 22 deletions
@@ -45,6 +45,7 @@
     get_node_names,
     get_output_names,
     get_output_shapes,
+    remove_node_training_mode,
 )
 from modelopt.torch.quantization.export_onnx import configure_linear_module_onnx_quantizers
 from modelopt.torch.utils import flatten_tree, standardize_named_model_args
@@ -569,25 +570,3 @@ def get_onnx_bytes(*args, **kwargs) -> bytes:
     onnx_bytes = get_onnx_bytes_and_metadata(*args, **kwargs)[0]
     onnx_bytes_obj = OnnxBytes.from_bytes(onnx_bytes)
     return onnx_bytes_obj.get_onnx_model_file_bytes()
-
-
-def remove_node_training_mode(onnx_model: ModelProto, node_op_type: str) -> ModelProto:
-    """Remove training_mode attribute from selected node type.
-
-    Args:
-        onnx_model: The onnx model.
-        node_op_type: The node type to remove training_mode attribute from.
-
-    Returns:
-        The onnx model with the training_mode attribute removed.
-    """
-    for node in onnx_model.graph.node:
-        if node.op_type == node_op_type:
-            for attribute in node.attribute:
-                if attribute.name == "training_mode":
-                    if attribute.i == 1:
-                        node.output.remove(node.output[1])
-                        node.output.remove(node.output[1])
-                    attribute.i = 0
-
-    return onnx_model

modelopt/torch/export/model_utils.py

Lines changed: 41 additions & 1 deletion
@@ -60,7 +60,7 @@
 {MODEL_NAME_TO_TYPE=}
 """
 
-__all__ = ["get_model_type"]
+__all__ = ["get_model_type", "is_multimodal_model"]
 
 
 def get_model_type(model):
@@ -69,3 +69,43 @@ def get_model_type(model):
         if k.lower() in type(model).__name__.lower():
             return v
     return None
+
+
+def is_multimodal_model(model):
+    """Check if a model is a Vision-Language Model (VLM) or multimodal model.
+
+    This function detects various multimodal model architectures by checking for:
+    - Standard vision configurations (vision_config)
+    - Language model attributes (language_model)
+    - Specific multimodal model types (phi4mm)
+    - Vision LoRA configurations
+    - Audio processing capabilities
+    - Image embedding layers
+
+    Args:
+        model: The HuggingFace model instance to check
+
+    Returns:
+        bool: True if the model is detected as multimodal, False otherwise
+
+    Examples:
+        >>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
+        >>> is_multimodal_model(model)
+        True
+
+        >>> model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-4-multimodal-instruct")
+        >>> is_multimodal_model(model)
+        True
+    """
+    config = model.config
+
+    return (
+        hasattr(config, "vision_config")  # Standard vision config (e.g., Qwen2.5-VL)
+        or hasattr(model, "language_model")  # Language model attribute (e.g., LLaVA)
+        or getattr(config, "model_type", "") == "phi4mm"  # Phi-4 multimodal
+        or hasattr(config, "vision_lora")  # Vision LoRA configurations
+        or hasattr(config, "audio_processor")  # Audio processing capabilities
+        or (
+            hasattr(config, "embd_layer") and hasattr(config.embd_layer, "image_embd_layer")
+        )  # Image embedding layers
+    )
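
Since the check is plain duck typing on model and config attributes, it can be exercised without downloading a checkpoint. A quick illustrative check with stand-in objects; real callers pass a loaded HuggingFace model.

from types import SimpleNamespace

from modelopt.torch.export.model_utils import is_multimodal_model

# Stand-ins that only mimic the attributes the helper inspects.
fake_vlm = SimpleNamespace(config=SimpleNamespace(vision_config={}))
fake_llm = SimpleNamespace(config=SimpleNamespace(model_type="llama"))

print(is_multimodal_model(fake_vlm))  # True: config exposes vision_config
print(is_multimodal_model(fake_llm))  # False: no multimodal markers present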

modelopt/torch/prune/plugins/mcore_minitron.py

Lines changed: 28 additions & 19 deletions
@@ -58,29 +58,37 @@
     "num_layers",
 }
 
-SUPPORTED_MODELS = set()
 
-try:
-    from megatron.core.models.gpt import GPTModel
+def get_supported_models():
+    """Get the supported models for Minitron pruning.
 
-    SUPPORTED_MODELS.add(GPTModel)
-except Exception:
-    pass
+    NOTE: Keep inside function to avoid circular import issues.
+    """
+    supported_models = set()
+
+    try:
+        from megatron.core.models.gpt import GPTModel
+
+        supported_models.add(GPTModel)
+    except Exception:
+        pass
+
+    try:
+        from megatron.core.models.mamba import MambaModel
 
-try:
-    from megatron.core.models.mamba import MambaModel
+        supported_models.add(MambaModel)
+    except Exception:
+        pass
 
-    SUPPORTED_MODELS.add(MambaModel)
-except Exception:
-    pass
+    try:
+        from nemo.collections import llm
 
-try:
-    from nemo.collections import llm
+        # NOTE: llm.MambaModel is a subclass of llm.GPTModel
+        supported_models.add(llm.GPTModel)
+    except Exception:
+        pass
 
-    # NOTE: llm.MambaModel is a subclass of llm.GPTModel
-    SUPPORTED_MODELS.add(llm.GPTModel)
-except Exception:
-    pass
+    return supported_models
 
 
 class MCoreMinitronSearcher(BaseSearcher):
@@ -151,13 +159,14 @@ def run_search(self) -> None:
         """Run actual search."""
         # Run forward loop to collect activations and sort parameters
         model_cfg = None
-        for m_type in SUPPORTED_MODELS:
+        supported_models = get_supported_models()
+        for m_type in supported_models:
            if isinstance(self.model, m_type):
                 model_cfg = self.model.config
                 break
         if model_cfg is None:
             raise NotImplementedError(
-                f"Only {SUPPORTED_MODELS} models are supported! Got: {type(self.model)}"
+                f"Only {supported_models} models are supported! Got: {type(self.model)}"
             )
 
         assert self.forward_loop is not None

modelopt/torch/speculative/plugins/megatron_eagle.py

Lines changed: 1 addition & 1 deletion
@@ -90,7 +90,7 @@ def dict_to_config(
         fp16=fp16,
         bf16=bf16,
         params_dtype=getattr(torch, architecture_config["torch_dtype"]),
-        pipeline_dtype=None,
+        pipeline_dtype=getattr(torch, architecture_config["torch_dtype"]),
         num_layers=architecture_config.get("num_hidden_layers"),
         hidden_size=architecture_config.get("hidden_size"),
         ffn_hidden_size=architecture_config.get("intermediate_size"),

tests/gpu/torch/quantization/backends/test_gemm_common.py

Lines changed: 10 additions & 4 deletions
@@ -29,6 +29,12 @@
 set_seed()
 
 
+@pytest.fixture(autouse=True)
+def setup_seed():
+    """Set seed before each test function."""
+    set_seed()
+
+
 @pytest.mark.parametrize(
     ("config", "gemm_forward", "atol", "rtol"),
     [
@@ -257,9 +263,9 @@ def forward_loop(model, run_backward=False):
 
     # The way the compression of the weights and inputs might be different.
     # E.g. we may use torch.compile in the gemms.
-    assert torch.allclose(output_dynamic_quant_gemm, output_dynamic_quant, atol=atol / 3)
-    assert torch.allclose(output_calib_quant_gemm, output_calib_quant, atol=atol / 3)
+    assert torch.allclose(output_dynamic_quant_gemm, output_dynamic_quant, atol=atol / 2)
+    assert torch.allclose(output_calib_quant_gemm, output_calib_quant, atol=atol / 2)
     assert torch.allclose(
-        output_dynamic_quant_gemm, output_dynamic_quant_compressed, atol=atol / 3
+        output_dynamic_quant_gemm, output_dynamic_quant_compressed, atol=atol / 2
     )
-    assert torch.allclose(output_calib_quant_gemm, output_calib_quant_compressed, atol=atol / 3)
+    assert torch.allclose(output_calib_quant_gemm, output_calib_quant_compressed, atol=atol / 2)
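
The autouse fixture added above re-seeds the RNGs before every test, whereas the pre-existing module-level set_seed() call only runs once at import time. The same pattern in isolation, as a generic pytest sketch with a stand-in set_seed rather than project code:

import random

import pytest


def set_seed(seed: int = 0) -> None:
    # Stand-in for the project's seeding helper.
    random.seed(seed)


@pytest.fixture(autouse=True)
def setup_seed():
    """Re-seed before each test so results do not depend on test execution order."""
    set_seed()


def test_first_draw_is_deterministic():
    # The fixture ran just before this test, so the first draw matches a freshly seeded generator.
    assert random.randint(0, 10**6) == random.Random(0).randint(0, 10**6)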
