
Commit 40afd8f

Merge branch 'main' into QAT-Walkthrough-Notebook
2 parents e0caa1c + d5c88e7 commit 40afd8f

File tree: 15 files changed (+439, −189 lines)


.github/workflows/gpu_tests.yml

Lines changed: 21 additions & 1 deletion
@@ -22,20 +22,31 @@ jobs:
       any_changed: ${{ steps.changed-tests.outputs.any_changed }}
     steps:
       - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
       - id: get-pr-info
         uses: nv-gha-runners/get-pr-info@main
+      # Get commit from main branch that is present in the PR to use as base for changed files
+      - id: calculate-merge-base
+        env:
+          PR_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.sha }}
+          BASE_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}
+        run: |
+          (echo -n "merge-base="; git merge-base "$BASE_SHA" "$PR_SHA") | tee --append "${GITHUB_OUTPUT}"
       - name: Check for changes in test-relevant directories
         id: changed-tests
         uses: step-security/[email protected]
         with:
+          base_sha: ${{ steps.calculate-merge-base.outputs.merge-base }}
+          sha: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.sha }}
           files: |
             .github/workflows/gpu_tests.yml
             modelopt/**
             tests/gpu/**
             tox.ini
             pyproject.toml
             setup.py
-          base_sha: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.ref }}
+          fail_on_initial_diff_error: true
   wait-checks:
     needs: [check-file-changes]
     if: needs.check-file-changes.outputs.any_changed == 'true'
@@ -70,3 +81,12 @@ jobs:
     timeout-minutes: 90
     container: *gpu_container
     steps: *gpu_steps
+  gpu-pr-required-check:
+    # Run even if gpu-tests-pr is skipped
+    if: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && always() }}
+    needs: [check-file-changes, gpu-tests-pr]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Required GPU tests did not succeed
+        if: ${{ needs.check-file-changes.result != 'success' || (needs.check-file-changes.outputs.any_changed == 'true' && needs.gpu-tests-pr.result != 'success') }}
+        run: exit 1
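
For reference, a minimal Python sketch of what the new calculate-merge-base step computes (the workflow step runs git merge-base in shell; this equivalent assumes a full clone, which is why the checkout step now sets fetch-depth: 0, and the refs in the commented call are placeholders):

import subprocess

def merge_base(base_sha: str, pr_sha: str) -> str:
    # Same result as the workflow's: git merge-base "$BASE_SHA" "$PR_SHA"
    result = subprocess.run(
        ["git", "merge-base", base_sha, pr_sha],
        check=True,
        capture_output=True,
        text=True,
    )
    return result.stdout.strip()

# The returned commit becomes the base_sha that changed-files diffs against,
# instead of the moving tip of the base branch.
# print(merge_base("origin/main", "HEAD"))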

.github/workflows/unit_tests.yml

Lines changed: 6 additions & 0 deletions
@@ -126,3 +126,9 @@ jobs:
           python-version: "3.12"
       - name: Run unit tests
         run: pip install tox && tox -e py312-partial-unit-${{ matrix.test-env }}
+  unit-pr-required-check:
+    if: github.event_name == 'pull_request'
+    needs: [linux, windows, multi-py, multi-torch, multi-transformers, partial-install]
+    runs-on: ubuntu-latest
+    steps:
+      - run: echo "All PR unit test jobs completed"

.gitlab/tests.yml

Lines changed: 4 additions & 4 deletions
@@ -2,10 +2,10 @@
 .tests-default:
   stage: tests
   rules:
-    - if: $JET_ONLY != null
-      when: never
-    - if: $CI_COMMIT_TAG =~ /^\d+\.\d+\.\d+$/
-    - if: $CI_PIPELINE_SOURCE == "web" || $CI_PIPELINE_SOURCE == "schedule"
+    - if: $CI_PIPELINE_SOURCE == "schedule"
+      when: always
+    - if: $CI_PIPELINE_SOURCE != "schedule"
+      when: manual
 
 ##### Unit Tests #####
 unit:

examples/llm_ptq/hf_ptq.py

Lines changed: 17 additions & 7 deletions
@@ -25,7 +25,9 @@
 from accelerate.hooks import remove_hook_from_module
 from example_utils import apply_kv_cache_quant, get_model, get_processor, get_tokenizer, is_enc_dec
 from transformers import (
+    AutoConfig,
     AutoModelForCausalLM,
+    AutoProcessor,
     PreTrainedTokenizer,
     PreTrainedTokenizerFast,
     WhisperProcessor,
@@ -39,6 +41,7 @@
     export_tensorrt_llm_checkpoint,
     get_model_type,
 )
+from modelopt.torch.export.model_utils import is_multimodal_model
 from modelopt.torch.quantization.config import need_calibration
 from modelopt.torch.quantization.plugins.accelerate import init_quantized_weights
 from modelopt.torch.quantization.utils import is_quantized
@@ -567,19 +570,26 @@ def output_decode(generated_ids, input_shape):
 
     export_path = args.export_path
 
-    if hasattr(full_model, "language_model"):
-        # Save original model config and the preprocessor config to the export path for VLMs.
-        from transformers import AutoConfig, AutoProcessor
+    # Check if the model is a multimodal/VLM model
+    is_vlm = is_multimodal_model(full_model)
 
-        print(f"Saving original model and processor configs to {export_path}")
+    if is_vlm:
+        # Save original model config and the processor config to the export path for VLMs.
+        print(f"Saving original model config to {export_path}")
 
         AutoConfig.from_pretrained(
             args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code
         ).save_pretrained(export_path)
 
-        AutoProcessor.from_pretrained(
-            args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code
-        ).save_pretrained(export_path)
+        # Try to save processor config if available
+        try:
+            print(f"Saving processor config to {export_path}")
+            AutoProcessor.from_pretrained(
+                args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code
+            ).save_pretrained(export_path)
+        except Exception as e:
+            print(f"Warning: Could not save processor config: {e}")
+            print("This is normal for some VLM architectures that don't use AutoProcessor")
 
     if model_type == "mllama":
         full_model_config = model.config
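
The gist of the new export path, condensed into a hedged standalone sketch (save_vlm_configs is a hypothetical helper, not part of hf_ptq.py): the model config is always saved, while the processor save is best-effort because some VLM architectures ship no AutoProcessor mapping.

from transformers import AutoConfig, AutoProcessor

def save_vlm_configs(ckpt_path: str, export_path: str, trust_remote_code: bool = False) -> None:
    # Always save the original model config next to the quantized export.
    AutoConfig.from_pretrained(ckpt_path, trust_remote_code=trust_remote_code).save_pretrained(export_path)
    try:
        # Best-effort: architectures without an AutoProcessor entry raise here.
        AutoProcessor.from_pretrained(ckpt_path, trust_remote_code=trust_remote_code).save_pretrained(export_path)
    except Exception as err:
        print(f"Warning: could not save processor config: {err}")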

modelopt/onnx/quantization/qdq_utils.py

Lines changed: 9 additions & 4 deletions
@@ -790,8 +790,10 @@ def remove_input_dq_and_output_q(
                     if cons_idx in quantizable_custom_ops[consumer.op_type]["inp"]:
                         consumer.input[cons_idx] = q_node.output[0]
                     else:
-                        q_node_prev = tensor_producers[q_node.input[0]]
-                        consumer.input[cons_idx] = q_node_prev.output[0]
+                        q_node_prev = tensor_producers.get(q_node.input[0], None)
+                        consumer.input[cons_idx] = (
+                            q_node_prev.output[0] if q_node_prev else q_node.input[0]
+                        )
                     break
 
             # Track DequantizeLinear node indices for cleanup
@@ -828,8 +830,11 @@ def remove_input_dq_and_output_q(
                     if quantizable_custom_ops[producer.op_type]["out"]:
                         dq_node[0].input[0] = producer.output[0]
                     else:
-                        dq_node_next = tensor_consumers[dq_node[0].output[0]]
-                        dq_node_next[0].input[0] = producer.output[0]
+                        dq_node_next = tensor_consumers.get(dq_node[0].output[0], None)
+                        if dq_node_next:
+                            dq_node_next[0].input[0] = producer.output[0]
+                        else:
+                            dq_node[0].input[0] = producer.output[0]
 
             # Track QuantizeLinear node indices for cleanup
             q_indices.append(node_idx)
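
A self-contained sketch of the fallback introduced above: when a Q/DQ node's neighbor tensor has no entry in the producer/consumer maps (for example, the tensor is a graph input or output), the rewiring now falls back to the original tensor name instead of raising a KeyError. All names below are illustrative, not taken from the real graph utilities.

from types import SimpleNamespace

# tensor name -> node that produces it (graph inputs have no producer entry)
prev_node = SimpleNamespace(output=["prev_node_out"])
tensor_producers = {"prev_node_out": prev_node}

def resolve_rewire_target(tensor_name: str) -> str:
    producer = tensor_producers.get(tensor_name, None)
    # Old behavior: tensor_producers[tensor_name] raised KeyError for graph inputs.
    return producer.output[0] if producer else tensor_name

print(resolve_rewire_target("prev_node_out"))  # "prev_node_out" (taken from the producer node)
print(resolve_rewire_target("graph_input"))    # "graph_input"  (no producer -> keep the tensor name)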

modelopt/onnx/trt_utils.py

Lines changed: 5 additions & 4 deletions
@@ -416,9 +416,10 @@ def interpret_trt_plugins_precision_flag(
             # Will add Q/DQ nodes in the requested I/O indices
             inp_precision_quant = [i for i, p in enumerate(inp_precision) if p in ["int8", "fp8"]]
             out_precision_quant = [i for i, p in enumerate(out_precision) if p in ["int8", "fp8"]]
-            custom_ops_to_quantize[op_type] = {
-                "inp": inp_precision_quant,
-                "out": out_precision_quant,
-            }
+            if inp_precision_quant or out_precision_quant:
+                custom_ops_to_quantize[op_type] = {
+                    "inp": inp_precision_quant,
+                    "out": out_precision_quant,
+                }
 
     return custom_ops_to_cast, custom_ops_to_quantize
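
A small illustration of the guard added above (the precision lists and plugin name are made up): an op type only enters custom_ops_to_quantize when at least one input or output actually requests int8/fp8, so plugins that are merely cast no longer get an empty entry for downstream Q/DQ rewriting to process.

inp_precision = ["fp16", "fp16"]  # no quantized inputs requested
out_precision = ["fp16"]          # no quantized outputs requested

inp_precision_quant = [i for i, p in enumerate(inp_precision) if p in ("int8", "fp8")]
out_precision_quant = [i for i, p in enumerate(out_precision) if p in ("int8", "fp8")]

custom_ops_to_quantize = {}
if inp_precision_quant or out_precision_quant:
    custom_ops_to_quantize["MyPlugin"] = {"inp": inp_precision_quant, "out": out_precision_quant}

print(custom_ops_to_quantize)  # {} -> "MyPlugin" is cast only, never wrapped in Q/DQ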

modelopt/onnx/utils.py

Lines changed: 52 additions & 3 deletions
@@ -25,7 +25,6 @@
 import numpy as np
 import onnx
 import onnx_graphsurgeon as gs
-from onnx import TensorProto, ValueInfoProto, numpy_helper
 from onnx.helper import get_attribute_value
 from onnx_graphsurgeon import Constant, Node, Variable
 
@@ -289,7 +288,7 @@ def _convert_types_to_np(types: dict[str, int] | list[int] | int) -> Any:
 
 def get_tensor_by_name(
     onnx_model: onnx.ModelProto, tensor_name: str
-) -> ValueInfoProto | TensorProto | None:
+) -> onnx.ValueInfoProto | onnx.TensorProto | None:
     """This function returns a tensor from its name.
 
     This function searches for a tensor in the model's:
@@ -438,7 +437,7 @@ def randomize_weights_onnx_bytes(onnx_bytes: bytes, seed: int = 0) -> bytes:
             numpy_array = np.random.normal(float(avg), float(var), size=init.dims).astype(
                 dtype
             )
-            tensor = numpy_helper.from_array(numpy_array, init.name)
+            tensor = onnx.numpy_helper.from_array(numpy_array, init.name)
             model.graph.initializer[idx].CopyFrom(tensor)
 
     buffer = io.BytesIO()
@@ -751,3 +750,53 @@ def onnx_type_str_to_enum(dtype: str) -> int:
     dtype = dtype.split("tensor(")[-1].split(")")[0]
     dtype = "FLOAT" if dtype == "float32" else dtype.upper()
     return getattr(onnx.TensorProto, dtype)
+
+
+def remove_node_training_mode(onnx_model: onnx.ModelProto, node_op_type: str) -> onnx.ModelProto:
+    """Remove `training_mode` attribute and extra training outputs from nodes of a given op type.
+
+    This also removes the unused outputs from the training_mode nodes.
+
+    Args:
+        onnx_model: The onnx model.
+        node_op_type: The node type to remove training_mode attribute from.
+
+    Returns:
+        The onnx model with the training_mode attribute removed.
+    """
+    removed_output_names = set()
+    all_inputs = {inp for n in onnx_model.graph.node for inp in n.input}
+    graph_outputs = {o.name for o in onnx_model.graph.output}
+    keep = all_inputs | graph_outputs
+
+    for node in onnx_model.graph.node:
+        if node.op_type != node_op_type:
+            continue
+
+        is_training_mode = False
+        # Drop the 'training_mode' attribute if present
+        for idx, attr in enumerate(list(node.attribute)):
+            if attr.name == "training_mode":
+                del node.attribute[idx]
+                if attr.i == 1:
+                    is_training_mode = True
+                break
+
+        # If the node has extra outputs, remove them all including the training outputs
+        if is_training_mode:
+            to_remove = []
+            for name in node.output:
+                if name not in keep:
+                    removed_output_names.add(name)
+                    to_remove.append(name)
+
+            for name in to_remove:
+                node.output.remove(name)
+
+    if removed_output_names:
+        # Clean up corresponding value_info entries
+        keep = [vi for vi in onnx_model.graph.value_info if vi.name not in removed_output_names]
+        del onnx_model.graph.value_info[:]
+        onnx_model.graph.value_info.extend(keep)
+
+    return onnx_model
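
A hedged usage sketch of the relocated helper (the model built here is synthetic; only the call to remove_node_training_mode comes from this commit): a BatchNormalization node exported in training mode carries a training_mode attribute plus extra running-stat outputs, and the helper strips both when nothing else consumes them.

import onnx
from onnx import TensorProto, helper

from modelopt.onnx.utils import remove_node_training_mode

# BatchNormalization exported in training mode: extra outputs and training_mode=1.
bn = helper.make_node(
    "BatchNormalization",
    inputs=["x", "scale", "bias", "mean", "var"],
    outputs=["y", "running_mean", "running_var"],
    training_mode=1,
)
graph = helper.make_graph(
    [bn],
    "bn_graph",
    inputs=[helper.make_tensor_value_info("x", TensorProto.FLOAT, [1, 3, 8, 8])],
    outputs=[helper.make_tensor_value_info("y", TensorProto.FLOAT, [1, 3, 8, 8])],
    initializer=[
        helper.make_tensor(name, TensorProto.FLOAT, [3], [1.0, 1.0, 1.0])
        for name in ["scale", "bias", "mean", "var"]
    ],
)
model = helper.make_model(graph)

model = remove_node_training_mode(model, "BatchNormalization")
bn_node = model.graph.node[0]
assert all(attr.name != "training_mode" for attr in bn_node.attribute)
assert list(bn_node.output) == ["y"]  # unused training outputs were dropped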

modelopt/torch/_deploy/utils/torch_onnx.py

Lines changed: 1 addition & 22 deletions
@@ -45,6 +45,7 @@
     get_node_names,
     get_output_names,
     get_output_shapes,
+    remove_node_training_mode,
 )
 from modelopt.torch.quantization.export_onnx import configure_linear_module_onnx_quantizers
 from modelopt.torch.utils import flatten_tree, standardize_named_model_args
@@ -569,25 +570,3 @@ def get_onnx_bytes(*args, **kwargs) -> bytes:
     onnx_bytes = get_onnx_bytes_and_metadata(*args, **kwargs)[0]
     onnx_bytes_obj = OnnxBytes.from_bytes(onnx_bytes)
     return onnx_bytes_obj.get_onnx_model_file_bytes()
-
-
-def remove_node_training_mode(onnx_model: ModelProto, node_op_type: str) -> ModelProto:
-    """Remove training_mode attribute from selected node type.
-
-    Args:
-        onnx_model: The onnx model.
-        node_op_type: The node type to remove training_mode attribute from.
-
-    Returns:
-        The onnx model with the training_mode attribute removed.
-    """
-    for node in onnx_model.graph.node:
-        if node.op_type == node_op_type:
-            for attribute in node.attribute:
-                if attribute.name == "training_mode":
-                    if attribute.i == 1:
-                        node.output.remove(node.output[1])
-                        node.output.remove(node.output[1])
-                    attribute.i = 0
-
-    return onnx_model

modelopt/torch/export/model_utils.py

Lines changed: 41 additions & 1 deletion
@@ -60,7 +60,7 @@
 {MODEL_NAME_TO_TYPE=}
 """
 
-__all__ = ["get_model_type"]
+__all__ = ["get_model_type", "is_multimodal_model"]
 
 
 def get_model_type(model):
@@ -69,3 +69,43 @@ def get_model_type(model):
         if k.lower() in type(model).__name__.lower():
             return v
     return None
+
+
+def is_multimodal_model(model):
+    """Check if a model is a Vision-Language Model (VLM) or multimodal model.
+
+    This function detects various multimodal model architectures by checking for:
+    - Standard vision configurations (vision_config)
+    - Language model attributes (language_model)
+    - Specific multimodal model types (phi4mm)
+    - Vision LoRA configurations
+    - Audio processing capabilities
+    - Image embedding layers
+
+    Args:
+        model: The HuggingFace model instance to check
+
+    Returns:
+        bool: True if the model is detected as multimodal, False otherwise
+
+    Examples:
+        >>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
+        >>> is_multimodal_model(model)
+        True
+
+        >>> model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-4-multimodal-instruct")
+        >>> is_multimodal_model(model)
+        True
+    """
+    config = model.config
+
+    return (
+        hasattr(config, "vision_config")  # Standard vision config (e.g., Qwen2.5-VL)
+        or hasattr(model, "language_model")  # Language model attribute (e.g., LLaVA)
+        or getattr(config, "model_type", "") == "phi4mm"  # Phi-4 multimodal
+        or hasattr(config, "vision_lora")  # Vision LoRA configurations
+        or hasattr(config, "audio_processor")  # Audio processing capabilities
+        or (
+            hasattr(config, "embd_layer") and hasattr(config.embd_layer, "image_embd_layer")
+        )  # Image embedding layers
+    )
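
A lightweight way to exercise the new detector without downloading the checkpoints from the docstring examples; the stand-in classes below are hypothetical and only mimic the attributes is_multimodal_model inspects:

from types import SimpleNamespace

from modelopt.torch.export.model_utils import is_multimodal_model

class FakeVLM:
    # Mimics a Qwen2.5-VL-style model: the config exposes a vision_config.
    config = SimpleNamespace(vision_config=SimpleNamespace(), model_type="qwen2_5_vl")

class FakeLLM:
    # Plain text-only model: none of the multimodal markers are present.
    config = SimpleNamespace(model_type="llama")

print(is_multimodal_model(FakeVLM()))  # True  (vision_config present)
print(is_multimodal_model(FakeLLM()))  # False (no multimodal markers)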

modelopt/torch/prune/plugins/mcore_minitron.py

Lines changed: 1 addition & 34 deletions
@@ -58,30 +58,6 @@
     "num_layers",
 }
 
-SUPPORTED_MODELS = set()
-
-try:
-    from megatron.core.models.gpt import GPTModel
-
-    SUPPORTED_MODELS.add(GPTModel)
-except Exception:
-    pass
-
-try:
-    from megatron.core.models.mamba import MambaModel
-
-    SUPPORTED_MODELS.add(MambaModel)
-except Exception:
-    pass
-
-try:
-    from nemo.collections import llm
-
-    # NOTE: llm.MambaModel is a subclass of llm.GPTModel
-    SUPPORTED_MODELS.add(llm.GPTModel)
-except Exception:
-    pass
-
 
 class MCoreMinitronSearcher(BaseSearcher):
     """Searcher for Minitron pruning algorithm."""
@@ -150,16 +126,6 @@ def before_search(self) -> None:
     def run_search(self) -> None:
         """Run actual search."""
         # Run forward loop to collect activations and sort parameters
-        model_cfg = None
-        for m_type in SUPPORTED_MODELS:
-            if isinstance(self.model, m_type):
-                model_cfg = self.model.config
-                break
-        if model_cfg is None:
-            raise NotImplementedError(
-                f"Only {SUPPORTED_MODELS} models are supported! Got: {type(self.model)}"
-            )
-
         assert self.forward_loop is not None
         is_training = self.model.training
         self.model.eval()
@@ -178,6 +144,7 @@ def run_search(self) -> None:
             hp.active = export_config[hp_name]
 
         # kv_channels can be None so we need to save original from original hidden_size and num_attention_heads
+        model_cfg = self.model.config
         orig_kv_channels = getattr(model_cfg, "kv_channels")
         if orig_kv_channels is None:
             orig_kv_channels = getattr(model_cfg, "hidden_size") // getattr(
