
Commit 71a9f7a

Merge branch 'main' into jennifchen/cp_amax_sync
Signed-off-by: Jenny Chen <[email protected]>
2 parents: 1f7d17e + 615f3c0

File tree: 34 files changed (+916 / -668 lines)


CHANGELOG.rst

Lines changed: 9 additions & 0 deletions
@@ -1,6 +1,15 @@
 Model Optimizer Changelog (Linux)
 =================================

+0.39 (2025-10-xx)
+^^^^^^^^^^^^^^^^^
+
+**Deprecations**
+
+**New Features**
+
+- Add flag ``op_types_to_exclude_fp16`` in ONNX quantization to exclude ops from being converted to FP16/BF16. Alternatively, for custom TensorRT ops, this can also be done by indicating ``'fp32'`` precision in ``trt_plugins_precision``.
+
 0.37 (2025-09-xx)
 ^^^^^^^^^^^^^^^^^
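For readers tracking the new flag above, a hedged usage sketch follows. It assumes ``op_types_to_exclude_fp16`` is accepted by ``modelopt.onnx.quantization.quantize`` alongside its existing arguments; the paths, mode, and excluded op list below are placeholders, so check the 0.39 docs for the exact signature.

```python
# Sketch only: paths, quantize_mode, and the excluded op list are illustrative.
from modelopt.onnx.quantization import quantize

quantize(
    onnx_path="model.onnx",               # input ONNX model (placeholder path)
    quantize_mode="fp8",
    op_types_to_exclude_fp16=["Resize"],  # keep these op types in FP32 instead of FP16/BF16
    output_path="model.quant.onnx",       # placeholder output path
)
```

As the changelog entry notes, for custom TensorRT ops the same effect can be achieved by marking the plugin as ``'fp32'`` in ``trt_plugins_precision``.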

docs/source/_templates/autosummary/module.rst

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@
    :recursive:
 {% for item in modules %}
 {% set full_item = fullname + '.' + item.split('.')[-1] %}
-{% if '.plugins.' not in full_item or full_item == 'modelopt.torch.opt.plugins.huggingface' %}
+{% if ('.plugins.' not in full_item or full_item == 'modelopt.torch.opt.plugins.huggingface') and full_item != 'modelopt.torch.quantization.backends.fp8_per_tensor_gemm' %}
 {{ full_item }}
 {% endif %}
 {%- endfor %}

docs/source/guides/_compress_quantized_models.rst

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@ After PTQ, the model can be compressed with the following code:
 Initialize HF models with compressed weights for lower memory usage
 ===================================================================

-When working with large language models, memory constraints can be a significant challenge. ModelOpt provides a workflow for initaializing HF models with compressed weights across multiple GPUs to dramatically reduce memory usage.
+When working with large language models, memory constraints can be a significant challenge. ModelOpt provides a workflow for initializing HF models with compressed weights across multiple GPUs to dramatically reduce memory usage.

 For quantized formats like NVFP4, you can reduce memory usage by up to 4x compared to FP16/BF16 models. One limitation is that this workflow only works with max calibration algorithm.

examples/llm_ptq/example_utils.py

Lines changed: 146 additions & 0 deletions
@@ -13,9 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import glob
 import os
+import shutil
 import sys
 import warnings
+from pathlib import Path
 from typing import Any

 import torch
@@ -24,6 +27,11 @@
 from accelerate.utils import get_max_memory
 from transformers import AutoConfig, AutoModelForCausalLM, AutoProcessor, AutoTokenizer

+try:
+    from huggingface_hub import snapshot_download
+except ImportError:
+    snapshot_download = None
+
 from modelopt.torch.utils.image_processor import MllamaImageProcessor

 SPECULATIVE_MODEL_LIST = ["Eagle", "Medusa"]
@@ -253,3 +261,141 @@ def apply_kv_cache_quant(quant_cfg: dict[str, Any], kv_cache_quant_cfg: dict[str
         quant_cfg["algorithm"] = "max"

     return quant_cfg
+
+
+def _resolve_model_path(model_name_or_path: str, trust_remote_code: bool = False) -> str:
+    """Resolve a model name or path to a local directory path.
+
+    If the input is already a local directory, returns it as-is.
+    If the input is a HuggingFace model ID, attempts to resolve it to the local cache path.
+
+    Args:
+        model_name_or_path: Either a local directory path or HuggingFace model ID
+        trust_remote_code: Whether to trust remote code when loading the model
+
+    Returns:
+        Local directory path to the model files
+    """
+    # If it's already a local directory, return as-is
+    if os.path.isdir(model_name_or_path):
+        return model_name_or_path
+
+    # Try to resolve HuggingFace model ID to local cache path
+    try:
+        # First try to load the config to trigger caching
+        config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=trust_remote_code)
+
+        # The config object should have the local path information
+        # Try different ways to get the cached path
+        if hasattr(config, "_name_or_path") and os.path.isdir(config._name_or_path):
+            return config._name_or_path
+
+        # Alternative: use snapshot_download if available
+        if snapshot_download is not None:
+            try:
+                local_path = snapshot_download(
+                    repo_id=model_name_or_path,
+                    allow_patterns=["*.py", "*.json"],  # Only download Python files and config
+                )
+                return local_path
+            except Exception as e:
+                print(f"Warning: Could not download model files using snapshot_download: {e}")
+
+        # Fallback: try to find in HuggingFace cache
+        from transformers.utils import TRANSFORMERS_CACHE
+
+        # Look for the model in the cache directory
+        cache_pattern = os.path.join(TRANSFORMERS_CACHE, "models--*")
+        cache_dirs = glob.glob(cache_pattern)
+
+        # Convert model name to cache directory format
+        model_cache_name = model_name_or_path.replace("/", "--")
+        for cache_dir in cache_dirs:
+            if model_cache_name in cache_dir:
+                # Look for the snapshots directory
+                snapshots_dir = os.path.join(cache_dir, "snapshots")
+                if os.path.exists(snapshots_dir):
+                    # Get the latest snapshot
+                    snapshot_dirs = [
+                        d
+                        for d in os.listdir(snapshots_dir)
+                        if os.path.isdir(os.path.join(snapshots_dir, d))
+                    ]
+                    if snapshot_dirs:
+                        latest_snapshot = max(snapshot_dirs)  # Use lexicographically latest
+                        snapshot_path = os.path.join(snapshots_dir, latest_snapshot)
+                        return snapshot_path
+
+    except Exception as e:
+        print(f"Warning: Could not resolve model path for {model_name_or_path}: {e}")
+
+    # If all else fails, return the original path
+    # This will cause the copy function to skip with a warning
+    return model_name_or_path
+
+
+def copy_custom_model_files(source_path: str, export_path: str, trust_remote_code: bool = False):
+    """Copy custom model files (configuration_*.py, modeling_*.py, *.json, etc.) from source to export directory.
+
+    This function copies custom Python files and JSON configuration files that are needed for
+    models with custom code. It excludes config.json and model.safetensors.index.json as these
+    are typically handled separately by the model export process.
+
+    Args:
+        source_path: Path to the original model directory or HuggingFace model ID
+        export_path: Path to the exported model directory
+        trust_remote_code: Whether trust_remote_code was used (only copy files if True)
+    """
+    if not trust_remote_code:
+        return
+
+    # Resolve the source path (handles both local paths and HF model IDs)
+    resolved_source_path = _resolve_model_path(source_path, trust_remote_code)
+
+    source_dir = Path(resolved_source_path)
+    export_dir = Path(export_path)
+
+    if not source_dir.exists():
+        if resolved_source_path != source_path:
+            print(
+                f"Warning: Could not find local cache for HuggingFace model '{source_path}' "
+                f"(resolved to '{resolved_source_path}')"
+            )
+        else:
+            print(f"Warning: Source directory '{source_path}' does not exist")
+        return
+
+    if not export_dir.exists():
+        print(f"Warning: Export directory {export_path} does not exist")
+        return
+
+    # Common patterns for custom model files that need to be copied
+    custom_file_patterns = [
+        "configuration_*.py",
+        "modeling_*.py",
+        "tokenization_*.py",
+        "processing_*.py",
+        "image_processing_*.py",
+        "feature_extraction_*.py",
+        "*.json",
+    ]
+
+    copied_files = []
+    for pattern in custom_file_patterns:
+        for file_path in source_dir.glob(pattern):
+            if file_path.is_file():
+                # Skip config.json and model.safetensors.index.json as they're handled separately
+                if file_path.name in ["config.json", "model.safetensors.index.json"]:
+                    continue
+                dest_path = export_dir / file_path.name
+                try:
+                    shutil.copy2(file_path, dest_path)
+                    copied_files.append(file_path.name)
+                    print(f"Copied custom model file: {file_path.name}")
+                except Exception as e:
+                    print(f"Warning: Failed to copy {file_path.name}: {e}")
+
+    if copied_files:
+        print(f"Successfully copied {len(copied_files)} custom model files to {export_path}")
+    else:
+        print("No custom model files found to copy")

examples/llm_ptq/hf_ptq.py

Lines changed: 17 additions & 1 deletion
@@ -23,7 +23,14 @@
 import numpy as np
 import torch
 from accelerate.hooks import remove_hook_from_module
-from example_utils import apply_kv_cache_quant, get_model, get_processor, get_tokenizer, is_enc_dec
+from example_utils import (
+    apply_kv_cache_quant,
+    copy_custom_model_files,
+    get_model,
+    get_processor,
+    get_tokenizer,
+    is_enc_dec,
+)
 from transformers import (
     AutoConfig,
     AutoModelForCausalLM,
@@ -61,6 +68,7 @@
 QUANT_CFG_CHOICES: dict[str, dict[str, Any]] = {
     "int8": mtq.INT8_DEFAULT_CFG,
     "int8_sq": mtq.INT8_SMOOTHQUANT_CFG,
+    "int8_wo": mtq.INT8_WEIGHT_ONLY_CFG,
     "fp8": mtq.FP8_DEFAULT_CFG,
     "int4_awq": mtq.INT4_AWQ_CFG,
     "w4a8_awq": mtq.W4A8_AWQ_BETA_CFG,
@@ -94,6 +102,7 @@ def auto_quantize(
         in [
             "fp8",
             "int8_sq",
+            "int8_wo",
             "int4_awq",
             "nvfp4",
             "nvfp4_awq",
@@ -216,6 +225,7 @@ def main(args):
     assert (
         args.qformat
         in [
+            "int8_wo",
            "int4_awq",
            "fp8",
            "nvfp4",
@@ -604,6 +614,9 @@ def output_decode(generated_ids, input_shape):
             inference_tensor_parallel=args.inference_tensor_parallel,
             inference_pipeline_parallel=args.inference_pipeline_parallel,
         )
+
+        # Copy custom model files (Python files and JSON configs) for TensorRT-LLM export
+        copy_custom_model_files(args.pyt_ckpt_path, export_path, args.trust_remote_code)
     else:
         # Check arguments for unified_hf export format and set to default if unsupported arguments are provided
         assert args.sparsity_fmt == "dense", (
@@ -621,6 +634,9 @@ def output_decode(generated_ids, input_shape):
             export_dir=export_path,
         )

+        # Copy custom model files (Python files and JSON configs) if trust_remote_code is used
+        copy_custom_model_files(args.pyt_ckpt_path, export_path, args.trust_remote_code)
+
         # Restore default padding and export the tokenizer as well.
         if tokenizer is not None:
             tokenizer.padding_side = default_padding_side
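The new `int8_wo` choice maps to `mtq.INT8_WEIGHT_ONLY_CFG` in `QUANT_CFG_CHOICES` above. A minimal, self-contained sketch of what that selection amounts to (the toy model and calibration loop are placeholders, not `hf_ptq.py`'s own code):

```python
# Minimal weight-only INT8 sketch; the model and forward_loop here are placeholders.
import torch
import modelopt.torch.quantization as mtq

model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.ReLU(), torch.nn.Linear(64, 8))

def forward_loop(m):
    # Calibration forward passes; weight-only quantization still runs these hooks.
    for _ in range(4):
        m(torch.randn(2, 64))

model = mtq.quantize(model, mtq.INT8_WEIGHT_ONLY_CFG, forward_loop)
```

In the example scripts the same config is selected with `--qformat int8_wo`, which the shell wrapper now also accepts, as the next diff shows.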

examples/llm_ptq/scripts/huggingface_example.sh

Lines changed: 2 additions & 2 deletions
@@ -53,9 +53,9 @@ esac
 IFS=","
 for qformat in $QFORMAT; do
     case $qformat in
-        fp8 | fp8_pc_pt | fp8_pb_wo | int8_sq | int4_awq | w4a8_awq | fp16 | bf16 | nvfp4 | nvfp4_awq | w4a8_nvfp4_fp8 | w4a8_mxfp4_fp8) ;;
+        fp8 | fp8_pc_pt | fp8_pb_wo | int8_wo | int8_sq | int4_awq | w4a8_awq | fp16 | bf16 | nvfp4 | nvfp4_awq | w4a8_nvfp4_fp8 | w4a8_mxfp4_fp8) ;;
         *)
-            echo "Unknown quant argument: Expected one of: [fp8, fp8_pc_pt, fp8_pb_wo, int8_sq, int4_awq, w4a8_awq, fp16, bf16, nvfp4, nvfp4_awq, w4a8_nvfp4_fp8, w4a8_mxfp4_fp8]" >&2
+            echo "Unknown quant argument: Expected one of: [fp8, fp8_pc_pt, fp8_pb_wo, int8_wo, int8_sq, int4_awq, w4a8_awq, fp16, bf16, nvfp4, nvfp4_awq, w4a8_nvfp4_fp8, w4a8_mxfp4_fp8]" >&2
             exit 1
             ;;
     esac

examples/onnx_ptq/README.md

Lines changed: 7 additions & 0 deletions
@@ -26,6 +26,13 @@ Model Optimizer enables highly performant quantization formats including NVFP4,

 Please use the TensorRT docker image (e.g., `nvcr.io/nvidia/tensorrt:25.08-py3`) or visit our [installation docs](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/2_installation.html) for more information.

+Set the following environment variables inside the TensorRT docker.
+
+```bash
+export CUDNN_LIB_DIR=/usr/lib/x86_64-linux-gnu/
+export LD_LIBRARY_PATH="${CUDNN_LIB_DIR}:${LD_LIBRARY_PATH}"
+```
+
 Also follow the installation steps below to upgrade to the latest version of Model Optimizer and install example-specific dependencies.

 ### Local Installation

examples/speculative_decoding/eagle_utils.py

Lines changed: 15 additions & 11 deletions
@@ -236,7 +236,10 @@ def __getitem__(self, i) -> dict[str, torch.Tensor]:


 def make_eagle_supervised_data_module(
-    tokenizer: transformers.PreTrainedTokenizer, data_args, use_offline_training: bool
+    tokenizer: transformers.PreTrainedTokenizer,
+    data_args,
+    use_offline_training: bool,
+    max_length=None,
 ) -> dict:
     """Make dataset and collator for supervised fine-tuning.

@@ -295,15 +298,15 @@
         train_dataset = dataset_cls(valid_entries[:num_train], tokenizer=tokenizer)
         eval_dataset = dataset_cls(valid_entries[num_train:], tokenizer=tokenizer)

-        data_collator = DataCollatorForOffline()
+        data_collator = DataCollatorForOffline(max_length=max_length)
     else:
         print_rank_0("Loading input conversations...")
         dataset_cls = LazySupervisedDataset if data_args.lazy_preprocess else SupervisedDataset

         train_dataset = dataset_cls(data_json[: int(len(data_json) * 0.95)], tokenizer=tokenizer)
         eval_dataset = dataset_cls(data_json[int(len(data_json) * 0.95) :], tokenizer=tokenizer)

-        data_collator = DataCollatorWithPadding()
+        data_collator = DataCollatorWithPadding(max_length=max_length)

     return {
         "train_dataset": train_dataset,
@@ -313,6 +316,9 @@


 class DataCollatorWithPadding:
+    def __init__(self, max_length):
+        self.max_length = max_length
+
     def paddingtensor2d(self, intensors, length):
         n, dim = intensors.shape
         padding_tensor = torch.zeros(length - n, dim, dtype=intensors.dtype)
@@ -325,19 +331,18 @@ def paddingtensor(self, intensors, length):
         return outtensors

     def __call__(self, features: list[dict[str, Any]]) -> dict[str, Any]:
-        max_length = max(item["input_ids"].shape[0] for item in features)
         batch_input_ids = torch.stack(
-            [self.paddingtensor(item["input_ids"], max_length) for item in features]
+            [self.paddingtensor(item["input_ids"], self.max_length) for item in features]
         )
         batch_attention_mask = torch.stack(
-            [self.paddingtensor(item["attention_mask"], max_length) for item in features]
+            [self.paddingtensor(item["attention_mask"], self.max_length) for item in features]
         )
         batch_loss_mask = torch.stack(
-            [self.paddingtensor(item["loss_mask"], max_length) for item in features]
+            [self.paddingtensor(item["loss_mask"], self.max_length) for item in features]
         )

         batch_labels = torch.stack(
-            [self.paddingtensor(item["labels"], max_length) for item in features]
+            [self.paddingtensor(item["labels"], self.max_length) for item in features]
         )

         batch = {
@@ -357,16 +362,15 @@ def __call__(self, features: list[dict[str, Any]]) -> dict[str, Any]:
             raise ValueError("No kwargs found in batch features. Offline data required.")

         features = [item["kwargs"]["base_model_outputs"] for item in features]
-        max_hs_length = max(item["base_model_hidden_states"].shape[0] for item in features)

         batch_hidden_states = torch.stack(
             [
-                self.paddingtensor2d(item["base_model_hidden_states"], max_hs_length)
+                self.paddingtensor2d(item["base_model_hidden_states"], self.max_length)
                 for item in features
             ]
         )
         batch_aux_hidden_states = torch.stack(
-            [self.paddingtensor2d(item["aux_hidden_states"], max_hs_length) for item in features]
+            [self.paddingtensor2d(item["aux_hidden_states"], self.max_length) for item in features]
         )

         batch = {
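A note on the collator change above: padding to a fixed `max_length` (wired from `training_args.training_seq_len` in `main.py` below) keeps every batch the same shape, instead of padding only to the per-batch maximum. A self-contained sketch of the padding behaviour (tensor values and `max_length=8` are illustrative):

```python
import torch

def pad_to(t: torch.Tensor, length: int) -> torch.Tensor:
    # Right-pad a 1-D tensor with zeros up to `length` (mirrors paddingtensor above).
    return torch.cat((t, torch.zeros(length - t.shape[0], dtype=t.dtype)))

features = [
    {"input_ids": torch.tensor([101, 7592, 102])},       # length 3
    {"input_ids": torch.tensor([101, 2023, 2003, 102])}, # length 4
]
max_length = 8  # fixed training sequence length, e.g. training_args.training_seq_len
batch_input_ids = torch.stack([pad_to(f["input_ids"], max_length) for f in features])
print(batch_input_ids.shape)  # torch.Size([2, 8]) for every batch, regardless of contents
```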

examples/speculative_decoding/main.py

Lines changed: 3 additions & 1 deletion
@@ -227,7 +227,9 @@ def train():
     if training_args.mode == "medusa":
         data_module = make_medusa_supervised_data_module(tokenizer, data_args)
     elif training_args.mode in ["eagle1", "eagle3"]:
-        data_module = make_eagle_supervised_data_module(tokenizer, data_args, use_offline_training)
+        data_module = make_eagle_supervised_data_module(
+            tokenizer, data_args, use_offline_training, max_length=training_args.training_seq_len
+        )

     class ARValidationCallback(TrainerCallback):
         def __init__(self, ar_validate_steps: int = 500):

modelopt/onnx/autocast/convert.py

Lines changed: 1 addition & 0 deletions
@@ -179,6 +179,7 @@ def convert_to_f16(
     sanitizer.find_custom_nodes()
     sanitizer.convert_opset()
     sanitizer.ensure_graph_name_exists()
+    sanitizer.convert_fp64_to_fp32()
     model = sanitizer.model

     # Setup internal mappings
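The added `convert_fp64_to_fp32()` call runs during graph sanitization before the FP16/BF16 pass. As a rough illustration only (this is not AutoCast's implementation), a standalone pass that downcasts double-precision initializers and I/O declarations in an ONNX graph might look like this:

```python
# Rough illustration of an FP64 -> FP32 sanitization pass; not AutoCast's actual code.
import numpy as np
import onnx
from onnx import TensorProto, numpy_helper

def downcast_fp64_to_fp32(model: onnx.ModelProto) -> onnx.ModelProto:
    # Recast DOUBLE initializers to FLOAT.
    for init in model.graph.initializer:
        if init.data_type == TensorProto.DOUBLE:
            arr = numpy_helper.to_array(init).astype(np.float32)
            init.CopyFrom(numpy_helper.from_array(arr, init.name))
    # Update graph inputs/outputs/value_info declared as DOUBLE so types stay consistent.
    for vi in list(model.graph.input) + list(model.graph.output) + list(model.graph.value_info):
        tt = vi.type.tensor_type
        if tt.elem_type == TensorProto.DOUBLE:
            tt.elem_type = TensorProto.FLOAT
    return model
```

Downcasting FP64 first presumably leaves the later FP16/BF16 conversion with only a single floating-point source type to reason about.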
