
Commit 71a9f7a

Merge branch 'main' into jennifchen/cp_amax_sync
Signed-off-by: Jenny Chen <[email protected]>
2 parents: 1f7d17e + 615f3c0

File tree: 34 files changed (+916 / -668 lines)


CHANGELOG.rst

Lines changed: 9 additions & 0 deletions
@@ -1,6 +1,15 @@
 Model Optimizer Changelog (Linux)
 =================================

+0.39 (2025-10-xx)
+^^^^^^^^^^^^^^^^^
+
+**Deprecations**
+
+**New Features**
+
+- Add flag ``op_types_to_exclude_fp16`` in ONNX quantization to exclude ops from being converted to FP16/BF16. Alternatively, for custom TensorRT ops, this can also be done by indicating ``'fp32'`` precision in ``trt_plugins_precision``.
+
 0.37 (2025-09-xx)
 ^^^^^^^^^^^^^^^^^
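For readers tracking the new flag above, a hedged usage sketch follows. It assumes ``op_types_to_exclude_fp16`` is accepted by ``modelopt.onnx.quantization.quantize`` alongside its existing arguments; the paths, mode, and excluded op list below are placeholders, so check the 0.39 docs for the exact signature.

```python
# Sketch only: paths, quantize_mode, and the excluded op list are illustrative.
from modelopt.onnx.quantization import quantize

quantize(
    onnx_path="model.onnx",               # input ONNX model (placeholder path)
    quantize_mode="fp8",
    op_types_to_exclude_fp16=["Resize"],  # keep these op types in FP32 instead of FP16/BF16
    output_path="model.quant.onnx",       # placeholder output path
)
```

As the changelog entry notes, for custom TensorRT ops the same effect can be achieved by marking the plugin as ``'fp32'`` in ``trt_plugins_precision``.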

docs/source/_templates/autosummary/module.rst

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@
    :recursive:
 {% for item in modules %}
 {% set full_item = fullname + '.' + item.split('.')[-1] %}
-{% if '.plugins.' not in full_item or full_item == 'modelopt.torch.opt.plugins.huggingface' %}
+{% if ('.plugins.' not in full_item or full_item == 'modelopt.torch.opt.plugins.huggingface') and full_item != 'modelopt.torch.quantization.backends.fp8_per_tensor_gemm' %}
 {{ full_item }}
 {% endif %}
 {%- endfor %}

docs/source/guides/_compress_quantized_models.rst

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@ After PTQ, the model can be compressed with the following code:
 Initialize HF models with compressed weights for lower memory usage
 ===================================================================

-When working with large language models, memory constraints can be a significant challenge. ModelOpt provides a workflow for initaializing HF models with compressed weights across multiple GPUs to dramatically reduce memory usage.
+When working with large language models, memory constraints can be a significant challenge. ModelOpt provides a workflow for initializing HF models with compressed weights across multiple GPUs to dramatically reduce memory usage.

 For quantized formats like NVFP4, you can reduce memory usage by up to 4x compared to FP16/BF16 models. One limitation is that this workflow only works with max calibration algorithm.

examples/llm_ptq/example_utils.py

Lines changed: 146 additions & 0 deletions
@@ -13,9 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import glob
 import os
+import shutil
 import sys
 import warnings
+from pathlib import Path
 from typing import Any

 import torch
@@ -24,6 +27,11 @@
 from accelerate.utils import get_max_memory
 from transformers import AutoConfig, AutoModelForCausalLM, AutoProcessor, AutoTokenizer

+try:
+    from huggingface_hub import snapshot_download
+except ImportError:
+    snapshot_download = None
+
 from modelopt.torch.utils.image_processor import MllamaImageProcessor

 SPECULATIVE_MODEL_LIST = ["Eagle", "Medusa"]
@@ -253,3 +261,141 @@ def apply_kv_cache_quant(quant_cfg: dict[str, Any], kv_cache_quant_cfg: dict[str
         quant_cfg["algorithm"] = "max"

     return quant_cfg
+
+
+def _resolve_model_path(model_name_or_path: str, trust_remote_code: bool = False) -> str:
+    """Resolve a model name or path to a local directory path.
+
+    If the input is already a local directory, returns it as-is.
+    If the input is a HuggingFace model ID, attempts to resolve it to the local cache path.
+
+    Args:
+        model_name_or_path: Either a local directory path or HuggingFace model ID
+        trust_remote_code: Whether to trust remote code when loading the model
+
+    Returns:
+        Local directory path to the model files
+    """
+    # If it's already a local directory, return as-is
+    if os.path.isdir(model_name_or_path):
+        return model_name_or_path
+
+    # Try to resolve HuggingFace model ID to local cache path
+    try:
+        # First try to load the config to trigger caching
+        config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=trust_remote_code)
+
+        # The config object should have the local path information
+        # Try different ways to get the cached path
+        if hasattr(config, "_name_or_path") and os.path.isdir(config._name_or_path):
+            return config._name_or_path
+
+        # Alternative: use snapshot_download if available
+        if snapshot_download is not None:
+            try:
+                local_path = snapshot_download(
+                    repo_id=model_name_or_path,
+                    allow_patterns=["*.py", "*.json"],  # Only download Python files and config
+                )
+                return local_path
+            except Exception as e:
+                print(f"Warning: Could not download model files using snapshot_download: {e}")
+
+        # Fallback: try to find in HuggingFace cache
+        from transformers.utils import TRANSFORMERS_CACHE
+
+        # Look for the model in the cache directory
+        cache_pattern = os.path.join(TRANSFORMERS_CACHE, "models--*")
+        cache_dirs = glob.glob(cache_pattern)
+
+        # Convert model name to cache directory format
+        model_cache_name = model_name_or_path.replace("/", "--")
+        for cache_dir in cache_dirs:
+            if model_cache_name in cache_dir:
+                # Look for the snapshots directory
+                snapshots_dir = os.path.join(cache_dir, "snapshots")
+                if os.path.exists(snapshots_dir):
+                    # Get the latest snapshot
+                    snapshot_dirs = [
+                        d
+                        for d in os.listdir(snapshots_dir)
+                        if os.path.isdir(os.path.join(snapshots_dir, d))
+                    ]
+                    if snapshot_dirs:
+                        latest_snapshot = max(snapshot_dirs)  # Use lexicographically latest
+                        snapshot_path = os.path.join(snapshots_dir, latest_snapshot)
+                        return snapshot_path
+
+    except Exception as e:
+        print(f"Warning: Could not resolve model path for {model_name_or_path}: {e}")
+
+    # If all else fails, return the original path
+    # This will cause the copy function to skip with a warning
+    return model_name_or_path
+
+
+def copy_custom_model_files(source_path: str, export_path: str, trust_remote_code: bool = False):
+    """Copy custom model files (configuration_*.py, modeling_*.py, *.json, etc.) from source to export directory.
+
+    This function copies custom Python files and JSON configuration files that are needed for
+    models with custom code. It excludes config.json and model.safetensors.index.json as these
+    are typically handled separately by the model export process.
+
+    Args:
+        source_path: Path to the original model directory or HuggingFace model ID
+        export_path: Path to the exported model directory
+        trust_remote_code: Whether trust_remote_code was used (only copy files if True)
+    """
+    if not trust_remote_code:
+        return
+
+    # Resolve the source path (handles both local paths and HF model IDs)
+    resolved_source_path = _resolve_model_path(source_path, trust_remote_code)
+
+    source_dir = Path(resolved_source_path)
+    export_dir = Path(export_path)
+
+    if not source_dir.exists():
+        if resolved_source_path != source_path:
+            print(
+                f"Warning: Could not find local cache for HuggingFace model '{source_path}' "
+                f"(resolved to '{resolved_source_path}')"
+            )
+        else:
+            print(f"Warning: Source directory '{source_path}' does not exist")
+        return
+
+    if not export_dir.exists():
+        print(f"Warning: Export directory {export_path} does not exist")
+        return
+
+    # Common patterns for custom model files that need to be copied
+    custom_file_patterns = [
+        "configuration_*.py",
+        "modeling_*.py",
+        "tokenization_*.py",
+        "processing_*.py",
+        "image_processing_*.py",
+        "feature_extraction_*.py",
+        "*.json",
+    ]
+
+    copied_files = []
+    for pattern in custom_file_patterns:
+        for file_path in source_dir.glob(pattern):
+            if file_path.is_file():
+                # Skip config.json and model.safetensors.index.json as they're handled separately
+                if file_path.name in ["config.json", "model.safetensors.index.json"]:
+                    continue
+                dest_path = export_dir / file_path.name
+                try:
+                    shutil.copy2(file_path, dest_path)
+                    copied_files.append(file_path.name)
+                    print(f"Copied custom model file: {file_path.name}")
+                except Exception as e:
+                    print(f"Warning: Failed to copy {file_path.name}: {e}")
+
+    if copied_files:
+        print(f"Successfully copied {len(copied_files)} custom model files to {export_path}")
+    else:
+        print("No custom model files found to copy")

examples/llm_ptq/hf_ptq.py

Lines changed: 17 additions & 1 deletion
@@ -23,7 +23,14 @@
 import numpy as np
 import torch
 from accelerate.hooks import remove_hook_from_module
-from example_utils import apply_kv_cache_quant, get_model, get_processor, get_tokenizer, is_enc_dec
+from example_utils import (
+    apply_kv_cache_quant,
+    copy_custom_model_files,
+    get_model,
+    get_processor,
+    get_tokenizer,
+    is_enc_dec,
+)
 from transformers import (
     AutoConfig,
     AutoModelForCausalLM,
@@ -61,6 +68,7 @@
 QUANT_CFG_CHOICES: dict[str, dict[str, Any]] = {
     "int8": mtq.INT8_DEFAULT_CFG,
     "int8_sq": mtq.INT8_SMOOTHQUANT_CFG,
+    "int8_wo": mtq.INT8_WEIGHT_ONLY_CFG,
     "fp8": mtq.FP8_DEFAULT_CFG,
     "int4_awq": mtq.INT4_AWQ_CFG,
     "w4a8_awq": mtq.W4A8_AWQ_BETA_CFG,
@@ -94,6 +102,7 @@ def auto_quantize(
         in [
             "fp8",
             "int8_sq",
+            "int8_wo",
             "int4_awq",
             "nvfp4",
             "nvfp4_awq",
@@ -216,6 +225,7 @@ def main(args):
     assert (
         args.qformat
         in [
+            "int8_wo",
            "int4_awq",
            "fp8",
            "nvfp4",
@@ -604,6 +614,9 @@ def output_decode(generated_ids, input_shape):
             inference_tensor_parallel=args.inference_tensor_parallel,
             inference_pipeline_parallel=args.inference_pipeline_parallel,
         )
+
+        # Copy custom model files (Python files and JSON configs) for TensorRT-LLM export
+        copy_custom_model_files(args.pyt_ckpt_path, export_path, args.trust_remote_code)
     else:
         # Check arguments for unified_hf export format and set to default if unsupported arguments are provided
         assert args.sparsity_fmt == "dense", (
@@ -621,6 +634,9 @@ def output_decode(generated_ids, input_shape):
             export_dir=export_path,
         )

+        # Copy custom model files (Python files and JSON configs) if trust_remote_code is used
+        copy_custom_model_files(args.pyt_ckpt_path, export_path, args.trust_remote_code)
+
         # Restore default padding and export the tokenizer as well.
         if tokenizer is not None:
             tokenizer.padding_side = default_padding_side
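The new `int8_wo` choice maps to `mtq.INT8_WEIGHT_ONLY_CFG` in `QUANT_CFG_CHOICES` above. A minimal, self-contained sketch of what that selection amounts to (the toy model and calibration loop are placeholders, not `hf_ptq.py`'s own code):

```python
# Minimal weight-only INT8 sketch; the model and forward_loop here are placeholders.
import torch
import modelopt.torch.quantization as mtq

model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.ReLU(), torch.nn.Linear(64, 8))

def forward_loop(m):
    # Calibration forward passes; weight-only quantization still runs these hooks.
    for _ in range(4):
        m(torch.randn(2, 64))

model = mtq.quantize(model, mtq.INT8_WEIGHT_ONLY_CFG, forward_loop)
```

In the example scripts the same config is selected with `--qformat int8_wo`, which the shell wrapper now also accepts, as the next diff shows.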

examples/llm_ptq/scripts/huggingface_example.sh

Lines changed: 2 additions & 2 deletions
@@ -53,9 +53,9 @@ esac
 IFS=","
 for qformat in $QFORMAT; do
     case $qformat in
-        fp8 | fp8_pc_pt | fp8_pb_wo | int8_sq | int4_awq | w4a8_awq | fp16 | bf16 | nvfp4 | nvfp4_awq | w4a8_nvfp4_fp8 | w4a8_mxfp4_fp8) ;;
+        fp8 | fp8_pc_pt | fp8_pb_wo | int8_wo | int8_sq | int4_awq | w4a8_awq | fp16 | bf16 | nvfp4 | nvfp4_awq | w4a8_nvfp4_fp8 | w4a8_mxfp4_fp8) ;;
         *)
-            echo "Unknown quant argument: Expected one of: [fp8, fp8_pc_pt, fp8_pb_wo, int8_sq, int4_awq, w4a8_awq, fp16, bf16, nvfp4, nvfp4_awq, w4a8_nvfp4_fp8, w4a8_mxfp4_fp8]" >&2
+            echo "Unknown quant argument: Expected one of: [fp8, fp8_pc_pt, fp8_pb_wo, int8_wo, int8_sq, int4_awq, w4a8_awq, fp16, bf16, nvfp4, nvfp4_awq, w4a8_nvfp4_fp8, w4a8_mxfp4_fp8]" >&2
             exit 1
             ;;
     esac

examples/onnx_ptq/README.md

Lines changed: 7 additions & 0 deletions
@@ -26,6 +26,13 @@ Model Optimizer enables highly performant quantization formats including NVFP4,

 Please use the TensorRT docker image (e.g., `nvcr.io/nvidia/tensorrt:25.08-py3`) or visit our [installation docs](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/2_installation.html) for more information.

+Set the following environment variables inside the TensorRT docker.
+
+```bash
+export CUDNN_LIB_DIR=/usr/lib/x86_64-linux-gnu/
+export LD_LIBRARY_PATH="${CUDNN_LIB_DIR}:${LD_LIBRARY_PATH}"
+```
+
 Also follow the installation steps below to upgrade to the latest version of Model Optimizer and install example-specific dependencies.

 ### Local Installation

examples/speculative_decoding/eagle_utils.py

Lines changed: 15 additions & 11 deletions
@@ -236,7 +236,10 @@ def __getitem__(self, i) -> dict[str, torch.Tensor]:


 def make_eagle_supervised_data_module(
-    tokenizer: transformers.PreTrainedTokenizer, data_args, use_offline_training: bool
+    tokenizer: transformers.PreTrainedTokenizer,
+    data_args,
+    use_offline_training: bool,
+    max_length=None,
 ) -> dict:
     """Make dataset and collator for supervised fine-tuning.

@@ -295,15 +298,15 @@
         train_dataset = dataset_cls(valid_entries[:num_train], tokenizer=tokenizer)
         eval_dataset = dataset_cls(valid_entries[num_train:], tokenizer=tokenizer)

-        data_collator = DataCollatorForOffline()
+        data_collator = DataCollatorForOffline(max_length=max_length)
     else:
         print_rank_0("Loading input conversations...")
         dataset_cls = LazySupervisedDataset if data_args.lazy_preprocess else SupervisedDataset

         train_dataset = dataset_cls(data_json[: int(len(data_json) * 0.95)], tokenizer=tokenizer)
         eval_dataset = dataset_cls(data_json[int(len(data_json) * 0.95) :], tokenizer=tokenizer)

-        data_collator = DataCollatorWithPadding()
+        data_collator = DataCollatorWithPadding(max_length=max_length)

     return {
         "train_dataset": train_dataset,
@@ -313,6 +316,9 @@


 class DataCollatorWithPadding:
+    def __init__(self, max_length):
+        self.max_length = max_length
+
     def paddingtensor2d(self, intensors, length):
         n, dim = intensors.shape
         padding_tensor = torch.zeros(length - n, dim, dtype=intensors.dtype)
@@ -325,19 +331,18 @@ def paddingtensor(self, intensors, length):
         return outtensors

     def __call__(self, features: list[dict[str, Any]]) -> dict[str, Any]:
-        max_length = max(item["input_ids"].shape[0] for item in features)
         batch_input_ids = torch.stack(
-            [self.paddingtensor(item["input_ids"], max_length) for item in features]
+            [self.paddingtensor(item["input_ids"], self.max_length) for item in features]
         )
         batch_attention_mask = torch.stack(
-            [self.paddingtensor(item["attention_mask"], max_length) for item in features]
+            [self.paddingtensor(item["attention_mask"], self.max_length) for item in features]
         )
         batch_loss_mask = torch.stack(
-            [self.paddingtensor(item["loss_mask"], max_length) for item in features]
+            [self.paddingtensor(item["loss_mask"], self.max_length) for item in features]
         )

         batch_labels = torch.stack(
-            [self.paddingtensor(item["labels"], max_length) for item in features]
+            [self.paddingtensor(item["labels"], self.max_length) for item in features]
         )

         batch = {
@@ -357,16 +362,15 @@ def __call__(self, features: list[dict[str, Any]]) -> dict[str, Any]:
             raise ValueError("No kwargs found in batch features. Offline data required.")

         features = [item["kwargs"]["base_model_outputs"] for item in features]
-        max_hs_length = max(item["base_model_hidden_states"].shape[0] for item in features)

         batch_hidden_states = torch.stack(
             [
-                self.paddingtensor2d(item["base_model_hidden_states"], max_hs_length)
+                self.paddingtensor2d(item["base_model_hidden_states"], self.max_length)
                 for item in features
             ]
         )
         batch_aux_hidden_states = torch.stack(
-            [self.paddingtensor2d(item["aux_hidden_states"], max_hs_length) for item in features]
+            [self.paddingtensor2d(item["aux_hidden_states"], self.max_length) for item in features]
         )

         batch = {
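A note on the collator change above: padding to a fixed `max_length` (wired from `training_args.training_seq_len` in `main.py` below) keeps every batch the same shape, instead of padding only to the per-batch maximum. A self-contained sketch of the padding behaviour (tensor values and `max_length=8` are illustrative):

```python
import torch

def pad_to(t: torch.Tensor, length: int) -> torch.Tensor:
    # Right-pad a 1-D tensor with zeros up to `length` (mirrors paddingtensor above).
    return torch.cat((t, torch.zeros(length - t.shape[0], dtype=t.dtype)))

features = [
    {"input_ids": torch.tensor([101, 7592, 102])},       # length 3
    {"input_ids": torch.tensor([101, 2023, 2003, 102])}, # length 4
]
max_length = 8  # fixed training sequence length, e.g. training_args.training_seq_len
batch_input_ids = torch.stack([pad_to(f["input_ids"], max_length) for f in features])
print(batch_input_ids.shape)  # torch.Size([2, 8]) for every batch, regardless of contents
```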

examples/speculative_decoding/main.py

Lines changed: 3 additions & 1 deletion
@@ -227,7 +227,9 @@ def train():
     if training_args.mode == "medusa":
         data_module = make_medusa_supervised_data_module(tokenizer, data_args)
     elif training_args.mode in ["eagle1", "eagle3"]:
-        data_module = make_eagle_supervised_data_module(tokenizer, data_args, use_offline_training)
+        data_module = make_eagle_supervised_data_module(
+            tokenizer, data_args, use_offline_training, max_length=training_args.training_seq_len
+        )

     class ARValidationCallback(TrainerCallback):
         def __init__(self, ar_validate_steps: int = 500):

modelopt/onnx/autocast/convert.py

Lines changed: 1 addition & 0 deletions
@@ -179,6 +179,7 @@ def convert_to_f16(
     sanitizer.find_custom_nodes()
     sanitizer.convert_opset()
     sanitizer.ensure_graph_name_exists()
+    sanitizer.convert_fp64_to_fp32()
     model = sanitizer.model

     # Setup internal mappings
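The added `convert_fp64_to_fp32()` call runs during graph sanitization before the FP16/BF16 pass. As a rough illustration only (this is not AutoCast's implementation), a standalone pass that downcasts double-precision initializers and I/O declarations in an ONNX graph might look like this:

```python
# Rough illustration of an FP64 -> FP32 sanitization pass; not AutoCast's actual code.
import numpy as np
import onnx
from onnx import TensorProto, numpy_helper

def downcast_fp64_to_fp32(model: onnx.ModelProto) -> onnx.ModelProto:
    # Recast DOUBLE initializers to FLOAT.
    for init in model.graph.initializer:
        if init.data_type == TensorProto.DOUBLE:
            arr = numpy_helper.to_array(init).astype(np.float32)
            init.CopyFrom(numpy_helper.from_array(arr, init.name))
    # Update graph inputs/outputs/value_info declared as DOUBLE so types stay consistent.
    for vi in list(model.graph.input) + list(model.graph.output) + list(model.graph.value_info):
        tt = vi.type.tensor_type
        if tt.elem_type == TensorProto.DOUBLE:
            tt.elem_type = TensorProto.FLOAT
    return model
```

Downcasting FP64 first presumably leaves the later FP16/BF16 conversion with only a single floating-point source type to reason about.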
