3 changes: 2 additions & 1 deletion .github/CODEOWNERS
@@ -43,8 +43,9 @@ examples/llm_eval @NVIDIA/modelopt-examples-llm_ptq-codeowners
examples/llm_ptq @NVIDIA/modelopt-examples-llm_ptq-codeowners
examples/llm_qat @NVIDIA/modelopt-examples-llm_qat-codeowners
examples/llm_sparsity @NVIDIA/modelopt-torch-sparsity-codeowners
examples/megatron-lm @NVIDIA/modelopt-examples-megatron-codeowners
examples/model_hub @NVIDIA/modelopt-examples-model_hub-codeowners
examples/nemo_run @NVIDIA/modelopt-examples-nemo_run-codeowners
examples/nemo_run @NVIDIA/modelopt-examples-megatron-codeowners
examples/onnx_ptq @NVIDIA/modelopt-onnx-codeowners
examples/pruning @NVIDIA/modelopt-torch-nas-prune-codeowners
examples/speculative_decoding @NVIDIA/modelopt-torch-speculative-codeowners
1 change: 1 addition & 0 deletions CHANGELOG.rst
@@ -26,6 +26,7 @@ Model Optimizer Changelog (Linux)
- Add support for ``mamba_num_heads``, ``mamba_head_dim``, ``hidden_size`` and ``num_layers`` pruning for Megatron Core Mamba or Hybrid Transformer Mamba models in ``mcore_minitron`` (previously ``mcore_gpt_minitron``) mode.
- Add example for QAT/QAD training with `LLaMA Factory <https://github.com/hiyouga/LLaMA-Factory/tree/main>`_. See ``examples/llm_qat/llama_factory`` for more details.
- Upgrade TensorRT-LLM dependency to 1.0.0rc6.
- Add unified HuggingFace model export support for quantized NVFP4 GPT-OSS models.

0.33 (2025-07-14)
^^^^^^^^^^^^^^^^^
@@ -1107,6 +1107,7 @@ def main(input_args: list[str] | None = None) -> None:
format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
level=logging.INFO,
force=True,
)
logger.info(accelerator.state, main_process_only=False)
if accelerator.is_local_main_process:
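The added `force=True` matters because the root logger has typically already been configured by the time this `basicConfig` call runs (for example via Accelerate's logging setup), and without it a second `basicConfig` call is silently ignored. A minimal standalone sketch of that stdlib behavior, separate from the script itself:

```python
# Standalone sketch of logging.basicConfig(force=True) semantics (Python >= 3.8).
import logging

# Pretend an earlier import already attached a handler to the root logger.
logging.basicConfig(level=logging.WARNING)

# Without force=True this second call would be a no-op because the root logger
# already has handlers; force=True removes them first so the new config applies.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
    force=True,
)

logging.getLogger(__name__).info("emitted at INFO with the new format")
```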
18 changes: 16 additions & 2 deletions examples/diffusers/quantization/diffusion_trt.py
@@ -22,7 +22,7 @@
remove_nesting,
update_dynamic_axes,
)
from quantize import create_pipeline
from quantize import ModelType, PipelineManager

import modelopt.torch.opt as mto
from modelopt.torch._deploy._runtime import RuntimeRegistry
@@ -31,6 +31,20 @@
from modelopt.torch._deploy.device_model import DeviceModel
from modelopt.torch._deploy.utils import get_onnx_bytes_and_metadata

MODEL_ID = {
"sdxl-1.0": ModelType.SDXL_BASE,
"sdxl-turbo": ModelType.SDXL_TURBO,
"sd3-medium": ModelType.SD3_MEDIUM,
"flux-dev": ModelType.FLUX_DEV,
"flux-schnell": ModelType.FLUX_SCHNELL,
}

dtype_map = {
"Half": torch.float16,
"BFloat16": torch.bfloat16,
"Float": torch.float32,
}


def generate_image(pipe, prompt, image_name):
seed = 42
@@ -91,7 +105,7 @@ def main():

image_name = args.save_image_as if args.save_image_as else f"{args.model}.png"

pipe = create_pipeline(args.model, args.model_dtype, args.override_model_path)
pipe = PipelineManager.create_pipeline_from(MODEL_ID[args.model], dtype_map[args.model_dtype])

# Save the backbone of the pipeline and move it to the GPU
add_embedding = None
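Together, the hunks above replace the old `create_pipeline(args.model, args.model_dtype, args.override_model_path)` helper with a lookup through `MODEL_ID` and `dtype_map` followed by `PipelineManager.create_pipeline_from`. A rough usage sketch of the new call path; only `ModelType`, `PipelineManager`, and `create_pipeline_from` come from this diff, while the prompt, step count, and filenames are made up:

```python
# Illustrative only: mirrors the MODEL_ID / dtype_map lookups defined above.
import torch
from quantize import ModelType, PipelineManager

model_type = {"flux-dev": ModelType.FLUX_DEV}["flux-dev"]  # MODEL_ID[args.model]
torch_dtype = {"BFloat16": torch.bfloat16}["BFloat16"]     # dtype_map[args.model_dtype]

pipe = PipelineManager.create_pipeline_from(model_type, torch_dtype)
pipe.to("cuda")

image = pipe("a photo of an astronaut riding a horse", num_inference_steps=30).images[0]
image.save("flux-dev.png")
```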
31 changes: 31 additions & 0 deletions examples/diffusers/quantization/quantize.py
@@ -306,6 +306,37 @@ def __init__(self, config: ModelConfig, logger: logging.Logger):
self.pipe: DiffusionPipeline | None = None
self.pipe_upsample: LTXLatentUpsamplePipeline | None = None # For LTX-Video upsampling

@staticmethod
def create_pipeline_from(
model_type: ModelType, torch_dtype: torch.dtype = torch.bfloat16
) -> DiffusionPipeline:
"""
Create and return an appropriate pipeline based on configuration.
Returns:
Configured diffusion pipeline
Raises:
ValueError: If model type is unsupported
"""
try:
model_id = MODEL_REGISTRY[model_type]
if model_type == ModelType.SD3_MEDIUM:
pipe = StableDiffusion3Pipeline.from_pretrained(model_id, torch_dtype=torch_dtype)
elif model_type in [ModelType.FLUX_DEV, ModelType.FLUX_SCHNELL]:
pipe = FluxPipeline.from_pretrained(model_id, torch_dtype=torch_dtype)
else:
# SDXL models
pipe = DiffusionPipeline.from_pretrained(
model_id,
torch_dtype=torch_dtype,
use_safetensors=True,
)
pipe.set_progress_bar_config(disable=True)
return pipe
except Exception as e:
raise e

def create_pipeline(self) -> DiffusionPipeline:
"""
Create and return an appropriate pipeline based on configuration.
2 changes: 2 additions & 0 deletions examples/gpt-oss/README.md
@@ -49,6 +49,8 @@ model = mtq.quantize(model, config, forward_loop)
train(model, train_loader, optimizer, scheduler, ...)
```

For an end-to-end example showcasing the above workflow, check out [qat-finetune-transformers.ipynb](/examples/gpt-oss/qat-finetune-transformers.ipynb).

If you are training Huggingface models with Huggingface trainer classes such as [SFTTrainer](https://huggingface.co/docs/trl/en/sft_trainer), performing QAT is even easier: simply replace the trainer with its ModelOpt equivalent, `QATSFTTrainer`, and pass it the additional quantization arguments. `QATSFTTrainer` performs the necessary quantization steps in the backend and trains the model just like the original `SFTTrainer`.
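
Below is a rough sketch of that swap. The import path and the extra quantization argument shown here are assumptions made for illustration, not the documented ModelOpt API; `sft.py` in this folder shows the actual usage.

```python
# Sketch only: QATSFTTrainer's real import path and quantization-argument names
# may differ from what is shown here -- see sft.py in this folder for the
# working example.
from datasets import load_dataset
from trl import SFTConfig

from modelopt.torch.quantization.plugins import QATSFTTrainer  # assumed import path

dataset = load_dataset("trl-lib/Capybara", split="train")

trainer = QATSFTTrainer(  # drop-in replacement for trl.SFTTrainer
    model="openai/gpt-oss-20b",
    args=SFTConfig(output_dir="gpt-oss-20b-qat"),
    train_dataset=dataset,
    quant_cfg="<quantization config>",  # placeholder for the extra quantization argument(s)
)
trainer.train()
```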

A real end-to-end example for this is in `sft.py` in this folder. To perform QAT with full-parameter SFT on the GPT-OSS 20B model, run:
22 changes: 13 additions & 9 deletions examples/gpt-oss/convert_oai_mxfp4_weight_only.py
@@ -23,11 +23,8 @@
from transformers import AutoModelForCausalLM, AutoTokenizer, Mxfp4Config
from utils import get_original_huggingface_quant_method

import modelopt.torch.opt as mto
from modelopt.torch.quantization.qtensor import MXFP4QTensor

mto.enable_huggingface_checkpointing()


def _to_oai_mxfp4_weight_only(model, block_size=32):
new_state_dict = {}
@@ -36,15 +33,20 @@ def _to_oai_mxfp4_weight_only(model, block_size=32):
# Only convert experts weights, skip bias and other modules
if "experts" in name and "bias" not in name:
param = param.transpose(-1, -2).contiguous()
quantized, scales = MXFP4QTensor.quantize(param, block_size=block_size)

shape = quantized._quantized_data.shape
quantized_tensors = []
scales_tensors = []
for expert in param:
quantized, scales = MXFP4QTensor.quantize(expert, block_size=block_size)
quantized_tensors.append(quantized._quantized_data)
scales_tensors.append(scales)
quantized = torch.stack(quantized_tensors)
scales = torch.stack(scales_tensors)

shape = quantized.shape
# Add converted weights and scales to state_dict
new_state_dict.update(
{
f"{name}_blocks": quantized._quantized_data.view(
shape[0], shape[1], -1, block_size // 2
).cpu(),
f"{name}_blocks": quantized.view(shape[0], shape[1], -1, block_size // 2).cpu(),
f"{name}_scales": scales.view(shape[0], shape[1], -1).cpu(),
}
)
@@ -134,6 +136,8 @@ def create_parser():
if args.lora_path:
model = PeftModel.from_pretrained(model, args.lora_path)
model = model.merge_and_unload() # Merge LoRA-QAT adapter weights to base model
torch.cuda.empty_cache()
gc.collect()

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
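The per-expert loop above quantizes each expert weight separately and stacks the results, so the exported `*_blocks` and `*_scales` tensors keep the leading expert dimension. A shape-only sketch of the packing arithmetic (the dimensions are illustrative and the tensors hold random bytes, not real MXFP4 data):

```python
# Shape-only sketch: MXFP4 packs two 4-bit values per byte, so each 32-value
# block takes block_size // 2 = 16 bytes plus one shared scale byte.
import torch

num_experts, rows, cols, block_size = 32, 2880, 2880, 32  # made-up dimensions

packed = torch.randint(0, 256, (num_experts, rows, cols // 2), dtype=torch.uint8)
scales = torch.randint(0, 256, (num_experts, rows, cols // block_size), dtype=torch.uint8)

blocks = packed.view(num_experts, rows, -1, block_size // 2)
print(blocks.shape)  # torch.Size([32, 2880, 90, 16])
print(scales.shape)  # torch.Size([32, 2880, 90])
```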