Commit 2abe00c

Push latest changes and bug fixes
Signed-off-by: Keval Morabia <[email protected]>
1 parent dd0bed2 commit 2abe00c

74 files changed: +4114 -1099 lines changed

.github/CODEOWNERS

Lines changed: 2 additions & 1 deletion
@@ -43,8 +43,9 @@ examples/llm_eval @NVIDIA/modelopt-examples-llm_ptq-codeowners
 examples/llm_ptq @NVIDIA/modelopt-examples-llm_ptq-codeowners
 examples/llm_qat @NVIDIA/modelopt-examples-llm_qat-codeowners
 examples/llm_sparsity @NVIDIA/modelopt-torch-sparsity-codeowners
+examples/megatron-lm @NVIDIA/modelopt-examples-megatron-codeowners
 examples/model_hub @NVIDIA/modelopt-examples-model_hub-codeowners
-examples/nemo_run @NVIDIA/modelopt-examples-nemo_run-codeowners
+examples/nemo_run @NVIDIA/modelopt-examples-megatron-codeowners
 examples/onnx_ptq @NVIDIA/modelopt-onnx-codeowners
 examples/pruning @NVIDIA/modelopt-torch-nas-prune-codeowners
 examples/speculative_decoding @NVIDIA/modelopt-torch-speculative-codeowners

CHANGELOG.rst

Lines changed: 1 addition & 0 deletions
@@ -26,6 +26,7 @@ Model Optimizer Changelog (Linux)
 - Add support for ``mamba_num_heads``, ``mamba_head_dim``, ``hidden_size`` and ``num_layers`` pruning for Megatron Core Mamba or Hybrid Transformer Mamba models in ``mcore_minitron`` (previously ``mcore_gpt_minitron``) mode.
 - Add example for QAT/QAD training with `LLaMA Factory <https://github.com/hiyouga/LLaMA-Factory/tree/main>`_. See ``examples/llm_qat/llama_factory`` for more details.
 - Upgrade TensorRT-LLM dependency to 1.0.0rc6.
+- Add unified HuggingFace model export support for quantized NVFP4 GPT-OSS models.
 
 0.33 (2025-07-14)
 ^^^^^^^^^^^^^^^^^
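The new changelog entry covers exporting a quantized NVFP4 GPT-OSS model as a unified HuggingFace checkpoint. A minimal sketch of how such a flow is typically driven with ModelOpt follows; the quantization config name, export helper, and model id are assumptions for illustration and are not taken from this commit.

import modelopt.torch.quantization as mtq
from modelopt.torch.export import export_hf_checkpoint  # assumed export entry point
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("openai/gpt-oss-20b", torch_dtype="bfloat16")

def forward_loop(m):
    # Run a small calibration dataset through the model here.
    ...

model = mtq.quantize(model, mtq.NVFP4_DEFAULT_CFG, forward_loop)  # assumed NVFP4 config name
export_hf_checkpoint(model, export_dir="gpt-oss-20b-nvfp4")       # unified HF-style export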

examples/chained_optimizations/bert_prune_distill_quantize.py

Lines changed: 1 addition & 0 deletions
@@ -1107,6 +1107,7 @@ def main(input_args: list[str] | None = None) -> None:
         format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
         datefmt="%m/%d/%Y %H:%M:%S",
         level=logging.INFO,
+        force=True,
     )
     logger.info(accelerator.state, main_process_only=False)
     if accelerator.is_local_main_process:
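For context on the `force=True` addition: `logging.basicConfig()` is a no-op when the root logger already has handlers (for example, if an imported library or an earlier call configured logging first), so the format, date format, and level set here would otherwise be silently ignored. A small self-contained illustration of the behavior (not taken from this commit):

import logging

logging.basicConfig(level=logging.WARNING)           # earlier configuration wins by default
logging.basicConfig(level=logging.INFO)              # no-op: root logger already has handlers
logging.basicConfig(level=logging.INFO, force=True)  # removes existing handlers; INFO now applies
logging.getLogger(__name__).info("now visible")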

examples/diffusers/quantization/diffusion_trt.py

Lines changed: 16 additions & 2 deletions
@@ -22,7 +22,7 @@
     remove_nesting,
     update_dynamic_axes,
 )
-from quantize import create_pipeline
+from quantize import ModelType, PipelineManager
 
 import modelopt.torch.opt as mto
 from modelopt.torch._deploy._runtime import RuntimeRegistry
@@ -31,6 +31,20 @@
 from modelopt.torch._deploy.device_model import DeviceModel
 from modelopt.torch._deploy.utils import get_onnx_bytes_and_metadata
 
+MODEL_ID = {
+    "sdxl-1.0": ModelType.SDXL_BASE,
+    "sdxl-turbo": ModelType.SDXL_TURBO,
+    "sd3-medium": ModelType.SD3_MEDIUM,
+    "flux-dev": ModelType.FLUX_DEV,
+    "flux-schnell": ModelType.FLUX_SCHNELL,
+}
+
+dtype_map = {
+    "Half": torch.float16,
+    "BFloat16": torch.bfloat16,
+    "Float": torch.float32,
+}
+
 
 def generate_image(pipe, prompt, image_name):
     seed = 42
@@ -91,7 +105,7 @@ def main():
 
     image_name = args.save_image_as if args.save_image_as else f"{args.model}.png"
 
-    pipe = create_pipeline(args.model, args.model_dtype, args.override_model_path)
+    pipe = PipelineManager.create_pipeline_from(MODEL_ID[args.model], dtype_map[args.model_dtype])
 
     # Save the backbone of the pipeline and move it to the GPU
     add_embedding = None
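At the call site, the CLI strings are now mapped to the enum and dtype objects that the new `PipelineManager.create_pipeline_from` static method expects, using the `MODEL_ID` and `dtype_map` lookups added above. Roughly, with illustrative argument values:

# Illustrative values; mirrors the lookup pattern added in this file.
model_arg, dtype_arg = "flux-dev", "BFloat16"  # e.g. parsed --model / --model-dtype values

model_type = MODEL_ID[model_arg]               # ModelType.FLUX_DEV
torch_dtype = dtype_map[dtype_arg]             # torch.bfloat16
pipe = PipelineManager.create_pipeline_from(model_type, torch_dtype)

Unlike the removed `create_pipeline(args.model, args.model_dtype, args.override_model_path)` call, the new method resolves the checkpoint from `MODEL_REGISTRY` by `ModelType`, so no override path is forwarded here.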

examples/diffusers/quantization/quantize.py

Lines changed: 31 additions & 0 deletions
@@ -306,6 +306,37 @@ def __init__(self, config: ModelConfig, logger: logging.Logger):
         self.pipe: DiffusionPipeline | None = None
         self.pipe_upsample: LTXLatentUpsamplePipeline | None = None  # For LTX-Video upsampling
 
+    @staticmethod
+    def create_pipeline_from(
+        model_type: ModelType, torch_dtype: torch.dtype = torch.bfloat16
+    ) -> DiffusionPipeline:
+        """
+        Create and return an appropriate pipeline based on configuration.
+
+        Returns:
+            Configured diffusion pipeline
+
+        Raises:
+            ValueError: If model type is unsupported
+        """
+        try:
+            model_id = MODEL_REGISTRY[model_type]
+            if model_type == ModelType.SD3_MEDIUM:
+                pipe = StableDiffusion3Pipeline.from_pretrained(model_id, torch_dtype=torch_dtype)
+            elif model_type in [ModelType.FLUX_DEV, ModelType.FLUX_SCHNELL]:
+                pipe = FluxPipeline.from_pretrained(model_id, torch_dtype=torch_dtype)
+            else:
+                # SDXL models
+                pipe = DiffusionPipeline.from_pretrained(
+                    model_id,
+                    torch_dtype=torch_dtype,
+                    use_safetensors=True,
+                )
+            pipe.set_progress_bar_config(disable=True)
+            return pipe
+        except Exception as e:
+            raise e
+
     def create_pipeline(self) -> DiffusionPipeline:
         """
         Create and return an appropriate pipeline based on configuration.

examples/gpt-oss/README.md

Lines changed: 2 additions & 0 deletions
@@ -49,6 +49,8 @@ model = mtq.quantize(model, config, forward_loop)
 train(model, train_loader, optimizer, scheduler, ...)
 ```
 
+For an end-to-end example showcasing the above workflow, check out [qat-finetune-transformers.ipynb](/examples/gpt-oss/qat-finetune-transformers.ipynb).
+
 If you are training Huggingface models with trainer classes from Huggingface such as [SFTTrainer](https://huggingface.co/docs/trl/en/sft_trainer) performing QAT is even easier - simply replace the trainer with its equivalent, `QATSFTTrainer` from ModelOpt and specify additional quantization arguments to it. `QATSFTTrainer` will perform the necessary quantization steps in the backend and train the model just like the original `SFTTrainer`.
 
 A real end-to-end example for this is in `sft.py` in this folder. To perform QAT with full parameter SFT on GPT-OSS 20B model, run:
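The README paragraph above describes swapping the stock trainer for ModelOpt's `QATSFTTrainer`. A hedged sketch of what that swap can look like; the import path, dataset, and the name of the extra quantization argument are assumptions and are not verified against this commit (see `sft.py` in this folder for the real example).

# Sketch only: the import path and the quantization argument are assumed.
from datasets import load_dataset
from transformers import AutoModelForCausalLM
from trl import SFTConfig

# QATSFTTrainer comes from ModelOpt per the README; this module path is a placeholder assumption.
from modelopt.torch.quantization.plugins.transformers import QATSFTTrainer

model = AutoModelForCausalLM.from_pretrained("openai/gpt-oss-20b")
dataset = load_dataset("trl-lib/Capybara", split="train")  # illustrative dataset

trainer = QATSFTTrainer(
    model=model,
    args=SFTConfig(output_dir="gpt-oss-20b-qat"),
    train_dataset=dataset,
    quant_cfg="MXFP4_MLP_WEIGHT_ONLY_CFG",  # assumed extra quantization argument
)
trainer.train()  # quantizes in the backend, then trains like the original SFTTrainer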

examples/gpt-oss/convert_oai_mxfp4_weight_only.py

Lines changed: 13 additions & 9 deletions
@@ -23,11 +23,8 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer, Mxfp4Config
 from utils import get_original_huggingface_quant_method
 
-import modelopt.torch.opt as mto
 from modelopt.torch.quantization.qtensor import MXFP4QTensor
 
-mto.enable_huggingface_checkpointing()
-
 
 def _to_oai_mxfp4_weight_only(model, block_size=32):
     new_state_dict = {}
@@ -36,15 +33,20 @@ def _to_oai_mxfp4_weight_only(model, block_size=32):
         # Only convert experts weights, skip bias and other modules
         if "experts" in name and "bias" not in name:
             param = param.transpose(-1, -2).contiguous()
-            quantized, scales = MXFP4QTensor.quantize(param, block_size=block_size)
-
-            shape = quantized._quantized_data.shape
+            quantized_tensors = []
+            scales_tensors = []
+            for expert in param:
+                quantized, scales = MXFP4QTensor.quantize(expert, block_size=block_size)
+                quantized_tensors.append(quantized._quantized_data)
+                scales_tensors.append(scales)
+            quantized = torch.stack(quantized_tensors)
+            scales = torch.stack(scales_tensors)
+
+            shape = quantized.shape
             # Add converted weights and scales to state_dict
             new_state_dict.update(
                 {
-                    f"{name}_blocks": quantized._quantized_data.view(
-                        shape[0], shape[1], -1, block_size // 2
-                    ).cpu(),
+                    f"{name}_blocks": quantized.view(shape[0], shape[1], -1, block_size // 2).cpu(),
                     f"{name}_scales": scales.view(shape[0], shape[1], -1).cpu(),
                 }
             )
@@ -134,6 +136,8 @@ def create_parser():
     if args.lora_path:
         model = PeftModel.from_pretrained(model, args.lora_path)
        model = model.merge_and_unload()  # Merge LoRA-QAT adapter weights to base model
+        torch.cuda.empty_cache()
+        gc.collect()
 
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
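The restructured `_to_oai_mxfp4_weight_only` now quantizes each expert's 2-D weight separately and stacks the per-expert results back along the leading expert dimension, instead of quantizing the whole 3-D expert tensor in one call. A toy, self-contained illustration of that loop-and-stack pattern, using a stand-in quantizer rather than `MXFP4QTensor.quantize`:

import torch

def fake_quantize(weight_2d: torch.Tensor, block_size: int = 32):
    # Stand-in for MXFP4QTensor.quantize: split each row into blocks and take a per-block scale.
    blocks = weight_2d.reshape(weight_2d.shape[0], -1, block_size)
    scales = blocks.abs().amax(dim=-1)
    return blocks, scales

param = torch.randn(4, 64, 128)  # (num_experts, out_features, in_features), toy sizes

quantized_tensors, scales_tensors = [], []
for expert in param:             # iterate over the leading expert dimension
    q, s = fake_quantize(expert)
    quantized_tensors.append(q)
    scales_tensors.append(s)

quantized = torch.stack(quantized_tensors)  # (num_experts, out_features, in_features // 32, 32)
scales = torch.stack(scales_tensors)        # (num_experts, out_features, in_features // 32)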
