
Commit 0b25bcb

Merge branch 'main' into jennifchen/nmh-moe-export
2 parents 7d3245d + 37c4974

39 files changed (+2423 −418 lines)

.github/workflows/code_quality.yml

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@ name: Code Quality
 
 on:
   pull_request:
-    branches: [main, release/*]
+    branches: [main, release/*, feature/*]
   schedule:
     - cron: "0 0 * * *" # Nightly
   workflow_dispatch: # On-demand

.github/workflows/gpu_tests.yml

Lines changed: 2 additions & 2 deletions
@@ -61,7 +61,7 @@ jobs:
     if: needs.check-file-changes.outputs.any_changed == 'true'
     # Runner list at https://github.com/nv-gha-runners/enterprise-runner-configuration/blob/main/docs/runner-groups.md
     runs-on: linux-amd64-gpu-l4-latest-1
-    timeout-minutes: 90
+    timeout-minutes: 120
     container: &gpu_container
       image: nvcr.io/nvidia/pytorch:25.06-py3
       env:
@@ -80,7 +80,7 @@ jobs:
     if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
     # Runner list at https://github.com/nv-gha-runners/enterprise-runner-configuration/blob/main/docs/runner-groups.md
     runs-on: linux-amd64-gpu-h100-latest-1
-    timeout-minutes: 90
+    timeout-minutes: 120
     container: *gpu_container
     steps: *gpu_steps
   gpu-pr-required-check:

.github/workflows/pages.yml

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@ name: Docs
 
 on:
   pull_request:
-    branches: [main, release/*]
+    branches: [main, release/*, feature/*]
   push:
     branches: [main]
   schedule:

.github/workflows/unit_tests.yml

Lines changed: 2 additions & 2 deletions
@@ -3,9 +3,9 @@ name: Unit tests
 
 on:
   pull_request:
-    branches: [main, release/*]
+    branches: [main, release/*, feature/*]
   push:
-    branches: [main, release/*]
+    branches: [main, release/*, feature/*]
     paths:
       - ".github/workflows/unit_tests.yml"
       - "modelopt/**"

CHANGELOG.rst

Lines changed: 3 additions & 0 deletions
@@ -11,6 +11,9 @@ Model Optimizer Changelog (Linux)
 - Support PTQ and fakequant in vLLM for fast evaluation of arbitrary quantization formats. See ``examples/vllm_serve`` for more details.
 - Add support for ``nemotron-post-training-dataset-v2`` and ``nemotron-post-training-dataset-v1`` in ``examples/llm_ptq``. Default to a mix of ``cnn_dailymail`` and ``nemotron-post-training-dataset-v2`` (gated dataset accessed using ``HF_TOKEN`` environment variable) if no dataset is specified.
 - Allow specifying ``calib_seq`` in ``examples/llm_ptq`` to set the maximum sequence length for calibration.
+- Add support for MCore MoE PTQ/QAT/QAD.
+- Add support for multi-node PTQ and export with FSDP2 in ``examples/llm_ptq/multinode_ptq.py``. See `examples/llm_ptq/README.md <https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples/llm_ptq#multi-node-post-training-quantization-with-fsdp2>`_ for more details.
+- Add support for Nemotron Nano VL v1 & v2 models in FP8/NVFP4 PTQ workflow.
 
 **Documentation**
 

examples/diffusers/quantization/diffusion_trt.py

Lines changed: 85 additions & 6 deletions
@@ -23,6 +23,7 @@
     update_dynamic_axes,
 )
 from quantize import ModelType, PipelineManager
+from tqdm import tqdm
 
 import modelopt.torch.opt as mto
 from modelopt.torch._deploy._runtime import RuntimeRegistry
@@ -58,6 +59,59 @@ def generate_image(pipe, prompt, image_name):
     print(f"Image generated saved as {image_name}")
 
 
+def benchmark_model(
+    pipe, prompt, num_warmup=10, num_runs=50, num_inference_steps=20, model_dtype="Half"
+):
+    """Benchmark the backbone model inference time."""
+    backbone = pipe.transformer if hasattr(pipe, "transformer") else pipe.unet
+
+    backbone_times = []
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+
+    def forward_pre_hook(_module, _input):
+        start_event.record()
+
+    def forward_hook(_module, _input, _output):
+        end_event.record()
+        torch.cuda.synchronize()
+        backbone_times.append(start_event.elapsed_time(end_event))
+
+    pre_handle = backbone.register_forward_pre_hook(forward_pre_hook)
+    post_handle = backbone.register_forward_hook(forward_hook)
+
+    try:
+        print(f"Starting warmup: {num_warmup} runs")
+        for _ in tqdm(range(num_warmup), desc="Warmup"):
+            with torch.amp.autocast("cuda", dtype=dtype_map[model_dtype]):
+                _ = pipe(
+                    prompt,
+                    output_type="pil",
+                    num_inference_steps=num_inference_steps,
+                    generator=torch.Generator("cuda").manual_seed(42),
+                )
+
+        backbone_times.clear()
+
+        print(f"Starting benchmark: {num_runs} runs")
+        for _ in tqdm(range(num_runs), desc="Benchmark"):
+            with torch.amp.autocast("cuda", dtype=dtype_map[model_dtype]):
+                _ = pipe(
+                    prompt,
+                    output_type="pil",
+                    num_inference_steps=num_inference_steps,
+                    generator=torch.Generator("cuda").manual_seed(42),
+                )
+    finally:
+        pre_handle.remove()
+        post_handle.remove()
+
+    total_backbone_time = sum(backbone_times)
+    avg_latency = total_backbone_time / (num_runs * num_inference_steps)
+    print(f"Inference latency of the torch backbone: {avg_latency:.2f} ms")
+    return avg_latency
+
+
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument(
@@ -92,15 +146,24 @@ def main():
         "--onnx-load-path", type=str, default="", help="Path to load the ONNX model"
     )
     parser.add_argument(
-        "--trt-engine-load-path", type=str, default=None, help="Path to load the TRT engine"
+        "--trt-engine-load-path", type=str, default=None, help="Path to load the TensorRT engine"
     )
     parser.add_argument(
         "--dq-only", action="store_true", help="Converts the ONNX model to a dq_only model"
     )
     parser.add_argument(
-        "--torch", action="store_true", help="Generate an image using the torch pipeline"
+        "--torch",
+        action="store_true",
+        help="Use the torch pipeline for image generation or benchmarking",
    )
     parser.add_argument("--save-image-as", type=str, default=None, help="Name of the image to save")
+    parser.add_argument(
+        "--benchmark", action="store_true", help="Benchmark the model backbone inference time"
+    )
+    parser.add_argument(
+        "--torch-compile", action="store_true", help="Use torch.compile() on the backbone model"
+    )
+    parser.add_argument("--skip-image", action="store_true", help="Skip image generation")
     args = parser.parse_args()
 
     image_name = args.save_image_as if args.save_image_as else f"{args.model}.png"
@@ -125,13 +188,25 @@ def main():
     if args.restore_from:
         mto.restore(backbone, args.restore_from)
 
+    if args.torch_compile:
+        assert args.model_dtype in ["BFloat16", "Float", "Half"], (
+            "torch.compile() only supports BFloat16 and Float"
+        )
+        print("Compiling backbone with torch.compile()...")
+        backbone = torch.compile(backbone, mode="max-autotune")
+
     if args.torch:
         if hasattr(pipe, "transformer"):
             pipe.transformer = backbone
         elif hasattr(pipe, "unet"):
             pipe.unet = backbone
         pipe.to("cuda")
-        generate_image(pipe, args.prompt, image_name)
+
+        if args.benchmark:
+            benchmark_model(pipe, args.prompt, model_dtype=args.model_dtype)
+
+        if not args.skip_image:
+            generate_image(pipe, args.prompt, image_name)
         return
 
     backbone.to("cuda")
@@ -211,10 +286,14 @@ def main():
         raise ValueError("Pipeline does not have a transformer or unet backbone")
     pipe.to("cuda")
 
-    generate_image(pipe, args.prompt, image_name)
-    print(f"Image generated using {args.model} model saved as {image_name}")
+    if not args.skip_image:
+        generate_image(pipe, args.prompt, image_name)
+        print(f"Image generated using {args.model} model saved as {image_name}")
 
-    print(f"Inference latency of the backbone of the pipeline is {device_model.get_latency()} ms")
+    if args.benchmark:
+        print(
+            f"Inference latency of the TensorRT optimized backbone: {device_model.get_latency()} ms"
+        )
 
 
 if __name__ == "__main__":
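
The new `benchmark_model` helper isolates backbone (transformer/UNet) time by pairing a forward pre-hook and a forward hook with CUDA events, so text-encoder, scheduler, and VAE time are excluded from the reported latency; in the script this path is reached with `--torch --benchmark` (optionally `--skip-image` or `--torch-compile`). Below is a minimal, self-contained sketch of the same timing pattern on a toy module (hypothetical names, requires a CUDA GPU), not the script itself:

```python
import torch
import torch.nn as nn

# Toy stand-in for the pipeline backbone; the real script hooks pipe.transformer or pipe.unet.
backbone = nn.Linear(1024, 1024).cuda().half()

times_ms = []
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)

def pre_hook(_module, _input):
    start.record()  # timestamp just before the backbone forward

def post_hook(_module, _input, _output):
    end.record()
    torch.cuda.synchronize()  # wait for the kernels to finish before reading the timer
    times_ms.append(start.elapsed_time(end))

h_pre = backbone.register_forward_pre_hook(pre_hook)
h_post = backbone.register_forward_hook(post_hook)
try:
    x = torch.randn(8, 1024, device="cuda", dtype=torch.half)
    for _ in range(5):    # warmup runs, then discard their timings
        backbone(x)
    times_ms.clear()
    for _ in range(20):   # timed runs
        backbone(x)
finally:
    h_pre.remove()
    h_post.remove()

print(f"avg backbone forward: {sum(times_ms) / len(times_ms):.3f} ms")
```

Averaging per forward call rather than per image matches how `benchmark_model` divides the total by `num_runs * num_inference_steps`, since the backbone runs once per denoising step.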

examples/diffusers/quantization/onnx_utils/export.py

Lines changed: 16 additions & 2 deletions
@@ -73,6 +73,13 @@
         "pooled_projections": {0: "batch_size"},
         "sample": {0: "batch_size", 1: "num_channels", 2: "height", 3: "width"},
     },
+    "sd3.5-medium": {
+        "hidden_states": {0: "batch_size", 1: "num_channels", 2: "height", 3: "width"},
+        "timestep": {0: "steps"},
+        "encoder_hidden_states": {0: "batch_size", 1: "sequence_length"},
+        "pooled_projections": {0: "batch_size"},
+        "out_hidden_states": {0: "batch_size", 1: "num_channels", 2: "height", 3: "width"},
+    },
     "flux-dev": {
         "hidden_states": {0: "batch_size", 1: "latent_dim"},
         "encoder_hidden_states": {0: "batch_size"},
@@ -290,6 +297,8 @@ def update_dynamic_axes(model_id, dynamic_axes):
         dynamic_axes["out.0"] = dynamic_axes.pop("latent")
     elif model_id == "sd3-medium":
         dynamic_axes["out.0"] = dynamic_axes.pop("sample")
+    elif model_id == "sd3.5-medium":
+        dynamic_axes["out.0"] = dynamic_axes.pop("out_hidden_states")
 
 
 def _create_dynamic_shapes(dynamic_shapes):
@@ -313,7 +322,7 @@ def generate_dummy_inputs_and_dynamic_axes_and_shapes(model_id, backbone):
         dummy_input, dynamic_shapes = _gen_dummy_inp_and_dyn_shapes_sdxl(
             backbone, min_bs=2, opt_bs=16
         )
-    elif model_id == "sd3-medium":
+    elif model_id in ["sd3-medium", "sd3.5-medium"]:
         dummy_input, dynamic_shapes = _gen_dummy_inp_and_dyn_shapes_sd3(
             backbone, min_bs=2, opt_bs=16
         )
@@ -343,14 +352,16 @@ def get_io_shapes(model_id, onnx_load_path, dynamic_shapes):
         output_name = "latent"
     elif model_id in ["sd3-medium"]:
         output_name = "sample"
+    elif model_id in ["sd3.5-medium"]:
+        output_name = "out_hidden_states"
     elif model_id in ["flux-dev", "flux-schnell"]:
         output_name = "output"
     else:
         raise NotImplementedError(f"Unsupported model_id: {model_id}")
 
     if model_id in ["sdxl-1.0", "sdxl-turbo"]:
         io_shapes = {output_name: dynamic_shapes["dynamic_shapes"]["minShapes"]["sample"]}
-    elif model_id in ["sd3-medium"]:
+    elif model_id in ["sd3-medium", "sd3.5-medium"]:
         io_shapes = {output_name: dynamic_shapes["dynamic_shapes"]["minShapes"]["hidden_states"]}
     elif model_id in ["flux-dev", "flux-schnell"]:
         io_shapes = {}
@@ -406,6 +417,9 @@ def modelopt_export_sd(backbone, onnx_dir, model_name, precision):
     elif model_name == "sd3-medium":
         input_names = ["hidden_states", "encoder_hidden_states", "pooled_projections", "timestep"]
         output_names = ["sample"]
+    elif model_name == "sd3.5-medium":
+        input_names = ["hidden_states", "encoder_hidden_states", "pooled_projections", "timestep"]
+        output_names = ["out_hidden_states"]
     elif model_name in ["flux-dev", "flux-schnell"]:
         input_names = [
             "hidden_states",

examples/diffusers/quantization/quantize.py

Lines changed: 13 additions & 4 deletions
@@ -16,6 +16,7 @@
 import argparse
 import logging
 import sys
+import time as time
 from collections.abc import Callable
 from dataclasses import dataclass
 from enum import Enum
@@ -59,6 +60,7 @@ class ModelType(str, Enum):
     SDXL_BASE = "sdxl-1.0"
     SDXL_TURBO = "sdxl-turbo"
     SD3_MEDIUM = "sd3-medium"
+    SD35_MEDIUM = "sd3.5-medium"
     FLUX_DEV = "flux-dev"
     FLUX_SCHNELL = "flux-schnell"
     LTX_VIDEO_DEV = "ltx-video-dev"
@@ -114,6 +116,7 @@ def get_model_filter_func(model_type: ModelType) -> Callable[[str], bool]:
         ModelType.SDXL_BASE: filter_func_default,
         ModelType.SDXL_TURBO: filter_func_default,
         ModelType.SD3_MEDIUM: filter_func_default,
+        ModelType.SD35_MEDIUM: filter_func_default,
         ModelType.LTX_VIDEO_DEV: filter_func_ltx_video,
     }
 
@@ -125,6 +128,7 @@ def get_model_filter_func(model_type: ModelType) -> Callable[[str], bool]:
     ModelType.SDXL_BASE: "stabilityai/stable-diffusion-xl-base-1.0",
     ModelType.SDXL_TURBO: "stabilityai/sdxl-turbo",
     ModelType.SD3_MEDIUM: "stabilityai/stable-diffusion-3-medium-diffusers",
+    ModelType.SD35_MEDIUM: "stabilityai/stable-diffusion-3.5-medium",
     ModelType.FLUX_DEV: "black-forest-labs/FLUX.1-dev",
     ModelType.FLUX_SCHNELL: "black-forest-labs/FLUX.1-schnell",
     ModelType.LTX_VIDEO_DEV: "Lightricks/LTX-Video-0.9.7-dev",
@@ -230,6 +234,7 @@ def uses_transformer(self) -> bool:
         """Check if model uses transformer backbone (vs UNet)."""
         return self.model_type in [
             ModelType.SD3_MEDIUM,
+            ModelType.SD35_MEDIUM,
             ModelType.FLUX_DEV,
             ModelType.FLUX_SCHNELL,
             ModelType.LTX_VIDEO_DEV,
@@ -326,7 +331,7 @@ def create_pipeline_from(
         model_id = (
             MODEL_REGISTRY[model_type] if override_model_path is None else override_model_path
         )
-        if model_type == ModelType.SD3_MEDIUM:
+        if model_type in [ModelType.SD3_MEDIUM, ModelType.SD35_MEDIUM]:
             pipe = StableDiffusion3Pipeline.from_pretrained(model_id, torch_dtype=torch_dtype)
         elif model_type in [ModelType.FLUX_DEV, ModelType.FLUX_SCHNELL]:
             pipe = FluxPipeline.from_pretrained(model_id, torch_dtype=torch_dtype)
@@ -357,7 +362,7 @@ def create_pipeline(self) -> DiffusionPipeline:
         self.logger.info(f"Data type: {self.config.model_dtype.value}")
 
         try:
-            if self.config.model_type == ModelType.SD3_MEDIUM:
+            if self.config.model_type in [ModelType.SD3_MEDIUM, ModelType.SD35_MEDIUM]:
                 self.pipe = StableDiffusion3Pipeline.from_pretrained(
                     self.config.model_path, torch_dtype=self.config.torch_dtype
                 )
@@ -864,6 +869,8 @@ def main() -> None:
     parser = create_argument_parser()
     args = parser.parse_args()
 
+    s = time.time()
+
     logger = setup_logging(args.verbose)
     logger.info("Starting Enhanced Diffusion Model Quantization")
 
@@ -939,9 +946,11 @@ def forward_loop(mod):
             backbone,
             model_config.model_type,
             quant_config.format,
-            quantize_mha=QuantizationConfig.quantize_mha,
+            quantize_mha=quant_config.quantize_mha,
+        )
+        logger.info(
+            f"Quantization process completed successfully! Time taken = {time.time() - s} seconds"
         )
-        logger.info("Quantization process completed successfully!")
 
     except Exception as e:
         logger.error(f"Quantization failed: {e}", exc_info=True)

examples/llm_ptq/README.md

Lines changed: 32 additions & 0 deletions
@@ -235,6 +235,38 @@ with init_quantized_weights(mtq.NVFP4_DEFAULT_CFG):
     mtq.calibrate(model, algorithm="max", forward_loop=calibrate_loop)
 ```
 
+## Multi-Node Post-Training Quantization with FSDP2
+
+ModelOpt enables quantization of LLMs across multiple GPU nodes using various quantization formats. It leverages HuggingFace's Accelerate library and FSDP2 for distributed model sharding and calibration.
+
+### Usage
+
+For distributed execution across multiple nodes, use the `accelerate` library. A template configuration file (`fsdp2.yaml`) is provided and can be customized for user specific requirements.
+
+On each node run the following command:
+
+```bash
+accelerate launch --config_file fsdp2.yaml \
+    --num_machines=<num_nodes> \
+    --machine_rank=<current_node_rank> \
+    --main_process_ip=<node0_ip_addr> \
+    --main_process_port=<port> \
+    --fsdp_transformer_layer_cls_to_wrap=<decoder_layer_name>
+    multinode_ptq.py \
+    --pyt_ckpt_path <path_to_model> \
+    --qformat <fp8/nvfp4/nvfp4_awq/int8> \
+    --kv_cache_qformat <fp8/nvfp4/nvfp4_affine/none> \
+    --batch_size <calib_batch_size> \
+    --calib_size <num_calib_samples> \
+    --dataset <dataset> \
+    --export_path <export_path> \
+    --trust_remote_code
+```
+
+The exported checkpoint can be deployed using TensorRT-LLM/ vLLM/ SGLang. For more details refer to the [deployment section](#deployment) of this document.
+
+> *Performance Note: FSDP2 is designed for training workloads and may result in longer calibration and export times. For faster calibration, maximize the batch size based on available GPU memory and choose the right number of GPUs to avoid unnecessary communication.*
+>
 ## Framework Scripts
 
 ### Hugging Face Example [Script](./scripts/huggingface_example.sh)
