docs/source/en/quantization/modelopt.md (1 addition, 2 deletions)
@@ -9,7 +9,7 @@ Unless required by applicable law or agreed to in writing, software distributed
 an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 specific language governing permissions and limitations under the License. -->
 
-# Nvidia ModelOpt
+# NVIDIA ModelOpt
 
 [nvidia_modelopt](https://github.com/NVIDIA/TensorRT-Model-Optimizer) is a unified library of state-of-the-art model optimization techniques like quantization, pruning, distillation, speculative decoding, etc. It compresses deep learning models for downstream deployment frameworks like TensorRT-LLM or TensorRT to optimize inference speed.
 
@@ -19,7 +19,6 @@ Before you begin, make sure you have nvidia_modelopt installed.
 pip install -U "nvidia_modelopt[hf]"
 ```
 
-
 Quantize a model by passing [`NVIDIAModelOptConfig`] to [`~ModelMixin.from_pretrained`] (you can also load pre-quantized models). This works for any model in any modality, as long as it supports loading with [Accelerate](https://hf.co/docs/accelerate/index) and contains `torch.nn.Linear` layers.
 
 The example below only quantizes the weights to FP8.
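
The FP8 example referenced on the last line is not part of this diff. As a minimal sketch of what such a weight-only FP8 load could look like with `NVIDIAModelOptConfig` and `from_pretrained`, assuming a Flux transformer checkpoint and a `quant_type="FP8"` setting (the checkpoint, model class, and `quant_type` value are illustrative assumptions, not taken from the diff):

```python
# Minimal sketch (illustrative, not part of the diff): load a model with FP8
# weight quantization via NVIDIAModelOptConfig. The checkpoint, model class,
# and quant_type value below are assumptions for demonstration purposes.
import torch
from diffusers import FluxTransformer2DModel, NVIDIAModelOptConfig

quant_config = NVIDIAModelOptConfig(quant_type="FP8")  # assumed FP8 weight-only setting

transformer = FluxTransformer2DModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    subfolder="transformer",
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16,
)
```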