Commit 33fdcf3

added support for nvfp4 export
Signed-off-by: Suguna Velury <[email protected]>
1 parent e18c323 commit 33fdcf3

4 files changed: +44 −8 lines changed

examples/llm_ptq/hf_ptq.py

Lines changed: 6 additions & 2 deletions
@@ -357,7 +357,9 @@ def main(args):
         )
         mts.export(model)

-    if args.auto_quantize_bits or args.qformat in QUANT_CFG_CHOICES:
+    if (
+        args.auto_quantize_bits or args.qformat in QUANT_CFG_CHOICES
+    ) and not model_is_already_quantized:
         if "awq" in args.qformat:
             print(
                 "\n####\nAWQ calibration could take longer than other calibration methods. "
@@ -386,6 +388,9 @@ def main(args):
         sample_input_single_batch = None

     run_auto_quant = args.auto_quantize_bits is not None
+    print("DEBUG LOG: Entereing here")
+    for k, v in model.state_dict().items():
+        print(k, v.shape, v.dtype, v.device)

     args.batch_size = get_max_batch_size(
         model,
@@ -628,7 +633,6 @@ def output_decode(generated_ids, input_shape):
             "They will be set at deployment time."
         )

-    print("DEBUG LOG: Calling unified export hf checkpoint")
     export_hf_checkpoint(
         full_model,
         export_dir=export_path,
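The guard added above lets hf_ptq.py skip re-calibration when the loaded checkpoint is already quantized and pass the model straight to export. A minimal, self-contained sketch of that control flow; the QUANT_CFG_CHOICES values and the run_calibration helper below are placeholders, not the script's real objects:

# Sketch only: illustrates the guard, not the script's actual calibration path.
QUANT_CFG_CHOICES = {"fp8", "nvfp4", "int4_awq"}  # illustrative keys

def run_calibration(model):
    # Placeholder for the real mtq.quantize(...) calibration pass.
    return model

def maybe_quantize(model, qformat, auto_quantize_bits, model_is_already_quantized):
    if (auto_quantize_bits or qformat in QUANT_CFG_CHOICES) and not model_is_already_quantized:
        model = run_calibration(model)
    # An already-quantized checkpoint falls through unchanged and is exported directly.
    return model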

modelopt/torch/export/quant_utils.py

Lines changed: 12 additions & 2 deletions
@@ -269,6 +269,15 @@ def get_weight_scaling_factor(module: nn.Module, weight_name: str = "weight") ->
         QUANTIZATION_NVFP4_AWQ,
         QUANTIZATION_W4A8_NVFP4_FP8,
     ]:
+        if hasattr(weight_quantizer, "_scale"):
+            # In this case, weight must be a QTensorWrapper
+            original_shape = weight.metadata["shape"]
+            ws = NVFP4QTensor.get_modelopt_weights_scaling_factor(
+                weight_quantizer._scale, original_shape
+            )
+            print(f"weight_quantizer._scale: {ws.shape}")
+            return ws
+
         return NVFP4QTensor.get_weights_scaling_factor(
             weight,
             weight_quantizer.block_sizes[-1],
@@ -608,8 +617,6 @@ def process_layer_quant_config(layer_config_dict):
         # Get the corresponding AWQ block size
         block_size_value = layer_config_dict.get(awq_key, 0)

-        # print(f"DEBUG LOG: Processing layer {k} with quantization {v}, block size {block_size_value}")
-
         if v == "fp8":
             layer_config = {"quant_algo": "FP8"}
         elif v == "fp8_pc_pt":
@@ -1082,6 +1089,9 @@ def get_quant_config(named_modules: nn.Module | dict[str, nn.Module]) -> dict[st
         block_size = get_weight_block_size(module)

         # Construct per layer config dictionary
+        if block_size == 0 and quantization_format != QUANTIZATION_FP8:
+            continue
+
         layer_config_dict[name + ".quantization"] = quantization_format
         layer_config_dict[name + ".awq_block_size"] = block_size

modelopt/torch/export/unified_export_hf.py

Lines changed: 5 additions & 4 deletions
@@ -538,10 +538,11 @@ def export_hf_checkpoint(
         model.base_model.save_pretrained(
             base_export_dir, state_dict=post_state_dict, save_modelopt_state=save_modelopt_state
         )
-
-        model.save_pretrained(
-            export_dir, state_dict=post_state_dict, save_modelopt_state=save_modelopt_state
-        )
+        model.save_pretrained(export_dir, save_modelopt_state=save_modelopt_state)
+    else:
+        model.save_pretrained(
+            export_dir, state_dict=post_state_dict, save_modelopt_state=save_modelopt_state
+        )

     original_config = f"{base_export_dir}/config.json"
     config_data = {}
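With this split, the wrapped case saves the base model with the processed quantized state dict and the wrapper with only its own state, while the plain case keeps the previous single save_pretrained call. A hedged usage sketch of the exporter; the import path and keyword argument follow the call sites visible in this commit, and the model is assumed to be a Hugging Face model already quantized with modelopt:

from modelopt.torch.export import export_hf_checkpoint

def export_quantized(model, export_path: str):
    # model: a Hugging Face model that has already been quantized/calibrated with modelopt.
    export_hf_checkpoint(model, export_dir=export_path)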

modelopt/torch/quantization/qtensor/nvfp4_tensor.py

Lines changed: 21 additions & 0 deletions
@@ -94,6 +94,27 @@ def get_weights_scaling_factor_2(cls, input: torch.Tensor):
         """Returns per tensor weight scaling factor."""
         return reduce_amax(input).float() / (6.0 * 448.0)

+    @classmethod
+    def get_modelopt_weights_scaling_factor(cls, weight_scaling_factor: torch.Tensor, weight_shape):
+        """Returns the modelopt weights scaling factor if the quantization is done by trtllm."""
+        if weight_scaling_factor.dtype == torch.float8_e4m3fn:
+            return weight_scaling_factor
+
+        if weight_scaling_factor.dtype == torch.uint8 and weight_scaling_factor.ndim == 1:
+            # If quantization is done by trtllm, convert cutlass fp4 scale to modelopt fp4 scale
+            try:
+                from tensorrt_llm._torch.auto_deploy.utils.quantization_utils import (
+                    cutlass_fp4_scale_to_modelopt_fp4_scale,
+                )
+
+                return cutlass_fp4_scale_to_modelopt_fp4_scale(
+                    weight_scaling_factor, weight_shape[-2:]
+                )
+            except ImportError as e:
+                raise ImportError(
+                    "This tensor is quantized by trtllm, but tensorrt_llm cannot be imported."
+                ) from e
+
     @classmethod
     def get_activation_scaling_factor(cls, quantizer):
         """Returns the activation scaling factor for export."""
