@@ -62,7 +62,6 @@
     get_weight_block_size,
     get_weight_scaling_factor,
     get_weight_scaling_factor_2,
-    maybe_transpose_expert_weight_dimensions,
     postprocess_state_dict,
     preprocess_linear_fusion,
     to_quantized_weight,
@@ -293,34 +292,54 @@ def _export_quantized_weight(
     weight_scale: torch.Tensor | None = getattr(sub_module, quantizer_attrs.weight_scale, None)
     weight_scale_2: torch.Tensor | None = getattr(sub_module, quantizer_attrs.weight_scale_2, None)
 
-    # Transpose weight for bmm-style expert quantization (llama4, gpt-oss)
+    # For NVFP4 quantization of expert weights, transpose to (num_experts, out_dim, in_dim)
+    # because ModelOpt assumes in_dim is the last dimension for scaling factor computation
     if quantization_format in [QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ]:
-        # Transpose weight from (num_experts, input_dim, output_dim) to (num_experts, output_dim, input_dim)
-        # for NVFP4 quantization functions that expect input_dim as the last dimension for block quantization
-        is_bmm_expert_weight = weight.dim() == 3 and any(
+        is_expert_weight = weight.dim() == 3 and any(
             expert_type in type(sub_module).__name__
             for expert_type in ["Llama4TextExperts", "GptOssExperts"]
         )
-        weight, _ = maybe_transpose_expert_weight_dimensions(
-            weight, is_bmm_expert_weight=is_bmm_expert_weight
-        )
-        weight_scale = NVFP4QTensor.get_weights_scaling_factor(
-            weight,
-            block_size=block_size,
-            weights_scaling_factor_2=weight_scale_2,
-        )[0]
-
-        quantized_weight = to_quantized_weight(
-            weight.to(dtype),
-            weight_scale,
-            quantization_format,
-            weight_scale_2,
-            block_size,
-        )
-
-        quantized_weight, weight_scale = maybe_transpose_expert_weight_dimensions(
-            quantized_weight, weight_scale, is_bmm_expert_weight=is_bmm_expert_weight
-        )
+
+        if is_expert_weight:
+            # Transpose from (num_experts, in_dim, out_dim) to (num_experts, out_dim, in_dim)
+            transposed_weight = weight.transpose(-2, -1).contiguous()
+
+            # Compute scaling factor from transposed weight
+            weight_scale = NVFP4QTensor.get_weights_scaling_factor(
+                transposed_weight,
+                block_size=block_size,
+                weights_scaling_factor_2=weight_scale_2,
+            )[0]
+
+            # Quantize using transposed weight and scaling factor
+            quantized_weight = to_quantized_weight(
+                transposed_weight.to(dtype),
+                weight_scale,
+                quantization_format,
+                weight_scale_2,
+                block_size,
+            )
+
+            # Transpose quantized weight back to original format (num_experts, in_dim, out_dim)
+            quantized_weight = quantized_weight.transpose(-2, -1).contiguous()
+
+            # Transpose scaling factor back to match original weight dimensions
+            weight_scale = weight_scale.transpose(-2, -1).contiguous()
+        else:
+            # Regular weight quantization (non-expert)
+            weight_scale = NVFP4QTensor.get_weights_scaling_factor(
+                weight,
+                block_size=block_size,
+                weights_scaling_factor_2=weight_scale_2,
+            )[0]
+
+            quantized_weight = to_quantized_weight(
+                weight.to(dtype),
+                weight_scale,
+                quantization_format,
+                weight_scale_2,
+                block_size,
+            )
     else:
         quantized_weight = to_quantized_weight(
             weight.to(dtype),