     KV_CACHE_NVFP4_AFFINE,
     QUANTIZATION_FP8,
     QUANTIZATION_FP8_PB_REAL,
+    QUANTIZATION_FP8_PC_PT,
     QUANTIZATION_NONE,
     QUANTIZATION_NVFP4,
     QUANTIZATION_NVFP4_AWQ,
@@ -323,13 +324,15 @@ def _export_quantized_weight(
     weight_scale_2: torch.Tensor | None = getattr(sub_module, quantizer_attrs.weight_scale_2, None)
 
     # Transpose weight for bmm-style expert quantization (llama4, gpt-oss)
+    # Check if this is a BMM-style expert weight that needs transposition
+    is_bmm_expert_weight = weight.dim() == 3 and any(
+        expert_type in type(sub_module).__name__
+        for expert_type in ["Llama4TextExperts", "GptOssExperts"]
+    )
+
     if quantization_format in [QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ]:
         # Transpose weight from (num_experts, input_dim, output_dim) to (num_experts, output_dim, input_dim)
         # for NVFP4 quantization functions that expect input_dim as the last dimension for block quantization
-        is_bmm_expert_weight = weight.dim() == 3 and any(
-            expert_type in type(sub_module).__name__
-            for expert_type in ["Llama4TextExperts", "GptOssExperts"]
-        )
         weight, _ = maybe_transpose_expert_weight_dimensions(
             weight, is_bmm_expert_weight=is_bmm_expert_weight
         )
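
For reference, a minimal sketch of what the transposition helper could look like. The helper name and its call pattern come from the diff above; the body, signature defaults, and scale-handling rule are assumptions, not the actual export-utility implementation:

```python
import torch


def maybe_transpose_expert_weight_dimensions(
    weight: torch.Tensor,
    weight_scale: torch.Tensor | None = None,
    is_bmm_expert_weight: bool = False,
) -> tuple[torch.Tensor, torch.Tensor | None]:
    """Swap the last two dims of BMM-style expert tensors (sketch).

    Maps (num_experts, input_dim, output_dim) <-> (num_experts, output_dim, input_dim).
    When is_bmm_expert_weight is False, tensors are returned unchanged.
    """
    if not is_bmm_expert_weight:
        return weight, weight_scale

    weight = weight.transpose(-2, -1).contiguous()
    # Assumption: only 3D (block) scales carry both transposed dims; per-channel
    # scales such as (num_experts, output_dim) are passed through untouched.
    if weight_scale is not None and weight_scale.dim() == 3:
        weight_scale = weight_scale.transpose(-2, -1).contiguous()
    return weight, weight_scale
```

Computing `is_bmm_expert_weight` once before the format branch, as the diff now does, lets both the NVFP4 and FP8_PC_PT paths reuse the same flag.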
@@ -350,6 +353,26 @@ def _export_quantized_weight(
         quantized_weight, weight_scale = maybe_transpose_expert_weight_dimensions(
             quantized_weight, weight_scale, is_bmm_expert_weight=is_bmm_expert_weight
         )
+    elif quantization_format == QUANTIZATION_FP8_PC_PT and is_bmm_expert_weight:
+        # For FP8_PC_PT with BMM-style experts, transpose only the weight (not weight_scale)
+        # Transpose weight from (num_experts, input_dim, output_dim) to (num_experts, output_dim, input_dim)
+        # weight_scale remains (num_experts, output_dim) for per-channel quantization
+        weight, _ = maybe_transpose_expert_weight_dimensions(
+            weight, is_bmm_expert_weight=is_bmm_expert_weight
+        )
+
+        quantized_weight = to_quantized_weight(
+            weight.to(dtype),
+            weight_scale,
+            quantization_format,
+            weight_scale_2,
+            block_size,
+        )
+
+        # Transpose back to original BMM format
+        quantized_weight, _ = maybe_transpose_expert_weight_dimensions(
+            quantized_weight, is_bmm_expert_weight=is_bmm_expert_weight
+        )
     else:
         quantized_weight = to_quantized_weight(
             weight.to(dtype),
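
As a shape sanity check for the new FP8_PC_PT branch (illustrative dimensions only; the per-channel scaling and float8 cast below are a stand-in for `to_quantized_weight`, not its real behavior):

```python
import torch

num_experts, input_dim, output_dim = 8, 4096, 14336  # made-up sizes

# BMM-style expert weight layout used by Llama4TextExperts / GptOssExperts.
weight = torch.randn(num_experts, input_dim, output_dim)
# Per-channel FP8 scale: one scale per expert per output channel.
weight_scale = torch.rand(num_experts, output_dim) + 0.5

# Transpose only the weight so the quantizer sees input_dim as the last dim;
# the per-channel scale already indexes the output_dim axis and stays as-is.
w_t = weight.transpose(-2, -1)
assert w_t.shape == (num_experts, output_dim, input_dim)
assert weight_scale.shape == (num_experts, output_dim)

# Stand-in for to_quantized_weight: scale per output channel, cast to FP8.
q_t = (w_t / weight_scale.unsqueeze(-1)).to(torch.float8_e4m3fn)

# Transpose back to the original BMM layout before writing the checkpoint.
quantized_weight = q_t.transpose(-2, -1)
assert quantized_weight.shape == (num_experts, input_dim, output_dim)
```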