
Commit 6a6124d

Authored by sychen52 (Shiyang Chen)
[OMNIML-2336][feat] w4a8 nvfp4 fp8 exports scale factor properly (#8180)
Signed-off-by: Shiyang Chen <[email protected]>
Co-authored-by: Shiyang Chen <[email protected]>
1 parent f4e7738 commit 6a6124d

File tree

3 files changed: +2 -23 lines changed


tensorrt_llm/_torch/modules/fused_moe/quantization.py

Lines changed: 0 additions & 17 deletions
@@ -2039,23 +2039,6 @@ def load_expert_w2_weight_scale_nvfp4(self, module: torch.nn.Module,
         return super().load_expert_w2_weight_scale_nvfp4(
             module, w2_weight_scale, dst_w2_weight_scale, 32)
 
-    def load_all_fp4_weight_scales_and_alphas(
-            self, module: torch.nn.Module, weights: Dict,
-            load_expert_ids: List[int], dst_w3_w1_weight_scale: torch.Tensor,
-            dst_w2_weight_scale: torch.Tensor, dst_fc31_alpha: torch.Tensor,
-            dst_fc2_alpha: torch.Tensor):
-        super().load_all_fp4_weight_scales_and_alphas(
-            module, weights, load_expert_ids, dst_w3_w1_weight_scale,
-            dst_w2_weight_scale, dst_fc31_alpha, dst_fc2_alpha)
-        # The kernel we use will convert nvfp4 to e4m3 before matmul,
-        # so the range of the scale factor can only be [0,448/6].
-        dst_w3_w1_weight_scale.copy_((dst_w3_w1_weight_scale.to(torch.float32) /
-                                      6.0).to(torch.float8_e4m3fn))
-        dst_w2_weight_scale.copy_((dst_w2_weight_scale.to(torch.float32) /
-                                   6.0).to(torch.float8_e4m3fn))
-        dst_fc31_alpha.copy_(dst_fc31_alpha * 6.0)
-        dst_fc2_alpha.copy_(dst_fc2_alpha * 6.0)
-
 
 def _get_weight_alignment(weight_alignment, scaling_vector_size, tp_size,
                           shard_dim_size):
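
The factor-of-6 juggling deleted above existed because the w4a8 kernel converts nvfp4 weights to e4m3 before the matmul: fp4 magnitudes reach 6 and e4m3 saturates at 448, so a block scale can be at most 448/6 if the scaled values are to stay representable. The override therefore divided every block scale by 6 at load time and folded the 6 back into the global alpha, leaving the overall dequantized product unchanged. With checkpoints now exporting scale factors in the proper range (the point of this commit), the load-time rebalancing is redundant. A minimal sketch of the invariant it relied on, with illustrative values only, not code from this commit:

import torch

# Hypothetical per-block scales and global alpha; values chosen so the
# divided scales (32.0, 64.0) are exactly representable in e4m3.
block_scale = torch.tensor([192.0, 384.0])
alpha = torch.tensor(0.5)

before = block_scale * alpha  # effective dequantization factor

# The rebalancing the deleted override applied: /6 on the block scale
# (re-stored as e4m3), *6 on the global alpha.
rebalanced_scale = (block_scale / 6.0).to(torch.float8_e4m3fn)
rebalanced_alpha = alpha * 6.0
after = rebalanced_scale.to(torch.float32) * rebalanced_alpha

assert torch.equal(before, after)  # the product is preserved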

tensorrt_llm/_torch/modules/linear.py

Lines changed: 2 additions & 5 deletions
@@ -1027,15 +1027,12 @@ def load_weight_scales(
                 tp_mode,
                 device=device).contiguous()
             assert ws.dtype == torch.float8_e4m3fn
-            # The kernel we use will convert nvfp4 to e4m3 before matmul,
-            # so the range of the scale factor can only be [0,448/6].
-            ws = (ws.to(torch.float32) / 6.0).to(torch.float8_e4m3fn)
             weight_scale.append(ws.view(dtype=fp4_utils.float4_sf_dtype))
         if "weight_scale_2" in w:
             if weight_scale_2 is None:
-                weight_scale_2 = w["weight_scale_2"][...] * 6.0
+                weight_scale_2 = w["weight_scale_2"][...]
             else:
-                assert weight_scale_2 == w["weight_scale_2"][...] * 6.0, (
+                assert weight_scale_2 == w["weight_scale_2"][...], (
                     f"The weight_scale_2 should be same for all the weights: {weight_scale_2} vs. {w['weight_scale_2']}*6"
                 )
 

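On the linear path, load_weight_scales now consumes both the per-block scales and weight_scale_2 exactly as exported, with no factor-of-6 adjustment. For orientation, a minimal sketch of the two-level NVFP4 dequantization these scales feed, assuming the usual 16-element blocks with e4m3 block scales; the function and argument names are hypothetical, not this module's API:

import torch

def dequantize_nvfp4(codes: torch.Tensor, block_scale: torch.Tensor,
                     weight_scale_2: torch.Tensor) -> torch.Tensor:
    # codes: fp4 values already decoded to float, each in [-6, 6]
    # block_scale: e4m3 block scales, assumed already expanded to codes' shape
    # weight_scale_2: single global scale, used as stored in the checkpoint
    return codes * block_scale.to(torch.float32) * weight_scale_2
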
tests/unittest/_torch/thop/parallel/test_moe.py

Lines changed: 0 additions & 1 deletion
@@ -991,7 +991,6 @@ class TestMoeFp4:
     the default tactic selection works. This reduces unnecessary test runs for CI
     """
 
-    @pytest.mark.skip(reason="https://nvbugs/5550249")
    @pytest.mark.parametrize("num_tokens", [1, 1024])
     @pytest.mark.parametrize("hidden_size", [1024])
     @pytest.mark.parametrize("intermediate_size", [1024, 768, 384, 192])
