Skip to content

Commit 15618ee

Browse files
committed
address review comments
Signed-off-by: Neta Zmora <96238833+nzmora-nvidia@users.noreply.github.com>
1 parent 2a8b6c2 commit 15618ee

File tree

3 files changed

+41
-47
lines changed

tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/trtllm_moe.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -306,6 +306,7 @@ def trtllm_quant_nvfp4_moe_fused(
306306
)
307307
hidden_size_needs_padding = hidden_size % TRTLLM_NVFP4_COLUMN_SIZE != 0
308308
if inter_size_needs_padding or hidden_size_needs_padding:
309+
assert False, "See https://github.com/NVIDIA/TensorRT-LLM/issues/10331"
309310
# fc1_expert_weights_fp4: [E, I, H] or [E, 2*I, H]
310311
fc1_padded = fc1_expert_weights_fp4.new_zeros(
311312
fc1_expert_weights_fp4.size(0),
@@ -319,6 +320,11 @@ def trtllm_quant_nvfp4_moe_fused(
319320
fc2_padded = fc2_expert_weights_fp4.new_zeros(
320321
n_experts, hidden_size_padded, inter_size_padded // FP4_PER_UINT8
321322
)
323+
324+
assert inter_size % NVFP4_BLOCK_SIZE == 0, (
325+
f"inter_size {inter_size} must be divisible by {NVFP4_BLOCK_SIZE}"
326+
)
327+
322328
fc2_padded[:, :, : inter_size // FP4_PER_UINT8] = fc2_expert_weights_fp4
323329
fc2_expert_weights_fp4 = fc2_padded
324330

@@ -334,17 +340,20 @@ def trtllm_quant_nvfp4_moe_fused(
334340
# https://github.com/NVIDIA/TensorRT-LLM/blob/c9771ebb997683c08b26bbba796a7fc6aff09d93/cpp/tensorrt_llm/thop/moeOp.cpp#L1015
335341
quant_scales = [
336342
fc1_act_global_scale, # torch.float32; [E] or scalar
337-
fc1_weight_blockscale_fp8.view(torch.int32),
343+
fc1_weight_blockscale_fp8.view(
344+
torch.int32
345+
), # 4 FP8 as packed int32; [E, I*2, H / 16 / 4] or [E, I, H / 16 / 4]
338346
fc1_alpha, # torch.float32; [E]
339347
fc2_act_global_scale, # torch.float32; [E] or scalar
340-
fc2_weight_blockscale_fp8.view(torch.int32),
348+
fc2_weight_blockscale_fp8.view(torch.int32), # 4 FP8 as packed int32; [E, H, I / 16 / 4]
341349
fc2_alpha, # torch.float32; [E]
342350
]
343351

344352
trtllm_output = torch.ops.trtllm.fused_moe(
345-
x_q_fp4,
346-
selected_experts.to(torch.int),
353+
x_q_fp4.view(torch.long),
354+
selected_experts.to(torch.int32),
347355
routing_weights.to(torch.float32),
356+
# Groups of 16 FP4 weight elements are packed as a single int64 element (see isNvfp4Quant in moeOp.cpp)
348357
fc1_expert_weights=fc1_expert_weights_fp4.view(torch.long),
349358
fc1_expert_biases=None,
350359
fc2_expert_weights=fc2_expert_weights_fp4.view(torch.long),

tensorrt_llm/_torch/auto_deploy/transform/library/fused_moe.py

Lines changed: 25 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1613,10 +1613,13 @@ def _extract_op_args(node):
16131613
"is_gated_mlp",
16141614
)
16151615

1616-
def _stack(param_list, dim=0):
1617-
return torch.stack(
1618-
[get_param_or_buffer(element.target) for element in param_list], dim=dim
1619-
).contiguous()
1616+
def _stack(param_list, dim=0, device=None, dtype=None):
1617+
if param_list:
1618+
return torch.stack(
1619+
[get_param_or_buffer(element.target) for element in param_list], dim=dim
1620+
).contiguous()
1621+
else:
1622+
return torch.empty(0, device=device, dtype=dtype)
16201623

16211624
def _prepare_args_cutlass_format_nvfp4():
16221625
if is_gated_mlp:
@@ -1627,9 +1630,15 @@ def _prepare_args_cutlass_format_nvfp4():
16271630
fc1_act_scale = torch.cat(
16281631
[w3_input_scale_stacked, w1_input_scale_stacked], dim=1
16291632
).contiguous()
1633+
fc1_alpha_stacked = torch.cat([w3_alpha_stacked, w1_alpha_stacked], dim=1).contiguous()
1634+
fc1_weight_blockscale_fp8_stacked = torch.cat(
1635+
[w3_weight_blockscale_fp8_stacked, w1_weight_blockscale_fp8_stacked], dim=1
1636+
).contiguous()
16301637
else:
16311638
fc1_expert_weights = w1_stacked
16321639
fc1_act_scale = w1_input_scale_stacked
1640+
fc1_alpha_stacked = w1_alpha_stacked
1641+
fc1_weight_blockscale_fp8_stacked = w1_weight_blockscale_fp8_stacked
16331642

16341643
fc2_expert_weights = w2_stacked
16351644
fc2_act_scale = w2_input_scale_stacked
@@ -1651,11 +1660,13 @@ def _prepare_args_cutlass_format_nvfp4():
16511660
weight_dtype = torch.float8_e4m3fn
16521661
_register_parameter(gm, new_key_fc1_expert_weights, fc1_expert_weights.to(weight_dtype))
16531662
_register_parameter(gm, new_key_fc2_expert_weights, fc2_expert_weights.to(weight_dtype))
1654-
_register_parameter(gm, new_key_fc1_weight_blockscale_fp8, w1_weight_blockscale_fp8_stacked)
1663+
_register_parameter(
1664+
gm, new_key_fc1_weight_blockscale_fp8, fc1_weight_blockscale_fp8_stacked
1665+
)
16551666
_register_parameter(gm, new_key_fc2_weight_blockscale_fp8, w2_weight_blockscale_fp8_stacked)
16561667
_register_parameter(gm, new_key_fc1_act_scale, fc1_act_scale)
16571668
_register_parameter(gm, new_key_fc2_act_scale, fc2_act_scale)
1658-
_register_parameter(gm, new_key_fc1_alpha, w1_alpha_stacked)
1669+
_register_parameter(gm, new_key_fc1_alpha, fc1_alpha_stacked)
16591670
_register_parameter(gm, new_key_fc2_alpha, w2_alpha_stacked)
16601671

16611672
with graph.inserting_before(node):
@@ -1705,50 +1716,23 @@ def _prepare_args_cutlass_format_nvfp4():
17051716
# Stack the actual tensor values (fast, like in quantize_moe.py)
17061717
w1_stacked = _stack(w1_list, dim=0)
17071718
w2_stacked = _stack(w2_list, dim=0)
1708-
w3_stacked = (
1709-
_stack(w3_list, dim=0)
1710-
if w3_list
1711-
else torch.empty(0, device=w1_stacked.device, dtype=w1_stacked.dtype)
1712-
)
1719+
device, dtype = (w1_stacked.device, w1_stacked.dtype)
1720+
w3_stacked = _stack(w3_list, dim=0, device=device, dtype=dtype)
17131721

17141722
# Scales are buffers, not parameters
17151723
w1_input_scale_stacked = _stack(w1_input_scale, dim=0)
17161724
w2_input_scale_stacked = _stack(w2_input_scale, dim=0)
1717-
w3_input_scale_stacked = (
1718-
_stack(w3_input_scale, dim=0)
1719-
if w3_input_scale
1720-
else torch.empty(
1721-
0, device=w1_input_scale_stacked.device, dtype=w1_input_scale_stacked.dtype
1722-
)
1723-
)
1724-
# assert torch.all(w1_input_scale_stacked[0] == w1_input_scale_stacked), (
1725-
# "All w1 scales should have the same value."
1726-
# )
1727-
# assert torch.all(w2_input_scale_stacked[0] == w2_input_scale_stacked), (
1728-
# "All w2 scales should have the same value."
1729-
# )
1725+
w3_input_scale_stacked = _stack(w3_input_scale, dim=0, device=device, dtype=dtype)
17301726

17311727
w1_weight_blockscale_fp8_stacked = _stack(w1_weight_scale, dim=0).to(torch.float8_e4m3fn)
17321728
w2_weight_blockscale_fp8_stacked = _stack(w2_weight_scale, dim=0).to(torch.float8_e4m3fn)
1733-
# w3_weight_blockscale_fp8_stacked = (
1734-
# (
1735-
# _stack(w3_weight_scale, dim=0)
1736-
# if w3_weight_scale
1737-
# else torch.empty(
1738-
# 0,
1739-
# device=w1_weight_blockscale_fp8_stacked.device,
1740-
# dtype=w1_weight_blockscale_fp8_stacked.dtype,
1741-
# )
1742-
# )
1743-
# .to(torch.float8_e4m3fn)
1744-
# .contiguous()
1745-
# )
1746-
1747-
###
1729+
w3_weight_blockscale_fp8_stacked = _stack(
1730+
w3_weight_scale, dim=0, device=device, dtype=dtype
1731+
).to(torch.float8_e4m3fn)
1732+
17481733
w1_alpha_stacked = _stack(w1_alpha, dim=0)
17491734
w2_alpha_stacked = _stack(w2_alpha, dim=0)
1750-
# w3_alpha_stacked = _stack(w3_alpha, dim=0)
1751-
###
1735+
w3_alpha_stacked = _stack(w3_alpha, dim=0, device=device, dtype=dtype)
17521736

17531737
args = _prepare_args_cutlass_format_nvfp4()
17541738

@@ -1770,7 +1754,6 @@ def _prepare_args_cutlass_format_nvfp4():
17701754
# will remove the parameters/buffers that are no longer referenced
17711755
gm.graph.eliminate_dead_code()
17721756
gm.delete_all_unused_submodules()
1773-
17741757
return fused_key_counter
17751758

17761759

tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_trtllm_moe.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -584,7 +584,9 @@ def test_trtllm_fused_moe_nvfp4(
584584
):
585585
# Skip known failing configuration
586586
if activation_func == ActivationType.Relu2 and intermediate_size == 1856:
587-
pytest.skip("test fails for Relu2 with intermediate_size=1856")
587+
pytest.skip(
588+
"test fails for Relu2 with intermediate_size=1856; see https://github.com/NVIDIA/TensorRT-LLM/issues/10331"
589+
)
588590

589591
# In the code below:
590592
# sf := block scale factors for NVFP4

0 commit comments

Comments (0)