
Commit 8acda3c

Fix bugs
Signed-off-by: Dongfeng Yu <[email protected]>
1 parent a4d029f commit 8acda3c

File tree: 3 files changed, +81 / -9 lines changed

cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp

Lines changed: 73 additions & 0 deletions
@@ -21,6 +21,7 @@
 #include "tensorrt_llm/thop/thUtils.h"
 #include <ATen/cuda/EmptyTensor.h>
 #include <ATen/ops/index_select.h>
+#include <iostream>

 namespace torch_ext
 {
@@ -44,6 +45,78 @@ std::vector<torch::Tensor> run_fp4_block_scale_moe_runner(torch::optional<torch:
     bool const do_finalize, btg::Dtype const dtype, MoeRunnerType& moe_runner, int64_t const moeConfigIndex,
     torch::optional<torch::Tensor> const& topk_weights, torch::optional<torch::Tensor> const& topk_ids)
 {
+    std::cout << "Function: run_fp4_block_scale_moe_runner" << std::endl;
+
+    auto print_tensor = [](std::string name, torch::Tensor const& t)
+    {
+        std::cout << name << ": shape=[";
+        for (auto s : t.sizes())
+        {
+            std::cout << s << ",";
+        }
+        std::cout << "], dtype=" << t.scalar_type() << std::endl;
+    };
+
+    auto print_opt_tensor = [&](std::string name, auto const& t)
+    {
+        if (t.has_value())
+        {
+            print_tensor(name, t.value());
+        }
+        else
+        {
+            std::cout << name << ": None" << std::endl;
+        }
+    };
+
+    auto print_val = [](std::string name, auto const& v) { std::cout << name << ": " << v << std::endl; };
+
+    auto print_opt_val = [&](std::string name, auto const& v)
+    {
+        if (v.has_value())
+        {
+            std::cout << name << ": " << v.value() << std::endl;
+        }
+        else
+        {
+            std::cout << name << ": None" << std::endl;
+        }
+    };
+
+    print_opt_tensor("routing_logits", routing_logits);
+    print_opt_tensor("routing_bias", routing_bias);
+    print_tensor("hidden_states", hidden_states);
+    print_opt_tensor("hidden_states_scale", hidden_states_scale);
+    print_tensor("gemm1_weights", gemm1_weights);
+    print_tensor("gemm1_weights_scale", gemm1_weights_scale);
+    print_opt_tensor("gemm1_bias", gemm1_bias);
+    print_opt_tensor("gemm1_alpha", gemm1_alpha);
+    print_opt_tensor("gemm1_beta", gemm1_beta);
+    print_opt_tensor("gemm1_clamp_limit", gemm1_clamp_limit);
+    print_tensor("gemm2_weights", gemm2_weights);
+    print_tensor("gemm2_weights_scale", gemm2_weights_scale);
+    print_opt_tensor("gemm2_bias", gemm2_bias);
+    print_tensor("output1_scales_scalar", output1_scales_scalar);
+    print_tensor("output1_scales_gate_scalar", output1_scales_gate_scalar);
+    print_tensor("output2_scales_scalar", output2_scales_scalar);
+
+    print_val("num_experts", num_experts);
+    print_val("top_k", top_k);
+    print_opt_val("n_group", n_group);
+    print_opt_val("topk_group", topk_group);
+    print_val("intermediate_size", intermediate_size);
+    print_val("local_expert_offset", local_expert_offset);
+    print_val("local_num_experts", local_num_experts);
+    print_opt_val("routed_scaling_factor", routed_scaling_factor);
+    print_val("tile_tokens_dim", tile_tokens_dim);
+    print_val("routing_method_type", routing_method_type);
+    print_val("do_finalize", do_finalize);
+    print_val("dtype", static_cast<int>(dtype));
+    print_val("moeConfigIndex", moeConfigIndex);
+    print_opt_tensor("topk_weights", topk_weights);
+    print_opt_tensor("topk_ids", topk_ids);
+    std::cout << "--------------------------------" << std::endl;
     TORCH_CHECK(dtype == btg::Dtype::E4m3 || dtype == btg::Dtype::E2m1, "dtype can only be e4m3 or e2m1.");
     TORCH_CHECK(tensorrt_llm::common::isSM100Family(), "Only SM100f is supported by FP4 block scale MOE");
     TORCH_CHECK(tile_tokens_dim == 8 || tile_tokens_dim == 16 || tile_tokens_dim == 32 || tile_tokens_dim == 64

tensorrt_llm/_torch/modules/fused_moe/quantization.py

Lines changed: 3 additions & 4 deletions
@@ -201,12 +201,11 @@ def create_weights(

         # bias
         if module.bias:
+            # The shape might be padded so we use weight shape[:2]
             if w3_w1_bias_shape is None:
-                w3_w1_bias_shape = (module.expert_size_per_partition,
-                                    module.intermediate_size_per_partition * 2)
+                w3_w1_bias_shape = w3_w1_weight_shape[:2]
             if w2_bias_shape is None:
-                w2_bias_shape = (module.expert_size_per_partition,
-                                 module.hidden_size)
+                w2_bias_shape = w2_weight_shape[:2]
             bias_dtype = bias_dtype or module.dtype
             w3_w1_bias = nn.Parameter(torch.empty(w3_w1_bias_shape,
                                                   dtype=bias_dtype),
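
For context on the quantization.py change (not part of the commit): the bias shapes now come from the first two dimensions of the weight shapes, so any padding applied to the weights upstream is reflected in the biases automatically instead of the shapes being rebuilt from module attributes. A minimal standalone sketch follows, with purely illustrative dimensions that are assumptions for this sketch only:

import torch

# Illustrative per-partition sizes; not taken from the repository.
expert_size_per_partition = 8
intermediate_size_per_partition = 1536   # may already include padding
hidden_size = 4096

# Weight shapes as create_weights would see them; the trailing dimension is
# the packed/quantized input dim and does not matter for the bias.
w3_w1_weight_shape = (expert_size_per_partition,
                      intermediate_size_per_partition * 2,
                      hidden_size // 2)
w2_weight_shape = (expert_size_per_partition,
                   hidden_size,
                   intermediate_size_per_partition // 2)

# As in the commit, each bias shape simply follows its weight shape[:2].
w3_w1_bias = torch.empty(w3_w1_weight_shape[:2], dtype=torch.bfloat16)
w2_bias = torch.empty(w2_weight_shape[:2], dtype=torch.bfloat16)
print(w3_w1_bias.shape)  # torch.Size([8, 3072])
print(w2_bias.shape)     # torch.Size([8, 4096])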

tests/unittest/_torch/modules/test_fused_moe.py

Lines changed: 5 additions & 5 deletions
@@ -1381,11 +1381,11 @@ def test_fused_moe_nvfp4(dtype, moe_backend, hidden_size, intermediate_size):
     with torch.device(f"cuda:{mapping.rank}"):
         SCALING_VECTOR_SIZE = 16

-        SEQ_LEN = 4
+        SEQ_LEN = 1024
         HIDDEN_SIZE = hidden_size
         INTERMEDIATE_SIZE = intermediate_size
-        NUM_EXPERTS = 4
-        TOP_K = 2
+        NUM_EXPERTS = 32
+        TOP_K = 4
         routing_method = RenormalizeMoeRoutingMethod(top_k=TOP_K)
         torch.manual_seed(0)
         torch.cuda.manual_seed(0)
@@ -1487,7 +1487,7 @@ def test_fused_moe_nvfp4(dtype, moe_backend, hidden_size, intermediate_size):
         fused_moe.forward(x, router_logits)

         output = fused_moe.forward(x, router_logits)
-        torch.testing.assert_close(output, ref_output, rtol=0.1, atol=0.4)
+        torch.testing.assert_close(output, ref_output, rtol=0.1, atol=0.5)

         if not test_all_kernels:
             return
@@ -1503,7 +1503,7 @@ def test_fused_moe_nvfp4(dtype, moe_backend, hidden_size, intermediate_size):
         torch.testing.assert_close(output,
                                    ref_output,
                                    rtol=0.1,
-                                   atol=0.4)
+                                   atol=0.5)


 @skip_pre_blackwell
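
A quick note on the loosened tolerance (a sketch, not part of the commit): torch.testing.assert_close allows a per-element mismatch of up to atol + rtol * |expected|, so raising atol from 0.4 to 0.5 grants an extra 0.1 of absolute headroom, which matters most where the reference value is near zero. A small self-contained illustration with made-up numbers:

import torch

# Hypothetical values chosen to land between the old and new tolerances.
expected = torch.tensor([1.0, -2.0, 0.0])
actual = expected + 0.45          # 0.45 absolute error per element

# New tolerance: every element satisfies |diff| <= 0.5 + 0.1 * |expected|.
torch.testing.assert_close(actual, expected, rtol=0.1, atol=0.5)

# Old tolerance: the element with expected == 0.0 only gets atol headroom
# (0.4 < 0.45), so the check fails there.
try:
    torch.testing.assert_close(actual, expected, rtol=0.1, atol=0.4)
except AssertionError:
    print("old tolerance (atol=0.4) fails for the near-zero element")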
