Commit 71c0558

Remove debug prints
Signed-off-by: Dongfeng Yu <[email protected]>
1 parent 56d5824

File tree

3 files changed (+3, -32 lines)


cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp

Lines changed: 0 additions & 1 deletion
@@ -21,7 +21,6 @@
 #include "tensorrt_llm/thop/thUtils.h"
 #include <ATen/cuda/EmptyTensor.h>
 #include <ATen/ops/index_select.h>
-#include <iostream>
 
 namespace torch_ext
 {

tensorrt_llm/_torch/modules/fused_moe/quantization.py

Lines changed: 1 addition & 12 deletions
@@ -1954,6 +1954,7 @@ def post_load_weights(self, module: torch.nn.Module):
         module.quant_scales.fc1_weight_block.data.copy_(
             w3_w1_weight_scale_interleaved)
 
+
 def _fp4_quantize_pad_unpad(weight: torch.Tensor, alignment: Tuple[int, int]):
     assert weight.dim() == 2, "Only 2D tensor is supported."
     assert weight.device.type == 'cuda', "Only cuda tensor is supported."

@@ -2274,29 +2275,17 @@ def load_quant_scales(self, module: torch.nn.Module, weights: Dict):
         # matching the kernel's expectation (similar to test_moe.py logic).
         if module.w3_w1_bias is not None:
             # gemm1_bias * gemm1_scales_global * hidden_states_scale_global
-            print("Divided w3_w1_bias by fc31_alpha")
-            print("before:", module.w3_w1_bias.data)
             module.w3_w1_bias.data.div_((module.fc31_alpha.data).view(-1, 1))
-            print("after:", module.w3_w1_bias.data)
 
         if module.w2_bias is not None:
             # gemm2_bias * c_global_sf * gemm2_scales_global
-            print("Divided w2_bias by fc2_alpha")
-            print("before:", module.w2_bias.data)
             module.w2_bias.data.div_((module.fc2_alpha.data).view(-1, 1))
-            print("after:", module.w2_bias.data)
 
         if module.swiglu_beta is not None:
-            print("Dividing swiglu_beta by fc31_alpha")
-            print("before:", module.swiglu_beta.data)
             module.swiglu_beta.data.div_((module.fc31_alpha.data))
-            print("after:", module.swiglu_beta.data)
 
         if module.swiglu_limit is not None:
-            print("Dividing swiglu_limit by fc31_alpha")
-            print("before:", module.swiglu_limit.data)
             module.swiglu_limit.data.div_((module.fc31_alpha.data))
-            print("after:", module.swiglu_limit.data)
 
         if self.need_load_shared_weights(module):
             local_shared_load_expert_ids = module.layer_load_balancer.get_load_expert_ids(

tests/unittest/_torch/modules/test_fused_moe.py

Lines changed: 2 additions & 19 deletions
@@ -1344,20 +1344,6 @@ def test_fused_moe_fp8_blockwise_cute_dsl_multi_gpu(ep_size, routing_method,
     assert r is True
 
 
-def test_fp4_quantize_pad_unpad():
-    INTERMEDIATE_SIZE = 5760
-    HIDDEN_SIZE = 2880
-    weight = torch.randn(
-        (INTERMEDIATE_SIZE, HIDDEN_SIZE), dtype=torch.bfloat16,
-        device="cuda") * 0.05
-
-    from tensorrt_llm._torch.modules.fused_moe.quantization import \
-        _fp4_quantize_pad_unpad
-    weight_nvfp4, global_scale_factor, block_scale_factor = _fp4_quantize_pad_unpad(
-        weight, (1, 1))
-    # Current we don't check anything as this is just used for debugging purpose
-
-
 @skip_pre_blackwell
 @pytest.mark.parametrize("hidden_size, intermediate_size", [(512, 512),
                                                             (2880, 2880)])

@@ -1463,7 +1449,6 @@ def test_fused_moe_nvfp4(dtype, moe_backend, hidden_size, intermediate_size,
             weights[
                 f"{expert_id}.w3.weight_scale"] = w3_sf_block_unswizzled.view(
                     torch.float8_e4m3fn).cuda()
-
             weights[f"{expert_id}.w1.input_scale"] = 1.0 / w1_input_scale
             weights[f"{expert_id}.w2.input_scale"] = 1.0 / w2_input_scale
             weights[f"{expert_id}.w3.input_scale"] = 1.0 / w3_input_scale

@@ -1530,9 +1515,6 @@ def test_fused_moe_nvfp4(dtype, moe_backend, hidden_size, intermediate_size,
         fused_moe.forward(x, router_logits)
 
         output = fused_moe.forward(x, router_logits)
-        print()
-        print("actual", output)
-        print("ref", ref_output)
         check_accuracy(output, ref_output, rtol=0.1, atol=0.1, percent=0.95)
 
         if not test_all_kernels:

@@ -2685,7 +2667,8 @@ def custom_swiglu(x):
                 dtype=self.dtype,
                 config=model_config,
                 use_cute_dsl_blockscaling_mm=use_cute_dsl_blockscaling_mm,
-                activation=custom_swiglu,
+                activation=custom_swiglu
+                if swiglu_alpha is not None else F.silu,
             ) for _ in range(self.num_experts)
         ])
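
Aside (not part of the commit): the final hunk makes the test pass the custom SwiGLU callable only when swiglu_alpha is configured, falling back to plain SiLU otherwise. A minimal sketch of that selection pattern, using a hypothetical helper name:

import torch.nn.functional as F

def select_activation(swiglu_alpha, custom_swiglu):
    # Hypothetical helper: use the custom SwiGLU only when its alpha is set,
    # otherwise keep the default SiLU activation.
    return custom_swiglu if swiglu_alpha is not None else F.silu

# Example: with no alpha configured, the default SiLU is selected.
act = select_activation(None, lambda x: x)
assert act is F.silu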
