Commit 71c0558

Remove debug prints
Signed-off-by: Dongfeng Yu <[email protected]>
1 parent 56d5824

File tree

3 files changed (+3, -32 lines)


cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp

Lines changed: 0 additions & 1 deletion
@@ -21,7 +21,6 @@
 #include "tensorrt_llm/thop/thUtils.h"
 #include <ATen/cuda/EmptyTensor.h>
 #include <ATen/ops/index_select.h>
-#include <iostream>
 
 namespace torch_ext
 {

tensorrt_llm/_torch/modules/fused_moe/quantization.py

Lines changed: 1 addition & 12 deletions
@@ -1954,6 +1954,7 @@ def post_load_weights(self, module: torch.nn.Module):
         module.quant_scales.fc1_weight_block.data.copy_(
             w3_w1_weight_scale_interleaved)
 
+
 def _fp4_quantize_pad_unpad(weight: torch.Tensor, alignment: Tuple[int, int]):
     assert weight.dim() == 2, "Only 2D tensor is supported."
     assert weight.device.type == 'cuda', "Only cuda tensor is supported."

@@ -2274,29 +2275,17 @@ def load_quant_scales(self, module: torch.nn.Module, weights: Dict):
         # matching the kernel's expectation (similar to test_moe.py logic).
         if module.w3_w1_bias is not None:
             # gemm1_bias * gemm1_scales_global * hidden_states_scale_global
-            print("Divided w3_w1_bias by fc31_alpha")
-            print("before:", module.w3_w1_bias.data)
             module.w3_w1_bias.data.div_((module.fc31_alpha.data).view(-1, 1))
-            print("after:", module.w3_w1_bias.data)
 
         if module.w2_bias is not None:
             # gemm2_bias * c_global_sf * gemm2_scales_global
-            print("Divided w2_bias by fc2_alpha")
-            print("before:", module.w2_bias.data)
             module.w2_bias.data.div_((module.fc2_alpha.data).view(-1, 1))
-            print("after:", module.w2_bias.data)
 
         if module.swiglu_beta is not None:
-            print("Dividing swiglu_beta by fc31_alpha")
-            print("before:", module.swiglu_beta.data)
             module.swiglu_beta.data.div_((module.fc31_alpha.data))
-            print("after:", module.swiglu_beta.data)
 
         if module.swiglu_limit is not None:
-            print("Dividing swiglu_limit by fc31_alpha")
-            print("before:", module.swiglu_limit.data)
             module.swiglu_limit.data.div_((module.fc31_alpha.data))
-            print("after:", module.swiglu_limit.data)
 
         if self.need_load_shared_weights(module):
             local_shared_load_expert_ids = module.layer_load_balancer.get_load_expert_ids(

tests/unittest/_torch/modules/test_fused_moe.py

Lines changed: 2 additions & 19 deletions
@@ -1344,20 +1344,6 @@ def test_fused_moe_fp8_blockwise_cute_dsl_multi_gpu(ep_size, routing_method,
     assert r is True
 
 
-def test_fp4_quantize_pad_unpad():
-    INTERMEDIATE_SIZE = 5760
-    HIDDEN_SIZE = 2880
-    weight = torch.randn(
-        (INTERMEDIATE_SIZE, HIDDEN_SIZE), dtype=torch.bfloat16,
-        device="cuda") * 0.05
-
-    from tensorrt_llm._torch.modules.fused_moe.quantization import \
-        _fp4_quantize_pad_unpad
-    weight_nvfp4, global_scale_factor, block_scale_factor = _fp4_quantize_pad_unpad(
-        weight, (1, 1))
-    # Current we don't check anything as this is just used for debugging purpose
-
-
 @skip_pre_blackwell
 @pytest.mark.parametrize("hidden_size, intermediate_size", [(512, 512),
                                                             (2880, 2880)])

@@ -1463,7 +1449,6 @@ def test_fused_moe_nvfp4(dtype, moe_backend, hidden_size, intermediate_size,
             weights[
                 f"{expert_id}.w3.weight_scale"] = w3_sf_block_unswizzled.view(
                     torch.float8_e4m3fn).cuda()
-
             weights[f"{expert_id}.w1.input_scale"] = 1.0 / w1_input_scale
             weights[f"{expert_id}.w2.input_scale"] = 1.0 / w2_input_scale
             weights[f"{expert_id}.w3.input_scale"] = 1.0 / w3_input_scale

@@ -1530,9 +1515,6 @@ def test_fused_moe_nvfp4(dtype, moe_backend, hidden_size, intermediate_size,
         fused_moe.forward(x, router_logits)
 
         output = fused_moe.forward(x, router_logits)
-        print()
-        print("actual", output)
-        print("ref", ref_output)
         check_accuracy(output, ref_output, rtol=0.1, atol=0.1, percent=0.95)
 
         if not test_all_kernels:

@@ -2685,7 +2667,8 @@ def custom_swiglu(x):
                 dtype=self.dtype,
                 config=model_config,
                 use_cute_dsl_blockscaling_mm=use_cute_dsl_blockscaling_mm,
-                activation=custom_swiglu,
+                activation=custom_swiglu
+                if swiglu_alpha is not None else F.silu,
             ) for _ in range(self.num_experts)
         ])
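
Aside (not part of the commit): the final hunk makes the test pass the custom SwiGLU callable only when swiglu_alpha is configured, falling back to plain SiLU otherwise. A minimal sketch of that selection pattern, using a hypothetical helper name:

import torch.nn.functional as F

def select_activation(swiglu_alpha, custom_swiglu):
    # Hypothetical helper: use the custom SwiGLU only when its alpha is set,
    # otherwise keep the default SiLU activation.
    return custom_swiglu if swiglu_alpha is not None else F.silu

# Example: with no alpha configured, the default SiLU is selected.
act = select_activation(None, lambda x: x)
assert act is F.silu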
