From 2ae080ca389db3f84ed26092988ebb3bdaff940a Mon Sep 17 00:00:00 2001
From: leslie-fang25
Date: Wed, 24 Dec 2025 22:13:51 -0800
Subject: [PATCH] [https://nvbugs/5753788][chore] fix empty tensor cutlass moe

Signed-off-by: leslie-fang25
---
 cpp/tensorrt_llm/kernels/quantization.cu   | 10 ++++++--
 .../modules/fused_moe/fused_moe_cutlass.py | 24 ++++++++++++++++---
 2 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/cpp/tensorrt_llm/kernels/quantization.cu b/cpp/tensorrt_llm/kernels/quantization.cu
index 3941277dfa0..77a07b8a5c8 100644
--- a/cpp/tensorrt_llm/kernels/quantization.cu
+++ b/cpp/tensorrt_llm/kernels/quantization.cu
@@ -146,7 +146,10 @@ void invokeFP4Quantization(int b, int m, int n, T const* input, float const* SFS
     int const numBlocksPerSM = std::max(1u, 2048u / block.x);
     // The number of blocks for m. The m dimension will be padded to 128 for swizzled layout.
     int numBlocksForM = layout == QuantizationSFLayout::SWIZZLED ? PadUpFn(m, 128) : m;
-    dim3 grid(std::min(numBlocksForM, multiProcessorCount * numBlocksPerSM));
+    int gridSize = std::min(numBlocksForM, multiProcessorCount * numBlocksPerSM);
+    // Ensure gridSize is not zero.
+    gridSize = std::max(1, gridSize);
+    dim3 grid(gridSize);
 
     // Launch the cvt kernel.
     auto* kernel_instance = useUE8M0
@@ -165,7 +168,10 @@ void invokeFP4Quantization(int b, int m, int n, T const* input, float const* SFS
     int const numBlocksPerSM = std::max(1u, 2048u / block.x);
     // The number of blocks for m. The m dimension will be padded to 128 for swizzled layout.
     int numBlocksForM = layout == QuantizationSFLayout::SWIZZLED ? PadUpFn(m, 128) : m;
-    dim3 grid(std::min(numBlocksForM, multiProcessorCount * numBlocksPerSM));
+    int gridSize = std::min(numBlocksForM, multiProcessorCount * numBlocksPerSM);
+    // Ensure gridSize is not zero.
+    gridSize = std::max(1, gridSize);
+    dim3 grid(gridSize);
 
     // Launch the cvt kernel.
     auto* kernel_instance = useUE8M0
diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py
index 71e13e1324b..0a6812cb137 100755
--- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py
+++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py
@@ -319,11 +319,17 @@ def quantize_input(
                 x_row = x.shape[0]
             else:
                 x_row = x.shape[0]
+                hidden_size = x.shape[-1]
                 x, x_sf = torch.ops.trtllm.fp4_quantize(
                     x, self.fc31_input_scale, self.scaling_vector_size, False,
                     False)
+                if x_sf.numel() == 0 and x_sf.dim() == 1:
+                    # Viewing a tensor of shape torch.Size([0]) as (0, -1) is not supported
+                    x_sf = x_sf.view(
+                        (0,
+                         hidden_size // int(self.scaling_vector_size)))
                 # Reshape x_sf to 2D for post-quant communication
-                if x_sf is not None:
+                if x_sf is not None and x_sf.numel() != 0:
                     x_sf = x_sf.view((x_row, -1))
         else:
             if not isinstance(x, Fp4QuantizedTensor):
@@ -494,8 +500,20 @@ def forward_chunk(
             self._load_balancer_start_wait_gpu_stage(is_first_call)
 
         # apply routing
-        token_selected_experts, token_final_scales = self.routing_method.apply(
-            router_logits)
+        if router_logits.numel() == 0:
+            # For dtype, refer to https://github.com/NVIDIA/TensorRT-LLM/blob/55f3cda66d05a2e5686c9c7512721beb522bc8b7/tensorrt_llm/_torch/modules/fused_moe/routing.py#L327
+            token_selected_experts = torch.empty(
+                (0, self.routing_method.experts_per_token),
+                dtype=torch.int32,
+                device=router_logits.device)
+            token_final_scales = torch.empty(
+                (0, self.routing_method.experts_per_token),
+                dtype=torch.float32,
+                device=router_logits.device)
+        else:
+            token_selected_experts, token_final_scales = self.routing_method.apply(
+                router_logits)
+
         assert token_selected_experts.shape[
             1] == self.routing_method.experts_per_token
         assert token_selected_experts.shape == token_final_scales.shape
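
Note on the x_sf.view workaround above: PyTorch cannot infer a -1 dimension for a
tensor with zero elements, so viewing an empty 1-D scaling-factor tensor as (0, -1)
raises a RuntimeError; the patch therefore supplies the second dimension explicitly
for the empty case. A minimal standalone sketch, not part of the patch, with
illustrative hidden_size and scaling_vector_size values:

    import torch

    # A zero-token input yields an empty 1-D scaling-factor tensor.
    x_sf = torch.empty(0, dtype=torch.uint8)

    # The inferred (-1) dimension is ambiguous when numel() == 0.
    try:
        x_sf.view((0, -1))
    except RuntimeError as err:
        print("view((0, -1)) failed:", err)

    # Giving the column count explicitly, as the patch does with
    # hidden_size // scaling_vector_size, keeps the tensor 2-D and empty.
    hidden_size = 4096          # illustrative value
    scaling_vector_size = 16    # illustrative value
    x_sf = x_sf.view((0, hidden_size // scaling_vector_size))
    print(x_sf.shape)           # torch.Size([0, 256])

The same numel() == 0 guard motivates the other changes: forward_chunk skips
routing_method.apply on empty router_logits and builds empty int32/float32 routing
outputs directly, and the CUDA quantization launcher clamps its grid size to at
least 1 so an empty input (m == 0) does not request a zero grid dimension.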