fix compilation

syuoni · syuoni · commit c235f0d6310f · 2025-12-02T12:28:30.000Z
Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
diff --git a/cpp/tensorrt_llm/thop/cuteDslMoeUtilsOp.cpp b/cpp/tensorrt_llm/thop/cuteDslMoeUtilsOp.cpp
@@ -255,7 +255,7 @@ torch::Tensor moe_unpermute(torch::Tensor const& permuted_input, torch::Tensor c
     return output;
 }
 
-torch::Tensor moe_output_memset(torch::Tensor const& input, torch::Tensor const& tile_idx_to_mn_limit,
+void moe_output_memset_inplace(torch::Tensor const& input, torch::Tensor const& tile_idx_to_mn_limit,
     torch::Tensor const& expanded_idx_to_permuted_idx, torch::Tensor const& permuted_idx_to_expanded_idx,
     torch::Tensor const& num_non_exiting_tiles, int64_t const tile_tokens_dim, int64_t const top_k)
 {
@@ -305,8 +305,6 @@ torch::Tensor moe_output_memset(torch::Tensor const& input, torch::Tensor const&
     }
 
 #undef DISPATCH_MOE_OUTPUT_MEMSET
-
-    return input;
 }
 
 // Activation
@@ -478,8 +476,8 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m)
         "Tensor num_non_exiting_tiles, int tile_tokens_dim, int top_k) -> (Tensor, Tensor?)");
     m.def("moe_unpermute(Tensor permuted_input, Tensor expanded_idx_to_permuted_idx, Tensor topk_scales) -> Tensor");
     m.def(
-        "moe_output_memset(Tensor! input, Tensor tile_idx_to_mn_limit, Tensor expanded_idx_to_permuted_idx, "
-        "Tensor permuted_idx_to_expanded_idx, Tensor num_non_exiting_tiles, int tile_tokens_dim, int top_k) -> Tensor");
+        "moe_output_memset_inplace(Tensor(a!) input, Tensor tile_idx_to_mn_limit, Tensor expanded_idx_to_permuted_idx, "
+        "Tensor permuted_idx_to_expanded_idx, Tensor num_non_exiting_tiles, int tile_tokens_dim, int top_k) -> ()");
     m.def(
         "moe_swiglu(Tensor input, Tensor tile_idx_to_mn_limit, Tensor num_non_exiting_tiles, "
         "int tile_tokens_dim) -> Tensor");
@@ -497,7 +495,7 @@ TORCH_LIBRARY_IMPL(trtllm, CUDA, m)
     m.impl("moe_sort", &torch_ext::moe_sort);
     m.impl("moe_permute", &torch_ext::moe_permute);
     m.impl("moe_unpermute", &torch_ext::moe_unpermute);
-    m.impl("moe_output_memset", &torch_ext::moe_output_memset);
+    m.impl("moe_output_memset_inplace", &torch_ext::moe_output_memset_inplace);
     m.impl("moe_swiglu", &torch_ext::moe_swiglu);
     m.impl("moe_swiglu_nvfp4_quantize", &torch_ext::moe_swiglu_nvfp4_quantize);
     m.impl("moe_gelu", &torch_ext::moe_gelu);
diff --git a/tensorrt_llm/_torch/compilation/utils.py b/tensorrt_llm/_torch/compilation/utils.py
@@ -76,6 +76,13 @@ def inplace_info():
         },
         torch.ops.trtllm.logits_bitmask.default: {
             1: "logits"
+        },
+        torch.ops.trtllm.moe_output_memset_inplace.default: {
+            1: "input"
+        },
+        torch.ops.trtllm.cute_dsl_nvfp4_grouped_gemm_finalize_blackwell.default:
+        {
+            6: "output"
         }
     }
     return inplace_map
diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cute_dsl.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cute_dsl.py
@@ -302,7 +302,7 @@ def run_moe_nvfp4(
                 output = torch.empty(output_shape,
                                      dtype=output_dtype,
                                      device=x.device)
-                torch.ops.trtllm.moe_output_memset(
+                torch.ops.trtllm.moe_output_memset_inplace(
                     output=output,
                     tile_idx_to_mn_limit=tile_idx_to_mn_limit,
                     expanded_idx_to_permuted_idx=expanded_idx_to_permuted_idx,
diff --git a/tests/unittest/_torch/thop/parallel/test_cute_dsl_moe.py b/tests/unittest/_torch/thop/parallel/test_cute_dsl_moe.py
@@ -231,7 +231,7 @@ def test_moe_unpermute(dtype: str, num_tokens: int, top_k: int, tile_size: int):
 @pytest.mark.parametrize("top_k", [1, 2, 8])
 @pytest.mark.parametrize("num_tokens", [128, 515, 1024])
 @pytest.mark.parametrize("dtype", ["bfloat16", "float16"])
-def test_moe_output_memset(dtype: str, num_tokens: int, top_k: int, tile_size: int):
+def test_moe_output_memset_inplace(dtype: str, num_tokens: int, top_k: int, tile_size: int):
     dtype = getattr(torch, dtype)
     hidden_size = 4096
     num_experts = 256
@@ -260,7 +260,7 @@ def test_moe_output_memset(dtype: str, num_tokens: int, top_k: int, tile_size: i
     )
 
     x = torch.ones(num_tokens, hidden_size, dtype=dtype, device="cuda")
-    x = torch.ops.trtllm.moe_output_memset(
+    torch.ops.trtllm.moe_output_memset_inplace(
         x,
         tile_idx_to_mn_limit,
         expanded_idx_to_permuted_idx,

Original file line number	Diff line number	Diff line change
`@@ -76,6 +76,13 @@ def inplace_info():`
`76`	`76`	`},`
`77`	`77`	`torch.ops.trtllm.logits_bitmask.default: {`
`78`	`78`	`1: "logits"`
	`79`	`+ },`
	`80`	`+ torch.ops.trtllm.moe_output_memset_inplace.default: {`
	`81`	`+ 1: "input"`
	`82`	`+ },`
	`83`	`+ torch.ops.trtllm.cute_dsl_nvfp4_grouped_gemm_finalize_blackwell.default:`
	`84`	`+ {`
	`85`	`+ 6: "output"`
`79`	`86`	`}`
`80`	`87`	`}`
`81`	`88`	`return inplace_map`