[None][chore] Upgrade CuteDSL to 4.3.0 (NVIDIA#9444)

syuoni · web-flow · commit 1bf2d750a2a6 · 2025-11-26T14:53:09.000+08:00
Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
diff --git a/ATTRIBUTIONS-Python.md b/ATTRIBUTIONS-Python.md
@@ -25250,7 +25250,7 @@ License: `NVIDIA Proprietary Software`
   - `Homepage`: https://developer.nvidia.com/cusparselt
 
 
-## nvidia-cutlass-dsl (4.2.1)
+## nvidia-cutlass-dsl (4.3.0)
 
 ### Licenses
 License: `None`
diff --git a/requirements.txt b/requirements.txt
@@ -69,7 +69,7 @@ triton==3.5.0; platform_machine == "x86_64"
 tiktoken
 blobfile
 openai-harmony==0.0.4
-nvidia-cutlass-dsl==4.3.0.dev0; python_version >= "3.10"
+nvidia-cutlass-dsl==4.3.0; python_version >= "3.10"
 plotly
 numexpr<2.14.0 # WAR for attempted use of nonexistent numpy.typing
 partial_json_parser
diff --git a/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_grouped_gemm_finalize_fusion.py b/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_grouped_gemm_finalize_fusion.py
@@ -1552,6 +1552,8 @@ def kernel(
                 epi_tidx, tCtAcc_base, tCgC, epi_tile, use_2cta_instrs
             )
 
+            tTR_rC = cute.make_rmem_tensor(tTR_rAcc.shape, self.out_dtype)
+
             copy_atom_r2s = sm100_utils.get_smem_store_op(
                 self.gemm_output_layout, self.out_dtype, self.acc_dtype, tiled_copy_t2r
             )
@@ -1641,8 +1643,6 @@ def kernel(
                     layout = cute.make_layout(shape=(cute.size(tTR_rAcc),), stride=(1,))
                     loop_size = cute.size(tTR_rAcc)
 
-                rOut_epi = cute.make_rmem_tensor(layout, self.out_dtype)
-
                 for subtile_idx in cutlass.range(subtile_cnt):
                     #
                     # Load accumulator from tensor memory buffer to register
@@ -1657,7 +1657,8 @@ def kernel(
                     # Apply router scale to the entire row (broadcast scalar to vector)
                     acc_vec_finalized = token_scale * acc_vec_scaled
 
-                    rOut_epi.store(acc_vec_finalized.to(self.out_dtype))
+                    tTR_rC.store(acc_vec_finalized.to(self.out_dtype))
+                    rOut_epi = cute.make_tensor(tTR_rC.iterator, layout)
 
                     if permuted_row < tile_mn_limit:
                         coord_n = mma_tile_coord_mnl[1] * self.cta_tile_shape_mnk[