Skip to content

Commit 1bf2d75

Browse files
authored
[None][chore] Upgrade CuteDSL to 4.3.0 (NVIDIA#9444)
Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
1 parent b7308a4 commit 1bf2d75

File tree

3 files changed: 6 additions (+6) and 5 deletions (-5)

ATTRIBUTIONS-Python.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -25250,7 +25250,7 @@ License: `NVIDIA Proprietary Software`
25250 | 25250 |   - `Homepage`: https://developer.nvidia.com/cusparselt
25251 | 25251 |
25252 | 25252 |
25253 |       | - ## nvidia-cutlass-dsl (4.2.1)
      | 25253 | + ## nvidia-cutlass-dsl (4.3.0)
25254 | 25254 |
25255 | 25255 |   ### Licenses
25256 | 25256 |   License: `None`

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -69,7 +69,7 @@ triton==3.5.0; platform_machine == "x86_64"
69 | 69 |   tiktoken
70 | 70 |   blobfile
71 | 71 |   openai-harmony==0.0.4
72 |    | - nvidia-cutlass-dsl==4.3.0.dev0; python_version >= "3.10"
   | 72 | + nvidia-cutlass-dsl==4.3.0; python_version >= "3.10"
73 | 73 |   plotly
74 | 74 |   numexpr<2.14.0 # WAR for attempted use of nonexistent numpy.typing
75 | 75 |   partial_json_parser

tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_grouped_gemm_finalize_fusion.py

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1552,6 +1552,8 @@ def kernel(
1552 | 1552 |   epi_tidx, tCtAcc_base, tCgC, epi_tile, use_2cta_instrs
1553 | 1553 |   )
1554 | 1554 |
     | 1555 | + tTR_rC = cute.make_rmem_tensor(tTR_rAcc.shape, self.out_dtype)
     | 1556 | +
1555 | 1557 |   copy_atom_r2s = sm100_utils.get_smem_store_op(
1556 | 1558 |   self.gemm_output_layout, self.out_dtype, self.acc_dtype, tiled_copy_t2r
1557 | 1559 |   )
@@ -1641,8 +1643,6 @@ def kernel(
1641 | 1643 |   layout = cute.make_layout(shape=(cute.size(tTR_rAcc),), stride=(1,))
1642 | 1644 |   loop_size = cute.size(tTR_rAcc)
1643 | 1645 |
1644 |      | - rOut_epi = cute.make_rmem_tensor(layout, self.out_dtype)
1645 |      | -
1646 | 1646 |   for subtile_idx in cutlass.range(subtile_cnt):
1647 | 1647 |   #
1648 | 1648 |   # Load accumulator from tensor memory buffer to register
@@ -1657,7 +1657,8 @@ def kernel(
1657 | 1657 |   # Apply router scale to the entire row (broadcast scalar to vector)
1658 | 1658 |   acc_vec_finalized = token_scale * acc_vec_scaled
1659 | 1659 |
1660 |      | - rOut_epi.store(acc_vec_finalized.to(self.out_dtype))
     | 1660 | + tTR_rC.store(acc_vec_finalized.to(self.out_dtype))
     | 1661 | + rOut_epi = cute.make_tensor(tTR_rC.iterator, layout)
1661 | 1662 |
1662 | 1663 |   if permuted_row < tile_mn_limit:
1663 | 1664 |   coord_n = mma_tile_coord_mnl[1] * self.cta_tile_shape_mnk[

0 commit comments

Comments
 (0)