From 9c8902f0fbd281298f94c4b451da145f1d9e21f8 Mon Sep 17 00:00:00 2001 From: Mindy Li <11663212+limin2021@users.noreply.github.com> Date: Thu, 19 Mar 2026 19:45:56 -0700 Subject: [PATCH 1/7] feat: bump nvidia-cutlass-dsl minimum version to 4.4.2 Update requirements.txt to require >=4.4.2 and add CI override via setup_python.env so tests use the new version immediately without waiting for Docker image rebuild. Co-Authored-By: Claude Opus 4.6 --- ci/setup_python.env | 3 +++ requirements.txt | 2 +- scripts/setup_test_env.sh | 10 ++++++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/ci/setup_python.env b/ci/setup_python.env index ffd9c49ac2..1cba321988 100644 --- a/ci/setup_python.env +++ b/ci/setup_python.env @@ -15,3 +15,6 @@ # Uncomment to override TVM-FFI version: # TVM_FFI_REF= + +# Uncomment to override nvidia-cutlass-dsl version: +CUTLASS_DSL_VERSION=4.4.2 diff --git a/requirements.txt b/requirements.txt index 7dd93c67f7..7eb97a4ab9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ einops ninja numpy nvidia-cudnn-frontend>=1.13.0 -nvidia-cutlass-dsl>=4.3.4 +nvidia-cutlass-dsl>=4.4.2 nvidia-ml-py packaging>=24.2 requests diff --git a/scripts/setup_test_env.sh b/scripts/setup_test_env.sh index 83480cbd6a..69ec3c8d98 100755 --- a/scripts/setup_test_env.sh +++ b/scripts/setup_test_env.sh @@ -23,3 +23,13 @@ if [ -n "${TVM_FFI_REF:-}" ]; then echo "TVM-FFI override complete." echo "" fi + +# Override nvidia-cutlass-dsl if specified +if [ -n "${CUTLASS_DSL_VERSION:-}" ]; then + echo "========================================" + echo "Overriding nvidia-cutlass-dsl with version: ${CUTLASS_DSL_VERSION}" + echo "========================================" + pip install --force-reinstall "nvidia-cutlass-dsl==${CUTLASS_DSL_VERSION}" + echo "nvidia-cutlass-dsl override complete." + echo "" +fi From aa17f5422cda63b9cc6d593ed027ca91ba65ae85 Mon Sep 17 00:00:00 2001 From: Mindy Li <11663212+limin2021@users.noreply.github.com> Date: Fri, 20 Mar 2026 01:08:47 -0700 Subject: [PATCH 2/7] fix: use nvidia-cutlass-dsl[cu13] extra for CUDA 13 libs From 4.4 onwards, nvidia-cutlass-dsl requires the [cu13] extra to install nvidia-cutlass-dsl-libs-cu13. Without it, CI fails. Co-Authored-By: Claude Opus 4.6 --- requirements.txt | 2 +- scripts/setup_test_env.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 7eb97a4ab9..c39bf72042 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ einops ninja numpy nvidia-cudnn-frontend>=1.13.0 -nvidia-cutlass-dsl>=4.4.2 +nvidia-cutlass-dsl[cu13]>=4.4.2 nvidia-ml-py packaging>=24.2 requests diff --git a/scripts/setup_test_env.sh b/scripts/setup_test_env.sh index 69ec3c8d98..ded733bd75 100755 --- a/scripts/setup_test_env.sh +++ b/scripts/setup_test_env.sh @@ -29,7 +29,7 @@ if [ -n "${CUTLASS_DSL_VERSION:-}" ]; then echo "========================================" echo "Overriding nvidia-cutlass-dsl with version: ${CUTLASS_DSL_VERSION}" echo "========================================" - pip install --force-reinstall "nvidia-cutlass-dsl==${CUTLASS_DSL_VERSION}" + pip install --force-reinstall "nvidia-cutlass-dsl[cu13]==${CUTLASS_DSL_VERSION}" echo "nvidia-cutlass-dsl override complete." 
echo "" fi From ae215a6ea311f2810db7ba2cbae0eaf67970ebad Mon Sep 17 00:00:00 2001 From: Mindy Li <11663212+limin2021@users.noreply.github.com> Date: Fri, 20 Mar 2026 01:21:18 -0700 Subject: [PATCH 3/7] fix: auto-detect CUDA version for nvidia-cutlass-dsl extra and clean install - requirements.txt: remove [cu13] extra (keep generic for Docker builds) - setup_test_env.sh: detect CUDA major version from torch to select [cu12] or [cu13] extra, and clean uninstall old packages before installing (per NVIDIA docs recommendation) Co-Authored-By: Claude Opus 4.6 --- requirements.txt | 2 +- scripts/setup_test_env.sh | 13 +++++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index c39bf72042..7eb97a4ab9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ einops ninja numpy nvidia-cudnn-frontend>=1.13.0 -nvidia-cutlass-dsl[cu13]>=4.4.2 +nvidia-cutlass-dsl>=4.4.2 nvidia-ml-py packaging>=24.2 requests diff --git a/scripts/setup_test_env.sh b/scripts/setup_test_env.sh index ded733bd75..312419dd02 100755 --- a/scripts/setup_test_env.sh +++ b/scripts/setup_test_env.sh @@ -26,10 +26,19 @@ fi # Override nvidia-cutlass-dsl if specified if [ -n "${CUTLASS_DSL_VERSION:-}" ]; then + # Detect CUDA major version to select the correct extra (cu12 or cu13) + CUDA_MAJOR=$(python -c "import torch; print(torch.version.cuda.split('.')[0])" 2>/dev/null || echo "12") + if [ "$CUDA_MAJOR" = "13" ]; then + CUTLASS_DSL_EXTRA="cu13" + else + CUTLASS_DSL_EXTRA="cu12" + fi echo "========================================" - echo "Overriding nvidia-cutlass-dsl with version: ${CUTLASS_DSL_VERSION}" + echo "Overriding nvidia-cutlass-dsl with version: ${CUTLASS_DSL_VERSION} [${CUTLASS_DSL_EXTRA}]" echo "========================================" - pip install --force-reinstall "nvidia-cutlass-dsl[cu13]==${CUTLASS_DSL_VERSION}" + # Clean uninstall old packages first (recommended by NVIDIA docs) + pip uninstall nvidia-cutlass-dsl nvidia-cutlass-dsl-libs-base nvidia-cutlass-dsl-libs-cu12 nvidia-cutlass-dsl-libs-cu13 -y 2>/dev/null || true + pip install "nvidia-cutlass-dsl[${CUTLASS_DSL_EXTRA}]==${CUTLASS_DSL_VERSION}" echo "nvidia-cutlass-dsl override complete." 
echo "" fi From bc3b93cecef182747b8a18e0f9f1cde639b03c4e Mon Sep 17 00:00:00 2001 From: Mindy Li <11663212+limin2021@users.noreply.github.com> Date: Fri, 20 Mar 2026 02:05:53 -0700 Subject: [PATCH 4/7] fix: CUDA 12 needs no extra, only CUDA 13 needs [cu13] for cutlass-dsl - setup_test_env.sh: only add [cu13] extra for CUDA 13, plain install for CUDA 12 - Update skill docs to match NVIDIA's official installation guide Co-Authored-By: Claude Opus 4.6 --- scripts/setup_test_env.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/setup_test_env.sh b/scripts/setup_test_env.sh index 312419dd02..5cd61330f1 100755 --- a/scripts/setup_test_env.sh +++ b/scripts/setup_test_env.sh @@ -26,19 +26,19 @@ fi # Override nvidia-cutlass-dsl if specified if [ -n "${CUTLASS_DSL_VERSION:-}" ]; then - # Detect CUDA major version to select the correct extra (cu12 or cu13) + # Detect CUDA major version: only CUDA 13+ needs [cu13] extra CUDA_MAJOR=$(python -c "import torch; print(torch.version.cuda.split('.')[0])" 2>/dev/null || echo "12") if [ "$CUDA_MAJOR" = "13" ]; then - CUTLASS_DSL_EXTRA="cu13" + CUTLASS_DSL_PKG="nvidia-cutlass-dsl[cu13]==${CUTLASS_DSL_VERSION}" else - CUTLASS_DSL_EXTRA="cu12" + CUTLASS_DSL_PKG="nvidia-cutlass-dsl==${CUTLASS_DSL_VERSION}" fi echo "========================================" - echo "Overriding nvidia-cutlass-dsl with version: ${CUTLASS_DSL_VERSION} [${CUTLASS_DSL_EXTRA}]" + echo "Overriding nvidia-cutlass-dsl with: ${CUTLASS_DSL_PKG}" echo "========================================" # Clean uninstall old packages first (recommended by NVIDIA docs) pip uninstall nvidia-cutlass-dsl nvidia-cutlass-dsl-libs-base nvidia-cutlass-dsl-libs-cu12 nvidia-cutlass-dsl-libs-cu13 -y 2>/dev/null || true - pip install "nvidia-cutlass-dsl[${CUTLASS_DSL_EXTRA}]==${CUTLASS_DSL_VERSION}" + pip install "${CUTLASS_DSL_PKG}" echo "nvidia-cutlass-dsl override complete." 
echo "" fi From 0c966f1a0a0ac9b64ebfa666e12bde43f724d4ba Mon Sep 17 00:00:00 2001 From: Yunzhe Qiu Date: Fri, 20 Mar 2026 18:35:24 -0700 Subject: [PATCH 5/7] fix cute dsl failure after 4.4.2 --- .../cute_dsl/gemm_allreduce_two_shot.py | 11 ++-- ...guous_gather_grouped_gemm_swiglu_fusion.py | 56 +++++++++---------- ...contiguous_grouped_gemm_finalize_fusion.py | 44 +++++++-------- .../fused_moe/cute_dsl/blackwell/utils.py | 1 - .../kernels/dense_blockscaled_gemm_sm100.py | 4 +- 5 files changed, 56 insertions(+), 60 deletions(-) diff --git a/flashinfer/cute_dsl/gemm_allreduce_two_shot.py b/flashinfer/cute_dsl/gemm_allreduce_two_shot.py index baf55468a4..25d8ddff2a 100644 --- a/flashinfer/cute_dsl/gemm_allreduce_two_shot.py +++ b/flashinfer/cute_dsl/gemm_allreduce_two_shot.py @@ -31,7 +31,6 @@ def spin_lock_multimem_arrive(lock_ptr: Pointer, loc=None, ip=None) -> None: # HACK https://github.com/NVIDIA/cutlass/issues/2845 from cutlass._mlir.dialects import nvvm -from cutlass.cutlass_dsl import T from cutlass._mlir.dialects.nvvm import ( MemOrderKind, MemScopeKind, @@ -56,7 +55,6 @@ def spin_lock_atom_cas_acquire_wait( result = 0 while result != expected_val: result = nvvm.atomicrmw( - T.i32(), AtomicOpKind.CAS, lock_ptr.llvm_ptr, Int32(reset_val).ir_value(loc=loc, ip=ip), @@ -70,7 +68,6 @@ def spin_lock_atom_cas_acquire_wait( result = 0 while result != expected_val: result = nvvm.atomicrmw( - T.i32(), AtomicOpKind.CAS, lock_ptr.llvm_ptr, Int32(reset_val).ir_value(loc=loc, ip=ip), @@ -92,7 +89,7 @@ def sm_wise_inter_gpu_multimem_barrier( bdimx, bdimy, _ = cute.arch.grid_dim() pid = bidx + bidy * bdimx + bidz * bdimx * bdimy distributed.multimem_red_release_sys_add1(barrier_mc + pid, loc=loc, ip=ip) - cute.arch.fence_proxy(cute.arch.ProxyKind.alias) + cute.arch.fence_proxy("alias") # v4.3.1 does not have mem_order="acquire" variant in `distributed` module # filed issue https://github.com/NVIDIA/cutlass/issues/2845 @@ -1251,8 +1248,8 @@ def kernel( ) # Fence and barrier to make sure shared memory store is visible to TMA store cute.arch.fence_proxy( - cute.arch.ProxyKind.async_shared, - space=cute.arch.SharedSpace.shared_cta, + "async.shared", + space="cta", ) epilog_threads = 32 * len(self.epilog_warp_id) cute.arch.barrier( @@ -1312,7 +1309,7 @@ def kernel( flag = barrier_flag_mc.iterator + tile_id cute.arch.fence_acq_rel_gpu() spin_lock_multimem_arrive(flag) - cute.arch.fence_proxy(cute.arch.ProxyKind.alias) + cute.arch.fence_proxy("alias") # # Advance to next tile diff --git a/flashinfer/fused_moe/cute_dsl/blackwell/blockscaled_contiguous_gather_grouped_gemm_swiglu_fusion.py b/flashinfer/fused_moe/cute_dsl/blackwell/blockscaled_contiguous_gather_grouped_gemm_swiglu_fusion.py index 10a1f7f822..f8c50c624f 100644 --- a/flashinfer/fused_moe/cute_dsl/blackwell/blockscaled_contiguous_gather_grouped_gemm_swiglu_fusion.py +++ b/flashinfer/fused_moe/cute_dsl/blackwell/blockscaled_contiguous_gather_grouped_gemm_swiglu_fusion.py @@ -1512,8 +1512,8 @@ def kernel( sInfo[(4, tile_info_producer_state.index)] = mn_limit # fence view async shared cute.arch.fence_proxy( - cute.arch.ProxyKind.async_shared, - space=cute.arch.SharedSpace.shared_cta, + "async.shared", + space="cta", ) self.sched_sync_barrier.arrive_and_wait() @@ -1548,8 +1548,8 @@ def kernel( sInfo[(4, tile_info_producer_state.index)] = mn_limit # fence view async shared cute.arch.fence_proxy( - cute.arch.ProxyKind.async_shared, - space=cute.arch.SharedSpace.shared_cta, + "async.shared", + space="cta", ) self.sched_sync_barrier.arrive_and_wait() @@ 
-1569,8 +1569,8 @@ def kernel( sInfo[(3, tile_info_producer_state.index)] = cutlass.Int32(0) sInfo[(4, tile_info_producer_state.index)] = -1 cute.arch.fence_proxy( - cute.arch.ProxyKind.async_shared, - space=cute.arch.SharedSpace.shared_cta, + "async.shared", + space="cta", ) self.sched_sync_barrier.arrive_and_wait() tile_info_pipeline.producer_commit(tile_info_producer_state) @@ -1669,8 +1669,8 @@ def kernel( tile_info[idx] = sInfo[(idx, tile_info_consumer_state.index)] is_valid_tile = tile_info[3] == 1 cute.arch.fence_proxy( - cute.arch.ProxyKind.async_shared, - space=cute.arch.SharedSpace.shared_cta, + "async.shared", + space="cta", ) tile_info_pipeline.consumer_release(tile_info_consumer_state) tile_info_consumer_state.advance() @@ -1844,8 +1844,8 @@ def kernel( tile_info[idx] = sInfo[(idx, tile_info_consumer_state.index)] is_valid_tile = tile_info[3] == 1 cute.arch.fence_proxy( - cute.arch.ProxyKind.async_shared, - space=cute.arch.SharedSpace.shared_cta, + "async.shared", + space="cta", ) tile_info_pipeline.consumer_release(tile_info_consumer_state) tile_info_consumer_state.advance() @@ -1886,8 +1886,8 @@ def kernel( valid_tile_info[0] = sInfo[(3, tile_info_consumer_state.index)] is_valid_tile = valid_tile_info[0] == 1 cute.arch.fence_proxy( - cute.arch.ProxyKind.async_shared, - space=cute.arch.SharedSpace.shared_cta, + "async.shared", + space="cta", ) tile_info_pipeline.consumer_release(tile_info_consumer_state) tile_info_consumer_state.advance() @@ -1927,8 +1927,8 @@ def kernel( valid_tile_info[0] = sInfo[(3, tile_info_consumer_state.index)] is_valid_tile = valid_tile_info[0] == 1 cute.arch.fence_proxy( - cute.arch.ProxyKind.async_shared, - space=cute.arch.SharedSpace.shared_cta, + "async.shared", + space="cta", ) tile_info_pipeline.consumer_release(tile_info_consumer_state) tile_info_consumer_state.advance() @@ -1968,8 +1968,8 @@ def kernel( tile_info[idx] = sInfo[(idx, tile_info_consumer_state.index)] is_valid_tile = tile_info[3] == 1 cute.arch.fence_proxy( - cute.arch.ProxyKind.async_shared, - space=cute.arch.SharedSpace.shared_cta, + "async.shared", + space="cta", ) tile_info_pipeline.consumer_release(tile_info_consumer_state) tile_info_consumer_state.advance() @@ -2051,8 +2051,8 @@ def kernel( tile_info[idx] = sInfo[(idx, tile_info_consumer_state.index)] is_valid_tile = tile_info[3] == 1 cute.arch.fence_proxy( - cute.arch.ProxyKind.async_shared, - space=cute.arch.SharedSpace.shared_cta, + "async.shared", + space="cta", ) tile_info_pipeline.consumer_release(tile_info_consumer_state) tile_info_consumer_state.advance() @@ -2152,8 +2152,8 @@ def kernel( tile_info[idx] = sInfo[(idx, tile_info_consumer_state.index)] is_valid_tile = tile_info[3] == 1 cute.arch.fence_proxy( - cute.arch.ProxyKind.async_shared, - space=cute.arch.SharedSpace.shared_cta, + "async.shared", + space="cta", ) tile_info_pipeline.consumer_release(tile_info_consumer_state) tile_info_consumer_state.advance() @@ -2368,8 +2368,8 @@ def kernel( tile_info[idx] = sInfo[(idx, tile_info_consumer_state.index)] is_valid_tile = tile_info[3] == 1 cute.arch.fence_proxy( - cute.arch.ProxyKind.async_shared, - space=cute.arch.SharedSpace.shared_cta, + "async.shared", + space="cta", ) tile_info_pipeline.consumer_release(tile_info_consumer_state) tile_info_consumer_state.advance() @@ -2480,8 +2480,8 @@ def kernel( tile_info[idx] = sInfo[(idx, tile_info_consumer_state.index)] is_valid_tile = tile_info[3] == 1 cute.arch.fence_proxy( - cute.arch.ProxyKind.async_shared, - space=cute.arch.SharedSpace.shared_cta, + "async.shared", + 
space="cta", ) tile_info_pipeline.consumer_release(tile_info_consumer_state) tile_info_consumer_state.advance() @@ -2811,8 +2811,8 @@ def kernel( ) # Fence and barrier to make sure shared memory store is visible to TMA store cute.arch.fence_proxy( - cute.arch.ProxyKind.async_shared, - space=cute.arch.SharedSpace.shared_cta, + "async.shared", + space="cta", ) self.epilog_sync_barrier.arrive_and_wait() # @@ -2845,8 +2845,8 @@ def kernel( tile_info[idx] = sInfo[(idx, tile_info_consumer_state.index)] is_valid_tile = tile_info[3] == 1 cute.arch.fence_proxy( - cute.arch.ProxyKind.async_shared, - space=cute.arch.SharedSpace.shared_cta, + "async.shared", + space="cta", ) tile_info_pipeline.consumer_release(tile_info_consumer_state) tile_info_consumer_state.advance() diff --git a/flashinfer/fused_moe/cute_dsl/blackwell/blockscaled_contiguous_grouped_gemm_finalize_fusion.py b/flashinfer/fused_moe/cute_dsl/blackwell/blockscaled_contiguous_grouped_gemm_finalize_fusion.py index ce4fb6269b..e07fab4eb6 100644 --- a/flashinfer/fused_moe/cute_dsl/blackwell/blockscaled_contiguous_grouped_gemm_finalize_fusion.py +++ b/flashinfer/fused_moe/cute_dsl/blackwell/blockscaled_contiguous_grouped_gemm_finalize_fusion.py @@ -1380,8 +1380,8 @@ def kernel( sInfo[(4, tile_info_producer_state.index)] = mn_limit # fence view async shared cute.arch.fence_proxy( - cute.arch.ProxyKind.async_shared, - space=cute.arch.SharedSpace.shared_cta, + "async.shared", + space="cta", ) self.sched_sync_barrier.arrive_and_wait() @@ -1416,8 +1416,8 @@ def kernel( sInfo[(4, tile_info_producer_state.index)] = mn_limit # fence view async shared cute.arch.fence_proxy( - cute.arch.ProxyKind.async_shared, - space=cute.arch.SharedSpace.shared_cta, + "async.shared", + space="cta", ) self.sched_sync_barrier.arrive_and_wait() @@ -1438,8 +1438,8 @@ def kernel( sInfo[(3, tile_info_producer_state.index)] = cutlass.Int32(0) sInfo[(4, tile_info_producer_state.index)] = cutlass.Int32(0) cute.arch.fence_proxy( - cute.arch.ProxyKind.async_shared, - space=cute.arch.SharedSpace.shared_cta, + "async.shared", + space="cta", ) self.sched_sync_barrier.arrive_and_wait() tile_info_pipeline.producer_commit(tile_info_producer_state) @@ -1467,8 +1467,8 @@ def kernel( tile_info[idx] = sInfo[(idx, tile_info_consumer_state.index)] is_valid_tile = tile_info[3] == 1 cute.arch.fence_proxy( - cute.arch.ProxyKind.async_shared, - space=cute.arch.SharedSpace.shared_cta, + "async.shared", + space="cta", ) tile_info_pipeline.consumer_release(tile_info_consumer_state) tile_info_consumer_state.advance() @@ -1573,8 +1573,8 @@ def kernel( tile_info[idx] = sInfo[(idx, tile_info_consumer_state.index)] is_valid_tile = tile_info[3] == 1 cute.arch.fence_proxy( - cute.arch.ProxyKind.async_shared, - space=cute.arch.SharedSpace.shared_cta, + "async.shared", + space="cta", ) tile_info_pipeline.consumer_release(tile_info_consumer_state) tile_info_consumer_state.advance() @@ -1659,8 +1659,8 @@ def kernel( tile_info[idx] = sInfo[(idx, tile_info_consumer_state.index)] is_valid_tile = tile_info[3] == 1 cute.arch.fence_proxy( - cute.arch.ProxyKind.async_shared, - space=cute.arch.SharedSpace.shared_cta, + "async.shared", + space="cta", ) tile_info_pipeline.consumer_release(tile_info_consumer_state) tile_info_consumer_state.advance() @@ -1818,8 +1818,8 @@ def kernel( tile_info[idx] = sInfo[(idx, tile_info_consumer_state.index)] is_valid_tile = tile_info[3] == 1 cute.arch.fence_proxy( - cute.arch.ProxyKind.async_shared, - space=cute.arch.SharedSpace.shared_cta, + "async.shared", + space="cta", ) 
tile_info_pipeline.consumer_release(tile_info_consumer_state) tile_info_consumer_state.advance() @@ -1886,8 +1886,8 @@ def kernel( tile_info[idx] = sInfo[(idx, tile_info_consumer_state.index)] is_valid_tile = tile_info[3] == 1 cute.arch.fence_proxy( - cute.arch.ProxyKind.async_shared, - space=cute.arch.SharedSpace.shared_cta, + "async.shared", + space="cta", ) tile_info_pipeline.consumer_release(tile_info_consumer_state) tile_info_consumer_state.advance() @@ -2023,8 +2023,8 @@ def kernel( if cutlass.const_expr(self.use_blkred): cute.arch.fence_proxy( - cute.arch.ProxyKind.async_shared, - space=cute.arch.SharedSpace.shared_cta, + "async.shared", + space="cta", ) # # Async arrive accumulator buffer empty @@ -2037,8 +2037,8 @@ def kernel( if cutlass.const_expr(self.use_blkred): cute.arch.fence_proxy( - cute.arch.ProxyKind.async_shared, - space=cute.arch.SharedSpace.shared_cta, + "async.shared", + space="cta", ) if is_valid_row: coord_n = mma_tile_coord_mnl[1] * self.cta_tile_shape_mnk[1] @@ -2073,8 +2073,8 @@ def kernel( tile_info[idx] = sInfo[(idx, tile_info_consumer_state.index)] is_valid_tile = tile_info[3] == 1 cute.arch.fence_proxy( - cute.arch.ProxyKind.async_shared, - space=cute.arch.SharedSpace.shared_cta, + "async.shared", + space="cta", ) tile_info_pipeline.consumer_release(tile_info_consumer_state) tile_info_consumer_state.advance() diff --git a/flashinfer/fused_moe/cute_dsl/blackwell/utils.py b/flashinfer/fused_moe/cute_dsl/blackwell/utils.py index b1c5349de1..4bc3b960c4 100644 --- a/flashinfer/fused_moe/cute_dsl/blackwell/utils.py +++ b/flashinfer/fused_moe/cute_dsl/blackwell/utils.py @@ -208,7 +208,6 @@ def fmin( ) -> cutlass.Float32: return cutlass.Float32( nvvm.fmin( - T.f32(), cutlass.Float32(a).ir_value(loc=loc, ip=ip), cutlass.Float32(b).ir_value(loc=loc, ip=ip), nan=nan, diff --git a/flashinfer/gemm/kernels/dense_blockscaled_gemm_sm100.py b/flashinfer/gemm/kernels/dense_blockscaled_gemm_sm100.py index 1475850e46..5710f97fac 100644 --- a/flashinfer/gemm/kernels/dense_blockscaled_gemm_sm100.py +++ b/flashinfer/gemm/kernels/dense_blockscaled_gemm_sm100.py @@ -1469,8 +1469,8 @@ def kernel( ) # Fence and barrier to make sure shared memory store is visible to TMA store cute.arch.fence_proxy( - cute.arch.ProxyKind.async_shared, - space=cute.arch.SharedSpace.shared_cta, + "async.shared", + space="cta", ) epilog_threads = 32 * len(self.epilog_warp_id) cute.arch.barrier( From 8cf864ee868782852fab16be2a43a6d6f2c13aa7 Mon Sep 17 00:00:00 2001 From: Mindy Li <11663212+limin2021@users.noreply.github.com> Date: Sun, 22 Mar 2026 19:04:04 -0700 Subject: [PATCH 6/7] fix: make nvvm.fmin and nvvm.atomicrmw compatible with both CUDA 12 and CUDA 13 nvidia-cutlass-dsl 4.4.2 has different MLIR dialect signatures between CUDA 12 (base) and CUDA 13 (cu13): the `res` parameter was removed in cu13. Add runtime detection to support both variants. 
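For illustration, the two binding shapes and the probe that tells them
apart at runtime (a condensed sketch of the shims added below; HAS_RES
is an illustrative name, not part of this patch):

    import inspect

    from cutlass._mlir.dialects import nvvm

    # CUDA 12 (base wheel):  nvvm.atomicrmw(res, op, ptr, a, ...)
    # CUDA 13 (cu13 wheel):  nvvm.atomicrmw(op, ptr, a, ...)  (`res` removed)
    # Probe the installed binding's signature once to pick the right call:
    HAS_RES = "res" in inspect.signature(nvvm.atomicrmw).parameters
    print("CUDA 12-style binding" if HAS_RES else "CUDA 13-style binding")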
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../cute_dsl/gemm_allreduce_two_shot.py | 39 ++++++++++++++++++- .../fused_moe/cute_dsl/blackwell/utils.py | 26 ++++++++----- 2 files changed, 54 insertions(+), 11 deletions(-) diff --git a/flashinfer/cute_dsl/gemm_allreduce_two_shot.py b/flashinfer/cute_dsl/gemm_allreduce_two_shot.py index 25d8ddff2a..20b0614304 100644 --- a/flashinfer/cute_dsl/gemm_allreduce_two_shot.py +++ b/flashinfer/cute_dsl/gemm_allreduce_two_shot.py @@ -30,7 +30,11 @@ def spin_lock_multimem_arrive(lock_ptr: Pointer, loc=None, ip=None) -> None: # HACK https://github.com/NVIDIA/cutlass/issues/2845 +import functools +import inspect + from cutlass._mlir.dialects import nvvm +from cutlass.cutlass_dsl import T from cutlass._mlir.dialects.nvvm import ( MemOrderKind, MemScopeKind, @@ -38,6 +42,35 @@ def spin_lock_multimem_arrive(lock_ptr: Pointer, loc=None, ip=None) -> None: ) +@functools.lru_cache(maxsize=None) +def _nvvm_atomicrmw_has_res_param(): + return "res" in inspect.signature(nvvm.atomicrmw).parameters + + +def _nvvm_atomicrmw_compat( + res_type, op, ptr, a, *, b=None, mem_order=None, syncscope=None, loc=None, ip=None +): + """Call nvvm.atomicrmw compatible with both CUDA 12 and CUDA 13.""" + if _nvvm_atomicrmw_has_res_param(): + # CUDA 12: nvvm.atomicrmw(res, op, ptr, a, ...) + return nvvm.atomicrmw( + res_type, + op, + ptr, + a, + b=b, + mem_order=mem_order, + syncscope=syncscope, + loc=loc, + ip=ip, + ) + else: + # CUDA 13: nvvm.atomicrmw(op, ptr, a, ...) — res removed + return nvvm.atomicrmw( + op, ptr, a, b=b, mem_order=mem_order, syncscope=syncscope, loc=loc, ip=ip + ) + + @cute.jit def spin_lock_atom_cas_acquire_wait( lock_ptr: Pointer, @@ -54,7 +87,8 @@ def spin_lock_atom_cas_acquire_wait( if scope == "gpu": result = 0 while result != expected_val: - result = nvvm.atomicrmw( + result = _nvvm_atomicrmw_compat( + T.i32(), AtomicOpKind.CAS, lock_ptr.llvm_ptr, Int32(reset_val).ir_value(loc=loc, ip=ip), @@ -67,7 +101,8 @@ def spin_lock_atom_cas_acquire_wait( elif scope == "sys": result = 0 while result != expected_val: - result = nvvm.atomicrmw( + result = _nvvm_atomicrmw_compat( + T.i32(), AtomicOpKind.CAS, lock_ptr.llvm_ptr, Int32(reset_val).ir_value(loc=loc, ip=ip), diff --git a/flashinfer/fused_moe/cute_dsl/blackwell/utils.py b/flashinfer/fused_moe/cute_dsl/blackwell/utils.py index 4bc3b960c4..b57e260171 100644 --- a/flashinfer/fused_moe/cute_dsl/blackwell/utils.py +++ b/flashinfer/fused_moe/cute_dsl/blackwell/utils.py @@ -44,6 +44,7 @@ # This file is copied and modified from cutlass https://github.com/NVIDIA/cutlass/blob/main/python/CuTeDSL/cutlass/cute/core.py import ctypes +import functools from typing import Union import cutlass @@ -197,6 +198,13 @@ def is_power_of_2(x: int) -> bool: return x > 0 and (x & (x - 1)) == 0 +@functools.lru_cache(maxsize=None) +def _nvvm_fmin_needs_res(): + import inspect + + return "res" in inspect.signature(nvvm.fmin).parameters + + @dsl_user_op def fmin( a: Union[float, cutlass.Float32], @@ -206,15 +214,15 @@ def fmin( loc=None, ip=None, ) -> cutlass.Float32: - return cutlass.Float32( - nvvm.fmin( - cutlass.Float32(a).ir_value(loc=loc, ip=ip), - cutlass.Float32(b).ir_value(loc=loc, ip=ip), - nan=nan, - loc=loc, - ip=ip, - ) - ) + a_val = cutlass.Float32(a).ir_value(loc=loc, ip=ip) + b_val = cutlass.Float32(b).ir_value(loc=loc, ip=ip) + if _nvvm_fmin_needs_res(): + # CUDA 12: nvvm.fmin(res, a, b, ...) + result = nvvm.fmin(T.f32(), a_val, b_val, nan=nan, loc=loc, ip=ip) + else: + # CUDA 13: nvvm.fmin(a, b, ...) 
+ result = nvvm.fmin(a_val, b_val, nan=nan, loc=loc, ip=ip) + return cutlass.Float32(result) def sigmoid_f32( From dc45dd2978fd0a5c587425b8b4e65a0c7e63fa01 Mon Sep 17 00:00:00 2001 From: Mindy Li <11663212+limin2021@users.noreply.github.com> Date: Mon, 23 Mar 2026 18:35:55 -0700 Subject: [PATCH 7/7] remove cutlass-dsl version override from CI config CI has passed with the 4.4.2 fixes, so remove the temporary override per reviewer request. Co-Authored-By: Claude Opus 4.6 --- ci/setup_python.env | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/setup_python.env b/ci/setup_python.env index 1cba321988..0305a6329b 100644 --- a/ci/setup_python.env +++ b/ci/setup_python.env @@ -17,4 +17,4 @@ # TVM_FFI_REF= # Uncomment to override nvidia-cutlass-dsl version: -CUTLASS_DSL_VERSION=4.4.2 +# CUTLASS_DSL_VERSION=
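
Should the override be needed again (for example, to test a newer
cutlass-dsl release before the Docker image is rebuilt), uncomment
CUTLASS_DSL_VERSION in ci/setup_python.env, or set it for a one-off
run. A sketch, assuming scripts/setup_test_env.sh is invoked directly:

    # One-off local override; the script cleanly uninstalls the old
    # packages and adds the [cu13] extra when torch reports CUDA 13.
    CUTLASS_DSL_VERSION=4.4.2 bash scripts/setup_test_env.sh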