From ba37c9e35f0f9b750452f89122b176015c9f965b Mon Sep 17 00:00:00 2001 From: Valentin Clement Date: Sat, 11 Oct 2025 11:47:06 -0700 Subject: [PATCH] [flang][cuda] Add interfaces and lowering for tma_bulk subroutine --- .../flang/Optimizer/Builder/IntrinsicCall.h | 2 ++ flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 23 +++++++++++++++++++ flang/module/cudadevice.f90 | 10 ++++++++ flang/test/Lower/CUDA/cuda-device-proc.cuf | 9 ++++++++ 4 files changed, 44 insertions(+) diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index ca02693c53aeb..1f7da10fdcc20 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -456,6 +456,8 @@ struct IntrinsicLibrary { mlir::Value genTand(mlir::Type, llvm::ArrayRef); mlir::Value genTanpi(mlir::Type, llvm::ArrayRef); mlir::Value genTime(mlir::Type, llvm::ArrayRef); + void genTMABulkCommitGroup(llvm::ArrayRef); + void genTMABulkWaitGroup(llvm::ArrayRef); mlir::Value genTrailz(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genTransfer(mlir::Type, llvm::ArrayRef); diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index c9cf6c23a81a5..e5b70eed9926b 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -1012,6 +1012,14 @@ static constexpr IntrinsicHandler handlers[]{ {"threadfence_block", &I::genThreadFenceBlock, {}, /*isElemental=*/false}, {"threadfence_system", &I::genThreadFenceSystem, {}, /*isElemental=*/false}, {"time", &I::genTime, {}, /*isElemental=*/false}, + {"tma_bulk_commit_group", + &I::genTMABulkCommitGroup, + {{}}, + /*isElemental=*/false}, + {"tma_bulk_wait_group", + &I::genTMABulkWaitGroup, + {{}}, + /*isElemental=*/false}, {"trailz", &I::genTrailz}, {"transfer", &I::genTransfer, @@ -9169,6 +9177,21 @@ mlir::Value IntrinsicLibrary::genTime(mlir::Type resultType, fir::runtime::genTime(builder, loc)); } +// TMA_BULK_COMMIT_GROUP (CUDA) +void IntrinsicLibrary::genTMABulkCommitGroup( + llvm::ArrayRef args) { + assert(args.size() == 0); + mlir::NVVM::CpAsyncBulkCommitGroupOp::create(builder, loc); +} + +// TMA_BULK_WAIT_GROUP (CUDA) +void IntrinsicLibrary::genTMABulkWaitGroup( + llvm::ArrayRef args) { + assert(args.size() == 0); + auto group = builder.getIntegerAttr(builder.getI32Type(), 0); + mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, group, {}); +} + // TRIM fir::ExtendedValue IntrinsicLibrary::genTrim(mlir::Type resultType, diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90 index e6c9e958af365..afb39ebdf0d07 100644 --- a/flang/module/cudadevice.f90 +++ b/flang/module/cudadevice.f90 @@ -2008,6 +2008,16 @@ attributes(device) function barrier_arrive_cnt(barrier, count) result(token) end function end interface + interface + attributes(device) subroutine tma_bulk_commit_group() + end subroutine + end interface + + interface + attributes(device) subroutine tma_bulk_wait_group() + end subroutine + end interface + contains attributes(device) subroutine syncthreads() diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 1bf714010f5d3..487a6b2883f46 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -419,3 +419,12 @@ end subroutine ! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref) -> !llvm.ptr ! CHECK: %[[SHARED_PTR:.*]] = llvm.addrspacecast %[[LLVM_PTR]] : !llvm.ptr to !llvm.ptr<3> ! CHECK: nvvm.mbarrier.arrive.expect_tx %[[SHARED_PTR]], %{{.*}} : !llvm.ptr<3>, i32 + +attributes(global) subroutine test_tma() + call tma_bulk_commit_group() + call tma_bulk_wait_group() +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma() +! CHECK: nvvm.cp.async.bulk.commit.group +! CHECK: nvvm.cp.async.bulk.wait_group 0