From fa56a80edd4dfe8e3f5245d5efc2ad2be2bbf48b Mon Sep 17 00:00:00 2001
From: Valentin Clement
Date: Fri, 7 Nov 2025 11:51:57 -0800
Subject: [PATCH] [flang][cuda][NFC] Simplify thread fence lowering

---
 .../Optimizer/Builder/CUDAIntrinsicCall.h     |  3 +--
 .../Optimizer/Builder/CUDAIntrinsicCall.cpp   | 25 +++++--------------
 2 files changed, 7 insertions(+), 21 deletions(-)

diff --git a/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h
index d735ce95a83dc..ae7d566920656 100644
--- a/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h
@@ -63,9 +63,8 @@ struct CUDAIntrinsicLibrary : IntrinsicLibrary {
   mlir::Value genThisGrid(mlir::Type, llvm::ArrayRef<mlir::Value>);
   mlir::Value genThisThreadBlock(mlir::Type, llvm::ArrayRef<mlir::Value>);
   mlir::Value genThisWarp(mlir::Type, llvm::ArrayRef<mlir::Value>);
+  template <mlir::NVVM::MemScopeKind scope>
   void genThreadFence(llvm::ArrayRef<fir::ExtendedValue>);
-  void genThreadFenceBlock(llvm::ArrayRef<fir::ExtendedValue>);
-  void genThreadFenceSystem(llvm::ArrayRef<fir::ExtendedValue>);
   void genTMABulkCommitGroup(llvm::ArrayRef<fir::ExtendedValue>);
   void genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue>);
   void genTMABulkLoadC4(llvm::ArrayRef<fir::ExtendedValue>);
diff --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
index 18b56d384b479..323d1ef78e65d 100644
--- a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
@@ -472,17 +472,17 @@ static constexpr IntrinsicHandler cudaHandlers[]{
      /*isElemental=*/false},
     {"threadfence",
      static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
-         &CI::genThreadFence),
+         &CI::genThreadFence<mlir::NVVM::MemScopeKind::GPU>),
      {},
      /*isElemental=*/false},
     {"threadfence_block",
      static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
-         &CI::genThreadFenceBlock),
+         &CI::genThreadFence<mlir::NVVM::MemScopeKind::CTA>),
      {},
      /*isElemental=*/false},
     {"threadfence_system",
      static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
-         &CI::genThreadFenceSystem),
+         &CI::genThreadFence<mlir::NVVM::MemScopeKind::SYS>),
      {},
      /*isElemental=*/false},
     {"tma_bulk_commit_group",
@@ -1306,25 +1306,12 @@ CUDAIntrinsicLibrary::genThisWarp(mlir::Type resultType,
   return res;
 }
 
-// THREADFENCE
+// THREADFENCE, THREADFENCE_BLOCK, THREADFENCE_SYSTEM
+template <mlir::NVVM::MemScopeKind scope>
 void CUDAIntrinsicLibrary::genThreadFence(
     llvm::ArrayRef<fir::ExtendedValue> args) {
   assert(args.size() == 0);
-  mlir::NVVM::MembarOp::create(builder, loc, mlir::NVVM::MemScopeKind::GPU);
-}
-
-// THREADFENCE_BLOCK
-void CUDAIntrinsicLibrary::genThreadFenceBlock(
-    llvm::ArrayRef<fir::ExtendedValue> args) {
-  assert(args.size() == 0);
-  mlir::NVVM::MembarOp::create(builder, loc, mlir::NVVM::MemScopeKind::CTA);
-}
-
-// THREADFENCE_SYSTEM
-void CUDAIntrinsicLibrary::genThreadFenceSystem(
-    llvm::ArrayRef<fir::ExtendedValue> args) {
-  assert(args.size() == 0);
-  mlir::NVVM::MembarOp::create(builder, loc, mlir::NVVM::MemScopeKind::SYS);
+  mlir::NVVM::MembarOp::create(builder, loc, scope);
 }
 
 // TMA_BULK_COMMIT_GROUP