Skip to content

Commit f00d353

Browse files
authored
[flang][cuda][NFC] Simplify thread fence lowering (llvm#167009)
Use a single templated function to generate the three kinds of thread fence so that the duplicated code can be removed.
1 parent 682c8e2 commit f00d353

File tree

2 files changed

+7
-21
lines changed

2 files changed

+7
-21
lines changed

flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -63,9 +63,8 @@ struct CUDAIntrinsicLibrary : IntrinsicLibrary {
6363
mlir::Value genThisGrid(mlir::Type, llvm::ArrayRef<mlir::Value>);
6464
mlir::Value genThisThreadBlock(mlir::Type, llvm::ArrayRef<mlir::Value>);
6565
mlir::Value genThisWarp(mlir::Type, llvm::ArrayRef<mlir::Value>);
66+
template <mlir::NVVM::MemScopeKind scope>
6667
void genThreadFence(llvm::ArrayRef<fir::ExtendedValue>);
67-
void genThreadFenceBlock(llvm::ArrayRef<fir::ExtendedValue>);
68-
void genThreadFenceSystem(llvm::ArrayRef<fir::ExtendedValue>);
6968
void genTMABulkCommitGroup(llvm::ArrayRef<fir::ExtendedValue>);
7069
void genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue>);
7170
void genTMABulkLoadC4(llvm::ArrayRef<fir::ExtendedValue>);

flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp

Lines changed: 6 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -472,17 +472,17 @@ static constexpr IntrinsicHandler cudaHandlers[]{
472472
/*isElemental=*/false},
473473
{"threadfence",
474474
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
475-
&CI::genThreadFence),
475+
&CI::genThreadFence<mlir::NVVM::MemScopeKind::GPU>),
476476
{},
477477
/*isElemental=*/false},
478478
{"threadfence_block",
479479
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
480-
&CI::genThreadFenceBlock),
480+
&CI::genThreadFence<mlir::NVVM::MemScopeKind::CTA>),
481481
{},
482482
/*isElemental=*/false},
483483
{"threadfence_system",
484484
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
485-
&CI::genThreadFenceSystem),
485+
&CI::genThreadFence<mlir::NVVM::MemScopeKind::SYS>),
486486
{},
487487
/*isElemental=*/false},
488488
{"tma_bulk_commit_group",
@@ -1306,25 +1306,12 @@ CUDAIntrinsicLibrary::genThisWarp(mlir::Type resultType,
13061306
return res;
13071307
}
13081308

1309-
// THREADFENCE
1309+
// THREADFENCE, THREADFENCE_BLOCK, THREADFENCE_SYSTEM
1310+
template <mlir::NVVM::MemScopeKind scope>
13101311
void CUDAIntrinsicLibrary::genThreadFence(
13111312
llvm::ArrayRef<fir::ExtendedValue> args) {
13121313
assert(args.size() == 0);
1313-
mlir::NVVM::MembarOp::create(builder, loc, mlir::NVVM::MemScopeKind::GPU);
1314-
}
1315-
1316-
// THREADFENCE_BLOCK
1317-
void CUDAIntrinsicLibrary::genThreadFenceBlock(
1318-
llvm::ArrayRef<fir::ExtendedValue> args) {
1319-
assert(args.size() == 0);
1320-
mlir::NVVM::MembarOp::create(builder, loc, mlir::NVVM::MemScopeKind::CTA);
1321-
}
1322-
1323-
// THREADFENCE_SYSTEM
1324-
void CUDAIntrinsicLibrary::genThreadFenceSystem(
1325-
llvm::ArrayRef<fir::ExtendedValue> args) {
1326-
assert(args.size() == 0);
1327-
mlir::NVVM::MembarOp::create(builder, loc, mlir::NVVM::MemScopeKind::SYS);
1314+
mlir::NVVM::MembarOp::create(builder, loc, scope);
13281315
}
13291316

13301317
// TMA_BULK_COMMIT_GROUP

0 commit comments

Comments
 (0)