Skip to content

Commit 873b8d5

Browse files
authored
[flang][cuda][NFC] Use NVVM operation for thread syncs (#166999)
Use the operation introduced in #166698. Also split the test into a new file since `flang/test/Lower/CUDA/cuda-device-proc.cuf` is getting too big. I'm planning to reorganize this file to have better separation of the tests.
1 parent c21cd52 commit 873b8d5

File tree

3 files changed

+20
-24
lines changed

3 files changed

+20
-24
lines changed

flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp

Lines changed: 6 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1309,34 +1309,22 @@ CUDAIntrinsicLibrary::genThisWarp(mlir::Type resultType,
13091309
// THREADFENCE
13101310
void CUDAIntrinsicLibrary::genThreadFence(
13111311
llvm::ArrayRef<fir::ExtendedValue> args) {
1312-
constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.gl";
1313-
mlir::FunctionType funcType =
1314-
mlir::FunctionType::get(builder.getContext(), {}, {});
1315-
auto funcOp = builder.createFunction(loc, funcName, funcType);
1316-
llvm::SmallVector<mlir::Value> noArgs;
1317-
fir::CallOp::create(builder, loc, funcOp, noArgs);
1312+
assert(args.size() == 0);
1313+
mlir::NVVM::MembarOp::create(builder, loc, mlir::NVVM::MemScopeKind::GPU);
13181314
}
13191315

13201316
// THREADFENCE_BLOCK
13211317
void CUDAIntrinsicLibrary::genThreadFenceBlock(
13221318
llvm::ArrayRef<fir::ExtendedValue> args) {
1323-
constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.cta";
1324-
mlir::FunctionType funcType =
1325-
mlir::FunctionType::get(builder.getContext(), {}, {});
1326-
auto funcOp = builder.createFunction(loc, funcName, funcType);
1327-
llvm::SmallVector<mlir::Value> noArgs;
1328-
fir::CallOp::create(builder, loc, funcOp, noArgs);
1319+
assert(args.size() == 0);
1320+
mlir::NVVM::MembarOp::create(builder, loc, mlir::NVVM::MemScopeKind::CTA);
13291321
}
13301322

13311323
// THREADFENCE_SYSTEM
13321324
void CUDAIntrinsicLibrary::genThreadFenceSystem(
13331325
llvm::ArrayRef<fir::ExtendedValue> args) {
1334-
constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.sys";
1335-
mlir::FunctionType funcType =
1336-
mlir::FunctionType::get(builder.getContext(), {}, {});
1337-
auto funcOp = builder.createFunction(loc, funcName, funcType);
1338-
llvm::SmallVector<mlir::Value> noArgs;
1339-
fir::CallOp::create(builder, loc, funcOp, noArgs);
1326+
assert(args.size() == 0);
1327+
mlir::NVVM::MembarOp::create(builder, loc, mlir::NVVM::MemScopeKind::SYS);
13401328
}
13411329

13421330
// TMA_BULK_COMMIT_GROUP

flang/test/Lower/CUDA/cuda-device-proc.cuf

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,6 @@ attributes(global) subroutine devsub()
2222

2323
call syncthreads()
2424
call syncwarp(1)
25-
call threadfence()
26-
call threadfence_block()
27-
call threadfence_system()
2825
ret = syncthreads_and(1)
2926
res = syncthreads_and(tid > offset)
3027
ret = syncthreads_count(1)
@@ -106,9 +103,6 @@ end
106103
! CHECK-LABEL: func.func @_QPdevsub() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
107104
! CHECK: nvvm.barrier0
108105
! CHECK: nvvm.bar.warp.sync %c1{{.*}} : i32
109-
! CHECK: fir.call @llvm.nvvm.membar.gl() fastmath<contract> : () -> ()
110-
! CHECK: fir.call @llvm.nvvm.membar.cta() fastmath<contract> : () -> ()
111-
! CHECK: fir.call @llvm.nvvm.membar.sys() fastmath<contract> : () -> ()
112106
! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.and(%c1{{.*}}) fastmath<contract> : (i32) -> i32
113107
! CHECK: %[[A:.*]] = fir.load %{{.*}} : !fir.ref<i32>
114108
! CHECK: %[[B:.*]] = fir.load %{{.*}} : !fir.ref<i32>
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
2+
3+
! Test CUDA Fortran instrinsics lowerings for synchronization.
4+
5+
attributes(global) subroutine sync()
6+
call threadfence()
7+
call threadfence_block()
8+
call threadfence_system()
9+
end subroutine
10+
11+
! CHECK-LABEL: func.func @_QPsync() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
12+
! CHECK: nvvm.memory.barrier <gpu>
13+
! CHECK: nvvm.memory.barrier <cta>
14+
! CHECK: nvvm.memory.barrier <sys>

0 commit comments

Comments
 (0)