Skip to content

Commit 32adfb5

Browse files
authored
1 parent 48c9a8a commit 32adfb5

File tree

4 files changed

+36
-1
lines changed

4 files changed

+36
-1
lines changed

flang/include/flang/Optimizer/Builder/IntrinsicCall.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -459,6 +459,7 @@ struct IntrinsicLibrary {
459459
mlir::Value genTime(mlir::Type, llvm::ArrayRef<mlir::Value>);
460460
void genTMABulkCommitGroup(llvm::ArrayRef<fir::ExtendedValue>);
461461
void genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue>);
462+
void genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue>);
462463
void genTMABulkWaitGroup(llvm::ArrayRef<fir::ExtendedValue>);
463464
mlir::Value genTrailz(mlir::Type, llvm::ArrayRef<mlir::Value>);
464465
fir::ExtendedValue genTransfer(mlir::Type,

flang/lib/Optimizer/Builder/IntrinsicCall.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1027,6 +1027,10 @@ static constexpr IntrinsicHandler handlers[]{
10271027
{"dst", asAddr},
10281028
{"nbytes", asValue}}},
10291029
/*isElemental=*/false},
1030+
{"tma_bulk_s2g",
1031+
&I::genTMABulkS2G,
1032+
{{{"src", asAddr}, {"dst", asAddr}, {"nbytes", asValue}}},
1033+
/*isElemental=*/false},
10301034
{"tma_bulk_wait_group",
10311035
&I::genTMABulkWaitGroup,
10321036
{{}},
@@ -9227,6 +9231,17 @@ void IntrinsicLibrary::genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue> args) {
92279231
builder, loc, dst, src, barrier, fir::getBase(args[3]), {}, {});
92289232
}
92299233

9234+
// TMA_BULK_S2G (CUDA)
9235+
void IntrinsicLibrary::genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue> args) {
9236+
assert(args.size() == 3);
9237+
mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[0]),
9238+
mlir::NVVM::NVVMMemorySpace::Shared);
9239+
mlir::Value dst = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]),
9240+
mlir::NVVM::NVVMMemorySpace::Global);
9241+
mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create(
9242+
builder, loc, dst, src, fir::getBase(args[2]), {}, {});
9243+
}
9244+
92309245
// TMA_BULK_WAIT_GROUP (CUDA)
92319246
void IntrinsicLibrary::genTMABulkWaitGroup(
92329247
llvm::ArrayRef<fir::ExtendedValue> args) {

flang/module/cudadevice.f90

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2034,6 +2034,15 @@ attributes(device) subroutine tma_bulk_g2s(barrier, src, dst, nbytes)
20342034
end subroutine
20352035
end interface
20362036

2037+
interface
2038+
attributes(device) subroutine tma_bulk_s2g(src, dst, nbytes)
2039+
!dir$ ignore_tkr src, dst
2040+
integer(4), shared :: src(*)
2041+
integer(4), device :: dst(*)
2042+
integer(4), value :: nbytes
2043+
end subroutine
2044+
end interface
2045+
20372046
contains
20382047

20392048
attributes(device) subroutine syncthreads()

flang/test/Lower/CUDA/cuda-device-proc.cuf

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -438,7 +438,7 @@ end subroutine
438438
! CHECK: nvvm.cp.async.bulk.commit.group
439439
! CHECK: nvvm.cp.async.bulk.wait_group 0
440440

441-
attributes(global) subroutine test_bulk_g2s(c, a, b, n)
441+
attributes(global) subroutine test_bulk_g2s(a)
442442
real(8), device :: a(*)
443443
real(8), shared :: tmpa(1024)
444444
integer(8), shared :: barrier1
@@ -448,3 +448,13 @@ end subroutine
448448

449449
! CHECK-LABEL: func.func @_QPtest_bulk_g2s
450450
! CHECK: nvvm.cp.async.bulk.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : <7>, <1>
451+
452+
attributes(global) subroutine test_bulk_s2g(a)
453+
real(8), device :: a(*)
454+
real(8), shared :: tmpa(1024)
455+
integer(4) :: tx_count
456+
call tma_bulk_s2g(tmpa, a(j), tx_count)
457+
end subroutine
458+
459+
! CHECK-LABEL: func.func @_QPtest_bulk_s2g
460+
! CHECL: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3>

0 commit comments

Comments
 (0)