Skip to content

Commit 9f06843

Browse files
authored
[flang][cuda] Add interface and lowering for fence_proxy_async (llvm#163014)
Part of the TMA operations defined here: https://docs.nvidia.com/hpc-sdk/compilers/cuda-fortran-prog-guide/#load-and-store-functions-using-bulk-tma-operations
1 parent ff79c71 commit 9f06843

File tree

4 files changed

+29
-0
lines changed

4 files changed

+29
-0
lines changed

flang/include/flang/Optimizer/Builder/IntrinsicCall.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,7 @@ struct IntrinsicLibrary {
274274
llvm::ArrayRef<fir::ExtendedValue>);
275275
template <Extremum, ExtremumBehavior>
276276
mlir::Value genExtremum(mlir::Type, llvm::ArrayRef<mlir::Value>);
277+
/// Lowering entry point for the CUDA `fence_proxy_async` intrinsic
/// subroutine. Returns nothing: the intrinsic yields no result value.
void genFenceProxyAsync(llvm::ArrayRef<fir::ExtendedValue>);
277278
mlir::Value genFloor(mlir::Type, llvm::ArrayRef<mlir::Value>);
278279
mlir::Value genFraction(mlir::Type resultType,
279280
mlir::ArrayRef<mlir::Value> args);

flang/lib/Optimizer/Builder/IntrinsicCall.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -502,6 +502,10 @@ static constexpr IntrinsicHandler handlers[]{
502502
&I::genExtendsTypeOf,
503503
{{{"a", asBox}, {"mold", asBox}}},
504504
/*isElemental=*/false},
505+
{"fence_proxy_async",
506+
&I::genFenceProxyAsync,
507+
{},
508+
/*isElemental=*/false},
505509
{"findloc",
506510
&I::genFindloc,
507511
{{{"array", asBox},
@@ -4367,6 +4371,17 @@ IntrinsicLibrary::genExtendsTypeOf(mlir::Type resultType,
43674371
fir::getBase(args[1])));
43684372
}
43694373

4374+
// FENCE_PROXY_ASYNC (CUDA)
4375+
void IntrinsicLibrary::genFenceProxyAsync(
4376+
llvm::ArrayRef<fir::ExtendedValue> args) {
4377+
assert(args.size() == 0);
4378+
auto kind = mlir::NVVM::ProxyKindAttr::get(
4379+
builder.getContext(), mlir::NVVM::ProxyKind::async_shared);
4380+
auto space = mlir::NVVM::SharedSpaceAttr::get(
4381+
builder.getContext(), mlir::NVVM::SharedSpace::shared_cta);
4382+
mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space);
4383+
}
4384+
43704385
// FINDLOC
43714386
fir::ExtendedValue
43724387
IntrinsicLibrary::genFindloc(mlir::Type resultType,

flang/module/cudadevice.f90

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2008,6 +2008,11 @@ attributes(device) function barrier_arrive_cnt(barrier, count) result(token)
20082008
end function
20092009
end interface
20102010

2011+
! Device subroutine fence_proxy_async: takes no arguments and returns no value.
interface
2012+
attributes(device) subroutine fence_proxy_async()
2013+
end subroutine
2014+
end interface
2015+
20112016
interface
20122017
attributes(device) subroutine tma_bulk_commit_group()
20132018
end subroutine

flang/test/Lower/CUDA/cuda-device-proc.cuf

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -421,6 +421,14 @@ end subroutine
421421
! CHECK: %[[SHARED_PTR:.*]] = llvm.addrspacecast %[[LLVM_PTR]] : !llvm.ptr to !llvm.ptr<3>
422422
! CHECK: nvvm.mbarrier.arrive.expect_tx %[[SHARED_PTR]], %{{.*}} : !llvm.ptr<3>, i32
423423

424+
425+
! Global kernel whose only statement is a call to fence_proxy_async; the
! expected lowering is an nvvm.fence.proxy operation.
attributes(global) subroutine test_fence()
426+
call fence_proxy_async()
427+
end subroutine
428+
429+
! CHECK-LABEL: func.func @_QPtest_fence()
430+
! CHECK: nvvm.fence.proxy {kind = #nvvm.proxy_kind<async.shared>, space = #nvvm.shared_space<cta>}
431+
424432
attributes(global) subroutine test_tma()
425433
call tma_bulk_commit_group()
426434
call tma_bulk_wait_group()

0 commit comments

Comments
 (0)