Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions flang/include/flang/Optimizer/Builder/IntrinsicCall.h
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,7 @@ struct IntrinsicLibrary {
llvm::ArrayRef<fir::ExtendedValue>);
template <Extremum, ExtremumBehavior>
mlir::Value genExtremum(mlir::Type, llvm::ArrayRef<mlir::Value>);
void genFenceProxyAsync(llvm::ArrayRef<fir::ExtendedValue>);
mlir::Value genFloor(mlir::Type, llvm::ArrayRef<mlir::Value>);
mlir::Value genFraction(mlir::Type resultType,
mlir::ArrayRef<mlir::Value> args);
Expand Down
15 changes: 15 additions & 0 deletions flang/lib/Optimizer/Builder/IntrinsicCall.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -502,6 +502,10 @@ static constexpr IntrinsicHandler handlers[]{
&I::genExtendsTypeOf,
{{{"a", asBox}, {"mold", asBox}}},
/*isElemental=*/false},
{"fence_proxy_async",
&I::genFenceProxyAsync,
{},
/*isElemental=*/false},
{"findloc",
&I::genFindloc,
{{{"array", asBox},
Expand Down Expand Up @@ -4362,6 +4366,17 @@ IntrinsicLibrary::genExtendsTypeOf(mlir::Type resultType,
fir::getBase(args[1])));
}

// FENCE_PROXY_ASYNC (CUDA)
void IntrinsicLibrary::genFenceProxyAsync(
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 0);
auto kind = mlir::NVVM::ProxyKindAttr::get(
builder.getContext(), mlir::NVVM::ProxyKind::async_shared);
auto space = mlir::NVVM::SharedSpaceAttr::get(
builder.getContext(), mlir::NVVM::SharedSpace::shared_cta);
mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space);
}

// FINDLOC
fir::ExtendedValue
IntrinsicLibrary::genFindloc(mlir::Type resultType,
Expand Down
5 changes: 5 additions & 0 deletions flang/module/cudadevice.f90
Original file line number Diff line number Diff line change
Expand Up @@ -2008,6 +2008,11 @@ attributes(device) function barrier_arrive_cnt(barrier, count) result(token)
end function
end interface

interface
attributes(device) subroutine fence_proxy_async()
end subroutine
end interface

interface
attributes(device) subroutine tma_bulk_commit_group()
end subroutine
Expand Down
10 changes: 9 additions & 1 deletion flang/test/Lower/CUDA/cuda-device-proc.cuf
Original file line number Diff line number Diff line change
Expand Up @@ -420,11 +420,19 @@ end subroutine
! CHECK: %[[SHARED_PTR:.*]] = llvm.addrspacecast %[[LLVM_PTR]] : !llvm.ptr to !llvm.ptr<3>
! CHECK: nvvm.mbarrier.arrive.expect_tx %[[SHARED_PTR]], %{{.*}} : !llvm.ptr<3>, i32


attributes(global) subroutine test_fence()
call fence_proxy_async()
end subroutine

! CHECK-LABEL: func.func @_QPtest_fence()
! CHECK: nvvm.fence.proxy {kind = #nvvm.proxy_kind<async.shared>, space = #nvvm.shared_space<cta>}

attributes(global) subroutine test_tma()
call tma_bulk_commit_group()
call tma_bulk_wait_group()
end subroutine

! CHECK-LABEL: func.func @_QPtest_tma()
! CHECK: nvvm.cp.async.bulk.commit.group
! CHECK: nvvm.cp.async.bulk.wait_group 0
! CHECK: nvvm.cp.async.bulk.wait_group 0
Loading