Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions flang/include/flang/Optimizer/Builder/IntrinsicCall.h
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,13 @@ struct IntrinsicLibrary {
mlir::Value genTime(mlir::Type, llvm::ArrayRef<mlir::Value>);
void genTMABulkCommitGroup(llvm::ArrayRef<fir::ExtendedValue>);
void genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue>);
void genTMABulkLoadC4(llvm::ArrayRef<fir::ExtendedValue>);
void genTMABulkLoadC8(llvm::ArrayRef<fir::ExtendedValue>);
void genTMABulkLoadI4(llvm::ArrayRef<fir::ExtendedValue>);
void genTMABulkLoadI8(llvm::ArrayRef<fir::ExtendedValue>);
void genTMABulkLoadR2(llvm::ArrayRef<fir::ExtendedValue>);
void genTMABulkLoadR4(llvm::ArrayRef<fir::ExtendedValue>);
void genTMABulkLoadR8(llvm::ArrayRef<fir::ExtendedValue>);
void genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue>);
void genTMABulkWaitGroup(llvm::ArrayRef<fir::ExtendedValue>);
mlir::Value genTrailz(mlir::Type, llvm::ArrayRef<mlir::Value>);
Expand Down
136 changes: 136 additions & 0 deletions flang/lib/Optimizer/Builder/IntrinsicCall.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1045,6 +1045,55 @@ static constexpr IntrinsicHandler handlers[]{
{"dst", asAddr},
{"nbytes", asValue}}},
/*isElemental=*/false},
{"tma_bulk_ldc4",
&I::genTMABulkLoadC4,
{{{"barrier", asAddr},
{"src", asAddr},
{"dst", asAddr},
{"nelems", asValue}}},
/*isElemental=*/false},
{"tma_bulk_ldc8",
&I::genTMABulkLoadC8,
{{{"barrier", asAddr},
{"src", asAddr},
{"dst", asAddr},
{"nelems", asValue}}},
/*isElemental=*/false},
{"tma_bulk_ldi4",
&I::genTMABulkLoadI4,
{{{"barrier", asAddr},
{"src", asAddr},
{"dst", asAddr},
{"nelems", asValue}}},
/*isElemental=*/false},
{"tma_bulk_ldi8",
&I::genTMABulkLoadI8,
{{{"barrier", asAddr},
{"src", asAddr},
{"dst", asAddr},
{"nelems", asValue}}},
/*isElemental=*/false},
{"tma_bulk_ldr2",
&I::genTMABulkLoadR2,
{{{"barrier", asAddr},
{"src", asAddr},
{"dst", asAddr},
{"nelems", asValue}}},
/*isElemental=*/false},
{"tma_bulk_ldr4",
&I::genTMABulkLoadR4,
{{{"barrier", asAddr},
{"src", asAddr},
{"dst", asAddr},
{"nelems", asValue}}},
/*isElemental=*/false},
{"tma_bulk_ldr8",
&I::genTMABulkLoadR8,
{{{"barrier", asAddr},
{"src", asAddr},
{"dst", asAddr},
{"nelems", asValue}}},
/*isElemental=*/false},
{"tma_bulk_s2g",
&I::genTMABulkS2G,
{{{"src", asAddr}, {"dst", asAddr}, {"nbytes", asValue}}},
Expand Down Expand Up @@ -9278,6 +9327,93 @@ void IntrinsicLibrary::genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue> args) {
builder, loc, dst, src, barrier, fir::getBase(args[3]), {}, {});
}

static void genTMABulkLoad(fir::FirOpBuilder &builder, mlir::Location loc,
mlir::Value barrier, mlir::Value src,
mlir::Value dst, mlir::Value nelem,
mlir::Value eleSize) {
mlir::Value size = mlir::arith::MulIOp::create(builder, loc, nelem, eleSize);
auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext());
barrier = builder.createConvert(loc, llvmPtrTy, barrier);
mlir::NVVM::InlinePtxOp::create(
builder, loc, mlir::TypeRange{}, {dst, src, size, barrier}, {},
"cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], "
"[%1], %2, [%3];",
{});
mlir::NVVM::InlinePtxOp::create(
builder, loc, mlir::TypeRange{}, {barrier, size}, {},
"mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;", {});
}

// TMA_BULK_LOADC4
void IntrinsicLibrary::genTMABulkLoadC4(
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 4);
mlir::Value eleSize =
builder.createIntegerConstant(loc, builder.getI32Type(), 8);
genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
}

// TMA_BULK_LOADC8
void IntrinsicLibrary::genTMABulkLoadC8(
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 4);
mlir::Value eleSize =
builder.createIntegerConstant(loc, builder.getI32Type(), 16);
genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
}

// TMA_BULK_LOADI4
void IntrinsicLibrary::genTMABulkLoadI4(
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 4);
mlir::Value eleSize =
builder.createIntegerConstant(loc, builder.getI32Type(), 4);
genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
}

// TMA_BULK_LOADI8
void IntrinsicLibrary::genTMABulkLoadI8(
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 4);
mlir::Value eleSize =
builder.createIntegerConstant(loc, builder.getI32Type(), 8);
genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
}

// TMA_BULK_LOADR2
void IntrinsicLibrary::genTMABulkLoadR2(
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 4);
mlir::Value eleSize =
builder.createIntegerConstant(loc, builder.getI32Type(), 2);
genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
}

// TMA_BULK_LOADR4
void IntrinsicLibrary::genTMABulkLoadR4(
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 4);
mlir::Value eleSize =
builder.createIntegerConstant(loc, builder.getI32Type(), 4);
genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
}

// TMA_BULK_LOADR8
void IntrinsicLibrary::genTMABulkLoadR8(
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 4);
mlir::Value eleSize =
builder.createIntegerConstant(loc, builder.getI32Type(), 8);
genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
}

// TMA_BULK_S2G (CUDA)
void IntrinsicLibrary::genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 3);
Expand Down
61 changes: 61 additions & 0 deletions flang/module/cudadevice.f90
Original file line number Diff line number Diff line change
Expand Up @@ -2067,6 +2067,67 @@ attributes(device) subroutine tma_bulk_s2g(src, dst, nbytes)
end subroutine
end interface

! Load specific types, count is in elements
! -----------------------------------------
interface tma_bulk_load
attributes(device) subroutine tma_bulk_ldc4(barrier, src, dst, nelems)
!dir$ ignore_tkr (r) src, (r) dst
integer(8), shared :: barrier
complex(4), device :: src(*)
complex(4), shared :: dst(*)
integer(4), value :: nelems
end subroutine

attributes(device) subroutine tma_bulk_ldc8(barrier, src, dst, nelems)
!dir$ ignore_tkr (r) src, (r) dst
integer(8), shared :: barrier
complex(8), device :: src(*)
complex(8), shared :: dst(*)
integer(4), value :: nelems
end subroutine

attributes(device) subroutine tma_bulk_ldi4(barrier, src, dst, nelems)
!dir$ ignore_tkr (r) src, (r) dst
integer(8), shared :: barrier
integer(4), device :: src(*)
integer(4), shared :: dst(*)
integer(4), value :: nelems
end subroutine

attributes(device) subroutine tma_bulk_ldi8(barrier, src, dst, nelems)
!dir$ ignore_tkr (r) src, (r) dst
integer(8), shared :: barrier
integer(8), device :: src(*)
integer(8), shared :: dst(*)
integer(4), value :: nelems
end subroutine

attributes(device) subroutine tma_bulk_ldr2(barrier, src, dst, nelems)
!dir$ ignore_tkr (r) src, (r) dst
integer(8), shared :: barrier
real(2), device :: src(*)
real(2), shared :: dst(*)
integer(4), value :: nelems
end subroutine

attributes(device) subroutine tma_bulk_ldr4(barrier, src, dst, nelems)
!dir$ ignore_tkr (r) src, (r) dst
integer(8), shared :: barrier
real(4), device :: src(*)
real(4), shared :: dst(*)
integer(4), value :: nelems
end subroutine

attributes(device) subroutine tma_bulk_ldr8(barrier, src, dst, nelems)
!dir$ ignore_tkr (r) src, (r) dst
integer(8), shared :: barrier
real(8), device :: src(*)
real(8), shared :: dst(*)
integer(4), value :: nelems
end subroutine
end interface


contains

attributes(device) subroutine syncthreads()
Expand Down
133 changes: 133 additions & 0 deletions flang/test/Lower/CUDA/cuda-device-proc.cuf
Original file line number Diff line number Diff line change
Expand Up @@ -514,3 +514,136 @@ end subroutine

! CHECK-LABEL: func.func @_QPtest_barrier_try_wait_sleep()
! CHECK: %{{.*}} = nvvm.inline_ptx ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}}; selp.b32 %0, 1, 0, p;" ro(%{{.*}}, %{{.*}}, %{{.*}} : !llvm.ptr, i64, i32) -> i32

attributes(global) subroutine test_tma_bulk_load_c4(a, n)
integer(8), shared :: barrier1
integer, value :: n
complex(4), device :: r8(n)
complex(4), shared :: tmp(1024)
integer(4) :: j, elem_count
call tma_bulk_load(barrier1, r8(j), tmp, elem_count)
end subroutine

! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_c4
! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_c4Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_c4Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 8 : i32
! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr
! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !fir.ref<!fir.array<1024xcomplex<f32>>>, !fir.ref<complex<f32>>, i32, !llvm.ptr)
! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32)

attributes(global) subroutine test_tma_bulk_load_c8(a, n)
integer(8), shared :: barrier1
integer, value :: n
complex(8), device :: r8(n)
complex(8), shared :: tmp(1024)
integer(4) :: j, elem_count
call tma_bulk_load(barrier1, r8(j), tmp, elem_count)
end subroutine

! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_c8
! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_c8Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_c8Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 16 : i32
! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr
! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !fir.ref<!fir.array<1024xcomplex<f64>>>, !fir.ref<complex<f64>>, i32, !llvm.ptr)
! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32)

attributes(global) subroutine test_tma_bulk_load_i4(a, n)
integer(8), shared :: barrier1
integer, value :: n
integer(4), device :: r8(n)
integer(4), shared :: tmp(1024)
integer(4) :: j, elem_count
call tma_bulk_load(barrier1, r8(j), tmp, elem_count)
end subroutine

! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_i4
! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_i4Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_i4Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 4 : i32
! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr
! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !fir.ref<!fir.array<1024xi32>>, !fir.ref<i32>, i32, !llvm.ptr)
! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32)

attributes(global) subroutine test_tma_bulk_load_i8(a, n)
integer(8), shared :: barrier1
integer, value :: n
integer(8), device :: r8(n)
integer(8), shared :: tmp(1024)
integer(4) :: j, elem_count
call tma_bulk_load(barrier1, r8(j), tmp, elem_count)
end subroutine

! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_i8
! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_i8Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_i8Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 8 : i32
! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr
! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !fir.ref<!fir.array<1024xi64>>, !fir.ref<i64>, i32, !llvm.ptr)
! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32)

attributes(global) subroutine test_tma_bulk_load_r2(a, n)
integer(8), shared :: barrier1
integer, value :: n
real(2), device :: r8(n)
real(2), shared :: tmp(1024)
integer(4) :: j, elem_count
call tma_bulk_load(barrier1, r8(j), tmp, elem_count)
end subroutine

! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_r2
! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_r2Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_r2Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 2 : i32
! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr
! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !fir.ref<!fir.array<1024xf16>>, !fir.ref<f16>, i32, !llvm.ptr)
! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32)

attributes(global) subroutine test_tma_bulk_load_r4(a, n)
integer(8), shared :: barrier1
integer, value :: n
real(4), device :: r8(n)
real(4), shared :: tmp(1024)
integer(4) :: j, elem_count
call tma_bulk_load(barrier1, r8(j), tmp, elem_count)
end subroutine

! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_r4
! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_r4Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_r4Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 4 : i32
! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr
! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !fir.ref<!fir.array<1024xf32>>, !fir.ref<f32>, i32, !llvm.ptr)
! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32)

attributes(global) subroutine test_tma_bulk_load_r8(a, n)
integer(8), shared :: barrier1
integer, value :: n
real(8), device :: r8(n)
real(8), shared :: tmp(1024)
integer(4) :: j, elem_count
call tma_bulk_load(barrier1, r8(j), tmp, elem_count)
end subroutine

! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_r8
! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_r8Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_r8Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 8 : i32
! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr
! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !fir.ref<!fir.array<1024xf64>>, !fir.ref<f64>, i32, !llvm.ptr)
! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32)