From 9319733c5fc19b52061f21debb49713862ca0587 Mon Sep 17 00:00:00 2001 From: Valentin Clement Date: Wed, 3 Dec 2025 13:31:04 -0800 Subject: [PATCH] [flang][cuda] Do not use address cast for src and dst in TMA bulk load --- flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp | 6 ++---- flang/test/Lower/CUDA/cuda-device-proc.cuf | 14 +++++++------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp index ae6120826f8d2..67af481cec31a 100644 --- a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp @@ -1506,10 +1506,8 @@ static void genTMABulkLoad(fir::FirOpBuilder &builder, mlir::Location loc, auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); barrier = builder.createConvert(loc, llvmPtrTy, barrier); setAlignment(dst, 16); - dst = convertPtrToNVVMSpace(builder, loc, dst, - mlir::NVVM::NVVMMemorySpace::Shared); - src = convertPtrToNVVMSpace(builder, loc, src, - mlir::NVVM::NVVMMemorySpace::Shared); + dst = builder.createConvert(loc, llvmPtrTy, dst); + src = builder.createConvert(loc, llvmPtrTy, src); mlir::NVVM::InlinePtxOp::create( builder, loc, mlir::TypeRange{}, {dst, src, size, barrier}, {}, "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], " diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 1e3c66307c334..7f350944d70f6 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -543,7 +543,7 @@ end subroutine ! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 8 : i32 ! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32 ! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref) -> !llvm.ptr -! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr<3>, !llvm.ptr<3>, i32, !llvm.ptr) +! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr, !llvm.ptr, i32, !llvm.ptr) ! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32) attributes(global) subroutine test_tma_bulk_load_c8(a, n) @@ -563,7 +563,7 @@ end subroutine ! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 16 : i32 ! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32 ! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref) -> !llvm.ptr -! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr<3>, !llvm.ptr<3>, i32, !llvm.ptr) +! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr, !llvm.ptr, i32, !llvm.ptr) ! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32) attributes(global) subroutine test_tma_bulk_load_i4(a, n) @@ -583,7 +583,7 @@ end subroutine ! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 4 : i32 ! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32 ! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref) -> !llvm.ptr -! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr<3>, !llvm.ptr<3>, i32, !llvm.ptr) +! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr, !llvm.ptr, i32, !llvm.ptr) ! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32) attributes(global) subroutine test_tma_bulk_load_i8(a, n) @@ -603,7 +603,7 @@ end subroutine ! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 8 : i32 ! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32 ! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref) -> !llvm.ptr -! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr<3>, !llvm.ptr<3>, i32, !llvm.ptr) +! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr, !llvm.ptr, i32, !llvm.ptr) ! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32) attributes(global) subroutine test_tma_bulk_load_r2(a, n) @@ -623,7 +623,7 @@ end subroutine ! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 2 : i32 ! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32 ! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref) -> !llvm.ptr -! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr<3>, !llvm.ptr<3>, i32, !llvm.ptr) +! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr, !llvm.ptr, i32, !llvm.ptr) ! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32) attributes(global) subroutine test_tma_bulk_load_r4(a, n) @@ -643,7 +643,7 @@ end subroutine ! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 4 : i32 ! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32 ! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref) -> !llvm.ptr -! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr<3>, !llvm.ptr<3>, i32, !llvm.ptr) +! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr, !llvm.ptr, i32, !llvm.ptr) ! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32) attributes(global) subroutine test_tma_bulk_load_r8(a, n) @@ -663,7 +663,7 @@ end subroutine ! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 8 : i32 ! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32 ! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref) -> !llvm.ptr -! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr<3>, !llvm.ptr<3>, i32, !llvm.ptr) +! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr, !llvm.ptr, i32, !llvm.ptr) ! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32) attributes(global) subroutine test_tma_bulk_store_c4(c, n)