[flang][cuda] Add interface and lowering for tma_bulk_s2g #163232

clementval · 2025-10-13T17:55:33Z

https://docs.nvidia.com/hpc-sdk/compilers/cuda-fortran-prog-guide/index.html#load-and-store-functions-using-cache-hints

llvmbot · 2025-10-13T17:56:08Z

@llvm/pr-subscribers-flang-fir-hlfir

Author: Valentin Clement (バレンタインクレメン) (clementval)

Changes

https://docs.nvidia.com/hpc-sdk/compilers/cuda-fortran-prog-guide/index.html#load-and-store-functions-using-cache-hints

Full diff: https://github.com/llvm/llvm-project/pull/163232.diff

4 Files Affected:

(modified) flang/include/flang/Optimizer/Builder/IntrinsicCall.h (+1)
(modified) flang/lib/Optimizer/Builder/IntrinsicCall.cpp (+15)
(modified) flang/module/cudadevice.f90 (+9)
(modified) flang/test/Lower/CUDA/cuda-device-proc.cuf (+10)

diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
index 2adfd6f2510d4..c3cd119b96174 100644
--- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
@@ -459,6 +459,7 @@ struct IntrinsicLibrary {
   mlir::Value genTime(mlir::Type, llvm::ArrayRef<mlir::Value>);
   void genTMABulkCommitGroup(llvm::ArrayRef<fir::ExtendedValue>);
   void genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue>);
+  void genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue>);
   void genTMABulkWaitGroup(llvm::ArrayRef<fir::ExtendedValue>);
   mlir::Value genTrailz(mlir::Type, llvm::ArrayRef<mlir::Value>);
   fir::ExtendedValue genTransfer(mlir::Type,
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 5fe2a76128e0d..e07baafcef0d7 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -1027,6 +1027,10 @@ static constexpr IntrinsicHandler handlers[]{
        {"dst", asAddr},
        {"nbytes", asValue}}},
      /*isElemental=*/false},
+    {"tma_bulk_s2g",
+     &I::genTMABulkS2G,
+     {{{"src", asAddr}, {"dst", asAddr}, {"nbytes", asValue}}},
+     /*isElemental=*/false},
     {"tma_bulk_wait_group",
      &I::genTMABulkWaitGroup,
      {{}},
@@ -9227,6 +9231,17 @@ void IntrinsicLibrary::genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue> args) {
       builder, loc, dst, src, barrier, fir::getBase(args[3]), {}, {});
 }
 
+// TMA_BULK_S2G (CUDA): bulk copy of nbytes from shared src to global dst.
+void IntrinsicLibrary::genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue> args) {
+  assert(args.size() == 3); // args are (src, dst, nbytes)
+  mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[0]),
+                                          mlir::NVVM::NVVMMemorySpace::Shared);
+  mlir::Value dst = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]),
+                                          mlir::NVVM::NVVMMemorySpace::Global);
+  mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create(
+      builder, loc, dst, src, fir::getBase(args[2]), {}, {});
+}
+
 // TMA_BULK_WAIT_GROUP (CUDA)
 void IntrinsicLibrary::genTMABulkWaitGroup(
     llvm::ArrayRef<fir::ExtendedValue> args) {
diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90
index a8b9aa8b57ef9..22df9cdf410d5 100644
--- a/flang/module/cudadevice.f90
+++ b/flang/module/cudadevice.f90
@@ -2034,6 +2034,15 @@ attributes(device) subroutine tma_bulk_g2s(barrier, src, dst, nbytes)
     end subroutine
   end interface
 
+  interface
+    attributes(device) subroutine tma_bulk_s2g(src, dst, nbytes)
+      !dir$ ignore_tkr src, dst
+      integer(4), shared  :: src(*) ! source buffer (shared attribute)
+      integer(4), device  :: dst(*) ! destination buffer (device attribute)
+      integer(4), value   :: nbytes ! passed by value; presumably the byte count to copy
+    end subroutine
+  end interface
+
 contains
 
   attributes(device) subroutine syncthreads()
diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf
index 83ee0118638b2..728d65256c528 100644
--- a/flang/test/Lower/CUDA/cuda-device-proc.cuf
+++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf
@@ -448,3 +448,13 @@ end subroutine
 
 ! CHECK-LABEL: func.func @_QPtest_bulk_g2s
 ! CHECK: nvvm.cp.async.bulk.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : <7>, <1>
+
+attributes(global) subroutine test_bulk_s2g(c, a, b, n)
+  real(8), device :: a(*)       ! destination of the copy (device array)
+  real(8), shared :: tmpa(1024) ! source of the copy (shared array)
+  integer(4) :: tx_count        ! size argument forwarded to tma_bulk_s2g
+  call tma_bulk_s2g(tmpa, a(j), tx_count)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_bulk_s2g
+! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3>

flang/test/Lower/CUDA/cuda-device-proc.cuf

https://docs.nvidia.com/hpc-sdk/compilers/cuda-fortran-prog-guide/index.html#load-and-store-functions-using-cache-hints

[flang][cuda] Add interface and lowering for tma_bulk_s2g

59106cd

clementval requested a review from wangzpgi October 13, 2025 17:55

llvmbot added the "flang" (Flang issues not falling into any other category) and "flang:fir-hlfir" labels Oct 13, 2025

wangzpgi reviewed Oct 13, 2025

View reviewed changes

flang/test/Lower/CUDA/cuda-device-proc.cuf Outdated Show resolved Hide resolved

clean up dummies

d1e2a7a

wangzpgi approved these changes Oct 13, 2025

View reviewed changes

clementval enabled auto-merge (squash) October 13, 2025 18:44

clementval merged commit 32adfb5 into llvm:main Oct 13, 2025
8 of 10 checks passed

clementval deleted the cuf_test_bulk_s2g branch October 13, 2025 18:53

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[flang][cuda] Add interface and lowering for tma_bulk_s2g #163232

[flang][cuda] Add interface and lowering for tma_bulk_s2g #163232

Uh oh!

clementval commented Oct 13, 2025

Uh oh!

llvmbot commented Oct 13, 2025

Uh oh!

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants

[flang][cuda] Add interface and lowering for tma_bulk_s2g #163232

[flang][cuda] Add interface and lowering for tma_bulk_s2g #163232

Uh oh!

Conversation

clementval commented Oct 13, 2025

Uh oh!

llvmbot commented Oct 13, 2025

Uh oh!

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants