Skip to content

Conversation

@clementval
Copy link
Contributor

Pass the allocator index as part of the allocate function. The information is part of cuf.allocate, and it is useful for device-resident components.

@clementval clementval requested a review from wangzpgi September 5, 2025 22:11
@llvmbot llvmbot added flang Flang issues not falling into any other category flang:fir-hlfir labels Sep 5, 2025
@llvmbot
Copy link
Member

llvmbot commented Sep 5, 2025

@llvm/pr-subscribers-flang-fir-hlfir

Author: Valentin Clement (バレンタイン クレメン) (clementval)

Changes

Pass the allocator index as part of the allocate function. The information is part of cuf.allocate, and it is useful for device-resident components.


Patch is 26.34 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/157189.diff

12 Files Affected:

  • (modified) flang-rt/lib/cuda/allocatable.cpp (+14-11)
  • (modified) flang-rt/lib/cuda/pointer.cpp (+19-14)
  • (modified) flang-rt/lib/runtime/CMakeLists.txt (+1-1)
  • (modified) flang/include/flang/Lower/CUDA.h (+1-15)
  • (modified) flang/include/flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h (+2)
  • (modified) flang/include/flang/Runtime/CUDA/allocatable.h (+12-10)
  • (modified) flang/include/flang/Runtime/CUDA/pointer.h (+12-10)
  • (modified) flang/lib/Lower/CUDA.cpp (+17)
  • (modified) flang/lib/Lower/ConvertVariable.cpp (+3-15)
  • (modified) flang/lib/Optimizer/Dialect/CUF/Attributes/CUFAttr.cpp (+13)
  • (modified) flang/lib/Optimizer/Transforms/CUFOpConversion.cpp (+7-5)
  • (modified) flang/test/Fir/CUDA/cuda-allocate.fir (+7-7)
diff --git a/flang-rt/lib/cuda/allocatable.cpp b/flang-rt/lib/cuda/allocatable.cpp
index ff1a225d66ce9..483b54061036d 100644
--- a/flang-rt/lib/cuda/allocatable.cpp
+++ b/flang-rt/lib/cuda/allocatable.cpp
@@ -23,11 +23,11 @@ namespace Fortran::runtime::cuda {
 extern "C" {
 RT_EXT_API_GROUP_BEGIN
 
-int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t *stream,
-    bool *pinned, bool hasStat, const Descriptor *errMsg,
+int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int32_t allocIdx,
+    int64_t *stream, bool *pinned, bool hasStat, const Descriptor *errMsg,
     const char *sourceFile, int sourceLine) {
   int stat{RTNAME(CUFAllocatableAllocate)(
-      desc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
+      desc, allocIdx, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
 #ifndef RT_DEVICE_COMPILATION
   // Descriptor synchronization is only done when the allocation is done
   // from the host.
@@ -41,9 +41,12 @@ int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t *stream,
   return stat;
 }
 
-int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t *stream,
-    bool *pinned, bool hasStat, const Descriptor *errMsg,
+int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int32_t allocIdx,
+    int64_t *stream, bool *pinned, bool hasStat, const Descriptor *errMsg,
     const char *sourceFile, int sourceLine) {
+#if !defined(RT_DEVICE_COMPILATION)
+  desc.SetAllocIdx(allocIdx);
+#endif
   // Perform the standard allocation.
   int stat{RTNAME(AllocatableAllocate)(
       desc, stream, hasStat, errMsg, sourceFile, sourceLine)};
@@ -56,10 +59,10 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t *stream,
 }
 
 int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
-    const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat,
+    const Descriptor &source, int32_t allocIdx, int64_t *stream, bool *pinned, bool hasStat,
     const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
-  int stat{RTNAME(CUFAllocatableAllocate)(
-      alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
+  int stat{RTNAME(CUFAllocatableAllocate)(alloc, allocIdx, stream, pinned,
+      hasStat, errMsg, sourceFile, sourceLine)};
   if (stat == StatOk) {
     Terminator terminator{sourceFile, sourceLine};
     Fortran::runtime::DoFromSourceAssign(
@@ -69,10 +72,10 @@ int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
 }
 
 int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
-    const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat,
+    const Descriptor &source, int32_t allocIdx, int64_t *stream, bool *pinned, bool hasStat,
     const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
-  int stat{RTNAME(CUFAllocatableAllocateSync)(
-      alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
+  int stat{RTNAME(CUFAllocatableAllocateSync)(alloc, allocIdx, stream, pinned,
+      hasStat, errMsg, sourceFile, sourceLine)};
   if (stat == StatOk) {
     Terminator terminator{sourceFile, sourceLine};
     Fortran::runtime::DoFromSourceAssign(
diff --git a/flang-rt/lib/cuda/pointer.cpp b/flang-rt/lib/cuda/pointer.cpp
index d3f5cfe8e96a1..3e450596e0f12 100644
--- a/flang-rt/lib/cuda/pointer.cpp
+++ b/flang-rt/lib/cuda/pointer.cpp
@@ -22,9 +22,12 @@ namespace Fortran::runtime::cuda {
 extern "C" {
 RT_EXT_API_GROUP_BEGIN
 
-int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t *stream, bool *pinned,
-    bool hasStat, const Descriptor *errMsg, const char *sourceFile,
-    int sourceLine) {
+int RTDEF(CUFPointerAllocate)(Descriptor &desc, int32_t allocIdx,
+    int64_t *stream, bool *pinned, bool hasStat, const Descriptor *errMsg,
+    const char *sourceFile, int sourceLine) {
+#if !defined(RT_DEVICE_COMPILATION)
+  desc.SetAllocIdx(allocIdx);
+#endif
   // Perform the standard allocation.
   int stat{
       RTNAME(PointerAllocate)(desc, hasStat, errMsg, sourceFile, sourceLine)};
@@ -36,11 +39,11 @@ int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t *stream, bool *pinned,
   return stat;
 }
 
-int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int64_t *stream,
-    bool *pinned, bool hasStat, const Descriptor *errMsg,
+int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int32_t allocIdx,
+    int64_t *stream, bool *pinned, bool hasStat, const Descriptor *errMsg,
     const char *sourceFile, int sourceLine) {
   int stat{RTNAME(CUFPointerAllocate)(
-      desc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
+      desc, allocIdx, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
 #ifndef RT_DEVICE_COMPILATION
   // Descriptor synchronization is only done when the allocation is done
   // from the host.
@@ -55,10 +58,11 @@ int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int64_t *stream,
 }
 
 int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer,
-    const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat,
-    const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
-  int stat{RTNAME(CUFPointerAllocate)(
-      pointer, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
+    const Descriptor &source, int32_t allocIdx, int64_t *stream, bool *pinned,
+    bool hasStat, const Descriptor *errMsg, const char *sourceFile,
+    int sourceLine) {
+  int stat{RTNAME(CUFPointerAllocate)(pointer, allocIdx, stream, pinned,
+      hasStat, errMsg, sourceFile, sourceLine)};
   if (stat == StatOk) {
     Terminator terminator{sourceFile, sourceLine};
     Fortran::runtime::DoFromSourceAssign(
@@ -68,10 +72,11 @@ int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer,
 }
 
 int RTDEF(CUFPointerAllocateSourceSync)(Descriptor &pointer,
-    const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat,
-    const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
-  int stat{RTNAME(CUFPointerAllocateSync)(
-      pointer, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
+    const Descriptor &source, int32_t allocIdx, int64_t *stream, bool *pinned,
+    bool hasStat, const Descriptor *errMsg, const char *sourceFile,
+    int sourceLine) {
+  int stat{RTNAME(CUFPointerAllocateSync)(pointer, allocIdx, stream, pinned,
+      hasStat, errMsg, sourceFile, sourceLine)};
   if (stat == StatOk) {
     Terminator terminator{sourceFile, sourceLine};
     Fortran::runtime::DoFromSourceAssign(
diff --git a/flang-rt/lib/runtime/CMakeLists.txt b/flang-rt/lib/runtime/CMakeLists.txt
index 6548ec955b2b8..bd4eca52d6e29 100644
--- a/flang-rt/lib/runtime/CMakeLists.txt
+++ b/flang-rt/lib/runtime/CMakeLists.txt
@@ -180,7 +180,7 @@ if ("${LLVM_RUNTIMES_TARGET}" MATCHES "^amdgcn|^nvptx")
 elseif(FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT STREQUAL "CUDA")
   # findloc.cpp has some issues with higher compute capability. Remove it
   # from CUDA build until we can lower its memory footprint.
-  list(REMOVE_ITEM supported_sources findloc.cpp)
+  
   set(sources ${supported_sources})
 else ()
   set(sources ${supported_sources} ${host_sources} ${f128_sources})
diff --git a/flang/include/flang/Lower/CUDA.h b/flang/include/flang/Lower/CUDA.h
index 4a831fd502af4..0a085f47327f2 100644
--- a/flang/include/flang/Lower/CUDA.h
+++ b/flang/include/flang/Lower/CUDA.h
@@ -31,21 +31,7 @@ namespace Fortran::lower {
 
 class AbstractConverter;
 
-static inline unsigned getAllocatorIdx(const Fortran::semantics::Symbol &sym) {
-  std::optional<Fortran::common::CUDADataAttr> cudaAttr =
-      Fortran::semantics::GetCUDADataAttr(&sym.GetUltimate());
-  if (cudaAttr) {
-    if (*cudaAttr == Fortran::common::CUDADataAttr::Pinned)
-      return kPinnedAllocatorPos;
-    if (*cudaAttr == Fortran::common::CUDADataAttr::Device)
-      return kDeviceAllocatorPos;
-    if (*cudaAttr == Fortran::common::CUDADataAttr::Managed)
-      return kManagedAllocatorPos;
-    if (*cudaAttr == Fortran::common::CUDADataAttr::Unified)
-      return kUnifiedAllocatorPos;
-  }
-  return kDefaultAllocator;
-}
+unsigned getAllocatorIdx(const Fortran::semantics::Symbol &sym);
 
 void initializeDeviceComponentAllocator(
     Fortran::lower::AbstractConverter &converter,
diff --git a/flang/include/flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h b/flang/include/flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h
index 4a250d1cc6c54..c00f9e718ad18 100644
--- a/flang/include/flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h
+++ b/flang/include/flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h
@@ -112,6 +112,8 @@ cuf::DataAttributeAttr getDataAttr(mlir::Operation *op);
 /// Returns true if the operation has a data attribute with the given value.
 bool hasDataAttr(mlir::Operation *op, cuf::DataAttribute value);
 
+unsigned getAllocatorIdx(cuf::DataAttribute dataAttr);
+
 } // namespace cuf
 
 #endif // FORTRAN_OPTIMIZER_DIALECT_CUF_CUFATTR_H
diff --git a/flang/include/flang/Runtime/CUDA/allocatable.h b/flang/include/flang/Runtime/CUDA/allocatable.h
index 6c97afa9e10e8..43b45cff9a1f5 100644
--- a/flang/include/flang/Runtime/CUDA/allocatable.h
+++ b/flang/include/flang/Runtime/CUDA/allocatable.h
@@ -17,31 +17,33 @@ namespace Fortran::runtime::cuda {
 extern "C" {
 
 /// Perform allocation of the descriptor.
-int RTDECL(CUFAllocatableAllocate)(Descriptor &, int64_t *stream = nullptr,
-    bool *pinned = nullptr, bool hasStat = false,
+int RTDECL(CUFAllocatableAllocate)(Descriptor &, int32_t allocIdx,
+    int64_t *stream = nullptr, bool *pinned = nullptr, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
     int sourceLine = 0);
 
 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary.
-int RTDECL(CUFAllocatableAllocateSync)(Descriptor &, int64_t *stream = nullptr,
-    bool *pinned = nullptr, bool hasStat = false,
+int RTDECL(CUFAllocatableAllocateSync)(Descriptor &, int32_t allocIdx,
+    int64_t *stream = nullptr, bool *pinned = nullptr, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
     int sourceLine = 0);
 
 /// Perform allocation of the descriptor without synchronization. Assign data
 /// from source.
 int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
-    const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr,
-    bool hasStat = false, const Descriptor *errMsg = nullptr,
-    const char *sourceFile = nullptr, int sourceLine = 0);
+    const Descriptor &source, int32_t allocIdx, int64_t *stream = nullptr,
+    bool *pinned = nullptr, bool hasStat = false,
+    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+    int sourceLine = 0);
 
 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary. Assign data from source.
 int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
-    const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr,
-    bool hasStat = false, const Descriptor *errMsg = nullptr,
-    const char *sourceFile = nullptr, int sourceLine = 0);
+    const Descriptor &source, int32_t allocIdx, int64_t *stream = nullptr,
+    bool *pinned = nullptr, bool hasStat = false,
+    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+    int sourceLine = 0);
 
 /// Perform deallocation of the descriptor with synchronization of it when
 /// necessary.
diff --git a/flang/include/flang/Runtime/CUDA/pointer.h b/flang/include/flang/Runtime/CUDA/pointer.h
index bdfc3268e0814..64698370534ce 100644
--- a/flang/include/flang/Runtime/CUDA/pointer.h
+++ b/flang/include/flang/Runtime/CUDA/pointer.h
@@ -17,31 +17,33 @@ namespace Fortran::runtime::cuda {
 extern "C" {
 
 /// Perform allocation of the descriptor.
-int RTDECL(CUFPointerAllocate)(Descriptor &, int64_t *stream = nullptr,
-    bool *pinned = nullptr, bool hasStat = false,
+int RTDECL(CUFPointerAllocate)(Descriptor &, int32_t allocIdx,
+    int64_t *stream = nullptr, bool *pinned = nullptr, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
     int sourceLine = 0);
 
 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary.
-int RTDECL(CUFPointerAllocateSync)(Descriptor &, int64_t *stream = nullptr,
-    bool *pinned = nullptr, bool hasStat = false,
+int RTDECL(CUFPointerAllocateSync)(Descriptor &, int32_t allocIdx,
+    int64_t *stream = nullptr, bool *pinned = nullptr, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
     int sourceLine = 0);
 
 /// Perform allocation of the descriptor without synchronization. Assign data
 /// from source.
 int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer,
-    const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr,
-    bool hasStat = false, const Descriptor *errMsg = nullptr,
-    const char *sourceFile = nullptr, int sourceLine = 0);
+    const Descriptor &source, int32_t allocIdx, int64_t *stream = nullptr,
+    bool *pinned = nullptr, bool hasStat = false,
+    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+    int sourceLine = 0);
 
 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary. Assign data from source.
 int RTDEF(CUFPointerAllocateSourceSync)(Descriptor &pointer,
-    const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr,
-    bool hasStat = false, const Descriptor *errMsg = nullptr,
-    const char *sourceFile = nullptr, int sourceLine = 0);
+    const Descriptor &source, int32_t allocIdx, int64_t *stream = nullptr,
+    bool *pinned = nullptr, bool hasStat = false,
+    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+    int sourceLine = 0);
 
 } // extern "C"
 
diff --git a/flang/lib/Lower/CUDA.cpp b/flang/lib/Lower/CUDA.cpp
index 1293d2c5bd3ae..5bb0a11e4fa56 100644
--- a/flang/lib/Lower/CUDA.cpp
+++ b/flang/lib/Lower/CUDA.cpp
@@ -165,3 +165,20 @@ bool Fortran::lower::isTransferWithConversion(mlir::Value rhs) {
       return true;
   return false;
 }
+
+unsigned
+Fortran::lower::getAllocatorIdx(const Fortran::semantics::Symbol &sym) {
+  std::optional<Fortran::common::CUDADataAttr> cudaAttr =
+      Fortran::semantics::GetCUDADataAttr(&sym.GetUltimate());
+  if (cudaAttr) {
+    if (*cudaAttr == Fortran::common::CUDADataAttr::Pinned)
+      return kPinnedAllocatorPos;
+    if (*cudaAttr == Fortran::common::CUDADataAttr::Device)
+      return kDeviceAllocatorPos;
+    if (*cudaAttr == Fortran::common::CUDADataAttr::Managed)
+      return kManagedAllocatorPos;
+    if (*cudaAttr == Fortran::common::CUDADataAttr::Unified)
+      return kUnifiedAllocatorPos;
+  }
+  return kDefaultAllocator;
+}
diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp
index 80af7f4c1aaad..6e9518a0f3349 100644
--- a/flang/lib/Lower/ConvertVariable.cpp
+++ b/flang/lib/Lower/ConvertVariable.cpp
@@ -478,20 +478,6 @@ createGlobalInitialization(fir::FirOpBuilder &builder, fir::GlobalOp global,
   builder.restoreInsertionPoint(insertPt);
 }
 
-static unsigned getAllocatorIdxFromDataAttr(cuf::DataAttributeAttr dataAttr) {
-  if (dataAttr) {
-    if (dataAttr.getValue() == cuf::DataAttribute::Pinned)
-      return kPinnedAllocatorPos;
-    if (dataAttr.getValue() == cuf::DataAttribute::Device)
-      return kDeviceAllocatorPos;
-    if (dataAttr.getValue() == cuf::DataAttribute::Managed)
-      return kManagedAllocatorPos;
-    if (dataAttr.getValue() == cuf::DataAttribute::Unified)
-      return kUnifiedAllocatorPos;
-  }
-  return kDefaultAllocator;
-}
-
 /// Create the global op and its init if it has one
 fir::GlobalOp Fortran::lower::defineGlobal(
     Fortran::lower::AbstractConverter &converter,
@@ -554,7 +540,9 @@ fir::GlobalOp Fortran::lower::defineGlobal(
         mlir::Value box = fir::factory::createUnallocatedBox(
             b, loc, symTy,
             /*nonDeferredParams=*/{},
-            /*typeSourceBox=*/{}, getAllocatorIdxFromDataAttr(dataAttr));
+            /*typeSourceBox=*/{},
+            dataAttr ? cuf::getAllocatorIdx(dataAttr.getValue())
+                     : kDefaultAllocator);
         fir::HasValueOp::create(b, loc, box);
       });
     }
diff --git a/flang/lib/Optimizer/Dialect/CUF/Attributes/CUFAttr.cpp b/flang/lib/Optimizer/Dialect/CUF/Attributes/CUFAttr.cpp
index bd0499f406c18..fd5dd555c04cd 100644
--- a/flang/lib/Optimizer/Dialect/CUF/Attributes/CUFAttr.cpp
+++ b/flang/lib/Optimizer/Dialect/CUF/Attributes/CUFAttr.cpp
@@ -12,6 +12,7 @@
 
 #include "flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h"
 #include "flang/Optimizer/Dialect/CUF/CUFDialect.h"
+#include "flang/Runtime/allocator-registry-consts.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/DialectImplementation.h"
@@ -52,4 +53,16 @@ bool hasDataAttr(mlir::Operation *op, cuf::DataAttribute value) {
   return false;
 }
 
+unsigned getAllocatorIdx(cuf::DataAttribute dataAttr) {
+  if (dataAttr == cuf::DataAttribute::Pinned)
+    return kPinnedAllocatorPos;
+  if (dataAttr == cuf::DataAttribute::Device)
+    return kDeviceAllocatorPos;
+  if (dataAttr == cuf::DataAttribute::Managed)
+    return kManagedAllocatorPos;
+  if (dataAttr == cuf::DataAttribute::Unified)
+    return kUnifiedAllocatorPos;
+  return kDefaultAllocator;
+}
+
 } // namespace cuf
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index 9834b0499b930..9021c5d982321 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -106,7 +106,7 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
   mlir::Value sourceLine;
   if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>)
     sourceLine = fir::factory::locationToLineNo(
-        builder, loc, op.getSource() ? fTy.getInput(7) : fTy.getInput(6));
+        builder, loc, op.getSource() ? fTy.getInput(8) : fTy.getInput(7));
   else
     sourceLine = fir::factory::locationToLineNo(builder, loc, fTy.getInput(4));
 
@@ -122,6 +122,8 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
   }
   llvm::SmallVector<mlir::Value> args;
   if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>) {
+    mlir::Value allocIdx = builder.createIntegerConstant(
+        loc, builder.getI32Type(), cuf::getAllocatorIdx(op.getDataAttr()));
     mlir::Value pinned =
         op.getPinned()
             ? op.getPinned()
@@ -133,15 +135,15 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
           op.getStream() ? op.getStream()
                          : builder.createNullConstant(loc, fTy.getInput(2));
       args = fir::runtime::createArguments(
-          builder, loc, fTy, op.getBox(), op.getSource(), stream, pinned,
-          hasStat, errmsg, sourceFile, sourceLine);
+          builder, loc, fTy, op.getBox(), op.getSource(), allocIdx, stream,
+          pinned, hasStat, errmsg, sourceFile, sourceLine);
     } else {
       mlir::Value stream =
           op.getStream() ? op.getStream()
                          : builder.createNullConstant(loc, fTy.getInput(1));
       args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(),
-                                           stream, pinned, hasStat, errmsg,
-                                           sourceFile, sourceLine);
+                                           allocIdx, stream, pinned, hasStat,
+                                           errmsg, sourceFile, sourceLine);
     }
   } else {
     args =
diff --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir
index ea7890c9aac52..799d9991dfa83 100644
--- a/flang/test/Fir/CUDA/cuda-allocate.fir
+++ b/flang/test/Fir/CUDA/cuda-allocate.fir
@@ -19,7 +19,7 @@ func.func @_QPsub1() {
 // CHECK: %[[DESC:.*]] = fir.convert %[[DESC_RT_CALL]] : (!fir.ref<!fir.box<none>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
 // CHECK: %[[DECL_DESC:.*]]:2 = hlfir.declare %[[DESC]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
 // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocate(%[[BOX_NONE]], %{{....
[truncated]

@github-actions
Copy link

github-actions bot commented Sep 5, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.


int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t *stream,
bool *pinned, bool hasStat, const Descriptor *errMsg,
int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int32_t allocIdx,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't the allocator index available in the descriptor?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, in most cases the allocator is set correctly. The problem comes when you have allocatable device components in a derived type.
If the derived type itself is an allocatable array, then it becomes tricky to initialize the allocator index correctly at the right time. Since we have the information at hand on the allocate statement of the component, it seems easier to pass it at that time.
Let me know if you have another idea. I was thinking of using the set_allocator_idx operation, but it feels like an extra runtime call that can be embedded in cuf.allocate for this corner case.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@vzakhari Any concern about this?

@clementval
Copy link
Contributor Author

Closing in favor of #157731

@clementval clementval closed this Sep 9, 2025
@clementval clementval deleted the cuf_allocate_alloc_id branch September 9, 2025 18:29
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

flang:fir-hlfir flang Flang issues not falling into any other category

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants