Skip to content
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 18 additions & 13 deletions flang-rt/lib/cuda/allocatable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,11 @@ namespace Fortran::runtime::cuda {
extern "C" {
RT_EXT_API_GROUP_BEGIN

int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t *stream,
bool *pinned, bool hasStat, const Descriptor *errMsg,
int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int32_t allocIdx,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't the allocator index available in the descriptor?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, in most cases the allocator is set correctly. The problem comes when you have allocatable device components in a derived type.
If the derived type itself is an allocatable array then it becomes tricky to initialize the allocator index correctly at the right time. Since we have the information at hand on the allocate statement of the component it seems easier to pass it at that time.
Let me know if you have another idea. I was thinking of using the set_allocator_idx operation, but it feels like an extra runtime call whose work could instead be folded into cuf.allocate for this corner case.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@vzakhari Any concern about this?

int64_t *stream, bool *pinned, bool hasStat, const Descriptor *errMsg,
const char *sourceFile, int sourceLine) {
int stat{RTNAME(CUFAllocatableAllocate)(
desc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
desc, allocIdx, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
#ifndef RT_DEVICE_COMPILATION
// Descriptor synchronization is only done when the allocation is done
// from the host.
Expand All @@ -41,9 +41,12 @@ int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t *stream,
return stat;
}

int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t *stream,
bool *pinned, bool hasStat, const Descriptor *errMsg,
int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int32_t allocIdx,
int64_t *stream, bool *pinned, bool hasStat, const Descriptor *errMsg,
const char *sourceFile, int sourceLine) {
#if !defined(RT_DEVICE_COMPILATION)
desc.SetAllocIdx(allocIdx);
#endif
// Perform the standard allocation.
int stat{RTNAME(AllocatableAllocate)(
desc, stream, hasStat, errMsg, sourceFile, sourceLine)};
Expand All @@ -56,10 +59,11 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t *stream,
}

int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat,
const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
int stat{RTNAME(CUFAllocatableAllocate)(
alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
const Descriptor &source, int32_t allocIdx, int64_t *stream, bool *pinned,
bool hasStat, const Descriptor *errMsg, const char *sourceFile,
int sourceLine) {
int stat{RTNAME(CUFAllocatableAllocate)(alloc, allocIdx, stream, pinned,
hasStat, errMsg, sourceFile, sourceLine)};
if (stat == StatOk) {
Terminator terminator{sourceFile, sourceLine};
Fortran::runtime::DoFromSourceAssign(
Expand All @@ -69,10 +73,11 @@ int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
}

int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat,
const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
int stat{RTNAME(CUFAllocatableAllocateSync)(
alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
const Descriptor &source, int32_t allocIdx, int64_t *stream, bool *pinned,
bool hasStat, const Descriptor *errMsg, const char *sourceFile,
int sourceLine) {
int stat{RTNAME(CUFAllocatableAllocateSync)(alloc, allocIdx, stream, pinned,
hasStat, errMsg, sourceFile, sourceLine)};
if (stat == StatOk) {
Terminator terminator{sourceFile, sourceLine};
Fortran::runtime::DoFromSourceAssign(
Expand Down
33 changes: 19 additions & 14 deletions flang-rt/lib/cuda/pointer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,12 @@ namespace Fortran::runtime::cuda {
extern "C" {
RT_EXT_API_GROUP_BEGIN

int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t *stream, bool *pinned,
bool hasStat, const Descriptor *errMsg, const char *sourceFile,
int sourceLine) {
int RTDEF(CUFPointerAllocate)(Descriptor &desc, int32_t allocIdx,
int64_t *stream, bool *pinned, bool hasStat, const Descriptor *errMsg,
const char *sourceFile, int sourceLine) {
#if !defined(RT_DEVICE_COMPILATION)
desc.SetAllocIdx(allocIdx);
#endif
// Perform the standard allocation.
int stat{
RTNAME(PointerAllocate)(desc, hasStat, errMsg, sourceFile, sourceLine)};
Expand All @@ -36,11 +39,11 @@ int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t *stream, bool *pinned,
return stat;
}

int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int64_t *stream,
bool *pinned, bool hasStat, const Descriptor *errMsg,
int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int32_t allocIdx,
int64_t *stream, bool *pinned, bool hasStat, const Descriptor *errMsg,
const char *sourceFile, int sourceLine) {
int stat{RTNAME(CUFPointerAllocate)(
desc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
desc, allocIdx, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
#ifndef RT_DEVICE_COMPILATION
// Descriptor synchronization is only done when the allocation is done
// from the host.
Expand All @@ -55,10 +58,11 @@ int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int64_t *stream,
}

int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer,
const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat,
const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
int stat{RTNAME(CUFPointerAllocate)(
pointer, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
const Descriptor &source, int32_t allocIdx, int64_t *stream, bool *pinned,
bool hasStat, const Descriptor *errMsg, const char *sourceFile,
int sourceLine) {
int stat{RTNAME(CUFPointerAllocate)(pointer, allocIdx, stream, pinned,
hasStat, errMsg, sourceFile, sourceLine)};
if (stat == StatOk) {
Terminator terminator{sourceFile, sourceLine};
Fortran::runtime::DoFromSourceAssign(
Expand All @@ -68,10 +72,11 @@ int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer,
}

int RTDEF(CUFPointerAllocateSourceSync)(Descriptor &pointer,
const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat,
const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
int stat{RTNAME(CUFPointerAllocateSync)(
pointer, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
const Descriptor &source, int32_t allocIdx, int64_t *stream, bool *pinned,
bool hasStat, const Descriptor *errMsg, const char *sourceFile,
int sourceLine) {
int stat{RTNAME(CUFPointerAllocateSync)(pointer, allocIdx, stream, pinned,
hasStat, errMsg, sourceFile, sourceLine)};
if (stat == StatOk) {
Terminator terminator{sourceFile, sourceLine};
Fortran::runtime::DoFromSourceAssign(
Expand Down
2 changes: 1 addition & 1 deletion flang-rt/lib/runtime/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ if ("${LLVM_RUNTIMES_TARGET}" MATCHES "^amdgcn|^nvptx")
elseif(FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT STREQUAL "CUDA")
# findloc.cpp has some issues with higher compute capability. Remove it
# from CUDA build until we can lower its memory footprint.
list(REMOVE_ITEM supported_sources findloc.cpp)

set(sources ${supported_sources})
else ()
set(sources ${supported_sources} ${host_sources} ${f128_sources})
Expand Down
16 changes: 1 addition & 15 deletions flang/include/flang/Lower/CUDA.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,21 +31,7 @@ namespace Fortran::lower {

class AbstractConverter;

/// Return the allocator index constant that corresponds to the CUDA data
/// attribute found on the ultimate symbol of \p sym. When the symbol has no
/// CUDA data attribute, or has one other than pinned/device/managed/unified,
/// kDefaultAllocator is returned.
static inline unsigned getAllocatorIdx(const Fortran::semantics::Symbol &sym) {
  if (const std::optional<Fortran::common::CUDADataAttr> attr =
          Fortran::semantics::GetCUDADataAttr(&sym.GetUltimate());
      attr) {
    switch (*attr) {
    case Fortran::common::CUDADataAttr::Pinned:
      return kPinnedAllocatorPos;
    case Fortran::common::CUDADataAttr::Device:
      return kDeviceAllocatorPos;
    case Fortran::common::CUDADataAttr::Managed:
      return kManagedAllocatorPos;
    case Fortran::common::CUDADataAttr::Unified:
      return kUnifiedAllocatorPos;
    default:
      // Any other attribute falls through to the default allocator below.
      break;
    }
  }
  return kDefaultAllocator;
}
unsigned getAllocatorIdx(const Fortran::semantics::Symbol &sym);

void initializeDeviceComponentAllocator(
Fortran::lower::AbstractConverter &converter,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,8 @@ cuf::DataAttributeAttr getDataAttr(mlir::Operation *op);
/// Returns true if the operation has a data attribute with the given value.
bool hasDataAttr(mlir::Operation *op, cuf::DataAttribute value);

unsigned getAllocatorIdx(cuf::DataAttribute dataAttr);

} // namespace cuf

#endif // FORTRAN_OPTIMIZER_DIALECT_CUF_CUFATTR_H
22 changes: 12 additions & 10 deletions flang/include/flang/Runtime/CUDA/allocatable.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,31 +17,33 @@ namespace Fortran::runtime::cuda {
extern "C" {

/// Perform allocation of the descriptor.
int RTDECL(CUFAllocatableAllocate)(Descriptor &, int64_t *stream = nullptr,
bool *pinned = nullptr, bool hasStat = false,
int RTDECL(CUFAllocatableAllocate)(Descriptor &, int32_t allocIdx,
int64_t *stream = nullptr, bool *pinned = nullptr, bool hasStat = false,
const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
int sourceLine = 0);

/// Perform allocation of the descriptor with synchronization of it when
/// necessary.
int RTDECL(CUFAllocatableAllocateSync)(Descriptor &, int64_t *stream = nullptr,
bool *pinned = nullptr, bool hasStat = false,
int RTDECL(CUFAllocatableAllocateSync)(Descriptor &, int32_t allocIdx,
int64_t *stream = nullptr, bool *pinned = nullptr, bool hasStat = false,
const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
int sourceLine = 0);

/// Perform allocation of the descriptor without synchronization. Assign data
/// from source.
int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr,
bool hasStat = false, const Descriptor *errMsg = nullptr,
const char *sourceFile = nullptr, int sourceLine = 0);
const Descriptor &source, int32_t allocIdx, int64_t *stream = nullptr,
bool *pinned = nullptr, bool hasStat = false,
const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
int sourceLine = 0);

/// Perform allocation of the descriptor with synchronization of it when
/// necessary. Assign data from source.
int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr,
bool hasStat = false, const Descriptor *errMsg = nullptr,
const char *sourceFile = nullptr, int sourceLine = 0);
const Descriptor &source, int32_t allocIdx, int64_t *stream = nullptr,
bool *pinned = nullptr, bool hasStat = false,
const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
int sourceLine = 0);

/// Perform deallocation of the descriptor with synchronization of it when
/// necessary.
Expand Down
22 changes: 12 additions & 10 deletions flang/include/flang/Runtime/CUDA/pointer.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,31 +17,33 @@ namespace Fortran::runtime::cuda {
extern "C" {

/// Perform allocation of the descriptor.
int RTDECL(CUFPointerAllocate)(Descriptor &, int64_t *stream = nullptr,
bool *pinned = nullptr, bool hasStat = false,
int RTDECL(CUFPointerAllocate)(Descriptor &, int32_t allocIdx,
int64_t *stream = nullptr, bool *pinned = nullptr, bool hasStat = false,
const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
int sourceLine = 0);

/// Perform allocation of the descriptor with synchronization of it when
/// necessary.
int RTDECL(CUFPointerAllocateSync)(Descriptor &, int64_t *stream = nullptr,
bool *pinned = nullptr, bool hasStat = false,
int RTDECL(CUFPointerAllocateSync)(Descriptor &, int32_t allocIdx,
int64_t *stream = nullptr, bool *pinned = nullptr, bool hasStat = false,
const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
int sourceLine = 0);

/// Perform allocation of the descriptor without synchronization. Assign data
/// from source.
int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer,
const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr,
bool hasStat = false, const Descriptor *errMsg = nullptr,
const char *sourceFile = nullptr, int sourceLine = 0);
const Descriptor &source, int32_t allocIdx, int64_t *stream = nullptr,
bool *pinned = nullptr, bool hasStat = false,
const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
int sourceLine = 0);

/// Perform allocation of the descriptor with synchronization of it when
/// necessary. Assign data from source.
int RTDEF(CUFPointerAllocateSourceSync)(Descriptor &pointer,
const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr,
bool hasStat = false, const Descriptor *errMsg = nullptr,
const char *sourceFile = nullptr, int sourceLine = 0);
const Descriptor &source, int32_t allocIdx, int64_t *stream = nullptr,
bool *pinned = nullptr, bool hasStat = false,
const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
int sourceLine = 0);

} // extern "C"

Expand Down
17 changes: 17 additions & 0 deletions flang/lib/Lower/CUDA.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -165,3 +165,20 @@ bool Fortran::lower::isTransferWithConversion(mlir::Value rhs) {
return true;
return false;
}

/// Translate the CUDA data attribute of the ultimate symbol behind \p sym
/// into an allocator index constant. Pinned, device, managed and unified
/// attributes each have their own index; everything else (including the
/// absence of an attribute) maps to kDefaultAllocator.
unsigned
Fortran::lower::getAllocatorIdx(const Fortran::semantics::Symbol &sym) {
  const std::optional<Fortran::common::CUDADataAttr> attr =
      Fortran::semantics::GetCUDADataAttr(&sym.GetUltimate());
  // No CUDA data attribute at all: nothing to translate.
  if (!attr)
    return kDefaultAllocator;

  switch (*attr) {
  case Fortran::common::CUDADataAttr::Pinned:
    return kPinnedAllocatorPos;
  case Fortran::common::CUDADataAttr::Device:
    return kDeviceAllocatorPos;
  case Fortran::common::CUDADataAttr::Managed:
    return kManagedAllocatorPos;
  case Fortran::common::CUDADataAttr::Unified:
    return kUnifiedAllocatorPos;
  default:
    // Remaining attributes have no dedicated allocator index.
    return kDefaultAllocator;
  }
}
18 changes: 3 additions & 15 deletions flang/lib/Lower/ConvertVariable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -478,20 +478,6 @@ createGlobalInitialization(fir::FirOpBuilder &builder, fir::GlobalOp global,
builder.restoreInsertionPoint(insertPt);
}

/// Pick the allocator index constant for an optional CUF data attribute.
/// A null attribute, or a value other than pinned/device/managed/unified,
/// yields kDefaultAllocator.
static unsigned getAllocatorIdxFromDataAttr(cuf::DataAttributeAttr dataAttr) {
  // A null attribute means no CUDA placement was requested.
  if (!dataAttr)
    return kDefaultAllocator;

  switch (dataAttr.getValue()) {
  case cuf::DataAttribute::Pinned:
    return kPinnedAllocatorPos;
  case cuf::DataAttribute::Device:
    return kDeviceAllocatorPos;
  case cuf::DataAttribute::Managed:
    return kManagedAllocatorPos;
  case cuf::DataAttribute::Unified:
    return kUnifiedAllocatorPos;
  default:
    return kDefaultAllocator;
  }
}

/// Create the global op and its init if it has one
fir::GlobalOp Fortran::lower::defineGlobal(
Fortran::lower::AbstractConverter &converter,
Expand Down Expand Up @@ -554,7 +540,9 @@ fir::GlobalOp Fortran::lower::defineGlobal(
mlir::Value box = fir::factory::createUnallocatedBox(
b, loc, symTy,
/*nonDeferredParams=*/{},
/*typeSourceBox=*/{}, getAllocatorIdxFromDataAttr(dataAttr));
/*typeSourceBox=*/{},
dataAttr ? cuf::getAllocatorIdx(dataAttr.getValue())
: kDefaultAllocator);
fir::HasValueOp::create(b, loc, box);
});
}
Expand Down
13 changes: 13 additions & 0 deletions flang/lib/Optimizer/Dialect/CUF/Attributes/CUFAttr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

#include "flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h"
#include "flang/Optimizer/Dialect/CUF/CUFDialect.h"
#include "flang/Runtime/allocator-registry-consts.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/DialectImplementation.h"
Expand Down Expand Up @@ -52,4 +53,16 @@ bool hasDataAttr(mlir::Operation *op, cuf::DataAttribute value) {
return false;
}

/// Return the allocator index constant associated with \p dataAttr.
/// Pinned, device, managed and unified each map to their own index; any
/// other data attribute maps to kDefaultAllocator.
unsigned getAllocatorIdx(cuf::DataAttribute dataAttr) {
  switch (dataAttr) {
  case cuf::DataAttribute::Pinned:
    return kPinnedAllocatorPos;
  case cuf::DataAttribute::Device:
    return kDeviceAllocatorPos;
  case cuf::DataAttribute::Managed:
    return kManagedAllocatorPos;
  case cuf::DataAttribute::Unified:
    return kUnifiedAllocatorPos;
  default:
    // Attributes without a dedicated allocator use the default one.
    return kDefaultAllocator;
  }
}

} // namespace cuf
12 changes: 7 additions & 5 deletions flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
mlir::Value sourceLine;
if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>)
sourceLine = fir::factory::locationToLineNo(
builder, loc, op.getSource() ? fTy.getInput(7) : fTy.getInput(6));
builder, loc, op.getSource() ? fTy.getInput(8) : fTy.getInput(7));
else
sourceLine = fir::factory::locationToLineNo(builder, loc, fTy.getInput(4));

Expand All @@ -122,6 +122,8 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
}
llvm::SmallVector<mlir::Value> args;
if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>) {
mlir::Value allocIdx = builder.createIntegerConstant(
loc, builder.getI32Type(), cuf::getAllocatorIdx(op.getDataAttr()));
mlir::Value pinned =
op.getPinned()
? op.getPinned()
Expand All @@ -133,15 +135,15 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
op.getStream() ? op.getStream()
: builder.createNullConstant(loc, fTy.getInput(2));
args = fir::runtime::createArguments(
builder, loc, fTy, op.getBox(), op.getSource(), stream, pinned,
hasStat, errmsg, sourceFile, sourceLine);
builder, loc, fTy, op.getBox(), op.getSource(), allocIdx, stream,
pinned, hasStat, errmsg, sourceFile, sourceLine);
} else {
mlir::Value stream =
op.getStream() ? op.getStream()
: builder.createNullConstant(loc, fTy.getInput(1));
args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(),
stream, pinned, hasStat, errmsg,
sourceFile, sourceLine);
allocIdx, stream, pinned, hasStat,
errmsg, sourceFile, sourceLine);
}
} else {
args =
Expand Down
Loading