Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions flang/include/flang/Runtime/CUDA/allocatable.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,18 @@ int RTDECL(CUFAllocatableAllocate)(Descriptor &, bool hasStat = false,
const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
int sourceLine = 0);

/// Perform allocation of the descriptor without synchronization. Assign data
/// from source.
int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
const Descriptor &source, bool hasStat, const Descriptor *errMsg,
const char *sourceFile, int sourceLine);

/// Perform allocation of the descriptor with synchronization of it when
/// necessary. Assign data from source.
int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
const Descriptor &source, bool hasStat, const Descriptor *errMsg,
const char *sourceFile, int sourceLine);

/// Perform deallocation of the descriptor with synchronization of it when
/// necessary.
int RTDECL(CUFAllocatableDeallocate)(Descriptor &, bool hasStat = false,
Expand Down
23 changes: 23 additions & 0 deletions flang/include/flang/Runtime/CUDA/memmove-function.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
//===-- include/flang/Runtime/CUDA/memmove-function.h -----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <cstddef>

#ifndef FORTRAN_RUNTIME_CUDA_MEMMOVE_FUNCTION_H_
#define FORTRAN_RUNTIME_CUDA_MEMMOVE_FUNCTION_H_

namespace Fortran::runtime::cuda {

void *MemmoveHostToDevice(void *dst, const void *src, std::size_t count);

void *MemmoveDeviceToHost(void *dst, const void *src, std::size_t count);

void *MemmoveDeviceToDevice(void *dst, const void *src, std::size_t count);

} // namespace Fortran::runtime::cuda
#endif // FORTRAN_RUNTIME_CUDA_MEMMOVE_FUNCTION_H_
61 changes: 42 additions & 19 deletions flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -155,8 +155,12 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
auto fTy = func.getFunctionType();

mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
mlir::Value sourceLine =
fir::factory::locationToLineNo(builder, loc, fTy.getInput(4));
mlir::Value sourceLine;
if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>)
sourceLine = fir::factory::locationToLineNo(
builder, loc, op.getSource() ? fTy.getInput(5) : fTy.getInput(4));
else
sourceLine = fir::factory::locationToLineNo(builder, loc, fTy.getInput(4));

mlir::Value hasStat = op.getHasStat() ? builder.createBool(loc, true)
: builder.createBool(loc, false);
Expand All @@ -168,8 +172,21 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
mlir::Type boxNoneTy = fir::BoxType::get(builder.getNoneType());
errmsg = builder.create<fir::AbsentOp>(loc, boxNoneTy).getResult();
}
llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
builder, loc, fTy, op.getBox(), hasStat, errmsg, sourceFile, sourceLine)};
llvm::SmallVector<mlir::Value> args;
if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>) {
if (op.getSource())
args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(),
op.getSource(), hasStat, errmsg,
sourceFile, sourceLine);
else
args =
fir::runtime::createArguments(builder, loc, fTy, op.getBox(), hasStat,
errmsg, sourceFile, sourceLine);
} else {
args =
fir::runtime::createArguments(builder, loc, fTy, op.getBox(), hasStat,
errmsg, sourceFile, sourceLine);
}
auto callOp = builder.create<fir::CallOp>(loc, func, args);
rewriter.replaceOp(op, callOp);
return mlir::success();
Expand All @@ -182,10 +199,6 @@ struct CUFAllocateOpConversion
mlir::LogicalResult
matchAndRewrite(cuf::AllocateOp op,
mlir::PatternRewriter &rewriter) const override {
// TODO: Allocation with source will need a new entry point in the runtime.
if (op.getSource())
return mlir::failure();

// TODO: Allocation using different stream.
if (op.getStream())
return mlir::failure();
Expand All @@ -202,18 +215,28 @@ struct CUFAllocateOpConversion
if (hasDoubleDescriptors(op)) {
// Allocation for module variable are done with custom runtime entry point
// so the descriptors can be synchronized.
mlir::func::FuncOp func =
fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocate)>(
loc, builder);
return convertOpToCall(op, rewriter, func);
mlir::func::FuncOp func;
if (op.getSource())
func = fir::runtime::getRuntimeFunc<mkRTKey(
CUFAllocatableAllocateSourceSync)>(loc, builder);
else
func = fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocate)>(
loc, builder);
return convertOpToCall<cuf::AllocateOp>(op, rewriter, func);
}

// Allocation for local descriptor falls back on the standard runtime
// AllocatableAllocate as the dedicated allocator is set in the descriptor
// before the call.
mlir::func::FuncOp func =
fir::runtime::getRuntimeFunc<mkRTKey(AllocatableAllocate)>(loc,
builder);
mlir::func::FuncOp func;
if (op.getSource())
func =
fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocateSource)>(
loc, builder);
else
// Allocation for local descriptor falls back on the standard runtime
// AllocatableAllocate as the dedicated allocator is set in the descriptor
// before the call.
func = fir::runtime::getRuntimeFunc<mkRTKey(AllocatableAllocate)>(
loc, builder);

return convertOpToCall<cuf::AllocateOp>(op, rewriter, func);
}
};
Expand All @@ -236,7 +259,7 @@ struct CUFDeallocateOpConversion
mlir::func::FuncOp func =
fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableDeallocate)>(
loc, builder);
return convertOpToCall(op, rewriter, func);
return convertOpToCall<cuf::DeallocateOp>(op, rewriter, func);
}

// Deallocation for local descriptor falls back on the standard runtime
Expand Down
1 change: 1 addition & 0 deletions flang/runtime/CUDA/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ add_flang_library(${CUFRT_LIBNAME}
allocatable.cpp
descriptor.cpp
kernel.cpp
memmove-function.cpp
memory.cpp
registration.cpp
)
Expand Down
28 changes: 28 additions & 0 deletions flang/runtime/CUDA/allocatable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,12 @@
//===----------------------------------------------------------------------===//

#include "flang/Runtime/CUDA/allocatable.h"
#include "../assign-impl.h"
#include "../stat.h"
#include "../terminator.h"
#include "flang/Runtime/CUDA/common.h"
#include "flang/Runtime/CUDA/descriptor.h"
#include "flang/Runtime/CUDA/memmove-function.h"
#include "flang/Runtime/allocatable.h"

#include "cuda_runtime.h"
Expand Down Expand Up @@ -45,6 +47,32 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, bool hasStat,
return stat;
}

int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
const Descriptor &source, bool hasStat, const Descriptor *errMsg,
const char *sourceFile, int sourceLine) {
int stat{RTNAME(AllocatableAllocate)(
alloc, hasStat, errMsg, sourceFile, sourceLine)};
if (stat == StatOk) {
Terminator terminator{sourceFile, sourceLine};
Fortran::runtime::DoFromSourceAssign(
alloc, source, terminator, &MemmoveHostToDevice);
}
return stat;
}

int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
const Descriptor &source, bool hasStat, const Descriptor *errMsg,
const char *sourceFile, int sourceLine) {
int stat{RTNAME(AllocatableAllocate)(
alloc, hasStat, errMsg, sourceFile, sourceLine)};
if (stat == StatOk) {
Terminator terminator{sourceFile, sourceLine};
Fortran::runtime::DoFromSourceAssign(
alloc, source, terminator, &MemmoveHostToDevice);
}
return stat;
}

int RTDEF(CUFAllocatableDeallocate)(Descriptor &desc, bool hasStat,
const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
// Perform the standard allocation.
Expand Down
35 changes: 35 additions & 0 deletions flang/runtime/CUDA/memmove-function.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
//===-- runtime/CUDA/memmove-function.cpp ---------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "flang/Runtime/CUDA/memmove-function.h"
#include "../terminator.h"
#include "flang/Runtime/CUDA/common.h"

#include "cuda_runtime.h"

namespace Fortran::runtime::cuda {

void *MemmoveHostToDevice(void *dst, const void *src, std::size_t count) {
// TODO: Use cudaMemcpyAsync when we have support for stream.
CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, count, cudaMemcpyHostToDevice));
return dst;
}

void *MemmoveDeviceToHost(void *dst, const void *src, std::size_t count) {
// TODO: Use cudaMemcpyAsync when we have support for stream.
CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, count, cudaMemcpyDeviceToHost));
return dst;
}

void *MemmoveDeviceToDevice(void *dst, const void *src, std::size_t count) {
// TODO: Use cudaMemcpyAsync when we have support for stream.
CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, count, cudaMemcpyDeviceToDevice));
return dst;
}

} // namespace Fortran::runtime::cuda
21 changes: 1 addition & 20 deletions flang/runtime/CUDA/memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,31 +11,12 @@
#include "../terminator.h"
#include "flang/Runtime/CUDA/common.h"
#include "flang/Runtime/CUDA/descriptor.h"
#include "flang/Runtime/CUDA/memmove-function.h"
#include "flang/Runtime/assign.h"

#include "cuda_runtime.h"

namespace Fortran::runtime::cuda {
static void *MemmoveHostToDevice(
void *dst, const void *src, std::size_t count) {
// TODO: Use cudaMemcpyAsync when we have support for stream.
CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, count, cudaMemcpyHostToDevice));
return dst;
}

static void *MemmoveDeviceToHost(
void *dst, const void *src, std::size_t count) {
// TODO: Use cudaMemcpyAsync when we have support for stream.
CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, count, cudaMemcpyDeviceToHost));
return dst;
}

static void *MemmoveDeviceToDevice(
void *dst, const void *src, std::size_t count) {
// TODO: Use cudaMemcpyAsync when we have support for stream.
CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, count, cudaMemcpyDeviceToDevice));
return dst;
}

extern "C" {

Expand Down
45 changes: 45 additions & 0 deletions flang/test/Fir/CUDA/cuda-allocate.fir
Original file line number Diff line number Diff line change
Expand Up @@ -120,4 +120,49 @@ func.func @_QQsub6() attributes {fir.bindc_name = "test"} {
// CHECK: %[[B_BOX:.*]] = fir.convert %[[B]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<none>>
// CHECK: fir.call @_FortranACUFAllocatableAllocate(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32


func.func @_QPallocate_source() {
%c0_i64 = arith.constant 0 : i64
%c1_i32 = arith.constant 1 : i32
%c0_i32 = arith.constant 0 : i32
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = fir.alloca !fir.box<!fir.heap<!fir.array<?x?xf32>>> {bindc_name = "a", uniq_name = "_QFallocate_sourceEa"}
%4 = fir.declare %0 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocate_sourceEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
%5 = cuf.alloc !fir.box<!fir.heap<!fir.array<?x?xf32>>> {bindc_name = "a_d", data_attr = #cuf.cuda<device>, uniq_name = "_QFallocate_sourceEa_d"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
%7 = fir.declare %5 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocate_sourceEa_d"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
%8 = fir.load %4 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
%22 = cuf.allocate %7 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>> source(%8 : !fir.box<!fir.heap<!fir.array<?x?xf32>>>) {data_attr = #cuf.cuda<device>} -> i32
return
}

// CHECK-LABEL: func.func @_QPallocate_source()
// CHECK: %[[DECL_HOST:.*]] = fir.declare %{{.*}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocate_sourceEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
// CHECK: %[[DECL_DEV:.*]] = fir.declare %{{.*}} {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocate_sourceEa_d"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
// CHECK: %[[SOURCE:.*]] = fir.load %[[DECL_HOST]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
// CHECK: %[[DEV_CONV:.*]] = fir.convert %[[DECL_DEV]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<none>>
// CHECK: %[[SOURCE_CONV:.*]] = fir.convert %[[SOURCE]] : (!fir.box<!fir.heap<!fir.array<?x?xf32>>>) -> !fir.box<none>
// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.box<none>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32


fir.global @_QMmod1Ea_d {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?x?xf32>>> {
%c0 = arith.constant 0 : index
%0 = fir.zero_bits !fir.heap<!fir.array<?x?xf32>>
%1 = fir.shape %c0, %c0 : (index, index) -> !fir.shape<2>
%2 = fir.embox %0(%1) {allocator_idx = 2 : i32} : (!fir.heap<!fir.array<?x?xf32>>, !fir.shape<2>) -> !fir.box<!fir.heap<!fir.array<?x?xf32>>>
fir.has_value %2 : !fir.box<!fir.heap<!fir.array<?x?xf32>>>
}
func.func @_QMmod1Pallocate_source_global() {
%0 = fir.address_of(@_QMmod1Ea_d) : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
%1 = fir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMmod1Ea_d"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
%2 = fir.alloca !fir.box<!fir.heap<!fir.array<?x?xf32>>> {bindc_name = "a", uniq_name = "_QMmod1Fallocate_source_globalEa"}
%6 = fir.declare %2 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMmod1Fallocate_source_globalEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
%7 = fir.load %6 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
%21 = cuf.allocate %1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>> source(%7 : !fir.box<!fir.heap<!fir.array<?x?xf32>>>) {data_attr = #cuf.cuda<device>} -> i32
return
}

// CHECK-LABEL: func.func @_QMmod1Pallocate_source_global()
// CHECK: fir.call @_FortranACUFAllocatableAllocateSourceSync

} // end of module
Loading