diff --git a/flang/include/flang/Runtime/CUDA/allocatable.h b/flang/include/flang/Runtime/CUDA/allocatable.h index e986ad910a3f3..bbfcd2cafcdb2 100644 --- a/flang/include/flang/Runtime/CUDA/allocatable.h +++ b/flang/include/flang/Runtime/CUDA/allocatable.h @@ -22,6 +22,18 @@ int RTDECL(CUFAllocatableAllocate)(Descriptor &, bool hasStat = false, const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr, int sourceLine = 0); +/// Perform allocation of the descriptor without synchronization. Assign data +/// from source. +int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc, + const Descriptor &source, bool hasStat, const Descriptor *errMsg, + const char *sourceFile, int sourceLine); + +/// Perform allocation of the descriptor with synchronization of it when +/// necessary. Assign data from source. +int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc, + const Descriptor &source, bool hasStat, const Descriptor *errMsg, + const char *sourceFile, int sourceLine); + /// Perform deallocation of the descriptor with synchronization of it when /// necessary. int RTDECL(CUFAllocatableDeallocate)(Descriptor &, bool hasStat = false, diff --git a/flang/include/flang/Runtime/CUDA/memmove-function.h b/flang/include/flang/Runtime/CUDA/memmove-function.h new file mode 100644 index 0000000000000..74d6a05eff4c9 --- /dev/null +++ b/flang/include/flang/Runtime/CUDA/memmove-function.h @@ -0,0 +1,23 @@ +//===-- include/flang/Runtime/CUDA/memmove-function.h -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#ifndef FORTRAN_RUNTIME_CUDA_MEMMOVE_FUNCTION_H_ +#define FORTRAN_RUNTIME_CUDA_MEMMOVE_FUNCTION_H_ + +namespace Fortran::runtime::cuda { + +void *MemmoveHostToDevice(void *dst, const void *src, std::size_t count); + +void *MemmoveDeviceToHost(void *dst, const void *src, std::size_t count); + +void *MemmoveDeviceToDevice(void *dst, const void *src, std::size_t count); + +} // namespace Fortran::runtime::cuda +#endif // FORTRAN_RUNTIME_CUDA_MEMMOVE_FUNCTION_H_ diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp index f1ebd08967b9a..3983336516db9 100644 --- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp @@ -155,8 +155,12 @@ static mlir::LogicalResult convertOpToCall(OpTy op, auto fTy = func.getFunctionType(); mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc); - mlir::Value sourceLine = - fir::factory::locationToLineNo(builder, loc, fTy.getInput(4)); + mlir::Value sourceLine; + if constexpr (std::is_same_v) + sourceLine = fir::factory::locationToLineNo( + builder, loc, op.getSource() ? fTy.getInput(5) : fTy.getInput(4)); + else + sourceLine = fir::factory::locationToLineNo(builder, loc, fTy.getInput(4)); mlir::Value hasStat = op.getHasStat() ? builder.createBool(loc, true) : builder.createBool(loc, false); @@ -168,8 +172,21 @@ static mlir::LogicalResult convertOpToCall(OpTy op, mlir::Type boxNoneTy = fir::BoxType::get(builder.getNoneType()); errmsg = builder.create(loc, boxNoneTy).getResult(); } - llvm::SmallVector args{fir::runtime::createArguments( - builder, loc, fTy, op.getBox(), hasStat, errmsg, sourceFile, sourceLine)}; + llvm::SmallVector args; + if constexpr (std::is_same_v) { + if (op.getSource()) + args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(), + op.getSource(), hasStat, errmsg, + sourceFile, sourceLine); + else + args = + fir::runtime::createArguments(builder, loc, fTy, op.getBox(), hasStat, + errmsg, sourceFile, sourceLine); + } else { + args = + fir::runtime::createArguments(builder, loc, fTy, op.getBox(), hasStat, + errmsg, sourceFile, sourceLine); + } auto callOp = builder.create(loc, func, args); rewriter.replaceOp(op, callOp); return mlir::success(); @@ -182,10 +199,6 @@ struct CUFAllocateOpConversion mlir::LogicalResult matchAndRewrite(cuf::AllocateOp op, mlir::PatternRewriter &rewriter) const override { - // TODO: Allocation with source will need a new entry point in the runtime. - if (op.getSource()) - return mlir::failure(); - // TODO: Allocation using different stream. if (op.getStream()) return mlir::failure(); @@ -202,18 +215,28 @@ struct CUFAllocateOpConversion if (hasDoubleDescriptors(op)) { // Allocation for module variable are done with custom runtime entry point // so the descriptors can be synchronized. - mlir::func::FuncOp func = - fir::runtime::getRuntimeFunc( - loc, builder); - return convertOpToCall(op, rewriter, func); + mlir::func::FuncOp func; + if (op.getSource()) + func = fir::runtime::getRuntimeFunc(loc, builder); + else + func = fir::runtime::getRuntimeFunc( + loc, builder); + return convertOpToCall(op, rewriter, func); } - // Allocation for local descriptor falls back on the standard runtime - // AllocatableAllocate as the dedicated allocator is set in the descriptor - // before the call. - mlir::func::FuncOp func = - fir::runtime::getRuntimeFunc(loc, - builder); + mlir::func::FuncOp func; + if (op.getSource()) + func = + fir::runtime::getRuntimeFunc( + loc, builder); + else + // Allocation for local descriptor falls back on the standard runtime + // AllocatableAllocate as the dedicated allocator is set in the descriptor + // before the call. + func = fir::runtime::getRuntimeFunc( + loc, builder); + return convertOpToCall(op, rewriter, func); } }; @@ -236,7 +259,7 @@ struct CUFDeallocateOpConversion mlir::func::FuncOp func = fir::runtime::getRuntimeFunc( loc, builder); - return convertOpToCall(op, rewriter, func); + return convertOpToCall(op, rewriter, func); } // Deallocation for local descriptor falls back on the standard runtime diff --git a/flang/runtime/CUDA/CMakeLists.txt b/flang/runtime/CUDA/CMakeLists.txt index ce87f3efdc363..3a88824826de3 100644 --- a/flang/runtime/CUDA/CMakeLists.txt +++ b/flang/runtime/CUDA/CMakeLists.txt @@ -18,6 +18,7 @@ add_flang_library(${CUFRT_LIBNAME} allocatable.cpp descriptor.cpp kernel.cpp + memmove-function.cpp memory.cpp registration.cpp ) diff --git a/flang/runtime/CUDA/allocatable.cpp b/flang/runtime/CUDA/allocatable.cpp index 649ddb638abe6..9fed50c859a9c 100644 --- a/flang/runtime/CUDA/allocatable.cpp +++ b/flang/runtime/CUDA/allocatable.cpp @@ -7,10 +7,12 @@ //===----------------------------------------------------------------------===// #include "flang/Runtime/CUDA/allocatable.h" +#include "../assign-impl.h" #include "../stat.h" #include "../terminator.h" #include "flang/Runtime/CUDA/common.h" #include "flang/Runtime/CUDA/descriptor.h" +#include "flang/Runtime/CUDA/memmove-function.h" #include "flang/Runtime/allocatable.h" #include "cuda_runtime.h" @@ -45,6 +47,32 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, bool hasStat, return stat; } +int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc, + const Descriptor &source, bool hasStat, const Descriptor *errMsg, + const char *sourceFile, int sourceLine) { + int stat{RTNAME(AllocatableAllocate)( + alloc, hasStat, errMsg, sourceFile, sourceLine)}; + if (stat == StatOk) { + Terminator terminator{sourceFile, sourceLine}; + Fortran::runtime::DoFromSourceAssign( + alloc, source, terminator, &MemmoveHostToDevice); + } + return stat; +} + +int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc, + const Descriptor &source, bool hasStat, const Descriptor *errMsg, + const char *sourceFile, int sourceLine) { + int stat{RTNAME(AllocatableAllocate)( + alloc, hasStat, errMsg, sourceFile, sourceLine)}; + if (stat == StatOk) { + Terminator terminator{sourceFile, sourceLine}; + Fortran::runtime::DoFromSourceAssign( + alloc, source, terminator, &MemmoveHostToDevice); + } + return stat; +} + int RTDEF(CUFAllocatableDeallocate)(Descriptor &desc, bool hasStat, const Descriptor *errMsg, const char *sourceFile, int sourceLine) { // Perform the standard allocation. diff --git a/flang/runtime/CUDA/memmove-function.cpp b/flang/runtime/CUDA/memmove-function.cpp new file mode 100644 index 0000000000000..3ba9fa7e0f7f7 --- /dev/null +++ b/flang/runtime/CUDA/memmove-function.cpp @@ -0,0 +1,35 @@ +//===-- runtime/CUDA/memmove-function.cpp ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "flang/Runtime/CUDA/memmove-function.h" +#include "../terminator.h" +#include "flang/Runtime/CUDA/common.h" + +#include "cuda_runtime.h" + +namespace Fortran::runtime::cuda { + +void *MemmoveHostToDevice(void *dst, const void *src, std::size_t count) { + // TODO: Use cudaMemcpyAsync when we have support for stream. + CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, count, cudaMemcpyHostToDevice)); + return dst; +} + +void *MemmoveDeviceToHost(void *dst, const void *src, std::size_t count) { + // TODO: Use cudaMemcpyAsync when we have support for stream. + CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, count, cudaMemcpyDeviceToHost)); + return dst; +} + +void *MemmoveDeviceToDevice(void *dst, const void *src, std::size_t count) { + // TODO: Use cudaMemcpyAsync when we have support for stream. + CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, count, cudaMemcpyDeviceToDevice)); + return dst; +} + +} // namespace Fortran::runtime::cuda diff --git a/flang/runtime/CUDA/memory.cpp b/flang/runtime/CUDA/memory.cpp index 68963c4d7738a..0bbb493d2db91 100644 --- a/flang/runtime/CUDA/memory.cpp +++ b/flang/runtime/CUDA/memory.cpp @@ -11,31 +11,12 @@ #include "../terminator.h" #include "flang/Runtime/CUDA/common.h" #include "flang/Runtime/CUDA/descriptor.h" +#include "flang/Runtime/CUDA/memmove-function.h" #include "flang/Runtime/assign.h" #include "cuda_runtime.h" namespace Fortran::runtime::cuda { -static void *MemmoveHostToDevice( - void *dst, const void *src, std::size_t count) { - // TODO: Use cudaMemcpyAsync when we have support for stream. - CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, count, cudaMemcpyHostToDevice)); - return dst; -} - -static void *MemmoveDeviceToHost( - void *dst, const void *src, std::size_t count) { - // TODO: Use cudaMemcpyAsync when we have support for stream. - CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, count, cudaMemcpyDeviceToHost)); - return dst; -} - -static void *MemmoveDeviceToDevice( - void *dst, const void *src, std::size_t count) { - // TODO: Use cudaMemcpyAsync when we have support for stream. - CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, count, cudaMemcpyDeviceToDevice)); - return dst; -} extern "C" { diff --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir index d68ff894d5af5..47d75b16b7a2d 100644 --- a/flang/test/Fir/CUDA/cuda-allocate.fir +++ b/flang/test/Fir/CUDA/cuda-allocate.fir @@ -120,4 +120,49 @@ func.func @_QQsub6() attributes {fir.bindc_name = "test"} { // CHECK: %[[B_BOX:.*]] = fir.convert %[[B]]#1 : (!fir.ref>>>) -> !fir.ref> // CHECK: fir.call @_FortranACUFAllocatableAllocate(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 + +func.func @_QPallocate_source() { + %c0_i64 = arith.constant 0 : i64 + %c1_i32 = arith.constant 1 : i32 + %c0_i32 = arith.constant 0 : i32 + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %0 = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFallocate_sourceEa"} + %4 = fir.declare %0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFallocate_sourceEa"} : (!fir.ref>>>) -> !fir.ref>>> + %5 = cuf.alloc !fir.box>> {bindc_name = "a_d", data_attr = #cuf.cuda, uniq_name = "_QFallocate_sourceEa_d"} -> !fir.ref>>> + %7 = fir.declare %5 {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFallocate_sourceEa_d"} : (!fir.ref>>>) -> !fir.ref>>> + %8 = fir.load %4 : !fir.ref>>> + %22 = cuf.allocate %7 : !fir.ref>>> source(%8 : !fir.box>>) {data_attr = #cuf.cuda} -> i32 + return +} + +// CHECK-LABEL: func.func @_QPallocate_source() +// CHECK: %[[DECL_HOST:.*]] = fir.declare %{{.*}} {fortran_attrs = #fir.var_attrs, uniq_name = "_QFallocate_sourceEa"} : (!fir.ref>>>) -> !fir.ref>>> +// CHECK: %[[DECL_DEV:.*]] = fir.declare %{{.*}} {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFallocate_sourceEa_d"} : (!fir.ref>>>) -> !fir.ref>>> +// CHECK: %[[SOURCE:.*]] = fir.load %[[DECL_HOST]] : !fir.ref>>> +// CHECK: %[[DEV_CONV:.*]] = fir.convert %[[DECL_DEV]] : (!fir.ref>>>) -> !fir.ref> +// CHECK: %[[SOURCE_CONV:.*]] = fir.convert %[[SOURCE]] : (!fir.box>>) -> !fir.box +// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, i1, !fir.box, !fir.ref, i32) -> i32 + + +fir.global @_QMmod1Ea_d {data_attr = #cuf.cuda} : !fir.box>> { + %c0 = arith.constant 0 : index + %0 = fir.zero_bits !fir.heap> + %1 = fir.shape %c0, %c0 : (index, index) -> !fir.shape<2> + %2 = fir.embox %0(%1) {allocator_idx = 2 : i32} : (!fir.heap>, !fir.shape<2>) -> !fir.box>> + fir.has_value %2 : !fir.box>> +} +func.func @_QMmod1Pallocate_source_global() { + %0 = fir.address_of(@_QMmod1Ea_d) : !fir.ref>>> + %1 = fir.declare %0 {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QMmod1Ea_d"} : (!fir.ref>>>) -> !fir.ref>>> + %2 = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QMmod1Fallocate_source_globalEa"} + %6 = fir.declare %2 {fortran_attrs = #fir.var_attrs, uniq_name = "_QMmod1Fallocate_source_globalEa"} : (!fir.ref>>>) -> !fir.ref>>> + %7 = fir.load %6 : !fir.ref>>> + %21 = cuf.allocate %1 : !fir.ref>>> source(%7 : !fir.box>>) {data_attr = #cuf.cuda} -> i32 + return +} + +// CHECK-LABEL: func.func @_QMmod1Pallocate_source_global() +// CHECK: fir.call @_FortranACUFAllocatableAllocateSourceSync + } // end of module