diff --git a/flang/include/flang/Runtime/CUDA/allocatable.h b/flang/include/flang/Runtime/CUDA/allocatable.h index be18b2b705bbc..e2156281d1b2b 100644 --- a/flang/include/flang/Runtime/CUDA/allocatable.h +++ b/flang/include/flang/Runtime/CUDA/allocatable.h @@ -16,23 +16,28 @@ namespace Fortran::runtime::cuda { extern "C" { +/// Perform allocation of the descriptor. +int RTDECL(CUFAllocatableAllocate)(Descriptor &, int64_t stream = -1, + bool hasStat = false, const Descriptor *errMsg = nullptr, + const char *sourceFile = nullptr, int sourceLine = 0); + /// Perform allocation of the descriptor with synchronization of it when /// necessary. -int RTDECL(CUFAllocatableAllocate)(Descriptor &, bool hasStat = false, - const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr, - int sourceLine = 0); +int RTDECL(CUFAllocatableAllocateSync)(Descriptor &, int64_t stream = -1, + bool hasStat = false, const Descriptor *errMsg = nullptr, + const char *sourceFile = nullptr, int sourceLine = 0); /// Perform allocation of the descriptor without synchronization. Assign data /// from source. int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc, - const Descriptor &source, bool hasStat = false, + const Descriptor &source, int64_t stream = -1, bool hasStat = false, const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr, int sourceLine = 0); /// Perform allocation of the descriptor with synchronization of it when /// necessary. Assign data from source. int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc, - const Descriptor &source, bool hasStat = false, + const Descriptor &source, int64_t stream = -1, bool hasStat = false, const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr, int sourceLine = 0); diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp index 3983336516db9..5056c48c91cfa 100644 --- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp @@ -158,7 +158,7 @@ static mlir::LogicalResult convertOpToCall(OpTy op, mlir::Value sourceLine; if constexpr (std::is_same_v) sourceLine = fir::factory::locationToLineNo( - builder, loc, op.getSource() ? fTy.getInput(5) : fTy.getInput(4)); + builder, loc, op.getSource() ? fTy.getInput(6) : fTy.getInput(5)); else sourceLine = fir::factory::locationToLineNo(builder, loc, fTy.getInput(4)); @@ -174,14 +174,23 @@ static mlir::LogicalResult convertOpToCall(OpTy op, } llvm::SmallVector args; if constexpr (std::is_same_v) { - if (op.getSource()) + if (op.getSource()) { + mlir::Value stream = + op.getStream() + ? op.getStream() + : builder.createIntegerConstant(loc, fTy.getInput(2), -1); args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(), - op.getSource(), hasStat, errmsg, - sourceFile, sourceLine); - else - args = - fir::runtime::createArguments(builder, loc, fTy, op.getBox(), hasStat, - errmsg, sourceFile, sourceLine); + op.getSource(), stream, hasStat, + errmsg, sourceFile, sourceLine); + } else { + mlir::Value stream = + op.getStream() + ? op.getStream() + : builder.createIntegerConstant(loc, fTy.getInput(1), -1); + args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(), + stream, hasStat, errmsg, sourceFile, + sourceLine); + } } else { args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(), hasStat, @@ -199,10 +208,6 @@ struct CUFAllocateOpConversion mlir::LogicalResult matchAndRewrite(cuf::AllocateOp op, mlir::PatternRewriter &rewriter) const override { - // TODO: Allocation using different stream. - if (op.getStream()) - return mlir::failure(); - // TODO: Pinned is a reference to a logical value that can be set to true // when pinned allocation succeed. This will require a new entry point. if (op.getPinned()) @@ -220,8 +225,9 @@ struct CUFAllocateOpConversion func = fir::runtime::getRuntimeFunc(loc, builder); else - func = fir::runtime::getRuntimeFunc( - loc, builder); + func = + fir::runtime::getRuntimeFunc( + loc, builder); return convertOpToCall(op, rewriter, func); } @@ -231,10 +237,7 @@ struct CUFAllocateOpConversion fir::runtime::getRuntimeFunc( loc, builder); else - // Allocation for local descriptor falls back on the standard runtime - // AllocatableAllocate as the dedicated allocator is set in the descriptor - // before the call. - func = fir::runtime::getRuntimeFunc( + func = fir::runtime::getRuntimeFunc( loc, builder); return convertOpToCall(op, rewriter, func); diff --git a/flang/runtime/CUDA/allocatable.cpp b/flang/runtime/CUDA/allocatable.cpp index 9fed50c859a9c..9be54e8906903 100644 --- a/flang/runtime/CUDA/allocatable.cpp +++ b/flang/runtime/CUDA/allocatable.cpp @@ -22,18 +22,11 @@ namespace Fortran::runtime::cuda { extern "C" { RT_EXT_API_GROUP_BEGIN -int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, bool hasStat, - const Descriptor *errMsg, const char *sourceFile, int sourceLine) { - if (desc.HasAddendum()) { - Terminator terminator{sourceFile, sourceLine}; - // TODO: This require a bit more work to set the correct type descriptor - // address - terminator.Crash( - "not yet implemented: CUDA descriptor allocation with addendum"); - } - // Perform the standard allocation. - int stat{RTNAME(AllocatableAllocate)( - desc, hasStat, errMsg, sourceFile, sourceLine)}; +int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t stream, + bool hasStat, const Descriptor *errMsg, const char *sourceFile, + int sourceLine) { + int stat{RTNAME(CUFAllocatableAllocate)( + desc, stream, hasStat, errMsg, sourceFile, sourceLine)}; #ifndef RT_DEVICE_COMPILATION // Descriptor synchronization is only done when the allocation is done // from the host. @@ -47,11 +40,27 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, bool hasStat, return stat; } -int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc, - const Descriptor &source, bool hasStat, const Descriptor *errMsg, - const char *sourceFile, int sourceLine) { +int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t stream, + bool hasStat, const Descriptor *errMsg, const char *sourceFile, + int sourceLine) { + if (desc.HasAddendum()) { + Terminator terminator{sourceFile, sourceLine}; + // TODO: This require a bit more work to set the correct type descriptor + // address + terminator.Crash( + "not yet implemented: CUDA descriptor allocation with addendum"); + } + // Perform the standard allocation. int stat{RTNAME(AllocatableAllocate)( - alloc, hasStat, errMsg, sourceFile, sourceLine)}; + desc, hasStat, errMsg, sourceFile, sourceLine)}; + return stat; +} + +int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc, + const Descriptor &source, int64_t stream, bool hasStat, + const Descriptor *errMsg, const char *sourceFile, int sourceLine) { + int stat{RTNAME(CUFAllocatableAllocate)( + alloc, stream, hasStat, errMsg, sourceFile, sourceLine)}; if (stat == StatOk) { Terminator terminator{sourceFile, sourceLine}; Fortran::runtime::DoFromSourceAssign( @@ -61,10 +70,10 @@ int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc, } int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc, - const Descriptor &source, bool hasStat, const Descriptor *errMsg, - const char *sourceFile, int sourceLine) { - int stat{RTNAME(AllocatableAllocate)( - alloc, hasStat, errMsg, sourceFile, sourceLine)}; + const Descriptor &source, int64_t stream, bool hasStat, + const Descriptor *errMsg, const char *sourceFile, int sourceLine) { + int stat{RTNAME(CUFAllocatableAllocateSync)( + alloc, stream, hasStat, errMsg, sourceFile, sourceLine)}; if (stat == StatOk) { Terminator terminator{sourceFile, sourceLine}; Fortran::runtime::DoFromSourceAssign( diff --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir index 47d75b16b7a2d..9b87c7546d1e9 100644 --- a/flang/test/Fir/CUDA/cuda-allocate.fir +++ b/flang/test/Fir/CUDA/cuda-allocate.fir @@ -19,7 +19,7 @@ func.func @_QPsub1() { // CHECK: %[[DESC:.*]] = fir.convert %[[DESC_RT_CALL]] : (!fir.ref>) -> !fir.ref>>> // CHECK: %[[DECL_DESC:.*]]:2 = hlfir.declare %[[DESC]] {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref>>>) -> !fir.ref> -// CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 +// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, i64, i1, !fir.box, !fir.ref, i32) -> i32 // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref>>>) -> !fir.ref> // CHECK: %{{.*}} = fir.call @_FortranAAllocatableDeallocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 @@ -47,7 +47,7 @@ func.func @_QPsub3() { // CHECK: %[[A:.*]]:2 = hlfir.declare %[[A_ADDR]] {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QMmod1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) // CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref>>>) -> !fir.ref> -// CHECK: fir.call @_FortranACUFAllocatableAllocate(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 +// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, i64, i1, !fir.box, !fir.ref, i32) -> i32 // CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref>>>) -> !fir.ref> // CHECK: fir.call @_FortranACUFAllocatableDeallocate(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 @@ -87,7 +87,7 @@ func.func @_QPsub5() { } // CHECK-LABEL: func.func @_QPsub5() -// CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 +// CHECK: fir.call @_FortranACUFAllocatableAllocate({{.*}}) : (!fir.ref>, i64, i1, !fir.box, !fir.ref, i32) -> i32 // CHECK: fir.call @_FortranAAllocatableDeallocate({{.*}}) : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 @@ -118,7 +118,7 @@ func.func @_QQsub6() attributes {fir.bindc_name = "test"} { // CHECK: %[[B:.*]]:2 = hlfir.declare %[[B_ADDR]] {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QMdataEb"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) // CHECK: _FortranAAllocatableSetBounds // CHECK: %[[B_BOX:.*]] = fir.convert %[[B]]#1 : (!fir.ref>>>) -> !fir.ref> -// CHECK: fir.call @_FortranACUFAllocatableAllocate(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 +// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, i64, i1, !fir.box, !fir.ref, i32) -> i32 func.func @_QPallocate_source() { @@ -142,7 +142,7 @@ func.func @_QPallocate_source() { // CHECK: %[[SOURCE:.*]] = fir.load %[[DECL_HOST]] : !fir.ref>>> // CHECK: %[[DEV_CONV:.*]] = fir.convert %[[DECL_DEV]] : (!fir.ref>>>) -> !fir.ref> // CHECK: %[[SOURCE_CONV:.*]] = fir.convert %[[SOURCE]] : (!fir.box>>) -> !fir.box -// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, i1, !fir.box, !fir.ref, i32) -> i32 +// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, i64, i1, !fir.box, !fir.ref, i32) -> i32 fir.global @_QMmod1Ea_d {data_attr = #cuf.cuda} : !fir.box>> { @@ -165,4 +165,20 @@ func.func @_QMmod1Pallocate_source_global() { // CHECK-LABEL: func.func @_QMmod1Pallocate_source_global() // CHECK: fir.call @_FortranACUFAllocatableAllocateSourceSync +func.func @_QQallocate_stream() { + %0 = cuf.alloc !fir.box>> {bindc_name = "a", data_attr = #cuf.cuda, uniq_name = "_QFEa"} -> !fir.ref>>> + %1 = fir.declare %0 {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFEa"} : (!fir.ref>>>) -> !fir.ref>>> + %2 = fir.alloca i64 {bindc_name = "stream1", uniq_name = "_QFEstream1"} + %3 = fir.declare %2 {uniq_name = "_QFEstream1"} : (!fir.ref) -> !fir.ref + %4 = fir.load %3 : !fir.ref + %5 = cuf.allocate %1 : !fir.ref>>> stream(%4 : i64) {data_attr = #cuf.cuda} -> i32 + return +} + +// CHECK-LABEL: func.func @_QQallocate_stream() +// CHECK: %[[STREAM_ALLOCA:.*]] = fir.alloca i64 {bindc_name = "stream1", uniq_name = "_QFEstream1"} +// CHECK: %[[STREAM:.*]] = fir.declare %[[STREAM_ALLOCA]] {uniq_name = "_QFEstream1"} : (!fir.ref) -> !fir.ref +// CHECK: %[[STREAM_LOAD:.*]] = fir.load %[[STREAM]] : !fir.ref +// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %[[STREAM_LOAD]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, i64, i1, !fir.box, !fir.ref, i32) -> i32 + } // end of module