diff --git a/flang-rt/include/flang-rt/runtime/descriptor.h b/flang-rt/include/flang-rt/runtime/descriptor.h
index 9907e7866e7bf..aa6ec1dbdebea 100644
--- a/flang-rt/include/flang-rt/runtime/descriptor.h
+++ b/flang-rt/include/flang-rt/runtime/descriptor.h
@@ -437,6 +437,64 @@ class Descriptor {
 };
 static_assert(sizeof(Descriptor) == sizeof(ISO::CFI_cdesc_t));
 
+// Lightweight iterator-like API to simplify specialising Descriptor indexing
+// in cases where it can improve application performance. Because the sole
+// purpose of this API is performance optimisation, it is up to the user to
+// perform all the checks needed to ensure that the specialised variants can
+// be used safely, and that Advance() is not called more times than the
+// number of elements in the Descriptor allows for.
+// The default RANK=-1 supports array descriptors of any rank up to maxRank.
+template <int RANK = -1> class DescriptorIterator {
+private:
+  const Descriptor &descriptor;
+  SubscriptValue subscripts[maxRank];
+  std::size_t elementOffset{0};
+
+public:
+  RT_API_ATTRS DescriptorIterator(const Descriptor &descriptor)
+      : descriptor(descriptor) {
+    // We do not need the subscripts to iterate over a rank-1 array
+    if constexpr (RANK != 1) {
+      descriptor.GetLowerBounds(subscripts);
+    }
+  }
+
+  template <typename A> RT_API_ATTRS A *Get() {
+    std::size_t offset{0};
+    // The rank-1 case doesn't require looping at all
+    if constexpr (RANK == 1) {
+      offset = elementOffset;
+      // The compiler might be able to optimise this better if we know the
+      // rank at compile time
+    } else if constexpr (RANK != -1) {
+      for (int j{0}; j < RANK; ++j) {
+        offset += descriptor.SubscriptByteOffset(j, subscripts[j]);
+      }
+      // General fallback
+    } else {
+      offset = descriptor.SubscriptsToByteOffset(subscripts);
+    }
+
+    return descriptor.OffsetElement<A>(offset);
+  }
+
+  RT_API_ATTRS void Advance() {
+    if constexpr (RANK == 1) {
+      elementOffset += descriptor.GetDimension(0).ByteStride();
+    } else if constexpr (RANK != -1) {
+      for (int j{0}; j < RANK; ++j) {
+        const Dimension &dim{descriptor.GetDimension(j)};
+        if (subscripts[j]++ < dim.UpperBound()) {
+          break;
+        }
+        subscripts[j] = dim.LowerBound();
+      }
+    } else {
+      descriptor.IncrementSubscripts(subscripts);
+    }
+  }
+};
+
 // Properly configured instances of StaticDescriptor will occupy the
 // exact amount of storage required for the descriptor, its dimensional
 // information, and possible addendum. To build such a static descriptor,
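The header comment above states the contract but leaves usage implicit. A minimal caller-side sketch, not part of the patch: `SumRank2` is a hypothetical helper, and the caller is assumed to have already verified the rank and element type before fixing them at compile time.

```cpp
#include <cstddef>
#include <cstdint>

using namespace Fortran::runtime;

// Sum a rank-2 INTEGER(8) array through the compile-time-rank iterator.
std::int64_t SumRank2(const Descriptor &d) {
  DescriptorIterator<2> it{d}; // rank fixed at compile time
  std::int64_t sum{0};
  for (std::size_t n{d.Elements()}; n-- > 0; it.Advance()) {
    sum += *it.Get<std::int64_t>(); // typed pointer to the current element
  }
  return sum;
}
```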
diff --git a/flang-rt/include/flang-rt/runtime/tools.h b/flang-rt/include/flang-rt/runtime/tools.h
index 91a026bf2ac14..a1b96f41f4936 100644
--- a/flang-rt/include/flang-rt/runtime/tools.h
+++ b/flang-rt/include/flang-rt/runtime/tools.h
@@ -511,10 +511,13 @@ inline RT_API_ATTRS const char *FindCharacter(
 // Copy payload data from one allocated descriptor to another.
 // Assumes element counts and element sizes match, and that both
 // descriptors are allocated.
+template <typename P = char, int RANK = -1>
 RT_API_ATTRS void ShallowCopyDiscontiguousToDiscontiguous(
     const Descriptor &to, const Descriptor &from);
+template <typename P = char, int RANK = -1>
 RT_API_ATTRS void ShallowCopyDiscontiguousToContiguous(
     const Descriptor &to, const Descriptor &from);
+template <typename P = char, int RANK = -1>
 RT_API_ATTRS void ShallowCopyContiguousToDiscontiguous(
     const Descriptor &to, const Descriptor &from);
 RT_API_ATTRS void ShallowCopy(const Descriptor &to, const Descriptor &from,
diff --git a/flang-rt/lib/runtime/assign.cpp b/flang-rt/lib/runtime/assign.cpp
index 4a813cd489022..9f4dcfa7d86c1 100644
--- a/flang-rt/lib/runtime/assign.cpp
+++ b/flang-rt/lib/runtime/assign.cpp
@@ -494,7 +494,7 @@ RT_API_ATTRS void Assign(Descriptor &to, const Descriptor &from,
       }
     } else { // elemental copies, possibly with character truncation
       for (std::size_t n{toElements}; n-- > 0;
-          to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
+           to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
        memmoveFct(to.Element<char>(toAt), from.Element<char>(fromAt),
            toElementBytes);
      }
@@ -588,7 +588,8 @@ void RTDEF(CopyInAssign)(Descriptor &temp, const Descriptor &var,
   temp = var;
   temp.set_base_addr(nullptr);
   temp.raw().attribute = CFI_attribute_allocatable;
-  RTNAME(AssignTemporary)(temp, var, sourceFile, sourceLine);
+  temp.Allocate(kNoAsyncId);
+  ShallowCopy(temp, var);
 }
 
 void RTDEF(CopyOutAssign)(
@@ -597,9 +598,10 @@ void RTDEF(CopyOutAssign)(
 
   // Copyout from the temporary must not cause any finalizations
   // for LHS. The variable must be properly initialized already.
-  if (var)
-    Assign(*var, temp, terminator, NoAssignFlags);
-  temp.Destroy(/*finalize=*/false, /*destroyPointers=*/false, &terminator);
+  if (var) {
+    ShallowCopy(*var, temp);
+  }
+  temp.Deallocate();
 }
 
 void RTDEF(AssignExplicitLengthCharacter)(Descriptor &to,
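With this change, copy-in reduces to an allocation plus a bitwise payload copy, and copy-out to a bitwise copy back plus deallocation; no finalization or defined assignment runs. A hypothetical caller-side bracket is sketched below. `CallWithCopyInOut` is an illustrative name, not compiler-generated code, and the trailing source-location arguments of both entry points (defaulted in flang/Runtime/assign.h) are omitted.

```cpp
using namespace Fortran::runtime;

// `actual` is a possibly discontiguous argument needing a contiguous copy.
void CallWithCopyInOut(Descriptor &actual) {
  StaticDescriptor<maxRank> staticTemp;
  Descriptor &temp{staticTemp.descriptor()};
  RTNAME(CopyInAssign)(temp, actual); // allocate temp, ShallowCopy payload in
  // ... call the procedure with the contiguous `temp` ...
  RTNAME(CopyOutAssign)(&actual, temp); // ShallowCopy back, then Deallocate()
}
```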
diff --git a/flang-rt/lib/runtime/tools.cpp b/flang-rt/lib/runtime/tools.cpp
index 5d6e35faca70a..e13d0fe10a63a 100644
--- a/flang-rt/lib/runtime/tools.cpp
+++ b/flang-rt/lib/runtime/tools.cpp
@@ -114,58 +114,148 @@ RT_API_ATTRS void CheckIntegerKind(
   }
 }
 
+template <typename P, int RANK>
 RT_API_ATTRS void ShallowCopyDiscontiguousToDiscontiguous(
     const Descriptor &to, const Descriptor &from) {
-  SubscriptValue toAt[maxRank], fromAt[maxRank];
-  to.GetLowerBounds(toAt);
-  from.GetLowerBounds(fromAt);
+  DescriptorIterator<RANK> toIt{to};
+  DescriptorIterator<RANK> fromIt{from};
+  // Knowing the size at compile time can enable memcpy inlining optimisations
+  constexpr std::size_t typeElementBytes{sizeof(P)};
+  // We might still need to check the actual size as a fallback
   std::size_t elementBytes{to.ElementBytes()};
   for (std::size_t n{to.Elements()}; n-- > 0;
-      to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
-    std::memcpy(
-        to.Element<char>(toAt), from.Element<char>(fromAt), elementBytes);
+      toIt.Advance(), fromIt.Advance()) {
+    // typeElementBytes == 1 when P is char - the non-specialised case
+    if constexpr (typeElementBytes != 1) {
+      std::memcpy(
+          toIt.template Get<P>(), fromIt.template Get<P>(), typeElementBytes);
+    } else {
+      std::memcpy(
+          toIt.template Get<P>(), fromIt.template Get<P>(), elementBytes);
+    }
   }
 }
 
+template <typename P, int RANK>
 RT_API_ATTRS void ShallowCopyDiscontiguousToContiguous(
     const Descriptor &to, const Descriptor &from) {
   char *toAt{to.OffsetElement()};
-  SubscriptValue fromAt[maxRank];
-  from.GetLowerBounds(fromAt);
+  constexpr std::size_t typeElementBytes{sizeof(P)};
   std::size_t elementBytes{to.ElementBytes()};
+  DescriptorIterator<RANK> fromIt{from};
   for (std::size_t n{to.Elements()}; n-- > 0;
-      toAt += elementBytes, from.IncrementSubscripts(fromAt)) {
-    std::memcpy(toAt, from.Element<char>(fromAt), elementBytes);
+      toAt += elementBytes, fromIt.Advance()) {
+    if constexpr (typeElementBytes != 1) {
+      std::memcpy(toAt, fromIt.template Get<P>(), typeElementBytes);
+    } else {
+      std::memcpy(toAt, fromIt.template Get<P>(), elementBytes);
+    }
   }
 }
 
+template <typename P, int RANK>
 RT_API_ATTRS void ShallowCopyContiguousToDiscontiguous(
     const Descriptor &to, const Descriptor &from) {
-  SubscriptValue toAt[maxRank];
-  to.GetLowerBounds(toAt);
   char *fromAt{from.OffsetElement()};
+  DescriptorIterator<RANK> toIt{to};
+  constexpr std::size_t typeElementBytes{sizeof(P)};
   std::size_t elementBytes{to.ElementBytes()};
   for (std::size_t n{to.Elements()}; n-- > 0;
-      to.IncrementSubscripts(toAt), fromAt += elementBytes) {
-    std::memcpy(to.Element<char>(toAt), fromAt, elementBytes);
+      toIt.Advance(), fromAt += elementBytes) {
+    if constexpr (typeElementBytes != 1) {
+      std::memcpy(toIt.template Get<P>(), fromAt, typeElementBytes);
+    } else {
+      std::memcpy(toIt.template Get<P>(), fromAt, elementBytes);
+    }
   }
 }
 
-RT_API_ATTRS void ShallowCopy(const Descriptor &to, const Descriptor &from,
+// ShallowCopy helper that calls the correct specialised variant for the
+// given contiguity scenario
+template <typename P, int RANK = -1>
+RT_API_ATTRS void ShallowCopyInner(const Descriptor &to, const Descriptor &from,
     bool toIsContiguous, bool fromIsContiguous) {
   if (toIsContiguous) {
     if (fromIsContiguous) {
       std::memcpy(to.OffsetElement(), from.OffsetElement(),
           to.Elements() * to.ElementBytes());
     } else {
-      ShallowCopyDiscontiguousToContiguous(to, from);
+      ShallowCopyDiscontiguousToContiguous<P, RANK>(to, from);
     }
   } else {
     if (fromIsContiguous) {
-      ShallowCopyContiguousToDiscontiguous(to, from);
+      ShallowCopyContiguousToDiscontiguous<P, RANK>(to, from);
     } else {
-      ShallowCopyDiscontiguousToDiscontiguous(to, from);
+      ShallowCopyDiscontiguousToDiscontiguous<P, RANK>(to, from);
     }
   }
 }
+
+// Most arrays are much closer to rank-1 than to maxRank.
+// Doing the recursion upwards instead of downwards puts the more common
+// cases earlier in the if-chain and has a tangible impact on performance.
+template <typename P, int RANK = 1> struct ShallowCopyRankSpecialize {
+  static bool execute(const Descriptor &to, const Descriptor &from,
+      bool toIsContiguous, bool fromIsContiguous) {
+    if (to.rank() == RANK && from.rank() == RANK) {
+      ShallowCopyInner<P, RANK>(to, from, toIsContiguous, fromIsContiguous);
+      return true;
+    }
+    return ShallowCopyRankSpecialize<P, RANK + 1>::execute(
+        to, from, toIsContiguous, fromIsContiguous);
+  }
+};
+
+template <typename P> struct ShallowCopyRankSpecialize<P, maxRank + 1> {
+  static bool execute(const Descriptor &to, const Descriptor &from,
+      bool toIsContiguous, bool fromIsContiguous) {
+    return false;
+  }
+};
+
+// ShallowCopy helper that specialises the variants based on array rank
+template <typename P>
+RT_API_ATTRS void ShallowCopyRank(const Descriptor &to, const Descriptor &from,
+    bool toIsContiguous, bool fromIsContiguous) {
+  // Try to call a specialised ShallowCopy variant, from rank-1 up to maxRank
+  bool specialized{ShallowCopyRankSpecialize<P>::execute(
+      to, from, toIsContiguous, fromIsContiguous)};
+  if (!specialized) {
+    ShallowCopyInner<P>(to, from, toIsContiguous, fromIsContiguous);
+  }
+}
+
+RT_API_ATTRS void ShallowCopy(const Descriptor &to, const Descriptor &from,
+    bool toIsContiguous, bool fromIsContiguous) {
+  std::size_t elementBytes{to.ElementBytes()};
+  // Checking the type at runtime, and passing memcpy a pointer whose type
+  // matches the element type, makes it possible for the compiler to optimise
+  // out the memcpy calls altogether; this can substantially improve
+  // performance for some applications.
+  if (to.type().IsInteger()) {
+    if (elementBytes == sizeof(int64_t)) {
+      ShallowCopyRank<int64_t>(to, from, toIsContiguous, fromIsContiguous);
+    } else if (elementBytes == sizeof(int32_t)) {
+      ShallowCopyRank<int32_t>(to, from, toIsContiguous, fromIsContiguous);
+    } else if (elementBytes == sizeof(int16_t)) {
+      ShallowCopyRank<int16_t>(to, from, toIsContiguous, fromIsContiguous);
+#if defined USING_NATIVE_INT128_T
+    } else if (elementBytes == sizeof(__int128_t)) {
+      ShallowCopyRank<__int128_t>(to, from, toIsContiguous, fromIsContiguous);
+#endif
+    } else {
+      ShallowCopyRank<char>(to, from, toIsContiguous, fromIsContiguous);
+    }
+  } else if (to.type().IsReal()) {
+    if (elementBytes == sizeof(double)) {
+      ShallowCopyRank<double>(to, from, toIsContiguous, fromIsContiguous);
+    } else if (elementBytes == sizeof(float)) {
+      ShallowCopyRank<float>(to, from, toIsContiguous, fromIsContiguous);
+    } else {
+      ShallowCopyRank<char>(to, from, toIsContiguous, fromIsContiguous);
+    }
+  } else {
+    ShallowCopyRank<char>(to, from, toIsContiguous, fromIsContiguous);
+  }
+}
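On the `typeElementBytes` split in the copy loops above: a `memcpy` whose size is a compile-time constant can be lowered to a plain load/store that the vectoriser can then combine across iterations, whereas a runtime size generally remains an opaque library call. A standalone illustration, not runtime code:

```cpp
#include <cstddef>
#include <cstring>
#include <cstdint>

// Constant size: compilers typically emit a single 8-byte load/store.
void CopyOne(std::int64_t *to, const std::int64_t *from) {
  std::memcpy(to, from, sizeof(std::int64_t));
}

// Runtime size: usually stays a real memcpy call, with no inlining.
void CopyOneDynamic(char *to, const char *from, std::size_t elementBytes) {
  std::memcpy(to, from, elementBytes);
}
```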
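And on `ShallowCopyRankSpecialize`: instantiated starting at rank 1, the upward recursion unrolls into an if-chain that tests the common low ranks first. Roughly, it behaves like the hand-written dispatch below (an illustrative expansion for P = int64_t, not code from the patch):

```cpp
bool ShallowCopyRankUnrolled(const Descriptor &to, const Descriptor &from,
    bool toIsContiguous, bool fromIsContiguous) {
  if (to.rank() == 1 && from.rank() == 1) {
    ShallowCopyInner<std::int64_t, 1>(to, from, toIsContiguous, fromIsContiguous);
    return true;
  }
  if (to.rank() == 2 && from.rank() == 2) {
    ShallowCopyInner<std::int64_t, 2>(to, from, toIsContiguous, fromIsContiguous);
    return true;
  }
  // ... and so on up to maxRank; when no rank matches, the caller falls
  // back to the generic ShallowCopyInner<std::int64_t>.
  return false;
}
```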
diff --git a/flang-rt/unittests/Runtime/Assign.cpp b/flang-rt/unittests/Runtime/Assign.cpp
new file mode 100644
index 0000000000000..4001cc90ca0a1
--- /dev/null
+++ b/flang-rt/unittests/Runtime/Assign.cpp
@@ -0,0 +1,55 @@
+//===-- unittests/Runtime/Assign.cpp --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Runtime/assign.h"
+#include "tools.h"
+#include "gtest/gtest.h"
+#include <vector>
+
+using namespace Fortran::runtime;
+using Fortran::common::TypeCategory;
+
+TEST(Assign, RTNAME(CopyInAssign)) {
+  // contiguous -> contiguous copy in
+  auto intArray{MakeArray<TypeCategory::Integer, 4>(std::vector<int>{2, 3},
+      std::vector<int>{1, 2, 3, 4, 5, 6}, sizeof(int))};
+  StaticDescriptor<2> staticIntResult;
+  Descriptor &intResult{staticIntResult.descriptor()};
+
+  RTNAME(CopyInAssign)(intResult, *intArray);
+  ASSERT_TRUE(intResult.IsAllocated());
+  ASSERT_TRUE(intResult.IsContiguous());
+  ASSERT_EQ(intResult.type(), intArray->type());
+  ASSERT_EQ(intResult.ElementBytes(), sizeof(int));
+  EXPECT_EQ(intResult.GetDimension(0).LowerBound(), 1);
+  EXPECT_EQ(intResult.GetDimension(0).Extent(), 2);
+  EXPECT_EQ(intResult.GetDimension(1).LowerBound(), 1);
+  EXPECT_EQ(intResult.GetDimension(1).Extent(), 3);
+  int expected[6] = {1, 2, 3, 4, 5, 6};
+  EXPECT_EQ(
+      std::memcmp(intResult.OffsetElement(0), expected, 6 * sizeof(int)), 0);
+  intResult.Destroy();
+
+  // discontiguous -> contiguous rank-1 copy in
+  intArray = MakeArray<TypeCategory::Integer, 4>(std::vector<int>{8},
+      std::vector<int>{1, 2, 3, 4, 5, 6, 7, 8}, sizeof(int));
+  StaticDescriptor<1> staticIntResultStrided;
+  Descriptor &intResultStrided{staticIntResultStrided.descriptor()};
+  // View the source as a 4-element section with a stride of two ints
+  intArray->GetDimension(0).SetByteStride(sizeof(int) * 2);
+  intArray->GetDimension(0).SetExtent(4);
+  RTNAME(CopyInAssign)(intResultStrided, *intArray);
+
+  int expectedStrided[4] = {1, 3, 5, 7};
+  EXPECT_EQ(std::memcmp(intResultStrided.OffsetElement(0), expectedStrided,
+                4 * sizeof(int)),
+      0);
+
+  intResultStrided.Destroy();
+}
diff --git a/flang-rt/unittests/Runtime/CMakeLists.txt b/flang-rt/unittests/Runtime/CMakeLists.txt
index 61d0aba93b14b..49f55a442863b 100644
--- a/flang-rt/unittests/Runtime/CMakeLists.txt
+++ b/flang-rt/unittests/Runtime/CMakeLists.txt
@@ -10,6 +10,7 @@ add_flangrt_unittest(RuntimeTests
   AccessTest.cpp
   Allocatable.cpp
   ArrayConstructor.cpp
+  Assign.cpp
   BufferTest.cpp
   CharacterTest.cpp
   CommandTest.cpp
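To connect the strided test case back to the iterator's rank-1 fast path: `Advance()` only bumps a byte offset by the dimension's byte stride, so the copy-in above reads offsets 0, 8, 16, and 24 bytes, i.e. elements 1, 3, 5, and 7. A standalone model of that arithmetic:

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>

int main() {
  const std::int32_t data[8]{1, 2, 3, 4, 5, 6, 7, 8};
  const std::size_t byteStride{sizeof(std::int32_t) * 2}; // every other element
  std::size_t elementOffset{0}; // what DescriptorIterator<1> tracks
  std::int32_t out[4];
  for (int n{0}; n < 4; ++n, elementOffset += byteStride) {
    const char *base{reinterpret_cast<const char *>(data)};
    out[n] = *reinterpret_cast<const std::int32_t *>(base + elementOffset);
  }
  assert(out[0] == 1 && out[1] == 3 && out[2] == 5 && out[3] == 7);
  return 0;
}
```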