Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions flang-rt/include/flang-rt/runtime/descriptor.h
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,64 @@ class Descriptor {
};
static_assert(sizeof(Descriptor) == sizeof(ISO::CFI_cdesc_t));

// Lightweight iterator-like API to simplify specialising Descriptor indexing
// in cases where it can improve application performance. On account of the
// purpose of this API being performance optimisation, it is up to the user to
// do all the necessary checks to make sure the specialised variants can be used
// safely and that Advance() is not called more times than the number of
// elements in the Descriptor allows for.
// Default RANK=-1 supports array descriptors of any rank up to maxRank.
template <int RANK = -1> class DescriptorIterator {
private:
  // Descriptor being traversed. Held by reference, so it must outlive the
  // iterator.
  const Descriptor &descriptor;
  // Current subscripts. Only initialised and used when RANK != 1.
  SubscriptValue subscripts[maxRank];
  // Byte offset of the current element. Only advanced when RANK == 1.
  std::size_t elementOffset{0};

public:
  // Positions the iterator on the first element of the descriptor.
  RT_API_ATTRS DescriptorIterator(const Descriptor &descriptor)
      : descriptor(descriptor) {
    // We do not need the subscripts to iterate over a rank-1 array
    if constexpr (RANK != 1) {
      descriptor.GetLowerBounds(subscripts);
    }
  }

  // Returns a pointer, typed as A*, to the element the iterator currently
  // designates. Does not modify the iterator, hence const.
  template <typename A> RT_API_ATTRS A *Get() const {
    std::size_t offset{0};
    if constexpr (RANK == 1) {
      // The rank-1 case doesn't require looping at all
      offset = elementOffset;
    } else if constexpr (RANK != -1) {
      // The compiler might be able to optimise this better if we know the
      // rank at compile time
      for (int j{0}; j < RANK; ++j) {
        offset += descriptor.SubscriptByteOffset(j, subscripts[j]);
      }
    } else {
      // General fallback
      offset = descriptor.SubscriptsToByteOffset(subscripts);
    }

    return descriptor.OffsetElement<A>(offset);
  }

  // Moves to the next element. No bounds check is performed here; the caller
  // decides how many times to call this.
  RT_API_ATTRS void Advance() {
    if constexpr (RANK == 1) {
      // One stride addition is enough for a rank-1 array.
      elementOffset += descriptor.GetDimension(0).ByteStride();
    } else if constexpr (RANK != -1) {
      // Increment the first subscript; whenever a subscript passes its upper
      // bound, reset it to the lower bound and carry into the next dimension.
      for (int j{0}; j < RANK; ++j) {
        const Dimension &dim{descriptor.GetDimension(j)};
        if (subscripts[j]++ < dim.UpperBound()) {
          break;
        }
        subscripts[j] = dim.LowerBound();
      }
    } else {
      descriptor.IncrementSubscripts(subscripts);
    }
  }
};

// Properly configured instances of StaticDescriptor will occupy the
// exact amount of storage required for the descriptor, its dimensional
// information, and possible addendum. To build such a static descriptor,
Expand Down
3 changes: 3 additions & 0 deletions flang-rt/include/flang-rt/runtime/tools.h
Original file line number Diff line number Diff line change
Expand Up @@ -511,10 +511,13 @@ inline RT_API_ATTRS const char *FindCharacter(
// Copy payload data from one allocated descriptor to another.
// Assumes element counts and element sizes match, and that both
// descriptors are allocated.
// Template parameters shared by the three variants below:
//   P    - element type used to size each per-element copy; the default char
//          is the non-specialised case.
//   RANK - array rank known at compile time; the default -1 supports any
//          rank up to maxRank.
// Neither `to` nor `from` is contiguous.
template <typename P = char, int RANK = -1>
RT_API_ATTRS void ShallowCopyDiscontiguousToDiscontiguous(
const Descriptor &to, const Descriptor &from);
// `to` is contiguous, `from` is not.
template <typename P = char, int RANK = -1>
RT_API_ATTRS void ShallowCopyDiscontiguousToContiguous(
const Descriptor &to, const Descriptor &from);
// `from` is contiguous, `to` is not.
template <typename P = char, int RANK = -1>
RT_API_ATTRS void ShallowCopyContiguousToDiscontiguous(
const Descriptor &to, const Descriptor &from);
RT_API_ATTRS void ShallowCopy(const Descriptor &to, const Descriptor &from,
Expand Down
12 changes: 7 additions & 5 deletions flang-rt/lib/runtime/assign.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -494,7 +494,7 @@ RT_API_ATTRS void Assign(Descriptor &to, const Descriptor &from,
}
} else { // elemental copies, possibly with character truncation
for (std::size_t n{toElements}; n-- > 0;
to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
memmoveFct(to.Element<char>(toAt), from.Element<const char>(fromAt),
toElementBytes);
}
Expand Down Expand Up @@ -588,7 +588,8 @@ void RTDEF(CopyInAssign)(Descriptor &temp, const Descriptor &var,
temp = var;
temp.set_base_addr(nullptr);
temp.raw().attribute = CFI_attribute_allocatable;
RTNAME(AssignTemporary)(temp, var, sourceFile, sourceLine);
temp.Allocate(kNoAsyncId);
ShallowCopy(temp, var);
}

void RTDEF(CopyOutAssign)(
Expand All @@ -597,9 +598,10 @@ void RTDEF(CopyOutAssign)(

// Copyout from the temporary must not cause any finalizations
// for LHS. The variable must be properly initialized already.
if (var)
Assign(*var, temp, terminator, NoAssignFlags);
temp.Destroy(/*finalize=*/false, /*destroyPointers=*/false, &terminator);
if (var) {
ShallowCopy(*var, temp);
}
temp.Deallocate();
}

void RTDEF(AssignExplicitLengthCharacter)(Descriptor &to,
Expand Down
126 changes: 108 additions & 18 deletions flang-rt/lib/runtime/tools.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,58 +114,148 @@ RT_API_ATTRS void CheckIntegerKind(
}
}

// Copies to.Elements() elements one at a time from `from` into `to` when
// neither descriptor is contiguous.
//   P    - element type; when sizeof(P) != 1 its size is the compile-time
//          memcpy length, otherwise the runtime to.ElementBytes() is used.
//   RANK - forwarded to DescriptorIterator to select its indexing strategy.
// NOTE(review): this hunk appears to interleave the removed subscript-based
// loop with the added DescriptorIterator-based loop (two increment clauses,
// two memcpy calls) - confirm against the post-change file.
template <typename P, int RANK>
RT_API_ATTRS void ShallowCopyDiscontiguousToDiscontiguous(
const Descriptor &to, const Descriptor &from) {
SubscriptValue toAt[maxRank], fromAt[maxRank];
to.GetLowerBounds(toAt);
from.GetLowerBounds(fromAt);
DescriptorIterator<RANK> toIt{to};
DescriptorIterator<RANK> fromIt{from};
// Knowing the size at compile time can enable memcpy inlining optimisations
constexpr std::size_t typeElementBytes{sizeof(P)};
// We might still need to check the actual size as a fallback
std::size_t elementBytes{to.ElementBytes()};
for (std::size_t n{to.Elements()}; n-- > 0;
to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
std::memcpy(
to.Element<char>(toAt), from.Element<char>(fromAt), elementBytes);
toIt.Advance(), fromIt.Advance()) {
// typeElementBytes == 1 when P is a char - the non-specialised case
if constexpr (typeElementBytes != 1) {
std::memcpy(
toIt.template Get<P>(), fromIt.template Get<P>(), typeElementBytes);
} else {
std::memcpy(
toIt.template Get<P>(), fromIt.template Get<P>(), elementBytes);
}
}
}

// Copies to.Elements() elements one at a time from a non-contiguous `from`
// into a contiguous `to`. The destination is walked with a plain char pointer
// advanced by to.ElementBytes(); the source is walked by element.
//   P    - element type; when sizeof(P) != 1 its size is the compile-time
//          memcpy length, otherwise the runtime to.ElementBytes() is used.
//   RANK - forwarded to DescriptorIterator for the source descriptor.
// NOTE(review): this hunk appears to interleave the removed subscript-based
// source traversal with the added DescriptorIterator-based one - confirm
// against the post-change file.
template <typename P, int RANK>
RT_API_ATTRS void ShallowCopyDiscontiguousToContiguous(
const Descriptor &to, const Descriptor &from) {
char *toAt{to.OffsetElement()};
SubscriptValue fromAt[maxRank];
from.GetLowerBounds(fromAt);
constexpr std::size_t typeElementBytes{sizeof(P)};
std::size_t elementBytes{to.ElementBytes()};
DescriptorIterator<RANK> fromIt{from};
for (std::size_t n{to.Elements()}; n-- > 0;
toAt += elementBytes, from.IncrementSubscripts(fromAt)) {
std::memcpy(toAt, from.Element<char>(fromAt), elementBytes);
toAt += elementBytes, fromIt.Advance()) {
if constexpr (typeElementBytes != 1) {
std::memcpy(toAt, fromIt.template Get<P>(), typeElementBytes);
} else {
std::memcpy(toAt, fromIt.template Get<P>(), elementBytes);
}
}
}

// Copies to.Elements() elements one at a time from a contiguous `from` into a
// non-contiguous `to`. The source is walked with a plain char pointer
// advanced by to.ElementBytes(); the destination is walked by element.
//   P    - element type; when sizeof(P) != 1 its size is the compile-time
//          memcpy length, otherwise the runtime to.ElementBytes() is used.
//   RANK - forwarded to DescriptorIterator for the destination descriptor.
// NOTE(review): this hunk appears to interleave the removed subscript-based
// destination traversal with the added DescriptorIterator-based one - confirm
// against the post-change file.
template <typename P, int RANK>
RT_API_ATTRS void ShallowCopyContiguousToDiscontiguous(
const Descriptor &to, const Descriptor &from) {
SubscriptValue toAt[maxRank];
to.GetLowerBounds(toAt);
char *fromAt{from.OffsetElement()};
DescriptorIterator<RANK> toIt{to};
constexpr std::size_t typeElementBytes{sizeof(P)};
std::size_t elementBytes{to.ElementBytes()};
for (std::size_t n{to.Elements()}; n-- > 0;
to.IncrementSubscripts(toAt), fromAt += elementBytes) {
std::memcpy(to.Element<char>(toAt), fromAt, elementBytes);
toIt.Advance(), fromAt += elementBytes) {
if constexpr (typeElementBytes != 1) {
std::memcpy(toIt.template Get<P>(), fromAt, typeElementBytes);
} else {
std::memcpy(toIt.template Get<P>(), fromAt, elementBytes);
}
}
}

RT_API_ATTRS void ShallowCopy(const Descriptor &to, const Descriptor &from,
// ShallowCopy helper for calling the correct specialised variant based on
// scenario
// Dispatches on the two contiguity flags:
//   both contiguous    -> one memcpy of to.Elements() * to.ElementBytes() bytes
//   only `to`          -> ShallowCopyDiscontiguousToContiguous
//   only `from`        -> ShallowCopyContiguousToDiscontiguous
//   neither            -> ShallowCopyDiscontiguousToDiscontiguous
// P and RANK are passed through to the selected variant.
// NOTE(review): the calls without explicit <P, RANK> arguments appear to be
// removed lines of the diff shown next to their replacements - confirm
// against the post-change file.
template <typename P, int RANK = -1>
RT_API_ATTRS void ShallowCopyInner(const Descriptor &to, const Descriptor &from,
bool toIsContiguous, bool fromIsContiguous) {
if (toIsContiguous) {
if (fromIsContiguous) {
std::memcpy(to.OffsetElement(), from.OffsetElement(),
to.Elements() * to.ElementBytes());
} else {
ShallowCopyDiscontiguousToContiguous(to, from);
ShallowCopyDiscontiguousToContiguous<P, RANK>(to, from);
}
} else {
if (fromIsContiguous) {
ShallowCopyContiguousToDiscontiguous(to, from);
ShallowCopyContiguousToDiscontiguous<P, RANK>(to, from);
} else {
ShallowCopyDiscontiguousToDiscontiguous<P, RANK>(to, from);
}
}
}

// Most arrays are much closer to rank-1 than to maxRank.
// Doing the recursion upwards instead of downwards puts the more common
// cases earlier in the if-chain and has a tangible impact on performance.
template <typename P, int RANK> struct ShallowCopyRankSpecialize {
static bool execute(const Descriptor &to, const Descriptor &from,
bool toIsContiguous, bool fromIsContiguous) {
if (to.rank() == RANK && from.rank() == RANK) {
ShallowCopyInner<P, RANK>(to, from, toIsContiguous, fromIsContiguous);
return true;
}
return ShallowCopyRankSpecialize<P, RANK + 1>::execute(
to, from, toIsContiguous, fromIsContiguous);
}
};

template <typename P> struct ShallowCopyRankSpecialize<P, maxRank + 1> {
static bool execute(const Descriptor &to, const Descriptor &from,
bool toIsContiguous, bool fromIsContiguous) {
return false;
}
};

// ShallowCopy helper for specialising the variants based on array rank
// Picks a rank-specialised copy when one exists for the descriptors' rank
// and otherwise uses the rank-agnostic implementation.
template <typename P>
RT_API_ATTRS void ShallowCopyRank(const Descriptor &to, const Descriptor &from,
    bool toIsContiguous, bool fromIsContiguous) {
  // Walk the specialisations from rank 1 upwards; done if one of them matched.
  if (ShallowCopyRankSpecialize<P, 1>::execute(
          to, from, toIsContiguous, fromIsContiguous)) {
    return;
  }
  // No specialisation matched: use the default (RANK = -1) variant.
  ShallowCopyInner<P>(to, from, toIsContiguous, fromIsContiguous);
}

// Chooses an element type P from the type category of `to` and its element
// size, then delegates to ShallowCopyRank<P>. Integer sizes matching int64_t,
// int32_t, int16_t (and __int128_t when USING_NATIVE_INT128_T is defined) and
// real sizes matching double and float get a typed copy; everything else uses
// P = char.
// NOTE(review): the bare ShallowCopyDiscontiguousToDiscontiguous(to, from)
// call in the integer fallback branch appears to be a removed line of the
// diff left next to its replacement - confirm against the post-change file.
RT_API_ATTRS void ShallowCopy(const Descriptor &to, const Descriptor &from,
bool toIsContiguous, bool fromIsContiguous) {
std::size_t elementBytes{to.ElementBytes()};
// Checking the type at runtime and making sure the pointer passed to memcpy
// has a type that matches the element type makes it possible for the compiler
// to optimise out the memcpy calls altogether and can substantially improve
// performance for some applications.
if (to.type().IsInteger()) {
if (elementBytes == sizeof(int64_t)) {
ShallowCopyRank<int64_t>(to, from, toIsContiguous, fromIsContiguous);
} else if (elementBytes == sizeof(int32_t)) {
ShallowCopyRank<int32_t>(to, from, toIsContiguous, fromIsContiguous);
} else if (elementBytes == sizeof(int16_t)) {
ShallowCopyRank<int16_t>(to, from, toIsContiguous, fromIsContiguous);
#if defined USING_NATIVE_INT128_T
} else if (elementBytes == sizeof(__int128_t)) {
ShallowCopyRank<__int128_t>(to, from, toIsContiguous, fromIsContiguous);
#endif
} else {
ShallowCopyDiscontiguousToDiscontiguous(to, from);
ShallowCopyRank<char>(to, from, toIsContiguous, fromIsContiguous);
}
} else if (to.type().IsReal()) {
if (elementBytes == sizeof(double)) {
ShallowCopyRank<double>(to, from, toIsContiguous, fromIsContiguous);
} else if (elementBytes == sizeof(float)) {
ShallowCopyRank<float>(to, from, toIsContiguous, fromIsContiguous);
} else {
ShallowCopyRank<char>(to, from, toIsContiguous, fromIsContiguous);
}
} else {
ShallowCopyRank<char>(to, from, toIsContiguous, fromIsContiguous);
}
}

Expand Down
55 changes: 55 additions & 0 deletions flang-rt/unittests/Runtime/Assign.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
//===-- unittests/Runtime/Assign.cpp ------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "flang/Runtime/assign.h"
#include "tools.h"
#include "gtest/gtest.h"
#include <vector>

using namespace Fortran::runtime;
using Fortran::common::TypeCategory;

// Exercises CopyInAssign for a contiguous rank-2 source and for a strided
// (discontiguous) rank-1 source, checking that the temporary receives the
// expected elements in order.
TEST(Assign, RTNAME(CopyInAssign)) {
  // contiguous -> contiguous copy in
  auto intArray{MakeArray<TypeCategory::Integer, 1>(
      std::vector<int>{2, 3}, std::vector<int>{1, 2, 3, 4, 5, 6}, sizeof(int))};
  StaticDescriptor<2> staticIntResult;
  Descriptor &intResult{staticIntResult.descriptor()};

  RTNAME(CopyInAssign(intResult, *intArray));
  ASSERT_TRUE(intResult.IsAllocated());
  ASSERT_TRUE(intResult.IsContiguous());
  ASSERT_EQ(intResult.type(), intArray->type());
  ASSERT_EQ(intResult.ElementBytes(), sizeof(int));
  EXPECT_EQ(intResult.GetDimension(0).LowerBound(), 1);
  EXPECT_EQ(intResult.GetDimension(0).Extent(), 2);
  EXPECT_EQ(intResult.GetDimension(1).LowerBound(), 1);
  EXPECT_EQ(intResult.GetDimension(1).Extent(), 3);
  int expected[6] = {1, 2, 3, 4, 5, 6};
  EXPECT_EQ(
      std::memcmp(intResult.OffsetElement<int>(0), expected, 6 * sizeof(int)),
      0);
  intResult.Destroy();

  // discontiguous -> contiguous rank-1 copy in
  intArray = MakeArray<TypeCategory::Integer, 1>(std::vector<int>{8},
      std::vector<int>{1, 2, 3, 4, 5, 6, 7, 8}, sizeof(int));
  StaticDescriptor<1> staticIntResultStrided;
  Descriptor &intResultStrided{staticIntResultStrided.descriptor()};
  // Treat the descriptor as a strided array of 4
  intArray->GetDimension(0).SetByteStride(sizeof(int) * 2);
  intArray->GetDimension(0).SetExtent(4);
  RTNAME(CopyInAssign(intResultStrided, *intArray));

  // Stop here if the temporary was not allocated: the memcmp below would
  // otherwise read through an unallocated result.
  ASSERT_TRUE(intResultStrided.IsAllocated());

  int expectedStrided[4] = {1, 3, 5, 7};
  EXPECT_EQ(std::memcmp(intResultStrided.OffsetElement<int>(0), expectedStrided,
                4 * sizeof(int)),
      0);

  intResultStrided.Destroy();
}
1 change: 1 addition & 0 deletions flang-rt/unittests/Runtime/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ add_flangrt_unittest(RuntimeTests
AccessTest.cpp
Allocatable.cpp
ArrayConstructor.cpp
Assign.cpp
BufferTest.cpp
CharacterTest.cpp
CommandTest.cpp
Expand Down
Loading