Skip to content

Commit 8f24bb5

Browse files
committed
[flang-rt] Optimise ShallowCopy and elemental copies in Assign
Using Descriptor.Element<>() to iterate over a rank-1 array is currently inefficient: the generic implementation, which has to handle arrays of any rank, prevents the compiler from performing optimisations that would make the rank-1 case considerably faster. This kind of iteration currently happens inside ShallowCopy, as well as inside Assign, where the implementation of elemental copies is equivalent to ShallowCopyDiscontiguousToDiscontiguous. To address that, add a DescriptorIterator abstraction specialised both for the optimised rank-1 case and for the generic case, and use it throughout ShallowCopy to iterate over the arrays. Furthermore, depending on the pointer type passed to memcpy, the optimiser can remove the memcpy calls from ShallowCopy altogether, which can result in substantial performance improvements on its own. Check the element size throughout ShallowCopy and, where applicable, use the pointer type that matches it to make these optimisations possible. Finally, replace the implementation of elemental copies inside Assign with calls to the ShallowCopy* family of functions whenever possible. For the thornado-mini application, this reduces the runtime by 27.7%. Signed-off-by: Kajetan Puchalski <[email protected]>
1 parent c14acb7 commit 8f24bb5

File tree

4 files changed

+122
-22
lines changed

4 files changed

+122
-22
lines changed

flang-rt/include/flang-rt/runtime/descriptor.h

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -437,6 +437,43 @@ class Descriptor {
437437
};
438438
static_assert(sizeof(Descriptor) == sizeof(ISO::CFI_cdesc_t));
439439

440+
// Lightweight iterator-like API to simplify specialising Descriptor indexing
441+
// in cases where it can improve application performance. Because this API
442+
// exists purely for performance optimisation, it is up to the user to do all
443+
// the necessary checks to make sure the RANK1=true variant can be used
444+
// safely and that Advance() is not called more times than the number of
445+
// elements in the Descriptor allows for.
446+
template <bool RANK1 = false> class DescriptorIterator {
447+
private:
448+
const Descriptor &descriptor;
449+
SubscriptValue subscripts[maxRank];
450+
std::size_t elementOffset = 0;
451+
452+
public:
453+
DescriptorIterator(const Descriptor &descriptor) : descriptor(descriptor) {
454+
descriptor.GetLowerBounds(subscripts);
455+
if constexpr (RANK1) {
456+
elementOffset = descriptor.SubscriptByteOffset(0, subscripts[0]);
457+
}
458+
};
459+
460+
template <typename A> A *Get() {
461+
if constexpr (RANK1) {
462+
return descriptor.OffsetElement<A>(elementOffset);
463+
} else {
464+
return descriptor.Element<A>(subscripts);
465+
}
466+
}
467+
468+
void Advance() {
469+
if constexpr (RANK1) {
470+
elementOffset += descriptor.GetDimension(0).ByteStride();
471+
} else {
472+
descriptor.IncrementSubscripts(subscripts);
473+
}
474+
}
475+
};
476+
440477
// Properly configured instances of StaticDescriptor will occupy the
441478
// exact amount of storage required for the descriptor, its dimensional
442479
// information, and possible addendum. To build such a static descriptor,

flang-rt/include/flang-rt/runtime/tools.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -511,10 +511,13 @@ inline RT_API_ATTRS const char *FindCharacter(
511511
// Copy payload data from one allocated descriptor to another.
512512
// Assumes element counts and element sizes match, and that both
513513
// descriptors are allocated.
514+
template <bool RANK1 = false>
514515
RT_API_ATTRS void ShallowCopyDiscontiguousToDiscontiguous(
515516
const Descriptor &to, const Descriptor &from);
517+
template <bool RANK1 = false>
516518
RT_API_ATTRS void ShallowCopyDiscontiguousToContiguous(
517519
const Descriptor &to, const Descriptor &from);
520+
template <bool RANK1 = false>
518521
RT_API_ATTRS void ShallowCopyContiguousToDiscontiguous(
519522
const Descriptor &to, const Descriptor &from);
520523
RT_API_ATTRS void ShallowCopy(const Descriptor &to, const Descriptor &from,

flang-rt/lib/runtime/assign.cpp

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -492,11 +492,21 @@ RT_API_ATTRS void Assign(Descriptor &to, const Descriptor &from,
492492
terminator.Crash("unexpected type code %d in blank padded Assign()",
493493
to.type().raw());
494494
}
495-
} else { // elemental copies, possibly with character truncation
496-
for (std::size_t n{toElements}; n-- > 0;
497-
to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
498-
memmoveFct(to.Element<char>(toAt), from.Element<const char>(fromAt),
499-
toElementBytes);
495+
} else {
496+
// We can't simply call ShallowCopy due to edge cases such as character
497+
// truncation or assignments where the RHS is a scalar.
498+
if (toElementBytes == fromElementBytes && to.IsContiguous()) {
499+
if (to.rank() == 1 && from.rank() == 1) {
500+
ShallowCopyDiscontiguousToContiguous<true>(to, from);
501+
} else {
502+
ShallowCopyDiscontiguousToContiguous<false>(to, from);
503+
}
504+
} else {
505+
if (to.rank() == 1 && from.rank() == 1) {
506+
ShallowCopyDiscontiguousToDiscontiguous<true>(to, from);
507+
} else {
508+
ShallowCopyDiscontiguousToDiscontiguous<false>(to, from);
509+
}
500510
}
501511
}
502512
}

flang-rt/lib/runtime/tools.cpp

Lines changed: 67 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -114,40 +114,78 @@ RT_API_ATTRS void CheckIntegerKind(
114114
}
115115
}
116116

117+
template <bool RANK1>
117118
RT_API_ATTRS void ShallowCopyDiscontiguousToDiscontiguous(
118119
const Descriptor &to, const Descriptor &from) {
119-
SubscriptValue toAt[maxRank], fromAt[maxRank];
120-
to.GetLowerBounds(toAt);
121-
from.GetLowerBounds(fromAt);
120+
DescriptorIterator<RANK1> toIt{to};
121+
DescriptorIterator<RANK1> fromIt{from};
122122
std::size_t elementBytes{to.ElementBytes()};
123123
for (std::size_t n{to.Elements()}; n-- > 0;
124-
to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
125-
std::memcpy(
126-
to.Element<char>(toAt), from.Element<char>(fromAt), elementBytes);
124+
toIt.Advance(), fromIt.Advance()) {
125+
// Checking the size at runtime and making sure the pointer passed to memcpy
126+
// has a type that matches the element size makes it possible for the
127+
// compiler to optimise out the memcpy calls altogether and can
128+
// substantially improve performance for some applications.
129+
if (elementBytes == 16) {
130+
std::memcpy(toIt.template Get<__int128_t>(),
131+
fromIt.template Get<__int128_t>(), elementBytes);
132+
} else if (elementBytes == 8) {
133+
std::memcpy(toIt.template Get<int64_t>(), fromIt.template Get<int64_t>(),
134+
elementBytes);
135+
} else if (elementBytes == 4) {
136+
std::memcpy(toIt.template Get<int32_t>(), fromIt.template Get<int32_t>(),
137+
elementBytes);
138+
} else if (elementBytes == 2) {
139+
std::memcpy(toIt.template Get<int16_t>(), fromIt.template Get<int16_t>(),
140+
elementBytes);
141+
} else {
142+
std::memcpy(
143+
toIt.template Get<char>(), fromIt.template Get<char>(), elementBytes);
144+
}
127145
}
128146
}
129147

148+
template <bool RANK1>
130149
RT_API_ATTRS void ShallowCopyDiscontiguousToContiguous(
131150
const Descriptor &to, const Descriptor &from) {
132151
char *toAt{to.OffsetElement()};
133-
SubscriptValue fromAt[maxRank];
134-
from.GetLowerBounds(fromAt);
135152
std::size_t elementBytes{to.ElementBytes()};
153+
DescriptorIterator<RANK1> fromIt{from};
136154
for (std::size_t n{to.Elements()}; n-- > 0;
137-
toAt += elementBytes, from.IncrementSubscripts(fromAt)) {
138-
std::memcpy(toAt, from.Element<char>(fromAt), elementBytes);
155+
toAt += elementBytes, fromIt.Advance()) {
156+
if (elementBytes == 16) {
157+
std::memcpy(toAt, fromIt.template Get<__int128_t>(), elementBytes);
158+
} else if (elementBytes == 8) {
159+
std::memcpy(toAt, fromIt.template Get<int64_t>(), elementBytes);
160+
} else if (elementBytes == 4) {
161+
std::memcpy(toAt, fromIt.template Get<int32_t>(), elementBytes);
162+
} else if (elementBytes == 2) {
163+
std::memcpy(toAt, fromIt.template Get<int16_t>(), elementBytes);
164+
} else {
165+
std::memcpy(toAt, fromIt.template Get<char>(), elementBytes);
166+
}
139167
}
140168
}
141169

170+
template <bool RANK1>
142171
RT_API_ATTRS void ShallowCopyContiguousToDiscontiguous(
143172
const Descriptor &to, const Descriptor &from) {
144-
SubscriptValue toAt[maxRank];
145-
to.GetLowerBounds(toAt);
146173
char *fromAt{from.OffsetElement()};
174+
DescriptorIterator<RANK1> toIt{to};
147175
std::size_t elementBytes{to.ElementBytes()};
148176
for (std::size_t n{to.Elements()}; n-- > 0;
149-
to.IncrementSubscripts(toAt), fromAt += elementBytes) {
150-
std::memcpy(to.Element<char>(toAt), fromAt, elementBytes);
177+
toIt.Advance(), fromAt += elementBytes) {
178+
if (elementBytes == 16) {
179+
std::memcpy(toIt.template Get<__int128_t>(), fromAt, elementBytes);
180+
} else if (elementBytes == 8) {
181+
std::memcpy(toIt.template Get<int64_t>(), fromAt, elementBytes);
182+
} else if (elementBytes == 4) {
183+
std::memcpy(toIt.template Get<int32_t>(), fromAt, elementBytes);
184+
} else if (elementBytes == 2) {
185+
std::memcpy(toIt.template Get<int16_t>(), fromAt, elementBytes);
186+
} else {
187+
std::memcpy(toIt.template Get<char>(), fromAt, elementBytes);
188+
}
151189
}
152190
}
153191

@@ -158,13 +196,25 @@ RT_API_ATTRS void ShallowCopy(const Descriptor &to, const Descriptor &from,
158196
std::memcpy(to.OffsetElement(), from.OffsetElement(),
159197
to.Elements() * to.ElementBytes());
160198
} else {
161-
ShallowCopyDiscontiguousToContiguous(to, from);
199+
if (to.rank() == 1 && from.rank() == 1) {
200+
ShallowCopyDiscontiguousToContiguous<true>(to, from);
201+
} else {
202+
ShallowCopyDiscontiguousToContiguous<false>(to, from);
203+
}
162204
}
163205
} else {
164206
if (fromIsContiguous) {
165-
ShallowCopyContiguousToDiscontiguous(to, from);
207+
if (to.rank() == 1 && from.rank() == 1) {
208+
ShallowCopyContiguousToDiscontiguous<true>(to, from);
209+
} else {
210+
ShallowCopyContiguousToDiscontiguous<false>(to, from);
211+
}
166212
} else {
167-
ShallowCopyDiscontiguousToDiscontiguous(to, from);
213+
if (to.rank() == 1 && from.rank() == 1) {
214+
ShallowCopyDiscontiguousToDiscontiguous<true>(to, from);
215+
} else {
216+
ShallowCopyDiscontiguousToDiscontiguous<false>(to, from);
217+
}
168218
}
169219
}
170220
}

0 commit comments

Comments
 (0)