@@ -114,58 +114,148 @@ RT_API_ATTRS void CheckIntegerKind(
114114 }
115115}
116116
// Shallow copy for the case where neither `to` nor `from` is contiguous.
// Both descriptors are traversed together, one element per loop iteration,
// and each element is copied with std::memcpy. The loop runs to.Elements()
// times.
//   P    - element type used for the typed Get<P>() accesses. When
//          sizeof(P) != 1 the copy size is the compile-time sizeof(P);
//          otherwise the run-time to.ElementBytes() is used.
//   RANK - template argument forwarded to DescriptorIterator.
// NOTE(review): the sizeof(P) path assumes the caller chose P so that
// sizeof(P) == to.ElementBytes() - confirm against the dispatching caller.
// In this diff view, '-' lines are the removed subscript-array version and
// '+' lines are its iterator-based replacement.
117+ template <typename P, int RANK>
117118RT_API_ATTRS void ShallowCopyDiscontiguousToDiscontiguous (
118119 const Descriptor &to, const Descriptor &from) {
119- SubscriptValue toAt[maxRank], fromAt[maxRank];
120- to.GetLowerBounds (toAt);
121- from.GetLowerBounds (fromAt);
120+ DescriptorIterator<RANK> toIt{to};
121+ DescriptorIterator<RANK> fromIt{from};
122+ // Knowing the size at compile time can enable memcpy inlining optimisations
123+ constexpr std::size_t typeElementBytes{sizeof (P)};
124+ // We might still need to check the actual size as a fallback
122125 std::size_t elementBytes{to.ElementBytes ()};
// Count down over every element of `to`; source and destination positions
// are advanced together in the loop's increment expression.
123126 for (std::size_t n{to.Elements ()}; n-- > 0 ;
124- to.IncrementSubscripts (toAt), from.IncrementSubscripts (fromAt)) {
125- std::memcpy (
126- to.Element <char >(toAt), from.Element <char >(fromAt), elementBytes);
127+ toIt.Advance (), fromIt.Advance ()) {
128+ // typeElementBytes == 1 when P is a char - the non-specialised case
129+ if constexpr (typeElementBytes != 1 ) {
130+ std::memcpy (
131+ toIt.template Get <P>(), fromIt.template Get <P>(), typeElementBytes);
132+ } else {
133+ std::memcpy (
134+ toIt.template Get <P>(), fromIt.template Get <P>(), elementBytes);
135+ }
127136 }
128137}
129138
// Shallow copy from a discontiguous `from` into a contiguous `to`.
// The destination is addressed with a raw char pointer that starts at
// to.OffsetElement() and is bumped by to.ElementBytes() per element; the
// source is traversed with a DescriptorIterator<RANK>. The loop runs
// to.Elements() times, copying one element per iteration with std::memcpy.
//   P    - element type used for the typed Get<P>() access on the source.
//          When sizeof(P) != 1 the copy size is the compile-time sizeof(P);
//          otherwise the run-time to.ElementBytes() is used.
//   RANK - template argument forwarded to DescriptorIterator.
// NOTE(review): the destination stride is always the run-time elementBytes
// while the typed copy size is sizeof(P); this assumes the caller chose P so
// that the two are equal - confirm against the dispatching caller.
139+ template <typename P, int RANK>
130140RT_API_ATTRS void ShallowCopyDiscontiguousToContiguous (
131141 const Descriptor &to, const Descriptor &from) {
132142 char *toAt{to.OffsetElement ()};
133- SubscriptValue fromAt[maxRank];
134- from.GetLowerBounds (fromAt);
143+ constexpr std::size_t typeElementBytes{sizeof (P)};
135144 std::size_t elementBytes{to.ElementBytes ()};
145+ DescriptorIterator<RANK> fromIt{from};
136146 for (std::size_t n{to.Elements ()}; n-- > 0 ;
137- toAt += elementBytes, from.IncrementSubscripts (fromAt)) {
138- std::memcpy (toAt, from.Element <char >(fromAt), elementBytes);
147+ toAt += elementBytes, fromIt.Advance ()) {
// sizeof(P) == 1 (P is char) selects the run-time size instead.
148+ if constexpr (typeElementBytes != 1 ) {
149+ std::memcpy (toAt, fromIt.template Get <P>(), typeElementBytes);
150+ } else {
151+ std::memcpy (toAt, fromIt.template Get <P>(), elementBytes);
152+ }
139153 }
140154}
141155
// Shallow copy from a contiguous `from` into a discontiguous `to`.
// The source is addressed with a raw char pointer that starts at
// from.OffsetElement() and is bumped by to.ElementBytes() per element; the
// destination is traversed with a DescriptorIterator<RANK>. The loop runs
// to.Elements() times, copying one element per iteration with std::memcpy.
//   P    - element type used for the typed Get<P>() access on the
//          destination. When sizeof(P) != 1 the copy size is the
//          compile-time sizeof(P); otherwise the run-time to.ElementBytes()
//          is used.
//   RANK - template argument forwarded to DescriptorIterator.
// NOTE(review): the source stride comes from `to`, not `from`; this assumes
// both descriptors share the same element size - confirm with callers.
156+ template <typename P, int RANK>
142157RT_API_ATTRS void ShallowCopyContiguousToDiscontiguous (
143158 const Descriptor &to, const Descriptor &from) {
144- SubscriptValue toAt[maxRank];
145- to.GetLowerBounds (toAt);
146159 char *fromAt{from.OffsetElement ()};
160+ DescriptorIterator<RANK> toIt{to};
161+ constexpr std::size_t typeElementBytes{sizeof (P)};
147162 std::size_t elementBytes{to.ElementBytes ()};
148163 for (std::size_t n{to.Elements ()}; n-- > 0 ;
149- to.IncrementSubscripts (toAt), fromAt += elementBytes) {
150- std::memcpy (to.Element <char >(toAt), fromAt, elementBytes);
164+ toIt.Advance (), fromAt += elementBytes) {
// sizeof(P) == 1 (P is char) selects the run-time size instead.
165+ if constexpr (typeElementBytes != 1 ) {
166+ std::memcpy (toIt.template Get <P>(), fromAt, typeElementBytes);
167+ } else {
168+ std::memcpy (toIt.template Get <P>(), fromAt, elementBytes);
169+ }
151170 }
152171}
153172
154- RT_API_ATTRS void ShallowCopy (const Descriptor &to, const Descriptor &from,
173+ // ShallowCopy helper for calling the correct specialised variant based on
174+ // scenario
// Dispatches on the two contiguity flags:
//   to contiguous,    from contiguous    -> one std::memcpy of
//                                           to.Elements() * to.ElementBytes()
//                                           bytes between the OffsetElement()
//                                           addresses
//   to contiguous,    from discontiguous -> ShallowCopyDiscontiguousToContiguous
//   to discontiguous, from contiguous    -> ShallowCopyContiguousToDiscontiguous
//   neither contiguous                   -> ShallowCopyDiscontiguousToDiscontiguous
// P and RANK are forwarded unchanged to the per-element helpers; RANK
// defaults to -1 when the caller does not supply it.
// NOTE(review): -1 presumably selects a rank-generic DescriptorIterator -
// confirm against DescriptorIterator's definition.
175+ template <typename P, int RANK = -1 >
176+ RT_API_ATTRS void ShallowCopyInner (const Descriptor &to, const Descriptor &from,
155177 bool toIsContiguous, bool fromIsContiguous) {
156178 if (toIsContiguous) {
157179 if (fromIsContiguous) {
// Both sides contiguous: the whole payload is copied in a single call.
158180 std::memcpy (to.OffsetElement (), from.OffsetElement (),
159181 to.Elements () * to.ElementBytes ());
160182 } else {
161- ShallowCopyDiscontiguousToContiguous (to, from);
183+ ShallowCopyDiscontiguousToContiguous<P, RANK> (to, from);
162184 }
163185 } else {
164186 if (fromIsContiguous) {
165- ShallowCopyContiguousToDiscontiguous (to, from);
187+ ShallowCopyContiguousToDiscontiguous<P, RANK>(to, from);
188+ } else {
189+ ShallowCopyDiscontiguousToDiscontiguous<P, RANK>(to, from);
190+ }
191+ }
192+ }
193+
194+ // Most arrays are much closer to rank-1 than to maxRank.
195+ // Doing the recursion upwards instead of downwards puts the more common
196+ // cases earlier in the if-chain and has a tangible impact on performance.
// Compile-time recursive rank matcher. execute() compares the run-time rank
// of both descriptors with the template argument RANK:
//   - if both equal RANK, it calls ShallowCopyInner<P, RANK> and returns true;
//   - otherwise it delegates to ShallowCopyRankSpecialize<P, RANK + 1> and
//     returns that result.
// NOTE(review): unlike the free functions in this hunk, execute() is not
// marked RT_API_ATTRS - confirm whether that is intentional.
197+ template <typename P, int RANK> struct ShallowCopyRankSpecialize {
198+ static bool execute (const Descriptor &to, const Descriptor &from,
199+ bool toIsContiguous, bool fromIsContiguous) {
200+ if (to.rank () == RANK && from.rank () == RANK) {
201+ ShallowCopyInner<P, RANK>(to, from, toIsContiguous, fromIsContiguous);
202+ return true ;
203+ }
204+ return ShallowCopyRankSpecialize<P, RANK + 1 >::execute (
205+ to, from, toIsContiguous, fromIsContiguous);
206+ }
207+ };
208+
// Partial specialisation for RANK == maxRank + 1. Its execute() performs no
// copy and always returns false; none of its parameters are used.
// NOTE(review): execute() is not marked RT_API_ATTRS here either - confirm
// whether that is intentional.
209+ template <typename P> struct ShallowCopyRankSpecialize <P, maxRank + 1 > {
210+ static bool execute (const Descriptor &to, const Descriptor &from,
211+ bool toIsContiguous, bool fromIsContiguous) {
212+ return false ;
213+ }
214+ };
215+
216+ // ShallowCopy helper for specialising the variants based on array rank
// P is the element type forwarded to the copy helpers. The rank search
// starts at ShallowCopyRankSpecialize<P, 1>; when execute() returns false,
// the copy is performed by ShallowCopyInner<P> with its RANK template
// argument left unspecified.
217+ template <typename P>
218+ RT_API_ATTRS void ShallowCopyRank (const Descriptor &to, const Descriptor &from,
219+ bool toIsContiguous, bool fromIsContiguous) {
220+ // Try to call a specialised ShallowCopy variant from rank-1 up to maxRank
221+ bool specialized{ShallowCopyRankSpecialize<P, 1 >::execute (
222+ to, from, toIsContiguous, fromIsContiguous)};
// No rank-specialised variant handled the copy: use the generic one.
223+ if (!specialized) {
224+ ShallowCopyInner<P>(to, from, toIsContiguous, fromIsContiguous);
225+ }
226+ }
227+
// Entry point for shallow copies between two descriptors. It picks an element
// type P from to.type() and to.ElementBytes() and forwards all four arguments
// to ShallowCopyRank<P>:
//   - integer elements whose size equals sizeof(int64_t), sizeof(int32_t) or
//     sizeof(int16_t) (and sizeof(__int128_t) when USING_NATIVE_INT128_T is
//     defined) use the matching integer type;
//   - real elements whose size equals sizeof(double) or sizeof(float) use the
//     matching floating-point type;
//   - every other case uses char.
// toIsContiguous / fromIsContiguous are passed through untouched.
// NOTE(review): only `to` is inspected when choosing P; this presumably
// relies on `from` having the same element type and size - confirm with
// callers.
228+ RT_API_ATTRS void ShallowCopy (const Descriptor &to, const Descriptor &from,
229+ bool toIsContiguous, bool fromIsContiguous) {
230+ std::size_t elementBytes{to.ElementBytes ()};
231+ // Checking the type at runtime and making sure the pointer passed to memcpy
232+ // has a type that matches the element type makes it possible for the compiler
233+ // to optimise out the memcpy calls altogether and can substantially improve
234+ // performance for some applications.
235+ if (to.type ().IsInteger ()) {
236+ if (elementBytes == sizeof (int64_t )) {
237+ ShallowCopyRank<int64_t >(to, from, toIsContiguous, fromIsContiguous);
238+ } else if (elementBytes == sizeof (int32_t )) {
239+ ShallowCopyRank<int32_t >(to, from, toIsContiguous, fromIsContiguous);
240+ } else if (elementBytes == sizeof (int16_t )) {
241+ ShallowCopyRank<int16_t >(to, from, toIsContiguous, fromIsContiguous);
242+ #if defined USING_NATIVE_INT128_T
243+ } else if (elementBytes == sizeof (__int128_t )) {
244+ ShallowCopyRank<__int128_t >(to, from, toIsContiguous, fromIsContiguous);
245+ #endif
166246 } else {
167- ShallowCopyDiscontiguousToDiscontiguous (to, from);
// Integer element size with no typed instantiation above: copy as char.
247+ ShallowCopyRank< char > (to, from, toIsContiguous, fromIsContiguous );
168248 }
249+ } else if (to.type ().IsReal ()) {
250+ if (elementBytes == sizeof (double )) {
251+ ShallowCopyRank<double >(to, from, toIsContiguous, fromIsContiguous);
252+ } else if (elementBytes == sizeof (float )) {
253+ ShallowCopyRank<float >(to, from, toIsContiguous, fromIsContiguous);
254+ } else {
255+ ShallowCopyRank<char >(to, from, toIsContiguous, fromIsContiguous);
256+ }
257+ } else {
// Neither integer nor real: copy as char.
258+ ShallowCopyRank<char >(to, from, toIsContiguous, fromIsContiguous);
169259 }
170260}
171261
0 commit comments