Skip to content

Commit 5fe057c

Browse files
committed
[flang][runtime] Speed up initialization & destruction
Rework derived type initialization in the runtime to just initialize the first element of any array, and then memcpy it to the others, rather than exercising the per-component paths for each element. Rework derived type destruction in the runtime to detect and exploit a fast path for allocatable components whose types themselves don't need nested destruction. Small tweaks were made in hot paths exposed by profiling in descriptor operations and derived type assignment.
1 parent 2910c24 commit 5fe057c

File tree

8 files changed

+241
-157
lines changed

8 files changed

+241
-157
lines changed

flang-rt/include/flang-rt/runtime/descriptor.h

Lines changed: 80 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,16 @@
2020

2121
#include "memory.h"
2222
#include "type-code.h"
23+
#include "flang-rt/runtime/allocator-registry.h"
2324
#include "flang/Common/ISO_Fortran_binding_wrapper.h"
25+
#include "flang/Common/optional.h"
2426
#include "flang/Runtime/descriptor-consts.h"
2527
#include <algorithm>
2628
#include <cassert>
2729
#include <cinttypes>
2830
#include <cstddef>
2931
#include <cstdio>
32+
#include <cstdlib>
3033
#include <cstring>
3134

3235
/// Value used for asyncObject when no specific stream is specified.
@@ -262,9 +265,20 @@ class Descriptor {
262265

263266
template <typename A>
264267
RT_API_ATTRS A *ZeroBasedIndexedElement(std::size_t n) const {
265-
SubscriptValue at[maxRank];
266-
if (SubscriptsForZeroBasedElementNumber(at, n)) {
267-
return Element<A>(at);
268+
if (raw_.rank == 0) {
269+
if (n == 0) {
270+
return OffsetElement<A>();
271+
}
272+
} else if (raw_.rank == 1) {
273+
const auto &dim{GetDimension(0)};
274+
if (n < static_cast<std::size_t>(dim.Extent())) {
275+
return OffsetElement<A>(n * dim.ByteStride());
276+
}
277+
} else {
278+
SubscriptValue at[maxRank];
279+
if (SubscriptsForZeroBasedElementNumber(at, n)) {
280+
return Element<A>(at);
281+
}
268282
}
269283
return nullptr;
270284
}
@@ -366,6 +380,18 @@ class Descriptor {
366380
RT_API_ATTRS std::size_t SizeInBytes() const;
367381

368382
RT_API_ATTRS std::size_t Elements() const;
383+
// Header-defined element count: the product of the extents of every
// dimension of this descriptor.  A rank-0 (scalar) descriptor yields 1.
RT_API_ATTRS std::size_t InlineElements() const {
  const int dims{rank()};
  std::size_t count{1};
  for (int k{0}; k < dims; ++k) {
    count *= static_cast<std::size_t>(GetDimension(k).Extent());
  }
  return count;
}
369395

370396
// Allocate() assumes Elements() and ElementBytes() work;
371397
// define the extents of the dimensions and the element length
@@ -377,7 +403,22 @@ class Descriptor {
377403

378404
// Deallocates storage; does not call FINAL subroutines or
379405
// deallocate allocatable/automatic components.
380-
RT_API_ATTRS int Deallocate();
406+
// Releases the storage addressed by the descriptor's base_addr and then
// clears base_addr.
// Returns CFI_ERROR_BASE_ADDR_NULL when there is nothing to release,
// CFI_SUCCESS otherwise.
RT_API_ATTRS int Deallocate() {
  ISO::CFI_cdesc_t &descriptor{raw()};
  void *pointer{descriptor.base_addr};
  if (!pointer) {
    return CFI_ERROR_BASE_ADDR_NULL;
  }
  // Query the allocator index once and reuse it on both paths.
  const int allocIndex{MapAllocIdx()};
  if (allocIndex == kDefaultAllocator) {
    // Fast path: the default allocator's storage is released with std::free
    // directly, bypassing the registry lookup.
    std::free(pointer);
  } else {
    allocatorRegistry.GetDeallocator(allocIndex)(pointer);
  }
  descriptor.base_addr = nullptr;
  return CFI_SUCCESS;
}
381422

382423
// Deallocates storage, including allocatable and automatic
383424
// components. Optionally invokes FINAL subroutines.
@@ -392,8 +433,7 @@ class Descriptor {
392433
bool stridesAreContiguous{true};
393434
for (int j{0}; j < leadingDimensions; ++j) {
394435
const Dimension &dim{GetDimension(j)};
395-
stridesAreContiguous &=
396-
(bytes == dim.ByteStride()) || (dim.Extent() == 1);
436+
stridesAreContiguous &= bytes == dim.ByteStride() || dim.Extent() == 1;
397437
bytes *= dim.Extent();
398438
}
399439
// One and zero element arrays are contiguous even if the descriptor
@@ -406,6 +446,32 @@ class Descriptor {
406446
return stridesAreContiguous || bytes == 0;
407447
}
408448

449+
// The result, if any, is a fixed stride value that can be used to
450+
// address all elements. It generalizes contiguity by also allowing
451+
// the case of an array with extent 1 on all but one dimension.
452+
RT_API_ATTRS common::optional<SubscriptValue> FixedStride() const {
453+
auto rank{static_cast<std::size_t>(raw_.rank)};
454+
common::optional<SubscriptValue> stride;
455+
for (std::size_t j{0}; j < rank; ++j) {
456+
const Dimension &dim{GetDimension(j)};
457+
auto extent{dim.Extent()};
458+
if (extent == 0) {
459+
break; // empty array
460+
} else if (extent == 1) { // ok
461+
} else if (stride) {
462+
// Extent > 1 on multiple dimensions
463+
if (IsContiguous()) {
464+
return ElementBytes();
465+
} else {
466+
return common::nullopt;
467+
}
468+
} else {
469+
stride = dim.ByteStride();
470+
}
471+
}
472+
return stride.value_or(0); // 0 for scalars and empty arrays
473+
}
474+
409475
// Establishes a pointer to a section or element.
410476
RT_API_ATTRS bool EstablishPointerSection(const Descriptor &source,
411477
const SubscriptValue *lower = nullptr,
@@ -427,6 +493,14 @@ class Descriptor {
427493
RT_API_ATTRS inline int GetAllocIdx() const {
428494
return (raw_.extra & _CFI_ALLOCATOR_IDX_MASK) >> _CFI_ALLOCATOR_IDX_SHIFT;
429495
}
496+
// Allocator index to use for this descriptor: the index stored in the
// descriptor, except that device compilation always selects the default
// allocator.
RT_API_ATTRS int MapAllocIdx() const {
#ifndef RT_DEVICE_COMPILATION
  return GetAllocIdx();
#else
  // Device code is forced onto the default allocator.
  return kDefaultAllocator;
#endif
}
430504
RT_API_ATTRS inline void SetAllocIdx(int pos) {
431505
raw_.extra &= ~_CFI_ALLOCATOR_IDX_MASK; // Clear the allocator index bits.
432506
raw_.extra |= pos << _CFI_ALLOCATOR_IDX_SHIFT;

flang-rt/include/flang-rt/runtime/type-info.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,9 @@ class Component {
6868
RT_API_ATTRS std::uint64_t offset() const { return offset_; }
6969
RT_API_ATTRS const Value &characterLen() const { return characterLen_; }
7070
RT_API_ATTRS const DerivedType *derivedType() const {
71-
return derivedType_.descriptor().OffsetElement<const DerivedType>();
71+
return category() == TypeCategory::Derived
72+
? derivedType_.descriptor().OffsetElement<const DerivedType>()
73+
: nullptr;
7274
}
7375
RT_API_ATTRS const Value *lenValue() const {
7476
return lenValue_.descriptor().OffsetElement<const Value>();

flang-rt/include/flang-rt/runtime/work-queue.h

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@
6262
#include "flang-rt/runtime/stat.h"
6363
#include "flang-rt/runtime/type-info.h"
6464
#include "flang/Common/api-attrs.h"
65+
#include "flang/Common/optional.h"
6566
#include "flang/Runtime/freestanding-tools.h"
6667
#include <flang/Common/variant.h>
6768

@@ -122,7 +123,7 @@ class Elementwise {
122123

123124
protected:
124125
const Descriptor &instance_, *from_{nullptr};
125-
std::size_t elements_{instance_.Elements()};
126+
std::size_t elements_{instance_.InlineElements()};
126127
std::size_t elementAt_{0};
127128
SubscriptValue subscripts_[common::maxRank];
128129
SubscriptValue fromSubscripts_[common::maxRank];
@@ -131,11 +132,19 @@ class Elementwise {
131132
// Base class for ticket workers that operate over derived type components.
132133
class Componentwise {
133134
public:
134-
RT_API_ATTRS Componentwise(const typeInfo::DerivedType &);
135+
RT_API_ATTRS Componentwise(const typeInfo::DerivedType &derived)
136+
: derived_{derived}, components_{derived_.component().InlineElements()} {
137+
GetFirstComponent();
138+
}
139+
135140
RT_API_ATTRS bool IsComplete() const { return componentAt_ >= components_; }
136141
RT_API_ATTRS void Advance() {
137142
++componentAt_;
138-
GetComponent();
143+
if (IsComplete()) {
144+
component_ = nullptr;
145+
} else {
146+
++component_;
147+
}
139148
}
140149
RT_API_ATTRS void SkipToEnd() {
141150
component_ = nullptr;
@@ -144,15 +153,21 @@ class Componentwise {
144153
RT_API_ATTRS void Reset() {
145154
component_ = nullptr;
146155
componentAt_ = 0;
147-
GetComponent();
156+
GetFirstComponent();
148157
}
149-
RT_API_ATTRS void GetComponent();
150158

151159
protected:
152160
const typeInfo::DerivedType &derived_;
153161
std::size_t components_{0}, componentAt_{0};
154162
const typeInfo::Component *component_{nullptr};
155163
StaticDescriptor<common::maxRank, true, 0> componentDescriptor_;
164+
165+
private:
166+
// Points component_ at the first entry of the derived type's component
// table; leaves component_ untouched when the type has no components.
RT_API_ATTRS void GetFirstComponent() {
  if (components_ == 0) {
    return;
  }
  component_ = derived_.component().OffsetElement<typeInfo::Component>();
}
156171
};
157172

158173
// Base class for ticket workers that operate over derived type components
@@ -228,14 +243,14 @@ class ElementsOverComponents : public Elementwise, public Componentwise {
228243

229244
// Ticket worker classes
230245

231-
// Implements derived type instance initialization
246+
// Implements derived type instance initialization.
232247
class InitializeTicket : public ImmediateTicketRunner<InitializeTicket>,
233-
private ComponentsOverElements {
248+
private ElementsOverComponents {
234249
public:
235250
RT_API_ATTRS InitializeTicket(
236251
const Descriptor &instance, const typeInfo::DerivedType &derived)
237252
: ImmediateTicketRunner<InitializeTicket>{*this},
238-
ComponentsOverElements{instance, derived} {}
253+
ElementsOverComponents{instance, derived} {}
239254
RT_API_ATTRS int Begin(WorkQueue &);
240255
RT_API_ATTRS int Continue(WorkQueue &);
241256
};
@@ -283,12 +298,14 @@ class DestroyTicket : public ImmediateTicketRunner<DestroyTicket>,
283298
RT_API_ATTRS DestroyTicket(const Descriptor &instance,
284299
const typeInfo::DerivedType &derived, bool finalize)
285300
: ImmediateTicketRunner<DestroyTicket>{*this},
286-
ComponentsOverElements{instance, derived}, finalize_{finalize} {}
301+
ComponentsOverElements{instance, derived}, finalize_{finalize},
302+
fixedStride_{instance.FixedStride()} {}
287303
RT_API_ATTRS int Begin(WorkQueue &);
288304
RT_API_ATTRS int Continue(WorkQueue &);
289305

290306
private:
291307
bool finalize_{false};
308+
std::optional<SubscriptValue> fixedStride_;
292309
};
293310

294311
// Implements general intrinsic assignment
@@ -302,11 +319,11 @@ class AssignTicket : public ImmediateTicketRunner<AssignTicket> {
302319
RT_API_ATTRS int Continue(WorkQueue &);
303320

304321
private:
322+
RT_API_ATTRS Descriptor &GetTempDescriptor();
305323
RT_API_ATTRS bool IsSimpleMemmove() const {
306324
return !toDerived_ && to_.rank() == from_->rank() && to_.IsContiguous() &&
307325
from_->IsContiguous() && to_.ElementBytes() == from_->ElementBytes();
308326
}
309-
RT_API_ATTRS Descriptor &GetTempDescriptor();
310327

311328
Descriptor &to_;
312329
const Descriptor *from_{nullptr};
@@ -549,6 +566,7 @@ class WorkQueue {
549566
TicketList *first_{nullptr}, *last_{nullptr}, *insertAfter_{nullptr};
550567
TicketList static_[numStatic_];
551568
TicketList *firstFree_{static_};
569+
bool anyDynamicAllocation_{false};
552570
};
553571

554572
} // namespace Fortran::runtime

flang-rt/lib/runtime/assign.cpp

Lines changed: 19 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -217,8 +217,8 @@ static RT_API_ATTRS void DoElementalDefinedAssignment(const Descriptor &to,
217217
toElementDesc.Establish(derived, nullptr, 0, nullptr, CFI_attribute_pointer);
218218
fromElementDesc.Establish(
219219
derived, nullptr, 0, nullptr, CFI_attribute_pointer);
220-
for (std::size_t toElements{to.Elements()}; toElements-- > 0;
221-
to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
220+
for (std::size_t toElements{to.InlineElements()}; toElements-- > 0;
221+
to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
222222
toElementDesc.set_base_addr(to.Element<char>(toAt));
223223
fromElementDesc.set_base_addr(from.Element<char>(fromAt));
224224
DoScalarDefinedAssignment(toElementDesc, fromElementDesc, derived, special);
@@ -431,11 +431,14 @@ RT_API_ATTRS int AssignTicket::Continue(WorkQueue &workQueue) {
431431
}
432432
}
433433
// Intrinsic assignment
434-
std::size_t toElements{to_.Elements()};
435-
if (from_->rank() > 0 && toElements != from_->Elements()) {
436-
workQueue.terminator().Crash("Assign: mismatching element counts in array "
437-
"assignment (to %zd, from %zd)",
438-
toElements, from_->Elements());
434+
std::size_t toElements{to_.InlineElements()};
435+
if (from_->rank() > 0) {
436+
std::size_t fromElements{from_->InlineElements()};
437+
if (toElements != fromElements) {
438+
workQueue.terminator().Crash("Assign: mismatching element counts in "
439+
"array assignment (to %zd, from %zd)",
440+
toElements, fromElements);
441+
}
439442
}
440443
if (to_.type() != from_->type()) {
441444
workQueue.terminator().Crash(
@@ -529,7 +532,7 @@ RT_API_ATTRS int DerivedAssignTicket<IS_COMPONENTWISE>::Begin(
529532
// allocatable components or defined ASSIGNMENT(=) at any level.
530533
memmoveFct_(this->instance_.template OffsetElement<char>(),
531534
this->from_->template OffsetElement<const char *>(),
532-
this->instance_.Elements() * elementBytes);
535+
this->instance_.InlineElements() * elementBytes);
533536
return StatOk;
534537
}
535538
}
@@ -544,7 +547,7 @@ RT_API_ATTRS int DerivedAssignTicket<IS_COMPONENTWISE>::Begin(
544547
// Copy procedure pointer components
545548
const Descriptor &procPtrDesc{this->derived_.procPtr()};
546549
bool noDataComponents{this->IsComplete()};
547-
if (std::size_t numProcPtrs{procPtrDesc.Elements()}) {
550+
if (std::size_t numProcPtrs{procPtrDesc.InlineElements()}) {
548551
for (std::size_t k{0}; k < numProcPtrs; ++k) {
549552
const auto &procPtr{
550553
*procPtrDesc.ZeroBasedIndexedElement<typeInfo::ProcPtrComponent>(k)};
@@ -614,7 +617,7 @@ RT_API_ATTRS int DerivedAssignTicket<IS_COMPONENTWISE>::Continue(
614617
memmoveFct_(to, from, componentByteSize);
615618
}
616619
}
617-
this->Componentwise::Advance();
620+
this->SkipToNextComponent();
618621
} else {
619622
memmoveFct_(
620623
this->instance_.template Element<char>(this->subscripts_) +
@@ -646,7 +649,7 @@ RT_API_ATTRS int DerivedAssignTicket<IS_COMPONENTWISE>::Continue(
646649
memmoveFct_(to, from, componentByteSize);
647650
}
648651
}
649-
this->Componentwise::Advance();
652+
this->SkipToNextComponent();
650653
} else {
651654
memmoveFct_(this->instance_.template Element<char>(this->subscripts_) +
652655
this->component_->offset(),
@@ -668,11 +671,11 @@ RT_API_ATTRS int DerivedAssignTicket<IS_COMPONENTWISE>::Continue(
668671
if (toDesc->IsAllocatable() && !fromDesc->IsAllocated()) {
669672
if (toDesc->IsAllocated()) {
670673
if (this->phase_ == 0) {
671-
this->phase_++;
672674
if (componentDerived && !componentDerived->noDestructionNeeded()) {
673675
if (int status{workQueue.BeginDestroy(
674676
*toDesc, *componentDerived, /*finalize=*/false)};
675677
status != StatOk) {
678+
this->phase_++;
676679
return status;
677680
}
678681
}
@@ -725,15 +728,15 @@ RT_API_ATTRS void DoFromSourceAssign(Descriptor &alloc,
725728
SubscriptValue allocAt[maxRank];
726729
alloc.GetLowerBounds(allocAt);
727730
if (allocDerived) {
728-
for (std::size_t n{alloc.Elements()}; n-- > 0;
729-
alloc.IncrementSubscripts(allocAt)) {
731+
for (std::size_t n{alloc.InlineElements()}; n-- > 0;
732+
alloc.IncrementSubscripts(allocAt)) {
730733
Descriptor allocElement{*Descriptor::Create(*allocDerived,
731734
reinterpret_cast<void *>(alloc.Element<char>(allocAt)), 0)};
732735
Assign(allocElement, source, terminator, NoAssignFlags, memmoveFct);
733736
}
734737
} else { // intrinsic type
735-
for (std::size_t n{alloc.Elements()}; n-- > 0;
736-
alloc.IncrementSubscripts(allocAt)) {
738+
for (std::size_t n{alloc.InlineElements()}; n-- > 0;
739+
alloc.IncrementSubscripts(allocAt)) {
737740
memmoveFct(alloc.Element<char>(allocAt), source.raw().base_addr,
738741
alloc.ElementBytes());
739742
}

0 commit comments

Comments
 (0)