
Commit 9f53d4a (parent d6227dc)

[Runtime] Have ConcurrentReadableHashMap store indices inline when the table is sufficiently small.

1 file changed: include/swift/Runtime/Concurrent.h (+141 −70)
@@ -620,25 +620,76 @@ template <class ElemTy> struct ConcurrentReadableHashMap {
   /// is stored inline. We work around this contradiction by considering the
   /// first index to always be occupied with a value that never matches any key.
   struct IndexStorage {
+    using RawType = uintptr_t;
+
+    RawType Value;
+
+    static constexpr uintptr_t log2(uintptr_t x) {
+      return x <= 1 ? 0 : log2(x >> 1) + 1;
+    }
+
+    static constexpr uintptr_t InlineIndexBits = 4;
+    static constexpr uintptr_t InlineIndexMask = 0xF;
+    static constexpr uintptr_t InlineCapacity =
+        sizeof(RawType) * CHAR_BIT / InlineIndexBits;
+    static constexpr uintptr_t InlineCapacityLog2 = log2(InlineCapacity);
+
+    // Indices can be stored in different ways, depending on how big they need
+    // to be. The index mode is stored in the bottom two bits of Value. The
+    // meaning of the rest of Value depends on the mode.
+    enum class IndexMode {
+      // Value is treated as an array of four-bit integers, storing the indices.
+      // The first element overlaps with the mode, and is never used.
+      Inline,
+
+      // The rest of Value holds a pointer to storage. The first byte of this
+      // storage holds the log2 of the storage capacity. The storage is treated
+      // as an array of 8, 16, or 32-bit integers. The first element overlaps
+      // with the capacity, and is never used.
+      Array8,
+      Array16,
+      Array32,
+    };
+
+    IndexStorage() : Value(0) {}
+    IndexStorage(RawType value) : Value(value) {}
+    IndexStorage(void *ptr, unsigned indexSize, uint8_t capacityLog2) {
+      assert(capacityLog2 > InlineCapacityLog2);
+      IndexMode mode;
+      switch (indexSize) {
+      case sizeof(uint8_t):
+        mode = IndexMode::Array8;
+        break;
+      case sizeof(uint16_t):
+        mode = IndexMode::Array16;
+        break;
+      case sizeof(uint32_t):
+        mode = IndexMode::Array32;
+        break;
+      default:
+        swift_unreachable("unknown index size");
+      }
+      Value = reinterpret_cast<uintptr_t>(ptr) | static_cast<uintptr_t>(mode);
+      *reinterpret_cast<uint8_t *>(ptr) = capacityLog2;
+    }
+
+    bool valueIsPointer() { return Value & 3; }
+
+    void *pointer() {
+      if (valueIsPointer())
+        return (void *)(Value & (RawType)~3);
+      return nullptr;
+    }
+
+    IndexMode indexMode() { return IndexMode(Value & 3); }
+
     // Index size is variable based on capacity, either 8, 16, or 32 bits.
     //
     // This is somewhat conservative. We could have, for example, a capacity of
     // 512 but a maximum index of only 200, which would still allow for 8-bit
     // indices. However, taking advantage of this would require reallocating
     // the index storage when the element count crossed a threshold, which is
     // more complex, and the advantages are minimal. This keeps it simple.
-    //
-    // The first byte of the storage is the log 2 of the capacity. The remaining
-    // storage is then an array of 8, 16, or 32 bit integers, depending on the
-    // capacity number. This union allows us to access the capacity, and then
-    // access the rest of the storage by taking the address of one of the
-    // IndexZero members and indexing into it (always avoiding index 0).
-    union {
-      uint8_t CapacityLog2;
-      std::atomic<uint8_t> IndexZero8;
-      std::atomic<uint16_t> IndexZero16;
-      std::atomic<uint32_t> IndexZero32;
-    };
 
     // Get the size, in bytes, of the index needed for the given capacity.
     static unsigned indexSize(uint8_t capacityLog2) {
@@ -649,46 +700,66 @@ template <class ElemTy> struct ConcurrentReadableHashMap {
       return sizeof(uint32_t);
     }
 
-    unsigned indexSize() { return indexSize(CapacityLog2); }
+    uint8_t getCapacityLog2() {
+      if (auto *ptr = pointer())
+        return *reinterpret_cast<uint8_t *>(ptr);
+      return InlineCapacityLog2;
+    }
 
-    static IndexStorage *allocate(size_t capacityLog2) {
+    static IndexStorage allocate(size_t capacityLog2) {
       assert(capacityLog2 > 0);
       size_t capacity = 1UL << capacityLog2;
-      auto *ptr = reinterpret_cast<IndexStorage *>(
-          calloc(capacity, indexSize(capacityLog2)));
+      unsigned size = indexSize(capacityLog2);
+      auto *ptr = calloc(capacity, size);
       if (!ptr)
         swift::crash("Could not allocate memory.");
-      ptr->CapacityLog2 = capacityLog2;
-      return ptr;
+      return IndexStorage(ptr, size, capacityLog2);
     }
 
     unsigned loadIndexAt(size_t i, std::memory_order order) {
       assert(i > 0 && "index zero is off-limits, used to store capacity");
-
-      switch (indexSize()) {
-      case sizeof(uint8_t):
-        return (&IndexZero8)[i].load(order);
-      case sizeof(uint16_t):
-        return (&IndexZero16)[i].load(order);
-      case sizeof(uint32_t):
-        return (&IndexZero32)[i].load(order);
-      default:
-        swift_unreachable("unknown index size");
+      assert(i < (1 << getCapacityLog2()) &&
+             "index is off the end of the indices");
+
+      switch (indexMode()) {
+      case IndexMode::Inline:
+        return (Value >> (i * InlineIndexBits)) & InlineIndexMask;
+      case IndexMode::Array8:
+        return ((std::atomic<uint8_t> *)pointer())[i].load(order);
+      case IndexMode::Array16:
+        return ((std::atomic<uint16_t> *)pointer())[i].load(order);
+      case IndexMode::Array32:
+        return ((std::atomic<uint32_t> *)pointer())[i].load(order);
       }
     }
 
-    void storeIndexAt(unsigned value, size_t i, std::memory_order order) {
+    void storeIndexAt(std::atomic<RawType> *inlineStorage, unsigned value,
+                      size_t i, std::memory_order order) {
       assert(i > 0 && "index zero is off-limits, used to store capacity");
-
-      switch (indexSize()) {
-      case sizeof(uint8_t):
-        return (&IndexZero8)[i].store(value, order);
-      case sizeof(uint16_t):
-        return (&IndexZero16)[i].store(value, order);
-      case sizeof(uint32_t):
-        return (&IndexZero32)[i].store(value, order);
-      default:
-        swift_unreachable("unknown index size");
+      assert(i < (1 << getCapacityLog2()) &&
+             "index is off the end of the indices");
+
+      switch (indexMode()) {
+      case IndexMode::Inline: {
+        assert(value == (value & InlineIndexMask) && "value is too big to fit");
+        auto shift = i * InlineIndexBits;
+        assert((Value & (InlineIndexMask << shift)) == 0 &&
+               "can't overwrite an existing index");
+        assert(Value == inlineStorage->load(std::memory_order_relaxed) &&
+               "writing with a stale IndexStorage");
+        auto newStorage = Value | ((RawType)value << shift);
+        inlineStorage->store(newStorage, order);
+        break;
+      }
+      case IndexMode::Array8:
+        ((std::atomic<uint8_t> *)pointer())[i].store(value, order);
+        break;
+      case IndexMode::Array16:
+        ((std::atomic<uint16_t> *)pointer())[i].store(value, order);
+        break;
+      case IndexMode::Array32:
+        ((std::atomic<uint32_t> *)pointer())[i].store(value, order);
+        break;
      }
    }
  };
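
To make the new inline scheme concrete, here is a small standalone sketch (not part of the commit) of how four-bit indices pack into a single uintptr_t, with the two low mode bits left at zero for IndexMode::Inline. The constants mirror IndexStorage; the demo and its output are invented for illustration and assume a 64-bit target, where the word holds 16 slots.

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    int main() {
      using RawType = uintptr_t;
      constexpr RawType InlineIndexBits = 4;
      constexpr RawType InlineIndexMask = 0xF;
      constexpr RawType InlineCapacity = sizeof(RawType) * 8 / InlineIndexBits;

      // An all-zero word is already a valid inline table: the low two bits are
      // 0 (IndexMode::Inline) and every slot reads as "no entry".
      RawType value = 0;

      // Store index 5 in slot 3 and index 9 in slot 7, as storeIndexAt's
      // Inline case does: shift into place and OR into the word. Slot 0
      // overlaps the mode bits and is never used.
      value |= (RawType)5 << (3 * InlineIndexBits);
      value |= (RawType)9 << (7 * InlineIndexBits);

      // Read slots back with shift-and-mask, as loadIndexAt's Inline case does.
      for (size_t i = 1; i < InlineCapacity; i++) {
        unsigned index = (value >> (i * InlineIndexBits)) & InlineIndexMask;
        if (index != 0)
          printf("slot %zu -> index %u\n", i, index); // slots 3 and 7 print
      }
      return 0;
    }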
@@ -726,7 +797,11 @@ template <class ElemTy> struct ConcurrentReadableHashMap {
   std::atomic<ElemTy *> Elements{nullptr};
 
   /// The array of indices.
-  std::atomic<IndexStorage *> Indices{nullptr};
+  ///
+  /// This has to be stored as a IndexStorage::RawType instead of a IndexStorage
+  /// because some of our targets don't support interesting structs as atomic
+  /// types. See also MetadataCache::TrackingInfo which uses the same technique.
+  std::atomic<typename IndexStorage::RawType> Indices{0};
 
   /// The writer lock, which must be taken before any mutation of the table.
   StaticMutex WriterLock;
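
The doc comment above is the heart of the workaround: the atomic holds only the raw word, and an IndexStorage is rebuilt around that word at each load. A minimal sketch of the pattern, assuming nothing beyond standard C++ (TaggedWord and roundTrip are invented names):

    #include <atomic>
    #include <cstdint>

    struct TaggedWord { // stand-in for a word-sized wrapper like IndexStorage
      uintptr_t Value;
    };

    // A word-sized integer atomic is lock-free on mainstream targets, while
    // std::atomic<TaggedWord> may be routed through library helpers (e.g.
    // libatomic) on targets without native struct-atomic support.
    std::atomic<uintptr_t> Shared{0};

    void roundTrip() {
      TaggedWord w{Shared.load(std::memory_order_acquire)}; // rewrap on load
      Shared.store(w.Value, std::memory_order_release);     // unwrap on store
    }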
@@ -778,18 +853,17 @@ template <class ElemTy> struct ConcurrentReadableHashMap {
   /// returning the new array with all existing indices copied into it. This
   /// operation performs a rehash, so that the indices are in the correct
   /// location in the new array.
-  IndexStorage *resize(IndexStorage *indices, uint8_t indicesCapacityLog2,
-                       ElemTy *elements) {
-    // Double the size. Start with 16 (fits into 16-byte malloc
-    // bucket), which is 2^4.
-    size_t newCapacityLog2 = indices ? indicesCapacityLog2 + 1 : 4;
+  IndexStorage resize(IndexStorage indices, uint8_t indicesCapacityLog2,
+                      ElemTy *elements) {
+    // Double the size.
+    size_t newCapacityLog2 = indicesCapacityLog2 + 1;
     size_t newMask = (1UL << newCapacityLog2) - 1;
 
-    IndexStorage *newIndices = IndexStorage::allocate(newCapacityLog2);
+    IndexStorage newIndices = IndexStorage::allocate(newCapacityLog2);
 
     size_t indicesCount = 1UL << indicesCapacityLog2;
     for (size_t i = 1; i < indicesCount; i++) {
-      unsigned index = indices->loadIndexAt(i, std::memory_order_relaxed);
+      unsigned index = indices.loadIndexAt(i, std::memory_order_relaxed);
       if (index == 0)
         continue;
 
@@ -799,15 +873,16 @@ template <class ElemTy> struct ConcurrentReadableHashMap {
       size_t newI = hash & newMask;
       // Index 0 is unusable (occupied by the capacity), so always skip it.
       while (newI == 0 ||
-             newIndices->loadIndexAt(newI, std::memory_order_relaxed) != 0) {
+             newIndices.loadIndexAt(newI, std::memory_order_relaxed) != 0) {
        newI = (newI + 1) & newMask;
      }
-      newIndices->storeIndexAt(index, newI, std::memory_order_relaxed);
+      newIndices.storeIndexAt(nullptr, index, newI, std::memory_order_relaxed);
    }
 
-    Indices.store(newIndices, std::memory_order_release);
+    Indices.store(newIndices.Value, std::memory_order_release);
 
-    FreeListNode::add(&FreeList, indices);
+    if (auto *ptr = indices.pointer())
+      FreeListNode::add(&FreeList, ptr);
 
    return newIndices;
  }
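
Two details of this hunk are easy to miss. The old bootstrap special case (start at capacity 2^4) is gone because the inline word itself now serves as the initial table, so resize can always just double. And resize passes nullptr for storeIndexAt's inlineStorage parameter: every freshly allocated table is out-of-line (the IndexStorage pointer constructor asserts capacityLog2 > InlineCapacityLog2), so the IndexMode::Inline branch, the only consumer of inlineStorage, is unreachable during a rehash. The relaxed stores are likewise fine because the new array stays private until the release store to Indices publishes it. A sketch of the capacity invariant, assuming a 64-bit uintptr_t so InlineCapacityLog2 == 4:

    #include <cassert>
    #include <cstdint>

    int main() {
      const uintptr_t InlineCapacityLog2 = 4; // log2(64 bits / 4 bits per slot)
      // resize always doubles, and the smallest possible table is the inline
      // word, so every newly allocated table is strictly out-of-line.
      for (uintptr_t oldLog2 = InlineCapacityLog2; oldLog2 < 20; oldLog2++) {
        uintptr_t newCapacityLog2 = oldLog2 + 1;
        assert(newCapacityLog2 > InlineCapacityLog2); // constructor's invariant
      }
      return 0;
    }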
@@ -818,20 +893,18 @@ template <class ElemTy> struct ConcurrentReadableHashMap {
   /// of the new element would be stored.
   template <class KeyTy>
   static std::pair<ElemTy *, unsigned>
-  find(const KeyTy &key, IndexStorage *indices, size_t elementCount,
+  find(const KeyTy &key, IndexStorage indices, size_t elementCount,
        ElemTy *elements) {
-    if (!indices)
-      return {nullptr, 0};
     auto hash = hash_value(key);
-    auto indicesMask = (1UL << indices->CapacityLog2) - 1;
+    auto indicesMask = (1UL << indices.getCapacityLog2()) - 1;
 
     auto i = hash & indicesMask;
     while (true) {
       // Index 0 is used for the mask and is not actually an index.
       if (i == 0)
         i++;
 
-      auto index = indices->loadIndexAt(i, std::memory_order_acquire);
+      auto index = indices.loadIndexAt(i, std::memory_order_acquire);
       // Element indices are 1-based, 0 means no entry.
       if (index == 0)
         return {nullptr, i};
@@ -864,12 +937,12 @@ template <class ElemTy> struct ConcurrentReadableHashMap {
   /// Readers take a snapshot of the hash map, then work with the snapshot.
   class Snapshot {
     ConcurrentReadableHashMap *Map;
-    IndexStorage *Indices;
+    IndexStorage Indices;
     ElemTy *Elements;
     size_t ElementCount;
 
   public:
-    Snapshot(ConcurrentReadableHashMap *map, IndexStorage *indices,
+    Snapshot(ConcurrentReadableHashMap *map, IndexStorage indices,
              ElemTy *elements, size_t elementCount)
         : Map(map), Indices(indices), Elements(elements),
           ElementCount(elementCount) {}
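
With Indices held by value, capturing the index table for a snapshot is a single word-sized copy. When the table is inline, the snapshot owns a complete, immutable copy that later writer stores cannot disturb; only out-of-line arrays still need free-list protection until all readers drain. A minimal sketch of the read side (SnapshotDemo and takeSnapshot are invented names):

    #include <atomic>
    #include <cstdint>

    std::atomic<uintptr_t> Indices{0}; // as in ConcurrentReadableHashMap

    struct SnapshotDemo {
      uintptr_t IndicesWord; // private copy, unaffected by later writer stores
    };

    SnapshotDemo takeSnapshot() {
      // One acquire load captures either the whole inline table or the tagged
      // pointer to the out-of-line table.
      return SnapshotDemo{Indices.load(std::memory_order_acquire)};
    }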
@@ -885,7 +958,7 @@ template <class ElemTy> struct ConcurrentReadableHashMap {
     /// Search for an element matching the given key. Returns a pointer to the
     /// found element, or nullptr if no matching element exists.
     template <class KeyTy> const ElemTy *find(const KeyTy &key) {
-      if (!Indices || !ElementCount || !Elements)
+      if (!Indices.Value || !ElementCount || !Elements)
         return nullptr;
       return ConcurrentReadableHashMap::find(key, Indices, ElementCount,
                                              Elements)
@@ -917,7 +990,7 @@ template <class ElemTy> struct ConcurrentReadableHashMap {
     // pointer can just mean a concurrent insert that triggered a resize of the
     // elements array. This is harmless aside from a small performance hit, and
     // should not happen often.
-    IndexStorage *indices;
+    IndexStorage indices;
     size_t elementCount;
     ElemTy *elements;
     ElemTy *elements2;
@@ -951,11 +1024,8 @@ template <class ElemTy> struct ConcurrentReadableHashMap {
   void getOrInsert(KeyTy key, const Call &call) {
     StaticScopedLock guard(WriterLock);
 
-    auto *indices = Indices.load(std::memory_order_relaxed);
-    if (!indices)
-      indices = resize(indices, 0, nullptr);
-
-    auto indicesCapacityLog2 = indices->CapacityLog2;
+    auto indices = IndexStorage{Indices.load(std::memory_order_relaxed)};
+    auto indicesCapacityLog2 = indices.getCapacityLog2();
     auto elementCount = ElementCount.load(std::memory_order_relaxed);
     auto *elements = Elements.load(std::memory_order_relaxed);

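This is also why the old bootstrap branch ("if (!indices) indices = resize(indices, 0, nullptr);") could be deleted: a zero raw word already decodes as an empty inline-mode table, so the very first insert proceeds straight to the probe loop with the word's 16 inline slots (on a 64-bit target). A sketch of that decoding, using the constants defined earlier; illustrative only:

    #include <cassert>
    #include <cstdint>

    int main() {
      uintptr_t value = 0;              // a freshly zero-initialized Indices
      assert((value & 3) == 0);         // low bits 0 decode as IndexMode::Inline
      // Every usable slot reads back 0, meaning "no entry".
      for (unsigned i = 1; i < 16; i++) // 16 == InlineCapacity on 64 bits
        assert(((value >> (i * 4)) & 0xF) == 0);
      return 0;
    }
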
@@ -990,8 +1060,8 @@ template <class ElemTy> struct ConcurrentReadableHashMap {
       assert(hash_value(key) == hash_value(*element) &&
              "Element must have the same hash code as its key.");
       ElementCount.store(elementCount + 1, std::memory_order_release);
-      indices->storeIndexAt(elementCount + 1, found.second,
-                            std::memory_order_release);
+      indices.storeIndexAt(&Indices, elementCount + 1, found.second,
+                           std::memory_order_release);
     }
 
     deallocateFreeListIfSafe();
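
Note the new first argument: in the inline case, storing one four-bit slot rewrites the entire shared word, so storeIndexAt needs the address of the atomic itself (&Indices), and it asserts that its copy of Value is not stale. Because WriterLock serializes all mutators, a plain load-modify-store is enough; no compare-and-swap is required. A sketch of that publish step (publishIndex is an invented name):

    #include <atomic>
    #include <cstdint>

    std::atomic<uintptr_t> SharedIndices{0};

    // Called with the writer lock held, so there is exactly one mutator.
    void publishIndex(unsigned value, unsigned slot) {
      uintptr_t old = SharedIndices.load(std::memory_order_relaxed);
      uintptr_t updated = old | ((uintptr_t)value << (slot * 4));
      // The release store publishes the new element; readers pair it with the
      // acquire load used to take a snapshot.
      SharedIndices.store(updated, std::memory_order_release);
    }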
@@ -1002,17 +1072,18 @@ template <class ElemTy> struct ConcurrentReadableHashMap {
   void clear() {
     StaticScopedLock guard(WriterLock);
 
-    auto *indices = Indices.load(std::memory_order_relaxed);
+    IndexStorage indices = Indices.load(std::memory_order_relaxed);
     auto *elements = Elements.load(std::memory_order_relaxed);
 
     // Order doesn't matter here, snapshots will gracefully handle any field
     // being NULL/0 while the others are not.
-    Indices.store(nullptr, std::memory_order_relaxed);
+    Indices.store(0, std::memory_order_relaxed);
     ElementCount.store(0, std::memory_order_relaxed);
     Elements.store(nullptr, std::memory_order_relaxed);
     ElementCapacity = 0;
 
-    FreeListNode::add(&FreeList, indices);
+    if (auto *ptr = indices.pointer())
+      FreeListNode::add(&FreeList, ptr);
     FreeListNode::add(&FreeList, elements);
 
     deallocateFreeListIfSafe();
