Skip to content

Commit f3909ad

Browse files
author
git apple-llvm automerger
committed
Merge commit 'ca006898b3c2' from llvm.org/main into next
2 parents a4c6692 + ca00689 commit f3909ad

File tree

5 files changed

+211
-2
lines changed

5 files changed

+211
-2
lines changed

libc/src/__support/GPU/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,12 @@ add_header_library(
99
utils.h
1010
)
1111

12+
add_header_library(
13+
fixedstack
14+
HDRS
15+
fixedstack.h
16+
)
17+
1218
add_object_library(
1319
allocator
1420
SRCS
@@ -23,4 +29,5 @@ add_object_library(
2329
libc.src.__support.CPP.bit
2430
libc.src.__support.CPP.new
2531
.utils
32+
.fixedstack
2633
)

libc/src/__support/GPU/allocator.cpp

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include "src/__support/CPP/atomic.h"
2121
#include "src/__support/CPP/bit.h"
2222
#include "src/__support/CPP/new.h"
23+
#include "src/__support/GPU/fixedstack.h"
2324
#include "src/__support/GPU/utils.h"
2425
#include "src/__support/RPC/rpc_client.h"
2526
#include "src/__support/threads/sleep.h"
@@ -39,6 +40,9 @@ constexpr static uint32_t MIN_ALIGNMENT = MIN_SIZE - 1;
3940
// The number of times to attempt claiming an in-progress slab allocation.
4041
constexpr static uint32_t MAX_TRIES = 1024;
4142

43+
// The number of previously allocated slabs we will keep in memory.
44+
constexpr static uint32_t CACHED_SLABS = 8;
45+
4246
static_assert(!(ARRAY_SIZE & (ARRAY_SIZE - 1)), "Must be a power of two");
4347

4448
namespace impl {
@@ -185,20 +189,35 @@ struct Slab {
185189
struct alignas(MIN_SIZE) Header {
186190
uint32_t chunk_size;
187191
uint32_t global_index;
192+
uint32_t cached_chunk_size;
188193
};
189194

190195
// Initialize the slab with its chunk size and index in the global table for
191196
// use when freeing.
192197
Slab(uint32_t chunk_size, uint32_t global_index) {
193198
Header *header = reinterpret_cast<Header *>(memory);
199+
header->cached_chunk_size = cpp::numeric_limits<uint32_t>::max();
194200
header->chunk_size = chunk_size;
195201
header->global_index = global_index;
196202
}
197203

204+
// Reset the memory with a new index and chunk size, not thread safe.
205+
Slab *reset(uint32_t chunk_size, uint32_t global_index) {
206+
Header *header = reinterpret_cast<Header *>(memory);
207+
header->cached_chunk_size = header->chunk_size;
208+
header->chunk_size = chunk_size;
209+
header->global_index = global_index;
210+
return this;
211+
}
212+
198213
// Set the necessary bitfield bytes to zero in parallel using many lanes. This
199214
// must be called before the bitfield can be accessed safely, memory is not
200215
// guaranteed to be zero initialized in the current implementation.
201216
void initialize(uint64_t uniform) {
217+
// If this is a re-used slab the memory is already set to zero.
218+
if (get_cached_chunk_size() <= get_chunk_size())
219+
return;
220+
202221
uint32_t size = (bitfield_bytes(get_chunk_size()) + sizeof(uint32_t) - 1) /
203222
sizeof(uint32_t);
204223
impl::uniform_memset(get_bitfield(), 0, size, uniform);
@@ -236,6 +255,11 @@ struct Slab {
236255
return reinterpret_cast<const Header *>(memory)->chunk_size;
237256
}
238257

258+
// Get the chunk size that was previously used.
259+
uint32_t get_cached_chunk_size() const {
260+
return reinterpret_cast<const Header *>(memory)->cached_chunk_size;
261+
}
262+
239263
// Get the location in the memory where we will store the global index.
240264
uint32_t get_global_index() const {
241265
return reinterpret_cast<const Header *>(memory)->global_index;
@@ -337,6 +361,9 @@ struct Slab {
337361
uint8_t memory[SLAB_SIZE];
338362
};
339363

364+
// A global cache of previously allocated slabs for efficient reuse.
365+
static FixedStack<Slab *, CACHED_SLABS> slab_cache;
366+
340367
/// A wait-free guard around a pointer resource to be created dynamically if
341368
/// space is available and freed once there are no more users.
342369
struct GuardPtr {
@@ -408,6 +435,11 @@ struct GuardPtr {
408435
reinterpret_cast<Slab *>(cpp::numeric_limits<uintptr_t>::max()),
409436
cpp::MemoryOrder::RELAXED, cpp::MemoryOrder::RELAXED)) {
410437
count = cpp::numeric_limits<uint32_t>::max();
438+
439+
Slab *cached = nullptr;
440+
if (slab_cache.pop(cached))
441+
return cached->reset(cpp::forward<Args>(args)...);
442+
411443
void *raw = impl::rpc_allocate(sizeof(Slab));
412444
if (!raw)
413445
return nullptr;
@@ -475,8 +507,10 @@ struct GuardPtr {
475507
if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(mask)) &&
476508
ref.release(cpp::popcount(mask))) {
477509
Slab *p = ptr.load(cpp::MemoryOrder::RELAXED);
478-
p->~Slab();
479-
impl::rpc_free(p);
510+
if (!slab_cache.push(p)) {
511+
p->~Slab();
512+
impl::rpc_free(p);
513+
}
480514
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
481515
ptr.store(nullptr, cpp::MemoryOrder::RELAXED);
482516
}

libc/src/__support/GPU/fixedstack.h

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
//===-- A lock-free data structure for a fixed capacity stack ---*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef LLVM_LIBC_SRC___SUPPORT_GPU_FIXEDSTACK_H
10+
#define LLVM_LIBC_SRC___SUPPORT_GPU_FIXEDSTACK_H
11+
12+
#include "src/__support/CPP/atomic.h"
13+
#include "src/__support/threads/sleep.h"
14+
15+
#include <stdint.h>
16+
17+
namespace LIBC_NAMESPACE_DECL {
18+
19+
// A lock-free fixed-size stack backed by an underlying array of data. It
// supports push and pop operations in a completely lock-free manner. The
// structure is a pair of Treiber stacks over one index array: `free` chains
// the unused slots and `used` chains the occupied ones, so push/pop never
// allocate.
template <typename T, uint32_t CAPACITY> struct alignas(16) FixedStack {
  // Indices are packed into the low 20 bits of each head word, and CAPACITY
  // itself is used as the empty-stack sentinel, so the capacity must be
  // representable in 20 bits.
  static_assert(CAPACITY < 1024 * 1024, "Invalid buffer size");

  // The heads of the free and used stacks. Each is represented as a 20-bit
  // node index combined with a 44-bit ABA tag; both are updated in a single
  // 64-bit atomic operation so a stale head cannot be reinstalled unnoticed.
  uint64_t free;
  uint64_t used;

  // The stack is a linked list of indices into the underlying data:
  // next[i] is the node that follows node i on whichever list i is on.
  uint32_t next[CAPACITY];
  T data[CAPACITY];

  // Get the 20-bit index into the underlying array from the head.
  LIBC_INLINE static constexpr uint32_t get_node(uint64_t head) {
    return static_cast<uint32_t>(head & 0xfffff);
  }

  // Increment the old ABA tag (upper 44 bits) and merge it with the new
  // 20-bit node index to form the next head value.
  LIBC_INLINE static constexpr uint64_t make_head(uint64_t orig,
                                                  uint32_t node) {
    return static_cast<uint64_t>(node) | (((orig >> 20ul) + 1ul) << 20ul);
  }

  // Attempts to pop a node from the given stack by making the head point to
  // the popped node's successor. We repeatedly attempt to swing the head with
  // compare-and-swap, retrying whenever another thread changed it first.
  // Returns the popped node index, or CAPACITY if the stack was empty.
  LIBC_INLINE uint32_t pop_impl(cpp::AtomicRef<uint64_t> head) {
    uint64_t orig = head.load(cpp::MemoryOrder::RELAXED);

    for (;;) {
      // CAPACITY is the sentinel terminator: the list is empty.
      if (get_node(orig) == CAPACITY)
        return CAPACITY;

      // NOTE(review): this reads next[] through an AtomicRef while push_impl
      // writes next[node] with a plain store — presumably intentional for
      // this platform's memory model, but worth confirming.
      uint32_t node =
          cpp::AtomicRef(next[get_node(orig)]).load(cpp::MemoryOrder::RELAXED);
      // ACQUIRE on success so the popped node's data is visible; on failure
      // `orig` is refreshed with the current head and we retry.
      if (head.compare_exchange_strong(orig, make_head(orig, node),
                                       cpp::MemoryOrder::ACQUIRE,
                                       cpp::MemoryOrder::RELAXED))
        break;
    }
    return get_node(orig);
  }

  // Attempts to push a node onto the given stack by making the head point to
  // the new node. We repeatedly attempt to write the head using
  // compare-and-swap, expecting that it has not been changed by any other
  // thread.
  LIBC_INLINE uint32_t push_impl(cpp::AtomicRef<uint64_t> head, uint32_t node) {
    uint64_t orig = head.load(cpp::MemoryOrder::RELAXED);
    for (;;) {
      // Link the new node in front of the current head before publishing it.
      next[node] = get_node(orig);
      // RELEASE on success so the data[node] write in push() is visible to
      // the thread that later pops this node.
      if (head.compare_exchange_strong(orig, make_head(orig, node),
                                       cpp::MemoryOrder::RELEASE,
                                       cpp::MemoryOrder::RELAXED))
        break;
    }
    // NOTE(review): this re-reads the head after the CAS, so another thread
    // may already have changed it — the value returned here is racy. Both
    // callers below ignore the return value; confirm whether it is needed.
    return get_node(head.load(cpp::MemoryOrder::RELAXED));
  }

public:
  // Initialize the free stack to be full and the used stack to be empty. We
  // use the capacity of the stack as the sentinel value, so next[CAPACITY-1]
  // = CAPACITY terminates the initial free list.
  LIBC_INLINE constexpr FixedStack() : free(0), used(CAPACITY), data{} {
    for (uint32_t i = 0; i < CAPACITY; ++i)
      next[i] = i + 1;
  }

  // Pushes a copy of `val`; returns false if the stack is full.
  LIBC_INLINE bool push(const T &val) {
    // Claim a free slot, fill it, then publish it on the used list.
    uint32_t node = pop_impl(cpp::AtomicRef(free));
    if (node == CAPACITY)
      return false;

    data[node] = val;
    push_impl(cpp::AtomicRef(used), node);
    return true;
  }

  // Pops the most recently pushed value into `val`; returns false if empty.
  LIBC_INLINE bool pop(T &val) {
    // Claim a used slot, read it out, then recycle it onto the free list.
    uint32_t node = pop_impl(cpp::AtomicRef(used));
    if (node == CAPACITY)
      return false;

    val = data[node];
    push_impl(cpp::AtomicRef(free), node);
    return true;
  }
};
108+
109+
} // namespace LIBC_NAMESPACE_DECL
110+
111+
#endif // LLVM_LIBC_SRC___SUPPORT_GPU_FIXEDSTACK_H

libc/test/integration/src/__support/GPU/CMakeLists.txt

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,16 @@ add_integration_test(
2727
LOADER_ARGS
2828
--threads 64
2929
)
30+
31+
add_libc_test(
32+
fixedstack_test
33+
SUITE
34+
libc-support-gpu-tests
35+
SRCS
36+
fixedstack_test.cpp
37+
DEPENDS
38+
libc.src.__support.GPU.fixedstack
39+
LOADER_ARGS
40+
--threads 32
41+
--blocks 16
42+
)
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
//===-- Integration test for the lock-free stack --------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include "src/__support/GPU/fixedstack.h"
10+
#include "src/__support/GPU/utils.h"
11+
#include "test/IntegrationTest/test.h"
12+
13+
using namespace LIBC_NAMESPACE;
14+
15+
static FixedStack<uint32_t, 2048> global_stack;
16+
17+
void run() {
18+
// We need enough space in the stack as threads in flight can temporarily
19+
// consume memory before they finish comitting it back to the stack.
20+
ASSERT_EQ(gpu::get_num_blocks() * gpu::get_num_threads(), 512);
21+
22+
uint32_t val;
23+
uint32_t num_threads = static_cast<uint32_t>(gpu::get_num_threads());
24+
for (int i = 0; i < 256; ++i) {
25+
EXPECT_TRUE(global_stack.push(UINT32_MAX))
26+
EXPECT_TRUE(global_stack.pop(val))
27+
ASSERT_TRUE(val < num_threads || val == UINT32_MAX);
28+
}
29+
30+
EXPECT_TRUE(global_stack.push(static_cast<uint32_t>(gpu::get_thread_id())));
31+
EXPECT_TRUE(global_stack.push(static_cast<uint32_t>(gpu::get_thread_id())));
32+
EXPECT_TRUE(global_stack.pop(val));
33+
ASSERT_TRUE(val < num_threads || val == UINT32_MAX);
34+
35+
// Fill the rest of the stack with the default value.
36+
while (!global_stack.push(UINT32_MAX))
37+
;
38+
}
39+
40+
// Integration-test entry point: every launched GPU thread executes run().
TEST_MAIN(int argc, char **argv, char **envp) {
  run();

  return 0;
}

0 commit comments

Comments
 (0)