Commit 7d3624a
[libc] Cache old slabs when allocating GPU memory
Summary: This patch introduces a lock-free stack used to store a fixed number of slabs. Instead of always going through the RPC interface to allocate memory, we can first consult this cache and reuse an old slab. Currently, this means that up to ~64 MiB of memory will remain in use if the user completely fills the cache. However, because a slab is only cached after it has been fully released, the chunk size can simply be reset so the slab can be completely reused. This greatly improves performance in cases where the user has called malloc before, narrowing the gap between an implementation that never frees slabs and one that does. We can also skip the expensive zeroing step when the old chunk size was no larger than the new one: smaller chunk sizes need a larger bitfield, and because the reference counting guarantees that no users remain in the slab, the old bitfield is all zero again, just as it was when first initialized.
1 parent 25c02fb
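The zeroing shortcut described in the last two sentences is just counting bits: the bitfield needs one bit per chunk, so a smaller chunk size means more chunks and a larger bitfield. A sketch of that arithmetic follows; `SLAB_SIZE` and `bitfield_bytes` here are illustrative stand-ins, not the exact definitions from allocator.cpp, which also account for the header.

#include <cassert>
#include <cstdint>

// Illustrative stand-ins; the real constant and helper live in allocator.cpp,
// but the monotonicity argument is the same.
constexpr uint32_t SLAB_SIZE = 1 << 21; // assumed slab size for illustration

// One bit per chunk, rounded up to whole bytes.
constexpr uint32_t bitfield_bytes(uint32_t chunk_size) {
  return ((SLAB_SIZE / chunk_size) + 7) / 8;
}

int main() {
  // A smaller chunk size means more chunks and therefore a larger bitfield.
  assert(bitfield_bytes(16) > bitfield_bytes(32));

  // So when cached_chunk_size <= chunk_size, the new bitfield lies entirely
  // inside the region the previous (larger) bitfield already left zeroed, and
  // initialize() can return without touching memory.
  uint32_t cached_chunk_size = 16, chunk_size = 32;
  assert(cached_chunk_size <= chunk_size &&
         bitfield_bytes(chunk_size) <= bitfield_bytes(cached_chunk_size));
  return 0;
}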

File tree

5 files changed: +211 -2 lines changed

libc/src/__support/GPU/CMakeLists.txt

Lines changed: 7 additions & 0 deletions

@@ -9,6 +9,12 @@ add_header_library(
     utils.h
 )
 
+add_header_library(
+  fixedstack
+  HDRS
+    fixedstack.h
+)
+
 add_object_library(
   allocator
   SRCS
@@ -23,4 +29,5 @@ add_object_library(
     libc.src.__support.CPP.bit
     libc.src.__support.CPP.new
     .utils
+    .fixedstack
 )

libc/src/__support/GPU/allocator.cpp

Lines changed: 36 additions & 2 deletions

@@ -20,6 +20,7 @@
 #include "src/__support/CPP/atomic.h"
 #include "src/__support/CPP/bit.h"
 #include "src/__support/CPP/new.h"
+#include "src/__support/GPU/fixedstack.h"
 #include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
 #include "src/__support/threads/sleep.h"
@@ -39,6 +40,9 @@ constexpr static uint32_t MIN_ALIGNMENT = MIN_SIZE - 1;
 // The number of times to attempt claiming an in-progress slab allocation.
 constexpr static uint32_t MAX_TRIES = 1024;
 
+// The number of previously allocated slabs we will keep in memory.
+constexpr static uint32_t CACHED_SLABS = 8;
+
 static_assert(!(ARRAY_SIZE & (ARRAY_SIZE - 1)), "Must be a power of two");
 
 namespace impl {
@@ -185,20 +189,35 @@ struct Slab {
   struct alignas(MIN_SIZE) Header {
     uint32_t chunk_size;
     uint32_t global_index;
+    uint32_t cached_chunk_size;
   };
 
   // Initialize the slab with its chunk size and index in the global table for
   // use when freeing.
   Slab(uint32_t chunk_size, uint32_t global_index) {
     Header *header = reinterpret_cast<Header *>(memory);
+    header->cached_chunk_size = cpp::numeric_limits<uint32_t>::max();
     header->chunk_size = chunk_size;
     header->global_index = global_index;
   }
 
+  // Reset the memory with a new index and chunk size, not thread safe.
+  Slab *reset(uint32_t chunk_size, uint32_t global_index) {
+    Header *header = reinterpret_cast<Header *>(memory);
+    header->cached_chunk_size = header->chunk_size;
+    header->chunk_size = chunk_size;
+    header->global_index = global_index;
+    return this;
+  }
+
   // Set the necessary bitfield bytes to zero in parallel using many lanes. This
   // must be called before the bitfield can be accessed safely, memory is not
   // guaranteed to be zero initialized in the current implementation.
   void initialize(uint64_t uniform) {
+    // If this is a re-used slab the memory is already set to zero.
+    if (get_cached_chunk_size() <= get_chunk_size())
+      return;
+
     uint32_t size = (bitfield_bytes(get_chunk_size()) + sizeof(uint32_t) - 1) /
                     sizeof(uint32_t);
     impl::uniform_memset(get_bitfield(), 0, size, uniform);
@@ -236,6 +255,11 @@ struct Slab {
     return reinterpret_cast<const Header *>(memory)->chunk_size;
   }
 
+  // Get the chunk size that was previously used.
+  uint32_t get_cached_chunk_size() const {
+    return reinterpret_cast<const Header *>(memory)->cached_chunk_size;
+  }
+
   // Get the location in the memory where we will store the global index.
   uint32_t get_global_index() const {
     return reinterpret_cast<const Header *>(memory)->global_index;
@@ -337,6 +361,9 @@ struct Slab {
   uint8_t memory[SLAB_SIZE];
 };
 
+// A global cache of previously allocated slabs for efficient reuse.
+static FixedStack<Slab *, CACHED_SLABS> slab_cache;
+
 /// A wait-free guard around a pointer resource to be created dynamically if
 /// space is available and freed once there are no more users.
 struct GuardPtr {
@@ -408,6 +435,11 @@ struct GuardPtr {
             reinterpret_cast<Slab *>(cpp::numeric_limits<uintptr_t>::max()),
             cpp::MemoryOrder::RELAXED, cpp::MemoryOrder::RELAXED)) {
       count = cpp::numeric_limits<uint32_t>::max();
+
+      Slab *cached = nullptr;
+      if (slab_cache.pop(cached))
+        return cached->reset(cpp::forward<Args>(args)...);
+
       void *raw = impl::rpc_allocate(sizeof(Slab));
       if (!raw)
         return nullptr;
@@ -475,8 +507,10 @@ struct GuardPtr {
     if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(mask)) &&
         ref.release(cpp::popcount(mask))) {
       Slab *p = ptr.load(cpp::MemoryOrder::RELAXED);
-      p->~Slab();
-      impl::rpc_free(p);
+      if (!slab_cache.push(p)) {
+        p->~Slab();
+        impl::rpc_free(p);
+      }
       cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
       ptr.store(nullptr, cpp::MemoryOrder::RELAXED);
     }
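Taken together, the allocator changes reduce to a try-cache-first pattern on allocation and a try-cache-before-free pattern on release. The following is a minimal sketch of that flow, assuming hypothetical host stand-ins for the RPC calls and a stripped-down `Slab` that keeps only the header fields (the bitfield and chunk storage are elided):

#include "src/__support/GPU/fixedstack.h"

#include <cstdint>
#include <cstdlib>
#include <new>

using namespace LIBC_NAMESPACE;

// Hypothetical host stand-ins; the real calls round-trip to the server over
// the RPC interface.
static void *rpc_allocate(uint64_t size) { return std::malloc(size); }
static void rpc_free(void *ptr) { std::free(ptr); }

// Stripped-down slab keeping only the header fields from allocator.cpp.
struct Slab {
  uint32_t chunk_size;
  uint32_t global_index;
  uint32_t cached_chunk_size;
  Slab(uint32_t cs, uint32_t gi)
      : chunk_size(cs), global_index(gi), cached_chunk_size(UINT32_MAX) {}
  Slab *reset(uint32_t cs, uint32_t gi) {
    cached_chunk_size = chunk_size; // remember the old size for initialize()
    chunk_size = cs;
    global_index = gi;
    return this;
  }
};

static FixedStack<Slab *, 8> slab_cache;

// Allocation: reuse a cached slab when available, else pay for the RPC call.
Slab *acquire_slab(uint32_t chunk_size, uint32_t global_index) {
  Slab *cached = nullptr;
  if (slab_cache.pop(cached))
    return cached->reset(chunk_size, global_index);
  void *raw = rpc_allocate(sizeof(Slab));
  return raw ? new (raw) Slab(chunk_size, global_index) : nullptr;
}

// Release: cache the dead slab if there is room, otherwise destroy it.
void release_slab(Slab *p) {
  if (!slab_cache.push(p)) {
    p->~Slab();
    rpc_free(p);
  }
}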

libc/src/__support/GPU/fixedstack.h

Lines changed: 111 additions & 0 deletions (new file)

//===-- A lock-free data structure for a fixed capacity stack ---*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC___SUPPORT_GPU_FIXEDSTACK_H
#define LLVM_LIBC_SRC___SUPPORT_GPU_FIXEDSTACK_H

#include "src/__support/CPP/atomic.h"
#include "src/__support/threads/sleep.h"

#include <stdint.h>

namespace LIBC_NAMESPACE_DECL {

// A lock-free fixed size stack backed by an underlying array of data. It
// supports push and pop operations in a completely lock-free manner.
template <typename T, uint32_t CAPACITY> struct alignas(16) FixedStack {
  // The index is stored as a 20-bit value, so it cannot address any more
  // elements than that.
  static_assert(CAPACITY < 1024 * 1024, "Invalid buffer size");

  // The head of the free and used stacks. Represented as a 20-bit index
  // combined with a 44-bit ABA tag that is updated in a single atomic
  // operation.
  uint64_t free;
  uint64_t used;

  // The stack is a linked list of indices into the underlying data.
  uint32_t next[CAPACITY];
  T data[CAPACITY];

  // Get the 20-bit index into the underlying array from the head.
  LIBC_INLINE static constexpr uint32_t get_node(uint64_t head) {
    return static_cast<uint32_t>(head & 0xfffff);
  }

  // Increment the old ABA tag and merge it into the new index.
  LIBC_INLINE static constexpr uint64_t make_head(uint64_t orig,
                                                  uint32_t node) {
    return static_cast<uint64_t>(node) | (((orig >> 20ul) + 1ul) << 20ul);
  }

  // Attempts to pop data from the given stack by making it point to the next
  // node. We repeatedly attempt to write to the head using compare-and-swap,
  // expecting that it has not been changed by any other thread.
  LIBC_INLINE uint32_t pop_impl(cpp::AtomicRef<uint64_t> head) {
    uint64_t orig = head.load(cpp::MemoryOrder::RELAXED);

    for (;;) {
      if (get_node(orig) == CAPACITY)
        return CAPACITY;

      uint32_t node =
          cpp::AtomicRef(next[get_node(orig)]).load(cpp::MemoryOrder::RELAXED);
      if (head.compare_exchange_strong(orig, make_head(orig, node),
                                       cpp::MemoryOrder::ACQUIRE,
                                       cpp::MemoryOrder::RELAXED))
        break;
    }
    return get_node(orig);
  }

  // Attempts to push data to the given stack by making it point to the new
  // node. We repeatedly attempt to write to the head using compare-and-swap,
  // expecting that it has not been changed by any other thread.
  LIBC_INLINE uint32_t push_impl(cpp::AtomicRef<uint64_t> head, uint32_t node) {
    uint64_t orig = head.load(cpp::MemoryOrder::RELAXED);
    for (;;) {
      next[node] = get_node(orig);
      if (head.compare_exchange_strong(orig, make_head(orig, node),
                                       cpp::MemoryOrder::RELEASE,
                                       cpp::MemoryOrder::RELAXED))
        break;
    }
    return get_node(head.load(cpp::MemoryOrder::RELAXED));
  }

public:
  // Initialize the free stack to be full and the used stack to be empty. We
  // use the capacity of the stack as a sentinel value.
  LIBC_INLINE constexpr FixedStack() : free(0), used(CAPACITY), data{} {
    for (uint32_t i = 0; i < CAPACITY; ++i)
      next[i] = i + 1;
  }

  LIBC_INLINE bool push(const T &val) {
    uint32_t node = pop_impl(cpp::AtomicRef(free));
    if (node == CAPACITY)
      return false;

    data[node] = val;
    push_impl(cpp::AtomicRef(used), node);
    return true;
  }

  LIBC_INLINE bool pop(T &val) {
    uint32_t node = pop_impl(cpp::AtomicRef(used));
    if (node == CAPACITY)
      return false;

    val = data[node];
    push_impl(cpp::AtomicRef(free), node);
    return true;
  }
};

} // namespace LIBC_NAMESPACE_DECL

#endif // LLVM_LIBC_SRC___SUPPORT_GPU_FIXEDSTACK_H
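The head packing above is plain bit arithmetic: the low 20 bits hold a node index and the upper 44 bits hold a tag that is incremented on every successful update, so a head whose node was popped and re-pushed never compares equal to a stale copy. A small standalone demonstration of the same encoding (the helpers below are copies of the header's, placed outside the class for illustration):

#include <cassert>
#include <cstdint>

// Same encoding as FixedStack: low 20 bits are the node index, the upper
// 44 bits are an ABA tag bumped on every head update.
constexpr uint32_t get_node(uint64_t head) {
  return static_cast<uint32_t>(head & 0xfffff);
}
constexpr uint64_t make_head(uint64_t orig, uint32_t node) {
  return static_cast<uint64_t>(node) | (((orig >> 20ul) + 1ul) << 20ul);
}

int main() {
  uint64_t head = 0; // index 0, tag 0
  uint64_t updated = make_head(head, 5);
  assert(get_node(updated) == 5); // new index lives in the low bits
  assert((updated >> 20) == 1);   // tag was incremented

  // Even if the index returns to its old value, the tag differs, so a
  // compare-exchange against the stale head value would fail.
  uint64_t back = make_head(updated, get_node(head));
  assert(get_node(back) == get_node(head));
  assert(back != head);
  return 0;
}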

libc/test/integration/src/__support/GPU/CMakeLists.txt

Lines changed: 13 additions & 0 deletions

@@ -27,3 +27,16 @@ add_integration_test(
   LOADER_ARGS
     --threads 64
 )
+
+add_libc_test(
+  fixedstack_test
+  SUITE
+    libc-support-gpu-tests
+  SRCS
+    fixedstack_test.cpp
+  DEPENDS
+    libc.src.__support.GPU.fixedstack
+  LOADER_ARGS
+    --threads 32
+    --blocks 16
+)
libc/test/integration/src/__support/GPU/fixedstack_test.cpp

Lines changed: 44 additions & 0 deletions (new file)

//===-- Integration test for the lock-free stack --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "src/__support/GPU/fixedstack.h"
#include "src/__support/GPU/utils.h"
#include "test/IntegrationTest/test.h"

using namespace LIBC_NAMESPACE;

static FixedStack<uint32_t, 2048> global_stack;

void run() {
  // We need enough space in the stack as threads in flight can temporarily
  // consume memory before they finish committing it back to the stack.
  ASSERT_EQ(gpu::get_num_blocks() * gpu::get_num_threads(), 512);

  uint32_t val;
  uint32_t num_threads = static_cast<uint32_t>(gpu::get_num_threads());
  for (int i = 0; i < 256; ++i) {
    EXPECT_TRUE(global_stack.push(UINT32_MAX));
    EXPECT_TRUE(global_stack.pop(val));
    ASSERT_TRUE(val < num_threads || val == UINT32_MAX);
  }

  EXPECT_TRUE(global_stack.push(static_cast<uint32_t>(gpu::get_thread_id())));
  EXPECT_TRUE(global_stack.push(static_cast<uint32_t>(gpu::get_thread_id())));
  EXPECT_TRUE(global_stack.pop(val));
  ASSERT_TRUE(val < num_threads || val == UINT32_MAX);

  // Fill the rest of the stack with the default value.
  while (!global_stack.push(UINT32_MAX))
    ;
}

TEST_MAIN(int argc, char **argv, char **envp) {
  run();

  return 0;
}
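The test above stresses the stack under contention; the single-threaded behavior is easier to see in isolation. Pushes succeed until all CAPACITY nodes are consumed, pops drain values back out, and both report failure through their boolean result instead of blocking. A minimal sketch, assuming the header and its support headers also compile for the host target:

#include "src/__support/GPU/fixedstack.h"

#include <cassert>
#include <cstdint>

using namespace LIBC_NAMESPACE;

int main() {
  static FixedStack<uint32_t, 4> stack;

  // Pushes succeed until the free list runs out of nodes.
  for (uint32_t i = 0; i < 4; ++i)
    assert(stack.push(i));
  assert(!stack.push(42)); // full: push fails instead of blocking

  // Pops drain the used list in LIFO order; a fifth pop fails the same way.
  uint32_t val = 0;
  for (uint32_t i = 0; i < 4; ++i) {
    assert(stack.pop(val));
    assert(val == 3 - i);
  }
  assert(!stack.pop(val)); // empty
  return 0;
}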
