diff --git a/libc/src/__support/GPU/CMakeLists.txt b/libc/src/__support/GPU/CMakeLists.txt
index f8fdfeb9da9df..72a7879f72224 100644
--- a/libc/src/__support/GPU/CMakeLists.txt
+++ b/libc/src/__support/GPU/CMakeLists.txt
@@ -9,6 +9,12 @@ add_header_library(
     utils.h
 )
 
+add_header_library(
+  fixedstack
+  HDRS
+    fixedstack.h
+)
+
 add_object_library(
   allocator
   SRCS
@@ -23,4 +29,5 @@ add_object_library(
     libc.src.__support.CPP.bit
     libc.src.__support.CPP.new
     .utils
+    .fixedstack
 )
diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index 250bebdbb7d81..534a309fec7b4 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -20,6 +20,7 @@
 #include "src/__support/CPP/atomic.h"
 #include "src/__support/CPP/bit.h"
 #include "src/__support/CPP/new.h"
+#include "src/__support/GPU/fixedstack.h"
 #include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
 #include "src/__support/threads/sleep.h"
@@ -39,6 +40,9 @@ constexpr static uint32_t MIN_ALIGNMENT = MIN_SIZE - 1;
 // The number of times to attempt claiming an in-progress slab allocation.
 constexpr static uint32_t MAX_TRIES = 1024;
 
+// The number of previously allocated slabs we will keep in memory.
+constexpr static uint32_t CACHED_SLABS = 8;
+
 static_assert(!(ARRAY_SIZE & (ARRAY_SIZE - 1)), "Must be a power of two");
 
 namespace impl {
@@ -185,20 +189,35 @@ struct Slab {
   struct alignas(MIN_SIZE) Header {
     uint32_t chunk_size;
     uint32_t global_index;
+    uint32_t cached_chunk_size;
   };
 
   // Initialize the slab with its chunk size and index in the global table for
   // use when freeing.
   Slab(uint32_t chunk_size, uint32_t global_index) {
     Header *header = reinterpret_cast<Header *>(memory);
+    header->cached_chunk_size = cpp::numeric_limits<uint32_t>::max();
     header->chunk_size = chunk_size;
     header->global_index = global_index;
   }
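+
+  // cached_chunk_size records the chunk size this slab last used. It starts
+  // out as UINT32_MAX for a brand new slab and is refreshed by reset() below
+  // whenever the slab is recycled from the cache.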
 
+  // Reset the memory with a new index and chunk size, not thread safe.
+  Slab *reset(uint32_t chunk_size, uint32_t global_index) {
+    Header *header = reinterpret_cast<Header *>(memory);
+    header->cached_chunk_size = header->chunk_size;
+    header->chunk_size = chunk_size;
+    header->global_index = global_index;
+    return this;
+  }
+
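+  // A slab only reaches the cache after all of its chunks were freed, which
+  // leaves its old bitfield all zero. A smaller previous chunk size implies a
+  // larger previous bitfield, so whenever the old bitfield covers the new one
+  // no zeroing is needed.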
   // Set the necessary bitfield bytes to zero in parallel using many lanes. This
   // must be called before the bitfield can be accessed safely, memory is not
   // guaranteed to be zero initialized in the current implementation.
   void initialize(uint64_t uniform) {
+    // If this is a re-used slab the memory is already set to zero.
+    if (get_cached_chunk_size() <= get_chunk_size())
+      return;
+
     uint32_t size = (bitfield_bytes(get_chunk_size()) + sizeof(uint32_t) - 1) /
                     sizeof(uint32_t);
     impl::uniform_memset(get_bitfield(), 0, size, uniform);
@@ -236,6 +255,11 @@ struct Slab {
     return reinterpret_cast<const Header *>(memory)->chunk_size;
   }
 
+  // Get the chunk size that was previously used.
+  uint32_t get_cached_chunk_size() const {
+    return reinterpret_cast<const Header *>(memory)->cached_chunk_size;
+  }
+
   // Get the location in the memory where we will store the global index.
   uint32_t get_global_index() const {
     return reinterpret_cast<const Header *>(memory)->global_index;
   }
@@ -337,6 +361,9 @@ struct Slab {
   uint8_t memory[SLAB_SIZE];
 };
 
+// A global cache of previously allocated slabs for efficient reuse.
+static FixedStack<Slab *, CACHED_SLABS> slab_cache;
+
 /// A wait-free guard around a pointer resource to be created dynamically if
 /// space is available and freed once there are no more users.
 struct GuardPtr {
@@ -408,6 +435,11 @@ struct GuardPtr {
             reinterpret_cast<Slab *>(cpp::numeric_limits<uintptr_t>::max()),
             cpp::MemoryOrder::RELAXED, cpp::MemoryOrder::RELAXED)) {
       count = cpp::numeric_limits<uint64_t>::max();
+
+      Slab *cached = nullptr;
+      if (slab_cache.pop(cached))
+        return cached->reset(cpp::forward<Args>(args)...);
+
       void *raw = impl::rpc_allocate(sizeof(Slab));
       if (!raw)
         return nullptr;
@@ -475,8 +507,10 @@ struct GuardPtr {
     if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(mask)) &&
         ref.release(cpp::popcount(mask))) {
       Slab *p = ptr.load(cpp::MemoryOrder::RELAXED);
-      p->~Slab();
-      impl::rpc_free(p);
+      if (!slab_cache.push(p)) {
+        p->~Slab();
+        impl::rpc_free(p);
+      }
       cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
       ptr.store(nullptr, cpp::MemoryOrder::RELAXED);
     }
diff --git a/libc/src/__support/GPU/fixedstack.h b/libc/src/__support/GPU/fixedstack.h
new file mode 100644
index 0000000000000..6ceaa2fc73355
--- /dev/null
+++ b/libc/src/__support/GPU/fixedstack.h
@@ -0,0 +1,111 @@
+//===-- A lock-free data structure for a fixed capacity stack ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_GPU_FIXEDSTACK_H
+#define LLVM_LIBC_SRC___SUPPORT_GPU_FIXEDSTACK_H
+
+#include "src/__support/CPP/atomic.h"
+#include "src/__support/threads/sleep.h"
+
+#include <stdint.h>
+
+namespace LIBC_NAMESPACE_DECL {
+
+// A lock-free fixed size stack backed by an underlying array of data. It
+// supports push and pop operations in a completely lock-free manner.
+template <typename T, uint32_t CAPACITY> struct alignas(16) FixedStack {
+  // The index is stored as a 20-bit value, so the capacity cannot exceed what
+  // 20 bits can address.
+  static_assert(CAPACITY < 1024 * 1024, "Invalid buffer size");
+
+  // The head of the free and used stacks. Represented as a 20-bit index
+  // combined with a 44-bit ABA tag that is updated in a single atomic
+  // operation.
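+  // For example, index 5 under tag 7 is encoded as (7 << 20) | 5. Every
+  // successful compare-and-swap increments the tag, so a thread holding a
+  // stale head value fails its exchange even if the same index reappears.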
+  uint64_t free;
+  uint64_t used;
+
+  // The stack is a linked list of indices into the underlying data.
+  uint32_t next[CAPACITY];
+  T data[CAPACITY];
+
+  // Get the 20-bit index into the underlying array from the head.
+  LIBC_INLINE static constexpr uint32_t get_node(uint64_t head) {
+    return static_cast<uint32_t>(head & 0xfffff);
+  }
+
+  // Increment the old ABA tag and merge it into the new index.
+  LIBC_INLINE static constexpr uint64_t make_head(uint64_t orig,
+                                                  uint32_t node) {
+    return static_cast<uint64_t>(node) | (((orig >> 20ul) + 1ul) << 20ul);
+  }
+
+  // Attempts to pop data from the given stack by making it point to the next
+  // node. We repeatedly attempt to write to the head using compare-and-swap,
+  // expecting that it has not been changed by any other thread.
+  LIBC_INLINE uint32_t pop_impl(cpp::AtomicRef<uint64_t> head) {
+    uint64_t orig = head.load(cpp::MemoryOrder::RELAXED);
+
+    for (;;) {
+      if (get_node(orig) == CAPACITY)
+        return CAPACITY;
+
+      uint32_t node =
+          cpp::AtomicRef(next[get_node(orig)]).load(cpp::MemoryOrder::RELAXED);
+      if (head.compare_exchange_strong(orig, make_head(orig, node),
+                                       cpp::MemoryOrder::ACQUIRE,
+                                       cpp::MemoryOrder::RELAXED))
+        break;
+    }
+    return get_node(orig);
+  }
+
+  // Attempts to push data to the given stack by making it point to the new
+  // node. We repeatedly attempt to write to the head using compare-and-swap,
+  // expecting that it has not been changed by any other thread.
+  LIBC_INLINE uint32_t push_impl(cpp::AtomicRef<uint64_t> head, uint32_t node) {
+    uint64_t orig = head.load(cpp::MemoryOrder::RELAXED);
+    for (;;) {
+      next[node] = get_node(orig);
+      if (head.compare_exchange_strong(orig, make_head(orig, node),
+                                       cpp::MemoryOrder::RELEASE,
+                                       cpp::MemoryOrder::RELAXED))
+        break;
+    }
+    return get_node(head.load(cpp::MemoryOrder::RELAXED));
+  }
+
+public:
+  // Initialize the free stack to be full and the used stack to be empty. We use
+  // the capacity of the stack as a sentinel value.
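+  // With free = 0 the free list starts at node 0 and each node points to its
+  // successor; used = CAPACITY marks the used list as initially empty.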
+  LIBC_INLINE constexpr FixedStack() : free(0), used(CAPACITY), data{} {
+    for (uint32_t i = 0; i < CAPACITY; ++i)
+      next[i] = i + 1;
+  }
+
+  LIBC_INLINE bool push(const T &val) {
+    uint32_t node = pop_impl(cpp::AtomicRef(free));
+    if (node == CAPACITY)
+      return false;
+
+    data[node] = val;
+    push_impl(cpp::AtomicRef(used), node);
+    return true;
+  }
+
+  LIBC_INLINE bool pop(T &val) {
+    uint32_t node = pop_impl(cpp::AtomicRef(used));
+    if (node == CAPACITY)
+      return false;
+
+    val = data[node];
+    push_impl(cpp::AtomicRef(free), node);
+    return true;
+  }
+};
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_GPU_FIXEDSTACK_H
diff --git a/libc/test/integration/src/__support/GPU/CMakeLists.txt b/libc/test/integration/src/__support/GPU/CMakeLists.txt
index e066830f6cc0d..1fb175b92ef2a 100644
--- a/libc/test/integration/src/__support/GPU/CMakeLists.txt
+++ b/libc/test/integration/src/__support/GPU/CMakeLists.txt
@@ -27,3 +27,16 @@ add_integration_test(
   LOADER_ARGS
     --threads 64
 )
+
+add_libc_test(
+  fixedstack_test
+  SUITE
+    libc-support-gpu-tests
+  SRCS
+    fixedstack_test.cpp
+  DEPENDS
+    libc.src.__support.GPU.fixedstack
+  LOADER_ARGS
+    --threads 32
+    --blocks 16
+)
diff --git a/libc/test/integration/src/__support/GPU/fixedstack_test.cpp b/libc/test/integration/src/__support/GPU/fixedstack_test.cpp
new file mode 100644
index 0000000000000..fde51df325ab8
--- /dev/null
+++ b/libc/test/integration/src/__support/GPU/fixedstack_test.cpp
@@ -0,0 +1,44 @@
+//===-- Integration test for the lock-free stack --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/GPU/fixedstack.h"
+#include "src/__support/GPU/utils.h"
+#include "test/IntegrationTest/test.h"
+
+using namespace LIBC_NAMESPACE;
+
+static FixedStack<uint32_t, 1024> global_stack;
+
+void run() {
+  // We need enough space in the stack as threads in flight can temporarily
+  // consume memory before they finish committing it back to the stack.
+  ASSERT_EQ(gpu::get_num_blocks() * gpu::get_num_threads(), 512);
+
+  uint32_t val;
+  uint32_t num_threads = static_cast<uint32_t>(gpu::get_num_threads());
+  for (int i = 0; i < 256; ++i) {
+    EXPECT_TRUE(global_stack.push(UINT32_MAX));
+    EXPECT_TRUE(global_stack.pop(val));
+    ASSERT_TRUE(val < num_threads || val == UINT32_MAX);
+  }
+
+  EXPECT_TRUE(global_stack.push(static_cast<uint32_t>(gpu::get_thread_id())));
+  EXPECT_TRUE(global_stack.push(static_cast<uint32_t>(gpu::get_thread_id())));
+  EXPECT_TRUE(global_stack.pop(val));
+  ASSERT_TRUE(val < num_threads || val == UINT32_MAX);
+
+  // Fill the rest of the stack with the default value.
+  while (!global_stack.push(UINT32_MAX))
+    ;
+}
+
+TEST_MAIN(int argc, char **argv, char **envp) {
+  run();
+
+  return 0;
+}
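
Beyond the integration test above, a minimal single-threaded sketch of the FixedStack API may help review. It assumes compilation inside the LLVM libc tree, where LIBC_NAMESPACE and the support headers are available; the `demo` function and its values are illustrative only and not part of the patch:

```cpp
// Illustrative sketch, not part of the patch.
#include "src/__support/GPU/fixedstack.h"

using namespace LIBC_NAMESPACE;

// A tiny four-element stack; CAPACITY must stay below 2^20.
static FixedStack<int, 4> demo_stack;

bool demo() {
  // push() claims a node from the free list, stores the value, then publishes
  // the node on the used list. It fails once all four slots are taken.
  for (int i = 0; i < 4; ++i)
    if (!demo_stack.push(i))
      return false;
  if (demo_stack.push(42)) // A fifth push must fail: the free list is empty.
    return false;

  // Run single-threaded, pop() drains the values in LIFO order (3, 2, 1, 0)
  // and fails once the used list is empty again.
  int val;
  while (demo_stack.pop(val))
    ;
  return true;
}
```

Since push and pop only move 20-bit indices between the two tagged lists, the element array itself never moves or reallocates, which is what makes the structure usable as a statically allocated global in GPU device code.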