Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions libc/src/__support/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,15 @@ add_header_library(
libc.src.__support.CPP.array
)

# Header-only lock-free fixed capacity stack.
add_header_library(
  fixedstack
  HDRS
    fixedstack.h
  DEPENDS
    libc.src.__support.CPP.array
    libc.src.__support.CPP.atomic
    # fixedstack.h includes src/__support/threads/sleep.h for sleep_briefly().
    libc.src.__support.threads.sleep
)

add_header_library(
char_vector
HDRS
Expand Down
130 changes: 130 additions & 0 deletions libc/src/__support/fixedstack.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
//===-- A lock-free data structure for a fixed capacity stack ---*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC___SUPPORT_FIXEDSTACK_H
#define LLVM_LIBC_SRC___SUPPORT_FIXEDSTACK_H

#include "src/__support/CPP/array.h"
#include "src/__support/CPP/atomic.h"
#include "src/__support/threads/sleep.h"

#include <stdint.h>

namespace LIBC_NAMESPACE {

// A lock-free fixed size stack backed by an underlying cpp::array data
// structure. It supports push and pop operations in a thread safe manner.
template <typename T, uint32_t CAPACITY> class alignas(16) FixedStack {
// The index is stored as a 20-bit value. CAPACITY doubles as the sentinel for
// an empty stack, so CAPACITY itself must also fit in those 20 bits.
static_assert(CAPACITY < 1024 * 1024, "Invalid buffer size");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: add a comment explaining why 1024*1024 is the max size

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done. Unfortunately I don't know of any other use-cases for a strict stack in libc. The other uses of atexit just get to claim the whole thing with a mutex so they don't need to worry about this and just let it append.


// The heads of the free and used stacks. Each is represented as a 20-bit index
// combined with a 44-bit ABA tag, both updated in a single atomic operation.
uint64_t free;
uint64_t used;

// The stack is a linked list of indices into the underlying data
cpp::array<uint32_t, CAPACITY> next;
cpp::array<T, CAPACITY> data;

// Get the 20-bit index into the underlying array from the head. The mask must
// cover exactly the low 20 bits so that it agrees with the 20-bit shift used
// when the ABA tag is merged into the head; a narrower mask would truncate
// any index (or the CAPACITY sentinel) that is 65536 or larger.
static constexpr uint32_t get_node(uint64_t head) {
  return static_cast<uint32_t>(head & 0xfffff);
}

// Build the replacement head value: take the ABA tag stored above the low 20
// bits of `orig`, advance it by one, and combine it with the new index `node`.
static constexpr uint64_t make_new_head(uint64_t orig, uint32_t node) {
  const uint64_t bumped_tag = (orig >> 20ul) + 1ul;
  return (bumped_tag << 20ul) | static_cast<uint64_t>(node);
}

// Helper macros for the atomic operations. We cannot use the standard
// cpp::atomic helpers because the initializer will no longer be constexpr and
// the NVPTX backend cannot currently support all of the atomics.
//
// atomic_load(val, mem_order): forwards to the __atomic_load_n builtin to read
// the object pointed to by `val` with the given memory order.
//
// atomic_cas(val, expected, desired, success_order, failure_order): forwards
// to the __atomic_compare_exchange_n builtin in its weak form. The memory
// order enumerators are cast to int because that is what the builtins take.
//
// Both macros are #undef'd again at the end of the class.
#define atomic_load(val, mem_order) __atomic_load_n(val, (int)mem_order)
#define atomic_cas(val, expected, desired, success_order, failure_order) \
__atomic_compare_exchange_n(val, expected, desired, /*weak=*/true, \
(int)success_order, (int)failure_order)

// Removes the top node from the stack whose head word is `*head` and returns
// its index, or CAPACITY if that stack is empty. The head is advanced to the
// removed node's successor with a compare-and-swap that is retried (after a
// brief sleep) whenever the head no longer matches the value we observed.
uint32_t pop_impl(uint64_t *head) {
  uint64_t observed = atomic_load(head, cpp::MemoryOrder::RELAXED);

  // CAPACITY is the sentinel index that marks an empty stack.
  while (get_node(observed) != CAPACITY) {
    const uint32_t successor =
        atomic_load(&next[get_node(observed)], cpp::MemoryOrder::RELAXED);
    // `observed` still holds the head we replaced when the exchange succeeds,
    // so its index is the node that was just popped.
    if (atomic_cas(head, &observed, make_new_head(observed, successor),
                   cpp::MemoryOrder::ACQUIRE, cpp::MemoryOrder::RELAXED))
      return get_node(observed);
    sleep_briefly();
  }
  return CAPACITY;
}

// Attempts to push data to the given stack by making it point to the new
// node. We repeatedly attempt to write to the head using compare-and-swap,
// expecting that it has not been changed by any other thread.
//
// `head` is the head word of the stack to push onto and `node` is the index of
// the node being linked in. Returns the index of the node that was installed
// as the new head, i.e. `node`.
uint32_t push_impl(uint64_t *head, uint32_t node) {
  uint64_t orig = atomic_load(head, cpp::MemoryOrder::RELAXED);
  for (;;) {
    // Link the new node in front of the head we currently believe in. This is
    // redone on every attempt because a failed exchange refreshes `orig`.
    next[node] = get_node(orig);
    if (atomic_cas(head, &orig, make_new_head(orig, node),
                   cpp::MemoryOrder::RELEASE, cpp::MemoryOrder::RELAXED))
      break;
    sleep_briefly();
  }
  // Report the node we installed directly instead of reading `*head` again:
  // a plain read of the head here would not be atomic and could already
  // reflect a later update made by another thread.
  return node;
}

public:
// Initialize the free stack to be full and the used stack to be empty. We use
// the capacity of the stack as a sentinel value.
//
// Every member, including `next`, is named in the initializer list so that
// the constructor does not leave any member uninitialized when it is
// evaluated as a constant expression. The loop then chains every slot to its
// successor, with the last slot pointing at the CAPACITY sentinel.
constexpr FixedStack() : free(0), used(CAPACITY), next{}, data{} {
  for (uint32_t i = 0; i < CAPACITY; ++i)
    next[i] = i + 1;
}

// Copies `val` onto the stack. A slot is first claimed from the free list,
// filled in, and then linked onto the used list. Returns false without
// storing anything when no free slot is available.
bool push(const T &val) {
  const uint32_t slot = pop_impl(&free);
  if (slot != CAPACITY) {
    data[slot] = val;
    push_impl(&used, slot);
    return true;
  }
  return false;
}

// Removes the most recently linked element from the used list and copies it
// into `val`, then hands the slot back to the free list. Returns false and
// leaves `val` untouched when the used list is empty.
bool pop(T &val) {
  const uint32_t slot = pop_impl(&used);
  const bool found = slot != CAPACITY;
  if (found) {
    val = data[slot];
    push_impl(&free, slot);
  }
  return found;
}

// Reports whether the used list currently has no nodes, i.e. its head index
// is the CAPACITY sentinel.
bool empty() const {
  const uint64_t used_head = atomic_load(&used, cpp::MemoryOrder::RELAXED);
  return get_node(used_head) == CAPACITY;
}

// Reports whether the free list currently has no nodes, i.e. its head index
// is the CAPACITY sentinel.
bool full() const {
  const uint64_t free_head = atomic_load(&free, cpp::MemoryOrder::RELAXED);
  return get_node(free_head) == CAPACITY;
}

#undef atomic_load
#undef atomic_cas
};

} // namespace LIBC_NAMESPACE

#endif // LLVM_LIBC_SRC___SUPPORT_FIXEDSTACK_H
4 changes: 4 additions & 0 deletions libc/test/integration/src/__support/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1 +1,5 @@
# Only descend into the directory named after the target OS when it exists in
# this source directory.
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
add_subdirectory(${LIBC_TARGET_OS})
endif()

# The threads tests are always added.
add_subdirectory(threads)
19 changes: 19 additions & 0 deletions libc/test/integration/src/__support/gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Umbrella target for the GPU support integration tests, registered as a
# dependency of the top-level libc-integration-tests target.
add_custom_target(support-gpu-integration-tests)
add_dependencies(libc-integration-tests support-gpu-integration-tests)

# Integration test for the fixed capacity stack. The loader is asked for
# 2x2x2 blocks of 4x4x4 threads, i.e. 8 * 64 = 512 threads in total.
add_integration_test(
support_fixed_stack_test
SUITE support-gpu-integration-tests
SRCS
fixed_stack_test.cpp
DEPENDS
libc.src.__support.GPU.utils
libc.src.__support.fixedstack
LOADER_ARGS
--blocks-x 2
--blocks-y 2
--blocks-z 2
--threads-x 4
--threads-y 4
--threads-z 4
)
75 changes: 75 additions & 0 deletions libc/test/integration/src/__support/gpu/fixed_stack_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
//===-- Integration test for the lock-free stack --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "src/__support/GPU/utils.h"
#include "src/__support/fixedstack.h"
#include "test/IntegrationTest/test.h"

using namespace LIBC_NAMESPACE;

// Checks the push/pop ordering of a function-local stack: fills it with the
// values 0..15, then pops everything and expects the values in reverse order.
// The body is only compiled for AMDGPU targets.
void single_thread() {
// FIXME: The NVPTX backend cannot handle atomic CAS on a local address space.
#if defined(LIBC_TARGET_ARCH_IS_AMDGPU)
FixedStack<int, 16> local_stack;

// Fill every slot; the stack must report itself as full afterwards.
for (int i = 0; i < 16; ++i)
EXPECT_TRUE(local_stack.push(i));
ASSERT_TRUE(local_stack.full());

// Values must come back in last-in, first-out order: 15, 14, ..., 0.
int val;
for (int i = 0; i < 16; ++i) {
EXPECT_TRUE(local_stack.pop(val));
EXPECT_EQ(val, 16 - 1 - i);
}
ASSERT_TRUE(local_stack.empty());
#endif
}

static FixedStack<uint32_t, 2048> global_stack;
// Stress test on the shared global stack. It asserts that the launch consists
// of 512 threads in total, then has the calling thread push and pop repeatedly
// and checks that every popped value is either the UINT32_MAX filler or a
// value below the thread count returned by gpu::get_num_threads().
void multiple_threads() {
  // We need enough space in the stack as threads in flight can temporarily
  // consume memory before they finish committing it back to the stack.
  ASSERT_EQ(gpu::get_num_blocks() * gpu::get_num_threads(), 512);

  uint32_t val;
  uint32_t num_threads = static_cast<uint32_t>(gpu::get_num_threads());
  for (int i = 0; i < 256; ++i) {
    EXPECT_TRUE(global_stack.push(UINT32_MAX));
    EXPECT_TRUE(global_stack.pop(val));
    ASSERT_TRUE(val < num_threads || val == UINT32_MAX);
  }

  // Push this thread's id twice and pop once, so each caller adds one net
  // element at this step.
  EXPECT_TRUE(global_stack.push(static_cast<uint32_t>(gpu::get_thread_id())));
  EXPECT_TRUE(global_stack.push(static_cast<uint32_t>(gpu::get_thread_id())));
  EXPECT_TRUE(global_stack.pop(val));
  ASSERT_TRUE(val < num_threads || val == UINT32_MAX);

  // Fill the rest of the stack with the default value.
  // NOTE(review): as written this loop stops after the first push that
  // succeeds, so it adds a single element rather than filling the stack to
  // capacity -- confirm which behavior is intended.
  while (!global_stack.push(UINT32_MAX))
    ;
}

// Once all the threads have finished executing check the final state of the
// stack. Destructors are always run with a single thread on the GPU.
//
// Drains the global stack and checks that every remaining value is either the
// UINT32_MAX filler or below 64.
// NOTE(review): 64 presumably mirrors the 4x4x4 threads per block this test
// is launched with -- confirm against the loader arguments.
[[gnu::destructor]] void check_stack() {
// The test left elements on the stack, so it must not be empty here.
ASSERT_FALSE(global_stack.empty());

while (!global_stack.empty()) {
uint32_t val;
ASSERT_TRUE(global_stack.pop(val));
ASSERT_TRUE(val < 64 || val == UINT32_MAX);
}
}

// Test entry point: runs the single-threaded check followed by the
// multi-threaded stress test. The arguments are not used.
TEST_MAIN(int argc, char **argv, char **envp) {
single_thread();

multiple_threads();

return 0;
}
10 changes: 10 additions & 0 deletions libc/test/src/__support/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,16 @@ add_libc_test(
libc.src.__support.fixedvector
)

# Unit test for the fixedstack header library.
add_libc_test(
fixedstack_test
SUITE
libc-support-tests
SRCS
fixedstack_test.cpp
DEPENDS
libc.src.__support.fixedstack
)

add_libc_test(
char_vector_test
SUITE
Expand Down
26 changes: 26 additions & 0 deletions libc/test/src/__support/fixedstack_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
//===-- Unittests for FixedStack ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "src/__support/fixedstack.h"
#include "test/UnitTest/Test.h"

// Fills a 20-element stack, verifies that a further push is rejected, then
// drains it and checks that the values come back in last-in, first-out order.
// The suite is named after FixedStack, the class actually under test.
TEST(LlvmLibcFixedStackTest, PushAndPop) {
  static LIBC_NAMESPACE::FixedStack<int, 20> fixed_stack;
  ASSERT_TRUE(fixed_stack.empty());
  ASSERT_FALSE(fixed_stack.full());
  for (int i = 0; i < 20; i++)
    ASSERT_TRUE(fixed_stack.push(i));
  ASSERT_FALSE(fixed_stack.empty());
  ASSERT_TRUE(fixed_stack.full());
  // The stack is at capacity, so one more push must fail.
  ASSERT_FALSE(fixed_stack.push(123));
  int val;
  for (int i = 20; i > 0; --i) {
    ASSERT_TRUE(fixed_stack.pop(val));
    ASSERT_EQ(val, i - 1);
  }
  // Nothing is left to pop once all 20 values have been removed.
  ASSERT_FALSE(fixed_stack.pop(val));
  ASSERT_TRUE(fixed_stack.empty());
  ASSERT_FALSE(fixed_stack.full());
}