diff --git a/libc/src/__support/GPU/CMakeLists.txt b/libc/src/__support/GPU/CMakeLists.txt
index 9b359f65cdb33..4ffee011be961 100644
--- a/libc/src/__support/GPU/CMakeLists.txt
+++ b/libc/src/__support/GPU/CMakeLists.txt
@@ -18,5 +18,8 @@ add_object_library(
   DEPENDS
     libc.src.__support.common
     libc.src.__support.RPC.rpc_client
+    libc.src.__support.CPP.atomic
+    libc.src.__support.CPP.bit
+    libc.src.__support.CPP.new
     .utils
 )
diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index ac335a1b9aab0..ca68cbcedd48a 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -5,17 +5,49 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+//
+// This file implements a parallel allocator intended for use on a GPU device.
+// The core algorithm is a slab allocator using a random walk over a bitfield
+// for maximum parallel progress. Slab handling is done by a wait-free
+// reference counted guard. The first use of a slab will create it from system
+// memory for re-use. The last use will invalidate it and free the memory.
+//
+//===----------------------------------------------------------------------===//
 
 #include "allocator.h"
 
+#include "src/__support/CPP/atomic.h"
+#include "src/__support/CPP/bit.h"
+#include "src/__support/CPP/new.h"
 #include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
-#include "src/__support/macros/config.h"
+#include "src/__support/threads/sleep.h"
 
 namespace LIBC_NAMESPACE_DECL {
-namespace {
 
-void *rpc_allocate(uint64_t size) {
+constexpr static uint64_t MAX_SIZE = /* 64 GiB */ 64ull * 1024 * 1024 * 1024;
+constexpr static uint64_t SLAB_SIZE = /* 2 MiB */ 2ull * 1024 * 1024;
+constexpr static uint64_t ARRAY_SIZE = MAX_SIZE / SLAB_SIZE;
+constexpr static uint64_t SLAB_ALIGNMENT = SLAB_SIZE - 1;
+constexpr static uint32_t BITS_IN_WORD = sizeof(uint32_t) * 8;
+constexpr static uint32_t MIN_SIZE = 16;
+constexpr static uint32_t MIN_ALIGNMENT = MIN_SIZE - 1;
+
+// A sentinel used to indicate an invalid but non-null pointer value.
+constexpr static uint64_t SENTINEL = cpp::numeric_limits<uint64_t>::max();
+
+// The number of times we will try starting on a single index before skipping
+// past it.
+constexpr static uint32_t MAX_TRIES = 512;
+
+static_assert(!(ARRAY_SIZE & (ARRAY_SIZE - 1)), "Must be a power of two");
+
+namespace impl {
+// Allocates more memory from the system through the RPC interface. All
+// allocations from the system MUST be aligned on a 2MiB boundary. The default
+// HSA allocator has this behavior for any allocation >= 2MiB and the CUDA
+// driver provides an alignment field for virtual memory allocations.
+static void *rpc_allocate(uint64_t size) {
   void *ptr = nullptr;
   rpc::Client::Port port = rpc::client.open();
   port.send_and_recv(
@@ -27,7 +59,8 @@ void *rpc_allocate(uint64_t size) {
   return ptr;
 }
 
-void rpc_free(void *ptr) {
+// Deallocates the associated system memory.
+static void rpc_free(void *ptr) {
   rpc::Client::Port port = rpc::client.open();
   port.send([=](rpc::Buffer *buffer, uint32_t) {
     buffer->data[0] = reinterpret_cast<uintptr_t>(ptr);
@@ -35,13 +68,453 @@ void rpc_free(void *ptr) {
   port.close();
 }
 
-} // namespace
+// Convert a potentially disjoint bitmask into an increasing integer per-lane
+// for use with indexing between gpu lanes.
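+// For example (illustrative values): if lane_mask is 0b1011, the active lanes
+// 0, 1, and 3 compute the dense indices 0, 1, and 2 respectively.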
+static inline uint32_t lane_count(uint64_t lane_mask) {
+  return cpp::popcount(lane_mask & ((uint64_t(1) << gpu::get_lane_id()) - 1));
+}
+
+// Obtain an initial value to seed a random number generator. We use the
+// rounded multiples of the golden ratio from xorshift* as additional spreading.
+static inline uint32_t entropy() {
+  return (static_cast<uint32_t>(gpu::processor_clock()) ^
+          (gpu::get_thread_id_x() * 0x632be59b) ^
+          (gpu::get_block_id_x() * 0x85157af5)) *
+         0x9e3779bb;
+}
+
+// Generate a random number and update the state using the xorshift32* PRNG.
+static inline uint32_t xorshift32(uint32_t &state) {
+  state ^= state << 13;
+  state ^= state >> 17;
+  state ^= state << 5;
+  return state * 0x9e3779bb;
+}
+
+// Final stage of murmurhash used to get a unique index for the global array.
+static inline uint32_t hash(uint32_t x) {
+  x ^= x >> 16;
+  x *= 0x85ebca6b;
+  x ^= x >> 13;
+  x *= 0xc2b2ae35;
+  x ^= x >> 16;
+  return x;
+}
+
+// Rounds the input value to the closest permitted chunk size. Here we accept
+// the sum of the closest three powers of two. For a 2MiB slab size this is 48
+// different chunk sizes. This gives us an average internal fragmentation of
+// roughly 12.5%.
+static inline uint32_t get_chunk_size(uint32_t x) {
+  uint32_t y = x < MIN_SIZE ? MIN_SIZE : x;
+  uint32_t pow2 = BITS_IN_WORD - cpp::countl_zero(y - 1);
+
+  uint32_t s0 = 0b0100 << (pow2 - 3);
+  uint32_t s1 = 0b0110 << (pow2 - 3);
+  uint32_t s2 = 0b0111 << (pow2 - 3);
+  uint32_t s3 = 0b1000 << (pow2 - 3);
+
+  if (s0 > y)
+    return (s0 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
+  if (s1 > y)
+    return (s1 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
+  if (s2 > y)
+    return (s2 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
+  return (s3 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
+}
+
+// Rounds up to the nearest multiple of N, where N must be a power of two.
+template <uint64_t N, typename T>
+static inline constexpr T round_up(const T x) {
+  static_assert(((N - 1) & N) == 0, "N must be a power of two");
+  return (x + N) & ~(N - 1);
+}
+
+} // namespace impl
+
+/// A slab allocator used to hand out identically sized slabs of memory.
+/// Allocation is done through random walks of a bitfield until a free bit is
+/// encountered. This reduces contention and is highly parallel on a GPU.
+///
+/// 0       4          8        16                 ...                    2 MiB
+/// ┌────────┬──────────┬────────┬──────────────────┬──────────────────────────┐
+/// │ chunk  │  index   │  pad   │    bitfield[]    │         memory[]         │
+/// └────────┴──────────┴────────┴──────────────────┴──────────────────────────┘
+///
+/// The size of the bitfield is the slab size divided by the chunk size divided
+/// by the number of bits per word. We pad the header to ensure 16 byte
+/// alignment and to indicate that if the pointer is not aligned by 2MiB it
+/// belongs to a slab rather than the global allocator.
+struct Slab {
+  // Header metadata for the slab, aligned to the minimum alignment.
+  struct alignas(MIN_SIZE) Header {
+    uint32_t chunk_size;
+    uint32_t global_index;
+  };
+
+  // Initialize the slab with its chunk size and index in the global table for
+  // use when freeing.
+  Slab(uint32_t chunk_size, uint32_t global_index) {
+    Header *header = reinterpret_cast<Header *>(memory);
+    header->chunk_size = chunk_size;
+    header->global_index = global_index;
+
+    // This memset is expensive and likely not necessary for the current 'kfd'
+    // driver. Until zeroed pages are exposed by the API we must be careful.
+    __builtin_memset(get_bitfield(), 0, bitfield_bytes(chunk_size));
+  }
+
+  // Get the number of chunks that can theoretically fit inside this slab.
+  constexpr static uint32_t num_chunks(uint32_t chunk_size) {
+    return SLAB_SIZE / chunk_size;
+  }
+
+  // Get the number of bytes needed to contain the bitfield bits.
+  constexpr static uint32_t bitfield_bytes(uint32_t chunk_size) {
+    return ((num_chunks(chunk_size) + BITS_IN_WORD - 1) / BITS_IN_WORD) * 8;
+  }
+
+  // The actual amount of memory available excluding the bitfield and metadata.
+  constexpr static uint32_t available_bytes(uint32_t chunk_size) {
+    return SLAB_SIZE - bitfield_bytes(chunk_size) - sizeof(Header);
+  }
+
+  // The number of chunks that can be stored in this slab.
+  constexpr static uint32_t available_chunks(uint32_t chunk_size) {
+    return available_bytes(chunk_size) / chunk_size;
+  }
+
+  // The length in bits of the bitfield.
+  constexpr static uint32_t usable_bits(uint32_t chunk_size) {
+    return available_bytes(chunk_size) / chunk_size;
+  }
+
+  // Get the location in the memory where we will store the chunk size.
+  uint32_t get_chunk_size() const {
+    return reinterpret_cast<const Header *>(memory)->chunk_size;
+  }
+
+  // Get the location in the memory where we will store the global index.
+  uint32_t get_global_index() const {
+    return reinterpret_cast<const Header *>(memory)->global_index;
+  }
+
+  // Get a pointer to where the bitfield is located in the memory.
+  uint32_t *get_bitfield() {
+    return reinterpret_cast<uint32_t *>(memory + sizeof(Header));
+  }
+
+  // Get a pointer to where the actual memory to be allocated lives.
+  uint8_t *get_memory(uint32_t chunk_size) {
+    return reinterpret_cast<uint8_t *>(get_bitfield()) +
+           bitfield_bytes(chunk_size);
+  }
+
+  // Get a pointer to the actual memory given an index into the bitfield.
+  void *ptr_from_index(uint32_t index, uint32_t chunk_size) {
+    return get_memory(chunk_size) + index * chunk_size;
+  }
+
+  // Convert a pointer back into its bitfield index using its offset.
+  uint32_t index_from_ptr(void *ptr, uint32_t chunk_size) {
+    return static_cast<uint32_t>(reinterpret_cast<uint8_t *>(ptr) -
+                                 get_memory(chunk_size)) /
+           chunk_size;
+  }
+
+  // Randomly walks the bitfield until it finds a free bit. Allocations attempt
+  // to put lanes right next to each other for better caching and convergence.
+  void *allocate(uint64_t lane_mask, uint64_t uniform) {
+    uint32_t chunk_size = get_chunk_size();
+    uint32_t state = impl::entropy();
+
+    // The uniform mask represents which lanes contain a uniform target pointer.
+    // We attempt to place these next to each other.
+    // TODO: We should coalesce these bits and use the result of `fetch_or` to
+    // search for free bits in parallel.
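+    // Example (illustrative): with a 64 byte chunk size the bitfield covers
+    // roughly 32 thousand usable bits, so the randomly chosen starting bits
+    // rarely collide and the fetch_or below usually succeeds on the first try.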
+    void *result = nullptr;
+    for (uint64_t mask = lane_mask; mask;
+         mask = gpu::ballot(lane_mask, !result)) {
+      uint32_t id = impl::lane_count(uniform & mask);
+      uint32_t index =
+          (gpu::broadcast_value(lane_mask, impl::xorshift32(state)) + id) %
+          usable_bits(chunk_size);
+
+      uint32_t slot = index / BITS_IN_WORD;
+      uint32_t bit = index % BITS_IN_WORD;
+      if (!result) {
+        uint32_t before = cpp::AtomicRef(get_bitfield()[slot])
+                              .fetch_or(1u << bit, cpp::MemoryOrder::RELAXED);
+        if (~before & (1 << bit))
+          result = ptr_from_index(index, chunk_size);
+      }
+    }
+
+    cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+    return result;
+  }
+
+  // Deallocates memory by resetting its corresponding bit in the bitfield.
+  void deallocate(void *ptr) {
+    uint32_t chunk_size = get_chunk_size();
+    uint32_t index = index_from_ptr(ptr, chunk_size);
+    uint32_t slot = index / BITS_IN_WORD;
+    uint32_t bit = index % BITS_IN_WORD;
+
+    cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
+    cpp::AtomicRef(get_bitfield()[slot])
+        .fetch_and(~(1u << bit), cpp::MemoryOrder::RELAXED);
+  }
+
+  // The actual memory the slab will manage. All offsets are calculated at
+  // runtime with the chunk size to keep the interface convergent when a warp or
+  // wavefront is handling multiple sizes at once.
+  uint8_t memory[SLAB_SIZE];
+};
+
+/// A wait-free guard around a pointer resource to be created dynamically if
+/// space is available and freed once there are no more users.
+template <typename T> struct GuardPtr {
+private:
+  struct RefCounter {
+    // Indicates that the object is in its deallocation phase and thus invalid.
+    static constexpr uint64_t INVALID = uint64_t(1) << 63;
+
+    // If a read preempts an unlock call we indicate this so the following
+    // unlock call can swap out the helped bit and maintain exclusive ownership.
+    static constexpr uint64_t HELPED = uint64_t(1) << 62;
+
+    // Resets the reference counter; it cannot safely be reset to zero.
+    void reset(uint32_t n, uint64_t &count) {
+      counter.store(n, cpp::MemoryOrder::RELAXED);
+      count = n;
+    }
+
+    // Acquire a slot in the reference counter if it is not invalid.
+    bool acquire(uint32_t n, uint64_t &count) {
+      count = counter.fetch_add(n, cpp::MemoryOrder::RELAXED) + n;
+      return (count & INVALID) == 0;
+    }
+
+    // Release a slot in the reference counter. This function should only be
+    // called following a valid acquire call.
+    bool release(uint32_t n) {
+      // If this thread caused the counter to reach zero we try to invalidate it
+      // and obtain exclusive rights to deconstruct it. If the CAS failed either
+      // another thread resurrected the counter and we quit, or a parallel read
+      // helped us by invalidating it. For the latter, claim that flag and return.
+      if (counter.fetch_sub(n, cpp::MemoryOrder::RELAXED) == n) {
+        uint64_t expected = 0;
+        if (counter.compare_exchange_strong(expected, INVALID,
+                                            cpp::MemoryOrder::RELAXED,
+                                            cpp::MemoryOrder::RELAXED))
+          return true;
+        else if ((expected & HELPED) &&
+                 (counter.exchange(INVALID, cpp::MemoryOrder::RELAXED) &
+                  HELPED))
+          return true;
+      }
+      return false;
+    }
+
+    // Returns the current reference count, potentially helping a releasing
+    // thread.
+    uint64_t read() {
+      auto val = counter.load(cpp::MemoryOrder::RELAXED);
+      if (val == 0 && counter.compare_exchange_strong(
+                          val, INVALID | HELPED, cpp::MemoryOrder::RELAXED))
+        return 0;
+      return (val & INVALID) ? 0 : val;
+    }
+
+    cpp::Atomic<uint64_t> counter{0};
+  };
+
+  cpp::Atomic<T *> ptr{nullptr};
+  RefCounter ref{};
+
+  // Should be called by a single lane for each different pointer.
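+  // While the pointer holds the SENTINEL value another lane is busy
+  // constructing the object, so callers simply observe a failed lock and may
+  // retry later.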
+  template <typename... Args>
+  T *try_lock_impl(uint32_t n, uint64_t &count, Args &&...args) {
+    T *expected = ptr.load(cpp::MemoryOrder::RELAXED);
+    if (!expected &&
+        ptr.compare_exchange_strong(expected, reinterpret_cast<T *>(SENTINEL),
+                                    cpp::MemoryOrder::RELAXED,
+                                    cpp::MemoryOrder::RELAXED)) {
+      count = cpp::numeric_limits<uint64_t>::max();
+      void *raw = impl::rpc_allocate(sizeof(T));
+      if (!raw)
+        return nullptr;
+      T *mem = new (raw) T(cpp::forward<Args>(args)...);
+
+      cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
+      ptr.store(mem, cpp::MemoryOrder::RELAXED);
+      cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+      if (!ref.acquire(n, count))
+        ref.reset(n, count);
+      return mem;
+    }
+
+    if (!expected || expected == reinterpret_cast<T *>(SENTINEL))
+      return nullptr;
+
+    if (!ref.acquire(n, count))
+      return nullptr;
+
+    cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+    return ptr.load(cpp::MemoryOrder::RELAXED);
+  }
+
+public:
+  // Attempt to lock access to the pointer, potentially creating it if empty.
+  // The uniform mask represents which lanes share the same pointer. For each
+  // uniform value we elect a leader to handle it on behalf of the other lanes.
+  template <typename... Args>
+  T *try_lock(uint64_t lane_mask, uint64_t uniform, uint64_t &count,
+              Args &&...args) {
+    count = 0;
+    T *result = nullptr;
+    if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform)))
+      result = try_lock_impl(cpp::popcount(uniform), count,
+                             cpp::forward<Args>(args)...);
+    result = gpu::shuffle(lane_mask, cpp::countr_zero(uniform), result);
+    count = gpu::shuffle(lane_mask, cpp::countr_zero(uniform), count);
+
+    if (!result)
+      return nullptr;
+
+    if (count != cpp::numeric_limits<uint64_t>::max())
+      count = count - cpp::popcount(uniform) + impl::lane_count(uniform) + 1;
+
+    return result;
+  }
+
+  // Release the associated lock on the pointer, potentially destroying it.
+  void unlock(uint64_t lane_mask, uint64_t mask) {
+    cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
+    if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(mask)) &&
+        ref.release(cpp::popcount(mask))) {
+      T *p = ptr.load(cpp::MemoryOrder::RELAXED);
+      p->~T();
+      impl::rpc_free(p);
+      cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
+      ptr.store(nullptr, cpp::MemoryOrder::RELAXED);
+    }
+    gpu::sync_lane(lane_mask);
+  }
+
+  // Get the current value of the reference counter.
+  uint64_t use_count() { return ref.read(); }
+};
+
+// The global array used to search for a valid slab to allocate from.
+static GuardPtr<Slab> slots[ARRAY_SIZE] = {};
+
+// Tries to find a slab in the table that can support the given chunk size.
+static Slab *find_slab(uint32_t chunk_size) {
+  // We start at a hashed value to spread out different chunk sizes.
+  uint32_t start = impl::hash(chunk_size);
+  uint64_t lane_mask = gpu::get_lane_mask();
+  uint64_t uniform = gpu::match_any(lane_mask, chunk_size);
+
+  Slab *result = nullptr;
+  uint32_t nudge = 0;
+  for (uint64_t mask = lane_mask; mask;
+       mask = gpu::ballot(lane_mask, !result), ++nudge) {
+    uint32_t index = cpp::numeric_limits<uint32_t>::max();
+    for (uint32_t offset = nudge / MAX_TRIES;
+         gpu::ballot(lane_mask, index == cpp::numeric_limits<uint32_t>::max());
+         offset += cpp::popcount(uniform & lane_mask)) {
+      uint32_t candidate =
+          (start + offset + impl::lane_count(uniform & lane_mask)) % ARRAY_SIZE;
+      uint64_t available =
+          gpu::ballot(lane_mask, slots[candidate].use_count() <
+                                     Slab::available_chunks(chunk_size));
+      uint32_t new_index = gpu::shuffle(
+          lane_mask, cpp::countr_zero(available & uniform), candidate);
+
+      // Each uniform group will use the first empty slot they find.
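+      // For example (illustrative): if lanes 0 and 2 request this chunk size
+      // (uniform = 0b0101) but only lane 2's candidate has room
+      // (available = 0b0100), the shuffle above makes both lanes agree on
+      // lane 2's candidate index.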
+      if ((index == cpp::numeric_limits<uint32_t>::max() &&
+           (available & uniform)))
+        index = new_index;
+
+      // Guarantees that this loop will eventually exit if there is no space.
+      if (offset >= ARRAY_SIZE) {
+        result = reinterpret_cast<Slab *>(SENTINEL);
+        index = 0;
+      }
+    }
+
+    // Try to claim the lock for the slot we found.
+    if (!result) {
+      uint64_t reserved = 0;
+      Slab *slab = slots[index].try_lock(lane_mask & mask, uniform & mask,
+                                         reserved, chunk_size, index);
+      // If we find a slab with a matching chunk size then we store the result.
+      // Otherwise, we need to free the claimed lock and continue. In the case
+      // of out-of-memory we return a sentinel value.
+      if (slab && reserved <= Slab::available_chunks(chunk_size) &&
+          slab->get_chunk_size() == chunk_size) {
+        result = slab;
+      } else if (slab && (reserved > Slab::available_chunks(chunk_size) ||
+                          slab->get_chunk_size() != chunk_size)) {
+        if (slab->get_chunk_size() != chunk_size)
+          start = index + 1;
+        slots[index].unlock(gpu::get_lane_mask(),
+                            gpu::get_lane_mask() & uniform);
+      } else if (!slab && reserved == cpp::numeric_limits<uint64_t>::max()) {
+        result = reinterpret_cast<Slab *>(SENTINEL);
+      } else {
+        sleep_briefly();
+      }
+    }
+  }
+  return result;
+}
+
+// Release the lock associated with a given slab.
+static void release_slab(Slab *slab) {
+  uint32_t index = slab->get_global_index();
+  uint64_t lane_mask = gpu::get_lane_mask();
+  uint64_t uniform = gpu::match_any(lane_mask, index);
+  slots[index].unlock(lane_mask, uniform);
+}
 
 namespace gpu {
 
-void *allocate(uint64_t size) { return rpc_allocate(size); }
+void *allocate(uint64_t size) {
+  if (!size)
+    return nullptr;
+
+  // Allocations requiring a full slab or more go directly to memory.
+  if (size >= SLAB_SIZE / 2)
+    return impl::rpc_allocate(impl::round_up<SLAB_SIZE>(size));
+
+  // Try to find a slab for the rounded up chunk size and allocate from it.
+  uint32_t chunk_size = impl::get_chunk_size(static_cast<uint32_t>(size));
+  Slab *slab = find_slab(chunk_size);
+  if (!slab || slab == reinterpret_cast<Slab *>(SENTINEL))
+    return nullptr;
 
-void deallocate(void *ptr) { rpc_free(ptr); }
+  uint64_t lane_mask = gpu::get_lane_mask();
+  uint64_t uniform = gpu::match_any(lane_mask, slab->get_global_index());
+  void *ptr = slab->allocate(lane_mask, uniform);
+  return ptr;
+}
+
+void deallocate(void *ptr) {
+  if (!ptr)
+    return;
+
+  // All non-slab allocations will be aligned on a 2MiB boundary.
+  if ((reinterpret_cast<uintptr_t>(ptr) & SLAB_ALIGNMENT) == 0)
+    return impl::rpc_free(ptr);
+
+  // The original slab pointer is the 2MiB boundary below the given pointer.
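+  // For example (illustrative address): a chunk at 0x7f3a00212345 maps back
+  // to the slab object starting at 0x7f3a00200000.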
+  Slab *slab = reinterpret_cast<Slab *>(
+      (reinterpret_cast<uintptr_t>(ptr) & ~SLAB_ALIGNMENT));
+  slab->deallocate(ptr);
+  release_slab(slab);
+}
 
 } // namespace gpu
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/test/integration/src/stdlib/CMakeLists.txt b/libc/test/integration/src/stdlib/CMakeLists.txt
index 1efdf607defe9..1773d9fc9f0f5 100644
--- a/libc/test/integration/src/stdlib/CMakeLists.txt
+++ b/libc/test/integration/src/stdlib/CMakeLists.txt
@@ -1,3 +1,6 @@
+if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
+  add_subdirectory(${LIBC_TARGET_OS})
+endif()
 add_custom_target(stdlib-integration-tests)
 add_dependencies(libc-integration-tests stdlib-integration-tests)
 
diff --git a/libc/test/integration/src/stdlib/gpu/CMakeLists.txt b/libc/test/integration/src/stdlib/gpu/CMakeLists.txt
new file mode 100644
index 0000000000000..26c877b1b6ae6
--- /dev/null
+++ b/libc/test/integration/src/stdlib/gpu/CMakeLists.txt
@@ -0,0 +1,33 @@
+add_custom_target(stdlib-gpu-integration-tests)
+add_dependencies(libc-integration-tests stdlib-gpu-integration-tests)
+
+# TODO: Test on NVPTX, requires CUDA VMEM API.
+if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
+  add_integration_test(
+    malloc
+    SUITE
+      stdlib-gpu-integration-tests
+    SRCS
+      malloc.cpp
+    DEPENDS
+      libc.src.stdlib.malloc
+      libc.src.stdlib.free
+    LOADER_ARGS
+      --threads 256
+      --blocks 1024
+  )
+
+  add_integration_test(
+    malloc_stress
+    SUITE
+      stdlib-gpu-integration-tests
+    SRCS
+      malloc_stress.cpp
+    DEPENDS
+      libc.src.stdlib.malloc
+      libc.src.stdlib.free
+    LOADER_ARGS
+      --threads 256
+      --blocks 2048
+  )
+endif()
diff --git a/libc/test/integration/src/stdlib/gpu/malloc.cpp b/libc/test/integration/src/stdlib/gpu/malloc.cpp
new file mode 100644
index 0000000000000..7880206b1aaaa
--- /dev/null
+++ b/libc/test/integration/src/stdlib/gpu/malloc.cpp
@@ -0,0 +1,40 @@
+//===-- Test for parallel GPU malloc interface ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "test/IntegrationTest/test.h"
+
+#include "src/__support/GPU/utils.h"
+#include "src/stdlib/free.h"
+#include "src/stdlib/malloc.h"
+
+using namespace LIBC_NAMESPACE;
+
+TEST_MAIN(int, char **, char **) {
+  int *convergent = reinterpret_cast<int *>(LIBC_NAMESPACE::malloc(16));
+  EXPECT_NE(convergent, nullptr);
+  *convergent = 1;
+  EXPECT_EQ(*convergent, 1);
+  LIBC_NAMESPACE::free(convergent);
+
+  int *divergent = reinterpret_cast<int *>(
+      LIBC_NAMESPACE::malloc((gpu::get_thread_id() + 1) * 16));
+  EXPECT_NE(divergent, nullptr);
+  *divergent = 1;
+  EXPECT_EQ(*divergent, 1);
+  LIBC_NAMESPACE::free(divergent);
+
+  if (gpu::get_lane_id() & 1) {
+    int *masked = reinterpret_cast<int *>(
+        LIBC_NAMESPACE::malloc((gpu::get_thread_id() + 1) * 16));
+    EXPECT_NE(masked, nullptr);
+    *masked = 1;
+    EXPECT_EQ(*masked, 1);
+    LIBC_NAMESPACE::free(masked);
+  }
+  return 0;
+}
diff --git a/libc/test/integration/src/stdlib/gpu/malloc_stress.cpp b/libc/test/integration/src/stdlib/gpu/malloc_stress.cpp
new file mode 100644
index 0000000000000..77479f85dc5cc
--- /dev/null
+++ b/libc/test/integration/src/stdlib/gpu/malloc_stress.cpp
@@ -0,0 +1,38 @@
+//===-- Test for parallel GPU malloc interface ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "test/IntegrationTest/test.h"
+
+#include "src/__support/GPU/utils.h"
+#include "src/stdlib/free.h"
+#include "src/stdlib/malloc.h"
+
+using namespace LIBC_NAMESPACE;
+
+static inline void use(uint8_t *ptr, uint32_t size) {
+  EXPECT_NE(ptr, nullptr);
+  for (int i = 0; i < size; ++i)
+    ptr[i] = uint8_t(i + gpu::get_thread_id());
+
+  // Try to detect if some other thread manages to clobber our memory.
+  for (int i = 0; i < size; ++i)
+    EXPECT_EQ(ptr[i], uint8_t(i + gpu::get_thread_id()));
+}
+
+TEST_MAIN(int, char **, char **) {
+  void *ptrs[256];
+  for (int i = 0; i < 256; ++i)
+    ptrs[i] = malloc(gpu::get_lane_id() % 2 ? 16 : 32);
+
+  for (int i = 0; i < 256; ++i)
+    use(reinterpret_cast<uint8_t *>(ptrs[i]), gpu::get_lane_id() % 2 ? 16 : 32);
+
+  for (int i = 0; i < 256; ++i)
+    free(ptrs[i]);
+  return 0;
+}
diff --git a/libc/test/src/stdlib/malloc_test.cpp b/libc/test/src/stdlib/malloc_test.cpp
index d9023cf56d9fe..a8b32b7a430c9 100644
--- a/libc/test/src/stdlib/malloc_test.cpp
+++ b/libc/test/src/stdlib/malloc_test.cpp
@@ -17,3 +17,15 @@ TEST(LlvmLibcMallocTest, Allocate) {
   EXPECT_EQ(*ptr, 1);
   LIBC_NAMESPACE::free(ptr);
 }
+
+TEST(LlvmLibcMallocTest, Nullptr) {
+  int *ptr = reinterpret_cast<int *>(LIBC_NAMESPACE::malloc(0));
+  EXPECT_EQ(reinterpret_cast<void *>(ptr), static_cast<void *>(nullptr));
+  LIBC_NAMESPACE::free(ptr);
+}
+
+TEST(LlvmLibcMallocTest, LargeAllocation) {
+  int *ptr = reinterpret_cast<int *>(LIBC_NAMESPACE::malloc(2ul * 1024 * 1024));
+  EXPECT_NE(reinterpret_cast<void *>(ptr), static_cast<void *>(nullptr));
+  LIBC_NAMESPACE::free(ptr);
+}