diff --git a/libc/src/__support/GPU/CMakeLists.txt b/libc/src/__support/GPU/CMakeLists.txt
index 9b359f65cdb33..4ffee011be961 100644
--- a/libc/src/__support/GPU/CMakeLists.txt
+++ b/libc/src/__support/GPU/CMakeLists.txt
@@ -18,5 +18,8 @@ add_object_library(
   DEPENDS
     libc.src.__support.common
     libc.src.__support.RPC.rpc_client
+    libc.src.__support.CPP.atomic
+    libc.src.__support.CPP.bit
+    libc.src.__support.CPP.new
     .utils
 )
diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index ac335a1b9aab0..ca68cbcedd48a 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -5,17 +5,49 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+//
+// This file implements a parallel allocator intended for use on a GPU device.
+// The core algorithm is a slab allocator using a random walk over a bitfield
+// for maximum parallel progress. Slab handling is done by a wait-free
+// reference counted guard. The first use of a slab will create it from system
+// memory for re-use. The last use will invalidate it and free the memory.
+//
+//===----------------------------------------------------------------------===//
 
 #include "allocator.h"
 
+#include "src/__support/CPP/atomic.h"
+#include "src/__support/CPP/bit.h"
+#include "src/__support/CPP/new.h"
 #include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
-#include "src/__support/macros/config.h"
+#include "src/__support/threads/sleep.h"
 
 namespace LIBC_NAMESPACE_DECL {
-namespace {
 
-void *rpc_allocate(uint64_t size) {
+constexpr static uint64_t MAX_SIZE = /* 64 GiB */ 64ull * 1024 * 1024 * 1024;
+constexpr static uint64_t SLAB_SIZE = /* 2 MiB */ 2ull * 1024 * 1024;
+constexpr static uint64_t ARRAY_SIZE = MAX_SIZE / SLAB_SIZE;
+constexpr static uint64_t SLAB_ALIGNMENT = SLAB_SIZE - 1;
+constexpr static uint32_t BITS_IN_WORD = sizeof(uint32_t) * 8;
+constexpr static uint32_t MIN_SIZE = 16;
+constexpr static uint32_t MIN_ALIGNMENT = MIN_SIZE - 1;
+
+// A sentinel used to indicate an invalid but non-null pointer value.
+constexpr static uint64_t SENTINEL = cpp::numeric_limits<uint64_t>::max();
+
+// The number of times we will try starting on a single index before skipping
+// past it.
+constexpr static uint32_t MAX_TRIES = 512;
+
+static_assert(!(ARRAY_SIZE & (ARRAY_SIZE - 1)), "Must be a power of two");
+
+namespace impl {
+// Allocates more memory from the system through the RPC interface. All
+// allocations from the system MUST be aligned on a 2MiB boundary. The default
+// HSA allocator has this behavior for any allocation >= 2MiB and the CUDA
+// driver provides an alignment field for virtual memory allocations.
+static void *rpc_allocate(uint64_t size) {
   void *ptr = nullptr;
   rpc::Client::Port port = rpc::client.open();
   port.send_and_recv(
@@ -27,7 +59,8 @@ void *rpc_allocate(uint64_t size) {
   return ptr;
 }
 
-void rpc_free(void *ptr) {
+// Deallocates the associated system memory.
+static void rpc_free(void *ptr) {
   rpc::Client::Port port = rpc::client.open();
   port.send([=](rpc::Buffer *buffer, uint32_t) {
     buffer->data[0] = reinterpret_cast<uintptr_t>(ptr);
@@ -35,13 +68,453 @@ void rpc_free(void *ptr) {
   port.close();
 }
 
-} // namespace
+// Convert a potentially disjoint bitmask into an increasing integer per-lane
+// for use with indexing between gpu lanes.
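+// For example (illustrative values): if lane_mask is 0b1011, the active lanes
+// 0, 1, and 3 compute the dense indices 0, 1, and 2 respectively.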
+static inline uint32_t lane_count(uint64_t lane_mask) {
+  return cpp::popcount(lane_mask & ((uint64_t(1) << gpu::get_lane_id()) - 1));
+}
+
+// Obtain an initial value to seed a random number generator. We use the
+// rounded multiples of the golden ratio from xorshift* as additional spreading.
+static inline uint32_t entropy() {
+  return (static_cast<uint32_t>(gpu::processor_clock()) ^
+          (gpu::get_thread_id_x() * 0x632be59b) ^
+          (gpu::get_block_id_x() * 0x85157af5)) *
+         0x9e3779bb;
+}
+
+// Generate a random number and update the state using the xorshift32* PRNG.
+static inline uint32_t xorshift32(uint32_t &state) {
+  state ^= state << 13;
+  state ^= state >> 17;
+  state ^= state << 5;
+  return state * 0x9e3779bb;
+}
+
+// Final stage of murmurhash used to get a unique index for the global array.
+static inline uint32_t hash(uint32_t x) {
+  x ^= x >> 16;
+  x *= 0x85ebca6b;
+  x ^= x >> 13;
+  x *= 0xc2b2ae35;
+  x ^= x >> 16;
+  return x;
+}
+
+// Rounds the input value to the closest permitted chunk size. Here we accept
+// the sum of the closest three powers of two. For a 2MiB slab size this is 48
+// different chunk sizes. This gives us an average internal fragmentation of
+// roughly 12.5%.
+static inline uint32_t get_chunk_size(uint32_t x) {
+  uint32_t y = x < MIN_SIZE ? MIN_SIZE : x;
+  uint32_t pow2 = BITS_IN_WORD - cpp::countl_zero(y - 1);
+
+  uint32_t s0 = 0b0100 << (pow2 - 3);
+  uint32_t s1 = 0b0110 << (pow2 - 3);
+  uint32_t s2 = 0b0111 << (pow2 - 3);
+  uint32_t s3 = 0b1000 << (pow2 - 3);
+
+  if (s0 > y)
+    return (s0 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
+  if (s1 > y)
+    return (s1 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
+  if (s2 > y)
+    return (s2 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
+  return (s3 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
+}
+
+// Rounds up to the nearest multiple of N, where N must be a power of two.
+template <uint64_t N, typename T>
+static inline constexpr T round_up(const T x) {
+  static_assert(((N - 1) & N) == 0, "N must be a power of two");
+  return (x + N) & ~(N - 1);
+}
+
+} // namespace impl
+
+/// A slab allocator used to hand out identically sized slabs of memory.
+/// Allocation is done through random walks of a bitfield until a free bit is
+/// encountered. This reduces contention and is highly parallel on a GPU.
+///
+/// 0       4          8        16                 ...                    2 MiB
+/// ┌────────┬──────────┬────────┬──────────────────┬──────────────────────────┐
+/// │ chunk  │  index   │  pad   │    bitfield[]    │         memory[]         │
+/// └────────┴──────────┴────────┴──────────────────┴──────────────────────────┘
+///
+/// The size of the bitfield is the slab size divided by the chunk size divided
+/// by the number of bits per word. We pad the header to ensure 16 byte
+/// alignment and to indicate that if the pointer is not aligned by 2MiB it
+/// belongs to a slab rather than the global allocator.
+struct Slab {
+  // Header metadata for the slab, aligned to the minimum alignment.
+  struct alignas(MIN_SIZE) Header {
+    uint32_t chunk_size;
+    uint32_t global_index;
+  };
+
+  // Initialize the slab with its chunk size and index in the global table for
+  // use when freeing.
+  Slab(uint32_t chunk_size, uint32_t global_index) {
+    Header *header = reinterpret_cast<Header *>(memory);
+    header->chunk_size = chunk_size;
+    header->global_index = global_index;
+
+    // This memset is expensive and likely not necessary for the current 'kfd'
+    // driver. Until zeroed pages are exposed by the API we must be careful.
+    __builtin_memset(get_bitfield(), 0, bitfield_bytes(chunk_size));
+  }
+
+  // Get the number of chunks that can theoretically fit inside this slab.
+  constexpr static uint32_t num_chunks(uint32_t chunk_size) {
+    return SLAB_SIZE / chunk_size;
+  }
+
+  // Get the number of bytes needed to contain the bitfield bits.
+  constexpr static uint32_t bitfield_bytes(uint32_t chunk_size) {
+    return ((num_chunks(chunk_size) + BITS_IN_WORD - 1) / BITS_IN_WORD) * 8;
+  }
+
+  // The actual amount of memory available excluding the bitfield and metadata.
+  constexpr static uint32_t available_bytes(uint32_t chunk_size) {
+    return SLAB_SIZE - bitfield_bytes(chunk_size) - sizeof(Header);
+  }
+
+  // The number of chunks that can be stored in this slab.
+  constexpr static uint32_t available_chunks(uint32_t chunk_size) {
+    return available_bytes(chunk_size) / chunk_size;
+  }
+
+  // The length in bits of the bitfield.
+  constexpr static uint32_t usable_bits(uint32_t chunk_size) {
+    return available_bytes(chunk_size) / chunk_size;
+  }
+
+  // Get the location in the memory where we will store the chunk size.
+  uint32_t get_chunk_size() const {
+    return reinterpret_cast<const Header *>(memory)->chunk_size;
+  }
+
+  // Get the location in the memory where we will store the global index.
+  uint32_t get_global_index() const {
+    return reinterpret_cast<const Header *>(memory)->global_index;
+  }
+
+  // Get a pointer to where the bitfield is located in the memory.
+  uint32_t *get_bitfield() {
+    return reinterpret_cast<uint32_t *>(memory + sizeof(Header));
+  }
+
+  // Get a pointer to where the actual memory to be allocated lives.
+  uint8_t *get_memory(uint32_t chunk_size) {
+    return reinterpret_cast<uint8_t *>(get_bitfield()) +
+           bitfield_bytes(chunk_size);
+  }
+
+  // Get a pointer to the actual memory given an index into the bitfield.
+  void *ptr_from_index(uint32_t index, uint32_t chunk_size) {
+    return get_memory(chunk_size) + index * chunk_size;
+  }
+
+  // Convert a pointer back into its bitfield index using its offset.
+  uint32_t index_from_ptr(void *ptr, uint32_t chunk_size) {
+    return static_cast<uint32_t>(reinterpret_cast<uint8_t *>(ptr) -
+                                 get_memory(chunk_size)) /
+           chunk_size;
+  }
+
+  // Randomly walks the bitfield until it finds a free bit. Allocations attempt
+  // to put lanes right next to each other for better caching and convergence.
+  void *allocate(uint64_t lane_mask, uint64_t uniform) {
+    uint32_t chunk_size = get_chunk_size();
+    uint32_t state = impl::entropy();
+
+    // The uniform mask represents which lanes contain a uniform target pointer.
+    // We attempt to place these next to each other.
+    // TODO: We should coalesce these bits and use the result of `fetch_or` to
+    // search for free bits in parallel.
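+    // Example (illustrative): with a 64 byte chunk size the bitfield covers
+    // roughly 32 thousand usable bits, so the randomly chosen starting bits
+    // rarely collide and the fetch_or below usually succeeds on the first try.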
+    void *result = nullptr;
+    for (uint64_t mask = lane_mask; mask;
+         mask = gpu::ballot(lane_mask, !result)) {
+      uint32_t id = impl::lane_count(uniform & mask);
+      uint32_t index =
+          (gpu::broadcast_value(lane_mask, impl::xorshift32(state)) + id) %
+          usable_bits(chunk_size);
+
+      uint32_t slot = index / BITS_IN_WORD;
+      uint32_t bit = index % BITS_IN_WORD;
+      if (!result) {
+        uint32_t before = cpp::AtomicRef(get_bitfield()[slot])
+                              .fetch_or(1u << bit, cpp::MemoryOrder::RELAXED);
+        if (~before & (1 << bit))
+          result = ptr_from_index(index, chunk_size);
+      }
+    }
+
+    cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+    return result;
+  }
+
+  // Deallocates memory by resetting its corresponding bit in the bitfield.
+  void deallocate(void *ptr) {
+    uint32_t chunk_size = get_chunk_size();
+    uint32_t index = index_from_ptr(ptr, chunk_size);
+    uint32_t slot = index / BITS_IN_WORD;
+    uint32_t bit = index % BITS_IN_WORD;
+
+    cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
+    cpp::AtomicRef(get_bitfield()[slot])
+        .fetch_and(~(1u << bit), cpp::MemoryOrder::RELAXED);
+  }
+
+  // The actual memory the slab will manage. All offsets are calculated at
+  // runtime with the chunk size to keep the interface convergent when a warp or
+  // wavefront is handling multiple sizes at once.
+  uint8_t memory[SLAB_SIZE];
+};
+
+/// A wait-free guard around a pointer resource to be created dynamically if
+/// space is available and freed once there are no more users.
+template <typename T> struct GuardPtr {
+private:
+  struct RefCounter {
+    // Indicates that the object is in its deallocation phase and thus invalid.
+    static constexpr uint64_t INVALID = uint64_t(1) << 63;
+
+    // If a read preempts an unlock call we indicate this so the following
+    // unlock call can swap out the helped bit and maintain exclusive ownership.
+    static constexpr uint64_t HELPED = uint64_t(1) << 62;
+
+    // Resets the reference counter; it cannot safely be reset to zero.
+    void reset(uint32_t n, uint64_t &count) {
+      counter.store(n, cpp::MemoryOrder::RELAXED);
+      count = n;
+    }
+
+    // Acquire a slot in the reference counter if it is not invalid.
+    bool acquire(uint32_t n, uint64_t &count) {
+      count = counter.fetch_add(n, cpp::MemoryOrder::RELAXED) + n;
+      return (count & INVALID) == 0;
+    }
+
+    // Release a slot in the reference counter. This function should only be
+    // called following a valid acquire call.
+    bool release(uint32_t n) {
+      // If this thread caused the counter to reach zero we try to invalidate it
+      // and obtain exclusive rights to deconstruct it. If the CAS failed either
+      // another thread resurrected the counter and we quit, or a parallel read
+      // helped us by invalidating it. For the latter, claim that flag and return.
+      if (counter.fetch_sub(n, cpp::MemoryOrder::RELAXED) == n) {
+        uint64_t expected = 0;
+        if (counter.compare_exchange_strong(expected, INVALID,
+                                            cpp::MemoryOrder::RELAXED,
+                                            cpp::MemoryOrder::RELAXED))
+          return true;
+        else if ((expected & HELPED) &&
+                 (counter.exchange(INVALID, cpp::MemoryOrder::RELAXED) &
+                  HELPED))
+          return true;
+      }
+      return false;
+    }
+
+    // Returns the current reference count, potentially helping a releasing
+    // thread.
+    uint64_t read() {
+      auto val = counter.load(cpp::MemoryOrder::RELAXED);
+      if (val == 0 && counter.compare_exchange_strong(
+                          val, INVALID | HELPED, cpp::MemoryOrder::RELAXED))
+        return 0;
+      return (val & INVALID) ? 0 : val;
+    }
+
+    cpp::Atomic<uint64_t> counter{0};
+  };
+
+  cpp::Atomic<T *> ptr{nullptr};
+  RefCounter ref{};
+
+  // Should be called by a single lane for each different pointer.
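+  // While the pointer holds the SENTINEL value another lane is busy
+  // constructing the object, so callers simply observe a failed lock and may
+  // retry later.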
+  template <typename... Args>
+  T *try_lock_impl(uint32_t n, uint64_t &count, Args &&...args) {
+    T *expected = ptr.load(cpp::MemoryOrder::RELAXED);
+    if (!expected &&
+        ptr.compare_exchange_strong(expected, reinterpret_cast<T *>(SENTINEL),
+                                    cpp::MemoryOrder::RELAXED,
+                                    cpp::MemoryOrder::RELAXED)) {
+      count = cpp::numeric_limits<uint64_t>::max();
+      void *raw = impl::rpc_allocate(sizeof(T));
+      if (!raw)
+        return nullptr;
+      T *mem = new (raw) T(cpp::forward<Args>(args)...);
+
+      cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
+      ptr.store(mem, cpp::MemoryOrder::RELAXED);
+      cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+      if (!ref.acquire(n, count))
+        ref.reset(n, count);
+      return mem;
+    }
+
+    if (!expected || expected == reinterpret_cast<T *>(SENTINEL))
+      return nullptr;
+
+    if (!ref.acquire(n, count))
+      return nullptr;
+
+    cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+    return ptr.load(cpp::MemoryOrder::RELAXED);
+  }
+
+public:
+  // Attempt to lock access to the pointer, potentially creating it if empty.
+  // The uniform mask represents which lanes share the same pointer. For each
+  // uniform value we elect a leader to handle it on behalf of the other lanes.
+  template <typename... Args>
+  T *try_lock(uint64_t lane_mask, uint64_t uniform, uint64_t &count,
+              Args &&...args) {
+    count = 0;
+    T *result = nullptr;
+    if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform)))
+      result = try_lock_impl(cpp::popcount(uniform), count,
+                             cpp::forward<Args>(args)...);
+    result = gpu::shuffle(lane_mask, cpp::countr_zero(uniform), result);
+    count = gpu::shuffle(lane_mask, cpp::countr_zero(uniform), count);
+
+    if (!result)
+      return nullptr;
+
+    if (count != cpp::numeric_limits<uint64_t>::max())
+      count = count - cpp::popcount(uniform) + impl::lane_count(uniform) + 1;
+
+    return result;
+  }
+
+  // Release the associated lock on the pointer, potentially destroying it.
+  void unlock(uint64_t lane_mask, uint64_t mask) {
+    cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
+    if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(mask)) &&
+        ref.release(cpp::popcount(mask))) {
+      T *p = ptr.load(cpp::MemoryOrder::RELAXED);
+      p->~T();
+      impl::rpc_free(p);
+      cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
+      ptr.store(nullptr, cpp::MemoryOrder::RELAXED);
+    }
+    gpu::sync_lane(lane_mask);
+  }
+
+  // Get the current value of the reference counter.
+  uint64_t use_count() { return ref.read(); }
+};
+
+// The global array used to search for a valid slab to allocate from.
+static GuardPtr<Slab> slots[ARRAY_SIZE] = {};
+
+// Tries to find a slab in the table that can support the given chunk size.
+static Slab *find_slab(uint32_t chunk_size) {
+  // We start at a hashed value to spread out different chunk sizes.
+  uint32_t start = impl::hash(chunk_size);
+  uint64_t lane_mask = gpu::get_lane_mask();
+  uint64_t uniform = gpu::match_any(lane_mask, chunk_size);
+
+  Slab *result = nullptr;
+  uint32_t nudge = 0;
+  for (uint64_t mask = lane_mask; mask;
+       mask = gpu::ballot(lane_mask, !result), ++nudge) {
+    uint32_t index = cpp::numeric_limits<uint32_t>::max();
+    for (uint32_t offset = nudge / MAX_TRIES;
+         gpu::ballot(lane_mask, index == cpp::numeric_limits<uint32_t>::max());
+         offset += cpp::popcount(uniform & lane_mask)) {
+      uint32_t candidate =
+          (start + offset + impl::lane_count(uniform & lane_mask)) % ARRAY_SIZE;
+      uint64_t available =
+          gpu::ballot(lane_mask, slots[candidate].use_count() <
+                                     Slab::available_chunks(chunk_size));
+      uint32_t new_index = gpu::shuffle(
+          lane_mask, cpp::countr_zero(available & uniform), candidate);
+
+      // Each uniform group will use the first empty slot they find.
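+      // For example (illustrative): if lanes 0 and 2 request this chunk size
+      // (uniform = 0b0101) but only lane 2's candidate has room
+      // (available = 0b0100), the shuffle above makes both lanes agree on
+      // lane 2's candidate index.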
+      if ((index == cpp::numeric_limits<uint32_t>::max() &&
+           (available & uniform)))
+        index = new_index;
+
+      // Guarantees that this loop will eventually exit if there is no space.
+      if (offset >= ARRAY_SIZE) {
+        result = reinterpret_cast<Slab *>(SENTINEL);
+        index = 0;
+      }
+    }
+
+    // Try to claim the lock for the slot we found.
+    if (!result) {
+      uint64_t reserved = 0;
+      Slab *slab = slots[index].try_lock(lane_mask & mask, uniform & mask,
+                                         reserved, chunk_size, index);
+      // If we find a slab with a matching chunk size then we store the result.
+      // Otherwise, we need to free the claimed lock and continue. In the case
+      // of out-of-memory we return a sentinel value.
+      if (slab && reserved <= Slab::available_chunks(chunk_size) &&
+          slab->get_chunk_size() == chunk_size) {
+        result = slab;
+      } else if (slab && (reserved > Slab::available_chunks(chunk_size) ||
+                          slab->get_chunk_size() != chunk_size)) {
+        if (slab->get_chunk_size() != chunk_size)
+          start = index + 1;
+        slots[index].unlock(gpu::get_lane_mask(),
+                            gpu::get_lane_mask() & uniform);
+      } else if (!slab && reserved == cpp::numeric_limits<uint64_t>::max()) {
+        result = reinterpret_cast<Slab *>(SENTINEL);
+      } else {
+        sleep_briefly();
+      }
+    }
+  }
+  return result;
+}
+
+// Release the lock associated with a given slab.
+static void release_slab(Slab *slab) {
+  uint32_t index = slab->get_global_index();
+  uint64_t lane_mask = gpu::get_lane_mask();
+  uint64_t uniform = gpu::match_any(lane_mask, index);
+  slots[index].unlock(lane_mask, uniform);
+}
 
 namespace gpu {
 
-void *allocate(uint64_t size) { return rpc_allocate(size); }
+void *allocate(uint64_t size) {
+  if (!size)
+    return nullptr;
+
+  // Allocations requiring a full slab or more go directly to memory.
+  if (size >= SLAB_SIZE / 2)
+    return impl::rpc_allocate(impl::round_up<SLAB_SIZE>(size));
+
+  // Try to find a slab for the rounded up chunk size and allocate from it.
+  uint32_t chunk_size = impl::get_chunk_size(static_cast<uint32_t>(size));
+  Slab *slab = find_slab(chunk_size);
+  if (!slab || slab == reinterpret_cast<Slab *>(SENTINEL))
+    return nullptr;
 
-void deallocate(void *ptr) { rpc_free(ptr); }
+  uint64_t lane_mask = gpu::get_lane_mask();
+  uint64_t uniform = gpu::match_any(lane_mask, slab->get_global_index());
+  void *ptr = slab->allocate(lane_mask, uniform);
+  return ptr;
+}
+
+void deallocate(void *ptr) {
+  if (!ptr)
+    return;
+
+  // All non-slab allocations will be aligned on a 2MiB boundary.
+  if ((reinterpret_cast<uintptr_t>(ptr) & SLAB_ALIGNMENT) == 0)
+    return impl::rpc_free(ptr);
+
+  // The original slab pointer is the 2MiB boundary below the given pointer.
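+  // For example (illustrative address): a chunk at 0x7f3a00212345 maps back
+  // to the slab object starting at 0x7f3a00200000.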
+  Slab *slab = reinterpret_cast<Slab *>(
+      (reinterpret_cast<uintptr_t>(ptr) & ~SLAB_ALIGNMENT));
+  slab->deallocate(ptr);
+  release_slab(slab);
+}
 
 } // namespace gpu
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/test/integration/src/stdlib/CMakeLists.txt b/libc/test/integration/src/stdlib/CMakeLists.txt
index 1efdf607defe9..1773d9fc9f0f5 100644
--- a/libc/test/integration/src/stdlib/CMakeLists.txt
+++ b/libc/test/integration/src/stdlib/CMakeLists.txt
@@ -1,3 +1,6 @@
+if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
+  add_subdirectory(${LIBC_TARGET_OS})
+endif()
 add_custom_target(stdlib-integration-tests)
 add_dependencies(libc-integration-tests stdlib-integration-tests)
 
diff --git a/libc/test/integration/src/stdlib/gpu/CMakeLists.txt b/libc/test/integration/src/stdlib/gpu/CMakeLists.txt
new file mode 100644
index 0000000000000..26c877b1b6ae6
--- /dev/null
+++ b/libc/test/integration/src/stdlib/gpu/CMakeLists.txt
@@ -0,0 +1,33 @@
+add_custom_target(stdlib-gpu-integration-tests)
+add_dependencies(libc-integration-tests stdlib-gpu-integration-tests)
+
+# TODO: Test on NVPTX, requires CUDA VMEM API.
+if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
+  add_integration_test(
+    malloc
+    SUITE
+      stdlib-gpu-integration-tests
+    SRCS
+      malloc.cpp
+    DEPENDS
+      libc.src.stdlib.malloc
+      libc.src.stdlib.free
+    LOADER_ARGS
+      --threads 256
+      --blocks 1024
+  )
+
+  add_integration_test(
+    malloc_stress
+    SUITE
+      stdlib-gpu-integration-tests
+    SRCS
+      malloc_stress.cpp
+    DEPENDS
+      libc.src.stdlib.malloc
+      libc.src.stdlib.free
+    LOADER_ARGS
+      --threads 256
+      --blocks 2048
+  )
+endif()
diff --git a/libc/test/integration/src/stdlib/gpu/malloc.cpp b/libc/test/integration/src/stdlib/gpu/malloc.cpp
new file mode 100644
index 0000000000000..7880206b1aaaa
--- /dev/null
+++ b/libc/test/integration/src/stdlib/gpu/malloc.cpp
@@ -0,0 +1,40 @@
+//===-- Test for parallel GPU malloc interface ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "test/IntegrationTest/test.h"
+
+#include "src/__support/GPU/utils.h"
+#include "src/stdlib/free.h"
+#include "src/stdlib/malloc.h"
+
+using namespace LIBC_NAMESPACE;
+
+TEST_MAIN(int, char **, char **) {
+  int *convergent = reinterpret_cast<int *>(LIBC_NAMESPACE::malloc(16));
+  EXPECT_NE(convergent, nullptr);
+  *convergent = 1;
+  EXPECT_EQ(*convergent, 1);
+  LIBC_NAMESPACE::free(convergent);
+
+  int *divergent = reinterpret_cast<int *>(
+      LIBC_NAMESPACE::malloc((gpu::get_thread_id() + 1) * 16));
+  EXPECT_NE(divergent, nullptr);
+  *divergent = 1;
+  EXPECT_EQ(*divergent, 1);
+  LIBC_NAMESPACE::free(divergent);
+
+  if (gpu::get_lane_id() & 1) {
+    int *masked = reinterpret_cast<int *>(
+        LIBC_NAMESPACE::malloc((gpu::get_thread_id() + 1) * 16));
+    EXPECT_NE(masked, nullptr);
+    *masked = 1;
+    EXPECT_EQ(*masked, 1);
+    LIBC_NAMESPACE::free(masked);
+  }
+  return 0;
+}
diff --git a/libc/test/integration/src/stdlib/gpu/malloc_stress.cpp b/libc/test/integration/src/stdlib/gpu/malloc_stress.cpp
new file mode 100644
index 0000000000000..77479f85dc5cc
--- /dev/null
+++ b/libc/test/integration/src/stdlib/gpu/malloc_stress.cpp
@@ -0,0 +1,38 @@
+//===-- Test for parallel GPU malloc interface ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "test/IntegrationTest/test.h"
+
+#include "src/__support/GPU/utils.h"
+#include "src/stdlib/free.h"
+#include "src/stdlib/malloc.h"
+
+using namespace LIBC_NAMESPACE;
+
+static inline void use(uint8_t *ptr, uint32_t size) {
+  EXPECT_NE(ptr, nullptr);
+  for (int i = 0; i < size; ++i)
+    ptr[i] = uint8_t(i + gpu::get_thread_id());
+
+  // Try to detect if some other thread manages to clobber our memory.
+  for (int i = 0; i < size; ++i)
+    EXPECT_EQ(ptr[i], uint8_t(i + gpu::get_thread_id()));
+}
+
+TEST_MAIN(int, char **, char **) {
+  void *ptrs[256];
+  for (int i = 0; i < 256; ++i)
+    ptrs[i] = malloc(gpu::get_lane_id() % 2 ? 16 : 32);
+
+  for (int i = 0; i < 256; ++i)
+    use(reinterpret_cast<uint8_t *>(ptrs[i]), gpu::get_lane_id() % 2 ? 16 : 32);
+
+  for (int i = 0; i < 256; ++i)
+    free(ptrs[i]);
+  return 0;
+}
diff --git a/libc/test/src/stdlib/malloc_test.cpp b/libc/test/src/stdlib/malloc_test.cpp
index d9023cf56d9fe..a8b32b7a430c9 100644
--- a/libc/test/src/stdlib/malloc_test.cpp
+++ b/libc/test/src/stdlib/malloc_test.cpp
@@ -17,3 +17,15 @@ TEST(LlvmLibcMallocTest, Allocate) {
   EXPECT_EQ(*ptr, 1);
   LIBC_NAMESPACE::free(ptr);
 }
+
+TEST(LlvmLibcMallocTest, Nullptr) {
+  int *ptr = reinterpret_cast<int *>(LIBC_NAMESPACE::malloc(0));
+  EXPECT_EQ(reinterpret_cast<void *>(ptr), static_cast<void *>(nullptr));
+  LIBC_NAMESPACE::free(ptr);
+}
+
+TEST(LlvmLibcMallocTest, LargeAllocation) {
+  int *ptr = reinterpret_cast<int *>(LIBC_NAMESPACE::malloc(2ul * 1024 * 1024));
+  EXPECT_NE(reinterpret_cast<void *>(ptr), static_cast<void *>(nullptr));
+  LIBC_NAMESPACE::free(ptr);
+}