diff --git a/libc/src/__support/GPU/CMakeLists.txt b/libc/src/__support/GPU/CMakeLists.txt
index f8fdfeb9da9df..72a7879f72224 100644
--- a/libc/src/__support/GPU/CMakeLists.txt
+++ b/libc/src/__support/GPU/CMakeLists.txt
@@ -9,6 +9,12 @@ add_header_library(
     utils.h
 )
 
+add_header_library(
+  fixedstack
+  HDRS
+    fixedstack.h
+)
+
 add_object_library(
   allocator
   SRCS
@@ -23,4 +29,5 @@ add_object_library(
     libc.src.__support.CPP.bit
     libc.src.__support.CPP.new
     .utils
+    .fixedstack
 )
diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index 250bebdbb7d81..534a309fec7b4 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -20,6 +20,7 @@
 #include "src/__support/CPP/atomic.h"
 #include "src/__support/CPP/bit.h"
 #include "src/__support/CPP/new.h"
+#include "src/__support/GPU/fixedstack.h"
 #include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
 #include "src/__support/threads/sleep.h"
@@ -39,6 +40,9 @@ constexpr static uint32_t MIN_ALIGNMENT = MIN_SIZE - 1;
 // The number of times to attempt claiming an in-progress slab allocation.
 constexpr static uint32_t MAX_TRIES = 1024;
 
+// The number of previously allocated slabs we will keep in memory.
+constexpr static uint32_t CACHED_SLABS = 8;
+
 static_assert(!(ARRAY_SIZE & (ARRAY_SIZE - 1)), "Must be a power of two");
 
 namespace impl {
@@ -185,20 +189,35 @@ struct Slab {
   struct alignas(MIN_SIZE) Header {
     uint32_t chunk_size;
     uint32_t global_index;
+    uint32_t cached_chunk_size;
   };
 
   // Initialize the slab with its chunk size and index in the global table for
   // use when freeing.
   Slab(uint32_t chunk_size, uint32_t global_index) {
     Header *header = reinterpret_cast<Header *>(memory);
+    header->cached_chunk_size = cpp::numeric_limits<uint32_t>::max();
     header->chunk_size = chunk_size;
     header->global_index = global_index;
   }
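+
+  // cached_chunk_size records the chunk size this slab last used. It starts
+  // out as UINT32_MAX for a brand new slab and is refreshed by reset() below
+  // whenever the slab is recycled from the cache.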
 
+  // Reset the memory with a new index and chunk size, not thread safe.
+  Slab *reset(uint32_t chunk_size, uint32_t global_index) {
+    Header *header = reinterpret_cast<Header *>(memory);
+    header->cached_chunk_size = header->chunk_size;
+    header->chunk_size = chunk_size;
+    header->global_index = global_index;
+    return this;
+  }
+
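+  // A slab only reaches the cache after all of its chunks were freed, which
+  // leaves its old bitfield all zero. A smaller previous chunk size implies a
+  // larger previous bitfield, so whenever the old bitfield covers the new one
+  // no zeroing is needed.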
   // Set the necessary bitfield bytes to zero in parallel using many lanes. This
   // must be called before the bitfield can be accessed safely, memory is not
   // guaranteed to be zero initialized in the current implementation.
   void initialize(uint64_t uniform) {
+    // If this is a re-used slab the memory is already set to zero.
+    if (get_cached_chunk_size() <= get_chunk_size())
+      return;
+
     uint32_t size = (bitfield_bytes(get_chunk_size()) + sizeof(uint32_t) - 1) /
                     sizeof(uint32_t);
     impl::uniform_memset(get_bitfield(), 0, size, uniform);
@@ -236,6 +255,11 @@ struct Slab {
     return reinterpret_cast<const Header *>(memory)->chunk_size;
   }
 
+  // Get the chunk size that was previously used.
+  uint32_t get_cached_chunk_size() const {
+    return reinterpret_cast<const Header *>(memory)->cached_chunk_size;
+  }
+
   // Get the location in the memory where we will store the global index.
   uint32_t get_global_index() const {
     return reinterpret_cast<const Header *>(memory)->global_index;
   }
@@ -337,6 +361,9 @@ struct Slab {
   uint8_t memory[SLAB_SIZE];
 };
 
+// A global cache of previously allocated slabs for efficient reuse.
+static FixedStack<Slab *, CACHED_SLABS> slab_cache;
+
 /// A wait-free guard around a pointer resource to be created dynamically if
 /// space is available and freed once there are no more users.
 struct GuardPtr {
@@ -408,6 +435,11 @@ struct GuardPtr {
             reinterpret_cast<Slab *>(cpp::numeric_limits<uintptr_t>::max()),
             cpp::MemoryOrder::RELAXED, cpp::MemoryOrder::RELAXED)) {
       count = cpp::numeric_limits<uint64_t>::max();
+
+      Slab *cached = nullptr;
+      if (slab_cache.pop(cached))
+        return cached->reset(cpp::forward<Args>(args)...);
+
       void *raw = impl::rpc_allocate(sizeof(Slab));
       if (!raw)
         return nullptr;
@@ -475,8 +507,10 @@ struct GuardPtr {
     if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(mask)) &&
         ref.release(cpp::popcount(mask))) {
       Slab *p = ptr.load(cpp::MemoryOrder::RELAXED);
-      p->~Slab();
-      impl::rpc_free(p);
+      if (!slab_cache.push(p)) {
+        p->~Slab();
+        impl::rpc_free(p);
+      }
       cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
       ptr.store(nullptr, cpp::MemoryOrder::RELAXED);
     }
diff --git a/libc/src/__support/GPU/fixedstack.h b/libc/src/__support/GPU/fixedstack.h
new file mode 100644
index 0000000000000..6ceaa2fc73355
--- /dev/null
+++ b/libc/src/__support/GPU/fixedstack.h
@@ -0,0 +1,111 @@
+//===-- A lock-free data structure for a fixed capacity stack ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_GPU_FIXEDSTACK_H
+#define LLVM_LIBC_SRC___SUPPORT_GPU_FIXEDSTACK_H
+
+#include "src/__support/CPP/atomic.h"
+#include "src/__support/threads/sleep.h"
+
+#include <stdint.h>
+
+namespace LIBC_NAMESPACE_DECL {
+
+// A lock-free fixed size stack backed by an underlying array of data. It
+// supports push and pop operations in a completely lock-free manner.
+template <typename T, uint32_t CAPACITY> struct alignas(16) FixedStack {
+  // The index is stored as a 20-bit value, so the capacity cannot exceed what
+  // 20 bits can address.
+  static_assert(CAPACITY < 1024 * 1024, "Invalid buffer size");
+
+  // The head of the free and used stacks. Represented as a 20-bit index
+  // combined with a 44-bit ABA tag that is updated in a single atomic
+  // operation.
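+  // For example, index 5 under tag 7 is encoded as (7 << 20) | 5. Every
+  // successful compare-and-swap increments the tag, so a thread holding a
+  // stale head value fails its exchange even if the same index reappears.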
+  uint64_t free;
+  uint64_t used;
+
+  // The stack is a linked list of indices into the underlying data.
+  uint32_t next[CAPACITY];
+  T data[CAPACITY];
+
+  // Get the 20-bit index into the underlying array from the head.
+  LIBC_INLINE static constexpr uint32_t get_node(uint64_t head) {
+    return static_cast<uint32_t>(head & 0xfffff);
+  }
+
+  // Increment the old ABA tag and merge it into the new index.
+  LIBC_INLINE static constexpr uint64_t make_head(uint64_t orig,
+                                                  uint32_t node) {
+    return static_cast<uint64_t>(node) | (((orig >> 20ul) + 1ul) << 20ul);
+  }
+
+  // Attempts to pop data from the given stack by making it point to the next
+  // node. We repeatedly attempt to write to the head using compare-and-swap,
+  // expecting that it has not been changed by any other thread.
+  LIBC_INLINE uint32_t pop_impl(cpp::AtomicRef<uint64_t> head) {
+    uint64_t orig = head.load(cpp::MemoryOrder::RELAXED);
+
+    for (;;) {
+      if (get_node(orig) == CAPACITY)
+        return CAPACITY;
+
+      uint32_t node =
+          cpp::AtomicRef(next[get_node(orig)]).load(cpp::MemoryOrder::RELAXED);
+      if (head.compare_exchange_strong(orig, make_head(orig, node),
+                                       cpp::MemoryOrder::ACQUIRE,
+                                       cpp::MemoryOrder::RELAXED))
+        break;
+    }
+    return get_node(orig);
+  }
+
+  // Attempts to push data to the given stack by making it point to the new
+  // node. We repeatedly attempt to write to the head using compare-and-swap,
+  // expecting that it has not been changed by any other thread.
+  LIBC_INLINE uint32_t push_impl(cpp::AtomicRef<uint64_t> head, uint32_t node) {
+    uint64_t orig = head.load(cpp::MemoryOrder::RELAXED);
+    for (;;) {
+      next[node] = get_node(orig);
+      if (head.compare_exchange_strong(orig, make_head(orig, node),
+                                       cpp::MemoryOrder::RELEASE,
+                                       cpp::MemoryOrder::RELAXED))
+        break;
+    }
+    return get_node(head.load(cpp::MemoryOrder::RELAXED));
+  }
+
+public:
+  // Initialize the free stack to be full and the used stack to be empty. We use
+  // the capacity of the stack as a sentinel value.
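+  // With free = 0 the free list starts at node 0 and each node points to its
+  // successor; used = CAPACITY marks the used list as initially empty.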
+  LIBC_INLINE constexpr FixedStack() : free(0), used(CAPACITY), data{} {
+    for (uint32_t i = 0; i < CAPACITY; ++i)
+      next[i] = i + 1;
+  }
+
+  LIBC_INLINE bool push(const T &val) {
+    uint32_t node = pop_impl(cpp::AtomicRef(free));
+    if (node == CAPACITY)
+      return false;
+
+    data[node] = val;
+    push_impl(cpp::AtomicRef(used), node);
+    return true;
+  }
+
+  LIBC_INLINE bool pop(T &val) {
+    uint32_t node = pop_impl(cpp::AtomicRef(used));
+    if (node == CAPACITY)
+      return false;
+
+    val = data[node];
+    push_impl(cpp::AtomicRef(free), node);
+    return true;
+  }
+};
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_GPU_FIXEDSTACK_H
diff --git a/libc/test/integration/src/__support/GPU/CMakeLists.txt b/libc/test/integration/src/__support/GPU/CMakeLists.txt
index e066830f6cc0d..1fb175b92ef2a 100644
--- a/libc/test/integration/src/__support/GPU/CMakeLists.txt
+++ b/libc/test/integration/src/__support/GPU/CMakeLists.txt
@@ -27,3 +27,16 @@ add_integration_test(
   LOADER_ARGS
     --threads 64
 )
+
+add_libc_test(
+  fixedstack_test
+  SUITE
+    libc-support-gpu-tests
+  SRCS
+    fixedstack_test.cpp
+  DEPENDS
+    libc.src.__support.GPU.fixedstack
+  LOADER_ARGS
+    --threads 32
+    --blocks 16
+)
diff --git a/libc/test/integration/src/__support/GPU/fixedstack_test.cpp b/libc/test/integration/src/__support/GPU/fixedstack_test.cpp
new file mode 100644
index 0000000000000..fde51df325ab8
--- /dev/null
+++ b/libc/test/integration/src/__support/GPU/fixedstack_test.cpp
@@ -0,0 +1,44 @@
+//===-- Integration test for the lock-free stack --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/GPU/fixedstack.h"
+#include "src/__support/GPU/utils.h"
+#include "test/IntegrationTest/test.h"
+
+using namespace LIBC_NAMESPACE;
+
+static FixedStack<uint32_t, 1024> global_stack;
+
+void run() {
+  // We need enough space in the stack as threads in flight can temporarily
+  // consume memory before they finish committing it back to the stack.
+  ASSERT_EQ(gpu::get_num_blocks() * gpu::get_num_threads(), 512);
+
+  uint32_t val;
+  uint32_t num_threads = static_cast<uint32_t>(gpu::get_num_threads());
+  for (int i = 0; i < 256; ++i) {
+    EXPECT_TRUE(global_stack.push(UINT32_MAX));
+    EXPECT_TRUE(global_stack.pop(val));
+    ASSERT_TRUE(val < num_threads || val == UINT32_MAX);
+  }
+
+  EXPECT_TRUE(global_stack.push(static_cast<uint32_t>(gpu::get_thread_id())));
+  EXPECT_TRUE(global_stack.push(static_cast<uint32_t>(gpu::get_thread_id())));
+  EXPECT_TRUE(global_stack.pop(val));
+  ASSERT_TRUE(val < num_threads || val == UINT32_MAX);
+
+  // Fill the rest of the stack with the default value.
+  while (!global_stack.push(UINT32_MAX))
+    ;
+}
+
+TEST_MAIN(int argc, char **argv, char **envp) {
+  run();
+
+  return 0;
+}
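
Beyond the integration test above, a minimal single-threaded sketch of the FixedStack API may help review. It assumes compilation inside the LLVM libc tree, where LIBC_NAMESPACE and the support headers are available; the `demo` function and its values are illustrative only and not part of the patch:

```cpp
// Illustrative sketch, not part of the patch.
#include "src/__support/GPU/fixedstack.h"

using namespace LIBC_NAMESPACE;

// A tiny four-element stack; CAPACITY must stay below 2^20.
static FixedStack<int, 4> demo_stack;

bool demo() {
  // push() claims a node from the free list, stores the value, then publishes
  // the node on the used list. It fails once all four slots are taken.
  for (int i = 0; i < 4; ++i)
    if (!demo_stack.push(i))
      return false;
  if (demo_stack.push(42)) // A fifth push must fail: the free list is empty.
    return false;

  // Run single-threaded, pop() drains the values in LIFO order (3, 2, 1, 0)
  // and fails once the used list is empty again.
  int val;
  while (demo_stack.pop(val))
    ;
  return true;
}
```

Since push and pop only move 20-bit indices between the two tagged lists, the element array itself never moves or reallocates, which is what makes the structure usable as a statically allocated global in GPU device code.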