From febcc823a60ed7eef5eaf7f416927828c3dc5876 Mon Sep 17 00:00:00 2001
From: Joseph Huber
Date: Thu, 24 Jul 2025 09:30:28 -0500
Subject: [PATCH] [libc] Improve starting indices for GPU allocation

Summary:
The slots in this allocation scheme are statically allocated. All sizes share
the same array of slots, but are given different starting locations to space
them apart. The previous implementation used a trivial linear slice. This is
inefficient because it provides the more likely allocations (1-1024 bytes)
with just as much space as a highly unlikely one (1 MiB).

This patch uses a cubic easing function to gradually shrink the gaps. For
example, we used to get around 700 free slots for a 16 byte allocation; now we
get around 2100 before it starts encroaching on the 32 byte allocation space.
This could be improved further, but I think this is sufficient.
---
 libc/src/__support/GPU/allocator.cpp | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index 866aea7b69d4e..f115d2a26606a 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -142,10 +142,16 @@ static inline constexpr bool is_pow2(uint64_t x) {
   return x && (x & (x - 1)) == 0;
 }
 
-// Where this chunk size should start looking in the global array.
-static inline constexpr uint32_t start_index(uint32_t chunk_index) {
-  return (ARRAY_SIZE * impl::get_chunk_id(chunk_index)) /
-         impl::get_chunk_id(SLAB_SIZE / 2);
+// Where this chunk size should start looking in the global array. Small
+// allocations are much more likely than large ones, so we give them the most
+// space. We use a cubic easing function normalized on the possible chunks.
+static inline constexpr uint32_t start_index(uint32_t chunk_size) {
+  constexpr uint32_t max_chunk = impl::get_chunk_id(SLAB_SIZE / 2);
+  uint64_t norm =
+      (1 << 16) - (impl::get_chunk_id(chunk_size) << 16) / max_chunk;
+  uint64_t bias = (norm * norm * norm) >> 32;
+  uint64_t inv = (1 << 16) - bias;
+  return static_cast<uint32_t>(((ARRAY_SIZE - 1) * inv) >> 16);
 }
 
 } // namespace impl
@@ -487,9 +493,10 @@ static Slab *find_slab(uint32_t chunk_size) {
   uint32_t start = indices[chunk_id].load(cpp::MemoryOrder::RELAXED);
   uint64_t uniform = gpu::match_any(gpu::get_lane_mask(), chunk_size);
 
-  for (uint32_t offset = 0; offset < ARRAY_SIZE; ++offset) {
+  for (uint32_t offset = 0; offset <= ARRAY_SIZE; ++offset) {
     uint32_t index =
-        !offset ? start : (impl::start_index(chunk_size) + offset) % ARRAY_SIZE;
+        !offset ? start
+                : (impl::start_index(chunk_size) + offset - 1) % ARRAY_SIZE;
 
     if (slots[index].use_count() < Slab::available_chunks(chunk_size)) {
       uint64_t lane_mask = gpu::get_lane_mask();
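
Note (illustration, not part of the patch): the standalone sketch below
reproduces the cubic easing curve on the host so the resulting spacing can be
inspected. The constants ARRAY_SIZE, SLAB_SIZE, and MIN_SIZE and the simplified
power-of-two get_chunk_id() are assumptions standing in for the allocator's
internal definitions, so the exact slot counts will differ from the 700/2100
figures quoted above; it only shows how the cubic bias hands most of the slot
array to the small chunk sizes.

  // Standalone sketch of the cubic easing used by start_index(). All
  // constants and get_chunk_id() are assumed stand-ins, not the real
  // allocator internals.
  #include <cstdint>
  #include <cstdio>

  constexpr uint32_t ARRAY_SIZE = 8192;           // assumed slot count
  constexpr uint32_t SLAB_SIZE = 2 * 1024 * 1024; // assumed 2 MiB slab
  constexpr uint32_t MIN_SIZE = 16;               // assumed smallest chunk

  // Assumed: the chunk id is the power-of-two size class above MIN_SIZE.
  constexpr uint32_t get_chunk_id(uint32_t chunk_size) {
    uint32_t id = 0;
    for (uint32_t size = MIN_SIZE; size < chunk_size; size *= 2)
      ++id;
    return id;
  }

  // Same fixed-point cubic easing as the patch: large chunks are pushed
  // toward the end of the array, leaving the most room up front.
  constexpr uint32_t start_index(uint32_t chunk_size) {
    constexpr uint32_t max_chunk = get_chunk_id(SLAB_SIZE / 2);
    uint64_t norm =
        (1 << 16) - (uint64_t(get_chunk_id(chunk_size)) << 16) / max_chunk;
    uint64_t bias = (norm * norm * norm) >> 32;
    uint64_t inv = (1 << 16) - bias;
    return static_cast<uint32_t>(((ARRAY_SIZE - 1) * inv) >> 16);
  }

  int main() {
    // The gaps between consecutive start indices shrink as the chunk size
    // grows, so the small, common sizes get the largest runs of free slots.
    for (uint32_t size = MIN_SIZE; size <= SLAB_SIZE / 2; size *= 2)
      std::printf("chunk %8u -> start %u\n", size, start_index(size));
    return 0;
  }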