[libc] Simplify slab waiting in GPU memory allocator (#152872)

jhuber6 · web-flow · commit 005895290d59 · 2025-08-11T13:11:39.000-05:00
Summary:
This moves the waiting into the `try_lock` routine instead. This makes
the logic much simpler since it's just a single loop on a load. We should
have the same effect here, and since we don't care about this being a
generic interface it shouldn't matter that it waits a bit. It is still
wait-free since it's guaranteed to make progress *eventually*.
diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
@@ -166,7 +166,11 @@ static inline uint32_t get_leader_id(uint64_t ballot, uint32_t id) {
 
 // We use a sentinal value to indicate a failed or in-progress allocation.
 template <typename T> bool is_sentinel(const T &x) {
-  return x == cpp::numeric_limits<T>::max();
+  if constexpr (cpp::is_pointer_v<T>)
+    return reinterpret_cast<uintptr_t>(x) ==
+           cpp::numeric_limits<uintptr_t>::max();
+  else
+    return x == cpp::numeric_limits<T>::max();
 }
 
 } // namespace impl
@@ -446,7 +450,13 @@ struct GuardPtr {
       return new (raw) Slab(cpp::forward<Args>(args)...);
     }
 
-    if (!expected || impl::is_sentinel(reinterpret_cast<uintptr_t>(expected)))
+    // If there is a slab allocation in progress we retry a few times.
+    for (uint32_t t = 0; impl::is_sentinel(expected) && t < MAX_TRIES; ++t) {
+      sleep_briefly();
+      expected = ptr.load(cpp::MemoryOrder::RELAXED);
+    }
+
+    if (!expected || impl::is_sentinel(expected))
       return nullptr;
 
     if (!ref.acquire(n, count))
@@ -557,16 +567,6 @@ static Slab *find_slab(uint32_t chunk_size, uint64_t &uniform,
       Slab *slab = slots[index].try_lock(lane_mask, uniform & lane_mask,
                                          reserved, chunk_size, index);
 
-      // If there is a slab allocation in progress we retry a few times.
-      for (uint32_t retries = 0;
-           !slab && !impl::is_sentinel(reserved) && retries < MAX_TRIES;
-           retries++) {
-        uint64_t lane_mask = gpu::get_lane_mask();
-        slab = slots[index].try_lock(lane_mask, uniform & lane_mask, reserved,
-                                     chunk_size, index);
-        sleep_briefly();
-      }
-
       // If we find a slab with a matching chunk size then we store the result.
       // Otherwise, we need to free the claimed lock and continue. In the case
       // of out-of-memory we receive a sentinel value and return a failure.