Conversation

@jhuber6 (Contributor) commented Jul 31, 2025

Summary:
Without slab reclaiming this interface is much simpler, and it can speed up cases with a lot of allocation churn. In essence, it trades extra memory for performance.
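As a rough illustration of that trade-off (a minimal sketch with hypothetical names, not llvm-libc code), disabling reclaiming turns the slab release path into a no-op, so churn-heavy workloads skip the free/reallocate round-trip at the cost of keeping the memory resident:

```cpp
#include <cstdlib>

// Hypothetical names for illustration only; not llvm-libc APIs.
struct Slab {
  void *memory; // a large chunk backing many small allocations
};

// Mirrors the compile-time knob this patch adds: when reclaiming is
// disabled, RECLAIM is constexpr false and the free path is dead code.
constexpr bool RECLAIM = false;

void release_slab(Slab *slab) {
  if (RECLAIM)
    std::free(slab->memory); // the expensive round-trip this PR avoids
  // Otherwise the slab intentionally stays resident for immediate reuse.
}
```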

@llvmbot (Member) commented Jul 31, 2025

@llvm/pr-subscribers-backend-amdgpu

@llvm/pr-subscribers-libc

Author: Joseph Huber (jhuber6)

Changes

Summary:
Without slab reclaiming this interface is much simpler, and it can speed up cases with a lot of allocation churn. In essence, it trades extra memory for performance.


Full diff: https://github.com/llvm/llvm-project/pull/151599.diff

6 Files Affected:

  • (modified) libc/config/config.json (+6)
  • (modified) libc/config/gpu/amdgpu/config.json (+5)
  • (modified) libc/config/gpu/nvptx/config.json (+5)
  • (modified) libc/docs/configure.rst (+2)
  • (modified) libc/src/__support/GPU/CMakeLists.txt (+5)
  • (modified) libc/src/__support/GPU/allocator.cpp (+12-4)
diff --git a/libc/config/config.json b/libc/config/config.json
index d53b2936edb07..fe077d16342de 100644
--- a/libc/config/config.json
+++ b/libc/config/config.json
@@ -119,6 +119,12 @@
       "doc": "Force the size of time_t to 64 bits, even on platforms where compatibility considerations would otherwise make it 32-bit."
     }
   },
+  "malloc": {
+    "LIBC_CONF_GPU_MALLOC_DISABLE_SLAB_RECLAIMING": {
+      "value": false,
+      "doc": "The malloc implementation will return unused slabs to system memory."
+    }
+  },
   "general": {
     "LIBC_ADD_NULL_CHECKS": {
       "value": true,
diff --git a/libc/config/gpu/amdgpu/config.json b/libc/config/gpu/amdgpu/config.json
index 30ae10e2cfd61..d3918eff8a59d 100644
--- a/libc/config/gpu/amdgpu/config.json
+++ b/libc/config/gpu/amdgpu/config.json
@@ -36,5 +36,10 @@
     "LIBC_CONF_MATH_OPTIMIZATIONS": {
       "value": "(LIBC_MATH_SKIP_ACCURATE_PASS | LIBC_MATH_SMALL_TABLES | LIBC_MATH_NO_ERRNO | LIBC_MATH_NO_EXCEPT)"
     }
+  },
+  "malloc": {
+    "LIBC_CONF_GPU_MALLOC_DISABLE_SLAB_RECLAIMING": {
+      "value": false
+    }
   }
 }
diff --git a/libc/config/gpu/nvptx/config.json b/libc/config/gpu/nvptx/config.json
index 30ae10e2cfd61..d3918eff8a59d 100644
--- a/libc/config/gpu/nvptx/config.json
+++ b/libc/config/gpu/nvptx/config.json
@@ -36,5 +36,10 @@
     "LIBC_CONF_MATH_OPTIMIZATIONS": {
       "value": "(LIBC_MATH_SKIP_ACCURATE_PASS | LIBC_MATH_SMALL_TABLES | LIBC_MATH_NO_ERRNO | LIBC_MATH_NO_EXCEPT)"
     }
+  },
+  "malloc": {
+    "LIBC_CONF_GPU_MALLOC_DISABLE_SLAB_RECLAIMING": {
+      "value": false
+    }
   }
 }
diff --git a/libc/docs/configure.rst b/libc/docs/configure.rst
index 109412225634f..4b005bb177da1 100644
--- a/libc/docs/configure.rst
+++ b/libc/docs/configure.rst
@@ -32,6 +32,8 @@ to learn about the defaults for your platform and target.
     - ``LIBC_CONF_ERRNO_MODE``: The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_DEFAULT, LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, LIBC_ERRNO_MODE_SYSTEM, and LIBC_ERRNO_MODE_SYSTEM_INLINE.
 * **"general" options**
     - ``LIBC_ADD_NULL_CHECKS``: Add nullptr checks in the library's implementations to some functions for which passing nullptr is undefined behavior.
+* **"malloc" options**
+    - ``LIBC_CONF_GPU_MALLOC_DISABLE_SLAB_RECLAIMING``: If set to true, the malloc implementation will not return unused slabs to system memory, trading higher memory usage for faster allocation under churn.
 * **"math" options**
     - ``LIBC_CONF_FREXP_INF_NAN_EXPONENT``: The value written back to the second parameter when calling frexp/frexpf/frexpl` with `+/-Inf`/`NaN` is unspecified.  Configure an explicit exp value for Inf/NaN inputs.
     - ``LIBC_CONF_MATH_OPTIMIZATIONS``: Configures optimizations for math functions. Values accepted are LIBC_MATH_SKIP_ACCURATE_PASS, LIBC_MATH_SMALL_TABLES, LIBC_MATH_NO_ERRNO, LIBC_MATH_NO_EXCEPT, and LIBC_MATH_FAST.
diff --git a/libc/src/__support/GPU/CMakeLists.txt b/libc/src/__support/GPU/CMakeLists.txt
index f8fdfeb9da9df..e6884a31b6a5b 100644
--- a/libc/src/__support/GPU/CMakeLists.txt
+++ b/libc/src/__support/GPU/CMakeLists.txt
@@ -3,6 +3,10 @@ if(NOT LIBC_TARGET_OS_IS_GPU)
   return()
 endif()
 
+if(LIBC_CONF_GPU_MALLOC_DISABLE_SLAB_RECLAIMING)
+  list(APPEND malloc_config_copts "-DLIBC_CONF_MALLOC_DISABLE_SLAB_RECLAIMING")
+endif()
+
 add_header_library(
   utils
   HDRS
@@ -23,4 +27,5 @@ add_object_library(
     libc.src.__support.CPP.bit
     libc.src.__support.CPP.new
     .utils
+  ${malloc_config_copts}
 )
diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index bd0a55cb938fb..a55ff6b967546 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -39,6 +39,13 @@ constexpr static uint32_t MIN_ALIGNMENT = MIN_SIZE - 1;
 // The number of times to attempt claiming an in-progress slab allocation.
 constexpr static uint32_t MAX_TRIES = 1024;
 
+// Configuration for whether or not we will return unused slabs to memory.
+#ifdef LIBC_CONF_MALLOC_DISABLE_SLAB_RECLAIMING
+constexpr static bool RECLAIM = false;
+#else
+constexpr static bool RECLAIM = true;
+#endif
+
 static_assert(!(ARRAY_SIZE & (ARRAY_SIZE - 1)), "Must be a power of two");
 
 namespace impl {
@@ -368,7 +375,7 @@ struct GuardPtr {
       // and obtain exclusive rights to deconstruct it. If the CAS failed either
       // another thread resurrected the counter and we quit, or a parallel read
       // helped us invalidating it. For the latter, claim that flag and return.
-      if (counter.fetch_sub(n, cpp::MemoryOrder::RELAXED) == n) {
+      if (counter.fetch_sub(n, cpp::MemoryOrder::RELAXED) == n && RECLAIM) {
         uint32_t expected = 0;
         if (counter.compare_exchange_strong(expected, INVALID,
                                             cpp::MemoryOrder::RELAXED,
@@ -386,8 +393,9 @@ struct GuardPtr {
     // thread.
     uint64_t read() {
       auto val = counter.load(cpp::MemoryOrder::RELAXED);
-      if (val == 0 && counter.compare_exchange_strong(
-                          val, INVALID | HELPED, cpp::MemoryOrder::RELAXED))
+      if (val == 0 && RECLAIM &&
+          counter.compare_exchange_strong(val, INVALID | HELPED,
+                                          cpp::MemoryOrder::RELAXED))
         return 0;
       return (val & INVALID) ? 0 : val;
     }
@@ -421,7 +429,7 @@ struct GuardPtr {
       return nullptr;
 
     cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
-    return ptr.load(cpp::MemoryOrder::RELAXED);
+    return RECLAIM ? ptr.load(cpp::MemoryOrder::RELAXED) : expected;
   }
 
   // Finalize the associated memory and signal that it is ready to use by
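To see why this simplifies the interface, here is a condensed sketch of the reference-counting guard the hunks above touch (names simplified from the diff; the INVALID and HELPED bit values are assumptions). Because RECLAIM is a constexpr known at compile time, setting it to false makes both compare-exchange teardown paths statically dead, so the optimizer reduces the guard to plain fetch_add/fetch_sub traffic:

```cpp
#include <atomic>
#include <cstdint>

// Condensed sketch, not the llvm-libc implementation.
constexpr bool RECLAIM = false;          // the knob this PR adds
constexpr uint64_t INVALID = 1ull << 63; // slab is being torn down
constexpr uint64_t HELPED = 1ull << 62;  // a reader claimed the teardown

struct RefCounter {
  std::atomic<uint64_t> counter{0};

  // Drop n references; returns true if the caller won exclusive rights to
  // destroy the slab. With RECLAIM == false this never returns true, so
  // slabs are never reclaimed.
  bool release(uint32_t n) {
    // The fetch_sub comes first so the count is decremented regardless of
    // whether the teardown path is compiled in.
    if (counter.fetch_sub(n, std::memory_order_relaxed) == n && RECLAIM) {
      uint64_t expected = 0;
      return counter.compare_exchange_strong(expected, INVALID,
                                             std::memory_order_relaxed);
    }
    return false;
  }

  // Read the live count. When reclaiming, a zero count is raced to
  // INVALID | HELPED so a concurrent release cannot free a slab that is
  // still being read; without reclaiming the CAS is dead code.
  uint64_t read() {
    uint64_t val = counter.load(std::memory_order_relaxed);
    if (val == 0 && RECLAIM &&
        counter.compare_exchange_strong(val, INVALID | HELPED,
                                        std::memory_order_relaxed))
      return 0;
    return (val & INVALID) ? 0 : val;
  }
};
```

Note that `&& RECLAIM` sits after the `fetch_sub`, so the reference count is still decremented even when the teardown path is compiled out; only the slab destruction is skipped.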

@shiltian (Contributor) left a comment

Will this config be tested as well?

@arsenm (Contributor) commented Aug 27, 2025

This isn't very discoverable, and I think it's unlikely to be used in practice. Is it feasible to use an environment variable at load time for this?

@jhuber6 (Contributor, Author) commented Aug 27, 2025

> This isn't very discoverable, and I think it's unlikely to be used in practice. Is it feasible to use an environment variable at load time for this?

This is less important now that I've added a small cache of already allocated slabs, so I'll probably just close it.
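That slab cache is not part of this PR; purely as an illustration of the idea (hypothetical names, not the llvm-libc code), a small fixed-size cache of retired slabs can be as simple as an array of atomic slots checked before any new system allocation:

```cpp
#include <atomic>
#include <cstddef>

constexpr size_t CACHE_SLOTS = 8; // assumed size, for illustration

struct SlabCache {
  std::atomic<void *> slots[CACHE_SLOTS] = {};

  // Try to park a retired slab; returns false if the cache is full and the
  // caller should fall back to actually freeing it.
  bool put(void *slab) {
    for (auto &slot : slots) {
      void *expected = nullptr;
      if (slot.compare_exchange_strong(expected, slab,
                                       std::memory_order_acq_rel))
        return true;
    }
    return false;
  }

  // Try to reuse a cached slab before allocating a fresh one.
  void *get() {
    for (auto &slot : slots) {
      if (void *slab = slot.exchange(nullptr, std::memory_order_acq_rel))
        return slab;
    }
    return nullptr;
  }
};
```

A caller would try `get()` before allocating a fresh slab and `put()` before freeing one, falling back to the real allocator when the cache misses or is full.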

@jhuber6 closed this Aug 27, 2025
@arsenm (Contributor) commented Aug 27, 2025

Maybe keep the code change in, just with the hardcoded value at least?

@jhuber6 (Contributor, Author) commented Aug 27, 2025

> Maybe keep the code change in, just with the hardcoded value at least?

I can do that.

@jhuber6 reopened this Aug 27, 2025
Summary:
Without slab reclaiming this interface is much simpler, and it can speed up cases with a lot of allocation churn. In essence, it trades extra memory for performance.
@jhuber6 force-pushed the DisableReclaiming branch from bdccf8a to 69a6fb0 on August 27, 2025 12:51