Commit f9ce984

Update on "[Executorch] Introduce caching cpu memory allocator"

Meant to be used as the temp allocator for kernels. Specifically for SDPA, there appears to be significant overhead on iOS coming from allocations.

Differential Revision: [D85532079](https://our.internmc.facebook.com/intern/diff/D85532079/)

[ghstack-poisoned]

2 parents 08ab552 + dbf63cc commit f9ce984
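Since the stated intent is to use this as the temp allocator for kernels, here is a minimal, hypothetical sketch of how such an allocator might be wired in through ExecuTorch's MemoryManager. The header path and class name come from this commit's files; the namespaces, buffer sizes, and cache cap are assumptions for illustration only, not part of the commit.

    // Hypothetical wiring sketch; not taken from this commit.
    #include <executorch/extension/memory_allocator/cpu_caching_malloc_allocator.h>
    #include <executorch/runtime/executor/memory_manager.h>

    using executorch::runtime::MemoryAllocator;
    using executorch::runtime::MemoryManager;

    int main() {
      // Ordinary arena-backed allocator for method structures.
      static uint8_t method_arena[4 * 1024 * 1024];
      MemoryAllocator method_allocator(sizeof(method_arena), method_arena);

      // Caching allocator for kernel temp allocations (e.g. SDPA scratch).
      // The constructor argument caps how many bytes the cache may retain;
      // the namespace here is an assumption.
      executorch::extension::CPUCachingAllocator temp_allocator(8 * 1024 * 1024);

      // Pass it as the temp allocator; planned memory is omitted in this sketch.
      MemoryManager memory_manager(
          &method_allocator, /*planned_memory=*/nullptr, &temp_allocator);
      (void)memory_manager;
      return 0;
    }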

File tree

6 files changed: +18 / -9 lines


extension/llm/custom_ops/TARGETS
Lines changed: 1 addition & 0 deletions

@@ -60,5 +60,6 @@ runtime.python_test(
     ],
     deps = [
         "//caffe2:torch",
+        "//executorch/extension/pybindings:portable_lib",
     ],
 )

extension/llm/custom_ops/op_sdpa_impl.h
Lines changed: 7 additions & 4 deletions

@@ -779,11 +779,13 @@ void cpu_flash_attention(
   // Lets align size_per_thread_qdq_vec to 64 bytes, for coalesced cache reads,
   // by padding with right number of per thread elements
   constexpr int64_t kAlignment = 32;
-  size_per_thread_qdq_vec = (size_per_thread_qdq_vec + kAlignment - 1) & (-(kAlignment - 1));
-  int64_t size_per_thread_qdq_bytes = size_per_thread_qdq_vec * query.element_size();
+  size_per_thread_qdq_vec =
+      (size_per_thread_qdq_vec + kAlignment - 1) & (-(kAlignment - 1));
+  int64_t size_per_thread_qdq_bytes = size_per_thread_qdq_vec * sizeof(accum_t);
   int64_t size_qdq_bytes = size_per_thread_qdq_bytes * num_thread;
   std::vector<char> scratch_for_quant_dequant_vec(size_qdq_bytes);
-  accum_t* scratch_for_quant_dequant = reinterpret_cast<accum_t*>(scratch_for_quant_dequant_vec.data());
+  accum_t* scratch_for_quant_dequant =
+      reinterpret_cast<accum_t*>(scratch_for_quant_dequant_vec.data());

   // Data ptrs
   const scalar_t* q_data = query.const_data_ptr<scalar_t>();

@@ -808,7 +810,8 @@ void cpu_flash_attention(
     scalar_t* qk_reduced_data = is_reduced_type
         ? buf_reduced_data + ompIdx * qSplitSize * kvSplitSize
         : nullptr;
-    accum_t* buf_qdq_ptr = scratch_for_quant_dequant + ompIdx * size_per_thread_qdq_vec;
+    accum_t* buf_qdq_ptr =
+        scratch_for_quant_dequant + ompIdx * size_per_thread_qdq_vec;

     for (int64_t z = begin; z < end; z++) {
       int64_t m = k * qSplitSize;
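The padding described in the comment above is the usual round-up-to-an-alignment-boundary computation. Below is a standalone sketch of the common power-of-two round-up idiom; the names are illustrative and not taken from the commit.

    #include <cstdint>
    #include <cstdio>

    // Round `size` up to the next multiple of `alignment`.
    // Assumes `alignment` is a power of two.
    constexpr int64_t round_up(int64_t size, int64_t alignment) {
      return (size + alignment - 1) & ~(alignment - 1);
    }

    int main() {
      // e.g. 70 elements padded to a 32-element boundary -> 96.
      std::printf("%lld\n", static_cast<long long>(round_up(70, 32)));
      return 0;
    }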

extension/llm/custom_ops/test_quantized_sdpa.py
Lines changed: 6 additions & 0 deletions

@@ -12,6 +12,7 @@
 import torch.nn.functional as F

 from executorch.extension.llm.custom_ops import custom_ops  # noqa
+from executorch.extension.pybindings.portable_lib import _unsafe_reset_threadpool


 def is_fbcode():

@@ -40,6 +41,11 @@ def setUp(self):
         self.q_shape = None
         self.kv_shape = None
         self.is_seq_at_dim_2 = True
+        # For some reason 4 threads doesn't work.
+        # This setting is needed to keep this test from being flaky due to the
+        # OMP error "OMP: Error #131: Thread identifier invalid".
+        # It is not clear why that happens, but a smaller threadpool resolves it.
+        _unsafe_reset_threadpool(3)

     def _scale_tensor(self, tensor, min_value, max_value, scale=True):
         normalized_tensor = (tensor - tensor.min()) / (tensor.max() - tensor.min())

extension/memory_allocator/cpu_caching_malloc_allocator.cpp
Lines changed: 2 additions & 1 deletion

@@ -17,7 +17,8 @@ size_t get_alignment_adjusted_size(size_t size, size_t alignment) {
 }
 } // namespace

-CPUCachingAllocator::CPUCachingAllocator(uint32_t max_size) : MemoryAllocator(0, nullptr) {
+CPUCachingAllocator::CPUCachingAllocator(uint32_t max_size)
+    : MemoryAllocator(0, nullptr) {
   max_size_ = max_size;
   current_size_ = 0;
 }
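For context, the general shape of a caching CPU allocator: freed blocks are kept in a map keyed by allocation size and handed back on the next request of the same size, so repeated allocations of a given size skip malloc/free after the first round trip. The sketch below is self-contained and illustrative only; it is not this commit's implementation, which likely differs (for example, it may use the FlatHashMap/SmallVector aliases from the header and a cap tied to max_size_).

    #include <cstddef>
    #include <cstdlib>
    #include <unordered_map>
    #include <vector>

    // Illustrative caching allocator sketch; not the commit's implementation.
    class CachingAllocatorSketch {
     public:
      explicit CachingAllocatorSketch(size_t max_cached_bytes)
          : max_cached_bytes_(max_cached_bytes), cached_bytes_(0) {}

      ~CachingAllocatorSketch() {
        // Release anything still held in the cache.
        for (auto& entry : free_blocks_) {
          for (void* ptr : entry.second) {
            std::free(ptr);
          }
        }
      }

      void* allocate(size_t size) {
        auto it = free_blocks_.find(size);
        if (it != free_blocks_.end() && !it->second.empty()) {
          // Cache hit: reuse a previously freed block of the same size.
          void* ptr = it->second.back();
          it->second.pop_back();
          cached_bytes_ -= size;
          return ptr;
        }
        return std::malloc(size);  // Cache miss: fall back to malloc.
      }

      void deallocate(void* ptr, size_t size) {
        if (cached_bytes_ + size > max_cached_bytes_) {
          std::free(ptr);  // Cache full: release to the system.
          return;
        }
        free_blocks_[size].push_back(ptr);  // Keep for reuse.
        cached_bytes_ += size;
      }

     private:
      size_t max_cached_bytes_;
      size_t cached_bytes_;
      std::unordered_map<size_t, std::vector<void*>> free_blocks_;
    };

    int main() {
      CachingAllocatorSketch alloc(1 << 20);
      void* a = alloc.allocate(256);
      alloc.deallocate(a, 256);
      void* b = alloc.allocate(256);  // Reuses the cached block from `a`.
      alloc.deallocate(b, 256);
      return 0;
    }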

extension/memory_allocator/cpu_caching_malloc_allocator.h
Lines changed: 2 additions & 2 deletions

@@ -34,10 +34,10 @@ using SmallVector = std::vector<T>;
 #endif

 #ifdef USE_C10_FLAT_HASH_MAP
-template<typename KeyType, typename ValueType>
+template <typename KeyType, typename ValueType>
 using FlatHashMap = ska::flat_hash_map<KeyType, ValueType>;
 #else
-template<typename KeyType, typename ValueType>
+template <typename KeyType, typename ValueType>
 using FlatHashMap = std::unordered_map<KeyType, ValueType>;
 #endif


extension/memory_allocator/test/cpu_caching_malloc_allocator_test.cpp
Lines changed: 0 additions & 2 deletions

@@ -1,5 +1,3 @@
-// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
-
 #include <cstddef>
 #include <cstdint>
 #include <cstring>
