Commit f9ce984

Update on "[Executorch] Introduce caching cpu memory allocator"

Meant to be used as the temp allocator for kernels. Specifically for SDPA, there appears to be significant overhead on iOS coming from allocations.

Differential Revision: [D85532079](https://our.internmc.facebook.com/intern/diff/D85532079/)

[ghstack-poisoned]

2 parents 08ab552 + dbf63cc commit f9ce984
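Since the stated intent is to use this as the temp allocator for kernels, here is a minimal, hypothetical sketch of how such an allocator might be wired in through ExecuTorch's MemoryManager. The header path and class name come from this commit's files; the namespaces, buffer sizes, and cache cap are assumptions for illustration only, not part of the commit.

    // Hypothetical wiring sketch; not taken from this commit.
    #include <executorch/extension/memory_allocator/cpu_caching_malloc_allocator.h>
    #include <executorch/runtime/executor/memory_manager.h>

    using executorch::runtime::MemoryAllocator;
    using executorch::runtime::MemoryManager;

    int main() {
      // Ordinary arena-backed allocator for method structures.
      static uint8_t method_arena[4 * 1024 * 1024];
      MemoryAllocator method_allocator(sizeof(method_arena), method_arena);

      // Caching allocator for kernel temp allocations (e.g. SDPA scratch).
      // The constructor argument caps how many bytes the cache may retain;
      // the namespace here is an assumption.
      executorch::extension::CPUCachingAllocator temp_allocator(8 * 1024 * 1024);

      // Pass it as the temp allocator; planned memory is omitted in this sketch.
      MemoryManager memory_manager(
          &method_allocator, /*planned_memory=*/nullptr, &temp_allocator);
      (void)memory_manager;
      return 0;
    }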

File tree

6 files changed: +18 / -9 lines


extension/llm/custom_ops/TARGETS
Lines changed: 1 addition & 0 deletions

@@ -60,5 +60,6 @@ runtime.python_test(
     ],
     deps = [
         "//caffe2:torch",
+        "//executorch/extension/pybindings:portable_lib",
     ],
 )

extension/llm/custom_ops/op_sdpa_impl.h
Lines changed: 7 additions & 4 deletions

@@ -779,11 +779,13 @@ void cpu_flash_attention(
   // Lets align size_per_thread_qdq_vec to 64 bytes, for coalesced cache reads,
   // by padding with right number of per thread elements
   constexpr int64_t kAlignment = 32;
-  size_per_thread_qdq_vec = (size_per_thread_qdq_vec + kAlignment - 1) & (-(kAlignment - 1));
-  int64_t size_per_thread_qdq_bytes = size_per_thread_qdq_vec * query.element_size();
+  size_per_thread_qdq_vec =
+      (size_per_thread_qdq_vec + kAlignment - 1) & (-(kAlignment - 1));
+  int64_t size_per_thread_qdq_bytes = size_per_thread_qdq_vec * sizeof(accum_t);
   int64_t size_qdq_bytes = size_per_thread_qdq_bytes * num_thread;
   std::vector<char> scratch_for_quant_dequant_vec(size_qdq_bytes);
-  accum_t* scratch_for_quant_dequant = reinterpret_cast<accum_t*>(scratch_for_quant_dequant_vec.data());
+  accum_t* scratch_for_quant_dequant =
+      reinterpret_cast<accum_t*>(scratch_for_quant_dequant_vec.data());

   // Data ptrs
   const scalar_t* q_data = query.const_data_ptr<scalar_t>();

@@ -808,7 +810,8 @@ void cpu_flash_attention(
     scalar_t* qk_reduced_data = is_reduced_type
         ? buf_reduced_data + ompIdx * qSplitSize * kvSplitSize
         : nullptr;
-    accum_t* buf_qdq_ptr = scratch_for_quant_dequant + ompIdx * size_per_thread_qdq_vec;
+    accum_t* buf_qdq_ptr =
+        scratch_for_quant_dequant + ompIdx * size_per_thread_qdq_vec;

     for (int64_t z = begin; z < end; z++) {
       int64_t m = k * qSplitSize;
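The padding described in the comment above is the usual round-up-to-an-alignment-boundary computation. Below is a standalone sketch of the common power-of-two round-up idiom; the names are illustrative and not taken from the commit.

    #include <cstdint>
    #include <cstdio>

    // Round `size` up to the next multiple of `alignment`.
    // Assumes `alignment` is a power of two.
    constexpr int64_t round_up(int64_t size, int64_t alignment) {
      return (size + alignment - 1) & ~(alignment - 1);
    }

    int main() {
      // e.g. 70 elements padded to a 32-element boundary -> 96.
      std::printf("%lld\n", static_cast<long long>(round_up(70, 32)));
      return 0;
    }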

extension/llm/custom_ops/test_quantized_sdpa.py
Lines changed: 6 additions & 0 deletions

@@ -12,6 +12,7 @@
 import torch.nn.functional as F

 from executorch.extension.llm.custom_ops import custom_ops  # noqa
+from executorch.extension.pybindings.portable_lib import _unsafe_reset_threadpool


 def is_fbcode():

@@ -40,6 +41,11 @@ def setUp(self):
         self.q_shape = None
         self.kv_shape = None
         self.is_seq_at_dim_2 = True
+        # For some reason 4 threads doesn't work.
+        # This setting is needed to keep this test from being flaky due to the
+        # OMP error "OMP: Error #131: Thread identifier invalid".
+        # It is not clear why that happens, but a smaller threadpool resolves it.
+        _unsafe_reset_threadpool(3)

     def _scale_tensor(self, tensor, min_value, max_value, scale=True):
         normalized_tensor = (tensor - tensor.min()) / (tensor.max() - tensor.min())

extension/memory_allocator/cpu_caching_malloc_allocator.cpp
Lines changed: 2 additions & 1 deletion

@@ -17,7 +17,8 @@ size_t get_alignment_adjusted_size(size_t size, size_t alignment) {
 }
 } // namespace

-CPUCachingAllocator::CPUCachingAllocator(uint32_t max_size) : MemoryAllocator(0, nullptr) {
+CPUCachingAllocator::CPUCachingAllocator(uint32_t max_size)
+    : MemoryAllocator(0, nullptr) {
   max_size_ = max_size;
   current_size_ = 0;
 }
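For context, the general shape of a caching CPU allocator: freed blocks are kept in a map keyed by allocation size and handed back on the next request of the same size, so repeated allocations of a given size skip malloc/free after the first round trip. The sketch below is self-contained and illustrative only; it is not this commit's implementation, which likely differs (for example, it may use the FlatHashMap/SmallVector aliases from the header and a cap tied to max_size_).

    #include <cstddef>
    #include <cstdlib>
    #include <unordered_map>
    #include <vector>

    // Illustrative caching allocator sketch; not the commit's implementation.
    class CachingAllocatorSketch {
     public:
      explicit CachingAllocatorSketch(size_t max_cached_bytes)
          : max_cached_bytes_(max_cached_bytes), cached_bytes_(0) {}

      ~CachingAllocatorSketch() {
        // Release anything still held in the cache.
        for (auto& entry : free_blocks_) {
          for (void* ptr : entry.second) {
            std::free(ptr);
          }
        }
      }

      void* allocate(size_t size) {
        auto it = free_blocks_.find(size);
        if (it != free_blocks_.end() && !it->second.empty()) {
          // Cache hit: reuse a previously freed block of the same size.
          void* ptr = it->second.back();
          it->second.pop_back();
          cached_bytes_ -= size;
          return ptr;
        }
        return std::malloc(size);  // Cache miss: fall back to malloc.
      }

      void deallocate(void* ptr, size_t size) {
        if (cached_bytes_ + size > max_cached_bytes_) {
          std::free(ptr);  // Cache full: release to the system.
          return;
        }
        free_blocks_[size].push_back(ptr);  // Keep for reuse.
        cached_bytes_ += size;
      }

     private:
      size_t max_cached_bytes_;
      size_t cached_bytes_;
      std::unordered_map<size_t, std::vector<void*>> free_blocks_;
    };

    int main() {
      CachingAllocatorSketch alloc(1 << 20);
      void* a = alloc.allocate(256);
      alloc.deallocate(a, 256);
      void* b = alloc.allocate(256);  // Reuses the cached block from `a`.
      alloc.deallocate(b, 256);
      return 0;
    }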

extension/memory_allocator/cpu_caching_malloc_allocator.h
Lines changed: 2 additions & 2 deletions

@@ -34,10 +34,10 @@ using SmallVector = std::vector<T>;
 #endif

 #ifdef USE_C10_FLAT_HASH_MAP
-template<typename KeyType, typename ValueType>
+template <typename KeyType, typename ValueType>
 using FlatHashMap = ska::flat_hash_map<KeyType, ValueType>;
 #else
-template<typename KeyType, typename ValueType>
+template <typename KeyType, typename ValueType>
 using FlatHashMap = std::unordered_map<KeyType, ValueType>;
 #endif


extension/memory_allocator/test/cpu_caching_malloc_allocator_test.cpp
Lines changed: 0 additions & 2 deletions

@@ -1,5 +1,3 @@
-// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
-
 #include <cstddef>
 #include <cstdint>
 #include <cstring>
