Introduce per-thread memory pools for lock-free allocations in quantile.

ienkovich · ienkovich · commit 12cf4fc800c6 · 2023-09-20T12:27:13.000-05:00
Signed-off-by: ienkovich <ilya.enkovich@intel.com>
diff --git a/omniscidb/ResultSet/RowSetMemoryOwner.h b/omniscidb/ResultSet/RowSetMemoryOwner.h
@@ -31,6 +31,7 @@
 #include "Logger/Logger.h"
 #include "Shared/approx_quantile.h"
 #include "Shared/quantile.h"
+#include "Shared/thread_count.h"
 #include "StringDictionary/StringDictionaryProxy.h"
 #include "ThirdParty/robin_hood.h"
 
@@ -41,6 +42,19 @@ class ResultSet;
  * managed allocator object
  */
 class RowSetMemoryOwner final : public SimpleAllocator, boost::noncopyable {
+ private:
+  // Allocation window of one worker thread inside a block taken from the arena.
+  // A default-constructed pool is empty: it has no block and no bytes available.
+  struct ThreadMemPool {
+    // First unused byte of the current block.
+    int8_t* data = nullptr;
+    // Number of unused bytes available at `data`.
+    size_t size = 0;
+  };
+
+  constexpr static size_t SMALL_MEM_POOL_SIZE = 10 << 20;  // 10MB arena block per pool
+  constexpr static size_t MAX_IGNORED_FRAGMENT = 1 << 20;  // 1MB max dropped leftover
+
  public:
   RowSetMemoryOwner(DataProvider* data_provider,
                     const size_t arena_block_size,
@@ -52,6 +66,7 @@ class RowSetMemoryOwner final : public SimpleAllocator, boost::noncopyable {
     // size up to 256 bytes to avoid such cache conflicts. This allows to significantly
     // reduce amount of allocated virtual memory which is important for ASAN runs.
     allocator_ = std::make_unique<Arena>(arena_block_size);
+    small_mem_pools_.resize(cpu_threads());
   }
 
   enum class StringTranslationType { SOURCE_INTERSECTION, SOURCE_UNION };
@@ -67,6 +82,35 @@ class RowSetMemoryOwner final : public SimpleAllocator, boost::noncopyable {
         allocator_->allocate(std::max(num_bytes, (size_t)256)));
   }
 
+  // Hands out `size` bytes from the pool that belongs to `thread_idx`. Requests a
+  // pool cannot serve are forwarded to allocate() instead.
+  int8_t* allocateSmallMtNoLock(size_t size, size_t thread_idx = 0) override {
+    // A request bigger than a whole pool block can never come from a pool.
+    if (size > SMALL_MEM_POOL_SIZE) {
+      return allocate(size);
+    }
+    // Pad to a multiple of 8 so consecutive results keep 8-byte alignment.
+    const size_t padded = (size + 7) & ~static_cast<size_t>(7);
+    // TBB thread indexes are normally below cpu_threads(), but TBB does not
+    // respect g_cpu_threads_override yet, so an index may have no pool.
+    if (thread_idx >= small_mem_pools_.size()) {
+      return allocate(padded);
+    }
+    ThreadMemPool& pool = small_mem_pools_[thread_idx];
+    if (pool.size < padded) {
+      // A remainder above the limit is kept for later; a smaller one is dropped.
+      if (pool.size > MAX_IGNORED_FRAGMENT) {
+        return allocate(padded);
+      }
+      pool.data = allocate(SMALL_MEM_POOL_SIZE);
+      pool.size = SMALL_MEM_POOL_SIZE;
+    }
+    int8_t* const chunk = pool.data;
+    pool.data = chunk + padded;
+    pool.size -= padded;
+    return chunk;
+  }
+
   int8_t* allocateCountDistinctBuffer(const size_t num_bytes,
                                       const size_t thread_idx = 0) {
     int8_t* buffer = allocate(num_bytes, thread_idx);
@@ -267,6 +311,10 @@ class RowSetMemoryOwner final : public SimpleAllocator, boost::noncopyable {
   size_t arena_block_size_;      // for cloning
   std::unique_ptr<Arena> allocator_;
 
+  // Small memory pools that get memory from the base arena and are used
+  // for lock-free allocation of small memory batches in execution kernels.
+  std::vector<ThreadMemPool> small_mem_pools_;
+
   mutable std::mutex state_mutex_;
 
   friend class ResultSet;
diff --git a/omniscidb/Shared/SimpleAllocator.h b/omniscidb/Shared/SimpleAllocator.h
@@ -22,4 +22,12 @@ class SimpleAllocator {
 
  public:
   virtual int8_t* allocate(const size_t num_bytes, const size_t thread_idx = 0) = 0;
+  // Allocation entry point intended for execution kernels requesting small memory
+  // batches. Callers must never use the same thread_idx value from different threads,
+  // which is what lets implementations serve requests from per-thread pools without
+  // locking. Implementations may fall back to a regular allocation, e.g. for big
+  // chunks or unexpected thread indexes. The default simply forwards to allocate().
+  virtual int8_t* allocateSmallMtNoLock(size_t size, size_t thread_idx = 0) {
+    return allocate(size);
+  }
 };
diff --git a/omniscidb/Shared/quantile.h b/omniscidb/Shared/quantile.h
@@ -11,6 +11,8 @@
 
 #include "IR/OpTypeEnums.h"
 
+#include <tbb/task_arena.h>
+
 #include <algorithm>
 #include <cmath>
 #include <cstdint>
@@ -30,6 +32,8 @@ class ChunkedArray {
     size_t max_elems;
   };
 
+  using ChunkVector = std::vector<Chunk>;
+
   // Random access iterator to be used with std::nth_element.
   template <typename T>
   class Iterator {
@@ -40,7 +44,7 @@ class ChunkedArray {
     typedef T& reference;
     typedef std::random_access_iterator_tag iterator_category;
 
-    Iterator(const std::vector<Chunk>* chunks, size_t chunk_idx, size_t chunk_offs)
+    Iterator(const ChunkVector* chunks, size_t chunk_idx, size_t chunk_offs)
         : chunks_(chunks), chunk_idx_(chunk_idx), chunk_offs_(chunk_offs) {}
 
     Iterator(const Iterator& other) = default;
@@ -166,7 +170,7 @@ class ChunkedArray {
     }
 
    private:
-    const std::vector<Chunk>* chunks_;
+    const ChunkVector* chunks_;
     // Current chunk index. Can be equal to size of chunks_ vector for `end` iterator.
     size_t chunk_idx_;
     // Offset in the current chunk. Should always be less than chunk size when the
@@ -180,11 +184,16 @@ class ChunkedArray {
   void push(T value) {
     // Check if we need to allocate a new chunk.
     if (chunks_.empty() || cur_idx_ == chunks_.back().max_elems) {
-      // Allocator is most probably a RowSetMemoryOwner object. It is not supposed to be
-      // used to allocate very small objects, so we start with 1 KB and double it each
-      // time with 64KB limit.
+      if (thread_idx_ < 0) {
+        thread_idx_ = tbb::this_task_arena::current_thread_index();
+      } else if (thread_idx_ != tbb::this_task_arena::current_thread_index()) {
+        // Pushing elements from different threads is not allowed because it can
+        // cause memory corruption.
+        abort();
+      }
       size_t size_to_allocate = std::max((size_t)64, (size_t)1 << chunks_.size()) << 10;
-      Chunk chunk{allocator_->allocate(size_to_allocate), size_to_allocate / sizeof(T)};
+      Chunk chunk{allocator_->allocateSmallMtNoLock(size_to_allocate, thread_idx_),
+                  size_to_allocate / sizeof(T)};
       chunks_.emplace_back(chunk);
       cur_idx_ = 0;
     }
@@ -243,9 +252,13 @@ class ChunkedArray {
   SimpleAllocator* allocator_;
   // All chunks except the last one should be full, i.e. they hold
   // chunk.max_elems elements.
-  std::vector<Chunk> chunks_;
+  ChunkVector chunks_;
   // Insertion position in the last chunk.
   size_t cur_idx_;
+  // Thread index working with this quantile object. We assume elements are pushed
+  // by a single thread only. Index is determined on the first push. Merge can be
+  // done from different threads.
+  int thread_idx_ = -1;
 };
 
 class Quantile {