Commit acaf60b

benchmark workload redesign
Changed the existing workload to perform multiple alloc/free operations in a row. Added a workload that first increases the number of allocations and then decreases it.
1 parent dc6c91c commit acaf60b
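
The new peak workload drives the number of live allocations along a triangular schedule. A minimal standalone sketch of that schedule (not part of the patch; the helper name and sample sizes are illustrative, but the integer formula matches prepareWorkload() in the diff below):

#include <cstdint>
#include <cstdio>

// Hypothetical helper mirroring the target_allocation formula used by
// peak_alloc_benchmark::prepareWorkload() in this commit.
static int64_t targetAllocs(int64_t j, int64_t iterations, int64_t max_allocs) {
    if (j < iterations / 2) {
        return 2 * max_allocs * j / iterations; // first half: ramp up
    }
    return -2 * max_allocs * j / iterations + 2 * max_allocs; // second half: ramp down
}

int main() {
    const int64_t iterations = 10, max_allocs = 100;
    for (int64_t j = 0; j < iterations; j++) {
        std::printf("step %2lld -> target %3lld allocations\n", (long long)j,
                    (long long)targetAllocs(j, iterations, max_allocs));
    }
    return 0;
}

With these sample values the target rises 0, 20, 40, 60, 80, peaks at 100 mid-run, then falls back to 20, so allocator behavior is exercised both while the footprint grows and while it shrinks.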

2 files changed: +231 −21 lines changed


benchmark/benchmark.cpp

Lines changed: 67 additions & 6 deletions
@@ -32,10 +32,9 @@
 // The exact meaning of each argument depends on the benchmark, allocator, and size components used.
 // Refer to the 'argsName()' function in each component to find detailed descriptions of these arguments.
 
-template <size_t max_threads = 12>
 static void multithreaded(benchmark::internal::Benchmark *benchmark) {
     benchmark->Threads(1);
-    benchmark->DenseThreadRange(4, max_threads, 4);
+    benchmark->Threads(4);
 }
 
 static void singlethreaded(benchmark::internal::Benchmark *benchmark) {
@@ -92,16 +91,14 @@ UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, disjoint_pool_fix,
                               pool_allocator<disjoint_pool<os_provider>>);
 UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, disjoint_pool_fix)
     ->Apply(&default_multiple_alloc_fix_size)
-    // Limit benchmarks to 4 threads, as the disjoint pool scales poorly with higher thread counts.
-    ->Apply(&multithreaded<4>);
+    ->Apply(&multithreaded);
 
 UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark,
                               disjoint_pool_uniform, uniform_alloc_size,
                               pool_allocator<disjoint_pool<os_provider>>);
 UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, disjoint_pool_uniform)
     ->Apply(&default_multiple_alloc_uniform_size)
-    // Limit benchmarks to 4 threads, as the disjoint pool scales poorly with higher thread counts.
-    ->Apply(&multithreaded<4>);
+    ->Apply(&multithreaded);
 
 #ifdef UMF_POOL_JEMALLOC_ENABLED
 UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, jemalloc_pool_fix,
@@ -159,6 +156,70 @@ UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, fixed_provider)
     // reduce iterations, to match os_provider benchmark
     ->Iterations(50000);
 
+// peak
+UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, glibc_fix, fixed_alloc_size,
+                              glibc_malloc);
+
+UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, glibc_fix)
+    ->Apply(&default_multiple_alloc_fix_size)
+    ->Apply(&multithreaded);
+
+UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, glibc_uniform,
+                              uniform_alloc_size, glibc_malloc);
+UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, glibc_uniform)
+    ->Apply(&default_multiple_alloc_uniform_size)
+    ->Apply(&multithreaded);
+
+UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, disjoint_pool_fix,
+                              fixed_alloc_size,
+                              pool_allocator<disjoint_pool<os_provider>>);
+UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, disjoint_pool_fix)
+    ->Apply(&default_multiple_alloc_fix_size)
+    ->Apply(&multithreaded);
+
+UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, disjoint_pool_uniform,
+                              uniform_alloc_size,
+                              pool_allocator<disjoint_pool<os_provider>>);
+UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, disjoint_pool_uniform)
+    ->Apply(&default_multiple_alloc_uniform_size)
+    ->Apply(&multithreaded);
+
+#ifdef UMF_POOL_JEMALLOC_ENABLED
+UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, jemalloc_pool_fix,
+                              fixed_alloc_size,
+                              pool_allocator<jemalloc_pool<os_provider>>);
+UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, jemalloc_pool_fix)
+    ->Apply(&default_multiple_alloc_fix_size)
+    ->Apply(&multithreaded);
+
+UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, jemalloc_pool_uniform,
+                              uniform_alloc_size,
+                              pool_allocator<jemalloc_pool<os_provider>>);
+UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, jemalloc_pool_uniform)
+    ->Apply(&default_multiple_alloc_uniform_size)
+    ->Apply(&multithreaded);
+
+#endif
+
+#ifdef UMF_POOL_SCALABLE_ENABLED
+UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, scalable_pool_fix,
+                              fixed_alloc_size,
+                              pool_allocator<scalable_pool<os_provider>>);
+
+UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, scalable_pool_fix)
+    ->Apply(&default_multiple_alloc_fix_size)
+    ->Apply(&multithreaded);
+
+UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, scalable_pool_uniform,
+                              uniform_alloc_size,
+                              pool_allocator<scalable_pool<os_provider>>);
+
+UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, scalable_pool_uniform)
+    ->Apply(&default_multiple_alloc_uniform_size)
+    ->Apply(&multithreaded);
+
+#endif
+
 //BENCHMARK_MAIN();
 int main(int argc, char **argv) {
     if (initAffinityMask()) {

benchmark/benchmark.hpp

Lines changed: 164 additions & 15 deletions
@@ -70,6 +70,7 @@
  * - Additional benchmarking scenarios can be created by extending `benchmark_interface`.
  */
 
+#include <list>
 #include <malloc.h>
 #include <random>
 
@@ -86,6 +87,7 @@ struct alloc_data {
 };
 
 struct next_alloc_data {
+    bool alloc; // true if allocation, false if deallocation
     size_t offset;
     size_t size;
 };
@@ -288,18 +290,17 @@ template <
     typename =
         std::enable_if_t<std::is_base_of<allocator_interface, Alloc>::value>>
 class multiple_malloc_free_benchmark : public benchmark_interface<Size, Alloc> {
-    using distribution = std::uniform_int_distribution<size_t>;
+  protected:
     template <class T> using vector2d = std::vector<std::vector<T>>;
     using base = benchmark_interface<Size, Alloc>;
-
     int allocsPerIterations = 10;
     bool thread_local_allocations = true;
     size_t max_allocs = 0;
 
     vector2d<alloc_data> allocations;
     vector2d<next_alloc_data> next;
     using next_alloc_data_iterator =
-        std::vector<next_alloc_data>::const_iterator;
+        typename std::vector<next_alloc_data>::const_iterator;
     std::vector<std::unique_ptr<next_alloc_data_iterator>> next_iter;
     int64_t iterations;
 
@@ -386,15 +387,20 @@ class multiple_malloc_free_benchmark : public benchmark_interface<Size, Alloc> {
         auto tid = state.thread_index();
         auto &allocation = allocations[tid];
         auto &iter = next_iter[tid];
+
         for (int i = 0; i < allocsPerIterations; i++) {
             auto &n = *(*iter)++;
             auto &alloc = allocation[n.offset];
-            base::allocator.benchFree(alloc.ptr, alloc.size);
-            alloc.size = n.size;
-            alloc.ptr = base::allocator.benchAlloc(alloc.size);
-
-            if (alloc.ptr == NULL) {
-                state.SkipWithError("allocation failed");
+            if (n.alloc) {
+                alloc.ptr = base::allocator.benchAlloc(n.size);
+                if (alloc.ptr == NULL) {
+                    state.SkipWithError("allocation failed");
+                }
+                alloc.size = n.size;
+            } else {
+                base::allocator.benchFree(alloc.ptr, alloc.size);
+                alloc.ptr = NULL;
+                alloc.size = 0;
             }
         }
     }
@@ -412,13 +418,14 @@ class multiple_malloc_free_benchmark : public benchmark_interface<Size, Alloc> {
     }
 
   private:
-    void prealloc(benchmark::State &state) {
+    virtual void prealloc(benchmark::State &state) {
         auto tid = state.thread_index();
         auto &i = allocations[tid];
         i.resize(max_allocs);
         auto sizeGenerator = base::alloc_sizes[tid];
 
-        for (size_t j = 0; j < max_allocs; j++) {
+        // Preallocate half of the available slots for allocations.
+        for (size_t j = 0; j < max_allocs / 2; j++) {
             auto size = sizeGenerator.nextSize();
             i[j].ptr = base::allocator.benchAlloc(size);
             if (i[j].ptr == NULL) {
@@ -441,20 +448,162 @@ class multiple_malloc_free_benchmark : public benchmark_interface<Size, Alloc> {
         }
     }
 
-    void prepareWorkload(benchmark::State &state) {
+    virtual void prepareWorkload(benchmark::State &state) {
         auto tid = state.thread_index();
         auto &n = next[tid];
+
+        // Create generators for random index selection and the binary decision.
+        using distribution = std::uniform_int_distribution<size_t>;
         std::default_random_engine generator;
-        distribution dist;
+        distribution dist_offset(0, max_allocs - 1);
+        distribution dist_opt_type(0, 1);
         generator.seed(0);
-        dist.param(distribution::param_type(0, max_allocs - 1));
+
         auto sizeGenerator = base::alloc_sizes[tid];
+        std::vector<size_t> free;
+        std::vector<size_t> allocated;
+
+        // Preallocate memory: initially, half the indices are allocated.
+        // See the prealloc() function.
+        size_t i = 0;
+        for (; i < max_allocs / 2; i++) {
+            allocated.push_back(i);
+        }
+        // The remaining indices are marked as free.
+        for (; i < max_allocs; i++) {
+            free.push_back(i);
+        }
 
         n.clear();
         for (int64_t j = 0; j < state.max_iterations * allocsPerIterations;
              j++) {
-            n.push_back({dist(generator), sizeGenerator.nextSize()});
+            // Decide whether to allocate or free:
+            // - If no allocations exist, an allocation is forced.
+            // - If the maximum number of allocations is reached, a free is forced.
+            // - Otherwise, use a binary random choice (0 or 1).
+            if (allocated.empty() ||
+                (dist_opt_type(generator) == 0 && !free.empty())) {
+                // Allocation: pick a random free slot and mark it allocated.
+                std::swap(free[dist_offset(generator) % free.size()],
+                          free.back());
+                auto offset = free.back();
+                free.pop_back();
+
+                n.push_back({true, offset, sizeGenerator.nextSize()});
+                allocated.push_back(offset);
+            } else {
+                // Free: pick a random allocated slot and mark it free.
+                std::swap(allocated[dist_offset(generator) % allocated.size()],
+                          allocated.back());
+                auto offset = allocated.back();
+                allocated.pop_back();
+
+                n.push_back({false, offset, 0});
+                free.push_back(offset);
+            }
         }
+
         next_iter[tid] = std::make_unique<next_alloc_data_iterator>(n.cbegin());
     }
 };
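
Both workload generators pick a random slot from the free/allocated pools with the swap-with-back pattern seen above, which removes a random element in O(1) instead of the O(n) shift a mid-vector erase would cost. A minimal standalone sketch of that pattern (the helper name is illustrative, not from the patch):

#include <cstdio>
#include <random>
#include <vector>

// Swap a randomly chosen element to the back, pop it, and return it.
static size_t takeRandom(std::vector<size_t> &pool,
                         std::default_random_engine &gen) {
    std::uniform_int_distribution<size_t> dist(0, pool.size() - 1);
    std::swap(pool[dist(gen)], pool.back());
    size_t idx = pool.back();
    pool.pop_back();
    return idx;
}

int main() {
    std::default_random_engine gen(0); // fixed seed, as in the patch
    std::vector<size_t> free = {0, 1, 2, 3, 4, 5, 6, 7};
    while (!free.empty()) {
        std::printf("picked slot %zu\n", takeRandom(free, gen));
    }
    return 0;
}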
+// This class benchmarks performance by randomly allocating and freeing memory.
+// Initially, it slowly increases the memory footprint, and later decreases it.
+template <
+    typename Size, typename Alloc,
+    typename =
+        std::enable_if_t<std::is_base_of<alloc_size_interface, Size>::value>,
+    typename =
+        std::enable_if_t<std::is_base_of<allocator_interface, Alloc>::value>>
+class peak_alloc_benchmark
+    : public multiple_malloc_free_benchmark<Size, Alloc> {
+    using base = multiple_malloc_free_benchmark<Size, Alloc>;
+    virtual void prepareWorkload(benchmark::State &state) override {
+        // Retrieve the thread index and corresponding operation buffer.
+        auto tid = state.thread_index();
+        auto &n = this->next[tid];
+
+        // Set up the random generators for index selection and decision making.
+        std::default_random_engine generator;
+        std::uniform_int_distribution<size_t> dist_offset(0,
+                                                          this->max_allocs - 1);
+        std::uniform_real_distribution<double> dist_opt_type(0, 1);
+        generator.seed(0);
+        auto sizeGenerator = this->alloc_sizes[tid];
+
+        n.clear();
+        std::vector<size_t> free;
+        std::vector<size_t> allocated;
+
+        // Initially, all indices are available.
+        for (size_t i = 0; i < this->max_allocs; i++) {
+            free.push_back(i);
+        }
+
+        // Total number of allocation/free operations to simulate.
+        int64_t iterations = state.max_iterations * this->allocsPerIterations;
+        for (int64_t j = 0; j < iterations; j++) {
+            int64_t target_allocation;
+            int64_t max_allocs = static_cast<int64_t>(this->max_allocs);
+
+            // Determine the target number of allocations based on the progress
+            // of the iterations. In the first half of the iterations, the target
+            // allocation increases linearly; in the second half, it decreases linearly.
+            if (j < iterations / 2) {
+                target_allocation = 2 * max_allocs * j / iterations;
+            } else {
+                target_allocation =
+                    -2 * max_allocs * j / iterations + 2 * max_allocs;
+            }
+
+            // x represents the gap between the target and current allocations.
+            auto x = static_cast<double>(target_allocation -
+                                         static_cast<double>(allocated.size()));
+
+            // Apply a normal CDF to x with a high sigma to create a smooth, gradual bias.
+            // This mechanism gently nudges the probability toward allocation if behind
+            // target, and toward deallocation if above, while keeping fluctuations modest.
+            const double sigma = 1000;
+            auto cdf = normalCDF(x, sigma);
+
+            // Decide whether to allocate or free:
+            // - If no allocations exist, an allocation is forced.
+            // - If the maximum number of allocations is reached, a free is forced.
+            // - Otherwise, choose whether to allocate or free based on the computed probability.
+            if (allocated.empty() ||
+                (!free.empty() && cdf > dist_opt_type(generator))) {
+                // Allocation
+                std::swap(free[dist_offset(generator) % free.size()],
+                          free.back());
+                auto offset = free.back();
+                free.pop_back();
+                n.push_back({true, offset, sizeGenerator.nextSize()});
+                allocated.push_back(offset);
+            } else {
+                // Free
+                std::swap(allocated[dist_offset(generator) % allocated.size()],
+                          allocated.back());
+                auto offset = allocated.back();
+                allocated.pop_back();
+                n.push_back({false, offset, 0});
+                free.push_back(offset);
+            }
+        }
+
+        this->next_iter[tid] =
+            std::make_unique<std::vector<next_alloc_data>::const_iterator>(
+                n.cbegin());
+    }
+
+    virtual void prealloc(benchmark::State &state) {
+        // The peak workload starts with an empty footprint, so only the slot
+        // vector is resized; nothing is preallocated.
+        auto tid = state.thread_index();
+        auto &i = base::allocations[tid];
+        i.resize(base::max_allocs);
+    }
+    virtual std::string name() { return base::base::name() + "/peak_alloc"; }
+
+  private:
+    // Function to calculate the CDF of a normal distribution.
+    double normalCDF(double x, double sigma = 1.0, double mu = 0.0) {
+        return 0.5 * (1 + std::erf((x - mu) / (sigma * std::sqrt(2.0))));
+    }
+};
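
The normalCDF() above is the standard normal CDF, 0.5 * (1 + erf((x - mu) / (sigma * sqrt(2)))). A minimal standalone sketch (reusing the patch's formula; the sample gap values are illustrative) of why sigma = 1000 gives only a gentle bias: the allocate probability stays near 0.5 until the gap between the target and current allocation counts reaches a few thousand slots.

#include <cmath>
#include <cstdio>

// Same formula as normalCDF() in the patch above.
static double normalCDF(double x, double sigma = 1.0, double mu = 0.0) {
    return 0.5 * (1 + std::erf((x - mu) / (sigma * std::sqrt(2.0))));
}

int main() {
    const double sigma = 1000; // value used by peak_alloc_benchmark
    const double gaps[] = {-3000, -1000, 0, 1000, 3000};
    for (double gap : gaps) {
        // Prints ~0.001, ~0.159, 0.500, ~0.841, ~0.999: the 3-, 1-, and
        // 0-sigma points of the standard normal CDF.
        std::printf("gap %+6.0f -> P(alloc) = %.3f\n", gap,
                    normalCDF(gap, sigma));
    }
    return 0;
}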
