Skip to content

Commit 653309e

Browse files
committed
benchmark workload redesign
Changed the existing workload to perform multiple alloc/free operations in a row. Added a workload that first increases the number of live allocations and then decreases it.
1 parent dc6c91c commit 653309e

File tree

2 files changed

+209
-21
lines changed

2 files changed

+209
-21
lines changed

benchmark/benchmark.cpp

Lines changed: 67 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,9 @@
3232
// The exact meaning of each argument depends on the benchmark, allocator, and size components used.
3333
// Refer to the 'argsName()' function in each component to find detailed descriptions of these arguments.
3434

35-
template <size_t max_threads = 12>
3635
static void multithreaded(benchmark::internal::Benchmark *benchmark) {
3736
benchmark->Threads(1);
38-
benchmark->DenseThreadRange(4, max_threads, 4);
37+
benchmark->Threads(4);
3938
}
4039

4140
static void singlethreaded(benchmark::internal::Benchmark *benchmark) {
@@ -92,16 +91,14 @@ UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, disjoint_pool_fix,
9291
pool_allocator<disjoint_pool<os_provider>>);
9392
UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, disjoint_pool_fix)
9493
->Apply(&default_multiple_alloc_fix_size)
95-
// Limit benchmarks to 4 threads, as the disjoint pool scales poorly with higher thread counts.
96-
->Apply(&multithreaded<4>);
94+
->Apply(&multithreaded);
9795

9896
UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark,
9997
disjoint_pool_uniform, uniform_alloc_size,
10098
pool_allocator<disjoint_pool<os_provider>>);
10199
UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, disjoint_pool_uniform)
102100
->Apply(&default_multiple_alloc_uniform_size)
103-
// Limit benchmarks to 4 threads, as the disjoint pool scales poorly with higher thread counts.
104-
->Apply(&multithreaded<4>);
101+
->Apply(&multithreaded);
105102

106103
#ifdef UMF_POOL_JEMALLOC_ENABLED
107104
UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, jemalloc_pool_fix,
@@ -159,6 +156,70 @@ UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, fixed_provider)
159156
// reduce iterations, to match os_provider benchmark
160157
->Iterations(50000);
161158

159+
// peak
160+
UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, glibc_fix, fixed_alloc_size,
161+
glibc_malloc);
162+
163+
UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, glibc_fix)
164+
->Apply(&default_multiple_alloc_fix_size)
165+
->Apply(&multithreaded);
166+
167+
UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, glibc_uniform,
168+
uniform_alloc_size, glibc_malloc);
169+
UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, glibc_uniform)
170+
->Apply(&default_multiple_alloc_uniform_size)
171+
->Apply(&multithreaded);
172+
173+
UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, disjoint_pool_fix,
174+
fixed_alloc_size,
175+
pool_allocator<disjoint_pool<os_provider>>);
176+
UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, disjoint_pool_fix)
177+
->Apply(&default_multiple_alloc_fix_size)
178+
->Apply(&multithreaded);
179+
180+
UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, disjoint_pool_uniform,
181+
uniform_alloc_size,
182+
pool_allocator<disjoint_pool<os_provider>>);
183+
UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, disjoint_pool_uniform)
184+
->Apply(&default_multiple_alloc_uniform_size)
185+
->Apply(&multithreaded);
186+
187+
#ifdef UMF_POOL_JEMALLOC_ENABLED
188+
UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, jemalloc_pool_fix,
189+
fixed_alloc_size,
190+
pool_allocator<jemalloc_pool<os_provider>>);
191+
UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, jemalloc_pool_fix)
192+
->Apply(&default_multiple_alloc_fix_size)
193+
->Apply(&multithreaded);
194+
195+
UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, jemalloc_pool_uniform,
196+
uniform_alloc_size,
197+
pool_allocator<jemalloc_pool<os_provider>>);
198+
UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, jemalloc_pool_uniform)
199+
->Apply(&default_multiple_alloc_uniform_size)
200+
->Apply(&multithreaded);
201+
202+
#endif
203+
204+
#ifdef UMF_POOL_SCALABLE_ENABLED
205+
UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, scalable_pool_fix,
206+
fixed_alloc_size,
207+
pool_allocator<scalable_pool<os_provider>>);
208+
209+
UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, scalable_pool_fix)
210+
->Apply(&default_multiple_alloc_fix_size)
211+
->Apply(&multithreaded);
212+
213+
UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, scalable_pool_uniform,
214+
uniform_alloc_size,
215+
pool_allocator<scalable_pool<os_provider>>);
216+
217+
UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, scalable_pool_uniform)
218+
->Apply(&default_multiple_alloc_uniform_size)
219+
->Apply(&multithreaded);
220+
221+
#endif
222+
162223
//BENCHMARK_MAIN();
163224
int main(int argc, char **argv) {
164225
if (initAffinityMask()) {

benchmark/benchmark.hpp

Lines changed: 142 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@
7070
* - Additional benchmarking scenarios can be created by extending `benchmark_interface`.
7171
*/
7272

73+
#include <list>
7374
#include <malloc.h>
7475
#include <random>
7576

@@ -86,6 +87,7 @@ struct alloc_data {
8687
};
8788

8889
struct next_alloc_data {
90+
bool alloc; // true if allocation, false if deallocation
8991
size_t offset;
9092
size_t size;
9193
};
@@ -288,18 +290,17 @@ template <
288290
typename =
289291
std::enable_if_t<std::is_base_of<allocator_interface, Alloc>::value>>
290292
class multiple_malloc_free_benchmark : public benchmark_interface<Size, Alloc> {
291-
using distribution = std::uniform_int_distribution<size_t>;
293+
protected:
292294
template <class T> using vector2d = std::vector<std::vector<T>>;
293295
using base = benchmark_interface<Size, Alloc>;
294-
295296
int allocsPerIterations = 10;
296297
bool thread_local_allocations = true;
297298
size_t max_allocs = 0;
298299

299300
vector2d<alloc_data> allocations;
300301
vector2d<next_alloc_data> next;
301302
using next_alloc_data_iterator =
302-
std::vector<next_alloc_data>::const_iterator;
303+
typename std::vector<next_alloc_data>::const_iterator;
303304
std::vector<std::unique_ptr<next_alloc_data_iterator>> next_iter;
304305
int64_t iterations;
305306

@@ -386,15 +387,20 @@ class multiple_malloc_free_benchmark : public benchmark_interface<Size, Alloc> {
386387
auto tid = state.thread_index();
387388
auto &allocation = allocations[tid];
388389
auto &iter = next_iter[tid];
390+
389391
for (int i = 0; i < allocsPerIterations; i++) {
390392
auto &n = *(*iter)++;
391393
auto &alloc = allocation[n.offset];
392-
base::allocator.benchFree(alloc.ptr, alloc.size);
393-
alloc.size = n.size;
394-
alloc.ptr = base::allocator.benchAlloc(alloc.size);
395-
396-
if (alloc.ptr == NULL) {
397-
state.SkipWithError("allocation failed");
394+
if (n.alloc) {
395+
alloc.ptr = base::allocator.benchAlloc(n.size);
396+
if (alloc.ptr == NULL) {
397+
state.SkipWithError("allocation failed");
398+
}
399+
alloc.size = n.size;
400+
} else {
401+
base::allocator.benchFree(alloc.ptr, alloc.size);
402+
alloc.ptr = NULL;
403+
alloc.size = 0;
398404
}
399405
}
400406
}
@@ -412,13 +418,13 @@ class multiple_malloc_free_benchmark : public benchmark_interface<Size, Alloc> {
412418
}
413419

414420
private:
415-
void prealloc(benchmark::State &state) {
421+
virtual void prealloc(benchmark::State &state) {
416422
auto tid = state.thread_index();
417423
auto &i = allocations[tid];
418424
i.resize(max_allocs);
419425
auto sizeGenerator = base::alloc_sizes[tid];
420426

421-
for (size_t j = 0; j < max_allocs; j++) {
427+
for (size_t j = 0; j < max_allocs / 2; j++) {
422428
auto size = sizeGenerator.nextSize();
423429
i[j].ptr = base::allocator.benchAlloc(size);
424430
if (i[j].ptr == NULL) {
@@ -441,20 +447,141 @@ class multiple_malloc_free_benchmark : public benchmark_interface<Size, Alloc> {
441447
}
442448
}
443449

444-
void prepareWorkload(benchmark::State &state) {
450+
virtual void prepareWorkload(benchmark::State &state) {
445451
auto tid = state.thread_index();
446452
auto &n = next[tid];
453+
using distribution = std::uniform_int_distribution<size_t>;
447454
std::default_random_engine generator;
448-
distribution dist;
455+
distribution dist_offset(0, max_allocs - 1);
456+
distribution dist_opt_type(0, 1);
449457
generator.seed(0);
450-
dist.param(distribution::param_type(0, max_allocs - 1));
451458
auto sizeGenerator = base::alloc_sizes[tid];
459+
std::vector<size_t> free;
460+
std::vector<size_t> allocated;
461+
462+
// this benchmark preallocates memory, so we start with some allocations
463+
size_t i = 0;
464+
for (; i < max_allocs / 2; i++) {
465+
allocated.push_back(i);
466+
}
467+
for (; i < max_allocs; i++) {
468+
free.push_back(i);
469+
}
452470

453471
n.clear();
454472
for (int64_t j = 0; j < state.max_iterations * allocsPerIterations;
455473
j++) {
456-
n.push_back({dist(generator), sizeGenerator.nextSize()});
474+
if (allocated.empty() ||
475+
(dist_opt_type(generator) == 0 && !free.empty())) {
476+
477+
std::swap(free[dist_offset(generator) % free.size()],
478+
free.back());
479+
auto offset = free.back();
480+
free.pop_back();
481+
482+
n.push_back({true, offset, sizeGenerator.nextSize()});
483+
allocated.push_back(offset);
484+
} else {
485+
std::swap(allocated[dist_offset(generator) % allocated.size()],
486+
allocated.back());
487+
auto offset = allocated.back();
488+
allocated.pop_back();
489+
490+
n.push_back({false, offset, 0});
491+
free.push_back(offset);
492+
}
457493
}
494+
458495
next_iter[tid] = std::make_unique<next_alloc_data_iterator>(n.cbegin());
459496
}
460497
};
498+
499+
// This class benchmarks performance randomly allocates and frees,
500+
// Firstly slowly increasing memory footprint, and later decreasing
501+
template <
502+
typename Size, typename Alloc,
503+
typename =
504+
std::enable_if_t<std::is_base_of<alloc_size_interface, Size>::value>,
505+
typename =
506+
std::enable_if_t<std::is_base_of<allocator_interface, Alloc>::value>>
507+
class peak_alloc_benchmark
508+
: public multiple_malloc_free_benchmark<Size, Alloc> {
509+
using base = multiple_malloc_free_benchmark<Size, Alloc>;
510+
virtual void prepareWorkload(benchmark::State &state) override {
511+
512+
auto tid = state.thread_index();
513+
auto &n = this->next[tid];
514+
std::default_random_engine generator;
515+
std::uniform_int_distribution<size_t> dist_offset(0,
516+
this->max_allocs - 1);
517+
std::uniform_real_distribution<double> dist_opt_type(0, 1);
518+
generator.seed(0);
519+
auto sizeGenerator = this->alloc_sizes[tid];
520+
521+
n.clear();
522+
std::vector<size_t> free;
523+
std::vector<size_t> allocated;
524+
// we start without any allocations
525+
for (size_t i = 0; i < this->max_allocs; i++) {
526+
free.push_back(i);
527+
}
528+
529+
int64_t iterations = state.max_iterations * this->allocsPerIterations;
530+
for (int64_t j = 0; j < iterations; j++) {
531+
int64_t target_allocation;
532+
int64_t max_allocs = static_cast<int64_t>(this->max_allocs);
533+
if (j < iterations / 2) {
534+
target_allocation = 2 * max_allocs * j / iterations;
535+
} else {
536+
target_allocation =
537+
-2 * max_allocs * j / iterations + 2 * max_allocs;
538+
}
539+
540+
auto x = static_cast<double>(target_allocation -
541+
static_cast<double>(allocated.size()));
542+
// high sigma value cause small changes in probability between alloc and free
543+
// based on offset x from target_allocation number.
544+
// sigma == 1000 causes that actual number of allocation are +/-40 of target number
545+
const double sigma = 1000;
546+
auto cdf = normalCDF(x, sigma);
547+
548+
if (allocated.empty() ||
549+
(!free.empty() && cdf > dist_opt_type(generator))) {
550+
// allocate
551+
std::swap(free[dist_offset(generator) % free.size()],
552+
free.back());
553+
auto offset = free.back();
554+
free.pop_back();
555+
n.push_back({true, offset, sizeGenerator.nextSize()});
556+
allocated.push_back(offset);
557+
558+
} else {
559+
// free
560+
std::swap(allocated[dist_offset(generator) % allocated.size()],
561+
allocated.back());
562+
auto offset = allocated.back();
563+
allocated.pop_back();
564+
565+
n.push_back({false, offset, 0});
566+
free.push_back(offset);
567+
}
568+
}
569+
570+
this->next_iter[tid] =
571+
std::make_unique<std::vector<next_alloc_data>::const_iterator>(
572+
n.cbegin());
573+
}
574+
575+
virtual void prealloc(benchmark::State &state) {
576+
auto tid = state.thread_index();
577+
auto &i = base::allocations[tid];
578+
i.resize(base::max_allocs);
579+
}
580+
virtual std::string name() { return base::base::name() + "/peak_alloc"; }
581+
582+
private:
583+
// Function to calculate the CDF of a normal distribution
584+
double normalCDF(double x, double sigma = 1.0, double mu = 0.0) {
585+
return 0.5 * (1 + std::erf((x - mu) / (sigma * std::sqrt(2.0))));
586+
}
587+
};

0 commit comments

Comments
 (0)