forked from rapidsai/rmm
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathasync_priming_bench.cpp
More file actions
172 lines (137 loc) · 5.17 KB
/
async_priming_bench.cpp
File metadata and controls
172 lines (137 loc) · 5.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
/*
* SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/
#include <rmm/cuda_device.hpp>
#include <rmm/mr/cuda_async_memory_resource.hpp>
#include <benchmark/benchmark.h>
#include <benchmarks/utilities/cxxopts.hpp>
#include <chrono>
#include <cstddef>
#include <iostream>
#include <memory>
#include <vector>
namespace {
/**
 * @brief Factory function to create a cuda_async_memory_resource with priming
 *
 * The pool is primed by requesting an initial size of half the device's
 * total memory, so allocations up to that size avoid first-touch overhead.
 *
 * @return Shared pointer to the primed memory resource.
 */
inline auto make_cuda_async_primed()
{
  auto const [free, total] = rmm::available_device_memory();
  static_cast<void>(free);  // only the total is used for sizing; silence unused-binding warning
  auto const pool_size = total / 2;
  return std::make_shared<rmm::mr::cuda_async_memory_resource>(pool_size);
}
/**
* @brief Factory function to create a cuda_async_memory_resource without priming
*/
inline auto make_cuda_async_unprimed()
{
return std::make_shared<rmm::mr::cuda_async_memory_resource>();
}
/**
 * @brief Benchmark to measure the impact of async allocator priming
 *
 * Each iteration performs two rounds of @c num_allocations allocations of
 * ~0.9% of total device memory each, with a full deallocation between rounds,
 * and reports three counters:
 *  - "latency_to_first_ns": time from iteration start until the first
 *    allocation (plus a device synchronize) completes,
 *  - "first_round_throughput": bytes/second for the first (cold) round,
 *  - "second_round_throughput": bytes/second for the second round, which can
 *    reuse memory already returned to the pool.
 *
 * NOTE: counters are overwritten each iteration, so the reported values are
 * those of the last iteration.
 *
 * @param state Google Benchmark state driving the iterations.
 * @param factory Callable returning a shared_ptr to the memory resource under test.
 */
template <typename MRFactoryFunc>
void BM_AsyncPrimingImpact(benchmark::State& state, MRFactoryFunc factory)
{
  auto const [free, total] = rmm::available_device_memory();
  static_cast<void>(free);  // only the total is used for sizing; silence unused-binding warning
  auto const allocation_size = static_cast<std::size_t>(total * 0.009);  // 0.9% of total memory
  constexpr std::size_t num_allocations = 100;

  // Create memory resource (construction/priming cost is outside the timed loop)
  auto mr = factory();

  // Storage for allocations; reserve once so push_back never reallocates mid-measurement
  std::vector<void*> allocations;
  allocations.reserve(num_allocations);

  for (auto _ : state) {
    // Measure latency to first allocation
    auto const start_time = std::chrono::high_resolution_clock::now();

    // First allocation - measure latency to this specific call
    allocations.push_back(mr->allocate_sync(allocation_size));
    cudaDeviceSynchronize();
    auto const first_allocation_time = std::chrono::high_resolution_clock::now();

    // Continue with remaining allocations in first round
    for (std::size_t i = 1; i < num_allocations; ++i) {
      allocations.push_back(mr->allocate_sync(allocation_size));
    }
    cudaDeviceSynchronize();
    auto const first_round_end = std::chrono::high_resolution_clock::now();

    // Deallocate all so the second round can draw from the warmed pool
    for (auto* ptr : allocations) {
      mr->deallocate_sync(ptr, allocation_size);
    }
    allocations.clear();

    // Second round of allocations
    for (std::size_t i = 0; i < num_allocations; ++i) {
      allocations.push_back(mr->allocate_sync(allocation_size));
    }
    cudaDeviceSynchronize();
    auto const second_round_end = std::chrono::high_resolution_clock::now();

    // Calculate metrics
    auto const latency_to_first =
      std::chrono::duration_cast<std::chrono::nanoseconds>(first_allocation_time - start_time)
        .count();
    auto const first_round_duration_ns =
      std::chrono::duration_cast<std::chrono::nanoseconds>(first_round_end - start_time).count();
    auto const second_round_duration_ns =
      std::chrono::duration_cast<std::chrono::nanoseconds>(second_round_end - first_round_end)
        .count();

    // Calculate throughput (bytes per second); guard against a zero-length
    // interval to avoid division by zero on pathological clocks.
    auto const round_bytes = static_cast<double>(num_allocations * allocation_size);
    auto const first_round_throughput =
      first_round_duration_ns > 0 ? (round_bytes * 1e9) / first_round_duration_ns : 0.0;
    auto const second_round_throughput =
      second_round_duration_ns > 0 ? (round_bytes * 1e9) / second_round_duration_ns : 0.0;

    // Set benchmark counters (last iteration wins)
    state.counters["latency_to_first_ns"]    = latency_to_first;
    state.counters["first_round_throughput"] = first_round_throughput;
    state.counters["second_round_throughput"] = second_round_throughput;

    // Clean up for next iteration
    for (auto* ptr : allocations) {
      mr->deallocate_sync(ptr, allocation_size);
    }
    allocations.clear();
  }
}
/**
 * @brief Benchmark to measure construction time with and without priming
 *
 * Times a single call to @p factory per iteration and reports it as the
 * "construction_time_ns" counter (overwritten each iteration, so the last
 * iteration's value is reported). The resource's destruction also runs inside
 * the loop but after the end timestamp, so it is excluded from the counter.
 *
 * @param state Google Benchmark state driving the iterations.
 * @param factory Callable returning a shared_ptr to the memory resource under test.
 */
template <typename MRFactoryFunc>
void BM_AsyncConstructionTime(benchmark::State& state, MRFactoryFunc factory)
{
  for (auto _ : state) {
    auto const start_time = std::chrono::high_resolution_clock::now();
    auto mr = factory();
    benchmark::DoNotOptimize(mr);  // ensure the constructed resource is observed before timing ends
    auto const end_time = std::chrono::high_resolution_clock::now();
    auto const construction_time =
      std::chrono::duration_cast<std::chrono::nanoseconds>(end_time - start_time).count();
    state.counters["construction_time_ns"] = construction_time;
  }
}
} // namespace
// Register benchmarks
// Each factory is registered for both benchmarks so the primed and unprimed
// resources can be compared directly in the same run.
BENCHMARK_CAPTURE(BM_AsyncPrimingImpact, primed, &make_cuda_async_primed)
->Unit(benchmark::kMicrosecond);
BENCHMARK_CAPTURE(BM_AsyncPrimingImpact, unprimed, &make_cuda_async_unprimed)
->Unit(benchmark::kMicrosecond);
// Construction-time benchmarks: primed construction is expected to be slower
// since it requests an initial pool up front.
BENCHMARK_CAPTURE(BM_AsyncConstructionTime, primed, &make_cuda_async_primed)
->Unit(benchmark::kMicrosecond);
BENCHMARK_CAPTURE(BM_AsyncConstructionTime, unprimed, &make_cuda_async_unprimed)
->Unit(benchmark::kMicrosecond);
/**
 * @brief Entry point: parses options and runs the registered benchmarks.
 *
 * @return 0 on success (or when only help was requested), 1 on error.
 */
int main(int argc, char** argv)
{
  try {
    // Initialize first so google-benchmark consumes its own flags before cxxopts parses.
    ::benchmark::Initialize(&argc, argv);
    cxxopts::Options options("async_priming_bench", "Benchmark async allocator priming impact");
    options.add_options()("h,help", "Print usage");
    auto args = options.parse(argc, argv);
    if (args.count("help")) {
      std::cout << options.help() << std::endl;
      return 0;
    }
    ::benchmark::RunSpecifiedBenchmarks();
    ::benchmark::Shutdown();
  } catch (std::exception const& e) {
    // Error diagnostics belong on stderr, not stdout.
    std::cerr << "Exception caught: " << e.what() << std::endl;
    return 1;
  }
  return 0;
}