forked from rapidsai/rmm
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathasync_priming_bench.cpp
More file actions
172 lines (137 loc) · 5.17 KB
/
async_priming_bench.cpp
File metadata and controls
172 lines (137 loc) · 5.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
/*
* SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/
#include <rmm/cuda_device.hpp>
#include <rmm/mr/cuda_async_memory_resource.hpp>
#include <benchmark/benchmark.h>
#include <benchmarks/utilities/cxxopts.hpp>
#include <chrono>
#include <cstddef>
#include <iostream>
#include <memory>
#include <vector>
namespace {
/**
 * @brief Factory function to create a cuda_async_memory_resource with priming
 *
 * The pool is primed by requesting an initial size of half the device's
 * total memory, so allocations up to that size avoid first-touch overhead.
 *
 * @return Shared pointer to the primed memory resource.
 */
inline auto make_cuda_async_primed()
{
  auto const [free, total] = rmm::available_device_memory();
  static_cast<void>(free);  // only the total is used for sizing; silence unused-binding warning
  auto const pool_size = total / 2;
  return std::make_shared<rmm::mr::cuda_async_memory_resource>(pool_size);
}
/**
* @brief Factory function to create a cuda_async_memory_resource without priming
*/
inline auto make_cuda_async_unprimed()
{
return std::make_shared<rmm::mr::cuda_async_memory_resource>();
}
/**
 * @brief Benchmark to measure the impact of async allocator priming
 *
 * Each iteration performs two rounds of @c num_allocations allocations of
 * ~0.9% of total device memory each, with a full deallocation between rounds,
 * and reports three counters:
 *  - "latency_to_first_ns": time from iteration start until the first
 *    allocation (plus a device synchronize) completes,
 *  - "first_round_throughput": bytes/second for the first (cold) round,
 *  - "second_round_throughput": bytes/second for the second round, which can
 *    reuse memory already returned to the pool.
 *
 * NOTE: counters are overwritten each iteration, so the reported values are
 * those of the last iteration.
 *
 * @param state Google Benchmark state driving the iterations.
 * @param factory Callable returning a shared_ptr to the memory resource under test.
 */
template <typename MRFactoryFunc>
void BM_AsyncPrimingImpact(benchmark::State& state, MRFactoryFunc factory)
{
  auto const [free, total] = rmm::available_device_memory();
  static_cast<void>(free);  // only the total is used for sizing; silence unused-binding warning
  auto const allocation_size = static_cast<std::size_t>(total * 0.009);  // 0.9% of total memory
  constexpr std::size_t num_allocations = 100;

  // Create memory resource (construction/priming cost is outside the timed loop)
  auto mr = factory();

  // Storage for allocations; reserve once so push_back never reallocates mid-measurement
  std::vector<void*> allocations;
  allocations.reserve(num_allocations);

  for (auto _ : state) {
    // Measure latency to first allocation
    auto const start_time = std::chrono::high_resolution_clock::now();

    // First allocation - measure latency to this specific call
    allocations.push_back(mr->allocate_sync(allocation_size));
    cudaDeviceSynchronize();
    auto const first_allocation_time = std::chrono::high_resolution_clock::now();

    // Continue with remaining allocations in first round
    for (std::size_t i = 1; i < num_allocations; ++i) {
      allocations.push_back(mr->allocate_sync(allocation_size));
    }
    cudaDeviceSynchronize();
    auto const first_round_end = std::chrono::high_resolution_clock::now();

    // Deallocate all so the second round can draw from the warmed pool
    for (auto* ptr : allocations) {
      mr->deallocate_sync(ptr, allocation_size);
    }
    allocations.clear();

    // Second round of allocations
    for (std::size_t i = 0; i < num_allocations; ++i) {
      allocations.push_back(mr->allocate_sync(allocation_size));
    }
    cudaDeviceSynchronize();
    auto const second_round_end = std::chrono::high_resolution_clock::now();

    // Calculate metrics
    auto const latency_to_first =
      std::chrono::duration_cast<std::chrono::nanoseconds>(first_allocation_time - start_time)
        .count();
    auto const first_round_duration_ns =
      std::chrono::duration_cast<std::chrono::nanoseconds>(first_round_end - start_time).count();
    auto const second_round_duration_ns =
      std::chrono::duration_cast<std::chrono::nanoseconds>(second_round_end - first_round_end)
        .count();

    // Calculate throughput (bytes per second); guard against a zero-length
    // interval to avoid division by zero on pathological clocks.
    auto const round_bytes = static_cast<double>(num_allocations * allocation_size);
    auto const first_round_throughput =
      first_round_duration_ns > 0 ? (round_bytes * 1e9) / first_round_duration_ns : 0.0;
    auto const second_round_throughput =
      second_round_duration_ns > 0 ? (round_bytes * 1e9) / second_round_duration_ns : 0.0;

    // Set benchmark counters (last iteration wins)
    state.counters["latency_to_first_ns"]    = latency_to_first;
    state.counters["first_round_throughput"] = first_round_throughput;
    state.counters["second_round_throughput"] = second_round_throughput;

    // Clean up for next iteration
    for (auto* ptr : allocations) {
      mr->deallocate_sync(ptr, allocation_size);
    }
    allocations.clear();
  }
}
/**
 * @brief Benchmark to measure construction time with and without priming
 *
 * Times a single call to @p factory per iteration and reports it as the
 * "construction_time_ns" counter (overwritten each iteration, so the last
 * iteration's value is reported). The resource's destruction also runs inside
 * the loop but after the end timestamp, so it is excluded from the counter.
 *
 * @param state Google Benchmark state driving the iterations.
 * @param factory Callable returning a shared_ptr to the memory resource under test.
 */
template <typename MRFactoryFunc>
void BM_AsyncConstructionTime(benchmark::State& state, MRFactoryFunc factory)
{
  for (auto _ : state) {
    auto const start_time = std::chrono::high_resolution_clock::now();
    auto mr = factory();
    benchmark::DoNotOptimize(mr);  // ensure the constructed resource is observed before timing ends
    auto const end_time = std::chrono::high_resolution_clock::now();
    auto const construction_time =
      std::chrono::duration_cast<std::chrono::nanoseconds>(end_time - start_time).count();
    state.counters["construction_time_ns"] = construction_time;
  }
}
} // namespace
// Register benchmarks
// Each factory is registered for both benchmarks so the primed and unprimed
// resources can be compared directly in the same run.
BENCHMARK_CAPTURE(BM_AsyncPrimingImpact, primed, &make_cuda_async_primed)
->Unit(benchmark::kMicrosecond);
BENCHMARK_CAPTURE(BM_AsyncPrimingImpact, unprimed, &make_cuda_async_unprimed)
->Unit(benchmark::kMicrosecond);
// Construction-time benchmarks: primed construction is expected to be slower
// since it requests an initial pool up front.
BENCHMARK_CAPTURE(BM_AsyncConstructionTime, primed, &make_cuda_async_primed)
->Unit(benchmark::kMicrosecond);
BENCHMARK_CAPTURE(BM_AsyncConstructionTime, unprimed, &make_cuda_async_unprimed)
->Unit(benchmark::kMicrosecond);
/**
 * @brief Entry point: parses options and runs the registered benchmarks.
 *
 * @return 0 on success (or when only help was requested), 1 on error.
 */
int main(int argc, char** argv)
{
  try {
    // Initialize first so google-benchmark consumes its own flags before cxxopts parses.
    ::benchmark::Initialize(&argc, argv);
    cxxopts::Options options("async_priming_bench", "Benchmark async allocator priming impact");
    options.add_options()("h,help", "Print usage");
    auto args = options.parse(argc, argv);
    if (args.count("help")) {
      std::cout << options.help() << std::endl;
      return 0;
    }
    ::benchmark::RunSpecifiedBenchmarks();
    ::benchmark::Shutdown();
  } catch (std::exception const& e) {
    // Error diagnostics belong on stderr, not stdout.
    std::cerr << "Exception caught: " << e.what() << std::endl;
    return 1;
  }
  return 0;
}