Skip to content

Commit 2505e71

Browse files
committed
[+]Feat: Implement GDRCopy support for GPU maps and add benchmarks
1 parent c8f93df commit 2505e71

14 files changed

+1030
-13
lines changed

CMakeLists.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,11 @@ if(BPFTIME_ENABLE_CUDA_ATTACH)
159159
add_compile_definitions(BPFTIME_ENABLE_CUDA_ATTACH=1)
160160
endif()
161161

162+
# When BPFTIME_ENABLE_GDRCOPY is set, define the BPFTIME_ENABLE_GDRCOPY=1
# preprocessor macro for targets created in this directory scope and report
# the choice in the configure log.
if(BPFTIME_ENABLE_GDRCOPY)
163+
add_compile_definitions(BPFTIME_ENABLE_GDRCOPY=1)
164+
message(STATUS "Building with optional GDRCopy support (BPFTIME_ENABLE_GDRCOPY=1)")
165+
endif()
166+
162167
# argparse
163168
add_subdirectory(third_party/argparse)
164169

benchmark/CMakeLists.txt

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,30 @@ target_include_directories(simple-benchmark-with-embed-ebpf-calling
99
PRIVATE
1010
${CMAKE_CURRENT_SOURCE_DIR}../vm/include
1111
${LIBBPF_INCLUDE_DIRS})
12+
13+
if(BPFTIME_ENABLE_CUDA_ATTACH)
  # Host-side GPU map microbenchmarks. Each target is built from
  # gpu/host/<target>.cpp with identical link and output settings.
  foreach(gpu_host_bench IN ITEMS
      gpu_array_map_host_perf
      gpu_per_thread_array_map_host_perf)
    add_executable(${gpu_host_bench} gpu/host/${gpu_host_bench}.cpp)
    add_dependencies(${gpu_host_bench} runtime)
    target_link_libraries(${gpu_host_bench}
      PRIVATE
        runtime
        spdlog::spdlog)
    set_target_properties(${gpu_host_bench}
      PROPERTIES
        RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/benchmark/gpu/host)
  endforeach()
endif()

benchmark/gpu/host/README.md

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# Host-side GPU map microbenchmarks
2+
3+
This directory contains **host-side** microbenchmarks for bpftime CUDA-backed maps.
4+
5+
## Prerequisites
6+
7+
- A working CUDA driver + toolkit (CUDA headers + `libcuda.so`)
8+
- Build bpftime with CUDA attach enabled
9+
- Optional (for GDRCopy speedup): install GDRCopy user library (`libgdrapi.so`) and load `gdrdrv` (creates `/dev/gdrdrv`)
10+
11+
## Build
12+
13+
From the repo root:
14+
15+
```bash
16+
cmake -S . -B build -G Ninja \
17+
-DBPFTIME_ENABLE_CUDA_ATTACH=ON \
18+
-DBPFTIME_CUDA_ROOT=/usr/local/cuda \
19+
-DBPFTIME_ENABLE_GDRCOPY=ON
20+
21+
cmake --build build -j --target gpu_array_map_host_perf gpu_per_thread_array_map_host_perf
22+
```
23+
24+
If you don’t want GDRCopy support, omit `-DBPFTIME_ENABLE_GDRCOPY=ON` (the binaries will still work, but `--gdrcopy 1` will always fall back to `cuMemcpyDtoH`).
25+
26+
## Binaries
27+
28+
- `build/benchmark/gpu/host/gpu_array_map_host_perf`
29+
- `build/benchmark/gpu/host/gpu_per_thread_array_map_host_perf`
30+
31+
## gpu_array_map_host_perf
32+
33+
Benchmarks `BPF_MAP_TYPE_GPU_ARRAY_MAP` (per-key bytes = `value_size`).
34+
35+
```bash
36+
# baseline (cuMemcpyDtoH)
37+
./build/benchmark/gpu/host/gpu_array_map_host_perf \
38+
--iters 50000 --max-entries 1024 --value-size 8 \
39+
--gdrcopy 0
40+
41+
# enable GDRCopy (hybrid policy)
42+
./build/benchmark/gpu/host/gpu_array_map_host_perf \
43+
--iters 50000 --max-entries 1024 --value-size 8 \
44+
--gdrcopy 1 --gdrcopy-max-per-key-bytes 4096
45+
```
46+
47+
## gpu_per_thread_array_map_host_perf
48+
49+
Benchmarks `BPF_MAP_TYPE_PERGPUTD_ARRAY_MAP` (per-key bytes = `value_size * thread_count`).
50+
51+
```bash
52+
# baseline (cuMemcpyDtoH)
53+
./build/benchmark/gpu/host/gpu_per_thread_array_map_host_perf \
54+
--iters 50000 --max-entries 1024 --value-size 8 --thread-count 32 \
55+
--gdrcopy 0
56+
57+
# enable GDRCopy (hybrid policy)
58+
./build/benchmark/gpu/host/gpu_per_thread_array_map_host_perf \
59+
--iters 50000 --max-entries 1024 --value-size 8 --thread-count 32 \
60+
--gdrcopy 1 --gdrcopy-max-per-key-bytes 4096
61+
```
62+
63+
## Flags
64+
65+
- `--gdrcopy 0|1`: enable/disable GDRCopy attempts
66+
- `--gdrcopy-max-per-key-bytes <N>`: skip GDRCopy when per-key bytes `> N` (use `0` to disable the threshold)
67+
68+
Notes:
69+
70+
- If GDRCopy isn’t available at runtime (missing `libgdrapi.so` or `/dev/gdrdrv`), bpftime automatically falls back to `cuMemcpyDtoH` and performance will match baseline.
Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
#include "bpftime_shm.hpp"
#include <cuda.h>
#include <cerrno>
#include <chrono>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>
10+
11+
using clock_type = std::chrono::high_resolution_clock;
12+
13+
struct bench_options {
14+
uint64_t iters = 20000;
15+
uint64_t max_entries = 1024;
16+
uint64_t value_size = 8;
17+
bool enable_gdrcopy = false;
18+
uint64_t gdrcopy_max_per_key_bytes =
19+
DEFAULT_GPU_GDRCOPY_MAX_PER_KEY_BYTES;
20+
};
21+
22+
/**
 * Parse a command-line token of the form `--<name>=<decimal>`.
 *
 * @param arg  Raw argv token to inspect.
 * @param name Option name without the leading dashes (e.g. "iters").
 * @param out  Receives the parsed value; left untouched on failure.
 * @return true only when `arg` starts with `--<name>=` and the remainder is a
 *         non-empty run of decimal digits whose value does not overflow.
 */
static bool parse_u64_arg(const char *arg, const char *name, uint64_t &out)
{
	const std::string prefix = std::string("--") + name + "=";
	if (std::strncmp(arg, prefix.c_str(), prefix.size()) != 0) {
		return false;
	}
	const char *val = arg + prefix.size();
	// strtoull() skips leading whitespace and accepts a sign ("-1" would
	// wrap to UINT64_MAX), so insist that the value starts with a digit.
	// This also rejects the empty value.
	if (*val < '0' || *val > '9') {
		return false;
	}
	char *end = nullptr;
	errno = 0; // strtoull() only reports overflow through errno
	const unsigned long long parsed = std::strtoull(val, &end, 10);
	if (errno == ERANGE || !end || *end != '\0') {
		return false;
	}
	out = parsed;
	return true;
}
40+
41+
/**
 * Build the benchmark options from the command line.
 *
 * Every option (`--iters`, `--max-entries`, `--value-size`,
 * `--gdrcopy-max-per-key-bytes`, `--gdrcopy`) is accepted both as
 * `--name=value` and as `--name value`. Values are validated by
 * parse_u64_arg(); `--gdrcopy` is treated as a boolean (non-zero enables).
 *
 * @param argc Argument count as passed to main().
 * @param argv Argument vector as passed to main(); argv[0] is ignored.
 * @return The populated options; unspecified options keep their defaults.
 *
 * Prints a diagnostic to stderr and exits with status 1 on an unknown
 * option, an option without a value, or a value that fails validation.
 */
static bench_options parse_args(int argc, char **argv)
{
	bench_options opt;
	// Parsed as an integer first so that it shares the numeric validation.
	uint64_t gdrcopy_flag = opt.enable_gdrcopy ? 1 : 0;

	struct u64_option {
		const char *name;
		uint64_t *target;
	};
	const u64_option options[] = {
		{ "iters", &opt.iters },
		{ "max-entries", &opt.max_entries },
		{ "value-size", &opt.value_size },
		{ "gdrcopy-max-per-key-bytes", &opt.gdrcopy_max_per_key_bytes },
		{ "gdrcopy", &gdrcopy_flag },
	};

	for (int i = 1; i < argc; ++i) {
		const std::string token = argv[i];
		bool handled = false;
		for (const auto &option : options) {
			const std::string flag =
				std::string("--") + option.name;
			std::string joined;
			if (token == flag) {
				// `--name value`: normalise to `--name=value`
				// so both spellings go through one validator.
				if (i + 1 >= argc) {
					std::cerr
						<< "Missing value for argument: "
						<< token << "\n";
					std::exit(1);
				}
				joined = flag + "=" + argv[++i];
			} else if (token.compare(0, flag.size() + 1,
						 flag + "=") == 0) {
				joined = token;
			} else {
				continue;
			}
			if (!parse_u64_arg(joined.c_str(), option.name,
					   *option.target)) {
				std::cerr << "Invalid value for argument: "
					  << joined << "\n";
				std::exit(1);
			}
			handled = true;
			break;
		}
		if (!handled) {
			std::cerr << "Unknown argument: " << token << "\n";
			std::exit(1);
		}
	}

	opt.enable_gdrcopy = gdrcopy_flag != 0;
	return opt;
}
84+
85+
/**
 * Advance a 32-bit linear congruential generator and return the next sample.
 *
 * @param state   Generator state; overwritten with the new state.
 * @param modulus Exclusive upper bound for the result; `0` means "no bound"
 *                and returns the raw new state.
 * @return `state % modulus` for a non-zero modulus, otherwise `state`.
 */
static uint32_t lcg_next(uint32_t &state, uint32_t modulus)
{
	constexpr uint32_t multiplier = 1664525u;
	constexpr uint32_t increment = 1013904223u;

	// Unsigned arithmetic wraps modulo 2^32, which is the LCG's modulus.
	state = multiplier * state + increment;
	return modulus != 0 ? state % modulus : state;
}
93+
94+
/**
 * Print the average latency and throughput of one benchmark phase.
 *
 * Output format: `<label>: <ns> ns/op (<ops> ops/s)` with one decimal place.
 * When `iters` is zero there is nothing to average, so
 * `<label>: n/a (0 iterations)` is printed instead of NaN.
 *
 * @param label    Name of the phase, printed verbatim.
 * @param iters    Number of operations executed during `duration`.
 * @param duration Total wall-clock time spent on those operations.
 *
 * The formatting state of std::cout is restored before returning.
 */
static void print_result(const char *label, uint64_t iters,
			 std::chrono::nanoseconds duration)
{
	if (iters == 0) {
		std::cout << label << ": n/a (0 iterations)\n";
		return;
	}

	const double ns_per_op = static_cast<double>(duration.count()) /
				 static_cast<double>(iters);
	const double ops_per_sec = 1e9 / ns_per_op;

	// Save the stream state so the fixed/precision settings do not leak
	// into whatever the caller prints next.
	const std::ios_base::fmtflags saved_flags = std::cout.flags();
	const std::streamsize saved_precision = std::cout.precision();

	std::cout.setf(std::ios::fixed, std::ios::floatfield);
	std::cout.precision(1);
	std::cout << label << ": " << ns_per_op << " ns/op"
		  << " (" << ops_per_sec << " ops/s)\n";

	std::cout.flags(saved_flags);
	std::cout.precision(saved_precision);
}
105+
106+
int main(int argc, char **argv)
107+
{
108+
auto opt = parse_args(argc, argv);
109+
110+
if (auto err = cuInit(0); err != CUDA_SUCCESS) {
111+
std::cerr << "cuInit(0) failed with error " << (int)err
112+
<< "\n";
113+
return 1;
114+
}
115+
116+
bpftime_initialize_global_shm(
117+
bpftime::shm_open_type::SHM_REMOVE_AND_CREATE);
118+
119+
auto config = bpftime::bpftime_get_agent_config();
120+
config.enable_gpu_gdrcopy = opt.enable_gdrcopy;
121+
config.gpu_gdrcopy_max_per_key_bytes = opt.gdrcopy_max_per_key_bytes;
122+
bpftime::bpftime_set_agent_config(std::move(config));
123+
124+
bpftime::bpf_map_attr attr{};
125+
attr.type =
126+
(int)bpftime::bpf_map_type::BPF_MAP_TYPE_GPU_ARRAY_MAP;
127+
attr.key_size = sizeof(uint32_t);
128+
attr.value_size = (uint32_t)opt.value_size;
129+
attr.max_ents = (uint32_t)opt.max_entries;
130+
131+
int map_fd =
132+
bpftime_maps_create(-1, "gpu_array_map_host_perf", attr);
133+
if (map_fd < 0) {
134+
std::perror("bpftime_maps_create");
135+
return 1;
136+
}
137+
138+
std::vector<uint8_t> value(opt.value_size, 0);
139+
140+
// Pre-fill the map with some values to avoid ENOENT on lookups.
141+
for (uint32_t i = 0; i < opt.max_entries; ++i) {
142+
uint32_t key = i;
143+
if (bpftime_map_update_elem(map_fd, &key, value.data(), 0) !=
144+
0) {
145+
std::perror("bpftime_map_update_elem (prefill)");
146+
return 1;
147+
}
148+
}
149+
150+
// Update benchmark.
151+
uint32_t state = 0x12345678u;
152+
auto start_update = clock_type::now();
153+
for (uint64_t i = 0; i < opt.iters; ++i) {
154+
uint32_t key =
155+
lcg_next(state, (uint32_t)opt.max_entries);
156+
if (bpftime_map_update_elem(map_fd, &key, value.data(), 0) !=
157+
0) {
158+
std::perror("bpftime_map_update_elem");
159+
return 1;
160+
}
161+
}
162+
auto end_update = clock_type::now();
163+
164+
// Lookup benchmark.
165+
state = 0xdeadbeefu;
166+
auto start_lookup = clock_type::now();
167+
for (uint64_t i = 0; i < opt.iters; ++i) {
168+
uint32_t key =
169+
lcg_next(state, (uint32_t)opt.max_entries);
170+
auto *ptr = bpftime_map_lookup_elem(map_fd, &key);
171+
if (!ptr) {
172+
std::perror("bpftime_map_lookup_elem");
173+
return 1;
174+
}
175+
}
176+
auto end_lookup = clock_type::now();
177+
178+
auto update_ns =
179+
std::chrono::duration_cast<std::chrono::nanoseconds>(
180+
end_update - start_update);
181+
auto lookup_ns =
182+
std::chrono::duration_cast<std::chrono::nanoseconds>(
183+
end_lookup - start_lookup);
184+
185+
std::cout << "iters=" << opt.iters
186+
<< " max_entries=" << opt.max_entries
187+
<< " value_size=" << opt.value_size << " bytes"
188+
<< " gdrcopy=" << (opt.enable_gdrcopy ? 1 : 0)
189+
<< " gdrcopy_max_per_key_bytes="
190+
<< opt.gdrcopy_max_per_key_bytes << "\n";
191+
print_result("update", opt.iters, update_ns);
192+
print_result("lookup", opt.iters, lookup_ns);
193+
194+
bpftime_destroy_global_shm();
195+
return 0;
196+
}

0 commit comments

Comments
 (0)