|
#include "bpftime_shm.hpp"
#include <cuda.h>
#include <charconv>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>
#include <system_error>
#include <utility>
#include <vector>
| 10 | + |
// Clock used for all benchmark timings. steady_clock is monotonic, so the
// measured intervals cannot be distorted by wall-clock adjustments;
// high_resolution_clock carries no such guarantee.
using clock_type = std::chrono::steady_clock;
| 12 | + |
| 13 | +struct bench_options { |
| 14 | + uint64_t iters = 20000; |
| 15 | + uint64_t max_entries = 1024; |
| 16 | + uint64_t value_size = 8; |
| 17 | + bool enable_gdrcopy = false; |
| 18 | + uint64_t gdrcopy_max_per_key_bytes = |
| 19 | + DEFAULT_GPU_GDRCOPY_MAX_PER_KEY_BYTES; |
| 20 | +}; |
| 21 | + |
/**
 * Parse a command-line token of the form `--<name>=<decimal value>`.
 *
 * @param arg   The raw argv token to inspect.
 * @param name  Option name without the leading dashes (e.g. "iters").
 * @param out   Receives the parsed value; left untouched on failure.
 * @return true only if `arg` starts with `--<name>=` and the remainder is a
 *         non-empty string of decimal digits that fits in a uint64_t.
 *
 * Parsing is strict: leading whitespace, a `+`/`-` sign, trailing characters
 * and out-of-range values are all rejected instead of being silently
 * wrapped or saturated.
 */
static bool parse_u64_arg(const char *arg, const char *name, uint64_t &out)
{
	const std::string prefix = std::string("--") + name + "=";
	if (std::strncmp(arg, prefix.c_str(), prefix.size()) != 0) {
		return false;
	}
	const char *first = arg + prefix.size();
	const char *last = first + std::strlen(first);

	// from_chars reports an empty or non-numeric range as invalid_argument
	// and an overflowing one as result_out_of_range; requiring ptr == last
	// additionally rejects trailing garbage such as "12x".
	uint64_t parsed = 0;
	const auto result = std::from_chars(first, last, parsed, 10);
	if (result.ec != std::errc{} || result.ptr != last) {
		return false;
	}
	out = parsed;
	return true;
}
| 40 | + |
| 41 | +static bench_options parse_args(int argc, char **argv) |
| 42 | +{ |
| 43 | + bench_options opt; |
| 44 | + for (int i = 1; i < argc; ++i) { |
| 45 | + if (parse_u64_arg(argv[i], "iters", opt.iters) || |
| 46 | + parse_u64_arg(argv[i], "max-entries", opt.max_entries) || |
| 47 | + parse_u64_arg(argv[i], "value-size", opt.value_size) || |
| 48 | + parse_u64_arg(argv[i], "gdrcopy-max-per-key-bytes", |
| 49 | + opt.gdrcopy_max_per_key_bytes)) { |
| 50 | + continue; |
| 51 | + } |
| 52 | + if (std::strcmp(argv[i], "--gdrcopy") == 0 && i + 1 < argc) { |
| 53 | + opt.enable_gdrcopy = |
| 54 | + std::strtoull(argv[++i], nullptr, 10) != 0; |
| 55 | + continue; |
| 56 | + } |
| 57 | + if (std::strcmp(argv[i], "--iters") == 0 && i + 1 < argc) { |
| 58 | + opt.iters = std::strtoull(argv[++i], nullptr, 10); |
| 59 | + continue; |
| 60 | + } |
| 61 | + if (std::strcmp(argv[i], "--max-entries") == 0 && |
| 62 | + i + 1 < argc) { |
| 63 | + opt.max_entries = |
| 64 | + std::strtoull(argv[++i], nullptr, 10); |
| 65 | + continue; |
| 66 | + } |
| 67 | + if (std::strcmp(argv[i], "--value-size") == 0 && |
| 68 | + i + 1 < argc) { |
| 69 | + opt.value_size = |
| 70 | + std::strtoull(argv[++i], nullptr, 10); |
| 71 | + continue; |
| 72 | + } |
| 73 | + if (std::strcmp(argv[i], "--gdrcopy-max-per-key-bytes") == 0 && |
| 74 | + i + 1 < argc) { |
| 75 | + opt.gdrcopy_max_per_key_bytes = |
| 76 | + std::strtoull(argv[++i], nullptr, 10); |
| 77 | + continue; |
| 78 | + } |
| 79 | + std::cerr << "Unknown argument: " << argv[i] << "\n"; |
| 80 | + std::exit(1); |
| 81 | + } |
| 82 | + return opt; |
| 83 | +} |
| 84 | + |
/**
 * Advance a 32-bit linear congruential generator and return the next value.
 *
 * @param state    Generator state; updated in place (arithmetic wraps
 *                 modulo 2^32).
 * @param modulus  If non-zero, the result is reduced to [0, modulus);
 *                 if zero, the raw new state is returned.
 * @return The new state, optionally reduced by `modulus`.
 */
static uint32_t lcg_next(uint32_t &state, uint32_t modulus)
{
	constexpr uint32_t multiplier = 1664525u;
	constexpr uint32_t increment = 1013904223u;

	state = state * multiplier + increment;
	return modulus != 0 ? state % modulus : state;
}
| 93 | + |
/**
 * Print the average latency and throughput of one benchmark phase to stdout
 * as `<label>: <ns/op> ns/op (<ops/s> ops/s)`, with one decimal place.
 *
 * @param label     Name of the measured operation.
 * @param iters     Number of operations that were timed.
 * @param duration  Total wall time spent on those operations.
 *
 * With `iters == 0` there is nothing to average, so a placeholder line is
 * printed instead of dividing by zero. The stream's formatting state is
 * restored before returning so later output on std::cout is unaffected.
 */
static void print_result(const char *label, uint64_t iters,
			 std::chrono::nanoseconds duration)
{
	if (iters == 0) {
		std::cout << label << ": n/a (0 iterations)\n";
		return;
	}

	const double ns_per_op = static_cast<double>(duration.count()) /
				 static_cast<double>(iters);
	const double ops_per_sec = 1e9 / ns_per_op;

	const std::ios::fmtflags saved_flags = std::cout.flags();
	const std::streamsize saved_precision = std::cout.precision();

	// Select fixed notation explicitly within the floatfield group so a
	// previously set scientific flag cannot combine with it.
	std::cout.setf(std::ios::fixed, std::ios::floatfield);
	std::cout.precision(1);
	std::cout << label << ": " << ns_per_op << " ns/op"
		  << " (" << ops_per_sec << " ops/s)\n";

	std::cout.flags(saved_flags);
	std::cout.precision(saved_precision);
}
| 105 | + |
| 106 | +int main(int argc, char **argv) |
| 107 | +{ |
| 108 | + auto opt = parse_args(argc, argv); |
| 109 | + |
| 110 | + if (auto err = cuInit(0); err != CUDA_SUCCESS) { |
| 111 | + std::cerr << "cuInit(0) failed with error " << (int)err |
| 112 | + << "\n"; |
| 113 | + return 1; |
| 114 | + } |
| 115 | + |
| 116 | + bpftime_initialize_global_shm( |
| 117 | + bpftime::shm_open_type::SHM_REMOVE_AND_CREATE); |
| 118 | + |
| 119 | + auto config = bpftime::bpftime_get_agent_config(); |
| 120 | + config.enable_gpu_gdrcopy = opt.enable_gdrcopy; |
| 121 | + config.gpu_gdrcopy_max_per_key_bytes = opt.gdrcopy_max_per_key_bytes; |
| 122 | + bpftime::bpftime_set_agent_config(std::move(config)); |
| 123 | + |
| 124 | + bpftime::bpf_map_attr attr{}; |
| 125 | + attr.type = |
| 126 | + (int)bpftime::bpf_map_type::BPF_MAP_TYPE_GPU_ARRAY_MAP; |
| 127 | + attr.key_size = sizeof(uint32_t); |
| 128 | + attr.value_size = (uint32_t)opt.value_size; |
| 129 | + attr.max_ents = (uint32_t)opt.max_entries; |
| 130 | + |
| 131 | + int map_fd = |
| 132 | + bpftime_maps_create(-1, "gpu_array_map_host_perf", attr); |
| 133 | + if (map_fd < 0) { |
| 134 | + std::perror("bpftime_maps_create"); |
| 135 | + return 1; |
| 136 | + } |
| 137 | + |
| 138 | + std::vector<uint8_t> value(opt.value_size, 0); |
| 139 | + |
| 140 | + // Pre-fill the map with some values to avoid ENOENT on lookups. |
| 141 | + for (uint32_t i = 0; i < opt.max_entries; ++i) { |
| 142 | + uint32_t key = i; |
| 143 | + if (bpftime_map_update_elem(map_fd, &key, value.data(), 0) != |
| 144 | + 0) { |
| 145 | + std::perror("bpftime_map_update_elem (prefill)"); |
| 146 | + return 1; |
| 147 | + } |
| 148 | + } |
| 149 | + |
| 150 | + // Update benchmark. |
| 151 | + uint32_t state = 0x12345678u; |
| 152 | + auto start_update = clock_type::now(); |
| 153 | + for (uint64_t i = 0; i < opt.iters; ++i) { |
| 154 | + uint32_t key = |
| 155 | + lcg_next(state, (uint32_t)opt.max_entries); |
| 156 | + if (bpftime_map_update_elem(map_fd, &key, value.data(), 0) != |
| 157 | + 0) { |
| 158 | + std::perror("bpftime_map_update_elem"); |
| 159 | + return 1; |
| 160 | + } |
| 161 | + } |
| 162 | + auto end_update = clock_type::now(); |
| 163 | + |
| 164 | + // Lookup benchmark. |
| 165 | + state = 0xdeadbeefu; |
| 166 | + auto start_lookup = clock_type::now(); |
| 167 | + for (uint64_t i = 0; i < opt.iters; ++i) { |
| 168 | + uint32_t key = |
| 169 | + lcg_next(state, (uint32_t)opt.max_entries); |
| 170 | + auto *ptr = bpftime_map_lookup_elem(map_fd, &key); |
| 171 | + if (!ptr) { |
| 172 | + std::perror("bpftime_map_lookup_elem"); |
| 173 | + return 1; |
| 174 | + } |
| 175 | + } |
| 176 | + auto end_lookup = clock_type::now(); |
| 177 | + |
| 178 | + auto update_ns = |
| 179 | + std::chrono::duration_cast<std::chrono::nanoseconds>( |
| 180 | + end_update - start_update); |
| 181 | + auto lookup_ns = |
| 182 | + std::chrono::duration_cast<std::chrono::nanoseconds>( |
| 183 | + end_lookup - start_lookup); |
| 184 | + |
| 185 | + std::cout << "iters=" << opt.iters |
| 186 | + << " max_entries=" << opt.max_entries |
| 187 | + << " value_size=" << opt.value_size << " bytes" |
| 188 | + << " gdrcopy=" << (opt.enable_gdrcopy ? 1 : 0) |
| 189 | + << " gdrcopy_max_per_key_bytes=" |
| 190 | + << opt.gdrcopy_max_per_key_bytes << "\n"; |
| 191 | + print_result("update", opt.iters, update_ns); |
| 192 | + print_result("lookup", opt.iters, lookup_ns); |
| 193 | + |
| 194 | + bpftime_destroy_global_shm(); |
| 195 | + return 0; |
| 196 | +} |
0 commit comments