Skip to content

Commit 1ecffda

Browse files
[libc] Add Kernel Resource Usage to nvptx-loader (#97503)
This PR allows `nvptx-loader` to read the resource usage of `_start`, `_begin`, and `_end` when executing CUDA binaries.

Example output:
```
$ nvptx-loader --print-resource-usage libc/benchmarks/gpu/src/ctype/libc.benchmarks.gpu.src.ctype.isalnum_benchmark.__build__
[ RUN ] LlvmLibcIsAlNumGpuBenchmark.IsAlnumWrapper
[ OK ] LlvmLibcIsAlNumGpuBenchmark.IsAlnumWrapper: 93 cycles, 76 min, 470 max, 23 iterations, 78000 ns, 80 stddev
_begin registers: 25
_start registers: 80
_end registers: 62
```

---------

Co-authored-by: Joseph Huber <[email protected]>
1 parent 3061963 commit 1ecffda

File tree

6 files changed

+73
-27
lines changed

6 files changed

+73
-27
lines changed

libc/benchmarks/gpu/CMakeLists.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,15 @@ function(add_benchmark benchmark_name)
1515
endif()
1616
add_libc_hermetic(
1717
${benchmark_name}
18-
IS_BENCHMARK
18+
IS_GPU_BENCHMARK
1919
LINK_LIBRARIES
2020
LibcGpuBenchmark.hermetic
2121
${BENCHMARK_LINK_LIBRARIES}
2222
${BENCHMARK_UNPARSED_ARGUMENTS}
2323
)
2424
get_fq_target_name(${benchmark_name} fq_target_name)
25+
set(fq_build_target_name ${fq_target_name}.__build__)
26+
2527
add_dependencies(gpu-benchmark ${fq_target_name})
2628
endfunction(add_benchmark)
2729

libc/cmake/modules/LLVMLibCTestRules.cmake

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -553,7 +553,7 @@ function(add_libc_hermetic test_name)
553553
endif()
554554
cmake_parse_arguments(
555555
"HERMETIC_TEST"
556-
"IS_BENCHMARK" # Optional arguments
556+
"IS_GPU_BENCHMARK" # Optional arguments
557557
"SUITE" # Single value arguments
558558
"SRCS;HDRS;DEPENDS;ARGS;ENV;COMPILE_OPTIONS;LINK_LIBRARIES;LOADER_ARGS" # Multi-value arguments
559559
${ARGN}
@@ -709,14 +709,24 @@ function(add_libc_hermetic test_name)
709709
$<TARGET_FILE:${fq_build_target_name}> ${HERMETIC_TEST_ARGS})
710710
add_custom_target(
711711
${fq_target_name}
712+
DEPENDS ${fq_target_name}-cmd
713+
)
714+
715+
add_custom_command(
716+
OUTPUT ${fq_target_name}-cmd
712717
COMMAND ${test_cmd}
713718
COMMAND_EXPAND_LISTS
714719
COMMENT "Running hermetic test ${fq_target_name}"
715720
${LIBC_HERMETIC_TEST_JOB_POOL}
716721
)
717722

723+
set_source_files_properties(${fq_target_name}-cmd
724+
PROPERTIES
725+
SYMBOLIC "TRUE"
726+
)
727+
718728
add_dependencies(${HERMETIC_TEST_SUITE} ${fq_target_name})
719-
if(NOT ${HERMETIC_TEST_IS_BENCHMARK})
729+
if(NOT ${HERMETIC_TEST_IS_GPU_BENCHMARK})
720730
# If it is a benchmark, it will already have been added to the
721731
# gpu-benchmark target
722732
add_dependencies(libc-hermetic-tests ${fq_target_name})

libc/utils/gpu/loader/Loader.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ struct end_args_t {
5454
/// kernel on the target device. Copies \p argc and \p argv to the device.
5555
/// Returns the final value of the `main` function on the device.
5656
int load(int argc, char **argv, char **evnp, void *image, size_t size,
57-
const LaunchParameters &params);
57+
const LaunchParameters &params, bool print_resource_usage);
5858

5959
/// Return \p V aligned "upwards" according to \p Align.
6060
template <typename V, typename A> inline V align_up(V val, A align) {

libc/utils/gpu/loader/Main.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@
2020

2121
int main(int argc, char **argv, char **envp) {
2222
if (argc < 2) {
23-
printf("USAGE: ./loader [--threads <n>, --blocks <n>] <device_image> "
23+
printf("USAGE: ./loader [--threads <n>, --blocks <n>, "
24+
"--print-resource-usage] <device_image> "
2425
"<args>, ...\n");
2526
return EXIT_SUCCESS;
2627
}
@@ -29,6 +30,7 @@ int main(int argc, char **argv, char **envp) {
2930
FILE *file = nullptr;
3031
char *ptr;
3132
LaunchParameters params = {1, 1, 1, 1, 1, 1};
33+
bool print_resource_usage = false;
3234
while (!file && ++offset < argc) {
3335
if (argv[offset] == std::string("--threads") ||
3436
argv[offset] == std::string("--threads-x")) {
@@ -62,6 +64,9 @@ int main(int argc, char **argv, char **envp) {
6264
offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
6365
offset++;
6466
continue;
67+
} else if (argv[offset] == std::string("--print-resource-usage")) {
68+
print_resource_usage = true;
69+
continue;
6570
} else {
6671
file = fopen(argv[offset], "r");
6772
if (!file) {
@@ -87,7 +92,8 @@ int main(int argc, char **argv, char **envp) {
8792
fclose(file);
8893

8994
// Drop the loader from the program arguments.
90-
int ret = load(argc - offset, &argv[offset], envp, image, size, params);
95+
int ret = load(argc - offset, &argv[offset], envp, image, size, params,
96+
print_resource_usage);
9197

9298
free(image);
9399
return ret;

libc/utils/gpu/loader/amdgpu/Loader.cpp

Lines changed: 23 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,10 @@ hsa_status_t get_agent(hsa_agent_t *output_agent) {
125125
return iterate_agents(cb);
126126
}
127127

128+
/// Report the resource usage of the kernel named \p kernel_name.
///
/// Resource queries are not implemented for AMDGPU, so this only emits a
/// notice on stderr; \p kernel_name is accepted for signature parity with the
/// launch path but is not otherwise used.
///
/// The parameter is `const char *` because the caller forwards the
/// `const char *kernel_name` it received, and the notice is written with an
/// explicit stream because `fprintf` requires one as its first argument.
void print_kernel_resources(const char *kernel_name) {
  fprintf(stderr, "Kernel resources on AMDGPU is not supported yet.\n");
}
131+
128132
/// Retrieve a global memory pool with a \p flag from the agent.
129133
template <hsa_amd_memory_pool_global_flag_t flag>
130134
hsa_status_t get_agent_memory_pool(hsa_agent_t agent,
@@ -156,8 +160,9 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
156160
hsa_amd_memory_pool_t coarsegrained_pool,
157161
hsa_queue_t *queue, rpc_device_t device,
158162
const LaunchParameters &params,
159-
const char *kernel_name, args_t kernel_args) {
160-
// Look up the '_start' kernel in the loaded executable.
163+
const char *kernel_name, args_t kernel_args,
164+
bool print_resource_usage) {
165+
// Look up the kernel in the loaded executable.
161166
hsa_executable_symbol_t symbol;
162167
if (hsa_status_t err = hsa_executable_get_symbol_by_name(
163168
executable, kernel_name, &dev_agent, &symbol))
@@ -220,7 +225,7 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
220225
handle_error(err);
221226
hsa_amd_agents_allow_access(1, &dev_agent, nullptr, args);
222227

223-
// Initialie all the arguments (explicit and implicit) to zero, then set the
228+
// Initialize all the arguments (explicit and implicit) to zero, then set the
224229
// explicit arguments to the values created above.
225230
std::memset(args, 0, args_size);
226231
std::memcpy(args, &kernel_args, sizeof(args_t));
@@ -270,6 +275,9 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
270275
hsa_signal_create(1, 0, nullptr, &packet->completion_signal))
271276
handle_error(err);
272277

278+
if (print_resource_usage)
279+
print_kernel_resources(kernel_name);
280+
273281
// Initialize the packet header and set the doorbell signal to begin execution
274282
// by the HSA runtime.
275283
uint16_t header =
@@ -327,7 +335,7 @@ static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent,
327335
}
328336

329337
int load(int argc, char **argv, char **envp, void *image, size_t size,
330-
const LaunchParameters &params) {
338+
const LaunchParameters &params, bool print_resource_usage) {
331339
// Initialize the HSA runtime used to communicate with the device.
332340
if (hsa_status_t err = hsa_init())
333341
handle_error(err);
@@ -545,15 +553,16 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
545553

546554
LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
547555
begin_args_t init_args = {argc, dev_argv, dev_envp};
548-
if (hsa_status_t err = launch_kernel(
549-
dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
550-
device, single_threaded_params, "_begin.kd", init_args))
556+
if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool,
557+
coarsegrained_pool, queue, device,
558+
single_threaded_params, "_begin.kd",
559+
init_args, print_resource_usage))
551560
handle_error(err);
552561

553562
start_args_t args = {argc, dev_argv, dev_envp, dev_ret};
554-
if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool,
555-
coarsegrained_pool, queue, device,
556-
params, "_start.kd", args))
563+
if (hsa_status_t err = launch_kernel(
564+
dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
565+
device, params, "_start.kd", args, print_resource_usage))
557566
handle_error(err);
558567

559568
void *host_ret;
@@ -571,9 +580,10 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
571580
int ret = *static_cast<int *>(host_ret);
572581

573582
end_args_t fini_args = {ret};
574-
if (hsa_status_t err = launch_kernel(
575-
dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
576-
device, single_threaded_params, "_end.kd", fini_args))
583+
if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool,
584+
coarsegrained_pool, queue, device,
585+
single_threaded_params, "_end.kd",
586+
fini_args, print_resource_usage))
577587
handle_error(err);
578588

579589
if (rpc_status_t err = rpc_server_shutdown(

libc/utils/gpu/loader/nvptx/Loader.cpp

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -152,10 +152,23 @@ Expected<void *> get_ctor_dtor_array(const void *image, const size_t size,
152152
return dev_memory;
153153
}
154154

155+
/// Print the resource usage of the kernel \p kernel_name contained in the
/// loaded module \p binary.
///
/// The kernel is looked up by name in \p binary and its
/// CU_FUNC_ATTRIBUTE_NUM_REGS attribute is queried; the result is printed to
/// stdout as the kernel's register count. A non-zero status from either
/// driver call is passed to handle_error.
void print_kernel_resources(CUmodule binary, const char *kernel_name) {
  // Resolve the kernel handle from its symbol name.
  CUfunction kernel;
  CUresult lookup_status = cuModuleGetFunction(&kernel, binary, kernel_name);
  if (lookup_status)
    handle_error(lookup_status);

  // Ask the driver for the register attribute of that kernel.
  int register_count;
  CUresult query_status =
      cuFuncGetAttribute(&register_count, CU_FUNC_ATTRIBUTE_NUM_REGS, kernel);
  if (query_status)
    handle_error(query_status);

  printf("Executing kernel %s:\n", kernel_name);
  printf("%6s registers: %d\n", kernel_name, register_count);
}
166+
155167
template <typename args_t>
156168
CUresult launch_kernel(CUmodule binary, CUstream stream,
157169
rpc_device_t rpc_device, const LaunchParameters &params,
158-
const char *kernel_name, args_t kernel_args) {
170+
const char *kernel_name, args_t kernel_args,
171+
bool print_resource_usage) {
159172
// Look up the requested kernel in the loaded module.
160173
CUfunction function;
161174
if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name))
@@ -208,6 +221,9 @@ CUresult launch_kernel(CUmodule binary, CUstream stream,
208221
},
209222
&memory_stream);
210223

224+
if (print_resource_usage)
225+
print_kernel_resources(binary, kernel_name);
226+
211227
// Call the kernel with the given arguments.
212228
if (CUresult err = cuLaunchKernel(
213229
function, params.num_blocks_x, params.num_blocks_y,
@@ -230,7 +246,7 @@ CUresult launch_kernel(CUmodule binary, CUstream stream,
230246
}
231247

232248
int load(int argc, char **argv, char **envp, void *image, size_t size,
233-
const LaunchParameters &params) {
249+
const LaunchParameters &params, bool print_resource_usage) {
234250
if (CUresult err = cuInit(0))
235251
handle_error(err);
236252
// Obtain the first device found on the system.
@@ -323,14 +339,15 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
323339

324340
LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
325341
begin_args_t init_args = {argc, dev_argv, dev_envp};
326-
if (CUresult err = launch_kernel(binary, stream, rpc_device,
327-
single_threaded_params, "_begin", init_args))
342+
if (CUresult err =
343+
launch_kernel(binary, stream, rpc_device, single_threaded_params,
344+
"_begin", init_args, print_resource_usage))
328345
handle_error(err);
329346

330347
start_args_t args = {argc, dev_argv, dev_envp,
331348
reinterpret_cast<void *>(dev_ret)};
332-
if (CUresult err =
333-
launch_kernel(binary, stream, rpc_device, params, "_start", args))
349+
if (CUresult err = launch_kernel(binary, stream, rpc_device, params, "_start",
350+
args, print_resource_usage))
334351
handle_error(err);
335352

336353
// Copy the return value back from the kernel and wait.
@@ -342,8 +359,9 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
342359
handle_error(err);
343360

344361
end_args_t fini_args = {host_ret};
345-
if (CUresult err = launch_kernel(binary, stream, rpc_device,
346-
single_threaded_params, "_end", fini_args))
362+
if (CUresult err =
363+
launch_kernel(binary, stream, rpc_device, single_threaded_params,
364+
"_end", fini_args, print_resource_usage))
347365
handle_error(err);
348366

349367
// Free the memory allocated for the device.

0 commit comments

Comments
 (0)