Skip to content

Commit faa224f

Browse files
committed
UCT/GDA/TEST: Add runtime config for device debug logs
1 parent e3cb7d6 commit faa224f

28 files changed

+359
-163
lines changed

src/tools/perf/api/libperf.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
#define UCX_LIBPERF_H
1212

1313
#include <ucs/sys/compiler.h>
14-
#include <ucs/sys/device_code.h>
14+
#include <ucs/device/device_common.h>
1515

1616
BEGIN_C_DECLS
1717

src/tools/perf/cuda/cuda_kernel.cuh

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,18 +10,19 @@
1010
#include "cuda_common.h"
1111

1212
#include <tools/perf/lib/libperf_int.h>
13-
#include <ucs/sys/device_code.h>
13+
#include <ucs/device/device_log_impl.h>
1414
#include <cuda_runtime.h>
1515

1616

1717
typedef unsigned long long ucx_perf_cuda_time_t;
1818

1919
struct ucx_perf_cuda_context {
20-
unsigned max_outstanding;
21-
ucx_perf_counter_t max_iters;
22-
ucx_perf_cuda_time_t report_interval_ns;
23-
ucx_perf_counter_t completed_iters;
24-
ucs_status_t status;
20+
unsigned max_outstanding;
21+
ucx_perf_counter_t max_iters;
22+
ucx_perf_cuda_time_t report_interval_ns;
23+
ucx_perf_counter_t completed_iters;
24+
ucs_status_t status;
25+
ucs_device_log_config_t log_config;
2526
};
2627

2728
UCS_F_DEVICE ucx_perf_cuda_time_t ucx_perf_cuda_get_time_ns()
@@ -153,6 +154,7 @@ public:
153154
ULONG_MAX :
154155
ucs_time_to_nsec(perf.report_interval) / 100;
155156
m_cpu_ctx->status = UCS_ERR_NOT_IMPLEMENTED;
157+
ucs_device_log_config_init(&m_cpu_ctx->log_config);
156158
}
157159

158160
~ucx_perf_cuda_test_runner()

src/tools/perf/cuda/ucp_cuda_kernel.cu

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -255,15 +255,16 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx,
255255
while (request_mgr.get_pending_count() >= ctx.max_outstanding) {
256256
status = request_mgr.progress<level>(1);
257257
if (UCS_STATUS_IS_ERR(status)) {
258-
ucs_device_error("progress failed: %d", status);
258+
ucs_device_log(ERROR, &ctx.log_config, "progress failed: %d",
259+
status);
259260
goto out;
260261
}
261262
}
262263

263264
ucp_device_request_t &req = request_mgr.get_request();
264265
status = ucp_perf_cuda_send_nbx<level, cmd>(params, idx, req);
265266
if (status != UCS_OK) {
266-
ucs_device_error("send failed: %d", status);
267+
ucs_device_log(ERROR, &ctx.log_config, "send failed: %d", status);
267268
goto out;
268269
}
269270

@@ -274,7 +275,8 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx,
274275
while (request_mgr.get_pending_count() > 0) {
275276
status = request_mgr.progress<level>(max_iters);
276277
if (UCS_STATUS_IS_ERR(status)) {
277-
ucs_device_error("final progress failed: %d", status);
278+
ucs_device_log(ERROR, &ctx.log_config, "final progress failed: %d",
279+
status);
278280
goto out;
279281
}
280282
}
@@ -300,15 +302,17 @@ ucp_perf_cuda_put_multi_latency_kernel(ucx_perf_cuda_context &ctx,
300302
if (is_sender) {
301303
status = ucp_perf_cuda_send_sync<level, cmd>(params, idx, req);
302304
if (status != UCS_OK) {
303-
ucs_device_error("sender send failed: %d", status);
305+
ucs_device_log(ERROR, &ctx.log_config, "sender send failed: %d",
306+
status);
304307
break;
305308
}
306309
ucx_perf_cuda_wait_sn(params.counter_recv, idx + 1);
307310
} else {
308311
ucx_perf_cuda_wait_sn(params.counter_recv, idx + 1);
309312
status = ucp_perf_cuda_send_sync<level, cmd>(params, idx, req);
310313
if (status != UCS_OK) {
311-
ucs_device_error("receiver send failed: %d", status);
314+
ucs_device_log(ERROR, &ctx.log_config,
315+
"receiver send failed: %d", status);
312316
break;
313317
}
314318
}

src/tools/perf/perftest_params.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -457,7 +457,7 @@ static ucs_status_t parse_device_level(const char *opt_arg,
457457
{
458458
ucs_device_level_t level;
459459
for (level = UCS_DEVICE_LEVEL_THREAD; level <= UCS_DEVICE_LEVEL_GRID; ++level) {
460-
if (!strcmp(opt_arg, ucs_device_level_name(level))) {
460+
if (!strcasecmp(opt_arg, ucs_device_level_names[level])) {
461461
*device_level = level;
462462
return UCS_OK;
463463
}

src/ucp/api/device/ucp_device_impl.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
#include <ucp/api/ucp_def.h>
1313
#include <uct/api/device/uct_device_impl.h>
14-
#include <ucs/sys/device_code.h>
14+
#include <ucs/device/device_common.h>
1515
#include <ucs/type/status.h>
1616
#include <stdint.h>
1717

src/ucp/core/ucp_device.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include <ucp/core/ucp_mm.h>
1414
#include <ucp/api/device/ucp_host.h>
1515
#include <ucp/api/device/ucp_device_types.h>
16+
#include <ucp/wireup/wireup_ep.h>
1617
#include <ucs/type/param.h>
1718
#include <ucp/wireup/wireup_ep.h>
1819

src/ucs/Makefile.am

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,10 @@ nobase_dist_libucs_la_HEADERS = \
3434
config/ini.h \
3535
config/parser.h \
3636
config/types.h \
37+
device/cuda_device.cuh \
38+
device/device_common.h \
39+
device/device_log_impl.h \
40+
device/stub_device.h \
3741
datastruct/array.h \
3842
datastruct/callbackq.h \
3943
datastruct/callbackq_compat.h \
@@ -63,7 +67,6 @@ nobase_dist_libucs_la_HEADERS = \
6367
stats/libstats.h \
6468
sys/event_set.h \
6569
sys/compiler_def.h\
66-
sys/device_code.h \
6770
sys/math.h \
6871
sys/preprocessor.h \
6972
sys/string.h \
@@ -190,6 +193,7 @@ libucs_la_SOURCES = \
190193
debug/debug.c \
191194
debug/log.c \
192195
debug/memtrack.c \
196+
device/device_common.c \
193197
memory/memory_type.c \
194198
memory/memtype_cache.c \
195199
memory/numa.c \

src/ucs/config/global_opts.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ ucs_global_opts_t ucs_global_opts = {
2929
.log_file_rotate = 0,
3030
.log_buffer_size = 1024,
3131
.log_data_size = 0,
32+
.device_log_level = UCS_LOG_LEVEL_WARN,
3233
.mpool_fifo = 0,
3334
.handle_errors = UCS_BIT(UCS_HANDLE_ERROR_BACKTRACE),
3435
.error_signals = { NULL, 0 },
@@ -114,6 +115,10 @@ static ucs_config_field_t ucs_global_opts_table[] = {
114115
"How much packet payload to print, at most, in data mode.",
115116
ucs_offsetof(ucs_global_opts_t, log_data_size), UCS_CONFIG_TYPE_ULONG},
116117

118+
{"DEVICE_LOG_LEVEL", "warn",
119+
"Logging level for device functions.",
120+
ucs_offsetof(ucs_global_opts_t, device_log_level), UCS_CONFIG_TYPE_ENUM(ucs_log_level_names)},
121+
117122
{"LOG_PRINT_ENABLE", "n",
118123
"Enable output of ucs_print(). This option is intended for use by the library developers.",
119124
ucs_offsetof(ucs_global_opts_t, log_print_enable), UCS_CONFIG_TYPE_BOOL},

src/ucs/config/global_opts.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,9 @@ typedef struct {
4646
/* Maximal amount of packet data to print per packet */
4747
size_t log_data_size;
4848

49+
/* Log level for device API */
50+
ucs_log_level_t device_log_level;
51+
4952
/* Enable ucs_print() output */
5053
int log_print_enable;
5154

src/ucs/device/cuda_device.cuh

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
/**
2+
* Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2025. ALL RIGHTS RESERVED.
3+
*
4+
* See file LICENSE for terms.
5+
*/
6+
7+
#ifndef UCS_CUDA_DEVICE_CUH
8+
#define UCS_CUDA_DEVICE_CUH
9+
10+
#include <stdint.h>
11+
12+
13+
/* Device function */
14+
#define UCS_F_DEVICE __device__ __forceinline__ static
15+
16+
17+
/* Device library function */
18+
#define UCS_F_DEVICE_LIB __device__
19+
20+
21+
/*
22+
* Read a 64-bit atomic value from a global memory address.
23+
*/
24+
UCS_F_DEVICE uint64_t ucs_device_atomic64_read(const uint64_t *ptr)
25+
{
26+
uint64_t ret;
27+
asm volatile("ld.acquire.sys.global.u64 %0, [%1];" : "=l"(ret) : "l"(ptr));
28+
return ret;
29+
}
30+
31+
32+
/*
33+
* Write a 64-bit value to counter global memory address.
34+
*/
35+
UCS_F_DEVICE void ucs_device_atomic64_write(uint64_t *ptr, uint64_t value)
36+
{
37+
asm volatile("st.release.sys.u64 [%0], %1;"
38+
:
39+
: "l"(ptr), "l"(value)
40+
: "memory");
41+
}
42+
43+
44+
/*
45+
* Read the 64-bit GPU global nanosecond timer
46+
*/
47+
UCS_F_DEVICE uint64_t ucs_device_get_time_ns(void)
48+
{
49+
uint64_t globaltimer;
50+
/* 64-bit GPU global nanosecond timer */
51+
asm volatile("mov.u64 %0, %globaltimer;" : "=l"(globaltimer));
52+
return globaltimer;
53+
}
54+
55+
/*
56+
* Load a constant from global memory.
57+
*/
58+
template<typename T> UCS_F_DEVICE T ucs_device_load_const(const T *ptr)
59+
{
60+
return __ldg(ptr);
61+
}
62+
63+
template<> inline __device__ void *ucs_device_load_const(void *const *ptr)
64+
{
65+
return (void*)__ldg((uint64_t*)ptr);
66+
}
67+
68+
#endif /* UCS_CUDA_DEVICE_CUH */

0 commit comments

Comments
 (0)