Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/tools/perf/api/libperf.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#define UCX_LIBPERF_H

#include <ucs/sys/compiler.h>
#include <ucs/sys/device_code.h>
#include <ucs/device/device_common.h>

BEGIN_C_DECLS

Expand Down
14 changes: 8 additions & 6 deletions src/tools/perf/cuda/cuda_kernel.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,19 @@
#include "cuda_common.h"

#include <tools/perf/lib/libperf_int.h>
#include <ucs/sys/device_code.h>
#include <ucs/device/device_log_impl.h>
#include <cuda_runtime.h>


typedef unsigned long long ucx_perf_cuda_time_t;

struct ucx_perf_cuda_context {
unsigned max_outstanding;
ucx_perf_counter_t max_iters;
ucx_perf_cuda_time_t report_interval_ns;
ucx_perf_counter_t completed_iters;
ucs_status_t status;
unsigned max_outstanding;
ucx_perf_counter_t max_iters;
ucx_perf_cuda_time_t report_interval_ns;
ucx_perf_counter_t completed_iters;
ucs_status_t status;
ucs_device_log_config_t log_config;
};

UCS_F_DEVICE ucx_perf_cuda_time_t ucx_perf_cuda_get_time_ns()
Expand Down Expand Up @@ -153,6 +154,7 @@ public:
ULONG_MAX :
ucs_time_to_nsec(perf.report_interval) / 100;
m_cpu_ctx->status = UCS_ERR_NOT_IMPLEMENTED;
ucs_device_log_config_init(&m_cpu_ctx->log_config);
}

~ucx_perf_cuda_test_runner()
Expand Down
14 changes: 9 additions & 5 deletions src/tools/perf/cuda/ucp_cuda_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -255,15 +255,16 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx,
while (request_mgr.get_pending_count() >= ctx.max_outstanding) {
status = request_mgr.progress<level>(1);
if (UCS_STATUS_IS_ERR(status)) {
ucs_device_error("progress failed: %d", status);
ucs_device_log(ERROR, &ctx.log_config, "progress failed: %d",
status);
goto out;
}
}

ucp_device_request_t &req = request_mgr.get_request();
status = ucp_perf_cuda_send_nbx<level, cmd>(params, idx, req);
if (status != UCS_OK) {
ucs_device_error("send failed: %d", status);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we keep error macro for convinience?

ucs_device_log(ERROR, &ctx.log_config, "send failed: %d", status);
goto out;
}

Expand All @@ -274,7 +275,8 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx,
while (request_mgr.get_pending_count() > 0) {
status = request_mgr.progress<level>(max_iters);
if (UCS_STATUS_IS_ERR(status)) {
ucs_device_error("final progress failed: %d", status);
ucs_device_log(ERROR, &ctx.log_config, "final progress failed: %d",
status);
goto out;
}
}
Expand All @@ -300,15 +302,17 @@ ucp_perf_cuda_put_multi_latency_kernel(ucx_perf_cuda_context &ctx,
if (is_sender) {
status = ucp_perf_cuda_send_sync<level, cmd>(params, idx, req);
if (status != UCS_OK) {
ucs_device_error("sender send failed: %d", status);
ucs_device_log(ERROR, &ctx.log_config, "sender send failed: %d",
status);
break;
}
ucx_perf_cuda_wait_sn(params.counter_recv, idx + 1);
} else {
ucx_perf_cuda_wait_sn(params.counter_recv, idx + 1);
status = ucp_perf_cuda_send_sync<level, cmd>(params, idx, req);
if (status != UCS_OK) {
ucs_device_error("receiver send failed: %d", status);
ucs_device_log(ERROR, &ctx.log_config,
"receiver send failed: %d", status);
break;
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/tools/perf/perftest_params.c
Original file line number Diff line number Diff line change
Expand Up @@ -457,7 +457,7 @@ static ucs_status_t parse_device_level(const char *opt_arg,
{
ucs_device_level_t level;
for (level = UCS_DEVICE_LEVEL_THREAD; level <= UCS_DEVICE_LEVEL_GRID; ++level) {
if (!strcmp(opt_arg, ucs_device_level_name(level))) {
if (!strcasecmp(opt_arg, ucs_device_level_names[level])) {
*device_level = level;
return UCS_OK;
}
Expand Down
2 changes: 1 addition & 1 deletion src/ucp/api/device/ucp_device_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

#include <ucp/api/ucp_def.h>
#include <uct/api/device/uct_device_impl.h>
#include <ucs/sys/device_code.h>
#include <ucs/device/device_common.h>
#include <ucs/type/status.h>
#include <stdint.h>

Expand Down
1 change: 1 addition & 0 deletions src/ucp/core/ucp_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <ucp/core/ucp_mm.h>
#include <ucp/api/device/ucp_host.h>
#include <ucp/api/device/ucp_device_types.h>
#include <ucp/wireup/wireup_ep.h>
#include <ucs/type/param.h>
#include <ucp/wireup/wireup_ep.h>

Expand Down
6 changes: 5 additions & 1 deletion src/ucs/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ nobase_dist_libucs_la_HEADERS = \
config/ini.h \
config/parser.h \
config/types.h \
device/cuda_device.cuh \
device/device_common.h \
device/device_log_impl.h \
device/stub_device.h \
datastruct/array.h \
datastruct/callbackq.h \
datastruct/callbackq_compat.h \
Expand Down Expand Up @@ -63,7 +67,6 @@ nobase_dist_libucs_la_HEADERS = \
stats/libstats.h \
sys/event_set.h \
sys/compiler_def.h\
sys/device_code.h \
sys/math.h \
sys/preprocessor.h \
sys/string.h \
Expand Down Expand Up @@ -190,6 +193,7 @@ libucs_la_SOURCES = \
debug/debug.c \
debug/log.c \
debug/memtrack.c \
device/device_common.c \
memory/memory_type.c \
memory/memtype_cache.c \
memory/numa.c \
Expand Down
5 changes: 5 additions & 0 deletions src/ucs/config/global_opts.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ ucs_global_opts_t ucs_global_opts = {
.log_file_rotate = 0,
.log_buffer_size = 1024,
.log_data_size = 0,
.device_log_level = UCS_LOG_LEVEL_WARN,
.mpool_fifo = 0,
.handle_errors = UCS_BIT(UCS_HANDLE_ERROR_BACKTRACE),
.error_signals = { NULL, 0 },
Expand Down Expand Up @@ -114,6 +115,10 @@ static ucs_config_field_t ucs_global_opts_table[] = {
"How much packet payload to print, at most, in data mode.",
ucs_offsetof(ucs_global_opts_t, log_data_size), UCS_CONFIG_TYPE_ULONG},

{"DEVICE_LOG_LEVEL", "warn",
"Logging level for device functions.",
ucs_offsetof(ucs_global_opts_t, device_log_level), UCS_CONFIG_TYPE_ENUM(ucs_log_level_names)},

{"LOG_PRINT_ENABLE", "n",
"Enable output of ucs_print(). This option is intended for use by the library developers.",
ucs_offsetof(ucs_global_opts_t, log_print_enable), UCS_CONFIG_TYPE_BOOL},
Expand Down
3 changes: 3 additions & 0 deletions src/ucs/config/global_opts.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ typedef struct {
/* Maximal amount of packet data to print per packet */
size_t log_data_size;

/* Log level for device API */
ucs_log_level_t device_log_level;

/* Enable ucs_print() output */
int log_print_enable;

Expand Down
68 changes: 68 additions & 0 deletions src/ucs/device/cuda_device.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
/**
* Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2025. ALL RIGHTS RESERVED.
*
* See file LICENSE for terms.
*/

#ifndef UCS_CUDA_DEVICE_CUH
#define UCS_CUDA_DEVICE_CUH

#include <stdint.h>


/* Device function */
#define UCS_F_DEVICE __device__ __forceinline__ static


/* Device library function */
#define UCS_F_DEVICE_LIB __device__


/*
* Read a 64-bit atomic value from a global memory address.
*/
UCS_F_DEVICE uint64_t ucs_device_atomic64_read(const uint64_t *ptr)
{
uint64_t ret;
asm volatile("ld.acquire.sys.global.u64 %0, [%1];" : "=l"(ret) : "l"(ptr));
return ret;
}


/*
* Write a 64-bit value to counter global memory address.
*/
UCS_F_DEVICE void ucs_device_atomic64_write(uint64_t *ptr, uint64_t value)
{
asm volatile("st.release.sys.u64 [%0], %1;"
:
: "l"(ptr), "l"(value)
: "memory");
}


/*
* Read the 64-bit GPU global nanosecond timer
*/
UCS_F_DEVICE uint64_t ucs_device_get_time_ns(void)
{
uint64_t globaltimer;
/* 64-bit GPU global nanosecond timer */
asm volatile("mov.u64 %0, %globaltimer;" : "=l"(globaltimer));
return globaltimer;
}

/*
* Load a constant from global memory.
*/
template<typename T> UCS_F_DEVICE T ucs_device_load_const(const T *ptr)
{
return __ldg(ptr);
}

template<> inline __device__ void *ucs_device_load_const(void *const *ptr)
{
return (void*)__ldg((uint64_t*)ptr);
}

#endif /* UCS_CUDA_DEVICE_CUH */
20 changes: 20 additions & 0 deletions src/ucs/device/device_common.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
/**
* Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2025. ALL RIGHTS RESERVED.
*
* See file LICENSE for terms.
*/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include "device_common.h"

#include <ucs/config/global_opts.h>

const char *ucs_device_level_names[] = {"thread", "warp", "block", "grid"};

void ucs_device_log_config_init(ucs_device_log_config_t *config)
{
config->level = ucs_global_opts.device_log_level;
}
51 changes: 51 additions & 0 deletions src/ucs/device/device_common.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/**
* Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2025. ALL RIGHTS RESERVED.
*
* See file LICENSE for terms.
*/

#ifndef UCS_DEVICE_COMMON_H
#define UCS_DEVICE_COMMON_H

#include <ucs/config/types.h>
#include <stdint.h>

#ifdef __NVCC__
#include "cuda_device.cuh"
#else
#include "stub_device.h"
#endif /* __NVCC__ */

BEGIN_C_DECLS

/* Logging configuration for device functions */
typedef struct {
uint8_t level;
} ucs_device_log_config_t;


/**
* @brief Cooperation level when calling device functions.
*/
typedef enum {
UCS_DEVICE_LEVEL_THREAD = 0,
UCS_DEVICE_LEVEL_WARP = 1,
UCS_DEVICE_LEVEL_BLOCK = 2,
UCS_DEVICE_LEVEL_GRID = 3
} ucs_device_level_t;


/** Names for @ref ucs_device_level_t */
extern const char *ucs_device_level_names[];


/* Number of threads in a warp */
#define UCS_DEVICE_NUM_THREADS_IN_WARP 32


/* Initialize the logging configuration for device functions */
void ucs_device_log_config_init(ucs_device_log_config_t *config);

END_C_DECLS

#endif /* UCS_DEVICE_COMMON_H */
77 changes: 77 additions & 0 deletions src/ucs/device/device_log_impl.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/**
* Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2025. ALL RIGHTS RESERVED.
*
* See file LICENSE for terms.
*/

#ifndef UCS_DEVICE_LOG_H
#define UCS_DEVICE_LOG_H

#include "device_common.h"

#include <ucs/sys/compiler_def.h>
#include <ucs/config/types.h>
#include <stddef.h>

/* Maximal log level for which the logging code is compiled in.
Application kernels can override this value by defining
UCS_DEVICE_MAX_LOG_LEVEL before including this file. */
#ifndef UCS_DEVICE_MAX_LOG_LEVEL
#define UCS_DEVICE_MAX_LOG_LEVEL UCS_LOG_LEVEL_DEBUG
#endif


/* Helper macro to print a message from a device function including the
* thread and block indices */
#define ucs_device_log(_level, _log_config, _fmt, ...) \
do { \
if ((UCS_LOG_LEVEL_##_level <= UCS_DEVICE_MAX_LOG_LEVEL) && \
(UCS_LOG_LEVEL_##_level <= (_log_config)->level)) { \
const uint64_t _ts = ucs_device_get_time_ns(); \
printf("[%06lu.%06lu] (%4d:%-3d) %10s:%-4d %-6s " _fmt "\n", \
_ts / 1000000000ul, (_ts % 1000000000ul) / 1000ul, \
threadIdx.x, blockIdx.x, \
ucs_device_log_source_file(__FILE__), __LINE__, \
UCS_LOG_LEVEL_NAME_##_level, ##__VA_ARGS__); \
} \
} while (0)


/* Log level names */
#define UCS_LOG_LEVEL_NAME_ERROR "ERROR"
#define UCS_LOG_LEVEL_NAME_WARN "WARN"
#define UCS_LOG_LEVEL_NAME_DIAG "DIAG"
#define UCS_LOG_LEVEL_NAME_INFO "INFO"
#define UCS_LOG_LEVEL_NAME_DEBUG "DEBUG"
#define UCS_LOG_LEVEL_NAME_TRACE "TRACE"
#define UCS_LOG_LEVEL_NAME_TRACE_DATA "DATA"
#define UCS_LOG_LEVEL_NAME_TRACE_POLL "POLL"


static UCS_F_DEVICE_LIB const char *ucs_device_basename(const char *path)
{
const char *basename = path;
const char *p;

for (p = path; *p != '\0'; p++) {
if (*p == '/') {
basename = p + 1;
}
}

return basename;
Comment on lines +53 to +62
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe use standard lib:

Suggested change
const char *basename = path;
const char *p;
for (p = path; *p != '\0'; p++) {
if (*p == '/') {
basename = p + 1;
}
}
return basename;
const char *basename = strrchr(path, '/');
return basename ? basename + 1 : path;

}


UCS_F_DEVICE const char *ucs_device_log_source_file(const char *file)
{
static const char *cached_source_file = NULL;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure if this caching is always safe
What happens if translation unit includes a header file which contains a logger call?
Which FILE will be cached?


if (cached_source_file == NULL) {
cached_source_file = ucs_device_basename(file);
}

return cached_source_file;
}

#endif /* UCS_DEVICE_LOG_H */
Loading
Loading