diff --git a/src/tools/perf/api/libperf.h b/src/tools/perf/api/libperf.h index 2e1338b2423..ad6d3f6b9dc 100644 --- a/src/tools/perf/api/libperf.h +++ b/src/tools/perf/api/libperf.h @@ -11,7 +11,7 @@ #define UCX_LIBPERF_H #include -#include +#include BEGIN_C_DECLS diff --git a/src/tools/perf/cuda/cuda_kernel.cuh b/src/tools/perf/cuda/cuda_kernel.cuh index c9e71ba40c7..f94b071a1f8 100644 --- a/src/tools/perf/cuda/cuda_kernel.cuh +++ b/src/tools/perf/cuda/cuda_kernel.cuh @@ -10,18 +10,19 @@ #include "cuda_common.h" #include -#include +#include #include typedef unsigned long long ucx_perf_cuda_time_t; struct ucx_perf_cuda_context { - unsigned max_outstanding; - ucx_perf_counter_t max_iters; - ucx_perf_cuda_time_t report_interval_ns; - ucx_perf_counter_t completed_iters; - ucs_status_t status; + unsigned max_outstanding; + ucx_perf_counter_t max_iters; + ucx_perf_cuda_time_t report_interval_ns; + ucx_perf_counter_t completed_iters; + ucs_status_t status; + ucs_device_log_config_t log_config; }; UCS_F_DEVICE ucx_perf_cuda_time_t ucx_perf_cuda_get_time_ns() @@ -153,6 +154,7 @@ public: ULONG_MAX : ucs_time_to_nsec(perf.report_interval) / 100; m_cpu_ctx->status = UCS_ERR_NOT_IMPLEMENTED; + ucs_device_log_config_init(&m_cpu_ctx->log_config); } ~ucx_perf_cuda_test_runner() diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu index 5b7afaf911c..26713086d82 100644 --- a/src/tools/perf/cuda/ucp_cuda_kernel.cu +++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu @@ -255,7 +255,8 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx, while (request_mgr.get_pending_count() >= ctx.max_outstanding) { status = request_mgr.progress(1); if (UCS_STATUS_IS_ERR(status)) { - ucs_device_error("progress failed: %d", status); + ucs_device_log(ERROR, &ctx.log_config, "progress failed: %d", + status); goto out; } } @@ -263,7 +264,7 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx, ucp_device_request_t &req = request_mgr.get_request(); status = ucp_perf_cuda_send_nbx(params, idx, req); if (status != UCS_OK) { - ucs_device_error("send failed: %d", status); + ucs_device_log(ERROR, &ctx.log_config, "send failed: %d", status); goto out; } @@ -274,7 +275,8 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx, while (request_mgr.get_pending_count() > 0) { status = request_mgr.progress(max_iters); if (UCS_STATUS_IS_ERR(status)) { - ucs_device_error("final progress failed: %d", status); + ucs_device_log(ERROR, &ctx.log_config, "final progress failed: %d", + status); goto out; } } @@ -300,7 +302,8 @@ ucp_perf_cuda_put_multi_latency_kernel(ucx_perf_cuda_context &ctx, if (is_sender) { status = ucp_perf_cuda_send_sync(params, idx, req); if (status != UCS_OK) { - ucs_device_error("sender send failed: %d", status); + ucs_device_log(ERROR, &ctx.log_config, "sender send failed: %d", + status); break; } ucx_perf_cuda_wait_sn(params.counter_recv, idx + 1); @@ -308,7 +311,8 @@ ucp_perf_cuda_put_multi_latency_kernel(ucx_perf_cuda_context &ctx, ucx_perf_cuda_wait_sn(params.counter_recv, idx + 1); status = ucp_perf_cuda_send_sync(params, idx, req); if (status != UCS_OK) { - ucs_device_error("receiver send failed: %d", status); + ucs_device_log(ERROR, &ctx.log_config, + "receiver send failed: %d", status); break; } } diff --git a/src/tools/perf/perftest_params.c b/src/tools/perf/perftest_params.c index e24a0be5507..5248749161f 100644 --- a/src/tools/perf/perftest_params.c +++ b/src/tools/perf/perftest_params.c @@ -457,7 +457,7 @@ static ucs_status_t parse_device_level(const char *opt_arg, { ucs_device_level_t level; for (level = UCS_DEVICE_LEVEL_THREAD; level <= UCS_DEVICE_LEVEL_GRID; ++level) { - if (!strcmp(opt_arg, ucs_device_level_name(level))) { + if (!strcasecmp(opt_arg, ucs_device_level_names[level])) { *device_level = level; return UCS_OK; } diff --git a/src/ucp/api/device/ucp_device_impl.h b/src/ucp/api/device/ucp_device_impl.h index 5c3ece9f71a..1bea785dd61 100644 --- a/src/ucp/api/device/ucp_device_impl.h +++ b/src/ucp/api/device/ucp_device_impl.h @@ -11,7 +11,7 @@ #include #include -#include +#include #include #include diff --git a/src/ucp/core/ucp_device.c b/src/ucp/core/ucp_device.c index dffba8622b5..db83ddd7f44 100644 --- a/src/ucp/core/ucp_device.c +++ b/src/ucp/core/ucp_device.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include diff --git a/src/ucs/Makefile.am b/src/ucs/Makefile.am index 28b6fa1e805..575db46db63 100644 --- a/src/ucs/Makefile.am +++ b/src/ucs/Makefile.am @@ -34,6 +34,10 @@ nobase_dist_libucs_la_HEADERS = \ config/ini.h \ config/parser.h \ config/types.h \ + device/cuda_device.cuh \ + device/device_common.h \ + device/device_log_impl.h \ + device/stub_device.h \ datastruct/array.h \ datastruct/callbackq.h \ datastruct/callbackq_compat.h \ @@ -63,7 +67,6 @@ nobase_dist_libucs_la_HEADERS = \ stats/libstats.h \ sys/event_set.h \ sys/compiler_def.h\ - sys/device_code.h \ sys/math.h \ sys/preprocessor.h \ sys/string.h \ @@ -190,6 +193,7 @@ libucs_la_SOURCES = \ debug/debug.c \ debug/log.c \ debug/memtrack.c \ + device/device_common.c \ memory/memory_type.c \ memory/memtype_cache.c \ memory/numa.c \ diff --git a/src/ucs/config/global_opts.c b/src/ucs/config/global_opts.c index 3cb8a8128bb..318ef5d9d4f 100644 --- a/src/ucs/config/global_opts.c +++ b/src/ucs/config/global_opts.c @@ -29,6 +29,7 @@ ucs_global_opts_t ucs_global_opts = { .log_file_rotate = 0, .log_buffer_size = 1024, .log_data_size = 0, + .device_log_level = UCS_LOG_LEVEL_WARN, .mpool_fifo = 0, .handle_errors = UCS_BIT(UCS_HANDLE_ERROR_BACKTRACE), .error_signals = { NULL, 0 }, @@ -114,6 +115,10 @@ static ucs_config_field_t ucs_global_opts_table[] = { "How much packet payload to print, at most, in data mode.", ucs_offsetof(ucs_global_opts_t, log_data_size), UCS_CONFIG_TYPE_ULONG}, + {"DEVICE_LOG_LEVEL", "warn", + "Logging level for device functions.", + ucs_offsetof(ucs_global_opts_t, device_log_level), UCS_CONFIG_TYPE_ENUM(ucs_log_level_names)}, + {"LOG_PRINT_ENABLE", "n", "Enable output of ucs_print(). This option is intended for use by the library developers.", ucs_offsetof(ucs_global_opts_t, log_print_enable), UCS_CONFIG_TYPE_BOOL}, diff --git a/src/ucs/config/global_opts.h b/src/ucs/config/global_opts.h index f1d4540e8a6..1a0a86d0339 100644 --- a/src/ucs/config/global_opts.h +++ b/src/ucs/config/global_opts.h @@ -46,6 +46,9 @@ typedef struct { /* Maximal amount of packet data to print per packet */ size_t log_data_size; + /* Log level for device API */ + ucs_log_level_t device_log_level; + /* Enable ucs_print() output */ int log_print_enable; diff --git a/src/ucs/device/cuda_device.cuh b/src/ucs/device/cuda_device.cuh new file mode 100644 index 00000000000..8edfb971436 --- /dev/null +++ b/src/ucs/device/cuda_device.cuh @@ -0,0 +1,68 @@ +/** + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2025. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifndef UCS_CUDA_DEVICE_CUH +#define UCS_CUDA_DEVICE_CUH + +#include + + +/* Device function */ +#define UCS_F_DEVICE __device__ __forceinline__ static + + +/* Device library function */ +#define UCS_F_DEVICE_LIB __device__ + + +/* + * Read a 64-bit atomic value from a global memory address. + */ +UCS_F_DEVICE uint64_t ucs_device_atomic64_read(const uint64_t *ptr) +{ + uint64_t ret; + asm volatile("ld.acquire.sys.global.u64 %0, [%1];" : "=l"(ret) : "l"(ptr)); + return ret; +} + + +/* + * Write a 64-bit value to counter global memory address. + */ +UCS_F_DEVICE void ucs_device_atomic64_write(uint64_t *ptr, uint64_t value) +{ + asm volatile("st.release.sys.u64 [%0], %1;" + : + : "l"(ptr), "l"(value) + : "memory"); +} + + +/* + * Read the 64-bit GPU global nanosecond timer + */ +UCS_F_DEVICE uint64_t ucs_device_get_time_ns(void) +{ + uint64_t globaltimer; + /* 64-bit GPU global nanosecond timer */ + asm volatile("mov.u64 %0, %globaltimer;" : "=l"(globaltimer)); + return globaltimer; +} + +/* + * Load a constant from global memory. + */ +template UCS_F_DEVICE T ucs_device_load_const(const T *ptr) +{ + return __ldg(ptr); +} + +template<> inline __device__ void *ucs_device_load_const(void *const *ptr) +{ + return (void*)__ldg((uint64_t*)ptr); +} + +#endif /* UCS_CUDA_DEVICE_CUH */ diff --git a/src/ucs/device/device_common.c b/src/ucs/device/device_common.c new file mode 100644 index 00000000000..6185727990b --- /dev/null +++ b/src/ucs/device/device_common.c @@ -0,0 +1,20 @@ +/** + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2025. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "device_common.h" + +#include + +const char *ucs_device_level_names[] = {"thread", "warp", "block", "grid"}; + +void ucs_device_log_config_init(ucs_device_log_config_t *config) +{ + config->level = ucs_global_opts.device_log_level; +} diff --git a/src/ucs/device/device_common.h b/src/ucs/device/device_common.h new file mode 100644 index 00000000000..93d337dd93a --- /dev/null +++ b/src/ucs/device/device_common.h @@ -0,0 +1,51 @@ +/** + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2025. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifndef UCS_DEVICE_COMMON_H +#define UCS_DEVICE_COMMON_H + +#include +#include + +#ifdef __NVCC__ +#include "cuda_device.cuh" +#else +#include "stub_device.h" +#endif /* __NVCC__ */ + +BEGIN_C_DECLS + +/* Logging configuration for device functions */ +typedef struct { + uint8_t level; +} ucs_device_log_config_t; + + +/** + * @brief Cooperation level when calling device functions. + */ +typedef enum { + UCS_DEVICE_LEVEL_THREAD = 0, + UCS_DEVICE_LEVEL_WARP = 1, + UCS_DEVICE_LEVEL_BLOCK = 2, + UCS_DEVICE_LEVEL_GRID = 3 +} ucs_device_level_t; + + +/** Names for @ref ucs_device_level_t */ +extern const char *ucs_device_level_names[]; + + +/* Number of threads in a warp */ +#define UCS_DEVICE_NUM_THREADS_IN_WARP 32 + + +/* Initialize the logging configuration for device functions */ +void ucs_device_log_config_init(ucs_device_log_config_t *config); + +END_C_DECLS + +#endif /* UCS_DEVICE_COMMON_H */ diff --git a/src/ucs/device/device_log_impl.h b/src/ucs/device/device_log_impl.h new file mode 100644 index 00000000000..d40e377b189 --- /dev/null +++ b/src/ucs/device/device_log_impl.h @@ -0,0 +1,77 @@ +/** + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2025. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifndef UCS_DEVICE_LOG_H +#define UCS_DEVICE_LOG_H + +#include "device_common.h" + +#include +#include +#include + +/* Maximal log level for which the logging code is compiled in. + Application kernels can override this value by defining + UCS_DEVICE_MAX_LOG_LEVEL before including this file. */ +#ifndef UCS_DEVICE_MAX_LOG_LEVEL +#define UCS_DEVICE_MAX_LOG_LEVEL UCS_LOG_LEVEL_DEBUG +#endif + + +/* Helper macro to print a message from a device function including the + * thread and block indices */ +#define ucs_device_log(_level, _log_config, _fmt, ...) \ + do { \ + if ((UCS_LOG_LEVEL_##_level <= UCS_DEVICE_MAX_LOG_LEVEL) && \ + (UCS_LOG_LEVEL_##_level <= (_log_config)->level)) { \ + const uint64_t _ts = ucs_device_get_time_ns(); \ + printf("[%06lu.%06lu] (%4d:%-3d) %10s:%-4d %-6s " _fmt "\n", \ + _ts / 1000000000ul, (_ts % 1000000000ul) / 1000ul, \ + threadIdx.x, blockIdx.x, \ + ucs_device_log_source_file(__FILE__), __LINE__, \ + UCS_LOG_LEVEL_NAME_##_level, ##__VA_ARGS__); \ + } \ + } while (0) + + +/* Log level names */ +#define UCS_LOG_LEVEL_NAME_ERROR "ERROR" +#define UCS_LOG_LEVEL_NAME_WARN "WARN" +#define UCS_LOG_LEVEL_NAME_DIAG "DIAG" +#define UCS_LOG_LEVEL_NAME_INFO "INFO" +#define UCS_LOG_LEVEL_NAME_DEBUG "DEBUG" +#define UCS_LOG_LEVEL_NAME_TRACE "TRACE" +#define UCS_LOG_LEVEL_NAME_TRACE_DATA "DATA" +#define UCS_LOG_LEVEL_NAME_TRACE_POLL "POLL" + + +static UCS_F_DEVICE_LIB const char *ucs_device_basename(const char *path) +{ + const char *basename = path; + const char *p; + + for (p = path; *p != '\0'; p++) { + if (*p == '/') { + basename = p + 1; + } + } + + return basename; +} + + +UCS_F_DEVICE const char *ucs_device_log_source_file(const char *file) +{ + static const char *cached_source_file = NULL; + + if (cached_source_file == NULL) { + cached_source_file = ucs_device_basename(file); + } + + return cached_source_file; +} + +#endif /* UCS_DEVICE_LOG_H */ diff --git a/src/ucs/device/stub_device.h b/src/ucs/device/stub_device.h new file mode 100644 index 00000000000..d0b5f3e9954 --- /dev/null +++ b/src/ucs/device/stub_device.h @@ -0,0 +1,51 @@ +/** + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2025. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifndef UCS_STUB_DEVICE_CUH +#define UCS_STUB_DEVICE_CUH + +#include + +/* Device function */ +#define UCS_F_DEVICE static inline + + +/* Device library function */ +#define UCS_F_DEVICE_LIB + + +/* + * Read a 64-bit atomic value from a global memory address. + */ +UCS_F_DEVICE uint64_t ucs_device_atomic64_read(const uint64_t *ptr) +{ + return *ptr; +} + + +/* + * Write a 64-bit value to counter global memory address. + */ +UCS_F_DEVICE void ucs_device_atomic64_write(uint64_t *ptr, uint64_t value) +{ + *ptr = value; +} + + +/* + * Read the 64-bit GPU global nanosecond timer + */ +UCS_F_DEVICE uint64_t ucs_device_get_time_ns(void) +{ + return 0; +} + +/* + * Load a constant from global memory. + */ +#define ucs_device_load_const(_ptr) (*(_ptr)) + +#endif /* UCS_STUB_DEVICE_CUH */ diff --git a/src/ucs/sys/device_code.h b/src/ucs/sys/device_code.h deleted file mode 100644 index d80835fe5c1..00000000000 --- a/src/ucs/sys/device_code.h +++ /dev/null @@ -1,104 +0,0 @@ -/** - * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2025. ALL RIGHTS RESERVED. - * - * See file LICENSE for terms. - */ - -#ifndef UCS_DEVICE_CODE_H -#define UCS_DEVICE_CODE_H - -#include -#include - -/* - * Declare GPU specific functions - */ -#ifdef __NVCC__ -#define UCS_F_DEVICE __device__ __forceinline__ static -#else -#define UCS_F_DEVICE static inline -#endif /* __NVCC__ */ - - -/* Number of threads in a warp */ -#define UCS_DEVICE_NUM_THREADS_IN_WARP 32 - - -/** - * @brief Cooperation level when calling device functions. - */ -typedef enum { - UCS_DEVICE_LEVEL_THREAD = 0, - UCS_DEVICE_LEVEL_WARP = 1, - UCS_DEVICE_LEVEL_BLOCK = 2, - UCS_DEVICE_LEVEL_GRID = 3 -} ucs_device_level_t; - - -static UCS_F_ALWAYS_INLINE const char* -ucs_device_level_name(ucs_device_level_t level) -{ - switch (level) { - case UCS_DEVICE_LEVEL_THREAD: - return "thread"; - case UCS_DEVICE_LEVEL_WARP: - return "warp"; - case UCS_DEVICE_LEVEL_BLOCK: - return "block"; - case UCS_DEVICE_LEVEL_GRID: - return "grid"; - default: - return "unknown"; - } -} - - -/* - * Read a 64-bit atomic value from a global memory address. - */ -UCS_F_DEVICE uint64_t ucs_device_atomic64_read(const uint64_t *ptr) -{ - uint64_t ret; -#ifdef __NVCC__ - asm volatile("ld.acquire.sys.global.u64 %0, [%1];" - : "=l"(ret) - : "l"(ptr)); -#else - ret = *ptr; -#endif - return ret; -} - - -/* - * Write a 64-bit value to counter global memory address. - */ -UCS_F_DEVICE void ucs_device_atomic64_write(uint64_t *ptr, uint64_t value) -{ -#ifdef __NVCC__ - asm volatile("st.release.sys.u64 [%0], %1;" - : - : "l"(ptr), "l"(value) - : "memory"); -#else - *ptr = value; -#endif -} - - -/* Helper macro to print a message from a device function including the - * thread and block indices */ -#define ucs_device_printf(_title, _fmt, ...) \ - printf("(%d:%d) %6s " _fmt "\n", threadIdx.x, blockIdx.x, _title, \ - ##__VA_ARGS__) - -/* Print an error message from a device function */ -#define ucs_device_error(_fmt, ...) \ - ucs_device_printf("ERROR", _fmt, ##__VA_ARGS__) - - -/* Print a debug message from a device function */ -#define ucs_device_debug(_fmt, ...) \ - ucs_device_printf("DEBUG", _fmt, ##__VA_ARGS__) - -#endif diff --git a/src/uct/api/device/uct_device_impl.h b/src/uct/api/device/uct_device_impl.h index fa0d3e0f656..71305757ae4 100644 --- a/src/uct/api/device/uct_device_impl.h +++ b/src/uct/api/device/uct_device_impl.h @@ -10,9 +10,8 @@ #include "uct_device_types.h" #include +#include #include -#include - #include diff --git a/src/uct/api/device/uct_device_types.h b/src/uct/api/device/uct_device_types.h index eec4af75001..e725825c600 100644 --- a/src/uct/api/device/uct_device_types.h +++ b/src/uct/api/device/uct_device_types.h @@ -8,6 +8,7 @@ #define UCT_DEVICE_TYPES_H #include +#include #include @@ -37,7 +38,8 @@ typedef enum { /* Base class for all device endpoints */ typedef struct uct_device_ep { - uint8_t uct_tl_id; /* Defined in uct_device_tl_id_t */ + uint8_t uct_tl_id; /* Defined in uct_device_tl_id_t */ + ucs_device_log_config_t log_config; /* Logging configuration */ } uct_device_ep_t; @@ -47,9 +49,4 @@ typedef struct uct_device_completion { ucs_status_t status; /* Status of the operation */ } uct_device_completion_t; - -/* Base structure for all device memory elements */ -struct uct_device_mem_element { -}; - #endif diff --git a/src/uct/base/uct_iface.c b/src/uct/base/uct_iface.c index 79c11ad62e9..ef3c2ef4d78 100644 --- a/src/uct/base/uct_iface.c +++ b/src/uct/base/uct_iface.c @@ -815,6 +815,12 @@ ucs_status_t uct_ep_invalidate(uct_ep_h ep, unsigned flags) return iface->internal_ops->ep_invalidate(ep, flags); } +void uct_device_ep_init(uct_device_ep_t *device_ep, uct_device_tl_id_t tl_id) +{ + device_ep->uct_tl_id = tl_id; + ucs_device_log_config_init(&device_ep->log_config); +} + void uct_ep_set_iface(uct_ep_h ep, uct_iface_t *iface) { ep->iface = iface; diff --git a/src/uct/base/uct_iface.h b/src/uct/base/uct_iface.h index b7775695016..2d8042a15c3 100644 --- a/src/uct/base/uct_iface.h +++ b/src/uct/base/uct_iface.h @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -1057,6 +1058,8 @@ void uct_iface_vfs_set_dirty(uct_iface_h iface); ucs_status_t uct_ep_invalidate(uct_ep_h ep, unsigned flags); +void uct_device_ep_init(uct_device_ep_t *device_ep, uct_device_tl_id_t tl_id); + void uct_tl_register(uct_component_t *component, uct_tl_t *tl); void uct_tl_unregister(uct_tl_t *tl); diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc.cuh b/src/uct/cuda/cuda_ipc/cuda_ipc.cuh index 4923fd68a06..f3d45ca9382 100644 --- a/src/uct/cuda/cuda_ipc/cuda_ipc.cuh +++ b/src/uct/cuda/cuda_ipc/cuda_ipc.cuh @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_ep.c b/src/uct/cuda/cuda_ipc/cuda_ipc_ep.c index 2d78a1f094e..951fce2f864 100644 --- a/src/uct/cuda/cuda_ipc/cuda_ipc_ep.c +++ b/src/uct/cuda/cuda_ipc/cuda_ipc_ep.c @@ -249,7 +249,8 @@ ucs_status_t uct_cuda_ipc_ep_get_device_ep(uct_ep_h tl_ep, goto out; } - device_ep.uct_tl_id = UCT_DEVICE_TL_CUDA_IPC; + uct_device_ep_init(&device_ep, UCT_DEVICE_TL_CUDA_IPC); + status = UCT_CUDADRV_FUNC_LOG_ERR( cuMemAlloc((CUdeviceptr *)&ep->device_ep, sizeof(uct_device_ep_t))); if (status != UCS_OK) { diff --git a/src/uct/ib/mlx5/gdaki/gdaki.c b/src/uct/ib/mlx5/gdaki/gdaki.c index a00a75b7de7..d2b6f412ff6 100644 --- a/src/uct/ib/mlx5/gdaki/gdaki.c +++ b/src/uct/ib/mlx5/gdaki/gdaki.c @@ -172,6 +172,7 @@ static UCS_CLASS_INIT_FUNC(uct_rc_gdaki_ep_t, const uct_ep_params_t *params) goto err_dev_ep; } + uct_device_ep_init(&dev_ep.super, UCT_DEVICE_TL_RC_MLX5_GDA); dev_ep.atomic_va = iface->atomic_buff; dev_ep.atomic_lkey = htonl(iface->atomic_mr->lkey); diff --git a/src/uct/ib/mlx5/gdaki/gdaki.cuh b/src/uct/ib/mlx5/gdaki/gdaki.cuh index 3c6f43b7241..5f9931e820b 100644 --- a/src/uct/ib/mlx5/gdaki/gdaki.cuh +++ b/src/uct/ib/mlx5/gdaki/gdaki.cuh @@ -11,9 +11,14 @@ #include #include +#include #define UCT_RC_GDA_RESV_WQE_NO_RESOURCE -1ULL +#define uct_rc_mlx5_gda_log(_level, _ep, _fmt, ...) \ + ucs_device_log(_level, &(_ep)->super.log_config, "[qp 0x%x] " _fmt, \ + (_ep)->sq_num, ##__VA_ARGS__) + UCS_F_DEVICE void * uct_rc_mlx5_gda_get_wqe_ptr(uct_rc_gdaki_dev_ep_t *ep, uint16_t wqe_idx) @@ -195,6 +200,9 @@ UCS_F_DEVICE void uct_rc_mlx5_gda_wqe_prepare_put_or_atomic( doca_gpu_dev_verbs_store_wqe_seg(atseg_ptr, (uint64_t*)&(atseg)); } + uct_rc_mlx5_gda_log(TRACE_DATA, ep, "WQE[%d] opcode %d clags %d", wqe_idx, + opcode, ctrl_flags); + doca_gpu_dev_verbs_store_wqe_seg(cseg_ptr, (uint64_t*)&(cseg)); doca_gpu_dev_verbs_store_wqe_seg(rseg_ptr, (uint64_t*)&(rseg)); doca_gpu_dev_verbs_store_wqe_seg(dseg_ptr, (uint64_t*)&(dseg)); @@ -221,6 +229,7 @@ UCS_F_DEVICE void uct_rc_mlx5_gda_db(uct_rc_gdaki_dev_ep_t *ep, doca_gpu_dev_verbs_lock( &ep->sq_lock); + uct_rc_mlx5_gda_log(TRACE_DATA, ep, "DB index %lu", ep->sq_ready_index); uct_rc_mlx5_gda_ring_db(ep, ep->sq_ready_index); uct_rc_mlx5_gda_update_dbr(ep, ep->sq_ready_index); uct_rc_mlx5_gda_ring_db(ep, ep->sq_ready_index); @@ -347,6 +356,8 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_put_multi( return UCS_ERR_NO_RESOURCE; } + uct_rc_mlx5_gda_log(TRACE_DATA, ep, "RSV wqe_base %lu count %u", wqe_base, + count); fc = doca_gpu_dev_verbs_wqe_idx_inc_mask(ep->sq_wqe_pi, ep->sq_wqe_num / 2); wqe_idx = doca_gpu_dev_verbs_wqe_idx_inc_mask(wqe_base, lane_id); for (uint32_t i = lane_id; i < count; i += num_lanes) { @@ -437,6 +448,8 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_put_multi_partial( return UCS_ERR_NO_RESOURCE; } + uct_rc_mlx5_gda_log(TRACE_DATA, ep, "RSV wqe_base %lu count %u", wqe_base, + count); fc = doca_gpu_dev_verbs_wqe_idx_inc_mask(ep->sq_wqe_pi, ep->sq_wqe_num / 2); wqe_idx = doca_gpu_dev_verbs_wqe_idx_inc_mask(wqe_base, lane_id); for (uint32_t i = lane_id; i < count; i += num_lanes) { @@ -547,12 +560,11 @@ uct_rc_mlx5_gda_progress_thread(uct_rc_gdaki_dev_ep_t *ep) if (opcode == MLX5_CQE_REQ_ERR) { auto err_cqe = reinterpret_cast(cqe64); auto wqe_ptr = uct_rc_mlx5_gda_get_wqe_ptr(ep, wqe_idx); - ucs_device_error("CQE[%d] with syndrome:%x vendor:%x hw:%x " - "wqe_idx:0x%x qp:0x%x", - idx, err_cqe->syndrome, err_cqe->vendor_err_synd, - err_cqe->hw_err_synd, wqe_idx, - doca_gpu_dev_verbs_bswap32(err_cqe->s_wqe_opcode_qpn) & - 0xffffff); + uct_rc_mlx5_gda_log( + ERROR, ep, + "CQE[%d] wqe_cnt %d synd 0x%x vend_err 0x%x hw_err 0x%x", idx, + wqe_cnt, err_cqe->syndrome, err_cqe->vendor_err_synd, + err_cqe->hw_err_synd); uct_rc_mlx5_gda_qedump("WQE", wqe_ptr, 64); uct_rc_mlx5_gda_qedump("CQE", cqe64, 64); return UCS_ERR_IO_ERROR; @@ -566,6 +578,10 @@ uct_rc_mlx5_gda_progress_thread(uct_rc_gdaki_dev_ep_t *ep) uint64_t sq_wqe_pi = ep->sq_wqe_pi; pi_ref.fetch_max(((wqe_cnt - sq_wqe_pi) & 0xffff) + sq_wqe_pi + 1); + uct_rc_mlx5_gda_log(TRACE_DATA, ep, + "CQE[%d] opcode %d wqe_cnt %d sq_wqe_pi %ld", idx, + opcode, wqe_cnt, pi_ref.load()); + doca_gpu_dev_verbs_fence_release(); return UCS_OK; } diff --git a/test/gtest/ucp/cuda/test_kernels.cu b/test/gtest/ucp/cuda/test_kernels.cu index 55b18a56d6f..1cdc1cc273c 100644 --- a/test/gtest/ucp/cuda/test_kernels.cu +++ b/test/gtest/ucp/cuda/test_kernels.cu @@ -4,6 +4,8 @@ * See file LICENSE for terms. */ +#define UCS_DEVICE_MAX_LOG_LEVEL UCS_LOG_LEVEL_TRACE_POLL + #include "test_kernels.h" #include @@ -56,8 +58,9 @@ ucp_test_kernel_do_operation(const test_ucp_device_kernel_params_t ¶ms, case TEST_UCP_DEVICE_KERNEL_COUNTER_READ: uint64_t value = ucp_device_counter_read(params.local_counter.address); if (value != params.local_counter.value) { - ucs_device_error("counter value mismatch: expected %lu, got %lu", - params.local_counter.value, value); + ucs_device_log(ERROR, ¶ms.log_config, + "counter value mismatch: expected %lu, got %lu", + params.local_counter.value, value); return UCS_ERR_IO_ERROR; } /* req_ptr is not used in this case */ @@ -116,7 +119,7 @@ ucp_test_kernel(const test_ucp_device_kernel_params_t params, ucs_status_t *status_ptr) { if (blockDim.x > device_request::MAX_THREADS) { - ucs_device_error("blockDim.x > MAX_THREADS"); + ucs_device_log(ERROR, ¶ms.log_config, "blockDim.x > MAX_THREADS"); *status_ptr = UCS_ERR_INVALID_PARAM; return; } diff --git a/test/gtest/ucp/cuda/test_kernels.h b/test/gtest/ucp/cuda/test_kernels.h index 0f929f043c2..9c447a70215 100644 --- a/test/gtest/ucp/cuda/test_kernels.h +++ b/test/gtest/ucp/cuda/test_kernels.h @@ -8,7 +8,7 @@ #define CUDA_TEST_KERNELS_H_ #include -#include +#include typedef enum { TEST_UCP_DEVICE_KERNEL_PUT_SINGLE, @@ -28,6 +28,7 @@ typedef struct { bool with_request; size_t num_iters; ucp_device_mem_list_handle_h mem_list; + ucs_device_log_config_t log_config; union { struct { unsigned mem_list_index; diff --git a/test/gtest/ucp/test_ucp_device.cc b/test/gtest/ucp/test_ucp_device.cc index 4340f1c7222..db5caf203ea 100644 --- a/test/gtest/ucp/test_ucp_device.cc +++ b/test/gtest/ucp/test_ucp_device.cc @@ -4,13 +4,14 @@ * See file LICENSE for terms. */ +#include "cuda/test_kernels.h" + #include #include - #include +#include #include -#include "cuda/test_kernels.h" class test_ucp_device : public ucp_test { public: @@ -254,12 +255,9 @@ class test_ucp_device_kernel : public test_ucp_device { public: static void get_test_variants(std::vector &variants) { - // TODO move to UCS - static const char *ucs_device_level_names[] = {"thread", "warp", - "block", "grid"}; add_variant_values(variants, test_ucp_device::get_test_variants, UCS_BIT(UCS_DEVICE_LEVEL_THREAD) | - UCS_BIT(UCS_DEVICE_LEVEL_WARP), + UCS_BIT(UCS_DEVICE_LEVEL_WARP), ucs_device_level_names); } @@ -276,6 +274,7 @@ class test_ucp_device_kernel : public test_ucp_device { params.num_blocks = 1; params.level = get_device_level(); params.num_iters = num_iters; + ucs_device_log_config_init(¶ms.log_config); return params; } @@ -345,11 +344,8 @@ class test_ucp_device_xfer : public test_ucp_device_kernel { test_ucp_device_kernel_params_t init_params() { // TODO: Test different sizes and alignment - test_ucp_device_kernel_params_t params; - params.num_threads = get_num_threads(); - params.num_blocks = 1; - params.level = get_device_level(); - params.num_iters = get_num_iters(); + auto params = test_ucp_device_kernel::init_params(); + params.num_iters = get_num_iters(); switch (get_send_mode()) { case NODELAY_WITH_REQ: params.with_no_delay = true; diff --git a/test/gtest/uct/cuda/test_cuda_ipc_device.cc b/test/gtest/uct/cuda/test_cuda_ipc_device.cc index 1980122edff..dbf6df10f0c 100644 --- a/test/gtest/uct/cuda/test_cuda_ipc_device.cc +++ b/test/gtest/uct/cuda/test_cuda_ipc_device.cc @@ -146,25 +146,11 @@ Parameter packing in resource.variant (uint32_t): ((nb & 0xF) << 24) | ((nt & 0xFFF) << 12) | (off & 0xFFF); - switch (dl) { - case UCS_DEVICE_LEVEL_THREAD: - up->variant_name = "thread"; - break; - case UCS_DEVICE_LEVEL_WARP: - up->variant_name = "warp"; - break; - case UCS_DEVICE_LEVEL_BLOCK: - up->variant_name = "block"; - break; - case UCS_DEVICE_LEVEL_GRID: - up->variant_name = "grid"; - break; - default: - break; - } + up->variant_name = ucs_device_level_names[dl]; up->variant_name += "- nt" + std::to_string(nt) + "- nb" + std::to_string(nb) + - "- offset" + std::to_string(off); + "- offset" + + std::to_string(off); out.push_back(up.get()); storage.emplace_back(std::move(up)); } diff --git a/test/gtest/uct/cuda/test_kernels_uct.h b/test/gtest/uct/cuda/test_kernels_uct.h index b902aa2d0fe..3b829c6596a 100644 --- a/test/gtest/uct/cuda/test_kernels_uct.h +++ b/test/gtest/uct/cuda/test_kernels_uct.h @@ -9,7 +9,7 @@ #include #include -#include +#include namespace cuda_uct {