diff --git a/src/tools/perf/api/libperf.h b/src/tools/perf/api/libperf.h
index 2e1338b2423..ad6d3f6b9dc 100644
--- a/src/tools/perf/api/libperf.h
+++ b/src/tools/perf/api/libperf.h
@@ -11,7 +11,7 @@
 #define UCX_LIBPERF_H
 
 #include <ucs/sys/compiler.h>
-#include <ucs/sys/device_code.h>
+#include <ucs/device/device_common.h>
 
 BEGIN_C_DECLS
 
diff --git a/src/tools/perf/cuda/cuda_kernel.cuh b/src/tools/perf/cuda/cuda_kernel.cuh
index c9e71ba40c7..f94b071a1f8 100644
--- a/src/tools/perf/cuda/cuda_kernel.cuh
+++ b/src/tools/perf/cuda/cuda_kernel.cuh
@@ -10,18 +10,19 @@
 #include "cuda_common.h"
 
 #include <tools/perf/lib/libperf_int.h>
-#include <ucs/sys/device_code.h>
+#include <ucs/device/device_log_impl.h>
 #include <cuda_runtime.h>
 
 
 typedef unsigned long long ucx_perf_cuda_time_t;
 
 struct ucx_perf_cuda_context {
-    unsigned             max_outstanding;
-    ucx_perf_counter_t   max_iters;
-    ucx_perf_cuda_time_t report_interval_ns;
-    ucx_perf_counter_t   completed_iters;
-    ucs_status_t         status;
+    unsigned                max_outstanding;
+    ucx_perf_counter_t      max_iters;
+    ucx_perf_cuda_time_t    report_interval_ns;
+    ucx_perf_counter_t      completed_iters;
+    ucs_status_t            status;
+    ucs_device_log_config_t log_config;
 };
 
 UCS_F_DEVICE ucx_perf_cuda_time_t ucx_perf_cuda_get_time_ns()
@@ -153,6 +154,7 @@ public:
                                         ULONG_MAX :
                                         ucs_time_to_nsec(perf.report_interval) / 100;
         m_cpu_ctx->status             = UCS_ERR_NOT_IMPLEMENTED;
+        ucs_device_log_config_init(&m_cpu_ctx->log_config);
     }
 
     ~ucx_perf_cuda_test_runner()
diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu
index 5b7afaf911c..26713086d82 100644
--- a/src/tools/perf/cuda/ucp_cuda_kernel.cu
+++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu
@@ -255,7 +255,8 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx,
         while (request_mgr.get_pending_count() >= ctx.max_outstanding) {
             status = request_mgr.progress<level>(1);
             if (UCS_STATUS_IS_ERR(status)) {
-                ucs_device_error("progress failed: %d", status);
+                ucs_device_log(ERROR, &ctx.log_config, "progress failed: %d",
+                               status);
                 goto out;
             }
         }
@@ -263,7 +264,7 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx,
         ucp_device_request_t &req = request_mgr.get_request();
         status = ucp_perf_cuda_send_nbx<level, cmd>(params, idx, req);
         if (status != UCS_OK) {
-            ucs_device_error("send failed: %d", status);
+            ucs_device_log(ERROR, &ctx.log_config, "send failed: %d", status);
             goto out;
         }
 
@@ -274,7 +275,8 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx,
     while (request_mgr.get_pending_count() > 0) {
         status = request_mgr.progress<level>(max_iters);
         if (UCS_STATUS_IS_ERR(status)) {
-            ucs_device_error("final progress failed: %d", status);
+            ucs_device_log(ERROR, &ctx.log_config, "final progress failed: %d",
+                           status);
             goto out;
         }
     }
@@ -300,7 +302,8 @@ ucp_perf_cuda_put_multi_latency_kernel(ucx_perf_cuda_context &ctx,
         if (is_sender) {
             status = ucp_perf_cuda_send_sync<level, cmd>(params, idx, req);
             if (status != UCS_OK) {
-                ucs_device_error("sender send failed: %d", status);
+                ucs_device_log(ERROR, &ctx.log_config, "sender send failed: %d",
+                               status);
                 break;
             }
             ucx_perf_cuda_wait_sn(params.counter_recv, idx + 1);
@@ -308,7 +311,8 @@ ucp_perf_cuda_put_multi_latency_kernel(ucx_perf_cuda_context &ctx,
             ucx_perf_cuda_wait_sn(params.counter_recv, idx + 1);
             status = ucp_perf_cuda_send_sync<level, cmd>(params, idx, req);
             if (status != UCS_OK) {
-                ucs_device_error("receiver send failed: %d", status);
+                ucs_device_log(ERROR, &ctx.log_config,
+                               "receiver send failed: %d", status);
                 break;
             }
         }
diff --git a/src/tools/perf/perftest_params.c b/src/tools/perf/perftest_params.c
index e24a0be5507..5248749161f 100644
--- a/src/tools/perf/perftest_params.c
+++ b/src/tools/perf/perftest_params.c
@@ -457,7 +457,7 @@ static ucs_status_t parse_device_level(const char *opt_arg,
 {
     ucs_device_level_t level;
     for (level = UCS_DEVICE_LEVEL_THREAD; level <= UCS_DEVICE_LEVEL_GRID; ++level) {
-        if (!strcmp(opt_arg, ucs_device_level_name(level))) {
+        if (!strcasecmp(opt_arg, ucs_device_level_names[level])) {
             *device_level = level;
             return UCS_OK;
         }
diff --git a/src/ucp/api/device/ucp_device_impl.h b/src/ucp/api/device/ucp_device_impl.h
index 5c3ece9f71a..1bea785dd61 100644
--- a/src/ucp/api/device/ucp_device_impl.h
+++ b/src/ucp/api/device/ucp_device_impl.h
@@ -11,7 +11,7 @@
 
 #include <ucp/api/ucp_def.h>
 #include <uct/api/device/uct_device_impl.h>
-#include <ucs/sys/device_code.h>
+#include <ucs/device/device_common.h>
 #include <ucs/type/status.h>
 #include <stdint.h>
 
diff --git a/src/ucp/core/ucp_device.c b/src/ucp/core/ucp_device.c
index dffba8622b5..db83ddd7f44 100644
--- a/src/ucp/core/ucp_device.c
+++ b/src/ucp/core/ucp_device.c
@@ -13,6 +13,7 @@
 #include <ucp/core/ucp_mm.h>
 #include <ucp/api/device/ucp_host.h>
 #include <ucp/api/device/ucp_device_types.h>
+#include <ucp/wireup/wireup_ep.h>
 #include <ucs/type/param.h>
 #include <ucp/wireup/wireup_ep.h>
 
diff --git a/src/ucs/Makefile.am b/src/ucs/Makefile.am
index 28b6fa1e805..575db46db63 100644
--- a/src/ucs/Makefile.am
+++ b/src/ucs/Makefile.am
@@ -34,6 +34,10 @@ nobase_dist_libucs_la_HEADERS = \
 	config/ini.h \
 	config/parser.h \
 	config/types.h \
+	device/cuda_device.cuh \
+	device/device_common.h \
+	device/device_log_impl.h \
+	device/stub_device.h \
 	datastruct/array.h \
 	datastruct/callbackq.h \
 	datastruct/callbackq_compat.h \
@@ -63,7 +67,6 @@ nobase_dist_libucs_la_HEADERS = \
 	stats/libstats.h \
 	sys/event_set.h \
 	sys/compiler_def.h\
-	sys/device_code.h \
 	sys/math.h \
 	sys/preprocessor.h \
 	sys/string.h \
@@ -190,6 +193,7 @@ libucs_la_SOURCES = \
 	debug/debug.c \
 	debug/log.c \
 	debug/memtrack.c \
+	device/device_common.c \
 	memory/memory_type.c \
 	memory/memtype_cache.c \
 	memory/numa.c \
diff --git a/src/ucs/config/global_opts.c b/src/ucs/config/global_opts.c
index 3cb8a8128bb..318ef5d9d4f 100644
--- a/src/ucs/config/global_opts.c
+++ b/src/ucs/config/global_opts.c
@@ -29,6 +29,7 @@ ucs_global_opts_t ucs_global_opts = {
     .log_file_rotate       = 0,
     .log_buffer_size       = 1024,
     .log_data_size         = 0,
+    .device_log_level      = UCS_LOG_LEVEL_WARN,
     .mpool_fifo            = 0,
     .handle_errors         = UCS_BIT(UCS_HANDLE_ERROR_BACKTRACE),
     .error_signals         = { NULL, 0 },
@@ -114,6 +115,10 @@ static ucs_config_field_t ucs_global_opts_table[] = {
   "How much packet payload to print, at most, in data mode.",
   ucs_offsetof(ucs_global_opts_t, log_data_size), UCS_CONFIG_TYPE_ULONG},
 
+ {"DEVICE_LOG_LEVEL", "warn",
+  "Logging level for device functions.",
+  ucs_offsetof(ucs_global_opts_t, device_log_level), UCS_CONFIG_TYPE_ENUM(ucs_log_level_names)},
+
  {"LOG_PRINT_ENABLE", "n",
   "Enable output of ucs_print(). This option is intended for use by the library developers.",
   ucs_offsetof(ucs_global_opts_t, log_print_enable), UCS_CONFIG_TYPE_BOOL},
diff --git a/src/ucs/config/global_opts.h b/src/ucs/config/global_opts.h
index f1d4540e8a6..1a0a86d0339 100644
--- a/src/ucs/config/global_opts.h
+++ b/src/ucs/config/global_opts.h
@@ -46,6 +46,9 @@ typedef struct {
     /* Maximal amount of packet data to print per packet */
     size_t                     log_data_size;
 
+    /* Log level for device API */
+    ucs_log_level_t            device_log_level;
+
     /* Enable ucs_print() output */
     int                        log_print_enable;
 
diff --git a/src/ucs/device/cuda_device.cuh b/src/ucs/device/cuda_device.cuh
new file mode 100644
index 00000000000..8edfb971436
--- /dev/null
+++ b/src/ucs/device/cuda_device.cuh
@@ -0,0 +1,68 @@
+/**
+ * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2025. ALL RIGHTS RESERVED.
+ *
+ * See file LICENSE for terms.
+ */
+
+#ifndef UCS_CUDA_DEVICE_CUH
+#define UCS_CUDA_DEVICE_CUH
+
+#include <stdint.h>
+
+
+/* Device function */
+#define UCS_F_DEVICE __device__ __forceinline__ static
+
+
+/* Device library function */
+#define UCS_F_DEVICE_LIB __device__
+
+
+/*
+ * Read a 64-bit atomic value from a global memory address.
+ */
+UCS_F_DEVICE uint64_t ucs_device_atomic64_read(const uint64_t *ptr)
+{
+    uint64_t ret;
+    asm volatile("ld.acquire.sys.global.u64 %0, [%1];" : "=l"(ret) : "l"(ptr));
+    return ret;
+}
+
+
+/*
+ * Write a 64-bit value to counter global memory address.
+ */
+UCS_F_DEVICE void ucs_device_atomic64_write(uint64_t *ptr, uint64_t value)
+{
+    asm volatile("st.release.sys.u64 [%0], %1;"
+                 :
+                 : "l"(ptr), "l"(value)
+                 : "memory");
+}
+
+
+/*
+ * Read the 64-bit GPU global nanosecond timer
+ */
+UCS_F_DEVICE uint64_t ucs_device_get_time_ns(void)
+{
+    uint64_t globaltimer;
+    /* 64-bit GPU global nanosecond timer */
+    asm volatile("mov.u64 %0, %globaltimer;" : "=l"(globaltimer));
+    return globaltimer;
+}
+
+/*
+ * Load a constant from global memory.
+ */
+template<typename T> UCS_F_DEVICE T ucs_device_load_const(const T *ptr)
+{
+    return __ldg(ptr);
+}
+
+template<> inline __device__ void *ucs_device_load_const(void *const *ptr)
+{
+    return (void*)__ldg((uint64_t*)ptr);
+}
+
+#endif /* UCS_CUDA_DEVICE_CUH */
diff --git a/src/ucs/device/device_common.c b/src/ucs/device/device_common.c
new file mode 100644
index 00000000000..6185727990b
--- /dev/null
+++ b/src/ucs/device/device_common.c
@@ -0,0 +1,20 @@
+/**
+ * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2025. ALL RIGHTS RESERVED.
+ *
+ * See file LICENSE for terms.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "device_common.h"
+
+#include <ucs/config/global_opts.h>
+
+const char *ucs_device_level_names[] = {"thread", "warp", "block", "grid"};
+
+void ucs_device_log_config_init(ucs_device_log_config_t *config)
+{
+    config->level = ucs_global_opts.device_log_level;
+}
diff --git a/src/ucs/device/device_common.h b/src/ucs/device/device_common.h
new file mode 100644
index 00000000000..93d337dd93a
--- /dev/null
+++ b/src/ucs/device/device_common.h
@@ -0,0 +1,51 @@
+/**
+ * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2025. ALL RIGHTS RESERVED.
+ *
+ * See file LICENSE for terms.
+ */
+
+#ifndef UCS_DEVICE_COMMON_H
+#define UCS_DEVICE_COMMON_H
+
+#include <ucs/config/types.h>
+#include <stdint.h>
+
+#ifdef __NVCC__
+#include "cuda_device.cuh"
+#else
+#include "stub_device.h"
+#endif /* __NVCC__ */
+
+BEGIN_C_DECLS
+
+/* Logging configuration for device functions */
+typedef struct {
+    uint8_t level;
+} ucs_device_log_config_t;
+
+
+/**
+ * @brief Cooperation level when calling device functions.
+ */
+typedef enum {
+    UCS_DEVICE_LEVEL_THREAD = 0,
+    UCS_DEVICE_LEVEL_WARP   = 1,
+    UCS_DEVICE_LEVEL_BLOCK  = 2,
+    UCS_DEVICE_LEVEL_GRID   = 3
+} ucs_device_level_t;
+
+
+/** Names for @ref ucs_device_level_t */
+extern const char *ucs_device_level_names[];
+
+
+/* Number of threads in a warp */
+#define UCS_DEVICE_NUM_THREADS_IN_WARP 32
+
+
+/* Initialize the logging configuration for device functions */
+void ucs_device_log_config_init(ucs_device_log_config_t *config);
+
+END_C_DECLS
+
+#endif /* UCS_DEVICE_COMMON_H */
diff --git a/src/ucs/device/device_log_impl.h b/src/ucs/device/device_log_impl.h
new file mode 100644
index 00000000000..d40e377b189
--- /dev/null
+++ b/src/ucs/device/device_log_impl.h
@@ -0,0 +1,77 @@
+/**
+ * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2025. ALL RIGHTS RESERVED.
+ *
+ * See file LICENSE for terms.
+ */
+
+#ifndef UCS_DEVICE_LOG_H
+#define UCS_DEVICE_LOG_H
+
+#include "device_common.h"
+
+#include <ucs/sys/compiler_def.h>
+#include <ucs/config/types.h>
+#include <stddef.h>
+
+/* Maximal log level for which the logging code is compiled in.
+   Application kernels can override this value by defining
+   UCS_DEVICE_MAX_LOG_LEVEL before including this file. */
+#ifndef UCS_DEVICE_MAX_LOG_LEVEL
+#define UCS_DEVICE_MAX_LOG_LEVEL UCS_LOG_LEVEL_DEBUG
+#endif
+
+
+/* Helper macro to print a message from a device function including the
+ * thread and block indices */
+#define ucs_device_log(_level, _log_config, _fmt, ...) \
+    do { \
+        if ((UCS_LOG_LEVEL_##_level <= UCS_DEVICE_MAX_LOG_LEVEL) && \
+            (UCS_LOG_LEVEL_##_level <= (_log_config)->level)) { \
+            const uint64_t _ts = ucs_device_get_time_ns(); \
+            printf("[%06lu.%06lu] (%4d:%-3d) %10s:%-4d %-6s " _fmt "\n", \
+                   _ts / 1000000000ul, (_ts % 1000000000ul) / 1000ul, \
+                   threadIdx.x, blockIdx.x, \
+                   ucs_device_log_source_file(__FILE__), __LINE__, \
+                   UCS_LOG_LEVEL_NAME_##_level, ##__VA_ARGS__); \
+        } \
+    } while (0)
+
+
+/* Log level names */
+#define UCS_LOG_LEVEL_NAME_ERROR      "ERROR"
+#define UCS_LOG_LEVEL_NAME_WARN       "WARN"
+#define UCS_LOG_LEVEL_NAME_DIAG       "DIAG"
+#define UCS_LOG_LEVEL_NAME_INFO       "INFO"
+#define UCS_LOG_LEVEL_NAME_DEBUG      "DEBUG"
+#define UCS_LOG_LEVEL_NAME_TRACE      "TRACE"
+#define UCS_LOG_LEVEL_NAME_TRACE_DATA "DATA"
+#define UCS_LOG_LEVEL_NAME_TRACE_POLL "POLL"
+
+
+static UCS_F_DEVICE_LIB const char *ucs_device_basename(const char *path)
+{
+    const char *basename = path;
+    const char *p;
+
+    for (p = path; *p != '\0'; p++) {
+        if (*p == '/') {
+            basename = p + 1;
+        }
+    }
+
+    return basename;
+}
+
+
+UCS_F_DEVICE const char *ucs_device_log_source_file(const char *file)
+{
+    static const char *cached_source_file = NULL;
+
+    if (cached_source_file == NULL) {
+        cached_source_file = ucs_device_basename(file);
+    }
+
+    return cached_source_file;
+}
+
+#endif /* UCS_DEVICE_LOG_H */
diff --git a/src/ucs/device/stub_device.h b/src/ucs/device/stub_device.h
new file mode 100644
index 00000000000..d0b5f3e9954
--- /dev/null
+++ b/src/ucs/device/stub_device.h
@@ -0,0 +1,51 @@
+/**
+ * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2025. ALL RIGHTS RESERVED.
+ *
+ * See file LICENSE for terms.
+ */
+
+#ifndef UCS_STUB_DEVICE_CUH
+#define UCS_STUB_DEVICE_CUH
+
+#include <stdint.h>
+
+/* Device function */
+#define UCS_F_DEVICE static inline
+
+
+/* Device library function */
+#define UCS_F_DEVICE_LIB
+
+
+/*
+  * Read a 64-bit atomic value from a global memory address.
+  */
+UCS_F_DEVICE uint64_t ucs_device_atomic64_read(const uint64_t *ptr)
+{
+    return *ptr;
+}
+
+
+/*
+  * Write a 64-bit value to counter global memory address.
+  */
+UCS_F_DEVICE void ucs_device_atomic64_write(uint64_t *ptr, uint64_t value)
+{
+    *ptr = value;
+}
+
+
+/*
+  * Read the 64-bit GPU global nanosecond timer
+  */
+UCS_F_DEVICE uint64_t ucs_device_get_time_ns(void)
+{
+    return 0;
+}
+
+/*
+  * Load a constant from global memory.
+  */
+#define ucs_device_load_const(_ptr) (*(_ptr))
+
+#endif /* UCS_STUB_DEVICE_CUH */
diff --git a/src/ucs/sys/device_code.h b/src/ucs/sys/device_code.h
deleted file mode 100644
index d80835fe5c1..00000000000
--- a/src/ucs/sys/device_code.h
+++ /dev/null
@@ -1,104 +0,0 @@
-/**
- * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2025. ALL RIGHTS RESERVED.
- *
- * See file LICENSE for terms.
- */
-
-#ifndef UCS_DEVICE_CODE_H
-#define UCS_DEVICE_CODE_H
-
-#include <ucs/sys/compiler_def.h>
-#include <stdint.h>
-
-/*
- * Declare GPU specific functions
- */
-#ifdef __NVCC__
-#define UCS_F_DEVICE __device__ __forceinline__ static
-#else
-#define UCS_F_DEVICE static inline
-#endif /* __NVCC__ */
-
-
-/* Number of threads in a warp */
-#define UCS_DEVICE_NUM_THREADS_IN_WARP 32
-
-
-/**
- * @brief Cooperation level when calling device functions.
- */
-typedef enum {
-    UCS_DEVICE_LEVEL_THREAD = 0,
-    UCS_DEVICE_LEVEL_WARP   = 1,
-    UCS_DEVICE_LEVEL_BLOCK  = 2,
-    UCS_DEVICE_LEVEL_GRID   = 3
-} ucs_device_level_t;
-
-
-static UCS_F_ALWAYS_INLINE const char*
-ucs_device_level_name(ucs_device_level_t level)
-{
-    switch (level) {
-    case UCS_DEVICE_LEVEL_THREAD:
-        return "thread";
-    case UCS_DEVICE_LEVEL_WARP:
-        return "warp";
-    case UCS_DEVICE_LEVEL_BLOCK:
-        return "block";
-    case UCS_DEVICE_LEVEL_GRID:
-        return "grid";
-    default:
-        return "unknown";
-    }
-}
-
-
-/*
- * Read a 64-bit atomic value from a global memory address.
- */
-UCS_F_DEVICE uint64_t ucs_device_atomic64_read(const uint64_t *ptr)
-{
-    uint64_t ret;
-#ifdef __NVCC__
-    asm volatile("ld.acquire.sys.global.u64 %0, [%1];"
-                 : "=l"(ret)
-                 : "l"(ptr));
-#else
-    ret = *ptr;
-#endif
-    return ret;
-}
-
-
-/*
- * Write a 64-bit value to counter global memory address.
- */
-UCS_F_DEVICE void ucs_device_atomic64_write(uint64_t *ptr, uint64_t value)
-{
-#ifdef __NVCC__
-    asm volatile("st.release.sys.u64 [%0], %1;"
-             :
-             : "l"(ptr), "l"(value)
-             : "memory");
-#else
-    *ptr = value;
-#endif
-}
-
-
-/* Helper macro to print a message from a device function including the
- * thread and block indices */
-#define ucs_device_printf(_title, _fmt, ...) \
-    printf("(%d:%d) %6s " _fmt "\n", threadIdx.x, blockIdx.x, _title, \
-           ##__VA_ARGS__)
-
-/* Print an error message from a device function */
-#define ucs_device_error(_fmt, ...) \
-    ucs_device_printf("ERROR", _fmt, ##__VA_ARGS__)
-
-
-/* Print a debug message from a device function */
-#define ucs_device_debug(_fmt, ...) \
-    ucs_device_printf("DEBUG", _fmt, ##__VA_ARGS__)
-
-#endif
diff --git a/src/uct/api/device/uct_device_impl.h b/src/uct/api/device/uct_device_impl.h
index fa0d3e0f656..71305757ae4 100644
--- a/src/uct/api/device/uct_device_impl.h
+++ b/src/uct/api/device/uct_device_impl.h
@@ -10,9 +10,8 @@
 #include "uct_device_types.h"
 
 #include <uct/api/uct_def.h>
+#include <ucs/device/device_common.h>
 #include <uct/cuda/cuda_ipc/cuda_ipc.cuh>
-#include <ucs/sys/device_code.h>
-
 #include <uct/ib/mlx5/gdaki/gdaki.cuh>
 
 
diff --git a/src/uct/api/device/uct_device_types.h b/src/uct/api/device/uct_device_types.h
index eec4af75001..e725825c600 100644
--- a/src/uct/api/device/uct_device_types.h
+++ b/src/uct/api/device/uct_device_types.h
@@ -8,6 +8,7 @@
 #define UCT_DEVICE_TYPES_H
 
 #include <ucs/type/status.h>
+#include <ucs/device/device_common.h>
 #include <stdint.h>
 
 
@@ -37,7 +38,8 @@ typedef enum {
 
 /* Base class for all device endpoints */
 typedef struct uct_device_ep {
-    uint8_t uct_tl_id; /* Defined in uct_device_tl_id_t */
+    uint8_t                 uct_tl_id; /* Defined in uct_device_tl_id_t */
+    ucs_device_log_config_t log_config; /* Logging configuration */
 } uct_device_ep_t;
 
 
@@ -47,9 +49,4 @@ typedef struct uct_device_completion {
     ucs_status_t status; /* Status of the operation */
 } uct_device_completion_t;
 
-
-/* Base structure for all device memory elements */
-struct uct_device_mem_element {
-};
-
 #endif
diff --git a/src/uct/base/uct_iface.c b/src/uct/base/uct_iface.c
index 79c11ad62e9..ef3c2ef4d78 100644
--- a/src/uct/base/uct_iface.c
+++ b/src/uct/base/uct_iface.c
@@ -815,6 +815,12 @@ ucs_status_t uct_ep_invalidate(uct_ep_h ep, unsigned flags)
     return iface->internal_ops->ep_invalidate(ep, flags);
 }
 
+void uct_device_ep_init(uct_device_ep_t *device_ep, uct_device_tl_id_t tl_id)
+{
+    device_ep->uct_tl_id = tl_id;
+    ucs_device_log_config_init(&device_ep->log_config);
+}
+
 void uct_ep_set_iface(uct_ep_h ep, uct_iface_t *iface)
 {
     ep->iface = iface;
diff --git a/src/uct/base/uct_iface.h b/src/uct/base/uct_iface.h
index b7775695016..2d8042a15c3 100644
--- a/src/uct/base/uct_iface.h
+++ b/src/uct/base/uct_iface.h
@@ -12,6 +12,7 @@
 
 #include <ucs/arch/cpu.h>
 #include <uct/api/uct.h>
+#include <uct/api/device/uct_device_types.h>
 #include <uct/base/uct_component.h>
 #include <ucs/config/parser.h>
 #include <ucs/datastruct/arbiter.h>
@@ -1057,6 +1058,8 @@ void uct_iface_vfs_set_dirty(uct_iface_h iface);
 
 ucs_status_t uct_ep_invalidate(uct_ep_h ep, unsigned flags);
 
+void uct_device_ep_init(uct_device_ep_t *device_ep, uct_device_tl_id_t tl_id);
+
 void uct_tl_register(uct_component_t *component, uct_tl_t *tl);
 
 void uct_tl_unregister(uct_tl_t *tl);
diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc.cuh b/src/uct/cuda/cuda_ipc/cuda_ipc.cuh
index 4923fd68a06..f3d45ca9382 100644
--- a/src/uct/cuda/cuda_ipc/cuda_ipc.cuh
+++ b/src/uct/cuda/cuda_ipc/cuda_ipc.cuh
@@ -10,7 +10,7 @@
 #include <uct/api/uct_def.h>
 #include <uct/api/device/uct_device_types.h>
 #include <uct/cuda/cuda_ipc/cuda_ipc_device.h>
-#include <ucs/sys/device_code.h>
+#include <ucs/device/device_common.h>
 #include <ucs/type/status.h>
 #include <cuda/atomic>
 
diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_ep.c b/src/uct/cuda/cuda_ipc/cuda_ipc_ep.c
index 2d78a1f094e..951fce2f864 100644
--- a/src/uct/cuda/cuda_ipc/cuda_ipc_ep.c
+++ b/src/uct/cuda/cuda_ipc/cuda_ipc_ep.c
@@ -249,7 +249,8 @@ ucs_status_t uct_cuda_ipc_ep_get_device_ep(uct_ep_h tl_ep,
         goto out;
     }
 
-    device_ep.uct_tl_id = UCT_DEVICE_TL_CUDA_IPC;
+    uct_device_ep_init(&device_ep, UCT_DEVICE_TL_CUDA_IPC);
+
     status = UCT_CUDADRV_FUNC_LOG_ERR(
             cuMemAlloc((CUdeviceptr *)&ep->device_ep, sizeof(uct_device_ep_t)));
     if (status != UCS_OK) {
diff --git a/src/uct/ib/mlx5/gdaki/gdaki.c b/src/uct/ib/mlx5/gdaki/gdaki.c
index a00a75b7de7..d2b6f412ff6 100644
--- a/src/uct/ib/mlx5/gdaki/gdaki.c
+++ b/src/uct/ib/mlx5/gdaki/gdaki.c
@@ -172,6 +172,7 @@ static UCS_CLASS_INIT_FUNC(uct_rc_gdaki_ep_t, const uct_ep_params_t *params)
         goto err_dev_ep;
     }
 
+    uct_device_ep_init(&dev_ep.super, UCT_DEVICE_TL_RC_MLX5_GDA);
     dev_ep.atomic_va   = iface->atomic_buff;
     dev_ep.atomic_lkey = htonl(iface->atomic_mr->lkey);
 
diff --git a/src/uct/ib/mlx5/gdaki/gdaki.cuh b/src/uct/ib/mlx5/gdaki/gdaki.cuh
index 3c6f43b7241..5f9931e820b 100644
--- a/src/uct/ib/mlx5/gdaki/gdaki.cuh
+++ b/src/uct/ib/mlx5/gdaki/gdaki.cuh
@@ -11,9 +11,14 @@
 
 #include <doca_gpunetio_dev_verbs_qp.cuh>
 #include <cooperative_groups.h>
+#include <ucs/device/device_log_impl.h>
 
 #define UCT_RC_GDA_RESV_WQE_NO_RESOURCE -1ULL
 
+#define uct_rc_mlx5_gda_log(_level, _ep, _fmt, ...) \
+    ucs_device_log(_level, &(_ep)->super.log_config, "[qp 0x%x] " _fmt, \
+                   (_ep)->sq_num, ##__VA_ARGS__)
+
 
 UCS_F_DEVICE void *
 uct_rc_mlx5_gda_get_wqe_ptr(uct_rc_gdaki_dev_ep_t *ep, uint16_t wqe_idx)
@@ -195,6 +200,9 @@ UCS_F_DEVICE void uct_rc_mlx5_gda_wqe_prepare_put_or_atomic(
         doca_gpu_dev_verbs_store_wqe_seg(atseg_ptr, (uint64_t*)&(atseg));
     }
 
+    uct_rc_mlx5_gda_log(TRACE_DATA, ep, "WQE[%d] opcode %d clags %d", wqe_idx,
+                        opcode, ctrl_flags);
+
     doca_gpu_dev_verbs_store_wqe_seg(cseg_ptr, (uint64_t*)&(cseg));
     doca_gpu_dev_verbs_store_wqe_seg(rseg_ptr, (uint64_t*)&(rseg));
     doca_gpu_dev_verbs_store_wqe_seg(dseg_ptr, (uint64_t*)&(dseg));
@@ -221,6 +229,7 @@ UCS_F_DEVICE void uct_rc_mlx5_gda_db(uct_rc_gdaki_dev_ep_t *ep,
 
     doca_gpu_dev_verbs_lock<DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU>(
             &ep->sq_lock);
+    uct_rc_mlx5_gda_log(TRACE_DATA, ep, "DB index %lu", ep->sq_ready_index);
     uct_rc_mlx5_gda_ring_db(ep, ep->sq_ready_index);
     uct_rc_mlx5_gda_update_dbr(ep, ep->sq_ready_index);
     uct_rc_mlx5_gda_ring_db(ep, ep->sq_ready_index);
@@ -347,6 +356,8 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_put_multi(
         return UCS_ERR_NO_RESOURCE;
     }
 
+    uct_rc_mlx5_gda_log(TRACE_DATA, ep, "RSV wqe_base %lu count %u", wqe_base,
+                        count);
     fc = doca_gpu_dev_verbs_wqe_idx_inc_mask(ep->sq_wqe_pi, ep->sq_wqe_num / 2);
     wqe_idx = doca_gpu_dev_verbs_wqe_idx_inc_mask(wqe_base, lane_id);
     for (uint32_t i = lane_id; i < count; i += num_lanes) {
@@ -437,6 +448,8 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_put_multi_partial(
         return UCS_ERR_NO_RESOURCE;
     }
 
+    uct_rc_mlx5_gda_log(TRACE_DATA, ep, "RSV wqe_base %lu count %u", wqe_base,
+                        count);
     fc = doca_gpu_dev_verbs_wqe_idx_inc_mask(ep->sq_wqe_pi, ep->sq_wqe_num / 2);
     wqe_idx = doca_gpu_dev_verbs_wqe_idx_inc_mask(wqe_base, lane_id);
     for (uint32_t i = lane_id; i < count; i += num_lanes) {
@@ -547,12 +560,11 @@ uct_rc_mlx5_gda_progress_thread(uct_rc_gdaki_dev_ep_t *ep)
     if (opcode == MLX5_CQE_REQ_ERR) {
         auto err_cqe = reinterpret_cast<mlx5_err_cqe_ex*>(cqe64);
         auto wqe_ptr = uct_rc_mlx5_gda_get_wqe_ptr(ep, wqe_idx);
-        ucs_device_error("CQE[%d] with syndrome:%x vendor:%x hw:%x "
-                         "wqe_idx:0x%x qp:0x%x",
-                         idx, err_cqe->syndrome, err_cqe->vendor_err_synd,
-                         err_cqe->hw_err_synd, wqe_idx,
-                         doca_gpu_dev_verbs_bswap32(err_cqe->s_wqe_opcode_qpn) &
-                                 0xffffff);
+        uct_rc_mlx5_gda_log(
+                ERROR, ep,
+                "CQE[%d] wqe_cnt %d synd 0x%x vend_err 0x%x hw_err 0x%x", idx,
+                wqe_cnt, err_cqe->syndrome, err_cqe->vendor_err_synd,
+                err_cqe->hw_err_synd);
         uct_rc_mlx5_gda_qedump("WQE", wqe_ptr, 64);
         uct_rc_mlx5_gda_qedump("CQE", cqe64, 64);
         return UCS_ERR_IO_ERROR;
@@ -566,6 +578,10 @@ uct_rc_mlx5_gda_progress_thread(uct_rc_gdaki_dev_ep_t *ep)
     uint64_t sq_wqe_pi = ep->sq_wqe_pi;
     pi_ref.fetch_max(((wqe_cnt - sq_wqe_pi) & 0xffff) + sq_wqe_pi + 1);
 
+    uct_rc_mlx5_gda_log(TRACE_DATA, ep,
+                        "CQE[%d] opcode %d wqe_cnt %d sq_wqe_pi %ld", idx,
+                        opcode, wqe_cnt, pi_ref.load());
+
     doca_gpu_dev_verbs_fence_release<DOCA_GPUNETIO_VERBS_SYNC_SCOPE_GPU>();
     return UCS_OK;
 }
diff --git a/test/gtest/ucp/cuda/test_kernels.cu b/test/gtest/ucp/cuda/test_kernels.cu
index 55b18a56d6f..1cdc1cc273c 100644
--- a/test/gtest/ucp/cuda/test_kernels.cu
+++ b/test/gtest/ucp/cuda/test_kernels.cu
@@ -4,6 +4,8 @@
  * See file LICENSE for terms.
  */
 
+#define UCS_DEVICE_MAX_LOG_LEVEL UCS_LOG_LEVEL_TRACE_POLL
+
 #include "test_kernels.h"
 
 #include <ucp/api/device/ucp_device_impl.h>
@@ -56,8 +58,9 @@ ucp_test_kernel_do_operation(const test_ucp_device_kernel_params_t &params,
     case TEST_UCP_DEVICE_KERNEL_COUNTER_READ:
         uint64_t value = ucp_device_counter_read(params.local_counter.address);
         if (value != params.local_counter.value) {
-            ucs_device_error("counter value mismatch: expected %lu, got %lu",
-                             params.local_counter.value, value);
+            ucs_device_log(ERROR, &params.log_config,
+                           "counter value mismatch: expected %lu, got %lu",
+                           params.local_counter.value, value);
             return UCS_ERR_IO_ERROR;
         }
         /* req_ptr is not used in this case */
@@ -116,7 +119,7 @@ ucp_test_kernel(const test_ucp_device_kernel_params_t params,
                 ucs_status_t *status_ptr)
 {
     if (blockDim.x > device_request<level>::MAX_THREADS) {
-        ucs_device_error("blockDim.x > MAX_THREADS");
+        ucs_device_log(ERROR, &params.log_config, "blockDim.x > MAX_THREADS");
         *status_ptr = UCS_ERR_INVALID_PARAM;
         return;
     }
diff --git a/test/gtest/ucp/cuda/test_kernels.h b/test/gtest/ucp/cuda/test_kernels.h
index 0f929f043c2..9c447a70215 100644
--- a/test/gtest/ucp/cuda/test_kernels.h
+++ b/test/gtest/ucp/cuda/test_kernels.h
@@ -8,7 +8,7 @@
 #define CUDA_TEST_KERNELS_H_
 
 #include <ucp/api/device/ucp_host.h>
-#include <ucs/sys/device_code.h>
+#include <ucs/device/device_common.h>
 
 typedef enum {
     TEST_UCP_DEVICE_KERNEL_PUT_SINGLE,
@@ -28,6 +28,7 @@ typedef struct {
     bool                         with_request;
     size_t                       num_iters;
     ucp_device_mem_list_handle_h mem_list;
+    ucs_device_log_config_t      log_config;
     union {
         struct {
             unsigned   mem_list_index;
diff --git a/test/gtest/ucp/test_ucp_device.cc b/test/gtest/ucp/test_ucp_device.cc
index 4340f1c7222..db5caf203ea 100644
--- a/test/gtest/ucp/test_ucp_device.cc
+++ b/test/gtest/ucp/test_ucp_device.cc
@@ -4,13 +4,14 @@
  * See file LICENSE for terms.
  */
 
+#include "cuda/test_kernels.h"
+
 #include <cuda_runtime.h>
 #include <ucp/ucp_test.h>
-
 #include <ucp/api/device/ucp_device_types.h>
+#include <ucs/device/device_common.h>
 
 #include <common/cuda.h>
-#include "cuda/test_kernels.h"
 
 class test_ucp_device : public ucp_test {
 public:
@@ -254,12 +255,9 @@ class test_ucp_device_kernel : public test_ucp_device {
 public:
     static void get_test_variants(std::vector<ucp_test_variant> &variants)
     {
-        // TODO move to UCS
-        static const char *ucs_device_level_names[] = {"thread", "warp",
-                                                       "block", "grid"};
         add_variant_values(variants, test_ucp_device::get_test_variants,
                            UCS_BIT(UCS_DEVICE_LEVEL_THREAD) |
-                                   UCS_BIT(UCS_DEVICE_LEVEL_WARP),
+                           UCS_BIT(UCS_DEVICE_LEVEL_WARP),
                            ucs_device_level_names);
     }
 
@@ -276,6 +274,7 @@ class test_ucp_device_kernel : public test_ucp_device {
         params.num_blocks                      = 1;
         params.level                           = get_device_level();
         params.num_iters                       = num_iters;
+        ucs_device_log_config_init(&params.log_config);
         return params;
     }
 
@@ -345,11 +344,8 @@ class test_ucp_device_xfer : public test_ucp_device_kernel {
     test_ucp_device_kernel_params_t init_params()
     {
         // TODO: Test different sizes and alignment
-        test_ucp_device_kernel_params_t params;
-        params.num_threads = get_num_threads();
-        params.num_blocks  = 1;
-        params.level       = get_device_level();
-        params.num_iters   = get_num_iters();
+        auto params      = test_ucp_device_kernel::init_params();
+        params.num_iters = get_num_iters();
         switch (get_send_mode()) {
         case NODELAY_WITH_REQ:
             params.with_no_delay = true;
diff --git a/test/gtest/uct/cuda/test_cuda_ipc_device.cc b/test/gtest/uct/cuda/test_cuda_ipc_device.cc
index 1980122edff..dbf6df10f0c 100644
--- a/test/gtest/uct/cuda/test_cuda_ipc_device.cc
+++ b/test/gtest/uct/cuda/test_cuda_ipc_device.cc
@@ -146,25 +146,11 @@ Parameter packing in resource.variant (uint32_t):
                                           ((nb & 0xF) << 24) |
                                           ((nt & 0xFFF) << 12) |
                                           (off & 0xFFF);
-                            switch (dl) {
-                            case UCS_DEVICE_LEVEL_THREAD:
-                                up->variant_name = "thread";
-                                break;
-                            case UCS_DEVICE_LEVEL_WARP:
-                                up->variant_name = "warp";
-                                break;
-                            case UCS_DEVICE_LEVEL_BLOCK:
-                                up->variant_name = "block";
-                                break;
-                            case UCS_DEVICE_LEVEL_GRID:
-                                up->variant_name = "grid";
-                                break;
-                            default:
-                                break;
-                            }
+                            up->variant_name = ucs_device_level_names[dl];
                             up->variant_name += "- nt" + std::to_string(nt) +
                                                 "- nb" + std::to_string(nb) +
-                                                "- offset" + std::to_string(off);
+                                                "- offset" +
+                                                std::to_string(off);
                             out.push_back(up.get());
                             storage.emplace_back(std::move(up));
                         }
diff --git a/test/gtest/uct/cuda/test_kernels_uct.h b/test/gtest/uct/cuda/test_kernels_uct.h
index b902aa2d0fe..3b829c6596a 100644
--- a/test/gtest/uct/cuda/test_kernels_uct.h
+++ b/test/gtest/uct/cuda/test_kernels_uct.h
@@ -9,7 +9,7 @@
 
 #include <uct/api/device/uct_device_types.h>
 #include <uct/api/uct.h>
-#include <ucs/sys/device_code.h>
+#include <ucs/device/device_common.h>
 
 namespace cuda_uct {