From 18c020f85eba866c2d83b1314e8d6de5cdcd4833 Mon Sep 17 00:00:00 2001 From: Michal Shalev Date: Thu, 9 Oct 2025 11:28:58 +0300 Subject: [PATCH 1/3] DEVICE/API: Add NIXL device-side logging infrastructure Signed-off-by: Michal Shalev --- src/api/gpu/ucx/nixl_device.cuh | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/api/gpu/ucx/nixl_device.cuh b/src/api/gpu/ucx/nixl_device.cuh index 57356ed2c..763e906b9 100644 --- a/src/api/gpu/ucx/nixl_device.cuh +++ b/src/api/gpu/ucx/nixl_device.cuh @@ -20,6 +20,16 @@ #include #include +/* Helper macro to print a message from NIXL device function including the + * thread and block indices, file, line, and function */ +#define nixl_device_printf(_title, _fmt, ...) \ + printf("(%5d:%5d) %5s %s:%d %s: " _fmt "\n", threadIdx.x, blockIdx.x, _title, \ + __FILE__, __LINE__, __func__, ##__VA_ARGS__) + +/* Print an error message from NIXL device function */ +#define nixl_device_error(_fmt, ...) \ + nixl_device_printf("ERROR", _fmt, ##__VA_ARGS__) + struct nixlGpuXferStatusH { ucp_device_request_t device_request; }; @@ -66,7 +76,7 @@ nixlGpuConvertUcsStatus(ucs_status_t status) { if (!UCS_STATUS_IS_ERR(status)) { return NIXL_SUCCESS; } - printf("UCX returned error: %d\n", status); + nixl_device_error("UCX backend error"); return NIXL_ERR_BACKEND; } @@ -242,6 +252,7 @@ nixlGpuGetXferStatus(nixlGpuXferStatusH &xfer_status) { case UCS_INPROGRESS: return NIXL_IN_PROG; default: + nixl_device_error("UCX backend error"); return NIXL_ERR_BACKEND; } } From 61720cbd9cc20d3b19707573e933a12840644a09 Mon Sep 17 00:00:00 2001 From: Michal Shalev Date: Tue, 21 Oct 2025 02:44:48 +0300 Subject: [PATCH 2/3] DEVICE/API: Align to NIXL log Signed-off-by: Michal Shalev --- src/api/gpu/ucx/nixl_device.cuh | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/api/gpu/ucx/nixl_device.cuh b/src/api/gpu/ucx/nixl_device.cuh index 763e906b9..49a74a5f6 100644 --- a/src/api/gpu/ucx/nixl_device.cuh +++ b/src/api/gpu/ucx/nixl_device.cuh @@ -20,11 +20,22 @@ #include #include +/* Helper function to extract the base filename */ +__device__ __forceinline__ static const char* nixl_device_basefile(const char* path) { + const char* base = path; + for (const char* p = path; *p; ++p) { + if (*p == '/') { + base = p + 1; + } + } + return base; +} + /* Helper macro to print a message from NIXL device function including the - * thread and block indices, file, line, and function */ -#define nixl_device_printf(_title, _fmt, ...) \ - printf("(%5d:%5d) %5s %s:%d %s: " _fmt "\n", threadIdx.x, blockIdx.x, _title, \ - __FILE__, __LINE__, __func__, ##__VA_ARGS__) + * thread and block indices, file and line */ +#define nixl_device_printf(_log_level, _fmt, ...) \ + printf("%c T%-4d:B%-4d%*s%s:%d] " _fmt "\n", _log_level[0], threadIdx.x, blockIdx.x, \ + 17, "", nixl_device_basefile(__FILE__), __LINE__, ##__VA_ARGS__) /* Print an error message from NIXL device function */ #define nixl_device_error(_fmt, ...) \ From f44875f4fbb73e9c22b546423a071d03afa0ef57 Mon Sep 17 00:00:00 2001 From: Michal Shalev Date: Tue, 21 Oct 2025 03:01:45 +0300 Subject: [PATCH 3/3] DEVICE/API: call ucs_device_status_string Signed-off-by: Michal Shalev --- src/api/gpu/ucx/nixl_device.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/api/gpu/ucx/nixl_device.cuh b/src/api/gpu/ucx/nixl_device.cuh index bc15cb809..6dd7d8187 100644 --- a/src/api/gpu/ucx/nixl_device.cuh +++ b/src/api/gpu/ucx/nixl_device.cuh @@ -88,7 +88,7 @@ nixlGpuConvertUcsStatus(ucs_status_t status) { if (!UCS_STATUS_IS_ERR(status)) { return NIXL_IN_PROG; } - nixl_device_error("UCX backend error"); + nixl_device_error("UCX backend error: %s", ucs_device_status_string(status)); return NIXL_ERR_BACKEND; } @@ -268,7 +268,7 @@ nixlGpuGetXferStatus(nixlGpuXferStatusH &xfer_status) { case UCS_INPROGRESS: return NIXL_IN_PROG; default: - nixl_device_error("UCX backend error"); + nixl_device_error("UCX backend error: %s", ucs_device_status_string(status)); return NIXL_ERR_BACKEND; } }