Skip to content
12 changes: 12 additions & 0 deletions src/ucp/api/device/ucp_device_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,10 @@ UCS_F_DEVICE ucs_status_t ucp_device_prepare_send(

if ((mem_list_h->version != UCP_DEVICE_MEM_LIST_VERSION_V1) ||
(first_mem_elem_index >= mem_list_h->mem_list_length)) {
ucs_device_error("invalid parameters: mem_list version=%u (expected %u), "
"first_mem_elem_index=%u, mem_list_length=%u",
mem_list_h->version, UCP_DEVICE_MEM_LIST_VERSION_V1,
first_mem_elem_index, mem_list_h->mem_list_length);
return UCS_ERR_INVALID_PARAM;
}

Expand Down Expand Up @@ -146,6 +150,8 @@ UCS_F_DEVICE ucs_status_t ucp_device_put_single(
status = ucp_device_prepare_send(mem_list_h, mem_list_index, req, device_ep,
uct_elem, comp);
if (status != UCS_OK) {
ucs_device_error("send prepare failed with %s, mem_list_index=%u",
ucs_device_status_string(status), mem_list_index);
return status;
}

Expand Down Expand Up @@ -197,6 +203,8 @@ UCS_F_DEVICE ucs_status_t ucp_device_counter_inc(
status = ucp_device_prepare_send(mem_list_h, mem_list_index, req, device_ep,
uct_elem, comp);
if (status != UCS_OK) {
ucs_device_error("send prepare failed with %s, mem_list_index=%u",
ucs_device_status_string(status), mem_list_index);
return status;
}

Expand Down Expand Up @@ -261,6 +269,8 @@ UCS_F_DEVICE ucs_status_t ucp_device_put_multi(
status = ucp_device_prepare_send(mem_list_h, 0, req, device_ep,
uct_mem_list, comp);
if (status != UCS_OK) {
ucs_device_error("send prepare failed with %s, mem_list_length=%u",
ucs_device_status_string(status), mem_list_h->mem_list_length);
return status;
}

Expand Down Expand Up @@ -337,6 +347,8 @@ UCS_F_DEVICE ucs_status_t ucp_device_put_multi_partial(
status = ucp_device_prepare_send(mem_list_h, 0, req, device_ep,
uct_mem_list, comp);
if (status != UCS_OK) {
ucs_device_error("send prepare failed with %s, mem_list_count=%u",
ucs_device_status_string(status), mem_list_count);
return status;
}

Expand Down
4 changes: 4 additions & 0 deletions src/ucp/core/ucp_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,7 @@ ucp_device_mem_list_create(ucp_ep_h ep,
uct_allocated_memory_t mem;

if (!(ep->flags & UCP_EP_FLAG_REMOTE_CONNECTED)) {
ucs_error("ep=%p didn't complete wireup", ep);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we need to remove that one eventually?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a scoped handler in the test

return UCS_ERR_NOT_CONNECTED;
}

Expand All @@ -388,6 +389,8 @@ ucp_device_mem_list_create(ucp_ep_h ep,
&local_sys_dev, &local_md_map,
&mem_type);
if (status != UCS_OK) {
ucs_error("ep=%p check parameters failed: %s", ep,
ucs_status_string(status));
return status;
}

Expand Down Expand Up @@ -426,6 +429,7 @@ ucp_device_mem_list_create(ucp_ep_h ep,
/* Track memory allocator for later release */
status = ucp_device_mem_handle_hash_insert(&mem);
if (status != UCS_OK) {
ucs_error("failed to insert handle: %s", ucs_status_string(status));
uct_mem_free(&mem);
} else {
*handle_p = mem.address;
Expand Down
29 changes: 25 additions & 4 deletions src/ucs/sys/device_code.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#define UCS_DEVICE_CODE_H

#include <ucs/sys/compiler_def.h>
#include <ucs/type/status.h>
#include <stdint.h>

/*
Expand Down Expand Up @@ -35,7 +36,7 @@ typedef enum {
} ucs_device_level_t;


static UCS_F_ALWAYS_INLINE const char*
UCS_F_DEVICE const char*
ucs_device_level_name(ucs_device_level_t level)
{
switch (level) {
Expand Down Expand Up @@ -87,10 +88,10 @@ UCS_F_DEVICE void ucs_device_atomic64_write(uint64_t *ptr, uint64_t value)


/* Helper macro to print a message from a device function including the
* thread and block indices */
* thread and block indices, file, line, and function */
#define ucs_device_printf(_title, _fmt, ...) \
printf("(%d:%d) %6s " _fmt "\n", threadIdx.x, blockIdx.x, _title, \
##__VA_ARGS__)
printf("(%d:%d) %6s %s:%d %s: " _fmt "\n", threadIdx.x, blockIdx.x, _title, \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just some ideas for improvements:

  1. Now messages are not aligned, because thread id vary from 0 to 1023. Maybe we can use a fixed width for thread:block part?
  2. I think we don't need a function name - it would make the output too long. Also not sure if it really helps in case of inline functions - did you check?
  3. Maybe we could also fix max width for file:line, so than the output looks like:
  (0:0) DEBUG filename_long.cu:1234 Message 1
(255:0) ERROR         short.cu:9    Message 2

IMO it's important that message starts at the same length, otherwise it's hard to read

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

%5d and %5s

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. Fixed with Thomas' suggestion.
  2. It works in the case of the inline functions, I tested it. IMO function name is very helpful for debug.
  3. Yes, I changed filename max width to 40 character, line number to max 4 digits and function name max to 30 characters.

This is how it looks like now:

(    0:    0) ERROR /hpc/newhome/mshalev/workspace/gdaki/ucx:255  uct_rc_mlx5_gda_ep_single     : This is an example error message from lane 0
(    0:    0) DEBUG /hpc/newhome/mshalev/workspace/gdaki/ucx:253  uct_rc_mlx5_gda_ep_single     : This is an example debug message addr=0x7fdeacc00200 remote=0x7fdeacc44400 len=8

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, much better now

  1. But still I'm against the function name, because:
  • it makes output too long
  • it's redundant since file:line is present
  • it's absent in UCX logs (I think it would be nice to have a similar layout)
UCX logs
[1760011677.835218] [rock14:3182762:0]           debug.c:1157 UCX  DEBUG using signal stack 0x7f1a76826000 size 141824
[1760011677.848311] [rock14:3182762:0]             cpu.c:338  UCX  DEBUG measured tsc frequency 2595.125 MHz after 0.08 ms
[1760011677.848337] [rock14:3182762:0]            init.c:120  UCX  DEBUG /labhome/iyastrebov/ws/ucx2/bld-devel/lib/libucs.so.0 loaded at 0x7f1a75d50000

It looks weird if we have different log formats within the same library

  1. Can we print just the basename of the file? Long log lines are hard to read
  2. Maybe align (0:1) on the right or left?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. Removed the function name.
  2. Used a shared macro to print the basename.
  3. Aligned to the left.

Now it looks like this:

                    [63      :0      ]         gdaki.cuh:440  UCX  DEBUG This is an example debug message
[1760011677.848311] [rock14:3182762:0]             cpu.c:338  UCX  DEBUG measured tsc frequency 2595.125 MHz after 0.08 ms

__FILE__, __LINE__, __func__, ##__VA_ARGS__)

/* Print an error message from a device function */
#define ucs_device_error(_fmt, ...) \
Expand All @@ -101,4 +102,24 @@ UCS_F_DEVICE void ucs_device_atomic64_write(uint64_t *ptr, uint64_t value)
#define ucs_device_debug(_fmt, ...) \
ucs_device_printf("DEBUG", _fmt, ##__VA_ARGS__)


/**
* @brief Device compatible status code to string conversion
*
* This function provides status code to string conversion that can be called
* from device code. Returns a short string representation of the status code.
*
* @param [in] status Status code to convert
*
* @return Short string representation of the status code
*/
UCS_F_DEVICE const char* ucs_device_status_string(ucs_status_t status)
{
switch (status) {
UCS_STATUS_STRING_CASES
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMO X macro is an ideal candidate here, because it's much more flexible.
Also we keep all related things in one place, so it's less error prone

#define UCS_FOREACH_STATUS(_macro) \
    _macro(UCS_OK, 0, "Success") \
    _macro(UCS_INPROGRESS, 1, "Operation in progress") \
    ...

#define UCS_STATUS_ENUMIFY(ID, VALUE, _) ID = VALUE,

typedef enum {
    UCS_FOREACH_STATUS(UCS_STATUS_ENUMIFY)
} ucs_status_t;

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you, it's a great idea, I pushed a commit that changes the implementation to use X macro.

default:
return "Unknown error";
};
}

#endif
57 changes: 1 addition & 56 deletions src/ucs/type/status.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,62 +18,7 @@ const char *ucs_status_string(ucs_status_t status)
static char error_str[128] = {0};

switch (status) {
case UCS_OK:
return "Success";
case UCS_INPROGRESS:
return "Operation in progress";
case UCS_ERR_NO_MESSAGE:
return "No pending message";
case UCS_ERR_NO_RESOURCE:
return "No resources are available to initiate the operation";
case UCS_ERR_IO_ERROR:
return "Input/output error";
case UCS_ERR_NO_MEMORY:
return "Out of memory";
case UCS_ERR_INVALID_PARAM:
return "Invalid parameter";
case UCS_ERR_UNREACHABLE:
return "Destination is unreachable";
case UCS_ERR_INVALID_ADDR:
return "Address not valid";
case UCS_ERR_NOT_IMPLEMENTED:
return "Function not implemented";
case UCS_ERR_MESSAGE_TRUNCATED:
return "Message truncated";
case UCS_ERR_NO_PROGRESS:
return "No progress";
case UCS_ERR_BUFFER_TOO_SMALL:
return "Provided buffer is too small";
case UCS_ERR_NO_ELEM:
return "No such element";
case UCS_ERR_SOME_CONNECTS_FAILED:
return "Failed to connect some of the requested endpoints";
case UCS_ERR_NO_DEVICE:
return "No such device";
case UCS_ERR_BUSY:
return "Device is busy";
case UCS_ERR_CANCELED:
return "Request canceled";
case UCS_ERR_SHMEM_SEGMENT:
return "Shared memory error";
case UCS_ERR_ALREADY_EXISTS:
return "Element already exists";
case UCS_ERR_OUT_OF_RANGE:
return "Index out of range";
case UCS_ERR_TIMED_OUT:
return "Operation timed out";
case UCS_ERR_EXCEEDS_LIMIT:
return "User-defined limit was reached";
case UCS_ERR_UNSUPPORTED:
return "Unsupported operation";
case UCS_ERR_REJECTED:
return "Operation rejected by remote peer";
case UCS_ERR_NOT_CONNECTED:
return "Endpoint is not connected";
case UCS_ERR_CONNECTION_RESET:
return "Connection reset by remote peer";
case UCS_ERR_ENDPOINT_TIMEOUT:
return "Endpoint timeout";
UCS_STATUS_STRING_CASES
default:
snprintf(error_str, sizeof(error_str) - 1, "Unknown error %d", status);
return error_str;
Expand Down
64 changes: 64 additions & 0 deletions src/ucs/type/status.h
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,70 @@ typedef void *ucs_status_ptr_t;
#define UCS_STATUS_PTR(_status) ((void*)(intptr_t)(_status))
#define UCS_STATUS_IS_ERR(_status) ((_status) < 0)

/**
* @brief Common status code to string cases
*
* This macro defines the common switch cases for converting status codes to
* strings. It's used by both the host and device implementations to avoid
* code duplication.
*/
#define UCS_STATUS_STRING_CASES \
case UCS_OK: \
return "Success"; \
case UCS_INPROGRESS: \
return "Operation in progress"; \
case UCS_ERR_NO_MESSAGE: \
return "No pending message"; \
case UCS_ERR_NO_RESOURCE: \
return "No resources are available to initiate the operation"; \
case UCS_ERR_IO_ERROR: \
return "Input/output error"; \
case UCS_ERR_NO_MEMORY: \
return "Out of memory"; \
case UCS_ERR_INVALID_PARAM: \
return "Invalid parameter"; \
case UCS_ERR_UNREACHABLE: \
return "Destination is unreachable"; \
case UCS_ERR_INVALID_ADDR: \
return "Address not valid"; \
case UCS_ERR_NOT_IMPLEMENTED: \
return "Function not implemented"; \
case UCS_ERR_MESSAGE_TRUNCATED: \
return "Message truncated"; \
case UCS_ERR_NO_PROGRESS: \
return "No progress"; \
case UCS_ERR_BUFFER_TOO_SMALL: \
return "Provided buffer is too small"; \
case UCS_ERR_NO_ELEM: \
return "No such element"; \
case UCS_ERR_SOME_CONNECTS_FAILED: \
return "Failed to connect some of the requested endpoints"; \
case UCS_ERR_NO_DEVICE: \
return "No such device"; \
case UCS_ERR_BUSY: \
return "Device is busy"; \
case UCS_ERR_CANCELED: \
return "Request canceled"; \
case UCS_ERR_SHMEM_SEGMENT: \
return "Shared memory error"; \
case UCS_ERR_ALREADY_EXISTS: \
return "Element already exists"; \
case UCS_ERR_OUT_OF_RANGE: \
return "Index out of range"; \
case UCS_ERR_TIMED_OUT: \
return "Operation timed out"; \
case UCS_ERR_EXCEEDS_LIMIT: \
return "User-defined limit was reached"; \
case UCS_ERR_UNSUPPORTED: \
return "Unsupported operation"; \
case UCS_ERR_REJECTED: \
return "Operation rejected by remote peer"; \
case UCS_ERR_NOT_CONNECTED: \
return "Endpoint is not connected"; \
case UCS_ERR_CONNECTION_RESET: \
return "Connection reset by remote peer"; \
case UCS_ERR_ENDPOINT_TIMEOUT: \
return "Endpoint timeout";

/**
* @param status UCS status code.
Expand Down
11 changes: 11 additions & 0 deletions src/uct/api/device/uct_device_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ UCS_F_DEVICE ucs_status_t uct_device_ep_put_single(
comp);
}

ucs_device_error("unsupported device_ep->uct_tl_id=%d",
device_ep->uct_tl_id);
return UCS_ERR_UNSUPPORTED;
}

Expand Down Expand Up @@ -101,6 +103,8 @@ UCS_F_DEVICE ucs_status_t uct_device_ep_atomic_add(
flags, comp);
}

ucs_device_error("unsupported device_ep->uct_tl_id=%d",
device_ep->uct_tl_id);
return UCS_ERR_UNSUPPORTED;
}

Expand Down Expand Up @@ -166,6 +170,8 @@ UCS_F_DEVICE ucs_status_t uct_device_ep_put_multi(
flags, comp);
}

ucs_device_error("unsupported device_ep->uct_tl_id=%d",
device_ep->uct_tl_id);
return UCS_ERR_UNSUPPORTED;
}

Expand Down Expand Up @@ -238,6 +244,9 @@ UCS_F_DEVICE ucs_status_t uct_device_ep_put_multi_partial(
counter_inc_value, counter_remote_address,
flags, comp);
}

ucs_device_error("unsupported device_ep->uct_tl_id=%d",
device_ep->uct_tl_id);
return UCS_ERR_UNSUPPORTED;
}

Expand Down Expand Up @@ -277,6 +286,8 @@ UCS_F_DEVICE ucs_status_t uct_device_ep_check_completion(
return uct_rc_mlx5_gda_ep_check_completion<level>(device_ep, comp);
}

ucs_device_error("unsupported device_ep->uct_tl_id=%d",
device_ep->uct_tl_id);
return UCS_ERR_UNSUPPORTED;
}

Expand Down
2 changes: 2 additions & 0 deletions src/uct/ib/mlx5/gdaki/gdaki.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,7 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_put_multi(

if ((level != UCS_DEVICE_LEVEL_THREAD) &&
(level != UCS_DEVICE_LEVEL_WARP)) {
ucs_device_error("unsupported level: %s", ucs_device_level_name(level));
return UCS_ERR_UNSUPPORTED;
}

Expand Down Expand Up @@ -437,6 +438,7 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_put_multi_partial(

if ((level != UCS_DEVICE_LEVEL_THREAD) &&
(level != UCS_DEVICE_LEVEL_WARP)) {
ucs_device_error("unsupported level: %s", ucs_device_level_name(level));
return UCS_ERR_UNSUPPORTED;
}

Expand Down
Loading