Skip to content

Commit ed6648b

Browse files
committed
UCP: Allow eager inline sends for host memory when CUDA MDs are present
When CUDA memory domains are loaded, the memory type cache becomes non-empty after any GPU allocation. Previously, ucp_proto_is_inline() conservatively disabled inline (am_short) sends for all buffers whenever the cache was non-empty, unless the user explicitly set the memory type to HOST. This caused a performance regression for host memory buffers on systems with CUDA/ROCm installed.

Fix this by performing a memtype cache lookup when the cache is non-empty, to positively identify whether the buffer is host memory. If the address is not found in the cache, the buffer is host memory and an inline send is safe to use.

Fixes #4275
1 parent 6dda7bd commit ed6648b

File tree

3 files changed

+30
-9
lines changed

3 files changed

+30
-9
lines changed

src/ucp/core/ucp_am.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -915,7 +915,7 @@ ucp_am_try_send_short(ucp_ep_h ep, uint16_t id, uint32_t flags,
915915
}
916916

917917
if (ucp_proto_is_inline(ep, max_eager_short,
918-
header_length + length, param)) {
918+
header_length + length, buffer, param)) {
919919
return ucp_am_send_short(ep, id, flags, header, header_length, buffer,
920920
length, flags & UCP_AM_SEND_FLAG_REPLY);
921921
}

src/ucp/proto/proto_am.inl

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -649,13 +649,34 @@ ucp_proto_get_short_max(const ucp_request_t *req,
649649

650650
static UCS_F_ALWAYS_INLINE int
651651
ucp_proto_is_inline(ucp_ep_h ep, const ucp_memtype_thresh_t *max_eager_short,
652-
ssize_t length, const ucp_request_param_t *param)
652+
ssize_t length, const void *buffer,
653+
const ucp_request_param_t *param)
653654
{
654-
return (ucs_likely(length <= max_eager_short->memtype_off) ||
655-
((length <= max_eager_short->memtype_on) &&
656-
(ucs_memtype_cache_is_empty() ||
657-
((param->op_attr_mask & UCP_OP_ATTR_FIELD_MEMORY_TYPE) &&
658-
(param->memory_type == UCS_MEMORY_TYPE_HOST)))));
655+
ucs_memory_info_t mem_info;
656+
ucs_status_t status;
657+
658+
if (ucs_likely(length <= max_eager_short->memtype_off)) {
659+
return 1;
660+
}
661+
662+
if (length > max_eager_short->memtype_on) {
663+
return 0;
664+
}
665+
666+
if (ucs_memtype_cache_is_empty()) {
667+
return 1;
668+
}
669+
670+
if ((param->op_attr_mask & UCP_OP_ATTR_FIELD_MEMORY_TYPE) &&
671+
(param->memory_type == UCS_MEMORY_TYPE_HOST)) {
672+
return 1;
673+
}
674+
675+
/* Look up the buffer in the memory type cache to determine if it is host
676+
* memory. If the address is not found in the cache, it is host memory. */
677+
status = ucs_memtype_cache_lookup(buffer, length, &mem_info);
678+
return (status == UCS_ERR_NO_ELEM) ||
679+
((status == UCS_OK) && (mem_info.type == UCS_MEMORY_TYPE_HOST));
659680
}
660681

661682
static UCS_F_ALWAYS_INLINE ucp_request_t*

src/ucp/tag/tag_send.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -154,14 +154,14 @@ ucp_tag_send_inline(ucp_ep_h ep, const void *buffer, size_t length,
154154
ucs_status_t status;
155155

156156
if (ucp_proto_is_inline(ep, &ucp_ep_config(ep)->tag.max_eager_short,
157-
length, param)) {
157+
length, buffer, param)) {
158158
UCS_STATIC_ASSERT(sizeof(ucp_tag_t) == sizeof(ucp_eager_hdr_t));
159159
UCS_STATIC_ASSERT(sizeof(ucp_tag_t) == sizeof(uint64_t));
160160
status = uct_ep_am_short(ucp_ep_get_am_uct_ep(ep), UCP_AM_ID_EAGER_ONLY,
161161
tag, buffer, length);
162162
} else if (ucp_proto_is_inline(ep,
163163
&ucp_ep_config(ep)->tag.offload.max_eager_short,
164-
length, param)) {
164+
length, buffer, param)) {
165165
UCS_STATIC_ASSERT(sizeof(ucp_tag_t) == sizeof(uct_tag_t));
166166
status = uct_ep_tag_eager_short(ucp_ep_get_tag_uct_ep(ep), tag, buffer,
167167
length);

0 commit comments

Comments
 (0)