From d586292e60954db6f5d0a61f21fcefb8e7ae9458 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Thu, 9 Jan 2025 22:29:44 -0700 Subject: [PATCH] btl/uct: correctly set the completion status before using uct completions According to UCT documentation the status field must be preset to UCS_OK before the completion is used. My assumption here is that the field is not filled in in all cases leaving the value as potentially garbage. This CL addresses the issue but setting the status in the constructor for both RDMA completions and frags. The status is then reset on completion callback. Signed-off-by: Nathan Hjelm --- opal/mca/btl/uct/btl_uct_frag.c | 29 ++--------------------------- opal/mca/btl/uct/btl_uct_rdma.c | 16 ++++++++++++++-- 2 files changed, 16 insertions(+), 29 deletions(-) diff --git a/opal/mca/btl/uct/btl_uct_frag.c b/opal/mca/btl/uct/btl_uct_frag.c index 3e165236d05..37eee126654 100644 --- a/opal/mca/btl/uct/btl_uct_frag.c +++ b/opal/mca/btl/uct/btl_uct_frag.c @@ -2,6 +2,7 @@ /* * Copyright (c) 2018 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2025 Google, LLC. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -11,29 +12,6 @@ #include "btl_uct_frag.h" -static void mca_btl_uct_frag_completion_compat(uct_completion_t *uct_comp, ucs_status_t status) -{ - mca_btl_uct_uct_completion_t *comp = (mca_btl_uct_uct_completion_t - *) ((uintptr_t) uct_comp - - offsetof(mca_btl_uct_uct_completion_t, - uct_comp)); - - BTL_VERBOSE(("frag operation complete. frag = %p. status = %d", (void *) comp->frag, status)); - - comp->status = status; - opal_fifo_push(&comp->dev_context->completion_fifo, &comp->super.super); -} - -#if UCT_API >= ((1L<status); -} -#else -static void mca_btl_uct_frag_completion(uct_completion_t *uct_comp, ucs_status_t status) { - mca_btl_uct_frag_completion_compat(uct_comp, status); -} -#endif - static void mca_btl_uct_base_frag_constructor(mca_btl_uct_base_frag_t *frag) { mca_btl_uct_reg_t *reg = (mca_btl_uct_reg_t *) frag->base.super.registration; @@ -42,14 +20,11 @@ static void mca_btl_uct_base_frag_constructor(mca_btl_uct_base_frag_t *frag) memset((char *) frag + sizeof(frag->base), 0, sizeof(*frag) - sizeof(frag->base)); OBJ_CONSTRUCT(&frag->comp, mca_btl_uct_uct_completion_t); + frag->comp.frag = frag; frag->base.des_segments = frag->segments; frag->base.des_segment_count = 1; - frag->comp.uct_comp.func = mca_btl_uct_frag_completion; - frag->comp.uct_comp.count = 1; - frag->comp.frag = frag; - frag->segments[0].seg_addr.pval = frag->base.super.ptr; frag->uct_iov.buffer = frag->base.super.ptr; frag->uct_iov.stride = 0; diff --git a/opal/mca/btl/uct/btl_uct_rdma.c b/opal/mca/btl/uct/btl_uct_rdma.c index b81d0e32540..88ee8106796 100644 --- a/opal/mca/btl/uct/btl_uct_rdma.c +++ b/opal/mca/btl/uct/btl_uct_rdma.c @@ -2,6 +2,7 @@ /* * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2025 Google, LLC. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -18,7 +19,7 @@ static void mca_btl_uct_uct_completion_compat(uct_completion_t *uct_comp, ucs_st - offsetof(mca_btl_uct_uct_completion_t, uct_comp)); - BTL_VERBOSE(("network operation complete. status = %d", status)); + BTL_VERBOSE(("network operation complete. frag = %p, status = %d", (void *) comp->frag, status)); comp->status = status; opal_fifo_push(&comp->dev_context->completion_fifo, &comp->super.super); @@ -26,7 +27,13 @@ static void mca_btl_uct_uct_completion_compat(uct_completion_t *uct_comp, ucs_st #if UCT_API >= ((1L<status); + ucs_status_t status = uct_comp->status; + /* reset the status now as the completion can not be accessed after calling + * mca_btl_uct_uct_completion_compat (may get returned) */ + uct_comp->status = UCS_OK; + + mca_btl_uct_uct_completion_compat(uct_comp, status); + /* do not access the completion struture here */ } #else static void mca_btl_uct_uct_completion(uct_completion_t *uct_comp, ucs_status_t status) { @@ -38,6 +45,11 @@ static void mca_btl_uct_uct_completion_construct(mca_btl_uct_uct_completion_t *c { comp->frag = NULL; comp->uct_comp.func = mca_btl_uct_uct_completion; + comp.uct_comp.count = 1; + +#if UCT_API >= ((1L<uct_comp->status = UCS_OK; +#endif } OBJ_CLASS_INSTANCE(mca_btl_uct_uct_completion_t, opal_free_list_item_t,