Skip to content

Commit 3e289b1

Browse files
committed
btl/uct: correctly set the completion status before using uct completions
According to UCT documentation the status field must be preset to UCS_OK before the completion is used. My assumption here is that the field is not filled in in all cases leaving the value as potentially garbage. This CL addresses the issue but setting the status in the constructor for both RDMA completions and frags. The status is then reset on completion callback. Signed-off-by: Nathan Hjelm <[email protected]>
1 parent 119735d commit 3e289b1

File tree

2 files changed

+23
-7
lines changed

2 files changed

+23
-7
lines changed

opal/mca/btl/uct/btl_uct_frag.c

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
/*
33
* Copyright (c) 2018 Los Alamos National Security, LLC. All rights
44
* reserved.
5+
* Copyright (c) 2025 Google, LLC. All rights reserved.
56
* $COPYRIGHT$
67
*
78
* Additional copyrights may follow
@@ -26,7 +27,13 @@ static void mca_btl_uct_frag_completion_compat(uct_completion_t *uct_comp, ucs_s
2627

2728
#if UCT_API >= ((1L<<UCT_MAJOR_BIT)|(10L << UCT_MINOR_BIT))
2829
static void mca_btl_uct_frag_completion(uct_completion_t *uct_comp) {
29-
mca_btl_uct_frag_completion_compat(uct_comp, uct_comp->status);
30+
ucs_status_t status = uct_comp->status;
31+
/* reset the status now as the completion can not be accessed after calling
32+
* mca_btl (may get returned) */
33+
uct_comp->status = UCS_OK;
34+
35+
mca_btl_uct_frag_completion_compat(uct_comp, status);
36+
/* do not access the completion struture here */
3037
}
3138
#else
3239
static void mca_btl_uct_frag_completion(uct_completion_t *uct_comp, ucs_status_t status) {
@@ -42,14 +49,11 @@ static void mca_btl_uct_base_frag_constructor(mca_btl_uct_base_frag_t *frag)
4249
memset((char *) frag + sizeof(frag->base), 0, sizeof(*frag) - sizeof(frag->base));
4350

4451
OBJ_CONSTRUCT(&frag->comp, mca_btl_uct_uct_completion_t);
52+
frag->comp.frag = frag;
4553

4654
frag->base.des_segments = frag->segments;
4755
frag->base.des_segment_count = 1;
4856

49-
frag->comp.uct_comp.func = mca_btl_uct_frag_completion;
50-
frag->comp.uct_comp.count = 1;
51-
frag->comp.frag = frag;
52-
5357
frag->segments[0].seg_addr.pval = frag->base.super.ptr;
5458
frag->uct_iov.buffer = frag->base.super.ptr;
5559
frag->uct_iov.stride = 0;

opal/mca/btl/uct/btl_uct_rdma.c

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
/*
33
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
44
* reserved.
5+
* Copyright (c) 2025 Google, LLC. All rights reserved.
56
* $COPYRIGHT$
67
*
78
* Additional copyrights may follow
@@ -18,15 +19,21 @@ static void mca_btl_uct_uct_completion_compat(uct_completion_t *uct_comp, ucs_st
1819
- offsetof(mca_btl_uct_uct_completion_t,
1920
uct_comp));
2021

21-
BTL_VERBOSE(("network operation complete. status = %d", status));
22+
BTL_VERBOSE(("network operation complete. frag = %p, status = %d", (void *) comp->frag, status));
2223

2324
comp->status = status;
2425
opal_fifo_push(&comp->dev_context->completion_fifo, &comp->super.super);
2526
}
2627

2728
#if UCT_API >= ((1L<<UCT_MAJOR_BIT)|(10L << UCT_MINOR_BIT))
2829
static void mca_btl_uct_uct_completion(uct_completion_t *uct_comp) {
29-
mca_btl_uct_uct_completion_compat(uct_comp, uct_comp->status);
30+
ucs_status_t status = uct_comp->status;
31+
/* reset the status now as the completion can not be accessed after calling
32+
* mca_btl_uct_uct_completion_compat (may get returned) */
33+
uct_comp->status = UCS_OK;
34+
35+
mca_btl_uct_uct_completion_compat(uct_comp, status);
36+
/* do not access the completion struture here */
3037
}
3138
#else
3239
static void mca_btl_uct_uct_completion(uct_completion_t *uct_comp, ucs_status_t status) {
@@ -38,6 +45,11 @@ static void mca_btl_uct_uct_completion_construct(mca_btl_uct_uct_completion_t *c
3845
{
3946
comp->frag = NULL;
4047
comp->uct_comp.func = mca_btl_uct_uct_completion;
48+
comp.uct_comp.count = 1;
49+
50+
#if UCT_API >= ((1L<<UCT_MAJOR_BIT)|(10L << UCT_MINOR_BIT))
51+
comp->uct_comp->status = UCS_OK;
52+
#endif
4153
}
4254

4355
OBJ_CLASS_INSTANCE(mca_btl_uct_uct_completion_t, opal_free_list_item_t,

0 commit comments

Comments
 (0)