Skip to content

Commit 96717ed

Browse files
author
Sergey Oblomov
committed
PML/UCX: fixed hang on MPI_Finalize
- Use a non-blocking fence and keep progressing the UCX worker until the fence completes.
Signed-off-by: Sergey Oblomov <[email protected]>
(cherry picked from commit 10f2d83)
1 parent 0316e7b commit 96717ed

File tree

1 file changed

+10
-10
lines changed

1 file changed

+10
-10
lines changed

ompi/mca/pml/ucx/pml_ucx.c

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -387,14 +387,19 @@ static void mca_pml_ucx_waitall(void **reqs, size_t *count_p)
387387
*count_p = 0;
388388
}
389389

390+
/*
 * Completion callback handed to opal_pmix.fence_nb() by
 * mca_pml_ucx_del_procs().
 *
 * status - completion status reported for the fence; deliberately not
 *          inspected here, the caller only waits for completion.
 * fenced - address of the caller's int completion flag; it is set to 1.
 *          A NULL pointer is tolerated and ignored.
 *
 * NOTE(review): the caller polls this flag in a loop while progressing
 * the UCX worker, so the store is done through a volatile-qualified
 * pointer. Confirm whether the flag in the caller should be declared
 * volatile as well.
 */
static void mca_pml_fence_complete_cb(int status, void *fenced)
{
    (void)status;

    if (NULL == fenced) {
        return;
    }

    *(volatile int *)fenced = 1;
}
394+
390395
int mca_pml_ucx_del_procs(struct ompi_proc_t **procs, size_t nprocs)
391396
{
397+
int fenced = 0;
392398
ompi_proc_t *proc;
393399
size_t num_reqs, max_reqs;
394400
void *dreq, **dreqs;
395401
ucp_ep_h ep;
396402
size_t i;
397-
ucs_status_t ret;
398403

399404
max_reqs = ompi_pml_ucx.num_disconnect;
400405
if (max_reqs > nprocs) {
@@ -436,16 +441,11 @@ int mca_pml_ucx_del_procs(struct ompi_proc_t **procs, size_t nprocs)
436441

437442
mca_pml_ucx_waitall(dreqs, &num_reqs);
438443
free(dreqs);
439-
/* flush worker to allow all pending operations to complete.
440-
* ignore error (we can do nothing here), just try to
441-
* finalize gracefully */
442-
ret = ucp_worker_flush(ompi_pml_ucx.ucp_worker);
443-
if (UCS_OK != ret) {
444-
PML_UCX_ERROR("ucp_worker_flush failed: %s",
445-
ucs_status_string(ret));
446-
}
447444

448-
opal_pmix.fence(NULL, 0);
445+
opal_pmix.fence_nb(NULL, 0, mca_pml_fence_complete_cb, &fenced);
446+
while (!fenced) {
447+
ucp_worker_progress(ompi_pml_ucx.ucp_worker);
448+
}
449449

450450
return OMPI_SUCCESS;
451451
}

0 commit comments

Comments
 (0)