Skip to content

Commit e7d60dc

Browse files
author
Sergey Oblomov
committed
PML/UCX: fixed hang on MPI_Finalize
- use non-blocking fence to progress UCX

Signed-off-by: Sergey Oblomov <[email protected]>
(cherry picked from commit 96717ed)
1 parent 0d90056 commit e7d60dc

File tree

1 file changed

+10
-10
lines changed

1 file changed

+10
-10
lines changed

ompi/mca/pml/ucx/pml_ucx.c

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -381,14 +381,19 @@ static void mca_pml_ucx_waitall(void **reqs, size_t *count_p)
381381
*count_p = 0;
382382
}
383383

384+
/*
 * Completion callback handed to opal_pmix.fence_nb() by
 * mca_pml_ucx_del_procs().
 *
 * status - unused by this function.
 * fenced - pointer to the caller's int flag; it is set to 1 here so that
 *          the caller's `while (!fenced)` loop, which keeps calling
 *          ucp_worker_progress(), can terminate.
 */
static void mca_pml_fence_complete_cb(int status, void *fenced)
385+
{
386+
*(int*)fenced = 1;
387+
}
388+
384389
int mca_pml_ucx_del_procs(struct ompi_proc_t **procs, size_t nprocs)
385390
{
391+
int fenced = 0;
386392
ompi_proc_t *proc;
387393
size_t num_reqs, max_reqs;
388394
void *dreq, **dreqs;
389395
ucp_ep_h ep;
390396
size_t i;
391-
ucs_status_t ret;
392397

393398
max_reqs = ompi_pml_ucx.num_disconnect;
394399
if (max_reqs > nprocs) {
@@ -430,16 +435,11 @@ int mca_pml_ucx_del_procs(struct ompi_proc_t **procs, size_t nprocs)
430435

431436
mca_pml_ucx_waitall(dreqs, &num_reqs);
432437
free(dreqs);
433-
/* flush worker to allow all pending operations to complete.
434-
* ignore error (we can do nothing here), just try to
435-
* finalize gracefully */
436-
ret = ucp_worker_flush(ompi_pml_ucx.ucp_worker);
437-
if (UCS_OK != ret) {
438-
PML_UCX_ERROR("ucp_worker_flush failed: %s",
439-
ucs_status_string(ret));
440-
}
441438

442-
opal_pmix.fence(NULL, 0);
439+
opal_pmix.fence_nb(NULL, 0, mca_pml_fence_complete_cb, &fenced);
440+
while (!fenced) {
441+
ucp_worker_progress(ompi_pml_ucx.ucp_worker);
442+
}
443443

444444
return OMPI_SUCCESS;
445445
}

0 commit comments

Comments
 (0)