Skip to content

Commit fc6ff41

Browse files
authored
Merge pull request #5315 from hoopoepg/topic/async-progress-on-mpi-fin-v3.0
v3.0: PML/UCX: fixed hang on MPI_Finalize
2 parents ed6e6b3 + 0399f77 commit fc6ff41

File tree

1 file changed

+13
-11
lines changed

1 file changed

+13
-11
lines changed

ompi/mca/pml/ucx/pml_ucx.c

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
/*
2-
* Copyright (C) Mellanox Technologies Ltd. 2001-2011. ALL RIGHTS RESERVED.
2+
* Copyright (C) 2001-2011 Mellanox Technologies Ltd. ALL RIGHTS RESERVED.
33
* Copyright (c) 2016 The University of Tennessee and The University
44
* of Tennessee Research Foundation. All rights
55
* reserved.
6+
* Copyright (c) 2018 Research Organization for Information Science
7+
* and Technology (RIST). All rights reserved.
68
* $COPYRIGHT$
79
*
810
* Additional copyrights may follow
@@ -381,14 +383,19 @@ static void mca_pml_ucx_waitall(void **reqs, size_t *count_p)
381383
*count_p = 0;
382384
}
383385

386+
/**
 * Completion callback handed to opal_pmix.fence_nb() by
 * mca_pml_ucx_del_procs().
 *
 * @param status  Completion status passed in by the fence (not examined).
 * @param fenced  Address of the caller's completion flag; set to 1 here.
 *
 * The caller declares the flag as 'volatile int' and polls it while
 * calling ucp_worker_progress(), so the store must go through a
 * volatile-qualified lvalue: modifying an object defined volatile through
 * a plain 'int *' is undefined behavior in C.
 */
static void mca_pml_fence_complete_cb(int status, void *fenced)
{
    (void)status; /* unused; the callback only signals completion */
    *(volatile int *)fenced = 1;
}
390+
384391
int mca_pml_ucx_del_procs(struct ompi_proc_t **procs, size_t nprocs)
385392
{
393+
volatile int fenced = 0;
386394
ompi_proc_t *proc;
387395
size_t num_reqs, max_reqs;
388396
void *dreq, **dreqs;
389397
ucp_ep_h ep;
390398
size_t i;
391-
ucs_status_t ret;
392399

393400
max_reqs = ompi_pml_ucx.num_disconnect;
394401
if (max_reqs > nprocs) {
@@ -430,16 +437,11 @@ int mca_pml_ucx_del_procs(struct ompi_proc_t **procs, size_t nprocs)
430437

431438
mca_pml_ucx_waitall(dreqs, &num_reqs);
432439
free(dreqs);
433-
/* flush worker to allow all pending operations to complete.
434-
* ignore error (we can do nothing here), just try to
435-
* finalize gracefully */
436-
ret = ucp_worker_flush(ompi_pml_ucx.ucp_worker);
437-
if (UCS_OK != ret) {
438-
PML_UCX_ERROR("ucp_worker_flush failed: %s",
439-
ucs_status_string(ret));
440-
}
441440

442-
opal_pmix.fence(NULL, 0);
441+
opal_pmix.fence_nb(NULL, 0, mca_pml_fence_complete_cb, &fenced);
442+
while (!fenced) {
443+
ucp_worker_progress(ompi_pml_ucx.ucp_worker);
444+
}
443445

444446
return OMPI_SUCCESS;
445447
}

0 commit comments

Comments
 (0)