Skip to content

Commit 16e236d

Browse files
authored
Merge pull request #6688 from yosefe/topic/osc-ucx-fix-ud-self-deadlock-v4.0.x
OSC/UCX: Fix deadlock with atomic lock - v4.0
2 parents c22326e + 4f9fb3e commit 16e236d

File tree

3 files changed: 14 additions and 5 deletions

3 files changed: 14 additions and 5 deletions

ompi/mca/osc/ucx/osc_ucx_active_target.c

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -276,6 +276,7 @@ int ompi_osc_ucx_post(struct ompi_group_t *group, int assert, struct ompi_win_t
276276
ompi_osc_ucx_handle_incoming_post(module, &(module->state.post_state[j]), NULL, 0);
277277
}
278278

279+
ucp_worker_progress(mca_osc_ucx_component.ucp_worker);
279280
usleep(100);
280281
} while (1);
281282
}

ompi/mca/osc/ucx/osc_ucx_comm.c

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -281,7 +281,7 @@ static inline int start_atomicity(ompi_osc_ucx_module_t *module, ucp_ep_h ep, in
281281
uint64_t remote_addr = (module->state_info_array)[target].addr + OSC_UCX_STATE_ACC_LOCK_OFFSET;
282282
ucs_status_t status;
283283

284-
while (result_value != TARGET_LOCK_UNLOCKED) {
284+
for (;;) {
285285
status = opal_common_ucx_atomic_cswap(ep, TARGET_LOCK_UNLOCKED, TARGET_LOCK_EXCLUSIVE,
286286
&result_value, sizeof(result_value),
287287
remote_addr, rkey,
@@ -290,9 +290,13 @@ static inline int start_atomicity(ompi_osc_ucx_module_t *module, ucp_ep_h ep, in
290290
OSC_UCX_VERBOSE(1, "ucp_atomic_cswap64 failed: %d", status);
291291
return OMPI_ERROR;
292292
}
293+
if (result_value == TARGET_LOCK_UNLOCKED) {
294+
return OMPI_SUCCESS;
295+
}
296+
297+
ucp_worker_progress(mca_osc_ucx_component.ucp_worker);
293298
}
294299

295-
return OMPI_SUCCESS;
296300
}
297301

298302
static inline int end_atomicity(ompi_osc_ucx_module_t *module, ucp_ep_h ep, int target) {

ompi/mca/osc/ucx/osc_ucx_passive_target.c

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -44,6 +44,7 @@ static inline int start_shared(ompi_osc_ucx_module_t *module, int target) {
4444
} else {
4545
break;
4646
}
47+
ucp_worker_progress(mca_osc_ucx_component.ucp_worker);
4748
}
4849

4950
return OMPI_SUCCESS;
@@ -72,17 +73,20 @@ static inline int start_exclusive(ompi_osc_ucx_module_t *module, int target) {
7273
uint64_t remote_addr = (module->state_info_array)[target].addr + OSC_UCX_STATE_LOCK_OFFSET;
7374
ucs_status_t status;
7475

75-
while (result_value != TARGET_LOCK_UNLOCKED) {
76+
for (;;) {
7677
status = opal_common_ucx_atomic_cswap(ep, TARGET_LOCK_UNLOCKED, TARGET_LOCK_EXCLUSIVE,
7778
&result_value, sizeof(result_value),
7879
remote_addr, rkey,
7980
mca_osc_ucx_component.ucp_worker);
8081
if (status != UCS_OK) {
8182
return OMPI_ERROR;
8283
}
83-
}
84+
if (result_value == TARGET_LOCK_UNLOCKED) {
85+
return OMPI_SUCCESS;
86+
}
8487

85-
return OMPI_SUCCESS;
88+
ucp_worker_progress(mca_osc_ucx_component.ucp_worker);
89+
}
8690
}
8791

8892
static inline int end_exclusive(ompi_osc_ucx_module_t *module, int target) {

0 commit comments

Comments
 (0)