Skip to content
This repository was archived by the owner on Sep 30, 2022. It is now read-only.

Commit 3e6bc08

Browse files
committed
opal/progress: make progress function registration mt safe
This commit fixes a bug in opal progress registration that can cause crashes when a progress function is registered while another thread is in opal_progress(). Before this commit realloc is used to allocate more space for progress functions but it is possible for a thread in opal_progress() to try to read from the array that is freed by realloc before the array is re-assigned when realloc returns. To prevent this race use malloc + memcpy to fill the new array and atomically swap out the old and new array pointers. Per suggestion we now allocate a default of 8 slots for callbacks and double the current number when we run out of space. This commit also fixes leaking the callbacks_lp array. Signed-off-by: Nathan Hjelm <[email protected]> (cherry picked from open-mpi/ompi@2fad3b9) Signed-off-by: Nathan Hjelm <[email protected]>
1 parent 84ea988 commit 3e6bc08

File tree

1 file changed

+112
-75
lines changed

1 file changed

+112
-75
lines changed

opal/runtime/opal_progress.c

Lines changed: 112 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -55,11 +55,11 @@ int opal_progress_spin_count = 10000;
5555
static opal_atomic_lock_t progress_lock;
5656

5757
/* callbacks to progress */
58-
static opal_progress_callback_t *callbacks = NULL;
58+
static volatile opal_progress_callback_t *callbacks = NULL;
5959
static size_t callbacks_len = 0;
6060
static size_t callbacks_size = 0;
6161

62-
static opal_progress_callback_t *callbacks_lp = NULL;
62+
static volatile opal_progress_callback_t *callbacks_lp = NULL;
6363
static size_t callbacks_lp_len = 0;
6464
static size_t callbacks_lp_size = 0;
6565
static uint64_t callbacks_lp_mask = 0x7;
@@ -94,6 +94,9 @@ static int debug_output = -1;
9494
*/
9595
static int fake_cb(void) { return 0; }
9696

97+
static int _opal_progress_unregister (opal_progress_callback_t cb, opal_progress_callback_t *callback_array,
98+
size_t *callback_array_len);
99+
97100
/* init the progress engine - called from orte_init */
98101
int
99102
opal_progress_init(void)
@@ -110,8 +113,30 @@ opal_progress_init(void)
110113
}
111114
#endif
112115

116+
113117
callbacks_lp_mask = opal_progress_lp_call_ratio - 1;
114118

119+
callbacks_size = callbacks_lp_size = 8;
120+
121+
callbacks = malloc (callbacks_size * sizeof (callbacks[0]));
122+
callbacks_lp = malloc (callbacks_lp_size * sizeof (callbacks_lp[0]));
123+
124+
if (NULL == callbacks || NULL == callbacks_lp) {
125+
free (callbacks);
126+
free (callbacks_lp);
127+
callbacks_size = callbacks_lp_size = 0;
128+
callbacks = callbacks_lp = NULL;
129+
return OPAL_ERR_OUT_OF_RESOURCE;
130+
}
131+
132+
for (size_t i = 0 ; i < callbacks_size ; ++i) {
133+
callbacks[i] = fake_cb;
134+
}
135+
136+
for (size_t i = 0 ; i < callbacks_lp_size ; ++i) {
137+
callbacks_lp[i] = fake_cb;
138+
}
139+
115140
OPAL_OUTPUT((debug_output, "progress: initialized event flag to: %x",
116141
opal_progress_event_flag));
117142
OPAL_OUTPUT((debug_output, "progress: initialized yield_when_idle to: %s",
@@ -133,10 +158,13 @@ opal_progress_finalize(void)
133158

134159
callbacks_len = 0;
135160
callbacks_size = 0;
136-
if (NULL != callbacks) {
137-
free(callbacks);
138-
callbacks = NULL;
139-
}
161+
free(callbacks);
162+
callbacks = NULL;
163+
164+
callbacks_lp_len = 0;
165+
callbacks_lp_size = 0;
166+
free(callbacks_lp);
167+
callbacks_lp = NULL;
140168

141169
opal_atomic_unlock(&progress_lock);
142170

@@ -325,38 +353,73 @@ opal_progress_set_event_poll_rate(int polltime)
325353
#endif
326354
}
327355

356+
static int opal_progress_find_cb (opal_progress_callback_t cb, opal_progress_callback_t *cbs,
357+
size_t cbs_len)
358+
{
359+
for (size_t i = 0 ; i < cbs_len ; ++i) {
360+
if (cbs[i] == cb) {
361+
return (int) i;
362+
}
363+
}
364+
365+
return OPAL_ERR_NOT_FOUND;
366+
}
328367

329-
int
330-
opal_progress_register(opal_progress_callback_t cb)
368+
static int _opal_progress_register (opal_progress_callback_t cb, opal_progress_callback_t **cbs,
369+
size_t *cbs_size, size_t *cbs_len)
331370
{
332371
int ret = OPAL_SUCCESS;
333-
size_t index;
334-
335-
/* just in case there is a low-priority callback remove it */
336-
(void) opal_progress_unregister (cb);
337372

338-
opal_atomic_lock(&progress_lock);
373+
if (OPAL_ERR_NOT_FOUND != opal_progress_find_cb (cb, *cbs, *cbs_len)) {
374+
return OPAL_SUCCESS;
375+
}
339376

340377
/* see if we need to allocate more space */
341-
if (callbacks_len + 1 > callbacks_size) {
342-
opal_progress_callback_t *tmp;
343-
tmp = (opal_progress_callback_t*)realloc(callbacks, sizeof(opal_progress_callback_t) * (callbacks_size + 4));
378+
if (*cbs_len + 1 > *cbs_size) {
379+
opal_progress_callback_t *tmp, *old;
380+
381+
tmp = (opal_progress_callback_t *) malloc (sizeof (tmp[0]) * 2 * *cbs_size);
344382
if (tmp == NULL) {
345-
ret = OPAL_ERR_TEMP_OUT_OF_RESOURCE;
346-
goto cleanup;
383+
return OPAL_ERR_TEMP_OUT_OF_RESOURCE;
384+
}
385+
386+
if (*cbs) {
387+
/* copy old callbacks */
388+
memcpy (tmp, *cbs, sizeof(tmp[0]) * *cbs_size);
347389
}
348-
/* registering fake callbacks to fill callbacks[] */
349-
for( index = callbacks_len + 1 ; index < callbacks_size + 4 ; index++) {
350-
tmp[index] = &fake_cb;
390+
391+
for (size_t i = *cbs_len ; i < 2 * *cbs_size ; ++i) {
392+
tmp[i] = fake_cb;
351393
}
352394

353-
callbacks = tmp;
354-
callbacks_size += 4;
395+
opal_atomic_wmb ();
396+
397+
/* swap out callback array */
398+
old = opal_atomic_swap_ptr (cbs, tmp);
399+
400+
opal_atomic_wmb ();
401+
402+
free (old);
403+
*cbs_size *= 2;
355404
}
356405

357-
callbacks[callbacks_len++] = cb;
406+
cbs[0][*cbs_len] = cb;
407+
++*cbs_len;
408+
409+
opal_atomic_wmb ();
358410

359-
cleanup:
411+
return ret;
412+
}
413+
414+
int opal_progress_register (opal_progress_callback_t cb)
415+
{
416+
int ret;
417+
418+
opal_atomic_lock(&progress_lock);
419+
420+
(void) _opal_progress_unregister (cb, callbacks_lp, &callbacks_lp_len);
421+
422+
ret = _opal_progress_register (cb, &callbacks, &callbacks_size, &callbacks_len);
360423

361424
opal_atomic_unlock(&progress_lock);
362425

@@ -365,84 +428,58 @@ opal_progress_register(opal_progress_callback_t cb)
365428

366429
int opal_progress_register_lp (opal_progress_callback_t cb)
367430
{
368-
int ret = OPAL_SUCCESS;
369-
size_t index;
370-
371-
/* just in case there is a high-priority callback remove it */
372-
(void) opal_progress_unregister (cb);
431+
int ret;
373432

374433
opal_atomic_lock(&progress_lock);
375434

376-
/* see if we need to allocate more space */
377-
if (callbacks_lp_len + 1 > callbacks_lp_size) {
378-
opal_progress_callback_t *tmp;
379-
tmp = (opal_progress_callback_t*)realloc(callbacks_lp, sizeof(opal_progress_callback_t) * (callbacks_lp_size + 4));
380-
if (tmp == NULL) {
381-
ret = OPAL_ERR_TEMP_OUT_OF_RESOURCE;
382-
goto cleanup;
383-
}
384-
/* registering fake callbacks_lp to fill callbacks_lp[] */
385-
for( index = callbacks_lp_len + 1 ; index < callbacks_lp_size + 4 ; index++) {
386-
tmp[index] = &fake_cb;
387-
}
435+
(void) _opal_progress_unregister (cb, callbacks, &callbacks_len);
388436

389-
callbacks_lp = tmp;
390-
callbacks_lp_size += 4;
391-
}
392-
393-
callbacks_lp[callbacks_lp_len++] = cb;
394-
395-
cleanup:
437+
ret = _opal_progress_register (cb, &callbacks_lp, &callbacks_lp_size, &callbacks_lp_len);
396438

397439
opal_atomic_unlock(&progress_lock);
398440

399441
return ret;
400442
}
401443

402444
static int _opal_progress_unregister (opal_progress_callback_t cb, opal_progress_callback_t *callback_array,
403-
size_t callback_array_len)
445+
size_t *callback_array_len)
404446
{
405-
size_t i;
406-
int ret = OPAL_ERR_NOT_FOUND;
407-
408-
opal_atomic_lock(&progress_lock);
409-
410-
for (i = 0 ; i < callback_array_len ; ++i) {
411-
if (cb == callback_array[i]) {
412-
callback_array[i] = &fake_cb;
413-
ret = OPAL_SUCCESS;
414-
break;
415-
}
447+
int ret = opal_progress_find_cb (cb, callback_array, *callback_array_len);
448+
if (OPAL_ERR_NOT_FOUND == ret) {
449+
return ret;
416450
}
417451

418452
/* If we found the function we're unregistering: If callbacks_len
419453
is 0, we're not goig to do anything interesting anyway, so
420454
skip. If callbacks_len is 1, it will soon be 0, so no need to
421-
do any repacking. size_t can be unsigned, so 0 - 1 is bad for
422-
a loop condition :). */
423-
if (OPAL_SUCCESS == ret) {
424-
if (i < callback_array_len - 1) {
425-
memmove (callback_array + i, callback_array + i + 1,
426-
(callback_array_len - i - 1) * sizeof (callback_array[0]));
427-
}
428-
429-
callback_array[callback_array_len - 1] = &fake_cb;
430-
callback_array_len--;
455+
do any repacking. */
456+
for (size_t i = (size_t) ret ; i < *callback_array_len - 1 ; ++i) {
457+
/* copy callbacks atomically since another thread may be in
458+
* opal_progress(). */
459+
(void) opal_atomic_swap_ptr (callback_array + i, callback_array[i+1]);
431460
}
432461

433-
opal_atomic_unlock(&progress_lock);
462+
callback_array[*callback_array_len] = fake_cb;
463+
--*callback_array_len;
434464

435-
return ret;
465+
return OPAL_SUCCESS;
436466
}
437467

438468
int opal_progress_unregister (opal_progress_callback_t cb)
439469
{
440-
int ret = _opal_progress_unregister (cb, callbacks, callbacks_len);
470+
int ret;
471+
472+
opal_atomic_lock(&progress_lock);
473+
474+
ret = _opal_progress_unregister (cb, callbacks, &callbacks_len);
475+
441476
if (OPAL_SUCCESS != ret) {
442477
/* if not in the high-priority array try to remove from the lp array.
443478
* a callback will never be in both. */
444-
return _opal_progress_unregister (cb, callbacks_lp, callbacks_lp_len);
479+
ret = _opal_progress_unregister (cb, callbacks_lp, &callbacks_lp_len);
445480
}
446481

482+
opal_atomic_unlock(&progress_lock);
483+
447484
return ret;
448485
}

0 commit comments

Comments
 (0)