Skip to content

Commit 61f34e6

Browse files
committed
optionally passive wait when the progress loop is idle for a while
Add the new mpi_sleep_when_idle_threshold MCA parameter. This is only relevant when mpi_yield_when_idle is set. A value of -1 (the default) means never sleep when idle; a value of 0 means always sleep when idle; a positive value n means opal_progress() will sleep after being invoked n times in a row with no event available. The default is not to sleep when idle. Note the sleep functionality is implemented as poll(NULL, 0, 1). Thanks to Paul Kapinos for bringing this to our attention. Signed-off-by: Gilles Gouaillardet <[email protected]>
1 parent 6d7a780 commit 61f34e6

File tree

4 files changed

+80
-18
lines changed

4 files changed

+80
-18
lines changed

ompi/runtime/ompi_mpi_init.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
* Copyright (c) 2011 Sandia National Laboratories. All rights reserved.
1919
* Copyright (c) 2012-2013 Inria. All rights reserved.
2020
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
21-
* Copyright (c) 2014-2016 Research Organization for Information Science
21+
* Copyright (c) 2014-2017 Research Organization for Information Science
2222
* and Technology (RIST). All rights reserved.
2323
* Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
2424
*
@@ -284,6 +284,7 @@ opal_list_t ompi_registered_datareps = {{0}};
284284

285285
bool ompi_enable_timing = false;
286286
extern bool ompi_mpi_yield_when_idle;
287+
extern int ompi_mpi_sleep_when_idle_threshold; /* signed: defined as int in ompi_mpi_params.c, where -1 means never sleep */
287288
extern int ompi_mpi_event_tick_rate;
288289

289290
/**
@@ -971,6 +972,9 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
971972
/* see if yield_when_idle was specified - if so, use it */
972973
opal_progress_set_yield_when_idle(ompi_mpi_yield_when_idle);
973974

975+
/* set the threshold to start sleeping when idle */
976+
opal_progress_set_sleep_when_idle_threshold(ompi_mpi_sleep_when_idle_threshold);
977+
974978
/* negative value means use default - just don't do anything */
975979
if (ompi_mpi_event_tick_rate >= 0) {
976980
opal_progress_set_event_poll_rate(ompi_mpi_event_tick_rate);

ompi/runtime/ompi_mpi_params.c

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
1818
* Copyright (c) 2015 Mellanox Technologies, Inc.
1919
* All rights reserved.
20-
* Copyright (c) 2016 Research Organization for Information Science
20+
* Copyright (c) 2016-2017 Research Organization for Information Science
2121
* and Technology (RIST). All rights reserved.
2222
* $COPYRIGHT$
2323
*
@@ -63,6 +63,7 @@ bool ompi_have_sparse_group_storage = OPAL_INT_TO_BOOL(OMPI_GROUP_SPARSE);
6363
bool ompi_use_sparse_group_storage = OPAL_INT_TO_BOOL(OMPI_GROUP_SPARSE);
6464

6565
bool ompi_mpi_yield_when_idle = false;
66+
int ompi_mpi_sleep_when_idle_threshold = -1;
6667
int ompi_mpi_event_tick_rate = -1;
6768
char *ompi_mpi_show_mca_params_string = NULL;
6869
bool ompi_mpi_have_sparse_group_storage = !!(OMPI_GROUP_SPARSE);
@@ -109,12 +110,20 @@ int ompi_mpi_register_params(void)
109110
exactly/under-subscribed, or 1 when oversubscribed */
110111
ompi_mpi_yield_when_idle = false;
111112
(void) mca_base_var_register("ompi", "mpi", NULL, "yield_when_idle",
112-
"Yield the processor when waiting for MPI communication (for MPI processes, will default to 1 when oversubscribing nodes)",
113+
"Yield the processor when waiting for communication (for MPI processes, will default to 1 when oversubscribing nodes)",
113114
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
114-
OPAL_INFO_LVL_9,
115+
OPAL_INFO_LVL_6,
115116
MCA_BASE_VAR_SCOPE_READONLY,
116117
&ompi_mpi_yield_when_idle);
117118

119+
ompi_mpi_sleep_when_idle_threshold = -1;
120+
(void) mca_base_var_register("ompi", "mpi", NULL, "sleep_when_idle_threshold",
121+
"Sleep after waiting for communication too long",
122+
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
123+
OPAL_INFO_LVL_6,
124+
MCA_BASE_VAR_SCOPE_READONLY,
125+
&ompi_mpi_sleep_when_idle_threshold);
126+
118127
ompi_mpi_event_tick_rate = -1;
119128
(void) mca_base_var_register("ompi", "mpi", NULL, "event_tick_rate",
120129
"How often to progress TCP communications (0 = never, otherwise specified in microseconds)",

opal/runtime/opal_progress.c

Lines changed: 40 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
* All rights reserved.
1313
* Copyright (c) 2006-2016 Los Alamos National Security, LLC. All rights
1414
* reserved.
15-
* Copyright (c) 2015-2016 Research Organization for Information Science
15+
* Copyright (c) 2015-2017 Research Organization for Information Science
1616
* and Technology (RIST). All rights reserved.
1717
*
1818
* $COPYRIGHT$
@@ -27,6 +27,11 @@
2727
#ifdef HAVE_SCHED_H
2828
#include <sched.h>
2929
#endif
30+
#if HAVE_POLL_H
31+
#include <poll.h>
32+
#elif HAVE_SYS_POLL_H
33+
#include <sys/poll.h>
34+
#endif
3035

3136
#include "opal/runtime/opal_progress.h"
3237
#include "opal/mca/event/event.h"
@@ -66,6 +71,9 @@ static size_t callbacks_lp_size = 0;
6671

6772
/* do we want to call sched_yield() if nothing happened */
6873
bool opal_progress_yield_when_idle = false;
74+
/* do we want to sleep if nothing happened for a while */
75+
int opal_progress_sleep_when_idle_threshold = -1;
76+
static int yield_count = 0;
6977

7078
#if OPAL_PROGRESS_USE_TIMERS
7179
static opal_timer_t event_progress_last_time = 0;
@@ -229,16 +237,25 @@ opal_progress(void)
229237
}
230238
}
231239

240+
if (OPAL_UNLIKELY(opal_progress_yield_when_idle)) {
241+
if (events <= 0) {
242+
if (opal_progress_sleep_when_idle_threshold < 0 || yield_count < opal_progress_sleep_when_idle_threshold) {
243+
yield_count++;
232244
#if OPAL_HAVE_SCHED_YIELD
233-
if (opal_progress_yield_when_idle && events <= 0) {
234-
/* If there is nothing to do - yield the processor - otherwise
235-
* we could consume the processor for the entire time slice. If
236-
* the processor is oversubscribed - this will result in a best-case
237-
* latency equivalent to the time-slice.
238-
*/
239-
sched_yield();
240-
}
245+
/* If there is nothing to do - yield the processor - otherwise
246+
* we could consume the processor for the entire time slice. If
247+
* the processor is oversubscribed - this will result in a best-case
248+
* latency equivalent to the time-slice.
249+
*/
250+
sched_yield();
241251
#endif /* defined(HAVE_SCHED_YIELD) */
252+
} else {
253+
poll(NULL, 0, 1);
254+
}
255+
} else {
256+
yield_count = 0;
257+
}
258+
}
242259
}
243260

244261

@@ -310,6 +327,20 @@ opal_progress_set_yield_when_idle(bool yieldopt)
310327
}
311328

312329

330+
int
331+
opal_progress_set_sleep_when_idle_threshold(int thresholdopt)
332+
{
333+
int tmp = opal_progress_sleep_when_idle_threshold;
334+
335+
opal_progress_sleep_when_idle_threshold = thresholdopt;
336+
337+
OPAL_OUTPUT((debug_output, "progress: progress_set_sleep_threshold to %d",
338+
thresholdopt));
339+
340+
return tmp;
341+
}
342+
343+
313344
void
314345
opal_progress_set_event_poll_rate(int polltime)
315346
{

opal/runtime/opal_progress.h

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
* All rights reserved.
1212
* Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights
1313
* reserved.
14+
* Copyright (c) 2017 Research Organization for Information Science
15+
* and Technology (RIST). All rights reserved.
1416
*
1517
* $COPYRIGHT$
1618
*
@@ -113,19 +115,35 @@ OPAL_DECLSPEC void opal_progress_event_users_decrement(void);
113115
/**
114116
* Set whether opal_progress() should yield when idle
115117
*
116-
* Set whether opal_progress() should yield the processor (either by
117-
* sched_yield() or SwitchToThread()) if no events were progressed
118-
* during the progress loop. The return value of the callback
119-
* functions is used to determine whether or not yielding is required.
118+
* Set whether opal_progress() should yield the processor by
119+
* sched_yield() if no events were progressed during the progress loop.
120+
* The return value of the callback functions is used to determine
121+
* whether or not yielding is required.
120122
* By default, the event loop will yield when the progress function is
121-
* idle.
123+
* idle only when the node is oversubscribed.
122124
*
123125
* @param yieldopt Whether to yield when idle.
124126
* @return Previous value of the yield_when_idle option.
125127
*/
126128
OPAL_DECLSPEC bool opal_progress_set_yield_when_idle(bool yieldopt);
127129

128130

131+
/**
132+
* Set whether opal_progress() should sleep when idle for a while
133+
*
134+
* Set whether opal_progress() should sleep if no events were progressed
135+
* during the progress loop for some time. The return value of the callback
136+
* functions is used to determine whether or not sleeping is required.
137+
* By default, the event loop will not sleep when the progress function is
138+
* idle.
139+
* -1 means no sleeping and 0 means always sleep.
140+
*
141+
* @param thresholdopt The threshold to start sleeping.
142+
* @return Previous value of the sleep_threshold option.
143+
*/
144+
OPAL_DECLSPEC int opal_progress_set_sleep_when_idle_threshold(int thresholdopt);
145+
146+
129147
/**
130148
* Set time between calls into the event library
131149
*

0 commit comments

Comments
 (0)