Skip to content

Commit 27b83fd

Browse files
committed
optionally passive wait when the progress loop is idle for a while
Add the new mpi_poll_when_idle_threshold MCA parameter. This is only relevant when mpi_yield_when_idle is set. A value of -1 (the default) means never poll when idle; a value of 0 means always poll when idle; a positive value n means opal_progress() will poll() after being invoked n times in a row with no event available. The default is not to poll when idle. Thanks to Paul Kapinos for bringing this to our attention. Signed-off-by: Gilles Gouaillardet <[email protected]>
1 parent 1c52d9d commit 27b83fd

File tree

4 files changed

+78
-16
lines changed

4 files changed

+78
-16
lines changed

ompi/runtime/ompi_mpi_init.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
* Copyright (c) 2011 Sandia National Laboratories. All rights reserved.
1919
* Copyright (c) 2012-2013 Inria. All rights reserved.
2020
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
21-
* Copyright (c) 2014-2016 Research Organization for Information Science
21+
* Copyright (c) 2014-2017 Research Organization for Information Science
2222
* and Technology (RIST). All rights reserved.
2323
* Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
2424
*
@@ -284,6 +284,7 @@ opal_list_t ompi_registered_datareps = {{0}};
284284

285285
bool ompi_enable_timing = false;
286286
extern bool ompi_mpi_yield_when_idle;
287+
extern uint32_t ompi_mpi_poll_when_idle_threshold;
287288
extern int ompi_mpi_event_tick_rate;
288289

289290
/**
@@ -971,6 +972,9 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
971972
/* see if yield_when_idle was specified - if so, use it */
972973
opal_progress_set_yield_when_idle(ompi_mpi_yield_when_idle);
973974

975+
/* set the threshold to start polling when idle */
976+
opal_progress_set_poll_when_idle_threshold(ompi_mpi_poll_when_idle_threshold);
977+
974978
/* negative value means use default - just don't do anything */
975979
if (ompi_mpi_event_tick_rate >= 0) {
976980
opal_progress_set_event_poll_rate(ompi_mpi_event_tick_rate);

ompi/runtime/ompi_mpi_params.c

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
1818
* Copyright (c) 2015 Mellanox Technologies, Inc.
1919
* All rights reserved.
20-
* Copyright (c) 2016 Research Organization for Information Science
20+
* Copyright (c) 2016-2017 Research Organization for Information Science
2121
* and Technology (RIST). All rights reserved.
2222
* $COPYRIGHT$
2323
*
@@ -63,6 +63,7 @@ bool ompi_have_sparse_group_storage = OPAL_INT_TO_BOOL(OMPI_GROUP_SPARSE);
6363
bool ompi_use_sparse_group_storage = OPAL_INT_TO_BOOL(OMPI_GROUP_SPARSE);
6464

6565
bool ompi_mpi_yield_when_idle = false;
66+
uint32_t ompi_mpi_poll_when_idle_threshold = -1;
6667
int ompi_mpi_event_tick_rate = -1;
6768
char *ompi_mpi_show_mca_params_string = NULL;
6869
bool ompi_mpi_have_sparse_group_storage = !!(OMPI_GROUP_SPARSE);
@@ -115,6 +116,14 @@ int ompi_mpi_register_params(void)
115116
MCA_BASE_VAR_SCOPE_READONLY,
116117
&ompi_mpi_yield_when_idle);
117118

119+
ompi_mpi_poll_when_idle_threshold = -1;
120+
(void) mca_base_var_register("ompi", "mpi", NULL, "poll_when_idle_threshold",
121+
"Poll the processor after waiting for MPI communication too long",
122+
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
123+
OPAL_INFO_LVL_9,
124+
MCA_BASE_VAR_SCOPE_READONLY,
125+
&ompi_mpi_poll_when_idle_threshold);
126+
118127
ompi_mpi_event_tick_rate = -1;
119128
(void) mca_base_var_register("ompi", "mpi", NULL, "event_tick_rate",
120129
"How often to progress TCP communications (0 = never, otherwise specified in microseconds)",

opal/runtime/opal_progress.c

Lines changed: 40 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
* All rights reserved.
1313
* Copyright (c) 2006-2016 Los Alamos National Security, LLC. All rights
1414
* reserved.
15-
* Copyright (c) 2015-2016 Research Organization for Information Science
15+
* Copyright (c) 2015-2017 Research Organization for Information Science
1616
* and Technology (RIST). All rights reserved.
1717
*
1818
* $COPYRIGHT$
@@ -27,6 +27,11 @@
2727
#ifdef HAVE_SCHED_H
2828
#include <sched.h>
2929
#endif
30+
#if HAVE_POLL_H
31+
#include <poll.h>
32+
#elif HAVE_SYS_POLL_H
33+
#include <sys/poll.h>
34+
#endif
3035

3136
#include "opal/runtime/opal_progress.h"
3237
#include "opal/mca/event/event.h"
@@ -66,6 +71,9 @@ static size_t callbacks_lp_size = 0;
6671

6772
/* do we want to call sched_yield() if nothing happened */
6873
bool opal_progress_yield_when_idle = false;
74+
/* do we want to poll() if nothing happened for a while */
75+
int opal_progress_poll_when_idle_threshold = -1;
76+
static int yield_count = 0;
6977

7078
#if OPAL_PROGRESS_USE_TIMERS
7179
static opal_timer_t event_progress_last_time = 0;
@@ -229,16 +237,25 @@ opal_progress(void)
229237
}
230238
}
231239

240+
if (OPAL_UNLIKELY(opal_progress_yield_when_idle)) {
241+
if (events <= 0) {
242+
if (opal_progress_poll_when_idle_threshold < 0 || yield_count < opal_progress_poll_when_idle_threshold) {
243+
yield_count++;
232244
#if OPAL_HAVE_SCHED_YIELD
233-
if (opal_progress_yield_when_idle && events <= 0) {
234-
/* If there is nothing to do - yield the processor - otherwise
235-
* we could consume the processor for the entire time slice. If
236-
* the processor is oversubscribed - this will result in a best-case
237-
* latency equivalent to the time-slice.
238-
*/
239-
sched_yield();
240-
}
245+
/* If there is nothing to do - yield the processor - otherwise
246+
* we could consume the processor for the entire time slice. If
247+
* the processor is oversubscribed - this will result in a best-case
248+
* latency equivalent to the time-slice.
249+
*/
250+
sched_yield();
241251
#endif /* defined(HAVE_SCHED_YIELD) */
252+
} else {
253+
poll(NULL, 0, 1);
254+
}
255+
} else {
256+
yield_count = 0;
257+
}
258+
}
242259
}
243260

244261

@@ -310,6 +327,20 @@ opal_progress_set_yield_when_idle(bool yieldopt)
310327
}
311328

312329

330+
int
331+
opal_progress_set_poll_when_idle_threshold(int thresholdopt)
332+
{
333+
int tmp = opal_progress_poll_when_idle_threshold;
334+
335+
opal_progress_poll_when_idle_threshold = thresholdopt;
336+
337+
OPAL_OUTPUT((debug_output, "progress: progress_set_poll_threshold to %d",
338+
thresholdopt));
339+
340+
return tmp;
341+
}
342+
343+
313344
void
314345
opal_progress_set_event_poll_rate(int polltime)
315346
{

opal/runtime/opal_progress.h

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
* All rights reserved.
1212
* Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights
1313
* reserved.
14+
* Copyright (c) 2017 Research Organization for Information Science
15+
* and Technology (RIST). All rights reserved.
1416
*
1517
* $COPYRIGHT$
1618
*
@@ -113,19 +115,35 @@ OPAL_DECLSPEC void opal_progress_event_users_decrement(void);
113115
/**
114116
* Set whether opal_progress() should yield when idle
115117
*
116-
* Set whether opal_progress() should yield the processor (either by
117-
* sched_yield() or SwitchToThread()) if no events were progressed
118-
* during the progress loop. The return value of the callback
119-
* functions is used to determine whether or not yielding is required.
118+
* Set whether opal_progress() should yield the processor by
119+
* sched_yield() if no events were progressed during the progress loop.
120+
* The return value of the callback functions is used to determine
121+
* whether or not yielding is required.
120122
* By default, the event loop will yield when the progress function is
121-
* idle.
123+
* idle only when oversubscribed.
122124
*
123125
* @param yieldopt Whether to yield when idle.
124126
* @return Previous value of the yield_when_idle option.
125127
*/
126128
OPAL_DECLSPEC bool opal_progress_set_yield_when_idle(bool yieldopt);
127129

128130

131+
/**
132+
* Set whether opal_progress() should poll when idle for a while
133+
*
134+
* Set whether opal_progress() should poll if no events were progressed
135+
* during the progress loop for some time. The return value of the callback
136+
* functions is used to determine whether or not polling is required.
137+
* By default, the event loop will not poll when the progress function is
138+
* idle.
139+
* -1 means no polling, 0 means always poll, and a positive value n means
* poll once opal_progress() has been idle for n consecutive invocations.
140+
*
141+
* @param thresholdopt The threshold to start polling.
142+
* @return Previous value of the poll_threshold option.
143+
*/
144+
OPAL_DECLSPEC int opal_progress_set_poll_when_idle_threshold(int thresholdopt);
145+
146+
129147
/**
130148
* Set time between calls into the event library
131149
*

0 commit comments

Comments
 (0)