Skip to content

Commit 1c6ae86

Browse files
committed
optionally passive wait when the progress loop is idle for a while
Add the new mpi_poll_when_idle and mpi_poll_threshold MCA parameters to control whether the progress loop should poll() when idle and after how many idle iterations polling should start. The default is not to poll when idle. Thanks Paul Kapinos for bringing this to our attention. Signed-off-by: Gilles Gouaillardet <[email protected]>
1 parent 1c52d9d commit 1c6ae86

File tree

4 files changed

+108
-8
lines changed

4 files changed

+108
-8
lines changed

ompi/runtime/ompi_mpi_init.c

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
* Copyright (c) 2011 Sandia National Laboratories. All rights reserved.
1919
* Copyright (c) 2012-2013 Inria. All rights reserved.
2020
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
21-
* Copyright (c) 2014-2016 Research Organization for Information Science
21+
* Copyright (c) 2014-2017 Research Organization for Information Science
2222
* and Technology (RIST). All rights reserved.
2323
* Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
2424
*
@@ -284,6 +284,8 @@ opal_list_t ompi_registered_datareps = {{0}};
284284

285285
bool ompi_enable_timing = false;
286286
extern bool ompi_mpi_yield_when_idle;
287+
extern bool ompi_mpi_poll_when_idle;
288+
extern uint32_t ompi_mpi_poll_threshold;
287289
extern int ompi_mpi_event_tick_rate;
288290

289291
/**
@@ -971,6 +973,12 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
971973
/* see if yield_when_idle was specified - if so, use it */
972974
opal_progress_set_yield_when_idle(ompi_mpi_yield_when_idle);
973975

976+
/* see if poll_when_idle was specified - if so, use it */
977+
opal_progress_set_poll_when_idle(ompi_mpi_poll_when_idle);
978+
979+
/* set the threshold to start polling when idle */
980+
opal_progress_set_poll_threshold(ompi_mpi_poll_threshold);
981+
974982
/* negative value means use default - just don't do anything */
975983
if (ompi_mpi_event_tick_rate >= 0) {
976984
opal_progress_set_event_poll_rate(ompi_mpi_event_tick_rate);

ompi/runtime/ompi_mpi_params.c

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
1818
* Copyright (c) 2015 Mellanox Technologies, Inc.
1919
* All rights reserved.
20-
* Copyright (c) 2016 Research Organization for Information Science
20+
* Copyright (c) 2016-2017 Research Organization for Information Science
2121
* and Technology (RIST). All rights reserved.
2222
* $COPYRIGHT$
2323
*
@@ -63,6 +63,8 @@ bool ompi_have_sparse_group_storage = OPAL_INT_TO_BOOL(OMPI_GROUP_SPARSE);
6363
bool ompi_use_sparse_group_storage = OPAL_INT_TO_BOOL(OMPI_GROUP_SPARSE);
6464

6565
bool ompi_mpi_yield_when_idle = false;
66+
bool ompi_mpi_poll_when_idle = false;
67+
uint32_t ompi_mpi_poll_threshold = 1000;
6668
int ompi_mpi_event_tick_rate = -1;
6769
char *ompi_mpi_show_mca_params_string = NULL;
6870
bool ompi_mpi_have_sparse_group_storage = !!(OMPI_GROUP_SPARSE);
@@ -115,6 +117,22 @@ int ompi_mpi_register_params(void)
115117
MCA_BASE_VAR_SCOPE_READONLY,
116118
&ompi_mpi_yield_when_idle);
117119

120+
ompi_mpi_poll_when_idle = false;
121+
(void) mca_base_var_register("ompi", "mpi", NULL, "poll_when_idle",
122+
"Poll the processor when waiting for MPI communication (default to false)",
123+
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
124+
OPAL_INFO_LVL_9,
125+
MCA_BASE_VAR_SCOPE_READONLY,
126+
&ompi_mpi_poll_when_idle);
127+
128+
ompi_mpi_poll_threshold = 1000;
129+
(void) mca_base_var_register("ompi", "mpi", NULL, "poll_threshold",
130+
"Poll the processor when waiting for MPI communication (default to false)",
131+
MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0,
132+
OPAL_INFO_LVL_9,
133+
MCA_BASE_VAR_SCOPE_READONLY,
134+
&ompi_mpi_poll_threshold);
135+
118136
ompi_mpi_event_tick_rate = -1;
119137
(void) mca_base_var_register("ompi", "mpi", NULL, "event_tick_rate",
120138
"How often to progress TCP communications (0 = never, otherwise specified in microseconds)",

opal/runtime/opal_progress.c

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
* All rights reserved.
1313
* Copyright (c) 2006-2016 Los Alamos National Security, LLC. All rights
1414
* reserved.
15-
* Copyright (c) 2015-2016 Research Organization for Information Science
15+
* Copyright (c) 2015-2017 Research Organization for Information Science
1616
* and Technology (RIST). All rights reserved.
1717
*
1818
* $COPYRIGHT$
@@ -27,6 +27,7 @@
2727
#ifdef HAVE_SCHED_H
2828
#include <sched.h>
2929
#endif
30+
#include <poll.h>
3031

3132
#include "opal/runtime/opal_progress.h"
3233
#include "opal/mca/event/event.h"
@@ -66,6 +67,10 @@ static size_t callbacks_lp_size = 0;
6667

6768
/* do we want to call sched_yield() if nothing happened */
6869
bool opal_progress_yield_when_idle = false;
70+
/* do we want to poll() if nothing happened for a while */
71+
uint32_t opal_progress_poll_threshold = 1000;
72+
bool opal_progress_poll_when_idle = false;
73+
static uint32_t yield_count = 0;
6974

7075
#if OPAL_PROGRESS_USE_TIMERS
7176
static opal_timer_t event_progress_last_time = 0;
@@ -138,6 +143,8 @@ opal_progress_init(void)
138143
opal_progress_event_flag));
139144
OPAL_OUTPUT((debug_output, "progress: initialized yield_when_idle to: %s",
140145
opal_progress_yield_when_idle ? "true" : "false"));
146+
OPAL_OUTPUT((debug_output, "progress: initialized poll_when_idle to: %s",
147+
opal_progress_poll_when_idle ? "true" : "false"));
141148
OPAL_OUTPUT((debug_output, "progress: initialized num users to: %d",
142149
num_event_users));
143150
OPAL_OUTPUT((debug_output, "progress: initialized poll rate to: %ld",
@@ -229,6 +236,15 @@ opal_progress(void)
229236
}
230237
}
231238

239+
if (opal_progress_poll_when_idle) {
240+
if (events <= 0) {
241+
if (yield_count++ > opal_progress_poll_threshold) {
242+
poll(NULL, 0, 1);
243+
}
244+
} else {
245+
yield_count = 0;
246+
}
247+
}
232248
#if OPAL_HAVE_SCHED_YIELD
233249
if (opal_progress_yield_when_idle && events <= 0) {
234250
/* If there is nothing to do - yield the processor - otherwise
@@ -310,6 +326,33 @@ opal_progress_set_yield_when_idle(bool yieldopt)
310326
}
311327

312328

329+
bool
330+
opal_progress_set_poll_when_idle(bool pollopt)
331+
{
332+
bool tmp = opal_progress_poll_when_idle;
333+
opal_progress_poll_when_idle = (pollopt) ? 1 : 0;
334+
335+
OPAL_OUTPUT((debug_output, "progress: progress_set_poll_when_idle to %s",
336+
opal_progress_poll_when_idle ? "true" : "false"));
337+
338+
return tmp;
339+
}
340+
341+
342+
uint32_t
343+
opal_progress_set_poll_threshold(uint32_t thresholdopt)
344+
{
345+
uint32_t tmp = opal_progress_poll_threshold;
346+
347+
opal_progress_poll_threshold = thresholdopt;
348+
349+
OPAL_OUTPUT((debug_output, "progress: progress_set_poll_threshold to %d",
350+
thresholdopt));
351+
352+
return tmp;
353+
}
354+
355+
313356
void
314357
opal_progress_set_event_poll_rate(int polltime)
315358
{

opal/runtime/opal_progress.h

Lines changed: 36 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
* All rights reserved.
1212
* Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights
1313
* reserved.
14+
* Copyright (c) 2017 Research Organization for Information Science
15+
* and Technology (RIST). All rights reserved.
1416
*
1517
* $COPYRIGHT$
1618
*
@@ -113,19 +115,48 @@ OPAL_DECLSPEC void opal_progress_event_users_decrement(void);
113115
/**
114116
* Set whether opal_progress() should yield when idle
115117
*
116-
* Set whether opal_progress() should yield the processor (either by
117-
* sched_yield() or SwitchToThread()) if no events were progressed
118-
* during the progress loop. The return value of the callback
119-
* functions is used to determine whether or not yielding is required.
118+
* Set whether opal_progress() should yield the processor by
119+
* sched_yield() if no events were progressed during the progress loop.
120+
* The return value of the callback functions is used to determine
121+
* whether or not yielding is required.
120122
* By default, the event loop will yield when the progress function is
121-
* idle.
123+
* idle only when the node is oversubscribed.
122124
*
123125
* @param yieldopt Whether to yield when idle.
124126
* @return Previous value of the yield_when_idle option.
125127
*/
126128
OPAL_DECLSPEC bool opal_progress_set_yield_when_idle(bool yieldopt);
127129

128130

131+
/**
132+
* Set the threshold for opal_progress() to start polling when idle
133+
*
134+
* Set the number of consecutive calls to opal_progress() that progress no
135+
* events after which opal_progress() starts to poll() (only when polling
136+
* when idle is enabled). The return value of the callback functions is
137+
* used to determine whether or not an event was progressed.
138+
* By default, the threshold is 1000.
139+
*
140+
* @param thresholdopt The threshold to start polling.
141+
* @return Previous value of the poll_threshold option.
142+
*/
143+
OPAL_DECLSPEC uint32_t opal_progress_set_poll_threshold(uint32_t thresholdopt);
144+
145+
146+
/**
147+
* Set whether opal_progress() should poll when idle for a while
148+
*
149+
* Set whether opal_progress() should poll() if no events were progressed
150+
* during the progress loop for some time. The return value of the callback functions is used to determine whether or not polling is required.
151+
* By default, the event loop will not poll when the progress function is
152+
* idle.
153+
*
154+
* @param pollopt Whether to poll when idle.
155+
* @return Previous value of the poll_when_idle option.
156+
*/
157+
OPAL_DECLSPEC bool opal_progress_set_poll_when_idle(bool pollopt);
158+
159+
129160
/**
130161
* Set time between calls into the event library
131162
*

0 commit comments

Comments
 (0)