diff --git a/ompi/runtime/ompi_mpi_init.c b/ompi/runtime/ompi_mpi_init.c
index a36dabc08dc..3b76e781ba2 100644
--- a/ompi/runtime/ompi_mpi_init.c
+++ b/ompi/runtime/ompi_mpi_init.c
@@ -18,7 +18,7 @@
  * Copyright (c) 2011 Sandia National Laboratories. All rights reserved.
  * Copyright (c) 2012-2013 Inria. All rights reserved.
  * Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
- * Copyright (c) 2014-2016 Research Organization for Information Science
+ * Copyright (c) 2014-2017 Research Organization for Information Science
  * and Technology (RIST). All rights reserved.
  * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
  *
@@ -284,6 +284,7 @@ opal_list_t ompi_registered_datareps = {{0}};
 
 bool ompi_enable_timing = false;
 extern bool ompi_mpi_yield_when_idle;
+extern int ompi_mpi_sleep_when_idle_threshold;
 extern int ompi_mpi_event_tick_rate;
 
 /**
@@ -971,6 +972,9 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
     /* see if yield_when_idle was specified - if so, use it */
     opal_progress_set_yield_when_idle(ompi_mpi_yield_when_idle);
 
+    /* set the threshold to start sleeping when idle */
+    opal_progress_set_sleep_when_idle_threshold(ompi_mpi_sleep_when_idle_threshold);
+
     /* negative value means use default - just don't do anything */
     if (ompi_mpi_event_tick_rate >= 0) {
         opal_progress_set_event_poll_rate(ompi_mpi_event_tick_rate);
diff --git a/ompi/runtime/ompi_mpi_params.c b/ompi/runtime/ompi_mpi_params.c
index f8376db633d..1ef789f6ed6 100644
--- a/ompi/runtime/ompi_mpi_params.c
+++ b/ompi/runtime/ompi_mpi_params.c
@@ -17,7 +17,7 @@
  * Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
  * Copyright (c) 2015 Mellanox Technologies, Inc.
  * All rights reserved.
- * Copyright (c) 2016 Research Organization for Information Science
+ * Copyright (c) 2016-2017 Research Organization for Information Science
  * and Technology (RIST). All rights reserved.
  * $COPYRIGHT$
  *
@@ -63,6 +63,7 @@ bool ompi_have_sparse_group_storage = OPAL_INT_TO_BOOL(OMPI_GROUP_SPARSE);
 bool ompi_use_sparse_group_storage = OPAL_INT_TO_BOOL(OMPI_GROUP_SPARSE);
 
 bool ompi_mpi_yield_when_idle = false;
+int ompi_mpi_sleep_when_idle_threshold = -1;
 int ompi_mpi_event_tick_rate = -1;
 char *ompi_mpi_show_mca_params_string = NULL;
 bool ompi_mpi_have_sparse_group_storage = !!(OMPI_GROUP_SPARSE);
@@ -109,12 +110,20 @@ int ompi_mpi_register_params(void)
        exactly/under-subscribed, or 1 when oversubscribed */
     ompi_mpi_yield_when_idle = false;
     (void) mca_base_var_register("ompi", "mpi", NULL, "yield_when_idle",
-                                 "Yield the processor when waiting for MPI communication (for MPI processes, will default to 1 when oversubscribing nodes)",
+                                 "Yield the processor when waiting for communication (for MPI processes, will default to 1 when oversubscribing nodes)",
                                  MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
-                                 OPAL_INFO_LVL_9,
+                                 OPAL_INFO_LVL_6,
                                  MCA_BASE_VAR_SCOPE_READONLY,
                                  &ompi_mpi_yield_when_idle);
 
+    ompi_mpi_sleep_when_idle_threshold = -1;
+    (void) mca_base_var_register("ompi", "mpi", NULL, "sleep_when_idle_threshold",
+                                 "Sleep after waiting for communication too long",
+                                 MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                 OPAL_INFO_LVL_6,
+                                 MCA_BASE_VAR_SCOPE_READONLY,
+                                 &ompi_mpi_sleep_when_idle_threshold);
+
     ompi_mpi_event_tick_rate = -1;
     (void) mca_base_var_register("ompi", "mpi", NULL, "event_tick_rate",
                                  "How often to progress TCP communications (0 = never, otherwise specified in microseconds)",
diff --git a/opal/runtime/opal_progress.c b/opal/runtime/opal_progress.c
index 30ddcc6ac9a..f4bb70b1460 100644
--- a/opal/runtime/opal_progress.c
+++ b/opal/runtime/opal_progress.c
@@ -12,7 +12,7 @@
  * All rights reserved.
  * Copyright (c) 2006-2016 Los Alamos National Security, LLC. All rights
  * reserved.
- * Copyright (c) 2015-2016 Research Organization for Information Science
+ * Copyright (c) 2015-2017 Research Organization for Information Science
  * and Technology (RIST).
All rights reserved.
  *
  * $COPYRIGHT$
@@ -27,6 +27,11 @@
 #ifdef HAVE_SCHED_H
 #include <sched.h>
 #endif
+#if HAVE_POLL_H
+#include <poll.h>
+#elif HAVE_SYS_POLL_H
+#include <sys/poll.h>
+#endif
 
 #include "opal/runtime/opal_progress.h"
 #include "opal/mca/event/event.h"
@@ -66,6 +71,9 @@ static size_t callbacks_lp_size = 0;
 
 /* do we want to call sched_yield() if nothing happened */
 bool opal_progress_yield_when_idle = false;
+/* do we want to sleep if nothing happened for a while */
+int opal_progress_sleep_when_idle_threshold = -1;
+static int yield_count = 0;
 
 #if OPAL_PROGRESS_USE_TIMERS
 static opal_timer_t event_progress_last_time = 0;
@@ -229,16 +237,30 @@ opal_progress(void)
         }
     }
 
+    if (OPAL_UNLIKELY(opal_progress_yield_when_idle)) {
+        if (events <= 0) {
+            if (opal_progress_sleep_when_idle_threshold < 0 || yield_count < opal_progress_sleep_when_idle_threshold) {
+                /* only count idle passes when a threshold is set, so the
+                 * counter stays bounded by it and cannot overflow */
+                if (opal_progress_sleep_when_idle_threshold >= 0) {
+                    yield_count++;
+                }
 #if OPAL_HAVE_SCHED_YIELD
-    if (opal_progress_yield_when_idle && events <= 0) {
-        /* If there is nothing to do - yield the processor - otherwise
-         * we could consume the processor for the entire time slice. If
-         * the processor is oversubscribed - this will result in a best-case
-         * latency equivalent to the time-slice.
-         */
-        sched_yield();
-    }
+                /* If there is nothing to do - yield the processor - otherwise
+                 * we could consume the processor for the entire time slice. If
+                 * the processor is oversubscribed - this will result in a best-case
+                 * latency equivalent to the time-slice.
+                 */
+                sched_yield();
 #endif /* defined(HAVE_SCHED_YIELD) */
+            } else {
+                /* idle for too long - no fds and a 1 ms timeout: just sleep */
+                poll(NULL, 0, 1);
+            }
+        } else {
+            yield_count = 0;
+        }
+    }
 }
 
 
@@ -310,6 +332,20 @@ opal_progress_set_yield_when_idle(bool yieldopt)
 }
 
 
+int
+opal_progress_set_sleep_when_idle_threshold(int thresholdopt)
+{
+    int tmp = opal_progress_sleep_when_idle_threshold;
+
+    opal_progress_sleep_when_idle_threshold = thresholdopt;
+
+    OPAL_OUTPUT((debug_output, "progress: progress_set_sleep_threshold to %d",
+                 thresholdopt));
+
+    return tmp;
+}
+
+
 void
 opal_progress_set_event_poll_rate(int polltime)
 {
diff --git a/opal/runtime/opal_progress.h b/opal/runtime/opal_progress.h
index 5badbd5a459..68ce24ba38e 100644
--- a/opal/runtime/opal_progress.h
+++ b/opal/runtime/opal_progress.h
@@ -11,6 +11,8 @@
  * All rights reserved.
  * Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights
  * reserved.
+ * Copyright (c) 2017 Research Organization for Information Science
+ * and Technology (RIST). All rights reserved.
  *
  * $COPYRIGHT$
  *
@@ -113,12 +115,12 @@ OPAL_DECLSPEC void opal_progress_event_users_decrement(void);
 /**
  * Set whether opal_progress() should yield when idle
  *
- * Set whether opal_progress() should yield the processor (either by
- * sched_yield() or SwitchToThread()) if no events were progressed
- * during the progress loop. The return value of the callback
- * functions is used to determine whether or not yielding is required.
+ * Set whether opal_progress() should yield the processor by calling
+ * sched_yield() if no events were progressed during the progress loop.
+ * The return value of the callback functions is used to determine
+ * whether or not yielding is required.
  * By default, the event loop will yield when the progress function is
- * idle.
+ * idle only when the node is oversubscribed.
  *
  * @param yieldopt Whether to yield when idle.
  * @return Previous value of the yield_when_idle option.
@@ -126,6 +128,22 @@ OPAL_DECLSPEC void opal_progress_event_users_decrement(void);
 OPAL_DECLSPEC bool opal_progress_set_yield_when_idle(bool yieldopt);
 
 
+/**
+ * Set the idle threshold after which opal_progress() starts sleeping
+ *
+ * Set how many consecutive idle passes (no events progressed, as
+ * reported by the return value of the callback functions)
+ * opal_progress() yields before it sleeps instead. Sleeping is only
+ * considered while yield_when_idle is enabled. By default (-1) the
+ * progress function never sleeps.
+ * A negative value means no sleeping and 0 means always sleep.
+ *
+ * @param thresholdopt The threshold to start sleeping.
+ * @return Previous value of the sleep_when_idle_threshold option.
+ */
+OPAL_DECLSPEC int opal_progress_set_sleep_when_idle_threshold(int thresholdopt);
+
+
 /**
  * Set time between calls into the event library
  *