Skip to content

Commit 466cbd4

Browse files
author
Ralph Castain
committed
Rework the threading in oob/tcp so that daemons (including mpirun) use multiple progress threads to get messages out to their children, and so that the oob/base uses a separate one to set up sends. This allows the daemon cmd processor to execute in parallel with relay of messages, which significantly reduces launch times at scale.
Signed-off-by: Ralph Castain <[email protected]>
1 parent 917b88a commit 466cbd4

16 files changed

+459
-622
lines changed

orte/mca/oob/base/base.h

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@
1111
* All rights reserved.
1212
* Copyright (c) 2012-2013 Los Alamos National Security, LLC. All rights
1313
* reserved.
14+
* Copyright (c) 2017 Intel, Inc. All rights reserved.
1415
* $COPYRIGHT$
1516
*
1617
* Additional copyrights may follow
@@ -54,13 +55,14 @@ OPAL_TIMING_DECLARE_EXT(ORTE_DECLSPEC, tm_oob)
5455
* Convenience Typedef
5556
*/
5657
typedef struct {
58+
opal_event_base_t *ev_base;
5759
char *include;
5860
char *exclude;
5961
opal_list_t components;
6062
opal_list_t actives;
6163
int max_uri_length;
6264
opal_hash_table_t peers;
63-
bool use_module_threads;
65+
int num_threads;
6466
#if OPAL_ENABLE_TIMING
6567
bool timing;
6668
#endif
@@ -119,7 +121,7 @@ ORTE_DECLSPEC void orte_oob_base_send_nb(int fd, short args, void *cbdata);
119121
__FILE__, __LINE__); \
120122
cd = OBJ_NEW(orte_oob_send_t); \
121123
cd->msg = (m); \
122-
opal_event_set(orte_event_base, &cd->ev, -1, \
124+
opal_event_set(orte_oob_base.ev_base, &cd->ev, -1, \
123125
OPAL_EV_WRITE, \
124126
orte_oob_base_send_nb, cd); \
125127
opal_event_set_priority(&cd->ev, ORTE_MSG_PRI); \
@@ -173,7 +175,7 @@ OBJ_CLASS_DECLARATION(mca_oob_uri_req_t);
173175
mca_oob_uri_req_t *rq; \
174176
rq = OBJ_NEW(mca_oob_uri_req_t); \
175177
rq->uri = strdup((u)); \
176-
opal_event_set(orte_event_base, &(rq)->ev, -1, \
178+
opal_event_set(orte_oob_base.ev_base, &(rq)->ev, -1, \
177179
OPAL_EV_WRITE, \
178180
orte_oob_base_set_addr, (rq)); \
179181
opal_event_set_priority(&(rq)->ev, ORTE_MSG_PRI); \
@@ -193,4 +195,3 @@ ORTE_DECLSPEC void orte_oob_base_ft_event(int fd, short args, void *cbdata);
193195

194196
END_C_DECLS
195197
#endif
196-

orte/mca/oob/base/oob_base_frame.c

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929

3030
#include "opal/class/opal_bitmap.h"
3131
#include "orte/mca/mca.h"
32+
#include "opal/runtime/opal_progress_threads.h"
3233
#include "opal/util/output.h"
3334
#include "opal/mca/base/base.h"
3435

@@ -53,19 +54,20 @@
5354
orte_oob_base_t orte_oob_base = {0};
5455
OPAL_TIMING_DECLARE(tm_oob)
5556

57+
5658
static int orte_oob_base_register(mca_base_register_flag_t flags)
5759
{
5860
if (ORTE_PROC_IS_APP || ORTE_PROC_IS_TOOL) {
59-
orte_oob_base.use_module_threads = false;
61+
orte_oob_base.num_threads = 0;
6062
} else {
61-
orte_oob_base.use_module_threads = true;
63+
orte_oob_base.num_threads = 8;
6264
}
63-
(void)mca_base_var_register("orte", "oob", "base", "enable_module_progress_threads",
64-
"Whether to independently progress OOB messages for each interface",
65+
(void)mca_base_var_register("orte", "oob", "base", "num_progress_threads",
66+
"Number of independent progress OOB messages for each interface",
6567
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
6668
OPAL_INFO_LVL_9,
6769
MCA_BASE_VAR_SCOPE_READONLY,
68-
&orte_oob_base.use_module_threads);
70+
&orte_oob_base.num_threads);
6971

7072
#if OPAL_ENABLE_TIMING
7173
/* Detailed timing setup */
@@ -107,6 +109,11 @@ static int orte_oob_base_close(void)
107109

108110
OBJ_DESTRUCT(&orte_oob_base.peers);
109111

112+
if (ORTE_PROC_IS_APP || ORTE_PROC_IS_TOOL) {
113+
opal_progress_thread_finalize(NULL);
114+
} else {
115+
opal_progress_thread_finalize("OOB-BASE");
116+
}
110117

111118
OPAL_TIMING_EVENT((&tm_oob, "Finish"));
112119
OPAL_TIMING_REPORT(orte_oob_base.timing, &tm_oob);
@@ -126,6 +133,13 @@ static int orte_oob_base_open(mca_base_open_flag_t flags)
126133
opal_hash_table_init(&orte_oob_base.peers, 128);
127134
OBJ_CONSTRUCT(&orte_oob_base.actives, opal_list_t);
128135

136+
if (ORTE_PROC_IS_APP || ORTE_PROC_IS_TOOL) {
137+
orte_oob_base.ev_base = opal_progress_thread_init(NULL);
138+
} else {
139+
orte_oob_base.ev_base = opal_progress_thread_init("OOB-BASE");
140+
}
141+
142+
129143
#if OPAL_ENABLE_FT_CR == 1
130144
/* register the FT events callback */
131145
orte_state.add_job_state(ORTE_JOB_STATE_FT_CHECKPOINT, orte_oob_base_ft_event, ORTE_ERROR_PRI);

orte/mca/oob/tcp/Makefile.am

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
1313
# Copyright (c) 2012-2013 Los Alamos National Security, LLC.
1414
# All rights reserved
15-
# Copyright (c) 2014 Intel, Inc. All rights reserved.
15+
# Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
1616
# $COPYRIGHT$
1717
#
1818
# Additional copyrights may follow
@@ -32,7 +32,6 @@ sources = \
3232
oob_tcp_sendrecv.h \
3333
oob_tcp_hdr.h \
3434
oob_tcp_peer.h \
35-
oob_tcp_ping.h \
3635
oob_tcp.c \
3736
oob_tcp_listener.c \
3837
oob_tcp_common.c \
@@ -59,4 +58,3 @@ mca_oob_tcp_la_LDFLAGS = -module -avoid-version
5958
noinst_LTLIBRARIES = $(component_noinst)
6059
libmca_oob_tcp_la_SOURCES = $(sources)
6160
libmca_oob_tcp_la_LDFLAGS = -module -avoid-version
62-

0 commit comments

Comments
 (0)