Skip to content

Commit 164fc64

Browse files
author
Ralph Castain
authored
Merge pull request #2775 from rhc54/topic/oob3
More scaling efficiencies
2 parents 917b88a + e8e5f81 commit 164fc64

23 files changed

+486
-642
lines changed

opal/dss/dss_internal.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
* Copyright (c) 2004-2005 The Regents of the University of California.
1212
* All rights reserved.
1313
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
14-
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
14+
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
1515
* Copyright (c) 2014 Research Organization for Information Science
1616
* and Technology (RIST). All rights reserved.
1717
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
@@ -47,12 +47,12 @@ BEGIN_C_DECLS
4747
/*
4848
* The default starting chunk size
4949
*/
50-
#define OPAL_DSS_DEFAULT_INITIAL_SIZE 128
50+
#define OPAL_DSS_DEFAULT_INITIAL_SIZE 2048
5151
/*
5252
* The default threshold size when we switch from doubling the
5353
* buffer size to additively increasing it
5454
*/
55-
#define OPAL_DSS_DEFAULT_THRESHOLD_SIZE 1024
55+
#define OPAL_DSS_DEFAULT_THRESHOLD_SIZE 4096
5656

5757
/*
5858
* Internal type corresponding to size_t. Do not use this in

opal/dss/dss_internal_functions.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
* University of Stuttgart. All rights reserved.
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
12+
* Copyright (c) 2017 Intel, Inc. All rights reserved.
1213
* $COPYRIGHT$
1314
*
1415
* Additional copyrights may follow

orte/mca/odls/base/odls_base_default_fns.c

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -261,7 +261,6 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
261261
orte_proc_t *pptr, *dmn;
262262
opal_buffer_t *bptr;
263263
orte_app_context_t *app;
264-
bool found;
265264
orte_node_t *node;
266265
bool newmap = false;
267266

@@ -409,6 +408,13 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
409408
if (NULL == jdata->map) {
410409
jdata->map = OBJ_NEW(orte_job_map_t);
411410
newmap = true;
411+
} else if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_MAP_INITIALIZED)) {
412+
/* zero all the node map flags */
413+
for (n=0; n < jdata->map->nodes->size; n++) {
414+
if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, n))) {
415+
ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED);
416+
}
417+
}
412418
}
413419

414420
/* if we have a file map, then we need to load it */
@@ -454,17 +460,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
454460
opal_pointer_array_add(dmn->node->procs, pptr);
455461

456462
/* add the node to the map, if not already there */
457-
found = false;
458-
for (k=0; k < jdata->map->nodes->size; k++) {
459-
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, k))) {
460-
continue;
461-
}
462-
if (node->daemon == dmn) {
463-
found = true;
464-
break;
465-
}
466-
}
467-
if (!found) {
463+
if (!ORTE_FLAG_TEST(dmn->node, ORTE_NODE_FLAG_MAPPED)) {
468464
OBJ_RETAIN(dmn->node);
469465
opal_pointer_array_add(jdata->map->nodes, dmn->node);
470466
if (newmap) {
@@ -497,6 +493,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
497493
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pptr->app_idx);
498494
ORTE_FLAG_SET(app, ORTE_APP_FLAG_USED_ON_NODE);
499495
}
496+
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_MAP_INITIALIZED);
500497
}
501498

502499
COMPLETE:

orte/mca/oob/base/base.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
* All rights reserved.
1212
* Copyright (c) 2012-2013 Los Alamos National Security, LLC. All rights
1313
* reserved.
14+
* Copyright (c) 2017 Intel, Inc. All rights reserved.
1415
* $COPYRIGHT$
1516
*
1617
* Additional copyrights may follow
@@ -54,13 +55,14 @@ OPAL_TIMING_DECLARE_EXT(ORTE_DECLSPEC, tm_oob)
5455
* Convenience Typedef
5556
*/
5657
typedef struct {
58+
opal_event_base_t *ev_base;
5759
char *include;
5860
char *exclude;
5961
opal_list_t components;
6062
opal_list_t actives;
6163
int max_uri_length;
6264
opal_hash_table_t peers;
63-
bool use_module_threads;
65+
int num_threads;
6466
#if OPAL_ENABLE_TIMING
6567
bool timing;
6668
#endif
@@ -119,7 +121,7 @@ ORTE_DECLSPEC void orte_oob_base_send_nb(int fd, short args, void *cbdata);
119121
__FILE__, __LINE__); \
120122
cd = OBJ_NEW(orte_oob_send_t); \
121123
cd->msg = (m); \
122-
opal_event_set(orte_event_base, &cd->ev, -1, \
124+
opal_event_set(orte_oob_base.ev_base, &cd->ev, -1, \
123125
OPAL_EV_WRITE, \
124126
orte_oob_base_send_nb, cd); \
125127
opal_event_set_priority(&cd->ev, ORTE_MSG_PRI); \
@@ -173,7 +175,7 @@ OBJ_CLASS_DECLARATION(mca_oob_uri_req_t);
173175
mca_oob_uri_req_t *rq; \
174176
rq = OBJ_NEW(mca_oob_uri_req_t); \
175177
rq->uri = strdup((u)); \
176-
opal_event_set(orte_event_base, &(rq)->ev, -1, \
178+
opal_event_set(orte_oob_base.ev_base, &(rq)->ev, -1, \
177179
OPAL_EV_WRITE, \
178180
orte_oob_base_set_addr, (rq)); \
179181
opal_event_set_priority(&(rq)->ev, ORTE_MSG_PRI); \
@@ -193,4 +195,3 @@ ORTE_DECLSPEC void orte_oob_base_ft_event(int fd, short args, void *cbdata);
193195

194196
END_C_DECLS
195197
#endif
196-

orte/mca/oob/base/oob_base_frame.c

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929

3030
#include "opal/class/opal_bitmap.h"
3131
#include "orte/mca/mca.h"
32+
#include "opal/runtime/opal_progress_threads.h"
3233
#include "opal/util/output.h"
3334
#include "opal/mca/base/base.h"
3435

@@ -53,19 +54,20 @@
5354
orte_oob_base_t orte_oob_base = {0};
5455
OPAL_TIMING_DECLARE(tm_oob)
5556

57+
5658
static int orte_oob_base_register(mca_base_register_flag_t flags)
5759
{
5860
if (ORTE_PROC_IS_APP || ORTE_PROC_IS_TOOL) {
59-
orte_oob_base.use_module_threads = false;
61+
orte_oob_base.num_threads = 0;
6062
} else {
61-
orte_oob_base.use_module_threads = true;
63+
orte_oob_base.num_threads = 8;
6264
}
63-
(void)mca_base_var_register("orte", "oob", "base", "enable_module_progress_threads",
64-
"Whether to independently progress OOB messages for each interface",
65+
(void)mca_base_var_register("orte", "oob", "base", "num_progress_threads",
66+
"Number of independent progress OOB messages for each interface",
6567
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
6668
OPAL_INFO_LVL_9,
6769
MCA_BASE_VAR_SCOPE_READONLY,
68-
&orte_oob_base.use_module_threads);
70+
&orte_oob_base.num_threads);
6971

7072
#if OPAL_ENABLE_TIMING
7173
/* Detailed timing setup */
@@ -107,6 +109,11 @@ static int orte_oob_base_close(void)
107109

108110
OBJ_DESTRUCT(&orte_oob_base.peers);
109111

112+
if (ORTE_PROC_IS_APP || ORTE_PROC_IS_TOOL) {
113+
opal_progress_thread_finalize(NULL);
114+
} else {
115+
opal_progress_thread_finalize("OOB-BASE");
116+
}
110117

111118
OPAL_TIMING_EVENT((&tm_oob, "Finish"));
112119
OPAL_TIMING_REPORT(orte_oob_base.timing, &tm_oob);
@@ -126,6 +133,13 @@ static int orte_oob_base_open(mca_base_open_flag_t flags)
126133
opal_hash_table_init(&orte_oob_base.peers, 128);
127134
OBJ_CONSTRUCT(&orte_oob_base.actives, opal_list_t);
128135

136+
if (ORTE_PROC_IS_APP || ORTE_PROC_IS_TOOL) {
137+
orte_oob_base.ev_base = opal_progress_thread_init(NULL);
138+
} else {
139+
orte_oob_base.ev_base = opal_progress_thread_init("OOB-BASE");
140+
}
141+
142+
129143
#if OPAL_ENABLE_FT_CR == 1
130144
/* register the FT events callback */
131145
orte_state.add_job_state(ORTE_JOB_STATE_FT_CHECKPOINT, orte_oob_base_ft_event, ORTE_ERROR_PRI);

orte/mca/oob/tcp/Makefile.am

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
1313
# Copyright (c) 2012-2013 Los Alamos National Security, LLC.
1414
# All rights reserved
15-
# Copyright (c) 2014 Intel, Inc. All rights reserved.
15+
# Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
1616
# $COPYRIGHT$
1717
#
1818
# Additional copyrights may follow
@@ -32,7 +32,6 @@ sources = \
3232
oob_tcp_sendrecv.h \
3333
oob_tcp_hdr.h \
3434
oob_tcp_peer.h \
35-
oob_tcp_ping.h \
3635
oob_tcp.c \
3736
oob_tcp_listener.c \
3837
oob_tcp_common.c \
@@ -59,4 +58,3 @@ mca_oob_tcp_la_LDFLAGS = -module -avoid-version
5958
noinst_LTLIBRARIES = $(component_noinst)
6059
libmca_oob_tcp_la_SOURCES = $(sources)
6160
libmca_oob_tcp_la_LDFLAGS = -module -avoid-version
62-

0 commit comments

Comments
 (0)