Skip to content

Commit fe64144

Browse files
author
Ralph Castain
authored
Merge pull request #3259 from rhc54/topic/launch
Update how we pass the node regex so we pass _all_ nodes, even those without daemons.
2 parents 9850832 + 92c9964 commit fe64144

27 files changed

+625
-1007
lines changed

opal/mca/pmix/pmix2x/pmix/src/server/pmix_server.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -990,11 +990,11 @@ static void _dmodex_req(int sd, short args, void *cbdata)
990990
* may not be a contribution */
991991
if (PMIX_SUCCESS == (rc = pmix_hash_fetch(&nptr->server->myremote, info->rank, "modex", &val)) &&
992992
NULL != val) {
993-
data = val->data.bo.bytes;
994-
sz = val->data.bo.size;
995-
/* protect the data */
996-
val->data.bo.bytes = NULL;
997-
val->data.bo.size = 0;
993+
data = val->data.bo.bytes;
994+
sz = val->data.bo.size;
995+
/* protect the data */
996+
val->data.bo.bytes = NULL;
997+
val->data.bo.size = 0;
998998
PMIX_VALUE_RELEASE(val);
999999
}
10001000

opal/mca/pmix/pmix2x/pmix/src/server/pmix_server_ops.c

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,6 @@ pmix_status_t pmix_server_commit(pmix_peer_t *peer, pmix_buffer_t *buf)
138138
pmix_nspace_t *nptr;
139139
pmix_rank_info_t *info;
140140
pmix_dmdx_remote_t *dcd, *dcdnext;
141-
pmix_buffer_t *pbkt;
142141
pmix_value_t *val;
143142
char *data;
144143
size_t sz;
@@ -236,16 +235,19 @@ pmix_status_t pmix_server_commit(pmix_peer_t *peer, pmix_buffer_t *buf)
236235
if (dcd->cd->proc.rank == info->rank) {
237236
/* we can now fulfill this request - collect the
238237
* remote/global data from this proc */
239-
pbkt = PMIX_NEW(pmix_buffer_t);
240238
/* get any remote contribution - note that there
241239
* may not be a contribution */
240+
data = NULL;
241+
sz = 0;
242242
if (PMIX_SUCCESS == pmix_hash_fetch(&nptr->server->myremote, info->rank, "modex", &val) &&
243243
NULL != val) {
244-
PMIX_LOAD_BUFFER(pbkt, val->data.bo.bytes, val->data.bo.size);
244+
data = val->data.bo.bytes;
245+
sz = val->data.bo.size;
246+
/* protect the data */
247+
val->data.bo.bytes = NULL;
248+
val->data.bo.size = 0;
245249
PMIX_VALUE_RELEASE(val);
246250
}
247-
PMIX_UNLOAD_BUFFER(pbkt, data, sz);
248-
PMIX_RELEASE(pbkt);
249251
/* execute the callback */
250252
dcd->cd->cbfunc(PMIX_SUCCESS, data, sz, dcd->cd->cbdata);
251253
if (NULL != data) {

opal/mca/pmix/pmix2x/pmix/src/util/compress.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
* University of Stuttgart. All rights reserved.
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
12-
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
12+
* Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
1313
* $COPYRIGHT$
1414
*
1515
* Additional copyrights may follow
@@ -31,8 +31,8 @@
3131

3232
BEGIN_C_DECLS
3333

34-
/* define a limit for storing raw strings */
35-
#define PMIX_STRING_LIMIT 512
34+
/* define a limit of 128k for raw strings */
35+
#define PMIX_STRING_LIMIT 131072
3636

3737
/* define a macro for quickly checking if a string exceeds the
3838
* compression limit */

orte/mca/ess/alps/ess_alps_module.c

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
1313
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
1414
* All rights reserved.
15+
* Copyright (c) 2017 Intel, Inc. All rights reserved.
1516
* $COPYRIGHT$
1617
*
1718
* Additional copyrights may follow
@@ -58,7 +59,6 @@ static int rte_init(void)
5859
{
5960
int ret;
6061
char *error = NULL;
61-
char **hosts = NULL;
6262

6363
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_framework.framework_output,
6464
"ess:alps in rte_init"));
@@ -90,23 +90,11 @@ static int rte_init(void)
9090
* default procedure
9191
*/
9292
if (ORTE_PROC_IS_DAEMON) {
93-
if (NULL != orte_node_regex) {
94-
/* extract the nodes */
95-
if (ORTE_SUCCESS != (ret =
96-
orte_regex_extract_node_names(orte_node_regex, &hosts)) ||
97-
NULL == hosts) {
98-
error = "orte_regex_extract_node_names";
99-
goto fn_fail;
100-
}
101-
}
102-
if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(hosts))) {
93+
if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup())) {
10394
ORTE_ERROR_LOG(ret);
10495
error = "orte_ess_base_orted_setup";
10596
goto fn_fail;
10697
}
107-
if (NULL != hosts) {
108-
opal_argv_free(hosts);
109-
}
11098

11199
/*
112100
* now synchronize with aprun.

orte/mca/ess/base/base.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
* Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved.
1313
* Copyright (c) 2012 Oak Ridge National Labs. All rights reserved.
1414
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights reserved.
15-
* Copyright (c) 2013 Intel, Inc. All rights reserved.
15+
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
1616
* $COPYRIGHT$
1717
*
1818
* Additional copyrights may follow
@@ -67,7 +67,7 @@ ORTE_DECLSPEC void orte_ess_base_app_abort(int status, bool report);
6767
ORTE_DECLSPEC int orte_ess_base_tool_setup(void);
6868
ORTE_DECLSPEC int orte_ess_base_tool_finalize(void);
6969

70-
ORTE_DECLSPEC int orte_ess_base_orted_setup(char **hosts);
70+
ORTE_DECLSPEC int orte_ess_base_orted_setup(void);
7171
ORTE_DECLSPEC int orte_ess_base_orted_finalize(void);
7272

7373
/* Detect whether or not this proc is bound - if not,

orte/mca/ess/base/ess_base_std_orted.c

Lines changed: 17 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ static void setup_sighandler(int signal, opal_event_t *ev,
103103
}
104104

105105

106-
int orte_ess_base_orted_setup(char **hosts)
106+
int orte_ess_base_orted_setup(void)
107107
{
108108
int ret = ORTE_ERROR;
109109
int fd;
@@ -113,7 +113,6 @@ int orte_ess_base_orted_setup(char **hosts)
113113
orte_job_t *jdata;
114114
orte_proc_t *proc;
115115
orte_app_context_t *app;
116-
orte_node_t *node;
117116
char *param;
118117
hwloc_obj_t obj;
119118
unsigned i, j;
@@ -218,12 +217,9 @@ int orte_ess_base_orted_setup(char **hosts)
218217
* a specific module to use
219218
*/
220219
(void) mca_base_var_env_name("plm", &param);
221-
222220
plm_in_use = !!(getenv(param));
223221
free (param);
224-
225222
if (plm_in_use) {
226-
227223
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_plm_base_framework, 0))) {
228224
ORTE_ERROR_LOG(ret);
229225
error = "orte_plm_base_open";
@@ -332,11 +328,6 @@ int orte_ess_base_orted_setup(char **hosts)
332328
app = OBJ_NEW(orte_app_context_t);
333329
opal_pointer_array_set_item(jdata->apps, 0, app);
334330
jdata->num_apps++;
335-
/* create and store a node object where we are */
336-
node = OBJ_NEW(orte_node_t);
337-
node->name = strdup(orte_process_info.nodename);
338-
node->index = ORTE_PROC_MY_NAME->vpid;
339-
opal_pointer_array_set_item(orte_node_pool, ORTE_PROC_MY_NAME->vpid, node);
340331

341332
/* create and store a proc object for us */
342333
proc = OBJ_NEW(orte_proc_t);
@@ -345,19 +336,6 @@ int orte_ess_base_orted_setup(char **hosts)
345336
proc->pid = orte_process_info.pid;
346337
proc->state = ORTE_PROC_STATE_RUNNING;
347338
opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc);
348-
/* record that the daemon (i.e., us) is on this node
349-
* NOTE: we do not add the proc object to the node's
350-
* proc array because we are not an application proc.
351-
* Instead, we record it in the daemon field of the
352-
* node object
353-
*/
354-
OBJ_RETAIN(proc); /* keep accounting straight */
355-
node->daemon = proc;
356-
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED);
357-
node->state = ORTE_NODE_STATE_UP;
358-
/* now point our proc node field to the node */
359-
OBJ_RETAIN(node); /* keep accounting straight */
360-
proc->node = node;
361339
/* record that the daemon job is running */
362340
jdata->num_procs = 1;
363341
jdata->state = ORTE_JOB_STATE_RUNNING;
@@ -514,7 +492,6 @@ int orte_ess_base_orted_setup(char **hosts)
514492
orte_topo_signature = opal_hwloc_base_get_topo_signature(opal_hwloc_topology);
515493
t->sig = strdup(orte_topo_signature);
516494
opal_pointer_array_add(orte_node_topologies, t);
517-
node->topology = t;
518495
if (15 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) {
519496
opal_output(0, "%s Topology Info:", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
520497
opal_dss.dump(0, opal_hwloc_topology, OPAL_HWLOC_TOPO);
@@ -526,12 +503,25 @@ int orte_ess_base_orted_setup(char **hosts)
526503
* after we enable_comm as that function determines our
527504
* own port, which we need in order to construct the nidmap
528505
*/
529-
if (NULL != hosts) {
506+
if (NULL != orte_node_regex) {
507+
if (ORTE_SUCCESS != (ret = orte_util_nidmap_parse(orte_node_regex))) {
508+
ORTE_ERROR_LOG(ret);
509+
error = "construct nidmap";
510+
goto error;
511+
}
512+
}
513+
514+
if (orte_static_ports) {
515+
if (NULL == orte_node_regex) {
516+
/* we didn't get the node info */
517+
error = "cannot construct daemon map for static ports - no node map info";
518+
goto error;
519+
}
530520
/* extract the node info from the environment and
531521
* build a nidmap from it - this will update the
532522
* routing plan as well
533523
*/
534-
if (ORTE_SUCCESS != (ret = orte_util_build_daemon_nidmap(hosts))) {
524+
if (ORTE_SUCCESS != (ret = orte_util_build_daemon_nidmap())) {
535525
ORTE_ERROR_LOG(ret);
536526
error = "construct daemon map from static ports";
537527
goto error;
@@ -635,6 +625,7 @@ int orte_ess_base_orted_setup(char **hosts)
635625
}
636626

637627
return ORTE_SUCCESS;
628+
638629
error:
639630
orte_show_help("help-orte-runtime.txt",
640631
"orte_init:startup:internal-failure",

orte/mca/ess/env/ess_env_module.c

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
1212
* Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved.
13-
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
13+
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
1414
* $COPYRIGHT$
1515
*
1616
* Additional copyrights may follow
@@ -98,7 +98,6 @@ static int rte_init(void)
9898
{
9999
int ret;
100100
char *error = NULL;
101-
char **hosts = NULL;
102101

103102
/* run the prolog */
104103
if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
@@ -112,19 +111,11 @@ static int rte_init(void)
112111
/* if I am a daemon, complete my setup using the
113112
* default procedure
114113
*/
115-
if (NULL != orte_node_regex) {
116-
/* extract the nodes */
117-
if (ORTE_SUCCESS != (ret = orte_regex_extract_node_names(orte_node_regex, &hosts))) {
118-
error = "orte_regex_extract_node_names";
119-
goto error;
120-
}
121-
}
122-
if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(hosts))) {
114+
if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup())) {
123115
ORTE_ERROR_LOG(ret);
124116
error = "orte_ess_base_orted_setup";
125117
goto error;
126118
}
127-
opal_argv_free(hosts);
128119
return ORTE_SUCCESS;
129120

130121
error:

orte/mca/ess/lsf/ess_lsf_module.c

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
1212
* Copyright (c) 2007-2011 Cisco Systems, Inc. All rights reserved.
13-
* Copyright (c) 2013 Intel, Inc. All rights reserved.
13+
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
1414
* Copyright (c) 2016 Research Organization for Information Science
1515
* and Technology (RIST). All rights reserved.
1616
* $COPYRIGHT$
@@ -68,7 +68,6 @@ static int rte_init(void)
6868
{
6969
int ret;
7070
char *error = NULL;
71-
char **hosts = NULL;
7271

7372
/* run the prolog */
7473
if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
@@ -83,19 +82,11 @@ static int rte_init(void)
8382
* default procedure
8483
*/
8584
if (ORTE_PROC_IS_DAEMON) {
86-
if (NULL != orte_node_regex) {
87-
/* extract the nodes */
88-
if (ORTE_SUCCESS != (ret = orte_regex_extract_node_names(orte_node_regex, &hosts))) {
89-
error = "orte_regex_extract_node_names";
90-
goto error;
91-
}
92-
}
93-
if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(hosts))) {
85+
if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup())) {
9486
ORTE_ERROR_LOG(ret);
9587
error = "orte_ess_base_orted_setup";
9688
goto error;
9789
}
98-
opal_argv_free(hosts);
9990
return ORTE_SUCCESS;
10091
}
10192

orte/mca/ess/slurm/ess_slurm_module.c

Lines changed: 2 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
1212
* Copyright (c) 2008-2011 Cisco Systems, Inc. All rights reserved.
13-
* Copyright (c) 2013 Intel, Inc. All rights reserved.
13+
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
1414
* $COPYRIGHT$
1515
*
1616
* Additional copyrights may follow
@@ -62,7 +62,6 @@ static int rte_init(void)
6262
{
6363
int ret;
6464
char *error = NULL;
65-
char **hosts = NULL;
6665

6766
/* run the prolog */
6867
if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
@@ -77,23 +76,11 @@ static int rte_init(void)
7776
* default procedure
7877
*/
7978
if (ORTE_PROC_IS_DAEMON) {
80-
if (NULL != orte_node_regex) {
81-
/* extract the nodes */
82-
if (ORTE_SUCCESS != (ret =
83-
orte_regex_extract_node_names(orte_node_regex, &hosts)) ||
84-
NULL == hosts) {
85-
error = "orte_regex_extract_node_names";
86-
goto error;
87-
}
88-
}
89-
if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(hosts))) {
79+
if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup())) {
9080
ORTE_ERROR_LOG(ret);
9181
error = "orte_ess_base_orted_setup";
9282
goto error;
9383
}
94-
if (NULL != hosts) {
95-
opal_argv_free(hosts);
96-
}
9784
return ORTE_SUCCESS;
9885
}
9986

orte/mca/ess/tm/ess_tm_module.c

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,6 @@ static int rte_init(void)
6767
{
6868
int ret;
6969
char *error = NULL;
70-
char **hosts = NULL;
7170

7271
/* run the prolog */
7372
if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
@@ -82,21 +81,11 @@ static int rte_init(void)
8281
* default procedure
8382
*/
8483
if (ORTE_PROC_IS_DAEMON) {
85-
if (NULL != orte_node_regex) {
86-
/* extract the nodes */
87-
if (ORTE_SUCCESS != (ret =
88-
orte_regex_extract_node_names(orte_node_regex, &hosts)) ||
89-
NULL == hosts) {
90-
error = "orte_regex_extract_node_names";
91-
goto error;
92-
}
93-
}
94-
if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(hosts))) {
84+
if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup())) {
9585
ORTE_ERROR_LOG(ret);
9686
error = "orte_ess_base_orted_setup";
9787
goto error;
9888
}
99-
opal_argv_free(hosts);
10089
return ORTE_SUCCESS;
10190
}
10291

@@ -194,4 +183,3 @@ static int tm_set_name(void)
194183

195184
return ORTE_SUCCESS;
196185
}
197-

0 commit comments

Comments
 (0)