Skip to content

Commit 2b2ea2f

Browse files
author
Ralph Castain
authored
Merge pull request #2869 from rhc54/topic/staticports
Fix static port and partial allocation operations
2 parents 47450eb + b59ae14 commit 2b2ea2f

File tree

14 files changed: +142 additions, -68 deletions

orte/mca/errmgr/default_orted/errmgr_default_orted.c

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@
88
* reserved.
99
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
1010
* All rights reserved.
11-
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
11+
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
1212
* $COPYRIGHT$
1313
*
1414
* Additional copyrights may follow
@@ -245,9 +245,12 @@ static void proc_errors(int fd, short args, void *cbdata)
245245
* lifeline
246246
*/
247247
if (ORTE_PROC_STATE_LIFELINE_LOST == state ||
248-
ORTE_PROC_STATE_UNABLE_TO_SEND_MSG == state) {
248+
ORTE_PROC_STATE_UNABLE_TO_SEND_MSG == state ||
249+
ORTE_PROC_STATE_NO_PATH_TO_TARGET == state ||
250+
ORTE_PROC_STATE_PEER_UNKNOWN == state ||
251+
ORTE_PROC_STATE_FAILED_TO_CONNECT == state) {
249252
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
250-
"%s errmgr:orted lifeline lost - exiting",
253+
"%s errmgr:orted lifeline lost or unable to communicate - exiting",
251254
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
252255
/* set our exit status */
253256
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);

orte/mca/oob/base/oob_base_stubs.c

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -46,7 +46,7 @@ void orte_oob_base_send_nb(int fd, short args, void *cbdata)
4646
OBJ_RELEASE(cd);
4747

4848
opal_output_verbose(5, orte_oob_base_framework.framework_output,
49-
"%s oob:base:send to target %s - %u attempt",
49+
"%s oob:base:send to target %s - attempt %u",
5050
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
5151
ORTE_NAME_PRINT(&msg->dst), msg->retries);
5252

orte/mca/oob/tcp/oob_tcp_connection.c

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -273,6 +273,8 @@ void mca_oob_tcp_peer_try_connect(int fd, short args, void *cbdata)
273273
if (mca_oob_tcp_component.max_recon_attempts < 0 ||
274274
peer->num_retries < mca_oob_tcp_component.max_recon_attempts) {
275275
struct timeval tv;
276+
/* close the current socket */
277+
CLOSE_THE_SOCKET(peer->sd);
276278
/* reset the addr states */
277279
OPAL_LIST_FOREACH(addr, &peer->addrs, mca_oob_tcp_addr_t) {
278280
addr->state = MCA_OOB_TCP_UNCONNECTED;
@@ -306,6 +308,8 @@ void mca_oob_tcp_peer_try_connect(int fd, short args, void *cbdata)
306308
"------------------------------------------------------------",
307309
orte_process_info.nodename,
308310
(NULL == host) ? "<unknown>" : host);
311+
/* close the socket */
312+
CLOSE_THE_SOCKET(peer->sd);
309313
/* let the TCP component know that this module failed to make
310314
* the connection so it can do some bookkeeping and fail back
311315
* to the OOB level so another component can try. This will activate
@@ -350,6 +354,8 @@ void mca_oob_tcp_peer_try_connect(int fd, short args, void *cbdata)
350354
} else {
351355
peer->state = MCA_OOB_TCP_UNCONNECTED;
352356
}
357+
/* close the socket */
358+
CLOSE_THE_SOCKET(peer->sd);
353359
return;
354360
} else {
355361
opal_output(0,
@@ -361,6 +367,8 @@ void mca_oob_tcp_peer_try_connect(int fd, short args, void *cbdata)
361367
opal_net_get_port((struct sockaddr*)&addr->addr),
362368
opal_strerror(rc),
363369
rc);
370+
/* close the socket */
371+
CLOSE_THE_SOCKET(peer->sd);
364372
ORTE_FORCED_TERMINATE(1);
365373
}
366374

orte/mca/oob/tcp/oob_tcp_listener.c

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -385,6 +385,10 @@ static int create_listen(void)
385385
conn = OBJ_NEW(mca_oob_tcp_listener_t);
386386
conn->sd = sd;
387387
conn->port = ntohs(((struct sockaddr_in*) &inaddr)->sin_port);
388+
if (orte_static_ports && 0 == orte_process_info.my_port) {
389+
/* save the first one */
390+
orte_process_info.my_port = conn->port;
391+
}
388392
opal_list_append(&mca_oob_tcp_component.listeners, &conn->item);
389393
/* and to our ports */
390394
asprintf(&tconn, "%d", ntohs(((struct sockaddr_in*) &inaddr)->sin_port));

orte/mca/plm/alps/plm_alps_module.c

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@
1313
* Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved.
1414
* Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights
1515
* reserved.
16-
* Copyright (c) 2014 Intel Corporation. All rights reserved.
16+
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
1717
* $COPYRIGHT$
1818
*
1919
* Additional copyrights may follow
@@ -350,6 +350,17 @@ static void launch_daemons(int fd, short args, void *cbdata)
350350
/* add the daemon command (as specified by user) */
351351
orte_plm_base_setup_orted_cmd(&argc, &argv);
352352

353+
/* if we have static ports, we need to ensure that mpirun is
354+
* on the list. Since alps won't be launching a daemon on it,
355+
* it won't have been placed on the list, so create a new
356+
* version here that includes it */
357+
if (orte_static_ports) {
358+
char *ltmp;
359+
asprintf(&ltmp, "%s,%s", orte_process_info.nodename, nodelist_flat);
360+
free(nodelist_flat);
361+
nodelist_flat = ltmp;
362+
}
363+
353364
/* Add basic orted command line options, including debug flags */
354365
orte_plm_base_orted_append_basic_args(&argc, &argv,
355366
NULL,

orte/mca/plm/base/plm_base_launch_support.c

Lines changed: 39 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -84,27 +84,35 @@
8484
void orte_plm_base_set_slots(orte_node_t *node)
8585
{
8686
if (0 == strncmp(orte_set_slots, "cores", strlen(orte_set_slots))) {
87-
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo,
88-
HWLOC_OBJ_CORE, 0,
89-
OPAL_HWLOC_LOGICAL);
87+
if (NULL != node->topology && NULL != node->topology->topo) {
88+
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo,
89+
HWLOC_OBJ_CORE, 0,
90+
OPAL_HWLOC_LOGICAL);
91+
}
9092
} else if (0 == strncmp(orte_set_slots, "sockets", strlen(orte_set_slots))) {
91-
if (0 == (node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo,
92-
HWLOC_OBJ_SOCKET, 0,
93-
OPAL_HWLOC_LOGICAL))) {
94-
/* some systems don't report sockets - in this case,
95-
* use numanodes */
93+
if (NULL != node->topology && NULL != node->topology->topo) {
94+
if (0 == (node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo,
95+
HWLOC_OBJ_SOCKET, 0,
96+
OPAL_HWLOC_LOGICAL))) {
97+
/* some systems don't report sockets - in this case,
98+
* use numanodes */
99+
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo,
100+
HWLOC_OBJ_NODE, 0,
101+
OPAL_HWLOC_LOGICAL);
102+
}
103+
}
104+
} else if (0 == strncmp(orte_set_slots, "numas", strlen(orte_set_slots))) {
105+
if (NULL != node->topology && NULL != node->topology->topo) {
96106
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo,
97107
HWLOC_OBJ_NODE, 0,
98108
OPAL_HWLOC_LOGICAL);
99109
}
100-
} else if (0 == strncmp(orte_set_slots, "numas", strlen(orte_set_slots))) {
101-
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo,
102-
HWLOC_OBJ_NODE, 0,
103-
OPAL_HWLOC_LOGICAL);
104110
} else if (0 == strncmp(orte_set_slots, "hwthreads", strlen(orte_set_slots))) {
105-
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo,
106-
HWLOC_OBJ_PU, 0,
107-
OPAL_HWLOC_LOGICAL);
111+
if (NULL != node->topology && NULL != node->topology->topo) {
112+
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo,
113+
HWLOC_OBJ_PU, 0,
114+
OPAL_HWLOC_LOGICAL);
115+
}
108116
} else {
109117
/* must be a number */
110118
node->slots = strtol(orte_set_slots, NULL, 10);
@@ -1436,16 +1444,23 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
14361444
free(rml_uri);
14371445

14381446
/* if we have static ports, pass the node list */
1439-
if (orte_static_ports && NULL != nodes) {
1440-
/* convert the nodes to a regex */
1441-
if (ORTE_SUCCESS != (rc = orte_regex_create(nodes, &param))) {
1442-
ORTE_ERROR_LOG(rc);
1443-
return rc;
1447+
if (orte_static_ports) {
1448+
param = NULL;
1449+
if (NULL != nodes) {
1450+
/* convert the nodes to a regex */
1451+
if (ORTE_SUCCESS != (rc = orte_regex_create(nodes, &param))) {
1452+
ORTE_ERROR_LOG(rc);
1453+
return rc;
1454+
}
1455+
} else if (NULL != orte_node_regex) {
1456+
param = strdup(orte_node_regex);
1457+
}
1458+
if (NULL != param) {
1459+
opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
1460+
opal_argv_append(argc, argv, "orte_node_regex");
1461+
opal_argv_append(argc, argv, param);
1462+
free(param);
14441463
}
1445-
opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
1446-
opal_argv_append(argc, argv, "orte_node_regex");
1447-
opal_argv_append(argc, argv, param);
1448-
free(param);
14491464
}
14501465

14511466
/* if output-filename was specified, pass that along */

orte/mca/plm/lsf/plm_lsf_module.c

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@
1414
* reserved.
1515
* Copyright (c) 2008 Institut National de Recherche en Informatique
1616
* et Automatique. All rights reserved.
17-
* Copyright (c) 2014 Intel Corporation. All rights reserved.
17+
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
1818
* $COPYRIGHT$
1919
*
2020
* Additional copyrights may follow
@@ -258,6 +258,17 @@ static void launch_daemons(int fd, short args, void *cbdata)
258258
/* add the daemon command (as specified by user) */
259259
orte_plm_base_setup_orted_cmd(&argc, &argv);
260260

261+
/* if we have static ports, we need to ensure that mpirun is
262+
* on the list. Since lsf won't be launching a daemon on it,
263+
* it won't have been placed on the list, so create a new
264+
* version here that includes it */
265+
if (orte_static_ports) {
266+
char *ltmp;
267+
asprintf(&ltmp, "%s,%s", orte_process_info.nodename, nodelist);
268+
free(nodelist);
269+
nodelist = ltmp;
270+
}
271+
261272
/* Add basic orted command line options */
262273
orte_plm_base_orted_append_basic_args(&argc, &argv,
263274
"lsf",

orte/mca/plm/rsh/plm_rsh_module.c

Lines changed: 31 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -328,7 +328,8 @@ static void rsh_wait_daemon(orte_proc_t *daemon, void* cbdata)
328328
static int setup_launch(int *argcptr, char ***argvptr,
329329
char *nodename,
330330
int *node_name_index1,
331-
int *proc_vpid_index, char *prefix_dir)
331+
int *proc_vpid_index, char *prefix_dir,
332+
char *nodelist)
332333
{
333334
int argc;
334335
char **argv;
@@ -613,7 +614,7 @@ static int setup_launch(int *argcptr, char ***argvptr,
613614
orte_plm_base_orted_append_basic_args(&argc, &argv,
614615
"env",
615616
proc_vpid_index,
616-
NULL);
617+
nodelist);
617618

618619
/* ensure that only the ssh plm is selected on the remote daemon */
619620
opal_argv_append_nosize(&argv, "-"OPAL_MCA_CMD_LINE_ID);
@@ -828,7 +829,7 @@ static int remote_spawn(opal_buffer_t *launch)
828829

829830
/* setup the launch */
830831
if (ORTE_SUCCESS != (rc = setup_launch(&argc, &argv, orte_process_info.nodename, &node_name_index1,
831-
&proc_vpid_index, prefix))) {
832+
&proc_vpid_index, prefix, NULL))) {
832833
ORTE_ERROR_LOG(rc);
833834
OBJ_DESTRUCT(&coll);
834835
goto cleanup;
@@ -993,6 +994,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
993994
int port, *portptr;
994995
orte_namelist_t *child;
995996
char *rtmod;
997+
char *nlistflat;
996998

997999
/* if we are launching debugger daemons, then just go
9981000
* do it - no new daemons will be launched
@@ -1153,12 +1155,37 @@ static void launch_daemons(int fd, short args, void *cbdata)
11531155
orte_routed.get_routing_list(rtmod, &coll);
11541156
}
11551157

1158+
if (orte_static_ports) {
1159+
/* create a list of all nodes involved so we can pass it along */
1160+
char **nodelist = NULL;
1161+
orte_node_t *n2;
1162+
for (nnode=0; nnode < map->nodes->size; nnode++) {
1163+
if (NULL != (n2 = (orte_node_t*)opal_pointer_array_get_item(map->nodes, nnode))) {
1164+
opal_argv_append_nosize(&nodelist, n2->name);
1165+
}
1166+
}
1167+
/* we need mpirun to be the first node on this list */
1168+
if (0 != strcmp(nodelist[0], orte_process_info.nodename)) {
1169+
opal_argv_prepend_nosize(&nodelist, orte_process_info.nodename);
1170+
}
1171+
nlistflat = opal_argv_join(nodelist, ',');
1172+
opal_argv_free(nodelist);
1173+
} else {
1174+
nlistflat = NULL;
1175+
}
1176+
11561177
/* setup the launch */
11571178
if (ORTE_SUCCESS != (rc = setup_launch(&argc, &argv, node->name, &node_name_index1,
1158-
&proc_vpid_index, prefix_dir))) {
1179+
&proc_vpid_index, prefix_dir, nlistflat))) {
11591180
ORTE_ERROR_LOG(rc);
1181+
if (NULL != nlistflat) {
1182+
free(nlistflat);
1183+
}
11601184
goto cleanup;
11611185
}
1186+
if (NULL != nlistflat) {
1187+
free(nlistflat);
1188+
}
11621189

11631190
/*
11641191
* Iterate through each of the nodes

orte/mca/plm/slurm/plm_slurm_module.c

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
* Copyright (c) 2006-2014 Cisco Systems, Inc. All rights reserved.
1313
* Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights
1414
* reserved.
15-
* Copyright (c) 2014 Intel Corporation. All rights reserved.
15+
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
1616
* $COPYRIGHT$
1717
*
1818
* Additional copyrights may follow
@@ -354,6 +354,17 @@ static void launch_daemons(int fd, short args, void *cbdata)
354354
/* add the daemon command (as specified by user) */
355355
orte_plm_base_setup_orted_cmd(&argc, &argv);
356356

357+
/* if we have static ports, we need to ensure that mpirun is
358+
* on the list. Since slurm won't be launching a daemon on it,
359+
* it won't have been placed on the list, so create a new
360+
* version here that includes it */
361+
if (orte_static_ports) {
362+
char *ltmp;
363+
asprintf(&ltmp, "%s,%s", orte_process_info.nodename, nodelist_flat);
364+
free(nodelist_flat);
365+
nodelist_flat = ltmp;
366+
}
367+
357368
/* Add basic orted command line options, including debug flags */
358369
orte_plm_base_orted_append_basic_args(&argc, &argv,
359370
"slurm", &proc_vpid_index,

orte/mca/plm/tm/plm_tm_module.c

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
1313
* Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights
1414
* reserved.
15-
* Copyright (c) 2014 Intel Corporation. All rights reserved.
15+
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
1616
* $COPYRIGHT$
1717
*
1818
* Additional copyrights may follow
@@ -278,6 +278,17 @@ static void launch_daemons(int fd, short args, void *cbdata)
278278
nodelist = opal_argv_join(nodeargv, ',');
279279
opal_argv_free(nodeargv);
280280

281+
/* if we have static ports, we need to ensure that mpirun is
282+
* on the list. Since Torque won't be launching a daemon on it,
283+
* it won't have been placed on the list, so create a new
284+
* version here that includes it */
285+
if (orte_static_ports) {
286+
char *ltmp;
287+
asprintf(&ltmp, "%s,%s", orte_process_info.nodename, nodelist);
288+
free(nodelist);
289+
nodelist = ltmp;
290+
}
291+
281292
/* Add basic orted command line options */
282293
orte_plm_base_orted_append_basic_args(&argc, &argv, "tm",
283294
&proc_vpid_index,

0 commit comments

Comments
 (0)