Skip to content

Commit 7b6e36e

Browse files
Ralph Castain authored
Merge pull request #2844 from rhc54/topic/sig
Cleanup launch
2 parents 3b82015 + 399de07 commit 7b6e36e

File tree

5 files changed: +37 -55 lines changed

orte/mca/oob/base/base.h

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ ORTE_DECLSPEC void orte_oob_base_send_nb(int fd, short args, void *cbdata);
126126
orte_oob_base_send_nb, cd); \
127127
opal_event_set_priority(&cd->ev, ORTE_MSG_PRI); \
128128
opal_event_active(&cd->ev, OPAL_EV_WRITE, 1); \
129-
}while(0);
129+
}while(0)
130130

131131
/* Our contact info is actually subject to change as transports
132132
* can fail at any time. So a request to obtain our URI requires
@@ -175,12 +175,9 @@ OBJ_CLASS_DECLARATION(mca_oob_uri_req_t);
175175
mca_oob_uri_req_t *rq; \
176176
rq = OBJ_NEW(mca_oob_uri_req_t); \
177177
rq->uri = strdup((u)); \
178-
opal_event_set(orte_oob_base.ev_base, &(rq)->ev, -1, \
179-
OPAL_EV_WRITE, \
180-
orte_oob_base_set_addr, (rq)); \
181-
opal_event_set_priority(&(rq)->ev, ORTE_MSG_PRI); \
182-
opal_event_active(&(rq)->ev, OPAL_EV_WRITE, 1); \
183-
}while(0);
178+
orte_oob_base_set_addr(0, 0, (void*)rq); \
179+
}while(0)
180+
184181
ORTE_DECLSPEC void orte_oob_base_set_addr(int fd, short args, void *cbdata);
185182

186183

orte/mca/oob/base/oob_base_frame.c

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -109,12 +109,6 @@ static int orte_oob_base_close(void)
109109

110110
OBJ_DESTRUCT(&orte_oob_base.peers);
111111

112-
if (ORTE_PROC_IS_APP || ORTE_PROC_IS_TOOL) {
113-
opal_progress_thread_finalize(NULL);
114-
} else {
115-
opal_progress_thread_finalize("OOB-BASE");
116-
}
117-
118112
OPAL_TIMING_EVENT((&tm_oob, "Finish"));
119113
OPAL_TIMING_REPORT(orte_oob_base.timing, &tm_oob);
120114

@@ -133,11 +127,7 @@ static int orte_oob_base_open(mca_base_open_flag_t flags)
133127
opal_hash_table_init(&orte_oob_base.peers, 128);
134128
OBJ_CONSTRUCT(&orte_oob_base.actives, opal_list_t);
135129

136-
if (ORTE_PROC_IS_APP || ORTE_PROC_IS_TOOL) {
137-
orte_oob_base.ev_base = opal_progress_thread_init(NULL);
138-
} else {
139-
orte_oob_base.ev_base = opal_progress_thread_init("OOB-BASE");
140-
}
130+
orte_oob_base.ev_base = orte_event_base;
141131

142132

143133
#if OPAL_ENABLE_FT_CR == 1

orte/mca/plm/base/plm_base_launch_support.c

Lines changed: 16 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -835,7 +835,7 @@ void orte_plm_base_daemon_topology(int status, orte_process_name_t* sender,
835835
orte_job_t *jdata;
836836

837837
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
838-
"%s plm:base:daemon_topology for daemon %s",
838+
"%s plm:base:daemon_topology recvd for daemon %s",
839839
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
840840
ORTE_NAME_PRINT(sender)));
841841

@@ -938,7 +938,7 @@ void orte_plm_base_daemon_topology(int status, orte_process_name_t* sender,
938938

939939
CLEANUP:
940940
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
941-
"%s plm:base:orted_report_launch %s for daemon %s",
941+
"%s plm:base:orted:report_topo launch %s for daemon %s",
942942
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
943943
orted_failed_launch ? "failed" : "completed",
944944
ORTE_NAME_PRINT(sender)));
@@ -985,7 +985,6 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
985985
char *rml_uri = NULL, *ptr;
986986
int rc, idx;
987987
orte_proc_t *daemon=NULL;
988-
orte_node_t *node;
989988
orte_job_t *jdata;
990989
orte_process_name_t dname;
991990
opal_buffer_t *relay;
@@ -994,7 +993,7 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
994993
hwloc_topology_t topo;
995994
int i;
996995
bool found;
997-
orte_daemon_cmd_flag_t cmd = ORTE_DAEMON_REPORT_TOPOLOGY_CMD;
996+
orte_daemon_cmd_flag_t cmd;
998997

999998
/* get the daemon job, if necessary */
1000999
if (NULL == jdatorted) {
@@ -1054,8 +1053,6 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
10541053
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
10551054
ORTE_NAME_PRINT(&daemon->name), nodename));
10561055

1057-
node = daemon->node;
1058-
10591056
/* look this node up, if necessary */
10601057
if (!orte_plm_globals.daemon_nodes_assigned_at_launch) {
10611058
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
@@ -1067,21 +1064,11 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
10671064
free(daemon->node->name);
10681065
daemon->node->name = strdup(nodename);
10691066
/* mark that it was verified */
1070-
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_LOC_VERIFIED);
1071-
}
1072-
1073-
if (NULL == node) {
1074-
/* this shouldn't happen - it indicates an error in the
1075-
* prior node matching logic, so report it and error out
1076-
*/
1077-
orte_show_help("help-plm-base.txt", "daemon-no-assigned-node", true,
1078-
ORTE_NAME_PRINT(&daemon->name), nodename);
1079-
orted_failed_launch = true;
1080-
goto CLEANUP;
1067+
ORTE_FLAG_SET(daemon->node, ORTE_NODE_FLAG_LOC_VERIFIED);
10811068
}
10821069

10831070
/* mark the daemon as launched */
1084-
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED);
1071+
ORTE_FLAG_SET(daemon->node, ORTE_NODE_FLAG_DAEMON_LAUNCHED);
10851072

10861073
if (orte_retain_aliases) {
10871074
char *alias, **atmp=NULL;
@@ -1113,7 +1100,7 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
11131100
}
11141101
alias = opal_argv_join(atmp, ',');
11151102
opal_argv_free(atmp);
1116-
orte_set_attribute(&node->attributes, ORTE_NODE_ALIAS, ORTE_ATTR_LOCAL, alias, OPAL_STRING);
1103+
orte_set_attribute(&daemon->node->attributes, ORTE_NODE_ALIAS, ORTE_ATTR_LOCAL, alias, OPAL_STRING);
11171104
free(alias);
11181105
}
11191106

@@ -1130,7 +1117,7 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
11301117

11311118
/* rank=1 always sends its topology back */
11321119
topo = NULL;
1133-
if (1 == sender->vpid) {
1120+
if (1 == dname.vpid) {
11341121
idx=1;
11351122
if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &topo, &idx, OPAL_HWLOC_TOPO))) {
11361123
ORTE_ERROR_LOG(rc);
@@ -1151,7 +1138,7 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
11511138
"%s TOPOLOGY ALREADY RECORDED",
11521139
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
11531140
found = true;
1154-
node->topology = t;
1141+
daemon->node->topology = t;
11551142
if (NULL != topo) {
11561143
hwloc_topology_destroy(topo);
11571144
}
@@ -1167,12 +1154,18 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
11671154
t = OBJ_NEW(orte_topology_t);
11681155
t->sig = sig;
11691156
opal_pointer_array_add(orte_node_topologies, t);
1170-
node->topology = t;
1157+
daemon->node->topology = t;
11711158
if (NULL != topo) {
11721159
t->topo = topo;
11731160
} else {
1161+
/* nope - save the signature and request the complete topology from that node */
1162+
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1163+
"%s REQUESTING TOPOLOGY FROM %s",
1164+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1165+
ORTE_NAME_PRINT(&dname)));
11741166
/* construct the request */
11751167
relay = OBJ_NEW(opal_buffer_t);
1168+
cmd = ORTE_DAEMON_REPORT_TOPOLOGY_CMD;
11761169
if (OPAL_SUCCESS != (rc = opal_dss.pack(relay, &cmd, 1, ORTE_DAEMON_CMD))) {
11771170
ORTE_ERROR_LOG(rc);
11781171
OBJ_RELEASE(relay);
@@ -1181,7 +1174,7 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
11811174
}
11821175
/* send it */
11831176
orte_rml.send_buffer_nb(orte_mgmt_conduit,
1184-
sender, relay,
1177+
&dname, relay,
11851178
ORTE_RML_TAG_DAEMON,
11861179
orte_rml_send_callback, NULL);
11871180
/* we will count this node as completed

orte/orted/orted_main.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -761,8 +761,10 @@ int orte_daemon(int argc, char *argv[])
761761

762762
/* if we are rank=1, then send our topology back - otherwise, mpirun
763763
* will request it if necessary */
764-
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &opal_hwloc_topology, 1, OPAL_HWLOC_TOPO))) {
765-
ORTE_ERROR_LOG(ret);
764+
if (1 == ORTE_PROC_MY_NAME->vpid) {
765+
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &opal_hwloc_topology, 1, OPAL_HWLOC_TOPO))) {
766+
ORTE_ERROR_LOG(ret);
767+
}
766768
}
767769

768770
/* send to the HNP's callback - will be routed if routes are available */

orte/util/nidmap.c

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -516,9 +516,9 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer)
516516
/* decode a nodemap for a daemon */
517517
int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer)
518518
{
519-
int k, m, n, rc, start, endpt;
519+
int m, n, rc;
520520
orte_node_t *node;
521-
size_t num_nodes;
521+
size_t k, num_nodes, endpt;
522522
orte_job_t *daemons;
523523
orte_proc_t *dptr;
524524
char **nodes, *indices, *dvpids;
@@ -601,16 +601,16 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer)
601601
for (n=0; NULL != tmp[n]; n++) {
602602
/* convert the number - since it might be a range,
603603
* save the remainder pointer */
604-
nodeids[k] = strtoul(tmp[n], &rmndr, 10);
604+
nodeids[k++] = strtoul(tmp[n], &rmndr, 10);
605605
if (NULL != rmndr) {
606606
/* it must be a range - find the endpoint */
607607
++rmndr;
608+
m = nodeids[k-1] + 1;
608609
endpt = strtoul(rmndr, NULL, 10);
609-
start = nodeids[k] + 1;
610-
for (m=0; m < endpt; m++) {
611-
++k;
612-
nodeids[k] = start + m;
610+
while (k <= endpt && k < num_nodes) {
611+
nodeids[k++] = m++;
613612
}
613+
--k; // step back to compensate for later increment
614614
}
615615
++k;
616616
}
@@ -624,16 +624,16 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer)
624624
for (n=0; NULL != tmp[n]; n++) {
625625
/* convert the number - since it might be a range,
626626
* save the remainder pointer */
627-
dids[k] = strtoul(tmp[n], &rmndr, 10);
627+
dids[k++] = strtoul(tmp[n], &rmndr, 10);
628628
if (NULL != rmndr) {
629629
/* it must be a range - find the endpoint */
630630
++rmndr;
631631
endpt = strtoul(rmndr, NULL, 10);
632-
start = dids[k] + 1;
633-
for (m=0; m < endpt; m++) {
634-
++k;
635-
dids[k] = start + m;
632+
m = dids[k-1] + 1;
633+
while (k <= endpt && k < num_nodes) {
634+
dids[k++] = m++;
636635
}
636+
--k; // step back to compensate for later increment
637637
}
638638
++k;
639639
}

0 commit comments

Comments
 (0)