Skip to content

Commit 399de07

Browse files
author
Ralph Castain
committed
Cleanup launch
Given that we only set OOB contact info from inside of events, or before we begin threaded operations (e.g., in the ess), allow set_contact_info to directly update the oob/base framework globals. Correct the nidmap regex decompression routine. Ensure that rank=1 daemon always sends back its topology as this is the most common use-case.

Signed-off-by: Ralph Castain <[email protected]>
1 parent 6c66871 commit 399de07

File tree

5 files changed

+37
-55
lines changed

5 files changed

+37
-55
lines changed

orte/mca/oob/base/base.h

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ ORTE_DECLSPEC void orte_oob_base_send_nb(int fd, short args, void *cbdata);
126126
orte_oob_base_send_nb, cd); \
127127
opal_event_set_priority(&cd->ev, ORTE_MSG_PRI); \
128128
opal_event_active(&cd->ev, OPAL_EV_WRITE, 1); \
129-
}while(0);
129+
}while(0)
130130

131131
/* Our contact info is actually subject to change as transports
132132
* can fail at any time. So a request to obtain our URI requires
@@ -175,12 +175,9 @@ OBJ_CLASS_DECLARATION(mca_oob_uri_req_t);
175175
mca_oob_uri_req_t *rq; \
176176
rq = OBJ_NEW(mca_oob_uri_req_t); \
177177
rq->uri = strdup((u)); \
178-
opal_event_set(orte_oob_base.ev_base, &(rq)->ev, -1, \
179-
OPAL_EV_WRITE, \
180-
orte_oob_base_set_addr, (rq)); \
181-
opal_event_set_priority(&(rq)->ev, ORTE_MSG_PRI); \
182-
opal_event_active(&(rq)->ev, OPAL_EV_WRITE, 1); \
183-
}while(0);
178+
orte_oob_base_set_addr(0, 0, (void*)rq); \
179+
}while(0)
180+
184181
ORTE_DECLSPEC void orte_oob_base_set_addr(int fd, short args, void *cbdata);
185182

186183

orte/mca/oob/base/oob_base_frame.c

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -109,12 +109,6 @@ static int orte_oob_base_close(void)
109109

110110
OBJ_DESTRUCT(&orte_oob_base.peers);
111111

112-
if (ORTE_PROC_IS_APP || ORTE_PROC_IS_TOOL) {
113-
opal_progress_thread_finalize(NULL);
114-
} else {
115-
opal_progress_thread_finalize("OOB-BASE");
116-
}
117-
118112
OPAL_TIMING_EVENT((&tm_oob, "Finish"));
119113
OPAL_TIMING_REPORT(orte_oob_base.timing, &tm_oob);
120114

@@ -133,11 +127,7 @@ static int orte_oob_base_open(mca_base_open_flag_t flags)
133127
opal_hash_table_init(&orte_oob_base.peers, 128);
134128
OBJ_CONSTRUCT(&orte_oob_base.actives, opal_list_t);
135129

136-
if (ORTE_PROC_IS_APP || ORTE_PROC_IS_TOOL) {
137-
orte_oob_base.ev_base = opal_progress_thread_init(NULL);
138-
} else {
139-
orte_oob_base.ev_base = opal_progress_thread_init("OOB-BASE");
140-
}
130+
orte_oob_base.ev_base = orte_event_base;
141131

142132

143133
#if OPAL_ENABLE_FT_CR == 1

orte/mca/plm/base/plm_base_launch_support.c

Lines changed: 16 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -835,7 +835,7 @@ void orte_plm_base_daemon_topology(int status, orte_process_name_t* sender,
835835
orte_job_t *jdata;
836836

837837
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
838-
"%s plm:base:daemon_topology for daemon %s",
838+
"%s plm:base:daemon_topology recvd for daemon %s",
839839
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
840840
ORTE_NAME_PRINT(sender)));
841841

@@ -938,7 +938,7 @@ void orte_plm_base_daemon_topology(int status, orte_process_name_t* sender,
938938

939939
CLEANUP:
940940
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
941-
"%s plm:base:orted_report_launch %s for daemon %s",
941+
"%s plm:base:orted:report_topo launch %s for daemon %s",
942942
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
943943
orted_failed_launch ? "failed" : "completed",
944944
ORTE_NAME_PRINT(sender)));
@@ -985,7 +985,6 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
985985
char *rml_uri = NULL, *ptr;
986986
int rc, idx;
987987
orte_proc_t *daemon=NULL;
988-
orte_node_t *node;
989988
orte_job_t *jdata;
990989
orte_process_name_t dname;
991990
opal_buffer_t *relay;
@@ -994,7 +993,7 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
994993
hwloc_topology_t topo;
995994
int i;
996995
bool found;
997-
orte_daemon_cmd_flag_t cmd = ORTE_DAEMON_REPORT_TOPOLOGY_CMD;
996+
orte_daemon_cmd_flag_t cmd;
998997

999998
/* get the daemon job, if necessary */
1000999
if (NULL == jdatorted) {
@@ -1054,8 +1053,6 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
10541053
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
10551054
ORTE_NAME_PRINT(&daemon->name), nodename));
10561055

1057-
node = daemon->node;
1058-
10591056
/* look this node up, if necessary */
10601057
if (!orte_plm_globals.daemon_nodes_assigned_at_launch) {
10611058
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
@@ -1067,21 +1064,11 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
10671064
free(daemon->node->name);
10681065
daemon->node->name = strdup(nodename);
10691066
/* mark that it was verified */
1070-
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_LOC_VERIFIED);
1071-
}
1072-
1073-
if (NULL == node) {
1074-
/* this shouldn't happen - it indicates an error in the
1075-
* prior node matching logic, so report it and error out
1076-
*/
1077-
orte_show_help("help-plm-base.txt", "daemon-no-assigned-node", true,
1078-
ORTE_NAME_PRINT(&daemon->name), nodename);
1079-
orted_failed_launch = true;
1080-
goto CLEANUP;
1067+
ORTE_FLAG_SET(daemon->node, ORTE_NODE_FLAG_LOC_VERIFIED);
10811068
}
10821069

10831070
/* mark the daemon as launched */
1084-
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED);
1071+
ORTE_FLAG_SET(daemon->node, ORTE_NODE_FLAG_DAEMON_LAUNCHED);
10851072

10861073
if (orte_retain_aliases) {
10871074
char *alias, **atmp=NULL;
@@ -1113,7 +1100,7 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
11131100
}
11141101
alias = opal_argv_join(atmp, ',');
11151102
opal_argv_free(atmp);
1116-
orte_set_attribute(&node->attributes, ORTE_NODE_ALIAS, ORTE_ATTR_LOCAL, alias, OPAL_STRING);
1103+
orte_set_attribute(&daemon->node->attributes, ORTE_NODE_ALIAS, ORTE_ATTR_LOCAL, alias, OPAL_STRING);
11171104
free(alias);
11181105
}
11191106

@@ -1130,7 +1117,7 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
11301117

11311118
/* rank=1 always sends its topology back */
11321119
topo = NULL;
1133-
if (1 == sender->vpid) {
1120+
if (1 == dname.vpid) {
11341121
idx=1;
11351122
if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &topo, &idx, OPAL_HWLOC_TOPO))) {
11361123
ORTE_ERROR_LOG(rc);
@@ -1151,7 +1138,7 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
11511138
"%s TOPOLOGY ALREADY RECORDED",
11521139
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
11531140
found = true;
1154-
node->topology = t;
1141+
daemon->node->topology = t;
11551142
if (NULL != topo) {
11561143
hwloc_topology_destroy(topo);
11571144
}
@@ -1167,12 +1154,18 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
11671154
t = OBJ_NEW(orte_topology_t);
11681155
t->sig = sig;
11691156
opal_pointer_array_add(orte_node_topologies, t);
1170-
node->topology = t;
1157+
daemon->node->topology = t;
11711158
if (NULL != topo) {
11721159
t->topo = topo;
11731160
} else {
1161+
/* nope - save the signature and request the complete topology from that node */
1162+
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1163+
"%s REQUESTING TOPOLOGY FROM %s",
1164+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1165+
ORTE_NAME_PRINT(&dname)));
11741166
/* construct the request */
11751167
relay = OBJ_NEW(opal_buffer_t);
1168+
cmd = ORTE_DAEMON_REPORT_TOPOLOGY_CMD;
11761169
if (OPAL_SUCCESS != (rc = opal_dss.pack(relay, &cmd, 1, ORTE_DAEMON_CMD))) {
11771170
ORTE_ERROR_LOG(rc);
11781171
OBJ_RELEASE(relay);
@@ -1181,7 +1174,7 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
11811174
}
11821175
/* send it */
11831176
orte_rml.send_buffer_nb(orte_mgmt_conduit,
1184-
sender, relay,
1177+
&dname, relay,
11851178
ORTE_RML_TAG_DAEMON,
11861179
orte_rml_send_callback, NULL);
11871180
/* we will count this node as completed

orte/orted/orted_main.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -761,8 +761,10 @@ int orte_daemon(int argc, char *argv[])
761761

762762
/* if we are rank=1, then send our topology back - otherwise, mpirun
763763
* will request it if necessary */
764-
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &opal_hwloc_topology, 1, OPAL_HWLOC_TOPO))) {
765-
ORTE_ERROR_LOG(ret);
764+
if (1 == ORTE_PROC_MY_NAME->vpid) {
765+
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &opal_hwloc_topology, 1, OPAL_HWLOC_TOPO))) {
766+
ORTE_ERROR_LOG(ret);
767+
}
766768
}
767769

768770
/* send to the HNP's callback - will be routed if routes are available */

orte/util/nidmap.c

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -516,9 +516,9 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer)
516516
/* decode a nodemap for a daemon */
517517
int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer)
518518
{
519-
int k, m, n, rc, start, endpt;
519+
int m, n, rc;
520520
orte_node_t *node;
521-
size_t num_nodes;
521+
size_t k, num_nodes, endpt;
522522
orte_job_t *daemons;
523523
orte_proc_t *dptr;
524524
char **nodes, *indices, *dvpids;
@@ -601,16 +601,16 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer)
601601
for (n=0; NULL != tmp[n]; n++) {
602602
/* convert the number - since it might be a range,
603603
* save the remainder pointer */
604-
nodeids[k] = strtoul(tmp[n], &rmndr, 10);
604+
nodeids[k++] = strtoul(tmp[n], &rmndr, 10);
605605
if (NULL != rmndr) {
606606
/* it must be a range - find the endpoint */
607607
++rmndr;
608+
m = nodeids[k-1] + 1;
608609
endpt = strtoul(rmndr, NULL, 10);
609-
start = nodeids[k] + 1;
610-
for (m=0; m < endpt; m++) {
611-
++k;
612-
nodeids[k] = start + m;
610+
while (k <= endpt && k < num_nodes) {
611+
nodeids[k++] = m++;
613612
}
613+
--k; // step back to compensate for later increment
614614
}
615615
++k;
616616
}
@@ -624,16 +624,16 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer)
624624
for (n=0; NULL != tmp[n]; n++) {
625625
/* convert the number - since it might be a range,
626626
* save the remainder pointer */
627-
dids[k] = strtoul(tmp[n], &rmndr, 10);
627+
dids[k++] = strtoul(tmp[n], &rmndr, 10);
628628
if (NULL != rmndr) {
629629
/* it must be a range - find the endpoint */
630630
++rmndr;
631631
endpt = strtoul(rmndr, NULL, 10);
632-
start = dids[k] + 1;
633-
for (m=0; m < endpt; m++) {
634-
++k;
635-
dids[k] = start + m;
632+
m = dids[k-1] + 1;
633+
while (k <= endpt && k < num_nodes) {
634+
dids[k++] = m++;
636635
}
636+
--k; // step back to compensate for later increment
637637
}
638638
++k;
639639
}

0 commit comments

Comments
 (0)