@@ -835,7 +835,7 @@ void orte_plm_base_daemon_topology(int status, orte_process_name_t* sender,
835
835
orte_job_t * jdata ;
836
836
837
837
OPAL_OUTPUT_VERBOSE ((5 , orte_plm_base_framework .framework_output ,
838
- "%s plm:base:daemon_topology for daemon %s" ,
838
+ "%s plm:base:daemon_topology recvd for daemon %s" ,
839
839
ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
840
840
ORTE_NAME_PRINT (sender )));
841
841
@@ -938,7 +938,7 @@ void orte_plm_base_daemon_topology(int status, orte_process_name_t* sender,
938
938
939
939
CLEANUP :
940
940
OPAL_OUTPUT_VERBOSE ((5 , orte_plm_base_framework .framework_output ,
941
- "%s plm:base:orted_report_launch %s for daemon %s" ,
941
+ "%s plm:base:orted:report_topo launch %s for daemon %s" ,
942
942
ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
943
943
orted_failed_launch ? "failed" : "completed" ,
944
944
ORTE_NAME_PRINT (sender )));
@@ -985,7 +985,6 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
985
985
char * rml_uri = NULL , * ptr ;
986
986
int rc , idx ;
987
987
orte_proc_t * daemon = NULL ;
988
- orte_node_t * node ;
989
988
orte_job_t * jdata ;
990
989
orte_process_name_t dname ;
991
990
opal_buffer_t * relay ;
@@ -994,7 +993,7 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
994
993
hwloc_topology_t topo ;
995
994
int i ;
996
995
bool found ;
997
- orte_daemon_cmd_flag_t cmd = ORTE_DAEMON_REPORT_TOPOLOGY_CMD ;
996
+ orte_daemon_cmd_flag_t cmd ;
998
997
999
998
/* get the daemon job, if necessary */
1000
999
if (NULL == jdatorted ) {
@@ -1054,8 +1053,6 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
1054
1053
ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
1055
1054
ORTE_NAME_PRINT (& daemon -> name ), nodename ));
1056
1055
1057
- node = daemon -> node ;
1058
-
1059
1056
/* look this node up, if necessary */
1060
1057
if (!orte_plm_globals .daemon_nodes_assigned_at_launch ) {
1061
1058
OPAL_OUTPUT_VERBOSE ((5 , orte_plm_base_framework .framework_output ,
@@ -1067,21 +1064,11 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
1067
1064
free (daemon -> node -> name );
1068
1065
daemon -> node -> name = strdup (nodename );
1069
1066
/* mark that it was verified */
1070
- ORTE_FLAG_SET (node , ORTE_NODE_FLAG_LOC_VERIFIED );
1071
- }
1072
-
1073
- if (NULL == node ) {
1074
- /* this shouldn't happen - it indicates an error in the
1075
- * prior node matching logic, so report it and error out
1076
- */
1077
- orte_show_help ("help-plm-base.txt" , "daemon-no-assigned-node" , true,
1078
- ORTE_NAME_PRINT (& daemon -> name ), nodename );
1079
- orted_failed_launch = true;
1080
- goto CLEANUP ;
1067
+ ORTE_FLAG_SET (daemon -> node , ORTE_NODE_FLAG_LOC_VERIFIED );
1081
1068
}
1082
1069
1083
1070
/* mark the daemon as launched */
1084
- ORTE_FLAG_SET (node , ORTE_NODE_FLAG_DAEMON_LAUNCHED );
1071
+ ORTE_FLAG_SET (daemon -> node , ORTE_NODE_FLAG_DAEMON_LAUNCHED );
1085
1072
1086
1073
if (orte_retain_aliases ) {
1087
1074
char * alias , * * atmp = NULL ;
@@ -1113,7 +1100,7 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
1113
1100
}
1114
1101
alias = opal_argv_join (atmp , ',' );
1115
1102
opal_argv_free (atmp );
1116
- orte_set_attribute (& node -> attributes , ORTE_NODE_ALIAS , ORTE_ATTR_LOCAL , alias , OPAL_STRING );
1103
+ orte_set_attribute (& daemon -> node -> attributes , ORTE_NODE_ALIAS , ORTE_ATTR_LOCAL , alias , OPAL_STRING );
1117
1104
free (alias );
1118
1105
}
1119
1106
@@ -1130,7 +1117,7 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
1130
1117
1131
1118
/* rank=1 always sends its topology back */
1132
1119
topo = NULL ;
1133
- if (1 == sender -> vpid ) {
1120
+ if (1 == dname . vpid ) {
1134
1121
idx = 1 ;
1135
1122
if (OPAL_SUCCESS != (rc = opal_dss .unpack (buffer , & topo , & idx , OPAL_HWLOC_TOPO ))) {
1136
1123
ORTE_ERROR_LOG (rc );
@@ -1151,7 +1138,7 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
1151
1138
"%s TOPOLOGY ALREADY RECORDED" ,
1152
1139
ORTE_NAME_PRINT (ORTE_PROC_MY_NAME )));
1153
1140
found = true;
1154
- node -> topology = t ;
1141
+ daemon -> node -> topology = t ;
1155
1142
if (NULL != topo ) {
1156
1143
hwloc_topology_destroy (topo );
1157
1144
}
@@ -1167,12 +1154,18 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
1167
1154
t = OBJ_NEW (orte_topology_t );
1168
1155
t -> sig = sig ;
1169
1156
opal_pointer_array_add (orte_node_topologies , t );
1170
- node -> topology = t ;
1157
+ daemon -> node -> topology = t ;
1171
1158
if (NULL != topo ) {
1172
1159
t -> topo = topo ;
1173
1160
} else {
1161
+ /* nope - save the signature and request the complete topology from that node */
1162
+ OPAL_OUTPUT_VERBOSE ((5 , orte_plm_base_framework .framework_output ,
1163
+ "%s REQUESTING TOPOLOGY FROM %s" ,
1164
+ ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
1165
+ ORTE_NAME_PRINT (& dname )));
1174
1166
/* construct the request */
1175
1167
relay = OBJ_NEW (opal_buffer_t );
1168
+ cmd = ORTE_DAEMON_REPORT_TOPOLOGY_CMD ;
1176
1169
if (OPAL_SUCCESS != (rc = opal_dss .pack (relay , & cmd , 1 , ORTE_DAEMON_CMD ))) {
1177
1170
ORTE_ERROR_LOG (rc );
1178
1171
OBJ_RELEASE (relay );
@@ -1181,7 +1174,7 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
1181
1174
}
1182
1175
/* send it */
1183
1176
orte_rml .send_buffer_nb (orte_mgmt_conduit ,
1184
- sender , relay ,
1177
+ & dname , relay ,
1185
1178
ORTE_RML_TAG_DAEMON ,
1186
1179
orte_rml_send_callback , NULL );
1187
1180
/* we will count this node as completed
0 commit comments