Skip to content

Commit 919d7fc

Browse files
author
Ralph Castain
committed
We cannot use OFI to determine when daemons can finalize as we don't see the "sockets" go away. So always use the OOB for the mgmt conduit - this provides the necessary termination signal AND ensures that IOF and other mgmt messages go solely across TCP.
Cleanup the way we look for matching OFI addresses by using the opal_net_samenetwork helper function. This now works for multi-network environments, but only using the socket provider Signed-off-by: Ralph Castain <[email protected]>
1 parent f038fe6 commit 919d7fc

File tree

6 files changed

+134
-125
lines changed

6 files changed

+134
-125
lines changed

orte/mca/errmgr/default_hnp/errmgr_default_hnp.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,9 +136,17 @@ static void hnp_abort(int error_code, char *fmt, ...)
136136
char *outmsg = NULL;
137137
orte_timer_t *timer;
138138

139+
/* only do this once */
140+
if (orte_abnormal_term_ordered) {
141+
return;
142+
}
143+
139144
/* ensure we exit with non-zero status */
140145
ORTE_UPDATE_EXIT_STATUS(error_code);
141146

147+
/* set the aborting flag */
148+
orte_abnormal_term_ordered = true;
149+
142150
/* If there was a message, construct it */
143151
va_start(arglist, fmt);
144152
if (NULL != fmt) {

orte/mca/rml/base/base.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -202,9 +202,9 @@ OBJ_CLASS_DECLARATION(orte_self_send_xfer_t);
202202
do { \
203203
orte_rml_recv_t *msg; \
204204
opal_output_verbose(5, orte_rml_base_framework.framework_output, \
205-
"%s Message posted at %s:%d", \
205+
"%s Message posted at %s:%d for tag %d", \
206206
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
207-
__FILE__, __LINE__); \
207+
__FILE__, __LINE__, (t)); \
208208
msg = OBJ_NEW(orte_rml_recv_t); \
209209
msg->sender.jobid = (p)->jobid; \
210210
msg->sender.vpid = (p)->vpid; \

orte/mca/rml/ofi/rml_ofi_component.c

Lines changed: 66 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
#include "opal/mca/base/base.h"
1414
#include "opal/util/argv.h"
15+
#include "opal/util/net.h"
1516
#include "opal/util/output.h"
1617
#include "opal/mca/backtrace/backtrace.h"
1718
#include "opal/mca/event/event.h"
@@ -85,6 +86,7 @@ orte_rml_ofi_module_t orte_rml_ofi = {
8586
/* Local variables */
8687
static bool init_done = false;
8788
static char *ofi_transports_supported = NULL;
89+
static bool ofi_desired = false;
8890

8991
static int
9092
rml_ofi_component_open(void)
@@ -98,6 +100,7 @@ rml_ofi_component_open(void)
98100
orte_rml_ofi.ofi_prov_open_num = 0;
99101
OBJ_CONSTRUCT(&orte_rml_ofi.peers, opal_hash_table_t);
100102
opal_hash_table_init(&orte_rml_ofi.peers, 128);
103+
OBJ_CONSTRUCT(&orte_rml_ofi.recv_msg_queue_list, opal_list_t);
101104

102105
for( uint8_t ofi_prov_id=0; ofi_prov_id < MAX_OFI_PROVIDERS ; ofi_prov_id++) {
103106
orte_rml_ofi.ofi_prov[ofi_prov_id].fabric = NULL;
@@ -116,6 +119,12 @@ rml_ofi_component_open(void)
116119

117120
opal_output_verbose(10,orte_rml_base_framework.framework_output," from %s:%d rml_ofi_component_open()",__FILE__,__LINE__);
118121

122+
if (!ORTE_PROC_IS_HNP && !ORTE_PROC_IS_DAEMON) {
123+
return ORTE_ERROR;
124+
}
125+
if (!ofi_desired) {
126+
return ORTE_ERROR;
127+
}
119128
return ORTE_SUCCESS;
120129
}
121130

@@ -218,7 +227,7 @@ rml_ofi_component_close(void)
218227
(void **)&value, &node);
219228
while (OPAL_SUCCESS == rc) {
220229
if (NULL != value) {
221-
OBJ_RELEASE(value);
230+
OBJ_RELEASE(value);
222231
}
223232
rc = opal_hash_table_get_next_key_uint64 (&orte_rml_ofi.peers, &key,
224233
(void **) &value, node, &node);
@@ -242,7 +251,16 @@ static int rml_ofi_component_register(void)
242251
OPAL_INFO_LVL_2,
243252
MCA_BASE_VAR_SCOPE_LOCAL,
244253
&ofi_transports_supported);
245-
opal_output(0, "OFI TRANSPORTS %s", ofi_transports_supported);
254+
255+
256+
ofi_desired = false;
257+
mca_base_component_var_register(component, "desired",
258+
"Use OFI for coll conduit",
259+
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
260+
OPAL_INFO_LVL_2,
261+
MCA_BASE_VAR_SCOPE_LOCAL,
262+
&ofi_desired);
263+
246264
return ORTE_SUCCESS;
247265
}
248266

@@ -982,7 +1000,6 @@ static orte_rml_base_module_t* open_conduit(opal_list_t *attributes)
9821000
"%s - Entering rml_ofi_open_conduit()",
9831001
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
9841002

985-
9861003
/* Open all ofi endpoints */
9871004
if (!init_done) {
9881005
rml_ofi_component_init();
@@ -1135,6 +1152,12 @@ static void ofi_set_contact_info (const char *uri)
11351152
return;
11361153
}
11371154

1155+
/* Open all ofi endpoints */
1156+
if (!init_done) {
1157+
rml_ofi_component_init();
1158+
init_done = true;
1159+
}
1160+
11381161
uris = strdup(uri);
11391162
process_uri(uris);
11401163
free(uris);
@@ -1146,10 +1169,10 @@ static void process_uri( char *uri)
11461169
orte_process_name_t peer;
11471170
char *cptr, *ofiuri;
11481171
char **uris=NULL;
1149-
int rc, i=0, tot_reqd = 1, tot_found = 0;
1172+
int rc, i=0, cur_ofi_prov;
11501173
uint64_t ui64;
11511174
orte_rml_ofi_peer_t *pr;
1152-
struct sockaddr_in* ep_sockaddr;
1175+
struct sockaddr_in *ep_sockaddr, *ep_sockaddr2;
11531176

11541177
/* find the first semi-colon in the string */
11551178
cptr = strchr(uri, ';');
@@ -1176,27 +1199,21 @@ static void process_uri( char *uri)
11761199
"%s:OFI set_contact_info peer %s is me",
11771200
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
11781201
ORTE_NAME_PRINT(&peer));
1179-
//skip adding to hashtable for HNP
1180-
if (!ORTE_PROC_IS_HNP) {
1181-
return;
1182-
} else {
1183-
opal_output_verbose(15, orte_rml_base_framework.framework_output,
1184-
"%s:OFI set_contact_info - HNP process so proceeding to add to hashtable",
1185-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) );
1186-
}
1202+
return;
11871203
}
11881204

11891205
/* split the rest of the uri into component parts */
11901206
uris = opal_argv_split(cptr, ';');
11911207

11921208
/* get the peer object for this process */
11931209
memcpy(&ui64, (char*)&peer, sizeof(uint64_t));
1194-
if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&orte_rml_ofi.peers,
1195-
ui64, (void**)&pr) ||
1210+
pr = NULL;
1211+
if (OPAL_SUCCESS != (rc = opal_hash_table_get_value_uint64(&orte_rml_ofi.peers,
1212+
ui64, (void**)&pr)) ||
11961213
NULL == pr) {
11971214
pr = OBJ_NEW(orte_rml_ofi_peer_t);
11981215
/* populate the peer object with the ofi addresses */
1199-
for(i=0; NULL != uris[i] && tot_found < tot_reqd; i++) {
1216+
for(i=0; NULL != uris[i]; i++) {
12001217
ofiuri = strdup(uris[i]);
12011218
if (NULL == ofiuri) {
12021219
opal_output_verbose(2, orte_rml_base_framework.framework_output,
@@ -1211,35 +1228,43 @@ static void process_uri( char *uri)
12111228
ep_sockaddr = malloc( sizeof ( struct sockaddr_in) );
12121229
/* ofiuri for socket provider is of format - ofi-socket:<sin_family,sin_addr,sin_port> */
12131230
convert_to_sockaddr(ofiuri, ep_sockaddr);
1214-
pr->ofi_ep = (void *)ep_sockaddr;
1215-
tot_found++;
1231+
/* see if we have this subnet in our providers - we take
1232+
* the first one that matches (other than loopback) */
1233+
for( cur_ofi_prov=0; cur_ofi_prov < orte_rml_ofi.ofi_prov_open_num ; cur_ofi_prov++ ) {
1234+
ep_sockaddr2 = (struct sockaddr_in*)orte_rml_ofi.ofi_prov[cur_ofi_prov].ep_name;
1235+
if (opal_net_samenetwork((struct sockaddr*)ep_sockaddr, (struct sockaddr*)ep_sockaddr2, 24)) {
1236+
pr->ofi_ep = (void *)ep_sockaddr;
1237+
if (OPAL_SUCCESS !=
1238+
(rc = opal_hash_table_set_value_uint64(&orte_rml_ofi.peers, ui64, (void*)pr))) {
1239+
opal_output_verbose(15, orte_rml_base_framework.framework_output,
1240+
"%s: ofi peer address insertion failed for peer %s ",
1241+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1242+
ORTE_NAME_PRINT(&peer));
1243+
ORTE_ERROR_LOG(rc);
1244+
}
1245+
opal_output_verbose(15, orte_rml_base_framework.framework_output,
1246+
"%s: ofi peer address inserted for peer %s ",
1247+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1248+
ORTE_NAME_PRINT(&peer));
1249+
opal_output_verbose(15, orte_rml_base_framework.framework_output,
1250+
"%s: ofi sock address length = %zd ",
1251+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1252+
pr->ofi_ep_len);
1253+
struct sockaddr_in* ep_sockaddr = (struct sockaddr_in*)pr->ofi_ep;
1254+
opal_output_verbose(15,orte_rml_base_framework.framework_output,
1255+
"%s OFI set_name() port = 0x%x, InternetAddr = %s ",
1256+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1257+
ntohs(ep_sockaddr->sin_port),
1258+
inet_ntoa(ep_sockaddr->sin_addr));
1259+
opal_argv_free(uris);
1260+
return;
1261+
}
1262+
}
12161263
}
12171264
free( ofiuri);
12181265
}
1219-
/* if atleast one OFI address is known for peer insert it */
1220-
if( 1 <= tot_found ) {
1221-
if (OPAL_SUCCESS !=
1222-
(rc = opal_hash_table_set_value_uint64(&orte_rml_ofi.peers, ui64, (void*)pr))) {
1223-
opal_output_verbose(15, orte_rml_base_framework.framework_output,
1224-
"%s: ofi peer address insertion failed for peer %s ",
1225-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1226-
ORTE_NAME_PRINT(&peer));
1227-
ORTE_ERROR_LOG(rc);
1228-
}
1229-
opal_output_verbose(15, orte_rml_base_framework.framework_output,
1230-
"%s: ofi peer address inserted for peer %s ",
1231-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1232-
ORTE_NAME_PRINT(&peer));
1233-
opal_output_verbose(15, orte_rml_base_framework.framework_output,
1234-
"%s: ofi sock address length = %zd ",
1235-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1236-
pr->ofi_ep_len);
1237-
struct sockaddr_in* ep_sockaddr = (struct sockaddr_in*)pr->ofi_ep;
1238-
opal_output_verbose(15,orte_rml_base_framework.framework_output,
1239-
"%s OFI set_name() port = 0x%x, InternetAddr = %s ",
1240-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),ntohs(ep_sockaddr->sin_port),inet_ntoa(ep_sockaddr->sin_addr));
1241-
}
12421266
}
1267+
12431268
opal_output_verbose(10,orte_rml_base_framework.framework_output,
12441269
"%s OFI end of set_contact_info()",
12451270
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

orte/mca/rml/ofi/rml_ofi_send.c

Lines changed: 47 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -376,17 +376,13 @@ static void send_msg(int fd, short args, void *cbdata)
376376
uint32_t total_packets;
377377
fi_addr_t dest_fi_addr;
378378
orte_rml_send_t *snd;
379-
orte_rml_recv_t *rcv;
380-
orte_self_send_xfer_t *xfer;
381379
orte_rml_ofi_request_t* ofi_send_req = OBJ_NEW( orte_rml_ofi_request_t );
382380
uint8_t ofi_prov_id = req->ofi_prov_id;
383381
orte_rml_ofi_send_pkt_t* ofi_msg_pkt;
384382
size_t datalen_per_pkt, hdrsize, data_in_pkt; // the length of data in per packet excluding the header size
385383
orte_rml_ofi_peer_t* pr;
386384
uint64_t ui64;
387385
struct sockaddr_in* ep_sockaddr;
388-
int i, bytes;
389-
char *ptr;
390386

391387
snd = OBJ_NEW(orte_rml_send_t);
392388
snd->dst = *peer;
@@ -408,85 +404,59 @@ static void send_msg(int fd, short args, void *cbdata)
408404
ORTE_NAME_PRINT(peer), tag);
409405

410406

411-
/* get the peer address by doing modex_receive */
407+
/* get the peer address from our internal hash table */
408+
opal_output_verbose(1, orte_rml_base_framework.framework_output,
409+
"%s getting contact info for DAEMON peer %s from internal hash table",
410+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(peer));
411+
memcpy(&ui64, (char*)peer, sizeof(uint64_t));
412+
if (OPAL_SUCCESS != (ret = opal_hash_table_get_value_uint64(&orte_rml_ofi.peers,
413+
ui64, (void**)&pr) || NULL == pr)) {
414+
opal_output_verbose(1, orte_rml_base_framework.framework_output,
415+
"%s rml:ofi: Send failed to get peer OFI contact info ",
416+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
417+
snd->status = ORTE_ERR_ADDRESSEE_UNKNOWN;
418+
ORTE_RML_SEND_COMPLETE(snd);
419+
//OBJ_RELEASE( ofi_send_req);
420+
return;
421+
}
422+
opal_output_verbose(1, orte_rml_base_framework.framework_output,
423+
"%s rml:ofi: OFI peer contact info got from hash table",
424+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
425+
dest_ep_name = pr->ofi_ep;
426+
dest_ep_namelen = pr->ofi_ep_len;
427+
428+
//[Debug] printing additional info of IP
429+
switch ( orte_rml_ofi.ofi_prov[ofi_prov_id].fabric_info->addr_format)
430+
{
431+
case FI_SOCKADDR_IN :
432+
/* Address is of type sockaddr_in (IPv4) */
433+
/*[debug] - print the sockaddr - port and s_addr */
434+
ep_sockaddr = (struct sockaddr_in*)dest_ep_name;
435+
opal_output_verbose(1,orte_rml_base_framework.framework_output,
436+
"%s peer %s epnamelen is %lu, port = %d (or) 0x%x, InternetAddr = 0x%s ",
437+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),ORTE_NAME_PRINT(peer),
438+
(unsigned long)orte_rml_ofi.ofi_prov[ofi_prov_id].epnamelen,ntohs(ep_sockaddr->sin_port),
439+
ntohs(ep_sockaddr->sin_port),inet_ntoa(ep_sockaddr->sin_addr));
440+
/*[end debug]*/
441+
break;
442+
}
443+
//[Debug] end debug
412444
opal_output_verbose(10, orte_rml_base_framework.framework_output,
413-
"%s calling OPAL_MODEX_RECV_STRING ", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) );
414-
if (ORTE_PROC_IS_APP ) {
415-
asprintf(&pmix_key,"%s%d",orte_rml_ofi.ofi_prov[ofi_prov_id].fabric_info->fabric_attr->prov_name,ofi_prov_id);
416-
opal_output_verbose(1, orte_rml_base_framework.framework_output,
417-
"%s calling OPAL_MODEX_RECV_STRING for ORTE_PROC_APP peer - %s, key - %s ",
418-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(peer),pmix_key );
419-
OPAL_MODEX_RECV_STRING(ret, pmix_key, peer , (uint8_t **) &dest_ep_name, &dest_ep_namelen);
420-
opal_output_verbose(10, orte_rml_base_framework.framework_output, "Returned from MODEX_RECV");
421-
opal_output_verbose(50, orte_rml_base_framework.framework_output,
422-
"%s Return value from OPAL_MODEX_RECV_STRING - %d, length returned - %lu",
423-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ret, dest_ep_namelen);
424-
free(pmix_key);
425-
} else {
445+
"%s OPAL_MODEX_RECV succeded, %s peer ep name obtained. length=%lu",
446+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
447+
ORTE_NAME_PRINT(peer), dest_ep_namelen);
448+
ret = fi_av_insert(orte_rml_ofi.ofi_prov[ofi_prov_id].av, dest_ep_name,1,&dest_fi_addr,0,NULL);
449+
if( ret != 1) {
426450
opal_output_verbose(1, orte_rml_base_framework.framework_output,
427-
"%s calling OPAL_MODEX_RECV_STRING for DAEMON peer %s",
428-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(peer));
429-
memcpy(&ui64, (char*)peer, sizeof(uint64_t));
430-
if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&orte_rml_ofi.peers,
431-
ui64, (void**)&pr) || NULL == pr) {
432-
opal_output_verbose(1, orte_rml_base_framework.framework_output,
433-
"%s rml:ofi: Send failed to get peer OFI contact info ",
434-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
435-
return;
436-
}
437-
opal_output_verbose(1, orte_rml_base_framework.framework_output,
438-
"%s rml:ofi: OFI peer contact info got from hash table",
439-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
440-
dest_ep_name = pr->ofi_ep;
441-
dest_ep_namelen = pr->ofi_ep_len;
442-
ret = OPAL_SUCCESS;
443-
}
444-
if ( OPAL_SUCCESS == ret) {
445-
//[Debug] printing additional info of IP
446-
switch ( orte_rml_ofi.ofi_prov[ofi_prov_id].fabric_info->addr_format)
447-
{
448-
case FI_SOCKADDR_IN :
449-
/* Address is of type sockaddr_in (IPv4) */
450-
/*[debug] - print the sockaddr - port and s_addr */
451-
ep_sockaddr = (struct sockaddr_in*)dest_ep_name;
452-
opal_output_verbose(1,orte_rml_base_framework.framework_output,
453-
"%s peer %s epnamelen is %d, port = %d (or) 0x%x, InternetAddr = 0x%s ",
454-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),ORTE_NAME_PRINT(peer),
455-
orte_rml_ofi.ofi_prov[ofi_prov_id].epnamelen,ntohs(ep_sockaddr->sin_port),
456-
ntohs(ep_sockaddr->sin_port),inet_ntoa(ep_sockaddr->sin_addr));
457-
/*[end debug]*/
458-
break;
459-
}
460-
//[Debug] end debug
461-
opal_output_verbose(10, orte_rml_base_framework.framework_output,
462-
"%s OPAL_MODEX_RECV succeded, %s peer ep name obtained. length=%lu",
463-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
464-
ORTE_NAME_PRINT(peer), dest_ep_namelen);
465-
ret = fi_av_insert(orte_rml_ofi.ofi_prov[ofi_prov_id].av, dest_ep_name,1,&dest_fi_addr,0,NULL);
466-
if( ret != 1) {
467-
opal_output_verbose(1, orte_rml_base_framework.framework_output,
468-
"%s fi_av_insert failed in send_msg() returned %d",
469-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),ret );
470-
/* call the send-callback fn with error and return, also return failure status */
471-
snd->status = ORTE_ERR_ADDRESSEE_UNKNOWN;
451+
"%s fi_av_insert failed in send_msg() returned %d",
452+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),ret );
453+
/* call the send-callback fn with error and return, also return failure status */
454+
snd->status = ORTE_ERR_ADDRESSEE_UNKNOWN;
472455

473-
ORTE_RML_SEND_COMPLETE(snd);
474-
475-
return;
476-
}
477-
} else {
478-
479-
opal_output_verbose(1, orte_rml_base_framework.framework_output,
480-
"%s OPAL_MODEX_RECV failed to obtain %s peer ep name ",
481-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
482-
ORTE_NAME_PRINT(peer));
483-
/* call the send-callback fn with error and return, also return failure status */
484-
snd->status = ORTE_ERR_ADDRESSEE_UNKNOWN;
485456
ORTE_RML_SEND_COMPLETE(snd);
486-
//OBJ_RELEASE( ofi_send_req);
457+
487458
return;
488459
}
489-
490460
ofi_send_req->send = snd;
491461
ofi_send_req->completion_count = 1;
492462

@@ -625,7 +595,6 @@ int orte_rml_ofi_send_nb(struct orte_rml_base_module_t* mod,
625595
void* cbdata)
626596
{
627597
orte_rml_recv_t *rcv;
628-
orte_rml_send_t *snd;
629598
int bytes;
630599
orte_self_send_xfer_t *xfer;
631600
int i;
@@ -749,7 +718,6 @@ int orte_rml_ofi_send_buffer_nb(struct orte_rml_base_module_t *mod,
749718
void* cbdata)
750719
{
751720
orte_rml_recv_t *rcv;
752-
orte_rml_send_t *snd;
753721
orte_self_send_xfer_t *xfer;
754722
ofi_send_request_t *req;
755723
orte_rml_ofi_module_t *ofi_mod = (orte_rml_ofi_module_t*)mod;

0 commit comments

Comments
 (0)