Skip to content

Commit 51b4078

Browse files
author
Ralph Castain
authored
Merge pull request #3648 from rhc54/topic/ofi
Clean up the conduit open code so we return detectable errors when co…
2 parents 68a2268 + e884cbf commit 51b4078

File tree

9 files changed

+81
-65
lines changed

9 files changed

+81
-65
lines changed

orte/include/orte/constants.h

Lines changed: 7 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
* All rights reserved.
1212
* Copyright (c) 2014 Research Organization for Information Science
1313
* and Technology (RIST). All rights reserved.
14-
* Copyright (c) 2015 Intel, Inc. All rights reserved.
14+
* Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
1515
* $COPYRIGHT$
1616
*
1717
* Additional copyrights may follow
@@ -142,25 +142,16 @@ enum {
142142
ORTE_ERR_ALLOCATION_PENDING = (ORTE_ERR_BASE - 43),
143143
ORTE_ERR_NO_PATH_TO_TARGET = (ORTE_ERR_BASE - 44),
144144
ORTE_ERR_OP_IN_PROGRESS = (ORTE_ERR_BASE - 45),
145-
ORTE_ERR_OPEN_CHANNEL_PEER_FAIL = (ORTE_ERR_BASE - 46),
146-
ORTE_ERR_OPEN_CHANNEL_PEER_REJECT = (ORTE_ERR_BASE - 47),
147-
ORTE_ERR_QOS_TYPE_UNSUPPORTED = (ORTE_ERR_BASE - 48),
148-
ORTE_ERR_QOS_ACK_WINDOW_FULL = (ORTE_ERR_BASE - 49),
149-
ORTE_ERR_ACK_TIMEOUT_SENDER = (ORTE_ERR_BASE - 50),
150-
ORTE_ERR_ACK_TIMEOUT_RECEIVER = (ORTE_ERR_BASE - 51),
151-
ORTE_ERR_LOST_MSG_IN_WINDOW = (ORTE_ERR_BASE - 52),
152-
ORTE_ERR_CHANNEL_BUSY = (ORTE_ERR_BASE - 53),
153-
ORTE_ERR_DUPLICATE_MSG = (ORTE_ERR_BASE - 54),
154-
ORTE_ERR_OUT_OF_ORDER_MSG = (ORTE_ERR_BASE - 55),
155-
ORTE_ERR_OPEN_CHANNEL_DUPLICATE = (ORTE_ERR_BASE - 56),
156-
ORTE_ERR_FORCE_SELECT = (ORTE_ERR_BASE - 57),
157-
ORTE_ERR_JOB_CANCELLED = (ORTE_ERR_BASE - 58),
158-
ORTE_ERR_CONDUIT_SEND_FAIL = (ORTE_ERR_BASE - 59)
145+
ORTE_ERR_OPEN_CONDUIT_FAIL = (ORTE_ERR_BASE - 46),
146+
ORTE_ERR_DUPLICATE_MSG = (ORTE_ERR_BASE - 47),
147+
ORTE_ERR_OUT_OF_ORDER_MSG = (ORTE_ERR_BASE - 48),
148+
ORTE_ERR_FORCE_SELECT = (ORTE_ERR_BASE - 49),
149+
ORTE_ERR_JOB_CANCELLED = (ORTE_ERR_BASE - 50),
150+
ORTE_ERR_CONDUIT_SEND_FAIL = (ORTE_ERR_BASE - 51)
159151
};
160152

161153
#define ORTE_ERR_MAX (ORTE_ERR_BASE - 100)
162154

163155
END_C_DECLS
164156

165157
#endif /* ORTE_CONSTANTS_H */
166-

orte/mca/ess/base/ess_base_std_app.c

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
* Copyright (c) 2010-2012 Oak Ridge National Labs. All rights reserved.
1313
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
1414
* reserved.
15-
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
15+
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
1616
* Copyright (c) 2014-2016 Research Organization for Information Science
1717
* and Technology (RIST). All rights reserved.
1818
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
@@ -223,13 +223,21 @@ int orte_ess_base_app_setup(bool db_restrict_local)
223223
OBJ_CONSTRUCT(&transports, opal_list_t);
224224
orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE,
225225
ORTE_ATTR_LOCAL, orte_mgmt_transport, OPAL_STRING);
226-
orte_mgmt_conduit = orte_rml.open_conduit(&transports);
226+
if (ORTE_RML_CONDUIT_INVALID == (orte_mgmt_conduit = orte_rml.open_conduit(&transports))) {
227+
ret = ORTE_ERR_OPEN_CONDUIT_FAIL;
228+
error = "orte_rml_open_mgmt_conduit";
229+
goto error;
230+
}
227231
OPAL_LIST_DESTRUCT(&transports);
228232

229233
OBJ_CONSTRUCT(&transports, opal_list_t);
230234
orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE,
231235
ORTE_ATTR_LOCAL, orte_coll_transport, OPAL_STRING);
232-
orte_coll_conduit = orte_rml.open_conduit(&transports);
236+
if (ORTE_RML_CONDUIT_INVALID == (orte_coll_conduit = orte_rml.open_conduit(&transports))) {
237+
ret = ORTE_ERR_OPEN_CONDUIT_FAIL;
238+
error = "orte_rml_open_coll_conduit";
239+
goto error;
240+
}
233241
OPAL_LIST_DESTRUCT(&transports);
234242

235243
/*

orte/mca/ess/base/ess_base_std_orted.c

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -424,13 +424,21 @@ int orte_ess_base_orted_setup(void)
424424
OBJ_CONSTRUCT(&transports, opal_list_t);
425425
orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE,
426426
ORTE_ATTR_LOCAL, orte_mgmt_transport, OPAL_STRING);
427-
orte_mgmt_conduit = orte_rml.open_conduit(&transports);
427+
if (ORTE_RML_CONDUIT_INVALID == (orte_mgmt_conduit = orte_rml.open_conduit(&transports))) {
428+
ret = ORTE_ERR_OPEN_CONDUIT_FAIL;
429+
error = "orte_rml_open_mgmt_conduit";
430+
goto error;
431+
}
428432
OPAL_LIST_DESTRUCT(&transports);
429433

430434
OBJ_CONSTRUCT(&transports, opal_list_t);
431435
orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE,
432436
ORTE_ATTR_LOCAL, orte_coll_transport, OPAL_STRING);
433-
orte_coll_conduit = orte_rml.open_conduit(&transports);
437+
if (ORTE_RML_CONDUIT_INVALID == (orte_coll_conduit = orte_rml.open_conduit(&transports))) {
438+
ret = ORTE_ERR_OPEN_CONDUIT_FAIL;
439+
error = "orte_rml_open_coll_conduit";
440+
goto error;
441+
}
434442
OPAL_LIST_DESTRUCT(&transports);
435443

436444
/* add our contact info to our proc object */

orte/mca/ess/hnp/ess_hnp_module.c

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -355,13 +355,21 @@ static int rte_init(void)
355355
OBJ_CONSTRUCT(&transports, opal_list_t);
356356
orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE,
357357
ORTE_ATTR_LOCAL, orte_mgmt_transport, OPAL_STRING);
358-
orte_mgmt_conduit = orte_rml.open_conduit(&transports);
358+
if (ORTE_RML_CONDUIT_INVALID == (orte_mgmt_conduit = orte_rml.open_conduit(&transports))) {
359+
ret = ORTE_ERR_OPEN_CONDUIT_FAIL;
360+
error = "orte_rml_open_mgmt_conduit";
361+
goto error;
362+
}
359363
OPAL_LIST_DESTRUCT(&transports);
360364

361365
OBJ_CONSTRUCT(&transports, opal_list_t);
362366
orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE,
363367
ORTE_ATTR_LOCAL, orte_coll_transport, OPAL_STRING);
364-
orte_coll_conduit = orte_rml.open_conduit(&transports);
368+
if (ORTE_RML_CONDUIT_INVALID == (orte_coll_conduit = orte_rml.open_conduit(&transports))) {
369+
ret = ORTE_ERR_OPEN_CONDUIT_FAIL;
370+
error = "orte_rml_open_coll_conduit";
371+
goto error;
372+
}
365373
OPAL_LIST_DESTRUCT(&transports);
366374

367375
/*

orte/mca/rml/base/rml_base_frame.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,7 @@ static int orte_rml_base_open(mca_base_open_flag_t flags)
146146
OBJ_CONSTRUCT(&orte_rml_base.posted_recvs, opal_list_t);
147147
OBJ_CONSTRUCT(&orte_rml_base.unmatched_msgs, opal_list_t);
148148
OBJ_CONSTRUCT(&orte_rml_base.conduits, opal_pointer_array_t);
149-
opal_pointer_array_init(&orte_rml_base.conduits,1,INT_MAX,1);
149+
opal_pointer_array_init(&orte_rml_base.conduits,1,INT16_MAX,1);
150150

151151
/* Open up all available components */
152152
return mca_base_framework_components_open(&orte_rml_base_framework, flags);

orte/mca/rml/base/rml_base_stubs.c

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
66
* reserved.
77
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
8-
* Copyright (c) 2014-2016 Intel Corporation. All rights reserved.
8+
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
99
* Copyright (c) 2015-2017 Research Organization for Information Science
1010
* and Technology (RIST). All rights reserved.
1111
* $COPYRIGHT$
@@ -82,10 +82,14 @@ orte_rml_conduit_t orte_rml_API_open_conduit(opal_list_t *attributes)
8282
if (NULL != ourmod) {
8383
/* we got an answer - store this conduit in our array */
8484
rc = opal_pointer_array_add(&orte_rml_base.conduits, ourmod);
85+
if (rc < 0) {
86+
return ORTE_RML_CONDUIT_INVALID;
87+
}
8588
return rc;
8689
}
8790
/* we get here if nobody could support it */
88-
return ORTE_ERR_NOT_SUPPORTED;
91+
ORTE_ERROR_LOG(ORTE_ERR_NOT_SUPPORTED);
92+
return ORTE_RML_CONDUIT_INVALID;
8993
}
9094

9195

orte/mca/rml/ofi/rml_ofi_component.c

Lines changed: 33 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@
3232

3333
static int rml_ofi_component_open(void);
3434
static int rml_ofi_component_close(void);
35+
static int rml_ofi_component_register(void);
36+
3537
static int rml_ofi_component_init(void);
3638
static orte_rml_base_module_t* open_conduit(opal_list_t *attributes);
3739
static orte_rml_pathway_t* query_transports(void);
@@ -55,6 +57,7 @@ orte_rml_component_t mca_rml_ofi_component = {
5557
ORTE_RELEASE_VERSION),
5658
.mca_open_component = rml_ofi_component_open,
5759
.mca_close_component = rml_ofi_component_close,
60+
.mca_register_component_params = rml_ofi_component_register
5861
},
5962
.data = {
6063
/* The component is checkpoint ready */
@@ -81,6 +84,7 @@ orte_rml_ofi_module_t orte_rml_ofi = {
8184

8285
/* Local variables */
8386
static bool init_done = false;
87+
static char *ofi_transports_supported = NULL;
8488

8589
static int
8690
rml_ofi_component_open(void)
@@ -227,6 +231,21 @@ rml_ofi_component_close(void)
227231
return ORTE_SUCCESS;
228232
}
229233

234+
static int rml_ofi_component_register(void)
235+
{
236+
mca_base_component_t *component = &mca_rml_ofi_component.base;
237+
238+
ofi_transports_supported = strdup("fabric,ethernet");
239+
mca_base_component_var_register(component, "transports",
240+
"Comma-delimited list of transports to support (default=\"fabric,ethernet\"",
241+
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
242+
OPAL_INFO_LVL_2,
243+
MCA_BASE_VAR_SCOPE_LOCAL,
244+
&ofi_transports_supported);
245+
opal_output(0, "OFI TRANSPORTS %s", ofi_transports_supported);
246+
return ORTE_SUCCESS;
247+
}
248+
230249
void print_provider_info (struct fi_info *cur_fi )
231250
{
232251
//Display all the details in the fi_info structure
@@ -279,8 +298,7 @@ static orte_rml_pathway_t* query_transports(void)
279298
/**
280299
ofi_prov [in]: the ofi ofi_prov_id that triggered the progress fn
281300
**/
282-
__opal_attribute_always_inline__ static inline int
283-
orte_rml_ofi_progress(ofi_transport_ofi_prov_t* prov)
301+
static int orte_rml_ofi_progress(ofi_transport_ofi_prov_t* prov)
284302
{
285303
ssize_t ret;
286304
int count=0; /* number of messages read and processed */
@@ -933,7 +951,16 @@ static orte_rml_base_module_t* make_module( int ofi_prov_id)
933951
memcpy(mod, &orte_rml_ofi, sizeof(orte_rml_ofi_module_t));
934952
/* setup the remaining data locations in mod, associate conduit with ofi provider selected*/
935953
mod->cur_transport_id = ofi_prov_id;
936-
954+
/* we always go direct to our target peer, so set the routed to "direct" */
955+
mod->api.routed = orte_routed.assign_module("direct");
956+
if (NULL == mod->api.routed) {
957+
/* we can't work */
958+
opal_output_verbose(20,orte_rml_base_framework.framework_output,
959+
"%s - Failed to get direct routed support, returning NULL ",
960+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
961+
free(mod);
962+
return NULL;
963+
}
937964
return (orte_rml_base_module_t*)mod;
938965
}
939966

@@ -997,19 +1024,15 @@ static orte_rml_base_module_t* open_conduit(opal_list_t *attributes)
9971024
}
9981025
}
9991026
}
1000-
/*[Debug] to check for daemon commn over ofi-ethernet, enable the default conduit ORTE_MGMT_CONDUIT over ofi */
1027+
10011028
if (orte_get_attribute(attributes, ORTE_RML_TRANSPORT_TYPE, (void**)&comp_attrib, OPAL_STRING) &&
10021029
NULL != comp_attrib) {
10031030
opal_output_verbose(20,orte_rml_base_framework.framework_output,
10041031
"%s - ORTE_RML_TRANSPORT_TYPE = %s ",
10051032
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), comp_attrib);
10061033
comps = opal_argv_split(comp_attrib, ',');
1007-
for (i=0; NULL != comps[i]; i++) {
1008-
/* changing below to check for oob, as trying to use ofi for only mgmt conduit */
1009-
if (0 == strcmp(comps[i], "oob")) {
1010-
/* changing below to check for fabric, as trying to use ofi for only coll conduit
1011-
if (0 == strcmp(comps[i], "fabric")) { */
1012-
/*if (0 == strcmp(comps[i], "ethernet")) { */
1034+
for (i=0; 0 == i; i++) {
1035+
if (NULL != strstr(ofi_transports_supported, comps[i])) {
10131036
/* we are a candidate, */
10141037
opal_output_verbose(20,orte_rml_base_framework.framework_output,
10151038
"%s - Forcibly returning ofi socket provider for ethernet transport request",

orte/mca/rml/rml_types.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,7 @@ typedef uint32_t orte_rml_tag_t;
198198

199199
/* Conduit ID */
200200
typedef uint16_t orte_rml_conduit_t;
201+
/* Sentinel meaning "no conduit could be opened".  It is the largest value
 * representable in the 16-bit orte_rml_conduit_t, so it cannot be mistaken
 * for a small conduit id such as 255. */
#define ORTE_RML_CONDUIT_INVALID 0xffff
201202

202203
/* define an object for reporting transports */
203204
typedef struct {

orte/util/error_strings.c

Lines changed: 2 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -195,39 +195,12 @@ int orte_err2str(int errnum, const char **errmsg)
195195
case ORTE_ERR_OP_IN_PROGRESS:
196196
retval = "Operation in progress";
197197
break;
198-
case ORTE_ERR_OPEN_CHANNEL_PEER_FAIL:
199-
retval = "Open channel to peer failed";
200-
break;
201-
case ORTE_ERR_OPEN_CHANNEL_PEER_REJECT:
202-
retval = "Open channel to peer was rejected";
203-
break;
204-
case ORTE_ERR_QOS_TYPE_UNSUPPORTED:
205-
retval = "QoS type unsupported";
206-
break;
207-
case ORTE_ERR_QOS_ACK_WINDOW_FULL:
208-
retval = "QoS ack window full";
209-
break;
210-
case ORTE_ERR_ACK_TIMEOUT_SENDER:
211-
retval = "Send ack timed out";
212-
break;
213-
case ORTE_ERR_ACK_TIMEOUT_RECEIVER:
214-
retval = "Recv ack timed out";
215-
break;
216-
case ORTE_ERR_LOST_MSG_IN_WINDOW:
217-
retval = "Msg lost in window";
218-
break;
219-
case ORTE_ERR_CHANNEL_BUSY:
220-
retval = "Channel busy";
221-
break;
222-
case ORTE_ERR_DUPLICATE_MSG:
223-
retval = "Duplicate message";
198+
case ORTE_ERR_OPEN_CONDUIT_FAIL:
199+
retval = "Open messaging conduit failed";
224200
break;
225201
case ORTE_ERR_OUT_OF_ORDER_MSG:
226202
retval = "Out of order message";
227203
break;
228-
case ORTE_ERR_OPEN_CHANNEL_DUPLICATE:
229-
retval = "Duplicate channel open request";
230-
break;
231204
case ORTE_ERR_FORCE_SELECT:
232205
retval = "Force select";
233206
break;

0 commit comments

Comments
 (0)