@@ -80,8 +80,18 @@ orte_rml_ofi_module_t orte_rml_ofi = {
80
80
/* Local variables */
static bool init_done = false;
/* Comma-delimited list of transports this component will serve. */
static char *ofi_transports_supported = NULL;
/* Copy of the built-in default for ofi_transports_supported; kept so that
 * user_override() can tell whether the list was changed afterwards. */
static char *initial_ofi_transports_supported = NULL;
static bool ofi_desired = false;

/**
 * Report whether the user overrode the default choice of OFI transports.
 *
 * Compares the current transport list with the saved default.  Either
 * string may still be NULL (e.g. before the component has registered its
 * parameters), and passing NULL to strcmp() is undefined behaviour, so the
 * NULL cases are resolved by pointer comparison first: both NULL means
 * "no override", exactly one NULL means the values differ.
 *
 * @return true if the current transport list differs from the default,
 *         false otherwise.
 */
bool user_override(void)
{
    const char *initial = initial_ofi_transports_supported;
    const char *current = ofi_transports_supported;

    if (NULL == initial || NULL == current) {
        /* at least one side is unset: they match only if both are unset */
        return initial != current;
    }

    return 0 != strcmp(initial, current);
}
94
+
85
95
static int
86
96
rml_ofi_component_open (void )
87
97
{
@@ -232,7 +242,8 @@ static int rml_ofi_component_register(void)
232
242
{
233
243
mca_base_component_t * component = & mca_rml_ofi_component .base ;
234
244
235
- ofi_transports_supported = strdup ("fabric,ethernet" );
245
+ initial_ofi_transports_supported = strdup ("fabric" );
246
+ ofi_transports_supported = strdup (initial_ofi_transports_supported );
236
247
mca_base_component_var_register (component , "transports" ,
237
248
"Comma-delimited list of transports to support (default=\"fabric,ethernet\"" ,
238
249
MCA_BASE_VAR_TYPE_STRING , NULL , 0 , 0 ,
@@ -923,29 +934,54 @@ static int rml_ofi_component_init(void)
923
934
int get_ofi_prov_id ( opal_list_t * attributes )
924
935
{
925
936
937
+ bool choose_fabric = false, choice_made = false;
926
938
int ofi_prov_id = RML_OFI_PROV_ID_INVALID , prov_num = 0 ;
927
939
char * provider = NULL , * transport = NULL ;
928
940
char * ethernet = "sockets" , * fabric = "psm2" ;
929
941
struct fi_info * cur_fi ;
942
+ char * comp_attrib = NULL ;
943
+ char * * comps ;
944
+ int i ;
930
945
931
- /* check the list of attributes to see if we should respond
946
+ /* check the list of attributes in below order
932
947
* Attribute should have ORTE_RML_TRANSPORT_ATTRIB key
933
- * with values "ethernet" or "fabric"
948
+ * with values "ethernet" or "fabric". "fabric" is higher priority.
934
949
* (or) ORTE_RML_OFI_PROV_NAME key with values "socket" or "OPA"
935
950
* if both above attributes are missing return failure
936
951
*/
937
- if (orte_get_attribute (attributes , ORTE_RML_TRANSPORT_ATTRIB , (void * * )& transport , OPAL_STRING ) &&
938
- NULL != transport ) {
939
- if ( 0 == strcmp ( transport , "ethernet" ) ) {
940
- provider = ethernet ;
941
- } else if ( 0 == strcmp ( transport , "fabric" ) ) {
942
- provider = fabric ;
943
- }
952
+ //if (orte_get_attribute(attributes, ORTE_RML_TRANSPORT_ATTRIB, (void**)&transport, OPAL_STRING) ) {
953
+
954
+ if (orte_get_attribute (attributes , ORTE_RML_TRANSPORT_TYPE , (void * * )& comp_attrib , OPAL_STRING ) &&
955
+ NULL != comp_attrib ) {
956
+ comps = opal_argv_split (comp_attrib , ',' );
957
+ for (i = 0 ; NULL != comps [i ] && choice_made == false ; i ++ ) {
958
+ if (NULL != strstr (ofi_transports_supported , comps [i ])) {
959
+ if (0 == strcmp ( comps [i ], "ethernet" )) {
960
+ opal_output_verbose (20 ,orte_rml_base_framework .framework_output ,
961
+ "%s - Opening conduit using OFI ethernet/sockets provider" ,
962
+ ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ));
963
+ opal_argv_free (comps );
964
+ provider = ethernet ;
965
+ choose_fabric = false;
966
+ choice_made = false; /* continue to see if fabric is requested */
967
+ } else if ( 0 == strcmp ( comps [i ], "fabric" )) {
968
+ opal_output_verbose (20 ,orte_rml_base_framework .framework_output ,
969
+ "%s - Opening conduit using OFI fabric provider" ,
970
+ ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ));
971
+ opal_argv_free (comps );
972
+ choose_fabric = true;
973
+ provider = NULL ;
974
+ choice_made = true; /* fabric is highest priority so don't check for anymore */
975
+ }
976
+ }
977
+ }
944
978
}
945
979
/* if from the transport we don't know which provider we want, then check for the ORTE_RML_OFI_PROV_NAME_ATTRIB */
946
980
if ( NULL == provider ) {
947
- if (orte_get_attribute (attributes , ORTE_RML_PROVIDER_ATTRIB , (void * * )& provider , OPAL_STRING ) &&
948
- NULL != provider ) {
981
+ orte_get_attribute (attributes , ORTE_RML_PROVIDER_ATTRIB , (void * * )& provider , OPAL_STRING );
982
+ }
983
+ /* either ethernet-sockets or specific is requested. Proceed to choose that provider */
984
+ if ( NULL != provider ) {
949
985
// loop the orte_rml_ofi.ofi_provs[] and find the provider name that matches
950
986
for ( prov_num = 0 ; prov_num < orte_rml_ofi .ofi_prov_open_num && ofi_prov_id == RML_OFI_PROV_ID_INVALID ; prov_num ++ ) {
951
987
cur_fi = orte_rml_ofi .ofi_prov [prov_num ].fabric_info ;
@@ -954,11 +990,27 @@ int get_ofi_prov_id( opal_list_t *attributes)
954
990
ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),provider ,cur_fi -> fabric_attr -> prov_name );
955
991
if ( strcmp (provider ,cur_fi -> fabric_attr -> prov_name ) == 0 ) {
956
992
ofi_prov_id = prov_num ;
957
- }
993
+ opal_output_verbose (20 ,orte_rml_base_framework .framework_output ,
994
+ "%s - Choosing provider %s" ,
995
+ ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
996
+ cur_fi -> fabric_attr -> prov_name );
997
+ }
998
+ }
999
+ } else if ( choose_fabric ) {
1000
+ // "fabric" is requested, choose the first fabric(non-ethernet) provider
1001
+ for ( prov_num = 0 ; prov_num < orte_rml_ofi .ofi_prov_open_num && ofi_prov_id == RML_OFI_PROV_ID_INVALID ; prov_num ++ ) {
1002
+ cur_fi = orte_rml_ofi .ofi_prov [prov_num ].fabric_info ;
1003
+ opal_output_verbose (20 ,orte_rml_base_framework .framework_output ,
1004
+ "%s -choosing fabric -> comparing %s != %s " ,
1005
+ ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),ethernet ,cur_fi -> fabric_attr -> prov_name );
1006
+ if ( strcmp (ethernet , cur_fi -> fabric_attr -> prov_name ) != 0 ) {
1007
+ ofi_prov_id = prov_num ;
1008
+ opal_output_verbose (20 ,orte_rml_base_framework .framework_output ,
1009
+ "%s - Choosing fabric provider %s" ,
1010
+ ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),cur_fi -> fabric_attr -> prov_name );
958
1011
}
959
1012
}
960
1013
}
961
-
962
1014
opal_output_verbose (20 ,orte_rml_base_framework .framework_output ,
963
1015
"%s - get_ofi_prov_id(), returning ofi_prov_id=%d " ,
964
1016
ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),ofi_prov_id );
@@ -1076,22 +1128,18 @@ static orte_rml_base_module_t* open_conduit(opal_list_t *attributes)
1076
1128
"%s - ORTE_RML_TRANSPORT_TYPE = %s " ,
1077
1129
ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ), comp_attrib );
1078
1130
comps = opal_argv_split (comp_attrib , ',' );
1079
- for (i = 0 ; 0 == i ; i ++ ) {
1131
+ for (i = 0 ; NULL != comps [ i ] ; i ++ ) {
1080
1132
if (NULL != strstr (ofi_transports_supported , comps [i ])) {
1081
1133
/* we are a candidate, */
1082
1134
opal_output_verbose (20 ,orte_rml_base_framework .framework_output ,
1083
- "%s - Forcibly returning ofi socket provider for ethernet transport request " ,
1135
+ "%s - Opening conduit using OFI.. " ,
1084
1136
ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ));
1085
1137
opal_argv_free (comps );
1086
- OBJ_CONSTRUCT (& provider , opal_list_t );
1087
- orte_set_attribute (& provider , ORTE_RML_PROVIDER_ATTRIB ,
1088
- ORTE_ATTR_LOCAL , "sockets" , OPAL_STRING );
1089
- return make_module (get_ofi_prov_id (& provider ));
1138
+ return make_module (get_ofi_prov_id (attributes ));
1090
1139
}
1091
1140
}
1092
1141
opal_argv_free (comps );
1093
1142
}
1094
- /* end [Debug] */
1095
1143
1096
1144
/* Alternatively, check the attributes to see if we qualify - we only handle
1097
1145
* "pt2pt" */
@@ -1108,12 +1156,16 @@ static orte_rml_base_module_t* open_conduit(opal_list_t *attributes)
1108
1156
1109
1157
/* Reset a peer record to its empty state: no endpoint data, no provider
 * name, and an invalid source-provider id.
 * NOTE(review): presumably registered as the object constructor for
 * orte_rml_ofi_peer_t - confirm against the class instance declaration. */
static void pr_cons(orte_rml_ofi_peer_t *ptr)
{
    /* endpoint address buffer and its length */
    ptr->ofi_ep_len = 0;
    ptr->ofi_ep = NULL;

    /* provider identification */
    ptr->src_prov_id = RML_OFI_PROV_ID_INVALID;
    ptr->ofi_prov_name = NULL;
}
1114
1164
1115
1165
/* Release the heap storage held by a peer record: the provider name and,
 * when a non-zero endpoint length is recorded, the endpoint buffer.
 * NOTE(review): presumably registered as the object destructor for
 * orte_rml_ofi_peer_t - confirm against the class instance declaration. */
static void pr_des(orte_rml_ofi_peer_t *ptr)
{
    /* free(NULL) is a no-op, so the name needs no explicit NULL check */
    free(ptr->ofi_prov_name);

    /* the endpoint buffer is only released when a length was recorded */
    if (ptr->ofi_ep_len > 0) {
        free(ptr->ofi_ep);
    }
}
0 commit comments