Skip to content

Commit f3144c7

Browse files
authored
Merge pull request #2152 from jsquyres/pr/usnic-improvements
usNIC BTL improvements w.r.t. libfabric bootstrapping
2 parents 7601e78 + 8b77359 commit f3144c7

File tree

3 files changed

+80
-97
lines changed

3 files changed

+80
-97
lines changed

opal/mca/btl/usnic/btl_usnic.h

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -236,13 +236,6 @@ typedef struct opal_btl_usnic_component_t {
236236
the prefix is non-NULL) */
237237
char *connectivity_map_prefix;
238238

239-
/** Expected return value from fi_cq_readerr() upon success. In
240-
libfabric v1.0.0 / API v1.0, the usnic provider returned
241-
sizeof(fi_cq_err_entry) upon success. In libfabric >=v1.1 /
242-
API >=v1.1, the usnic provider returned 1 upon success. */
243-
ssize_t cq_readerr_success_value;
244-
ssize_t cq_readerr_try_again_value;
245-
246239
/** Offset into the send buffer where the payload will go. For
247240
libfabric v1.0.0 / API v1.0, this is 0. For libfabric >=v1.1
248241
/ API >=v1.1, this is the endpoint.msg_prefix_size (i.e.,

opal/mca/btl/usnic/btl_usnic_component.c

Lines changed: 77 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -590,25 +590,6 @@ static void free_filter(usnic_if_filter_t *filter)
590590
free(filter);
591591
}
592592

593-
static int do_fi_getinfo(uint32_t version, struct fi_info **info_list)
594-
{
595-
struct fi_info hints = {0};
596-
struct fi_ep_attr ep_attr = {0};
597-
struct fi_fabric_attr fabric_attr = {0};
598-
599-
/* We only want providers named "usnic" that are of type EP_DGRAM */
600-
fabric_attr.prov_name = "usnic";
601-
ep_attr.type = FI_EP_DGRAM;
602-
603-
hints.caps = FI_MSG;
604-
hints.mode = FI_LOCAL_MR | FI_MSG_PREFIX;
605-
hints.addr_format = FI_SOCKADDR;
606-
hints.ep_attr = &ep_attr;
607-
hints.fabric_attr = &fabric_attr;
608-
609-
return fi_getinfo(version, NULL, 0, 0, &hints, info_list);
610-
}
611-
612593
/*
613594
* UD component initialization:
614595
* (1) read interface list from kernel and compare against component
@@ -652,62 +633,93 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
652633

653634
OBJ_CONSTRUCT(&btl_usnic_lock, opal_recursive_mutex_t);
654635

655-
/* This code understands libfabric API versions v1.0, v1.1, and
656-
v1.4. Even if we were compiled with libfabric API v1.0, we
657-
still want to request v1.1 -- here's why:
658-
659-
- In libfabric v1.0.0 (i.e., API v1.0), the usnic provider did
660-
not check the value of the "version" parameter passed into
661-
fi_getinfo()
662-
663-
- If you pass FI_VERSION(1,0) to libfabric v1.1.0 (i.e., API
664-
v1.1), the usnic provider will disable FI_MSG_PREFIX support
665-
(on the assumption that the application will not handle
666-
FI_MSG_PREFIX properly). This can happen if you compile OMPI
667-
against libfabric v1.0.0 (i.e., API v1.0) and run OMPI
668-
against libfabric v1.1.0 (i.e., API v1.1).
669-
670-
So never request API v1.0 -- always request a minimum of
671-
v1.1.
672-
673-
The usnic provider changed the strings in the fabric and domain
674-
names in API v1.4. With API <= v1.3:
636+
/* There are multiple dimensions to consider when requesting an
637+
API version number from libfabric:
638+
639+
1. This code understands libfabric API versions v1.3 through
640+
v1.4.
641+
642+
2. Open MPI may be *compiled* against one version of libfabric,
643+
but may be *running* with another.
644+
645+
3. There were usnic-specific bugs in Libfabric prior to
646+
libfabric v1.3.0 (where "v1.3.0" is the tarball/package
647+
version, not the API version; but happily, the API version
648+
was also 1.3 in Libfabric v1.3.0):
649+
650+
- In libfabric v1.0.0 (i.e., API v1.0), the usnic provider
651+
did not check the value of the "version" parameter passed
652+
into fi_getinfo()
653+
- If you pass FI_VERSION(1,0) to libfabric v1.1.0 (i.e., API
654+
v1.1), the usnic provider will disable FI_MSG_PREFIX
655+
support (on the assumption that the application will not
656+
handle FI_MSG_PREFIX properly). This can happen if you
657+
compile OMPI against libfabric v1.0.0 (i.e., API v1.0) and
658+
run OMPI against libfabric v1.1.0 (i.e., API v1.1).
659+
- Some critical AV bug fixes were included in libfabric
660+
v1.3.0; prior versions can fail in fi_av_* operations in
661+
unexpected ways (libnl: you win again!).
662+
663+
So always request a minimum API version of v1.3.
664+
665+
Note that the FI_MAJOR_VERSION and FI_MINOR_VERSION in
666+
<rdma/fabric.h> represent the API version, not the Libfabric
667+
package (i.e., tarball) version. As of Libfabric v1.3, there
668+
is currently no way to know a) what package version of
669+
Libfabric you were compiled against, and b) what package
670+
version of Libfabric you are running with.
671+
672+
Also note that the usnic provider changed the strings in the
673+
fabric and domain names in API v1.4. With API <= v1.3:
675674
676675
- fabric name is "usnic_X" (device name)
677676
- domain name is NULL
678677
679-
With libfabric API >= v1.4:
678+
With libfabric API >= v1.4, all Libfabric IP-based providers
679+
(including usnic) follow the same convention:
680680
681681
- fabric name is "a.b.c.d/e" (CIDR notation of network)
682682
- domain name is "usnic_X" (device name)
683683
684684
NOTE: The configure.m4 in this component will require libfabric
685-
>= v1.1.0 (i.e., it won't accept v1.0.0) because of a critical
686-
bug in the usnic provider in libfabric v1.0.0. However, the
687-
compatibility code with libfabric v1.0.0 in the usNIC BTL has
688-
been retained, for two reasons:
689-
690-
1. It's not harmful, nor overly complicated. So the
691-
compatibility code was not ripped out.
692-
2. At least some versions of Cisco Open MPI are shipping with
693-
an embedded (libfabric v1.0.0+critical bug fix).
685+
>= v1.1.0 (i.e., it won't accept v1.0.0) because it needs
686+
access to the usNIC extension header structures that only
687+
became available in v1.1.0.*/
694688

695-
Someday, #2 may no longer be true, and we may therefore rip out
696-
the libfabric v1.0.0 compatibility code. */
697-
698-
/* First try API version 1.4. If that doesn't work, try API
699-
version 1.1. */
689+
/* First, check to see if the libfabric we are running with is <=
690+
libfabric v1.3. If so, don't bother going further. */
700691
uint32_t libfabric_api;
701-
libfabric_api = FI_VERSION(1, 4);
702-
ret = do_fi_getinfo(libfabric_api, &info_list);
703-
// Libfabric core will return -FI_ENOSYS if it is too old
704-
if (-FI_ENOSYS == ret) {
705-
libfabric_api = FI_VERSION(1, 1);
706-
ret = do_fi_getinfo(libfabric_api, &info_list);
692+
libfabric_api = fi_version();
693+
if (libfabric_api < FI_VERSION(1, 3)) {
694+
opal_output_verbose(5, USNIC_OUT,
695+
"btl:usnic: disqualifiying myself because Libfabric does not support v1.3 of the API (v1.3 is *required* for correct usNIC functionality).");
696+
return NULL;
697+
}
698+
699+
/* Libfabric API 1.3 is fine. Above that, we know that Open MPI
700+
works with libfabric API v1.4, so just use that. */
701+
if (libfabric_api > FI_VERSION(1, 3)) {
702+
libfabric_api = FI_VERSION(1, 4);
707703
}
704+
705+
struct fi_info hints = {0};
706+
struct fi_ep_attr ep_attr = {0};
707+
struct fi_fabric_attr fabric_attr = {0};
708+
709+
/* We only want providers named "usnic" that are of type EP_DGRAM */
710+
fabric_attr.prov_name = "usnic";
711+
ep_attr.type = FI_EP_DGRAM;
712+
713+
hints.caps = FI_MSG;
714+
hints.mode = FI_LOCAL_MR | FI_MSG_PREFIX;
715+
hints.addr_format = FI_SOCKADDR;
716+
hints.ep_attr = &ep_attr;
717+
hints.fabric_attr = &fabric_attr;
718+
719+
ret = fi_getinfo(libfabric_api, NULL, 0, 0, &hints, &info_list);
708720
if (0 != ret) {
709721
opal_output_verbose(5, USNIC_OUT,
710-
"btl:usnic: disqualifiying myself due to fi_getinfo failure: %s (%d)", strerror(-ret), ret);
722+
"btl:usnic: disqualifiying myself due to fi_getinfo(3) failure: %s (%d)", strerror(-ret), ret);
711723
return NULL;
712724
}
713725

@@ -738,29 +750,6 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
738750
opal_output_verbose(5, USNIC_OUT,
739751
"btl:usnic: usNIC fabrics found");
740752

741-
/* Due to ambiguities in documentation, in libfabric v1.0.0 (i.e.,
742-
API v1.0) the usnic provider returned sizeof(struct
743-
fi_cq_err_entry) from fi_cq_readerr() upon success.
744-
745-
The ambiguities were clarified in libfabric v1.1.0 (i.e., API
746-
v1.1); the usnic provider returned 1 from fi_cq_readerr() upon
747-
success.
748-
749-
So query to see what version of the libfabric API we are
750-
running with, and adapt accordingly. */
751-
libfabric_api = fi_version();
752-
if (1 == FI_MAJOR(libfabric_api) &&
753-
0 == FI_MINOR(libfabric_api)) {
754-
// Old fi_cq_readerr() behavior: success=sizeof(...), try again=0
755-
mca_btl_usnic_component.cq_readerr_success_value =
756-
sizeof(struct fi_cq_err_entry);
757-
mca_btl_usnic_component.cq_readerr_try_again_value = 0;
758-
} else {
759-
// New fi_cq_readerr() behavior: success=1, try again=-FI_EAGAIN
760-
mca_btl_usnic_component.cq_readerr_success_value = 1;
761-
mca_btl_usnic_component.cq_readerr_try_again_value = -FI_EAGAIN;
762-
}
763-
764753
opal_proc_t *me = opal_proc_local_get();
765754
opal_process_name_t *name = &(me->proc_name);
766755
mca_btl_usnic_component.my_hashed_rte_name =
@@ -1256,12 +1245,11 @@ usnic_handle_cq_error(opal_btl_usnic_module_t* module,
12561245
}
12571246

12581247
rc = fi_cq_readerr(channel->cq, &err_entry, 0);
1259-
if (rc == mca_btl_usnic_component.cq_readerr_try_again_value) {
1248+
if (rc == -FI_EAGAIN) {
12601249
return;
1261-
} else if (rc != mca_btl_usnic_component.cq_readerr_success_value) {
1262-
BTL_ERROR(("%s: cq_readerr ret = %d (expected %d)",
1263-
module->linux_device_name, rc,
1264-
(int) mca_btl_usnic_component.cq_readerr_success_value));
1250+
} else if (rc != 1) {
1251+
BTL_ERROR(("%s: cq_readerr ret = %d (expected 1)",
1252+
module->linux_device_name, rc));
12651253
channel->chan_error = true;
12661254
}
12671255

opal/mca/btl/usnic/btl_usnic_util.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ void opal_btl_usnic_exit(opal_btl_usnic_module_t *module)
3434
}
3535
/* If we didn't find a PML error callback, just exit. */
3636
if (NULL == module) {
37+
fprintf(stderr, "*** The Open MPI usnic BTL is aborting the MPI job (via exit(3)).\n");
38+
fflush(stderr);
3739
exit(1);
3840
}
3941
}
@@ -47,7 +49,7 @@ void opal_btl_usnic_exit(opal_btl_usnic_module_t *module)
4749
module->pml_error_callback(&module->super,
4850
MCA_BTL_ERROR_FLAGS_FATAL,
4951
(opal_proc_t*) opal_proc_local_get(),
50-
"usnic");
52+
"The usnic BTL is aborting the MPI job (via PML error callback).");
5153
}
5254

5355
/* If the PML error callback returns (or if there wasn't one),

0 commit comments

Comments
 (0)