Skip to content

Commit 6d566f8

Browse files
committed
ofi/btl: Reuse MTL's domain and fabric in BTL
Share the fabric and domain between the BTL and the MTL to reduce the number of protection domains (PDs) needed. This prevents hitting the maximum PD limit on systems with high core counts. Signed-off-by: Jessie Yang <[email protected]>
1 parent fe33997 commit 6d566f8

File tree

1 file changed

+41
-8
lines changed

1 file changed

+41
-8
lines changed

opal/mca/btl/ofi/btl_ofi_component.c

Lines changed: 41 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,12 @@
3333
#include "opal/mca/btl/btl.h"
3434
#include "opal/mca/common/ofi/common_ofi.h"
3535
#include "opal/mca/hwloc/base/base.h"
36+
#include "ompi/mca/mtl/base/base.h"
37+
#include "ompi/mca/mtl/ofi/mtl_ofi.h"
38+
#include "ompi/mca/mtl/ofi/mtl_ofi_types.h"
39+
40+
extern mca_mtl_base_module_t *ompi_mtl;
41+
3642

3743
#include <string.h>
3844

@@ -293,6 +299,7 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init(int *num_btl_modules,
293299
struct fi_fabric_attr fabric_attr = {0};
294300
struct fi_domain_attr domain_attr = {0};
295301
uint64_t required_caps;
302+
mca_mtl_ofi_module_t *mtl_ofi = NULL;
296303

297304
switch (mca_btl_ofi_component.mode) {
298305

@@ -314,6 +321,17 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init(int *num_btl_modules,
314321

315322
fabric_attr.prov_name = NULL;
316323

324+
/* Check if MTL OFI is loaded and try to reuse its fabric and domain */
325+
if (ompi_mtl && ompi_mtl_base_selected_component &&
326+
strcmp(ompi_mtl_base_selected_component->mtl_version.mca_component_name, "ofi") == 0) {
327+
mtl_ofi = (mca_mtl_ofi_module_t *) ompi_mtl;
328+
}
329+
330+
if (mtl_ofi) {
331+
BTL_VERBOSE(("Reusing MTL ofi fabric"));
332+
fabric_attr.fabric = mtl_ofi->fabric;
333+
}
334+
317335
opal_output_verbose(1, opal_common_ofi.output, "%s:%d: btl:ofi:provider_include = \"%s\"\n",
318336
__FILE__, __LINE__, *opal_common_ofi.prov_include);
319337
opal_output_verbose(1, opal_common_ofi.output, "%s:%d: btl:ofi:provider_exclude = \"%s\"\n",
@@ -339,6 +357,11 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init(int *num_btl_modules,
339357
domain_attr.control_progress = progress_mode;
340358
domain_attr.data_progress = progress_mode;
341359

360+
if (mtl_ofi) {
361+
BTL_VERBOSE(("Reusing MTL ofi domain"));
362+
domain_attr.domain = mtl_ofi->domain;
363+
}
364+
342365
/* select endpoint type */
343366
ep_attr.type = FI_EP_RDM;
344367

@@ -553,17 +576,27 @@ static int mca_btl_ofi_init_device(struct fi_info *info)
553576
("initializing dev:%s provider:%s", linux_device_name, info->fabric_attr->prov_name));
554577

555578
/* fabric */
556-
rc = fi_fabric(ofi_info->fabric_attr, &fabric, NULL);
557-
if (0 != rc) {
558-
BTL_VERBOSE(("%s failed fi_fabric with err=%s", linux_device_name, fi_strerror(-rc)));
559-
goto fail;
579+
if (info->fabric_attr->fabric) {
580+
BTL_VERBOSE(("Reusing existing fabric: %s", info->fabric_attr->name));
581+
fabric = info->fabric_attr->fabric;
582+
} else {
583+
rc = fi_fabric(ofi_info->fabric_attr, &fabric, NULL);
584+
if (0 != rc) {
585+
BTL_VERBOSE(("%s failed fi_fabric with err=%s", linux_device_name, fi_strerror(-rc)));
586+
goto fail;
587+
}
560588
}
561589

562590
/* domain */
563-
rc = fi_domain(fabric, ofi_info, &domain, NULL);
564-
if (0 != rc) {
565-
BTL_VERBOSE(("%s failed fi_domain with err=%s", linux_device_name, fi_strerror(-rc)));
566-
goto fail;
591+
if (info->domain_attr->domain) {
592+
BTL_VERBOSE(("Reusing existing domain: %s", info->domain_attr->name));
593+
domain = info->domain_attr->domain;
594+
} else {
595+
rc = fi_domain(fabric, ofi_info, &domain, NULL);
596+
if (0 != rc) {
597+
BTL_VERBOSE(("%s failed fi_domain with err=%s", linux_device_name, fi_strerror(-rc)));
598+
goto fail;
599+
}
567600
}
568601

569602
/**

0 commit comments

Comments
 (0)