From f65f900bbbd044fa17e9153adb3bdf2906d2b28d Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Tue, 14 Oct 2025 15:33:21 -0700 Subject: [PATCH 1/3] btl/ofi: Set domain threading model based on MPI thread support Signed-off-by: Jessie Yang --- opal/mca/btl/ofi/btl_ofi_component.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/opal/mca/btl/ofi/btl_ofi_component.c b/opal/mca/btl/ofi/btl_ofi_component.c index 89a1ff36aaa..cdd19034d87 100644 --- a/opal/mca/btl/ofi/btl_ofi_component.c +++ b/opal/mca/btl/ofi/btl_ofi_component.c @@ -339,6 +339,12 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init(int *num_btl_modules, domain_attr.control_progress = progress_mode; domain_attr.data_progress = progress_mode; + if (enable_mpi_threads) { + domain_attr.threading = FI_THREAD_SAFE; + } else { + domain_attr.threading = FI_THREAD_DOMAIN; + } + /* select endpoint type */ ep_attr.type = FI_EP_RDM; From 15fe24645cd21c1c99f5ce7796e86344f442971d Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Tue, 14 Oct 2025 15:35:18 -0700 Subject: [PATCH 2/3] btl/ofi: Add FI_COMPLETION flag to tx and rx attributes Add FI_COMPLETION flag to ensure completion entries are generated for all data transfer operations. 
Signed-off-by: Jessie Yang --- opal/mca/btl/ofi/btl_ofi_component.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/opal/mca/btl/ofi/btl_ofi_component.c b/opal/mca/btl/ofi/btl_ofi_component.c index cdd19034d87..a3de4a26a9f 100644 --- a/opal/mca/btl/ofi/btl_ofi_component.c +++ b/opal/mca/btl/ofi/btl_ofi_component.c @@ -365,7 +365,8 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init(int *num_btl_modules, tx_attr.iov_limit = 1; rx_attr.iov_limit = 1; - tx_attr.op_flags = FI_DELIVERY_COMPLETE; + tx_attr.op_flags = FI_DELIVERY_COMPLETE | FI_COMPLETION; + rx_attr.op_flags = FI_COMPLETION; mca_btl_ofi_component.module_count = 0; From 69d273793dfb5e26fe93e2e3de58d511cb35b3f1 Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Tue, 7 Oct 2025 23:52:01 +0000 Subject: [PATCH 3/3] ofi: Share domain between MTL and BTL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Share the domain between the MTL and BTL layers to reduce the total number of domains created. This helps avoid hitting system resource limits on platforms with high core counts. Instead of having the common code allocate a single domain with the superset of all required capabilities, we attempt to reuse an existing fabric and domain if the providers can support MTL’s and BTL’s different capability sets. This approach allows providers that support domain sharing to reuse resources efficiently while still preserving flexibility. If the providers cannot reuse the fabric and domain due to incompatible requirements, separate domains will be created as before. 
Signed-off-by: Jessie Yang --- ompi/mca/mtl/ofi/mtl_ofi_component.c | 35 ++++-- opal/mca/btl/ofi/btl_ofi_component.c | 17 ++- opal/mca/btl/ofi/btl_ofi_module.c | 4 +- opal/mca/common/ofi/common_ofi.c | 161 ++++++++++++++++++++++++++- opal/mca/common/ofi/common_ofi.h | 57 +++++++++- 5 files changed, 253 insertions(+), 21 deletions(-) diff --git a/ompi/mca/mtl/ofi/mtl_ofi_component.c b/ompi/mca/mtl/ofi/mtl_ofi_component.c index 049ff4cf8c8..1e1d6c37eb6 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_component.c +++ b/ompi/mca/mtl/ofi/mtl_ofi_component.c @@ -694,6 +694,8 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, } hints->domain_attr->resource_mgmt = FI_RM_ENABLED; + hints->domain_attr->domain = opal_common_ofi.domain; + hints->fabric_attr->fabric = opal_common_ofi.fabric; /** * The EFA provider in Libfabric versions prior to 1.10 contains a bug @@ -715,10 +717,16 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, hints_dup->fabric_attr->prov_name = strdup("efa"); ret = fi_getinfo(fi_primary_version, NULL, NULL, 0ULL, hints_dup, &providers); + if (FI_ENODATA == -ret && (hints_dup->fabric_attr->fabric || hints_dup->domain_attr->domain)) { + /* Retry without fabric and domain */ + hints_dup->fabric_attr->fabric = NULL; + hints_dup->domain_attr->domain = NULL; + ret = fi_getinfo(fi_primary_version, NULL, NULL, 0ULL, hints_dup, &providers); + } if (FI_ENOSYS == -ret) { /* libfabric is not new enough, fallback to use older version of API */ ret = fi_getinfo(fi_alternate_version, NULL, NULL, 0ULL, hints_dup, &providers); - } + } opal_output_verbose(1, opal_common_ofi.output, "%s:%d: EFA specific fi_getinfo(): %s\n", @@ -756,6 +764,11 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, 0ULL, /* Optional flag */ hints, /* In: Hints to filter providers */ &providers); /* Out: List of matching providers */ + if (FI_ENODATA == -ret && (hints->fabric_attr->fabric || hints->domain_attr->domain)) { + hints->fabric_attr->fabric = NULL; + 
hints->domain_attr->domain = NULL; + ret = fi_getinfo(fi_primary_version, NULL, NULL, 0ULL, hints, &providers); + } if (FI_ENOSYS == -ret) { ret = fi_getinfo(fi_alternate_version, NULL, NULL, 0ULL, hints, &providers); } @@ -972,9 +985,8 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, * instantiate the virtual or physical network. This opens a "fabric * provider". See man fi_fabric for details. */ - ret = fi_fabric(prov->fabric_attr, /* In: Fabric attributes */ - &ompi_mtl_ofi.fabric, /* Out: Fabric handle */ - NULL); /* Optional context for fabric events */ + ret = opal_common_ofi_fi_fabric(prov->fabric_attr, /* In: Fabric attributes */ + &ompi_mtl_ofi.fabric); /* Out: Fabric handle */ if (0 != ret) { opal_show_help("help-mtl-ofi.txt", "OFI call fail", true, "fi_fabric", @@ -988,10 +1000,9 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, * hardware port/collection of ports. Returns a domain object that can be * used to create endpoints. See man fi_domain for details. 
*/ - ret = fi_domain(ompi_mtl_ofi.fabric, /* In: Fabric object */ - prov, /* In: Provider */ - &ompi_mtl_ofi.domain, /* Out: Domain object */ - NULL); /* Optional context for domain events */ + ret = opal_common_ofi_fi_domain(ompi_mtl_ofi.fabric, /* In: Fabric object */ + prov, /* In: Provider */ + &ompi_mtl_ofi.domain); /* Out: Domain object */ if (0 != ret) { opal_show_help("help-mtl-ofi.txt", "OFI call fail", true, "fi_domain", @@ -1155,10 +1166,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, (void) fi_close((fid_t)ompi_mtl_ofi.ofi_ctxt[0].cq); } if (ompi_mtl_ofi.domain) { - (void) fi_close((fid_t)ompi_mtl_ofi.domain); + (void) opal_common_ofi_domain_release(ompi_mtl_ofi.domain); } if (ompi_mtl_ofi.fabric) { - (void) fi_close((fid_t)ompi_mtl_ofi.fabric); + (void) opal_common_ofi_fabric_release(ompi_mtl_ofi.fabric); } if (ompi_mtl_ofi.comm_to_context) { free(ompi_mtl_ofi.comm_to_context); @@ -1206,11 +1217,11 @@ ompi_mtl_ofi_finalize(struct mca_mtl_base_module_t *mtl) } } - if ((ret = fi_close((fid_t)ompi_mtl_ofi.domain))) { + if ((ret = opal_common_ofi_domain_release(ompi_mtl_ofi.domain))) { goto finalize_err; } - if ((ret = fi_close((fid_t)ompi_mtl_ofi.fabric))) { + if ((ret = opal_common_ofi_fabric_release(ompi_mtl_ofi.fabric))) { goto finalize_err; } diff --git a/opal/mca/btl/ofi/btl_ofi_component.c b/opal/mca/btl/ofi/btl_ofi_component.c index a3de4a26a9f..3f1e277dd69 100644 --- a/opal/mca/btl/ofi/btl_ofi_component.c +++ b/opal/mca/btl/ofi/btl_ofi_component.c @@ -379,9 +379,18 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init(int *num_btl_modules, no_hmem: #endif + hints.fabric_attr->fabric = opal_common_ofi.fabric; + hints.domain_attr->domain = opal_common_ofi.domain; + /* Do the query. The earliest version that supports FI_HMEM hints is 1.9. 
* The earliest version the explictly allow provider to call CUDA API is 1.18 */ rc = fi_getinfo(FI_VERSION(1, 18), NULL, NULL, 0, &hints, &info_list); + if (FI_ENODATA == -rc && (hints.fabric_attr->fabric || hints.domain_attr->domain)) { + /* Retry without fabric and domain */ + hints.fabric_attr->fabric = NULL; + hints.domain_attr->domain = NULL; + rc = fi_getinfo(FI_VERSION(1, 18), NULL, NULL, 0, &hints, &info_list); + } if (FI_ENOSYS == -rc) { rc = fi_getinfo(FI_VERSION(1, 9), NULL, NULL, 0, &hints, &info_list); } @@ -560,14 +569,14 @@ static int mca_btl_ofi_init_device(struct fi_info *info) ("initializing dev:%s provider:%s", linux_device_name, info->fabric_attr->prov_name)); /* fabric */ - rc = fi_fabric(ofi_info->fabric_attr, &fabric, NULL); + rc = opal_common_ofi_fi_fabric(ofi_info->fabric_attr, &fabric); if (0 != rc) { BTL_VERBOSE(("%s failed fi_fabric with err=%s", linux_device_name, fi_strerror(-rc))); goto fail; } /* domain */ - rc = fi_domain(fabric, ofi_info, &domain, NULL); + rc = opal_common_ofi_fi_domain(fabric, ofi_info, &domain); if (0 != rc) { BTL_VERBOSE(("%s failed fi_domain with err=%s", linux_device_name, fi_strerror(-rc))); goto fail; @@ -750,11 +759,11 @@ static int mca_btl_ofi_init_device(struct fi_info *info) } if (NULL != domain) { - fi_close(&domain->fid); + opal_common_ofi_domain_release(domain); } if (NULL != fabric) { - fi_close(&fabric->fid); + opal_common_ofi_fabric_release(fabric); } free(module); diff --git a/opal/mca/btl/ofi/btl_ofi_module.c b/opal/mca/btl/ofi/btl_ofi_module.c index 19a9064540c..23b0dc7dfe8 100644 --- a/opal/mca/btl/ofi/btl_ofi_module.c +++ b/opal/mca/btl/ofi/btl_ofi_module.c @@ -385,11 +385,11 @@ int mca_btl_ofi_finalize(mca_btl_base_module_t *btl) } if (NULL != ofi_btl->domain) { - fi_close(&ofi_btl->domain->fid); + opal_common_ofi_domain_release(ofi_btl->domain); } if (NULL != ofi_btl->fabric) { - fi_close(&ofi_btl->fabric->fid); + opal_common_ofi_fabric_release(ofi_btl->fabric); } if (NULL != 
ofi_btl->fabric_info) { diff --git a/opal/mca/common/ofi/common_ofi.c b/opal/mca/common/ofi/common_ofi.c index f6195b41af3..8294263ce1f 100644 --- a/opal/mca/common/ofi/common_ofi.c +++ b/opal/mca/common/ofi/common_ofi.c @@ -6,7 +6,7 @@ * reserved. * Copyright (c) 2020-2021 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. - * Copyright (c) 2021 Amazon.com, Inc. or its affiliates. All rights + * Copyright (c) 2021-2025 Amazon.com, Inc. or its affiliates. All rights * reserved. * Copyright (c) 2023 UT-Battelle, LLC. All rights reserved. * $COPYRIGHT$ @@ -42,7 +42,11 @@ extern opal_accelerator_base_module_t opal_accelerator; opal_common_ofi_module_t opal_common_ofi = {.prov_include = NULL, .prov_exclude = NULL, - .output = -1}; + .output = -1, + .fabric = NULL, + .domain = NULL, + .fabric_ref_count = 0, + .domain_ref_count = 0}; static const char default_prov_exclude_list[] = "shm,sockets,tcp,udp,rstream,usnic,net"; static opal_mutex_t opal_common_ofi_mutex = OPAL_MUTEX_STATIC_INIT; static int opal_common_ofi_verbose_level = 0; @@ -1257,3 +1261,156 @@ OPAL_DECLSPEC int opal_common_ofi_fi_getname(fid_t fid, void **addr, size_t *add } return ret; } + +/** + * Get or create fabric object + * + * Reuses existing fabric from fabric_attr->fabric if available, + * otherwise creates new fabric using fi_fabric(). 
+ * + * @param fabric_attr (IN) Fabric attributes + * @param fabric (OUT) Fabric object (new or existing) + * + * @return OPAL_SUCCESS, or the error code returned by fi_fabric() + */ +int opal_common_ofi_fi_fabric(struct fi_fabric_attr *fabric_attr, + struct fid_fabric **fabric) +{ + int ret; + + OPAL_THREAD_LOCK(&opal_common_ofi_mutex); + + if (fabric_attr->fabric) { + *fabric = fabric_attr->fabric; + opal_common_ofi.fabric_ref_count++; + opal_output_verbose(1, opal_common_ofi.output, "Reusing existing fabric: %s", + fabric_attr->name); + } else { + ret = fi_fabric(fabric_attr, fabric, NULL); + if (0 != ret) { + OPAL_THREAD_UNLOCK(&opal_common_ofi_mutex); + return ret; + } + opal_common_ofi.fabric = *fabric; + opal_common_ofi.fabric_ref_count = 1; + } + + OPAL_THREAD_UNLOCK(&opal_common_ofi_mutex); + return OPAL_SUCCESS; +} + +/** + * Get or create domain object + * + * Reuses existing domain from info->domain_attr->domain if available, + * otherwise creates new domain using fi_domain(). + * + * @param fabric (IN) Fabric object + * @param info (IN) Provider info + * @param domain (OUT) Domain object (new or existing) + * + * @return OPAL_SUCCESS, or the error code returned by fi_domain() + */ +int opal_common_ofi_fi_domain(struct fid_fabric *fabric, struct fi_info *info, + struct fid_domain **domain) +{ + int ret; + + OPAL_THREAD_LOCK(&opal_common_ofi_mutex); + + if (info->domain_attr->domain) { + *domain = info->domain_attr->domain; + opal_common_ofi.domain_ref_count++; + opal_output_verbose(1, opal_common_ofi.output, "Reusing existing domain: %s", + info->domain_attr->name); + } else { + ret = fi_domain(fabric, info, domain, NULL); + if (0 != ret) { + OPAL_THREAD_UNLOCK(&opal_common_ofi_mutex); + return ret; + } + opal_common_ofi.domain = *domain; + opal_common_ofi.domain_ref_count = 1; + } + + OPAL_THREAD_UNLOCK(&opal_common_ofi_mutex); + return OPAL_SUCCESS; +} + +/** + * Release fabric reference + * + * Decrements fabric reference count and closes fabric if count reaches zero.
+ * + * @param fabric (IN) Fabric object to release + * + * @return OPAL_SUCCESS or error code + */ +int opal_common_ofi_fabric_release(struct fid_fabric *fabric) +{ + int ret = OPAL_SUCCESS; + + OPAL_THREAD_LOCK(&opal_common_ofi_mutex); + + if (fabric == opal_common_ofi.fabric && opal_common_ofi.fabric_ref_count > 0) { + opal_common_ofi.fabric_ref_count--; + if (opal_common_ofi.fabric_ref_count == 0) { + ret = fi_close(&fabric->fid); + if (0 != ret) { + opal_output_verbose(1, opal_common_ofi.output, + "%s:%d: fi_close failed for fabric: %s (%d)", + __FILE__, __LINE__, fi_strerror(-ret), ret); + } + opal_common_ofi.fabric = NULL; + } + } else { + ret = fi_close(&fabric->fid); + if (0 != ret) { + opal_output_verbose(1, opal_common_ofi.output, + "%s:%d: fi_close failed for fabric: %s (%d)", + __FILE__, __LINE__, fi_strerror(-ret), ret); + } + } + + OPAL_THREAD_UNLOCK(&opal_common_ofi_mutex); + return ret; +} + +/** + * Release domain reference + * + * Decrements domain reference count and closes domain if count reaches zero. 
+ * + * @param domain (IN) Domain object to release + * + * @return OPAL_SUCCESS or error code + */ +int opal_common_ofi_domain_release(struct fid_domain *domain) +{ + int ret = OPAL_SUCCESS; + + OPAL_THREAD_LOCK(&opal_common_ofi_mutex); + + if (domain == opal_common_ofi.domain && opal_common_ofi.domain_ref_count > 0) { + opal_common_ofi.domain_ref_count--; + if (opal_common_ofi.domain_ref_count == 0) { + ret = fi_close(&domain->fid); + if (0 != ret) { + opal_output_verbose(1, opal_common_ofi.output, + "%s:%d: fi_close failed for domain: %s (%d)", + __FILE__, __LINE__, fi_strerror(-ret), ret); + } + opal_common_ofi.domain = NULL; + } + } else { + ret = fi_close(&domain->fid); + if (0 != ret) { + opal_output_verbose(1, opal_common_ofi.output, + "%s:%d: fi_close failed for domain: %s (%d)", + __FILE__, __LINE__, fi_strerror(-ret), ret); + } + } + + OPAL_THREAD_UNLOCK(&opal_common_ofi_mutex); + return ret; +} diff --git a/opal/mca/common/ofi/common_ofi.h b/opal/mca/common/ofi/common_ofi.h index 3deeb0c63ec..4357840604f 100644 --- a/opal/mca/common/ofi/common_ofi.h +++ b/opal/mca/common/ofi/common_ofi.h @@ -5,7 +5,7 @@ * reserved. * Copyright (c) 2020-2024 Triad National Security, LLC. All rights * reserved. - * Copyright (c) 2021 Amazon.com, Inc. or its affiliates. All rights + * Copyright (c) 2021-2025 Amazon.com, Inc. or its affiliates. All rights * reserved. * * $COPYRIGHT$ @@ -30,6 +30,10 @@ typedef struct opal_common_ofi_module { char **prov_include; char **prov_exclude; int output; + struct fid_fabric *fabric; + struct fid_domain *domain; + int fabric_ref_count; + int domain_ref_count; } opal_common_ofi_module_t; /** @@ -223,6 +227,57 @@ OPAL_DECLSPEC struct fi_info *opal_common_ofi_select_provider(struct fi_info *pr */ OPAL_DECLSPEC int opal_common_ofi_fi_getname(fid_t fid, void **addr, size_t *addrlen); +/** + * Get or create fabric object + * + * Reuses existing fabric from fabric_attr->fabric if available, + * otherwise creates new fabric using fi_fabric(). 
+ * + * @param fabric_attr (IN) Fabric attributes + * @param fabric (OUT) Fabric object (new or existing) + * + * @return OPAL_SUCCESS or error code + */ +OPAL_DECLSPEC int opal_common_ofi_fi_fabric(struct fi_fabric_attr *fabric_attr, + struct fid_fabric **fabric); + +/** + * Get or create domain object + * + * Reuses existing domain from info->domain_attr->domain if available, + * otherwise creates new domain using fi_domain(). + * + * @param fabric (IN) Fabric object + * @param info (IN) Provider info + * @param domain (OUT) Domain object (new or existing) + * + * @return OPAL_SUCCESS or error code + */ +OPAL_DECLSPEC int opal_common_ofi_fi_domain(struct fid_fabric *fabric, struct fi_info *info, + struct fid_domain **domain); + +/** + * Release fabric reference + * + * Decrements fabric reference count and closes fabric if count reaches zero. + * + * @param fabric (IN) Fabric object to release + * + * @return OPAL_SUCCESS or error code + */ +OPAL_DECLSPEC int opal_common_ofi_fabric_release(struct fid_fabric *fabric); + +/** + * Release domain reference + * + * Decrements domain reference count and closes domain if count reaches zero. + * + * @param domain (IN) Domain object to release + * + * @return OPAL_SUCCESS or error code + */ +OPAL_DECLSPEC int opal_common_ofi_domain_release(struct fid_domain *domain); + END_C_DECLS #endif /* OPAL_MCA_COMMON_OFI_H */