Skip to content

Commit 5ea8638

Browse files
authored
Merge pull request #11641 from rhc54/topic/ofi
Do not compute distances when unbound
2 parents 2311331 + 450c72d commit 5ea8638

File tree

3 files changed

+32
-13
lines changed

3 files changed

+32
-13
lines changed

opal/mca/common/ofi/common_ofi.c

Lines changed: 30 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
* Copyright (c) 2020-2022 Triad National Security, LLC. All rights
66
* reserved.
77
* Copyright (c) 2020-2021 Cisco Systems, Inc. All rights reserved.
8-
* Copyright (c) 2021 Nanook Consulting. All rights reserved.
8+
* Copyright (c) 2021-2023 Nanook Consulting. All rights reserved.
99
* Copyright (c) 2021 Amazon.com, Inc. or its affiliates. All rights
1010
* reserved.
1111
* Copyright (c) 2023 UT-Battelle, LLC. All rights reserved.
@@ -469,35 +469,42 @@ static int check_provider_attr(struct fi_info *provider_info, struct fi_info *pr
469469
static int compute_dev_distances(pmix_device_distance_t **distances,
470470
size_t *ndist)
471471
{
472-
int ret = 0;
472+
int ret = OPAL_SUCCESS;
473473
size_t ninfo;
474474
pmix_info_t *info;
475475
pmix_cpuset_t cpuset;
476-
pmix_topology_t *pmix_topo;
476+
pmix_topology_t pmix_topo = PMIX_TOPOLOGY_STATIC_INIT;
477477
pmix_device_type_t type = PMIX_DEVTYPE_OPENFABRICS |
478478
PMIX_DEVTYPE_NETWORK;
479479

480480
PMIX_CPUSET_CONSTRUCT(&cpuset);
481481
ret = PMIx_Get_cpuset(&cpuset, PMIX_CPUBIND_THREAD);
482482
if (PMIX_SUCCESS != ret) {
483+
/* we are not bound */
484+
ret = OPAL_ERR_NOT_BOUND;
483485
goto out;
484486
}
487+
/* if we are not bound, then we cannot compute distances */
488+
if (hwloc_bitmap_iszero(cpuset.bitmap) ||
489+
hwloc_bitmap_isfull(cpuset.bitmap)) {
490+
return OPAL_ERR_NOT_BOUND;
491+
}
485492

486-
/* load the PMIX topology */
487-
PMIx_Topology_free(pmix_topo, 1);
488-
ret = PMIx_Load_topology(pmix_topo);
493+
/* load the PMIX topology - this just loads a pointer to
494+
* the local topology held in PMIx, so you must not
495+
* free it */
496+
ret = PMIx_Load_topology(&pmix_topo);
489497
if (PMIX_SUCCESS != ret) {
490498
goto out;
491499
}
492500

493501
ninfo = 1;
494502
info = PMIx_Info_create(ninfo);
495503
PMIx_Info_load(&info[0], PMIX_DEVICE_TYPE, &type, PMIX_DEVTYPE);
496-
ret = PMIx_Compute_distances(pmix_topo, &cpuset, info, ninfo, distances,
504+
ret = PMIx_Compute_distances(&pmix_topo, &cpuset, info, ninfo, distances,
497505
ndist);
498506
PMIx_Info_free(info, ninfo);
499507

500-
PMIx_Topology_free(pmix_topo, 1);
501508
out:
502509
return ret;
503510
}
@@ -533,8 +540,9 @@ get_nearest_nics(int *num_distances, pmix_value_t **valin)
533540
PMIx_Info_destruct(&directive);
534541
if (ret != PMIX_SUCCESS || !val) {
535542
ret = compute_dev_distances(&distances, &ndist);
536-
if (ret)
543+
if (ret) {
537544
goto out;
545+
}
538546
goto find_nearest;
539547
}
540548

@@ -554,8 +562,9 @@ get_nearest_nics(int *num_distances, pmix_value_t **valin)
554562

555563
find_nearest:
556564
nearest = calloc(sizeof(*distances), ndist);
557-
if (!nearest)
565+
if (!nearest) {
558566
goto out;
567+
}
559568

560569
for (i = 0; i < ndist; i++) {
561570
if (distances[i].type != PMIX_DEVTYPE_NETWORK &&
@@ -596,6 +605,15 @@ get_nearest_nics(int *num_distances, pmix_value_t **valin)
596605
* distances array is not provided. False otherwise.
597606
*
598607
*/
608+
#if HWLOC_API_VERSION < 0x00020000
609+
static bool is_near(pmix_device_distance_t *distances,
610+
int num_distances,
611+
hwloc_topology_t topology,
612+
struct fi_pci_attr pci)
613+
{
614+
return true;
615+
}
616+
#else
599617
static bool is_near(pmix_device_distance_t *distances,
600618
int num_distances,
601619
hwloc_topology_t topology,
@@ -658,6 +676,7 @@ static bool is_near(pmix_device_distance_t *distances,
658676
return false;
659677
}
660678
#endif
679+
#endif // OPAL_OFI_PCI_DATA_AVAILABLE
661680

662681
/* Count providers returns the number of providers present in an fi_info list
663682
* @param (IN) provider_list struct fi_info* list of providers available
@@ -772,8 +791,8 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
772791
pmix_value_t *pmix_val;
773792
struct fi_pci_attr pci;
774793
int num_distances = 0;
775-
bool near;
776794
#endif
795+
bool near;
777796
int ret;
778797
unsigned int num_provider = 0, provider_limit = 0;
779798
bool provider_found = false;

0 commit comments

Comments
 (0)