Skip to content

Commit f39ce67

Browse files
author
Ralph Castain
authored
Merge pull request #3951 from rhc54/topic/hwloc2
Update to hwloc 2.0.0a
2 parents 69612b3 + 6ebaed8 commit f39ce67

File tree

148 files changed

+23773
-19374
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

148 files changed

+23773
-19374
lines changed

.gitignore

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -303,9 +303,7 @@ opal/mca/event/libevent*/libevent/libevent_pthreads.pc
303303
opal/mca/event/libevent*/libevent/include/event2/event-config.h
304304

305305
opal/mca/hwloc/hwloc*/hwloc/include/hwloc/autogen/config.h
306-
opal/mca/hwloc/hwloc*/hwloc/include/hwloc/autogen/config.h.in
307306
opal/mca/hwloc/hwloc*/hwloc/include/private/autogen/config.h
308-
opal/mca/hwloc/hwloc*/hwloc/include/private/autogen/config.h.in
309307
opal/mca/hwloc/base/static-components.h.new.extern
310308
opal/mca/hwloc/base/static-components.h.new.struct
311309

@@ -362,6 +360,7 @@ orte/test/mpi/accept
362360
orte/test/mpi/attach
363361
orte/test/mpi/bad_exit
364362
orte/test/mpi/bcast_loop
363+
orte/test/mpi/binding
365364
orte/test/mpi/concurrent_spawn
366365
orte/test/mpi/connect
367366
orte/test/mpi/crisscross

ompi/mca/osc/rdma/osc_rdma_active_target.c

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
* Copyright (c) 2017 The University of Tennessee and The University
1717
* of Tennessee Research Foundation. All rights
1818
* reserved.
19+
* Copyright (c) 2017 Intel, Inc. All rights reserved.
1920
* $COPYRIGHT$
2021
*
2122
* Additional copyrights may follow
@@ -242,10 +243,6 @@ int ompi_osc_rdma_post_atomic (ompi_group_t *group, int assert, ompi_win_t *win)
242243
return OMPI_SUCCESS;
243244
}
244245

245-
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
246-
return OMPI_ERR_OUT_OF_RESOURCE;
247-
}
248-
249246
/* translate group ranks into the communicator */
250247
peers = ompi_osc_rdma_get_peers (module, module->pw_group);
251248
if (OPAL_UNLIKELY(NULL == peers)) {
@@ -281,7 +278,7 @@ int ompi_osc_rdma_post_atomic (ompi_group_t *group, int assert, ompi_win_t *win)
281278
do {
282279
ompi_osc_rdma_lock_t result;
283280

284-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "attempting to post to index %d @ rank %d", post_index, peer->rank);
281+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "attempting to post to index %d @ rank %d", (int)post_index, peer->rank);
285282

286283
/* try to post. if the value isn't 0 then another rank is occupying this index */
287284
if (!ompi_osc_rdma_peer_local_state (peer)) {

opal/mca/btl/openib/btl_openib_component.c

Lines changed: 118 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
* Copyright (c) 2009-2012 Oracle and/or its affiliates. All rights reserved.
1919
* Copyright (c) 2011-2015 NVIDIA Corporation. All rights reserved.
2020
* Copyright (c) 2012 Oak Ridge National Laboratory. All rights reserved
21-
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
21+
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
2222
* Copyright (c) 2014-2017 Research Organization for Information Science
2323
* and Technology (RIST). All rights reserved.
2424
* Copyright (c) 2014 Bull SAS. All rights reserved.
@@ -2330,32 +2330,41 @@ static float get_ib_dev_distance(struct ibv_device *dev)
23302330
/* If we don't have hwloc, we'll default to a distance of 0,
23312331
because we have no way of measuring. */
23322332
float distance = 0;
2333+
float a, b;
2334+
int i;
2335+
hwloc_cpuset_t my_cpuset = NULL, ibv_cpuset = NULL;
2336+
hwloc_obj_t my_obj, ibv_obj, node_obj;
2337+
struct hwloc_distances_s *hwloc_distances = NULL;
23332338

2334-
#if HWLOC_API_VERSION < 0x20000
23352339
/* Override any distance logic so all devices are used */
23362340
if (0 != mca_btl_openib_component.ignore_locality ||
23372341
OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
23382342
return distance;
23392343
}
23402344

2341-
float a, b;
2342-
int i;
2343-
hwloc_cpuset_t my_cpuset = NULL, ibv_cpuset = NULL;
2344-
hwloc_obj_t my_obj, ibv_obj, node_obj;
2345-
2346-
/* Note that this struct is owned by hwloc; there's no need to
2347-
free it at the end of time */
2348-
static const struct hwloc_distances_s *hwloc_distances = NULL;
2345+
#if HWLOC_API_VERSION >= 0x20000
2346+
unsigned int j, distances_nr = 1;
2347+
int ibvindex, myindex;
2348+
#endif
23492349

23502350
if (NULL == hwloc_distances) {
2351-
hwloc_distances =
2352-
hwloc_get_whole_distance_matrix_by_type(opal_hwloc_topology,
2353-
HWLOC_OBJ_NODE);
2354-
}
2351+
#if HWLOC_API_VERSION < 0x20000
2352+
hwloc_distances =
2353+
hwloc_get_whole_distance_matrix_by_type(opal_hwloc_topology,
2354+
HWLOC_OBJ_NODE);
2355+
/* If we got no info, just return 0 */
2356+
if (NULL == hwloc_distances || NULL == hwloc_distances->latency) {
2357+
goto out;
2358+
}
23552359

2356-
/* If we got no info, just return 0 */
2357-
if (NULL == hwloc_distances || NULL == hwloc_distances->latency) {
2358-
goto out;
2360+
#else
2361+
if (0 != hwloc_distances_get_by_type(opal_hwloc_topology, HWLOC_OBJ_NODE,
2362+
&distances_nr, &hwloc_distances,
2363+
HWLOC_DISTANCES_KIND_MEANS_LATENCY, 0) || 0 == distances_nr) {
2364+
hwloc_distances = NULL;
2365+
goto out;
2366+
}
2367+
#endif
23592368
}
23602369

23612370
/* Next, find the NUMA node where this IBV device is located */
@@ -2373,16 +2382,31 @@ static float get_ib_dev_distance(struct ibv_device *dev)
23732382

23742383
opal_output_verbose(5, opal_btl_base_framework.framework_output,
23752384
"hwloc_distances->nbobjs=%d", hwloc_distances->nbobjs);
2385+
#if HWLOC_API_VERSION < 0x20000
23762386
for (i = 0; i < (int)(2 * hwloc_distances->nbobjs); i++) {
23772387
opal_output_verbose(5, opal_btl_base_framework.framework_output,
23782388
"hwloc_distances->latency[%d]=%f", i, hwloc_distances->latency[i]);
23792389
}
2390+
#else
2391+
for (i = 0; i < (int)hwloc_distances->nbobjs; i++) {
2392+
opal_output_verbose(5, opal_btl_base_framework.framework_output,
2393+
"hwloc_distances->values[%d]=%"PRIu64, i, hwloc_distances->values[i]);
2394+
}
2395+
#endif
23802396

23812397
/* If ibv_obj is a NUMA node or below, we're good. */
23822398
switch (ibv_obj->type) {
23832399
case HWLOC_OBJ_NODE:
23842400
case HWLOC_OBJ_SOCKET:
2401+
#if HWLOC_API_VERSION < 0x20000
23852402
case HWLOC_OBJ_CACHE:
2403+
#else
2404+
case HWLOC_OBJ_L1CACHE:
2405+
case HWLOC_OBJ_L2CACHE:
2406+
case HWLOC_OBJ_L3CACHE:
2407+
case HWLOC_OBJ_L4CACHE:
2408+
case HWLOC_OBJ_L5CACHE:
2409+
#endif
23862410
case HWLOC_OBJ_CORE:
23872411
case HWLOC_OBJ_PU:
23882412
while (NULL != ibv_obj && ibv_obj->type != HWLOC_OBJ_NODE) {
@@ -2402,6 +2426,22 @@ static float get_ib_dev_distance(struct ibv_device *dev)
24022426
if (NULL == ibv_obj) {
24032427
goto out;
24042428
}
2429+
#if HWLOC_API_VERSION >= 0x20000
2430+
/* the new matrix format isn't quite as friendly, so we have to
2431+
* do an exhaustive search to find the index of this object
2432+
* in that array */
2433+
ibvindex = -1;
2434+
for (j=0; j < distances_nr; j++) {
2435+
if (ibv_obj == hwloc_distances->objs[j]) {
2436+
ibvindex = j;
2437+
break;
2438+
}
2439+
}
2440+
if (-1 == ibvindex) {
2441+
OPAL_ERROR_LOG(OPAL_ERR_NOT_FOUND);
2442+
goto out;
2443+
}
2444+
#endif
24052445

24062446
opal_output_verbose(5, opal_btl_base_framework.framework_output,
24072447
"ibv_obj->logical_index=%d", ibv_obj->logical_index);
@@ -2424,7 +2464,15 @@ static float get_ib_dev_distance(struct ibv_device *dev)
24242464
switch (my_obj->type) {
24252465
case HWLOC_OBJ_NODE:
24262466
case HWLOC_OBJ_SOCKET:
2427-
case HWLOC_OBJ_CACHE:
2467+
#if HWLOC_API_VERSION < 0x20000
2468+
case HWLOC_OBJ_CACHE:
2469+
#else
2470+
case HWLOC_OBJ_L1CACHE:
2471+
case HWLOC_OBJ_L2CACHE:
2472+
case HWLOC_OBJ_L3CACHE:
2473+
case HWLOC_OBJ_L4CACHE:
2474+
case HWLOC_OBJ_L5CACHE:
2475+
#endif
24282476
case HWLOC_OBJ_CORE:
24292477
case HWLOC_OBJ_PU:
24302478
while (NULL != my_obj && my_obj->type != HWLOC_OBJ_NODE) {
@@ -2435,12 +2483,31 @@ static float get_ib_dev_distance(struct ibv_device *dev)
24352483
"my_obj->logical_index=%d", my_obj->logical_index);
24362484
/* Distance may be asymmetrical, so calculate both of them
24372485
and take the max */
2438-
a = hwloc_distances->latency[my_obj->logical_index +
2439-
(ibv_obj->logical_index *
2440-
hwloc_distances->nbobjs)];
2441-
b = hwloc_distances->latency[ibv_obj->logical_index +
2442-
(my_obj->logical_index *
2443-
hwloc_distances->nbobjs)];
2486+
#if HWLOC_API_VERSION < 0x20000
2487+
a = hwloc_distances->latency[my_obj->logical_index +
2488+
(ibv_obj->logical_index *
2489+
hwloc_distances->nbobjs)];
2490+
b = hwloc_distances->latency[ibv_obj->logical_index +
2491+
(my_obj->logical_index *
2492+
hwloc_distances->nbobjs)];
2493+
#else
2494+
/* the new matrix format isn't quite as friendly, so we have to
2495+
* do an exhaustive search to find the index of this object
2496+
* in that array */
2497+
myindex = -1;
2498+
for (j=0; j < distances_nr; j++) {
2499+
if (my_obj == hwloc_distances->objs[j]) {
2500+
myindex = j;
2501+
break;
2502+
}
2503+
}
2504+
if (-1 == myindex) {
2505+
OPAL_ERROR_LOG(OPAL_ERR_NOT_FOUND);
2506+
goto out;
2507+
}
2508+
a = (float)hwloc_distances->values[myindex + (ibvindex * hwloc_distances->nbobjs)];
2509+
b = (float)hwloc_distances->values[ibvindex + (myindex * hwloc_distances->nbobjs)];
2510+
#endif
24442511
distance = (a > b) ? a : b;
24452512
}
24462513
break;
@@ -2456,13 +2523,28 @@ static float get_ib_dev_distance(struct ibv_device *dev)
24562523
node_obj = hwloc_get_obj_inside_cpuset_by_type(opal_hwloc_topology,
24572524
ibv_obj->cpuset,
24582525
HWLOC_OBJ_NODE, ++i)) {
2459-
2460-
a = hwloc_distances->latency[node_obj->logical_index +
2461-
(ibv_obj->logical_index *
2462-
hwloc_distances->nbobjs)];
2463-
b = hwloc_distances->latency[ibv_obj->logical_index +
2464-
(node_obj->logical_index *
2465-
hwloc_distances->nbobjs)];
2526+
#if HWLOC_API_VERSION < 0x20000
2527+
a = hwloc_distances->latency[node_obj->logical_index +
2528+
(ibv_obj->logical_index *
2529+
hwloc_distances->nbobjs)];
2530+
b = hwloc_distances->latency[ibv_obj->logical_index +
2531+
(node_obj->logical_index *
2532+
hwloc_distances->nbobjs)];
2533+
#else
2534+
unsigned int j;
2535+
j = node_obj->logical_index + (ibv_obj->logical_index * hwloc_distances->nbobjs);
2536+
if (j < distances_nr) {
2537+
a = (float)hwloc_distances->values[j];
2538+
} else {
2539+
goto out;
2540+
}
2541+
j = ibv_obj->logical_index + (node_obj->logical_index * hwloc_distances->nbobjs);
2542+
if (j < distances_nr) {
2543+
b = (float)hwloc_distances->values[j];
2544+
} else {
2545+
goto out;
2546+
}
2547+
#endif
24662548
a = (a > b) ? a : b;
24672549
distance = (a > distance) ? a : distance;
24682550
}
@@ -2476,10 +2558,12 @@ static float get_ib_dev_distance(struct ibv_device *dev)
24762558
if (NULL != my_cpuset) {
24772559
hwloc_bitmap_free(my_cpuset);
24782560
}
2479-
#else
2480-
#warning FIXME get_ib_dev_distance is not implemented with hwloc v2
2481-
#endif
24822561

2562+
#if HWLOC_API_VERSION < 0x20000
2563+
if (NULL != hwloc_distances) {
2564+
hwloc_distances_release(opal_hwloc_topology, hwloc_distances);
2565+
}
2566+
#endif
24832567
return distance;
24842568
}
24852569

opal/mca/btl/openib/btl_openib_proc.c

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
* All rights reserved.
1313
* Copyright (c) 2007-2015 Cisco Systems, Inc. All rights reserved.
1414
* Copyright (c) 2006-2007 Voltaire All rights reserved.
15-
* Copyright (c) 2014 Intel, Inc. All rights reserved.
15+
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
1616
* Copyright (c) 2015-2016 Research Organization for Information Science
1717
* and Technology (RIST). All rights reserved.
1818
* Copyright (c) 2015 Mellanox Technologies. All rights reserved.
@@ -77,8 +77,6 @@ void mca_btl_openib_proc_construct(mca_btl_openib_proc_t* ib_proc)
7777

7878
void mca_btl_openib_proc_destruct(mca_btl_openib_proc_t* ib_proc)
7979
{
80-
mca_btl_openib_proc_btlptr_t* elem;
81-
8280
/* release resources */
8381
if(NULL != ib_proc->proc_endpoints) {
8482
free(ib_proc->proc_endpoints);

opal/mca/btl/usnic/btl_usnic_hwloc.c

Lines changed: 52 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
* Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved.
3-
* Copyright (c) 2016 Intel, Inc. All rights reserved.
3+
* Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
44
* $COPYRIGHT$
55
*
66
* Additional copyrights may follow
@@ -26,22 +26,34 @@
2626
*/
2727
static hwloc_obj_t my_numa_node = NULL;
2828
static int num_numa_nodes = 0;
29-
static const struct hwloc_distances_s *matrix = NULL;
29+
static struct hwloc_distances_s *matrix = NULL;
30+
#if HWLOC_API_VERSION >= 0x20000
31+
static unsigned int matrix_nr = 1;
32+
#endif
3033

3134
/*
3235
* Get the hwloc distance matrix (if we don't already have it).
33-
*
34-
* Note that the matrix data structure belongs to hwloc; we are not
35-
* responsibile for freeing it.
3636
*/
3737
static int get_distance_matrix(void)
3838
{
39+
#if HWLOC_API_VERSION < 0x20000
40+
/* Note that the matrix data structure belongs to hwloc; we are not
41+
* responsible for freeing it. */
42+
3943
if (NULL == matrix) {
4044
matrix = hwloc_get_whole_distance_matrix_by_type(opal_hwloc_topology,
4145
HWLOC_OBJ_NODE);
4246
}
4347

4448
return (NULL == matrix) ? OPAL_ERROR : OPAL_SUCCESS;
49+
#else
50+
if (0 != hwloc_distances_get_by_type(opal_hwloc_topology, HWLOC_OBJ_NODE,
51+
&matrix_nr, &matrix,
52+
HWLOC_DISTANCES_KIND_MEANS_LATENCY, 0) || 0 == matrix_nr) {
53+
return OPAL_ERROR;
54+
}
55+
return OPAL_SUCCESS;
56+
#endif
4557
}
4658

4759
/*
@@ -219,6 +231,7 @@ int opal_btl_usnic_hwloc_distance(opal_btl_usnic_module_t *module)
219231

220232
/* Lookup the distance between my NUMA node and the NUMA node of
221233
the device */
234+
#if HWLOC_API_VERSION < 0x20000
222235
if (NULL != dev_numa) {
223236
module->numa_distance =
224237
matrix->latency[dev_numa->logical_index * num_numa_nodes +
@@ -229,6 +242,40 @@ int opal_btl_usnic_hwloc_distance(opal_btl_usnic_module_t *module)
229242
module->linux_device_name,
230243
module->numa_distance);
231244
}
245+
#else
246+
if (NULL != dev_numa) {
247+
int myindex, devindex;
248+
unsigned int j;
249+
myindex = -1;
250+
for (j=0; j < matrix_nr; j++) {
251+
if (my_numa_node == matrix->objs[j]) {
252+
myindex = j;
253+
break;
254+
}
255+
}
256+
if (-1 == myindex) {
257+
return OPAL_SUCCESS;
258+
}
259+
devindex = -1;
260+
for (j=0; j < matrix_nr; j++) {
261+
if (dev_numa == matrix->objs[j]) {
262+
devindex = j;
263+
break;
264+
}
265+
}
266+
if (-1 == devindex) {
267+
return OPAL_SUCCESS;
268+
}
269+
270+
module->numa_distance =
271+
matrix->values[(devindex * num_numa_nodes) + myindex];
272+
273+
opal_output_verbose(5, USNIC_OUT,
274+
"btl:usnic:filter_numa: %s is distance %d from me",
275+
module->linux_device_name,
276+
module->numa_distance);
277+
}
278+
#endif
232279

233280
return OPAL_SUCCESS;
234281
}

0 commit comments

Comments
 (0)