Skip to content

Commit dbc3f17

Browse files
author
Rolf vandeVaart
committed
Add some verbosity to help debug hwloc issues
(cherry picked from commit open-mpi/ompi@2e64a69) Conflicts: opal/mca/btl/openib/btl_openib_component.c
1 parent 7cd3cae commit dbc3f17

File tree

1 file changed

+17
-0
lines changed

1 file changed

+17
-0
lines changed

opal/mca/btl/openib/btl_openib_component.c

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2363,6 +2363,13 @@ static float get_ib_dev_distance(struct ibv_device *dev)
23632363
goto out;
23642364
}
23652365

2366+
opal_output_verbose(5, opal_btl_base_framework.framework_output,
2367+
"hwloc_distances->nbobjs=%d", hwloc_distances->nbobjs);
2368+
for (i = 0; i < (int)(2 * hwloc_distances->nbobjs); i++) {
2369+
opal_output_verbose(5, opal_btl_base_framework.framework_output,
2370+
"hwloc_distances->latency[%d]=%f", i, hwloc_distances->latency[i]);
2371+
}
2372+
23662373
/* If ibv_obj is a NUMA node or below, we're good. */
23672374
switch (ibv_obj->type) {
23682375
case HWLOC_OBJ_NODE:
@@ -2378,6 +2385,7 @@ static float get_ib_dev_distance(struct ibv_device *dev)
23782385
default:
23792386
/* If it's above a NUMA node, then I don't know how to compute
23802387
the distance... */
2388+
opal_output_verbose(5, opal_btl_base_framework.framework_output, "ibv_obj->type set to NULL");
23812389
ibv_obj = NULL;
23822390
break;
23832391
}
@@ -2387,6 +2395,8 @@ static float get_ib_dev_distance(struct ibv_device *dev)
23872395
goto out;
23882396
}
23892397

2398+
opal_output_verbose(5, opal_btl_base_framework.framework_output,
2399+
"ibv_obj->logical_index=%d", ibv_obj->logical_index);
23902400
/* This function is only called if the process is bound, so let's
23912401
find out where we are bound to. For the moment, we only care
23922402
about the NUMA node to which we are bound. */
@@ -2413,6 +2423,8 @@ static float get_ib_dev_distance(struct ibv_device *dev)
24132423
my_obj = my_obj->parent;
24142424
}
24152425
if (NULL != my_obj) {
2426+
opal_output_verbose(5, opal_btl_base_framework.framework_output,
2427+
"my_obj->logical_index=%d", my_obj->logical_index);
24162428
/* Distance may be asymmetrical, so calculate both of them
24172429
and take the max */
24182430
a = hwloc_distances->latency[my_obj->logical_index +
@@ -2472,6 +2484,8 @@ sort_devs_by_distance(struct ibv_device **ib_devs, int count)
24722484

24732485
for (i = 0; i < count; i++) {
24742486
devs[i].ib_dev = ib_devs[i];
2487+
opal_output_verbose(5, opal_btl_base_framework.framework_output,
2488+
"Checking distance from this process to device=%s", ibv_get_device_name(ib_devs[i]));
24752489
/* If we're not bound, just assume that the device is close. */
24762490
devs[i].distance = 0;
24772491
#if OPAL_HAVE_HWLOC
@@ -2481,6 +2495,9 @@ sort_devs_by_distance(struct ibv_device **ib_devs, int count)
24812495
devs[i].distance = get_ib_dev_distance(ib_devs[i]);
24822496
}
24832497
#endif
2498+
opal_output_verbose(5, opal_btl_base_framework.framework_output,
2499+
"Process is %s: distance to device is %f",
2500+
(opal_process_info.cpuset ? "bound" : "not bound"), devs[i].distance);
24842501
}
24852502

24862503
qsort(devs, count, sizeof(struct dev_distance), compare_distance);

0 commit comments

Comments
 (0)