@@ -304,15 +304,15 @@ static int rank_by(orte_job_t *jdata,
304304{
305305 orte_app_context_t * app ;
306306 hwloc_obj_t obj ;
307- int num_objs , i , j , m , n , rc ;
307+ int num_objs , i , j , m , n , rc , nn ;
308308 orte_vpid_t num_ranked = 0 ;
309309 orte_node_t * node ;
310310 orte_proc_t * proc , * pptr ;
311- orte_vpid_t vpid ;
311+ orte_vpid_t vpid , np ;
312312 int cnt ;
313313 opal_pointer_array_t objs ;
314- bool all_done ;
315314 hwloc_obj_t locale ;
315+ orte_app_idx_t napp ;
316316
317317 if (ORTE_RANKING_SPAN & ORTE_GET_RANKING_DIRECTIVE (jdata -> map -> ranking )) {
318318 return rank_span (jdata , target , cache_level );
@@ -333,20 +333,21 @@ static int rank_by(orte_job_t *jdata,
333333 */
334334
335335 vpid = 0 ;
336- for (n = 0 ; n < jdata -> apps -> size ; n ++ ) {
336+ for (n = 0 , napp = 0 ; napp < jdata -> num_apps && n < jdata -> apps -> size ; n ++ ) {
337337 if (NULL == (app = (orte_app_context_t * )opal_pointer_array_get_item (jdata -> apps , n ))) {
338338 continue ;
339339 }
340-
340+ napp ++ ;
341341 /* setup the pointer array */
342342 OBJ_CONSTRUCT (& objs , opal_pointer_array_t );
343343 opal_pointer_array_init (& objs , 2 , INT_MAX , 2 );
344344
345345 cnt = 0 ;
346- for (m = 0 ; m < jdata -> map -> nodes -> size ; m ++ ) {
346+ for (m = 0 , nn = 0 ; nn < jdata -> map -> num_nodes && m < jdata -> map -> nodes -> size ; m ++ ) {
347347 if (NULL == (node = (orte_node_t * )opal_pointer_array_get_item (jdata -> map -> nodes , m ))) {
348348 continue ;
349349 }
350+ nn ++ ;
350351
351352 /* get the number of objects - only consider those we can actually use */
352353 num_objs = opal_hwloc_base_get_nbobjs_by_type (node -> topology -> topo , target ,
@@ -376,13 +377,20 @@ static int rank_by(orte_job_t *jdata,
376377 * Perhaps someday someone will come up with a more efficient
377378 * algorithm, but this works for now.
378379 */
379- all_done = false;
380- while (!all_done && cnt < app -> num_procs ) {
381- all_done = true;
382- for (j = 0 ; j < node -> procs -> size && cnt < app -> num_procs ; j ++ ) {
380+ i = 0 ;
381+ while (cnt < app -> num_procs ) {
382+ /* get the next object */
383+ obj = (hwloc_obj_t )opal_pointer_array_get_item (& objs , i );
384+ if (NULL == obj ) {
385+ break ;
386+ }
387+ /* scan across the procs and find the one that is on this object */
388+ np = 0 ;
389+ for (j = 0 ; np < node -> num_procs && j < node -> procs -> size && cnt < app -> num_procs ; j ++ ) {
383390 if (NULL == (proc = (orte_proc_t * )opal_pointer_array_get_item (node -> procs , j ))) {
384391 continue ;
385392 }
393+ np ++ ;
386394 /* ignore procs from other jobs */
387395 if (proc -> name .jobid != jdata -> jobid ) {
388396 opal_output_verbose (5 , orte_rmaps_base_framework .framework_output ,
@@ -404,53 +412,48 @@ static int rank_by(orte_job_t *jdata,
404412 ORTE_NAME_PRINT (& proc -> name ), num_ranked );
405413 continue ;
406414 }
407- /* cycle across the objects */
408- for (i = 0 ; i < num_objs && cnt < app -> num_procs && all_done ; i ++ ) {
409- obj = (hwloc_obj_t )opal_pointer_array_get_item (& objs , i );
410- /* protect against bozo case */
411- locale = NULL ;
412- if (!orte_get_attribute (& proc -> attributes , ORTE_PROC_HWLOC_LOCALE , (void * * )& locale , OPAL_PTR )) {
413- ORTE_ERROR_LOG (ORTE_ERROR );
414- return ORTE_ERROR ;
415- }
416- /* ignore procs not on this object */
417- if (NULL == locale ||
418- !hwloc_bitmap_intersects (obj -> cpuset , locale -> cpuset )) {
419- opal_output_verbose (5 , orte_rmaps_base_framework .framework_output ,
420- "mca:rmaps:rank_by: proc at position %d is not on object %d" ,
421- j , i );
422- continue ;
423- }
424- /* assign the vpid */
425- proc -> name .vpid = vpid ++ ;
426- if (0 == cnt ) {
427- app -> first_rank = proc -> name .vpid ;
428- }
429- cnt ++ ;
415+ /* protect against bozo case */
416+ locale = NULL ;
417+ if (!orte_get_attribute (& proc -> attributes , ORTE_PROC_HWLOC_LOCALE , (void * * )& locale , OPAL_PTR )) {
418+ ORTE_ERROR_LOG (ORTE_ERROR );
419+ return ORTE_ERROR ;
420+ }
421+ /* ignore procs not on this object */
422+ if (NULL == locale ||
423+ !hwloc_bitmap_intersects (obj -> cpuset , locale -> cpuset )) {
430424 opal_output_verbose (5 , orte_rmaps_base_framework .framework_output ,
431- "mca:rmaps:rank_by: proc in position %d is on object %d assigned rank %s" ,
432- j , i , ORTE_VPID_PRINT (proc -> name .vpid ));
433- /* insert the proc into the jdata array */
434- if (NULL != (pptr = (orte_proc_t * )opal_pointer_array_get_item (jdata -> procs , proc -> name .vpid ))) {
435- OBJ_RELEASE (pptr );
436- }
437- OBJ_RETAIN (proc );
438- if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item (jdata -> procs , proc -> name .vpid , proc ))) {
439- ORTE_ERROR_LOG (rc );
440- OBJ_DESTRUCT (& objs );
441- return rc ;
442- }
443- num_ranked ++ ;
444- /* flag that one was mapped */
445- all_done = false;
446- /* track where the highest vpid landed - this is our
447- * new bookmark
448- */
449- jdata -> bookmark = node ;
450- /* move to next object */
451- break ;
425+ "mca:rmaps:rank_by: proc at position %d is not on object %d" ,
426+ j , i );
427+ continue ;
428+ }
429+ /* assign the vpid */
430+ proc -> name .vpid = vpid ++ ;
431+ if (0 == cnt ) {
432+ app -> first_rank = proc -> name .vpid ;
452433 }
434+ cnt ++ ;
435+ opal_output_verbose (5 , orte_rmaps_base_framework .framework_output ,
436+ "mca:rmaps:rank_by: proc in position %d is on object %d assigned rank %s" ,
437+ j , i , ORTE_VPID_PRINT (proc -> name .vpid ));
438+ /* insert the proc into the jdata array */
439+ if (NULL != (pptr = (orte_proc_t * )opal_pointer_array_get_item (jdata -> procs , proc -> name .vpid ))) {
440+ OBJ_RELEASE (pptr );
441+ }
442+ OBJ_RETAIN (proc );
443+ if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item (jdata -> procs , proc -> name .vpid , proc ))) {
444+ ORTE_ERROR_LOG (rc );
445+ OBJ_DESTRUCT (& objs );
446+ return rc ;
447+ }
448+ num_ranked ++ ;
449+ /* track where the highest vpid landed - this is our
450+ * new bookmark
451+ */
452+ jdata -> bookmark = node ;
453+ /* move to next object */
454+ break ;
453455 }
456+ i ++ ;
454457 }
455458 }
456459 /* cleanup */
@@ -474,6 +477,9 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
474477
475478 map = jdata -> map ;
476479
480+ opal_output_verbose (5 , orte_rmaps_base_framework .framework_output ,
481+ "RANKING POLICY: %s" , orte_rmaps_base_print_ranking (map -> ranking ));
482+
477483 /* start with the rank-by object options - if the object isn't
478484 * included in the topology, then we obviously cannot rank by it.
479485 * However, if this was the default ranking policy (as opposed to
0 commit comments