  * Copyright (c) 2011-2015 INRIA.  All rights reserved.
  * Copyright (c) 2012-2015 Bordeaux Polytechnic Institute
  * Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
- * Copyright (c) 2015-2016 Research Organization for Information Science
+ * Copyright (c) 2015-2017 Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
  * Copyright (c) 2016      Los Alamos National Security, LLC. All rights
  *                         reserved.
 #define MY_STRING_SIZE 64
 /*#define __DEBUG__ 1 */
 
-
+/**
+ * This function is an allreduce between all processes to detect oversubscription.
+ * On each node, local_procs is a different array that contains only the local
+ * processes. Thus, the local root computes the node's oversubscription and
+ * contributes this value to the operation, while every other process on the
+ * node contributes 0.
+ * Doing an allreduce might be overkill for this situation, but it should remain
+ * more scalable than a star reduction between the roots of each node
+ * (nodes_roots), followed by a bcast to all processes.
+ */
 static int check_oversubscribing(int rank,
                                  int num_nodes,
                                  int num_objs_in_node,
@@ -64,48 +73,13 @@ static int check_oversubscribing(int rank,
                                  int *local_procs,
                                  ompi_communicator_t *comm_old)
 {
-    int oversubscribed = 0;
-    int local_oversub = 0;
-    int err;
+    int oversubscribed = 0, local_oversub = 0, err;
 
+    /* Only a single process per node, the local root, computes the oversubscription condition */
     if (rank == local_procs[0])
         if (num_objs_in_node < num_procs_in_node)
             local_oversub = 1;
 
-    if (rank == 0) {
-        MPI_Request *reqs = (MPI_Request *)calloc(num_nodes-1, sizeof(MPI_Request));
-        int *oversub = (int *)calloc(num_nodes, sizeof(int));
-        int i;
-
-        oversub[0] = local_oversub;
-        for(i = 1; i < num_nodes; i++)
-            if (OMPI_SUCCESS != (err = MCA_PML_CALL(irecv(&oversub[i], 1, MPI_INT,
-                                                          nodes_roots[i], 111, comm_old, &reqs[i-1])))) {
-                /* NTH: more needs to be done to correctly clean up here */
-                free(reqs);
-                free(oversub);
-                return err;
-            }
-
-        if (OMPI_SUCCESS != (err = ompi_request_wait_all(num_nodes-1,
-                                                         reqs, MPI_STATUSES_IGNORE))) {
-            /* NTH: more needs to be done to correctly clean up here */
-            free(reqs);
-            free(oversub);
-            return err;
-        }
-
-        for(i = 0; i < num_nodes; i++)
-            oversubscribed += oversub[i];
-
-        free(oversub);
-        free(reqs);
-    } else {
-        if (rank == local_procs[0])
-            if (OMPI_SUCCESS != (err = MCA_PML_CALL(send(&local_oversub, 1, MPI_INT, 0,
-                                                         111, MCA_PML_BASE_SEND_STANDARD, comm_old))))
-                return err;
-    }
 
     if (OMPI_SUCCESS != (err = comm_old->c_coll->coll_bcast(&oversubscribed, 1,
                                                             MPI_INT, 0, comm_old,
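
Note: the new doc comment above describes folding the old rank-0 gather into a single reduction to which only each node's local root contributes. A minimal sketch of that pattern, written against the plain MPI API rather than OMPI's internal coll framework (the function and parameter names here are illustrative, not the module's actual code):

```c
#include <mpi.h>

/* Sketch only: every rank calls this, but only the node's local root
 * contributes a non-zero flag.  The MPI_SUM allreduce therefore yields the
 * number of oversubscribed nodes, and every rank learns the result without
 * a separate bcast. */
static int node_is_oversubscribed(int rank, int local_root,
                                  int num_objs_in_node, int num_procs_in_node,
                                  MPI_Comm comm)
{
    int local_oversub = 0, oversubscribed = 0;

    if (rank == local_root && num_objs_in_node < num_procs_in_node)
        local_oversub = 1;

    MPI_Allreduce(&local_oversub, &oversubscribed, 1, MPI_INT, MPI_SUM, comm);
    return oversubscribed;
}
```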
@@ -163,7 +137,7 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
     int num_procs_in_node = 0;
     int rank, size;
     int hwloc_err;
-    int oversubscribing_objs = 0;
+    int oversubscribing_objs = 0, oversubscribed_pus = 0;
     int i, j, idx;
     uint32_t val, *pval;
 
@@ -269,8 +243,12 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
     hwloc_get_cpubind(opal_hwloc_topology, set, 0);
     num_pus_in_node = hwloc_get_nbobjs_by_type(opal_hwloc_topology, HWLOC_OBJ_PU);
 
-    if (hwloc_bitmap_isincluded(root_obj->cpuset, set)){
-        /* processes are not bound on the machine */
+    /**
+     * In all situations (including heterogeneous environments) all processes must
+     * execute all the calls that involve collective communications, so we have to
+     * lay out the logic accordingly.
+     */
+    if (hwloc_bitmap_isincluded(root_obj->cpuset, set)) { /* processes are not bound on the machine */
 #ifdef __DEBUG__
         if (0 == rank)
             fprintf(stdout,">>>>>>>>>>>>> Process Not bound <<<<<<<<<<<<<<<\n");
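
Note: the binding test in this hunk relies on the fact that an unbound process reports a cpubind set that covers the whole machine. A self-contained sketch of that check, assuming a locally loaded topology instead of opal_hwloc_topology:

```c
#include <hwloc.h>

/* Returns 1 if the calling process is bound to a subset of the machine,
 * 0 otherwise.  If the machine's complete cpuset (root->cpuset) is included
 * in the set the process is currently bound to, the process may run
 * anywhere, i.e. it is effectively not bound. */
static int process_is_bound(hwloc_topology_t topo)
{
    hwloc_obj_t root = hwloc_get_root_obj(topo);
    hwloc_cpuset_t set = hwloc_bitmap_alloc();
    int bound;

    hwloc_get_cpubind(topo, set, 0);
    bound = !hwloc_bitmap_isincluded(root->cpuset, set);

    hwloc_bitmap_free(set);
    return bound;
}
```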
@@ -285,60 +263,70 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
         oversubscribing_objs = check_oversubscribing(rank, num_nodes,
                                                      num_objs_in_node, num_procs_in_node,
                                                      nodes_roots, local_procs, comm_old);
-        if (oversubscribing_objs) {
+    } else {  /* the processes are already bound */
+        object = hwloc_get_obj_covering_cpuset(opal_hwloc_topology, set);
+        obj_rank = object->logical_index;
+        effective_depth = object->depth;
+        num_objs_in_node = hwloc_get_nbobjs_by_depth(opal_hwloc_topology, effective_depth);
+
+        /* Check for oversubscribing */
+        oversubscribing_objs = check_oversubscribing(rank, num_nodes,
+                                                     num_objs_in_node, num_procs_in_node,
+                                                     nodes_roots, local_procs, comm_old);
+    }
+
+    if (oversubscribing_objs) {
+        if (hwloc_bitmap_isincluded(root_obj->cpuset, set)) { /* processes are not bound on the machine */
 #ifdef __DEBUG__
             fprintf(stdout,"Oversubscribing OBJ/CORES resources => Trying to use PUs \n");
 #endif
-            int oversubscribed_pus = check_oversubscribing(rank, num_nodes,
-                                                           num_pus_in_node, num_procs_in_node,
-                                                           nodes_roots, local_procs, comm_old);
-            if (oversubscribed_pus){
-#ifdef __DEBUG__
-                fprintf(stdout,"Oversubscribing PUs resources => Rank Reordering Impossible \n");
-#endif
-                FALLBACK();
-            } else {
+            oversubscribed_pus = check_oversubscribing(rank, num_nodes,
+                                                       num_pus_in_node, num_procs_in_node,
+                                                       nodes_roots, local_procs, comm_old);
+        } else {
+            /* Bound processes will participate with the same data as before */
+            oversubscribed_pus = check_oversubscribing(rank, num_nodes,
+                                                       num_objs_in_node, num_procs_in_node,
+                                                       nodes_roots, local_procs, comm_old);
+        }
+        if (!oversubscribed_pus) {
+            /* Update the data used to compute the correct binding */
+            if (hwloc_bitmap_isincluded(root_obj->cpuset, set)) { /* processes are not bound on the machine */
                 obj_rank = ompi_process_info.my_local_rank % num_pus_in_node;
                 effective_depth = hwloc_topology_get_depth(opal_hwloc_topology) - 1;
                 num_objs_in_node = num_pus_in_node;
 #ifdef __DEBUG__
                 fprintf(stdout,"Process not bound : binding on PU#%i \n", obj_rank);
 #endif
             }
-        } else {
-            obj_rank = ompi_process_info.my_local_rank % num_objs_in_node;
-            effective_depth = depth;
-            object = hwloc_get_obj_by_depth(opal_hwloc_topology, effective_depth, obj_rank);
-            if (NULL == object) FALLBACK();
-
-            hwloc_bitmap_copy(set, object->cpuset);
-            hwloc_bitmap_singlify(set); /* we don't want the process to move */
-            hwloc_err = hwloc_set_cpubind(opal_hwloc_topology, set, 0);
-            if (-1 == hwloc_err) FALLBACK();
-#ifdef __DEBUG__
-            fprintf(stdout,"Process not bound : binding on OBJ#%i \n", obj_rank);
-#endif
         }
-    } else { /* the processes are already bound */
-        object = hwloc_get_obj_covering_cpuset(opal_hwloc_topology, set);
-        obj_rank = object->logical_index;
-        effective_depth = object->depth;
-        num_objs_in_node = hwloc_get_nbobjs_by_depth(opal_hwloc_topology, effective_depth);
+    }
 
-        /* Check for oversubscribing */
-        oversubscribing_objs = check_oversubscribing(rank, num_nodes,
-                                                     num_objs_in_node, num_procs_in_node,
-                                                     nodes_roots, local_procs, comm_old);
-        if (oversubscribing_objs) {
+    if (!oversubscribing_objs && !oversubscribed_pus) {
+        if (hwloc_bitmap_isincluded(root_obj->cpuset, set)) { /* processes are not bound on the machine */
+            obj_rank = ompi_process_info.my_local_rank % num_objs_in_node;
+            effective_depth = depth;
+            object = hwloc_get_obj_by_depth(opal_hwloc_topology, effective_depth, obj_rank);
+            if (NULL == object) FALLBACK();
+
+            hwloc_bitmap_copy(set, object->cpuset);
+            hwloc_bitmap_singlify(set); /* we don't want the process to move */
+            hwloc_err = hwloc_set_cpubind(opal_hwloc_topology, set, 0);
+            if (-1 == hwloc_err) FALLBACK();
+#ifdef __DEBUG__
+            fprintf(stdout,"Process not bound : binding on OBJ#%i \n", obj_rank);
+#endif
+        } else {
 #ifdef __DEBUG__
-            fprintf(stdout,"Oversubscribing OBJ/CORES resources => Rank Reordering Impossible\n");
+            fprintf(stdout,"Process %i bound on OBJ #%i \n", rank, obj_rank);
+            fprintf(stdout,"=====> Num obj in node : %i | num pus in node : %i\n", num_objs_in_node, num_pus_in_node);
 #endif
-            FALLBACK();
         }
+    } else {
 #ifdef __DEBUG__
-        fprintf(stdout,"Process %i bound on OBJ #%i \n", rank, obj_rank);
-        fprintf(stdout,"=====> Num obj in node : %i | num pus in node : %i\n", num_objs_in_node, num_pus_in_node);
+        fprintf(stdout,"Oversubscribing PUs resources => Rank Reordering Impossible \n");
 #endif
+        FALLBACK();
     }
 
     reqs = (MPI_Request *)calloc(num_procs_in_node-1, sizeof(MPI_Request));
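
Note: when the process was not bound, the restructured code binds it by looking up the obj_rank-th object at the effective depth, copying and singlifying its cpuset, then calling hwloc_set_cpubind. A sketch of that sequence as a standalone helper (illustrative only; the module's FALLBACK() error path is replaced by a plain error return):

```c
#include <hwloc.h>

/* Bind the calling process to the idx-th object at the given depth.
 * Singlifying the copied cpuset keeps a single PU so the OS cannot
 * migrate the process afterwards.  Returns 0 on success, -1 on error. */
static int bind_to_obj_at_depth(hwloc_topology_t topo, int depth, int idx)
{
    hwloc_obj_t obj = hwloc_get_obj_by_depth(topo, depth, idx);
    hwloc_cpuset_t set;
    int err;

    if (NULL == obj)
        return -1;

    set = hwloc_bitmap_dup(obj->cpuset);
    hwloc_bitmap_singlify(set);             /* we don't want the process to move */
    err = hwloc_set_cpubind(topo, set, 0);  /* flags 0: bind the whole process */
    hwloc_bitmap_free(set);
    return err;
}
```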
@@ -493,7 +481,6 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
         for(i = 1; i < num_nodes; i++)
             displs[i] = displs[i-1] + objs_per_node[i-1];
 
-        memset(reqs, 0, (num_nodes-1)*sizeof(MPI_Request));
         memcpy(obj_mapping, obj_to_rank_in_comm, objs_per_node[0]*sizeof(int));
         for(i = 1; i < num_nodes; i++)
             if (OMPI_SUCCESS != (err = MCA_PML_CALL(irecv(obj_mapping+displs[i], objs_per_node[i], MPI_INT,
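
Note: the last hunk keeps the manual gather of the per-node object mappings, with rank 0 computing displacements as prefix sums of objs_per_node and posting one irecv per remote node root. Functionally this is a gatherv over the node roots; a sketch of the equivalent collective, assuming a hypothetical roots_comm communicator that contains exactly one root process per node:

```c
#include <stdlib.h>
#include <mpi.h>

/* Illustrative equivalent of the displacement bookkeeping plus the irecv
 * loop above: gather every node's local object-to-rank array into one
 * global obj_mapping on the root of roots_comm. */
static int gather_obj_mapping(const int *local_map, int local_count,
                              const int *objs_per_node, int num_nodes,
                              int *obj_mapping, MPI_Comm roots_comm)
{
    int i, err;
    int *displs = (int *)malloc(num_nodes * sizeof(int));

    displs[0] = 0;
    for (i = 1; i < num_nodes; i++)   /* same prefix sums as above */
        displs[i] = displs[i - 1] + objs_per_node[i - 1];

    err = MPI_Gatherv(local_map, local_count, MPI_INT,
                      obj_mapping, objs_per_node, displs, MPI_INT,
                      0, roots_comm);
    free(displs);
    return err;
}
```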