Skip to content

Commit ec1a9a8

Browse files
authored
Merge pull request #4057 from edgargabriel/pr/performance-fixes-2
io/ompio: new aggregator selection algorithm
2 parents 0414c0c + 8fe1c63 commit ec1a9a8

File tree

8 files changed

+281
-29
lines changed

8 files changed

+281
-29
lines changed

ompi/communicator/comm_init.c

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
* University of Stuttgart. All rights reserved.
1111
* Copyright (c) 2004-2005 The Regents of the University of California.
1212
* All rights reserved.
13-
* Copyright (c) 2006-2010 University of Houston. All rights reserved.
13+
* Copyright (c) 2006-2017 University of Houston. All rights reserved.
1414
* Copyright (c) 2007-2012 Cisco Systems, Inc. All rights reserved.
1515
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
1616
* Copyright (c) 2012-2015 Los Alamos National Security, LLC.
@@ -35,6 +35,7 @@
3535

3636
#include "opal/util/bit_ops.h"
3737
#include "opal/util/info_subscriber.h"
38+
#include "opal/mca/pmix/pmix.h"
3839
#include "ompi/constants.h"
3940
#include "ompi/mca/pml/pml.h"
4041
#include "ompi/mca/coll/base/base.h"
@@ -150,6 +151,23 @@ int ompi_comm_init(void)
150151
because MPI_COMM_WORLD has some predefined attributes. */
151152
ompi_attr_hash_init(&ompi_mpi_comm_world.comm.c_keyhash);
152153

154+
/* Check for the binding policy used. We are only interested in
155+
whether mapby-node has been set right now (could be extended later)
156+
and only on MPI_COMM_WORLD, since for all other sub-communicators
157+
it is virtually impossible to identify their layout across nodes
158+
in the most generic sense. This is used by OMPIO for deciding which
159+
ranks to use for aggregators
160+
*/
161+
opal_process_name_t wildcard = {ORTE_PROC_MY_NAME->jobid, OPAL_VPID_WILDCARD};
162+
char *str=NULL;
163+
int rc;
164+
165+
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_MAPBY, &wildcard, &str, OPAL_STRING);
166+
if ( 0 == rc ) {
167+
if ( strstr ( str, "BYNODE") ) {
168+
OMPI_COMM_SET_MAPBY_NODE(&ompi_mpi_comm_world.comm);
169+
}
170+
}
153171
/* Setup MPI_COMM_SELF */
154172
OBJ_CONSTRUCT(&ompi_mpi_comm_self, ompi_communicator_t);
155173
assert(ompi_mpi_comm_self.comm.c_f_to_c_index == 1);

ompi/communicator/communicator.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
* Copyright (c) 2004-2005 The Regents of the University of California.
1212
* All rights reserved.
1313
* Copyright (c) 2006-2017 Cisco Systems, Inc. All rights reserved
14-
* Copyright (c) 2006-2010 University of Houston. All rights reserved.
14+
* Copyright (c) 2006-2017 University of Houston. All rights reserved.
1515
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
1616
* Copyright (c) 2011-2013 Inria. All rights reserved.
1717
* Copyright (c) 2011-2013 Universite Bordeaux 1
@@ -60,6 +60,7 @@ OMPI_DECLSPEC OBJ_CLASS_DECLARATION(ompi_communicator_t);
6060
#define OMPI_COMM_DIST_GRAPH 0x00000400
6161
#define OMPI_COMM_PML_ADDED 0x00001000
6262
#define OMPI_COMM_EXTRA_RETAIN 0x00004000
63+
#define OMPI_COMM_MAPBY_NODE 0x00008000
6364

6465
/* some utility #defines */
6566
#define OMPI_COMM_IS_INTER(comm) ((comm)->c_flags & OMPI_COMM_INTER)
@@ -76,12 +77,14 @@ OMPI_DECLSPEC OBJ_CLASS_DECLARATION(ompi_communicator_t);
7677
#define OMPI_COMM_IS_TOPO(comm) (OMPI_COMM_IS_CART((comm)) || \
7778
OMPI_COMM_IS_GRAPH((comm)) || \
7879
OMPI_COMM_IS_DIST_GRAPH((comm)))
80+
#define OMPI_COMM_IS_MAPBY_NODE(comm) ((comm)->c_flags & OMPI_COMM_MAPBY_NODE)
7981

8082
#define OMPI_COMM_SET_DYNAMIC(comm) ((comm)->c_flags |= OMPI_COMM_DYNAMIC)
8183
#define OMPI_COMM_SET_INVALID(comm) ((comm)->c_flags |= OMPI_COMM_INVALID)
8284

8385
#define OMPI_COMM_SET_PML_ADDED(comm) ((comm)->c_flags |= OMPI_COMM_PML_ADDED)
8486
#define OMPI_COMM_SET_EXTRA_RETAIN(comm) ((comm)->c_flags |= OMPI_COMM_EXTRA_RETAIN)
87+
#define OMPI_COMM_SET_MAPBY_NODE(comm) ((comm)->c_flags |= OMPI_COMM_MAPBY_NODE)
8588

8689
/* a set of special tags: */
8790

ompi/mca/common/ompio/common_ompio_file_view.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,6 @@ int mca_common_ompio_set_view (mca_io_ompio_file_t *fh,
9696
if ( fh->f_flags & OMPIO_UNIFORM_FVIEW ) {
9797
fh->f_flags &= ~OMPIO_UNIFORM_FVIEW;
9898
}
99-
fh->f_flags |= OMPIO_FILE_VIEW_IS_SET;
10099
fh->f_datarep = strdup (datarep);
101100
datatype_duplicate (filetype, &fh->f_orig_filetype );
102101

@@ -113,6 +112,7 @@ int mca_common_ompio_set_view (mca_io_ompio_file_t *fh,
113112
}
114113
else {
115114
newfiletype = filetype;
115+
fh->f_flags |= OMPIO_FILE_VIEW_IS_SET;
116116
}
117117

118118
fh->f_iov_count = 0;

ompi/mca/fcoll/two_phase/fcoll_two_phase_file_read_all.c

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#include "fcoll_two_phase.h"
2828
#include "mpi.h"
2929
#include "ompi/constants.h"
30+
#include "ompi/communicator/communicator.h"
3031
#include "ompi/mca/fcoll/fcoll.h"
3132
#include "ompi/mca/io/ompio/io_ompio.h"
3233
#include "ompi/mca/io/io.h"
@@ -199,7 +200,7 @@ mca_fcoll_two_phase_file_read_all (mca_io_ompio_file_t *fh,
199200
}
200201

201202
if (two_phase_num_io_procs > fh->f_size){
202-
two_phase_num_io_procs = fh->f_size;
203+
two_phase_num_io_procs = fh->f_size;
203204
}
204205

205206
aggregator_list = (int *) calloc (two_phase_num_io_procs, sizeof(int));
@@ -208,9 +209,16 @@ mca_fcoll_two_phase_file_read_all (mca_io_ompio_file_t *fh,
208209
goto exit;
209210
}
210211

211-
for (i=0; i< two_phase_num_io_procs; i++){
212-
aggregator_list[i] = i * fh->f_size / two_phase_num_io_procs;
212+
if ( OMPI_COMM_IS_MAPBY_NODE (&ompi_mpi_comm_world.comm) ) {
213+
for (i =0; i< two_phase_num_io_procs; i++){
214+
aggregator_list[i] = i;
215+
}
213216
}
217+
else {
218+
for (i =0; i< two_phase_num_io_procs; i++){
219+
aggregator_list[i] = i * fh->f_size / two_phase_num_io_procs;
220+
}
221+
}
214222

215223
ret = fh->f_generate_current_file_view ((struct mca_io_ompio_file_t *)fh,
216224
max_data,

ompi/mca/fcoll/two_phase/fcoll_two_phase_file_write_all.c

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727

2828
#include "mpi.h"
2929
#include "ompi/constants.h"
30+
#include "ompi/communicator/communicator.h"
3031
#include "ompi/mca/fcoll/fcoll.h"
3132
#include "ompi/mca/io/ompio/io_ompio.h"
3233
#include "ompi/mca/io/io.h"
@@ -235,9 +236,10 @@ mca_fcoll_two_phase_file_write_all (mca_io_ompio_file_t *fh,
235236
}
236237

237238
if (two_phase_num_io_procs > fh->f_size){
238-
two_phase_num_io_procs = fh->f_size;
239+
two_phase_num_io_procs = fh->f_size;
239240
}
240241

242+
241243
#if DEBUG_ON
242244
printf("Number of aggregators : %ld\n", two_phase_num_io_procs);
243245
#endif
@@ -248,10 +250,16 @@ mca_fcoll_two_phase_file_write_all (mca_io_ompio_file_t *fh,
248250
goto exit;
249251
}
250252

251-
for (i =0; i< two_phase_num_io_procs; i++){
252-
aggregator_list[i] = i * fh->f_size / two_phase_num_io_procs;
253+
if ( OMPI_COMM_IS_MAPBY_NODE (&ompi_mpi_comm_world.comm) ) {
254+
for (i =0; i< two_phase_num_io_procs; i++){
255+
aggregator_list[i] = i;
256+
}
253257
}
254-
258+
else {
259+
for (i =0; i< two_phase_num_io_procs; i++){
260+
aggregator_list[i] = i * fh->f_size / two_phase_num_io_procs;
261+
}
262+
}
255263

256264
ret = fh->f_generate_current_file_view ((struct mca_io_ompio_file_t*)fh,
257265
max_data,

ompi/mca/io/ompio/io_ompio.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@ extern int mca_io_ompio_num_aggregators;
4949
extern int mca_io_ompio_record_offset_info;
5050
extern int mca_io_ompio_sharedfp_lazy_open;
5151
extern int mca_io_ompio_grouping_option;
52+
extern int mca_io_ompio_max_aggregators_ratio;
53+
extern int mca_io_ompio_aggregators_cutoff_threshold;
54+
5255
OMPI_DECLSPEC extern int mca_io_ompio_coll_timing_info;
5356

5457
/*

0 commit comments

Comments
 (0)