
Commit 6e81ecc

Merge pull request open-mpi#626 from hjelmn/v2.x_add_procs
v2.x dynamic add_procs
2 parents 55b03ed + 28d9340

84 files changed: +2308 −2159 lines changed


ompi/communicator/comm.c

Lines changed: 155 additions & 344 deletions
Large diffs are not rendered by default.

ompi/communicator/comm_cid.c

Lines changed: 0 additions & 46 deletions
@@ -139,54 +139,8 @@ static opal_mutex_t ompi_cid_lock;
 static opal_list_t ompi_registered_comms;
 
 
-/* This variable is zero (false) if all processes in MPI_COMM_WORLD
- * did not require MPI_THREAD_MULTIPLE support, and is 1 (true) as
- * soon as at least one process requested support for THREAD_MULTIPLE */
-static int ompi_comm_world_thread_level_mult=0;
-
-
 int ompi_comm_cid_init (void)
 {
-#if OMPI_ENABLE_THREAD_MULTIPLE
-    ompi_proc_t **procs, *thisproc;
-    uint8_t thread_level;
-    uint8_t *tlpointer;
-    int ret;
-    size_t i, size, numprocs;
-
-    /** Note that the following call only returns processes
-     * with the same jobid. This is on purpose, since
-     * we switch for the dynamic communicators anyway
-     * to the original (slower) cid allocation algorithm.
-     */
-    procs = ompi_proc_world ( &numprocs );
-
-    for ( i=0; i<numprocs; i++ ) {
-        thisproc = procs[i];
-
-        OPAL_MODEX_RECV_STRING(ret, "MPI_THREAD_LEVEL",
-                               &thisproc->super.proc_name,
-                               (uint8_t**)&tlpointer, &size);
-        if (OMPI_SUCCESS == ret) {
-            thread_level = *((uint8_t *) tlpointer);
-            if ( OMPI_THREADLEVEL_IS_MULTIPLE (thread_level) ) {
-                ompi_comm_world_thread_level_mult = 1;
-                break;
-            }
-        } else if (OMPI_ERR_NOT_IMPLEMENTED == ret) {
-            if (ompi_mpi_thread_multiple) {
-                ompi_comm_world_thread_level_mult = 1;
-            }
-            break;
-        } else {
-            return ret;
-        }
-    }
-    free(procs);
-#else
-    ompi_comm_world_thread_level_mult = 0; // silence compiler warning if not used
-#endif
-
     return OMPI_SUCCESS;
 }
 

ompi/communicator/comm_init.c

Lines changed: 18 additions & 4 deletions
@@ -13,7 +13,7 @@
  * Copyright (c) 2006-2010 University of Houston. All rights reserved.
  * Copyright (c) 2007-2012 Cisco Systems, Inc. All rights reserved.
  * Copyright (c) 2009      Sun Microsystems, Inc. All rights reserved.
- * Copyright (c) 2012-2014 Los Alamos National Security, LLC.
+ * Copyright (c) 2012-2015 Los Alamos National Security, LLC.
  *                         All rights reserved.
  * Copyright (c) 2011-2013 Inria. All rights reserved.
  * Copyright (c) 2011-2013 Universite Bordeaux 1
@@ -102,12 +102,26 @@ int ompi_comm_init(void)
     OBJ_CONSTRUCT(&ompi_mpi_comm_world, ompi_communicator_t);
     assert(ompi_mpi_comm_world.comm.c_f_to_c_index == 0);
     group = OBJ_NEW(ompi_group_t);
-    group->grp_proc_pointers = ompi_proc_world(&size);
-    group->grp_proc_count    = (int)size;
+
+    size = ompi_process_info.num_procs;
+    group->grp_proc_pointers = (ompi_proc_t **) calloc (size, sizeof (ompi_proc_t *));
+    group->grp_proc_count    = size;
+
+    for (size_t i = 0 ; i < size ; ++i) {
+        opal_process_name_t name = {.vpid = i, .jobid = OMPI_PROC_MY_NAME->jobid};
+        /* look for existing ompi_proc_t that matches this name */
+        group->grp_proc_pointers[i] = (ompi_proc_t *) ompi_proc_lookup (name);
+        if (NULL == group->grp_proc_pointers[i]) {
+            /* set sentinel value */
+            group->grp_proc_pointers[i] = (ompi_proc_t *) ompi_proc_name_to_sentinel (name);
+        } else {
+            OBJ_RETAIN (group->grp_proc_pointers[i]);
+        }
+    }
+
     OMPI_GROUP_SET_INTRINSIC (group);
     OMPI_GROUP_SET_DENSE (group);
     ompi_set_group_rank(group, ompi_proc_local());
-    ompi_group_increment_proc_count (group);
 
     ompi_mpi_comm_world.comm.c_contextid      = 0;
     ompi_mpi_comm_world.comm.c_id_start_index = 4;
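
The new MPI_COMM_WORLD setup above no longer instantiates an ompi_proc_t for every peer at init time: each group slot holds either a pointer to an already-known proc or a sentinel value that encodes the peer's process name, to be resolved on first use. The sketch below illustrates one way such a sentinel can be packed into a pointer-sized value; the example_* helpers and the exact bit layout are illustrative assumptions, not the encoding used by ompi_proc_name_to_sentinel.

/* Illustrative sketch only: tag the low bit of a pointer-sized value so a
 * packed process name can never be mistaken for a real, aligned ompi_proc_t
 * pointer.  Assumes the top bit of the 64-bit name is unused. */
#include <stdint.h>
#include <stdbool.h>

typedef uint64_t example_name_t;   /* hypothetical stand-in for opal_process_name_t */

static inline uintptr_t example_name_to_sentinel (example_name_t name)
{
    return (uintptr_t) ((name << 1) | UINT64_C(1));   /* shift and set the tag bit */
}

static inline bool example_is_sentinel (uintptr_t value)
{
    return 0 != (value & 0x1);   /* real proc pointers are at least 2-byte aligned */
}

static inline example_name_t example_sentinel_to_name (uintptr_t value)
{
    return (example_name_t) (value >> 1);   /* drop the tag bit to recover the name */
}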

ompi/communicator/communicator.h

Lines changed: 29 additions & 0 deletions
@@ -535,6 +535,18 @@ int ompi_comm_finalize (void);
 /**
  * This is THE routine, where all the communicator stuff
  * is really set.
+ *
+ * @param[out] newcomm            new ompi communicator object
+ * @param[in]  oldcomm            old communicator
+ * @param[in]  local_size         size of local_ranks array
+ * @param[in]  local_ranks        local ranks (not used if local_group != NULL)
+ * @param[in]  remote_size        size of remote_ranks array
+ * @param[in]  remote_ranks       remote ranks (intercomm) (not used if remote_group != NULL)
+ * @param[in]  attr               attributes (can be NULL)
+ * @param[in]  errh               error handler
+ * @param[in]  copy_topocomponent whether to copy the topology
+ * @param[in]  local_group        local process group (may be NULL if local_ranks array supplied)
+ * @param[in]  remote_group       remote process group (may be NULL)
  */
 OMPI_DECLSPEC int ompi_comm_set ( ompi_communicator_t** newcomm,
                                   ompi_communicator_t* oldcomm,
@@ -548,6 +560,23 @@ OMPI_DECLSPEC int ompi_comm_set ( ompi_communicator_t** newcomm,
                                   ompi_group_t *local_group,
                                   ompi_group_t *remote_group );
 
+/**
+ * This is THE routine, where all the communicator stuff
+ * is really set. Non-blocking version.
+ *
+ * @param[out] newcomm            new ompi communicator object
+ * @param[in]  oldcomm            old communicator
+ * @param[in]  local_size         size of local_ranks array
+ * @param[in]  local_ranks        local ranks (not used if local_group != NULL)
+ * @param[in]  remote_size        size of remote_ranks array
+ * @param[in]  remote_ranks       remote ranks (intercomm) (not used if remote_group != NULL)
+ * @param[in]  attr               attributes (can be NULL)
+ * @param[in]  errh               error handler
+ * @param[in]  copy_topocomponent whether to copy the topology
+ * @param[in]  local_group        local process group (may be NULL if local_ranks array supplied)
+ * @param[in]  remote_group       remote process group (may be NULL)
+ * @param[out] req                ompi_request_t object for tracking completion
+ */
 OMPI_DECLSPEC int ompi_comm_set_nb ( ompi_communicator_t **ncomm,
                                      ompi_communicator_t *oldcomm,
                                      int local_size,
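
A rough sketch of how the blocking variant might be called to build an intracommunicator from an existing local group, following the parameter list documented above. The wrapper name example_create_from_group, the use of comm->error_handler, and the surrounding error handling are placeholders; the call site itself is not taken from this commit.

static int example_create_from_group (ompi_communicator_t *comm, ompi_group_t *group,
                                      ompi_communicator_t **newcomm)
{
    int rc;

    rc = ompi_comm_set (newcomm,                 /* new communicator          */
                        comm,                    /* old communicator          */
                        group->grp_proc_count,   /* local_size                */
                        NULL,                    /* local_ranks (group given) */
                        0,                       /* remote_size               */
                        NULL,                    /* remote_ranks              */
                        NULL,                    /* attributes                */
                        comm->error_handler,     /* error handler             */
                        false,                   /* do not copy the topology  */
                        group,                   /* local group               */
                        NULL);                   /* remote group (intracomm)  */

    /* context-id allocation and activation would follow in real code */
    return rc;
}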

ompi/dpm/dpm.c

Lines changed: 25 additions & 35 deletions
@@ -13,7 +13,7 @@
  * Copyright (c) 2007-2015 Cisco Systems, Inc. All rights reserved.
  * Copyright (c) 2006-2009 University of Houston. All rights reserved.
  * Copyright (c) 2009      Sun Microsystems, Inc. All rights reserved.
- * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
+ * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
  *                         reserved.
  * Copyright (c) 2013-2015 Intel, Inc. All rights reserved
  * Copyright (c) 2014-2015 Research Organization for Information Science
@@ -1210,63 +1210,53 @@ static int disconnect_waitall (int count, ompi_dpm_disconnect_obj **objs)
 /**********************************************************************/
 /**********************************************************************/
 /**********************************************************************/
+static bool ompi_dpm_group_is_dyn (ompi_group_t *group, ompi_jobid_t thisjobid)
+{
+    int size = group ? ompi_group_size (group) : 0;
+
+    for (int i = 1 ; i < size ; ++i) {
+        opal_process_name_t name = ompi_group_get_proc_name (group, i);
+
+        if (thisjobid != ((ompi_process_name_t *) &name)->jobid) {
+            /* at least one is different */
+            return true;
+        }
+    }
+
+    return false;
+}
+
 /* All we want to do in this function is determine if the number of
  * jobids in the local and/or remote group is > 1. This tells us to
  * set the disconnect flag. We don't actually care what the true
  * number -is-, only that it is > 1
  */
 void ompi_dpm_mark_dyncomm(ompi_communicator_t *comm)
 {
-    int i;
-    int size, rsize;
-    bool found=false;
+    bool found;
     ompi_jobid_t thisjobid;
-    ompi_group_t *grp=NULL;
-    ompi_proc_t *proc = NULL;
 
     /* special case for MPI_COMM_NULL */
     if (comm == MPI_COMM_NULL) {
        return;
     }
 
-    size  = ompi_comm_size(comm);
-    rsize = ompi_comm_remote_size(comm);
+    thisjobid = ompi_group_get_proc_name (comm->c_local_group, 0).jobid;
 
     /* loop over all processes in local group and check for
      * a different jobid
      */
-    grp = comm->c_local_group;
-    proc = ompi_group_peer_lookup(grp,0);
-    thisjobid = ((ompi_process_name_t*)&proc->super.proc_name)->jobid;
-
-    for (i=1; i< size; i++) {
-        proc = ompi_group_peer_lookup(grp,i);
-        if (thisjobid != ((ompi_process_name_t*)&proc->super.proc_name)->jobid) {
-            /* at least one is different */
-            found = true;
-            goto complete;
-        }
-    }
-
-    /* if inter-comm, loop over all processes in remote_group
-     * and see if any are different from thisjobid
-     */
-    grp = comm->c_remote_group;
-    for (i=0; i< rsize; i++) {
-        proc = ompi_group_peer_lookup(grp,i);
-        if (thisjobid != ((ompi_process_name_t*)&proc->super.proc_name)->jobid) {
-            /* at least one is different */
-            found = true;
-            break;
-        }
+    found = ompi_dpm_group_is_dyn (comm->c_local_group, thisjobid);
+    if (!found) {
+        /* if inter-comm, loop over all processes in remote_group
+         * and see if any are different from thisjobid
+         */
+        found = ompi_dpm_group_is_dyn (comm->c_remote_group, thisjobid);
     }
 
-complete:
     /* if a different jobid was found, set the disconnect flag*/
     if (found) {
         ompi_comm_num_dyncomm++;
         OMPI_COMM_SET_DYNAMIC(comm);
     }
-
-    return;
 }
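
The refactored check leans on ompi_group_get_proc_name returning a peer's name without forcing the corresponding ompi_proc_t to be instantiated, which matters now that group slots may hold sentinels. The sketch below shows roughly how such an accessor could behave for a dense group; the example_* names and the sentinel-decoding helper are assumptions for illustration, not code from this commit.

/* Illustrative sketch for a dense group: return the stored name whether the
 * slot holds a resolved ompi_proc_t or a packed sentinel, so jobid checks
 * never trigger proc resolution.  example_sentinel_to_name() is a
 * hypothetical decoder for the sentinel encoding. */
static opal_process_name_t example_group_get_proc_name (ompi_group_t *group, int rank)
{
    ompi_proc_t *proc = group->grp_proc_pointers[rank];

    if (ompi_proc_is_sentinel (proc)) {
        /* decode the name packed into the sentinel value */
        return example_sentinel_to_name ((uintptr_t) proc);
    }

    return proc->super.proc_name;
}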

ompi/group/group.c

Lines changed: 51 additions & 45 deletions
@@ -14,7 +14,7 @@
  * Copyright (c) 2007      Cisco Systems, Inc. All rights reserved.
  * Copyright (c) 2012      Oak Ridge National Labs. All rights reserved.
  * Copyright (c) 2012-2013 Inria. All rights reserved.
- * Copyright (c) 2013      Los Alamos National Security, LLC. All rights
+ * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights
  *                         reserved.
  * Copyright (c) 2015      Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
@@ -49,16 +49,14 @@ int ompi_group_translate_ranks ( ompi_group_t *group1,
                                  ompi_group_t *group2,
                                  int *ranks2)
 {
-    int rank, proc, proc2;
-    struct ompi_proc_t *proc1_pointer, *proc2_pointer;
-
     if ( MPI_GROUP_EMPTY == group1 || MPI_GROUP_EMPTY == group2 ) {
-        for (proc = 0; proc < n_ranks ; proc++) {
+        for (int proc = 0; proc < n_ranks ; ++proc) {
             ranks2[proc] = MPI_UNDEFINED;
         }
         return MPI_SUCCESS;
     }
 
+#if OMPI_GROUP_SPARSE
     /*
      * If we are translating from a parent to a child that uses the sparse format
      * or vice versa, we use the translate ranks function corresponding to the
@@ -80,8 +78,11 @@ int ompi_group_translate_ranks ( ompi_group_t *group1,
                 (group1,n_ranks,ranks1,group2,ranks2);
         }
 
+        /* unknown sparse group type */
+        assert (0);
     }
-    else if( group2->grp_parent_group_ptr == group1 ) { /* from parent to child*/
+
+    if( group2->grp_parent_group_ptr == group1 ) { /* from parent to child*/
         if(OMPI_GROUP_IS_SPORADIC(group2)) {
             return ompi_group_translate_ranks_sporadic
                 (group1,n_ranks,ranks1,group2,ranks2);
@@ -95,28 +96,32 @@ int ompi_group_translate_ranks ( ompi_group_t *group1,
                 (group1,n_ranks,ranks1,group2,ranks2);
         }
 
+        /* unknown sparse group type */
+        assert (0);
     }
-    else {
-        /* loop over all ranks */
-        for (proc = 0; proc < n_ranks; proc++) {
-            rank=ranks1[proc];
-            if ( MPI_PROC_NULL == rank) {
-                ranks2[proc] = MPI_PROC_NULL;
-            }
-            else {
-                proc1_pointer = ompi_group_peer_lookup(group1 ,rank);
-                /* initialize to no "match" */
-                ranks2[proc] = MPI_UNDEFINED;
-                for (proc2 = 0; proc2 < group2->grp_proc_count; proc2++) {
-                    proc2_pointer= ompi_group_peer_lookup(group2, proc2);
-                    if ( proc1_pointer == proc2_pointer) {
-                        ranks2[proc] = proc2;
-                        break;
-                    }
-                } /* end proc2 loop */
-            } /* end proc loop */
+#endif
+
+    /* loop over all ranks */
+    for (int proc = 0; proc < n_ranks; ++proc) {
+        struct ompi_proc_t *proc1_pointer, *proc2_pointer;
+        int rank = ranks1[proc];
+
+        if ( MPI_PROC_NULL == rank) {
+            ranks2[proc] = MPI_PROC_NULL;
+            continue;
         }
-    }
+
+        proc1_pointer = ompi_group_get_proc_ptr_raw (group1, rank);
+        /* initialize to no "match" */
+        ranks2[proc] = MPI_UNDEFINED;
+        for (int proc2 = 0; proc2 < group2->grp_proc_count; ++proc2) {
+            proc2_pointer = ompi_group_get_proc_ptr_raw (group2, proc2);
+            if ( proc1_pointer == proc2_pointer) {
+                ranks2[proc] = proc2;
+                break;
+            }
+        } /* end proc2 loop */
+    } /* end proc loop */
 
     return MPI_SUCCESS;
 }
@@ -168,25 +173,6 @@ int ompi_group_dump (ompi_group_t* group)
     return OMPI_SUCCESS;
 }
 
-/*
- * This is the function that iterates through the sparse groups to the dense group
- * to reach the process pointer
- */
-ompi_proc_t* ompi_group_get_proc_ptr (ompi_group_t* group , int rank)
-{
-    int ranks1,ranks2;
-    do {
-        if(OMPI_GROUP_IS_DENSE(group)) {
-            return group->grp_proc_pointers[rank];
-        }
-        ranks1 = rank;
-        ompi_group_translate_ranks( group, 1, &ranks1,
-                                    group->grp_parent_group_ptr,&ranks2);
-        rank = ranks2;
-        group = group->grp_parent_group_ptr;
-    } while (1);
-}
-
 int ompi_group_minloc ( int list[] , int length )
 {
     int i,index,min;
@@ -568,3 +554,23 @@ int ompi_group_compare(ompi_group_t *group1,
 
     return return_value;
 }
+
+bool ompi_group_have_remote_peers (ompi_group_t *group)
+{
+    for (int i = 0 ; i < group->grp_proc_count ; ++i) {
+        ompi_proc_t *proc = NULL;
+#if OMPI_GROUP_SPARSE
+        proc = ompi_group_peer_lookup (group, i);
+#else
+        if (ompi_proc_is_sentinel (group->grp_proc_pointers[i])) {
+            return true;
+        }
+        proc = group->grp_proc_pointers[i];
+#endif
+        if (!OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags)) {
+            return true;
+        }
+    }
+
+    return false;
+}
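
One plausible use of the new ompi_group_have_remote_peers, sketched below, is deciding whether a communicator is confined to a single node, for instance before selecting a shared-memory-only code path. Note that unresolved sentinel entries are conservatively treated as remote. The helper name and the decision it feeds are illustrative, not part of this commit.

/* Illustrative sketch: a single-node test built on the new helper.  A
 * sentinel entry (peer not yet resolved) makes the group count as having
 * remote peers, so this check errs on the safe side. */
static bool example_comm_is_single_node (ompi_communicator_t *comm)
{
    return !ompi_group_have_remote_peers (comm->c_local_group);
}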
