Skip to content

Commit eb682fd

Browse files
authored
Merge pull request pmodels#7353 from hzhou/2503_dyn_prep
misc: preparation for ch4 dynamic process revamp Approved-by: Ken Raffenetti
2 parents f7e1772 + 90b9564 commit eb682fd

File tree

24 files changed

+767
-485
lines changed

24 files changed

+767
-485
lines changed

CHANGES

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,19 @@
1515
# Yaksa is now maintained inside MPICH rather than as an external submodule.
1616

1717
# Added internal builtin datatypes and external builtin datatypes are mapped to
18-
internal types. For example, both MPI_INT and MPI_INT32_T are mapped to internal
19-
type MPIR_INT32. NOTE: direct usage of external MPI types are disallowed
20-
in MPICH internally. For example, use MPIR_INT_INTERNAL to replace direct usage
21-
of MPI_INT. Commonly used types include MPI_BYTE, MPI_CHAR, MPI_AINT, use
22-
MPIR_BYTE_INTERNAL, MPIR_CHAR_INTERNAL, MPIR_AINT_INTERNAL instead. There is no
23-
impact to users.
18+
internal types. For example, both MPI_INT and MPI_INT32_T are mapped to internal
19+
type MPIR_INT32. NOTE: direct usage of external MPI types is disallowed
20+
in MPICH internally. For example, use MPIR_INT_INTERNAL to replace direct usage
21+
of MPI_INT. Commonly used types include MPI_BYTE, MPI_CHAR, and MPI_AINT; use
22+
MPIR_BYTE_INTERNAL, MPIR_CHAR_INTERNAL, MPIR_AINT_INTERNAL instead. There is no
23+
impact to users.
24+
25+
# ADI: MPID_Comm_get_lpid removed. Lpids are looked up from the local_group and
26+
remote_group in the MPIR_Comm struct.
27+
28+
# ADI: MPID_Intercomm_exchange_map renamed to MPID_Intercomm_exchange and
29+
parameters now include tag, context_id, and will perform context_id exchange
30+
and lpid exchange.
2431

2532
# Added MPI_LOGICAL1, MPI_LOGICAL2, MPI_LOGICAL4, MPI_LOGICAL8, and MPI_LOGICAL16.
2633

src/include/mpiimpl.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,7 @@ typedef struct MPIR_Stream MPIR_Stream;
169169
#include "mpir_errhandler.h"
170170
#include "mpir_attr_generic.h"
171171
#include "mpir_contextid.h"
172+
#include "mpir_lpid.h"
172173
#include "mpir_status.h"
173174
#include "mpir_debugger.h"
174175
#include "mpir_op.h"

src/include/mpir_group.h

Lines changed: 3 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -50,54 +50,13 @@
5050
#define MPIR_GROUP_WORLD_PTR (MPIR_Group_builtin + 1)
5151
#define MPIR_GROUP_SELF_PTR (MPIR_Group_builtin + 2)
5252

53-
/* Worlds -
54-
* We need a device-independent way of identifying processes. Assuming the concept of
55-
* "worlds", we can describe a process with (world_idx, world_rank).
56-
*
57-
* The world_idx is a local id because each process may not see all worlds. Thus,
58-
* each process only can maintain a list of worlds as it encounters them. Thus,
59-
* a process id derived from (world_idx, world_rank) is referred as LPID, or
60-
* "local process id".
61-
*
62-
* Each process should maintain a table of worlds with sufficient information so
63-
* processes can match worlds upon connection or making address exchange.
64-
*/
65-
66-
#define MPIR_NAMESPACE_MAX 128
67-
struct MPIR_World {
68-
char namespace[MPIR_NAMESPACE_MAX];
69-
/* other useful fields */
70-
int num_procs;
71-
};
72-
73-
extern struct MPIR_World MPIR_Worlds[];
74-
75-
int MPIR_add_world(const char *namespace, int num_procs);
76-
int MPIR_find_world(const char *namespace);
77-
78-
/* Abstract the integer type for lpid (process id). It is possible to use 32-bit
79-
* in principle, but 64-bit is simpler since we can trivially combine
80-
* (world_idx, world_rank).
81-
*/
82-
typedef int64_t MPIR_Lpid;
83-
84-
#define MPIR_LPID_WORLD_INDEX(lpid) ((lpid) >> 32)
85-
#define MPIR_LPID_WORLD_RANK(lpid) ((lpid) & 0xffffffff)
86-
#define MPIR_LPID_FROM(world_idx, world_rank) (((uint64_t)(world_idx) << 32) | (world_rank))
87-
/* A dynamic mask is used for temporary lpid during establishing dynamic connections.
88-
* dynamic_lpid = MPIR_LPID_DYNAMIC_MASK | index_to_dynamic_av_table
89-
*/
90-
#define MPIR_LPID_DYNAMIC_MASK ((MPIR_Lpid)0x1 << 63)
91-
9253
struct MPIR_Pmap {
93-
int size; /* same as group->size, duplicate here so Pmap is logically complete */
9454
bool use_map;
9555
union {
9656
MPIR_Lpid *map;
9757
struct {
9858
MPIR_Lpid offset;
9959
MPIR_Lpid stride;
100-
MPIR_Lpid blocksize;
10160
} stride;
10261
} u;
10362
};
@@ -148,8 +107,7 @@ int MPIR_Group_dup(MPIR_Group * old_group, MPIR_Session * session_ptr, MPIR_Grou
148107
int MPIR_Group_create_map(int size, int rank, MPIR_Session * session_ptr, MPIR_Lpid * map,
149108
MPIR_Group ** new_group_ptr);
150109
int MPIR_Group_create_stride(int size, int rank, MPIR_Session * session_ptr,
151-
MPIR_Lpid offset, MPIR_Lpid stride, MPIR_Lpid blocksize,
152-
MPIR_Group ** new_group_ptr);
110+
MPIR_Lpid offset, MPIR_Lpid stride, MPIR_Group ** new_group_ptr);
153111
int MPIR_Group_lpid_to_rank(MPIR_Group * group, MPIR_Lpid lpid);
154112

155113
int MPIR_Group_check_subset(MPIR_Group * group_ptr, MPIR_Comm * comm_ptr);
@@ -159,16 +117,14 @@ void MPIR_Group_finalize(void);
159117

160118
MPL_STATIC_INLINE_PREFIX MPIR_Lpid MPIR_Group_rank_to_lpid(MPIR_Group * group, int rank)
161119
{
162-
if (rank < 0 || rank >= group->pmap.size) {
120+
if (rank < 0 || rank >= group->size) {
163121
return MPI_UNDEFINED;
164122
}
165123

166124
if (group->pmap.use_map) {
167125
return group->pmap.u.map[rank];
168126
} else {
169-
MPIR_Lpid i_blk = rank / group->pmap.u.stride.blocksize;
170-
MPIR_Lpid r_blk = rank % group->pmap.u.stride.blocksize;
171-
return group->pmap.u.stride.offset + i_blk * group->pmap.u.stride.stride + r_blk;
127+
return group->pmap.u.stride.offset + rank * group->pmap.u.stride.stride;
172128
}
173129
}
174130

src/include/mpir_lpid.h

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
/*
2+
* Copyright (C) by Argonne National Laboratory
3+
* See COPYRIGHT in top-level directory
4+
*/
5+
6+
#ifndef MPIR_LPID_H_INCLUDED
7+
#define MPIR_LPID_H_INCLUDED
8+
9+
/* Worlds -
10+
* We need a device-independent way of identifying processes. Assuming the concept of
11+
* "worlds", we can describe a process with (world_idx, world_rank).
12+
*
13+
* The world_idx is a local id because each process may not see all worlds. Thus,
14+
* each process only can maintain a list of worlds as it encounters them. Thus,
15+
* a process id derived from (world_idx, world_rank) is referred as LPID, or
16+
* "local process id".
17+
*
18+
* Each process should maintain a table of worlds with sufficient information so
19+
* processes can match worlds upon connection or making address exchange.
20+
*/
21+
22+
#define MPIR_NAMESPACE_MAX 128
23+
struct MPIR_World {
24+
char namespace[MPIR_NAMESPACE_MAX];
25+
/* other useful fields */
26+
int num_procs;
27+
};
28+
29+
extern struct MPIR_World MPIR_Worlds[];
30+
31+
int MPIR_add_world(const char *namespace, int num_procs);
32+
int MPIR_find_world(const char *namespace);
33+
34+
/* Abstract the integer type for lpid (process id). It is possible to use 32-bit
35+
* in principle, but 64-bit is simpler since we can trivially combine
36+
* (world_idx, world_rank).
37+
*/
38+
typedef int64_t MPIR_Lpid;
39+
40+
#define MPIR_LPID_WORLD_INDEX(lpid) ((lpid) >> 32)
41+
#define MPIR_LPID_WORLD_RANK(lpid) ((lpid) & 0xffffffff)
42+
#define MPIR_LPID_FROM(world_idx, world_rank) (((uint64_t)(world_idx) << 32) | (world_rank))
43+
/* A dynamic mask is used for temporary lpid during establishing dynamic connections.
44+
* dynamic_lpid = MPIR_LPID_DYNAMIC_MASK | index_to_dynamic_av_table
45+
*/
46+
#define MPIR_LPID_DYNAMIC_MASK ((MPIR_Lpid)0x1 << 62) /* MPIR_Lpid is signed, avoid using the signed bit */
47+
#define MPIR_LPID_INVALID 0xffffffff
48+
49+
#endif /* MPIR_LPID_H_INCLUDED */

src/mpi/coll/algorithms/treealgo/treeutil.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -504,10 +504,10 @@ static int MPII_Treeutil_hierarchy_populate(MPIR_Comm * comm, int rank, int nran
504504
MPIR_Assert(upper_level != NULL);
505505

506506
/* Get wrank from the communicator as the coords are stored with wrank */
507-
MPIR_Lpid temp = 0;
508-
MPID_Comm_get_lpid(comm, r, &temp, FALSE);
509-
int wrank = (int) temp;
510-
if (wrank < 0)
507+
MPIR_Lpid temp = MPIR_comm_rank_to_lpid(comm, r);
508+
int world_idx = MPIR_LPID_WORLD_INDEX(temp);
509+
int wrank = MPIR_LPID_WORLD_RANK(temp);
510+
if (world_idx != 0)
511511
goto fn_fail;
512512
MPIR_Assert(0 <= wrank && wrank < MPIR_Process.size);
513513

src/mpi/comm/comm_impl.c

Lines changed: 77 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,7 @@ static int comm_create_local_group(MPIR_Comm * comm_ptr)
7373
MPIR_Lpid *map = MPL_malloc(n * sizeof(MPIR_Lpid), MPL_MEM_GROUP);
7474

7575
for (int i = 0; i < n; i++) {
76-
mpi_errno = MPID_Comm_get_lpid(comm_ptr, i, &map[i], FALSE);
77-
MPIR_ERR_CHECK(mpi_errno);
76+
map[i] = MPIR_Group_rank_to_lpid(comm_ptr->local_group, i);
7877
}
7978

8079
mpi_errno = MPIR_Group_create_map(n, comm_ptr->rank, comm_ptr->session_ptr, map,
@@ -238,8 +237,7 @@ int MPII_Comm_create_calculate_mapping(MPIR_Group * group_ptr,
238237
/* FIXME : BUBBLE SORT */
239238
mapping[i] = -1;
240239
for (j = 0; j < comm_ptr->local_size; j++) {
241-
MPIR_Lpid comm_lpid;
242-
MPID_Comm_get_lpid(comm_ptr, j, &comm_lpid, FALSE);
240+
MPIR_Lpid comm_lpid = MPIR_Group_rank_to_lpid(comm_ptr->local_group, j);
243241
if (comm_lpid == MPIR_Group_rank_to_lpid(group_ptr, i)) {
244242
mapping[i] = j;
245243
break;
@@ -918,8 +916,7 @@ int MPIR_Comm_remote_group_impl(MPIR_Comm * comm_ptr, MPIR_Group ** group_ptr)
918916
MPIR_Lpid *map = MPL_malloc(n * sizeof(MPIR_Lpid), MPL_MEM_GROUP);
919917

920918
for (int i = 0; i < n; i++) {
921-
mpi_errno = MPID_Comm_get_lpid(comm_ptr, i, &map[i], TRUE);
922-
MPIR_ERR_CHECK(mpi_errno);
919+
map[i] = MPIR_Group_rank_to_lpid(comm_ptr->remote_group, i);
923920
}
924921
mpi_errno = MPIR_Group_create_map(n, MPI_UNDEFINED, comm_ptr->session_ptr, map,
925922
&comm_ptr->remote_group);
@@ -952,88 +949,112 @@ int MPIR_Comm_set_info_impl(MPIR_Comm * comm_ptr, MPIR_Info * info_ptr)
952949
goto fn_exit;
953950
}
954951

952+
#if 0
953+
/* arbitrarily determine which group is the low_group by comparing
954+
* world namespaces and world ranks */
955+
static int determine_low_group(MPIR_Lpid remote_lpid, bool * is_low_group_out)
956+
{
957+
int mpi_errno = MPI_SUCCESS;
958+
959+
int my_world_idx = 0;
960+
int my_world_rank = MPIR_Process.rank;
961+
int remote_world_idx = MPIR_LPID_WORLD_INDEX(remote_lpid);
962+
int remote_world_rank = MPIR_LPID_WORLD_RANK(remote_lpid);
963+
964+
if (my_world_idx == remote_world_idx) {
965+
/* same world, just compare world ranks */
966+
MPIR_Assert(my_world_idx == 0);
967+
*is_low_group_out = (my_world_rank < remote_world_rank);
968+
} else {
969+
/* different world, compare namespace */
970+
int cmp_result = strncmp(MPIR_Worlds[my_world_idx].namespace,
971+
MPIR_Worlds[remote_world_idx].namespace,
972+
MPIR_NAMESPACE_MAX);
973+
MPIR_Assert(cmp_result != 0);
974+
if (cmp_result < 0)
975+
*is_low_group_out = false;
976+
else
977+
*is_low_group_out = true;
978+
}
979+
980+
return mpi_errno;
981+
}
982+
#endif
983+
955984
int MPIR_Intercomm_create_impl(MPIR_Comm * local_comm_ptr, int local_leader,
956985
MPIR_Comm * peer_comm_ptr, int remote_leader, int tag,
957986
MPIR_Comm ** new_intercomm_ptr)
958987
{
959988
int mpi_errno = MPI_SUCCESS;
960-
int final_context_id, recvcontext_id;
961989
int remote_size = 0;
962990
MPIR_Lpid *remote_lpids = NULL;
963-
int comm_info[3];
964-
int is_low_group = 0;
965991
MPIR_Session *session_ptr = local_comm_ptr->session_ptr;
966992

967993
MPIR_FUNC_ENTER;
968994

969-
/* Shift tag into the tagged coll space */
970-
tag |= MPIR_TAG_COLL_BIT;
971-
972-
mpi_errno = MPID_Intercomm_exchange_map(local_comm_ptr, local_leader,
973-
peer_comm_ptr, remote_leader,
974-
&remote_size, &remote_lpids, &is_low_group);
975-
MPIR_ERR_CHECK(mpi_errno);
976-
977995
/*
978996
* Create the contexts. Each group will have a context for sending
979997
* to the other group. All processes must be involved. Because
980998
* we know that the local and remote groups are disjoint, this
981999
* step will complete
9821000
*/
983-
MPL_DBG_MSG_FMT(MPIR_DBG_COMM, VERBOSE,
984-
(MPL_DBG_FDEST, "About to get contextid (local_size=%d) on rank %d",
985-
local_comm_ptr->local_size, local_comm_ptr->rank));
9861001
/* In the multi-threaded case, MPIR_Get_contextid_sparse assumes that the
9871002
* calling routine already holds the single critical section */
9881003
/* TODO: Make sure this is tag-safe */
1004+
int recvcontext_id;
9891005
mpi_errno = MPIR_Get_contextid_sparse(local_comm_ptr, &recvcontext_id, FALSE);
9901006
MPIR_ERR_CHECK(mpi_errno);
9911007
MPIR_Assert(recvcontext_id != 0);
992-
MPL_DBG_MSG_FMT(MPIR_DBG_COMM, VERBOSE, (MPL_DBG_FDEST, "Got contextid=%d", recvcontext_id));
993-
994-
/* Leaders can now swap context ids and then broadcast the value
995-
* to the local group of processes */
996-
if (local_comm_ptr->rank == local_leader) {
997-
int remote_context_id;
9981008

999-
mpi_errno =
1000-
MPIC_Sendrecv(&recvcontext_id, 1, MPIR_CONTEXT_ID_T_DATATYPE, remote_leader, tag,
1001-
&remote_context_id, 1, MPIR_CONTEXT_ID_T_DATATYPE, remote_leader, tag,
1002-
peer_comm_ptr, MPI_STATUS_IGNORE, MPIR_ERR_NONE);
1003-
MPIR_ERR_CHECK(mpi_errno);
1004-
1005-
final_context_id = remote_context_id;
1009+
/* Shift tag into the tagged coll space */
1010+
tag |= MPIR_TAG_COLL_BIT;
10061011

1007-
/* Now, send all of our local processes the remote_lpids,
1008-
* along with the final context id */
1009-
comm_info[0] = final_context_id;
1010-
MPL_DBG_MSG(MPIR_DBG_COMM, VERBOSE, "About to bcast on local_comm");
1011-
mpi_errno = MPIR_Bcast(comm_info, 1, MPIR_INT_INTERNAL, local_leader,
1012-
local_comm_ptr, MPIR_ERR_NONE);
1013-
MPIR_ERR_CHECK(mpi_errno);
1014-
MPL_DBG_MSG_D(MPIR_DBG_COMM, VERBOSE, "end of bcast on local_comm of size %d",
1015-
local_comm_ptr->local_size);
1016-
} else {
1017-
/* we're the other processes */
1018-
MPL_DBG_MSG(MPIR_DBG_COMM, VERBOSE, "About to receive bcast on local_comm");
1019-
mpi_errno = MPIR_Bcast(comm_info, 1, MPIR_INT_INTERNAL, local_leader,
1020-
local_comm_ptr, MPIR_ERR_NONE);
1021-
MPIR_ERR_CHECK(mpi_errno);
1012+
int remote_context_id;
1013+
mpi_errno = MPID_Intercomm_exchange(local_comm_ptr, local_leader,
1014+
peer_comm_ptr, remote_leader, tag,
1015+
recvcontext_id, &remote_context_id,
1016+
&remote_size, &remote_lpids);
1017+
MPIR_ERR_CHECK(mpi_errno);
10221018

1023-
/* Extract the context and group sign information */
1024-
final_context_id = comm_info[0];
1019+
bool is_low_group;
1020+
#if 0
1021+
mpi_errno = determine_low_group(remote_lpids[0], &is_low_group);
1022+
MPIR_ERR_CHECK(mpi_errno);
1023+
#else
1024+
if (local_comm_ptr->rank == local_leader) {
1025+
if (MPIR_LPID_WORLD_INDEX(remote_lpids[0]) == 0) {
1026+
is_low_group = (MPIR_Process.rank < MPIR_LPID_WORLD_RANK(remote_lpids[0]));
1027+
} else {
1028+
char remote_namespace[MPIR_NAMESPACE_MAX];
1029+
mpi_errno = MPIC_Sendrecv(MPIR_Worlds[0].namespace, MPIR_NAMESPACE_MAX,
1030+
MPIR_CHAR_INTERNAL, remote_leader, tag,
1031+
remote_namespace, MPIR_NAMESPACE_MAX, MPIR_CHAR_INTERNAL,
1032+
remote_leader, tag, peer_comm_ptr, MPI_STATUS_IGNORE,
1033+
MPIR_ERR_NONE);
1034+
MPIR_ERR_CHECK(mpi_errno);
1035+
int cmp_result;
1036+
cmp_result = strncmp(MPIR_Worlds[0].namespace, remote_namespace, MPIR_NAMESPACE_MAX);
1037+
MPIR_Assert(cmp_result != 0);
1038+
if (cmp_result < 0)
1039+
is_low_group = false;
1040+
else
1041+
is_low_group = true;
1042+
}
10251043
}
1044+
mpi_errno = MPIR_Bcast_impl(&is_low_group, 1, MPIR_C_BOOL_INTERNAL,
1045+
local_leader, local_comm_ptr, MPIR_ERR_NONE);
1046+
MPIR_ERR_CHECK(mpi_errno);
1047+
#endif
10261048

10271049
/* At last, we now have the information that we need to build the
10281050
* intercommunicator */
10291051

10301052
/* All processes in the local_comm now build the communicator */
10311053

10321054
mpi_errno = MPIR_Comm_create(new_intercomm_ptr);
1033-
if (mpi_errno)
1034-
goto fn_fail;
1055+
MPIR_ERR_CHECK(mpi_errno);
10351056

1036-
(*new_intercomm_ptr)->context_id = final_context_id;
1057+
(*new_intercomm_ptr)->context_id = remote_context_id;
10371058
(*new_intercomm_ptr)->recvcontext_id = recvcontext_id;
10381059
(*new_intercomm_ptr)->remote_size = remote_size;
10391060
(*new_intercomm_ptr)->local_size = local_comm_ptr->local_size;
@@ -1055,6 +1076,7 @@ int MPIR_Intercomm_create_impl(MPIR_Comm * local_comm_ptr, int local_leader,
10551076
/* construct remote_group */
10561077
mpi_errno = MPIR_Group_create_map(remote_size, MPI_UNDEFINED, session_ptr, remote_lpids,
10571078
&(*new_intercomm_ptr)->remote_group);
1079+
MPIR_ERR_CHECK(mpi_errno);
10581080

10591081
MPIR_Comm_set_session_ptr(*new_intercomm_ptr, session_ptr);
10601082

@@ -1113,10 +1135,9 @@ int MPIR_peer_intercomm_create(int context_id, int recvcontext_id,
11131135

11141136
MPIR_Session *session_ptr = NULL; /* Can we just use NULL session since peer_intercomm is always temporary? */
11151137
MPIR_Lpid my_lpid = MPIR_Group_rank_to_lpid(comm_self->local_group, 0);
1116-
mpi_errno = MPIR_Group_create_stride(1, 0, session_ptr, my_lpid, 1, 1,
1117-
&(*newcomm)->local_group);
1138+
mpi_errno = MPIR_Group_create_stride(1, 0, session_ptr, my_lpid, 1, &(*newcomm)->local_group);
11181139
MPIR_ERR_CHECK(mpi_errno);
1119-
mpi_errno = MPIR_Group_create_stride(1, 0, session_ptr, remote_lpid, 1, 1,
1140+
mpi_errno = MPIR_Group_create_stride(1, 0, session_ptr, remote_lpid, 1,
11201141
&(*newcomm)->remote_group);
11211142
MPIR_ERR_CHECK(mpi_errno);
11221143

0 commit comments

Comments
 (0)