Skip to content

Commit 1a8898b

Browse files
committed
osc/sm: fix pscw synchronization
The osc/sm component was using a simple counter to determine if all expected posts had arrived to start a PSCW access epoch. This is incorrect as a post may arrive from a peer that isn't part of the current start group. There are many ways this could have been fixed. This commit adds an n^2 bitmap. When a process posts it sets a bit in the bitmap associated with the access rank to indicate the post is complete. The access rank checks for and clears the bits associated with all the processes in the start group. The bitmap requires comm_size ^ 2 bits of space. This should be manageable as most nodes have relatively small numbers of processes. If this changes another algorithm can be implemented. (cherry picked from commit open-mpi/ompi@903762e) Signed-off-by: Nathan Hjelm <[email protected]>
1 parent e8ae71b commit 1a8898b

File tree

3 files changed

+253
-110
lines changed

3 files changed

+253
-110
lines changed

ompi/mca/osc/sm/osc_sm.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@ struct ompi_osc_sm_lock_t {
3939
typedef struct ompi_osc_sm_lock_t ompi_osc_sm_lock_t;
4040

4141
struct ompi_osc_sm_node_state_t {
42-
int32_t post_count;
4342
int32_t complete_count;
4443
ompi_osc_sm_lock_t lock;
4544
opal_atomic_lock_t accumulate_lock;
@@ -84,6 +83,9 @@ struct ompi_osc_sm_module_t {
8483
ompi_osc_sm_global_state_t *global_state;
8584
ompi_osc_sm_node_state_t *my_node_state;
8685
ompi_osc_sm_node_state_t *node_states;
86+
uint64_t **posts;
87+
88+
opal_mutex_t lock;
8789
};
8890
typedef struct ompi_osc_sm_module_t ompi_osc_sm_module_t;
8991

ompi/mca/osc/sm/osc_sm_active_target.c

Lines changed: 179 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
22
/*
33
* Copyright (c) 2012 Sandia National Laboratories. All rights reserved.
4-
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
4+
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
55
* reserved.
66
* $COPYRIGHT$
77
*
@@ -19,6 +19,74 @@
1919

2020
#include "osc_sm.h"
2121

22+
/**
 * compare_ranks:
 *
 * @param[in] ptra Pointer to integer item
 * @param[in] ptrb Pointer to integer item
 *
 * @returns 0 if *ptra == *ptrb
 * @returns -1 if *ptra < *ptrb
 * @returns 1 otherwise
 *
 * qsort()-compatible comparator used to sort the rank list in ascending
 * order. It can be removed if groups are always in order.
 */
static int compare_ranks (const void *ptra, const void *ptrb)
{
    /* read through const-qualified pointers; the comparator never writes
     * to the elements, so the qualifier must not be cast away */
    const int a = *(const int *) ptra;
    const int b = *(const int *) ptrb;

    /* explicit comparisons instead of (a - b), which could overflow */
    if (a < b) {
        return -1;
    }

    if (a > b) {
        return 1;
    }

    return 0;
}
48+
49+
/**
50+
* ompi_osc_pt2pt_get_comm_ranks:
51+
*
52+
* @param[in] module - OSC PT2PT module
53+
* @param[in] sub_group - Group with ranks to translate
54+
*
55+
* @returns an array of translated ranks on success or NULL on failure
56+
*
57+
* Translate the ranks given in {sub_group} into ranks in the
58+
* communicator used to create {module}.
59+
*/
60+
static int *ompi_osc_sm_group_ranks (ompi_group_t *group, ompi_group_t *sub_group)
61+
{
62+
int size = ompi_group_size(sub_group);
63+
int *ranks1, *ranks2;
64+
int ret;
65+
66+
ranks1 = calloc (size, sizeof(int));
67+
ranks2 = calloc (size, sizeof(int));
68+
if (NULL == ranks1 || NULL == ranks2) {
69+
free (ranks1);
70+
free (ranks2);
71+
return NULL;
72+
}
73+
74+
for (int i = 0 ; i < size ; ++i) {
75+
ranks1[i] = i;
76+
}
77+
78+
ret = ompi_group_translate_ranks (sub_group, size, ranks1, group, ranks2);
79+
free (ranks1);
80+
if (OMPI_SUCCESS != ret) {
81+
free (ranks2);
82+
return NULL;
83+
}
84+
85+
qsort (ranks2, size, sizeof (int), compare_ranks);
86+
87+
return ranks2;
88+
}
89+
2290

2391
int
2492
ompi_osc_sm_fence(int assert, struct ompi_win_t *win)
@@ -51,28 +119,50 @@ ompi_osc_sm_fence(int assert, struct ompi_win_t *win)
51119
}
52120
}
53121

54-
55122
int
56123
ompi_osc_sm_start(struct ompi_group_t *group,
57124
int assert,
58125
struct ompi_win_t *win)
59126
{
60127
ompi_osc_sm_module_t *module =
61128
(ompi_osc_sm_module_t*) win->w_osc_module;
129+
int my_rank = ompi_comm_rank (module->comm);
130+
131+
OBJ_RETAIN(group);
132+
133+
if (!OPAL_ATOMIC_CMPSET(&module->start_group, NULL, group)) {
134+
OBJ_RELEASE(group);
135+
return OMPI_ERR_RMA_SYNC;
136+
}
62137

63138
if (0 == (assert & MPI_MODE_NOCHECK)) {
64139
int size;
65140

66-
OBJ_RETAIN(group);
67-
module->start_group = group;
141+
int *ranks = ompi_osc_sm_group_ranks (module->comm->c_local_group, group);
142+
if (NULL == ranks) {
143+
return OMPI_ERR_OUT_OF_RESOURCE;
144+
}
145+
68146
size = ompi_group_size(module->start_group);
69147

70-
while (module->my_node_state->post_count != size) {
71-
opal_progress();
72-
opal_atomic_mb();
73-
}
74-
} else {
75-
module->start_group = NULL;
148+
for (int i = 0 ; i < size ; ++i) {
149+
int rank_byte = ranks[i] >> 6;
150+
uint64_t old, rank_bit = 1 << (ranks[i] & 0x3f);
151+
152+
/* wait for rank to post */
153+
while (!(module->posts[my_rank][rank_byte] & rank_bit)) {
154+
opal_progress();
155+
opal_atomic_mb();
156+
}
157+
158+
opal_atomic_rmb ();
159+
160+
do {
161+
old = module->posts[my_rank][rank_byte];
162+
} while (!opal_atomic_cmpset_64 ((int64_t *) module->posts[my_rank] + rank_byte, old, old ^ rank_bit));
163+
}
164+
165+
free (ranks);
76166
}
77167

78168
opal_atomic_mb();
@@ -85,30 +175,33 @@ ompi_osc_sm_complete(struct ompi_win_t *win)
85175
{
86176
ompi_osc_sm_module_t *module =
87177
(ompi_osc_sm_module_t*) win->w_osc_module;
88-
int gsize, csize;
178+
ompi_group_t *group;
179+
int gsize;
89180

90181
/* ensure all memory operations have completed */
91182
opal_atomic_mb();
92183

93-
if (NULL != module->start_group) {
94-
module->my_node_state->post_count = 0;
95-
opal_atomic_mb();
184+
group = module->start_group;
185+
if (NULL == group || !OPAL_ATOMIC_CMPSET(&module->start_group, group, NULL)) {
186+
return OMPI_ERR_RMA_SYNC;
187+
}
96188

97-
gsize = ompi_group_size(module->start_group);
98-
csize = ompi_comm_size(module->comm);
99-
for (int i = 0 ; i < gsize ; ++i) {
100-
for (int j = 0 ; j < csize ; ++j) {
101-
if (ompi_group_peer_lookup(module->start_group, i) ==
102-
ompi_comm_peer_lookup(module->comm, j)) {
103-
opal_atomic_add_32(&module->node_states[j].complete_count, 1);
104-
}
105-
}
106-
}
189+
opal_atomic_mb();
107190

108-
OBJ_RELEASE(module->start_group);
109-
module->start_group = NULL;
191+
int *ranks = ompi_osc_sm_group_ranks (module->comm->c_local_group, group);
192+
if (NULL == ranks) {
193+
return OMPI_ERR_OUT_OF_RESOURCE;
110194
}
111195

196+
gsize = ompi_group_size(group);
197+
for (int i = 0 ; i < gsize ; ++i) {
198+
(void) opal_atomic_add_32(&module->node_states[ranks[i]].complete_count, 1);
199+
}
200+
201+
free (ranks);
202+
203+
OBJ_RELEASE(group);
204+
112205
opal_atomic_mb();
113206
return OMPI_SUCCESS;
114207
}
@@ -121,29 +214,45 @@ ompi_osc_sm_post(struct ompi_group_t *group,
121214
{
122215
ompi_osc_sm_module_t *module =
123216
(ompi_osc_sm_module_t*) win->w_osc_module;
124-
int gsize, csize;
217+
int my_rank = ompi_comm_rank (module->comm);
218+
int my_byte = my_rank >> 6;
219+
uint64_t my_bit = 1 << (my_rank & 0x3f);
220+
int gsize;
221+
222+
OPAL_THREAD_LOCK(&module->lock);
223+
224+
if (NULL != module->post_group) {
225+
OPAL_THREAD_UNLOCK(&module->lock);
226+
return OMPI_ERR_RMA_SYNC;
227+
}
228+
229+
module->post_group = group;
230+
231+
OBJ_RETAIN(group);
125232

126233
if (0 == (assert & MPI_MODE_NOCHECK)) {
127-
OBJ_RETAIN(group);
128-
module->post_group = group;
234+
int *ranks = ompi_osc_sm_group_ranks (module->comm->c_local_group, group);
235+
if (NULL == ranks) {
236+
return OMPI_ERR_OUT_OF_RESOURCE;
237+
}
129238

130239
module->my_node_state->complete_count = 0;
131240
opal_atomic_mb();
132241

133242
gsize = ompi_group_size(module->post_group);
134-
csize = ompi_comm_size(module->comm);
135243
for (int i = 0 ; i < gsize ; ++i) {
136-
for (int j = 0 ; j < csize ; ++j) {
137-
if (ompi_group_peer_lookup(module->post_group, i) ==
138-
ompi_comm_peer_lookup(module->comm, j)) {
139-
opal_atomic_add_32(&module->node_states[j].post_count, 1);
140-
}
141-
}
244+
(void) opal_atomic_add_64 ((int64_t *) module->posts[ranks[i]] + my_byte, my_bit);
142245
}
143-
} else {
144-
module->post_group = NULL;
246+
247+
opal_atomic_wmb ();
248+
249+
free (ranks);
250+
251+
opal_progress ();
145252
}
146253

254+
OPAL_THREAD_UNLOCK(&module->lock);
255+
147256
return OMPI_SUCCESS;
148257
}
149258

@@ -153,19 +262,29 @@ ompi_osc_sm_wait(struct ompi_win_t *win)
153262
{
154263
ompi_osc_sm_module_t *module =
155264
(ompi_osc_sm_module_t*) win->w_osc_module;
265+
ompi_group_t *group;
156266

157-
if (NULL != module->post_group) {
158-
int size = ompi_group_size(module->post_group);
267+
OPAL_THREAD_LOCK(&module->lock);
159268

160-
while (module->my_node_state->complete_count != size) {
161-
opal_progress();
162-
opal_atomic_mb();
163-
}
269+
if (NULL == module->post_group) {
270+
OPAL_THREAD_UNLOCK(&module->lock);
271+
return OMPI_ERR_RMA_SYNC;
272+
}
164273

165-
OBJ_RELEASE(module->post_group);
166-
module->post_group = NULL;
274+
group = module->post_group;
275+
276+
int size = ompi_group_size (group);
277+
278+
while (module->my_node_state->complete_count != size) {
279+
opal_progress();
280+
opal_atomic_mb();
167281
}
168282

283+
OBJ_RELEASE(group);
284+
module->post_group = NULL;
285+
286+
OPAL_THREAD_UNLOCK(&module->lock);
287+
169288
/* ensure all memory operations have completed */
170289
opal_atomic_mb();
171290

@@ -180,19 +299,23 @@ ompi_osc_sm_test(struct ompi_win_t *win,
180299
ompi_osc_sm_module_t *module =
181300
(ompi_osc_sm_module_t*) win->w_osc_module;
182301

183-
if (NULL != module->post_group) {
184-
int size = ompi_group_size(module->post_group);
302+
OPAL_THREAD_LOCK(&module->lock);
185303

186-
if (module->my_node_state->complete_count == size) {
187-
OBJ_RELEASE(module->post_group);
188-
module->post_group = NULL;
189-
*flag = 1;
190-
}
191-
} else {
192-
opal_atomic_mb();
193-
*flag = 0;
304+
if (NULL == module->post_group) {
305+
OPAL_THREAD_UNLOCK(&module->lock);
306+
return OMPI_ERR_RMA_SYNC;
194307
}
195308

309+
int size = ompi_group_size(module->post_group);
310+
311+
if (module->my_node_state->complete_count == size) {
312+
OBJ_RELEASE(module->post_group);
313+
module->post_group = NULL;
314+
*flag = 1;
315+
}
316+
317+
OPAL_THREAD_UNLOCK(&module->lock);
318+
196319
/* ensure all memory operations have completed */
197320
opal_atomic_mb();
198321

0 commit comments

Comments
 (0)