Skip to content

Commit c193a28

Browse files
committed
osc/rdma: use only a single btl registration for local state
This commit fixes a bug that can occur on Cray Gemini networks. If multiple registrations are used for the local state then we lose the atomicity guarantees. To avoid issues like this, use only a single registration handle for all local state on a node. (cherry picked from commit open-mpi/ompi@63e744f) Signed-off-by: Nathan Hjelm <[email protected]>
1 parent f7d64df commit c193a28

File tree

1 file changed

+28
-17
lines changed

1 file changed

+28
-17
lines changed

ompi/mca/osc/rdma/osc_rdma_component.c

Lines changed: 28 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -326,7 +326,7 @@ static int ompi_osc_rdma_initialize_region (ompi_osc_rdma_module_t *module, void
326326
region->len = size;
327327

328328
if (module->selected_btl->btl_register_mem && size) {
329-
if (MPI_WIN_FLAVOR_ALLOCATE != module->flavor) {
329+
if (MPI_WIN_FLAVOR_ALLOCATE != module->flavor || NULL == module->state_handle) {
330330
ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, *base, size, MCA_BTL_REG_FLAG_ACCESS_ANY,
331331
&module->base_handle);
332332
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
@@ -450,6 +450,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
450450
size_t local_rank_array_size, leader_peer_data_size;
451451
int my_rank = ompi_comm_rank (module->comm);
452452
int global_size = ompi_comm_size (module->comm);
453+
ompi_osc_rdma_region_t *state_region;
453454
int my_base_offset = 0;
454455
struct _local_data *temp;
455456
char *data_file;
@@ -470,8 +471,8 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
470471
leader_peer_data_size = module->region_size * module->node_count;
471472

472473
/* calculate base offsets */
473-
module->state_offset = state_base = local_rank_array_size;
474-
data_base = local_rank_array_size + leader_peer_data_size + module->state_size * local_size;
474+
module->state_offset = state_base = local_rank_array_size + module->region_size;
475+
data_base = state_base + leader_peer_data_size + module->state_size * local_size;
475476

476477
do {
477478
temp = calloc (local_size, sizeof (temp[0]));
@@ -533,12 +534,13 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
533534
break;
534535
}
535536

536-
module->rank_array = (ompi_osc_rdma_rank_data_t *) module->segment_base;
537-
538537
if (size && MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
539538
*base = (void *)((intptr_t) module->segment_base + my_base_offset);
540539
}
541540

541+
module->rank_array = (ompi_osc_rdma_rank_data_t *) module->segment_base;
542+
/* put local state region data after the rank array */
543+
state_region = (ompi_osc_rdma_region_t *) ((uintptr_t) module->segment_base + local_rank_array_size);
542544
module->state = (ompi_osc_rdma_state_t *) ((uintptr_t) module->segment_base + state_base + module->state_size * local_rank);
543545

544546
/* all local ranks share the array containing the peer data of leader ranks */
@@ -547,11 +549,18 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
547549
/* initialize my state */
548550
memset (module->state, 0, module->state_size);
549551

550-
/* just go ahead and register the whole segment */
551-
ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, module->segment_base, total_size, MCA_BTL_REG_FLAG_ACCESS_ANY,
552-
&module->state_handle);
553-
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
554-
break;
552+
if (0 == local_rank) {
553+
/* just go ahead and register the whole segment */
554+
ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, module->segment_base, total_size, MCA_BTL_REG_FLAG_ACCESS_ANY,
555+
&module->state_handle);
556+
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
557+
break;
558+
}
559+
560+
state_region->base = (intptr_t) module->segment_base;
561+
if (module->state_handle) {
562+
memcpy (state_region->btl_handle_data, module->state_handle, module->selected_btl->btl_registration_handle_size);
563+
}
555564
}
556565

557566
if (MPI_WIN_FLAVOR_DYNAMIC != module->flavor) {
@@ -572,6 +581,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
572581
offset = data_base;
573582
for (int i = 0 ; i < local_size ; ++i) {
574583
ompi_osc_rdma_peer_extended_t *ex_peer;
584+
ompi_osc_rdma_state_t *peer_state;
575585
ompi_osc_rdma_peer_t *peer;
576586
int peer_rank = temp[i].rank;
577587

@@ -582,21 +592,24 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
582592

583593
ex_peer = (ompi_osc_rdma_peer_extended_t *) peer;
584594

585-
peer->state = (osc_rdma_counter_t) ((uintptr_t) module->segment_base + state_base + module->state_size * i);
595+
/* peer state local pointer */
596+
peer_state = (ompi_osc_rdma_state_t *) ((uintptr_t) module->segment_base + state_base + module->state_size * i);
586597

587598
if (local_size == global_size || (module->selected_btl->btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB)) {
588599
/* all peers are local or it is safe to mix cpu and nic atomics */
589600
peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_STATE;
601+
peer->state = (osc_rdma_counter_t) peer_state;
590602
} else {
591603
/* use my endpoint handle to modify the peer's state */
592-
peer->state_handle = module->state_handle;
593-
peer->state_endpoint = ompi_osc_rdma_peer_btl_endpoint (module, my_rank);
604+
if (module->selected_btl->btl_register_mem) {
605+
peer->state_handle = (mca_btl_base_registration_handle_t *) state_region->btl_handle_data;
606+
}
607+
peer->state = (osc_rdma_counter_t) ((uintptr_t) state_region->base + state_base + module->state_size * i);
608+
peer->state_endpoint = ompi_osc_rdma_peer_btl_endpoint (module, peer_rank);
594609
}
595610

596611
/* finish setting up the local peer structure */
597612
if (MPI_WIN_FLAVOR_DYNAMIC != module->flavor) {
598-
ompi_osc_rdma_state_t *peer_state = (ompi_osc_rdma_state_t *) (intptr_t) peer->state;
599-
600613
if (!module->same_disp_unit) {
601614
ex_peer->disp_unit = peer_state->disp_unit;
602615
}
@@ -1050,8 +1063,6 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
10501063

10511064
/* calculate and store various structure sizes */
10521065

1053-
/* the following two structures have similar usage but the later is meant to be a small as possible. they may
1054-
* be merged into a single structure in a later version of this component. */
10551066
module->region_size = module->selected_btl->btl_registration_handle_size + sizeof (ompi_osc_rdma_region_t);
10561067

10571068
module->state_size = sizeof (ompi_osc_rdma_state_t);

0 commit comments

Comments
 (0)