Skip to content

Commit 5cbc19e

Browse files
authored
Merge pull request #8599 from awlauria/atomics_core_2
Fix coredump for unaligned atomics.
2 parents 0a9eee0 + 4e34bf0 commit 5cbc19e

File tree

3 files changed

+49
-28
lines changed

3 files changed

+49
-28
lines changed

ompi/mca/osc/base/base.h

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
* University of Stuttgart. All rights reserved.
88
* Copyright (c) 2004-2005 The Regents of the University of California.
99
* All rights reserved.
10-
* Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
10+
* Copyright (c) 2016-2021 IBM Corporation. All rights reserved.
1111
* $COPYRIGHT$
1212
*
1313
* Additional copyrights may follow
@@ -50,6 +50,17 @@ int ompi_osc_base_finalize(void);
5050

5151
OMPI_DECLSPEC extern mca_base_framework_t ompi_osc_base_framework;
5252

53+
54+
/* Helper to check whether osc can support atomic operation on the size the operands
55+
* Currently used with rdma and ucx.
56+
*/
57+
static inline __opal_attribute_always_inline__ bool ompi_osc_base_is_atomic_size_supported(uint64_t remote_addr,
58+
size_t size)
59+
{
60+
return ((sizeof(uint32_t) == size && !(remote_addr & 0x3)) ||
61+
(sizeof(uint64_t) == size && !(remote_addr & 0x7)));
62+
}
63+
5364
END_C_DECLS
5465

5566
#endif

ompi/mca/osc/rdma/osc_rdma_accumulate.c

Lines changed: 33 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
* Copyright (c) 2019 Triad National Security, LLC. All rights
99
* reserved.
1010
* Copyright (c) 2019-2021 Google, LLC. All rights reserved.
11+
* Copyright (c) 2021 IBM Corporation. All rights reserved.
1112
* $COPYRIGHT$
1213
*
1314
* Additional copyrights may follow
@@ -19,6 +20,7 @@
1920
#include "osc_rdma_request.h"
2021
#include "osc_rdma_comm.h"
2122

23+
#include "ompi/mca/osc/base/base.h"
2224
#include "ompi/mca/osc/base/osc_base_obj_convert.h"
2325

2426
static inline void ompi_osc_rdma_peer_accumulate_cleanup (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, bool lock_acquired)
@@ -133,6 +135,22 @@ static int ompi_osc_rdma_op_mapping[OMPI_OP_NUM_OF_TYPES + 1] = {
133135
[OMPI_OP_REPLACE] = MCA_BTL_ATOMIC_SWAP,
134136
};
135137

138+
/* set the appropriate flags for this atomic */
139+
static inline int ompi_osc_rdma_set_btl_flags(ompi_osc_rdma_module_t *module, ompi_datatype_t *dt, ptrdiff_t extent) {
140+
141+
int flags = 0;
142+
143+
if(4 == extent) {
144+
flags = MCA_BTL_ATOMIC_FLAG_32BIT;
145+
}
146+
147+
if (OMPI_DATATYPE_FLAG_DATA_FLOAT & dt->super.flags) {
148+
flags |= MCA_BTL_ATOMIC_FLAG_FLOAT;
149+
}
150+
151+
return flags;
152+
}
153+
136154
static int ompi_osc_rdma_fetch_and_op_atomic (ompi_osc_rdma_sync_t *sync, const void *origin_addr, void *result_addr, ompi_datatype_t *dt,
137155
ptrdiff_t extent, ompi_osc_rdma_peer_t *peer, uint64_t target_address,
138156
mca_btl_base_registration_handle_t *target_handle, ompi_op_t *op, ompi_osc_rdma_request_t *req)
@@ -151,10 +169,7 @@ static int ompi_osc_rdma_fetch_and_op_atomic (ompi_osc_rdma_sync_t *sync, const
151169

152170
btl_op = ompi_osc_rdma_op_mapping[op->op_type];
153171

154-
flags = (4 == extent) ? MCA_BTL_ATOMIC_FLAG_32BIT : 0;
155-
if (OMPI_DATATYPE_FLAG_DATA_FLOAT & dt->super.flags) {
156-
flags |= MCA_BTL_ATOMIC_FLAG_FLOAT;
157-
}
172+
flags = ompi_osc_rdma_set_btl_flags(module, dt, extent);
158173

159174
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating fetch-and-op using %d-bit btl atomics. origin: 0x%" PRIx64,
160175
(4 == extent) ? 32 : 64, *((int64_t *) origin_addr));
@@ -239,10 +254,7 @@ static int ompi_osc_rdma_acc_single_atomic (ompi_osc_rdma_sync_t *sync, const vo
239254
origin = (8 == extent) ? ((uint64_t *) origin_addr)[0] : ((uint32_t *) origin_addr)[0];
240255

241256
/* set the appropriate flags for this atomic */
242-
flags = (4 == extent) ? MCA_BTL_ATOMIC_FLAG_32BIT : 0;
243-
if (OMPI_DATATYPE_FLAG_DATA_FLOAT & dt->super.flags) {
244-
flags |= MCA_BTL_ATOMIC_FLAG_FLOAT;
245-
}
257+
flags = ompi_osc_rdma_set_btl_flags(module, dt, extent);
246258

247259
btl_op = ompi_osc_rdma_op_mapping[op->op_type];
248260

@@ -328,19 +340,21 @@ static inline int ompi_osc_rdma_gacc_contig (ompi_osc_rdma_sync_t *sync, const v
328340
ompi_datatype_t *target_datatype, ompi_op_t *op, ompi_osc_rdma_request_t *request)
329341
{
330342
ompi_osc_rdma_module_t *module = sync->module;
331-
unsigned long len = target_count * target_datatype->super.size;
343+
size_t target_dtype_size = target_datatype->super.size;
344+
unsigned long len = target_count * target_dtype_size;
332345
char *ptr = NULL;
333346
int ret;
334347

335-
request->len = target_datatype->super.size * module->network_amo_max_count;
348+
request->len = target_dtype_size * module->network_amo_max_count;
336349

337350
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating accumulate on contiguous region of %lu bytes to remote address %" PRIx64
338351
", sync %p", len, target_address, (void *) sync);
339352

340353
/* if the datatype is small enough (and the count is 1) then try to directly use the hardware to execute
341354
* the atomic operation. this should be safe in all cases as either 1) the user has assured us they will
342355
* never use atomics with count > 1, 2) we have the accumulate lock, or 3) we have an exclusive lock */
343-
if ((target_datatype->super.size <= 8) && (((unsigned long) target_count) <= module->network_amo_max_count)) {
356+
if ((target_dtype_size <= 8) && (((unsigned long) target_count) <= module->network_amo_max_count) &&
357+
ompi_osc_base_is_atomic_size_supported(target_address, target_dtype_size)) {
344358
ret = ompi_osc_rdma_gacc_amo (module, sync, source, result, result_count, result_datatype, result_convertor,
345359
peer, target_address, target_handle, target_count, target_datatype, op, request);
346360
if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
@@ -659,7 +673,8 @@ static inline int ompi_osc_rdma_cas_atomic (ompi_osc_rdma_sync_t *sync, const vo
659673

660674
compare = (8 == size) ? ((int64_t *) compare_addr)[0] : ((int32_t *) compare_addr)[0];
661675
source = (8 == size) ? ((int64_t *) source_addr)[0] : ((int32_t *) source_addr)[0];
662-
flags = (4 == size) ? MCA_BTL_ATOMIC_FLAG_32BIT : 0;
676+
677+
flags = ompi_osc_rdma_set_btl_flags(module, datatype, size);
663678

664679
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating compare-and-swap using %d-bit btl atomics. compare: 0x%"
665680
PRIx64 ", origin: 0x%" PRIx64, (int) size * 8, *((int64_t *) compare_addr), *((int64_t *) source_addr));
@@ -830,10 +845,12 @@ int ompi_osc_rdma_compare_and_swap (const void *origin_addr, const void *compare
830845
* user has indicated that they will only use the same op (or same op and no op) for
831846
* operations on overlapping memory ranges. that indicates it is safe to go ahead and
832847
* use network atomic operations. */
833-
ret = ompi_osc_rdma_cas_atomic (sync, origin_addr, compare_addr, result_addr, dt,
834-
peer, target_address, target_handle, lock_acquired);
835-
if (OMPI_SUCCESS == ret) {
836-
return OMPI_SUCCESS;
848+
if(ompi_osc_base_is_atomic_size_supported(target_address, dt->super.size)) {
849+
ret = ompi_osc_rdma_cas_atomic (sync, origin_addr, compare_addr, result_addr, dt,
850+
peer, target_address, target_handle, lock_acquired);
851+
if (OMPI_SUCCESS == ret) {
852+
return OMPI_SUCCESS;
853+
}
837854
}
838855
}
839856

ompi/mca/osc/ucx/osc_ucx_comm.c

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
* Copyright (C) 2001-2017 Mellanox Technologies Ltd. ALL RIGHTS RESERVED.
33
* Copyright (c) 2019-2020 High Performance Computing Center Stuttgart,
44
* University of Stuttgart. All rights reserved.
5+
* Copyright (c) 2021 IBM Corporation. All rights reserved.
56
* $COPYRIGHT$
67
*
78
* Additional copyrights may follow
@@ -26,14 +27,6 @@
2627
return OMPI_ERROR; \
2728
}
2829

29-
/* helper to check whether UCX supports atomic operation on the size the operands */
30-
static inline __opal_attribute_always_inline__ bool is_atomic_size_supported(uint64_t remote_addr,
31-
size_t size)
32-
{
33-
return ((sizeof(uint32_t) == size && !(remote_addr & 0x3)) ||
34-
(sizeof(uint64_t) == size && !(remote_addr & 0x7)));
35-
}
36-
3730
typedef struct ucx_iovec {
3831
void *addr;
3932
size_t len;
@@ -403,7 +396,7 @@ bool use_atomic_op(
403396
ompi_datatype_type_size(origin_dt, &origin_dt_bytes);
404397
ompi_datatype_type_size(target_dt, &target_dt_bytes);
405398
/* UCX only supports 32 and 64-bit operands atm */
406-
if (is_atomic_size_supported(remote_addr, origin_dt_bytes) &&
399+
if (ompi_osc_base_is_atomic_size_supported(remote_addr, origin_dt_bytes) &&
407400
origin_dt_bytes == target_dt_bytes && origin_count == target_count) {
408401
return true;
409402
}
@@ -796,7 +789,7 @@ int ompi_osc_ucx_compare_and_swap(const void *origin_addr, const void *compare_a
796789
}
797790

798791
ompi_datatype_type_size(dt, &dt_bytes);
799-
if (is_atomic_size_supported(remote_addr, dt_bytes)) {
792+
if (ompi_osc_base_is_atomic_size_supported(remote_addr, dt_bytes)) {
800793
// fast path using UCX atomic operations
801794
return do_atomic_compare_and_swap(origin_addr, compare_addr,
802795
result_addr, dt, target,
@@ -852,7 +845,7 @@ int ompi_osc_ucx_fetch_and_op(const void *origin_addr, void *result_addr,
852845
ompi_datatype_type_size(dt, &dt_bytes);
853846

854847
/* UCX atomics are only supported on 32 and 64 bit values */
855-
if (is_atomic_size_supported(remote_addr, dt_bytes) &&
848+
if (ompi_osc_base_is_atomic_size_supported(remote_addr, dt_bytes) &&
856849
(op == &ompi_mpi_op_no_op.op || op == &ompi_mpi_op_replace.op ||
857850
op == &ompi_mpi_op_sum.op)) {
858851
uint64_t value;

0 commit comments

Comments
 (0)