From 871fdf318c52f26f8a378b372f010be855fd650e Mon Sep 17 00:00:00 2001
From: Vishwanath Venkatesan
Date: Thu, 26 May 2022 22:55:17 -0700
Subject: [PATCH] SHMEM_LOCKS: MCS implementation of SHMEM LOCKS

Add an MCS-algorithm-based implementation of shmem_locks to improve
performance for large-scale SHMEM applications that use locks.

MCS locking is now the default algorithm. Use the following MCA
parameters to control it:

--mca oshmem_enable_mcs_lock 0 to disable MCS locks and revert to the
original ticket locking.

--mca oshmem_api_verbose 10 for debug information on shmem_locks.

Signed-off-by: Vishwanath Venkatesan
(cherry picked from commit 1396585210730d881baeb38290dbb5b66133e6ca)
---
 oshmem/runtime/oshmem_shmem_params.c |  20 ++-
 oshmem/runtime/params.h              |   7 +
 oshmem/shmem/c/Makefile.am           |   3 +-
 oshmem/shmem/c/shmem_clear_lock.c    |  11 +-
 oshmem/shmem/c/shmem_mcs_lock.c      | 239 +++++++++++++++++++++++++++
 oshmem/shmem/c/shmem_set_lock.c      |  11 +-
 oshmem/shmem/c/shmem_test_lock.c     |  11 +-
 oshmem/shmem/shmem_lock.h            |   3 +
 8 files changed, 298 insertions(+), 7 deletions(-)
 create mode 100644 oshmem/shmem/c/shmem_mcs_lock.c

diff --git a/oshmem/runtime/oshmem_shmem_params.c b/oshmem/runtime/oshmem_shmem_params.c
index 24035be24ee..3d68fcb927a 100644
--- a/oshmem/runtime/oshmem_shmem_params.c
+++ b/oshmem/runtime/oshmem_shmem_params.c
@@ -17,9 +17,10 @@
 
 #include "oshmem/constants.h"
 
-int oshmem_shmem_lock_recursive = 0;
-int oshmem_shmem_api_verbose = 0;
-int oshmem_preconnect_all = 0;
+int oshmem_shmem_lock_recursive = 0;
+int oshmem_shmem_api_verbose = 0;
+int oshmem_shmem_enable_mcs_locks = 1;
+int oshmem_preconnect_all = 0;
 
 int oshmem_shmem_register_params(void)
 {
@@ -38,6 +39,19 @@ int oshmem_shmem_register_params(void)
                                  MCA_BASE_VAR_SCOPE_READONLY,
                                  &oshmem_shmem_lock_recursive);
 
+    (void) mca_base_var_register("oshmem",
+                                 "oshmem",
+                                 NULL,
+                                 "enable_mcs_lock",
+                                 "enable mcs locks",
+                                 MCA_BASE_VAR_TYPE_INT,
+                                 NULL,
+                                 1,
+                                 MCA_BASE_VAR_FLAG_SETTABLE,
+                                 OPAL_INFO_LVL_9,
+                                 MCA_BASE_VAR_SCOPE_READONLY,
+                                 &oshmem_shmem_enable_mcs_locks);
+
     (void) mca_base_var_register("oshmem",
                                  "oshmem",
                                  NULL,
diff --git a/oshmem/runtime/params.h b/oshmem/runtime/params.h
index e1a2d8cf1d8..979b1125d08 100644
--- a/oshmem/runtime/params.h
+++ b/oshmem/runtime/params.h
@@ -37,6 +37,13 @@ OSHMEM_DECLSPEC extern int oshmem_shmem_api_verbose;
  */
 OSHMEM_DECLSPEC extern int oshmem_preconnect_all;
 
+
+/**
+ * Whether to force SHMEM processes to use MCS locking
+ * for shmem_locks
+ */
+OSHMEM_DECLSPEC extern int oshmem_shmem_enable_mcs_locks;
+
 END_C_DECLS
 
 #endif /* OSHMEM_RUNTIME_PARAMS_H */
diff --git a/oshmem/shmem/c/Makefile.am b/oshmem/shmem/c/Makefile.am
index d2c152073c0..194de248008 100644
--- a/oshmem/shmem/c/Makefile.am
+++ b/oshmem/shmem/c/Makefile.am
@@ -13,7 +13,8 @@
 
 OSHMEM_AUX_SOURCES = \
-	shmem_lock.c
+	shmem_lock.c \
+	shmem_mcs_lock.c
 
 OSHMEM_API_SOURCES = \
 	shmem_init.c \
diff --git a/oshmem/shmem/c/shmem_clear_lock.c b/oshmem/shmem/c/shmem_clear_lock.c
index 3051047a686..4c94038d316 100644
--- a/oshmem/shmem/c/shmem_clear_lock.c
+++ b/oshmem/shmem/c/shmem_clear_lock.c
@@ -1,4 +1,6 @@
 /*
+ * Copyright (c) 2023      NVIDIA Corporation.
+ *                         All rights reserved.
  * Copyright (c) 2013-2016 Mellanox Technologies, Inc.
  *                         All rights reserved.
  * Copyright (c) 2019      Research Organization for Information Science
@@ -18,6 +20,7 @@
 #include "oshmem/shmem/shmem_api_logger.h"
 #include "oshmem/runtime/runtime.h"
 #include "oshmem/shmem/shmem_lock.h"
+#include "oshmem/runtime/params.h"
 
 #if OSHMEM_PROFILING
 #include "oshmem/include/pshmem.h"
@@ -27,5 +30,11 @@
 
 void shmem_clear_lock(volatile long *lock)
 {
-    _shmem_clear_lock((void *)lock, sizeof(long));
+    if (oshmem_shmem_enable_mcs_locks) {
+        SHMEM_API_VERBOSE(10, "Clear Lock with MCS Lock implementation");
+        _shmem_mcs_clear_lock((long *)lock);
+    } else {
+        SHMEM_API_VERBOSE(10, "Clear Lock with Ticket Lock implementation");
+        _shmem_clear_lock((void *)lock, sizeof(long));
+    }
 }
diff --git a/oshmem/shmem/c/shmem_mcs_lock.c b/oshmem/shmem/c/shmem_mcs_lock.c
new file mode 100644
index 00000000000..3d7e97ee7b4
--- /dev/null
+++ b/oshmem/shmem/c/shmem_mcs_lock.c
@@ -0,0 +1,239 @@
+/*
+ * Copyright (c) 2023      NVIDIA Corporation.
+ *                         All rights reserved.
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "oshmem_config.h"
+
+#include "oshmem/constants.h"
+#include "oshmem/include/shmem.h"
+#include "oshmem/runtime/params.h"
+#include "oshmem/runtime/runtime.h"
+#include
+#include
+
+#include "oshmem/shmem/shmem_api_logger.h"
+#include "oshmem/shmem/shmem_lock.h"
+#include "oshmem/mca/memheap/memheap.h"
+#include "oshmem/mca/memheap/base/base.h"
+#include "oshmem/mca/atomic/atomic.h"
+
+#define OPAL_BITWISE_SIZEOF_LONG (SIZEOF_LONG * 8)
+
+
+/** Use the basic MCS distributed-lock algorithm for the lock */
+struct shmem_mcs_lock {
+    /** Has meaning only on the MCS queue tail owner PE */
+    int tail;
+    /** Has meaning on all PEs */
+    /** The next pointer is a combination of the PE ID and wait signal */
+    int next;
+};
+typedef struct shmem_mcs_lock shmem_mcs_lock_t;
+
+#define SHMEM_MCSL_TAIL_OWNER(lock_ptr)\
+        (((uintptr_t)(lock_ptr) / sizeof(long)) % shmem_n_pes())
+
+#define SHMEM_MCSL_NEXT_MASK 0x7FFFFFFFU
+#define SHMEM_MCSL_SIGNAL_MASK 0x80000000U /** Wait signal mask */
+#define SHMEM_MCSL_NEXT(lock_val) ((lock_val) & SHMEM_MCSL_NEXT_MASK)
+/** Improve readability */
+#define SHMEM_MCSL_GET_PE(tail_val) ((tail_val) & SHMEM_MCSL_NEXT_MASK)
+#define SHMEM_MCSL_SIGNAL(lock_val) ((lock_val) & SHMEM_MCSL_SIGNAL_MASK)
+#define SHMEM_MCSL_SET_SIGNAL(lock_val) ((lock_val) | SHMEM_MCSL_SIGNAL_MASK)
+
+void
+_shmem_mcs_set_lock(long *lockp)
+{
+    shmem_mcs_lock_t *lock = (shmem_mcs_lock_t *) lockp;
+    int mcs_tail_owner = SHMEM_MCSL_TAIL_OWNER(lock);
+    int new_tail_req = 0;
+    int *tail = &(lock->tail);
+    int *next = &(lock->next);
+    int my_pe = shmem_my_pe();
+    int curr = 0;
+    int out_value = 0;
+    int prev_tail = 0;
+    int prev_tailpe = 0;
+    int tval = 0;
+    int tmp_val = 0;
+    int retv = 0;
+    uint64_t value_tmp = 0;
+
+    RUNTIME_CHECK_INIT();
+    /**
+     * Initialize the NEXT pointer to the NEXT mask.
+     * Done atomically to avoid races, since the NEXT pointer
+     * can be modified by other PEs while they acquire or
+     * release the lock.
+     */
+    /**
+     * This could be a shmem_atomic_set to be safe on
+     * non-cache-coherent architectures,
+     * but that has an impact on performance.
+     */
+    value_tmp = SHMEM_MCSL_NEXT_MASK;
+    out_value = SHMEM_MCSL_NEXT_MASK;
+    retv = MCA_ATOMIC_CALL(swap(oshmem_ctx_default, (void*)next,
+                                (void*)&out_value, value_tmp,
+                                sizeof(int), my_pe));
+    RUNTIME_CHECK_RC(retv);
+    MCA_SPML_CALL(quiet(oshmem_ctx_default));
+
+    /** Signal that this PE is requesting the lock */
+    new_tail_req = SHMEM_MCSL_SET_SIGNAL(my_pe);
+    /**
+     * Atomically swap this PE in as the new tail on the tail-owner PE
+     * and get back the previous tail PE.
+     */
+    retv = MCA_ATOMIC_CALL(swap(oshmem_ctx_default, (void *)tail,
+                                (void*)&prev_tail,
+                                OSHMEM_ATOMIC_PTR_2_INT(&new_tail_req,
+                                                        sizeof(new_tail_req)),
+                                sizeof(int), mcs_tail_owner));
+    RUNTIME_CHECK_RC(retv);
+
+    prev_tailpe = SHMEM_MCSL_GET_PE(prev_tail);
+    if (SHMEM_MCSL_SIGNAL(prev_tail)) {
+        /**
+         * Someone else acquired the lock before this PE.
+         * Add this PE to the previous tail PE's NEXT pointer.
+         * Subtract the SIGNAL bit to avoid changing it.
+         */
+        tmp_val = my_pe - SHMEM_MCSL_NEXT_MASK;
+        retv = MCA_ATOMIC_CALL(add(oshmem_ctx_default, (void*)next, tmp_val,
+                                   sizeof(int), prev_tailpe));
+        RUNTIME_CHECK_RC(retv);
+        /**
+         * This value will eventually be changed by the predecessor
+         * when it releases the lock.
+         * It must be done atomically to avoid races where the NEXT
+         * pointer is modified by another PE acquiring or
+         * releasing the lock.
+         */
+        retv = MCA_ATOMIC_CALL(add(oshmem_ctx_default, (void *)next,
+                                   SHMEM_MCSL_SIGNAL_MASK, sizeof(int),
+                                   my_pe));
+        RUNTIME_CHECK_RC(retv);
+        MCA_SPML_CALL(quiet(oshmem_ctx_default));
+        /** Wait for the predecessor to release the lock to this PE (signal set to false). */
+        retv = MCA_ATOMIC_CALL(fadd(oshmem_ctx_default, (void*)next,
+                                    (void*)&curr, tval, sizeof(int), my_pe));
+        RUNTIME_CHECK_RC(retv);
+
+        while (SHMEM_MCSL_SIGNAL(curr)) {
+            retv = MCA_SPML_CALL(wait((void*)next, SHMEM_CMP_NE,
+                                      (void*)&curr, SHMEM_INT));
+            RUNTIME_CHECK_RC(retv);
+            retv = MCA_ATOMIC_CALL(fadd(oshmem_ctx_default, (void*)next,
+                                        (void*)&curr, tval, sizeof(int),
+                                        my_pe));
+            RUNTIME_CHECK_RC(retv);
+        }
+    }
+    /** else: this PE now holds the lock, since no one else held it */
+}
+
+void
+_shmem_mcs_clear_lock(long *lockp)
+{
+    shmem_mcs_lock_t *lock = (shmem_mcs_lock_t *) lockp;
+    int mcs_tail_owner = SHMEM_MCSL_TAIL_OWNER(lock);
+    int *tail = &(lock->tail);
+    int *next = &(lock->next);
+    int my_pe = shmem_my_pe();
+    int next_value = 0;
+    int swap_cond = 0;
+    int prev_value = 0;
+    int tval = 0;
+    int val_tmp = 0;
+    int nmask = 0;
+    int a_val = 0;
+    int retv = 0;
+
+    /**
+     * This could be an atomic fetch to be safe on non-cache-coherent
+     * architectures, but that has an impact on performance.
+     */
+    retv = MCA_ATOMIC_CALL(fadd(oshmem_ctx_default, (void*)next,
+                                (void*)&next_value, tval, sizeof(int),
+                                my_pe));
+    RUNTIME_CHECK_RC(retv);
+    MCA_SPML_CALL(quiet(oshmem_ctx_default));
+
+    if (next_value == SHMEM_MCSL_NEXT_MASK) {
+        swap_cond = SHMEM_MCSL_SET_SIGNAL(my_pe);
+        retv = MCA_ATOMIC_CALL(cswap(oshmem_ctx_default,
+                                     (void *)tail, (uint64_t *)&(prev_value),
+                                     OSHMEM_ATOMIC_PTR_2_INT(&swap_cond,
+                                                             sizeof(swap_cond)),
+                                     OSHMEM_ATOMIC_PTR_2_INT(&val_tmp,
+                                                             sizeof(val_tmp)), sizeof(int),
+                                     mcs_tail_owner));
+        RUNTIME_CHECK_RC(retv);
+
+        /** If this PE is still the tail, the lock has been released. */
+        if (prev_value == swap_cond) {
+            return;
+        }
+        /**
+         * This PE is not the tail: another PE may be racing to acquire
+         * the lock, so let it finish setting itself as our next.
+         */
+        nmask = SHMEM_MCSL_NEXT_MASK;
+        while (next_value == nmask) {
+            retv = MCA_SPML_CALL(wait((void*)next, SHMEM_CMP_NE,
+                                      (void*)&nmask, SHMEM_INT));
+            RUNTIME_CHECK_RC(retv);
+            retv = MCA_ATOMIC_CALL(fadd(oshmem_ctx_default, (void*)next,
+                                        (void*)&next_value, tval,
+                                        sizeof(int), my_pe));
+            RUNTIME_CHECK_RC(retv);
+        }
+    }
+    /** There is a successor: release the lock to the successor. */
+    a_val = SHMEM_MCSL_SIGNAL_MASK;
+    retv = MCA_ATOMIC_CALL(add(oshmem_ctx_default,
+                               (void *)next, a_val, sizeof(a_val),
+                               SHMEM_MCSL_NEXT(next_value)));
+    RUNTIME_CHECK_RC(retv);
+    MCA_SPML_CALL(quiet(oshmem_ctx_default));
+}
+
+int
+_shmem_mcs_test_lock(long *lockp)
+{
+    shmem_mcs_lock_t *lock = (shmem_mcs_lock_t *) lockp;
+    int mcs_tail_owner = SHMEM_MCSL_TAIL_OWNER(lock);
+    int new_tail_req = 0;
+    int prev_tail = 0;
+    int tmp_cond = 0;
+    int *tail = &(lock->tail);
+    int *next = &(lock->next);
+    int my_pe = shmem_my_pe();
+    int retv = 0;
+
+    /** Initialize the NEXT pointer to the NEXT mask */
+    *next = SHMEM_MCSL_NEXT_MASK;
+
+    /** Signal that this PE is requesting the lock */
+    new_tail_req = SHMEM_MCSL_SET_SIGNAL(my_pe);
+
+    /** Check if the lock was previously cleared before swapping */
+    retv = MCA_ATOMIC_CALL(cswap(oshmem_ctx_default,
+                                 (void *)tail, (uint64_t *)&(prev_tail),
+                                 OSHMEM_ATOMIC_PTR_2_INT(&tmp_cond,
+                                                         sizeof(tmp_cond)),
+                                 OSHMEM_ATOMIC_PTR_2_INT(&new_tail_req,
+                                                         sizeof(new_tail_req)),
+                                 sizeof(int), mcs_tail_owner));
+    RUNTIME_CHECK_RC(retv);
+
+    return (0 != prev_tail);
+}
diff --git a/oshmem/shmem/c/shmem_set_lock.c b/oshmem/shmem/c/shmem_set_lock.c
index 514cb2111c3..90cc9bb706e 100644
--- a/oshmem/shmem/c/shmem_set_lock.c
+++ b/oshmem/shmem/c/shmem_set_lock.c
@@ -1,4 +1,6 @@
 /*
+ * Copyright (c) 2023      NVIDIA Corporation.
+ *                         All rights reserved.
  * Copyright (c) 2013-2016 Mellanox Technologies, Inc.
  *                         All rights reserved.
  * Copyright (c) 2019      Research Organization for Information Science
@@ -18,6 +20,7 @@
 #include "oshmem/shmem/shmem_api_logger.h"
 #include "oshmem/runtime/runtime.h"
 #include "oshmem/shmem/shmem_lock.h"
+#include "oshmem/runtime/params.h"
 
 #if OSHMEM_PROFILING
 #include "oshmem/include/pshmem.h"
@@ -27,5 +30,11 @@
 
 void shmem_set_lock(volatile long *lock)
 {
-    _shmem_set_lock((void *)lock, sizeof(long));
+    if (oshmem_shmem_enable_mcs_locks) {
+        SHMEM_API_VERBOSE(10, "Set Lock with MCS Lock implementation");
+        _shmem_mcs_set_lock((long *)lock);
+    } else {
+        SHMEM_API_VERBOSE(10, "Set Lock with Ticket Lock implementation");
+        _shmem_set_lock((void *)lock, sizeof(long));
+    }
 }
diff --git a/oshmem/shmem/c/shmem_test_lock.c b/oshmem/shmem/c/shmem_test_lock.c
index 217b9afde02..0cae5576f5f 100644
--- a/oshmem/shmem/c/shmem_test_lock.c
+++ b/oshmem/shmem/c/shmem_test_lock.c
@@ -1,4 +1,6 @@
 /*
+ * Copyright (c) 2023      NVIDIA Corporation.
+ *                         All rights reserved.
  * Copyright (c) 2013-2016 Mellanox Technologies, Inc.
  *                         All rights reserved.
  * Copyright (c) 2019      Research Organization for Information Science
@@ -18,6 +20,7 @@
 #include "oshmem/include/shmem.h"
 #include "oshmem/shmem/shmem_api_logger.h"
 #include "oshmem/runtime/runtime.h"
+#include "oshmem/runtime/params.h"
 #include "oshmem/shmem/shmem_lock.h"
 
 #if OSHMEM_PROFILING
@@ -28,5 +31,11 @@
 
 int shmem_test_lock(volatile long *lock)
 {
-    return _shmem_test_lock((void *)lock, sizeof(long));
+    if (oshmem_shmem_enable_mcs_locks) {
+        SHMEM_API_VERBOSE(10, "Test lock using MCS Lock implementation");
+        return _shmem_mcs_test_lock((long *)lock);
+    } else {
+        SHMEM_API_VERBOSE(10, "Test_lock using Ticket Lock implementation");
+        return _shmem_test_lock((void *)lock, sizeof(long));
+    }
 }
diff --git a/oshmem/shmem/shmem_lock.h b/oshmem/shmem/shmem_lock.h
index c338339c529..de138f45ff9 100644
--- a/oshmem/shmem/shmem_lock.h
+++ b/oshmem/shmem/shmem_lock.h
@@ -22,5 +22,8 @@
 void _shmem_set_lock(void *lock, int lock_size);
 int _shmem_test_lock(void *lock, int lock_size);
 void _shmem_clear_lock(void *lock, int lock_size);
 
+void _shmem_mcs_set_lock(long *lock);
+void _shmem_mcs_clear_lock(long *lock);
+int _shmem_mcs_test_lock(long *lock);
 #endif /*SHMEM_LOCK_H*/
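
Usage note (not part of the diff): the sketch below is a minimal, illustrative
OpenSHMEM program that exercises the shmem_set_lock()/shmem_clear_lock() path
touched by this change. The file name, counter variable, and process count are
made up for the example; only the MCA parameters come from the commit message.
With a launcher such as oshrun, the MCS path is taken by default and can be
switched back or traced as follows:

    oshrun -np 8 ./mcs_lock_example
    oshrun -np 8 --mca oshmem_enable_mcs_lock 0 ./mcs_lock_example
    oshrun -np 8 --mca oshmem_api_verbose 10 ./mcs_lock_example

    /* mcs_lock_example.c: every PE increments a counter on PE 0 under
     * the same symmetric lock, so the final value equals the number of PEs. */
    #include <stdio.h>
    #include <shmem.h>

    static long lock = 0;    /* symmetric lock variable, must start at 0 */
    static int counter = 0;  /* symmetric counter; the copy on PE 0 is shared */

    int main(void)
    {
        shmem_init();
        int me = shmem_my_pe();
        int npes = shmem_n_pes();

        shmem_set_lock(&lock);               /* MCS path when oshmem_enable_mcs_lock=1 */
        int cur = shmem_int_g(&counter, 0);  /* read the counter on PE 0 */
        shmem_int_p(&counter, cur + 1, 0);   /* write back the incremented value */
        shmem_quiet();                       /* complete the update before releasing */
        shmem_clear_lock(&lock);

        shmem_barrier_all();
        if (me == 0) {
            printf("counter = %d (expected %d)\n", counter, npes);
        }
        shmem_finalize();
        return 0;
    }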