diff --git a/CMakeLists.txt b/CMakeLists.txt index 844a4499..075cc34a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -176,6 +176,7 @@ if ( LIB_MATH AND LIB_DL AND MPI_FOUND ) if (ENABLE_IBVERBS) list(APPEND ENGINES "ibverbs") + list(APPEND ENGINES "zero") endif() endif() @@ -493,7 +494,7 @@ if (LPF_ENABLE_TESTS) TEST_PREFIX ${ENGINE}_ EXTRA_ARGS --gtest_output=xml:${test_output}/${ENGINE}_${testName} DISCOVERY_MODE POST_BUILD - DISCOVERY_TIMEOUT 15 + DISCOVERY_TIMEOUT 60 ) endfunction(add_gtest) diff --git a/NOTICE b/NOTICE index 1f386452..3992b64c 100644 --- a/NOTICE +++ b/NOTICE @@ -33,6 +33,8 @@ Implementation 1) BSMP 2) Collectives 3) Pthread implementation + - 2022 - 2024, Kiril Dichev + 1) Develop zero engine for LPF - 2018, Pierre Leca 1) Usability improvements of compiler frontends and CMake integration @@ -50,6 +52,8 @@ Quality Assurance - 2015 - 2017, Albert-Jan Yzelman 1) Performance test suite + - 2022 - 2024, Kiril Dichev + 1) Rewrite all functional tests to use CTest/Gtest Miscellaneous / Acknowledgments diff --git a/bootstrap.sh b/bootstrap.sh index 1bc1835c..4c3d4e68 100755 --- a/bootstrap.sh +++ b/bootstrap.sh @@ -278,13 +278,13 @@ echo "--------------------------------------------------" echo ${CMAKE_EXE} -Wno-dev \ -DCMAKE_INSTALL_PREFIX="$installdir" \ - -DCMAKE_BUILD_TYPE=$config \ - -DLPFLIB_MAKE_DOC=$doc \ - -DLPFLIB_MAKE_TEST_DOC=$doc \ + -DCMAKE_BUILD_TYPE=$config \ + -DLPFLIB_MAKE_DOC=$doc \ + -DLPFLIB_MAKE_TEST_DOC=$doc \ -DLPF_ENABLE_TESTS=$functests \ -DGTEST_AGREE_TO_LICENSE=$googletest_license_agreement \ - -DLPFLIB_PERFTESTS=$perftests \ - -DLPFLIB_CONFIG_NAME=${config_name:-${config}}\ + -DLPFLIB_PERFTESTS=$perftests \ + -DLPFLIB_CONFIG_NAME=${config_name:-${config}} \ -DLPF_HWLOC="${hwloc}" \ $hwloc_found_flag \ $mpi_cmake_flags \ diff --git a/cmake/mpi.cmake b/cmake/mpi.cmake index f8d55851..56075c5a 100644 --- a/cmake/mpi.cmake +++ b/cmake/mpi.cmake @@ -15,7 +15,7 @@ # limitations under the License. # -find_package(MPI) +find_package(MPI REQUIRED) # Find the 'mpirun' frontend string( REGEX REPLACE "exec$" "run" mpirun "${MPIEXEC}" ) diff --git a/include/lpf/core.h b/include/lpf/core.h index 9c0d1da8..320ca2e1 100644 --- a/include/lpf/core.h +++ b/include/lpf/core.h @@ -688,8 +688,10 @@ #ifdef __cplusplus #include +#include #else #include +#include #endif #endif // DOXYGEN @@ -705,7 +707,7 @@ extern "C" { * released, and NN the number of the specifications released before this one in * the same year. */ -#define _LPF_VERSION 202000L +#define _LPF_VERSION 202500L /** * An implementation that has defined this macro may never define the @@ -942,7 +944,7 @@ typedef void * lpf_init_t; #ifdef DOXYGEN typedef ... lpf_sync_attr_t; #else -typedef int lpf_sync_attr_t; +typedef void * lpf_sync_attr_t; #endif /** @@ -984,7 +986,7 @@ typedef struct lpf_machine { * byte. This value may depend on the actual number of processes \a p used, * the minimum message size \a min_msg_size the user aims to send and * receive, and the type of synchronisation requested via \a attr. The - * value is bitwise equivalent across all processes. + * value is bitwise equivalent across all processes. * * \param[in] p A value between 1 and #lpf_machine_t.p, where * both bounds are inclusive. @@ -1038,7 +1040,19 @@ typedef struct lpf_machine { * memory areas must be registered for direct remote memory access (DRMA). * * \par Communication - * Object of this type must not be communicated. 
+ * Objects of this type must not be communicated; if they are, objects copied + * to a remote process in principle do \em not represent valid memory slots. + * + * \par Trivially Copyable + * Objects of this type are trivially copyable in the same sense as the C++11 + * TriviallyCopyable type category. + * + * \note Rationale: extensions could rely on the trivial copyability of memory + * slots. Therefore, while the core specification stipulates memory slots + * should not be copied across nodes with the expectation that a valid + * memory slot on process A when copied to process B yields a valid memory + * slot on process B, it must account for the possibility (provided by + * extensions) that such a copy could be meaningful. */ #ifdef DOXYGEN typedef ... lpf_memslot_t; #else typedef size_t lpf_memslot_t; @@ -1066,7 +1080,7 @@ typedef size_t lpf_memslot_t; #ifdef DOXYGEN typedef ... lpf_msg_attr_t; #else -typedef int lpf_msg_attr_t; +typedef void * lpf_msg_attr_t; #endif /** diff --git a/include/lpf/noc.h b/include/lpf/noc.h new file mode 100644 index 00000000..4bbe3031 --- /dev/null +++ b/include/lpf/noc.h @@ -0,0 +1,559 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LPFLIB_NOC_H +#define LPFLIB_NOC_H + +// import size_t data type for the implementation +#ifndef DOXYGEN + +#ifdef __cplusplus +#include +#else +#include +#endif + +#include + +#endif // DOXYGEN + + +#ifdef __cplusplus +extern "C" { +#endif + +/** \addtogroup LPF_EXTENSIONS LPF API extensions + * + * @{ + * + * \defgroup LPF_NOC Extensions to LPF where it need not maintain consistency. + * + * This extension specifies facilities for (de-)registering memory slots, + * registering RDMA requests, and fencing RDMA requests. These extensions are, + * as far as possible, fully compatible with the core LPF API specifications. + * Reused core API concepts include LPF contexts (#lpf_t), processor count types + * (#lpf_pid_t), memory slot types (#lpf_memslot_t), message attributes + * (#lpf_msg_attr_t), the #lpf_sync primitive, and, by extension, + * synchronization attributes (#lpf_sync_attr_t). + * + * In this extension, + * 1. LPF does not maintain consistency amongst processes that (de-)register + * memory slots while RDMA communication may occur. Maintaining the + * required consistency instead becomes the purview of the user. This + * extension specifies exactly what consistency properties the user must + * guarantee; and + * 2. LPF provides facilities with which RDMA communication may be fenced on a + * finer granularity than when using #lpf_sync; this applies to the use of + * #lpf_put, #lpf_get, #lpf_noc_put, and #lpf_noc_get. The use of these + * facilities shall not change the semantics of an #lpf_sync that may + * follow (however, the use of #lpf_sync may not be needed in order + * to complete RDMA requests). + * + * These two mechanisms for achieving different types of non-coherency may be + * employed orthogonally.
For the first extension, the following primitives are + * provided: + * - #lpf_noc_resize_memory_register, + * - #lpf_noc_register, + * - #lpf_noc_deregister, + * - #lpf_noc_put, and + * - #lpf_noc_get. + * While these primitives re-use the standard #lpf_memslot_t, implementations + * may handle so-called non-coherent memory slots differently from normal memory + * slots. One key requirement is that non-coherent memory slots should be + * byte-copyable and also safe to communicate across + * processes. + * + * \note At this point in time, this first extension set is not implemented by + * any engine. + * + * For the second extension, the following primitives are provided: + * - #lpf_noc_flush_sent, and + * - #lpf_noc_flush_received. + * + * \warning If LPF is considered a tool for the so-called hero + * programmer, then please note that this variant is even harder + * to program with. + * + * \note At present, no debug layer exists for this extension. It is unclear if + * such a debug layer is even possible (precisely because LPF in this + * extension does not maintain consistency, there is no way a debug layer + * could enforce it). + * + * \par Engines that implement the first non-coherent extension set + * None. + * + * \par Engines that implement the second non-coherent extension set + * - the \em zero engine. + * + * @{ + */ + + +/** + * The version of this non-coherent LPF specification. All implementations shall + * define this macro. The format is YYYYNN, where YYYY is the year the + * specification was released, and NN the number of the specifications released + * before this one in the same year. + */ +#define _LPF_NOC_VERSION 202400L + +/** + * Resizes the memory register for non-coherent RDMA. + * + * After a successful call to this function, the local process has enough + * resources to register \a max_regs memory regions in a non-coherent way. + * + * Each registration via lpf_noc_register() counts as one. Such registrations + * continue to occupy capacity in the register until they are released via a call + * to lpf_noc_deregister(), which lowers the count of used memory registrations + * by one. + * + * There are no runtime out-of-bounds checks prescribed for lpf_noc_register()-- + * this would also be too costly as error checking would require communication. + * + * If memory allocation was successful, the return value is #LPF_SUCCESS and + * the local process may assume the new buffer size \a max_regs. + * + * In the case of insufficient local memory, the return value will be + * #LPF_ERR_OUT_OF_MEMORY. In that case, it is as if the call never happened and + * the user may retry the call locally after freeing up unused resources. Should + * retrying not lead to a successful call, the programmer may opt to broadcast + * the error (using existing slots) or to give up by returning from the spmd + * section. + * + * \note The current maximum cannot be retrieved from the runtime. Instead, the + * programmer must track this information herself. To provide + * encapsulation, see lpf_rehook(). + * + * \note When the given memory register capacity is smaller than the current + * capacity, the runtime is allowed but not required to release the + * allocated memory. Such a call shall always be successful and return + * #LPF_SUCCESS. + * + * \note This means that an implementation that allows shrinking the given + * capacity must also ensure the old buffer remains intact in case there + * is not enough memory to allocate a smaller one.
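+ *
+ * \par Example
+ * An illustrative sketch (not normative; the buffer size and variable names
+ * are placeholders) that reserves capacity for one non-coherent slot and
+ * registers a local buffer from within an SPMD function:
+ * \code
+ * void spmd( lpf_t ctx, lpf_pid_t pid, lpf_pid_t nprocs, lpf_args_t args ) {
+ *     char buffer[ 1024 ];
+ *     lpf_memslot_t slot = LPF_INVALID_MEMSLOT;
+ *     if( lpf_noc_resize_memory_register( ctx, 1 ) != LPF_SUCCESS ) {
+ *         return; // could not reserve capacity; give up locally
+ *     }
+ *     lpf_noc_register( ctx, buffer, sizeof(buffer), &slot );
+ *     // ... the slot may now be copied to remote processes and used as a
+ *     //     remote target of lpf_noc_put or lpf_noc_get ...
+ *     lpf_deregister( ctx, slot ); // takes effect immediately
+ * }
+ * \endcode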
+ * + * \note The last invocation of lpf_noc_resize_memory_register() determines the + * maximum number of memory registrations using lpf_noc_register() that + * can be maintained concurrently. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. Any + * further thread safety may be guaranteed by the implementation, but is not + * specified. Similar conditions hold for all LPF primitives that take an + * argument of type #lpf_t; see #lpf_t for more information. + * + * \param[in,out] ctx The runtime state as provided by lpf_exec(). + * \param[in] max_regs The requested maximum number of memory regions that can + * be registered. This value must be the same on all + * processes. + * + * \returns #LPF_SUCCESS + * When this process successfully acquires the resources. + * + * \returns #LPF_ERR_OUT_OF_MEMORY + * When there was not enough memory left on the heap. In this case + * the effect is the same as when this call did not occur at all. + * + * \par BSP costs + * None. + * + * \par Runtime costs + * \f$ \Theta( \mathit{max\_regs} ) \f$. + */ +extern _LPFLIB_API +lpf_err_t lpf_noc_resize_memory_register( lpf_t ctx, size_t max_regs ); + +/** + * Registers a local memory area, preparing its use for intra-process + * communication. + * + * The registration process is necessary to enable Remote Direct Memory Access + * (RDMA) primitives, such as lpf_get(), lpf_noc_get(), lpf_put(), and + * lpf_noc_put(). + * + * This is \em not a collective function. For #lpf_get and #lpf_put, the memory + * slot returned by this function is equivalent to a memory slot returned by + * #lpf_register_local; the \a memslot returned by a successful call to this + * function (hence) is immediately valid. A successful call (hence) immediately + * consumes one memory slot capacity; see also #lpf_resize_memory_register on + * how to ensure sufficient capacity. + * + * Different from a memory slot returned by #lpf_register_local, a memory slot + * returned by a successful call to this function may serve as either a local + * or remote memory slot for #lpf_noc_put and #lpf_noc_get. + * + * Use of the returned memory slot to indicate a remote memory area may only + * occur by copying the returned memory slot to another LPF process. This may + * be done using the standard #lpf_put and #lpf_get methods or by using + * auxiliary communication mechanisms. The memory slot thus communicated only + * refers to a valid memory area on the process it originated from; any other + * use leads to undefined behaviour. + * + * \note Note that the ability to copy memory slots to act as identifiers of + * remote areas exploits the LPF core specification that instances of + * the #lpf_memslot_t type are, indeed, byte-copyable. + * + * A memory slot returned by a successful call to this function may be + * destroyed via a call to the standard #lpf_deregister. The deregistration + * takes effect immediately. No communication using the deregistered slot + * should occur during that superstep, or otherwise undefined behaviour occurs. + * + * Only the process that created the returned memory slot can destroy it; other + * LPF processes than the one which created it that attempt to destroy the + * returned memory slot invoke undefined behaviour. + * + * Other than the above specified differences, the arguments to this function + * are the same as for #lpf_register_local: + * + * \param[in,out] ctx The runtime state as provided by lpf_exec(). 
+ * \param[in] pointer The pointer to the memory area to register. + * \param[in] size The size of the memory area to register in bytes. + * \param[out] memslot Where to store the memory slot identifier. + * + * \note Registering a slot with zero \a size is valid. The resulting memory + * slot cannot be written to nor read from by remote LPF processes. + * + * \note In particular, passing \c NULL as \a pointer and \c 0 for \a size is + * valid. + * + * \returns #LPF_SUCCESS + * Successfully registered the memory region and successfully + * assigned a memory slot identifier. + * + * \note One registration consumes one memory slot from the pool of locally + * available memory slots, which must have been preallocated by + * lpf_resize_memory_register() or recycled by lpf_deregister(). Always + * use lpf_resize_memory_register() at the start of the SPMD function + * that is executed by lpf_exec(), since lpf_exec() itself does not + * preallocate slots. + * + * \note It is illegal to request more memory slots than have previously been + * registered with lpf_resize_memory_register(). There is no runtime + * check for this error, because a safe way out cannot be guaranteed + * without significant parallel error checking overhead. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. Any + * further thread safety may be guaranteed by the implementation, but is not + * specified. Similar conditions hold for all LPF primitives that take an + * argument of type #lpf_t; see #lpf_t for more information. + * + * \par BSP costs + * + * None. + * + * \par Runtime costs + * + * \f$ \mathcal{O}( \texttt{size} ) \f$. + * + * \note This asymptotic bound may be attained for implementations that require + * linear-time processing on the registered memory area, such as to effect + * memory pinning. If this is not required, a good implementation will + * require only \f$ \Theta(1) \f$ time. + */ +extern _LPFLIB_API +lpf_err_t lpf_noc_register( + lpf_t ctx, + void * pointer, + size_t size, + lpf_memslot_t * memslot +); + +/** + * Deregisters a memory area previously registered using lpf_noc_register(). + * + * After a successful deregistration, the slot is returned to the pool of free + * memory slots. The total number of memory slots may be set via a call to + * lpf_noc_resize_memory_register(). + * + * Deregistration takes effect immediately. A call to this function is not + * collective, and the order of deregistration does not need to match the order + * of registration. Any local or remote communication using the given \a memslot + * in the current superstep invokes undefined behaviour. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. Any + * further thread safety may be guaranteed by the implementation, but is not + * specified. Similar conditions hold for all LPF primitives that take an + * argument of type #lpf_t; see #lpf_t for more information. + * + * \param[in,out] ctx The runtime state as provided by lpf_exec(). + * \param[in] memslot The memory slot identifier to de-register. + * + * \returns #LPF_SUCCESS + * Successfully deregistered the memory region. + * + * \par BSP costs + * None. + * + * \par Runtime costs + * \f$ \mathcal{O}(n) \f$, where \f$ n \f$ is the size of the memory region + * corresponding to \a memslot. + */ +extern _LPFLIB_API +lpf_err_t lpf_noc_deregister( + lpf_t ctx, + lpf_memslot_t memslot +); + +/** + * Copies contents of local memory into the memory of remote processes. 
+ * + * This operation is guaranteed to be completed after a call to the next + * lpf_sync() exits. Until that time it occupies one entry in the operations + * queue. + * + * Concurrent reads or writes from or to the same memory area are + * allowed in the same way they are for the core primitive #lpf_put. + * + * This primitive differs from #lpf_put in that the \a dst_slot may be the + * result of a successful call to #lpf_noc_register, while \a src_slot \em must + * be the result of such a successful call. In both cases, the slot need + * \em not have been registered before the last call to #lpf_sync. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. Any + * further thread safety may be guaranteed by the implementation, but is not + * specified. Similar conditions hold for all LPF primitives that take an + * argument of type #lpf_t; see #lpf_t for more information. + * + * \param[in,out] ctx The runtime state as provided by lpf_exec() + * \param[in] src_slot The memory slot of the local source memory area + * registered using lpf_register_local(), + * lpf_register_global(), or lpf_noc_register() + * \param[in] src_offset The offset of reading out the source memory area, + * w.r.t. the base location of the registered area + * expressed in bytes. + * \param[in] dst_pid The process ID of the destination process. + * \param[in] dst_slot The memory slot of the destination memory area at + * \a pid, registered using lpf_register_global() or + * lpf_noc_register(). + * \param[in] dst_offset The offset of writing to the destination memory area + * w.r.t. the base location of the registered area + * expressed in bytes. + * \param[in] size The number of bytes to copy from the source memory area + * to the destination memory area. + * \param[in] attr + * \parblock + * In case an \a attr not equal to #LPF_MSG_DEFAULT is provided, + * the message created by this function may have modified semantics + * that may be used to extend this API. Examples include: + * + * -# delaying the superstep deadline of delivery, and/or + * -# DRMA with message combining semantics. + * + * These attributes are stored after a call to this function has + * completed and may be modified immediately after without affecting + * any messages already scheduled. + * \endparblock + * + * \note See #lpf_put for notes regarding #lpf_msg_attr_t. + * + * \returns #LPF_SUCCESS + * When the communication request was recorded successfully. + * + * \par BSP costs + * This function will increase + * \f$ t_{c}^{(s)} \f$ + * and + * \f$ r_{c}^{(\mathit{pid})} \f$ + * by \a size, where c is the current superstep number and s is this process ID + * (as provided by #lpf_exec). See \ref BSPCOSTS on how this affects real-time + * communication costs. + * + * \par Runtime costs + * See \ref BSPCOSTS. + */ +extern _LPFLIB_API +lpf_err_t lpf_noc_put( + lpf_t ctx, + lpf_memslot_t src_slot, + size_t src_offset, + lpf_pid_t dst_pid, + lpf_memslot_t dst_slot, + size_t dst_offset, + size_t size, + lpf_msg_attr_t attr +); + +/** + * Copies contents from remote memory to local memory. + * + * This operation completes after one call to lpf_sync(). Until that time it + * occupies one entry in the operations queue. + * + * Concurrent reads or writes from or to the same memory area are allowed in the + * same way they are for #lpf_get.
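+ *
+ * \par Example
+ * An illustrative call (slot names are placeholders): read \c n bytes from
+ * offset zero of the slot \c remote, which was registered at process 0 via
+ * lpf_noc_register() and copied to this process, into the local slot
+ * \c local:
+ * \code
+ * lpf_noc_get( ctx, 0, remote, 0, local, 0, n, LPF_MSG_DEFAULT );
+ * \endcode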
+ * + * This primitive differs from #lpf_get in that the \a src_slot may be the + * result of a successful call to #lpf_noc_register, while \a dst_slot \em must + * be the result of such a successful call. In both cases, the slot need \em not + * have been registered before the last call to #lpf_sync. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. Any + * further thread safety may be guaranteed by the implementation, but is not + * specified. Similar conditions hold for all LPF primitives that take an + * argument of type #lpf_t; see #lpf_t for more information. + * + * \param[in,out] ctx The runtime state as provided by lpf_exec(). + * \param[in] src_pid The process ID of the source process. + * \param[in] src_slot The memory slot of the source memory area at \a pid, as + * globally registered with lpf_register_global() or + * lpf_noc_register(). + * \param[in] src_offset The offset of reading out the source memory area, + * w.r.t. the base location of the registered area + * expressed in bytes. + * \param[in] dst_slot The memory slot of the local destination memory area + * registered using lpf_register_local(), + * lpf_register_global(), or lpf_noc_register(). + * \param[in] dst_offset The offset of writing to the destination memory area + * w.r.t. the base location of the registered area + * expressed in bytes. + * \param[in] size The number of bytes to copy from the source + * remote memory location. + * \param[in] attr + * \parblock + * In case an \a attr not equal to #LPF_MSG_DEFAULT is provided, + * the message created by this function may have modified semantics + * that may be used to extend this API. Examples include: + * + * -# delaying the superstep deadline of delivery, and/or + * -# DRMA with message combining semantics. + * + * These attributes are stored after a call to this function has + * completed and may be modified immediately after without affecting + * any messages already scheduled. + * \endparblock + * + * \note See #lpf_get for notes on the use of #lpf_msg_attr_t. + * + * \returns #LPF_SUCCESS + * When the communication request was recorded successfully. + * + * \par BSP costs + * This function will increase + * \f$ r_{c}^{(s)} \f$ + * and + * \f$ t_{c}^{(\mathit{pid})} \f$ + * by \a size, where c is the current superstep number and s is this process ID + * (as provided via lpf_exec()). See \ref BSPCOSTS on how this affects real-time + * communication costs. + * + * \par Runtime costs + * See \ref BSPCOSTS. + */ +extern _LPFLIB_API +lpf_err_t lpf_noc_get( + lpf_t ctx, + lpf_pid_t src_pid, + lpf_memslot_t src_slot, + size_t src_offset, + lpf_memslot_t dst_slot, + size_t dst_offset, + size_t size, + lpf_msg_attr_t attr +); + +/** + * Processes completed outgoing RDMA requests that have occurred without calling + * #lpf_sync. + * + * Some fabrics require user-space to regularly flush internal queues at a rate + * that matches (or exceeds) that of outgoing RDMA request completions. It + * is implementation-specified how many times or at what frequency flushes must + * be performed. + * + * @param[in] ctx The LPF context. + * @param[in] attr The synchronisation attribute. + * + * \note Rationale: \a attr is requested because, given different attributes, + * different internal queues may be processed. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. Any + * further thread safety may be guaranteed by the implementation, but is not + * specified.
Similar conditions hold for all LPF primitives that take an + * argument of type #lpf_t; see #lpf_t for more information. + * + * \returns #LPF_SUCCESS This function never fails. + * + * \par BSP costs + * None; by using this primitive, the overall BSP cost remains unaffected. + * + * \par Runtime costs + * \f$ \mathcal{O}( n ) \f$, where \f$ n \f$ is the maximum number of + * simultaneously outstanding RDMA requests (see #lpf_resize_message_queue). + * When calling this function several times within the same superstep, the + * aggregate runtime cost remains \f$ \mathcal{O}(n) \f$. + * + * \note The above is not big-Theta, as some implementations do not require + * user-space flushes. + */ +extern _LPFLIB_API +lpf_err_t lpf_noc_flush_sent( lpf_t ctx, lpf_sync_attr_t attr ); + +/** + * Processes completed incoming RDMA requests that have occurred without calling + * #lpf_sync. + * + * Some fabrics require user-space to regularly flush internal queues at a rate + * that does matches (or exceeds) that of outgoing RDMA request completions. It + * is implementation-specified how many times or at what frequency flushes must + * be performed. + * + * @param[in] ctx The LPF context. + * @param[in] attr The synchronisation attribute. + * + * \note Rationale: \a attr is requested as given different attributes, + * different internal queues may be processed. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. Any + * further thread safety may be guaranteed by the implementation, but is not + * specified. Similar conditions hold for all LPF primitives that take an + * argument of type #lpf_t; see #lpf_t for more information. + * + * \returns #LPF_SUCCESS This function never fails. + * + * \par BSP costs + * None; by using this primitive, the overall BSP cost remains unaffected. + * + * \par Runtime costs + * \f$ \mathcal{O}( n ) \f$, where \f$ n \f$ is the maximum number of + * simultaneously outstanding RDMA requests (see #lpf_resize_message_queue). + * When calling this function several times within the same superstep, the + * aggregate runtime cost remains \f$ \mathcal{O}(n) \f$. + * + * \note The above is not big-Theta, as some implementations do not require + * user-space flushes. + */ +extern _LPFLIB_API +lpf_err_t lpf_noc_flush_received( lpf_t ctx, lpf_sync_attr_t attr ); + +/** + * @} + * @} + */ + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/lpf/tags.h b/include/lpf/tags.h new file mode 100644 index 00000000..812f685d --- /dev/null +++ b/include/lpf/tags.h @@ -0,0 +1,474 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LPFLIB_TAGS_H +#define LPFLIB_TAGS_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** \addtogroup LPF_EXTENSIONS LPF API extensions + * @{ + * + * \defgroup LPF_TAGS + * + * Tags enable identifying groups of messages that a call to #lpf_sync should + * wait on. 
This is an extension of the classic BSP behaviour that all messages + * issued during the communication phase of a superstep must be waited on; tags + * instead identify potentially multiple independent communication phases. + * Rather than #lpf_sync ending all communication phases, it may now elect to + * end a specific communication phase only, as identified by a tag. + * + * This mechanism is implemented by allowing tags to be tied to LPF message + * attributes as well as to LPF synchronization attributes. + * + * @{ + */ + +/** + * The specification version of the tags. + * + * All implementations shall define this macro. The format is YYYYNN, where YYYY + * is the year the specification was released, and NN the number of + * specifications released before this one in the same year. + */ +#define _LPF_TAGS_VERSION 202500L + +/** + * The type of an LPF tag. + * + * \par Communication + * Objects of this type must not be communicated. + */ +#ifdef DOXYGEN +typedef ... lpf_tag_t; +#else +typedef uint32_t lpf_tag_t; +#endif + +/** + * A dummy value to initialize an #lpf_tag_t instance at declaration. + * + * \note A debug implementation may check for this value so that errors can be + * detected. + */ +extern _LPFLIB_VAR const lpf_tag_t LPF_INVALID_TAG; + +/** + * Resizes the tag register for subsequent supersteps. + * + * The new capacity becomes valid \em after the next call to lpf_sync(). The + * initial capacity is zero. + * + * Each call to lpf_tag_create() counts as one, while every valid call to + * lpf_tag_destroy() decrements the number of registered tags by one. The + * initializer tag #LPF_INVALID_TAG does not count towards the number of + * registered tags. + * + * If allocation was successful, the return value is #LPF_SUCCESS. In the case + * of insufficient local memory, the return value is #LPF_ERR_OUT_OF_MEMORY. + * + * \note Neither the current maximum nor the currently registered number of tags + * can be retrieved from the run-time. Instead, the programmer must track this + * information herself. To provide encapsulation, please see lpf_rehook(). + * + * A call to this function with \a max_tags smaller than the current capacity + * shall always return #LPF_SUCCESS. + * + * \note When the given new capacity is smaller than the current capacity, the + * run-time is allowed but not required to release any superfluous + * memory. Implementations that do so must ensure that in case there was + * no space to allocate the smaller buffer, the older larger buffer + * remains intact (calls to this function requesting smaller-than-current + * capacity shall never fail). + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. + * + * \returns #LPF_SUCCESS When the process acquired resources for registering + * \a max_tags tags. + * + * \returns #LPF_ERR_OUT_OF_MEMORY When there was not enough memory left on the + * heap. On return, the effect is the same as + * when this call did not occur at all. + * + * \par BSP costs + * None. + * + * \par Runtime costs + * \f$ \mathcal{O}( \mathit{max\_tags} ) \f$. + */ +extern _LPFLIB_API +lpf_err_t lpf_resize_tag_register( + lpf_t ctx, + size_t max_tags +); + +/** + * Creates a new tag. + * + * This is a collective function, meaning that all processes call this + * primitive in the same superstep and in the same order. + * + * Once a tag is created, it takes one tag registration slot. The maximum + * number of registrations is given by lpf_resize_tag_register.
On entering + * this call, the user shall ensure at least one tag register remains free. + * + * @param[in,out] ctx The LPF context. + * @param[in] active Whether the calling process will be active within the + * newly-created tag. + * @param[out] tag Location where to store the newly created tag. One tag + * registration slot is consumed. + * + * Only processes active within a tag may use that tag during RDMA requests + * (put, get, and sync). Use of this tag by any other process invokes undefined + * behaviour. + * + * \note Implementations may modify the memory area pointed to by \a tag even if + * \a active is false. Such modified values should remain unused + * by RDMA requests, however. (Their only possible valid use is when + * supplied to a matching call to lpf_tag_destroy()). + * + * @returns #LPF_SUCCESS If the creation of the tag is successful. + */ +extern _LPFLIB_API +lpf_err_t lpf_tag_create( + lpf_t ctx, + bool active, + lpf_tag_t * tag +); + +/** + * Destroys a tag created by #lpf_tag_create. + * + * This is a collective function, meaning that all processes must call this + * primitive on the same tag in the same superstep and in the same order. + * + * @param[in,out] ctx The LPF context. + * @param[in] tag The tag to be destroyed. + * + * The given \a tag must have been the result of a previous successful call to + * #lpf_tag_create that was not already followed by a successful call to + * #lpf_tag_destroy. + * + * \note Even processes that marked themselves as inactive during tag creation + * must actively participate in their destruction. Implementations may + * optimise this process by translating destruction to a no-op on those + * processes. + * + * After a successful call to this function, the number of registered tags + * decreases by one. + * + * @returns #LPF_SUCCESS If the destruction of the tag is successful. + */ +extern _LPFLIB_API +lpf_err_t lpf_tag_destroy( + lpf_t ctx, + lpf_tag_t tag +); + +/** + * Creates a new message attribute that is compatible with the LPF tags + * extension. + * + * If an implementation supports additional extensions that employ message + * attributes, then attributes initialised by this extension must result in a + * valid message attribute for use with those other extensions also. + * + * \note This does \em not imply that using message attributes from multiple + * extensions simultaneously always yields sensible behaviour; this + * depends on the specification of the extensions. + * + * This extension is compatible with zero-cost synchronization extensions. + * + * @param[in,out] ctx The LPF context. + * @param[out] attr Where a new message attribute will be allocated. + * + * After a successful function call, applying the returned \a attr without + * modification shall induce the same behaviour as applying #LPF_MSG_DEFAULT. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. + * + * \returns #LPF_SUCCESS When a new \a attr was successfully constructed. After + * the call to this function, the attribute pointed to by + * \a attr shall be a valid message attribute. + * + * \returns #LPF_ERR_OUT_OF_MEMORY When not enough system resources were + * available to create a new message attribute. + * + * \par BSP costs + * None. + * + * \par Runtime costs + * \f$ \Theta( 1 ) \f$. + */ +extern _LPFLIB_API +lpf_err_t lpf_tag_create_mattr( + lpf_t ctx, + lpf_msg_attr_t * attr +); + +/** + * Creates a new synchronization attribute that is compatible with the LPF tags + * extension.
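+ *
+ * \par Example
+ * An illustrative sketch (not normative; slot names, offsets, and sizes are
+ * placeholders) of synchronising on one tagged communication phase only:
+ * \code
+ * lpf_tag_t tag = LPF_INVALID_TAG;
+ * lpf_msg_attr_t mattr;
+ * lpf_sync_attr_t sattr;
+ * lpf_resize_tag_register( ctx, 1 );
+ * lpf_sync( ctx, LPF_SYNC_DEFAULT );     // new tag capacity takes effect
+ * lpf_tag_create( ctx, true, &tag );     // collective
+ * lpf_tag_create_mattr( ctx, &mattr );
+ * lpf_tag_create_sattr( ctx, &sattr );
+ * lpf_tag_set_mattr( ctx, tag, mattr );  // tag the message...
+ * lpf_tag_set_sattr( ctx, tag, sattr );  // ...and the synchronisation
+ * lpf_put( ctx, src, 0, dst_pid, dst, 0, size, mattr );
+ * lpf_sync( ctx, sattr );                // ends this tagged phase only
+ * lpf_tag_destroy_mattr( ctx, mattr );
+ * lpf_tag_destroy_sattr( ctx, sattr );
+ * lpf_tag_destroy( ctx, tag );           // collective
+ * \endcode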
+ * + * If an implementation supports additional extensions that employ + * synchronization attributes, then attributes initialised by this extension + * must result in a valid synchronization attribute for use with those other + * extensions also. + * + * \note This does \em not imply that using synchronization attributes from + * multiple extensions simultaneously always yields sensible behaviour; + * this depends on the specification of the extensions. + * + * This extension is compatible with zero-cost synchronization extensions. + * + * @param[in,out] ctx The LPF context. + * @param[out] attr Where a new message attribute will be allocated. + * + * After a successful function call, applying the returned \a attr without + * modification shall induce the same behaviour as applying #LPF_MSG_DEFAULT. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. + * + * \returns #LPF_SUCCESS When a new \a attr was successfully constructed. After + * the call, the attribute pointed to by \a attr shall be + * a valid synchronisation attribute. + * + * \returns #LPF_ERR_OUT_OF_MEMORY When not enough system resources were + * available to create a new message attribute. + * + * \par BSP costs + * None. + * + * \par Runtime costs + * \f$ \Theta( 1 ) \f$. + */ +extern _LPFLIB_API +lpf_err_t lpf_tag_create_sattr( + lpf_t ctx, + lpf_sync_attr_t * attr +); + +/** + * Destroys a valid message attribute. + * + * The given \a attr must \em not equal #LPF_MSG_DEFAULT (the default message + * attribute may not be destroyed). The given \a attr must be created by this + * extension \em or by an extension that is compatible with the tags extension. + * + * This function may be called on message attributes created by the zero-cost + * synchronisation extension. + * + * @param[in,out] ctx The LPF context. + * @param[out] attr The message attribute to be destroyed. + * + * After a successful function call, the given \a attr shall become invalid and + * must not be used in subsequent calls to any LPF primitive. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. + * + * \returns #LPF_SUCCESS A call to this function always succeeds. + * + * \par BSP costs + * None. + * + * \par Runtime costs + * \f$ \Theta( 1 ) \f$. + */ +extern _LPFLIB_API +lpf_err_t lpf_tag_destroy_mattr( + lpf_t ctx, + lpf_msg_attr_t attr +); + +/** + * Destroys a valid synchronization attribute. + * + * The given \a attr must \em not equal #LPF_SYNC_DEFAULT (the default + * synchronization attribute may not be destroyed). The given \a attr must be + * created by this extension \em or by an extension that is compatible with the + * tags extension. + * + * This function may be called on synchronisation attributes created by the + * zero-cost synchronisation extension. + * + * @param[in,out] ctx The LPF context. + * @param[out] attr The message attribute to be destroyed. + * + * After a successful function call, the given \a attr shall become invalid and + * must not be used in subsequent calls to any LPF primitive. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. + * + * \returns #LPF_SUCCESS A call to this function always succeeds. + * + * \par BSP costs + * None. + * + * \par Runtime costs + * \f$ \Theta( 1 ) \f$. + */ +extern _LPFLIB_API +lpf_err_t lpf_tag_destroy_sattr( + lpf_t ctx, + lpf_sync_attr_t attr +); + +/** + * Retrieves a tag from a message attribute. 
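+ *
+ * \par Example
+ * An illustrative fragment (\c mattr is assumed to be a valid message
+ * attribute created via lpf_tag_create_mattr()):
+ * \code
+ * lpf_tag_t tag = LPF_INVALID_TAG;
+ * lpf_tag_get_mattr( ctx, mattr, &tag );
+ * if( tag == LPF_INVALID_TAG ) {
+ *     // no tag was attached to mattr
+ * }
+ * \endcode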
+ * + * @param[in,out] ctx The LPF context. + * @param[in] attr The message attribute. + * @param[out] tag Where to store the tag that was attached to \a attr. + * + * The given \a attr must be valid. + * + * \note An implementation must at least support attribute initialization via + * #lpf_tag_create_mattr. + * + * If \a attr was not attached a tag, then #LPF_INVALID_TAG will be returned at + * \a tag. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. + * + * \returns #LPF_SUCCESS A call to this function always succeeds. + * + * \par BSP costs + * None. + * + * \par Runtime costs + * \f$ \Theta( 1 ) \f$. + */ +extern _LPFLIB_API +lpf_err_t lpf_tag_get_mattr( + lpf_t ctx, + lpf_msg_attr_t attr, + lpf_tag_t * tag +); + +/** + * Attaches a tag to a given message attribute. + * + * @param[in,out] ctx The LPF context. + * @param[in] tag The tag to attach to \a attr. + * @param[in,out] attr Where to attach the \a tag to. + * + * The given \a attr must be valid. + * + * \note An implementation must at least support attribute initialization via + * #lpf_tag_create_mattr. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. + * + * \returns #LPF_SUCCESS A call to this function always succeeds. + * + * \par BSP costs + * None. + * + * \par Runtime costs + * \f$ \Theta( 1 ) \f$. + */ +extern _LPFLIB_API +lpf_err_t lpf_tag_set_mattr( + lpf_t ctx, + lpf_tag_t tag, + lpf_msg_attr_t attr +); + +/** + * Retrieves a tag from a synchronization attribute. + * + * @param[in,out] ctx The LPF context. + * @param[in] attr The synchronization attribute. + * @param[out] tag Where to store the tag that was attached to \a attr. + * + * The given \a attr must be valid. + * + * \note An implementation must at least support attribute initialization via + * #lpf_tag_create_sattr. + * + * If \a attr was not attached a tag, then #LPF_INVALID_TAG will be returned at + * \a tag. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. + * + * \returns #LPF_SUCCESS A call to this function always succeeds. + * + * \par BSP costs + * None. + * + * \par Runime costs + * \f$ \Theta( 1 ) \f$. + */ +extern _LPFLIB_API +lpf_err_t lpf_tag_get_sattr( + lpf_t ctx, + lpf_sync_attr_t attr, + lpf_tag_t * tag +); + +/** + * Attaches a tag to a given synchronization attribute. + * + * @param[in,out] ctx The LPF context. + * @param[in] tag The tag to attach to \a attr. + * @param[in,out] attr Where to attach the \a tag to. + * + * The given \a attr must be valid. + * + * \note An implementation must at least support attribute initialization via + * #lpf_tag_create_sattr. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. + * + * \returns #LPF_SUCCESS A call to this function always succeeds. + * + * \par BSP costs + * None. + * + * \par Runtime costs + * \f$ \Theta( 1 ) \f$. + */ +extern _LPFLIB_API +lpf_err_t lpf_tag_set_sattr( + lpf_t ctx, + lpf_tag_t tag, + lpf_sync_attr_t attr +); + +/** + * @} + * @} + */ + +#ifdef __cplusplus +} +#endif + +#endif // LPFLIB_TAGS_H diff --git a/include/lpf/zero.h b/include/lpf/zero.h new file mode 100644 index 00000000..8302865d --- /dev/null +++ b/include/lpf/zero.h @@ -0,0 +1,333 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LPFLIB_ZERO_H +#define LPFLIB_ZERO_H + +#ifdef __cplusplus +extern "C" { +#endif + +/** \addtogroup LPF_EXTENSIONS LPF API extensions + * @{ + * + * \defgroup LPF_ZERO_COST_SYNC + * + * This extension provides so-called zero-cost synchronisation + * mechanisms on top of LPF. This term was coined by Alpert and Philbin back in + * 1997 [1]. It is rooted in the idea that BSP programs annotate how many bytes + * are expected to be sent and received as part of a given communication phase. + * If, simultaneously, network interfaces can keep track of processed incoming, + * respectively, outgoing bytes, then processes need only query its local + * network interface to determine whether a superstep has completed-- thus + * avoiding the need for either collectives or barriers. + * + * This extension provides a variant of zero-cost synchronisation that is based + * on counting the number of messages rather than number of bytes. It is + * compatible with the concept of a \em tag; see \ref LPF_TAGS. + * + * [1] Alpert, R. and Philbin, J., 1997. cBSP: Zero-cost synchronization in a + * modified BSP model. NEC Research Institute, Princeton, NJ, USA, + * Tech. Rep, pp.97-054. + * + * @{ + */ + +/** + * The specification version of zero-cost synchronisation. + */ +#define LPF_ZERO_COST_SYNC 202500L + +/** + * Creates a new message attribute that is compatible with the LPF zero-cost + * synchronisation extension. + * + * If an implementation supports additional extensions that employ message + * attributes, then attributes initialised by this extension must result in a + * valid message attribute for use with those other extensions also. + * + * \note This does \em not imply that using message attributes from multiple + * extensions simultaneously always yields sensible behaviour; this + * depends on the specification of the extensions. + * + * This extension is compatible with the tags extension. + * + * @param[in,out] ctx The LPF context. + * @param[out] attr Where a new message attribute will be allocated. + * + * After a successful function call, applying the returned \a attr without + * modification shall induce the same behaviour as applying #LPF_MSG_DEFAULT. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. + * + * \returns #LPF_SUCCESS When a new \a attr was successfully constructed. After + * the call to this function, the attribute pointed to by + * \a attr shall be a valid message attribute. + * + * \returns #LPF_ERR_OUT_OF_MEMORY When not enough system resources were + * available to create a new message attribute. + * + * \par BSP costs + * None. + * + * \par Runtime costs + * \f$ \Theta( 1 ) \f$. + */ +extern _LPFLIB_API +lpf_err_t lpf_zero_create_mattr( + lpf_t ctx, + lpf_msg_attr_t * attr +); + +/** + * Creates a new synchronization attribute that is compatible with the LPF + * zero-cost synchronization extension. 
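+ *
+ * \par Example
+ * An illustrative sketch (not normative; the slots \c src and \c dst and the
+ * message size \c size are placeholders registered elsewhere, and \c pid and
+ * \c nprocs are the SPMD arguments) in which every process other than 0 sends
+ * one message to process 0:
+ * \code
+ * lpf_msg_attr_t mattr;
+ * lpf_sync_attr_t sattr;
+ * lpf_zero_create_mattr( ctx, &mattr );
+ * lpf_zero_create_sattr( ctx, &sattr );
+ * const size_t expected_sent = (pid == 0) ? 0 : 1;
+ * const size_t expected_rcvd = (pid == 0) ? nprocs - 1 : 0;
+ * lpf_zero_set_expected( ctx, expected_sent, expected_rcvd, sattr );
+ * if( pid != 0 ) {
+ *     lpf_put( ctx, src, 0, 0, dst, pid * size, size, mattr );
+ * }
+ * lpf_sync( ctx, sattr );
+ * lpf_zero_destroy_mattr( ctx, mattr );
+ * lpf_zero_destroy_sattr( ctx, sattr );
+ * \endcode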
+ * + * If an implementation supports additional extensions that employ + * synchronization attributes, then attributes initialised by this extension + * must result in a valid synchronization attribute for use with those other + * extensions also. + * + * \note This does \em not imply that using synchronization attributes from + * multiple extensions simultaneously always yields sensible behaviour; + * this depends on the specification of the extensions. + * + * This extension is compatible with the tags extension. + * + * @param[in,out] ctx The LPF context. + * @param[out] attr Where a new message attribute will be allocated. + * + * After a successful function call, applying the returned \a attr without + * modification shall induce the same behaviour as applying #LPF_MSG_DEFAULT. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. + * + * \returns #LPF_SUCCESS When a new \a attr was successfully constructed. After + * the call, the attribute pointed to by \a attr shall be + * a valid synchronisation attribute. + * + * \returns #LPF_ERR_OUT_OF_MEMORY When not enough system resources were + * available to create a new message attribute. + * + * \par BSP costs + * None. + * + * \par Runtime costs + * \f$ \Theta( 1 ) \f$. + */ +extern _LPFLIB_API +lpf_err_t lpf_zero_create_sattr( + lpf_t ctx, + lpf_sync_attr_t * attr +); + +/** + * Destroys a valid message attribute. + * + * The given \a attr must \em not equal #LPF_MSG_DEFAULT (the default message + * attribute may not be destroyed). The given \a attr must be created by this + * extension \em or by an extension that is compatible with the tags extension. + * + * This function may be called on message attributes created by the tags + * extension. + * + * @param[in,out] ctx The LPF context. + * @param[out] attr The message attribute to be destroyed. + * + * After a successful function call, the given \a attr shall become invalid and + * must not be used in subsequent calls to any LPF primitive. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. + * + * \returns #LPF_SUCCESS A call to this function always succeeds. + * + * \par BSP costs + * None. + * + * \par Runtime costs + * \f$ \Theta( 1 ) \f$. + */ +extern _LPFLIB_API +lpf_err_t lpf_zero_destroy_mattr( + lpf_t ctx, + lpf_msg_attr_t attr +); + +/** + * Destroys a valid synchronization attribute. + * + * The given \a attr must \em not equal #LPF_SYNC_DEFAULT (the default + * synchronization attribute may not be destroyed). The given \a attr must be + * created by this extension \em or by an extension that is compatible with the + * tags extension. + * + * This function may be called on synchronization attributes created by the tags + * extension. + * + * @param[in,out] ctx The LPF context. + * @param[out] attr The message attribute to be destroyed. + * + * After a successful function call, the given \a attr shall become invalid and + * must not be used in subsequent calls to any LPF primitive. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. + * + * \returns #LPF_SUCCESS A call to this function always succeeds. + * + * \par BSP costs + * None. + * + * \par Runtime costs + * \f$ \Theta( 1 ) \f$. + */ +extern _LPFLIB_API +lpf_err_t lpf_zero_destroy_sattr( + lpf_t ctx, + lpf_sync_attr_t attr +); + +/** + * Attaches zero-cost synchronisation attributes to the given LPF + * synchronisation attribute. + * + * @param[in,out] ctx The LPF context. 
+ * @param[in] expected_sent The expected number of messages sent out from this + * process. + * @param[in] expected_rcvd The expected number of messages received at this + * process. + * @param[in,out] attr Where to attach the zero-cost sync attributes. + * + * The given \a attr must have been created via #lpf_zero_create_sattr or must + * be created by another extension that is compatible with this zero-cost + * synchronization extension. + * + * If the resulting \a attr is used within a subsequent call to #lpf_sync, + * the spec demands that the #lpf_sync call is collective. The zero-cost + * synchronisation extension furthermore requires that each of those collective + * calls to #lpf_sync have matching zero-cost attributes attached to them. Here, + * ``matching'' means that the combination of all attributes given at all + * processes correctly corresponds to the global communication pattern that that + * #lpf_sync requires wait completion for. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. + * + * \returns #LPF_SUCCESS If the attachment of the zero-cost synchronisation + * attributes is successful. + * + * \par BSP costs + * None. + * + * \par Runtime costs + * \f$ \Theta( 1 ) \f$. + */ +extern _LPFLIB_API +lpf_err_t lpf_zero_set_expected( + lpf_t ctx, + size_t expected_sent, size_t expected_rcvd, + lpf_sync_attr_t attr +); + +/** + * Retrieves the attached zero-cost information from the given synchronisation + * attribute. + * + * @param[in,out] ctx The LPF context. + * @param[in] attr The synchronisation attribute to retrieve the + * zero-cost attributes from. + * @param[out] expected_sent Where to store the expected number of sent + * messages. + * @param[out] expected_rcvd Where to store the expected number of received + * messages. + * + * The given \a attr must have been created via #lpf_zero_create_sattr or must + * be created by another extension that is compatible with this zero-cost + * synchronization extension. + * + * If \a attr did not have a preceding call to #lpf_zero_set_expected, then the + * default values (0) are returned. An expected zero for both received and sent + * number of messages indicates a regular (non zero-cost) synchronization. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. + * + * \returns #LPF_SUCCESS A call to this function always succeeds. + * + * \par BSP costs + * None. + * + * \par Runtime costs + * \f$ \Theta( 1 ) \f$. + */ +extern _LPFLIB_API +lpf_err_t lpf_zero_get_expected( + lpf_t ctx, + lpf_sync_attr_t attr, + size_t * expected_sent, size_t * expected_rcvd +); + +/** + * Retrieves the current locally-received number of messages. + * + * @param[in,out] ctx The LPF context. + * @param[in] attr The synchronisation attribute to retrieve the + * status of. + * @param[out] rcvd Where to store the number of received messages. + * @param[out] sent Where to store the number of sent messages. + * + * The given \a attr must have been created via #lpf_zero_create_sattr or must + * be created by another extension that is compatible with this zero-cost + * synchronization extension. + * + * \note Rationale: this function is useful for implementing task-aware + * interfaces around zero-cost synchronisation mechanisms. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. + * + * \returns #LPF_SUCCESS A call to this function always succeeds. + * + * \par BSP costs + * None. 
+ * + * \par Runtime costs + * \f$ \Theta( 1 ) \f$. + * + * \note A call to this function may imply querying the network interface, + * and hence the constant-time factor of a call to this function may be + * non-trivial; use of this function is recommended to be sparingly. + */ +extern _LPFLIB_API +lpf_err_t lpf_zero_get_status( + lpf_t ctx, lpf_sync_attr_t attr, + size_t * rcvd, size_t * sent +); + +/** + * @} + * @} + */ + +#ifdef __cplusplus +} +#endif + +#endif // LPFLIB_ZERO_H diff --git a/lpfrun.in b/lpfrun.in index 640fdc00..558a96d5 100644 --- a/lpfrun.in +++ b/lpfrun.in @@ -57,7 +57,7 @@ function printhelp() echo echo " -engine " echo " Allow you to choose the engine. Currently supported" - echo " are: pthread, mpirma, mpimsg, ibverbs, hybrid" + echo " are: pthread, mpirma, mpimsg, ibverbs, zero, hybrid" echo echo " -probe " echo " Set the number of seconds to probe the system for BSP" @@ -846,7 +846,7 @@ case $engine in exit_status=$? ;; - mpirma|mpimsg|ibverbs) + mpirma|mpimsg|ibverbs|zero) mpi_impl=$(mpi_detect) proc_args= @@ -1128,8 +1128,8 @@ case $engine in ;; *) - echo "Engine '$engine' is not supported. Please choose 'pthread'," - echo "'mpirma', or 'hybrid'" + echo "Engine '$engine' is not supported. Please choose " + echo "'pthread', 'mpirma', 'mpimsg', 'ibverbs, 'zero', 'hybrid'" exit_status=1 ;; esac diff --git a/post-install/post-install-test.cmake.in b/post-install/post-install-test.cmake.in index edd06922..05786d26 100644 --- a/post-install/post-install-test.cmake.in +++ b/post-install/post-install-test.cmake.in @@ -353,6 +353,9 @@ endif() ###### CMake integration using generated CMake module file ############ foreach(engine @ENGINES@) + if ("${engine}" STREQUAL "zero") + continue() + endif() message("Testing generated CMake module files for engine ${engine}") set(test_dir @builddir@/cmake-module-test-${engine}) diff --git a/post-install/test-lpf-nprocs.c b/post-install/test-lpf-nprocs.c index cf274b3f..554b5775 100644 --- a/post-install/test-lpf-nprocs.c +++ b/post-install/test-lpf-nprocs.c @@ -53,6 +53,8 @@ void spmd( lpf_t lpf, lpf_pid_t pid, lpf_pid_t nprocs, lpf_args_t args ) lpf_memslot_t mem_slot = LPF_INVALID_MEMSLOT; lpf_register_global( lpf, mem, nprocs, &mem_slot ); + lpf_sync(lpf, LPF_SYNC_DEFAULT); + if (pid != 0) lpf_get( lpf, 0, params_slot, 0, params_slot, 0, sizeof(params), LPF_MSG_DEFAULT ); diff --git a/src/MPI/CMakeLists.txt b/src/MPI/CMakeLists.txt index 757b9004..864bdca2 100644 --- a/src/MPI/CMakeLists.txt +++ b/src/MPI/CMakeLists.txt @@ -23,7 +23,7 @@ if (MPI_FOUND) endif() if (ENABLE_IBVERBS) - list(APPEND MPI_ENGINES ibverbs) + list(APPEND MPI_ENGINES ibverbs zero) endif() if (MPI_IBARRIER) @@ -49,10 +49,12 @@ if (MPI_FOUND) set(ibverbs_sources) if (LPF_IMPL_ID STREQUAL ibverbs) - set(ibverbs_sources ibverbs.cpp) - endif() - - add_library(raw_${libname} OBJECT + set(ibverbs_sources ibverbs.cpp) + endif() + if (LPF_IMPL_ID STREQUAL zero) + set(ibverbs_sources zero.cpp) + endif() + add_library(raw_${libname} OBJECT memorytable.cpp mesgqueue.cpp mpilib.cpp @@ -65,61 +67,61 @@ if (MPI_FOUND) spall2all.c messagesort.cpp spall2all.cpp - init.cpp + init.cpp ${ibverbs_sources} ) - target_compile_flags(raw_${libname} + target_compile_flags(raw_${libname} INTERFACE "-fPIC") - target_compile_definitions(raw_${libname} + target_compile_definitions(raw_${libname} PRIVATE "LPF_CORE_MPI_USES_${LPF_IMPL_ID}=1" "LPF_CORE_WARM_UP_PROBE=1" "LPF_CORE_IMPL_ID=${LPF_IMPL_ID}" "LPF_CORE_IMPL_CONFIG=${LPF_IMPL_CONFIG}" - ) - 
target_include_directories(raw_${libname} - PRIVATE ${MPI_C_INCLUDE_PATH} - ) - if (iface STREQUAL "spec_") - target_compile_definitions(raw_${libname} + ) + target_include_directories(raw_${libname} + PRIVATE ${MPI_C_INCLUDE_PATH} + ) + if (iface STREQUAL "spec_") + target_compile_definitions(raw_${libname} PRIVATE "LPF_CORE_STATIC_DISPATCH=1" "LPF_CORE_STATIC_DISPATCH_ID=${LPF_IMPL_ID}" "LPF_CORE_STATIC_DISPATCH_CONFIG=${LPF_IMPL_CONFIG}" ) - endif() + endif() - #Always build the shared library, because we need that for the lpfrun - add_library(${libname} SHARED - $ + #Always build the shared library, because we need that for the lpfrun + add_library(${libname} SHARED + $ $ - ) - set_target_properties(${libname} PROPERTIES SOVERSION ${SOVERSION} + ) + set_target_properties(${libname} PROPERTIES SOVERSION ${SOVERSION} MACOSX_RPATH TRUE) - target_compile_flags(${libname} + target_compile_flags(${libname} INTERFACE "-fPIC") - if (iface STREQUAL "spec_") - target_compile_definitions(${libname} - INTERFACE "LPF_CORE_STATIC_DISPATCH=1" + if (iface STREQUAL "spec_") + target_compile_definitions(${libname} + INTERFACE "LPF_CORE_STATIC_DISPATCH=1" "LPF_CORE_STATIC_DISPATCH_ID=${LPF_IMPL_ID}" "LPF_CORE_STATIC_DISPATCH_CONFIG=${LPF_IMPL_CONFIG}" + ) + endif() + target_include_directories(${libname} + PUBLIC ${MPI_C_INCLUDE_PATH} + INTERFACE $ + $ ) - endif() - target_include_directories(${libname} - PUBLIC ${MPI_C_INCLUDE_PATH} - INTERFACE $ - $ - ) - endforeach(LPF_IMPL_ID) + endforeach(LPF_IMPL_ID) endforeach(iface) # link function that e.g. hybrid implementation can also use. function(lpf_link_mpi_core target engine) - target_link_libraries(${target} + target_link_libraries(${target} ${MPI_C_LIBRARIES} ${LIB_MATH} ${LIB_DL} @@ -127,9 +129,9 @@ if (MPI_FOUND) ${LIB_POSIX_THREADS} ) - if (engine STREQUAL ibverbs) - target_link_libraries(${target} ${LIB_IBVERBS}) - endif() + if (engine STREQUAL ibverbs OR engine STREQUAL zero) + target_link_libraries(${target} ${LIB_IBVERBS}) + endif() endfunction() @@ -144,15 +146,15 @@ if (MPI_FOUND) ARCHIVE DESTINATION ${INSTALL_LIB} ) endforeach() - + include_directories(${MPI_C_INCLUDE_PATH}) - # add a test for dynamichook + # add a test for dynamichook if (NOT IS_OPENMPI AND LPF_ENABLE_TESTS) add_gtest(dynamichook.t "mpimsg" ON - ${CMAKE_CURRENT_SOURCE_DIR}/dynamichook.t.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dynamichook.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dynamichook.t.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dynamichook.cpp ${CMAKE_CURRENT_SOURCE_DIR}/mpilib.cpp) - + configure_file( dynamichook.t.sh.in dynamichook.t.sh @ONLY) set( dynamic_hook_t_sh "${CMAKE_CURRENT_BINARY_DIR}/dynamichook.t.sh") add_test(NAME dynamichook_1proc @@ -173,25 +175,29 @@ if (MPI_FOUND) # Other unit tests if (ENABLE_IBVERBS AND LPF_ENABLE_TESTS) - add_gtest( ibverbs_test "ibverbs" ON ${CMAKE_CURRENT_SOURCE_DIR}/ibverbs.t.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/ibverbs.cpp + add_gtest( ibverbs_test "ibverbs" ON ${CMAKE_CURRENT_SOURCE_DIR}/ibverbs.t.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/ibverbs.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/mpilib.cpp) + + add_gtest( zero_test "zero" ON ${CMAKE_CURRENT_SOURCE_DIR}/zero.t.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/zero.cpp ${CMAKE_CURRENT_SOURCE_DIR}/mpilib.cpp) endif() foreach (engine ${MPI_ENGINES}) add_gtest( spall2all_test_${engine} ${engine} ON - ${CMAKE_CURRENT_SOURCE_DIR}/spall2all.t.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/spall2all.c - ${CMAKE_CURRENT_SOURCE_DIR}/spall2all.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/spall2all.t.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/spall2all.c + 
${CMAKE_CURRENT_SOURCE_DIR}/spall2all.cpp ${CMAKE_CURRENT_SOURCE_DIR}/mpilib.cpp) add_gtest( dall2all_test_${engine} ${engine} ON - ${CMAKE_CURRENT_SOURCE_DIR}/dall2all.t.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dall2all.t.cpp ${CMAKE_CURRENT_SOURCE_DIR}/mpilib.cpp) if (MPI_IBARRIER) add_gtest( hall2all_test_${engine} ${engine} ON - ${CMAKE_CURRENT_SOURCE_DIR}/hall2all.t.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/hall2all.t.cpp ${CMAKE_CURRENT_SOURCE_DIR}/mpilib.cpp) endif() diff --git a/src/MPI/core.cpp b/src/MPI/core.cpp index 94a9658f..c4f7f900 100644 --- a/src/MPI/core.cpp +++ b/src/MPI/core.cpp @@ -16,6 +16,7 @@ */ #include +#include #include #include @@ -41,8 +42,8 @@ // that may deviate from the stdlib abort() const int LPF_HAS_ABORT = 2; -// Error codes. -// Note: Some code (e.g. in process::broadcastSymbol) depends on the +// Error codes. +// Note: Some code (e.g. in process::broadcastSymbol) depends on the // fact that numbers are assigned in order of severity, where 0 means // no error and 3 means unrecoverable error. That way the severest error // status can be replicated through Communication::allreduceMax @@ -50,11 +51,13 @@ const lpf_err_t LPF_SUCCESS = 0; const lpf_err_t LPF_ERR_OUT_OF_MEMORY = 1; const lpf_err_t LPF_ERR_FATAL = 2; +const lpf_tag_t LPF_INVALID_TAG = std::numeric_limits< uint32_t >::max(); + const lpf_args_t LPF_NO_ARGS = { NULL, 0, NULL, 0, NULL, 0 }; -const lpf_sync_attr_t LPF_SYNC_DEFAULT = 0; +const lpf_sync_attr_t LPF_SYNC_DEFAULT = NULL; -const lpf_msg_attr_t LPF_MSG_DEFAULT = 0; +const lpf_msg_attr_t LPF_MSG_DEFAULT = NULL; const lpf_pid_t LPF_MAX_P = UINT_MAX; @@ -66,13 +69,13 @@ const lpf_init_t LPF_INIT_NONE = NULL; extern "C" const int LPF_MPI_AUTO_INITIALIZE __attribute__((weak)) = 1; -const lpf_t LPF_ROOT = static_cast(const_cast("LPF_ROOT")) ; +const lpf_t LPF_ROOT = static_cast(const_cast("LPF_ROOT")) ; const lpf_machine_t LPF_INVALID_MACHINE = { 0, 0, NULL, NULL }; namespace { lpf::Interface * realContext( lpf_t ctx ) - { + { if ( LPF_ROOT == ctx ) return lpf::Interface::root(); else @@ -80,6 +83,7 @@ namespace { } } +// MPI extension lpf_err_t lpf_mpi_initialize_with_mpicomm( MPI_Comm comm, lpf_init_t * init) { @@ -92,9 +96,9 @@ lpf_err_t lpf_mpi_initialize_with_mpicomm( MPI_Comm comm, lpf_init_t * init) return status; } -lpf_err_t lpf_mpi_initialize_over_tcp( +lpf_err_t lpf_mpi_initialize_over_tcp( const char * server, const char * port, int timeout, - lpf_pid_t pid, lpf_pid_t nprocs, + lpf_pid_t pid, lpf_pid_t nprocs, lpf_init_t * init ) { try { @@ -103,7 +107,7 @@ lpf_err_t lpf_mpi_initialize_over_tcp( // Create an MPI communicator MPI_Comm comm = lpf::mpi::dynamicHook( - server, port, pid, nprocs, + server, port, pid, nprocs, lpf::Time::fromSeconds( timeout / 1000.0) ); // wrap it @@ -143,7 +147,7 @@ lpf_err_t lpf_mpi_initialize_over_tcp( } lpf_err_t lpf_mpi_finalize( lpf_init_t context ) { - + lpf_err_t status = LPF_SUCCESS; delete static_cast< lpf::mpi::Comm *>(context); @@ -151,6 +155,265 @@ lpf_err_t lpf_mpi_finalize( lpf_init_t context ) { return status; } +// tags extension + +lpf_err_t lpf_tag_create_mattr( + lpf_t ctx, + lpf_msg_attr_t * attr +) +{ + (void) ctx; + *attr = LPF_MSG_DEFAULT; + return LPF_SUCCESS; +} + +lpf_err_t lpf_tag_destroy_mattr( + lpf_t ctx, + lpf_msg_attr_t attr +) +{ + (void) ctx; + (void) attr; + return LPF_SUCCESS; +} + +lpf_err_t lpf_tag_create_sattr( + lpf_t ctx, + lpf_sync_attr_t * attr +) +{ + lpf_err_t ret = LPF_SUCCESS; + lpf::Interface * i = realContext(ctx); + if (!i->isAborted()) { + try { + ret = 
i->createNewSyncAttr(attr); + } catch (const std::bad_alloc &) { + LOG(2, "lpf_tag_create_sattr: out of memory (bad_alloc)"); + return LPF_ERR_OUT_OF_MEMORY; + } catch (const std::exception &e) { + LOG(1, "lpf_tag_create_sattr fatal error: " << e.what()); + return LPF_ERR_FATAL; + } + } + return ret; +} + +lpf_err_t lpf_tag_destroy_sattr( + lpf_t ctx, + lpf_sync_attr_t attr +) +{ + lpf::Interface * i = realContext(ctx); + if (!i->isAborted()) { + i->destroySyncAttr(attr); + } + return LPF_SUCCESS; +} + +lpf_err_t lpf_tag_get_mattr( + lpf_t ctx, + lpf_msg_attr_t attr, + lpf_tag_t * tag +) +{ + (void) ctx; + ASSERT( tag != NULL ); + *tag = *static_cast< uint32_t * >(attr); + return LPF_SUCCESS; +} + +lpf_err_t lpf_tag_get_sattr( + lpf_t ctx, + lpf_sync_attr_t attr, + lpf_tag_t * tag +) +{ + ASSERT( tag != NULL ); + lpf::Interface * i = realContext(ctx); + if (!i->isAborted()) { + *tag = i->getTagFromSyncAttr(attr); + } + return LPF_SUCCESS; +} + +lpf_err_t lpf_tag_set_sattr( + lpf_t ctx, + lpf_tag_t tag, + lpf_sync_attr_t attr +) +{ + ASSERT( attr != NULL ); + lpf::Interface * i = realContext(ctx); + if (!i->isAborted()) { + i->setTagInSyncAttr(tag,attr); + } + return LPF_SUCCESS; +} + +lpf_err_t lpf_tag_set_mattr( + lpf_t ctx, + lpf_tag_t tag, + lpf_msg_attr_t attr +) +{ + (void) ctx; + ASSERT( attr != NULL ); + *static_cast< uint32_t * >(attr) = tag; + return LPF_SUCCESS; +} + +lpf_err_t lpf_resize_tag_register( + lpf_t ctx, + size_t max_tags +) +{ + lpf::Interface * i = realContext(ctx); + if (i->isAborted()) + return LPF_SUCCESS; + + try { + return i->resizeTagRegister(max_tags); + } catch (const std::exception & e) { + LOG(1, "lpf_resize_tag_register fatal error: " << e.what()); + return LPF_ERR_FATAL; + } +} + +lpf_err_t lpf_tag_create( + lpf_t ctx, + bool active, + lpf_tag_t * tag +) +{ + (void)active; + lpf::Interface * i = realContext(ctx); + if (!i->isAborted()) { + try { + *tag = i->registerTag(); + } catch (const std::exception & e) { + LOG(1, "lpf_tag_create fatal error: " << e.what()); + return LPF_ERR_FATAL; + } + } + return LPF_SUCCESS; +} + +lpf_err_t lpf_tag_destroy( + lpf_t ctx, + lpf_tag_t tag +) +{ + lpf::Interface * i = realContext(ctx); + if (!i->isAborted()) { + try { + i->destroyTag(tag); + } catch (const std::exception & e) { + LOG(1, "lpf_tag_destroy fatal error: " << e.what()); + return LPF_ERR_FATAL; + } + } + return LPF_SUCCESS; +} + +// zero-cost extension + +lpf_err_t lpf_zero_create_sattr( + lpf_t ctx, + lpf_sync_attr_t * attr +) +{ + return lpf_tag_create_sattr(ctx,attr); +} + +lpf_err_t lpf_zero_destroy_sattr( + lpf_t ctx, + lpf_sync_attr_t attr +) +{ + return lpf_tag_destroy_sattr(ctx,attr); +} + +lpf_err_t lpf_zero_create_mattr( + lpf_t ctx, + lpf_msg_attr_t * attr +) +{ + return lpf_tag_create_mattr(ctx,attr); +} + +lpf_err_t lpf_zero_destroy_mattr( + lpf_t ctx, + lpf_msg_attr_t attr +) +{ + return lpf_tag_destroy_mattr(ctx,attr); +} + +lpf_err_t lpf_zero_set_expected( + lpf_t ctx, + size_t expected_sent, size_t expected_rcvd, + lpf_sync_attr_t attr +) +{ + ASSERT( attr != NULL ); + lpf::Interface * i = realContext(ctx); + if (!i->isAborted()) { + i->setZCAttr(expected_sent,expected_rcvd,attr); + } + return LPF_SUCCESS; +} + +lpf_err_t lpf_zero_get_expected( + lpf_t ctx, + lpf_sync_attr_t attr, + size_t * expected_sent, size_t * expected_rcvd +) +{ + ASSERT( attr != NULL ); + ASSERT( expected_sent != NULL ); + ASSERT( expected_rcvd != NULL ); + lpf::Interface * i = realContext(ctx); + if (!i->isAborted()) { + 
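+ // read back the expected send/receive counts previously stored in this
+ // attribute by lpf_zero_set_expected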
i->getZCAttr(attr,*expected_sent,*expected_rcvd); + } + return LPF_SUCCESS; +} + +lpf_err_t lpf_zero_get_status( + lpf_t ctx, lpf_sync_attr_t attr, + size_t * rcvd, size_t * sent +) +{ + lpf::Interface * i = realContext(ctx); + if (!i->isAborted()) { + i->getRcvdMsgCount(rcvd,attr); + i->getSentMsgCount(sent,attr); + } + return LPF_SUCCESS; +} + +// non-coherent extension + +lpf_err_t lpf_noc_flush_sent( lpf_t ctx) +{ + lpf::Interface * i = realContext(ctx); + if (!i->isAborted()) { + i->flushSent(); + } + return LPF_SUCCESS; +} + +lpf_err_t lpf_noc_flush_received( lpf_t ctx) +{ + lpf::Interface * i = realContext(ctx); + if (!i->isAborted()) { + i->flushReceived(); + } + return LPF_SUCCESS; +} + +// core functionality + lpf_err_t lpf_hook( lpf_init_t _init, lpf_spmd_t spmd, @@ -173,7 +436,7 @@ lpf_err_t lpf_rehook( lpf_err_t lpf_exec( lpf_t ctx, - lpf_pid_t P, + lpf_pid_t P, lpf_spmd_t spmd, lpf_args_t args ) @@ -223,48 +486,43 @@ lpf_err_t lpf_deregister( } lpf_err_t lpf_put( lpf_t ctx, - lpf_memslot_t src_slot, - size_t src_offset, - lpf_pid_t dst_pid, - lpf_memslot_t dst_slot, - size_t dst_offset, - size_t size, - lpf_msg_attr_t attr + lpf_memslot_t src_slot, + size_t src_offset, + lpf_pid_t dst_pid, + lpf_memslot_t dst_slot, + size_t dst_offset, + size_t size, + lpf_msg_attr_t attr ) { - (void) attr; // ignore parameter 'msg' since this implementation only + (void) attr; // ignore parameter 'msg' since this implementation only // implements core functionality lpf::Interface * i = realContext(ctx); if (!i->isAborted()) - i->put( src_slot, src_offset, dst_pid, dst_slot, dst_offset, size ); + i->put( src_slot, src_offset, dst_pid, dst_slot, dst_offset, size, attr); return LPF_SUCCESS; } - lpf_err_t lpf_get( - lpf_t ctx, - lpf_pid_t pid, - lpf_memslot_t src, - size_t src_offset, - lpf_memslot_t dst, + lpf_t ctx, + lpf_pid_t pid, + lpf_memslot_t src, + size_t src_offset, + lpf_memslot_t dst, lpf_memslot_t dst_offset, size_t size, lpf_msg_attr_t attr ) { - (void) attr; // ignore parameter 'msg' since this implementation only - // implements core functionality lpf::Interface * i = realContext(ctx); if (!i->isAborted()) - i->get( pid, src, src_offset, dst, dst_offset, size ); + i->get( pid, src, src_offset, dst, dst_offset, size, attr); return LPF_SUCCESS; } lpf_err_t lpf_sync( lpf_t ctx, lpf_sync_attr_t attr ) { - (void) attr; // ignore attr parameter since this implementation only - // implements core functionality - return realContext(ctx)->sync(); + return realContext(ctx)->sync(attr); } lpf_err_t lpf_probe( lpf_t ctx, lpf_machine_t * params ) @@ -282,7 +540,7 @@ lpf_err_t lpf_resize_memory_register( lpf_t ctx, size_t max_regs ) lpf::Interface * i = realContext(ctx); if (i->isAborted()) return LPF_SUCCESS; - + return i->resizeMemreg(max_regs); } @@ -291,7 +549,7 @@ lpf_err_t lpf_resize_message_queue( lpf_t ctx, size_t max_msgs ) lpf::Interface * i = realContext(ctx); if (i->isAborted()) return LPF_SUCCESS; - + return i->resizeMesgQueue(max_msgs); } @@ -301,4 +559,3 @@ lpf_err_t lpf_abort( lpf_t ctx ) { return LPF_SUCCESS; } - diff --git a/src/MPI/ibverbs.cpp b/src/MPI/ibverbs.cpp index 44852caa..73103aad 100644 --- a/src/MPI/ibverbs.cpp +++ b/src/MPI/ibverbs.cpp @@ -45,14 +45,11 @@ namespace { } } - IBVerbs :: IBVerbs( Communication & comm ) : m_pid( comm.pid() ) , m_nprocs( comm.nprocs() ) - , m_devName() , m_ibPort( Config::instance().getIBPort() ) , m_gidIdx( Config::instance().getIBGidIndex() ) - , m_mtu( getMTU( Config::instance().getIBMTU() )) , m_maxRegSize(0) , m_maxMsgSize(0) , 
m_minNrMsgs(0) @@ -60,19 +57,21 @@ IBVerbs :: IBVerbs( Communication & comm ) , m_device() , m_pd() , m_cq() + , m_dummyMemReg() + , m_comm( comm ) + , m_mtu( getMTU( Config::instance().getIBMTU() )) + , m_devName() , m_stagedQps( m_nprocs ) , m_connectedQps( m_nprocs ) , m_srs() , m_srsHeads( m_nprocs, 0u ) , m_nMsgsPerPeer( m_nprocs, 0u ) - , m_activePeers(0, m_nprocs) , m_peerList() , m_sges() , m_wcs(m_nprocs) - , m_memreg() - , m_dummyMemReg() , m_dummyBuffer() - , m_comm( comm ) + , m_activePeers(0, m_nprocs) + , m_memreg() { m_peerList.reserve( m_nprocs ); @@ -97,7 +96,6 @@ IBVerbs :: IBVerbs( Communication & comm ) throw Exception( "No Infiniband devices available" ); } - std::string wantDevName = Config::instance().getIBDeviceName(); LOG( 3, "Searching for device '"<< wantDevName << "'" ); struct ibv_device * dev = NULL; @@ -144,7 +142,8 @@ IBVerbs :: IBVerbs( Communication & comm ) // maximum number of work requests per Queue Pair m_maxSrs = std::min( m_deviceAttr.max_qp_wr, // maximum work requests per QP m_deviceAttr.max_cqe ); // maximum entries per CQ - LOG(3, "Maximum number of send requests is the minimum of " + + LOG(3, "Initial maximum number of send requests is the minimum of " << m_deviceAttr.max_qp_wr << " (the maximum of work requests per QP)" << " and " << m_deviceAttr.max_cqe << " (the maximum of completion " << " queue entries per QP), nameley " << m_maxSrs ); @@ -196,6 +195,58 @@ IBVerbs :: IBVerbs( Communication & comm ) LOG(3, "Allocated completion queue with " << m_nprocs << " entries."); + /* + * Unfortunately, some RDMA devices advertise max_qp_wr but + * support a much smaller number. We can probe that. + * Note that the inofficial documentation on rdmamojo.com states: + * + * There may be RDMA devices that for specific transport types may support + * less outstanding Work Requests than the maximum reported value. 
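+ *
+ * In practice this means that a queue pair creation request whose
+ * max_send_wr equals the advertised maximum can fail with errno set to
+ * EINVAL, even though a smaller value is accepted.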
+ * + * Therefore, we here do binary search to find the actual value + */ + struct ibv_qp_init_attr testAttr; + (void) std::memset(&testAttr, 0, sizeof(testAttr)); + + // We only care about the attr.cap.max_send_wr + testAttr.qp_type = IBV_QPT_RC; + + struct ibv_qp * ibv_new_qp_p; + testAttr.cap.max_send_wr = m_maxSrs; + testAttr.send_cq = m_cq.get(); + testAttr.recv_cq = m_cq.get(); + ibv_new_qp_p = ibv_create_qp(m_pd.get(), &testAttr); + if (ibv_new_qp_p == NULL) { + size_t left = 1; + size_t right = m_maxSrs; + size_t largestOkaySize = 0; + while (left <= right) + { + size_t mid = (left + right) / 2; + testAttr.cap.max_send_wr = mid; + // test if call succeeds + ibv_new_qp_p = ibv_create_qp(m_pd.get(), &testAttr); + if (ibv_new_qp_p == NULL) { + if (errno != EINVAL) { // error points to unsupported max_send_wr by device + throw Exception("Unexpected error code during binary search for maximum send WR."); + } + else { + right = mid - 1; + } + } + else { + // clean up dummy QP + ibv_destroy_qp(ibv_new_qp_p); + left = mid + 1; + // record that we still succeed + largestOkaySize = mid; + } + } + ASSERT(largestOkaySize > 0); + m_maxSrs = largestOkaySize; + LOG(3, "Revised maximum number of send requests is " << m_maxSrs ); + } + // allocate dummy buffer m_dummyBuffer.resize( 8 ); struct ibv_mr * const ibv_reg_mr_new_p = ibv_reg_mr( @@ -237,6 +288,7 @@ void IBVerbs :: stageQPs( size_t maxMsgs ) attr.cap.max_recv_sge = 1; struct ibv_qp * const ibv_new_qp_p = ibv_create_qp( m_pd.get(), &attr ); + if( ibv_new_qp_p == NULL ) { m_stagedQps[i].reset(); } else { @@ -460,8 +512,8 @@ IBVerbs :: SlotID IBVerbs :: regLocal( void * addr, size_t size ) MemoryRegistration local; local.addr = addr; local.size = size; - local.lkey = size?slot.mr->lkey:0; - local.rkey = size?slot.mr->rkey:0; + local.lkey = size ? slot.mr->lkey : 0; + local.rkey = size ? slot.mr->rkey : 0; SlotID id = m_memreg.addLocalReg( slot ); @@ -504,8 +556,8 @@ IBVerbs :: SlotID IBVerbs :: regGlobal( void * addr, size_t size ) MemoryRegistration local; local.addr = addr; local.size = size; - local.lkey = size?slot.mr->lkey:0; - local.rkey = size?slot.mr->rkey:0; + local.lkey = size ? slot.mr->lkey : 0; + local.rkey = size ? 
slot.mr->rkey : 0; LOG(4, "All-gathering memory register data" ); diff --git a/src/MPI/ibverbs.hpp b/src/MPI/ibverbs.hpp index a96030a2..ab3685db 100644 --- a/src/MPI/ibverbs.hpp +++ b/src/MPI/ibverbs.hpp @@ -20,7 +20,7 @@ #include #include -#if __cplusplus >= 201103L +#if __cplusplus >= 201103L #include #else #include @@ -34,18 +34,18 @@ #include "memreg.hpp" namespace lpf { - + class Communication; - + namespace mpi { -#if __cplusplus >= 201103L +#if __cplusplus >= 201103L using std::shared_ptr; #else using std::tr1::shared_ptr; #endif -class _LPFLIB_LOCAL IBVerbs +class _LPFLIB_LOCAL IBVerbs { public: struct Exception; @@ -57,7 +57,7 @@ class _LPFLIB_LOCAL IBVerbs void resizeMemreg( size_t size ); void resizeMesgq( size_t size ); - + SlotID regLocal( void * addr, size_t size ); SlotID regGlobal( void * addr, size_t size ); void dereg( SlotID id ); @@ -66,30 +66,29 @@ class _LPFLIB_LOCAL IBVerbs return m_maxMsgSize; } - void put( SlotID srcSlot, size_t srcOffset, + void put( SlotID srcSlot, size_t srcOffset, int dstPid, SlotID dstSlot, size_t dstOffset, size_t size ); - void get( int srcPid, SlotID srcSlot, size_t srcOffset, + void get( int srcPid, SlotID srcSlot, size_t srcOffset, SlotID dstSlot, size_t dstOffset, size_t size ); - // Do the communication and synchronize // 'Reconnect' must be a globally replicated value void sync( bool reconnect); + private: IBVerbs & operator=(const IBVerbs & ); // assignment prohibited IBVerbs( const IBVerbs & ); // copying prohibited - void stageQPs(size_t maxMsgs ); - void reconnectQPs(); - + void stageQPs(size_t maxMsgs ); + void reconnectQPs(); struct MemoryRegistration { - void * addr; - size_t size; - uint32_t lkey; - uint32_t rkey; + void * addr; + size_t size; + uint32_t lkey; + uint32_t rkey; }; struct MemorySlot { @@ -97,51 +96,55 @@ class _LPFLIB_LOCAL IBVerbs std::vector< MemoryRegistration > glob; // array for global registrations }; - int m_pid; // local process ID - int m_nprocs; // number of processes + int m_pid; // local process ID + int m_nprocs; // number of processes + int m_ibPort; // local IB port to work with + int m_gidIdx; + size_t m_maxRegSize; + size_t m_maxMsgSize; + size_t m_minNrMsgs; + size_t m_maxSrs; // maximum number of sends requests per QP + + shared_ptr< struct ibv_context > m_device; // device handle + shared_ptr< struct ibv_pd > m_pd; // protection domain + shared_ptr< struct ibv_cq > m_cq; // complation queue + shared_ptr< struct ibv_mr > m_dummyMemReg; // registration of dummy + // buffer + Communication & m_comm; + + ibv_mtu m_mtu; + + std::string m_devName; // IB device name - std::string m_devName; // IB device name - int m_ibPort; // local IB port to work with - int m_gidIdx; - uint16_t m_lid; // LID of the IB port - ibv_mtu m_mtu; struct ibv_device_attr m_deviceAttr; - size_t m_maxRegSize; - size_t m_maxMsgSize; - size_t m_minNrMsgs; - size_t m_maxSrs; // maximum number of sends requests per QP - shared_ptr< struct ibv_context > m_device; // device handle - shared_ptr< struct ibv_pd > m_pd; // protection domain - shared_ptr< struct ibv_cq > m_cq; // complation queue + uint16_t m_lid; // LID of the IB port // Disconnected queue pairs - std::vector< shared_ptr< struct ibv_qp > > m_stagedQps; + std::vector< shared_ptr< struct ibv_qp > > m_stagedQps; // Connected queue pairs - std::vector< shared_ptr< struct ibv_qp > > m_connectedQps; + std::vector< shared_ptr< struct ibv_qp > > m_connectedQps; + std::vector< struct ibv_send_wr > m_srs; // array of send requests + std::vector< size_t > m_srsHeads; // head of 
send queue per + // peer + std::vector< size_t > m_nMsgsPerPeer; // number of messages per + // peer + std::vector< pid_t > m_peerList; - std::vector< struct ibv_send_wr > m_srs; // array of send requests - std::vector< size_t > m_srsHeads; // head of send queue per peer - std::vector< size_t > m_nMsgsPerPeer; // number of messages per peer - SparseSet< pid_t > m_activePeers; // - std::vector< pid_t > m_peerList; + std::vector< struct ibv_sge > m_sges; // array of scatter/gather + // entries + std::vector< struct ibv_wc > m_wcs; // array of work completions + std::vector< char > m_dummyBuffer; // dummy receive buffer - std::vector< struct ibv_sge > m_sges; // array of scatter/gather entries - std::vector< struct ibv_wc > m_wcs; // array of work completions + SparseSet< pid_t > m_activePeers; CombinedMemoryRegister< MemorySlot > m_memreg; - - shared_ptr< struct ibv_mr > m_dummyMemReg; // registration of dummy buffer - std::vector< char > m_dummyBuffer; // dummy receive buffer - - Communication & m_comm; }; - } } diff --git a/src/MPI/ibverbs.t.cpp b/src/MPI/ibverbs.t.cpp index 8b916711..dc2e80a5 100644 --- a/src/MPI/ibverbs.t.cpp +++ b/src/MPI/ibverbs.t.cpp @@ -226,7 +226,6 @@ TEST_F( IBVerbsTests, getAllToAll ) verbs->sync(true); - EXPECT_EQ(a, a2); EXPECT_EQ(b, b2); diff --git a/src/MPI/init.cpp b/src/MPI/init.cpp index 68d16866..97768de1 100644 --- a/src/MPI/init.cpp +++ b/src/MPI/init.cpp @@ -54,9 +54,10 @@ namespace lpf { (engine.compare( "mpirma" ) == 0) || (engine.compare( "mpimsg" ) == 0) || (engine.compare( "ibverbs" ) == 0) || + (engine.compare( "zero" ) == 0) || (engine.compare( "hybrid" ) == 0); if( !engine_is_MPI ) { - (void) std::fprintf( stderr, "Warning: program was compiled for the mpirma, mpimsg, ibverbs, or hybrid engine but run-time requests the %s engine instead. For stable results please compile the program into a universal LPF program (by omitting the -engine flag to the lpfcc/lpfcxx utilities).\n", engine.c_str() ); + (void) std::fprintf( stderr, "Warning: program was compiled for the mpirma, mpimsg, ibverbs, zero, or hybrid engine but run-time requests the %s engine instead. 
For stable results please compile the program into a universal LPF program (by omitting the -engine flag to the lpfcc/lpfcxx utilities).\n", engine.c_str() ); } if( mpi_initializer_ran || !engine_is_MPI ) { diff --git a/src/MPI/interface.cpp b/src/MPI/interface.cpp index 30ece40d..e7f7374a 100644 --- a/src/MPI/interface.cpp +++ b/src/MPI/interface.cpp @@ -93,20 +93,38 @@ catch ( const std::bad_alloc & e) void Interface :: put( memslot_t srcSlot, size_t srcOffset, pid_t dstPid, memslot_t dstSlot, size_t dstOffset, - size_t size ) + size_t size, lpf_msg_attr_t attr ) { m_mesgQueue.put( srcSlot, srcOffset, dstPid, dstSlot, dstOffset, - size ); + size, attr); +} + +void Interface :: flushSent() { + m_mesgQueue.flushSent(); +} + +void Interface :: flushReceived() { + m_mesgQueue.flushReceived(); +} + +err_t Interface :: createNewSyncAttr(sync_attr_t * attr) +{ + if ( 0 == m_aborted ) + { + m_mesgQueue.createNewSyncAttr(attr); + return LPF_SUCCESS; + } + return LPF_ERR_FATAL; } void Interface :: get( pid_t srcPid, memslot_t srcSlot, size_t srcOffset, memslot_t dstSlot, size_t dstOffset, - size_t size ) + size_t size, lpf_msg_attr_t attr ) { m_mesgQueue.get( srcPid, srcSlot, srcOffset, dstSlot, dstOffset, - size ); + size, attr); } memslot_t Interface :: registerGlobal( void * mem, size_t size ) @@ -119,11 +137,21 @@ memslot_t Interface :: registerLocal( void * mem, size_t size ) return m_mesgQueue.addLocalReg( mem, size ); } +tag_t Interface :: registerTag() +{ + return m_mesgQueue.addTag(); +} + void Interface :: deregister( memslot_t slot ) { m_mesgQueue.removeReg( slot ); } +void Interface :: destroyTag( tag_t tag ) +{ + m_mesgQueue.removeTag( tag ); +} + err_t Interface :: resizeMemreg( size_t nRegs ) { return m_mesgQueue.resizeMemreg( nRegs ); @@ -134,12 +162,24 @@ err_t Interface :: resizeMesgQueue( size_t nMsgs ) return m_mesgQueue.resizeMesgQueue( nMsgs ); } +err_t Interface :: resizeTagRegister( size_t nTags ) +{ + return m_mesgQueue.resizeTagreg( nTags ); +} + void Interface :: abort() { ASSERT( 0 == m_aborted ); +#ifdef LPF_CORE_MPI_USES_zero + int vote = 1; + int voted; + m_comm.allreduceSum(&vote, &voted, 1); + m_aborted = voted; +#else // signal all other processes at the start of the next 'sync' that // this process aborted. 
- m_aborted = m_mesgQueue.sync( true ); + m_aborted = m_mesgQueue.sync( true, LPF_SYNC_DEFAULT ); +#endif } pid_t Interface :: isAborted() const @@ -147,11 +187,11 @@ pid_t Interface :: isAborted() const return m_aborted; } -err_t Interface :: sync() +err_t Interface :: sync( sync_attr_t attr ) { if ( 0 == m_aborted ) { - m_aborted = m_mesgQueue.sync( false ); + m_aborted = m_mesgQueue.sync( false, attr ); } if ( 0 == m_aborted ) diff --git a/src/MPI/interface.hpp b/src/MPI/interface.hpp index 732f0a9b..6649af30 100644 --- a/src/MPI/interface.hpp +++ b/src/MPI/interface.hpp @@ -27,7 +27,7 @@ namespace lpf { - class _LPFLIB_LOCAL Process; +class _LPFLIB_LOCAL Process; class _LPFLIB_LOCAL Interface { @@ -39,36 +39,107 @@ class _LPFLIB_LOCAL Interface } _LPFLIB_API - static void initRoot(int *argc, char ***argv); + static void initRoot(int *argc, char ***argv) ; - Interface( mpi::Comm machine, Process & subprocess ); + Interface( mpi::Comm machine, Process & subprocess ) ; void put( memslot_t srcSlot, size_t srcOffset, pid_t dstPid, memslot_t dstSlot, size_t dstOffset, - size_t size ) ; // nothrow + size_t size, lpf_msg_attr_t attr) ; // nothrow void get( pid_t srcPid, memslot_t srcSlot, size_t srcOffset, memslot_t dstSlot, size_t dstOffset, - size_t size ) ;// nothrow + size_t size, lpf_msg_attr_t attr) ; // nothrow memslot_t registerGlobal( void * mem, size_t size ) ; // nothrow memslot_t registerLocal( void * mem, size_t size ) ; // nothrow + tag_t registerTag() ; // can throw(!) + void deregister( memslot_t slot ) ; // nothrow + void destroyTag( tag_t tag ) ; // can throw(!) + err_t resizeMemreg( size_t nRegs ) ; // nothrow err_t resizeMesgQueue( size_t nMsgs ) ; // nothrow + err_t resizeTagRegister( size_t nTags ) ; // can throw(!) + void abort() ; // nothrow pid_t isAborted() const ; - err_t sync(); // nothrow + err_t sync( sync_attr_t attr ) ; // nothrow err_t exec( pid_t P, spmd_t spmd, args_t args ) ; - static err_t hook( const mpi::Comm & comm , spmd_t spmd, args_t args ); + static err_t hook( const mpi::Comm & comm , spmd_t spmd, args_t args ) ; + + err_t createNewSyncAttr(sync_attr_t * attr) ; + + inline void destroySyncAttr(sync_attr_t attr) + { + if ( 0 == m_aborted ) + { + return m_mesgQueue.destroySyncAttr(attr); + } + } + + inline tag_t getTagFromSyncAttr(sync_attr_t attr) noexcept + { + if ( 0 == m_aborted ) + { + return m_mesgQueue.getTagFromSyncAttr(attr); + } + return LPF_INVALID_TAG; + } + + inline void setTagInSyncAttr(tag_t tag, sync_attr_t attr) noexcept + { + if ( 0 == m_aborted ) + { + m_mesgQueue.setTagInSyncAttr(tag,attr); + } + } + + inline void setZCAttr(size_t sent, size_t rcvd, sync_attr_t attr) noexcept + { + if ( 0 == m_aborted ) + { + m_mesgQueue.setZCAttr(sent,rcvd,attr); + } + } + + inline void getZCAttr(sync_attr_t attr, size_t &sent, size_t &rcvd) noexcept + { + if ( 0 == m_aborted ) + { + m_mesgQueue.getZCAttr(attr,sent,rcvd); + } + } + + typedef size_t SlotID; + + inline void getRcvdMsgCount(size_t * msgs, sync_attr_t attr) noexcept + { + if ( 0 == m_aborted ) + { + m_mesgQueue.getRcvdMsgCount(msgs, attr); + } + } + + inline void getSentMsgCount(size_t * msgs, sync_attr_t attr) noexcept + { + if ( 0 == m_aborted ) + { + m_mesgQueue.getSentMsgCount(msgs, attr); + } + } + + void flushSent(); + + void flushReceived(); err_t rehook( spmd_t spmd, args_t args); diff --git a/src/MPI/memorytable.cpp b/src/MPI/memorytable.cpp index 3bb7a792..57dff485 100644 --- a/src/MPI/memorytable.cpp +++ b/src/MPI/memorytable.cpp @@ -23,8 +23,8 @@ namespace lpf { 
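The attribute plumbing added to lpf::Interface above backs the public lpf_zero_* entry points implemented earlier in src/MPI/core.cpp. The following sketch is not part of this patch: it only strings those calls together in an order the implementations suggest, it assumes the zero engine is selected at run time, and it assumes the extension header is installed as lpf/zero.h; the interpretation of the expected counts is likewise an assumption rather than a documented guarantee.

    #include <lpf/core.h>
    #include <lpf/zero.h>

    /* Hypothetical SPMD body: one communication phase fenced through a
     * zero-engine sync attribute instead of a plain lpf_sync. */
    void example_spmd( lpf_t ctx, lpf_pid_t pid, lpf_pid_t nprocs, lpf_args_t args )
    {
        (void) pid; (void) nprocs; (void) args;

        lpf_sync_attr_t sattr = LPF_SYNC_DEFAULT;
        lpf_zero_create_sattr( ctx, &sattr );

        /* Assumed semantics: announce that this phase sends one message and
         * receives one message. */
        lpf_zero_set_expected( ctx, 1, 1, sattr );

        /* ... register memory slots and issue lpf_put()/lpf_get() here,
         *     exactly as with the core API ... */

        lpf_sync( ctx, sattr );

        size_t rcvd = 0, sent = 0;
        lpf_zero_get_status( ctx, sattr, &rcvd, &sent );

        lpf_zero_destroy_sattr( ctx, sattr );
    }

The counts reported by lpf_zero_get_status are kept per attribute by the engine, as wired through MessageQueue::getRcvdMsgCount and MessageQueue::getSentMsgCount above.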
MemoryTable :: MemoryTable( Communication & comm -#ifdef LPF_CORE_MPI_USES_ibverbs - , mpi::IBVerbs & ibverbs +#if defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero + , IBVerbs & ibverbs #endif ) : m_memreg() @@ -34,7 +34,7 @@ MemoryTable :: MemoryTable( Communication & comm , m_removed( 0, 0 ) , m_comm( comm ) #endif -#ifdef LPF_CORE_MPI_USES_ibverbs +#if defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero , m_added( 0, 0 ) , m_ibverbs( ibverbs ) , m_comm( comm ) @@ -45,7 +45,7 @@ MemoryTable :: MemoryTable( Communication & comm MemoryTable :: Slot MemoryTable :: addLocal( void * mem, std::size_t size ) // nothrow { -#ifdef LPF_CORE_MPI_USES_ibverbs +#if defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero Memory rec( mem, size, m_ibverbs.regLocal( mem, size)); #else Memory rec( mem, size); @@ -55,14 +55,14 @@ MemoryTable :: addLocal( void * mem, std::size_t size ) // nothrow MemoryTable :: Slot MemoryTable :: addGlobal( void * mem, std::size_t size ) // nothrow -{ -#ifdef LPF_CORE_MPI_USES_ibverbs - Memory rec(mem, size, -1); +{ +#if defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero + Memory rec(mem, size, -1); #else - Memory rec(mem, size); + Memory rec(mem, size); #endif - Slot slot = m_memreg.addGlobalReg(rec) ; -#if defined LPF_CORE_MPI_USES_mpirma || defined LPF_CORE_MPI_USES_ibverbs + Slot slot = m_memreg.addGlobalReg(rec) ; +#if defined LPF_CORE_MPI_USES_mpirma || defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero m_added.insert( slot ); #endif return slot; @@ -92,7 +92,7 @@ void MemoryTable :: remove( Slot slot ) // nothrow m_memreg.removeReg( slot ); #endif -#ifdef LPF_CORE_MPI_USES_ibverbs +#if defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero if (m_added.contains(slot)) { m_added.erase(slot); } @@ -123,7 +123,7 @@ void MemoryTable :: reserve( size_t size ) // throws bad_alloc, strong safe m_memreg.reserve( size ); #endif -#ifdef LPF_CORE_MPI_USES_ibverbs +#if defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero m_memreg.reserve( size ); size_t range = m_memreg.range(); m_added.resize( range ); @@ -139,24 +139,25 @@ size_t MemoryTable :: capacity() const } size_t MemoryTable :: range() const -{ +{ return m_memreg.range(); } bool MemoryTable :: needsSync() const -{ +{ #ifdef LPF_CORE_MPI_USES_mpirma return ! m_added.empty() || !m_removed.empty(); -#endif -#ifdef LPF_CORE_MPI_USES_mpimsg +#elif LPF_CORE_MPI_USES_mpimsg return false; -#endif -#ifdef LPF_CORE_MPI_USES_ibverbs +#elif defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero return !m_added.empty(); +#else // This case should NOT occur? 
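+ // none of the LPF_CORE_MPI_USES_<engine> macros matched; the build always
+ // defines one of them, so fail loudly rather than guess a default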
+ fprintf(stderr, "An unknown engine in MPI/memorytable.cpp\n"); + std::abort(); #endif } -void MemoryTable :: sync( ) +void MemoryTable :: sync( ) { #ifdef LPF_CORE_MPI_USES_mpirma if ( !m_removed.empty() ) @@ -184,17 +185,17 @@ void MemoryTable :: sync( ) ASSERT( !isLocalSlot( *i )); void * base = m_memreg.lookup( *i).addr; size_t size = m_memreg.lookup( *i ).size; - Window w = m_comm.createMemslot( base, size ); + Window w = m_comm.createMemslot( base, size ); m_windows[ *i ] = w; m_comm.fence( w ); } // clear the added list m_added.clear(); - } // if + } // if #endif -#ifdef LPF_CORE_MPI_USES_ibverbs +#if defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero if ( !m_added.empty() ) { // Register the global with IBverbs @@ -204,7 +205,7 @@ void MemoryTable :: sync( ) ASSERT( !isLocalSlot( *i )); void * base = m_memreg.lookup( *i).addr; size_t size = m_memreg.lookup( *i ).size; - mpi::IBVerbs::SlotID s = m_ibverbs.regGlobal( base, size ); + IBVerbs::SlotID s = m_ibverbs.regGlobal( base, size ); m_memreg.update( *i ).slot = s; } diff --git a/src/MPI/memorytable.hpp b/src/MPI/memorytable.hpp index 18dd5038..55f1fe59 100644 --- a/src/MPI/memorytable.hpp +++ b/src/MPI/memorytable.hpp @@ -1,4 +1,3 @@ - /* * Copyright 2021 Huawei Technologies Co., Ltd. * @@ -27,6 +26,9 @@ #ifdef LPF_CORE_MPI_USES_ibverbs #include "ibverbs.hpp" #endif +#ifdef LPF_CORE_MPI_USES_zero +#include "zero.hpp" +#endif #include @@ -41,12 +43,18 @@ class _LPFLIB_LOCAL MemoryTable #ifdef LPF_CORE_MPI_USES_mpirma typedef Communication::Memslot Window; #endif +#ifdef LPF_CORE_MPI_USES_ibverbs + typedef mpi::IBVerbs IBVerbs; +#elif defined LPF_CORE_MPI_USES_zero + typedef mpi::Zero IBVerbs; +#endif struct Memory { char *addr; size_t size; -#ifdef LPF_CORE_MPI_USES_ibverbs - mpi::IBVerbs::SlotID slot; - Memory( void * a, size_t s, mpi::IBVerbs::SlotID sl) +#if defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero + typedef IBVerbs::SlotID SlotID; + SlotID slot; + Memory( void * a, size_t s, SlotID sl) : addr(static_cast(a)) , size(s), slot(sl) {} Memory() : addr(NULL), size(0u), slot(-1) {} @@ -67,6 +75,8 @@ class _LPFLIB_LOCAL MemoryTable #ifdef LPF_CORE_MPI_USES_ibverbs explicit MemoryTable( Communication & comm, mpi::IBVerbs & verbs ); +#elif defined LPF_CORE_MPI_USES_zero + explicit MemoryTable( Communication & comm, mpi::Zero & verbs ); #else explicit MemoryTable( Communication & comm ); #endif @@ -78,7 +88,11 @@ class _LPFLIB_LOCAL MemoryTable void remove( Slot slot ); // nothrow void * getAddress( Slot slot, size_t offset ) const // nothrow - { ASSERT( offset <= m_memreg.lookup(slot).size ); + { + if (offset > m_memreg.lookup(slot).size) { + LOG(5, "Offset:" << offset << " m_Memreg.lookup(slot).size = " << m_memreg.lookup(slot).size); + } + ASSERT( offset <= m_memreg.lookup(slot).size ); return m_memreg.lookup(slot).addr + offset; } @@ -90,8 +104,12 @@ class _LPFLIB_LOCAL MemoryTable { return m_windows[ slot ]; } #endif -#ifdef LPF_CORE_MPI_USES_ibverbs +#if defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero +#ifdef LPF_CORE_MPI_USES_ibverbs mpi::IBVerbs::SlotID getVerbID( Slot slot ) const +#elif defined LPF_CORE_MPI_USES_zero + mpi::Zero::SlotID getVerbID( Slot slot ) const +#endif { return m_memreg.lookup( slot ).slot; } #endif @@ -118,9 +136,13 @@ class _LPFLIB_LOCAL MemoryTable Communication & m_comm; #endif -#ifdef LPF_CORE_MPI_USES_ibverbs +#if defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero DirtyList m_added; +#ifdef LPF_CORE_MPI_USES_ibverbs 
mpi::IBVerbs & m_ibverbs; +#elif defined LPF_CORE_MPI_USES_zero + mpi::Zero & m_ibverbs; +#endif Communication & m_comm; #endif }; diff --git a/src/MPI/mesgqueue.cpp b/src/MPI/mesgqueue.cpp index 0f610a52..07f6b641 100644 --- a/src/MPI/mesgqueue.cpp +++ b/src/MPI/mesgqueue.cpp @@ -16,6 +16,11 @@ */ #include "mesgqueue.hpp" +#ifdef LPF_CORE_MPI_USES_zero +#include "zero.hpp" +#else +#include "ibverbs.hpp" +#endif #include "mpilib.hpp" #include "log.hpp" #include "assert.hpp" @@ -97,14 +102,14 @@ MessageQueue :: MessageQueue( Communication & comm ) , m_edgeRecv() , m_edgeSend() , m_edgeBuffer() -#if defined LPF_CORE_MPI_USES_mpirma || defined LPF_CORE_MPI_USES_ibverbs +#if defined LPF_CORE_MPI_USES_mpirma || defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero , m_edgeBufferSlot( m_memreg.invalidSlot() ) #endif , m_bodySends() , m_bodyRecvs() , m_comm( dynamic_cast(comm) ) -#ifdef LPF_CORE_MPI_USES_ibverbs - , m_ibverbs( m_comm ) +#if defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero + , m_ibverbs(m_comm) , m_memreg( m_comm, m_ibverbs ) #else , m_memreg( m_comm ) @@ -179,7 +184,7 @@ err_t MessageQueue :: resizeMesgQueue( size_t nMsgs ) #ifdef LPF_CORE_MPI_USES_mpimsg m_comm.reserveMsgs( 6* nMsgs ); //another factor three stems from sending edges separately . #endif -#ifdef LPF_CORE_MPI_USES_ibverbs +#if defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero m_ibverbs.resizeMesgq( 6*nMsgs); #endif @@ -243,6 +248,23 @@ err_t MessageQueue :: resizeMemreg( size_t nRegs ) return LPF_SUCCESS; } +err_t MessageQueue :: resizeTagreg( size_t nRegs ) +{ +#ifdef LPF_CORE_MPI_USES_zero + try { + m_ibverbs.resizeTagreg( nRegs ); + } catch (const std::bad_alloc &) { + return LPF_ERR_OUT_OF_MEMORY; + } catch (...) { + return LPF_ERR_FATAL; + } + return LPF_SUCCESS; +#else + (void) nRegs; + throw std::runtime_error("Selected engine does not support tags"); +#endif +} + memslot_t MessageQueue :: addLocalReg( void * mem, std::size_t size) { memslot_t slot = m_memreg.addLocal( mem, size ); @@ -259,6 +281,15 @@ memslot_t MessageQueue :: addGlobalReg( void * mem, std::size_t size ) return slot; } +tag_t MessageQueue :: addTag() +{ +#ifdef LPF_CORE_MPI_USES_zero + return m_ibverbs.regTag(); +#else + throw std::runtime_error("Selected engine does not support tags"); +#endif +} + void MessageQueue :: removeReg( memslot_t slot ) { if (m_memreg.getSize( slot ) > 0) @@ -267,91 +298,127 @@ void MessageQueue :: removeReg( memslot_t slot ) m_memreg.remove( slot ); } +void MessageQueue :: removeTag( tag_t tag ) +{ +#ifdef LPF_CORE_MPI_USES_zero + m_ibverbs.deregTag( tag ); +#else + (void) tag; + throw std::runtime_error("Selected engine does not support tags"); +#endif +} + void MessageQueue :: get( pid_t srcPid, memslot_t srcSlot, size_t srcOffset, - memslot_t dstSlot, size_t dstOffset, size_t size ) + memslot_t dstSlot, size_t dstOffset, size_t size, lpf_msg_attr_t attr) { - if (size > 0) + if( size == 0 ) { return; } + ASSERT( ! m_memreg.isLocalSlot( srcSlot ) ); + if ( srcPid == static_cast(m_pid) ) { - ASSERT( ! 
m_memreg.isLocalSlot( srcSlot ) ); - void * address = m_memreg.getAddress( dstSlot, dstOffset ); - if ( srcPid == static_cast(m_pid) ) - { - std::memcpy( address, m_memreg.getAddress( srcSlot, srcOffset), size); - } - else - { - using mpi::ipc::newMsg; - - if (size <= m_tinyMsgSize ) - { - // send immediately the request to the source - newMsg( BufGet, m_tinyMsgBuf.data(), m_tinyMsgBuf.size() ) - .write( DstPid , m_pid ) - .write( SrcSlot, srcSlot) - .write( DstSlot, dstSlot) - .write( SrcOffset, srcOffset ) - .write( DstOffset, dstOffset ) - .write( Size, size ) - .send( *m_firstQueue, srcPid ); - } - else - { - // send the request to the destination process (this process) - // for write conflict resolution - newMsg( HpGet, m_tinyMsgBuf.data(), m_tinyMsgBuf.size() ) - .write( SrcPid, srcPid ) - .write( DstPid, m_pid ) - .write( SrcSlot, srcSlot ) - .write( DstSlot, dstSlot ) - .write( SrcOffset, srcOffset ) - .write( DstOffset, dstOffset ) - .write( Size, size ) - . send( *m_firstQueue, m_pid ); - } - } + void * const address = m_memreg.getAddress( dstSlot, dstOffset ); + (void) std::memcpy( + address, + m_memreg.getAddress( srcSlot, srcOffset), size + ); + return; } +#ifdef LPF_CORE_MPI_USES_zero + m_ibverbs.get( + srcPid, + m_memreg.getVerbID( srcSlot ), + srcOffset, + m_memreg.getVerbID( dstSlot ), + dstOffset, + size, attr); +#else + (void) attr; // this engine does not use message attributes + using mpi::ipc::newMsg; + + if (size <= m_tinyMsgSize ) + { + // send immediately the request to the source + newMsg( BufGet, m_tinyMsgBuf.data(), m_tinyMsgBuf.size() ) + .write( DstPid , m_pid ) + .write( SrcSlot, srcSlot) + .write( DstSlot, dstSlot) + .write( SrcOffset, srcOffset ) + .write( DstOffset, dstOffset ) + .write( Size, size ) + .send( *m_firstQueue, srcPid ); + } else { + // send the request to the destination process (this process) + // for write conflict resolution + newMsg( HpGet, m_tinyMsgBuf.data(), m_tinyMsgBuf.size() ) + .write( SrcPid, srcPid ) + .write( DstPid, m_pid ) + .write( SrcSlot, srcSlot ) + .write( DstSlot, dstSlot ) + .write( SrcOffset, srcOffset ) + .write( DstOffset, dstOffset ) + .write( Size, size ) + .send( *m_firstQueue, m_pid ); + } +#endif } void MessageQueue :: put( memslot_t srcSlot, size_t srcOffset, - pid_t dstPid, memslot_t dstSlot, size_t dstOffset, size_t size ) + pid_t dstPid, memslot_t dstSlot, size_t dstOffset, size_t size, lpf_msg_attr_t attr) { - if (size > 0) + if (size == 0 ) { return; } + ASSERT( ! m_memreg.isLocalSlot( dstSlot ) ); + void * const address = m_memreg.getAddress( srcSlot, srcOffset ); + if ( dstPid == static_cast(m_pid) ) { - ASSERT( ! m_memreg.isLocalSlot( dstSlot ) ); - void * address = m_memreg.getAddress( srcSlot, srcOffset ); - if ( dstPid == static_cast(m_pid) ) - { - std::memcpy( m_memreg.getAddress( dstSlot, dstOffset), address, size); - } - else - { - using mpi::ipc::newMsg; - if (size <= m_tinyMsgSize ) - { - newMsg( BufPut, m_tinyMsgBuf.data(), m_tinyMsgBuf.size() ) - .write( DstSlot, dstSlot ) - .write( DstOffset, dstOffset ) - .write( Payload, address, size ) - . 
send( *m_firstQueue, dstPid ); - } - else - { - newMsg( HpPut, m_tinyMsgBuf.data(), m_tinyMsgBuf.size() ) - .write( SrcPid, m_pid ) - .write( DstPid, dstPid ) - .write( SrcSlot, srcSlot ) - .write( DstSlot, dstSlot ) - .write( SrcOffset, srcOffset ) - .write( DstOffset, dstOffset ) - .write( Size, size ) - .send( *m_firstQueue, dstPid ); - } - } + (void) std::memcpy( + m_memreg.getAddress( dstSlot, dstOffset), + address, size + ); + return; + } +#ifdef LPF_CORE_MPI_USES_zero + m_ibverbs.put( m_memreg.getVerbID( srcSlot), + srcOffset, + dstPid, + m_memreg.getVerbID( dstSlot), + dstOffset, + size, + attr); +#else + (void) attr; // this engine does not use message attributes + using mpi::ipc::newMsg; + if (size <= m_tinyMsgSize ) + { + newMsg( BufPut, m_tinyMsgBuf.data(), m_tinyMsgBuf.size() ) + .write( DstSlot, dstSlot ) + .write( DstOffset, dstOffset ) + .write( Payload, address, size ) + .send( *m_firstQueue, dstPid ); + } else { + newMsg( HpPut, m_tinyMsgBuf.data(), m_tinyMsgBuf.size() ) + .write( SrcPid, m_pid ) + .write( DstPid, dstPid ) + .write( SrcSlot, srcSlot ) + .write( DstSlot, dstSlot ) + .write( SrcOffset, srcOffset ) + .write( DstOffset, dstOffset ) + .write( Size, size ) + .send( *m_firstQueue, dstPid ); } +#endif } -int MessageQueue :: sync( bool abort ) +int MessageQueue :: sync(bool abort, sync_attr_t attr) { +#ifdef LPF_CORE_MPI_USES_zero + // if not, deal with normal sync + (void)abort; + m_memreg.sync(); + m_ibverbs.sync(m_resized, + static_cast< Backend::SyncAttr * >(attr)); + m_resized = false; +#else + (void)attr; + LOG(4, "mpi :: MessageQueue :: sync( abort " << (abort?"true":"false") << " )"); using mpi::ipc::newMsg; @@ -971,9 +1038,34 @@ int MessageQueue :: sync( bool abort ) ASSERT( m_bodyRecvs.empty() ); LOG(4, "End of synchronisation"); +#endif return 0; } +void MessageQueue :: createNewSyncAttr(sync_attr_t * attr) +{ + ASSERT(attr != NULL); +#ifdef LPF_CORE_MPI_USES_zero + m_ibverbs.createNewSyncAttr( + reinterpret_cast< Backend::SyncAttr * * >(attr)); +#else + *attr = LPF_SYNC_DEFAULT; +#endif +} + +void MessageQueue :: flushSent() +{ +#ifdef LPF_CORE_MPI_USES_zero + m_ibverbs.flushSent(); +#endif +} + +void MessageQueue :: flushReceived() +{ +#ifdef LPF_CORE_MPI_USES_zero + m_ibverbs.flushReceived(); +#endif +} } // namespace lpf diff --git a/src/MPI/mesgqueue.hpp b/src/MPI/mesgqueue.hpp index 27e7beb5..424ba5bf 100644 --- a/src/MPI/mesgqueue.hpp +++ b/src/MPI/mesgqueue.hpp @@ -36,34 +36,133 @@ #ifdef LPF_CORE_MPI_USES_ibverbs #include "ibverbs.hpp" #endif +#ifdef LPF_CORE_MPI_USES_zero +#include "zero.hpp" +#endif + namespace lpf { class _LPFLIB_LOCAL MessageQueue { + public: explicit MessageQueue( Communication & comm ); err_t resizeMemreg( size_t nRegs ); err_t resizeMesgQueue( size_t nMsgs ); - + err_t resizeTagreg( size_t nTags ); memslot_t addLocalReg( void * mem, std::size_t size ); memslot_t addGlobalReg( void * mem, std::size_t size ); - void removeReg( memslot_t slot ); + tag_t addTag(); + + void removeReg( memslot_t slot ); + void removeTag( tag_t tag ); void get( pid_t srcPid, memslot_t srcSlot, size_t srcOffset, - memslot_t dstSlot, size_t dstOffset, size_t size ); + memslot_t dstSlot, size_t dstOffset, size_t size, lpf_msg_attr_t attr); void put( memslot_t srcSlot, size_t srcOffset, - pid_t dstPid, memslot_t dstSlot, size_t dstOffset, size_t size ); + pid_t dstPid, memslot_t dstSlot, size_t dstOffset, size_t size, lpf_msg_attr_t attr); // returns how many processes have entered in an aborted state - int sync( bool abort ); + int sync(bool 
abort, sync_attr_t attr); + + inline void getRcvdMsgCount(size_t * msgs, sync_attr_t attr) noexcept + { + ASSERT(msgs != nullptr); +#ifdef LPF_CORE_MPI_USES_zero + m_ibverbs.get_rcvd_msg_count(*msgs, + static_cast< Backend::SyncAttr * >(attr)); +#else + (void)attr; +#endif + } + + inline void getSentMsgCount(size_t * msgs, sync_attr_t attr) noexcept + { + ASSERT(msgs != nullptr); +#ifdef LPF_CORE_MPI_USES_zero + m_ibverbs.get_sent_msg_count(*msgs, + static_cast< Backend::SyncAttr * >(attr)); +#else + (void)attr; +#endif + } + + void flushSent(); + + void flushReceived(); + + int countingSyncPerSlot(memslot_t slot, size_t expected_sent, size_t expected_rcvd); + + int syncPerSlot(memslot_t slot); + + void createNewSyncAttr(sync_attr_t * attr); + + inline void destroySyncAttr(sync_attr_t attr) + { +#ifdef LPF_CORE_MPI_USES_zero + m_ibverbs.destroySyncAttr( + static_cast< Backend::SyncAttr * >(attr)); +#else + (void)attr; +#endif + } + + inline tag_t getTagFromSyncAttr(sync_attr_t attr) noexcept + { + ASSERT(attr != NULL); +#ifdef LPF_CORE_MPI_USES_zero + return m_ibverbs.getTag( + *static_cast< Backend::SyncAttr * >(attr)); +#else + return LPF_INVALID_TAG; +#endif + } + + inline void setTagInSyncAttr(tag_t tag, sync_attr_t attr) noexcept + { + ASSERT(attr != NULL); +#ifdef LPF_CORE_MPI_USES_zero + return m_ibverbs.setTag(tag, + *static_cast< Backend::SyncAttr * >(attr)); +#else + (void)tag; +#endif + } + + inline void setZCAttr(size_t sent, size_t rcvd, sync_attr_t attr) noexcept + { + ASSERT(attr != NULL); +#ifdef LPF_CORE_MPI_USES_zero + return m_ibverbs.setZCAttr(sent,rcvd, + *static_cast< Backend::SyncAttr * >(attr)); +#else + (void)sent; + (void)rcvd; + (void)attr; +#endif + } + + inline void getZCAttr(sync_attr_t attr, size_t &sent, size_t &rcvd) noexcept + { + ASSERT(attr != NULL); +#ifdef LPF_CORE_MPI_USES_zero + return m_ibverbs.getZCAttr( + *static_cast< Backend::SyncAttr * >(attr), + sent, rcvd); +#else + (void)attr; + (void)sent; + (void)rcvd; +#endif + } private: - enum Msgs { BufPut , + enum Msgs { BufPut , BufGet, BufGetReply, HpPut, HpGet , HpBodyReply , HpEdges, HpEdgesReply }; @@ -72,7 +171,7 @@ class _LPFLIB_LOCAL MessageQueue SrcPid, DstPid, SrcOffset, DstOffset, BufOffset, SrcSlot, DstSlot, Size, - RoundedDstOffset, RoundedSize, + RoundedDstOffset, RoundedSize, Payload, Head, Tail}; struct Edge { @@ -106,6 +205,11 @@ class _LPFLIB_LOCAL MessageQueue typedef mpi::VirtualAllToAll Queue; +#if defined LPF_CORE_MPI_USES_ibverbs + typedef mpi::IBVerbs Backend; +#elif defined LPF_CORE_MPI_USES_zero + typedef mpi::Zero Backend; +#endif static Queue * newQueue( pid_t pid, pid_t nprocs ); const pid_t m_pid, m_nprocs; @@ -126,14 +230,14 @@ class _LPFLIB_LOCAL MessageQueue std::vector< Edge > m_edgeRecv; std::vector< Edge > m_edgeSend; std::vector< char > m_edgeBuffer; -#if defined LPF_CORE_MPI_USES_mpirma || defined LPF_CORE_MPI_USES_ibverbs +#if defined LPF_CORE_MPI_USES_mpirma || defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero memslot_t m_edgeBufferSlot; #endif std::vector< Body > m_bodySends; std::vector< Body > m_bodyRecvs; mpi::Comm m_comm; -#ifdef LPF_CORE_MPI_USES_ibverbs - mpi::IBVerbs m_ibverbs; +#if defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero + Backend m_ibverbs; #endif MemoryTable m_memreg; std::vector< char > m_tinyMsgBuf; diff --git a/src/MPI/types.hpp b/src/MPI/types.hpp index f587e437..ae5ae61c 100644 --- a/src/MPI/types.hpp +++ b/src/MPI/types.hpp @@ -19,15 +19,18 @@ #define LPF_CORE_TYPES_HPP #include "lpf/core.h" 
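+// lpf/tags.h declares lpf_tag_t, which is aliased to lpf::tag_t below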
+#include "lpf/tags.h" namespace lpf { typedef lpf_err_t err_t; typedef lpf_pid_t pid_t; +typedef lpf_tag_t tag_t; typedef lpf_args_t args_t; typedef lpf_spmd_t spmd_t; typedef lpf_memslot_t memslot_t; typedef lpf_machine_t machine_t; +typedef lpf_sync_attr_t sync_attr_t; } diff --git a/src/MPI/zero.cpp b/src/MPI/zero.cpp new file mode 100644 index 00000000..2053fbf5 --- /dev/null +++ b/src/MPI/zero.cpp @@ -0,0 +1,1091 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "log.hpp" +#include "zero.hpp" +#include "config.hpp" +#include "communication.hpp" + +#include +#include +#include +#include +#include +#include + +#define POLL_BATCH 64 +#define MAX_POLLING 128 + + +namespace lpf { namespace mpi { + +struct Zero::Exception : std::runtime_error { + Exception(const char * what) : std::runtime_error( what ) {} +}; + +namespace { + ibv_mtu getMTU( unsigned size ) { + switch (size) { + case 256: return IBV_MTU_256; + case 512: return IBV_MTU_512; + case 1024: return IBV_MTU_1024; + case 2048: return IBV_MTU_2048; + case 4096: return IBV_MTU_4096; + default: throw Zero::Exception("Illegal MTU size"); + } + return IBV_MTU_4096; + } +} + +Zero :: Zero( Communication & comm ) + : m_pid( comm.pid() ) + , m_nprocs( comm.nprocs() ) + , m_ibPort( Config::instance().getIBPort() ) + , m_gidIdx( Config::instance().getIBGidIndex() ) + , m_maxRegSize(0) + , m_maxMsgSize(0) + , m_cqSize(1) + , m_minNrMsgs(0) + , m_maxSrs(0) + , m_postCount(0) + , m_recvCount(0) + , m_tag_capacity(0) + , m_device() + , m_pd() + , m_cqLocal() + , m_cqRemote() + , m_dummyMemReg() + , m_numMsgs(0) + , m_recvTotalInitMsgCount(0) + , m_sentMsgs(0) + , m_recvdMsgs(0) + , m_comm( comm ) + , m_devName() + , m_mtu( getMTU( Config::instance().getIBMTU() )) + , m_stagedQps( m_nprocs ) + , m_connectedQps( m_nprocs ) + , m_srs() + , m_srsHeads( m_nprocs, 0u ) + , m_nMsgsPerPeer( m_nprocs, 0u ) + , m_peerList() + , m_sges() + , m_dummyBuffer() + , m_activePeers(0, m_nprocs) + , m_memreg() +{ + m_peerList.reserve( m_nprocs ); + + int numDevices = -1; + struct ibv_device * * const try_get_device_list = + ibv_get_device_list( &numDevices ); + + if (!try_get_device_list) { + LOG(1, "Cannot get list of Infiniband devices" ); + throw Exception( "failed to get IB devices list"); + } + + shared_ptr< struct ibv_device * > devList( + try_get_device_list, + ibv_free_device_list ); + + LOG(3, "Retrieved Infiniband device list, which has " << numDevices + << " devices" ); + + if (numDevices < 1) { + LOG(1, "There are " << numDevices << " Infiniband devices" + " available, which is not enough" ); + throw Exception( "No Infiniband devices available" ); + } + + std::string wantDevName = Config::instance().getIBDeviceName(); + LOG( 3, "Searching for device '"<< wantDevName << "'" ); + struct ibv_device * dev = NULL; + for (int i = 0; i < numDevices; i ++) + { + std::string name = ibv_get_device_name( (&*devList)[i]); + LOG(3, "Device " << i << " has name '" << name << "'" ); + 
if ( wantDevName.empty() || name == wantDevName ) {
+ LOG(3, "Found device '" << name << "'" );
+ m_devName = name;
+ dev = (&*devList)[i];
+ break;
+ }
+ }
+
+ if (dev == NULL) {
+ LOG(1, "Could not find device '" << wantDevName << "'" );
+ throw Exception("Infiniband device not found");
+ }
+
+ struct ibv_context * const ibv_context_new_p = ibv_open_device(dev);
+ if( ibv_context_new_p == NULL )
+ m_device.reset();
+ else
+ m_device.reset( ibv_context_new_p, ibv_close_device );
+ if (!m_device) {
+ LOG(1, "Failed to open Infiniband device '" << m_devName << "'");
+ throw Exception("Cannot open IB device");
+ }
+ LOG(3, "Opened Infiniband device '" << m_devName << "'" );
+
+ devList.reset();
+ LOG(3, "Closed Infiniband device list" );
+
+ std::memset(&m_deviceAttr, 0, sizeof(m_deviceAttr));
+ if (ibv_query_device( m_device.get(), &m_deviceAttr ))
+ throw Exception("Cannot query device");
+
+ LOG(3, "Queried IB device capabilities" );
+
+ m_maxRegSize = m_deviceAttr.max_mr_size;
+ LOG(3, "Maximum size for memory registration = " << m_maxRegSize );
+
+ // maximum number of work requests per Queue Pair
+ m_maxSrs = std::min( m_deviceAttr.max_qp_wr, // maximum work requests per QP
+ m_deviceAttr.max_cqe ); // maximum entries per CQ
+ LOG(3, "Maximum number of send requests is the minimum of "
+ << m_deviceAttr.max_qp_wr << " (the maximum of work requests per QP)"
+ << " and " << m_deviceAttr.max_cqe << " (the maximum of completion "
+ << " queue entries per QP), namely " << m_maxSrs );
+
+ if ( m_deviceAttr.max_cqe < m_nprocs )
+ throw Exception("Completion queue has insufficient completion queue capabilities");
+
+ struct ibv_port_attr port_attr; std::memset( &port_attr, 0, sizeof(port_attr));
+ if (ibv_query_port( m_device.get(), m_ibPort, & port_attr ))
+ throw Exception("Cannot query IB port");
+
+ LOG(3, "Queried IB port " << m_ibPort << " capabilities" );
+
+ // store Maximum message size
+ m_maxMsgSize = port_attr.max_msg_sz;
+ LOG(3, "Maximum IB message size is " << m_maxMsgSize );
+
+ size_t sysRam = Config::instance().getLocalRamSize();
+ m_minNrMsgs = sysRam / m_maxMsgSize;
+ LOG(3, "Minimum number of messages to allocate = "
+ "total system RAM / maximum message size = "
+ << sysRam << " / " << m_maxMsgSize << " = " << m_minNrMsgs );
+
+ // store LID
+ m_lid = port_attr.lid;
+ LOG(3, "LID is " << m_lid );
+
+ struct ibv_pd * const pd_new_p = ibv_alloc_pd( m_device.get() );
+ if( pd_new_p == NULL )
+ m_pd.reset();
+ else
+ m_pd.reset( pd_new_p, ibv_dealloc_pd );
+ if (!m_pd) {
+ LOG(1, "Could not allocate protection domain ");
+ throw Exception("Could not allocate protection domain");
+ }
+ LOG(3, "Opened protection domain");
+
+ /**
+ * New notification functionality for HiCR
+ */
+ struct ibv_srq_init_attr srq_init_attr;
+ srq_init_attr.srq_context = NULL;
+ srq_init_attr.attr.max_wr = m_deviceAttr.max_srq_wr;
+ srq_init_attr.attr.max_sge = m_deviceAttr.max_srq_sge;
+ srq_init_attr.attr.srq_limit = 0;
+ m_srq.reset(ibv_create_srq(m_pd.get(), &srq_init_attr ),
+ ibv_destroy_srq);
+
+ m_cqLocal.reset(ibv_create_cq( m_device.get(), m_cqSize, NULL, NULL, 0),
+ ibv_destroy_cq);
+ if (!m_cqLocal) {
+ LOG(1, "Could not allocate completion queue with '"
+ << m_nprocs << " entries" );
+ throw Exception("Could not allocate completion queue");
+ }
+ m_cqRemote.reset(
+ ibv_create_cq( m_device.get(), m_cqSize * m_nprocs, NULL, NULL, 0),
+ ibv_destroy_cq);
+ if (!m_cqRemote) {
+ LOG(1, "Could not allocate completion queue with '"
+ << m_nprocs << " entries" );
+ throw Exception("Could not 
allocate completion queue"); + } + + LOG(3, "Allocated completion queue with " << m_nprocs << " entries."); + + // allocate dummy buffer + m_dummyBuffer.resize( 8 ); + struct ibv_mr * const ibv_reg_mr_new_p = ibv_reg_mr( + m_pd.get(), m_dummyBuffer.data(), m_dummyBuffer.size(), + IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE + ); + if( ibv_reg_mr_new_p == NULL ) + m_dummyMemReg.reset(); + else + m_dummyMemReg.reset( ibv_reg_mr_new_p, ibv_dereg_mr ); + if (!m_dummyMemReg) { + LOG(1, "Could not register memory region"); + throw Exception("Could not register memory region"); + } + + LOG(3, "Queue pairs have been successfully initialized"); + +} + +Zero :: ~Zero() +{ } + +inline void Zero :: tryIncrement(const Op op, const Phase phase, + const TagID tag) noexcept +{ + if (tag == INVALID_TAG) { + LOG(2, "Zero::tryIncrement called on invalid tag"); + return; + } + + switch (phase) { + case Phase::INIT: + // dynamically increase the capacity + // of registered tag arrays + // Somewhat arbitrarily I choose here to + // increase by factor 8 each time + if (m_tag_capacity <= tag) { + LOG(3, "Dynamically reallocated tags: " << tag << " -> " << (tag + 1) * 8); + resizeTagreg((tag + 1) * 8); + } + rcvdMsgCount[tag] = 0; + getMsgCount[tag] = 0; + m_recvInitMsgCount[tag] = 0; + m_getInitMsgCount[tag] = 0; + sentMsgCount[tag] = 0; + m_sendInitMsgCount[tag] = 0; + tagActive[tag] = true; + break; + case Phase::PRE: + if (op == Op::SEND) { + (void)m_numMsgs++; + (void)m_sendInitMsgCount[tag]++; + } + if (op == Op::RECV) { + (void)m_recvTotalInitMsgCount++; + (void)m_recvInitMsgCount[tag]++; + } + if (op == Op::GET) { + (void)m_recvTotalInitMsgCount++; + (void)m_getInitMsgCount[tag]++; + } + break; + case Phase::POST: + if (op == Op::RECV) { + (void)m_recvdMsgs++; + (void)rcvdMsgCount[tag]++; + } + if (op == Op::GET) { + (void)m_recvdMsgs++; + (void)getMsgCount[tag]++; + } + if (op == Op::SEND) { + (void)m_sentMsgs++; + (void)sentMsgCount[tag]++; + } + break; + } +} + +void Zero :: stageQPs( size_t maxMsgs ) +{ + // create the queue pairs + for ( size_t i = 0; i < static_cast(m_nprocs); ++i) { + struct ibv_qp_init_attr attr; + std::memset(&attr, 0, sizeof(attr)); + + attr.qp_type = IBV_QPT_RC; // we want reliable connection + attr.sq_sig_all = 0; // only wait for selected messages + attr.send_cq = m_cqLocal.get(); + attr.recv_cq = m_cqRemote.get(); + attr.srq = m_srq.get(); + attr.cap.max_send_wr = std::min(maxMsgs + m_minNrMsgs,m_maxSrs/4); + attr.cap.max_recv_wr = std::min(maxMsgs + m_minNrMsgs,m_maxSrs/4); + attr.cap.max_send_sge = 1; + attr.cap.max_recv_sge = 1; + + struct ibv_qp * const ibv_new_qp_p = ibv_create_qp( m_pd.get(), &attr ); + ASSERT(m_stagedQps.size() > i); + if( ibv_new_qp_p == NULL ) { + m_stagedQps[i].reset(); + } else { + m_stagedQps[i].reset( ibv_new_qp_p, ibv_destroy_qp ); + } + if (!m_stagedQps[i]) { + LOG( 1, "Could not create Infiniband Queue pair number " << i ); + throw std::bad_alloc(); + } + + LOG(3, "Created new Queue pair for " << m_pid << " -> " << i + << " with qp_num = " << ibv_new_qp_p->qp_num); + } +} + +void Zero :: doRemoteProgress() { + struct ibv_wc wcs[POLL_BATCH]; + struct ibv_recv_wr wr; + struct ibv_sge sg; + struct ibv_recv_wr *bad_wr; + sg.addr = (uint64_t) NULL; + sg.length = 0; + sg.lkey = 0; + wr.next = NULL; + wr.sg_list = &sg; + wr.num_sge = 0; + wr.wr_id = 66; + int pollResult, totalResults = 0; + do { + pollResult = ibv_poll_cq(m_cqRemote.get(), POLL_BATCH, wcs); + if (pollResult > 0) { + LOG(3, "Process " << m_pid << " signals: I received " << 
pollResult + << " remote messages in doRemoteProgress"); + } + else if (pollResult < 0) + { + LOG( 1, "Failed to poll IB completion queue" ); + throw Exception("Poll CQ failure"); + } + + for(int i = 0; i < pollResult; i++) { + if (wcs[i].status != IBV_WC_SUCCESS) { + LOG( 2, "Got bad completion status from IB message." + " status = 0x" << std::hex << wcs[i].status + << ", vendor syndrome = 0x" << std::hex + << wcs[i].vendor_err ); + } + else + { + LOG(3, "Process " << m_pid << " Recv wcs[" << i << "].src_qp = "<< wcs[i].src_qp); + LOG(3, "Process " << m_pid << " Recv wcs[" << i << "].slid = "<< wcs[i].slid); + LOG(3, "Process " << m_pid << " Recv wcs[" << i << "].wr_id = "<< wcs[i].wr_id); + LOG(3, "Process " << m_pid << " Recv wcs[" << i << "].imm_data = "<< wcs[i].imm_data); + + /** + * Here is a trick: + * The sender sends relatively generic LPF memslot ID. + * But for IB Verbs, we need to translate that into + * an IB Verbs slot via @getVerbID -- or there will be + * a mismatch when IB Verbs looks up the slot ID + */ + + // Note: Ignore compare-and-swap atomics! + if (wcs[i].opcode != IBV_WC_COMP_SWAP) { + TagID tag; + // This receive is from a PUT call + if (wcs[i].opcode == IBV_WC_RECV_RDMA_WITH_IMM) { + tag = wcs[i].imm_data; + tryIncrement(Op::RECV, Phase::POST, tag); + LOG(3, "Rank " << m_pid << " increments received message count to " << rcvdMsgCount[tag] << " for LPF slot " << tag); + } + } + ibv_post_srq_recv(m_srq.get(), &wr, &bad_wr); + } + } + if(pollResult > 0) totalResults += pollResult; + } while (pollResult == POLL_BATCH && totalResults < MAX_POLLING); +} + +void Zero :: reconnectQPs() +{ + ASSERT( m_stagedQps[0] ); + m_comm.barrier(); + + union ibv_gid myGid; + std::vector< uint32_t> localQpNums, remoteQpNums; + std::vector< uint16_t> lids; + std::vector< union ibv_gid > gids; + try { + // Exchange info about the queue pairs + if (m_gidIdx >= 0) { + if (ibv_query_gid(m_device.get(), m_ibPort, m_gidIdx, &myGid)) { + LOG(1, "Could not get GID of Infiniband device port " << m_ibPort); + throw Exception( "Could not get gid for IB port"); + } + LOG(3, "GID of Infiniband device was retrieved" ); + } + else { + std::memset( &myGid, 0, sizeof(myGid) ); + LOG(3, "GID of Infiniband device will not be used" ); + } + + localQpNums.resize(m_nprocs); + remoteQpNums.resize(m_nprocs); + lids.resize(m_nprocs); + gids.resize(m_nprocs); + + for ( int i = 0; i < m_nprocs; ++i) + localQpNums[i] = m_stagedQps[i]->qp_num; + } + catch(...) 
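+ // A failure on this process must not leave its peers blocked in the
+ // collectives below: vote 'true' so every process sees that a peer failed.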
+ { + m_comm.allreduceOr( true ); + throw; + } + if (m_comm.allreduceOr( false) ) + throw Exception("Peer failed to allocate memory or query device while setting-up QP"); + + m_comm.allToAll( localQpNums.data(), remoteQpNums.data() ); + m_comm.allgather( m_lid, lids.data() ); + m_comm.allgather( myGid, gids.data() ); + + LOG(3, "Connection initialisation data has been exchanged"); + + try { + // Bring QPs to INIT + for (int i = 0; i < m_nprocs; ++i ) { + struct ibv_qp_attr attr; + int flags; + + std::memset(&attr, 0, sizeof(attr)); + attr.qp_state = IBV_QPS_INIT; + attr.port_num = m_ibPort; + attr.pkey_index = 0; + attr.qp_access_flags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | + IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_ATOMIC; + flags = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS; + if ( ibv_modify_qp(m_stagedQps[i].get(), &attr, flags) ) { + LOG(1, "Cannot bring state of QP " << i << " to INIT"); + throw Exception("Failed to bring QP's state to Init" ); + } + + // post a dummy receive + + struct ibv_recv_wr rr; std::memset(&rr, 0, sizeof(rr)); + struct ibv_sge sge; std::memset(&sge, 0, sizeof(sge)); + sge.addr = reinterpret_cast(m_dummyBuffer.data()); + sge.length = m_dummyBuffer.size(); + sge.lkey = m_dummyMemReg->lkey; + rr.next = NULL; + rr.wr_id = 46; + rr.sg_list = &sge; + rr.num_sge = 1; + + // Bring QP to RTR + std::memset(&attr, 0, sizeof(attr)); + attr.qp_state = IBV_QPS_RTR; + attr.path_mtu = m_mtu; + attr.dest_qp_num = remoteQpNums[i]; + attr.rq_psn = 0; + attr.max_dest_rd_atomic = 1; + attr.min_rnr_timer = 0x12; + attr.ah_attr.is_global = 0; + attr.ah_attr.dlid = lids[i]; + attr.ah_attr.sl = 0; + attr.ah_attr.src_path_bits = 0; + attr.ah_attr.port_num = m_ibPort; + if (m_gidIdx >= 0) + { + attr.ah_attr.is_global = 1; + attr.ah_attr.port_num = 1; + memcpy(&attr.ah_attr.grh.dgid, &gids[i], 16); + attr.ah_attr.grh.flow_label = 0; + attr.ah_attr.grh.hop_limit = 1; + attr.ah_attr.grh.sgid_index = m_gidIdx; + attr.ah_attr.grh.traffic_class = 0; + } + flags = IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | + IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER; + + if (ibv_modify_qp(m_stagedQps[i].get(), &attr, flags)) { + LOG(1, "Cannot bring state of QP " << i << " to RTR" ); + throw Exception("Failed to bring QP's state to RTR" ); + } + + // Bring QP to RTS + std::memset(&attr, 0, sizeof(attr)); + attr.qp_state = IBV_QPS_RTS; + attr.timeout = 0x12; + attr.retry_cnt = 0;//7; + attr.rnr_retry = 0;//7; + attr.sq_psn = 0; + attr.max_rd_atomic = 1; + flags = IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | + IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC; + if( ibv_modify_qp(m_stagedQps[i].get(), &attr, flags)) { + LOG(1, "Cannot bring state of QP " << i << " to RTS" ); + throw Exception("Failed to bring QP's state to RTS" ); + } + + LOG(3, "Connected Queue pair for " << m_pid << " -> " << i ); + + } // for each peer + } + catch(...) 
{ + m_comm.allreduceOr( true ); + throw; + } + + if (m_comm.allreduceOr( false )) + throw Exception("Another peer failed to set-up Infiniband queue pairs"); + + LOG(3, "All staged queue pairs have been connected" ); + + m_connectedQps.swap( m_stagedQps ); + + LOG(3, "All old queue pairs have been removed"); + + m_comm.barrier(); +} + +void Zero :: resizeMemreg( size_t size ) +{ + if ( size > size_t(std::numeric_limits::max()) ) + { + LOG(2, "Could not expand memory register, because integer will overflow"); + throw Exception("Could not increase memory register"); + } + if ( int(size) > m_deviceAttr.max_mr ) { + LOG(2, "IB device only supports " << m_deviceAttr.max_mr + << " memory registrations, while " << size + << " are being requested" ); + throw std::bad_alloc() ; + } + + MemoryRegistration newMR = { nullptr, 0, 0, 0, m_pid}; + MemorySlot dflt; dflt.glob.resize( m_nprocs, newMR); + + m_memreg.reserve( size, dflt ); +} + +void Zero :: resizeMesgq( size_t size ) +{ + + m_cqSize = std::min(size,m_maxSrs/4); + size_t remote_size = std::min(m_cqSize*m_nprocs,m_maxSrs/4); + if (m_cqLocal) { + ibv_resize_cq(m_cqLocal.get(), m_cqSize); + } + if(remote_size >= m_postCount){ + if (m_cqRemote) { + ibv_resize_cq(m_cqRemote.get(), remote_size); + } + } + stageQPs(m_cqSize); + reconnectQPs(); + if(remote_size >= m_postCount){ + if (m_srq) { + struct ibv_recv_wr wr; + struct ibv_sge sg; + struct ibv_recv_wr *bad_wr; + sg.addr = (uint64_t) NULL; + sg.length = 0; + sg.lkey = 0; + wr.next = NULL; + wr.sg_list = &sg; + wr.num_sge = 0; + wr.wr_id = m_pid; + for(int i = m_postCount; i < (int)remote_size; ++i){ + ibv_post_srq_recv(m_srq.get(), &wr, &bad_wr); + m_postCount++; + } + } + } + LOG(4, "Message queue has been reallocated to size " << size ); +} + +void Zero :: resizeTagreg( size_t size ) +{ + if( m_tag_capacity >= size ) { + LOG(4, "Tag queue: smaller capacity required, request ignored" ); + return; + } + + ASSERT( size > m_tag_capacity ); + + // reserve new capacity + m_free_tags.reserve( size ); + m_recvInitMsgCount.resize(size, 0); + m_getInitMsgCount.resize(size, 0); + m_sendInitMsgCount.resize(size, 0); + rcvdMsgCount.resize(size, 0); + getMsgCount.resize(size, 0); + sentMsgCount.resize(size, 0); + tagActive.resize(size, 0); + + // if ok, push new tag IDs to free tags + for( size_t k = m_tag_capacity; k < size; ++k ) { + m_free_tags.push_back( static_cast(k) ); + } + + // correct tag capacity + m_tag_capacity = size; + + LOG(4, "Tag queue: new capacity in effect ( " << size << " )"); +} + +Zero :: SlotID Zero :: regLocal( void * addr, size_t size ) +{ + ASSERT( size <= m_maxRegSize ); + + MemorySlot slot; + if ( size > 0) { + LOG(4, "Registering locally memory area at " << addr << " of size " << size ); + struct ibv_mr * const ibv_mr_new_p = ibv_reg_mr( + m_pd.get(), addr, size, + IBV_ACCESS_REMOTE_READ | IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | + IBV_ACCESS_REMOTE_ATOMIC + ); + if( ibv_mr_new_p == NULL ) + slot.mr.reset(); + else + slot.mr.reset( ibv_mr_new_p, ibv_dereg_mr ); + if (!slot.mr) { + LOG(1, "Could not register memory area at " + << addr << " of size " << size << " with IB device"); + throw Exception("Could not register memory area"); + } + } + MemoryRegistration local((char *) addr, size, size?slot.mr->lkey:0, + size?slot.mr->rkey:0, m_pid); + + SlotID id = m_memreg.addLocalReg( slot ); + + m_memreg.update( id ).glob.resize( m_nprocs ); + m_memreg.update( id ).glob[m_pid] = local; + LOG(4, "Memory area " << addr << " of size " << size + << " has been locally registered. 
Slot = " << id );
+    return id;
+}
+
+Zero :: SlotID Zero :: regGlobal( void * addr, size_t size )
+{
+    ASSERT( size <= m_maxRegSize );
+
+    MemorySlot slot;
+    if ( size > 0 ) {
+        LOG(4, "Registering globally memory area at " << addr << " of size " << size );
+        struct ibv_mr * const ibv_mr_new_p = ibv_reg_mr(
+            m_pd.get(), addr, size,
+            IBV_ACCESS_REMOTE_READ | IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE |
+            IBV_ACCESS_REMOTE_ATOMIC
+        );
+        if( ibv_mr_new_p == NULL )
+            slot.mr.reset();
+        else
+            slot.mr.reset( ibv_mr_new_p, ibv_dereg_mr );
+        if (!slot.mr) {
+            LOG(1, "Could not register memory area at "
+                << addr << " of size " << size << " with IB device");
+            // signal the failure to all peers; this must pair with the
+            // allreduceOr on the non-failing path below
+            m_comm.allreduceOr(true);
+            throw Exception("Could not register memory area");
+        }
+    }
+    if (m_comm.allreduceOr(false))
+        throw Exception("Another process could not register memory area");
+
+    SlotID id = m_memreg.addGlobalReg( slot );
+    MemorySlot & ref = m_memreg.update(id);
+    // exchange memory registration info globally
+    ref.glob.resize(m_nprocs);
+
+    MemoryRegistration local((char *) addr, size, size?slot.mr->lkey:0,
+            size?slot.mr->rkey:0, m_pid);
+    LOG(4, "All-gathering memory register data" );
+
+    m_comm.allgather( local, ref.glob.data() );
+    LOG(4, "Memory area " << addr << " of size " << size
+            << " has been globally registered. Slot = " << id );
+    return id;
+}
+
+Zero :: TagID Zero :: regTag() {
+    if( m_free_tags.size() == 0 ) {
+        throw Exception("No free tags available");
+    }
+    const TagID ret = m_free_tags.back();
+    // Initialize a new tag
+    tryIncrement(Op::SEND, Phase::INIT, ret);
+    m_free_tags.pop_back();
+    LOG(4, "Tag " << ret << " has been allocated");
+    return ret;
+}
+
+void Zero :: dereg( SlotID id )
+{
+    m_memreg.removeReg( id );
+    LOG(4, "Memory area of slot " << id << " has been deregistered");
+}
+
+void Zero :: deregTag( TagID id )
+{
+    ASSERT( m_free_tags.size() < m_tag_capacity );
+    m_free_tags.push_back( id );
+    tagActive[id] = false;
+    m_recvInitMsgCount[id] = 0;
+    m_getInitMsgCount[id] = 0;
+    m_sendInitMsgCount[id] = 0;
+    rcvdMsgCount[id] = 0;
+    sentMsgCount[id] = 0;
+    LOG(4, "Tag " << id << " has been released");
+}
+
+void Zero :: put( SlotID srcSlot, size_t srcOffset,
+        int dstPid, SlotID dstSlot, size_t dstOffset, size_t size, lpf_msg_attr_t attr)
+{
+    const MemorySlot & src = m_memreg.lookup( srcSlot );
+    const MemorySlot & dst = m_memreg.lookup( dstSlot );
+    const uint32_t tag = attr == NULL
+        ? INVALID_TAG
+        : * static_cast< uint32_t * >(attr);
+
+    ASSERT( src.mr );
+
+    int numMsgs = size/m_maxMsgSize + (size % m_maxMsgSize > 0);
+    //+1 if last msg size < m_maxMsgSize
+    if (size == 0) numMsgs = 1;
+
+    struct ibv_sge sges[numMsgs];
+    struct ibv_send_wr srs[numMsgs];
+    struct ibv_sge *sge;
+    struct ibv_send_wr *sr;
+    for (int i=0; i < numMsgs; i++) {
+        sge = &sges[i]; std::memset(sge, 0, sizeof(ibv_sge));
+        sr = &srs[i]; std::memset(sr, 0, sizeof(ibv_send_wr));
+        const char * localAddr
+            = static_cast< const char * >(src.glob[m_pid]._addr) + srcOffset;
+        const char * remoteAddr
+            = static_cast< const char * >(dst.glob[dstPid]._addr) + dstOffset;
+
+        sge->addr = reinterpret_cast< uint64_t >( localAddr );
+        sge->length = std::min(size, m_maxMsgSize );
+        sge->lkey = src.mr->lkey;
+        sges[i] = *sge;
+
+        bool lastMsg = (i == numMsgs-1);
+        sr->next = lastMsg ? NULL : &srs[ i+1];
+        // since a reliable connection keeps packets in order,
+        // we only need a signal from the last message in the queue
+        sr->send_flags = lastMsg ? IBV_SEND_SIGNALED : 0;
+        sr->opcode = lastMsg? 
IBV_WR_RDMA_WRITE_WITH_IMM : IBV_WR_RDMA_WRITE; + // use wr_id to store the comm tag (passed as attr) + sr->wr_id = tag; + // use wr_id to store the comm tag (passed as attr) + sr->imm_data = tag; + + sr->sg_list = &sges[i]; + sr->num_sge = 1; + sr->wr.rdma.remote_addr = reinterpret_cast( remoteAddr ); + sr->wr.rdma.rkey = dst.glob[dstPid]._rkey; + + srs[i] = *sr; + size -= sge->length; + srcOffset += sge->length; + dstOffset += sge->length; + + LOG(4, "PID " << m_pid << ": Enqueued put message of " << sge->length + << " bytes to " << dstPid << " on slot" << dstSlot << " and tag " << attr); + } + struct ibv_send_wr *bad_wr = NULL; + // srs[0] should be sufficient because the rest of srs are on a chain + if (int err = ibv_post_send(m_connectedQps[dstPid].get(), &srs[0], &bad_wr )) + { + LOG(1, "Error while posting RDMA requests: " << std::strerror(err) ); + throw Exception("Error while posting RDMA requests"); + } + + tryIncrement(Op::SEND, Phase::PRE, tag); +} + +void Zero :: get( int srcPid, SlotID srcSlot, size_t srcOffset, + SlotID dstSlot, size_t dstOffset, size_t size, lpf_msg_attr_t attr) +{ + const MemorySlot & src = m_memreg.lookup( srcSlot ); + const MemorySlot & dst = m_memreg.lookup( dstSlot ); + const uint32_t tag = attr == NULL + ? INVALID_TAG + : * static_cast< uint32_t * >(attr); + + ASSERT( dst.mr ); + + int numMsgs = size/m_maxMsgSize + (size % m_maxMsgSize > 0); + //+1 if last msg size < m_maxMsgSize + + struct ibv_sge sges[numMsgs+1]; + struct ibv_send_wr srs[numMsgs+1]; + struct ibv_sge *sge; + struct ibv_send_wr *sr; + + + for(int i = 0; i< numMsgs; i++){ + sge = &sges[i]; std::memset(sge, 0, sizeof(ibv_sge)); + sr = &srs[i]; std::memset(sr, 0, sizeof(ibv_send_wr)); + + const char * localAddr + = static_cast(dst.glob[m_pid]._addr) + dstOffset; + const char * remoteAddr + = static_cast(src.glob[srcPid]._addr) + srcOffset; + + sge->addr = reinterpret_cast( localAddr ); + sge->length = std::min(size, m_maxMsgSize ); + sge->lkey = dst.mr->lkey; + sges[i] = *sge; + LOG(4, "PID " << m_pid << ": Enqueued get message of " << sge->length + << " bytes from " << srcPid << " on slot" << srcSlot ); + + bool lastMsg = (i == numMsgs-1); + sr->next = lastMsg ? NULL : &srs[ i+1]; + sr->send_flags = lastMsg ? 
IBV_SEND_SIGNALED : 0; + + sr->sg_list = &sges[i]; + sr->num_sge = 1; + sr->opcode = IBV_WR_RDMA_READ; + sr->wr.rdma.remote_addr = reinterpret_cast( remoteAddr ); + sr->wr.rdma.rkey = src.glob[srcPid]._rkey; + // This logic is reversed compared to ::put + sr->wr_id = tag; // <= This enables virtual tag matching + sr->imm_data = 0; // This is irrelevant as we don't send _WITH_IMM + srs[i] = *sr; + size -= sge->length; + srcOffset += sge->length; + dstOffset += sge->length; + } + + struct ibv_send_wr *bad_wr = NULL; + if (int err = ibv_post_send(m_connectedQps[srcPid].get(), &srs[0], &bad_wr )) + { + + LOG(1, "Error while posting RDMA requests: " << std::strerror(err) ); + if (err == ENOMEM) { + LOG(1, "Specific error code: ENOMEM (send queue is full or no resources)"); + } + throw Exception("Error while posting RDMA requests"); + } + tryIncrement(Op::GET, Phase::PRE, tag); + +} + +void Zero :: get_rcvd_msg_count(size_t &rcvd_msgs, const struct SyncAttr * attr) + noexcept +{ + if( attr == nullptr || attr->tag == INVALID_TAG ) { + rcvd_msgs = m_recvdMsgs; + } else { + rcvd_msgs = rcvdMsgCount[attr->tag] + getMsgCount[attr->tag]; + } +} + +void Zero :: get_sent_msg_count(size_t &sent_msgs, const struct SyncAttr * attr) + noexcept +{ + if( attr == nullptr || attr->tag == INVALID_TAG ) { + sent_msgs = m_sentMsgs; + } else { + sent_msgs = sentMsgCount[attr->tag]; + } +} + +void Zero :: createNewSyncAttr(struct SyncAttr * * attr) { + *attr = new struct SyncAttr; + (*attr)->tag = std::numeric_limits::max(); + (*attr)->expected_sent = 0; + (*attr)->expected_rcvd = 0; +} + +void Zero :: doLocalProgress(int& error) { + + error = 0; + LOG(1, "Polling for messages" ); + struct ibv_wc wcs[POLL_BATCH]; + int pollResult = ibv_poll_cq(m_cqLocal.get(), POLL_BATCH, wcs); + if ( pollResult > 0) { + LOG(4, "Process " << m_pid << ": Received " << pollResult << " acknowledgements"); + + for (int i = 0; i < pollResult ; ++i) { + if (wcs[i].status != IBV_WC_SUCCESS) + { + LOG( 2, "Got bad completion status from IB message." + " status = 0x" << std::hex << wcs[i].status + << ", vendor syndrome = 0x" << std::hex + << wcs[i].vendor_err ); + const char * status_descr; + status_descr = ibv_wc_status_str(wcs[i].status); + LOG( 2, "Process " << m_pid << ": The work completion status string: " << status_descr); + error = 1; + } + else { + LOG(4, "Process " << m_pid << " Send wcs[" << i << "].src_qp = "<< wcs[i].src_qp); + LOG(4, "Process " << m_pid << " Send wcs[" << i << "].slid = "<< wcs[i].slid); + LOG(4, "Process " << m_pid << " Send wcs[" << i << "].wr_id = "<< wcs[i].wr_id); + LOG(4, "Process " << m_pid << " Send wcs[" << i << "].imm_data = "<< wcs[i].imm_data); + } + + TagID slot = wcs[i].wr_id; + // Ignore compare-and-swap atomics! 
+            if (wcs[i].opcode != IBV_WC_COMP_SWAP) {
+                // This is a GET call completing
+                if (wcs[i].opcode == IBV_WC_RDMA_READ) {
+                    tryIncrement(Op::GET, Phase::POST, slot);
+                    LOG(4, "Rank " << m_pid << " with GET, increments getMsgCount to "
+                            << getMsgCount[slot] << " for LPF slot " << slot);
+                }
+                // This is a PUT call completing
+                if (wcs[i].opcode == IBV_WC_RDMA_WRITE) {
+                    tryIncrement(Op::SEND, Phase::POST, slot);
+                    LOG(4, "Rank " << m_pid << " with SEND, increments sentMsgCount to "
+                            << sentMsgCount[slot] << " for LPF slot " << slot);
+                }
+
+            }
+        }
+    }
+    else if (pollResult < 0)
+    {
+        LOG( 1, "Failed to poll IB completion queue" );
+        throw Exception("Poll CQ failure");
+    }
+}
+
+void Zero :: flushReceived() {
+    doRemoteProgress();
+}
+
+void Zero :: flushSent()
+{
+    int isError = 0;
+
+    bool sendsComplete;
+    do {
+        sendsComplete = true;
+        for (size_t i = 0; i < m_tag_capacity; i++) {
+            if (m_sendInitMsgCount[i] > sentMsgCount[i] || m_getInitMsgCount[i] > getMsgCount[i]) {
+                sendsComplete = false;
+                doLocalProgress(isError);
+                if (isError) {
+                    LOG(1, "Error in doLocalProgress. Most likely issue is "
+                        << "that the receiver is not calling ibv_post_srq_recv!\n");
+                    std::abort();
+                }
+            }
+        }
+    } while (!sendsComplete);
+
+}
+
+void Zero :: countingSyncPerSlot(const TagID tag, const size_t expectedSent,
+        const size_t expectedRecvd)
+{
+    bool sentOK = false;
+    bool recvdOK = false;
+    if (expectedSent == 0) { sentOK = true; }
+    if (expectedRecvd == 0) { recvdOK = true; }
+    int error;
+
+    // This is semantically equivalent to a non-blocking test call,
+    // triggering progress on the network card without expecting anything
+    // from a particular tag
+    if (tag == INVALID_TAG && sentOK && recvdOK) {
+        doLocalProgress(error);
+        if (error) {
+            LOG(1, "Error in doLocalProgress");
+            throw std::runtime_error("Error in doLocalProgress");
+        }
+        // also drive progress on incoming (remote) traffic
+        doRemoteProgress();
+    }
+
+    // This is a blocking call on a particular tag with some expected
+    // sent / received messages
+    else {
+        if (tagActive[tag]) {
+            do {
+                doLocalProgress(error);
+                if (error) {
+                    LOG(1, "Error in doLocalProgress");
+                    throw std::runtime_error("Error in doLocalProgress");
+                }
+                // also drive progress on incoming (remote) traffic
+                doRemoteProgress();
+
+                /*
+                 * Either nothing is expected for a direction (sentOK/recvdOK
+                 * already true), or the completed message counts must have
+                 * reached the expectations.
+                 */
+                sentOK = (sentOK || sentMsgCount[tag] >= expectedSent);
+                // We can receive messages passively (from remote puts) and actively (from our gets)
+                recvdOK = (recvdOK || (rcvdMsgCount[tag] + getMsgCount[tag]) >= expectedRecvd);
+                LOG(4, "PID: " << m_pid << " rcvdMsgCount[" << tag << "] = " << rcvdMsgCount[tag]
+                        << " expectedRecvd = " << expectedRecvd
+                        << " getMsgCount[" << tag << "] = " << getMsgCount[tag]
+                        << " sentMsgCount[" << tag << "] = " << sentMsgCount[tag]
+                        << " expectedSent = " << expectedSent);
+            } while (!(sentOK && recvdOK));
+        }
+    }
+}
+
+void Zero :: syncPerTag(TagID tag) {
+    int error;
+    // this barrier ensures m_recvInitMsgCount is accurate (TBC)
+    m_comm.barrier();
+    do {
+        doLocalProgress(error);
+        if (error) {
+            LOG(1, "Error in doLocalProgress");
+            throw std::runtime_error("Error in doLocalProgress");
+        }
+        doRemoteProgress();
+    }
+    while ((rcvdMsgCount.at(tag) < m_recvInitMsgCount.at(tag)) ||
+            (sentMsgCount.at(tag) < m_sendInitMsgCount.at(tag)));
+    // this barrier ensures local buffers remain locked until remote uses are
+    // guaranteed complete. 
TODO FIXME: an acknowledgement mechanism would + // make this barrier unnecessary. + m_comm.barrier(); +} + +void Zero :: sync(bool resized,const struct SyncAttr * attr) +{ + const bool defaultSync = (attr == nullptr) ; + if (defaultSync) + { + LOG(4, "Process " << m_pid << " going for default sync (uses barrier)"); + (void) resized; + + // flush send queues + flushSent(); + // flush receive queues + flushReceived(); + + m_comm.barrier(); + + // done + return; + } + + ASSERT(attr != NULL); + + const bool tagSync = attr->expected_sent == 0 && attr->expected_rcvd == 0 + && attr->tag != INVALID_TAG; + if (tagSync) + { + LOG(4, "Process " << m_pid << " going for syncPerTag (uses barrier)"); + syncPerTag(attr->tag); + return; + } + + LOG(4, "Process " << m_pid << " going for countingSync (no barrier!)"); + countingSyncPerSlot(attr->tag,attr->expected_sent,attr->expected_rcvd); +} + + +} } diff --git a/src/MPI/zero.hpp b/src/MPI/zero.hpp new file mode 100644 index 00000000..1885eba9 --- /dev/null +++ b/src/MPI/zero.hpp @@ -0,0 +1,276 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LPF_CORE_MPI_ZERO_HPP +#define LPF_CORE_MPI_ZERO_HPP + +#include +#include +#include +#include +#if __cplusplus >= 201103L + #include +#else + #include +#endif + +#include + +#include "linkage.hpp" +#include "sparseset.hpp" +#include "memreg.hpp" +#include "lpf/core.h" + +namespace lpf { + +class Communication; + +namespace mpi { + +#if __cplusplus >= 201103L +using std::shared_ptr; +#else +using std::tr1::shared_ptr; +#endif + +class MemoryRegistration { + public: + char * _addr; + size_t _size; + uint32_t _lkey; + uint32_t _rkey; + int _pid; + MemoryRegistration( + char * addr, size_t size, + uint32_t lkey, uint32_t rkey, + int pid + ) : _addr(addr), _size(size), _lkey(lkey), _rkey(rkey), _pid(pid) + {} + MemoryRegistration() : + _addr(nullptr), _size(0), + _lkey(0), _rkey(0), _pid(-1) + {} + size_t serialize(char ** buf); + static MemoryRegistration * deserialize(char * buf); +}; + +class _LPFLIB_LOCAL Zero +{ + +public: + + typedef size_t SlotID; + typedef uint32_t TagID; + + static constexpr TagID INVALID_TAG = std::numeric_limits::max(); + + struct Exception; + + struct SyncAttr { + TagID tag; + size_t expected_sent; + size_t expected_rcvd; + }; + + explicit Zero( Communication & ); + ~Zero(); + + void resizeMemreg( size_t size ); + void resizeMesgq( size_t size ); + void resizeTagreg( size_t size ); + + SlotID regLocal( void * addr, size_t size ); + SlotID regGlobal( void * addr, size_t size ); + TagID regTag(); + + void dereg( SlotID id ); + void deregTag( TagID id ); + + size_t getMaxMsgSize() const { + return m_maxMsgSize; + } + + void put( SlotID srcSlot, size_t srcOffset, + int dstPid, SlotID dstSlot, size_t dstOffset, size_t size, lpf_msg_attr_t attr ); + + void get( int srcPid, SlotID srcSlot, size_t srcOffset, + SlotID dstSlot, size_t dstOffset, size_t size, lpf_msg_attr_t attr ); + + void flushSent(); + + void 
flushReceived();
+
+    void doRemoteProgress();
+
+    void countingSyncPerSlot(const TagID tag, const size_t sent,
+            const size_t recvd);
+
+    /**
+     * syncPerTag only guarantees that all already scheduled sends (via put)
+     * and receives (via get) associated with a tag have completed. It does
+     * not guarantee that operations which have not yet been scheduled will
+     * complete (e.g. it does not wait for data that a remote process may
+     * still put into local memory, since that operation is scheduled
+     * one-sidedly by the remote process).
+     */
+    void syncPerTag(TagID tag);
+
+    // Do the communication and synchronize
+    // 'Reconnect' must be a globally replicated value
+    void sync(bool reconnect, const struct SyncAttr * attr);
+
+    void get_rcvd_msg_count(size_t &rcvd_msgs,
+            const struct SyncAttr * attr) noexcept;
+    void get_sent_msg_count(size_t &sent_msgs,
+            const struct SyncAttr * attr) noexcept;
+
+    void createNewSyncAttr(struct SyncAttr * * attr);
+
+    inline void destroySyncAttr(struct SyncAttr * attr)
+    {
+        delete attr;
+    }
+
+    inline TagID getTag(const struct SyncAttr &attr) noexcept
+    {
+        return attr.tag;
+    }
+
+    inline void setTag(const TagID tag, struct SyncAttr &attr) noexcept
+    {
+        attr.tag = tag;
+    }
+
+    inline void setZCAttr(size_t sent, size_t rcvd, struct SyncAttr &attr)
+        noexcept
+    {
+        attr.expected_sent = sent;
+        attr.expected_rcvd = rcvd;
+    }
+
+    inline void getZCAttr(const struct SyncAttr &attr,
+            size_t &sent, size_t &rcvd) noexcept
+    {
+        sent = attr.expected_sent;
+        rcvd = attr.expected_rcvd;
+    }
+
+protected:
+
+    typedef enum Op {
+        SEND,
+        RECV,
+        GET
+    } Op;
+
+    typedef enum Phase {
+        INIT,
+        PRE,
+        POST
+    } Phase;
+
+    Zero & operator=(const Zero & ); // assignment prohibited
+    Zero( const Zero & );            // copying prohibited
+
+    void stageQPs(size_t maxMsgs );
+    void reconnectQPs();
+
+    void doProgress();
+    void tryIncrement(const Op op, const Phase phase, const TagID slot)
+        noexcept;
+
+    void doLocalProgress(int& error);
+
+    struct MemorySlot {
+        shared_ptr< struct ibv_mr > mr;          // verbs structure
+        std::vector< MemoryRegistration > glob;  // array for global registrations
+    };
+
+    int    m_pid;           // local process ID
+    int    m_nprocs;        // number of processes
+    int    m_ibPort;        // local IB port to work with
+    int    m_gidIdx;
+    size_t m_maxRegSize;
+    size_t m_maxMsgSize;
+    size_t m_cqSize;
+    size_t m_minNrMsgs;
+    size_t m_maxSrs;        // maximum number of send requests per QP
+    size_t m_postCount;
+    size_t m_recvCount;
+    size_t m_tag_capacity;
+
+    shared_ptr< struct ibv_context > m_device;   // device handle
+    shared_ptr< struct ibv_pd > m_pd;            // protection domain
+    shared_ptr< struct ibv_cq > m_cq;            // completion queue
+    shared_ptr< struct ibv_cq > m_cqLocal;       // completion queue for locally initiated requests
+    shared_ptr< struct ibv_cq > m_cqRemote;      // completion queue for incoming (remote) writes
+    shared_ptr< struct ibv_srq > m_srq;          // shared receive queue
+    shared_ptr< struct ibv_mr > m_dummyMemReg;   // registration of dummy
+                                                 // buffer
+    std::atomic_size_t m_numMsgs;
+    std::atomic_size_t m_recvTotalInitMsgCount;
+    std::atomic_size_t m_sentMsgs;
+    std::atomic_size_t m_recvdMsgs;
+
+    uint16_t m_lid;         // LID of the IB port
+
+    Communication & m_comm;
+
+    std::string m_devName;  // IB device name
+
+    ibv_mtu m_mtu;
+
+    struct ibv_device_attr m_deviceAttr;
+
+    std::vector< TagID > m_free_tags;
+    std::vector< size_t > m_recvInitMsgCount;
+    std::vector< size_t > m_getInitMsgCount;
+    std::vector< size_t > m_sendInitMsgCount;
+
+    // Disconnected queue pairs
+    std::vector< shared_ptr< struct ibv_qp > > m_stagedQps;
+
+    // Connected queue pairs
+    std::vector< shared_ptr< struct ibv_qp > > m_connectedQps;
+
+    std::vector< struct ibv_send_wr > 
m_srs; // array of send requests + std::vector< size_t > m_srsHeads; // head of send queue per + // peer + std::vector< size_t > m_nMsgsPerPeer; // number of messages per + // peer + std::vector< pid_t > m_peerList; + + std::vector< struct ibv_sge > m_sges; // array of scatter/gather + // entries + std::vector< struct ibv_wc > m_wcs; // array of work completions + std::vector< char > m_dummyBuffer; // dummy receive buffer + + std::vector rcvdMsgCount; + std::vector sentMsgCount; + std::vector getMsgCount; + std::vector tagActive; + + SparseSet< pid_t > m_activePeers; + + CombinedMemoryRegister< MemorySlot > m_memreg; + +}; + + +} } + + +#endif diff --git a/src/MPI/zero.t.cpp b/src/MPI/zero.t.cpp new file mode 100644 index 00000000..81dfbd8b --- /dev/null +++ b/src/MPI/zero.t.cpp @@ -0,0 +1,324 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "zero.hpp" +#include "assert.hpp" +#include "mpilib.hpp" + +#include +#include + +using namespace lpf::mpi; + +extern "C" const int LPF_MPI_AUTO_INITIALIZE=0; + + +/** + * \pre P >= 1 + * \pre P <= 2 + */ +class ZeroTests : public testing::Test { + + protected: + + static void SetUpTestSuite() { + + MPI_Init(NULL, NULL); + Lib::instance(); + comm = new Comm(); + *comm = Lib::instance().world(); + comm->barrier(); + verbs = new Zero( *comm ); + } + + static void TearDownTestSuite() { + delete verbs; + verbs = nullptr; + delete comm; + comm = nullptr; + MPI_Finalize(); + } + + static Comm *comm; + static Zero *verbs; +}; + +lpf::mpi::Comm * ZeroTests::comm = nullptr; +Zero * ZeroTests::verbs = nullptr; + + +TEST_F( ZeroTests, init ) +{ + + comm->barrier(); +} + + +TEST_F( ZeroTests, resizeMemreg ) +{ + + verbs->resizeMemreg( 2 ); + + comm->barrier(); +} + + +TEST_F( ZeroTests, resizeMesgq ) +{ + + verbs->resizeMesgq( 2 ); + + comm->barrier(); +} + +TEST_F( ZeroTests, regVars ) +{ + + + char buf1[30] = "Hi"; + char buf2[30] = "Boe"; + + verbs->resizeMemreg( 2 ); + + Zero::SlotID b1 = verbs->regLocal( buf1, sizeof(buf1) ); + Zero::SlotID b2 = verbs->regGlobal( buf2, sizeof(buf2) ); + + comm->barrier(); + verbs->dereg(b1); + verbs->dereg(b2); +} + + +TEST_F( ZeroTests, put ) +{ + + char buf1[30] = "Hi"; + char buf2[30] = "Boe"; + + verbs->resizeMemreg( 2 ); + verbs->resizeMesgq( 1 ); + + Zero::SlotID b1 = verbs->regLocal( buf1, sizeof(buf1) ); + Zero::SlotID b2 = verbs->regGlobal( buf2, sizeof(buf2) ); + + comm->barrier(); + + verbs->put( b1, 0, (comm->pid() + 1)%comm->nprocs(), b2, 0, sizeof(buf1)); + + verbs->sync(true, nullptr); + EXPECT_EQ( "Hi", std::string(buf1) ); + EXPECT_EQ( "Hi", std::string(buf2) ); + verbs->dereg(b1); + verbs->dereg(b2); +} + + +TEST_F( ZeroTests, get ) +{ + + char buf1[30] = "Hoi"; + char buf2[30] = "Vreemd"; + + verbs->resizeMemreg( 2 ); + verbs->resizeMesgq( 1 ); + + Zero::SlotID b1 = verbs->regLocal( buf1, sizeof(buf1) ); + Zero::SlotID b2 = verbs->regGlobal( buf2, sizeof(buf2) ); + + comm->barrier(); + + verbs->get( (comm->pid() + 
1)%comm->nprocs(), b2, 0, + b1, 0, sizeof(buf2)); + + verbs->sync(true, nullptr); + EXPECT_EQ( "Vreemd", std::string(buf1) ); + EXPECT_EQ( "Vreemd", std::string(buf2) ); + verbs->dereg(b1); + verbs->dereg(b2); +} + + +TEST_F( ZeroTests, putAllToAll ) +{ + int nprocs = comm->nprocs(); + int pid = comm->pid(); + + const int H = 2.5 * nprocs; + + std::vector< int > a(H); + std::vector< int > b(H); + + for (int i = 0; i < H; ++i) { + a[i] = i * nprocs + pid ; + b[i] = nprocs*nprocs - ( i * nprocs + pid); + } + + verbs->resizeMemreg( 2 ); + verbs->resizeMesgq( H ); + + Zero::SlotID a1 = verbs->regGlobal( a.data(), sizeof(int)*a.size()); + Zero::SlotID b1 = verbs->regGlobal( b.data(), sizeof(int)*b.size()); + + comm->barrier(); + + for (int i = 0; i < H; ++i) { + int dstPid = (pid + i ) % nprocs; + verbs->put( a1, sizeof(int)*i, + dstPid, b1, sizeof(int)*i, sizeof(int)); + } + + verbs->sync(true, nullptr); + + for (int i = 0; i < H; ++i) { + int srcPid = (nprocs + pid - (i%nprocs)) % nprocs; + EXPECT_EQ( i*nprocs + pid, a[i] ) ; + EXPECT_EQ( i*nprocs + srcPid, b[i] ); + } + verbs->dereg(a1); + verbs->dereg(b1); + +} + +TEST_F( ZeroTests, getAllToAll ) +{ + int nprocs = comm->nprocs(); + int pid = comm->pid(); + + const int H = 100.3 * nprocs; + + std::vector< int > a(H), a2(H); + std::vector< int > b(H), b2(H); + + for (int i = 0; i < H; ++i) { + a[i] = i * nprocs + pid ; + a2[i] = a[i]; + b[i] = nprocs*nprocs - ( i * nprocs + pid); + b2[i] = i*nprocs+ (nprocs + pid + i) % nprocs; + } + + verbs->resizeMemreg( 2 ); + verbs->resizeMesgq( H ); + + Zero::SlotID a1 = verbs->regGlobal( a.data(), sizeof(int)*a.size()); + Zero::SlotID b1 = verbs->regGlobal( b.data(), sizeof(int)*b.size()); + + comm->barrier(); + + for (int i = 0; i < H; ++i) { + int srcPid = (pid + i) % nprocs; + verbs->get( srcPid, a1, sizeof(int)*i, + b1, sizeof(int)*i, sizeof(int)); + } + + verbs->sync(true, nullptr); + + EXPECT_EQ(a, a2); + EXPECT_EQ(b, b2); + + verbs->dereg(a1); + verbs->dereg(b1); + +} + + +TEST_F( ZeroTests, putHuge ) +{ + std::vector hugeMsg(3*verbs->getMaxMsgSize()); + std::vector< char > hugeBuf(3*verbs->getMaxMsgSize()); + LOG(4, "Allocating putHuge with vector size: " << hugeMsg.size()); + + for ( size_t i = 0; i < hugeMsg.size() ; ++i) + hugeMsg[i] = char( i ); + + verbs->resizeMemreg( 2 ); + verbs->resizeMesgq( 1 ); + + Zero::SlotID b1 = verbs->regLocal( hugeMsg.data(), hugeMsg.size() ); + Zero::SlotID b2 = verbs->regGlobal( hugeBuf.data(), hugeBuf.size() ); + + comm->barrier(); + + verbs->put( b1, 0, (comm->pid() + 1)%comm->nprocs(), b2, 0, hugeMsg.size() * sizeof(char) ); + + verbs->sync(true, nullptr); + + EXPECT_EQ( hugeMsg, hugeBuf ); + + verbs->dereg(b1); + verbs->dereg(b2); +} + +TEST_F( ZeroTests, getHuge ) +{ + + std::vector hugeMsg(3*verbs->getMaxMsgSize()); + std::vector< char > hugeBuf(3*verbs->getMaxMsgSize()); + LOG(4, "Allocating getHuge with vector size: " << hugeMsg.size()); + + for ( size_t i = 0; i < hugeMsg.size() ; ++i) + hugeMsg[i] = char(i); + + verbs->resizeMemreg( 2 ); + verbs->resizeMesgq( 1 ); + + Zero::SlotID b1 = verbs->regLocal( hugeMsg.data(), hugeMsg.size() ); + Zero::SlotID b2 = verbs->regGlobal( hugeBuf.data(), hugeBuf.size() ); + + comm->barrier(); + + verbs->get( (comm->pid() + 1)%comm->nprocs(), b2, 0, b1, 0, hugeMsg.size() * sizeof(char)); + + verbs->sync(true, nullptr); + + EXPECT_EQ(hugeMsg, hugeBuf); + + verbs->dereg(b1); + verbs->dereg(b2); +} + +TEST_F( ZeroTests, manyPuts ) +{ + + const unsigned N = 5000; + std::vector< unsigned char > buf1( N ); + std::vector< 
unsigned char > buf2( N ); + for (unsigned int i = 0 ; i < N; ++ i) + buf1[i] = i + comm->pid() ; + + verbs->resizeMemreg( 2 ); + verbs->resizeMesgq( N ); + + Zero::SlotID b1 = verbs->regLocal( buf1.data(), buf1.size() ); + Zero::SlotID b2 = verbs->regGlobal( buf2.data(), buf1.size() ); + + comm->barrier(); + + for ( unsigned i = 0 ; i < N; ++i) + verbs->put( b1, i, (comm->pid() + 1)%comm->nprocs(), b2, i, 1); + + verbs->sync(true, nullptr); + for ( unsigned i = 0 ; i < N; ++i) { + unsigned char b2_exp = i + (comm->pid() + comm->nprocs() - 1) % comm->nprocs(); + unsigned char b1_exp = i + comm->pid(); + EXPECT_EQ( b2_exp, buf2[i]); + EXPECT_EQ( b1_exp, buf1[i] ); + } + + verbs->dereg(b1); + verbs->dereg(b2); +} + diff --git a/src/debug/CMakeLists.txt b/src/debug/CMakeLists.txt index 7f3f9c92..0679775c 100644 --- a/src/debug/CMakeLists.txt +++ b/src/debug/CMakeLists.txt @@ -38,4 +38,3 @@ install(TARGETS ${libname} EXPORT lpf ) add_gtest(rwconflict_test "pthread" rwconflict.t.cpp rwconflict.cpp) - #$ ) diff --git a/src/hybrid/core.cpp b/src/hybrid/core.cpp index 404edda8..d98caf05 100644 --- a/src/hybrid/core.cpp +++ b/src/hybrid/core.cpp @@ -28,7 +28,7 @@ #include #include -#if __cplusplus >= 201103L +#if __cplusplus >= 201103L #include #else #include @@ -49,7 +49,7 @@ _LPFLIB_VAR const lpf_args_t LPF_NO_ARGS = { NULL, 0, NULL, 0, NULL, 0 }; _LPFLIB_VAR const lpf_sync_attr_t LPF_SYNC_DEFAULT = 0; -_LPFLIB_VAR const lpf_msg_attr_t LPF_MSG_DEFAULT = 0; +_LPFLIB_VAR const lpf_msg_attr_t LPF_MSG_DEFAULT = NULL; _LPFLIB_VAR const lpf_pid_t LPF_MAX_P = UINT_MAX; @@ -59,7 +59,7 @@ _LPFLIB_VAR const lpf_t LPF_NONE = NULL; _LPFLIB_VAR const lpf_init_t LPF_INIT_NONE = NULL; -_LPFLIB_VAR const lpf_t LPF_ROOT = static_cast(const_cast("LPF_ROOT")) ; +_LPFLIB_VAR const lpf_t LPF_ROOT = static_cast(const_cast("LPF_ROOT")) ; _LPFLIB_VAR const lpf_machine_t LPF_INVALID_MACHINE = { 0, 0, NULL, NULL }; @@ -68,7 +68,7 @@ namespace { using lpf::hybrid::LPF_CORE_IMPL_CONFIG::MachineParams; struct Init { - + lpf::hybrid::Thread m_thread; lpf::hybrid::MPI m_mpi; lpf_pid_t m_threadId, m_nThreads; @@ -84,18 +84,18 @@ namespace { lpf::hybrid::ThreadState * realContext( lpf_t ctx ) - { + { lpf_t c; if (ctx == LPF_ROOT) - c = &lpf::hybrid::ThreadState::root(); + c = &lpf::hybrid::ThreadState::root(); else c = ctx; return static_cast< lpf::hybrid::ThreadState *>(c); } } -_LPFLIB_API lpf_err_t lpf_hybrid_intialize( USE_THREAD(_t) thread, USE_MPI(_t) mpi, - lpf_pid_t threadId, lpf_pid_t nThreads, +_LPFLIB_API lpf_err_t lpf_hybrid_intialize( USE_THREAD(_t) thread, USE_MPI(_t) mpi, + lpf_pid_t threadId, lpf_pid_t nThreads, lpf_pid_t nodeId, lpf_pid_t nNodes, lpf_init_t * init ) { using namespace lpf::hybrid; @@ -138,12 +138,12 @@ _LPFLIB_API lpf_err_t lpf_hook( lpf_init_t init, lpf_spmd_t spmd, lpf_args_t arg using namespace lpf::hybrid; Init * params = static_cast(init); -#if __cplusplus >= 201103L +#if __cplusplus >= 201103L std::shared_ptr nodeState; #else std::tr1::shared_ptr nodeState; #endif - + NodeState * nodeStatePtr = NULL; if (params->m_threadId == 0) { @@ -172,15 +172,15 @@ _LPFLIB_API lpf_err_t lpf_hook( lpf_init_t init, lpf_spmd_t spmd, lpf_args_t arg } catch(std::bad_alloc & e ) { - LOG(1, "Not enough memory to run SPMD function on thread " - << params->m_threadId << " of node " + LOG(1, "Not enough memory to run SPMD function on thread " + << params->m_threadId << " of node " << nodeStatePtr->nodeId() ); failure = true; } catch(...) 
{ - LOG(1, "SPMD function of thread " - << params->m_threadId << " of node " + LOG(1, "SPMD function of thread " + << params->m_threadId << " of node " << nodeStatePtr->nodeId() << " threw an unexpected exception"); failure = true; } @@ -188,7 +188,7 @@ _LPFLIB_API lpf_err_t lpf_hook( lpf_init_t init, lpf_spmd_t spmd, lpf_args_t arg trc = reduceOr( params->m_thread, 0, failure); if ( trc != Thread::SUCCESS ) return LPF_ERR_FATAL; - if ( params->m_threadId == 0) + if ( params->m_threadId == 0) { MPI::err_t nrc = MPI::SUCCESS; nrc = reduceOr( params->m_mpi, 0, failure); @@ -198,7 +198,7 @@ _LPFLIB_API lpf_err_t lpf_hook( lpf_init_t init, lpf_spmd_t spmd, lpf_args_t arg } trc = broadcast( params->m_thread, 0, failure ); if ( trc != Thread::SUCCESS ) return LPF_ERR_FATAL; - + return failure?LPF_ERR_FATAL:LPF_SUCCESS; } @@ -281,16 +281,15 @@ _LPFLIB_API lpf_err_t lpf_deregister( } _LPFLIB_API lpf_err_t lpf_put( lpf_t ctx, - lpf_memslot_t src_slot, - size_t src_offset, - lpf_pid_t dst_pid, - lpf_memslot_t dst_slot, - size_t dst_offset, - size_t size, - lpf_msg_attr_t attr + lpf_memslot_t src_slot, + size_t src_offset, + lpf_pid_t dst_pid, + lpf_memslot_t dst_slot, + size_t dst_offset, + size_t size, + lpf_msg_attr_t attr ) { - (void) attr; using namespace lpf::hybrid; if (ctx == LPF_SINGLE_PROCESS) { char * null = NULL; @@ -301,24 +300,25 @@ _LPFLIB_API lpf_err_t lpf_put( lpf_t ctx, } ThreadState * t = realContext(ctx); - if (!t->error()) - t->put( src_slot, src_offset, dst_pid, dst_slot, dst_offset, size ); + if (!t->error()) { + t->put( src_slot, src_offset, dst_pid, dst_slot, dst_offset, size, + attr ); + } return LPF_SUCCESS; } _LPFLIB_API lpf_err_t lpf_get( - lpf_t ctx, - lpf_pid_t src_pid, - lpf_memslot_t src_slot, - size_t src_offset, - lpf_memslot_t dst_slot, + lpf_t ctx, + lpf_pid_t src_pid, + lpf_memslot_t src_slot, + size_t src_offset, + lpf_memslot_t dst_slot, lpf_memslot_t dst_offset, size_t size, lpf_msg_attr_t attr ) { - (void) attr; using namespace lpf::hybrid; if (ctx == LPF_SINGLE_PROCESS) { char * null = NULL; @@ -329,8 +329,10 @@ _LPFLIB_API lpf_err_t lpf_get( } ThreadState * t = realContext(ctx); - if (!t->error()) - t->get( src_pid, src_slot, src_offset, dst_slot, dst_offset, size ); + if (!t->error()) { + t->get( src_pid, src_slot, src_offset, dst_slot, dst_offset, size, + attr ); + } return LPF_SUCCESS; } @@ -338,12 +340,11 @@ _LPFLIB_API lpf_err_t lpf_sync( lpf_t ctx, lpf_sync_attr_t attr ) { (void) attr; using namespace lpf::hybrid; - if (ctx == LPF_SINGLE_PROCESS) + if (ctx == LPF_SINGLE_PROCESS) return LPF_SUCCESS; return realContext(ctx)->sync(); } - _LPFLIB_API lpf_err_t lpf_probe( lpf_t ctx, lpf_machine_t * params ) { using namespace lpf::hybrid; @@ -364,7 +365,7 @@ _LPFLIB_API lpf_err_t lpf_probe( lpf_t ctx, lpf_machine_t * params ) _LPFLIB_API lpf_err_t lpf_resize_message_queue( lpf_t ctx, size_t max_msgs ) { using namespace lpf::hybrid; - if (ctx == LPF_SINGLE_PROCESS) + if (ctx == LPF_SINGLE_PROCESS) return LPF_SUCCESS; ThreadState * t = realContext(ctx); @@ -377,7 +378,7 @@ _LPFLIB_API lpf_err_t lpf_resize_message_queue( lpf_t ctx, size_t max_msgs ) _LPFLIB_API lpf_err_t lpf_resize_memory_register( lpf_t ctx, size_t max_regs ) { using namespace lpf::hybrid; - if (ctx == LPF_SINGLE_PROCESS) + if (ctx == LPF_SINGLE_PROCESS) return LPF_SUCCESS; ThreadState * t = realContext(ctx); diff --git a/src/hybrid/state.hpp b/src/hybrid/state.hpp index 6ae1dd3a..ddc98d64 100644 --- a/src/hybrid/state.hpp +++ b/src/hybrid/state.hpp @@ -289,8 +289,12 @@ class _LPFLIB_LOCAL 
ThreadState { void put( lpf_memslot_t src_slot, size_t src_offset, pid_t dst_pid, lpf_memslot_t dst_slot, size_t dst_offset, - size_t size) + size_t size, lpf_msg_attr_t attr ) { + (void) attr; // current implementation ignores attributes -- note that + // handling e.g. zero-cost in the hybrid setting is not exactly + // trivial, and that simply applying zero-cost on the top level + // only will not lead to correct behaviour typedef NodeMemReg::Memory Memory; if (size <= 0) return; @@ -314,8 +318,12 @@ class _LPFLIB_LOCAL ThreadState { void get( pid_t src_pid, lpf_memslot_t src_slot, size_t src_offset, lpf_memslot_t dst_slot, size_t dst_offset, - size_t size ) + size_t size, lpf_msg_attr_t attr ) { + (void) attr; // current implementation ignores attributes -- note that + // handling e.g. zero-cost in the hybrid setting is not exactly + // trivial, and that simply applying zero-cost on the top level + // only will not lead to correct behaviour typedef NodeMemReg::Memory Memory; if (size <= 0) return; diff --git a/src/imp/core.c b/src/imp/core.c index e076b811..bb6c88e0 100644 --- a/src/imp/core.c +++ b/src/imp/core.c @@ -34,7 +34,7 @@ const lpf_args_t LPF_NO_ARGS = { NULL, 0, NULL, 0, NULL, 0 }; const lpf_sync_attr_t LPF_SYNC_DEFAULT = 0; -const lpf_msg_attr_t LPF_MSG_DEFAULT = 0; +const lpf_msg_attr_t LPF_MSG_DEFAULT = NULL; const lpf_pid_t LPF_MAX_P = UINT_MAX; diff --git a/src/pthreads/core.cpp b/src/pthreads/core.cpp index 080b6a1d..776f0c1c 100644 --- a/src/pthreads/core.cpp +++ b/src/pthreads/core.cpp @@ -52,7 +52,7 @@ const lpf_args_t LPF_NO_ARGS = { NULL, 0, NULL, 0, NULL, 0 }; const lpf_sync_attr_t LPF_SYNC_DEFAULT = 0; -const lpf_msg_attr_t LPF_MSG_DEFAULT = 0; +const lpf_msg_attr_t LPF_MSG_DEFAULT = NULL; const lpf_pid_t LPF_MAX_P = UINT_MAX; @@ -296,8 +296,8 @@ lpf_err_t lpf_put( lpf_msg_attr_t attr ) { - (void) attr; // ignore parameter 'msg' since this implementation only - // implements core functionality + (void) attr; // ignore parameter 'msg' since this engine only implements + // core functionality lpf::ThreadLocalData * thread = realCtx(ctx); if (!thread->isAborted()) @@ -318,8 +318,8 @@ lpf_err_t lpf_get( lpf_msg_attr_t attr ) { - (void) attr; // ignore parameter 'msg' since this implementation only - // implements core functionality + (void) attr; // ignore parameter 'msg' since this engine only implements + // core functionality lpf::ThreadLocalData * thread = realCtx(ctx); if (!thread->isAborted()) diff --git a/src/pthreads/threadlocaldata.hpp b/src/pthreads/threadlocaldata.hpp index 66d56160..92f99b72 100644 --- a/src/pthreads/threadlocaldata.hpp +++ b/src/pthreads/threadlocaldata.hpp @@ -105,7 +105,7 @@ class _LPFLIB_LOCAL ThreadLocalData { return m_atExit[0]; } err_t sync( bool expectExit = false ); // nothrow - + private: ThreadLocalData( const ThreadLocalData & ) ; // prohibit copying ThreadLocalData & operator=( const ThreadLocalData & ); // prohibit assignment diff --git a/tests/functional/CMakeLists.txt b/tests/functional/CMakeLists.txt index 0eb7eea6..65182f6f 100644 --- a/tests/functional/CMakeLists.txt +++ b/tests/functional/CMakeLists.txt @@ -126,7 +126,6 @@ foreach (LPF_IMPL_ID ${ENGINES}) get_filename_component(baseName ${testSource} NAME_WE ) set(exeName "${baseName}_${LPF_IMPL_ID}_${LPF_IMPL_CONFIG}${mode}") add_gtest(${exeName} ${LPF_IMPL_ID} ${debug} "${CMAKE_CURRENT_SOURCE_DIR}/${testSource}") - endforeach(testSource) endforeach(LPF_IMPL_ID) diff --git a/tests/functional/func_bsplib_hpsend_many.cpp b/tests/functional/func_bsplib_hpsend_many.cpp 
index d531eea8..3de0d3c1 100644 --- a/tests/functional/func_bsplib_hpsend_many.cpp +++ b/tests/functional/func_bsplib_hpsend_many.cpp @@ -31,8 +31,8 @@ void spmd( lpf_t lpf, lpf_pid_t pid, lpf_pid_t nprocs, lpf_args_t args) bsplib_t bsplib; size_t maxhpregs = (size_t) -1; - const int pthread = 1, mpirma = 2, mpimsg = 3, hybrid = 4, ibverbs=5; - (void) pthread; (void) mpirma; (void) mpimsg; (void) hybrid; (void) ibverbs; + const int pthread = 1, mpirma = 2, mpimsg = 3, hybrid = 4, ibverbs=5, zero=6; + (void) pthread; (void) mpirma; (void) mpimsg; (void) hybrid; (void) ibverbs; (void) zero; if (LPF_CORE_IMPL_ID == mpirma ) { maxhpregs = 10; // because MPI RMA only supports a limited number diff --git a/tests/functional/func_lpf_probe_parallel_nested.cpp b/tests/functional/func_lpf_probe_parallel_nested.cpp index f594b7b8..5381bffe 100644 --- a/tests/functional/func_lpf_probe_parallel_nested.cpp +++ b/tests/functional/func_lpf_probe_parallel_nested.cpp @@ -117,8 +117,8 @@ void spmd1( lpf_t lpf, lpf_pid_t pid, lpf_pid_t nprocs, lpf_args_t args) EXPECT_LT( 0.0, (*(subMachine.g))(machine.p, (size_t)(-1), LPF_SYNC_DEFAULT) ); EXPECT_LT( 0.0, (*(subMachine.l))(machine.p, (size_t)(-1), LPF_SYNC_DEFAULT) ); - const int pthread = 1, mpirma = 1, mpimsg = 1, hybrid = 0, ibverbs=1; - (void) pthread; (void) mpirma; (void) mpimsg; (void) hybrid; (void) ibverbs; + const int pthread = 1, mpirma = 1, mpimsg = 1, hybrid = 0, ibverbs=1, zero = 1; + (void) pthread; (void) mpirma; (void) mpimsg; (void) hybrid; (void) ibverbs; (void) zero; if (LPF_CORE_IMPL_ID) // this part is disabled for the hybrid implementation, because { // that one doesn't do generic nesting of lpf_exec's EXPECT_EQ( 1, subMachine.free_p == 2 || subMachine.free_p == 3 ); @@ -203,5 +203,4 @@ TEST( API, func_lpf_probe_parallel_nested ) rc = lpf_exec( LPF_ROOT, machine.p / 2, &spmd1, args ); EXPECT_EQ( LPF_SUCCESS, rc ); - } diff --git a/tests/functional/macro_LPF_VERSION.cpp b/tests/functional/macro_LPF_VERSION.cpp index 7588aeea..f513f635 100644 --- a/tests/functional/macro_LPF_VERSION.cpp +++ b/tests/functional/macro_LPF_VERSION.cpp @@ -19,10 +19,10 @@ #include "gtest/gtest.h" #ifdef _LPF_VERSION - #if _LPF_VERSION == 202000L + #if _LPF_VERSION == 202500L // everything is OK #else - #error Macro _LPF_VERSION has not been defined as 202000L + #error Macro _LPF_VERSION has not been defined as 202500L #endif #else #error Macro _LPF_VERSION has not been defined @@ -35,5 +35,5 @@ */ TEST( API, macro_LPF_VERSION ) { - EXPECT_EQ( 202000L, _LPF_VERSION ); + EXPECT_EQ( 202500L, _LPF_VERSION ); }
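
A minimal usage sketch of the tag-based synchronisation that the zero engine introduces, based on the Zero API declared in src/MPI/zero.hpp and the tests in src/MPI/zero.t.cpp. This sketch is not part of the patch; the names engine, peer, srcSlot, dstSlot and len are illustrative assumptions:

    using lpf::mpi::Zero;

    // put()/get() read the tag through the (void *) lpf_msg_attr_t argument,
    // so the attribute must point at a uint32_t holding the tag.
    Zero::TagID tag = engine.regTag();
    uint32_t msgTag = tag;
    engine.put( srcSlot, 0, peer, dstSlot, 0, len, &msgTag );

    // Wait until one send under this tag has completed locally; this
    // dispatches to countingSyncPerSlot and requires no global barrier.
    Zero::SyncAttr * attr = nullptr;
    engine.createNewSyncAttr( &attr );
    engine.setTag( tag, *attr );
    engine.setZCAttr( 1 /* expected_sent */, 0 /* expected_rcvd */, *attr );
    engine.sync( false, attr );
    engine.destroySyncAttr( attr );
    engine.deregTag( tag );

Passing a null SyncAttr instead selects the default, barrier-based synchronisation, which is what the functional tests in zero.t.cpp exercise.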