diff --git a/CMakeLists.txt b/CMakeLists.txt index 844a4499..075cc34a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -176,6 +176,7 @@ if ( LIB_MATH AND LIB_DL AND MPI_FOUND ) if (ENABLE_IBVERBS) list(APPEND ENGINES "ibverbs") + list(APPEND ENGINES "zero") endif() endif() @@ -493,7 +494,7 @@ if (LPF_ENABLE_TESTS) TEST_PREFIX ${ENGINE}_ EXTRA_ARGS --gtest_output=xml:${test_output}/${ENGINE}_${testName} DISCOVERY_MODE POST_BUILD - DISCOVERY_TIMEOUT 15 + DISCOVERY_TIMEOUT 60 ) endfunction(add_gtest) diff --git a/NOTICE b/NOTICE index 1f386452..3992b64c 100644 --- a/NOTICE +++ b/NOTICE @@ -33,6 +33,8 @@ Implementation 1) BSMP 2) Collectives 3) Pthread implementation + - 2022 - 2024, Kiril Dichev + 1) Develop zero engine for LPF - 2018, Pierre Leca 1) Usability improvements of compiler frontends and CMake integration @@ -50,6 +52,8 @@ Quality Assurance - 2015 - 2017, Albert-Jan Yzelman 1) Performance test suite + - 2022 - 2024, Kiril Dichev + 1) Rewrite all functional tests to use CTest/Gtest Miscellaneous / Acknowledgments diff --git a/bootstrap.sh b/bootstrap.sh index 1bc1835c..4c3d4e68 100755 --- a/bootstrap.sh +++ b/bootstrap.sh @@ -278,13 +278,13 @@ echo "--------------------------------------------------" echo ${CMAKE_EXE} -Wno-dev \ -DCMAKE_INSTALL_PREFIX="$installdir" \ - -DCMAKE_BUILD_TYPE=$config \ - -DLPFLIB_MAKE_DOC=$doc \ - -DLPFLIB_MAKE_TEST_DOC=$doc \ + -DCMAKE_BUILD_TYPE=$config \ + -DLPFLIB_MAKE_DOC=$doc \ + -DLPFLIB_MAKE_TEST_DOC=$doc \ -DLPF_ENABLE_TESTS=$functests \ -DGTEST_AGREE_TO_LICENSE=$googletest_license_agreement \ - -DLPFLIB_PERFTESTS=$perftests \ - -DLPFLIB_CONFIG_NAME=${config_name:-${config}}\ + -DLPFLIB_PERFTESTS=$perftests \ + -DLPFLIB_CONFIG_NAME=${config_name:-${config}} \ -DLPF_HWLOC="${hwloc}" \ $hwloc_found_flag \ $mpi_cmake_flags \ diff --git a/cmake/mpi.cmake b/cmake/mpi.cmake index f8d55851..56075c5a 100644 --- a/cmake/mpi.cmake +++ b/cmake/mpi.cmake @@ -15,7 +15,7 @@ # limitations under the License. # -find_package(MPI) +find_package(MPI REQUIRED) # Find the 'mpirun' frontend string( REGEX REPLACE "exec$" "run" mpirun "${MPIEXEC}" ) diff --git a/include/lpf/core.h b/include/lpf/core.h index 9c0d1da8..320ca2e1 100644 --- a/include/lpf/core.h +++ b/include/lpf/core.h @@ -688,8 +688,10 @@ #ifdef __cplusplus #include +#include #else #include +#include #endif #endif // DOXYGEN @@ -705,7 +707,7 @@ extern "C" { * released, and NN the number of the specifications released before this one in * the same year. */ -#define _LPF_VERSION 202000L +#define _LPF_VERSION 202500L /** * An implementation that has defined this macro may never define the @@ -942,7 +944,7 @@ typedef void * lpf_init_t; #ifdef DOXYGEN typedef ... lpf_sync_attr_t; #else -typedef int lpf_sync_attr_t; +typedef void * lpf_sync_attr_t; #endif /** @@ -984,7 +986,7 @@ typedef struct lpf_machine { * byte. This value may depend on the actual number of processes \a p used, * the minimum message size \a min_msg_size the user aims to send and * receive, and the type of synchronisation requested via \a attr. The - * value is bitwise equivalent across all processes. + * value is bitwise equivalent across all processes. * * \param[in] p A value between 1 and #lpf_machine_t.p, where * both bounds are inclusive. @@ -1038,7 +1040,19 @@ typedef struct lpf_machine { * memory areas must be registered for direct remote memory access (DRMA). * * \par Communication - * Object of this type must not be communicated. 
+ * Objects of this type must not be communicated; if they are, objects copied + * to a remote process in principle do \em not represent valid memory slots. + * + * \par Trivially Copyable + * Objects of this type are trivially copyable in the same sense as the C++11 + * TriviallyCopyable type category. + * + * \note Rationale: extensions could rely on the trivial copyability of memory + * slots. Therefore, while the core specification stipulates memory slots + * should not be copied across nodes with the expectation that a valid + * memory slot on process A when copied to process B yields a valid memory + * slot on process B, it must account for the possibility (provided by + * extensions) that such a copy could be meaningful. */ #ifdef DOXYGEN typedef ... lpf_memslot_t; #else typedef size_t lpf_memslot_t; @@ -1066,7 +1080,7 @@ typedef size_t lpf_memslot_t; #ifdef DOXYGEN typedef ... lpf_msg_attr_t; #else -typedef int lpf_msg_attr_t; +typedef void * lpf_msg_attr_t; #endif /** diff --git a/include/lpf/noc.h b/include/lpf/noc.h new file mode 100644 index 00000000..4bbe3031 --- /dev/null +++ b/include/lpf/noc.h @@ -0,0 +1,559 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LPFLIB_NOC_H +#define LPFLIB_NOC_H + +// import size_t data type for the implementation +#ifndef DOXYGEN + +#ifdef __cplusplus +#include +#else +#include +#endif + +#include + +#endif // DOXYGEN + + +#ifdef __cplusplus +extern "C" { +#endif + +/** \addtogroup LPF_EXTENSIONS LPF API extensions + * + * @{ + * + * \defgroup LPF_NOC Extensions to LPF where it need not maintain consistency. + * + * This extension specifies facilities for (de-)registering memory slots, + * registering RDMA requests, and fencing RDMA requests. These extensions are, + * as far as possible, fully compatible with the core LPF API specifications. + * Reused core API concepts include LPF contexts (#lpf_t), processor count types + * (#lpf_pid_t), memory slot types (#lpf_memslot_t), message attributes + * (#lpf_msg_attr_t), the #lpf_sync primitive, and, by extension, + * synchronization attributes (#lpf_sync_attr_t). + * + * In this extension, + * 1. LPF does not maintain consistency amongst processes that (de-)register + * memory slots while RDMA communication may occur. Maintaining the + * required consistency instead becomes the purview of the user. This + * extension specifies exactly what consistency properties the user must + * guarantee; and + * 2. LPF provides facilities with which RDMA communication may be fenced on a + * finer granularity than when using #lpf_sync; this applies to the use of + * #lpf_put, #lpf_get, #lpf_noc_put, and #lpf_noc_get. The use of these + * facilities shall not change the semantics of an #lpf_sync that may + * follow (however, the use of #lpf_sync may not be needed in order + * to complete RDMA requests). + * + * These two mechanisms for achieving different types of non-coherency may be + * employed orthogonally.
For the first extension, the following primitives are + * provided: + * - #lpf_noc_resize_memory_register, + * - #lpf_noc_register, + * - #lpf_noc_deregister, + * - #lpf_noc_put, and + * - #lpf_noc_get. + * While these primitives re-use the standard #lpf_memslot_t, implementations + * may handle so-called non-coherent memory slots differently from normal memory + * slots. One key requirement is that non-coherent memory slots should be + * byte-copyable and also safe to communicate across + * processes. + * + * \note At this point in time, this first extension set is not implemented by + * any engine. + * + * For the second extension, the following primitives are provided: + * - #lpf_noc_flush_sent, and + * - #lpf_noc_flush_received. + * + * \warning If LPF is considered a tool for the so-called hero + * programmer, then please note that this variant is even harder + * to program with. + * + * \note At present, no debug layer exists for this extension. It is unclear if + * such a debug layer is even possible (precisely because LPF in this + * extension does not maintain consistency, there is no way a debug layer + * could enforce it). + * + * \par Engines that implement the first non-coherent extension set + * None. + * + * \par Engines that implement the second non-coherent extension set + * - the \em zero engine. + * + * @{ + */ + + +/** + * The version of this non-coherent LPF specification. All implementations shall + * define this macro. The format is YYYYNN, where YYYY is the year the + * specification was released, and NN the number of the specifications released + * before this one in the same year. + */ +#define _LPF_NOC_VERSION 202400L + +/** + * Resizes the memory register for non-coherent RDMA. + * + * After a successful call to this function, the local process has enough + * resources to register \a max_regs memory regions in a non-coherent way. + * + * Each registration via lpf_noc_register() counts as one. Such registrations + * continue to occupy capacity in the register until they are released via a call + * to lpf_noc_deregister(), which lowers the count of used memory registrations + * by one. + * + * There are no runtime out-of-bounds checks prescribed for lpf_noc_register()-- + * this would also be too costly as error checking would require communication. + * + * If memory allocation was successful, the return value is #LPF_SUCCESS and + * the local process may assume the new buffer size \a max_regs. + * + * In the case of insufficient local memory, the return value will be + * #LPF_ERR_OUT_OF_MEMORY. In that case, it is as if the call never happened and + * the user may retry the call locally after freeing up unused resources. Should + * retrying not lead to a successful call, the programmer may opt to broadcast + * the error (using existing slots) or to give up by returning from the spmd + * section. + * + * \note The current maximum cannot be retrieved from the runtime. Instead, the + * programmer must track this information herself. To provide + * encapsulation, see lpf_rehook(). + * + * \note When the given memory register capacity is smaller than the current + * capacity, the runtime is allowed but not required to release the + * allocated memory. Such a call shall always be successful and return + * #LPF_SUCCESS. + * + * \note This means that an implementation that allows shrinking the given + * capacity must also ensure the old buffer remains intact in case there + * is not enough memory to allocate a smaller one.
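+ *
+ * \par Example
+ * An illustrative sketch (not normative; the buffer size and variable names
+ * are placeholders) that reserves capacity for one non-coherent slot and
+ * registers a local buffer from within an SPMD function:
+ * \code
+ * void spmd( lpf_t ctx, lpf_pid_t pid, lpf_pid_t nprocs, lpf_args_t args ) {
+ *     char buffer[ 1024 ];
+ *     lpf_memslot_t slot = LPF_INVALID_MEMSLOT;
+ *     if( lpf_noc_resize_memory_register( ctx, 1 ) != LPF_SUCCESS ) {
+ *         return; // could not reserve capacity; give up locally
+ *     }
+ *     lpf_noc_register( ctx, buffer, sizeof(buffer), &slot );
+ *     // ... the slot may now be copied to remote processes and used as a
+ *     //     remote target of lpf_noc_put or lpf_noc_get ...
+ *     lpf_deregister( ctx, slot ); // takes effect immediately
+ * }
+ * \endcode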
+ * + * \note The last invocation of lpf_noc_resize_memory_register() determines the + * maximum number of memory registrations using lpf_noc_register() that + * can be maintained concurrently. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. Any + * further thread safety may be guaranteed by the implementation, but is not + * specified. Similar conditions hold for all LPF primitives that take an + * argument of type #lpf_t; see #lpf_t for more information. + * + * \param[in,out] ctx The runtime state as provided by lpf_exec(). + * \param[in] max_regs The requested maximum number of memory regions that can + * be registered. This value must be the same on all + * processes. + * + * \returns #LPF_SUCCESS + * When this process successfully acquires the resources. + * + * \returns #LPF_ERR_OUT_OF_MEMORY + * When there was not enough memory left on the heap. In this case + * the effect is the same as when this call did not occur at all. + * + * \par BSP costs + * None. + * + * \par Runtime costs + * \f$ \Theta( \mathit{max\_regs} ) \f$. + */ +extern _LPFLIB_API +lpf_err_t lpf_noc_resize_memory_register( lpf_t ctx, size_t max_regs ); + +/** + * Registers a local memory area, preparing its use for intra-process + * communication. + * + * The registration process is necessary to enable Remote Direct Memory Access + * (RDMA) primitives, such as lpf_get(), lpf_noc_get(), lpf_put(), and + * lpf_noc_put(). + * + * This is \em not a collective function. For #lpf_get and #lpf_put, the memory + * slot returned by this function is equivalent to a memory slot returned by + * #lpf_register_local; the \a memslot returned by a successful call to this + * function (hence) is immediately valid. A successful call (hence) immediately + * consumes one memory slot capacity; see also #lpf_resize_memory_register on + * how to ensure sufficient capacity. + * + * Different from a memory slot returned by #lpf_register_local, a memory slot + * returned by a successful call to this function may serve as either a local + * or remote memory slot for #lpf_noc_put and #lpf_noc_get. + * + * Use of the returned memory slot to indicate a remote memory area may only + * occur by copying the returned memory slot to another LPF process. This may + * be done using the standard #lpf_put and #lpf_get methods or by using + * auxiliary communication mechanisms. The memory slot thus communicated only + * refers to a valid memory area on the process it originated from; any other + * use leads to undefined behaviour. + * + * \note Note that the ability to copy memory slots to act as identifiers of + * remote areas exploits the LPF core specification that instances of + * the #lpf_memslot_t type are, indeed, byte-copyable. + * + * A memory slot returned by a successful call to this function may be + * destroyed via a call to the standard #lpf_deregister. The deregistration + * takes effect immediately. No communication using the deregistered slot + * should occur during that superstep, or otherwise undefined behaviour occurs. + * + * Only the process that created the returned memory slot can destroy it; other + * LPF processes than the one which created it that attempt to destroy the + * returned memory slot invoke undefined behaviour. + * + * Other than the above specified differences, the arguments to this function + * are the same as for #lpf_register_local: + * + * \param[in,out] ctx The runtime state as provided by lpf_exec(). 
+ * \param[in] pointer The pointer to the memory area to register. + * \param[in] size The size of the memory area to register in bytes. + * \param[out] memslot Where to store the memory slot identifier. + * + * \note Registering a slot with zero \a size is valid. The resulting memory + * slot cannot be written to nor read from by remote LPF processes. + * + * \note In particular, passing \c NULL as \a pointer and \c 0 for \a size is + * valid. + * + * \returns #LPF_SUCCESS + * Successfully registered the memory region and successfully + * assigned a memory slot identifier. + * + * \note One registration consumes one memory slot from the pool of locally + * available memory slots, which must have been preallocated by + * lpf_resize_memory_register() or recycled by lpf_deregister(). Always + * use lpf_resize_memory_register() at the start of the SPMD function + * that is executed by lpf_exec(), since lpf_exec() itself does not + * preallocate slots. + * + * \note It is illegal to request more memory slots than have previously been + * registered with lpf_resize_memory_register(). There is no runtime + * check for this error, because a safe way out cannot be guaranteed + * without significant parallel error checking overhead. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. Any + * further thread safety may be guaranteed by the implementation, but is not + * specified. Similar conditions hold for all LPF primitives that take an + * argument of type #lpf_t; see #lpf_t for more information. + * + * \par BSP costs + * + * None. + * + * \par Runtime costs + * + * \f$ \mathcal{O}( \texttt{size} ) \f$. + * + * \note This asymptotic bound may be attained for implementations that require + * linear-time processing on the registered memory area, such as to effect + * memory pinning. If this is not required, a good implementation will + * require only \f$ \Theta(1) \f$ time. + */ +extern _LPFLIB_API +lpf_err_t lpf_noc_register( + lpf_t ctx, + void * pointer, + size_t size, + lpf_memslot_t * memslot +); + +/** + * Deregisters a memory area previously registered using lpf_noc_register(). + * + * After a successful deregistration, the slot is returned to the pool of free + * memory slots. The total number of memory slots may be set via a call to + * lpf_noc_resize_memory_register(). + * + * Deregistration takes effect immediately. A call to this function is not + * collective, and the order of deregistration does not need to match the order + * of registration. Any local or remote communication using the given \a memslot + * in the current superstep invokes undefined behaviour. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. Any + * further thread safety may be guaranteed by the implementation, but is not + * specified. Similar conditions hold for all LPF primitives that take an + * argument of type #lpf_t; see #lpf_t for more information. + * + * \param[in,out] ctx The runtime state as provided by lpf_exec(). + * \param[in] memslot The memory slot identifier to de-register. + * + * \returns #LPF_SUCCESS + * Successfully deregistered the memory region. + * + * \par BSP costs + * None. + * + * \par Runtime costs + * \f$ \mathcal{O}(n) \f$, where \f$ n \f$ is the size of the memory region + * corresponding to \a memslot. + */ +extern _LPFLIB_API +lpf_err_t lpf_noc_deregister( + lpf_t ctx, + lpf_memslot_t memslot +); + +/** + * Copies contents of local memory into the memory of remote processes. 
+ * + * This operation is guaranteed to be completed after a call to the next + * lpf_sync() exits. Until that time it occupies one entry in the operations + * queue. + * + * Concurrent reads or writes from or to the same memory area are + * allowed in the same way they are for the core primitive #lpf_put. + * + * This primitive differs from #lpf_put in that the \a dst_slot may be the + * result of a successful call to #lpf_noc_register, while \a src_slot \em must + * be the result of such a successful call. In both cases, the slot need + * \em not have been registered before the last call to #lpf_sync. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. Any + * further thread safety may be guaranteed by the implementation, but is not + * specified. Similar conditions hold for all LPF primitives that take an + * argument of type #lpf_t; see #lpf_t for more information. + * + * \param[in,out] ctx The runtime state as provided by lpf_exec() + * \param[in] src_slot The memory slot of the local source memory area + * registered using lpf_register_local(), + * lpf_register_global(), or lpf_noc_register() + * \param[in] src_offset The offset of reading out the source memory area, + * w.r.t. the base location of the registered area + * expressed in bytes. + * \param[in] dst_pid The process ID of the destination process. + * \param[in] dst_slot The memory slot of the destination memory area at + * \a pid, registered using lpf_register_global() or + * lpf_noc_register(). + * \param[in] dst_offset The offset of writing to the destination memory area + * w.r.t. the base location of the registered area + * expressed in bytes. + * \param[in] size The number of bytes to copy from the source memory area + * to the destination memory area. + * \param[in] attr + * \parblock + * In case an \a attr not equal to #LPF_MSG_DEFAULT is provided, + * the message created by this function may have modified semantics + * that may be used to extend this API. Examples include: + * + * -# delaying the superstep deadline of delivery, and/or + * -# DRMA with message combining semantics. + * + * These attributes are stored after a call to this function has + * completed and may be modified immediately after without affecting + * any messages already scheduled. + * \endparblock + * + * \note See #lpf_put for notes regarding #lpf_msg_attr_t. + * + * \returns #LPF_SUCCESS + * When the communication request was recorded successfully. + * + * \par BSP costs + * This function will increase + * \f$ t_{c}^{(s)} \f$ + * and + * \f$ r_{c}^{(\mathit{pid})} \f$ + * by \a size, where c is the current superstep number and s is this process ID + * (as provided by #lpf_exec). See \ref BSPCOSTS on how this affects real-time + * communication costs. + * + * \par Runtime costs + * See \ref BSPCOSTS. + */ +extern _LPFLIB_API +lpf_err_t lpf_noc_put( + lpf_t ctx, + lpf_memslot_t src_slot, + size_t src_offset, + lpf_pid_t dst_pid, + lpf_memslot_t dst_slot, + size_t dst_offset, + size_t size, + lpf_msg_attr_t attr +); + +/** + * Copies contents from remote memory to local memory. + * + * This operation completes after one call to lpf_sync(). Until that time it + * occupies one entry in the operations queue. + * + * Concurrent reads or writes from or to the same memory area are allowed in the + * same way they are for #lpf_get.
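+ *
+ * \par Example
+ * An illustrative call (slot names are placeholders): read \c n bytes from
+ * offset zero of the slot \c remote, which was registered at process 0 via
+ * lpf_noc_register() and copied to this process, into the local slot
+ * \c local:
+ * \code
+ * lpf_noc_get( ctx, 0, remote, 0, local, 0, n, LPF_MSG_DEFAULT );
+ * \endcode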
+ * + * This primitive differs from #lpf_get in that the \a src_slot may be the + * result of a successful call to #lpf_noc_register, while \a dst_slot \em must + * be the result of such a successful call. In both cases, the slot need \em not + * have been registered before the last call to #lpf_sync. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. Any + * further thread safety may be guaranteed by the implementation, but is not + * specified. Similar conditions hold for all LPF primitives that take an + * argument of type #lpf_t; see #lpf_t for more information. + * + * \param[in,out] ctx The runtime state as provided by lpf_exec(). + * \param[in] src_pid The process ID of the source process. + * \param[in] src_slot The memory slot of the source memory area at \a pid, as + * globally registered with lpf_register_global() or + * lpf_noc_register(). + * \param[in] src_offset The offset of reading out the source memory area, + * w.r.t. the base location of the registered area + * expressed in bytes. + * \param[in] dst_slot The memory slot of the local destination memory area + * registered using lpf_register_local(), + * lpf_register_global(), or lpf_noc_register(). + * \param[in] dst_offset The offset of writing to the destination memory area + * w.r.t. the base location of the registered area + * expressed in bytes. + * \param[in] size The number of bytes to copy from the source + * remote memory location. + * \param[in] attr + * \parblock + * In case an \a attr not equal to #LPF_MSG_DEFAULT is provided, + * the message created by this function may have modified semantics + * that may be used to extend this API. Examples include: + * + * -# delaying the superstep deadline of delivery, and/or + * -# DRMA with message combining semantics. + * + * These attributes are stored after a call to this function has + * completed and may be modified immediately after without affecting + * any messages already scheduled. + * \endparblock + * + * \note See #lpf_get for notes on the use of #lpf_msg_attr_t. + * + * \returns #LPF_SUCCESS + * When the communication request was recorded successfully. + * + * \par BSP costs + * This function will increase + * \f$ r_{c}^{(s)} \f$ + * and + * \f$ t_{c}^{(\mathit{pid})} \f$ + * by \a size, where c is the current superstep number and s is this process ID + * (as provided via lpf_exec()). See \ref BSPCOSTS on how this affects real-time + * communication costs. + * + * \par Runtime costs + * See \ref BSPCOSTS. + */ +extern _LPFLIB_API +lpf_err_t lpf_noc_get( + lpf_t ctx, + lpf_pid_t src_pid, + lpf_memslot_t src_slot, + size_t src_offset, + lpf_memslot_t dst_slot, + size_t dst_offset, + size_t size, + lpf_msg_attr_t attr +); + +/** + * Processes completed outgoing RDMA requests that have occurred without calling + * #lpf_sync. + * + * Some fabrics require user-space to regularly flush internal queues at a rate + * that matches (or exceeds) that of outgoing RDMA request completions. It + * is implementation-specified how many times or at what frequency flushes must + * be performed. + * + * @param[in] ctx The LPF context. + * @param[in] attr The synchronisation attribute. + * + * \note Rationale: \a attr is requested because, given different attributes, + * different internal queues may be processed. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. Any + * further thread safety may be guaranteed by the implementation, but is not + * specified.
Similar conditions hold for all LPF primitives that take an + * argument of type #lpf_t; see #lpf_t for more information. + * + * \returns #LPF_SUCCESS This function never fails. + * + * \par BSP costs + * None; by using this primitive, the overall BSP cost remains unaffected. + * + * \par Runtime costs + * \f$ \mathcal{O}( n ) \f$, where \f$ n \f$ is the maximum number of + * simultaneously outstanding RDMA requests (see #lpf_resize_message_queue). + * When calling this function several times within the same superstep, the + * aggregate runtime cost remains \f$ \mathcal{O}(n) \f$. + * + * \note The above is not big-Theta, as some implementations do not require + * user-space flushes. + */ +extern _LPFLIB_API +lpf_err_t lpf_noc_flush_sent( lpf_t ctx, lpf_sync_attr_t attr ); + +/** + * Processes completed incoming RDMA requests that have occurred without calling + * #lpf_sync. + * + * Some fabrics require user-space to regularly flush internal queues at a rate + * that does matches (or exceeds) that of outgoing RDMA request completions. It + * is implementation-specified how many times or at what frequency flushes must + * be performed. + * + * @param[in] ctx The LPF context. + * @param[in] attr The synchronisation attribute. + * + * \note Rationale: \a attr is requested as given different attributes, + * different internal queues may be processed. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. Any + * further thread safety may be guaranteed by the implementation, but is not + * specified. Similar conditions hold for all LPF primitives that take an + * argument of type #lpf_t; see #lpf_t for more information. + * + * \returns #LPF_SUCCESS This function never fails. + * + * \par BSP costs + * None; by using this primitive, the overall BSP cost remains unaffected. + * + * \par Runtime costs + * \f$ \mathcal{O}( n ) \f$, where \f$ n \f$ is the maximum number of + * simultaneously outstanding RDMA requests (see #lpf_resize_message_queue). + * When calling this function several times within the same superstep, the + * aggregate runtime cost remains \f$ \mathcal{O}(n) \f$. + * + * \note The above is not big-Theta, as some implementations do not require + * user-space flushes. + */ +extern _LPFLIB_API +lpf_err_t lpf_noc_flush_received( lpf_t ctx, lpf_sync_attr_t attr ); + +/** + * @} + * @} + */ + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/lpf/tags.h b/include/lpf/tags.h new file mode 100644 index 00000000..812f685d --- /dev/null +++ b/include/lpf/tags.h @@ -0,0 +1,474 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LPFLIB_TAGS_H +#define LPFLIB_TAGS_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** \addtogroup LPF_EXTENSIONS LPF API extensions + * @{ + * + * \defgroup LPF_TAGS + * + * Tags enable identifying groups of messages that a call to #lpf_sync should + * wait on. 
This is an extension of the classic BSP behaviour that all messages + * issued during the communication phase of a superstep must be waited on; tags + * instead identify potentially multiple independent communication phases. + * Rather than #lpf_sync ending all communication phases, it may now elect to + * end a specific communication phase only, as identified by a tag. + * + * This mechanism is implemented by allowing tags to be tied to LPF message + * attributes as well as to LPF synchronization attributes. + * + * @{ + */ + +/** + * The specification version of the tags. + * + * All implementations shall define this macro. The format is YYYYNN, where YYYY + * is the year the specification was released, and NN the number of + * specifications released before this one in the same year. + */ +#define _LPF_TAGS_VERSION 202500L + +/** + * The type of an LPF tag. + * + * \par Communication + * Objects of this type must not be communicated. + */ +#ifdef DOXYGEN +typedef ... lpf_tag_t; +#else +typedef uint32_t lpf_tag_t; +#endif + +/** + * A dummy value to initialize an #lpf_tag_t instance at declaration. + * + * \note A debug implementation may check for this value so that errors can be + * detected. + */ +extern _LPFLIB_VAR const lpf_tag_t LPF_INVALID_TAG; + +/** + * Resizes the tag register for subsequent supersteps. + * + * The new capacity becomes valid \em after the next call to lpf_sync(). The + * initial capacity is zero. + * + * Each call to lpf_tag_create() counts as one, while every valid call to + * lpf_tag_destroy() decrements the number of registered tags by one. The + * initializer tag #LPF_INVALID_TAG does not count towards the number of + * registered tags. + * + * If allocation was successful, the return value is #LPF_SUCCESS. In the case + * of insufficient local memory, the return value is #LPF_ERR_OUT_OF_MEMORY. + * + * \note Neither the current maximum nor the currently registered number of tags + * can be retrieved from the run-time. Instead, the programmer must track this + * information herself. To provide encapsulation, please see lpf_rehook(). + * + * A call to this function with \a max_tags smaller than the current capacity + * shall always return #LPF_SUCCESS. + * + * \note When the given new capacity is smaller than the current capacity, the + * run-time is allowed but not required to release any superfluous + * memory. Implementations that do so must ensure that in case there was + * no space to allocate the smaller buffer, the older larger buffer + * remains intact (calls to this function requesting smaller-than-current + * capacity shall never fail). + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. + * + * \returns #LPF_SUCCESS When the process acquired resources for registering + * \a max_tags tags. + * + * \returns #LPF_ERR_OUT_OF_MEMORY When there was not enough memory left on the + * heap. On return, the effect is the same as + * when this call did not occur at all. + * + * \par BSP costs + * None. + * + * \par Runtime costs + * \f$ \mathcal{O}( \mathit{max\_tags} ) \f$. + */ +extern _LPFLIB_API +lpf_err_t lpf_resize_tag_register( + lpf_t ctx, + size_t max_tags +); + +/** + * Creates a new tag. + * + * This is a collective function, meaning that all processes call this + * primitive in the same superstep and in the same order. + * + * Once a tag is created, it takes one tag registration slot. The maximum + * number of registrations is given by lpf_resize_tag_register.
On entering + * this call, the user shall ensure at least one tag register remains free. + * + * @param[in,out] ctx The LPF context. + * @param[in] active Whether the calling process will be active within the + * newly-created tag. + * @param[out] tag Location where to store the newly created tag. One tag + * registration slot is consumed. + * + * Only processes active within a tag may use that tag during RDMA requests + * (put, get, and sync). Use of this tag by any other process invokes undefined + * behaviour. + * + * \note Implementations may modify the memory area pointed to by \a tag even if + * \a active is false. Such modified values should remain unused + * by RDMA requests, however. (Their only possible valid use is when + * supplied to a matching call to lpf_tag_destroy()). + * + * @returns #LPF_SUCCESS If the creation of the tag is successful. + */ +extern _LPFLIB_API +lpf_err_t lpf_tag_create( + lpf_t ctx, + bool active, + lpf_tag_t * tag +); + +/** + * Destroys a tag created by #lpf_tag_create. + * + * This is a collective function, meaning that all processes must call this + * primitive on the same tag in the same superstep and in the same order. + * + * @param[in,out] ctx The LPF context. + * @param[in] tag The tag to be destroyed. + * + * The given \a tag must have been the result of a previous successful call to + * #lpf_tag_create that was not already followed by a successful call to + * #lpf_tag_destroy. + * + * \note Even processes that marked themselves as inactive during tag creation + * must actively participate in their destruction. Implementations may + * optimise this process by translating destruction to a no-op on those + * processes. + * + * After a successful call to this function, the number of registered tags + * decreases by one. + * + * @returns #LPF_SUCCESS If the destruction of the tag is successful. + */ +extern _LPFLIB_API +lpf_err_t lpf_tag_destroy( + lpf_t ctx, + lpf_tag_t tag +); + +/** + * Creates a new message attribute that is compatible with the LPF tags + * extension. + * + * If an implementation supports additional extensions that employ message + * attributes, then attributes initialised by this extension must result in a + * valid message attribute for use with those other extensions also. + * + * \note This does \em not imply that using message attributes from multiple + * extensions simultaneously always yields sensible behaviour; this + * depends on the specification of the extensions. + * + * This extension is compatible with zero-cost synchronization extensions. + * + * @param[in,out] ctx The LPF context. + * @param[out] attr Where a new message attribute will be allocated. + * + * After a successful function call, applying the returned \a attr without + * modification shall induce the same behaviour as applying #LPF_MSG_DEFAULT. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. + * + * \returns #LPF_SUCCESS When a new \a attr was successfully constructed. After + * the call to this function, the attribute pointed to by + * \a attr shall be a valid message attribute. + * + * \returns #LPF_ERR_OUT_OF_MEMORY When not enough system resources were + * available to create a new message attribute. + * + * \par BSP costs + * None. + * + * \par Runtime costs + * \f$ \Theta( 1 ) \f$. + */ +extern _LPFLIB_API +lpf_err_t lpf_tag_create_mattr( + lpf_t ctx, + lpf_msg_attr_t * attr +); + +/** + * Creates a new synchronization attribute that is compatible with the LPF tags + * extension.
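+ *
+ * \par Example
+ * An illustrative sketch (not normative; slot names, offsets, and sizes are
+ * placeholders) of synchronising on one tagged communication phase only:
+ * \code
+ * lpf_tag_t tag = LPF_INVALID_TAG;
+ * lpf_msg_attr_t mattr;
+ * lpf_sync_attr_t sattr;
+ * lpf_resize_tag_register( ctx, 1 );
+ * lpf_sync( ctx, LPF_SYNC_DEFAULT );     // new tag capacity takes effect
+ * lpf_tag_create( ctx, true, &tag );     // collective
+ * lpf_tag_create_mattr( ctx, &mattr );
+ * lpf_tag_create_sattr( ctx, &sattr );
+ * lpf_tag_set_mattr( ctx, tag, mattr );  // tag the message...
+ * lpf_tag_set_sattr( ctx, tag, sattr );  // ...and the synchronisation
+ * lpf_put( ctx, src, 0, dst_pid, dst, 0, size, mattr );
+ * lpf_sync( ctx, sattr );                // ends this tagged phase only
+ * lpf_tag_destroy_mattr( ctx, mattr );
+ * lpf_tag_destroy_sattr( ctx, sattr );
+ * lpf_tag_destroy( ctx, tag );           // collective
+ * \endcode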
+ * + * If an implementation supports additional extensions that employ + * synchronization attributes, then attributes initialised by this extension + * must result in a valid synchronization attribute for use with those other + * extensions also. + * + * \note This does \em not imply that using synchronization attributes from + * multiple extensions simultaneously always yields sensible behaviour; + * this depends on the specification of the extensions. + * + * This extension is compatible with zero-cost synchronization extensions. + * + * @param[in,out] ctx The LPF context. + * @param[out] attr Where a new message attribute will be allocated. + * + * After a successful function call, applying the returned \a attr without + * modification shall induce the same behaviour as applying #LPF_MSG_DEFAULT. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. + * + * \returns #LPF_SUCCESS When a new \a attr was successfully constructed. After + * the call, the attribute pointed to by \a attr shall be + * a valid synchronisation attribute. + * + * \returns #LPF_ERR_OUT_OF_MEMORY When not enough system resources were + * available to create a new message attribute. + * + * \par BSP costs + * None. + * + * \par Runtime costs + * \f$ \Theta( 1 ) \f$. + */ +extern _LPFLIB_API +lpf_err_t lpf_tag_create_sattr( + lpf_t ctx, + lpf_sync_attr_t * attr +); + +/** + * Destroys a valid message attribute. + * + * The given \a attr must \em not equal #LPF_MSG_DEFAULT (the default message + * attribute may not be destroyed). The given \a attr must be created by this + * extension \em or by an extension that is compatible with the tags extension. + * + * This function may be called on message attributes created by the zero-cost + * synchronisation extension. + * + * @param[in,out] ctx The LPF context. + * @param[out] attr The message attribute to be destroyed. + * + * After a successful function call, the given \a attr shall become invalid and + * must not be used in subsequent calls to any LPF primitive. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. + * + * \returns #LPF_SUCCESS A call to this function always succeeds. + * + * \par BSP costs + * None. + * + * \par Runtime costs + * \f$ \Theta( 1 ) \f$. + */ +extern _LPFLIB_API +lpf_err_t lpf_tag_destroy_mattr( + lpf_t ctx, + lpf_msg_attr_t attr +); + +/** + * Destroys a valid synchronization attribute. + * + * The given \a attr must \em not equal #LPF_SYNC_DEFAULT (the default + * synchronization attribute may not be destroyed). The given \a attr must be + * created by this extension \em or by an extension that is compatible with the + * tags extension. + * + * This function may be called on synchronisation attributes created by the + * zero-cost synchronisation extension. + * + * @param[in,out] ctx The LPF context. + * @param[out] attr The message attribute to be destroyed. + * + * After a successful function call, the given \a attr shall become invalid and + * must not be used in subsequent calls to any LPF primitive. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. + * + * \returns #LPF_SUCCESS A call to this function always succeeds. + * + * \par BSP costs + * None. + * + * \par Runtime costs + * \f$ \Theta( 1 ) \f$. + */ +extern _LPFLIB_API +lpf_err_t lpf_tag_destroy_sattr( + lpf_t ctx, + lpf_sync_attr_t attr +); + +/** + * Retrieves a tag from a message attribute. 
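+ *
+ * \par Example
+ * An illustrative fragment (\c mattr is assumed to be a valid message
+ * attribute created via lpf_tag_create_mattr()):
+ * \code
+ * lpf_tag_t tag = LPF_INVALID_TAG;
+ * lpf_tag_get_mattr( ctx, mattr, &tag );
+ * if( tag == LPF_INVALID_TAG ) {
+ *     // no tag was attached to mattr
+ * }
+ * \endcode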
+ * + * @param[in,out] ctx The LPF context. + * @param[in] attr The message attribute. + * @param[out] tag Where to store the tag that was attached to \a attr. + * + * The given \a attr must be valid. + * + * \note An implementation must at least support attribute initialization via + * #lpf_tag_create_mattr. + * + * If \a attr was not attached a tag, then #LPF_INVALID_TAG will be returned at + * \a tag. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. + * + * \returns #LPF_SUCCESS A call to this function always succeeds. + * + * \par BSP costs + * None. + * + * \par Runtime costs + * \f$ \Theta( 1 ) \f$. + */ +extern _LPFLIB_API +lpf_err_t lpf_tag_get_mattr( + lpf_t ctx, + lpf_msg_attr_t attr, + lpf_tag_t * tag +); + +/** + * Attaches a tag to a given message attribute. + * + * @param[in,out] ctx The LPF context. + * @param[in] tag The tag to attach to \a attr. + * @param[in,out] attr Where to attach the \a tag to. + * + * The given \a attr must be valid. + * + * \note An implementation must at least support attribute initialization via + * #lpf_tag_create_mattr. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. + * + * \returns #LPF_SUCCESS A call to this function always succeeds. + * + * \par BSP costs + * None. + * + * \par Runtime costs + * \f$ \Theta( 1 ) \f$. + */ +extern _LPFLIB_API +lpf_err_t lpf_tag_set_mattr( + lpf_t ctx, + lpf_tag_t tag, + lpf_msg_attr_t attr +); + +/** + * Retrieves a tag from a synchronization attribute. + * + * @param[in,out] ctx The LPF context. + * @param[in] attr The synchronization attribute. + * @param[out] tag Where to store the tag that was attached to \a attr. + * + * The given \a attr must be valid. + * + * \note An implementation must at least support attribute initialization via + * #lpf_tag_create_sattr. + * + * If \a attr was not attached a tag, then #LPF_INVALID_TAG will be returned at + * \a tag. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. + * + * \returns #LPF_SUCCESS A call to this function always succeeds. + * + * \par BSP costs + * None. + * + * \par Runime costs + * \f$ \Theta( 1 ) \f$. + */ +extern _LPFLIB_API +lpf_err_t lpf_tag_get_sattr( + lpf_t ctx, + lpf_sync_attr_t attr, + lpf_tag_t * tag +); + +/** + * Attaches a tag to a given synchronization attribute. + * + * @param[in,out] ctx The LPF context. + * @param[in] tag The tag to attach to \a attr. + * @param[in,out] attr Where to attach the \a tag to. + * + * The given \a attr must be valid. + * + * \note An implementation must at least support attribute initialization via + * #lpf_tag_create_sattr. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. + * + * \returns #LPF_SUCCESS A call to this function always succeeds. + * + * \par BSP costs + * None. + * + * \par Runtime costs + * \f$ \Theta( 1 ) \f$. + */ +extern _LPFLIB_API +lpf_err_t lpf_tag_set_sattr( + lpf_t ctx, + lpf_tag_t tag, + lpf_sync_attr_t attr +); + +/** + * @} + * @} + */ + +#ifdef __cplusplus +} +#endif + +#endif // LPFLIB_TAGS_H diff --git a/include/lpf/zero.h b/include/lpf/zero.h new file mode 100644 index 00000000..8302865d --- /dev/null +++ b/include/lpf/zero.h @@ -0,0 +1,333 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LPFLIB_ZERO_H +#define LPFLIB_ZERO_H + +#ifdef __cplusplus +extern "C" { +#endif + +/** \addtogroup LPF_EXTENSIONS LPF API extensions + * @{ + * + * \defgroup LPF_ZERO_COST_SYNC + * + * This extension provides so-called zero-cost synchronisation + * mechanisms on top of LPF. This term was coined by Alpert and Philbin back in + * 1997 [1]. It is rooted in the idea that BSP programs annotate how many bytes + * are expected to be sent and received as part of a given communication phase. + * If, simultaneously, network interfaces can keep track of processed incoming, + * respectively, outgoing bytes, then processes need only query its local + * network interface to determine whether a superstep has completed-- thus + * avoiding the need for either collectives or barriers. + * + * This extension provides a variant of zero-cost synchronisation that is based + * on counting the number of messages rather than number of bytes. It is + * compatible with the concept of a \em tag; see \ref LPF_TAGS. + * + * [1] Alpert, R. and Philbin, J., 1997. cBSP: Zero-cost synchronization in a + * modified BSP model. NEC Research Institute, Princeton, NJ, USA, + * Tech. Rep, pp.97-054. + * + * @{ + */ + +/** + * The specification version of zero-cost synchronisation. + */ +#define LPF_ZERO_COST_SYNC 202500L + +/** + * Creates a new message attribute that is compatible with the LPF zero-cost + * synchronisation extension. + * + * If an implementation supports additional extensions that employ message + * attributes, then attributes initialised by this extension must result in a + * valid message attribute for use with those other extensions also. + * + * \note This does \em not imply that using message attributes from multiple + * extensions simultaneously always yields sensible behaviour; this + * depends on the specification of the extensions. + * + * This extension is compatible with the tags extension. + * + * @param[in,out] ctx The LPF context. + * @param[out] attr Where a new message attribute will be allocated. + * + * After a successful function call, applying the returned \a attr without + * modification shall induce the same behaviour as applying #LPF_MSG_DEFAULT. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. + * + * \returns #LPF_SUCCESS When a new \a attr was successfully constructed. After + * the call to this function, the attribute pointed to by + * \a attr shall be a valid message attribute. + * + * \returns #LPF_ERR_OUT_OF_MEMORY When not enough system resources were + * available to create a new message attribute. + * + * \par BSP costs + * None. + * + * \par Runtime costs + * \f$ \Theta( 1 ) \f$. + */ +extern _LPFLIB_API +lpf_err_t lpf_zero_create_mattr( + lpf_t ctx, + lpf_msg_attr_t * attr +); + +/** + * Creates a new synchronization attribute that is compatible with the LPF + * zero-cost synchronization extension. 
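+ *
+ * \par Example
+ * An illustrative sketch (not normative; the slots \c src and \c dst and the
+ * message size \c size are placeholders registered elsewhere, and \c pid and
+ * \c nprocs are the SPMD arguments) in which every process other than 0 sends
+ * one message to process 0:
+ * \code
+ * lpf_msg_attr_t mattr;
+ * lpf_sync_attr_t sattr;
+ * lpf_zero_create_mattr( ctx, &mattr );
+ * lpf_zero_create_sattr( ctx, &sattr );
+ * const size_t expected_sent = (pid == 0) ? 0 : 1;
+ * const size_t expected_rcvd = (pid == 0) ? nprocs - 1 : 0;
+ * lpf_zero_set_expected( ctx, expected_sent, expected_rcvd, sattr );
+ * if( pid != 0 ) {
+ *     lpf_put( ctx, src, 0, 0, dst, pid * size, size, mattr );
+ * }
+ * lpf_sync( ctx, sattr );
+ * lpf_zero_destroy_mattr( ctx, mattr );
+ * lpf_zero_destroy_sattr( ctx, sattr );
+ * \endcode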
+ * + * If an implementation supports additional extensions that employ + * synchronization attributes, then attributes initialised by this extension + * must result in a valid synchronization attribute for use with those other + * extensions also. + * + * \note This does \em not imply that using synchronization attributes from + * multiple extensions simultaneously always yields sensible behaviour; + * this depends on the specification of the extensions. + * + * This extension is compatible with the tags extension. + * + * @param[in,out] ctx The LPF context. + * @param[out] attr Where a new message attribute will be allocated. + * + * After a successful function call, applying the returned \a attr without + * modification shall induce the same behaviour as applying #LPF_MSG_DEFAULT. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. + * + * \returns #LPF_SUCCESS When a new \a attr was successfully constructed. After + * the call, the attribute pointed to by \a attr shall be + * a valid synchronisation attribute. + * + * \returns #LPF_ERR_OUT_OF_MEMORY When not enough system resources were + * available to create a new message attribute. + * + * \par BSP costs + * None. + * + * \par Runtime costs + * \f$ \Theta( 1 ) \f$. + */ +extern _LPFLIB_API +lpf_err_t lpf_zero_create_sattr( + lpf_t ctx, + lpf_sync_attr_t * attr +); + +/** + * Destroys a valid message attribute. + * + * The given \a attr must \em not equal #LPF_MSG_DEFAULT (the default message + * attribute may not be destroyed). The given \a attr must be created by this + * extension \em or by an extension that is compatible with the tags extension. + * + * This function may be called on message attributes created by the tags + * extension. + * + * @param[in,out] ctx The LPF context. + * @param[out] attr The message attribute to be destroyed. + * + * After a successful function call, the given \a attr shall become invalid and + * must not be used in subsequent calls to any LPF primitive. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. + * + * \returns #LPF_SUCCESS A call to this function always succeeds. + * + * \par BSP costs + * None. + * + * \par Runtime costs + * \f$ \Theta( 1 ) \f$. + */ +extern _LPFLIB_API +lpf_err_t lpf_zero_destroy_mattr( + lpf_t ctx, + lpf_msg_attr_t attr +); + +/** + * Destroys a valid synchronization attribute. + * + * The given \a attr must \em not equal #LPF_SYNC_DEFAULT (the default + * synchronization attribute may not be destroyed). The given \a attr must be + * created by this extension \em or by an extension that is compatible with the + * tags extension. + * + * This function may be called on synchronization attributes created by the tags + * extension. + * + * @param[in,out] ctx The LPF context. + * @param[out] attr The message attribute to be destroyed. + * + * After a successful function call, the given \a attr shall become invalid and + * must not be used in subsequent calls to any LPF primitive. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. + * + * \returns #LPF_SUCCESS A call to this function always succeeds. + * + * \par BSP costs + * None. + * + * \par Runtime costs + * \f$ \Theta( 1 ) \f$. + */ +extern _LPFLIB_API +lpf_err_t lpf_zero_destroy_sattr( + lpf_t ctx, + lpf_sync_attr_t attr +); + +/** + * Attaches zero-cost synchronisation attributes to the given LPF + * synchronisation attribute. + * + * @param[in,out] ctx The LPF context. 
+ * @param[in] expected_sent The expected number of messages sent out from this + * process. + * @param[in] expected_rcvd The expected number of messages received at this + * process. + * @param[in,out] attr Where to attach the zero-cost sync attributes. + * + * The given \a attr must have been created via #lpf_zero_create_sattr or must + * be created by another extension that is compatible with this zero-cost + * synchronization extension. + * + * If the resulting \a attr is used within a subsequent call to #lpf_sync, + * the spec demands that the #lpf_sync call is collective. The zero-cost + * synchronisation extension furthermore requires that each of those collective + * calls to #lpf_sync have matching zero-cost attributes attached to them. Here, + * ``matching'' means that the combination of all attributes given at all + * processes correctly corresponds to the global communication pattern that that + * #lpf_sync requires wait completion for. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. + * + * \returns #LPF_SUCCESS If the attachment of the zero-cost synchronisation + * attributes is successful. + * + * \par BSP costs + * None. + * + * \par Runtime costs + * \f$ \Theta( 1 ) \f$. + */ +extern _LPFLIB_API +lpf_err_t lpf_zero_set_expected( + lpf_t ctx, + size_t expected_sent, size_t expected_rcvd, + lpf_sync_attr_t attr +); + +/** + * Retrieves the attached zero-cost information from the given synchronisation + * attribute. + * + * @param[in,out] ctx The LPF context. + * @param[in] attr The synchronisation attribute to retrieve the + * zero-cost attributes from. + * @param[out] expected_sent Where to store the expected number of sent + * messages. + * @param[out] expected_rcvd Where to store the expected number of received + * messages. + * + * The given \a attr must have been created via #lpf_zero_create_sattr or must + * be created by another extension that is compatible with this zero-cost + * synchronization extension. + * + * If \a attr did not have a preceding call to #lpf_zero_set_expected, then the + * default values (0) are returned. An expected zero for both received and sent + * number of messages indicates a regular (non zero-cost) synchronization. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. + * + * \returns #LPF_SUCCESS A call to this function always succeeds. + * + * \par BSP costs + * None. + * + * \par Runtime costs + * \f$ \Theta( 1 ) \f$. + */ +extern _LPFLIB_API +lpf_err_t lpf_zero_get_expected( + lpf_t ctx, + lpf_sync_attr_t attr, + size_t * expected_sent, size_t * expected_rcvd +); + +/** + * Retrieves the current locally-received number of messages. + * + * @param[in,out] ctx The LPF context. + * @param[in] attr The synchronisation attribute to retrieve the + * status of. + * @param[out] rcvd Where to store the number of received messages. + * @param[out] sent Where to store the number of sent messages. + * + * The given \a attr must have been created via #lpf_zero_create_sattr or must + * be created by another extension that is compatible with this zero-cost + * synchronization extension. + * + * \note Rationale: this function is useful for implementing task-aware + * interfaces around zero-cost synchronisation mechanisms. + * + * \par Thread safety + * This function is safe to be called from different LPF processes only. + * + * \returns #LPF_SUCCESS A call to this function always succeeds. + * + * \par BSP costs + * None. 
+ * + * \par Runtime costs + * \f$ \Theta( 1 ) \f$. + * + * \note A call to this function may imply querying the network interface, + * and hence the constant-time factor of a call to this function may be + * non-trivial; use of this function is recommended to be sparingly. + */ +extern _LPFLIB_API +lpf_err_t lpf_zero_get_status( + lpf_t ctx, lpf_sync_attr_t attr, + size_t * rcvd, size_t * sent +); + +/** + * @} + * @} + */ + +#ifdef __cplusplus +} +#endif + +#endif // LPFLIB_ZERO_H diff --git a/lpfrun.in b/lpfrun.in index 640fdc00..558a96d5 100644 --- a/lpfrun.in +++ b/lpfrun.in @@ -57,7 +57,7 @@ function printhelp() echo echo " -engine " echo " Allow you to choose the engine. Currently supported" - echo " are: pthread, mpirma, mpimsg, ibverbs, hybrid" + echo " are: pthread, mpirma, mpimsg, ibverbs, zero, hybrid" echo echo " -probe " echo " Set the number of seconds to probe the system for BSP" @@ -846,7 +846,7 @@ case $engine in exit_status=$? ;; - mpirma|mpimsg|ibverbs) + mpirma|mpimsg|ibverbs|zero) mpi_impl=$(mpi_detect) proc_args= @@ -1128,8 +1128,8 @@ case $engine in ;; *) - echo "Engine '$engine' is not supported. Please choose 'pthread'," - echo "'mpirma', or 'hybrid'" + echo "Engine '$engine' is not supported. Please choose " + echo "'pthread', 'mpirma', 'mpimsg', 'ibverbs, 'zero', 'hybrid'" exit_status=1 ;; esac diff --git a/post-install/post-install-test.cmake.in b/post-install/post-install-test.cmake.in index edd06922..05786d26 100644 --- a/post-install/post-install-test.cmake.in +++ b/post-install/post-install-test.cmake.in @@ -353,6 +353,9 @@ endif() ###### CMake integration using generated CMake module file ############ foreach(engine @ENGINES@) + if ("${engine}" STREQUAL "zero") + continue() + endif() message("Testing generated CMake module files for engine ${engine}") set(test_dir @builddir@/cmake-module-test-${engine}) diff --git a/post-install/test-lpf-nprocs.c b/post-install/test-lpf-nprocs.c index cf274b3f..554b5775 100644 --- a/post-install/test-lpf-nprocs.c +++ b/post-install/test-lpf-nprocs.c @@ -53,6 +53,8 @@ void spmd( lpf_t lpf, lpf_pid_t pid, lpf_pid_t nprocs, lpf_args_t args ) lpf_memslot_t mem_slot = LPF_INVALID_MEMSLOT; lpf_register_global( lpf, mem, nprocs, &mem_slot ); + lpf_sync(lpf, LPF_SYNC_DEFAULT); + if (pid != 0) lpf_get( lpf, 0, params_slot, 0, params_slot, 0, sizeof(params), LPF_MSG_DEFAULT ); diff --git a/src/MPI/CMakeLists.txt b/src/MPI/CMakeLists.txt index 757b9004..864bdca2 100644 --- a/src/MPI/CMakeLists.txt +++ b/src/MPI/CMakeLists.txt @@ -23,7 +23,7 @@ if (MPI_FOUND) endif() if (ENABLE_IBVERBS) - list(APPEND MPI_ENGINES ibverbs) + list(APPEND MPI_ENGINES ibverbs zero) endif() if (MPI_IBARRIER) @@ -49,10 +49,12 @@ if (MPI_FOUND) set(ibverbs_sources) if (LPF_IMPL_ID STREQUAL ibverbs) - set(ibverbs_sources ibverbs.cpp) - endif() - - add_library(raw_${libname} OBJECT + set(ibverbs_sources ibverbs.cpp) + endif() + if (LPF_IMPL_ID STREQUAL zero) + set(ibverbs_sources zero.cpp) + endif() + add_library(raw_${libname} OBJECT memorytable.cpp mesgqueue.cpp mpilib.cpp @@ -65,61 +67,61 @@ if (MPI_FOUND) spall2all.c messagesort.cpp spall2all.cpp - init.cpp + init.cpp ${ibverbs_sources} ) - target_compile_flags(raw_${libname} + target_compile_flags(raw_${libname} INTERFACE "-fPIC") - target_compile_definitions(raw_${libname} + target_compile_definitions(raw_${libname} PRIVATE "LPF_CORE_MPI_USES_${LPF_IMPL_ID}=1" "LPF_CORE_WARM_UP_PROBE=1" "LPF_CORE_IMPL_ID=${LPF_IMPL_ID}" "LPF_CORE_IMPL_CONFIG=${LPF_IMPL_CONFIG}" - ) - 
target_include_directories(raw_${libname} - PRIVATE ${MPI_C_INCLUDE_PATH} - ) - if (iface STREQUAL "spec_") - target_compile_definitions(raw_${libname} + ) + target_include_directories(raw_${libname} + PRIVATE ${MPI_C_INCLUDE_PATH} + ) + if (iface STREQUAL "spec_") + target_compile_definitions(raw_${libname} PRIVATE "LPF_CORE_STATIC_DISPATCH=1" "LPF_CORE_STATIC_DISPATCH_ID=${LPF_IMPL_ID}" "LPF_CORE_STATIC_DISPATCH_CONFIG=${LPF_IMPL_CONFIG}" ) - endif() + endif() - #Always build the shared library, because we need that for the lpfrun - add_library(${libname} SHARED - $ + #Always build the shared library, because we need that for the lpfrun + add_library(${libname} SHARED + $ $ - ) - set_target_properties(${libname} PROPERTIES SOVERSION ${SOVERSION} + ) + set_target_properties(${libname} PROPERTIES SOVERSION ${SOVERSION} MACOSX_RPATH TRUE) - target_compile_flags(${libname} + target_compile_flags(${libname} INTERFACE "-fPIC") - if (iface STREQUAL "spec_") - target_compile_definitions(${libname} - INTERFACE "LPF_CORE_STATIC_DISPATCH=1" + if (iface STREQUAL "spec_") + target_compile_definitions(${libname} + INTERFACE "LPF_CORE_STATIC_DISPATCH=1" "LPF_CORE_STATIC_DISPATCH_ID=${LPF_IMPL_ID}" "LPF_CORE_STATIC_DISPATCH_CONFIG=${LPF_IMPL_CONFIG}" + ) + endif() + target_include_directories(${libname} + PUBLIC ${MPI_C_INCLUDE_PATH} + INTERFACE $ + $ ) - endif() - target_include_directories(${libname} - PUBLIC ${MPI_C_INCLUDE_PATH} - INTERFACE $ - $ - ) - endforeach(LPF_IMPL_ID) + endforeach(LPF_IMPL_ID) endforeach(iface) # link function that e.g. hybrid implementation can also use. function(lpf_link_mpi_core target engine) - target_link_libraries(${target} + target_link_libraries(${target} ${MPI_C_LIBRARIES} ${LIB_MATH} ${LIB_DL} @@ -127,9 +129,9 @@ if (MPI_FOUND) ${LIB_POSIX_THREADS} ) - if (engine STREQUAL ibverbs) - target_link_libraries(${target} ${LIB_IBVERBS}) - endif() + if (engine STREQUAL ibverbs OR engine STREQUAL zero) + target_link_libraries(${target} ${LIB_IBVERBS}) + endif() endfunction() @@ -144,15 +146,15 @@ if (MPI_FOUND) ARCHIVE DESTINATION ${INSTALL_LIB} ) endforeach() - + include_directories(${MPI_C_INCLUDE_PATH}) - # add a test for dynamichook + # add a test for dynamichook if (NOT IS_OPENMPI AND LPF_ENABLE_TESTS) add_gtest(dynamichook.t "mpimsg" ON - ${CMAKE_CURRENT_SOURCE_DIR}/dynamichook.t.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dynamichook.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dynamichook.t.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dynamichook.cpp ${CMAKE_CURRENT_SOURCE_DIR}/mpilib.cpp) - + configure_file( dynamichook.t.sh.in dynamichook.t.sh @ONLY) set( dynamic_hook_t_sh "${CMAKE_CURRENT_BINARY_DIR}/dynamichook.t.sh") add_test(NAME dynamichook_1proc @@ -173,25 +175,29 @@ if (MPI_FOUND) # Other unit tests if (ENABLE_IBVERBS AND LPF_ENABLE_TESTS) - add_gtest( ibverbs_test "ibverbs" ON ${CMAKE_CURRENT_SOURCE_DIR}/ibverbs.t.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/ibverbs.cpp + add_gtest( ibverbs_test "ibverbs" ON ${CMAKE_CURRENT_SOURCE_DIR}/ibverbs.t.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/ibverbs.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/mpilib.cpp) + + add_gtest( zero_test "zero" ON ${CMAKE_CURRENT_SOURCE_DIR}/zero.t.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/zero.cpp ${CMAKE_CURRENT_SOURCE_DIR}/mpilib.cpp) endif() foreach (engine ${MPI_ENGINES}) add_gtest( spall2all_test_${engine} ${engine} ON - ${CMAKE_CURRENT_SOURCE_DIR}/spall2all.t.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/spall2all.c - ${CMAKE_CURRENT_SOURCE_DIR}/spall2all.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/spall2all.t.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/spall2all.c + 
${CMAKE_CURRENT_SOURCE_DIR}/spall2all.cpp ${CMAKE_CURRENT_SOURCE_DIR}/mpilib.cpp) add_gtest( dall2all_test_${engine} ${engine} ON - ${CMAKE_CURRENT_SOURCE_DIR}/dall2all.t.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dall2all.t.cpp ${CMAKE_CURRENT_SOURCE_DIR}/mpilib.cpp) if (MPI_IBARRIER) add_gtest( hall2all_test_${engine} ${engine} ON - ${CMAKE_CURRENT_SOURCE_DIR}/hall2all.t.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/hall2all.t.cpp ${CMAKE_CURRENT_SOURCE_DIR}/mpilib.cpp) endif() diff --git a/src/MPI/core.cpp b/src/MPI/core.cpp index 94a9658f..c4f7f900 100644 --- a/src/MPI/core.cpp +++ b/src/MPI/core.cpp @@ -16,6 +16,7 @@ */ #include +#include #include #include @@ -41,8 +42,8 @@ // that may deviate from the stdlib abort() const int LPF_HAS_ABORT = 2; -// Error codes. -// Note: Some code (e.g. in process::broadcastSymbol) depends on the +// Error codes. +// Note: Some code (e.g. in process::broadcastSymbol) depends on the // fact that numbers are assigned in order of severity, where 0 means // no error and 3 means unrecoverable error. That way the severest error // status can be replicated through Communication::allreduceMax @@ -50,11 +51,13 @@ const lpf_err_t LPF_SUCCESS = 0; const lpf_err_t LPF_ERR_OUT_OF_MEMORY = 1; const lpf_err_t LPF_ERR_FATAL = 2; +const lpf_tag_t LPF_INVALID_TAG = std::numeric_limits< uint32_t >::max(); + const lpf_args_t LPF_NO_ARGS = { NULL, 0, NULL, 0, NULL, 0 }; -const lpf_sync_attr_t LPF_SYNC_DEFAULT = 0; +const lpf_sync_attr_t LPF_SYNC_DEFAULT = NULL; -const lpf_msg_attr_t LPF_MSG_DEFAULT = 0; +const lpf_msg_attr_t LPF_MSG_DEFAULT = NULL; const lpf_pid_t LPF_MAX_P = UINT_MAX; @@ -66,13 +69,13 @@ const lpf_init_t LPF_INIT_NONE = NULL; extern "C" const int LPF_MPI_AUTO_INITIALIZE __attribute__((weak)) = 1; -const lpf_t LPF_ROOT = static_cast(const_cast("LPF_ROOT")) ; +const lpf_t LPF_ROOT = static_cast(const_cast("LPF_ROOT")) ; const lpf_machine_t LPF_INVALID_MACHINE = { 0, 0, NULL, NULL }; namespace { lpf::Interface * realContext( lpf_t ctx ) - { + { if ( LPF_ROOT == ctx ) return lpf::Interface::root(); else @@ -80,6 +83,7 @@ namespace { } } +// MPI extension lpf_err_t lpf_mpi_initialize_with_mpicomm( MPI_Comm comm, lpf_init_t * init) { @@ -92,9 +96,9 @@ lpf_err_t lpf_mpi_initialize_with_mpicomm( MPI_Comm comm, lpf_init_t * init) return status; } -lpf_err_t lpf_mpi_initialize_over_tcp( +lpf_err_t lpf_mpi_initialize_over_tcp( const char * server, const char * port, int timeout, - lpf_pid_t pid, lpf_pid_t nprocs, + lpf_pid_t pid, lpf_pid_t nprocs, lpf_init_t * init ) { try { @@ -103,7 +107,7 @@ lpf_err_t lpf_mpi_initialize_over_tcp( // Create an MPI communicator MPI_Comm comm = lpf::mpi::dynamicHook( - server, port, pid, nprocs, + server, port, pid, nprocs, lpf::Time::fromSeconds( timeout / 1000.0) ); // wrap it @@ -143,7 +147,7 @@ lpf_err_t lpf_mpi_initialize_over_tcp( } lpf_err_t lpf_mpi_finalize( lpf_init_t context ) { - + lpf_err_t status = LPF_SUCCESS; delete static_cast< lpf::mpi::Comm *>(context); @@ -151,6 +155,265 @@ lpf_err_t lpf_mpi_finalize( lpf_init_t context ) { return status; } +// tags extension + +lpf_err_t lpf_tag_create_mattr( + lpf_t ctx, + lpf_msg_attr_t * attr +) +{ + (void) ctx; + *attr = LPF_MSG_DEFAULT; + return LPF_SUCCESS; +} + +lpf_err_t lpf_tag_destroy_mattr( + lpf_t ctx, + lpf_msg_attr_t attr +) +{ + (void) ctx; + (void) attr; + return LPF_SUCCESS; +} + +lpf_err_t lpf_tag_create_sattr( + lpf_t ctx, + lpf_sync_attr_t * attr +) +{ + lpf_err_t ret = LPF_SUCCESS; + lpf::Interface * i = realContext(ctx); + if (!i->isAborted()) { + try { + ret = 
i->createNewSyncAttr(attr); + } catch (const std::bad_alloc &) { + LOG(2, "lpf_tag_create_sattr: out of memory (bad_alloc)"); + return LPF_ERR_OUT_OF_MEMORY; + } catch (const std::exception &e) { + LOG(1, "lpf_tag_create_sattr fatal error: " << e.what()); + return LPF_ERR_FATAL; + } + } + return ret; +} + +lpf_err_t lpf_tag_destroy_sattr( + lpf_t ctx, + lpf_sync_attr_t attr +) +{ + lpf::Interface * i = realContext(ctx); + if (!i->isAborted()) { + i->destroySyncAttr(attr); + } + return LPF_SUCCESS; +} + +lpf_err_t lpf_tag_get_mattr( + lpf_t ctx, + lpf_msg_attr_t attr, + lpf_tag_t * tag +) +{ + (void) ctx; + ASSERT( tag != NULL ); + *tag = *static_cast< uint32_t * >(attr); + return LPF_SUCCESS; +} + +lpf_err_t lpf_tag_get_sattr( + lpf_t ctx, + lpf_sync_attr_t attr, + lpf_tag_t * tag +) +{ + ASSERT( tag != NULL ); + lpf::Interface * i = realContext(ctx); + if (!i->isAborted()) { + *tag = i->getTagFromSyncAttr(attr); + } + return LPF_SUCCESS; +} + +lpf_err_t lpf_tag_set_sattr( + lpf_t ctx, + lpf_tag_t tag, + lpf_sync_attr_t attr +) +{ + ASSERT( attr != NULL ); + lpf::Interface * i = realContext(ctx); + if (!i->isAborted()) { + i->setTagInSyncAttr(tag,attr); + } + return LPF_SUCCESS; +} + +lpf_err_t lpf_tag_set_mattr( + lpf_t ctx, + lpf_tag_t tag, + lpf_msg_attr_t attr +) +{ + (void) ctx; + ASSERT( attr != NULL ); + *static_cast< uint32_t * >(attr) = tag; + return LPF_SUCCESS; +} + +lpf_err_t lpf_resize_tag_register( + lpf_t ctx, + size_t max_tags +) +{ + lpf::Interface * i = realContext(ctx); + if (i->isAborted()) + return LPF_SUCCESS; + + try { + return i->resizeTagRegister(max_tags); + } catch (const std::exception & e) { + LOG(1, "lpf_resize_tag_register fatal error: " << e.what()); + return LPF_ERR_FATAL; + } +} + +lpf_err_t lpf_tag_create( + lpf_t ctx, + bool active, + lpf_tag_t * tag +) +{ + (void)active; + lpf::Interface * i = realContext(ctx); + if (!i->isAborted()) { + try { + *tag = i->registerTag(); + } catch (const std::exception & e) { + LOG(1, "lpf_tag_create fatal error: " << e.what()); + return LPF_ERR_FATAL; + } + } + return LPF_SUCCESS; +} + +lpf_err_t lpf_tag_destroy( + lpf_t ctx, + lpf_tag_t tag +) +{ + lpf::Interface * i = realContext(ctx); + if (!i->isAborted()) { + try { + i->destroyTag(tag); + } catch (const std::exception & e) { + LOG(1, "lpf_tag_destroy fatal error: " << e.what()); + return LPF_ERR_FATAL; + } + } + return LPF_SUCCESS; +} + +// zero-cost extension + +lpf_err_t lpf_zero_create_sattr( + lpf_t ctx, + lpf_sync_attr_t * attr +) +{ + return lpf_tag_create_sattr(ctx,attr); +} + +lpf_err_t lpf_zero_destroy_sattr( + lpf_t ctx, + lpf_sync_attr_t attr +) +{ + return lpf_tag_destroy_sattr(ctx,attr); +} + +lpf_err_t lpf_zero_create_mattr( + lpf_t ctx, + lpf_msg_attr_t * attr +) +{ + return lpf_tag_create_mattr(ctx,attr); +} + +lpf_err_t lpf_zero_destroy_mattr( + lpf_t ctx, + lpf_msg_attr_t attr +) +{ + return lpf_tag_destroy_mattr(ctx,attr); +} + +lpf_err_t lpf_zero_set_expected( + lpf_t ctx, + size_t expected_sent, size_t expected_rcvd, + lpf_sync_attr_t attr +) +{ + ASSERT( attr != NULL ); + lpf::Interface * i = realContext(ctx); + if (!i->isAborted()) { + i->setZCAttr(expected_sent,expected_rcvd,attr); + } + return LPF_SUCCESS; +} + +lpf_err_t lpf_zero_get_expected( + lpf_t ctx, + lpf_sync_attr_t attr, + size_t * expected_sent, size_t * expected_rcvd +) +{ + ASSERT( attr != NULL ); + ASSERT( expected_sent != NULL ); + ASSERT( expected_rcvd != NULL ); + lpf::Interface * i = realContext(ctx); + if (!i->isAborted()) { + 
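+ // read back the expected send/receive counts previously stored in this
+ // attribute by lpf_zero_set_expected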
i->getZCAttr(attr,*expected_sent,*expected_rcvd); + } + return LPF_SUCCESS; +} + +lpf_err_t lpf_zero_get_status( + lpf_t ctx, lpf_sync_attr_t attr, + size_t * rcvd, size_t * sent +) +{ + lpf::Interface * i = realContext(ctx); + if (!i->isAborted()) { + i->getRcvdMsgCount(rcvd,attr); + i->getSentMsgCount(sent,attr); + } + return LPF_SUCCESS; +} + +// non-coherent extension + +lpf_err_t lpf_noc_flush_sent( lpf_t ctx) +{ + lpf::Interface * i = realContext(ctx); + if (!i->isAborted()) { + i->flushSent(); + } + return LPF_SUCCESS; +} + +lpf_err_t lpf_noc_flush_received( lpf_t ctx) +{ + lpf::Interface * i = realContext(ctx); + if (!i->isAborted()) { + i->flushReceived(); + } + return LPF_SUCCESS; +} + +// core functionality + lpf_err_t lpf_hook( lpf_init_t _init, lpf_spmd_t spmd, @@ -173,7 +436,7 @@ lpf_err_t lpf_rehook( lpf_err_t lpf_exec( lpf_t ctx, - lpf_pid_t P, + lpf_pid_t P, lpf_spmd_t spmd, lpf_args_t args ) @@ -223,48 +486,43 @@ lpf_err_t lpf_deregister( } lpf_err_t lpf_put( lpf_t ctx, - lpf_memslot_t src_slot, - size_t src_offset, - lpf_pid_t dst_pid, - lpf_memslot_t dst_slot, - size_t dst_offset, - size_t size, - lpf_msg_attr_t attr + lpf_memslot_t src_slot, + size_t src_offset, + lpf_pid_t dst_pid, + lpf_memslot_t dst_slot, + size_t dst_offset, + size_t size, + lpf_msg_attr_t attr ) { - (void) attr; // ignore parameter 'msg' since this implementation only + (void) attr; // ignore parameter 'msg' since this implementation only // implements core functionality lpf::Interface * i = realContext(ctx); if (!i->isAborted()) - i->put( src_slot, src_offset, dst_pid, dst_slot, dst_offset, size ); + i->put( src_slot, src_offset, dst_pid, dst_slot, dst_offset, size, attr); return LPF_SUCCESS; } - lpf_err_t lpf_get( - lpf_t ctx, - lpf_pid_t pid, - lpf_memslot_t src, - size_t src_offset, - lpf_memslot_t dst, + lpf_t ctx, + lpf_pid_t pid, + lpf_memslot_t src, + size_t src_offset, + lpf_memslot_t dst, lpf_memslot_t dst_offset, size_t size, lpf_msg_attr_t attr ) { - (void) attr; // ignore parameter 'msg' since this implementation only - // implements core functionality lpf::Interface * i = realContext(ctx); if (!i->isAborted()) - i->get( pid, src, src_offset, dst, dst_offset, size ); + i->get( pid, src, src_offset, dst, dst_offset, size, attr); return LPF_SUCCESS; } lpf_err_t lpf_sync( lpf_t ctx, lpf_sync_attr_t attr ) { - (void) attr; // ignore attr parameter since this implementation only - // implements core functionality - return realContext(ctx)->sync(); + return realContext(ctx)->sync(attr); } lpf_err_t lpf_probe( lpf_t ctx, lpf_machine_t * params ) @@ -282,7 +540,7 @@ lpf_err_t lpf_resize_memory_register( lpf_t ctx, size_t max_regs ) lpf::Interface * i = realContext(ctx); if (i->isAborted()) return LPF_SUCCESS; - + return i->resizeMemreg(max_regs); } @@ -291,7 +549,7 @@ lpf_err_t lpf_resize_message_queue( lpf_t ctx, size_t max_msgs ) lpf::Interface * i = realContext(ctx); if (i->isAborted()) return LPF_SUCCESS; - + return i->resizeMesgQueue(max_msgs); } @@ -301,4 +559,3 @@ lpf_err_t lpf_abort( lpf_t ctx ) { return LPF_SUCCESS; } - diff --git a/src/MPI/ibverbs.cpp b/src/MPI/ibverbs.cpp index 44852caa..73103aad 100644 --- a/src/MPI/ibverbs.cpp +++ b/src/MPI/ibverbs.cpp @@ -45,14 +45,11 @@ namespace { } } - IBVerbs :: IBVerbs( Communication & comm ) : m_pid( comm.pid() ) , m_nprocs( comm.nprocs() ) - , m_devName() , m_ibPort( Config::instance().getIBPort() ) , m_gidIdx( Config::instance().getIBGidIndex() ) - , m_mtu( getMTU( Config::instance().getIBMTU() )) , m_maxRegSize(0) , m_maxMsgSize(0) , 
m_minNrMsgs(0) @@ -60,19 +57,21 @@ IBVerbs :: IBVerbs( Communication & comm ) , m_device() , m_pd() , m_cq() + , m_dummyMemReg() + , m_comm( comm ) + , m_mtu( getMTU( Config::instance().getIBMTU() )) + , m_devName() , m_stagedQps( m_nprocs ) , m_connectedQps( m_nprocs ) , m_srs() , m_srsHeads( m_nprocs, 0u ) , m_nMsgsPerPeer( m_nprocs, 0u ) - , m_activePeers(0, m_nprocs) , m_peerList() , m_sges() , m_wcs(m_nprocs) - , m_memreg() - , m_dummyMemReg() , m_dummyBuffer() - , m_comm( comm ) + , m_activePeers(0, m_nprocs) + , m_memreg() { m_peerList.reserve( m_nprocs ); @@ -97,7 +96,6 @@ IBVerbs :: IBVerbs( Communication & comm ) throw Exception( "No Infiniband devices available" ); } - std::string wantDevName = Config::instance().getIBDeviceName(); LOG( 3, "Searching for device '"<< wantDevName << "'" ); struct ibv_device * dev = NULL; @@ -144,7 +142,8 @@ IBVerbs :: IBVerbs( Communication & comm ) // maximum number of work requests per Queue Pair m_maxSrs = std::min( m_deviceAttr.max_qp_wr, // maximum work requests per QP m_deviceAttr.max_cqe ); // maximum entries per CQ - LOG(3, "Maximum number of send requests is the minimum of " + + LOG(3, "Initial maximum number of send requests is the minimum of " << m_deviceAttr.max_qp_wr << " (the maximum of work requests per QP)" << " and " << m_deviceAttr.max_cqe << " (the maximum of completion " << " queue entries per QP), nameley " << m_maxSrs ); @@ -196,6 +195,58 @@ IBVerbs :: IBVerbs( Communication & comm ) LOG(3, "Allocated completion queue with " << m_nprocs << " entries."); + /* + * Unfortunately, some RDMA devices advertise max_qp_wr but + * support a much smaller number. We can probe that. + * Note that the inofficial documentation on rdmamojo.com states: + * + * There may be RDMA devices that for specific transport types may support + * less outstanding Work Requests than the maximum reported value. 
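+ *
+ * In practice this means that a queue pair creation request whose
+ * max_send_wr equals the advertised maximum can fail with errno set to
+ * EINVAL, even though a smaller value is accepted.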
+ * + * Therefore, we here do binary search to find the actual value + */ + struct ibv_qp_init_attr testAttr; + (void) std::memset(&testAttr, 0, sizeof(testAttr)); + + // We only care about the attr.cap.max_send_wr + testAttr.qp_type = IBV_QPT_RC; + + struct ibv_qp * ibv_new_qp_p; + testAttr.cap.max_send_wr = m_maxSrs; + testAttr.send_cq = m_cq.get(); + testAttr.recv_cq = m_cq.get(); + ibv_new_qp_p = ibv_create_qp(m_pd.get(), &testAttr); + if (ibv_new_qp_p == NULL) { + size_t left = 1; + size_t right = m_maxSrs; + size_t largestOkaySize = 0; + while (left <= right) + { + size_t mid = (left + right) / 2; + testAttr.cap.max_send_wr = mid; + // test if call succeeds + ibv_new_qp_p = ibv_create_qp(m_pd.get(), &testAttr); + if (ibv_new_qp_p == NULL) { + if (errno != EINVAL) { // error points to unsupported max_send_wr by device + throw Exception("Unexpected error code during binary search for maximum send WR."); + } + else { + right = mid - 1; + } + } + else { + // clean up dummy QP + ibv_destroy_qp(ibv_new_qp_p); + left = mid + 1; + // record that we still succeed + largestOkaySize = mid; + } + } + ASSERT(largestOkaySize > 0); + m_maxSrs = largestOkaySize; + LOG(3, "Revised maximum number of send requests is " << m_maxSrs ); + } + // allocate dummy buffer m_dummyBuffer.resize( 8 ); struct ibv_mr * const ibv_reg_mr_new_p = ibv_reg_mr( @@ -237,6 +288,7 @@ void IBVerbs :: stageQPs( size_t maxMsgs ) attr.cap.max_recv_sge = 1; struct ibv_qp * const ibv_new_qp_p = ibv_create_qp( m_pd.get(), &attr ); + if( ibv_new_qp_p == NULL ) { m_stagedQps[i].reset(); } else { @@ -460,8 +512,8 @@ IBVerbs :: SlotID IBVerbs :: regLocal( void * addr, size_t size ) MemoryRegistration local; local.addr = addr; local.size = size; - local.lkey = size?slot.mr->lkey:0; - local.rkey = size?slot.mr->rkey:0; + local.lkey = size ? slot.mr->lkey : 0; + local.rkey = size ? slot.mr->rkey : 0; SlotID id = m_memreg.addLocalReg( slot ); @@ -504,8 +556,8 @@ IBVerbs :: SlotID IBVerbs :: regGlobal( void * addr, size_t size ) MemoryRegistration local; local.addr = addr; local.size = size; - local.lkey = size?slot.mr->lkey:0; - local.rkey = size?slot.mr->rkey:0; + local.lkey = size ? slot.mr->lkey : 0; + local.rkey = size ? 
slot.mr->rkey : 0; LOG(4, "All-gathering memory register data" ); diff --git a/src/MPI/ibverbs.hpp b/src/MPI/ibverbs.hpp index a96030a2..ab3685db 100644 --- a/src/MPI/ibverbs.hpp +++ b/src/MPI/ibverbs.hpp @@ -20,7 +20,7 @@ #include #include -#if __cplusplus >= 201103L +#if __cplusplus >= 201103L #include #else #include @@ -34,18 +34,18 @@ #include "memreg.hpp" namespace lpf { - + class Communication; - + namespace mpi { -#if __cplusplus >= 201103L +#if __cplusplus >= 201103L using std::shared_ptr; #else using std::tr1::shared_ptr; #endif -class _LPFLIB_LOCAL IBVerbs +class _LPFLIB_LOCAL IBVerbs { public: struct Exception; @@ -57,7 +57,7 @@ class _LPFLIB_LOCAL IBVerbs void resizeMemreg( size_t size ); void resizeMesgq( size_t size ); - + SlotID regLocal( void * addr, size_t size ); SlotID regGlobal( void * addr, size_t size ); void dereg( SlotID id ); @@ -66,30 +66,29 @@ class _LPFLIB_LOCAL IBVerbs return m_maxMsgSize; } - void put( SlotID srcSlot, size_t srcOffset, + void put( SlotID srcSlot, size_t srcOffset, int dstPid, SlotID dstSlot, size_t dstOffset, size_t size ); - void get( int srcPid, SlotID srcSlot, size_t srcOffset, + void get( int srcPid, SlotID srcSlot, size_t srcOffset, SlotID dstSlot, size_t dstOffset, size_t size ); - // Do the communication and synchronize // 'Reconnect' must be a globally replicated value void sync( bool reconnect); + private: IBVerbs & operator=(const IBVerbs & ); // assignment prohibited IBVerbs( const IBVerbs & ); // copying prohibited - void stageQPs(size_t maxMsgs ); - void reconnectQPs(); - + void stageQPs(size_t maxMsgs ); + void reconnectQPs(); struct MemoryRegistration { - void * addr; - size_t size; - uint32_t lkey; - uint32_t rkey; + void * addr; + size_t size; + uint32_t lkey; + uint32_t rkey; }; struct MemorySlot { @@ -97,51 +96,55 @@ class _LPFLIB_LOCAL IBVerbs std::vector< MemoryRegistration > glob; // array for global registrations }; - int m_pid; // local process ID - int m_nprocs; // number of processes + int m_pid; // local process ID + int m_nprocs; // number of processes + int m_ibPort; // local IB port to work with + int m_gidIdx; + size_t m_maxRegSize; + size_t m_maxMsgSize; + size_t m_minNrMsgs; + size_t m_maxSrs; // maximum number of sends requests per QP + + shared_ptr< struct ibv_context > m_device; // device handle + shared_ptr< struct ibv_pd > m_pd; // protection domain + shared_ptr< struct ibv_cq > m_cq; // complation queue + shared_ptr< struct ibv_mr > m_dummyMemReg; // registration of dummy + // buffer + Communication & m_comm; + + ibv_mtu m_mtu; + + std::string m_devName; // IB device name - std::string m_devName; // IB device name - int m_ibPort; // local IB port to work with - int m_gidIdx; - uint16_t m_lid; // LID of the IB port - ibv_mtu m_mtu; struct ibv_device_attr m_deviceAttr; - size_t m_maxRegSize; - size_t m_maxMsgSize; - size_t m_minNrMsgs; - size_t m_maxSrs; // maximum number of sends requests per QP - shared_ptr< struct ibv_context > m_device; // device handle - shared_ptr< struct ibv_pd > m_pd; // protection domain - shared_ptr< struct ibv_cq > m_cq; // complation queue + uint16_t m_lid; // LID of the IB port // Disconnected queue pairs - std::vector< shared_ptr< struct ibv_qp > > m_stagedQps; + std::vector< shared_ptr< struct ibv_qp > > m_stagedQps; // Connected queue pairs - std::vector< shared_ptr< struct ibv_qp > > m_connectedQps; + std::vector< shared_ptr< struct ibv_qp > > m_connectedQps; + std::vector< struct ibv_send_wr > m_srs; // array of send requests + std::vector< size_t > m_srsHeads; // head of 
send queue per + // peer + std::vector< size_t > m_nMsgsPerPeer; // number of messages per + // peer + std::vector< pid_t > m_peerList; - std::vector< struct ibv_send_wr > m_srs; // array of send requests - std::vector< size_t > m_srsHeads; // head of send queue per peer - std::vector< size_t > m_nMsgsPerPeer; // number of messages per peer - SparseSet< pid_t > m_activePeers; // - std::vector< pid_t > m_peerList; + std::vector< struct ibv_sge > m_sges; // array of scatter/gather + // entries + std::vector< struct ibv_wc > m_wcs; // array of work completions + std::vector< char > m_dummyBuffer; // dummy receive buffer - std::vector< struct ibv_sge > m_sges; // array of scatter/gather entries - std::vector< struct ibv_wc > m_wcs; // array of work completions + SparseSet< pid_t > m_activePeers; CombinedMemoryRegister< MemorySlot > m_memreg; - - shared_ptr< struct ibv_mr > m_dummyMemReg; // registration of dummy buffer - std::vector< char > m_dummyBuffer; // dummy receive buffer - - Communication & m_comm; }; - } } diff --git a/src/MPI/ibverbs.t.cpp b/src/MPI/ibverbs.t.cpp index 8b916711..dc2e80a5 100644 --- a/src/MPI/ibverbs.t.cpp +++ b/src/MPI/ibverbs.t.cpp @@ -226,7 +226,6 @@ TEST_F( IBVerbsTests, getAllToAll ) verbs->sync(true); - EXPECT_EQ(a, a2); EXPECT_EQ(b, b2); diff --git a/src/MPI/init.cpp b/src/MPI/init.cpp index 68d16866..97768de1 100644 --- a/src/MPI/init.cpp +++ b/src/MPI/init.cpp @@ -54,9 +54,10 @@ namespace lpf { (engine.compare( "mpirma" ) == 0) || (engine.compare( "mpimsg" ) == 0) || (engine.compare( "ibverbs" ) == 0) || + (engine.compare( "zero" ) == 0) || (engine.compare( "hybrid" ) == 0); if( !engine_is_MPI ) { - (void) std::fprintf( stderr, "Warning: program was compiled for the mpirma, mpimsg, ibverbs, or hybrid engine but run-time requests the %s engine instead. For stable results please compile the program into a universal LPF program (by omitting the -engine flag to the lpfcc/lpfcxx utilities).\n", engine.c_str() ); + (void) std::fprintf( stderr, "Warning: program was compiled for the mpirma, mpimsg, ibverbs, zero, or hybrid engine but run-time requests the %s engine instead. 
For stable results please compile the program into a universal LPF program (by omitting the -engine flag to the lpfcc/lpfcxx utilities).\n", engine.c_str() ); } if( mpi_initializer_ran || !engine_is_MPI ) { diff --git a/src/MPI/interface.cpp b/src/MPI/interface.cpp index 30ece40d..e7f7374a 100644 --- a/src/MPI/interface.cpp +++ b/src/MPI/interface.cpp @@ -93,20 +93,38 @@ catch ( const std::bad_alloc & e) void Interface :: put( memslot_t srcSlot, size_t srcOffset, pid_t dstPid, memslot_t dstSlot, size_t dstOffset, - size_t size ) + size_t size, lpf_msg_attr_t attr ) { m_mesgQueue.put( srcSlot, srcOffset, dstPid, dstSlot, dstOffset, - size ); + size, attr); +} + +void Interface :: flushSent() { + m_mesgQueue.flushSent(); +} + +void Interface :: flushReceived() { + m_mesgQueue.flushReceived(); +} + +err_t Interface :: createNewSyncAttr(sync_attr_t * attr) +{ + if ( 0 == m_aborted ) + { + m_mesgQueue.createNewSyncAttr(attr); + return LPF_SUCCESS; + } + return LPF_ERR_FATAL; } void Interface :: get( pid_t srcPid, memslot_t srcSlot, size_t srcOffset, memslot_t dstSlot, size_t dstOffset, - size_t size ) + size_t size, lpf_msg_attr_t attr ) { m_mesgQueue.get( srcPid, srcSlot, srcOffset, dstSlot, dstOffset, - size ); + size, attr); } memslot_t Interface :: registerGlobal( void * mem, size_t size ) @@ -119,11 +137,21 @@ memslot_t Interface :: registerLocal( void * mem, size_t size ) return m_mesgQueue.addLocalReg( mem, size ); } +tag_t Interface :: registerTag() +{ + return m_mesgQueue.addTag(); +} + void Interface :: deregister( memslot_t slot ) { m_mesgQueue.removeReg( slot ); } +void Interface :: destroyTag( tag_t tag ) +{ + m_mesgQueue.removeTag( tag ); +} + err_t Interface :: resizeMemreg( size_t nRegs ) { return m_mesgQueue.resizeMemreg( nRegs ); @@ -134,12 +162,24 @@ err_t Interface :: resizeMesgQueue( size_t nMsgs ) return m_mesgQueue.resizeMesgQueue( nMsgs ); } +err_t Interface :: resizeTagRegister( size_t nTags ) +{ + return m_mesgQueue.resizeTagreg( nTags ); +} + void Interface :: abort() { ASSERT( 0 == m_aborted ); +#ifdef LPF_CORE_MPI_USES_zero + int vote = 1; + int voted; + m_comm.allreduceSum(&vote, &voted, 1); + m_aborted = voted; +#else // signal all other processes at the start of the next 'sync' that // this process aborted. 
- m_aborted = m_mesgQueue.sync( true ); + m_aborted = m_mesgQueue.sync( true, LPF_SYNC_DEFAULT ); +#endif } pid_t Interface :: isAborted() const @@ -147,11 +187,11 @@ pid_t Interface :: isAborted() const return m_aborted; } -err_t Interface :: sync() +err_t Interface :: sync( sync_attr_t attr ) { if ( 0 == m_aborted ) { - m_aborted = m_mesgQueue.sync( false ); + m_aborted = m_mesgQueue.sync( false, attr ); } if ( 0 == m_aborted ) diff --git a/src/MPI/interface.hpp b/src/MPI/interface.hpp index 732f0a9b..6649af30 100644 --- a/src/MPI/interface.hpp +++ b/src/MPI/interface.hpp @@ -27,7 +27,7 @@ namespace lpf { - class _LPFLIB_LOCAL Process; +class _LPFLIB_LOCAL Process; class _LPFLIB_LOCAL Interface { @@ -39,36 +39,107 @@ class _LPFLIB_LOCAL Interface } _LPFLIB_API - static void initRoot(int *argc, char ***argv); + static void initRoot(int *argc, char ***argv) ; - Interface( mpi::Comm machine, Process & subprocess ); + Interface( mpi::Comm machine, Process & subprocess ) ; void put( memslot_t srcSlot, size_t srcOffset, pid_t dstPid, memslot_t dstSlot, size_t dstOffset, - size_t size ) ; // nothrow + size_t size, lpf_msg_attr_t attr) ; // nothrow void get( pid_t srcPid, memslot_t srcSlot, size_t srcOffset, memslot_t dstSlot, size_t dstOffset, - size_t size ) ;// nothrow + size_t size, lpf_msg_attr_t attr) ; // nothrow memslot_t registerGlobal( void * mem, size_t size ) ; // nothrow memslot_t registerLocal( void * mem, size_t size ) ; // nothrow + tag_t registerTag() ; // can throw(!) + void deregister( memslot_t slot ) ; // nothrow + void destroyTag( tag_t tag ) ; // can throw(!) + err_t resizeMemreg( size_t nRegs ) ; // nothrow err_t resizeMesgQueue( size_t nMsgs ) ; // nothrow + err_t resizeTagRegister( size_t nTags ) ; // can throw(!) + void abort() ; // nothrow pid_t isAborted() const ; - err_t sync(); // nothrow + err_t sync( sync_attr_t attr ) ; // nothrow err_t exec( pid_t P, spmd_t spmd, args_t args ) ; - static err_t hook( const mpi::Comm & comm , spmd_t spmd, args_t args ); + static err_t hook( const mpi::Comm & comm , spmd_t spmd, args_t args ) ; + + err_t createNewSyncAttr(sync_attr_t * attr) ; + + inline void destroySyncAttr(sync_attr_t attr) + { + if ( 0 == m_aborted ) + { + return m_mesgQueue.destroySyncAttr(attr); + } + } + + inline tag_t getTagFromSyncAttr(sync_attr_t attr) noexcept + { + if ( 0 == m_aborted ) + { + return m_mesgQueue.getTagFromSyncAttr(attr); + } + return LPF_INVALID_TAG; + } + + inline void setTagInSyncAttr(tag_t tag, sync_attr_t attr) noexcept + { + if ( 0 == m_aborted ) + { + m_mesgQueue.setTagInSyncAttr(tag,attr); + } + } + + inline void setZCAttr(size_t sent, size_t rcvd, sync_attr_t attr) noexcept + { + if ( 0 == m_aborted ) + { + m_mesgQueue.setZCAttr(sent,rcvd,attr); + } + } + + inline void getZCAttr(sync_attr_t attr, size_t &sent, size_t &rcvd) noexcept + { + if ( 0 == m_aborted ) + { + m_mesgQueue.getZCAttr(attr,sent,rcvd); + } + } + + typedef size_t SlotID; + + inline void getRcvdMsgCount(size_t * msgs, sync_attr_t attr) noexcept + { + if ( 0 == m_aborted ) + { + m_mesgQueue.getRcvdMsgCount(msgs, attr); + } + } + + inline void getSentMsgCount(size_t * msgs, sync_attr_t attr) noexcept + { + if ( 0 == m_aborted ) + { + m_mesgQueue.getSentMsgCount(msgs, attr); + } + } + + void flushSent(); + + void flushReceived(); err_t rehook( spmd_t spmd, args_t args); diff --git a/src/MPI/memorytable.cpp b/src/MPI/memorytable.cpp index 3bb7a792..57dff485 100644 --- a/src/MPI/memorytable.cpp +++ b/src/MPI/memorytable.cpp @@ -23,8 +23,8 @@ namespace lpf { 
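The attribute plumbing added to lpf::Interface above backs the public lpf_zero_* entry points implemented earlier in src/MPI/core.cpp. The following sketch is not part of this patch: it only strings those calls together in an order the implementations suggest, it assumes the zero engine is selected at run time, and it assumes the extension header is installed as lpf/zero.h; the interpretation of the expected counts is likewise an assumption rather than a documented guarantee.

    #include <lpf/core.h>
    #include <lpf/zero.h>

    /* Hypothetical SPMD body: one communication phase fenced through a
     * zero-engine sync attribute instead of a plain lpf_sync. */
    void example_spmd( lpf_t ctx, lpf_pid_t pid, lpf_pid_t nprocs, lpf_args_t args )
    {
        (void) pid; (void) nprocs; (void) args;

        lpf_sync_attr_t sattr = LPF_SYNC_DEFAULT;
        lpf_zero_create_sattr( ctx, &sattr );

        /* Assumed semantics: announce that this phase sends one message and
         * receives one message. */
        lpf_zero_set_expected( ctx, 1, 1, sattr );

        /* ... register memory slots and issue lpf_put()/lpf_get() here,
         *     exactly as with the core API ... */

        lpf_sync( ctx, sattr );

        size_t rcvd = 0, sent = 0;
        lpf_zero_get_status( ctx, sattr, &rcvd, &sent );

        lpf_zero_destroy_sattr( ctx, sattr );
    }

The counts reported by lpf_zero_get_status are kept per attribute by the engine, as wired through MessageQueue::getRcvdMsgCount and MessageQueue::getSentMsgCount above.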
MemoryTable :: MemoryTable( Communication & comm -#ifdef LPF_CORE_MPI_USES_ibverbs - , mpi::IBVerbs & ibverbs +#if defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero + , IBVerbs & ibverbs #endif ) : m_memreg() @@ -34,7 +34,7 @@ MemoryTable :: MemoryTable( Communication & comm , m_removed( 0, 0 ) , m_comm( comm ) #endif -#ifdef LPF_CORE_MPI_USES_ibverbs +#if defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero , m_added( 0, 0 ) , m_ibverbs( ibverbs ) , m_comm( comm ) @@ -45,7 +45,7 @@ MemoryTable :: MemoryTable( Communication & comm MemoryTable :: Slot MemoryTable :: addLocal( void * mem, std::size_t size ) // nothrow { -#ifdef LPF_CORE_MPI_USES_ibverbs +#if defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero Memory rec( mem, size, m_ibverbs.regLocal( mem, size)); #else Memory rec( mem, size); @@ -55,14 +55,14 @@ MemoryTable :: addLocal( void * mem, std::size_t size ) // nothrow MemoryTable :: Slot MemoryTable :: addGlobal( void * mem, std::size_t size ) // nothrow -{ -#ifdef LPF_CORE_MPI_USES_ibverbs - Memory rec(mem, size, -1); +{ +#if defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero + Memory rec(mem, size, -1); #else - Memory rec(mem, size); + Memory rec(mem, size); #endif - Slot slot = m_memreg.addGlobalReg(rec) ; -#if defined LPF_CORE_MPI_USES_mpirma || defined LPF_CORE_MPI_USES_ibverbs + Slot slot = m_memreg.addGlobalReg(rec) ; +#if defined LPF_CORE_MPI_USES_mpirma || defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero m_added.insert( slot ); #endif return slot; @@ -92,7 +92,7 @@ void MemoryTable :: remove( Slot slot ) // nothrow m_memreg.removeReg( slot ); #endif -#ifdef LPF_CORE_MPI_USES_ibverbs +#if defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero if (m_added.contains(slot)) { m_added.erase(slot); } @@ -123,7 +123,7 @@ void MemoryTable :: reserve( size_t size ) // throws bad_alloc, strong safe m_memreg.reserve( size ); #endif -#ifdef LPF_CORE_MPI_USES_ibverbs +#if defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero m_memreg.reserve( size ); size_t range = m_memreg.range(); m_added.resize( range ); @@ -139,24 +139,25 @@ size_t MemoryTable :: capacity() const } size_t MemoryTable :: range() const -{ +{ return m_memreg.range(); } bool MemoryTable :: needsSync() const -{ +{ #ifdef LPF_CORE_MPI_USES_mpirma return ! m_added.empty() || !m_removed.empty(); -#endif -#ifdef LPF_CORE_MPI_USES_mpimsg +#elif LPF_CORE_MPI_USES_mpimsg return false; -#endif -#ifdef LPF_CORE_MPI_USES_ibverbs +#elif defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero return !m_added.empty(); +#else // This case should NOT occur? 
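+ // none of the LPF_CORE_MPI_USES_<engine> macros matched; the build always
+ // defines one of them, so fail loudly rather than guess a default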
+ fprintf(stderr, "An unknown engine in MPI/memorytable.cpp\n"); + std::abort(); #endif } -void MemoryTable :: sync( ) +void MemoryTable :: sync( ) { #ifdef LPF_CORE_MPI_USES_mpirma if ( !m_removed.empty() ) @@ -184,17 +185,17 @@ void MemoryTable :: sync( ) ASSERT( !isLocalSlot( *i )); void * base = m_memreg.lookup( *i).addr; size_t size = m_memreg.lookup( *i ).size; - Window w = m_comm.createMemslot( base, size ); + Window w = m_comm.createMemslot( base, size ); m_windows[ *i ] = w; m_comm.fence( w ); } // clear the added list m_added.clear(); - } // if + } // if #endif -#ifdef LPF_CORE_MPI_USES_ibverbs +#if defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero if ( !m_added.empty() ) { // Register the global with IBverbs @@ -204,7 +205,7 @@ void MemoryTable :: sync( ) ASSERT( !isLocalSlot( *i )); void * base = m_memreg.lookup( *i).addr; size_t size = m_memreg.lookup( *i ).size; - mpi::IBVerbs::SlotID s = m_ibverbs.regGlobal( base, size ); + IBVerbs::SlotID s = m_ibverbs.regGlobal( base, size ); m_memreg.update( *i ).slot = s; } diff --git a/src/MPI/memorytable.hpp b/src/MPI/memorytable.hpp index 18dd5038..55f1fe59 100644 --- a/src/MPI/memorytable.hpp +++ b/src/MPI/memorytable.hpp @@ -1,4 +1,3 @@ - /* * Copyright 2021 Huawei Technologies Co., Ltd. * @@ -27,6 +26,9 @@ #ifdef LPF_CORE_MPI_USES_ibverbs #include "ibverbs.hpp" #endif +#ifdef LPF_CORE_MPI_USES_zero +#include "zero.hpp" +#endif #include @@ -41,12 +43,18 @@ class _LPFLIB_LOCAL MemoryTable #ifdef LPF_CORE_MPI_USES_mpirma typedef Communication::Memslot Window; #endif +#ifdef LPF_CORE_MPI_USES_ibverbs + typedef mpi::IBVerbs IBVerbs; +#elif defined LPF_CORE_MPI_USES_zero + typedef mpi::Zero IBVerbs; +#endif struct Memory { char *addr; size_t size; -#ifdef LPF_CORE_MPI_USES_ibverbs - mpi::IBVerbs::SlotID slot; - Memory( void * a, size_t s, mpi::IBVerbs::SlotID sl) +#if defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero + typedef IBVerbs::SlotID SlotID; + SlotID slot; + Memory( void * a, size_t s, SlotID sl) : addr(static_cast(a)) , size(s), slot(sl) {} Memory() : addr(NULL), size(0u), slot(-1) {} @@ -67,6 +75,8 @@ class _LPFLIB_LOCAL MemoryTable #ifdef LPF_CORE_MPI_USES_ibverbs explicit MemoryTable( Communication & comm, mpi::IBVerbs & verbs ); +#elif defined LPF_CORE_MPI_USES_zero + explicit MemoryTable( Communication & comm, mpi::Zero & verbs ); #else explicit MemoryTable( Communication & comm ); #endif @@ -78,7 +88,11 @@ class _LPFLIB_LOCAL MemoryTable void remove( Slot slot ); // nothrow void * getAddress( Slot slot, size_t offset ) const // nothrow - { ASSERT( offset <= m_memreg.lookup(slot).size ); + { + if (offset > m_memreg.lookup(slot).size) { + LOG(5, "Offset:" << offset << " m_Memreg.lookup(slot).size = " << m_memreg.lookup(slot).size); + } + ASSERT( offset <= m_memreg.lookup(slot).size ); return m_memreg.lookup(slot).addr + offset; } @@ -90,8 +104,12 @@ class _LPFLIB_LOCAL MemoryTable { return m_windows[ slot ]; } #endif -#ifdef LPF_CORE_MPI_USES_ibverbs +#if defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero +#ifdef LPF_CORE_MPI_USES_ibverbs mpi::IBVerbs::SlotID getVerbID( Slot slot ) const +#elif defined LPF_CORE_MPI_USES_zero + mpi::Zero::SlotID getVerbID( Slot slot ) const +#endif { return m_memreg.lookup( slot ).slot; } #endif @@ -118,9 +136,13 @@ class _LPFLIB_LOCAL MemoryTable Communication & m_comm; #endif -#ifdef LPF_CORE_MPI_USES_ibverbs +#if defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero DirtyList m_added; +#ifdef LPF_CORE_MPI_USES_ibverbs 
mpi::IBVerbs & m_ibverbs; +#elif defined LPF_CORE_MPI_USES_zero + mpi::Zero & m_ibverbs; +#endif Communication & m_comm; #endif }; diff --git a/src/MPI/mesgqueue.cpp b/src/MPI/mesgqueue.cpp index 0f610a52..07f6b641 100644 --- a/src/MPI/mesgqueue.cpp +++ b/src/MPI/mesgqueue.cpp @@ -16,6 +16,11 @@ */ #include "mesgqueue.hpp" +#ifdef LPF_CORE_MPI_USES_zero +#include "zero.hpp" +#else +#include "ibverbs.hpp" +#endif #include "mpilib.hpp" #include "log.hpp" #include "assert.hpp" @@ -97,14 +102,14 @@ MessageQueue :: MessageQueue( Communication & comm ) , m_edgeRecv() , m_edgeSend() , m_edgeBuffer() -#if defined LPF_CORE_MPI_USES_mpirma || defined LPF_CORE_MPI_USES_ibverbs +#if defined LPF_CORE_MPI_USES_mpirma || defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero , m_edgeBufferSlot( m_memreg.invalidSlot() ) #endif , m_bodySends() , m_bodyRecvs() , m_comm( dynamic_cast(comm) ) -#ifdef LPF_CORE_MPI_USES_ibverbs - , m_ibverbs( m_comm ) +#if defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero + , m_ibverbs(m_comm) , m_memreg( m_comm, m_ibverbs ) #else , m_memreg( m_comm ) @@ -179,7 +184,7 @@ err_t MessageQueue :: resizeMesgQueue( size_t nMsgs ) #ifdef LPF_CORE_MPI_USES_mpimsg m_comm.reserveMsgs( 6* nMsgs ); //another factor three stems from sending edges separately . #endif -#ifdef LPF_CORE_MPI_USES_ibverbs +#if defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero m_ibverbs.resizeMesgq( 6*nMsgs); #endif @@ -243,6 +248,23 @@ err_t MessageQueue :: resizeMemreg( size_t nRegs ) return LPF_SUCCESS; } +err_t MessageQueue :: resizeTagreg( size_t nRegs ) +{ +#ifdef LPF_CORE_MPI_USES_zero + try { + m_ibverbs.resizeTagreg( nRegs ); + } catch (const std::bad_alloc &) { + return LPF_ERR_OUT_OF_MEMORY; + } catch (...) { + return LPF_ERR_FATAL; + } + return LPF_SUCCESS; +#else + (void) nRegs; + throw std::runtime_error("Selected engine does not support tags"); +#endif +} + memslot_t MessageQueue :: addLocalReg( void * mem, std::size_t size) { memslot_t slot = m_memreg.addLocal( mem, size ); @@ -259,6 +281,15 @@ memslot_t MessageQueue :: addGlobalReg( void * mem, std::size_t size ) return slot; } +tag_t MessageQueue :: addTag() +{ +#ifdef LPF_CORE_MPI_USES_zero + return m_ibverbs.regTag(); +#else + throw std::runtime_error("Selected engine does not support tags"); +#endif +} + void MessageQueue :: removeReg( memslot_t slot ) { if (m_memreg.getSize( slot ) > 0) @@ -267,91 +298,127 @@ void MessageQueue :: removeReg( memslot_t slot ) m_memreg.remove( slot ); } +void MessageQueue :: removeTag( tag_t tag ) +{ +#ifdef LPF_CORE_MPI_USES_zero + m_ibverbs.deregTag( tag ); +#else + (void) tag; + throw std::runtime_error("Selected engine does not support tags"); +#endif +} + void MessageQueue :: get( pid_t srcPid, memslot_t srcSlot, size_t srcOffset, - memslot_t dstSlot, size_t dstOffset, size_t size ) + memslot_t dstSlot, size_t dstOffset, size_t size, lpf_msg_attr_t attr) { - if (size > 0) + if( size == 0 ) { return; } + ASSERT( ! m_memreg.isLocalSlot( srcSlot ) ); + if ( srcPid == static_cast(m_pid) ) { - ASSERT( ! 
m_memreg.isLocalSlot( srcSlot ) ); - void * address = m_memreg.getAddress( dstSlot, dstOffset ); - if ( srcPid == static_cast(m_pid) ) - { - std::memcpy( address, m_memreg.getAddress( srcSlot, srcOffset), size); - } - else - { - using mpi::ipc::newMsg; - - if (size <= m_tinyMsgSize ) - { - // send immediately the request to the source - newMsg( BufGet, m_tinyMsgBuf.data(), m_tinyMsgBuf.size() ) - .write( DstPid , m_pid ) - .write( SrcSlot, srcSlot) - .write( DstSlot, dstSlot) - .write( SrcOffset, srcOffset ) - .write( DstOffset, dstOffset ) - .write( Size, size ) - .send( *m_firstQueue, srcPid ); - } - else - { - // send the request to the destination process (this process) - // for write conflict resolution - newMsg( HpGet, m_tinyMsgBuf.data(), m_tinyMsgBuf.size() ) - .write( SrcPid, srcPid ) - .write( DstPid, m_pid ) - .write( SrcSlot, srcSlot ) - .write( DstSlot, dstSlot ) - .write( SrcOffset, srcOffset ) - .write( DstOffset, dstOffset ) - .write( Size, size ) - . send( *m_firstQueue, m_pid ); - } - } + void * const address = m_memreg.getAddress( dstSlot, dstOffset ); + (void) std::memcpy( + address, + m_memreg.getAddress( srcSlot, srcOffset), size + ); + return; } +#ifdef LPF_CORE_MPI_USES_zero + m_ibverbs.get( + srcPid, + m_memreg.getVerbID( srcSlot ), + srcOffset, + m_memreg.getVerbID( dstSlot ), + dstOffset, + size, attr); +#else + (void) attr; // this engine does not use message attributes + using mpi::ipc::newMsg; + + if (size <= m_tinyMsgSize ) + { + // send immediately the request to the source + newMsg( BufGet, m_tinyMsgBuf.data(), m_tinyMsgBuf.size() ) + .write( DstPid , m_pid ) + .write( SrcSlot, srcSlot) + .write( DstSlot, dstSlot) + .write( SrcOffset, srcOffset ) + .write( DstOffset, dstOffset ) + .write( Size, size ) + .send( *m_firstQueue, srcPid ); + } else { + // send the request to the destination process (this process) + // for write conflict resolution + newMsg( HpGet, m_tinyMsgBuf.data(), m_tinyMsgBuf.size() ) + .write( SrcPid, srcPid ) + .write( DstPid, m_pid ) + .write( SrcSlot, srcSlot ) + .write( DstSlot, dstSlot ) + .write( SrcOffset, srcOffset ) + .write( DstOffset, dstOffset ) + .write( Size, size ) + .send( *m_firstQueue, m_pid ); + } +#endif } void MessageQueue :: put( memslot_t srcSlot, size_t srcOffset, - pid_t dstPid, memslot_t dstSlot, size_t dstOffset, size_t size ) + pid_t dstPid, memslot_t dstSlot, size_t dstOffset, size_t size, lpf_msg_attr_t attr) { - if (size > 0) + if (size == 0 ) { return; } + ASSERT( ! m_memreg.isLocalSlot( dstSlot ) ); + void * const address = m_memreg.getAddress( srcSlot, srcOffset ); + if ( dstPid == static_cast(m_pid) ) { - ASSERT( ! m_memreg.isLocalSlot( dstSlot ) ); - void * address = m_memreg.getAddress( srcSlot, srcOffset ); - if ( dstPid == static_cast(m_pid) ) - { - std::memcpy( m_memreg.getAddress( dstSlot, dstOffset), address, size); - } - else - { - using mpi::ipc::newMsg; - if (size <= m_tinyMsgSize ) - { - newMsg( BufPut, m_tinyMsgBuf.data(), m_tinyMsgBuf.size() ) - .write( DstSlot, dstSlot ) - .write( DstOffset, dstOffset ) - .write( Payload, address, size ) - . 
send( *m_firstQueue, dstPid ); - } - else - { - newMsg( HpPut, m_tinyMsgBuf.data(), m_tinyMsgBuf.size() ) - .write( SrcPid, m_pid ) - .write( DstPid, dstPid ) - .write( SrcSlot, srcSlot ) - .write( DstSlot, dstSlot ) - .write( SrcOffset, srcOffset ) - .write( DstOffset, dstOffset ) - .write( Size, size ) - .send( *m_firstQueue, dstPid ); - } - } + (void) std::memcpy( + m_memreg.getAddress( dstSlot, dstOffset), + address, size + ); + return; + } +#ifdef LPF_CORE_MPI_USES_zero + m_ibverbs.put( m_memreg.getVerbID( srcSlot), + srcOffset, + dstPid, + m_memreg.getVerbID( dstSlot), + dstOffset, + size, + attr); +#else + (void) attr; // this engine does not use message attributes + using mpi::ipc::newMsg; + if (size <= m_tinyMsgSize ) + { + newMsg( BufPut, m_tinyMsgBuf.data(), m_tinyMsgBuf.size() ) + .write( DstSlot, dstSlot ) + .write( DstOffset, dstOffset ) + .write( Payload, address, size ) + .send( *m_firstQueue, dstPid ); + } else { + newMsg( HpPut, m_tinyMsgBuf.data(), m_tinyMsgBuf.size() ) + .write( SrcPid, m_pid ) + .write( DstPid, dstPid ) + .write( SrcSlot, srcSlot ) + .write( DstSlot, dstSlot ) + .write( SrcOffset, srcOffset ) + .write( DstOffset, dstOffset ) + .write( Size, size ) + .send( *m_firstQueue, dstPid ); } +#endif } -int MessageQueue :: sync( bool abort ) +int MessageQueue :: sync(bool abort, sync_attr_t attr) { +#ifdef LPF_CORE_MPI_USES_zero + // if not, deal with normal sync + (void)abort; + m_memreg.sync(); + m_ibverbs.sync(m_resized, + static_cast< Backend::SyncAttr * >(attr)); + m_resized = false; +#else + (void)attr; + LOG(4, "mpi :: MessageQueue :: sync( abort " << (abort?"true":"false") << " )"); using mpi::ipc::newMsg; @@ -971,9 +1038,34 @@ int MessageQueue :: sync( bool abort ) ASSERT( m_bodyRecvs.empty() ); LOG(4, "End of synchronisation"); +#endif return 0; } +void MessageQueue :: createNewSyncAttr(sync_attr_t * attr) +{ + ASSERT(attr != NULL); +#ifdef LPF_CORE_MPI_USES_zero + m_ibverbs.createNewSyncAttr( + reinterpret_cast< Backend::SyncAttr * * >(attr)); +#else + *attr = LPF_SYNC_DEFAULT; +#endif +} + +void MessageQueue :: flushSent() +{ +#ifdef LPF_CORE_MPI_USES_zero + m_ibverbs.flushSent(); +#endif +} + +void MessageQueue :: flushReceived() +{ +#ifdef LPF_CORE_MPI_USES_zero + m_ibverbs.flushReceived(); +#endif +} } // namespace lpf diff --git a/src/MPI/mesgqueue.hpp b/src/MPI/mesgqueue.hpp index 27e7beb5..424ba5bf 100644 --- a/src/MPI/mesgqueue.hpp +++ b/src/MPI/mesgqueue.hpp @@ -36,34 +36,133 @@ #ifdef LPF_CORE_MPI_USES_ibverbs #include "ibverbs.hpp" #endif +#ifdef LPF_CORE_MPI_USES_zero +#include "zero.hpp" +#endif + namespace lpf { class _LPFLIB_LOCAL MessageQueue { + public: explicit MessageQueue( Communication & comm ); err_t resizeMemreg( size_t nRegs ); err_t resizeMesgQueue( size_t nMsgs ); - + err_t resizeTagreg( size_t nTags ); memslot_t addLocalReg( void * mem, std::size_t size ); memslot_t addGlobalReg( void * mem, std::size_t size ); - void removeReg( memslot_t slot ); + tag_t addTag(); + + void removeReg( memslot_t slot ); + void removeTag( tag_t tag ); void get( pid_t srcPid, memslot_t srcSlot, size_t srcOffset, - memslot_t dstSlot, size_t dstOffset, size_t size ); + memslot_t dstSlot, size_t dstOffset, size_t size, lpf_msg_attr_t attr); void put( memslot_t srcSlot, size_t srcOffset, - pid_t dstPid, memslot_t dstSlot, size_t dstOffset, size_t size ); + pid_t dstPid, memslot_t dstSlot, size_t dstOffset, size_t size, lpf_msg_attr_t attr); // returns how many processes have entered in an aborted state - int sync( bool abort ); + int sync(bool 
abort, sync_attr_t attr); + + inline void getRcvdMsgCount(size_t * msgs, sync_attr_t attr) noexcept + { + ASSERT(msgs != nullptr); +#ifdef LPF_CORE_MPI_USES_zero + m_ibverbs.get_rcvd_msg_count(*msgs, + static_cast< Backend::SyncAttr * >(attr)); +#else + (void)attr; +#endif + } + + inline void getSentMsgCount(size_t * msgs, sync_attr_t attr) noexcept + { + ASSERT(msgs != nullptr); +#ifdef LPF_CORE_MPI_USES_zero + m_ibverbs.get_sent_msg_count(*msgs, + static_cast< Backend::SyncAttr * >(attr)); +#else + (void)attr; +#endif + } + + void flushSent(); + + void flushReceived(); + + int countingSyncPerSlot(memslot_t slot, size_t expected_sent, size_t expected_rcvd); + + int syncPerSlot(memslot_t slot); + + void createNewSyncAttr(sync_attr_t * attr); + + inline void destroySyncAttr(sync_attr_t attr) + { +#ifdef LPF_CORE_MPI_USES_zero + m_ibverbs.destroySyncAttr( + static_cast< Backend::SyncAttr * >(attr)); +#else + (void)attr; +#endif + } + + inline tag_t getTagFromSyncAttr(sync_attr_t attr) noexcept + { + ASSERT(attr != NULL); +#ifdef LPF_CORE_MPI_USES_zero + return m_ibverbs.getTag( + *static_cast< Backend::SyncAttr * >(attr)); +#else + return LPF_INVALID_TAG; +#endif + } + + inline void setTagInSyncAttr(tag_t tag, sync_attr_t attr) noexcept + { + ASSERT(attr != NULL); +#ifdef LPF_CORE_MPI_USES_zero + return m_ibverbs.setTag(tag, + *static_cast< Backend::SyncAttr * >(attr)); +#else + (void)tag; +#endif + } + + inline void setZCAttr(size_t sent, size_t rcvd, sync_attr_t attr) noexcept + { + ASSERT(attr != NULL); +#ifdef LPF_CORE_MPI_USES_zero + return m_ibverbs.setZCAttr(sent,rcvd, + *static_cast< Backend::SyncAttr * >(attr)); +#else + (void)sent; + (void)rcvd; + (void)attr; +#endif + } + + inline void getZCAttr(sync_attr_t attr, size_t &sent, size_t &rcvd) noexcept + { + ASSERT(attr != NULL); +#ifdef LPF_CORE_MPI_USES_zero + return m_ibverbs.getZCAttr( + *static_cast< Backend::SyncAttr * >(attr), + sent, rcvd); +#else + (void)attr; + (void)sent; + (void)rcvd; +#endif + } private: - enum Msgs { BufPut , + enum Msgs { BufPut , BufGet, BufGetReply, HpPut, HpGet , HpBodyReply , HpEdges, HpEdgesReply }; @@ -72,7 +171,7 @@ class _LPFLIB_LOCAL MessageQueue SrcPid, DstPid, SrcOffset, DstOffset, BufOffset, SrcSlot, DstSlot, Size, - RoundedDstOffset, RoundedSize, + RoundedDstOffset, RoundedSize, Payload, Head, Tail}; struct Edge { @@ -106,6 +205,11 @@ class _LPFLIB_LOCAL MessageQueue typedef mpi::VirtualAllToAll Queue; +#if defined LPF_CORE_MPI_USES_ibverbs + typedef mpi::IBVerbs Backend; +#elif defined LPF_CORE_MPI_USES_zero + typedef mpi::Zero Backend; +#endif static Queue * newQueue( pid_t pid, pid_t nprocs ); const pid_t m_pid, m_nprocs; @@ -126,14 +230,14 @@ class _LPFLIB_LOCAL MessageQueue std::vector< Edge > m_edgeRecv; std::vector< Edge > m_edgeSend; std::vector< char > m_edgeBuffer; -#if defined LPF_CORE_MPI_USES_mpirma || defined LPF_CORE_MPI_USES_ibverbs +#if defined LPF_CORE_MPI_USES_mpirma || defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero memslot_t m_edgeBufferSlot; #endif std::vector< Body > m_bodySends; std::vector< Body > m_bodyRecvs; mpi::Comm m_comm; -#ifdef LPF_CORE_MPI_USES_ibverbs - mpi::IBVerbs m_ibverbs; +#if defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero + Backend m_ibverbs; #endif MemoryTable m_memreg; std::vector< char > m_tinyMsgBuf; diff --git a/src/MPI/types.hpp b/src/MPI/types.hpp index f587e437..ae5ae61c 100644 --- a/src/MPI/types.hpp +++ b/src/MPI/types.hpp @@ -19,15 +19,18 @@ #define LPF_CORE_TYPES_HPP #include "lpf/core.h" 
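+// lpf/tags.h declares lpf_tag_t, which is aliased to lpf::tag_t below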
+#include "lpf/tags.h" namespace lpf { typedef lpf_err_t err_t; typedef lpf_pid_t pid_t; +typedef lpf_tag_t tag_t; typedef lpf_args_t args_t; typedef lpf_spmd_t spmd_t; typedef lpf_memslot_t memslot_t; typedef lpf_machine_t machine_t; +typedef lpf_sync_attr_t sync_attr_t; } diff --git a/src/MPI/zero.cpp b/src/MPI/zero.cpp new file mode 100644 index 00000000..2053fbf5 --- /dev/null +++ b/src/MPI/zero.cpp @@ -0,0 +1,1091 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "log.hpp" +#include "zero.hpp" +#include "config.hpp" +#include "communication.hpp" + +#include +#include +#include +#include +#include +#include + +#define POLL_BATCH 64 +#define MAX_POLLING 128 + + +namespace lpf { namespace mpi { + +struct Zero::Exception : std::runtime_error { + Exception(const char * what) : std::runtime_error( what ) {} +}; + +namespace { + ibv_mtu getMTU( unsigned size ) { + switch (size) { + case 256: return IBV_MTU_256; + case 512: return IBV_MTU_512; + case 1024: return IBV_MTU_1024; + case 2048: return IBV_MTU_2048; + case 4096: return IBV_MTU_4096; + default: throw Zero::Exception("Illegal MTU size"); + } + return IBV_MTU_4096; + } +} + +Zero :: Zero( Communication & comm ) + : m_pid( comm.pid() ) + , m_nprocs( comm.nprocs() ) + , m_ibPort( Config::instance().getIBPort() ) + , m_gidIdx( Config::instance().getIBGidIndex() ) + , m_maxRegSize(0) + , m_maxMsgSize(0) + , m_cqSize(1) + , m_minNrMsgs(0) + , m_maxSrs(0) + , m_postCount(0) + , m_recvCount(0) + , m_tag_capacity(0) + , m_device() + , m_pd() + , m_cqLocal() + , m_cqRemote() + , m_dummyMemReg() + , m_numMsgs(0) + , m_recvTotalInitMsgCount(0) + , m_sentMsgs(0) + , m_recvdMsgs(0) + , m_comm( comm ) + , m_devName() + , m_mtu( getMTU( Config::instance().getIBMTU() )) + , m_stagedQps( m_nprocs ) + , m_connectedQps( m_nprocs ) + , m_srs() + , m_srsHeads( m_nprocs, 0u ) + , m_nMsgsPerPeer( m_nprocs, 0u ) + , m_peerList() + , m_sges() + , m_dummyBuffer() + , m_activePeers(0, m_nprocs) + , m_memreg() +{ + m_peerList.reserve( m_nprocs ); + + int numDevices = -1; + struct ibv_device * * const try_get_device_list = + ibv_get_device_list( &numDevices ); + + if (!try_get_device_list) { + LOG(1, "Cannot get list of Infiniband devices" ); + throw Exception( "failed to get IB devices list"); + } + + shared_ptr< struct ibv_device * > devList( + try_get_device_list, + ibv_free_device_list ); + + LOG(3, "Retrieved Infiniband device list, which has " << numDevices + << " devices" ); + + if (numDevices < 1) { + LOG(1, "There are " << numDevices << " Infiniband devices" + " available, which is not enough" ); + throw Exception( "No Infiniband devices available" ); + } + + std::string wantDevName = Config::instance().getIBDeviceName(); + LOG( 3, "Searching for device '"<< wantDevName << "'" ); + struct ibv_device * dev = NULL; + for (int i = 0; i < numDevices; i ++) + { + std::string name = ibv_get_device_name( (&*devList)[i]); + LOG(3, "Device " << i << " has name '" << name << "'" ); + 
if ( wantDevName.empty() || name == wantDevName ) {
+ LOG(3, "Found device '" << name << "'" );
+ m_devName = name;
+ dev = (&*devList)[i];
+ break;
+ }
+ }
+
+ if (dev == NULL) {
+ LOG(1, "Could not find device '" << wantDevName << "'" );
+ throw Exception("Infiniband device not found");
+ }
+
+ struct ibv_context * const ibv_context_new_p = ibv_open_device(dev);
+ if( ibv_context_new_p == NULL )
+ m_device.reset();
+ else
+ m_device.reset( ibv_context_new_p, ibv_close_device );
+ if (!m_device) {
+ LOG(1, "Failed to open Infiniband device '" << m_devName << "'");
+ throw Exception("Cannot open IB device");
+ }
+ LOG(3, "Opened Infiniband device '" << m_devName << "'" );
+
+ devList.reset();
+ LOG(3, "Closed Infiniband device list" );
+
+ std::memset(&m_deviceAttr, 0, sizeof(m_deviceAttr));
+ if (ibv_query_device( m_device.get(), &m_deviceAttr ))
+ throw Exception("Cannot query device");
+
+ LOG(3, "Queried IB device capabilities" );
+
+ m_maxRegSize = m_deviceAttr.max_mr_size;
+ LOG(3, "Maximum size for memory registration = " << m_maxRegSize );
+
+ // maximum number of work requests per Queue Pair
+ m_maxSrs = std::min( m_deviceAttr.max_qp_wr, // maximum work requests per QP
+ m_deviceAttr.max_cqe ); // maximum entries per CQ
+ LOG(3, "Maximum number of send requests is the minimum of "
+ << m_deviceAttr.max_qp_wr << " (the maximum of work requests per QP)"
+ << " and " << m_deviceAttr.max_cqe << " (the maximum of completion "
+ << " queue entries per QP), namely " << m_maxSrs );
+
+ if ( m_deviceAttr.max_cqe < m_nprocs )
+ throw Exception("Completion queue has insufficient completion queue capabilities");
+
+ struct ibv_port_attr port_attr; std::memset( &port_attr, 0, sizeof(port_attr));
+ if (ibv_query_port( m_device.get(), m_ibPort, & port_attr ))
+ throw Exception("Cannot query IB port");
+
+ LOG(3, "Queried IB port " << m_ibPort << " capabilities" );
+
+ // store Maximum message size
+ m_maxMsgSize = port_attr.max_msg_sz;
+ LOG(3, "Maximum IB message size is " << m_maxMsgSize );
+
+ size_t sysRam = Config::instance().getLocalRamSize();
+ m_minNrMsgs = sysRam / m_maxMsgSize;
+ LOG(3, "Minimum number of messages to allocate = "
+ "total system RAM / maximum message size = "
+ << sysRam << " / " << m_maxMsgSize << " = " << m_minNrMsgs );
+
+ // store LID
+ m_lid = port_attr.lid;
+ LOG(3, "LID is " << m_lid );
+
+ struct ibv_pd * const pd_new_p = ibv_alloc_pd( m_device.get() );
+ if( pd_new_p == NULL )
+ m_pd.reset();
+ else
+ m_pd.reset( pd_new_p, ibv_dealloc_pd );
+ if (!m_pd) {
+ LOG(1, "Could not allocate protection domain ");
+ throw Exception("Could not allocate protection domain");
+ }
+ LOG(3, "Opened protection domain");
+
+ /**
+ * New notification functionality for HiCR
+ */
+ struct ibv_srq_init_attr srq_init_attr;
+ srq_init_attr.srq_context = NULL;
+ srq_init_attr.attr.max_wr = m_deviceAttr.max_srq_wr;
+ srq_init_attr.attr.max_sge = m_deviceAttr.max_srq_sge;
+ srq_init_attr.attr.srq_limit = 0;
+ m_srq.reset(ibv_create_srq(m_pd.get(), &srq_init_attr ),
+ ibv_destroy_srq);
+
+ m_cqLocal.reset(ibv_create_cq( m_device.get(), m_cqSize, NULL, NULL, 0),
+ ibv_destroy_cq);
+ if (!m_cqLocal) {
+ LOG(1, "Could not allocate completion queue with '"
+ << m_nprocs << " entries" );
+ throw Exception("Could not allocate completion queue");
+ }
+ m_cqRemote.reset(
+ ibv_create_cq( m_device.get(), m_cqSize * m_nprocs, NULL, NULL, 0),
+ ibv_destroy_cq);
+ if (!m_cqRemote) {
+ LOG(1, "Could not allocate completion queue with '"
+ << m_nprocs << " entries" );
+ throw Exception("Could not 
allocate completion queue"); + } + + LOG(3, "Allocated completion queue with " << m_nprocs << " entries."); + + // allocate dummy buffer + m_dummyBuffer.resize( 8 ); + struct ibv_mr * const ibv_reg_mr_new_p = ibv_reg_mr( + m_pd.get(), m_dummyBuffer.data(), m_dummyBuffer.size(), + IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE + ); + if( ibv_reg_mr_new_p == NULL ) + m_dummyMemReg.reset(); + else + m_dummyMemReg.reset( ibv_reg_mr_new_p, ibv_dereg_mr ); + if (!m_dummyMemReg) { + LOG(1, "Could not register memory region"); + throw Exception("Could not register memory region"); + } + + LOG(3, "Queue pairs have been successfully initialized"); + +} + +Zero :: ~Zero() +{ } + +inline void Zero :: tryIncrement(const Op op, const Phase phase, + const TagID tag) noexcept +{ + if (tag == INVALID_TAG) { + LOG(2, "Zero::tryIncrement called on invalid tag"); + return; + } + + switch (phase) { + case Phase::INIT: + // dynamically increase the capacity + // of registered tag arrays + // Somewhat arbitrarily I choose here to + // increase by factor 8 each time + if (m_tag_capacity <= tag) { + LOG(3, "Dynamically reallocated tags: " << tag << " -> " << (tag + 1) * 8); + resizeTagreg((tag + 1) * 8); + } + rcvdMsgCount[tag] = 0; + getMsgCount[tag] = 0; + m_recvInitMsgCount[tag] = 0; + m_getInitMsgCount[tag] = 0; + sentMsgCount[tag] = 0; + m_sendInitMsgCount[tag] = 0; + tagActive[tag] = true; + break; + case Phase::PRE: + if (op == Op::SEND) { + (void)m_numMsgs++; + (void)m_sendInitMsgCount[tag]++; + } + if (op == Op::RECV) { + (void)m_recvTotalInitMsgCount++; + (void)m_recvInitMsgCount[tag]++; + } + if (op == Op::GET) { + (void)m_recvTotalInitMsgCount++; + (void)m_getInitMsgCount[tag]++; + } + break; + case Phase::POST: + if (op == Op::RECV) { + (void)m_recvdMsgs++; + (void)rcvdMsgCount[tag]++; + } + if (op == Op::GET) { + (void)m_recvdMsgs++; + (void)getMsgCount[tag]++; + } + if (op == Op::SEND) { + (void)m_sentMsgs++; + (void)sentMsgCount[tag]++; + } + break; + } +} + +void Zero :: stageQPs( size_t maxMsgs ) +{ + // create the queue pairs + for ( size_t i = 0; i < static_cast(m_nprocs); ++i) { + struct ibv_qp_init_attr attr; + std::memset(&attr, 0, sizeof(attr)); + + attr.qp_type = IBV_QPT_RC; // we want reliable connection + attr.sq_sig_all = 0; // only wait for selected messages + attr.send_cq = m_cqLocal.get(); + attr.recv_cq = m_cqRemote.get(); + attr.srq = m_srq.get(); + attr.cap.max_send_wr = std::min(maxMsgs + m_minNrMsgs,m_maxSrs/4); + attr.cap.max_recv_wr = std::min(maxMsgs + m_minNrMsgs,m_maxSrs/4); + attr.cap.max_send_sge = 1; + attr.cap.max_recv_sge = 1; + + struct ibv_qp * const ibv_new_qp_p = ibv_create_qp( m_pd.get(), &attr ); + ASSERT(m_stagedQps.size() > i); + if( ibv_new_qp_p == NULL ) { + m_stagedQps[i].reset(); + } else { + m_stagedQps[i].reset( ibv_new_qp_p, ibv_destroy_qp ); + } + if (!m_stagedQps[i]) { + LOG( 1, "Could not create Infiniband Queue pair number " << i ); + throw std::bad_alloc(); + } + + LOG(3, "Created new Queue pair for " << m_pid << " -> " << i + << " with qp_num = " << ibv_new_qp_p->qp_num); + } +} + +void Zero :: doRemoteProgress() { + struct ibv_wc wcs[POLL_BATCH]; + struct ibv_recv_wr wr; + struct ibv_sge sg; + struct ibv_recv_wr *bad_wr; + sg.addr = (uint64_t) NULL; + sg.length = 0; + sg.lkey = 0; + wr.next = NULL; + wr.sg_list = &sg; + wr.num_sge = 0; + wr.wr_id = 66; + int pollResult, totalResults = 0; + do { + pollResult = ibv_poll_cq(m_cqRemote.get(), POLL_BATCH, wcs); + if (pollResult > 0) { + LOG(3, "Process " << m_pid << " signals: I received " << 
pollResult + << " remote messages in doRemoteProgress"); + } + else if (pollResult < 0) + { + LOG( 1, "Failed to poll IB completion queue" ); + throw Exception("Poll CQ failure"); + } + + for(int i = 0; i < pollResult; i++) { + if (wcs[i].status != IBV_WC_SUCCESS) { + LOG( 2, "Got bad completion status from IB message." + " status = 0x" << std::hex << wcs[i].status + << ", vendor syndrome = 0x" << std::hex + << wcs[i].vendor_err ); + } + else + { + LOG(3, "Process " << m_pid << " Recv wcs[" << i << "].src_qp = "<< wcs[i].src_qp); + LOG(3, "Process " << m_pid << " Recv wcs[" << i << "].slid = "<< wcs[i].slid); + LOG(3, "Process " << m_pid << " Recv wcs[" << i << "].wr_id = "<< wcs[i].wr_id); + LOG(3, "Process " << m_pid << " Recv wcs[" << i << "].imm_data = "<< wcs[i].imm_data); + + /** + * Here is a trick: + * The sender sends relatively generic LPF memslot ID. + * But for IB Verbs, we need to translate that into + * an IB Verbs slot via @getVerbID -- or there will be + * a mismatch when IB Verbs looks up the slot ID + */ + + // Note: Ignore compare-and-swap atomics! + if (wcs[i].opcode != IBV_WC_COMP_SWAP) { + TagID tag; + // This receive is from a PUT call + if (wcs[i].opcode == IBV_WC_RECV_RDMA_WITH_IMM) { + tag = wcs[i].imm_data; + tryIncrement(Op::RECV, Phase::POST, tag); + LOG(3, "Rank " << m_pid << " increments received message count to " << rcvdMsgCount[tag] << " for LPF slot " << tag); + } + } + ibv_post_srq_recv(m_srq.get(), &wr, &bad_wr); + } + } + if(pollResult > 0) totalResults += pollResult; + } while (pollResult == POLL_BATCH && totalResults < MAX_POLLING); +} + +void Zero :: reconnectQPs() +{ + ASSERT( m_stagedQps[0] ); + m_comm.barrier(); + + union ibv_gid myGid; + std::vector< uint32_t> localQpNums, remoteQpNums; + std::vector< uint16_t> lids; + std::vector< union ibv_gid > gids; + try { + // Exchange info about the queue pairs + if (m_gidIdx >= 0) { + if (ibv_query_gid(m_device.get(), m_ibPort, m_gidIdx, &myGid)) { + LOG(1, "Could not get GID of Infiniband device port " << m_ibPort); + throw Exception( "Could not get gid for IB port"); + } + LOG(3, "GID of Infiniband device was retrieved" ); + } + else { + std::memset( &myGid, 0, sizeof(myGid) ); + LOG(3, "GID of Infiniband device will not be used" ); + } + + localQpNums.resize(m_nprocs); + remoteQpNums.resize(m_nprocs); + lids.resize(m_nprocs); + gids.resize(m_nprocs); + + for ( int i = 0; i < m_nprocs; ++i) + localQpNums[i] = m_stagedQps[i]->qp_num; + } + catch(...) 
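+ // A failure on this process must not leave its peers blocked in the
+ // collectives below: vote 'true' so every process sees that a peer failed.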
+ { + m_comm.allreduceOr( true ); + throw; + } + if (m_comm.allreduceOr( false) ) + throw Exception("Peer failed to allocate memory or query device while setting-up QP"); + + m_comm.allToAll( localQpNums.data(), remoteQpNums.data() ); + m_comm.allgather( m_lid, lids.data() ); + m_comm.allgather( myGid, gids.data() ); + + LOG(3, "Connection initialisation data has been exchanged"); + + try { + // Bring QPs to INIT + for (int i = 0; i < m_nprocs; ++i ) { + struct ibv_qp_attr attr; + int flags; + + std::memset(&attr, 0, sizeof(attr)); + attr.qp_state = IBV_QPS_INIT; + attr.port_num = m_ibPort; + attr.pkey_index = 0; + attr.qp_access_flags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | + IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_ATOMIC; + flags = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS; + if ( ibv_modify_qp(m_stagedQps[i].get(), &attr, flags) ) { + LOG(1, "Cannot bring state of QP " << i << " to INIT"); + throw Exception("Failed to bring QP's state to Init" ); + } + + // post a dummy receive + + struct ibv_recv_wr rr; std::memset(&rr, 0, sizeof(rr)); + struct ibv_sge sge; std::memset(&sge, 0, sizeof(sge)); + sge.addr = reinterpret_cast(m_dummyBuffer.data()); + sge.length = m_dummyBuffer.size(); + sge.lkey = m_dummyMemReg->lkey; + rr.next = NULL; + rr.wr_id = 46; + rr.sg_list = &sge; + rr.num_sge = 1; + + // Bring QP to RTR + std::memset(&attr, 0, sizeof(attr)); + attr.qp_state = IBV_QPS_RTR; + attr.path_mtu = m_mtu; + attr.dest_qp_num = remoteQpNums[i]; + attr.rq_psn = 0; + attr.max_dest_rd_atomic = 1; + attr.min_rnr_timer = 0x12; + attr.ah_attr.is_global = 0; + attr.ah_attr.dlid = lids[i]; + attr.ah_attr.sl = 0; + attr.ah_attr.src_path_bits = 0; + attr.ah_attr.port_num = m_ibPort; + if (m_gidIdx >= 0) + { + attr.ah_attr.is_global = 1; + attr.ah_attr.port_num = 1; + memcpy(&attr.ah_attr.grh.dgid, &gids[i], 16); + attr.ah_attr.grh.flow_label = 0; + attr.ah_attr.grh.hop_limit = 1; + attr.ah_attr.grh.sgid_index = m_gidIdx; + attr.ah_attr.grh.traffic_class = 0; + } + flags = IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | + IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER; + + if (ibv_modify_qp(m_stagedQps[i].get(), &attr, flags)) { + LOG(1, "Cannot bring state of QP " << i << " to RTR" ); + throw Exception("Failed to bring QP's state to RTR" ); + } + + // Bring QP to RTS + std::memset(&attr, 0, sizeof(attr)); + attr.qp_state = IBV_QPS_RTS; + attr.timeout = 0x12; + attr.retry_cnt = 0;//7; + attr.rnr_retry = 0;//7; + attr.sq_psn = 0; + attr.max_rd_atomic = 1; + flags = IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | + IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC; + if( ibv_modify_qp(m_stagedQps[i].get(), &attr, flags)) { + LOG(1, "Cannot bring state of QP " << i << " to RTS" ); + throw Exception("Failed to bring QP's state to RTS" ); + } + + LOG(3, "Connected Queue pair for " << m_pid << " -> " << i ); + + } // for each peer + } + catch(...) 
{ + m_comm.allreduceOr( true ); + throw; + } + + if (m_comm.allreduceOr( false )) + throw Exception("Another peer failed to set-up Infiniband queue pairs"); + + LOG(3, "All staged queue pairs have been connected" ); + + m_connectedQps.swap( m_stagedQps ); + + LOG(3, "All old queue pairs have been removed"); + + m_comm.barrier(); +} + +void Zero :: resizeMemreg( size_t size ) +{ + if ( size > size_t(std::numeric_limits::max()) ) + { + LOG(2, "Could not expand memory register, because integer will overflow"); + throw Exception("Could not increase memory register"); + } + if ( int(size) > m_deviceAttr.max_mr ) { + LOG(2, "IB device only supports " << m_deviceAttr.max_mr + << " memory registrations, while " << size + << " are being requested" ); + throw std::bad_alloc() ; + } + + MemoryRegistration newMR = { nullptr, 0, 0, 0, m_pid}; + MemorySlot dflt; dflt.glob.resize( m_nprocs, newMR); + + m_memreg.reserve( size, dflt ); +} + +void Zero :: resizeMesgq( size_t size ) +{ + + m_cqSize = std::min(size,m_maxSrs/4); + size_t remote_size = std::min(m_cqSize*m_nprocs,m_maxSrs/4); + if (m_cqLocal) { + ibv_resize_cq(m_cqLocal.get(), m_cqSize); + } + if(remote_size >= m_postCount){ + if (m_cqRemote) { + ibv_resize_cq(m_cqRemote.get(), remote_size); + } + } + stageQPs(m_cqSize); + reconnectQPs(); + if(remote_size >= m_postCount){ + if (m_srq) { + struct ibv_recv_wr wr; + struct ibv_sge sg; + struct ibv_recv_wr *bad_wr; + sg.addr = (uint64_t) NULL; + sg.length = 0; + sg.lkey = 0; + wr.next = NULL; + wr.sg_list = &sg; + wr.num_sge = 0; + wr.wr_id = m_pid; + for(int i = m_postCount; i < (int)remote_size; ++i){ + ibv_post_srq_recv(m_srq.get(), &wr, &bad_wr); + m_postCount++; + } + } + } + LOG(4, "Message queue has been reallocated to size " << size ); +} + +void Zero :: resizeTagreg( size_t size ) +{ + if( m_tag_capacity >= size ) { + LOG(4, "Tag queue: smaller capacity required, request ignored" ); + return; + } + + ASSERT( size > m_tag_capacity ); + + // reserve new capacity + m_free_tags.reserve( size ); + m_recvInitMsgCount.resize(size, 0); + m_getInitMsgCount.resize(size, 0); + m_sendInitMsgCount.resize(size, 0); + rcvdMsgCount.resize(size, 0); + getMsgCount.resize(size, 0); + sentMsgCount.resize(size, 0); + tagActive.resize(size, 0); + + // if ok, push new tag IDs to free tags + for( size_t k = m_tag_capacity; k < size; ++k ) { + m_free_tags.push_back( static_cast(k) ); + } + + // correct tag capacity + m_tag_capacity = size; + + LOG(4, "Tag queue: new capacity in effect ( " << size << " )"); +} + +Zero :: SlotID Zero :: regLocal( void * addr, size_t size ) +{ + ASSERT( size <= m_maxRegSize ); + + MemorySlot slot; + if ( size > 0) { + LOG(4, "Registering locally memory area at " << addr << " of size " << size ); + struct ibv_mr * const ibv_mr_new_p = ibv_reg_mr( + m_pd.get(), addr, size, + IBV_ACCESS_REMOTE_READ | IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | + IBV_ACCESS_REMOTE_ATOMIC + ); + if( ibv_mr_new_p == NULL ) + slot.mr.reset(); + else + slot.mr.reset( ibv_mr_new_p, ibv_dereg_mr ); + if (!slot.mr) { + LOG(1, "Could not register memory area at " + << addr << " of size " << size << " with IB device"); + throw Exception("Could not register memory area"); + } + } + MemoryRegistration local((char *) addr, size, size?slot.mr->lkey:0, + size?slot.mr->rkey:0, m_pid); + + SlotID id = m_memreg.addLocalReg( slot ); + + m_memreg.update( id ).glob.resize( m_nprocs ); + m_memreg.update( id ).glob[m_pid] = local; + LOG(4, "Memory area " << addr << " of size " << size + << " has been locally registered. 
Slot = " << id );
+    return id;
+}
+
+Zero :: SlotID Zero :: regGlobal( void * addr, size_t size )
+{
+    ASSERT( size <= m_maxRegSize );
+
+    MemorySlot slot;
+    if ( size > 0 ) {
+        LOG(4, "Registering globally memory area at " << addr << " of size " << size );
+        struct ibv_mr * const ibv_mr_new_p = ibv_reg_mr(
+            m_pd.get(), addr, size,
+            IBV_ACCESS_REMOTE_READ | IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE |
+            IBV_ACCESS_REMOTE_ATOMIC
+        );
+        if( ibv_mr_new_p == NULL )
+            slot.mr.reset();
+        else
+            slot.mr.reset( ibv_mr_new_p, ibv_dereg_mr );
+        if (!slot.mr) {
+            LOG(1, "Could not register memory area at "
+                << addr << " of size " << size << " with IB device");
+            // signal the failure to all peers; this must pair with the
+            // allreduceOr on the non-failing path below
+            m_comm.allreduceOr(true);
+            throw Exception("Could not register memory area");
+        }
+    }
+    if (m_comm.allreduceOr(false))
+        throw Exception("Another process could not register memory area");
+
+    SlotID id = m_memreg.addGlobalReg( slot );
+    MemorySlot & ref = m_memreg.update(id);
+    // exchange memory registration info globally
+    ref.glob.resize(m_nprocs);
+
+    MemoryRegistration local((char *) addr, size, size?slot.mr->lkey:0,
+            size?slot.mr->rkey:0, m_pid);
+    LOG(4, "All-gathering memory register data" );
+
+    m_comm.allgather( local, ref.glob.data() );
+    LOG(4, "Memory area " << addr << " of size " << size
+            << " has been globally registered. Slot = " << id );
+    return id;
+}
+
+Zero :: TagID Zero :: regTag() {
+    if( m_free_tags.size() == 0 ) {
+        throw Exception("No free tags available");
+    }
+    const TagID ret = m_free_tags.back();
+    // Initialize a new tag
+    tryIncrement(Op::SEND, Phase::INIT, ret);
+    m_free_tags.pop_back();
+    LOG(4, "Tag " << ret << " has been allocated");
+    return ret;
+}
+
+void Zero :: dereg( SlotID id )
+{
+    m_memreg.removeReg( id );
+    LOG(4, "Memory area of slot " << id << " has been deregistered");
+}
+
+void Zero :: deregTag( TagID id )
+{
+    ASSERT( m_free_tags.size() < m_tag_capacity );
+    m_free_tags.push_back( id );
+    tagActive[id] = false;
+    m_recvInitMsgCount[id] = 0;
+    m_getInitMsgCount[id] = 0;
+    m_sendInitMsgCount[id] = 0;
+    rcvdMsgCount[id] = 0;
+    sentMsgCount[id] = 0;
+    LOG(4, "Tag " << id << " has been released");
+}
+
+void Zero :: put( SlotID srcSlot, size_t srcOffset,
+        int dstPid, SlotID dstSlot, size_t dstOffset, size_t size, lpf_msg_attr_t attr)
+{
+    const MemorySlot & src = m_memreg.lookup( srcSlot );
+    const MemorySlot & dst = m_memreg.lookup( dstSlot );
+    const uint32_t tag = attr == NULL
+        ? INVALID_TAG
+        : * static_cast< uint32_t * >(attr);
+
+    ASSERT( src.mr );
+
+    int numMsgs = size/m_maxMsgSize + (size % m_maxMsgSize > 0);
+    //+1 if last msg size < m_maxMsgSize
+    if (size == 0) numMsgs = 1;
+
+    struct ibv_sge sges[numMsgs];
+    struct ibv_send_wr srs[numMsgs];
+    struct ibv_sge *sge;
+    struct ibv_send_wr *sr;
+    for (int i=0; i < numMsgs; i++) {
+        sge = &sges[i]; std::memset(sge, 0, sizeof(ibv_sge));
+        sr = &srs[i]; std::memset(sr, 0, sizeof(ibv_send_wr));
+        const char * localAddr
+            = static_cast< const char * >(src.glob[m_pid]._addr) + srcOffset;
+        const char * remoteAddr
+            = static_cast< const char * >(dst.glob[dstPid]._addr) + dstOffset;
+
+        sge->addr = reinterpret_cast< uint64_t >( localAddr );
+        sge->length = std::min(size, m_maxMsgSize );
+        sge->lkey = src.mr->lkey;
+        sges[i] = *sge;
+
+        bool lastMsg = (i == numMsgs-1);
+        sr->next = lastMsg ? NULL : &srs[ i+1];
+        // since a reliable connection keeps packets in order,
+        // we only need a signal from the last message in the queue
+        sr->send_flags = lastMsg ? IBV_SEND_SIGNALED : 0;
+        sr->opcode = lastMsg? 
IBV_WR_RDMA_WRITE_WITH_IMM : IBV_WR_RDMA_WRITE; + // use wr_id to store the comm tag (passed as attr) + sr->wr_id = tag; + // use wr_id to store the comm tag (passed as attr) + sr->imm_data = tag; + + sr->sg_list = &sges[i]; + sr->num_sge = 1; + sr->wr.rdma.remote_addr = reinterpret_cast( remoteAddr ); + sr->wr.rdma.rkey = dst.glob[dstPid]._rkey; + + srs[i] = *sr; + size -= sge->length; + srcOffset += sge->length; + dstOffset += sge->length; + + LOG(4, "PID " << m_pid << ": Enqueued put message of " << sge->length + << " bytes to " << dstPid << " on slot" << dstSlot << " and tag " << attr); + } + struct ibv_send_wr *bad_wr = NULL; + // srs[0] should be sufficient because the rest of srs are on a chain + if (int err = ibv_post_send(m_connectedQps[dstPid].get(), &srs[0], &bad_wr )) + { + LOG(1, "Error while posting RDMA requests: " << std::strerror(err) ); + throw Exception("Error while posting RDMA requests"); + } + + tryIncrement(Op::SEND, Phase::PRE, tag); +} + +void Zero :: get( int srcPid, SlotID srcSlot, size_t srcOffset, + SlotID dstSlot, size_t dstOffset, size_t size, lpf_msg_attr_t attr) +{ + const MemorySlot & src = m_memreg.lookup( srcSlot ); + const MemorySlot & dst = m_memreg.lookup( dstSlot ); + const uint32_t tag = attr == NULL + ? INVALID_TAG + : * static_cast< uint32_t * >(attr); + + ASSERT( dst.mr ); + + int numMsgs = size/m_maxMsgSize + (size % m_maxMsgSize > 0); + //+1 if last msg size < m_maxMsgSize + + struct ibv_sge sges[numMsgs+1]; + struct ibv_send_wr srs[numMsgs+1]; + struct ibv_sge *sge; + struct ibv_send_wr *sr; + + + for(int i = 0; i< numMsgs; i++){ + sge = &sges[i]; std::memset(sge, 0, sizeof(ibv_sge)); + sr = &srs[i]; std::memset(sr, 0, sizeof(ibv_send_wr)); + + const char * localAddr + = static_cast(dst.glob[m_pid]._addr) + dstOffset; + const char * remoteAddr + = static_cast(src.glob[srcPid]._addr) + srcOffset; + + sge->addr = reinterpret_cast( localAddr ); + sge->length = std::min(size, m_maxMsgSize ); + sge->lkey = dst.mr->lkey; + sges[i] = *sge; + LOG(4, "PID " << m_pid << ": Enqueued get message of " << sge->length + << " bytes from " << srcPid << " on slot" << srcSlot ); + + bool lastMsg = (i == numMsgs-1); + sr->next = lastMsg ? NULL : &srs[ i+1]; + sr->send_flags = lastMsg ? 
IBV_SEND_SIGNALED : 0; + + sr->sg_list = &sges[i]; + sr->num_sge = 1; + sr->opcode = IBV_WR_RDMA_READ; + sr->wr.rdma.remote_addr = reinterpret_cast( remoteAddr ); + sr->wr.rdma.rkey = src.glob[srcPid]._rkey; + // This logic is reversed compared to ::put + sr->wr_id = tag; // <= This enables virtual tag matching + sr->imm_data = 0; // This is irrelevant as we don't send _WITH_IMM + srs[i] = *sr; + size -= sge->length; + srcOffset += sge->length; + dstOffset += sge->length; + } + + struct ibv_send_wr *bad_wr = NULL; + if (int err = ibv_post_send(m_connectedQps[srcPid].get(), &srs[0], &bad_wr )) + { + + LOG(1, "Error while posting RDMA requests: " << std::strerror(err) ); + if (err == ENOMEM) { + LOG(1, "Specific error code: ENOMEM (send queue is full or no resources)"); + } + throw Exception("Error while posting RDMA requests"); + } + tryIncrement(Op::GET, Phase::PRE, tag); + +} + +void Zero :: get_rcvd_msg_count(size_t &rcvd_msgs, const struct SyncAttr * attr) + noexcept +{ + if( attr == nullptr || attr->tag == INVALID_TAG ) { + rcvd_msgs = m_recvdMsgs; + } else { + rcvd_msgs = rcvdMsgCount[attr->tag] + getMsgCount[attr->tag]; + } +} + +void Zero :: get_sent_msg_count(size_t &sent_msgs, const struct SyncAttr * attr) + noexcept +{ + if( attr == nullptr || attr->tag == INVALID_TAG ) { + sent_msgs = m_sentMsgs; + } else { + sent_msgs = sentMsgCount[attr->tag]; + } +} + +void Zero :: createNewSyncAttr(struct SyncAttr * * attr) { + *attr = new struct SyncAttr; + (*attr)->tag = std::numeric_limits::max(); + (*attr)->expected_sent = 0; + (*attr)->expected_rcvd = 0; +} + +void Zero :: doLocalProgress(int& error) { + + error = 0; + LOG(1, "Polling for messages" ); + struct ibv_wc wcs[POLL_BATCH]; + int pollResult = ibv_poll_cq(m_cqLocal.get(), POLL_BATCH, wcs); + if ( pollResult > 0) { + LOG(4, "Process " << m_pid << ": Received " << pollResult << " acknowledgements"); + + for (int i = 0; i < pollResult ; ++i) { + if (wcs[i].status != IBV_WC_SUCCESS) + { + LOG( 2, "Got bad completion status from IB message." + " status = 0x" << std::hex << wcs[i].status + << ", vendor syndrome = 0x" << std::hex + << wcs[i].vendor_err ); + const char * status_descr; + status_descr = ibv_wc_status_str(wcs[i].status); + LOG( 2, "Process " << m_pid << ": The work completion status string: " << status_descr); + error = 1; + } + else { + LOG(4, "Process " << m_pid << " Send wcs[" << i << "].src_qp = "<< wcs[i].src_qp); + LOG(4, "Process " << m_pid << " Send wcs[" << i << "].slid = "<< wcs[i].slid); + LOG(4, "Process " << m_pid << " Send wcs[" << i << "].wr_id = "<< wcs[i].wr_id); + LOG(4, "Process " << m_pid << " Send wcs[" << i << "].imm_data = "<< wcs[i].imm_data); + } + + TagID slot = wcs[i].wr_id; + // Ignore compare-and-swap atomics! 
+            if (wcs[i].opcode != IBV_WC_COMP_SWAP) {
+                // This is a GET call completing
+                if (wcs[i].opcode == IBV_WC_RDMA_READ) {
+                    tryIncrement(Op::GET, Phase::POST, slot);
+                    LOG(4, "Rank " << m_pid << " with GET, increments getMsgCount to "
+                            << getMsgCount[slot] << " for LPF slot " << slot);
+                }
+                // This is a PUT call completing
+                if (wcs[i].opcode == IBV_WC_RDMA_WRITE) {
+                    tryIncrement(Op::SEND, Phase::POST, slot);
+                    LOG(4, "Rank " << m_pid << " with SEND, increments sentMsgCount to "
+                            << sentMsgCount[slot] << " for LPF slot " << slot);
+                }
+
+            }
+        }
+    }
+    else if (pollResult < 0)
+    {
+        LOG( 1, "Failed to poll IB completion queue" );
+        throw Exception("Poll CQ failure");
+    }
+}
+
+void Zero :: flushReceived() {
+    doRemoteProgress();
+}
+
+void Zero :: flushSent()
+{
+    int isError = 0;
+
+    bool sendsComplete;
+    do {
+        sendsComplete = true;
+        for (size_t i = 0; i < m_tag_capacity; i++) {
+            if (m_sendInitMsgCount[i] > sentMsgCount[i] || m_getInitMsgCount[i] > getMsgCount[i]) {
+                sendsComplete = false;
+                doLocalProgress(isError);
+                if (isError) {
+                    LOG(1, "Error in doLocalProgress. Most likely issue is "
+                        << "that the receiver is not calling ibv_post_srq_recv!\n");
+                    std::abort();
+                }
+            }
+        }
+    } while (!sendsComplete);
+
+}
+
+void Zero :: countingSyncPerSlot(const TagID tag, const size_t expectedSent,
+        const size_t expectedRecvd)
+{
+    bool sentOK = false;
+    bool recvdOK = false;
+    if (expectedSent == 0) { sentOK = true; }
+    if (expectedRecvd == 0) { recvdOK = true; }
+    int error;
+
+    // This is semantically equivalent to a non-blocking test call,
+    // triggering progress on the network card without expecting anything
+    // from a particular tag
+    if (tag == INVALID_TAG && sentOK && recvdOK) {
+        doLocalProgress(error);
+        if (error) {
+            LOG(1, "Error in doLocalProgress");
+            throw std::runtime_error("Error in doLocalProgress");
+        }
+        // also drive progress on incoming (remote) traffic
+        doRemoteProgress();
+    }
+
+    // This is a blocking call on a particular tag with some expected
+    // sent / received messages
+    else {
+        if (tagActive[tag]) {
+            do {
+                doLocalProgress(error);
+                if (error) {
+                    LOG(1, "Error in doLocalProgress");
+                    throw std::runtime_error("Error in doLocalProgress");
+                }
+                // also drive progress on incoming (remote) traffic
+                doRemoteProgress();
+
+                /*
+                 * Either nothing is expected for a direction (sentOK/recvdOK
+                 * already true), or the completed message counts must have
+                 * reached the expectations.
+                 */
+                sentOK = (sentOK || sentMsgCount[tag] >= expectedSent);
+                // We can receive messages passively (from remote puts) and actively (from our gets)
+                recvdOK = (recvdOK || (rcvdMsgCount[tag] + getMsgCount[tag]) >= expectedRecvd);
+                LOG(4, "PID: " << m_pid << " rcvdMsgCount[" << tag << "] = " << rcvdMsgCount[tag]
+                        << " expectedRecvd = " << expectedRecvd
+                        << " getMsgCount[" << tag << "] = " << getMsgCount[tag]
+                        << " sentMsgCount[" << tag << "] = " << sentMsgCount[tag]
+                        << " expectedSent = " << expectedSent);
+            } while (!(sentOK && recvdOK));
+        }
+    }
+}
+
+void Zero :: syncPerTag(TagID tag) {
+    int error;
+    // this barrier ensures m_recvInitMsgCount is accurate (TBC)
+    m_comm.barrier();
+    do {
+        doLocalProgress(error);
+        if (error) {
+            LOG(1, "Error in doLocalProgress");
+            throw std::runtime_error("Error in doLocalProgress");
+        }
+        doRemoteProgress();
+    }
+    while ((rcvdMsgCount.at(tag) < m_recvInitMsgCount.at(tag)) ||
+            (sentMsgCount.at(tag) < m_sendInitMsgCount.at(tag)));
+    // this barrier ensures local buffers remain locked until remote uses are
+    // guaranteed complete. 
TODO FIXME: an acknowledgement mechanism would + // make this barrier unnecessary. + m_comm.barrier(); +} + +void Zero :: sync(bool resized,const struct SyncAttr * attr) +{ + const bool defaultSync = (attr == nullptr) ; + if (defaultSync) + { + LOG(4, "Process " << m_pid << " going for default sync (uses barrier)"); + (void) resized; + + // flush send queues + flushSent(); + // flush receive queues + flushReceived(); + + m_comm.barrier(); + + // done + return; + } + + ASSERT(attr != NULL); + + const bool tagSync = attr->expected_sent == 0 && attr->expected_rcvd == 0 + && attr->tag != INVALID_TAG; + if (tagSync) + { + LOG(4, "Process " << m_pid << " going for syncPerTag (uses barrier)"); + syncPerTag(attr->tag); + return; + } + + LOG(4, "Process " << m_pid << " going for countingSync (no barrier!)"); + countingSyncPerSlot(attr->tag,attr->expected_sent,attr->expected_rcvd); +} + + +} } diff --git a/src/MPI/zero.hpp b/src/MPI/zero.hpp new file mode 100644 index 00000000..1885eba9 --- /dev/null +++ b/src/MPI/zero.hpp @@ -0,0 +1,276 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LPF_CORE_MPI_ZERO_HPP +#define LPF_CORE_MPI_ZERO_HPP + +#include +#include +#include +#include +#if __cplusplus >= 201103L + #include +#else + #include +#endif + +#include + +#include "linkage.hpp" +#include "sparseset.hpp" +#include "memreg.hpp" +#include "lpf/core.h" + +namespace lpf { + +class Communication; + +namespace mpi { + +#if __cplusplus >= 201103L +using std::shared_ptr; +#else +using std::tr1::shared_ptr; +#endif + +class MemoryRegistration { + public: + char * _addr; + size_t _size; + uint32_t _lkey; + uint32_t _rkey; + int _pid; + MemoryRegistration( + char * addr, size_t size, + uint32_t lkey, uint32_t rkey, + int pid + ) : _addr(addr), _size(size), _lkey(lkey), _rkey(rkey), _pid(pid) + {} + MemoryRegistration() : + _addr(nullptr), _size(0), + _lkey(0), _rkey(0), _pid(-1) + {} + size_t serialize(char ** buf); + static MemoryRegistration * deserialize(char * buf); +}; + +class _LPFLIB_LOCAL Zero +{ + +public: + + typedef size_t SlotID; + typedef uint32_t TagID; + + static constexpr TagID INVALID_TAG = std::numeric_limits::max(); + + struct Exception; + + struct SyncAttr { + TagID tag; + size_t expected_sent; + size_t expected_rcvd; + }; + + explicit Zero( Communication & ); + ~Zero(); + + void resizeMemreg( size_t size ); + void resizeMesgq( size_t size ); + void resizeTagreg( size_t size ); + + SlotID regLocal( void * addr, size_t size ); + SlotID regGlobal( void * addr, size_t size ); + TagID regTag(); + + void dereg( SlotID id ); + void deregTag( TagID id ); + + size_t getMaxMsgSize() const { + return m_maxMsgSize; + } + + void put( SlotID srcSlot, size_t srcOffset, + int dstPid, SlotID dstSlot, size_t dstOffset, size_t size, lpf_msg_attr_t attr ); + + void get( int srcPid, SlotID srcSlot, size_t srcOffset, + SlotID dstSlot, size_t dstOffset, size_t size, lpf_msg_attr_t attr ); + + void flushSent(); + + void 
flushReceived();
+
+    void doRemoteProgress();
+
+    void countingSyncPerSlot(const TagID tag, const size_t sent,
+            const size_t recvd);
+
+    /**
+     * syncPerTag only guarantees that all already scheduled sends (via put)
+     * and receives (via get) associated with a tag have completed. It does
+     * not guarantee that operations which have not yet been scheduled will
+     * complete (e.g. it does not wait for data that a remote process may
+     * still put into local memory, since that operation is scheduled
+     * one-sidedly by the remote process).
+     */
+    void syncPerTag(TagID tag);
+
+    // Do the communication and synchronize
+    // 'Reconnect' must be a globally replicated value
+    void sync(bool reconnect, const struct SyncAttr * attr);
+
+    void get_rcvd_msg_count(size_t &rcvd_msgs,
+            const struct SyncAttr * attr) noexcept;
+    void get_sent_msg_count(size_t &sent_msgs,
+            const struct SyncAttr * attr) noexcept;
+
+    void createNewSyncAttr(struct SyncAttr * * attr);
+
+    inline void destroySyncAttr(struct SyncAttr * attr)
+    {
+        delete attr;
+    }
+
+    inline TagID getTag(const struct SyncAttr &attr) noexcept
+    {
+        return attr.tag;
+    }
+
+    inline void setTag(const TagID tag, struct SyncAttr &attr) noexcept
+    {
+        attr.tag = tag;
+    }
+
+    inline void setZCAttr(size_t sent, size_t rcvd, struct SyncAttr &attr)
+        noexcept
+    {
+        attr.expected_sent = sent;
+        attr.expected_rcvd = rcvd;
+    }
+
+    inline void getZCAttr(const struct SyncAttr &attr,
+            size_t &sent, size_t &rcvd) noexcept
+    {
+        sent = attr.expected_sent;
+        rcvd = attr.expected_rcvd;
+    }
+
+protected:
+
+    typedef enum Op {
+        SEND,
+        RECV,
+        GET
+    } Op;
+
+    typedef enum Phase {
+        INIT,
+        PRE,
+        POST
+    } Phase;
+
+    Zero & operator=(const Zero & ); // assignment prohibited
+    Zero( const Zero & );            // copying prohibited
+
+    void stageQPs(size_t maxMsgs );
+    void reconnectQPs();
+
+    void doProgress();
+    void tryIncrement(const Op op, const Phase phase, const TagID slot)
+        noexcept;
+
+    void doLocalProgress(int& error);
+
+    struct MemorySlot {
+        shared_ptr< struct ibv_mr > mr;          // verbs structure
+        std::vector< MemoryRegistration > glob;  // array for global registrations
+    };
+
+    int    m_pid;           // local process ID
+    int    m_nprocs;        // number of processes
+    int    m_ibPort;        // local IB port to work with
+    int    m_gidIdx;
+    size_t m_maxRegSize;
+    size_t m_maxMsgSize;
+    size_t m_cqSize;
+    size_t m_minNrMsgs;
+    size_t m_maxSrs;        // maximum number of send requests per QP
+    size_t m_postCount;
+    size_t m_recvCount;
+    size_t m_tag_capacity;
+
+    shared_ptr< struct ibv_context > m_device;   // device handle
+    shared_ptr< struct ibv_pd > m_pd;            // protection domain
+    shared_ptr< struct ibv_cq > m_cq;            // completion queue
+    shared_ptr< struct ibv_cq > m_cqLocal;       // completion queue for locally initiated requests
+    shared_ptr< struct ibv_cq > m_cqRemote;      // completion queue for incoming (remote) writes
+    shared_ptr< struct ibv_srq > m_srq;          // shared receive queue
+    shared_ptr< struct ibv_mr > m_dummyMemReg;   // registration of dummy
+                                                 // buffer
+    std::atomic_size_t m_numMsgs;
+    std::atomic_size_t m_recvTotalInitMsgCount;
+    std::atomic_size_t m_sentMsgs;
+    std::atomic_size_t m_recvdMsgs;
+
+    uint16_t m_lid;         // LID of the IB port
+
+    Communication & m_comm;
+
+    std::string m_devName;  // IB device name
+
+    ibv_mtu m_mtu;
+
+    struct ibv_device_attr m_deviceAttr;
+
+    std::vector< TagID > m_free_tags;
+    std::vector< size_t > m_recvInitMsgCount;
+    std::vector< size_t > m_getInitMsgCount;
+    std::vector< size_t > m_sendInitMsgCount;
+
+    // Disconnected queue pairs
+    std::vector< shared_ptr< struct ibv_qp > > m_stagedQps;
+
+    // Connected queue pairs
+    std::vector< shared_ptr< struct ibv_qp > > m_connectedQps;
+
+    std::vector< struct ibv_send_wr > 
m_srs; // array of send requests + std::vector< size_t > m_srsHeads; // head of send queue per + // peer + std::vector< size_t > m_nMsgsPerPeer; // number of messages per + // peer + std::vector< pid_t > m_peerList; + + std::vector< struct ibv_sge > m_sges; // array of scatter/gather + // entries + std::vector< struct ibv_wc > m_wcs; // array of work completions + std::vector< char > m_dummyBuffer; // dummy receive buffer + + std::vector rcvdMsgCount; + std::vector sentMsgCount; + std::vector getMsgCount; + std::vector tagActive; + + SparseSet< pid_t > m_activePeers; + + CombinedMemoryRegister< MemorySlot > m_memreg; + +}; + + +} } + + +#endif diff --git a/src/MPI/zero.t.cpp b/src/MPI/zero.t.cpp new file mode 100644 index 00000000..81dfbd8b --- /dev/null +++ b/src/MPI/zero.t.cpp @@ -0,0 +1,324 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "zero.hpp" +#include "assert.hpp" +#include "mpilib.hpp" + +#include +#include + +using namespace lpf::mpi; + +extern "C" const int LPF_MPI_AUTO_INITIALIZE=0; + + +/** + * \pre P >= 1 + * \pre P <= 2 + */ +class ZeroTests : public testing::Test { + + protected: + + static void SetUpTestSuite() { + + MPI_Init(NULL, NULL); + Lib::instance(); + comm = new Comm(); + *comm = Lib::instance().world(); + comm->barrier(); + verbs = new Zero( *comm ); + } + + static void TearDownTestSuite() { + delete verbs; + verbs = nullptr; + delete comm; + comm = nullptr; + MPI_Finalize(); + } + + static Comm *comm; + static Zero *verbs; +}; + +lpf::mpi::Comm * ZeroTests::comm = nullptr; +Zero * ZeroTests::verbs = nullptr; + + +TEST_F( ZeroTests, init ) +{ + + comm->barrier(); +} + + +TEST_F( ZeroTests, resizeMemreg ) +{ + + verbs->resizeMemreg( 2 ); + + comm->barrier(); +} + + +TEST_F( ZeroTests, resizeMesgq ) +{ + + verbs->resizeMesgq( 2 ); + + comm->barrier(); +} + +TEST_F( ZeroTests, regVars ) +{ + + + char buf1[30] = "Hi"; + char buf2[30] = "Boe"; + + verbs->resizeMemreg( 2 ); + + Zero::SlotID b1 = verbs->regLocal( buf1, sizeof(buf1) ); + Zero::SlotID b2 = verbs->regGlobal( buf2, sizeof(buf2) ); + + comm->barrier(); + verbs->dereg(b1); + verbs->dereg(b2); +} + + +TEST_F( ZeroTests, put ) +{ + + char buf1[30] = "Hi"; + char buf2[30] = "Boe"; + + verbs->resizeMemreg( 2 ); + verbs->resizeMesgq( 1 ); + + Zero::SlotID b1 = verbs->regLocal( buf1, sizeof(buf1) ); + Zero::SlotID b2 = verbs->regGlobal( buf2, sizeof(buf2) ); + + comm->barrier(); + + verbs->put( b1, 0, (comm->pid() + 1)%comm->nprocs(), b2, 0, sizeof(buf1)); + + verbs->sync(true, nullptr); + EXPECT_EQ( "Hi", std::string(buf1) ); + EXPECT_EQ( "Hi", std::string(buf2) ); + verbs->dereg(b1); + verbs->dereg(b2); +} + + +TEST_F( ZeroTests, get ) +{ + + char buf1[30] = "Hoi"; + char buf2[30] = "Vreemd"; + + verbs->resizeMemreg( 2 ); + verbs->resizeMesgq( 1 ); + + Zero::SlotID b1 = verbs->regLocal( buf1, sizeof(buf1) ); + Zero::SlotID b2 = verbs->regGlobal( buf2, sizeof(buf2) ); + + comm->barrier(); + + verbs->get( (comm->pid() + 
1)%comm->nprocs(), b2, 0, + b1, 0, sizeof(buf2)); + + verbs->sync(true, nullptr); + EXPECT_EQ( "Vreemd", std::string(buf1) ); + EXPECT_EQ( "Vreemd", std::string(buf2) ); + verbs->dereg(b1); + verbs->dereg(b2); +} + + +TEST_F( ZeroTests, putAllToAll ) +{ + int nprocs = comm->nprocs(); + int pid = comm->pid(); + + const int H = 2.5 * nprocs; + + std::vector< int > a(H); + std::vector< int > b(H); + + for (int i = 0; i < H; ++i) { + a[i] = i * nprocs + pid ; + b[i] = nprocs*nprocs - ( i * nprocs + pid); + } + + verbs->resizeMemreg( 2 ); + verbs->resizeMesgq( H ); + + Zero::SlotID a1 = verbs->regGlobal( a.data(), sizeof(int)*a.size()); + Zero::SlotID b1 = verbs->regGlobal( b.data(), sizeof(int)*b.size()); + + comm->barrier(); + + for (int i = 0; i < H; ++i) { + int dstPid = (pid + i ) % nprocs; + verbs->put( a1, sizeof(int)*i, + dstPid, b1, sizeof(int)*i, sizeof(int)); + } + + verbs->sync(true, nullptr); + + for (int i = 0; i < H; ++i) { + int srcPid = (nprocs + pid - (i%nprocs)) % nprocs; + EXPECT_EQ( i*nprocs + pid, a[i] ) ; + EXPECT_EQ( i*nprocs + srcPid, b[i] ); + } + verbs->dereg(a1); + verbs->dereg(b1); + +} + +TEST_F( ZeroTests, getAllToAll ) +{ + int nprocs = comm->nprocs(); + int pid = comm->pid(); + + const int H = 100.3 * nprocs; + + std::vector< int > a(H), a2(H); + std::vector< int > b(H), b2(H); + + for (int i = 0; i < H; ++i) { + a[i] = i * nprocs + pid ; + a2[i] = a[i]; + b[i] = nprocs*nprocs - ( i * nprocs + pid); + b2[i] = i*nprocs+ (nprocs + pid + i) % nprocs; + } + + verbs->resizeMemreg( 2 ); + verbs->resizeMesgq( H ); + + Zero::SlotID a1 = verbs->regGlobal( a.data(), sizeof(int)*a.size()); + Zero::SlotID b1 = verbs->regGlobal( b.data(), sizeof(int)*b.size()); + + comm->barrier(); + + for (int i = 0; i < H; ++i) { + int srcPid = (pid + i) % nprocs; + verbs->get( srcPid, a1, sizeof(int)*i, + b1, sizeof(int)*i, sizeof(int)); + } + + verbs->sync(true, nullptr); + + EXPECT_EQ(a, a2); + EXPECT_EQ(b, b2); + + verbs->dereg(a1); + verbs->dereg(b1); + +} + + +TEST_F( ZeroTests, putHuge ) +{ + std::vector hugeMsg(3*verbs->getMaxMsgSize()); + std::vector< char > hugeBuf(3*verbs->getMaxMsgSize()); + LOG(4, "Allocating putHuge with vector size: " << hugeMsg.size()); + + for ( size_t i = 0; i < hugeMsg.size() ; ++i) + hugeMsg[i] = char( i ); + + verbs->resizeMemreg( 2 ); + verbs->resizeMesgq( 1 ); + + Zero::SlotID b1 = verbs->regLocal( hugeMsg.data(), hugeMsg.size() ); + Zero::SlotID b2 = verbs->regGlobal( hugeBuf.data(), hugeBuf.size() ); + + comm->barrier(); + + verbs->put( b1, 0, (comm->pid() + 1)%comm->nprocs(), b2, 0, hugeMsg.size() * sizeof(char) ); + + verbs->sync(true, nullptr); + + EXPECT_EQ( hugeMsg, hugeBuf ); + + verbs->dereg(b1); + verbs->dereg(b2); +} + +TEST_F( ZeroTests, getHuge ) +{ + + std::vector hugeMsg(3*verbs->getMaxMsgSize()); + std::vector< char > hugeBuf(3*verbs->getMaxMsgSize()); + LOG(4, "Allocating getHuge with vector size: " << hugeMsg.size()); + + for ( size_t i = 0; i < hugeMsg.size() ; ++i) + hugeMsg[i] = char(i); + + verbs->resizeMemreg( 2 ); + verbs->resizeMesgq( 1 ); + + Zero::SlotID b1 = verbs->regLocal( hugeMsg.data(), hugeMsg.size() ); + Zero::SlotID b2 = verbs->regGlobal( hugeBuf.data(), hugeBuf.size() ); + + comm->barrier(); + + verbs->get( (comm->pid() + 1)%comm->nprocs(), b2, 0, b1, 0, hugeMsg.size() * sizeof(char)); + + verbs->sync(true, nullptr); + + EXPECT_EQ(hugeMsg, hugeBuf); + + verbs->dereg(b1); + verbs->dereg(b2); +} + +TEST_F( ZeroTests, manyPuts ) +{ + + const unsigned N = 5000; + std::vector< unsigned char > buf1( N ); + std::vector< 
unsigned char > buf2( N ); + for (unsigned int i = 0 ; i < N; ++ i) + buf1[i] = i + comm->pid() ; + + verbs->resizeMemreg( 2 ); + verbs->resizeMesgq( N ); + + Zero::SlotID b1 = verbs->regLocal( buf1.data(), buf1.size() ); + Zero::SlotID b2 = verbs->regGlobal( buf2.data(), buf1.size() ); + + comm->barrier(); + + for ( unsigned i = 0 ; i < N; ++i) + verbs->put( b1, i, (comm->pid() + 1)%comm->nprocs(), b2, i, 1); + + verbs->sync(true, nullptr); + for ( unsigned i = 0 ; i < N; ++i) { + unsigned char b2_exp = i + (comm->pid() + comm->nprocs() - 1) % comm->nprocs(); + unsigned char b1_exp = i + comm->pid(); + EXPECT_EQ( b2_exp, buf2[i]); + EXPECT_EQ( b1_exp, buf1[i] ); + } + + verbs->dereg(b1); + verbs->dereg(b2); +} + diff --git a/src/debug/CMakeLists.txt b/src/debug/CMakeLists.txt index 7f3f9c92..0679775c 100644 --- a/src/debug/CMakeLists.txt +++ b/src/debug/CMakeLists.txt @@ -38,4 +38,3 @@ install(TARGETS ${libname} EXPORT lpf ) add_gtest(rwconflict_test "pthread" rwconflict.t.cpp rwconflict.cpp) - #$ ) diff --git a/src/hybrid/core.cpp b/src/hybrid/core.cpp index 404edda8..d98caf05 100644 --- a/src/hybrid/core.cpp +++ b/src/hybrid/core.cpp @@ -28,7 +28,7 @@ #include #include -#if __cplusplus >= 201103L +#if __cplusplus >= 201103L #include #else #include @@ -49,7 +49,7 @@ _LPFLIB_VAR const lpf_args_t LPF_NO_ARGS = { NULL, 0, NULL, 0, NULL, 0 }; _LPFLIB_VAR const lpf_sync_attr_t LPF_SYNC_DEFAULT = 0; -_LPFLIB_VAR const lpf_msg_attr_t LPF_MSG_DEFAULT = 0; +_LPFLIB_VAR const lpf_msg_attr_t LPF_MSG_DEFAULT = NULL; _LPFLIB_VAR const lpf_pid_t LPF_MAX_P = UINT_MAX; @@ -59,7 +59,7 @@ _LPFLIB_VAR const lpf_t LPF_NONE = NULL; _LPFLIB_VAR const lpf_init_t LPF_INIT_NONE = NULL; -_LPFLIB_VAR const lpf_t LPF_ROOT = static_cast(const_cast("LPF_ROOT")) ; +_LPFLIB_VAR const lpf_t LPF_ROOT = static_cast(const_cast("LPF_ROOT")) ; _LPFLIB_VAR const lpf_machine_t LPF_INVALID_MACHINE = { 0, 0, NULL, NULL }; @@ -68,7 +68,7 @@ namespace { using lpf::hybrid::LPF_CORE_IMPL_CONFIG::MachineParams; struct Init { - + lpf::hybrid::Thread m_thread; lpf::hybrid::MPI m_mpi; lpf_pid_t m_threadId, m_nThreads; @@ -84,18 +84,18 @@ namespace { lpf::hybrid::ThreadState * realContext( lpf_t ctx ) - { + { lpf_t c; if (ctx == LPF_ROOT) - c = &lpf::hybrid::ThreadState::root(); + c = &lpf::hybrid::ThreadState::root(); else c = ctx; return static_cast< lpf::hybrid::ThreadState *>(c); } } -_LPFLIB_API lpf_err_t lpf_hybrid_intialize( USE_THREAD(_t) thread, USE_MPI(_t) mpi, - lpf_pid_t threadId, lpf_pid_t nThreads, +_LPFLIB_API lpf_err_t lpf_hybrid_intialize( USE_THREAD(_t) thread, USE_MPI(_t) mpi, + lpf_pid_t threadId, lpf_pid_t nThreads, lpf_pid_t nodeId, lpf_pid_t nNodes, lpf_init_t * init ) { using namespace lpf::hybrid; @@ -138,12 +138,12 @@ _LPFLIB_API lpf_err_t lpf_hook( lpf_init_t init, lpf_spmd_t spmd, lpf_args_t arg using namespace lpf::hybrid; Init * params = static_cast(init); -#if __cplusplus >= 201103L +#if __cplusplus >= 201103L std::shared_ptr nodeState; #else std::tr1::shared_ptr nodeState; #endif - + NodeState * nodeStatePtr = NULL; if (params->m_threadId == 0) { @@ -172,15 +172,15 @@ _LPFLIB_API lpf_err_t lpf_hook( lpf_init_t init, lpf_spmd_t spmd, lpf_args_t arg } catch(std::bad_alloc & e ) { - LOG(1, "Not enough memory to run SPMD function on thread " - << params->m_threadId << " of node " + LOG(1, "Not enough memory to run SPMD function on thread " + << params->m_threadId << " of node " << nodeStatePtr->nodeId() ); failure = true; } catch(...) 
{ - LOG(1, "SPMD function of thread " - << params->m_threadId << " of node " + LOG(1, "SPMD function of thread " + << params->m_threadId << " of node " << nodeStatePtr->nodeId() << " threw an unexpected exception"); failure = true; } @@ -188,7 +188,7 @@ _LPFLIB_API lpf_err_t lpf_hook( lpf_init_t init, lpf_spmd_t spmd, lpf_args_t arg trc = reduceOr( params->m_thread, 0, failure); if ( trc != Thread::SUCCESS ) return LPF_ERR_FATAL; - if ( params->m_threadId == 0) + if ( params->m_threadId == 0) { MPI::err_t nrc = MPI::SUCCESS; nrc = reduceOr( params->m_mpi, 0, failure); @@ -198,7 +198,7 @@ _LPFLIB_API lpf_err_t lpf_hook( lpf_init_t init, lpf_spmd_t spmd, lpf_args_t arg } trc = broadcast( params->m_thread, 0, failure ); if ( trc != Thread::SUCCESS ) return LPF_ERR_FATAL; - + return failure?LPF_ERR_FATAL:LPF_SUCCESS; } @@ -281,16 +281,15 @@ _LPFLIB_API lpf_err_t lpf_deregister( } _LPFLIB_API lpf_err_t lpf_put( lpf_t ctx, - lpf_memslot_t src_slot, - size_t src_offset, - lpf_pid_t dst_pid, - lpf_memslot_t dst_slot, - size_t dst_offset, - size_t size, - lpf_msg_attr_t attr + lpf_memslot_t src_slot, + size_t src_offset, + lpf_pid_t dst_pid, + lpf_memslot_t dst_slot, + size_t dst_offset, + size_t size, + lpf_msg_attr_t attr ) { - (void) attr; using namespace lpf::hybrid; if (ctx == LPF_SINGLE_PROCESS) { char * null = NULL; @@ -301,24 +300,25 @@ _LPFLIB_API lpf_err_t lpf_put( lpf_t ctx, } ThreadState * t = realContext(ctx); - if (!t->error()) - t->put( src_slot, src_offset, dst_pid, dst_slot, dst_offset, size ); + if (!t->error()) { + t->put( src_slot, src_offset, dst_pid, dst_slot, dst_offset, size, + attr ); + } return LPF_SUCCESS; } _LPFLIB_API lpf_err_t lpf_get( - lpf_t ctx, - lpf_pid_t src_pid, - lpf_memslot_t src_slot, - size_t src_offset, - lpf_memslot_t dst_slot, + lpf_t ctx, + lpf_pid_t src_pid, + lpf_memslot_t src_slot, + size_t src_offset, + lpf_memslot_t dst_slot, lpf_memslot_t dst_offset, size_t size, lpf_msg_attr_t attr ) { - (void) attr; using namespace lpf::hybrid; if (ctx == LPF_SINGLE_PROCESS) { char * null = NULL; @@ -329,8 +329,10 @@ _LPFLIB_API lpf_err_t lpf_get( } ThreadState * t = realContext(ctx); - if (!t->error()) - t->get( src_pid, src_slot, src_offset, dst_slot, dst_offset, size ); + if (!t->error()) { + t->get( src_pid, src_slot, src_offset, dst_slot, dst_offset, size, + attr ); + } return LPF_SUCCESS; } @@ -338,12 +340,11 @@ _LPFLIB_API lpf_err_t lpf_sync( lpf_t ctx, lpf_sync_attr_t attr ) { (void) attr; using namespace lpf::hybrid; - if (ctx == LPF_SINGLE_PROCESS) + if (ctx == LPF_SINGLE_PROCESS) return LPF_SUCCESS; return realContext(ctx)->sync(); } - _LPFLIB_API lpf_err_t lpf_probe( lpf_t ctx, lpf_machine_t * params ) { using namespace lpf::hybrid; @@ -364,7 +365,7 @@ _LPFLIB_API lpf_err_t lpf_probe( lpf_t ctx, lpf_machine_t * params ) _LPFLIB_API lpf_err_t lpf_resize_message_queue( lpf_t ctx, size_t max_msgs ) { using namespace lpf::hybrid; - if (ctx == LPF_SINGLE_PROCESS) + if (ctx == LPF_SINGLE_PROCESS) return LPF_SUCCESS; ThreadState * t = realContext(ctx); @@ -377,7 +378,7 @@ _LPFLIB_API lpf_err_t lpf_resize_message_queue( lpf_t ctx, size_t max_msgs ) _LPFLIB_API lpf_err_t lpf_resize_memory_register( lpf_t ctx, size_t max_regs ) { using namespace lpf::hybrid; - if (ctx == LPF_SINGLE_PROCESS) + if (ctx == LPF_SINGLE_PROCESS) return LPF_SUCCESS; ThreadState * t = realContext(ctx); diff --git a/src/hybrid/state.hpp b/src/hybrid/state.hpp index 6ae1dd3a..ddc98d64 100644 --- a/src/hybrid/state.hpp +++ b/src/hybrid/state.hpp @@ -289,8 +289,12 @@ class _LPFLIB_LOCAL 
ThreadState { void put( lpf_memslot_t src_slot, size_t src_offset, pid_t dst_pid, lpf_memslot_t dst_slot, size_t dst_offset, - size_t size) + size_t size, lpf_msg_attr_t attr ) { + (void) attr; // current implementation ignores attributes -- note that + // handling e.g. zero-cost in the hybrid setting is not exactly + // trivial, and that simply applying zero-cost on the top level + // only will not lead to correct behaviour typedef NodeMemReg::Memory Memory; if (size <= 0) return; @@ -314,8 +318,12 @@ class _LPFLIB_LOCAL ThreadState { void get( pid_t src_pid, lpf_memslot_t src_slot, size_t src_offset, lpf_memslot_t dst_slot, size_t dst_offset, - size_t size ) + size_t size, lpf_msg_attr_t attr ) { + (void) attr; // current implementation ignores attributes -- note that + // handling e.g. zero-cost in the hybrid setting is not exactly + // trivial, and that simply applying zero-cost on the top level + // only will not lead to correct behaviour typedef NodeMemReg::Memory Memory; if (size <= 0) return; diff --git a/src/imp/core.c b/src/imp/core.c index e076b811..bb6c88e0 100644 --- a/src/imp/core.c +++ b/src/imp/core.c @@ -34,7 +34,7 @@ const lpf_args_t LPF_NO_ARGS = { NULL, 0, NULL, 0, NULL, 0 }; const lpf_sync_attr_t LPF_SYNC_DEFAULT = 0; -const lpf_msg_attr_t LPF_MSG_DEFAULT = 0; +const lpf_msg_attr_t LPF_MSG_DEFAULT = NULL; const lpf_pid_t LPF_MAX_P = UINT_MAX; diff --git a/src/pthreads/core.cpp b/src/pthreads/core.cpp index 080b6a1d..776f0c1c 100644 --- a/src/pthreads/core.cpp +++ b/src/pthreads/core.cpp @@ -52,7 +52,7 @@ const lpf_args_t LPF_NO_ARGS = { NULL, 0, NULL, 0, NULL, 0 }; const lpf_sync_attr_t LPF_SYNC_DEFAULT = 0; -const lpf_msg_attr_t LPF_MSG_DEFAULT = 0; +const lpf_msg_attr_t LPF_MSG_DEFAULT = NULL; const lpf_pid_t LPF_MAX_P = UINT_MAX; @@ -296,8 +296,8 @@ lpf_err_t lpf_put( lpf_msg_attr_t attr ) { - (void) attr; // ignore parameter 'msg' since this implementation only - // implements core functionality + (void) attr; // ignore parameter 'msg' since this engine only implements + // core functionality lpf::ThreadLocalData * thread = realCtx(ctx); if (!thread->isAborted()) @@ -318,8 +318,8 @@ lpf_err_t lpf_get( lpf_msg_attr_t attr ) { - (void) attr; // ignore parameter 'msg' since this implementation only - // implements core functionality + (void) attr; // ignore parameter 'msg' since this engine only implements + // core functionality lpf::ThreadLocalData * thread = realCtx(ctx); if (!thread->isAborted()) diff --git a/src/pthreads/threadlocaldata.hpp b/src/pthreads/threadlocaldata.hpp index 66d56160..92f99b72 100644 --- a/src/pthreads/threadlocaldata.hpp +++ b/src/pthreads/threadlocaldata.hpp @@ -105,7 +105,7 @@ class _LPFLIB_LOCAL ThreadLocalData { return m_atExit[0]; } err_t sync( bool expectExit = false ); // nothrow - + private: ThreadLocalData( const ThreadLocalData & ) ; // prohibit copying ThreadLocalData & operator=( const ThreadLocalData & ); // prohibit assignment diff --git a/tests/functional/CMakeLists.txt b/tests/functional/CMakeLists.txt index 0eb7eea6..65182f6f 100644 --- a/tests/functional/CMakeLists.txt +++ b/tests/functional/CMakeLists.txt @@ -126,7 +126,6 @@ foreach (LPF_IMPL_ID ${ENGINES}) get_filename_component(baseName ${testSource} NAME_WE ) set(exeName "${baseName}_${LPF_IMPL_ID}_${LPF_IMPL_CONFIG}${mode}") add_gtest(${exeName} ${LPF_IMPL_ID} ${debug} "${CMAKE_CURRENT_SOURCE_DIR}/${testSource}") - endforeach(testSource) endforeach(LPF_IMPL_ID) diff --git a/tests/functional/func_bsplib_hpsend_many.cpp b/tests/functional/func_bsplib_hpsend_many.cpp 
index d531eea8..3de0d3c1 100644 --- a/tests/functional/func_bsplib_hpsend_many.cpp +++ b/tests/functional/func_bsplib_hpsend_many.cpp @@ -31,8 +31,8 @@ void spmd( lpf_t lpf, lpf_pid_t pid, lpf_pid_t nprocs, lpf_args_t args) bsplib_t bsplib; size_t maxhpregs = (size_t) -1; - const int pthread = 1, mpirma = 2, mpimsg = 3, hybrid = 4, ibverbs=5; - (void) pthread; (void) mpirma; (void) mpimsg; (void) hybrid; (void) ibverbs; + const int pthread = 1, mpirma = 2, mpimsg = 3, hybrid = 4, ibverbs=5, zero=6; + (void) pthread; (void) mpirma; (void) mpimsg; (void) hybrid; (void) ibverbs; (void) zero; if (LPF_CORE_IMPL_ID == mpirma ) { maxhpregs = 10; // because MPI RMA only supports a limited number diff --git a/tests/functional/func_lpf_probe_parallel_nested.cpp b/tests/functional/func_lpf_probe_parallel_nested.cpp index f594b7b8..5381bffe 100644 --- a/tests/functional/func_lpf_probe_parallel_nested.cpp +++ b/tests/functional/func_lpf_probe_parallel_nested.cpp @@ -117,8 +117,8 @@ void spmd1( lpf_t lpf, lpf_pid_t pid, lpf_pid_t nprocs, lpf_args_t args) EXPECT_LT( 0.0, (*(subMachine.g))(machine.p, (size_t)(-1), LPF_SYNC_DEFAULT) ); EXPECT_LT( 0.0, (*(subMachine.l))(machine.p, (size_t)(-1), LPF_SYNC_DEFAULT) ); - const int pthread = 1, mpirma = 1, mpimsg = 1, hybrid = 0, ibverbs=1; - (void) pthread; (void) mpirma; (void) mpimsg; (void) hybrid; (void) ibverbs; + const int pthread = 1, mpirma = 1, mpimsg = 1, hybrid = 0, ibverbs=1, zero = 1; + (void) pthread; (void) mpirma; (void) mpimsg; (void) hybrid; (void) ibverbs; (void) zero; if (LPF_CORE_IMPL_ID) // this part is disabled for the hybrid implementation, because { // that one doesn't do generic nesting of lpf_exec's EXPECT_EQ( 1, subMachine.free_p == 2 || subMachine.free_p == 3 ); @@ -203,5 +203,4 @@ TEST( API, func_lpf_probe_parallel_nested ) rc = lpf_exec( LPF_ROOT, machine.p / 2, &spmd1, args ); EXPECT_EQ( LPF_SUCCESS, rc ); - } diff --git a/tests/functional/macro_LPF_VERSION.cpp b/tests/functional/macro_LPF_VERSION.cpp index 7588aeea..f513f635 100644 --- a/tests/functional/macro_LPF_VERSION.cpp +++ b/tests/functional/macro_LPF_VERSION.cpp @@ -19,10 +19,10 @@ #include "gtest/gtest.h" #ifdef _LPF_VERSION - #if _LPF_VERSION == 202000L + #if _LPF_VERSION == 202500L // everything is OK #else - #error Macro _LPF_VERSION has not been defined as 202000L + #error Macro _LPF_VERSION has not been defined as 202500L #endif #else #error Macro _LPF_VERSION has not been defined @@ -35,5 +35,5 @@ */ TEST( API, macro_LPF_VERSION ) { - EXPECT_EQ( 202000L, _LPF_VERSION ); + EXPECT_EQ( 202500L, _LPF_VERSION ); }
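
A minimal usage sketch of the tag-based synchronisation that the zero engine introduces, based on the Zero API declared in src/MPI/zero.hpp and the tests in src/MPI/zero.t.cpp. This sketch is not part of the patch; the names engine, peer, srcSlot, dstSlot and len are illustrative assumptions:

    using lpf::mpi::Zero;

    // put()/get() read the tag through the (void *) lpf_msg_attr_t argument,
    // so the attribute must point at a uint32_t holding the tag.
    Zero::TagID tag = engine.regTag();
    uint32_t msgTag = tag;
    engine.put( srcSlot, 0, peer, dstSlot, 0, len, &msgTag );

    // Wait until one send under this tag has completed locally; this
    // dispatches to countingSyncPerSlot and requires no global barrier.
    Zero::SyncAttr * attr = nullptr;
    engine.createNewSyncAttr( &attr );
    engine.setTag( tag, *attr );
    engine.setZCAttr( 1 /* expected_sent */, 0 /* expected_rcvd */, *attr );
    engine.sync( false, attr );
    engine.destroySyncAttr( attr );
    engine.deregTag( tag );

Passing a null SyncAttr instead selects the default, barrier-based synchronisation, which is what the functional tests in zero.t.cpp exercise.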