diff --git a/CMakeLists.txt b/CMakeLists.txt
index e0850a84..eb12c8bf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -52,12 +52,12 @@ set(CPACK_PACKAGE_VERSION_MAJOR "${VERSION_MAJOR}")
 set(CPACK_PACKAGE_VERSION_MINOR "${VERSION_MINOR}")
 set(CPACK_PACKAGE_VERSION_PATCH "${VERSION_PATCH}")
 set(CPACK_PACKAGE_DESCRIPTION_SUMMARY
-	"A high performance BSP communications library" )
+  "A high performance BSP communications library" )
 
 set(CPACK_SOURCE_GENERATOR "TGZ" )
 set(CPACK_SOURCE_IGNORE_FILES "/\\\\.git/" "/\\\\.svn/" "\\\\.swp$" "/site/" "/build/" "/pclint/" "/junit/" "/ideas/" )
 set(CPACK_SOURCE_PACKAGE_FILE_NAME
-	"LPF-${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}-${VERSION_PACKAGE}")
+  "LPF-${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}-${VERSION_PACKAGE}")
 
 set(CPACK_GENERATOR "RPM")
 set(CPACK_RPM_PACKAGE_ARCHITECTURE "x86_64")
@@ -183,10 +183,29 @@ endif()
 
 #enable the hybrid engine
 if ( LIB_POSIX_THREADS AND LIB_MATH AND LIB_DL AND MPI_FOUND
-        AND MPI_IS_THREAD_COMPAT AND MPI_IS_NOT_OPENMPI1
-	AND ENABLE_IBVERBS )
-    list(APPEND ENGINES "hybrid")
-    set(HYBRID_ENGINE_ENABLED on)
+        AND MPI_IS_THREAD_COMPAT AND MPI_IS_NOT_OPENMPI1 )
+    if( ENABLE_IBVERBS )
+        set(LPFLIB_HYBRID_MPI_ENGINE "ibverbs" CACHE STRING
+            "Choice of MPI engine to use for inter-process communication")
+        list(APPEND ENGINES "hybrid")
+        set(HYBRID_ENGINE_ENABLED on)
+    elseif( MPI_RMA )
+        set(LPFLIB_HYBRID_MPI_ENGINE "mpirma" CACHE STRING
+            "Choice of MPI engine to use for inter-process communication")
+        list(APPEND ENGINES "hybrid")
+        set(HYBRID_ENGINE_ENABLED on)
+    elseif( LIB_MATH AND LIB_DL AND MPI_FOUND )
+        set(LPFLIB_HYBRID_MPI_ENGINE "mpimsg" CACHE STRING
+            "Choice of MPI engine to use for inter-process communication")
+        list(APPEND ENGINES "hybrid")
+        set(HYBRID_ENGINE_ENABLED on)
+    endif()
+    if( HYBRID_ENGINE_ENABLED )  # report which inter-process transport the hybrid engine uses
+        message( STATUS "Hybrid engine will be built using the ${LPFLIB_HYBRID_MPI_ENGINE} engine" )
+    else()  # none of the transport branches above enabled the hybrid engine
+        message( WARNING "No suitable inter-node communication engine found; "
+            "hybrid engine will not be built" )
+    endif()
 endif()
 
 message( STATUS "The following engines will be built: ${ENGINES}")
@@ -209,6 +228,7 @@ endif()
 
 # When system is not Linux, enable conditionally compiled blocks
 if (APPLE)
+    message( WARNING "LPF compilation on OS X is not regularly tested" )
     add_definitions(-DLPF_ON_MACOS=1)
 endif()
 
@@ -233,8 +253,8 @@ option(LPF_ENABLE_TESTS
        "Enable unit and API tests. This uses Google Testing and Mocking Framework"
        OFF)
 option(GTEST_AGREE_TO_LICENSE
-	"Does the user agree to the GoogleTest license"
-	OFF)
+      "Does the user agree to the GoogleTest license"
+      OFF)
 
 # C++ standard -- Google tests require newer C++ standard than C++11
 if (LPF_ENABLE_TESTS)
@@ -312,14 +332,43 @@ endfunction(target_compile_flags)
 # Source
 set(lpf_cflags)
 set(lpf_lib_link_flags)
-set(lpf_exe_link_flags "-rdynamic")
+set(lpf_exe_link_flags)
+
+# Populate lpf_cflags, lpf_lib_link_flags, lpf_exe_link_flags according to
+# (enabled) engine requirements
+#  - 0) PThreads engine needs nothing special
+#  - 1) MPI-based engines:
+if ( LIB_MATH AND LIB_DL AND MPI_FOUND )
+    # -fPIC and -rdynamic are necessary to ensure that symbols can be
+    # looked up by dlsym which is the mechanism lpf_exec uses to broadcast the
+    # function that should be executed
+    set(rdyn_lflag "-rdynamic")
+    if (APPLE)
+        # OS X does not support -rdynamic
+        set(rdyn_lflag "")
+    endif ()
+
+    # include flags: emit one "-I<dir> " per non-empty path entry, so that an
+    # empty MPI_C_INCLUDE_PATH does not yield a dangling "-I" that swallows -fPIC
+    string( REGEX REPLACE ";*([^;]+);*" "-I\\1 " mpi_include_flags "${MPI_C_INCLUDE_PATH}" )
+    set(lpf_cflags "${lpf_cflags} ${mpi_include_flags}-fPIC")
+
+    # linker flags:
+    set(lib_lflags "${MPI_C_LINK_FLAGS}")  #Note: the core library is already linked with MPI_C_LIBRARIES.
+    string(REPLACE ";" " " lib_lflags "${lib_lflags}") # So, no need to also link executables with it.
+    set(lpf_lib_link_flags "${lpf_lib_link_flags} ${lib_lflags} ${rdyn_lflag}")
+
+    # executable linker flags:
+    set(lpf_exe_link_flags "${lpf_exe_link_flags} ${rdyn_lflag}")
+endif ()
+#  ...add requirements from other engines here...
 
 # Collating all compile & link flags
 set(LPF_CORE_COMPILE_FLAGS "${lpf_cflags}" CACHE STRING "Compilation flags for all user code" )
 set(LPF_CORE_LIB_LINK_FLAGS "${lpf_lib_link_flags}" CACHE STRING "Flags to link user libraries" )
 set(LPF_CORE_EXE_LINK_FLAGS "${lpf_exe_link_flags}" CACHE STRING "Flags to link user executables" )
 
-# Compiling LPF programmes in the build dir
+# Compiling LPF programs in the build dir
 function( target_link_exe_with_core target )
     set(engine "imp")
     if (ARGV1)
@@ -343,10 +392,12 @@ if (LPF_ENABLE_TESTS)
     message(STATUS "Unit and API tests will be built. This requires CMake version 3.29 or higher, since we use recent features of the GoogleTest package in CMake.")
 
     if (NOT GTEST_AGREE_TO_LICENSE)
-	    message(FATAL_ERROR "The user needs to agree with the GoogleTest license to use tests (option GTEST_AGREE_TO_LICENSE=TRUE)")
+        message(FATAL_ERROR "The user needs to agree with the GoogleTest license to use tests (option GTEST_AGREE_TO_LICENSE=TRUE)")
     endif()
     # Enable testing in CMake
     enable_testing()
+    include(ProcessorCount)
+    ProcessorCount(processorCount)
     find_package(GTest)
     include(GoogleTest)
     if(NOT GTest_FOUND) # if not found, download it and pull it in
@@ -367,64 +418,84 @@ if (LPF_ENABLE_TESTS)
     file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/junit)
     set(test_output "${CMAKE_BINARY_DIR}/junit")
 
-    set(MY_TEST_LAUNCHER ${CMAKE_BINARY_DIR}/test_launcher.py)
-    configure_file( ${CMAKE_SOURCE_DIR}/test_launcher.py ${MY_TEST_LAUNCHER} @ONLY FILE_PERMISSIONS WORLD_EXECUTE OWNER_EXECUTE OWNER_WRITE OWNER_READ GROUP_EXECUTE GROUP_READ)
-    if( NOT Python3_FOUND )
-        find_package( Python3 REQUIRED)
-    endif()
+    find_package( Python3 REQUIRED COMPONENTS Interpreter)
+    set(MY_TEST_LAUNCHER ${Python3_EXECUTABLE} ${CMAKE_BINARY_DIR}/test_launcher.py)
+    configure_file( ${CMAKE_SOURCE_DIR}/test_launcher.py.in ${CMAKE_BINARY_DIR}/test_launcher.py @ONLY FILE_PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ GROUP_EXECUTE GROUP_READ)
 
     # Macro for adding a new GoogleTest test
     function(add_gtest testName ENGINE debug testSource )
-	    if ("{$ENGINE}" STREQUAL "")
-		    message(FATAL_ERROR "engine cannot be empty, ever!")
-	    endif()
-	    add_executable(${testName} ${testSource} ${ARGN})
-	    target_compile_definitions(${testName} PUBLIC LPF_CORE_IMPL_ID=${ENGINE})
-	    target_compile_definitions(${testName} PUBLIC LPF_CORE_MPI_USES_${ENGINE})
-	    if (debug)
-		    target_include_directories( ${testName} BEFORE PRIVATE ${CMAKE_SOURCE_DIR}/include/debug )
-		    target_link_libraries(${testName} lpf_debug lpf_hl_debug GTest::gtest GTest::gtest_main)
-	    else(debug)
-		    target_link_libraries(${testName} GTest::gtest GTest::gtest_main)
-	    endif(debug)
-
-
-	    # Extract test-specific information from comments of tests
-	    file(READ ${testSource} fileContents)
-	    string(REGEX MATCH "Exit code: ([0-9]+)" _ ${fileContents})
-	    set(retCode ${CMAKE_MATCH_1})
-	    string(REGEX MATCH "pre P >= ([0-9]+)"  _ ${fileContents})
-	    set(minProcs ${CMAKE_MATCH_1})
-	    string(REGEX MATCH "pre P <= ([0-9]+)" _ ${fileContents})
-	    set(maxProcs ${CMAKE_MATCH_1})
-	    string(REGEX MATCH "-probe ([0-9]+.[0-9]+)" _ ${fileContents})
-	    set(lpfProbeSecs ${CMAKE_MATCH_1})
-
-	    target_link_exe_with_core(${testName} ${ENGINE})
-
-
-	    if ("${minProcs}" STREQUAL "")
-		    set(minProcs "1")
-	    endif()
-	    if ("${maxProcs}" STREQUAL "")
-		    set(maxProcs "5")
-	    endif()
-	    if ("${lpfProbeSecs}" STREQUAL "")
-		    set(lpfProbeSecs "0.0")
-	    endif()
-	    if ("${retCode}" STREQUAL "")
-		    set(retCode "0")
-	    endif()
-
-	    # Most recent approach to Gtests, recommended!
-	    set_property(TARGET ${testName} PROPERTY TEST_LAUNCHER ${MY_TEST_LAUNCHER};-e;${ENGINE};-L;${CMAKE_BINARY_DIR}/lpfrun_build;-p;${minProcs};-P;${maxProcs};-t;${lpfProbeSecs};-R;${retCode})
-	    gtest_discover_tests(${testName}
-		    TEST_PREFIX ${ENGINE}_
-		    EXTRA_ARGS --gtest_output=xml:${test_output}/${ENGINE}_${testName}
-		    DISCOVERY_MODE POST_BUILD
-		    DISCOVERY_TIMEOUT 15
-		    )
+        if ("${ENGINE}" STREQUAL "")  # reject a missing engine name before any target is created
+            message(FATAL_ERROR "engine cannot be empty, ever!")
+        endif()
+        add_executable(${testName} ${testSource} ${ARGN})
+        target_compile_definitions(${testName} PUBLIC LPF_CORE_IMPL_ID=${ENGINE})
+        target_compile_definitions(${testName} PUBLIC LPF_CORE_MPI_USES_${ENGINE})
+        if (debug)
+            target_include_directories( ${testName} BEFORE PRIVATE ${CMAKE_SOURCE_DIR}/include/debug )
+            target_link_libraries(${testName} lpf_debug lpf_hl_debug GTest::gtest GTest::gtest_main)
+        else(debug)
+            target_link_libraries(${testName} GTest::gtest GTest::gtest_main)
+        endif(debug)
+
+
+        # Extract test-specific information from comments of tests (contents quoted: no list splitting, empty files allowed)
+        file(READ "${testSource}" fileContents)
+        string(REGEX MATCH "Exit code: ([0-9]+)" _ "${fileContents}")
+        set(retCode ${CMAKE_MATCH_1})
+        string(REGEX MATCH "pre P >= ([0-9]+)"  _ "${fileContents}")
+        set(minProcs ${CMAKE_MATCH_1})
+        string(REGEX MATCH "pre P <= ([0-9]+)" _ "${fileContents}")
+        set(maxProcs ${CMAKE_MATCH_1})
+        string(REGEX MATCH "-probe ([0-9]+\\.[0-9]+)" _ "${fileContents}")
+        set(lpfProbeSecs ${CMAKE_MATCH_1})
+
+        target_link_exe_with_core(${testName} ${ENGINE})
+
+        # The "\pre P <= max" comment in a test indicates the desired maximum
+        # number of LPF processes. If the test does not define a desired
+        # maximum number of LPF processes, it will be set to 5.
+        #
+        # The "\pre P >= min" comment in a test indicates the desired minimum
+        # number of LPF processes. If the test does not define a desired
+        # minimum number of LPF processes, it will be set to 1.
+        #
+        # Let 'processorCount' be the number of processors detected on the system.
+        # If this number is smaller than the desired minimum and/or maximum number
+        # of processes, it overrides these.
+        #
+        # Most tests only define a minimum number of desired processes, such as
+        # "\pre P >= 1". In those cases, the test will execute for the range 1,..,5
+        # (inclusive).
+
+        if ("${minProcs}" STREQUAL "")
+            set(minProcs "1")
+        endif()
+        if ("${maxProcs}" STREQUAL "")
+            set(maxProcs "5")
+        endif()
+        # cap min with processorCount, unless detection failed (ProcessorCount yields 0)
+        if ("${processorCount}" GREATER "0" AND "${minProcs}" GREATER "${processorCount}")
+            set(minProcs ${processorCount})
+        endif()
+        # cap max with processorCount, unless detection failed (ProcessorCount yields 0)
+        if ("${processorCount}" GREATER "0" AND "${maxProcs}" GREATER "${processorCount}")
+            set(maxProcs ${processorCount})
+        endif()
+        if ("${lpfProbeSecs}" STREQUAL "")
+            set(lpfProbeSecs "0.0")
+        endif()
+        if ("${retCode}" STREQUAL "")
+            set(retCode "0")
+        endif()
 
+        # Most recent approach to Gtests, recommended!
+        set_property(TARGET ${testName} PROPERTY TEST_LAUNCHER ${MY_TEST_LAUNCHER};--engine;${ENGINE};--parallel_launcher;${CMAKE_BINARY_DIR}/lpfrun_build;--min_process_count;${minProcs};--max_process_count;${maxProcs};--lpf_probe_timer;${lpfProbeSecs};--expected_return_code;${retCode})
+        gtest_discover_tests(${testName}
+            TEST_PREFIX ${ENGINE}_
+            EXTRA_ARGS --gtest_output=xml:${test_output}/${ENGINE}_${testName}
+            DISCOVERY_MODE POST_BUILD
+            DISCOVERY_TIMEOUT 15
+        )
 
     endfunction(add_gtest)
 
@@ -436,10 +507,11 @@ else(LPF_ENABLE_TESTS)
 
 endif(LPF_ENABLE_TESTS)
 
+# Main LPF library includes and sources
 include_directories(include)
 include_directories(src/common)
-
 add_subdirectory(src)
+
 # Apps
 add_subdirectory(src/utils)
 
diff --git a/README b/README
index 626173c7..26b0300b 100644
--- a/README
+++ b/README
@@ -38,6 +38,10 @@ Optional MPI engine requires
 Optional for thread pinning by Pthreads and hybrid engines
  - hwloc > 1.11
 
+Optional tests require
+ - GNU C++ compiler (C++17 compatible),
+ - Python 3.
+
 Optional (see --enable-doc) documentation requires
  - doxygen > 1.5.6,
  - graphviz,
diff --git a/bootstrap.sh b/bootstrap.sh
index 60c1ec26..14628772 100755
--- a/bootstrap.sh
+++ b/bootstrap.sh
@@ -192,7 +192,7 @@ EOF
 
        --with-mpiexec=*)
             mpiexec="${arg#--with-mpiexec=}"
-            mpi_cmake_flags="${mpi_cmake_flags} -DMPIEXEC=$mpiexec"
+            mpi_cmake_flags="${mpi_cmake_flags} -DMPIEXEC=$mpiexec -DMPIEXEC_EXECUTABLE=$mpiexec"
             shift;
             ;;
 
@@ -288,8 +288,8 @@ ${CMAKE_EXE} -Wno-dev \
       -DLPF_HWLOC="${hwloc}" \
       $hwloc_found_flag \
       $mpi_cmake_flags \
-      "$extra_flags" \
-      "$perf_flags" \
+      ${extra_flags:+"$extra_flags"} \
+      ${perf_flags:+"$perf_flags"} \
       "$@" $srcdir \
      || { echo FAIL "Failed to configure LPF; Please check your chosen configuration"; exit 1; }
 
diff --git a/cmake/mpi.cmake b/cmake/mpi.cmake
index bd7ca9a5..f8d55851 100644
--- a/cmake/mpi.cmake
+++ b/cmake/mpi.cmake
@@ -170,8 +170,17 @@ try_run( IBVERBS_INIT_RUNS IBVERBS_INIT_COMPILES
 endif()
 
 set(ENABLE_IBVERBS FALSE)
-if (LIB_IBVERBS AND NOT IBVERBS_INIT_RUNS STREQUAL "FAILED_TO_RUN")
-  set(ENABLE_IBVERBS TRUE)
+if (LPF_ENABLE_TESTS)
+    # The Google Test integration requires that tests successfully compiled are
+    # also runnable
+    if (LIB_IBVERBS AND NOT IBVERBS_INIT_RUNS STREQUAL "FAILED_TO_RUN")
+        set(ENABLE_IBVERBS TRUE)
+    endif()
+else()
+    # Without the aforementioned Google Test requirement, we can safely build
+    # it and allow the user to deploy the built binaries on IB-enabled nodes.
+    if (LIB_IBVERBS)
+        set(ENABLE_IBVERBS TRUE)
+    endif()
 endif()
 
-
diff --git a/doc/lpf_core.cfg.in b/doc/lpf_core.cfg.in
index 0a8de71c..bfb940b2 100644
--- a/doc/lpf_core.cfg.in
+++ b/doc/lpf_core.cfg.in
@@ -742,8 +742,8 @@ INPUT                  = @PROJECT_SOURCE_DIR@/include/lpf/core.h \
                          @PROJECT_SOURCE_DIR@/include/bsp/bsp.h \
                          @PROJECT_SOURCE_DIR@/include/lpf/hybrid.h \
                          @PROJECT_SOURCE_DIR@/include/lpf/mpirpc-client.h \
-                         @PROJECT_SOURCE_DIR@/include/lpf/rpc-client.h
-
+                         @PROJECT_SOURCE_DIR@/include/lpf/rpc-client.h \
+                         @PROJECT_SOURCE_DIR@/include/lpf/abort.h
 
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
diff --git a/include/lpf/abort.h b/include/lpf/abort.h
new file mode 100644
index 00000000..383a6ab8
--- /dev/null
+++ b/include/lpf/abort.h
@@ -0,0 +1,152 @@
+
+/*
+ *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LPFLIB_ABORT_H
+#define LPFLIB_ABORT_H
+
+#include "lpf/static_dispatch.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** \addtogroup LPF_EXTENSIONS LPF API extensions
+ * @{
+ *
+ * \defgroup LPF_ABORT Functionality for aborting LPF applications
+ *
+ * If #LPF_HAS_ABORT has a nonzero value, then a call to #lpf_abort from any
+ * process in a distributed application, will abort the entire application.
+ *
+ * \note As with all LPF extensions, it is \em not mandatory for all LPF
+ *       implementations to support this one.
+ *
+ * If #LPF_HAS_ABORT has a zero value, then a call to #lpf_abort shall have no
+ * other effect than it returning #LPF_SUCCESS.
+ *
+ * Therefore,
+ *  - LPF implementations that cannot support an abort functionality may still
+ *    provide a valid, albeit trivial, implementation of this extension.
+ *  - LPF applications that aim to rely on #lpf_abort should first ensure that
+ *    #LPF_HAS_ABORT is nonzero.
+ *
+ * \warning Portable LPF applications should preferably not rely on #lpf_abort
+ *          at all. Although sometimes unavoidable, the recommendation is to
+ *          avoid the use of this extension as much as possible.
+ *
+ * \note One case where #lpf_abort is absolutely required is for \em testing an
+ *       LPF debug layer. Such a layer should detect erroneous usage, report it,
+ *       but then typically cannot continue execution. In this case, relying on
+ *       the standard abort or exit functionalities to terminate the process the
+ *       error was detected at, typically results in implementation-specific
+ *       (i.e., undefined) behaviour with regards to how the application at
+ *       large terminates. This means that a test-suite for such a debug layer
+ *       cannot reliably detect whether a distributed application has terminated
+ *       for the expected reasons. In this case, #lpf_abort provides a reliable
+ *       mechanism that such a test requires.
+ *
+ * @{
+ */
+
+/**
+ * Whether the active LPF engine supports aborting distributed applications.
+ *
+ * If the value of this field is zero (0), then a call to #lpf_abort will be a
+ * no-op and always return #LPF_SUCCESS.
+ */
+extern _LPFLIB_VAR const int LPF_HAS_ABORT ;
+
+/**
+ * A call to this function aborts the distributed application as soon as
+ * possible.
+ *
+ * \warning This function corresponds to a no-op if #LPF_HAS_ABORT equals zero.
+ *
+ * The below specification only applies when #LPF_HAS_ABORT contains a non-zero
+ * value; otherwise, a call to this function will have no other effect besides
+ * returning #LPF_SUCCESS.
+ *
+ * \note Rationale: the capability to abort relies on the software stack that
+ *       underlies LPF, and in aiming to be a minimal API, LPF does not wish to
+ *       force such capabilities onto the underlying software or system.
+ *
+ * \note Applications that rely on #lpf_abort therefore should first check if
+ *       the capability is supported.
+ *
+ * \note The recommended way to abort LPF applications that is fully supported
+ *       by the core specification alone (i.e., excluding this #lpf_abort
+ *       extension), is to simply exit the process that should be aborted.
+ *       Compliant LPF implementations will then quit sibling processes <em>at
+ *       latest</em> at a call to #lpf_sync that should handle communications
+ *       with the exited process. Sibling processes may also exit early without
+ *       involvement of LPF. In all cases, the parent call to #lpf_exec,
+ *       #lpf_hook, or #lpf_rehook should return with #LPF_ERR_FATAL.
+ *
+ * \warning Therefore, whenever possible, code implemented on top of LPF ideally
+ *          does not rely on #lpf_abort. Instead, error handling more reliably
+ *          could be implemented on top of the above-described default LPF
+ *          behaviour.
+ *
+ * The call to #lpf_abort differs from the stdlib <tt>abort</tt>; for example,
+ * implementations are not required to raise SIGABRT as part of a call to
+ * #lpf_abort. Instead, the requirements are that:
+ *  1. processes that call this function terminate during the call to
+ *     #lpf_abort;
+ *  2. all other processes associated with the distributed application terminate
+ *     at latest during a next call to #lpf_sync that should have handled
+ *     communications with any aborted process;
+ *  3. regardless of whether LPF aborted sibling processes, whether they exited
+ *     gracefully, or whether they also called #lpf_abort, the process(es) which
+ *     made the parent call to #lpf_exec, #lpf_hook, or #lpf_rehook should
+ *     either: a) terminate also, at latest when all (other) associated
+ *     processes have terminated, (exclusive-)or b) return #LPF_ERR_FATAL.
+ *     Which behaviour (a or b) will be followed is up to the implementation,
+ *     and portable applications should account for both possibilities.
+ *
+ * \note In the above, \em other is in parentheses since the processes
+ *       executing the application may be fully disjoint from the process that
+ *       spawned the application. In this case it is natural to elect that the
+ *       spawning process returns #LPF_ERR_FATAL, though under this
+ *       specification also that process may be aborted before the spawning
+ *       call returns.
+ *
+ * \note If one of the associated processes deadlocks (e.g. due to executing
+ *       <tt>while(1){}</tt>), it shall remain undefined when the entire
+ *       application aborts. Implementations shall make a best effort to do this
+ *       as early as possible.
+ *
+ * \note Though implied by the above, we note explicitly that #lpf_abort is
+ *       \em not a collective function; a single process calling #lpf_abort can
+ *       terminate all associated processes.
+ *
+ * @returns #LPF_SUCCESS If and only if #LPF_HAS_ABORT equals zero.
+ *
+ * If #LPF_HAS_ABORT is nonzero, then this function shall not return.
+ */
+extern _LPFLIB_API 
+lpf_err_t lpf_abort(lpf_t ctx);
+
+/**
+ * @}
+ * @}
+ */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/lpf/core.h b/include/lpf/core.h
index c9a7f921..0d4434e2 100644
--- a/include/lpf/core.h
+++ b/include/lpf/core.h
@@ -126,6 +126,8 @@
  *  - \ref LPF_EXTENSIONS
  *     - \ref LPF_PTHREAD
  *     - \ref LPF_MPI
+ *     - \ref LPF_HYBRID
+ *     - \ref LPF_ABORT
  *  - \ref LPF_HL
  *     - \ref LPF_BSPLIB
  *     - \ref LPF_COLLECTIVES
@@ -988,7 +990,7 @@ typedef struct lpf_machine {
      *                         both bounds are inclusive.
      * \param[in] min_msg_size A byte size value that is larger or equal to 0.
      * \param[in] attr         A #lpf_sync_attr_t value. When in doubt, always
-     *                         use #LPF_SYNC_DEFAULT
+     *                         use #LPF_SYNC_DEFAULT.
      *
      * \returns The guaranteed value for the message gap given an LPF SPMD
      *          section using \a p processes, for a superstep in which a user
@@ -2425,6 +2427,15 @@ lpf_err_t lpf_get_rcvd_msg_count( lpf_t ctx, size_t *rcvd_msgs);
 extern _LPFLIB_API
 lpf_err_t lpf_get_sent_msg_count_per_slot( lpf_t ctx, size_t *sent_msgs, lpf_memslot_t slot);
 
+/**
+ * This function returns in \a sent_msgs the total sent message count.
+ * It is only implemented for the zero backend (on InfiniBand).
+ * \param[in] ctx The LPF context
+ * \param[out] sent_msgs Sent message count
+ */
+extern _LPFLIB_API
+lpf_err_t lpf_get_sent_msg_count( lpf_t ctx, size_t *sent_msgs);
+
 /**
  * This function blocks until all the scheduled messages via
  * ibv_post_send are completed (via ibv_poll_cq). This includes
diff --git a/include/lpf/hybrid.h b/include/lpf/hybrid.h
index 00845f08..4c324adf 100644
--- a/include/lpf/hybrid.h
+++ b/include/lpf/hybrid.h
@@ -28,7 +28,7 @@ extern "C" {
  *
  * @{
  *
- * \defgroup LPF_HYBRID Specific to Hybrid implementation
+ * \defgroup LPF_HYBRID Specific to the hybrid engine
  *
  * @{
  */
diff --git a/include/lpf/noc.h b/include/lpf/noc.h
new file mode 100644
index 00000000..8949f6e7
--- /dev/null
+++ b/include/lpf/noc.h
@@ -0,0 +1,472 @@
+
+/*
+ *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LPFLIB_NOC_H
+#define LPFLIB_NOC_H
+
+// import size_t data type for the implementation
+#ifndef DOXYGEN
+
+#ifdef __cplusplus
+#include <cstddef>
+#else
+#include <stddef.h>
+#endif
+
+#include <lpf/core.h>
+
+#endif // DOXYGEN
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** \addtogroup LPF_EXTENSIONS LPF API extensions
+ *
+ * @{
+ *
+ * \defgroup LPF_NOC Extensions to LPF where it need not maintain consistency.
+ *
+ * This extension specifies facilities for (de-)registering memory slots,
+ * registering RDMA requests, and fencing RDMA requests. These extensions are,
+ * as far as possible, fully compatible with the core LPF definitions. These
+ * include LPF contexts (#lpf_t), processor count types (#lpf_pid_t), memory
+ * slot types (#lpf_memslot_t), and message attributes (#lpf_msg_attr_t).
+ *
+ * In this extension, LPF does not maintain consistency amongst processes that
+ * (de-)register memory slots while RDMA communication may occur. Maintaining
+ * the required consistency instead becomes the purview of the user. This
+ * extension specifies exactly what consistency properties the user must
+ * guarantee.
+ *
+ * \warning If LPF is considered a tool for the so-called <em>hero
+ *          programmer</em>, then please note that this variant is even harder
+ *          to program with.
+ *
+ * \note At present, no debug layer exists for this extension. It is unclear if
+ *       such a debug layer is even possible (precisely because LPF in this
+ *       extension does not maintain consistency, there is no way a debug layer
+ *       could enforce it).
+ *
+ * @{
+ */
+
+
+/**
+ * The version of this no-conflict LPF specification. All implementations shall
+ * define this macro. The format is YYYYNN, where YYYY is the year the
+ * specification was released, and NN the number of the specifications released
+ * before this one in the same year.
+ */
+#define _LPF_NOC_VERSION 202400L
+
+/**
+ * Resizes the memory register for non-coherent RDMA.
+ *
+ * After a successful call to this function, the local process has enough
+ * resources to register \a max_regs memory regions in a non-coherent way.
+ *
+ * Each registration via lpf_noc_register() counts as one. Such registrations
+ * continue to take up capacity in the register until they are released via a
+ * call to lpf_noc_deregister(), which lowers the count of used memory
+ * registrations by one.
+ *
+ * There are no runtime out-of-bounds checks prescribed for lpf_noc_register()--
+ * this would also be too costly as error checking would require communication.
+ *
+ * If memory allocation were successful, the return value is #LPF_SUCCESS and
+ * the local process may assume the new buffer size \a max_regs.
+ *
+ * In the case of insufficient local memory the return value will be
+ * #LPF_ERR_OUT_OF_MEMORY. In that case, it is as if the call never happened and
+ * the user may retry the call locally after freeing up unused resources. Should
+ * retrying not lead to a successful call, the programmer may opt to broadcast
+ * the error (using existing slots) or to give up by returning from the spmd
+ * section.
+ *
+ * \note The current maximum cannot be retrieved from the runtime. Instead, the
+ *       programmer must track this information herself. To provide
+ *       encapsulation, see lpf_rehook().
+ *
+ * \note When the given memory register capacity is smaller than the current
+ *       capacity, the runtime is allowed but not required to release the
+ *       allocated memory. Such a call shall always be successful and return
+ *       #LPF_SUCCESS.
+ *
+ * \note This means that an implementation that allows shrinking the given
+ *       capacity must also ensure the old buffer remains intact in case there
+ *       is not enough memory to allocate a smaller one.
+ *
+ * \note The last invocation of lpf_noc_resize_memory_register() determines the
+ *       maximum number of memory registrations using lpf_noc_register() that
+ *       can be maintained concurrently.
+ *
+ * \par Thread safety
+ * This function is safe to be called from different LPF processes only. Any
+ * further thread safety may be guaranteed by the implementation, but is not
+ * specified. Similar conditions hold for all LPF primitives that take an
+ * argument of type #lpf_t; see #lpf_t for more information.
+ *
+ * \param[in,out]   ctx The runtime state as provided by lpf_exec().
+ * \param[in]  max_regs The requested maximum number of memory regions that can
+ *                      be registered. This value must be the same on all
+ *                      processes.
+ *
+ * \returns #LPF_SUCCESS
+ *            When this process successfully acquires the resources.
+ *
+ * \returns #LPF_ERR_OUT_OF_MEMORY
+ *            When there was not enough memory left on the heap. In this case
+ *            the effect is the same as when this call did not occur at all.
+ *
+ * \par BSP costs
+ * None
+ *
+ * See also \ref BSPCOSTS.
+ *
+ * \par Runtime costs
+ * \f$ \Theta( \mathit{max\_regs} ) \f$.
+ */
+extern _LPFLIB_API
+lpf_err_t lpf_noc_resize_memory_register( lpf_t ctx, size_t max_regs );
+
+/**
+ * Registers a local memory area, preparing its use for inter-process
+ * communication.
+ *
+ * The registration process is necessary to enable Remote Direct Memory Access
+ * (RDMA) primitives, such as lpf_get() and lpf_put().
+ *
+ * This is \em not a collective function. For #lpf_get and #lpf_put, the memory
+ * slot returned by this function is equivalent to a memory slot returned by
+ * #lpf_register_local; the \a memslot returned by a successful call to this
+ * function (hence) is immediately valid. A successful call (hence) immediately
+ * consumes one memory slot capacity; see also #lpf_resize_memory_register on
+ * how to ensure sufficient capacity.
+ *
+ * Different from a memory slot returned by #lpf_register_local, a memory slot
+ * returned by a successful call to this function may serve as either a local
+ * or remote memory slot for #lpf_noc_put and #lpf_noc_get.
+ *
+ * Use of the returned memory slot to indicate a remote memory area may only
+ * occur by copying the returned memory slot to another LPF process. This may
+ * be done using the standard #lpf_put and #lpf_get methods or by using
+ * auxiliary communication mechanisms. The memory slot thus communicated only
+ * refers to a valid memory area on the process it originated from; any other
+ * use leads to undefined behaviour.
+ *
+ * \note Note that the ability to copy memory slots to act as identifiers of
+ *       remote areas exploits the LPF core specification that instances of
+ *       the #lpf_memslot_t type are, indeed, byte-copyable.
+ *
+ * A memory slot returned by a successful call to this function may be
+ * destroyed via a call to the standard #lpf_deregister. The deregistration
+ * takes effect immediately. No communication using the deregistered slot
+ * should occur during that superstep, or otherwise undefined behaviour occurs.
+ *
+ * Only the process that created the returned memory slot can destroy it; other
+ * LPF processes than the one which created it that attempt to destroy the
+ * returned memory slot, invoke undefined behaviour.
+ *
+ * Other than the above specified differences, the arguments to this function
+ * are the same as for #lpf_register_local:
+ *
+ * \param[in,out] ctx     The runtime state as provided by lpf_exec().
+ * \param[in]     pointer The pointer to the memory area to register.
+ * \param[in]     size    The size of the memory area to register in bytes.
+ * \param[out]    memslot Where to store the memory slot identifier.
+ *
+ * \note Registering a slot with zero \a size is valid. The resulting memory
+ *       slot cannot be written to nor read from by remote LPF processes.
+ *
+ * \note In particular, passing \c NULL as \a pointer and \c 0 for \a size is
+ *       valid.
+ *
+ * \returns #LPF_SUCCESS
+ *            Successfully registered the memory region and successfully
+ *            assigned a memory slot identifier.
+ *
+ * \note One registration consumes one memory slot from the pool of locally
+ *       available memory slots, which must have been preallocated by
+ *       lpf_resize_memory_register() or recycled by lpf_deregister(). Always
+ *       use lpf_resize_memory_register() at the start of the SPMD function
+ *       that is executed by lpf_exec(), since lpf_exec() itself does not
+ *       preallocate slots.
+ *
+ * \note It is illegal to request more memory slots than have previously been
+ *       registered with lpf_resize_memory_register(). There is no runtime
+ *       check for this error, because a safe way out cannot be guaranteed
+ *       without significant parallel error checking overhead.
+ *
+ * \par Thread safety
+ * This function is safe to be called from different LPF processes only. Any
+ * further thread safety may be guaranteed by the implementation, but is not
+ * specified. Similar conditions hold for all LPF primitives that take an
+ * argument of type #lpf_t; see #lpf_t for more information.
+ *
+ * \par BSP costs
+ *
+ * None.
+ *
+ * \par Runtime costs
+ *
+ * \f$ \mathcal{O}( \texttt{size} ) \f$.
+ *
+ * \note This asymptotic bound may be attained for implementations that require
+ *       linear-time processing on the registered memory area, such as to effect
+ *       memory pinning. If this is not required, a good implementation will
+ *       require only \f$ \Theta(1) \f$ time.
+ */
+extern _LPFLIB_API
+lpf_err_t lpf_noc_register(
+    lpf_t ctx,
+    void * pointer,
+    size_t size,
+    lpf_memslot_t * memslot
+);
+
+/**
+ * Deregisters a memory area previously registered using lpf_noc_register().
+ *
+ * After a successful deregistration, the slot is returned to the pool of free
+ * memory slots. The total number of memory slots may be set via a call to
+ * lpf_noc_resize_memory_register().
+ *
+ * Deregistration takes effect immediately. A call to this function is not
+ * collective, and the order of deregistration does not need to match the order
+ * of registration. Any local or remote communication using the given \a memslot
+ * in the current superstep invokes undefined behaviour.
+ *
+ * \par Thread safety
+ * This function is safe to be called from different LPF processes only. Any
+ * further thread safety may be guaranteed by the implementation, but is not
+ * specified. Similar conditions hold for all LPF primitives that take an
+ * argument of type #lpf_t; see #lpf_t for more information.
+ *
+ * \param[in,out] ctx The runtime state as provided by lpf_exec().
+ * \param[in] memslot The memory slot identifier to de-register.
+ *
+ * \returns #LPF_SUCCESS
+ *            Successfully deregistered the memory region.
+ *
+ * \par BSP costs
+ * None.
+ *
+ * \par Runtime costs
+ * \f$ \mathcal{O}(n) \f$, where \f$ n \f$ is the size of the memory region
+ * corresponding to \a memslot.
+ */
+extern _LPFLIB_API
+lpf_err_t lpf_noc_deregister(
+    lpf_t ctx,
+    lpf_memslot_t memslot
+);
+
+/**
+ * Copies contents of local memory into the memory of remote processes.
+ *
+ * This operation is guaranteed to be completed after a call to the next
+ * lpf_sync() exits.
+ *
+ * Until that time it occupies one entry in the operations queue.
+ *
+ * Concurrent reads or writes from or to the same memory area are
+ * allowed in the same way they are for the core primitive #lpf_put.
+ *
+ * This primitive differs from #lpf_put in that the \a dst_slot may be the
+ * result of a successful call to #lpf_noc_register, while \a src_slot \em must
+ * be the result of such a successful call. In both cases, the slot need
+ * \em not have been registered before the last call to #lpf_sync.
+ *
+ * \par Thread safety
+ * This function is safe to be called from different LPF processes only. Any
+ * further thread safety may be guaranteed by the implementation, but is not
+ * specified. Similar conditions hold for all LPF primitives that take an
+ * argument of type #lpf_t; see #lpf_t for more information.
+ *
+ * \param[in,out] ctx    The runtime state as provided by lpf_exec()
+ * \param[in] src_slot   The memory slot of the local source memory area
+ *                       registered using lpf_register_local(),
+ *                       lpf_register_global(), or lpf_noc_register()
+ * \param[in] src_offset The offset of reading out the source memory area,
+ *                       w.r.t. the base location of the registered area
+ *                       expressed in bytes.
+ * \param[in] dst_pid    The process ID of the destination process.
+ * \param[in] dst_slot   The memory slot of the destination memory area at
+ *                       \a dst_pid, registered using lpf_register_global() or
+ *                       lpf_noc_register().
+ * \param[in] dst_offset The offset of writing to the destination memory area
+ *                       w.r.t. the base location of the registered area
+ *                       expressed in bytes.
+ * \param[in] size       The number of bytes to copy from the source memory area
+ *                       to the destination memory area.
+ * \param[in] attr
+ *            \parblock
+ *            In case an \a attr not equal to #LPF_MSG_DEFAULT is provided, the
+ *            message created by this function may have modified semantics
+ *            that may be used to extend this API. Examples include:
+ *
+ *              -# delaying the superstep deadline of delivery, and/or
+ *              -# DRMA with message combining semantics.
+ *
+ *            These attributes are stored after a call to this function has
+ *            completed and may be modified immediately after without affecting
+ *            any messages already scheduled.
+ *            \endparblock
+ *
+ * \note See #lpf_put for notes regarding #lpf_msg_attr_t.
+ *
+ * \returns #LPF_SUCCESS
+ *            When the communication request was recorded successfully.
+ *
+ * \par BSP costs
+ * This function will increase
+ *     \f$ t_{c}^{(s)} \f$
+ * and
+ *     \f$ r_{c}^{(\mathit{pid})} \f$
+ * by \a size, where c is the current superstep number and s is this process ID
+ * (as provided by #lpf_exec). See \ref BSPCOSTS on how this affects real-time
+ * communication costs.
+ *
+ * \par Runtime costs
+ * See \ref BSPCOSTS.
+ */
+extern _LPFLIB_API
+lpf_err_t lpf_noc_put(
+    lpf_t ctx,
+    lpf_memslot_t src_slot,
+    size_t src_offset,
+    lpf_pid_t dst_pid,
+    lpf_memslot_t dst_slot,
+    size_t dst_offset,
+    size_t size,
+    lpf_msg_attr_t attr
+);
+
+/**
+ * Copies contents from remote memory to local memory.
+ *
+ * This operation completes after one call to lpf_sync().
+ *
+ * Until that time it occupies one entry in the operations queue.
+ *
+ * Concurrent reads or writes from or to the same memory area are allowed in the
+ * same way it is for #lpf_get.
+ *
+ * This primitive differs from #lpf_get in that the \a src_slot may be the
+ * result of a successful call to #lpf_noc_register, while \a dst_slot \em must
+ * be the result of such a successful call. In both cases, the slot need
+ * \em not have been registered before the last call to #lpf_sync.
+ *
+ * \par Thread safety
+ * This function is safe to be called from different LPF processes only. Any
+ * further thread safety may be guaranteed by the implementation, but is not
+ * specified. Similar conditions hold for all LPF primitives that take an
+ * argument of type #lpf_t; see #lpf_t for more information.
+ *
+ * \param[in,out] ctx    The runtime state as provided by lpf_exec().
+ * \param[in] src_pid    The process ID of the source process.
+ * \param[in] src_slot   The memory slot of the source memory area at \a src_pid, as
+ *                       globally registered with lpf_register_global() or
+ *                       lpf_noc_register().
+ * \param[in] src_offset The offset of reading out the source memory area,
+ *                       w.r.t. the base location of the registered area
+ *                       expressed in bytes.
+ * \param[in] dst_slot   The memory slot of the local destination memory area
+ *                       registered using lpf_register_local(),
+ *                       lpf_register_global(), or lpf_noc_register().
+ * \param[in] dst_offset The offset of writing to the destination memory area
+ *                       w.r.t. the base location of the registered area
+ *                       expressed in bytes.
+ * \param[in] size       The number of bytes to copy from the source
+ *                       remote memory location.
+ * \param[in] attr
+ *            \parblock
+ *            In case an \a attr not equal to #LPF_MSG_DEFAULT is provided, the
+ *            message created by this function may have modified semantics
+ *            that may be used to extend this API. Examples include:
+ *
+ *              -# delaying the superstep deadline of delivery, and/or
+ *              -# DRMA with message combining semantics.
+ *
+ *            These attributes are stored after a call to this function has
+ *            completed and may be modified immediately after without affecting
+ *            any messages already scheduled.
+ *            \endparblock
+ *
+ * \note See #lpf_get for notes on the use of #lpf_msg_attr_t.
+ *
+ * \returns #LPF_SUCCESS
+ *            When the communication request was recorded successfully.
+ *
+ * \par BSP costs
+ * This function will increase
+ *   \f$ r_{c}^{(s)} \f$
+ * and
+ *   \f$ t_{c}^{(\mathit{pid})} \f$
+ * by \a size, where c is the current superstep number and s is this process ID
+ * (as provided via lpf_exec()). See \ref BSPCOSTS on how this affects real-time
+ * communication costs.
+ *
+ * \par Runtime costs
+ * See \ref BSPCOSTS.
+ */
+extern _LPFLIB_API
+lpf_err_t lpf_noc_get(
+    lpf_t ctx,
+    lpf_pid_t src_pid,
+    lpf_memslot_t src_slot,
+    size_t src_offset,
+    lpf_memslot_t dst_slot,
+    size_t dst_offset,
+    size_t size,
+    lpf_msg_attr_t attr
+);
+
+extern _LPFLIB_API
+lpf_err_t lpf_noc_serialize_slot(
+        lpf_t ctx,
+        lpf_memslot_t slot,
+        char  ** buff,
+        size_t * buff_size
+);
+/**
+ * lpf_noc_deserialize_slot may only be called on a slot
+ * already registered via #lpf_noc_register.
+ * This call sets the memory registration attributes from
+ * the byte array \a buff, whose byte size is the \a buff_size
+ * reported by the call that created it. This array must have
+ * been created via a call to #lpf_noc_serialize_slot.
+ */
+extern _LPFLIB_API
+    lpf_err_t lpf_noc_deserialize_slot(
+            lpf_t ctx,
+            char * buff,
+            lpf_memslot_t slot
+);
+/**
+ * @}
+ *
+ * @}
+ */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/lpf/pthread.h b/include/lpf/pthread.h
index ba68f3f8..454eaf0e 100644
--- a/include/lpf/pthread.h
+++ b/include/lpf/pthread.h
@@ -28,7 +28,7 @@ extern "C" {
  *
  * @{
  *
- * \defgroup LPF_PTHREAD Specific to Pthreads 
+ * \defgroup LPF_PTHREAD Specific to the Pthreads engine
  *
  * @{
  */
diff --git a/include/lpf/static_dispatch.h b/include/lpf/static_dispatch.h
index 8816f9e9..71d65526 100644
--- a/include/lpf/static_dispatch.h
+++ b/include/lpf/static_dispatch.h
@@ -45,6 +45,7 @@
 #undef lpf_register_local
 #undef lpf_get_rcvd_msg_count
 #undef lpf_get_rcvd_msg_count_per_slot
+#undef lpf_get_sent_msg_count
 #undef lpf_get_sent_msg_count_per_slot
 #undef lpf_register_global
 #undef lpf_flush_sent
@@ -85,6 +86,7 @@
 #undef LPF_NONE
 #undef LPF_INIT_NONE
 #undef LPF_NO_ARGS
+#undef LPF_HAS_ABORT
 
 #ifdef LPF_FUNC
 
@@ -96,6 +98,7 @@
 #define lpf_register_local  LPF_FUNC(register_local)
 #define lpf_get_rcvd_msg_count LPF_FUNC(get_rcvd_msg_count)
 #define lpf_get_rcvd_msg_count_per_slot LPF_FUNC(get_rcvd_msg_count_per_slot)
+#define lpf_get_sent_msg_count LPF_FUNC(get_sent_msg_count)
 #define lpf_get_sent_msg_count_per_slot LPF_FUNC(get_sent_msg_count_per_slot)
 #define lpf_flush_sent LPF_FUNC(flush_sent)
 #define lpf_flush_received LPF_FUNC(flush_received)
@@ -136,6 +139,7 @@
 #define LPF_NONE              LPF_CONST(NONE)
 #define LPF_INIT_NONE         LPF_CONST(INIT_NONE)
 #define LPF_NO_ARGS           LPF_CONST(NO_ARGS)
+#define LPF_HAS_ABORT         LPF_CONST(HAS_ABORT)
 
 #endif
 
diff --git a/lpfcc.in b/lpfcc.in
index b1a89659..b58da83a 100644
--- a/lpfcc.in
+++ b/lpfcc.in
@@ -187,6 +187,32 @@ do
             shift
             ;;
 
+       # The below two special cases are to ensure good integration with CMake.
+       # Note that the arguments that follow -MT and -MQ are object files, which
+       # otherwise would be appended to the objects list. In case of manual
+       # usage of lpfcc, therefore, the use of these flags should come after --,
+       # however, it is unclear how to pass that to CMake and we choose this
+       # solution instead (nor would that be desired-- ideally, lpfcc can act as
+       # a "regular" CC from the CMake perspective)
+
+       -MT)
+            other_args[$arg_number]="-MT"
+            arg_number=$((arg_number + 1))
+            shift
+            other_args[$arg_number]="$1"
+            arg_number=$((arg_number + 1))
+            shift
+            ;;
+
+       -MQ)
+            other_args[$arg_number]="-MQ"
+            arg_number=$((arg_number + 1))
+            shift
+            other_args[$arg_number]="$1"
+            arg_number=$((arg_number + 1))
+            shift
+            ;;
+
        *)   case $state in
 
                 engine)
diff --git a/post-install/cmake-module-test/src/CMakeLists.txt b/post-install/cmake-module-test/src/CMakeLists.txt
index fe1ae2a8..eeef8252 100644
--- a/post-install/cmake-module-test/src/CMakeLists.txt
+++ b/post-install/cmake-module-test/src/CMakeLists.txt
@@ -15,7 +15,7 @@
 # limitations under the License.
 #
 
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.10)
 project(findlpf_test)
 
 find_package(lpf REQUIRED CONFIG)
diff --git a/post-install/func_lpf_hook_subset.mpimsg.cpp b/post-install/func_lpf_hook_subset.mpimsg.cpp
new file mode 100644
index 00000000..6b7d3a5c
--- /dev/null
+++ b/post-install/func_lpf_hook_subset.mpimsg.cpp
@@ -0,0 +1,67 @@
+
+/*
+ *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <lpf/core.h>
+#include <lpf/mpi.h>
+
+#include <mpi.h>
+
+
+const int LPF_MPI_AUTO_INITIALIZE=0;
+
+void test_spmd( lpf_t ctx, lpf_pid_t pid, lpf_pid_t nprocs, lpf_args_t args)
+{
+    (void) ctx;
+    (void) pid;
+    (void) nprocs;
+    (void) args;
+    return;
+}
+
+void subset_func(MPI_Comm comm)
+{
+    MPI_Barrier(comm);
+
+    // Abort instead of silently continuing when LPF setup or the hook fails
+    lpf_init_t init;
+    if (lpf_mpi_initialize_with_mpicomm(comm, &init) != LPF_SUCCESS) MPI_Abort(comm, 1);
+    if (lpf_hook(init, test_spmd, LPF_NO_ARGS) != LPF_SUCCESS) MPI_Abort(comm, 2);
+}
+
+int main(int argc, char **argv)
+{
+    MPI_Init(&argc, &argv);
+
+    int s;
+    MPI_Comm_rank(MPI_COMM_WORLD, &s);
+
+    int subset = s < 2; // Processes are divided into 2 subsets {0,1} and {2,...,p-1}
+
+    MPI_Comm subset_comm;
+    MPI_Comm_split(MPI_COMM_WORLD, subset, s, &subset_comm);
+
+    // only the first subset enters that function
+    if (subset)
+    {
+        subset_func(subset_comm);
+    }
+    MPI_Barrier(MPI_COMM_WORLD); // Paranoid barrier
+    MPI_Comm_free(&subset_comm); // release the communicator created by the split
+
+    MPI_Finalize();
+    return 0;
+}
diff --git a/post-install/post-install-test.cmake.in b/post-install/post-install-test.cmake.in
index 65c9ef9f..75c5de13 100644
--- a/post-install/post-install-test.cmake.in
+++ b/post-install/post-install-test.cmake.in
@@ -268,12 +268,12 @@ if (MPI_FOUND)
     endif()
 
 
-    message("Compiling a simple LPF program with mpimsg engine")
+    message("Compiling a simple MPI LPF program with mpimsg engine")
     # Compile this to check whether mpi.h can be found
     execute_process(
-           COMMAND @bindir@/lpfcc -engine mpimsg  -I@common@
-                   @testdir@/func_lpf_hook_subset.mpimsg.c
-                -o lpfhook_subset_mpimsg_cc
+           COMMAND @bindir@/lpfcxx -engine mpimsg  -I@common@
+                   @srcdir@/func_lpf_hook_subset.mpimsg.cpp -c
+                -o lpfhook_subset_mpimsg_cc.o
            WORKING_DIRECTORY @builddir@
            RESULT_VARIABLE status
            )
@@ -353,6 +353,9 @@ endif()
 ######   CMake integration using generated CMake module file ############
 
 foreach(engine @ENGINES@)
+    if ("${engine}" STREQUAL "zero") 
+        continue()
+    endif()
     message("Testing generated CMake module files for engine ${engine}")
 
     set(test_dir @builddir@/cmake-module-test-${engine})
diff --git a/post-install/test-lpf-nprocs.c b/post-install/test-lpf-nprocs.c
index cf274b3f..554b5775 100644
--- a/post-install/test-lpf-nprocs.c
+++ b/post-install/test-lpf-nprocs.c
@@ -53,6 +53,8 @@ void spmd( lpf_t lpf, lpf_pid_t pid, lpf_pid_t nprocs, lpf_args_t args )
     lpf_memslot_t mem_slot = LPF_INVALID_MEMSLOT;
     lpf_register_global( lpf, mem, nprocs, &mem_slot );
 
+    lpf_sync(lpf, LPF_SYNC_DEFAULT);
+
     if (pid != 0) 
         lpf_get( lpf, 0, params_slot, 0, params_slot, 0, sizeof(params), LPF_MSG_DEFAULT );
 
diff --git a/src/MPI/CMakeLists.txt b/src/MPI/CMakeLists.txt
index 9633d1d5..636b243c 100644
--- a/src/MPI/CMakeLists.txt
+++ b/src/MPI/CMakeLists.txt
@@ -35,27 +35,25 @@ if (MPI_FOUND)
     set_target_properties( lpf_proxy_dummy PROPERTIES LINK_FLAGS
             "${MPI_C_LINK_FLAGS}" )
     target_include_directories( lpf_proxy_dummy PRIVATE ${MPI_C_INCLUDE_PATH})
-    target_compile_flags(lpf_proxy_dummy PRIVATE  ${MPI_C_COMPILE_FLAGS})
     install( TARGETS lpf_proxy_dummy RUNTIME DESTINATION ${INSTALL_HELPERS} )
-        
 
     set(LPF_IMPL_CONFIG ${LPFLIB_CONFIG_NAME})
 
 
-# univ_ stands for universal interface => lpf_exec, lpf_put, etc...
-# spec_ stands for specific interface => lpf_mpimsg_release_exec, lpf_mpimsg_release_put, etc...
+    # univ_ stands for universal interface => lpf_exec, lpf_put, etc...
+    # spec_ stands for specific interface => lpf_mpimsg_release_exec, lpf_mpimsg_release_put, etc...
     foreach (iface  "univ_" "spec_" )
-    foreach (LPF_IMPL_ID ${MPI_ENGINES})
-        set(libname "lpf_core_${iface}${LPF_IMPL_ID}_${LPF_IMPL_CONFIG}")
-        set(comlib  "lpf_common_${LPFLIB_CONFIG_NAME}")
-        
-        set(ibverbs_sources)
-        if (LPF_IMPL_ID STREQUAL ibverbs)
+        foreach (LPF_IMPL_ID ${MPI_ENGINES})
+            set(libname "lpf_core_${iface}${LPF_IMPL_ID}_${LPF_IMPL_CONFIG}")
+            set(comlib  "lpf_common_${LPFLIB_CONFIG_NAME}")
+
+            set(ibverbs_sources)
+            if (LPF_IMPL_ID STREQUAL ibverbs)
             set(ibverbs_sources ibverbs.cpp)
         endif()
 
         if (LPF_IMPL_ID STREQUAL zero)
-            set(ibverbs_sources ibverbsZero.cpp)
+            set(ibverbs_sources ibverbsZero.cpp ibverbsNoc.cpp)
         endif()
 
         add_library(raw_${libname} OBJECT
@@ -71,15 +69,13 @@ if (MPI_FOUND)
                 spall2all.c
                 messagesort.cpp
                 spall2all.cpp
-		init.cpp
+                init.cpp
                 ${ibverbs_sources}
             )
 
 
         target_compile_flags(raw_${libname} 
-                PUBLIC ${MPI_C_COMPILE_FLAGS} 
-                INTERFACE "-fPIC"
-                )
+                INTERFACE "-fPIC")
 
         target_compile_definitions(raw_${libname} 
                 PRIVATE "LPF_CORE_MPI_USES_${LPF_IMPL_ID}=1"
@@ -107,9 +103,7 @@ if (MPI_FOUND)
                                                     MACOSX_RPATH TRUE)
 
         target_compile_flags(${libname} 
-                PUBLIC ${MPI_C_COMPILE_FLAGS} 
-                INTERFACE "-fPIC"
-        )
+                INTERFACE "-fPIC")
 
         if (iface STREQUAL "spec_")
             target_compile_definitions(${libname} 
@@ -175,31 +169,43 @@ if (MPI_FOUND)
             ${CMAKE_CURRENT_SOURCE_DIR}/dynamichook.cpp 
             ${CMAKE_CURRENT_SOURCE_DIR}/mpilib.cpp)
         
-                configure_file( dynamichook.t.sh.in dynamichook.t.sh @ONLY)
-                set( dynamic_hook_t_sh "${CMAKE_CURRENT_BINARY_DIR}/dynamichook.t.sh")
-                add_test(NAME dynamichook_1proc
-                         COMMAND bash ${dynamic_hook_t_sh} 1)
-                set_tests_properties( dynamichook_1proc PROPERTIES TIMEOUT 30 )
-                add_test(NAME dynamichook_2proc
-                         COMMAND bash ${dynamic_hook_t_sh} 2)
-                set_tests_properties( dynamichook_2proc PROPERTIES TIMEOUT 30 )
-                add_test(NAME dynamichook_3proc
-                         COMMAND bash ${dynamic_hook_t_sh} 3)
-                set_tests_properties( dynamichook_3proc PROPERTIES TIMEOUT 30 )
-                add_test(NAME dynamichook_10proc
-                         COMMAND bash ${dynamic_hook_t_sh} 10)
-                set_tests_properties( dynamichook_10proc PROPERTIES TIMEOUT 30 )
+            configure_file( dynamichook.t.sh.in dynamichook.t.sh @ONLY)
+            set( dynamic_hook_t_sh "${CMAKE_CURRENT_BINARY_DIR}/dynamichook.t.sh")
+            add_test(NAME dynamichook_1proc
+                COMMAND bash ${dynamic_hook_t_sh} 1)
+            # We set all dynamichook tests to run in serial mode, without any other tests,
+            # since these tests occupy the same port and would block each other
+            set_tests_properties( dynamichook_1proc PROPERTIES TIMEOUT 30 RUN_SERIAL TRUE)
+            add_test(NAME dynamichook_2proc
+                COMMAND bash ${dynamic_hook_t_sh} 2)
+            set_tests_properties( dynamichook_2proc PROPERTIES TIMEOUT 30 RUN_SERIAL TRUE)
+            add_test(NAME dynamichook_3proc
+                COMMAND bash ${dynamic_hook_t_sh} 3)
+            set_tests_properties( dynamichook_3proc PROPERTIES TIMEOUT 30 RUN_SERIAL TRUE)
+            add_test(NAME dynamichook_10proc
+                COMMAND bash ${dynamic_hook_t_sh} 10)
+            set_tests_properties( dynamichook_10proc PROPERTIES TIMEOUT 30 RUN_SERIAL TRUE)
     endif()
 
 # Other unit tests
     if (ENABLE_IBVERBS AND LPF_ENABLE_TESTS)
         add_gtest( ibverbs_test "ibverbs" ON ${CMAKE_CURRENT_SOURCE_DIR}/ibverbs.t.cpp 
-            ${CMAKE_CURRENT_SOURCE_DIR}/ibverbs.cpp 
-            ${CMAKE_CURRENT_SOURCE_DIR}/mpilib.cpp)
+            ibverbs.cpp mpilib.cpp)
 
         add_gtest( zero_test "zero" ON ${CMAKE_CURRENT_SOURCE_DIR}/ibverbs.t.cpp 
             ${CMAKE_CURRENT_SOURCE_DIR}/ibverbsZero.cpp 
             ${CMAKE_CURRENT_SOURCE_DIR}/mpilib.cpp)
+
+        # NOC test for HiCR
+        set(mode "")
+        set(LPF_IMPL_ID "zero")
+        set(LPF_IMPL_CONFIG ${LPFLIB_CONFIG_NAME})
+        set(exeName "func_verbs_test_noc_register_${LPF_IMPL_ID}_${LPF_IMPL_CONFIG}${mode}")
+        add_gtest(${exeName} ${LPF_IMPL_ID} ON ${CMAKE_CURRENT_SOURCE_DIR}/func_verbs_test_noc_register.cpp
+            ${CMAKE_CURRENT_SOURCE_DIR}/ibverbsZero.cpp
+            ${CMAKE_CURRENT_SOURCE_DIR}/ibverbsNoc.cpp
+            ${CMAKE_CURRENT_SOURCE_DIR}/mpilib.cpp)
+
     endif()
 
     foreach (engine ${MPI_ENGINES})
@@ -229,4 +235,3 @@ if (MPI_FOUND)
 
 endif(MPI_FOUND)
 
-
diff --git a/src/MPI/core.cpp b/src/MPI/core.cpp
index 4340bd27..dc3f0a0f 100644
--- a/src/MPI/core.cpp
+++ b/src/MPI/core.cpp
@@ -15,8 +15,10 @@
  * limitations under the License.
  */
 
+#include <lpf/noc.h>
 #include <lpf/core.h>
 #include <lpf/mpi.h>
+#include <lpf/abort.h>
 
 #include <vector>
 #include <limits>
@@ -36,6 +38,11 @@
 
 #include <mpi.h>
 
+
+// the value 2 in this implementation indicates support for lpf_abort in a way
+// that may deviate from the stdlib abort()
+const int LPF_HAS_ABORT = 2;
+
 // Error codes. 
 // Note: Some code (e.g. in process::broadcastSymbol) depends on the 
 // fact that numbers are assigned in order of severity, where 0 means
@@ -331,6 +338,15 @@ lpf_err_t lpf_get_rcvd_msg_count( lpf_t ctx, size_t * rcvd_msgs)
     return LPF_SUCCESS;
 }
 
+lpf_err_t lpf_get_sent_msg_count( lpf_t ctx, size_t * sent_msgs)
+{
+    lpf::Interface * i = realContext(ctx);
+    if (!i->isAborted()) {
+        i->getSentMsgCount(sent_msgs);
+    }
+    return LPF_SUCCESS;
+}
+
 lpf_err_t lpf_get_sent_msg_count_per_slot( lpf_t ctx, size_t * sent_msgs, lpf_memslot_t slot)
 {
     lpf::Interface * i = realContext(ctx);
@@ -392,4 +408,106 @@ lpf_err_t lpf_abort( lpf_t ctx ) {
     return LPF_SUCCESS;
 }
 
+lpf_err_t lpf_noc_resize_memory_register( lpf_t ctx, size_t max_regs ) 
+{
+    lpf::Interface * i = realContext(ctx);
+    if (i->isAborted())
+        return LPF_SUCCESS;
+    
+    return i->nocResizeMemreg(max_regs);
+}
+
+lpf_err_t lpf_noc_register(
+    lpf_t ctx,
+    void * pointer,
+    size_t size,
+    lpf_memslot_t * memslot
+) 
+{
+    lpf::Interface * i = realContext(ctx);
+    if (!i->isAborted())
+        *memslot = i->nocRegister(pointer, size);
+    return LPF_SUCCESS;
+}
+
+lpf_err_t lpf_noc_deregister(
+    lpf_t ctx,
+    lpf_memslot_t memslot
+) 
+{
+    lpf::Interface * i = realContext(ctx);
+    if (!i->isAborted())
+        i->nocDeregister(memslot);
+    
+    return LPF_SUCCESS;
+}
+
+lpf_err_t lpf_noc_put(
+    lpf_t ctx,
+    lpf_memslot_t src_slot,
+    size_t src_offset,
+    lpf_pid_t dst_pid,
+    lpf_memslot_t dst_slot,
+    size_t dst_offset,
+    size_t size,
+    lpf_msg_attr_t attr
+)
+{
+    (void) attr; // ignore parameter 'attr' since this implementation only
+                 // implements core functionality
+    lpf::Interface * i = realContext(ctx);
+    if (!i->isAborted()) // the put is skipped once the context is aborted
+        i->nocPut( src_slot, src_offset, dst_pid, dst_slot, dst_offset, size );
+
+    return LPF_SUCCESS; // returned unconditionally, also for an aborted context
+    
+}
+
+lpf_err_t lpf_noc_get(
+    lpf_t ctx,
+    lpf_pid_t pid,
+    lpf_memslot_t src,
+    size_t src_offset,
+    lpf_memslot_t dst,
+    size_t dst_offset,
+    size_t size,
+    lpf_msg_attr_t attr
+)
+{
+    (void) attr; // ignore parameter 'attr' since this implementation only
+                 // implements core functionality
+    lpf::Interface * i = realContext(ctx);
+    if (!i->isAborted()) // the get is skipped once the context is aborted
+        i->nocGet( pid, src, src_offset, dst, dst_offset, size );
+
+    return LPF_SUCCESS; // returned unconditionally, also for an aborted context
+}
+
+lpf_err_t lpf_noc_serialize_slot(
+        lpf_t ctx,
+        lpf_memslot_t slot,
+        char  ** buff,
+        size_t * buff_size
+)
+{
+    lpf::Interface * i = realContext(ctx);
+    if (!i->isAborted())
+        return i->serializeSlot(slot, buff, buff_size);
+
+    return LPF_ERR_FATAL;
+}
+
+lpf_err_t lpf_noc_deserialize_slot(
+        lpf_t ctx,
+        char * buff,
+        lpf_memslot_t slot
+)
+{
+    lpf::Interface * i = realContext(ctx);
+    if (!i->isAborted())
+        return i->deserializeSlot( buff, slot);
+
+    return LPF_ERR_FATAL;
+
+}
 
diff --git a/src/MPI/func_verbs_test_noc_register.cpp b/src/MPI/func_verbs_test_noc_register.cpp
new file mode 100644
index 00000000..6e9ad17f
--- /dev/null
+++ b/src/MPI/func_verbs_test_noc_register.cpp
@@ -0,0 +1,86 @@
+
+/*
+ *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ibverbsNoc.hpp"
+#include "mpilib.hpp"
+#include <string.h>
+#include "gtest/gtest.h"
+
+
+using namespace lpf::mpi;
+
+extern "C" const int LPF_MPI_AUTO_INITIALIZE=0;
+
+
+/** 
+ * \test Testing NOC functionality
+ * \pre P >= 2
+ * \pre P <= 2
+ * \return Exit code: 0
+ */
+TEST( API, func_verbsAPI_zero_test_noc_ring )
+{
+    // Each process puts the contents of buf1 into buf2 of its right neighbour.
+    char buf1[30] = {'\0'};
+    char buf2[30] = {'\0'};
+
+    strcpy(buf1, "HELLO");
+
+    MPI_Init(NULL, NULL);
+    Lib::instance();
+    Comm * comm = new Comm();
+    *comm = Lib::instance().world();
+    int rank = comm->pid();
+    assert(comm->nprocs() > 0);
+    comm->barrier();
+    IBVerbsNoc * verbs = new IBVerbsNoc( *comm );
+
+    verbs->resizeMemreg(3);
+    comm->barrier();
+
+    verbs->resizeMesgq( 2 );
+    comm->barrier();
+
+    IBVerbs::SlotID b1 = verbs->regLocal( buf1, sizeof(buf1) );
+    IBVerbs::SlotID b2 = verbs->regNoc( buf2, sizeof(buf2) );
+
+    auto mr = verbs->getMR(b1, rank);
+    mr = verbs->getMR(b2, rank);
+    assert(mr._addr != nullptr);
+    char * buffer;
+    size_t bufSize = mr.serialize(&buffer);
+    // 'buffer' is handled as bufSize raw bytes; it is not assumed NUL-terminated
+
+    int left = (comm->nprocs() + rank - 1) % comm->nprocs();
+    int right = (rank + 1) % comm->nprocs();
+    // std::string as byte storage avoids a non-standard variable-length array
+    std::string rmtBuff(bufSize, '\0');
+
+    MPI_Sendrecv(buffer, static_cast<int>(bufSize), MPI_BYTE, left, 0, &rmtBuff[0], static_cast<int>(bufSize), MPI_BYTE, right, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+
+    MemoryRegistration * newMr = MemoryRegistration::deserialize(&rmtBuff[0]);
+    verbs->setMR(b2, right, *newMr);
+    comm->barrier();
+    verbs->put( b1, 0, right, b2, 0, sizeof(buf1));
+    verbs->sync(true);
+    EXPECT_EQ(std::string(buf2), std::string(buf1));
+    verbs->dereg(b1);
+    verbs->dereg(b2);
+    delete verbs;
+    delete comm;
+    MPI_Finalize();
+}
diff --git a/src/MPI/ibverbs.cpp b/src/MPI/ibverbs.cpp
index 5dcdbfc8..77832aae 100644
--- a/src/MPI/ibverbs.cpp
+++ b/src/MPI/ibverbs.cpp
@@ -45,9 +45,20 @@ namespace {
     }
 }
 
+size_t MemoryRegistration :: serialize(char ** buf) {
+    (void) buf;
+    throw IBVerbs::Exception( "MemoryRegistration::serialize(char ** buf) not implemented for base IBVerbs class");
+}
+
+MemoryRegistration * MemoryRegistration :: deserialize(char * buf)
+{
+    (void) buf;
+    throw IBVerbs::Exception( "MemoryRegistration::deserialize(char * buf) not implemented for base IBVerbs class");
+}
 
 IBVerbs :: IBVerbs( Communication & comm )
-    : m_pid( comm.pid() )
+    : m_comm( comm )
+    , m_pid( comm.pid() )
     , m_nprocs( comm.nprocs() )
     , m_devName()
     , m_ibPort( Config::instance().getIBPort() )
@@ -72,7 +83,6 @@ IBVerbs :: IBVerbs( Communication & comm )
     , m_memreg()
     , m_dummyMemReg()
     , m_dummyBuffer()
-    , m_comm( comm )
 {
     m_peerList.reserve( m_nprocs );
 
@@ -97,7 +107,6 @@ IBVerbs :: IBVerbs( Communication & comm )
         throw Exception( "No Infiniband devices available" );
     }
 
-
     std::string wantDevName = Config::instance().getIBDeviceName();
     LOG( 3, "Searching for device '"<< wantDevName << "'" );
     struct ibv_device * dev = NULL;
@@ -463,8 +472,8 @@ void IBVerbs :: resizeMemreg( size_t size )
         throw std::bad_alloc() ;
     }
 
-    MemoryRegistration null = { 0, 0, 0, 0 };
-    MemorySlot dflt; dflt.glob.resize( m_nprocs, null );
+    MemoryRegistration newMR = { nullptr, 0, 0, 0, m_pid};
+    MemorySlot dflt; dflt.glob.resize( m_nprocs, newMR );
 
     m_memreg.reserve( size, dflt );
 }
@@ -507,11 +516,7 @@ IBVerbs :: SlotID IBVerbs :: regLocal( void * addr, size_t size )
             throw Exception("Could not register memory area");
         }
     }
-    MemoryRegistration local;
-    local.addr = addr;
-    local.size = size;
-    local.lkey = size?slot.mr->lkey:0;
-    local.rkey = size?slot.mr->rkey:0;
+    MemoryRegistration local((char *) addr, size, size?slot.mr->lkey:0, size?slot.mr->rkey:0, m_pid);
 
     SlotID id =  m_memreg.addLocalReg( slot );
 
@@ -551,11 +556,7 @@ IBVerbs :: SlotID IBVerbs :: regGlobal( void * addr, size_t size )
     // exchange memory registration info globally
     ref.glob.resize(m_nprocs);
 
-    MemoryRegistration local;
-    local.addr = addr;
-    local.size = size;
-    local.lkey = size?slot.mr->lkey:0;
-    local.rkey = size?slot.mr->rkey:0;
+    MemoryRegistration local((char *) addr, size, size?slot.mr->lkey:0, size?slot.mr->rkey:0, m_pid);
 
     LOG(4, "All-gathering memory register data" );
 
@@ -583,13 +584,13 @@ void IBVerbs :: put( SlotID srcSlot, size_t srcOffset,
         struct ibv_send_wr sr; std::memset(&sr, 0, sizeof(sr));
 
         const char * localAddr
-            = static_cast<const char *>(src.glob[m_pid].addr) + srcOffset;
+            = static_cast<const char *>(src.glob[m_pid]._addr) + srcOffset;
         const char * remoteAddr
-            = static_cast<const char *>(dst.glob[dstPid].addr) + dstOffset;
+            = static_cast<const char *>(dst.glob[dstPid]._addr) + dstOffset;
 
         sge.addr = reinterpret_cast<uintptr_t>( localAddr );
         sge.length = std::min<size_t>(size, m_maxMsgSize );
-        sge.lkey = src.mr->lkey;
+            sge.lkey = src.mr->lkey;
         m_sges.push_back( sge );
 
         bool lastMsg = ! m_activePeers.contains( dstPid );
@@ -603,7 +604,7 @@ void IBVerbs :: put( SlotID srcSlot, size_t srcOffset,
         sr.num_sge = 1;
         sr.opcode = IBV_WR_RDMA_WRITE;
         sr.wr.rdma.remote_addr = reinterpret_cast<uintptr_t>( remoteAddr );
-        sr.wr.rdma.rkey = dst.glob[dstPid].rkey;
+        sr.wr.rdma.rkey = dst.glob[dstPid]._rkey;
 
         m_srsHeads[ dstPid ] = m_srs.size();
         m_srs.push_back( sr );
@@ -632,9 +633,9 @@ void IBVerbs :: get( int srcPid, SlotID srcSlot, size_t srcOffset,
         struct ibv_send_wr sr; std::memset(&sr, 0, sizeof(sr));
 
         const char * localAddr
-            = static_cast<const char *>(dst.glob[m_pid].addr) + dstOffset;
+            = static_cast<const char *>(dst.glob[m_pid]._addr) + dstOffset;
         const char * remoteAddr
-            = static_cast<const char *>(src.glob[srcPid].addr) + srcOffset;
+            = static_cast<const char *>(src.glob[srcPid]._addr) + srcOffset;
 
         sge.addr = reinterpret_cast<uintptr_t>( localAddr );
         sge.length = std::min<size_t>(size, m_maxMsgSize );
@@ -652,7 +653,7 @@ void IBVerbs :: get( int srcPid, SlotID srcSlot, size_t srcOffset,
         sr.num_sge = 1;
         sr.opcode = IBV_WR_RDMA_READ;
         sr.wr.rdma.remote_addr = reinterpret_cast<uintptr_t>( remoteAddr );
-        sr.wr.rdma.rkey = src.glob[srcPid].rkey;
+        sr.wr.rdma.rkey = src.glob[srcPid]._rkey;
 
         m_srsHeads[ srcPid ] = m_srs.size();
         m_srs.push_back( sr );
diff --git a/src/MPI/ibverbs.hpp b/src/MPI/ibverbs.hpp
index f53c9354..b165f777 100644
--- a/src/MPI/ibverbs.hpp
+++ b/src/MPI/ibverbs.hpp
@@ -58,6 +58,23 @@ using std::shared_ptr;
 using std::tr1::shared_ptr;
 #endif
 
+// Describes one registered memory region: base address, size, the two verbs keys and a process ID.
+class MemoryRegistration {
+    public:
+        char *   _addr; // base address of the region (nullptr when default-constructed)
+        size_t   _size; // size that was handed to the registration call
+        uint32_t _lkey; // lkey of the region (0 in default/empty entries)
+        uint32_t _rkey; // rkey of the region (0 in default/empty entries)
+        int      _pid;  // process ID stored with the entry (-1 when default-constructed)
+        MemoryRegistration() : _addr(nullptr), _size(0), _lkey(0), _rkey(0), _pid(-1) {}
+        MemoryRegistration(char * addr, size_t size, uint32_t lkey, uint32_t rkey, int pid)
+            : _addr(addr), _size(size), _lkey(lkey), _rkey(rkey), _pid(pid) {}
+        // Byte-level (de)serialization; the definitions are supplied by the engine's .cpp file.
+        size_t serialize(char ** buf);
+        static MemoryRegistration * deserialize(char * buf);
+};
+
+
 class _LPFLIB_LOCAL IBVerbs 
 {
 public:
@@ -72,6 +89,7 @@ class _LPFLIB_LOCAL IBVerbs
     void resizeMesgq( size_t size );
     
     SlotID regLocal( void * addr, size_t size );
+    SlotID regNoc( void * addr, size_t size );
     SlotID regGlobal( void * addr, size_t size );
     void dereg( SlotID id );
 
@@ -93,7 +111,7 @@ class _LPFLIB_LOCAL IBVerbs
 
     void doRemoteProgress();
 
-    void countingSyncPerSlot(bool resized, SlotID tag, size_t sent, size_t recvd);
+    void countingSyncPerSlot(SlotID tag, size_t sent, size_t recvd);
     /**
      * @syncPerSlot only guarantees that all already scheduled sends (via put), 
      * or receives (via get) associated with a slot are completed. It does 
@@ -101,16 +119,18 @@ class _LPFLIB_LOCAL IBVerbs
      * no guarantee that a remote process will wait til data is put into its 
      * memory, as it does schedule the operation (one-sided).
      */
-    void syncPerSlot(bool resized, SlotID slot);
+    void syncPerSlot(SlotID slot);
 
     // Do the communication and synchronize
     // 'Reconnect' must be a globally replicated value
     void sync( bool reconnect);
 
     void get_rcvd_msg_count(size_t * rcvd_msgs);
+    void get_sent_msg_count(size_t * sent_msgs);
     void get_rcvd_msg_count_per_slot(size_t * rcvd_msgs, SlotID slot);
     void get_sent_msg_count_per_slot(size_t * sent_msgs, SlotID slot);
-private:
+
+protected:
     IBVerbs & operator=(const IBVerbs & ); // assignment prohibited
     IBVerbs( const IBVerbs & ); // copying prohibited
 
@@ -123,22 +143,16 @@ class _LPFLIB_LOCAL IBVerbs
     void doProgress();
     void tryIncrement(Op op, Phase phase, SlotID slot);
 
-    struct MemoryRegistration {
-        void *   addr;
-        size_t   size;
-        uint32_t lkey;
-        uint32_t rkey;
-    };
-
     struct MemorySlot {
         shared_ptr< struct ibv_mr > mr;    // verbs structure
         std::vector< MemoryRegistration > glob; // array for global registrations
     };
 
+
+    Communication & m_comm;
     int          m_pid; // local process ID
     int          m_nprocs; // number of processes
     std::atomic_size_t m_numMsgs;
-    //std::atomic_size_t m_sendTotalInitMsgCount;
     std::atomic_size_t m_recvTotalInitMsgCount;
     std::atomic_size_t m_sentMsgs;
     std::atomic_size_t m_recvdMsgs;
@@ -157,8 +171,6 @@ class _LPFLIB_LOCAL IBVerbs
     size_t		m_cqSize;
     size_t       m_minNrMsgs;
     size_t       m_maxSrs; // maximum number of sends requests per QP  
-    size_t m_postCount;
-    size_t m_recvCount;
 
     shared_ptr< struct ibv_context > m_device; // device handle
     shared_ptr< struct ibv_pd >      m_pd;     // protection domain
@@ -173,10 +185,6 @@ class _LPFLIB_LOCAL IBVerbs
     // Connected queue pairs
     std::vector< shared_ptr<struct ibv_qp> > m_connectedQps; 
 
-    std::vector<size_t> rcvdMsgCount;
-    std::vector<size_t> sentMsgCount;
-    std::vector<size_t> getMsgCount;
-    std::vector<bool> slotActive;
 
 
     std::vector< struct ibv_send_wr > m_srs; // array of send requests
@@ -193,8 +201,13 @@ class _LPFLIB_LOCAL IBVerbs
 
     shared_ptr< struct ibv_mr > m_dummyMemReg; // registration of dummy buffer
     std::vector< char > m_dummyBuffer; // dummy receive buffer
-
-    Communication & m_comm;
+                                       //
+    std::vector<size_t> rcvdMsgCount;
+    std::vector<size_t> sentMsgCount;
+    std::vector<size_t> getMsgCount;
+    std::vector<bool> slotActive;
+    size_t m_postCount;
+    size_t m_recvCount;
 };
 
 
diff --git a/src/MPI/ibverbsNoc.cpp b/src/MPI/ibverbsNoc.cpp
new file mode 100644
index 00000000..7f185fc1
--- /dev/null
+++ b/src/MPI/ibverbsNoc.cpp
@@ -0,0 +1,98 @@
+#include "ibverbsNoc.hpp"
+
+namespace lpf 
+{
+namespace mpi
+{
+
+    // Packs _addr, _size, _lkey, _rkey, _pid back to back into a new[] buffer; the caller owns *buf (delete[]).
+    size_t MemoryRegistration :: serialize(char ** buf) {
+        const uintptr_t addrAsUintPtr = reinterpret_cast<uintptr_t>(_addr);
+        const size_t bufSize = sizeof(uintptr_t) + sizeof(size_t) + 2*sizeof(uint32_t) + sizeof(int);
+        *buf = new char[bufSize];
+        char *ptr = *buf;
+        memcpy(ptr, &addrAsUintPtr, sizeof(uintptr_t));
+        ptr += sizeof(uintptr_t);
+        memcpy(ptr, &_size, sizeof(size_t));
+        ptr += sizeof(size_t);
+        memcpy(ptr, &_lkey, sizeof(uint32_t));
+        ptr += sizeof(uint32_t);
+        memcpy(ptr, &_rkey, sizeof(uint32_t));
+        ptr += sizeof(uint32_t);
+        memcpy(ptr, &_pid, sizeof(int));
+        return bufSize;
+    }
+
+    // Rebuilds a registration from a buffer laid out by serialize(); the result is new-allocated.
+    MemoryRegistration * MemoryRegistration :: deserialize(char * buf) {
+        uintptr_t addrAsUintPtr;
+        size_t    size;
+        uint32_t  lkey;
+        uint32_t  rkey;
+        int       pid;
+        const char * cursor = buf; // read position, advanced field by field
+        memcpy(&addrAsUintPtr, cursor, sizeof(addrAsUintPtr));
+        cursor += sizeof(addrAsUintPtr);
+        memcpy(&size, cursor, sizeof(size));
+        cursor += sizeof(size);
+        memcpy(&lkey, cursor, sizeof(lkey));
+        cursor += sizeof(lkey);
+        memcpy(&rkey, cursor, sizeof(rkey));
+        cursor += sizeof(rkey);
+        memcpy(&pid, cursor, sizeof(pid));
+        // The address travelled as an integer; turn it back into a pointer.
+        char * const addr = reinterpret_cast<char *>(addrAsUintPtr);
+        return new MemoryRegistration(addr, size, lkey, rkey, pid);
+    }
+
+    struct IBVerbsNoc::Exception : std::runtime_error { // error type thrown by regNoc() when registration fails
+        Exception(const char * what) : std::runtime_error( what ) {}
+    };
+
+    // Returns a copy of the registration that slot `slotId` holds for process `pid`.
+    MemoryRegistration IBVerbsNoc :: getMR(SlotID slotId, int pid)
+    {
+        return m_memreg.lookup( slotId ).glob[pid];
+    }
+
+    void IBVerbsNoc::setMR(SlotID slotId, int pid, MemoryRegistration & mr)
+    {   // Overwrites the registration that slot `slotId` holds for process `pid` with a copy of `mr`.
+        m_memreg.update(slotId).glob[pid] = mr;
+    }
+
+    IBVerbsNoc::IBVerbsNoc(Communication & comm) : IBVerbs(comm)
+    {   // Nothing to add here: construction is delegated entirely to the IBVerbs base class.
+    }
+
+    // Registers [addr, addr+size) with the protection domain (local write, remote read and
+    // write access) and stores it as a NOC slot. Only this process's own entry of the slot's
+    // per-process table is filled in. Throws Exception if ibv_reg_mr fails; returns the slot ID.
+    IBVerbs::SlotID IBVerbsNoc :: regNoc( void * addr, size_t size )
+    {
+        ASSERT( size <= m_maxRegSize );
+
+        MemorySlot slot;
+        if ( size > 0) {
+            LOG(4, "IBVerbsNoc::regNoc: Registering locally memory area at " << addr << " of size  " << size );
+            struct ibv_mr * const ibv_mr_new_p = ibv_reg_mr(
+                    m_pd.get(), addr, size,
+                    IBV_ACCESS_REMOTE_READ | IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE );
+            if( ibv_mr_new_p == NULL ) {
+                LOG(1, "Could not register memory area at "
+                        << addr << " of size " << size << " with IB device");
+                throw Exception("Could not register memory area");
+            }
+            slot.mr.reset( ibv_mr_new_p, ibv_dereg_mr ); // ibv_dereg_mr runs when the last owner releases it
+        }
+        // A zero-sized area has no ibv_mr, so its keys are recorded as 0.
+        MemoryRegistration local(static_cast<char *>(addr), size, size?slot.mr->lkey:0, size?slot.mr->rkey:0, m_pid);
+
+        SlotID id =  m_memreg.addNocReg( slot );
+        m_memreg.update( id ).glob.resize( m_nprocs );
+        m_memreg.update( id ).glob[m_pid] = local;
+        LOG(4, "Memory area " << addr << " of size " << size << " has been locally registered as NOC slot. Slot = " << id );
+        return id;
+    }
+
+} // namespace mpi
+} // namespace lpf
diff --git a/src/MPI/ibverbsNoc.hpp b/src/MPI/ibverbsNoc.hpp
new file mode 100644
index 00000000..d9ece946
--- /dev/null
+++ b/src/MPI/ibverbsNoc.hpp
@@ -0,0 +1,19 @@
+#pragma once
+
+#include "ibverbs.hpp"
+
+namespace lpf 
+{
+
+namespace mpi 
+{
+    // IBVerbs extended with NOC slot registration (regNoc) and direct get/set of per-process registrations.
+    class _LPFLIB_LOCAL IBVerbsNoc : public IBVerbs {
+        public:
+            IBVerbsNoc(Communication & comm);
+            IBVerbs::SlotID regNoc( void * addr, size_t size );
+            MemoryRegistration getMR(SlotID slotId, int pid);
+            void setMR(SlotID slotId, int pid, MemoryRegistration & mr);
+    };
+} // namespace mpi
+} // namespace lpf
diff --git a/src/MPI/ibverbsZero.cpp b/src/MPI/ibverbsZero.cpp
index 6f52fa5b..7cec923a 100644
--- a/src/MPI/ibverbsZero.cpp
+++ b/src/MPI/ibverbsZero.cpp
@@ -53,14 +53,20 @@ namespace {
 
 
 IBVerbs :: IBVerbs( Communication & comm )
-    : m_pid( comm.pid() )
+    : m_comm( comm )
+    , m_pid( comm.pid() )
     , m_nprocs( comm.nprocs() )
+    , m_numMsgs(0)
+    , m_recvTotalInitMsgCount(0)
+    , m_sentMsgs(0)
+    , m_recvdMsgs(0)
     , m_devName()
     , m_ibPort( Config::instance().getIBPort() )
     , m_gidIdx( Config::instance().getIBGidIndex() )
     , m_mtu( getMTU( Config::instance().getIBMTU() ))
     , m_maxRegSize(0)
     , m_maxMsgSize(0)
+    , m_cqSize(1)
     , m_minNrMsgs(0)
     , m_maxSrs(0)
     , m_device()
@@ -78,14 +84,8 @@ IBVerbs :: IBVerbs( Communication & comm )
     , m_memreg()
     , m_dummyMemReg()
     , m_dummyBuffer()
-    , m_comm( comm )
-    , m_cqSize(1)
     , m_postCount(0)
     , m_recvCount(0)
-    , m_numMsgs(0)
-    , m_recvTotalInitMsgCount(0)
-    , m_sentMsgs(0)
-    , m_recvdMsgs(0)
 {
 
     // arrays instead of hashmap for counters
@@ -260,7 +260,7 @@ IBVerbs :: ~IBVerbs()
 
 
 inline void IBVerbs :: tryIncrement(Op op, Phase phase, SlotID slot) {
-    
+
     switch (phase) {
         case Phase::INIT:
             rcvdMsgCount[slot] = 0;
@@ -306,7 +306,7 @@ inline void IBVerbs :: tryIncrement(Op op, Phase phase, SlotID slot) {
 void IBVerbs :: stageQPs( size_t maxMsgs )
 {
     // create the queue pairs
-    for ( int i = 0; i < m_nprocs; ++i) {
+    for ( size_t i = 0; i < static_cast<size_t>(m_nprocs); ++i) {
         struct ibv_qp_init_attr attr;
         std::memset(&attr, 0, sizeof(attr));
 
@@ -321,6 +321,7 @@ void IBVerbs :: stageQPs( size_t maxMsgs )
         attr.cap.max_recv_sge = 1;
 
         struct ibv_qp * const ibv_new_qp_p = ibv_create_qp( m_pd.get(), &attr );
+        ASSERT(m_stagedQps.size() > i);
         if( ibv_new_qp_p == NULL ) {
             m_stagedQps[i].reset();
         } else {
@@ -352,7 +353,7 @@ void IBVerbs :: doRemoteProgress() {
 		pollResult = ibv_poll_cq(m_cqRemote.get(), POLL_BATCH, wcs);
         if (pollResult > 0) {
             LOG(3, "Process " << m_pid << " signals: I received " << pollResult << " remote messages in doRemoteProgress");
-        }  
+        }
         else if (pollResult < 0)
         {
             LOG( 1, "Failed to poll IB completion queue" );
@@ -367,10 +368,10 @@ void IBVerbs :: doRemoteProgress() {
                         << wcs[i].vendor_err );
             }
             else {
-                LOG(2, "Process " << m_pid << " Recv wcs[" << i << "].src_qp = "<< wcs[i].src_qp);
-                LOG(2, "Process " << m_pid << " Recv wcs[" << i << "].slid = "<< wcs[i].slid);
-                LOG(2, "Process " << m_pid << " Recv wcs[" << i << "].wr_id = "<< wcs[i].wr_id);
-                LOG(2, "Process " << m_pid << " Recv wcs[" << i << "].imm_data = "<< wcs[i].imm_data);
+                LOG(3, "Process " << m_pid << " Recv wcs[" << i << "].src_qp = "<< wcs[i].src_qp);
+                LOG(3, "Process " << m_pid << " Recv wcs[" << i << "].slid = "<< wcs[i].slid);
+                LOG(3, "Process " << m_pid << " Recv wcs[" << i << "].wr_id = "<< wcs[i].wr_id);
+                LOG(3, "Process " << m_pid << " Recv wcs[" << i << "].imm_data = "<< wcs[i].imm_data);
 
                 /**
                  * Here is a trick:
@@ -463,7 +464,6 @@ void IBVerbs :: reconnectQPs()
 
             struct ibv_recv_wr rr;  std::memset(&rr, 0, sizeof(rr));
             struct ibv_sge     sge; std::memset(&sge, 0, sizeof(sge));
-            struct ibv_recv_wr *bad_wr = NULL;
             sge.addr = reinterpret_cast<uintptr_t>(m_dummyBuffer.data());
             sge.length = m_dummyBuffer.size();
             sge.lkey = m_dummyMemReg->lkey;
@@ -553,8 +553,8 @@ void IBVerbs :: resizeMemreg( size_t size )
         throw std::bad_alloc() ;
     }
 
-    MemoryRegistration null = { 0, 0, 0, 0 };
-    MemorySlot dflt; dflt.glob.resize( m_nprocs, null );
+    MemoryRegistration newMR = { nullptr, 0, 0, 0, m_pid};
+    MemorySlot dflt; dflt.glob.resize( m_nprocs, newMR);
 
     m_memreg.reserve( size, dflt );
 }
@@ -616,14 +616,10 @@ IBVerbs :: SlotID IBVerbs :: regLocal( void * addr, size_t size )
             throw Exception("Could not register memory area");
         }
     }
-    MemoryRegistration local;
-    local.addr = addr;
-    local.size = size;
-    local.lkey = size?slot.mr->lkey:0;
-    local.rkey = size?slot.mr->rkey:0;
+    MemoryRegistration local((char *) addr, size, size?slot.mr->lkey:0, size?slot.mr->rkey:0, m_pid);
 
     SlotID id =  m_memreg.addLocalReg( slot );
-    tryIncrement(Op::SEND/* <- dummy for init */, Phase::INIT, id);
+    tryIncrement(Op::SEND, Phase::INIT, id);
 
     m_memreg.update( id ).glob.resize( m_nprocs );
     m_memreg.update( id ).glob[m_pid] = local;
@@ -662,12 +658,7 @@ IBVerbs :: SlotID IBVerbs :: regGlobal( void * addr, size_t size )
     // exchange memory registration info globally
     ref.glob.resize(m_nprocs);
 
-    MemoryRegistration local;
-    local.addr = addr;
-    local.size = size;
-    local.lkey = size?slot.mr->lkey:0;
-    local.rkey = size?slot.mr->rkey:0;
-
+    MemoryRegistration local((char *) addr, size, size?slot.mr->lkey:0, size?slot.mr->rkey:0, m_pid);
     LOG(4, "All-gathering memory register data" );
 
     m_comm.allgather( local, ref.glob.data() );
@@ -694,9 +685,9 @@ void IBVerbs :: blockingCompareAndSwap(SlotID srcSlot, size_t srcOffset, int dst
 	const MemorySlot & dst = m_memreg.lookup( dstSlot);
 
     char * localAddr
-        = static_cast<char *>(src.glob[m_pid].addr) + srcOffset;
+        = static_cast<char *>(src.glob[m_pid]._addr) + srcOffset;
         const char * remoteAddr
-            = static_cast<const char *>(dst.glob[dstPid].addr) + dstOffset;
+            = static_cast<const char *>(dst.glob[dstPid]._addr) + dstOffset;
 
 	struct ibv_sge sge;
 	memset(&sge, 0, sizeof(sge));
@@ -704,7 +695,6 @@ void IBVerbs :: blockingCompareAndSwap(SlotID srcSlot, size_t srcOffset, int dst
 	sge.length =  std::min<size_t>(size, m_maxMsgSize );
         sge.lkey = src.mr->lkey;
 
-	struct ibv_wc wcs[POLL_BATCH];
 	struct ibv_send_wr wr;
 	memset(&wr, 0, sizeof(wr));
 	wr.wr_id = srcSlot;
@@ -716,7 +706,7 @@ void IBVerbs :: blockingCompareAndSwap(SlotID srcSlot, size_t srcOffset, int dst
 	wr.wr.atomic.remote_addr = reinterpret_cast<uintptr_t>(remoteAddr);
 	wr.wr.atomic.compare_add = compare_add;
 	wr.wr.atomic.swap = swap;
-	wr.wr.atomic.rkey = dst.glob[dstPid].rkey;
+	wr.wr.atomic.rkey = dst.glob[dstPid]._rkey;
 	struct ibv_send_wr *bad_wr;
 	int error;
     std::vector<ibv_wc_opcode> opcodes;
@@ -729,7 +719,7 @@ void IBVerbs :: blockingCompareAndSwap(SlotID srcSlot, size_t srcOffset, int dst
 	}
 
     /**
-     * Keep waiting on a completion of events until you 
+     * Keep waiting on a completion of events until you
      * register a completed atomic compare-and-swap
      */
     do {
@@ -741,7 +731,7 @@ void IBVerbs :: blockingCompareAndSwap(SlotID srcSlot, size_t srcOffset, int dst
     } while (std::find(opcodes.begin(), opcodes.end(), IBV_WC_COMP_SWAP) == opcodes.end());
 
 	uint64_t * remoteValueFound = reinterpret_cast<uint64_t *>(localAddr);
-	/* 
+	/*
      * if we fetched the value we expected, then
      * we are holding the lock now (that is, we swapped successfully!)
      * else, re-post your request for the lock
@@ -775,9 +765,9 @@ void IBVerbs :: put( SlotID srcSlot, size_t srcOffset,
         sge = &sges[i]; std::memset(sge, 0, sizeof(ibv_sge));
         sr = &srs[i]; std::memset(sr, 0, sizeof(ibv_send_wr));
         const char * localAddr
-            = static_cast<const char *>(src.glob[m_pid].addr) + srcOffset;
+            = static_cast<const char *>(src.glob[m_pid]._addr) + srcOffset;
         const char * remoteAddr
-            = static_cast<const char *>(dst.glob[dstPid].addr) + dstOffset;
+            = static_cast<const char *>(dst.glob[dstPid]._addr) + dstOffset;
 
         sge->addr = reinterpret_cast<uintptr_t>( localAddr );
         sge->length =  std::min<size_t>(size, m_maxMsgSize );
@@ -791,9 +781,9 @@ void IBVerbs :: put( SlotID srcSlot, size_t srcOffset,
         sr->send_flags = lastMsg ? IBV_SEND_SIGNALED : 0;
         sr->opcode = lastMsg? IBV_WR_RDMA_WRITE_WITH_IMM : IBV_WR_RDMA_WRITE;
         /* use wr_id to later demultiplex srcSlot */
-        sr->wr_id = srcSlot; 
+        sr->wr_id = srcSlot;
         /*
-         * In HiCR, we need to know at receiver end which slot 
+         * In HiCR, we need to know at receiver end which slot
          * has received the message. But here is a trick:
          */
         sr->imm_data = dstSlot;
@@ -801,7 +791,7 @@ void IBVerbs :: put( SlotID srcSlot, size_t srcOffset,
         sr->sg_list = &sges[i];
         sr->num_sge = 1;
         sr->wr.rdma.remote_addr = reinterpret_cast<uintptr_t>( remoteAddr );
-        sr->wr.rdma.rkey = dst.glob[dstPid].rkey;
+        sr->wr.rdma.rkey = dst.glob[dstPid]._rkey;
 
         srs[i] = *sr;
         size -= sge->length;
@@ -843,9 +833,9 @@ void IBVerbs :: get( int srcPid, SlotID srcSlot, size_t srcOffset,
 		sr = &srs[i]; std::memset(sr, 0, sizeof(ibv_send_wr));
 
 		const char * localAddr
-			= static_cast<const char *>(dst.glob[m_pid].addr) + dstOffset;
+			= static_cast<const char *>(dst.glob[m_pid]._addr) + dstOffset;
 		const char * remoteAddr
-			= static_cast<const char *>(src.glob[srcPid].addr) + srcOffset;
+			= static_cast<const char *>(src.glob[srcPid]._addr) + srcOffset;
 
 		sge->addr = reinterpret_cast<uintptr_t>( localAddr );
 		sge->length = std::min<size_t>(size, m_maxMsgSize );
@@ -861,7 +851,7 @@ void IBVerbs :: get( int srcPid, SlotID srcSlot, size_t srcOffset,
 		sr->num_sge = 1;
 		sr->opcode = IBV_WR_RDMA_READ;
 		sr->wr.rdma.remote_addr = reinterpret_cast<uintptr_t>( remoteAddr );
-		sr->wr.rdma.rkey = src.glob[srcPid].rkey;
+		sr->wr.rdma.rkey = src.glob[srcPid]._rkey;
         // This logic is reversed compared to ::put
         // (not srcSlot, as this slot is remote)
         sr->wr_id = dstSlot; // <= DO NOT CHANGE THIS !!!
@@ -890,25 +880,29 @@ void IBVerbs :: get_rcvd_msg_count(size_t * rcvd_msgs) {
     *rcvd_msgs = m_recvdMsgs;
 }
 
+void IBVerbs :: get_sent_msg_count(size_t * sent_msgs) {
+    *sent_msgs = m_sentMsgs; // copies the current value of the m_sentMsgs counter to the caller
+}
+
 void IBVerbs :: get_rcvd_msg_count_per_slot(size_t * rcvd_msgs, SlotID slot)
 {
-    *rcvd_msgs = rcvdMsgCount[slot];
+    *rcvd_msgs = rcvdMsgCount[slot] + getMsgCount[slot]; // passive receives plus this process's own completed gets
 }
 
 void IBVerbs :: get_sent_msg_count_per_slot(size_t * sent_msgs, SlotID slot)
 {
-    *sent_msgs = sentMsgCount.at(slot);
+    *sent_msgs = sentMsgCount[slot]; // unchecked index: slot must be a valid position in sentMsgCount
 }
 
 std::vector<ibv_wc_opcode> IBVerbs :: wait_completion(int& error) {
 
     error = 0;
-    LOG(5, "Polling for messages" );
+    LOG(1, "Polling for messages" );
     struct ibv_wc wcs[POLL_BATCH];
     int pollResult = ibv_poll_cq(m_cqLocal.get(), POLL_BATCH, wcs);
     std::vector<ibv_wc_opcode> opcodes;
     if ( pollResult > 0) {
-        LOG(3, "Process " << m_pid << ": Received " << pollResult << " acknowledgements");
+        LOG(4, "Process " << m_pid << ": Received " << pollResult << " acknowledgements");
 
         for (int i = 0; i < pollResult ; ++i) {
             if (wcs[i].status != IBV_WC_SUCCESS)
@@ -923,10 +917,10 @@ std::vector<ibv_wc_opcode> IBVerbs :: wait_completion(int& error) {
                 error = 1;
             }
             else {
-                LOG(3, "Process " << m_pid << " Send wcs[" << i << "].src_qp = "<< wcs[i].src_qp);
-                LOG(3, "Process " << m_pid << " Send wcs[" << i << "].slid = "<< wcs[i].slid);
-                LOG(3, "Process " << m_pid << " Send wcs[" << i << "].wr_id = "<< wcs[i].wr_id);
-                LOG(3, "Process " << m_pid << " Send wcs[" << i << "].imm_data = "<< wcs[i].imm_data);
+                LOG(4, "Process " << m_pid << " Send wcs[" << i << "].src_qp = "<< wcs[i].src_qp);
+                LOG(4, "Process " << m_pid << " Send wcs[" << i << "].slid = "<< wcs[i].slid);
+                LOG(4, "Process " << m_pid << " Send wcs[" << i << "].wr_id = "<< wcs[i].wr_id);
+                LOG(4, "Process " << m_pid << " Send wcs[" << i << "].imm_data = "<< wcs[i].imm_data);
             }
 
             SlotID slot = wcs[i].wr_id;
@@ -936,18 +930,20 @@ std::vector<ibv_wc_opcode> IBVerbs :: wait_completion(int& error) {
                 // This is a get call completing
                 if (wcs[i].opcode == IBV_WC_RDMA_READ) {
                     tryIncrement(Op::GET, Phase::POST, slot);
+					LOG(4, "Rank " << m_pid << " with GET, increments getMsgCount to " << getMsgCount[slot] << " for LPF slot " << slot);
                 }
                 // This is a put call completing
-                if (wcs[i].opcode == IBV_WC_RDMA_WRITE)
+                if (wcs[i].opcode == IBV_WC_RDMA_WRITE) {
                     tryIncrement(Op::SEND, Phase::POST, slot);
+					LOG(4, "Rank " << m_pid << " with SEND, increments getMsgCount to " << sentMsgCount[slot] << " for LPF slot " << slot);
+				}
 
-                LOG(3, "Rank " << m_pid << " increments sent message count to " << sentMsgCount[slot] << " for LPF slot " << slot);
             }
         }
     }
     else if (pollResult < 0)
     {
-        LOG( 5, "Failed to poll IB completion queue" );
+        LOG( 1, "Failed to poll IB completion queue" );
         throw Exception("Poll CQ failure");
     }
     return opcodes;
@@ -980,10 +976,12 @@ void IBVerbs :: flushSent()
 
 }
 
-void IBVerbs :: countingSyncPerSlot(bool resized, SlotID slot, size_t expectedSent, size_t expectedRecvd) {
+void IBVerbs :: countingSyncPerSlot(SlotID slot, size_t expectedSent, size_t expectedRecvd) {
 
-    size_t actualRecvd;
-    size_t actualSent;
+	bool sentOK = false;
+	bool recvdOK = false;
+	if (expectedSent == 0) sentOK = true;
+	if (expectedRecvd == 0) recvdOK = true;
     int error;
     if (slotActive[slot]) {
         do {
@@ -995,14 +993,25 @@ void IBVerbs :: countingSyncPerSlot(bool resized, SlotID slot, size_t expectedSe
             // this call triggers doRemoteProgress
             doRemoteProgress();
 
-        } while (
-                (rcvdMsgCount[slot] < m_recvInitMsgCount[slot]) ||
-                (sentMsgCount[slot] < m_sendInitMsgCount[slot])
-                );
+			/*
+			 * 1) Are we expecting nothing here (sentOK/recvdOK = true)
+             * 2) do the sent and received messages  match our expectations?
+			 */
+			sentOK = (sentOK || sentMsgCount[slot] >= expectedSent);
+			// We can receive messages passively (from remote puts) and actively (from our gets)
+			recvdOK = (recvdOK || (rcvdMsgCount[slot] + getMsgCount[slot]) >= expectedRecvd);
+		    LOG(4, "PID: " << m_pid << " rcvdMsgCount[" << slot << "] = " << rcvdMsgCount[slot]
+					<< " expectedRecvd = " << expectedRecvd
+					<< " sentMsgCount[" << slot << "] = " << sentMsgCount[slot]
+					<< " expectedSent = " << expectedSent
+					<< " m_recvInitMsgCount[" << slot << "] = " << m_recvInitMsgCount[slot]
+					<< " m_sendInitMsgCount[" << slot << "] = " << m_sendInitMsgCount[slot]);
+
+        } while (!(sentOK && recvdOK));
     }
 }
 
-void IBVerbs :: syncPerSlot(bool resized, SlotID slot) {
+void IBVerbs :: syncPerSlot(SlotID slot) {
     int error;
 
     do {
@@ -1034,15 +1043,14 @@ void IBVerbs :: syncPerSlot(bool resized, SlotID slot) {
 
 void IBVerbs :: sync(bool resized)
 {
-
-    int error = 0;
+    (void) resized;
 
     // flush send queues
     flushSent();
     // flush receive queues
     flushReceived();
 
-    LOG(1, "Process " << m_pid << " will call barrier\n");
+    LOG(4, "Process " << m_pid << " will call barrier at end of sync\n");
     m_comm.barrier();
 
 
diff --git a/src/MPI/interface.cpp b/src/MPI/interface.cpp
index 80123e58..53203042 100644
--- a/src/MPI/interface.cpp
+++ b/src/MPI/interface.cpp
@@ -75,7 +75,10 @@ void Interface :: initRoot(int *argc, char ***argv)
 Interface :: Interface( mpi::Comm machine, Process & subprocess )
 try : m_comm( machine )
     , m_subprocess( subprocess )
-    , m_mesgQueue( m_comm )
+    
+//#if defined (LPF_CORE_MPI_USES_zero) || defined (LPF_CORE_MPI_USES_ibverbs)
+     ,m_mesgQueue( m_comm)
+//#endif
     , m_aborted( false )
 {
      if ( machine.allreduceOr( false ) )
@@ -129,6 +132,15 @@ void Interface :: getSentMsgCountPerSlot(size_t * msgs, SlotID slot) {
     m_mesgQueue.getSentMsgCountPerSlot(msgs, slot);
 }
 
+
+void Interface :: getRcvdMsgCount(size_t * msgs) {
+    m_mesgQueue.getRcvdMsgCount(msgs); // forwarded unchanged to the message queue
+}
+
+void Interface :: getSentMsgCount(size_t * msgs) {
+    m_mesgQueue.getSentMsgCount(msgs); // forwarded unchanged to the message queue
+}
+
 void Interface :: flushSent() {
     m_mesgQueue.flushSent();
 }
@@ -137,10 +149,6 @@ void Interface :: flushReceived() {
     m_mesgQueue.flushReceived();
 }
 
-void Interface :: getRcvdMsgCount(size_t * msgs) {
-    m_mesgQueue.getRcvdMsgCount(msgs);
-}
-
 err_t Interface :: countingSyncPerSlot(memslot_t slot, size_t expected_sent, size_t expected_rcvd)
 {
     if ( 0 == m_aborted )
@@ -219,6 +227,51 @@ void Interface :: abort()
 #endif
 }
 
+/* start NOC extensions */
+memslot_t Interface :: nocRegister( void * mem, size_t size )
+{   // Registers `size` bytes at `mem` as a NOC slot through the message queue and returns that slot.
+    return m_mesgQueue.addNocReg( mem, size );
+}
+
+void Interface :: nocDeregister( memslot_t slot)
+{   // Drops the registration of `slot` by calling the message queue's removeReg.
+    m_mesgQueue.removeReg(slot);
+}
+
+err_t Interface :: nocResizeMemreg( size_t nRegs )
+{   // Passes the requested register capacity to the message queue and returns its error code.
+    return m_mesgQueue.resizeMemreg(nRegs);
+}
+
+void Interface :: nocPut( memslot_t srcSlot, size_t srcOffset, 
+        pid_t dstPid, memslot_t dstSlot, size_t dstOffset,
+        size_t size )
+{   // Hands the put (srcSlot+srcOffset -> dstSlot+dstOffset on dstPid) to the message queue unchanged.
+    m_mesgQueue.put( srcSlot, srcOffset,
+            dstPid, dstSlot, dstOffset, 
+            size );
+}
+
+void Interface :: nocGet( pid_t srcPid, memslot_t srcSlot, size_t srcOffset, 
+        memslot_t dstSlot, size_t dstOffset,
+        size_t size )
+{   // Hands the get (srcSlot+srcOffset on srcPid -> dstSlot+dstOffset) to the message queue unchanged.
+    m_mesgQueue.get( srcPid, srcSlot, srcOffset,
+            dstSlot, dstOffset,
+            size );
+}
+
+err_t Interface :: serializeSlot(SlotID slot, char ** buff, size_t *buff_size)
+{   // Delegates to MessageQueue::serializeSlot with the same arguments and returns its error code.
+    return m_mesgQueue.serializeSlot(slot, buff, buff_size);
+}
+
+err_t Interface :: deserializeSlot(char * buff, SlotID slot)
+{   // Delegates to MessageQueue::deserializeSlot with the same arguments and returns its error code.
+    return m_mesgQueue.deserializeSlot(buff, slot);
+}
+/* end NOC extensions */
+
 pid_t Interface  :: isAborted() const
 {
     return m_aborted;
diff --git a/src/MPI/interface.hpp b/src/MPI/interface.hpp
index 02e48b3c..004e9edc 100644
--- a/src/MPI/interface.hpp
+++ b/src/MPI/interface.hpp
@@ -61,6 +61,23 @@ class _LPFLIB_LOCAL Interface
     err_t resizeMesgQueue( size_t nMsgs ) ; // nothrow
 
     void abort() ; // nothrow
+                   //
+    /* start NOC extensions */
+    memslot_t nocRegister( void * mem, size_t size ) ; // nothrow
+    void nocDeregister( memslot_t slot) ; // nothrow
+    err_t nocResizeMemreg( size_t nRegs ) ; // nothrow
+    void nocPut( memslot_t srcSlot, size_t srcOffset, 
+            pid_t dstPid, memslot_t dstSlot, size_t dstOffset,
+            size_t size ) ; // nothrow
+
+    void nocGet( pid_t srcPid, memslot_t srcSlot, size_t srcOffset, 
+            memslot_t dstSlot, size_t dstOffset,
+            size_t size ) ;// nothrow
+
+    err_t serializeSlot(memslot_t slot, char ** buff, size_t *buff_size);
+
+    err_t deserializeSlot(char * buff, memslot_t slot);
+    /* end NOC extensions */
 
     pid_t isAborted() const ;
  
@@ -82,6 +99,8 @@ class _LPFLIB_LOCAL Interface
 
     void getSentMsgCountPerSlot(size_t * msgs, SlotID slot);
 
+    void getSentMsgCount(size_t * msgs);
+
     void getRcvdMsgCount(size_t * msgs);
 
     void flushSent();
@@ -108,6 +127,7 @@ class _LPFLIB_LOCAL Interface
     mpi::Comm m_comm;
     Process & m_subprocess;
     MessageQueue m_mesgQueue;
+
     pid_t m_aborted;
 
     static Interface * s_root;
diff --git a/src/MPI/memorytable.cpp b/src/MPI/memorytable.cpp
index 51947985..4ebb546d 100644
--- a/src/MPI/memorytable.cpp
+++ b/src/MPI/memorytable.cpp
@@ -23,8 +23,10 @@
 namespace lpf {
 
 MemoryTable :: MemoryTable( Communication & comm
-#if defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero
+#if defined LPF_CORE_MPI_USES_ibverbs 
         , mpi::IBVerbs & ibverbs
+#elif defined LPF_CORE_MPI_USES_zero
+        , mpi::IBVerbsNoc & ibverbs
 #endif
         )
     : m_memreg()
@@ -42,6 +44,17 @@ MemoryTable :: MemoryTable( Communication & comm
 { (void) comm; }
 
 
+MemoryTable :: Slot
+MemoryTable :: addNoc( void * mem, std::size_t size )  // nothrow
+{
+#if defined LPF_CORE_MPI_USES_zero 
+    Memory rec( mem, size, m_ibverbs.regNoc(mem, size));
+    return m_memreg.addNocReg( rec);
+#else
+    return m_memreg.invalidSlot();
+#endif
+}
+
 MemoryTable :: Slot
 MemoryTable :: addLocal( void * mem, std::size_t size )  // nothrow
 {
@@ -53,6 +66,15 @@ MemoryTable :: addLocal( void * mem, std::size_t size )  // nothrow
     return m_memreg.addLocalReg( rec);
 }
 
+#if defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero
+mpi::IBVerbs::SlotID MemoryTable :: getVerbID(MemoryTable::Slot slot) const
+{
+    const Memory & sl = m_memreg.lookup(slot); // bind by reference: no copy of the record
+    ASSERT(sl.slot != m_memreg.invalidSlot());
+    return sl.slot; // reuse the record found above instead of a second lookup
+}
+#endif
+
 MemoryTable :: Slot
 MemoryTable :: addGlobal( void * mem, std::size_t size ) // nothrow
 { 
diff --git a/src/MPI/memorytable.hpp b/src/MPI/memorytable.hpp
index 05c01eee..1308aa33 100644
--- a/src/MPI/memorytable.hpp
+++ b/src/MPI/memorytable.hpp
@@ -23,8 +23,10 @@
 #include "assert.hpp"
 #include "linkage.hpp"
 
-#if defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero
+#if defined LPF_CORE_MPI_USES_ibverbs
 #include "ibverbs.hpp"
+#elif defined LPF_CORE_MPI_USES_zero
+#include "ibverbsNoc.hpp"
 #endif
 
 
@@ -64,14 +66,18 @@ class _LPFLIB_LOCAL MemoryTable
     static Slot invalidSlot() 
     { return Register::invalidSlot(); }
 
-#if defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero
+#if defined LPF_CORE_MPI_USES_ibverbs
     explicit MemoryTable( Communication & comm, mpi::IBVerbs & verbs );
+#elif defined LPF_CORE_MPI_USES_zero
+    explicit MemoryTable( Communication & comm, mpi::IBVerbsNoc & verbs );
 #else
     explicit MemoryTable( Communication & comm );
 #endif
 
     Slot addLocal( void * mem, std::size_t size ) ; // nothrow
 
+    Slot addNoc( void * mem, std::size_t size ) ; // nothrow
+
     Slot addGlobal( void * mem, std::size_t size ); // nothrow
     
     void remove( Slot slot );   // nothrow
@@ -90,8 +96,7 @@ class _LPFLIB_LOCAL MemoryTable
 #endif
 
 #if defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero
-    mpi::IBVerbs::SlotID getVerbID( Slot slot ) const
-    { return m_memreg.lookup( slot ).slot; }
+    mpi::IBVerbs::SlotID getVerbID( Slot slot ) const;
 #endif
 
     void reserve( size_t size ); // throws bad_alloc, strong safe
@@ -117,9 +122,13 @@ class _LPFLIB_LOCAL MemoryTable
     Communication & m_comm;
 #endif
 
-#if defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero
+#if defined LPF_CORE_MPI_USES_ibverbs
+    DirtyList      m_added;
+    mpi::IBVerbs & m_ibverbs;
+    Communication & m_comm;
+#elif defined LPF_CORE_MPI_USES_zero
     DirtyList      m_added;
-    mpi::IBVerbs  & m_ibverbs;
+    mpi::IBVerbsNoc & m_ibverbs;
     Communication & m_comm;
 #endif
 };
diff --git a/src/MPI/mesgqueue.cpp b/src/MPI/mesgqueue.cpp
index f81a618a..0c0f05f2 100644
--- a/src/MPI/mesgqueue.cpp
+++ b/src/MPI/mesgqueue.cpp
@@ -16,6 +16,7 @@
  */
 
 #include "mesgqueue.hpp"
+#include "ibverbs.hpp"
 #include "mpilib.hpp"
 #include "log.hpp"
 #include "assert.hpp"
@@ -103,13 +104,13 @@ MessageQueue :: MessageQueue( Communication & comm )
     , m_bodySends()
     , m_bodyRecvs()
     , m_comm( dynamic_cast<mpi::Comm &>(comm) )
+    , m_tinyMsgBuf( m_tinyMsgSize + largestHeader(m_nprocs, m_memRange, 0, 0))
 #if defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero
-    , m_ibverbs( m_comm )
+    , m_ibverbs(m_comm)
     , m_memreg( m_comm, m_ibverbs )
 #else
     , m_memreg( m_comm )
 #endif
-    , m_tinyMsgBuf( m_tinyMsgSize + largestHeader(m_nprocs, m_memRange, 0, 0))
 {
     m_memreg.reserve(1); // reserve slot for edgeBuffer
 }
@@ -243,6 +244,48 @@ err_t MessageQueue :: resizeMemreg( size_t nRegs )
     return LPF_SUCCESS;
 }
 
+
+memslot_t MessageQueue :: addNocReg( void * mem, std::size_t size)
+{
+    memslot_t slot = m_memreg.addNoc( mem, size );
+    ASSERT(slot != LPF_INVALID_MEMSLOT);
+    if (size > 0)
+        m_msgsort.addRegister( slot, static_cast<char *>( mem ), size);
+    return slot;
+}
+
+err_t MessageQueue :: serializeSlot(memslot_t slot, char ** mem, std::size_t * size)
+{
+    ASSERT(slot != LPF_INVALID_MEMSLOT);
+    ASSERT(mem != nullptr);
+    ASSERT(size != nullptr);
+#ifdef LPF_CORE_MPI_USES_zero
+    auto mr = m_ibverbs.getMR(m_memreg.getVerbID(slot), m_pid);
+    *size = mr.serialize(mem);
+    return LPF_SUCCESS;
+#else
+    LOG( 3, "Error: serialize slot is only implemented for zero engine at the moment.");
+    return LPF_ERR_FATAL;
+#endif
+
+}
+
+err_t MessageQueue :: deserializeSlot(char * mem, memslot_t slot)
+{
+    ASSERT(mem != nullptr);
+    ASSERT(slot != LPF_INVALID_MEMSLOT);
+#ifdef LPF_CORE_MPI_USES_zero
+    auto mr = mpi::MemoryRegistration::deserialize(mem);
+    m_ibverbs.setMR(m_memreg.getVerbID(slot), mr->_pid, *mr);
+    return LPF_SUCCESS;
+#else
+    LOG( 3, "Error: deserialize slot is only implemented for zero engine at the moment.");
+    return LPF_ERR_FATAL;
+#endif
+
+}
+
+
 memslot_t MessageQueue :: addLocalReg( void * mem, std::size_t size)
 {
     memslot_t slot = m_memreg.addLocal( mem, size );
@@ -324,6 +367,12 @@ void MessageQueue :: get( pid_t srcPid, memslot_t srcSlot, size_t srcOffset,
 void MessageQueue :: lockSlot( memslot_t srcSlot, size_t srcOffset,
         pid_t dstPid, memslot_t dstSlot, size_t dstOffset, size_t size )
 {
+    ASSERT(srcSlot != LPF_INVALID_MEMSLOT);
+    ASSERT(dstSlot != LPF_INVALID_MEMSLOT);
+    (void) srcOffset;
+    (void) dstOffset;
+    (void) dstPid;
+    (void) size;
 #ifdef LPF_CORE_MPI_USES_zero
 m_ibverbs.blockingCompareAndSwap(m_memreg.getVerbID(srcSlot), srcOffset, dstPid, m_memreg.getVerbID(dstSlot), dstOffset, size, 0ULL, 1ULL);
 #endif
@@ -332,6 +381,12 @@ m_ibverbs.blockingCompareAndSwap(m_memreg.getVerbID(srcSlot), srcOffset, dstPid,
 void MessageQueue :: unlockSlot( memslot_t srcSlot, size_t srcOffset,
         pid_t dstPid, memslot_t dstSlot, size_t dstOffset, size_t size )
 {
+    ASSERT(srcSlot != LPF_INVALID_MEMSLOT);
+    ASSERT(dstSlot != LPF_INVALID_MEMSLOT);
+    (void) srcOffset;
+    (void) dstOffset;
+    (void) dstPid;
+    (void) size;
 #ifdef LPF_CORE_MPI_USES_zero
 m_ibverbs.blockingCompareAndSwap(m_memreg.getVerbID(srcSlot), srcOffset, dstPid, m_memreg.getVerbID(dstSlot), dstOffset, size, 1ULL, 0ULL);
 #endif
@@ -389,6 +444,7 @@ int MessageQueue :: sync( bool abort )
 {
 #ifdef LPF_CORE_MPI_USES_zero
     // if not, deal with normal sync
+    (void) abort;
     m_memreg.sync();
 	m_ibverbs.sync(m_resized);
     m_resized = false;
@@ -1018,32 +1074,33 @@ int MessageQueue :: sync( bool abort )
 
 }
 
-int MessageQueue :: countingSyncPerSlot(SlotID slot, size_t expected_sent, size_t expected_rcvd)
+int MessageQueue :: countingSyncPerSlot(memslot_t slot, size_t expected_sent, size_t expected_rcvd)
 {
 
+    ASSERT(slot != LPF_INVALID_MEMSLOT);
+    (void) expected_sent;
+    (void) expected_rcvd;
 #ifdef LPF_CORE_MPI_USES_zero
 
     // if not, deal with normal sync
     m_memreg.sync();
-
-	m_ibverbs.countingSyncPerSlot(m_resized, slot, expected_sent, expected_rcvd);
-
+	m_ibverbs.countingSyncPerSlot(m_memreg.getVerbID(slot), expected_sent, expected_rcvd);
     m_resized = false;
 
+
 #endif
 	return 0;
 }
 
-int MessageQueue :: syncPerSlot(SlotID slot)
+int MessageQueue :: syncPerSlot(memslot_t slot)
 {
 
+    ASSERT(slot != LPF_INVALID_MEMSLOT);
 #ifdef LPF_CORE_MPI_USES_zero
 
     // if not, deal with normal sync
     m_memreg.sync();
-
-	m_ibverbs.syncPerSlot(m_resized, slot);
-
+	m_ibverbs.syncPerSlot(m_memreg.getVerbID(slot));
     m_resized = false;
 
 #endif
@@ -1051,28 +1108,41 @@ int MessageQueue :: syncPerSlot(SlotID slot)
 }
 
 
-void MessageQueue :: getRcvdMsgCountPerSlot(size_t * msgs, SlotID slot)
+void MessageQueue :: getRcvdMsgCountPerSlot(size_t * msgs, memslot_t slot)
 {
 
+    ASSERT(msgs != nullptr);
+    ASSERT(slot != LPF_INVALID_MEMSLOT);
 #ifdef LPF_CORE_MPI_USES_zero
     *msgs = 0;
-        m_ibverbs.get_rcvd_msg_count_per_slot(msgs, slot);
+    m_ibverbs.get_rcvd_msg_count_per_slot(msgs, m_memreg.getVerbID(slot));
 #endif
 }
 
 void MessageQueue :: getRcvdMsgCount(size_t * msgs)
 {
+    ASSERT(msgs != nullptr);
 #ifdef LPF_CORE_MPI_USES_zero
     *msgs = 0;
-        m_ibverbs.get_rcvd_msg_count(msgs);
+    m_ibverbs.get_rcvd_msg_count(msgs);
 #endif
 }
 
-void MessageQueue :: getSentMsgCountPerSlot(size_t * msgs, SlotID slot)
+void MessageQueue :: getSentMsgCount(size_t * msgs)
+{
+    ASSERT(msgs != nullptr);
+#ifdef LPF_CORE_MPI_USES_zero
+    *msgs = 0;
+    m_ibverbs.get_sent_msg_count(msgs);
+#endif
+}
+void MessageQueue :: getSentMsgCountPerSlot(size_t * msgs, memslot_t slot)
 {
+    ASSERT(msgs != nullptr);
+    ASSERT(slot != LPF_INVALID_MEMSLOT);
 #ifdef LPF_CORE_MPI_USES_zero
     *msgs = 0;
-        m_ibverbs.get_sent_msg_count_per_slot(msgs, slot);
+    m_ibverbs.get_sent_msg_count_per_slot(msgs, m_memreg.getVerbID(slot));
 #endif
 }
 
diff --git a/src/MPI/mesgqueue.hpp b/src/MPI/mesgqueue.hpp
index b4f1f796..198afa04 100644
--- a/src/MPI/mesgqueue.hpp
+++ b/src/MPI/mesgqueue.hpp
@@ -26,6 +26,9 @@
 #include "messagesort.hpp"
 #include "mpilib.hpp"
 #include "linkage.hpp"
+#if defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero
+#include "ibverbsNoc.hpp"
+#endif
 
 #if __cplusplus >= 201103L
 #include <memory>
@@ -33,12 +36,7 @@
 #include <tr1/memory>
 #endif
 
-#if defined LPF_CORE_MPI_USES_ibverbs || defined LPF_CORE_MPI_USES_zero
-#include "ibverbs.hpp"
-#endif
-
 //only for HiCR
-typedef size_t SlotID;
 
 namespace lpf {
 
@@ -53,7 +51,9 @@ class _LPFLIB_LOCAL MessageQueue
 
 
     memslot_t addLocalReg( void * mem, std::size_t size );
+
     memslot_t addGlobalReg( void * mem, std::size_t size );
+
     void      removeReg( memslot_t slot );
 
     void get( pid_t srcPid, memslot_t srcSlot, size_t srcOffset,
@@ -67,31 +67,36 @@ class _LPFLIB_LOCAL MessageQueue
     int sync( bool abort );
 
 //only for HiCR
-//#ifdef 
     void lockSlot( memslot_t srcSlot, size_t srcOffset,
             pid_t dstPid, memslot_t dstSlot, size_t dstOffset, size_t size );
 
     void unlockSlot( memslot_t srcSlot, size_t srcOffset,
 		    pid_t dstPid, memslot_t dstSlot, size_t dstOffset, size_t size );
 
-    void getRcvdMsgCountPerSlot(size_t * msgs, SlotID slot);
+    void getRcvdMsgCountPerSlot(size_t * msgs, memslot_t slot);
 
     void getRcvdMsgCount(size_t * msgs);
 
-    void getSentMsgCountPerSlot(size_t * msgs, SlotID slot);
+    void getSentMsgCountPerSlot(size_t * msgs, memslot_t slot);
+
+    void getSentMsgCount(size_t * msgs);
 
     void flushSent();
 
     void flushReceived();
 
-    int countingSyncPerSlot(SlotID slot, size_t expected_sent, size_t expected_rcvd);
+    int countingSyncPerSlot(memslot_t slot, size_t expected_sent, size_t expected_rcvd);
+
+    int syncPerSlot(memslot_t slot);
+    // NOC extensions
+    memslot_t addNocReg( void * mem, std::size_t size );
 
-    int syncPerSlot(SlotID slot);
+    err_t serializeSlot(memslot_t slot, char ** buff, std::size_t * buff_size);
+    err_t deserializeSlot(char * buff, memslot_t slot);
 // end only for HiCR
-//#endif
 
 private:
-    enum Msgs { BufPut , 
+    enum Msgs { BufPut ,
         BufGet, BufGetReply,
         HpPut, HpGet , HpBodyReply ,
         HpEdges, HpEdgesReply };
@@ -100,7 +105,7 @@ class _LPFLIB_LOCAL MessageQueue
         SrcPid, DstPid,
         SrcOffset, DstOffset, BufOffset,
         SrcSlot, DstSlot, Size,
-        RoundedDstOffset, RoundedSize, 
+        RoundedDstOffset, RoundedSize,
         Payload, Head, Tail};
 
     struct Edge {
@@ -160,11 +165,14 @@ class _LPFLIB_LOCAL MessageQueue
     std::vector< Body > m_bodySends;
     std::vector< Body > m_bodyRecvs;
     mpi::Comm m_comm;
-#if defined LPF_CORE_MPI_USES_ibverbs  || defined LPF_CORE_MPI_USES_zero
+    std::vector< char > m_tinyMsgBuf;
+protected:
+#if defined LPF_CORE_MPI_USES_ibverbs
     mpi::IBVerbs m_ibverbs;
+#elif defined LPF_CORE_MPI_USES_zero
+    mpi::IBVerbsNoc m_ibverbs;
 #endif
     MemoryTable m_memreg;
-    std::vector< char > m_tinyMsgBuf;
 };
 
 
diff --git a/src/common/memreg.hpp b/src/common/memreg.hpp
index f48d519c..9e6c4b87 100644
--- a/src/common/memreg.hpp
+++ b/src/common/memreg.hpp
@@ -211,6 +211,7 @@ class CombinedMemoryRegister
     void destroy() {
         m_local.destroy();
         m_global.destroy();
+        m_noc.destroy();
     }
 
     Slot addLocalReg( Record record )  // nothrow
@@ -218,6 +219,12 @@ class CombinedMemoryRegister
         return toLocal( m_local.add( record ) ); 
     } 
 
+    Slot addNocReg( Record record )  // nothrow
+    { 
+        Slot a = toNoc( m_noc.add( record ) ); 
+        return a;
+    } 
+
     Slot addGlobalReg( Record record ) // nothrow
     { 
         return toGlobal( m_global.add(record) ); 
@@ -227,24 +234,31 @@ class CombinedMemoryRegister
     {
         if (isLocalSlot(slot))
             m_local.remove( fromLocal(slot) ) ;
-        else
+        else if (isGlobalSlot(slot))
             m_global.remove( fromGlobal( slot ) );
+        else 
+            m_noc.remove(fromNoc( slot ) );
      }
 
     const Record & lookup( Slot slot ) const // nothrow
     {
         if (isLocalSlot(slot))
             return m_local.lookup( fromLocal(slot));
-        else
+        else if (isGlobalSlot(slot))
             return m_global.lookup( fromGlobal( slot ));
+        else {// isNocSlot(slot) == true
+            return m_noc.lookup( fromNoc( slot ));
+        }
     }
 
     Record & update( Slot slot ) // nothrow
     {
         if (isLocalSlot(slot))
             return m_local.update( fromLocal(slot));
-        else
+        else if (isGlobalSlot(slot))
             return m_global.update( fromGlobal( slot ));
+        else // noc Slot
+            return m_noc.update(fromNoc(slot));
     }
 
     void reserve( size_t size, const Record & defaultRecord = Record() )
@@ -252,36 +266,50 @@ class CombinedMemoryRegister
     {
         m_global.reserve( size, defaultRecord );
         m_local.reserve( size, defaultRecord );
+        m_noc.reserve( size, defaultRecord );
     }
 
     size_t capacity( ) const
     { 
-        return std::min( m_global.capacity(), m_local.capacity() );
+        return std::min(std::min( m_global.capacity(), m_local.capacity()), m_noc.capacity() );
     }
 
     size_t range() const
     {
-        return std::max( 2*m_global.capacity(), 2*m_local.capacity()+1 );
+        return std::max(std::max( 3*m_global.capacity(), 3*m_local.capacity()+1), 3*m_noc.capacity()+2);
     }
 
     static bool isLocalSlot( Slot slot ) 
-    { return slot % 2 == 1; }
+    { return slot % 3 == 1; }
+
+    static bool isGlobalSlot( Slot slot ) 
+    { return slot % 3 == 0; }
+
+    static bool isNocSlot( Slot slot ) 
+    { return slot % 3 == 2; }
 
 private:
     static Slot fromGlobal( Slot slot )
-    { return slot / 2; }
+    { return slot / 3; }
 
     static Slot fromLocal( Slot slot )
-    { return (slot - 1) / 2; }
+    { return (slot - 1) / 3; }
+
+    static Slot fromNoc( Slot slot )
+    { return (slot - 2) / 3; }
 
     static Slot toGlobal( Slot slot )
-    { return 2*slot; }
+    { return 3*slot; }
 
     static Slot toLocal( Slot slot )
-    { return 2*slot + 1; }
+    { return 3*slot + 1; }
+
+    static Slot toNoc( Slot slot )
+    { return 3*slot + 2; }
 
     MemoryRegister<Record> m_local;
     MemoryRegister<Record> m_global;
+    MemoryRegister<Record> m_noc;
 };
 
 } // namespace lpf
diff --git a/src/debug/CMakeLists.txt b/src/debug/CMakeLists.txt
index 28af2937..7f3f9c92 100644
--- a/src/debug/CMakeLists.txt
+++ b/src/debug/CMakeLists.txt
@@ -25,8 +25,8 @@ add_library( ${libname}
     rwconflict.cpp
     $<TARGET_OBJECTS:${comlib}>
 )
-target_link_libraries( ${libname} ${LIB_POSIX_THREADS})
-target_include_directories( ${libname} PRIVATE ${MPI_C_INCLUDE_PATH})
+target_link_libraries(${libname} ${LIB_POSIX_THREADS})
+target_include_directories(${libname} PRIVATE ${MPI_C_INCLUDE_PATH})
 set_target_properties(${libname} PROPERTIES 
         SOVERSION ${SOVERSION}
 )
@@ -35,6 +35,7 @@ install(TARGETS ${libname} EXPORT lpf
         RUNTIME DESTINATION ${INSTALL_BIN}
         LIBRARY DESTINATION ${INSTALL_LIB}
         ARCHIVE DESTINATION ${INSTALL_LIB}
-       )
+)
 
-add_gtest(rwconflict_test "pthread" ON rwconflict.t.cpp rwconflict.cpp)
+add_gtest(rwconflict_test "pthread" rwconflict.t.cpp rwconflict.cpp)
+   #$<TARGET_OBJECTS:${comlib}> )
diff --git a/src/debug/core.cpp b/src/debug/core.cpp
index 00f025f6..c3d0adec 100644
--- a/src/debug/core.cpp
+++ b/src/debug/core.cpp
@@ -16,6 +16,7 @@
  */
 
 #include "debug/lpf/core.h"
+#include "lpf/abort.h"
 
 #undef lpf_get
 #undef lpf_put
@@ -29,12 +30,6 @@
 #undef lpf_exec
 #undef lpf_hook
 #undef lpf_rehook
-#undef lpf_abort
-#undef lpf_get_rcvd_msg_count
-#undef lpf_get_rcvd_msg_count_per_slot
-#undef lpf_get_sent_msg_count_per_slot
-#undef lpf_flush
-#undef lpf_abort
 
 #undef lpf_init_t
 #undef lpf_pid_t
@@ -62,6 +57,7 @@
 #undef LPF_NONE
 #undef LPF_INIT_NONE
 #undef LPF_NO_ARGS
+#undef LPF_HAS_ABORT
 
 #if __cplusplus >= 201103L
 #include <unordered_map>
@@ -105,9 +101,21 @@ class _LPFLIB_LOCAL Interface {
     }
 
     static void threadInit() {
+        // Below we use std::abort because these are critical *internal*
+        // errors, not errors in the use of LPF core functionality.
+        // By contrast, errors that arise from misuse of the LPF core primitives
+        // should call lpf_abort. This initialiser ensures that the underlying LPF
+        // engine supports lpf_abort.
+        // This rule about when to call std::abort and when to call lpf_abort is
+        // applied consistently in the implementation below. Only (seeming)
+        // exceptions to it will be documented henceforth.
         int rc = pthread_key_create( &s_threadKeyCtxStore, &destroyCtxStore );
         if (rc) {
-            LOG( 0, "Internal error while initializing thread static storage");
+            LOG( 0, "Internal error while initializing thread static storage" );
+            std::abort();
+        }
+        if( ! LPF_HAS_ABORT ) {
+            LOG( 0, "Debug layer relies on lpf_abort, but selected engine does not support it" );
             std::abort();
         }
     }
@@ -491,6 +499,13 @@ class _LPFLIB_LOCAL Interface {
     static lpf_err_t hook( const char * file, int line,
             lpf_init_t init, lpf_spmd_t spmd, lpf_args_t args )
     {
+        // A call to lpf_hook may come from any non-LPF context -- this is in fact
+        // why it exists: hooking from within an LPF context to create a subcontext is
+        // provided by lpf_rehook instead.
+        // Because the calling context is potentially not controlled by the underlying
+        // LPF engine, and because the calling context in the non-trivial case consists
+        // of multiple distributed processes, we cannot rely on lpf_abort. The only
+        // thing we can do is rely on the standard abort.
         if ( spmd == NULL ) {
             LOG( 0, file << ":" << line
                     << ": Invalid argument passed to lpf_hook: NULL spmd argument" );
@@ -703,18 +718,6 @@ class _LPFLIB_LOCAL Interface {
         return LPF_SUCCESS;
     }
 
-    lpf_err_t get_rcvd_msg_count_per_slot(size_t *msgs, lpf_memslot_t slot) {
-        return LPF_SUCCESS;
-    }
-
-    lpf_err_t get_sent_msg_count_per_slot(size_t *msgs, lpf_memslot_t slot) {
-        return LPF_SUCCESS;
-    }
-
-    lpf_err_t get_rcvd_msg_count(size_t *msgs) {
-        return LPF_SUCCESS;
-    }
-
     lpf_err_t register_local( const char * file, int line,
             void * pointer, size_t size, lpf_memslot_t * memslot )
     {
@@ -1023,7 +1026,6 @@ class _LPFLIB_LOCAL Interface {
         return LPF_SUCCESS;
     }
 
-
     lpf_err_t abort(const char * file, int line) {
         (void) file;
         (void) line;
diff --git a/src/hybrid/CMakeLists.txt b/src/hybrid/CMakeLists.txt
index c2a87b14..ea1a3885 100644
--- a/src/hybrid/CMakeLists.txt
+++ b/src/hybrid/CMakeLists.txt
@@ -20,8 +20,10 @@ if (HYBRID_ENGINE_ENABLED)
 set(LPF_IMPL_ID hybrid)
 set(LPF_IMPL_CONFIG ${LPFLIB_CONFIG_NAME})
 
-set(LPFLIB_HYBRID_MPI_ENGINE "ibverbs" CACHE STRING
-        "Choice of MPI engine to use for inter-process communication")
+if( NOT LPFLIB_HYBRID_MPI_ENGINE ) # rejects an empty value as well as an undefined variable
+    message( FATAL_ERROR "Hybrid engine is enabled but no inter-node engine was selected" )
+endif()
+
 set(mpi_engine  "${LPFLIB_HYBRID_MPI_ENGINE}" )
 message( STATUS "Hybrid implementation's multi-node layer is '${mpi_engine}'")
 
diff --git a/src/hybrid/core.cpp b/src/hybrid/core.cpp
index 39226a18..16b738d6 100644
--- a/src/hybrid/core.cpp
+++ b/src/hybrid/core.cpp
@@ -37,6 +37,10 @@
 
 extern "C" {
 
+// the value 2 in this implementation indicates support for lpf_abort in a way
+// that may deviate from the stdlib abort()
+_LPFLIB_VAR const int LPF_HAS_ABORT = 2;
+
 _LPFLIB_VAR const lpf_err_t LPF_SUCCESS = 0;
 _LPFLIB_VAR const lpf_err_t LPF_ERR_OUT_OF_MEMORY = 1;
 _LPFLIB_VAR const lpf_err_t LPF_ERR_FATAL = 2;
@@ -414,11 +418,15 @@ _LPFLIB_API lpf_err_t lpf_get_rcvd_msg_count( lpf_t ctx, size_t * rcvd_msgs)
 
 _LPFLIB_API lpf_err_t lpf_get_rcvd_msg_count_per_slot( lpf_t ctx, size_t * rcvd_msgs, lpf_memslot_t slot )
 {
+
     using namespace lpf::hybrid;
+    if (ctx == LPF_SINGLE_PROCESS)
+        return LPF_SUCCESS;
     ThreadState * t = realContext(ctx);
-    MPI mpi = t->nodeState().mpi();
-    mpi.abort();
-    return LPF_SUCCESS;
+    if (!t->error())
+        return t->getRcvdMsgCountPerSlot(rcvd_msgs, slot);
+    else
+        return LPF_SUCCESS;
 }
 
 _LPFLIB_API lpf_err_t lpf_get_sent_msg_count_per_slot( lpf_t ctx, size_t * sent_msgs, lpf_memslot_t slot )
@@ -428,7 +436,19 @@ _LPFLIB_API lpf_err_t lpf_get_sent_msg_count_per_slot( lpf_t ctx, size_t * sent_
         return LPF_SUCCESS;
     ThreadState * t = realContext(ctx);
     if (!t->error())
-        return t->getSentMsgCount(sent_msgs, slot);
+        return t->getSentMsgCountPerSlot(sent_msgs, slot);
+    else
+        return LPF_SUCCESS;
+}
+
+_LPFLIB_API lpf_err_t lpf_get_sent_msg_count( lpf_t ctx, size_t * sent_msgs)
+{
+    using namespace lpf::hybrid;
+    if (ctx == LPF_SINGLE_PROCESS)
+        return LPF_SUCCESS;
+    ThreadState * t = realContext(ctx);
+    if (!t->error())
+        return t->getSentMsgCount(sent_msgs);
     else
         return LPF_SUCCESS;
 }
diff --git a/src/hybrid/dispatch.hpp b/src/hybrid/dispatch.hpp
index 2dc83c2b..e9f6b6b6 100644
--- a/src/hybrid/dispatch.hpp
+++ b/src/hybrid/dispatch.hpp
@@ -19,23 +19,29 @@
 #define LPF_CORE_HYBRID_DISPATCH_HPP
 
 #undef LPFLIB_CORE_H
+#undef LPFLIB_ABORT_H
 #define LPF_CORE_STATIC_DISPATCH
 #define LPF_CORE_STATIC_DISPATCH_ID pthread
 #define LPF_CORE_STATIC_DISPATCH_CONFIG LPF_CORE_IMPL_CONFIG
 #include <lpf/core.h>
+#include <lpf/abort.h>
 #undef LPF_CORE_STATIC_DISPATCH_ID
 #undef LPF_CORE_STATIC_DISPATCH_CONFIG
 
 #undef LPFLIB_CORE_H
+#undef LPFLIB_ABORT_H
 #define LPF_CORE_STATIC_DISPATCH_ID LPF_CORE_MULTI_NODE_ENGINE
 #define LPF_CORE_STATIC_DISPATCH_CONFIG LPF_CORE_IMPL_CONFIG
 #include <lpf/core.h>
+#include <lpf/abort.h>
 #undef LPF_CORE_STATIC_DISPATCH_ID
 #undef LPF_CORE_STATIC_DISPATCH_CONFIG
 
 #undef LPFLIB_CORE_H
+#undef LPFLIB_ABORT_H
 #undef LPF_CORE_STATIC_DISPATCH
 #include <lpf/core.h>
+#include <lpf/abort.h>
 
 #define USE_THREAD( symbol ) \
        LPF_RENAME_PRIMITIVE4( lpf, pthread, LPF_CORE_IMPL_CONFIG, symbol )
@@ -121,6 +127,9 @@ namespace lpf { namespace hybrid {
         err_t get_rcvd_msg_count( size_t * rcvd_msgs) 
         { return USE_THREAD( get_rcvd_msg_count)(m_ctx, rcvd_msgs); }
 
+        err_t get_sent_msg_count( size_t * sent_msgs) 
+        { return USE_THREAD( get_sent_msg_count)(m_ctx, sent_msgs); }
+
         err_t flush_sent()
         { return USE_THREAD(flush_sent)(m_ctx); }
 
@@ -223,15 +232,18 @@ namespace lpf { namespace hybrid {
         err_t deregister( memslot_t memslot) 
         { return USE_MPI( deregister)(m_ctx, memslot); }
 
+        err_t get_rcvd_msg_count( size_t * rcvd_msgs) 
+        { return USE_MPI( get_rcvd_msg_count)(m_ctx, rcvd_msgs); }
+
         err_t get_rcvd_msg_count_per_slot(size_t *rcvd_msgs, lpf_memslot_t slot) 
         { return USE_MPI( get_rcvd_msg_count_per_slot)( m_ctx, rcvd_msgs, slot); }
 
+        err_t get_sent_msg_count( size_t * sent_msgs) 
+        { return USE_MPI( get_sent_msg_count)(m_ctx, sent_msgs); }
+
         err_t get_sent_msg_count_per_slot(size_t *sent_msgs, lpf_memslot_t slot) 
         { return USE_MPI( get_sent_msg_count_per_slot)( m_ctx, sent_msgs, slot); }
 
-        err_t get_rcvd_msg_count( size_t * rcvd_msgs) 
-        { return USE_MPI( get_rcvd_msg_count)(m_ctx, rcvd_msgs); }
-
         err_t flush_sent()
         {return USE_MPI( flush_sent)(m_ctx);}
 
diff --git a/src/hybrid/state.hpp b/src/hybrid/state.hpp
index 06e8faf3..f890be6b 100644
--- a/src/hybrid/state.hpp
+++ b/src/hybrid/state.hpp
@@ -111,13 +111,6 @@ class _LPFLIB_LOCAL NodeState {
         return m_mpi.sync();
     }
 
-//    MPI::err_t counting_sync_per_slot(lpf_memslot_t slot, size_t expected_sent, size_t expected_rcvd) 
-//    {
-//        m_memreg.flush( m_mpi );
-//        m_msgQueue.flush( m_mpi, m_memreg );
-//        return m_mpi.counting_sync_per_slot(slot, expected_sent, expected_rcvd);
-//    }
-
     static double messageGap( lpf_pid_t nprocs, size_t minMsgSize, lpf_sync_attr_t attr)
     {
         (void) nprocs;
@@ -422,14 +415,19 @@ class _LPFLIB_LOCAL ThreadState {
 
     bool error() const { return m_error; }
 
-    lpf_pid_t getRcvdMsgCount(size_t * rcvd_msgs, lpf_memslot_t slot) {
+    lpf_pid_t getRcvdMsgCountPerSlot(size_t * rcvd_msgs, lpf_memslot_t slot) {
 
         return m_nodeState.mpi().get_rcvd_msg_count_per_slot(rcvd_msgs, slot);
     }
 
-    lpf_pid_t getSentMsgCount(size_t * sent_msgs, lpf_memslot_t slot) {
+    lpf_pid_t getSentMsgCountPerSlot(size_t * sent_msgs, lpf_memslot_t slot) {
+        // forward to the multi-node layer's per-slot sent-message counter
+        return m_nodeState.mpi().get_sent_msg_count_per_slot(sent_msgs, slot);
+    }
+
+    lpf_pid_t getSentMsgCount(size_t * sent_msgs) {
 
-        return m_nodeState.mpi().get_sent_msg_count_per_slot(sent_msgs, slot);
+        return m_nodeState.mpi().get_sent_msg_count(sent_msgs);
     }
 
     lpf_pid_t getRcvdMsgCount(size_t * rcvd_msgs) {
diff --git a/src/imp/core.c b/src/imp/core.c
index 7b4c3db2..994a18fd 100644
--- a/src/imp/core.c
+++ b/src/imp/core.c
@@ -16,12 +16,15 @@
  */
 
 #include <lpf/core.h>
+#include <lpf/noc.h>
 
 #include <limits.h>
 #include <stddef.h>
 #include <string.h>
 #include <stdint.h>
 
+const int LPF_HAS_ABORT = 0;
+
 const lpf_err_t LPF_SUCCESS = 0;
 const lpf_err_t LPF_ERR_OUT_OF_MEMORY = 1;
 const lpf_err_t LPF_ERR_FATAL = 2;
@@ -141,6 +144,9 @@ lpf_err_t lpf_counting_sync_per_slot( lpf_t lpf, lpf_sync_attr_t attr, lpf_memsl
 {
     (void) lpf;
     (void) attr; 
+    (void) slot;
+    (void) expected_sent;
+    (void) expected_rcvd;
     return LPF_SUCCESS;
 }
 
@@ -154,6 +160,15 @@ lpf_err_t lpf_lock_slot(
     size_t size,
     lpf_msg_attr_t attr
 ) {
+
+    (void) ctx;
+    (void) src_slot;
+    (void) src_offset;
+    (void) dst_pid;
+    (void) dst_slot;
+    (void) dst_offset;
+    (void) size;
+    (void) attr;
 	return LPF_SUCCESS;
 }
 
@@ -167,6 +182,14 @@ lpf_err_t lpf_unlock_slot(
     size_t size,
     lpf_msg_attr_t attr
 ) {
+    (void) ctx;
+    (void) src_slot;
+    (void) src_offset;
+    (void) dst_pid;
+    (void) dst_slot;
+    (void) dst_offset;
+    (void) size;
+    (void) attr;
 	return LPF_SUCCESS;
 }
 
@@ -215,17 +238,26 @@ lpf_err_t lpf_resize_memory_register( lpf_t lpf, size_t max_regs )
 lpf_err_t lpf_get_rcvd_msg_count_per_slot( lpf_t lpf, size_t * rcvd_msgs, lpf_memslot_t slot) {
     (void) lpf;
     *rcvd_msgs = 0;
+    (void) slot;
     return LPF_SUCCESS;
 }
 
 lpf_err_t lpf_get_rcvd_msg_count( lpf_t lpf, size_t * rcvd_msgs) {
     (void) lpf;
+    *rcvd_msgs = 0;
+    return LPF_SUCCESS;
+}
+
+lpf_err_t lpf_get_sent_msg_count( lpf_t lpf, size_t * sent_msgs) {
+    (void) lpf;
+    *sent_msgs = 0;
     return LPF_SUCCESS;
 }
 
 lpf_err_t lpf_get_sent_msg_count_per_slot( lpf_t lpf, size_t * sent_msgs, lpf_memslot_t slot) {
     (void) lpf;
     *sent_msgs = 0;
+    (void) slot;
     return LPF_SUCCESS;
 }
 
@@ -239,3 +271,79 @@ lpf_err_t lpf_abort( lpf_t lpf)
     (void) lpf;
     return LPF_SUCCESS;
 }
+
+lpf_err_t lpf_noc_resize_memory_register( lpf_t ctx, size_t max_regs ) 
+{
+    (void) ctx;
+    (void) max_regs;
+    return LPF_SUCCESS;
+}
+
+lpf_err_t lpf_noc_register(
+    lpf_t ctx,
+    void * pointer,
+    size_t size,
+    lpf_memslot_t * memslot
+) 
+{
+    (void) ctx;
+    (void) pointer;
+    (void) size;
+    (void) memslot;
+    return LPF_SUCCESS;
+}
+
+lpf_err_t lpf_noc_deregister(
+    lpf_t ctx,
+    lpf_memslot_t memslot
+) 
+{
+    (void) ctx;
+    (void) memslot;
+    return LPF_SUCCESS;
+}
+
+lpf_err_t lpf_noc_put(
+    lpf_t ctx,
+    lpf_memslot_t src_slot,
+    size_t src_offset,
+    lpf_pid_t dst_pid,
+    lpf_memslot_t dst_slot,
+    size_t dst_offset,
+    size_t size,
+    lpf_msg_attr_t attr
+)
+{
+    (void) ctx;
+    (void) src_slot;
+    (void) src_offset;
+    (void) dst_pid;
+    (void) dst_slot;
+    (void) dst_offset;
+    (void) size;
+    (void) attr;
+    return LPF_SUCCESS;
+}
+
+lpf_err_t lpf_noc_get(
+    lpf_t ctx,
+    lpf_pid_t src_pid,
+    lpf_memslot_t src_slot,
+    size_t src_offset,
+    lpf_memslot_t dst_slot,
+    size_t dst_offset,
+    size_t size,
+    lpf_msg_attr_t attr
+)
+{
+    (void) ctx;
+    (void) src_pid;
+    (void) src_slot;
+    (void) src_offset;
+    (void) dst_slot;
+    (void) dst_offset;
+    (void) size;
+    (void) attr;
+
+    return LPF_SUCCESS;
+}
diff --git a/src/pthreads/barrier.cpp b/src/pthreads/barrier.cpp
index cacfbbf8..92442474 100644
--- a/src/pthreads/barrier.cpp
+++ b/src/pthreads/barrier.cpp
@@ -82,9 +82,9 @@ namespace {
         {
 #ifdef VALGRIND_MEMCHECK        
 #ifdef LPF_ON_MACOS
-            pthread_yield_np();
+            sched_yield(); // POSIX call, also available on macOS; sched_yield_np() does not exist
 #else
-            pthread_yield(); // allow other processes to progress
+            sched_yield(); // allow other processes to progress
 #endif
 #endif
         }
@@ -145,7 +145,7 @@ namespace {
             if (m_available )
                 _mm_mwait(0, 0);
             else 
-                pthread_yield();
+                sched_yield();
         }
 
         bool m_available;
@@ -160,9 +160,9 @@ namespace {
         void pause() 
         { 
 #ifdef LPF_ON_MACOS
-            pthread_yield_np();
+            sched_yield(); // POSIX call, also available on macOS; sched_yield_np() does not exist
 #else
-            if (pthread_yield()) {
+            if (sched_yield()) {
                 LOG(2, "While waiting, the Posix thread library failed to "
                        "yield the CPU to the OS" );
             }
diff --git a/src/pthreads/core.cpp b/src/pthreads/core.cpp
index 763d9a44..38799e9a 100644
--- a/src/pthreads/core.cpp
+++ b/src/pthreads/core.cpp
@@ -17,6 +17,7 @@
 
 #include <lpf/core.h>
 #include <lpf/pthread.h>
+#include <lpf/abort.h>
 
 #include "threadlocaldata.hpp"
 #include "machineparams.hpp"
@@ -37,6 +38,10 @@
 
 #include <pthread.h> // for pthreads
 
+// the value 2 in this implementation indicates support for lpf_abort in a way
+// that may deviate from the stdlib abort()
+const int LPF_HAS_ABORT = 2;
+
 const lpf_err_t LPF_SUCCESS = 0;
 const lpf_err_t LPF_ERR_OUT_OF_MEMORY = 1;
 const lpf_err_t LPF_ERR_FATAL = 2;
@@ -386,6 +391,7 @@ lpf_err_t lpf_resize_memory_register( lpf_t ctx, size_t max_regs )
 }
 
 lpf_err_t lpf_get_rcvd_msg_count_per_slot(lpf_t ctx, size_t * msgs, lpf_memslot_t slot) {
+    (void) slot;
     *msgs = 0;
     lpf::ThreadLocalData * t = realCtx(ctx);
     if (t->isAborted())
@@ -403,6 +409,15 @@ lpf_err_t lpf_get_rcvd_msg_count(lpf_t ctx, size_t * msgs) {
 }
 
 lpf_err_t lpf_get_sent_msg_count_per_slot(lpf_t ctx, size_t * msgs, lpf_memslot_t slot) {
+    *msgs = 0;
+    (void) slot;
+    lpf::ThreadLocalData * t = realCtx(ctx);
+    if (t->isAborted())
+        return LPF_SUCCESS;
+    return LPF_SUCCESS;
+}
+
+lpf_err_t lpf_get_sent_msg_count(lpf_t ctx, size_t * msgs) {
     *msgs = 0;
     lpf::ThreadLocalData * t = realCtx(ctx);
     if (t->isAborted())
diff --git a/src/pthreads/threadlocaldata.cpp b/src/pthreads/threadlocaldata.cpp
index 6a62e4d3..1923b272 100644
--- a/src/pthreads/threadlocaldata.cpp
+++ b/src/pthreads/threadlocaldata.cpp
@@ -442,6 +442,10 @@ err_t ThreadLocalData ::  sync( bool expectExit)
 }
 
 err_t ThreadLocalData :: countingSyncPerSlot(bool expectExit,  lpf_memslot_t slot, size_t expected_sent, size_t expected_rcvd) {
+    (void) expectExit;
+    (void) slot;
+    (void) expected_sent;
+    (void)  expected_rcvd;
     return LPF_SUCCESS;
 }
 
diff --git a/test_launcher.py.in b/test_launcher.py.in
new file mode 100644
index 00000000..656e570f
--- /dev/null
+++ b/test_launcher.py.in
@@ -0,0 +1,52 @@
+import argparse
+import subprocess
+import sys
+
+# Launcher for LPF tests: runs the test command through the parallel launcher
+# once per process count in [min_process_count, max_process_count] and checks
+# that every run exits with the expected return code.
+parser = argparse.ArgumentParser( description='Death test launcher' )
+parser.add_argument("-e", "--engine", type=str)
+parser.add_argument("-L", "--parallel_launcher", type=str)
+parser.add_argument("-p", "--min_process_count", type=int)
+parser.add_argument("-P", "--max_process_count", type=int)
+parser.add_argument("-t", "--lpf_probe_timer", type=float)
+parser.add_argument("-R", "--expected_return_code", type=int)
+parser.add_argument( 'cmd', nargs=argparse.REMAINDER )
+args = parser.parse_args()
+
+# Without a test command there is nothing to launch; fail with a usage error
+# rather than an IndexError further down.
+if not args.cmd:
+    parser.error("no test command given")
+
+# Human-readable name of the test for the log messages below
+test_name = " ".join(args.cmd[:2])
+
+# This is only for passing Gtest info to CMake
+# The parallel launcher is still needed as Open MPI
+# binaries terminate without the launcher on our cluster,
+# even for single process runs
+if args.cmd[-1] == '--gtest_list_tests':
+    run_cmd = [args.parallel_launcher, '-engine', args.engine, '-n', '1'] + args.cmd
+    cmd = subprocess.run( run_cmd)
+    sys.exit(cmd.returncode)
+# Actual use of our launcher
+else:
+    if None in (args.min_process_count, args.max_process_count, args.expected_return_code):
+        parser.error("-p, -P and -R are required to run a test")
+    for i in range(args.min_process_count, args.max_process_count+1):
+        # The probe timer is optional: an absent or non-positive value means no -probe flag
+        if args.lpf_probe_timer is not None and args.lpf_probe_timer > 0.0:
+            run_cmd = [args.parallel_launcher, '-engine', args.engine, '-probe', str(args.lpf_probe_timer), '-n', str(i)] + args.cmd
+        else:
+            run_cmd = [args.parallel_launcher, '-engine', args.engine, '-n', str(i)] + args.cmd
+        print("Run command: ")
+        print(run_cmd)
+        cmd = subprocess.run( run_cmd)
+        print("Test returned code = " + str(cmd.returncode))
+        retcode = cmd.returncode
+        if (retcode != args.expected_return_code):
+            print("Test " + test_name + "\nreturned\t" + str(retcode) + "\nexpected return code was: " + str(args.expected_return_code))
+            sys.exit(1)
+    print("Test " + test_name + " passed")
diff --git a/tests/functional/CMakeLists.txt b/tests/functional/CMakeLists.txt
index 11e4baf1..04ebf85d 100644
--- a/tests/functional/CMakeLists.txt
+++ b/tests/functional/CMakeLists.txt
@@ -77,26 +77,20 @@ set(test_sources
     func_lpf_exec_single_call_single_arg_single_proc.cpp
     func_lpf_get_parallel_alltoall.cpp
     func_lpf_get_parallel_huge.cpp
-    func_lpf_get_parallel_single.cpp
-    #func_lpf_hook_simple.mpirma.cpp
-    #func_lpf_hook_simple.pthread.cpp
-    #func_lpf_hook_subset.mpimsg.cpp
-    #func_lpf_hook_tcp.mpirma.cpp
-    #func_lpf_hook_tcp_timeout.mpirma.cpp
-    #func_lpf_put_parallel_bad_pattern.cpp <= in exception_list
-    func_lpf_put_and_get_overlapping.cpp
     func_lpf_get_parallel_overlapping_complete.cpp
-    func_lpf_put_parallel_overlapping_complete.cpp
     func_lpf_get_parallel_overlapping_pyramid.cpp
-    func_lpf_put_parallel_overlapping_pyramid.cpp
     func_lpf_get_parallel_overlapping_rooftiling.cpp
-    func_lpf_put_parallel_overlapping_rooftiling.cpp
+    func_lpf_get_parallel_single.cpp
     func_lpf_probe_parallel_full.cpp
     func_lpf_probe_parallel_nested.cpp
     func_lpf_probe_root.cpp
+    func_lpf_put_and_get_overlapping.cpp
     func_lpf_put_parallel_alltoall.cpp
     func_lpf_put_parallel_big.cpp
     func_lpf_put_parallel_huge.cpp
+    func_lpf_put_parallel_overlapping_complete.cpp
+    func_lpf_put_parallel_overlapping_pyramid.cpp
+    func_lpf_put_parallel_overlapping_rooftiling.cpp
     func_lpf_put_parallel_single.cpp
     func_lpf_register_and_deregister_irregularly.cpp
     func_lpf_register_and_deregister_many_global.cpp
@@ -142,21 +136,39 @@ foreach (LPF_IMPL_ID ${ENGINES})
         string(REGEX MATCH "overlapping|early|bsplib" foundTest ${testSource})
         if (NOT ${LPF_IMPL_ID} STREQUAL "zero") 
             add_gtest(${exeName} ${LPF_IMPL_ID} ${debug} "${CMAKE_CURRENT_SOURCE_DIR}/${testSource}")
-
-            string(REGEX REPLACE "(.${LPF_IMPL_ID})?.cpp$" "" baseName ${testSource})
-            get_filename_component(baseName ${testSource} NAME_WE  )
-            set(exeName "${baseName}_${LPF_IMPL_ID}_${LPF_IMPL_CONFIG}${mode}")
         elseif ("${foundTest}" STREQUAL "")
             add_gtest(${exeName} ${LPF_IMPL_ID} ${debug} "${CMAKE_CURRENT_SOURCE_DIR}/${testSource}")
-
-            string(REGEX REPLACE "(.${LPF_IMPL_ID})?.cpp$" "" baseName ${testSource})
-            get_filename_component(baseName ${testSource} NAME_WE  )
-            set(exeName "${baseName}_${LPF_IMPL_ID}_${LPF_IMPL_CONFIG}${mode}")
         endif()
 
     endforeach(testSource)
+
 endforeach(LPF_IMPL_ID)
 
+# Individual test for NOC (Non-coherence) protocol, only for zero engine
+# (part of HiCR project)
+set(LPF_IMPL_CONFIG ${LPFLIB_CONFIG_NAME})
+set(mode "")
+set(exeName "func_lpf_test_noc_ring_zero_${LPF_IMPL_CONFIG}${mode}")
+add_gtest(${exeName} "zero" OFF "${CMAKE_CURRENT_SOURCE_DIR}/func_lpf_test_noc_ring.cpp")
+
+# start of engine-specific tests
+foreach (LPF_IMPL_ID ${ENGINES})
+    if ("${LPF_IMPL_ID}" STREQUAL "pthread" OR "${LPF_IMPL_ID}" STREQUAL "mpirma")
+        foreach(testSource func_lpf_hook_simple.${LPF_IMPL_ID}.cpp)
+            # NAME_WE drops everything from the first dot on, i.e. the ".<engine>.cpp" suffix
+            get_filename_component(baseName ${testSource} NAME_WE  )
+            set(exeName "${baseName}_${LPF_IMPL_ID}_${LPF_IMPL_CONFIG}${mode}")
+            add_gtest(${exeName} ${LPF_IMPL_ID} ON "${CMAKE_CURRENT_SOURCE_DIR}/${testSource}")
+        endforeach(testSource)
+    endif()
+    if ("${LPF_IMPL_ID}" STREQUAL "mpimsg")
+        add_gtest(func_lpf_hook_subset.mpimsg mpimsg ON "${CMAKE_CURRENT_SOURCE_DIR}/func_lpf_hook_subset.mpimsg.cpp")
+    endif()
+    if ("${LPF_IMPL_ID}" STREQUAL "mpirma")
+        add_gtest(func_lpf_hook_tcp_timeout.mpirma mpirma ON "${CMAKE_CURRENT_SOURCE_DIR}/func_lpf_hook_tcp_timeout.mpirma.cpp")
+    endif()
+endforeach(LPF_IMPL_ID)
+# end of engine-specific tests
 
 include_directories(.)
 add_subdirectory(debug)
diff --git a/tests/functional/collectives/CMakeLists.txt b/tests/functional/collectives/CMakeLists.txt
index 463b4de5..80aaa137 100644
--- a/tests/functional/collectives/CMakeLists.txt
+++ b/tests/functional/collectives/CMakeLists.txt
@@ -17,21 +17,21 @@
 
 
 set(c99_tests_sources
-func_lpf_allcombine.cpp
-func_lpf_allgather.cpp
-func_lpf_allgather_overlapped.cpp
-func_lpf_allreduce.cpp
-func_lpf_alltoall.cpp
-func_lpf_broadcast.cpp
-func_lpf_broadcast_prime_size_object.cpp
-func_lpf_broadcast_small_prime_size_object.cpp
-func_lpf_collectives_init.cpp
-func_lpf_collectives_init_overflow.cpp
-func_lpf_combine.cpp
-func_lpf_gather.cpp
-func_lpf_reduce.cpp
-func_lpf_scatter.cpp
-func_lpf_zero_cost.cpp
+    func_lpf_allcombine.cpp
+    func_lpf_allgather.cpp
+    func_lpf_allgather_overlapped.cpp
+    func_lpf_allreduce.cpp
+    func_lpf_alltoall.cpp
+    func_lpf_broadcast.cpp
+    func_lpf_broadcast_prime_size_object.cpp
+    func_lpf_broadcast_small_prime_size_object.cpp
+    func_lpf_collectives_init.cpp
+    func_lpf_collectives_init_overflow.cpp
+    func_lpf_combine.cpp
+    func_lpf_gather.cpp
+    func_lpf_reduce.cpp
+    func_lpf_scatter.cpp
+    func_lpf_zero_cost.cpp
 )
 
 foreach (LPF_IMPL_ID ${ENGINES})
@@ -51,9 +51,5 @@ foreach (LPF_IMPL_ID ${ENGINES})
 
         add_gtest(${exeName} ${LPF_IMPL_ID} ${debug} "${CMAKE_CURRENT_SOURCE_DIR}/${testSource}")
 
-        string(REGEX REPLACE "(.${LPF_IMPL_ID})?.cpp$" "" baseName ${testSource})
-        get_filename_component(baseName ${testSource} NAME_WE  )
-        set(exeName "${baseName}_${LPF_IMPL_ID}_${LPF_IMPL_CONFIG}${mode}")
-
     endforeach(testSource)
 endforeach(LPF_IMPL_ID)
diff --git a/tests/functional/debug/CMakeLists.txt b/tests/functional/debug/CMakeLists.txt
index 0292d488..67ffcb5d 100644
--- a/tests/functional/debug/CMakeLists.txt
+++ b/tests/functional/debug/CMakeLists.txt
@@ -37,10 +37,6 @@ set(debug_test_sources
     func_lpf_debug_global_deregister_order_mismatch.cpp
     func_lpf_debug_global_deregister_unequal.cpp
     func_lpf_debug_global_register_null_memreg.cpp
-    #func_lpf_debug_hook_null_f_symbols.pthread.cpp
-    #func_lpf_debug_hook_null_input.pthread.cpp
-    #func_lpf_debug_hook_null_output.pthread.cpp
-    #func_lpf_debug_hook_null_spmd.pthread.cpp
     func_lpf_debug_local_register_null_memreg.cpp
     func_lpf_debug_put_after_deregister_dest_after_sync.cpp
     func_lpf_debug_put_after_deregister_dest.cpp
@@ -90,10 +86,18 @@ foreach (LPF_IMPL_ID ${ENGINES})
 
         add_gtest(${exeName} ${LPF_IMPL_ID} ${debug} "${CMAKE_CURRENT_SOURCE_DIR}/${testSource}" )
 
-        string(REGEX REPLACE "(.${LPF_IMPL_ID})?.cpp$" "" baseName ${CMAKE_CURRENT_SOURCE_DIR}/${testSource})
-        get_filename_component(baseName ${testSource} NAME_WE  )
-        set(exeName "${baseName}_${LPF_IMPL_ID}_${LPF_IMPL_CONFIG}${mode}")
-
     endforeach(testSource)
 endforeach(LPF_IMPL_ID)
 
+# lpf_hook argument checks that exist for the pthread engine only
+add_gtest(func_lpf_debug_hook_null_f_symbols_pthread "pthread" ON
+    "${CMAKE_CURRENT_SOURCE_DIR}/func_lpf_debug_hook_null_f_symbols.pthread.cpp")
+
+add_gtest(func_lpf_debug_hook_null_input_pthread "pthread" ON
+    "${CMAKE_CURRENT_SOURCE_DIR}/func_lpf_debug_hook_null_input.pthread.cpp")
+
+add_gtest(func_lpf_debug_hook_null_output_pthread "pthread" ON
+    "${CMAKE_CURRENT_SOURCE_DIR}/func_lpf_debug_hook_null_output.pthread.cpp")
+
+add_gtest(func_lpf_debug_hook_null_spmd_pthread "pthread" ON
+    "${CMAKE_CURRENT_SOURCE_DIR}/func_lpf_debug_hook_null_spmd.pthread.cpp")
diff --git a/tests/functional/debug/func_lpf_debug_deregister_non_existing_slot.cpp b/tests/functional/debug/func_lpf_debug_deregister_non_existing_slot.cpp
index 139bad91..5afa95a2 100644
--- a/tests/functional/debug/func_lpf_debug_deregister_non_existing_slot.cpp
+++ b/tests/functional/debug/func_lpf_debug_deregister_non_existing_slot.cpp
@@ -21,47 +21,22 @@
 
 void spmd( lpf_t lpf, lpf_pid_t pid, lpf_pid_t nprocs, lpf_args_t args )
 {
-    (void) args;
-    int x = 3; int y = 6;
-    lpf_memslot_t xSlot = LPF_INVALID_MEMSLOT;
-    lpf_memslot_t ySlot = LPF_INVALID_MEMSLOT;
-
-    lpf_err_t rc = lpf_resize_memory_register( lpf, 2 );
-    EXPECT_EQ(LPF_SUCCESS, rc );
-    
-    rc = lpf_resize_message_queue( lpf, 2 );
-    EXPECT_EQ( LPF_SUCCESS, rc );
-
-    rc = lpf_sync( lpf, LPF_SYNC_DEFAULT );
-    EXPECT_EQ( LPF_SUCCESS, rc );
-
-    rc = lpf_register_global( lpf, &x, sizeof(x), &xSlot );
-    EXPECT_EQ( LPF_SUCCESS, rc );
-
-    rc = lpf_register_global( lpf, &y, sizeof(y), &ySlot );
-    EXPECT_EQ( LPF_SUCCESS, rc );
-
-    rc = lpf_sync( lpf, LPF_SYNC_DEFAULT );
-    EXPECT_EQ( LPF_SUCCESS, rc );
-
-    rc = lpf_get( lpf, (pid+1)%nprocs, xSlot, 3, ySlot, 0, sizeof(x), LPF_MSG_DEFAULT );
-    EXPECT_EQ( LPF_SUCCESS, rc );
-
-    FAIL();
-    // the write error will be detected at this sync
-    //rc = lpf_sync( lpf, LPF_SYNC_DEFAULT );
+    (void) pid; (void) nprocs; (void) args;
+    lpf_memslot_t slot; 
+    memset( &slot, 1, sizeof(slot));  // init to some weird data
 
+    lpf_deregister( lpf, slot );
 }
 
 /** 
- * \test Testing for a lpf_get() that reads past globally registered memory bounds
- * \pre P >= 2
- * \return Message: source memory .* is read past the end by 3 bytes
+ * \test Deregister a non-registered slot
+ * \pre P >= 1
+ * \return Message: Invalid attempt to deregister a memory slot, because it has not been registered before
  * \return Exit code: 6
  */
-TEST( API, func_lpf_debug_deregister_non_existing_slot )
+TEST(API, func_lpf_debug_deregister_non_existing_slot )
 {
     lpf_err_t rc = LPF_SUCCESS;
     rc = lpf_exec( LPF_ROOT, LPF_MAX_P, &spmd, LPF_NO_ARGS );
-    EXPECT_EQ( LPF_SUCCESS, rc );
+    EXPECT_EQ(LPF_SUCCESS, rc );
 }
diff --git a/tests/functional/debug/func_lpf_debug_get_too_many_requests.cpp b/tests/functional/debug/func_lpf_debug_get_too_many_requests.cpp
index 90da31e8..895260c9 100644
--- a/tests/functional/debug/func_lpf_debug_get_too_many_requests.cpp
+++ b/tests/functional/debug/func_lpf_debug_get_too_many_requests.cpp
@@ -55,7 +55,6 @@ void spmd( lpf_t lpf, lpf_pid_t pid, lpf_pid_t nprocs, lpf_args_t args )
 
     EXPECT_EQ(3, y[0] );
     EXPECT_EQ(4, y[1] );
-    //FAIL();
 }
 
 /** 
diff --git a/tests/functional/debug/func_lpf_debug_hook_null_f_symbols.pthread.cpp b/tests/functional/debug/func_lpf_debug_hook_null_f_symbols.pthread.cpp
index c898128d..51eb6a5e 100644
--- a/tests/functional/debug/func_lpf_debug_hook_null_f_symbols.pthread.cpp
+++ b/tests/functional/debug/func_lpf_debug_hook_null_f_symbols.pthread.cpp
@@ -54,17 +54,25 @@ void * pthread_spmd( void * _data ) {
         &init
     );
     EXPECT_EQ( rc, LPF_SUCCESS );
-    FAIL();
+
+    rc = lpf_hook( init, &lpf_spmd, args );
+    EXPECT_EQ( rc, LPF_SUCCESS );
+
+    rc = lpf_pthread_finalize( init );
+    EXPECT_EQ( rc, LPF_SUCCESS );
 
     return NULL;
 }
 
+// the tests below expect exit code 134, i.e. 128 + SIGABRT (signal 6): the
+// status a POSIX shell reports for a process that was terminated by abort()
+
 /** 
  * \test Tests lpf_hook on pthread implementation with NULL f_symbols
  * \pre P <= 1
  * \pre P >= 1
  * \return Message: NULL f_symbols argument while f_size is non-zero
- * \return Exit code: 6
+ * \return Exit code: 134
  */
 TEST( API, func_lpf_hook_null_f_symbols )
 {
diff --git a/tests/functional/debug/func_lpf_debug_hook_null_input.pthread.cpp b/tests/functional/debug/func_lpf_debug_hook_null_input.pthread.cpp
index 04c7c355..051f8542 100644
--- a/tests/functional/debug/func_lpf_debug_hook_null_input.pthread.cpp
+++ b/tests/functional/debug/func_lpf_debug_hook_null_input.pthread.cpp
@@ -17,7 +17,7 @@
 
 #include <lpf/core.h>
 #include <lpf/pthread.h>
-#include "Test.h"
+#include "gtest/gtest.h"
 
 #include <pthread.h>
 #include <unistd.h>
@@ -32,7 +32,7 @@ void lpf_spmd( lpf_t ctx, lpf_pid_t pid, lpf_pid_t nprocs, lpf_args_t args )
 { (void) ctx; (void) pid; (void) nprocs; (void) args; }
 
 void * pthread_spmd( void * _data ) {
-    EXPECT_NE( "%p", _data, NULL );
+    EXPECT_NE( _data, (void*)NULL );
 
     const struct thread_local_data data = * ((struct thread_local_data*) _data);
     const int pts_rc = pthread_setspecific( pid_key, _data );
@@ -46,61 +46,62 @@ void * pthread_spmd( void * _data ) {
     lpf_init_t init;
     lpf_err_t rc = LPF_SUCCESS;
 
-    EXPECT_EQ( "%d", pts_rc, 0 );
+    EXPECT_EQ( pts_rc, 0 );
 
     rc = lpf_pthread_initialize(
         (lpf_pid_t)data.s,
         (lpf_pid_t)data.P,
         &init
     );
-    EXPECT_EQ( "%d", rc, LPF_SUCCESS );
+    EXPECT_EQ( rc, LPF_SUCCESS );
 
     rc = lpf_hook( init, &lpf_spmd, args );
-    EXPECT_EQ( "%d", rc, LPF_SUCCESS );
+    EXPECT_EQ( rc, LPF_SUCCESS );
 
     rc = lpf_pthread_finalize( init );
-    EXPECT_EQ( "%d", rc, LPF_SUCCESS );
+    EXPECT_EQ( rc, LPF_SUCCESS );
 
     return NULL;
 }
 
+// the tests below expect exit code 134, i.e. 128 + SIGABRT (signal 6): the
+// status a POSIX shell reports for a process that was terminated by abort()
+
 /** 
  * \test Tests lpf_hook on pthread implementation with NULL input 
  * \pre P <= 1
  * \pre P >= 1
  * \return Message: NULL input argument while input_size is non-zero
- * \return Exit code: 6
+ * \return Exit code: 134
  */
-TEST( func_lpf_hook_null_input )
+TEST( API, func_lpf_hook_null_input )
 {
     long k = 0;
     const long P = sysconf( _SC_NPROCESSORS_ONLN );
 
     const int ptc_rc = pthread_key_create( &pid_key, NULL );
-    EXPECT_EQ( "%d", ptc_rc, 0 );
+    EXPECT_EQ( ptc_rc, 0 );
 
     pthread_t * const threads = (pthread_t*) malloc( P * sizeof(pthread_t) );
-    EXPECT_NE( "%p", threads, NULL );
+    EXPECT_NE( threads, (pthread_t*)NULL );
 
     struct thread_local_data * const data = (struct thread_local_data*) malloc( P * sizeof(struct thread_local_data) );
-    EXPECT_NE( "%p", data, NULL );
+    EXPECT_NE( data, (struct thread_local_data*)NULL );
 
     for( k = 0; k < P; ++k ) {
         data[ k ].P = P;
         data[ k ].s = k;
         const int rval = pthread_create( threads + k, NULL, &pthread_spmd, data + k );
-        EXPECT_EQ( "%d", rval, 0 );
+        EXPECT_EQ( rval, 0 );
     }
 
     for( k = 0; k < P; ++k ) {
         const int rval = pthread_join( threads[ k ], NULL );
-        EXPECT_EQ( "%d", rval, 0 );
+        EXPECT_EQ( rval, 0 );
     }
 
     const int ptd_rc = pthread_key_delete( pid_key );
-    EXPECT_EQ( "%d", ptd_rc, 0 );
-
-    return 0;
+    EXPECT_EQ( ptd_rc, 0 );
 }
 
 
diff --git a/tests/functional/debug/func_lpf_debug_hook_null_output.pthread.cpp b/tests/functional/debug/func_lpf_debug_hook_null_output.pthread.cpp
index 02268258..eec3be9a 100644
--- a/tests/functional/debug/func_lpf_debug_hook_null_output.pthread.cpp
+++ b/tests/functional/debug/func_lpf_debug_hook_null_output.pthread.cpp
@@ -17,7 +17,7 @@
 
 #include <lpf/core.h>
 #include <lpf/pthread.h>
-#include "Test.h"
+#include "gtest/gtest.h"
 
 #include <pthread.h>
 #include <unistd.h>
@@ -32,7 +32,7 @@ void lpf_spmd( lpf_t ctx, lpf_pid_t pid, lpf_pid_t nprocs, lpf_args_t args )
 { (void) ctx; (void) pid; (void) nprocs; (void) args; }
 
 void * pthread_spmd( void * _data ) {
-    EXPECT_NE( "%p", _data, NULL );
+    EXPECT_NE( _data, (void*)NULL );
 
     const struct thread_local_data data = * ((struct thread_local_data*) _data);
     const int pts_rc = pthread_setspecific( pid_key, _data );
@@ -46,61 +46,62 @@ void * pthread_spmd( void * _data ) {
     lpf_init_t init;
     lpf_err_t rc = LPF_SUCCESS;
 
-    EXPECT_EQ( "%d", pts_rc, 0 );
+    EXPECT_EQ( pts_rc, 0 );
 
     rc = lpf_pthread_initialize(
         (lpf_pid_t)data.s,
         (lpf_pid_t)data.P,
         &init
     );
-    EXPECT_EQ( "%d", rc, LPF_SUCCESS );
+    EXPECT_EQ( rc, LPF_SUCCESS );
 
     rc = lpf_hook( init, &lpf_spmd, args );
-    EXPECT_EQ( "%d", rc, LPF_SUCCESS );
+    EXPECT_EQ( rc, LPF_SUCCESS );
 
     rc = lpf_pthread_finalize( init );
-    EXPECT_EQ( "%d", rc, LPF_SUCCESS );
+    EXPECT_EQ( rc, LPF_SUCCESS );
 
     return NULL;
 }
 
+// the tests below expect exit code 134, i.e. 128 + SIGABRT (signal 6): the
+// status a POSIX shell reports for a process that was terminated by abort()
+
 /** 
  * \test Tests lpf_hook on pthread implementation with NULL output
  * \pre P <= 1
  * \pre P >= 1
  * \return Message: NULL output argument while output_size is non-zero
- * \return Exit code: 6
+ * \return Exit code: 134
  */
-TEST( func_lpf_hook_null_output )
+TEST( API, func_lpf_hook_null_output )
 {
     long k = 0;
     const long P = sysconf( _SC_NPROCESSORS_ONLN );
 
     const int ptc_rc = pthread_key_create( &pid_key, NULL );
-    EXPECT_EQ( "%d", ptc_rc, 0 );
+    EXPECT_EQ( ptc_rc, 0 );
 
     pthread_t * const threads = (pthread_t*) malloc( P * sizeof(pthread_t) );
-    EXPECT_NE( "%p", threads, NULL );
+    EXPECT_NE( threads, (pthread_t *)NULL );
 
     struct thread_local_data * const data = (struct thread_local_data*) malloc( P * sizeof(struct thread_local_data) );
-    EXPECT_NE( "%p", data, NULL );
+    EXPECT_NE( data, (struct thread_local_data*)NULL );
 
     for( k = 0; k < P; ++k ) {
         data[ k ].P = P;
         data[ k ].s = k;
         const int rval = pthread_create( threads + k, NULL, &pthread_spmd, data + k );
-        EXPECT_EQ( "%d", rval, 0 );
+        EXPECT_EQ( rval, 0 );
     }
 
     for( k = 0; k < P; ++k ) {
         const int rval = pthread_join( threads[ k ], NULL );
-        EXPECT_EQ( "%d", rval, 0 );
+        EXPECT_EQ( rval, 0 );
     }
 
     const int ptd_rc = pthread_key_delete( pid_key );
-    EXPECT_EQ( "%d", ptd_rc, 0 );
-
-    return 0;
+    EXPECT_EQ( ptd_rc, 0 );
 }
 
 
diff --git a/tests/functional/debug/func_lpf_debug_hook_null_spmd.pthread.cpp b/tests/functional/debug/func_lpf_debug_hook_null_spmd.pthread.cpp
index 20209c16..00bcc0c7 100644
--- a/tests/functional/debug/func_lpf_debug_hook_null_spmd.pthread.cpp
+++ b/tests/functional/debug/func_lpf_debug_hook_null_spmd.pthread.cpp
@@ -17,7 +17,7 @@
 
 #include <lpf/core.h>
 #include <lpf/pthread.h>
-#include "Test.h"
+#include "gtest/gtest.h"
 
 #include <pthread.h>
 #include <unistd.h>
@@ -30,7 +30,7 @@ struct thread_local_data {
 
 
 void * pthread_spmd( void * _data ) {
-    EXPECT_NE( "%p", _data, NULL );
+    EXPECT_NE( _data, (void*)NULL );
 
     const struct thread_local_data data = * ((struct thread_local_data*) _data);
     const int pts_rc = pthread_setspecific( pid_key, _data );
@@ -44,61 +44,62 @@ void * pthread_spmd( void * _data ) {
     lpf_init_t init;
     lpf_err_t rc = LPF_SUCCESS;
 
-    EXPECT_EQ( "%d", pts_rc, 0 );
+    EXPECT_EQ( pts_rc, 0 );
 
     rc = lpf_pthread_initialize(
         (lpf_pid_t)data.s,
         (lpf_pid_t)data.P,
         &init
     );
-    EXPECT_EQ( "%d", rc, LPF_SUCCESS );
+    EXPECT_EQ( rc, LPF_SUCCESS );
 
     rc = lpf_hook( init, NULL, args );
-    EXPECT_EQ( "%d", rc, LPF_SUCCESS );
+    EXPECT_EQ( rc, LPF_SUCCESS );
 
     rc = lpf_pthread_finalize( init );
-    EXPECT_EQ( "%d", rc, LPF_SUCCESS );
+    EXPECT_EQ( rc, LPF_SUCCESS );
 
     return NULL;
 }
 
+// the tests below expect exit code 134, i.e. 128 + SIGABRT (signal 6): the
+// status a POSIX shell reports for a process that was terminated by abort()
+
 /** 
  * \test Tests lpf_hook on pthread implementation with NULL spmd 
  * \pre P <= 1
  * \pre P >= 1
  * \return Message: NULL spmd argument
- * \return Exit code: 6
+ * \return Exit code: 134
  */
-TEST( func_lpf_hook_null_spmd )
+TEST( API, func_lpf_hook_null_spmd )
 {
     long k = 0;
     const long P = sysconf( _SC_NPROCESSORS_ONLN );
 
     const int ptc_rc = pthread_key_create( &pid_key, NULL );
-    EXPECT_EQ( "%d", ptc_rc, 0 );
+    EXPECT_EQ( ptc_rc, 0 );
 
     pthread_t * const threads = (pthread_t*) malloc( P * sizeof(pthread_t) );
-    EXPECT_NE( "%p", threads, NULL );
+    EXPECT_NE( threads, (pthread_t*)NULL );
 
     struct thread_local_data * const data = (struct thread_local_data*) malloc( P * sizeof(struct thread_local_data) );
-    EXPECT_NE( "%p", data, NULL );
+    EXPECT_NE( data, (struct thread_local_data *)NULL );
 
     for( k = 0; k < P; ++k ) {
         data[ k ].P = P;
         data[ k ].s = k;
         const int rval = pthread_create( threads + k, NULL, &pthread_spmd, data + k );
-        EXPECT_EQ( "%d", rval, 0 );
+        EXPECT_EQ( rval, 0 );
     }
 
     for( k = 0; k < P; ++k ) {
         const int rval = pthread_join( threads[ k ], NULL );
-        EXPECT_EQ( "%d", rval, 0 );
+        EXPECT_EQ( rval, 0 );
     }
 
     const int ptd_rc = pthread_key_delete( pid_key );
-    EXPECT_EQ( "%d", ptd_rc, 0 );
-
-    return 0;
+    EXPECT_EQ( ptd_rc, 0 );
 }
 
 
diff --git a/tests/functional/debug/func_lpf_debug_put_read_write_conflict_among_many.cpp b/tests/functional/debug/func_lpf_debug_put_read_write_conflict_among_many.cpp
index cb4da30e..d31498b3 100644
--- a/tests/functional/debug/func_lpf_debug_put_read_write_conflict_among_many.cpp
+++ b/tests/functional/debug/func_lpf_debug_put_read_write_conflict_among_many.cpp
@@ -75,3 +75,5 @@ TEST( API, func_lpf_debug_put_read_write_conflict_among_many )
     rc = lpf_exec( LPF_ROOT, LPF_MAX_P, &spmd, LPF_NO_ARGS );
     EXPECT_EQ( LPF_SUCCESS, rc );
 }
+
+
diff --git a/tests/functional/exception_list b/tests/functional/exception_list
deleted file mode 100644
index c7590fc1..00000000
--- a/tests/functional/exception_list
+++ /dev/null
@@ -1,5 +0,0 @@
-func_lpf_put_parallel_bad_pattern_.*
-func_lpf_hook_tcp_mpi..._[^_]*_mvapich2
-func_lpf_hook_tcp_mpi..._[^_]*_openmpi_gcc_64_1_10_7
-func_lpf_hook_tcp_timeout_mpi..._[^_]*_openmpi_gcc_64_1_10_7
-func_lpf_hook_tcp_mpi..._[^_]*_mpich_ge_gcc_64_3_2rc2
diff --git a/tests/functional/func_lpf_hook_simple.mpirma.cpp b/tests/functional/func_lpf_hook_simple.mpirma.cpp
index 5dfa7104..81016a39 100644
--- a/tests/functional/func_lpf_hook_simple.mpirma.cpp
+++ b/tests/functional/func_lpf_hook_simple.mpirma.cpp
@@ -17,7 +17,7 @@
 
 #include <lpf/core.h>
 #include <lpf/mpi.h>
-#include "Test.h"
+#include "gtest/gtest.h"
 
 #include <stdlib.h>
 #include <mpi.h>
@@ -28,11 +28,11 @@ void spmd( lpf_t ctx, lpf_pid_t pid, lpf_pid_t nprocs, lpf_args_t args )
     lpf_err_t rc = LPF_SUCCESS;
 
     rc = lpf_resize_message_queue( ctx, 2);
-    EXPECT_EQ( "%d", LPF_SUCCESS, rc );
+    EXPECT_EQ( LPF_SUCCESS, rc );
     rc = lpf_resize_memory_register( ctx, 2);
-    EXPECT_EQ( "%d", rc, LPF_SUCCESS );
+    EXPECT_EQ( rc, LPF_SUCCESS );
     rc = lpf_sync(ctx, LPF_SYNC_DEFAULT );
-    EXPECT_EQ( "%d", rc, LPF_SUCCESS );
+    EXPECT_EQ( rc, LPF_SUCCESS );
 
     int x = 5 - pid;
     int y = pid;
@@ -41,21 +41,21 @@ void spmd( lpf_t ctx, lpf_pid_t pid, lpf_pid_t nprocs, lpf_args_t args )
     lpf_memslot_t ySlot = LPF_INVALID_MEMSLOT;
 
     rc = lpf_register_global( ctx, &x, sizeof(x), &xSlot );
-    EXPECT_EQ( "%d", rc, LPF_SUCCESS );
+    EXPECT_EQ( rc, LPF_SUCCESS );
     rc = lpf_register_global( ctx, &y, sizeof(y), &ySlot );
-    EXPECT_EQ( "%d", rc, LPF_SUCCESS );
+    EXPECT_EQ( rc, LPF_SUCCESS );
 
     rc = lpf_sync( ctx, LPF_SYNC_DEFAULT );
-    EXPECT_EQ( "%d", rc, LPF_SUCCESS );
+    EXPECT_EQ( rc, LPF_SUCCESS );
 
     rc = lpf_put( ctx, xSlot, 0, (pid + 1) % nprocs, ySlot, 0, sizeof(x), LPF_MSG_DEFAULT );
-    EXPECT_EQ( "%d", rc, LPF_SUCCESS );
+    EXPECT_EQ( rc, LPF_SUCCESS );
 
     rc = lpf_sync( ctx, LPF_SYNC_DEFAULT );
-    EXPECT_EQ( "%d", rc, LPF_SUCCESS );
+    EXPECT_EQ( rc, LPF_SUCCESS );
 
-    EXPECT_EQ( "%d", x, (int) (5 - pid) );
-    EXPECT_EQ( "%d", y, (int) (5 - (pid + nprocs -1) % nprocs) );
+    EXPECT_EQ( x, (int) (5 - pid) );
+    EXPECT_EQ( y, (int) (5 - (pid + nprocs -1) % nprocs) );
 }
 
 // disable automatic initialization.
@@ -66,7 +66,7 @@ const int LPF_MPI_AUTO_INITIALIZE=0;
  * \pre P >= 1
  * \return Exit code: 0
  */
-TEST( func_lpf_hook_simple_mpi )
+TEST(API, func_lpf_hook_simple_mpi)
 {
     lpf_err_t rc = LPF_SUCCESS;
     MPI_Init(NULL, NULL);
@@ -79,16 +79,15 @@ TEST( func_lpf_hook_simple_mpi )
 
     lpf_init_t init;
     rc = lpf_mpi_initialize_with_mpicomm( MPI_COMM_WORLD, &init);
-    EXPECT_EQ( "%d", rc, LPF_SUCCESS );
+    EXPECT_EQ( rc, LPF_SUCCESS );
 
     rc = lpf_hook( init, &spmd, LPF_NO_ARGS );
-    EXPECT_EQ( "%d", rc, LPF_SUCCESS );
+    EXPECT_EQ( rc, LPF_SUCCESS );
 
     rc = lpf_mpi_finalize( init );
-    EXPECT_EQ( "%d", rc, LPF_SUCCESS );
+    EXPECT_EQ( rc, LPF_SUCCESS );
 
     MPI_Finalize();
-    return 0;
 }
 
 
diff --git a/tests/functional/func_lpf_hook_simple.pthread.cpp b/tests/functional/func_lpf_hook_simple.pthread.cpp
index 3b33bdc6..6438b676 100644
--- a/tests/functional/func_lpf_hook_simple.pthread.cpp
+++ b/tests/functional/func_lpf_hook_simple.pthread.cpp
@@ -17,7 +17,7 @@
 
 #include <lpf/core.h>
 #include <lpf/pthread.h>
-#include "Test.h"
+#include "gtest/gtest.h"
 
 #include <pthread.h>
 #include <unistd.h>
@@ -36,18 +36,18 @@ struct thread_local_data {
 void lpf_spmd( lpf_t ctx, lpf_pid_t pid, lpf_pid_t nprocs, lpf_args_t args )
 {
     (void) ctx;
-    const struct thread_local_data * const data = pthread_getspecific( pid_key );
-
-    EXPECT_EQ( "%zd", (size_t)nprocs, (size_t)(data->P) );
-    EXPECT_EQ( "%zd", (size_t)pid, (size_t)(data->s) );
-    EXPECT_EQ( "%zd", (size_t)(args.input_size), (size_t)(sizeof( struct thread_local_data)) );
-    EXPECT_EQ( "%zd", (size_t)(args.output_size), (size_t)0 );
-    EXPECT_EQ( "%p", args.input, data );
-    EXPECT_EQ( "%p", args.output, NULL );
+    const struct thread_local_data * const data = static_cast<thread_local_data *>(pthread_getspecific( pid_key ));
+
+    EXPECT_EQ( (size_t)nprocs, (size_t)(data->P) );
+    EXPECT_EQ( (size_t)pid, (size_t)(data->s) );
+    EXPECT_EQ( (size_t)(args.input_size), (size_t)(sizeof( struct thread_local_data)) );
+    EXPECT_EQ( (size_t)(args.output_size), (size_t)0 );
+    EXPECT_EQ( args.input, data );
+    EXPECT_EQ( args.output, nullptr );
 }
 
 void * pthread_spmd( void * _data ) {
-    EXPECT_NE( "%p", _data, NULL );
+    EXPECT_NE( _data, nullptr);
 
     const struct thread_local_data data = * ((struct thread_local_data*) _data);
     const int pts_rc = pthread_setspecific( pid_key, _data );
@@ -61,20 +61,20 @@ void * pthread_spmd( void * _data ) {
     lpf_init_t init;
     lpf_err_t rc = LPF_SUCCESS;
 
-    EXPECT_EQ( "%d", pts_rc, 0 );
+    EXPECT_EQ( pts_rc, 0 );
 
     rc = lpf_pthread_initialize(
         (lpf_pid_t)data.s,
         (lpf_pid_t)data.P,
         &init
     );
-    EXPECT_EQ( "%d", rc, LPF_SUCCESS );
+    EXPECT_EQ( rc, LPF_SUCCESS );
 
     rc = lpf_hook( init, &lpf_spmd, args );
-    EXPECT_EQ( "%d", rc, LPF_SUCCESS );
+    EXPECT_EQ( rc, LPF_SUCCESS );
 
     rc = lpf_pthread_finalize( init );
-    EXPECT_EQ( "%d", rc, LPF_SUCCESS );
+    EXPECT_EQ( rc, LPF_SUCCESS );
 
     return NULL;
 }
@@ -85,36 +85,35 @@ void * pthread_spmd( void * _data ) {
  * \pre P >= 1
  * \return Exit code: 0
  */
-TEST( func_lpf_hook_simple_pthread )
+TEST(API, func_lpf_hook_simple_pthread )
 {
     long k = 0;
     const long P = sysconf( _SC_NPROCESSORS_ONLN );
 
     const int ptc_rc = pthread_key_create( &pid_key, NULL );
-    EXPECT_EQ( "%d", ptc_rc, 0 );
+    EXPECT_EQ( ptc_rc, 0 );
 
     pthread_t * const threads = (pthread_t*) malloc( P * sizeof(pthread_t) );
-    EXPECT_NE( "%p", threads, NULL );
+    EXPECT_NE( threads, nullptr );
 
     struct thread_local_data * const data = (struct thread_local_data*) malloc( P * sizeof(struct thread_local_data) );
-    EXPECT_NE( "%p", data, NULL );
+    EXPECT_NE( data, nullptr );
 
     for( k = 0; k < P; ++k ) {
         data[ k ].P = P;
         data[ k ].s = k;
         const int rval = pthread_create( threads + k, NULL, &pthread_spmd, data + k );
-        EXPECT_EQ( "%d", rval, 0 );
+        EXPECT_EQ( rval, 0 );
     }
 
     for( k = 0; k < P; ++k ) {
         const int rval = pthread_join( threads[ k ], NULL );
-        EXPECT_EQ( "%d", rval, 0 );
+        EXPECT_EQ( rval, 0 );
     }
 
     const int ptd_rc = pthread_key_delete( pid_key );
-    EXPECT_EQ( "%d", ptd_rc, 0 );
+    EXPECT_EQ( ptd_rc, 0 );
 
-    return 0;
 }
 
 
diff --git a/tests/functional/func_lpf_hook_subset.mpimsg.cpp b/tests/functional/func_lpf_hook_subset.mpimsg.cpp
index f073e443..6693bab3 100644
--- a/tests/functional/func_lpf_hook_subset.mpimsg.cpp
+++ b/tests/functional/func_lpf_hook_subset.mpimsg.cpp
@@ -17,7 +17,7 @@
 
 #include <lpf/core.h>
 #include <lpf/mpi.h>
-#include "Test.h"
+#include "gtest/gtest.h"
 
 #include <mpi.h>
 
@@ -39,10 +39,10 @@ void subset_func(MPI_Comm comm)
 
     lpf_init_t init;
     lpf_err_t rc = lpf_mpi_initialize_with_mpicomm(comm, &init);
-    EXPECT_EQ( "%d", LPF_SUCCESS, rc );
+    EXPECT_EQ( LPF_SUCCESS, rc );
 
     rc = lpf_hook(init, test_spmd, LPF_NO_ARGS);
-    EXPECT_EQ( "%d", LPF_SUCCESS, rc );
+    EXPECT_EQ( LPF_SUCCESS, rc );
 }
 
 /**
@@ -50,7 +50,7 @@ void subset_func(MPI_Comm comm)
  * \pre P >= 3
  * \return Exit code: 0
  */
-TEST( func_lpf_hook_subset )
+TEST(API, func_lpf_hook_subset )
 {
     MPI_Init(NULL, NULL);
 
@@ -71,5 +71,5 @@ TEST( func_lpf_hook_subset )
     MPI_Barrier(MPI_COMM_WORLD); // Paranoid barrier
 
     MPI_Finalize();
-    return 0;
+    
 }
diff --git a/tests/functional/func_lpf_hook_tcp.mpirma.cpp b/tests/functional/func_lpf_hook_tcp.mpirma.cpp
index 2921e6fc..0d7f0290 100644
--- a/tests/functional/func_lpf_hook_tcp.mpirma.cpp
+++ b/tests/functional/func_lpf_hook_tcp.mpirma.cpp
@@ -17,28 +17,35 @@
 
 #include <lpf/core.h>
 #include <lpf/mpi.h>
-#include "Test.h"
+#include "gtest/gtest.h"
 
 #include <stdlib.h>
 #include <mpi.h>
 
+static int myargc;
+static char **myargv;
+
+// disable automatic initialization.
+const int LPF_MPI_AUTO_INITIALIZE=0; 
+
+
 void spmd( lpf_t ctx, lpf_pid_t pid, lpf_pid_t nprocs, lpf_args_t args )
 {
     lpf_err_t rc = LPF_SUCCESS;
 
     struct { int pid, nprocs; } params;
-    EXPECT_EQ( "%lu", sizeof(params), args.input_size );
+    EXPECT_EQ( sizeof(params), args.input_size );
 
     memcpy( &params, args.input, sizeof(params));
-    EXPECT_EQ( "%u", (lpf_pid_t) params.pid, pid );
-    EXPECT_EQ( "%u", (lpf_pid_t) params.nprocs, nprocs );
+    EXPECT_EQ( (lpf_pid_t) params.pid, pid );
+    EXPECT_EQ( (lpf_pid_t) params.nprocs, nprocs );
 
     rc = lpf_resize_message_queue( ctx, 2);
-    EXPECT_EQ( "%d", LPF_SUCCESS, rc );
+    EXPECT_EQ( LPF_SUCCESS, rc );
     rc = lpf_resize_memory_register( ctx, 2);
-    EXPECT_EQ( "%d", rc, LPF_SUCCESS );
+    EXPECT_EQ( rc, LPF_SUCCESS );
     rc = lpf_sync(ctx, LPF_SYNC_DEFAULT );
-    EXPECT_EQ( "%d", rc, LPF_SUCCESS );
+    EXPECT_EQ( rc, LPF_SUCCESS );
 
     int x = 5 - pid;
     int y = pid;
@@ -47,25 +54,23 @@ void spmd( lpf_t ctx, lpf_pid_t pid, lpf_pid_t nprocs, lpf_args_t args )
     lpf_memslot_t ySlot = LPF_INVALID_MEMSLOT;
 
     rc = lpf_register_global( ctx, &x, sizeof(x), &xSlot );
-    EXPECT_EQ( "%d", rc, LPF_SUCCESS );
+    EXPECT_EQ( rc, LPF_SUCCESS );
     rc = lpf_register_global( ctx, &y, sizeof(y), &ySlot );
-    EXPECT_EQ( "%d", rc, LPF_SUCCESS );
+    EXPECT_EQ( rc, LPF_SUCCESS );
 
     rc = lpf_sync( ctx, LPF_SYNC_DEFAULT );
-    EXPECT_EQ( "%d", rc, LPF_SUCCESS );
+    EXPECT_EQ( rc, LPF_SUCCESS );
 
     rc = lpf_put( ctx, xSlot, 0, (pid + 1) % nprocs, ySlot, 0, sizeof(x), LPF_MSG_DEFAULT );
-    EXPECT_EQ( "%d", rc, LPF_SUCCESS );
+    EXPECT_EQ( rc, LPF_SUCCESS );
 
     rc = lpf_sync( ctx, LPF_SYNC_DEFAULT );
-    EXPECT_EQ( "%d", rc, LPF_SUCCESS );
+    EXPECT_EQ( rc, LPF_SUCCESS );
 
-    EXPECT_EQ( "%d", x, (int) (5 - pid) );
-    EXPECT_EQ( "%d", y, (int) (5 - (pid + nprocs -1) % nprocs) );
+    EXPECT_EQ( x, (int) (5 - pid) );
+    EXPECT_EQ( y, (int) (5 - (pid + nprocs -1) % nprocs) );
 }
 
-// disable automatic initialization.
-const int LPF_MPI_AUTO_INITIALIZE=0; 
 
 /** 
  * \test Tests lpf_hook on mpi implementation using TCP/IP to initialize. The pids and nprocs are checked for their correctness.
@@ -73,15 +78,14 @@ const int LPF_MPI_AUTO_INITIALIZE=0;
  * \return Exit code: 0
  * \note Independent processes: yes
  */
-TEST( func_lpf_hook_tcp )
+TEST( API, func_lpf_hook_tcp_mpirma )
 {
     lpf_err_t rc = LPF_SUCCESS;
-    MPI_Init(&argc, &argv);
 
     struct { int pid, nprocs; } params = { 0, 0};
-    EXPECT_GT("%d", argc, 2 );
-    params.pid = atoi( argv[1] );
-    params.nprocs = atoi( argv[2] );
+    ASSERT_GT( myargc, 2 ); // fatal on purpose: myargv[1] and myargv[2] are read next
+    params.pid = atoi( myargv[1] );
+    params.nprocs = atoi( myargv[2] );
 
     lpf_init_t init;
     rc = lpf_mpi_initialize_over_tcp( 
@@ -89,7 +93,7 @@ TEST( func_lpf_hook_tcp )
             params.pid, params.nprocs, &init); // let e.g. Intel MPI try a few
                                                // alternative fabrics
 
-    EXPECT_EQ( "%d", rc, LPF_SUCCESS );
+    EXPECT_EQ( rc, LPF_SUCCESS );
 
     lpf_args_t args;
     args.input = &params;
@@ -100,13 +104,20 @@ TEST( func_lpf_hook_tcp )
     args.f_size = 0;
 
     rc = lpf_hook( init, &spmd, args );
-    EXPECT_EQ( "%d", rc, LPF_SUCCESS );
+    EXPECT_EQ( rc, LPF_SUCCESS );
 
     rc = lpf_mpi_finalize( init );
-    EXPECT_EQ( "%d", rc, LPF_SUCCESS );
+    EXPECT_EQ( rc, LPF_SUCCESS );
 
     MPI_Finalize();
-    return 0;
+}
+
+int main(int argc, char **argv) { // own main(): the test reads pid/nprocs from the command line
+    MPI_Init(&argc, &argv); // LPF_MPI_AUTO_INITIALIZE is 0 and the test ends with MPI_Finalize()
+    myargc = argc;
+    myargv = argv;
+    testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
 }
 
 
diff --git a/tests/functional/func_lpf_hook_tcp_timeout.mpirma.cpp b/tests/functional/func_lpf_hook_tcp_timeout.mpirma.cpp
index e8aba501..94d3edd6 100644
--- a/tests/functional/func_lpf_hook_tcp_timeout.mpirma.cpp
+++ b/tests/functional/func_lpf_hook_tcp_timeout.mpirma.cpp
@@ -17,7 +17,7 @@
 
 #include <lpf/core.h>
 #include <lpf/mpi.h>
-#include "Test.h"
+#include "gtest/gtest.h"
 
 #include <stdlib.h>
 #include <mpi.h>
@@ -31,7 +31,7 @@ const int LPF_MPI_AUTO_INITIALIZE=0;
  * \pre P <= 100
  * \return Exit code: 1
  */
-TEST( func_lpf_hook_tcp_timeout_mpi )
+TEST(API, func_lpf_hook_tcp_timeout_mpi )
 {
     MPI_Init(NULL, NULL);
 
@@ -45,9 +45,8 @@ TEST( func_lpf_hook_tcp_timeout_mpi )
             "localhost", "9325", 999,
             pid, nprocs, &init);
 
-    EXPECT_EQ( "%d", rc, LPF_ERR_FATAL );
+    EXPECT_EQ( rc, LPF_ERR_FATAL );
 
-    return 0;
 }
 
 
diff --git a/tests/functional/func_lpf_put_parallel_bad_pattern.cpp b/tests/functional/func_lpf_put_parallel_bad_pattern.cpp
deleted file mode 100644
index fe1d8f48..00000000
--- a/tests/functional/func_lpf_put_parallel_bad_pattern.cpp
+++ /dev/null
@@ -1,100 +0,0 @@
-
-/*
- *   Copyright 2021 Huawei Technologies Co., Ltd.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <lpf/core.h>
-#include <math.h>
-
-#include "gtest/gtest.h"
-
-void spmd( lpf_t lpf, lpf_pid_t pid, lpf_pid_t nprocs, lpf_args_t args)
-{
-    (void) args; // ignore args parameter
-
-    lpf_err_t rc = LPF_SUCCESS;
-    const unsigned n = sqrt(nprocs);
-    unsigned i;
-    unsigned * xs, *ys;
-    ys = (unsigned *) malloc( sizeof(ys[0]) * n);
-    xs = (unsigned *) malloc( sizeof(xs[0]) * n);
-    for (i = 0; i < n; ++i)
-    {
-        xs[i] = i;
-        ys[i] = 0;
-    }
-        
-    rc = lpf_resize_message_queue( lpf, n);
-    EXPECT_EQ( LPF_SUCCESS, rc );
-    rc = lpf_resize_memory_register( lpf, 2 );
-    EXPECT_EQ( LPF_SUCCESS, rc );
-    rc = lpf_sync( lpf, LPF_SYNC_DEFAULT );
-    EXPECT_EQ( LPF_SUCCESS, rc );
- 
-    lpf_memslot_t xslot = LPF_INVALID_MEMSLOT;
-    lpf_memslot_t yslot = LPF_INVALID_MEMSLOT;
-    rc = lpf_register_local( lpf, xs, sizeof(xs[0]) * n, &xslot );
-    EXPECT_EQ( LPF_SUCCESS, rc );
-    rc = lpf_register_global( lpf, ys, sizeof(ys[0]) * n, &yslot );
-    EXPECT_EQ( LPF_SUCCESS, rc );
-
-    rc = lpf_sync( lpf, LPF_SYNC_DEFAULT);
-    EXPECT_EQ( LPF_SUCCESS, rc );
-
-    // Check that data is OK.
-    for (i = 0; i < n; ++i)
-    {
-        EXPECT_EQ( i, xs[i] );
-        EXPECT_EQ( 0u, ys[i] );
-    }
-
-    if ( pid < n )
-    {
-        for ( i = 0; i < n; ++ i)
-        {
-            EXPECT_LT( i*n, nprocs);
-            rc = lpf_put( lpf, xslot, sizeof(xs[0])*i, 
-                    i*n, yslot, sizeof(ys[0])*pid, sizeof(xs[0]), 
-                    LPF_MSG_DEFAULT );
-            EXPECT_EQ( LPF_SUCCESS, rc );
-        }
-    }
-
-        
-    rc = lpf_sync( lpf, LPF_SYNC_DEFAULT );
-    EXPECT_EQ( LPF_SUCCESS, rc );
-
-    for (i = 0; i < n; ++i)
-    {
-        EXPECT_EQ( i, xs[i] );
-        if ( pid % n == 0 && pid < n*n)
-            EXPECT_EQ( pid / n, ys[i] );
-        else
-            EXPECT_EQ( 0, ys[i] );
-    }
-
-}
-
-/** 
- * \test Test lpf_put by doing a pattern which bad for a sparse all-to-all
- * \pre P >= 5
- * \pre P <= 5
- * \return Exit code: 0
- */
-TEST( API, func_lpf_put_parallel_bad_pattern )
-{
-    lpf_err_t rc = lpf_exec( LPF_ROOT, LPF_MAX_P, spmd, LPF_NO_ARGS);
-    EXPECT_EQ( LPF_SUCCESS, rc );
-}
diff --git a/tests/functional/func_lpf_test_noc_ring.cpp b/tests/functional/func_lpf_test_noc_ring.cpp
new file mode 100644
index 00000000..1050b68e
--- /dev/null
+++ b/tests/functional/func_lpf_test_noc_ring.cpp
@@ -0,0 +1,99 @@
+
+/*
+ *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "mpi.h"
+#include <lpf/core.h>
+#include <lpf/noc.h>
+#include "gtest/gtest.h"
+
+#include <vector>
+
+/*
+ * SPMD section of the ring test. Every process registers buf2 as a NOC slot,
+ * sends the serialized slot to its left neighbour over MPI while receiving
+ * the one of its right neighbour, and then puts buf1 into the right
+ * neighbour through the deserialized slot. After the closing lpf_sync, buf2
+ * is expected to hold the same string as buf1.
+ */
+void spmd( lpf_t lpf, lpf_pid_t pid, lpf_pid_t nprocs, lpf_args_t args)
+{
+    (void) args; // ignore args parameter
+
+    lpf_err_t rc = LPF_SUCCESS;
+
+    // Array initialization zero-fills whatever the initializer leaves over.
+    char buf1[30] = "HELLO";
+    char buf2[30] = {'\0'};
+
+    rc = lpf_resize_memory_register(lpf, 2); // identical to lpf_noc_resize at the moment
+    EXPECT_EQ( LPF_SUCCESS, rc );
+    rc = lpf_resize_message_queue( lpf, 2);
+    EXPECT_EQ( LPF_SUCCESS, rc );
+
+    rc = lpf_sync( lpf, LPF_SYNC_DEFAULT );
+    EXPECT_EQ( LPF_SUCCESS, rc );
+
+    lpf_memslot_t xslot = LPF_INVALID_MEMSLOT;
+    lpf_memslot_t yslot = LPF_INVALID_MEMSLOT;
+    rc = lpf_register_local( lpf, buf1, sizeof(buf1), &xslot );
+    EXPECT_EQ( LPF_SUCCESS, rc );
+    rc = lpf_noc_register( lpf, buf2, sizeof(buf2), &yslot );
+    EXPECT_EQ( LPF_SUCCESS, rc );
+
+    // Neighbours on the ring; adding nprocs keeps the left index non-negative for pid 0.
+    const int left = (nprocs + pid - 1) % nprocs;
+    const int right = ( pid + 1) % nprocs;
+
+    // NOTE(review): the outcome of lpf_noc_serialize_slot is not checked;
+    // confirm whether it reports failures through a return value.
+    char * buffer = NULL;
+    size_t bufferSize = 0;
+    lpf_noc_serialize_slot(lpf, yslot, &buffer, &bufferSize);
+
+    // Runtime-sized arrays are not standard C++, so receive into a vector.
+    std::vector<char> rmtBuff(bufferSize);
+    const int count = static_cast<int>(bufferSize);
+
+    const int mpi_rc = MPI_Sendrecv(buffer, count, MPI_BYTE, left, 0,
+            rmtBuff.data(), count, MPI_BYTE, right, 0,
+            MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+    EXPECT_EQ( MPI_SUCCESS, mpi_rc );
+
+    rc = lpf_noc_deserialize_slot(lpf, rmtBuff.data(), yslot);
+    EXPECT_EQ( LPF_SUCCESS, rc );
+    rc = lpf_noc_put(lpf, xslot, 0, right, yslot, 0, sizeof(buf1), LPF_MSG_DEFAULT);
+    EXPECT_EQ( LPF_SUCCESS, rc );
+    rc = lpf_sync(lpf, LPF_SYNC_DEFAULT);
+    EXPECT_EQ( LPF_SUCCESS, rc );
+    EXPECT_STREQ( buf1, buf2 );
+    rc = lpf_deregister(lpf, xslot);
+    EXPECT_EQ( LPF_SUCCESS, rc );
+    rc = lpf_noc_deregister(lpf, yslot);
+    EXPECT_EQ( LPF_SUCCESS, rc );
+}
+
+/**
+ * \test Testing NOC functionality
+ * \pre P >= 2
+ * \pre P <= 2
+ * \return Exit code: 0
+ */
+TEST( API, func_lpfAPI_test_noc_ring )
+{
+    lpf_err_t rc = lpf_exec( LPF_ROOT, LPF_MAX_P, spmd, LPF_NO_ARGS);
+    EXPECT_EQ( LPF_SUCCESS, rc );
+}