diff --git a/CMakeLists.txt b/CMakeLists.txt index 68299ad49..8020c64ee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -52,6 +52,7 @@ endif() option( WITH_REFERENCE_BACKEND "With Reference backend" ON ) option( WITH_ALP_REFERENCE_BACKEND "With Reference Dense backend" ON ) option( WITH_OMP_BACKEND "With OMP backend" ON ) +option( WITH_ALP_OMP_BACKEND "With OMP Dense backend" ON ) option( WITH_NUMA "With NUMA support" ON ) option( LPF_INSTALL_PATH "Path to the LPF tools for the BSP1D and Hybrid backends" OFF ) # the following options depend on LPF_INSTALL_PATH being set @@ -190,6 +191,7 @@ endif() set( WITH_REFERENCE_BACKEND_HEADERS OFF ) set( WITH_ALP_REFERENCE_BACKEND_HEADERS OFF ) set( WITH_OMP_BACKEND_HEADERS OFF ) +set( WITH_ALP_OMP_BACKEND_HEADERS OFF ) # activate headers based on requested backends if( WITH_REFERENCE_BACKEND OR WITH_BSP1D_BACKEND ) @@ -206,6 +208,10 @@ if( WITH_ALP_REFERENCE_BACKEND ) set( WITH_ALP_REFERENCE_BACKEND_HEADERS ON ) endif() +if( WITH_ALP_OMP_BACKEND ) + set( WITH_ALP_OMP_BACKEND_HEADERS ON ) +endif() + add_subdirectory( include ) ### BACKEND IMPLEMENTATIONS diff --git a/bootstrap.sh b/bootstrap.sh index 030279452..2fad6b04e 100755 --- a/bootstrap.sh +++ b/bootstrap.sh @@ -94,6 +94,7 @@ reference=yes banshee=no lpf=no alp_reference=yes +alp_omp=yes show=no FLAGS=$'' LPF_INSTALL_PATH= @@ -297,6 +298,11 @@ the current directory before invocation or confirm the deletion of its content w else CMAKE_OPTS+=" -DWITH_ALP_REFERENCE_BACKEND=ON" fi + if [[ "${alp_omp}" == "no" ]]; then + CMAKE_OPTS+=" -DWITH_ALP_OMP_BACKEND=OFF" + else + CMAKE_OPTS+=" -DWITH_ALP_OMP_BACKEND=ON" + fi if [[ "${lpf}" == "yes" ]]; then CMAKE_OPTS+=" -DLPF_INSTALL_PATH='${ABSOLUTE_LPF_INSTALL_PATH}'" fi diff --git a/cmake/AddGRBInstall.cmake b/cmake/AddGRBInstall.cmake index 3d50788a0..2e57c6c81 100644 --- a/cmake/AddGRBInstall.cmake +++ b/cmake/AddGRBInstall.cmake @@ -45,6 +45,7 @@ install( EXPORT GraphBLASTargets set( ALP_UTILS_INSTALL_DIR "${BINARY_LIBRARIES_INSTALL_DIR}" ) set( SHMEM_BACKEND_INSTALL_DIR "${BINARY_LIBRARIES_INSTALL_DIR}/sequential" ) set( ALP_REFERENCE_BACKEND_INSTALL_DIR "${BINARY_LIBRARIES_INSTALL_DIR}/alp/reference" ) +set( ALP_OMP_BACKEND_INSTALL_DIR "${BINARY_LIBRARIES_INSTALL_DIR}/alp/omp" ) set( BSP1D_BACKEND_INSTALL_DIR "${BINARY_LIBRARIES_INSTALL_DIR}/spmd" ) set( HYBRID_BACKEND_INSTALL_DIR "${BINARY_LIBRARIES_INSTALL_DIR}/hybrid" ) @@ -136,6 +137,14 @@ if( WITH_OMP_BACKEND ) ) endif() +if( WITH_ALP_OMP_BACKEND ) + addBackendWrapperGenOptions( "alp_omp" + COMPILE_DEFINITIONS "${ALP_OMP_SELECTION_DEFS}" + LINK_FLAGS "'${ALP_OMP_BACKEND_INSTALL_DIR}/lib${BACKEND_LIBRARY_OUTPUT_NAME}.a'" + "'${ALP_UTILS_INSTALL_DIR}/lib${ALP_UTILS_LIBRARY_OUTPUT_NAME}.a'" "${NUMA_LFLAG}" + ) +endif() + # distributed memory backends if( WITH_BSP1D_BACKEND OR WITH_HYBRID_BACKEND ) assert_valid_variables( LPFRUN LPFCPP ) diff --git a/cmake/AddGRBVars.cmake b/cmake/AddGRBVars.cmake index 2d324b404..e10bd37f3 100644 --- a/cmake/AddGRBVars.cmake +++ b/cmake/AddGRBVars.cmake @@ -32,6 +32,7 @@ set( REFERENCE_OMP_BACKEND_DEFAULT_NAME "backend_reference_omp" ) set( BSP1D_BACKEND_DEFAULT_NAME "backend_bsp1d" ) set( HYBRID_BACKEND_DEFAULT_NAME "backend_hybrid" ) set( ALP_REFERENCE_BACKEND_DEFAULT_NAME "backend_alp_reference" ) +set( ALP_OMP_BACKEND_DEFAULT_NAME "backend_alp_omp" ) ### COMPILER DEFINITIONS FOR HEADERS INCLUSION AND FOR BACKEND SELECTION @@ -41,11 +42,16 @@ set( REFERENCE_INCLUDE_DEFS "_GRB_WITH_REFERENCE" ) set( REFERENCE_OMP_INCLUDE_DEFS "_GRB_WITH_OMP" ) set( 
LPF_INCLUDE_DEFS "_GRB_WITH_LPF" ) set( ALP_REFERENCE_INCLUDE_DEFS "_ALP_WITH_REFERENCE" ) +set( ALP_OMP_INCLUDE_DEFS "_ALP_WITH_OMP;_ALP_OMP_WITH_REFERENCE" ) # compiler definitions to select a backend set( REFERENCE_SELECTION_DEFS "_GRB_BACKEND=reference" ) set( REFERENCE_OMP_SELECTION_DEFS "_GRB_BACKEND=reference_omp" ) set( ALP_REFERENCE_SELECTION_DEFS "_ALP_BACKEND=reference" ) +set( ALP_OMP_SELECTION_DEFS + "_ALP_BACKEND=omp" + "_ALP_SECONDARY_BACKEND=reference" +) set( BSP1D_SELECTION_DEFS "_GRB_BACKEND=BSP1D" "_GRB_BSP1D_BACKEND=reference" @@ -59,7 +65,7 @@ set( HYBRID_SELECTION_DEFS set( NO_NUMA_DEF "_GRB_NO_LIBNUMA" ) ### **ALL** BACKENDS, EVEN IF NOT ENABLED BY USER -set( ALL_BACKENDS "reference" "reference_omp" "bsp1d" "hybrid" "alp_reference" ) +set( ALL_BACKENDS "reference" "reference_omp" "bsp1d" "hybrid" "alp_reference" "alp_omp" ) # list of user-enabled backends, for tests and wrapper scripts (do not change!) @@ -81,6 +87,10 @@ if( WITH_ALP_REFERENCE_BACKEND ) list( APPEND AVAILABLE_BACKENDS "alp_reference" ) endif() +if( WITH_ALP_OMP_BACKEND ) + list( APPEND AVAILABLE_BACKENDS "alp_omp" ) +endif() + # distributed memory backends if( WITH_BSP1D_BACKEND ) list( APPEND AVAILABLE_BACKENDS "bsp1d" ) diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt index 4e2c17aa3..b9a92bff4 100644 --- a/include/CMakeLists.txt +++ b/include/CMakeLists.txt @@ -67,7 +67,9 @@ install( TARGETS alp_utils_headers EXPORT GraphBLASTargets INCLUDES DESTINATION "${INCLUDE_INSTALL_DIR}" ) -if( WITH_ALP_REFERENCE_BACKEND_HEADERS ) +if( WITH_ALP_REFERENCE_BACKEND_HEADERS OR + WITH_ALP_OMP_BACKEND_HEADERS +) add_subdirectory( alp ) endif() diff --git a/include/alp.hpp b/include/alp.hpp index 6e6646c78..3ebc36637 100644 --- a/include/alp.hpp +++ b/include/alp.hpp @@ -134,7 +134,7 @@ // clang-format re-ordering #if 1 // load active configuration -// #include "alp/config.hpp" //defines _ALP_BACKEND and _WITH_BSP + #include "alp/config.hpp" //defines _ALP_BACKEND #endif // #pragma message "Included ALP.hpp" diff --git a/include/alp/CMakeLists.txt b/include/alp/CMakeLists.txt index c2256749b..690a371b4 100644 --- a/include/alp/CMakeLists.txt +++ b/include/alp/CMakeLists.txt @@ -22,6 +22,7 @@ # set a default backend (if they want to do so).
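# As with alp_reference, user targets select the new backend purely via
# compile definitions. An illustrative sketch (my_program is a placeholder
# target name):
#
#   target_link_libraries( my_program PRIVATE backend_alp_omp_headers )
#
# which adds -D_ALP_WITH_OMP -D_ALP_OMP_WITH_REFERENCE and links
# OpenMP::OpenMP_CXX; the generated alp_omp compiler wrapper additionally
# passes -D_ALP_BACKEND=omp -D_ALP_SECONDARY_BACKEND=reference (see
# ALP_OMP_SELECTION_DEFS in AddGRBVars.cmake above).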
# assert_defined_variables( ALP_REFERENCE_INCLUDE_DEFS WITH_ALP_REFERENCE_BACKEND_HEADERS ) +assert_defined_variables( ALP_OMP_INCLUDE_DEFS WITH_ALP_OMP_BACKEND_HEADERS ) assert_valid_variables( INCLUDE_INSTALL_DIR ) # to avoid flaky acrobatics with regex or glob expressions, copy main files directly @@ -91,6 +92,22 @@ if( WITH_ALP_REFERENCE_BACKEND ) ) endif() +if( WITH_ALP_OMP_BACKEND_HEADERS ) + add_library( backend_alp_omp_headers INTERFACE ) + target_link_libraries( backend_alp_omp_headers INTERFACE backend_headers_nodefs ) + target_link_libraries( backend_alp_omp_headers INTERFACE OpenMP::OpenMP_CXX ) + target_compile_definitions( backend_alp_omp_headers INTERFACE "${ALP_OMP_INCLUDE_DEFS}" ) + + install( TARGETS backend_alp_omp_headers EXPORT GraphBLASTargets ) +endif() + +if( WITH_ALP_OMP_BACKEND ) + install( DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/alp/omp" + DESTINATION "${ALP_INCLUDE_INSTALL_DIR}/alp_omp" + FILES_MATCHING REGEX "${HEADERS_REGEX}" + ) +endif() + # this target lists the algorithms implemented on top of the generic functionalities, # hence it depends only on backend_headers_nodefs add_library( alp_algorithms INTERFACE ) diff --git a/include/alp/amf-based/functorbasedmatrix.hpp b/include/alp/amf-based/functorbasedmatrix.hpp index f48ecca4d..0e56a8688 100644 --- a/include/alp/amf-based/functorbasedmatrix.hpp +++ b/include/alp/amf-based/functorbasedmatrix.hpp @@ -84,6 +84,7 @@ namespace alp { typedef T value_type; /** Type returned by access function */ typedef T access_type; + typedef T const_access_type; /** Type of the index used to access the physical storage */ typedef std::pair< size_t, size_t > storage_index_type; diff --git a/include/alp/amf-based/matrix.hpp b/include/alp/amf-based/matrix.hpp index 13240bb6e..8fac4ec3f 100644 --- a/include/alp/amf-based/matrix.hpp +++ b/include/alp/amf-based/matrix.hpp @@ -90,7 +90,7 @@ namespace alp { typename MatrixType, std::enable_if_t< is_matrix< MatrixType >::value > * = nullptr > - const typename MatrixType::access_type access( const MatrixType &, const typename MatrixType::storage_index_type & ); + typename MatrixType::const_access_type access( const MatrixType &, const typename MatrixType::storage_index_type & ); template< typename MatrixType, @@ -146,7 +146,7 @@ namespace alp { typename MatrixType, std::enable_if_t< is_matrix< MatrixType >::value > * > - friend const typename MatrixType::access_type access( const MatrixType &A, const typename MatrixType::storage_index_type &storageIndex ); + friend typename MatrixType::const_access_type access( const MatrixType &A, const typename MatrixType::storage_index_type &storageIndex ); template< typename MatrixType, @@ -168,9 +168,9 @@ namespace alp { static_cast< DerivedMatrix & >( *this ).setInitialized( initialized ); } - template< typename AccessType, typename StorageIndexType > - const AccessType access( const StorageIndexType storageIndex ) const { - static_assert( std::is_same< AccessType, typename DerivedMatrix::access_type >::value ); + template< typename ConstAccessType, typename StorageIndexType > + ConstAccessType access( const StorageIndexType storageIndex ) const { + static_assert( std::is_same< ConstAccessType, typename DerivedMatrix::const_access_type >::value ); static_assert( std::is_same< StorageIndexType, typename DerivedMatrix::storage_index_type >::value ); return static_cast< const DerivedMatrix & >( *this ).access( storageIndex ); } @@ -348,8 +348,8 @@ namespace alp { > Matrix( const size_t rows, const size_t cols, const size_t cap = 0 ) : base_type( - 
storage::AMFFactory::FromPolynomial< - typename internal::determine_poly_factory< structure, ImfR, ImfC, backend >::factory_type + storage::AMFFactory< backend >::template FromPolynomial< + Structure, ImfR, ImfC >::Create( ImfR( rows ), ImfC( cols ) @@ -383,8 +383,8 @@ > Matrix( const size_t dim, const size_t cap = 0 ) : base_type( - storage::AMFFactory::FromPolynomial< - typename internal::determine_poly_factory< structure, ImfR, ImfC, backend >::factory_type + storage::AMFFactory< backend >::template FromPolynomial< + Structure, ImfR, ImfC >::Create( ImfR( dim ), ImfC( dim ) @@ -418,7 +418,7 @@ Matrix( SourceType &source_matrix, ImfR imf_r, ImfC imf_c ) : base_type( getContainer( source_matrix ), - storage::AMFFactory::Compose< + storage::AMFFactory< backend >::template Compose< ImfR, ImfC, typename SourceType::base_type::amf_type >::Create( imf_r, imf_c, internal::getAmf( source_matrix ) ) ) {} @@ -441,14 +441,43 @@ Matrix( SourceType &source_matrix ) : base_type( getContainer( source_matrix ), - storage::AMFFactory::Reshape< View::type_id, typename SourceType::amf_type >::Create( internal::getAmf( source_matrix ) ) + storage::AMFFactory< backend >::template Reshape< View::type_id, typename SourceType::amf_type >::Create( internal::getAmf( source_matrix ) ) ) {} /** - * @deprecated - * Constructor for a view over another storage-based matrix. + * Constructor for a view over an internal container of another matrix. * - * @tparam SourceType The type of the target matrix. + * @tparam BufferType The type of the raw buffer holding the container's + * elements. + * @tparam AmfType The type of the amf corresponding to the layout of + * the provided container. + * Used as a template parameter to avoid a hard + * compilation error in the case of FunctorBasedMatrix, + * when base_type::amf_type does not exist. + */ + template< + typename BufferType, + typename AmfType, + std::enable_if_t< + !is_container< BufferType >::value && + internal::is_view_over_storage< View >::value + > * = nullptr + > + Matrix( BufferType &&buffer, const size_t buffer_size, AmfType &&amf ) : + base_type( + buffer, + buffer_size, + std::forward< typename base_type::amf_type >( amf ) + ) { + static_assert( + std::is_same< typename base_type::amf_type, typename std::remove_reference< AmfType >::type >::value, + "The type of the provided AMF does not match the type of the constructed container's AMF" + ); + } + + /** + * Constructor for a view over another matrix's internal container. + * + * @tparam ContainerType The type of the internal container. * @tparam AmfType The type of the amf used to construct the matrix. * Used as a template parameter to benefit from * SFINAE for the case of FunctorBasedMatrix, when * base_type::amf_type does not exist, which would * result in a hard compilation error.
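 *
 * \par Example
 * A sketch of how the omp backend constructs such a view internally (cf.
 * internal::get_view in alp/omp/matrix.hpp later in this change; target_t
 * names the sequential matrix type):
 * \code
 * target_t blk_matrix( container, amf );
 * \endcode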
*/ template< - typename SourceType, + typename ContainerType, typename AmfType, std::enable_if_t< - std::is_same< SourceType, typename View::applied_to >::value && + internal::is_container< ContainerType >::value && internal::is_view_over_storage< View >::value && !internal::requires_allocation< View >::value > * = nullptr > - Matrix( SourceType &source_matrix, AmfType &&amf ) : + Matrix( ContainerType &container, AmfType &&amf ) : base_type( - getContainer( source_matrix ), + container, std::forward< typename base_type::amf_type >( amf ) ) { static_assert( - std::is_same< typename base_type::amf_type, AmfType >::value, + std::is_same< typename base_type::amf_type, typename std::remove_reference< AmfType >::type >::value, "The AMF type of the constructor parameter needs to match the AMF type of this container specialization." ); } @@ -1086,10 +1115,10 @@ namespace alp { typename MatrixType, std::enable_if_t< is_matrix< MatrixType >::value > * = nullptr > - const typename MatrixType::access_type access( const MatrixType &A, const typename MatrixType::storage_index_type &storageIndex ) { + typename MatrixType::const_access_type access( const MatrixType &A, const typename MatrixType::storage_index_type &storageIndex ) { return static_cast< const MatrixBase< typename MatrixType::base_type > & - >( A ).template access< typename MatrixType::access_type, typename MatrixType::storage_index_type >( storageIndex ); + >( A ).template access< typename MatrixType::const_access_type, typename MatrixType::storage_index_type >( storageIndex ); } /** Non-constant variant. **/ diff --git a/include/alp/amf-based/storage.hpp b/include/alp/amf-based/storage.hpp index 15a73572f..7d5561339 100644 --- a/include/alp/amf-based/storage.hpp +++ b/include/alp/amf-based/storage.hpp @@ -36,6 +36,27 @@ namespace alp { + namespace internal { + + /** + * Determines the mapping polynomial type and exposes a factory method + * to create instances of that polynomial. + * + * All specializations of this type trait should define the factory + * method following the same signature. The factory method shall + * return an object of the type exposed as \a type. + * + * @tparam Structure Matrix structure + * @tparam ImfR Row IMF type + * @tparam ImfC Column IMF type + * @tparam backend The backend + * + */ + template< typename Structure, typename ImfR, typename ImfC, enum Backend backend > + struct determine_poly_factory {}; + + } // namespace internal + namespace storage { enum StorageOrientation { @@ -581,6 +602,10 @@ namespace alp { }; // namespace polynomials + /** Forward declaration */ + template< enum Backend backend > + class AMFFactory; + /** * Access Mapping Function (AMF) maps logical matrix coordinates (i, j) * to the corresponding matrix element's location in the physical container. @@ -599,10 +624,13 @@ namespace alp { * All AMF specializations shall expose the effective types of the IMFs * and the mapping polynomial, since these may change after the fusion. */ - template< typename ImfR, typename ImfC, typename MappingPolynomial > + template< + typename ImfR, typename ImfC, typename MappingPolynomial, + enum Backend backend + > class AMF { - friend class AMFFactory; + friend class AMFFactory< backend >; public: @@ -747,8 +775,9 @@ namespace alp { }; // class AMF /** - * @brief Collects AMF factory classes + * Collects AMF factory classes. 
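+ *
+ * \par Example
+ * An illustrative sketch, mirroring the Matrix constructors above, of
+ * creating an AMF for an original rows-by-cols container:
+ * \code
+ * auto amf = storage::AMFFactory< backend >::template FromPolynomial<
+ *     Structure, ImfR, ImfC
+ * >::Create( ImfR( rows ), ImfC( cols ) );
+ * \endcode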
*/ + template< enum Backend backend > struct AMFFactory { /** @@ -811,9 +840,9 @@ namespace alp { public: - typedef AMF< final_imf_r_type, final_imf_c_type, final_polynomial_type > amf_type; + typedef AMF< final_imf_r_type, final_imf_c_type, final_polynomial_type, backend > amf_type; - static amf_type Create( ViewImfR imf_r, ViewImfC imf_c, const AMF< SourceImfR, SourceImfC, SourcePoly > &amf ) { + static amf_type Create( ViewImfR imf_r, ViewImfC imf_c, const AMF< SourceImfR, SourceImfC, SourcePoly, backend > &amf ) { composed_imf_r_type composed_imf_r = imf::ComposedFactory< SourceImfR, ViewImfR >::create( amf.imf_r, imf_r ); composed_imf_c_type composed_imf_c = imf::ComposedFactory< SourceImfC, ViewImfC >::create( amf.imf_c, imf_c ); return amf_type( @@ -841,10 +870,21 @@ namespace alp { * @tparam PolyType Type of the mapping polynomial. * */ - template< typename PolyFactory > + template< typename Structure, typename ImfR, typename ImfC > struct FromPolynomial { - typedef AMF< imf::Id, imf::Id, typename PolyFactory::poly_type > amf_type; + // Ensure compatibility of IMF types. + // Original Matrix has imf::Id as both IMFs. + // Original Vector has ImfR = imf::Id and ImfC = imf::Zero. + static_assert( + std::is_same< ImfR, imf::Id >::value && + ( std::is_same< ImfC, imf::Id >::value || std::is_same< ImfC, imf::Zero >::value ), + "AMF factory FromPolynomial can only be used for an original container." + ); + + typedef typename internal::determine_poly_factory< Structure, ImfR, ImfC, backend >::factory_type PolyFactory; + + typedef AMF< imf::Id, imf::Id, typename PolyFactory::poly_type, backend > amf_type; /** * Factory method used by 2D containers. @@ -890,13 +930,13 @@ namespace alp { static_assert( std::is_same< amf_type, - typename Compose< imf::Id, imf::Zero, AMF< imf::Id, imf::Id, typename PolyFactory::poly_type > >::amf_type + typename Compose< imf::Id, imf::Zero, AMF< imf::Id, imf::Id, typename PolyFactory::poly_type, backend > >::amf_type >::value, "The factory method returns the object of different type than declared. This is a bug." 
); - return Compose< imf::Id, imf::Zero, AMF< imf::Id, imf::Id, typename PolyFactory::poly_type > >::Create( + return Compose< imf::Id, imf::Zero, AMF< imf::Id, imf::Id, typename PolyFactory::poly_type, backend > >::Create( imf_r, imf_c, - FromPolynomial< PolyFactory >::Create( imf::Id( imf_r.N ), imf::Id( imf_c.N ) ) + FromPolynomial< Structure, imf::Id, imf::Zero >::Create( imf::Id( imf_r.N ), imf::Id( imf_c.N ) ) ); } @@ -950,7 +990,8 @@ namespace alp { typename polynomials::apply_view< view::transpose, typename SourceAMF::mapping_polynomial_type - >::type + >::type, + backend > amf_type; static amf_type Create( const SourceAMF &amf ) { @@ -958,7 +999,8 @@ namespace alp { return AMF< typename SourceAMF::imf_c_type, typename SourceAMF::imf_r_type, - new_mapping_polynomial_type + new_mapping_polynomial_type, + backend >( amf.imf_c, amf.imf_r, @@ -1004,7 +1046,7 @@ namespace alp { public: - typedef AMF< imf::Id, imf::Zero, new_poly_type > amf_type; + typedef AMF< imf::Id, imf::Zero, new_poly_type, backend > amf_type; static amf_type Create( const SourceAMF &amf ) { assert( amf.getLogicalDimensions().first == amf.getLogicalDimensions().second ); @@ -1034,10 +1076,10 @@ namespace alp { template< typename SourceAMF > struct Reshape< view::matrix, SourceAMF > { - typedef typename AMFFactory::Compose< imf::Id, imf::Id, SourceAMF >::amf_type amf_type; + typedef typename Compose< imf::Id, imf::Id, SourceAMF >::amf_type amf_type; static amf_type Create( const SourceAMF &amf ) { - return storage::AMFFactory::Compose< imf::Id, imf::Id, SourceAMF >::Create( + return Compose< imf::Id, imf::Id, SourceAMF >::Create( imf::Id( amf.getLogicalDimensions().first ), imf::Id( amf.getLogicalDimensions().second ), amf @@ -1054,23 +1096,6 @@ namespace alp { namespace internal { - /** - * Determines the mapping polynomial type and exposes a factory method - * to create instances of that polynomial. - * - * All specializations of this type trait should define the factory - * method following the same signature. The factory method shall - * return an object of the type exposed as \a type. - * - * @tparam Structure Matrix structure - * @tparam ImfR Row IMF type - * @tparam ImfC Column IMF type - * @tparam backend The backend - * - */ - template< typename Structure, typename ImfR, typename ImfC, enum Backend backend > - struct determine_poly_factory {}; - /** * Determines the AMF type for a matrix having the provided static properties. * @@ -1131,10 +1156,10 @@ namespace alp { typedef typename std::conditional< View::type_id == view::gather, - typename storage::AMFFactory::Compose< + typename storage::AMFFactory< backend >::template Compose< ImfR, ImfC, typename View::applied_to::amf_type >::amf_type, - typename storage::AMFFactory::Reshape< + typename storage::AMFFactory< backend >::template Reshape< View::type_id, typename View::applied_to::amf_type >::amf_type @@ -1142,7 +1167,7 @@ namespace alp { }; - /** Specialization for containers that allocate storage */ + /** Specialization for storage-based containers that allocate storage */ template< typename Structure, typename ImfC, enum Backend backend > struct determine_amf_type< Structure, view::Original< void >, imf::Id, ImfC, backend > { @@ -1151,12 +1176,12 @@ namespace alp { "Incompatible combination of parameters provided to determine_amf_type." 
); - typedef typename storage::AMFFactory::FromPolynomial< - typename determine_poly_factory< Structure, imf::Id, ImfC, backend >::factory_type + typedef typename storage::AMFFactory< backend >::template FromPolynomial< + Structure, imf::Id, ImfC >::amf_type type; }; - /** Specialization for containers that allocate storage */ + /** Specialization for functor-based containers that allocate storage */ template< typename Structure, typename ImfC, enum Backend backend, typename Lambda > struct determine_amf_type< Structure, view::Functor< Lambda >, imf::Id, ImfC, backend > { @@ -1165,10 +1190,10 @@ namespace alp { "Incompatible combination of parameters provided to determine_amf_type." ); - typedef typename storage::AMFFactory::FromPolynomial< - storage::polynomials::NoneFactory - >::amf_type type; + // A functor-based container does not have an AMF + typedef void type; }; + } // namespace internal } // namespace alp diff --git a/include/alp/amf-based/storagebasedmatrix.hpp b/include/alp/amf-based/storagebasedmatrix.hpp index d1c38c59f..71c112554 100644 --- a/include/alp/amf-based/storagebasedmatrix.hpp +++ b/include/alp/amf-based/storagebasedmatrix.hpp @@ -62,19 +62,6 @@ namespace alp { > void setInitialized( MatrixType &, const bool ) noexcept; - /** Forward declarations for access functions */ - template< - typename MatrixType, - std::enable_if_t< is_matrix< MatrixType >::value > * = nullptr - > - const typename MatrixType::access_type access( const MatrixType &, const typename MatrixType::storage_index_type & ); - - template< - typename MatrixType, - std::enable_if_t< is_matrix< MatrixType >::value > * = nullptr - > - typename MatrixType::access_type access( MatrixType &, const typename MatrixType::storage_index_type & ); - /** Forward declaration */ template< typename T, typename AmfType, bool requires_allocation, Backend backend > class StorageBasedMatrix; @@ -126,29 +113,6 @@ namespace alp { > const typename MatrixType::amf_type &getAmf( const MatrixType &A ) noexcept; - /** Forward declaration */ - template< typename DerivedMatrix > - class MatrixBase; - - template< typename DerivedMatrix > - std::pair< size_t, size_t > dims( const MatrixBase< DerivedMatrix > & A ) noexcept; - - template< - typename MatrixType, - std::enable_if_t< internal::is_storage_based< MatrixType >::value > * = nullptr - > - size_t getStorageDimensions( const MatrixType &A ) noexcept; - - template< typename MatrixType, - std::enable_if_t< is_matrix< MatrixType>::value > * = nullptr - > - bool getInitialized( const MatrixType &A ) noexcept; - - template< typename MatrixType, - std::enable_if_t< is_matrix< MatrixType>::value > * = nullptr - > - void setInitialized( MatrixType &, const bool ) noexcept; - /** * Matrix container specialization * Implements both original containers and views on containers. @@ -181,6 +145,7 @@ namespace alp { typedef typename AmfType::imf_c_type imf_c_type; /** Type returned by access function */ typedef T &access_type; + typedef const T &const_access_type; /** Type of the index used to access the physical storage */ typedef size_t storage_index_type; @@ -269,7 +234,7 @@ namespace alp { * * @return const reference or value of the element at given position. 
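 *
 * \par Example
 * An illustrative sketch (cf. buildMatrix in alp/omp/io.hpp later in this
 * change), where i and j are logical coordinates:
 * \code
 * const auto &elem = internal::access( A, internal::getStorageIndex( A, i, j ) );
 * \endcode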
*/ - const access_type access( const storage_index_type &storageIndex ) const { + const_access_type access( const storage_index_type &storageIndex ) const { return container[ storageIndex ]; } @@ -310,6 +275,11 @@ container( container ), amf( std::move( amf ) ) {} + /** View on another raw container */ + StorageBasedMatrix( T *buffer, const size_t buffer_size, AmfType &&amf ) : + container( buffer, buffer_size ), + amf( std::move( amf ) ) {} + }; // class StorageBasedMatrix diff --git a/include/alp/backends.hpp b/include/alp/backends.hpp index 1b7befaf0..4b1aa8b52 100644 --- a/include/alp/backends.hpp +++ b/include/alp/backends.hpp @@ -40,7 +40,12 @@ namespace alp { /** * The ALP reference backend. */ - reference + reference, + + /** + * The ALP OpenMP backend. + */ + omp, }; diff --git a/include/alp/base/config.hpp b/include/alp/base/config.hpp index 00455225b..5a5dcc0fe 100644 --- a/include/alp/base/config.hpp +++ b/include/alp/base/config.hpp @@ -41,6 +41,15 @@ #define _ALP_BACKEND reference #endif +// if the user did not define _ALP_SECONDARY_BACKEND, set it to the default +// sequential implementation. This setting may be used by other backends for +// backend-specific purposes. For example, a parallel backend may use this +// setting to control to which sequential backend it dispatches sequential +// work. +#ifndef _ALP_SECONDARY_BACKEND + #define _ALP_SECONDARY_BACKEND reference +#endif + /** * The main GraphBLAS namespace. * diff --git a/include/alp/blas3.hpp b/include/alp/blas3.hpp index e1f6d0baa..976d3ccb7 100644 --- a/include/alp/blas3.hpp +++ b/include/alp/blas3.hpp @@ -26,6 +26,9 @@ #ifdef _ALP_WITH_REFERENCE #include <alp/reference/blas3.hpp> #endif +#ifdef _ALP_WITH_OMP + #include <alp/omp/blas3.hpp> +#endif #endif // end _H_ALP_BLAS3 diff --git a/include/alp/config.hpp b/include/alp/config.hpp index 630c30de7..3913ee6da 100644 --- a/include/alp/config.hpp +++ b/include/alp/config.hpp @@ -27,6 +27,9 @@ #ifdef _ALP_WITH_REFERENCE #include "alp/reference/config.hpp" #endif +#ifdef _ALP_WITH_OMP + #include "alp/omp/config.hpp" +#endif #endif // end ``_H_ALP_CONFIG'' diff --git a/include/alp/exec.hpp b/include/alp/exec.hpp index 9fbadaee7..c20e21cfd 100644 --- a/include/alp/exec.hpp +++ b/include/alp/exec.hpp @@ -31,6 +31,9 @@ #ifdef _ALP_WITH_REFERENCE #include "alp/reference/exec.hpp" #endif +#ifdef _ALP_WITH_OMP + #include "alp/omp/exec.hpp" +#endif #ifdef _ALP_BACKEND namespace alp { diff --git a/include/alp/init.hpp b/include/alp/init.hpp index a07812bfe..d59782b24 100644 --- a/include/alp/init.hpp +++ b/include/alp/init.hpp @@ -27,6 +27,9 @@ #ifdef _ALP_WITH_REFERENCE #include "alp/reference/init.hpp" #endif +#ifdef _ALP_WITH_OMP + #include "alp/omp/init.hpp" +#endif #endif // end ``_H_ALP_INIT'' diff --git a/include/alp/io.hpp b/include/alp/io.hpp index de1536188..73bb7e082 100644 --- a/include/alp/io.hpp +++ b/include/alp/io.hpp @@ -27,6 +27,9 @@ #ifdef _ALP_WITH_REFERENCE #include <alp/reference/io.hpp> #endif +#ifdef _ALP_WITH_OMP + #include <alp/omp/io.hpp> +#endif #endif // end ``_H_ALP_IO'' diff --git a/include/alp/matrix.hpp b/include/alp/matrix.hpp index 22b63eff0..20031e3c7 100644 --- a/include/alp/matrix.hpp +++ b/include/alp/matrix.hpp @@ -31,6 +31,9 @@ #ifdef _ALP_WITH_REFERENCE #include <alp/reference/matrix.hpp> #endif +#ifdef _ALP_WITH_OMP + #include <alp/omp/matrix.hpp> +#endif // specify default only if requested during compilation #ifdef _ALP_BACKEND diff --git a/include/alp/omp/blas3.hpp b/include/alp/omp/blas3.hpp new file mode 100644 index 000000000..4954e00b5 --- /dev/null +++ b/include/alp/omp/blas3.hpp @@ -0,0 +1,511 @@ + +/* + * Copyright 2021 Huawei
Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @author A. N. Yzelman + * @date 14th of January 2022 + */ + +#ifndef _H_ALP_OMP_BLAS3 +#define _H_ALP_OMP_BLAS3 + +#include <algorithm> // for std::min/max +#include <type_traits> // for std::enable_if + +#include +#include + +#include + +#include "matrix.hpp" +#include "storage.hpp" + +// Include backend to which sequential work is delegated +#ifdef _ALP_OMP_WITH_REFERENCE + #include + #include + #include +#endif + +#ifndef NDEBUG +#include "../../../tests/utils/print_alp_containers.hpp" +#endif + + +namespace alp { + + namespace internal { + + /** + * \internal general mxm implementation that all mxm variants using + * structured matrices refer to. + */ + template< + bool allow_void, + class MulMonoid, + typename OutputType, typename InputType1, typename InputType2, + class Operator, class Monoid, + typename OutputStructure, typename OutputView, + typename OutputImfR, typename OutputImfC, + typename InputStructure1, typename InputView1, + typename InputImfR1, typename InputImfC1, + typename InputStructure2, typename InputView2, + typename InputImfR2, typename InputImfC2 + > + RC mxm_generic( + alp::Matrix< OutputType, OutputStructure, + Density::Dense, OutputView, OutputImfR, OutputImfC, omp > &C, + alp::Matrix< InputType1, InputStructure1, + Density::Dense, InputView1, InputImfR1, InputImfC1, omp > &A, + alp::Matrix< InputType2, InputStructure2, + Density::Dense, InputView2, InputImfR2, InputImfC2, omp > &B, + const Operator &oper, + const Monoid &monoid, + const MulMonoid &mulMonoid, + const std::enable_if_t< !alp::is_object< OutputType >::value && + !alp::is_object< InputType1 >::value && ! + alp::is_object< InputType2 >::value && + alp::is_operator< Operator >::value && + alp::is_monoid< Monoid >::value, + void > * const = NULL + ) { + + static_assert( + !( + std::is_same< InputType1, void >::value || + std::is_same< InputType2, void >::value + ), + "alp::internal::mxm_generic: the operator-monoid version of mxm cannot be " + "used if either of the input matrices is a pattern matrix (of type " + "void)" + ); + +#ifndef NDEBUG + std::cout << "In alp::internal::mxm_generic (omp)\n"; +#endif + + // Early exit checks + if( ! internal::getInitialized( A ) || ! internal::getInitialized( B ) || !
internal::getInitialized( C ) + ) { + internal::setInitialized( C, false ); + return SUCCESS; + } + + const std::ptrdiff_t m { static_cast< std::ptrdiff_t >( nrows( C ) ) }; + const std::ptrdiff_t n { static_cast< std::ptrdiff_t >( ncols( C ) ) }; + const std::ptrdiff_t m_a { static_cast< std::ptrdiff_t >( nrows( A ) ) }; + const std::ptrdiff_t k { static_cast< std::ptrdiff_t >( ncols( A ) ) }; + const std::ptrdiff_t k_b { static_cast< std::ptrdiff_t >( nrows( B ) ) }; + const std::ptrdiff_t n_b { static_cast< std::ptrdiff_t >( ncols( B ) ) }; + + if( m != m_a || k != k_b || n != n_b ) { + return MISMATCH; + } + + const Distribution_2_5D &da = internal::getAmf( A ).getDistribution(); + const Distribution_2_5D &db = internal::getAmf( B ).getDistribution(); + const Distribution_2_5D &dc = internal::getAmf( C ).getDistribution(); + + const auto tg_a = da.getThreadGridDims(); + const auto tg_b = db.getThreadGridDims(); + const auto tg_c = dc.getThreadGridDims(); + + // 2.5D factor ranges within 3D limits + if( tg_c.rt != tg_a.rt || tg_a.rt != tg_b.rt || ( tg_a.tc % tg_a.rt ) != 0 ) { + return MISMATCH; + } + + if( tg_c.tr != tg_a.tr || tg_c.tc != tg_b.tc || tg_a.tc != tg_b.tr ) { + return MISMATCH; + } + + using th_coord_t = typename Distribution_2_5D::ThreadCoords; + + RC rc = SUCCESS; + +#ifndef NDEBUG + #pragma omp critical + std::cerr << "In alp::internal::mxm_generic (omp)\n" + "\tBeginning parallel region" << std::endl; +#endif + + #pragma omp parallel + { + const size_t thread = config::OMP::current_thread_ID(); + + const th_coord_t th_ijk_a = da.getThreadCoords( thread ); + const th_coord_t th_ijk_b = db.getThreadCoords( thread ); + const th_coord_t th_ijk_c = dc.getThreadCoords( thread ); + + RC local_rc = SUCCESS; + + // Broadcast A and B to all c-dimensional layers + if( local_rc == SUCCESS && da.isActiveThread( th_ijk_a ) && th_ijk_a.rt > 0 ) { + +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tCopying A." << std::endl; +#endif + + const auto set_block_grid_dims_a = da.getLocalBlockGridDims( th_ijk_a ); + + th_coord_t th_ij0_a( th_ijk_a.tr, th_ijk_a.tc, 0 ); + + for( size_t br = 0; br < set_block_grid_dims_a.first; ++br ) { + for( size_t bc = 0; bc < set_block_grid_dims_a.second; ++bc ) { + + auto refAij0 = get_view( A, th_ij0_a, br, bc ); + auto refAijk = get_view( A, th_ijk_a, br, bc ); + + local_rc = local_rc ? local_rc : set( refAijk, refAij0 ); + + } + } + + } // End Broadcast of A + + if( local_rc == SUCCESS && db.isActiveThread( th_ijk_b ) && th_ijk_b.rt > 0 ) { + +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tCopying B." << std::endl; +#endif + + const auto set_block_grid_dims_b = db.getLocalBlockGridDims( th_ijk_b ); + + th_coord_t th_ij0_b( th_ijk_b.tr, th_ijk_b.tc, 0 ); + + for( size_t br = 0; br < set_block_grid_dims_b.first; ++br ) { + for( size_t bc = 0; bc < set_block_grid_dims_b.second; ++bc ) { + + auto refBij0 = get_view( B, th_ij0_b, br, bc ); + auto refBijk = get_view( B, th_ijk_b, br, bc ); + + local_rc = local_rc ? local_rc : set( refBijk, refBij0 ); + } + } + } // End Broadcast of B + + if( local_rc == SUCCESS && dc.isActiveThread( th_ijk_c ) && th_ijk_c.rt > 0 ) { + +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tZeroing C." 
<< std::endl; +#endif + + const auto block_grid_dims_c = dc.getLocalBlockGridDims( th_ijk_c ); + + alp::Scalar< + OutputType, alp::structures::General, config::default_sequential_backend + > zero( monoid.template getIdentity< OutputType >() ); + + for( size_t br = 0; br < block_grid_dims_c.first; ++br ) { + for( size_t bc = 0; bc < block_grid_dims_c.second; ++bc ) { + auto refCijk = get_view( C, th_ijk_c, br, bc ); + local_rc = local_rc ? local_rc : set( refCijk, zero ); + } + } + } // End Zero-ing of C + + // Different values for rc could converge here (eg, MISMATCH, FAILED). + if( local_rc != SUCCESS ) { +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tIssues replicating input matrices." << std::endl; +#endif + rc = local_rc; + } + + // End Broadcast of A and B and zero-ing of C + #pragma omp barrier + +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tPassing barrier" << std::endl; +#endif + + if( rc == SUCCESS && dc.isActiveThread( th_ijk_c ) ) { + + const auto block_grid_dims_c = dc.getLocalBlockGridDims( th_ijk_c ); + + // Initialize circular shifts at stride of Rt + size_t c_a = utils::modulus( th_ijk_a.tc + th_ijk_a.tr + th_ijk_a.rt * tg_a.tc / tg_a.rt, tg_a.tc ); + size_t r_b = utils::modulus( th_ijk_b.tr + th_ijk_b.tc + th_ijk_b.rt * tg_b.tr / tg_b.rt, tg_b.tr ); + + // Per-c-dimensional-layer partial computation + for( size_t r = 0; r < tg_a.tc / tg_a.rt; ++r ) { + + const th_coord_t th_isk_a( th_ijk_a.tr, c_a, th_ijk_a.rt ); + const th_coord_t th_sjk_b( r_b, th_ijk_b.tc, th_ijk_b.rt ); + +#ifndef NDEBUG + if( thread == 1 ) { + #pragma omp critical + { + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tCompute iteration r=" << r << "\n" + "\tStarting from A( " << th_ijk_a.tr << ", " << c_a << ", " << th_ijk_a.rt << " )\n" + "\tStarting from B( " << r_b << ", " << th_ijk_b.tc << ", " << th_ijk_b.rt << " )" << std::endl; + } + } +#endif + + if( da.isActiveThread( th_isk_a ) && db.isActiveThread( th_sjk_b ) ) { + + const auto block_grid_dims_a = da.getLocalBlockGridDims( th_isk_a ); + + for( size_t br = 0; br < block_grid_dims_c.first; ++br ) { + for( size_t bc = 0; bc < block_grid_dims_c.second; ++bc ) { + for( size_t bk = 0; bk < block_grid_dims_a.second; ++bk ) { + + auto refC_ijk = get_view( C, th_ijk_c, br, bc ); + auto refA_loc = get_view( A, th_isk_a, br, bk ); + auto refB_loc = get_view( B, th_sjk_b, bk, bc ); + + // Delegate the block-local multiply-accumulate to the sequential backend + local_rc = local_rc ? local_rc : internal::mxm_generic< allow_void >( refC_ijk, refA_loc, refB_loc, oper, monoid, mulMonoid ); + +#ifndef NDEBUG + if( thread == 1 ) { + #pragma omp critical + { + print_matrix("Cref post", refC_ijk ); + } + } +#endif + + } + } + } + + // Circular shift rightwards for A + c_a = utils::modulus( c_a + 1, tg_a.tc ); + // Circular shift downwards for B + r_b = utils::modulus( r_b + 1, tg_b.tr ); + + } + + } + + } // End computation per layer of c-dimension + + // Different values for rc could converge here (eg, MISMATCH, FAILED). + if( local_rc != SUCCESS ) { +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tIssues with local mxm computations." << std::endl; +#endif + rc = local_rc; + } + + // End layer-by-layer partial computation + #pragma omp barrier + + // Final c-dimension reduction + if( rc == SUCCESS && dc.isActiveThread( th_ijk_c ) && th_ijk_c.rt == 0 ) { + +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tEntering reduction."
<< std::endl; +#endif + + const auto block_grid_dims_c = dc.getLocalBlockGridDims( th_ijk_c ); + + for( size_t r = 1; r < tg_c.rt; ++r ) { + +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tReduction iteration r=" << r << std::endl; +#endif + + const th_coord_t th_ijr_c( th_ijk_c.tr, th_ijk_c.tc, r ); + + for( size_t br = 0; br < block_grid_dims_c.first; ++br ) { + for( size_t bc = 0; bc < block_grid_dims_c.second; ++bc ) { + + auto refCij0 = internal::get_view( C, th_ijk_c, br, bc ); // k == 0 + auto refCijr = internal::get_view( C, th_ijr_c, br, bc ); + + // Final result in C at layer 0 + local_rc = local_rc ? local_rc : foldl( refCij0, refCijr, monoid ); + } + } + + } + + } // End of final reduction along c-dimension + + // Different values for rc could converge here (eg, MISMATCH, FAILED). + if( local_rc != SUCCESS ) { +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tIssues with final reduction." << std::endl; +#endif + rc = local_rc; + } + + } + + return rc; + + } + + } // namespace internal + + /** + * Dense Matrix-Matrix multiply between structured matrices. + * Version with semiring parameter. + * + * @tparam descr The descriptors under which to perform the computation. + * @tparam OutputStructMatT The structured matrix type of the output matrix. + * @tparam InputStructMatT1 The structured matrix type of the left-hand side input + * matrix. + * @tparam InputStructMatT2 The structured matrix type of the right-hand side input + * matrix. + * @tparam Semiring The semiring under which to perform the + * multiplication. + * + * @returns SUCCESS If the computation completed as intended. + * @returns MISMATCH Whenever the structures or dimensions of \a A, \a B, and \a C do + * not match. All input data containers are left + * untouched if this exit code is returned; it will be + * as though this call was never made. + * + * @param[out] C The output matrix \f$ C = AB \f$ when the function returns + * #SUCCESS. + * @param[in] A The left-hand side input matrix \f$ A \f$. + * @param[in] B The right-hand side input matrix \f$ B \f$. + * @param[in] ring (Optional.) The semiring under which the computation should + * proceed. + * @param phase The execution phase.
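+ *
+ * \par Example
+ * An illustrative sketch (container definitions elided; the semiring below
+ * is the usual plus-times semiring over doubles):
+ * \code
+ * alp::Semiring<
+ *     alp::operators::add< double >, alp::operators::mul< double >,
+ *     alp::identities::zero, alp::identities::one
+ * > ring;
+ * alp::RC rc = alp::mxm( C, A, B, ring );
+ * \endcode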
+ */ + template< + typename OutputType, typename InputType1, typename InputType2, + typename OutputStructure, typename OutputView, + typename OutputImfR, typename OutputImfC, + typename InputStructure1, typename InputView1, + typename InputImfR1, typename InputImfC1, + typename InputStructure2, typename InputView2, + typename InputImfR2, typename InputImfC2, + class Semiring + > + RC mxm( + alp::Matrix< OutputType, OutputStructure, + Density::Dense, OutputView, OutputImfR, OutputImfC, omp > &C, + alp::Matrix< InputType1, InputStructure1, + Density::Dense, InputView1, InputImfR1, InputImfC1, omp > &A, + alp::Matrix< InputType2, InputStructure2, + Density::Dense, InputView2, InputImfR2, InputImfC2, omp > &B, + const Semiring & ring = Semiring(), + const PHASE &phase = NUMERICAL, + const std::enable_if_t< + !alp::is_object< OutputType >::value && + !alp::is_object< InputType1 >::value && + !alp::is_object< InputType2 >::value && + alp::is_semiring< Semiring >::value, + void + > * const = NULL + ) { + (void)phase; + + return internal::mxm_generic< false >( + C, A, B, + ring.getMultiplicativeOperator(), ring.getAdditiveMonoid(), ring.getMultiplicativeMonoid() + ); + + } + + /** + * Dense Matrix-Matrix multiply between structured matrices. + * Version with additive monoid and multiplicative operator + */ + template< + typename OutputType, typename InputType1, typename InputType2, + typename OutputStructure, typename OutputView, + typename OutputImfR, typename OutputImfC, + typename InputStructure1, typename InputView1, + typename InputImfR1, typename InputImfC1, + typename InputStructure2, typename InputView2, + typename InputImfR2, typename InputImfC2, + class Operator, class Monoid + > + RC mxm( + alp::Matrix< OutputType, OutputStructure, + Density::Dense, OutputView, OutputImfR, OutputImfC, omp > &C, + alp::Matrix< InputType1, InputStructure1, + Density::Dense, InputView1, InputImfR1, InputImfC1, omp > &A, + alp::Matrix< InputType2, InputStructure2, + Density::Dense, InputView2, InputImfR2, InputImfC2, omp > &B, + const Operator & mulOp, + const Monoid & addM, + const PHASE &phase = NUMERICAL, + const std::enable_if_t< + !alp::is_object< OutputType >::value && + !alp::is_object< InputType1 >::value && + !alp::is_object< InputType2 >::value && + alp::is_operator< Operator >::value && alp::is_monoid< Monoid >::value, + void + > * const = NULL + ) { + (void)phase; + + return internal::mxm_generic< false >( C, A, B, mulOp, addM, Monoid() ); + + } + +} // end namespace ``alp'' + +#endif // end ``_H_ALP_OMP_BLAS3'' + diff --git a/include/alp/omp/config.hpp b/include/alp/omp/config.hpp new file mode 100644 index 000000000..79200558f --- /dev/null +++ b/include/alp/omp/config.hpp @@ -0,0 +1,52 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +#ifndef _H_ALP_OMP_CONFIG +#define _H_ALP_OMP_CONFIG + +#include + +#include + + +namespace alp { + + namespace config { + + /** The default sequential backend to be selected for this parallel backend. */ + static constexpr alp::Backend default_sequential_backend = _ALP_SECONDARY_BACKEND; + + class OMP : public grb::config::OMP {}; + + // Dimensions of blocks counted in number of elements per dimension + constexpr size_t BLOCK_ROW_DIM = 2; + constexpr size_t BLOCK_COL_DIM = 2; + + // Temporary square solution to accommodate the 2.5D mxm + constexpr size_t THREAD_ROW_DIM = 4; + constexpr size_t THREAD_COL_DIM = 4; + + constexpr size_t REPLICATION_FACTOR_THREADS = 1; + + + } // namespace config + +} // namespace alp + +#endif + diff --git a/include/alp/omp/exec.hpp b/include/alp/omp/exec.hpp new file mode 100644 index 000000000..ff62c14f4 --- /dev/null +++ b/include/alp/omp/exec.hpp @@ -0,0 +1,113 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @author A. N. Yzelman + * @date 14th of January 2022 + */ + +#ifndef _H_ALP_OMP_EXEC +#define _H_ALP_OMP_EXEC + +#include + +#include + +#ifdef _ALP_OMP_WITH_REFERENCE + #include +#endif +namespace alp { + + /** + * \internal No implementation notes. + */ + template< EXEC_MODE mode > + class Launcher< mode, omp > { + + public: + + /** \internal No implementation notes. */ + Launcher( + const size_t process_id = 0, + const size_t nprocs = 1, + const std::string hostname = "localhost", + const std::string port = "0" + ) { + (void) process_id; + (void) nprocs; + (void) hostname; + (void) port; + } + + /** \internal No implementation notes. */ + template< typename U > + RC exec( + void ( *alp_program )( const void *, const size_t, U & ), + const void *data_in, const size_t in_size, + U &data_out, + const bool broadcast = false + ) const { + (void)broadcast; // value doesn't matter for a single user process + std::cerr << "Entering exec().\n"; + // initialise GraphBLAS + RC ret = init(); + // call the GraphBLAS algorithm + if( ret == SUCCESS ) { + ( *alp_program )( data_in, in_size, data_out ); + } + // finalise the GraphBLAS + if( ret == SUCCESS ) { + ret = finalize(); + } + // and done + return ret; + } + + /** \internal No implementation notes. */ + template< typename T, typename U > + RC exec( + void ( *alp_program )( const T &, U & ), + const T &data_in, U &data_out, + const bool broadcast = false + ) { + (void)broadcast; // value doesn't matter for a single user process + std::cerr << "Entering exec().\n"; + // initialise GraphBLAS + RC ret = init(); + // call the GraphBLAS algorithm + if( ret == SUCCESS ) { + ( *alp_program )( data_in, data_out ); + } + // finalise the GraphBLAS + if( ret == SUCCESS ) { + ret = finalize(); + } + // and done + return ret; + } + + /** \internal No implementation notes.
*/ + static inline RC finalize() { + return SUCCESS; + } + + }; + +} // end namespace ``alp'' + +#endif // end ``_H_ALP_OMP_EXEC'' + diff --git a/include/alp/omp/init.hpp b/include/alp/omp/init.hpp new file mode 100644 index 000000000..4d63409f9 --- /dev/null +++ b/include/alp/omp/init.hpp @@ -0,0 +1,41 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @author A. N. Yzelman + * @date 14th of January 2022 + */ + +#ifndef _H_ALP_OMP_INIT +#define _H_ALP_OMP_INIT + +#include + +namespace alp { + + /** \internal No-op init */ + template<> + RC init< omp >( const size_t, const size_t, void * const ); + + /** \internal No-op finalize */ + template<> + RC finalize< omp >(); + +} // end namespace ``alp'' + +#endif // end ``_H_ALP_OMP_INIT'' + diff --git a/include/alp/omp/io.hpp b/include/alp/omp/io.hpp new file mode 100644 index 000000000..9fc66e815 --- /dev/null +++ b/include/alp/omp/io.hpp @@ -0,0 +1,189 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @author A. N. Yzelman + * @date 14th of January 2022 + */ + +#ifndef _H_ALP_OMP_IO +#define _H_ALP_OMP_IO + +#include + +#include + +#include "matrix.hpp" +#include "scalar.hpp" +#include "storage.hpp" + +// Include backend to which sequential work is delegated +#ifdef _ALP_OMP_WITH_REFERENCE + #include +#endif + +#define NO_CAST_ASSERT( x, y, z ) \ + static_assert( x, \ + "\n\n" \ + "********************************************************************" \ + "********************************************************************" \ + "******************************\n" \ + "* ERROR | " y " " z ".\n" \ + "********************************************************************" \ + "********************************************************************" \ + "******************************\n" \ + "* Possible fix 1 | Remove no_casting from the template parameters " \ + "in this call to " y ".\n" \ + "* Possible fix 2 | Provide a value that matches the expected type.\n" \ + "********************************************************************" \ + "********************************************************************" \ + "******************************\n" ); + +namespace alp { + + /** + * Sets all elements of the given matrix to the value of the given scalar.
+ * C = val + * + * @tparam descr + * @tparam OutputType Data type of the output matrix C + * @tparam OutputStructure Structure of the matrix C + * @tparam OutputView View type applied to the matrix C + * @tparam InputType Data type of the scalar a + * + * @param C Matrix whose values are to be set + * @param val The value to set the elements of the matrix C + * + * @return RC SUCCESS on the successful execution of the set + */ + template< Descriptor descr = descriptors::no_operation, + typename OutputType, typename OutputStructure, typename OutputView, typename OutputImfR, typename OutputImfC, + typename InputType, typename InputStructure + > + RC set( + Matrix< OutputType, OutputStructure, Density::Dense, OutputView, OutputImfR, OutputImfC, omp > &C, + const Scalar< InputType, InputStructure, omp > &val + ) noexcept { + + static_assert( + !std::is_same< OutputType, void >::value, + "alp::set (set to matrix): cannot have a pattern matrix as output" + ); +#ifdef _DEBUG + std::cout << "Called alp::set (matrix-to-value, omp)" << std::endl; +#endif + // static checks + NO_CAST_ASSERT( + ( !( descr & descriptors::no_casting ) || std::is_same< InputType, OutputType >::value ), + "alp::set", "called with non-matching value types" + ); + + static_assert( + !internal::is_functor_based< + Matrix< OutputType, OutputStructure, Density::Dense, OutputView, OutputImfR, OutputImfC, omp > + >::value, + "alp::set cannot be called with a functor-based matrix as a destination." + ); + + if( !internal::getInitialized( val ) ) { + internal::setInitialized( C, false ); + return SUCCESS; + } + + const Distribution_2_5D &d = internal::getAmf( C ).getDistribution(); + + RC rc = SUCCESS; + + #pragma omp parallel for + for( size_t thread = 0; thread < config::OMP::current_threads(); ++thread ) { + const auto t_coords = d.getThreadCoords( thread ); + const auto block_grid_dims = d.getLocalBlockGridDims( t_coords ); + + RC local_rc = SUCCESS; + + for( size_t br = 0; br < block_grid_dims.first; ++br ) { + for( size_t bc = 0; bc < block_grid_dims.second; ++bc ) { + + // Get a sequential matrix view over the block + auto refC = internal::get_view( C, t_coords, br, bc ); + + // Construct a sequential Scalar container from the input Scalar + Scalar< InputType, InputStructure, config::default_sequential_backend > ref_val( *val ); + + // Delegate the call to the sequential set implementation + local_rc = local_rc ? local_rc : set( refC, ref_val ); + + if( local_rc != SUCCESS ) { + rc = local_rc; + } + } + } + } + + internal::setInitialized( C, true ); + return rc; + } + + /** + * Temporarily assuming 1-1, row-major mapping with user container. + */ + template< + typename InputType, typename Structure, typename View, + typename ImfR, typename ImfC, typename fwd_iterator + > + RC buildMatrix( + Matrix< InputType, Structure, Density::Dense, View, ImfR, ImfC, omp > &A, + const fwd_iterator &start, + const fwd_iterator &end + ) noexcept { + + static_assert( + std::is_same< InputType, typename fwd_iterator::value_type >::value, + "alp::buildMatrix (omp): Mismatching type between user-provided input " + "container and output ALP container." + ); + + const size_t m = nrows( A ); + const size_t n = ncols( A ); + + if( ( end - start ) != static_cast< std::ptrdiff_t >( m * n ) ) { + + std::cerr << "alp::buildMatrix (omp): Mismatching sizes between " + "user-provided input container and output ALP container." 
<< std::endl; + + return MISMATCH; + } + + internal::setInitialized(A, true); + + fwd_iterator it = start; + + for( size_t i = 0; i < m; ++i ) { + for( size_t j = 0; j < n; ++j ) { + internal::access( A, internal::getStorageIndex( A, i, j ) ) = *( it++ ); + } + } + + return SUCCESS; + } + +} // end namespace ``alp'' + +#undef NO_CAST_ASSERT + +#endif // end ``_H_ALP_OMP_IO'' + diff --git a/include/alp/omp/matrix.hpp b/include/alp/omp/matrix.hpp new file mode 100644 index 000000000..94e600979 --- /dev/null +++ b/include/alp/omp/matrix.hpp @@ -0,0 +1,111 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @author A. N. Yzelman + * @date 14th of January 2022 + */ + +#ifndef _H_ALP_OMP_MATRIX +#define _H_ALP_OMP_MATRIX + +#include + +#include +#include + +#include +#include + +#include "storage.hpp" +#include "storagebasedmatrix.hpp" +#include "vector.hpp" + +#ifdef _ALP_OMP_WITH_REFERENCE + #include // For AMFFactory +#endif + + +namespace alp { + + // Currently no backend specific implementation + + namespace internal { + + /** + * Exposes a block of the parallel matrix as a sequential ALP matrix. + * + * The underlying container (buffer/block) is obtained from the parallel + * container, while the AMF is constructed based on the properties of the + * block and the applied gather view (i.e., the IMFs associated to it). 
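+ *
+ * \par Example
+ * An illustrative sketch of the calling pattern used by the set and mxm
+ * implementations in this backend:
+ * \code
+ * const auto t_coords = d.getThreadCoords( thread );
+ * auto refC = internal::get_view( C, t_coords, br, bc );
+ * // refC is a sequential ALP matrix view; delegate to sequential primitives
+ * local_rc = set( refC, ref_val );
+ * \endcode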
+ * + */ + template< + enum view::Views target_view = view::original, + typename SourceMatrix, + typename ThreadCoords, + std::enable_if_t< + alp::is_matrix< SourceMatrix >::value + > * = nullptr + > + typename new_container_type_from< + typename SourceMatrix::template view_type< view::gather >::type + >::template change_backend< config::default_sequential_backend >::type + get_view( SourceMatrix &source, const ThreadCoords t, const size_t br, const size_t bc ) { + + // get the container + const auto &distribution = getAmf( source ).getDistribution(); + const size_t thread_id = distribution.getThreadId( t ); + const size_t block_id = distribution.getLocalBlockId( t, br, bc ); + auto &container = getLocalContainer( getContainer( source ), thread_id, block_id ); + + // make an AMF + // note: When making a view over a vector, the second imf must be imf::Zero + + // Type of AMF factory corresponding to the full local block's AMF + using original_amf_factory = alp::storage::AMFFactory< config::default_sequential_backend >::FromPolynomial< + structures::General, imf::Id, imf::Id + >; + + // AMF factory after applying the global view + using amf_factory = alp::storage::AMFFactory< config::default_sequential_backend >::Compose< + imf::Strided, imf::Strided, typename original_amf_factory::amf_type + >; + + const auto block_dims = distribution.getBlockDimensions(); + + typename amf_factory::amf_type amf = amf_factory::Create( + imf::Id( block_dims.first ), + imf::Id( block_dims.second ), + original_amf_factory::Create( imf::Id( block_dims.first ), imf::Id( block_dims.second ) ) + ); + + // create a sequential container with the container and AMF above + using target_t = typename new_container_type_from< + typename SourceMatrix::template view_type< view::gather >::type + >::template change_backend< config::default_sequential_backend >::type; + + target_t blk_matrix( container, amf ); + internal::setInitialized( blk_matrix, internal::getInitialized( source ) ); + return blk_matrix; + } + + } // namespace internal + +} // namespace alp + +#endif // end ``_H_ALP_OMP_MATRIX'' diff --git a/include/alp/omp/scalar.hpp b/include/alp/omp/scalar.hpp new file mode 100644 index 000000000..881e43139 --- /dev/null +++ b/include/alp/omp/scalar.hpp @@ -0,0 +1,204 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _H_ALP_OMP_SCALAR +#define _H_ALP_OMP_SCALAR + + +#include +#include + +#include + +#include +#include +#include +#include +#include + +#include + + +namespace alp { + + namespace internal { + template< typename T, typename Structure > + bool getInitialized( const Scalar< T, Structure, omp > & ) noexcept; + + template< typename T, typename Structure > + void setInitialized( Scalar< T, Structure, omp > &, bool ) noexcept; + } // end namespace ``alp::internal'' + + /** + * An ALP scalar. + * + * This is an opaque data type for scalars. + * + * @tparam T The type of the scalar element. \a T shall not + * be an ALP type.
+	 * @tparam Structure One of the structures.
+	 *
+	 * \warning Creating an alp::Scalar of other ALP types is
+	 *          not allowed.
+	 *          Passing an ALP type as template parameter will lead to
+	 *          undefined behaviour.
+	 *
+	 */
+	template< typename T, typename Structure >
+	class Scalar< T, Structure, omp > {
+
+		private:
+
+			typedef Scalar< T, Structure, omp > self_type;
+
+			friend bool internal::getInitialized<>( const self_type & ) noexcept;
+
+			friend void internal::setInitialized<>( self_type &, bool ) noexcept;
+
+			// Scalar value
+			T value;
+
+			/** Whether the scalar value is currently initialized */
+			bool initialized;
+
+		public:
+			/** @see Vector::value_type. */
+			typedef T value_type;
+
+			/** @see Vector::lambda_reference */
+			typedef T& lambda_reference;
+			typedef const T& const_lambda_reference;
+
+			/**
+			 * The main ALP scalar constructor.
+			 *
+			 * The constructed object will be uninitialised after successful construction.
+			 *
+			 * @return SUCCESS This function never fails.
+			 *
+			 * \parblock
+			 * \par Performance semantics.
+			 *        -# This constructor entails \f$ \Theta(1) \f$ amount of work.
+			 *        -# This constructor may allocate \f$ \Theta(1) \f$ bytes
+			 *           of dynamic memory.
+			 *        -# This constructor will use \f$ \Theta(1) \f$ extra bytes of
+			 *           memory beyond that at constructor entry.
+			 *        -# This constructor incurs \f$ \Theta(1) \f$ data movement.
+			 *        -# This constructor \em may make system calls.
+			 * \endparblock
+			 *
+			 */
+			Scalar() : initialized( false ) {}
+
+			/**
+			 * The ALP scalar constructor for converting a C/C++ scalar to an ALP scalar.
+			 *
+			 * The constructed object will be initialized after successful construction.
+			 *
+			 * \parblock
+			 * \par Performance semantics.
+			 *        -# This constructor entails \f$ \Theta(1) \f$ amount of work.
+			 *        -# This constructor may allocate \f$ \Theta(1) \f$ bytes
+			 *           of dynamic memory.
+			 *        -# This constructor will use \f$ \Theta(1) \f$ extra bytes of
+			 *           memory beyond that at constructor entry.
+			 *        -# This constructor incurs \f$ \Theta(1) \f$ data movement.
+			 *        -# This constructor \em may make system calls.
+			 * \endparblock
+			 *
+			 */
+			explicit Scalar( const T &value ) : value( value ), initialized( true ) {}
+
+			/**
+			 * Copy constructor.
+			 *
+			 * @param other The scalar to copy. The initialization state of the copy
+			 *              reflects the state of \a other.
+			 *
+			 * \parblock
+			 * \par Performance semantics.
+			 *        -# This constructor entails \f$ \Theta(1) \f$ amount of work.
+			 *        -# This constructor allocates \f$ \Theta(1) \f$ bytes
+			 *           of dynamic memory.
+			 *        -# This constructor incurs \f$ \Theta(1) \f$ of data
+			 *           movement.
+			 *        -# This constructor \em may make system calls.
+			 * \endparblock
+			 *
+			 */
+			Scalar( const Scalar &other ) : value( other.value ), initialized( other.initialized ) {
+				// const RC rc = set( *this, other ); // note: initialized will be set as part of this call
+				// if( rc != SUCCESS ) {
+				// 	throw std::runtime_error( "alp::Scalar< T, Structure, Density::Dense, View::Original< void >, omp > (copy constructor): error during call to alp::set (" + toString( rc ) + ")" );
+				// }
+			}
+
+			/**
+			 * Move constructor. The new scalar equals the given
+			 * scalar. Invalidates the use of the input scalar.
+			 *
+			 * @param[in] other The ALP scalar to move to this new instance.
+			 *
+			 * \parblock
+			 * \par Performance semantics.
+			 *        -# This constructor entails \f$ \Theta(1) \f$ amount of work.
+			 *        -# This constructor will not allocate any new dynamic memory.
+			 *        -# This constructor will use \f$ \Theta(1) \f$ extra bytes of
+			 *           memory beyond that at constructor entry.
+			 *        -# This constructor will move \f$ \Theta(1) \f$ bytes of data.
+			 * \endparblock
+			 */
+			Scalar( Scalar &&other ) : value( other.value ), initialized( other.initialized ) {
+				other.initialized = false;
+			}
+
+			/** \internal No implementation notes. */
+			lambda_reference operator*() noexcept {
+				assert( internal::getInitialized( *this ) );
+				return value;
+			}
+
+			/** \internal No implementation notes. */
+			const_lambda_reference operator*() const noexcept {
+				assert( internal::getInitialized( *this ) );
+				return value;
+			}
+
+	}; // class Scalar with physical container
+
+	/** Identifies any omp scalar as an ALP scalar. */
+	template< typename T, typename Structure >
+	struct is_scalar< Scalar< T, Structure, omp > > : std::true_type {};
+
+	namespace internal {
+		template< typename T, typename Structure >
+		bool getInitialized( const Scalar< T, Structure, omp > &s ) noexcept {
+			return s.initialized;
+		}
+
+		template< typename T, typename Structure >
+		void setInitialized( Scalar< T, Structure, omp > &s, bool initialized ) noexcept {
+			s.initialized = initialized;
+		}
+	} // end namespace ``alp::internal''
+
+} // end namespace ``alp''
+
+#endif // end ``_H_ALP_OMP_SCALAR''
+
diff --git a/include/alp/omp/storage.hpp b/include/alp/omp/storage.hpp
new file mode 100644
index 000000000..6e3e23cf6
--- /dev/null
+++ b/include/alp/omp/storage.hpp
@@ -0,0 +1,772 @@
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ *
+ * @file
+ *
+ * This file registers mechanisms for coordinate mapping between
+ * logical and physical iteration spaces.
+ *
+ */
+
+#ifndef _H_ALP_OMP_STORAGE
+#define _H_ALP_OMP_STORAGE
+
+#include
+
+#include
+
+namespace alp {
+
+	namespace internal {
+
+		/** Specialization for matrices */
+		template< typename Structure >
+		struct determine_poly_factory< Structure, imf::Id, imf::Id, omp > {
+
+			typedef storage::polynomials::FullFactory<> factory_type;
+		};
+
+		/** Specialization for vectors */
+		template< typename Structure >
+		struct determine_poly_factory< Structure, imf::Id, imf::Zero, omp > {
+
+			typedef storage::polynomials::ArrayFactory factory_type;
+		};
+
+	} // namespace internal
+
+
+	/**
+	 * Implements the mapping between global and local iteration spaces
+	 * for the shared-memory parallel backend.
+	 * The logical coordinates are represented as a pair (i, j) of row and
+	 * column positions within the matrix.
+	 * The local coordinates are represented as (tr, tc, rt, br, bc, i, j),
+	 * where:
+	 *  - tr is the thread row-coordinate
+	 *  - tc is the thread column-coordinate
+	 *  - rt is the replication factor for thread coordinates
+	 *  - br is the block row-coordinate
+	 *  - bc is the block column-coordinate
+	 *  - i is the element's row-coordinate within its block
+	 *  - j is the element's column-coordinate within its block
+	 *
+	 * This implementation assumes a block-cyclic distribution of blocks
+	 * among threads.
+	 *
+	 */
+	class Distribution_2_5D {
+
+		public:
+
+			/** Type encapsulating thread coordinates within the thread grid.
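+			 *
+			 * A sketch of the intended id-to-coordinates round trip (values
+			 * assume an illustrative configuration with
+			 * config::THREAD_ROW_DIM == 2, config::THREAD_COL_DIM == 2, and
+			 * config::REPLICATION_FACTOR_THREADS == 2, i.e., eight threads):
+			 *
+			 * \code
+			 * Distribution_2_5D d( 64, 64, 8 );
+			 * const auto t = d.getThreadCoords( 5 ); // tr = 0, tc = 1, rt = 1
+			 * assert( d.getThreadId( t ) == 5 );
+			 * \endcode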
*/ + struct ThreadCoords { + const size_t tr; + const size_t tc; + const size_t rt; + + ThreadCoords( const size_t tr, const size_t tc, const size_t rt ) : tr( tr ), tc( tc ), rt( rt ) {} + }; + + /** Type encapsulating the global element coordinate. */ + struct GlobalCoord { + + const size_t i; + const size_t j; + + GlobalCoord( const size_t i, const size_t j ) : i( i ), j( j ) {} + + }; + + /** Type encapsulating the local element coordinate. */ + struct LocalCoord { + + const ThreadCoords t; + const size_t br; + const size_t bc; + const size_t i; + const size_t j; + + LocalCoord( + const size_t tr, const size_t tc, + const size_t rt, + const size_t br, const size_t bc, + const size_t i, const size_t j + ) : + t( tr, tc, rt ), + br( br ), bc( bc ), + i( i ), j( j ) {} + + const ThreadCoords &getThreadCoords() const { + return t; + } + + }; + + /** Type encapsulating the global block coordinate. */ + struct GlobalBlockCoord { + + const size_t br; + const size_t bc; + + GlobalBlockCoord( + const size_t br, const size_t bc + ) : + br( br ), bc( bc ) {} + + }; + + // /** Type encapsulating the local block coordinate. */ + // struct LocalBlockCoord { + + // const size_t tr; + // const size_t tc; + // const size_t rt; + // const size_t br; + // const size_t bc; + + // LocalBlockCoord( + // const size_t tr, const size_t tc, + // const size_t rt, + // const size_t br, const size_t bc + // ) : + // tr( tr ), tc( tc ), + // rt( rt ), + // br( br ), bc( bc ) {} + + // }; + + private: + + /** Row and column dimensions of the associated container */ + const size_t m; + const size_t n; + /** Replication factor in thread-coordinate space */ + static constexpr size_t Rt = config::REPLICATION_FACTOR_THREADS; + /** The row and column dimensions of the global block grid */ + const size_t Br; + const size_t Bc; + /** The row and column dimensions of the thread grid */ + const size_t Tr; + const size_t Tc; + + public: + + Distribution_2_5D( + const size_t m, const size_t n, + const size_t num_threads + ) : + m( m ), n( n ), + Br( static_cast< size_t >( std::ceil( static_cast< double >( m ) / config::BLOCK_ROW_DIM ) ) ), + Bc( static_cast< size_t >( std::ceil( static_cast< double >( n ) / config::BLOCK_COL_DIM ) ) ), + Tr( ( Br > config::THREAD_ROW_DIM ) ? config::THREAD_ROW_DIM : Br ), + Tc( ( Bc > config::THREAD_COL_DIM ) ? 
config::THREAD_COL_DIM : Bc )
+			{
+				if( num_threads != config::THREAD_ROW_DIM * config::THREAD_COL_DIM * config::REPLICATION_FACTOR_THREADS ) {
+					std::cerr << "Warning: Provided number of threads cannot be factorized into a 2.5D grid:\n"
+						"\t" << num_threads << " != " << config::THREAD_ROW_DIM << " x " << config::THREAD_COL_DIM << " x " << config::REPLICATION_FACTOR_THREADS << std::endl;
+				}
+			}
+
+			// LocalBlockCoord mapBlockGlobalToLocal( const GlobalBlockCoord &g ) const {
+			// 	(void) g;
+			// 	return LocalBlockCoord( 0, 0, 0, 0, 0 );
+			// }
+
+			// GlobalBlockCoord mapBlockLocalToGlobal( const LocalBlockCoord &l ) const {
+			// 	const size_t block_id_r = l.br * Tr + l.tr;
+			// 	const size_t block_id_c = l.bc * Tc + l.tc;
+			// 	return GlobalBlockCoord( block_id_r, block_id_c ); // Temporary
+			// }
+
+			LocalCoord mapGlobalToLocal( const GlobalCoord &g ) const {
+				const size_t global_br = g.i / config::BLOCK_ROW_DIM;
+				const size_t local_br = global_br / Tr;
+				const size_t tr = global_br % Tr;
+				const size_t local_i = g.i % config::BLOCK_ROW_DIM;
+
+				const size_t global_bc = g.j / config::BLOCK_COL_DIM;
+				const size_t local_bc = global_bc / Tc;
+				const size_t tc = global_bc % Tc;
+				const size_t local_j = g.j % config::BLOCK_COL_DIM;
+
+				return LocalCoord(
+					tr, tc,
+					0, // Rt always maps to the front layer
+					local_br, local_bc,
+					local_i, local_j
+				);
+			}
+
+			// /**
+			//  * Maps coordinates from local to global space.
+			//  *
+			//  * \todo Add implementation
+			//  */
+			// GlobalCoord mapLocalToGlobal( const LocalCoord &l ) const {
+			// 	(void) l;
+			// 	return GlobalCoord( 0, 0 );
+			// }
+
+			/**
+			 * Returns the thread ID corresponding to the given thread coordinates.
+			 * The fixed thread grid also makes it possible to map left-over threads.
+			 */
+			size_t getThreadId( const ThreadCoords t ) const {
+				return t.rt * config::THREAD_ROW_DIM * config::THREAD_COL_DIM + t.tr * config::THREAD_COL_DIM + t.tc;
+			}
+
+			size_t getNumberOfThreads() const {
+				return config::THREAD_ROW_DIM * config::THREAD_COL_DIM * config::REPLICATION_FACTOR_THREADS;
+			}
+
+			/** Returns the thread grid size */
+			ThreadCoords getThreadGridDims() const {
+				return { Tr, Tc, Rt };
+			}
+
+			/** Returns the dimensions of the global block grid */
+			std::pair< size_t, size_t > getGlobalBlockGridDims() const {
+				return { Br, Bc };
+			}
+
+			/** Returns the dimensions of the block grid associated to the given thread */
+			std::pair< size_t, size_t > getLocalBlockGridDims( const ThreadCoords t ) const {
+				// the second summand accounts for the case where the blocks
+				// do not divide evenly among the threads
+				const size_t blocks_r = Br / Tr + ( t.tr < Br % Tr ? 1 : 0 );
+				const size_t blocks_c = Bc / Tc + ( t.tc < Bc % Tc ? 1 : 0 );
+				return { blocks_r, blocks_c };
+			}
+
+			// /** Returns the global block coordinates based on the thread and local block coordinates */
+			// std::pair< size_t, size_t > getGlobalBlockCoords( const size_t tr, const size_t tc, const size_t br, const size_t bc ) const {
+			// 	const size_t global_br = br * Tr + tr % Tr;
+			// 	const size_t global_bc = bc * Tc + tc % Tc;
+			// 	return { global_br, global_bc };
+			// }
+
+			// size_t getGlobalBlockId( const size_t tr, const size_t tc, const size_t br, const size_t bc ) const {
+			// 	const auto global_coords = getGlobalBlockCoords( tr, tc, br, bc );
+			// 	return global_coords.first * Bc + global_coords.second;
+			// }
+
+			size_t getLocalBlockId( const LocalCoord &local ) const {
+				return local.br * getLocalBlockGridDims( local.getThreadCoords() ).second + local.bc;
+			}
+
+			size_t getLocalBlockId( const ThreadCoords &t, const size_t br, const size_t bc ) const {
+				return br * getLocalBlockGridDims( t ).second + bc;
+			}
+
+			///**
+			// * Returns the dimensions of the block given by the thread and
+			// * local block coordinates
+			// */
+			//std::pair< size_t, size_t > getBlockDimensions( const size_t tr, const size_t tc, const size_t br, const size_t bc ) const {
+			//	const auto global_block_coords = getGlobalBlockCoords( tr, tc, br, bc );
+			//	const size_t block_height = ( global_block_coords.first < Br - 1 ) ?
+			//		( config::BLOCK_ROW_DIM ) :
+			//		( m - config::BLOCK_ROW_DIM * ( Br - 1 ) );
+			//	const size_t block_width = ( global_block_coords.second < Bc - 1 ) ?
+			//		( config::BLOCK_COL_DIM ) :
+			//		( n - config::BLOCK_COL_DIM * ( Bc - 1 ) );
+			//	return { block_height, block_width };
+			//}
+
+			/** Returns the dimensions of a block; all blocks share the same dimensions */
+			constexpr std::pair< size_t, size_t > getBlockDimensions() const {
+				return { config::BLOCK_ROW_DIM, config::BLOCK_COL_DIM };
+			}
+
+			/** Returns the size (in number of elements) of a block */
+			size_t getBlockSize() const {
+				const auto dims = getBlockDimensions();
+				return dims.first * dims.second;
+			}
+
+			/** For a given block, returns its offset from the beginning of the buffer in which it is stored */
+			size_t getBlocksOffset( const ThreadCoords &t, const size_t br, const size_t bc ) const {
+				// The offset is calculated as the sum of sizes of all previous blocks
+				const size_t block_coord_1D = br * getLocalBlockGridDims( t ).second + bc;
+				return block_coord_1D * getBlockSize();
+			}
+
+			ThreadCoords getThreadCoords( const size_t thread_id ) const {
+				const size_t rt = thread_id / ( config::THREAD_ROW_DIM * config::THREAD_COL_DIM );
+				const size_t tr = ( thread_id % ( config::THREAD_ROW_DIM * config::THREAD_COL_DIM ) ) / config::THREAD_COL_DIM;
+				const size_t tc = ( thread_id % ( config::THREAD_ROW_DIM * config::THREAD_COL_DIM ) ) % config::THREAD_COL_DIM;
+				return { tr, tc, rt };
+			}
+
+			bool isActiveThread( const ThreadCoords &t ) const {
+				return ( t.tr < Tr ) && ( t.tc < Tc ) && ( t.rt < Rt );
+			}
+
+			bool isActiveThread( const size_t thread_id ) const {
+				const auto th_coords = getThreadCoords( thread_id );
+				return isActiveThread( th_coords );
+			}
+
+	};
+
+	namespace storage {
+
+		/**
+		 * AMF for the shared-memory parallel backend.
+		 *
+		 * This implementation makes the following assumptions:
+		 *  - all blocks use the same storage scheme, independent of their
+		 *    non-zero structure
+		 *
+		 * @tparam ImfR Index-Mapping Function associated to the row dimension.
+		 * @tparam ImfC Index-Mapping Function associated to the column dimension.
+		 * @tparam PolyFactory The type of factory for storage polynomials
+		 *                     used to construct polynomials for all blocks.
+		 */
+		template< typename ImfR, typename ImfC, typename PolyFactory >
+		class AMF< ImfR, ImfC, PolyFactory, omp > {
+
+			friend class AMFFactory< omp >;
+
+			public:
+
+				/** Expose static properties */
+				typedef ImfR imf_r_type;
+				typedef ImfC imf_c_type;
+				typedef PolyFactory poly_factory_type;
+				typedef typename PolyFactory::poly_type mapping_polynomial_type;
+
+				/** Expose types defined within the class */
+				typedef struct StorageIndexType {
+
+					size_t buffer_id;
+					size_t block_id;
+					size_t offset;
+
+					StorageIndexType( const size_t buffer_id, const size_t block_id, const size_t offset ) :
+						buffer_id( buffer_id ), block_id( block_id ), offset( offset ) {}
+
+				} storage_index_type;
+
+			private:
+
+				const imf_r_type imf_r;
+				const imf_c_type imf_c;
+
+				constexpr static bool is_matrix = std::is_same< ImfC, imf::Id >::value;
+
+				/**
+				 * Number of threads used to initialize the associated container.
+				 * This impacts the number of allocated blocks.
+				 */
+				const size_t num_threads;
+
+				const Distribution_2_5D distribution;
+
+				AMF(
+					ImfR imf_r,
+					ImfC imf_c,
+					size_t num_threads = config::OMP::threads()
+				) :
+					imf_r( imf_r ), imf_c( imf_c ),
+					num_threads( num_threads ),
+					distribution( imf_r.n, imf_c.n, num_threads ) {
+#ifdef DEBUG
+					std::cout << "Entering AMF normal constructor\n";
+#endif
+				}
+
+				AMF( const AMF & ) = delete;
+				AMF &operator=( const AMF & ) = delete;
+
+			public:
+
+				AMF( AMF &&amf ) :
+					imf_r( std::move( amf.imf_r ) ),
+					imf_c( std::move( amf.imf_c ) ),
+					num_threads( amf.num_threads ),
+					distribution( std::move( amf.distribution ) ) {
+#ifdef DEBUG
+					std::cout << "Entering OMP AMF move constructor\n";
+#endif
+				}
+
+				const Distribution_2_5D &getDistribution() const {
+					return distribution;
+				}
+
+				/**
+				 * Returns the dimensions of the logical layout of the associated container.
+				 *
+				 * @return A pair of two values, number of rows and columns, respectively.
+				 */
+				std::pair< size_t, size_t > getLogicalDimensions() const {
+					return std::make_pair( imf_r.n, imf_c.n );
+				}
+
+				/**
+				 * @brief Returns a storage index based on the coordinates in the
+				 *        logical iteration space.
+				 *
+				 * @param[in] i row-coordinate
+				 * @param[in] j column-coordinate
+				 * @param[in] s current process ID
+				 * @param[in] P total number of processes
+				 *
+				 * @return storage index corresponding to the provided logical
+				 *         coordinates and parameters \a s and \a P.
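+				 *
+				 * \par Example
+				 * A worked illustration, assuming 2 x 2 blocks on a 2 x 2
+				 * thread grid: element (0, 3) falls into global block (0, 1),
+				 * which the block-cyclic distribution assigns to thread
+				 * column 1. The resulting storage index then consists of that
+				 * thread's buffer id, local block id 0, and in-block offset
+				 * 0 * BLOCK_COL_DIM + 1 = 1.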
+				 *
+				 */
+				storage_index_type getStorageIndex( const size_t i, const size_t j, const size_t s, const size_t P ) const {
+					(void) s;
+					(void) P;
+					const typename Distribution_2_5D::GlobalCoord global( imf_r.map( i ), imf_c.map( j ) );
+					const typename Distribution_2_5D::LocalCoord local = distribution.mapGlobalToLocal( global );
+
+					const size_t thread = distribution.getThreadId( local.getThreadCoords() );
+					const size_t local_block = distribution.getLocalBlockId( local );
+					// row-major offset within a block of BLOCK_ROW_DIM x BLOCK_COL_DIM elements
+					const size_t local_element = local.i * config::BLOCK_COL_DIM + local.j;
+
+					return storage_index_type( thread, local_block, local_element );
+				}
+
+				/**
+				 * Returns coordinates in the logical iteration space based on
+				 * the storage index.
+				 *
+				 * @param[in] storageIndex storage index in the physical
+				 *                         iteration space
+				 * @param[in] s            current process ID
+				 * @param[in] P            total number of processes
+				 *
+				 * @return a pair of row- and column-coordinates in the
+				 *         logical iteration space.
+				 */
+				std::pair< size_t, size_t > getCoords( const size_t storageIndex, const size_t s, const size_t P ) const;
+
+		}; // class AMF
+
+		/**
+		 * Collects AMF factory classes.
+		 */
+		template<>
+		struct AMFFactory< omp > {
+
+			/**
+			 * Specialization of AMFFactory for the shared-memory parallel backend.
+			 *
+			 * @tparam ViewImfR  The type of the IMF applied to the row coordinate.
+			 * @tparam ViewImfC  The type of the IMF applied to the column coordinate.
+			 * @tparam SourceAMF The type of the source AMF.
+			 *
+			 */
+			template< typename ViewImfR, typename ViewImfC, typename SourceAMF >
+			struct Compose {
+
+				private:
+
+					/** Extract source IMF and polynomial types */
+					typedef typename SourceAMF::imf_r_type SourceImfR;
+					typedef typename SourceAMF::imf_c_type SourceImfC;
+					typedef typename SourceAMF::mapping_polynomial_type SourcePoly;
+
+					/** Compose row and column IMFs */
+					typedef typename imf::ComposedFactory< SourceImfR, ViewImfR >::type composed_imf_r_type;
+					typedef typename imf::ComposedFactory< SourceImfC, ViewImfC >::type composed_imf_c_type;
+
+					/** Fuse composed row IMF into the target polynomial */
+					typedef typename polynomials::fuse_on_i<
+						composed_imf_r_type,
+						SourcePoly
+					> fused_row;
+
+					/** Fuse composed column IMF into the intermediate polynomial obtained above */
+					typedef typename polynomials::fuse_on_j<
+						composed_imf_c_type,
+						typename fused_row::resulting_polynomial_type
+					> fused_row_col;
+
+					typedef typename fused_row::resulting_imf_type final_imf_r_type;
+					typedef typename fused_row_col::resulting_imf_type final_imf_c_type;
+					typedef typename fused_row_col::resulting_polynomial_type final_polynomial_type;
+
+				public:
+
+					typedef AMF< final_imf_r_type, final_imf_c_type, final_polynomial_type, omp > amf_type;
+
+					static amf_type Create( ViewImfR imf_r, ViewImfC imf_c, const AMF< SourceImfR, SourceImfC, SourcePoly, omp > &amf ) {
+						composed_imf_r_type composed_imf_r = imf::ComposedFactory< SourceImfR, ViewImfR >::create( amf.imf_r, imf_r );
+						composed_imf_c_type composed_imf_c = imf::ComposedFactory< SourceImfC, ViewImfC >::create( amf.imf_c, imf_c );
+						return amf_type(
+							fused_row::CreateImf( composed_imf_r ),
+							fused_row_col::CreateImf( composed_imf_c ),
+							fused_row_col::CreatePolynomial(
+								composed_imf_c,
+								fused_row::CreatePolynomial( composed_imf_r, amf.map_poly )
+							),
+							amf.storage_dimensions
+						);
+					}
+
+					Compose() = delete;
+
+			}; // class Compose
+
+			/**
+			 * @brief Describes an AMF for a container that requires allocation
+			 *        and exposes the AMF's type and a factory method to create it.
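+			 *
+			 * A construction sketch (illustrative; assumes an m x n general
+			 * matrix, i.e., Id IMFs in both dimensions):
+			 *
+			 * \code
+			 * using factory = AMFFactory< omp >::FromPolynomial<
+			 * 	structures::General, imf::Id, imf::Id
+			 * >;
+			 * auto amf = factory::Create( imf::Id( m ), imf::Id( n ) );
+			 * \endcode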
+			 *
+			 * A container that requires allocation is accompanied by Id IMFs for
+			 * both row and column dimensions and the provided mapping polynomial.
+			 *
+			 * @tparam Structure The structure of the container.
+			 * @tparam ImfR      The row IMF type of the original container.
+			 * @tparam ImfC      The column IMF type of the original container.
+			 *
+			 */
+			template< typename Structure, typename ImfR, typename ImfC >
+			struct FromPolynomial {
+
+				// Ensure compatibility of IMF types.
+				// Original Matrix has imf::Id as both IMFs.
+				// Original Vector has ImfR = imf::Id and ImfC = imf::Zero.
+				static_assert(
+					std::is_same< ImfR, imf::Id >::value &&
+					( std::is_same< ImfC, imf::Id >::value || std::is_same< ImfC, imf::Zero >::value ),
+					"AMF factory FromPolynomial can only be used for an original container."
+				);
+
+				typedef typename internal::determine_poly_factory< Structure, ImfR, ImfC, omp >::factory_type PolyFactory;
+
+				typedef AMF< imf::Id, imf::Id, PolyFactory, omp > amf_type;
+
+				/**
+				 * Factory method used by 2D containers.
+				 *
+				 * @param[in] imf_r Row IMF
+				 * @param[in] imf_c Column IMF
+				 *
+				 * @return An AMF object of the type \a amf_type
+				 *
+				 */
+				static amf_type Create( imf::Id imf_r, imf::Id imf_c ) {
+					return amf_type( imf_r, imf_c );
+				}
+
+				/**
+				 * Factory method used by 1D containers.
+				 *
+				 * Exploits the fact that fusion of strided IMFs into the polynomial
+				 * always succeeds and results in Id IMFs. As a result, the
+				 * constructed AMF is of the type \a amf_type.
+				 *
+				 * @param[in] imf_r Row IMF
+				 * @param[in] imf_c Column IMF
+				 *
+				 * @return An AMF object of the type \a amf_type
+				 *
+				 * \note \internal To exploit the existing mechanism for IMF fusion
+				 *                 into the polynomial, this method creates a
+				 *                 dummy AMF out of two Id IMFs and the provided
+				 *                 polynomial and composes the provided Strided
+				 *                 IMFs with the dummy AMF.
+				 */
+				static amf_type Create( imf::Id imf_r, imf::Zero imf_c ) {
+
+					/**
+					 * Ensure that the assumptions do not break upon potential
+					 * future changes to AMFFactory::Compose.
+					 */
+					static_assert(
+						std::is_same<
+							amf_type,
+							typename Compose< imf::Id, imf::Zero, AMF< imf::Id, imf::Id, typename PolyFactory::poly_type, omp > >::amf_type
+						>::value,
+						"The factory method returns the object of different type than declared. This is a bug."
+					);
+					return Compose< imf::Id, imf::Zero, AMF< imf::Id, imf::Id, typename PolyFactory::poly_type, omp > >::Create(
+						imf_r, imf_c,
+						FromPolynomial< structures::General, imf::Id, imf::Zero >::Create( imf::Id( imf_r.N ), imf::Id( imf_c.N ) )
+					);
+				}
+
+				FromPolynomial() = delete;
+
+			}; // class FromPolynomial
+
+			/**
+			 * @brief Transforms the provided AMF by applying the provided View type.
+			 *
+			 * Exposes the type of the resulting AMF and implements a factory
+			 * method that creates objects of such type.
+			 *
+			 * @tparam view      The enum value of the desired view type.
+			 * @tparam SourceAMF The type of the source AMF.
+			 *
+			 */
+			template< enum view::Views view, typename SourceAMF >
+			struct Reshape {
+
+				typedef SourceAMF amf_type;
+
+				static amf_type Create( const SourceAMF &amf ) {
+					throw std::invalid_argument( "Not implemented for the provided view type." );
+					return amf; // unreachable; silences missing-return warnings
+				}
+
+				Reshape() = delete;
+
+			}; // class Reshape
+
+			template< typename SourceAMF >
+			struct Reshape< view::original, SourceAMF > {
+
+				typedef SourceAMF amf_type;
+
+				static amf_type Create( const SourceAMF &amf ) {
+					return amf_type( amf.imf_r, amf.imf_c, amf.map_poly, amf.storage_dimensions );
+				}
+
+				Reshape() = delete;
+
+			}; // class Reshape< original, ... >
+
+			template< typename SourceAMF >
+			struct Reshape< view::transpose, SourceAMF > {
+
+				typedef AMF<
+					typename SourceAMF::imf_c_type,
+					typename SourceAMF::imf_r_type,
+					typename polynomials::apply_view<
+						view::transpose,
+						typename SourceAMF::mapping_polynomial_type
+					>::type,
+					omp
+				> amf_type;
+
+				static amf_type Create( const SourceAMF &amf ) {
+					typedef typename polynomials::apply_view< view::transpose, typename SourceAMF::mapping_polynomial_type >::type new_mapping_polynomial_type;
+					return AMF<
+						typename SourceAMF::imf_c_type,
+						typename SourceAMF::imf_r_type,
+						new_mapping_polynomial_type,
+						omp
+					>(
+						amf.imf_c,
+						amf.imf_r,
+						new_mapping_polynomial_type(
+							amf.map_poly.ay2, amf.map_poly.ax2, amf.map_poly.axy,
+							amf.map_poly.ay, amf.map_poly.ax,
+							amf.map_poly.a0
+						),
+						amf.storage_dimensions
+					);
+				}
+
+				Reshape() = delete;
+
+			}; // class Reshape< transpose, ... >
+
+			/**
+			 * Specialization for diagonal views
+			 *
+			 * A diagonal view is implemented by taking a square view over the matrix.
+			 *
+			 * \note \internal Converts a mapping polynomial from bivariate-quadratic
+			 *                 to univariate-quadratic by summing j-factors into the
+			 *                 corresponding i-factors.
+			 *                 Implicitly applies the largest possible square view by
+			 *                 using Strided IMFs.
+			 *
+			 */
+			template< typename SourceAMF >
+			struct Reshape< view::diagonal, SourceAMF > {
+
+				private:
+
+					/** Short name of the original mapping polynomial type */
+					typedef typename SourceAMF::mapping_polynomial_type orig_p;
+
+					/** The type of the resulting polynomial */
+					typedef polynomials::BivariateQuadratic<
+						orig_p::Ax2 || orig_p::Ay2 || orig_p::Axy, 0, 0,
+						orig_p::Ax || orig_p::Ay, 0,
+						orig_p::A0, orig_p::D
+					> new_poly_type;
+
+				public:
+
+					typedef AMF< imf::Id, imf::Zero, new_poly_type, omp > amf_type;
+
+					static amf_type Create( const SourceAMF &amf ) {
+						assert( amf.getLogicalDimensions().first == amf.getLogicalDimensions().second );
+						return amf_type(
+							imf::Id( amf.getLogicalDimensions().first ),
+							imf::Zero( 1 ),
+							new_poly_type(
+								orig_p::Ax2 * amf.map_poly.ax2 + orig_p::Ay2 * amf.map_poly.ay2 + orig_p::Axy * amf.map_poly.axy, 0, 0,
+								orig_p::Ax * amf.map_poly.ax + orig_p::Ay * amf.map_poly.ay, 0,
+								amf.map_poly.a0
+							),
+							amf.storage_dimensions
+						);
+					}
+
+					Reshape() = delete;
+
+			}; // class Reshape< diagonal, ... >
+
+			/**
+			 * Specialization for matrix views over vectors
+			 *
+			 * \note \internal The resulting AMF is equivalent to applying
+			 *                 a composition with two Id IMFs.
+			 *
+			 */
+			template< typename SourceAMF >
+			struct Reshape< view::matrix, SourceAMF > {
+
+				typedef typename Compose< imf::Id, imf::Id, SourceAMF >::amf_type amf_type;
+
+				static amf_type Create( const SourceAMF &amf ) {
+					return Compose< imf::Id, imf::Id, SourceAMF >::Create(
+						imf::Id( amf.getLogicalDimensions().first ),
+						imf::Id( amf.getLogicalDimensions().second ),
+						amf
+					);
+				}
+
+				Reshape() = delete;
+
+			}; // class Reshape< matrix, ...
> + + }; // class AMFFactory + + } // namespace storage + +} // namespace alp + +#endif // _H_ALP_OMP_STORAGE diff --git a/include/alp/omp/storagebasedmatrix.hpp b/include/alp/omp/storagebasedmatrix.hpp new file mode 100644 index 000000000..50b333300 --- /dev/null +++ b/include/alp/omp/storagebasedmatrix.hpp @@ -0,0 +1,178 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _H_ALP_OMP_STORAGEBASEDMATRIX +#define _H_ALP_OMP_STORAGEBASEDMATRIX + +#include +#include +#include +#include + +#include + +#include "config.hpp" +#include "storage.hpp" +#include "vector.hpp" + + +namespace alp { + + namespace internal { + + /** + * Matrix container specialization + * Implements both original containers and views on containers. + * @tparam requires_allocation True if the class is an original container + * False if the class is a view of another matrix + */ + template< typename T, typename AmfType, bool requires_allocation > + class StorageBasedMatrix< T, AmfType, requires_allocation, omp > : + public MatrixBase< StorageBasedMatrix< T, AmfType, requires_allocation, omp > > { + + template< + typename MatrixType, + std::enable_if_t< internal::is_storage_based< MatrixType >::value > * + > + friend size_t getStorageDimensions( const MatrixType &A ) noexcept; + + /** Get the reference to the AMF of a storage-based matrix */ + template< + typename MatrixType, + std::enable_if_t< internal::is_storage_based< MatrixType >::value > * + > + friend const typename MatrixType::amf_type &getAmf( const MatrixType &A ) noexcept; + + public: + + /** Expose static properties */ + + typedef T value_type; + typedef AmfType amf_type; + typedef typename AmfType::imf_r_type imf_r_type; + typedef typename AmfType::imf_c_type imf_c_type; + /** Type returned by access function */ + typedef T &access_type; + typedef const T &const_access_type; + /** Type of the index used to access the physical storage */ + typedef typename AmfType::storage_index_type storage_index_type; + + protected: + typedef StorageBasedMatrix< T, AmfType, requires_allocation, omp > self_type; + friend MatrixBase< self_type >; + + typedef typename std::conditional< + requires_allocation, + internal::Vector< T, omp >, + internal::Vector< T, omp > & + >::type container_type; + + /** A container-type view is characterized by its association with a physical container */ + container_type container; + + /** + * Access mapping function maps a pair of logical coordinates + * into the concrete coordinate inside the actual container. + * \see AMF + */ + AmfType amf; + + /** + * determines the size of the matrix via the domain of + * the index mapping functions. + * + * @return A pair of dimensions. 
+				 */
+				std::pair< size_t, size_t > dims() const noexcept {
+					return amf.getLogicalDimensions();
+				}
+
+				size_t getStorageDimensions() const noexcept {
+					return amf.getStorageDimensions();
+				}
+
+				friend const Vector< T, omp > &getContainer( const self_type &A ) {
+					return A.container;
+				}
+
+				friend Vector< T, omp > &getContainer( self_type &A ) {
+					return A.container;
+				}
+
+				bool getInitialized() const noexcept {
+					return internal::getInitialized( container );
+				}
+
+				void setInitialized( const bool initialized ) noexcept {
+					internal::setInitialized( container, initialized );
+				}
+
+				const AmfType &getAmf() const noexcept {
+					return amf;
+				}
+
+				/**
+				 * Returns a constant reference to the element corresponding to
+				 * the provided storage index.
+				 *
+				 * @param si storage index in the physical iteration space.
+				 *
+				 * @return const reference or value of the element at the given position.
+				 *
+				 * \note This function may result in accessing memory belonging to
+				 *       another thread, which may incur a performance penalty.
+				 */
+				const_access_type access( const storage_index_type &si ) const {
+					return getRaw( getLocalContainer( container, si.buffer_id, si.block_id ) )[ si.offset ];
+				}
+
+				access_type access( const storage_index_type &si ) {
+					return getRaw( getLocalContainer( container, si.buffer_id, si.block_id ) )[ si.offset ];
+				}
+
+				storage_index_type getStorageIndex( const size_t i, const size_t j, const size_t s, const size_t P ) const {
+					return amf.getStorageIndex( i, j, s, P );
+				}
+
+				/**
+				 * Constructs a new structured matrix container.
+				 *
+				 * @param amf The access mapping function assigned to this matrix.
+				 */
+				StorageBasedMatrix( AmfType &&amf ) :
+					// \todo enable only if ImfR and ImfC are imf::Id
+					container( amf.getDistribution() ),
+					amf( std::move( amf ) ) {
+#ifdef DEBUG
+					std::cout << "Entering OMP StorageBasedMatrix constructor\n";
+#endif
+				}
+
+				/** View on another container */
+				StorageBasedMatrix( Vector< T, omp > &container, AmfType &&amf ) :
+					container( container ),
+					amf( std::move( amf ) ) {}
+
+		}; // class StorageBasedMatrix
+
+	} // namespace internal
+
+} // namespace alp
+
+#endif // end ``_H_ALP_OMP_STORAGEBASEDMATRIX''
diff --git a/include/alp/omp/vector.hpp b/include/alp/omp/vector.hpp
new file mode 100644
index 000000000..b653de004
--- /dev/null
+++ b/include/alp/omp/vector.hpp
@@ -0,0 +1,355 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * @author A. N. Yzelman
+ * @date 14th of January 2022
+ */
+
+#ifndef _H_ALP_OMP_VECTOR
+#define _H_ALP_OMP_VECTOR
+
+
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include "config.hpp"
+#include "matrix.hpp"
+#include "storage.hpp"
+
+#ifdef _ALP_OMP_WITH_REFERENCE
+	#include
+#endif
+
+
+namespace alp {
+
+	namespace internal {
+
+		template< typename T >
+		size_t getLength( const Vector< T, omp > & ) noexcept;
+
+		template< typename T >
+		const bool & getInitialized( const Vector< T, omp > & v ) noexcept;
+
+		template< typename T >
+		void setInitialized( Vector< T, omp > & v, const bool initialized ) noexcept;
+
+		template< typename T >
+		const Vector< T, config::default_sequential_backend > &getLocalContainer(
+			const Vector< T, omp > &v, const size_t thread, const size_t block
+		) noexcept;
+
+		template< typename T >
+		Vector< T, config::default_sequential_backend > &getLocalContainer(
+			Vector< T, omp > &v, const size_t thread, const size_t block
+		) noexcept;
+
+		/**
+		 * The parallel shared memory implementation of the ALP/Dense vector.
+		 *
+		 * @tparam T The type of an element of this vector. \a T shall not be a
+		 *           GraphBLAS type.
+		 *
+		 * \warning Creating an alp::Vector of other GraphBLAS types is
+		 *          not allowed.
+		 *          Passing a GraphBLAS type as template parameter will lead to
+		 *          undefined behaviour.
+		 */
+		template< typename T >
+		class Vector< T, omp > {
+
+			/* ********************
+				IO friends
+			   ******************** */
+
+			friend size_t internal::getLength< T >( const Vector< T, omp > & ) noexcept;
+
+			friend const bool & internal::getInitialized< T >( const Vector< T, omp > & ) noexcept;
+
+			friend void internal::setInitialized< T >( Vector< T, omp > & , bool ) noexcept;
+
+			friend const Vector< T, config::default_sequential_backend > &getLocalContainer<>(
+				const Vector< T, omp > &v, const size_t thread, const size_t block
+			) noexcept;
+
+			friend Vector< T, config::default_sequential_backend > &getLocalContainer<>(
+				Vector< T, omp > &v, const size_t thread, const size_t block
+			) noexcept;
+
+			private:
+
+				/** The distribution of the vector among threads. */
+				Distribution_2_5D distr;
+
+				/** The number of buffers. */
+				size_t num_buffers;
+
+				/** The array of buffers. */
+				T **buffers;
+
+				/** Containers constructed around portions of buffers. */
+				using container_type = Vector< T, config::default_sequential_backend >;
+				std::vector< std::vector< container_type > > containers;
+
+				/** Whether the container is presently initialized. */
+				bool initialized;
+
+			public:
+
+				/** Exposes the element type. */
+				typedef T value_type;
+
+				/**
+				 * The main ALP/Dense vector constructor.
+				 *
+				 * The constructed object will be uninitialised after successful construction.
+				 *
+				 * @param d   The distribution that determines the number of
+				 *            buffers and blocks to allocate.
+				 * @param cap The requested capacity (currently unused).
+				 *
+				 * @return SUCCESS This function never fails.
+				 *
+				 * \parblock
+				 * \par Performance semantics.
+				 *        -# This constructor entails \f$ \Theta(1) \f$ amount of work.
+				 *        -# This constructor may allocate \f$ \Theta( mn ) \f$ bytes
+				 *           of dynamic memory.
+				 *        -# This constructor will use \f$ \Theta(1) \f$ extra bytes of
+				 *           memory beyond that at constructor entry.
+				 *        -# This constructor incurs \f$ \Theta(1) \f$ data movement.
+				 *        -# This constructor \em may make system calls.
+				 * \endparblock
+				 *
+				 * \warning Avoid the use of this constructor within performance critical
+				 *          code sections.
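+				 *
+				 * A usage sketch (illustrative; \a m and \a n are assumed to
+				 * be defined by the caller):
+				 *
+				 * \code
+				 * const alp::Distribution_2_5D d( m, n, alp::config::OMP::threads() );
+				 * alp::internal::Vector< double, alp::omp > x( d );
+				 * \endcode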
+				 */
+				Vector(
+					const Distribution_2_5D &d,
+					const size_t cap = 0
+				) : distr( d ),
+					num_buffers( d.getNumberOfThreads() ),
+					containers( num_buffers ),
+					initialized( false ) {
+
+					(void) cap;
+
+#ifdef DEBUG
+					std::cout << "Entered OMP internal::Vector constructor\n";
+#endif
+
+					// TODO: Implement allocation properly
+					buffers = new ( std::nothrow ) value_type*[ num_buffers ];
+					if( ( num_buffers > 0 ) && ( buffers == nullptr ) ) {
+						throw std::runtime_error( "Could not allocate memory during alp::Vector construction." );
+					}
+
+					// exceptions may not escape a parallel region; record
+					// allocation failures and throw after the region instead
+					bool alloc_failed = false;
+
+					#pragma omp parallel
+					{
+						const size_t thread = config::OMP::current_thread_ID();
+
+						const auto t_coords = d.getThreadCoords( thread );
+
+						if( d.isActiveThread( t_coords ) ) {
+
+							const auto block_grid_dims = d.getLocalBlockGridDims( t_coords );
+
+							// Assuming that all blocks are of the same size
+							const size_t alloc_size = block_grid_dims.first * block_grid_dims.second * d.getBlockSize();
+
+#ifdef DEBUG
+							#pragma omp critical
+							{
+								if( thread != config::OMP::current_thread_ID() ) {
+									std::cout << "Warning: thread != OMP::current_thread_id()\n";
+								}
+								std::cout << "Thread with global coordinates tr = " << t_coords.tr << " tc = " << t_coords.tc
+									<< " on OpenMP thread " << config::OMP::current_thread_ID()
+									<< " allocating buffer of " << alloc_size << " elements "
+									<< " holding " << block_grid_dims.first << " x " << block_grid_dims.second << " blocks.\n";
+							}
+#endif
+
+							// TODO: Implement allocation properly
+							buffers[ thread ] = new ( std::nothrow ) value_type[ alloc_size ];
+
+							if( buffers[ thread ] == nullptr ) {
+								#pragma omp atomic write
+								alloc_failed = true;
+							} else {
+
+								// Reserve space for all internal container wrappers to avoid re-allocation
+								containers[ thread ].reserve( block_grid_dims.first * block_grid_dims.second );
+
+								// Populate the array of internal container wrappers
+								for( size_t br = 0; br < block_grid_dims.first; ++br ) {
+									for( size_t bc = 0; bc < block_grid_dims.second; ++bc ) {
+										const size_t offset = d.getBlocksOffset( t_coords, br, bc );
+										containers[ thread ].emplace_back( &( buffers[ thread ][ offset ] ), d.getBlockSize() );
+									}
+								}
+
+								// Ensure that the array contains the expected number of containers
+								assert( containers[ thread ].size() == block_grid_dims.first * block_grid_dims.second );
+
+							}
+						} // End active threads allocation
+					}
+					if( alloc_failed ) {
+						throw std::runtime_error( "Could not allocate memory during alp::Vector construction." );
+					}
+				}
+
+				/**
+				 * Copy constructor.
+				 *
+				 * @param other The vector to copy. The initialization state of the copy
+				 *              reflects the state of \a other.
+				 *
+				 * \parblock
+				 * \par Performance semantics.
+				 *      Allocates the same capacity as the \a other vector, even if the
+				 *      actual number of elements contained in \a other is less.
+				 *        -# This constructor entails \f$ \Theta(1) \f$ amount of work.
+				 *        -# This constructor allocates \f$ \Theta(\max{mn, cap} ) \f$ bytes
+				 *           of dynamic memory.
+				 *        -# This constructor incurs \f$ \Theta(mn) \f$ of data
+				 *           movement.
+				 *        -# This constructor \em may make system calls.
+				 * \endparblock
+				 *
+				 * \warning Avoid the use of this constructor within performance critical
+				 *          code sections.
+				 */
+				Vector( const Vector< T, omp > &other ) : Vector( other.distr ) {
+					initialized = other.initialized;
+					// const RC rc = set( *this, other ); // note: initialized will be set as part of this call
+					// if( rc != SUCCESS ) {
+					// 	throw std::runtime_error( "alp::Vector< T, omp > (copy constructor): error during call to alp::set (" + toString( rc ) + ")" );
+					// }
+				}
+
+				/**
+				 * Move constructor.
+				 * The new vector equals the given
+				 * vector. Invalidates the use of the input vector.
+				 *
+				 * @param[in] other The GraphBLAS vector to move to this new instance.
+				 *
+				 * \parblock
+				 * \par Performance semantics.
+				 *        -# This constructor entails \f$ \Theta(1) \f$ amount of work.
+				 *        -# This constructor will not allocate any new dynamic memory.
+				 *        -# This constructor will use \f$ \Theta(1) \f$ extra bytes of
+				 *           memory beyond that at constructor entry.
+				 *        -# This constructor will move \f$ \Theta(1) \f$ bytes of data.
+				 * \endparblock
+				 */
+				Vector( Vector< T, omp > &&other ) :
+					distr( other.distr ),
+					num_buffers( other.num_buffers ),
+					buffers( other.buffers ),
+					containers( std::move( other.containers ) ),
+					initialized( other.initialized ) {
+					other.buffers = nullptr;
+					other.initialized = false;
+				}
+
+				/**
+				 * Vector destructor.
+				 *
+				 * \parblock
+				 * \par Performance semantics.
+				 *        -# This destructor entails \f$ \Theta(1) \f$ amount of work.
+				 *        -# This destructor will not perform any memory allocations.
+				 *        -# This destructor will use \f$ \mathcal{O}(1) \f$ extra bytes of
+				 *           memory beyond that at constructor entry.
+				 *        -# This destructor will move \f$ \Theta(1) \f$ bytes of data.
+				 *        -# This destructor makes system calls.
+				 * \endparblock
+				 *
+				 * \warning Avoid calling destructors from within performance critical
+				 *          code sections.
+				 */
+				~Vector() {
+					if( buffers != nullptr ) {
+						#pragma omp parallel
+						{
+							const size_t thread = config::OMP::current_thread_ID();
+
+							if( distr.isActiveThread( thread ) ) {
+								if( buffers[ thread ] != nullptr ) {
+									delete [] buffers[ thread ];
+								}
+							}
+						}
+						delete [] buffers;
+					}
+				}
+
+		};
+
+		/** Identifies any omp internal vector as an internal container. */
+		template< typename T >
+		struct is_container< internal::Vector< T, omp > > : std::true_type {};
+
+	} // end namespace ``alp::internal''
+
+	namespace internal {
+
+		template< typename T >
+		size_t getLength( const Vector< T, omp > &v ) noexcept {
+			return v.n;
+		}
+
+		template< typename T >
+		const bool & getInitialized( const Vector< T, omp > & v ) noexcept {
+			return v.initialized;
+		}
+
+		template< typename T >
+		void setInitialized( Vector< T, omp > & v, bool initialized ) noexcept {
+			v.initialized = initialized;
+		}
+
+		template< typename T >
+		const Vector< T, config::default_sequential_backend > &getLocalContainer(
+			const Vector< T, omp > &v,
+			const size_t thread, const size_t block
+		) noexcept {
+			assert( thread < v.num_buffers );
+			assert( block < v.containers[ thread ].size() );
+			return v.containers[ thread ][ block ];
+		}
+
+		template< typename T >
+		Vector< T, config::default_sequential_backend > &getLocalContainer(
+			Vector< T, omp > &v,
+			const size_t thread, const size_t block
+		) noexcept {
+			assert( thread < v.num_buffers );
+			assert( block < v.containers[ thread ].size() );
+			return v.containers[ thread ][ block ];
+		}
+
+	} // end namespace ``alp::internal''
+
+} // end namespace ``alp''
+
+#endif // end ``_H_ALP_OMP_VECTOR''
+
diff --git a/include/alp/reference/blas2.hpp b/include/alp/reference/blas2.hpp
index e29d6457e..09a4820dc 100644
--- a/include/alp/reference/blas2.hpp
+++ b/include/alp/reference/blas2.hpp
@@ -950,6 +950,9 @@ namespace alp {
 			!alp::is_object< IOType >::value &&
 			!alp::is_object< InputType >::value &&
 			alp::is_monoid< Monoid >::value
 		> * const = nullptr
 	) {
+#ifdef _DEBUG
+		std::cout << "Called alp::foldl (scalar-to-matrix, monoid, reference)" << std::endl;
+#endif
 		// static sanity checks
 		NO_CAST_OP_ASSERT( ( !( descr & descriptors::no_casting ) ||
 				std::is_same< typename Monoid::D1, IOType >::value ),
@@
-992,6 +995,9 @@ namespace alp { !alp::is_object< IOType >::value && !alp::is_object< InputType >::value && alp::is_operator< Operator >::value > * const = nullptr ) { +#ifdef _DEBUG + std::cout << "Called alp::foldl (scalar-to-matrix, operator, reference)" << std::endl; +#endif // static sanity checks NO_CAST_OP_ASSERT( ( !( descr & descriptors::no_casting ) || std::is_same< typename Operator::D1, IOType >::value ), diff --git a/include/alp/reference/blas3.hpp b/include/alp/reference/blas3.hpp index b0ad5ef3d..8a76ee6e3 100644 --- a/include/alp/reference/blas3.hpp +++ b/include/alp/reference/blas3.hpp @@ -490,46 +490,71 @@ namespace alp { * @param phase The execution phase. */ template< - typename OutputStructMatT, - typename InputStructMatT1, - typename InputStructMatT2, + typename OutputType, typename InputType1, typename InputType2, + typename OutputStructure, typename OutputView, + typename OutputImfR, typename OutputImfC, + typename InputStructure1, typename InputView1, + typename InputImfR1, typename InputImfC1, + typename InputStructure2, typename InputView2, + typename InputImfR2, typename InputImfC2, class Semiring > RC mxm( - OutputStructMatT & C, - const InputStructMatT1 & A, - const InputStructMatT2 & B, + alp::Matrix< OutputType, OutputStructure, + Density::Dense, OutputView, OutputImfR, OutputImfC, reference > &C, + const alp::Matrix< InputType1, InputStructure1, + Density::Dense, InputView1, InputImfR1, InputImfC1, reference > &A, + const alp::Matrix< InputType2, InputStructure2, + Density::Dense, InputView2, InputImfR2, InputImfC2, reference > &B, const Semiring & ring = Semiring(), const PHASE &phase = NUMERICAL, - const std::enable_if_t< - ! alp::is_object< typename OutputStructMatT::value_type >::value && ! alp::is_object< typename InputStructMatT1::value_type >::value && ! alp::is_object< typename InputStructMatT2::value_type >::value && alp::is_semiring< Semiring >::value - > * const = NULL + const typename std::enable_if< + !alp::is_object< OutputType >::value && + !alp::is_object< InputType1 >::value && + !alp::is_object< InputType2 >::value && + alp::is_semiring< Semiring >::value, + void + >::type * const = NULL ) { (void)phase; - return internal::mxm_generic< false >( C, A, B, ring.getMultiplicativeOperator(), ring.getAdditiveMonoid(), ring.getMultiplicativeMonoid() ); + return internal::mxm_generic< false >( + C, A, B, + ring.getMultiplicativeOperator(), ring.getAdditiveMonoid(), ring.getMultiplicativeMonoid() + ); } /** * Dense Matrix-Matrix multiply between structured matrices. 
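 * Version with additive monoid and multiplicative operator
 *
 * A call sketch (illustrative; assumes C, A, and B are dense general
 * matrices over doubles, and that alp::Monoid composes an operator with
 * an identity as in the reference headers):
 * \code
 * alp::operators::mul< double > mulOp;
 * alp::Monoid< alp::operators::add< double >, alp::identities::zero > addM;
 * const alp::RC rc = alp::mxm( C, A, B, mulOp, addM );
 * \endcode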
 */
-	template< typename OutputStructMatT,
-		typename InputStructMatT1,
-		typename InputStructMatT2,
+	template<
+		typename OutputType, typename InputType1, typename InputType2,
+		typename OutputStructure, typename OutputView,
+		typename OutputImfR, typename OutputImfC,
+		typename InputStructure1, typename InputView1,
+		typename InputImfR1, typename InputImfC1,
+		typename InputStructure2, typename InputView2,
+		typename InputImfR2, typename InputImfC2,
 		class Operator, class Monoid
 	>
 	RC mxm(
-		OutputStructMatT & C,
-		const InputStructMatT1 & A,
-		const InputStructMatT2 & B,
+		alp::Matrix< OutputType, OutputStructure,
+			Density::Dense, OutputView, OutputImfR, OutputImfC, reference > &C,
+		const alp::Matrix< InputType1, InputStructure1,
+			Density::Dense, InputView1, InputImfR1, InputImfC1, reference > &A,
+		const alp::Matrix< InputType2, InputStructure2,
+			Density::Dense, InputView2, InputImfR2, InputImfC2, reference > &B,
 		const Operator & mulOp,
 		const Monoid & addM,
 		const PHASE &phase = NUMERICAL,
-		const std::enable_if_t<
-			! alp::is_object< typename OutputStructMatT::value_type >::value && ! alp::is_object< typename InputStructMatT1::value_type >::value && ! alp::is_object< typename InputStructMatT2::value_type >::value &&
-			alp::is_operator< Operator >::value && alp::is_monoid< Monoid >::value
-		> * const = NULL
+		const typename std::enable_if<
+			!alp::is_object< OutputType >::value &&
+			!alp::is_object< InputType1 >::value &&
+			!alp::is_object< InputType2 >::value &&
+			alp::is_operator< Operator >::value && alp::is_monoid< Monoid >::value,
+			void
+		>::type * const = NULL
 	) {
 		(void)phase;
diff --git a/include/alp/reference/io.hpp b/include/alp/reference/io.hpp
index c6de63f5d..da1e287c5 100644
--- a/include/alp/reference/io.hpp
+++ b/include/alp/reference/io.hpp
@@ -878,9 +878,6 @@ namespace alp {
 		const fwd_iterator &start,
 		const fwd_iterator &end
 	) noexcept {
-		(void)A;
-		(void)start;
-		(void)end;
 		// Temporarily assuming 1-1 mapping with user container
 		internal::setInitialized(A, true);
diff --git a/include/alp/reference/vector.hpp b/include/alp/reference/vector.hpp
index d84e6f986..e1b035442 100644
--- a/include/alp/reference/vector.hpp
+++ b/include/alp/reference/vector.hpp
@@ -107,6 +107,8 @@ namespace alp {
 			/** Whether the container presently is uninitialized. */
 			bool initialized;
+			/** Whether the destructor should free the vector data. True when the vector data is allocated by this class. */
+			bool should_free_data;
 
 		public:
 
@@ -140,7 +142,8 @@ namespace alp {
 			 * \warning Avoid the use of this constructor within performance critical
 			 *          code sections.
 			 */
-			Vector( const size_t length, const size_t cap = 0 ) : n( length ), cap( std::max( length, cap ) ), initialized( false ) {
+			Vector( const size_t length, const size_t cap = 0 ) :
+				n( length ), cap( std::max( length, cap ) ), initialized( false ), should_free_data( true ) {
 				// TODO: Implement allocation properly
 				if( n > 0) {
 					data = new (std::nothrow) T[ n ];
@@ -153,6 +156,34 @@
 				}
 			}
 
+			/**
+			 * The ALP/Dense vector constructor wrapping already-allocated data.
+			 *
+			 * The constructed object will be uninitialised after successful construction.
+			 *
+			 * @param data   The pointer to an already-allocated array of
+			 *               elements of type T.
+			 * @param length The number of elements the array above guarantees
+			 *               to hold.
+			 * @param cap    The capacity (not used, kept for compatibility with the sparse backend)
+			 *
+			 * \parblock
+			 * \par Performance semantics.
+			 *        -# This constructor entails \f$ \Theta(1) \f$ amount of work.
+			 *        -# This constructor may allocate \f$ \Theta( 1 ) \f$ bytes
+			 *           of dynamic memory.
+			 *        -# This constructor will use \f$ \Theta(1) \f$ extra bytes of
+			 *           memory beyond that at constructor entry.
+			 *        -# This constructor incurs \f$ \Theta(1) \f$ data movement.
+			 *        -# This constructor does \em not make system calls.
+			 * \endparblock
+			 *
+			 * \warning Avoid the use of this constructor within performance critical
+			 *          code sections.
+			 */
+			Vector( T *data, const size_t length, const size_t cap = 0 ) :
+				n( length ), cap( std::max( length, cap ) ), data( data ), initialized( false ), should_free_data( false ) {}
+
 			/**
 			 * Copy constructor.
 			 *
@@ -222,7 +253,7 @@
 			 *          code sections.
 			 */
 			~Vector() {
-				if( data != nullptr ) {
+				if( data != nullptr && should_free_data ) {
 					delete [] data;
 				}
 			}
diff --git a/include/alp/scalar.hpp b/include/alp/scalar.hpp
index 5cd5f30b3..33210aba2 100644
--- a/include/alp/scalar.hpp
+++ b/include/alp/scalar.hpp
@@ -28,6 +28,9 @@
 #ifdef _ALP_WITH_REFERENCE
  #include
 #endif
+#ifdef _ALP_WITH_OMP
+ #include
+#endif
 
 // specify default only if requested during compilation
 // #ifdef _ALP_BACKEND
diff --git a/include/alp/storage.hpp b/include/alp/storage.hpp
index d13957fd6..2fd3f8816 100644
--- a/include/alp/storage.hpp
+++ b/include/alp/storage.hpp
@@ -30,4 +30,8 @@
 #include
 #endif
 
+#ifdef _ALP_WITH_OMP
+ #include
+#endif
+
 #endif // _H_ALP_STORAGE
diff --git a/include/alp/type_traits.hpp b/include/alp/type_traits.hpp
index 39f3ebf6a..975a8a14b 100644
--- a/include/alp/type_traits.hpp
+++ b/include/alp/type_traits.hpp
@@ -409,6 +409,12 @@ namespace alp {
 				typedef new_container_type_from< type > _and_;
 			};
 
+			template< enum Backend new_backend >
+			struct change_backend {
+				typedef Container< T, Structure, density, View, ImfR, ImfC, new_backend > type;
+				typedef new_container_type_from< type > _and_;
+			};
+
 		private:
 			new_container_type_from() = delete;
 	};
diff --git a/include/alp/utils.hpp b/include/alp/utils.hpp
index 330c69b3b..d053f998d 100644
--- a/include/alp/utils.hpp
+++ b/include/alp/utils.hpp
@@ -26,6 +26,7 @@
 #include
 #include //fabs
+#include // modulus
 #include //numeric_limits
 #include
@@ -295,7 +296,21 @@
 		}
 	};
-
+
+
+	/**
+	 * Computes the modulus \f$ mod( x, n ) \f$ from the remainder of integer
+	 * division, returning a value within the range [0, n).
+	 * Assumes \a T implements: \a operator%, \a operator<, and \a operator+.
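+	 *
+	 * For example (illustrative), the built-in remainder of -3 and 5 is -3,
+	 * whereas this function returns the non-negative representative:
+	 * \code
+	 * assert( alp::utils::modulus( -3, 5 ) == 2 );
+	 * \endcode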
rem + n : rem; + }; + } // namespace utils } // namespace alp diff --git a/include/alp/vector.hpp b/include/alp/vector.hpp index a5b8a8c41..78d19a323 100644 --- a/include/alp/vector.hpp +++ b/include/alp/vector.hpp @@ -30,6 +30,9 @@ #ifdef _ALP_WITH_REFERENCE #include #endif +#ifdef _ALP_WITH_OMP + #include +#endif // specify default only if requested during compilation #ifdef _ALP_BACKEND diff --git a/src/alp/CMakeLists.txt b/src/alp/CMakeLists.txt index 29cc55f2c..18f4f4b9b 100644 --- a/src/alp/CMakeLists.txt +++ b/src/alp/CMakeLists.txt @@ -36,3 +36,7 @@ set( backend_alp_reference_srcs if( WITH_ALP_REFERENCE_BACKEND ) add_subdirectory( reference ) endif() + +if( WITH_ALP_OMP_BACKEND ) + add_subdirectory( omp ) +endif() diff --git a/src/alp/omp/CMakeLists.txt b/src/alp/omp/CMakeLists.txt new file mode 100644 index 000000000..9cf4ff7c1 --- /dev/null +++ b/src/alp/omp/CMakeLists.txt @@ -0,0 +1,54 @@ +# +# Copyright 2021 Huawei Technologies Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +assert_valid_variables( BACKEND_LIBRARY_OUTPUT_NAME VERSION + ALP_OMP_BACKEND_INSTALL_DIR INCLUDE_INSTALL_DIR + ALP_OMP_BACKEND_DEFAULT_NAME + ALP_OMP_SELECTION_DEFS +) + +# sources for ALP OMP backend +set( backend_alp_omp_srcs + ${CMAKE_CURRENT_SOURCE_DIR}/../descriptors.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../rc.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/init.cpp +) + +add_library( backend_alp_omp_static STATIC "${backend_alp_omp_srcs}" ) + +target_link_libraries( backend_alp_omp_static PUBLIC backend_alp_omp_headers ) + +set_target_properties( backend_alp_omp_static PROPERTIES + OUTPUT_NAME "${BACKEND_LIBRARY_OUTPUT_NAME}" +) + +set_target_properties( backend_alp_omp_static PROPERTIES + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/omp" +) + +target_compile_definitions( backend_alp_omp_static PUBLIC "${ALP_OMP_SELECTION_DEFS}" ) +target_link_libraries( backend_alp_omp_static PRIVATE backend_flags ) + +add_dependencies( libs backend_alp_omp_static ) + +install( TARGETS backend_alp_omp_static + EXPORT GraphBLASTargets + ARCHIVE DESTINATION "${ALP_OMP_BACKEND_INSTALL_DIR}" + ) + +# this is an alias for add_grb_executables() to select the backend to link against +# DO NOT CHANGE THE ALIAS NAME! +add_library( "${ALP_OMP_BACKEND_DEFAULT_NAME}" ALIAS backend_alp_omp_static ) diff --git a/src/alp/omp/init.cpp b/src/alp/omp/init.cpp new file mode 100644 index 000000000..f57351285 --- /dev/null +++ b/src/alp/omp/init.cpp @@ -0,0 +1,60 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
diff --git a/include/alp/vector.hpp b/include/alp/vector.hpp
index a5b8a8c41..78d19a323 100644
--- a/include/alp/vector.hpp
+++ b/include/alp/vector.hpp
@@ -30,6 +30,9 @@
 #ifdef _ALP_WITH_REFERENCE
 	#include <alp/reference/vector.hpp>
 #endif
+#ifdef _ALP_WITH_OMP
+	#include <alp/omp/vector.hpp>
+#endif
 
 // specify default only if requested during compilation
 #ifdef _ALP_BACKEND
diff --git a/src/alp/CMakeLists.txt b/src/alp/CMakeLists.txt
index 29cc55f2c..18f4f4b9b 100644
--- a/src/alp/CMakeLists.txt
+++ b/src/alp/CMakeLists.txt
@@ -36,3 +36,7 @@ set( backend_alp_reference_srcs
 if( WITH_ALP_REFERENCE_BACKEND )
 	add_subdirectory( reference )
 endif()
+
+if( WITH_ALP_OMP_BACKEND )
+	add_subdirectory( omp )
+endif()
diff --git a/src/alp/omp/CMakeLists.txt b/src/alp/omp/CMakeLists.txt
new file mode 100644
index 000000000..9cf4ff7c1
--- /dev/null
+++ b/src/alp/omp/CMakeLists.txt
@@ -0,0 +1,54 @@
+#
+# Copyright 2021 Huawei Technologies Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+assert_valid_variables( BACKEND_LIBRARY_OUTPUT_NAME VERSION
+	ALP_OMP_BACKEND_INSTALL_DIR INCLUDE_INSTALL_DIR
+	ALP_OMP_BACKEND_DEFAULT_NAME
+	ALP_OMP_SELECTION_DEFS
+)
+
+# sources for the ALP OMP backend
+set( backend_alp_omp_srcs
+	${CMAKE_CURRENT_SOURCE_DIR}/../descriptors.cpp
+	${CMAKE_CURRENT_SOURCE_DIR}/../rc.cpp
+	${CMAKE_CURRENT_SOURCE_DIR}/init.cpp
+)
+
+add_library( backend_alp_omp_static STATIC "${backend_alp_omp_srcs}" )
+
+target_link_libraries( backend_alp_omp_static PUBLIC backend_alp_omp_headers )
+
+set_target_properties( backend_alp_omp_static PROPERTIES
+	OUTPUT_NAME "${BACKEND_LIBRARY_OUTPUT_NAME}"
+)
+
+set_target_properties( backend_alp_omp_static PROPERTIES
+	ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/omp"
+)
+
+target_compile_definitions( backend_alp_omp_static PUBLIC "${ALP_OMP_SELECTION_DEFS}" )
+target_link_libraries( backend_alp_omp_static PRIVATE backend_flags )
+
+add_dependencies( libs backend_alp_omp_static )
+
+install( TARGETS backend_alp_omp_static
+	EXPORT GraphBLASTargets
+	ARCHIVE DESTINATION "${ALP_OMP_BACKEND_INSTALL_DIR}"
+)
+
+# this is an alias for add_grb_executables() to select the backend to link against
+# DO NOT CHANGE THE ALIAS NAME!
+add_library( "${ALP_OMP_BACKEND_DEFAULT_NAME}" ALIAS backend_alp_omp_static )
diff --git a/src/alp/omp/init.cpp b/src/alp/omp/init.cpp
new file mode 100644
index 000000000..f57351285
--- /dev/null
+++ b/src/alp/omp/init.cpp
@@ -0,0 +1,60 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * @author A. N. Yzelman
+ * @date 2nd of February, 2017
+ */
+
+#include <iostream>
+#include
+#include
+
+#ifndef _GRB_NO_LIBNUMA
+	#include <numa.h> //numa_set_localalloc
+#endif
+#include <omp.h> //omp_get_num_threads
+
+
+template<>
+alp::RC alp::init< alp::omp >( const size_t s, const size_t P, void * const data ) {
+	(void) data;
+	RC rc = alp::SUCCESS;
+	// report the thread configuration
+	const auto T = config::OMP::threads();
+	std::cout << "Info: alp::init (omp) called. OpenMP is set to utilise " << T << " threads.\n";
+
+	// sanity checks: this is a shared-memory backend
+	if( P > 1 ) {
+		return alp::UNSUPPORTED;
+	}
+	if( s > 0 ) {
+		return alp::PANIC;
+	}
+#ifndef _GRB_NO_LIBNUMA
+	// set the memory policy to allocate on the local NUMA node
+	numa_set_localalloc();
+#endif
+	return rc;
+}
+
+template<>
+alp::RC alp::finalize< alp::omp >() {
+	std::cout << "Info: alp::finalize (omp) called.\n";
+	return alp::SUCCESS;
+}
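For reference, a direct, single-process use of the two specialisations defined above would look as follows. This is a sketch only: in practice the unit tests further below go through alp::Launcher rather than calling init and finalize by hand.

	#include <alp.hpp>

	int main() {
		// s = 0, P = 1: alp_omp is shared-memory only, so P > 1 yields
		// alp::UNSUPPORTED and a nonzero process id s yields alp::PANIC
		if( alp::init< alp::omp >( 0, 1, nullptr ) != alp::SUCCESS ) {
			return 1;
		}
		// ... ALP/Dense computations executed by the OpenMP backend ...
		return alp::finalize< alp::omp >() == alp::SUCCESS ? 0 : 1;
	}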
diff --git a/tests/smoke/smoketests.sh b/tests/smoke/smoketests.sh
index 9838353c2..883b68797 100755
--- a/tests/smoke/smoketests.sh
+++ b/tests/smoke/smoketests.sh
@@ -511,7 +511,9 @@ for BACKEND in ${BACKENDS[@]}; do
 done
 
 for BACKEND in ${BACKENDS[@]}; do
-	if [ "${BACKEND:0:4}" != "alp_" ]; then
+	# temporarily execute these tests only for the alp_reference backend,
+	# until all ALP backends support all smoke tests
+	if [ "${BACKEND}" != "alp_reference" ]; then
 		continue
 	fi
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index 3674ad618..f16504294 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -265,6 +265,14 @@ add_grb_executables( dense_mxm dense_mxm.cpp
 	BACKENDS alp_reference
 )
 
+add_grb_executables( dense_omp_matrix dense_omp_matrix.cpp
+	BACKENDS alp_omp
+)
+
+add_grb_executables( dense_omp_mxm dense_omp_mxm.cpp
+	BACKENDS alp_omp
+)
+
 add_grb_executables( dense_outer dense_outer.cpp
 	BACKENDS alp_reference
 )
diff --git a/tests/unit/dense_omp_matrix.cpp b/tests/unit/dense_omp_matrix.cpp
new file mode 100644
index 000000000..bf93e808b
--- /dev/null
+++ b/tests/unit/dense_omp_matrix.cpp
@@ -0,0 +1,95 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define _DEBUG
+
+#include <iostream>
+#include <sstream>
+
+#include <alp.hpp>
+#include "../utils/print_alp_containers.hpp"
+
+void alp_program( const size_t &n, alp::RC &rc ) {
+
+	typedef double T;
+
+	rc = alp::SUCCESS;
+	alp::Semiring<
+		alp::operators::add< T >, alp::operators::mul< T >,
+		alp::identities::zero, alp::identities::one
+	> ring;
+
+	std::cout << "\tStarting dense OMP matrix test with size: " << n << " x " << n << "\n";
+
+	// create the matrix
+	alp::Matrix< T, alp::structures::General > M( n, n );
+
+	// set all matrix elements to one
+	rc = rc ? rc : alp::set( M, alp::Scalar< T >( ring.template getOne< T >() ) );
+
+	print_matrix( "M", M );
+}
+
+int main( int argc, char **argv ) {
+	// defaults
+	bool printUsage = false;
+	size_t in = 5;
+
+	// error checking
+	if( argc > 2 ) {
+		printUsage = true;
+	}
+	if( argc == 2 ) {
+		size_t read;
+		std::istringstream ss( argv[ 1 ] );
+		if( !( ss >> read ) ) {
+			std::cerr << "Error parsing first argument\n";
+			printUsage = true;
+		} else if( !ss.eof() ) {
+			std::cerr << "Error parsing first argument\n";
+			printUsage = true;
+		} else if( read % 2 != 0 ) {
+			std::cerr << "Given value for n is odd\n";
+			printUsage = true;
+		} else {
+			// all OK
+			in = read;
+		}
+	}
+	if( printUsage ) {
+		std::cerr << "Usage: " << argv[ 0 ] << " [n]\n";
+		std::cerr << "  n (optional, default is 5): an even integer, the test size.\n";
+		return 1;
+	}
+
+	std::cout << "This is functional test " << argv[ 0 ] << "\n";
+	alp::Launcher< alp::AUTOMATIC > launcher;
+	alp::RC out;
+	if( launcher.exec( &alp_program, in, out, true ) != alp::SUCCESS ) {
+		std::cerr << "Launching test FAILED\n";
+		return 255;
+	}
+	if( out != alp::SUCCESS ) {
+		std::cerr << "Test FAILED (" << alp::toString( out ) << ")" << std::endl;
+	} else {
+		std::cout << "Test OK" << std::endl;
+	}
+	return 0;
+}
diff --git a/tests/unit/dense_omp_mxm.cpp b/tests/unit/dense_omp_mxm.cpp
new file mode 100644
index 000000000..3628d5d45
--- /dev/null
+++ b/tests/unit/dense_omp_mxm.cpp
@@ -0,0 +1,348 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include <alp.hpp>
+#ifndef NDEBUG
+	#include "../utils/print_alp_containers.hpp"
+#endif
+
+using namespace alp;
+
+
+template< typename T >
+void print_stdvec_as_matrix( std::string name, const std::vector< T > &vA, const size_t m, const size_t n, const size_t lda ) {
+
+	std::cout << "Vec " << name << ":" << std::endl;
+	for( size_t row = 0; row < m; ++row ) {
+		std::cout << "[\t";
+		for( size_t col = 0; col < n; ++col ) {
+			std::cout << vA[ row * lda + col ] << "\t";
+		}
+		std::cout << "]" << std::endl;
+	}
+}
+
+template< typename T, typename Operator, typename Monoid >
+void mxm_stdvec_as_matrix(
+	std::vector< T > &vC, const size_t ldc,
+	const std::vector< T > &vA, const size_t lda,
+	const std::vector< T > &vB, const size_t ldb,
+	const size_t m, const size_t k, const size_t n,
+	const Operator oper,
+	const Monoid monoid
+) {
+
+	T temp;
+
+#ifndef NDEBUG
+	print_stdvec_as_matrix( "vA", vA, m, k, lda );
+	print_stdvec_as_matrix( "vB", vB, k, n, ldb );
+	print_stdvec_as_matrix( "vC - PRE", vC, m, n, ldc );
+#endif
+
+	for( size_t i = 0; i < m; ++i ) {
+		for( size_t j = 0; j < n; ++j ) {
+			T &c_val { vC[ i * ldc + j ] };
+			for( size_t l = 0; l < k; ++l ) {
+				const T &a_val { vA[ i * lda + l ] };
+				const T &b_val { vB[ l * ldb + j ] };
+				// in semiring terms: c_val = c_val (+) ( a_val (*) b_val )
+				(void) internal::apply( temp, a_val, b_val, oper );
+				(void) internal::foldl( c_val, temp, monoid.getOperator() );
+			}
+		}
+	}
+
+#ifndef NDEBUG
+	print_stdvec_as_matrix( "vC - POST", vC, m, n, ldc );
+#endif
+}
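In semiring notation, the reference kernel above computes, with \f$ \otimes \f$ the multiplicative operator and \f$ \oplus \f$ the additive monoid's operator,

	\f$ C_{ij} \leftarrow C_{ij} \oplus \bigoplus_{l=0}^{k-1} \left( A_{il} \otimes B_{lj} \right) \f$, for \f$ 0 \le i < m \f$ and \f$ 0 \le j < n \f$,

i.e. products are folded into the existing contents of vC, which is why the callers below pre-initialise C_vec with the semiring's additive identity.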
+template< typename Structure, typename T >
+void stdvec_build_matrix( std::vector< T > &vA, const size_t m, const size_t n, const size_t lda, const T zero, const T one ) {
+
+	if( std::is_same< Structure, structures::General >::value ) {
+		std::fill( vA.begin(), vA.end(), one );
+	} else if( std::is_same< Structure, structures::Symmetric >::value ) {
+		std::fill( vA.begin(), vA.end(), one );
+	} else if( std::is_same< Structure, structures::UpperTriangular >::value ) {
+		for( size_t row = 0; row < m; ++row ) {
+			for( size_t col = 0; col < row; ++col ) {
+				vA[ row * lda + col ] = zero;
+			}
+			for( size_t col = row; col < n; ++col ) {
+				vA[ row * lda + col ] = one;
+			}
+		}
+	}
+}
+
+template< typename Structure, typename T >
+void stdvec_build_matrix( std::vector< T > &vA, const size_t m, const size_t n, const size_t lda, const T zero, const T one, const T inc ) {
+
+	T val = one;
+	if( std::is_same< Structure, structures::General >::value ) {
+		for( size_t row = 0; row < m; ++row ) {
+			for( size_t col = 0; col < n; ++col ) {
+				vA[ row * lda + col ] = val;
+				val += inc;
+			}
+		}
+	} else if( std::is_same< Structure, structures::Symmetric >::value ) {
+		for( size_t row = 0; row < m; ++row ) {
+			for( size_t col = row; col < n; ++col ) {
+				vA[ row * lda + col ] = vA[ col * lda + row ] = val;
+				val += inc;
+			}
+		}
+	} else if( std::is_same< Structure, structures::UpperTriangular >::value ) {
+		for( size_t row = 0; row < m; ++row ) {
+			for( size_t col = 0; col < row; ++col ) {
+				vA[ row * lda + col ] = zero;
+			}
+			for( size_t col = row; col < n; ++col ) {
+				vA[ row * lda + col ] = val;
+				val += inc;
+			}
+		}
+	}
+}
+
+template< typename Structure, typename T >
+void stdvec_build_matrix_packed( std::vector< T > &vA, const T one ) {
+
+	std::fill( vA.begin(), vA.end(), one );
+}
+
+template< typename Structure, typename T >
+void stdvec_build_matrix_packed( std::vector< T > &vA, const T one, const T inc ) {
+
+	// both structures assume a packed row-major, upper storage,
+	// hence the packed buffer is filled identically
+	if( std::is_same< Structure, structures::Symmetric >::value ||
+		std::is_same< Structure, structures::UpperTriangular >::value
+	) {
+		T val = one;
+		for( auto &elem : vA ) {
+			elem = val;
+			val += inc;
+		}
+	}
+}
+
+template< typename MatType, typename T >
+void diff_stdvec_matrix( const std::vector< T > &vA, const size_t m, const size_t n, const size_t lda,
+	const MatType &mA, double threshold = 1e-7 ) {
+
+	if( std::is_same< typename MatType::structure, structures::General >::value ) {
+		for( size_t row = 0; row < m; ++row ) {
+			for( size_t col = 0; col < n; ++col ) {
+				double va = ( double )( vA[ row * lda + col ] );
+				double vm = ( double )( internal::access( mA, internal::getStorageIndex( mA, row, col ) ) );
+				double re = std::abs( ( va - vm ) / va );
+				if( re > threshold ) {
+					std::cout << "Error ( " << row << ", " << col << " ): " << va << " v " << vm << std::endl;
+				}
+			}
+		}
+	} else if( std::is_same< typename MatType::structure, structures::Symmetric >::value ) {
+		for( size_t row = 0; row < m; ++row ) {
+			for( size_t col = row; col < n; ++col ) {
+				double va = ( double )( vA[ row * lda + col ] );
+				double vm = ( double )( internal::access( mA, internal::getStorageIndex( mA, row, col ) ) );
+				double re = std::abs( ( va - vm ) / va );
+				if( re > threshold ) {
+					std::cout << "Error ( " << row << ", " << col << " ): " << va << " v " << vm << std::endl;
+				}
+			}
+		}
+	} else if( std::is_same< typename MatType::structure, structures::UpperTriangular >::value ) {
+		for( size_t row = 0; row < m; ++row ) {
+			for( size_t col = row; col < n; ++col ) {
+				double va = ( double )( vA[ row * lda + col ] );
+				double vm = ( double )( internal::access( mA, internal::getStorageIndex( mA, row, col ) ) );
+				double re = std::abs( ( va - vm ) / va );
+				if( re > threshold ) {
+					std::cout << "Error ( " << row << ", " << col << " ): " << va << " v " << vm << std::endl;
+				}
+			}
+		}
+	}
+}
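The three branches of diff_stdvec_matrix differ only in the starting column of the inner loop (0 for General, the diagonal for Symmetric and UpperTriangular). A possible deduplication, sketched against the same internal accessors the patch already uses:

	template< typename MatType, typename T >
	void diff_stdvec_matrix( const std::vector< T > &vA,
		const size_t m, const size_t n, const size_t lda,
		const MatType &mA, const double threshold = 1e-7
	) {
		// only General compares the strictly-lower triangle
		const bool full = std::is_same< typename MatType::structure, structures::General >::value;
		for( size_t row = 0; row < m; ++row ) {
			for( size_t col = ( full ? 0 : row ); col < n; ++col ) {
				const double va = static_cast< double >( vA[ row * lda + col ] );
				const double vm = static_cast< double >( internal::access( mA, internal::getStorageIndex( mA, row, col ) ) );
				if( std::abs( ( va - vm ) / va ) > threshold ) {
					std::cout << "Error ( " << row << ", " << col << " ): " << va << " v " << vm << std::endl;
				}
			}
		}
	}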
+template< typename T, typename SemiringT >
+void run_mxm( const size_t m, const size_t k, const size_t n, alp::RC &rc ) {
+
+	const SemiringT ring;
+	const T one = ring.template getOne< T >();
+	const T zero = ring.template getZero< T >();
+
+	std::vector< T > A_data( m * k );
+	std::vector< T > B_data( k * n );
+	std::vector< T > C_data( m * n, zero );
+
+	std::vector< T > A_vec( m * k );
+	std::vector< T > B_vec( k * n );
+	std::vector< T > C_vec( m * n, zero );
+
+	std::cout << "\tTesting dense General mxm " << m << " " << k << " " << n << std::endl;
+
+	stdvec_build_matrix< structures::General >( A_data, m, k, k, zero, one, one );
+	stdvec_build_matrix< structures::General >( B_data, k, n, n, zero, one, one );
+
+	// initialize test
+	alp::Matrix< T, structures::General > A( m, k );
+	alp::Matrix< T, structures::General > B( k, n );
+	alp::Matrix< T, structures::General > C( m, n );
+
+	// initialize input matrices
+	rc = rc ? rc : alp::buildMatrix( A, A_data.begin(), A_data.end() );
+	if( rc != alp::SUCCESS ) {
+		std::cerr << "\tIssues building A" << std::endl;
+		return;
+	}
+	rc = rc ? rc : alp::buildMatrix( B, B_data.begin(), B_data.end() );
+	rc = rc ? rc : alp::buildMatrix( C, C_data.begin(), C_data.end() );
+	if( rc != alp::SUCCESS ) {
+		std::cerr << "\tIssues building matrices" << std::endl;
+		return;
+	}
+
+#ifndef NDEBUG
+	print_matrix( "A", A );
+	print_matrix( "B", B );
+	print_matrix( "C - PRE", C );
+#endif
+
+	rc = rc ? rc : alp::mxm( C, A, B, ring );
+
+#ifndef NDEBUG
+	print_matrix( "C - POST", C );
+#endif
+
+	if( rc != alp::SUCCESS ) {
+		return;
+	}
+
+	// compute the reference result and compare
+	stdvec_build_matrix< structures::General >( A_vec, m, k, k, zero, one, one );
+	stdvec_build_matrix< structures::General >( B_vec, k, n, n, zero, one, one );
+	mxm_stdvec_as_matrix( C_vec, n, A_vec, k, B_vec, n, m, k, n, ring.getMultiplicativeOperator(), ring.getAdditiveMonoid() );
+	diff_stdvec_matrix( C_vec, m, n, n, C );
+
+	std::cout << "\tDone." << std::endl;
+}
+
+// test dimensions derived from the backend's block sizes;
+// n is the run-time parameter of alp_program below
+#define M ( alp::config::BLOCK_ROW_DIM * n )
+#define K ( alp::config::BLOCK_COL_DIM * 2 * n )
+#define N ( alp::config::BLOCK_COL_DIM * 3 * n )
+
+void alp_program( const size_t &n, alp::RC &rc ) {
+
+	using T = double;
+	using SemiringT = alp::Semiring<
+		alp::operators::add< T >, alp::operators::mul< T >,
+		alp::identities::zero, alp::identities::one
+	>;
+
+	rc = alp::SUCCESS;
+
+	// testing cubic mxm (all dimensions equal)
+	run_mxm< T, SemiringT >( M, M, M, rc );
+
+	// testing rectangular mxm
+	run_mxm< T, SemiringT >( M, K, N, rc );
+
+	// testing outer product of blocks mxm
+	run_mxm< T, SemiringT >( M, alp::config::BLOCK_COL_DIM, N, rc );
+
+	// testing dot product of blocks mxm
+	run_mxm< T, SemiringT >( alp::config::BLOCK_ROW_DIM, M, alp::config::BLOCK_COL_DIM, rc );
+}
+
+int main( int argc, char **argv ) {
+	// defaults
+	bool printUsage = false;
+	size_t in = 4;
+
+	// error checking
+	if( argc > 2 ) {
+		printUsage = true;
+	}
+	if( argc == 2 ) {
+		size_t read;
+		std::istringstream ss( argv[ 1 ] );
+		if( !( ss >> read ) ) {
+			std::cerr << "Error parsing first argument\n";
+			printUsage = true;
+		} else if( !ss.eof() ) {
+			std::cerr << "Error parsing first argument\n";
+			printUsage = true;
+		} else if( read % 2 != 0 ) {
+			std::cerr << "Given value for n is odd\n";
+			printUsage = true;
+		} else {
+			// all OK
+			in = read;
+		}
+	}
+	if( printUsage ) {
+		std::cerr << "Usage: " << argv[ 0 ] << " [n]\n";
+		std::cerr << "  n (optional, default is 4): an even integer, the test size.\n";
+		return 1;
+	}
+
+	std::cout << "This is functional test " << argv[ 0 ] << " " << in << "\n";
+	alp::Launcher< AUTOMATIC > launcher;
+	alp::RC out;
+	if( launcher.exec( &alp_program, in, out, true ) != SUCCESS ) {
+		std::cerr << "Launching test FAILED\n";
+		return 255;
+	}
+	if( out != SUCCESS ) {
+		std::cerr << "Test FAILED (" << alp::toString( out ) << ")" << std::endl;
+	} else {
+		std::cout << "Test OK" << std::endl;
+	}
+	return 0;
+}