diff --git a/CMakeLists.txt b/CMakeLists.txt index 02c49eb37..742af7511 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,7 +37,7 @@ project( GraphBLAS DESCRIPTION "The ultimate engine for sparse computation" LANGUAGES CXX C ) -set( CMAKE_CXX_STANDARD 11 ) +set( CMAKE_CXX_STANDARD 14 ) set( CMAKE_CXX_STANDARD_REQUIRED ON ) # install within the build directory by default (NOT to /usr/local or the likes) @@ -52,13 +52,14 @@ endif() option( WITH_REFERENCE_BACKEND "With Reference backend" ON ) option( WITH_OMP_BACKEND "With OMP backend" ON ) option( WITH_HYPERDAGS_BACKEND "With Hyperdags backend" ON ) +option( WITH_ASCEND_BACKEND "With Ascend backend" ON ) if( WITH_HYPERDAGS_BACKEND ) if( NOT DEFINED WITH_HYPERDAGS_USING ) set( WITH_HYPERDAGS_USING "reference" ) endif() endif() option( WITH_NONBLOCKING_BACKEND "With Nonblocking backend" ON ) -option( WITH_NUMA "With NUMA support" ON ) +option( WITH_NUMA "With NUMA support" OFF ) option( LPF_INSTALL_PATH "Path to the LPF tools for the BSP1D and Hybrid backends" OFF ) # the following options depend on LPF_INSTALL_PATH being set include(CMakeDependentOption) @@ -192,6 +193,12 @@ if( WITH_HYBRID_BACKEND ) endif() endif() +# Enable nonblocking backend if ascend is active +if( WITH_ASCEND_BACKEND ) + set( WITH_NONBLOCKING_BACKEND ON ) + message( STATUS "Enabling compilation of nonblocking backend: required by the Ascend backend" ) +endif() + # Enabling reference_omp backend if non-blocking is active if( WITH_NONBLOCKING_BACKEND ) if( NOT WITH_OMP_BACKEND ) diff --git a/bootstrap.sh b/bootstrap.sh index e24d75d45..3717a65a0 100755 --- a/bootstrap.sh +++ b/bootstrap.sh @@ -80,6 +80,7 @@ the location where LPF is installed" echo " optional; default value is reference" echo " clashes with --no-hyperdags" echo " --no-nonblocking - disables the nonblocking backend" + echo " --no-ascend - disables the ascend backend" echo " --[debug | coverage]-build - build the project with debug | coverage options (tests will run much slower!)" echo " --generator= - set the generator for CMake (otherwise use CMake's default)" echo " --show - show generation commands instead of running them" @@ -102,6 +103,7 @@ reference=yes hyperdags=yes hyperdags_using=reference nonblocking=yes +ascend=yes banshee=no lpf=no show=no @@ -176,6 +178,9 @@ or assume default paths (--with-lpf)" --no-nonblocking) nonblocking=no ;; + --no-ascend) + ascend=no + ;; --debug-build) debug_build=yes ;; @@ -286,7 +291,7 @@ CURRENT_DIR="$(pwd)" SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" # CONFIGURE CMAKE BUILDING INFRASTRUCTURE -if [[ "${reference}" == "yes" || "${lpf}" == "yes" || "${nonblocking}" == "yes" ]]; then +if [[ "${reference}" == "yes" || "${lpf}" == "yes" || "${nonblocking}" == "yes" || "${ascend}" == "yes" ]]; then BUILD_DIR="${CURRENT_DIR}" printf "Checking for cmake..." 
@@ -363,6 +368,9 @@ the current directory before invocation or confirm the deletion of its content w if [[ "${nonblocking}" == "no" ]]; then CMAKE_OPTS+=" -DWITH_NONBLOCKING_BACKEND=OFF" fi + if [[ "${ascend}" == "no" ]]; then + CMAKE_OPTS+=" -DWITH_ASCEND_BACKEND=OFF" + fi if [[ "${lpf}" == "yes" ]]; then CMAKE_OPTS+=" -DLPF_INSTALL_PATH='${ABSOLUTE_LPF_INSTALL_PATH}'" fi diff --git a/cmake/AddGRBInstall.cmake b/cmake/AddGRBInstall.cmake index 94bd58f31..49ae44ab0 100644 --- a/cmake/AddGRBInstall.cmake +++ b/cmake/AddGRBInstall.cmake @@ -47,7 +47,7 @@ set( SHMEM_BACKEND_INSTALL_DIR "${BINARY_LIBRARIES_INSTALL_DIR}/sequential" ) set( HYPERDAGS_BACKEND_INSTALL_DIR "${BINARY_LIBRARIES_INSTALL_DIR}/hyperdags" ) set( BSP1D_BACKEND_INSTALL_DIR "${BINARY_LIBRARIES_INSTALL_DIR}/spmd" ) set( HYBRID_BACKEND_INSTALL_DIR "${BINARY_LIBRARIES_INSTALL_DIR}/hybrid" ) - +set( ASCEND_BACKEND_INSTALL_DIR "${BINARY_LIBRARIES_INSTALL_DIR}/ascend" ) # addBackendWrapperGenOptions @@ -146,6 +146,14 @@ if( WITH_NONBLOCKING_BACKEND ) ) endif() +if( WITH_ASCEND_BACKEND ) + addBackendWrapperGenOptions( "ascend" + COMPILE_DEFINITIONS "${ASCEND_SELECTION_DEFS};${ASCEND_INCLUDE_DEFS}" + LINK_FLAGS "'${SHMEM_BACKEND_INSTALL_DIR}/lib${BACKEND_LIBRARY_OUTPUT_NAME}.a'" + "'${ALP_UTILS_INSTALL_DIR}/lib${ALP_UTILS_LIBRARY_OUTPUT_NAME}.a'" "${NUMA_LFLAG}" + ) +endif() + # distributed memory backends if( WITH_BSP1D_BACKEND OR WITH_HYBRID_BACKEND ) assert_valid_variables( LPFRUN LPFCPP ) diff --git a/cmake/AddGRBVars.cmake b/cmake/AddGRBVars.cmake index fab0f9ac9..a5235a519 100644 --- a/cmake/AddGRBVars.cmake +++ b/cmake/AddGRBVars.cmake @@ -33,6 +33,7 @@ set( BSP1D_BACKEND_DEFAULT_NAME "backend_bsp1d" ) set( HYBRID_BACKEND_DEFAULT_NAME "backend_hybrid" ) set( HYPERDAGS_BACKEND_DEFAULT_NAME "backend_hyperdags" ) set( NONBLOCKING_BACKEND_DEFAULT_NAME "backend_nonblocking" ) +set( ASCEND_BACKEND_DEFAULT_NAME "backend_ascend" ) ### COMPILER DEFINITIONS FOR HEADERS INCLUSION AND FOR BACKEND SELECTION @@ -41,6 +42,7 @@ set( REFERENCE_INCLUDE_DEFS "_GRB_WITH_REFERENCE" ) set( REFERENCE_OMP_INCLUDE_DEFS "_GRB_WITH_OMP" ) set( HYPERDAGS_INCLUDE_DEFS "_GRB_WITH_HYPERDAGS" ) set( NONBLOCKING_INCLUDE_DEFS "_GRB_WITH_NONBLOCKING" ) +set( ASCEND_INCLUDE_DEFS "_GRB_WITH_ASCEND" ) set( LPF_INCLUDE_DEFS "_GRB_WITH_LPF" ) # compiler definitions to select a backend @@ -51,6 +53,7 @@ set( HYPERDAGS_SELECTION_DEFS "_GRB_WITH_HYPERDAGS_USING=${WITH_HYPERDAGS_USING}" ) set( NONBLOCKING_SELECTION_DEFS "_GRB_BACKEND=nonblocking" ) +set( ASCEND_SELECTION_DEFS "_GRB_BACKEND=ascend" ) set( BSP1D_SELECTION_DEFS "_GRB_BACKEND=BSP1D" "_GRB_BSP1D_BACKEND=reference" @@ -64,7 +67,7 @@ set( HYBRID_SELECTION_DEFS set( NO_NUMA_DEF "_GRB_NO_LIBNUMA" ) ### **ALL** BACKENDS, EVEN IF NOT ENABLED BY USER -set( ALL_BACKENDS "reference" "reference_omp" "hyperdags" "nonblocking" "bsp1d" "hybrid" ) +set( ALL_BACKENDS "reference" "reference_omp" "hyperdags" "nonblocking" "ascend" "bsp1d" "hybrid" ) # list of user-enabled backends, for tests and wrapper scripts (do not change!) 
set( AVAILABLE_BACKENDS "" ) @@ -90,6 +93,10 @@ if( WITH_NONBLOCKING_BACKEND ) list( APPEND AVAILABLE_BACKENDS "nonblocking" ) endif() +if( WITH_ASCEND_BACKEND ) + list( APPEND AVAILABLE_BACKENDS "ascend" ) +endif() + # distributed memory backends if( WITH_BSP1D_BACKEND ) list( APPEND AVAILABLE_BACKENDS "bsp1d" ) diff --git a/docs/Build_and_test_infra.md b/docs/Build_and_test_infra.md index 8e28e47cb..d0d4fd3c0 100644 --- a/docs/Build_and_test_infra.md +++ b/docs/Build_and_test_infra.md @@ -726,7 +726,7 @@ build path, with set_target_properties( backend_example_static PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/example_output_dir" ) ``` -1. add the new library to the `libs` target, which allows users to compile all +7. add the new library to the `libs` target, which allows users to compile all backend libraries at once ```cmake diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 9a6affa1a..87f2d48b1 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -14,7 +14,7 @@ # limitations under the License. # -assert_defined_variables( WITH_REFERENCE_BACKEND WITH_OMP_BACKEND ) +assert_defined_variables( WITH_REFERENCE_BACKEND WITH_OMP_BACKEND WITH_ASCEND_BACKEND ) # target listing all examples, to build them at once with 'make examples' add_custom_target( examples) @@ -31,3 +31,57 @@ if( WITH_OMP_BACKEND ) add_dependencies( examples sp_reference_omp ) endif() +if( WITH_ASCEND_BACKEND ) + add_executable( alp_ascend_addOp_ascend unittests/alp_ascend_addOp.cpp ) + target_link_libraries( alp_ascend_addOp_ascend backend_ascend common_flags ) + add_dependencies( examples alp_ascend_addOp_ascend ) +endif() + +if( WITH_ASCEND_BACKEND ) + add_executable( alp_ascend_addOpv1_ascend unittests/alp_ascend_addOpv1.cpp ) + target_link_libraries( alp_ascend_addOpv1_ascend backend_ascend common_flags ) + add_dependencies( examples alp_ascend_addOpv1_ascend ) +endif() + +if( WITH_ASCEND_BACKEND ) + add_executable( alp_ascend_movedataOpv01_ascend unittests/alp_ascend_movedataOpv01.cpp ) + target_link_libraries( alp_ascend_movedataOpv01_ascend backend_ascend common_flags ) + add_dependencies( examples alp_ascend_movedataOpv01_ascend ) +endif() + +if( WITH_ASCEND_BACKEND ) + add_executable( alp_ascend_softmaxOp_ascend unittests/alp_ascend_softmaxOp.cpp ) + target_link_libraries( alp_ascend_softmaxOp_ascend backend_ascend common_flags ) + add_dependencies( examples alp_ascend_softmaxOp_ascend ) +endif() + +if( WITH_ASCEND_BACKEND ) + add_executable( alp_ascend_softmaxOpv1_ascend unittests/alp_ascend_softmaxOpv1.cpp ) + target_link_libraries( alp_ascend_softmaxOpv1_ascend backend_ascend common_flags ) + add_dependencies( examples alp_ascend_softmaxOpv1_ascend ) +endif() + +if( WITH_ASCEND_BACKEND ) + add_executable( alp_ascend_softmaxOpv3_ascend unittests/alp_ascend_softmaxOpv3.cpp ) + target_link_libraries( alp_ascend_softmaxOpv3_ascend backend_ascend common_flags ) + add_dependencies( examples alp_ascend_softmaxOpv3_ascend ) +endif() + +if( WITH_ASCEND_BACKEND ) + add_executable( alp_ascend_softmaxOpv4_ascend unittests/alp_ascend_softmaxOpv4.cpp ) + target_link_libraries( alp_ascend_softmaxOpv4_ascend backend_ascend common_flags ) + add_dependencies( examples alp_ascend_softmaxOpv4_ascend ) +endif() + +if( WITH_ASCEND_BACKEND ) + add_executable( alp_ascend_onlinesoftmaxOp_ascend unittests/alp_ascend_onlinesoftmaxOp.cpp ) + target_link_libraries( alp_ascend_onlinesoftmaxOp_ascend backend_ascend common_flags ) + add_dependencies( examples 
alp_ascend_onlinesoftmaxOp_ascend ) +endif() + +if( WITH_ASCEND_BACKEND ) + add_executable( ascend_flashattentionOp_ascend ascend_flashattentionOp.cpp ) + target_link_libraries( ascend_flashattentionOp_ascend backend_ascend common_flags ) + #add_dependencies( examples ascend_flashattentionOp_ascend ) +endif() + diff --git a/examples/Makefile b/examples/Makefile new file mode 100644 index 000000000..ee257c7d7 --- /dev/null +++ b/examples/Makefile @@ -0,0 +1,11 @@ + +# Makefile for the Ascend examples + +.PHONY: all + +all: + ascendcc -b 910B -c -o op.o op.cpp + ascendcc -I/home/yzelman/Packages/CANN/samples/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/kernel_invocation/kernel_template/ -b 910B -c -o op_host.o ../examples/ascend_host.cpp + ascendcc -o main.exe op.o op_host.o + LD_LIBRARY_PATH=/home/yzelman/Packages/CANN/x86_64/ascend-toolkit/latest/x86_64-linux/lib64/:/home/yzelman/Packages/CANN/x86_64/ascend-toolkit/latest/x86_64-linux/devlib/x86_64/ ./main.exe + diff --git a/examples/alp_ascend_softMaxOp-manuallyTiled.cpp b/examples/alp_ascend_softMaxOp-manuallyTiled.cpp new file mode 100644 index 000000000..68fc66468 --- /dev/null +++ b/examples/alp_ascend_softMaxOp-manuallyTiled.cpp @@ -0,0 +1,108 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#define DEBUG + +#include + +using namespace alp; + + +// alp::Grid< 1, 4 > note: +// - Thread dimensionality = 1, means that the 1D thread grid maps to first +// axis of the problem grid. A refinement of this API may make this +// configurable. 
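+//
+// Reading aid (added commentary, not generated code): per fiber along axis 3,
+// the forEach body below computes the numerically stable softmax
+//
+//   out[k] = exp( in[k] - max_j in[j] ) / sum_j exp( in[j] - max_j in[j] )
+//
+// which corresponds one-to-one to the max / minus / exp / add / divide calls.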
+template < typename GridType > +void ascend_code( const GridType &grid, RC &rc ) { + rc = alp::RC::FAILED; + + Tensor Sin( alp::Datatype::FP16, make_axes( 0, 1, 2, 3 ) ); + Tensor Sout( alp::Datatype::FP16, make_axes( 0, 1, 2, 3 ) ); + + rc = grid.forEach( make_axes( 0 ), [ & ] () { + + auto S_block_in_ub = getView( Sin ); // T(1,2,3) + auto S_block_out_ub = getView( Sout ); // T(1,2,3) + Tensor localTensor_ub( alp::Datatype::FP16, make_axes( 1, 2 ) ); // T(1,2) + + rc = grid.forEach( make_axes( 1 ), [ & ] () { + + auto S_block_in = getView( S_block_in_ub ); // T(2,3) + auto S_block_out = getView( S_block_out_ub ); // T(2,3) + auto localTensor = getView( localTensor_ub ); // T(2) + + // T(2) T(2,3) + apply( localTensor, S_block_in, "max", make_axes( 3 ) ); + // T(2,3) T(2,3) T(2) + apply( S_block_out, S_block_in, localTensor, "minus", make_axes( 3 ) ); + // T(2,3) + foldl( S_block_out, "exp" ); + // T(2) T(2,3) + apply( localTensor, S_block_out, "add", make_axes( 3 ) ); + // T(2,3) T(2) + foldl( S_block_out, localTensor, "divide", make_axes( 3 ) ); + // T(2,3) + + } ); + + store( S_block_out ); + + } ); + + return; +} + +int main( int argc, char ** argv ) { + + // default options + bool printUsage = false; + + // input error checking + if( argc > 1 ) { + printUsage = true; + } + + // print help on error + if( printUsage ) { + std::cerr << "Usage: " << argv[ 0 ] << "\n"; + return 10; + } + + // start opgen + std::cout << "//This is AscendOpGen example " << argv[ 0 ] << "\n"; + alp::RC error_code = alp::RC::SUCCESS; + try { + error_code = alp::compile< 1, 4 >( ascend_code, "KernelSoftmax" ); + } catch( std::exception &e ) { + std::cerr << "alp::compile threw error: " << e.what() << "\n"; + return 20; + } + if( error_code != alp::RC::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Codegen FAILED (" << alp::toString( error_code ) << ")" + << std::endl; + return 30; + } else { + std::cout << "//Codegen OK" << std::endl; + return 0; + } + +} + diff --git a/examples/ascend_flashattentionOp-v2.cpp b/examples/ascend_flashattentionOp-v2.cpp new file mode 100644 index 000000000..eec996727 --- /dev/null +++ b/examples/ascend_flashattentionOp-v2.cpp @@ -0,0 +1,178 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +using namespace alp; + + +// alp::Grid< 1, 3 > note: +// - Thread dimensionality = 1, means that the 1D thread grid maps to first +// axis of the problem grid. A refinement of this API may make this +// configurable. +void ascend_code( const Grid< 1, 3 > &grid, RC &rc ) { // max shape = ( m, Tr, N ) + rc = alp::RC::FAILED; + + // input // Q and O are 'canonically' aligned. 
+ Tensor Q( grid, type::FP16, axes( 0, 1, 2 ) ); // shape = (m, Tr, N) + Tensor K( grid, type::FP16, axes( 2, 0, 1 ) ); // shape = (N, m, Tr) // transposed shape compared to Q + Tensor V( grid, type::FP16, axes( 2, 0, 1 ) ); // shape = (N, m, Tr) // transposed shape compared to Q + + // temp + Tensor m( grid, type::FP16, axes( 0, 1 ) ); // shape = (m, Tr) = (m, Tr , 1) = ( m, Tr, 1, 1, .. ) + // scalar shape = (1, 1, 1) + // output + Tensor l( grid, type::FP16, axes( 0, 1 ) ); // shape = (m, Tr) + Tensor O( grid, type::FP16, axes( 0, 1, 2 ) ); // shape = (m, Tr, N) + + set( O, 0 ); + set( l, values::zero ); // values::zero is equivalent to 0 + set( m, values::minus_infinity ); + + // forEach cuts the grid into small pieces that are processed concurrently + rc = grid.forEach( [ &grid, &Q, &K, &V, &l, &m ] () { + // a view gets the local part to be processed + // e.g. axes( O_block ) = alp::axes( threadID(), 1, 2 ) + auto O_block = O.getView( grid ); + + auto Q_block = Q.getView( grid ); + + // if tensors are permuted, the "cut" dimension still refers to that defined + // by the grid. E.g. axes( K_block ) = alp::axes( 2, threadID(), 1 ) + auto K_block = K.getView( grid ); + auto V_block = V.getView( grid ); + auto l_block = l.getView( grid ); + auto m_block = m.getView( grid ); + + // tensor version of Stmp = mxm( Q_block, K_block ) + // - tensor contraction along one axis + // - 2 is the contraction axis + Tensor Stmp( grid, type::FP16, axes( 0, 1, 1 ) ); // AJ2D: I think this should have been 1, 1? Or the below mxm ex. is wrong? + Stmp = Q_block( "i", "m", "k" ) * K_block( "k", "j", "m" ); // AJ2D: is this correct in Einstein notation? + // It seems to me to match the below code + // (although I don't get foldl with a semiring) + + // tensor contraction in one axis: + // alp::semiring multiplication and accumualtion operators + // e.g. Stmp[ : , : ] = mxm( Q_block[ threadID(), :, : ], K_block[ :, threadID(), : ] ) + // set( Stmp, values::zero ); + // alp::foldl( Stmp, Q_block, K_block, alp::semiring(), alp::axes( 2 ) ); + // NOTE: in general multiple axes needed with proper reduction rules: + // here, Dim(Stmp) + 2*Dim(axes) = Dim(Q_block) + Dim(Q_block) + + + Tensor tmp( grid, type::FP16, axes( 1 ) ); + set( tmp, m_block ); // AJ2D: here tmp is one-dimensional but m_block is two-dimensional? + // I think this means the parallelised dimension has only one fiber, + // not a block of fibers, perhaps? That could work (though the codegen + // would have to coalesce them back). I had assumed we got back a block + // of some size close to n/p. If we have a block then the following + // seems correct and perhaps more clear? + // Tensor tmp( grid, type::FP16, axes( 0, 1 ) ) + // set( tmp, m_block ) + + // two was the "contraction" axis, e.g. row-wise reduction + max( m_block, Stmp ); // AJ2D: I think here the axes become confusing. If the axes of Stmp are correct + // (I modified it), then the "axes(2)" which used to be here do not match + // any axes in m_bock and Stmp. Translating it into matrix land, Stmp is + // n x n while m_block is m x n. If m = 1 (see above comment block), then + // indeed what max is a reduction, but it remains ambiguous over what + // dimension the reduction should go (rows or columns -- both are the same + // mode). If m > 1, then the semantics I suppose are to broadcast the + // result of max( Stmp ) into m_block? + // + // Would the following perhaps be clearer? 
+ // tmp = max( Stmp( "i", "j" ), "j" ); + // m_block( "i", "j" ) = tmp( "j" ); // broadcast tmp to m_block + + // AJ2D: in the below, I will just assume Einstein notation while simplifying the code + + // 'row-wise' Stmp -= m_block + Stmp( "i", "j" ) -= m_block( "j" ); + + // if no axes are specified apply along all axes + // This is equivalent to reduction with scalar, just inplace + // Stmp = exp(Stmp) + Stmp = exp( Stmp ); + + // tmp=exp(tmp-m_block) + tmp = exp( tmp - m_block ); + + // l_block += rowsum(Stmp) + l_block += sum( Stmp( "i", "j" ), "j" ); + + // 'row-wise' O_block *= tmp + O_block *= tmp; + + // tensor version of O_block = mxm( Stmp, V_block ), i.e., contraction + Oblock( "i", "j", "k" ) += Stmp( "i", "r" ) * V_block( "k", "r", "j" ); + + // 'row-wise' O_block *= 1/l_block + O_block /= l_block; + // or div( O_block, l_block ); + + // l_block = log(m_block) + m_block + l_block = log( m_block ) + m_block; + + // store output + alp::store( O_block ); + alp::store( l_block ); + } ); + + return; +} + +int main( int argc, char ** argv ) { + + // default options + bool printUsage = false; + + // input error checking + if( argc > 1 ) { + printUsage = true; + } + + // print help on error + if( printUsage ) { + std::cerr << "Usage: " << argv[ 0 ] << "\n"; + return 10; + } + + // start opgen + std::cout << "//This is AscendOpGen example " << argv[ 0 ] << "\n"; + alp::RC error_code = alp::RC::SUCCESS; + try { + error_code = alp::compile< 1, 1 >( ascend_code ); + } catch( std::exception &e ) { + std::cerr << "alp::compile threw error: " << e.what() << "\n"; + return 20; + } + if( error_code != alp::RC::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Codegen FAILED (" << alp::toString( error_code ) << ")" + << std::endl; + return 30; + } else { + std::cout << "//Codegen OK" << std::endl; + return 0; + } + +} + diff --git a/examples/ascend_flashattentionOp-v3.cpp b/examples/ascend_flashattentionOp-v3.cpp new file mode 100644 index 000000000..2db2e0cbd --- /dev/null +++ b/examples/ascend_flashattentionOp-v3.cpp @@ -0,0 +1,146 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +using namespace alp; + + +// alp::Grid< 1, 5 > note: +// - Thread dimensionality = 1, means that the 1D thread grid maps to first +// axis of the problem grid. A refinement of this API may make this +// configurable. +void ascend_code( const Grid< 1, 5 > &grid, RC &rc ) { // max shape = ( Tr, Br, Tc, Bc, d ) + rc = alp::RC::FAILED; + + // input // Q and O are 'canonically' aligned. + Tensor Q( grid, type::FP16, axes( 0, 1, 4 ) ); // shape = (Tr, Br, d) + Tensor K( grid, type::FP16, axes( 4, 2, 3 ) ); // shape = (d, Tc, Bc) // transposed shape compared to Q + Tensor V( grid, type::FP16, axes( 4, 2, 3 ) ); // shape = (d, Tc, Bc) // transposed shape compared to Q + + // temp + Tensor m( grid, type::FP16, axes( 0, 1 ) ); // shape = (Tr, Br) = (Tr, Br , 1) = ( Tr, Br, 1, 1, .. 
) + // scalar shape = (1, 1, 1) + // output + Tensor l( grid, type::FP16, axes( 0, 1 ) ); // shape = (Tr, Br) + Tensor O( grid, type::FP16, axes( 0, 1, 2 ) ); // shape = (Tr, Br, d) + + set( O, 0 ); + set( l, values::zero ); // values::zero is equivalent to 0 + set( m, values::minus_infinity ); + + // forEach cuts the grid into small pieces that are processed concurrently + rc = grid.forEach( [ &grid, &Q, &K, &V, &l, &m ] () { + // a view gets the local part to be processed + // e.g. axes( O_block ) = alp::axes( threadID(), 1, 4 ) + auto O_block = O.getView( grid ); + auto Q_block = Q.getView( grid ); + auto K_block = K.getView( grid ); + auto V_block = V.getView( grid ); + auto l_block = l.getView( grid ); + auto m_block = m.getView( grid ); + + // tensor version of Stmp = mxm( Q_block, K_block ) + // - tensor contraction along one axis + // - 2 is the contraction axis + Tensor Stmp( grid, type::FP16, axes( 0, 2, 3 ) ); + Stmp = Q_block( "i", "j", "k" ) * K_block( "l", "m", "k" ); + // not contracted and non-stored index imply loop, e.g. loop over "j" here + + + Tensor tmp( grid, type::FP16, axes( 0, 1 ) ); + set( tmp, m_block ); + + // row-wise max + // do this operation for all l indices + m_block( "i", "j" ) = max( m_block( "i", "j" ), Stmp( "i", "k", "l" ) , "l"); + + // row-wise Stmp -= m_block + // do this operation for all l indices + Stmp( "i", "k", "l" ) = minus( Stmp( "i", "k", "l" ), m_block( "i", "j" ), "l" ); + + // if no axes are specified then apply along all axes + // This is equivalent to reduction with scalar, just inplace + // Stmp = exp(Stmp) + Stmp = exp( Stmp ); + + // tmp=exp(tmp-m_block) + tmp = exp( tmp - m_block ); + + // l_block += rowsum(Stmp) + l_block += sum( Stmp( "i", "j", "k" ), "k" ); + + // 'row-wise' O_block *= tmp + O_block *= tmp; + + // tensor version of O_block = mxm( Stmp, V_block ), i.e., contraction + O_block( "i", "j", "k" ) += Stmp( "i", "l", "m" ) * V_block( "k", "r", "j" ); + + // 'row-wise' O_block *= 1/l_block + O_block /= l_block; + + // l_block = log(m_block) + m_block + l_block = log( m_block ) + m_block; + + // store output + alp::store( O_block ); + alp::store( l_block ); + } ); + + return; +} + +int main( int argc, char ** argv ) { + + // default options + bool printUsage = false; + + // input error checking + if( argc > 1 ) { + printUsage = true; + } + + // print help on error + if( printUsage ) { + std::cerr << "Usage: " << argv[ 0 ] << "\n"; + return 10; + } + + // start opgen + std::cout << "//This is AscendOpGen example " << argv[ 0 ] << "\n"; + alp::RC error_code = alp::RC::SUCCESS; + try { + error_code = alp::compile< 1, 5 >( ascend_code ); + } catch( std::exception &e ) { + std::cerr << "alp::compile threw error: " << e.what() << "\n"; + return 20; + } + if( error_code != alp::RC::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Codegen FAILED (" << alp::toString( error_code ) << ")" + << std::endl; + return 30; + } else { + std::cout << "//Codegen OK" << std::endl; + return 0; + } + +} + diff --git a/examples/ascend_flashattentionOp.cpp b/examples/ascend_flashattentionOp.cpp new file mode 100644 index 000000000..79cb0d84a --- /dev/null +++ b/examples/ascend_flashattentionOp.cpp @@ -0,0 +1,171 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +// alp::Grid< 1, 5 > note: +// - Thread dimensionality = 1, means that the 1D thread grid maps to first +// axis of the problem grid. A refinement of this API may make this +// configurable. +template < typename GridType > +void ascend_code( const GridType &grid, alp::RC &rc ) { + + // shape = ( Tr,Tc,Br,Bc,d ) + // Tr = number for row-blocks, Br = row-length of rowblocks; Tr*Tc = N + // Tc = number for column-blocks, Bc = column-length of rowblocks; Tr*Tc = M + // for softmax N == M, i.e. Sin and Sout are square matrices + rc = alp::RC::SUCCESS; + + // input + alp::Tensor Qtensorin( alp::Datatype::FP16, alp::make_axes( 0, 2, 4 ) ); // shape = ( Tr,Br,d ) + alp::Tensor Ktensorin( alp::Datatype::FP16, alp::make_axes( 1, 3, 4 ) ); // shape = ( Tc,Bc,d ) + alp::Tensor Vtensorin( alp::Datatype::FP16, alp::make_axes( 1, 3, 4 ) ); // shape = ( Tc,Bc,d ) + + // temp + alp::Tensor Otensorout( alp::Datatype::FP16, alp::make_axes( 0, 2, 4 ) ); // shape = ( Tr,Br,d ) + alp::Tensor mtensorout( alp::Datatype::FP16, alp::make_axes( 0, 2 ) ); // shape = ( Tr,Br ) + alp::Tensor ltensorout( alp::Datatype::FP16, alp::make_axes( 0, 2 ) ); // shape = ( Tr,Br ) + + rc = !rc ? rc : grid.forEach( alp::make_axes( 0 ), [ & ] () { + + auto Q_block_in = Qtensorin.getView(); // T(2,4) + + auto O_block_out = Otensorout.getView(); // T(2,4) + auto m_block_out = mtensorout.getView(); // T(2) + auto l_block_out = ltensorout.getView(); // T(2) + + //TODO: fix, i.e. double replace with half + alp::set( m_block_out, -alp::Infinity ); + alp::set( l_block_out, alp::Zero ); + + rc = !rc ? 
rc : grid.forEach( alp::make_axes( 1 ), [ & ] () { + + // these tensors will have original axes with axes 0 and 1 removed + // Sij=S[i0,i1,:,:] + + auto K_block_in = Ktensorin.getView(); // T(3,4) + auto V_block_in = Vtensorin.getView(); // T(3,4) + + alp::Tensor Sij( alp::Datatype::FP16, alp::make_axes( 2, 3 ) ); + alp::Tensor Temp( alp::Datatype::FP16, alp::make_axes( 2, 3 ) ); + alp::Tensor rowmaxS( alp::Datatype::FP16, alp::make_axes( 2 ) ); + alp::Tensor mi_old( alp::Datatype::FP16, alp::make_axes( 2 ) ); + alp::Tensor expmidiff( alp::Datatype::FP16, alp::make_axes( 2 ) ); + + // T(2,3) T(2,4) T(3,4) + alp::apply( Sij, Q_block_in, K_block_in, "mxm", alp::make_axes( 4 ) ); + + // mi_old=cp.copy(mtensor[i,:]) + // T(2) T(2) + alp::set( mi_old, m_block_out); + + // rowmaxS=np.max(Si,axis=-1) + // T(2) T(2,3) + alp::apply( rowmaxS, Sij, "max", alp::make_axes( 3 ) ); + + // mtensor[i,:]=np.maximum(mtensor[i,:],rowmaxS) + // T(2) T(2) + alp::foldl( m_block_out, rowmaxS, "max" ); + + // Si=Si-np.expand_dims(mtensor[i,:], axis=-1) + // T(2,3) T(2) + alp::foldl( Sij, m_block_out, "minus", alp::make_axes( 3 ) ); + + // Si=np.exp(Si) + alp::foldl( Sij, "exp" ); + + // expmidiff=np.exp(mi_old-mtensor[i,:]) + // T(2) T(2) T(2) + alp::apply( expmidiff, mi_old, m_block_out, "minus" ); + + alp::foldl( expmidiff, "exp" ); + + // ltensor[i,:]*=expmidiff + // T(2) T(2) + alp::foldl( l_block_out, expmidiff, "times" ); + + // ltensor[i,:]+= np.sum(Si,axis=-1) + // T(2) T(2,3) + alp::foldl( l_block_out, Sij, "add", alp::make_axes( 3 ) ); + + // Otensor[i,:,:]*=np.expand_dims(expmidiff, axis=(-2,-1)) + // T(2,4) T(2) + alp::foldl( O_block_out, expmidiff, "times", alp::make_axes( 4 ) ); + + // T(2,3) T(2,4) T(3,4) + alp::apply( Temp, Sij, V_block_in, "mxm", alp::make_axes( 4 ) ); + // T(2,3) T(2,3) + alp::foldl( O_block_out, Temp , "add" ); + + } ); + + // Otensor[i,:,:]/=np.expand_dims(ltensor[i,:], axis=(-2,-1)) + // T(2,3) T(2) + alp::foldl( O_block_out, l_block_out, "divide", alp::make_axes( 3 ) ); + + //ltensor[i,:] = mtensor[i,:] + log(ltensor[i,:]) + // skip for now + + alp::store( O_block_out ); + alp::store( l_block_out ); + alp::store( m_block_out ); + + } ); + + return; +} + +int main( int argc, char ** argv ) { + + // default options + bool printUsage = false; + + // input error checking + if( argc > 1 ) { + printUsage = true; + } + + // print help on error + if( printUsage ) { + std::cerr << "Usage: " << argv[ 0 ] << "\n"; + return 10; + } + + // start opgen + std::cout << "//This is AscendOpGen example " << argv[ 0 ] << "\n"; + alp::RC error_code = alp::RC::SUCCESS; + try { + error_code = alp::compile< 1, 5 >( ascend_code, "KernelFlashattention" ); + } catch( std::exception &e ) { + std::cerr << "alp::compile threw error: " << e.what() << "\n"; + return 20; + } + if( error_code != alp::RC::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Codegen FAILED (" << alp::toString( error_code ) << ")" + << std::endl; + return 30; + } else { + std::cout << "//Codegen OK" << std::endl; + return 0; + } + +} + diff --git a/examples/compile_and_run_flashattention.sh b/examples/compile_and_run_flashattention.sh new file mode 100755 index 000000000..3581335e3 --- /dev/null +++ b/examples/compile_and_run_flashattention.sh @@ -0,0 +1,67 @@ +#set current directory +CWD=$(pwd) + +bashargn=$# +#echo "bashargn $bashargn" +if [[ "$bashargn" == 2 ]] +then + opfile=$1 + hostfile=$2 +else + opfile="op.cpp" + hostfile="$CWD/flashattention_custom_main.cpp" + + #cleanup any previous output + rm -f a_npu input/*.bin 
output/output_z.bin *.o op.cpp + rm -rf /tmp/build_alp/ + + #build ALP code gnerator, i.e. ascend_flashattentionOp_ascend executable + mkdir /tmp/build_alp/ && cd /tmp/build_alp/ && cmake $CWD/../ && make ascend_flashattentionOp_ascend && cd $CWD + + #run ALP code generator generate, store it into op.cpp + /tmp/build_alp/examples/./ascend_flashattentionOp_ascend > op.cpp + + cat op.cpp +fi + +echo "compile: $opfile and $hostfile" + +#compile ascend code +# set the compiler path and the ASCEND_TOOLKIT_INSTALL_PATH +ASCEND_TOOLKIT_INSTALL_PATH="/usr/local/Ascend/ascend-toolkit/latest" +ccec_compiler="/home/HwHiAiUser/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin/ccec" + +#compile generated kernel code, i.e. $opfile +$ccec_compiler -xcce -DTILING_KEY_VAR=0 -I"$ASCEND_TOOLKIT_INSTALL_PATH/acllib/include" -I"$ASCEND_TOOLKIT_INSTALL_PATH/compiler/tikcpp/tikcfw" -I"$ASCEND_TOOLKIT_INSTALL_PATH/compiler/tikcpp/tikcfw/impl" -I"$ASCEND_TOOLKIT_INSTALL_PATH/compiler/tikcpp/tikcfw/interface" -I"$ASCEND_TOOLKIT_INSTALL_PATH/tools/tikicpulib/lib/include" -O2 -std=c++17 --cce-aicore-arch=dav-c100 --cce-auto-sync -fPIC -pthread -o $opfile.o -c $opfile + +#compile template host code, i.e. $hostfile +$ccec_compiler -xcce -DTILING_KEY_VAR=0 -I"$ASCEND_TOOLKIT_INSTALL_PATH/acllib/include" -I"$ASCEND_TOOLKIT_INSTALL_PATH/compiler/tikcpp/tikcfw" -I"$ASCEND_TOOLKIT_INSTALL_PATH/compiler/tikcpp/tikcfw/impl" -I"$ASCEND_TOOLKIT_INSTALL_PATH/compiler/tikcpp/tikcfw/interface" -I"$ASCEND_TOOLKIT_INSTALL_PATH/tools/tikicpulib/lib/include" -O2 -std=c++17 --cce-aicore-arch=dav-c100 --cce-auto-sync -fPIC -pthread -o $hostfile.o -c $hostfile + +#link the executable, i.e. a_npu +$ccec_compiler --cce-fatobj-link --cce-aicore-arch=dav-c100 $opfile.o $hostfile.o -o a_npu -L"$ASCEND_TOOLKIT_INSTALL_PATH/runtime/lib64" -L"$ASCEND_TOOLKIT_INSTALL_PATH/tools/simulator/Ascend910A/lib" -L"$ASCEND_TOOLKIT_INSTALL_PATH/tools/tikicpulib/lib/Ascend910A" -lstdc++ -lruntime -lascendcl + + +#generate input data in "input" directory +# and the reference output data in "output" directory +rm -f runtime*.csv +for n in {0..0} +do + rm -rf input output + #vec_length=$(( 8 * 2048 * ( 2 ** $n) )) + echo "generate input" + echo "python3 flashattention_custom.py ${vec_length}" + mkdir -p input + mkdir -p output + python3 flashattention_custom.py #${vec_length} + + #run ascend example, run ./a_npu on 910 + echo "run ascend example" + echo "./a_npu ${vec_length}" + ./a_npu #${vec_length} + + python3 flashattention_check.py + +done + + + diff --git a/examples/compile_and_run_onlinesoftmax.sh b/examples/compile_and_run_onlinesoftmax.sh new file mode 100755 index 000000000..45e18fa34 --- /dev/null +++ b/examples/compile_and_run_onlinesoftmax.sh @@ -0,0 +1,61 @@ +#set current directory +CWD=$(pwd) + +bashargn=$# +#echo "bashargn $bashargn" +if [[ "$bashargn" == 2 ]] +then + opfile=$1 + hostfile=$2 +else + opfile="op.cpp" + hostfile="$CWD/onlinesoftmax_custom_main.cpp" + + #cleanup any previous output + rm -f a_npu input/*.bin output/output_z.bin *.o op.cpp + rm -rf /tmp/build_alp/ + + #build ALP code gnerator, i.e. 
ascend_onlinesoftmaxOp_ascend executable + mkdir /tmp/build_alp/ && cd /tmp/build_alp/ && cmake $CWD/../ && make ascend_onlinesoftmaxOp_ascend && cd $CWD + + #run ALP code generator generate, store it into op.cpp + /tmp/build_alp/examples/./ascend_onlinesoftmaxOp_ascend > op.cpp + + cat op.cpp +fi + +echo "compile: $opfile and $hostfile" + +#compile ascend code +# set the compiler path and the ASCEND_TOOLKIT_INSTALL_PATH +ASCEND_TOOLKIT_INSTALL_PATH="/usr/local/Ascend/ascend-toolkit/latest" +ccec_compiler="/home/HwHiAiUser/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin/ccec" + +#compile generated kernel code, i.e. $opfile +$ccec_compiler -xcce -DTILING_KEY_VAR=0 -I"$ASCEND_TOOLKIT_INSTALL_PATH/acllib/include" -I"$ASCEND_TOOLKIT_INSTALL_PATH/compiler/tikcpp/tikcfw" -I"$ASCEND_TOOLKIT_INSTALL_PATH/compiler/tikcpp/tikcfw/impl" -I"$ASCEND_TOOLKIT_INSTALL_PATH/compiler/tikcpp/tikcfw/interface" -I"$ASCEND_TOOLKIT_INSTALL_PATH/tools/tikicpulib/lib/include" -O2 -std=c++17 --cce-aicore-arch=dav-c100 --cce-auto-sync -fPIC -pthread -o $opfile.o -c $opfile + +#compile template host code, i.e. $hostfile +$ccec_compiler -xcce -DTILING_KEY_VAR=0 -I"$ASCEND_TOOLKIT_INSTALL_PATH/acllib/include" -I"$ASCEND_TOOLKIT_INSTALL_PATH/compiler/tikcpp/tikcfw" -I"$ASCEND_TOOLKIT_INSTALL_PATH/compiler/tikcpp/tikcfw/impl" -I"$ASCEND_TOOLKIT_INSTALL_PATH/compiler/tikcpp/tikcfw/interface" -I"$ASCEND_TOOLKIT_INSTALL_PATH/tools/tikicpulib/lib/include" -O2 -std=c++17 --cce-aicore-arch=dav-c100 --cce-auto-sync -fPIC -pthread -o $hostfile.o -c $hostfile + +#link the executable, i.e. a_npu +$ccec_compiler --cce-fatobj-link --cce-aicore-arch=dav-c100 $opfile.o $hostfile.o -o a_npu -L"$ASCEND_TOOLKIT_INSTALL_PATH/runtime/lib64" -L"$ASCEND_TOOLKIT_INSTALL_PATH/tools/simulator/Ascend910A/lib" -L"$ASCEND_TOOLKIT_INSTALL_PATH/tools/tikicpulib/lib/Ascend910A" -lstdc++ -lruntime -lascendcl + +rm -f runtime*.csv +rm -rf input output +echo "generate input" +echo "python3 onlinesoftmax_custom.py" +mkdir -p input +mkdir -p output +python3 onlinesoftmax_custom.py + +#run ascend example, run ./a_npu on 910 +echo "run ascend example" +echo "./a_npu ${vec_length}" +./a_npu #${vec_length} + +#python3 onlinesoftmax_print.py +python3 softmax_check-v5.py +#echo "NO onlinesoftmax_custom.py" + + + diff --git a/examples/online_softmax.py b/examples/online_softmax.py new file mode 100644 index 000000000..295783627 --- /dev/null +++ b/examples/online_softmax.py @@ -0,0 +1,96 @@ +import numpy as np +import copy as cp + + +def simplesoftmax(S_in): + S=cp.copy(S_in) + # S=softmax(S) + rowmaxS=np.max(S,axis=1) + S=S-np.tile(rowmaxS, (np.shape(S)[0],1)).T + S=np.exp(S) + rowsumS=np.sum(S,axis=1) + S=S/(np.tile(rowsumS, (np.shape(S)[0],1)).T) + + return(S) + + +def onlinesoftmax(S_in,Br=5,Bc=4): + save_shape=S_in.shape + + N,x=S_in.shape + + #check for too large block sizes + Br=min(Br,N) + Bc=min(Bc,N) + #get number of row/column blocks + Tr=N//Br + if(Tr*Br!=N): + Tr+=1 + Tc=N//Bc + if(Tc*Bc!=N): + Tc+=1 + + # outputs + # Initialize om HBM + O=np.zeros((N,N)) + l=np.zeros(N) + m=np.zeros(N)-np.Infinity + + + #switch to tensors + # dimensions (Tr, Br, Tc, Bc, d) + Otensor=np.reshape(O,(Tr,Br,Tc,Bc)) + mtensor=np.reshape(m,(Tr,Br)) + ltensor=np.reshape(l,(Tr,Br)) + del(m,l) + + QKtensor=np.reshape(S_in,(Tr,Br,Tc,Bc)) + + + for i in range(Tr): + + for j in range(Tc): + Si=QKtensor[i,:,j,:] + + mi_old=cp.copy(mtensor[i,:]) + + rowmaxS=np.max(Si,axis=-1) + + mtensor[i,:]=np.maximum(mtensor[i,:],rowmaxS) + + Si=Si-np.expand_dims(mtensor[i,:], axis=-1) 
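+            # Reading aid (added commentary): at this point Si holds Si - m_new, with
+            # m_new = max(m_old, rowmax(Si)). The steps below rescale the previously
+            # accumulated l and O by exp(m_old - m_new), the online-softmax correction
+            # that keeps all partial results normalised to the running row maximum.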
+ + Si=np.exp(Si) + + expmidiff=np.exp(mi_old-mtensor[i,:]) + ltensor[i,:]*=expmidiff + + ltensor[i,:]+= np.sum(Si,axis=-1) + + Otensor[i,:,:,:]*=np.expand_dims(expmidiff, axis=(-2,-1)) + + Otensor[i,:,j,:]=Si + + Otensor[i,:,:,:]/=np.expand_dims(ltensor[i,:], axis=(-2,-1)) + + O=np.reshape(Otensor,(N,N)) + + return(O,ltensor) + + +shape1=(128,16) +Q=np.random.random(shape1) +K=np.random.random(shape1) +V=np.identity(shape1[0]) + +Stmp=Q.dot(K.T) + +Osimple=simplesoftmax(Stmp) +Oflash,llash=onlinesoftmax(Stmp,Br=8,Bc=4) + +print("difference=",np.linalg.norm(Osimple-Oflash)) + + + + + diff --git a/examples/onlinesoftmax_custom.py b/examples/onlinesoftmax_custom.py new file mode 100644 index 000000000..9f3668e1c --- /dev/null +++ b/examples/onlinesoftmax_custom.py @@ -0,0 +1,26 @@ +#!/usr/bin/python3 +# -*- coding:utf-8 -*- +# Copyright 2022-2023 Huawei Technologies Co., Ltd +import numpy as np +import copy + +def gen_golden_data(): + x1_gm_type = np.float16 + + n0=16 + n1=32 + n2=16 + n3=16 + n4=16 + + N1 = n0*n1*n2*n3 + shape1 = (n0,n1,n2,n3) + + S0_gm = np.random.randint(1, 10, [N1]).astype(x1_gm_type) + infilename = "./input/s0_gm.bin" + S0_gm.tofile( infilename ) + + + +if __name__ == "__main__": + gen_golden_data() diff --git a/examples/onlinesoftmax_custom_main.cpp b/examples/onlinesoftmax_custom_main.cpp new file mode 100644 index 000000000..2a6a8958e --- /dev/null +++ b/examples/onlinesoftmax_custom_main.cpp @@ -0,0 +1,133 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. + * This file contains both CPU-debug and NPU code. We read data from a bin file + * and write the result to a file. + */ +#include +#include +#include +#include +#include + +#include "data_utils.h" + +#include "acl/acl.h" + +extern void custom_KernelOnlineSoftmax_do( + uint32_t coreDim, void* l2ctrl, void* stream, + uint8_t *param_Sin, uint8_t *param_Sout, uint8_t *param_m, uint8_t *param_l, + uint32_t _p, uint32_t n0, + uint32_t n1, uint32_t n2, uint32_t n3 ); + +#define DTYPE uint16_t + +constexpr uint32_t n0=16; +constexpr uint32_t n1=32; +constexpr uint32_t n2=16; +constexpr uint32_t n3=16; + +constexpr uint32_t N2 = n0*n2; +constexpr uint32_t N3 = n0*n1*n2*n3; + +#define REPS 20 + +int32_t main(int32_t argc, char* argv[]) +{ + size_t param_m_FileSize = N2 * sizeof( DTYPE ); + size_t param_l_FileSize = N2 * sizeof( DTYPE ); + size_t param_Sin_FileSize = N3 * sizeof( DTYPE ); + size_t param_Sout_FileSize = N3 * sizeof( DTYPE ); + uint32_t blockDim = 4; + + CHECK_ACL(aclInit(nullptr)); + aclrtContext context; + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + /////////////// allocate on host //////////////////////// + + uint8_t *param_m_Host; + CHECK_ACL(aclrtMallocHost((void**)(&param_m_Host), param_m_FileSize)); + + uint8_t *param_l_Host; + CHECK_ACL(aclrtMallocHost((void**)(&param_l_Host), param_l_FileSize)); + + uint8_t *param_Sin_Host; + CHECK_ACL(aclrtMallocHost((void**)(&param_Sin_Host), param_Sin_FileSize)); + ReadFile("./input/s0_gm.bin", param_Sin_FileSize, param_Sin_Host, param_Sin_FileSize); + + uint8_t *param_Sout_Host; + CHECK_ACL(aclrtMallocHost((void**)(&param_Sout_Host), param_Sout_FileSize)); + + /////////////// allocate on device //////////////////////// + + uint8_t *param_m_Device; + CHECK_ACL(aclrtMalloc((void**)&param_m_Device, param_m_FileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + uint8_t *param_l_Device; +
CHECK_ACL(aclrtMalloc((void**)&param_l_Device, param_l_FileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + uint8_t *param_Sin_Device; + CHECK_ACL(aclrtMalloc((void**)&param_Sin_Device, param_Sin_FileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + uint8_t *param_Sout_Device; + CHECK_ACL(aclrtMalloc((void**)&param_Sout_Device, param_Sout_FileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + std::vector< double > meas_vec( REPS ); + + for ( auto i = 0; i < REPS; ++i ) { + CHECK_ACL(aclrtMemcpy(param_Sin_Device, param_Sin_FileSize, param_Sin_Host, param_Sin_FileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + std::cout << "Softmax rep " << i << std::endl; + auto begin = std::chrono::high_resolution_clock::now(); + + custom_KernelOnlineSoftmax_do( + blockDim, nullptr, stream, + param_Sin_Device, param_Sout_Device, + param_m_Device, param_l_Device, + blockDim, n0, n1, n2, n3 + ); + CHECK_ACL(aclrtSynchronizeStream(stream)); + + auto end = std::chrono::high_resolution_clock::now(); + meas_vec[ i ] = static_cast< double >( std::chrono::duration_cast(end-begin).count() ); + } + + std::sort( meas_vec.begin(), meas_vec.end() ); + auto avg = std::accumulate( meas_vec.cbegin(), meas_vec.cend(), 0. ) / meas_vec.size(); + auto min = *( std::min_element( meas_vec.cbegin(), meas_vec.cend() ) ); + auto max = *( std::max_element( meas_vec.cbegin(), meas_vec.cend() ) ); + auto size = meas_vec.size(); + auto med = ( size % 2 == 0 ) ? ( meas_vec[ size / 2 - 1 ] + meas_vec[ size / 2 ] ) / 2 : meas_vec[ size / 2 ]; + std::cout << "Measured Time (avg, ms): " << avg * 1e-6 << std::endl; + std::cout << " (min, ms): " << min * 1e-6 << std::endl; + std::cout << " (max, ms): " << max * 1e-6 << std::endl; + std::cout << " (med, ms): " << med * 1e-6 << std::endl; + + + CHECK_ACL(aclrtMemcpy(param_m_Host, param_m_FileSize, param_m_Device, param_m_FileSize, ACL_MEMCPY_DEVICE_TO_HOST)); + CHECK_ACL(aclrtMemcpy(param_l_Host, param_l_FileSize, param_l_Device, param_l_FileSize, ACL_MEMCPY_DEVICE_TO_HOST)); + CHECK_ACL(aclrtMemcpy(param_Sout_Host, param_Sout_FileSize, param_Sout_Device, param_Sout_FileSize, ACL_MEMCPY_DEVICE_TO_HOST)); + + WriteFile("./output/output_s1.bin", param_Sout_Host, param_Sout_FileSize); + CHECK_ACL(aclrtFreeHost(param_Sin_Host)); + CHECK_ACL(aclrtFreeHost(param_Sout_Host)); + + CHECK_ACL(aclrtFree(param_Sin_Device)); + CHECK_ACL(aclrtFree(param_Sout_Device)); + + WriteFile("./output/output_m.bin", param_m_Host, param_m_FileSize); + WriteFile("./output/output_l.bin", param_l_Host, param_l_FileSize); + CHECK_ACL(aclrtFreeHost(param_l_Host)); + CHECK_ACL(aclrtFreeHost(param_m_Host)); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtDestroyContext(context)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); + + return 0; +} diff --git a/examples/softmax_custom-v1-1.cpp b/examples/softmax_custom-v1-1.cpp new file mode 100644 index 000000000..9eae29a99 --- /dev/null +++ b/examples/softmax_custom-v1-1.cpp @@ -0,0 +1,179 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved.
+ */ + +#include "kernel_operator.h" + +using namespace AscendC; + +constexpr int32_t BUFFER_NUM = 1; // tensor num for each queue + +__aicore__ inline int32_t RoundUp(int32_t a, int32_t b) { + return (a + b - 1) / b; +} + +//template < typename T > +class KernelSoftmax { +public: + __aicore__ inline KernelSoftmax( const uint32_t _p0, const uint32_t _n0, const uint32_t _n1, const uint32_t _n2 ) { + p0 = _p0; + p1 = 1; + p2 = 1; + + n0 = _n0; + n1 = _n1; + n2 = _n2; + + block_length0 = ( n0 * n1 * n2 ) / ( p0 * p1 * p2 ); + tile_length0 = ( n1 * n2 ) / BUFFER_NUM; + + } + + __aicore__ inline void Init( GM_ADDR tensor0, GM_ADDR tensor1 ) + { + + // get start index for current core, core parallel + _tensor0_0Gm.SetGlobalBuffer( (__gm__ half *)tensor0 + block_length0 * GetBlockIdx(), block_length0); + _tensor1_0Gm.SetGlobalBuffer((__gm__ half *)tensor1 + block_length0 * GetBlockIdx(), block_length0); + + // Min workspace for reduction ops. + // Taking the largest btw MaxReduce and SumReduce (ie, MaxReduce) as specified in the AscendC manual + // at Secs. 8.1.5.10.1 and 8.1.5.10.3 + ascend_el_per_blk = ONE_BLK_SIZE / sizeof( half ); + int32_t elementsPerRepeat = ONE_REPEAT_BYTE_SIZE / sizeof( half ); + int32_t firstMaxRepeat = n2 / elementsPerRepeat; + int32_t iter1OutputCount = firstMaxRepeat * 2; + int32_t tmpBufsColsReduce = RoundUp( iter1OutputCount, ascend_el_per_blk ) * ascend_el_per_blk; + + totWorkSpaceSize = ( + ascend_el_per_blk + tmpBufsColsReduce // Output + workspace for Max/SumReduce + + n2 + ) * sizeof( half ); + + pipe.InitBuffer( inQueue_tensor0_0, BUFFER_NUM, n2 * sizeof( half ) ); + pipe.InitBuffer( outQueue_tensor1_0, BUFFER_NUM, n2 * sizeof( half ) ); + + } + + __aicore__ inline void Process() + { + // loop count ( including effect of using BUFFER_NUM ) + const uint32_t loopCount0 = ( n0 * BUFFER_NUM ) / p0; + for (uint32_t i0 = 0; i0 < loopCount0; i0++) { + uint32_t i = i0; + + pipe.InitBuffer( tempBuf_tensor5_0, totWorkSpaceSize ); + _tensor5_0temp = tempBuf_tensor5_0.Get< half >( ); + pipe.InitBuffer( tempBuf_tensor6_0, totWorkSpaceSize ); + _tensor6_0temp = tempBuf_tensor6_0.Get< half >( ); + + pipe.InitBuffer( localBuf_tensor4_0, n1 ); + _tensor4_0Gm = localBuf_tensor4_0.Get< half >( ); // _tensor4_0Gm comes from API + + + // This loop comes from axis 1, does not need data movement + // For now process a tile row by row + const uint32_t loopCount1 = n1 / BUFFER_NUM; + for( uint32_t i1 = 0; i1 < n1 ; ++i1 ) { + CopyIn0(i0,i1); + Compute0( i1 ); + CopyOut0(i0,i1); + } + // free input tensors for reuse + // inQueue_tensor0_0.FreeTensor( _tensor0Local ); + } + } + + +private: + + __aicore__ inline void CopyIn0( + uint32_t _i0, uint32_t _i1 + ) { + // alloc tensor from queue memory + _tensor0Local = inQueue_tensor0_0.AllocTensor< half >(); + // copy progress_th tile from global tensor to local tensor + DataCopy( _tensor0Local, _tensor0_0Gm[ _i0 * n1 * n2 + _i1 * n2 ], n2 ); + // enque input tensors to VECIN queue + inQueue_tensor0_0.EnQue( _tensor0Local ); + + // deque input tensors from VECIN queue + _tensor0Local = inQueue_tensor0_0.DeQue< half >(); + _tensor1Local = outQueue_tensor1_0.AllocTensor< half >(); + + } + __aicore__ inline void Compute0(uint32_t _i1) + { + // apply( _tensor4_0Gm, S_block_in, "max", make_axes(2) ) + ReduceMax( _tensor5_0temp, _tensor0Local, _tensor5_0temp[ ascend_el_per_blk ], n2, false ); + half max_ = _tensor5_0temp.GetValue( 0 ); + Duplicate( _tensor4_0Gm, max_, n2 ); // broadcast + + // apply( S_block_out, S_block_in, _tensor4_0Gm, "minus", 
make_axes(2) ); + Sub( _tensor1Local, _tensor0Local, _tensor4_0Gm, n2 ); + + // foldl( S_block_out, "exp" ); + Exp( _tensor1Local, _tensor1Local, n2 ); + + // apply( _tensor4_0Gm, S_block_out, "add", make_axes(2) ); + ReduceSum( _tensor6_0temp, _tensor1Local, _tensor6_0temp[ ascend_el_per_blk ], n2 ); + half rec_sum_ = _tensor6_0temp.GetValue( 0 ); + Duplicate( _tensor4_0Gm, rec_sum_, n2 ); // broadcast + + // foldl( S_block_out, _tensor4_0Gm, "divide", make_axes(2) ); + Div( _tensor1Local, _tensor1Local, _tensor4_0Gm, n2 ); + + } + __aicore__ inline void CopyOut0( + uint32_t _i0, uint32_t _i1 + ) { + outQueue_tensor1_0.EnQue< half >( _tensor1Local ); + // free input tensors for reuse + inQueue_tensor0_0.FreeTensor( _tensor0Local ); + + // deque output tensor from VECOUT queue + _tensor1Local = outQueue_tensor1_0.DeQue< half >(); + DataCopy( _tensor1_0Gm[ _i0 * n1 * n2 + _i1 * n2 ], _tensor1Local, n2 ); + // free output tensor for reuse + outQueue_tensor1_0.FreeTensor( _tensor1Local ); + } + + private: + TPipe pipe; + // create queues for input, in this case depth is equal to buffer num + TQue inQueue_tensor0_0; + // create queue for output, in this case depth is equal to buffer num + TQue outQueue_tensor1_0; + + uint32_t p0, p1, p2, n0, n1, n2; + uint32_t block_length0, tile_length0; + int32_t ascend_el_per_blk, totWorkSpaceSize; + + GlobalTensor< half > _tensor0_0Gm, _tensor1_0Gm; + LocalTensor< half > _tensor0Local; + LocalTensor< half > _tensor1Local; + LocalTensor< half > _tensor5_0temp; + LocalTensor< half > _tensor6_0temp; + LocalTensor< half > _tensor4_0Gm; + + TBuf< QuePosition::VECCALC > tempBuf_tensor5_0; + TBuf< QuePosition::VECCALC > tempBuf_tensor6_0; + TBuf< QuePosition::VECCALC > localBuf_tensor4_0; + +}; + +extern "C" __global__ __aicore__ void custom_KernelSoftmax( + GM_ADDR in, GM_ADDR out, + uint32_t _p, uint32_t _n0, uint32_t _n1, uint32_t _n2 ) { + KernelSoftmax op(_p, _n0, _n1, _n2 ); + op.Init( in, out ); + op.Process(); +} + +#ifndef __CCE_KT_TEST__ +// call of kernel function +void custom_KernelSoftmax_do( uint32_t blockDim, void* l2ctrl, void* stream, uint8_t* in, uint8_t* out, uint32_t _p, uint32_t _n0, uint32_t _n1, uint32_t _n2 ) +{ + custom_KernelSoftmax<<< blockDim, l2ctrl, stream >>>( in, out, _p, _n0, _n1, _n2 ); +} +#endif diff --git a/examples/softmax_custom-v1.cpp b/examples/softmax_custom-v1.cpp new file mode 100644 index 000000000..fc17296be --- /dev/null +++ b/examples/softmax_custom-v1.cpp @@ -0,0 +1,179 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. + */ + +#include "kernel_operator.h" + +using namespace AscendC; + +constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue + +__aicore__ inline int32_t RoundUp(int32_t a, int32_t b) { + return (a + b - 1) / b; +} + +//template < typename T > +class KernelSoftmax { +public: + __aicore__ inline KernelSoftmax( const uint32_t _p0, const uint32_t _n0, const uint32_t _n1, const uint32_t _n2 ) { + p0 = _p0; + p1 = 1; + p2 = 1; + + n0 = _n0; + n1 = _n1; + n2 = _n2; + + block_length0 = ( n0 * n1 * n2 ) / ( p0 * p1 * p2 ); + tile_length0 = ( n1 * n2 ) / BUFFER_NUM; + + } + + __aicore__ inline void Init( GM_ADDR tensor0, GM_ADDR tensor1 ) + { + + // get start index for current core, core parallel + _tensor0_0Gm.SetGlobalBuffer( (__gm__ half *)tensor0 + block_length0 * GetBlockIdx(), block_length0); + _tensor1_0Gm.SetGlobalBuffer((__gm__ half *)tensor1 + block_length0 * GetBlockIdx(), block_length0); + + // Min workspace for reduction ops. 
+ // Taking the largest btw MaxReduce and SumReduce (ie, MaxReduce) as specified in the AscendC manual + // at Secs. 8.1.5.10.1 and 8.1.5.10.3 + ascend_el_per_blk = ONE_BLK_SIZE / sizeof( half ); + int32_t elementsPerRepeat = ONE_REPEAT_BYTE_SIZE / sizeof( half ); + int32_t firstMaxRepeat = n2 / elementsPerRepeat; + int32_t iter1OutputCount = firstMaxRepeat * 2; + int32_t tmpBufsColsReduce = RoundUp( iter1OutputCount, ascend_el_per_blk ) * ascend_el_per_blk; + + totWorkSpaceSize = ( + ascend_el_per_blk + tmpBufsColsReduce // Output + workspace for Max/SumReduce + + n2 + ) * sizeof( half ); + + pipe.InitBuffer( inQueue_tensor0_0, BUFFER_NUM, tile_length0 * sizeof( half ) ); + pipe.InitBuffer( outQueue_tensor1_0, BUFFER_NUM, tile_length0 * sizeof( half ) ); + + } + + __aicore__ inline void Process() + { + // loop count ( including effect of using BUFFER_NUM ) + const uint32_t loopCount0 = ( n0 * BUFFER_NUM ) / p0; + for (uint32_t i0 = 0; i0 < loopCount0; i0++) { + uint32_t i = i0; + + CopyIn0(i); + + // This loop comes from axis 1, does not need data movement + // For now process a tile row by row + const uint32_t loopCount1 = n1 / BUFFER_NUM; + for( uint32_t i1 = 0; i1 < loopCount1 ; ++i1 ) { + Compute0( i1 ); + } + + // free input tensors for reuse + // inQueue_tensor0_0.FreeTensor( _tensor0Local ); + + CopyOut0(i); + } + } + + +private: + + __aicore__ inline void CopyIn0(uint32_t progress) + { + // alloc tensor from queue memory + _tensor0Local = inQueue_tensor0_0.AllocTensor< half >(); + // copy progress_th tile from global tensor to local tensor + DataCopy( _tensor0Local, _tensor0_0Gm[ progress * tile_length0 ], tile_length0 ); + // enque input tensors to VECIN queue + inQueue_tensor0_0.EnQue( _tensor0Local ); + + // deque input tensors from VECIN queue + _tensor0Local = inQueue_tensor0_0.DeQue< half >(); + _tensor1Local = outQueue_tensor1_0.AllocTensor< half >(); + + pipe.InitBuffer( tempBuf_tensor5_0, totWorkSpaceSize ); + _tensor5_0temp = tempBuf_tensor5_0.Get< half >( ); + pipe.InitBuffer( tempBuf_tensor6_0, totWorkSpaceSize ); + _tensor6_0temp = tempBuf_tensor6_0.Get< half >( ); + + pipe.InitBuffer( localBuf_tensor4_0, n1 ); + _tensor4_0Gm = localBuf_tensor4_0.Get< half >( ); // _tensor4_0Gm comes from API + + } + __aicore__ inline void Compute0(uint32_t _i1) + { + // apply( _tensor4_0Gm, S_block_in, "max", make_axes(2) ) + ReduceMax( _tensor5_0temp[ 0 ], _tensor0Local[ _i1 * n2 ], _tensor5_0temp[ ascend_el_per_blk ], n2, false ); + half max_ = _tensor5_0temp[ 0 ].GetValue( 0 ); + Duplicate( _tensor4_0Gm, max_, n2 ); // broadcast + + // apply( S_block_out, S_block_in, _tensor4_0Gm, "minus", make_axes(2) ); + Sub( _tensor1Local[ _i1 * n2 ], _tensor0Local[ _i1 * n2 ], _tensor4_0Gm, n2 ); + + // foldl( S_block_out, "exp" ); + Exp( _tensor1Local[ _i1 * n2 ], _tensor1Local[ _i1 * n2 ], n2 ); + + // apply( _tensor4_0Gm, S_block_out, "add", make_axes(2) ); + ReduceSum( _tensor6_0temp[ 0 ], _tensor1Local[ _i1 * n2 ], _tensor6_0temp[ ascend_el_per_blk ], n2 ); + half rec_sum_ = _tensor6_0temp[ 0 ].GetValue( 0 ); + Duplicate( _tensor4_0Gm, rec_sum_, n2 ); // broadcast + + // foldl( S_block_out, _tensor4_0Gm, "divide", make_axes(2) ); + Div( _tensor1Local[ _i1 * n2 ], _tensor1Local[ _i1 * n2 ], _tensor4_0Gm, n2 ); + + } + __aicore__ inline void CopyOut0(uint32_t progress) + { + outQueue_tensor1_0.EnQue< half >( _tensor1Local ); + // free input tensors for reuse + inQueue_tensor0_0.FreeTensor( _tensor0Local ); + + // deque output tensor from VECOUT queue + _tensor1Local = 
outQueue_tensor1_0.DeQue< half >(); + DataCopy( _tensor1_0Gm[ progress * tile_length0 ], _tensor1Local, tile_length0 ); + // free output tensor for reuse + outQueue_tensor1_0.FreeTensor( _tensor1Local ); + } + + private: + TPipe pipe; + // create queues for input, in this case depth is equal to buffer num + TQue inQueue_tensor0_0; + // create queue for output, in this case depth is equal to buffer num + TQue outQueue_tensor1_0; + + uint32_t p0, p1, p2, n0, n1, n2; + uint32_t block_length0, tile_length0; + int32_t ascend_el_per_blk, totWorkSpaceSize; + + GlobalTensor< half > _tensor0_0Gm, _tensor1_0Gm; + LocalTensor< half > _tensor0Local; + LocalTensor< half > _tensor1Local; + LocalTensor< half > _tensor5_0temp; + LocalTensor< half > _tensor6_0temp; + LocalTensor< half > _tensor4_0Gm; + + TBuf< QuePosition::VECCALC > tempBuf_tensor5_0; + TBuf< QuePosition::VECCALC > tempBuf_tensor6_0; + TBuf< QuePosition::VECCALC > localBuf_tensor4_0; + +}; + +extern "C" __global__ __aicore__ void custom_KernelSoftmax( + GM_ADDR in, GM_ADDR out, + uint32_t _p, uint32_t _n0, uint32_t _n1, uint32_t _n2 ) { + KernelSoftmax op(_p, _n0, _n1, _n2 ); + op.Init( in, out ); + op.Process(); +} + +#ifndef __CCE_KT_TEST__ +// call of kernel function +void custom_KernelSoftmax_do( uint32_t blockDim, void* l2ctrl, void* stream, uint8_t* in, uint8_t* out, uint32_t _p, uint32_t _n0, uint32_t _n1, uint32_t _n2 ) +{ + custom_KernelSoftmax<<< blockDim, l2ctrl, stream >>>( in, out, _p, _n0, _n1, _n2 ); +} +#endif diff --git a/examples/softmax_custom-v3.cpp b/examples/softmax_custom-v3.cpp new file mode 100644 index 000000000..94e43f9c9 --- /dev/null +++ b/examples/softmax_custom-v3.cpp @@ -0,0 +1,210 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. + */ + +#include "kernel_operator.h" +#include "ascendlib.hpp" + +using namespace AscendC; + +constexpr int32_t BUFFER_NUM = 1; // tensor num for each queue + +__aicore__ inline int32_t RoundUp(int32_t a, int32_t b) { + return (a + b - 1) / b; +} + +//template < typename T > +class KernelSoftmax { +public: + __aicore__ inline KernelSoftmax( const uint32_t _p0, const uint32_t _n0, const uint32_t _n1, const uint32_t _n2, const uint32_t _n3 ) { + p0 = _p0; + p1 = 1; + p2 = 1; + p3 = 1; + + n0 = _n0; + n1 = _n1; + n2 = _n2; + n3 = _n3; + + block_length0 = ( n0 * n1 * n2 * n3 ) / ( p0 * p1 * p2 * p3 ); + tile_length0 = ( n1 * n2 * n3 ) / BUFFER_NUM; + + } + + __aicore__ inline void Init( GM_ADDR tensor0, GM_ADDR tensor1 ) + { + + // get start index for current core, core parallel + _tensor0_0Gm.SetGlobalBuffer( (__gm__ half *)tensor0 + block_length0 * GetBlockIdx(), block_length0); + _tensor1_0Gm.SetGlobalBuffer((__gm__ half *)tensor1 + block_length0 * GetBlockIdx(), block_length0); + + // Min workspace for reduction ops. + // Taking the largest btw MaxReduce and SumReduce (ie, MaxReduce) as specified in the AscendC manual + // at Secs. 
8.1.5.10.1 and 8.1.5.10.3 + ascend_el_per_blk = ONE_BLK_SIZE / sizeof( half ); + int32_t elementsPerRepeat = ONE_REPEAT_BYTE_SIZE / sizeof( half ); + int32_t firstMaxRepeat = n3 / elementsPerRepeat; + int32_t iter1OutputCount = firstMaxRepeat * 2; + int32_t tmpBufsColsReduce = RoundUp( iter1OutputCount, ascend_el_per_blk ) * ascend_el_per_blk; + + totWorkSpaceSize = ( + ascend_el_per_blk + tmpBufsColsReduce // Output + workspace for Max/SumReduce + + n3 + ) * sizeof( half ); + + pipe.InitBuffer( inQueue_tensor0_0, BUFFER_NUM, tile_length0 * sizeof( half ) ); + pipe.InitBuffer( outQueue_tensor1_0, BUFFER_NUM, tile_length0 * sizeof( half ) ); + + } + + __aicore__ inline void Process() + { + tempBuffInit(); + + const uint32_t loopCount0 = n0 / p0; + for (uint32_t i0 = 0; i0 < loopCount0; i0++) { + + const uint32_t loopCount1 = n1 ; + for (uint32_t i1 = 0; i1 < n1; i1++) { + + uint32_t gm_pointer = i0*n1*n2*n3 + i1*n2*n3; + //uint32_t blocklen=n3; + uint32_t stride=n3; + //uint32_t nblocks=n2; + + CopyIn0(gm_pointer,n3,stride,n2); + + // apply( _tensor2Local, S_block_in, "max", make_axes(2) ) + alp::BlockReduceMax( _tensor2Local, _tensor0Local, _tensor5_0temp[ ascend_el_per_blk ], n2, n3 ); + + + // apply( S_block_out, S_block_in, _tensor2Local, "minus", make_axes(2) ); + alp::BlockBcastMinus( _tensor1Local, _tensor0Local, _tensor2Local, _tensor5_0temp, n2, n3 ); + + alp::BlockExp( _tensor1Local, _tensor1Local, n2, n3 ); + + // apply( _tensor2Local, S_block_out, "add", make_axes(2) ); + alp::BlockReduceSum( _tensor2Local, _tensor1Local, _tensor6_0temp[ ascend_el_per_blk ], n2, n3 ); + + + // foldl( S_block_out, _tensor2Local, "divide", make_axes(2) ); + alp::BlockBcastDivide( _tensor1Local, _tensor1Local, _tensor2Local, _tensor5_0temp, n2, n3 ); + + CopyOut0(gm_pointer,n3,stride,n2); + + } + } + + } + + +private: + + + __aicore__ inline void tempBuffInit() { + + pipe.InitBuffer( tempBuf_tensor5_0, totWorkSpaceSize ); + _tensor5_0temp = tempBuf_tensor5_0.Get< half >( ); + + pipe.InitBuffer( tempBuf_tensor6_0, totWorkSpaceSize ); + _tensor6_0temp = tempBuf_tensor6_0.Get< half >( ); + + pipe.InitBuffer( localBuf_tensor4_0, n2 ); + _tensor2Local = localBuf_tensor4_0.Get< half >( ); // _tensor2Local comes from API + } + + + __aicore__ inline void CopyIn0( + uint32_t gm_pointer, uint32_t blocklen, uint32_t stride, uint32_t nblocks + ) + { + // alloc tensor from queue memory + _tensor0Local = inQueue_tensor0_0.AllocTensor< half >(); + // copy progress_th tile from global tensor to local tensor + + // DataCopyParams dcpy_param; + // dcpy_param.blockCount=nblocks; + // dcpy_param.blockLen =blocklen; + // dcpy_param.srcStride =stride; + // dcpy_param.dstStride =0; + // DataCopy( _tensor0Local, _tensor0_0Gm[ gm_pointer ], dcpy_param ); + // DataCopy( _tensor0Local, _tensor0_0Gm[ gm_pointer ], blocklen ); + for( uint32_t k = 0; k < nblocks ; ++k ) { + DataCopy( _tensor0Local[ k*blocklen ], _tensor0_0Gm[ gm_pointer + k*stride ], blocklen ); + } + + // enque input tensors to VECIN queue + inQueue_tensor0_0.EnQue( _tensor0Local ); + + // deque input tensors from VECIN queue + _tensor0Local = inQueue_tensor0_0.DeQue< half >(); + _tensor1Local = outQueue_tensor1_0.AllocTensor< half >(); + + } + + __aicore__ inline void CopyOut0( + uint32_t gm_pointer, uint32_t blocklen, uint32_t stride, uint32_t nblocks + ) { + outQueue_tensor1_0.EnQue< half >( _tensor1Local ); + // free input tensors for reuse + inQueue_tensor0_0.FreeTensor( _tensor0Local ); + + // deque output tensor from VECOUT queue + _tensor1Local = 
outQueue_tensor1_0.DeQue< half >(); + + // DataCopyParams dcpy_param; + // dcpy_param.blockCount=nblocks; + // dcpy_param.blockLen =blocklen; + // dcpy_param.srcStride =0; + // dcpy_param.dstStride =stride; + // DataCopy( _tensor1_0Gm[ gm_pointer ], _tensor1Local, dcpy_param ); + // DataCopy( _tensor1_0Gm[ gm_pointer ], _tensor1Local, blocklen ); + for( uint32_t k = 0; k < nblocks ; ++k ) { + DataCopy( _tensor1_0Gm[ gm_pointer + k*stride ], _tensor1Local[ k*blocklen ], blocklen ); + } + + // free output tensor for reuse + outQueue_tensor1_0.FreeTensor( _tensor1Local ); + } + + private: + TPipe pipe; + // create queues for input, in this case depth is equal to buffer num + TQue inQueue_tensor0_0; + // create queue for output, in this case depth is equal to buffer num + TQue outQueue_tensor1_0; + + uint32_t p0, p1, p2, p3, n0, n1, n2, n3; + uint32_t block_length0, tile_length0; + int32_t ascend_el_per_blk, totWorkSpaceSize; + + GlobalTensor< half > _tensor0_0Gm, _tensor1_0Gm; + LocalTensor< half > _tensor0Local; + LocalTensor< half > _tensor1Local; + + LocalTensor< half > _tensor5_0temp; + LocalTensor< half > _tensor6_0temp; + LocalTensor< half > _tensor2Local; + + TBuf< QuePosition::VECCALC > tempBuf_tensor5_0; + TBuf< QuePosition::VECCALC > tempBuf_tensor6_0; + TBuf< QuePosition::VECCALC > localBuf_tensor4_0; + +}; + +extern "C" __global__ __aicore__ void custom_KernelSoftmax( + GM_ADDR in, GM_ADDR out, + uint32_t _p, uint32_t _n0, uint32_t _n1, uint32_t _n2, uint32_t _n3 ) { + KernelSoftmax op(_p, _n0, _n1, _n2, _n3 ); + op.Init( in, out ); + op.Process(); +} + +#ifndef __CCE_KT_TEST__ +// call of kernel function +void custom_KernelSoftmax_do( uint32_t blockDim, void* l2ctrl, void* stream, uint8_t* in, uint8_t* out, uint32_t _p, uint32_t _n0, uint32_t _n1, uint32_t _n2, uint32_t _n3 ) +{ + custom_KernelSoftmax<<< blockDim, l2ctrl, stream >>>( in, out, _p, _n0, _n1, _n2, _n3 ); +} +#endif diff --git a/examples/softmax_custom-v4.cpp b/examples/softmax_custom-v4.cpp new file mode 100644 index 000000000..77b69ebf0 --- /dev/null +++ b/examples/softmax_custom-v4.cpp @@ -0,0 +1,228 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. + */ + +#include "kernel_operator.h" +#include "ascendlib.hpp" + +using namespace AscendC; + +constexpr int32_t BUFFER_NUM = 1; // tensor num for each queue + +__aicore__ inline int32_t RoundUp(int32_t a, int32_t b) { + return (a + b - 1) / b; +} + +//template < typename T > +class KernelSoftmax { +public: +/* + __aicore__ inline KernelSoftmax( + const uint32_t _p0, + const uint32_t _n0, + const uint32_t _n1, + const uint32_t _n2, + const uint32_t _n3, + const uint32_t _n4, + const uint32_t _n5 + ) { + p0 = _p0; + p1 = 1; + p2 = 1; + p3 = 1; + p4 = 1; + p5 = 1; + + n0 = _n0; + n1 = _n1; + n2 = _n2; + n3 = _n3; + n4 = _n4; + n5 = _n5; + + block_length0 = ( n0 * n1 * n2 * n3 * n4 * n5 ) / ( p0 * p1 * p2 * p3 * p4 * p5 ); + tile_length0 = ( n1 * n2 * n3 * n4 * n5 ) / BUFFER_NUM; + + } +*/ + __aicore__ inline void Init( GM_ADDR tensor0, GM_ADDR tensor1 ) + { + + // get start index for current core, core parallel + _tensor0_0Gm.SetGlobalBuffer( (__gm__ half *)tensor0 + block_length0 * GetBlockIdx(), block_length0); + _tensor1_0Gm.SetGlobalBuffer((__gm__ half *)tensor1 + block_length0 * GetBlockIdx(), block_length0); + + // Min workspace for reduction ops. + // Taking the largest btw MaxReduce and SumReduce (ie, MaxReduce) as specified in the AscendC manual + // at Secs. 
8.1.5.10.1 and 8.1.5.10.3 + ascend_el_per_blk = ONE_BLK_SIZE / sizeof( half ); + int32_t elementsPerRepeat = ONE_REPEAT_BYTE_SIZE / sizeof( half ); + int32_t firstMaxRepeat = n3 / elementsPerRepeat; + int32_t iter1OutputCount = firstMaxRepeat * 2; + int32_t tmpBufsColsReduce = RoundUp( iter1OutputCount, ascend_el_per_blk ) * ascend_el_per_blk; + + totWorkSpaceSize = ( + ascend_el_per_blk + tmpBufsColsReduce // Output + workspace for Max/SumReduce + + n3 + ) * sizeof( half ); + + pipe.InitBuffer( inQueue_tensor0_0, BUFFER_NUM, tile_length0 * sizeof( half ) ); + pipe.InitBuffer( outQueue_tensor1_0, BUFFER_NUM, tile_length0 * sizeof( half ) ); + + } + + __aicore__ inline void Process() + { + tempBuffInit(); + + // loop count ( including effect of using BUFFER_NUM ) + + const uint32_t loopCount0 = n0 / p0; + for (uint32_t i0 = 0; i0 < loopCount0; i0++) { + + for (uint32_t i1 = 0; i1 < n1; i1++) { + // no loop i2 + for (uint32_t i3 = 0; i3 < n3; i3++) { + for (uint32_t i4 = 0; i4 < n4; i4++) { + // no loop i5 + + + uint32_t gm_pointer = i0*n1*n2*n3*n4*n5 + i1*n2*n3*n4*n5 + i3*n4*n5 + i4*n5; + uint32_t blocklen=n5; + uint32_t stride=n3*n4*n5; + uint32_t nblocks=n2; + + CopyIn0(gm_pointer,blocklen,stride,nblocks); + + alp::BlockReduceMax( _tensor4_0Gm, _tensor0Local, _tensor5_0temp[ ascend_el_per_blk ], nblocks, blocklen ); + + alp::BlockBcastMinus( _tensor1Local, _tensor0Local, _tensor4_0Gm, _tensor5_0temp, nblocks, blocklen ); + + alp::BlockExp( _tensor1Local, _tensor1Local, nblocks, blocklen ); + + alp::BlockReduceSum( _tensor4_0Gm, _tensor1Local, _tensor5_0temp[ ascend_el_per_blk ], nblocks, blocklen ); + + alp::BlockBcastDivide( _tensor1Local, _tensor1Local, _tensor4_0Gm, _tensor5_0temp, nblocks, blocklen ); + + CopyOut0(gm_pointer,blocklen,stride,nblocks); + + } + } + } + } + } + + +private: + + + __aicore__ inline void tempBuffInit() { + + pipe.InitBuffer( tempBuf_tensor5_0, totWorkSpaceSize ); + _tensor5_0temp = tempBuf_tensor5_0.Get< half >( ); + + // pipe.InitBuffer( tempBuf_tensor6_0, totWorkSpaceSize ); + // _tensor6_0temp = tempBuf_tensor6_0.Get< half >( ); + + pipe.InitBuffer( localBuf_tensor4_0, n2 ); + _tensor4_0Gm = localBuf_tensor4_0.Get< half >( ); // _tensor4_0Gm comes from API + } + + + __aicore__ inline void CopyIn0( + uint32_t gm_pointer, uint32_t blocklen, uint32_t stride, uint32_t nblocks + ) + { + // alloc tensor from queue memory + _tensor0Local = inQueue_tensor0_0.AllocTensor< half >(); + // copy progress_th tile from global tensor to local tensor + + // DataCopyParams dcpy_param; + // dcpy_param.blockCount=nblocks; + // dcpy_param.blockLen =blocklen; + // dcpy_param.srcStride =stride; + // dcpy_param.dstStride =0; + // DataCopy( _tensor0Local, _tensor0_0Gm[ gm_pointer ], dcpy_param ); + // DataCopy( _tensor0Local, _tensor0_0Gm[ gm_pointer ], blocklen ); + for( uint32_t k = 0; k < nblocks ; ++k ) { + DataCopy( _tensor0Local[ k*blocklen ], _tensor0_0Gm[ gm_pointer + k*stride ], blocklen ); + } + + // enque input tensors to VECIN queue + inQueue_tensor0_0.EnQue( _tensor0Local ); + + // deque input tensors from VECIN queue + _tensor0Local = inQueue_tensor0_0.DeQue< half >(); + _tensor1Local = outQueue_tensor1_0.AllocTensor< half >(); + + + } + + __aicore__ inline void CopyOut0( + uint32_t gm_pointer, uint32_t blocklen, uint32_t stride, uint32_t nblocks + ) + { + outQueue_tensor1_0.EnQue< half >( _tensor1Local ); + // free input tensors for reuse + inQueue_tensor0_0.FreeTensor( _tensor0Local ); + + // deque output tensor from VECOUT queue + _tensor1Local = 
outQueue_tensor1_0.DeQue< half >(); + + // DataCopyParams dcpy_param; + // dcpy_param.blockCount=nblocks; + // dcpy_param.blockLen =blocklen; + // dcpy_param.srcStride =0; + // dcpy_param.dstStride =stride; + // DataCopy( _tensor1_0Gm[ gm_pointer ], _tensor1Local, dcpy_param ); + // DataCopy( _tensor1_0Gm[ gm_pointer ], _tensor1Local, blocklen ); + for( uint32_t k = 0; k < nblocks ; ++k ) { + DataCopy( _tensor1_0Gm[ gm_pointer + k*stride ], _tensor1Local[ k*blocklen ], blocklen ); + } + + // free output tensor for reuse + outQueue_tensor1_0.FreeTensor( _tensor1Local ); + } + + private: + TPipe pipe; + // create queues for input, in this case depth is equal to buffer num + TQue inQueue_tensor0_0; + // create queue for output, in this case depth is equal to buffer num + TQue outQueue_tensor1_0; + + uint32_t p0, p1, p2, p3, p4, p5; + uint32_t n0, n1, n2, n3, n4, n5; + uint32_t block_length0, tile_length0; + int32_t ascend_el_per_blk, totWorkSpaceSize; + + GlobalTensor< half > _tensor0_0Gm, _tensor1_0Gm; + LocalTensor< half > _tensor0Local; + LocalTensor< half > _tensor1Local; + LocalTensor< half > _tensor5_0temp; + // LocalTensor< half > _tensor6_0temp; + LocalTensor< half > _tensor4_0Gm; + + TBuf< QuePosition::VECCALC > tempBuf_tensor5_0; + // TBuf< QuePosition::VECCALC > tempBuf_tensor6_0; + TBuf< QuePosition::VECCALC > localBuf_tensor4_0; + +}; + +extern "C" __global__ __aicore__ void custom_KernelSoftmax( + GM_ADDR in, GM_ADDR out, + uint32_t _p, uint32_t _n0, uint32_t _n1, uint32_t _n2, uint32_t _n3, uint32_t _n4, uint32_t _n5 ) { + KernelSoftmax op(_p, _n0, _n1, _n2, _n3, _n4, _n5 ); + op.Init( in, out ); + op.Process(); +} + +#ifndef __CCE_KT_TEST__ +// call of kernel function +void custom_KernelSoftmax_do( + uint32_t blockDim, void* l2ctrl, void* stream, uint8_t* in, uint8_t* out, uint32_t _p, + uint32_t _n0, uint32_t _n1, uint32_t _n2, uint32_t _n3, uint32_t _n4, uint32_t _n5 +) { + custom_KernelSoftmax<<< blockDim, l2ctrl, stream >>>( in, out, _p, _n0, _n1, _n2, _n3, _n4, _n5 ); +} +#endif diff --git a/examples/softmax_custom-v5.cpp b/examples/softmax_custom-v5.cpp new file mode 100644 index 000000000..4dd038f0a --- /dev/null +++ b/examples/softmax_custom-v5.cpp @@ -0,0 +1,268 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. 
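+ *
+ * Overview: this file implements an *online* (streaming) softmax over column
+ * blocks, keeping per row a running maximum m and a running normaliser l
+ * (m starts at -65504, the most negative fp16 value, and l at 0). For each
+ * column block S_j the update below amounts to, in pseudocode:
+ *
+ *   m_new = max( m_old, rowmax( S_j ) )            // BlockReduceMax + Max
+ *   P_j   = exp( S_j - m_new )                     // BlockBcastMinus + BlockExp
+ *   l_new = exp( m_old - m_new ) * l_old           // "expmidiff" rescaling
+ *           + rowsum( P_j )                        // BlockReduceSum + Add
+ *
+ * so that after the last block l holds the exact softmax denominator per row.
+ * The names S_j and P_j are illustrative only and do not appear in the code.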
+ */ + +#include "kernel_operator.h" +#include "ascendlib.hpp" + +#define TMP_MXM + +using namespace AscendC; + +constexpr int32_t BUFFER_NUM = 1; // tensor num for each queue + +__aicore__ inline int32_t RoundUp(int32_t a, int32_t b) { + return (a + b - 1) / b; +} + +//template < typename T > +class KernelOnlineSoftmax { +public: + __aicore__ inline KernelOnlineSoftmax( const uint32_t _p0, const uint32_t _n0, const uint32_t _n1, const uint32_t _n2, const uint32_t _n3 ) { + p0 = _p0; + p1 = 1; + p2 = 1; + p3 = 1; + + n0 = _n0; // Tr + n1 = _n1; // Tc + n2 = _n2; // Br + n3 = _n3; // Bc // Sij(Br,Bc) + + block_length_out1 = ( n0 * n2 ) / ( p0 * p2 ); + tile_length_out1 = ( n2 ) / BUFFER_NUM; + + block_length_out2 = ( n0 * n2 ) / ( p0 * p2 ); + tile_length_out2 = ( n2 ) / BUFFER_NUM; + } + + __aicore__ inline void Init( + GM_ADDR tensorOut1, GM_ADDR tensorOut2, + GM_ADDR tensorS0, GM_ADDR tensorS1 + ) { + + // get start index for current core, core parallel + + _tensorOutm_Gm.SetGlobalBuffer( (__gm__ half *)tensorOut1 + block_length_out1 * GetBlockIdx(), block_length_out1 ); + _tensorOutl_Gm.SetGlobalBuffer( (__gm__ half *)tensorOut2 + block_length_out2 * GetBlockIdx(), block_length_out2 ); + pipe.InitBuffer( outQueue_tensor_l, BUFFER_NUM, tile_length_out1 * sizeof( half ) ); + pipe.InitBuffer( outQueue_tensor_m, BUFFER_NUM, tile_length_out2 * sizeof( half ) ); + + uint32_t block_length_in_s = ( n0 * n1 * n2 * n3 ) / ( p0 * p1 * p2 * p3 ); + uint32_t tile_length_in_s = ( n1 * n2 * n3 ) / BUFFER_NUM; + + _tensorS0_Gm.SetGlobalBuffer( (__gm__ half *)tensorS0 + block_length_in_s * GetBlockIdx(), block_length_in_s ); + _tensorS1_Gm.SetGlobalBuffer( (__gm__ half *)tensorS1 + block_length_in_s * GetBlockIdx(), block_length_in_s ); + pipe.InitBuffer( inQueue_tensor_S0, BUFFER_NUM, n1*n2*n3 * sizeof( half ) ); + pipe.InitBuffer( outQueue_tensor_S1, BUFFER_NUM, n1*n2*n3 * sizeof( half ) ); + + + // Min workspace for reduction ops. + // Taking the largest btw MaxReduce and SumReduce (ie, MaxReduce) as specified in the AscendC manual + // at Secs. 
8.1.5.10.1 and 8.1.5.10.3 + ascend_el_per_blk = ONE_BLK_SIZE / sizeof( half ); + int32_t elementsPerRepeat = ONE_REPEAT_BYTE_SIZE / sizeof( half ); + int32_t firstMaxRepeat = n3 / elementsPerRepeat; + int32_t iter1OutputCount = firstMaxRepeat * 2; + int32_t tmpBufsColsReduce = RoundUp( iter1OutputCount, ascend_el_per_blk ) * ascend_el_per_blk; + + totWorkSpaceSize = ( + ascend_el_per_blk + tmpBufsColsReduce // Output + workspace for Max/SumReduce + + n3 + ) * sizeof( half ); + + + pipe.InitBuffer( tempBuf_alltensors, totWorkSpaceSize + 3 * n2 ); + _tensor_Work4 = tempBuf_alltensors.Get< half >(); + + // 0: + // ascend_el_per_blk: TEMP / HIDDEN + // rowmaxS: totWorkSpaceSize + // mi_old: rowmaxS + n2; + // expmidiff: mi_old + n2; + // + // + expmidiff= totWorkSpaceSize; + mi_old = rowmaxS + n2; + rowmaxS = mi_old + n2; + + } + + __aicore__ inline void Process() + { + half Zero = 0; + + const uint32_t loopCount0 = n0 / p0; + for (uint32_t i0 = 0; i0 < loopCount0; i0++) { + + //*******************************// + // auto m_block_out = mtensorout.getView(); // T(2) + _tensor_m_i0 = outQueue_tensor_m.AllocTensor< half >(); + outQueue_tensor_m.EnQue( _tensor_m_i0 ); + _tensor_m_i0 = outQueue_tensor_m.DeQue< half >(); + + + // alp::set( m_block_out, -alp::Infinity ); + half mInf = -65504.0; //---- + Duplicate( _tensor_m_i0, mInf, n2 ); //---- //TODO SET scalar + + + // DataCopy here + //*******************************// + // auto l_block_out = ltensorout.getView(); // T(2) + _tensor_l_i0 = outQueue_tensor_l.AllocTensor< half >(); + outQueue_tensor_l.EnQue( _tensor_l_i0 ); + _tensor_l_i0 = outQueue_tensor_l.DeQue< half >(); + + + // alp::set( l_block_out, alp::Zero ); + Duplicate( _tensor_l_i0, Zero, n2 ); //---- //TODO SET scalar + + + // DataCopy here + //*******************************// + + const uint32_t loopCount1 = n1 ; + for( uint32_t i1 = 0; i1 < n1; i1++ ) { + + _tensorSijIn = inQueue_tensor_S0.AllocTensor< half >(); + _tensorSijOut = outQueue_tensor_S1.AllocTensor< half >(); + + // alp::Tensor Sij( alp::Datatype::FP16, alp::make_axes( 2, 3 ) ); + + // alp::Tensor Temp( alp::Datatype::FP16, alp::make_axes( 2, 3 ) ); + + // alp::Tensor rowmaxS( alp::Datatype::FP16, alp::make_axes( 2 ) ); + + // alp::Tensor mi_old( alp::Datatype::FP16, alp::make_axes( 2 ) ); + + // alp::Tensor expmidiff( alp::Datatype::FP16, alp::make_axes( 2 ) ); + + + DataCopy( _tensorSijIn, _tensorS0_Gm[ i0*n1*n2*n3 + i1*n2*n3 ], n2*n3 ); + inQueue_tensor_S0.EnQue( _tensorSijIn ); + _tensorSijIn = inQueue_tensor_S0.DeQue< half >(); + + // +++++++++++++++++++++++++++++ // + // Online softmax + + // set( mi_old, m_block_out); + DataCopy( _tensor_Work4[mi_old], _tensor_m_i0, n2 ); + + // apply( rowmaxS, S_block_in, "max", make_axes( 3 ) ); + alp::BlockReduceMax( _tensor_Work4[rowmaxS], _tensorSijIn, _tensor_Work4[ ascend_el_per_blk ], n2, n3 ); + + // foldl( m_block_out, rowmaxS, "max" ); + Max( _tensor_m_i0, _tensor_m_i0, _tensor_Work4[rowmaxS], n2 ); + + // // apply( S_block_out, S_block_in, m_block_out, "minus", make_axes( 3 ) ); + alp::BlockBcastMinus( _tensorSijOut, _tensorSijIn, _tensor_m_i0, _tensor_Work4, n2, n3 ); + + // Si=np.exp(Si) + alp::BlockExp( _tensorSijOut, _tensorSijOut, n2, n3 ); + + // expmidiff=np.exp(mi_old-mtensor[i,:]) + Duplicate( _tensor_Work4[expmidiff], Zero, n2 ); //---- + Sub( _tensor_Work4[expmidiff], _tensor_Work4[mi_old], _tensor_m_i0, n2 ); + Exp( _tensor_Work4[expmidiff], _tensor_Work4[expmidiff], n2 ); + + // foldl( l_block_out, expmidiff, "times" ); + Mul( _tensor_l_i0, _tensor_l_i0, 
_tensor_Work4[expmidiff], n2 ); + + // foldl( l_block_out, S_block_out, "add", make_axes( 3 ) ); + alp::BlockReduceSum( _tensor_Work4[rowmaxS], _tensorSijOut, _tensor_Work4[ ascend_el_per_blk ] , n2, n3 ); + Add( _tensor_l_i0, _tensor_l_i0, _tensor_Work4[rowmaxS], n2 ); + + // +++++++++++++++++++++++++++++ // + + outQueue_tensor_S1.EnQue( _tensorSijOut ); + _tensorSijOut = outQueue_tensor_S1.DeQue< half >(); + DataCopy( _tensorS1_Gm[ i0*n1*n2*n3 + i1*n2*n3 ], _tensorSijOut, n2*n3 ); + + inQueue_tensor_S0.FreeTensor( _tensorSijIn ); + outQueue_tensor_S1.FreeTensor( _tensorSijOut ); + } + + // // Uptade ltensor + // // CopyOUT ltensor & mtensor + + DataCopy( _tensorOutm_Gm[ i0 * n2 ], _tensor_m_i0, n2 ); + DataCopy( _tensorOutl_Gm[ i0 * n2 ], _tensor_l_i0, n2 ); + + outQueue_tensor_m.FreeTensor( _tensor_m_i0 ); + outQueue_tensor_l.FreeTensor( _tensor_l_i0 ); + + } + + } + + +private: + + + private: + + uint32_t p0, p1, p2, p3; + uint32_t n0, n1, n2, n3; + uint32_t block_length_out1, tile_length_out1; + uint32_t block_length_out2, tile_length_out2; + + int32_t ascend_el_per_blk, totWorkSpaceSize; + int32_t rowmaxS, mi_old, expmidiff; + + TPipe pipe; + + // create queue for output, in this case depth is equal to buffer num + TQue outQueue_tensor_S1; + TQue inQueue_tensor_S0; + + TQue outQueue_tensor_m; + TQue outQueue_tensor_l; + + + GlobalTensor< half > _tensorOutm_Gm; + GlobalTensor< half > _tensorOutl_Gm; + GlobalTensor< half > _tensorS0_Gm; + GlobalTensor< half > _tensorS1_Gm; + + LocalTensor< half > _tensorSijOut; + LocalTensor< half > _tensorSijIn; + + LocalTensor< half > _tensor_m_i0; + LocalTensor< half > _tensor_l_i0; + + LocalTensor< half > _tensor_Work4; + + + + TBuf< QuePosition::VECCALC > tempBuf_alltensors; +}; + +extern "C" __global__ __aicore__ void custom_KernelOnlineSoftmax( + GM_ADDR out1, GM_ADDR out2, + GM_ADDR S0, GM_ADDR S1, + uint32_t _p, uint32_t _n0, uint32_t _n1, uint32_t _n2, uint32_t _n3 +) { + KernelOnlineSoftmax op(_p, _n0, _n1, _n2, _n3 ); + op.Init( + out1, out2, + S0, S1 + ); // TODO fix Init + op.Process(); // TODO fix Process +} + +#ifndef __CCE_KT_TEST__ +// call of kernel function +void custom_KernelOnlineSoftmax_do( + uint32_t blockDim, void* l2ctrl, void* stream, + uint8_t* out1, uint8_t* out2, + uint8_t* s0, uint8_t* s1, + uint32_t _p, uint32_t _n0, uint32_t _n1, uint32_t _n2, uint32_t _n3 +) { + custom_KernelOnlineSoftmax<<< blockDim, l2ctrl, stream >>>( + out1, out2, + s0, s1, + _p, _n0, _n1, _n2, _n3 + ); +} +#endif diff --git a/examples/softmax_custom.cpp b/examples/softmax_custom.cpp new file mode 100644 index 000000000..fc17296be --- /dev/null +++ b/examples/softmax_custom.cpp @@ -0,0 +1,179 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. 
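+ *
+ * Worked example of the workspace sizing used in Init() below, assuming the
+ * usual AscendC values ONE_BLK_SIZE = 32 bytes and ONE_REPEAT_BYTE_SIZE = 256
+ * bytes (check the toolkit headers; the numbers here are only illustrative).
+ * For half precision and a row length n2 = 512:
+ *
+ *   ascend_el_per_blk = 32 / 2             = 16
+ *   elementsPerRepeat = 256 / 2            = 128
+ *   firstMaxRepeat    = 512 / 128          = 4
+ *   iter1OutputCount  = 4 * 2              = 8
+ *   tmpBufsColsReduce = RoundUp(8,16) * 16 = 16
+ *   totWorkSpaceSize  = (16 + 16 + 512) * 2 bytes = 1088 bytes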
+ */ + +#include "kernel_operator.h" + +using namespace AscendC; + +constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue + +__aicore__ inline int32_t RoundUp(int32_t a, int32_t b) { + return (a + b - 1) / b; +} + +//template < typename T > +class KernelSoftmax { +public: + __aicore__ inline KernelSoftmax( const uint32_t _p0, const uint32_t _n0, const uint32_t _n1, const uint32_t _n2 ) { + p0 = _p0; + p1 = 1; + p2 = 1; + + n0 = _n0; + n1 = _n1; + n2 = _n2; + + block_length0 = ( n0 * n1 * n2 ) / ( p0 * p1 * p2 ); + tile_length0 = ( n1 * n2 ) / BUFFER_NUM; + + } + + __aicore__ inline void Init( GM_ADDR tensor0, GM_ADDR tensor1 ) + { + + // get start index for current core, core parallel + _tensor0_0Gm.SetGlobalBuffer( (__gm__ half *)tensor0 + block_length0 * GetBlockIdx(), block_length0); + _tensor1_0Gm.SetGlobalBuffer((__gm__ half *)tensor1 + block_length0 * GetBlockIdx(), block_length0); + + // Min workspace for reduction ops. + // Taking the largest btw MaxReduce and SumReduce (ie, MaxReduce) as specified in the AscendC manual + // at Secs. 8.1.5.10.1 and 8.1.5.10.3 + ascend_el_per_blk = ONE_BLK_SIZE / sizeof( half ); + int32_t elementsPerRepeat = ONE_REPEAT_BYTE_SIZE / sizeof( half ); + int32_t firstMaxRepeat = n2 / elementsPerRepeat; + int32_t iter1OutputCount = firstMaxRepeat * 2; + int32_t tmpBufsColsReduce = RoundUp( iter1OutputCount, ascend_el_per_blk ) * ascend_el_per_blk; + + totWorkSpaceSize = ( + ascend_el_per_blk + tmpBufsColsReduce // Output + workspace for Max/SumReduce + + n2 + ) * sizeof( half ); + + pipe.InitBuffer( inQueue_tensor0_0, BUFFER_NUM, tile_length0 * sizeof( half ) ); + pipe.InitBuffer( outQueue_tensor1_0, BUFFER_NUM, tile_length0 * sizeof( half ) ); + + } + + __aicore__ inline void Process() + { + // loop count ( including effect of using BUFFER_NUM ) + const uint32_t loopCount0 = ( n0 * BUFFER_NUM ) / p0; + for (uint32_t i0 = 0; i0 < loopCount0; i0++) { + uint32_t i = i0; + + CopyIn0(i); + + // This loop comes from axis 1, does not need data movement + // For now process a tile row by row + const uint32_t loopCount1 = n1 / BUFFER_NUM; + for( uint32_t i1 = 0; i1 < loopCount1 ; ++i1 ) { + Compute0( i1 ); + } + + // free input tensors for reuse + // inQueue_tensor0_0.FreeTensor( _tensor0Local ); + + CopyOut0(i); + } + } + + +private: + + __aicore__ inline void CopyIn0(uint32_t progress) + { + // alloc tensor from queue memory + _tensor0Local = inQueue_tensor0_0.AllocTensor< half >(); + // copy progress_th tile from global tensor to local tensor + DataCopy( _tensor0Local, _tensor0_0Gm[ progress * tile_length0 ], tile_length0 ); + // enque input tensors to VECIN queue + inQueue_tensor0_0.EnQue( _tensor0Local ); + + // deque input tensors from VECIN queue + _tensor0Local = inQueue_tensor0_0.DeQue< half >(); + _tensor1Local = outQueue_tensor1_0.AllocTensor< half >(); + + pipe.InitBuffer( tempBuf_tensor5_0, totWorkSpaceSize ); + _tensor5_0temp = tempBuf_tensor5_0.Get< half >( ); + pipe.InitBuffer( tempBuf_tensor6_0, totWorkSpaceSize ); + _tensor6_0temp = tempBuf_tensor6_0.Get< half >( ); + + pipe.InitBuffer( localBuf_tensor4_0, n1 ); + _tensor4_0Gm = localBuf_tensor4_0.Get< half >( ); // _tensor4_0Gm comes from API + + } + __aicore__ inline void Compute0(uint32_t _i1) + { + // apply( _tensor4_0Gm, S_block_in, "max", make_axes(2) ) + ReduceMax( _tensor5_0temp[ 0 ], _tensor0Local[ _i1 * n2 ], _tensor5_0temp[ ascend_el_per_blk ], n2, false ); + half max_ = _tensor5_0temp[ 0 ].GetValue( 0 ); + Duplicate( _tensor4_0Gm, max_, n2 ); // broadcast + + // apply( 
S_block_out, S_block_in, _tensor4_0Gm, "minus", make_axes(2) ); + Sub( _tensor1Local[ _i1 * n2 ], _tensor0Local[ _i1 * n2 ], _tensor4_0Gm, n2 ); + + // foldl( S_block_out, "exp" ); + Exp( _tensor1Local[ _i1 * n2 ], _tensor1Local[ _i1 * n2 ], n2 ); + + // apply( _tensor4_0Gm, S_block_out, "add", make_axes(2) ); + ReduceSum( _tensor6_0temp[ 0 ], _tensor1Local[ _i1 * n2 ], _tensor6_0temp[ ascend_el_per_blk ], n2 ); + half rec_sum_ = _tensor6_0temp[ 0 ].GetValue( 0 ); + Duplicate( _tensor4_0Gm, rec_sum_, n2 ); // broadcast + + // foldl( S_block_out, _tensor4_0Gm, "divide", make_axes(2) ); + Div( _tensor1Local[ _i1 * n2 ], _tensor1Local[ _i1 * n2 ], _tensor4_0Gm, n2 ); + + } + __aicore__ inline void CopyOut0(uint32_t progress) + { + outQueue_tensor1_0.EnQue< half >( _tensor1Local ); + // free input tensors for reuse + inQueue_tensor0_0.FreeTensor( _tensor0Local ); + + // deque output tensor from VECOUT queue + _tensor1Local = outQueue_tensor1_0.DeQue< half >(); + DataCopy( _tensor1_0Gm[ progress * tile_length0 ], _tensor1Local, tile_length0 ); + // free output tensor for reuse + outQueue_tensor1_0.FreeTensor( _tensor1Local ); + } + + private: + TPipe pipe; + // create queues for input, in this case depth is equal to buffer num + TQue inQueue_tensor0_0; + // create queue for output, in this case depth is equal to buffer num + TQue outQueue_tensor1_0; + + uint32_t p0, p1, p2, n0, n1, n2; + uint32_t block_length0, tile_length0; + int32_t ascend_el_per_blk, totWorkSpaceSize; + + GlobalTensor< half > _tensor0_0Gm, _tensor1_0Gm; + LocalTensor< half > _tensor0Local; + LocalTensor< half > _tensor1Local; + LocalTensor< half > _tensor5_0temp; + LocalTensor< half > _tensor6_0temp; + LocalTensor< half > _tensor4_0Gm; + + TBuf< QuePosition::VECCALC > tempBuf_tensor5_0; + TBuf< QuePosition::VECCALC > tempBuf_tensor6_0; + TBuf< QuePosition::VECCALC > localBuf_tensor4_0; + +}; + +extern "C" __global__ __aicore__ void custom_KernelSoftmax( + GM_ADDR in, GM_ADDR out, + uint32_t _p, uint32_t _n0, uint32_t _n1, uint32_t _n2 ) { + KernelSoftmax op(_p, _n0, _n1, _n2 ); + op.Init( in, out ); + op.Process(); +} + +#ifndef __CCE_KT_TEST__ +// call of kernel function +void custom_KernelSoftmax_do( uint32_t blockDim, void* l2ctrl, void* stream, uint8_t* in, uint8_t* out, uint32_t _p, uint32_t _n0, uint32_t _n1, uint32_t _n2 ) +{ + custom_KernelSoftmax<<< blockDim, l2ctrl, stream >>>( in, out, _p, _n0, _n1, _n2 ); +} +#endif diff --git a/examples/softmax_custom_main.cpp b/examples/softmax_custom_main.cpp new file mode 120000 index 000000000..61df9c48e --- /dev/null +++ b/examples/softmax_custom_main.cpp @@ -0,0 +1 @@ +unittests/host_ascend_softmaxOp.cpp \ No newline at end of file diff --git a/examples/unittests/HOST_TEST_TEMPLATE.cpp b/examples/unittests/HOST_TEST_TEMPLATE.cpp new file mode 100644 index 000000000..cd38835cc --- /dev/null +++ b/examples/unittests/HOST_TEST_TEMPLATE.cpp @@ -0,0 +1,138 @@ + +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. + * This file constains code of cpu debug and npu code.We read data from bin file + * and write result to file. 
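+ *
+ * Tokens of the form ##NAME## (e.g. ##KERNELNAME##, ##NTHREADS##, ##REPEATS##)
+ * are placeholders: the generate_host script driven by the Makefile in this
+ * directory substitutes them when producing the actual host source, so this
+ * template is not meant to compile as-is. The __CCE_KT_TEST__ branch runs the
+ * kernel through the CPU simulator (ICPU_RUN_KF); the other branch runs it via
+ * the ACL runtime on the NPU and reports min/max/avg/median times over REPS runs.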
+ */ +#include +#include +#include +#include +#include + +//#define _ANALYTIC_MODEL_ + +#ifdef _ANALYTIC_MODEL_ +#include "analytic_model.hpp" +#endif + +#include "data_utils.h" + +#ifdef __CCE_KT_TEST__ + +#include "tikicpulib.h" +extern "C" __global__ __aicore__ void custom_##KERNELNAME##( +##CPUFRWDECTENSORALLLIST##, +##CPUFRWDECTHRDGRIDLIST##, +##CPUFRWDECTENSORSIZESLIST## +##ANALYTICMODELFORMALPARAMS## +); + +#else + +#include "acl/acl.h" +extern void custom_##KERNELNAME##_do( + uint32_t coreDim, void* l2ctrl, void* stream, +##FRWDECTENSORALLLIST##, +##FRWDECTHRDGRIDLIST##, +##FRWDECTENSORSIZESLIST## +##ANALYTICMODELFORMALPARAMS## +); + +#endif + + + +#define DTYPE uint16_t + +#define REPS ##REPEATS## + +int32_t main(int32_t argc, char* argv[]){ + int rc = 0; + uint32_t blockDim = ##NTHREADS##; + uint32_t _p0 = ##NTHREADS##; +##DECLARESIZES## + +##DECLARETENSORSIZES## + +##DECLAREANALYTICMODELPARAMS## + +#ifdef __CCE_KT_TEST__ +##CPUDECLARETENSOR## +##CPUREADFILES## + + AscendC::SetKernelMode(KernelMode::AIV_MODE); + ICPU_RUN_KF( + custom_##KERNELNAME##, + blockDim, +##CPUTENSORLIST##, + blockDim, +##ALLDIMENSIONSLIST##, ##ANALYTICMODELPARAMS## + ); // run the Kernel + +##CPUWRITETENSOR## + +##CPUFREETENSOR## +#else + + CHECK_ACL(aclInit(nullptr)); + aclrtContext context; + int32_t deviceId = ##DEVICEID##; + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + +##HOSTDECLARETENSOR## +##HOSTREADFILES## +##DEVICEDECLARETENSOR## + + std::vector< double > meas_vec( REPS ); + + for ( auto i = 0; i < REPS; ++i ) { +##HOST2DEVICEMOVE## + std::cout << "custom_##KERNELNAME## rep " << i << std::endl; + auto begin = std::chrono::high_resolution_clock::now(); + custom_##KERNELNAME##_do( + blockDim, nullptr, stream, +##DEVICETENSORLIST##, + blockDim, +##ALLDIMENSIONSLIST##, ##ANALYTICMODELPARAMS## + ); + rc = aclrtSynchronizeStream(stream); + CHECK_ACL(rc); + if( rc != 0 ) { + break; + } + auto end = std::chrono::high_resolution_clock::now(); + meas_vec[ i ] = static_cast< double >( std::chrono::duration_cast(end-begin).count() ); + } + + std::sort( meas_vec.begin(), meas_vec.end() ); + auto avg = std::accumulate( meas_vec.cbegin(), meas_vec.cend(), 0. ) / meas_vec.size(); + auto min = *( std::min_element( meas_vec.cbegin(), meas_vec.cend() ) ); + auto max = *( std::max_element( meas_vec.cbegin(), meas_vec.cend() ) ); + auto size = meas_vec.size(); + auto med = ( size % 2 == 0 ) ? 
( meas_vec[ size / 2 - 1 ] + meas_vec[ size / 2 ] ) / 2 : meas_vec[ size / 2 ]; + std::cout << "Measured Time (avg, ms): " << avg * 1e-6 << std::endl; + std::cout << " (min, ms): " << min * 1e-6 << std::endl; + std::cout << " (max, ms): " << max * 1e-6 << std::endl; + std::cout << " (med, ms): " << med * 1e-6 << std::endl; + +##DEVICE2HOSTMOVE## +##DEVICEFREETENSOR## +##WRITETENSOR## +##HOSTFREETENSOR## + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtDestroyContext(context)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +#endif + if( rc != 0 ) { + return 1; + } else { + return 0; + } + +} diff --git a/examples/unittests/Makefile b/examples/unittests/Makefile new file mode 100644 index 000000000..294c7f2a6 --- /dev/null +++ b/examples/unittests/Makefile @@ -0,0 +1,56 @@ +ASCEND_TOOLKIT_INSTALL_PATH=$(ASCEND_HOME_PATH) + +CXX=/usr/bin/c++ +ccec_compiler=$(ASCEND_TOOLKIT_INSTALL_PATH)/compiler/ccec_compiler/bin/ccec + +ifeq ($(ASCEND_VERSION),910A) + ccec_falgs=-I$(ALP_ROOT)/include/graphblas/ascend/ -I$(ALP_ROOT)/include/asclib -xcce -DTILING_KEY_VAR=0 -I$(ASCEND_TOOLKIT_INSTALL_PATH)/acllib/include -I$(ASCEND_TOOLKIT_INSTALL_PATH)/compiler/tikcpp/tikcfw -I$(ASCEND_TOOLKIT_INSTALL_PATH)/compiler/tikcpp/tikcfw/impl -I$(ASCEND_TOOLKIT_INSTALL_PATH)/compiler/tikcpp/tikcfw/interface -I$(ASCEND_TOOLKIT_INSTALL_PATH)/tools/tikicpulib/lib/include -O2 -std=c++17 --cce-aicore-arch=dav-c100 --cce-auto-sync -fPIC -pthread + ccec_link_falgs=--cce-fatobj-link --cce-aicore-arch=dav-c100 -L$(ASCEND_TOOLKIT_INSTALL_PATH)/runtime/lib64 -L$(ASCEND_TOOLKIT_INSTALL_PATH)/tools/simulator/Ascend910A/lib -L$(ASCEND_TOOLKIT_INSTALL_PATH)/tools/tikicpulib/lib/Ascend910A -lstdc++ -lruntime -lascendcl -lm +endif + +ifeq ($(ASCEND_VERSION),910B) + ccec_falgs=-I$(ALP_ROOT)/include/graphblas/ascend/ -I$(ALP_ROOT)/include/asclib -xcce -DTILING_KEY_VAR=0 -I$(ASCEND_TOOLKIT_INSTALL_PATH)/acllib/include -I$(ASCEND_TOOLKIT_INSTALL_PATH)/compiler/tikcpp/tikcfw -I$(ASCEND_TOOLKIT_INSTALL_PATH)/compiler/tikcpp/tikcfw/impl -I$(ASCEND_TOOLKIT_INSTALL_PATH)/compiler/tikcpp/tikcfw/interface -I$(ASCEND_TOOLKIT_INSTALL_PATH)/tools/tikicpulib/lib/include -O2 -std=c++17 --cce-aicore-arch=dav-c220-vec -mllvm -cce-aicore-function-stack-size=16000 -mllvm -cce-aicore-record-overflow=false -mllvm -cce-aicore-addr-transform --cce-auto-sync -fPIC -pthread -DASCEND910B + ccec_link_falgs=--cce-fatobj-link --cce-aicore-arch=dav-c220-vec -L$(ASCEND_TOOLKIT_INSTALL_PATH)/runtime/lib64 -L$(ASCEND_TOOLKIT_INSTALL_PATH)/tools/simulator/Ascend910B1/lib -L$(ASCEND_TOOLKIT_INSTALL_PATH)/tools/tikicpulib/lib/Ascend910B1 -lstdc++ -lruntime -lascendcl -lm -DASCEND910B +endif + +MODE=npu + +ifeq ($(ASCEND_CPU_MODE),ON) +ifeq ($(ASCEND_VERSION),910A) +MODE=cpu +ccec_compiler=$(CXX) + +ccec_falgs=-I$(ALP_ROOT)/include/graphblas/ascend/ -I$(ALP_ROOT)/include/asclib -D__CCE_AICORE__=100 -D__CCE_KT_TEST__=1 -D__DAV_C100__ -I${ASCEND_HOME_PATH}/acllib/include -isystem ${ASCEND_HOME_PATH}/tools/tikicpulib/lib/include -isystem ${ASCEND_HOME_PATH}/tools/tikicpulib/../../compiler/tikcpp/tikcfw -isystem ${ASCEND_HOME_PATH}/tools/tikicpulib/../../compiler/tikcpp/tikcfw/impl -isystem ${ASCEND_HOME_PATH}/tools/tikicpulib/../../compiler/tikcpp/tikcfw/interface -std=gnu++1z -g -std=c++17 + +ccec_link_falgs=-L${ASCEND_HOME_PATH}/tools/tikicpulib/lib -L${ASCEND_HOME_PATH}/tools/tikicpulib/lib/Ascend910A -L${ASCEND_HOME_PATH}/tools/tikicpulib/../simulator/Ascend910A/lib -L${ASCEND_HOME_PATH}/tools/tikicpulib/../../lib64 
-Wl,-rpath,${ASCEND_HOME_PATH}/tools/tikicpulib/lib:${ASCEND_HOME_PATH}/tools/tikicpulib/lib/Ascend910A:${ASCEND_HOME_PATH}/tools/tikicpulib/../simulator/Ascend910A/lib:${ASCEND_HOME_PATH}/tools/tikicpulib/../../lib64 -lascendcl -Wl,--no-as-needed -l_pvmodel -ltikcpp_debug ${ASCEND_HOME_PATH}/tools/tikicpulib/lib/libtikicpulib_cceprint.so ${ASCEND_HOME_PATH}/tools/tikicpulib/lib/libtikicpulib_npuchk.so ${ASCEND_HOME_PATH}/tools/tikicpulib/lib/libtikicpulib_stubreg.so -Wl,--as-needed + +endif +endif + +$(hostfile): $(target_cmake) $(host_template) $(host_code_inp) $(generate_host) + @echo python3 $(generate_host) $(host_template) $(hostfile) $(host_code_inp) 10 0 8 + @python3 $(generate_host) $(host_template) $(hostfile) $(host_code_inp) 10 0 8 + +$(host_code_inp): $(target_cmake) + @echo @$(target_cmake) + @$(target_cmake) + +$(devicefile): $(target_cmake) + @$(target_cmake) + +$(basename $(notdir $(hostfile))).$(MODE).o : $(hostfile) + @echo $(ccec_compiler) $(ccec_falgs) -c $(hostfile) -o $(basename $(notdir $(hostfile))).$(MODE).o + $(ccec_compiler) $(ccec_falgs) -c $(hostfile) -o $(basename $(notdir $(hostfile))).$(MODE).o + +$(basename $(notdir $(devicefile))).$(MODE).o : $(devicefile) + @echo $(ccec_compiler) $(ccec_falgs) -c $(devicefile) -o $(basename $(notdir $(devicefile))).$(MODE).o + $(ccec_compiler) $(ccec_falgs) -c $(devicefile) -o $(basename $(notdir $(devicefile))).$(MODE).o + +$(target): $(basename $(notdir $(devicefile))).$(MODE).o $(basename $(notdir $(hostfile))).$(MODE).o + $(ccec_compiler) $(basename $(notdir $(devicefile))).$(MODE).o $(basename $(notdir $(hostfile))).$(MODE).o -o $(target) $(ccec_link_falgs) + +clean: + @rm -f $(basename $(notdir $(devicefile))).$(MODE).o $(basename $(notdir $(hostfile))).$(MODE).o $(target) + +cleanall: + @rm -f .*o diff --git a/examples/unittests/alp_ascend_addOp.cpp b/examples/unittests/alp_ascend_addOp.cpp new file mode 100644 index 000000000..32d13a793 --- /dev/null +++ b/examples/unittests/alp_ascend_addOp.cpp @@ -0,0 +1,83 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include + +using namespace alp; + +template < typename GridType > +void ascend_code( const GridType &grid, RC&rc ) { + rc = RC::FAILED; + + Tensor x_global(Datatype::FP16, make_axes( "i", "j" ) ); // 0 is default + Tensor y_global(Datatype::FP16, make_axes( "i", "j" ) ); + Tensor z_global(Datatype::FP16, make_axes( "i", "j" ) ); + + rc = grid.forEach( make_axes( "i" ), [ & ] () { + auto x_block = getView( x_global ); + auto y_block = getView( y_global ); + auto z_block = getView( z_global ); + + // add( z_block, x_block, y_block, make_axes( 1 ) ); // z = x + y + z_block( "j" ) = add( x_block( "j" ), y_block( "j" ), "j" ); // z = x + y + + store( z_block ); + } ); + + return; +} + +int main( int argc, char ** argv ) { + + // default options + bool printUsage = false; + + // input error checking + if( argc > 1 ) { + printUsage = true; + } + + // print help on error + if( printUsage ) { + std::cerr << "Usage: " << argv[ 0 ] << "\n"; + return 10; + } + + // start opgen + std::cout << "//This is AscendOpGen example " << argv[ 0 ] << "\n"; + RC error_code = RC::SUCCESS; + try { + error_code = alp::compile< 1, 2 >( ascend_code, "addOp" ); + } catch( std::exception &e ) { + std::cerr << "alp::compile threw error: " << e.what() << "\n"; + return 20; + } + if( error_code != RC::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Codegen FAILED (" << toString( error_code ) << ")" + << std::endl; + return 30; + } else { + std::cout << "//Codegen OK" << std::endl; + return 0; + } + +} + diff --git a/examples/unittests/alp_ascend_addOpv1.cpp b/examples/unittests/alp_ascend_addOpv1.cpp new file mode 100644 index 000000000..3efbf2308 --- /dev/null +++ b/examples/unittests/alp_ascend_addOpv1.cpp @@ -0,0 +1,83 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include + +using namespace alp; + +template < typename GridType > +void ascend_code( const GridType &grid, RC&rc ) { + rc = RC::FAILED; + + Tensor x_global(Datatype::FP16, make_axes( "i" ) ); // 0 is default + Tensor y_global(Datatype::FP16, make_axes( "i" ) ); + Tensor z_global(Datatype::FP16, make_axes( "i" ) ); + + rc = grid.forEach( make_axes( "i" ), [ & ] () { + auto x_block = getView( x_global ); + auto y_block = getView( y_global ); + auto z_block = getView( z_global ); + + apply( z_block, x_block, y_block, "add" ); // z = x + y +// z_block( "j" ) = add( x_block( "j" ), y_block( "j" ), "j" ); // z = x + y + + store( z_block ); + } ); + + return; +} + +int main( int argc, char ** argv ) { + + // default options + bool printUsage = false; + + // input error checking + if( argc > 1 ) { + printUsage = true; + } + + // print help on error + if( printUsage ) { + std::cerr << "Usage: " << argv[ 0 ] << "\n"; + return 10; + } + + // start opgen + std::cout << "//This is AscendOpGen example " << argv[ 0 ] << "\n"; + RC error_code = RC::SUCCESS; + try { + error_code = alp::compile< 1, 1 >( ascend_code, "addOpv1" ); + } catch( std::exception &e ) { + std::cerr << "alp::compile threw error: " << e.what() << "\n"; + return 20; + } + if( error_code != RC::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Codegen FAILED (" << toString( error_code ) << ")" + << std::endl; + return 30; + } else { + std::cout << "//Codegen OK" << std::endl; + return 0; + } + +} + diff --git a/examples/unittests/alp_ascend_movedataOpv01.cpp b/examples/unittests/alp_ascend_movedataOpv01.cpp new file mode 100644 index 000000000..4bf21e900 --- /dev/null +++ b/examples/unittests/alp_ascend_movedataOpv01.cpp @@ -0,0 +1,90 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#define DEBUG + +#include + +using namespace alp; + + +// alp::Grid< 1, 3 > note: +// - Thread dimensionality = 1, means that the 1D thread grid maps to first +// axis of the problem grid. A refinement of this API may make this +// configurable. 
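+//
+// What this example does: each forEach iteration stages a (j,k) block of Sin
+// into a local (UB) tensor and writes it back out unchanged, so the kernel it
+// generates is effectively a pure copy with no compute. That makes it a
+// convenient sanity check for the data-movement path alone (compare against
+// the golden data with check_data_movedataOp-v01.py).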
+template < typename GridType > +void ascend_code( const GridType &grid, RC &rc ) { // max shape = ( Tr, Br, d ) + rc = RC::FAILED; + + Tensor Sin(Datatype::FP16, make_axes( "i", "j", "k" ) ); + Tensor Sout(Datatype::FP16, make_axes( "i", "j", "k" ) ); + + rc = grid.forEach( make_axes( "i" ), [ & ] () { + + auto S_block_in = getView( Sin ); + auto S_block_out = getView( Sout ); + + Tensor localTensor(Datatype::FP16, make_axes( "j", "k" ) ); + + set( localTensor, S_block_in); + set( S_block_out, localTensor); + + store( S_block_out ); + + } ); +} + +int main( int argc, char ** argv ) { + + // default options + bool printUsage = false; + + // input error checking + if( argc > 1 ) { + printUsage = true; + } + + // print help on error + if( printUsage ) { + std::cerr << "Usage: " << argv[ 0 ] << "\n"; + return 10; + } + + // start opgen + std::cout << "//This is AscendOpGen example " << argv[ 0 ] << "\n"; + RC error_code = RC::SUCCESS; + try { + error_code = alp::compile< 1, 3 >( ascend_code, "movedataOpv01" ); + } catch( std::exception &e ) { + std::cerr << "alp::compile threw error: " << e.what() << "\n"; + return 20; + } + if( error_code != RC::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Codegen FAILED (" << toString( error_code ) << ")" + << std::endl; + return 30; + } else { + std::cout << "//Codegen OK" << std::endl; + return 0; + } + +} + diff --git a/examples/unittests/alp_ascend_onlinesoftmaxOp.cpp b/examples/unittests/alp_ascend_onlinesoftmaxOp.cpp new file mode 100644 index 000000000..0f33d0fd3 --- /dev/null +++ b/examples/unittests/alp_ascend_onlinesoftmaxOp.cpp @@ -0,0 +1,173 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#define DEBUG + +#include + +using namespace alp; + + +// alp::Grid< 1, 4 > note: +// - Thread dimensionality = 1, means that the 1D thread grid maps to first +// axis of the problem grid. A refinement of this API may make this +// configurable. +template < typename GridType > +void ascend_code( const GridType &grid, RC &rc ) { + + // max shape = ( Tr,Tc,Br,Bc ) + // Tr = number for row-blocks, Br = row-length of rowblocks; Tr*Tc = N + // Tc = number for column-blocks, Bc = column-length of rowblocks; Tr*Tc = M + // for softmax N == M, i.e. 
Sin and Sout are square matrices + rc = alp::RC::FAILED; + + Tensor mtensorout( alp::Datatype::FP16, make_axes( 0, 2 ) ); // shape = ( Tr,Br ) + Tensor ltensorout( alp::Datatype::FP16, make_axes( 0, 2 ) ); // shape = ( Tr,Br )clear + Tensor Sin( alp::Datatype::FP16, make_axes( 0, 1, 2, 3 ) ); // shape = ( Tr,Tc,Br,Bc ) + Tensor Sout( alp::Datatype::FP16, make_axes( 0, 1, 2, 3 ) ); // shape = ( Tr,Tc,Br,Bc ) + + rc = grid.forEach( make_axes( 0 ), [ & ] () { + + auto m_block_out = getView( mtensorout ); + auto l_block_out = getView( ltensorout ); + +//--> + set( m_block_out, -alp::Infinity< double > ); // TODO the double should re replaced by alp::Datatype::FP16 +//--> + set( l_block_out, alp::Zero< double > ); //TODO + + grid.forEach( + make_axes( 1 ), // prallel loop- > for(i0=0; i0 1 ) { + printUsage = true; + } + + // print help on error + if( printUsage ) { + std::cerr << "Usage: " << argv[ 0 ] << "\n"; + return 10; + } + + // start opgen + std::cout << "//This is AscendOpGen example " << argv[ 0 ] << "\n"; + alp::RC error_code = alp::RC::SUCCESS; + try { + error_code = alp::compile< 1, 4 >( ascend_code, "onlinesoftmaxOp" ); + } catch( std::exception &e ) { + std::cerr << "alp::compile threw error: " << e.what() << "\n"; + return 20; + } + if( error_code != alp::RC::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Codegen FAILED (" << alp::toString( error_code ) << ")" + << std::endl; + return 30; + } else { + std::cout << "//Codegen OK" << std::endl; + return 0; + } + +} + diff --git a/examples/unittests/alp_ascend_softmaxOp.cpp b/examples/unittests/alp_ascend_softmaxOp.cpp new file mode 100644 index 000000000..51d55b953 --- /dev/null +++ b/examples/unittests/alp_ascend_softmaxOp.cpp @@ -0,0 +1,115 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#define DEBUG + +#include + +using namespace alp; + + +// alp::Grid< 1, 3 > note: +// - Thread dimensionality = 1, means that the 1D thread grid maps to first +// axis of the problem grid. A refinement of this API may make this +// configurable. +template < typename GridType > +void ascend_code( const GridType &grid, RC &rc ) { // max shape = ( Tr, Br, d ) + rc = RC::FAILED; + + Tensor Sin(Datatype::FP16, make_axes( "i", "j", "k" ) ); // shape = (Tr, Br, d) + Tensor Sout(Datatype::FP16, make_axes( "i", "j", "k" ) ); // shape = (Tr, Br, d) + // Tensor of rank R, has R strides, defined in order to iterate + // the memory container. i.e. S with shape = (Tr, Br, d) + // element (i,j,k) is located in i*(Br*d) + j*(d) + k position + // (this is not the only mapping indices -> memory location) + // These basic (Tr, Br, d) stride has to be inherited by any view created from that + // container, in order to be able to properly iterate the memory container. 
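+ // Worked example (illustrative shape only): with (Tr, Br, d) = (4, 8, 16),
+ // element (i, j, k) = (1, 2, 3) sits at offset 1*(8*16) + 2*16 + 3 = 163
+ // in the flat buffer.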
+ + + // forEach cuts the grid into small pieces that are processed concurrently + rc = grid.forEach( make_axes( "i" ), [ &Sin, &Sout ] () { + + auto S_block_in = getView( Sin ); // S_block_in allocate in UB ts0 x n1 x n2 + auto S_block_out = getView( Sout ); // S_block_out allocate in UB ts0 x n1 x n2 + + Tensor localTensor(Datatype::FP16, make_axes( "j" ) ); // localTensor allocate in UB ts0 x n1 + + // T(1) T(1,2) + // apply( localTensor, S_block_in, "max", make_axes( "k" ) ); // asc::max( localTensor, S_block_in, "2" ) + localTensor( "j" ) = max( S_block_in("j", "k" ), "k" ); + + // T(1,2) T(1,2) + // apply( S_block_out, S_block_in, localTensor, "minus", make_axes( "k" ) ); + S_block_out( "j", "k" ) = minus( S_block_in("j", "k" ), localTensor( "j" ) , "k" ); + + // T(1,2) + // apply( S_block_out, S_block_out, "exp", make_axes( "k" ) ); + foldl( S_block_out, "exp" ); + + // T(1) T(1,2) + // apply( localTensor, S_block_out, "add", make_axes( "k" ) ); + localTensor( "j" ) = add( S_block_out("j", "k"), "k" ); + + // T(1,2) T(1) + // apply( S_block_out, S_block_out, localTensor, "divide", make_axes( "k" ) ); + foldl( S_block_out, localTensor, "divide", make_axes( "k" ) ); + + store( S_block_out ); + + } ); +} + +int main( int argc, char ** argv ) { + + // default options + bool printUsage = false; + + // input error checking + if( argc > 1 ) { + printUsage = true; + } + + // print help on error + if( printUsage ) { + std::cerr << "Usage: " << argv[ 0 ] << "\n"; + return 10; + } + + // start opgen + std::cout << "//This is AscendOpGen example " << argv[ 0 ] << "\n"; + RC error_code = RC::SUCCESS; + try { + error_code = alp::compile< 1, 3 >( ascend_code, "softmaxOp" ); + } catch( std::exception &e ) { + std::cerr << "alp::compile threw error: " << e.what() << "\n"; + return 20; + } + if( error_code != RC::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Codegen FAILED (" << toString( error_code ) << ")" + << std::endl; + return 30; + } else { + std::cout << "//Codegen OK" << std::endl; + return 0; + } + +} + diff --git a/examples/unittests/alp_ascend_softmaxOpv1.cpp b/examples/unittests/alp_ascend_softmaxOpv1.cpp new file mode 100644 index 000000000..f079ad986 --- /dev/null +++ b/examples/unittests/alp_ascend_softmaxOpv1.cpp @@ -0,0 +1,112 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#define DEBUG + +#include + +using namespace alp; + + +// alp::Grid< 1, 3 > note: +// - Thread dimensionality = 1, means that the 1D thread grid maps to first +// axis of the problem grid. A refinement of this API may make this +// configurable. 
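+//
+// For reference, the axes-based calls below (max, minus, exp, add, divide over
+// axis "k") express the usual numerically stable row softmax. In plain C++ the
+// per-row computation is, as an illustrative sketch (`row` and `d` are
+// hypothetical names, not part of this API):
+//
+//   half m = row[ 0 ];
+//   for( size_t k = 1; k < d; ++k ) { if( row[ k ] > m ) { m = row[ k ]; } }
+//   half s = 0;
+//   for( size_t k = 0; k < d; ++k ) { row[ k ] = exp( row[ k ] - m ); s += row[ k ]; }
+//   for( size_t k = 0; k < d; ++k ) { row[ k ] = row[ k ] / s; }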
+template < typename GridType > +void ascend_code( const GridType &grid, RC &rc ) { // max shape = ( Tr, Br, d ) + rc = alp::RC::FAILED; + + Tensor Sin( alp::Datatype::FP16, make_axes( "i", "j", "k" ) ); // shape = (Tr, Br, d) + Tensor Sout( alp::Datatype::FP16, make_axes( "i", "j", "k" ) ); // shape = (Tr, Br, d) + + rc = grid.forEach( make_axes( "i" ), [ & ] () { + + rc = grid.forEach( make_axes( "j" ), [ & ] () { + + auto S_block_in = getView( Sin ); // T(2) + auto S_block_out = getView( Sout ); // T(2) + Tensor localTensor( alp::Datatype::FP16, make_axes( ) ); // T() + //Scalar localTensor( alp::Datatype::FP16 ); + + // T() T(2) -> ReduceMax( A, B, n2 ) + // apply( localTensor, S_block_in, "max", make_axes( "k" ) ); + localTensor( "j" ) = max( S_block_in("j", "k" ), "k" ); + + // T(2) T(2) T() -> BcastMinus( A, B, C, n2 ) + // apply( S_block_out, S_block_in, localTensor, "minus", make_axes( "k" ) ); + S_block_out( "j", "k" ) = minus( S_block_in("j", "k" ), localTensor( "j" ) , "k" ); + + // T(2) -> InplaceExp( A, n2 ) + foldl( S_block_out, "exp" ); + + // T() T(2) -> ReduceAdd( A, B, n2 ) + // apply( localTensor, S_block_out, "add", make_axes( "k" ) ); + localTensor( "j" ) = add( S_block_out("j", "k"), "k" ); + + // T(2) T() -> BcastDivide( A, B, n2 ) + foldl( S_block_out, localTensor, "divide", make_axes( "k" ) ); + + // T(2) + store( S_block_out ); + + } ); + + } ); + + return; +} + +int main( int argc, char ** argv ) { + + // default options + bool printUsage = false; + + // input error checking + if( argc > 1 ) { + printUsage = true; + } + + // print help on error + if( printUsage ) { + std::cerr << "Usage: " << argv[ 0 ] << "\n"; + return 10; + } + + // start opgen + std::cout << "//This is AscendOpGen example " << argv[ 0 ] << "\n"; + alp::RC error_code = alp::RC::SUCCESS; + try { + error_code = alp::compile< 1, 3 >( ascend_code, "softmaxOpv1" ); + } catch( std::exception &e ) { + std::cerr << "alp::compile threw error: " << e.what() << "\n"; + return 20; + } + if( error_code != alp::RC::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Codegen FAILED (" << alp::toString( error_code ) << ")" + << std::endl; + return 30; + } else { + std::cout << "//Codegen OK" << std::endl; + return 0; + } + +} + diff --git a/examples/unittests/alp_ascend_softmaxOpv3.cpp b/examples/unittests/alp_ascend_softmaxOpv3.cpp new file mode 100644 index 000000000..c417e5406 --- /dev/null +++ b/examples/unittests/alp_ascend_softmaxOpv3.cpp @@ -0,0 +1,111 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#define DEBUG + +#include + +using namespace alp; + + +// alp::Grid< 1, 4 > note: +// - Thread dimensionality = 1, means that the 1D thread grid maps to first +// axis of the problem grid. A refinement of this API may make this +// configurable. 
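+//
+// softmaxOpv3 is the rank-4 variant: nested forEach over "i" and "j", views of
+// shape ("k","l") per iteration, reduction over "l". It presumably pairs with
+// the generated softmax_custom-v3.cpp earlier in this diff, which loops over
+// (i0, i1) and processes n2 x n3 blocks.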
+template < typename GridType > +void ascend_code( const GridType &grid, RC &rc ) { + rc = alp::RC::FAILED; + + Tensor Sin( alp::Datatype::FP16, make_axes( "i", "j", "k", "l" ) ); + Tensor Sout( alp::Datatype::FP16, make_axes( "i", "j", "k", "l" ) ); + + rc = grid.forEach( make_axes( "i" ), [ & ] () { + + rc = grid.forEach( make_axes( "j" ), [ & ] () { + + auto S_block_in = getView( Sin ); // T(2,3) + auto S_block_out = getView( Sout ); // T(2,3) + Tensor localTensor( alp::Datatype::FP16, make_axes( "k" ) ); // T(2) + + // T(2) T(2,3) + // apply( localTensor, S_block_in, "max", make_axes( "l" ) ); + localTensor( "k" ) = max( S_block_in("k", "l" ), "l" ); + + // T(2,3) T(2,3) T(2) + // apply( S_block_out, S_block_in, localTensor, "minus", make_axes( "l" ) ); + S_block_out( "k", "l" ) = minus( S_block_in("k", "l" ), localTensor( "k" ) , "l" ); + + // T(2,3) + foldl( S_block_out, "exp" ); + + // T(2) T(2,3) + // apply( localTensor, S_block_out, "add", make_axes( "l" ) ); + localTensor( "k" ) = add( S_block_out("k", "l" ), "l" ); + + // T(2,3) T(2) + foldl( S_block_out, localTensor, "divide", make_axes( "k" ) ); + + // T(2,3) + store( S_block_out ); + + } ); + + } ); + + return; +} + +int main( int argc, char ** argv ) { + + // default options + bool printUsage = false; + + // input error checking + if( argc > 1 ) { + printUsage = true; + } + + // print help on error + if( printUsage ) { + std::cerr << "Usage: " << argv[ 0 ] << "\n"; + return 10; + } + + // start opgen + std::cout << "//This is AscendOpGen example " << argv[ 0 ] << "\n"; + alp::RC error_code = alp::RC::SUCCESS; + try { + error_code = alp::compile< 1, 4 >( ascend_code, "softmaxOpv3" ); + } catch( std::exception &e ) { + std::cerr << "alp::compile threw error: " << e.what() << "\n"; + return 20; + } + if( error_code != alp::RC::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Codegen FAILED (" << alp::toString( error_code ) << ")" + << std::endl; + return 30; + } else { + std::cout << "//Codegen OK" << std::endl; + return 0; + } + +} + diff --git a/examples/unittests/alp_ascend_softmaxOpv4.cpp b/examples/unittests/alp_ascend_softmaxOpv4.cpp new file mode 100644 index 000000000..3b1283b87 --- /dev/null +++ b/examples/unittests/alp_ascend_softmaxOpv4.cpp @@ -0,0 +1,114 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#define DEBUG + +#include + +using namespace alp; + + +// alp::Grid< 1, 6 > note: +// - Thread dimensionality = 1, means that the 1D thread grid maps to first +// axis of the problem grid. A refinement of this API may make this +// configurable. 
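+//
+// softmaxOpv4 declares six axes and nests forEach over "i", "j", "l" and "m",
+// reducing over "n". This appears to correspond to the generated
+// softmax_custom-v4.cpp above, whose Process() loops over i0, i1, i3 and i4
+// and copies strided blocks of n2 rows by n5 columns (stride n3*n4*n5) before
+// applying the same max / minus / exp / add / divide sequence per row.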
+template < typename GridType > +void ascend_code( const GridType &grid, RC &rc ) { + rc = RC::FAILED; + + Tensor Sin(Datatype::FP16, make_axes( "i", "j", "k", "l", "m", "n" ) ); + Tensor Sout(Datatype::FP16, make_axes( "i", "j", "k", "l", "m", "n" ) ); + + rc = grid.forEach( make_axes( "i" ), [ & ] () { + + rc = grid.forEach( make_axes( "j" ), [ & ] () { + + rc = grid.forEach( make_axes( "l" ), [ & ] () { + + rc = grid.forEach( make_axes( "m" ), [ & ] () { + + auto S_block_in = getView( Sin ); // T(2,5) + auto S_block_out = getView( Sout ); // T(2,5) + Tensor localTensor(Datatype::FP16, make_axes( "k" ) ); // T(2) + + // T(2) T(2,5) + // apply( localTensor, S_block_in, "max", make_axes( "n" ) ); + localTensor( "k" ) = max( S_block_in("k", "n" ), "n" ); + + // T(2,5) T(2,5) T(2) + // apply( S_block_out, S_block_in, localTensor, "minus", make_axes( "n" ) ); + S_block_out( "k", "n" ) = minus( S_block_in("k", "n" ), localTensor( "k" ) , "n" ); + + // T(2,5) + foldl( S_block_out, "exp" ); + + // T(2) T(2,5) + // apply( localTensor, S_block_out, "add", make_axes( "n" ) ); + localTensor( "k" ) = add( S_block_out("k", "n" ), "n" ); + + // T(2,5) T(2) + foldl( S_block_out, localTensor, "divide", make_axes( "n" ) ); + + // T(2,5) + store( S_block_out ); + + } ); + } ); + } ); + } ); +} + +int main( int argc, char ** argv ) { + + // default options + bool printUsage = false; + + // input error checking + if( argc > 1 ) { + printUsage = true; + } + + // print help on error + if( printUsage ) { + std::cerr << "Usage: " << argv[ 0 ] << "\n"; + return 10; + } + + // start opgen + std::cout << "//This is AscendOpGen example " << argv[ 0 ] << "\n"; + RC error_code = RC::SUCCESS; + try { + error_code = alp::compile< 1, 6 >( ascend_code, "softmaxOpv4" ); + } catch( std::exception &e ) { + std::cerr << "alp::compile threw error: " << e.what() << "\n"; + return 20; + } + if( error_code != RC::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Codegen FAILED (" << toString( error_code ) << ")" + << std::endl; + return 30; + } else { + std::cout << "//Codegen OK" << std::endl; + return 0; + } + +} + diff --git a/examples/unittests/check_data_movedataOp-v01.py b/examples/unittests/check_data_movedataOp-v01.py new file mode 100644 index 000000000..863e52d40 --- /dev/null +++ b/examples/unittests/check_data_movedataOp-v01.py @@ -0,0 +1,83 @@ +#!/usr/bin/python3 +# -*- coding:utf-8 -*- +# Copyright 2022-2023 Huawei Technologies Co., Ltd +import numpy as np +import copy +import glob +import re +import sys + +class bcolors: + HEADER = '\033[95m' + OKBLUE = '\033[94m' + OKCYAN = '\033[96m' + OKGREEN = '\033[92m' + WARNING = '\033[93m' + FAIL = '\033[91m' + ENDC = '\033[0m' + BOLD = '\033[1m' + +def check_golden_data(): + check = True + tol = 1.e-2 + + goldenfilename = "./output/golden.bin" + + outfiles=glob.glob("./output/param1.bin") + # sort outfiles + if(len(outfiles)>1): + ii=[ int(re.search(r'\d+', fname).group()) for fname in outfiles ] + outfiles=np.array(outfiles)[np.argsort(ii)] + + n0=int(sys.argv[1]) + n1=int(sys.argv[2]) + n2=int(sys.argv[3]) + + M = n0*n1 + N = n2 + + dtype = np.float16 + print("(N,M)=",N,M) + golden = np.fromfile( goldenfilename, dtype=dtype ) + + # print(f"Golden: {golden[:10]}") + # print(f"Output: {output[:10]}") + + print("Golden:") + reshaped_golden = np.reshape(golden, (M, N) ) + for pos, row in enumerate( reshaped_golden[[0,1,-2,-1]] ): + print(f"{pos}: {row}") + + + for outfilename in outfiles: + + output = np.fromfile( outfilename, dtype=dtype ) + + + 
print("Output:",outfilename) + reshaped_output = np.reshape(output, (M, N) ) + for pos, row in enumerate( reshaped_output[[0,1,-2,-1],:20] ): + print(f"{pos}: {row}") + + # diff = (golden.astype(float) - output.astype(float))**2 + # diff = np.cumsum((diff.flatten())) + # print("Diff**2:") + # reshaped_output = np.reshape(diff, (M, N) ) + # for pos, row in enumerate( reshaped_output[[0,1,-2,-1]] ): + # print(f"{pos}: {row}") + + norm_diff = np.linalg.norm( (golden.astype(float) - output.astype(float)) ) + norm_diff_relative=norm_diff/np.linalg.norm( golden.astype(float) ) + print(f"Comparing in {goldenfilename} and {outfilename} (size {np.shape(golden)} and type {dtype}): absolute = {norm_diff} : relative = {norm_diff_relative} ") + check = check and (norm_diff_relative1): + ii=[ int(re.search(r'\d+', fname).group()) for fname in outfiles ] + outfiles=np.array(outfiles)[np.argsort(ii)] + + n0=int(sys.argv[1]) + n1=int(sys.argv[2]) + n2=int(sys.argv[3]) + + M = n0*n1 + N = n2 + + dtype = np.float16 + print("(N,M)=",N,M) + golden = np.fromfile( goldenfilename, dtype=dtype ) + + # print(f"Golden: {golden[:10]}") + # print(f"Output: {output[:10]}") + + print("Golden:") + reshaped_golden = np.reshape(golden, (M, N) ) + for pos, row in enumerate( reshaped_golden[[0,1,-2,-1]] ): + print(f"{pos}: {row}") + + + for outfilename in outfiles: + + output = np.fromfile( outfilename, dtype=dtype ) + + + print("Output:",outfilename) + reshaped_output = np.reshape(output, (M, N) ) + for pos, row in enumerate( reshaped_output[[0,1,-2,-1],:20] ): + print(f"{pos}: {row}") + + # diff = (golden.astype(float) - output.astype(float))**2 + # diff = np.cumsum((diff.flatten())) + # print("Diff**2:") + # reshaped_output = np.reshape(diff, (M, N) ) + # for pos, row in enumerate( reshaped_output[[0,1,-2,-1]] ): + # print(f"{pos}: {row}") + + norm_diff = np.linalg.norm( (golden.astype(float) - output.astype(float)) ) + norm_diff_relative=norm_diff/np.linalg.norm( golden.astype(float) ) + print(f"Comparing in {goldenfilename} and {outfilename} (size {np.shape(golden)} and type {dtype}): absolute = {norm_diff} : relative = {norm_diff_relative} ") + check = check and (norm_diff_relative1): + ii=[ int(re.search(r'\d+', fname).group()) for fname in outfiles ] + outfiles=np.array(outfiles)[np.argsort(ii)] + + n0=int(sys.argv[1]) + n1=int(sys.argv[2]) + n2=int(sys.argv[3]) + n3=int(sys.argv[4]) + + M = n0*n1 + N = n2*n3 + shape1=(n0,n1,n2,n3) + + dtype = np.float16 + + printblocks=[(0,0),(0,n1-1),(n0-1,n1-1),(n0-1,0)] + + golden = np.fromfile( goldenfilename, dtype=dtype ) + + # print(f"Golden: {golden[:10]}") + # print(f"Output: {output[:10]}") + + + print("Golden:") + reshaped_golden = np.reshape(golden, shape1 ) + # for i,j in printblocks: + # print("i=",i," j=",j) + # for i2 in range(n2): + # print(reshaped_golden[i,j,i2,:]) + + + for outfilename in outfiles: + + output = np.fromfile( outfilename, dtype=dtype ) + print("Output:",outfilename) + reshaped_output = np.reshape(output, shape1 ) + # for i,j in printblocks: + # print("i=",i," j=",j) + # for i2 in range(n2): + # print(reshaped_output[i,j,i2,:]) + + # diff = (golden.astype(float) - output.astype(float))**2 + # diff = np.cumsum((diff.flatten())) + # print("Diff**2:") + # reshaped_output = np.reshape(diff, (M, N) ) + # for pos, row in enumerate( reshaped_output[[0,1,-2,-1]] ): + # print(f"{pos}: {row}") + + norm_diff = np.linalg.norm( (golden.astype(float) - output.astype(float)) ) + norm_diff_relative=norm_diff/np.linalg.norm( golden.astype(float) ) + 
print(f"Comparing in {goldenfilename} and {outfilename} (size {np.shape(golden)} and type {dtype}): absolute = {norm_diff} : relative = {norm_diff_relative} ") + # for i0 in range(n0): + # for i1 in range(n1): + # for i2 in range(n2): + # rownorm=np.linalg.norm( reshaped_output[i0,i1,i2,:]-reshaped_golden[i0,i1,i2,:]) + # #print(i0,i1,i2,rownorm) + # if(rownorm>1.e-10): + # print(i0,i1,i2,rownorm) + # print(" output=",reshaped_output[i0,i1,i2,:]) + # print(" golden=",reshaped_golden[i0,i1,i2,:]) + check = check and (norm_diff_relative1): + ii=[ int(re.search(r'\d+', fname).group()) for fname in outfiles ] + outfiles=np.array(outfiles)[np.argsort(ii)] + + n0=int(sys.argv[1]) + n1=int(sys.argv[2]) + n2=int(sys.argv[3]) + n3=int(sys.argv[4]) + n4=int(sys.argv[5]) + n5=int(sys.argv[6]) + + N = n0*n1*n2*n3*n4*n5 + shape1=(n0,n1,n2,n3,n4,n5) + + dtype = np.float16 + + # printblocks=[(0,0),(0,n1-1),(n0-1,n1-1),(n0-1,0)] + + golden = np.fromfile( goldenfilename, dtype=dtype ) + + # print(f"Golden: {golden[:10]}") + # print(f"Output: {output[:10]}") + + + print("Golden:") + reshaped_golden = np.reshape(golden, shape1 ) + + for outfilename in outfiles: + + output = np.fromfile( outfilename, dtype=dtype ) + print("Output:",outfilename) + reshaped_output = np.reshape(output, shape1 ) + + + # for i0 in range(n0): + # for i1 in range(n1): + # for i2 in range(n2): + # for i3 in range(n3): + # for i4 in range(n4): + # tmp_diff=np.linalg.norm(reshaped_output[i0,i1,i2,i3,i4,:]-reshaped_golden[i0,i1,i2,i3,i4,:]) + # print(i0,i1,i2,i3,i4, + # " d=",tmp_diff, + # " o=",reshaped_output[i0,i1,i2,i3,i4,:2],".", + # " g=",reshaped_golden[i0,i1,i2,i3,i4,:2],".") + + + norm_diff = np.linalg.norm( (golden.astype(float) - output.astype(float)) ) + norm_diff_relative=norm_diff/np.linalg.norm( golden.astype(float) ) + print(f"Comparing in {goldenfilename} and {outfilename} (size {np.shape(golden)} and type {dtype}): absolute = {norm_diff} : relative = {norm_diff_relative} ") + check = check and (norm_diff_relative1): + ii=[ int(re.search(r'\d+', fname).group()) for fname in outfiles ] + outfiles=np.array(outfiles)[np.argsort(ii)] + + n0=int(sys.argv[1]) + n1=int(sys.argv[2]) + n2=int(sys.argv[3]) + + M = n0*n1 + N = n2 + + dtype = np.float16 + print("(N,M)=",N,M) + golden = np.fromfile( goldenfilename, dtype=dtype ) + + # print(f"Golden: {golden[:10]}") + # print(f"Output: {output[:10]}") + + print("Golden:") + reshaped_golden = np.reshape(golden, (M, N) ) + for pos, row in enumerate( reshaped_golden[[0,1,-2,-1]] ): + print(f"{pos}: {row}") + + + for outfilename in outfiles: + + output = np.fromfile( outfilename, dtype=dtype ) + + + print("Output:",outfilename) + reshaped_output = np.reshape(output, (M, N) ) + for pos, row in enumerate( reshaped_output[[0,1,-2,-1],:20] ): + print(f"{pos}: {row}") + + # diff = (golden.astype(float) - output.astype(float))**2 + # diff = np.cumsum((diff.flatten())) + # print("Diff**2:") + # reshaped_output = np.reshape(diff, (M, N) ) + # for pos, row in enumerate( reshaped_output[[0,1,-2,-1]] ): + # print(f"{pos}: {row}") + + norm_diff = np.linalg.norm( (golden.astype(float) - output.astype(float)) ) + norm_diff_relative=norm_diff/np.linalg.norm( golden.astype(float) ) + print(f"Comparing in {goldenfilename} and {outfilename} (size {np.shape(golden)} and type {dtype}): absolute = {norm_diff} : relative = {norm_diff_relative} ") + check = check and (norm_diff_relative /dev/null && pwd ) +ALP_ROOT=$( realpath $SCRIPT_DIR/../../ ) +CWD=$(pwd) +KERNELNAME=$TNAME +if [[ "$ASCEND_CPU_MODE" == 
"ON" ]] ; then MODE="cpu"; else MODE="npu"; fi +TARGET=${TNAME}_${MODE} +TARGET_cmake=alp_ascend_${TNAME}_ascend +mkdir -p $CWD/src +hostfile_default="$CWD/src/host_ascend_${TNAME}.cpp" +[ -z ${HOST_TEST_TEMPLATE} ] && HOST_TEST_TEMPLATE=$(pwd)/HOST_TEST_TEMPLATE.cpp +HOST_CODE_INP=$CWD/src/generate_host_code_${TNAME}.inp + +bashargn=$# + +[ -z ${ASCEND_VERSION} ] && echo "ASCEND_VERSION not set" && exit 1 +! [[ "$ASCEND_VERSION" =~ (910A|910B) ]] && echo "ASCEND_VERSION possible values: 910A 910B" && exit 1 +[ -z ${ALP_ROOT} ] && echo "ALP_ROOT not set" && exit 1 +[ -z ${KERNELNAME} ] && echo "KERNELNAME not set" && exit 1 +[ -z ${TARGET} ] && echo "TARGET not set" && exit 1 +[ -z ${TARGET_cmake} ] && echo "TARGET_cmake not set" && exit 1 +[ -z ${CWD} ] && echo "CWD not set" && exit 1 +[ -z ${HOST_TEST_TEMPLATE} ] && echo "HOST_TEST_TEMPLATE not set" && exit 1 +[ -z ${HOST_CODE_INP} ] && echo "HOST_CODE_INP not set" && exit 1 +if [ -z ${ASCEND_HOME_PATH} ] +then + trydir01="/usr/local/Ascend/ascend-toolkit/latest" + if [ -d "$trydir01" ] + then + ASCEND_HOME_PATH="$trydir01" + fi +fi +[ -z ${ASCEND_HOME_PATH} ] && echo "ASCEND_HOME_PATH not set" && exit 1 + + +if [[ "$bashargn" == 2 ]] +then + #use provided code + opfile=$1 + hostfile=$2 + + opfile=$(realpath $opfile) + hostfile=$(realpath $hostfile) + echo "opfile=$opfile" + echo "hostfile=$hostfile" +else + #generate the code + mkdir -p src + opfile="src/${KERNELNAME}_npu_op.cpp" + hostfile="$hostfile_default" + + #cleanup any previous output + mkdir -p bin + #build ALP code gnerator, i.e. ascend_softmaxOp_ascend executable + if [ -z "$BUILD_DIR" ] + then + echo "BUILD_DIR is not set, create tmp BUILD_DIR in /tmp/build_alp/"; + rm -rf /tmp/build_alp/ + mkdir /tmp/build_alp/ && cd /tmp/build_alp/ && cmake $ALP_ROOT && make -j$(nproc) $TARGET_cmake && cd $CWD || { echo "codegen build failed" && exit 1; } + BUILD_DIR=/tmp/build_alp/ + else + echo "reuse BUILD_DIR"; + mkdir -p $BUILD_DIR + cd $BUILD_DIR && cmake $ALP_ROOT && make -j$(nproc) $TARGET_cmake && cd $CWD || { echo "codegen build failed" && exit 1; } + + fi + + # make devicecode + cd src + make $opfile devicefile="$opfile" target_cmake="$BUILD_DIR/examples/$TARGET_cmake" -f ${CWD}/Makefile || { echo "generate device code failed " && exit 1; } + cd .. + ls $opfile || { echo "$opfile not generated" && exit 1; } + + generate_host=$(pwd)/generate_host_code.py + + opfile=$(realpath $opfile) + hostfile=$(realpath $hostfile) + + # make hostcode + make $hostfile hostfile="$hostfile" generate_host="$generate_host" target_cmake="$BUILD_DIR/examples/$TARGET_cmake" host_template="$HOST_TEST_TEMPLATE" host_code_inp="$HOST_CODE_INP" ALP_ROOT="$ALP_ROOT" ASCEND_HOME_PATH="$ASCEND_HOME_PATH" ASCEND_VERSION="$ASCEND_VERSION" ASCEND_CPU_MODE="$ASCEND_CPU_MODE" -f ${CWD}/Makefile || { echo "generate host code failed " && exit 1; } + +fi + + + +mkdir -p bin +cd bin +make $TARGET target=$TARGET hostfile="$hostfile" devicefile="$opfile" ALP_ROOT="$ALP_ROOT" ASCEND_HOME_PATH="$ASCEND_HOME_PATH" ASCEND_VERSION="$ASCEND_VERSION" ASCEND_CPU_MODE="$ASCEND_CPU_MODE" -f ${CWD}/Makefile || { echo "ascend build failed" && exit 1; } +cd ../ + diff --git a/examples/unittests/compile_and_run_addOp.sh b/examples/unittests/compile_and_run_addOp.sh new file mode 100755 index 000000000..0d49b6f40 --- /dev/null +++ b/examples/unittests/compile_and_run_addOp.sh @@ -0,0 +1,42 @@ +TNAME="addOp" + +. 
compile.sh + +#generate input data in "input" directory +# and the reference output data in "output" directory +cd bin +rm -f runtime*.csv +for n in {0..0} +do + for axes in "256 2048" "512 2048" "1024 2048" "2048 2048" "4096 2048" "8192 2048" + do + rm -rf input output + + echo "generate input" + mkdir -p input + mkdir -p output + python3 ../make_data_addOp.py ${axes} || { echo "$TARGET make data failed" && exit 1; } + + echo "run ascend example" + echo "./$TARGET" + ./$TARGET ${axes} || { echo "$TARGET returned error" && exit 1; } + + #check the result correctness + echo "compare md5sum : ";md5sum output/*.bin + md5_ref=($(md5sum output/golden.bin)) + md5_res=($(md5sum output/param2.bin)) + RED='\033[0;31m' + GREEN='\033[0;32m' + DEF='\033[0m' + if [ "$md5_ref" == "$md5_res" ] + then + printf "${GREEN}Test OK!${DEF}\n" + else + printf "${RED}Test FAILED!${DEF}\n" + exit 1 + fi + done +done + + + diff --git a/examples/unittests/compile_and_run_addOpv1.sh b/examples/unittests/compile_and_run_addOpv1.sh new file mode 100755 index 000000000..c485286f9 --- /dev/null +++ b/examples/unittests/compile_and_run_addOpv1.sh @@ -0,0 +1,42 @@ +TNAME="addOpv1" + +. compile.sh + +#generate input data in "input" directory +# and the reference output data in "output" directory +cd bin +rm -f runtime*.csv +for n in {0..0} +do + for axes in "256" "512" "1024" "2048" "4096" "8192" + do + rm -rf input output + + echo "generate input" + mkdir -p input + mkdir -p output + python3 ../make_data_addOpv1.py ${axes} || { echo "$TARGET make data failed" && exit 1; } + + echo "run ascend example" + echo "./$TARGET" + ./$TARGET ${axes} || { echo "$TARGET returned error" && exit 1; } + + #check the result correctness + echo "compare md5sum : ";md5sum output/*.bin + md5_ref=($(md5sum output/golden.bin)) + md5_res=($(md5sum output/param2.bin)) + RED='\033[0;31m' + GREEN='\033[0;32m' + DEF='\033[0m' + if [ "$md5_ref" == "$md5_res" ] + then + printf "${GREEN}Test OK!${DEF}\n" + else + printf "${RED}Test FAILED!${DEF}\n" + exit 1 + fi + done +done + + + diff --git a/examples/unittests/compile_and_run_movedataOp-v01.sh b/examples/unittests/compile_and_run_movedataOp-v01.sh new file mode 100755 index 000000000..47acf98ea --- /dev/null +++ b/examples/unittests/compile_and_run_movedataOp-v01.sh @@ -0,0 +1,30 @@ +TNAME="movedataOpv01" + +. compile.sh + +#generate input data in "input" directory +# and the reference output data in "output" directory +cd bin/ +rm -f runtime*.csv +for n in {0..0} +do + for axes in "32 16 16" "64 16 16" "128 16 16" "256 16 16" "512 16 16" + do + echo "axes=$axes" + rm -rf input output + + echo "generate input" + mkdir -p input + mkdir -p output + python3 ../make_data_movedataOp-v01.py $axes || { echo "$TARGET data generation failed" && exit 1; } + + echo "run ascend example" + echo "./$TARGET $axes" + ./$TARGET $axes || { echo "$TARGET failed" && exit 1; } + + python3 ../check_data_movedataOp-v01.py $axes || { echo "$TARGET check failed" && exit 1; } + done +done + + + diff --git a/examples/unittests/compile_and_run_onlinesoftmaxOp.sh b/examples/unittests/compile_and_run_onlinesoftmaxOp.sh new file mode 100755 index 000000000..82c78b301 --- /dev/null +++ b/examples/unittests/compile_and_run_onlinesoftmaxOp.sh @@ -0,0 +1,26 @@ +TNAME="onlinesoftmaxOp" + +. 
compile.sh + +#generate input data in "input" directory +# and the reference output data in "output" directory +cd bin/ +rm -f runtime*.csv +for n in {0..0} +do + for axes in "16 32 16 16" "16 32 32 16" "16 32 32 32" "16 32 32 64" "32 16 16 64" + do + rm -rf input output + + echo "generate input" + mkdir -p input + mkdir -p output + python3 ../make_data_onlinesoftmaxOp.py $axes || { echo "$TARGET make data failed" && exit 1; } + + echo "run ascend example" + ./$TARGET $axes || { echo "$TARGET failed" && exit 1; } + + python3 ../check_data_onlinesoftmaxOp.py $axes || { echo "$TARGET check failed" && exit 1; } + done + +done diff --git a/examples/unittests/compile_and_run_softmaxOp-v1.sh b/examples/unittests/compile_and_run_softmaxOp-v1.sh new file mode 100755 index 000000000..475e6cf59 --- /dev/null +++ b/examples/unittests/compile_and_run_softmaxOp-v1.sh @@ -0,0 +1,27 @@ +TNAME="softmaxOpv1" + +. compile.sh + +#generate input data in "input" directory +# and the reference output data in "output" directory +cd bin/ +rm -f runtime*.csv +for n in {0..0} +do + for axes in "8 32 64" "8 32 128" "8 256 128" "32 128 128" + do + rm -rf input output + + echo "generate input" + mkdir -p input + mkdir -p output + python3 ../make_data_softmaxOp-v1.py $axes || { echo "$TARGET make data failed" && exit 1; } + + echo "run ascend example" + echo "./$TARGET $axes" + ./$TARGET $axes || { echo "$TARGET failed" && exit 1; } + + python3 ../check_data_softmaxOp-v1.py $axes || { echo "$TARGET check failed" && exit 1; } + done + +done diff --git a/examples/unittests/compile_and_run_softmaxOp-v3.sh b/examples/unittests/compile_and_run_softmaxOp-v3.sh new file mode 100755 index 000000000..38bc4e025 --- /dev/null +++ b/examples/unittests/compile_and_run_softmaxOp-v3.sh @@ -0,0 +1,27 @@ +TNAME="softmaxOpv3" + +. compile.sh + +#generate input data in "input" directory +# and the reference output data in "output" directory +cd bin/ +rm -f runtime*.csv +for n in {0..0} +do + for axes in "16 32 16 16" "16 32 32 16" "16 32 32 32" "16 32 32 64" "32 16 16 64" + do + rm -rf input output + + echo "generate input" + mkdir -p input + mkdir -p output + python3 ../make_data_softmaxOp-v3.py $axes || { echo "$TARGET make data failed" && exit 1; } + + echo "run ascend example" + echo "./$TARGET $axes" + ./$TARGET $axes || { echo "$TARGET failed" && exit 1; } + + python3 ../check_data_softmaxOp-v3.py $axes || { echo "$TARGET check failed" && exit 1; } + done + +done diff --git a/examples/unittests/compile_and_run_softmaxOp-v4.sh b/examples/unittests/compile_and_run_softmaxOp-v4.sh new file mode 100755 index 000000000..84a3fdf89 --- /dev/null +++ b/examples/unittests/compile_and_run_softmaxOp-v4.sh @@ -0,0 +1,27 @@ +TNAME="softmaxOpv4" + +. 
compile.sh + +#generate input data in "input" directory +# and the reference output data in "output" directory +cd bin/ +rm -f runtime*.csv +for n in {0..0} +do + for axes in "16 4 16 8 4 16" "16 4 16 8 4 128" "32 4 16 8 4 16" "16 4 32 8 4 16" + do + rm -rf input output + + echo "generate input" + mkdir -p input + mkdir -p output + python3 ../make_data_softmaxOp-v4.py $axes || { echo "$TARGET make data failed" && exit 1; } + + echo "run ascend example" + echo "./$TARGET $axes" + ./$TARGET $axes || { echo "$TARGET failed" && exit 1; } + + python3 ../check_data_softmaxOp-v4.py $axes || { echo "$TARGET check failed" && exit 1; } + done + +done diff --git a/examples/unittests/compile_and_run_softmaxOp.sh b/examples/unittests/compile_and_run_softmaxOp.sh new file mode 100755 index 000000000..630226558 --- /dev/null +++ b/examples/unittests/compile_and_run_softmaxOp.sh @@ -0,0 +1,30 @@ +TNAME="softmaxOp" + +. compile.sh + +#generate input data in "input" directory +# and the reference output data in "output" directory +cd bin/ +rm -f runtime*.csv +for n in {0..0} +do + for axes in "1024 32 128" "1024 128 64" "1024 128 128" "1024 256 64" + do + rm -rf input output + + echo "generate input" + mkdir -p input + mkdir -p output + python3 ../make_data_softmaxOp.py $axes || { echo "$TARGET data generation failed" && exit 1; } + + echo "run ascend example" + echo "./$TARGET $axes" + ./$TARGET $axes || { echo "$TARGET failed" && exit 1; } + + python3 ../check_data_softmaxOp.py $axes || { echo "$TARGET check failed" && exit 1; } + done + +done + + + diff --git a/examples/unittests/generate_host_code.py b/examples/unittests/generate_host_code.py new file mode 100644 index 000000000..852ca23f3 --- /dev/null +++ b/examples/unittests/generate_host_code.py @@ -0,0 +1,287 @@ +#!/usr/bin/python3 +# -*- coding:utf-8 -*- +# Copyright 2022-2023 Huawei Technologies Co., Ltd +import numpy as np +import copy +import re +import argparse + +host_code_template="" + +class Tensor: + """A simple tensor class""" + def __init__(self,grid,axes,inout,tid): + assert( inout=="in" or inout=="out" ) + self.grid=grid + self.axes=axes + self.inout=inout + self.tid=tid + + self.paramName="param"+str(tid)+inout + self.paramHostname=self.paramName+"Host" + self.paramDevicename=self.paramName+"Device" + self.paramFileSize=self.paramName+"FileSize" + + if( inout=="out" ): + self.paramFileNameOut='"./output/param'+str(tid)+'.bin"' + else: + self.paramFileNameIn='"./input/input'+str(tid)+'.bin"' + + self.paramFileSizeExpr=" * ".join(["_n"+str(k) for k in self.axes]) + + def print(self): + print(" Tensor[ grid = " ,self.grid, " axes= ", self.axes, " inout= ", self.inout, " tid = ", self.tid , " ]" ) + + +def parse_tensor_line(problem_grid_in,LineIn): + tensors_all_str=np.array((LineIn.split()[0]).split(",")) + i1=np.where(tensors_all_str=="in")[0] + i2=np.where(tensors_all_str=="out")[0] + ii=np.sort(np.concatenate((i1,i2))) + tensors_all=np.split(tensors_all_str,ii+1)[:-1] + tensors_axes=[ np.array(a[:-1]).astype(int) for a in tensors_all] + tensors_inout=[ a[-1] for a in tensors_all] + tensors_all_list=[ Tensor(problem_grid_in,a,io,tid) for tid,(a,io) in enumerate(zip(tensors_axes,tensors_inout)) ] + return(tensors_all_list) + +def get_grid_from_mcd(grid,tabs="\t"): + s="" + for i,n in enumerate(grid): + s=s+tabs+"uint32_t _n"+str(n)+" = atoi(argv["+str(i+1)+"]);\n" + return(s) + +def get_declaretensorsizes(tensors,tabs="\t"): + s="" + for t in tensors: + s=s+tabs+"size_t "+t.paramFileSize+" = "+t.paramFileSizeExpr+" * sizeof( DTYPE );\n" 
+ return(s) + +def get_host_alloc(tensors,tabs="\t"): + s="" + for t in tensors: + s=s+tabs+"uint8_t *"+t.paramHostname+";\n" + s=s+tabs+"CHECK_ACL(aclrtMallocHost((void**)(&"+t.paramHostname+"), "+t.paramFileSize+"));\n" + return(s) + +def get_host_readfiles(tensors,tabs="\t"): + s="" + for t in tensors: + if( t.inout == "in" ): + s=s+tabs+'ReadFile('+t.paramFileNameIn+', '+t.paramFileSize+', '+t.paramHostname+', '+t.paramFileSize+');\n' + return(s) + +def get_device_alloc(tensors,tabs="\t"): + s="" + for t in tensors: + s=s+tabs+"uint8_t *"+t.paramDevicename+";\n" + s=s+tabs+"CHECK_ACL(aclrtMalloc((void**)(&"+t.paramDevicename+"), "+t.paramFileSize+", ACL_MEM_MALLOC_HUGE_FIRST));\n" + return(s) + +def get_host2device_move(tensors,tabs="\t\t"): + s="" + for t in tensors: + if( t.inout == "in" ): + s=s+tabs+"CHECK_ACL(aclrtMemcpy("+t.paramDevicename+", "+t.paramFileSize+", "+t.paramHostname+", "+t.paramFileSize+", ACL_MEMCPY_HOST_TO_DEVICE));\n" + return(s) + +def get_devicetensor_arglist(tensors,tabs="\t\t\t"): + s=tabs+",".join([t.paramDevicename for t in tensors]) + return(s) + +def get_alldim_list(grid,tabs="\t\t\t"): + s=tabs+", ".join(["_n"+str(k) for k in grid]) + return(s) + +def get_device2host_move(tensors,tabs="\t"): + s="" + for t in tensors: + if( t.inout == "out" ): + s=s+tabs+"CHECK_ACL(aclrtMemcpy("+t.paramHostname+", "+t.paramFileSize+", "+t.paramDevicename+", "+t.paramFileSize+", ACL_MEMCPY_DEVICE_TO_HOST));\n" + return(s) + +def get_device_free(tensors,tabs="\t"): + s="" + for t in tensors: + s=s+tabs+"CHECK_ACL(aclrtFree("+t.paramDevicename+"));\n" + return(s) + +def get_host_free(tensors,tabs="\t"): + s="" + for t in tensors: + s=s+tabs+"CHECK_ACL(aclrtFreeHost("+t.paramHostname+"));\n" + return(s) + +def get_host_write(tensors,tabs="\t"): + s="" + for t in tensors: + if( t.inout == "out" ): + s=s+tabs+'WriteFile('+t.paramFileNameOut+', '+t.paramHostname+', '+t.paramFileSize+');\n' + return(s) + +def get_frwdec_tensorlist(tensors,tabs="\t"): + s=tabs+", ".join([ "uint8_t *"+t.paramName for t in tensors]) + return(s) + +def get_frwdec_alldim_list(grid,tabs="\t"): + s=tabs+", ".join(["uint32_t n"+str(k) for k in grid]) + return(s) + +def get_frwdec_all_thrd_dim_list(grid,tabs="\t"): + s=tabs+", ".join(["uint32_t _p"+str(k) for k in grid]) + return(s) + +############## cpu code gen ################## + +def get_cpu_alloc(tensors,tabs="\t"): + s="" + for t in tensors: + s=s+tabs+"uint8_t* "+t.paramName+" = (uint8_t*)AscendC::GmAlloc("+t.paramFileSize+");\n" + return(s) + +def get_cpu_readfiles(tensors,tabs="\t"): + s="" + for t in tensors: + if( t.inout == "in" ): + s=s+tabs+'ReadFile('+t.paramFileNameIn+', '+t.paramFileSize+', '+t.paramName+', '+t.paramFileSize+');\n' + return(s) + +def get_cputensor_arglist(tensors,tabs="\t\t"): + s=tabs+",".join([t.paramName for t in tensors]) + return(s) + +def get_cpu_write(tensors,tabs="\t"): + s="" + for t in tensors: + if( t.inout == "out" ): + s=s+tabs+'WriteFile('+t.paramFileNameOut+', '+t.paramName+', '+t.paramFileSize+');\n' + return(s) + +def get_cpu_free(tensors,tabs="\t"): + s="" + for t in tensors: + s=s+tabs+"AscendC::GmFree((void *)"+t.paramName+");\n" + return(s) + +def get_cpu_frwdec_tensorlist(tensors,tabs="\t"): + s=tabs+", ".join([ "GM_ADDR "+t.paramName for t in tensors]) + return(s) + +def get_cpu_frwdec_alldim_list(grid,tabs="\t"): + s=tabs+", ".join(["uint32_t n"+str(k) for k in grid]) + return(s) + +def get_cpu_frwdec_all_thrd_dim_list(grid,tabs="\t"): + s=tabs+", ".join(["uint32_t _p"+str(k) for k in grid]) + 
return(s) + +def get_analytic_mdel_arg_list(t,tabs=""): + s=tabs+", ".join([x.split()[-1] for x in t.strip().split(",") if x]) + return(s) + +def get_analytic_model_init(t,tcode,tabs="\t"): + s="" + s=s+"\n#ifdef _ANALYTIC_MODEL_\n" + s=s+"\n".join(tcode) + s=s+"\n#else\n" + s=s+";\n ".join([tabs+x.strip()+" = 1;" for x in t.strip().split(",") if x]) + s=s+"\n#endif\n" + return(s) + + + +parser = argparse.ArgumentParser(description='Generate host test code.') +parser.add_argument('template_file', type=str, nargs='+', + help='host code will be generated from this template') +parser.add_argument('out_file', type=str, nargs='+', + help='generated host code file name') +parser.add_argument('in_file', type=str, nargs='+', + help='input paramters') +parser.add_argument('repeats', type=str, nargs='+', default="10", + help='number or repeats in the unit tests') +parser.add_argument('device_id', type=str, nargs='+', default="0", + help='device id used in tests') +parser.add_argument('nthreads', type=str, nargs='+', default="8", + help='number of threads used in tests') +args = parser.parse_args() + +template_file=args.template_file[0] +out_file=args.out_file[0] +in_file=args.in_file[0] +repeats=args.repeats[0] +device_id=args.device_id[0] +nthreads=args.nthreads[0] + +print("args.template_file=",args.template_file) +print("args.out_file=",args.out_file) +print("args.in_file=",args.in_file) +print("args.repeats=",args.repeats) +print("args.device_id=",args.device_id) +print("args.nthreads=",args.nthreads) + +file1 = open(in_file, 'r') +Lines = file1.readlines() +file1.close() + +thread_grid=np.array((Lines[0].split()[0]).split(",")).astype(int) +problem_grid=np.array((Lines[1].split()[0]).split(",")).astype(int) +tensors_all=parse_tensor_line(problem_grid,Lines[2]) +kernel_name=Lines[3].split()[0] +analyticModelFormalParams=Lines[4] +i1=np.where( [ "BEGIN_ANALYTIC_MODEL" in l for l in Lines ] )[0][0] +i2=np.where( [ "END_ANALYTIC_MODEL" in l for l in Lines ] )[0][0] +analyticModelInitCode=Lines[i1+1:i2] +print("kernel_name =",kernel_name) +print("thread_grid =",thread_grid) +print("problem_grid =",problem_grid) +print("analyticModelFormalParams =",analyticModelFormalParams) +print("analyticModelInitCode =",analyticModelInitCode) +print("tensors =") +for t in tensors_all: + t.print() + +replace_rules=[ + ("##KERNELNAME##",kernel_name), + ("##REPEATS##",repeats), + ("##DEVICEID##",device_id), + ("##DECLARESIZES##",get_grid_from_mcd(problem_grid)), + ("##NTHREADS##",nthreads), + ("##DECLARETENSORSIZES##",get_declaretensorsizes(tensors_all)), + ("##HOSTDECLARETENSOR##",get_host_alloc(tensors_all)), + ("##HOSTREADFILES##",get_host_readfiles(tensors_all)), + ("##DEVICEDECLARETENSOR##",get_device_alloc(tensors_all)), + ("##HOST2DEVICEMOVE##",get_host2device_move(tensors_all)), + ("##DEVICETENSORLIST##",get_devicetensor_arglist(tensors_all)), + ("##ALLDIMENSIONSLIST##",get_alldim_list(problem_grid)), + ("##DEVICE2HOSTMOVE##",get_device2host_move(tensors_all)), + ("##DEVICEFREETENSOR##",get_device_free(tensors_all)), + ("##HOSTFREETENSOR##",get_host_free(tensors_all)), + ("##WRITETENSOR##",get_host_write(tensors_all)), + ("##FRWDECTENSORALLLIST##",get_frwdec_tensorlist(tensors_all)), + ("##FRWDECTENSORSIZESLIST##",get_frwdec_alldim_list(problem_grid)), + ("##FRWDECTHRDGRIDLIST##",get_frwdec_all_thrd_dim_list(thread_grid)), + ("##CPUDECLARETENSOR##",get_cpu_alloc(tensors_all)), + ("##CPUREADFILES##",get_cpu_readfiles(tensors_all)), + ("##CPUTENSORLIST##",get_cputensor_arglist(tensors_all)), + 
("##CPUWRITETENSOR##",get_cpu_write(tensors_all)), + ("##CPUFREETENSOR##",get_cpu_free(tensors_all)), + ("##CPUFRWDECTENSORALLLIST##",get_cpu_frwdec_tensorlist(tensors_all)), + ("##CPUFRWDECTENSORSIZESLIST##",get_cpu_frwdec_alldim_list(problem_grid)), + ("##CPUFRWDECTHRDGRIDLIST##",get_cpu_frwdec_all_thrd_dim_list(thread_grid)), + ("##ANALYTICMODELFORMALPARAMS##","\t"+analyticModelFormalParams), + ("##ANALYTICMODELPARAMS##",get_analytic_mdel_arg_list(analyticModelFormalParams)), + ("##DECLAREANALYTICMODELPARAMS##",get_analytic_model_init(analyticModelFormalParams,analyticModelInitCode)) + +] + +file1 = open(template_file, 'r') +Lines = file1.readlines() + +text=copy.copy(Lines) +for old,new in replace_rules: + for i in range(len(text)): + text[i] = text[i].replace(old,new) + +# writing to file +file1 = open(out_file, 'w') +file1.writelines(text) +file1.close() diff --git a/examples/unittests/make_data_addOp.py b/examples/unittests/make_data_addOp.py new file mode 100644 index 000000000..174a0581d --- /dev/null +++ b/examples/unittests/make_data_addOp.py @@ -0,0 +1,25 @@ +#!/usr/bin/python3 +# -*- coding:utf-8 -*- +# Copyright 2022-2023 Huawei Technologies Co., Ltd +import numpy as np +import sys + +def gen_golden_data_simple(): + + n0=int(sys.argv[1]) + n1=int(sys.argv[2]) + + N = n0*n1 + + input_x = np.random.uniform(-100, 100, N ).astype(np.float16) + input_y = np.random.uniform(-100, 100, N ).astype(np.float16) + golden = (input_x + input_y).astype(np.float16) + + input_x.tofile("./input/input0.bin") + input_y.tofile("./input/input1.bin") + golden.tofile("./output/golden.bin") + + +if __name__ == "__main__": + assert(len(sys.argv)==3) + gen_golden_data_simple() diff --git a/examples/unittests/make_data_addOpv1.py b/examples/unittests/make_data_addOpv1.py new file mode 100644 index 000000000..0e6bb99cd --- /dev/null +++ b/examples/unittests/make_data_addOpv1.py @@ -0,0 +1,24 @@ +#!/usr/bin/python3 +# -*- coding:utf-8 -*- +# Copyright 2022-2023 Huawei Technologies Co., Ltd +import numpy as np +import sys + +def gen_golden_data_simple(): + + n0=int(sys.argv[1]) + + N = n0 + + input_x = np.random.uniform(-100, 100, N ).astype(np.float16) + input_y = np.random.uniform(-100, 100, N ).astype(np.float16) + golden = (input_x + input_y).astype(np.float16) + + input_x.tofile("./input/input0.bin") + input_y.tofile("./input/input1.bin") + golden.tofile("./output/golden.bin") + + +if __name__ == "__main__": + assert(len(sys.argv)==2) + gen_golden_data_simple() diff --git a/examples/unittests/make_data_movedataOp-v01.py b/examples/unittests/make_data_movedataOp-v01.py new file mode 100644 index 000000000..80831b372 --- /dev/null +++ b/examples/unittests/make_data_movedataOp-v01.py @@ -0,0 +1,42 @@ +#!/usr/bin/python3 +# -*- coding:utf-8 -*- +# Copyright 2022-2023 Huawei Technologies Co., Ltd +import numpy as np +import copy +import sys + +def gen_golden_data(): + x1_gm_type = np.float16 + + n0=int(sys.argv[1]) + n1=int(sys.argv[2]) + n2=int(sys.argv[3]) + + M = n0*n1 + N = n2 + + x1_gm = np.random.randint(1, 10, [M, N]).astype(x1_gm_type) + + S = copy.copy(x1_gm) + + # S=softmax(S) + # rowmaxS=np.max(S,axis=1) + # S=S-np.tile(rowmaxS, (np.shape(S)[1],1)).T + # S=np.exp(S) + # rowsumS=1/np.sum(S,axis=1) + # S=S*(np.tile(rowsumS, (np.shape(S)[1],1)).T) + + golden = S + + infilename = "./input/input0.bin" + outfilename = "./output/golden.bin" + + x1_gm.tofile( infilename ) + golden.tofile( outfilename ) + + print(f"I/O of size {M} x {N} and type {x1_gm_type} generated in {infilename} and {outfilename}") 
+ # print( golden ) + +if __name__ == "__main__": + assert(len(sys.argv)==4) + gen_golden_data() diff --git a/examples/unittests/make_data_onlinesoftmaxOp.py b/examples/unittests/make_data_onlinesoftmaxOp.py new file mode 100644 index 000000000..0dfd47e54 --- /dev/null +++ b/examples/unittests/make_data_onlinesoftmaxOp.py @@ -0,0 +1,124 @@ +#!/usr/bin/python3 +# -*- coding:utf-8 -*- +# Copyright 2022-2023 Huawei Technologies Co., Ltd +import numpy as np +import copy +import sys + +def gen_golden_data(): + x1_gm_type = np.float16 + + n0=int(sys.argv[1]) + n1=int(sys.argv[2]) + n2=int(sys.argv[3]) + n3=int(sys.argv[4]) + n4=16 + + N0 = n0*n2*n4 + N1 = n1*n3*n4 + + shape0 = (n0,n2,n4) + shape1 = (n1,n3,n4) + shape2 = (n0,n1,n2,n3) + shape3 = (n0,n2) + + Q_gm = np.random.randint(1, 4, [N0]).astype(x1_gm_type) + V_gm = np.random.randint(1, 4, [N1]).astype(x1_gm_type) + K_gm = np.random.randint(1, 4, [N1]).astype(x1_gm_type) + + Q = copy.copy(Q_gm) + Q = np.reshape(Q,shape0) + #infilename = "./input/q_gm.bin" + #Q_gm.tofile( infilename ) + + K = copy.copy(K_gm) + K = np.reshape(K,shape1) + #infilename = "./input/k_gm.bin" + #K_gm.tofile( infilename ) + + V = copy.copy(V_gm) + V = np.reshape(V,shape1) + #infilename = "./input/v_gm.bin" + #V_gm.tofile( infilename ) + + + S0 = np.zeros(shape2).astype(x1_gm_type) + for i0 in range(n0): + for i1 in range(n1): + S0[i0,i1,:,:]=Q[i0,:,:].dot(V[i1,:,:].T) + S0_gm = np.reshape(S0,n0*n1*n2*n3) + #infilename = "./input/s0_gm.bin" + infilename = "./input/input2.bin" + S0_gm.tofile( infilename ) + + m0 = np.zeros(shape3).astype(x1_gm_type) + l0 = np.zeros(shape3).astype(x1_gm_type) + S1 = np.zeros(shape2).astype(x1_gm_type) + for i0 in range(n0): + l0[i0,:]=0 + m0[i0,:]=-65504.0 + for i1 in range(n1): + Sij=S0[i0,i1,:,:] + # Pi=Sij + mi_old=copy.copy(m0[i0,:]) + + rowmaxS=np.max(Sij,axis=1) + m0[i0,:]=np.maximum(m0[i0,:],rowmaxS) # m0[i0,:]=rowmaxS # TEMP + mi_bcast=np.tile(m0[i0,:], (np.shape(Sij)[1],1)) + Pi=Sij-mi_bcast.T + Pi=np.exp(Pi) + + expmidiff=np.exp(mi_old-m0[i0,:]) + l0[i0,:]*=expmidiff + l0[i0,:]+=np.sum(Pi,axis=1) + + S1[i0,i1,:,:]=Pi + + + + + # print("m0=",m0) + # print("l0=",l0) + m0_gm = np.reshape(m0,n0*n2) + l0_gm = np.reshape(l0,n0*n2) + s1_gm = np.reshape(S1,n0*n1*n2*n3) + goldenfilename = "./output/m0_golden.bin" + m0_gm.tofile( goldenfilename ) + goldenfilename = "./output/l0_golden.bin" + l0_gm.tofile( goldenfilename ) + goldenfilename = "./output/s1_golden.bin" + s1_gm.tofile( goldenfilename ) + + S2 = np.zeros(shape2).astype(x1_gm_type) + for i0 in range(n0): + for i1 in range(n1): + S2[i0,i1,:,:]=Q[i0,:,:].dot(V[i1,:,:].T) + S2_gm = np.reshape(S2,n0*n1*n2*n3) + infilename = "./input/s2_gm.bin" + S2_gm.tofile( infilename ) + + # S = copy.copy(x1_gm) + # S = np.reshape(S,(n0,n1,n2,n3)) + + # # S=block_softmax(S) + # for i0 in range(n0): + # for i1 in range(n1): + # Stmp=S[i0,i1,:,:] + # rowmaxS=np.max(Stmp,axis=1) + # Stmp=Stmp-np.tile(rowmaxS, (np.shape(Stmp)[1],1)).T + # Stmp=np.exp(Stmp) + # rowsumS=1/np.sum(Stmp,axis=1) + # Stmp=Stmp*(np.tile(rowsumS, (np.shape(Stmp)[1],1)).T) + # S[i0,i1,:,:]=Stmp + + # golden = S + + # outfilename = "./output/golden.bin" + # golden.tofile( outfilename ) + + # print(f"I/O of size {n0} x {n1} x {n2} x {n3} and type {x1_gm_type} generated in {infilename} and {outfilename}") + # # print( golden ) + +if __name__ == "__main__": + assert(len(sys.argv)==5) + gen_golden_data() diff --git a/examples/unittests/make_data_softmaxOp-v1.py b/examples/unittests/make_data_softmaxOp-v1.py new file mode 100644 
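make_data_onlinesoftmaxOp.py above accumulates the golden running statistics of an online (streaming) softmax: for each new score block, the row maximum m and the row sum of exponentials l are updated, and the previously accumulated sum is rescaled by exp(m_old - m_new). A compact sketch of that update rule, separated from the file I/O (the helper name and sizes are illustrative):

import numpy as np

def online_softmax_update(m, l, S_block):
    # One block update of the running statistics, as computed in
    # make_data_onlinesoftmaxOp.py: m is the running row maximum,
    # l the running row sum of exponentials.
    m_old = m.copy()
    m_new = np.maximum(m, np.max(S_block, axis=1))          # update the running maximum
    P = np.exp(S_block - m_new[:, None])                    # exponentials w.r.t. the new maximum
    l_new = l * np.exp(m_old - m_new) + np.sum(P, axis=1)   # rescale the old sum, add the new block
    return m_new, l_new, P

# illustrative sizes only
m = np.full(4, -65504.0)   # lowest FP16 value, the initial maximum used above
l = np.zeros(4)
for _ in range(3):
    m, l, P = online_softmax_update(m, l, np.random.rand(4, 8))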
index 000000000..e041b891a --- /dev/null +++ b/examples/unittests/make_data_softmaxOp-v1.py @@ -0,0 +1,42 @@ +#!/usr/bin/python3 +# -*- coding:utf-8 -*- +# Copyright 2022-2023 Huawei Technologies Co., Ltd +import numpy as np +import copy +import sys + +def gen_golden_data(): + x1_gm_type = np.float16 + + n0=int(sys.argv[1]) + n1=int(sys.argv[2]) + n2=int(sys.argv[3]) + + M = n0*n1 + N = n2 + + x1_gm = np.random.randint(1, 10, [M, N]).astype(x1_gm_type) + + S = copy.copy(x1_gm) + + # S=softmax(S) + rowmaxS=np.max(S,axis=1) + S=S-np.tile(rowmaxS, (np.shape(S)[1],1)).T + S=np.exp(S) + rowsumS=1/np.sum(S,axis=1) + S=S*(np.tile(rowsumS, (np.shape(S)[1],1)).T) + + golden = S + + infilename = "./input/input0.bin" + outfilename = "./output/golden.bin" + + x1_gm.tofile( infilename ) + golden.tofile( outfilename ) + + print(f"I/O of size {M} x {N} and type {x1_gm_type} generated in {infilename} and {outfilename}") + # print( golden ) + +if __name__ == "__main__": + assert(len(sys.argv)==4) + gen_golden_data() diff --git a/examples/unittests/make_data_softmaxOp-v3.py b/examples/unittests/make_data_softmaxOp-v3.py new file mode 100644 index 000000000..772267d80 --- /dev/null +++ b/examples/unittests/make_data_softmaxOp-v3.py @@ -0,0 +1,48 @@ +#!/usr/bin/python3 +# -*- coding:utf-8 -*- +# Copyright 2022-2023 Huawei Technologies Co., Ltd +import numpy as np +import copy +import sys + +def gen_golden_data(): + x1_gm_type = np.float16 + + n0=int(sys.argv[1]) + n1=int(sys.argv[2]) + n2=int(sys.argv[3]) + n3=int(sys.argv[4]) + + M = n0*n1 + N = n2*n3 + + x1_gm = np.random.randint(1, 10, [N, M]).astype(x1_gm_type) + + S = copy.copy(x1_gm) + S = np.reshape(S,(n0,n1,n2,n3)) + + # S=block_softmax(S) + for i0 in range(n0): + for i1 in range(n1): + Stmp=S[i0,i1,:,:] + rowmaxS=np.max(Stmp,axis=1) + Stmp=Stmp-np.tile(rowmaxS, (np.shape(Stmp)[1],1)).T + Stmp=np.exp(Stmp) + rowsumS=1/np.sum(Stmp,axis=1) + Stmp=Stmp*(np.tile(rowsumS, (np.shape(Stmp)[1],1)).T) + S[i0,i1,:,:]=Stmp + + golden = S + + infilename = "./input/input0.bin" + outfilename = "./output/golden.bin" + + x1_gm.tofile( infilename ) + golden.tofile( outfilename ) + + print(f"I/O of size {n0} x {n1} x {n2} x {n3} and type {x1_gm_type} generated in {infilename} and {outfilename}") + # print( golden ) + +if __name__ == "__main__": + assert(len(sys.argv)==5) + gen_golden_data() diff --git a/examples/unittests/make_data_softmaxOp-v4.py b/examples/unittests/make_data_softmaxOp-v4.py new file mode 100644 index 000000000..73fd7e416 --- /dev/null +++ b/examples/unittests/make_data_softmaxOp-v4.py @@ -0,0 +1,52 @@ +#!/usr/bin/python3 +# -*- coding:utf-8 -*- +# Copyright 2022-2023 Huawei Technologies Co., Ltd +import numpy as np +import copy +import sys + +def gen_golden_data(): + x1_gm_type = np.float16 + + n0=int(sys.argv[1]) + n1=int(sys.argv[2]) + n2=int(sys.argv[3]) + n3=int(sys.argv[4]) + n4=int(sys.argv[5]) + n5=int(sys.argv[6]) + + N = n0*n1*n2*n3*n4*n5 + + x1_gm = np.random.randint(1, 10, [N]).astype(x1_gm_type) + + S = copy.copy(x1_gm) + + S=np.reshape(S,(n0,n1,n2,n3,n4,n5)) + for i0 in range(n0): + for i1 in range(n1): + for i3 in range(n3): + for i4 in range(n4): + Stmp=S[i0,i1,:,i3,i4,:] + rowmaxStmp=np.max(Stmp,axis=1) + Stmp=Stmp-np.tile(rowmaxStmp, (np.shape(Stmp)[1],1)).T + Stmp=np.exp(Stmp) + rowsumStmp=1/np.sum(Stmp,axis=1) + Stmp=Stmp*(np.tile(rowsumStmp, (np.shape(Stmp)[1],1)).T) + S[i0,i1,:,i3,i4,:]=Stmp + + + golden = np.reshape(S,(n0*n1*n2*n3*n4*n5)) + + infilename = "./input/input0.bin" + outfilename = "./output/golden.bin" + + 
x1_gm.tofile( infilename ) + golden.tofile( outfilename ) + + print(f"I/O of size {n0} x {n1} x {n2} x {n3} x {n4} x {n5} and type {x1_gm_type} generated in {infilename} and {outfilename}") + # print( golden ) + + +if __name__ == "__main__": + assert(len(sys.argv)==7) + gen_golden_data() diff --git a/examples/unittests/make_data_softmaxOp.py b/examples/unittests/make_data_softmaxOp.py new file mode 100644 index 000000000..28ef13518 --- /dev/null +++ b/examples/unittests/make_data_softmaxOp.py @@ -0,0 +1,42 @@ +#!/usr/bin/python3 +# -*- coding:utf-8 -*- +# Copyright 2022-2023 Huawei Technologies Co., Ltd +import numpy as np +import copy +import sys + +def gen_golden_data(): + x1_gm_type = np.float16 + + n0=int(sys.argv[1]) + n1=int(sys.argv[2]) + n2=int(sys.argv[3]) + + M = n0*n1 + N = n2 + + x1_gm = np.random.randint(1, 10, [M, N]).astype(x1_gm_type) + + S = copy.copy(x1_gm) + + # S=softmax(S) + rowmaxS=np.max(S,axis=1) + S=S-np.tile(rowmaxS, (np.shape(S)[1],1)).T + S=np.exp(S) + rowsumS=1/np.sum(S,axis=1) + S=S*(np.tile(rowsumS, (np.shape(S)[1],1)).T) + + golden = S + + infilename = "./input/input0.bin" + outfilename = "./output/golden.bin" + + x1_gm.tofile( infilename ) + golden.tofile( outfilename ) + + print(f"I/O of size {M} x {N} and type {x1_gm_type} generated in {infilename} and {outfilename}") + # print( golden ) + +if __name__ == "__main__": + assert(len(sys.argv)==4) + gen_golden_data() diff --git a/examples/unittests/test_all.sh b/examples/unittests/test_all.sh new file mode 100755 index 000000000..162eced88 --- /dev/null +++ b/examples/unittests/test_all.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +SCRIPTS="compile_and_run_movedataOp-v01.sh compile_and_run_addOp.sh compile_and_run_addOpv1.sh compile_and_run_softmaxOp.sh compile_and_run_softmaxOp-v1.sh compile_and_run_softmaxOp-v3.sh compile_and_run_softmaxOp-v4.sh compile_and_run_onlinesoftmaxOp.sh" + +RED='\033[0;31m' +GREEN='\033[0;32m' +DEF='\033[0m' + +echo "" + +BUILD=$(pwd)/build_alp/ +rm -rf $BUILD +mkdir $BUILD + +PAD_LEN=$(for script in $SCRIPTS ; do echo $script ; done | wc --max-line-length) +PAD_LEN="$((PAD_LEN-16))" + +for script in $SCRIPTS +do + testname=$(echo -n ${script:16:-3}) + BUILD_DIR=$BUILD ./$script 2&>> /dev/null + if [ $? -ne 0 ] + then + printf "%-${PAD_LEN}s ${RED}FAILED${DEF} \n" $testname + exit 1 + else + printf "%-${PAD_LEN}s ${GREEN}PASSED${DEF} \n" $testname + fi +done +echo -e "\nAll tests OK!" 
+rm -rf $BUILD diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt index 2511528ee..5f5cc2a51 100644 --- a/include/CMakeLists.txt +++ b/include/CMakeLists.txt @@ -40,6 +40,9 @@ set( HEADERS_REGEX ".+\.(hpp|h|hxx|hh|h\\+\\+)$" ) # to avoid flaky acrobatics with regex or glob expressions, copy main files directly install( FILES "graphblas.hpp" DESTINATION "${INCLUDE_INSTALL_DIR}" ) +if( WITH_ASCEND_BACKEND ) + install( FILES "alpAscend.hpp" DESTINATION "${INCLUDE_INSTALL_DIR}" ) +endif() set( root_files "graphblas/backends.hpp" "graphblas/benchmark.hpp" "graphblas/blas0.hpp" "graphblas/blas1.hpp" "graphblas/blas2.hpp" @@ -169,6 +172,21 @@ if( WITH_NONBLOCKING_BACKEND ) install( TARGETS backend_nonblocking_headers EXPORT GraphBLASTargets ) endif() +if( WITH_ASCEND_BACKEND ) + add_library( backend_ascend_headers INTERFACE ) + target_link_libraries( backend_ascend_headers INTERFACE backend_reference_headers ) + target_compile_definitions( backend_ascend_headers INTERFACE "${ASCEND_INCLUDE_DEFS}" ) + target_include_directories( backend_ascend_headers INTERFACE + $ + $ + ) + install( DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/graphblas/ascend/" + DESTINATION "${GRB_INCLUDE_INSTALL_DIR}/ascend" + FILES_MATCHING REGEX "${HEADERS_REGEX}" + ) + install( TARGETS backend_ascend_headers EXPORT GraphBLASTargets ) +endif() + if( WITH_BSP1D_BACKEND OR WITH_HYBRID_BACKEND ) # copy headers, which are common to both distributed backends install( DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/graphblas/bsp/" diff --git a/include/alpAscend.hpp b/include/alpAscend.hpp new file mode 100644 index 000000000..f20da3b2c --- /dev/null +++ b/include/alpAscend.hpp @@ -0,0 +1,344 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * The main header to include in order to use ALP/Ascend codegen. + * + * @author A. N. Yzelman. + * @date 12th of September, 2023. + */ + +#ifndef _H_ALPASCEND +#define _H_ALPASCEND + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +/** + * \defgroup ALPAscend ALP/Ascend + * + * This the ALP/Ascend module. 
+ * + * @{ + */ + +namespace alp +{ + namespace internal + { + extern iGrid *igrid; + extern AscendLazyEvaluation ale; + extern SymbolTable symbols; + } +} + +/** The ALP/Ascend namespace */ +namespace alp { + + using grb::RC; + + using grb::toString; + + namespace internal { + + template< size_t process_mesh_order, size_t problem_mesh_order > + using AscendCodeFunction = void (*) ( + const alp::Grid< process_mesh_order, problem_mesh_order > &, + alp::RC & + ); + + } + + template< size_t process_mesh_order, size_t problem_mesh_order > + static grb::RC compile( + const internal::AscendCodeFunction< + process_mesh_order, + problem_mesh_order + > ascend_code, + const std::string &kernel_name + ) { + grb::RC ret = grb::PANIC; + grb::Launcher< grb::EXEC_MODE::AUTOMATIC > launcher; + alp::Grid< process_mesh_order, problem_mesh_order > grid; + + alp::internal::igrid = + new alp::internal::iGrid( process_mesh_order, problem_mesh_order ); + + internal::OpGen::kernel_id = kernel_name; + + std::ofstream output_device_code; + output_device_code.open ( internal::OpGen::kernel_id + "_npu_op.cpp", std::ofstream::out | std::ofstream::trunc); + + std::ofstream output_host_log; + output_host_log.open ( "generate_host_code_" + internal::OpGen::kernel_id + ".inp", std::ofstream::out | std::ofstream::trunc); + + output_host_log << "0"; + for( size_t i = 1; i < process_mesh_order; ++i ) { + output_host_log << "," << i; + } + output_host_log << std::endl; + + output_host_log << "0"; + for( size_t i = 1; i < problem_mesh_order; ++i ) { + output_host_log << "," << i; + } + output_host_log << std::endl; + + // TODO perhaps the processSize and problemSize members should be generated + // more than once, for every forEach + // only the tile_num is the same? + + // const uint32_t _p0 + internal::OpGen::hostFormalParam << "const uint32_t _" << alp::internal::igrid->processSize( 0 ); + + // , const uint32_t _p1, const uint32_t _p2, const uint32_t _p3 ... + for( size_t i = 1; i < process_mesh_order; ++i ) { + internal::OpGen::hostFormalParam << ", const uint32_t _" << alp::internal::igrid->processSize( i ); + } + + // , const uint32_t _n0, const uint32_t _n1, const uint32_t _n2 + for( size_t i = 0; i < problem_mesh_order; ++i ) { + internal::OpGen::hostFormalParam << ", const uint32_t _" << alp::internal::igrid->problemSize( i ); + } + + // _p0 + internal::OpGen::hostArg << "_" << alp::internal::igrid->processSize( 0 ); + + // , _p1, _p2, _p3 ... + for( size_t i = 1; i < process_mesh_order; ++i ) { + internal::OpGen::hostArg << ", _" << alp::internal::igrid->processSize( i ); + } + + // , _n0, _n1, _n2 ... + for( size_t i = 0; i < problem_mesh_order; ++i ) { + internal::OpGen::hostArg << ", _" << alp::internal::igrid->problemSize( i ); + } + + // p0 = _p0; + // p1 = _p1; + // p2 = _p2; + // ... + // when i < process_mesh_order + for( size_t i = 0; i < process_mesh_order; ++i ) { + internal::OpGen::constrBody << "\n"; + internal::OpGen::constrBody << "\t\t\t" + << alp::internal::igrid->processSize( i ) + << " = _" << alp::internal::igrid->processSize( i ) + << ";"; + } + + // p1 = 1; + // p2 = 1; + // ... + // when process_mesh_order <= i < problem_mesh_order + for( size_t i = process_mesh_order; i < problem_mesh_order; ++i ) { + internal::OpGen::constrBody << "\n"; + internal::OpGen::constrBody << "\t\t\t" + << alp::internal::igrid->processSize( i ) + << " = 1;"; + } + + internal::OpGen::constrBody << "\n"; + + // n0 = _n0; + // n1 = _n1; + // n2 = _n2; + // ... 
+ for( size_t i = 0; i < problem_mesh_order; ++i ) { + internal::OpGen::constrBody << "\n"; + internal::OpGen::constrBody << "\t\t\t" + << alp::internal::igrid->problemSize( i ) << " = _" + << alp::internal::igrid->problemSize( i ) << ";"; + } + + internal::OpGen::constrBody << "\n"; + + // uint32_t p0; + // uint32_t p1; + // uint32_t p2; + for( size_t i = 0; i < problem_mesh_order; ++i ) { + internal::OpGen::classMembers << "\t\tuint32_t " + << alp::internal::igrid->processSize( i ) << ";\n"; + } + + internal::OpGen::classMembers << "\n"; + + // uint32_t n0; + // uint32_t n1; + // uint32_t n2; + for( size_t i = 0; i < problem_mesh_order; ++i ) { + internal::OpGen::classMembers << "\t\tuint32_t " + << alp::internal::igrid->problemSize( i ) << ";\n"; + } + + internal::OpGen::classMembers << "\n"; + + const RC launch_rc = launcher.exec< + alp::Grid< process_mesh_order, problem_mesh_order >, + alp::RC + > ( + ascend_code, grid, ret, true + ); + if( launch_rc != grb::SUCCESS ) { + throw std::runtime_error( "Launching codegen FAILED" ); + } + + // ANALYTIC MODEL + { + std::stringstream analyticModelArgs; + std::stringstream analyticModelFormalParams; + std::stringstream analyticModelDecls; + std::stringstream analyticModelConstrBody; + + // host body generation appends to hostArgs, so the below line must follow the previous one(!) + alp::internal::ale.generateHostBody( internal::OpGen::hostBody, + analyticModelArgs, analyticModelFormalParams, + analyticModelDecls, analyticModelConstrBody ); + + internal::OpGen::hostArg << analyticModelArgs.str(); + internal::OpGen::analyticModelFormalParams << analyticModelFormalParams.str(); + internal::OpGen::classMembers << analyticModelDecls.str(); + internal::OpGen::constrBody << analyticModelConstrBody.str(); + } + + /* + * Only once we are here we have execute all the forEach, + * and thus we have all the information we need to generate + * code and performs optimizations, especially across + * different forEach, and including handling multiple + * pipelines that may be built by the same forEach + * + */ + +// alp::internal::symbols.debug_print(); +// alp::internal::ale.debug_print(); + + // CLASS MEMBER DECLARATIONS + { + std::stringstream decl; + alp::internal::ale.generateDeclarations( decl ); + internal::OpGen::declarations << decl.str(); + } + + // CONSTRUCTOR BODY +// { +// std::stringstream constructor; +// alp::internal::ale.generateConstructor( constructor ); +// internal::OpGen::constrBody << constructor.str(); +// } + + // INIT BODY + { + if( alp::internal::symbols.existsTBufTensorDecl() == true ) { + + //TODO I should make the datatype a parameter + std::string temp_data_type = "half"; + std::stringstream max_n; +/* + max_n << "std::max( { " << alp::internal::igrid->problemSize( 0 ); + + for( size_t i = 1; i < problem_mesh_order; ++i ) { + max_n << ", " << alp::internal::igrid->problemSize( i ); + } + + // close all open parentheses + max_n << " } )"; +*/ + if( problem_mesh_order == 1 ) { + max_n << "" << alp::internal::igrid->problemSize( 0 ) << ""; + } else { + max_n << "alp::max( " << alp::internal::igrid->problemSize( 0 ) << ", "; + + for( size_t i = 1; i < problem_mesh_order - 1; ++i ) { + max_n << "alp::max( " << alp::internal::igrid->problemSize( i ) << ", "; + } + + // this corresponds to the last one, which is a special case + // since it doesn't open a new recursive std::max + max_n << alp::internal::igrid->problemSize( problem_mesh_order - 1 ); + + // close all open parentheses + for( size_t i = 1; i < problem_mesh_order; ++i ) { + 
max_n << " )"; + } + } + + internal::OpGen::initBody << "\n"; + internal::OpGen::initBody << "\t\t\tint32_t totWorkSpaceSize = alp::computeBufferSize( " << max_n.str() << ", sizeof( " << temp_data_type << " ) );\n"; + + } + + std::stringstream init; + alp::internal::ale.generateInit( init ); + internal::OpGen::initBody << init.str(); + + if( alp::internal::symbols.existsTBufTensorDecl() == true ) { + std::stringstream temp_local_init; + alp::internal::symbols.generateTempLocalInit( temp_local_init ); + internal::OpGen::initBody << temp_local_init.str(); + } + } + + // PROCESS + { + std::stringstream process, processCall; + alp::internal::ale.generateProcess( process, processCall ); + internal::OpGen::processFunc.push_back( std::move( process ) ); + internal::OpGen::genericProcessBody << processCall.str(); + } + + alp::internal::OpGen::generate( output_device_code ); + + std::stringstream listOfGlobalTensors; + alp::internal::symbols.printHostLogFile( listOfGlobalTensors ); + output_host_log << listOfGlobalTensors.str() << std::endl; + + output_host_log << internal::OpGen::kernel_id << std::endl; + + output_host_log << internal::OpGen::analyticModelFormalParams.str() << std::endl; + + output_host_log << "$BEGIN_ANALYTIC_MODEL" << std::endl; + output_host_log << internal::OpGen::hostBody.str(); + output_host_log << "$END_ANALYTIC_MODEL" << std::endl; + + output_device_code.close(); + output_host_log.close(); + + internal::OpGen::compileClear(); + + delete alp::internal::igrid; + + return ret; + } + +} + +/** @} */ + +#endif // end _H_ALPASCEND + diff --git a/include/asclib/analytic_model.hpp b/include/asclib/analytic_model.hpp new file mode 100644 index 000000000..37bff0b1c --- /dev/null +++ b/include/asclib/analytic_model.hpp @@ -0,0 +1,496 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * The analytic model to be used by the AscendC code, at operator run-time. + * + * @author A. N. Yzelman + * @date 25th of October, 2023 + */ + +#ifndef _H_ALP_ASCEND_ANALYTIC_MODEL +#define _H_ALP_ASCEND_ANALYTIC_MODEL + +#include +#include +#include +#include // TODO FIXME factor this out -- too high runtime overhead +#include + +#ifdef _DEBUG + #include +#endif + +#include + +#ifndef ASC_FORCE_BINARY_SEARCH + /** Set this macro to true to force a binary search */ + #define ASC_FORCE_BINARY_SEARCH false +#endif + + +/** The ALP@Ascend namespace for run-time components. */ +namespace asc { + + /** + * The analytic model is parametrised in the dimensionality of the process + * mesh and the problem mesh. + * + * For the tensors that are in the pipeline, it furthermore requires static + * knowledge on whether the dynamic axes (the axes over which the user program + * iterates) of the tensors involved with the pipeline, differ. 
+ */ + template< size_t process_order, size_t problem_order, bool has_differing_dyn_axes > + class AnalyticModel { + + private: + + /** Whether to force a binary search */ + static constexpr const bool force_binary = ASC_FORCE_BINARY_SEARCH; + + /** The total scratchpad space, in bytes. */ + const size_t totalSpace; + + std::array< size_t, process_order > processSizes; + + std::array< size_t, problem_order > problemSizes; + + std::array< bool, problem_order > iterationAxes; + + std::vector< std::pair< std::vector< unsigned int >, size_t > > tensors; + + std::array< size_t, problem_order > blockLengths; + + std::vector< unsigned int > largestDynamicAxes; + + size_t largestSize; + + size_t largestStaticSize; + + size_t totalStaticSize; + + /** The size of buffers used by the AscendC program. */ + size_t bufferSize; + + unsigned int numStages; + + unsigned int nDynamicAxes; + + /** Whether the block lengths have been computed. */ + bool lock; + + /** Checks whether current block lengths overrun the buffer */ + bool feasible() const { + const size_t effectiveBufferSize = totalSpace - bufferSize; + size_t required = 0; + for( const auto &pair : tensors ) { + size_t size = pair.second; + for( const auto &dyn_axis : pair.first ) { + const size_t tileSize = std::max( 1ul, blockLengths[ dyn_axis ] ); + size *= tileSize; + } + required += size; + } +#ifdef _DEBUG + std::cout << "\t\tfeasibility of current solution: " << required << " <= " + << effectiveBufferSize << "\n"; +#endif + return required <= effectiveBufferSize; + } + + void analyticSolve() { + const size_t n = tensors.size(); + const size_t effectiveBufferSize = totalSpace - bufferSize; + const size_t maxMul = effectiveBufferSize / totalStaticSize; + const unsigned int d = largestDynamicAxes.size(); +#ifdef _DEBUG + std::cout << "\tanalyticSolve called with n = " << n << ", " + << "effectiveBufferSize = " << effectiveBufferSize << ", " + << "largestStaticSize = " << largestStaticSize << ", " + << "totalStaticSize = " << totalStaticSize << ", " + << "maxMul = " << maxMul << ", and " + << "d = " << d << "\n"; +#endif + if( d == 1 ) { +#ifdef _DEBUG + std::cout << "\t\tsuggested blocksize is " << maxMul << "\n"; +#endif + blockLengths[ largestDynamicAxes[ 0 ] ] = maxMul; + } else { + // taking max with 1 is safe since we already know 1, 1, ..., 1 is a sol + const double root = std::max( std::pow( + static_cast< double >(maxMul), + static_cast< double >(1) / static_cast< double >(d) ), + static_cast< double >(1) ); +#ifdef _DEBUG + std::cout << "\t\tinitial suggested blocksize is " << root << "\n"; +#endif + // select solution + size_t sizeTaken = totalStaticSize; + for( const auto &axis : largestDynamicAxes ) { + blockLengths[ axis ] = root; + sizeTaken *= root; + } + // add one until we fill up the buffer: O(d) work + unsigned int incDim = 0; + assert( totalStaticSize > 0 ); + while( sizeTaken + totalStaticSize <= effectiveBufferSize ) { + (void) ++(blockLengths[ largestDynamicAxes[ incDim ] ]); +#ifdef _DEBUG + std::cout << "\t\tblock_length" << largestDynamicAxes[ incDim ] + << "incremented with one\n"; +#endif + sizeTaken += totalStaticSize; + (void) ++incDim; + if( incDim % largestDynamicAxes.size() == 0 ) { + assert( sizeTaken + totalStaticSize > effectiveBufferSize ); + } + } + } +#ifdef _DEBUG + std::cout << "\t\tWill return the following solution:\n"; + for( unsigned int i = 0; i < problem_order; ++i ) { + std::cout << "\t\t\tblock_length" << i << " = " + << blockLengths[ i ] << "\n"; + } +#endif + } + + void binarySearch() { + if( 
!feasible() ) { + // only in this case we need to compute a non-trivial block length + // we follow a greedy approach where we increase the dimension of the + // blocking only if blocking in one direction was not feasible + unsigned int dim = 1; + std::array< size_t, problem_order > loSizes; + std::array< size_t, problem_order > curSizes; + std::array< size_t, problem_order > hiSizes; + bool foundFeasible = false; + std::array< size_t, problem_order > lastFeasible; + // NOTE this finds the asymptotic optimum if there's one iteration axis + // TODO work out the model in multiple dimensions + while( !foundFeasible ) { + // set up binary search + assert( dim <= largestDynamicAxes.size() ); + for( unsigned int i = 0; i < dim; ++i ) { + const size_t size = problemSizes[ largestDynamicAxes[ i ] ]; + loSizes[ i ] = 1; +#ifdef _DEBUG + std::cout << "\tproblemSizes[ " << i << " ] = " << problemSizes[ i ] + << "\n"; +#endif + curSizes[ i ] = std::max( 1ul, size / 2 ); + hiSizes[ i ] = size; + blockLengths[ i ] = 1; + } + // start binary search + bool converged = false; + while( !converged ) { +#ifdef _DEBUG + for( unsigned int i = 0; i < dim; ++i ) { + std::cout << "\tcurrent search: " << loSizes[ i ] << ", " + << curSizes[ i ] << ", " << hiSizes[ i ] << "\n"; + } +#endif + // active & evaluate current guess + bool notFeasible = true; + { + unsigned int curDim = 0; + for( const auto &dyn_axis : largestDynamicAxes ) { + blockLengths[ dyn_axis ] = curSizes[ curDim ]; + (void) ++curDim; + if( curDim >= dim ) { break; } + } + notFeasible = !feasible(); + } + // update search direction + const std::array< size_t, problem_order > lastCur = curSizes; + if( notFeasible ) { + // mid point is not feasible, update hi and cur + for( unsigned int i = 0; i < dim; ++i ) { + hiSizes[ i ] = curSizes[ i ]; + curSizes[ i ] = std::max( 1ul, + (hiSizes[ i ] - loSizes[ i ]) / 2 + loSizes[ i ] ); + } + } else { + foundFeasible = true; + lastFeasible = curSizes; + // mid point is feasible, update lo and cur + for( unsigned int i = 0; i < dim; ++i ) { + loSizes[ i ] = curSizes[ i ]; + curSizes[ i ] = std::max( 1ul, + (hiSizes[ i ] - loSizes[ i ]) / 2 + loSizes[ i ] ); + } + } + // check convergence + converged = true; + for( unsigned int i = 0; i < dim; ++i ) { + if( lastCur[ i ] != curSizes[ i ] ) { + converged = false; + } + } + } // end binary search + if( !foundFeasible ) { +#ifdef _DEBUG + std::cout << "\tend of binary search without finding any feasible " + << "solution at dim " << dim << "\n"; +#endif + (void) ++dim; + if( dim >= largestDynamicAxes.size() ) { + // This situation should never occur, because the trivial solution of + // blockSize one everywhere should, before calling this function, + // already have been determined to be feasible. 
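+							// (note: computeBlockLengths() verifies that the all-ones block
+							// length fits the scratchpad before it invokes binarySearch())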
+ throw std::runtime_error( "Search failed but this situation should " + "never be encountered-- please submit a bug report" ); + } + } + } + // re-activate last found feasible solution + assert( foundFeasible ); + unsigned int curDim = 0; + for( const auto &dyn_axis : largestDynamicAxes ) { + blockLengths[ dyn_axis ] = lastFeasible[ curDim ]; + (void) ++curDim; + if( curDim >= dim ) { break; } + } + assert( feasible() ); + } + } + + void computeBlockLengths() { +#ifdef _DEBUG + std::cout << "\tIn computeBlockLengths()\n" + << "\t\tlargestDynamicAxes.size() = " << largestDynamicAxes.size() << "\n"; +#endif + for( unsigned int i = 0; i < problem_order; ++i ) { + blockLengths[ i ] = 1; + } + if( !feasible() ) { + throw std::runtime_error( "Operator cannot be executed for the given " + "problem sizes." ); + } + std::vector< unsigned int > activeProcIDs; // TODO FIXME remove dependence on std::vector (for performance) + unsigned int procGridDim = 0; + for( unsigned int i = 0; i < process_order; ++i ) { + assert( processSizes[ i ] > 0 ); + if( processSizes[ i ] > 1 ) { + activeProcIDs.push_back( i ); + (void) ++procGridDim; + } + } + if( procGridDim > largestDynamicAxes.size() ) { + // we need to reduce the process mesh + // we just alternate between expanding the first + // largestDynamicAxes mesh sizes + unsigned int curProcInd = 0; + for( unsigned int i = largestDynamicAxes.size(); i < procGridDim; ++i ) { + processSizes[ curProcInd ] *= processSizes[ i ]; + processSizes[ i ] = 1; + (void) ++curProcInd; + if( curProcInd % procGridDim == 0 ) { + curProcInd = 0; + } + } + } + // compute effective dynamic sizes + for( const auto &dyn_axis : largestDynamicAxes ) { + const size_t n = problemSizes[ dyn_axis ]; + const size_t p = processSizes[ dyn_axis ]; + if( n % p == 0 ) { + problemSizes[ dyn_axis ] = n / p; + } else { + problemSizes[ dyn_axis ] = n / p + 1; + } + } + // check for trivial solution + for( const auto &dyn_axis : largestDynamicAxes ) { +#ifdef _DEBUG + std::cout << "\tSetting blockLengths[ " << dyn_axis << " ] to " + << problemSizes[ dyn_axis ] << "\n"; +#endif + blockLengths[ dyn_axis ] = problemSizes[ dyn_axis ]; + } + if( !feasible() ) { + // choose between solution strategy + if( force_binary || (problem_order > 1 && has_differing_dyn_axes) ) { + binarySearch(); + } else { + analyticSolve(); + } + } + + // done + lock = true; + } + + + public: + + /** + * After successful creation, the analytic model is \em unlocked, meaning + * information of the pipeline may be ingested. + * + * TODO: the analytic model currently takes a single scratchpad size, + * \a spsize. But probably it should take two: one for the vector + * unit, and one for the tensor unit. + */ + AnalyticModel( + const size_t spSize, + std::array< size_t, process_order > procSizes, + std::array< size_t, problem_order > probSizes, + std::array< bool, problem_order > iterAxes + ) : + totalSpace( spSize ), + processSizes( std::move( procSizes ) ), + problemSizes( std::move( probSizes ) ), + iterationAxes( std::move( iterAxes ) ), + largestSize( 0 ), largestStaticSize( 0 ), totalStaticSize( 0 ), + bufferSize( 0 ), numStages( 0 ), + lock( false ) + { + nDynamicAxes = 0; + for( unsigned int i = 0; i < problem_order; ++i ) { + if( iterationAxes[ i ] ) { + (void) ++nDynamicAxes; + } + blockLengths[ i ] = 0; + } + } + + /** + * Registers a buffer required by the pipeline. + * + * Buffers are not allowed to have dynamic dimensions. + * + * \warning This function does not check for violation of this requirement. 
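+		 *
+		 * For example, a two-byte scratch buffer that spans only the second,
+		 * static axis of a 2D problem mesh could be registered as follows
+		 * (illustrative only):
+		 *
+		 * \code
+		 * am.addBuffer( 2, { false, true } );
+		 * \endcode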
+ */ + void addBuffer( + const size_t elemSize, + const std::array< bool, problem_order > &tensor + ) noexcept { + assert( !lock ); + size_t curSize = elemSize; + for( unsigned int i = 0; i < problem_order; ++i ) { + if( tensor[ i ] ) { + curSize *= problemSizes[ i ]; + } + } + bufferSize += curSize; + } + + /** + * Registers a general tensor required by the pipeline. + * + * The given tensor is guaranteed smaller than some other tensor that has + * been, or will be, passed to #addGlobalTensor. + */ + void addMinorTensor( + const size_t elemSize, + const std::array< bool, problem_order > &tensor + ) noexcept { + assert( !lock ); + size_t staticSize = elemSize; + std::vector< unsigned int > dynamicAxes; + for( size_t i = 0; i < problem_order; ++i ) { + if( tensor[ i ] ) { + if( iterationAxes[ i ] ) { + dynamicAxes.push_back( i ); + } else { + staticSize *= problemSizes[ i ]; + } + } + } + totalStaticSize += staticSize; + tensors.push_back( std::make_pair( dynamicAxes, staticSize ) ); +#ifdef _DEBUG + std::cout << "Added minor tensor with " << elemSize << "-byte elements, " + << dynamicAxes.size() << " dynamic axes, and a static size of " + << staticSize << " bytes.\n"; +#endif + } + + /** + * Registers a general tensor required by the pipeline. + */ + void addGlobalTensor( + const size_t elemSize, + const std::array< bool, problem_order > &tensor + ) { + assert( !lock ); + size_t staticSize = elemSize; + std::vector< unsigned int > dynamicAxes; + for( size_t i = 0; i < problem_order; ++i ) { + if( tensor[ i ] ) { + if( iterationAxes[ i ] ) { + dynamicAxes.push_back( i ); + } else { + staticSize *= problemSizes[ i ]; + } + } + } + totalStaticSize += staticSize; + tensors.push_back( std::make_pair( dynamicAxes, staticSize ) ); + size_t globalSize = staticSize; + for( const unsigned int &axis : dynamicAxes ) { + globalSize *= problemSizes[ axis ]; + } +#ifdef _DEBUG + std::cout << "\tadded global tensor with elements of " << elemSize + << " bytes, with a globalSize of " << globalSize + << " bytes, while the current largest size is " << largestSize + << ", and #dynamic axes is " << dynamicAxes.size() + << "\n"; +#endif + if( globalSize > largestSize ) { + largestDynamicAxes = std::move( dynamicAxes ); + largestSize = globalSize; + largestStaticSize = staticSize; + } + } + + /** + * This is actually a place-holder for a mechanism that gives the analytic + * model more precise information on the stages in the pipeline. Rationale + * on why this is needed: some stages (AscendC operators) require work space + * buffers. + * + * @param[in] n The number of stages in the pipeline. + */ + void setNumStages( const size_t n ) { + numStages = n; + } + + /** + * Computes the block sizes suggested by the analytic model. + * + * Locks the analytic model. 
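+		 *
+		 * @param[in] axis The problem-mesh axis for which to return the block
+		 *                 length suggested by the model.
+		 *
+		 * @returns The suggested block length for \a axis; axes that are not
+		 *          iterated over keep a block length of one.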
+ */ + size_t getBlockSize( const unsigned int axis ) { + if( !lock ) { + computeBlockLengths(); + } + return blockLengths[ axis ]; + } + + }; + +} + +#endif + diff --git a/include/asclib/ascendlib.hpp b/include/asclib/ascendlib.hpp new file mode 100644 index 000000000..51059139e --- /dev/null +++ b/include/asclib/ascendlib.hpp @@ -0,0 +1,348 @@ +#include + +using namespace AscendC; + +namespace alp { + + __aicore__ inline int32_t max( const int32_t a, const int32_t b ) { + if( a > b) { + return a; + } + return b; + } + + __aicore__ inline int32_t RoundUp(int32_t a, int32_t b) { + return (a + b - 1) / b; + } + + __aicore__ inline int32_t computeBufferSize( const uint32_t max_n, const uint32_t data_size ) + { + // Initializing data required by temporary Tensors + int32_t ascend_el_per_blk = ONE_BLK_SIZE / data_size; + int32_t elementsPerRepeat = ONE_REPEAT_BYTE_SIZE / data_size; + int32_t firstMaxRepeat = max_n / elementsPerRepeat; + int32_t iter1OutputCount = firstMaxRepeat * 2; + int32_t tmpBufsColsReduce = RoundUp( iter1OutputCount, ascend_el_per_blk ) * ascend_el_per_blk; + int32_t totWorkSpaceSize = ( ascend_el_per_blk + tmpBufsColsReduce + max_n ); + + return totWorkSpaceSize; + } + + template< typename T3 = half, typename T1, typename T2 > + __aicore__ inline void DataMove( + T1 tensorOut, + T2 tensorIn, + const uint32_t blocklen + ) { + DataCopy( tensorOut, tensorIn, blocklen ); + } + + template< typename T3 = half, typename T1, typename T2 > + __aicore__ inline void DataMove( + T1 tensorOut, + T2 tensorIn, + const uint32_t nblocks, + const uint32_t blocklen, + const uint32_t src_stride, + const uint32_t dst_stride + ) { + DataCopyParams dcp; + dcp.blockCount = nblocks; + dcp.blockLen = sizeof( T3 ) * blocklen / 32 ; + dcp.srcStride = sizeof( T3 ) * ( src_stride - blocklen ) / 32; + dcp.dstStride = sizeof( T3 ) * ( dst_stride - blocklen ) / 32; + DataCopy( tensorOut, tensorIn, dcp ); + } + + // Bock (matrix) versions + + __aicore__ inline void BlockSet( + AscendC::LocalTensor< half > tensorOut, + half value, + const uint32_t nblocks, + const uint32_t blocklen + ) { + Duplicate( tensorOut, value, nblocks * blocklen ); + } + + __aicore__ inline void BlockSet( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorIn, + const uint32_t nblocks, + const uint32_t blocklen + ) { + DataCopy( tensorOut, tensorIn, nblocks * blocklen ); + } + + __aicore__ inline void BlockExp( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorIn, + const uint32_t nblocks, + const uint32_t blocklen + ) { + for( uint32_t k = 0; k < nblocks ; ++k ) { + Exp( tensorOut[ k * blocklen ], tensorIn[ k * blocklen ], blocklen ); + } + } + + __aicore__ inline void BlockReduceSum( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorIn, + AscendC::LocalTensor< half > Work, + const uint32_t nblocks, + const uint32_t blocklen + ) { + // for( uint32_t k = 0; k < nblocks ; ++k ) { + // ReduceSum( tensorOut[ k ], tensorIn[ k * blocklen ], Work, blocklen ); + // } + uint32_t repeat = nblocks; + uint32_t srcRepStride = blocklen; + srcRepStride = ( sizeof( half ) * srcRepStride ) / 32; + uint32_t nr = repeat/255; + if( repeat % 255 ) nr++; + for( uint32_t ir = 0; ir < nr ; ++ir ) { + uint32_t locrepeat = 255; + if( ir == nr - 1 ) locrepeat = repeat - ir * 255; + WholeReduceSum( + tensorOut[ ir * 255 ], + tensorIn[ ir * 255 * blocklen ], + blocklen, // mask + locrepeat, // repeat + 1, // dstStride + 1, // srcBlkStride + srcRepStride// srcRepStride + ); + 
} + + } + + __aicore__ inline void BlockReduceMax( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorIn, + AscendC::LocalTensor< half > Work, + const uint32_t nblocks, + const uint32_t blocklen + ) { +#ifdef ASCEND910B + uint32_t repeat = nblocks; + uint32_t srcRepStride = blocklen; + srcRepStride = ( sizeof( half ) * srcRepStride ) / 32; + uint32_t nr = repeat/255; + if( repeat % 255 ) nr++; + for( uint32_t ir = 0; ir < nr ; ++ir ) { + uint32_t locrepeat = 255; + if( ir == nr - 1 ) locrepeat = repeat - ir * 255; + WholeReduceMax( + tensorOut[ ir * 255 ], + tensorIn[ ir * 255 * blocklen ], + blocklen, // mask + locrepeat, // repeat + 1, // dstStride + 1, // srcBlkStride + srcRepStride, // srcRepStride + ReduceOrder::ORDER_ONLY_VALUE + ); + } +#else + // TODO replace with better + for( uint32_t k = 0; k < nblocks ; ++k ) { + ReduceMax( tensorOut[ k ], tensorIn[ k * blocklen ], Work, blocklen ); + } +#endif + } + + __aicore__ inline void BlockBcastMinus( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorInA, + AscendC::LocalTensor< half > tensorInB, + AscendC::LocalTensor< half > Work, + const uint32_t nblocks, + const uint32_t blocklen + ) { + for( uint32_t k = 0; k < nblocks ; ++k ) { + Duplicate( Work, tensorInB[ k ].GetValue( 0 ), blocklen ); // broadcast + Sub( tensorOut[ k * blocklen ], tensorInA[ k * blocklen ], Work, blocklen ); + } + } + + __aicore__ inline void BlockEwiseMinus( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorInA, + AscendC::LocalTensor< half > tensorInB, + const uint32_t nblocks, + const uint32_t blocklen + ) { + Sub( tensorOut, tensorInA, tensorInB, nblocks * blocklen ); + } + + __aicore__ inline void BlockEwiseSum( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorInA, + AscendC::LocalTensor< half > tensorInB, + const uint32_t nblocks, + const uint32_t blocklen + ) { + Add( tensorOut, tensorInA, tensorInB, nblocks * blocklen ); + } + + __aicore__ inline void BlockEwiseMax( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorInA, + AscendC::LocalTensor< half > tensorInB, + const uint32_t nblocks, + const uint32_t blocklen + ) { + Max( tensorOut, tensorInA, tensorInB, nblocks * blocklen ); + } + + __aicore__ inline void BlockBcastDivide( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorInA, + AscendC::LocalTensor< half > tensorInB, + AscendC::LocalTensor< half > Work, + const uint32_t nblocks, + const uint32_t blocklen + ) { + for( uint32_t k = 0; k < nblocks ; ++k ) { + Duplicate( Work, tensorInB[ k ].GetValue( 0 ), blocklen ); // broadcast + Div( tensorOut[ k * blocklen ], tensorInA[ k * blocklen ], Work, blocklen ); + } + } + + __aicore__ inline void BlockBcastMultiply( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorInA, + AscendC::LocalTensor< half > tensorInB, + AscendC::LocalTensor< half > Work, + const uint32_t nblocks, + const uint32_t blocklen + ) { + for( uint32_t k = 0; k < nblocks ; ++k ) { + Duplicate( Work, tensorInB[ k ].GetValue( 0 ), blocklen ); // broadcast + Mul( tensorOut[ k * blocklen ], tensorInA[ k * blocklen ], Work, blocklen ); + } + } + + __aicore__ inline void BlockEwiseMultiply( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorInA, + AscendC::LocalTensor< half > tensorInB, + const uint32_t nblocks, + const uint32_t blocklen + ) { + Mul( tensorOut, tensorInA, tensorInB, nblocks * blocklen ); + } + + // 
Vector versions + + __aicore__ inline void VectorSet( + AscendC::LocalTensor< half > tensorOut, + half value, + const uint32_t blocklen + ) { + Duplicate( tensorOut, value, blocklen ); + } + + __aicore__ inline void VectorSet( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorIn, + const uint32_t blocklen + ) { + DataCopy( tensorOut, tensorIn, blocklen ); + } + + __aicore__ inline void VectorExp( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorIn, + const uint32_t blocklen + ) { + Exp( tensorOut, tensorIn, blocklen ); + } + + __aicore__ inline void VectorReduceSum( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorIn, + AscendC::LocalTensor< half > Work, + const uint32_t blocklen + ) { + ReduceSum( tensorOut, tensorIn, Work, blocklen ); + } + + __aicore__ inline void VectorReduceMax( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorIn, + AscendC::LocalTensor< half > Work, + const uint32_t blocklen + ) { + ReduceMax( tensorOut, tensorIn, Work, blocklen ); + } + + __aicore__ inline void VectorBcastMinus( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorInA, + AscendC::LocalTensor< half > tensorInB, + AscendC::LocalTensor< half > Work, + const uint32_t blocklen + ) { + Duplicate( Work, tensorInB.GetValue( 0 ), blocklen ); // broadcast + Sub( tensorOut, tensorInA, Work, blocklen ); + } + + __aicore__ inline void VectorEwiseMinus( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorInA, + AscendC::LocalTensor< half > tensorInB, + const uint32_t blocklen + ) { + Sub( tensorOut, tensorInA, tensorInB, blocklen ); + } + + __aicore__ inline void VectorEwiseSum( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorInA, + AscendC::LocalTensor< half > tensorInB, + const uint32_t blocklen + ) { + Add( tensorOut, tensorInA, tensorInB, blocklen ); + } + + __aicore__ inline void VectorEwiseMax( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorInA, + AscendC::LocalTensor< half > tensorInB, + const uint32_t blocklen + ) { + Max( tensorOut, tensorInA, tensorInB, blocklen ); + } + + __aicore__ inline void VectorBcastDivide( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorInA, + AscendC::LocalTensor< half > tensorInB, + AscendC::LocalTensor< half > Work, + const uint32_t blocklen + ) { + Duplicate( Work, tensorInB.GetValue( 0 ), blocklen ); // broadcast + Div( tensorOut, tensorInA, Work, blocklen ); + } + + __aicore__ inline void VectorBcastMultiply( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorInA, + AscendC::LocalTensor< half > tensorInB, + AscendC::LocalTensor< half > Work, + const uint32_t blocklen + ) { + Duplicate( Work, tensorInB.GetValue( 0 ), blocklen ); // broadcast + Mul( tensorOut, tensorInA, Work, blocklen ); + } + + __aicore__ inline void VectorEwiseMultiply( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorInA, + AscendC::LocalTensor< half > tensorInB, + const uint32_t blocklen + ) { + Mul( tensorOut, tensorInA, tensorInB, blocklen ); + } +} diff --git a/include/graphblas/ascend/alloc.hpp b/include/graphblas/ascend/alloc.hpp new file mode 100644 index 000000000..d31123580 --- /dev/null +++ b/include/graphblas/ascend/alloc.hpp @@ -0,0 +1,65 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Allocator functions for the Ascend backend + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ALLOC_ASCEND +#define _H_GRB_ALLOC_ASCEND + +#include + +#include + +#include "config.hpp" + + +namespace grb { + + namespace utils { + + namespace internal { + + template<> + class Allocator< ascend > { + + private: + + /** Prevent initialisation. */ + Allocator(); + + public: + + /** Refer to the standard allocation mechanism. */ + typedef AllocatorFunctions< reference > functions; + + }; + + } // namespace internal + + } // namespace utils + +} // namespace grb + +#endif + diff --git a/include/graphblas/ascend/benchmark.hpp b/include/graphblas/ascend/benchmark.hpp new file mode 100644 index 000000000..0b1835671 --- /dev/null +++ b/include/graphblas/ascend/benchmark.hpp @@ -0,0 +1,95 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Ascend implementation of the benchmarker. + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ASCEND_BENCH +#define _H_GRB_ASCEND_BENCH + +#include +#include + +#include "exec.hpp" + + +namespace grb { + + /** + * The Benchmarker class is based on that of the reference backend + * + * \internal The public API simply wraps the reference Benchmarker. + */ + template< enum EXEC_MODE mode > + class Benchmarker< mode, ascend > { + + private: + + /** \internal Reuse reference benchmarker. */ + Benchmarker< mode, reference > ref; + + + public: + + /** \internal Mirror reference constructor. */ + Benchmarker( + size_t process_id = 0, + size_t nprocs = 1, + std::string hostname = "localhost", + std::string port = "0" + ) : + ref(process_id, nprocs, hostname, port) + {} + + /** \internal Mirror reference exec. */ + template< typename U > + RC exec( + void ( *grb_program )( const void *, const size_t, U & ), + const void * data_in, const size_t in_size, + U &data_out, + const size_t inner, const size_t outer, + const bool broadcast = false + ) const { + return ref.exec( + grb_program, data_in, in_size, data_out, inner, outer, broadcast + ); + } + + /** \internal Mirror reference exec. 
*/ + template< typename T, typename U > + RC exec( + void ( *grb_program )( const T &, U & ), + const T &data_in, U &data_out, + const size_t inner, + const size_t outer, + const bool broadcast = false + ) { + return ref.exec( grb_program, data_in, data_out, inner, outer, broadcast ); + } + + }; + +} // namespace grb + +#endif // end ``_H_GRB_ASCEND_BENCH'' + diff --git a/include/graphblas/ascend/blas1.hpp b/include/graphblas/ascend/blas1.hpp new file mode 100644 index 000000000..e10bffeeb --- /dev/null +++ b/include/graphblas/ascend/blas1.hpp @@ -0,0 +1,11500 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Level-1 primitive implementation for Ascend. + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ASCEND_BLAS1 +#define _H_GRB_ASCEND_BLAS1 + +#include //for printing to stderr +#include //for std::enable_if + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "coordinates.hpp" +#include "vector.hpp" +#include "vector_wrapper.hpp" +#include "boolean_dispatcher_blas1.hpp" + +#define NO_CAST_ASSERT( x, y, z ) \ + static_assert( x, \ + "\n\n" \ + "********************************************************************" \ + "********************************************************************" \ + "******************************\n" \ + "* ERROR | " y " " z ".\n" \ + "********************************************************************" \ + "********************************************************************" \ + "******************************\n" \ + "* Possible fix 1 | Remove no_casting from the template parameters " \ + "in this call to " y ".\n" \ + "* Possible fix 2 | Provide a value that matches the expected type.\n" \ + "********************************************************************" \ + "********************************************************************" \ + "******************************\n" ); + +#define NO_CAST_OP_ASSERT( x, y, z ) \ + static_assert( x, \ + "\n\n" \ + "********************************************************************" \ + "********************************************************************" \ + "******************************\n" \ + "* ERROR | " y " " z ".\n" \ + "********************************************************************" \ + "********************************************************************" \ + "******************************\n" \ + "* Possible fix 1 | Remove no_casting from the template parameters " \ + "in this call to " y ".\n" \ + "* Possible fix 2 | For all mismatches in the domains of input " \ + "parameters and the operator domains, as specified in the " \ + "documentation of the function " y ", supply an input argument of " \ + "the expected type instead.\n" \ + "* Possible fix 3 | Provide a compatible operator where all domains " \ + "match those of the input parameters, as specified in the " \ + "documentation of the function " y ".\n" \ + 
"********************************************************************" \ + "********************************************************************" \ + "******************************\n" ); + + +namespace grb { + + namespace internal { + + extern LazyEvaluation le; + + } + +} + +namespace grb { + + /** + * \defgroup BLAS1_NB The Level-1 ALP/GraphBLAS routines -- ascend backend + * + * @{ + */ + + namespace internal { + + template< + bool left, + class Monoid, + typename InputType, + class Coords + > + RC fold_from_vector_to_scalar_dense( + typename Monoid::D3 &thread_local_output, + const size_t lower_bound, + const size_t upper_bound, + const Vector< InputType, ascend, Coords > &to_fold, + const Monoid &monoid + ) { + const InputType *__restrict__ const raw = internal::getRaw( to_fold ); + + const size_t start = lower_bound; + const size_t end = upper_bound; + + if( start < end ) { + if( left ) { + monoid.getOperator().foldlArray( + thread_local_output, raw + start, end - start ); + } else { + monoid.getOperator().foldrArray( + raw + start, thread_local_output, end - start ); + } + } + assert( false ); + return UNSUPPORTED; + } + + template< + Descriptor descr, + bool masked, + bool left, +#ifdef GRB_BOOLEAN_DISPATCHER + bool already_dense_input_to_fold, + bool already_dense_mask, +#endif + class Monoid, + typename InputType, + typename MaskType, + class Coords + > + RC fold_from_vector_to_scalar_vectorDriven( +#ifndef GRB_BOOLEAN_DISPATCHER + bool already_dense_input_to_fold, + bool already_dense_mask, +#endif + typename Monoid::D3 &thread_local_output, + const size_t lower_bound, + const size_t upper_bound, + const Coords &local_to_fold, + const Coords &local_mask, + const Vector< InputType, ascend, Coords > &to_fold, + const Vector< MaskType, ascend, Coords > &mask, + const Monoid &monoid + ) { + const size_t n = internal::getCoordinates( to_fold ).size(); + const size_t local_n = upper_bound - lower_bound; + const size_t local_to_fold_nz = ( already_dense_input_to_fold ) + ? local_n + : local_to_fold.nonzeroes(); + + assert( n > 0 ); + assert( !masked || internal::getCoordinates( mask ).size() == n ); + +#ifdef NDEBUG + (void) n; + (void) local_n; +#endif + + RC ret = SUCCESS; + + const size_t start = 0; + const size_t end = local_to_fold_nz; + + // compute thread-local partial reduction + for( size_t k = start; k < end; ++k ) { + const size_t i = ( (already_dense_input_to_fold) + ? 
k + : local_to_fold.index( k ) ) + lower_bound; + if( masked ) { + if( already_dense_mask ) { + if( !utils::interpretMask< descr >( + internal::getCoordinates( mask ).assigned( i ), + internal::getRaw( mask ), i ) + ) { + continue; + } + } else { + if( !utils::interpretMask< descr >( + local_mask.assigned( i - lower_bound ), internal::getRaw( mask ), i ) + ) { + continue; + } + } + } + RC local_rc; + if( left ) { + local_rc = foldl< descr >( thread_local_output, + internal::getRaw( to_fold )[ i ], monoid.getOperator() ); + } else { + local_rc = foldr< descr >( internal::getRaw( to_fold )[ i ], + thread_local_output, monoid.getOperator() ); + } + assert( local_rc == SUCCESS ); + if( local_rc != SUCCESS ) { + ret = local_rc; + } + } + + return ret; + } + + template< + Descriptor descr, + bool left, +#ifdef GRB_BOOLEAN_DISPATCHER + bool already_dense_input_to_fold, + bool already_dense_mask, +#endif + class Monoid, + typename InputType, + typename MaskType, + class Coords + > + RC fold_from_vector_to_scalar_maskDriven( +#ifndef GRB_BOOLEAN_DISPATCHER + bool already_dense_input_to_fold, + bool already_dense_mask, +#endif + typename Monoid::D3 &thread_local_output, + const size_t lower_bound, + const size_t upper_bound, + const Coords &local_to_fold, + const Coords &local_mask, + const Vector< InputType, ascend, Coords > &to_fold, + const Vector< MaskType, ascend, Coords > &mask, + const Monoid &monoid + ) { + const size_t n = internal::getCoordinates( to_fold ).size(); + + assert( internal::getCoordinates( mask ).size() == n ); + assert( n > 0 ); +#ifdef NDEBUG + (void) n; +#endif + const size_t local_n = upper_bound - lower_bound; + const size_t local_mask_nz = ( already_dense_mask ) + ? local_n + : local_mask.nonzeroes(); + + RC ret = SUCCESS; + + const size_t start = 0; + const size_t end = local_mask_nz; + + // compute thread-local partial reduction + for( size_t k = start; k < end; ++k ) { + const size_t i = ( (already_dense_mask) + ? k + : local_mask.index( k ) + ) + lower_bound; + if( !( already_dense_input_to_fold || + local_to_fold.assigned( i - lower_bound ) ) + ) { + continue; + } + if( !utils::interpretMask< descr >( true, internal::getRaw( mask ), i ) ) { + continue; + } + RC local_rc; + if( left ) { + local_rc = foldl< descr >( thread_local_output, + internal::getRaw( to_fold )[ i ], monoid.getOperator() ); + } else { + local_rc = foldr< descr >( internal::getRaw( to_fold )[ i ], + thread_local_output, monoid.getOperator() ); + } + assert( local_rc == SUCCESS ); + if( local_rc != SUCCESS ) { + ret = local_rc; + } + } + + return ret; + } + + template< + Descriptor descr, + bool masked, + bool left, +#ifdef GRB_BOOLEAN_DISPATCHER + bool already_dense_input_to_fold, + bool already_dense_mask, +#endif + class Monoid, + typename InputType, + typename MaskType, + class Coords + > + RC fold_from_vector_to_scalar_fullLoopSparse( +#ifndef GRB_BOOLEAN_DISPATCHER + bool already_dense_input_to_fold, + bool already_dense_mask, +#endif + typename Monoid::D3 &thread_local_output, + const size_t lower_bound, + const size_t upper_bound, + const Coords &local_to_fold, + const Coords &local_mask, + const Vector< InputType, ascend, Coords > &to_fold, + const Vector< MaskType, ascend, Coords > &mask, + const Monoid &monoid + ) { +#ifdef _DEBUG + std::cout << "Entered fold_from_vector_to_scalar_fullLoopSparse\n"; +#endif + +#ifndef NDEBUG + const size_t n = internal::getCoordinates( to_fold ).size(); + const size_t local_n = already_dense_input_to_fold + ? 
upper_bound - lower_bound + : local_to_fold.size(); + assert( local_n > 0 ); + + (void) n; +#endif + RC ret = SUCCESS; + + size_t i = lower_bound; + const size_t end = upper_bound; + + // some sanity checks + assert( i <= end ); + assert( end <= n ); + + // assume current i needs to be processed, forward until we find an index + // for which the mask evaluates true + bool process_current_i = true; + if( masked && i < end ) { + process_current_i = utils::interpretMask< descr >( + already_dense_mask + ? internal::getCoordinates( mask ).assigned( i ) + : local_mask.assigned( i - lower_bound ), + internal::getRaw( mask ), i ) && ( + already_dense_input_to_fold || local_to_fold.assigned( i - lower_bound ) + ); + // if not + while( !process_current_i ) { + // forward to next element + (void) ++i; + // check that we are within bounds + if( i == end ) { + break; + } + // evaluate whether we should process this i-th element + process_current_i = utils::interpretMask< descr >( + already_dense_mask + ? internal::getCoordinates( mask ).assigned( i ) + : local_mask.assigned( i - lower_bound ), + internal::getRaw( mask ), i ) && ( + already_dense_input_to_fold || local_to_fold.assigned( i - lower_bound ) + ); + } + } + + if( !masked && i < end ) { + process_current_i = local_to_fold.assigned( i - lower_bound ); + while( !process_current_i ) { + (void) ++i; + if( i == end ) { + break; + } + process_current_i = already_dense_input_to_fold || + local_to_fold.assigned( i - lower_bound ); + } + } + +#ifndef NDEBUG + if( i < end ) { + assert( i < n ); + } +#endif + + // declare thread-local variable and set our variable to the first value in + // our block + typename Monoid::D3 local = + monoid.template getIdentity< typename Monoid::D3 >(); + if( end > 0 ) { + if( i < end ) { +#ifdef _DEBUG + std::cout << "\t processing start index " << i << "\n"; +#endif + local = static_cast< typename Monoid::D3 >( + internal::getRaw( to_fold )[ i ] ); + } + } + + // if we have more values to fold + if( i + 1 < end ) { + + // keep going until we run out of values to fold + while( true ) { + + // forward to next variable + (void) ++i; + + // forward more (possibly) if in the masked case + if( masked && i < end ) { + assert( i < n ); + process_current_i = utils::interpretMask< descr >( + already_dense_mask + ? internal::getCoordinates( mask ).assigned( i ) + : local_mask.assigned( i - lower_bound ), + internal::getRaw( mask ), i + ) && ( + already_dense_input_to_fold || + local_to_fold.assigned( i - lower_bound ) + ); + while( !process_current_i ) { + (void) ++i; + if( i == end ) { + break; + } + assert( i < end ); + assert( i < n ); + process_current_i = utils::interpretMask< descr >( + already_dense_mask + ? internal::getCoordinates( mask ).assigned( i ) + : local_mask.assigned( i - lower_bound ), + internal::getRaw( mask ), i + ) && ( + already_dense_input_to_fold || + local_to_fold.assigned( i - lower_bound ) + ); + } + } + if( !masked && i < end ) { + assert( i < n ); + process_current_i = already_dense_input_to_fold || + local_to_fold.assigned( i - lower_bound ); + while( !process_current_i ) { + (void) ++i; + if( i == end ) { + break; + } + assert( i < end ); + assert( i < n ); + process_current_i = already_dense_input_to_fold || + local_to_fold.assigned( i - lower_bound ); + } + } + + // stop if past end + if( i >= end ) { + break; + } + +#ifdef _DEBUG + std::cout << "\t processing index " << i << "\n"; +#endif + + // do fold + assert( i < n ); + if( left ) { + ret = ret ? 
ret : foldl< descr >( local, internal::getRaw( to_fold )[ i ], + monoid.getOperator() ); + } else { + ret = ret ? ret : foldr< descr >( internal::getRaw( to_fold )[ i ], local, + monoid.getOperator() ); + } + assert( ret == SUCCESS ); + + if( ret != SUCCESS ) { + break; + } + } + } + + if( left ) { + ret = ret ? ret : foldl< descr >( thread_local_output, local, + monoid.getOperator() ); + } else { + ret = ret ? ret : foldr< descr >( local, thread_local_output, + monoid.getOperator() ); + } + assert( ret == SUCCESS ); + + return ret; + } + + /** + * Dispatches to any of the four above variants depending on asymptotic cost + * analysis. + */ + template< + Descriptor descr = descriptors::no_operation, + bool masked, + bool left, // if this is false, assumes right-looking fold + class Monoid, + typename IOType, + typename InputType, + typename MaskType, + typename Coords + > + RC fold_from_vector_to_scalar_generic( + IOType &fold_into, + const Vector< InputType, ascend, Coords > &to_fold, + const Vector< MaskType, ascend, Coords > &mask, + const Monoid &monoid + ) { + // static sanity checks + static_assert( grb::is_monoid< Monoid >::value, + "grb::foldl can only be called using monoids. This " + "function should not have been called-- please submit a " + "bugreport." ); + + const size_t n = internal::getCoordinates( to_fold ).size(); + + // mask must be of equal size as input vector + if( masked && n != size( mask ) ) { + return MISMATCH; + } + + // handle trivial cases + if( n == 0 ) { + return SUCCESS; + } + + // some globals used during the folding + RC ret = SUCCESS; + typename Monoid::D3 global = + monoid.template getIdentity< typename Monoid::D3 >(); + + size_t local_reduced_size = sysconf( _SC_NPROCESSORS_ONLN ) * + config::CACHE_LINE_SIZE::value(); + IOType local_reduced[ local_reduced_size ]; + + for( + size_t i = 0; + i < local_reduced_size; + i += config::CACHE_LINE_SIZE::value() + ) { + local_reduced[ i ] = monoid.template getIdentity< typename Monoid::D3 >(); + } + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = + [&to_fold, &mask, &monoid, &local_reduced] ( + internal::Pipeline &pipeline, + const size_t lower_bound, + const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage fold_from_vector_to_scalar_generic " + "in the range(" << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC ret = SUCCESS; + + Coords local_to_fold, local_mask; + size_t local_n = upper_bound - lower_bound; + size_t local_to_fold_nz = local_n; + size_t local_mask_nz = local_n; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + + bool already_dense_input_to_fold = true; + bool already_dense_mask = true; + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_input_to_fold = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( to_fold ) ); + if( !already_dense_input_to_fold ) { +#else + already_dense_input_to_fold = false; +#endif + local_to_fold = internal::getCoordinates( to_fold ).asyncSubset( + lower_bound, upper_bound ); + local_to_fold_nz = local_to_fold.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + if( masked ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_mask = pipeline.containsAlreadyDenseVector( + 
&internal::getCoordinates( mask ) ); + if( !already_dense_mask ) { +#else + already_dense_mask = false; +#endif + local_mask = internal::getCoordinates( mask ).asyncSubset( lower_bound, + upper_bound ); + local_mask_nz = local_mask.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + } + + unsigned int thread_id = omp_get_thread_num() * + config::CACHE_LINE_SIZE::value(); + + // dispatch, dense variant + if( ( (descr & descriptors::dense) || local_to_fold_nz == local_n ) && ( + !masked || ( + (descr & descriptors::structural) && + !(descr & descriptors::invert_mask) && + local_mask_nz == local_n + ) + ) + ) { +#ifdef _DEBUG + std::cout << "\t dispatching to dense variant\n"; +#endif + ret = fold_from_vector_to_scalar_dense< left >( + local_reduced[ thread_id ], lower_bound, upper_bound, to_fold, monoid ); + } else if( masked && (descr & descriptors::invert_mask ) ) { + // in this case we are forced to dispatch to O(n) +#ifdef _DEBUG + std::cout << "\t forced dispatch to O(n) sparse variant\n"; +#endif + +#ifdef GRB_BOOLEAN_DISPATCHER + ret = boolean_dispatcher_fold_from_vector_to_scalar_fullLoopSparse< +#else + ret = fold_from_vector_to_scalar_fullLoopSparse< +#endif + descr, true, left + >( + already_dense_input_to_fold, already_dense_mask, + local_reduced[ thread_id ], lower_bound, upper_bound, + local_to_fold, local_mask, to_fold, mask, monoid + ); + } else { + constexpr const size_t threeWs = + sizeof( typename Coords::StackType ) + + sizeof( typename Coords::ArrayType ) + + MaskWordSize< descr, MaskType >::value; + const size_t fullLoop = masked + ? 2 * sizeof( typename Coords::ArrayType ) * local_n + + sizeof( MaskType ) * local_mask_nz + : sizeof( typename Coords::ArrayType ) * local_n; + const size_t vectorLoop = masked + ? threeWs * local_to_fold_nz + : sizeof( typename Coords::StackType ) * local_to_fold_nz; + const size_t maskLoop = masked + ? 
threeWs * local_mask_nz + : std::numeric_limits< size_t >::max(); + if( fullLoop >= vectorLoop && maskLoop >= vectorLoop ) { +#ifdef _DEBUG + std::cout << "\t dispatching to vector-driven sparse variant\n"; +#endif + +#ifdef GRB_BOOLEAN_DISPATCHER + ret = boolean_dispatcher_fold_from_vector_to_scalar_vectorDriven< +#else + ret = fold_from_vector_to_scalar_vectorDriven< +#endif + descr, masked, left + >( + already_dense_input_to_fold, already_dense_mask, + local_reduced[ thread_id ], lower_bound, upper_bound, + local_to_fold, local_mask, to_fold, mask, monoid + ); + } else if( vectorLoop >= fullLoop && maskLoop >= fullLoop ) { +#ifdef _DEBUG + std::cout << "\t dispatching to O(n) sparse variant\n"; +#endif + +#ifdef GRB_BOOLEAN_DISPATCHER + ret = boolean_dispatcher_fold_from_vector_to_scalar_fullLoopSparse< +#else + ret = fold_from_vector_to_scalar_fullLoopSparse< +#endif + descr, masked, left + >( + already_dense_input_to_fold, already_dense_mask, + local_reduced[ thread_id ], lower_bound, upper_bound, + local_to_fold, local_mask, to_fold, mask, monoid + ); + } else { + assert( maskLoop < fullLoop && maskLoop < vectorLoop ); + assert( masked ); +#ifdef _DEBUG + std::cout << "\t dispatching to mask-driven sparse variant\n"; +#endif + +#ifdef GRB_BOOLEAN_DISPATCHER + ret = boolean_dispatcher_fold_from_vector_to_scalar_maskDriven< +#else + ret = fold_from_vector_to_scalar_maskDriven< +#endif + descr, left + >( + already_dense_input_to_fold, already_dense_mask, + local_reduced[ thread_id ], lower_bound, upper_bound, + local_to_fold, local_mask, to_fold, mask, monoid + ); + } + } + + return ret; + }; + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: " + << "fold_from_vector_to_scalar_generic" << std::endl; +#endif + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_FOLD_VECTOR_SCALAR_GENERIC, + n, + sizeof( IOType ), + dense_descr, + true, + nullptr, nullptr, nullptr, nullptr, + &to_fold, + ( masked ) ? &mask : nullptr, + nullptr, + nullptr, + &internal::getCoordinates( to_fold ), + (masked) ? 
&internal::getCoordinates( mask ) : nullptr, + nullptr, + nullptr, + nullptr + ); + + if( ret == SUCCESS ) { + for( + size_t i = 0; + i < local_reduced_size; + i += config::CACHE_LINE_SIZE::value() + ) { + RC rc; + if( left ) { + rc = foldl< descr >( global, local_reduced[ i ], monoid.getOperator() ); + } else { + rc = foldr< descr >( local_reduced[ i ], global, monoid.getOperator() ); + } + assert( rc == SUCCESS ); + if( rc != SUCCESS ) { + ret = rc; + } + } + } + + // accumulate +#ifdef _DEBUG + std::cout << "\t accumulating " << global << " into " << fold_into << "\n"; +#endif + + if( ret == SUCCESS ) { + if( left ) { + ret = foldl< descr >( fold_into, global, monoid.getOperator() ); + } else { + ret = foldr< descr >( global, fold_into, monoid.getOperator() ); + } + } + + return ret; + } + + /** + * \internal + * @tparam left If false, right-looking fold is assumed (and left-looking + * otherwise) + * @tparam sparse Whether \a vector was sparse + * @tparam monoid Whether \a op is actually a monoid + * \endinternal + */ + template< + Descriptor descr, + bool left, + bool sparse, + bool masked, + bool monoid, +#ifdef GRB_BOOLEAN_DISPATCHER + bool already_dense_output, + bool already_dense_mask, +#endif + typename MaskType, + typename IOType, + typename InputType, + typename Coords, + class OP + > + RC fold_from_scalar_to_vector_generic( +#ifndef GRB_BOOLEAN_DISPATCHER + bool already_dense_output, + bool already_dense_mask, +#endif + const size_t lower_bound, + const size_t upper_bound, + Coords &local_vector, + const Coords * const local_mask_ptr, + Vector< IOType, ascend, Coords > &vector, + const Vector< MaskType, ascend, Coords > * const mask, + const InputType &scalar, + const OP &op, + const Phase &phase + ) { + constexpr const bool dense_descr = descr & descriptors::dense; + assert( !masked || mask != nullptr ); + assert( !masked || local_mask_ptr != nullptr ); + + Coords local_mask; + if( masked ) { + local_mask = *local_mask_ptr; + } + + const size_t local_n = upper_bound - lower_bound; + const size_t local_vector_nz = (sparse || !already_dense_output) + ? local_vector.nonzeroes() : local_n; + const size_t local_mask_nz = ( masked ) + ? ( ( already_dense_mask ) + ? local_n + : local_mask.nonzeroes() + ) + : 0; + + const size_t n = internal::getCoordinates( vector ).size(); + + if( masked && internal::getCoordinates( *mask ).size() != n ) { + return MISMATCH; + } + if( dense_descr && sparse ) { + return ILLEGAL; + } + if( n == 0 ) { + return SUCCESS; + } + if( phase == RESIZE ) { + return SUCCESS; + } + + assert( phase == EXECUTE ); + IOType * __restrict__ const x = internal::getRaw( vector ); + const MaskType * __restrict__ const m = ( masked ) + ? internal::getRaw( *mask ) + : nullptr; + + if( sparse && monoid && !masked ) { + for( size_t i = lower_bound; i < upper_bound; ++i ) { + if( already_dense_output || local_vector.assigned( i - lower_bound ) ) { + if( left ) { + (void) foldl< descr >( x[ i ], scalar, op ); + } else { + (void) foldr< descr >( scalar, x[ i ], op ); + } + } else { + x[ i ] = static_cast< IOType >( scalar ); + } + } + + if( !already_dense_output ) { + local_vector.local_assignAllNotAlreadyAssigned(); + } + } else if( sparse && monoid && masked ) { + for( size_t i = 0; i < local_mask_nz; ++i ) { + const size_t index = ( ( already_dense_mask ) + ? 
i + : local_mask.index( i ) ) + lower_bound; + if( already_dense_mask ) { + if( !internal::getCoordinates( *mask ).template mask< descr >( + index, m ) + ) { + continue; + } + } else { + if( !local_mask.template mask< descr >( index - lower_bound, + m + lower_bound ) + ) { + continue; + } + } + if( already_dense_output || local_vector.assign( index - lower_bound ) ) { + if( left ) { + (void) foldl< descr >( x[ index ], scalar, op ); + } else { + (void) foldr< descr >( scalar, x[ index ], op ); + } + } else { + x[ index ] = static_cast< IOType >( scalar ); + } + } + } else if( sparse && !monoid ) { + const bool maskDriven = masked ? local_mask_nz < local_vector_nz : false; + if( maskDriven ) { + for( size_t i = 0; i < local_mask_nz; ++i ) { + const size_t index = ( ( already_dense_mask ) + ? i + : local_mask.index( i ) ) + lower_bound; + if( already_dense_mask ) { + if( !internal::getCoordinates( *mask ).template mask< descr >( + index, m ) + ) { + continue; + } + } else { + if( !local_mask.template mask< descr >( index - lower_bound, + m + lower_bound ) + ) { + continue; + } + } + if( already_dense_output || local_vector.assign( index - lower_bound ) ) { + if( left ) { + (void) foldl< descr >( x[ index ], scalar, op ); + } else { + (void) foldr< descr >( scalar, x[ index ], op ); + } + } + } + } else { + for( size_t i = 0; i < local_vector_nz; ++i ) { + const size_t index = (already_dense_output + ? i + : local_vector.index( i ) + ) + lower_bound; + if( masked ) { + if( already_dense_mask ) { + if( !( internal::getCoordinates( *mask ).template mask< descr >( + index, m ) ) + ) { + continue; + } + } else { + if( !local_mask.template mask< descr >( index - lower_bound, m + + lower_bound ) + ) { + continue; + } + } + } + if( left ) { + (void) foldl< descr >( x[ index ], scalar, op ); + } else { + (void) foldr< descr >( scalar, x[ index ], op ); + } + } + } + } else if( !sparse && masked ) { + for( size_t i = 0; i < local_mask_nz; ++i ) { + const size_t index = ( ( already_dense_mask ) + ? i + : local_mask.index( i ) ) + lower_bound; + if( already_dense_mask ) { + if( !( internal::getCoordinates( *mask ).template mask< descr >( + index, m ) ) + ) { + continue; + } + } else { + if( !local_mask.template mask< descr >( index - lower_bound, m + + lower_bound ) + ) { + continue; + } + } + + if( left ) { + (void) foldl< descr >( x[ index ], scalar, op ); + } else { + (void) foldr< descr >( scalar, x[ index ], op ); + } + } + } else { + // if target vector is dense and there is no mask, then + // there is no difference between monoid or non-monoid behaviour. 
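+				// in this dense, unmasked case the fold reduces to the operator's
+				// array-scalar fold variants applied over the local range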
+ assert( !sparse ); + assert( !masked ); + assert( local_vector_nz == local_n ); + + if( local_n > 0 ) { + if( left ) { + op.eWiseFoldlAS( x + lower_bound, scalar, local_n ); + } else { + op.eWiseFoldrSA( scalar, x + lower_bound, local_n ); + } + } + } + + assert( false ); + return UNSUPPORTED; + } + + template< + Descriptor descr, + bool left, // if this is false, the right-looking fold is assumed + bool sparse, + bool masked, + bool monoid, +#ifdef GRB_BOOLEAN_DISPATCHER + bool already_dense_output, + bool already_dense_input_to_fold, + bool already_dense_mask, +#endif + typename MaskType, + typename IOType, + typename IType, + typename Coords, + class OP + > + RC fold_from_vector_to_vector_generic( +#ifndef GRB_BOOLEAN_DISPATCHER + bool already_dense_output, + bool already_dense_input_to_fold, + bool already_dense_mask, +#endif + const size_t lower_bound, + const size_t upper_bound, + Coords &local_fold_into, + const Coords * const local_m_ptr, + const Coords &local_to_fold, + Vector< IOType, ascend, Coords > &fold_into, + const Vector< MaskType, ascend, Coords > * const m, + const Vector< IType, ascend, Coords > &to_fold, + const OP &op, + const Phase phase + ) { + constexpr const bool dense_descr = descr & descriptors::dense; + assert( !masked || (m != nullptr) ); + + Coords local_m; + if( masked && !already_dense_mask ) { + local_m = *local_m_ptr; + } + + const size_t local_n = upper_bound - lower_bound; + const size_t local_fold_into_nz = already_dense_output + ? local_n + : local_fold_into.nonzeroes(); + const size_t local_to_fold_nz = already_dense_input_to_fold + ? local_n + : local_to_fold.nonzeroes(); + const size_t local_m_nz = ( masked ) + ? ( already_dense_mask + ? local_n + : local_m.nonzeroes() + ) + : 0; + + const size_t n = size( fold_into ); + if( n != size( to_fold ) ) { + return MISMATCH; + } + if( masked && size( *m ) != n ) { + return MISMATCH; + } + if( dense_descr && sparse ) { + return ILLEGAL; + } + if( phase == RESIZE ) { + return SUCCESS; + } + + assert( phase == EXECUTE ); + + if( !sparse && !masked ) { +#ifdef _DEBUG + std::cout << "fold_from_vector_to_vector_generic: in dense variant\n"; +#endif + +#ifdef _DEBUG + std::cout << "fold_from_vector_to_vector_generic: in sequential variant\n"; +#endif + if( left ) { + op.eWiseFoldlAA( internal::getRaw( fold_into ) + lower_bound, + internal::getRaw( to_fold ) + lower_bound, local_n ); + } else { + op.eWiseFoldrAA( internal::getRaw( to_fold ) + lower_bound, + internal::getRaw( fold_into ) + lower_bound, local_n ); + } + } else { +#ifdef _DEBUG + std::cout << "fold_from_vector_to_vector_generic: in sparse variant\n"; + std::cout << "\tfolding vector of " << local_to_fold_nz << " nonzeroes " + << "into a vector of " << local_fold_into_nz << " nonzeroes...\n"; +#endif + if( + masked && + local_fold_into_nz == local_n && + local_to_fold_nz == local_n + ) { + // use sparsity structure of mask for this eWiseFold + if( left ) { +#ifdef _DEBUG + std::cout << "fold_from_vector_to_vector_generic: foldl, using the " + << "mask's sparsity structure\n"; +#endif + for( size_t k = 0; k < local_m_nz; ++k ) { + const size_t i = ( already_dense_mask + ? 
k + : local_m.index( k ) + ) + lower_bound; +#ifdef _DEBUG + std::cout << "Left-folding " << to_fold[ i ] << " into " + << fold_into[ i ]; +#endif + (void) foldl< descr >( fold_into[ i ], to_fold[ i ], op ); +#ifdef _DEBUG + std::cout << " resulting into " << fold_into[ i ] << "\n"; +#endif + } + } else { +#ifdef _DEBUG + std::cout << "fold_from_vector_to_vector_generic: foldl, using the " + << "mask's sparsity structure\n"; +#endif + for( size_t k = 0; k < local_m_nz; ++k ) { + const size_t i = ( already_dense_mask + ? k + : local_m.index( k ) + ) + lower_bound; +#ifdef _DEBUG + std::cout << "Right-folding " << to_fold[ i ] << " into " + << fold_into[ i ]; +#endif + (void) foldr< descr >( to_fold[ i ], fold_into[ i ], op ); +#ifdef _DEBUG + std::cout << " resulting into " << fold_into[ i ] << "\n"; +#endif + } + } + } else if( !masked && local_fold_into_nz == local_n ) { + // use sparsity structure of to_fold for this eWiseFold + if( left ) { +#ifdef _DEBUG + std::cout << "fold_from_vector_to_vector_generic: foldl, using " + << "to_fold's sparsity\n"; +#endif + for( size_t k = 0; k < local_to_fold_nz; ++k ) { + const size_t i = ( already_dense_input_to_fold + ? k + : local_to_fold.index( k ) + ) + lower_bound; +#ifdef _DEBUG + std::cout << "Left-folding " << to_fold[ i ] << " into " + << fold_into[ i ]; +#endif + (void) foldl< descr >( fold_into[ i ], to_fold[ i ], op ); +#ifdef _DEBUG + std::cout << " resulting into " << fold_into[ i ] << "\n"; +#endif + } + } else { +#ifdef _DEBUG + std::cout << "fold_from_vector_to_vector_generic: foldl, using " + << "to_fold's sparsity\n"; +#endif + for( size_t k = 0; k < local_to_fold_nz; ++k ) { + const size_t i = ( already_dense_input_to_fold + ? k + : local_to_fold.index( k ) + ) + lower_bound; +#ifdef _DEBUG + std::cout << "Right-folding " << to_fold[ i ] << " into " + << fold_into[ i ]; +#endif + (void) foldr< descr >( to_fold[ i ], fold_into[ i ], op ); +#ifdef _DEBUG + std::cout << " resulting into " << fold_into[ i ] << "\n"; +#endif + } + } + } else if( !masked && local_to_fold_nz == local_n ) { + // use sparsity structure of fold_into for this eWiseFold + if( left ) { +#ifdef _DEBUG + std::cout << "fold_from_vector_to_vector_generic: foldl, using " + << "fold_into's sparsity\n"; +#endif + for( size_t k = 0; k < local_fold_into_nz; ++k ) { + const size_t i = ( already_dense_output + ? k + : local_fold_into.index( k ) + ) + lower_bound; +#ifdef _DEBUG + std::cout << "Left-folding " << to_fold[ i ] << " into " + << fold_into[ i ]; +#endif + (void) foldl< descr >( fold_into[ i ], to_fold[ i ], op ); +#ifdef _DEBUG + std::cout << " resulting into " << fold_into[ i ] << "\n"; +#endif + } + } else { +#ifdef _DEBUG + std::cout << "fold_from_vector_to_vector_generic: foldr, using " + << "fold_into's sparsity\n"; +#endif + for( size_t k = 0; k < local_fold_into_nz; ++k ) { + const size_t i = ( already_dense_output ? 
+ k : + local_fold_into.index( k ) + ) + lower_bound; +#ifdef _DEBUG + std::cout << "Right-folding " << to_fold[ i ] << " into " << fold_into[ i ]; +#endif + (void) foldr< descr >( to_fold[ i ], fold_into[ i ], op ); +#ifdef _DEBUG + std::cout << " resulting into " << fold_into[ i ] << "\n"; +#endif + } + } + } else { +#ifdef _DEBUG + std::cout << "fold_from_vector_to_vector_generic: using specialised " + << "code to merge two sparse vectors and, potentially, " + << "output masks\n"; +#endif + const IType * __restrict__ const tf_raw = internal::getRaw( to_fold ); + IOType * __restrict__ const fi_raw = internal::getRaw( fold_into ); +#ifdef _DEBUG + std::cout << "\tin sequential version...\n"; +#endif + for( size_t k = 0; k < local_to_fold_nz; ++k ) { + const size_t i = ( already_dense_input_to_fold + ? k + : local_to_fold.index( k ) + ) + lower_bound; + if( masked ) { + if( already_dense_mask ) { + if( !internal::getCoordinates( *m ).template mask< descr >( i, + internal::getRaw( *m ) ) + ) { + continue; + } + } else { + if( !local_m.template mask< descr >( i - lower_bound, + internal::getRaw( *m ) + lower_bound ) + ) { + continue; + } + } + } + + assert( i < n ); + if( already_dense_output || + local_fold_into.assigned( i - lower_bound ) + ) { + if( left ) { +#ifdef _DEBUG + std::cout << "\tfoldl< descr >( fi_raw[ i ], tf_raw[ i ], op ), i = " + << i << ": " << tf_raw[ i ] << " goes into " << fi_raw[ i ]; +#endif + (void)foldl< descr >( fi_raw[ i ], tf_raw[ i ], op ); +#ifdef _DEBUG + std::cout << " which results in " << fi_raw[ i ] << "\n"; +#endif + } else { +#ifdef _DEBUG + std::cout << "\tfoldr< descr >( tf_raw[ i ], fi_raw[ i ], op ), i = " + << i << ": " << tf_raw[ i ] << " goes into " << fi_raw[ i ]; +#endif + (void) foldr< descr >( tf_raw[ i ], fi_raw[ i ], op ); +#ifdef _DEBUG + std::cout << " which results in " << fi_raw[ i ] << "\n"; +#endif + } + } else if( monoid ) { +#ifdef _DEBUG + std::cout << "\tindex " << i << " is unset. Old value " << fi_raw[ i ] + << " will be overwritten with " << tf_raw[ i ] << "\n"; +#endif + fi_raw[ i ] = tf_raw[ i ]; + (void) local_fold_into.assign( i - lower_bound ); + } + } + } + } + +#ifdef _DEBUG + std::cout << "\tCall to fold_from_vector_to_vector_generic done. 
" + << "Output now contains " << local_fold_into_nz << " / " + << local_n << " nonzeroes.\n"; +#endif + assert( false ); + return UNSUPPORTED; + } + + } // namespace internal + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename InputType, + typename IOType, + typename MaskType, + typename Coords + > + RC foldr( + const Vector< InputType, ascend, Coords > &x, + const Vector< MaskType, ascend, Coords > &mask, + IOType &beta, + const Monoid &monoid = Monoid(), + const typename std::enable_if< + !grb::is_object< InputType >::value && + !grb::is_object< IOType >::value && + !grb::is_object< MaskType >::value && + grb::is_monoid< Monoid >::value, + void + >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< IOType, InputType >::value ), "grb::foldr", + "called with a scalar IO type that does not match the input vector type" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< InputType, typename Monoid::D1 >::value ), "grb::foldr", + "called with an input vector value type that does not match the first " + "domain of the given monoid" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< InputType, typename Monoid::D2 >::value ), "grb::foldr", + "called with an input vector type that does not match the second domain of " + "the given monoid" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< InputType, typename Monoid::D3 >::value ), "grb::foldr", + "called with an input vector type that does not match the third domain of " + "the given monoid" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), "grb::foldr", + "called with a vector mask type that is not boolean" ); + + if( size( mask ) > 0 ) { + return internal::template fold_from_vector_to_scalar_generic< + descr, true, false + >( beta, x, mask, monoid ); + } else { + return internal::template fold_from_vector_to_scalar_generic< + descr, false, false + >( beta, x, mask, monoid ); + } + } + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename InputType, + typename IOType, + typename Coords + > + RC foldr( + const Vector< InputType, ascend, Coords > &x, + IOType &beta, + const Monoid &monoid = Monoid(), + const typename std::enable_if< + !grb::is_object< InputType >::value && + !grb::is_object< IOType >::value && + grb::is_monoid< Monoid >::value, + void + >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< IOType, InputType >::value ), "grb::foldr", + "called with a scalar IO type that does not match the input vector type" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< InputType, typename Monoid::D1 >::value ), "grb::foldr", + "called with an input vector value type that does not match the first " + "domain of the given monoid" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< InputType, typename Monoid::D2 >::value ), "grb::foldr", + "called with an input vector type that does not match the second domain of " + "the given monoid" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< InputType, typename Monoid::D3 >::value ), "grb::foldr", + "called with an input vector type that does not match the third domain of " + "the given monoid" ); + + Vector< bool, ascend, Coords > empty_mask( 0 ); + return internal::template 
fold_from_vector_to_scalar_generic< + descr, false, false + >( beta, x, empty_mask, monoid ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename IOType, + typename InputType, + typename Coords + > + RC foldr( + const InputType &alpha, + Vector< IOType, ascend, Coords > &y, + const Monoid &monoid = Monoid(), + const Phase &phase = EXECUTE, + const typename std::enable_if< !grb::is_object< InputType >::value && + !grb::is_object< IOType >::value && + grb::is_monoid< Monoid >::value, + void + >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D1, IOType >::value ), "grb::foldl", + "called with a vector x of a type that does not match the first domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D2, InputType >::value ), "grb::foldl", + "called on a vector y of a type that does not match the second domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D3, IOType >::value ), "grb::foldl", + "called on a vector x of a type that does not match the third domain " + "of the given operator" ); + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = + [alpha, &y, &monoid, phase] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage foldl(alpha, y, monoid) in the range(" << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + const Vector< bool, ascend, Coords > * const null_mask = nullptr; + const Coords * const local_null_mask = nullptr; + + Coords local_y; + size_t local_y_nz; + bool sparse = false; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + + bool already_dense_output = true; + + if( !already_dense_vectors ) { + const size_t local_n = upper_bound - lower_bound; +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( &internal::getCoordinates( y ) ); + if( !already_dense_output ) { +#else + already_dense_output = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, upper_bound ); + local_y_nz = local_y.nonzeroes(); + if( local_y_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( sparse ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic< +#else + rc = internal::fold_from_scalar_to_vector_generic< +#endif + descr, false, true, false, true + >( + already_dense_output, true, + lower_bound, upper_bound, local_y, local_null_mask, + y, null_mask, alpha, monoid.getOperator(), phase + ); + } else { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic< +#else + rc = internal::fold_from_scalar_to_vector_generic< +#endif + descr, false, false, false, true + >( + already_dense_output, true, + lower_bound, upper_bound, local_y, local_null_mask, + y, null_mask, alpha, monoid.getOperator(), phase + ); + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#else + if( 
!already_dense_vectors ) { +#endif + internal::getCoordinates( y ).asyncJoinSubset( local_y, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_FOLD_SCALAR_VECTOR_GENERIC, + internal::getCoordinates( y ).size(), + sizeof( IOType ), + dense_descr, true, + &y, nullptr, + &internal::getCoordinates( y ), nullptr, + nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: foldl(alpha, y, monoid)" + << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class OP, + typename IOType, + typename InputType, + typename Coords + > + RC foldr( + const InputType &alpha, + Vector< IOType, ascend, Coords > &y, + const OP &op = OP(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< InputType >::value && + !grb::is_object< IOType >::value && + grb::is_operator< OP >::value, + void + >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D1, IOType >::value ), "grb::foldl", + "called with a vector x of a type that does not match the first domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D2, InputType >::value ), "grb::foldl", + "called on a vector y of a type that does not match the second domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D3, IOType >::value ), "grb::foldl", + "called on a vector x of a type that does not match the third domain " + "of the given operator" ); + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = + [alpha, &y, &op, phase] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + { + std::cout << "\t\tExecution of stage foldl(alpha, y, op) in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; + } +#endif + RC rc = SUCCESS; + + const Vector< bool, ascend, Coords > * const null_mask = nullptr; + const Coords * const local_null_mask = nullptr; + + Coords local_y; + const size_t local_n = upper_bound - lower_bound; + size_t local_y_nz; + bool sparse = false; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + bool already_dense_output = true; + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !already_dense_output ) { +#else + already_dense_output = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); + local_y_nz = local_y.nonzeroes(); + if( local_y_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( sparse ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic< +#else + rc = internal::fold_from_scalar_to_vector_generic< +#endif + descr, false, true, false, false + >( + already_dense_output, true, + lower_bound, upper_bound, + local_y, 
local_null_mask, y, null_mask, + alpha, op, phase + ); + } else { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic< +#else + rc = internal::fold_from_scalar_to_vector_generic< +#endif + descr, false, false, false, false + >( + already_dense_output, true, + lower_bound, upper_bound, local_y, local_null_mask, + y, null_mask, alpha, op, phase + ); + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#else + if( !already_dense_vectors ) { +#endif + internal::getCoordinates( y ).asyncJoinSubset( local_y, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_FOLD_SCALAR_VECTOR_GENERIC, + internal::getCoordinates( y ).size(), + sizeof( IOType ), + dense_descr, true, + &y, nullptr, + &internal::getCoordinates( y ), nullptr, + nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: foldl(alpha, y, op)" + << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class OP, + typename IOType, + typename InputType, + typename Coords + > + RC foldr( + const Vector< InputType, ascend, Coords > &x, + Vector< IOType, ascend, Coords > &y, + const OP &op = OP(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + grb::is_operator< OP >::value && + !grb::is_object< InputType >::value && + !grb::is_object< IOType >::value, + void + >::type * = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D1, InputType >::value ), "grb::eWiseFoldr", + "called with a vector x of a type that does not match the first domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D2, IOType >::value ), "grb::eWiseFoldr", + "called on a vector y of a type that does not match the second domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D3, IOType >::value ), "grb::eWiseFoldr", + "called on a vector y of a type that does not match the third domain " + "of the given operator" ); + + const size_t n = size( x ); + if( n != size( y ) ) { + return MISMATCH; + } + +#ifdef _DEBUG + std::cout << "In foldr ([T]<-[T])\n"; +#endif + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = + [&x, &y, &op, phase] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage foldr(x, y, operator) in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + const Vector< bool, ascend, Coords > * const null_mask = nullptr; + const Coords * const local_null_mask = nullptr; + + Coords local_x, local_y; + const size_t local_n = upper_bound - lower_bound; + size_t local_x_nz, local_y_nz; + bool sparse = false; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + bool already_dense_output = true; + bool already_dense_input = true; + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + 
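// if the pipeline already knows y to be dense, the local subset
+				// extraction below can be skipped
+				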
already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !already_dense_output ) { +#else + already_dense_output = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); + local_y_nz = local_y.nonzeroes(); + if( local_y_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_input ) { +#else + already_dense_input = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); + if( local_x_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( sparse ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic< +#else + rc = internal::fold_from_vector_to_vector_generic< +#endif + descr, false, true, false, false + >( + already_dense_output, already_dense_input, true, + lower_bound, upper_bound, + local_y, local_null_mask, + local_x, y, + null_mask, x, + op, phase + ); + } else { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic< +#else + rc = internal::fold_from_vector_to_vector_generic< +#endif + descr, false, false, false, false + >( + already_dense_output, already_dense_input, true, + lower_bound, upper_bound, + local_y, local_null_mask, + local_x, + y, null_mask, + x, + op, phase + ); + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#else + if( !already_dense_vectors ) { +#endif + internal::getCoordinates( y ).asyncJoinSubset( local_y, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? 
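// keep an earlier error code if any; otherwise append this stage to the pipeline
+			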
ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_FOLD_VECTOR_VECTOR_GENERIC, + n, sizeof( IOType ), + dense_descr, true, + getID( y ), + &y, nullptr, + &internal::getCoordinates( y ), nullptr, + getID( x ), SIZE_MAX, SIZE_MAX, SIZE_MAX, + &x, nullptr, nullptr, nullptr, + &internal::getCoordinates( x ), nullptr, nullptr, nullptr, + SIZE_MAX, nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: foldr(x, y, operator)" + << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class OP, + typename IOType, + typename MaskType, + typename InputType, + typename Coords + > + RC foldr( + const Vector< InputType, ascend, Coords > &x, + const Vector< MaskType, ascend, Coords > &m, + Vector< IOType, ascend, Coords > &y, + const OP &op = OP(), + const Phase &phase = EXECUTE, + const typename std::enable_if< grb::is_operator< OP >::value && + !grb::is_object< InputType >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< IOType >::value, void + >::type * = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D1, InputType >::value ), "grb::eWiseFoldr", + "called with a vector x of a type that does not match the first domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D2, IOType >::value ), "grb::eWiseFoldr", + "called on a vector y of a type that does not match the second domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D3, IOType >::value ), "grb::eWiseFoldr", + "called on a vector y of a type that does not match the third domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), "grb::eWiseFoldr", + "called with a non-Boolean mask" ); + + if( size( m ) == 0 ) { + return foldr< descr >( x, y, op, phase ); + } + + const size_t n = size( x ); + if( n != size( y ) || n != size( m ) ) { + return MISMATCH; + } + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = + [&x, &m, &y, &op, phase] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage foldr(x, m, y, operator) in the range(" << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_x, local_m, local_y; + const size_t local_n = upper_bound - lower_bound; + size_t local_x_nz, local_y_nz; + bool sparse = false; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + constexpr const bool already_dense_vectors = dense_descr; +#endif + bool already_dense_output = true; + bool already_dense_mask = true; + bool already_dense_input = true; + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !already_dense_output ) { +#else + already_dense_output = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); + local_y_nz = local_y.nonzeroes(); + if( local_y_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + 
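// same already-dense check for the mask; note the mask alone never sets
+				// the sparse flag
+				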
already_dense_mask = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( m ) ); + if( !already_dense_mask ) { +#else + already_dense_mask = false; +#endif + local_m = internal::getCoordinates( m ).asyncSubset( lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_input ) { +#else + already_dense_input = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); + if( local_x_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( sparse ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic< +#else + rc = internal::fold_from_vector_to_vector_generic< +#endif + descr, false, true, true, false + >( + already_dense_output, already_dense_input, already_dense_mask, + lower_bound, upper_bound, + local_y, &local_m, local_x, + y, &m, x, + op, phase + ); + } else { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic< +#else + rc = internal::fold_from_vector_to_vector_generic< +#endif + descr, false, false, true, false + >( + already_dense_output, already_dense_input, already_dense_mask, + lower_bound, upper_bound, + local_y, &local_m, local_x, + y, &m, x, + op, phase + ); + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#else + if( !already_dense_vectors ) { +#endif + internal::getCoordinates( y ).asyncJoinSubset( local_y, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_FOLD_MASKED_VECTOR_VECTOR_GENERIC, + n, sizeof( IOType ), + dense_descr, true, + &y, nullptr, &internal::getCoordinates( y ), nullptr, + &x, &m, nullptr, nullptr, + &internal::getCoordinates( x ), &internal::getCoordinates( m ), + nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: foldr(x, m, y, operator)" + << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename IOType, + typename InputType, + typename Coords + > + RC foldr( + const Vector< InputType, ascend, Coords > &x, + Vector< IOType, ascend, Coords > &y, + const Monoid &monoid = Monoid(), + const Phase &phase = EXECUTE, + const typename std::enable_if< grb::is_monoid< Monoid >::value && + !grb::is_object< InputType >::value && + !grb::is_object< IOType >::value, + void + >::type * = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D1, InputType >::value ), "grb::eWiseFoldr", + "called with a vector x of a type that does not match the first domain " + "of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D2, IOType >::value ), "grb::eWiseFoldr", + "called on a vector y of a type that does not match the second domain " + "of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D3, IOType >::value ), "grb::eWiseFoldr", + "called on a vector y of a type that does not match the third domain " + "of the given monoid" ); + + // dynamic sanity checks + const size_t n = size( x ); + if( n != size( y ) ) { + return MISMATCH; + } + + RC ret = SUCCESS; + + 
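// the fold itself executes lazily: the lambda below is registered as a
+		// pipeline stage via internal::le.addStage and only runs when the
+		// pipeline is executed
+		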
constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = + [&x, &y, &monoid, phase] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage foldr(x, y, monoid) in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + const Vector< bool, ascend, Coords > * const null_mask = nullptr; + const Coords * const local_null_mask = nullptr; + + Coords local_x, local_y; + const size_t local_n = upper_bound - lower_bound; + size_t local_x_nz, local_y_nz; + bool sparse = false; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + constexpr const bool already_dense_vectors = dense_descr; +#endif + bool already_dense_output = true; + bool already_dense_input = true; + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !already_dense_output ) { +#else + already_dense_output = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); + local_y_nz = local_y.nonzeroes(); + if( local_y_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_input ) { +#else + already_dense_input = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); + if( local_x_nz < local_n ) { + sparse = true; + } + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + if( sparse ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic< +#else + rc = internal::fold_from_vector_to_vector_generic< +#endif + descr, false, true, false, true + >( + already_dense_output, already_dense_input, true, + lower_bound, upper_bound, + local_y, local_null_mask, local_x, + y, null_mask, x, + monoid.getOperator(), phase + ); + } else { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic< +#else + rc = internal::fold_from_vector_to_vector_generic< +#endif + descr, false, false, false, true + >( + already_dense_output, already_dense_input, true, + lower_bound, upper_bound, + local_y, local_null_mask, local_x, + y, null_mask, x, + monoid.getOperator(), phase + ); + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#else + if( !already_dense_vectors ) { +#endif + internal::getCoordinates( y ).asyncJoinSubset( local_y, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? 
ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_FOLD_VECTOR_VECTOR_GENERIC, + n, sizeof( IOType ), dense_descr, true, + &y, nullptr, &internal::getCoordinates( y ), nullptr, + &x, nullptr, nullptr, nullptr, + &internal::getCoordinates( x ), nullptr, nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: foldr(x, y, monoid)" + << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename IOType, + typename MaskType, + typename InputType, + typename Coords + > + RC foldr( + const Vector< InputType, ascend, Coords > &x, + const Vector< MaskType, ascend, Coords > &m, + Vector< IOType, ascend, Coords > &y, + const Monoid &monoid = Monoid(), + const Phase &phase = EXECUTE, + const typename std::enable_if< grb::is_monoid< Monoid >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< InputType >::value && + !grb::is_object< IOType >::value, void + >::type * = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D1, InputType >::value ), "grb::eWiseFoldr", + "called with a vector x of a type that does not match the first domain " + "of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D2, IOType >::value ), "grb::eWiseFoldr", + "called on a vector y of a type that does not match the second domain " + "of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D3, IOType >::value ), "grb::eWiseFoldr", + "called on a vector y of a type that does not match the third domain " + "of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), "grb::eWiseFoldr", + "called with a mask of non-Boolean type" ); + + // check empty mask + if( size( m ) == 0 ) { + return foldr< descr >( x, y, monoid, phase ); + } + + // dynamic sanity checks + const size_t n = size( x ); + if( n != size( y ) || n != size( m ) ) { + return MISMATCH; + } + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = + [&x, &m, &y, &monoid, phase] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage foldr(x, m, y, monoid) in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_x, local_m, local_y; + const size_t local_n = upper_bound - lower_bound; + size_t local_x_nz, local_y_nz; + bool sparse = false; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + constexpr const bool already_dense_vectors = dense_descr; +#endif + + bool already_dense_output = true; + bool already_dense_mask = true; + bool already_dense_input = true; + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !already_dense_output ) { +#else + already_dense_output = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); + local_y_nz = local_y.nonzeroes(); + if( local_y_nz < local_n ) { + sparse = true; + } +#ifdef 
GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_mask = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( m ) ); + if( !already_dense_mask ) { +#else + already_dense_mask = false; +#endif + local_m = internal::getCoordinates( m ).asyncSubset( lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_input ) { +#else + already_dense_input = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); + if( local_x_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( sparse ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic< +#else + rc = internal::fold_from_vector_to_vector_generic< +#endif + descr, false, true, true, true + >( + already_dense_output, already_dense_input, already_dense_mask, + lower_bound, upper_bound, + local_y, &local_m, local_x, + y, &m, x, + monoid.getOperator(), phase + ); + } else { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic< +#else + rc = internal::fold_from_vector_to_vector_generic< +#endif + descr, false, false, true, true + >( + already_dense_output, already_dense_input, already_dense_mask, + lower_bound, upper_bound, + local_y, &local_m, local_x, + y, &m, x, + monoid.getOperator(), phase + ); + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#else + if( !already_dense_vectors ) { +#endif + internal::getCoordinates( y ).asyncJoinSubset( local_y, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? 
ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_FOLD_MASKED_VECTOR_VECTOR_GENERIC, + n, sizeof( IOType ), dense_descr, true, + &y, nullptr, &internal::getCoordinates( y ), nullptr, + &x, &m, nullptr, nullptr, + &internal::getCoordinates( x ), &internal::getCoordinates( m ), + nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: foldr(x, m, y, monoid)" + << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Op, + typename IOType, + typename InputType, + typename Coords + > + RC foldl( + Vector< IOType, ascend, Coords > &x, + const InputType beta, + const Op &op = Op(), + const Phase &phase = EXECUTE, + const typename std::enable_if< !grb::is_object< IOType >::value && + !grb::is_object< InputType >::value && + grb::is_operator< Op >::value, void + >::type * = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Op::D1, IOType >::value ), + "grb::foldl", + "called with a vector x of a type that does not match the first domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Op::D2, InputType >::value ), + "grb::foldl", + "called on a vector y of a type that does not match the second domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Op::D3, IOType >::value ), + "grb::foldl", + "called on a vector x of a type that does not match the third domain " + "of the given operator" ); + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = + [&x, beta, &op, phase] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage foldl(x, beta, op) in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + const Vector< bool, ascend, Coords > * const null_mask = nullptr; + const Coords * const local_null_mask = nullptr; + + Coords local_x; + const size_t local_n = upper_bound - lower_bound; + size_t local_x_nz = local_n; + bool sparse = false; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + constexpr const bool already_dense_vectors = dense_descr; +#endif + + bool already_dense_output = true; + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_output ) { +#else + already_dense_output = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); + if( local_x_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( sparse ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic< +#else + rc = internal::fold_from_scalar_to_vector_generic< +#endif + descr, true, true, false, false + >( + already_dense_output, true, + lower_bound, upper_bound, + local_x, local_null_mask, + x, null_mask, + beta, + op, phase + ); + } else { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic< +#else + rc = 
internal::fold_from_scalar_to_vector_generic< +#endif + descr, true, false, false, false + >( + already_dense_output, true, + lower_bound, upper_bound, + local_x, local_null_mask, + x, null_mask, beta, + op, phase + ); + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#else + if( !already_dense_vectors ) { +#endif + internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound, upper_bound ); + } + + return rc; + }; + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_FOLD_SCALAR_VECTOR_GENERIC, + internal::getCoordinates( x ).size(), sizeof( IOType ), + dense_descr, true, + &x, nullptr, + &internal::getCoordinates( x ), nullptr, + nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: foldl(x, beta, op)" << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Op, + typename IOType, + typename MaskType, + typename InputType, + typename Coords + > + RC foldl( + Vector< IOType, ascend, Coords > &x, + const Vector< MaskType, ascend, Coords > &m, + const InputType beta, + const Op &op = Op(), + const Phase &phase = EXECUTE, + const typename std::enable_if< !grb::is_object< IOType >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< InputType >::value && + grb::is_operator< Op >::value, void + >::type * = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Op::D1, IOType >::value ), + "grb::foldl", + "called with a vector x of a type that does not match the first domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Op::D2, InputType >::value ), + "grb::foldl", + "called on a vector y of a type that does not match the second domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Op::D3, IOType >::value ), + "grb::foldl", + "called on a vector x of a type that does not match the third domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting ) || + std::is_same< bool, MaskType >::value ), + "grb::foldl (reference, vector <- scalar, masked)", + "provided mask does not have boolean entries" ); + + // check empty mask + if( size( m ) == 0 ) { + return foldl< descr >( x, beta, op, phase ); + } + + // dynamic checks + const size_t n = size( x ); + if( size( m ) != n ) { + return MISMATCH; + } + + // catch trivial phase + if( phase == RESIZE ) { + return SUCCESS; + } + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = + [&x, &m, beta, &op, phase] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage foldl(x, m, beta, op) in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_x, local_mask; + const size_t local_n = upper_bound - lower_bound; + size_t local_x_nz = local_n; + bool sparse = false; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + constexpr const bool already_dense_vectors = dense_descr; +#endif + + bool already_dense_output = true; + bool 
already_dense_mask = true; + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_output ) { +#else + already_dense_output = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); + if( local_x_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_mask = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( m ) ); + if( !already_dense_mask ) { +#else + already_dense_mask = false; +#endif + local_mask = internal::getCoordinates( m ).asyncSubset( lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( sparse ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic< +#else + rc = internal::fold_from_scalar_to_vector_generic< +#endif + descr, true, true, true, false + >( + already_dense_output, already_dense_mask, + lower_bound, upper_bound, + local_x, &local_mask, + x, &m, + beta, + op, phase + ); + } else { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic< +#else + rc = internal::fold_from_scalar_to_vector_generic< +#endif + descr, true, false, true, false + >( + already_dense_output, already_dense_mask, + lower_bound, upper_bound, + local_x, &local_mask, + x, &m, + beta, + op, phase + ); + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#else + if( !already_dense_vectors ) { +#endif + internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_FOLD_MASKED_SCALAR_VECTOR_GENERIC, + n, sizeof( IOType ), + dense_descr, true, + &x, nullptr, &internal::getCoordinates( x ), nullptr, + &m, nullptr, nullptr, nullptr, + &internal::getCoordinates( m ), nullptr, nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: foldl(x, m, beta, op)" + << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename IOType, + typename InputType, + typename Coords + > + RC foldl( + Vector< IOType, ascend, Coords > &x, + const InputType beta, + const Monoid &monoid = Monoid(), + const Phase &phase = EXECUTE, + const typename std::enable_if< !grb::is_object< IOType >::value && + !grb::is_object< InputType >::value && + grb::is_monoid< Monoid >::value, void + >::type * = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D1, IOType >::value ), "grb::foldl", + "called with a vector x of a type that does not match the first domain " + "of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D2, InputType >::value ), "grb::foldl", + "called on a vector y of a type that does not match the second domain " + "of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D3, IOType >::value ), "grb::foldl", + "called on a vector x of a type that does not match the third domain " + "of the given monoid" ); + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = + [&x, 
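// beta is captured by value since execution of this stage is deferred
+			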
beta, &monoid, phase] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage foldl(x, beta, monoid) in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + const Vector< bool, ascend, Coords > * const null_mask = nullptr; + const Coords * const local_null_mask = nullptr; + + Coords local_x; + const size_t local_n = upper_bound - lower_bound; + size_t local_x_nz = local_n; + bool sparse = false; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + + bool already_dense_output = true; + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_output ) { +#else + already_dense_output = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); + if( local_x_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( sparse ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic< +#else + rc = internal::fold_from_scalar_to_vector_generic< +#endif + descr, true, true, false, true + >( + already_dense_output, true, + lower_bound, upper_bound, + local_x, local_null_mask, + x, null_mask, + beta, + monoid.getOperator(), phase + ); + } else { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic< +#else + rc = internal::fold_from_scalar_to_vector_generic< +#endif + descr, true, false, false, true + >( + already_dense_output, true, + lower_bound, upper_bound, + local_x, local_null_mask, + x, null_mask, + beta, + monoid.getOperator(), phase + ); + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#else + if( !already_dense_vectors ) { +#endif + internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? 
ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_FOLD_SCALAR_VECTOR_GENERIC, + internal::getCoordinates( x ).size(), sizeof( IOType ), + dense_descr, true, + &x, nullptr, &internal::getCoordinates( x ), nullptr, + nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: foldl(x, beta, monoid)" + << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename IOType, + typename MaskType, + typename InputType, + typename Coords + > + RC foldl( + Vector< IOType, ascend, Coords > &x, + const Vector< MaskType, ascend, Coords > &m, + const InputType &beta, + const Monoid &monoid = Monoid(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< IOType >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< InputType >::value && + grb::is_monoid< Monoid >::value, + void + >::type * = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D1, IOType >::value ), "grb::foldl", + "called with a vector x of a type that does not match the first domain " + "of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D2, InputType >::value ), "grb::foldl", + "called on a vector y of a type that does not match the second domain " + "of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D3, IOType >::value ), "grb::foldl", + "called on a vector x of a type that does not match the third domain " + "of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), + "grb::foldl (ascend, vector <- scalar, masked, monoid)", + "provided mask does not have boolean entries" ); + + // check for empty mask + if( size( m ) == 0 ) { + return foldl< descr >( x, beta, monoid, phase ); + } + + // dynamic checks + const size_t n = size( x ); + if( n != size( m ) ) { return MISMATCH; } + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = + [&x, &m, beta, &monoid, phase] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage foldl(x, m, beta, monoid) in the " + << "range(" << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_x, local_m; + const size_t local_n = upper_bound - lower_bound; + size_t local_x_nz = local_n; + bool sparse = false; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + + bool already_dense_output = true; + bool already_dense_mask = true; + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_output ) { +#else + already_dense_output = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); + if( local_x_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION 
+ } + + already_dense_mask = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( m ) ); + if( !already_dense_mask ) { +#else + already_dense_mask = false; +#endif + local_m = internal::getCoordinates( m ).asyncSubset( lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( sparse ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic< +#else + rc = internal::fold_from_scalar_to_vector_generic< +#endif + descr, true, true, true, true + >( + already_dense_output, already_dense_mask, + lower_bound, upper_bound, + local_x, &local_m, + x, &m, + beta, + monoid.getOperator(), phase + ); + } else { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic< +#else + rc = internal::fold_from_scalar_to_vector_generic< +#endif + descr, true, false, true, true + >( + already_dense_output, already_dense_mask, + lower_bound, upper_bound, + local_x, &local_m, + x, &m, + beta, + monoid.getOperator(), phase + ); + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#else + if( !already_dense_vectors ) { +#endif + internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_FOLD_MASKED_SCALAR_VECTOR_GENERIC, + internal::getCoordinates( x ).size(), sizeof( IOType ), + dense_descr, true, + &x, nullptr, &internal::getCoordinates( x ), nullptr, + &m, nullptr, nullptr, nullptr, + &internal::getCoordinates( m ), nullptr, nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: foldl(x, m, beta, monoid)" + << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class OP, + typename IOType, + typename InputType, + typename Coords + > + RC foldl( + Vector< IOType, ascend, Coords > &x, + const Vector< InputType, ascend, Coords > &y, + const OP &op = OP(), + const Phase &phase = EXECUTE, + const typename std::enable_if< grb::is_operator< OP >::value && + !grb::is_object< IOType >::value && + !grb::is_object< InputType >::value, void + >::type * = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D1, IOType >::value ), "grb::foldl", + "called with a vector x of a type that does not match the first domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D2, InputType >::value ), "grb::foldl", + "called on a vector y of a type that does not match the second domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( (!( descr & descriptors::no_casting) || + std::is_same< typename OP::D3, IOType >::value ), "grb::foldl", + "called on a vector x of a type that does not match the third domain " + "of the given operator" ); + + // dynamic sanity checks + const size_t n = size( x ); + if( n != size( y ) ) { + return MISMATCH; + } + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = + [&x, &y, &op, phase] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage foldl(x, y, operator) in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + 
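// unmasked variant: null (local) mask pointers select the mask-free code
+			// path in the generic fold implementation
+			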
const Vector< bool, ascend, Coords > * const null_mask = nullptr; + const Coords * const local_null_mask = nullptr; + + Coords local_x, local_y; + const size_t local_n = upper_bound - lower_bound; + size_t local_x_nz, local_y_nz; + bool sparse = false; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + + bool already_dense_output = true; + bool already_dense_input = true; + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_output ) { +#else + already_dense_output = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); + if( local_x_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !already_dense_input ) { +#else + already_dense_input = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); + local_y_nz = local_y.nonzeroes(); + if( local_y_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( sparse ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic< +#else + rc = internal::fold_from_vector_to_vector_generic< +#endif + descr, true, true, false, false + >( + already_dense_output, already_dense_input, true, + lower_bound, upper_bound, + local_x, local_null_mask, local_y, + x, null_mask, y, + op, phase + ); + } else { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic< +#else + rc = internal::fold_from_vector_to_vector_generic< +#endif + descr, true, false, false, false + >( + already_dense_output, already_dense_input, true, + lower_bound, upper_bound, + local_x, local_null_mask, local_y, + x, null_mask, y, + op, phase + ); + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#else + if( !already_dense_vectors ) { +#endif + internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? 
ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_FOLD_VECTOR_VECTOR_GENERIC, + n, sizeof( IOType ), dense_descr, true, + getID( x ), + &x, nullptr, &internal::getCoordinates( x ), nullptr, + getID( y ), SIZE_MAX, SIZE_MAX, SIZE_MAX, + &y, nullptr, nullptr, nullptr, + &internal::getCoordinates( y ), nullptr, nullptr, nullptr, + SIZE_MAX, nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: foldl(x, y, operator)" + << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename IOType, + typename InputType, + typename Coords + > + RC foldl( + Vector< IOType, ascend, Coords > &x, + const Vector< InputType, ascend, Coords > &y, + const Monoid &monoid = Monoid(), + const Phase &phase = EXECUTE, + const typename std::enable_if< grb::is_monoid< Monoid >::value && + !grb::is_object< IOType >::value && + !grb::is_object< InputType >::value, void + >::type * = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D1, IOType >::value ), "grb::foldl", + "called with a vector x of a type that does not match the first domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D2, InputType >::value ), "grb::foldl", + "called on a vector y of a type that does not match the second domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D3, IOType >::value ), "grb::foldl", + "called on a vector x of a type that does not match the third domain " + "of the given operator" ); + + // dynamic sanity checks + const size_t n = size( x ); + if( n != size( y ) ) { + return MISMATCH; + } + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = + [&x, &y, &monoid, phase] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage foldl(x, y, monoid) in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + const Vector< bool, ascend, Coords > * const null_mask = nullptr; + const Coords * const local_null_mask = nullptr; + + Coords local_x, local_y; + const size_t local_n = upper_bound - lower_bound; + size_t local_x_nz, local_y_nz; + bool sparse = false; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + + bool already_dense_output = true; + bool already_dense_input = true; + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_output ) { +#else + already_dense_output = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); + if( local_x_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !already_dense_input ) { +#else + already_dense_input = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( 
lower_bound, + upper_bound ); + local_y_nz = local_y.nonzeroes(); + if( local_y_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( sparse ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic< +#else + rc = internal::fold_from_vector_to_vector_generic< +#endif + descr, true, true, false, true + >( + already_dense_output, already_dense_input, true, + lower_bound, upper_bound, + local_x, local_null_mask, local_y, + x, null_mask, y, + monoid.getOperator(), phase + ); + } else { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic< +#else + rc = internal::fold_from_vector_to_vector_generic< +#endif + descr, true, false, false, true + >( + already_dense_output, already_dense_input, true, + lower_bound, upper_bound, + local_x, local_null_mask, local_y, + x, null_mask, y, + monoid.getOperator(), phase + ); + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#else + if( !already_dense_vectors ) { +#endif + internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_FOLD_VECTOR_VECTOR_GENERIC, + n, sizeof( IOType ), dense_descr, true, + &x, nullptr, &internal::getCoordinates( x ), nullptr, + &y, nullptr, nullptr, nullptr, + &internal::getCoordinates( y ), nullptr, nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: foldl(x, y, monoid)" + << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class OP, + typename IOType, + typename MaskType, + typename InputType, + typename Coords + > + RC foldl( + Vector< IOType, ascend, Coords > &x, + const Vector< MaskType, ascend, Coords > &m, + const Vector< InputType, ascend, Coords > &y, + const OP &op = OP(), + const Phase &phase = EXECUTE, + const typename std::enable_if< grb::is_operator< OP >::value && + !grb::is_object< IOType >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< InputType >::value, void + >::type * = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D1, IOType >::value ), "grb::foldl", + "called with a vector x of a type that does not match the first domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D2, InputType >::value ), "grb::foldl", + "called on a vector y of a type that does not match the second domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D3, IOType >::value ), "grb::foldl", + "called on a vector x of a type that does not match the third domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), "grb::foldl", + "called with a mask that does not have boolean entries " ); + + // catch empty mask + if( size( m ) == 0 ) { + return foldl< descr >( x, y, op, phase ); + } + + // dynamic sanity checks + const size_t n = size( x ); + if( n != size( y ) || n != size( m ) ) { + return MISMATCH; + } + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = + [&x, &m, &y, &op, phase] ( + internal::Pipeline &pipeline, + 
const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage foldl(x, m, y, op) in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_x, local_y, local_m; + const size_t local_n = upper_bound - lower_bound; + size_t local_x_nz = local_n; + size_t local_y_nz = local_n; + bool sparse = false; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + bool already_dense_output = true; + bool already_dense_input = true; + bool already_dense_mask = true; + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_output ) { +#else + already_dense_output = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); + if( local_x_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_mask = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( m ) ); + if( !already_dense_mask ) { +#else + already_dense_mask = false; +#endif + local_m = internal::getCoordinates( m ).asyncSubset( lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !already_dense_input ) { +#else + already_dense_input = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); + local_y_nz = local_y.nonzeroes(); + if( local_y_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( sparse ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic< +#else + rc = internal::fold_from_vector_to_vector_generic< +#endif + descr, true, true, true, false + >( + already_dense_output, already_dense_input, already_dense_mask, + lower_bound, upper_bound, + local_x, &local_m, local_y, + x, &m, y, + op, phase + ); + } else { + assert( local_x_nz == local_n ); + assert( local_y_nz == local_n ); +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic< +#else + rc = internal::fold_from_vector_to_vector_generic< +#endif + descr, true, false, true, false + >( + already_dense_output, already_dense_input, already_dense_mask, + lower_bound, upper_bound, + local_x, &local_m, local_y, + x, &m, y, + op, phase + ); + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#else + if( !already_dense_vectors ) { +#endif + internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? 
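+ // As above, when GRB_BOOLEAN_DISPATCHER is defined the run-time
+ // already_dense_* flags are routed through the boolean_dispatcher_* wrappers
+ // and become compile-time template parameters of the generic kernels, so
+ // each hot loop is instantiated once per flag combination; without the
+ // macro the same kernels receive the flags as ordinary run-time booleans.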
ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_FOLD_MASKED_VECTOR_VECTOR_GENERIC, + n, sizeof( IOType ), dense_descr, true, + &x, nullptr, &internal::getCoordinates( x ), nullptr, + &y, &m, nullptr, nullptr, + &internal::getCoordinates( y ), &internal::getCoordinates( m ), nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: foldl(x, m, y, op)" << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename IOType, + typename MaskType, + typename InputType, + typename Coords + > + RC foldl( + Vector< IOType, ascend, Coords > &x, + const Vector< MaskType, ascend, Coords > &m, + const Vector< InputType, ascend, Coords > &y, + const Monoid &monoid = Monoid(), + const Phase &phase = EXECUTE, + const typename std::enable_if< grb::is_monoid< Monoid >::value && + !grb::is_object< IOType >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< InputType >::value, void + >::type * = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D1, IOType >::value ), "grb::foldl", + "called with a vector x of a type that does not match the first domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D2, InputType >::value ), "grb::foldl", + "called on a vector y of a type that does not match the second domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D3, IOType >::value ), "grb::foldl", + "called on a vector x of a type that does not match the third domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), "grb::foldl", + "called with a mask that does not have boolean entries" ); + + // catch empty mask + if( size( m ) == 0 ) { + return foldl< descr >( x, y, monoid, phase ); + } + + // dynamic sanity checks + const size_t n = size( x ); + if( n != size( y ) || n != size( m ) ) { + return MISMATCH; + } + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = + [&x, &m, &y, &monoid, phase] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage foldl(x, m, y, monoid) in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_x, local_y, local_m; + const size_t local_n = upper_bound - lower_bound; + size_t local_x_nz = local_n; + size_t local_y_nz = local_n; + bool sparse = false; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + bool already_dense_output = true; + bool already_dense_input = true; + bool already_dense_mask = true; + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_output ) { +#else + already_dense_output = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); + if( local_x_nz 
< local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_mask = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( m ) ); + if( !already_dense_mask ) { +#else + already_dense_mask = false; +#endif + local_m = internal::getCoordinates( m ).asyncSubset( lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !already_dense_input ) { +#else + already_dense_input = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); + local_y_nz = local_y.nonzeroes(); + if( local_y_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( sparse ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic< +#else + rc = internal::fold_from_vector_to_vector_generic< +#endif + descr, true, true, true, true + >( + already_dense_output, already_dense_input, already_dense_mask, + lower_bound, upper_bound, + local_x, &local_m, local_y, + x, &m, y, + monoid.getOperator(), phase + ); + } else { + assert( local_x_nz == local_n ); + assert( local_y_nz == local_n ); + +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic< +#else + rc = internal::fold_from_vector_to_vector_generic< +#endif + descr, true, false, true, true + >( + already_dense_output, already_dense_input, already_dense_mask, + lower_bound, upper_bound, + local_x, &local_m, local_y, + x, &m, y, + monoid.getOperator(), phase + ); + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#else + if( !already_dense_vectors ) { +#endif + internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_FOLD_MASKED_VECTOR_VECTOR_GENERIC, + n, sizeof( IOType ), + dense_descr, true, + &x, nullptr, &internal::getCoordinates( x ), nullptr, + &y, &m, nullptr, nullptr, + &internal::getCoordinates( y ), &internal::getCoordinates( m ), + nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: foldl(x, m, y, monoid)" + << std::endl; +#endif + return ret; + } + + namespace internal { + + template< + bool left_scalar, + bool right_scalar, + bool left_sparse, + bool right_sparse, + Descriptor descr, class OP, +#ifdef GRB_BOOLEAN_DISPATCHER + bool already_dense_input_x, + bool already_dense_input_y, +#endif + typename OutputType, + typename InputType1, + typename InputType2, + typename Coords + > + RC dense_apply_generic( +#ifndef GRB_BOOLEAN_DISPATCHER + bool already_dense_input_x, + bool already_dense_input_y, +#endif + const size_t lower_bound, + const size_t upper_bound, + const Coords &local_x, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &z_vector, + const internal::Wrapper< left_scalar, InputType1, Coords > x_wrapper, + const internal::Wrapper< right_scalar, InputType2, Coords > y_wrapper, + const OP &op + ) { +#ifdef _DEBUG + std::cout << "\t internal::dense_apply_generic called\n"; +#endif + static_assert( !(left_scalar && left_sparse), + "The left-hand side must be scalar OR sparse, but cannot be both!" ); + static_assert( !(right_scalar && right_sparse), + "The right-hand side must be scalar OR sparse, but cannot be both!" 
); + static_assert( !(left_sparse && right_sparse), + "If both left- and right-hand sides are sparse, use sparse_apply_generic " + "instead." ); + + // create local copies of the input const pointers + OutputType * __restrict__ const z_p = internal::getRaw( z_vector ); + const InputType1 * __restrict__ x_p = x_wrapper.getRaw(); + const InputType2 * __restrict__ y_p = y_wrapper.getRaw(); + + const size_t local_n = upper_bound - lower_bound; + + constexpr const size_t block_size = OP::blocksize; + const size_t num_blocks = local_n / block_size; + +#ifndef NDEBUG + const bool has_coda = local_n % block_size > 0; +#endif + size_t i = 0 + lower_bound; + const size_t start = 0; + const size_t end = num_blocks; + + // declare and initialise local buffers for SIMD + OutputType z_b[ block_size ]; + InputType1 x_b[ block_size ]; + InputType2 y_b[ block_size ]; + bool x_m[ block_size ]; + bool y_m[ block_size ]; + for( size_t k = 0; k < block_size; ++k ) { + if( left_scalar ) { + x_b[ k ] = x_wrapper.getValue(); + } + if( right_scalar ) { + y_b[ k ] = y_wrapper.getValue(); + } + } + + for( size_t block = start; block < end; ++block ) { + size_t local_i = i; + for( size_t k = 0; k < block_size; ++k ) { + if( !left_scalar ) { + x_b[ k ] = x_p[ local_i ]; + } + if( !right_scalar ) { + y_b[ k ] = y_p[ local_i ]; + } + if( left_sparse ) { + x_m[ k ] = already_dense_input_x || local_x.assigned( local_i - + lower_bound ); + } + if( right_sparse ) { + y_m[ k ] = already_dense_input_y || local_y.assigned( local_i - + lower_bound ); + } + (void) ++local_i; + } + for( size_t k = 0; k < block_size; ++k ) { + RC rc = SUCCESS; + if( left_sparse && !x_m[ k ] ) { + z_b[ k ] = y_b[ k ]; // WARNING: assumes monoid semantics! + } else if( right_sparse && !y_m[ k ] ) { + z_b[ k ] = x_b[ k ]; // WARNING: assumes monoid semantics! 
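+ // When exactly one side is sparse and its entry is missing at this index,
+ // the other operand's value is passed through unchanged. This is only
+ // correct under monoid semantics, where a missing entry acts as the
+ // identity of the operator, hence the warnings above.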
+ } else { + rc = apply( z_b[ k ], x_b[ k ], y_b[ k ], op ); + } + assert( rc == SUCCESS ); +#ifdef NDEBUG + (void) rc; +#endif + } + for( size_t k = 0; k < block_size; ++k, ++i ) { + z_p[ i ] = z_b[ k ]; + } + } + +#ifndef NDEBUG + if( has_coda ) { + assert( i < local_n + lower_bound ); + } else { + assert( i == local_n + lower_bound ); + } +#endif + + i = end * block_size + lower_bound; + for( ; i < local_n + lower_bound; ++i ) { + RC rc = SUCCESS; + if( left_scalar && right_scalar ) { + rc = apply( z_p[ i ], x_wrapper.getValue(), y_wrapper.getValue(), op ); + } else if( left_scalar && !right_scalar ) { + if( right_sparse && !( already_dense_input_y || local_y.assigned( i - + lower_bound ) ) + ) { + z_p[ i ] = x_wrapper.getValue(); + } else { + rc = apply( z_p[ i ], x_wrapper.getValue(), y_p[ i ], op ); + } + } else if( !left_scalar && right_scalar ) { + if( left_sparse && !( already_dense_input_x || local_x.assigned( i - + lower_bound ) ) + ) { + z_p[ i ] = y_wrapper.getValue(); + } else { + rc = apply( z_p[ i ], x_p[ i ], y_wrapper.getValue(), op ); + } + } else { + assert( !left_scalar && !right_scalar ); + if( left_sparse && !(already_dense_input_x || local_x.assigned( i - + lower_bound ) ) + ) { + z_p[ i ] = y_p[ i ]; + } else if( right_sparse && !(already_dense_input_y || local_y.assigned( i - + lower_bound ) ) + ) { + z_p[ i ] = x_p[ i ]; + } else { + assert( !left_sparse && !right_sparse ); + rc = apply( z_p[ i ], x_p[ i ], y_p[ i ], op ); + } + } + assert( rc == SUCCESS ); +#ifdef NDEBUG + (void) rc; +#endif + } + + assert( false ); + return UNSUPPORTED; + } + + template< + bool masked, + bool monoid, + bool x_scalar, + bool y_scalar, + Descriptor descr, + class OP, +#ifdef GRB_BOOLEAN_DISPATCHER + bool already_dense_mask, + bool already_dense_input_x, + bool already_dense_input_y, +#endif + typename OutputType, + typename MaskType, + typename InputType1, + typename InputType2, + typename Coords + > + RC sparse_apply_generic( +#ifndef GRB_BOOLEAN_DISPATCHER + bool already_dense_mask, + bool already_dense_input_x, + bool already_dense_input_y, +#endif + const size_t lower_bound, + const size_t upper_bound, + Coords &local_z, + const Coords * const local_mask_ptr, + const Coords &local_x, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &z_vector, + const Vector< MaskType, ascend, Coords > * const mask_vector, + const internal::Wrapper< x_scalar, InputType1, Coords > x_wrapper, + const internal::Wrapper< y_scalar, InputType2, Coords > y_wrapper, + const OP &op + ) { +#ifndef GRB_NO_NOOP_CHECKS + static_assert( !internal::maybe_noop< OP >::value, "Warning: you may be " + "generating an output vector with uninitialised values. Define " + "the GRB_NO_NOOP_CHECKS macro to disable this check.\n" ); +#endif + // create local copies of the input const pointers + OutputType * __restrict__ const z_p = internal::getRaw( z_vector ); + const MaskType * __restrict__ const mask_p = ( masked ) + ? internal::getRaw( *mask_vector ) + : nullptr; + const InputType1 * __restrict__ x_p = x_wrapper.getRaw(); + const InputType2 * __restrict__ y_p = y_wrapper.getRaw(); + + Coords local_mask; + if( masked ) { + local_mask = *local_mask_ptr; + } + + const size_t local_n = upper_bound - lower_bound; + const size_t local_x_nz = already_dense_input_x + ? local_n + : local_x.nonzeroes(); + const size_t local_y_nz = already_dense_input_y + ? 
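+ // For an input that is known to be dense across the whole pipeline
+ // (already_dense_input_*), no tile-local coordinate subset is materialised
+ // and its local nonzero count is simply the tile length local_n.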
local_n + : local_y.nonzeroes(); + + // assertions + assert( !masked || local_mask_ptr != nullptr ); + assert( !masked || local_mask_ptr->size() == local_n ); + assert( x_scalar || local_x_nz <= local_n ); + assert( y_scalar || local_y_nz <= local_n ); + +#ifdef _DEBUG + std::cout << "\tinternal::sparse_apply_generic called\n"; +#endif + constexpr const size_t block_size = OP::blocksize; + + // swap so that we do the expensive pass over the container with the fewest + // nonzeroes first + assert( !x_scalar || !y_scalar ); + const bool swap = ( ( x_scalar || already_dense_input_x ) + ? local_n + : local_x_nz + ) > ( ( y_scalar || already_dense_input_y ) + ? local_n + : local_y_nz + ); + const Coordinates< nonblocking > &loop_coors = swap ? local_y : local_x; + const Coordinates< nonblocking > &chk_coors = swap ? local_x : local_y; + const bool already_dense_loop = swap + ? already_dense_input_y + : already_dense_input_x; + const bool already_dense_chk = swap + ? already_dense_input_x + : already_dense_input_y; + + const size_t loop_coors_nz = swap ? local_y_nz : local_x_nz; + const size_t chk_coors_nz = swap ? local_x_nz : local_y_nz; +#ifdef _DEBUG + std::cout << "\t\tfirst-phase loop of size " << loop_coors.size() << "\n"; + if( x_scalar || y_scalar ) { + std::cout << "\t\tthere will be no second phase because one of the inputs " + << "is scalar\n"; + } else { + std::cout << "\t\tsecond-phase loop of size " << chk_coors.size() << "\n"; + } +#endif + // declare buffers for vectorisation + size_t offsets[ block_size ]; + OutputType z_b[ block_size ]; + InputType1 x_b[ block_size ]; + InputType2 y_b[ block_size ]; + bool mask[ block_size ]; + bool x_m[ block_size ]; + bool y_m[ block_size ]; + + if( x_scalar ) { + for( size_t k = 0; k < block_size; ++k ) { + x_b[ k ] = x_wrapper.getValue(); + } + } + if( y_scalar ) { + for( size_t k = 0; k < block_size; ++k ) { + y_b[ k ] = y_wrapper.getValue(); + } + } + + // expensive pass #1 + size_t start = 0; + size_t end = loop_coors_nz / block_size; + size_t k = 0; + for( size_t b = start; b < end; ++b ) { + // perform gathers + for( size_t i = 0; i < block_size; ++i ) { + const size_t index = ( already_dense_loop ) + ? ( ( k++ ) + lower_bound ) + : ( loop_coors.index( k++ ) + lower_bound ); + offsets[ i ] = index; + assert( index < local_n + lower_bound ); + if( masked ) { + if( already_dense_mask ) { + mask[ i ] = internal::getCoordinates( *mask_vector ).template + mask< descr >( index, mask_p ); + } else { + mask[ i ] = local_mask.template mask< descr >( index - lower_bound, + mask_p + lower_bound ); + } + } + } + // perform gathers + for( size_t i = 0; i < block_size; ++i ) { + if( !masked || mask[ i ] ) { + if( !x_scalar ) { + x_b[ i ] = x_p[ offsets[ i ] ]; + } + if( !x_scalar && !y_scalar ) { + y_m[ i ] = already_dense_chk || chk_coors.assigned( offsets[ i ] - + lower_bound ); + } else { + y_m[ i ] = true; + } + if( !y_scalar ) { + y_b[ i ] = y_p[ offsets[ i ] ]; + } + } else { + y_m[ i ] = false; + } + } + // perform compute + for( size_t i = 0; i < block_size; ++i ) { + RC rc = SUCCESS; + if( y_m[ i ] ) { + rc = apply( z_b[ i ], x_b[ i ], y_b[ i ], op ); + } else if( monoid ) { + if( swap ) { + z_b[ i ] = static_cast< typename OP::D3 >( x_b[ i ] ); + } else { + z_b[ i ] = static_cast< typename OP::D3 >( y_b[ i ] ); + } + } + assert( rc == SUCCESS ); +#ifdef NDEBUG + (void) rc; +#endif + } + // part that may or may not be vectorised (can we do something about this??) 
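+ // Pass #1 runs over the operand with the fewest nonzeroes in blocks of
+ // OP::blocksize: gather the indices and mask bits, gather the operand
+ // values, apply the operator (or fall back to the other operand under
+ // monoid semantics), then record the touched coordinates and scatter the
+ // results. A scalar coda handles the remaining elements, and, for monoids
+ // with two vector inputs, pass #2 further below completes the output from
+ // the nonzeroes that only the other operand has.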
+ for( size_t i = 0; i < block_size; ++i ) { + if( !masked || mask[ i ] ) { + if( y_m[ i ] || monoid ) { + (void) local_z.assign( offsets[ i ] - lower_bound ); + } + } + } + // perform scatter + for( size_t i = 0; i < block_size; ++i ) { + if( !masked || mask[ i ] ) { + if( monoid || y_m[ i ] ) { + GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // the only way the below could write + // an uninitialised value is if the + // static_assert at the top of this + z_p[ offsets[ i ] ] = z_b[ i ]; // function had triggered. See also + GRB_UTIL_RESTORE_WARNINGS // internal issue #321. + } + } + } + } + + for( ; k < loop_coors_nz; ++k ) { + const size_t index = ( already_dense_loop ) + ? k + lower_bound + : loop_coors.index( k ) + lower_bound; + if( masked ) { + if( already_dense_mask ) { + if( !internal::getCoordinates( *mask_vector ).template mask< descr >( + index, mask_p ) + ) { + continue; + } + } else { + if( !local_mask.template mask< descr >( index - lower_bound, mask_p + + lower_bound ) + ) { + continue; + } + } + } + RC rc = SUCCESS; + (void) local_z.assign( index - lower_bound ); + if( x_scalar || y_scalar || already_dense_chk || chk_coors.assigned( + index - lower_bound ) + ) { + rc = apply( + z_p[ index ], + ( x_scalar ) + ? x_wrapper.getValue() + : x_p[ index ], + ( y_scalar ) + ? y_wrapper.getValue() + : y_p[ index ], + op + ); + } else if( monoid ) { + if( swap ) { + z_p[ index ] = x_scalar ? + static_cast< typename OP::D3 >( x_wrapper.getValue() ) : + static_cast< typename OP::D3 >( x_p[ index ] ); + } else { + z_p[ index ] = y_scalar ? + static_cast< typename OP::D3 >( y_wrapper.getValue() ) : + static_cast< typename OP::D3 >( y_p[ index ] ); + } + } + assert( rc == SUCCESS ); +#ifdef NDEBUG + (void) rc; +#endif + } + + // cheaper pass #2, only required if we are using monoid semantics + // AND if both inputs are vectors + if( monoid && !x_scalar && !y_scalar ) { + start = 0; + end = chk_coors_nz / block_size; + k = 0; + for( size_t b = start; b < end; ++b ) { + // streaming load + for( size_t i = 0; i < block_size; i++ ) { + offsets[ i ] = ( already_dense_chk ) + ? ( ( k++ ) + lower_bound ) + : ( chk_coors.index( k++ ) + lower_bound ); + assert( offsets[ i ] < local_n + lower_bound ); + } + // pure gather + for( size_t i = 0; i < block_size; i++ ) { + x_m[ i ] = already_dense_loop || loop_coors.assigned( offsets[ i ] - + lower_bound ); + } + // gather-like + for( size_t i = 0; i < block_size; i++ ) { + if( masked ) { + if( already_dense_mask ) { + mask[ i ] = utils::interpretMask< descr >( + internal::getCoordinates( *mask_vector ).assigned( offsets[ i ] ), + mask_p, offsets[ i ] + ); + } else { + mask[ i ] = utils::interpretMask< descr >( + local_mask.assigned( offsets[ i ] - lower_bound ), + mask_p, offsets[ i ] + ); + } + } + } + // SIMD + for( size_t i = 0; i < block_size; i++ ) { + x_m[ i ] = ! 
x_m[ i ]; + } + // SIMD + for( size_t i = 0; i < block_size; i++ ) { + if( masked ) { + mask[ i ] = mask[ i ] && x_m[ i ]; + } + } + if( !swap ) { + // gather + for( size_t i = 0; i < block_size; ++i ) { + if( masked ) { + if( mask[ i ] ) { + y_b[ i ] = y_p[ offsets[ i ] ]; + } + } else { + if( x_m[ i ] ) { + y_b[ i ] = y_p[ offsets[ i ] ]; + } + } + } + // SIMD + for( size_t i = 0; i < block_size; i++ ) { + if( masked ) { + if( mask[ i ] ) { + z_b[ i ] = y_b[ i ]; + } + } else { + if( x_m[ i ] ) { + z_b[ i ] = y_b[ i ]; + } + } + } + } else { + // gather + for( size_t i = 0; i < block_size; ++i ) { + if( masked ) { + if( mask[ i ] ) { + x_b[ i ] = x_p[ offsets[ i ] ]; + } + } else { + if( x_m[ i ] ) { + x_b[ i ] = x_p[ offsets[ i ] ]; + } + } + } + // SIMD + for( size_t i = 0; i < block_size; i++ ) { + if( masked ) { + if( mask[ i ] ) { + z_b[ i ] = static_cast< typename OP::D3 >( x_b[ i ] ); + } + } else { + if( x_m[ i ] ) { + z_b[ i ] = static_cast< typename OP::D3 >( x_b[ i ] ); + } + } + } + } + // SIMD-like + for( size_t i = 0; i < block_size; i++ ) { + if( masked ) { + if( mask[ i ] ) { + (void)local_z.assign( offsets[ i ] - lower_bound ); + } + } else { + if( x_m[ i ] ) { + (void)local_z.assign( offsets[ i ] - lower_bound ); + } + } + } + // scatter + for( size_t i = 0; i < block_size; i++ ) { + if( masked ) { + if( mask[ i ] ) { + GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED + + z_p[ offsets[ i ] ] = z_b[ i ]; + + GRB_UTIL_RESTORE_WARNINGS + } + } else { + if( x_m[ i ] ) { +#ifdef _DEBUG + std::cout << "\t\t writing out " << z_b[ i ] << " to index " + << offsets[ i ] << "\n"; +#endif + GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // the only way the below could write + // an uninitialised value is if the + // static_assert at the top of this + z_p[ offsets[ i ] ] = z_b[ i ]; // function had triggered. See also + GRB_UTIL_RESTORE_WARNINGS // internal issue #321. + } + } + } + } + for( ; k < chk_coors_nz; ++k ) { + const size_t index = ( ( already_dense_chk ) + ? k + : chk_coors.index( k ) ) + lower_bound; + assert( index < local_n + lower_bound ); + if( already_dense_loop || loop_coors.assigned( index - lower_bound) ) { + continue; + } + if( masked ) { + if( already_dense_mask ) { + if( !internal::getCoordinates( *mask_vector ).template mask< descr >( + index, mask_p ) + ) { + continue; + } + } else { + if( !local_mask.template mask< descr >( index - lower_bound , mask_p + + lower_bound ) + ) { + continue; + } + } + } + (void) local_z.assign( index - lower_bound ); + z_p[ index ] = swap ? 
x_p[ index ] : y_p[ index ]; + } + } + + return SUCCESS; + } + + template< + bool left_scalar, + bool right_scalar, + bool left_sparse, + bool right_sparse, + Descriptor descr, + class OP, +#ifdef GRB_BOOLEAN_DISPATCHER + bool already_dense_mask, + bool already_dense_input_x, + bool already_dense_input_y, +#endif + typename OutputType, typename MaskType, + typename InputType1, typename InputType2, + typename Coords + > + RC masked_apply_generic( +#ifndef GRB_BOOLEAN_DISPATCHER + bool already_dense_mask, + bool already_dense_input_x, + bool already_dense_input_y, +#endif + const size_t lower_bound, + const size_t upper_bound, + Coords &local_z, + const Coords &local_mask, + const Coords &local_x, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &z_vector, + const Vector< MaskType, ascend, Coords > &mask_vector, + const internal::Wrapper< left_scalar, InputType1, Coords > x_wrapper, + const internal::Wrapper< right_scalar, InputType2, Coords > y_wrapper, + const OP &op, +#ifdef GRB_BOOLEAN_DISPATCHER + const InputType1 * const left_identity, + const InputType2 * const right_identity +#else + const InputType1 * const left_identity = nullptr, + const InputType2 * const right_identity = nullptr +#endif + ) { +#ifdef _DEBUG + std::cout << "In masked_apply_generic< " << left_scalar << ", " + << right_scalar << ", " << left_sparse << ", " << right_sparse << ", " + << descr << " > with lower_bound = " << lower_bound << " and upper_bound = " + << upper_bound << "\n"; +#endif + // assertions + static_assert( !(left_scalar && left_sparse), + "left_scalar and left_sparse cannot both be set!" + ); + static_assert( !(right_scalar && right_sparse), + "right_scalar and right_sparse cannot both be set!" + ); + assert( !left_sparse || left_identity != nullptr ); + assert( !right_sparse || right_identity != nullptr ); + + // create local copies of the input const pointers + OutputType * __restrict__ const z_p = internal::getRaw( z_vector ); + const MaskType * __restrict__ const mask_p = internal::getRaw( mask_vector ); + const InputType1 * __restrict__ x_p = x_wrapper.getRaw(); + const InputType2 * __restrict__ y_p = y_wrapper.getRaw(); + + const size_t local_n = upper_bound - lower_bound; + const size_t local_mask_nz = ( already_dense_mask ) + ? local_n + : local_mask.nonzeroes(); +#ifdef _DEBUG + std::cout << "\tinternal::masked_apply_generic called with nnz(mask)=" + << local_mask_nz << " and descriptor " << descr << "\n"; + if( local_mask_nz > 0 ) { + std::cout << "\t\tNonzero mask indices: " + << ( already_dense_mask ? 0 : local_mask.index( 0 ) ); + assert( local_mask.assigned( local_mask.index( 0 ) ) ); + for( size_t k = 1; k < local_mask_nz; ++k ) { + std::cout << ", " + << ( ( already_dense_mask ) ? k : local_mask.index( k ) ); + assert( + already_dense_mask || + local_mask.assigned( local_mask.index( k ) ) + ); + } + std::cout << "\n"; + } + + size_t unset = 0; + for( size_t i = 0; i < local_n; ++i ) { + if( !( already_dense_mask || local_mask.assigned( i ) ) ) { + (void) ++unset; + } + } + assert( unset == local_n - local_mask_nz ); +#endif + // whether to use a Theta(n) or a Theta(nnz(mask)) loop + const bool bigLoop = local_mask_nz == local_n || + (descr & descriptors::invert_mask); + + // get block size + constexpr size_t size_t_block_size = config::SIMD_SIZE::value() / + sizeof( size_t ); + constexpr size_t op_block_size = OP::blocksize; + constexpr size_t min_block_size = op_block_size > size_t_block_size + ? 
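+ // min_block_size is the smaller of the operator's blocksize and the number
+ // of size_t indices that fit in one SIMD register; the Theta(nnz(mask))
+ // small-loop variant below blocks on it, while the Theta(n) big-loop
+ // variant, taken when the mask is completely filled or when invert_mask is
+ // set, blocks on the operator blocksize directly.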
size_t_block_size + : op_block_size; + + if( bigLoop ) { +#ifdef _DEBUG + std::cerr << "\t in bigLoop variant\n"; +#endif + size_t i = 0 + lower_bound; + + constexpr const size_t block_size = op_block_size; + const size_t num_blocks = local_n / block_size; + const size_t start = 0; + const size_t end = num_blocks; + + // declare buffers that fit in a single SIMD register and initialise if + // needed + bool mask_b[ block_size ]; + OutputType z_b[ block_size ]; + InputType1 x_b[ block_size ]; + InputType2 y_b[ block_size ]; + for( size_t k = 0; k < block_size; ++k ) { + if( left_scalar ) { + x_b[ k ] = x_wrapper.getValue(); + } + if( right_scalar ) { + y_b[ k ] = y_wrapper.getValue(); + } + } + for( size_t b = start; b < end; ++b ) { + for( size_t k = 0; k < block_size; ++k ) { + const size_t index = i + k; + assert( index < local_n + lower_bound ); + if( already_dense_mask ) { + mask_b[ k ] = internal::getCoordinates( mask_vector ).template + mask< descr >( index, mask_p ); + } else { + mask_b[ k ] = local_mask.template + mask< descr >( index - lower_bound, mask_p + lower_bound ); + } + } + // check for no output + if( left_sparse && right_sparse ) { + for( size_t k = 0; k < block_size; ++k ) { + const size_t index = i + k; + assert( index < local_n + lower_bound ); + if( mask_b[ k ] ) { + if( !( already_dense_input_x || + local_x.assigned( index - lower_bound ) + ) && !( + already_dense_input_y || + local_y.assigned( index - lower_bound ) + ) + ) { + mask_b[ k ] = false; + } + } + } + } + for( size_t k = 0; k < block_size; ++k ) { + const size_t index = i + k; + assert( index < local_n + lower_bound ); + if( mask_b[ k ] ) { + if( !left_scalar ) { + if( left_sparse && !( + already_dense_input_x || local_x.assigned( index - lower_bound ) + ) ) { + x_b[ k ] = *left_identity; + } else { + x_b[ k ] = *( x_p + index ); + } + } + if( !right_scalar ) { + if( right_sparse && !( + already_dense_input_y || local_y.assigned( index - lower_bound ) + ) ) { + y_b[ k ] = *right_identity; + } else { + y_b[ k ] = *( y_p + index ); + } + } + } + } + for( size_t k = 0; k < block_size; ++k ) { + if( mask_b[ k ] ) { + apply( z_b[ k ], x_b[ k ], y_b[ k ], op ); + } + } + for( size_t k = 0; k < block_size; ++k ) { + const size_t index = i + k; + assert( index < local_n + lower_bound ); + if( mask_b[ k ] ) { + (void) local_z.assign( index - lower_bound ); + GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // This is only triggered with + *( z_p + index ) = z_b[ k ]; // mask_b[ k ], which in the above + GRB_UTIL_RESTORE_WARNINGS // loop also triggeres initialising + // z_b[ k ] + } + } + + i += block_size; + } + // scalar coda + for( + size_t i = end * block_size + lower_bound; + i < local_n + lower_bound; + ++i + ) { + if( already_dense_mask ) { + if( !internal::getCoordinates( mask_vector ).template mask< descr >( i, + mask_p ) + ) { + continue; + } + } else { + if( !local_mask.template mask< descr >( i - lower_bound, mask_p + + lower_bound ) + ) { + continue; + } + } + + if( left_sparse && right_sparse ) { + if( !( already_dense_input_x || local_x.assigned( i - lower_bound ) ) && + !( already_dense_input_y || local_y.assigned( i - lower_bound ) ) + ) { + continue; + } + } + (void) local_z.assign( i - lower_bound ); + const InputType1 x_e = left_scalar + ? x_wrapper.getValue() + : ( (!left_sparse || already_dense_input_x || + local_x.assigned( i - lower_bound )) + ? *(x_p + i) + : *left_identity + ); + const InputType2 y_e = right_scalar + ? 
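+ // In this scalar coda a scalar operand is read from its wrapper, while a
+ // structurally missing entry of a sparse operand falls back to the supplied
+ // left or right identity element.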
y_wrapper.getValue() + : ( (!right_sparse || already_dense_input_y || + local_y.assigned( i - lower_bound )) + ? *(y_p + i) + : *right_identity + ); + OutputType * const z_e = z_p + i; + apply( *z_e, x_e, y_e, op ); + } + } else { +#ifdef _DEBUG + std::cerr << "\t in smallLoop variant\n"; +#endif + // declare buffers that fit in a single SIMD register and initialise if + // needed + constexpr const size_t block_size = size_t_block_size > 0 + ? min_block_size + : op_block_size; + bool mask_b[ block_size ]; + OutputType z_b[ block_size ]; + InputType1 x_b[ block_size ]; + InputType2 y_b[ block_size ]; + size_t indices[ block_size ]; + for( size_t k = 0; k < block_size; ++k ) { + if( left_scalar ) { + x_b[ k ] = x_wrapper.getValue(); + } + if( right_scalar ) { + y_b[ k ] = y_wrapper.getValue(); + } + } + + // loop over mask pattern + const size_t mask_nnz = local_mask_nz; + const size_t num_blocks = mask_nnz / block_size; + const size_t start = 0; + const size_t end = num_blocks; + + size_t k = 0; + + // vectorised code + for( size_t b = start; b < end; ++b ) { + for( size_t t = 0; t < block_size; ++t ) { + indices[ t ] = (already_dense_mask ) ? k + t : local_mask.index( k + t ); + } + for( size_t t = 0; t < block_size; ++t ) { + if( already_dense_mask ) { + mask_b[ t ] = internal::getCoordinates( mask_vector ).template + mask< descr >( indices[ t ], mask_p ); + } else { + mask_b[ t ] = local_mask.template + mask< descr >( indices[ t ], mask_p + lower_bound ); + } + } + for( size_t t = 0; t < block_size; ++t ) { + if( mask_b[ t ] ) { + if( !left_scalar ) { + if( left_sparse && !( already_dense_input_x || + local_x.assigned( indices[ t ] ) ) + ) { + x_b[ t ] = *left_identity; + } else { + x_b[ t ] = *( x_p + indices[ t ] + lower_bound ); + } + } + if( !right_scalar ) { + if( right_sparse && !( already_dense_input_y || + local_y.assigned( indices[ t ] ) ) + ) { + y_b[ t ] = *right_identity; + } else { + y_b[ t ] = *( y_p + indices[ t ] + lower_bound ); + } + } + } + } + // check for no output + if( left_sparse && right_sparse ) { + for( size_t t = 0; t < block_size; ++t ) { + const size_t index = indices[ t ]; + assert( index < local_n + lower_bound ); + if( mask_b[ t ] ) { + if( !( already_dense_input_x || local_x.assigned( index ) ) && + !( already_dense_input_y || local_y.assigned( index ) ) + ) { + mask_b[ t ] = false; + } + } + } + } + for( size_t t = 0; t < block_size; ++t ) { + if( mask_b[ t ] ) { + apply( z_b[ t ], x_b[ t ], y_b[ t ], op ); + } + } + for( size_t t = 0; t < block_size; ++t ) { + if( mask_b[ t ] ) { + (void) local_z.assign( indices[ t ] ); + GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // z_b is computed from + *( z_p + indices[ t ] + lower_bound ) = z_b[ t ]; // x_b and y_b, which + GRB_UTIL_RESTORE_WARNINGS // are both initialised + // if mask_b is true + } + } + + k += block_size; + } + + // scalar coda + for( size_t k = end * block_size; k < mask_nnz; ++k ) { + const size_t i = already_dense_mask + ? k + lower_bound + : local_mask.index( k ) + lower_bound; + if( ( already_dense_mask && + internal::getCoordinates( mask_vector ).template mask< descr >( + i, mask_p + ) + ) || local_mask.template mask< descr >( + i - lower_bound, mask_p + lower_bound + ) + ) { + if( left_sparse && right_sparse ) { + if( !( already_dense_input_x || local_x.assigned( i - lower_bound ) ) && + !( already_dense_input_y || local_y.assigned( i - lower_bound ) ) + ) { + continue; + } + } + (void) local_z.assign( i - lower_bound ); + const InputType1 x_e = left_scalar + ? 
x_wrapper.getValue() + : ( + (!left_sparse || already_dense_input_x || + local_x.assigned( i - lower_bound ) ) + ? *(x_p + i) + : *left_identity + ); + const InputType2 y_e = right_scalar + ? y_wrapper.getValue() + : ( + (!right_sparse || already_dense_input_y || + local_y.assigned( i - lower_bound ) ) + ? *(y_p + i) + : *right_identity + ); + OutputType * const z_e = z_p + i; + apply( *z_e, x_e, y_e, op ); + } + } + } + assert( false ); + return UNSUPPORTED; + } + + } // end namespace ``grb::internal'' + + template< + Descriptor descr = descriptors::no_operation, + class OP, + typename OutputType, + typename InputType1, + typename InputType2, + typename Coords + > + RC eWiseApply( + Vector< OutputType, ascend, Coords > &z, + const Vector< InputType1, ascend, Coords > &x, + const InputType2 beta, + const OP &op = OP(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_operator< OP >::value, void + >::type * const = nullptr + ) { + // static checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D1, InputType1 >::value ), "grb::eWiseApply", + "called with a left-hand input element type that does not match the " + "first domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D2, InputType2 >::value ), "grb::eWiseApply", + "called with a right-hand input element type that does not match the " + "second domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D3, OutputType >::value ), "grb::eWiseApply", + "called with an output element type that does not match the " + "third domain of the given operator" ); +#ifdef _DEBUG + std::cout << "In eWiseApply ([T1]<-[T2]<-T3), operator variant\n"; +#endif + // sanity check + auto &z_coors = internal::getCoordinates( z ); + const size_t n = z_coors.size(); + if( internal::getCoordinates( x ).size() != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = + [&z, &x, beta, &op] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage eWiseApply(z, x, beta, operator) in " + << "the range(" << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + const Vector< bool, ascend, Coords > * const null_mask = nullptr; + const Coords * const local_null_mask = nullptr; + + Coords local_mask, local_x, local_y, local_z; + const size_t local_n = upper_bound - lower_bound; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + + bool already_dense_input_x = true; + + size_t local_x_nz = local_n; + + if( !already_dense_vectors ) { + local_z = internal::getCoordinates( z ).asyncSubset( lower_bound, + upper_bound ); + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_input_x = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_input_x ) { +#else + already_dense_input_x = false; +#endif + local_x = 
internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + const internal::Wrapper< false, InputType1, Coords > x_wrapper( x ); + const internal::Wrapper< true, InputType2, Coords > y_wrapper( beta ); + + // the global stack counter must be set to 0 unless it's guaranteed + // that none of the local_clear and local_assignAll will be invoked + // - local_clear is not invoked when the dense descriptor is given, + // since the output vector will eventually become dense + // - local_assignAll is not invoked when the output vector is already dense + // therefore, the following condition relies on global information, + // i.e., the dense descriptor and the already_dense_output + if( !already_dense_vectors ) { + if( lower_bound == 0 ) { + internal::getCoordinates( z ).reset_global_nnz_counter(); + } + } + + if( local_x_nz == local_n ) { + if( !already_dense_vectors ) { + local_z.local_assignAll( ); + } + + // call dense apply +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_dense_apply_generic< +#else + rc = internal::dense_apply_generic< +#endif + false, true, false, false, descr | descriptors::dense, OP, + OutputType, InputType1, InputType2, Coords + >( + already_dense_input_x, true, + lower_bound, upper_bound, + local_x, local_y, + z, x_wrapper, y_wrapper, + op + ); + } else { + if( !already_dense_vectors ) { + local_z.local_clear(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) ); +#endif + } + + // since z and x may not perfectly overlap, and since the intersection is + // unknown a priori, we must iterate over the nonzeroes of x +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_sparse_apply_generic< +#else + rc = internal::sparse_apply_generic< +#endif + false, false, false, true, descr, OP, + OutputType, bool, InputType1, InputType2, Coords + >( + true, already_dense_input_x, true, + lower_bound, upper_bound, + local_z, local_null_mask, local_x, local_y, + z, null_mask, x_wrapper, y_wrapper, op + ); + } + + if( !already_dense_vectors ) { + internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? 
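+ // The stage above dispatches to dense_apply_generic when x is completely
+ // filled within this tile and to sparse_apply_generic otherwise; the
+ // internal::Wrapper objects let the same kernels treat the vector x and the
+ // scalar beta uniformly through getRaw()/getValue(). The addStage() call
+ // that follows only records the stage in the pipeline, as for the fold
+ // variants above.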
ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_EWISEAPPLY, + n, sizeof( OutputType ), dense_descr, true, + &z, nullptr, &internal::getCoordinates( z ), nullptr, + &x, nullptr, nullptr, nullptr, + &internal::getCoordinates( x ), nullptr, nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: eWiseApply(z, x, beta, operator)" + << std::endl; +#endif + + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class OP, + typename OutputType, + typename InputType1, + typename InputType2, + typename Coords + > + RC eWiseApply( + Vector< OutputType, ascend, Coords > &z, + const InputType1 alpha, + const InputType2 beta, + const OP &op = OP(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_operator< OP >::value, void + >::type * const = nullptr + ) { + // static checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D1, InputType1 >::value ), "grb::eWiseApply", + "called with a left-hand input element type that does not match the " + "first domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D2, InputType2 >::value ), "grb::eWiseApply", + "called with a right-hand input element type that does not match the " + "second domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D3, OutputType >::value ), "grb::eWiseApply", + "called with an output element type that does not match the " + "third domain of the given operator" ); +#ifdef _DEBUG + std::cout << "In eWiseApply ([T1]<-T2<-T3), operator variant\n"; +#endif + if( (descr & descriptors::dense) && nnz( z ) < size( z ) ) { + return ILLEGAL; + } + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + typename OP::D3 val; + RC ret = apply< descr >( val, alpha, beta, op ); + ret = ret ? 
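+ // With both inputs scalar the result is uniform over z, so the operator is
+ // applied once and the value broadcast via set(). Illustrative example
+ // (values are assumptions): grb::eWiseApply( z, 1.0, 2.0,
+ // grb::operators::add< double >() ) leaves every entry of z equal to 3.0.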
ret : set< descr >( z, val ); + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class OP, + typename OutputType, typename MaskType, + typename InputType1, typename InputType2, + typename Coords + > + RC eWiseApply( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &mask, + const InputType1 alpha, + const InputType2 beta, + const OP &op = OP(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_operator< OP >::value, void + >::type * const = nullptr + ) { + // static checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D1, InputType1 >::value ), "grb::eWiseApply", + "called with a left-hand input element type that does not match the " + "first domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D2, InputType2 >::value ), "grb::eWiseApply", + "called with a right-hand input element type that does not match the " + "second domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D3, OutputType >::value ), "grb::eWiseApply", + "called with an output element type that does not match the " + "third domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), "grb::eWiseApply", + "called with an output mask element type that is not Boolean " ); +#ifdef _DEBUG + std::cout << "In masked eWiseApply ([T1]<-T2<-T3), operator variant\n"; +#endif + // check trivial dispatch + if( size( mask ) == 0 ) { + return eWiseApply< descr >( z, alpha, beta, op, phase ); + } + + // dynamic checks + if( size( mask ) != size( z ) ) { + return MISMATCH; + } + if( (descr & descriptors::dense) && + ( nnz( z ) < size( z ) || nnz( mask ) < size( mask ) ) + ) { + return ILLEGAL; + } + + // check trivial dispatch + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + typename OP::D3 val; + RC ret = apply< descr >( val, alpha, beta, op ); + ret = ret ? 
ret : set< descr >( z, mask, val ); + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename OutputType, + typename InputType1, + typename InputType2, + typename Coords + > + RC eWiseApply( + Vector< OutputType, ascend, Coords > &z, + const InputType1 alpha, + const InputType2 beta, + const Monoid &monoid = Monoid(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_monoid< Monoid >::value, + void + >::type * const = nullptr + ) { + // static checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D1, InputType1 >::value ), "grb::eWiseApply", + "called with a left-hand input element type that does not match the " + "first domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D2, InputType2 >::value ), "grb::eWiseApply", + "called with a right-hand input element type that does not match the " + "second domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D3, OutputType >::value ), "grb::eWiseApply", + "called with an output element type that does not match the " + "third domain of the given monoid" ); +#ifdef _DEBUG + std::cout << "In eWiseApply ([T1]<-T2<-T3), monoid variant\n"; +#endif + // simply delegate to operator variant + return eWiseApply< descr >( z, alpha, beta, monoid.getOperator(), phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename OutputType, typename MaskType, + typename InputType1, typename InputType2, + typename Coords + > + RC eWiseApply( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &mask, + const InputType1 alpha, + const InputType2 beta, + const Monoid &monoid = Monoid(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_monoid< Monoid >::value, void + >::type * const = nullptr + ) { + // static checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D1, InputType1 >::value ), "grb::eWiseApply", + "called with a left-hand input element type that does not match the " + "first domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D2, InputType2 >::value ), "grb::eWiseApply", + "called with a right-hand input element type that does not match the " + "second domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D3, OutputType >::value ), "grb::eWiseApply", + "called with an output element type that does not match the " + "third domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), "grb::eWiseApply", + "called with an output mask element type that is not Boolean " ); +#ifdef _DEBUG + std::cout << "In masked eWiseApply ([T1]<-T2<-T3), monoid variant\n"; +#endif + // simply delegate to operator variant + return eWiseApply< descr >( z, mask, alpha, beta, monoid.getOperator(), + phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + 
class OP, + typename OutputType, + typename MaskType, + typename InputType1, + typename InputType2, + typename Coords + > + RC eWiseApply( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &mask, + const Vector< InputType1, ascend, Coords > &x, + const InputType2 beta, + const OP &op = OP(), + const Phase &phase = EXECUTE, + const typename std::enable_if< !grb::is_object< OutputType >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_operator< OP >::value, + void + >::type * const = nullptr + ) { + // static checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D1, InputType1 >::value ), "grb::eWiseApply", + "called with a left-hand input element type that does not match the " + "first domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D2, InputType2 >::value ), "grb::eWiseApply", + "called with a right-hand input element type that does not match the " + "second domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D3, OutputType >::value ), "grb::eWiseApply", + "called with an output element type that does not match the " + "third domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), "grb::eWiseApply", + "called with an output mask element type that is not Boolean " ); +#ifdef _DEBUG + std::cout << "In masked eWiseApply ([T1]<-[T2]<-T3, using operator)\n"; +#endif + // check for empty mask + if( size( mask ) == 0 ) { + return eWiseApply< descr >( z, x, beta, op ); + } + + // other run-time checks + const size_t n = internal::getCoordinates( z ).size(); + if( internal::getCoordinates( x ).size() != n ) { + return MISMATCH; + } + if( internal::getCoordinates( mask ).size() != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + constexpr const bool dense_mask = dense_descr && + (descr & descriptors::structural) && !(descr & descriptors::invert_mask); + + internal::Pipeline::stage_type func = + [&z, &mask, &x, beta, &op] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage eWiseApply(z, mask, x, beta, " + << "operator) in the range(" << lower_bound << ", " << upper_bound << ")" + << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_mask, local_x, local_y, local_z; + const size_t local_n = upper_bound - lower_bound; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + constexpr const bool already_dense_vectors = dense_descr; +#endif + + const bool mask_is_dense = (descr & descriptors::structural) && + !(descr & descriptors::invert_mask) && already_dense_vectors; + + bool already_dense_mask = true; + bool already_dense_input_x = true; + + size_t local_mask_nz = local_n; + size_t local_x_nz = local_n; + + if( !mask_is_dense ) { + local_z = internal::getCoordinates( z ).asyncSubset( lower_bound, + upper_bound ); + if( dense_descr && local_z.nonzeroes() < local_n ) { + return ILLEGAL; + } + } + + if( !already_dense_vectors ) { +#ifdef 
GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_mask = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( mask ) ); + if( !already_dense_mask ) { +#else + already_dense_mask = false; +#endif + local_mask = internal::getCoordinates( mask ).asyncSubset( lower_bound, + upper_bound ); + local_mask_nz = local_mask.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input_x = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_input_x ) { +#else + already_dense_input_x = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + const internal::Wrapper< false, InputType1, Coords > x_wrapper( x ); + const internal::Wrapper< true, InputType2, Coords > y_wrapper( beta ); + + if( !mask_is_dense ) { + // the output sparsity structure is implied by mask and descr + local_z.local_clear(); + if( lower_bound == 0 ) { + internal::getCoordinates( z ).reset_global_nnz_counter(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) ); +#endif + if( dense_descr ) { + pipeline.markMaybeSparseDenseDescriptorVerification( + &internal::getCoordinates( z ) ); + } + } + } + + if( + (descr & descriptors::dense) || + (local_x_nz == local_n) || + (local_mask_nz <= local_x_nz) + ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_masked_apply_generic< +#else + rc = internal::masked_apply_generic< +#endif + false, true, false, false, descr, OP, + OutputType, MaskType, InputType1, InputType2, Coords + >( + already_dense_mask, already_dense_input_x, true, + lower_bound, upper_bound, + local_z, local_mask, local_x, local_y, + z, mask, x_wrapper, y_wrapper, + op + ); + } else { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_sparse_apply_generic< +#else + rc = internal::sparse_apply_generic< +#endif + true, false, false, true, descr, OP, + OutputType, bool, InputType1, InputType2, Coords + >( + already_dense_mask, already_dense_input_x, true, + lower_bound, upper_bound, + local_z, &local_mask, local_x, local_y, + z, &mask, x_wrapper, y_wrapper, + op + ); + } + + if( !mask_is_dense ) { + internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ?
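+ // The stage is registered with dense_mask set only when the dense,
+ // structural, and non-inverted descriptors together guarantee that the mask
+ // selects every entry, so the pipeline may treat the mask as effectively
+ // dense when analysing this stage.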
ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_MASKED_EWISEAPPLY, + n, sizeof( OutputType ), dense_descr, dense_mask, + &z, nullptr, &internal::getCoordinates( z ), nullptr, + &x, &mask, nullptr, nullptr, + &internal::getCoordinates( x ), &internal::getCoordinates( mask ), + nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: eWiseApply(z, mask, x, beta, " + << "operator)" << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename OutputType, + typename InputType1, + typename InputType2, + typename Coords + > + RC eWiseApply( + Vector< OutputType, ascend, Coords > &z, + const Vector< InputType1, ascend, Coords > &x, + const Vector< InputType2, ascend, Coords > &y, + const Monoid &monoid = Monoid(), + const Phase &phase = EXECUTE, + const typename std::enable_if< !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_monoid< Monoid >::value, + void + >::type * const = nullptr + ) { + // static checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D1, InputType1 >::value ), "grb::eWiseApply", + "called with a left-hand input element type that does not match the " + "first domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D2, InputType2 >::value ), "grb::eWiseApply", + "called with a right-hand input element type that does not match the " + "second domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D3, OutputType >::value ), "grb::eWiseApply", + "called with an output element type that does not match the " + "third domain of the given monoid" ); +#ifdef _DEBUG + std::cout << "In unmasked eWiseApply ([T1]<-[T2]<-[T3], using monoid)\n"; +#endif + // other run-time checks + const size_t n = internal::getCoordinates( z ).size(); + if( internal::getCoordinates( x ).size() != n ) { + return MISMATCH; + } + if( internal::getCoordinates( y ).size() != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + // check if we can dispatch to dense variant + if( (descr & descriptors::dense) ) { + return eWiseApply< descr >( z, x, y, monoid.getOperator() ); + } + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = [&z, &x, &y, &monoid, phase] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage eWiseApply(z, x, y, monoid) in the " + << "range(" << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + const Vector< bool, ascend, Coords > * const null_mask = nullptr; + const Coords * const local_null_mask = nullptr; + + Coords local_x, local_y, local_z; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + ( void )pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + + bool already_dense_input_x = true; + bool already_dense_input_y = true; + + if( !already_dense_vectors ) { + local_z = internal::getCoordinates( z ).asyncSubset( lower_bound, + upper_bound ); + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION 
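+				// vectors that the pipeline already knows to be completely dense need
+				// no local coordinate reads; their subsets are skipped below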
+ already_dense_input_x = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_input_x ) { +#else + already_dense_input_x = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input_y = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !already_dense_input_y ) { +#else + already_dense_input_y = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + const internal::Wrapper< false, InputType1, Coords > x_wrapper( x ); + const internal::Wrapper< false, InputType2, Coords > y_wrapper( y ); + + // we are in the unmasked sparse variant + const auto op = monoid.getOperator(); + + if( !already_dense_vectors ) { + // z will have an a-priori unknown sparsity structure + local_z.local_clear(); + if( lower_bound == 0 ) { + internal::getCoordinates( z ).reset_global_nnz_counter(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) ); +#endif + } + } + +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_sparse_apply_generic< +#else + rc = internal::sparse_apply_generic< +#endif + false, true, false, false, descr, typename Monoid::Operator, + OutputType, bool, InputType1, InputType2, Coords + >( + true, already_dense_input_x, already_dense_input_y, + lower_bound, upper_bound, + local_z, local_null_mask, local_x, local_y, + z, null_mask, x_wrapper, y_wrapper, + op + ); + + if( !already_dense_vectors ) { + internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? 
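+		// the stage records the vectors it reads and writes together with their
+		// coordinate sets, so that the pipeline can place it after the stages it
+		// depends on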
ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_EWISEAPPLY, + n, sizeof( OutputType ), dense_descr, true, + getID( z ), + &z, nullptr, &internal::getCoordinates( z ), nullptr, + getID( x ), getID( y ), SIZE_MAX, SIZE_MAX, + &x, &y, nullptr, nullptr, + &internal::getCoordinates( x ), &internal::getCoordinates( y ), + nullptr, nullptr, + SIZE_MAX, nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: eWiseApply(z, x, y, monoid)" + << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename OutputType, + typename InputType1, + typename InputType2, + typename Coords + > + RC eWiseApply( + Vector< OutputType, ascend, Coords > &z, + const InputType1 alpha, + const Vector< InputType2, ascend, Coords > &y, + const Monoid &monoid = Monoid(), + const Phase &phase = EXECUTE, + const typename std::enable_if< !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_monoid< Monoid >::value, + void + >::type * const = nullptr + ) { + // static checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D1, InputType1 >::value ), "grb::eWiseApply", + "called with a left-hand input element type that does not match the " + "first domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D2, InputType2 >::value ), "grb::eWiseApply", + "called with a right-hand input element type that does not match the " + "second domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D3, OutputType >::value ), "grb::eWiseApply", + "called with an output element type that does not match the " + "third domain of the given monoid" ); +#ifdef _DEBUG + std::cout << "In unmasked eWiseApply ([T1]<-T2<-[T3], using monoid)\n"; +#endif + // other run-time checks + const size_t n = internal::getCoordinates( z ).size(); + if( internal::getCoordinates( y ).size() != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + RC ret = SUCCESS; + + // check if we can dispatch to dense variant + if( (descr & descriptors::dense) ) { + return eWiseApply< descr >( z, alpha, y, monoid.getOperator() ); + } + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = [&z, alpha, &y, &monoid] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage eWiseApply(z, alpha, y, monoid) in the " + << "range(" << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_x, local_y, local_z; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + bool already_dense_output = true; +#endif + bool already_dense_input_y = true; + + // when it's guaranteed that the output will become dense + // the only criterion to avoid reading the local coordinates is if it the + // output is already dense +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( 
z ) ); + if( !already_dense_output ) { +#endif + local_z = internal::getCoordinates( z ).asyncSubset( lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_input_y = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !already_dense_input_y ) { +#else + already_dense_input_y = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + const internal::Wrapper< true, InputType1, Coords > x_wrapper( alpha ); + const internal::Wrapper< false, InputType2, Coords > y_wrapper( y ); + + // we are in the unmasked sparse variant + const auto &op = monoid.getOperator(); + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#endif + local_z.local_assignAllNotAlreadyAssigned(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + + // dispatch to generic function +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_dense_apply_generic< +#else + rc = internal::dense_apply_generic< +#endif + true, false, false, true, descr, typename Monoid::Operator, + OutputType, InputType1, InputType2, Coords + >( + true, already_dense_input_y, + lower_bound, upper_bound, + local_x, local_y, + z, x_wrapper, y_wrapper, op + ); + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#else + if( !already_dense_vectors ) { +#endif + internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_EWISEAPPLY, + n, sizeof( OutputType ), dense_descr, true, + &z, nullptr, &internal::getCoordinates( z ), nullptr, + &y, nullptr, nullptr, nullptr, + &internal::getCoordinates( y ), nullptr, nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: eWiseApply(z, alpha, y, monoid)" + << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename OutputType, + typename InputType1, + typename InputType2, + typename Coords + > + RC eWiseApply( + Vector< OutputType, ascend, Coords > &z, + const Vector< InputType1, ascend, Coords > &x, + const InputType2 beta, + const Monoid &monoid = Monoid(), + const Phase &phase = EXECUTE, + const typename std::enable_if< !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_monoid< Monoid >::value, + void >::type * const = nullptr + ) { + // static checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D1, InputType1 >::value ), "grb::eWiseApply", + "called with a left-hand input element type that does not match the " + "first domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D2, InputType2 >::value ), "grb::eWiseApply", + "called with a right-hand input element type that does not match the " + "second domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D3, OutputType >::value ), "grb::eWiseApply", + "called with an output element type that does not match the " + "third domain of the given monoid" ); +#ifdef _DEBUG + std::cout << "In unmasked eWiseApply ([T1]<-[T2]<-T3, using monoid)\n"; +#endif + 
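+		// this variant applies the monoid operator between a vector and the scalar
+		// beta; the output is always dense, since entries missing from x contribute
+		// the monoid identity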
// other run-time checks + const size_t n = internal::getCoordinates( z ).size(); + if( internal::getCoordinates( x ).size() != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + // check if we can dispatch to dense variant + if( (descr & descriptors::dense) ) { + return eWiseApply< descr >( z, x, beta, monoid.getOperator() ); + } + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = [&z, &x, beta, &monoid] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage eWiseApply(z, x, beta, monoid) in the " + << "range(" << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_x, local_y, local_z; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + bool already_dense_output = true; +#endif + bool already_dense_input_x = true; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( z ) ); + if( !already_dense_output ) { +#endif + local_z = internal::getCoordinates( z ).asyncSubset( lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_input_x = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_input_x ) { +#else + already_dense_input_x = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + const internal::Wrapper< false, InputType1, Coords > x_wrapper( x ); + const internal::Wrapper< true, InputType2, Coords > y_wrapper( beta ); + + // we are in the unmasked sparse variant + const auto &op = monoid.getOperator(); + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#endif + // the result will always be dense + local_z.local_assignAllNotAlreadyAssigned(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + + // dispatch +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_dense_apply_generic< +#else + rc = internal::dense_apply_generic< +#endif + false, true, true, false, descr, typename Monoid::Operator, + OutputType, InputType1, InputType2, Coords + >( + already_dense_input_x, true, + lower_bound, upper_bound, + local_x, local_y, + z, x_wrapper, y_wrapper, + op + ); + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#else + + if( !already_dense_vectors ) { +#endif + internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? 
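+		// beta is a scalar, so only x and its coordinates are registered with the
+		// pipeline as inputs of this stage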
ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_EWISEAPPLY, + n, sizeof( OutputType ), dense_descr, true, + &z, nullptr, &internal::getCoordinates( z ), nullptr, + &x, nullptr, nullptr, nullptr, + &internal::getCoordinates( x ), nullptr, nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: eWiseApply(z, x, beta, monoid)" + << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename OutputType, + typename MaskType, + typename InputType1, + typename InputType2, + typename Coords + > + RC eWiseApply( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &mask, + const Vector< InputType1, ascend, Coords > &x, + const Vector< InputType2, ascend, Coords > &y, + const Monoid &monoid = Monoid(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_monoid< Monoid >::value, + void + >::type * const = nullptr + ) { + // static checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D1, InputType1 >::value ), "grb::eWiseApply", + "called with a left-hand input element type that does not match the " + "first domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D2, InputType2 >::value ), "grb::eWiseApply", + "called with a right-hand input element type that does not match the " + "second domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D3, OutputType >::value ), "grb::eWiseApply", + "called with an output element type that does not match the " + "third domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), "grb::eWiseApply", + "called with an output mask element type that is not Boolean " ); +#ifdef _DEBUG + std::cout << "In masked eWiseApply ([T1]<-[T2]<-[T3], using monoid)\n"; +#endif + if( size( mask ) == 0 ) { + return eWiseApply< descr >( z, x, y, monoid, phase ); + } + + // other run-time checks + const size_t n = internal::getCoordinates( z ).size(); + if( internal::getCoordinates( x ).size() != n ) { + return MISMATCH; + } + if( internal::getCoordinates( y ).size() != n ) { + return MISMATCH; + } + if( internal::getCoordinates( mask ).size() != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + // check if we can dispatch to dense variant + if( (descr & descriptors::dense) ) { + return eWiseApply< descr >( z, mask, x, y, monoid.getOperator() ); + } + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + constexpr const bool dense_mask = dense_descr && + (descr & descriptors::structural) && !(descr & descriptors::invert_mask); + + internal::Pipeline::stage_type func = [&z, &mask, &x, &y, &monoid] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage eWiseApply(z, mask, x, y, monoid) in " + << "the range(" << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_mask, local_x, local_y, local_z; + const 
size_t local_n = upper_bound - lower_bound; + size_t local_mask_nz = local_n; + size_t local_x_nz = local_n; + size_t local_y_nz = local_n; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + constexpr const bool already_dense_vectors = dense_descr; +#endif + + const bool mask_is_dense = (descr & descriptors::structural) && + !(descr & descriptors::invert_mask) && already_dense_vectors; + + bool already_dense_mask = true; + bool already_dense_input_x = true; + bool already_dense_input_y = true; + + if( !mask_is_dense ) { + local_z = internal::getCoordinates( z ).asyncSubset( lower_bound, + upper_bound ); + if( dense_descr && local_z.nonzeroes() < local_n ) { + return ILLEGAL; + } + } + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_mask = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( mask ) ); + if( !already_dense_mask ) { +#else + already_dense_mask = false; +#endif + local_mask = internal::getCoordinates( mask ).asyncSubset( lower_bound, + upper_bound ); + local_mask_nz = local_mask.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input_x = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_input_x ) { +#else + already_dense_input_x = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input_y = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !already_dense_input_y ) { +#else + already_dense_input_y = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); + local_y_nz = local_y.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + const internal::Wrapper< false, InputType1, Coords > x_wrapper( x ); + const internal::Wrapper< false, InputType2, Coords > y_wrapper( y ); + + // we are in the masked sparse variant + const InputType1 left_identity = monoid.template getIdentity< InputType1 >(); + const InputType2 right_identity = + monoid.template getIdentity< InputType2 >(); + const auto &op = monoid.getOperator(); + + if( !mask_is_dense ) { + // z will have an a priori unknown sparsity structure + local_z.local_clear(); + if( lower_bound == 0 ) { + internal::getCoordinates( z ).reset_global_nnz_counter(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) ); +#endif + if( dense_descr ) { + pipeline.markMaybeSparseDenseDescriptorVerification( + &internal::getCoordinates( z ) ); + } + } + } + + if( local_x_nz < local_n && + local_y_nz < local_n && + local_x_nz + local_y_nz < local_mask_nz + ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_sparse_apply_generic< +#else + rc = internal::sparse_apply_generic< +#endif + true, true, false, false, descr, typename Monoid::Operator, + OutputType, bool, InputType1, InputType2, Coords + >( + already_dense_mask, already_dense_input_x, already_dense_input_y, + lower_bound, upper_bound, + local_z, &local_mask, local_x, local_y, + z, &mask, x_wrapper, y_wrapper, + op + ); + } else if( local_x_nz < local_n && local_y_nz == local_n ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_masked_apply_generic< +#else + rc = internal::masked_apply_generic< +#endif + false, false, true, false, descr, typename Monoid::Operator, + 
OutputType, MaskType, InputType1, InputType2, Coords + >( + already_dense_mask, already_dense_input_x, already_dense_input_y, + lower_bound, upper_bound, + local_z, local_mask, local_x, local_y, + z, mask, x_wrapper, y_wrapper, + op, + &left_identity, nullptr + ); + } else if( local_y_nz < local_n && local_x_nz == local_n ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_masked_apply_generic< +#else + rc = internal::masked_apply_generic< +#endif + false, false, false, true, descr, typename Monoid::Operator, + OutputType, MaskType, InputType1, InputType2, Coords + >( + already_dense_mask, already_dense_input_x, already_dense_input_y, + lower_bound, upper_bound, + local_z, local_mask, local_x, local_y, + z, mask, x_wrapper, y_wrapper, + op, + nullptr, &right_identity + ); + } else { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_masked_apply_generic< +#else + rc = internal::masked_apply_generic< +#endif + false, false, true, true, descr, typename Monoid::Operator, + OutputType, MaskType, InputType1, InputType2, Coords + >( + already_dense_mask, already_dense_input_x, already_dense_input_y, + lower_bound, upper_bound, + local_z, local_mask, local_x, local_y, + z, mask, x_wrapper, y_wrapper, + op, + &left_identity, &right_identity + ); + } + + if( !mask_is_dense ) { + internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_MASKED_EWISEAPPLY, + n, sizeof( OutputType ), dense_descr, dense_mask, + &z, nullptr, &internal::getCoordinates( z ), nullptr, + &x, &y, &mask, nullptr, + &internal::getCoordinates( x ), &internal::getCoordinates( y ), + &internal::getCoordinates( mask ), nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: eWiseApply(z, mask, x, y, " + << "monoid)" << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename OutputType, + typename MaskType, + typename InputType1, + typename InputType2, + typename Coords + > + RC eWiseApply( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &mask, + const InputType1 alpha, + const Vector< InputType2, ascend, Coords > &y, + const Monoid &monoid = Monoid(), + const Phase &phase = EXECUTE, + const typename std::enable_if< !grb::is_object< OutputType >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_monoid< Monoid >::value, + void >::type * const = nullptr + ) { + // static checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D1, InputType1 >::value ), "grb::eWiseApply", + "called with a left-hand input element type that does not match the " + "first domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D2, InputType2 >::value ), "grb::eWiseApply", + "called with a right-hand input element type that does not match the " + "second domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D3, OutputType >::value ), "grb::eWiseApply", + "called with an output element type that does not match the " + "third domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), 
"grb::eWiseApply", + "called with an output mask element type that is not Boolean " ); +#ifdef _DEBUG + std::cout << "In masked eWiseApply ([T1]<-T2<-[T3], using monoid)\n"; +#endif + if( size( mask ) == 0 ) { + return eWiseApply< descr >( z, alpha, y, monoid ); + } + + // other run-time checks + const size_t n = internal::getCoordinates( z ).size(); + if( internal::getCoordinates( y ).size() != n ) { + return MISMATCH; + } + if( internal::getCoordinates( mask ).size() != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + // check if we can dispatch to dense variant + if( descr & descriptors::dense ) { + return eWiseApply< descr >( z, mask, alpha, y, monoid.getOperator() ); + } + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + constexpr const bool dense_mask = dense_descr && + (descr & descriptors::structural) && !(descr & descriptors::invert_mask); + + internal::Pipeline::stage_type func = [&z, &mask, alpha, &y, &monoid] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage eWiseApply(z, mask, alpha, y, monoid) " + << "in the range(" << lower_bound << ", " << upper_bound << ")" + << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_mask, local_x, local_y, local_z; + const size_t local_n = upper_bound - lower_bound; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + constexpr const bool already_dense_vectors = dense_descr; +#endif + + const bool mask_is_dense = (descr & descriptors::structural) && + !(descr & descriptors::invert_mask) && already_dense_vectors; + + bool already_dense_mask = true; + bool already_dense_input_y = true; + + if( !mask_is_dense ) { + local_z = internal::getCoordinates( z ).asyncSubset( lower_bound, + upper_bound ); + if( dense_descr && local_z.nonzeroes() < local_n ) { + return ILLEGAL; + } + } + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_mask = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( mask ) ); + if( !already_dense_mask ) { +#else + already_dense_mask = false; +#endif + local_mask = internal::getCoordinates( mask ).asyncSubset( lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input_y = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !already_dense_input_y ) { +#else + already_dense_input_y = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + const internal::Wrapper< true, InputType1, Coords > x_wrapper( alpha ); + const internal::Wrapper< false, InputType2, Coords > y_wrapper( y ); + + // we are in the masked sparse variant + const InputType2 right_identity = + monoid.template getIdentity< InputType2 >(); + const auto &op = monoid.getOperator(); + + if( !mask_is_dense ) { + // the sparsity structure of z will be a result of the given mask and descr + local_z.local_clear(); + if( lower_bound == 0 ) { + internal::getCoordinates( z ).reset_global_nnz_counter(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) ); +#endif + if( dense_descr ) { + pipeline.markMaybeSparseDenseDescriptorVerification( + &internal::getCoordinates( z ) ); + } + } + } + +#ifdef 
GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_masked_apply_generic< +#else + rc = internal::masked_apply_generic< +#endif + true, false, false, true, descr, typename Monoid::Operator, + OutputType, MaskType, InputType1, InputType2, Coords + >( + already_dense_mask, true, already_dense_input_y, + lower_bound, upper_bound, + local_z, local_mask, local_x, local_y, + z, mask, x_wrapper, y_wrapper, + op, + nullptr, &right_identity + ); + + if( !mask_is_dense ) { + internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_MASKED_EWISEAPPLY, + n, sizeof( OutputType ), dense_descr, dense_mask, + &z, nullptr, &internal::getCoordinates( z ), nullptr, + &y, &mask, nullptr, nullptr, + &internal::getCoordinates( y ), &internal::getCoordinates( mask ), + nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: eWiseApply(z, mask, alpha, y, " + << "monoid)" << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename OutputType, + typename MaskType, + typename InputType1, + typename InputType2, + typename Coords + > + RC eWiseApply( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &mask, + const Vector< InputType1, ascend, Coords > &x, + const InputType2 beta, + const Monoid &monoid = Monoid(), + const Phase &phase = EXECUTE, + const typename std::enable_if< !grb::is_object< OutputType >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_monoid< Monoid >::value, + void + >::type * const = nullptr + ) { + // static checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D1, InputType1 >::value ), "grb::eWiseApply", + "called with a left-hand input element type that does not match the " + "first domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D2, InputType2 >::value ), "grb::eWiseApply", + "called with a right-hand input element type that does not match the " + "second domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D3, OutputType >::value ), "grb::eWiseApply", + "called with an output element type that does not match the " + "third domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), "grb::eWiseApply", + "called with an output mask element type that is not Boolean " ); +#ifdef _DEBUG + std::cout << "In masked eWiseApply ([T1]<-[T2]<-T3, using monoid)\n"; +#endif + if( size( mask ) == 0 ) { + return eWiseApply< descr >( z, x, beta, monoid ); + } + + // other run-time checks + const size_t n = internal::getCoordinates( z ).size(); + if( internal::getCoordinates( x ).size() != n ) { + return MISMATCH; + } + if( internal::getCoordinates( mask ).size() != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + // check if we can dispatch to dense variant + if( (descr & descriptors::dense) ) { + return eWiseApply< descr >( z, mask, x, beta, monoid.getOperator() ); + } + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + constexpr const bool 
dense_mask = dense_descr && + (descr & descriptors::structural) && !(descr & descriptors::invert_mask); + + internal::Pipeline::stage_type func = [&z, &mask, &x, beta, &monoid] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage eWiseApply(z, mask, x, beta, monoid) " + << "in the range(" << lower_bound << ", " << upper_bound << ")" + << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_mask, local_x, local_y, local_z; + const size_t local_n = upper_bound - lower_bound; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + constexpr const bool already_dense_vectors = dense_descr; +#endif + + const bool mask_is_dense = (descr & descriptors::structural) && + !(descr & descriptors::invert_mask) && already_dense_vectors; + + bool already_dense_mask = true; + bool already_dense_input_x = true; + + if( !mask_is_dense ) { + local_z = internal::getCoordinates( z ).asyncSubset( lower_bound, + upper_bound ); + if( dense_descr && local_z.nonzeroes() < local_n ) { + return ILLEGAL; + } + } + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_mask = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( mask ) ); + if( !already_dense_mask ) { +#else + already_dense_mask = false; +#endif + local_mask = internal::getCoordinates( mask ).asyncSubset( lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input_x = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_input_x ) { +#else + already_dense_input_x = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + const internal::Wrapper< false, InputType1, Coords > x_wrapper( x ); + const internal::Wrapper< true, InputType2, Coords > y_wrapper( beta ); + + // we are in the masked sparse variant + const InputType1 left_identity = monoid.template getIdentity< InputType1 >(); + const auto &op = monoid.getOperator(); + + if( !mask_is_dense ) { + local_z.local_clear(); + if( lower_bound == 0 ) { + internal::getCoordinates( z ).reset_global_nnz_counter(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) ); +#endif + if( dense_descr ) { + pipeline.markMaybeSparseDenseDescriptorVerification( + &internal::getCoordinates( z ) ); + } + } + } + +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_masked_apply_generic< +#else + rc = internal::masked_apply_generic< +#endif + false, true, true, false, descr, typename Monoid::Operator, + OutputType, MaskType, InputType1, InputType2, Coords + >( + already_dense_mask, already_dense_input_x, true, + lower_bound, upper_bound, + local_z, local_mask, local_x, local_y, + z, mask, x_wrapper, y_wrapper, + op, + &left_identity + ); + + if( !mask_is_dense ) { + internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? 
ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_MASKED_EWISEAPPLY, + n, sizeof( OutputType ), dense_descr, dense_mask, + &z, nullptr, &internal::getCoordinates( z ), nullptr, + &x, &mask, nullptr, nullptr, + &internal::getCoordinates( x ), &internal::getCoordinates( mask ), + nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: eWiseApply(z, mask, x, beta, " + << "monoid)" << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class OP, + typename OutputType, + typename InputType1, + typename InputType2, + typename Coords + > + RC eWiseApply( + Vector< OutputType, ascend, Coords > &z, + const InputType1 alpha, + const Vector< InputType2, ascend, Coords > &y, + const OP &op = OP(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_operator< OP >::value, + void + >::type * const = nullptr + ) { + // static checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D1, InputType1 >::value ), "grb::eWiseApply", + "called with a left-hand input element type that does not match the " + "first domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D2, InputType2 >::value ), "grb::eWiseApply", + "called with a right-hand input element type that does not match the " + "second domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D3, OutputType >::value ), "grb::eWiseApply", + "called with an output element type that does not match the " + "third domain of the given operator" ); +#ifdef _DEBUG + std::cout << "In eWiseApply ([T1]<-T2<-[T3]), operator variant\n"; +#endif + // sanity check + const size_t n = internal::getCoordinates( z ).size(); + if( internal::getCoordinates( y ).size() != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + // check if we can dispatch + if( static_cast< const void * >( &z ) == + static_cast< const void * >( &y ) + ) { + return foldr< descr >( alpha, z, op ); + } + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = [&z, alpha, &y, &op] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage eWiseApply(z, alpha, y, operator) in " + << "the range(" << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + const Vector< bool, ascend, Coords > * const null_mask = nullptr; + const Coords * const local_null_mask = nullptr; + + Coords local_mask, local_x, local_y, local_z; + const size_t local_n = upper_bound - lower_bound; + size_t local_y_nz = local_n; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + + bool already_dense_input_y = true; + + if( !already_dense_vectors ) { + local_z = internal::getCoordinates( z ).asyncSubset( lower_bound, + upper_bound ); + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_input_y = pipeline.containsAlreadyDenseVector( + 
&internal::getCoordinates( y ) ); + if( !already_dense_input_y ) { +#else + already_dense_input_y = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); + local_y_nz = local_y.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + const internal::Wrapper< true, InputType1, Coords > x_wrapper( alpha ); + const internal::Wrapper< false, InputType2, Coords > y_wrapper( y ); + + if( !already_dense_vectors ) { + if( lower_bound == 0 ) { + internal::getCoordinates( z ).reset_global_nnz_counter(); + } + } + + // check for dense variant + if( (descr & descriptors::dense) || local_y_nz == local_n ) { + if( !already_dense_vectors ) { + local_z.local_assignAll( ); + } + +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_dense_apply_generic< +#else + rc = internal::dense_apply_generic< +#endif + true, false, false, false, descr, OP, + OutputType, InputType1, InputType2, Coords + >( + true, already_dense_input_y, + lower_bound, upper_bound, + local_x, local_y, z, + x_wrapper, y_wrapper, + op + ); + } else { + if( !already_dense_vectors ) { + local_z.local_clear(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) ); +#endif + } + + // we are in the sparse variant +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_sparse_apply_generic< + false, false, true, false, descr, OP, +#else + rc = internal::sparse_apply_generic< + false, false, true, false, descr, OP, +#endif + OutputType, bool, InputType1, InputType2, Coords + >( + true, true, already_dense_input_y, + lower_bound, upper_bound, + local_z, local_null_mask, local_x, local_y, + z, null_mask, x_wrapper, y_wrapper, + op + ); + } + + if( !already_dense_vectors ) { + internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? 
ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_EWISEAPPLY, + n, sizeof( OutputType ), dense_descr, true, + &z, nullptr, &internal::getCoordinates( z ), nullptr, + &y, nullptr, nullptr, nullptr, + &internal::getCoordinates( y ), nullptr, nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: eWiseApply(z, alpha, y, " + << "operator)" << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class OP, + typename OutputType, + typename MaskType, + typename InputType1, + typename InputType2, + typename Coords + > + RC eWiseApply( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &mask, + const InputType1 alpha, + const Vector< InputType2, ascend, Coords > &y, + const OP &op = OP(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_operator< OP >::value, + void + >::type * const = nullptr + ) { + // static checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D1, InputType1 >::value ), "grb::eWiseApply", + "called with a left-hand input element type that does not match the " + "first domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D2, InputType2 >::value ), "grb::eWiseApply", + "called with a right-hand input element type that does not match the " + "second domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D3, OutputType >::value ), "grb::eWiseApply", + "called with an output element type that does not match the " + "third domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), "grb::eWiseApply", + "called with an output mask element type that is not Boolean " ); +#ifdef _DEBUG + std::cout << "In masked eWiseApply ([T1]<-T2<-[T3], operator variant)\n"; +#endif + // check for empty mask + if( size( mask ) == 0 ) { + return eWiseApply< descr >( z, alpha, y, op ); + } + + // sanity check + const size_t n = internal::getCoordinates( z ).size(); + if( internal::getCoordinates( y ).size() != n ) { + return MISMATCH; + } + if( internal::getCoordinates( mask ).size() != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + constexpr const bool dense_mask = dense_descr && + (descr & descriptors::structural) && !(descr & descriptors::invert_mask); + + internal::Pipeline::stage_type func = [&z, &mask, alpha, &y, &op] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage eWiseApply(z, mask, alpha, y, " + << "operator) in the range(" << lower_bound << ", " << upper_bound << ")" + << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_mask, local_x, local_y, local_z; + const size_t local_n = upper_bound - lower_bound; + size_t local_mask_nz = local_n; + size_t local_y_nz = local_n; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + constexpr 
const bool already_dense_vectors = dense_descr; +#endif + + const bool mask_is_dense = (descr & descriptors::structural) && + !(descr & descriptors::invert_mask) && already_dense_vectors; + + bool already_dense_mask = true; + bool already_dense_input_y = true; + + if( !mask_is_dense ) { + local_z = internal::getCoordinates( z ).asyncSubset( lower_bound, + upper_bound ); + if( dense_descr && local_z.nonzeroes() < local_n ) { + return ILLEGAL; + } + } + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_mask = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( mask ) ); + if( !already_dense_mask ) { +#else + already_dense_mask = false; +#endif + local_mask = internal::getCoordinates( mask ).asyncSubset( lower_bound, + upper_bound ); + local_mask_nz = local_mask.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input_y = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !already_dense_input_y ) { +#else + already_dense_input_y = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); + local_y_nz = local_y.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + const internal::Wrapper< true, InputType1, Coords > x_wrapper( alpha ); + const internal::Wrapper< false, InputType2, Coords > y_wrapper( y ); + + if( !mask_is_dense ) { + local_z.local_clear(); + if( lower_bound == 0 ) { + internal::getCoordinates( z ).reset_global_nnz_counter(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) ); +#endif + if( dense_descr ) { + pipeline.markMaybeSparseDenseDescriptorVerification( + &internal::getCoordinates( z ) ); + } + } + } + + if( (descr & descriptors::dense) || + (local_y_nz == local_n) || + local_mask_nz <= local_y_nz + ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_masked_apply_generic< +#else + rc = internal::masked_apply_generic< +#endif + true, false, false, false, descr, OP, + OutputType, bool, InputType1, InputType2, Coords + >( + already_dense_mask, true, already_dense_input_y, + lower_bound, upper_bound, + local_z, local_mask, local_x, local_y, + z, mask, x_wrapper, y_wrapper, + op + ); + } else { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_sparse_apply_generic< +#else + rc = internal::sparse_apply_generic< +#endif + true, false, true, false, descr, OP, + OutputType, bool, InputType1, InputType2, Coords + >( + already_dense_mask, true, already_dense_input_y, + lower_bound, upper_bound, + local_z, &local_mask, local_x, local_y, + z, &mask, x_wrapper, y_wrapper, + op + ); + } + + if( !mask_is_dense ) { + internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? 
ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_MASKED_EWISEAPPLY, + n, sizeof( OutputType ), dense_descr, dense_mask, + &z, nullptr, &internal::getCoordinates( z ), nullptr, + &y, &mask, nullptr, nullptr, + &internal::getCoordinates( y ), &internal::getCoordinates( mask ), + nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: eWiseApply(z, mask, alpha, y, " + << "operator)" << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class OP, + typename OutputType, + typename InputType1, + typename InputType2, + typename Coords + > + RC eWiseApply( + Vector< OutputType, ascend, Coords > &z, + const Vector< InputType1, ascend, Coords > &x, + const Vector< InputType2, ascend, Coords > &y, + const OP &op = OP(), + const Phase &phase = EXECUTE, + const typename std::enable_if< !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_operator< OP >::value, + void + >::type * const = nullptr + ) { + // static checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D1, InputType1 >::value ), "grb::eWiseApply", + "called with a left-hand input element type that does not match the " + "first domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D2, InputType2 >::value ), "grb::eWiseApply", + "called with a right-hand input element type that does not match the " + "second domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D3, OutputType >::value ), "grb::eWiseApply", + "called with an output element type that does not match the " + "third domain of the given operator" ); +#ifdef _DEBUG + std::cout << "In eWiseApply ([T1]<-[T2]<-[T3]), operator variant\n"; +#endif + // sanity check + const size_t n = internal::getCoordinates( z ).size(); + if( internal::getCoordinates( x ).size() != n || + internal::getCoordinates( y ).size() != n + ) { +#ifdef _DEBUG + std::cerr << "\tinput vectors mismatch in dimensions!\n"; +#endif + return MISMATCH; + } + + // check for possible shortcuts + // trivial dispatch + if( n == 0 ) { + return SUCCESS; + } + + // check for possible shortcuts, after dynamic checks + if( getID( x ) == getID( y ) && is_idempotent< OP >::value ) { + return set< descr >( z, x, phase ); + } + if( getID( x ) == getID( z ) ) { + return foldl< descr >( z, y, op, phase ); + } + if( getID( y ) == getID( z ) ) { + return foldr< descr >( x, z, op, phase ); + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = [&z, &x, &y, &op] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage eWiseApply(z, x, y, operator) in the " + << "range(" << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + const Vector< bool, ascend, Coords > * const null_mask = nullptr; + const Coords * const local_null_mask = nullptr; + + Coords local_x, local_y, local_z; + const size_t local_n = upper_bound - lower_bound; + size_t local_x_nz = local_n; + size_t local_y_nz = local_n; + bool sparse = false; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION 
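+			// when all vectors handled by this pipeline are already dense, the chunk
+			// can take the dense code path without reading any local coordinates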
+ const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + + bool already_dense_input_x = true; + bool already_dense_input_y = true; + + if( !already_dense_vectors ) { + local_z = internal::getCoordinates( z ).asyncSubset( lower_bound, + upper_bound ); + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_input_x = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_input_x ) { +#else + already_dense_input_x = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); + if( local_x_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input_y = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !already_dense_input_y ) { +#else + already_dense_input_y = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); + local_y_nz = local_y.nonzeroes(); + if( local_y_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( !already_dense_vectors ) { + if( lower_bound == 0 ) { + internal::getCoordinates( z ).reset_global_nnz_counter(); + } + } + + if( sparse ) { + if( !already_dense_vectors ) { + local_z.local_clear(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) ); +#endif + } + + const internal::Wrapper< false, InputType1, Coords > x_wrapper( x ); + const internal::Wrapper< false, InputType2, Coords > y_wrapper( y ); + +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_sparse_apply_generic< +#else + rc = internal::sparse_apply_generic< +#endif + false, false, false, false, descr | descriptors::dense, OP, + OutputType, bool, InputType1, InputType2, Coords + >( + true, already_dense_input_x, already_dense_input_y, + lower_bound, upper_bound, + local_z, local_null_mask, local_x, local_y, + z, null_mask, x_wrapper, y_wrapper, + op + ); + } else { + if( !already_dense_vectors ) { + local_z.local_assignAll( ); + } + + if( upper_bound > lower_bound ) { + const InputType1 * __restrict__ a = internal::getRaw( x ); + const InputType2 * __restrict__ b = internal::getRaw( y ); + OutputType * __restrict__ c = internal::getRaw( z ); + + // this function is vectorised + op.eWiseApply( a + lower_bound, b + lower_bound, c + lower_bound, local_n); + } + } + + if( !already_dense_vectors ) { + internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? 
ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_EWISEAPPLY, + n, sizeof( OutputType ), dense_descr, true, + getID( z ), + &z, nullptr, &internal::getCoordinates( z ), nullptr, + getID( x ), getID( y ), SIZE_MAX, SIZE_MAX, + &x, &y, nullptr, nullptr, + &internal::getCoordinates( x ), &internal::getCoordinates( y ), + nullptr, nullptr, + SIZE_MAX, nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: eWiseApply(z, x, y, operator)" + << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class OP, + typename OutputType, typename MaskType, + typename InputType1, typename InputType2, + typename Coords + > + RC eWiseApply( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &mask, + const Vector< InputType1, ascend, Coords > &x, + const Vector< InputType2, ascend, Coords > &y, + const OP &op = OP(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_operator< OP >::value, + void + >::type * const = nullptr + ) { + // static checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D1, InputType1 >::value ), "grb::eWiseApply", + "called with a left-hand input element type that does not match the " + "first domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D2, InputType2 >::value ), "grb::eWiseApply", + "called with a right-hand input element type that does not match the " + "second domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D3, OutputType >::value ), "grb::eWiseApply", + "called with an output element type that does not match the " + "third domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), "grb::eWiseApply", + "called with an output mask element type that is not Boolean " ); +#ifdef _DEBUG + std::cout << "In masked eWiseApply ([T1]<-[T2]<-[T3], using operator)\n"; +#endif + // check for empty mask + if( size( mask ) == 0 ) { + return eWiseApply< descr >( z, x, y, op, phase ); + } + + // other run-time checks + const size_t n = internal::getCoordinates( z ).size(); + if( internal::getCoordinates( x ).size() != n ) { + return MISMATCH; + } + if( internal::getCoordinates( y ).size() != n ) { + return MISMATCH; + } + if( internal::getCoordinates( mask ).size() != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + constexpr const bool dense_mask = dense_descr && + (descr & descriptors::structural) && !(descr & descriptors::invert_mask); + + internal::Pipeline::stage_type func = [&z, &mask, &x, &y, &op] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage eWiseApply(z, mask, x, y, operator) in " + << "the range(" << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_mask, local_x, local_y, local_z; + const size_t local_n = upper_bound - lower_bound; + size_t local_mask_nz = local_n; + 
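+			// these defaults correspond to fully dense chunks; the counts are
+			// tightened below once the local coordinates have been read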
size_t local_x_nz = local_n; + size_t local_y_nz = local_n; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + constexpr const bool already_dense_vectors = dense_descr; +#endif + + const bool mask_is_dense = (descr & descriptors::structural) && + !(descr & descriptors::invert_mask) && already_dense_vectors; + + bool already_dense_mask = true; + bool already_dense_input_x = true; + bool already_dense_input_y = true; + + if( !mask_is_dense ) { + local_z = internal::getCoordinates( z ).asyncSubset( lower_bound, + upper_bound ); + if( dense_descr && local_z.nonzeroes() < local_n ) { + return ILLEGAL; + } + } + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_mask = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( mask ) ); + if( !already_dense_mask ) { +#else + already_dense_mask = false; +#endif + local_mask = internal::getCoordinates( mask ).asyncSubset( lower_bound, + upper_bound ); + local_mask_nz = local_mask.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input_x = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_input_x ) { +#else + already_dense_input_x = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input_y = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !already_dense_input_y ) { +#else + already_dense_input_y = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); + local_y_nz = local_y.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + const internal::Wrapper< false, InputType1, Coords > x_wrapper( x ); + const internal::Wrapper< false, InputType2, Coords > y_wrapper( y ); + + const size_t sparse_loop = std::min( local_x_nz, local_y_nz ); + + if( !mask_is_dense ) { + local_z.local_clear(); + if( lower_bound == 0 ) { + internal::getCoordinates( z ).reset_global_nnz_counter(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) ); +#endif + if( dense_descr ) { + pipeline.markMaybeSparseDenseDescriptorVerification( + &internal::getCoordinates( z ) ); + } + } + } + + if( (descr & descriptors::dense) || + (local_x_nz == local_n && local_y_nz == local_n) || + ( !(descr & descriptors::invert_mask) && sparse_loop >= local_mask_nz ) + ) { + // use loop over mask +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_masked_apply_generic< +#else + rc = internal::masked_apply_generic< +#endif + false, false, false, false, descr, OP, + OutputType, bool, InputType1, InputType2, Coords + >( + already_dense_mask, already_dense_input_x, already_dense_input_y, + lower_bound, upper_bound, + local_z, local_mask, local_x, local_y, + z, mask, x_wrapper, y_wrapper, + op + ); + + } else { + // use loop over sparse inputs +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_sparse_apply_generic< +#else + rc = internal::sparse_apply_generic< +#endif + true, false, false, false, descr, OP, + OutputType, bool, InputType1, InputType2, Coords + >( + already_dense_mask, already_dense_input_x, already_dense_input_y, + lower_bound, upper_bound, + local_z, &local_mask, local_x, local_y, + z, &mask, x_wrapper, y_wrapper, + op + ); + } + + if( !mask_is_dense ) { + 
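+ // write the locally updated sparsity pattern of z for this chunk back to
+ // the global coordinate structure; this step is skipped when the mask
+ // already guarantees a dense output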
internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_MASKED_EWISEAPPLY, + n, sizeof( OutputType ), dense_descr, dense_mask, + &z, nullptr, &internal::getCoordinates( z ), nullptr, + &x, &y, &mask, nullptr, + &internal::getCoordinates( x ), &internal::getCoordinates( y ), + &internal::getCoordinates( mask ), nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: eWiseApply(z, mask, x, y, " + << "operator)" << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename OutputType, + typename InputType1, + typename InputType2, + typename Coords + > + RC eWiseAdd( + Vector< OutputType, ascend, Coords > &z, + const Vector< InputType1, ascend, Coords > &x, + const Vector< InputType2, ascend, Coords > &y, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_semiring< Ring >::value, + void + >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), "grb::eWiseAdd", + "called with an output vector with element type that does not match the " + "fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D3, InputType1 >::value ), "grb::eWiseAdd", + "called with a left-hand side input vector with element type that does not " + "match the third domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), "grb::eWiseAdd", + "called with a right-hand side input vector with element type that does " + "not match the fourth domain of the given semiring" ); +#ifdef _DEBUG + std::cout << "eWiseAdd (ascend, vector <- vector + vector) dispatches to " + << "two folds using the additive monoid\n"; +#endif + RC ret = foldl< descr >( z, x, ring.getAdditiveMonoid(), phase ); + ret = ret ? 
ret : foldl< descr >( z, y, ring.getAdditiveMonoid(), phase ); + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename OutputType, + typename Coords + > + RC eWiseAdd( + Vector< OutputType, ascend, Coords > &z, + const InputType1 alpha, + const Vector< InputType2, ascend, Coords > &y, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_semiring< Ring >::value, + void + >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), "grb::eWiseAdd", + "called with a left-hand side input vector with element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), "grb::eWiseAdd", + "called with a right-hand side input vector with element type that does " + "not match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D3, OutputType >::value ), "grb::eWiseAdd", + "called with an output vector with element type that does not match the " + "third domain of the given semiring" ); +#ifdef _DEBUG + std::cout << "eWiseAdd (ascend, vector <- scalar + vector) dispatches to " + << "two folds with the additive monoid\n"; +#endif + RC ret = foldl< descr >( z, alpha, ring.getAdditiveMonoid(), phase ); + ret = ret ? ret : foldl< descr >( z, y, ring.getAdditiveMonoid(), phase ); + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename OutputType, + typename Coords + > + RC eWiseAdd( + Vector< OutputType, ascend, Coords > &z, + const Vector< InputType1, ascend, Coords > &x, + const InputType2 beta, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_semiring< Ring >::value, + void + >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), "grb::eWiseAdd", + "called with a left-hand side input vector with element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), "grb::eWiseAdd", + "called with a right-hand side input vector with element type that does " + "not match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D3, OutputType >::value ), "grb::eWiseAdd", + "called with an output vector with element type that does not match the " + "third domain of the given semiring" ); +#ifdef _DEBUG + std::cout << "eWiseAdd (ascend, vector <- vector + scalar) dispatches to " + << "two folds with the additive monoid\n"; +#endif + RC ret = foldl< descr >( z, x, ring.getAdditiveMonoid(), phase ); + ret = ret ? 
ret : foldl< descr >( z, beta, ring.getAdditiveMonoid(), phase ); + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename OutputType, + typename Coords + > + RC eWiseAdd( + Vector< OutputType, ascend, Coords > &z, + const InputType1 alpha, + const InputType2 beta, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_semiring< Ring >::value, + void + >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), "grb::eWiseAdd", + "called with a left-hand side input vector with element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), "grb::eWiseAdd", + "called with a right-hand side input vector with element type that does " + "not match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D3, OutputType >::value ), "grb::eWiseAdd", + "called with an output vector with element type that does not match the " + "third domain of the given semiring" ); +#ifdef _DEBUG + std::cout << "eWiseAdd (ascend, vector <- scalar + scalar) dispatches to " + << "foldl with precomputed scalar and additive monoid\n"; +#endif + const typename Ring::D4 add; + (void) apply( add, alpha, beta, ring.getAdditiveOperator() ); + return foldl< descr >( z, add, ring.getAdditiveMonoid(), phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename OutputType, + typename MaskType, + typename InputType1, + typename InputType2, + typename Coords + > + RC eWiseAdd( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &m, + const Vector< InputType1, ascend, Coords > &x, + const Vector< InputType2, ascend, Coords > &y, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_semiring< Ring >::value, + void + >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), "grb::eWiseAdd", + "called with an output vector with element type that does not match the " + "fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D3, InputType1 >::value ), "grb::eWiseAdd", + "called with a left-hand side input vector with element type that does not " + "match the third domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), "grb::eWiseAdd", + "called with a right-hand side input vector with element type that does " + "not match the fourth domain of the given semiring" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< MaskType, bool >::value ), + "grb::eWiseAdd (vector <- vector + vector, masked)", + "called with non-bool mask element types" ); +#ifdef 
_DEBUG + std::cout << "eWiseAdd (ascend, vector <- vector + vector, masked) " + << "dispatches to two folds using the additive monoid\n"; +#endif + RC ret = foldl< descr >( z, m, x, ring.getAdditiveMonoid(), phase ); + ret = ret ? ret : foldl< descr >( z, m, y, ring.getAdditiveMonoid(), phase ); + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename OutputType, + typename MaskType, + typename Coords + > + RC eWiseAdd( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &m, + const InputType1 alpha, + const Vector< InputType2, ascend, Coords > &y, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_semiring< Ring >::value, + void + >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), "grb::eWiseAdd", + "called with a left-hand side input vector with element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), "grb::eWiseAdd", + "called with a right-hand side input vector with element type that does " + "not match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D3, OutputType >::value ), "grb::eWiseAdd", + "called with an output vector with element type that does not match the " + "third domain of the given semiring" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< MaskType, bool >::value ), + "grb::eWiseAdd (vector <- scalar + vector, masked)", + "called with non-bool mask element types" ); +#ifdef _DEBUG + std::cout << "eWiseAdd (ascend, vector <- scalar + vector, masked) " + << "dispatches to two folds using the additive monoid\n"; +#endif + RC ret = foldl< descr >( z, m, alpha, ring.getAdditiveMonoid(), phase ); + ret = ret ? 
ret : foldl< descr >( z, m, y, ring.getAdditiveMonoid(), phase ); + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename OutputType, + typename MaskType, + typename Coords + > + RC eWiseAdd( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &m, + const Vector< InputType1, ascend, Coords > &x, + const InputType2 beta, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_semiring< Ring >::value, + void + >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), "grb::eWiseAdd", + "called with a left-hand side input vector with element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), "grb::eWiseAdd", + "called with a right-hand side input vector with element type that does " + "not match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D3, OutputType >::value ), "grb::eWiseAdd", + "called with an output vector with element type that does not match the " + "third domain of the given semiring" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< MaskType, bool >::value ), + "grb::eWiseAdd (vector <- vector + scalar, masked)", + "called with non-bool mask element types" ); +#ifdef _DEBUG + std::cout << "eWiseAdd (ascend, vector <- vector + scalar, masked) " + << "dispatches to eWiseApply using the additive monoid\n"; +#endif + RC ret = foldl< descr >( z, m, x, ring.getAdditiveMonoid(), phase ); + ret = ret ? 
ret : foldl< descr >( z, m, beta, ring.getAdditiveMonoid(), + phase ); + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename OutputType, + typename MaskType, + typename Coords + > + RC eWiseAdd( + Vector< OutputType, ascend, Coords > &z, + const Vector< OutputType, ascend, Coords > &m, + const InputType1 alpha, + const InputType2 beta, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_semiring< Ring >::value, + void + >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), "grb::eWiseAdd", + "called with a left-hand side input vector with element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), "grb::eWiseAdd", + "called with a right-hand side input vector with element type that does " + "not match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D3, OutputType >::value ), "grb::eWiseAdd", + "called with an output vector with element type that does not match the " + "third domain of the given semiring" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< MaskType, bool >::value ), + "grb::eWiseAdd (vector <- scalar + scalar, masked)", + "called with non-bool mask element types" ); +#ifdef _DEBUG + std::cout << "eWiseAdd (ascend, vector <- scalar + scalar, masked) " + << "dispatches to foldl with precomputed scalar and additive monoid\n"; +#endif + const typename Ring::D4 add; + (void) apply( add, alpha, beta, ring.getAdditiveOperator() ); + return foldl< descr >( z, m, add, ring.getAdditiveMonoid(), phase ); + } + + // declare an internal version of eWiseMulAdd containing the full sparse & + // dense implementations + namespace internal { + + template< + Descriptor descr, + bool a_scalar, + bool x_scalar, + bool y_scalar, + bool y_zero, +#ifdef GRB_BOOLEAN_DISPATCHER + bool already_dense_output, + bool already_dense_mask, + bool already_dense_input_a, + bool already_dense_input_x, + bool already_dense_input_y, +#endif + typename OutputType, + typename MaskType, + typename InputType1, + typename InputType2, + typename InputType3, + typename Coords, + class Ring + > + RC sparse_eWiseMulAdd_maskDriven( +#ifndef GRB_BOOLEAN_DISPATCHER + bool already_dense_output, + bool already_dense_mask, + bool already_dense_input_a, + bool already_dense_input_x, + bool already_dense_input_y, +#endif + const size_t lower_bound, + const size_t upper_bound, + Coords &local_z, + const Coords &local_m, + const Coords &local_a, + const Coords &local_x, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &z_vector, + const Vector< MaskType, ascend, Coords > &m_vector, + const internal::Wrapper< a_scalar, InputType1, Coords > &a_wrapper, + const internal::Wrapper< x_scalar, InputType2, Coords > &x_wrapper, + const internal::Wrapper< y_scalar, InputType3, Coords > &y_wrapper, + const Ring &ring + ) { + static_assert( !(descr & descriptors::invert_mask), + "Cannot loop over mask nonzeroes if invert_mask is given. 
" + "Please submit a bug report" ); + static_assert( !a_scalar || !x_scalar, + "If both a and x are scalars, this is operation is a simple eWiseApply " + "with the additive operator if the semiring." ); + static_assert( !y_zero || y_scalar, + "If y_zero is given, then y_scalar must be given also." ); + + OutputType * __restrict__ z = internal::getRaw( z_vector ); + const MaskType * __restrict__ const m = internal::getRaw( m_vector ); + + // create local copies of the input const pointers + const InputType1 * __restrict__ const a = a_wrapper.getRaw(); + const InputType2 * __restrict__ const x = x_wrapper.getRaw(); + const InputType3 * __restrict__ const y = y_wrapper.getRaw(); + + const size_t local_n = upper_bound - lower_bound; + const size_t local_m_nz = already_dense_mask ? local_n : local_m.nonzeroes(); + + const size_t local_start = 0; + const size_t local_end = local_m_nz; + + size_t k = local_start; + + // scalar coda and parallel main body + for( ; k < local_end; ++k ) { + const size_t index = ( already_dense_mask ? k : local_m.index( k ) ) + + lower_bound; + assert( index - lower_bound < local_n ); + if( already_dense_mask ) { + if( !internal::getCoordinates( m_vector ).template mask< descr >( + index, m ) + ) { + continue; + } + } else { + if( !local_m.template mask< descr >( index - lower_bound, m + + lower_bound ) + ) { + continue; + } + } + typename Ring::D3 t = ring.template getZero< typename Ring::D3 >(); + if( + ( + a_scalar || already_dense_input_a || + local_a.assigned( index - lower_bound ) + ) && ( + x_scalar || already_dense_input_x || + local_x.assigned( index - lower_bound) + ) + ) { + const InputType1 a_p = ( a_scalar ) + ? a_wrapper.getValue() + : *( a + index ); + const InputType2 x_p = ( x_scalar ) + ? x_wrapper.getValue() + : *( x + index ); + (void) apply( t, a_p, x_p, ring.getMultiplicativeOperator() ); + if( !y_zero && ( + y_scalar || already_dense_input_y || + local_y.assigned( index - lower_bound ) ) + ) { + const InputType3 y_p = ( y_scalar ) + ? 
y_wrapper.getValue() + : *( y + index ); + typename Ring::D4 b; + (void) apply( b, t, y_p, ring.getAdditiveOperator() ); + if( already_dense_output || local_z.assigned( index - lower_bound ) ) { + typename Ring::D4 out = static_cast< typename Ring::D4 >( z[ index ] ); + (void) foldr( b, out, ring.getAdditiveOperator() ); + z[ index ] = static_cast< OutputType >( out ); + } else { + (void) local_z.assign( index - lower_bound ); + z[ index ] = static_cast< OutputType >( b ); + } + } else if( already_dense_output || + local_z.assigned( index - lower_bound ) + ) { + typename Ring::D4 out = static_cast< typename Ring::D4 >( z[ index ] ); + (void) foldr( t, out, ring.getAdditiveOperator() ); + z[ index ] = static_cast< OutputType >( out ); + } else { + (void) local_z.assign( index - lower_bound ); + z[ index ] = static_cast< OutputType >( t ); + } + } else if( !y_zero && ( + already_dense_input_y || y_scalar || + local_y.assigned( index - lower_bound ) ) + ) { + if( already_dense_output || local_z.assigned( index - lower_bound ) ) { + typename Ring::D4 out = static_cast< typename Ring::D4 >( z[ index ] ); + (void) foldr( y[ index ], out, ring.getAdditiveOperator() ); + z[ index ] = static_cast< OutputType >( out ); + } else { + (void)local_z.assign( index - lower_bound ); + z[ index ] = static_cast< OutputType >( t ); + } + } + } + + assert( false ); + return UNSUPPORTED; + } + + template< + Descriptor descr, + bool masked, + bool x_scalar, + bool y_scalar, + bool y_zero, + bool mulSwitched, +#ifdef GRB_BOOLEAN_DISPATCHER + bool already_dense_output, + bool already_dense_mask, + bool already_dense_input_a, + bool already_dense_input_x, + bool already_dense_input_y, +#endif + typename OutputType, + typename MaskType, + typename InputType1, + typename InputType2, + typename InputType3, + typename Coords, + class Ring + > + RC twoPhase_sparse_eWiseMulAdd_mulDriven( +#ifndef GRB_BOOLEAN_DISPATCHER + bool already_dense_output, + bool already_dense_mask, + bool already_dense_input_a, + bool already_dense_input_x, + bool already_dense_input_y, +#endif + const size_t lower_bound, + const size_t upper_bound, + Coords &local_z, + const Coords * const local_m, + const Coords &local_a, + const Coords &local_x, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &z_vector, + const Vector< MaskType, ascend, Coords > * const m_vector, + const Vector< InputType1, ascend, Coords > &a_vector, + const internal::Wrapper< x_scalar, InputType2, Coords > &x_wrapper, + const internal::Wrapper< y_scalar, InputType3, Coords > &y_wrapper, + const Ring &ring + ) { + OutputType * __restrict__ z = internal::getRaw( z_vector ); + const MaskType * __restrict__ const m = masked + ? internal::getRaw( *m_vector ) + : nullptr; + const InputType1 * __restrict__ const a = internal::getRaw( a_vector ); + + // create local copies of the input const pointers + const InputType2 * __restrict__ const x = x_wrapper.getRaw(); + + const size_t local_n = upper_bound - lower_bound; + const size_t local_a_nz = already_dense_input_a + ? local_n + : local_a.nonzeroes(); + + for( size_t i = 0; i < local_a_nz; ++i ) { + const size_t index = ( already_dense_input_a ? 
i : local_a.index( i ) ) + + lower_bound; + if( masked ) { + if( already_dense_mask ) { + if( !internal::getCoordinates( *m_vector ).template mask< descr >( + index, m ) + ) { + continue; + } + } else { + if( !local_m->template mask< descr >( index - lower_bound, + m + lower_bound ) + ) { + continue; + } + } + } + + if( x_scalar || already_dense_input_x || + local_x.assigned( index - lower_bound ) + ) { + typename Ring::D3 t; + const InputType1 a_p = *( a + index ); + const InputType2 x_p = ( x_scalar ) + ? x_wrapper.getValue() + : *( x + index ); + + if( mulSwitched ) { + (void) apply( t, x_p, a_p, ring.getMultiplicativeOperator() ); + } else { + (void) apply( t, a_p, x_p, ring.getMultiplicativeOperator() ); + } + + if( already_dense_output || local_z.assign( index - lower_bound ) ) { + typename Ring::D4 b = static_cast< typename Ring::D4 >( z[ index ] ); + (void) foldr( t, b, ring.getAdditiveOperator() ); + z[ index ] = static_cast< OutputType >( b ); + } else { + z[ index ] = static_cast< OutputType >( + static_cast< typename Ring::D4 >( t ) + ); + } + } + } + + RC rc = SUCCESS; + + // now handle addition + if( !y_zero ) { + // now handle addition + if( masked ) { + if( y_scalar ) { + rc = internal::fold_from_scalar_to_vector_generic< +#ifdef GRB_BOOLEAN_DISPATCHER + descr, true, true, true, true, + already_dense_output, already_dense_mask +#else + descr, true, true, true, true +#endif + >( +#ifndef GRB_BOOLEAN_DISPATCHER + already_dense_output, already_dense_mask, +#endif + lower_bound, upper_bound, local_z, local_m, + z_vector, m_vector, y_wrapper.getValue(), + ring.getAdditiveMonoid().getOperator(), EXECUTE + ); + } else { + rc = fold_from_vector_to_vector_generic< +#ifdef GRB_BOOLEAN_DISPATCHER + descr, true, true, true, true, + already_dense_output, already_dense_input_y, already_dense_mask +#else + descr, true, true, true, true +#endif + >( +#ifndef GRB_BOOLEAN_DISPATCHER + already_dense_output, already_dense_input_y, already_dense_mask, +#endif + lower_bound, upper_bound, + local_z, local_m, local_y, + z_vector, m_vector, *( y_wrapper.getPointer() ), + ring.getAdditiveMonoid().getOperator(), EXECUTE + ); + } + } else { + if( y_scalar ) { + rc = fold_from_scalar_to_vector_generic< +#ifdef GRB_BOOLEAN_DISPATCHER + descr, true, true, false, true, + already_dense_output, already_dense_mask +#else + descr, true, true, false, true +#endif + >( +#ifndef GRB_BOOLEAN_DISPATCHER + already_dense_output, already_dense_mask, +#endif + lower_bound, upper_bound, + local_z, local_m, + z_vector, m_vector, y_wrapper.getValue(), + ring.getAdditiveMonoid().getOperator(), EXECUTE + ); + } else { + rc = fold_from_vector_to_vector_generic< +#ifdef GRB_BOOLEAN_DISPATCHER + descr, true, true, false, true, + already_dense_output, already_dense_input_y, already_dense_mask +#else + descr, true, true, false, true +#endif + >( +#ifndef GRB_BOOLEAN_DISPATCHER + already_dense_output, already_dense_input_y, already_dense_mask, +#endif + lower_bound, upper_bound, + local_z, local_m, local_y, + z_vector, m_vector, *( y_wrapper.getPointer() ), + ring.getAdditiveMonoid().getOperator(), EXECUTE + ); + } + } + } + + // done + return rc; + } + + template< + Descriptor descr, + bool a_scalar, + bool x_scalar, + bool y_scalar, + bool y_zero, + bool assign_z, + typename OutputType, + typename InputType1, + typename InputType2, + typename InputType3, + typename Coords, + class Ring + > + RC dense_eWiseMulAdd( + const size_t lower_bound, + const size_t upper_bound, + Vector< OutputType, ascend, Coords > &z_vector, + const 
internal::Wrapper< a_scalar, InputType1, Coords > &a_wrapper, + const internal::Wrapper< x_scalar, InputType2, Coords > &x_wrapper, + const internal::Wrapper< y_scalar, InputType3, Coords > &y_wrapper, + const Ring &ring = Ring() + ) { +#ifdef _DEBUG + std::cout << "\tdense_eWiseMulAdd: loop size will be " + << (upper_bound - lower_bound) << " in the range(" << lower_bound << ", " + << upper_bound << ")\n"; +#endif + const size_t start = lower_bound; + const size_t end = upper_bound; + + OutputType * __restrict__ z = internal::getRaw( z_vector ); + + // create local copies of the input const pointers + const InputType1 * __restrict__ a = a_wrapper.getRaw(); + const InputType2 * __restrict__ x = x_wrapper.getRaw(); + const InputType3 * __restrict__ y = y_wrapper.getRaw(); + + assert( z != a ); + assert( z != x ); + assert( z != y ); + assert( a != x || a == nullptr ); + assert( a != y || a == nullptr ); + assert( x != y || x == nullptr ); + + // vector registers + typename Ring::D1 aa[ Ring::blocksize ]; + typename Ring::D2 xx[ Ring::blocksize ]; + typename Ring::D3 tt[ Ring::blocksize ]; + typename Ring::D4 bb[ Ring::blocksize ]; + typename Ring::D4 yy[ Ring::blocksize ]; + typename Ring::D4 zz[ Ring::blocksize ]; + + if( a_scalar ) { + for( size_t b = 0; b < Ring::blocksize; ++b ) { + aa[ b ] = a_wrapper.getValue(); + } + } + if( x_scalar ) { + for( size_t b = 0; b < Ring::blocksize; ++b ) { + xx[ b ] = x_wrapper.getValue(); + } + } + if( y_scalar ) { + if( y_zero ) { + for( size_t b = 0; b < Ring::blocksize; ++b ) { + yy[ b ] = ring.template getZero< typename Ring::D4 >(); + } + } else { + for( size_t b = 0; b < Ring::blocksize; ++b ) { + yy[ b ] = y_wrapper.getValue(); + } + } + } + + // do vectorised out-of-place operations. Allows for aligned overlap. + // Non-aligned ovelap is not possible due to GraphBLAS semantics. 
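+ // (The loop below processes Ring::blocksize elements per iteration via the
+ // aa/xx/yy/zz/tt/bb buffers: multiply with the semiring's multiplicative
+ // operator, add y with the additive operator, then either assign to z or
+ // fold into its current contents. A scalar tail loop handles the remainder.)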
+ size_t i = start; + // note: read the tail code (under this while loop) comments first for + // greater understanding + while( i + Ring::blocksize <= end ) { +#ifdef _DEBUG + std::cout << "\tdense_eWiseMulAdd: handling block of size " + << Ring::blocksize << " starting at index " << i << "\n"; +#endif + // read-in + if( !a_scalar ) { + for( size_t b = 0; b < Ring::blocksize; ++b ) { + aa[ b ] = static_cast< typename Ring::D2 >( a[ i + b ] ); + } + } + if( !x_scalar ) { + for( size_t b = 0; b < Ring::blocksize; ++b ) { + xx[ b ] = static_cast< typename Ring::D2 >( x[ i + b ] ); + } + } + if( !y_scalar ) { + for( size_t b = 0; b < Ring::blocksize; ++b ) { + yy[ b ] = static_cast< typename Ring::D4 >( y[ i + b ] ); + } + } + if( !assign_z ) { + for( size_t b = 0; b < Ring::blocksize; ++b ) { + zz[ b ] = static_cast< typename Ring::D4 >( z[ i + b ] ); + } + } + + // operate + if( !y_zero ) { + for( size_t b = 0; b < Ring::blocksize; ++b ) { + apply( tt[ b ], aa[ b ], xx[ b ], ring.getMultiplicativeOperator() ); + apply( bb[ b ], tt[ b ], yy[ b ], ring.getAdditiveOperator() ); + } + } else { + assert( y_scalar ); + for( size_t b = 0; b < Ring::blocksize; ++b ) { + apply( bb[ b ], aa[ b ], xx[ b ], ring.getMultiplicativeOperator() ); + } + } + if( !assign_z ) { + for( size_t b = 0; b < Ring::blocksize; ++b ) { + foldr( bb[ b ], zz[ b ], ring.getAdditiveOperator() ); + } + } + + // write-out + if( assign_z ) { + for( size_t b = 0; b < Ring::blocksize; ++b, ++i ) { + z[ i ] = static_cast< OutputType >( bb[ b ] ); + } + } else { + for( size_t b = 0; b < Ring::blocksize; ++b, ++i ) { + z[ i ] = static_cast< OutputType >( zz[ b ] ); + } + } + } + + // perform tail + if( !a_scalar ) { + a += i; + } + if( !x_scalar ) { + x += i; + } + if( !y_scalar ) { + y += i; + } + z += i; + for( ; i < end; ++i ) { + // do multiply + const typename Ring::D1 &as = ( a_scalar ) + ? static_cast< typename Ring::D1 >( a_wrapper.getValue() ) + : static_cast< typename Ring::D1 >( *a ); + const typename Ring::D2 &xs = ( x_scalar ) + ? static_cast< typename Ring::D2 >( x_wrapper.getValue() ) + : static_cast< typename Ring::D2 >( *x ); + typename Ring::D4 ys = ( y_scalar ) + ? 
static_cast< typename Ring::D4 >( y_wrapper.getValue() ) + : static_cast< typename Ring::D4 >( *y ); + typename Ring::D3 ts; + + if( !y_zero ) { + RC always_succeeds = apply( ts, as, xs, ring.getMultiplicativeOperator() ); + assert( always_succeeds == SUCCESS ); + always_succeeds = foldr( ts, ys, ring.getAdditiveOperator() ); + assert( always_succeeds == SUCCESS ); +#ifdef NDEBUG + (void) always_succeeds; +#endif + } else { + RC always_succeeds = apply( ys, as, xs, ring.getMultiplicativeOperator() ); + assert( always_succeeds == SUCCESS ); +#ifdef NDEBUG + (void) always_succeeds; +#endif + } + + // write out + if( assign_z ) { + *z = static_cast< OutputType >( ys ); + } else { + RC always_succeeds = foldr( ys, *z, ring.getAdditiveOperator() ); + assert( always_succeeds == SUCCESS ); +#ifdef NDEBUG + (void) always_succeeds; +#endif + } + + // move pointers + if( !a_scalar ) { + (void)a++; + } + if( !x_scalar ) { + (void)x++; + } + if( !y_scalar ) { + (void)y++; + } + (void)z++; + } + + // done + assert( false ); + return UNSUPPORTED; + } + + template< + Descriptor descr, + bool masked, + bool a_scalar, + bool x_scalar, + bool y_scalar, + bool y_zero, + typename MaskType, + class Ring, + typename InputType1, + typename InputType2, + typename InputType3, + typename OutputType, + typename Coords + > + RC eWiseMulAdd_dispatch( + Vector< OutputType, ascend, Coords > &z_vector, + const Vector< MaskType, ascend, Coords > * const m_vector, + const internal::Wrapper< a_scalar, InputType1, Coords > &a_wrapper, + const internal::Wrapper< x_scalar, InputType2, Coords > &x_wrapper, + const internal::Wrapper< y_scalar, InputType3, Coords > &y_wrapper, + const size_t n, + const Ring &ring + ) { + static_assert( !y_zero || y_scalar, "If y is zero, y_scalar must be true. 
" + "Triggering this assertion indicates an incorrect call to this " + "function; please submit a bug report" ); +#ifdef _DEBUG + std::cout << "\t in eWiseMulAdd_dispatch\n"; +#endif + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = + [&z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, &ring] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage eWiseMulAdd_dispatch in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_z, local_m, local_a, local_x, local_y; + const size_t local_n = upper_bound - lower_bound; + size_t local_z_nz = local_n; + size_t local_m_nz = local_n; + size_t local_a_nz = local_n; + size_t local_x_nz = local_n; + size_t local_y_nz = local_n; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + bool already_dense_output = true; + bool already_dense_mask = true; + bool already_dense_input_a = true; + bool already_dense_input_x = true; + bool already_dense_input_y = true; + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( z_vector ) ); + if( !already_dense_output ) { +#else + already_dense_output = false; +#endif + local_z = internal::getCoordinates( z_vector ).asyncSubset( lower_bound, + upper_bound ); + local_z_nz = local_z.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + if( masked ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_mask = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( *m_vector ) ); + if( !already_dense_mask ) { +#else + already_dense_mask = false; +#endif + local_m = internal::getCoordinates( *m_vector ).asyncSubset( + lower_bound, upper_bound ); + local_m_nz = local_m.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( !a_scalar ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_input_a = pipeline.containsAlreadyDenseVector( + a_wrapper.getCoordinates() ); + if ( !already_dense_input_a ) { +#else + already_dense_input_a = false; +#endif + local_a = a_wrapper.getCoordinates()->asyncSubset( lower_bound, + upper_bound ); + local_a_nz = local_a.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( !x_scalar ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_input_x = pipeline.containsAlreadyDenseVector( + x_wrapper.getCoordinates() ); + if( !already_dense_input_x ) { +#else + already_dense_input_x = false; +#endif + local_x = x_wrapper.getCoordinates()->asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( !y_scalar ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_input_y = pipeline.containsAlreadyDenseVector( + y_wrapper.getCoordinates() ); + if( !already_dense_input_y ) { +#else + already_dense_input_y = false; +#endif + local_y = y_wrapper.getCoordinates()->asyncSubset( lower_bound, + upper_bound ); + local_y_nz = local_y.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + } + + // check whether we are in the sparse or dense case + const bool mask_is_dense = !masked || ( + (descr & 
descriptors::structural) && + !(descr & descriptors::invert_mask) && + local_m_nz == local_n + ); + const size_t z_nns = local_z_nz; + + // the below Boolean shall be true only if the inputs a, x, and y generate + // a dense output vector. It furthermore shall be set to false only if the + // output vector was either empty or fully dense. This is done to determine + // the exact case the dense variant of the eWiseMulAdd implementations can + // be used. + const bool sparse = ( a_scalar ? false : ( local_a_nz < local_n ) ) || + ( x_scalar ? false : ( local_x_nz < local_n ) ) || + ( y_scalar ? false : ( local_y_nz < local_n ) ) || + ( z_nns > 0 && z_nns < local_n ) || + ( masked && !mask_is_dense ); + assert( !(sparse && dense_descr) ); +#ifdef _DEBUG + std::cout << "\t\t (sparse, dense)=(" << sparse << ", " << dense_descr + << ")\n"; +#endif + // pre-assign coors if output is dense but was previously totally empty + const bool assign_z = z_nns == 0 && !sparse; + + if( assign_z ) { +#ifdef _DEBUG + std::cout << "\t\t detected output will be dense while " + << "the output vector presently is completely empty. We therefore " + << "pre-assign all output coordinates\n"; +#endif +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#endif + // the result will always be dense + local_z.local_assignAllNotAlreadyAssigned(); + local_z_nz = local_z.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( !dense_descr && sparse ) { + // the below computes loop sizes multiplied with the number of vectors that + // each loop needs to touch. Possible vectors are: z, m, a, x, and y. + const size_t mask_factor = masked ? 1 : 0; + const size_t mul_loop_size = ( 3 + mask_factor ) * std::min( + ( a_scalar ? local_n : local_a_nz ), + ( x_scalar ? local_n : local_x_nz ) + ) + ( y_zero ? 0 : + (2 + mask_factor) * ( y_scalar ? local_n : local_y_nz ) + ); +#ifdef _DEBUG + std::cout << "\t\t mul_loop_size = " << mul_loop_size << "\n"; +#endif + + const size_t mask_loop_size = ( y_zero ? 4 : 5 ) * local_m_nz; + + if( masked && mask_loop_size < mul_loop_size ) { +#ifdef _DEBUG + std::cout << "\t\t mask_loop_size= " << mask_loop_size << "\n"; + std::cout << "\t\t will be driven by output mask\n"; +#endif + +#ifdef GRB_BOOLEAN_DISPATCHER + rc = boolean_dispatcher_sparse_eWiseMulAdd_maskDriven< +#else + rc = sparse_eWiseMulAdd_maskDriven< +#endif + descr, a_scalar, x_scalar, y_scalar, y_zero + >( + already_dense_output, already_dense_mask, already_dense_input_a, + already_dense_input_x, already_dense_input_y, + lower_bound, upper_bound, + local_z, local_m, local_a, local_x, local_y, + z_vector, *m_vector, a_wrapper, x_wrapper, y_wrapper, + ring + ); + } else { +#ifdef _DEBUG + std::cout << "\t\t will be driven by the multiplication a*x\n"; +#endif + static_assert( !(a_scalar && x_scalar), + "The case of the multiplication being between two scalars should have" + "been caught earlier. Please submit a bug report." 
); + + if( a_scalar ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = boolean_dispatcher_twoPhase_sparse_eWiseMulAdd_mulDriven< +#else + rc = twoPhase_sparse_eWiseMulAdd_mulDriven< +#endif + descr, masked, a_scalar, y_scalar, y_zero, true + >( + already_dense_output, already_dense_mask, already_dense_input_x, + already_dense_input_a, already_dense_input_y, + lower_bound, upper_bound, + local_z, &local_m, local_x, local_a, local_y, + z_vector, m_vector, *(x_wrapper.getPointer()), a_wrapper, y_wrapper, + ring + ); + } else if( x_scalar ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = boolean_dispatcher_twoPhase_sparse_eWiseMulAdd_mulDriven< +#else + rc = twoPhase_sparse_eWiseMulAdd_mulDriven< +#endif + descr, masked, x_scalar, y_scalar, y_zero, false + >( + already_dense_output, already_dense_mask, already_dense_input_a, + already_dense_input_x, already_dense_input_y, + lower_bound, upper_bound, + local_z, &local_m, local_a, local_x, local_y, + z_vector, m_vector, *(a_wrapper.getPointer()), x_wrapper, y_wrapper, + ring + ); + } else if( local_a_nz <= local_x_nz ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = boolean_dispatcher_twoPhase_sparse_eWiseMulAdd_mulDriven< +#else + rc = twoPhase_sparse_eWiseMulAdd_mulDriven< +#endif + descr, masked, x_scalar, y_scalar, y_zero, false + >( + already_dense_output, already_dense_mask, already_dense_input_a, + already_dense_input_x, already_dense_input_y, + lower_bound, upper_bound, + local_z, &local_m, local_a, local_x, local_y, + z_vector, m_vector, *(a_wrapper.getPointer()), x_wrapper, y_wrapper, + ring + ); + } else { + assert( local_x_nz < local_a_nz ); +#ifdef GRB_BOOLEAN_DISPATCHER + rc = boolean_dispatcher_twoPhase_sparse_eWiseMulAdd_mulDriven< +#else + rc = twoPhase_sparse_eWiseMulAdd_mulDriven< +#endif + descr, masked, a_scalar, y_scalar, y_zero, true + >( + already_dense_output, already_dense_mask, already_dense_input_x, + already_dense_input_a, already_dense_input_y, + lower_bound, upper_bound, + local_z, &local_m, local_x, local_a, local_y, + z_vector, m_vector, *(x_wrapper.getPointer()), a_wrapper, y_wrapper, + ring + ); + } + } + } else { + // all that remains is the dense case + assert( a_scalar || local_a_nz == local_n ); + assert( x_scalar || local_x_nz == local_n ); + assert( y_scalar || local_y_nz == local_n ); + assert( ! masked || mask_is_dense ); + assert( local_z_nz == local_n ); +#ifdef _DEBUG + std::cout << "\t\t will perform a dense eWiseMulAdd\n"; +#endif + if( assign_z ) { + rc = dense_eWiseMulAdd< + descr, a_scalar, x_scalar, y_scalar, y_zero, true + >( + lower_bound, upper_bound, + z_vector, a_wrapper, x_wrapper, y_wrapper, + ring + ); + } else { + rc = dense_eWiseMulAdd< + descr, a_scalar, x_scalar, y_scalar, y_zero, false + >( + lower_bound, upper_bound, + z_vector, a_wrapper, x_wrapper, y_wrapper, + ring + ); + } + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#else + if( !already_dense_vectors ) { +#endif + internal::getCoordinates( z_vector ).asyncJoinSubset( local_z, + lower_bound, upper_bound ); + } + + return rc; + }; + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_EWISEMULADD_DISPATCH, + n, sizeof( OutputType ), dense_descr, true, + &z_vector, nullptr, &internal::getCoordinates( z_vector ), nullptr, + masked ? m_vector : nullptr, a_wrapper.getPointer(), + x_wrapper.getPointer(), y_wrapper.getPointer(), + masked ? 
&internal::getCoordinates( *m_vector ) : nullptr, + a_wrapper.getCoordinates(), x_wrapper.getCoordinates(), + y_wrapper.getCoordinates(), + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: eWiseMulAdd_dispatch" + << std::endl; +#endif + return ret; + } + + } // namespace internal + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename InputType3, + typename OutputType, + typename Coords + > + RC eWiseMulAdd( + Vector< OutputType, ascend, Coords > &z, + const InputType1 alpha, + const Vector< InputType2, ascend, Coords > &x, + const Vector< InputType3, ascend, Coords > &y, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + grb::is_semiring< Ring >::value, + void + >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMulAdd", + "called with a left-hand scalar alpha of an element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMulAdd", + "called with a right-hand vector x with an element type that does not " + "match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, InputType3 >::value ), + "grb::eWiseMulAdd", + "called with an additive vector y with an element type that does not " + "match the fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), + "grb::eWiseMulAdd", + "called with a result vector z with an element type that does not match " + "the fourth domain of the given semiring" ); + + // dynamic sanity checks + const size_t n = size( z ); + if( size( x ) != n || size( y ) != n ) { + return MISMATCH; + } + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + // catch trivial cases + const InputType1 zeroIT1 = ring.template getZero< InputType1 >(); + if( alpha == zeroIT1 ) { + return foldl< descr >( z, y, ring.getAdditiveMonoid() ); + } + + const Vector< bool, ascend, Coords > * const null_mask = nullptr; + + const internal::Wrapper< true, InputType1, Coords > a_wrapper( alpha ); + const internal::Wrapper< false, InputType2, Coords > x_wrapper( x ); + const internal::Wrapper< false, InputType3, Coords > y_wrapper( y ); + + // sparse or dense case + return internal::eWiseMulAdd_dispatch< + descr, false, true, false, false, false, + bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords + >( z, null_mask, a_wrapper, x_wrapper, y_wrapper, n, ring ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename InputType3, + typename OutputType, + typename Coords + > + RC eWiseMulAdd( + Vector< OutputType, ascend, Coords > &z, + const Vector< InputType1, ascend, Coords > &a, + const InputType2 chi, + const Vector< InputType3, ascend, Coords > &y, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && 
+ !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + grb::is_semiring< Ring >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMulAdd", + "called with a left-hand scalar alpha of an element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMulAdd", + "called with a right-hand vector x with an element type that does not " + "match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, InputType3 >::value ), + "grb::eWiseMulAdd", + "called with an additive vector y with an element type that does not " + "match the fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), + "grb::eWiseMulAdd", + "called with a result vector z with an element type that does not match " + "the fourth domain of the given semiring" ); + + // dynamic sanity checks + const size_t n = size( z ); + if( size( a ) != n || size( y ) != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + // catch trivial cases + const InputType1 zeroIT2 = ring.template getZero< InputType2 >(); + if( chi == zeroIT2 ) { + return foldl< descr >( z, y, ring.getAdditiveMonoid() ); + } + + const Vector< bool, ascend, Coords > * const null_mask = nullptr; + + const internal::Wrapper< false, InputType1, Coords > a_wrapper( a ); + const internal::Wrapper< true, InputType2, Coords > x_wrapper( chi ); + const internal::Wrapper< false, InputType3, Coords > y_wrapper( y ); + + // sparse or dense case + return internal::eWiseMulAdd_dispatch< + descr, false, false, true, false, false, + bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords + >( z, null_mask, a_wrapper, x_wrapper, y_wrapper, n, ring ); + } + + template< + Descriptor descr = descriptors::no_operation, + bool y_zero = false, + class Ring, + typename InputType1, + typename InputType2, + typename InputType3, + typename OutputType, + typename Coords + > + RC eWiseMulAdd( + Vector< OutputType, ascend, Coords > &z, + const Vector< InputType1, ascend, Coords > &a, + const Vector< InputType2, ascend, Coords > &x, + const InputType3 gamma, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + grb::is_semiring< Ring >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMulAdd", + "called with a left-hand scalar alpha of an element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMulAdd", + "called with a right-hand vector x with an element type that does not " + "match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & 
descriptors::no_casting) || + std::is_same< typename Ring::D4, InputType3 >::value ), + "grb::eWiseMulAdd", + "called with an additive vector y with an element type that does not " + "match the fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), + "grb::eWiseMulAdd", + "called with a result vector z with an element type that does not match " + "the fourth domain of the given semiring" ); + + // dynamic sanity checks + const size_t n = size( z ); + if( size( a ) != n || size( x ) != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + const Vector< bool, ascend, Coords > * const null_mask = nullptr; + + const internal::Wrapper< false, InputType1, Coords > a_wrapper( a ); + const internal::Wrapper< false, InputType2, Coords > x_wrapper( x ); + const internal::Wrapper< true, InputType3, Coords > y_wrapper( gamma ); + + // sparse or dense case + return internal::eWiseMulAdd_dispatch< + descr, false, false, false, true, y_zero, + bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords + >( z, null_mask, a_wrapper, x_wrapper, y_wrapper, n, ring ); + } + + template< + Descriptor descr = descriptors::no_operation, + bool y_zero = false, + class Ring, + typename InputType1, + typename InputType2, + typename InputType3, + typename OutputType, + typename Coords + > + RC eWiseMulAdd( + Vector< OutputType, ascend, Coords > &z, + const Vector< InputType1, ascend, Coords > &a, + const InputType2 beta, + const InputType3 gamma, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + grb::is_semiring< Ring >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMulAdd", + "called with a left-hand scalar alpha of an element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMulAdd", + "called with a right-hand vector x with an element type that does not " + "match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, InputType3 >::value ), + "grb::eWiseMulAdd", + "called with an additive vector y with an element type that does not " + "match the fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), + "grb::eWiseMulAdd", + "called with a result vector z with an element type that does not match " + "the fourth domain of the given semiring" ); + + // dynamic sanity checks + const size_t n = size( z ); + if( size( a ) != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + // catch trivial dispatches + const InputType2 zeroIT2 = ring.template getZero< InputType2 >(); + if( beta == zeroIT2 ) { + return foldl< descr >( z, gamma, ring.getAdditiveMonoid() ); + } + + const Vector< bool, ascend, Coords > * const null_mask = nullptr; + + const internal::Wrapper< false, InputType1, Coords > 
a_wrapper( a ); + const internal::Wrapper< true, InputType2, Coords > x_wrapper( beta ); + const internal::Wrapper< true, InputType3, Coords > y_wrapper( gamma ); + + // sparse or dense case + return internal::eWiseMulAdd_dispatch< + descr, false, false, true, true, y_zero, + bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords + >( z, null_mask, a_wrapper, x_wrapper, y_wrapper, n, ring ); + } + + template< + Descriptor descr = descriptors::no_operation, + bool y_zero = false, + class Ring, + typename InputType1, + typename InputType2, + typename InputType3, + typename OutputType, + typename Coords + > + RC eWiseMulAdd( + Vector< OutputType, ascend, Coords > &z, + const InputType1 alpha, + const Vector< InputType2, ascend, Coords > &x, + const InputType3 gamma, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + grb::is_semiring< Ring >::value, void + >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMulAdd", + "called with a left-hand scalar alpha of an element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMulAdd", + "called with a right-hand vector x with an element type that does not " + "match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, InputType3 >::value ), + "grb::eWiseMulAdd", + "called with an additive vector y with an element type that does not " + "match the fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), + "grb::eWiseMulAdd", + "called with a result vector z with an element type that does not match " + "the fourth domain of the given semiring" ); + + // dynamic sanity checks + const size_t n = size( z ); + if( size( x ) != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + // catch trivial cases + const InputType1 zeroIT1 = ring.template getZero< InputType1 >(); + if( alpha == zeroIT1 ) { + return foldl< descr >( z, gamma, ring.getAdditiveMonoid() ); + } + + const Vector< bool, ascend, Coords > * null_mask = nullptr; + + const internal::Wrapper< true, InputType1, Coords > a_wrapper( alpha ); + const internal::Wrapper< false, InputType2, Coords > x_wrapper( x ); + const internal::Wrapper< true, InputType3, Coords > y_wrapper( gamma ); + + // sparse or dense case + return internal::eWiseMulAdd_dispatch< + descr, false, true, false, true, y_zero, + bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords + >( z, null_mask, a_wrapper, x_wrapper, y_wrapper, n, ring ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename OutputType, + typename InputType1, + typename InputType2, + typename InputType3, + typename Coords + > + RC eWiseMulAdd( + Vector< OutputType, ascend, Coords > &z, + const InputType1 alpha, + const InputType2 beta, + const Vector< InputType3, ascend, Coords > &y, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename 
std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + grb::is_semiring< Ring >::value, + void >::type * const = nullptr + ) { + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMulAdd(vector,scalar,scalar,scalar)", + "First domain of semiring does not match first input type" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMulAdd(vector,scalar,scalar,scalar)", + "Second domain of semiring does not match second input type" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, InputType3 >::value ), + "grb::eWiseMulAdd(vector,scalar,scalar,scalar)", + "Fourth domain of semiring does not match third input type" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), + "grb::eWiseMulAdd(vector,scalar,scalar,scalar)", + "Fourth domain of semiring does not match output type" ); +#ifdef _DEBUG + std::cout << "eWiseMulAdd (ascend, vector <- scalar x scalar + vector) " + << "precomputes scalar multiply and dispatches to eWiseAdd (ascend, " + << "vector <- scalar + vector)\n"; +#endif + // dynamic checks + const size_t n = size( z ); + if( size( y ) != n ) { return MISMATCH; } + + typename Ring::D3 mul_result; + RC rc = grb::apply( mul_result, alpha, beta, + ring.getMultiplicativeOperator() ); +#ifdef NDEBUG + (void) rc; +#else + assert( rc == SUCCESS ); +#endif + return eWiseAdd< descr >( z, mul_result, y, ring, phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename OutputType, + typename InputType1, + typename InputType2, + typename InputType3, + typename Coords + > + RC eWiseMulAdd( + Vector< OutputType, ascend, Coords > &z, + const InputType1 alpha, + const InputType2 beta, + const InputType3 gamma, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + grb::is_semiring< Ring >::value, void + >::type * const = nullptr + ) { + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMulAdd(vector,scalar,scalar,scalar)", + "First domain of semiring does not match first input type" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMulAdd(vector,scalar,scalar,scalar)", + "Second domain of semiring does not match second input type" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, InputType3 >::value ), + "grb::eWiseMulAdd(vector,scalar,scalar,scalar)", + "Fourth domain of semiring does not match third input type" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), + "grb::eWiseMulAdd(vector,scalar,scalar,scalar)", + "Fourth domain of semiring does not match output type" ); +#ifdef _DEBUG + std::cout << "eWiseMulAdd (ascend, vector <- scalar x scalar + scalar) " + << "precomputes scalar operations and dispatches to set (ascend)\n"; +#endif + typename Ring::D3 mul_result; + RC rc 
= grb::apply( mul_result, alpha, beta, + ring.getMultiplicativeOperator() ); +#ifdef NDEBUG + (void) rc; +#endif + assert( rc == SUCCESS ); + typename Ring::D4 add_result; + rc = grb::apply( add_result, mul_result, gamma, ring.getAdditiveOperator() ); +#ifdef NDEBUG + (void) rc; +#endif + assert( rc == SUCCESS ); + return grb::foldl< descr >( z, add_result, ring.getAdditiveMonoid(), phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename InputType3, + typename OutputType, + typename Coords + > + RC eWiseMulAdd( + Vector< OutputType, ascend, Coords > &z, + const Vector< InputType1, ascend, Coords > &a, + const Vector< InputType2, ascend, Coords > &x, + const Vector< InputType3, ascend, Coords > &y, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + grb::is_semiring< Ring >::value, + void >::type * const = nullptr + ) { + (void) ring; + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMulAdd", + "called with a left-hand vector a with an element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMulAdd", + "called with a right-hand vector x with an element type that does not " + "match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, InputType3 >::value ), + "grb::eWiseMulAdd", + "called with an additive vector y with an element type that does not " + "match the fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), + "grb::eWiseMulAdd", + "called with a result vector z with an element type that does not match " + "the fourth domain of the given semiring" ); + + // dynamic sanity checks + const size_t n = size( z ); + if( size( x ) != n || size( y ) != n || size( a ) != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + const Vector< bool, ascend, Coords > * const null_mask = nullptr; + + const internal::Wrapper< false, InputType1, Coords > a_wrapper( a ); + const internal::Wrapper< false, InputType2, Coords > x_wrapper( x ); + const internal::Wrapper< false, InputType3, Coords > y_wrapper( y ); + + // sparse or dense case + return internal::eWiseMulAdd_dispatch< + descr, false, false, false, false, false, + bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords + >( z, null_mask, a_wrapper, x_wrapper, y_wrapper, n, ring ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename InputType3, + typename OutputType, + typename MaskType, + typename Coords + > + RC eWiseMulAdd( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &m, + const InputType1 alpha, + const Vector< InputType2, ascend, Coords > &x, + const Vector< InputType3, ascend, Coords > &y, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< 
OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + grb::is_semiring< Ring >::value && + !grb::is_object< MaskType >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMulAdd", + "called with a left-hand scalar alpha of an element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMulAdd", + "called with a right-hand vector x with an element type that does not " + "match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, InputType3 >::value ), + "grb::eWiseMulAdd", + "called with an additive vector y with an element type that does not " + "match the fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), + "grb::eWiseMulAdd", + "called with a result vector z with an element type that does not match " + "the fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), + "grb::eWiseMulAdd", + "called with a mask vector m with a non-bool element type" ); + + // catch empty mask + if( size( m ) == 0 ) { + return eWiseMulAdd< descr >( z, alpha, x, y, ring, phase ); + } + + // dynamic sanity checks + const size_t n = size( z ); + if( size( x ) != n || size( y ) != n || size( m ) != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + // catch trivial cases + const InputType1 zeroIT1 = ring.template getZero< InputType1 >(); + if( alpha == zeroIT1 ) { + return foldl< descr >( z, m, y, ring.getAdditiveMonoid() ); + } + + const internal::Wrapper< true, InputType1, Coords > a_wrapper( alpha ); + const internal::Wrapper< false, InputType2, Coords > x_wrapper( x ); + const internal::Wrapper< false, InputType3, Coords > y_wrapper( y ); + + // sparse or dense case + return internal::eWiseMulAdd_dispatch< + descr, true, true, false, false, false, + bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords + >( z, &m, a_wrapper, x_wrapper, y_wrapper, n, ring ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename InputType3, + typename OutputType, + typename MaskType, + typename Coords + > + RC eWiseMulAdd( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &m, + const Vector< InputType1, ascend, Coords > &a, + const InputType2 chi, + const Vector< InputType3, ascend, Coords > &y, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + grb::is_semiring< Ring >::value && + !grb::is_object< MaskType >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMulAdd", + "called with a left-hand scalar alpha 
of an element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMulAdd", + "called with a right-hand vector x with an element type that does not " + "match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, InputType3 >::value ), + "grb::eWiseMulAdd", + "called with an additive vector y with an element type that does not " + "match the fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), + "grb::eWiseMulAdd", + "called with a result vector z with an element type that does not match " + "the fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), + "grb::eWiseMulAdd", + "called with a mask vector m with a non-bool element type" ); + + // catch empty mask + if( size( m ) == 0 ) { + return eWiseMulAdd< descr >( z, a, chi, y, ring, phase ); + } + + // dynamic sanity checks + const size_t n = size( z ); + if( size( a ) != n || size( y ) != n || size( m ) != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + // catch trivial cases + const InputType2 zeroIT2 = ring.template getZero< InputType2 >(); + if( chi == zeroIT2 ) { + return foldl< descr >( z, m, y, ring.getAdditiveMonoid() ); + } + + const internal::Wrapper< false, InputType1, Coords > a_wrapper( a ); + const internal::Wrapper< true, InputType2, Coords > x_wrapper( chi ); + const internal::Wrapper< false, InputType3, Coords > y_wrapper( y ); + + // sparse or dense case + return internal::eWiseMulAdd_dispatch< + descr, true, false, true, false, false, + bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords + >( z, &m, a_wrapper, x_wrapper, y_wrapper, n, ring ); + } + + template< + Descriptor descr = descriptors::no_operation, + bool y_zero = false, + class Ring, + typename InputType1, + typename InputType2, + typename InputType3, + typename OutputType, + typename MaskType, + typename Coords + > + RC eWiseMulAdd( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &m, + const Vector< InputType1, ascend, Coords > &a, + const Vector< InputType2, ascend, Coords > &x, + const InputType3 gamma, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + grb::is_semiring< Ring >::value && + !grb::is_object< MaskType >::value, void + >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMulAdd", + "called with a left-hand scalar alpha of an element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMulAdd", + "called with a right-hand vector x with an element type that does not " + "match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, 
InputType3 >::value ), + "grb::eWiseMulAdd", + "called with an additive vector y with an element type that does not " + "match the fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), + "grb::eWiseMulAdd", + "called with a result vector z with an element type that does not match " + "the fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), + "grb::eWiseMulAdd", + "called with a mask vector m with a non-bool element type" ); + + // catch empty mask + if( size( m ) == 0 ) { + return eWiseMulAdd< descr, y_zero >( z, a, x, gamma, ring, phase ); + } + + // dynamic sanity checks + const size_t n = size( z ); + if( size( a ) != n || size( x ) != n || size( m ) != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + const internal::Wrapper< false, InputType1, Coords > a_wrapper( a ); + const internal::Wrapper< false, InputType2, Coords > x_wrapper( x ); + const internal::Wrapper< true, InputType3, Coords > y_wrapper( gamma ); + + // sparse or dense case + return internal::eWiseMulAdd_dispatch< + descr, true, false, false, true, y_zero, + bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords + >( z, &m, a_wrapper, x_wrapper, y_wrapper, n, ring ); + } + + template< + Descriptor descr = descriptors::no_operation, + bool y_zero = false, + class Ring, + typename InputType1, + typename InputType2, + typename InputType3, + typename OutputType, + typename MaskType, + typename Coords + > + RC eWiseMulAdd( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &m, + const Vector< InputType1, ascend, Coords > &a, + const InputType2 beta, + const InputType3 gamma, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + grb::is_semiring< Ring >::value && + !grb::is_object< MaskType >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMulAdd", + "called with a left-hand scalar alpha of an element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMulAdd", + "called with a right-hand vector x with an element type that does not " + "match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, InputType3 >::value ), + "grb::eWiseMulAdd", + "called with an additive vector y with an element type that does not " + "match the fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), + "grb::eWiseMulAdd", + "called with a result vector z with an element type that does not match " + "the fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), + "grb::eWiseMulAdd", + "called with a mask vector m with a non-bool element type" ); + + // catch empty mask 
+ if( size( m ) == 0 ) { + return eWiseMulAdd< descr, y_zero >( z, a, beta, gamma, ring, phase ); + } + + // dynamic sanity checks + const size_t n = size( z ); + if( size( a ) != n || size( m ) != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + // catch trivial dispatch + const InputType2 zeroIT2 = ring.template getZero< InputType2 >(); + if( zeroIT2 == beta ) { +#ifdef _DEBUG + std::cout << "eWiseMulAdd (ascend, masked, vector<-vector<-scalar<-" + << "scalar) dispatches to foldl\n"; +#endif + return foldl< descr >( z, m, gamma, ring.getAdditiveMonoid() ); + } + + const internal::Wrapper< false, InputType1, Coords > a_wrapper( a ); + const internal::Wrapper< true, InputType2, Coords > x_wrapper( beta ); + const internal::Wrapper< true, InputType3, Coords > y_wrapper( gamma ); + + // sparse or dense case + return internal::eWiseMulAdd_dispatch< + descr, true, false, true, true, y_zero, + bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords + >( z, &m, a_wrapper, x_wrapper, y_wrapper, n, ring ); + } + + template< + Descriptor descr = descriptors::no_operation, + bool y_zero = false, + class Ring, + typename InputType1, + typename InputType2, + typename InputType3, + typename OutputType, + typename MaskType, + typename Coords + > + RC eWiseMulAdd( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &m, + const InputType1 alpha, + const Vector< InputType2, ascend, Coords > &x, + const InputType3 gamma, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + grb::is_semiring< Ring >::value && + !grb::is_object< MaskType >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMulAdd", + "called with a left-hand scalar alpha of an element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMulAdd", + "called with a right-hand vector x with an element type that does not " + "match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, InputType3 >::value ), + "grb::eWiseMulAdd", + "called with an additive vector y with an element type that does not " + "match the fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), + "grb::eWiseMulAdd", + "called with a result vector z with an element type that does not match " + "the fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), + "grb::eWiseMulAdd", + "called with a mask vector m with a non-bool element type" ); + + // catch empty mask + if( size( m ) == 0 ) { + return eWiseMulAdd< descr, y_zero >( z, alpha, x, gamma, ring, phase ); + } + + // dynamic sanity checks + const size_t n = size( z ); + if( size( x ) != n || size( m ) != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + // catch trivial 
dispatch + const InputType1 zeroIT1 = ring.template getZero< InputType1 >(); + if( alpha == zeroIT1 ) { +#ifdef _DEBUG + std::cout << "eWiseMulAdd (ascend, masked, vector<-scalar<-scalar<-" + << "scalar) dispatches to foldl\n"; +#endif + return foldl< descr >( z, m, gamma, ring.getAdditiveMonoid() ); + } + + const internal::Wrapper< true, InputType1, Coords > a_wrapper( alpha ); + const internal::Wrapper< false, InputType2, Coords > x_wrapper( x ); + const internal::Wrapper< true, InputType3, Coords > y_wrapper( gamma ); + + // sparse or dense case + return internal::eWiseMulAdd_dispatch< + descr, true, true, false, true, y_zero, + bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords + >( z, &m, a_wrapper, x_wrapper, y_wrapper, n, ring ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename InputType3, + typename OutputType, + typename MaskType, + typename Coords + > + RC eWiseMulAdd( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &m, + const Vector< InputType1, ascend, Coords > &a, + const Vector< InputType2, ascend, Coords > &x, + const Vector< InputType3, ascend, Coords > &y, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + grb::is_semiring< Ring >::value && + !grb::is_object< MaskType >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMulAdd", + "called with a left-hand vector a with an element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMulAdd", + "called with a right-hand vector x with an element type that does not " + "match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, InputType3 >::value ), + "grb::eWiseMulAdd", + "called with an additive vector y with an element type that does not " + "match the fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), + "grb::eWiseMulAdd", + "called with a result vector z with an element type that does not match " + "the fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), + "grb::eWiseMulAdd", + "called with a mask vector m with a non-bool element type" ); + + // catch empty mask + if( size( m ) == 0 ) { + return eWiseMulAdd< descr >( z, a, x, y, ring, phase ); + } + + // dynamic sanity checks + const size_t n = size( z ); + if( size( x ) != n || size( y ) != n || size( a ) != n || size( m ) != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + const internal::Wrapper< false, InputType1, Coords > a_wrapper( a ); + const internal::Wrapper< false, InputType2, Coords > x_wrapper( x ); + const internal::Wrapper< false, InputType3, Coords > y_wrapper( y ); + + // sparse or dense case + return internal::eWiseMulAdd_dispatch< + descr, true, false, false, 
false, false, + bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords + >( z, &m, a_wrapper, x_wrapper, y_wrapper, n, ring ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename OutputType, + typename MaskType, + typename InputType1, + typename InputType2, + typename InputType3, + typename Coords + > + RC eWiseMulAdd( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &m, + const InputType1 alpha, + const InputType2 beta, + const Vector< InputType3, ascend, Coords > &y, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + grb::is_semiring< Ring >::value && + !grb::is_object< MaskType >::value, void + >::type * const = nullptr + ) { + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMulAdd(vector,scalar,scalar,scalar)", + "First domain of semiring does not match first input type" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMulAdd(vector,scalar,scalar,scalar)", + "Second domain of semiring does not match second input type" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, InputType3 >::value ), + "grb::eWiseMulAdd(vector,scalar,scalar,scalar)", + "Fourth domain of semiring does not match third input type" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), + "grb::eWiseMulAdd(vector,scalar,scalar,scalar)", + "Fourth domain of semiring does not match output type" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), + "grb::eWiseMulAdd", + "called with a mask vector with a non-bool element type" ); +#ifdef _DEBUG + std::cout << "eWiseMulAdd (ascend, vector <- scalar x scalar + vector, " + << "masked) precomputes scalar multiply and dispatches to eWiseAdd " + << "(ascend, vector <- scalar + vector, masked)\n"; +#endif + // dynamic checks + const size_t n = size( z ); + if( size( m ) != n || size( y ) != n ) { + return MISMATCH; + } + + typename Ring::D3 mul_result; + RC rc = grb::apply( mul_result, alpha, beta, + ring.getMultiplicativeOperator() ); +#ifdef NDEBUG + (void) rc; +#else + assert( rc == SUCCESS ); +#endif + return grb::eWiseAdd< descr >( z, m, mul_result, y, ring, phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename OutputType, + typename MaskType, + typename InputType1, + typename InputType2, + typename InputType3, + typename Coords + > + RC eWiseMulAdd( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &m, + const InputType1 alpha, + const InputType2 beta, + const InputType3 gamma, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + grb::is_semiring< Ring >::value, + void >::type * const = nullptr + ) { + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + 
"grb::eWiseMulAdd(vector,scalar,scalar,scalar)", + "First domain of semiring does not match first input type" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMulAdd(vector,scalar,scalar,scalar)", + "Second domain of semiring does not match second input type" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, InputType3 >::value ), + "grb::eWiseMulAdd(vector,scalar,scalar,scalar)", + "Fourth domain of semiring does not match third input type" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), + "grb::eWiseMulAdd(vector,scalar,scalar,scalar)", + "Fourth domain of semiring does not match output type" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), + "grb::eWiseMulAdd", + "called with a mask vector with a non-bool element type" ); +#ifdef _DEBUG + std::cout << "eWiseMulAdd (ascend, vector <- scalar x scalar + scalar, " + << "masked) precomputes scalar operations and dispatches to foldl " + << "(ascend, masked)\n"; +#endif + // dynamic checks + const size_t n = size( z ); + if( size( m ) != n ) { + return MISMATCH; + } + + typename Ring::D3 mul_result; + RC rc = grb::apply( mul_result, alpha, beta, + ring.getMultiplicativeOperator() ); + assert( rc == SUCCESS ); +#ifdef NDEBUG + (void) rc; +#endif + typename Ring::D4 add_result; + rc = grb::apply( add_result, mul_result, gamma, ring.getAdditiveOperator() ); + assert( rc == SUCCESS ); +#ifdef NDEBUG + (void) rc; +#endif + return grb::foldl( z, m, add_result, ring.getAdditiveMonoid(), phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename OutputType, + typename Coords + > + RC eWiseMul( + Vector< OutputType, ascend, Coords > &z, + const Vector< InputType1, ascend, Coords > &x, + const Vector< InputType2, ascend, Coords > &y, + const Ring & ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_semiring< Ring >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMul", + "called with a left-hand side input vector with element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMul", + "called with a right-hand side input vector with element type that does " + "not match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D3, OutputType >::value ), + "grb::eWiseMul", + "called with an output vector with element type that does not match the " + "third domain of the given semiring" ); + + // dynamic checks + const size_t n = size( z ); + if( size( x ) != n || size( y ) != n ) { + return MISMATCH; + } + + // check trivial phase + if( phase == RESIZE ) { + return SUCCESS; + } + +#ifdef _DEBUG + std::cout << "eWiseMul (ascend, vector <- vector x vector) dispatches " + << "to eWiseMulAdd (vector <- vector x vector + 0)\n"; +#endif + return eWiseMulAdd< descr, true 
>( + z, x, y, ring.template getZero< typename Ring::D4 >(), ring, phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename OutputType, + typename Coords + > + RC eWiseMul( + Vector< OutputType, ascend, Coords > &z, + const InputType1 alpha, + const Vector< InputType2, ascend, Coords > &y, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_semiring< Ring >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMul", + "called with a left-hand side input vector with element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMul", + "called with a right-hand side input vector with element type that does " + "not match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D3, OutputType >::value ), + "grb::eWiseMul", + "called with an output vector with element type that does not match the " + "third domain of the given semiring" ); + + // dynamic checks + const size_t n = size( z ); + if( size( y ) != n ) { return MISMATCH; } + + // check for trivial phase + if( phase == RESIZE ) { + return SUCCESS; + } + + // check trivial + if( alpha == ring.template getZero< typename Ring::D1 >() ) { + return SUCCESS; + } + +#ifdef _DEBUG + std::cout << "eWiseMul (ascend, vector <- scalar x vector) dispatches " + << "to eWiseMulAdd (vector <- scalar x vector + 0)\n"; +#endif + return eWiseMulAdd< descr, true >( + z, alpha, y, ring.template getZero< typename Ring::D4 >(), ring, phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename OutputType, + typename Coords + > + RC eWiseMul( + Vector< OutputType, ascend, Coords > &z, + const Vector< InputType1, ascend, Coords > &x, + const InputType2 beta, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_semiring< Ring >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMul", + "called with a left-hand side input vector with element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMul", + "called with a right-hand side input vector with element type that does " + "not match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D3, OutputType >::value ), + "grb::eWiseMul", + "called with an output vector with element type that does not match the " + "third domain of the given semiring" ); + + // dynamic checks + const size_t n = size( z ); + if( size( x ) != n ) { + return 
MISMATCH; + } + + // catch trivial phase + if( phase == RESIZE ) { + return SUCCESS; + } + + // check trivial + if( beta == ring.template getZero< typename Ring::D2 >() ) { + return SUCCESS; + } + +#ifdef _DEBUG + std::cout << "eWiseMul (ascend) dispatches to eWiseMulAdd with 0.0 as " + << "additive scalar\n"; +#endif + + return eWiseMulAdd< descr, true >( + z, x, beta, ring.template getZero< typename Ring::D4 >(), ring, phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename OutputType, + typename Coords + > + RC eWiseMul( + Vector< OutputType, ascend, Coords > &z, + const InputType1 alpha, + const InputType2 beta, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_semiring< Ring >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMul", + "called with a left-hand side input vector with element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMul", + "called with a right-hand side input vector with element type that does " + "not match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D3, OutputType >::value ), + "grb::eWiseMul", + "called with an output vector with element type that does not match the " + "third domain of the given semiring" ); + + // check for trivial phase + if( phase == RESIZE ) { + return SUCCESS; + } + + // check trivial + if( alpha == ring.template getZero< typename Ring::D1 >() ) { + return SUCCESS; + } + if( beta == ring.template getZero< typename Ring::D2 >() ) { + return SUCCESS; + } + +#ifdef _DEBUG + std::cout << "eWiseMul (ascend) dispatches to scalar apply and foldl\n"; +#endif + typename Ring::D3 temp; + RC always_success = apply( temp, alpha, beta, + ring.getMultiplicativeOperator() ); + assert( always_success == SUCCESS ); +#ifdef NDEBUG + (void) always_success; +#endif + return foldl< descr >( z, temp, ring.getAdditiveMonoid(), phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename OutputType, + typename MaskType, + typename Coords + > + RC eWiseMul( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &m, + const Vector< InputType1, ascend, Coords > &x, + const Vector< InputType2, ascend, Coords > &y, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< MaskType >::value && + grb::is_semiring< Ring >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMul", + "called with a left-hand side input vector with element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || 
+ std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMul", + "called with a right-hand side input vector with element type that does " + "not match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D3, OutputType >::value ), + "grb::eWiseMul", + "called with an output vector with element type that does not match the " + "third domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), + "grb::eWiseMulAdd", + "called with a mask vector with a non-bool element type" ); + + // check for empty mask + if( size( m ) == 0 ) { + return eWiseMul< descr >( z, x, y, ring, phase ); + } + + // dynamic checks + const size_t n = size( z ); + if( size( m ) != n || size( x ) != n || size( y ) != n ) { + return MISMATCH; + } + + // check trivial phase + if( phase == RESIZE ) { + return SUCCESS; + } + +#ifdef _DEBUG + std::cout << "eWiseMul (ascend, vector <- vector x vector, masked) " + << "dispatches to eWiseMulAdd (vector <- vector x vector + 0, masked)\n"; +#endif + return eWiseMulAdd< descr, true >( + z, m, x, y, ring.template getZero< typename Ring::D4 >(), ring, phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename OutputType, + typename MaskType, + typename Coords + > + RC eWiseMul( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &m, + const InputType1 alpha, + const Vector< InputType2, ascend, Coords > &y, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< MaskType >::value && + grb::is_semiring< Ring >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMul", + "called with a left-hand side input vector with element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMul", + "called with a right-hand side input vector with element type that does " + "not match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D3, OutputType >::value ), + "grb::eWiseMul", + "called with an output vector with element type that does not match the " + "third domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), + "grb::eWiseMulAdd", + "called with a mask vector _m with a non-bool element type" ); + + // check for empty mask + if( size( m ) == 0 ) { + return eWiseMul< descr >( z, alpha, y, ring, phase ); + } + + // dynamic checks + const size_t n = size( z ); + if( size( m ) != n || size( y ) != n ) { return MISMATCH; } + + // check for trivial phase + if( phase == RESIZE ) { + return SUCCESS; + } + + // check trivial + if( alpha == ring.template getZero< typename Ring::D1 >() ) { + return SUCCESS; + } + +#ifdef _DEBUG + std::cout << "eWiseMul (ascend, vector <- scalar x vector, masked) " + << "dispatches to eWiseMulAdd (vector <- scalar x vector + 0, masked)\n"; 
+#endif + return eWiseMulAdd< descr, true >( + z, m, alpha, y, ring.template getZero< typename Ring::D4 >(), ring, phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename OutputType, + typename MaskType, + typename Coords + > + RC eWiseMul( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &m, + const Vector< InputType1, ascend, Coords > &x, + const InputType2 beta, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< MaskType >::value && + grb::is_semiring< Ring >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMul", + "called with a left-hand side input vector with element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMul", + "called with a right-hand side input vector with element type that does " + "not match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D3, OutputType >::value ), + "grb::eWiseMul", + "called with an output vector with element type that does not match the " + "third domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), + "grb::eWiseMulAdd", + "called with a mask vector _m with a non-bool element type" ); + + // check for empty mask + if( size( m ) == 0 ) { + return eWiseMul< descr >( z, x, beta, ring, phase ); + } + + // dynamic checks + const size_t n = size( z ); + if( size( m ) != n || size( x ) != n ) { return MISMATCH; } + + // check for trivial phase + if( phase == RESIZE ) { + return SUCCESS; + } + + // check trivial + if( beta == ring.template getZero< typename Ring::D2 >() ) { + return SUCCESS; + } + +#ifdef _DEBUG + std::cout << "eWiseMul (ascend, masked) dispatches to masked " + << "eWiseMulAdd with 0.0 as additive scalar\n"; +#endif + return eWiseMulAdd< descr, true >( + z, m, x, beta, ring.template getZero< typename Ring::D4 >(), ring, phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename OutputType, + typename MaskType, + typename Coords + > + RC eWiseMul( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &m, + const InputType1 alpha, + const InputType2 beta, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< MaskType >::value && + grb::is_semiring< Ring >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMul", + "called with a left-hand side input vector with element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + 
std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMul", + "called with a right-hand side input vector with element type that does " + "not match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D3, OutputType >::value ), + "grb::eWiseMul", + "called with an output vector with element type that does not match the " + "third domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), + "grb::eWiseMulAdd", + "called with a mask vector _m with a non-bool element type" ); + + // check for empty mask + if( size( m ) == 0 ) { + return eWiseMul< descr >( z, alpha, beta, ring, phase ); + } + + // dynamic checks + const size_t n = size( z ); + if( size( m ) != n ) { return MISMATCH; } + + // check for trivial phase + if( phase == RESIZE ) { + return SUCCESS; + } + + // check trivial + if( alpha == ring.template getZero< typename Ring::D1 >() ) { + return SUCCESS; + } + if( beta == ring.template getZero< typename Ring::D2 >() ) { + return SUCCESS; + } + +#ifdef _DEBUG + std::cout << "eWiseMul (ascend, masked) dispatches to masked foldl\n"; +#endif + typename Ring::D3 temp; + const RC always_success = apply( temp, alpha, beta, + ring.getMultiplicativeOperator() ); + assert( always_success == SUCCESS ); +#ifdef NDEBUG + (void) always_success; +#endif + return foldl< descr >( z, m, temp, ring.getAdditiveMonoid(), EXECUTE ); + } + + // internal namespace for implementation of grb::dot + namespace internal { + + template< + Descriptor descr, +#ifdef GRB_BOOLEAN_DISPATCHER + bool already_dense_input_x, + bool already_dense_input_y, +#endif + class AddMonoid, + class AnyOp, + typename InputType1, + typename InputType2, + typename Coords + > + RC sparse_dot_generic( +#ifndef GRB_BOOLEAN_DISPATCHER + bool already_dense_input_x, + bool already_dense_input_y, +#endif + typename AddMonoid::D3 &thread_local_output, + const size_t lower_bound, + const size_t upper_bound, + const Coords &local_x, + const Coords &local_y, + const Vector< InputType1, ascend, Coords > &x, + const Vector< InputType2, ascend, Coords > &y, + const size_t local_nz, + const AddMonoid &addMonoid, + const AnyOp &anyOp + ) { +#ifdef _DEBUG + std::cout << "\t\t in sparse variant, nonzero range " << lower_bound << "--" + << upper_bound << ", blocksize " << AnyOp::blocksize << "\n"; +#else + (void) upper_bound; +#endif + + // get raw alias + const InputType1 * __restrict__ a = internal::getRaw( x ); + const InputType2 * __restrict__ b = internal::getRaw( y ); + + size_t i = 0; + if( local_nz > 0 ) { + while( i + AnyOp::blocksize < local_nz ) { + // declare buffers + static_assert( AnyOp::blocksize > 0, + "Configuration error: vectorisation blocksize set to 0!" ); + typename AnyOp::D1 xx[ AnyOp::blocksize ]; + typename AnyOp::D2 yy[ AnyOp::blocksize ]; + typename AnyOp::D3 zz[ AnyOp::blocksize ]; + bool mask[ AnyOp::blocksize ]; + + // prepare registers + for( size_t k = 0; k < AnyOp::blocksize; ++k, ++i ) { + mask[ k ] = already_dense_input_x || + local_x.assigned( already_dense_input_y ? i : local_y.index( i ) ); + } + + // rewind + i -= AnyOp::blocksize; + + // do masked load + for( size_t k = 0; k < AnyOp::blocksize; ++k, ++i ) { + if( mask[ k ] ) { + xx[ k ] = static_cast< typename AnyOp::D1 >( + a[ ( already_dense_input_y ? i : local_y.index( i ) ) + lower_bound ] ); + yy[ k ] = static_cast< typename AnyOp::D2 >( + b[ ( already_dense_input_y ? 
i : local_y.index( i ) ) + lower_bound ] ); + } + } + + // perform element-wise multiplication + if( internal::maybe_noop< AnyOp >::value ) { + // we are forced to first initialise zz before doing masked apply + for( size_t k = 0; k < AnyOp::blocksize; ++k ) { + zz[ k ] = addMonoid.template getIdentity< typename AnyOp::D3 >(); + } + for( size_t k = 0; k < AnyOp::blocksize; ++k ) { + if( mask[ k ] ) { + GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // yy and xx cannot be used + // uninitialised or mask + apply( zz[ k ], xx[ k ], yy[ k ], anyOp ); // would be false while zz + GRB_UTIL_RESTORE_WARNINGS // init is just above + } + } + } else { + // if apply surely initialises zz, we could use a blend-like op + for( size_t k = 0; k < AnyOp::blocksize; ++k ) { + if( mask[ k ] ) { + apply( zz[ k ], xx[ k ], yy[ k ], anyOp ); + } else { + zz[ k ] = addMonoid.template getIdentity< typename AnyOp::D3 >(); + } + } + } + + // perform reduction into output element + addMonoid.getOperator().foldlArray( thread_local_output, zz, + AnyOp::blocksize ); + //^--> note that this foldl operates on raw arrays, + // and thus should not be mistaken with a foldl + // on a grb::Vector. + } + + // perform element-by-element updates for remainder (if any) + for( ; i < local_nz; ++i ) { + typename AddMonoid::D3 temp = + addMonoid.template getIdentity< typename AddMonoid::D3 >(); + const size_t index = ( already_dense_input_y ? i : local_y.index( i ) ) + + lower_bound; + if( already_dense_input_x || local_x.assigned( index - lower_bound ) ) { + apply( temp, a[ index ], b[ index ], anyOp ); + foldr( temp, thread_local_output, addMonoid.getOperator() ); + } + } + } + + assert( false ); + return UNSUPPORTED; + } + + template< + Descriptor descr = descriptors::no_operation, + class AddMonoid, + class AnyOp, + typename OutputType, + typename InputType1, + typename InputType2, + typename Coords + > + RC dot_generic( + OutputType &z, + const Vector< InputType1, ascend, Coords > &x, + const Vector< InputType2, ascend, Coords > &y, + const AddMonoid &addMonoid, + const AnyOp &anyOp, + const Phase &phase + ) { + const size_t n = internal::getCoordinates( x ).size(); + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + RC ret = SUCCESS; + + const size_t start = 0; + const size_t end = n; + + if( end > start ) { + + typename AddMonoid::D3 reduced = + addMonoid.template getIdentity< typename AddMonoid::D3 >(); + + size_t reduced_size = sysconf( _SC_NPROCESSORS_ONLN ) * + config::CACHE_LINE_SIZE::value(); + typename AddMonoid::D3 array_reduced[ reduced_size ]; + + for( + size_t i = 0; + i < reduced_size; + i += config::CACHE_LINE_SIZE::value() + ) { + array_reduced[ i ] = + addMonoid.template getIdentity< typename AddMonoid::D3 >(); + } + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = + [&x, &y, &addMonoid, &anyOp, &array_reduced] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage dot-generic in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_x, local_y; + const size_t local_n = upper_bound - lower_bound; + size_t local_x_nz = local_n; + size_t local_y_nz = local_n; + bool sparse = false; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const 
bool already_dense_vectors = dense_descr; +#endif + bool already_dense_input_x = true; + bool already_dense_input_y = true; + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_input_x = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_input_x ) { +#else + already_dense_input_x = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( + lower_bound, upper_bound ); + local_x_nz = local_x.nonzeroes(); + if( local_x_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input_y = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !already_dense_input_y ) { +#else + already_dense_input_y = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); + local_y_nz = local_y.nonzeroes(); + if( local_y_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + unsigned int thread_id = + omp_get_thread_num() * config::CACHE_LINE_SIZE::value(); + + if( sparse ) { + if( local_x_nz < local_y_nz ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_sparse_dot_generic< +#else + rc = internal::sparse_dot_generic< +#endif + descr, AddMonoid, AnyOp, InputType1, InputType2, Coords + >( + already_dense_input_x, already_dense_input_y, + array_reduced[ thread_id ], + lower_bound, upper_bound, + local_x, local_y, + x, y, + local_x_nz, + addMonoid, anyOp + ); + } else { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_sparse_dot_generic< +#else + rc = internal::sparse_dot_generic< +#endif + descr, AddMonoid, AnyOp, InputType1, InputType2, Coords + >( + already_dense_input_y, already_dense_input_x, + array_reduced[ thread_id ], + lower_bound, upper_bound, + local_y, local_x, x, y, local_y_nz, + addMonoid, anyOp + ); + } + } else { + // get raw alias + const InputType1 * __restrict__ a = internal::getRaw( x ); + const InputType2 * __restrict__ b = internal::getRaw( y ); + + size_t i = lower_bound; + if( upper_bound > lower_bound ) { + while( i + AnyOp::blocksize < upper_bound ) { + // declare buffers + static_assert( AnyOp::blocksize > 0, + "Configuration error: vectorisation blocksize set to 0!" ); + + typename AnyOp::D1 xx[ AnyOp::blocksize ]; + typename AnyOp::D2 yy[ AnyOp::blocksize ]; + typename AnyOp::D3 zz[ AnyOp::blocksize ]; + + // prepare registers + for( size_t k = 0; k < AnyOp::blocksize; ++k ) { + xx[ k ] = static_cast< typename AnyOp::D1 >( a[ i ] ); + yy[ k ] = static_cast< typename AnyOp::D2 >( b[ i++ ] ); + } + + // perform element-wise multiplication + if( internal::maybe_noop< AnyOp >::value ) { + for( size_t k = 0; k < AnyOp::blocksize; ++k ) { + zz[ k ] = addMonoid.template getIdentity< typename AnyOp::D3 >(); + } + } + for( size_t k = 0; k < AnyOp::blocksize; ++k ) { + apply( zz[ k ], xx[ k ], yy[ k ], anyOp ); + } + + // perform reduction into output element + addMonoid.getOperator().foldlArray( array_reduced[ thread_id ], zz, + AnyOp::blocksize ); + //^--> note that this foldl operates on raw arrays, + // and thus should not be mistaken with a foldl + // on a grb::Vector. 
+#ifdef _DEBUG + std::cout << "\t\t " << ( i - AnyOp::blocksize ) << "--" << i << ": " + << "running reduction = " << array_reduced[ thread_id ] << "\n"; +#endif + } + + // perform element-by-element updates for remainder (if any) + for( ; i < upper_bound; ++i ) { + OutputType temp = addMonoid.template getIdentity< OutputType >(); + apply( temp, a[ i ], b[ i ], anyOp ); + foldr( temp, array_reduced[ thread_id ], addMonoid.getOperator() ); + } + } + } + + // the local coordinates for the input vectors have not been updated as + // they are read-only therefore, we don't need to invoke asyncJoinSubset; + // the output is a scalar + return rc; + }; + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: dot-generic" << std::endl; +#endif + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_DOT_GENERIC, + end, sizeof( OutputType ), dense_descr, true, + nullptr, nullptr, nullptr, nullptr, + &x, &y, nullptr, nullptr, + &internal::getCoordinates( x ), &internal::getCoordinates( y ), + nullptr, nullptr, + nullptr + ); + + for( + size_t i = 0; + i < reduced_size; + i += config::CACHE_LINE_SIZE::value() + ) { + foldl( reduced, array_reduced[ i ], addMonoid.getOperator() ); + } + + // write back result + z = static_cast< OutputType >( reduced ); + } else { + // this has been tested by the unittest + } + +#ifdef _DEBUG + std::cout << "\t returning " << z << "\n"; +#endif + // done! + return ret; + } + + } // namespace internal + + template< + Descriptor descr = descriptors::no_operation, + class AddMonoid, + class AnyOp, + typename OutputType, + typename InputType1, + typename InputType2, + typename Coords + > + RC dot( + OutputType &z, + const Vector< InputType1, ascend, Coords > &x, + const Vector< InputType2, ascend, Coords > &y, + const AddMonoid &addMonoid = AddMonoid(), + const AnyOp &anyOp = AnyOp(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_monoid< AddMonoid >::value && + grb::is_operator< AnyOp >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< InputType1, typename AnyOp::D1 >::value ), "grb::dot", + "called with a left-hand vector value type that does not match the first " + "domain of the given multiplicative operator" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< InputType2, typename AnyOp::D2 >::value ), "grb::dot", + "called with a right-hand vector value type that does not match the second " + "domain of the given multiplicative operator" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename AddMonoid::D3, typename AnyOp::D1 >::value ), + "grb::dot", + "called with a multiplicative operator output domain that does not match " + "the first domain of the given additive operator" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< OutputType, typename AddMonoid::D2 >::value ), "grb::dot", + "called with an output vector value type that does not match the second " + "domain of the given additive operator" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename AddMonoid::D3, typename AddMonoid::D2 >::value ), + "grb::dot", + "called with an additive operator whose output domain does not match its " + "second input domain" ); + NO_CAST_ASSERT( ( !(descr & 
descriptors::no_casting) || + std::is_same< OutputType, typename AddMonoid::D3 >::value ), "grb::dot", + "called with an output vector value type that does not match the third " + "domain of the given additive operator" ); + +#ifdef _DEBUG + std::cout << "In grb::dot (ascend). " + << "I/O scalar on input reads " << z << "\n"; +#endif + + // dynamic sanity check + const size_t n = internal::getCoordinates( y ).size(); + if( internal::getCoordinates( x ).size() != n ) { + return MISMATCH; + } + +#ifdef _DEBUG + std::cout << "\t dynamic checks pass\n"; +#endif + + // dot will be computed out-of-place here. A separate field is needed because + // of possible multi-threaded computation of the dot. + OutputType oop = addMonoid.template getIdentity< OutputType >(); + + RC ret = SUCCESS; + + ret = internal::dot_generic< descr >( oop, x, y, addMonoid, anyOp, phase ); + + // fold out-of-place dot product into existing input, and exit +#ifdef _DEBUG + std::cout << "\t dot_generic returned " << oop << ", " + << "which will be folded into " << z << " " + << "using the additive monoid\n"; +#endif + ret = ret ? ret : foldl( z, oop, addMonoid.getOperator() ); +#ifdef _DEBUG + std::cout << "\t returning " << z << "\n"; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename IOType, + typename InputType1, + typename InputType2, + typename Coords + > + RC dot( + IOType &x, + const Vector< InputType1, ascend, Coords > &left, + const Vector< InputType2, ascend, Coords > &right, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< IOType >::value && + grb::is_semiring< Ring >::value, + void >::type * const = nullptr + ) { +#ifdef _DEBUG + std::cout << "In grb::dot (ascend, semiring version)\n" + << "\t dispatches to monoid-operator version\n"; +#endif + return grb::dot< descr >( x, left, right, ring.getAdditiveMonoid(), + ring.getMultiplicativeOperator(), phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + typename Func, + typename DataType, + typename Coords + > + RC eWiseMap( const Func f, Vector< DataType, ascend, Coords > &x ) { + + RC ret = SUCCESS; + + const size_t n = internal::getCoordinates( x ).size(); + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = [f, &x] ( + internal::Pipeline &pipeline, const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage eWiseMap(f, x) in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_x; + const size_t local_n = upper_bound - lower_bound; + size_t local_x_nz = local_n; + bool sparse = false; + + bool already_dense_input_x = true; + + if( !dense_descr ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_input_x = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_input_x ) { +#else + already_dense_input_x = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); + if( local_x_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( sparse ) { + // the sparse case is possible only when the local coordinates are already + // initialized + assert( already_dense_input_x == false 
); + for( size_t k = 0; k < local_x_nz; ++k ) { + DataType &xval = internal::getRaw( x )[ local_x.index( k ) + lower_bound ]; + xval = f( xval ); + } + } else { + for( size_t i = lower_bound; i < upper_bound; ++i ) { + DataType &xval = internal::getRaw( x )[ i ]; + xval = f( xval ); + } + } + + return rc; + }; + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_EWISEMAP, + n, sizeof( DataType ), dense_descr, true, + &x, nullptr, &internal::getCoordinates( x ), nullptr, + nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: eWiseMap(f, x)" << std::endl; +#endif + return ret; + } + + namespace internal { + + template< + Descriptor descr = descriptors::no_operation, + typename Func, + typename DataType1, + typename DataType2, + typename Coords, + typename... Args + > + RC eWiseLambda_helper( + std::vector< const void * > all_vectors_ptr, + size_t maximum_data_type_size, + const Func f, + const Vector< DataType1, ascend, Coords > &x, + const Vector< DataType2, ascend, Coords > &y, + Args const &... args + ) { + // catch mismatch + if( size( x ) != size( y ) ) { + return MISMATCH; + } + + all_vectors_ptr.push_back( &y ); + maximum_data_type_size = std::max( maximum_data_type_size, sizeof( DataType2 ) ); + + // continue + return eWiseLambda_helper( all_vectors_ptr, maximum_data_type_size, f, x, + args... ); + } + + template< + Descriptor descr = descriptors::no_operation, + typename Func, + typename DataType, + typename Coords + > + RC eWiseLambda_helper( + std::vector< const void * > all_vectors_ptr, + size_t maximum_data_type_size, + const Func f, + const Vector< DataType, ascend, Coords > &x + ) { + // all pointers, except one, have been stored, and the last one will be + // stored by the normal eWiseLambda + return eWiseLambda< descr, Func, DataType, Coords >( f, x, all_vectors_ptr, + maximum_data_type_size ); + } + }; + + template< + Descriptor descr = descriptors::no_operation, + typename Func, + typename DataType1, + typename DataType2, + typename Coords, + typename... Args + > + RC eWiseLambda( + const Func f, + const Vector< DataType1, ascend, Coords > &x, + const Vector< DataType2, ascend, Coords > &y, + Args const &... 
args + ) { + + // create an empty vector to store pointers for all vectors passed to + // eWiseLambda + std::vector< const void * > all_vectors_ptr; + + // invoke the helper function to store the pointers + return internal::eWiseLambda_helper( all_vectors_ptr, 0, f, x, y, args...); + } + + template< + Descriptor descr = descriptors::no_operation, + typename Func, + typename DataType, + typename Coords + > + RC eWiseLambda( + const Func f, + const Vector< DataType, ascend, Coords > &x, + std::vector< const void * > all_vectors_ptr = std::vector< const void *>(), + size_t maximum_data_type_size = 0 + ) { +#ifdef _DEBUG + std::cout << "Info: entering eWiseLambda function on vectors.\n"; +#endif + + all_vectors_ptr.push_back( &x ); + maximum_data_type_size = + std::max( maximum_data_type_size, sizeof( DataType ) ); + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = [f, &x] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage eWiseLambda in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + Coords local_x; + const size_t local_n = upper_bound - lower_bound; + size_t local_x_nz; + bool sparse = false; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + + bool already_dense_output = true; + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_output ) { +#else + already_dense_output = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); + if( local_x_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( sparse ) { + if ( already_dense_output ) { + for( size_t k = 0; k < local_x_nz; ++k ) { + f( k + lower_bound ); + } + } else { + for( size_t k = 0; k < local_x_nz; ++k ) { + const size_t i = local_x.index( k ) + lower_bound; + f( i ); + } + } + } else { + for (size_t i = lower_bound; i < upper_bound; i++) { + f( i ); + } + } + + // the local coordinates for all vectors of eWiseLambda cannot change + // therefore, we don't need to invoke asyncJoinSubset for any of them + + return SUCCESS; + }; + + // eWiseLambda is a special case as we don't know which of the accessed + // vectors are read-only therefore, we assume that all vectors may be written, + // but the sparsity structure cannot change i.e., the coordinates of each + // vector cannot be updated, but we pass the coordinates of x for the loop + // size + ret = ret ? 
ret : internal::le.addeWiseLambdaStage( + std::move( func ), + internal::Opcode::BLAS1_EWISELAMBDA, + internal::getCoordinates( x ).size(), maximum_data_type_size, dense_descr, + all_vectors_ptr, &internal::getCoordinates( x ) + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: eWiseLambda" << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename InputType, + typename IOType, + typename MaskType, + typename Coords + > + RC foldl( + IOType &x, + const Vector< InputType, ascend, Coords > &y, + const Vector< MaskType, ascend, Coords > &mask, + const Monoid &monoid = Monoid(), + const typename std::enable_if< !grb::is_object< IOType >::value && + !grb::is_object< InputType >::value && + !grb::is_object< MaskType >::value && + grb::is_monoid< Monoid >::value, void + >::type * const = nullptr + ) { +#ifdef _DEBUG + std::cout << "foldl: IOType <- [InputType] with a monoid called. " + << "Array has size " << size( y ) << " with " << nnz( y ) << " nonzeroes. " + << "It has a mask of size " << size( mask ) << " with " << nnz( mask ) + << " nonzeroes.\n"; +#endif + + // static sanity checks + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< IOType, InputType >::value ), "grb::foldl", + "called with a scalar IO type that does not match the input vector type" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< InputType, typename Monoid::D1 >::value ), "grb::foldl", + "called with an input vector value type that does not match the first " + "domain of the given monoid" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< InputType, typename Monoid::D2 >::value ), "grb::foldl", + "called with an input vector type that does not match the second domain of " + "the given monoid" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< InputType, typename Monoid::D3 >::value ), "grb::foldl", + "called with an input vector type that does not match the third domain of " + "the given monoid" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), "grb::foldl", + "called with a vector mask type that is not boolean" ); + + if( size( mask ) > 0 ) { + return internal::template fold_from_vector_to_scalar_generic< + descr, true, true + >( x, y, mask, monoid ); + } else { + return internal::template fold_from_vector_to_scalar_generic< + descr, false, true + >( x, y, mask, monoid ); + } + } + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename IOType, + typename InputType, + typename Coords + > + RC foldl( + IOType &x, + const Vector< InputType, ascend, Coords > &y, + const Monoid &monoid = Monoid(), + const typename std::enable_if< + !grb::is_object< IOType >::value && + !grb::is_object< InputType >::value && + grb::is_monoid< Monoid >::value, void + >::type * const = nullptr + ) { +#ifdef _DEBUG + std::cout << "foldl: IOType <- [InputType] with a monoid called. " + << "Array has size " << size( y ) << " with " << nnz( y ) << " nonzeroes. 
" + << "It has no mask.\n"; +#endif + + // static sanity checks + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< IOType, InputType >::value ), "grb::reduce", + "called with a scalar IO type that does not match the input vector type" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< InputType, typename Monoid::D1 >::value ), "grb::reduce", + "called with an input vector value type that does not match the first " + "domain of the given monoid" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< InputType, typename Monoid::D2 >::value ), "grb::reduce", + "called with an input vector type that does not match the second domain of " + "the given monoid" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< InputType, typename Monoid::D3 >::value ), "grb::reduce", + "called with an input vector type that does not match the third domain of " + "the given monoid" ); + + // do reduction + Vector< bool, ascend, Coords > empty_mask( 0 ); + return internal::template fold_from_vector_to_scalar_generic< + descr, false, true + >( x, y, empty_mask, monoid ); + } + + template< + Descriptor descr = descriptors::no_operation, + typename T, + typename U, + typename Coords + > + RC zip( + Vector< std::pair< T, U >, ascend, Coords > &z, + const Vector< T, ascend, Coords > &x, + const Vector< U, ascend, Coords > &y, + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< T >::value && + !grb::is_object< U >::value, + void >::type * const = nullptr + ) { + const size_t n = size( z ); + if( n != size( x ) ) { + return MISMATCH; + } + if( n != size( y ) ) { + return MISMATCH; + } + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + const T * const x_raw = internal::getRaw( x ); + const U * const y_raw = internal::getRaw( y ); + std::pair< T, U > * z_raw = internal::getRaw( z ); + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = [&z, x_raw, y_raw, z_raw] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + std::cout << "\t\tExecution of stage zip(z, x, y) in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_z; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + bool already_dense_output = true; +#else + (void) pipeline; +#endif + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( z ) ); + if( !dense_descr && !already_dense_output ) { +#else + if( !dense_descr ) { +#endif + local_z = internal::getCoordinates( z ).asyncSubset( lower_bound, + upper_bound ); + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !dense_descr && !already_dense_output ) { +#else + if( !dense_descr ) { +#endif + // the result will always be dense + local_z.local_assignAllNotAlreadyAssigned(); + } + + for( size_t i = lower_bound; i < upper_bound; ++i ) { + z_raw[ i ].first = x_raw[ i ]; + z_raw[ i ].second = y_raw[ i ]; + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !dense_descr && !already_dense_output ) { +#else + if( !dense_descr ) { +#endif + internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? 
ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_ZIP, + internal::getCoordinates( x ).size(), sizeof( T ) + sizeof( U ), + dense_descr, true, + &z, nullptr, &internal::getCoordinates( z ), nullptr, + &x, &y, nullptr, nullptr, + &internal::getCoordinates( x ), &internal::getCoordinates( y ), + nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: zip(z, x, y)" << std::endl; +#endif + assert( false ); + return UNSUPPORTED; + } + + template< + Descriptor descr = descriptors::no_operation, + typename T, + typename U, + typename Coords + > + RC unzip( + Vector< T, ascend, Coords > &x, + Vector< U, ascend, Coords > &y, + const Vector< std::pair< T, U >, ascend, Coords > &in, + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< T >::value && + !grb::is_object< U >::value, + void >::type * const = nullptr + ) { + const size_t n = size( in ); + if( n != size( x ) ) { + return MISMATCH; + } + if( n != size( y ) ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + T * const x_raw = internal::getRaw( x ); + U * const y_raw = internal::getRaw( y ); + const std::pair< T, U > * in_raw = internal::getRaw( in ); + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = [&x, &y, x_raw, y_raw, in_raw] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage unzip(x, y, in) in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_x, local_y; + + bool already_dense_output_x = true; + bool already_dense_output_y = true; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output_x = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !dense_descr && !already_dense_output_x ) { +#else + if( !dense_descr ) { + already_dense_output_x = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x.local_assignAllNotAlreadyAssigned(); + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output_y = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !dense_descr && !already_dense_output_y ) { +#else + if( !dense_descr ) { + already_dense_output_y = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); + local_y.local_assignAllNotAlreadyAssigned(); + } + + for( size_t i = lower_bound; i < upper_bound; ++i ) { + x_raw[ i ] = in_raw[ i ].first; + y_raw[ i ] = in_raw[ i ].second; + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !dense_descr && !already_dense_output_x ) { +#else + if( !dense_descr ) { +#endif + internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound, + upper_bound ); + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !dense_descr && !already_dense_output_y ) { +#else + if( !dense_descr ) { +#endif + internal::getCoordinates( y ).asyncJoinSubset( local_y, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? 
ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_UNZIP, + internal::getCoordinates( x ).size(), std::max( sizeof( T ), sizeof( U ) ), + dense_descr, true, + &x, &y, + &internal::getCoordinates( x ), &internal::getCoordinates( y ), + &in, nullptr, nullptr, nullptr, + &internal::getCoordinates( in ), nullptr, nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: unzip(x, y, in)" << std::endl; +#endif + assert( false ); + return UNSUPPORTED; + } + +/** @} */ +// ^-- ends BLAS-1 NB module + +} // end namespace ``grb'' + +#undef NO_CAST_ASSERT +#undef NO_CAST_OP_ASSERT + +#endif // end `_H_GRB_ASCEND_BLAS1' + diff --git a/include/graphblas/ascend/blas2.hpp b/include/graphblas/ascend/blas2.hpp new file mode 100644 index 000000000..8f764bf8d --- /dev/null +++ b/include/graphblas/ascend/blas2.hpp @@ -0,0 +1,1552 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Defines the Ascend level-2 primitives + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ASCEND_BLAS2 +#define _H_GRB_ASCEND_BLAS2 + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include + +#include "coordinates.hpp" +#include "forward.hpp" +#include "matrix.hpp" +#include "vector.hpp" +#include "boolean_dispatcher_blas2.hpp" + +#ifdef _DEBUG + #include "spmd.hpp" +#endif + +#define NO_CAST_ASSERT( x, y, z ) \ + static_assert( x, \ + "\n\n" \ + "****************************************************************" \ + "****************************************************************" \ + "**************************************\n" \ + "* ERROR | " y " " z ".\n" \ + "****************************************************************" \ + "****************************************************************" \ + "**************************************\n" \ + "* Possible fix 1 | Remove no_casting from the template " \ + "parameters in this call to " y ".\n" \ + "* Possible fix 2 | Provide objects with element types or " \ + "domains that match the expected type.\n" \ + "****************************************************************" \ + "****************************************************************" \ + "**************************************\n" ); + + +namespace grb { + + namespace internal { + + extern LazyEvaluation le; + + } +} + +namespace grb { + + /** + * \addtogroup ascend + * @{ + */ + + // put the generic mxv implementation in an internal namespace + namespace internal { + + template< + bool output_dense, + bool left_handed, + class AdditiveMonoid, + class Multiplication, + template< typename > class One, + typename IOType, + typename InputType, + typename SourceType, + typename Coords + > + class addIdentityDuringMV< + ascend, true, output_dense, left_handed, + AdditiveMonoid, Multiplication, One, + IOType, InputType, SourceType, Coords + > { + + public: + + static void apply( + 
Vector< IOType, ascend, Coords > &destination_vector, + IOType * __restrict__ const &destination, + const size_t &destination_range, + const size_t &source_index, + const AdditiveMonoid &add, + const Multiplication &mul, + const SourceType &input_element, + const std::function< size_t( size_t ) > &src_local_to_global, + const std::function< size_t( size_t ) > &dst_global_to_local + ) { + + } + }; + + template< + Descriptor descr, + bool masked, + bool input_masked, + bool left_handed, + template< typename > class One, +#ifdef GRB_BOOLEAN_DISPATCHER + bool already_dense_destination_vector, + bool already_dense_mask_vector, +#endif + class AdditiveMonoid, + class Multiplication, + typename IOType, + typename InputType1, + typename InputType2, + typename InputType3, + typename InputType4, + typename Coords, + typename RowColType, + typename NonzeroType + > + inline void vxm_inner_kernel_gather( +#ifndef GRB_BOOLEAN_DISPATCHER + bool already_dense_destination_vector, + bool already_dense_mask_vector, +#endif + RC &rc, + const size_t lower_bound, + Coords &local_destination_vector, + const Coords &local_mask_vector, + Vector< IOType, ascend, Coords > &destination_vector, + IOType &destination_element, + const size_t &destination_index, + const Vector< InputType1, ascend, Coords > &source_vector, + const InputType1 * __restrict__ const &source, + const size_t &source_range, + const internal::Compressed_Storage< + InputType2, RowColType, NonzeroType + > &matrix, + const Vector< InputType3, ascend, Coords > &mask_vector, + const InputType3 * __restrict__ const &mask, + const Vector< InputType4, ascend, Coords > &source_mask_vector, + const InputType4 * __restrict__ const &source_mask, + const AdditiveMonoid &add, + const Multiplication &mul, + const std::function< size_t( size_t ) > &src_local_to_global, + const std::function< size_t( size_t ) > &src_global_to_local, + const std::function< size_t( size_t ) > &dst_local_to_global + ) { +#ifndef _DEBUG + (void) destination_vector; +#endif + constexpr bool add_identity = descr & descriptors::add_identity; + constexpr bool dense_hint = descr & descriptors::dense; + constexpr bool explicit_zero = descr & descriptors::explicit_zero; +#ifdef _DEBUG + constexpr bool use_index = descr & descriptors::use_index; +#endif + assert( rc == SUCCESS ); + + // check whether we should compute output here + if( masked ) { + if( already_dense_mask_vector ) { + if( !internal::getCoordinates( mask_vector ).template + mask< descr >( destination_index, mask ) + ) { +#ifdef _DEBUG + std::cout << "Masks says to skip processing destination index " << + destination_index << "\n"; +#endif + return; + } + } else { + if( !local_mask_vector.template + mask< descr >( destination_index - lower_bound, mask ) + ) { +#ifdef _DEBUG + std::cout << "Masks says to skip processing destination index " << + destination_index << "\n"; +#endif + return; + } + } + } + + // take shortcut, if possible + if( grb::has_immutable_nonzeroes< AdditiveMonoid >::value && ( + already_dense_destination_vector || + local_destination_vector.assigned( destination_index - lower_bound ) + ) && destination_element != add.template getIdentity< IOType >() + ) { + return; + } + + // start output + typename AdditiveMonoid::D3 output = + add.template getIdentity< typename AdditiveMonoid::D3 >(); + bool set = false; + + // if we need to add identity, do so first: + if( add_identity ) { + const size_t id_location = src_global_to_local( dst_local_to_global( + destination_index ) ); + // the SpMV primitive may access 
non-local elements, and thus referring to + // the input vector by using local coordinates is incorrect + // the input vector of an SpMV cannot be updated, i.e., written, by another + // primitive executed in the same pipeline with the current SpMV + // therefore, in the current design, it's safe to use global coordinates for + // the input vector + if( ( !input_masked || + internal::getCoordinates( source_mask_vector ).template + mask< descr >( id_location, source_mask ) + ) && id_location < source_range + ) { + if( dense_hint || internal::getCoordinates( source_vector ).assigned( id_location ) ) { + typename AdditiveMonoid::D1 temp; + internal::CopyOrApplyWithIdentity< + !left_handed, typename AdditiveMonoid::D1, InputType1, One + >::set( temp, source_vector[ id_location ], mul ); + internal::CopyOrApplyWithIdentity< + false, typename AdditiveMonoid::D3, typename AdditiveMonoid::D1, + AdditiveMonoid::template Identity + >::set( output, temp, add ); + set = true; + } + } + } + + // handle row or column at destination_index + // NOTE: This /em could be parallelised, but will probably only slow things + // down +#ifdef _DEBUG + std::cout << "vxm_gather: processing destination index " << destination_index << " / " + << internal::getCoordinates( destination_vector ).size() + << ". Input matrix has " << ( matrix.col_start[ destination_index + 1 ] - + matrix.col_start[ destination_index ] ) << " nonzeroes.\n"; +#endif + for( + size_t k = matrix.col_start[ destination_index ]; + rc == SUCCESS && + k < static_cast< size_t >( matrix.col_start[ destination_index + 1 ] ); + ++k + ) { + // declare multiplication output field + typename Multiplication::D3 result = + add.template getIdentity< typename AdditiveMonoid::D3 >(); + // get source index + const size_t source_index = matrix.row_index[ k ]; + // check mask + if( input_masked && + !internal::getCoordinates( source_mask_vector ).template + mask< descr >( source_index, source_mask ) + ) { +#ifdef _DEBUG + std::cout << "\t vxm_gather: skipping source index " << source_index + << " due to input mask\n"; +#endif + continue; + } + // check for sparsity at source + if( !dense_hint ) { + if( !internal::getCoordinates( source_vector ).assigned( source_index ) ) { +#ifdef _DEBUG + std::cout << "\t vxm_gather: Skipping out of computation with source " + << "index " << source_index << " since it does not contain a nonzero\n"; +#endif + continue; + } + } + // get nonzero + typedef typename std::conditional< + left_handed, + typename Multiplication::D2, + typename Multiplication::D1 + >::type RingNonzeroType; + const RingNonzeroType nonzero = + matrix.template getValue( k, One< RingNonzeroType >::value() ); +#ifdef _DEBUG + std::cout << "\t vxm_gather: interpreted nonzero is " << nonzero << ", " + << "which is the " << k << "-th nonzero and has source index " + << source_index << "\n"; +#endif + // check if we use source element or whether we use its index value instead + typedef typename std::conditional< + left_handed, + typename Multiplication::D1, + typename Multiplication::D2 + >::type SourceType; + const SourceType apply_source = internal::ValueOrIndex< + descr, SourceType, InputType1 + >::getFromArray( source, src_local_to_global, source_index ); +#ifdef _DEBUG + if( use_index ) { + std::cout << "\t vxm_gather (use_index descriptor): apply( output, matrix " + << "nonzero, vector nonzero, * ) = apply( "; + } else { + std::cout << "\t vxm_gather: apply( output, matrix nonzero, vector " + << "nonzero, * ) = apply( "; + } + std::cout << " output, " << 
nonzero << ", " << source << ", * )\n"; +#endif + //multiply + internal::leftOrRightHandedMul< + left_handed, typename Multiplication::D3, + SourceType, RingNonzeroType, Multiplication + >::mul( result, apply_source, nonzero, mul ); +#ifdef _DEBUG + std::cout << "\t vxm_gather: output (this nonzero) = " << result << "\n"; +#endif + + // accumulate +#ifdef _DEBUG + std::cout << "\t vxm_gather: foldr( " << result << ", " << output + << ", + );\n"; +#endif + rc = foldr( result, output, add.getOperator() ); +#ifdef _DEBUG + std::cout << "\t vxm_gather: output (sum at destination) = " << output + << "\n"; +#endif + set = true; + + // sanity check (but apply cannot fail) + assert( rc == SUCCESS ); + } + +#ifdef _DEBUG + if( set ) { + std::cout << "\t vxm_gather: local contribution to this output element at " + << "index " << destination_index << " will be " << output << " " + << "and this corresponds to an explicitly set nonzero.\n"; + } else { + std::cout << "\t vxm_gather: local contribution to this output element at " + << "index " << destination_index << " will be " << output << " and this " + << "is an unset value.\n"; + if( already_dense_destination_vector || + local_destination_vector.assigned( destination_index - lower_bound ) + ) { + std::cout << "\t(old value " << destination_element << " will remain " + << "unmodified.)\n"; + } else { + std::cout << "\t(no old value existed so the output vector will remain " + << "unset at this index.)\n"; + } + } +#endif + // finally, accumulate in output + if( explicit_zero || set ) { +#ifdef _DEBUG + std::cout << "\taccumulating " << output << " into output vector...\n"; +#endif + if( already_dense_destination_vector || + local_destination_vector.assign( destination_index - lower_bound ) + ) { +#ifdef _DEBUG + std::cout << "\tfoldl( " << destination_element << ", " << output << ", " + << "add.getOperator() );, destination_element = "; +#endif + rc = foldl( destination_element, output, add.getOperator() ); +#ifdef _DEBUG + std::cout << destination_element << "\n"; +#endif + } else { +#ifdef _DEBUG + std::cout << "\toutput vector element was previously not set. 
Old " + << "(possibly uninitialised value) " << destination_element << " will " + << "now be set to " << output << ", result (after, possibly, casting): "; +#endif + destination_element = static_cast< IOType >( output ); +#ifdef _DEBUG + std::cout << destination_element << "\n"; +#endif + } + } + } + + template< + Descriptor descr, + bool masked, + bool input_masked, + bool left_handed, + bool using_semiring, + template< typename > class One, + class AdditiveMonoid, + class Multiplication, + typename IOType, + typename InputType1, + typename InputType2, + typename InputType3, + typename InputType4, + typename RIT, + typename CIT, + typename NIT, + typename Coords + > + RC vxm_generic( + Vector< IOType, ascend, Coords > &u, + const Vector< InputType3, ascend, Coords > &mask, + const Vector< InputType1, ascend, Coords > &v, + const Vector< InputType4, ascend, Coords > &v_mask, + const Matrix< InputType2, ascend, RIT, CIT, NIT > &A, + const AdditiveMonoid &add, + const Multiplication &mul, + const Phase &phase, + const std::function< size_t( size_t ) > row_l2g, + const std::function< size_t( size_t ) > row_g2l, + const std::function< size_t( size_t ) > col_l2g, + const std::function< size_t( size_t ) > col_g2l + ) { + // type sanity checking + NO_CAST_ASSERT( ( descr > internal::MAX_DESCRIPTOR_VALUE || + !(descr & descriptors::no_casting) || + std::is_same< InputType3, bool >::value + ), "vxm (any variant)", + "Mask type is not boolean" ); + NO_CAST_ASSERT( ( descr > internal::MAX_DESCRIPTOR_VALUE || + !(descr & descriptors::no_casting) || + !left_handed || + std::is_same< InputType1, typename Multiplication::D1 >::value + ), "vxm (any variant)", + "Input vector type does not match multiplicative operator first " + "input domain" ); + NO_CAST_ASSERT( ( descr > internal::MAX_DESCRIPTOR_VALUE || + !(descr & descriptors::no_casting) || + left_handed || + std::is_same< InputType2, typename Multiplication::D1 >::value + ), "vxm (any variant)", + "Input vector type does not match multiplicative operator second " + "input domain" ); + NO_CAST_ASSERT( ( descr > internal::MAX_DESCRIPTOR_VALUE || + !(descr & descriptors::no_casting) || + !left_handed || + std::is_same< InputType2, typename Multiplication::D2 >::value + ), "vxm (any variant)", + "Input matrix type does not match multiplicative operator second " + "input domain" ); + NO_CAST_ASSERT( ( descr > internal::MAX_DESCRIPTOR_VALUE || + !(descr & descriptors::no_casting) || + left_handed || + std::is_same< InputType1, typename Multiplication::D2 >::value + ), "vxm (any variant)", + "Input matrix type does not match multiplicative operator first " + "input domain" ); + + RC ret = SUCCESS; + +#ifdef _DEBUG + const auto s = spmd< ascend >::pid(); + std::cout << s << ": ascend vxm called with a " + << descriptors::toString( descr ) << "\n"; +#endif + + // get input and output vector sizes + const size_t m = internal::getCoordinates( u ).size(); + const size_t n = internal::getCoordinates( v ).size(); + + // get whether the matrix should be transposed prior to execution of this + // vector-times-matrix operation + constexpr bool transposed = descr & descriptors::transpose_matrix; + + // check for dimension mismatch + if( ( transposed && ( n != ncols( A ) || m != nrows( A ) ) ) + || ( !transposed && ( n != nrows( A ) || m != ncols( A ) ) ) ) { +#ifdef _DEBUG + std::cout << "Mismatch of columns ( " << n << " vs. " << ncols( A ) + << " ) or rows ( " << m << " vs. 
" << nrows( A ) << " ) with " + << "transposed value " << ((int)transposed) << "\n"; +#endif + return MISMATCH; + } + + // check density + if( descr & descriptors::dense ) { + // it's safe to check the number of nonzeroes for the input vector and its + // mask since both of them are read-only in the current design for + // ascend execution + if( nnz( v ) < size( v ) ) { +#ifdef _DEBUG + std::cout << "\t Dense descriptor given but input vector was sparse\n"; +#endif + return ILLEGAL; + } + if( size( v_mask ) > 0 && nnz( v_mask ) < size( v_mask ) ) { +#ifdef _DEBUG + std::cout << "\t Dense descriptor given but input mask has sparse " + << "structure\n"; +#endif + return ILLEGAL; + } + } + + // check mask + if( masked ) { + if( (transposed && internal::getCoordinates( mask ).size() != nrows( A ) ) || + ( !transposed && internal::getCoordinates( mask ).size() != ncols( A ) ) + ) { +#ifdef _DEBUG + std::cout << "Mismatch of mask size ( " + << internal::getCoordinates( mask ).size() << " ) versus matrix rows " + << "or columns ( " << nrows( A ) << " or " << ncols( A ) << " with " + << "transposed value " << ((int)transposed) << "\n"; +#endif + return MISMATCH; + } + } + + // handle resize phase + if( phase == RESIZE ) { + return SUCCESS; + } + + // get raw pointers + assert( phase == EXECUTE ); + const InputType1 * __restrict__ const x = internal::getRaw( v ); + const InputType3 * __restrict__ const z = internal::getRaw( mask ); + const InputType4 * __restrict__ const vm = internal::getRaw( v_mask ); + IOType * __restrict__ const y = internal::getRaw( u ); + + // check for illegal arguments + if( !(descr & descriptors::safe_overlap) && + reinterpret_cast< const void * >( y ) == + reinterpret_cast< const void * >( x ) + ) { + std::cerr << "Warning: grb::internal::vxm_generic called with overlapping " + << "input and output vectors.\n"; + return OVERLAP; + } + if( masked && (reinterpret_cast(y) == + reinterpret_cast(z)) + ) { + std::cerr << "Warning: grb::internal::vxm_generic called with overlapping " + << "mask and output vectors.\n"; + return OVERLAP; + } + +#ifdef _DEBUG + std::cout << s << ": performing SpMV / SpMSpV using an " << nrows( A ) + << " by " << ncols( A ) << " matrix holding " << nnz( A ) + << " nonzeroes.\n"; +#endif + + // in the current design for ascend execution, the input vectors of + // vxm_generic // cannot be overwritten by another stage of the same + // pipeline, and therefore, it's safe to rely on the global coordinates of + // the input vectors, as they are read-only this property is of special + // importance when handling matrices of size "m" x "n" since the mismatch + // between "m" and "n" requires special handling for the local coordinates of + // the input vectors, the current design relies on the size of the output + // vector which should match the sizes of all other vectors in the pipeline + // the size of the input vector does not have to match the size of the other + // vectors as long as the input vectors are read-only + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = [ + &u, &mask, &v, &v_mask, &A, &add, &mul, + row_l2g, row_g2l, col_l2g, col_g2l, + y, x, z, vm +#ifdef _DEBUG + , s +#endif + ] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage vxm_generic in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + (void) pipeline; + + RC rc = 
SUCCESS; + + Coords local_u, local_mask; + const size_t local_n = upper_bound - lower_bound; + size_t local_mask_nz = local_n; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + constexpr const bool already_dense_vectors = dense_descr; +#endif + + bool already_dense_output = true; + bool already_dense_output_mask = true; + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( u ) ); + if( !already_dense_output ) { +#else + already_dense_output = false; +#endif + local_u = internal::getCoordinates( u ).asyncSubset( lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + if( masked ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output_mask = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( mask ) ); + if( !already_dense_output_mask ) { +#else + already_dense_output_mask = false; +#endif + local_mask = internal::getCoordinates( mask ).asyncSubset( lower_bound, + upper_bound ); + local_mask_nz = local_mask.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + } + + // check if transpose is required + if( descr & descriptors::transpose_matrix ) { + // start compute u=vA^T +#ifdef _DEBUG + std::cout << s << ": in u=vA^T=Av variant\n"; +#endif + + // start u=vA^T using CRS + // matrix = &(A.CRS); + // TODO internal issue #193 + if( !masked || (descr & descriptors::invert_mask) ) { + // loop over all columns of the input matrix (can be done in parallel): +#ifdef _DEBUG + std::cout << s << ": in full CRS variant (gather)\n"; +#endif + + for( size_t i = lower_bound; i < upper_bound; i++ ) { +#ifdef GRB_BOOLEAN_DISPATCHER + boolean_dispatcher_vxm_inner_kernel_gather< +#else + vxm_inner_kernel_gather< +#endif + descr, masked, input_masked, left_handed, One + >( + already_dense_output, already_dense_output_mask, + rc, lower_bound, local_u, local_mask, + u, y[ i ], i, v, x, nrows( A ), internal::getCRS( A ), + mask, z, v_mask, vm, add, mul, + row_l2g, col_l2g, col_g2l + ); + } + + } else { +#ifdef _DEBUG + std::cout << s << ": in masked CRS variant (gather). Mask has " + << local_mask_nz << " nonzeroes and size " << local_n << ":\n"; + for( size_t k = 0; k < local_mask_nz; ++k ) { + std::cout << " " + << ( ( already_dense_output_mask ? k : local_mask.index( k ) ) + + lower_bound ); + } + std::cout << "\n"; +#endif + assert( masked ); + + for( size_t k = 0; k < local_mask_nz; ++k ) { + const size_t i = + ( already_dense_output_mask ? 
k : local_mask.index( k ) ) + + lower_bound; + assert( i < nrows(A) ); + +#ifdef GRB_BOOLEAN_DISPATCHER + boolean_dispatcher_vxm_inner_kernel_gather< +#else + vxm_inner_kernel_gather< +#endif + descr, false, input_masked, left_handed, One + >( + already_dense_output, already_dense_output_mask, + rc, lower_bound, local_u, local_mask, + u, y[ i ], i, v, x, nrows( A ), internal::getCRS( A ), + mask, z, v_mask, vm, add, mul, + row_l2g, col_l2g, col_g2l + ); + } + } + // end compute u=vA^T + } else { +#ifdef _DEBUG + std::cout << s << ": in u=vA=A^Tv variant\n"; +#endif + // start u=vA using CCS +#ifdef _DEBUG + std::cout << s << ": in column-major vector times matrix variant (u=vA)\n" + << "\t(this variant relies on the gathering inner kernel)\n"; +#endif + + // if not transposed, then CCS is the data structure to go: + // TODO internal issue #193 + if( !masked || (descr & descriptors::invert_mask) ) { +#ifdef _DEBUG + std::cout << s << ": loop over all input matrix columns\n"; +#endif + + for( size_t j = lower_bound; j < upper_bound; j++ ) { +#ifdef GRB_BOOLEAN_DISPATCHER + boolean_dispatcher_vxm_inner_kernel_gather< +#else + vxm_inner_kernel_gather< +#endif + descr, masked, input_masked, left_handed, One + >( + already_dense_output, already_dense_output_mask, + rc, lower_bound, local_u, local_mask, + u, y[ j ], j, v, x, nrows( A ), internal::getCCS( A ), + mask, z, v_mask, vm, add, mul, + row_l2g, row_g2l, col_l2g + ); + } + } else { + // loop only over the nonzero masks (can still be done in parallel!) +#ifdef _DEBUG + std::cout << s << ": loop over mask indices\n"; +#endif + assert( masked ); + + for( size_t k = 0; k < local_mask_nz; ++k ) { + const size_t j = + ( already_dense_output_mask ? k : local_mask.index( k ) ) + lower_bound; +#ifdef GRB_BOOLEAN_DISPATCHER + boolean_dispatcher_vxm_inner_kernel_gather< +#else + vxm_inner_kernel_gather< +#endif + descr, masked, input_masked, left_handed, One + >( + already_dense_output, already_dense_output_mask, + rc, lower_bound, local_u, local_mask, + u, y[ j ], j, v, x, nrows( A ), internal::getCCS( A ), + mask, z, v_mask, vm, add, mul, + row_l2g, row_g2l, col_l2g + ); + } + } + // end computing u=vA + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#else + if( !already_dense_vectors ) { +#endif + internal::getCoordinates( u ).asyncJoinSubset( local_u, lower_bound, + upper_bound ); + } + + return rc; + }; + + // since the local coordinates are never used for the input vector and the + // input mask they are added only for verification of legal usage of the + // dense descriptor + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS2_VXM_GENERIC, + size( u ), sizeof( IOType ), dense_descr, true, + &u, nullptr, &internal::getCoordinates( u ), nullptr, + &v, + masked ? &mask : nullptr, + input_masked ? &v_mask : nullptr, + nullptr, + &internal::getCoordinates( v ), + masked ? &internal::getCoordinates( mask ) : nullptr, + input_masked ? 
&internal::getCoordinates( v_mask ) : nullptr, + nullptr, + &A + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: vxm_generic" << std::endl; +#endif + +#ifdef _DEBUG + std::cout << s << ": exiting SpMV / SpMSpV.\n" << std::flush; +#endif + return ret; + } + + } // namespace internal + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename IOType, + typename InputType1, + typename InputType2, + typename InputType3, + typename RIT, + typename CIT, + typename NIT, + typename Coords + > + RC vxm( + Vector< IOType, ascend, Coords > &u, + const Vector< InputType3, ascend, Coords > &mask, + const Vector< InputType1, ascend, Coords > &v, + const Matrix< InputType2, ascend, RIT, CIT, NIT > &A, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + grb::is_semiring< Ring >::value, void + >::type * const = nullptr + ) { + const Vector< bool, ascend, Coords > empty_mask( 0 ); + return vxm< descr, true, false >( u, mask, v, empty_mask, A, ring, phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + class AdditiveMonoid, + class MultiplicativeOperator, + typename IOType, + typename InputType1, + typename InputType2, + typename InputType3, + typename RIT, + typename CIT, + typename NIT, + typename Coords + > + RC vxm( + Vector< IOType, ascend, Coords > &u, + const Vector< InputType3, ascend, Coords > &mask, + const Vector< InputType1, ascend, Coords > &v, + const Matrix< InputType2, ascend, RIT, CIT, NIT > &A, + const AdditiveMonoid &add = AdditiveMonoid(), + const MultiplicativeOperator &mul = MultiplicativeOperator(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + grb::is_monoid< AdditiveMonoid >::value && + grb::is_operator< MultiplicativeOperator >::value && + !grb::is_object< IOType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + !std::is_same< InputType2, void >::value, + void >::type * const = nullptr + ) { + const grb::Vector< bool, ascend, Coords > empty_mask( 0 ); + return vxm< descr, true, false >( u, mask, v, empty_mask, A, add, mul, + phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + bool output_may_be_masked = true, + bool input_may_be_masked = true, + class Ring, + typename IOType, + typename InputType1, + typename InputType2, + typename InputType3, + typename InputType4, + typename RIT, + typename CIT, + typename NIT, + typename Coords + > + RC vxm( + Vector< IOType, ascend, Coords > &u, + const Vector< InputType3, ascend, Coords > &mask, + const Vector< InputType1, ascend, Coords > &v, + const Vector< InputType4, ascend, Coords > &v_mask, + const Matrix< InputType2, ascend, RIT, CIT, NIT > &A, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + grb::is_semiring< Ring >::value, + void >::type * const = nullptr + ) { + constexpr bool left_sided = true; + if( output_may_be_masked && size( v_mask ) == 0 && size( mask ) > 0 ) { + + return internal::vxm_generic< + descr, true, false, left_sided, true, Ring::template One + >( + u, mask, v, v_mask, A, + ring.getAdditiveMonoid(), ring.getMultiplicativeOperator(), + phase, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + } + ); + } else if( input_may_be_masked && size( mask ) == 0 && size( v_mask ) > 0 ) { + return internal::vxm_generic< 
+ descr, false, true, left_sided, true, Ring::template One + >( + u, mask, v, v_mask, A, + ring.getAdditiveMonoid(), ring.getMultiplicativeOperator(), + phase, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + } + ); + } else if( output_may_be_masked && input_may_be_masked && size( mask ) > 0 && size( v_mask ) > 0 ) { + return internal::vxm_generic< + descr, true, true, left_sided, true, Ring::template One + >( + u, mask, v, v_mask, A, + ring.getAdditiveMonoid(), ring.getMultiplicativeOperator(), + phase, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + } + ); + } else { + assert( size( mask ) == 0 ); + assert( size( v_mask ) == 0 ); + return internal::vxm_generic< + descr, false, false, left_sided, true, Ring::template One + >( + u, mask, v, v_mask, A, + ring.getAdditiveMonoid(), ring.getMultiplicativeOperator(), + phase, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + } + ); + } + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename Coords, + typename RIT, + typename CIT, + typename NIT, + typename IOType = typename Ring::D4, + typename InputType1 = typename Ring::D1, + typename InputType2 = typename Ring::D2 + > + RC vxm( + Vector< IOType, ascend, Coords > &u, + const Vector< InputType1, ascend, Coords > &v, + const Matrix< InputType2, ascend, RIT, CIT, NIT > &A, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + grb::is_semiring< Ring >::value, void + >::type * const = nullptr + ) { + const Vector< bool, ascend, Coords > empty_mask( 0 ); + return vxm< descr, false, false >( u, empty_mask, v, empty_mask, A, ring, + phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + class AdditiveMonoid, + class MultiplicativeOperator, + typename IOType, + typename InputType1, + typename InputType2, + typename RIT, + typename CIT, + typename NIT, + typename Coords + > + RC vxm( + Vector< IOType, ascend, Coords > &u, + const Vector< InputType1, ascend, Coords > &v, + const Matrix< InputType2, ascend, RIT, CIT, NIT > &A, + const AdditiveMonoid &add = AdditiveMonoid(), + const MultiplicativeOperator &mul = MultiplicativeOperator(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + grb::is_monoid< AdditiveMonoid >::value && + grb::is_operator< MultiplicativeOperator >::value && + !grb::is_object< IOType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !std::is_same< InputType2, void >::value, + void >::type * const = nullptr + ) { + const Vector< bool, ascend, Coords > empty_mask( 0 ); + return vxm< descr, false, false >( u, empty_mask, v, empty_mask, A, add, mul, + phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename Coords, + typename RIT, + typename CIT, + typename NIT, + typename IOType = typename Ring::D4, + typename InputType1 = typename Ring::D1, + typename InputType2 = typename Ring::D2, + typename InputType3 = bool + > + RC mxv( + Vector< IOType, ascend, Coords > &u, + const Vector< InputType3, ascend, Coords > &mask, + const Matrix< InputType2, ascend, RIT, CIT, NIT > &A, + const Vector< InputType1, ascend, Coords > &v, + const Ring &ring, + 
const Phase &phase = EXECUTE, + const typename std::enable_if< + grb::is_semiring< Ring >::value, void + >::type * const = nullptr + ) { + const Vector< bool, ascend, Coords > empty_mask( 0 ); + return mxv< descr, true, false >( u, mask, A, v, empty_mask, ring, phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + bool output_may_be_masked = true, + bool input_may_be_masked = true, + class Ring, + typename IOType, + typename InputType1, + typename InputType2, + typename InputType3, + typename InputType4, + typename RIT, + typename CIT, + typename NIT, + typename Coords + > + RC mxv( + Vector< IOType, ascend, Coords > &u, + const Vector< InputType3, ascend, Coords > &mask, + const Matrix< InputType2, ascend, RIT, CIT, NIT > &A, + const Vector< InputType1, ascend, Coords > &v, + const Vector< InputType4, ascend, Coords > &v_mask, + const Ring &ring, + const Phase &phase = EXECUTE, + const typename std::enable_if< + grb::is_semiring< Ring >::value, + void >::type * const = nullptr + ) { + constexpr Descriptor new_descr = descr ^ descriptors::transpose_matrix; + constexpr bool left_sided = false; + if( output_may_be_masked && ( size( v_mask ) == 0 && size( mask ) > 0 ) ) { + + return internal::vxm_generic< + new_descr, true, false, left_sided, true, Ring::template One + >( + u, mask, v, v_mask, A, + ring.getAdditiveMonoid(), ring.getMultiplicativeOperator(), + phase, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + } + ); + } else if( input_may_be_masked && ( size( mask ) == 0 && + size( v_mask ) > 0 ) + ) { + return internal::vxm_generic< + new_descr, false, true, left_sided, true, Ring::template One + >( + u, mask, v, v_mask, A, + ring.getAdditiveMonoid(), ring.getMultiplicativeOperator(), + phase, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + } + ); + } else if( output_may_be_masked && input_may_be_masked && size( mask ) > 0 && + size( v_mask ) > 0 + ) { + return internal::vxm_generic< + new_descr, true, true, left_sided, true, Ring::template One + >( + u, mask, v, v_mask, A, + ring.getAdditiveMonoid(), ring.getMultiplicativeOperator(), + phase, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + } + ); + } else { + assert( size( mask ) == 0 ); + assert( size( v_mask ) == 0 ); + return internal::vxm_generic< + new_descr, false, false, left_sided, true, Ring::template One + >( + u, mask, v, v_mask, A, + ring.getAdditiveMonoid(), ring.getMultiplicativeOperator(), + phase, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + } + ); + } + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename Coords, + typename RIT, + typename CIT, + typename NIT, + typename IOType = typename Ring::D4, + typename InputType1 = typename Ring::D1, + typename InputType2 = typename Ring::D2 + > + RC mxv( + Vector< IOType, ascend, Coords > &u, + const Matrix< InputType2, ascend, RIT, CIT, NIT > &A, + const Vector< InputType1, ascend, Coords > &v, + const Ring &ring, + const Phase &phase = EXECUTE, + const typename std::enable_if< + grb::is_semiring< Ring >::value, + void >::type * const = nullptr + ) { + const 
Vector< bool, ascend, Coords > empty_mask( 0 ); + return mxv< descr, false, false >( u, empty_mask, A, v, empty_mask, ring, + phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + class AdditiveMonoid, + class MultiplicativeOperator, + typename IOType, + typename InputType1, + typename InputType2, + typename RIT, + typename CIT, + typename NIT, + typename Coords + > + RC mxv( + Vector< IOType, ascend, Coords > &u, + const Matrix< InputType2, ascend, RIT, CIT, NIT > &A, + const Vector< InputType1, ascend, Coords > &v, + const AdditiveMonoid &add = AdditiveMonoid(), + const MultiplicativeOperator &mul = MultiplicativeOperator(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + grb::is_monoid< AdditiveMonoid >::value && + grb::is_operator< MultiplicativeOperator >::value && + !grb::is_object< IOType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !std::is_same< InputType2, void >::value, + void >::type * const = nullptr + ) { + const Vector< bool, ascend, Coords > empty_mask( 0 ); + return mxv< descr, false, false >( u, empty_mask, A, v, empty_mask, add, mul, + phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + bool output_may_be_masked = true, + bool input_may_be_masked = true, + class AdditiveMonoid, + class MultiplicativeOperator, + typename IOType, + typename InputType1, + typename InputType2, + typename InputType3, + typename InputType4, + typename RIT, + typename CIT, + typename NIT, + typename Coords + > + RC vxm( + Vector< IOType, ascend, Coords > &u, + const Vector< InputType3, ascend, Coords > &mask, + const Vector< InputType1, ascend, Coords > &v, + const Vector< InputType4, ascend, Coords > &v_mask, + const Matrix< InputType2, ascend, RIT, CIT, NIT > &A, + const AdditiveMonoid &add = AdditiveMonoid(), + const MultiplicativeOperator &mul = MultiplicativeOperator(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + grb::is_monoid< AdditiveMonoid >::value && + grb::is_operator< MultiplicativeOperator >::value && + !grb::is_object< IOType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + !grb::is_object< InputType4 >::value && + !std::is_same< InputType2, void >::value, + void >::type * const = nullptr + ) { + static_assert( !(descr & descriptors::add_identity), "Cannot add an " + "identity if no concept of `one' is known. Suggested fix: use a semiring " + "instead." 
); + constexpr bool left_sided = true; + if( output_may_be_masked && size( v_mask ) == 0 && size( mask ) > 0 ) { + return internal::vxm_generic< + descr, true, false, left_sided, false, AdditiveMonoid::template Identity + >( + u, mask, v, v_mask, A, add, mul, phase, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + } + ); + } else if( input_may_be_masked && size( v_mask ) > 0 && size( mask ) == 0 ) { + return internal::vxm_generic< + descr, false, true, left_sided, false, AdditiveMonoid::template Identity + >( + u, mask, v, v_mask, A, add, mul, phase, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + } + ); + } else if( output_may_be_masked && input_may_be_masked && size( mask ) > 0 && + size( v_mask ) > 0 + ) { + return internal::vxm_generic< + descr, true, true, left_sided, false, AdditiveMonoid::template Identity + >( + u, mask, v, v_mask, A, add, mul, phase, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + } + ); + } else { + assert( size( mask ) == 0 ); + assert( size( v_mask ) == 0 ); + return internal::vxm_generic< + descr, false, false, left_sided, false, AdditiveMonoid::template Identity + >( + u, mask, v, v_mask, A, add, mul, phase, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + } + ); + } + } + + template< + Descriptor descr = descriptors::no_operation, + bool output_may_be_masked = true, + bool input_may_be_masked = true, + class AdditiveMonoid, + class MultiplicativeOperator, + typename IOType, + typename InputType1, + typename InputType2, + typename InputType3, + typename InputType4, + typename RIT, + typename CIT, + typename NIT, + typename Coords + > + RC mxv( + Vector< IOType, ascend, Coords > &u, + const Vector< InputType3, ascend, Coords > &mask, + const Matrix< InputType2, ascend, RIT, CIT, NIT > &A, + const Vector< InputType1, ascend, Coords > &v, + const Vector< InputType4, ascend, Coords > &v_mask, + const AdditiveMonoid &add = AdditiveMonoid(), + const MultiplicativeOperator &mul = MultiplicativeOperator(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + grb::is_monoid< AdditiveMonoid >::value && + grb::is_operator< MultiplicativeOperator >::value && + !grb::is_object< IOType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + !grb::is_object< InputType4 >::value && + !std::is_same< InputType2, void >::value, + void >::type * const = nullptr + ) { + static_assert( !(descr & descriptors::add_identity), "Cannot add an identity " + "if no concept of `1' is known. Suggested fix: use a semiring " + "instead." 
); + constexpr Descriptor new_descr = descr ^ descriptors::transpose_matrix; + constexpr bool left_sided = false; + if( output_may_be_masked && size( v_mask ) == 0 && size( mask ) > 0 ) { + return internal::vxm_generic< + new_descr, true, false, left_sided, false, AdditiveMonoid::template Identity + >( + u, mask, v, v_mask, A, add, mul, phase, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + } + ); + } else if( input_may_be_masked && size( mask ) == 0 && + size( v_mask ) > 0 + ) { + return internal::vxm_generic< + new_descr, false, true, left_sided, false, AdditiveMonoid::template Identity + >( + u, mask, v, v_mask, A, add, mul, phase, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + } + ); + } else if( output_may_be_masked && input_may_be_masked && size( mask ) > 0 && + size( v_mask ) > 0 + ) { + return internal::vxm_generic< + new_descr, true, true, left_sided, false, AdditiveMonoid::template Identity + >( + u, mask, v, v_mask, A, add, mul, phase, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + } + ); + } else { + assert( size( mask ) == 0 ); + assert( size( v_mask ) == 0 ); + return internal::vxm_generic< + new_descr, false, false, left_sided, false, AdditiveMonoid::template Identity + >( + u, mask, v, v_mask, A, add, mul, phase, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + } + ); + } + } + + template< + class ActiveDistribution, + typename Func, + typename DataType, + typename RIT, + typename CIT, + typename NIT + > + RC eWiseLambda( + const Func f, + const Matrix< DataType, ascend, RIT, CIT, NIT > &A, + const size_t s, + const size_t P + ) { + // ascend execution is not supported + // first, execute any computation that is not completed + internal::le.execution(); + + // second, delegate to the reference backend + return eWiseLambda< ActiveDistribution, Func, DataType, RIT, CIT, NIT >( + f, internal::getRefMatrix( A ), s, P ); + } + + template< + typename Func, + typename DataType1, + typename RIT, + typename CIT, + typename NIT, + typename DataType2, + typename Coords, + typename... Args + > + RC eWiseLambda( + const Func f, + const Matrix< DataType1, ascend, RIT, CIT, NIT > &A, + const Vector< DataType2, ascend, Coords > &x, + Args... args + ) { + // do size checking + if( !( size( x ) == nrows( A ) || size( x ) == ncols( A ) ) ) { + std::cerr << "Mismatching dimensions: given vector of size " << size( x ) + << " has nothing to do with either matrix dimension (" << nrows( A ) + << " nor " << ncols( A ) << ").\n"; + return MISMATCH; + } + + return eWiseLambda( f, A, args... ); + } + + /** @} */ + +} // namespace grb + +#undef NO_CAST_ASSERT + +#endif // end _H_GRB_ASCEND_BLAS2 + diff --git a/include/graphblas/ascend/blas3.hpp b/include/graphblas/ascend/blas3.hpp new file mode 100644 index 000000000..d910caea6 --- /dev/null +++ b/include/graphblas/ascend/blas3.hpp @@ -0,0 +1,534 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Implements the level-3 primitives for the Ascend backend + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ASCEND_BLAS3 +#define _H_GRB_ASCEND_BLAS3 + +#include //for std::enable_if + +#include +#include + +#include "io.hpp" +#include "matrix.hpp" + +#include + +#define NO_CAST_ASSERT( x, y, z ) \ + static_assert( x, \ + "\n\n" \ + "********************************************************************" \ + "********************************************************************" \ + "******************************\n" \ + "* ERROR | " y " " z ".\n" \ + "********************************************************************" \ + "********************************************************************" \ + "******************************\n" \ + "* Possible fix 1 | Remove no_casting from the template parameters " \ + "in this call to " y ".\n" \ + "* Possible fix 2 | For all mismatches in the domains of input " \ + "parameters and the semiring domains, as specified in the " \ + "documentation of the function " y ", supply a container argument of " \ + "the expected type instead.\n" \ + "* Possible fix 3 | Provide a compatible semiring where all domains " \ + "match those of the container arguments, as specified in the " \ + "documentation of the function " y ".\n" \ + "********************************************************************" \ + "********************************************************************" \ + "******************************\n" ); + + +namespace grb { + + namespace internal { + + extern LazyEvaluation le; + + } + +} + +namespace grb { + + namespace internal { + + template< + bool allow_void, + Descriptor descr, + class Monoid, + class Operator, + class MulMonoid, + typename OutputType, typename InputType1, typename InputType2, + typename RIT1, typename CIT1, typename NIT1, + typename RIT2, typename CIT2, typename NIT2, + typename RIT3, typename CIT3, typename NIT3 + > + RC mxm_generic( + Matrix< OutputType, ascend, RIT1, CIT1, NIT1 > &C, + const Matrix< InputType1, ascend, RIT2, CIT2, NIT2 > &A, + const Matrix< InputType2, ascend, RIT3, CIT3, NIT3 > &B, + const Operator &oper, + const Monoid &monoid, + const MulMonoid &mulMonoid, + const Phase &phase, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_operator< Operator >::value && + grb::is_monoid< Monoid >::value, + void >::type * const = nullptr + ) { + // ascend execution is not supported + // first, execute any computation that is not completed + le.execution(); + + // second, delegate to the reference backend + return mxm_generic< allow_void, descr >( + getRefMatrix( C ), getRefMatrix( A ), getRefMatrix( B ), + oper, monoid, mulMonoid, phase + ); + } + + } // end namespace grb::internal + + template< + Descriptor descr = descriptors::no_operation, + class Semiring, + typename OutputType, typename InputType1, typename InputType2, + typename RIT1, typename CIT1, typename NIT1, + typename RIT2, typename CIT2, typename NIT2, + 
typename RIT3, typename CIT3, typename NIT3 + > + RC mxm( + Matrix< OutputType, ascend, RIT1, CIT1, NIT1 > &C, + const Matrix< InputType1, ascend, RIT2, CIT2, NIT2 > &A, + const Matrix< InputType2, ascend, RIT3, CIT3, NIT3 > &B, + const Semiring &ring = Semiring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_semiring< Semiring >::value, + void >::type * const = nullptr + ) { + // static checks + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Semiring::D1, InputType1 >::value + ), "grb::mxm", + "called with a prefactor input matrix A that does not match the first " + "domain of the given operator" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Semiring::D2, InputType2 >::value ), "grb::mxm", + "called with a postfactor input matrix B that does not match the " + "second domain of the given operator" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Semiring::D4, OutputType >::value + ), "grb::mxm", + "called with an output matrix C that does not match the output domain " + "of the given operator" ); + +#ifdef _DEBUG + std::cout << "In grb::mxm (ascend, unmasked, semiring)\n"; +#endif + + return internal::mxm_generic< true, descr >( + C, A, B, + ring.getMultiplicativeOperator(), + ring.getAdditiveMonoid(), + ring.getMultiplicativeMonoid(), + phase + ); + } + + template< + Descriptor descr = grb::descriptors::no_operation, + class Operator, + class Monoid, + typename OutputType, typename InputType1, typename InputType2, + typename RIT1, typename CIT1, typename NIT1, + typename RIT2, typename CIT2, typename NIT2, + typename RIT3, typename CIT3, typename NIT3 + > + RC mxm( + Matrix< OutputType, ascend, RIT1, CIT1, NIT1 > &C, + const Matrix< InputType1, ascend, RIT2, CIT2, NIT2 > &A, + const Matrix< InputType2, ascend, RIT3, CIT3, NIT3 > &B, + const Monoid &addM, + const Operator &mulOp, + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_operator< Operator >::value && + grb::is_monoid< Monoid >::value, + void >::type * const = nullptr + ) { + // static checks + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Operator::D1, InputType1 >::value + ), "grb::mxm", + "called with a prefactor input matrix A that does not match the first " + "domain of the given multiplication operator" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Operator::D2, InputType2 >::value + ), "grb::mxm", + "called with a postfactor input matrix B that does not match the first " + "domain of the given multiplication operator" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Operator::D3, OutputType >::value ), + "grb::mxm", + "called with an output matrix C that does not match the output domain " + "of the given multiplication operator" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D1, typename Operator::D3 >::value + ), "grb::mxm", + "the output domain of the multiplication operator does not match the " + "first domain of the given addition monoid" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D2, OutputType >::value 
+ ), "grb::mxm", + "the second domain of the given addition monoid does not match the " + "type of the output matrix C" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D3, OutputType >::value + ), "grb::mxm", + "the output type of the given addition monoid does not match the type " + "of the output matrix C" ); + static_assert( ( !( + std::is_same< InputType1, void >::value || + std::is_same< InputType2, void >::value + ) ), + "grb::mxm: the operator-monoid version of mxm cannot be used if either " + "of the input matrices is a pattern matrix (of type void)" ); + + return internal::mxm_generic< false, descr >( + C, A, B, mulOp, addM, Monoid(), phase + ); + } + + namespace internal { + + template< + Descriptor descr = descriptors::no_operation, + bool matrix_is_void, + typename OutputType, + typename InputType1, typename InputType2, typename InputType3, + typename RIT, typename CIT, typename NIT, + typename Coords + > + RC matrix_zip_generic( + Matrix< OutputType, ascend, RIT, CIT, NIT > &A, + const Vector< InputType1, ascend, Coords > &x, + const Vector< InputType2, ascend, Coords > &y, + const Vector< InputType3, ascend, Coords > &z, + const Phase &phase + ) { + // ascend execution is not supported + // first, execute any computation that is not completed + le.execution(); + + // second, delegate to the reference backend + return matrix_zip_generic< descr, matrix_is_void >( + getRefMatrix( A ), getRefVector( x ), getRefVector( y ), getRefVector( z ), + phase + ); + } + + } // namespace internal + + template< + Descriptor descr = descriptors::no_operation, + typename OutputType, + typename InputType1, typename InputType2, typename InputType3, + typename RIT, typename CIT, typename NIT, + typename Coords + > + RC zip( + Matrix< OutputType, ascend, RIT, CIT, NIT > &A, + const Vector< InputType1, ascend, Coords > &x, + const Vector< InputType2, ascend, Coords > &y, + const Vector< InputType3, ascend, Coords > &z, + const Phase &phase = EXECUTE + ) { + static_assert( !(descr & descriptors::no_casting) || + std::is_integral< InputType1 >::value, + "grb::zip (two vectors to matrix) called " + "using non-integral left-hand vector elements" ); + static_assert( !(descr & descriptors::no_casting) || + std::is_integral< InputType2 >::value, + "grb::zip (two vectors to matrix) called " + "using non-integral right-hand vector elements" ); + static_assert( !(descr & descriptors::no_casting) || + std::is_same< OutputType, InputType3 >::value, + "grb::zip (two vectors to matrix) called " + "with differing vector nonzero and output matrix domains" ); + + const size_t n = grb::size( x ); + const size_t nz = grb::nnz( x ); + const RC ret = grb::clear( A ); + if( ret != SUCCESS ) { + return ret; + } + if( n != grb::size( y ) ) { + return MISMATCH; + } + if( n != grb::size( z ) ) { + return MISMATCH; + } + if( nz != grb::nnz( y ) ) { + return ILLEGAL; + } + if( nz != grb::nnz( z ) ) { + return ILLEGAL; + } + + return internal::matrix_zip_generic< descr, false >( A, x, y, z, phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + typename InputType1, typename InputType2, + typename RIT, typename CIT, typename NIT, + typename Coords + > + RC zip( + Matrix< void, ascend, RIT, CIT, NIT > &A, + const Vector< InputType1, ascend, Coords > &x, + const Vector< InputType2, ascend, Coords > &y, + const Phase &phase = EXECUTE + ) { + static_assert( !(descr & descriptors::no_casting) || + std::is_integral< InputType1 >::value, + "grb::zip (two vectors to 
void matrix) called using non-integral " + "left-hand vector elements" ); + static_assert( !(descr & descriptors::no_casting) || + std::is_integral< InputType2 >::value, + "grb::zip (two vectors to void matrix) called using non-integral " + "right-hand vector elements" ); + + const size_t n = grb::size( x ); + const size_t nz = grb::nnz( x ); + const RC ret = grb::clear( A ); + if( ret != SUCCESS ) { + return ret; + } + if( n != grb::size( y ) ) { + return MISMATCH; + } + if( nz != grb::nnz( y ) ) { + return ILLEGAL; + } + + return internal::matrix_zip_generic< descr, true >( A, x, y, x, phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Operator, + typename InputType1, typename InputType2, typename OutputType, + typename Coords, + typename RIT, typename CIT, typename NIT + > + RC outer( + Matrix< OutputType, ascend, RIT, CIT, NIT > &A, + const Vector< InputType1, ascend, Coords > &u, + const Vector< InputType2, ascend, Coords > &v, + const Operator &mul = Operator(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + grb::is_operator< Operator >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< OutputType >::value, + void >::type * const = nullptr + ) { + // ascend execution is not supported + // first, execute any computation that is not completed + internal::le.execution(); + + // second, delegate to the reference backend + return outer< descr, Operator >( + internal::getRefMatrix( A ), + internal::getRefVector( u ), internal::getRefVector( v ), + mul, phase + ); + } + + namespace internal { + + template< + bool allow_void, + Descriptor descr, + class MulMonoid, class Operator, + typename OutputType, typename InputType1, typename InputType2, + typename RIT1, typename CIT1, typename NIT1, + typename RIT2, typename CIT2, typename NIT2, + typename RIT3, typename CIT3, typename NIT3 + > + RC eWiseApply_matrix_generic( + Matrix< OutputType, ascend, RIT1, CIT1, NIT1 > &C, + const Matrix< InputType1, ascend, RIT2, CIT2, NIT2 > &A, + const Matrix< InputType2, ascend, RIT3, CIT3, NIT3 > &B, + const Operator &oper, + const MulMonoid &mulMonoid, + const Phase &phase, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_operator< Operator >::value, + void >::type * const = nullptr + ) { + // ascend execution is not supported + // first, execute any computation that is not completed + le.execution(); + + // second, delegate to the reference backend + return eWiseApply_matrix_generic< + allow_void, descr, + MulMonoid, Operator + >( + getRefMatrix( C ), getRefMatrix( A ), getRefMatrix( B ), + oper, mulMonoid, phase + ); + } + + } // namespace internal + + template< + Descriptor descr = descriptors::no_operation, + class MulMonoid, + typename OutputType, typename InputType1, typename InputType2, + typename RIT1, typename CIT1, typename NIT1, + typename RIT2, typename CIT2, typename NIT2, + typename RIT3, typename CIT3, typename NIT3 + > + RC eWiseApply( + Matrix< OutputType, ascend, RIT1, CIT1, NIT1 > &C, + const Matrix< InputType1, ascend, RIT2, CIT2, NIT2 > &A, + const Matrix< InputType2, ascend, RIT3, CIT3, NIT3 > &B, + const MulMonoid &mulmono, + const Phase phase = EXECUTE, + const typename std::enable_if< !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_monoid< MulMonoid >::value, + void 
>::type * const = nullptr + ) { + // static checks + NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) || + std::is_same< typename MulMonoid::D1, InputType1 >::value ), + "grb::eWiseApply (ascend, matrix <- matrix x matrix, monoid)", + "called with a prefactor input matrix A that does not match the first " + "domain of the monoid operator" + ); + NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) || + std::is_same< typename MulMonoid::D2, InputType2 >::value ), + "grb::eWiseApply (ascend, matrix <- matrix x matrix, monoid)", + "called with a postfactor input matrix B that does not match the " + "second domain of the monoid operator" + ); + NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) || + std::is_same< typename MulMonoid::D3, OutputType >::value ), + "grb::eWiseApply (ascend, matrix <- matrix x matrix, monoid)", + "called with an output matrix C that does not match the output domain " + "of the monoid operator" + ); + +#ifdef _DEBUG + std::cout << "In grb::eWiseApply_matrix_generic (ascend, monoid)\n"; +#endif + + return internal::eWiseApply_matrix_generic< true, descr >( + C, A, B, mulmono.getOperator(), mulmono, phase + ); + } + + template< + Descriptor descr = grb::descriptors::no_operation, + class Operator, + typename OutputType, typename InputType1, typename InputType2, + typename RIT1, typename CIT1, typename NIT1, + typename RIT2, typename CIT2, typename NIT2, + typename RIT3, typename CIT3, typename NIT3 + > + RC eWiseApply( + Matrix< OutputType, ascend, RIT1, CIT1, NIT1 > &C, + const Matrix< InputType1, ascend, RIT2, CIT2, NIT2 > &A, + const Matrix< InputType2, ascend, RIT3, CIT3, NIT3 > &B, + const Operator &mulOp, + const Phase phase = EXECUTE, + const typename std::enable_if< !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_operator< Operator >::value, + void >::type * const = nullptr + ) { + // static checks + NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) || + std::is_same< typename Operator::D1, InputType1 >::value ), + "grb::eWiseApply (ascend, matrix <- matrix x matrix, operator)", + "called with a prefactor input matrix A that does not match the first " + "domain of the given multiplication operator" + ); + NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) || + std::is_same< typename Operator::D2, InputType2 >::value ), + "grb::eWiseApply (ascend, matrix <- matrix x matrix, operator)", + "called with a postfactor input matrix B that does not match the second " + "domain of the given multiplication operator" + ); + NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) || + std::is_same< typename Operator::D3, OutputType >::value ), + "grb::eWiseApply (ascend, matrix <- matrix x matrix, operator)", + "called with an output matrix C that does not match the output domain " + "of the given multiplication operator" + ); + static_assert( ( !( + std::is_same< InputType1, void >::value || + std::is_same< InputType2, void >::value ) + ), "grb::eWiseApply (ascend, matrix <- matrix x matrix, operator): " + "the operator version of eWiseApply cannot be used if either of the " + "input matrices is a pattern matrix (of type void)" + ); + + typename grb::Monoid< + grb::operators::mul< double >, + grb::identities::one + > dummyMonoid; + return internal::eWiseApply_matrix_generic< false, descr >( + C, A, B, mulOp, dummyMonoid, phase + ); + } + +} // namespace grb + +#undef NO_CAST_ASSERT + +#endif // ``_H_GRB_ASCEND_BLAS3''
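
The level-3 primitives added above (grb::mxm, grb::zip, grb::outer, and grb::eWiseApply) all follow the same pattern: native ascend execution is not supported, so each call first flushes the lazy-evaluation pipeline via internal::le.execution() and then delegates to the reference backend through internal::getRefMatrix / internal::getRefVector. A minimal caller-side sketch of this path follows; it is not part of this patch and assumes the public ALP/GraphBLAS header, the usual grb::Launcher entry point, and a build with the ascend backend selected (for instance through the generated backend compiler wrapper). The names grbProgram and plusTimes are illustrative only.

#include <graphblas.hpp>

// Illustrative ALP program: multiplies two (here still empty) 3-by-3 matrices
// under the conventional plus-times semiring over doubles. When compiled for
// the ascend backend, grb::mxm flushes the pipeline and falls through to the
// reference implementation shown above.
void grbProgram( const int &, grb::RC &rc ) {
	grb::Matrix< double > C( 3, 3 ), A( 3, 3 ), B( 3, 3 );
	const grb::Semiring<
		grb::operators::add< double >, grb::operators::mul< double >,
		grb::identities::zero, grb::identities::one
	> plusTimes;
	rc = grb::mxm( C, A, B, plusTimes );
}

int main() {
	grb::Launcher< grb::AUTOMATIC > launcher;
	grb::RC out = grb::PANIC;
	const int in = 0;
	if( launcher.exec( &grbProgram, in, out, true ) != grb::SUCCESS ) {
		return 255;
	}
	return out == grb::SUCCESS ? 0 : 1;
}

diff --git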
a/include/graphblas/ascend/boolean_dispatcher_blas1.hpp b/include/graphblas/ascend/boolean_dispatcher_blas1.hpp new file mode 100644 index 000000000..d093db955 --- /dev/null +++ b/include/graphblas/ascend/boolean_dispatcher_blas1.hpp @@ -0,0 +1,1738 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + + +#ifndef _H_GRB_ASCEND_BOOLEAN_DISPATCHER_BLAS1 +#define _H_GRB_ASCEND_BOOLEAN_DISPATCHER_BLAS1 + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "coordinates.hpp" +#include "vector.hpp" +#include "vector_wrapper.hpp" + + +namespace grb { + + namespace internal { + + template< + Descriptor descr, + bool masked, + bool left, + bool already_dense_input_to_fold, + bool already_dense_mask, + class Monoid, + typename InputType, + typename MaskType, + class Coords + > + RC fold_from_vector_to_scalar_vectorDriven( + typename Monoid::D3 &thread_local_output, + const size_t lower_bound, + const size_t upper_bound, + const Coords &local_to_fold, + const Coords &local_mask, + const Vector< InputType, ascend, Coords > &to_fold, + const Vector< MaskType, ascend, Coords > &mask, + const Monoid &monoid + ); + + template< + Descriptor descr, + bool masked, + bool left, + class Monoid, + typename InputType, + typename MaskType, + class Coords + > + RC boolean_dispatcher_fold_from_vector_to_scalar_vectorDriven( + const bool already_dense_input_to_fold, + const bool already_dense_mask, + typename Monoid::D3 &thread_local_output, + const size_t lower_bound, + const size_t upper_bound, + const Coords &local_to_fold, + const Coords &local_mask, + const Vector< InputType, ascend, Coords > &to_fold, + const Vector< MaskType, ascend, Coords > &mask, + const Monoid &monoid + ) { + if( already_dense_input_to_fold ) { + if( already_dense_mask ) { + return internal::fold_from_vector_to_scalar_vectorDriven< + descr, masked, left, true, true + >( + thread_local_output, lower_bound, upper_bound, + local_to_fold, local_mask, to_fold, mask, monoid + ); + } else { + return internal::fold_from_vector_to_scalar_vectorDriven< + descr, masked, left, true, false + >( + thread_local_output, lower_bound, upper_bound, + local_to_fold, local_mask, to_fold, mask, monoid + ); + } + } else { + if( already_dense_mask ) { + return internal::fold_from_vector_to_scalar_vectorDriven< + descr, masked, left, false, true + >( + thread_local_output, lower_bound, upper_bound, + local_to_fold, local_mask, to_fold, mask, monoid + ); + } else { + return internal::fold_from_vector_to_scalar_vectorDriven< + descr, masked, left, false, false + >( + thread_local_output, lower_bound, upper_bound, + local_to_fold, local_mask, to_fold, mask, monoid + ); + } + } + } + + template< + Descriptor descr, + bool left, + bool already_dense_input_to_fold, + bool already_dense_mask, + class Monoid, + typename InputType, + typename MaskType, + class Coords + > + RC fold_from_vector_to_scalar_maskDriven( + typename Monoid::D3 &thread_local_output, + 
const size_t lower_bound, + const size_t upper_bound, + const Coords &local_to_fold, + const Coords &local_mask, + const Vector< InputType, ascend, Coords > &to_fold, + const Vector< MaskType, ascend, Coords > &mask, + const Monoid &monoid + ); + + template< + Descriptor descr, + bool left, + class Monoid, + typename InputType, + typename MaskType, + class Coords + > + RC boolean_dispatcher_fold_from_vector_to_scalar_maskDriven( + const bool already_dense_input_to_fold, + const bool already_dense_mask, + typename Monoid::D3 &thread_local_output, + const size_t lower_bound, + const size_t upper_bound, + const Coords &local_to_fold, + const Coords &local_mask, + const Vector< InputType, ascend, Coords > &to_fold, + const Vector< MaskType, ascend, Coords > &mask, + const Monoid &monoid + ) { + if( already_dense_input_to_fold ) { + if( already_dense_mask ) { + return internal::fold_from_vector_to_scalar_maskDriven< + descr, left, true, true + >( + thread_local_output, lower_bound, upper_bound, + local_to_fold, local_mask, to_fold, mask, monoid + ); + } else { + return internal::fold_from_vector_to_scalar_maskDriven< + descr, left, true, false + >( + thread_local_output, lower_bound, upper_bound, + local_to_fold, local_mask, to_fold, mask, monoid + ); + } + } else { + if( already_dense_mask ) { + return internal::fold_from_vector_to_scalar_maskDriven< + descr, left, false, true + >( + thread_local_output, lower_bound, upper_bound, + local_to_fold, local_mask, to_fold, mask, monoid + ); + } else { + return internal::fold_from_vector_to_scalar_maskDriven< + descr, left, false, false + >( + thread_local_output, lower_bound, upper_bound, + local_to_fold, local_mask, to_fold, mask, monoid + ); + } + } + } + + template< + Descriptor descr, + bool masked, + bool left, + bool already_dense_input_to_fold, + bool already_dense_mask, + class Monoid, + typename InputType, + typename MaskType, + class Coords + > + RC fold_from_vector_to_scalar_fullLoopSparse( + typename Monoid::D3 &thread_local_output, + const size_t lower_bound, + const size_t upper_bound, + const Coords &local_to_fold, + const Coords &local_mask, + const Vector< InputType, ascend, Coords > &to_fold, + const Vector< MaskType, ascend, Coords > &mask, + const Monoid &monoid + ); + + template< + Descriptor descr, + bool masked, + bool left, + class Monoid, + typename InputType, + typename MaskType, + class Coords + > + RC boolean_dispatcher_fold_from_vector_to_scalar_fullLoopSparse( + const bool already_dense_input_to_fold, + const bool already_dense_mask, + typename Monoid::D3 &thread_local_output, + const size_t lower_bound, + const size_t upper_bound, + const Coords &local_to_fold, + const Coords &local_mask, + const Vector< InputType, ascend, Coords > &to_fold, + const Vector< MaskType, ascend, Coords > &mask, + const Monoid &monoid + ) { + if( already_dense_input_to_fold ) { + if( already_dense_mask ) { + return internal::fold_from_vector_to_scalar_fullLoopSparse< + descr, masked, left, true, true + >( + thread_local_output, lower_bound, upper_bound, + local_to_fold, local_mask, to_fold, mask, monoid + ); + } else { + return internal::fold_from_vector_to_scalar_fullLoopSparse< + descr, masked, left, true, false + >( + thread_local_output, lower_bound, upper_bound, + local_to_fold, local_mask, to_fold, mask, monoid + ); + } + } else { + if( already_dense_mask ) { + return internal::fold_from_vector_to_scalar_fullLoopSparse< + descr, masked, left, false, true + >( + thread_local_output, lower_bound, upper_bound, + local_to_fold, 
local_mask, to_fold, mask, monoid + ); + } else { + return internal::fold_from_vector_to_scalar_fullLoopSparse< + descr, masked, left, false, false + >( + thread_local_output, lower_bound, upper_bound, + local_to_fold, local_mask, to_fold, mask, monoid + ); + } + } + } + + template< Descriptor descr, + bool left, + bool sparse, + bool masked, + bool monoid, + bool already_dense_output, + bool already_dense_mask, + typename MaskType, + typename IOType, + typename InputType, + typename Coords, + class OP + > + RC fold_from_scalar_to_vector_generic( + const size_t lower_bound, + const size_t upper_bound, + Coords &local_vector, + const Coords * const local_mask_ptr, + Vector< IOType, ascend, Coords > &vector, + const Vector< MaskType, ascend, Coords > * const mask, + const InputType &scalar, + const OP &op, + const Phase &phase + ); + + template< Descriptor descr, + bool left, + bool sparse, + bool masked, + bool monoid, + typename MaskType, + typename IOType, + typename InputType, + typename Coords, + class OP + > + RC boolean_dispatcher_fold_from_scalar_to_vector_generic( + const bool already_dense_output, + const bool already_dense_mask, + const size_t lower_bound, + const size_t upper_bound, + Coords &local_vector, + const Coords * const local_mask_ptr, + Vector< IOType, ascend, Coords > &vector, + const Vector< MaskType, ascend, Coords > * const mask, + const InputType &scalar, + const OP &op, + const Phase &phase + ) { + if( already_dense_output ) { + if( already_dense_mask ) { + return internal::fold_from_scalar_to_vector_generic< + descr, left, sparse, masked, monoid, + true, true + >( + lower_bound, upper_bound, local_vector, local_mask_ptr, + vector, mask, scalar, op, phase + ); + } else { + return internal::fold_from_scalar_to_vector_generic< + descr, left, sparse, masked, monoid, + true, false + >( + lower_bound, upper_bound, local_vector, local_mask_ptr, + vector, mask, scalar, op, phase + ); + } + } else { + if( already_dense_mask ) { + return internal::fold_from_scalar_to_vector_generic< + descr, left, sparse, masked, monoid, + false, true + >( + lower_bound, upper_bound, local_vector, local_mask_ptr, + vector, mask, scalar, op, phase + ); + } else { + return internal::fold_from_scalar_to_vector_generic< + descr, left, sparse, masked, monoid, + false, false + >( + lower_bound, upper_bound, local_vector, local_mask_ptr, + vector, mask, scalar, op, phase + ); + } + } + } + + template< Descriptor descr, + bool left, + bool sparse, + bool masked, + bool monoid, + bool already_dense_output, + bool already_dense_input_to_fold, + bool already_dense_mask, + typename MaskType, + typename IOType, + typename IType, + typename Coords, + class OP + > + RC fold_from_vector_to_vector_generic( + const size_t lower_bound, + const size_t upper_bound, + Coords &local_fold_into, + const Coords * const local_m_ptr, + const Coords &local_to_fold, + Vector< IOType, ascend, Coords > &fold_into, + const Vector< MaskType, ascend, Coords > * const m, + const Vector< IType, ascend, Coords > &to_fold, + const OP &op, + const Phase phase + ); + + template< Descriptor descr, + bool left, + bool sparse, + bool masked, + bool monoid, + typename MaskType, + typename IOType, + typename IType, + typename Coords, + class OP + > + RC boolean_dispatcher_fold_from_vector_to_vector_generic( + const bool already_dense_output, + const bool already_dense_input_to_fold, + const bool already_dense_mask, + const size_t lower_bound, + const size_t upper_bound, + Coords &local_fold_into, + const Coords * const local_m_ptr, + 
const Coords &local_to_fold, + Vector< IOType, ascend, Coords > &fold_into, + const Vector< MaskType, ascend, Coords > * const m, + const Vector< IType, ascend, Coords > &to_fold, + const OP &op, + const Phase phase + ) { + if( already_dense_output ) { + if( already_dense_input_to_fold ) { + if( already_dense_mask ) { + return internal::fold_from_vector_to_vector_generic< + descr, left, sparse, masked, monoid, + true, true, true + >( + lower_bound, upper_bound, local_fold_into, local_m_ptr, + local_to_fold, fold_into, m, to_fold, op, phase + ); + } else { + return internal::fold_from_vector_to_vector_generic< + descr, left, sparse, masked, monoid, + true, true, false + >( + lower_bound, upper_bound, local_fold_into, local_m_ptr, + local_to_fold, fold_into, m, to_fold, op, phase + ); + } + } else { + if( already_dense_mask ) { + return internal::fold_from_vector_to_vector_generic< + descr, left, sparse, masked, monoid, + true, false, true + >( + lower_bound, upper_bound, local_fold_into, local_m_ptr, + local_to_fold, fold_into, m, to_fold, op, phase + ); + } else { + return internal::fold_from_vector_to_vector_generic< + descr, left, sparse, masked, monoid, + true, false, false + >( + lower_bound, upper_bound, local_fold_into, local_m_ptr, + local_to_fold, fold_into, m, to_fold, op, phase + ); + } + } + } else { + if( already_dense_input_to_fold ) { + if( already_dense_mask ) { + return internal::fold_from_vector_to_vector_generic< + descr, left, sparse, masked, monoid, + false, true, true + >( + lower_bound, upper_bound, local_fold_into, local_m_ptr, + local_to_fold, fold_into, m, to_fold, op, phase + ); + } else { + return internal::fold_from_vector_to_vector_generic< + descr, left, sparse, masked, monoid, + false, true, false + >( + lower_bound, upper_bound, local_fold_into, local_m_ptr, + local_to_fold, fold_into, m, to_fold, op, phase + ); + } + } else { + if( already_dense_mask ) { + return internal::fold_from_vector_to_vector_generic< + descr, left, sparse, masked, monoid, + false, false, true + >( + lower_bound, upper_bound, local_fold_into, local_m_ptr, + local_to_fold, fold_into, m, to_fold, op, phase + ); + } else { + return internal::fold_from_vector_to_vector_generic< + descr, left, sparse, masked, monoid, + false, false, false + >( + lower_bound, upper_bound, local_fold_into, local_m_ptr, + local_to_fold, fold_into, m, to_fold, op, phase + ); + } + } + } + } + + template< + bool left_scalar, + bool right_scalar, + bool left_sparse, + bool right_sparse, + Descriptor descr, + class OP, + bool already_dense_input_x, + bool already_dense_input_y, + typename OutputType, + typename InputType1, + typename InputType2, + typename Coords + > + RC dense_apply_generic( + const size_t lower_bound, + const size_t upper_bound, + const Coords &local_x, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &z_vector, + const internal::Wrapper< left_scalar, InputType1, Coords > x_wrapper, + const internal::Wrapper< right_scalar, InputType2, Coords > y_wrapper, + const OP &op + ); + + template< + bool left_scalar, + bool right_scalar, + bool left_sparse, + bool right_sparse, + Descriptor descr, + class OP, + typename OutputType, + typename InputType1, + typename InputType2, + typename Coords + > + RC boolean_dispatcher_dense_apply_generic( + const bool already_dense_input_x, + const bool already_dense_input_y, + const size_t lower_bound, + const size_t upper_bound, + const Coords &local_x, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &z_vector, + const 
internal::Wrapper< left_scalar, InputType1, Coords > x_wrapper, + const internal::Wrapper< right_scalar, InputType2, Coords > y_wrapper, + const OP &op + ) { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::dense_apply_generic< + left_scalar, right_scalar, left_sparse, right_sparse, descr, OP, + true, true + >( + lower_bound, upper_bound, + local_x, local_y, z_vector, x_wrapper, y_wrapper, op + ); + } else { + return internal::dense_apply_generic< + left_scalar, right_scalar, left_sparse, right_sparse, descr, OP, + true, false + >( + lower_bound, upper_bound, + local_x, local_y, z_vector, x_wrapper, y_wrapper, op + ); + } + } else { + if( already_dense_input_y ) { + return internal::dense_apply_generic< + left_scalar, right_scalar, left_sparse, right_sparse, descr, OP, + false, true + >( + lower_bound, upper_bound, + local_x, local_y, z_vector, x_wrapper, y_wrapper, op + ); + } else { + return internal::dense_apply_generic< + left_scalar, right_scalar, left_sparse, right_sparse, descr, OP, + false, false + >( + lower_bound, upper_bound, + local_x, local_y, z_vector, x_wrapper, y_wrapper, op + ); + } + } + } + + template< + bool masked, + bool monoid, + bool x_scalar, + bool y_scalar, + Descriptor descr, + class OP, + bool already_dense_mask, + bool already_dense_input_x, + bool already_dense_input_y, + typename OutputType, + typename MaskType, + typename InputType1, + typename InputType2, + typename Coords + > + RC sparse_apply_generic( + const size_t lower_bound, + const size_t upper_bound, + Coords &local_z, + const Coords * const local_mask_ptr, + const Coords &local_x, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &z_vector, + const Vector< MaskType, ascend, Coords > * const mask_vector, + const internal::Wrapper< x_scalar, InputType1, Coords > x_wrapper, + const internal::Wrapper< y_scalar, InputType2, Coords > y_wrapper, + const OP &op + ); + + template< + bool masked, + bool monoid, + bool x_scalar, + bool y_scalar, + Descriptor descr, + class OP, + typename OutputType, + typename MaskType, + typename InputType1, + typename InputType2, + typename Coords + > + RC boolean_dispatcher_sparse_apply_generic( + const bool already_dense_mask, + const bool already_dense_input_x, + const bool already_dense_input_y, + const size_t lower_bound, + const size_t upper_bound, + Coords &local_z, + const Coords * const local_mask_ptr, + const Coords &local_x, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &z_vector, + const Vector< MaskType, ascend, Coords > * const mask_vector, + const internal::Wrapper< x_scalar, InputType1, Coords > x_wrapper, + const internal::Wrapper< y_scalar, InputType2, Coords > y_wrapper, + const OP &op + ) { + if( already_dense_mask ) { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::sparse_apply_generic< + masked, monoid, x_scalar, y_scalar, descr, OP, + true, true, true + > ( + lower_bound, upper_bound, local_z, local_mask_ptr, local_x, local_y, + z_vector, mask_vector, x_wrapper, y_wrapper, op + ); + } else { + return internal::sparse_apply_generic< + masked, monoid, x_scalar, y_scalar, descr, OP, + true, true, false + > ( + lower_bound, upper_bound, local_z, local_mask_ptr, local_x, local_y, + z_vector, mask_vector, x_wrapper, y_wrapper, op + ); + } + } else { + if( already_dense_input_y ) { + return internal::sparse_apply_generic< + masked, monoid, x_scalar, y_scalar, descr, OP, + true, false, true + > ( + lower_bound, upper_bound, + local_z, local_mask_ptr, local_x,
local_y, + z_vector, mask_vector, x_wrapper, y_wrapper, op + ); + } else { + return internal::sparse_apply_generic< + masked, monoid, x_scalar, y_scalar, descr, OP, + true, false, false + > ( + lower_bound, upper_bound, local_z, local_mask_ptr, local_x, local_y, + z_vector, mask_vector, x_wrapper, y_wrapper, op + ); + } + } + } else { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::sparse_apply_generic< + masked, monoid, x_scalar, y_scalar, descr, OP, + false, true, true + > ( + lower_bound, upper_bound, + local_z, local_mask_ptr, local_x, local_y, + z_vector, mask_vector, x_wrapper, y_wrapper, op + ); + } else { + return internal::sparse_apply_generic< + masked, monoid, x_scalar, y_scalar, descr, OP, + false, true, false + > ( + lower_bound, upper_bound, local_z, local_mask_ptr, local_x, local_y, + z_vector, mask_vector, x_wrapper, y_wrapper, op + ); + } + } else { + if( already_dense_input_y ) { + return internal::sparse_apply_generic< + masked, monoid, x_scalar, y_scalar, descr, OP, + false, false, true + > ( + lower_bound, upper_bound, local_z, local_mask_ptr, local_x, local_y, + z_vector, mask_vector, x_wrapper, y_wrapper, op + ); + } else { + return internal::sparse_apply_generic< + masked, monoid, x_scalar, y_scalar, descr, OP, + false, false, false + > ( + lower_bound, upper_bound, local_z, local_mask_ptr, local_x, local_y, + z_vector, mask_vector, x_wrapper, y_wrapper, op + ); + } + } + } + } + + template< + bool left_scalar, + bool right_scalar, + bool left_sparse, + bool right_sparse, + Descriptor descr, + class OP, + bool already_dense_mask, + bool already_dense_input_x, + bool already_dense_input_y, + typename OutputType, + typename MaskType, + typename InputType1, + typename InputType2, + typename Coords + > + RC masked_apply_generic( + const size_t lower_bound, + const size_t upper_bound, + Coords &local_z, + const Coords &local_mask, + const Coords &local_x, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &z_vector, + const Vector< MaskType, ascend, Coords > &mask_vector, + const internal::Wrapper< left_scalar, InputType1, Coords > x_wrapper, + const internal::Wrapper< right_scalar, InputType2, Coords > y_wrapper, + const OP &op, +#ifdef GRB_BOOLEAN_DISPATCHER + const InputType1 * const left_identity, + const InputType2 * const right_identity +#else + const InputType1 * const left_identity = nullptr, + const InputType2 * const right_identity = nullptr +#endif + ); + + template< + bool left_scalar, + bool right_scalar, + bool left_sparse, + bool right_sparse, + Descriptor descr, + class OP, + typename OutputType, + typename MaskType, + typename InputType1, + typename InputType2, + typename Coords + > + RC boolean_dispatcher_masked_apply_generic( + const bool already_dense_mask, + const bool already_dense_input_x, + const bool already_dense_input_y, + const size_t lower_bound, + const size_t upper_bound, + Coords &local_z, + const Coords &local_mask, + const Coords &local_x, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &z_vector, + const Vector< MaskType, ascend, Coords > &mask_vector, + const internal::Wrapper< left_scalar, InputType1, Coords > x_wrapper, + const internal::Wrapper< right_scalar, InputType2, Coords > y_wrapper, + const OP &op, + const InputType1 * const left_identity = nullptr, + const InputType2 * const right_identity = nullptr + ) { + if( already_dense_mask ) { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::masked_apply_generic< + left_scalar, 
right_scalar, left_sparse, right_sparse, descr, OP, + true, true, true + >( + lower_bound, upper_bound, local_z, local_mask, local_x, local_y, + z_vector, mask_vector, x_wrapper, y_wrapper, op, left_identity, right_identity + ); + } else { + return internal::masked_apply_generic< + left_scalar, right_scalar, left_sparse, right_sparse, descr, OP, + true, true, false + >( + lower_bound, upper_bound, local_z, local_mask, local_x, local_y, + z_vector, mask_vector, x_wrapper, y_wrapper, + op, left_identity, right_identity + ); + } + } else { + if( already_dense_input_y ) { + return internal::masked_apply_generic< + left_scalar, right_scalar, left_sparse, right_sparse, descr, OP, + true, false, true + >( + lower_bound, upper_bound, local_z, local_mask, local_x, local_y, + z_vector, mask_vector, x_wrapper, y_wrapper, + op, left_identity, right_identity + ); + } else { + return internal::masked_apply_generic< + left_scalar, right_scalar, left_sparse, right_sparse, descr, OP, + true, false, false + >( + lower_bound, upper_bound, local_z, local_mask, local_x, local_y, + z_vector, mask_vector, x_wrapper, y_wrapper, + op, left_identity, right_identity + ); + } + } + } else { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::masked_apply_generic< + left_scalar, right_scalar, left_sparse, right_sparse, descr, OP, + false, true, true + >( + lower_bound, upper_bound, local_z, local_mask, local_x, local_y, + z_vector, mask_vector, x_wrapper, y_wrapper, + op, left_identity, right_identity + ); + } else { + return internal::masked_apply_generic< + left_scalar, right_scalar, left_sparse, right_sparse, descr, OP, + false, true, false + >( + lower_bound, upper_bound, local_z, local_mask, local_x, local_y, + z_vector, mask_vector, x_wrapper, y_wrapper, + op, left_identity, right_identity + ); + } + } else { + if( already_dense_input_y ) { + return internal::masked_apply_generic< + left_scalar, right_scalar, left_sparse, right_sparse, descr, OP, + false, false, true + >( + lower_bound, upper_bound, local_z, local_mask, local_x, local_y, + z_vector, mask_vector, x_wrapper, y_wrapper, + op, left_identity, right_identity + ); + } else { + return internal::masked_apply_generic< + left_scalar, right_scalar, left_sparse, right_sparse, descr, OP, + false, false, false + >( + lower_bound, upper_bound, local_z, local_mask, local_x, local_y, + z_vector, mask_vector, x_wrapper, y_wrapper, + op, left_identity, right_identity + ); + } + } + } + } + + template< + Descriptor descr, + bool a_scalar, + bool x_scalar, + bool y_scalar, + bool y_zero, + bool already_dense_output, + bool already_dense_mask, + bool already_dense_input_a, + bool already_dense_input_x, + bool already_dense_input_y, + typename OutputType, + typename MaskType, + typename InputType1, + typename InputType2, + typename InputType3, + typename Coords, + class Ring + > + RC sparse_eWiseMulAdd_maskDriven( + const size_t lower_bound, + const size_t upper_bound, + Coords &local_z, + const Coords &local_m, + const Coords &local_a, + const Coords &local_x, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &z_vector, + const Vector< MaskType, ascend, Coords > &m_vector, + const internal::Wrapper< a_scalar, InputType1, Coords > &a_wrapper, + const internal::Wrapper< x_scalar, InputType2, Coords > &x_wrapper, + const internal::Wrapper< y_scalar, InputType3, Coords > &y_wrapper, + const Ring &ring + ); + + template< + Descriptor descr, + bool a_scalar, + bool x_scalar, + bool y_scalar, + bool y_zero, + typename OutputType, 
+ typename MaskType, + typename InputType1, + typename InputType2, + typename InputType3, + typename Coords, + class Ring + > + RC boolean_dispatcher_sparse_eWiseMulAdd_maskDriven( + const bool already_dense_output, + const bool already_dense_mask, + const bool already_dense_input_a, + const bool already_dense_input_x, + const bool already_dense_input_y, + const size_t lower_bound, + const size_t upper_bound, + Coords &local_z, + const Coords &local_m, + const Coords &local_a, + const Coords &local_x, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &z_vector, + const Vector< MaskType, ascend, Coords > &m_vector, + const internal::Wrapper< a_scalar, InputType1, Coords > &a_wrapper, + const internal::Wrapper< x_scalar, InputType2, Coords > &x_wrapper, + const internal::Wrapper< y_scalar, InputType3, Coords > &y_wrapper, + const Ring &ring + ) { + if( already_dense_output ) { + if( already_dense_mask ) { + if( already_dense_input_a ) { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + true, true, true, true, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } else { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + true, true, true, true, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } + } else { + if( already_dense_input_y ) { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + true, true, true, false, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } else { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + true, true, true, false, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } + } + } else { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + true, true, false, true, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } else { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + true, true, false, true, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } + } else { + if( already_dense_input_y ) { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + true, true, false, false, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } else { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + true, true, false, false, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } + } + } + } else { + if( already_dense_input_a ) { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + 
return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + true, false, true, true, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } else { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + true, false, true, true, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } + } else { + if( already_dense_input_y ) { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + true, false, true, false, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } else { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + true, false, true, false, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } + } + } else { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + true, false, false, true, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } else { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + true, false, false, true, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } + } else { + if( already_dense_input_y ) { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + true, false, false, false, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } else { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + true, false, false, false, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } + } + } + } + } else { + if( already_dense_mask ) { + if( already_dense_input_a ) { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + false, true, true, true, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } else { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + false, true, true, true, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } + } else { + if( already_dense_input_y ) { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + false, true, true, false, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } else { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + false, 
true, true, false, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } + } + } else { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + false, true, false, true, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } else { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + false, true, false, true, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } + } else { + if( already_dense_input_y ) { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + false, true, false, false, true + >( lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } else { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + false, true, false, false, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } + } + } + } else { + if( already_dense_input_a ) { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + false, false, true, true, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } else { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + false, false, true, true, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } + } else { + if( already_dense_input_y ) { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + false, false, true, false, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } else { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + false, false, true, false, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } + } + } else { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + false, false, false, true, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } else { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + false, false, false, true, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } + } else { + if( already_dense_input_y ) { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + false, false, false, false, true + >( + lower_bound, upper_bound, 
local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } else { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + false, false, false, false, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } + } + } + } + } + } + + template< + Descriptor descr, + bool masked, + bool x_scalar, + bool y_scalar, + bool y_zero, + bool mulSwitched, + bool already_dense_output, + bool already_dense_mask, + bool already_dense_input_a, + bool already_dense_input_x, + bool already_dense_input_y, + typename OutputType, + typename MaskType, + typename InputType1, + typename InputType2, + typename InputType3, + typename Coords, + class Ring + > + RC twoPhase_sparse_eWiseMulAdd_mulDriven( + const size_t lower_bound, + const size_t upper_bound, + Coords &local_z, + const Coords * const local_m, + const Coords &local_a, + const Coords &local_x, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &z_vector, + const Vector< MaskType, ascend, Coords > * const m_vector, + const Vector< InputType1, ascend, Coords > &a_vector, + const internal::Wrapper< x_scalar, InputType2, Coords > &x_wrapper, + const internal::Wrapper< y_scalar, InputType3, Coords > &y_wrapper, + const Ring &ring + ); + + template< + Descriptor descr, + bool masked, + bool x_scalar, + bool y_scalar, + bool y_zero, + bool mulSwitched, + typename OutputType, + typename MaskType, + typename InputType1, + typename InputType2, + typename InputType3, + typename Coords, + class Ring + > + RC boolean_dispatcher_twoPhase_sparse_eWiseMulAdd_mulDriven( + const bool already_dense_output, + const bool already_dense_mask, + const bool already_dense_input_a, + const bool already_dense_input_x, + const bool already_dense_input_y, + const size_t lower_bound, + const size_t upper_bound, + Coords &local_z, + const Coords * const local_m, + const Coords &local_a, + const Coords &local_x, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &z_vector, + const Vector< MaskType, ascend, Coords > * const m_vector, + const Vector< InputType1, ascend, Coords > &a_vector, + const internal::Wrapper< x_scalar, InputType2, Coords > &x_wrapper, + const internal::Wrapper< y_scalar, InputType3, Coords > &y_wrapper, + const Ring &ring = Ring() + ) { + if( already_dense_output ) { + if( already_dense_mask ) { + if( already_dense_input_a ) { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + true, true, true, true, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } else { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + true, true, true, true, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } + } else { + if( already_dense_input_y ) { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + true, true, true, false, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } else { + return 
internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + true, true, true, false, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } + } + } else { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + true, true, false, true, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } else { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + true, true, false, true, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } + } else { + if( already_dense_input_y ) { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + true, true, false, false, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } else { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + true, true, false, false, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } + } + } + } else { + if( already_dense_input_a ) { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + true, false, true, true, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } else { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + true, false, true, true, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } + } else { + if( already_dense_input_y ) { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + true, false, true, false, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } else { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + true, false, true, false, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } + } + } else { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + true, false, false, true, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } else { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + true, false, false, true, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, 
local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } + } else { + if( already_dense_input_y ) { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + true, false, false, false, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } else { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + true, false, false, false, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } + } + } + } + } else { + if( already_dense_mask ) { + if( already_dense_input_a ) { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + false, true, true, true, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } else { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + false, true, true, true, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } + } else { + if( already_dense_input_y ) { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + false, true, true, false, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } else { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + false, true, true, false, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } + } + } else { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + false, true, false, true, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } else { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + false, true, false, true, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } + } else { + if( already_dense_input_y ) { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + false, true, false, false, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } else { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + false, true, false, false, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } + } + } + } else { + if( already_dense_input_a ) { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return 
internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + false, false, true, true, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } else { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + false, false, true, true, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } + } else { + if( already_dense_input_y ) { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + false, false, true, false, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } else { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + false, false, true, false, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } + } + } else { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + false, false, false, true, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } else { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + false, false, false, true, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } + } else { + if( already_dense_input_y ) { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + false, false, false, false, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } else { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + false, false, false, false, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } + } + } + } + } + } + + template< + Descriptor descr, + bool already_dense_input_x, + bool already_dense_input_y, + class AddMonoid, + class AnyOp, + typename InputType1, + typename InputType2, + typename Coords + > + RC sparse_dot_generic( + typename AddMonoid::D3 &thread_local_output, + const size_t lower_bound, + const size_t upper_bound, + const Coords &local_x, + const Coords &local_y, + const Vector< InputType1, ascend, Coords > &x, + const Vector< InputType2, ascend, Coords > &y, + const size_t local_nz, + const AddMonoid &addMonoid, + const AnyOp &anyOp + ); + + template< + Descriptor descr, + class AddMonoid, + class AnyOp, + typename InputType1, + typename InputType2, + typename Coords + > + RC boolean_dispatcher_sparse_dot_generic( + const bool already_dense_input_x, + const bool already_dense_input_y, + typename AddMonoid::D3 &thread_local_output, + const size_t lower_bound, + const size_t upper_bound, + const Coords &local_x, + const Coords &local_y, + const Vector< 
InputType1, ascend, Coords > &x, + const Vector< InputType2, ascend, Coords > &y, + const size_t local_nz, + const AddMonoid &addMonoid, + const AnyOp &anyOp + ) { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::sparse_dot_generic< + descr, true, true + >( + thread_local_output, lower_bound, upper_bound, local_x, local_y, + x, y, local_nz, addMonoid, anyOp + ); + } else { + return internal::sparse_dot_generic< + descr, true, false + >( + thread_local_output, lower_bound, upper_bound, local_x, local_y, + x, y, local_nz, addMonoid, anyOp + ); + } + } else { + if( already_dense_input_y ) { + return internal::sparse_dot_generic< + descr, false, true + >( + thread_local_output, lower_bound, upper_bound, local_x, local_y, + x, y, local_nz, addMonoid, anyOp + ); + } else { + return internal::sparse_dot_generic< + descr, false, false + >( + thread_local_output, lower_bound, upper_bound, local_x, local_y, + x, y, local_nz, addMonoid, anyOp + ); + } + } + } + + } // end namespace ``internal'' + +} // end namespace ``grb'' + +#endif + diff --git a/include/graphblas/ascend/boolean_dispatcher_blas2.hpp b/include/graphblas/ascend/boolean_dispatcher_blas2.hpp new file mode 100644 index 000000000..8a846672c --- /dev/null +++ b/include/graphblas/ascend/boolean_dispatcher_blas2.hpp @@ -0,0 +1,191 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Dispatchers for the level-2 primitives + * + * @author A. N. 
Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ASCEND_BOOLEAN_DISPATCHER_BLAS2 +#define _H_GRB_ASCEND_BOOLEAN_DISPATCHER_BLAS2 + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "coordinates.hpp" +#include "vector.hpp" + + +namespace grb { + + namespace internal { + + template< + Descriptor descr, + bool masked, + bool input_masked, + bool left_handed, + template< typename > class One, + bool already_dense_destination_vector, + bool already_dense_mask_vector, + class AdditiveMonoid, + class Multiplication, + typename IOType, + typename InputType1, + typename InputType2, + typename InputType3, + typename InputType4, + typename Coords, + typename RowColType, + typename NonzeroType + > + inline void vxm_inner_kernel_gather( + RC &rc, + const size_t lower_bound, + Coords &local_destination_vector, + const Coords &local_mask_vector, + Vector< IOType, ascend, Coords > &destination_vector, + IOType &destination_element, + const size_t &destination_index, + const Vector< InputType1, ascend, Coords > &source_vector, + const InputType1 * __restrict__ const &source, + const size_t &source_range, + const internal::Compressed_Storage< + InputType2, RowColType, NonzeroType + > &matrix, + const Vector< InputType3, ascend, Coords > &mask_vector, + const InputType3 * __restrict__ const &mask, + const Vector< InputType4, ascend, Coords > &source_mask_vector, + const InputType4 * __restrict__ const &source_mask, + const AdditiveMonoid &add, + const Multiplication &mul, + const std::function< size_t( size_t ) > &src_local_to_global, + const std::function< size_t( size_t ) > &src_global_to_local, + const std::function< size_t( size_t ) > &dst_local_to_global + ); + + template< + Descriptor descr, + bool masked, + bool input_masked, + bool left_handed, + template< typename > class One, + class AdditiveMonoid, + class Multiplication, + typename IOType, + typename InputType1, + typename InputType2, + typename InputType3, + typename InputType4, + typename Coords, + typename RowColType, + typename NonzeroType + > + inline void boolean_dispatcher_vxm_inner_kernel_gather( + const bool already_dense_destination_vector, + const bool already_dense_mask_vector, + RC &rc, + const size_t lower_bound, + Coords &local_destination_vector, + const Coords &local_mask_vector, + Vector< IOType, ascend, Coords > &destination_vector, + IOType &destination_element, + const size_t &destination_index, + const Vector< InputType1, ascend, Coords > &source_vector, + const InputType1 * __restrict__ const &source, + const size_t &source_range, + const internal::Compressed_Storage< + InputType2, RowColType, NonzeroType + > &matrix, + const Vector< InputType3, ascend, Coords > &mask_vector, + const InputType3 * __restrict__ const &mask, + const Vector< InputType4, ascend, Coords > &source_mask_vector, + const InputType4 * __restrict__ const &source_mask, + const AdditiveMonoid &add, + const Multiplication &mul, + const std::function< size_t( size_t ) > &src_local_to_global, + const std::function< size_t( size_t ) > &src_global_to_local, + const std::function< size_t( size_t ) > &dst_local_to_global + ) { + if( already_dense_destination_vector ) { + if( already_dense_mask_vector ) { + return internal::vxm_inner_kernel_gather< + descr, masked, input_masked, left_handed, One, + true, true + >( + rc, lower_bound, local_destination_vector, local_mask_vector, + destination_vector, destination_element, destination_index, + source_vector, source, source_range, matrix, mask_vector, mask, 
+ source_mask_vector, source_mask, add, mul, + src_local_to_global, src_global_to_local, dst_local_to_global + ); + } else { + return internal::vxm_inner_kernel_gather< + descr, masked, input_masked, left_handed, One, + true, false + >( + rc, lower_bound, local_destination_vector, local_mask_vector, + destination_vector, destination_element, destination_index, + source_vector, source, source_range, matrix, mask_vector, mask, + source_mask_vector, source_mask, add, mul, + src_local_to_global, src_global_to_local, dst_local_to_global + ); + } + } else { + if( already_dense_mask_vector ) { + return internal::vxm_inner_kernel_gather< + descr, masked, input_masked, left_handed, One, + false, true + >( + rc, lower_bound, local_destination_vector, local_mask_vector, + destination_vector, destination_element, destination_index, + source_vector, source, source_range, matrix, mask_vector, mask, + source_mask_vector, source_mask, add, mul, + src_local_to_global, src_global_to_local, dst_local_to_global + ); + } else { + return internal::vxm_inner_kernel_gather< + descr, masked, input_masked, left_handed, One, + false, false + >( + rc, lower_bound, local_destination_vector, local_mask_vector, + destination_vector, destination_element, destination_index, + source_vector, source, source_range, matrix, mask_vector, mask, + source_mask_vector, source_mask, add, mul, + src_local_to_global, src_global_to_local, dst_local_to_global + ); + } + } + } + + } // end namespace ``internal'' + +} // end namespace ``grb'' + +#endif + diff --git a/include/graphblas/ascend/boolean_dispatcher_io.hpp b/include/graphblas/ascend/boolean_dispatcher_io.hpp new file mode 100644 index 000000000..d298cb38a --- /dev/null +++ b/include/graphblas/ascend/boolean_dispatcher_io.hpp @@ -0,0 +1,362 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Dispatchers for the ascend I/O primitives. + * + * @author A. N. 
Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ASCEND_BOOLEAN_DISPATCHER_IO +#define _H_GRB_ASCEND_BOOLEAN_DISPATCHER_IO + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "coordinates.hpp" +#include "vector.hpp" + + +namespace grb { + + namespace internal { + + template< + Descriptor descr, + bool loop_over_vector_length, + bool already_dense_mask, + bool mask_is_dense, + typename DataType, + typename MaskType, + typename T, + typename Coords + > + RC masked_set( + const size_t lower_bound, + const size_t upper_bound, + Coords &local_x, + const Coords &local_mask, + Vector< DataType, ascend, Coords > &x, + const Vector< MaskType, ascend, Coords > &m, + const T val + ); + + template< + Descriptor descr, + typename DataType, + typename MaskType, + typename T, + typename Coords + > + RC boolean_dispatcher_masked_set( + const bool loop_over_vector_length, + const bool already_dense_mask, + const bool mask_is_dense, + const size_t lower_bound, + const size_t upper_bound, + Coords &local_x, + const Coords &local_mask, + Vector< DataType, ascend, Coords > &x, + const Vector< MaskType, ascend, Coords > &m, + const T val + ) { + if( loop_over_vector_length ) { + if( already_dense_mask ) { + if( mask_is_dense ) { + return internal::masked_set< + descr, true, true, true + >( lower_bound, upper_bound, local_x, local_mask, x, m, val ); + } else { + return internal::masked_set< + descr, true, true, false + >( lower_bound, upper_bound, local_x, local_mask, x, m, val ); + } + } else { + if( mask_is_dense ) { + return internal::masked_set< + descr, true, false, true + >( lower_bound, upper_bound, local_x, local_mask, x, m, val ); + } else { + return internal::masked_set< + descr, true, false, false + >( lower_bound, upper_bound, local_x, local_mask, x, m, val ); + } + } + } else { + if( already_dense_mask ) { + if( mask_is_dense ) { + return internal::masked_set< + descr, false, true, true + >( lower_bound, upper_bound, local_x, local_mask, x, m, val ); + } else { + return internal::masked_set< + descr, false, true, false + >( lower_bound, upper_bound, local_x, local_mask, x, m, val ); + } + } else { + if( mask_is_dense ) { + return internal::masked_set< + descr, false, false, true + >( lower_bound, upper_bound, local_x, local_mask, x, m, val ); + } else { + return internal::masked_set< + descr, false, false, false + >( lower_bound, upper_bound, local_x, local_mask, x, m, val ); + } + } + } + } + + template< + Descriptor descr, + bool out_is_void, + bool in_is_void, + bool sparse, + bool already_dense_vectors, + bool already_dense_input, + typename OutputType, + typename InputType, + typename Coords + > + RC set_generic( + const size_t lower_bound, + const size_t upper_bound, + Coords &local_x, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &x, + const Vector< InputType, ascend, Coords > &y + ); + + template< Descriptor descr, + bool out_is_void, + bool in_is_void, + bool sparse, + typename OutputType, + typename InputType, + typename Coords + > + RC boolean_dispatcher_set_generic( + const bool already_dense_vectors, + const bool already_dense_input, + const size_t lower_bound, + const size_t upper_bound, + Coords &local_x, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &x, + const Vector< InputType, ascend, Coords > &y + ) { + if( already_dense_vectors ) { + if( already_dense_input ) { + return internal::set_generic< + descr, out_is_void, in_is_void, sparse, + true, true + >( lower_bound, upper_bound, 
local_x, local_y, x, y ); + } else { + return internal::set_generic< + descr, out_is_void, in_is_void, sparse, + true, false + >( lower_bound, upper_bound, local_x, local_y, x, y ); + } + } else { + if( already_dense_input ) { + return internal::set_generic< + descr, out_is_void, in_is_void, sparse, + false, true + >( lower_bound, upper_bound, local_x, local_y, x, y ); + } else { + return internal::set_generic< + descr, out_is_void, in_is_void, sparse, + false, false + >( lower_bound, upper_bound, local_x, local_y, x, y ); + } + } + } + + template< + Descriptor descr, + bool out_is_void, + bool in_is_void, + bool loop_over_y, + bool already_dense_input_y, + bool already_dense_mask, + bool mask_is_dense, + typename OutputType, + typename MaskType, + typename InputType, + typename Coords + > + RC masked_set( + const size_t lower_bound, + const size_t upper_bound, + Coords &local_x, + const Coords &local_mask, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &x, + const Vector< MaskType, ascend, Coords > &mask, + const Vector< InputType, ascend, Coords > &y + ); + + template< + Descriptor descr, + bool out_is_void, + bool in_is_void, + typename OutputType, + typename MaskType, + typename InputType, + typename Coords + > + RC boolean_dispatcher_masked_set( + const bool loop_over_y, + const bool already_dense_input_y, + const bool already_dense_mask, + const bool mask_is_dense, + const size_t lower_bound, + const size_t upper_bound, + Coords &local_x, + const Coords &local_mask, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &x, + const Vector< MaskType, ascend, Coords > &mask, + const Vector< InputType, ascend, Coords > &y + ) { + if( loop_over_y ) { + if( already_dense_input_y ) { + if( already_dense_mask ) { + if( mask_is_dense ) { + return internal::masked_set< + descr, out_is_void, in_is_void, + true, true, true, true + >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y ); + } else { + return internal::masked_set< + descr, out_is_void, in_is_void, + true, true, true, false + >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y ); + } + } else { + if( mask_is_dense ) { + return internal::masked_set< + descr, out_is_void, in_is_void, + true, true, false, true + >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y ); + } else { + return internal::masked_set< + descr, out_is_void, in_is_void, + true, true, false, false + >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y ); + } + } + } else { + if( already_dense_mask ) { + if( mask_is_dense ) { + return internal::masked_set< + descr, out_is_void, in_is_void, + true, false, true, true + >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y ); + } else { + return internal::masked_set< + descr, out_is_void, in_is_void, + true, false, true, false + >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y ); + } + } else { + if( mask_is_dense ) { + return internal::masked_set< + descr, out_is_void, in_is_void, + true, false, false, true + >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y ); + } else { + return internal::masked_set< + descr, out_is_void, in_is_void, + true, false, false, false + >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y ); + } + } + } + } else { + if( already_dense_input_y ) { + if( already_dense_mask ) { + if( mask_is_dense ) { + return internal::masked_set< + descr, out_is_void, in_is_void, + false, true, true, true + >( lower_bound, upper_bound, 
local_x, local_mask, local_y, x, mask, y ); + } else { + return internal::masked_set< + descr, out_is_void, in_is_void, + false, true, true, false + >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y ); + } + } else { + if( mask_is_dense ) { + return internal::masked_set< + descr, out_is_void, in_is_void, + false, true, false, true + >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y ); + } else { + return internal::masked_set< + descr, out_is_void, in_is_void, + false, true, false, false + >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y ); + } + } + } else { + if( already_dense_mask ) { + if( mask_is_dense ) { + return internal::masked_set< + descr, out_is_void, in_is_void, + false, false, true, true + >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y ); + } else { + return internal::masked_set< + descr, out_is_void, in_is_void, + false, false, true, false + >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y ); + } + } else { + if( mask_is_dense ) { + return internal::masked_set< + descr, out_is_void, in_is_void, + false, false, false, true + >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y ); + } else { + return internal::masked_set< + descr, out_is_void, in_is_void, + false, false, false, false + >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y ); + } + } + } + } + } + + } // end namespace ``internal'' + +} // end namespace ``grb'' + +#endif + diff --git a/include/graphblas/ascend/collectives.hpp b/include/graphblas/ascend/collectives.hpp new file mode 100644 index 000000000..2ff096166 --- /dev/null +++ b/include/graphblas/ascend/collectives.hpp @@ -0,0 +1,91 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Collectives implementation for the Ascend backend. + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ASCEND_COLL +#define _H_GRB_ASCEND_COLL + +#include + +#include +#include +#include +#include + + +namespace grb { + + /** The collectives class is based on that of the reference backend */ + template<> + class collectives< ascend > { + + private: + + /** Disallow instantiation of this class. 
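+		 *
+		 * All functionality is instead offered through static member functions.
+		 * As an illustrative sketch only (assuming the usual grb::operators::add
+		 * operator; this example is a sketch, not a normative API reference), a
+		 * typical call reads:
+		 *
+		 * \code
+		 * double sum = 1.0;
+		 * grb::RC rc = grb::collectives< grb::ascend >::allreduce(
+		 * 	sum, grb::operators::add< double >()
+		 * );
+		 * // with a single user process, sum is returned unchanged
+		 * \endcode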
*/ + collectives() {} + + + public: + + template< + Descriptor descr = descriptors::no_operation, + class Operator, typename IOType + > + static RC allreduce( IOType &inout, const Operator op = Operator() ) { + return collectives< reference >::allreduce< descr, Operator, IOType >( + inout, op ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Operator, typename IOType + > + static RC reduce( + IOType &inout, const size_t root = 0, const Operator op = Operator() + ) { + return collectives< reference >::reduce< descr, Operator, IOType >( inout, + root, op ); + } + + template< typename IOType > + static RC broadcast( IOType &inout, const size_t root = 0 ) { + return collectives< reference >::broadcast< IOType >( inout, root ); + } + + template< Descriptor descr = descriptors::no_operation, typename IOType > + static RC broadcast( + IOType * inout, const size_t size, + const size_t root = 0 + ) { + return collectives< reference >::broadcast< descr, IOType >( inout, size, + root ); + } + + }; // end class `collectives< ascend >' + +} // namespace grb + +#endif // end ``_H_GRB_ASCEND_COLL'' + diff --git a/include/graphblas/ascend/config.hpp b/include/graphblas/ascend/config.hpp new file mode 100644 index 000000000..0e468af5a --- /dev/null +++ b/include/graphblas/ascend/config.hpp @@ -0,0 +1,148 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Configuration settings for the Ascend backend. + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ASCEND_CONFIG +#define _H_GRB_ASCEND_CONFIG + +#include +#include + + +namespace grb { + + /** + * \defgroup ascendConfig Nonblocking backend configuration + * + * \ingroup config + * + * All configuration parameters for the #grb::ascend backend. + * + * @{ + */ + + namespace config { + + /** + * The various supported Ascend boards. + */ + enum Ascend { + Ascend_910A, + Ascend_910B + }; + + /** + * Class with information about the Ascend cache/scratchpad hierarchy. + */ + //template< enum Ascend = _ASC_DEFAULT_TARGET > TODO FIXME no way to get this passed in from alpcxx / grbcxx + template< enum Ascend = Ascend_910B > // Assuming 910B default instead + class ASCEND_CACHE_HIERARCHY {}; + + /** + * Cache hierarchy parameters for the 910A. + */ + template<> + class ASCEND_CACHE_HIERARCHY< Ascend_910A > { + public: + static constexpr const size_t UB_SIZE = 8192; + }; + + /** + * Cache hierarchy parameters for the 910B. + */ + template<> + class ASCEND_CACHE_HIERARCHY< Ascend_910B > { + public: + static constexpr const size_t UB_SIZE = 8192; + }; + + /** + * Implementation-dependent configuration parameters for the \a ascend + * backend. + * + * \note The user documentation only specifies the fields that under some + * circumstances may benefit from a user adapting it. For viewing all + * fields, please see the developer documentation. 
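+	 *
+	 * As an illustration only (a sketch, not a guarantee of the configuration
+	 * interface), the compile-time parameters in this file can be queried as
+	 * constants; e.g., for the 910B target used as the default above:
+	 *
+	 * \code
+	 * // unified-buffer size constant of the selected board
+	 * constexpr size_t ub =
+	 * 	grb::config::ASCEND_CACHE_HIERARCHY< grb::config::Ascend_910B >::UB_SIZE;
+	 *
+	 * // allocation mode this backend uses for shared memory segments
+	 * constexpr auto mode =
+	 * 	grb::config::IMPLEMENTATION< grb::ascend >::sharedAllocMode();
+	 * \endcode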
+ * + * \note Adapting the fields should be done with care and may require + * re-compilation and re-installation of the ALP framework. + * + * \ingroup ascendConfig + * + * @see grb::config::IMPLEMENTATION + */ + template<> + class IMPLEMENTATION< ascend > { + + public: + + /** + * A private memory segment shall never be accessed by threads other than + * the thread who allocates it. Therefore we choose aligned mode here. + */ + static constexpr ALLOC_MODE defaultAllocMode() { + return ALLOC_MODE::ALIGNED; + } + + /** + * For the ascend backend, a shared memory-segment should use + * interleaved alloc so that any thread has uniform access on average. + */ + static constexpr ALLOC_MODE sharedAllocMode() { + return ALLOC_MODE::INTERLEAVED; + } + + /** + * \internal + * By default, use the coordinates of the selected backend. + * + * \note This is an extension that may, at some later stage, be used for + * composability with the #grb::bsp1d and #grb::hybrid backends. + * \endinternal + */ + static constexpr Backend coordinatesBackend() { + return IMPLEMENTATION< nonblocking >::coordinatesBackend(); + } + + /** + * \internal + * Whether the backend has vector capacities always fixed to their + * defaults. + * \endinternal + */ + static constexpr bool fixedVectorCapacities() { + return true; + } + + }; + + } // namespace config + + /** @} */ + +} // namespace grb + +#endif // end ``_H_GRB_ASCEND_CONFIG'' + diff --git a/include/graphblas/ascend/coordinates.hpp b/include/graphblas/ascend/coordinates.hpp new file mode 100644 index 000000000..9f04187d2 --- /dev/null +++ b/include/graphblas/ascend/coordinates.hpp @@ -0,0 +1,701 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Coordinates for the Ascend backend + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ASCEND_COORDINATES +#define _H_GRB_ASCEND_COORDINATES + +#include //std::runtime_error +#include +#if defined _DEBUG && !defined NDEBUG + #include +#endif + +#include //size_t +#include + +#include +#include +#include + +#include + +#include + +#include + +#include + +#include + + +namespace grb { + + namespace internal { + + /** + * The Coordinates class is based on that of the reference backend. + * A set of new methods is added to handle local coordinates used + * by the ascend backend. The bufferSize method used by the + * Matrix class relies on parbufSize and prefixbufSize that have + * their own implementation for the ascend backend. 
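+		 *
+		 * As an illustrative sketch only: a caller that owns the raw memory could
+		 * size and attach it as follows, where `alloc` stands for a suitably
+		 * aligned allocation routine (a hypothetical placeholder, not an ALP API):
+		 *
+		 * \code
+		 * const size_t n = 1024;
+		 * void * const arr = alloc( Coordinates< ascend >::arraySize( n ) );
+		 * // bufferSize( n ) == stackSize( n ) + parbufSize( n ) + prefixbufSize()
+		 * void * const buf = alloc( Coordinates< ascend >::bufferSize( n ) );
+		 *
+		 * Coordinates< ascend > coors;
+		 * coors.set( arr, false, buf, n ); // starts out with zero nonzeroes
+		 * \endcode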
+ */ + template<> + class Coordinates< ascend > { + + public: + + typedef typename config::VectorIndexType StackType; + + typedef bool ArrayType; + + + private: + + bool * __restrict__ _assigned; + + StackType * __restrict__ _stack; + + StackType * __restrict__ _buffer; + + size_t _n; + + size_t _cap; + + size_t _buf; + + // pointers to the data of the local coordinates mechanism + std::vector< config::VectorIndexType * > local_buffer; + config::VectorIndexType * __restrict__ local_new_nnzs; + config::VectorIndexType * __restrict__ pref_sum; + + // the analytic model used during the execution of a pipeline + AnalyticModel analytic_model; + + + public: + + static inline size_t arraySize( const size_t dim ) noexcept { + if( dim == 0 ) { + return 0; + } + return ( dim + 1 ) * sizeof( ArrayType ); + } + + static inline size_t stackSize( const size_t dim ) noexcept { + if( dim == 0 ) { + return 0; + } + return ( dim + 1 ) * sizeof( StackType ); + } + + static inline size_t prefixbufSize() noexcept { + int P = 1; + return ( P + 1 ) * sizeof( StackType ); + } + + static inline size_t parbufSize( const size_t n ) noexcept { + return n * sizeof( StackType ); + } + + static inline size_t bufferSize( const size_t dim ) noexcept { + size_t ret = stackSize( dim ); + ret += parbufSize( dim ); + ret += prefixbufSize(); + return ret; + } + + inline Coordinates() noexcept : + _assigned( nullptr ), _stack( nullptr ), _buffer( nullptr ), + _n( 0 ), _cap( 0 ), _buf( 0 ) + {} + + inline Coordinates( Coordinates< ascend > &&x ) noexcept : + _assigned( x._assigned ), _stack( x._stack ), _buffer( x._buffer ), + _n( x._n ), _cap( x._cap ), _buf( x._buf ) + { + x._assigned = nullptr; + x._stack = nullptr; + x._buffer = nullptr; + x._n = x._cap = x._buf = 0; + } + + inline Coordinates( const Coordinates< ascend > &x ) noexcept : + _assigned( x._assigned ), _stack( x._stack ), _buffer( x._buffer ), + _n( x._n ), _cap( x._cap ), _buf( x._buf ) + { + assert( this != &x ); + } + + inline Coordinates< ascend > & operator=( + const Coordinates< ascend > &other + ) { + Coordinates replace( other ); + *this = std::move( replace ); + return *this; + } + + inline Coordinates< ascend > & operator=( + Coordinates< ascend > &&x + ) noexcept { + assert( this != &x ); + _assigned = x._assigned; + _stack = x._stack; + _buffer = x._buffer; + _n = x._n; + _cap = x._cap; + _buf = x._buf; + x._assigned = nullptr; + x._stack = x._buffer = nullptr; + x._n = x._cap = x._buf = 0; + return *this; + } + + inline ~Coordinates() noexcept { + // done (the #_assigned and #_stack memory + // blocks are not managed by this class) + } + + void set( + void * const arr, bool arr_initialized, + void * const buf, const size_t dim, bool parallel = true + ) noexcept { + // catch trivial case + if( arr == nullptr || buf == nullptr ) { + assert( arr == nullptr ); + assert( buf == nullptr ); + assert( dim == 0 ); + _assigned = nullptr; + _stack = nullptr; + _buffer = nullptr; + _n = 0; + _cap = 0; + _buf = 0; + return; + } + + // _assigned has no alignment issues, take directly from input buffer + assert( reinterpret_cast< uintptr_t >( _assigned ) % sizeof( bool ) == 0 ); + _assigned = static_cast< bool * >( arr ); + // ...but _stack does have potential alignment issues: + char * buf_raw = static_cast< char * >( buf ); + constexpr const size_t size = sizeof( StackType ); + const size_t mod = reinterpret_cast< uintptr_t >( buf_raw ) % size; + if( mod != 0 ) { + buf_raw += size - mod; + } + _stack = reinterpret_cast< StackType * >( buf_raw ); + // no 
alignment issues between stack and buffer, so just shift by dim: + _buffer = _stack + dim; + // initialise + _n = 0; + _cap = dim; + _buf = 0; + + // and initialise _assigned (but only if necessary) + if( dim > 0 && !arr_initialized ) { + if( parallel ) { + #pragma omp parallel + { + size_t start, end; + config::OMP::localRange( start, end, 0, dim ); + for( size_t i = start; i < end; ++i ) { + _assigned[ i ] = false; + } + } + } else { + for( size_t i = 0; i < dim; ++i ) { + _assigned[ i ] = false; + } + } + } + } + + inline bool assign( const size_t i ) noexcept { + if( _n == _cap ) { + return true; + } + if( !_assigned[ i ] ) { + _assigned[ i ] = true; + const size_t newSize = _n + 1; + assert( _n <= _cap ); + assert( newSize <= _cap ); + _stack[ _n ] = i; + _n = newSize; + return false; + } else { + return true; + } + } + + template< bool maybe_invalid = false > + inline void local_assignAll( ) noexcept { + if( maybe_invalid || _n != _cap ) { + if( _assigned != nullptr ) { + assert( _stack != nullptr ); + assert( maybe_invalid || _n < _cap ); + assert( !maybe_invalid || _n <= _cap ); + _n = _cap; + + for( size_t i = 0; i < _n; ++i ) { + _assigned[ i ] = true; + _stack[ i ] = i; + } + } + } + + // the counter of initial nonzeroes in the local stack is stored in the + // buffer immediately before the local stack + StackType * __restrict__ local_nnzs = _stack - 1; + + // the counter for the local stack must be set to zero such that the number + // of new nonzeroes will be set to _n by asyncJoinSubset and joinSubset + // will update the global stack based on the local_new_nnzs counter the + // global stack has become empty and _assigned = false so the local + // coordinates of this tile must be added in the global stack from scratch + // regardless whether this tile was already dense or not as it is hard to + // know which part of the global stack contains the coordinates of this + // tile + *local_nnzs = 0; + } + + template< bool maybe_invalid = false > + inline void local_assignAllNotAlreadyAssigned( ) noexcept { + if( maybe_invalid || _n != _cap ) { + if( _assigned != nullptr ) { + assert( _stack != nullptr ); + assert( maybe_invalid || _n < _cap ); + assert( !maybe_invalid || _n <= _cap ); + + // searching for the not already assigned elements and add them to the + // local stack such that joinSubset will add to the global stack only + // those elements that are not already assigned + for( size_t i = 0; i < _cap; ++i ) { + if( !_assigned[ i ] ) { + _assigned[ i ] = true; + _stack[ _n++ ] = i; + } + } + + assert( _n == _cap ); + } + } + } + + inline void clear() noexcept { + + if( _n == _cap ) { +#ifndef NDEBUG + if( _assigned == nullptr && _cap > 0 ) { + const bool dense_coordinates_may_not_call_clear = false; + assert( dense_coordinates_may_not_call_clear ); + } +#endif + + #pragma omp parallel for schedule( dynamic, config::CACHE_LINE_SIZE::value() ) + for( size_t i = 0; i < _cap; ++i ) { + _assigned[ i ] = false; + } + } else { + if( _n < config::OMP::minLoopSize() ) { + for( size_t k = 0; k < _n; ++k ) { + _assigned[ _stack[ k ] ] = false; + } + } else { + #pragma omp parallel for schedule( dynamic, config::CACHE_LINE_SIZE::value() ) + for( size_t k = 0; k < _n; ++k ) { + _assigned[ _stack[ k ] ] = false; + } + } + } + _n = 0; + } + + inline void local_clear() noexcept { + + if( _n == _cap ) { +#ifndef NDEBUG + if( _assigned == nullptr && _cap > 0 ) { + const bool dense_coordinates_may_not_call_clear = false; + assert( dense_coordinates_may_not_call_clear ); + } +#endif + + for( 
size_t i = 0; i < _cap; ++i ) { + _assigned[ i ] = false; + } + } else { + for( size_t k = 0; k < _n; ++k ) { + _assigned[ _stack[ k ] ] = false; + } + } + _n = 0; + + // the counter of initial nonzeroes in the local stack is stored in the + // buffer immediately before the local stack + StackType * __restrict__ local_nnzs = _stack - 1; + + // the counter for the local stack must be set to zero such that any new + // assigned element will be written to the global stack + *local_nnzs = 0; + } + + inline void reset_global_nnz_counter() noexcept { + _n = 0; + } + + inline bool isEmpty() const noexcept { + if( _n == 0 ) { + return true; + } else { + return false; + } + } + + inline bool isDense() const noexcept { + return _n == _cap; + } + + inline size_t size() const noexcept { + return _cap; + } + + inline bool assigned( const size_t i ) const noexcept { + assert( i < _cap ); + return _n == _cap || _assigned[ i ]; + } + + template< Descriptor descr, typename T > + inline bool mask( const size_t i, const T * const val ) const noexcept { + assert( i < _cap ); + return utils::interpretMask< descr >( assigned( i ), val, i ); + } + + inline size_t nonzeroes() const noexcept { + assert( _n <= _cap ); + return _n; + } + + inline size_t index( const size_t k ) const noexcept { + assert( k < _n ); + return isDense() ? k : _stack[ k ]; + } + + void localCoordinatesInit( const AnalyticModel &am ) { + + analytic_model = am; + + const size_t nthreads = analytic_model.getNumThreads(); + const size_t tile_size = analytic_model.getTileSize(); + const size_t num_tiles = analytic_model.getNumTiles(); + + assert( num_tiles > 0 ); + assert( _buf >= 4 * num_tiles ); + + local_buffer.resize( analytic_model.getNumTiles() ); + + #pragma omp parallel for schedule(dynamic) num_threads(nthreads) + for( size_t tile_id = 0; tile_id < num_tiles; ++tile_id ) { + local_buffer[ tile_id ] = _buffer + tile_id * ( tile_size + 1 ); + } + + local_new_nnzs = _buffer + num_tiles * ( tile_size + 1 ); + pref_sum = _buffer + num_tiles * ( tile_size + 2 ); + } + + /** + * Initialises a Coordinate instance that refers to a subset of this + * coordinates instance. Multiple disjoint subsets may be retrieved + * and concurrently updated, up to a maximum of tiles given by + * #internal::ASCEND::maxBufferTiles(). + * + * Subsets must be contiguous. If one thread calls this function, all + * other threads must make a matching call. + * + * @param[in] lower_bound The start index of the contiguous subset + * (inclusive). + * @param[in] upper_bound The end index of the contiguous subset + * (exclusive). 
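+			 *
+			 * For illustration only (a sketch of intended use, not code taken from
+			 * a pipeline): a stage that updates one tile typically pairs this call
+			 * with #asyncSubset and #asyncJoinSubset. Here, `coors` is the global
+			 * Coordinates instance prepared via #localCoordinatesInit, `lo` and
+			 * `hi` delimit the tile, and `i` is a global index inside it:
+			 *
+			 * \code
+			 * coors.asyncSubsetInit( lo, hi );
+			 * Coordinates< ascend > local = coors.asyncSubset( lo, hi );
+			 * if( !local.assign( i - lo ) ) {
+			 * 	// index i was newly assigned within this tile
+			 * }
+			 * coors.asyncJoinSubset( local, lo, hi ); // records the new nonzeroes
+			 * \endcode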
+ */ + void asyncSubsetInit( + const size_t lower_bound, + const size_t upper_bound + ) noexcept { + if( _cap == 0 ) { + return; + } + + const size_t tile_id = lower_bound / analytic_model.getTileSize(); + + config::VectorIndexType *local_nnzs = local_buffer[ tile_id ]; + config::VectorIndexType *local_stack = local_buffer[ tile_id ] + 1; + + *local_nnzs = 0; + if( upper_bound - lower_bound < _n ) { + for( size_t i = lower_bound; i < upper_bound; ++i ) { + if( _assigned[ i ] ) { + local_stack[ (*local_nnzs)++ ] = i - lower_bound; + } + } + } else { + for( size_t i = 0; i < _n; ++i ) { + const size_t k = _stack[ i ]; + if( lower_bound <= k && k < upper_bound ) { + assert( _assigned[ k ] ); + local_stack[ (*local_nnzs)++ ] = k - lower_bound; + } + } + } + + // the number of new nonzeroes is initialized here + local_new_nnzs[ tile_id ] = 0; + } + + /** + * Retrieves a subset coordinate instance that was previously initialised + * using a call to #asyncSubsetInit. + * + * @returns A Coordinates instance that only supports sequential + * (synchronous) updates as well as all queries. + */ + Coordinates< ascend > asyncSubset( + const size_t lower_bound, const size_t upper_bound + ) const noexcept { + assert(_cap > 0); + + const size_t tile_id = lower_bound / analytic_model.getTileSize(); + + config::VectorIndexType *local_nnzs = local_buffer[ tile_id ]; + config::VectorIndexType *local_stack = local_buffer[ tile_id ] + 1; + + Coordinates< ascend > ret; + assert( upper_bound - lower_bound <= analytic_model.getTileSize() ); + + ret.set( _assigned + lower_bound, true, local_stack, + upper_bound - lower_bound, false ); + + // the number of new nonzeroes is used to determine the total number + // of nonzeroes for the given local coordinates, since some of the + // nonzeroes are already written on the local statck + ret._n = (*local_nnzs) + local_new_nnzs[ tile_id ]; + assert( ret._n <= ret._cap ); + + ret._buf = 0; + + return ret; + } + + /** + * Saves the state of a subset Coordinates instance. Can be retrieved later + * once again via a call to #asyncSubset. New nonzeroes will be committed + * to the global coordinate structure via a call to #joinSubset, which will + * furthermore set the related tile to inactive. 
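+			 *
+			 * For illustration only (a sketch of intended use): once every tile has
+			 * been joined asynchronously, the new nonzeroes are committed to the
+			 * global stack via #prefixSumComputation followed by one #joinSubset
+			 * call per tile. Here, `tile_size` and `num_tiles` stand for the values
+			 * of the analytic model:
+			 *
+			 * \code
+			 * if( coors.newNonZeroes() ) {
+			 * 	coors.prefixSumComputation(); // per-tile offsets into the stack
+			 * 	for( size_t t = 0; t < num_tiles; ++t ) {
+			 * 		const size_t lo = t * tile_size;
+			 * 		const size_t hi = std::min( lo + tile_size, coors.size() );
+			 * 		coors.joinSubset( lo, hi );
+			 * 	}
+			 * }
+			 * \endcode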
+ */ + void asyncJoinSubset( + const Coordinates< ascend > &subset, + const size_t lower_bound, const size_t upper_bound + ) { + assert( _cap > 0 ); + + (void) upper_bound; + + const size_t tile_id = lower_bound / analytic_model.getTileSize(); + + config::VectorIndexType *local_nnzs = local_buffer[ tile_id ]; + + assert( subset._n <= subset._cap ); + assert( (*local_nnzs) <= subset._cap ); + + local_new_nnzs[ tile_id ] = subset._n - (*local_nnzs); + } + + bool newNonZeroes() const { + + if( _cap == 0 ) { + return false; + } + + const size_t num_tiles = analytic_model.getNumTiles(); + + for( size_t i = 0; i < num_tiles; i++ ) { + if( local_new_nnzs[ i ] > 0 ) { + return true; + } + } + return false; + } + + void prefixSumComputation() { + + const size_t num_tiles = analytic_model.getNumTiles(); + + // takes into accout the size of data for each iteration of the prefix sum + // computation which is used to determine the number of parallel task that + // should be used such that the data of each parallel task fit in the L1 + // cache + constexpr size_t size_of_data = sizeof( pref_sum[0] ) + + sizeof( local_new_nnzs[0] ); + + // make use of the analytic model to estimate a proper number of threads + // and a tile size + AnalyticModel am( size_of_data, num_tiles, 1 ); + + const size_t nthreads = am.getNumThreads(); + const size_t prefix_sum_tile_size = am.getTileSize(); + const size_t prefix_sum_num_tiles = am.getNumTiles(); + + // make a run-time decision to choose between sequential and parallel + // prefix sum implementation the sequential prefix sum implementation is + // more efficient for a small number of tiles + if( num_tiles < prefix_sum_tile_size ) { + // sequential computation of the prefix sum + pref_sum[ 0 ] = _n + local_new_nnzs[ 0 ]; + for( size_t i = 1; i < num_tiles; i++ ) { + pref_sum[ i ] = pref_sum[ i - 1 ] + local_new_nnzs[ i ]; + } + } else { + // parallel computation of the prefix sum + size_t local_prefix_sum[ prefix_sum_num_tiles ]; + + #pragma omp parallel num_threads(nthreads) + { + #pragma omp for + for( size_t id = 0; id < prefix_sum_num_tiles; id++ ) { + + size_t lower, upper; + config::OMP::localRange( lower, upper, 0, num_tiles, + prefix_sum_tile_size, id, prefix_sum_num_tiles ); + + // the number of threads used for parallel computation must not exceed + // num_tiles, otherwise the code below results in data races + assert( id <= num_tiles ); + assert( id < prefix_sum_num_tiles - 1 || upper == num_tiles ); + assert( lower <= upper ); + assert( upper <= num_tiles ); + + pref_sum[ lower ] = local_new_nnzs[ lower ]; + for( size_t i = lower + 1; i < upper; i++ ) { + pref_sum[ i ] = pref_sum[ i - 1 ] + local_new_nnzs[ i ]; + } + + // each thread stores the prefix sum of its last element in + // local_prefix_sum + // the memory location is specified by the identifier of the thread to + // avoid data races + local_prefix_sum[ id ] = pref_sum[ upper - 1 ]; + } + + // here, there is an implicit barrier that ensures all threads have + // already written the local prefix sum for each parallel task + + // a single threads computes the prefix sum for the last element of each + // thread + #pragma omp single + { + for( size_t i = 1; i < prefix_sum_num_tiles; i++ ) { + local_prefix_sum[ i ] += local_prefix_sum[ i - 1 ]; + } + } + + #pragma omp for + for(size_t id = 0; id < prefix_sum_num_tiles; id++ ) { + + size_t lower, upper; + config::OMP::localRange( lower, upper, 0, num_tiles, + prefix_sum_tile_size, id, prefix_sum_num_tiles ); + + // the first thread (id=0) needs to add 
only the number of nonzeroes(_n) + const size_t acc = _n + ( ( id > 0 ) ? local_prefix_sum[ id - 1 ] : 0 ); + for( size_t i = lower; i < upper; i++ ) { + pref_sum[ i ] += acc; + } + } + } + +#ifdef _DEBUG + // ensures that the parallel implementation computes the same result + // with the following sequential implementation + size_t seq_offsets[ num_tiles ]; + seq_offsets[ 0 ] = _n + local_new_nnzs[ 0 ]; + for( size_t i = 1; i < num_tiles; i++ ) { + seq_offsets[ i ] = seq_offsets[ i - 1 ] + local_new_nnzs[ i ]; + } + + for( size_t i = 0; i < num_tiles; i++ ) { + assert( seq_offsets[i] == pref_sum[i] ); + } +#endif + } + + // a single thread updates the number of nonzeroes + // the last element of prefix_sum_ofssets alredy includes + // the current number of nonzeroes _n which was added earlier + _n = pref_sum[ num_tiles - 1 ]; + } + + /** + * Takes a currently active subset and commits it to the global storage. + * After completion the given active tile will be marked inactive. + */ + void joinSubset( const size_t lower_bound, const size_t upper_bound ) { + if( _cap == 0 ) { + return; + } +#ifdef NDEBUG + ( void )upper_bound; +#endif + const size_t tile_id = lower_bound / analytic_model.getTileSize(); + + config::VectorIndexType *local_nnzs = local_buffer[ tile_id ]; + config::VectorIndexType *local_stack = local_buffer[ tile_id ] + 1; + + const size_t local_stack_start = *local_nnzs; + const size_t local_stack_end = *local_nnzs + local_new_nnzs[ tile_id ]; + assert( local_stack_start <= local_stack_end ); + + size_t pos = pref_sum[ tile_id ] - local_new_nnzs[ tile_id ]; + + for( size_t k = local_stack_start; k < local_stack_end; ++k ) { + const size_t local_index = local_stack[ k ]; + const size_t global_index = local_index + lower_bound; + + assert( global_index >= lower_bound ); + assert( global_index < upper_bound ); + assert( _assigned[ global_index ] ); + assert( pos < _cap ); + + _stack[ pos++ ] = global_index; + } + + local_new_nnzs[ tile_id ] = 0; + } + }; + + } // namespace internal + +} // namespace grb + +#endif // end `_H_GRB_ASCEND_COORDINATES' + diff --git a/include/graphblas/ascend/data_utils.h b/include/graphblas/ascend/data_utils.h new file mode 100644 index 000000000..042de8bb0 --- /dev/null +++ b/include/graphblas/ascend/data_utils.h @@ -0,0 +1,196 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. + */ +#ifndef DATA_UTILS_H +#define DATA_UTILS_H +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "acl/acl.h" + +typedef enum { + DT_UNDEFINED = -1, + FLOAT = 0, + HALF = 1, + INT8_T = 2, + INT32_T = 3, + UINT8_T = 4, + INT16_T = 6, + UINT16_T = 7, + UINT32_T = 8, + INT64_T = 9, + UINT64_T = 10, + DOUBLE = 11, + BOOL = 12, + STRING = 13, + COMPLEX64 = 16, + COMPLEX128 = 17, + BF16 = 27 +} printDataType; + +#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args) +#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args) +#define ERROR_LOG(fmt, args...) 
+#define CHECK_ACL(x) \
+    do { \
+        aclError __ret = x; \
+        if (__ret != ACL_ERROR_NONE) { \
+            std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \
+        } \
+    } while (0);
+
+/**
+ * @brief Read data from file
+ * @param [in] filePath: file path
+ * @param [out] fileSize: file size
+ * @return read result
+ */
+bool ReadFile(const std::string &filePath, size_t &fileSize, void *buffer, size_t bufferSize)
+{
+    struct stat sBuf;
+    int fileStatus = stat(filePath.data(), &sBuf);
+    if (fileStatus == -1) {
+        ERROR_LOG("failed to get file");
+        return false;
+    }
+    if (S_ISREG(sBuf.st_mode) == 0) {
+        ERROR_LOG("%s is not a file, please enter a file", filePath.c_str());
+        return false;
+    }
+
+    std::ifstream file;
+    file.open(filePath, std::ios::binary);
+    if (!file.is_open()) {
+        ERROR_LOG("Open file failed. path = %s", filePath.c_str());
+        return false;
+    }
+
+    std::filebuf *buf = file.rdbuf();
+    size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in);
+    if (size == 0) {
+        ERROR_LOG("file size is 0");
+        file.close();
+        return false;
+    }
+    if (size > bufferSize) {
+        ERROR_LOG("file size is larger than buffer size");
+        file.close();
+        return false;
+    }
+    buf->pubseekpos(0, std::ios::in);
+    buf->sgetn(static_cast<char *>(buffer), size);
+    fileSize = size;
+    file.close();
+    return true;
+}
+
+/**
+ * @brief Write data to file
+ * @param [in] filePath: file path
+ * @param [in] buffer: data to write to file
+ * @param [in] size: size to write
+ * @return write result
+ */
+bool WriteFile(const std::string &filePath, const void *buffer, size_t size)
+{
+    if (buffer == nullptr) {
+        ERROR_LOG("Write file failed. buffer is nullptr");
+        return false;
+    }
+
+    int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE);
+    if (fd < 0) {
+        ERROR_LOG("Open file failed. path = %s", filePath.c_str());
+        return false;
+    }
+
+    auto writeSize = write(fd, buffer, size);
+    (void) close(fd);
+    if (writeSize != size) {
+        ERROR_LOG("Write file Failed.");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T>
+void DoPrintData(const T *data, size_t count, size_t elementsPerRow)
+{
+    assert(elementsPerRow != 0);
+    for (size_t i = 0; i < count; ++i) {
+        std::cout << std::setw(10) << data[i];
+        if (i % elementsPerRow == elementsPerRow - 1) {
+            std::cout << std::endl;
+        }
+    }
+}
+
+void DoPrintHalfData(const aclFloat16 *data, size_t count, size_t elementsPerRow)
+{
+    assert(elementsPerRow != 0);
+    for (size_t i = 0; i < count; ++i) {
+        std::cout << std::setw(10) << std::setprecision(6) << aclFloat16ToFloat(data[i]);
+        if (i % elementsPerRow == elementsPerRow - 1) {
+            std::cout << std::endl;
+        }
+    }
+}
+
+void PrintData(const void *data, size_t count, printDataType dataType, size_t elementsPerRow=16)
+{
+    if (data == nullptr) {
+        ERROR_LOG("Print data failed. data is nullptr");
+        return;
+    }
+
+    switch (dataType) {
+        case BOOL:
+            DoPrintData(reinterpret_cast<const bool *>(data), count, elementsPerRow);
+            break;
+        case INT8_T:
+            DoPrintData(reinterpret_cast<const int8_t *>(data), count, elementsPerRow);
+            break;
+        case UINT8_T:
+            DoPrintData(reinterpret_cast<const uint8_t *>(data), count, elementsPerRow);
+            break;
+        case INT16_T:
+            DoPrintData(reinterpret_cast<const int16_t *>(data), count, elementsPerRow);
+            break;
+        case UINT16_T:
+            DoPrintData(reinterpret_cast<const uint16_t *>(data), count, elementsPerRow);
+            break;
+        case INT32_T:
+            DoPrintData(reinterpret_cast<const int32_t *>(data), count, elementsPerRow);
+            break;
+        case UINT32_T:
+            DoPrintData(reinterpret_cast<const uint32_t *>(data), count, elementsPerRow);
+            break;
+        case INT64_T:
+            DoPrintData(reinterpret_cast<const int64_t *>(data), count, elementsPerRow);
+            break;
+        case UINT64_T:
+            DoPrintData(reinterpret_cast<const uint64_t *>(data), count, elementsPerRow);
+            break;
+        case HALF:
+            DoPrintHalfData(reinterpret_cast<const aclFloat16 *>(data), count, elementsPerRow);
+            break;
+        case FLOAT:
+            DoPrintData(reinterpret_cast<const float *>(data), count, elementsPerRow);
+            break;
+        case DOUBLE:
+            DoPrintData(reinterpret_cast<const double *>(data), count, elementsPerRow);
+            break;
+        default:
+            ERROR_LOG("Unsupported type: %d", dataType);
+    }
+    std::cout << std::endl;
+}
+#endif // DATA_UTILS_H
diff --git a/include/graphblas/ascend/exec.hpp b/include/graphblas/ascend/exec.hpp
new file mode 100644
index 000000000..c7807c6c5
--- /dev/null
+++ b/include/graphblas/ascend/exec.hpp
@@ -0,0 +1,104 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Implements the launcher for the Ascend backend.
+ *
+ * @author A. N. Yzelman
+ * @date 12th of September, 2023
+ */
+
+#ifndef _H_GRB_ASCEND_EXEC
+#define _H_GRB_ASCEND_EXEC
+
+#include <stdexcept>
+#include <string>
+
+#include "init.hpp"
+
+
+namespace grb {
+
+	/** The Launcher class is based on that of the reference backend */
+	template< EXEC_MODE mode >
+	class Launcher< mode, ascend > {
+
+		private:
+
+			Launcher< mode, reference > ref;
+
+		public:
+
+			/**
+			 * This implementation only accepts a single user process. It ignores
+			 * \a hostname and \a port.
+			 */
+			Launcher(
+				const size_t process_id = 0,
+				const size_t nprocs = 1,
+				const std::string hostname = "localhost",
+				const std::string port = "0"
+			) {
+				// ignore hostname and port
+				(void) hostname;
+				(void) port;
+				// sanity checks
+				if( nprocs != 1 ) {
+					throw std::invalid_argument( "Total number of user processes must be "
+						"exactly one when using the ascend implementation."
+					);
+				}
+				if( process_id != 0 ) {
+					throw std::invalid_argument( "Process ID must always be zero in the "
+						"ascend implementation."
+					);
+				}
+			}
+
+			/** No implementation notes.
*/ + ~Launcher() {} + + /** exec is based on that of the reference backend */ + template< typename U > + RC exec( + void ( *grb_program )( const void *, const size_t, U & ), + const void * data_in, const size_t in_size, + U &data_out, const bool broadcast = false + ) const { + return ref.exec( grb_program, data_in, in_size, data_out, broadcast ); + } + + /** exec is based on that of the reference backend */ + template< typename T, typename U > + RC exec( + void ( *grb_program )( const T &, U & ), + const T &data_in, U &data_out, + const bool broadcast = false + ) { + return ref.exec( grb_program, data_in, data_out, broadcast ); + } + + /** finalize is based on that of the reference backend */ + grb::RC finalize() { return ref.finalize(); } + }; + +} // namespace grb + +#endif // end ``_H_GRB_ASCEND_EXEC'' + diff --git a/include/graphblas/ascend/forward.hpp b/include/graphblas/ascend/forward.hpp new file mode 100644 index 000000000..9d2e2fbec --- /dev/null +++ b/include/graphblas/ascend/forward.hpp @@ -0,0 +1,51 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Forward declarations required by the Ascend backend. + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ASCEND_FORWARD +#define _H_GRB_ASCEND_FORWARD + + +namespace grb { + + // The eWiseLambda is a friend of matrix but defined in blas2. Therefore it is + // forward-declared and this forward definition file included from both + // matrix.hpp and blas2.hpp + template< + class ActiveDistribution = internal::Distribution< ascend >, + typename Func, typename DataType, + typename RIT, typename CIT, typename NIT + > + RC eWiseLambda( + const Func f, + const Matrix< DataType, ascend, RIT, CIT, NIT > &A, + const size_t s = 0, const size_t P = 1 + ); + // end eWiseLambda declarations + +} // namespace grb + +#endif // end ``_H_GRB_ASCEND_FORWARD'' + diff --git a/include/graphblas/ascend/grid.hpp b/include/graphblas/ascend/grid.hpp new file mode 100644 index 000000000..7b0255213 --- /dev/null +++ b/include/graphblas/ascend/grid.hpp @@ -0,0 +1,186 @@ +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @author A. N. 
Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_ALP_ASCEND_GRID +#define _H_ALP_ASCEND_GRID + +#include +#include + +#include +#include +#include +#include + +namespace alp +{ + namespace internal + { + extern AscendLazyEvaluation ale; + } +} + +namespace alp { + + namespace internal { + + class iGrid { + + private: + size_t process_order; + size_t problem_order; + + public: + iGrid( size_t proc, size_t prob ); + size_t getProcessOrder() const noexcept; + size_t getProblemOrder() const noexcept; + std::string processSize( const size_t k ) const noexcept; + std::string processMode( const size_t k ) const noexcept; + std::string problemSize( const size_t k ) const noexcept; + std::string problemMode( const size_t k ) const noexcept; + std::string problemMainMode( const size_t k ) const noexcept; + std::string problemTileMode( const size_t k ) const noexcept; + std::string tileSize( const size_t k ) const noexcept; + }; + + } + + /** + * Specific to the ALP/Ascend backend, this class maps problem spaces on + * process grids in a symbolic fashion. + */ + template< size_t process_order, size_t problem_order > + class Grid { + + private: + // problem mesh related state: +// std::vector< std::string > problem_sizes, problem_space, chunk_sizes; + + public: + Grid() noexcept; + std::string processSize( const size_t k ) const noexcept; + std::string processMode( const size_t k ) const noexcept; + std::string problemSize( const size_t k ) const noexcept; + std::string problemMode( const size_t k ) const noexcept; + std::string problemMainMode( const size_t k ) const noexcept; + std::string problemTileMode( const size_t k ) const noexcept; + std::string tileSize( const size_t k ) const noexcept; +// std::string chunkSize( const size_t k ) const noexcept; + grb::RC forEach( const std::vector< int > axes, const std::function < void( void ) > func ) const; + }; + + template< size_t process_order, size_t problem_order > + Grid< process_order, problem_order >::Grid() noexcept + { +// for( size_t k = 0; k < problem_order; ++k ) { +// chunk_sizes.push_back( "BLOCK_LENGTH" + k ); +// } + } + + template< size_t process_order, size_t problem_order > + std::string Grid< process_order, problem_order >::processSize( const size_t k ) const noexcept { + return "p" + std::to_string( k ); + } + + template< size_t process_order, size_t problem_order > + std::string Grid< process_order, problem_order >::processMode( const size_t k ) const noexcept { + return "a" + std::to_string( k ); + } + + template< size_t process_order, size_t problem_order > + std::string Grid< process_order, problem_order >::problemSize( const size_t k ) const noexcept { + return "n" + std::to_string( k ); + } + + template< size_t process_order, size_t problem_order > + std::string Grid< process_order, problem_order >::problemMode( const size_t k ) const noexcept { + return "i" + std::to_string( k ); + } + + template< size_t process_order, size_t problem_order > + std::string Grid< process_order, problem_order >::problemMainMode( const size_t k ) const noexcept { + return "z" + std::to_string( k ); + } + + template< size_t process_order, size_t problem_order > + std::string Grid< process_order, problem_order >::problemTileMode( const size_t k ) const noexcept { + return "t" + std::to_string( k ); + } + + template< size_t process_order, size_t problem_order > + std::string Grid< process_order, problem_order >::tileSize( const size_t k ) const noexcept { + return "tile_size" + std::to_string( k ); + } + +/* template< size_t 
process_order, size_t problem_order > + std::string Grid< process_order, problem_order >::chunkSize( const size_t k ) const noexcept { + return chunk_sizes[ k ]; + } +*/ + template< size_t process_order, size_t problem_order > + grb::RC Grid< process_order, problem_order >::forEach( const std::vector< int > axes, const std::function < void( void ) > func ) const { + + alp::internal::OpGen gen(); + + if( internal::OpGen::lastAxes.size() > 0 && internal::OpGen::lastAxes != axes ) { + alp::internal::ale.addPipeline(); + } + + if( internal::invalidForEachAxes( axes ) == true ) { + std::cerr << "The axes of a nested forEach cannot overlap with the axes of another forEach." << std::endl; + std::abort(); + } + + internal::OpGen::forEachAxes.push_back( axes ); + + // indicate the beginning of the forEach + internal::OpGen::forEachLevel++; + + // TODO: this is currently used only by the Tensor class in the getView method + // perhaps the getView should be a method of the Grid class +// internal::OpGen::parallelAxes = axes; + + // the current design assumes that each forEach is a new pipeline + // which is explicitly added here, later we need to figure out + // how we determine the creation of a pipeline +// alp::internal::ale.addPipeline( axes ); + + // TODO: emit for-loop intro + func(); + // TODO: emit for-loop outro + + // before leaving a forEach loop, any getView of an input Tensor + // should match with an implicit Stage for freeing any allocated memory + alp::internal::ale.insertFreeInputTensorStages( internal::vectorOfVectorsToVector( internal::OpGen::forEachAxes ) ); + + // indicate the end of the forEach + internal::OpGen::forEachLevel--; + + internal::OpGen::forEachAxes.pop_back(); + + internal::OpGen::lastAxes = axes; + + return grb::SUCCESS; + } +} + +#endif + diff --git a/include/graphblas/ascend/init.hpp b/include/graphblas/ascend/init.hpp new file mode 100644 index 000000000..e6d35829d --- /dev/null +++ b/include/graphblas/ascend/init.hpp @@ -0,0 +1,61 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Provides the initialisation and finalisation routines for the ascend + * backend. + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ASCEND_INIT +#define _H_GRB_ASCEND_INIT + +#include +#include + + +namespace grb { + + template<> + RC init< ascend >( const size_t, const size_t, void * const ); + + template<> + RC finalize< ascend >(); + + namespace internal { + + /** Internal state of the ascend backend. */ + class ASCEND { + + friend RC init< ascend >( const size_t, const size_t, void * const ); + + private: + + public: + + }; + + } + +} // namespace grb + +#endif //``end _H_GRB_ASCEND_INIT'' + diff --git a/include/graphblas/ascend/io.hpp b/include/graphblas/ascend/io.hpp new file mode 100644 index 000000000..9e2177874 --- /dev/null +++ b/include/graphblas/ascend/io.hpp @@ -0,0 +1,1353 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Provides the I/O primitives for the Ascend backend. + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ASCEND_IO +#define _H_GRB_ASCEND_IO + +#include +#include +#include + +#include + +#include "boolean_dispatcher_io.hpp" + +#define NO_CAST_ASSERT( x, y, z ) \ + static_assert( x, \ + "\n\n" \ + "********************************************************************" \ + "********************************************************************" \ + "******************************\n" \ + "* ERROR | " y " " z ".\n" \ + "********************************************************************" \ + "********************************************************************" \ + "******************************\n" \ + "* Possible fix 1 | Remove no_casting from the template parameters " \ + "in this call to " y ".\n" \ + "* Possible fix 2 | Provide a value input iterator with element " \ + "types that match the output vector element type.\n" \ + "* Possible fix 3 | If applicable, provide an index input iterator " \ + "with element types that are integral.\n" \ + "********************************************************************" \ + "********************************************************************" \ + "******************************\n" ); + + +namespace grb { + + namespace internal { + + extern LazyEvaluation le; + + } + +} + +namespace grb { + + /** + * \defgroup IO Data Ingestion -- ascend backend + * @{ + */ + + template< typename InputType, typename RIT, typename CIT, typename NIT > + uintptr_t getID( const Matrix< InputType, ascend, RIT, CIT, NIT > &A ) { + return getID( internal::getRefMatrix( A ) ); + } + + template< typename DataType, typename Coords > + size_t size( const Vector< DataType, ascend, Coords > &x ) noexcept { + return internal::getCoordinates( x ).size(); + } + + template< typename InputType, typename RIT, typename CIT, typename NIT > + size_t nrows( + const Matrix< InputType, ascend, RIT, CIT, NIT > &A + ) noexcept { + return nrows( internal::getRefMatrix( A ) ); + } + + template< typename InputType, typename RIT, typename CIT, typename NIT > + size_t ncols( + const Matrix< InputType, ascend, RIT, CIT, NIT > &A + ) noexcept { + return ncols( internal::getRefMatrix( A ) ); + } + + template< typename DataType, typename Coords > + size_t nnz( const Vector< DataType, ascend, Coords > &x ) noexcept { + internal::le.execution( &x ); + return internal::getCoordinates( x ).nonzeroes(); + } + + template< typename InputType, typename RIT, typename CIT, typename NIT > + size_t nnz( + const Matrix< InputType, ascend, RIT, CIT, NIT > &A + ) noexcept { + return nnz( internal::getRefMatrix( A ) ); + } + + template< typename DataType, typename Coords > + size_t capacity( const Vector< DataType, ascend, Coords > &x ) noexcept { + return internal::getCoordinates( x ).size(); + } + + template< typename DataType, typename RIT, typename CIT, typename NIT > + size_t capacity( + const 
Matrix< DataType, ascend, RIT, CIT, NIT > &A + ) noexcept { + return capacity( internal::getRefMatrix( A ) ); + } + + template< typename DataType, typename Coords > + RC clear( Vector< DataType, ascend, Coords > &x ) noexcept { + internal::le.execution( &x ); + internal::getCoordinates( x ).clear(); + assert( false ); + return UNSUPPORTED; + } + + template< typename InputType, typename RIT, typename CIT, typename NIT > + RC clear( + Matrix< InputType, ascend, RIT, CIT, NIT > &A + ) noexcept { + return clear( internal::getRefMatrix( A ) ); + } + + template< + typename InputType, + typename Coords + > + RC resize( + Vector< InputType, ascend, Coords > &x, + const size_t new_nz + ) noexcept { + internal::le.execution( &x ); +#ifdef _DEBUG + std::cerr << "In grb::resize (vector, ascend)\n"; +#endif + // this cannot wait until after the below check, as the spec defines that + // anything is OK for an empty vector + if( new_nz == 0 ) { + return grb::clear( x ); + } + + // check if we have a mismatch + if( new_nz > grb::size( x ) ) { +#ifdef _DEBUG + std::cerr << "\t requested capacity of " << new_nz << ", " + << "expected a value smaller than or equal to " + << size( x ) << "\n"; +#endif + return ILLEGAL; + } + + // in the ascend implementation, vectors are of static size + // so this function immediately succeeds. However, all existing contents + // must be removed + return grb::clear( x ); + } + + template< + typename InputType, + typename RIT, + typename CIT, + typename NIT + > + RC resize( + Matrix< InputType, ascend, RIT, CIT, NIT > &A, + const size_t new_nz + ) noexcept { + return resize( internal::getRefMatrix( A ), new_nz ); + } + + template< + Descriptor descr = descriptors::no_operation, + typename DataType, + typename T, + typename Coords + > + RC set( + Vector< DataType, ascend, Coords > &x, + const T val, + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< DataType >::value && + !grb::is_object< T >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< DataType, T >::value + ), "grb::set (Vector, unmasked)", + "called with a value type that does not match that of the given vector" + ); + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + RC ret = SUCCESS; + + // pre-cast value to be copied + const DataType toCopy = static_cast< DataType >( val ); + DataType * const raw = internal::getRaw( x ); + const size_t n = internal::getCoordinates( x ).size(); + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = [&x, toCopy, raw] ( + internal::Pipeline &pipeline, + size_t lower_bound, size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage set(x, val) in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + bool already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_output ) { +#endif + Coords local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + + local_x.local_assignAllNotAlreadyAssigned(); + assert( local_x.nonzeroes() 
== local_x.size() ); + + internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + for( size_t i = lower_bound; i < upper_bound; i++ ) { + raw[ i ] = internal::template ValueOrIndex< + descr, DataType, DataType + >::getFromScalar( toCopy, i ); + } + + return SUCCESS; + }; + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::IO_SET_SCALAR, + n, sizeof( DataType ), dense_descr, true, + &x, nullptr, + &internal::getCoordinates( x ), nullptr, + nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: SET(x, val)" << std::endl; +#endif + return ret; + } + + namespace internal { + + template< + Descriptor descr, +#ifdef GRB_BOOLEAN_DISPATCHER + bool loop_over_vector_length, + bool already_dense_mask, + bool mask_is_dense, +#endif + typename DataType, + typename MaskType, + typename T, + typename Coords + > + RC masked_set( +#ifndef GRB_BOOLEAN_DISPATCHER + bool loop_over_vector_length, + bool already_dense_mask, + bool mask_is_dense, +#endif + const size_t lower_bound, + const size_t upper_bound, + Coords &local_x, + const Coords &local_mask, + Vector< DataType, ascend, Coords > &x, + const Vector< MaskType, ascend, Coords > &m, + const T val + ) { + // pre-cast value to be copied + const DataType toCopy = static_cast< DataType >( val ); + + DataType * const raw = internal::getRaw( x ); + const MaskType * const m_p = internal::getRaw( m ); + +#ifdef _DEBUG + if( loop_over_vector_length ) { + std::cout << "\t using loop of size n (the vector length)\n"; + } else { + std::cout << "\t using loop of size nz (the number of nonzeroes in the " + << "vector)\n"; + } +#endif + + const size_t local_n = upper_bound - lower_bound; + const size_t local_mask_nz = already_dense_mask + ? local_n + : local_mask.nonzeroes(); + + const size_t local_size_n = loop_over_vector_length + ? local_x.size() + : local_mask_nz; + + for( size_t k = 0; k < local_size_n; ++k ) { + + const size_t index = ( ( loop_over_vector_length || already_dense_mask ) + ? 
k + : local_mask.index( k ) ) + lower_bound; + assert( index < internal::getCoordinates( x ).size() ); + if( already_dense_mask ) { + if( !internal::getCoordinates( m ).template mask< descr >( index, m_p ) ) { + continue; + } + } else { + if( !local_mask.template mask< descr >( + index - lower_bound, m_p + lower_bound + ) ) { + continue; + } + } + if( !mask_is_dense ) { + (void) local_x.assign( index - lower_bound ); + } + raw[ index ] = internal::ValueOrIndex< + descr, DataType, DataType + >::getFromScalar( toCopy, index ); + } + + assert( false ); + return UNSUPPORTED; + } + } + + template< + Descriptor descr = descriptors::no_operation, + typename DataType, + typename MaskType, + typename T, + typename Coords + > + RC set( + Vector< DataType, ascend, Coords > &x, + const Vector< MaskType, ascend, Coords > &m, + const T val, + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< DataType >::value && !grb::is_object< T >::value, + void >::type * const = nullptr + ) { +#ifdef _DEBUG + std::cout << "In grb::set (vector-to-value, masked)\n"; +#endif + // static sanity checks + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< DataType, T >::value ), "grb::set (Vector to scalar, masked)", + "called with a value type that does not match that of the given " + "vector" + ); + + // catch empty mask + if( size( m ) == 0 ) { + return set< descr >( x, val, phase ); + } + + // dynamic sanity checks + const size_t sizex = size( x ); + if( sizex != size( m ) ) { + return MISMATCH; + } + + // handle trivial resize + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + constexpr const bool dense_mask = dense_descr && + (descr & descriptors::structural) && + !(descr & descriptors::invert_mask); + + // then source is a pattern vector, just copy its pattern + internal::Pipeline::stage_type func = [&x, &m, val] ( + internal::Pipeline &pipeline, + size_t lower_bound, size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage set(x, m, val) in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + (void) pipeline; + + Coords local_mask, local_x; + const size_t local_n = upper_bound - lower_bound; + size_t local_x_nz = local_n; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + constexpr const bool already_dense_vectors = dense_descr; +#endif + + bool already_dense_mask = true; + + const bool mask_is_dense = (descr & descriptors::structural) && + !(descr & descriptors::invert_mask) && already_dense_vectors; + + // for out-of-place operations with a mask and a scalar input, whether the + // output is dense or not depends on the mask + if( !mask_is_dense ) { + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); + if( dense_descr && local_x_nz < local_n ) { + return ILLEGAL; + } + } + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_mask = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( m ) ); + if( !already_dense_mask ) { +#else + already_dense_mask = false; +#endif + local_mask = internal::getCoordinates( m ).asyncSubset( lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( !mask_is_dense ) { + 
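+				// the masked set-to-scalar is an out-of-place primitive: when the mask
+				// is not statically known to be dense, first empty the local view of
+				// the output; the tile that starts at position zero additionally
+				// resets the global nonzero counter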
local_x.local_clear(); + if( lower_bound == 0 ) { + internal::getCoordinates( x ).reset_global_nnz_counter(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + pipeline.markMaybeSparseVector( &internal::getCoordinates( x ) ); +#endif + if( dense_descr ) { + pipeline.markMaybeSparseDenseDescriptorVerification( + &internal::getCoordinates( x ) ); + } + } + } + + const bool loop_over_vector_length = ( descr & descriptors::invert_mask ) || + ( 4 * local_mask.nonzeroes() > 3 * local_mask.size() ); + +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_masked_set< +#else + rc = internal::masked_set< +#endif + descr, DataType, MaskType, T, Coords + >( + loop_over_vector_length, + already_dense_mask, mask_is_dense, + lower_bound, upper_bound, + local_x, local_mask, x, m, val + ); + + if( !mask_is_dense ) { + internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::IO_SET_MASKED_SCALAR, + sizex, sizeof( DataType ), + dense_descr, dense_mask, + &x, nullptr, &internal::getCoordinates( x ), nullptr, + &m, nullptr, nullptr, nullptr, + &internal::getCoordinates( m ), nullptr, nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: set(x, m, val)" << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + typename DataType, + typename T, + typename Coords + > + RC setElement( + Vector< DataType, ascend, Coords > &x, + const T val, + const size_t i, + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< DataType >::value && + !grb::is_object< T >::value, void + >::type * const = nullptr + ) { + internal::le.execution( &x ); + + // static sanity checks + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< DataType, T >::value ), + "grb::set (Vector, at index)", + "called with a value type that does not match that of the given " + "vector" + ); + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + // dynamic sanity checks + if( i >= size( x ) ) { + return MISMATCH; + } + if( (descr & descriptors::dense) && nnz( x ) < size( x ) ) { + return ILLEGAL; + } + + // do set + (void)internal::getCoordinates( x ).assign( i ); + internal::getRaw( x )[ i ] = static_cast< DataType >( val ); + +#ifdef _DEBUG + std::cout << "setElement (ascend) set index " << i << " to value " + << internal::getRaw( x )[ i ] << "\n"; +#endif + assert( false ); + return UNSUPPORTED; + } + + namespace internal { + + template< + Descriptor descr, + bool out_is_void, + bool in_is_void, + bool sparse, +#ifdef GRB_BOOLEAN_DISPATCHER + bool already_dense_vectors, + bool already_dense_input, +#endif + typename OutputType, + typename InputType, + typename Coords + > + RC set_generic( +#ifndef GRB_BOOLEAN_DISPATCHER + bool already_dense_vectors, + bool already_dense_input, +#endif + const size_t lower_bound, + const size_t upper_bound, + Coords &local_x, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &x, + const Vector< InputType, ascend, Coords > &y + ) { + const size_t local_n = upper_bound - lower_bound; + const size_t local_y_nz = already_dense_input + ? 
local_n + : local_y.nonzeroes(); + + OutputType * __restrict__ const dst = internal::getRaw( x ); + const InputType * __restrict__ const src = internal::getRaw( y ); + + if( sparse ) { + if( src == nullptr && dst == nullptr ) { + for( size_t i = 0; i < local_y_nz; ++i ) { + const size_t index = ( already_dense_input ) ? i : local_y.index( i ); + if( !already_dense_vectors ) { + (void) local_x.assign( index ); + } + } + } else { +#ifndef NDEBUG + if( src == nullptr ) { + assert( dst == nullptr ); + } +#endif + for( size_t i = 0; i < local_y_nz; ++i ) { + const size_t index = ( ( already_dense_input ) + ? i + : local_y.index( i ) ) + lower_bound; + if( !already_dense_vectors ) { + (void) local_x.assign( index - lower_bound ); + } + if( !out_is_void && !in_is_void ) { + dst[ index ] = internal::setIndexOrValue< descr, OutputType >( index, + src[ index ] ); + } + } + } + } else { + if( !( src == nullptr && dst == nullptr ) ) { +#ifndef NDEBUG + if( src == nullptr ) { + assert( dst == nullptr ); + } +#endif + for( size_t i = lower_bound; i < upper_bound; ++i ) { + if( !out_is_void && !in_is_void ) { + dst[ i ] = src[ i ]; + } + } + } + } + + assert( false ); + return UNSUPPORTED; + } + } + + template< + Descriptor descr = descriptors::no_operation, + typename OutputType, + typename InputType, + typename Coords + > + RC set( + Vector< OutputType, ascend, Coords > &x, + const Vector< InputType, ascend, Coords > &y, + const Phase &phase = EXECUTE + ) { + // static sanity checks + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< OutputType, InputType >::value ), + "grb::copy (Vector)", + "called with vector parameters whose element data types do not match" + ); + constexpr bool out_is_void = std::is_void< OutputType >::value; + constexpr bool in_is_void = std::is_void< OutputType >::value; + static_assert( !in_is_void || out_is_void, + "grb::set (ascend, vector <- vector, masked): " + "if input is void, then the output must be also" ); + static_assert( !(descr & descriptors::use_index) || !out_is_void, + "grb::set (ascend, vector <- vector, masked): " + "use_index descriptor cannot be set if output vector is void" ); + + //get length + const size_t n = internal::getCoordinates( y ).size(); + // check contract + if( n != size( x ) ) { + return MISMATCH; + } + if( n == 0 ) { + return SUCCESS; + } + if( getID( x ) == getID( y ) ) { + return ILLEGAL; + } + + // on resize + if( phase == RESIZE ) { + return SUCCESS; + } + + // on execute + assert( phase == EXECUTE ); + + RC ret = SUCCESS; + + constexpr bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = [&x, &y] ( + internal::Pipeline &pipeline, + size_t lower_bound, size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage set(x, y) in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_x, local_y; + const size_t local_n = upper_bound - lower_bound; + size_t local_y_nz = local_n; + bool sparse = false; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + + bool already_dense_input = true; + + if( !already_dense_vectors ) { + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_input = pipeline.containsAlreadyDenseVector( + 
&internal::getCoordinates( y ) ); + if( !already_dense_input ) { +#else + already_dense_input = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); + local_y_nz = local_y.nonzeroes(); + if( local_y_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( !already_dense_vectors ) { + if( lower_bound == 0 ) { + internal::getCoordinates( x ).reset_global_nnz_counter(); + } + } + + if( sparse ) { + // this primitive is out-of-place, thus make the output empty + if( !already_dense_vectors ) { + local_x.local_clear(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + pipeline.markMaybeSparseVector( &internal::getCoordinates( x ) ); +#endif + } + +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_set_generic< +#else + rc = internal::set_generic< +#endif + descr, out_is_void, in_is_void, true + >( + already_dense_vectors, already_dense_input, + lower_bound, upper_bound, + local_x, local_y, x, y + ); + } else { + if( !already_dense_vectors ) { + local_x.local_assignAll(); + } + +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_set_generic< +#else + rc = internal::set_generic< +#endif + descr, out_is_void, in_is_void, false + >( + already_dense_vectors, already_dense_input, + lower_bound, upper_bound, + local_x, local_y, x, y + ); + } + + if( !already_dense_vectors ) { + internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::IO_SET_VECTOR, + n, sizeof( OutputType ), dense_descr, true, + getID( x ), + &x, nullptr, &internal::getCoordinates( x ), nullptr, + getID( y ), SIZE_MAX, SIZE_MAX, SIZE_MAX, + &y, nullptr, nullptr, nullptr, + &internal::getCoordinates( y ), nullptr, nullptr, nullptr, + SIZE_MAX, nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: set(x, y)" << std::endl; +#endif + return ret; + } + + namespace internal { + + template< + Descriptor descr, + bool out_is_void, + bool in_is_void, +#ifdef GRB_BOOLEAN_DISPATCHER + bool loop_over_y, + bool already_dense_input_y, + bool already_dense_mask, + bool mask_is_dense, +#endif + typename OutputType, + typename MaskType, + typename InputType, + typename Coords + > + RC masked_set( +#ifndef GRB_BOOLEAN_DISPATCHER + bool loop_over_y, + bool already_dense_input_y, + bool already_dense_mask, + bool mask_is_dense, +#endif + const size_t lower_bound, + const size_t upper_bound, + Coords &local_x, + const Coords &local_mask, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &x, + const Vector< MaskType, ascend, Coords > &mask, + const Vector< InputType, ascend, Coords > &y + ) { + const size_t local_n = upper_bound - lower_bound; + const size_t local_y_nz = already_dense_input_y + ? local_n + : local_y.nonzeroes(); + const size_t local_mask_nz = already_dense_mask + ? local_n + : local_mask.nonzeroes(); + + const size_t n = loop_over_y ? local_y_nz : local_mask_nz; + + for( size_t k = 0; k < n; ++k ) { + const size_t i = ( loop_over_y + ? ( already_dense_input_y ? k : local_y.index( k ) ) + : ( already_dense_mask ? 
k : local_mask.index( k ) ) + ) + lower_bound; + if( already_dense_mask ) { + if( !internal::getCoordinates( mask ).template mask< descr >( + i, internal::getRaw( mask ) + ) ) { + continue; + } + } else { + if( !local_mask.template mask< descr >( + i - lower_bound, internal::getRaw( mask ) + lower_bound + ) ) { + continue; + } + } + if( loop_over_y || already_dense_input_y || + local_y.assigned( i - lower_bound ) + ) { + if( !out_is_void && !in_is_void ) { + if( !mask_is_dense ) { + (void) local_x.assign( i - lower_bound ); + } + internal::getRaw( x )[ i ] = internal::ValueOrIndex< + descr, OutputType, InputType + >::getFromArray( + internal::getRaw( y ), + [] (const size_t i) {return i;}, + i + ); + } + } + } + + assert( false ); + return UNSUPPORTED; + } + } + + template< + Descriptor descr = descriptors::no_operation, + typename OutputType, + typename MaskType, + typename InputType, + typename Coords + > + RC set( + Vector< OutputType, ascend, Coords > &x, + const Vector< MaskType, ascend, Coords > &mask, + const Vector< InputType, ascend, Coords > &y, + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< InputType >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< OutputType, InputType >::value ), + "grb::set (Vector)", + "called with vector parameters whose element data types do not match" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< MaskType, bool >::value ), + "grb::set (Vector)", + "called with non-bool mask element types" ); + constexpr bool out_is_void = std::is_void< OutputType >::value; + constexpr bool in_is_void = std::is_void< OutputType >::value; + static_assert( !in_is_void || out_is_void, + "grb::set (ascend, vector <- vector, masked): " + "if input is void, then the output must be also" ); + static_assert( !(descr & descriptors::use_index) || !out_is_void, + "grb::set (ascend, vector <- vector, masked): " + "use_index descriptor cannot be set if output vector is void" ); + + // catch contract violations + const size_t size = grb::size( y ); + if( size != grb::size( x ) ) { + return MISMATCH; + } + if( size == 0 ) { + return SUCCESS; + } + if( getID( x ) == getID( y ) ) { + return ILLEGAL; + } + + // delegate if possible + if( grb::size( mask ) == 0 ) { + return set( x, y ); + } + + // additional contract check + if( size != grb::size( mask ) ) { + return MISMATCH; + } + + // on resize + if( phase == RESIZE ) { + return SUCCESS; + } + + // on execute + assert( phase == EXECUTE ); + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + constexpr const bool dense_mask = dense_descr && + (descr & descriptors::structural) && + !(descr & descriptors::invert_mask); + + internal::Pipeline::stage_type func = [&x, &mask, &y] ( + internal::Pipeline &pipeline, + size_t lower_bound, size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage set(x, mask, y) in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_mask, local_x, local_y; + const size_t local_n = upper_bound - lower_bound; + size_t local_mask_nz = local_n; + size_t local_x_nz = local_n; + size_t local_y_nz = local_n; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + 
pipeline.allAlreadyDenseVectors(); +#else + constexpr const bool already_dense_vectors = dense_descr; +#endif + + bool already_dense_mask = true; + bool already_dense_input_y = true; + + // make the vector empty unless the dense descriptor is provided + constexpr const bool mask_is_dense = (descr & descriptors::structural) && + !(descr & descriptors::invert_mask) && already_dense_vectors; + + if( !mask_is_dense ) { + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); + if( dense_descr && local_x_nz < local_n ) { + return ILLEGAL; + } + } + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_mask = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( mask ) ); + if( !already_dense_mask ) { +#else + already_dense_mask = false; +#endif + local_mask = internal::getCoordinates( mask ).asyncSubset( lower_bound, + upper_bound ); + local_mask_nz = local_mask.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input_y = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !already_dense_input_y ) { +#else + already_dense_input_y = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); + local_y_nz = local_y.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( !mask_is_dense ) { + local_x.local_clear(); + if( lower_bound == 0 ) { + internal::getCoordinates( x ).reset_global_nnz_counter(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + pipeline.markMaybeSparseVector( &internal::getCoordinates( x ) ); +#endif + if( dense_descr ) { + pipeline.markMaybeSparseDenseDescriptorVerification( + &internal::getCoordinates( x ) ); + } + } + } + + // choose optimal loop size + const bool loop_over_y = (descr & descriptors::invert_mask) || + ( local_y_nz < local_mask_nz ); + +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_masked_set< +#else + rc = internal::masked_set< +#endif + descr, out_is_void, in_is_void + >( + loop_over_y, + already_dense_input_y, already_dense_mask, mask_is_dense, + lower_bound, upper_bound, + local_x, local_mask, local_y, + x, mask, y + ); + + if( !mask_is_dense ) { + internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? 
ret : internal::le.addStage( + std::move( func ), + internal::Opcode::IO_SET_MASKED_VECTOR, + size, sizeof( OutputType ), dense_descr, dense_mask, + &x, nullptr, &internal::getCoordinates( x ), nullptr, + &mask, &y, nullptr, nullptr, + &internal::getCoordinates( mask ), &internal::getCoordinates( y ), + nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: set(x, mask, y)" << std::endl; +#endif + return ret; + } + + namespace internal { + + template< + bool A_is_mask, + Descriptor descr, + typename OutputType, + typename InputType1, typename InputType2 = const OutputType, + typename RIT1, typename CIT1, typename NIT1, + typename RIT2, typename CIT2, typename NIT2 + > + RC set( + Matrix< OutputType, ascend, RIT1, CIT1, NIT1 > &C, + const Matrix< InputType1, ascend, RIT2, CIT2, NIT2 > &A, + const InputType2 * __restrict__ id = nullptr + ) noexcept { + // ascend execution is not supported + // first, execute any computation that is not completed + grb::internal::le.execution(); + + // second, delegate to the reference backend + return set< A_is_mask, descr, OutputType, InputType1, InputType2 >( + internal::getRefMatrix( C ), internal::getRefMatrix( A ), id ); + } + + } // end namespace internal::grb + + template< + Descriptor descr = descriptors::no_operation, + typename OutputType, typename InputType, + typename RIT1, typename CIT1, typename NIT1, + typename RIT2, typename CIT2, typename NIT2 + > + RC set( + Matrix< OutputType, ascend, RIT1, CIT1, NIT1 > &C, + const Matrix< InputType, ascend, RIT2, CIT2, NIT2 > &A, + const Phase &phase = EXECUTE + ) noexcept { + static_assert( std::is_same< OutputType, void >::value || + !std::is_same< InputType, void >::value, + "grb::set cannot interpret an input pattern matrix without a " + "semiring or a monoid. This interpretation is needed for " + "writing the non-pattern matrix output. Possible solutions: 1) " + "use a (monoid-based) foldl / foldr, 2) use a masked set, or " + "3) change the output of grb::set to a pattern matrix also." 
); +#ifdef _DEBUG + std::cout << "Called grb::set (matrix-to-matrix, ascend)" << std::endl; +#endif + // static checks + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< InputType, OutputType >::value + ), "grb::set", + "called with non-matching value types" ); + + // dynamic checks + assert( phase != TRY ); + + // delegate + if( phase == RESIZE ) { + return resize( C, nnz( A ) ); + } else { + assert( phase == EXECUTE ); + return internal::set< false, descr >( C, A ); + } + } + + template< + Descriptor descr = descriptors::no_operation, + typename OutputType, typename InputType1, typename InputType2, + typename RIT1, typename CIT1, typename NIT1, + typename RIT2, typename CIT2, typename NIT2 + > + RC set( + Matrix< OutputType, ascend, RIT1, CIT1, NIT1 > &C, + const Matrix< InputType1, ascend, RIT2, CIT2, NIT2 > &A, + const InputType2 &val, + const Phase &phase = EXECUTE + ) noexcept { + static_assert( !std::is_same< OutputType, void >::value, + "internal::grb::set (masked set to value): cannot have a pattern " + "matrix as output" ); +#ifdef _DEBUG + std::cout << "Called grb::set (matrix-to-value-masked, ascend)\n"; +#endif + // static checks + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< InputType2, OutputType >::value + ), "grb::set", + "called with non-matching value types" + ); + + // dynamic checks + assert( phase != TRY ); + + // delegate + if( phase == RESIZE ) { + return resize( C, nnz( A ) ); + } else { + assert( phase == EXECUTE ); + if( std::is_same< OutputType, void >::value ) { + return internal::set< false, descr >( C, A ); + } else { + return internal::set< true, descr >( C, A, &val ); + } + } + } + + template< + Descriptor descr = descriptors::no_operation, + typename InputType, + typename fwd_iterator, + typename Coords, + class Dup = operators::right_assign< InputType > + > + RC buildVector( + Vector< InputType, ascend, Coords > &x, + fwd_iterator start, + const fwd_iterator end, + const IOMode mode, + const Dup &dup = Dup() + ) { + return buildVector< descr, InputType, fwd_iterator, Coords, Dup >( + internal::getRefVector( x ), start, end, mode, dup ); + } + + template< + Descriptor descr = descriptors::no_operation, + typename InputType, + typename fwd_iterator1, + typename fwd_iterator2, + typename Coords, + class Dup = operators::right_assign< InputType > + > + RC buildVector( + Vector< InputType, ascend, Coords > &x, + fwd_iterator1 ind_start, + const fwd_iterator1 ind_end, + fwd_iterator2 val_start, + const fwd_iterator2 val_end, + const IOMode mode, + const Dup &dup = Dup() + ) { + internal::le.execution( &x ); + return buildVector< + descr, InputType, fwd_iterator1, fwd_iterator2, Coords, Dup + >( + internal::getRefVector( x ), ind_start, ind_end, val_start, val_end, + mode, dup + ); + } + + /** buildMatrixUnique is based on that of the reference backend */ + template< + Descriptor descr = descriptors::no_operation, + typename InputType, + typename RIT, + typename CIT, + typename NIT, + typename fwd_iterator + > + RC buildMatrixUnique( + Matrix< InputType, ascend, RIT, CIT, NIT > &A, + fwd_iterator start, + const fwd_iterator end, + const IOMode mode + ) { + return buildMatrixUnique< + descr, InputType, RIT, CIT, NIT, fwd_iterator + >( internal::getRefMatrix(A), start, end, mode ); + } + + template< + typename InputType, + typename Coords + > + uintptr_t getID( const Vector< InputType, ascend, Coords > &x ) { + return getID( internal::getRefVector( x ) ); + } + + template<> + RC wait< ascend >(); + + /** \internal 
Dispatch to base wait implementation */ + template< + typename InputType, + typename Coords, + typename ... Args + > + RC wait( + const Vector< InputType, ascend, Coords > &x, + const Args &... args + ) { + RC ret = internal::le.execution( &x ); + if( ret != SUCCESS ) { + return ret; + } + return wait( args... ); + } + + template< + typename InputType, + typename Coords + > + RC wait( const Vector< InputType, ascend, Coords > &x ) { + return internal::le.execution( &x ); + } + + /** \internal Dispatch to base wait implementation */ + template< + typename InputType, + typename RIT, typename CIT, typename NIT, + typename... Args + > + RC wait( + const Matrix< InputType, ascend, RIT, CIT, NIT > &A, + const Args &... args + ) { + (void) A; + //TODO: currently, matrices are read only and no action is required + // once the level-3 primitives are implemented + // the pipeline should be executed like for vectors + return wait( args... ); + } + + template< typename InputType, typename RIT, typename CIT, typename NIT > + RC wait( const Matrix< InputType, ascend > &A ) { + (void) A; + //TODO: currently, matrices are read only and no action is required + // once the level-3 primitives are implemented + // the pipeline should be executed like for vectors + //return wait( args... ); + assert( false ); + return UNSUPPORTED; + } + + /** @} */ + +} // namespace grb + +#undef NO_CAST_ASSERT + +#endif // end ``_H_GRB_ASCEND_IO + diff --git a/include/graphblas/ascend/lazy_evaluation.hpp b/include/graphblas/ascend/lazy_evaluation.hpp new file mode 100644 index 000000000..683129b16 --- /dev/null +++ b/include/graphblas/ascend/lazy_evaluation.hpp @@ -0,0 +1,80 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _H_ALP_ASCEND_LAZY_EVALUATION +#define _H_ALP_ASCEND_LAZY_EVALUATION + +/** + * To enable debugging information only for the ascend backend, the code + * should be combiled with the _ASCEND_DEBUG definition, without defining + * _DEBUG. If the code is compiled with _DEBUG, the debugging information for + * the ascend backend is enabled as well. + */ +#if !defined(_ASCEND_DEBUG) && defined(_DEBUG) + #define _ASCEND_DEBUG +#endif + +#include + +#include +#include +#include + +namespace alp { + + namespace internal { + + class Stage; + + /** + * Encodes a single pipeline that may be expanded, merged, or executed. 
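+		 *
+		 * More precisely, this class maintains the collection of pipelines built
+		 * up during lazy evaluation (see the pipelines vector below); addPipeline()
+		 * opens a new pipeline, while the addStage() overloads presumably append
+		 * stages to the most recently opened one.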
+ */ + class AscendLazyEvaluation { + + private: + + size_t num_pipelines; + std::vector< alp::internal::AscendPipeline > pipelines; + + + public: + + AscendLazyEvaluation(); + void addPipeline(); + void insertFreeInputTensorStages( const std::vector< int > &forEachAxes ); + const Tensor &store( const Tensor &output_tensor ); + void clear(); + void addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, const Tensor &tensor1, const double alpha, const std::vector< int > &forEachAxes ); + void addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, const Tensor &tensor1, const std::vector< int > &activeAxes, const std::vector< int > &forEachAxes ); + void addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, const Tensor &tensor1, const Tensor &tensor2, const std::vector< int > &activeAxes, const std::vector< int > &forEachAxes ); + void addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, const Tensor &tensor1, const Tensor &tensor2, const Tensor &tensor3, const std::vector< int > &activeAxes, const std::vector< int > &forEachAxes ); +// void addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, const Tensor &tensor1, const Tensor &tensor2, const Tensor &tensor3, const Tensor &tensor4, const std::vector< int > &activeAxes, const std::vector< int > &forEachAxes ); + void generateDeclarations( std::stringstream &declarations ); +// void generateConstructor( std::stringstream &constructor ); + void generateHostBody( std::stringstream &os, std::stringstream &analyticModelArgs, std::stringstream &analyticModelFormalParams, std::stringstream &analyticModelDecls, std::stringstream &analyticModelConstrBody ); + void generateInit( std::stringstream &init ); + void generateProcess( std::stringstream &process, std::stringstream &processCall ); + + void debug_print() const; + }; + + } + +} + +#endif //end `_H_ALP_ASCEND_LAZY_EVALUATION' + diff --git a/include/graphblas/ascend/matrix.hpp b/include/graphblas/ascend/matrix.hpp new file mode 100644 index 000000000..d224ce91b --- /dev/null +++ b/include/graphblas/ascend/matrix.hpp @@ -0,0 +1,602 @@ +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Provides the Ascend matrix container. + * + * @author A. N. 
Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ASCEND_MATRIX +#define _H_GRB_ASCEND_MATRIX + +#include //std::stringstream +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "forward.hpp" + + +namespace grb { + + namespace internal { + + template< typename DataType, typename RIT, typename CIT, typename NIT > + Matrix< DataType, reference, RIT, CIT, NIT >& getRefMatrix( + Matrix< DataType, ascend, RIT, CIT, NIT > &A ) noexcept; + + template< typename DataType, typename RIT, typename CIT, typename NIT > + const Matrix< DataType, reference, RIT, CIT, NIT >& getRefMatrix( + const Matrix< DataType, ascend, RIT, CIT, NIT > &A ) noexcept; + + template< typename D, typename RIT, typename CIT, typename NIT > + const size_t & getNonzeroCapacity( + const grb::Matrix< D, ascend, RIT, CIT, NIT > &A + ) noexcept { + return A.cap; + } + + template< typename D, typename RIT, typename CIT, typename NIT > + const size_t & getCurrentNonzeroes( + const grb::Matrix< D, ascend, RIT, CIT, NIT > &A + ) noexcept { + return A.nz; + } + + template< typename D, typename RIT, typename CIT, typename NIT > + void setCurrentNonzeroes( + grb::Matrix< D, ascend, RIT, CIT, NIT > &A, + const size_t nnz + ) noexcept { + A.nz = nnz; + } + + /** + * \internal + * + * Retrieves internal SPA buffers. + * + * @param[out] coorArr Pointer to the bitmask array + * @param[out] coorBuf Pointer to the stack + * @param[out] valBuf Pointer to the value buffer + * @param[in] k If 0, the row-wise SPA is returned + * If 1, the column-wise SPA is returned + * Any other value is not allowed + * @param[in] A The matrix of which to return the associated SPA + * data structures. + * + * @tparam InputType The type of the value buffer. 
+ * + * \endinternal + */ + template< typename InputType, typename RIT, typename CIT, typename NIT > + void getMatrixBuffers( + char * &coorArr, char * &coorBuf, InputType * &valbuf, + const unsigned int k, + const grb::Matrix< InputType, ascend, RIT, CIT, NIT > &A + ) noexcept { + assert( k < 2 ); + coorArr = const_cast< char * >( A.coorArr[ k ] ); + coorBuf = const_cast< char * >( A.coorBuf[ k ] ); + valbuf = const_cast< InputType * >( A.valbuf[ k ] ); + } + + template< Descriptor descr, + bool input_dense, bool output_dense, + bool masked, + bool left_handed, + template< typename > class One, + typename IOType, + class AdditiveMonoid, class Multiplication, + typename InputType1, typename InputType2, typename InputType3, + typename RowColType, typename NonzeroType, + typename Coords + > + void vxm_inner_kernel_scatter( + RC &rc, + Vector< IOType, ascend, Coords > &destination_vector, + IOType * __restrict__ const &destination, + const size_t &destination_range, + const Vector< InputType1, ascend, Coords > &source_vector, + const InputType1 * __restrict__ const &source, + const size_t &source_index, + const internal::Compressed_Storage< + InputType2, RowColType, NonzeroType + > &matrix, + const Vector< InputType3, ascend, Coords > &mask_vector, + const InputType3 * __restrict__ const &mask, + const AdditiveMonoid &add, + const Multiplication &mul, + const std::function< size_t( size_t ) > &src_local_to_global, + const std::function< size_t( size_t ) > &dst_global_to_local + ); + + template< + Descriptor descr, + bool masked, bool input_masked, bool left_handed, + template< typename > class One, + class AdditiveMonoid, class Multiplication, + typename IOType, typename InputType1, typename InputType2, + typename InputType3, typename InputType4, + typename Coords, typename RIT, typename CIT, typename NIT + > + RC vxm_generic( + Vector< IOType, ascend, Coords > &u, + const Vector< InputType3, ascend, Coords > &mask, + const Vector< InputType1, ascend, Coords > &v, + const Vector< InputType4, ascend, Coords > &v_mask, + const Matrix< InputType2, ascend, RIT, CIT, NIT > &A, + const AdditiveMonoid &add, + const Multiplication &mul, + const std::function< size_t( size_t ) > &row_l2g, + const std::function< size_t( size_t ) > &row_g2l, + const std::function< size_t( size_t ) > &col_l2g, + const std::function< size_t( size_t ) > &col_g2l + ); + + } // namespace internal + + template< typename DataType, typename RIT, typename CIT, typename NIT > + size_t nrows( + const Matrix< DataType, ascend, RIT, CIT, NIT > & + ) noexcept; + + template< typename DataType, typename RIT, typename CIT, typename NIT > + size_t ncols( + const Matrix< DataType, ascend, RIT, CIT, NIT > & + ) noexcept; + + template< typename DataType, typename RIT, typename CIT, typename NIT > + size_t nnz( + const Matrix< DataType, ascend, RIT, CIT, NIT > & + ) noexcept; + + template< typename InputType, typename RIT, typename CIT, typename NIT > + RC clear( Matrix< InputType, ascend, RIT, CIT, NIT > & ) noexcept; + + template< typename DataType, typename RIT, typename CIT, typename NIT > + RC resize( + Matrix< DataType, ascend, RIT, CIT, NIT > &, + const size_t + ) noexcept; + + template< + class ActiveDistribution, typename Func, typename DataType, + typename RIT, typename CIT, typename NIT + > + RC eWiseLambda( + const Func f, + const Matrix< DataType, ascend, RIT, CIT, NIT > &A, + const size_t s, const size_t P + ); + + /** + * A GraphBLAS matrix, ascend implementation. 
+ * + * Uses Compressed Column Storage (CCS) plus Compressed Row Storage (CRS). + * + * \warning This implementation prefers speed over memory efficiency. + * + * @tparam D The type of a nonzero element. + * + * \internal + * @tparam RowIndexType The type used for row indices + * @tparam ColIndexType The type used for column indices + * @tparam NonzeroIndexType The type used for nonzero indices + * \endinternal + */ + template< + typename D, + typename RowIndexType, + typename ColIndexType, + typename NonzeroIndexType + > + class Matrix< D, ascend, RowIndexType, ColIndexType, NonzeroIndexType > { + + static_assert( !grb::is_object< D >::value, + "Cannot create an ALP matrix of ALP objects!" ); + + template< typename DataType, typename RIT, typename CIT, typename NIT > + friend Matrix< DataType, reference, RIT, CIT, NIT > & internal::getRefMatrix( + Matrix< DataType, ascend, RIT, CIT, NIT > &A + ) noexcept; + + template< typename DataType, typename RIT, typename CIT, typename NIT > + friend const Matrix< DataType, reference, RIT, CIT, NIT > & + internal::getRefMatrix( + const Matrix< DataType, ascend, RIT, CIT, NIT > &A + ) noexcept; + + + /* ********************* + BLAS2 friends + ********************* */ + + template< typename DataType, typename RIT, typename CIT, typename NIT > + friend size_t nrows( + const Matrix< DataType, ascend, RIT, CIT, NIT > & + ) noexcept; + + template< typename DataType, typename RIT, typename CIT, typename NIT > + friend size_t ncols( + const Matrix< DataType, ascend, RIT, CIT, NIT > & + ) noexcept; + + template< typename DataType, typename RIT, typename CIT, typename NIT > + friend size_t nnz( + const Matrix< DataType, ascend, RIT, CIT, NIT > & + ) noexcept; + + template< typename InputType, typename RIT, typename CIT, typename NIT > + friend RC clear( + Matrix< InputType, ascend, RIT, CIT, NIT > & + ) noexcept; + + template< typename DataType, typename RIT, typename CIT, typename NIT > + friend RC resize( + Matrix< DataType, ascend, RIT, CIT, NIT > &, + const size_t + ) noexcept; + + template< + class ActiveDistribution, typename Func, typename DataType, + typename RIT, typename CIT, typename NIT + > + friend RC eWiseLambda( + const Func, + const Matrix< DataType, ascend, RIT, CIT, NIT > &, + const size_t, const size_t + ); + + template< + Descriptor descr, + bool input_dense, bool output_dense, bool masked, bool left_handed, + template< typename > class One, + typename IOType, + class AdditiveMonoid, class Multiplication, + typename InputType1, typename InputType2, + typename InputType3, + typename RowColType, typename NonzeroType, + typename Coords + > + friend void internal::vxm_inner_kernel_scatter( + RC &rc, + Vector< IOType, ascend, Coords > &destination_vector, + IOType * __restrict__ const &destination, + const size_t &destination_range, + const Vector< InputType1, ascend, Coords > &source_vector, + const InputType1 * __restrict__ const &source, + const size_t &source_index, + const internal::Compressed_Storage< + InputType2, RowColType, NonzeroType + > &matrix, + const Vector< InputType3, ascend, Coords > &mask_vector, + const InputType3 * __restrict__ const &mask, + const AdditiveMonoid &add, + const Multiplication &mul, + const std::function< size_t( size_t ) > &src_local_to_global, + const std::function< size_t( size_t ) > &dst_global_to_local + ); + + template< + Descriptor descr, + bool masked, bool input_masked, bool left_handed, + template< typename > class One, + class AdditiveMonoid, class Multiplication, + typename IOType, typename 
InputType1, typename InputType2, + typename InputType3, typename InputType4, + typename Coords, typename RIT, typename CIT, typename NIT + > + friend RC internal::vxm_generic( + Vector< IOType, ascend, Coords > &u, + const Vector< InputType3, ascend, Coords > &mask, + const Vector< InputType1, ascend, Coords > &v, + const Vector< InputType4, ascend, Coords > &v_mask, + const Matrix< InputType2, ascend, RIT, CIT, NIT > &A, + const AdditiveMonoid &add, + const Multiplication &mul, + const std::function< size_t( size_t ) > &row_l2g, + const std::function< size_t( size_t ) > &row_g2l, + const std::function< size_t( size_t ) > &col_l2g, + const std::function< size_t( size_t ) > &col_g2l + ); + + /* ******************** + IO friends + ******************** */ + + template< + Descriptor descr, typename InputType, + typename RIT, typename CIT, typename NIT, + typename fwd_iterator + > + friend RC buildMatrixUnique( + Matrix< InputType, ascend, RIT, CIT, NIT > &, + fwd_iterator, const fwd_iterator, + const IOMode + ); + + friend internal::Compressed_Storage< D, RowIndexType, NonzeroIndexType > & + internal::getCRS<>( + Matrix< + D, ascend, + RowIndexType, ColIndexType, NonzeroIndexType + > &A + ) noexcept; + + friend const internal::Compressed_Storage< + D, + RowIndexType, NonzeroIndexType + > & internal::getCRS<>( + const Matrix< + D, ascend, + RowIndexType, ColIndexType, NonzeroIndexType + > &A + ) noexcept; + + friend internal::Compressed_Storage< D, ColIndexType, NonzeroIndexType > & + internal::getCCS<>( + Matrix< + D, ascend, + RowIndexType, ColIndexType, NonzeroIndexType + > &A + ) noexcept; + + friend const internal::Compressed_Storage< + D, ColIndexType, NonzeroIndexType + > & internal::getCCS<>( + const Matrix< + D, ascend, + RowIndexType, ColIndexType, NonzeroIndexType + > &A + ) noexcept; + + template< typename InputType, typename RIT, typename CIT, typename NIT > + friend const size_t & internal::getNonzeroCapacity( + const grb::Matrix< InputType, ascend, RIT, CIT, NIT > & + ) noexcept; + + template< typename InputType, typename RIT, typename CIT, typename NIT > + friend const size_t & internal::getCurrentNonzeroes( + const grb::Matrix< InputType, ascend, RIT, CIT, NIT > & + ) noexcept; + + template< typename InputType, typename RIT, typename CIT, typename NIT > + friend void internal::setCurrentNonzeroes( + grb::Matrix< InputType, ascend, RIT, CIT, NIT > &, const size_t + ) noexcept; + + template< typename InputType, typename RIT, typename CIT, typename NIT > + friend void internal::getMatrixBuffers( + char *&, char *&, InputType *&, + const unsigned int, + const grb::Matrix< InputType, ascend, RIT, CIT, NIT > & + ) noexcept; + + template< typename InputType, typename RIT, typename CIT, typename NIT > + friend uintptr_t getID( + const Matrix< InputType, ascend, RIT, CIT, NIT > & + ); + + + private: + + Matrix< D, reference, RowIndexType, ColIndexType, NonzeroIndexType > ref; + + /** Our own type. 
*/ + typedef Matrix< + D, ascend, + RowIndexType, ColIndexType, NonzeroIndexType + > self_type; + + Matrix() : ref( ) + {} + + Matrix( + const D *__restrict__ const _values, + const ColIndexType *__restrict__ const _column_indices, + const NonzeroIndexType *__restrict__ const _offset_array, + const size_t _m, const size_t _n, + const size_t _cap, + char *__restrict__ const buf1 = nullptr, + char *__restrict__ const buf2 = nullptr, + D *__restrict__ const buf3 = nullptr + ) : ref( + _values, _column_indices, _offset_array, + _m, _n, _cap, + buf1, buf2, buf3 + ) {} + + void moveFromOther( self_type &&other ) { + ref.moveFromOther( std::move( other.ref ) ); + } + + RC clear() { + return ref.clear(); + } + + RC resize( const size_t nonzeroes ) { + return ref.resize( nonzeroes ); + } + + template< + Descriptor descr = descriptors::no_operation, + typename fwd_iterator + > + RC buildMatrixUnique( + const fwd_iterator &_start, + const fwd_iterator &_end + ) { + + return ref.buildMatrixUnique( _start, _end ); + } + + + public: + + /** @see Matrix::value_type */ + typedef D value_type; + + /** The iterator type over matrices of this type. */ + typedef typename internal::Compressed_Storage< + D, RowIndexType, NonzeroIndexType + >::template ConstIterator< + internal::Distribution< reference > + > const_iterator; + + Matrix( + const size_t rows, const size_t columns, const size_t nz + ) : ref( rows, columns, nz ) + {} + + Matrix( const size_t rows, const size_t columns ) : ref( rows, columns ) + {} + + /** + * \internal + * \todo See below code comment + * \endinternal + */ + Matrix( + const Matrix< + D, ascend, RowIndexType, ColIndexType, NonzeroIndexType + > &other ) : ref( other.ref ) + { + //TODO: the pipeline should be executed once level-3 primitives are + // implemented. In the current implementation matrices may be used only + // as the input of SpMV + } + + Matrix( self_type &&other ) noexcept : ref( std::move( other.ref ) ) { + //TODO: the pipeline should be executed once level-3 primitives are + // implemented. 
In the current implementation matrices may be used only + // as the input of SpMV + } + + self_type& operator=( self_type &&other ) noexcept { + ref = std::move( other.ref ); + return *this; + } + + ~Matrix() { + // the pipeline is executed before memory deallocation + internal::le.execution( this ); + } + + template< class ActiveDistribution = internal::Distribution< reference > > + typename internal::Compressed_Storage< + D, RowIndexType, NonzeroIndexType + >::template ConstIterator< ActiveDistribution > begin( + const IOMode mode = PARALLEL, + const size_t s = 0, const size_t P = 1 + ) const { + return ref.begin( mode, s, P ); + } + + template< class ActiveDistribution = internal::Distribution< reference > > + typename internal::Compressed_Storage< + D, + RowIndexType, + NonzeroIndexType + >::template ConstIterator< ActiveDistribution > end( + const IOMode mode = PARALLEL, + const size_t s = 0, const size_t P = 1 + ) const { + return ref.end( mode, s, P ); + } + + template< class ActiveDistribution = internal::Distribution< reference > > + typename internal::Compressed_Storage< + D, + RowIndexType, + NonzeroIndexType + >::template ConstIterator< ActiveDistribution > cbegin( + const IOMode mode = PARALLEL + ) const { + return ref.cbegin( mode ); + } + + template< class ActiveDistribution = internal::Distribution< reference > > + typename internal::Compressed_Storage< + D, + RowIndexType, + NonzeroIndexType + >::template ConstIterator< ActiveDistribution > cend( + const IOMode mode = PARALLEL + ) const { + return ref.cend( mode ); + } + + }; + + // template specialisation for GraphBLAS type traits + template< typename D, typename RIT, typename CIT, typename NIT > + struct is_container< Matrix< D, ascend, RIT, CIT, NIT > > { + /** A ascend Matrix is a GraphBLAS object. 
*/ + static const constexpr bool value = true; + }; + + //internal getters implementation + namespace internal { + + template< typename DataType, typename RIT, typename CIT, typename NIT > + inline Matrix< DataType, reference, RIT, CIT, NIT >& getRefMatrix( + Matrix< DataType, ascend, RIT, CIT, NIT > &A + ) noexcept { + return (A.ref); + } + + template< typename DataType, typename RIT, typename CIT, typename NIT > + inline const Matrix< DataType, reference, RIT, CIT, NIT >& getRefMatrix( + const Matrix< DataType, ascend, RIT, CIT, NIT > &A + ) noexcept { + return (A.ref); + } + + } //end ``grb::internal'' namespace + +} // namespace grb + +#endif // end ``_H_GRB_ASCEND_MATRIX'' + diff --git a/include/graphblas/ascend/operators.hpp b/include/graphblas/ascend/operators.hpp new file mode 100644 index 000000000..1a495dba2 --- /dev/null +++ b/include/graphblas/ascend/operators.hpp @@ -0,0 +1,157 @@ +#ifndef _H_ALP_ASCEND_OPERATORS +#define _H_ALP_ASCEND_OPERATORS + +#include + +#include + +//#include "graphblas/ascend/grid.hpp" +#include "graphblas/ascend/utils.hpp" + + +namespace alp +{ + class Tensor; + + + Tensor getView( const Tensor &parent ); + + // TODO extend to multiple containers + void store( const Tensor &output ); + + void set( + Tensor &tout, + Tensor &tin, + const std::vector< int > &activeAxes = std::vector< int >() + ); + + void set( + Tensor &tout, + double alpha // TODO: this is hardcoded datatype + ); + + void apply( + Tensor &tout, + Tensor &tin, + const std::string &opName, + const std::vector< int > &activeAxes = std::vector< int >() + ); + + void apply( + Tensor &tout, + Tensor &tin1, + Tensor &tin2, + const std::string &opName, + const std::vector< int > &activeAxes = std::vector< int >() + ); + + void foldl( + Tensor &tinout, + Tensor &tin, + const std::string &opName, + const std::vector< int > &activeAxes = std::vector< int >() + ); + +// template< size_t sm, size_t pm > + void foldl( +// const Grid< sm, pm > &grid, + Tensor &tinout, + const std::string &opName, + const std::vector< int > &activeAxes = std::vector< int >() + ); + + + struct ReductionOperation { + Tensor &input; + const std::vector< int > axes; + const internal::Stagetype opType; + const std::string opName; + + ReductionOperation( + Tensor &input, + const std::vector< int > &axes, + const internal::Stagetype &op, + const std::string &opName + ) : + input( input ), + axes( axes ), + opType( op ), + opName( opName ) {} + + }; + + /** + * Max-reduce operator + */ + template< typename AxisType > + ReductionOperation max( Tensor &z, const AxisType axis ) { + static_assert( + std::is_convertible< AxisType, int >::value || std::is_convertible< AxisType, std::string >::value, + "AxisType must be convertible to int or std::string" + ); + const int axisId = getAxisId( axis ); + return { z, { axisId }, internal::Stagetype::FOLDL_MAX, "max" }; + } + + /** + * Add-reduce operator + */ + template< typename AxisType > + ReductionOperation add( Tensor &z, const AxisType axis ) { + static_assert( + std::is_convertible< AxisType, int >::value || std::is_convertible< AxisType, std::string >::value, + "AxisType must be convertible to int or std::string" + ); + const int axisId = getAxisId( axis ); + return { z, { axisId }, internal::Stagetype::FOLDL_ADD, "add" }; + } + + struct ApplyOperation { + Tensor& input1; + Tensor& input2; + const std::vector< int > axes; + const std::string opName; + + ApplyOperation( + Tensor &input1, Tensor &input2, + const std::vector< int > &axes, + const std::string &opName + ) : + input1( 
input1 ), + input2( input2 ), + axes( axes ), + opName( opName ) { } + }; + + /** + * Add-reduce operator + */ + template< typename AxisType > + ApplyOperation add( Tensor &y, Tensor &z, const AxisType axis ) { + static_assert( + std::is_convertible< AxisType, int >::value || std::is_convertible< AxisType, std::string >::value, + "AxisType must be convertible to int or std::string" + ); + const int axisId = getAxisId( axis ); + // std::vector inputs = { y, z }; + return { y, z, { axisId }, "add" }; + } + + + + /** + * Minus operator + */ + template< typename AxisType > + ApplyOperation minus( Tensor &y, Tensor &z, const AxisType axis ) { + static_assert( + std::is_convertible< AxisType, int >::value || std::is_convertible< AxisType, std::string >::value, + "AxisType must be convertible to int or std::string" + ); + const int axisId = getAxisId( axis ); + return { y, z, { axisId }, "minus" }; + } + +} // namespace alp + +#endif diff --git a/include/graphblas/ascend/opgen.hpp b/include/graphblas/ascend/opgen.hpp new file mode 100644 index 000000000..36950cdaa --- /dev/null +++ b/include/graphblas/ascend/opgen.hpp @@ -0,0 +1,100 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Class that defines the state of the code generation. + * + * @author A. N. Yzelman. + * @date 12th of September, 2023. + */ + + +#ifndef _H_ALP_ASCEND_OPGEN +#define _H_ALP_ASCEND_OPGEN + +#include +#include +#include +#include +#include + +#include + +#include + +namespace alp { + + namespace internal { + + class OpGen { + + public: + + OpGen() = default; + virtual ~OpGen() = default; + + /** Returns a string representation of a given type. */ +/* template + static std::string type_name(){ + int info = 0; + return abi::__cxa_demangle( typeid(T).name(), NULL, NULL, &info ); + } +*/ + /** + * Maintains a mapping from chunks to their sizes. + * + * \warning The map does not guarantee that chunks who have since been + * destructed will no longer appear in the map. + */ +//TODO how is this supposed to be used? 
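As an aside on the reduction and apply builders declared in operators.hpp above: they are designed to be consumed by Tensor::operator=. The following sketch (not part of this change set) shows how they are intended to compose; the axis labels, the op-name string "exp", the kernel name, and the include paths are assumptions for illustration only, not confirmed API.

#include "graphblas/ascend/tensor.hpp"    // assumed include path
#include "graphblas/ascend/operators.hpp" // assumed include path

// A hypothetical kernel body: reduce over axis "j", subtract the reduction,
// exponentiate in place, and mark the result as a pipeline output.
void example_kernel_sketch() {
	alp::Tensor in(  alp::Datatype::FP32, alp::make_axes( "i", "j" ) );
	alp::Tensor out( alp::Datatype::FP32, alp::make_axes( "i", "j" ) );
	alp::Tensor red( alp::Datatype::FP32, alp::make_axes( "i" ) );

	// ReductionOperation: fold axis "j" of in into red via max
	red( "i" ) = alp::max( in, "j" );

	// ApplyOperation: element-wise minus, broadcasting red along axis "j"
	out( "i", "j" ) = alp::minus( in, red, "j" );

	// the free-function primitives remain available directly
	alp::foldl( out, "exp" ); // the op-name string is an assumption
	alp::store( out );        // marks out as an output of the lazy pipeline
}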
+// static std::map< std::string, std::string > chunkSize; + + static std::string kernel_id; + + /** Indicates if the executed code is within the lambda function of a forEach */ + static size_t forEachLevel; + + static std::vector< std::vector< int > > forEachAxes; + static std::vector< int > lastAxes; + + static std::stringstream aux_func; + static std::stringstream analyticModelFormalParams; + static std::stringstream hostFormalParam; + static std::stringstream hostBody; + static std::stringstream hostArg; + static std::stringstream constrBody; + static std::stringstream classMembers; + static std::stringstream initBody; + static std::stringstream genericProcessBody; + static std::stringstream declarations; + + static std::vector< std::stringstream > processFunc; + static std::vector< std::stringstream > computeFunc; + static std::vector< std::stringstream > copyinFunc; + static std::vector< std::stringstream > copyoutFunc; + + static void compileClear(); + static void generate( std::ostream &os ); + }; + } + +} + +#endif // end _H_ALP_ASCEND_OPGEN + diff --git a/include/graphblas/ascend/pinnedvector.hpp b/include/graphblas/ascend/pinnedvector.hpp new file mode 100644 index 000000000..3991e8824 --- /dev/null +++ b/include/graphblas/ascend/pinnedvector.hpp @@ -0,0 +1,164 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * PinnedVector implementation of the Ascend backend. + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ASCEND_PINNEDVECTOR +#define _H_GRB_ASCEND_PINNEDVECTOR + +#include +#include + +#include + +#include "coordinates.hpp" +#include "vector.hpp" + + +namespace grb { + + namespace internal { + + extern LazyEvaluation le; + + } + + /** + * The PinnedVector class is based on that of the reference backend. + * + * \internal There is some code duplication with the reference PinnedVector. + * At present, it is unclear if this can be reduced. + */ + template< typename IOType > + class PinnedVector< IOType, ascend > { + + private: + + /** Essentially a shared pointer into the nonzero values */ + utils::AutoDeleter< IOType > _raw_deleter; + + /** Essentially a shared pointer into the SPA's stack. */ + utils::AutoDeleter< char > _stack_deleter; + + /** The shared nonzero values */ + IOType * _buffered_values; + + /** + * The shared coordinates, on which only stack-based accesses are performed. + */ + internal::Coordinates< + config::IMPLEMENTATION< ascend >::coordinatesBackend() + > _buffered_coordinates; + + + public: + + /** Constructs an empty pinned vector. */ + PinnedVector() : _buffered_values( nullptr ) {} + + /** Constructs a pinning of \a x */ + PinnedVector( + const Vector< IOType, ascend, internal::Coordinates< + config::IMPLEMENTATION< ascend >::coordinatesBackend() + > > &x, + const IOMode mode + ) { + // The execution of a pipeline that uses the vector is necessary. 
+ if( internal::getCoordinates(x).size() > 0 ) { + internal::le.execution( &x ); + } + + _raw_deleter = internal::getRefVector(x)._raw_deleter; + _stack_deleter = internal::getRefVector(x)._buffer_deleter; + _buffered_values = internal::getRefVector(x)._raw; + _buffered_coordinates = internal::getRefVector(x)._coordinates; + + // The ascend backend is always single process, so the mode is unused. + (void) mode; + } + + /** \internal No implementation details */ + inline size_t size() const noexcept { +#ifndef NDEBUG + if( _buffered_coordinates.size() == 0 ) { + assert( _buffered_values == nullptr ); + } +#endif + return _buffered_coordinates.size(); + } + + /** \internal No implementation details */ + inline size_t nonzeroes() const noexcept { +#ifndef NDEBUG + if( _buffered_coordinates.size() == 0 ) { + assert( _buffered_values == nullptr ); + } +#endif + return _buffered_coordinates.nonzeroes(); + } + + /** \internal No implementation details */ + template< typename OutputType = IOType > + inline OutputType getNonzeroValue( + const size_t k, + const OutputType one + ) const noexcept { + assert( k < nonzeroes() ); + assert( _buffered_coordinates.size() > 0 ); + if( _buffered_values == nullptr ) { + return one; + } else { + const size_t index = getNonzeroIndex( k ); + return static_cast< OutputType >( + _buffered_values[ index ] + ); + } + } + + /** \internal No implementation details */ + inline IOType getNonzeroValue( + const size_t k + ) const noexcept { + assert( k < nonzeroes() ); + assert( _buffered_coordinates.size() > 0 ); + assert( _buffered_values != nullptr ); + const size_t index = getNonzeroIndex( k ); + assert( index < _buffered_coordinates.size() ); + return _buffered_values[ index ]; + } + + /** \internal No implementation details */ + inline size_t getNonzeroIndex( + const size_t k + ) const noexcept { + assert( k < nonzeroes() ); + return _buffered_coordinates.index( k ); + } + + }; + +} // namespace grb + +#endif // end ``_H_GRB_ASCEND_PINNEDVECTOR'' + diff --git a/include/graphblas/ascend/pipeline.hpp b/include/graphblas/ascend/pipeline.hpp new file mode 100644 index 000000000..b6286ca58 --- /dev/null +++ b/include/graphblas/ascend/pipeline.hpp @@ -0,0 +1,102 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Describes an Ascend pipeline. + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_ALP_ASCEND_PIPELINE +#define _H_ALP_ASCEND_PIPELINE + +/** + * To enable debugging information only for the ascend backend, the code + * should be combiled with the _ASCEND_DEBUG definition, without defining + * _DEBUG. If the code is compiled with _DEBUG, the debugging information for + * the ascend backend is enabled as well. 
+ */ +#if !defined(_ASCEND_DEBUG) && defined(_DEBUG) + #define _ASCEND_DEBUG +#endif + +#include +#include +#include +#include + +#include +#include +#include + +namespace alp { + + namespace internal { + + class Stage; + + /** + * Encodes a single pipeline that may be expanded, merged, or executed. + */ + class AscendPipeline { + + private: + + const size_t id; + std::vector< alp::internal::Stage > stages; + + // pointers to Tensors do not work because any local declaration + // inside the forEach will be invalid the moment the code is generated + std::unordered_set< Tensor > accessed; + std::unordered_set< Tensor > outputs; + + void insertTensorToInputs( const Tensor &tensor ); + std::set< int > getIteratedAxes() const; + + + public: + + AscendPipeline( size_t _id ); + AscendPipeline( size_t _id, const std::vector< int > &_forEachParallelAxes ); + void insertFreeInputTensorStages( const std::vector< int > &forEachAxes ); + const Tensor &store( const Tensor &output_tensor ); + bool isOutput( const Tensor &tensor ) const; + void clear(); + size_t getID() const; + std::string getTilingAxes() const; + void addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, const Tensor &tensor1, const double alpha, const std::vector< int > &forEachAxes ); + void addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, const Tensor &tensor1, const std::vector< int > &activeAxes, const std::vector< int > &forEachAxes ); + void addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, const Tensor &tensor1, const Tensor &tensor2, const std::vector< int > &activeAxes, const std::vector< int > &forEachAxes ); + void addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, const Tensor &tensor1, const Tensor &tensor2, const Tensor &tensor3, const std::vector< int > &activeAxes, const std::vector< int > &forEachAxes ); +// void addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, const Tensor &tensor1, const Tensor &tensor2, const Tensor &tensor3, const Tensor &tensor4, const std::vector< int > &activeAxes, const std::vector< int > &forEachAxes ); + void generateDeclarations( std::stringstream &declarations ); +// void generateConstructor( std::stringstream &constructor ); + void generateHostBody( std::stringstream &os, std::stringstream &analyticModelArgs, std::stringstream &analyticModelFormalParams, std::stringstream &analyticModelDecls, std::stringstream &analyticModelConstrBody ); + void generateInit( std::stringstream &init ); + void generateProcess( std::stringstream &process, std::stringstream &processCall ); + void debug_print() const; + }; + + } + +} + +#endif //end `_H_ALP_ASCEND_PIPELINE' + diff --git a/include/graphblas/ascend/properties.hpp b/include/graphblas/ascend/properties.hpp new file mode 100644 index 000000000..a9f009da4 --- /dev/null +++ b/include/graphblas/ascend/properties.hpp @@ -0,0 +1,58 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file + * + * Collects the Ascend backend properties. + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ASCEND_PROPERTIES +#define _H_GRB_ASCEND_PROPERTIES + +#include + + +namespace grb { + + /** No implementation notes. */ + template<> + class Properties< ascend > { + + public: + + /** + * This is a shared-memory parallel implementation and therefore captured + * scalars cannot be written to without causing data races. + */ + static constexpr const bool writableCaptured = false; + + /** This is a nonblocking backend. */ + static constexpr const bool isBlockingExecution = false; + + /** This is a nonblocking backend. */ + static constexpr const bool isNonblockingExecution = true; + + }; + +} // namespace grb + +#endif // end `_H_GRB_ASCEND_PROPERTIES + diff --git a/include/graphblas/ascend/semantics.hpp b/include/graphblas/ascend/semantics.hpp new file mode 100644 index 000000000..2f48438c1 --- /dev/null +++ b/include/graphblas/ascend/semantics.hpp @@ -0,0 +1,15 @@ +#ifndef _H_ALP_ASCEND_SEMANTICS +#define _H_ALP_ASCEND_SEMANTICS + +namespace alp { + + namespace internal { + + bool invalidForEachAxes( const std::vector< int > &axes ); + bool invalidAxes( const std::vector< int > &axes ); + + } // namespace internal + +} // namespace alp + +#endif // _H_ALP_ASCEND_SEMANTICS diff --git a/include/graphblas/ascend/spmd.hpp b/include/graphblas/ascend/spmd.hpp new file mode 100644 index 000000000..4f0453071 --- /dev/null +++ b/include/graphblas/ascend/spmd.hpp @@ -0,0 +1,68 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Provides the SPMD functions for the Ascend backend. + * + * @author A. N. 
Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ASCEND_SPMD +#define _H_GRB_ASCEND_SPMD + +#include //size_t + +#include + + +namespace grb { + + /** The spmd class is based on that of the reference backend */ + template<> + class spmd< ascend > { + + public: + + /** Refers back to the reference backend */ + static inline size_t nprocs() noexcept { + return spmd< reference >::nprocs(); + } + + /** Refers back to the reference backend */ + static inline size_t pid() noexcept { + return spmd< reference >::pid(); + } + + /** Refers back to the reference backend */ + static RC sync( const size_t msgs_in = 0, const size_t msgs_out = 0 ) noexcept { + return spmd< reference >::sync( msgs_in, msgs_out ); + } + + /** Refers back to the reference backend */ + static RC barrier() noexcept { + return spmd< reference >::barrier(); + } + + }; // end class ``spmd'' ascend implementation + +} // namespace grb + +#endif // end _H_GRB_ASCEND_SPMD + diff --git a/include/graphblas/ascend/stage.hpp b/include/graphblas/ascend/stage.hpp new file mode 100644 index 000000000..100f40e68 --- /dev/null +++ b/include/graphblas/ascend/stage.hpp @@ -0,0 +1,84 @@ +#ifndef _H_ALP_ASCEND_STAGE +#define _H_ALP_ASCEND_STAGE + +#include +#include + +#include +#include +#include + +namespace alp { + + namespace internal { + + class Stage { + + private: + + const AscendPipeline &pipeline; + const Stagetype enum_op_type; + const Rule rule; + const Tensor tensor0; + Tensor tensor1; + Tensor tensor2; + Tensor tensor3; + std::string tensor0_offset; + std::string tensor1_offset; + std::string tensor2_offset; +// std::string tensor3_offset; + std::string stride; + double alpha; //TODO double should be replaced by alp::Scalar + const std::vector< int > activeAxes; + const std::vector< int > forEachAxes; + + + public: + + Stage( const AscendPipeline &parent, Stagetype _enum_op_type, Rule _rule, + const Tensor &_tensor0, const double _alpha, const std::vector< int > &_forEachAxes ); + Stage( const AscendPipeline &parent, Stagetype _enum_op_type, Rule _rule, + const Tensor &_tensor0, + const std::vector< int > &_activeAxes, const std::vector< int > &_forEachAxes ); + Stage( const AscendPipeline &parent, Stagetype _enum_op_type, Rule _rule, + const Tensor &_tensor0, const Tensor &_in_tensor, + const std::vector< int > &_activeAxes, const std::vector< int > &_forEachAxes ); + Stage( const AscendPipeline &parent, Stagetype _enum_op_type, Rule _rule, + const Tensor &_tensor0, const Tensor &_tensor1, const Tensor &_tensor2, + const std::vector< int > &_activeAxes, const std::vector< int > &_forEachAxes ); +// Stage( const AscendPipeline &parent, Stagetype _enum_op_type, Rule _rule, +// const Tensor &_tensor0, const Tensor &_tensor1, const Tensor &_tensor2, const Tensor &_tensor3, +// const std::vector< int > &_activeAxes, const std::vector< int > &_forEachAxes ); + Stagetype getOpType() const; + Rule getRule() const; + const Tensor &getTensor0() const; + const std::vector< int > &getAxes() const; + const std::vector< int > &getForEachAxes() const; + std::string getOp( const std::string &tabs ) const; + std::string generateApplyMinusOp( const std::string &tabs ) const; + std::string generateApplyAddOp( const std::string &tabs ) const; + std::string generateFoldlDivideOp( const std::string &tabs ) const; + std::string generateFoldlMaxOp( const std::string &tabs ) const; + std::string generateFoldlTimesOp( const std::string &tabs ) const; + std::string generateFoldlAddOp( const std::string &tabs ) const; + std::string 
generateFoldlExpOp( const std::string &tabs ) const; + std::string generateSetTensorOp( const std::string &tabs ) const; + std::string generateSetScalarOp( const std::string &tabs ) const; + std::string generateGetViewOp( const std::string &tabs ) const; + std::string generateStoreOp( const std::string &tabs ) const; + std::string generateImplicitFreeOp( const std::string &tabs ) const; + std::string generateToDoOp( const std::string &tabs ) const; + + + private: + + std::vector< int > computeOperatorAxes() const; + void computeMemoryOffsets(); + void semanticsCheks(); + }; + } + +} + +#endif // end _H_ALP_ASCEND_STAGE + diff --git a/include/graphblas/ascend/symbolTable.hpp b/include/graphblas/ascend/symbolTable.hpp new file mode 100644 index 000000000..b9bc598a0 --- /dev/null +++ b/include/graphblas/ascend/symbolTable.hpp @@ -0,0 +1,109 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _H_GRB_ASCEND_SYMBOLTABLE +#define _H_GRB_ASCEND_SYMBOLTABLE + +#include +#include +#include +#include +#include + +#include +#include + +namespace alp { + + class Tensor; + + namespace internal { + + class SymbolTable { + + private: + + bool TBuf_decl; + + /** Maintains a counter for unique temporary scalar names. */ + size_t temp_scalar_id; + + // pointers to Tensors do not work because any local declaration + // inside the forEach will be invalid the moment the code is generated + + /** Maintains all the global declarations of the compiled function. */ + std::map< std::string, alp::Tensor > global_tensor_declarations; + + /** Maintains all the local declarations of the current forEach. */ + std::map< std::string, alp::Tensor > local_tensor_declarations; + + /** Maintains all the temporary declarations of the current forEach. */ + std::map< std::string, alp::Tensor > temp_tensor_declarations; + + /** Maintains all the buffers that are reused for local + * and temporary declarations of the current forEach. + */ + std::map< std::string, std::string > temp_local_buffer_declarations; + + /** Maintains the order of all the global tensors and only the output tensors, respectively */ + std::vector< alp::Tensor > all_global_tensors; + std::vector< alp::Tensor > outputs_global_tensors; + + /** + * Maintains a mapping from chunks to vectors. + * + * \warning The map does not guarantee that chunks who have since been + * destructed will no longer appear in the map. 
+ */ + std::map< std::string, std::string > viewToTensor; + + + public: + + SymbolTable(); + bool existsTBufTensorDecl() const; + void clearAll(); + + void addGlobalTensor( const alp::Tensor &t ); + void addLocalTensor( const alp::Tensor &t ); + void addTempTensor( const alp::Tensor &t ); + void addTensorView( const std::string &view_name, const std::string &parent_name ); +// std::string newTempScalar(); + void addOutputTensor( const alp::Tensor &t ); + void printHostLogFile( std::stringstream &listOfGlobalTensors ); + std::string getLocalTempTensorBuffer( Datatype type, const std::string &size = "" ); + void generateGlobalSymbols( std::stringstream &initFormalParam, + std::stringstream &customFormalParam, + std::stringstream &allAccessedArg, + std::stringstream &allTempLocalDecl ) const; + void generateTempLocalInit( std::stringstream &allTempLocalInit ) const; + const alp::Tensor &getTensorFromView( const alp::Tensor &tensor ) const; + + void debug_print() const; + + private: + + + void reuseLocalTempTensorBuffer( const alp::Tensor &t ); + }; + + } + +} + +#endif //end `_H_GRB_ASCEND_SYMBOLTABLE' + diff --git a/include/graphblas/ascend/tensor.hpp b/include/graphblas/ascend/tensor.hpp new file mode 100644 index 000000000..ab3e4d3a3 --- /dev/null +++ b/include/graphblas/ascend/tensor.hpp @@ -0,0 +1,130 @@ +#ifndef _H_ALP_ASCEND_TENSOR +#define _H_ALP_ASCEND_TENSOR + +#include +#include +#include +#include + +#include + +#include +#include + + +namespace alp { + + /** + * A global ALP/Ascend vector that resides in Ascend memory. + */ + + class Tensor { + + private: + + size_t id; + std::string name; + Datatype type; + internal::Scope scope; + std::vector< int > axes; + + Tensor& access( const std::vector &axes ) noexcept { + (void) axes; + return *this; + } + + + public: + + /** Maintains a counter for unique tensor names. */ + static size_t tensor_id; + + bool operator==( const Tensor &t ) const; + bool operator!=( const Tensor &t ) const { return not ( *this == t ); } + + /** + * @deprecated + * + * @brief Tensor[i] operator is deprecated. Use Tensor(i, ...) instead + */ + template< typename T, typename U > + T operator[]( const U axis ) = delete; + + /** + * @brief Replacement for Tensor[i] operator, allows to specify multiple + * axes in any order. + */ + template< typename AnyType > + Tensor& operator()( const AnyType &axis ) { + std::vector computedAxes{ getAxisId( axis ) }; + return access( computedAxes ); + } + + /** + * @brief Replacement for Tensor[i] operator, allows to specify multiple + * axes in any order. + */ + template< typename AnyType, typename... AnyPackType > + Tensor& operator()( const AnyType &axis, AnyPackType const... args ) { + std::vector computedAxes{ getAxisId( axis ) }; + for( auto arg : { args... 
} ) { + computedAxes.push_back( getAxisId( arg ) ); + } + return access( computedAxes ); + } + + /** + * @brief Assignment operator of a Tensor (deleted) + */ + void operator=( const Tensor& ) = delete; + + /** + * @brief Assignment operator of ReductionOperation + */ + void operator=( const ReductionOperation &op ); + + /** + * @brief Assignment operator of ApplyOperation + */ + void operator=( const ApplyOperation &op ); + + Tensor() = default; + Tensor( const Tensor &view_parent, const std::vector< int > &_axes ) noexcept; + Tensor( const Tensor &t ) noexcept; + Tensor( const std::vector< int > &_axes, const Datatype type ) noexcept; + + Tensor( + const Datatype type, + const std::vector< int > &axes_vector + ) noexcept; + + virtual ~Tensor() noexcept { } + + size_t getID() const; + const std::string &getName() const; + alp::Datatype getType() const; + internal::Scope getScope() const; + const std::vector< int > &getAxes() const; + bool isGlobalDecl() const; + bool isLocalDecl() const; + bool isTempDecl() const; + + std::string getAccessedElement( size_t id ) const; + std::string getAscendName( size_t id ) const; + std::string getAscendGlobalName( size_t id ) const; + std::string getTQueBufName( size_t id ) const; + }; +} + + +template<> +struct std::hash< alp::Tensor > +{ + std::size_t operator()( const alp::Tensor& tensor ) const noexcept + { + return std::hash< std::string >{}( tensor.getName() ); + + } +}; + +#endif diff --git a/include/graphblas/ascend/utils.hpp b/include/graphblas/ascend/utils.hpp new file mode 100644 index 000000000..81f24e1eb --- /dev/null +++ b/include/graphblas/ascend/utils.hpp @@ -0,0 +1,122 @@ +#ifndef _H_ALP_ASCEND_UTILS +#define _H_ALP_ASCEND_UTILS + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace alp { + + template< class T > + constexpr T Zero = T( 0 ); + + template< class T > + constexpr T Infinity = std::numeric_limits< T >::infinity(); + + // TODO: fix this so that mInfinity=-Infinity + template< class T > + constexpr T mInfinity = -Infinity< T >; + + enum class Datatype { FP16, FP32, VIEW_TYPE, NO_TYPE }; + + namespace internal { + + enum class Rule { + + NONE, + EWISE, + BCAST, + REDUCE + }; + + enum class Scope { + + GLOBAL, + LOCAL, + TEMP, + VIEW + }; + + enum class Stagetype { + + GET_VIEW, + STORE, + IMPLICIT_FREE, + SET_TENSOR, + SET_SCALAR, + APPLY_ADD, + APPLY_MINUS, + FOLDL_EXP, + FOLDL_DIVIDE, + FOLDL_MAX, + FOLDL_TIMES, + FOLDL_ADD + }; + + + std::string getDataType( const Datatype dtype ); + std::string getScope( const Scope scope ); + std::vector< int > vectorOfVectorsToVector( const std::vector< std::vector< int > > &vector_of_sets ); + std::vector< int > vectorDifference( const std::vector< int > &vector1, const std::vector< int > &vector2 ); + bool vectorSubset( const std::vector< int > &vector1, const std::vector< int > &vector2 ); + std::vector< int > vectorUnion( const std::vector< int > &vector1, const std::vector< int > &vector2 ); + + } // namespace internal + + + static std::atomic_int axes_counter{0}; + + + static inline int getAxisId( const std::string &axis ) { + static std::unordered_map associations; + + if (associations.find(axis) == associations.end()) { + associations[axis] = axes_counter++; + } + + return associations[axis]; + } + + template< typename IntegralType = int > + static inline int getAxisId( + const IntegralType axis, + typename std::enable_if< std::is_integral< IntegralType >::value, int >::type* = 0 + ) { + return static_cast< int >( axis ); + } + + 
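For reference, a brief sketch (not part of this change set) of how the axis helpers behave: a string label is mapped to a fresh integer identifier on first use and to the same identifier on every later use, an integral axis passes through unchanged, and make_axes (defined further below) merely collects the resulting identifiers. The include path is an assumption.

#include <cassert>
#include <vector>

#include "graphblas/ascend/utils.hpp" // assumed include path

int main() {
	const int i = alp::getAxisId( "i" );  // first unseen label gets a fresh id
	const int j = alp::getAxisId( "j" );  // a different label gets a different id
	assert( alp::getAxisId( "i" ) == i ); // repeated labels resolve to the same id
	assert( i != j );
	assert( alp::getAxisId( 5 ) == 5 );   // integral axes are passed through unchanged

	const std::vector< int > axes = alp::make_axes( "i", "j" );
	assert( axes.size() == 2 && axes[ 0 ] == i && axes[ 1 ] == j );
	return 0;
}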
static inline int getAxisId( const char* axis ) { + return getAxisId( std::string( axis ) ); + } + + template< typename = void > + static inline std::vector< int > make_axes( ) { + return std::vector< int >(0); + } + + template< typename AxisType > + static inline std::vector< int > make_axes( AxisType axis ) { + return std::vector< int >{ getAxisId( axis ) }; + } + + template< typename AxisType, typename... AxisPackType > + static std::vector< int > make_axes( const AxisType arg1, AxisPackType const... args ) { + std::vector< int > axes{ getAxisId( arg1 ) }; + + for( auto arg : { args... } ) { + axes.push_back( getAxisId( arg ) ); + } + + return axes; + } + + +} // namespace alp + +#endif // _H_ALP_ASCEND_UTILS diff --git a/include/graphblas/ascend/vector.hpp b/include/graphblas/ascend/vector.hpp new file mode 100644 index 000000000..2072ba0bf --- /dev/null +++ b/include/graphblas/ascend/vector.hpp @@ -0,0 +1,480 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Provides the Ascend vector. + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ASCEND_VECTOR +#define _H_GRB_ASCEND_VECTOR + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include "coordinates.hpp" +#include "spmd.hpp" + +#define NO_CAST_ASSERT( x, y, z ) \ + static_assert( x, \ + "\n\n" \ + "********************************************************************" \ + "********************************************************************" \ + "******************************\n" \ + "* ERROR | " y " " z ".\n" \ + "********************************************************************" \ + "********************************************************************" \ + "******************************\n" \ + "* Possible fix 1 | Remove no_casting from the template parameters " \ + "in this call to " y ".\n" \ + "* Possible fix 2 | Provide a value of the same type as the first " \ + "domain of the given accumulator.\n" \ + "* Possible fix 3 | Provide a compatible accumulator where the first " \ + "domain is of the type of the given value in the template paramters " \ + "of this call to " y ".\n" \ + "********************************************************************" \ + "********************************************************************" \ + "******************************\n" ); + +#define NO_MASKCAST_ASSERT( x, y, z ) \ + static_assert( x, \ + "\n\n" \ + "********************************************************************" \ + "********************************************************************" \ + "******************************\n" \ + "* ERROR | " y " " z ".\n" \ + "********************************************************************" \ + 
"********************************************************************" \ + "******************************\n" \ + "* Possible fix 1 | Remove no_casting from the template parameters " \ + "in this call to " y ".\n" \ + "* Possible fix 2 | Provide a vector of Booleans in this call to " y ".\n" \ + "********************************************************************" \ + "********************************************************************" \ + "******************************\n" ); + + +namespace grb { + + namespace internal { + + extern LazyEvaluation le; + + } + +} + +namespace grb { + + // forward declaration of backend-local matrix specialization for vector's + // friends + template< typename D, typename RIT, typename CIT, typename NIT > + class Matrix< D, ascend, RIT, CIT, NIT >; + + // forward-declare internal getters + namespace internal { + + template< typename D, typename C > + inline C & getCoordinates( Vector< D, ascend, C > &x ) noexcept; + + template< typename D, typename C > + inline const C & getCoordinates( + const Vector< D, ascend, C > &x + ) noexcept; + + template< typename D, typename C > + inline D * getRaw( Vector< D, ascend, C > &x ) noexcept; + + template< typename D, typename C > + inline const D * getRaw( const Vector< D, ascend, C > &x ) noexcept; + + template< typename D, typename RIT, typename CIT, typename NIT > + inline internal::Compressed_Storage< D, RIT, NIT > & getCRS( + Matrix< D, ascend, RIT, CIT, NIT > &A + ) noexcept; + + template< typename D, typename RIT, typename CIT, typename NIT > + inline const internal::Compressed_Storage< D, RIT, NIT > & getCRS( + const Matrix< D, ascend, RIT, CIT, NIT > &A + ) noexcept; + + template< typename D, typename RIT, typename CIT, typename NIT > + inline internal::Compressed_Storage< D, CIT, NIT > & getCCS( + Matrix< D, ascend, RIT, CIT, NIT > &A + ) noexcept; + + template< typename D, typename RIT, typename CIT, typename NIT > + inline const internal::Compressed_Storage< D, CIT, NIT > & getCCS( + const Matrix< D, ascend, RIT, CIT, NIT > &A + ) noexcept; + + template< typename D, typename C > + inline Vector< D, reference, C >& getRefVector( + Vector< D, ascend, C > &x ) noexcept; + + template< typename D, typename C > + inline const Vector< D, reference, C >& getRefVector( + const Vector< D, ascend, C > &x ) noexcept; + + } // namespace internal + + template< typename D, typename MyCoordinates > + class Vector< D, ascend, MyCoordinates > { + + static_assert( !grb::is_object< D >::value, "Cannot create an ALP/GraphBLAS" + "vector of ALP/GraphBLAS objects!" 
); + + /* ********************* + `Getter' friends + ********************* */ + + friend MyCoordinates & internal::getCoordinates< D, MyCoordinates >( + Vector< D, ascend, MyCoordinates > & x ) noexcept; + + friend const MyCoordinates & internal::getCoordinates< D, MyCoordinates >( + const Vector< D, ascend, MyCoordinates > & x ) noexcept; + + friend D * internal::getRaw< D, MyCoordinates >( + Vector< D, ascend, MyCoordinates > & x ) noexcept; + + friend const D * internal::getRaw< D, MyCoordinates >( + const Vector< D, ascend, MyCoordinates > & x ) noexcept; + + friend Vector< D, reference, MyCoordinates > & internal::getRefVector<>( + Vector< D, ascend, MyCoordinates > &x ) noexcept; + + friend const Vector< D, reference, MyCoordinates > & internal::getRefVector<>( + const Vector< D, ascend, MyCoordinates > &x ) noexcept; + + /* ********************* + IO friends + ********************* */ + + friend class PinnedVector< D, ascend >; + + + private: + + Vector< D, reference, MyCoordinates > ref; + + + public: + + /** @see Vector::value_type. */ + typedef D value_type; + + /** + * This implementation makes the simplest implementation choice and declares + * a lambda reference to be of the same type as a regular C++ reference. The + * restrictions as specified in Vector::lambda_reference, however, still + * apply. + * + * @see Vector::lambda_reference for the user-level specification. + */ + typedef D & lambda_reference; + + typedef typename Vector< D, reference, MyCoordinates >::const_iterator + const_iterator; + + + Vector( const size_t n, const size_t nz ) : ref( n, nz ) {} + + Vector( const size_t n ) : Vector( n, n ) { + + // pipeline execution is not required here as this is a grb::Vector + // declaration +#ifdef _DEBUG + std::cerr << "In Vector< ascend >::Vector( size_t ) constructor\n"; +#endif + } + + Vector() : Vector( 0 ) {} + + Vector( const Vector< D, ascend, MyCoordinates > &x ) : + ref( size( x.ref ), capacity( x.ref ) ) + { + // full delegation to the copy constructor of the reference backend is + // impossible since the pipeline must be executed before the copy + // constructor + // instead a parameterized constructor of the reference backend is invoked + // to perform the necessary initialization as the initialize method is not + // defined for the ascend backend + if( internal::getCoordinates( x ).size() > 0 ) { + internal::le.execution( &x ); + } + + + // once the execution of any required pipeline is completed + // the set primitive initializes the vector for this copy constructor + if( size( x ) > 0 ) { + const RC rc = set( *this, x ); + if( rc != SUCCESS ) { + throw std::runtime_error( "grb::set inside copy-constructor: " + + toString( rc ) ); + } + } + } + + Vector( Vector< D, ascend, MyCoordinates > &&x ) noexcept { + + if( internal::getCoordinates( x ).size() > 0 ) { + internal::le.execution( &x ); + } + + ref = std::move( x.ref ); + } + + Vector< D, ascend, MyCoordinates > & operator=( + const Vector< D, ascend, MyCoordinates > &x + ) { + const RC rc = set( *this, x ); + if( rc != grb::SUCCESS ) { + throw std::runtime_error( grb::toString( rc ) ); + } + return *this; + } + + Vector< D, ascend, MyCoordinates > & operator=( + Vector< D, ascend, MyCoordinates > &&x + ) noexcept { + if( internal::getCoordinates( x ).size() > 0 ) { + internal::le.execution( &x ); + } + ref = std::move( x.ref ); + return *this; + } + + ~Vector() { + /* TODO this interferes with opgen + if( internal::getCoordinates( *this ).size() > 0 ) { + internal::le.execution( this ); + }*/ + } + + 
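To make the nonblocking contract concrete, here is a minimal usage sketch (not part of this change set): read accessors such as the member nnz() and the iterators defined below first flush any pending pipeline that touches the container, so user code always observes up-to-date data. The umbrella header name and the use of grb::set are assumptions for illustration, and the translation unit is assumed to be compiled with the ascend backend selected.

#include <iostream>

#include <graphblas.hpp> // assumed umbrella header

int vector_read_sketch() {
	grb::Vector< double > x( 8 );
	if( grb::set( x, 1.5 ) != grb::SUCCESS ) { // may only be queued in a lazy pipeline
		return 1;
	}
	size_t count = 0;
	if( x.nnz( count ) != grb::SUCCESS ) {     // read accessor: executes the pending pipeline first
		return 1;
	}
	std::cout << count << " nonzeroes\n";
	for( const auto &nz : x ) {                // iteration likewise triggers execution before reading
		std::cout << nz.first << ": " << nz.second << "\n";
	}
	return 0;
}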
const_iterator begin( + const size_t s = 0, const size_t P = 1 + ) const { + if( internal::getCoordinates( *this ).size() > 0 ) { + internal::le.execution( this ); + } + + return ref.begin(s, P); + } + + const_iterator end( + const size_t s = 0, const size_t P = 1 + ) const { + if( internal::getCoordinates( *this ).size() > 0 ) { + internal::le.execution( this ); + } + + return ref.end(s, P); + } + + const_iterator cbegin( + const size_t s = 0, const size_t P = 1 + ) const { + if( internal::getCoordinates( *this ).size() > 0 ) { + internal::le.execution( this ); + } + + return ref.cbegin(s, P); + } + + const_iterator cend( + const size_t s = 0, const size_t P = 1 + ) const { + if( internal::getCoordinates( *this ).size() > 0 ) { + internal::le.execution( this ); + } + + return ref.cend(s, P); + } + + template< Descriptor descr = descriptors::no_operation, + typename mask_type, + class Accum, + typename ind_iterator = const size_t * __restrict__, + typename nnz_iterator = const D * __restrict__, + class Dup = operators::right_assign< + D, typename nnz_iterator::value_type, D + > + > + RC build( + const Vector< mask_type, ascend, MyCoordinates > &mask, + const Accum &accum, + const ind_iterator ind_start, + const ind_iterator ind_end, + const nnz_iterator nnz_start, + const nnz_iterator nnz_end, + const Dup &dup = Dup() + ) { + return ref.build( mask.ref, accum, ind_start, ind_end, nnz_start, nnz_end, + dup ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Accum = operators::right_assign< D, D, D >, + typename T, typename mask_type = bool + > + RC assign( + const T &val, + const Vector< mask_type, ascend, MyCoordinates > &mask, + const Accum &accum = Accum() + ) { + return ref.assign( val, mask.ref, accum ); + } + + template< typename T > + RC nnz( T &nnz ) const { + if( internal::getCoordinates( *this ).size() > 0 ) { + internal::le.execution( this ); + } + + return ref.nnz( nnz ); + } + + D * raw() const { + return ref.raw(); + } + + lambda_reference operator[]( const size_t i ) { + return ref[ i ]; + } + + lambda_reference operator[]( const size_t i ) const { + return ref[ i ]; + } + + }; + + // specialisation for GraphBLAS type_traits + template< typename D, typename Coord > + struct is_container< Vector< D, ascend, Coord > > { + /** A ascend vector is a GraphBLAS object. 
*/ + static const constexpr bool value = true; + }; + + // internal getters implementation + namespace internal { + + template< typename D, typename C > + inline C & getCoordinates( Vector< D, ascend, C > &x ) noexcept { + return internal::getCoordinates( x.ref ); + } + + template< typename D, typename C > + inline const C & getCoordinates( + const Vector< D, ascend, C > &x + ) noexcept { + return internal::getCoordinates( x.ref ); + } + + template< typename D, typename C > + inline D * getRaw( Vector< D, ascend, C > &x ) noexcept { + return getRaw( x.ref ); + } + + template< typename D, typename C > + inline const D * getRaw( const Vector< D, ascend, C > &x ) noexcept { + return getRaw( x.ref ); + } + + template< typename D, typename RIT, typename CIT, typename NIT > + inline internal::Compressed_Storage< D, RIT, NIT > & getCRS( + Matrix< D, ascend, RIT, CIT, NIT > &A + ) noexcept { + return getCRS( A.ref ); + } + + template< typename D, typename RIT, typename CIT, typename NIT > + inline const internal::Compressed_Storage< D, RIT, NIT > & getCRS( + const Matrix< D, ascend, RIT, CIT, NIT > &A + ) noexcept { + return getCRS( A.ref ); + } + + template< typename D, typename RIT, typename CIT, typename NIT > + inline internal::Compressed_Storage< D, CIT, NIT > & getCCS( + Matrix< D, ascend, RIT, CIT, NIT > &A + ) noexcept { + return getCCS( A.ref ); + } + + template< typename D, typename RIT, typename CIT, typename NIT > + inline const internal::Compressed_Storage< D, CIT, NIT > & getCCS( + const Matrix< D, ascend, RIT, CIT, NIT > &A + ) noexcept { + return getCCS( A.ref ); + } + + template< typename D, typename C > + inline Vector< D, reference, C >& getRefVector( + Vector< D, ascend, C > &x + ) noexcept { + return x.ref; + } + + template< typename D, typename C > + inline const Vector< D, reference, C >& getRefVector( + const Vector< D, ascend, C > &x + ) noexcept { + return x.ref; + } + + } // namespace internal + +} // namespace grb + +#undef NO_CAST_ASSERT +#undef NO_MASKCAST_ASSERT + +#endif // end ``_H_GRB_ASCEND_VECTOR'' + diff --git a/include/graphblas/ascend/vector_wrapper.hpp b/include/graphblas/ascend/vector_wrapper.hpp new file mode 100644 index 000000000..ac10ac2fe --- /dev/null +++ b/include/graphblas/ascend/vector_wrapper.hpp @@ -0,0 +1,192 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Provides a wrapper to a scalar or a vector, for those primitives that could + * take either. + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ASCEND_VECTOR_WRAPPER +#define _H_GRB_ASCEND_VECTOR_WRAPPER + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "coordinates.hpp" +#include "vector.hpp" +#include "blas1.hpp" + + +namespace grb { + + namespace internal { + + /** + * A wrapper class used to store a scalar value, which is passed by value to + * an internal function used by an ALP/GraphBLAS operation. 
The wrapper + * classes are used by operations that may have a formal parameter that is + * either a scalar or a vector, because the implementation is generic and + * handles all possible cases. + */ + template< bool scalar, typename InputType, typename CoordinatesType > + class Wrapper { + + private: + + /** + * \warning This is not a reference, since the semantics are that the + * \em current scalar value is used. + */ + InputType val; + + + public: + + /** Base constructor that copies the input scalar. */ + Wrapper(const InputType &value) : val( value ) {} + + /** Default copy constructor. */ + Wrapper( const Wrapper< scalar, InputType, CoordinatesType > & ) = default; + + /** + * @returns nullptr + * + * This function returns a raw array for vectors only). + */ + constexpr InputType * getRaw() const { + return nullptr; + } + + /** + * @returns nullptr + * + * This function returns coordinates only for vectors. + */ + constexpr CoordinatesType * getCoordinates() const { + return nullptr; + } + + /** + * @returns nullptr + * + * This function returns a vector pointer only when wrapping a vector. + */ + constexpr Vector< InputType, ascend, CoordinatesType > * getPointer() + const + { + return nullptr; + } + + /** + * @returns The scalar value it wraps. + */ + const InputType & getValue() const { + return val; + } + + /** + * @returns Whether the underlying container is dense. + */ + bool isDense() const { + return true; + } + + }; + + /** + * A wrapper class used to store a vector, which is passed by reference to an + * internal function used by an ALP/GraphBLAS operation. The wrapper classes + * are used by by operations that may have a formal parameter that is either a + * scalar or a vector, because the implementation is generic and handles all + * possible cases. + */ + template< typename InputType, typename CoordinatesType > + class Wrapper< false, InputType, CoordinatesType > { + + private: + + /** A reference to the vector this class wraps. */ + const Vector< InputType, ascend, CoordinatesType > &vec; + + + public: + + /** Base constructor wrapping arund a given \a vector. */ + Wrapper( const Vector< InputType, ascend, CoordinatesType > &vector ) : + vec( vector ) + {} + + /** Copy constructor. */ + Wrapper( const Wrapper< false, InputType, CoordinatesType > &w ) : + vec( w.vec ) + {} + + /** @returns The underlying raw value array. */ + const InputType * getRaw() const { + return internal::getRaw( vec ); + } + + /** @returns The underlying coordinates instance. */ + const CoordinatesType * getCoordinates() const { + return &internal::getCoordinates( vec ); + } + + /** @returns The underlying vector (a pointer to it). */ + const Vector< InputType, ascend, CoordinatesType > * getPointer() + const + { + return &vec; + } + + /** + * @returns a possibly unitialised value that is not intended to be + * consumed. + * + * \warning This function should only be called on wrappers of scalars. + */ + const InputType & getValue() const { + // this is a trick to avoid compilation errors, this value will never be + // used in practice + return *( getRaw( ) ); + } + + /** + * @returns Whether the underlying vector is dense. 
+ */ + bool isDense() const { + return internal::getCoordinates( vec ).isDense(); + } + }; + + } // end namespace ``internal'' + +} // end namespace ``grb'' + +#endif + diff --git a/include/graphblas/backends.hpp b/include/graphblas/backends.hpp index 653348112..5c5770377 100644 --- a/include/graphblas/backends.hpp +++ b/include/graphblas/backends.hpp @@ -75,6 +75,14 @@ namespace grb { */ nonblocking, + /** + * The Ascend backend. This is not a true ALP implementation -- programs + * compiled using this backend generate AscendC code. This backend may require + * the use of ALP/Ascend-specific primitives as it currently operates without + * compiler support. + */ + ascend, + /** * \internal * A shared-memory parallel distribution based on a row-wise 1D block-cyclic diff --git a/include/graphblas/benchmark.hpp b/include/graphblas/benchmark.hpp index 81bd67773..81e46c251 100644 --- a/include/graphblas/benchmark.hpp +++ b/include/graphblas/benchmark.hpp @@ -36,6 +36,9 @@ #ifdef _GRB_WITH_NONBLOCKING #include "graphblas/nonblocking/benchmark.hpp" #endif +#ifdef _GRB_WITH_ASCEND + #include "graphblas/ascend/benchmark.hpp" +#endif #ifdef _GRB_WITH_BANSHEE #include "graphblas/banshee/benchmark.hpp" #endif diff --git a/include/graphblas/blas1.hpp b/include/graphblas/blas1.hpp index e28c9e8ad..d33a51cf3 100644 --- a/include/graphblas/blas1.hpp +++ b/include/graphblas/blas1.hpp @@ -34,6 +34,9 @@ #ifdef _GRB_WITH_NONBLOCKING #include "graphblas/nonblocking/blas1.hpp" #endif +#ifdef _GRB_WITH_ASCEND + #include "graphblas/ascend/blas1.hpp" +#endif #ifdef _GRB_WITH_BANSHEE #include #endif diff --git a/include/graphblas/blas2.hpp b/include/graphblas/blas2.hpp index 2a0b1338e..2f0afbc43 100644 --- a/include/graphblas/blas2.hpp +++ b/include/graphblas/blas2.hpp @@ -39,6 +39,9 @@ #ifdef _GRB_WITH_NONBLOCKING #include "graphblas/nonblocking/blas2.hpp" #endif +#ifdef _GRB_WITH_ASCEND + #include "graphblas/ascend/blas2.hpp" +#endif #ifdef _GRB_WITH_BANSHEE #include #endif diff --git a/include/graphblas/blas3.hpp b/include/graphblas/blas3.hpp index 6ed90264b..77dfcbb8c 100644 --- a/include/graphblas/blas3.hpp +++ b/include/graphblas/blas3.hpp @@ -36,6 +36,9 @@ #ifdef _GRB_WITH_NONBLOCKING #include "graphblas/nonblocking/blas3.hpp" #endif +#ifdef _GRB_WITH_ASCEND + #include "graphblas/ascend/blas3.hpp" +#endif #ifdef _GRB_WITH_LPF #include #endif diff --git a/include/graphblas/collectives.hpp b/include/graphblas/collectives.hpp index 8ca63fd3e..e3f839c27 100644 --- a/include/graphblas/collectives.hpp +++ b/include/graphblas/collectives.hpp @@ -36,6 +36,9 @@ #ifdef _GRB_WITH_NONBLOCKING #include "graphblas/nonblocking/collectives.hpp" #endif +#ifdef _GRB_WITH_ASCEND + #include "graphblas/ascend/collectives.hpp" +#endif #ifdef _GRB_WITH_LPF #include #endif diff --git a/include/graphblas/config.hpp b/include/graphblas/config.hpp index d7c2a650f..8a3652ca7 100644 --- a/include/graphblas/config.hpp +++ b/include/graphblas/config.hpp @@ -35,6 +35,9 @@ #ifdef _GRB_WITH_NONBLOCKING #include "graphblas/nonblocking/config.hpp" #endif +#ifdef _GRB_WITH_ASCEND + #include "graphblas/ascend/config.hpp" +#endif #ifdef _GRB_WITH_OMP #include "graphblas/omp/config.hpp" #endif diff --git a/include/graphblas/coordinates.hpp b/include/graphblas/coordinates.hpp index 43f5c9845..59e80adad 100644 --- a/include/graphblas/coordinates.hpp +++ b/include/graphblas/coordinates.hpp @@ -32,8 +32,8 @@ #ifdef _GRB_WITH_NONBLOCKING #include "graphblas/nonblocking/coordinates.hpp" #endif -#ifdef _GRB_WITH_LPF -// #include +#ifdef _GRB_WITH_ASCEND 
+ #include "graphblas/ascend/coordinates.hpp" #endif #ifdef _GRB_WITH_BANSHEE #include diff --git a/include/graphblas/exec.hpp b/include/graphblas/exec.hpp index f7ecb8cc2..64477bbef 100644 --- a/include/graphblas/exec.hpp +++ b/include/graphblas/exec.hpp @@ -36,6 +36,9 @@ #ifdef _GRB_WITH_NONBLOCKING #include "graphblas/nonblocking/exec.hpp" #endif +#ifdef _GRB_WITH_ASCEND + #include "graphblas/ascend/exec.hpp" +#endif #ifdef _GRB_WITH_LPF #include "graphblas/bsp1d/exec.hpp" #endif diff --git a/include/graphblas/identities.hpp b/include/graphblas/identities.hpp index fdbb7c7f7..6478eb7ab 100644 --- a/include/graphblas/identities.hpp +++ b/include/graphblas/identities.hpp @@ -131,14 +131,19 @@ namespace grb { * `minus infinity'. */ static constexpr D value() { - return std::numeric_limits< D >::min() == 0 ? 0 : ( std::numeric_limits< D >::has_infinity ? -std::numeric_limits< D >::infinity() : std::numeric_limits< D >::min() ); + return std::numeric_limits< D >::min() == 0 + ? 0 + : ( std::numeric_limits< D >::has_infinity + ? -std::numeric_limits< D >::infinity() + : std::numeric_limits< D >::min() ); } }; template< typename K, typename V > class negative_infinity< std::pair< K, V > > { public: static constexpr std::pair< K, V > value() { - return std::make_pair( negative_infinity< K >::value(), negative_infinity< V >::value() ); + return std::make_pair( negative_infinity< K >::value(), + negative_infinity< V >::value() ); } }; @@ -149,9 +154,11 @@ namespace grb { */ template< typename D > class logical_false { - static_assert( std::is_convertible< bool, D >::value, "Cannot form identity under the requested domain" ); + static_assert( std::is_convertible< bool, D >::value, + "Cannot form identity under the requested domain" ); + + public: - public: /** * @tparam D The domain of the value to return. 
* @return The identity under the standard logical OR operator, i.e., @@ -176,7 +183,8 @@ namespace grb { */ template< typename D > class logical_true { - static_assert( std::is_convertible< bool, D >::value, "Cannot form identity under the requested domain" ); + static_assert( std::is_convertible< bool, D >::value, + "Cannot form identity under the requested domain" ); public: /** @@ -192,11 +200,13 @@ namespace grb { class logical_true< std::pair< K, V > > { public: static constexpr std::pair< K, V > value() { - return std::make_pair( logical_true< K >::value(), logical_true< V >::value() ); + return std::make_pair( logical_true< K >::value(), + logical_true< V >::value() ); } }; } // namespace identities + } // namespace grb #endif diff --git a/include/graphblas/init.hpp b/include/graphblas/init.hpp index dd34749ba..794f6c475 100644 --- a/include/graphblas/init.hpp +++ b/include/graphblas/init.hpp @@ -36,6 +36,9 @@ #ifdef _GRB_WITH_NONBLOCKING #include "graphblas/nonblocking/init.hpp" #endif +#ifdef _GRB_WITH_ASCEND + #include "graphblas/ascend/init.hpp" +#endif #ifdef _GRB_WITH_LPF #include "graphblas/bsp1d/init.hpp" #endif diff --git a/include/graphblas/io.hpp b/include/graphblas/io.hpp index 8fbb70a13..a326a294c 100644 --- a/include/graphblas/io.hpp +++ b/include/graphblas/io.hpp @@ -35,6 +35,9 @@ #ifdef _GRB_WITH_NONBLOCKING #include "graphblas/nonblocking/io.hpp" #endif +#ifdef _GRB_WITH_ASCEND + #include "graphblas/ascend/io.hpp" +#endif #ifdef _GRB_WITH_LPF #include #endif diff --git a/include/graphblas/matrix.hpp b/include/graphblas/matrix.hpp index bccb0f928..8e46ba8ef 100644 --- a/include/graphblas/matrix.hpp +++ b/include/graphblas/matrix.hpp @@ -36,6 +36,9 @@ #ifdef _GRB_WITH_NONBLOCKING #include "graphblas/nonblocking/matrix.hpp" #endif +#ifdef _GRB_WITH_ASCEND + #include "graphblas/ascend/matrix.hpp" +#endif #ifdef _GRB_WITH_LPF #include #endif diff --git a/include/graphblas/nonblocking/coordinates.hpp b/include/graphblas/nonblocking/coordinates.hpp index 1aca8e322..c24ca64a3 100644 --- a/include/graphblas/nonblocking/coordinates.hpp +++ b/include/graphblas/nonblocking/coordinates.hpp @@ -45,6 +45,7 @@ #include #include +#include #include #include diff --git a/include/graphblas/nonblocking/lazy_evaluation.hpp b/include/graphblas/nonblocking/lazy_evaluation.hpp index 426f530fb..791ae122c 100644 --- a/include/graphblas/nonblocking/lazy_evaluation.hpp +++ b/include/graphblas/nonblocking/lazy_evaluation.hpp @@ -43,11 +43,18 @@ namespace grb { */ class LazyEvaluation { - private: + friend class alp::internal::OpGen; + + + //private: + public: // DBG /** Multiple pipelines may be maintained at any time. */ std::vector< Pipeline > pipelines; + + private: // DBG + /** Stores the pipelines that share data with the new stage. */ std::vector< std::vector< Pipeline >::iterator > shared_data_pipelines; @@ -116,10 +123,15 @@ namespace grb { const size_t data_type_size, const bool dense_descr, const bool dense_mask, + const size_t output_container_id, + // TODO FIXME is there really a need for pointers? void * const output_container_ptr, void * const output_aux_container_ptr, Coordinates< nonblocking > * const coor_output_ptr, Coordinates< nonblocking > * const coor_output_aux_ptr, + const size_t input_a_id, const size_t input_b_id, + const size_t input_c_id, const size_t input_d_id, + // TODO FIXME is there really a need for pointers? 
const void * const input_a_ptr, const void * const input_b_ptr, const void * const input_c_ptr, @@ -128,6 +140,8 @@ namespace grb { const Coordinates< nonblocking > * const coor_b_ptr, const Coordinates< nonblocking > * const coor_c_ptr, const Coordinates< nonblocking > * const coor_d_ptr, + const size_t input_matrix_id, + // TODO FIXME is there really a need for pointers? const void * const input_matrix ); diff --git a/include/graphblas/nonblocking/pipeline.hpp b/include/graphblas/nonblocking/pipeline.hpp index 62500d115..a1689a88c 100644 --- a/include/graphblas/nonblocking/pipeline.hpp +++ b/include/graphblas/nonblocking/pipeline.hpp @@ -72,6 +72,15 @@ #include "coordinates.hpp" +// TODO ugly hack, fwd declare ALP::internal::OpGen +namespace alp { + template< size_t process_order, size_t problem_order > + class Grid; + namespace internal { + class OpGen; + } +} + namespace grb { namespace internal { @@ -105,6 +114,11 @@ namespace grb { */ class Pipeline { + friend class alp::internal::OpGen; + template< size_t process_order, size_t problem_order > + friend + class alp::Grid; + public: // The pipeline is passed by reference such that an out-of-place operation @@ -119,9 +133,22 @@ namespace grb { size_t containers_size; size_t size_of_data_type; + + // per-stage data std::vector< stage_type > stages; + + + public: //DBG + std::vector< Opcode > opcodes; + + private: //DBG + + std::vector< std::vector< size_t > > stage_inputs; + std::vector< size_t > stage_output; + + // per-pipeline data std::set< Coordinates< nonblocking > * > accessed_coordinates; std::set< const void * > input_vectors; std::set< const void * > output_vectors; @@ -276,10 +303,17 @@ namespace grb { const size_t data_type_size, const bool dense_descr, const bool dense_mask, + const size_t output_vector_id, + // TODO FIXME is there really a need for pointers? void * const output_vector_ptr, void * const output_aux_vector_ptr, Coordinates< nonblocking > * const coor_output_ptr, Coordinates< nonblocking > * const coor_output_aux_ptr, + const size_t input_a_id, + const size_t input_b_id, + const size_t input_c_id, + const size_t input_d_id, + // TODO FIXME is there really a need for pointers? const void * const input_a_ptr, const void * const input_b_ptr, const void * const input_c_ptr, @@ -288,6 +322,8 @@ namespace grb { const Coordinates< nonblocking > * const coor_b_ptr, const Coordinates< nonblocking > * const coor_c_ptr, const Coordinates< nonblocking > * const coor_d_ptr, + const size_t input_matrix_id, + // TODO FIXME is there really a need for pointers? 
const void * const input_matrix ); diff --git a/include/graphblas/pinnedvector.hpp b/include/graphblas/pinnedvector.hpp index 380c53ae7..4bf106fbd 100644 --- a/include/graphblas/pinnedvector.hpp +++ b/include/graphblas/pinnedvector.hpp @@ -40,6 +40,9 @@ #ifdef _GRB_WITH_NONBLOCKING #include "graphblas/nonblocking/pinnedvector.hpp" #endif +#ifdef _GRB_WITH_ASCEND + #include "graphblas/ascend/pinnedvector.hpp" +#endif #ifdef _GRB_WITH_LPF #include #endif diff --git a/include/graphblas/properties.hpp b/include/graphblas/properties.hpp index 864b849cd..effcded68 100644 --- a/include/graphblas/properties.hpp +++ b/include/graphblas/properties.hpp @@ -36,6 +36,9 @@ #ifdef _GRB_WITH_NONBLOCKING #include "graphblas/nonblocking/properties.hpp" #endif +#ifdef _GRB_WITH_ASCEND + #include "graphblas/ascend/properties.hpp" +#endif #ifdef _GRB_WITH_LPF #include #endif diff --git a/include/graphblas/spmd.hpp b/include/graphblas/spmd.hpp index 88cef92bc..f65a8c1ea 100644 --- a/include/graphblas/spmd.hpp +++ b/include/graphblas/spmd.hpp @@ -35,6 +35,9 @@ #ifdef _GRB_WITH_NONBLOCKING #include "graphblas/nonblocking/spmd.hpp" #endif +#ifdef _GRB_WITH_ASCEND + #include "graphblas/ascend/spmd.hpp" +#endif #ifdef _GRB_WITH_LPF #include "graphblas/bsp1d/spmd.hpp" #endif diff --git a/include/graphblas/tags.hpp b/include/graphblas/tags.hpp index 07d39eb2e..e1791b2be 100644 --- a/include/graphblas/tags.hpp +++ b/include/graphblas/tags.hpp @@ -42,6 +42,12 @@ namespace grb { * template to grb::Vector and grb::Matrix creates a combinatorial * explosion in the number of combinations that must be caught. * Are there better alternatives? + * + * Update 2023: yes there are, see Spampinato et al., ARRAY '23. This + * file will be removed in future releases when it is replaced by the + * concept of \em views and particular that of xMFs that prevent the + * feared combinatorial explosion, both introduced in the aforementioned + * paper. 
*/ namespace tags { diff --git a/include/graphblas/utils/alloc.hpp b/include/graphblas/utils/alloc.hpp index 5943e5216..fd98b6b72 100644 --- a/include/graphblas/utils/alloc.hpp +++ b/include/graphblas/utils/alloc.hpp @@ -59,6 +59,9 @@ namespace grb { #ifdef _GRB_WITH_NONBLOCKING #include "graphblas/nonblocking/alloc.hpp" #endif +#ifdef _GRB_WITH_ASCEND + #include "graphblas/ascend/alloc.hpp" +#endif #ifdef _GRB_WITH_LPF #include "graphblas/bsp1d/alloc.hpp" #endif diff --git a/include/graphblas/vector.hpp b/include/graphblas/vector.hpp index 5ac75b1b1..ee7613ea1 100644 --- a/include/graphblas/vector.hpp +++ b/include/graphblas/vector.hpp @@ -37,6 +37,9 @@ #ifdef _GRB_WITH_NONBLOCKING #include "graphblas/nonblocking/vector.hpp" #endif +#ifdef _GRB_WITH_ASCEND + #include "graphblas/ascend/vector.hpp" +#endif #ifdef _GRB_WITH_LPF #include #endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 50e731a30..d032974d3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -89,15 +89,20 @@ list( JOIN compiler_list " " BACKEND_COMPILERS_SPACED ) list( JOIN compile_options_list " " BACKEND_CFLAGS_SPACED ) list( JOIN link_flags_list " " BACKEND_LFLAGS_SPACED ) configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/grbcxx.in ${CMAKE_CURRENT_BINARY_DIR}/grbcxx @ONLY ) +configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/grbcxx.in ${CMAKE_CURRENT_BINARY_DIR}/alpcxx @ONLY ) list( JOIN runenv_list " " BACKEND_RUNENV_SPACED ) list( JOIN runner_list " " BACKEND_RUNNER_SPACED ) configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/grbrun.in ${CMAKE_CURRENT_BINARY_DIR}/grbrun @ONLY ) +configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/grbrun.in ${CMAKE_CURRENT_BINARY_DIR}/alprun @ONLY ) # install them to the install folder with execute permission install( FILES ${CMAKE_CURRENT_BINARY_DIR}/setenv ${CMAKE_CURRENT_BINARY_DIR}/grbcxx ${CMAKE_CURRENT_BINARY_DIR}/grbrun + ${CMAKE_CURRENT_BINARY_DIR}/alpcxx + ${CMAKE_CURRENT_BINARY_DIR}/alprun + ${CMAKE_CURRENT_BINARY_DIR}/ascendcc DESTINATION "${BIN_INSTALL_DIR}" PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE @@ -105,6 +110,16 @@ install( FILES ${CMAKE_CURRENT_BINARY_DIR}/setenv WORLD_READ WORLD_EXECUTE ) +if( WITH_ASCEND_BACKEND ) + configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/ascendcc.in ${CMAKE_CURRENT_BINARY_DIR}/ascendcc @ONLY ) + install( FILES ${CMAKE_CURRENT_BINARY_DIR}/ascendcc + DESTINATION "${BIN_INSTALL_DIR}" + PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE + GROUP_READ GROUP_EXECUTE + WORLD_READ WORLD_EXECUTE + ) +endif() + ### GENERATE CMAKE INFRASTRUCTURE INSIDE INSTALLATION DIRECTORY include(CMakePackageConfigHelpers) diff --git a/src/ascendcc.in b/src/ascendcc.in new file mode 100755 index 000000000..27d420242 --- /dev/null +++ b/src/ascendcc.in @@ -0,0 +1,125 @@ +#!/bin/bash + +# +# Copyright 2021 Huawei Technologies Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +if [ -z "${ASCEND_TOOLKIT_INSTALL_PATH}" ]; then + echo "Please set ASCEND_TOOLKIT_INSTALL_PATH" + echo ' - for example, issue: export ASCEND_TOOLKIT_INSTALL_PATH="/home/yzelman/Packages/CANN/x86_64/ascend-toolkit/latest/"' + exit 255 +fi + +#example path: +#ASCEND_TOOLKIT_INSTALL_PATH="/home/yzelman/Packages/CANN/ascend-toolkit/latest/" + +if [ ! -d "${ASCEND_TOOLKIT_INSTALL_PATH}" ]; then + echo "Error: ASCEND_TOOLKIT_INSTALL_PATH (${ASCEND_TOOLKIT_INSTALL_PATH}) does not exist" + exit 255 +fi + +ASCEND_TOOLKIT_INSTALL_PATH=${ASCEND_TOOLKIT_INSTALL_PATH}/ +echo "Info: using ASCEND_TOOLKIT_INSTALL_PATH=${ASCEND_TOOLKIT_INSTALL_PATH}" + +ASCEND_COMPILER="${ASCEND_TOOLKIT_INSTALL_PATH}/compiler/ccec_compiler/bin/ccec" + +BACKENDS=( "910B" "910" ) +ASCEND_CXXFLAGS="-xcce -DTILING_KEY_VAR=0 -O2 -std=c++17 -fPIC -pthread" +ASCEND_CCECFLAGS_LIST=( "--cce-aicore-arch=dav-c220-cube -mllvm -cce-aicore-function-stack-size=16000 -mllvm -cce-aicore-record-overflow=false -mllvm -cce-aicore-addr-transform --cce-auto-sync" \ + "--cce-aicore-arch=dav-c100 --cce-auto-sync" ) +ASCEND_CCECFLAGS_LLIST=( "--cce-fatobj-link --cce-aicore-arch=dav-c220-cube" \ + "--cce-fatobj-link --cce-aicore-arch=dav-c100" ) +ASCEND_LFLAGS_LIST=( "-L${ASCEND_TOOLKIT_INSTALL_PATH}runtime/lib64 -L${ASCEND_TOOLKIT_INSTALL_PATH}tools/simulator/Ascend910B1/lib -L${ASCEND_TOOLKIT_INSTALL_PATH}tools/tikicpulib/lib/Ascend910B1 -lstdc++ -lruntime -lascendcl" \ + "-L${ASCEND_TOOLKIT_INSTALL_PATH}runtime/lib64 -L${ASCEND_TOOLKIT_INSTALL_PATH}tools/simulator/Ascend910A/lib -L${ASCEND_TOOLKIT_INSTALL_PATH}tools/tikicpulib/lib/Ascend910A -lstdc++ -lruntime -lascendcl" ) +ASCEND_IFLAGS="-I${ASCEND_TOOLKIT_INSTALL_PATH}acllib/include -I${ASCEND_TOOLKIT_INSTALL_PATH}compiler/tikcpp/tikcfw -I${ASCEND_TOOLKIT_INSTALL_PATH}compiler/tikcpp/tikcfw/impl -I${ASCEND_TOOLKIT_INSTALL_PATH}compiler/tikcpp/tikcfw/interface -I${ASCEND_TOOLKIT_INSTALL_PATH}tools/tikicpulib/lib/include" + +declare -a ARGS +SHOW="eval" +BACKEND=${BACKENDS[0]} +LINK=true + +while [[ $# -gt 0 ]]; do + option="$1" + shift; + case ${option} in + -b|--backend) + BACKEND=$1 + shift + ;; + -c) + ARGS+=("-c") + LINK=false + ;; + --show) + SHOW="echo" + ;; + --version) + echo "This is ALP/Ascend" + echo " " + echo "This software comes with NO warranty; not even for" + echo "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." + echo "See the license file for details." 
+ echo " " + ARGS+=("${option}") + ;; + --) + break + ;; + *) + ARGS+=( "${option}" ) + ;; + esac +done + +BACKENDID=-1 +for i in "${!BACKENDS[@]}"; do + if [[ "${BACKENDS[$i]}" = "${BACKEND}" ]] + then + BACKENDID=$i + break + fi +done + +echo "Info: compiling for ${BACKEND}" + +if [[ ${BACKENDID} -eq -1 ]] +then + echo "Could not find requested backend \`${BACKEND}'" + exit 255 +fi + +ASCEND_CCECFLAGS=${ASCEND_CCECFLAGS_LIST[$i]} + +if ${LINK} +then + CMD="${ASCEND_COMPILER} ${ASCEND_CCECFLAGS_LLIST[$i]} "${ARGS[@]}" "$@" ${ASCEND_LFLAGS_LIST[$i]}" +else + CMD="${ASCEND_COMPILER} ${ASCEND_CXXFLAGS} ${ASCEND_CCECFLAGS} ${ASCEND_IFLAGS} "${ARGS[@]}" "$@" ${LFLAGS}" +fi + +${SHOW} "${CMD}" + +#Ascend 910B, -c: +#/home/yzelman/Packages/CANN/ascend-toolkit/latest/compiler/ccec_compiler/bin/ccec -xcce -DTILING_KEY_VAR=0 -I"/home/yzelman/Packages/CANN/ascend-toolkit/latest/acllib/include" -I"/home/yzelman/Packages/CANN/ascend-toolkit/latest/compiler/tikcpp/tikcfw" -I"/home/yzelman/Packages/CANN/ascend-toolkit/latest/compiler/tikcpp/tikcfw/impl" -I"/home/yzelman/Packages/CANN/ascend-toolkit/latest/compiler/tikcpp/tikcfw/interface" -I"/home/yzelman/Packages/CANN/ascend-toolkit/latest/tools/tikicpulib/lib/include" -O2 -std=c++17 --cce-aicore-arch=dav-c220-cube -mllvm -cce-aicore-function-stack-size=16000 -mllvm -cce-aicore-record-overflow=false -mllvm -cce-aicore-addr-transform --cce-auto-sync -fPIC -pthread -o CMakeFiles/add_custom_npu.dir/__/__/add_custom.cpp.o -c /home/yzelman/Documents/Ascend/samples-master/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/kernel_invocation/Add/add_custom.cpp + +#Ascend 910B, link: +#/home/yzelman/Packages/CANN/ascend-toolkit/latest/compiler/ccec_compiler/bin/ccec --cce-fatobj-link --cce-aicore-arch=dav-c220-cube CMakeFiles/add_custom_npu.dir/__/__/add_custom.cpp.o CMakeFiles/add_custom_npu.dir/__/__/main.cpp.o -o ../../../add_custom_npu -L"/home/yzelman/Packages/CANN/ascend-toolkit/latest/runtime/lib64" -L"/home/yzelman/Packages/CANN/ascend-toolkit/latest/tools/simulator/Ascend910B1/lib" -L"/home/yzelman/Packages/CANN/ascend-toolkit/latest/tools/tikicpulib/lib/Ascend910B1" -lstdc++ -lruntime -lascendcl + +#Ascend 910, -c: +#cd /home/yzelman/Documents/Ascend/samples-master/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/kernel_invocation/Add/build/cmake/npu && /home/yzelman/Packages/CANN/ascend-toolkit/latest/compiler/ccec_compiler/bin/ccec -xcce -DTILING_KEY_VAR=0 -I"/home/yzelman/Packages/CANN/ascend-toolkit/latest/acllib/include" -I"/home/yzelman/Packages/CANN/ascend-toolkit/latest/compiler/tikcpp/tikcfw" -I"/home/yzelman/Packages/CANN/ascend-toolkit/latest/compiler/tikcpp/tikcfw/impl" -I"/home/yzelman/Packages/CANN/ascend-toolkit/latest/compiler/tikcpp/tikcfw/interface" -I"/home/yzelman/Packages/CANN/ascend-toolkit/latest/tools/tikicpulib/lib/include" -O2 -std=c++17 --cce-aicore-arch=dav-c100 --cce-auto-sync -fPIC -pthread -o CMakeFiles/add_custom_npu.dir/__/__/add_custom.cpp.o -c /home/yzelman/Documents/Ascend/samples-master/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/kernel_invocation/Add/add_custom.cpp + +#Ascend 910, link: +#/home/yzelman/Packages/CANN/ascend-toolkit/latest/compiler/ccec_compiler/bin/ccec --cce-fatobj-link --cce-aicore-arch=dav-c100 CMakeFiles/add_custom_npu.dir/__/__/add_custom.cpp.o CMakeFiles/add_custom_npu.dir/__/__/main.cpp.o -o ../../../add_custom_npu -L"/home/yzelman/Packages/CANN/ascend-toolkit/latest/runtime/lib64" -L"/home/yzelman/Packages/CANN/ascend-toolkit/latest/tools/simulator/Ascend910A/lib" 
-L"/home/yzelman/Packages/CANN/ascend-toolkit/latest/tools/tikicpulib/lib/Ascend910A" -lstdc++ -lruntime -lascendcl + diff --git a/src/graphblas/CMakeLists.txt b/src/graphblas/CMakeLists.txt index 98924080e..d1692fd3d 100644 --- a/src/graphblas/CMakeLists.txt +++ b/src/graphblas/CMakeLists.txt @@ -93,6 +93,10 @@ if( WITH_NONBLOCKING_BACKEND ) add_subdirectory( nonblocking ) endif() +if( WITH_ASCEND_BACKEND ) + add_subdirectory( ascend ) +endif() + if( WITH_BSP1D_BACKEND OR WITH_HYBRID_BACKEND ) add_subdirectory( bsp1d ) endif() @@ -151,6 +155,9 @@ if( WITH_REFERENCE_BACKEND OR WITH_OMP_BACKEND OR WITH_NONBLOCKING_BACKEND ) if( WITH_NONBLOCKING_BACKEND ) target_link_libraries( backend_shmem_static PRIVATE ${backend_nonblocking_headers} ) endif() + if( WITH_ASCEND_BACKEND ) + target_link_libraries( backend_shmem_static PRIVATE ${backend_ascend_headers} ) + endif() # this is the actual binary file, i.e. the one to be installed install( TARGETS backend_shmem_static EXPORT GraphBLASTargets @@ -163,6 +170,9 @@ if( WITH_REFERENCE_BACKEND OR WITH_OMP_BACKEND OR WITH_NONBLOCKING_BACKEND ) if( WITH_NONBLOCKING_BACKEND ) target_link_libraries( backend_shmem_shared PRIVATE ${backend_nonblocking_headers} ) endif() + if( WITH_ASCEND_BACKEND ) + target_link_libraries( backend_shmem_shared PRIVATE ${backend_ascend_headers} ) + endif() install( TARGETS backend_shmem_shared EXPORT GraphBLASTargets LIBRARY DESTINATION "${SHMEM_BACKEND_INSTALL_DIR}" diff --git a/src/graphblas/ascend/CMakeLists.txt b/src/graphblas/ascend/CMakeLists.txt new file mode 100644 index 000000000..aa3462eed --- /dev/null +++ b/src/graphblas/ascend/CMakeLists.txt @@ -0,0 +1,46 @@ +# +# Copyright 2021 Huawei Technologies Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# Creation of the nonblocking backend, both as static and dynamic library. Any target +# importing a backend also imports the compiler definition(s) required to set it as +# default. If a target want to do it manually, the target 'backend_shmem_static' exists +# with no default backend selection in its compilation interface. 
+# + +assert_valid_variables( SHMEM_BACKEND_INSTALL_DIR + ASCEND_BACKEND_DEFAULT_NAME ASCEND_SELECTION_DEFS +) + +assert_defined_variables( backend_reference_srcs ) + + +set( backend_reference_srcs ${backend_reference_srcs} + ${CMAKE_CURRENT_SOURCE_DIR}/opgen.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/semantics.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/init.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/io.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/grid.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/symbolTable.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/operators.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/lazy_evaluation.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/pipeline.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/tensor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/stage.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp + PARENT_SCOPE +) + diff --git a/src/graphblas/ascend/grid.cpp b/src/graphblas/ascend/grid.cpp new file mode 100644 index 000000000..1cc463b6e --- /dev/null +++ b/src/graphblas/ascend/grid.cpp @@ -0,0 +1,81 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#include + +#include + +namespace alp { + + namespace internal { + + iGrid *igrid; + } +} + +alp::internal::iGrid::iGrid( size_t proc, size_t prob ) { + + process_order = proc; + problem_order = prob; +} + +size_t alp::internal::iGrid::getProcessOrder() const noexcept { + + return process_order; +} + +size_t alp::internal::iGrid::getProblemOrder() const noexcept { + + return problem_order; +} + +std::string alp::internal::iGrid::processSize( const size_t k ) const noexcept { + + return "p" + std::to_string( k ); +} + +std::string alp::internal::iGrid::processMode( const size_t k ) const noexcept { + + return "a" + std::to_string( k ); +} + +std::string alp::internal::iGrid::problemSize( const size_t k ) const noexcept { + + return "n" + std::to_string( k ); +} + +std::string alp::internal::iGrid::problemMode( const size_t k ) const noexcept { + + return "i" + std::to_string( k ); +} + +std::string alp::internal::iGrid::problemMainMode( const size_t k ) const noexcept { + + return "z" + std::to_string( k ); +} + +std::string alp::internal::iGrid::problemTileMode( const size_t k ) const noexcept { + + return "t" + std::to_string( k ); +} + +std::string alp::internal::iGrid::tileSize( const size_t k ) const noexcept { + + return "tile_size" + std::to_string( k ); +} + diff --git a/src/graphblas/ascend/init.cpp b/src/graphblas/ascend/init.cpp new file mode 100644 index 000000000..9926d8238 --- /dev/null +++ b/src/graphblas/ascend/init.cpp @@ -0,0 +1,59 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Provides initialisation for the Ascend backend. + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#include + +#include + +#include +#include + +#include + +#include + + +template<> +grb::RC grb::init< grb::ascend >( + const size_t s, const size_t P, void * const data +) { + // If the environment variable GRB_ASCEND_TILE_SIZE is set, a fixed + // tile size is used for all pipelines built during the ascend execution. + // Therefore, the choice is manual. Otherwise, the choice is automatically + // made at run-time by the analytic model and may differ for different + // pipelines. + std::cerr << "Info: grb::init (ascend) called.\n"; + return grb::init< grb::reference >( s, P, data ); +} + +template<> +grb::RC grb::finalize< grb::ascend >() { + std::cerr << "Info: grb::finalize (ascend) called.\n"; + std::cerr << "Info: codegen will go to std::cout (TODO)\n"; +// alp::internal::OpGen::generate( std::cout ); + return grb::finalize< grb::reference >(); +} + diff --git a/src/graphblas/ascend/io.cpp b/src/graphblas/ascend/io.cpp new file mode 100644 index 000000000..49b19e4d9 --- /dev/null +++ b/src/graphblas/ascend/io.cpp @@ -0,0 +1,50 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Implements the grb::wait for the Ascend backend. + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#include + +#include + + +namespace grb { + + namespace internal { + + extern LazyEvaluation le; + + } + + /** + * \internal This is a ascend implementation, and all + * pending operations must be completed. 
+ */ + template<> + RC wait< ascend >() { + return internal::le.execution(); + } + +} + diff --git a/src/graphblas/ascend/lazy_evaluation.cpp b/src/graphblas/ascend/lazy_evaluation.cpp new file mode 100644 index 000000000..480f46158 --- /dev/null +++ b/src/graphblas/ascend/lazy_evaluation.cpp @@ -0,0 +1,121 @@ +#include +#include + + +namespace alp +{ + namespace internal + { + AscendLazyEvaluation ale; + } +} + +alp::internal::AscendLazyEvaluation::AscendLazyEvaluation() { + + num_pipelines = 0; + addPipeline(); //TODO add the first pipeline +} + +void alp::internal::AscendLazyEvaluation::addPipeline() { + + pipelines.emplace_back( AscendPipeline( num_pipelines ) ); + num_pipelines++; +} + +void alp::internal::AscendLazyEvaluation::insertFreeInputTensorStages( const std::vector< int > &forEachAxes ) { + + for( auto it = pipelines.begin(); it != pipelines.end(); ++it ) { + it->insertFreeInputTensorStages( forEachAxes ); + } +} + +const alp::Tensor &alp::internal::AscendLazyEvaluation::store( const alp::Tensor &output_tensor ) { + + //TODO: perhaps data dependence analysis will determine the right pipeline + auto pipeline = pipelines.rbegin(); + return pipeline->store( output_tensor ); +} + +void alp::internal::AscendLazyEvaluation::clear() { + + for( auto it = pipelines.begin(); it != pipelines.end(); ++it ) { + it->clear(); + } +} + +void alp::internal::AscendLazyEvaluation::addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, const alp::Tensor &tensor1, const double alpha, const std::vector< int > &forEachAxes ) { + + //TODO: perhaps data dependence analysis will determine the right pipeline + auto pipeline = pipelines.rbegin(); + pipeline->addStage( op_type, rule, tensor1, alpha, forEachAxes ); +} + +void alp::internal::AscendLazyEvaluation::addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, const alp::Tensor &tensor1, const std::vector< int > &activeAxes, const std::vector< int > &forEachAxes ) { + + //TODO: perhaps data dependence analysis will determine the right pipeline + auto pipeline = pipelines.rbegin(); + pipeline->addStage( op_type, rule, tensor1, activeAxes, forEachAxes ); +} + +void alp::internal::AscendLazyEvaluation::addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, const alp::Tensor &tensor1, const alp::Tensor &tensor2, const std::vector< int > &activeAxes, const std::vector< int > &forEachAxes ) { + + //TODO: perhaps data dependence analysis will determine the right pipeline + auto pipeline = pipelines.rbegin(); + pipeline->addStage( op_type, rule, tensor1, tensor2, activeAxes, forEachAxes ); +} + +void alp::internal::AscendLazyEvaluation::addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, const alp::Tensor &tensor1, const alp::Tensor &tensor2, const alp::Tensor &tensor3, const std::vector< int > &activeAxes, const std::vector< int > &forEachAxes ) { + + //TODO: perhaps data dependence analysis will determine the right pipeline + auto pipeline = pipelines.rbegin(); + pipeline->addStage( op_type, rule, tensor1, tensor2, tensor3, activeAxes, forEachAxes ); +} +/* +void alp::internal::AscendLazyEvaluation::addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, const alp::Tensor &tensor1, const alp::Tensor &tensor2, const alp::Tensor &tensor3, const alp::Tensor &tensor4, const std::vector< int > &activeAxes, const std::vector< int > &forEachAxes ) { + + //TODO: perhaps data dependence analysis will determine the right pipeline + auto pipeline = pipelines.rbegin(); + pipeline->addStage( 
op_type, rule, tensor1, tensor2, tensor3, tensor4, activeAxes, forEachAxes ); +} +*/ +void alp::internal::AscendLazyEvaluation::generateDeclarations( std::stringstream &declarations ) { + + for( auto it = pipelines.begin(); it != pipelines.end(); ++it ) { + it->generateDeclarations( declarations ); + } +} +/* +void alp::internal::AscendLazyEvaluation::generateConstructor( std::stringstream &constructor ) { + for( auto it = pipelines.begin(); it != pipelines.end(); ++it ) { + it->generateConstructor( constructor ); + } +} +*/ +void alp::internal::AscendLazyEvaluation::generateHostBody( std::stringstream &os, std::stringstream &analyticModelArgs, + std::stringstream &analyticModelFormalParams, std::stringstream &analyticModelDecls, + std::stringstream &analyticModelConstrBody ) { + for( auto it = pipelines.begin(); it != pipelines.end(); ++it ) { + it->generateHostBody( os, analyticModelArgs, analyticModelFormalParams, analyticModelDecls, analyticModelConstrBody ); + } +} + +void alp::internal::AscendLazyEvaluation::generateInit( std::stringstream &init ) { + + for( auto it = pipelines.begin(); it != pipelines.end(); ++it ) { + it->generateInit( init ); + } +} + +void alp::internal::AscendLazyEvaluation::generateProcess( std::stringstream &process, std::stringstream &processCall ) { + + for( auto it = pipelines.begin(); it != pipelines.end(); ++it ) { + it->generateProcess( process, processCall ); + } +} + +void alp::internal::AscendLazyEvaluation::debug_print() const { + + for( auto it = pipelines.begin(); it != pipelines.end(); ++it ) { + it->debug_print(); + } +} diff --git a/src/graphblas/ascend/operators.cpp b/src/graphblas/ascend/operators.cpp new file mode 100644 index 000000000..a87d9af3b --- /dev/null +++ b/src/graphblas/ascend/operators.cpp @@ -0,0 +1,200 @@ +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace alp +{ + namespace internal + { + extern AscendLazyEvaluation ale; + } +} + +namespace alp +{ + Tensor getView( const Tensor &parent ) { + + std::vector< int > forEachAxes = internal::vectorOfVectorsToVector( internal::OpGen::forEachAxes ); + std::vector< int > difference_axes = internal::vectorDifference( parent.getAxes(), forEachAxes ); + + Tensor ret_view( parent, difference_axes ); + + internal::Rule rule = internal::Rule::NONE; + + alp::internal::ale.addStage( alp::internal::Stagetype::GET_VIEW, rule, parent, difference_axes, forEachAxes ); + + return ret_view; + } + + // TODO extend to multiple containers + void store( const Tensor &output ) { + + const alp::Tensor &parent = alp::internal::ale.store( output ); + + std::vector< int > forEachAxes = internal::vectorOfVectorsToVector( internal::OpGen::forEachAxes ); + std::vector< int > difference_axes = internal::vectorDifference( parent.getAxes(), forEachAxes ); + + internal::Rule rule = internal::Rule::NONE; + + alp::internal::ale.addStage( alp::internal::Stagetype::STORE, rule, parent, difference_axes, forEachAxes ); + } + + void set( + Tensor &tout, + Tensor &tin, + const std::vector< int > &activeAxes + ) { + std::vector< int > forEachAxes = internal::vectorOfVectorsToVector( internal::OpGen::forEachAxes ); + + internal::Rule rule = internal::Rule::NONE; + + alp::internal::ale.addStage( alp::internal::Stagetype::SET_TENSOR, rule, tout, tin, activeAxes, forEachAxes ); + } + + void set( + Tensor &tout, + double alpha //TODO perhaps use a templated datatype instead of double + ) { + std::vector< int > forEachAxes = internal::vectorOfVectorsToVector( internal::OpGen::forEachAxes 
); + + internal::Rule rule = internal::Rule::NONE; + + alp::internal::ale.addStage( alp::internal::Stagetype::SET_SCALAR, rule, tout, alpha, forEachAxes ); + } + + void apply( + Tensor &tout, + Tensor &tin1, + Tensor &tin2, + const std::string &opName, + const std::vector< int > &activeAxes + ) { + std::vector< int > forEachAxes = internal::vectorOfVectorsToVector( internal::OpGen::forEachAxes ); +/* + std::vector< int > union_axes = internal::vectorUnion( tout.getAxes(), tin1.getAxes() ); + union_axes = internal::vectorUnion( union_axes, tin2.getAxes() ); + + assert( union_axes.size() < 3 ); + + std::vector< int > temp_axes; + if( union_axes.size() == 1 ) { + temp_axes.push_back( union_axes[ 0 ] ); + } else if ( union_axes.size() == 2 ) { + temp_axes.push_back( union_axes[ 1 ] ); + } + + // create a temporary Tensor + Tensor temp( temp_axes, tout.getType() ); +*/ + internal::Rule rule = internal::Rule::NONE; + + //TODO the current design does not make a distinction between the different cases + // of BCAST and REDUCE, this should be fixed in a later version + if( tin1.getAxes() == tin2.getAxes() && tout.getAxes() == tin1.getAxes() ) { + rule = internal::Rule::EWISE; + } else if ( tin1.getAxes() == tin2.getAxes() && internal::vectorSubset( tout.getAxes(), tin1.getAxes() ) == true ) { + rule = internal::Rule::REDUCE; + } else if ( tin1.getAxes() == tin2.getAxes() && internal::vectorSubset( tin1.getAxes(), tout.getAxes() ) == true ) { + rule = internal::Rule::BCAST; + } else if ( tin1.getAxes() == tout.getAxes() && internal::vectorSubset( tout.getAxes(), tin2.getAxes() ) == true ) { + rule = internal::Rule::REDUCE; + } else if ( tin1.getAxes() == tout.getAxes() && internal::vectorSubset( tin2.getAxes(), tout.getAxes() ) == true ) { + rule = internal::Rule::BCAST; + } else if ( tin2.getAxes() == tout.getAxes() && internal::vectorSubset( tout.getAxes(), tin1.getAxes() ) == true ) { + rule = internal::Rule::REDUCE; + } else if ( tin2.getAxes() == tout.getAxes() && internal::vectorSubset( tin1.getAxes(), tout.getAxes() ) == true ) { + rule = internal::Rule::BCAST; + } else if ( tin1.getAxes() != tin2.getAxes() && tin1.getAxes() != tout.getAxes() && tin2.getAxes() != tout.getAxes() ) { + if( internal::vectorSubset( tout.getAxes(), tin1.getAxes() ) == true && internal::vectorSubset( tout.getAxes(), tin2.getAxes() ) == true ) { + rule = internal::Rule::BCAST; + } else if( internal::vectorSubset( tin1.getAxes(), tout.getAxes() ) == true && internal::vectorSubset( tin2.getAxes(), tout.getAxes() ) == true ) { + rule = internal::Rule::REDUCE; + } else { + std::cerr << "The axes of the output tensor cannot be subset of the axes of one input and superset of the axes of the other input: apply " << opName << std::endl; + std::abort(); + } + } + + if( opName == "minus" ) { + alp::internal::ale.addStage( alp::internal::Stagetype::APPLY_MINUS, rule, tout, tin1, tin2, activeAxes, forEachAxes ); + } else if( opName == "add" ) { + alp::internal::ale.addStage( alp::internal::Stagetype::APPLY_ADD, rule, tout, tin1, tin2, activeAxes, forEachAxes ); + } + else { + + } + } + + void foldl( + Tensor &tinout, + Tensor &tin, + const std::string &opName, + const std::vector< int > &activeAxes + ) { + std::vector< int > forEachAxes = internal::vectorOfVectorsToVector( internal::OpGen::forEachAxes ); +/* + std::vector< int > union_axes = internal::vectorUnion( tinout.getAxes(), tin.getAxes() ); + + assert( union_axes.size() < 3 ); + + std::vector< int > temp_axes; + if( union_axes.size() == 1 ) { + temp_axes.push_back( 
union_axes[ 0 ] ); + } else if ( union_axes.size() == 2 ) { + temp_axes.push_back( union_axes[ 1 ] ); + } + + // create a temporary Tensor + Tensor temp( temp_axes, tinout.getType() ); +*/ + internal::Rule rule = internal::Rule::NONE; + + if( tinout.getAxes() == tin.getAxes() ) { + rule = internal::Rule::EWISE; + } else if ( internal::vectorSubset( tinout.getAxes(), tin.getAxes() ) == true ) { + rule = internal::Rule::REDUCE; + } else if ( internal::vectorSubset( tin.getAxes(), tinout.getAxes() ) == true ) { + rule = internal::Rule::BCAST; + } else { + + } + + if( opName == "divide" ) { + alp::internal::ale.addStage( alp::internal::Stagetype::FOLDL_DIVIDE, rule, tinout, tin, activeAxes, forEachAxes ); + } else if( opName == "max" ) { + alp::internal::ale.addStage( alp::internal::Stagetype::FOLDL_MAX, rule, tinout, tin, activeAxes, forEachAxes ); + } else if( opName == "times" ) { + alp::internal::ale.addStage( alp::internal::Stagetype::FOLDL_TIMES, rule, tinout, tin, activeAxes, forEachAxes ); + } else if( opName == "add" ) { + alp::internal::ale.addStage( alp::internal::Stagetype::FOLDL_ADD, rule, tinout, tin, activeAxes, forEachAxes ); + } else { + + } + } + +// template< size_t sm, size_t pm > + void foldl( +// const Grid< sm, pm > &grid, + Tensor &tinout, + const std::string &opName, + const std::vector< int > &activeAxes + ) { + std::vector< int > forEachAxes = internal::vectorOfVectorsToVector( internal::OpGen::forEachAxes ); + + internal::Rule rule = internal::Rule::NONE; + + if( opName == "exp" ) { + alp::internal::ale.addStage( alp::internal::Stagetype::FOLDL_EXP, rule, tinout, activeAxes, forEachAxes ); + } else { + + } + } + +} + diff --git a/src/graphblas/ascend/opgen.cpp b/src/graphblas/ascend/opgen.cpp new file mode 100644 index 000000000..bb3c6370f --- /dev/null +++ b/src/graphblas/ascend/opgen.cpp @@ -0,0 +1,211 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Provides CodeGen for the Ascend backend. + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#include +#include + +namespace alp +{ + namespace internal + { + extern SymbolTable symbols; + } +} + +std::string alp::internal::OpGen::kernel_id; + +size_t alp::internal::OpGen::forEachLevel = 0; + +std::vector< std::vector< int > > alp::internal::OpGen::forEachAxes; +std::vector< int > alp::internal::OpGen::lastAxes; + +//TODO how is this supposed to be used? 
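The rule deduction used by alp::apply and alp::foldl in operators.cpp above selects between element-wise, reduction, and broadcast code paths purely from the relation between the output and input axis sets. The sketch below reproduces that decision for the two-input case under the assumption that axes are plain std::vector< int > values, as returned by Tensor::getAxes(); Rule and isSubset are local stand-ins, and the final two branches mirror the case analysis exactly as written above, including its choice of BCAST versus REDUCE when all three axis sets differ.

#include <algorithm>
#include <cassert>
#include <iostream>
#include <vector>

enum class Rule { NONE, EWISE, BCAST, REDUCE };

// true if every axis in sub also appears in super
static bool isSubset( std::vector< int > sub, std::vector< int > super ) {
	std::sort( sub.begin(), sub.end() );
	std::sort( super.begin(), super.end() );
	return std::includes( super.begin(), super.end(), sub.begin(), sub.end() );
}

// Deduce the rule for out = op( in1, in2 ) from the axis sets alone.
static Rule deduceRule(
	const std::vector< int > &out,
	const std::vector< int > &in1,
	const std::vector< int > &in2
) {
	if( in1 == in2 ) {
		if( out == in1 ) { return Rule::EWISE; }
		if( isSubset( out, in1 ) ) { return Rule::REDUCE; }
		if( isSubset( in1, out ) ) { return Rule::BCAST; }
		return Rule::NONE;
	}
	if( in1 == out ) {
		if( isSubset( out, in2 ) ) { return Rule::REDUCE; }
		if( isSubset( in2, out ) ) { return Rule::BCAST; }
		return Rule::NONE;
	}
	if( in2 == out ) {
		if( isSubset( out, in1 ) ) { return Rule::REDUCE; }
		if( isSubset( in1, out ) ) { return Rule::BCAST; }
		return Rule::NONE;
	}
	// all three axis sets differ: mirrors the source as written above
	if( isSubset( out, in1 ) && isSubset( out, in2 ) ) { return Rule::BCAST; }
	if( isSubset( in1, out ) && isSubset( in2, out ) ) { return Rule::REDUCE; }
	return Rule::NONE;
}

int main() {
	assert( deduceRule( { 0, 1 }, { 0, 1 }, { 0, 1 } ) == Rule::EWISE );
	assert( deduceRule( { 0 }, { 0, 1 }, { 0, 1 } ) == Rule::REDUCE ); // axis 1 is folded away
	assert( deduceRule( { 0, 1 }, { 0 }, { 0 } ) == Rule::BCAST );     // inputs replicated along axis 1
	std::cout << "rule deduction checks passed\n";
	return 0;
}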
+//std::map< std::string, std::string > alp::internal::OpGen::chunkSize; + +std::stringstream alp::internal::OpGen::aux_func; +std::stringstream alp::internal::OpGen::analyticModelFormalParams; +std::stringstream alp::internal::OpGen::hostFormalParam; +std::stringstream alp::internal::OpGen::hostArg; +std::stringstream alp::internal::OpGen::constrBody; +std::stringstream alp::internal::OpGen::hostBody; +std::stringstream alp::internal::OpGen::classMembers; +std::stringstream alp::internal::OpGen::initBody; +std::stringstream alp::internal::OpGen::genericProcessBody; +std::stringstream alp::internal::OpGen::declarations; + +std::vector< std::stringstream > alp::internal::OpGen::processFunc; +std::vector< std::stringstream > alp::internal::OpGen::computeFunc; +std::vector< std::stringstream > alp::internal::OpGen::copyinFunc; +std::vector< std::stringstream > alp::internal::OpGen::copyoutFunc; + +void alp::internal::OpGen::compileClear() { + +// chunkSize.clear(); //TODO how is this supposed to be used? + + forEachAxes.clear(); + + alp::internal::symbols.clearAll(); + + aux_func.clear(); + analyticModelFormalParams.clear(); + hostFormalParam.clear(); + hostArg.clear(); + constrBody.clear(); + classMembers.clear(); + initBody.clear(); + genericProcessBody.clear(); + declarations.clear(); + + processFunc.clear(); + computeFunc.clear(); + copyinFunc.clear(); + copyoutFunc.clear(); +} + +// TODOs: +// +// 1. rely on lazy evaluation (le)'s pipelines instead of our own input, output and stages info +// 2. use streams (such as the above initStreams) to generate the content of each component separately. +// - for example, the parameters to the init function could be gathered in initStream; etc. +// 3. instead of generating the copyIn, copyOut, and compute of the add operation directly, rely on a +// library of AscendC kernels +// - for example, we should generate in the case of a fused sequence of operators OP1 and OP2: +// __aicore__ inline void CopyIn( int32_t progress ) { +// OP1::CopyIn( progress, ... ); +// OP2::CopyIn( progress, ... ); +// // and so on for further fused ops +// } +// 4. Instead of hardcoding half as the data type, we should generate it appropriately from the typename T +void alp::internal::OpGen::generate( std::ostream &os ) { + os << "\n// start automatic ALP/Ascend opgen\n\n"; + + os << "#include \n\n"; + os << "#include \"ascendlib.hpp\"\n\n"; + os << "using namespace AscendC;\n\n"; + + // TODO this should be generated by the grid.forEach + // TODO should we support both following modes? + // mode 1: symbolic, the below are parameters to the call to custom_kernels, cannot be constexpr(!) + // mode 2: the user passes explicit parameters into alp::Grid, then instead of symbolic output, actual values are emitted as in below + os << "constexpr int32_t BUFFER_NUM = 1; \n"; // TODO TBC indicates local buffer space in a pipe? 
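	// Note: in the public AscendC kernel samples, BUFFER_NUM is the per-queue
	// buffer count passed to pipe.InitBuffer(), and a value of 2 enables double
	// buffering between copy-in and compute; the value 1 emitted here therefore
	// keeps single buffering.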
+ os << "\n"; + + os << aux_func.str(); + os << "\n"; + + std::stringstream initFormalParam; + std::stringstream customFormalParam; + std::stringstream allAccessedArg; + std::stringstream allTempLocalDecl; + + alp::internal::symbols.generateGlobalSymbols( initFormalParam, customFormalParam, allAccessedArg, allTempLocalDecl ); + + // generate class header + os << "class " << kernel_id << " {\n\n"; + os << "\tpublic:\n\n"; + + // generate class constructor + os << "\t\t__aicore__ inline " << kernel_id << "( "; + os << hostFormalParam.str(); + os << analyticModelFormalParams.str(); + os << " ) {\n"; + os << constrBody.str(); + os << "\t\t}\n\n\n"; + + // generate init function + os << "\t\t__aicore__ inline void Init( "; + os << initFormalParam.str(); + os << " ) {\n"; + os << initBody.str(); + os << "\t\t}\n\n"; + + // insert the Process functions + for( auto it = processFunc.cbegin(); it != processFunc.cend(); ++it ) { + os << it->str(); + os << "\n\n"; + } + + // generate the generic Process functions + os << "\t\t__aicore__ inline void Process() {\n"; + os << "\n"; + os << genericProcessBody.str(); + os << "\t\t}\n\n\n"; + + // declare private fields + os << "\tprivate:\n\n"; + os << classMembers.str(); +// os << "\n"; + os << "\t\tTPipe pipe;\n"; + os << "\n"; + os << declarations.str(); + os << allTempLocalDecl.str(); + // end of class + os << "};\n\n"; + + // generate entry function + os << "extern \"C\" __global__ __aicore__ void custom_" << kernel_id << "(\n\t"; + //print the list of all input and output vectors for the arguments list + os << initFormalParam.str(); + os << ",\n\t"; + os << hostFormalParam.str(); + os << analyticModelFormalParams.str(); + os << "\n) {\n"; + os << "\t" << kernel_id << " op( "; + os << hostArg.str(); + os << " );\n"; + os << "\top.Init( "; + os << allAccessedArg.str(); + os << " );\n"; + os << "\top.Process();\n"; + os << "}\n\n"; + + // TODO do we absolutely need to generate the host entry point here? 
+ os << "#ifndef __CCE_KT_TEST__\n"; + os << "void custom_" << kernel_id << "_do(\n" + << "\tuint32_t blockDim,\n" + << "\tvoid *l2ctrl,\n" + << "\tvoid *stream,\n\t"; + os << customFormalParam.str(); + os << ",\n\t"; + os << hostFormalParam.str(); + os << analyticModelFormalParams.str(); + os << "\n) {\n"; + + // generate analytic model +// os << hostBody.str(); + + // generate entry point + os << "\tcustom_" << kernel_id << "<<< blockDim, l2ctrl, stream >>>( "; + os << allAccessedArg.str(); + os << ", "; + os << hostArg.str(); + os << " );\n"; + os << "}\n"; + os << "#endif\n\n"; + + os << "// end automatic ALP/Ascend opgen\n\n"; +} + diff --git a/src/graphblas/ascend/pipeline.cpp b/src/graphblas/ascend/pipeline.cpp new file mode 100644 index 000000000..937d9d3ba --- /dev/null +++ b/src/graphblas/ascend/pipeline.cpp @@ -0,0 +1,998 @@ +#include + +#include +#include +#include +#include +#include + +namespace alp +{ + namespace internal + { + extern iGrid *igrid; + extern SymbolTable symbols; + } +} + +alp::internal::AscendPipeline::AscendPipeline( size_t _id ) : id( _id ) +{ + +} + +void alp::internal::AscendPipeline::insertTensorToInputs( const alp::Tensor &tensor ) +{ + accessed.insert( alp::internal::symbols.getTensorFromView( tensor ) ); +} + +void alp::internal::AscendPipeline::insertFreeInputTensorStages( const std::vector< int > &forEachAxes ) +{ + std::vector< alp::internal::Stage * > st; + + // search for all GET_VIEW stages in the pipeline and store them + for( auto it = stages.begin(); it != stages.end(); ++it ) { + if (it->getOpType() == internal::Stagetype::GET_VIEW ) { + st.push_back( &(*it) ); + } + } + + // search for all STORE stages in the pipeline and delete + // the corresponding GET_VIEW from those stored above + for( auto it = stages.begin(); it != stages.end(); ++it ) { + if (it->getOpType() == internal::Stagetype::STORE ) { + for( auto jt = st.begin(); jt != st.end(); ) { + if( (*jt)->getTensor0().getID() == it->getTensor0().getID() ) { + jt = st.erase( jt ); + } else { + ++jt; + } + } + } + } + + // for the remaining GET_VIEW stages that are still stored + // insert a new stage in the end of the pipeline that + // corresponds to all input tensors for which store + // is not explicitly invoked by the user + for( auto it = st.begin(); it != st.end(); ++it ) { + if( (*it)->getForEachAxes() == forEachAxes ) { + addStage( alp::internal::Stagetype::IMPLICIT_FREE, (*it)->getRule(), (*it)->getTensor0(), (*it)->getAxes(), (*it)->getForEachAxes() ); + } + } +} + +std::set< int > alp::internal::AscendPipeline::getIteratedAxes() const { + std::vector< int > union_iterated_axes; + + for( auto it = stages.begin(); it != stages.end(); ++it ) { + union_iterated_axes = internal::vectorUnion( union_iterated_axes, it->getForEachAxes() ); + } + + // convert the std::vector to std::set + std::set< int > ret; + ret.insert( union_iterated_axes.begin(), union_iterated_axes.end() ); + return ret; +} + +const alp::Tensor &alp::internal::AscendPipeline::store( const alp::Tensor &output_tensor ) { + + //FIXME I should check here that this is indeed a VIEW + + const alp::Tensor &parent = alp::internal::symbols.getTensorFromView( output_tensor ); + outputs.insert( parent ); + + alp::internal::symbols.addOutputTensor( parent ); + + return parent; +} + +bool alp::internal::AscendPipeline::isOutput( const alp::Tensor &tensor ) const { + + //FIXME I should check here that this is indeed a VIEW + + return outputs.find( tensor ) != outputs.end(); +} + +void alp::internal::AscendPipeline::clear() { + 
+ stages.clear(); + accessed.clear(); + outputs.clear(); +} + +size_t alp::internal::AscendPipeline::getID() const { + + return id; +} + +std::string alp::internal::AscendPipeline::getTilingAxes() const { + + std::string tiling_init_numerator; + + const std::set< int > iterated_axes = getIteratedAxes(); + + for( auto it = iterated_axes.begin(); it != iterated_axes.end(); ++it ) { + tiling_init_numerator.append( igrid->tileSize( *it ) ); + tiling_init_numerator.append( " * " ); + } + + return tiling_init_numerator; +} + +void alp::internal::AscendPipeline::addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, + const alp::Tensor &tensor1, const double alpha, const std::vector< int > &forEachAxes ) { + + // insert the Tensor to the set of accessed data + insertTensorToInputs( tensor1 ); + // get the name of the Tensor object that exists behind this view or tensor + + switch ( op_type ) { + + case alp::internal::Stagetype::SET_SCALAR: + { + stages.push_back( std::move( alp::internal::Stage( *this, + alp::internal::Stagetype::SET_SCALAR, rule, tensor1, alpha, forEachAxes ) ) ); + + break; + } + case alp::internal::Stagetype::GET_VIEW: + case alp::internal::Stagetype::STORE: + case alp::internal::Stagetype::IMPLICIT_FREE: + { + std::cerr << "Stage: " << (int) op_type << " has only one tensor argument" << std::endl; + std::abort(); + break; + } + case alp::internal::Stagetype::FOLDL_EXP: + case alp::internal::Stagetype::SET_TENSOR: + case alp::internal::Stagetype::APPLY_ADD: + case alp::internal::Stagetype::APPLY_MINUS: + case alp::internal::Stagetype::FOLDL_DIVIDE: + case alp::internal::Stagetype::FOLDL_MAX: + case alp::internal::Stagetype::FOLDL_TIMES: + case alp::internal::Stagetype::FOLDL_ADD: + { + std::cerr << "Stage: " << (int) op_type << " has more than one tensor arguments" << std::endl; + std::abort(); + break; + } + } +} + +void alp::internal::AscendPipeline::addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, + const alp::Tensor &tensor1, const std::vector< int > &activeAxes, const std::vector< int > &forEachAxes ) { + + // insert the Tensor to the set of accessed data + insertTensorToInputs( tensor1 ); + // get the name of the Tensor object that exists behind this view or tensor + + switch ( op_type ) { + + case alp::internal::Stagetype::FOLDL_EXP: + case alp::internal::Stagetype::GET_VIEW: + case alp::internal::Stagetype::STORE: + case alp::internal::Stagetype::IMPLICIT_FREE: + { + stages.push_back( std::move( alp::internal::Stage( *this, + op_type, rule, tensor1, activeAxes, forEachAxes ) ) ); + + break; + } + case alp::internal::Stagetype::SET_SCALAR: + case alp::internal::Stagetype::SET_TENSOR: + case alp::internal::Stagetype::APPLY_ADD: + case alp::internal::Stagetype::APPLY_MINUS: + case alp::internal::Stagetype::FOLDL_DIVIDE: + case alp::internal::Stagetype::FOLDL_MAX: + case alp::internal::Stagetype::FOLDL_TIMES: + case alp::internal::Stagetype::FOLDL_ADD: + { + std::cerr << "Stage: " << (int) op_type << " has more than one tensor arguments" << std::endl; + std::abort(); + break; + } + } +} + +void alp::internal::AscendPipeline::addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, + const alp::Tensor &tensor1, const alp::Tensor &tensor2, + const std::vector< int > &activeAxes, const std::vector< int > &forEachAxes ) { + + // insert the Tensors to the set of accessed data + insertTensorToInputs( tensor1 ); //TODO pass the string + insertTensorToInputs( tensor2 ); + + switch ( op_type ) { + + case 
alp::internal::Stagetype::SET_TENSOR: + case alp::internal::Stagetype::FOLDL_MAX: + case alp::internal::Stagetype::FOLDL_TIMES: + case alp::internal::Stagetype::FOLDL_ADD: + case alp::internal::Stagetype::FOLDL_DIVIDE: + { + stages.push_back( std::move( alp::internal::Stage( *this, + op_type, rule, tensor1, tensor2, activeAxes, forEachAxes ) ) ); + break; + } + case alp::internal::Stagetype::APPLY_ADD: + case alp::internal::Stagetype::APPLY_MINUS: + case alp::internal::Stagetype::FOLDL_EXP: + case alp::internal::Stagetype::SET_SCALAR: + case alp::internal::Stagetype::GET_VIEW: + case alp::internal::Stagetype::STORE: + case alp::internal::Stagetype::IMPLICIT_FREE: + { + std::cerr << "Stage: " << (int) op_type << " does not have two tensor arguments" << std::endl; + std::abort(); + break; + } + } +} + +void alp::internal::AscendPipeline::addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, + const alp::Tensor &tensor1, const alp::Tensor &tensor2, + const alp::Tensor &tensor3, const std::vector< int > &activeAxes, const std::vector< int > &forEachAxes ) { + + // insert the Tensors to the set of accessed data + insertTensorToInputs( tensor1 ); //TODO pass the string + insertTensorToInputs( tensor2 ); + insertTensorToInputs( tensor3 ); + + switch ( op_type ) { + + case alp::internal::Stagetype::APPLY_MINUS: + case alp::internal::Stagetype::APPLY_ADD: + { + stages.push_back( std::move( alp::internal::Stage( *this, + op_type, rule, tensor1, tensor2, tensor3, activeAxes, forEachAxes ) ) ); + break; + } + case alp::internal::Stagetype::FOLDL_DIVIDE: + case alp::internal::Stagetype::SET_TENSOR: + case alp::internal::Stagetype::FOLDL_MAX: + case alp::internal::Stagetype::FOLDL_TIMES: + case alp::internal::Stagetype::FOLDL_ADD: + case alp::internal::Stagetype::FOLDL_EXP: + case alp::internal::Stagetype::SET_SCALAR: + case alp::internal::Stagetype::GET_VIEW: + case alp::internal::Stagetype::STORE: + case alp::internal::Stagetype::IMPLICIT_FREE: + { + std::cerr << "Stage: " << (int) op_type << " does not have three tensor arguments" << std::endl; + std::abort(); + break; + } + } +} +/* +void alp::internal::AscendPipeline::addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, + const alp::Tensor &tensor1, const alp::Tensor &tensor2, + const alp::Tensor &tensor3, const alp::Tensor &tensor4, + const std::vector< int > &activeAxes, const std::vector< int > &forEachAxes ) { + + // insert the Tensors to the set of accessed data + insertTensorToInputs( tensor1 ); //TODO pass the string + insertTensorToInputs( tensor2 ); + insertTensorToInputs( tensor3 ); + insertTensorToInputs( tensor4 ); + + switch ( op_type ) { + + case alp::internal::Stagetype::APPLY_MINUS: + { + //TODO tensor4 is a temporary variable + + stages.push_back( std::move( alp::internal::Stage( *this, + op_type, rule, tensor1, tensor2, tensor3, tensor4, activeAxes, forEachAxes ) ) ); + break; + } + case alp::internal::Stagetype::FOLDL_DIVIDE: + case alp::internal::Stagetype::APPLY_ADD: + case alp::internal::Stagetype::SET_TENSOR: + case alp::internal::Stagetype::FOLDL_MAX: + case alp::internal::Stagetype::FOLDL_TIMES: + case alp::internal::Stagetype::FOLDL_ADD: + case alp::internal::Stagetype::FOLDL_EXP: + case alp::internal::Stagetype::SET_SCALAR: + case alp::internal::Stagetype::GET_VIEW: + case alp::internal::Stagetype::STORE: + case alp::internal::Stagetype::IMPLICIT_FREE: + { + std::cerr << "Stage: " << (int) op_type << " does not have four tensor arguments" << std::endl; + std::abort(); + break; + } + } +} 
+*/ +void alp::internal::AscendPipeline::generateDeclarations( + std::stringstream &declarations +) { + +// declarations << "\t\tuint32_t " << "block_length" << id << ";\n"; +// declarations << "\t\tuint32_t " << "tile_length" << id << ";\n"; +// declarations << "\n"; + + for( auto it = accessed.cbegin(); it != accessed.cend(); ++it ) { + if( it->isGlobalDecl() ) { + declarations << "\t\t// Global Tensor declaration\n"; + if( outputs.find( *it ) != outputs.end() ) { + // TQue< QuePosition::VECOUT, BUFFER_NUM > globalQue_tensor1_0; + declarations << "\t\tTQue< QuePosition::VECOUT, BUFFER_NUM > " + << it->getTQueBufName( id ) << ";\n"; + } else { + // TQue< QuePosition::VECIN, BUFFER_NUM > globalQue_tensor0_0; + declarations << "\t\tTQue< QuePosition::VECIN, BUFFER_NUM > " + << it->getTQueBufName( id ) << ";\n"; + } + // GlobalTensor< half > Gm_tensor0_0; + declarations << "\t\tGlobalTensor< " << internal::getDataType( it->getType() ) << " > " + << it->getAscendGlobalName( id ) << ";\n"; + // LocalTensor< half > Gm_local_tensor0_0; + declarations << "\t\tLocalTensor< " << internal::getDataType( it->getType() ) << " > " + << it->getAscendName( id ) << ";\n"; + } else if( it->isLocalDecl() ) { +/* declarations << "\t\t// Local Tensor declaration\n"; + // TBuf< QuePosition::VECCALC > localBuf_tensor4_0; + declarations << "\t\tTBuf< QuePosition::VECCALC > " + << it->getTQueBufName( id ) << ";\n"; + // LocalTensor< half > local_tensor4_0; + declarations << "\t\tLocalTensor< " << internal::getDataType( it->getType() ) << " > " + << it->getAscendName( id ) << ";\n"; +*/ + declarations << "\t\t// Offset for local Tensor declaration\n"; + declarations << "\t\tint32_t " << it->getAscendName( id ) << ";\n"; + } else if( it->isTempDecl() ) { +/* declarations << "\t\t// Temporary Tensor declaration\n"; + // TBuf< QuePosition::VECCALC > tempBuf_tensor5_0; + declarations << "\t\tTBuf< QuePosition::VECCALC > " + << it->getTQueBufName( id ) << ";\n"; + // LocalTensor< half > temp_tensor5_0; + declarations << "\t\tLocalTensor< " << internal::getDataType( it->getType() ) << " > " + << it->getAscendName( id ) << ";\n"; +*/ + declarations << "\t\t// Offset for temporary Tensor declaration\n"; + declarations << "\t\tint32_t " << it->getAscendName( id ) << ";\n"; + } + declarations << "\n"; + } +/* + if( temp_or_local_found == true ) { + declarations << "\t\t// Declaration of memory used for Local and Temporary tensor\n"; + declarations << "\t\tTBuf< QuePosition::VECCALC > " << "_temp_local;\n"; + declarations << "\t\tLocalTensor< " << "half" << " > " << "_temp_local_Buf;\n"; + declarations << "\n"; + } +*/ +} + +//void alp::internal::AscendPipeline::generateConstructor( std::stringstream &constructor ) { +/* + constructor << "\n"; + constructor << "\t\t\tblock_length" << id << " = ( "; + constructor << igrid->problemSize( 0 ); + for( size_t i = 1; i < igrid->getProblemOrder(); ++i ) { + constructor << " * " << igrid->problemSize( i ); + } + constructor << " ) / ( "; + constructor << igrid->processSize( 0 ); + for( size_t i = 1; i < igrid->getProblemOrder(); ++i ) { + constructor << " * " << igrid->processSize( i ); + } + constructor << " );\n"; + constructor << "\t\t\ttile_length" << id << " = ( "; + bool first = true; + for( size_t i = 0; i < igrid->getProblemOrder(); ++i ) { + //TODO this solution assumes that there is only one parallel axis, which is not true + // omit the problemSize variables for which the corresponding axes is defined in the parallel forEach + // we use the parallel axes of the first stage, 
any other stage can be used as well
+		// since all stages of the same pipeline have the same outer loop
+		if( stages.begin()->getForEachAxes()[ 0 ] != ( int ) i ) {
+			if( !first ) {
+				constructor << " * ";
+			}
+			constructor << igrid->problemSize( i );
+			first = false;
+		}
+	}
+	constructor << " ) / " << "BUFFER_NUM;\n";
+*/
+//}
+
+void alp::internal::AscendPipeline::generateHostBody( std::stringstream &os, std::stringstream &analyticModelArgs,
+	std::stringstream &analyticModelFormalParams, std::stringstream &analyticModelDecls,
+	std::stringstream &analyticModelConstrBody ) {
+	// analytic model codeblock
+	constexpr size_t ub_size = grb::config::ASCEND_CACHE_HIERARCHY<>::UB_SIZE;
+
+	// This is a symbolic analysis to find what the largest global tensors are.
+	// After this symbolic analysis, we will have generally identified multiple
+	// global tensors as candidates for being the largest. We're generally still
+	// not sure which of these will be the largest tensor at run-time. Therefore,
+	// there is still a final run-time component to find the largest tensor(s).
+	std::set< std::set< int > > largestGlobals;
+	std::vector< Tensor > minorTensors;
+	bool differingDynamicAxesPresent = false;
+	for( const auto &tensor : accessed ) {
+		if( tensor.getScope() == internal::Scope::GLOBAL ) {
+			// TODO FIXME think about a cheaper algorithm for computing this check
+			const auto &current = tensor.getAxes();
+			assert( current.size() > 0 );
+			// by default, register the current tensor (don't register only if symbolic
+			// analysis is sure it is smaller)
+			bool insert = true;
+			for( const auto &existing : largestGlobals ) {
+				if( existing.size() <= current.size() ) {
+					bool larger = true;
+					for( const unsigned int &axis : existing ) {
+						if( std::find( current.cbegin(), current.cend(), axis ) != current.cend() ) {
+							// in this case, static analysis cannot conclude that the current tensor
+							// is larger than this entry in largestGlobal -- check the next entry of
+							// largestGlobals instead
+							larger = false;
+							break;
+						} else {
+							// check if the differing axis is a dynamic one
+							if( getIteratedAxes().find( axis ) != getIteratedAxes().cend() ) {
+								differingDynamicAxesPresent = true;
+							}
+						}
+					}
+					if( larger ) {
+						// in this case, the current tensor is guaranteed larger than this entry
+						// in largestGlobals -- so remove this entry, then flag the current axes
+						// for insertion.
+						(void) largestGlobals.erase( existing );
+						insert = true;
+						// By induction, furthermore, there are no other entries in largestGlobals
+						// that could contain the current tensor. So we terminate the check as
+						// well.
+						break;
+					}
+				} else {
+					bool smaller = true;
+					for( const unsigned int &axis : current ) {
+						if( existing.find( axis ) == existing.cend() ) {
+							// check if the differing axis is a dynamic one
+							if( getIteratedAxes().find( axis ) != getIteratedAxes().cend() ) {
+								differingDynamicAxesPresent = true;
+							}
+							// in this case, current is not a subset of this entry in largestGlobals,
+							// so we cannot conclude that current is smaller -- check the next one
+							smaller = false;
+							break;
+						}
+					}
+					if( smaller ) {
+						// in this case, current is a subset of this entry in largestGlobals, and
+						// so we can ignore the current tensor and move to the next one
+						insert = false;
+						// for allowing the analytic model to compute the exact buffer usage, we
+						// still record the tensor
+						minorTensors.push_back( tensor );
+						break;
+					}
+				}
+			}
+			if( insert ) {
+				std::set< int > tempSet( current.cbegin(), current.cend() );
+				(void) largestGlobals.insert( tempSet );
+			}
+		}
+	}
+
+	// start codegen: constructor
+	os << "\tasc::AnalyticModel< " << igrid->getProcessOrder() << ", "
+		<< igrid->getProblemOrder() << ", "
+		<< (differingDynamicAxesPresent ? "true" : "false")
+		<< " > am( " << ub_size << ", { ";
+	os << "_" << igrid->processSize( 0 );
+	for( size_t i = 1; i < igrid->getProcessOrder(); ++i ) {
+		os << ", _" << igrid->processSize( i );
+	}
+	os << " }, { ";
+	os << "_" << igrid->problemSize( 0 );
+	for( size_t i = 1; i < igrid->getProblemOrder(); ++i ) {
+		os << ", _" << igrid->problemSize( i );
+	}
+	os << " }, { ";
+	{
+		const auto &axes = getIteratedAxes();
+		if( axes.find( 0 ) != axes.cend() ) {
+			os << "true";
+		} else {
+			os << "false";
+		}
+		for( size_t i = 1; i < igrid->getProblemOrder(); ++i ) {
+			if( axes.find( i ) != axes.cend() ) {
+				os << ", true";
+			} else {
+				os << ", false";
+			}
+		}
+	}
+	os << " } );\n";
+
+	// add minor tensors
+	for( const auto &tensor : minorTensors ) {
+		const auto &current = tensor.getAxes();
+		os << "\tam.addMinorTensor( sizeof( "
+			<< internal::getDataType( tensor.getType() )
+			<< " ), { ";
+		if( std::find( current.cbegin(), current.cend(), 0 ) == current.cend() ) {
+			os << "false";
+		} else {
+			os << "true";
+		}
+		for( size_t i = 1; i < igrid->getProblemOrder(); ++i ) {
+			if( std::find( current.cbegin(), current.cend(), i ) == current.end() ) {
+				os << ", false";
+			} else {
+				os << ", true";
+			}
+		}
+		os << " } );\n";
+	}
+
+	// add global non-minor tensors
+	for( const auto &tensor : accessed ) {
+		const auto &axes = tensor.getAxes();
+		std::set< int > tempSet( axes.cbegin(), axes.cend() ); // TODO FIXME not the most performant code
+		if( tensor.getScope() != internal::Scope::GLOBAL ) { continue; }
+		if( largestGlobals.find( tempSet ) == largestGlobals.cend() ) { continue; }
+		assert( axes.size() > 0 );
+		os << "\tam.addGlobalTensor( sizeof( "
+			<< internal::getDataType( tensor.getType() )
+			<< " ), { ";
+		size_t k = 0;
+		if( std::find( axes.cbegin(), axes.cend(), 0 ) != axes.cend() ) {
+			os << "true";
+		} else {
+			os << "false";
+		}
+		(void) ++k;
+		for( ; k < igrid->getProblemOrder(); ++k ) {
+			if( std::find( axes.cbegin(), axes.cend(), k ) != axes.cend() ) {
+				os << ", true";
+			} else {
+				os << ", false";
+			}
+		}
+		os << " } );\n";
+	}
+
+	// add buffers
+	for( const auto &tensor : accessed ) {
+		if( tensor.getScope() != internal::Scope::GLOBAL ) {
+			const auto &axes = tensor.getAxes();
+			os << "\tam.addBuffer( sizeof( "
+				<< internal::getDataType( tensor.getType() )
+				<< " ), { ";
+			if( std::find( axes.cbegin(), axes.cend(), 0 ) == axes.cend() ) {
+				os << "false";
+			} else {
+				os <<
"true"; + } + for( size_t i = 1; i < igrid->getProblemOrder(); ++i ) { + if( std::find( axes.cbegin(), axes.cend(), i ) == axes.cend() ) { + os << ", false"; + } else { + os << ", true"; + } + } + os << " } );\n"; + } + } + + // add stages + // TODO ideally, all AscendC functions have a unique identifier and an array of + // those are passed to the analytic model. For now, we just transfer a + // count instead. + os << "\tam.setNumStages( " << stages.size() << " );\n"; + + // Now, finally, the analytic model has all info it needs -- get the blocksizes! + for( auto axes : getIteratedAxes() ) { + os << "\tconst uint32_t _tile_size" << axes << " = am.getBlockSize( " + << axes << " );\n"; + } + os << "\n"; + + + // done: move this to the host + // for( auto axes : getIteratedAxes() ) { + // os << "\tconst uint32_t _tile_size" << axes << " = 1;\n"; + // } + // os << "\n"; + + analyticModelConstrBody << "\n"; + for( const auto &axes : getIteratedAxes() ) { + analyticModelConstrBody << "\t\t\ttile_size" << axes << " = _tile_size" << axes << ";\n"; + analyticModelFormalParams << ", const uint32_t _tile_size" << axes; + analyticModelDecls << "\t\tuint32_t tile_size" << axes << ";\n\n"; + analyticModelArgs << ", _tile_size" << axes; + } + // end analytic model code block +} + +void alp::internal::AscendPipeline::generateInit( std::stringstream &init ) { + + for( auto it = accessed.cbegin(); it != accessed.cend(); ++it ) { + if( it->isGlobalDecl() ) { + + assert( it->getAxes().size() > 0 ); + + // n0 * n1 * n2 * n3 ... + std::string set_numerator( igrid->problemSize( *( it->getAxes().begin() ) ) ); + // p0 * p1 * p2 * n3 ... + std::string set_denominator( igrid->processSize( *( it->getAxes().begin() ) ) ); + for( auto jt = ++it->getAxes().begin(); jt != it->getAxes().end(); ++jt ) { + + set_numerator.append( " * " + igrid->problemSize( *jt ) ); + set_denominator.append( " * " + igrid->processSize( *jt ) ); + } + + // n2 * n3 ... 
(e.g., n0 and n1 are excluded since they are the loop axes) + std::string non_parallel_init_numerator; + + for( auto jt = stages.cbegin(); jt != stages.cend(); ++jt ) { + + if( jt->getOpType() == internal::Stagetype::GET_VIEW && jt->getTensor0().getID() == it->getID() ) { + + bool first = true; + for( auto kt = it->getAxes().cbegin(); kt != it->getAxes().cend(); ++kt ) { + if( std::find( jt->getForEachAxes().begin(), jt->getForEachAxes().end(), ( int ) *kt ) == jt->getForEachAxes().end() ) { + if( !first ) { + non_parallel_init_numerator.append( " * " ); + } + non_parallel_init_numerator.append( igrid->problemSize( *kt ) ); + first = false; + } + } + break; + } + } + + if( non_parallel_init_numerator.empty() ) { + non_parallel_init_numerator.assign( "1" ); + } + + std::string tiling_init_numerator = getTilingAxes(); + + init << "\n"; + init << "\t\t\t// Initializing data for a Global Tensor\n"; + init << "\t\t\t" << it->getAscendGlobalName( id ) + << ".SetGlobalBuffer( ( __gm__ " + << internal::getDataType( it->getType() ) << " * )" + << it->getName() << " + ( " << set_numerator << " ) / ( " << set_denominator << " ) * GetBlockIdx(), ( " + << set_numerator << " ) / ( " << set_denominator << " ) );\n"; + init << "\t\t\tpipe.InitBuffer( " << it->getTQueBufName( id ) << ", BUFFER_NUM, " + << tiling_init_numerator + << "( ( " << non_parallel_init_numerator << " ) / BUFFER_NUM ) * sizeof( " + << internal::getDataType( it->getType() ) << " ) );\n"; + } + } +/* + //TODO these two loops can be fused, the only reason are written that way + // was to solve or avoid a bug regarding the order of the init + for( auto it = accessed.cbegin(); it != accessed.cend(); ++it ) { + if( it->isTempDecl() ) { + init << "\n"; + init << "\t\t\t// Initializing data for a temporary Tensor\n"; + init << "\t\t\tpipe.InitBuffer( " << it->getTQueBufName( id ) + << ", " << "totWorkSpaceSize" << " );\n"; + init << "\t\t\t" << it->getAscendName( id ) << " = " + << it->getTQueBufName( id ) + << ".Get< " << internal::getDataType( it->getType() ) << " >();\n"; + } + } + + for( auto it = accessed.cbegin(); it != accessed.cend(); ++it ) { + if( it->isLocalDecl() ) { + init << "\n"; + //TODO fix that + std::vector< int > forEachParallelAxes = internal::vectorOfVectorsToVector( internal::OpGen::forEachAxes ); + const std::vector< int > &axes = it->getAxes(); + std::vector< int > local_iterated_axes; + + //TODO: is this set correct? is it necessary, if yes, perhaps a sort is required + local_iterated_axes = internal::vectorDifference( axes, forEachParallelAxes ); + + std::string product_dim(""); + bool first = true; + for( auto it = local_iterated_axes.cbegin(); it != local_iterated_axes.cend(); ++it ) { + if( first == true ) { + first = false; + } else { + product_dim.append(" * "); + } + product_dim.append( igrid->problemSize( *it ) ); + } + + if( product_dim.empty() == true ) { + product_dim.append( "1" ); + } + init << "\t\t\t// Initializing data for a local Tensor\n"; + init << "\t\t\tpipe.InitBuffer( " << it->getTQueBufName( id ) + << ", " << product_dim << " );\n"; + init << "\t\t\t" << it->getAscendName( id ) << " = " + << it->getTQueBufName( id ) + << ".Get< " << internal::getDataType( it->getType() ) << " >();\n"; + } + } +*/ + init << "\n"; + + std::string prev("totWorkSpaceSize"); + std::string prev_dim(""); + for( auto it = accessed.cbegin(); it != accessed.cend(); ++it ) { + if( it->isLocalDecl() || it->isTempDecl() ) { + init << "\t\t\t" << it->getAscendName( id ) << " = " << prev << ( prev_dim.empty() ? 
";\n" : ( " + " + prev_dim + ";\n" ) ); + + if( it->getAxes().size() > 0 ) { + // n0 * n1 * n2 ... + std::string set_numerator( igrid->problemSize( *( it->getAxes().begin() ) ) ); + for( auto jt = ++it->getAxes().begin(); jt != it->getAxes().end(); ++jt ) { + + set_numerator.append( " * " + igrid->problemSize( *jt ) ); + } + prev_dim = set_numerator; + } else { + prev_dim = "16"; + } + prev = it->getAscendName( id ); + } + } +} + +void alp::internal::AscendPipeline::generateProcess( std::stringstream &process, + std::stringstream &processCall ) { + + processCall << "\t\t\tProcess" << id << "();\n"; + + // generate the Process function + // TODO here we should use the grid info and symbolic analytic model + + process << "\n"; + process << "\t\t__aicore__ inline void Process" << id << "() {\n"; + process << "\n"; + + std::string tabs(""); + + // use a stack to keep track of the for loops that are already generated + std::vector< int > stack; +// std::vector< std::pair< std::string, std::pair< std::string, std::string > > > tiling_stack; + + int parallel_axe = *( stages.cbegin()->getForEachAxes().begin() ); + // initialize the stack with the axe of the outer forEach + // which is the parallel loop and thus can be omitted + stack.push_back( parallel_axe ); + +// bool new_nested_level = true; + + // declare variables for the upper bound of the extra loops that are introduced + std::set< int > iterated_axes = getIteratedAxes(); + for( auto it = iterated_axes.cbegin(); it != iterated_axes.cend(); ++it ) { + process << tabs << "\t\t\tuint32_t upper_" << igrid->problemTileMode( *it ) << ";\n"; + } + + process << "\n"; + + process << tabs << "\t\t\tfor( uint32_t " << igrid->problemMainMode( parallel_axe ) + << " = 0; " << igrid->problemMainMode( parallel_axe ) << " < ( " + << igrid->problemSize( parallel_axe ) << " / " << igrid->processSize( parallel_axe ) << " ); " + << igrid->problemMainMode( parallel_axe ) << " += tile_size" << parallel_axe << " ) {\n"; + + tabs.append("\t"); + +/* std::stringstream tiling_loop, tiling_condition, tiling_var; + + tiling_condition.str(""); + tiling_condition << "\t\t\tupper_" << igrid->problemTileMode( parallel_axe ) << " = ( " + << "( " << igrid->problemSize( parallel_axe ) << " / " << igrid->processSize( parallel_axe ) + << " ) < ( " << igrid->problemMainMode( parallel_axe ) << " + tile_size" << parallel_axe << " ) ) ? 
" + << "( (" << igrid->problemSize( parallel_axe ) << " / " << igrid->processSize( parallel_axe ) << " ) - " + << igrid->problemMainMode( parallel_axe ) << " ) : ( tile_size" << parallel_axe << " );\n"; + + // the tiling loop is not added in the stack of generated loops + tiling_loop.str(""); + tiling_loop << "\t\t\tfor( uint32_t " << igrid->problemTileMode( parallel_axe ) + << " = " << "0" << "; " << igrid->problemTileMode( parallel_axe ) + << " < upper_" << igrid->problemTileMode( parallel_axe ) << "; " << igrid->problemTileMode( parallel_axe ) << "++ ) {\n"; + + tiling_var.str(""); + tiling_var << "\t\t\t\tconst uint32_t " << igrid->problemMode( parallel_axe ) << " = " << igrid->problemMainMode( parallel_axe ) + << " + " << igrid->problemTileMode( parallel_axe ) << ";\n"; + + + tiling_stack.push_back( std::make_pair( tiling_condition.str(), std::make_pair( tiling_loop.str(), tiling_var.str() ) ) ); +*/ + std::vector< int > prev_stage_axes; + + // generate AscendC code for the operators of the pipeline + for( auto it = stages.cbegin(); it != stages.cend(); ++it ) { + + // get the axes of the current stage + const std::vector< int > &forEachAxes = it->getForEachAxes(); + + // iterator of the stack + auto st = stack.begin(); + // iterator of the axes for the current stage + auto at = forEachAxes.begin(); + + // the number of axes that are currently in the stack + // and match the corresponding axes of the current stage + size_t match_axes = 0; + + // iterate over all the axes of the stack that match + // those of the stage, which implies that if the current + // stage goes into the current for loop, i.e., no loop + // needs to be created and no loop needs to be closed, + // all the axes should match + while( st != stack.end() ) { + + if( at == forEachAxes.end() || *st != *at ) { + break; + } + + ++match_axes; + ++st; + ++at; + } + + // if there was a mismatch on the axes between the + // already generated loops (stack) and the axes of the stage + // then the axes of the stack that do not match should be popped + // which implies that the generated loops should close + size_t to_pop_axes = stack.size() - match_axes; +/* + if( to_pop_axes > 0 ) { + // close the loops for tiling first + for( auto jt = tiling_stack.begin(); jt != tiling_stack.end(); ++jt ) { + process << tabs << "\t\t}\n"; + tabs.pop_back(); + } + } +*/ + for( size_t i = 0; i < to_pop_axes; ++i ) { + +// tiling_stack.pop_back(); + + process << tabs << "\t\t}\n"; + stack.pop_back(); + tabs.pop_back(); + } +/* + // generate tiling loops if at least one loop of axes was closed + if( to_pop_axes > 0 ) { + for( auto jt = tiling_stack.begin(); jt != tiling_stack.end(); ++jt ) { + process << "\n"; + process << tabs << jt->first; + process << tabs << jt->second.first; + process << "\n"; + process << tabs << jt->second.second; + tabs.append("\t"); + } + } +*/ + // iterator of the stack + st = stack.begin(); + // iterator of the axes for the current stage + at = forEachAxes.begin(); + + // iterate over all the axes of the stage as long as the + // corresponding axes are already in the stack, which implies + // the for loops are already generated + while( at != forEachAxes.end() ) { + + if( st == stack.end() ) { + break; + } + + // as long as the end of the stack was not reached + // the axes should match those of the current stage + // since all the elements did not match were popped + assert( *st != *at ); + + ++st; + ++at; + } +/* + // close tiling loops provides that + // a) no loop was already closed, otherwise the corresponding 
loops are closed + // b) this is not the first stage + // c) the axes of the previous stage are different than those of the current stage + // a situation that indicates these two stages are not nested in the same level + if( to_pop_axes == 0 && it != stages.cbegin() && prev_stage_axes != forEachAxes ) { + for( auto jt = tiling_stack.begin(); jt != tiling_stack.end(); ++jt ) { + process << tabs << "\t\t}\n"; + tabs.pop_back(); + } + + } +*/ + // iterate over the rest of the axes of the stage, i.e., those + // that are not included in the stack and lead to generation of for loops + while( at != forEachAxes.end() ) { + +// new_nested_level = true; + + process << "\n"; + + process << tabs << "\t\t\tfor( uint32_t " << igrid->problemMainMode( *at ) + << " = 0; " << igrid->problemMainMode( *at ) << " < " + << igrid->problemSize( *at ) << "; " + << igrid->problemMainMode( *at ) << " += tile_size" << *at << " ) {\n"; + + tabs.append("\t"); + stack.push_back( *at ); +/* + tiling_condition.str(""); + tiling_condition << "\t\t\tupper_" << igrid->problemTileMode( *at ) << " = ( " + << igrid->problemSize( *at ) << " < ( " << igrid->problemMainMode( *at ) << " + tile_size" << *at << " ) ) ? " + << igrid->problemSize( *at ) << " - " << igrid->problemMainMode( *at ) << " : ( tile_size" << *at << " );\n"; + + // the tiling loop is not added in the stack of generated loops + tiling_loop.str(""); + tiling_loop << "\t\t\tfor( uint32_t " << igrid->problemTileMode( *at ) + << " = " << "0" << "; " << igrid->problemTileMode( *at ) << " < upper_" + << igrid->problemTileMode( *at ) << "; " << igrid->problemTileMode( *at ) << "++ ) {\n"; + + tiling_var.str(""); + tiling_var << "\t\t\t\tconst uint32_t " << igrid->problemMode( *at ) << " = " << igrid->problemMainMode( *at ) + << " + " << igrid->problemTileMode( *at ) << ";\n"; + + tiling_stack.push_back( std::make_pair( tiling_condition.str(), std::make_pair( tiling_loop.str(), tiling_var.str() ) ) ); +*/ ++at; + } +/* + if( new_nested_level ) { + for( auto jt = tiling_stack.begin(); jt != tiling_stack.end(); ++jt ) { + process << "\n"; + process << tabs << jt->first; + process << tabs << jt->second.first; + process << "\n"; + process << tabs << jt->second.second; + tabs.append("\t"); + } + } +*/ + process << "\n"; + process << it->getOp( tabs ); + + // reset the flag to false +// new_nested_level = false; + + // set the axes of the previous stage to those of the current one + prev_stage_axes = forEachAxes; + } +/* + if( stack.size() > 0 ) { + // before closing a loop, all the generated loops for tiling should close as well + for( auto jt = tiling_stack.begin(); jt != tiling_stack.end(); ++jt ) { + process << tabs << "\t\t}\n"; + tabs.pop_back(); + } + } +*/ + // close all the generated for loops + // starting from 0 to generate the parallel/outer loop + // starting from 1 if the outer parallel/loop is not generated + for( size_t i = 0; i < stack.size(); ++i ) { + + process << tabs << "\t\t}\n"; + tabs.pop_back(); + } + + // the curly bracket for the process function + process << "\t\t}\n"; +} + +void alp::internal::AscendPipeline::debug_print() const { + + std::cerr << "ACCESSED: "; + for (auto it = accessed.cbegin(); it != accessed.cend(); ++it ) { + std::cerr << it->getName() << ", "; + } + + std::cerr << std::endl << std::endl << std::endl; +} diff --git a/src/graphblas/ascend/semantics.cpp b/src/graphblas/ascend/semantics.cpp new file mode 100644 index 000000000..68159593d --- /dev/null +++ b/src/graphblas/ascend/semantics.cpp @@ -0,0 +1,68 @@ + +/* + * Copyright 
2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include
+
+#include
+#include
+#include
+
+namespace alp
+{
+	namespace internal
+	{
+		extern iGrid *igrid;
+	}
+}
+
+bool alp::internal::invalidForEachAxes( const std::vector< int > &axes ) {
+
+	std::vector< int > forEachAxes = internal::vectorOfVectorsToVector( internal::OpGen::forEachAxes );
+
+	for( auto it = axes.cbegin(); it != axes.cend(); ++it ) {
+
+		if( std::find( axes.cbegin(), axes.cend(), *it ) != it ) {
+			return true;
+		}
+		if( std::find( forEachAxes.cbegin(), forEachAxes.cend(), *it ) != forEachAxes.cend() ) {
+			return true;
+		}
+	}
+
+	return false;
+}
+
+bool alp::internal::invalidAxes( const std::vector< int > &axes ) {
+
+	std::vector< int > forEachAxes = internal::vectorOfVectorsToVector( internal::OpGen::forEachAxes );
+	std::vector< int > sorted_axes_copy = axes;
+	std::vector< int > intersection;
+
+	std::sort( forEachAxes.begin(), forEachAxes.end() );
+	std::sort( sorted_axes_copy.begin(), sorted_axes_copy.end() );
+
+	std::set_intersection(
+		forEachAxes.begin(), forEachAxes.end(),
+		sorted_axes_copy.begin(), sorted_axes_copy.end(),
+		std::back_inserter( intersection )
+	);
+
+	return ( intersection.size() > 0 );
+}
+
diff --git a/src/graphblas/ascend/stage.cpp b/src/graphblas/ascend/stage.cpp
new file mode 100644
index 000000000..ef8430166
--- /dev/null
+++ b/src/graphblas/ascend/stage.cpp
@@ -0,0 +1,985 @@
+#include
+#include
+#include
+#include
+
+namespace alp
+{
+	namespace internal
+	{
+		extern iGrid *igrid;
+		extern SymbolTable symbols;
+	}
+}
+
+//TODO double should be replaced by alp::Scalar
+alp::internal::Stage::Stage( const AscendPipeline &parent,
+	Stagetype _enum_op_type, Rule _rule,
+	const alp::Tensor &_tensor0,
+	const double _alpha,
+	const std::vector< int > &_forEachAxes )
+	: pipeline( parent ),
+	enum_op_type( _enum_op_type ),
+	rule( _rule ),
+	tensor0( _tensor0 ),
+	alpha( _alpha ),
+	forEachAxes( _forEachAxes )
+{
+	semanticsCheks();
+	computeMemoryOffsets();
+}
+
+alp::internal::Stage::Stage( const AscendPipeline &parent,
+	Stagetype _enum_op_type, Rule _rule,
+	const alp::Tensor &_tensor0,
+	const std::vector< int > &_activeAxes,
+	const std::vector< int > &_forEachAxes )
+	: pipeline( parent ),
+	enum_op_type( _enum_op_type ),
+	rule( _rule ),
+	tensor0( _tensor0 ),
+	activeAxes( _activeAxes ),
+	forEachAxes( _forEachAxes )
+{
+	semanticsCheks();
+	computeMemoryOffsets();
+}
+
+alp::internal::Stage::Stage( const AscendPipeline &parent,
+	Stagetype _enum_op_type, Rule _rule,
+	const alp::Tensor &_tensor0,
+	const alp::Tensor &_tensor1,
+	const std::vector< int > &_activeAxes,
+	const std::vector< int > &_forEachAxes )
+	: pipeline( parent ),
+	enum_op_type( _enum_op_type ),
+	rule( _rule ),
+	tensor0( _tensor0 ),
+	tensor1( _tensor1 ),
+	activeAxes( _activeAxes ),
+	forEachAxes( _forEachAxes )
+{
+	semanticsCheks();
+	computeMemoryOffsets();
+}
+
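+// The three-tensor Stage constructed below backs the APPLY_ADD and APPLY_MINUS
+// operations: tensor0 is the destination of the generated AscendC call, while
+// tensor1 and tensor2 are its inputs. As with every Stage constructor, the
+// semantics checks run and the memory-offset expressions are computed up front.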
+alp::internal::Stage::Stage( const AscendPipeline &parent, + Stagetype _enum_op_type, Rule _rule, + const alp::Tensor &_tensor0, + const alp::Tensor &_tensor1, + const alp::Tensor &_tensor2, + const std::vector< int > &_activeAxes, + const std::vector< int > &_forEachAxes ) + : pipeline( parent ), + enum_op_type( _enum_op_type ), + rule( _rule ), + tensor0( _tensor0 ), + tensor1( _tensor1 ), + tensor2( _tensor2 ), + activeAxes( _activeAxes ), + forEachAxes( _forEachAxes ) +{ + semanticsCheks(); + computeMemoryOffsets(); +} +/* +alp::internal::Stage::Stage( const AscendPipeline &parent, + Stagetype _enum_op_type, Rule _rule, + const alp::Tensor &_tensor0, + const alp::Tensor &_tensor1, + const alp::Tensor &_tensor2, + const alp::Tensor &_tensor3, + const std::vector< int > &_activeAxes, + const std::vector< int > &_forEachAxes ) + : pipeline( parent ), + enum_op_type( _enum_op_type ), + rule( _rule ), + tensor0( _tensor0 ), + tensor1( _tensor1 ), + tensor2( _tensor2 ), + tensor3( _tensor3 ), + activeAxes( _activeAxes ), + forEachAxes( _forEachAxes ) +{ + semanticsCheks(); + computeMemoryOffsets(); +} +*/ +alp::internal::Stagetype alp::internal::Stage::getOpType() const { + + return enum_op_type; +} + +alp::internal::Rule alp::internal::Stage::getRule() const { + + return rule; +} + +const alp::Tensor & alp::internal::Stage::getTensor0() const { + + return tensor0; +} + +const std::vector< int > & alp::internal::Stage::getAxes() const { + + return activeAxes; +} + +const std::vector< int >& alp::internal::Stage::getForEachAxes() const { + + return forEachAxes; +} + +std::string alp::internal::Stage::getOp( const std::string &tabs ) const { + + switch (enum_op_type) { + case alp::internal::Stagetype::APPLY_MINUS: + return generateApplyMinusOp( tabs ); + case alp::internal::Stagetype::APPLY_ADD: + return generateApplyAddOp( tabs ); + case alp::internal::Stagetype::FOLDL_DIVIDE: + return generateFoldlDivideOp( tabs ); + case alp::internal::Stagetype::FOLDL_MAX: + return generateFoldlMaxOp( tabs ); + case alp::internal::Stagetype::FOLDL_TIMES: + return generateFoldlTimesOp( tabs ); + case alp::internal::Stagetype::FOLDL_ADD: + return generateFoldlAddOp( tabs ); + case alp::internal::Stagetype::FOLDL_EXP: + return generateFoldlExpOp( tabs ); + case alp::internal::Stagetype::SET_TENSOR: + return generateSetTensorOp( tabs ); + case alp::internal::Stagetype::SET_SCALAR: + return generateSetScalarOp( tabs ); + case alp::internal::Stagetype::GET_VIEW: + return generateGetViewOp( tabs ); + case alp::internal::Stagetype::STORE: + return generateStoreOp( tabs ); + case alp::internal::Stagetype::IMPLICIT_FREE: + return generateImplicitFreeOp( tabs ); + default: + return generateToDoOp( tabs ); + } +} + +std::string alp::internal::Stage::generateApplyMinusOp( const std::string &tabs ) const { + + const std::string arg1 = tensor0.getAccessedElement( pipeline.getID() ); + const std::string arg2 = tensor1.getAccessedElement( pipeline.getID() ); + const std::string arg3 = tensor2.getAccessedElement( pipeline.getID() ); +// const std::string arg4 = tensor3.getAccessedElement( pipeline.getID() ); + + const std::vector< int > op_axes = computeOperatorAxes(); + std::stringstream stage; + + switch ( rule ) { + case Rule::EWISE: + { + if( op_axes.size() == 1) { + stage << tabs << "\t\t\talp::VectorEwiseMinus( " << arg1 << ", " << arg2 << ", " + << arg3 << ", " << igrid->problemSize( op_axes[ 0 ] ) << " );\n"; + } else if( op_axes.size() == 2) { + stage << tabs << "\t\t\talp::BlockEwiseMinus( " << arg1 << ", " << 
arg2 << ", " + << arg3 << ", " << igrid->problemSize( op_axes[ 0 ] ) << ", " + << igrid->problemSize( op_axes[ 1 ] ) << " );\n"; + } + break; + } + case Rule::BCAST: + { + if( op_axes.size() == 1) { + stage << tabs << "\t\t\talp::VectorBcastMinus( " << arg1 << ", " << arg2 << ", " << arg3 << ", " + << alp::internal::symbols.getLocalTempTensorBuffer( tensor0.getType() ) << "[ 0 ], " + << pipeline.getTilingAxes() << igrid->problemSize( op_axes[ 0 ] ) << " );\n"; + } else if( op_axes.size() == 2) { + stage << tabs << "\t\t\talp::BlockBcastMinus( " << arg1 << ", " << arg2 << ", " << arg3 << ", " + << alp::internal::symbols.getLocalTempTensorBuffer( tensor0.getType() ) << "[ 0 ], " + << pipeline.getTilingAxes() << igrid->problemSize( op_axes[ 0 ] ) << ", " + << igrid->problemSize( op_axes[ 1 ] ) << " );\n"; + } + break; + } + case Rule::REDUCE: + { + break; + } + default: + { + std::cerr << "Invalid rule: apply minus" << std::endl; + std::abort(); + } + } + + return stage.str(); +} + +std::string alp::internal::Stage::generateApplyAddOp( const std::string &tabs ) const { + + const std::string arg1 = tensor0.getAccessedElement( pipeline.getID() ); + const std::string arg2 = tensor1.getAccessedElement( pipeline.getID() ); + const std::string arg3 = tensor2.getAccessedElement( pipeline.getID() ); + + const std::vector< int > op_axes = computeOperatorAxes(); + std::stringstream stage; + + switch ( rule ) { + case Rule::EWISE: + { + if( op_axes.size() == 0) { + stage << tabs << "\t\t\tAdd( " << arg1 << ", " << arg2 << ", " << arg3 << ", " + << pipeline.getTilingAxes() << "1" << " );\n"; + } else if( op_axes.size() == 1) { + stage << tabs << "\t\t\tAdd( " << arg1 << ", " << arg2 << ", " << arg3 << ", " + << pipeline.getTilingAxes() << igrid->problemSize( op_axes[ 0 ] ) << " );\n"; + } else if( op_axes.size() == 2) { + stage << tabs << "\t\t\tAdd( " << arg1 << ", " << arg2 << ", " << arg3 << ", " + << igrid->problemSize( op_axes[ 0 ] ) << " * " + << igrid->problemSize( op_axes[ 1 ] ) << " );\n"; + } + break; + } + case Rule::BCAST: + { + break; + } + case Rule::REDUCE: + { + break; + } + default: + { + std::cerr << "Invalid rule: apply add" << std::endl; + std::abort(); + } + } + + return stage.str(); +} + +std::string alp::internal::Stage::generateFoldlDivideOp( const std::string &tabs ) const { + + const std::string arg1 = tensor0.getAccessedElement( pipeline.getID() ); + const std::string arg2 = tensor1.getAccessedElement( pipeline.getID() ); +// const std::string arg3 = tensor2.getAccessedElement( pipeline.getID() ); + + const std::vector< int > op_axes = computeOperatorAxes(); + std::stringstream stage; + + switch ( rule ) { + case Rule::EWISE: + { + break; + } + case Rule::BCAST: + { + if( op_axes.size() == 1) { + stage << tabs << "\t\t\talp::VectorBcastDivide( " << arg1 << ", " << arg1 << ", " << arg2 << ", " + << alp::internal::symbols.getLocalTempTensorBuffer( tensor0.getType() ) << "[ 0 ], " + << pipeline.getTilingAxes() << igrid->problemSize( op_axes[ 0 ] ) << " );\n"; + } else if( op_axes.size() == 2) { + stage << tabs << "\t\t\talp::BlockBcastDivide( " << arg1 << ", " << arg1 << ", " << arg2 << ", " + << alp::internal::symbols.getLocalTempTensorBuffer( tensor0.getType() ) << "[ 0 ], " + << pipeline.getTilingAxes() << igrid->problemSize( op_axes[ 0 ] ) << ", " + << igrid->problemSize( op_axes[ 1 ] ) << " );\n"; + } + break; + } + case Rule::REDUCE: + { + break; + } + default: + { + std::cerr << "Invalid rule: foldl divide" << std::endl; + std::abort(); + } + } + + return stage.str(); +} + 
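+// Like generateFoldlDivideOp above, the foldl generators for binary operators
+// (max, times, add) dispatch on the Rule (EWISE, BCAST, or REDUCE) and on the
+// number of operator axes (one axis emits a Vector* call, two axes a Block*
+// call), then write a single call to the ALP AscendC kernel library into the
+// generated Process() body. For example, with out/in standing for the generated
+// local-tensor access expressions, a one-axis element-wise max is emitted as
+//   alp::VectorEwiseMax( out, out, in, n0 );
+// where n0 stands for the problem size of the single operator axis, whereas the
+// BCAST and REDUCE variants additionally pass a temporary buffer from the symbol
+// table and prepend the tiling expression returned by pipeline.getTilingAxes()
+// to the length argument.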
+std::string alp::internal::Stage::generateFoldlMaxOp( const std::string &tabs ) const { + + const std::string arg1 = tensor0.getAccessedElement( pipeline.getID() ); + const std::string arg2 = tensor1.getAccessedElement( pipeline.getID() ); + + const std::vector< int > op_axes = computeOperatorAxes(); + std::stringstream stage; + + switch ( rule ) { + case Rule::EWISE: + { + if( op_axes.size() == 1) { + stage << tabs << "\t\t\talp::VectorEwiseMax( " << arg1 << ", " << arg1 << ", " + << arg2 << ", " << igrid->problemSize( op_axes[ 0 ] ) << " );\n"; + } else if( op_axes.size() == 2) { + stage << tabs << "\t\t\talp::BlockEwiseMax( " << arg1 << ", " << arg1 << ", " + << arg2 << ", " << igrid->problemSize( op_axes[ 0 ] ) << ", " + << igrid->problemSize( op_axes[ 1 ] ) << " );\n"; + } + break; + } + case Rule::BCAST: + { + break; + } + case Rule::REDUCE: + { + if( op_axes.size() == 1) { + stage << tabs << "\t\t\talp::VectorReduceMax( " << arg1 << ", " << arg2 << ", " + << alp::internal::symbols.getLocalTempTensorBuffer( tensor0.getType() ) << "[ 0 ], " + << pipeline.getTilingAxes() << igrid->problemSize( op_axes[ 0 ] ) << " );\n"; + } else if( op_axes.size() == 2) { + stage << tabs << "\t\t\talp::BlockReduceMax( " << arg1 << ", " << arg2 << ", " + << alp::internal::symbols.getLocalTempTensorBuffer( tensor0.getType() ) << "[ 0 ], " + << pipeline.getTilingAxes() << igrid->problemSize( op_axes[ 0 ] ) << ", " + << igrid->problemSize( op_axes[ 1 ] ) << " );\n"; + } + break; + } + default: + { + std::cerr << "Invalid rule: foldl max" << std::endl; + std::abort(); + } + } + + return stage.str(); +} + +std::string alp::internal::Stage::generateFoldlTimesOp( const std::string &tabs ) const { + + const std::string arg1 = tensor0.getAccessedElement( pipeline.getID() ); + const std::string arg2 = tensor1.getAccessedElement( pipeline.getID() ); + + const std::vector< int > op_axes = computeOperatorAxes(); + std::stringstream stage; + + switch ( rule ) { + case Rule::EWISE: + { + if( op_axes.size() == 1) { + stage << tabs << "\t\t\talp::VectorEwiseMultiply( " << arg1 << ", " << arg1 << ", " + << arg2 << ", " << igrid->problemSize( op_axes[ 0 ] ) << " );\n"; + } else if( op_axes.size() == 2) { + stage << tabs << "\t\t\talp::BlockEwiseMultiply( " << arg1 << ", " << arg1 << ", " + << arg2 << ", " << igrid->problemSize( op_axes[ 0 ] ) << ", " + << igrid->problemSize( op_axes[ 1 ] ) << " );\n"; + } + break; + } + case Rule::BCAST: + { + break; + } + case Rule::REDUCE: + { + break; + } + default: + { + std::cerr << "Invalid rule: foldl times" << std::endl; + std::abort(); + } + } + + return stage.str(); +} + +std::string alp::internal::Stage::generateFoldlAddOp( const std::string &tabs ) const { + + const std::string arg1 = tensor0.getAccessedElement( pipeline.getID() ); + const std::string arg2 = tensor1.getAccessedElement( pipeline.getID() ); + + const std::vector< int > op_axes = computeOperatorAxes(); + std::stringstream stage; + + switch ( rule ) { + case Rule::EWISE: + { + if( op_axes.size() == 1) { + stage << tabs << "\t\t\talp::VectorEwiseSum( " << arg1 << ", " << arg1 << ", " + << arg2 << ", " << igrid->problemSize( op_axes[ 0 ] ) << " );\n"; + } else if( op_axes.size() == 2) { + stage << tabs << "\t\t\talp::BlockEwiseSum( " << arg1 << ", " << arg1 << ", " + << arg2 << ", " << igrid->problemSize( op_axes[ 0 ] ) << ", " + << igrid->problemSize( op_axes[ 1 ] ) << " );\n"; + } + break; + } + case Rule::BCAST: + { + break; + } + case Rule::REDUCE: + { + if( op_axes.size() == 1) { + stage << tabs << 
"\t\t\talp::VectorReduceSum( " << arg1 << ", " << arg2 << ", " + << alp::internal::symbols.getLocalTempTensorBuffer( tensor0.getType() ) << "[ 0 ], " + << pipeline.getTilingAxes() << igrid->problemSize( op_axes[ 0 ] ) << " );\n"; + } else if( op_axes.size() == 2) { + stage << tabs << "\t\t\talp::BlockReduceSum( " << arg1 << ", " << arg2 << ", " + << alp::internal::symbols.getLocalTempTensorBuffer( tensor0.getType() ) << "[ 0 ], " + << pipeline.getTilingAxes() << igrid->problemSize( op_axes[ 0 ] ) << ", " + << igrid->problemSize( op_axes[ 1 ] ) << " );\n"; + } + break; + } + default: + { + std::cerr << "Invalid rule: foldl add" << std::endl; + std::abort(); + } + } + + return stage.str(); +} + +std::string alp::internal::Stage::generateFoldlExpOp( const std::string &tabs ) const { + + const std::string arg1 = tensor0.getAccessedElement( pipeline.getID() ); + + const std::vector< int > op_axes = computeOperatorAxes(); + std::stringstream stage; + + if( op_axes.size() == 1) { + stage << tabs << "\t\t\talp::VectorExp( " << arg1 << ", " << arg1 + << ", " << pipeline.getTilingAxes() << igrid->problemSize( op_axes[ 0 ] ) << " );\n"; + } else if( op_axes.size() == 2) { + stage << tabs << "\t\t\talp::BlockExp( " << arg1 << ", " << arg1 + << ", " << pipeline.getTilingAxes() << igrid->problemSize( op_axes[ 0 ] ) << ", " + << igrid->problemSize( op_axes[ 1 ] ) << " );\n"; + } + + return stage.str(); +} + +std::string alp::internal::Stage::generateSetTensorOp( const std::string &tabs ) const { + const std::string arg1 = tensor0.getAccessedElement( pipeline.getID() ); + const std::string arg2 = tensor1.getAccessedElement( pipeline.getID() ); + + const std::vector< int > op_axes = computeOperatorAxes(); + std::stringstream stage; + + if( op_axes.size() == 1) { + stage << tabs << "\t\t\talp::VectorSet( " << arg1 << ", " << arg2 << ", " + << igrid->problemSize( op_axes[ 0 ] ) << " );\n"; + } else if( op_axes.size() == 2) { + stage << tabs << "\t\t\talp::BlockSet( " << arg1 << ", " << arg2 << ", " + << pipeline.getTilingAxes() << igrid->problemSize( op_axes[ 0 ] ) << ", " + << igrid->problemSize( op_axes[ 1 ] ) << " );\n"; + } + + return stage.str(); +} + +std::string alp::internal::Stage::generateSetScalarOp( const std::string &tabs ) const { + + const std::string arg1 = tensor0.getAccessedElement( pipeline.getID() ); + const std::string scalar = ( alpha == std::numeric_limits< double >::infinity() ) ? "65504.0" + : ( alpha == -std::numeric_limits< double >::infinity() ) ? 
"-65504.0" + : std::to_string( alpha ); + + const std::vector< int > op_axes = computeOperatorAxes(); + std::stringstream stage; + + if( op_axes.size() == 1) { + stage << tabs << "\t\t\talp::VectorSet( " << arg1 << ", " << scalar << ", " + << igrid->problemSize( op_axes[ 0 ] ) << " );\n"; + } else if( op_axes.size() == 2) { + stage << tabs << "\t\t\talp::BlockSet( " << arg1 << ", " << scalar << ", " + << igrid->problemSize( op_axes[ 0 ] ) << ", " + << igrid->problemSize( op_axes[ 1 ] ) << " );\n"; + } + + return stage.str(); +} + +std::string alp::internal::Stage::generateGetViewOp( const std::string &tabs ) const { + + const std::string arg1 = tensor0.getAccessedElement( pipeline.getID() ); + + const std::vector< int > op_axes = computeOperatorAxes(); + std::stringstream stage; + + const size_t id = pipeline.getID(); + + if( pipeline.isOutput( tensor0 ) == true ) { + stage << tabs << "\t\t\t// Initializing data for an output global Tensor\n"; + stage << tabs << "\t\t\t" << tensor0.getAscendName( id ) << " = " + << tensor0.getTQueBufName( id ) << ".AllocTensor< " + << internal::getDataType( tensor0.getType() ) << " >();\n"; + } else { + stage << tabs << "\t\t\t// Initializing data for an input global Tensor\n"; + stage << tabs << "\t\t\t" << tensor0.getAscendName( id ) << " = " + << tensor0.getTQueBufName( id ) << ".AllocTensor< " + << internal::getDataType( tensor0.getType() ) << " >();\n"; + + if( op_axes.size() == 0 ) { + stage << tabs << "\t\t\talp::DataMove( " << tensor0.getAscendName( id ) + << "[ " << "0" << " ], " + << tensor0.getAscendGlobalName( id ) << "[ " << tensor0_offset << " ], " + << pipeline.getTilingAxes() << "1" << " );\n"; + + }else if( op_axes.size() == 1 ) { +// stage << tabs << "\t\t\tDataCopy( " << tensor0.getAscendName( id ) << ", " +// << tensor0.getAscendGlobalName( id ) << "[ " << tensor0_offset << " ], " +// << igrid->problemSize( op_axes[ 0 ] ) << " );\n"; + +// stage << tabs << "\t\t\talp::DataMove( " << tensor0.getAscendName( id ) << ", " +// << tensor0.getAscendGlobalName( id ) << "[ " << tensor0_offset << " ], " + + stage << tabs << "\t\t\talp::DataMove( " << tensor0.getAscendName( id ) + << "[ " << "0" << " ], " + << tensor0.getAscendGlobalName( id ) << "[ " << tensor0_offset << " ], " + << pipeline.getTilingAxes() << igrid->problemSize( op_axes[ 0 ] ) << " );\n"; + + } else if( op_axes.size() == 2) { +// stage << tabs << "\t\t\tfor( uint32_t k = 0; k < " +// << igrid->problemSize( op_axes[ 0 ] ) << "; k++ ) {\n"; + +// stage << tabs << "\t\t\t\tDataCopy( " << tensor0.getAscendName( id ) +// << "[ k * " << igrid->problemSize( op_axes[ 1 ] ) << " ], " +// << tensor0.getAscendGlobalName( id ) +// << "[ " << tensor0_offset << " + k" << stride << " ], " +// << igrid->problemSize( op_axes[ 1 ] ) << " );\n"; + +// stage << tabs << "\t\t\t}\n"; + + stage << tabs << "\t\t\talp::DataMove( " << tensor0.getAscendName( id ) + << "[ " << "0" << " ], " + << tensor0.getAscendGlobalName( id ) << "[ " << tensor0_offset << " ], " + << pipeline.getTilingAxes() << igrid->problemSize( op_axes[ 0 ] ) << ", " + << igrid->problemSize( op_axes[ 1 ] ) << ", " + << stride << ", " + << igrid->problemSize( op_axes[ 1 ] ) << " );\n"; + } + + stage << tabs << "\t\t\t" << tensor0.getTQueBufName( id ) + << ".EnQue( " << tensor0.getAscendName( id ) << " );\n"; + + stage << tabs << "\t\t\t" << tensor0.getAscendName( id ) << " = " + << tensor0.getTQueBufName( id ) + << ".DeQue< " << internal::getDataType( tensor0.getType() ) << " >();\n"; + } + + return stage.str(); +} + +std::string 
alp::internal::Stage::generateStoreOp( const std::string &tabs ) const { + + //TODO I should use the arg1 + const std::string arg1 = tensor0.getAccessedElement( pipeline.getID() ); + + const std::vector< int > op_axes = computeOperatorAxes(); + std::stringstream stage; + + const size_t id = pipeline.getID(); + + stage << tabs << "\t\t\t// Copying data of an output Tensor back to the global memory\n"; + stage << tabs << "\t\t\t" << tensor0.getTQueBufName( id ) + << ".EnQue< " << internal::getDataType( tensor0.getType() ) + << " >( " << tensor0.getAscendName( id ) << " );\n"; + stage << tabs << "\t\t\t" << tensor0.getAscendName( id ) << " = " + << tensor0.getTQueBufName( id ) << ".DeQue< " + << internal::getDataType( tensor0.getType() ) << " >();\n"; + + if( op_axes.size() == 0) { + stage << tabs << "\t\t\talp::DataMove( " + << tensor0.getAscendGlobalName( id ) << "[ " << tensor0_offset << " ], " + << tensor0.getAscendName( id ) << "[ " << "0" << " ], " + << pipeline.getTilingAxes() << "1" << " );\n"; + } else if( op_axes.size() == 1) { +// stage << tabs << "\t\t\tDataCopy( " << tensor0.getAscendGlobalName( id ) +// << "[ " << tensor0_offset << " ], " << tensor0.getAscendName( id ) << ", " +// << igrid->problemSize( op_axes[ 0 ] ) << " );\n"; + + stage << tabs << "\t\t\talp::DataMove( " + << tensor0.getAscendGlobalName( id ) << "[ " << tensor0_offset << " ], " + << tensor0.getAscendName( id ) << "[ " << "0" << " ], " + << pipeline.getTilingAxes() << igrid->problemSize( op_axes[ 0 ] ) << " );\n"; + + } else if( op_axes.size() == 2) { +/* + stage << tabs << "\t\t\tfor( uint32_t k = 0; k < " + << igrid->problemSize( op_axes[ 0 ] ) << "; k++ ) {\n"; + + stage << tabs << "\t\t\t\tDataCopy( " << tensor0.getAscendGlobalName( id ) + << "[ " << tensor0_offset << " + k" << stride << " ], " + << tensor0.getAscendName( id ) << "[ k * " << igrid->problemSize( op_axes[ 1 ] ) << " ], " + << igrid->problemSize( op_axes[ 1 ] ) << " );\n"; + + stage << tabs << "\t\t\t}\n"; +*/ + stage << tabs << "\t\t\talp::DataMove( " + << tensor0.getAscendGlobalName( id ) << "[ " << tensor0_offset << " ], " + << tensor0.getAscendName( id ) << "[ " << "0" << " ], " + << pipeline.getTilingAxes() << igrid->problemSize( op_axes[ 0 ] ) << ", " + << igrid->problemSize( op_axes[ 1 ] ) << ", " + << igrid->problemSize( op_axes[ 1 ] ) << ", " + << stride << " );\n"; + } + + stage << tabs << "\t\t\t" << tensor0.getTQueBufName( id ) + << ".FreeTensor( " << tensor0.getAscendName( id ) << " );\n"; + + return stage.str(); +} + +std::string alp::internal::Stage::generateImplicitFreeOp( const std::string &tabs ) const { + + //TODO I should use the arg1 + const std::string arg1 = tensor0.getAccessedElement( pipeline.getID() ); + + std::stringstream stage; + + const size_t id = pipeline.getID(); + + stage << tabs << "\t\t\t// Freeing data of a Tensor that is not output\n"; + stage << tabs << "\t\t\t" << tensor0.getTQueBufName( id ) + << ".FreeTensor( " << tensor0.getAscendName( id ) << " );\n"; + + return stage.str(); +} + +std::string alp::internal::Stage::generateToDoOp( const std::string &tabs ) const { + + return tabs + std::string(""); +} + +//TODO: perhaps rename it to computeUnionAxes + +std::vector< int > alp::internal::Stage::computeOperatorAxes() const { + + // initializing the union with the axes of tensor0 used by all operators + std::vector< int > union_axes = tensor0.getAxes(); + + switch ( enum_op_type ) { + + case alp::internal::Stagetype::APPLY_MINUS: // 3 Tensors + case alp::internal::Stagetype::APPLY_ADD: // 3 Tensors + 
case alp::internal::Stagetype::FOLDL_DIVIDE: // 2 Tensors + case alp::internal::Stagetype::SET_TENSOR: // 2 Tensors + case alp::internal::Stagetype::FOLDL_MAX: // 2 Tensors + case alp::internal::Stagetype::FOLDL_TIMES: // 2 Tensors + case alp::internal::Stagetype::FOLDL_ADD: // 2 Tensors + { + const std::vector< int > &tensor1_axes = tensor1.getAxes(); + union_axes = internal::vectorUnion( union_axes, tensor1_axes ); + break; + } + case alp::internal::Stagetype::FOLDL_EXP: // 1 Tensor + case alp::internal::Stagetype::SET_SCALAR: // 1 Tensor + case alp::internal::Stagetype::GET_VIEW: // 1 Tensor + case alp::internal::Stagetype::STORE: // 1 Tensor + case alp::internal::Stagetype::IMPLICIT_FREE: // 1 Tensor +// default: + break; + } + + switch ( enum_op_type ) { + + case alp::internal::Stagetype::APPLY_MINUS: // 3 Tensors + case alp::internal::Stagetype::APPLY_ADD: // 3 Tensors + { + const std::vector< int > &tensor2_axes = tensor2.getAxes(); + union_axes = internal::vectorUnion( union_axes, tensor2_axes ); + break; + } + case alp::internal::Stagetype::FOLDL_DIVIDE: // 2 Tensors + case alp::internal::Stagetype::SET_TENSOR: // 2 Tensors + case alp::internal::Stagetype::FOLDL_MAX: // 2 Tensors + case alp::internal::Stagetype::FOLDL_TIMES: // 2 Tensors + case alp::internal::Stagetype::FOLDL_ADD: // 2 Tensors + case alp::internal::Stagetype::FOLDL_EXP: // 1 Tensor + case alp::internal::Stagetype::SET_SCALAR: // 1 Tensor + case alp::internal::Stagetype::GET_VIEW: // 1 Tensor + case alp::internal::Stagetype::STORE: // 1 Tensor + case alp::internal::Stagetype::IMPLICIT_FREE: // 1 Tensor +// default: + break; + } + + // only in the case of GET_VIEW and STORE + // we need to remove the axes of the loops + // because the stored axes are those of the parent + // FIXME: perhaps we should change this design and handle views + // as different objects added to the symbol table + switch ( enum_op_type ) { + + case alp::internal::Stagetype::GET_VIEW: // 1 Tensor + case alp::internal::Stagetype::STORE: // 1 Tensor + { + union_axes = internal::vectorDifference( union_axes, forEachAxes ); + break; + } + // IMPLICIT_FREE is created based on STORE + // and this step is already done except that + // this function is not used by IMPLICIT_FREE + case alp::internal::Stagetype::IMPLICIT_FREE: // 1 Tensor + case alp::internal::Stagetype::APPLY_MINUS: // 3 Tensors + case alp::internal::Stagetype::APPLY_ADD: // 3 Tensors + case alp::internal::Stagetype::FOLDL_DIVIDE: // 2 Tensors + case alp::internal::Stagetype::SET_TENSOR: // 2 Tensors + case alp::internal::Stagetype::FOLDL_MAX: // 2 Tensors + case alp::internal::Stagetype::FOLDL_TIMES: // 2 Tensors + case alp::internal::Stagetype::FOLDL_ADD: // 2 Tensors + case alp::internal::Stagetype::FOLDL_EXP: // 1 Tensor + case alp::internal::Stagetype::SET_SCALAR: // 1 Tensor +// default: + break; + } + + return union_axes; +} + +void alp::internal::Stage::computeMemoryOffsets(){ + + switch ( enum_op_type ) { + + case alp::internal::Stagetype::GET_VIEW: // 1 Tensor + case alp::internal::Stagetype::STORE: // 1 Tensor + case alp::internal::Stagetype::IMPLICIT_FREE: // 1 Tensor + { + // for the GET_VIEW and STORE it's necessary to compute the expression for the stride + // we compute the stride only if the axes of the view are two + // more than two axes are not supported + // one axis does not require the stride + if( activeAxes.size() == 2 ) { + bool first = true; + for( int i = activeAxes[ 0 ] + 1; i <= activeAxes[ 1 ]; ++i ) { + if( first == true ) { + first = false; + 
stride.append( igrid->problemSize( i ) ); // n3 * n4 * n5 + } else { + stride.append( " * " + igrid->problemSize( i ) ); // n3 * n4 * n5 + } + } + } + break; + } + default: + break; + } + + switch ( enum_op_type ) { + +// case alp::internal::Stagetype::APPLY_MINUS: // 4 Tensors + case alp::internal::Stagetype::APPLY_MINUS: // 3 Tensors + case alp::internal::Stagetype::APPLY_ADD: // 3 Tensors + case alp::internal::Stagetype::FOLDL_DIVIDE: // 2 Tensors + case alp::internal::Stagetype::SET_TENSOR: // 2 Tensors + case alp::internal::Stagetype::FOLDL_MAX: // 2 Tensors + case alp::internal::Stagetype::FOLDL_TIMES: // 2 Tensors + case alp::internal::Stagetype::FOLDL_ADD: // 2 Tensors + case alp::internal::Stagetype::GET_VIEW: // 1 Tensor + case alp::internal::Stagetype::STORE: // 1 Tensor + case alp::internal::Stagetype::IMPLICIT_FREE: // 1 Tensor + case alp::internal::Stagetype::FOLDL_EXP: // 1 Tensor + case alp::internal::Stagetype::SET_SCALAR: // 1 Tensor + { + const std::vector< int > &view_parent_0_axes = alp::internal::symbols.getTensorFromView( tensor0 ).getAxes(); + + bool first = true; + + for( auto it = forEachAxes.begin(); it != forEachAxes.end(); ++it ) { + + if( std::find( view_parent_0_axes.begin(), view_parent_0_axes.end(), *it ) != view_parent_0_axes.end() ) { + + if( !first ) { + tensor0_offset.append( " + " ); + } else { + first = false; + } + + tensor0_offset.append( igrid->problemMainMode( *it ) ); // z0 + for( auto jt = view_parent_0_axes.begin(); jt != view_parent_0_axes.end(); ++jt ) { + if( *jt > *it ) { + tensor0_offset.append( " * " + igrid->problemSize( *jt ) ); // n1 * n2 * n3 + } + } + } + } + break; + } +// default: + } + + switch ( enum_op_type ) { + +// case alp::internal::Stagetype::APPLY_MINUS: // 4 Tensors + case alp::internal::Stagetype::APPLY_MINUS: // 3 Tensors + case alp::internal::Stagetype::APPLY_ADD: // 3 Tensors + case alp::internal::Stagetype::FOLDL_DIVIDE: // 2 Tensors + case alp::internal::Stagetype::SET_TENSOR: // 2 Tensors + case alp::internal::Stagetype::FOLDL_MAX: // 2 Tensors + case alp::internal::Stagetype::FOLDL_TIMES: // 2 Tensors + case alp::internal::Stagetype::FOLDL_ADD: // 2 Tensors + { + const std::vector< int > &view_parent_1_axes = alp::internal::symbols.getTensorFromView( tensor1 ).getAxes(); + + bool first = true; + + for( auto it = forEachAxes.begin(); it != forEachAxes.end(); ++it ) { + + if( std::find( view_parent_1_axes.begin(), view_parent_1_axes.end(), *it ) != view_parent_1_axes.end() ) { + + if( !first ) { + tensor1_offset.append( " + " ); + } else { + first = false; + } + + tensor1_offset.append( igrid->problemMainMode( *it ) ); // z0 + for( auto jt = view_parent_1_axes.begin(); jt != view_parent_1_axes.end(); ++jt ) { + if( *jt > *it ) { + tensor1_offset.append( " * " + igrid->problemSize( *jt ) ); // n1 * n2 * n3 + } + } + } + } + break; + } + case alp::internal::Stagetype::GET_VIEW: // 1 Tensor + case alp::internal::Stagetype::STORE: // 1 Tensor + case alp::internal::Stagetype::IMPLICIT_FREE: // 1 Tensor + case alp::internal::Stagetype::FOLDL_EXP: // 1 Tensor + case alp::internal::Stagetype::SET_SCALAR: // 1 Tensor +// default: + break; + } + + switch ( enum_op_type ) { + +// case alp::internal::Stagetype::APPLY_MINUS: // 4 Tensors + case alp::internal::Stagetype::APPLY_MINUS: // 3 Tensors + case alp::internal::Stagetype::APPLY_ADD: // 3 Tensors + { + const std::vector< int > &view_parent_2_axes = alp::internal::symbols.getTensorFromView( tensor2 ).getAxes(); + + bool first = true; + + for( auto it = forEachAxes.begin(); 
it != forEachAxes.end(); ++it ) { + + if( std::find( view_parent_2_axes.begin(), view_parent_2_axes.end(), *it ) != view_parent_2_axes.end() ) { + + if( !first ) { + tensor2_offset.append( " + " ); + } else { + first = false; + } + + tensor2_offset.append( igrid->problemMainMode( *it ) ); // z0 + for( auto jt = view_parent_2_axes.begin(); jt != view_parent_2_axes.end(); ++jt ) { + if( *jt > *it ) { + tensor2_offset.append( " * " + igrid->problemSize( *jt ) ); // n1 * n2 * n3 + } + } + } + } + break; + } + case alp::internal::Stagetype::FOLDL_DIVIDE: // 2 Tensors + case alp::internal::Stagetype::SET_TENSOR: // 2 Tensors + case alp::internal::Stagetype::FOLDL_MAX: // 2 Tensors + case alp::internal::Stagetype::FOLDL_TIMES: // 2 Tensors + case alp::internal::Stagetype::FOLDL_ADD: // 2 Tensors + case alp::internal::Stagetype::GET_VIEW: // 1 Tensor + case alp::internal::Stagetype::STORE: // 1 Tensor + case alp::internal::Stagetype::IMPLICIT_FREE: // 1 Tensor + case alp::internal::Stagetype::FOLDL_EXP: // 1 Tensor + case alp::internal::Stagetype::SET_SCALAR: // 1 Tensor +// default: + break; + } +} + +void alp::internal::Stage::semanticsCheks(){ + + switch ( enum_op_type ) { + +// case alp::internal::Stagetype::APPLY_MINUS: // 4 Tensors + case alp::internal::Stagetype::APPLY_MINUS: // 3 Tensors + case alp::internal::Stagetype::APPLY_ADD: // 3 Tensors + case alp::internal::Stagetype::FOLDL_DIVIDE: // 2 Tensors + case alp::internal::Stagetype::SET_TENSOR: // 2 Tensors + case alp::internal::Stagetype::FOLDL_MAX: // 2 Tensors + case alp::internal::Stagetype::FOLDL_TIMES: // 2 Tensors + case alp::internal::Stagetype::FOLDL_ADD: // 2 Tensors + case alp::internal::Stagetype::FOLDL_EXP: // 1 Tensor + case alp::internal::Stagetype::SET_SCALAR: // 1 Tensor + { + if( internal::invalidAxes( tensor0.getAxes() ) == true ) { + std::cerr << "The axes of the Tensor must not be included in the axes of the forEach." << std::endl; + std::abort(); + } + break; + } + case alp::internal::Stagetype::GET_VIEW: // 1 Tensor + case alp::internal::Stagetype::STORE: // 1 Tensor + case alp::internal::Stagetype::IMPLICIT_FREE: // 1 Tensor + { + //TODO this semantics check cannot be done on the parent tensor + break; + } +// default: + } + + switch ( enum_op_type ) { + +// case alp::internal::Stagetype::APPLY_MINUS: // 4 Tensors + case alp::internal::Stagetype::APPLY_MINUS: // 3 Tensors + case alp::internal::Stagetype::APPLY_ADD: // 3 Tensors + case alp::internal::Stagetype::FOLDL_DIVIDE: // 2 Tensors + case alp::internal::Stagetype::SET_TENSOR: // 2 Tensors + case alp::internal::Stagetype::FOLDL_MAX: // 2 Tensors + case alp::internal::Stagetype::FOLDL_TIMES: // 2 Tensors + case alp::internal::Stagetype::FOLDL_ADD: // 2 Tensors + { + if( internal::invalidAxes( tensor1.getAxes() ) == true ) { + std::cerr << "The axes of the Tensor must not be included in the axes of the forEach." 
<< std::endl; + std::abort(); + } + break; + } + case alp::internal::Stagetype::FOLDL_EXP: // 1 Tensor + case alp::internal::Stagetype::SET_SCALAR: // 1 Tensor + case alp::internal::Stagetype::GET_VIEW: // 1 Tensor + case alp::internal::Stagetype::STORE: // 1 Tensor + case alp::internal::Stagetype::IMPLICIT_FREE: // 1 Tensor + { + //TODO this semantics check cannot be done on the parent tensor + break; + } +// default: + } + + switch ( enum_op_type ) { + +// case alp::internal::Stagetype::APPLY_MINUS: // 4 Tensors + case alp::internal::Stagetype::APPLY_MINUS: // 3 Tensors + case alp::internal::Stagetype::APPLY_ADD: // 3 Tensors + { + if( internal::invalidAxes( tensor2.getAxes() ) == true ) { + std::cerr << "The axes of the Tensor must not be included in the axes of the forEach." << std::endl; + std::abort(); + } + break; + } + case alp::internal::Stagetype::FOLDL_DIVIDE: // 2 Tensors + case alp::internal::Stagetype::SET_TENSOR: // 2 Tensors + case alp::internal::Stagetype::FOLDL_MAX: // 2 Tensors + case alp::internal::Stagetype::FOLDL_TIMES: // 2 Tensors + case alp::internal::Stagetype::FOLDL_ADD: // 2 Tensors + case alp::internal::Stagetype::FOLDL_EXP: // 1 Tensor + case alp::internal::Stagetype::SET_SCALAR: // 1 Tensor + case alp::internal::Stagetype::GET_VIEW: // 1 Tensor + case alp::internal::Stagetype::STORE: // 1 Tensor + case alp::internal::Stagetype::IMPLICIT_FREE: // 1 Tensor + { + //TODO this semantics check cannot be done on the parent tensor + break; + } +// default: + } +} diff --git a/src/graphblas/ascend/symbolTable.cpp b/src/graphblas/ascend/symbolTable.cpp new file mode 100644 index 000000000..68d5996ad --- /dev/null +++ b/src/graphblas/ascend/symbolTable.cpp @@ -0,0 +1,240 @@ +#include +#include +#include +#include + +namespace alp +{ + namespace internal + { + extern iGrid *igrid; + SymbolTable symbols; + } +} + +alp::internal::SymbolTable::SymbolTable() { + + TBuf_decl = false; + temp_scalar_id = 0; +} + +bool alp::internal::SymbolTable::existsTBufTensorDecl() const { + + return TBuf_decl; +} + +void alp::internal::SymbolTable::clearAll() { + + global_tensor_declarations.clear(); + local_tensor_declarations.clear(); + temp_tensor_declarations.clear(); + + // assuming that views are created locally in a forEach + // or is it possible to have views in a global scope? + viewToTensor.clear(); +} + +void alp::internal::SymbolTable::addGlobalTensor( const alp::Tensor &t ) { + + // TODO this semantics check is essentially unnecessary + // since global Tensors are not declared within forEach + if( internal::invalidAxes( t.getAxes() ) == true ) { + std::cerr << "The axes of the global Tensor must not be included in the axes of the forEach." << std::endl; + std::abort(); + } + + global_tensor_declarations.emplace( t.getName() , t ); + + all_global_tensors.emplace_back( t ); +} + +void alp::internal::SymbolTable::addLocalTensor( const alp::Tensor &t ) { + + if( internal::invalidAxes( t.getAxes() ) == true ) { + std::cerr << "The axes of the local Tensor must not be included in the axes of the forEach." 
<< std::endl; + std::abort(); + } + + TBuf_decl = true; + local_tensor_declarations.emplace( t.getName(), t ); + + reuseLocalTempTensorBuffer( t ); +} + +void alp::internal::SymbolTable::addTempTensor( const alp::Tensor &t ) { + + // TODO this semantics check is essentially unnecessary + // since temporary Tensors are declared internally + if( internal::invalidAxes( t.getAxes() ) == true ) { + std::cerr << "The axes of the temporary Tensor must not be included in the axes of the forEach." << std::endl; + std::abort(); + } + + TBuf_decl = true; + temp_tensor_declarations.emplace( t.getName(), t ); + + reuseLocalTempTensorBuffer( t ); +} + +void alp::internal::SymbolTable::addTensorView( const std::string &view_name, const std::string &parent_name ) { + + viewToTensor[ view_name ] = parent_name; +} + +/* +std::string alp::internal::SymbolTable::newTempScalar() { + + return "temp_scalar_" + std::to_string( temp_scalar_id++ ); +} +*/ + +void alp::internal::SymbolTable::addOutputTensor( const alp::Tensor &t ) { + + outputs_global_tensors.emplace_back( t ); +} + +void alp::internal::SymbolTable::printHostLogFile( std::stringstream &listOfGlobalTensors ) { + + bool first = true; + + for( auto it = all_global_tensors.begin(); it != all_global_tensors.end(); ++it ) { + + std::vector< int > axes = it->getAxes(); + for( auto jt = axes.begin(); jt != axes.end(); ++jt ) { + + if( first == true ) { + first = false; + } else { + listOfGlobalTensors << ","; + } + listOfGlobalTensors << *jt; + } + if ( std::find( outputs_global_tensors.cbegin(), outputs_global_tensors.cend(), *it ) == outputs_global_tensors.cend() ) { + listOfGlobalTensors << ",in"; + } else { + listOfGlobalTensors << ",out"; + } + } +} + +void alp::internal::SymbolTable::generateGlobalSymbols( std::stringstream &initFormalParam, + std::stringstream &customFormalParam, std::stringstream &allAccessedArg, + std::stringstream &allTempLocalDecl ) const { + + for( auto it = global_tensor_declarations.cbegin(); it != global_tensor_declarations.cend(); ++it ) { + if( it->first != global_tensor_declarations.cbegin()->first ) { + initFormalParam << ", "; + customFormalParam << ", "; + allAccessedArg << ", "; + } + initFormalParam << "GM_ADDR " << it->first; + // TODO data type needs to be parametrised + // TODO or MAYBE NOT? 
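+ // the init formal-parameter list receives each global tensor as a GM_ADDR handle, whereas the custom formal-parameter list below passes the same tensor as a raw uint8_t pointer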
+ customFormalParam << "uint8_t" << " * " << it->first; + allAccessedArg << it->first; + } + + for( auto it = temp_local_buffer_declarations.begin(); it != temp_local_buffer_declarations.end(); ++it ) { + allTempLocalDecl << "\t\t// Declaration of memory used for Local and Temporary tensor\n"; + allTempLocalDecl << "\t\tTBuf< QuePosition::VECCALC > " << it->first << "_temp_local_Buf;\n"; + allTempLocalDecl << "\t\tLocalTensor< " << it->first << " > " << it->first << "_temp_local;\n"; + allTempLocalDecl << "\n"; + } +} + +void alp::internal::SymbolTable::generateTempLocalInit( std::stringstream &allTempLocalInit ) const { + + for( auto it = temp_local_buffer_declarations.begin(); it != temp_local_buffer_declarations.end(); ++it ) { + allTempLocalInit << "\n"; + allTempLocalInit << "\t\t\t// Initialization of memory used for Local and Temporary tensor\n"; + allTempLocalInit << "\t\t\tpipe.InitBuffer( " << it->first + << "_temp_local_Buf, ( totWorkSpaceSize + " << it->second << " ) * sizeof( " << it->first << " ) );\n"; + allTempLocalInit << "\t\t\t" << it->first << "_temp_local = " + << it->first << "_temp_local_Buf.Get< " << it->first << " >();\n"; + } +} + +const alp::Tensor &alp::internal::SymbolTable::getTensorFromView( const alp::Tensor &tensor ) const { + + auto it = viewToTensor.find( tensor.getName() ); + // TODO: assume we have only one level of views, otherwise a loop is required + if( it != viewToTensor.cend() ) { + auto jt = global_tensor_declarations.find( it->second ); + if( jt != global_tensor_declarations.cend() ) { + return jt->second; + } else { + std::cerr << "Cannot handle a view of a non-global declaration" << std::endl; + std::abort(); + } + } else { + return tensor; + } +} + +std::string alp::internal::SymbolTable::getLocalTempTensorBuffer( Datatype type, const std::string &size ) { + + std::string datatype = internal::getDataType( type ); + + auto it = temp_local_buffer_declarations.find( datatype ); + if( it == temp_local_buffer_declarations.cend() ) { + temp_local_buffer_declarations.emplace( datatype, size ); + } else if ( size.empty() == false ) { + it->second.append( std::string( " + " ) + std::string( size ) ); + } + return datatype + "_temp_local"; +} + +void alp::internal::SymbolTable::reuseLocalTempTensorBuffer( const alp::Tensor &t ) { + + std::string datatype = internal::getDataType( t.getType() ); + const std::vector< int > &axes = t.getAxes(); + + assert( axes.size() < 3 ); + + std::string size; + if( axes.size() == 0 ) { + size = "( 32 / sizeof( "; + size.append( datatype ); + size.append( " ) )" ); + } else if( axes.size() == 1 ) { + size = igrid->problemSize( axes[ 0 ] ); + } else if( axes.size() == 2) { + size = igrid->problemSize( axes[ 0 ] ) + " * " + igrid->problemSize( axes[ 1 ] ); + } +/* + auto it = temp_local_buffer_declarations.find( datatype ); + if( it == temp_local_buffer_declarations.cend() ) { + temp_local_buffer_declarations.emplace( datatype, size ); + } else { + it->second.append( std::string( " + " ) + std::string( size ) ); + } +*/ + ( void ) getLocalTempTensorBuffer( t.getType(), size ); +} + +void alp::internal::SymbolTable::debug_print() const { + + std::cerr << "\nGLOBAL: "; + for( auto it = global_tensor_declarations.cbegin(); it != global_tensor_declarations.cend(); ++it ) { + std::cerr << it->first << "(" << alp::internal::getScope( it->second.getScope() ) << "), "; + + } + + std::cerr << "\nLOCAL: "; + for( auto it = local_tensor_declarations.cbegin(); it != local_tensor_declarations.cend(); ++it ) { + std::cerr << it->first 
<< "(" << alp::internal::getScope( it->second.getScope() ) << "), "; + } + + std::cerr << "\nTEMP: "; + for( auto it = temp_tensor_declarations.cbegin(); it != temp_tensor_declarations.cend(); ++it ) { + std::cerr << it->first << "(" << alp::internal::getScope( it->second.getScope() ) << "), "; + } + + std::cerr << "\nVIEW: "; + for( auto it = viewToTensor.cbegin(); it != viewToTensor.cend(); ++it ) { + std::cerr << it->first << "( of " << it->second << "), "; + + } + + std::cerr << std::endl << std::endl << std::endl; +} diff --git a/src/graphblas/ascend/tensor.cpp b/src/graphblas/ascend/tensor.cpp new file mode 100644 index 000000000..ca2d9c264 --- /dev/null +++ b/src/graphblas/ascend/tensor.cpp @@ -0,0 +1,183 @@ +#include +#include +//#include +#include //TODO forEachLevel +#include +#include + + +namespace alp +{ + namespace internal + { + extern SymbolTable symbols; + } +} + +size_t alp::Tensor::tensor_id = 0; + +bool alp::Tensor::operator==( const Tensor &t ) const { + return this->name == t.name; +} + +void alp::Tensor::operator=( const ReductionOperation& op ) { + foldl( *this, op.input, op.opName, op.axes ); +} + +void alp::Tensor::operator=( const ApplyOperation& op ) { + apply( *this, op.input1, op.input2, op.opName, op.axes ); +} + +alp::Tensor::Tensor( const Datatype _type, const std::vector< int > &_axes ) noexcept + : id( tensor_id++ ), + name( std::string("tensor") + std::to_string( id ) ), + type( _type ), + scope( internal::OpGen::forEachLevel > 0 ? internal::Scope::LOCAL : internal::Scope::GLOBAL ), + axes( _axes ) { + if( internal::OpGen::forEachLevel > 0 ) { + internal::symbols.addLocalTensor( *this ); + } else { + /* + for( auto it = axes.begin(); it != axes.end(); ++it ) { + if( it != axes.begin() ) { + internal::OpGen::output_host_log << ","; + } + internal::OpGen::output_host_log << *it; + } + */ + internal::symbols.addGlobalTensor( *this ); + } +} + +alp::Tensor::Tensor( const Tensor&parent, const std::vector< int > &_axes ) noexcept + : id( tensor_id++ ), + name( "view_" + std::to_string( id ) + "_of_" + parent.getName() ), + type( parent.getType() ), + scope( internal::Scope::VIEW ), + axes( _axes ) { + // TODO Is it okay to have a view with empty Axes? 
+ internal::symbols.addTensorView( name, parent.getName() ); +} + +alp::Tensor::Tensor( const Tensor &t ) noexcept + : id( t.id ), + name( t.name ), + type( t.type ), + scope( t.scope ), + axes( t.axes ) { + +} + +alp::Tensor::Tensor( const std::vector< int > &_axes, const Datatype _type ) noexcept + : id( tensor_id++ ), + name( std::string("tensor") + std::to_string( id ) ), + type( _type ), + scope( internal::Scope::TEMP ), + axes( _axes ) { + internal::symbols.addTempTensor( *this ); +} + +size_t alp::Tensor::getID() const { + return id; +} + +const std::string &alp::Tensor::getName() const { + return name; +} + +alp::Datatype alp::Tensor::getType() const { + return type; +} + +alp::internal::Scope alp::Tensor::getScope() const { + return scope; +} + +const std::vector< int > &alp::Tensor::getAxes() const { + return axes; +} + +bool alp::Tensor::isGlobalDecl() const { + + const Tensor tensor = internal::symbols.getTensorFromView( *this ); + + return tensor.scope == internal::Scope::GLOBAL; +} + +bool alp::Tensor::isLocalDecl() const { + return scope == internal::Scope::LOCAL; +} + +bool alp::Tensor::isTempDecl() const { + return scope == internal::Scope::TEMP; +} + +std::string alp::Tensor::getAccessedElement( size_t id ) const { + + // if this tensor is a view, find its parent tensor + const Tensor tensor = internal::symbols.getTensorFromView( *this ); + + // make a decision based on the scope of the parent tensor + switch( tensor.scope ) { + case internal::Scope::GLOBAL: + return "Gm_local_" + tensor.name + "_" + std::to_string( id ); + case internal::Scope::LOCAL: + return internal::getDataType( type ) + "_temp_local[ local_" + tensor.name + "_" + std::to_string( id ) + " ]"; + case internal::Scope::TEMP: + return internal::getDataType( type ) + "_temp_local[ temp_" + tensor.name + "_" + std::to_string( id ) + " ]"; + case internal::Scope::VIEW: + default: + std::cerr << "ERROR in the declaration " << name << " of getAccessedElement" << std::endl; + std::abort(); + break; + } +} + +std::string alp::Tensor::getAscendName( size_t id ) const { + + switch( scope ) { + case internal::Scope::GLOBAL: + return "Gm_local_" + name + "_" + std::to_string( id ); + case internal::Scope::LOCAL: + return "local_" + name + "_" + std::to_string( id ); + case internal::Scope::TEMP: + return "temp_" + name + "_" + std::to_string( id ); + case internal::Scope::VIEW: + default: + std::cerr << "ERROR in the symbol table, the declaration " << name << " was not found" << std::endl; + std::abort(); + } +} + +std::string alp::Tensor::getAscendGlobalName( size_t id ) const { + + switch( scope ) { + case internal::Scope::GLOBAL: + return "Gm_" + name + "_" + std::to_string( id ); + case internal::Scope::LOCAL: + case internal::Scope::TEMP: + case internal::Scope::VIEW: + default: + std::cerr << "ERROR: declaration " << name << " is not global" << std::endl; + std::abort(); + + } +} + +std::string alp::Tensor::getTQueBufName( size_t id ) const { + + switch( scope ) { + case internal::Scope::GLOBAL: + return "globalQue_" + name + "_" + std::to_string( id ); + case internal::Scope::LOCAL: + return "localBuf_" + name + "_" + std::to_string( id ); + case internal::Scope::TEMP: + return "tempBuf_" + name + "_" + std::to_string( id ); + case internal::Scope::VIEW: + default: + std::cerr << "ERROR in the declaration " << name << " of getTQueBufName" << std::endl; + std::abort(); + break; + } +} + diff --git a/src/graphblas/ascend/utils.cpp b/src/graphblas/ascend/utils.cpp new file mode 100644 index 000000000..a0cabcb4b 
--- /dev/null +++ b/src/graphblas/ascend/utils.cpp @@ -0,0 +1,114 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +namespace alp { + + namespace internal { + + std::string getDataType( const Datatype dtype ) { + switch( dtype ) { + case alp::Datatype::FP16: + return "half"; + case alp::Datatype::FP32: + return "single"; + case alp::Datatype::VIEW_TYPE: + return "VIEW_TYPE"; + case alp::Datatype::NO_TYPE: + return "NO_TYPE"; + } + std::cerr << "Unknown datatype: " << (int) dtype << std::endl; + std::abort(); + } + + std::string getScope( const Scope scope ) { + switch( scope ) { + case alp::internal::Scope::GLOBAL: + return "GLOBAL"; + case alp::internal::Scope::LOCAL: + return "LOCAL"; + case alp::internal::Scope::TEMP: + return "TEMP"; + case alp::internal::Scope::VIEW: + return "VIEW"; + } + std::cerr << "Unknown scope: " << (int) scope << std::endl; + std::abort(); + } + + std::vector< int > vectorOfVectorsToVector( const std::vector< std::vector< int > > &vector_of_sets ) { + std::vector< int > vec; + for( auto it = vector_of_sets.begin(); it != vector_of_sets.end(); ++it ) { + for( auto jt = it->begin(); jt != it->end(); ++jt ) { + vec.push_back( *jt ); + } + } + return vec; + } + + std::vector< int > vectorDifference( const std::vector< int > &vector1, const std::vector< int > &vector2 ) { + std::vector< int > diff; + for( auto it = vector1.begin(); it != vector1.end(); ++it ) { + if( std::find( vector2.begin(), vector2.end(), *it ) == std::end( vector2 ) ) { + diff.push_back( *it ); + } + } + return diff; + } + + bool vectorSubset( const std::vector< int > &vector1, const std::vector< int > &vector2 ) { + for( auto it = vector1.begin(); it != vector1.end(); ++it ) { + if( std::find( vector2.begin(), vector2.end(), *it ) == std::end( vector2 ) ) { + return false; + } + } + return true; + } + + std::vector< int > vectorUnion( const std::vector< int > &vector1, const std::vector< int > &vector2 ) { + // create copies for the sorting part below + std::vector< int > v1 = vector1; + std::vector< int > v2 = vector2; + std::vector< int > vec_union; + + // the vectors must be sorted here before using set_union + // but perhaps this is not what we want + // on the other hand is unclear which order to maintain + std::sort( v1.begin(), v1.end() ); + std::sort( v2.begin(), v2.end() ); + + std::set_union( + v1.begin(), + v1.end(), + v2.begin(), + v2.end(), + std::inserter( vec_union, vec_union.end() ) + ); + + return vec_union; + } + + std::vector< int > intArgsToVector( const int arg ) { + std::vector< int > set1; + set1.push_back( arg ); + return set1; + } + } +} diff --git a/src/graphblas/nonblocking/lazy_evaluation.cpp b/src/graphblas/nonblocking/lazy_evaluation.cpp index a78de8933..bc5f23a81 100644 --- a/src/graphblas/nonblocking/lazy_evaluation.cpp +++ b/src/graphblas/nonblocking/lazy_evaluation.cpp @@ -62,15 +62,22 @@ grb::RC LazyEvaluation::addStage( const Pipeline::stage_type &&func, Opcode 
opcode, const size_t n, const size_t data_type_size, const bool dense_descr, const bool dense_mask, + // TODO FIXME is there really a need for pointers? + const size_t output_vector_id, void * const output_vector_ptr, void * const output_aux_vector_ptr, Coordinates< nonblocking > * const coor_output_ptr, Coordinates< nonblocking > * const coor_output_aux_ptr, + const size_t input_a_id, const size_t input_b_id, + const size_t input_c_id, const size_t input_d_id, + // TODO FIXME is there really a need for pointers? const void * const input_a_ptr, const void * const input_b_ptr, const void * const input_c_ptr, const void * const input_d_ptr, const Coordinates< nonblocking > * const coor_a_ptr, const Coordinates< nonblocking > * const coor_b_ptr, const Coordinates< nonblocking > * const coor_c_ptr, const Coordinates< nonblocking > * const coor_d_ptr, + const size_t input_matrix_id, + // TODO FIXME is there really a need for pointers? const void * const input_matrix ) { RC ret = SUCCESS; @@ -271,6 +278,7 @@ grb::RC LazyEvaluation::addStage( std::vector< Pipeline >::iterator pt = pipelines.begin(); pt != pipelines.end(); pt++ ) { + //std::cerr << "*** tic\n"; DBG if( ( *pt ).empty() ) { empty_pipeline = &( *pt ); @@ -279,14 +287,17 @@ grb::RC LazyEvaluation::addStage( } if( empty_pipeline != nullptr ) { - ( *empty_pipeline).addStage( + //std::cerr << "*** le 1\n"; DBG + ( *empty_pipeline ).addStage( std::move( func ), opcode, n, data_type_size, dense_descr, dense_mask, + output_vector_id, output_vector_ptr, output_aux_vector_ptr, coor_output_ptr, coor_output_aux_ptr, + input_a_id, input_b_id, input_c_id, input_d_id, input_a_ptr, input_b_ptr, input_c_ptr, input_d_ptr, coor_a_ptr, coor_b_ptr, coor_c_ptr, coor_d_ptr, - input_matrix + input_matrix_id, input_matrix ); // we always execute the pipeline when a scalar is returned @@ -296,13 +307,17 @@ grb::RC LazyEvaluation::addStage( } else { Pipeline pipeline; + //std::cerr << "*** le 2\n"; DBG pipeline.addStage( std::move( func ), opcode, n, data_type_size, dense_descr, dense_mask, + output_vector_id, output_vector_ptr, output_aux_vector_ptr, coor_output_ptr, coor_output_aux_ptr, + input_a_id, input_b_id, input_c_id, input_d_id, input_a_ptr, input_b_ptr, input_c_ptr, input_d_ptr, coor_a_ptr, coor_b_ptr, coor_c_ptr, coor_d_ptr, + input_matrix_id, input_matrix ); @@ -321,13 +336,17 @@ grb::RC LazyEvaluation::addStage( // the stage is added in the current pipeline which may be empty if it // overwrites the input of SpMV // it is not necessary to deallocate/release this pipeline + //std::cerr << "*** le 3\n"; DBG ( *ptr ).addStage( std::move( func ), opcode, n, data_type_size, dense_descr, dense_mask, + output_vector_id, output_vector_ptr, output_aux_vector_ptr, coor_output_ptr, coor_output_aux_ptr, + input_a_id, input_b_id, input_c_id, input_d_id, input_a_ptr, input_b_ptr, input_c_ptr, input_d_ptr, coor_a_ptr, coor_b_ptr, coor_c_ptr, coor_d_ptr, + input_matrix_id, input_matrix ); @@ -352,13 +371,17 @@ grb::RC LazyEvaluation::addStage( // the stage is added in the merged pipeline // it is not necessary to deallocate/release this pipeline + // std::cerr << "*** le 4\n"; DBG ( *union_pipeline ).addStage( std::move( func ), opcode, n, data_type_size, dense_descr, dense_mask, + output_vector_id, output_vector_ptr, output_aux_vector_ptr, coor_output_ptr, coor_output_aux_ptr, + input_a_id, input_b_id, input_c_id, input_d_id, input_a_ptr, input_b_ptr, input_c_ptr, input_d_ptr, coor_a_ptr, coor_b_ptr, coor_c_ptr, coor_d_ptr, + input_matrix_id, input_matrix ); 
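The hunks above thread the new container identifiers through LazyEvaluation::addStage; the pipeline.cpp hunk below records them per stage via stage_output and stage_inputs. A minimal, self-contained sketch of that bookkeeping pattern follows; the StageRegistry type and record function are hypothetical illustrations for this document only, not ALP code:

	#include <cstddef>
	#include <utility>
	#include <vector>

	// hypothetical sketch: keep, for every stage, the ID of the container it writes
	// and the IDs of the containers it reads, alongside whatever pointers are stored
	struct StageRegistry {
		std::vector< std::size_t > stage_output;                 // one output container ID per stage
		std::vector< std::vector< std::size_t > > stage_inputs;  // input container IDs per stage

		void record( const std::size_t output_id, std::vector< std::size_t > input_ids ) {
			stage_output.push_back( output_id );
			stage_inputs.push_back( std::move( input_ids ) );
		}
	};

	int main() {
		StageRegistry reg;
		reg.record( 2, { 0, 1 } );  // e.g. a stage that writes container 2 and reads containers 0 and 1
		return ( reg.stage_output.size() == 1 && reg.stage_inputs.size() == 1 ) ? 0 : 1;
	}
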
diff --git a/src/graphblas/nonblocking/pipeline.cpp b/src/graphblas/nonblocking/pipeline.cpp index 711d65f07..9b0fd82e7 100644 --- a/src/graphblas/nonblocking/pipeline.cpp +++ b/src/graphblas/nonblocking/pipeline.cpp @@ -274,15 +274,22 @@ void Pipeline::addStage( const Pipeline::stage_type &&func, const Opcode opcode, const size_t n, const size_t data_type_size, const bool dense_descr, const bool dense_mask, + const size_t output_vector_id, + // TODO FIXME is there really a need for pointers? void * const output_vector_ptr, void * const output_aux_vector_ptr, Coordinates< nonblocking > * const coor_output_ptr, Coordinates< nonblocking > * const coor_output_aux_ptr, + const size_t input_a_id, const size_t input_b_id, + const size_t input_c_id, const size_t input_d_id, + // TODO FIXME is there really a need for pointers? const void * const input_a_ptr, const void * const input_b_ptr, const void * const input_c_ptr, const void * const input_d_ptr, const Coordinates< nonblocking > * const coor_a_ptr, const Coordinates< nonblocking > * const coor_b_ptr, const Coordinates< nonblocking > * const coor_c_ptr, const Coordinates< nonblocking > * const coor_d_ptr, + const size_t input_matrix_id, + // TODO FIXME is there really a need for pointers? const void * const input_matrix ) { assert( stages.size() != 0 || containers_size == 0); @@ -305,39 +312,50 @@ void Pipeline::addStage( size_of_data_type = data_type_size; } +#ifndef NDEBUG + const size_t num_stage = stages.size(); +#endif stages.push_back( std::move( func ) ); opcodes.push_back( opcode ); + assert( opcodes.size() == num_stage + 1 ); if( output_vector_ptr != nullptr ) { output_vectors.insert( output_vector_ptr ); + stage_output.push_back( output_vector_id ); } if( output_aux_vector_ptr != nullptr ) { output_vectors.insert( output_aux_vector_ptr ); + std::cerr << "Warning: ALP/Ascend does not handle output_aux_vectors yet, please submit a bug report\n"; } // special treatment for an SpMV operation as the input must not be overwritten // by another stage of the pipeline + std::vector< size_t > inputIDs; if( opcode == Opcode::BLAS2_VXM_GENERIC ) { if( input_a_ptr != nullptr ) { input_vectors.insert( input_a_ptr ); vxm_input_vectors.insert( input_a_ptr ); + inputIDs.push_back( input_a_id ); } if( input_b_ptr != nullptr ) { input_vectors.insert( input_b_ptr ); vxm_input_vectors.insert( input_b_ptr ); + inputIDs.push_back( input_b_id ); } if( input_c_ptr != nullptr ) { input_vectors.insert( input_c_ptr ); vxm_input_vectors.insert( input_c_ptr ); + inputIDs.push_back( input_c_id ); } if( input_d_ptr != nullptr ) { input_vectors.insert( input_d_ptr ); vxm_input_vectors.insert( input_d_ptr ); + inputIDs.push_back( input_d_id ); } // in the current implementation that supports level-1 and level-2 operations @@ -346,24 +364,32 @@ void Pipeline::addStage( // moved if( input_matrix != nullptr ) { input_matrices.insert( input_matrix ); + inputIDs.push_back( input_matrix_id ); } } else { if( input_a_ptr != nullptr ) { input_vectors.insert( input_a_ptr ); + inputIDs.push_back( input_a_id ); } if( input_b_ptr != nullptr ) { input_vectors.insert( input_b_ptr ); + inputIDs.push_back( input_b_id ); } if( input_c_ptr != nullptr ) { input_vectors.insert( input_c_ptr ); + inputIDs.push_back( input_c_id ); } if( input_d_ptr != nullptr ) { input_vectors.insert( input_d_ptr ); + inputIDs.push_back( input_d_id ); } } + assert( inputIDs.size() != 0 ); + stage_inputs.push_back( inputIDs ); + assert( stage_inputs.size() == num_stage + 1 ); // update all the sets of the
pipeline by adding the entries of the new stage if( coor_a_ptr != nullptr ) { @@ -755,6 +781,8 @@ grb::RC Pipeline::verifyDenseDescriptor() { } grb::RC Pipeline::execution() { + //throw std::runtime_error( "DBG" ); + RC ret = SUCCESS; // if the pipeline is empty, nothing needs to be executed diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d90ca5cdb..1342fde7c 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -125,6 +125,39 @@ if( WITH_NONBLOCKING_BACKEND ) endif( WITH_NONBLOCKING_BACKEND ) +if( WITH_ASCEND_BACKEND ) + + assert_defined_targets( backend_shmem_static ) + + ## ascend static + add_library( backend_ascend_static INTERFACE ) + target_link_libraries( backend_ascend_static INTERFACE backend_shmem_static ) + target_link_libraries( backend_ascend_static INTERFACE backend_ascend_headers ) + target_compile_definitions( backend_ascend_static INTERFACE "${ASCEND_SELECTION_DEFS}" ) + + install( TARGETS backend_ascend_static + EXPORT GraphBLASTargets + ARCHIVE DESTINATION "${SHMEM_BACKEND_INSTALL_DIR}" + ) + + ## ascend shared + add_library( backend_ascend_shared INTERFACE ) + target_link_libraries( backend_ascend_shared INTERFACE backend_shmem_shared ) + target_link_libraries( backend_ascend_shared INTERFACE backend_ascend_headers ) + target_compile_definitions( backend_ascend_shared INTERFACE "${ASCEND_SELECTION_DEFS}" ) + + install( TARGETS backend_ascend_shared + EXPORT GraphBLASTargets + LIBRARY DESTINATION "${SHMEM_BACKEND_INSTALL_DIR}" + ) + + # this is an alias for add_grb_executables() to select the backend to link against + # DO NOT CHANGE THE ALIAS NAME! + add_library( "${ASCEND_BACKEND_DEFAULT_NAME}" ALIAS backend_ascend_static ) + +endif( WITH_ASCEND_BACKEND ) + + # library with utilities for tests, to be used optionally # i.e. 
NOT linked by default add_library( test_utils_headers INTERFACE ) diff --git a/tests/unit/asc_analytic_model.cpp b/tests/unit/asc_analytic_model.cpp new file mode 100644 index 000000000..404ff26cb --- /dev/null +++ b/tests/unit/asc_analytic_model.cpp @@ -0,0 +1,184 @@ + +#include "analytic_model.hpp" + +#include +#include + + +int main() { + { + // this is a 1D problem over 10 cores and 1M problem size, with a fictional + // cache size of 5000 bytes + asc::AnalyticModel< 1, 1, false > am( 5000, {10}, {1000000}, {true} ); + // cannot test minor tensors for 1D problems (TODO test elsewhere) + // add three float vectors + am.addGlobalTensor( 4, {true} ); + am.addGlobalTensor( 4, {true} ); + am.addGlobalTensor( 4, {true} ); + // suppose we just add them + am.setNumStages( 1 ); + // this problem should be feasible: + // - every processing unit gets 100000 elements per vector + // - their byte size is 400000 per vector + // - there are three vectors of size 1200000 bytes total + // - block size that maximises reuse is 5000 / 12 = 416 + try { + const size_t bsize = am.getBlockSize( 0 ); + std::cout << "Test case 1: suggested block size is " << bsize << ", "; + if( bsize != 416 ) { + std::cout << "x\n"; + std::ostringstream oss; + oss << "Expected block size 416, got " << bsize << " instead"; + throw std::runtime_error( oss.str() ); + } else { + std::cout << "v\n"; + } + } catch( const std::exception &e ) { + std::cerr << "Error during test case 1: " << e.what() << std::endl; + std::cout << "Test FAILED\n" << std::endl; + return 10; + } + } + { + // this is a 1D problem over a 2D 2 x 5 process mesh with otherwise the same + // test parameters as the above test + asc::AnalyticModel< 2, 1, false > am( 5000, {2,5}, {1000000}, {true} ); + am.addGlobalTensor( 4, {true} ); + am.addGlobalTensor( 4, {true} ); + am.addGlobalTensor( 4, {true} ); + am.setNumStages( 1 ); + try { + const size_t bsize = am.getBlockSize( 0 ); + std::cout << "Test case 2: suggested block size is " << bsize << ", "; + if( bsize != 416 ) { + std::cout << "x\n"; + std::ostringstream oss; + oss << "Expected block size 416, got " << bsize << " instead"; + throw std::runtime_error( oss.str() ); + } else { + std::cout << "v\n"; + } + } catch( const std::exception &e ) { + std::cerr << "Error during test case 2: " << e.what() << std::endl; + std::cout << "Test FAILED\n" << std::endl; + return 20; + } + } + { + // this is a 1D problem over a 5D 1 x 1 x 1 x 2 x 5 process mesh with + // otherwise the same test parameters as the above test + asc::AnalyticModel< 5, 1, false > am( 5000, {1,1,1,2,5}, {1000000}, {true} ); + am.addGlobalTensor( 4, {true} ); + am.addGlobalTensor( 4, {true} ); + am.addGlobalTensor( 4, {true} ); + am.setNumStages( 1 ); + try { + const size_t bsize = am.getBlockSize( 0 ); + std::cout << "Test case 3: suggested block size is " << bsize << ", "; + if( bsize != 416 ) { + std::cout << "x\n"; + std::ostringstream oss; + oss << "Expected block size 416, got " << bsize << " instead"; + throw std::runtime_error( oss.str() ); + } else { + std::cout << "v\n"; + } + } catch( const std::exception &e ) { + std::cerr << "Error during test case 3: " << e.what() << std::endl; + std::cout << "Test FAILED\n" << std::endl; + return 30; + } + } + { + // test a 1D case where a trivial solution is possible + asc::AnalyticModel< 1, 1, false > am( 24000, {10}, {10000}, {true} ); + am.addGlobalTensor( 8, {true} ); + am.addGlobalTensor( 8, {true} ); + am.setNumStages( 1 ); + try { + const size_t bsize = am.getBlockSize( 0 ); + std::cout << 
"Test case 4: suggested block size is " << bsize << ", "; + if( bsize != 1000 ) { + std::cout << "x\n"; + std::ostringstream oss; + oss << "Expected block size 10000, got " << bsize << " instead"; + throw std::runtime_error( oss.str() ); + } else { + std::cout << "v\n"; + } + } catch( const std::exception &e ) { + std::cerr << "Error during test case 4: " << e.what() << std::endl; + std::cout << "Test FAILED\n" << std::endl; + return 40; + } + } + { + // test a 1D case where a trivial solution is possible + asc::AnalyticModel< 1, 1, false > am( 3003, {1}, {1001}, {true} ); + am.addGlobalTensor( 3, {true} ); + am.setNumStages( 1 ); + try { + const size_t bsize = am.getBlockSize( 0 ); + std::cout << "Test case 5: suggested block size is " << bsize << ", "; + if( bsize != 1001 ) { + std::cout << "x\n"; + std::ostringstream oss; + oss << "Expected block size 1001, got " << bsize << " instead"; + throw std::runtime_error( oss.str() ); + } else { + std::cout << "v\n"; + } + } catch( const std::exception &e ) { + std::cerr << "Error during test case 5: " << e.what() << std::endl; + std::cout << "Test FAILED\n" << std::endl; + return 50; + } + } + { + // test for the other trivial (worst-case trivial) solution, 1D + asc::AnalyticModel< 1, 1, false > am( 32, {8}, {2538791}, {true} ); + am.addGlobalTensor( 8, {true} ); + am.addGlobalTensor( 8, {true} ); + am.addGlobalTensor( 8, {true} ); + am.addGlobalTensor( 8, {true} ); + am.setNumStages( 2 ); + try { + const size_t bsize = am.getBlockSize( 0 ); + std::cout << "Test case 6: suggested block size is " << bsize << ", "; + if( bsize != 1 ) { + std::cout << "x\n"; + std::ostringstream oss; + oss << "Expected block size 1, got " << bsize << " instead"; + throw std::runtime_error( oss.str() ); + } else { + std::cout << "v\n"; + } + } catch( const std::exception &e ) { + std::cerr << "Error during test case 6: " << e.what() << std::endl; + std::cout << "Test FAILED\n" << std::endl; + return 60; + } + } + { + // test with no feasible solution, 1D + asc::AnalyticModel< 1, 1, false > am( 1, {8}, {2538791}, {true} ); + am.addGlobalTensor( 1, {true} ); + am.addGlobalTensor( 1, {true} ); + am.setNumStages( 1 ); + try { + const size_t bsize = am.getBlockSize( 0 ); + std::cout << "Test case 7: suggested block size is " << bsize << ", x\n"; + std::cerr << "Error during test case 7: a blocksize was returned even " + << "though the problem is infeasible" << std::endl; + std::cout << "Test FAILED\n" << std::endl; + return 70; + } catch( ... ) { + std::cout << "Test case 7: infeasibility correctly detected\n"; + } + } + + // done + std::cout << "Test OK\n" << std::endl; + return 0; +} +