diff --git a/include/graphblas/base/io.hpp b/include/graphblas/base/io.hpp
index 493dfcfea..c904570fc 100644
--- a/include/graphblas/base/io.hpp
+++ b/include/graphblas/base/io.hpp
@@ -1218,15 +1218,17 @@ namespace grb {
 * -# #grb::descriptors::invert_mask, and
 * -# #grb::descriptors::structural.
 *
- * However, and differently from most ALP primtivies, the
+ * However, and differently from most ALP/GraphBLAS primitives, the
 * #grb::descriptors::invert_mask and #grb::descriptors::structural are
 * mutually exclusive for this primitive.
 * \endparblock
 *
 * @tparam OutputType The type of each element in the given matrix.
 * @tparam MaskType The type of each element in the given mask.
- * @tparam ValueType The type of the given value. Should be convertible
- * to \a OutputType.
+ * @tparam ValueType The type of the given value.
+ *
+ * The given \a ValueType must be convertible to \a OutputType.
+ *
 * @tparam RIT The integer type for encoding row indices.
 * @tparam CIT The integer type for encoding column indices.
 * @tparam NIT The integer type for encoding nonzero indices.
@@ -1244,51 +1246,50 @@ namespace grb {
 * the default is #grb::EXECUTE.
 *
 * In #grb::RESIZE mode:
- * @returns #grb::SUCCESS When the capacity of \a C has been (made) sufficient
- * to store the requested output.
+ *
+ * @returns #grb::SUCCESS When the capacity of \a C has been made, or already
+ * was, sufficient to store the requested output.
 * @returns #grb::OUTOFMEM When out-of-memory conditions have been met while
- * executing. If this error code is returned, \a C
+ * resizing \a C. If this error code is returned, \a C
 * shall be unmodified compared to its state at
 * function entry.
 *
 * In #grb::EXECUTE mode:
- * @returns #grb::SUCCESS When the call completes successfully.
+ *
+ * @returns #grb::SUCCESS When the computation has completed or will execute
+ * successfully.
 * @returns #grb::ILLEGAL When \a C did not have enough capacity to store the
 * output of the requested computation.
 *
 * Either mode may additionally return:
- * @returns #grb::ILLEGAL In case the given \a mask was empty.
+ *
 * @returns #grb::MISMATCH In case \a C and \a mask have mismatching sizes.
 * @returns #grb::PANIC In case an unmitigable error was encountered. The
- * caller is suggested to exit gracefully, and in any
- * case to not make any further calls to ALP.
+ * caller, when encountering this return code, is
+ * suggested to exit gracefully and to not make any
+ * further calls to ALP.
 *
 * When \a descr includes #grb::descriptors::no_casting then code shall not
- * compile if one of the following conditions are met:
+ * compile if one of the following conditions is met:
 * -# \a ValueType does not match \a OutputType; or
 * -# \a MaskType does not match bool.
 *
- * In these cases, the code shall not compile: implementations must throw
- * a static assertion failure in this case.
- *
 * Similarly, it is forbidden to call this function with both following
 * descriptors simultaneously:
 * - #grb::descriptors::invert_mask \em and #grb::descriptors::structural.
 *
 * The use of the #grb::descriptors::structural_complement descriptor hence is
- * is forbidden also. Implementations shall throw a static assertion failure
- * if the user nonetheless asks for structural mask inversion.
+ * forbidden also. These conditions, when encountered, should lead to
+ * compile-time errors.
+ *
+ * \note One way to ensure these cases do not compile is via a
+ * static_assert.
 *
 * \parblock
 * \par Performance semantics
 * Each backend must define performance semantics for this primitive.
 *
 * @see perfSemantics
- *
- * \warning Generally, if \a mask equals \a C and the mask is non-structural,
- * then optimised implementations will assign higher costs than when
- * \a mask does not equal \a C. This is because the nonzero structure
- * update cannot be done in-place.
 * \endparblock
 */
 template<
@@ -1320,6 +1321,141 @@ namespace grb {
 return UNSUPPORTED;
 }
 
+ /**
+ * Sets all values of a matrix to those of a given source matrix, if and only if
+ * the corresponding value coordinates evaluate true at the given mask
+ * matrix.
+ *
+ * @tparam descr The descriptor used for this operation.
+ *
+ * \parblock
+ * \par Accepted descriptors
+ * -# #grb::descriptors::no_operation,
+ * -# #grb::descriptors::no_casting,
+ * -# #grb::descriptors::invert_mask, and
+ * -# #grb::descriptors::structural.
+ *
+ * However, and differently from most ALP/GraphBLAS primitives, the
+ * #grb::descriptors::invert_mask and #grb::descriptors::structural are
+ * mutually exclusive for this primitive.
+ * \endparblock
+ *
+ * @tparam OutputType The type of each element in the destination matrix.
+ * @tparam MaskType The type of each element in the output mask.
+ * @tparam ValueType The type of each element in the source matrix.
+ *
+ * The given \a ValueType must be convertible to \a OutputType.
+ *
+ * \internal
+ * @tparam RIT1 The integer type for encoding row indices in \a C.
+ * @tparam CIT1 The integer type for encoding column indices in \a C.
+ * @tparam NIT1 The integer type for encoding nonzero indices in \a C.
+ * @tparam RIT2 The integer type for encoding row indices in \a mask.
+ * @tparam CIT2 The integer type for encoding column indices in \a mask.
+ * @tparam NIT2 The integer type for encoding nonzero indices in \a mask.
+ * @tparam RIT3 The integer type for encoding row indices in \a A.
+ * @tparam CIT3 The integer type for encoding column indices in \a A.
+ * @tparam NIT3 The integer type for encoding nonzero indices in \a A.
+ * @tparam backend The backend selected for executing this primitive.
+ * \endinternal
+ *
+ * @param[out] C The matrix that will be a masked copy of \a A.
+ * @param[in] mask Matrix that acts as output mask on \a C.
+ * @param[in] A The source matrix which will (partially) be copied to
+ * \a C.
+ * @param[in] phase Which #grb::Phase of the operation is requested. Optional;
+ * the default is #grb::EXECUTE.
+ *
+ * In #grb::RESIZE mode:
+ *
+ * @returns #grb::SUCCESS When the capacity of \a C has been made, or already
+ * was, sufficient to store the requested output.
+ * @returns #grb::OUTOFMEM When out-of-memory conditions were met while
+ * resizing \a C. If this error code is returned, \a C
+ * shall be left unmodified compared to its state at
+ * function entry.
+ *
+ * In #grb::EXECUTE mode:
+ *
+ * @returns #grb::SUCCESS When the computation has completed or will execute
+ * successfully.
+ * @returns #grb::ILLEGAL When \a C did not have enough capacity to store the
+ * output of the requested computation.
+ *
+ * Either mode may additionally return:
+ *
+ * @returns #grb::MISMATCH When \a A and \a C have mismatching sizes.
+ * @returns #grb::MISMATCH When \a C and \a mask have mismatching sizes.
+ * @returns #grb::PANIC In case an unmitigable error was encountered. The
+ * caller, when encountering this return code, is
+ * suggested to exit the program gracefully and to not
+ * make any further calls to ALP.
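+ *
+ * \par Example
+ * A minimal sketch of intended use (the container names and dimensions are
+ * hypothetical; a Boolean mask is assumed):
+ * \code
+ * grb::Matrix< double > C( m, n ), A( m, n );
+ * grb::Matrix< bool > mask( m, n );
+ * // copy those values of A into C where mask evaluates true
+ * grb::RC rc = grb::set( C, mask, A, grb::RESIZE );
+ * if( rc == grb::SUCCESS ) {
+ *     rc = grb::set( C, mask, A, grb::EXECUTE );
+ * }
+ * \endcode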
+ *
+ * When \a descr includes #grb::descriptors::no_casting, then code shall not
+ * compile if one of the following conditions is met:
+ * -# \a ValueType does not match \a OutputType; or
+ * -# \a MaskType does not match bool.
+ *
+ * Similarly, it is forbidden to call this function with both following
+ * descriptors simultaneously:
+ * - #grb::descriptors::invert_mask \em and #grb::descriptors::structural.
+ *
+ * The use of the #grb::descriptors::structural_complement descriptor hence
+ * is forbidden also. These conditions should lead to compile-time errors.
+ *
+ * \note One way to ensure these cases do not compile is via a
+ * static_assert.
+ *
+ * \parblock
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ * \endparblock
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType, typename MaskType, typename ValueType,
+ typename RIT1, typename CIT1, typename NIT1,
+ typename RIT2, typename CIT2, typename NIT2,
+ typename RIT3, typename CIT3, typename NIT3,
+ Backend backend
+ >
+ RC set(
+ Matrix< OutputType, backend, RIT1, CIT1, NIT1 > &C,
+ const Matrix< MaskType, backend, RIT2, CIT2, NIT2 > &mask,
+ const Matrix< ValueType, backend, RIT3, CIT3, NIT3 > &A,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< ValueType >::value &&
+ !grb::is_object< MaskType >::value,
+ void >::type * const = nullptr
+ ) noexcept {
+#ifndef NDEBUG
+ const bool should_not_call_base_matrix_masked_matrix_set = false;
+ assert( should_not_call_base_matrix_masked_matrix_set );
+#endif
+ (void) C;
+ (void) mask;
+ (void) A;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
 /**
 * Sets the element of a given vector at a given position to a given value.
 *
@@ -1763,7 +1886,15 @@ namespace grb {
 * successfully, then a call to this function shall return #grb::SUCCESS.
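+ *
+ * \par Example
+ * A minimal sketch, assuming a hypothetical vector <tt>y</tt> that preceding
+ * nonblocking primitives write to:
+ * \code
+ * const grb::RC rc = grb::wait( y ); // forces execution of all queued
+ *                                    // primitives that affect y
+ * \endcode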
 *
- * The are several other cases in which the computation of nonblocking
- * primtives is forced:
+ * There are several other cases in which the computation of nonblocking
+ * primitives is forced:
 * -# whenever an output iterator of an output container of any of the non-
 * blocking primitives is requested; and
 * -# whenever an output container of any of the non-blocking primitives is
diff --git a/include/graphblas/bsp1d/init.hpp b/include/graphblas/bsp1d/init.hpp
index 582479bf1..d5798d894 100644
--- a/include/graphblas/bsp1d/init.hpp
+++ b/include/graphblas/bsp1d/init.hpp
@@ -34,6 +34,7 @@
 #include
 #include
+#include <cstdint> // uintptr_t
 #include <cassert> // assertions
 #include
diff --git a/include/graphblas/bsp1d/io.hpp b/include/graphblas/bsp1d/io.hpp
index edada4191..b8181041a 100644
--- a/include/graphblas/bsp1d/io.hpp
+++ b/include/graphblas/bsp1d/io.hpp
@@ -32,7 +32,6 @@
 #include
 #endif
 
-#include "graphblas/blas1.hpp" // for grb::size
 #include "graphblas/nonzeroStorage.hpp"
 
 // the below transforms an std::vector iterator into an ALP/GraphBLAS-compatible
@@ -976,20 +975,166 @@ namespace grb {
 #ifdef _BSP1D_IO_DEBUG
 std::cout << "Called grb::set( matrix, mask, value ) (BSP1D)\n";
 #endif
+ // catch trivial cases
 const size_t m = nrows( C );
 const size_t n = ncols( C );
+ if( m == 0 || n == 0 ) { return SUCCESS; }
 
- // dynamic checks (I)
+ // dynamic checks
+ if( nrows( mask ) == 0 || ncols( mask ) == 0 ) {
+ return ILLEGAL;
+ }
 if( m != nrows( mask ) || n != ncols( mask ) ) {
 return MISMATCH;
 }
 
- // catch trivial case
+#ifdef _BSP1D_IO_DEBUG
+ std::cout << "\t delegating to final backend\n";
+#endif
+ RC ret = SUCCESS;
+ // Take care that local matrices may be empty, even if the global matrix is
+ // not. Processes with empty local matrices will not delegate (no-op).
+ {
+ auto &local_C = internal::getLocal( C );
+ const auto &local_mask = internal::getLocal( mask );
+ const size_t local_m = nrows( local_C );
+ const size_t local_n = ncols( local_C );
+ assert( local_m == nrows( local_mask ) );
+ assert( local_n == ncols( local_mask ) );
+ if( local_m > 0 && local_n > 0 ) {
+ ret = set< descr >( local_C, local_mask, val, phase );
+ }
+ }
+
+ // in the structural self-masked case, there is no way an error could occur
+ if( (descr & descriptors::structural) && getID( C ) == getID( mask ) ) {
+#ifdef _BSP1D_IO_DEBUG
+ std::cout << "\t structural self-masking detected, which allows trivial "
+ "exit\n"; // since neither the nnz nor the capacity would change
+#endif
+ assert( ret == SUCCESS );
+ return ret;
+ }
+
+ // in all other cases, in either mode (resize or execute), we must check for
+ // errors
+#ifdef _BSP1D_IO_DEBUG
+ std::cout << "\t all-reducing error code\n";
+#endif
+ if( collectives< BSP1D >::allreduce( ret, operators::any_or< RC >() )
+ != SUCCESS
+ ) {
+ return PANIC;
+ }
+
+#ifdef _BSP1D_IO_DEBUG
+ std::cout << "\t all-reduced error code is " << toString( ret ) << "\n";
+#endif
+ if( phase == RESIZE ) {
+ if( ret == SUCCESS ) {
+#ifdef _BSP1D_IO_DEBUG
+ std::cout << "\t resize phase detected -- synchronising capacity\n";
+#endif
+ ret = internal::updateCap( C );
+ if( ret != SUCCESS ) {
+ std::cerr << "Error updating capacity: " << toString( ret ) << "\n";
+ }
+ }
+ } else {
+ assert( phase == EXECUTE );
+ if( ret == SUCCESS ) {
+#ifdef _BSP1D_IO_DEBUG
+ std::cout << "\t execute phase detected -- synchronising nnz count\n";
+#endif
+ ret = internal::updateNnz( C );
+ if( ret != SUCCESS ) {
+ std::cerr << "Error updating output number of nonzeroes: "
+ << toString( ret ) << "\n";
+ }
+ } else if( ret == ILLEGAL ) {
+#ifdef _BSP1D_IO_DEBUG
+ std::cout << "\t delegate returns ILLEGAL, clearing output\n";
+#endif
+ const RC clear_rc = clear( C );
+ if( clear_rc != SUCCESS ) {
+ ret = PANIC;
+ }
+ } else {
+ if( ret != PANIC ) {
+ std::cerr << "Warning: unexpected error code in grb::set( matrix, mask, "
+ << "value ) (BSP1D). Please submit a bug report.\n";
+ }
+ assert( ret == PANIC );
+ }
+ }
+
+#ifdef _BSP1D_IO_DEBUG
+ std::cout << "\t done; returning " << toString( ret ) << "\n";
+#endif
+
+ // done
+ return ret;
+ }
+
+ /**
+ * The implementation can largely rely on the final (process-local) backend;
+ * however, the capacity and nonzero count of the global output must then be
+ * synchronised across processes. The below implementation mostly deals with
+ * that logic.
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename DataType, typename RIT1, typename CIT1, typename NIT1,
+ typename MaskType, typename RIT2, typename CIT2, typename NIT2,
+ typename ValueType = DataType, typename RIT3, typename CIT3, typename NIT3
+ >
+ RC set(
+ Matrix< DataType, BSP1D, RIT1, CIT1, NIT1 > &C,
+ const Matrix< MaskType, BSP1D, RIT2, CIT2, NIT2 > &mask,
+ const Matrix< ValueType, BSP1D, RIT3, CIT3, NIT3 > &A,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< DataType >::value &&
+ !grb::is_object< ValueType >::value &&
+ !grb::is_object< MaskType >::value
+ >::type * const = nullptr
+ ) noexcept {
+ // static checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< ValueType, DataType >::value
+ ), "grb::set( matrix, mask, matrix ) (BSP1D)",
+ "called with non-matching value types"
+ );
+ NO_CAST_ASSERT(
+ ( !(descr & descriptors::no_casting) ||
+ std::is_same< MaskType, bool >::value ),
+ "grb::set( matrix, mask, matrix ) (BSP1D)",
+ "called with non-Boolean mask value type"
+ );
+ static_assert( !( (descr & descriptors::structural) &&
+ (descr & descriptors::invert_mask)
+ ), "grb::set( matrix, mask, matrix ) (BSP1D): Primitives with matrix "
+ "outputs may not employ structurally inverted masking"
+ );
+#ifdef _BSP1D_IO_DEBUG
+ std::cout << "Called grb::set( matrix, mask, matrix ) (BSP1D)\n";
+#endif
+ const size_t m = nrows( C );
+ const size_t n = ncols( C );
+
+ // dynamic checks (I)
+ if( m != nrows( A ) || n != ncols( A ) ) {
+ return MISMATCH;
+ }
+
+ // catch trivial cases
+ if( m == 0 || n == 0 ) { return SUCCESS; }
+ if( nrows( mask ) == 0 || ncols( mask ) == 0 ) {
+ return set< descr >( C, A, phase );
+ }
 
 // dynamic checks (II)
- if( nrows( mask ) == 0 || ncols( mask ) == 0 ) {
- return ILLEGAL;
+ if( m != nrows( mask ) || n != ncols( mask ) ) {
+ return MISMATCH;
 }
 
 #ifdef _BSP1D_IO_DEBUG
@@ -1000,13 +1145,16 @@ namespace grb {
 // not. Processes with empty local matrices will not delegate (no-op).
 {
 auto &local_C = internal::getLocal( C );
+ const auto &local_A = internal::getLocal( A );
 const auto &local_mask = internal::getLocal( mask );
 const size_t local_m = nrows( local_C );
 const size_t local_n = ncols( local_C );
 assert( local_m == nrows( local_mask ) );
 assert( local_n == ncols( local_mask ) );
+ assert( local_m == nrows( local_A ) );
+ assert( local_n == ncols( local_A ) );
 if( local_m > 0 && local_n > 0 ) {
- ret = set< descr >( local_C, local_mask, val, phase );
+ ret = set< descr >( local_C, local_mask, local_A, phase );
 }
 }
 
@@ -1625,8 +1773,9 @@ namespace grb {
 // a pipeline depth of 2 is sufficient.
 constexpr size_t iteration_overlaps = 2;
- const std::unique_ptr< size_t > first_nnz_per_thread(
- new size_t[ num_threads * iteration_overlaps ]()
+ const std::unique_ptr< size_t, void(*)(size_t * const) > first_nnz_per_thread(
+ new size_t[ num_threads * iteration_overlaps ](),
+ [](size_t * const array){ delete [] array; }
 );
 size_t * const first_nnz_per_thread_ptr = first_nnz_per_thread.get();
 outgoing.resize( data.P );
diff --git a/include/graphblas/descriptors.hpp b/include/graphblas/descriptors.hpp
index cd0a5ab25..2c0a793ae 100644
--- a/include/graphblas/descriptors.hpp
+++ b/include/graphblas/descriptors.hpp
@@ -62,7 +62,20 @@ namespace grb {
 */
 static constexpr Descriptor no_operation = 0;
 
- /** Inverts the mask prior to applying it. */
+ /**
+ * Inverts the mask prior to applying it.
+ *
+ * Applying this descriptor to a sparse mask may still only generate output
+ * where the mask has elements. To decide whether an output will be generated
+ * at a given element in the mask, its value will be inverted.
+ *
+ * If instead the structural complement is to be taken as a mask, this
+ * descriptor must be combined with #grb::descriptors::structural.
+ * ALP/GraphBLAS forbids taking the structural inverse of matrix masks
+ * (because then either the output matrix or the mask matrix has
+ * \f$ mn \f$ values, which defeats any useful application of
+ * GraphBLAS as this signifies one of the containers is, in fact, not sparse).
+ */
 static constexpr Descriptor invert_mask = 1;
 
 /**
@@ -98,6 +111,11 @@ namespace grb {
 * i-th index, regardless of how that value evaluates. It evaluates false
 * if there was no value assigned.
 *
+ * These semantics are inverted when this descriptor is combined with
+ * #grb::descriptors::invert_mask: in that case, the mask evaluates true at
+ * index \f$ i \f$ if the mask had no value at that index; and evaluates false
+ * otherwise.
+ *
 * @see structural_complement
 */
 static constexpr Descriptor structural = 8;
@@ -113,6 +131,10 @@ namespace grb {
 * This ignores the actual values of the mask argument. The i-th element of
 * the mask now evaluates true if the mask has \em no value assigned to its
 * i-th index, and evaluates false otherwise.
+ *
+ * The application of this descriptor is forbidden for matrix mask arguments,
+ * as otherwise either the output or the mask is dense, or (too) close to
+ * being dense.
 */
 static constexpr Descriptor structural_complement = structural | invert_mask;
 
diff --git a/include/graphblas/hyperdags/hyperdags.hpp b/include/graphblas/hyperdags/hyperdags.hpp
index 2ae951565..b4a650c0e 100644
--- a/include/graphblas/hyperdags/hyperdags.hpp
+++ b/include/graphblas/hyperdags/hyperdags.hpp
@@ -381,6 +381,8 @@ namespace grb {
 
 SET_MATRIX_MATRIX_INPUT2,
 
+ SET_MATRIX_MATRIX_MASKED,
+
 MXM_MATRIX_MATRIX_MATRIX_SEMIRING,
 
 MXM_MATRIX_MATRIX_MATRIX_MONOID,
 
@@ -503,7 +505,7 @@ namespace grb {
 };
 
 /** \internal How many operation vertex types exist. */
- const constexpr size_t numOperationVertexTypes = 111;
+ const constexpr size_t numOperationVertexTypes = 112;
 
 /** \internal An array of all operation vertex types.
*/ const constexpr enum OperationVertexType diff --git a/include/graphblas/hyperdags/io.hpp b/include/graphblas/hyperdags/io.hpp index 4ec81917b..99711f0bb 100644 --- a/include/graphblas/hyperdags/io.hpp +++ b/include/graphblas/hyperdags/io.hpp @@ -28,6 +28,10 @@ #include +#ifdef _DEBUG + #define _DEBUG_HYPERDAGS_IO +#endif + namespace grb { @@ -440,6 +444,110 @@ namespace grb { return ret; } + /** + * This function inherits the performance semantics of the underlying backend. + */ + template< + Descriptor descr = descriptors::no_operation, + typename OutputType, typename MaskType, typename InputType, + typename RIT1, typename CIT1, typename NIT1, + typename RIT2, typename CIT2, typename NIT2, + typename RIT3, typename CIT3, typename NIT3 + > + RC set( + Matrix< OutputType, hyperdags, RIT1, CIT1, NIT1 > &C, + const Matrix< MaskType, hyperdags, RIT2, CIT2, NIT2 > &M, + const Matrix< InputType, hyperdags, RIT3, CIT3, NIT3 > &A, + const Phase &phase = EXECUTE + ) { +#ifdef _DEBUG_HYPERDAGS_IO + std::cout << "Entering set(matrix, matrix, matrix), hyperdags backend\n"; +#endif + // first, check for dynamic errors + const size_t m = nrows( C ); + const size_t n = ncols( C ); + if( m != nrows( A ) || n != ncols( A ) ) { +#ifdef _DEBUG_HYPERDAGS_IO + std::cerr << "\t set( matrix, matrix, matrix ): dimension mismatch (I)\n"; +#endif + return MISMATCH; + } + if( m != nrows( M ) && nrows( M ) != 0 ) { +#ifdef _DEBUG_HYPERDAGS_IO + std::cerr << "\t set( matrix, matrix, matrix ): dimension mismatch (II)\n"; +#endif + return MISMATCH; + } + if( n != ncols( M ) && ncols( M ) != 0 ) { +#ifdef _DEBUG_HYPERDAGS_IO + std::cerr << "\t set( matrix, matrix, matrix ): dimension mismatch (III)\n"; +#endif + return MISMATCH; + } + // second, check for trivial op + if( m == 0 || n == 0 ) { +#ifdef _DEBUG_HYPERDAGS_IO + std::cerr << "\t WARNING set( matrix, matrix, matrix ), hyperdags: " + << "trivial op detected (all containers empty). 
No operation will be " + << "recorded\n"; +#endif + return SUCCESS; + } + // third, execute +#ifdef _DEBUG_HYPERDAGS_IO + std::cerr << "\t set( matrix, matrix, matrix ), hyperdags: forwarding to " + << "execution backend\n"; +#endif + const RC ret = set< descr >( + internal::getMatrix( C ), internal::getMatrix( M ), + internal::getMatrix( A ), phase + ); + // fourth, forward any errors + if( ret != SUCCESS ) { return ret; } + if( phase != EXECUTE ) { return ret; } + // fifth, record operation +#ifdef _DEBUG_HYPERDAGS_IO + std::cerr << "\t set( matrix, matrix, matrix ), hyperdags: execution had no " + << "error; recording operation\n"; +#endif + std::array< const void *, 0 > sourcesP{}; + std::array< uintptr_t, 1 > destinations{ getID( internal::getMatrix(C) ) }; + if( nrows( M ) == 0 || ncols( M ) == 0 ) { +#ifdef _DEBUG_HYPERDAGS_IO + std::cerr << "\t WARNING set( matrix, matrix, matrix ), hyperdags: " + << "empty mask detected; hyperDAG will not (cannot) record it\n"; +#endif + std::array< uintptr_t, 2 > sourcesC{ + getID( internal::getMatrix(A) ), + getID( internal::getMatrix(C) ) + }; + internal::hyperdags::generator.addOperation( + internal::hyperdags::SET_MATRIX_MATRIX_MASKED, + sourcesP.begin(), sourcesP.end(), + sourcesC.begin(), sourcesC.end(), + destinations.begin(), destinations.end() + ); + } else { + std::array< uintptr_t, 3 > sourcesC{ + getID( internal::getMatrix(A) ), + getID( internal::getMatrix(M) ), + getID( internal::getMatrix(C) ) + }; + internal::hyperdags::generator.addOperation( + internal::hyperdags::SET_MATRIX_MATRIX_MASKED, + sourcesP.begin(), sourcesP.end(), + sourcesC.begin(), sourcesC.end(), + destinations.begin(), destinations.end() + ); + } + + // done +#ifdef _DEBUG_HYPERDAGS_IO + std::cerr << "\t set( matrix, matrix, matrix ), hyperdags: exiting\n"; +#endif + return ret; + } + /** * This function inherits the performance semantics of the underlying backend. 
*/ diff --git a/include/graphblas/nonblocking/io.hpp b/include/graphblas/nonblocking/io.hpp index 0831fc564..963846251 100644 --- a/include/graphblas/nonblocking/io.hpp +++ b/include/graphblas/nonblocking/io.hpp @@ -1227,6 +1227,42 @@ namespace grb { internal::getRefMatrix( C ), internal::getRefMatrix( A ), val, phase ); } + + template< + Descriptor descr = descriptors::no_operation, + typename OutputType, typename MaskType, typename InputType, + typename RIT1, typename CIT1, typename NIT1, + typename RIT2, typename CIT2, typename NIT2, + typename RIT3, typename CIT3, typename NIT3 + > + RC set( + Matrix< OutputType, nonblocking, RIT1, CIT1, NIT1 > &C, + const Matrix< MaskType, nonblocking, RIT2, CIT2, NIT2 > &M, + const Matrix< InputType, nonblocking, RIT3, CIT3, NIT3 > &A, + const Phase &phase = EXECUTE + ) noexcept { + if( internal::NONBLOCKING::warn_if_not_native && + config::PIPELINE::warn_if_not_native + ) { + std::cerr << "Warning: masked set (nonblocking) currently delegates " + << "to a blocking implementation.\n" + << " Further similar such warnings will be suppressed.\n"; + internal::NONBLOCKING::warn_if_not_native = false; + } + + // nonblocking execution is not supported + // first, execute any computation that is not completed + internal::le.execution(); + + // second, delegate to the reference backend + return set< descr >( + internal::getRefMatrix( C ), + internal::getRefMatrix( M ), + internal::getRefMatrix( A ), + phase + ); + } + template< Descriptor descr = descriptors::no_operation, typename InputType, diff --git a/include/graphblas/phase.hpp b/include/graphblas/phase.hpp index 1f12b8f05..044395ac9 100644 --- a/include/graphblas/phase.hpp +++ b/include/graphblas/phase.hpp @@ -236,7 +236,7 @@ namespace grb { * execute phase. * * If, instead, the output container capacity was found to be insufficient, - * then the requested operation may return #grb::FAILED, in which case the + * then the requested operation may return #grb::ILLEGAL, in which case the * contents of output containers shall be cleared. * * \note That on failure a primitive called using the execute phase may diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index 054d945dd..3cffb588a 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -579,432 +579,6 @@ namespace grb { return SUCCESS; } -#ifndef _H_GRB_REFERENCE_OMP_BLAS3 - /** - * Meta-data for global buffer management for use with #grb::mxm. - * - * This meta-data is the same for both the sequential (reference) and shared- - * memory parallel (reference_omp) backends. - * - * This class contains all meta-data necessary to interpret the global buffer - * as an array of sparse accumulators (SPAs). The length of the array is given - * by a call to #threads(), minus one. It is called that since a call to - * #threads() retrieves how many threads can be used to process the call to - * #grb::mxm. - * - * Each SPA has the layout (bitarray, stack, valueArray). These are packed in a - * padded byte array, such that each bit array, stack, and value array is - * aligned on sizeof(int) bytes. - * - * @tparam NIT The nonzero index type. - * @tparam ValueType The output matrix value type. - */ - template< typename NIT, typename ValueType > - class MXM_BufferMetaData { - - static_assert( sizeof(NIT) % sizeof(int) == 0, "Unsupported type for NIT; " - "please submit a bug report!" 
); - - private: - - /** The size of the offset array */ - size_t m; - - /** The size of the SPA */ - size_t n; - - /** The number of threads supported during a call to #grb::mxm */ - size_t nthreads; - - /** The initial buffer offset */ - size_t bufferOffset; - - /** The size of a single SPA, including bytes needed for padding */ - size_t paddedSPASize; - - /** The number of bytes to pad the SPA array with */ - size_t arrayShift; - - /** The number of bytes to pad the SPA stack with */ - size_t stackShift; - - /** - * Given a number of used bytes of the buffer, calculate the available - * remainder buffer and return it. - * - * @param[in] osize The size of the buffer (in bytes) that is already - * in use. - * @param[out] remainder Pointer to any remainder buffer. - * @param[out] rsize The size of the remainder buffer. - * - * If no buffer space is left, \a remainder will be set to nullptr - * and \a size to 0. - */ - void retrieveRemainderBuffer( - const size_t osize, - void * &remainder, size_t &rsize - ) const noexcept { - const size_t size = internal::template getCurrentBufferSize< char >(); - char * rem = internal::template getReferenceBuffer< char >( size ); - size_t rsize_calc = size - osize; - rem += osize; - const size_t mod = reinterpret_cast< uintptr_t >(rem) % sizeof(int); - if( mod ) { - const size_t shift = sizeof(int) - mod; - if( rsize_calc >= shift ) { - rsize_calc -= shift; - rem += rsize; - } else { - rsize_calc = 0; - rem = nullptr; - } - } - assert( !(reinterpret_cast< uintptr_t >(rem) % sizeof(int)) ); - // write out - remainder = rem; - rsize = rsize_calc; - } - - - public: - - /** - * Base constructor. - * - * @param[in] _m The length of the offset array. - * @param[in] _n The length of the SPA. - * @param[in] max_threads The maximum number of threads. - * - * \note \a max_threads is a separate input since there might be a need to - * cap the maximum number of threads used based on some analytic - * performance model. Rather than putting such a performance model - * within this class, we make it an obligatory input parameter - * instead. - * - * \note It is always valid to pass config::OMP::threads(). - * - * \note This class \em will, however, cap the number of threads returned - * to \a _n. 
- */ - MXM_BufferMetaData( - const size_t _m, const size_t _n, - const size_t max_threads - ) : m( _m ), n( _n ), arrayShift( 0 ), stackShift( 0 ) { - #ifdef _DEBUG_REFERENCE_BLAS3 - #pragma omp critical - std::cout << "\t\t\t computing padded buffer size for a SPA of length " - << n << " while leaving space for an additional offset buffer of length " - << std::max( m, n ) << "...\n"; - #endif - // compute bufferOffset - bufferOffset = (std::max( m, n ) + 1) * sizeof( NIT ); - - // compute value buffer size - const size_t valBufSize = n * sizeof( ValueType ); - - #ifdef _DEBUG_REFERENCE_BLAS3 - std::cout << "\t\t\t\t bit-array size has byte-size " << - internal::Coordinates< reference >::arraySize( n ) << "\n"; - std::cout << "\t\t\t\t stack has byte-size " << - internal::Coordinates< reference >::stackSize( n ) << "\n"; - std::cout << "\t\t\t\t value buffer has byte-size " << valBufSize << "\n"; - #endif - - // compute paddedSPASize - paddedSPASize = - internal::Coordinates< reference >::arraySize( n ) + - internal::Coordinates< reference >::stackSize( n ) + - valBufSize; - size_t shift = - internal::Coordinates< reference >::arraySize( n ) % sizeof(int); - if( shift != 0 ) { - arrayShift = sizeof(int) - shift; - paddedSPASize += arrayShift; - } - shift = internal::Coordinates< reference >::stackSize( n ) % sizeof(int); - if( shift != 0 ) { - stackShift = sizeof(int) - shift; - paddedSPASize += stackShift; - } - shift = valBufSize % sizeof(int); - if( shift != 0 ) { - paddedSPASize += (sizeof(int) - shift); - } - - // pad bufferOffset - shift = bufferOffset % sizeof(int); - if( shift != 0 ) { - bufferOffset += (sizeof(int) - shift); - } - - // compute free buffer size - const size_t freeBufferSize = internal::getCurrentBufferSize< char >() - - bufferOffset; - - // compute max number of threads - nthreads = 1 + freeBufferSize / paddedSPASize; - #ifdef _DEBUG_REFERENCE_BLAS3 - #pragma omp critical - std::cout << "\t\t\t free buffer size: " << freeBufferSize - << ", (padded) SPA size: " << paddedSPASize - << " -> supported #threads: " << nthreads << ". " - << " The shifts for the bit-array and the stack are " << arrayShift - << ", respectively, " << stackShift << "." - << "\n"; - #endif - // cap the final number of selected threads - if( nthreads > max_threads ) { - nthreads = max_threads; - } - if( nthreads > n ) { - nthreads = n; - } - } - - /** @returns The maximum number of supported threads during #grb::mxm */ - size_t threads() const noexcept { - return nthreads; - } - - /** - * Requests and returns a global buffer required for a thread-local SPA. - * - * @param[in] t The thread ID. Must be larger than 0. - * - * \note Thread 0 employs the SPA allocated with the output matrix. - * - * @returns Pointer into the global buffer starting at the area reserved for - * the SPA of thread \a t. - */ - char * getSPABuffers( size_t t ) const noexcept { - assert( t > 0 ); - (void) --t; - char * raw = internal::template getReferenceBuffer< char >( - bufferOffset + nthreads * paddedSPASize ); - assert( reinterpret_cast< uintptr_t >(raw) % sizeof(int) == 0 ); - raw += bufferOffset; - assert( reinterpret_cast< uintptr_t >(raw) % sizeof(int) == 0 ); - raw += t * paddedSPASize; - return raw; - } - - /** - * Retrieves the column offset buffer. - * - * @param[out] remainder Returns any remainder buffer beyond that of the row - * offset buffer. - * @param[out] rsize The remainder buffer size \a remainder points to. - * - * If \a remainder is not a nullptr then neither should \a rsize, - * and vice versa. 
- * - * Retrieving any remainder buffer is optional. The default is to not ask - * for them. - * - * \warning If all buffer memory is used for the column offsets, it may be - * that \a remainder equals nullptr and rsize - * zero. - * - * \warning This buffer is only guaranteed exclusive if only the retrieved - * column buffer is used. In particular, if also requesting (and - * using) SPA buffers, the remainder buffer area is shared with - * those SPA buffers, and data races are likely to occur. In other - * words: be very careful with any use of these remainder buffers. - * - * @returns The column offset buffer. - * - * \warning This buffer overlaps with the CRS offset buffer. The caller - * must ensure to only ever use one at a time. - */ - NIT * getColOffsetBuffer( - void * * const remainder = nullptr, - size_t * const rsize = nullptr - ) const noexcept { - NIT * const ret = internal::template getReferenceBuffer< NIT >( n + 1 ); - if( remainder != nullptr || rsize != nullptr ) { - assert( remainder != nullptr && rsize != nullptr ); - retrieveRemainderBuffer( (m + 1) * sizeof(NIT), *remainder, *rsize ); - } - return ret; - } - - /** - * Retrieves the row offset buffer. - * - * @param[out] remainder Returns any remainder buffer beyond that of the row - * offset buffer. - * @param[out] rsize The remainder buffer size \a remainder points to. - * - * If \a remainder is not a nullptr then neither should \a rsize, - * and vice versa. - * - * Retrieving any remainder buffer is optional. The default is to not ask - * for them. - * - * \warning If all buffer memory is used for the row offsets, it may be that - * \a remainder equals nullptr and rsize zero. - * - * \warning This buffer is only guaranteed exclusive if only the retrieved - * row buffer is used. In particular, if also requesting (and - * using) SPA buffers, the remainder buffer area is shared with - * those SPA buffers, and data races are likely to occur. In other - * words: be very careful with any use of these remainder buffers. - * - * @returns The row offset buffer. - * - * \warning This buffer overlaps with the CCS offset buffer. The caller - * must ensure to only ever use one at a time. - */ - NIT * getRowOffsetBuffer( - void * * const remainder = nullptr, - size_t * const rsize = nullptr - ) const noexcept { - NIT * const ret = internal::template getReferenceBuffer< NIT >( m + 1 ); - if( remainder != nullptr || rsize != nullptr ) { - assert( remainder != nullptr && rsize != nullptr ); - retrieveRemainderBuffer( (m + 1) * sizeof(NIT), *remainder, *rsize ); - } - return ret; - } - - /** - * Shifts a pointer into the global buffer by the bit-array size and its - * padding. - * - * @param[in,out] raw On input: an aligned pointer into the global buffer. - * On output: an aligned pointer past the bit-array - * position. - */ - void applyArrayShift( char * &raw ) const noexcept { - const size_t totalShift = - internal::Coordinates< reference >::arraySize( n ) + - arrayShift; - #ifdef _DEBUG_REFERENCE_BLAS3 - std::cout << "\t\t\t shifting input pointer with " - << internal::Coordinates< reference >::arraySize( n ) << " + " - << arrayShift << " = " << totalShift << "bytes \n"; - #endif - raw += totalShift; - } - - /** - * Shifts a pointer into the global buffer by the stack size and its - * padding. - * - * @param[in,out] raw On input: an aligned pointer into the global buffer. - * On output: an aligned pointer past the stack position. 
- */ - void applyStackShift( char * &raw ) const noexcept { - const size_t totalShift = - internal::Coordinates< reference >::stackSize( n ) + - stackShift; - #ifdef _DEBUG_REFERENCE_BLAS3 - std::cout << "\t\t\t shifting input pointer with " - << internal::Coordinates< reference >::arraySize( n ) << " + " - << stackShift << " = " << totalShift << "bytes \n"; - #endif - raw += totalShift; - } - - }; -#endif - - /** - * Retrieves the SPA buffers for the calling thread. - * - * \warning This function must be called from within an OpenMP parallel - * section. - * - * @param[out] arr Where the bit-array may be located. - * @param[out] buf Where the stack may be located. - * @param[out] valbuf Where the value buffer may be located. - * - * All above pointers are aligned on sizeof(int) bytes. - * - * @param[in] md Meta-data for global buffer management. - * @param[in] C The output matrix. - * - * One thread uses the buffers pre-allocated with the matrix \a C, thus - * ensuring at least one thread may perform the #grb::mxm. Any remainder - * threads can only help process the #grb::mxm if there is enough global - * buffer memory available. - * - * - * \note The global memory has size \f$ \Omega( \mathit{nz} ) \f$, which may - * be several factors (or even asymptotically greater than) - * \f$ \max\{ m, n \} \f$. - * - * \note In case the application stores multiple matrices, the global buffer - * may additionally be greater than the above note indicates if at least - * one of the other matrices is significantly (or asymptotically) larger - * than the one involved with the #grb::mxm. - */ - template< - typename OutputType, - typename RIT, typename CIT, typename NIT - > - void mxm_ompPar_getSPABuffers( - char * &arr, char * &buf, OutputType * &valbuf, - const struct MXM_BufferMetaData< NIT, OutputType > &md, - Matrix< OutputType, reference, RIT, CIT, NIT > &C - ) { -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - // other threads use the global buffer to create additional SPAs - { - const size_t t = config::OMP::current_thread_ID(); - #ifndef NDEBUG - const size_t T = config::OMP::current_threads(); - assert( t < T ); - #endif - if( t > 0 ) { - #ifdef _DEBUG_REFERENCE_BLAS3 - #pragma omp critical - std::cout << "\t Thread " << t << " gets buffers from global buffer\n"; - #endif - char * rawBuffer = md.getSPABuffers( t ); - assert( reinterpret_cast< uintptr_t >(rawBuffer) % sizeof(int) == 0 ); - arr = rawBuffer; - #ifdef _DEBUG_REFERENCE_BLAS3 - #pragma omp critical - #endif - md.applyArrayShift( rawBuffer ); - assert( reinterpret_cast< uintptr_t >(rawBuffer) % sizeof(int) == 0 ); - buf = rawBuffer; - #ifdef _DEBUG_REFERENCE_BLAS3 - #pragma omp critical - #endif - md.applyStackShift( rawBuffer ); - assert( reinterpret_cast< uintptr_t >(rawBuffer) % sizeof(int) == 0 ); - assert( buf != arr ); - valbuf = reinterpret_cast< OutputType * >(rawBuffer); - assert( static_cast< void * >(valbuf) != static_cast< void * >(buf) ); - } else { - #ifdef _DEBUG_REFERENCE_BLAS3 - #pragma omp critical - std::cout << "\t Thread " << t << " gets buffers from matrix storage\n"; - #endif - // one thread uses the standard matrix buffer - internal::getMatrixBuffers( arr, buf, valbuf, 1, C ); - } - #ifdef _DEBUG_REFERENCE_BLAS3 - #pragma omp critical - { - std::cout << "\t Thread " << t << " has SPA array @ " - << static_cast< void * >( arr ) << " and SPA stack @ " - << static_cast< void * >( buf ) << " and SPA values @ " - << static_cast< void * >( valbuf ) << "\n"; - } - #endif - } -#else - #ifdef _DEBUG_REFERENCE_BLAS3 - std::cout << 
"\t Reference backend gets buffers from global buffer\n"; - #endif - internal::getMatrixBuffers( arr, buf, valbuf, 1, C ); - (void) md; -#endif - } - /** * Given a computed new row_start array, moves the old index and value arrays * to new offsets. This leaves precisely enough space for the mxm algorithm to @@ -1215,6 +789,7 @@ namespace grb { // a basic analytic model based on the number of nonzeroes size_t max_threads = config::OMP::threads(); + assert( max_threads > 0 ); { size_t target_nnz = 0; if( phase == EXECUTE ) { @@ -1226,7 +801,7 @@ namespace grb { const size_t nnz_based_nthreads = target_nnz / config::CACHE_LINE_SIZE::value(); if( nnz_based_nthreads < max_threads ) { - max_threads = nnz_based_nthreads; + max_threads = nnz_based_nthreads > 0 ? nnz_based_nthreads : 1; } #ifdef _DEBUG_REFERENCE_BLAS3 std::cout << "\t simple analytic model selects max threads of " @@ -1234,7 +809,7 @@ namespace grb { #endif } - MXM_BufferMetaData< NIT, OutputType > bufferMD( m, n, max_threads ); + SPA_BufferMetaData< NIT, OutputType > bufferMD( m, n, max_threads ); #ifdef _H_GRB_REFERENCE_OMP_BLAS3 // derive number of threads @@ -1255,7 +830,7 @@ namespace grb { char * arr = nullptr; char * buf = nullptr; OutputType * valbuf = nullptr; - mxm_ompPar_getSPABuffers( arr, buf, valbuf, bufferMD, C ); + spa_ompPar_getBuffers( arr, buf, valbuf, bufferMD, C ); // do count size_t local_nzc; @@ -1296,7 +871,7 @@ namespace grb { char * arr = nullptr; char * buf = nullptr; OutputType * valbuf = nullptr; - mxm_ompPar_getSPABuffers( arr, buf, valbuf, bufferMD, C ); + spa_ompPar_getBuffers( arr, buf, valbuf, bufferMD, C ); #ifdef _DEBUG_REFERENCE_BLAS3 #ifdef _H_GRB_REFERENCE_OMP_BLAS3 #pragma omp critical diff --git a/include/graphblas/reference/compressed_storage.hpp b/include/graphblas/reference/compressed_storage.hpp index 95b67a903..2d40d18cc 100644 --- a/include/graphblas/reference/compressed_storage.hpp +++ b/include/graphblas/reference/compressed_storage.hpp @@ -1156,7 +1156,7 @@ namespace grb { {} /** Move constructor. */ - Compressed_Storage< void, IND, SIZE >( SelfType &&other ) { + Compressed_Storage( SelfType &&other ) { moveFromOther( other ); } diff --git a/include/graphblas/reference/io.hpp b/include/graphblas/reference/io.hpp index f4f1b0709..b4af46eea 100644 --- a/include/graphblas/reference/io.hpp +++ b/include/graphblas/reference/io.hpp @@ -951,14 +951,15 @@ namespace grb { ); static_assert( ( !(descr & descriptors::no_casting) || - ( A_is_mask && std::is_same< InputType2, OutputType >::value ) ), + ( A_is_mask && std::is_same< InputType2, OutputType >::value ) || + ( !A_is_mask && std::is_same< InputType1, OutputType >::value ) ), "grb::internal::set_copy called with non-matching value types. This is an " "internal error. Please submit a bug report." ); static_assert( - !(descr & descriptors::invert_mask), "internal::grb::set_copy called with " - "the invert_mask descriptor. This is an internal error; please submit a " - "bug report." + !A_is_mask || !(descr & descriptors::invert_mask), + "internal::grb::set_copy called with the invert_mask descriptor. This is " + "an internal error; please submit a bug report." 
 );
 
 // run-time checks
@@ -2028,6 +2029,274 @@ namespace grb {
 }
 }
 
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType, typename MaskType, typename InputType,
+ typename RIT1, typename CIT1, typename NIT1,
+ typename RIT2, typename CIT2, typename NIT2,
+ typename RIT3, typename CIT3, typename NIT3
+ >
+ RC set(
+ Matrix< OutputType, reference, RIT1, CIT1, NIT1 > &C,
+ const Matrix< MaskType, reference, RIT2, CIT2, NIT2 > &M,
+ const Matrix< InputType, reference, RIT3, CIT3, NIT3 > &A,
+ const Phase &phase = EXECUTE
+ ) noexcept {
+ // static checks
+ static_assert(
+ !std::is_void< InputType >::value ||
+ std::is_void< OutputType >::value, "grb::set (masked set to matrix): "
+ "cannot have a pattern matrix as input unless the output is also a pattern "
+ "matrix"
+ );
+ static_assert(
+ std::is_convertible< InputType, OutputType >::value ||
+ std::is_void< OutputType >::value,
+ "grb::set (masked set to matrix): input type cannot be "
+ "converted to output type"
+ );
+ static_assert(
+ !((descr & descriptors::structural) && (descr & descriptors::invert_mask)),
+ "grb::set (masked set to matrix) may not be called with both the structural "
+ "and invert_mask descriptors set"
+ );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType, OutputType >::value
+ ), "grb::set",
+ "called with non-matching value types"
+ );
+
+ // dynamic checks
+#ifdef _DEBUG_REFERENCE_IO
+ std::cout << "Called grb::set (matrix-to-matrix-masked, reference)\n";
+#endif
+ assert( phase != TRY );
+ const size_t nrows = grb::nrows( C );
+ const size_t ncols = grb::ncols( C );
+ const size_t m = grb::nrows( M );
+ const size_t n = grb::ncols( M );
+
+ // check for trivial dispatch first (otherwise the below checks fail when they
+ // should not)
+ if( m == 0 || n == 0 ) {
+#ifdef _DEBUG_REFERENCE_IO
+ std::cout << "\t delegating to unmasked matrix-to-matrix set, reference\n";
+#endif
+ // If the mask is empty, ignore it
+ return set< descr >( C, A, phase );
+ }
+
+ // dynamic checks, continued
+ if( nrows != grb::nrows( A ) || nrows != m ) {
+ return MISMATCH;
+ }
+
+ if( ncols != grb::ncols( A ) || ncols != n ) {
+ return MISMATCH;
+ }
+
+ // go for implementation, preliminaries:
+ size_t nzc = 0;
+ const auto &A_raw = internal::getCRS( A );
+ const auto &mask_raw = internal::getCRS( M );
+ // we now have one (guaranteed) SPA, namely the one allocated with the output
+ // matrix. We now are going to check how many more SPAs ideally we would like
+ // (for reference_omp), and then go about trying to get those.
+ // If, finally, we get just this one SPA, we will go into this
+ // mostly-sequential code (essentially, big-Omega nrows):
+#ifdef _H_GRB_REFERENCE_OMP_IO
+ const size_t nnz_based_nthreads = std::min( config::OMP::threads(),
+ std::max< size_t >( 1, grb::nnz( A ) / config::CACHE_LINE_SIZE::value() ) );
+ grb::internal::SPA_BufferMetaData< NIT1, OutputType > bufferMD( m, n,
+ nnz_based_nthreads );
+ const size_t nthreads = bufferMD.threads();
+#ifdef _DEBUG_REFERENCE_IO
+ std::cout << "\t set( matrix, matrix, matrix ) will use " << nthreads
+ << " threads\n";
+#endif
+#else
+ const size_t nthreads = 1;
+#endif
+ if( nthreads == 1 ) {
+ char * arr = nullptr;
+ char * buf = nullptr;
+ OutputType * valbuf = nullptr;
+ internal::Coordinates< reference > coors;
+ internal::getMatrixBuffers( arr, buf, valbuf, 1, C );
+ coors.set( arr, false, buf, ncols );
+ for( size_t i = 0; i < nrows; ++i ) {
+ coors.clear();
+ for( auto k = mask_raw.col_start[ i ]; k < mask_raw.col_start[ i + 1 ]; ++k ) {
+ const auto k_col = mask_raw.row_index[ k ];
+ if( utils::interpretMatrixMask< descr, MaskType >( true, mask_raw.getValues(), k ) ) {
+ coors.assign( k_col );
+ }
+ }
+#ifdef _H_GRB_REFERENCE_OMP_IO
+ #pragma omp parallel for reduction( +: nzc ) \
+ schedule( dynamic, config::CACHE_LINE_SIZE::value() )
+#endif
+ for( auto k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) {
+ const auto k_col = A_raw.row_index[ k ];
+ if( coors.assigned( k_col ) ) {
+ (void) ++nzc;
+ }
+ }
+ }
+ } else {
+#ifdef _H_GRB_REFERENCE_OMP_IO
+ #pragma omp parallel num_threads( nthreads ) reduction( +: nzc )
+ {
+ // get thread-local buffers
+ size_t local_nz = 0;
+ char * arr = nullptr;
+ char * buf = nullptr;
+ OutputType * valbuf = nullptr;
+ internal::spa_ompPar_getBuffers( arr, buf, valbuf, bufferMD, C );
+ internal::Coordinates< reference > coors;
+ coors.set_seq( arr, false, buf, n );
+ // follow dynamic schedule since we cannot predict sparsity structure
+ #pragma omp for schedule( dynamic, config::CACHE_LINE_SIZE::value() )
+ for( size_t i = 0; i < nrows; ++i ) {
+ coors.clear();
+ for( auto k = mask_raw.col_start[ i ]; k < mask_raw.col_start[ i + 1 ]; ++k ) {
+ const auto k_col = mask_raw.row_index[ k ];
+ if( utils::interpretMatrixMask< descr, MaskType >( true, mask_raw.getValues(), k ) ) {
+ coors.assign( k_col );
+ }
+ }
+ for( auto k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) {
+ const auto k_col = A_raw.row_index[ k ];
+ if( coors.assigned( k_col ) ) {
+ (void) ++local_nz;
+ }
+ }
+ }
+ nzc += local_nz;
+ }
+#else
+ const bool code_path_should_not_be_reached = false;
+ std::cerr << "\t logic error in grb::set( matrix, matrix, matrix ): "
+ << "code path should not reach here. Please submit a bug report\n";
+ assert( code_path_should_not_be_reached );
+ #ifdef NDEBUG
+ (void) code_path_should_not_be_reached;
+ #endif
+#endif
+ }
+
+ // we now have a count. If we're in the resize phase that means we're done:
+ if( phase == RESIZE ) {
+ return resize( C, nzc );
+ }
+
+ // otherwise, we now compute the output. We start with checking capacity
+ assert( phase == EXECUTE );
+ if( capacity( C ) < nzc ) {
+#ifdef _DEBUG_REFERENCE_IO
+ std::cout << "\t insufficient capacity to complete "
+ "requested masked set matrix to matrix computation\n";
+#endif
+ const RC clear_rc = clear( C );
+ if( clear_rc != SUCCESS ) {
+ return PANIC;
+ } else {
+ return ILLEGAL;
+ }
+ }
+
+ // get output CRS and CCS structures
+ // TODO: check for crs_only descriptor
+ auto &CRS_raw = internal::getCRS( C );
+ auto &CCS_raw = internal::getCCS( C );
+ config::NonzeroIndexType * C_col_index = internal::template
+ getReferenceBuffer< typename config::NonzeroIndexType >( ncols + 1 );
+ CRS_raw.col_start[ 0 ] = 0;
+
+#ifdef _H_GRB_REFERENCE_OMP_IO
+ // TODO ALPify the below
+ #pragma omp parallel for simd
+#endif
+ for( size_t j = 0; j <= ncols; ++j ) {
+ CCS_raw.col_start[ j ] = 0;
+ C_col_index[ j ] = 0;
+ }
+
+ //TODO: revise the below, WIP
+ char * arr = nullptr;
+ char * buf = nullptr;
+ MaskType * valbuf = nullptr;
+ internal::Coordinates< reference > coors;
+ internal::getMatrixBuffers( arr, buf, valbuf, 1, M );
+ coors.set( arr, false, buf, ncols );
+
+ // do counting sort, phase 1 -- also this loop should employ the same
+ // parallelisation strategy during counting
+ nzc = 0;
+ for( size_t i = 0; i < nrows; ++i ) {
+ coors.clear();
+ for( auto k = mask_raw.col_start[ i ]; k < mask_raw.col_start[ i + 1 ]; ++k ) {
+ const auto k_col = mask_raw.row_index[ k ];
+ if( utils::interpretMatrixMask< descr, MaskType >( true, mask_raw.getValues(), k ) ) {
+ coors.assign( k_col );
+ }
+ }
+ for( auto k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) {
+ const auto k_col = A_raw.row_index[ k ];
+ if( coors.assigned( k_col ) ) {
+ (void) nzc++;
+ (void) (CCS_raw.col_start[ k_col + 1 ])++;
+ }
+ }
+ CRS_raw.col_start[ i + 1 ] = nzc;
+ }
+
+ // TODO this is a prefix sum -- use the OMP utility function here to
+ // parallelise it
+ for( size_t j = 1; j < ncols; ++j ) {
+ CCS_raw.col_start[ j + 1 ] += CCS_raw.col_start[ j ];
+ }
+
+ // do counting sort, phase 2 -- use previously computed CCS offset array to
+ // update CCS during the computational phase. Also this loop should employ
+ // the same (multiple-SPA) parallelisation strategy as above
+ nzc = 0;
+ for( size_t i = 0; i < nrows; ++i ) {
+ coors.clear();
+ for( auto k = mask_raw.col_start[ i ]; k < mask_raw.col_start[ i + 1 ]; ++k ) {
+ const auto k_col = mask_raw.row_index[ k ];
+ if( utils::interpretMatrixMask< descr, MaskType >(
+ true, mask_raw.getValues(), k )
+ ) {
+ coors.assign( k_col );
+ }
+ }
+ for( auto k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) {
+ const auto k_col = A_raw.row_index[ k ];
+ if( coors.assigned( k_col ) ) {
+ constexpr int zero = 0;
+ CRS_raw.row_index[ nzc ] = k_col;
+ CRS_raw.setValue( nzc, A_raw.getValue( k, zero ) );
+ const size_t CCS_index = C_col_index[ k_col ] + CCS_raw.col_start[ k_col ];
+ (void) C_col_index[ k_col ]++;
+ CCS_raw.row_index[ CCS_index ] = i;
+ CCS_raw.setValue( CCS_index, A_raw.getValue( k, zero ) );
+ (void) nzc++;
+ }
+ }
+ }
+#ifndef NDEBUG
+ #ifdef _H_GRB_REFERENCE_OMP_IO
+ #pragma omp parallel for schedule( static, config::CACHE_LINE_SIZE::value() )
+ #endif
+ for( size_t j = 0; j < ncols; ++j ) {
+ assert( CCS_raw.col_start[ j + 1 ] - CCS_raw.col_start[ j ] ==
+ C_col_index[ j ] );
+ }
+#endif
+ internal::setCurrentNonzeroes( C, nzc );
+
+ // done
+ return SUCCESS;
+ }
+
 /**
 * Ingests raw data into a GraphBLAS vector.
 *
diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp
index c1b3a365d..84873a22c 100644
--- a/include/graphblas/reference/matrix.hpp
+++ b/include/graphblas/reference/matrix.hpp
@@ -1283,7 +1283,9 @@ namespace grb {
 "We hit here a configuration border case which the implementation does not "
 "handle at present. Please submit a bug report."
 );
- // compute and return
+ // compute and return the maximum of
+ // - row- and column-wise buffers. The added factor two is for padding
+ // - minimal buffer requirement for parallel buildMatrixUnique
 return std::max( (m + n + 2) * globalBufferUnitSize,
 #ifdef _H_GRB_REFERENCE_OMP_MATRIX
 config::OMP::threads() * config::CACHE_LINE_SIZE::value() *
@@ -2364,7 +2366,452 @@ namespace grb {
 );
 return ret;
 }
+
+#ifndef _H_GRB_REFERENCE_OMP_MATRIX
+ /**
+ * Meta-data for global buffer management for use with #grb::mxm and the
+ * matrix-matrix-matrix variant of #grb::set.
+ *
+ * This class contains all meta-data necessary to interpret the global buffer
+ * as an array of sparse accumulators (SPAs). The length of the array is one
+ * less than the value returned by #threads(), since thread zero does not draw
+ * its SPA from the global buffer. A call to #threads() retrieves how many
+ * threads can be used to process the call at hand.
+ *
+ * Each SPA has the layout (bitarray, stack, valueArray). These are packed in a
+ * padded byte array, such that each bit array, stack, and value array is
+ * aligned on sizeof(int) bytes.
+ *
+ * @tparam NIT The nonzero index type.
+ * @tparam ValueType The output matrix value type.
+ *
+ * This meta-data class applies to both the sequential (reference) and shared-
+ * memory parallel (reference_omp) backends.
+ */
+ template< typename NIT, typename ValueType >
+ class SPA_BufferMetaData {
+
+ static_assert( sizeof(NIT) % sizeof(int) == 0, "Unsupported type for NIT; "
+ "please submit a bug report!" );
+
+ private:
+
+ /** The size of the offset array */
+ size_t m;
+
+ /** The size of the SPA */
+ size_t n;
+
+ /** The number of threads supported during a call to #grb::mxm */
+ size_t nthreads;
+
+ /** The initial buffer offset */
+ size_t bufferOffset;
+
+ /** The size of a single SPA, including bytes needed for padding */
+ size_t paddedSPASize;
+
+ /** The number of bytes to pad the SPA array with */
+ size_t arrayShift;
+
+ /** The number of bytes to pad the SPA stack with */
+ size_t stackShift;
+
+ /**
+ * Given a number of used bytes of the buffer, calculate the available
+ * remainder buffer and return it.
+ *
+ * @param[in] osize The size of the buffer (in bytes) that is already
+ * in use.
+ * @param[out] remainder Pointer to any remainder buffer.
+ * @param[out] rsize The size of the remainder buffer.
+ *
+ * If no buffer space is left, \a remainder will be set to nullptr
+ * and \a rsize to 0.
+ */
+ void retrieveRemainderBuffer(
+ const size_t osize,
+ void * &remainder, size_t &rsize
+ ) const noexcept {
+ const size_t size = internal::template getCurrentBufferSize< char >();
+ char * rem = internal::template getReferenceBuffer< char >( size );
+ size_t rsize_calc = size - osize;
+ rem += osize;
+ const size_t mod = reinterpret_cast< uintptr_t >(rem) % sizeof(int);
+ if( mod ) {
+ const size_t shift = sizeof(int) - mod;
+ if( rsize_calc >= shift ) {
+ rsize_calc -= shift;
+ rem += shift;
+ } else {
+ rsize_calc = 0;
+ rem = nullptr;
+ }
+ }
+ assert( !(reinterpret_cast< uintptr_t >(rem) % sizeof(int)) );
+ // write out
+ remainder = rem;
+ rsize = rsize_calc;
+ }
+
+
+ public:
+
+ /**
+ * Base constructor.
+ * + * @param[in] _m The length of the offset array. + * @param[in] _n The length of the SPA. + * @param[in] max_threads The maximum number of threads. + * + * \note \a max_threads is a separate input since there might be a need to + * cap the maximum number of threads used based on some analytic + * performance model. Rather than putting such a performance model + * within this class, we make it an obligatory input parameter + * instead. + * + * \note It is always valid to pass config::OMP::threads(). + * + * \note This class \em will, however, cap the number of threads returned + * to \a _n. + */ + SPA_BufferMetaData( + const size_t _m, const size_t _n, + const size_t max_threads + ) : m( _m ), n( _n ), arrayShift( 0 ), stackShift( 0 ) { + #ifdef _DEBUG_REFERENCE_MATRIX + #pragma omp critical + std::cout << "\t\t\t computing padded buffer size for a SPA of length " + << n << " while leaving space for an additional offset buffer of length " + << std::max( m, n ) << "...\n"; + #endif + // compute bufferOffset + bufferOffset = (std::max( m, n ) + 1) * sizeof( NIT ); + + // compute value buffer size + const size_t valBufSize = n * utils::SizeOf< ValueType >::value; + + #ifdef _DEBUG_REFERENCE_MATRIX + std::cout << "\t\t\t\t bit-array size has byte-size " << + internal::Coordinates< reference >::arraySize( n ) << "\n"; + std::cout << "\t\t\t\t stack has byte-size " << + internal::Coordinates< reference >::stackSize( n ) << "\n"; + std::cout << "\t\t\t\t value buffer has byte-size " << valBufSize << "\n"; + #endif + + // compute paddedSPASize + paddedSPASize = + internal::Coordinates< reference >::arraySize( n ) + + internal::Coordinates< reference >::stackSize( n ) + + valBufSize; + size_t shift = + internal::Coordinates< reference >::arraySize( n ) % sizeof(int); + if( shift != 0 ) { + arrayShift = sizeof(int) - shift; + paddedSPASize += arrayShift; + } + shift = internal::Coordinates< reference >::stackSize( n ) % sizeof(int); + if( shift != 0 ) { + stackShift = sizeof(int) - shift; + paddedSPASize += stackShift; + } + shift = valBufSize % sizeof(int); + if( shift != 0 ) { + paddedSPASize += (sizeof(int) - shift); + } + + // pad bufferOffset + shift = bufferOffset % sizeof(int); + if( shift != 0 ) { + bufferOffset += (sizeof(int) - shift); + } + + // compute free buffer size + const size_t freeBufferSize = internal::getCurrentBufferSize< char >() - + bufferOffset; + + // compute max number of threads + nthreads = 1 + freeBufferSize / paddedSPASize; + #ifdef _DEBUG_REFERENCE_MATRIX + #pragma omp critical + std::cout << "\t\t\t free buffer size: " << freeBufferSize + << ", (padded) SPA size: " << paddedSPASize + << ", bufferOffset: " << bufferOffset + << " -> supported #threads: " << nthreads << ". " + << " The shifts for the bit-array and the stack are " << arrayShift + << ", respectively, " << stackShift << "." + << "\n"; + #endif + // cap the final number of selected threads + if( nthreads > max_threads ) { + nthreads = max_threads; + } + if( nthreads > n ) { + nthreads = n; + } + } + + /** @returns The maximum number of supported threads during #grb::mxm */ + size_t threads() const noexcept { + return nthreads; + } + + /** + * Requests and returns a global buffer required for a thread-local SPA. + * + * @param[in] t The thread ID. Must be larger than 0. + * + * \note Thread 0 employs the SPA allocated with the output matrix. + * + * @returns Pointer into the global buffer starting at the area reserved for + * the SPA of thread \a t. 
+			 */
+			char * getSPABuffers( size_t t ) const noexcept {
+				assert( t > 0 );
+				assert( nthreads > 1 );
+				// thread zero uses the SPA pre-allocated with the output matrix, so
+				// shift the remaining thread IDs down by one
+				(void) --t;
+				char * raw = internal::template getReferenceBuffer< char >(
+					bufferOffset + (nthreads - 1) * paddedSPASize );
+				assert( reinterpret_cast< uintptr_t >(raw) % sizeof(int) == 0 );
+				raw += bufferOffset;
+				assert( reinterpret_cast< uintptr_t >(raw) % sizeof(int) == 0 );
+				raw += t * paddedSPASize;
+				return raw;
+			}
+
+			/**
+			 * Retrieves the column offset buffer.
+			 *
+			 * @param[out] remainder Returns any remainder buffer beyond that of the
+			 *                       column offset buffer.
+			 * @param[out] rsize     The remainder buffer size \a remainder points to.
+			 *
+			 * If \a remainder is not a nullptr then neither may \a rsize be,
+			 * and vice versa.
+			 *
+			 * Retrieving any remainder buffer is optional. The default is to not ask
+			 * for it.
+			 *
+			 * \warning If all buffer memory is used for the column offsets, it may be
+			 *          that \a remainder equals nullptr and \a rsize zero.
+			 *
+			 * \warning This buffer is only guaranteed exclusive if only the retrieved
+			 *          column buffer is used. In particular, if also requesting (and
+			 *          using) SPA buffers, the remainder buffer area is shared with
+			 *          those SPA buffers, and data races are likely to occur. In
+			 *          other words: be very careful with any use of these remainder
+			 *          buffers.
+			 *
+			 * @returns The column offset buffer.
+			 *
+			 * \warning This buffer overlaps with the CRS offset buffer. The caller
+			 *          must ensure to only ever use one at a time.
+			 */
+			NIT * getColOffsetBuffer(
+				void * * const remainder = nullptr,
+				size_t * const rsize = nullptr
+			) const noexcept {
+				NIT * const ret = internal::template getReferenceBuffer< NIT >( n + 1 );
+				if( remainder != nullptr || rsize != nullptr ) {
+					assert( remainder != nullptr && rsize != nullptr );
+					retrieveRemainderBuffer( (n + 1) * sizeof(NIT), *remainder, *rsize );
+				}
+				return ret;
+			}
+
+			/**
+			 * Retrieves the row offset buffer.
+			 *
+			 * @param[out] remainder Returns any remainder buffer beyond that of the
+			 *                       row offset buffer.
+			 * @param[out] rsize     The remainder buffer size \a remainder points to.
+			 *
+			 * If \a remainder is not a nullptr then neither may \a rsize be,
+			 * and vice versa.
+			 *
+			 * Retrieving any remainder buffer is optional. The default is to not ask
+			 * for it.
+			 *
+			 * \warning If all buffer memory is used for the row offsets, it may be
+			 *          that \a remainder equals nullptr and \a rsize zero.
+			 *
+			 * \warning This buffer is only guaranteed exclusive if only the retrieved
+			 *          row buffer is used. In particular, if also requesting (and
+			 *          using) SPA buffers, the remainder buffer area is shared with
+			 *          those SPA buffers, and data races are likely to occur. In
+			 *          other words: be very careful with any use of these remainder
+			 *          buffers.
+			 *
+			 * @returns The row offset buffer.
+			 *
+			 * \warning This buffer overlaps with the CCS offset buffer. The caller
+			 *          must ensure to only ever use one at a time.
+			 */
+			NIT * getRowOffsetBuffer(
+				void * * const remainder = nullptr,
+				size_t * const rsize = nullptr
+			) const noexcept {
+				NIT * const ret = internal::template getReferenceBuffer< NIT >( m + 1 );
+				if( remainder != nullptr || rsize != nullptr ) {
+					assert( remainder != nullptr && rsize != nullptr );
+					retrieveRemainderBuffer( (m + 1) * sizeof(NIT), *remainder, *rsize );
+				}
+				return ret;
+			}
+
+			/**
+			 * Shifts a pointer into the global buffer by the bit-array size and its
+			 * padding.
+			 *
+			 * @param[in,out] raw On input: an aligned pointer into the global buffer.
+			 *                    On output: an aligned pointer past the bit-array
+			 *                    position.
+			 */
+			void applyArrayShift( char * &raw ) const noexcept {
+				const size_t totalShift =
+					internal::Coordinates< reference >::arraySize( n ) +
+					arrayShift;
+				#ifdef _DEBUG_REFERENCE_MATRIX
+				std::cout << "\t\t\t shifting input pointer with "
+					<< internal::Coordinates< reference >::arraySize( n ) << " + "
+					<< arrayShift << " = " << totalShift << " bytes\n";
+				#endif
+				raw += totalShift;
+			}
+
+			/**
+			 * Shifts a pointer into the global buffer by the stack size and its
+			 * padding.
+			 *
+			 * @param[in,out] raw On input: an aligned pointer into the global buffer.
+			 *                    On output: an aligned pointer past the stack position.
+			 */
+			void applyStackShift( char * &raw ) const noexcept {
+				const size_t totalShift =
+					internal::Coordinates< reference >::stackSize( n ) +
+					stackShift;
+				#ifdef _DEBUG_REFERENCE_MATRIX
+				std::cout << "\t\t\t shifting input pointer with "
+					<< internal::Coordinates< reference >::stackSize( n ) << " + "
+					<< stackShift << " = " << totalShift << " bytes\n";
+				#endif
+				raw += totalShift;
+			}
+
+	};
+#endif
+
+	/**
+	 * Retrieves the SPA buffers for the calling thread.
+	 *
+	 * \warning This function must be called from within an OpenMP parallel
+	 *          section.
+	 *
+	 * @param[out] arr    Where the bit-array will be located.
+	 * @param[out] buf    Where the stack will be located.
+	 * @param[out] valbuf Where the value buffer will be located.
+	 *
+	 * All above pointers are aligned on sizeof(int) bytes.
+	 *
+	 * @param[in]     md Meta-data for global buffer management.
+	 * @param[in,out] C  The output matrix.
+	 *
+	 * One thread uses the buffers pre-allocated with the matrix \a C, thus
+	 * ensuring at least one thread may perform the #grb::mxm. Any remaining
+	 * threads can only help process the #grb::mxm if there is enough global
+	 * buffer memory available.
+	 *
+	 * \note The global memory has size \f$ \Omega( \mathit{nz} ) \f$, which may
+	 *       be several factors larger than, or even asymptotically greater
+	 *       than, \f$ \max\{ m, n \} \f$.
+	 *
+	 * \note In case the application stores multiple matrices, the global buffer
+	 *       may additionally be greater than the above note indicates if at
+	 *       least one of the other matrices is significantly (or
+	 *       asymptotically) larger than the one involved with the #grb::mxm.
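+	 *
+	 * A minimal usage sketch, assuming an OpenMP-enabled build as well as a
+	 * previously constructed meta-data instance \a md and output matrix \a C:
+	 *
+	 * \code
+	 * #pragma omp parallel num_threads( static_cast< int >( md.threads() ) )
+	 * {
+	 *     char * arr = nullptr, * buf = nullptr;
+	 *     OutputType * valbuf = nullptr;
+	 *     spa_ompPar_getBuffers( arr, buf, valbuf, md, C );
+	 *     // each thread now holds an exclusive (bit-array, stack, values) SPA
+	 * }
+	 * \endcode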
+	 */
+	template<
+		typename OutputType,
+		typename RIT, typename CIT, typename NIT
+	>
+	void spa_ompPar_getBuffers(
+		char * &arr, char * &buf, OutputType * &valbuf,
+		const struct SPA_BufferMetaData< NIT, OutputType > &md,
+		Matrix< OutputType, reference, RIT, CIT, NIT > &C
+	) {
+#ifdef _H_GRB_REFERENCE_OMP_MATRIX
+		// other threads use the global buffer to create additional SPAs
+		{
+			const size_t t = config::OMP::current_thread_ID();
+			#ifndef NDEBUG
+			const size_t T = config::OMP::current_threads();
+			assert( t < T );
+			#endif
+			if( t > 0 ) {
+				#ifdef _DEBUG_REFERENCE_MATRIX
+				#pragma omp critical
+				std::cout << "\t Thread " << t << " gets buffers from global buffer\n";
+				#endif
+				char * rawBuffer = md.getSPABuffers( t );
+				assert( reinterpret_cast< uintptr_t >(rawBuffer) % sizeof(int) == 0 );
+				arr = rawBuffer;
+				#ifdef _DEBUG_REFERENCE_MATRIX
+				#pragma omp critical
+				#endif
+				md.applyArrayShift( rawBuffer );
+				assert( reinterpret_cast< uintptr_t >(rawBuffer) % sizeof(int) == 0 );
+				buf = rawBuffer;
+				#ifdef _DEBUG_REFERENCE_MATRIX
+				#pragma omp critical
+				#endif
+				md.applyStackShift( rawBuffer );
+				assert( reinterpret_cast< uintptr_t >(rawBuffer) % sizeof(int) == 0 );
+				assert( buf != arr );
+				valbuf = reinterpret_cast< OutputType * >(rawBuffer);
+				assert( static_cast< void * >(valbuf) != static_cast< void * >(buf) );
+			} else {
+				#ifdef _DEBUG_REFERENCE_MATRIX
+				#pragma omp critical
+				std::cout << "\t Thread " << t << " gets buffers from matrix storage\n";
+				#endif
+				// one thread uses the standard matrix buffer
+				internal::getMatrixBuffers( arr, buf, valbuf, 1, C );
+			}
+			#ifdef _DEBUG_REFERENCE_MATRIX
+			#pragma omp critical
+			{
+				std::cout << "\t Thread " << t << " has SPA array @ "
+					<< static_cast< void * >( arr ) << " and SPA stack @ "
+					<< static_cast< void * >( buf ) << " and SPA values @ "
+					<< static_cast< void * >( valbuf ) << "\n";
+			}
+			#endif
+		}
+#else
+	#ifdef _DEBUG_REFERENCE_MATRIX
+	std::cout << "\t Reference backend gets buffers from matrix storage\n";
+	#endif
+	internal::getMatrixBuffers( arr, buf, valbuf, 1, C );
+	(void) md;
 #endif
+	}

 } // end namespace grb::internal

diff --git a/include/graphblas/utils.hpp b/include/graphblas/utils.hpp
index 3ab508eec..4a2235501 100644
--- a/include/graphblas/utils.hpp
+++ b/include/graphblas/utils.hpp
@@ -384,9 +384,11 @@ namespace grb {
 			return ret;
 		}
 	}
-
-	/** Specialisation for void-valued matrice's masks */
-	template< Descriptor descriptor, typename MatrixDataType, typename ValuesType >
+	/** Specialisation for void-valued matrix masks */
+	template<
+		Descriptor descriptor,
+		typename MatrixDataType, typename ValuesType
+	>
 	static bool interpretMatrixMask(
 		const bool &assigned,
 		const ValuesType * const values,
@@ -398,8 +400,11 @@
 		return interpretMask< descriptor, ValuesType >( assigned, values, k );
 	}

-	/** Specialisation for void-valued matrice's masks */
-	template< Descriptor descriptor, typename MatrixDataType, typename ValuesType >
+	/** Specialisation for void-valued matrix masks */
+	template<
+		Descriptor descriptor,
+		typename MatrixDataType, typename ValuesType
+	>
 	static bool interpretMatrixMask(
 		const bool &assigned,
 		const ValuesType * const,
diff --git a/include/graphblas/utils/parser/matrixFileReaderBase.hpp b/include/graphblas/utils/parser/matrixFileReaderBase.hpp
index c5cdf8b7a..15aa3e4f8 100644
--- a/include/graphblas/utils/parser/matrixFileReaderBase.hpp
+++ b/include/graphblas/utils/parser/matrixFileReaderBase.hpp
@@ -280,7 +280,7 @@ namespace grb {
 		properties._nz = nz;
 		properties._entries = entries;
 		properties._pattern = pattern;
-		properties._symmetric = symmetric;
+		properties._symmetric = symmetric ? Symmetric : General;
 		properties._direct = direct;
 		properties._symmetricmap = symmetricmap;
 		// check for existance of file
diff --git a/src/graphblas/hyperdags/hyperdags.cpp b/src/graphblas/hyperdags/hyperdags.cpp
index 256b31fba..320b522f5 100644
--- a/src/graphblas/hyperdags/hyperdags.cpp
+++ b/src/graphblas/hyperdags/hyperdags.cpp
@@ -215,6 +215,9 @@ std::string grb::internal::hyperdags::toString(
 		case SET_MATRIX_MATRIX_INPUT2:
 			return "set( matrix, matrix, scalar )";

+		case SET_MATRIX_MATRIX_MASKED:
+			return "set( matrix, matrix, matrix )";
+
 		case MXM_MATRIX_MATRIX_MATRIX_MONOID:
 			return "mxm( matrix, matrix, matrix, monoid, scalar, scalar )";

diff --git a/tests/unit/id.cpp b/tests/unit/id.cpp
index a7a7cdb0b..8711369ad 100644
--- a/tests/unit/id.cpp
+++ b/tests/unit/id.cpp
@@ -216,7 +216,7 @@ void grb_program2( const struct input &in, struct output &out ) {
 }

 /**
- * Test for move assignement id cleanup.
+ * Test for move assignment id cleanup.
 *
 * Creating and performing move assignment on multiple new objects and check
 * for collisions.
diff --git a/tests/unit/matrixSet.cpp b/tests/unit/matrixSet.cpp
index c6ab6739d..da3d48f64 100644
--- a/tests/unit/matrixSet.cpp
+++ b/tests/unit/matrixSet.cpp
@@ -19,6 +19,7 @@
 #include
 #include
+#include <graphblas/algorithms/matrix_factory.hpp>

 using namespace grb;

@@ -27,8 +28,356 @@
 static const int data1[ 15 ] = { 4, 7, 4, 6, 4, 7, 1, 7, 3, 6, 7, 5, 1, 8, 7 };
 static const size_t I[ 15 ] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 8, 7, 6 };
 static const size_t J[ 15 ] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 5, 7, 5, 1 };

+/** Generic implementation of masked tests */
+template<
+	Descriptor descr = descriptors::no_operation,
+	typename Tout, typename Tmask, typename Tin
+>
+RC masked_tests_generic_impl(
+	RC &rc, grb::Matrix< Tout > &output,
+	const grb::Matrix< Tmask > &mask, const grb::Matrix< Tin > &input,
+	const size_t n
+) {
+	const bool emptyMask = grb::nrows( mask ) == 0 || grb::ncols( mask ) == 0;
+
+	std::cout << "\t\t with structural descriptor\n";
+	rc = grb::set< descr | descriptors::structural >( output, mask, input );
+	if( rc != SUCCESS ) {
+		std::cerr << "\t grb::set structural (matrix to matrix masked) FAILED\n";
+		return rc;
+	}
+	if( grb::nnz( output ) != 2 * n - 1 ) {
+		std::cerr << "\t unexpected number of output elements ( "
+			<< grb::nnz( output ) << " ), expected " << 2 * n - 1 <<".\n";
+		rc = FAILED;
+	}
+	for( const auto &triplet : output ) {
+		if(
+			triplet.first.first != triplet.first.second &&
+			triplet.first.first != triplet.first.second - 1
+		) {
+			std::cerr << "\t unexpected entry at ( " << triplet.first.first << ", "
+				<< triplet.first.second << " ), value " << triplet.second << ".\n";
+			rc = FAILED;
+		}
+		if( triplet.first.first != static_cast< size_t >(triplet.second) ) {
+			std::cerr << "\t unexpected entry at ( " << triplet.first.first << ", "
+				<< triplet.first.second << " ) with value " << triplet.second;
+			std::cerr << ", expected value "<< triplet.first.first <<".\n";
+			rc = FAILED;
+		}
+	}
+	if( rc != SUCCESS ) { return rc; }
+
+	std::cout << "\t\t without descriptor\n";
+	rc = grb::set< descr >( output, mask, input );
+	if( rc != SUCCESS ) {
+		std::cerr << "\t grb::set (matrix to matrix masked) FAILED\n";
+		return rc;
+	}
+	if( emptyMask ) {
+		if( grb::nnz( output ) != 2 * n - 1 ) {
+			std::cerr << "\t unexpected number of output elements ( "
+				<< grb::nnz( output ) << " ), expected " << 2 * n - 1 <<".\n";
+			rc = 
FAILED; + } + } else { + if( grb::nnz( output ) != n ) { + std::cerr << "\t unexpected number of output elements ( " + << grb::nnz( output ) << " ), expected " << n <<".\n"; + rc = FAILED; + } + } + for( const auto &triplet : output ) { + if( emptyMask ) { + if( triplet.first.first != triplet.first.second && + triplet.first.first != triplet.first.second - 1 + ) { + std::cerr << "\t unexpected entry at ( " << triplet.first.first << ", " + << triplet.first.second << " ), value " << triplet.second << ".\n"; + rc = FAILED; + } + } else { + if( triplet.first.first != triplet.first.second ) { + std::cerr << "\t unexpected entry at ( " << triplet.first.first << ", " + << triplet.first.second << " ), value " << triplet.second << ".\n"; + rc = FAILED; + } + } + if( triplet.first.first != static_cast< size_t >(triplet.second) ) { + std::cerr << "\t unexpected entry at ( " << triplet.first.first << ", " + << triplet.first.second << " ) with value " << triplet.second; + std::cerr << ", expected value "<< triplet.first.first <<".\n"; + rc = FAILED; + } + } + if( rc != SUCCESS ) { return rc; } + + // done + return rc; +} + +/** Specialisation for void output */ +template< + Descriptor descr = descriptors::no_operation, + typename Tmask, typename Tin +> +RC masked_tests_generic_impl( + RC &rc, grb::Matrix< void > &output, + const grb::Matrix< Tmask > &mask, const grb::Matrix< Tin > &input, + const size_t n +) { + const bool emptyMask = grb::nrows( mask ) == 0 || grb::ncols( mask ) == 0; + + std::cout << "\t\t with structural descriptor\n"; + rc = grb::set< descr | descriptors::structural >( output, mask, input ); + if( rc != SUCCESS ) { + std::cerr << "\t grb::set structural (matrix to matrix masked) FAILED\n"; + return rc; + } + if( grb::nnz( output ) != 2 * n - 1 ) { + std::cerr << "\t unexpected number of output elements ( " + << grb::nnz( output ) << " ), expected " << 2 * n - 1 <<".\n"; + rc = FAILED; + } + for( const auto &triplet : output ) { + if( + triplet.first != triplet.second && + triplet.first != triplet.second - 1 + ) { + std::cerr << "\t unexpected entry at ( " << triplet.first << ", " + << triplet.second << " ), no value (pattern matrix).\n"; + rc = FAILED; + } + } + if( rc != SUCCESS ) { return rc; } + + std::cout << "\t\t without descriptor\n"; + rc = grb::set< descr >( output, mask, input ); + if( rc != SUCCESS ) { + std::cerr << "\t grb::set (matrix to matrix masked) FAILED\n"; + return rc; + } + if( emptyMask ) { + if( grb::nnz( output ) != 2 * n - 1 ) { + std::cerr << "\t unexpected number of output elements ( " + << grb::nnz( output ) << " ), expected " << 2 * n - 1 <<".\n"; + rc = FAILED; + } + } else { + if( grb::nnz( output ) != n ) { + std::cerr << "\t unexpected number of output elements ( " + << grb::nnz( output ) << " ), expected " << n <<".\n"; + rc = FAILED; + } + } + for( const auto &triplet : output ) { + if( emptyMask ) { + if( triplet.first != triplet.second && + triplet.first != triplet.second - 1 + ) { + std::cerr << "\t unexpected entry at ( " << triplet.first << ", " + << triplet.second << " ), no value (pattern matrix).\n"; + rc = FAILED; + } + } else { + if( triplet.first != triplet.second ) { + std::cerr << "\t unexpected entry at ( " << triplet.first << ", " + << triplet.second << " ), no value (pattern matrix).\n"; + rc = FAILED; + } + } + } + if( rc != SUCCESS ) { return rc; } + + // done + return rc; +} + +/** Implementation of masked tests for non-void masks (nvm). 
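+ *
+ * Only non-void (valued) masks are exercised here: inverting a void mask would
+ * amount to structural mask inversion, which the masked set primitive forbids.
+ * The void-mask dispatch below therefore skips the invert_mask tests.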
*/ +template< + Descriptor descr = descriptors::no_operation, + typename Tout, typename Tmask, typename Tin +> +RC masked_tests_nvm_impl( + RC &rc, grb::Matrix< Tout > &output, + const grb::Matrix< Tmask > &mask, const grb::Matrix< Tin > &input, + const size_t n +) { + const bool emptyMask = grb::nrows( mask ) == 0 || grb::ncols( mask ) == 0; + + std::cout << "\t\t with invert_mask descriptor\n"; + rc = grb::set< descr | descriptors::invert_mask >( output, mask, input ); + if( rc != SUCCESS ) { + std::cerr << "\t grb::set invert mask (matrix to matrix masked) FAILED\n"; + return rc; + } + if( emptyMask ) { + if( grb::nnz( output ) != 2 * n - 1 ) { + std::cerr << "\t unexpected number of output elements ( " + << grb::nnz( output ) << " ), expected " << 2 * n - 1 <<".\n"; + rc = FAILED; + } + } else { + if( grb::nnz( output ) != n - 1 ) { + std::cerr << "\t unexpected number of output elements ( " + << grb::nnz( output ) << " ), expected " << n - 1 <<".\n"; + rc = FAILED; + } + } + for( const auto &triplet : output ) { + if( emptyMask ) { + if( triplet.first.first != triplet.first.second && + triplet.first.first != triplet.first.second - 1 + ) { + std::cerr << "\t unexpected entry at ( " << triplet.first.first << ", " + << triplet.first.second << " ), value " << triplet.second << ".\n"; + rc = FAILED; + } + } else { + if( triplet.first.first != triplet.first.second - 1 ) { + std::cerr << "\t unexpected entry at ( " << triplet.first.first << ", " + << triplet.first.second << " ), value " << triplet.second << ".\n"; + rc = FAILED; + } + } + if( triplet.first.first != static_cast< size_t >(triplet.second) ) { + std::cerr << "\t unexpected entry at ( " << triplet.first.first << ", " + << triplet.first.second << " ) with value " << triplet.second; + std::cerr << ", expected value "<< triplet.first.first <<".\n"; + rc = FAILED; + } + } + if( rc != SUCCESS ) { return rc; } + + // done + return rc; +} + +/** Specialisation for void output */ +template< + Descriptor descr = descriptors::no_operation, + typename Tmask, typename Tin +> +RC masked_tests_nvm_impl( + RC &rc, grb::Matrix< void > &output, + const grb::Matrix< Tmask > &mask, const grb::Matrix< Tin > &input, + const size_t n +) { + const bool emptyMask = grb::nrows( mask ) == 0 || grb::ncols( mask ) == 0; + + std::cout << "\t\t with invert_mask descriptor\n"; + rc = grb::set< descr | descriptors::invert_mask >( output, mask, input ); + if( rc != SUCCESS ) { + std::cerr << "\t grb::set invert mask (matrix to matrix masked) FAILED\n"; + return rc; + } + if( emptyMask ) { + if( grb::nnz( output ) != 2 * n - 1 ) { + std::cerr << "\t unexpected number of output elements ( " + << grb::nnz( output ) << " ), expected " << 2 * n - 1 <<".\n"; + rc = FAILED; + } + } else { + if( grb::nnz( output ) != n - 1 ) { + std::cerr << "\t unexpected number of output elements ( " + << grb::nnz( output ) << " ), expected " << n - 1 <<".\n"; + rc = FAILED; + } + } + for( const auto &triplet : output ) { + if( emptyMask ) { + if( triplet.first != triplet.second && + triplet.first != triplet.second - 1 + ) { + std::cerr << "\t unexpected entry at ( " << triplet.first << ", " + << triplet.second << " ), no value (pattern matrix).\n"; + rc = FAILED; + } + } else { + if( triplet.first != triplet.second - 1 ) { + std::cerr << "\t unexpected entry at ( " << triplet.first << ", " + << triplet.second << " ), no value (pattern matrix).\n"; + rc = FAILED; + } + } + } + if( rc != SUCCESS ) { return rc; } + + // done + return rc; +} + +/** Dispatch for generic masked tests */ 
+template< typename Tout, typename Tmask, typename Tin >
+RC masked_tests(
+	RC &rc, grb::Matrix< Tout > &output,
+	const grb::Matrix< Tmask > &mask, const grb::Matrix< Tin > &input,
+	const size_t n
+) {
+	if( masked_tests_generic_impl( rc, output, mask, input, n ) != SUCCESS ) {
+		return rc;
+	}
+	return masked_tests_nvm_impl( rc, output, mask, input, n );
+}
+
+/** Specialised dispatch for masked tests with void masks */
+template< typename Tout, typename Tin >
+RC masked_tests(
+	RC &rc, grb::Matrix< Tout > &output,
+	const grb::Matrix< void > &mask, const grb::Matrix< Tin > &input,
+	const size_t n
+) {
+	const grb::RC ret = masked_tests_generic_impl( rc, output, mask, input, n );
+	std::cout << "\t\t invert_mask descriptor SKIPPED\n";
+	return ret;
+}
+
+/** Specialised dispatch for masked tests with no-cast domains */
+template< typename T >
+RC masked_tests(
+	RC &rc, grb::Matrix< T > &output,
+	const grb::Matrix< bool > &mask, const grb::Matrix< T > &input,
+	const size_t n
+) {
+	if( masked_tests_generic_impl( rc, output, mask, input, n ) != SUCCESS ) {
+		return rc;
+	}
+	if( masked_tests_nvm_impl( rc, output, mask, input, n ) != SUCCESS ) {
+		return rc;
+	}
+	std::cout << "\t re-running previous tests with no_casting descriptor\n";
+	if(
+		masked_tests_generic_impl< descriptors::no_casting >(
+			rc, output, mask, input, n
+		) != SUCCESS
+	) {
+		return rc;
+	}
+	return masked_tests_nvm_impl< descriptors::no_casting >(
+		rc, output, mask, input, n );
+}
+
 void grb_program( const size_t &n, grb::RC &rc ) {
-	// initialize test
+	// initialise non-masked test containers
+	grb::Matrix< double > A( n, n );
+	grb::Matrix< double > B( n, n );
+	grb::Matrix< void > C( n, n );
+	grb::Matrix< void > D( n, n );
+	grb::Matrix< unsigned int > E( n, n );
+
+	// initialise masked test containers
+	grb::Matrix< int > mask( n, n );
+	grb::Matrix< bool > maskBool( n, n );
+	grb::Matrix< void > maskVoid( n, n );
+	grb::Matrix< double > maskEmpty( 0, 0 );
+	grb::Matrix< int > input( n, n );
+	grb::Matrix< void > inputVoid( n, n );
+	grb::Matrix< float > inputFloat( n, n );
+	grb::Matrix< int > output( n, n );
+	grb::Matrix< void > outputVoid( n, n );
+
+	// initialise non-masked test data
 	int chk[ 10 ][ 10 ];
 	for( size_t i = 0; i < 10; ++i ) {
 		for( size_t j = 0; j < 10; ++j ) {
@@ -38,21 +387,17 @@ void grb_program( const size_t &n, grb::RC &rc ) {
 	for( size_t k = 0; k < 15; ++k ) {
 		chk[ I[ k ] ][ J[ k ] ] = data1[ k ];
 	}
-	grb::Matrix< double > A( n, n );
-	grb::Matrix< double > B( n, n );
-	grb::Matrix< void > C( n, n );
-	grb::Matrix< void > D( n, n );
-	grb::Matrix< unsigned int > E( n, n );
+
 	rc = grb::resize( A, 15 );
 	if( rc == SUCCESS ) {
 		rc = grb::buildMatrixUnique( A, I, J, data1, 15, SEQUENTIAL );
 		for( const auto &triplet : A ) {
 			if( triplet.first.first >= 10 || triplet.first.second >= 10 ) {
-				std::cerr << "\tunexpected entry at A( " << triplet.first.first << ", "
+				std::cerr << "\t unexpected entry at A( " << triplet.first.first << ", "
 					<< triplet.first.second << " ).\n";
 				rc = FAILED;
 			} else if( chk[ triplet.first.first ][ triplet.first.second ] != triplet.second ) {
-				std::cerr << "\tunexpected entry at A( " << triplet.first.first << ", "
+				std::cerr << "\t unexpected entry at A( " << triplet.first.first << ", "
 					<< triplet.first.second << " ) with value " << triplet.second;
 				if( chk[ triplet.first.first ][ triplet.first.second ] == 0 ) {
 					std::cerr << ", expected no entry here.\n";
@@ -64,22 +409,86 @@
 			}
 		}
 	}
-	if( rc == SUCCESS ) {
-		rc = grb::resize( B, 15 );
+	rc = rc ? rc : grb::resize( B, 15 );
+	rc = rc ? rc : grb::resize( C, 15 );
+	rc = rc ? rc : grb::resize( D, 15 );
+	rc = rc ? rc : grb::resize( E, 15 );
+	rc = rc ? rc : grb::resize( output, 15 );
+	if( rc != SUCCESS || grb::nnz( A ) != 15 ) {
+		std::cerr << "\tinitialisation FAILED\n";
+		return;
 	}
-	if( rc == SUCCESS ) {
-		rc = grb::resize( C, 15 );
+
+	// initialise data for masked-set tests
+	//  - mask will be an n by n matrix with its diagonal set to 1 and its
+	//    superdiagonal set to 0.
+	//  - input will be an n by n matrix with each element on its diagonal and
+	//    superdiagonal set to its row index (meaning the entries on row 0 have
+	//    value 0).
+	size_t * const I_mask = new (std::nothrow) size_t[ 2 * n - 1 ];
+	size_t * const J_mask = new (std::nothrow) size_t[ 2 * n - 1 ];
+	int * const mask_vals = new (std::nothrow) int[ 2 * n - 1 ];
+	int * const input_vals = new (std::nothrow) int[ 2 * n - 1 ];
+	if( I_mask == nullptr || J_mask == nullptr || mask_vals == nullptr ||
+		input_vals == nullptr
+	) {
+		std::cerr << "\t initialisation FAILED\n";
+		return;
 	}
-	if( rc == SUCCESS ) {
-		rc = grb::resize( D, 15 );
+	for( size_t k = 0; k < n; ++k ) {
+		I_mask[ k ] = J_mask[ k ] = k;
+		mask_vals[ k ] = 1;
+		input_vals[ k ] = static_cast< int >( k );
+		if( k < n - 1 ) {
+			I_mask[ n + k ] = k;
+			J_mask[ n + k ] = k + 1;
+			mask_vals[ n + k ] = 0;
+			input_vals[ n + k ] = static_cast< int >( k );
+		}
 	}
-	if( rc == SUCCESS ) {
-		rc = grb::resize( E, 15 );
+	rc = grb::buildMatrixUnique( mask, I_mask, J_mask, mask_vals, 2 * n - 1,
+		SEQUENTIAL );
+	if( rc != SUCCESS ) {
+		std::cerr << "\t buildMatrixUnique of mask matrix FAILED\n";
+		return;
 	}
-	if( rc != SUCCESS || grb::nnz( A ) != 15 ) {
-		std::cerr << "\tinitialisation FAILED\n";
+	rc = grb::buildMatrixUnique( maskBool, I_mask, J_mask, mask_vals, 2 * n - 1,
+		SEQUENTIAL );
+	if( rc != SUCCESS ) {
+		std::cerr << "\t buildMatrixUnique of maskBool matrix FAILED\n";
+		return;
+	}
+	try {
+		maskVoid = grb::algorithms::matrices< void >::identity( n );
+	} catch( ... ) {
+		std::cerr << "\t constructing maskVoid FAILED\n";
+		return;
+	}
+	rc = grb::buildMatrixUnique( input, I_mask, J_mask, input_vals, 2 * n - 1,
+		SEQUENTIAL );
+	if( rc != SUCCESS ) {
+		std::cerr << "\t buildMatrixUnique of input matrix FAILED\n";
+		return;
+	}
+	rc = grb::buildMatrixUnique( inputFloat, I_mask, J_mask, input_vals, 2 * n - 1,
+		SEQUENTIAL );
+	if( rc != SUCCESS ) {
+		std::cerr << "\t buildMatrixUnique of inputFloat matrix FAILED\n";
+		return;
+	}
+	rc = grb::resize( inputVoid, 2 * n - 1 );
+	rc = rc ? 
rc : grb::resize( outputVoid, 2 * n - 1 ); + if( rc != SUCCESS ) { + std::cerr << "\t error resizing matrices for masked tests\n"; return; } + // postpone materialisation of inputVoid since it relies on unmasked grb::set + // (which is itself unit-tested later) + + delete [] I_mask; + delete [] J_mask; + delete [] mask_vals; + delete [] input_vals; std::cout << "\t test initialisation complete\n"; // check grb::set for non-voids @@ -96,11 +505,11 @@ void grb_program( const size_t &n, grb::RC &rc ) { } for( const auto &triplet : B ) { if( triplet.first.first >= 10 || triplet.first.second >= 10 ) { - std::cerr << "\tunexpected entry at B( " << triplet.first.first << ", " + std::cerr << "\t unexpected entry at B( " << triplet.first.first << ", " << triplet.first.second << " ).\n"; rc = FAILED; } else if( chk[ triplet.first.first ][ triplet.first.second ] != triplet.second ) { - std::cerr << "\tunexpected entry at B( " << triplet.first.first << ", " + std::cerr << "\t unexpected entry at B( " << triplet.first.first << ", " << triplet.first.second << " ) with value " << triplet.second; if( chk[ triplet.first.first ][ triplet.first.second ] == 0 ) { std::cerr << ", expected no entry here.\n"; @@ -175,14 +584,14 @@ void grb_program( const size_t &n, grb::RC &rc ) { } for( const auto &triplet : E ) { if( triplet.first.first >= 10 || triplet.first.second >= 10 ) { - std::cerr << "\tunexpected entry at E( " << triplet.first.first << ", " + std::cerr << "\t unexpected entry at E( " << triplet.first.first << ", " << triplet.first.second << " ), value " << triplet.second << ".\n"; rc = FAILED; } else if( static_cast< unsigned int >( chk[ triplet.first.first ][ triplet.first.second ] ) != triplet.second ) { - std::cerr << "\tunexpected entry at E( " << triplet.first.first << ", " + std::cerr << "\t unexpected entry at E( " << triplet.first.first << ", " << triplet.first.second << " ) with value " << triplet.second; if( chk[ triplet.first.first ][ triplet.first.second ] == 0 ) { std::cerr << ", expected no entry here.\n"; @@ -209,11 +618,11 @@ void grb_program( const size_t &n, grb::RC &rc ) { } for( const auto &triplet : E ) { if( triplet.first.first >= 10 || triplet.first.second >= 10 ) { - std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", " + std::cerr << "\t unexpected entry at ( " << triplet.first.first << ", " << triplet.first.second << " ), value " << triplet.second << ".\n"; rc = FAILED; } else if( 117 != triplet.second ) { - std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", " + std::cerr << "\t unexpected entry at ( " << triplet.first.first << ", " << triplet.first.second << " ) with value " << triplet.second; if( chk[ triplet.first.first ][ triplet.first.second ] == 0 ) { std::cerr << ", expected no entry here.\n"; @@ -224,6 +633,109 @@ void grb_program( const size_t &n, grb::RC &rc ) { } } if( rc != SUCCESS ) { return; } + + // check masked matrix set + // first, finish initialisation + rc = grb::set( inputVoid, input ); + rc = rc ? 
rc : grb::resize( output, 2 * n - 1 ); + if( rc != SUCCESS || grb::nnz( inputVoid ) != 2 * n - 1 ) { + std::cerr << "\t error in inputVoid (an earlier test likely failed)\n"; + if( rc == SUCCESS ) { rc = FAILED; } + return; + } + + std::cout << "\t testing set( matrix, mask, matrix ), non-void, no-cast, " + << "empty mask\n"; + if( masked_tests( rc, output, maskEmpty, input, n ) != grb::SUCCESS ) { + return; + } + + std::cout << "\t testing set( matrix, mask, matrix ), non-void, no-cast, " + << "non-empty mask\n"; + if( masked_tests( rc, output, mask, input, n ) != grb::SUCCESS ) { return; } + + std::cout << "\t testing set( matrix, mask, matrix ), non-void, no-cast, " + << "non-empty Boolean mask\n"; + if( masked_tests( rc, output, maskBool, input, n ) != grb::SUCCESS ) { + return; + } + + std::cout << "\t testing set( matrix, mask, matrix ), non-void, casting from " + << "float to int, empty mask\n"; + if( masked_tests( rc, output, maskEmpty, inputFloat, n ) != grb::SUCCESS ) { + return; + } + + std::cout << "\t testing set( matrix, mask, matrix ), non-void, casting from " + << "float to int, non-empty mask\n"; + if( masked_tests( rc, output, mask, inputFloat, n ) != grb::SUCCESS ) { + return; + } + + std::cout << "\t testing set( matrix, mask, matrix ), non-void, casting from " + << "float to int, non-empty Boolean mask\n"; + if( masked_tests( rc, output, maskBool, inputFloat, n ) != grb::SUCCESS ) { + return; + } + + std::cout << "\t testing set( matrix, mask, matrix ), void-to-void (no cast), " + << "empty mask\n"; + if( masked_tests( rc, outputVoid, maskEmpty, inputVoid, n ) != grb::SUCCESS ) { + return; + } + + std::cout << "\t testing set( matrix, mask, matrix ), void-to-void (no cast), " + << "non-empty mask\n"; + if( masked_tests( rc, outputVoid, mask, inputVoid, n ) != grb::SUCCESS ) { + return; + } + + std::cout << "\t testing set( matrix, mask, matrix ), void-to-void (no cast), " + << "non-empty Boolean mask\n"; + if( masked_tests( rc, outputVoid, maskBool, inputVoid, n ) != grb::SUCCESS ) { + return; + } + + std::cout << "\t testing set( matrix, mask, matrix ), int-to-void, casting " + << "(sort of), empty mask\n"; + if( masked_tests( rc, outputVoid, maskEmpty, input, n ) != grb::SUCCESS ) { + return; + } + + std::cout << "\t testing set( matrix, mask, matrix ), int-to-void, casting " + << "(sort of), non-empty mask\n"; + if( masked_tests( rc, outputVoid, mask, input, n ) != grb::SUCCESS ) { + return; + } + + std::cout << "\t testing set( matrix, mask, matrix ), int-to-void, casting " + << "(sort of), Boolean mask\n"; + if( masked_tests( rc, outputVoid, maskBool, input, n ) != grb::SUCCESS ) { + return; + } + + std::cout << "\t testing set( matrix, mask, matrix ), float-to-void, casting " + << "(sort of), empty mask\n"; + if( + masked_tests( rc, outputVoid, maskEmpty, inputFloat, n ) != + grb::SUCCESS + ) { + return; + } + + std::cout << "\t testing set( matrix, mask, matrix ), float-to-void, casting " + << "(sort of), non-empty mask\n"; + if( masked_tests( rc, outputVoid, mask, inputFloat, n ) != grb::SUCCESS ) { + return; + } + + std::cout << "\t testing set( matrix, mask, matrix ), float-to-void, casting " + << "(sort of), Boolean mask\n"; + if( masked_tests( rc, outputVoid, maskBool, inputFloat, n ) != grb::SUCCESS ) { + return; + } + + // done } int main( int argc, char ** argv ) { diff --git a/tests/unit/unittests.sh b/tests/unit/unittests.sh index 0b49d2421..6abb78e21 100755 --- a/tests/unit/unittests.sh +++ b/tests/unit/unittests.sh @@ -484,6 +484,12 @@ for MODE in 
${MODES}; do grep "Test OK" ${TEST_OUT_DIR}/matrixSet_${MODE}_${BACKEND}_${P}_${T}.log || echo "Test FAILED" echo " " + echo ">>> [x] [ ] Testing grb::set (matrices), size 10 000" + $runner ${TEST_BIN_DIR}/matrixSet_${MODE}_${BACKEND} 10000 2> ${TEST_OUT_DIR}/matrixSet_large_${MODE}_${BACKEND}_${P}_${T}.err 1> ${TEST_OUT_DIR}/matrixSet_large_${MODE}_${BACKEND}_${P}_${T}.log + head -1 ${TEST_OUT_DIR}/matrixSet_large_${MODE}_${BACKEND}_${P}_${T}.log + grep "Test OK" ${TEST_OUT_DIR}/matrixSet_large_${MODE}_${BACKEND}_${P}_${T}.log || echo "Test FAILED" + echo " " + echo ">>> [x] [ ] Testing grb::set (matrix, value)" $runner ${TEST_BIN_DIR}/setMatrixValue_${MODE}_${BACKEND} 2> ${TEST_OUT_DIR}/setMatrixValue_${MODE}_${BACKEND}_${P}_${T}.err 1> ${TEST_OUT_DIR}/setMatrixValue_${MODE}_${BACKEND}_${P}_${T}.log head -1 ${TEST_OUT_DIR}/setMatrixValue_${MODE}_${BACKEND}_${P}_${T}.log