diff --git a/include/graphblas/base/blas2.hpp b/include/graphblas/base/blas2.hpp
index 3b5e5df4e..2d61d46d4 100644
--- a/include/graphblas/base/blas2.hpp
+++ b/include/graphblas/base/blas2.hpp
@@ -337,6 +337,149 @@ namespace grb {
 		return UNSUPPORTED;
 	}
 
+	/**
+	 * Computes \f$ x \f$ from \f$ Tx=b \f$.
+	 *
+	 * \warning This is an experimental feature.
+	 *
+	 * Here, \f$ T \f$ and \f$ b \f$ are given, while \f$ T \f$ must additionally
+	 * be either lower- or upper-triangular. The output \f$ x \f$ may furthermore
+	 * be masked.
+	 *
+	 * @param[in,out] xb On input: the vector b. On output: the vector x.
+	 *                   When #grb::EXECUTE is given, this vector must have
+	 *                   sufficient capacity to store both any pre-existing
+	 *                   nonzeroes in \f$ b \f$ plus the nonzeroes in the (masked)
+	 *                   result of \f$ x \f$.
+	 *                   When #grb::RESIZE is given, the vector contents are
+	 *                   unchanged; however, its capacity may be enlarged in order
+	 *                   to ensure the above-described condition.
+	 *
+	 * \note On input, \a xb may contain both implicit and explicit zeroes.
+	 *
+	 * \note On output, \a xb may contain both implicit and explicit zeroes, even
+	 *       if on input it did not.
+	 *
+	 * @param[in] mask The mask that acts on \f$ x \f$. This vector has either
+	 *                 size zero or size equal to that of \a xb.
+	 *
+	 * \note If \a mask has size zero, it is interpreted as though the operation
+	 *       is unmasked.
+	 *
+	 * @param[in] T The upper- or lower-triangular input matrix. This must be
+	 *              a square matrix with size equal to that of \a xb.
+	 * @param[in] forward Whether to perform forward substitution (i.e., \f$ T \f$
+	 *                    is lower-triangular), or to perform backward substitution
+	 *                    instead (i.e., \f$ T \f$ is upper-triangular).
+	 *
+	 * \note With the structural information ALP/Dense passes as template
+	 *       information, \a forward would not be required.
+	 *
+	 * @param[in] semiring The semiring under which to perform the SpTRSV.
+	 * @param[in] subtraction The inverse of the additive operator of \a semiring.
+	 * @param[in] division The inverse of the multiplicative operator of
+	 *                     \a semiring.
+	 *
+	 * \note That is, this operation in fact requires a field structure and not a
+	 *       semiring. A future extension to ALP may provide such a structure
+	 *       explicitly.
+	 *
+	 * @param[in] phase The requested phase of the computation. Only
+	 *                  #grb::EXECUTE and #grb::RESIZE are supported.
+	 *
+	 * \todo Expand documentation.
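+	 *
+	 * \par Example
+	 * A minimal illustrative sketch of a forward solve over the reals. This is
+	 * not normative documentation: the names \a L, \a x, and \a n are
+	 * hypothetical, and the usual plus-times semiring with matching subtraction
+	 * and division operators is assumed.
+	 * \code
+	 * grb::Semiring<
+	 *     grb::operators::add< double >, grb::operators::mul< double >,
+	 *     grb::identities::zero, grb::identities::one
+	 * > reals;
+	 * grb::operators::subtract< double > minus;
+	 * grb::operators::divide< double > div;
+	 * grb::Vector< double > xb( n );  // contains b on input, x on output
+	 * grb::Vector< bool > mask( 0 );  // a size-zero mask: the solve is unmasked
+	 * // forward == true, hence L must be lower-triangular
+	 * grb::RC rc = grb::sptrsv( xb, mask, L, true, reals, minus, div );
+	 * \endcode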
+	 */
+	template<
+		Descriptor descr = descriptors::no_operation,
+		class Semiring,
+		class Subtraction,
+		class Division,
+		typename IOType, typename InputType1, typename InputType2,
+		typename Coords, typename RIT, typename CIT, typename NIT,
+		Backend backend
+	>
+	RC sptrsv(
+		Vector< IOType, backend, Coords > &xb,
+		const Vector< InputType2, backend, Coords > &mask,
+		const Matrix< InputType1, backend, RIT, CIT, NIT > &T,
+		const bool forward,
+		const Semiring &semiring = Semiring(),
+		const Subtraction &subtraction = Subtraction(),
+		const Division &division = Division(),
+		const Phase &phase = EXECUTE,
+		const typename std::enable_if<
+			grb::is_semiring< Semiring >::value &&
+			grb::is_operator< Subtraction >::value &&
+			grb::is_operator< Division >::value &&
+			!grb::is_object< IOType >::value &&
+			!grb::is_object< InputType1 >::value &&
+			!grb::is_object< InputType2 >::value,
+		void >::type * const = nullptr
+	) {
+#ifdef _DEBUG
+		std::cerr << "Selected backend does not implement masked grb::sptrsv\n";
+#endif
+#ifndef NDEBUG
+		const bool selected_backend_does_not_support_masked_sptrsv = false;
+		assert( selected_backend_does_not_support_masked_sptrsv );
+#endif
+		(void) xb;
+		(void) mask;
+		(void) T;
+		(void) forward;
+		(void) semiring;
+		(void) subtraction;
+		(void) division;
+		(void) phase;
+		return UNSUPPORTED;
+	}
+
+	/**
+	 * Computes \f$ x \f$ from \f$ Tx = b \f$, unmasked variant.
+	 *
+	 * \warning This is an experimental feature.
+	 *
+	 * \todo Extend documentation.
+	 */
+	template<
+		Descriptor descr = descriptors::no_operation,
+		class Semiring,
+		class Subtraction,
+		class Division,
+		typename IOType, typename InputType1,
+		typename Coords, typename RIT, typename CIT, typename NIT,
+		Backend backend
+	>
+	RC sptrsv(
+		Vector< IOType, backend, Coords > &xb,
+		const Matrix< InputType1, backend, RIT, CIT, NIT > &T,
+		const bool forward,
+		const Semiring &semiring = Semiring(),
+		const Subtraction &subtraction = Subtraction(),
+		const Division &division = Division(),
+		const Phase &phase = EXECUTE,
+		const typename std::enable_if<
+			grb::is_semiring< Semiring >::value &&
+			grb::is_operator< Subtraction >::value &&
+			grb::is_operator< Division >::value &&
+			!grb::is_object< IOType >::value &&
+			!grb::is_object< InputType1 >::value,
+		void >::type * const = nullptr
+	) {
+#ifdef _DEBUG
+		std::cerr << "Selected backend does not implement unmasked grb::sptrsv\n";
+#endif
+#ifndef NDEBUG
+		const bool selected_backend_does_not_support_unmasked_sptrsv = false;
+		assert( selected_backend_does_not_support_unmasked_sptrsv );
+#endif
+		(void) xb;
+		(void) T;
+		(void) forward;
+		(void) semiring;
+		(void) subtraction;
+		(void) division;
+		(void) phase;
+		return UNSUPPORTED;
+	}
+
 	/**
 	 * Executes an arbitrary element-wise user-defined function \a f on all
 	 * nonzero elements of a given matrix \a A.
diff --git a/include/graphblas/reference/SptrsvSchedule.hpp b/include/graphblas/reference/SptrsvSchedule.hpp
new file mode 100644
index 000000000..ea25e8229
--- /dev/null
+++ b/include/graphblas/reference/SptrsvSchedule.hpp
@@ -0,0 +1,236 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * @author A. N. Yzelman
+ * @date 4th of April, 2025
+ */
+
+#ifndef _H_GRB_REFERENCE_SPTRSVSCHEDULE
+#define _H_GRB_REFERENCE_SPTRSVSCHEDULE
+
+#include <limits>
+#include <vector>
+
+#include <graphblas/utils/alloc.hpp>
+
+
+namespace grb {
+
+	namespace internal {
+
+		template< typename NIT >
+		class SptrsvSchedule {
+
+			private:
+
+				/** One deleter for each chunk of thread-local data. */
+				std::vector< utils::AutoDeleter< char > > _deleters;
+
+				/**
+				 * Fixed-size buffer for realising the default schedule.
+				 *
+				 * The first two positions are reserved for the range of the default
+				 * schedule, while the last position is reserved for the end-position of
+				 * the default schedule.
+				 */
+				NIT default_schedule[ 3 ];
+
+				/** Computes and initialises a trivial schedule. */
+				void initTrivial( const NIT n ) {
+					// dynamic sanity check
+					assert( data[ 0 ] == nullptr );
+					// set trivial schedule
+					default_schedule[ 0 ] = 0;
+					default_schedule[ 1 ] = n;
+					default_schedule[ 2 ] = 1;
+					nRanges[ 0 ] = 1;
+					// set pointers to trivial schedule
+					data[ 0 ] = reinterpret_cast< char * >( &(default_schedule[0]) );
+					endPositions[ 0 ] = reinterpret_cast< char * >( &(default_schedule[2]) );
+					// verify trivial schedule
+#ifndef NDEBUG
+					{
+						NIT * const interpreted = reinterpret_cast< NIT * >(data[0]);
+						assert( interpreted[ 0 ] == 0 );
+						assert( interpreted[ 1 ] == n );
+					}
+					{
+						NIT * const interpreted = reinterpret_cast< NIT * >(endPositions[0]);
+						assert( *interpreted == 1 );
+					}
+#endif
+					// note that this is a trivial schedule
+					is_simple = true;
+
+					// the default schedule does not (cannot) sort input matrices
+					is_sorted = false;
+				}
+
+				/**
+				 * Implementation of move construction and assignment.
+				 */
+				void moveImpl( SptrsvSchedule &&toMove ) {
+					_deleters = std::move( toMove._deleters );
+					// raw arrays cannot be (move-)assigned; copy the three entries instead
+					default_schedule[ 0 ] = toMove.default_schedule[ 0 ];
+					default_schedule[ 1 ] = toMove.default_schedule[ 1 ];
+					default_schedule[ 2 ] = toMove.default_schedule[ 2 ];
+					is_sorted = toMove.is_sorted;
+					is_simple = toMove.is_simple;
+					supersteps = toMove.supersteps;
+					nThreads = toMove.nThreads;
+					data = std::move( toMove.data );
+					endPositions = std::move( toMove.endPositions );
+					nRanges = std::move( toMove.nRanges );
+					// if the moved-from instance used its trivial schedule, the moved
+					// pointers refer into toMove.default_schedule and would dangle;
+					// re-point them to our own copy of that schedule
+					if( data.size() > 0 && data[ 0 ] ==
+						reinterpret_cast< char * >( &(toMove.default_schedule[0]) )
+					) {
+						data[ 0 ] = reinterpret_cast< char * >( &(default_schedule[0]) );
+						endPositions[ 0 ] = reinterpret_cast< char * >( &(default_schedule[2]) );
+					}
+					toMove.is_sorted = true;
+					toMove.is_simple = false;
+					toMove.supersteps = 0;
+					toMove.nThreads = 0;
+				}
+
+
+			public:
+
+				/** Whether the matrix is sorted as part of tuning. */
+				bool is_sorted;
+
+				/** Whether the schedule is simple. */
+				bool is_simple;
+
+				/** Number of schedule steps. */
+				size_t supersteps;
+
+				/**
+				 * The number of threads the schedule is designed for.
+				 */
+				size_t nThreads;
+
+				/** One data pointer per thread. */
+				std::vector< char * > data;
+
+				/** One end-position array per thread. */
+				std::vector< char * > endPositions;
+
+				/** The total number of ranges per thread. */
+				std::vector< NIT > nRanges;
+
+				/** Move constructor. */
+				SptrsvSchedule( SptrsvSchedule &&toMove ) {
+					moveImpl( std::move( toMove ) );
+				}
+
+				/**
+				 * Base constructor.
+				 *
+				 * @param[in] n The size of the linear system.
+				 *
+				 * Initialises the schedule to the trivial always-valid schedule, which is
+				 * a single-thread single-superstep schedule that assigns all work to the
+				 * first (and only) thread.
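+				 *
+				 * The trivial schedule is laid out as follows (a sketch of the
+				 * invariants that #initTrivial establishes): \a data[ 0 ] points to the
+				 * pair \f$ (0, n) \f$, i.e., a single range covering all rows, while
+				 * \a endPositions[ 0 ] points to a single entry with value one, i.e.,
+				 * one range in the one and only superstep:
+				 * \code
+				 * SptrsvSchedule< size_t > schedule( n );
+				 * const size_t * const ranges =
+				 *     reinterpret_cast< size_t * >( schedule.data[ 0 ] );
+				 * assert( ranges[ 0 ] == 0 );  // lower bound of the only range
+				 * assert( ranges[ 1 ] == n );  // number of rows in that range
+				 * \endcode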
+				 */
+				SptrsvSchedule( const NIT n ) :
+					_deleters( 1 ), is_sorted( true ), is_simple( false ),
+					supersteps( 1 ), nThreads( 1 ),
+					data( 1 ), endPositions( 1 ), nRanges( 1 )
+				{
+					data[ 0 ] = nullptr;
+					endPositions[ 0 ] = nullptr;
+					initTrivial( n );
+				}
+
+				/**
+				 * Specialised constructor for multiple threads.
+				 *
+				 * @param[in] n The size of the linear system.
+				 * @param[in] T The number of threads a schedule should be created for.
+				 *
+				 * Initialises the schedule to the trivial always-valid schedule, which is
+				 * a single-superstep schedule that assigns all work to the first thread.
+				 */
+				SptrsvSchedule( const NIT n, const size_t T ) :
+					_deleters( 2 * T ), is_sorted( true ), is_simple( false ),
+					supersteps( 1 ), nThreads( T ),
+					data( T, nullptr ), endPositions( T, nullptr ), nRanges( T, 0 )
+				{
+					if( T == 0 ||
+						T > static_cast< size_t >( std::numeric_limits< int >::max() )
+					) {
+						throw std::runtime_error( "Invalid number of threads" );
+					}
+					initTrivial( n );
+				}
+
+				/**
+				 * Base destructor.
+				 *
+				 * Dynamic memory is freed through the #_deleters.
+				 */
+				~SptrsvSchedule() {
+					// sanity checks only
+					assert( nThreads != 0 );
+					if( nThreads == 1 ) {
+						assert( supersteps == 1 );
+					}
+				}
+
+				/** Move assignment. */
+				SptrsvSchedule& operator=( SptrsvSchedule &&toMove ) {
+					moveImpl( std::move( toMove ) );
+					return *this;
+				}
+
+				/**
+				 * Allocates a thread-local chunk of data.
+				 *
+				 * Must be called from within the thread that will use it(!)
+				 */
+				void alloc( const size_t s, const size_t nRanges_in ) {
+					assert( s < nThreads );
+					assert( supersteps > 0 );
+					assert( _deleters.size() > s );
+					assert( data.size() > s );
+					assert( data[ s ] == nullptr );
+					assert( endPositions[ s ] == nullptr );
+					if( nRanges[ s ] != nRanges_in ) {
+						throw std::runtime_error( "Given nRanges differs from the count "
+							"recorded at schedule allocation time" );
+					}
+					grb::RC rc = grb::SUCCESS;
+					if( nRanges_in > 0 ) {
+						rc = utils::alloc(
+							"grb::internal::SptrsvSchedule (alloc)",
+							"default thread-local data allocation, variant I",
+							data[ s ], 2 * nRanges_in * sizeof(NIT), false, _deleters[ s ],
+							endPositions[ s ], supersteps * sizeof( NIT ), false,
+							_deleters[ s + nThreads ]
+						);
+					} else {
+						data[ s ] = nullptr;
+						rc = utils::alloc(
+							"grb::internal::SptrsvSchedule (alloc)",
+							"default thread-local data allocation, variant II",
+							endPositions[ s ], supersteps * sizeof( NIT ), false,
+							_deleters[ s + nThreads ]
+						);
+					}
+					if( rc != grb::SUCCESS ) {
+						throw std::bad_alloc();
+					}
+				}
+
+		};
+
+	} // end namespace `grb::internal'
+
+} // end namespace `grb'
+
+#endif // _H_GRB_REFERENCE_SPTRSVSCHEDULE
+
diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp
index 7a4d3e135..41fac2d0b 100644
--- a/include/graphblas/reference/blas2.hpp
+++ b/include/graphblas/reference/blas2.hpp
@@ -2216,6 +2216,658 @@ namespace grb {
 
 	namespace internal {
 
+#ifndef _H_GRB_REFERENCE_OMP_BLAS2
+		/**
+		 * Backend-independent sptrsv kernel code.
+		 *
+		 * @tparam sorted  0: the matrix rows are not sorted,
+		 *                 1: the matrix rows have diagonal elements at position 0,
+		 *                 2: the matrix rows are sorted.
+		 *
+		 * @tparam forward True: perform forward substitution.
+		 *                 False: perform backward substitution.
+		 *
+		 * \note The above description assumes CRS.
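+		 *
+		 * \par Illustration (CRS, forward substitution)
+		 * Consider a lower-triangular row \f$ i \f$ with column indices
+		 * \f$ \{ 0, 2, i \} \f$. With \a sorted = 0, all three entries are
+		 * scanned and the diagonal is detected on the fly; with \a sorted = 1,
+		 * the entry at position \a col_start[ i ] must be the diagonal and is
+		 * skipped; with \a sorted = 2, the row is sorted by column index, hence
+		 * the diagonal is the last entry of the row and the scan stops one
+		 * position early.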
+		 */
+		template<
+			int sorted, bool forward,
+			typename IOType, typename InputType1,
+			typename IND, typename NIT,
+			class Semiring, class Subtraction
+		>
+		inline void sptrsv_kernel(
+			const Compressed_Storage< InputType1, IND, NIT > &crs,
+			const size_t &i,
+			IOType *__restrict__ const &x,
+			IOType &divBy,
+			const Semiring &semiring,
+			const Subtraction &subtraction
+		) {
+			// static assertions
+			static_assert( sorted >= 0 && sorted < 3, "Illegal value for sorted; this "
+				"is an internal error, please submit a bug report" );
+			constexpr auto one =
+				Semiring::template One< typename Semiring::D1 >::value();
+
+			// NOTE: explicit vectorisation seems to cause significant slowdowns.
+			//       disabled but retained for future reference / more successful
+			//       optimisation
+			/*
+			// constants
+			constexpr size_t ind_blocksize =
+				grb::config::SIMD_BLOCKSIZE< IND >::value();
+			constexpr size_t blocksize = ind_blocksize < Semiring::blocksize
+				? ind_blocksize : Semiring::blocksize;
+			// constant buffer
+			IND cur_ind[ blocksize ];
+			// SIMD set
+			for( size_t r = 0; r < blocksize; ++r ) {
+				cur_ind[ r ] = i;
+			}
+			for( ; k + Semiring::blocksize < crs.col_start[ i + 1 ];
+				k += Semiring::blocksize
+			) {
+				typename Semiring::D1 values[ blocksize ];
+				typename Semiring::D3 tmp[ blocksize ];
+				IOType x_simd[ blocksize ];
+				IND indices[ blocksize ];
+				bool match[ blocksize ];
+				// streaming loads
+				for( size_t r = 0; r < blocksize; ++r ) {
+					values[ r ] = crs.template getValue( k + r, one );
+				}
+				for( size_t r = 0; r < blocksize; ++r ) {
+					indices[ r ] = crs.row_index[ k + r ];
+				}
+				// SIMD compute
+				for( size_t r = 0; r < blocksize; ++r ) {
+					match[ r ] = cur_ind[ r ] != indices[ r ];
+				}
+				// gather
+				for( size_t r = 0; r < blocksize; ++r ) {
+					if( match[ r ] ) {
+						x_simd[ r ] = x[ indices[ r ] ];
+					}
+				}
+				// masked SIMD compute
+				for( size_t r = 0; r < blocksize; ++r ) {
+					if( match[ r ] ) {
+						tmp[ r ] = values[ r ] * x_simd[ r ];
+					}
+				}
+				// masked SIMD reduce
+				for( size_t r = 0; r < blocksize; ++r ) {
+					if( match[ r ] ) {
+						reduced += tmp[ r ];
+					}
+				}
+				// diagonal detection; SIMD invert and reduce
+				for( size_t r = 0; r < blocksize; ++r ) {
+					match[ r ] = !match[ r ];
+				}
+				for( size_t r = 0; r < blocksize; ++r ) {
+					if( match[ r ] ) {
+						divBy += values[ r ];
+					}
+				}
+			}*/
+
+			if( sorted ) {
+				(void) divBy;
+			}
+			assert( crs.col_start[ i + 1 ] > crs.col_start[ i ] );
+			const size_t start = (!sorted) ? crs.col_start[ i ] : (
+				(sorted == 1) ? (crs.col_start[ i ] + 1) : (
+					forward ? crs.col_start[ i ] : (crs.col_start[ i ] + 1) )
+				);
+			const size_t end = (!sorted) ? crs.col_start[ i + 1 ] : (
+				(sorted == 1) ? crs.col_start[ i + 1 ] : (
+					forward ? (crs.col_start[ i + 1 ] - 1) : crs.col_start[ i + 1 ] )
+				);
+			for( size_t k = start; k < end; ++k ) {
+				const typename Semiring::D1 val = crs.template getValue( k, one );
+				const auto &ind = crs.row_index[ k ];
+				if( !sorted ) {
+					if( static_cast< size_t >(ind) == i ) {
+						#ifdef _DEBUG
+							std::cout << "\tskipping nonzero at " << i << ", " << i << "\n";
+						#endif
+						divBy = val;
+						continue;
+					}
+				}
+				#ifdef _DEBUG
+					std::cout << "\t x[ " << i << " ] (" << x[i] << ") -= " << val
+						<< " * x[ " << ind << " ] (" << x[ind] << ")\n";
+				#endif
+				typename Semiring::D3 tmp;
+				(void) grb::apply( tmp, val, x[ ind ],
+					semiring.getMultiplicativeOperator() );
+				(void) grb::foldl( x[ i ], tmp, subtraction );
+			}
+		}
+
+		template<
+			typename IOType, int sorted, bool forward,
+			typename VIT, typename RCIT, typename NIT
+		>
+		inline static IOType getDiagonalEntry(
+			const Compressed_Storage< VIT, RCIT, NIT > &storage, const size_t &i,
+			const IOType &one
+		) {
+			static_assert( sorted >= 1 && sorted < 3, "getDiagonalEntry may only be "
+				"called for sorted storage" );
+			assert( storage.col_start[ i + 1 ] > storage.col_start[ i ] );
+			const size_t diag_index = sorted == 1 ? storage.col_start[ i ] : (
+				forward ? (storage.col_start[ i + 1 ] - 1) : storage.col_start[ i ] );
+			assert( storage.row_index[ diag_index ] == i );
+			return storage.template getValue( diag_index, one );
+		}
+#endif // end ifndef _H_GRB_REFERENCE_OMP_BLAS2
+
+		/** \internal Specialised dense unmasked sptrsv implementation, sequential */
+		template<
+			Descriptor descr, bool maybe_offset, int sorted, bool forward,
+			class Semiring, class Subtraction, class Division,
+			typename IOPtrType, typename InputType1,
+			typename RIT, typename CIT, typename NIT
+		>
+		RC dense_unmasked_sequential_sptrsv(
+			const IOPtrType &v_raw,
+			const Matrix< InputType1, reference, RIT, CIT, NIT > &T,
+			const NIT &offset, const NIT &n,
+			const Semiring &semiring,
+			const Subtraction &subtraction,
+			const Division &division,
+			const Phase &phase
+		) {
+			// static sanity checks
+			static_assert( sorted >= 0 && sorted < 3, "Invalid value for sorted; this "
+				"is an internal error, please submit a bug report" );
+
+			// only the execute phase reaches this point; resize is handled by callers
+			assert( phase == grb::EXECUTE );
+#ifdef NDEBUG
+			(void) phase;
+#endif
+
+			// get value type from the pointer
+			typedef typename std::remove_reference<
+				decltype( *std::declval< IOPtrType >() )
+			>::type IOType;
+
+			// in case of pattern matrices, get 1 from the semiring
+			const IOType one = semiring.getMultiplicativeMonoid().template
+				getIdentity< IOType >();
+
+			// switch forward or backward solve
+			if( forward ) {
+				const auto &crs = internal::getCRS( T );
+				if( !maybe_offset || offset == 0 ) {
+					for( size_t i = 0; i < n; ++i ) {
+						IOType divBy;
+						if( !sorted ) {
+							// in this case we will auto-detect the diagonal item
+							divBy = semiring.template getZero< IOType >();
+						} else {
+							divBy = getDiagonalEntry< IOType, sorted, forward >( crs, i, one );
+						}
+						assert( crs.col_start[ i ] <= crs.col_start[ i + 1 ] );
+						sptrsv_kernel< sorted, forward >( crs, i, v_raw, divBy, semiring,
+							subtraction );
+#ifdef _DEBUG
+						std::cout << "\trow v_raw[ " << i << " ] = " << v_raw[ i ]
+							<< " will be normalised with " << divBy << "\n";
+#endif
+						(void) grb::foldl( v_raw[ i ], divBy, division );
+					}
+				} else {
+					const size_t end = n + offset;
+					for( size_t i = offset; i < end; ++i ) {
+						IOType divBy;
+						if( !sorted ) {
+							divBy = semiring.template getZero< IOType >();
+						} else {
+							divBy = getDiagonalEntry< IOType, sorted, forward >( crs, i, one );
+						}
+						assert( crs.col_start[ i ] <= crs.col_start[ i + 1 ] );
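+						// subtract from v_raw[ i ] the contributions of all already-solved
+						// entries in row i; for unsorted storage (sorted == 0), the kernel
+						// additionally detects the diagonal entry and records it in divBy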
+						sptrsv_kernel< sorted, forward >( crs, i, v_raw, divBy, semiring,
+							subtraction );
+#ifdef _DEBUG
+						std::cout << "\trow v_raw[ " << i << " ] = " << v_raw[ i ]
+							<< " will be normalised with " << divBy << "\n";
+#endif
+						(void) grb::foldl( v_raw[ i ], divBy, division );
+					}
+				}
+			} else {
+				const auto &ccs = internal::getCCS( T );
+				if( !maybe_offset || offset == 0 ) {
+					// note: this loop relies on unsigned wrap-around of i to terminate
+					// after processing i == 0
+					for( size_t i = n - 1; i < n; --i ) {
+						IOType divBy;
+						if( !sorted ) {
+							divBy = semiring.template getZero< IOType >();
+						} else {
+							divBy = getDiagonalEntry< IOType, sorted, forward >( ccs, i, one );
+						}
+						assert( ccs.col_start[ i ] <= ccs.col_start[ i + 1 ] );
+						sptrsv_kernel< sorted, forward >( ccs, i, v_raw, divBy, semiring,
+							subtraction );
+#ifdef _DEBUG
+						std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy
+							<< "\n";
+#endif
+						(void) grb::foldl( v_raw[ i ], divBy, division );
+					}
+				} else {
+					// note: offset > 0 in this branch, hence i >= offset terminates
+					for( size_t i = n - 1 + offset; i >= offset; --i ) {
+						IOType divBy;
+						if( !sorted ) {
+							divBy = semiring.template getZero< IOType >();
+						} else {
+							divBy = getDiagonalEntry< IOType, sorted, forward >( ccs, i, one );
+						}
+						assert( ccs.col_start[ i ] <= ccs.col_start[ i + 1 ] );
+						sptrsv_kernel< sorted, forward >( ccs, i, v_raw, divBy, semiring,
+							subtraction );
+#ifdef _DEBUG
+						std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy
+							<< "\n";
+#endif
+						(void) grb::foldl( v_raw[ i ], divBy, division );
+					}
+				}
+			}
+
+			// done
+			return grb::SUCCESS;
+		}
+
+#ifdef _H_GRB_REFERENCE_OMP_BLAS2
+		/** \internal Specialised dense unmasked sptrsv implementation, OpenMP */
+		template<
+			Descriptor descr, bool forward,
+			class Semiring, class Subtraction, class Division,
+			typename IOType, typename InputType1,
+			typename RIT, typename CIT, typename NIT
+		>
+		RC dense_unmasked_omp_sptrsv(
+			IOType * const &v_raw,
+			const Matrix< InputType1, reference, RIT, CIT, NIT > &T,
+			const size_t &n,
+			const Semiring &semiring,
+			const Subtraction &subtraction,
+			const Division &division,
+			const Phase &phase
+		) {
+			// in dense unmasked, resize is a no-op
+			if( phase == grb::RESIZE ) { return grb::SUCCESS; }
+
+			// get the SpTRSV schedule; fall back to the trivial schedule when no
+			// tuned schedule was attached to T
+			const SptrsvSchedule< NIT > *sptrsvSchedule_p =
+				internal::getSptrsvData( T );
+			const SptrsvSchedule< NIT > default_schedule( n );
+			const SptrsvSchedule< NIT > &sptrsv = sptrsvSchedule_p == nullptr ?
+				default_schedule : *sptrsvSchedule_p;
+
+			// only the execute phase remains at this point
+			assert( phase == grb::EXECUTE );
+
+			// start parallel section
+			grb::RC ret = grb::SUCCESS;
+
+			if( sptrsv.is_simple ) {
+				#pragma omp parallel num_threads(sptrsv.nThreads)
+				{
+					grb::RC local_rc = grb::SUCCESS;
+					const int s = omp_get_thread_num();
+					const NIT *__restrict__ data =
+						reinterpret_cast< const NIT * >(sptrsv.data[ s ]);
+					assert( data != nullptr );
+					if( sptrsv.is_sorted ) {
+						for( size_t i = 0; i < sptrsv.supersteps; ++i ) {
+							const auto &lo = *data++;
+							const auto &no = *data++;
+							assert( lo < n );
+							assert( no + lo <= n );
+							local_rc = local_rc ? local_rc : dense_unmasked_sequential_sptrsv<
+									descr, true, config::tuning::SpTRSV::sortingMode, forward
+								>(
+									v_raw, T, lo, no, semiring, subtraction, division, phase
+								);
+							#pragma omp barrier
+						}
+					} else {
+						for( size_t i = 0; i < sptrsv.supersteps; ++i ) {
+							const NIT &lo = *data++;
+							const NIT &no = *data++;
+							assert( lo < n );
+							assert( no + lo <= n );
+							local_rc = local_rc ? local_rc : dense_unmasked_sequential_sptrsv<
+									descr, true, 0, forward
+								>(
+									v_raw, T, lo, no, semiring, subtraction, division, phase
+								);
+							#pragma omp barrier
+						}
+					}
+					if( local_rc != grb::SUCCESS ) {
+						#pragma omp atomic write
+						ret = local_rc;
+					}
+				}
+			} else {
+				#pragma omp parallel num_threads(sptrsv.nThreads)
+				{
+					grb::RC local_rc = grb::SUCCESS;
+					const int s = omp_get_thread_num();
+					const NIT *__restrict__ data =
+						reinterpret_cast< const NIT * >(sptrsv.data[ s ]);
+					const NIT *__restrict__ const end =
+						reinterpret_cast< const NIT * >(sptrsv.endPositions[ s ]);
+					assert( end != nullptr );
+					if( sptrsv.is_sorted ) {
+						for( size_t i = 0; i < sptrsv.supersteps; ++i ) {
+							for( size_t k = 0; k < end[ i ]; ++k ) {
+								const NIT &lo = *data++;
+								const NIT &no = *data++;
+								assert( lo < n );
+								assert( no + lo <= n );
+								local_rc = local_rc ? local_rc : dense_unmasked_sequential_sptrsv<
+										descr, true, config::tuning::SpTRSV::sortingMode, forward
+									>(
+										v_raw, T, lo, no, semiring, subtraction, division, phase
+									);
+							}
+							#pragma omp barrier
+						}
+					} else {
+						for( size_t i = 0; i < sptrsv.supersteps; ++i ) {
+							for( size_t k = 0; k < end[ i ]; ++k ) {
+								const NIT &lo = *data++;
+								const NIT &no = *data++;
+								assert( lo < n );
+								assert( no + lo <= n );
+								local_rc = local_rc ? local_rc : dense_unmasked_sequential_sptrsv<
+										descr, true, 0, forward
+									>(
+										v_raw, T, lo, no, semiring, subtraction, division, phase
+									);
+							}
+							#pragma omp barrier
+						}
+					}
+					if( local_rc != grb::SUCCESS ) {
+						#pragma omp atomic write
+						ret = local_rc;
+					}
+				}
+			}
+
+			// done
+			return ret;
+		}
+#endif
+
+		/**
+		 * \internal Implements sparse masked, sparse unmasked, and dense masked
+		 * sptrsv.
+		 *
+		 * \note The boolean template arguments are ordered ( \a masked, \a sparse,
+		 *       \a forward ), matching all call sites of this function.
+		 */
+		template<
+			Descriptor descr, bool masked, bool sparse, bool forward,
+			class Semiring, class Subtraction, class Division,
+			typename IOType, typename InputType1, typename InputType2,
+			typename Coords, typename RIT, typename CIT, typename NIT
+		>
+		RC generic_sptrsv(
+			Vector< IOType, reference, Coords > &xb,
+			const Vector< InputType2, reference, Coords > &mask,
+			const Matrix< InputType1, reference, RIT, CIT, NIT > &T,
+			const size_t &n,
+			const Semiring &semiring,
+			const Subtraction &subtraction,
+			const Division &division,
+			const Phase &phase
+		) {
+			// static checks
+			static_assert( masked || sparse, "Internal logic error; please submit a bug "
+				"report" );
+
+			// dynamic checks
+			assert( grb::size( xb ) == n );
+			assert( !masked || grb::size( mask ) == n );
+			assert( grb::nrows( T ) == n );
+			assert( grb::ncols( T ) == n );
+
+			(void) descr;
+			(void) forward;
+			(void) masked;
+			(void) sparse;
+#ifdef NDEBUG
+			(void) xb;
+			(void) mask;
+			(void) T;
+			(void) n;
+#endif
+			(void) semiring;
+			(void) subtraction;
+			(void) division;
+			(void) phase;
+			std::cerr << "Warning: masked and/or sparse sptrsv is not yet implemented\n";
+			return grb::UNSUPPORTED;
+		}
+
+	} // end grb::internal
+
+	template<
+		Descriptor descr = descriptors::no_operation,
+		class Semiring, class Subtraction, class Division,
+		typename IOType, typename InputType1,
+		typename Coords, typename RIT, typename CIT, typename NIT
+	>
+	RC sptrsv(
+		Vector< IOType, reference, Coords > &xb,
+		const Matrix< InputType1, reference, RIT, CIT, NIT > &T,
+		const bool forward,
+		const Semiring &semiring = Semiring(),
+		const Subtraction &subtraction = Subtraction(),
+		const Division &division = Division(),
+		const Phase &phase = EXECUTE,
+		const typename std::enable_if<
+			grb::is_semiring< Semiring >::value &&
+			grb::is_operator< Subtraction >::value &&
+			grb::is_operator< Division >::value &&
+			!grb::is_object< IOType >::value &&
+			!grb::is_object< InputType1 >::value,
+		void >::type * const = nullptr
+	) {
+		// check contract
+		constexpr bool dense = descr & descriptors::dense;
+		const size_t n = size( xb );
+		if( grb::nrows( T ) != n ) {
+			std::cerr << "Error, sptrsv (unmasked): matrix should have a number of rows "
+				<< "equal to the size of the input/output vector\n";
+			return grb::ILLEGAL;
+		}
+		if( grb::ncols( T ) != n ) {
+			std::cerr << "Error, sptrsv (unmasked): matrix should have a number of "
+				<< "columns equal to the size of the input/output vector\n";
+			return grb::ILLEGAL;
+		}
+		if( dense && grb::nnz( xb ) < n ) {
+			std::cerr << "Error, sptrsv (unmasked): sparse vector given in conjunction "
+				<< "with a dense descriptor\n";
+			return grb::ILLEGAL;
+		}
+
+		// check trivial
+		if( n == 0 ) { return grb::SUCCESS; }
+
+		// in the dense variant, resize does nothing
+		if( phase == grb::RESIZE ) { return grb::SUCCESS; }
+
+		// check dense dispatch
+		if( dense || grb::nnz( xb ) == n ) {
+#ifndef _H_GRB_REFERENCE_OMP_BLAS2
+			IOType *__restrict__ const xb_p = internal::getRaw( xb );
+			const NIT lo = 0;
+			const NIT no = n;
+			if( forward ) {
+				return internal::dense_unmasked_sequential_sptrsv< descr, false, 0, true >(
+					xb_p, T, lo, no, semiring, subtraction, division, phase );
+			} else {
+				return internal::dense_unmasked_sequential_sptrsv< descr, false, 0, false >(
+					xb_p, T, lo, no, semiring, subtraction, division, phase );
+			}
+#else
+			IOType * const xb_p = internal::getRaw( xb );
+			// The below code is only for testing (DBG):
+	#if 0
+			const auto sptrsv = internal::getSptrsvData( T );
+			#pragma omp parallel num_threads(sptrsv->nThreads)
+			{
+				const auto &crs = internal::getCRS( T );
+				const int *__restrict__ schedule = reinterpret_cast< int * >(
+					sptrsv->data[ omp_get_thread_num() ] );
+				for( size_t step = 0; step < sptrsv->supersteps; ++step ) {
+					const int &lo = *schedule++;
+					const int &no = *schedule++;
+					const int upper_limit = lo + no;
+					for( int row_idx = lo; row_idx < upper_limit; ++row_idx ) {
+						// if not sorted, enable this variant instead:
+						// IOType div = 0;
+						// for( unsigned int k = crs.col_start[ row_idx ];
+						//	k < crs.col_start[ row_idx + 1 ]; ++k ) {
+						for( unsigned int k = crs.col_start[ row_idx ];
+							k < crs.col_start[ row_idx + 1 ] - 1; ++k
+						) {
+							const int &j = crs.row_index[ k ];
+							const double &value = crs.values[ k ];
+							// if not sorted, enable this if-else:
+							// if( j == row_idx ) {
+							//	div = value;
+							// } else {
+							xb_p[ row_idx ] -= value * xb_p[ j ];
+							//}
+						}
+						#pragma omp critical
+						xb_p[ row_idx ] /= crs.values[ crs.col_start[ row_idx + 1 ] - 1 ];
+					}
+					#pragma omp barrier
+				}
+			}
+			return grb::SUCCESS;
+	#else
+			if( forward ) {
+				return internal::dense_unmasked_omp_sptrsv< descr, true >(
+					xb_p, T, n, semiring, subtraction, division, phase );
+			} else {
+				return internal::dense_unmasked_omp_sptrsv< descr, false >(
+					xb_p, T, n, semiring, subtraction, division, phase );
+			}
+	#endif
+#endif
+		} else {
+			grb::Vector< IOType, reference, Coords > no_mask( 0 );
+			if( forward ) {
+				return internal::generic_sptrsv< descr, false, true, true >(
+					xb, no_mask, T, n,
+					semiring, subtraction, division, phase
+				);
+			} else {
+				return internal::generic_sptrsv< descr, false, true, false >(
+					xb, no_mask, T, n,
+					semiring, subtraction, division, phase
+				);
+			}
+		}
+	}
+
+	template<
+		Descriptor descr = descriptors::no_operation,
+		class Semiring,
+		class Subtraction,
+		class Division,
+		typename IOType, typename InputType1, typename InputType2,
+		typename Coords, typename RIT, typename CIT, typename NIT
+	>
+	RC sptrsv(
+		Vector< IOType, reference, Coords > &xb,
+		const Vector< InputType2, reference, Coords > &mask,
+		const Matrix< InputType1, reference, RIT, CIT, NIT > &T,
+		const bool forward,
+		const Semiring &semiring = Semiring(),
+		const Subtraction &subtraction = Subtraction(),
+		const Division &division = Division(),
+		const Phase &phase = EXECUTE,
+		const typename std::enable_if<
+			grb::is_semiring< Semiring >::value &&
+			grb::is_operator< Subtraction >::value &&
+			grb::is_operator< Division >::value &&
+			!grb::is_object< IOType >::value &&
+			!grb::is_object< InputType1 >::value &&
+			!grb::is_object< InputType2 >::value,
+		void >::type * const = nullptr
+	) {
+		// check whether we can forward to the unmasked variant
+		if( grb::size( mask ) == 0 ) {
+			return sptrsv< descr >( xb, T, forward, semiring, subtraction, division,
+				phase );
+		}
+
+		// check contract
+		constexpr bool dense = descr & descriptors::dense;
+		const size_t n = size( xb );
+		if( grb::size( mask ) != n ) {
+			std::cerr << "Error, grb::sptrsv (masked): mask and input/output vector "
+				<< "should have the same size\n";
+			return grb::ILLEGAL;
+		}
+		if( grb::nrows( T ) != n ) {
+			std::cerr << "Error, grb::sptrsv (masked): matrix should have a number of "
+				<< "rows equal to the size of the input/output vector\n";
+			return grb::ILLEGAL;
+		}
+		if( grb::ncols( T ) != n ) {
+			std::cerr << "Error, grb::sptrsv (masked): matrix should have a number of "
+				<< "columns equal to the size of the input/output vector\n";
+			return grb::ILLEGAL;
+		}
+		if( dense && grb::nnz( mask ) < n ) {
+			std::cerr << "Error, grb::sptrsv (masked): sparse mask given in "
+				<< "conjunction with a dense descriptor\n";
+			return grb::ILLEGAL;
+		}
+		if( dense && grb::nnz( xb ) < n ) {
+			std::cerr << "Error, grb::sptrsv (masked): sparse input vector given in "
+				<< "conjunction with a dense descriptor\n";
+			return grb::ILLEGAL;
+		}
+
+		if( dense || (grb::nnz( xb ) == n && grb::nnz( mask ) == n) ) {
+			if( forward ) {
+				return internal::generic_sptrsv< descr, true, false, true >(
+					xb, mask, T, n,
+					semiring, subtraction, division, phase
+				);
+			} else {
+				return internal::generic_sptrsv< descr, true, false, false >(
+					xb, mask, T, n,
+					semiring, subtraction, division, phase
+				);
+			}
+		} else {
+			if( forward ) {
+				return internal::generic_sptrsv< descr, true, true, true >(
+					xb, mask, T, n,
+					semiring, subtraction, division, phase );
+			} else {
+				return internal::generic_sptrsv< descr, true, true, false >(
+					xb, mask, T, n,
+					semiring, subtraction, division, phase );
+			}
+		}
+	}
+
+	namespace internal {
+
 #ifndef _H_GRB_REFERENCE_OMP_BLAS2
 	/**
 	 * A nonzero wrapper for use with grb::eWiseLambda over matrices.
diff --git a/include/graphblas/reference/config.hpp b/include/graphblas/reference/config.hpp
index 82e067ffd..116ed0202 100644
--- a/include/graphblas/reference/config.hpp
+++ b/include/graphblas/reference/config.hpp
@@ -333,7 +333,34 @@ namespace grb {
 
 		/** @} */
 
-	} // namespace config
+
+		/**
+		 * Collects all optimisations with regard to tuning operations for specific
+		 * inputs, i.e., #grb::PHASE::TUNE.
+		 */
+		namespace tuning {
+
+			/**
+			 * Collects all options for SpTRSV tuning.
+			 */
+			struct SpTRSV {
+
+				/**
+				 * There are three sorting modes implemented:
+				 *  - 0: the matrix rows are not sorted;
+				 *  - 1: the matrix rows have diagonal elements at position 0;
+				 *  - 2: the matrix rows are sorted.
+				 * The above description assumes CRS; the chosen sorting mode is,
+				 * however, also applied to the CCS (in which case the above three
+				 * points apply with `rows' substituted by `columns').
+				 */
+				static constexpr int sortingMode = 0;
+
+			};
+
+		} // namespace grb::config::tuning
+
+	} // namespace grb::config
 
 } // namespace grb
 
diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp
index c1b3a365d..1b99614ef 100644
--- a/include/graphblas/reference/matrix.hpp
+++ b/include/graphblas/reference/matrix.hpp
@@ -49,6 +49,7 @@
 #include
 #include
+#include <graphblas/reference/SptrsvSchedule.hpp>
 #include
 
 #include "NonzeroWrapper.hpp"
@@ -237,6 +238,259 @@ namespace grb {
 			return A.valbuf[ 1 ];
 		}
 
+		/**
+		 * \internal
+		 * Retrieves an optimised sptrsv-specific schedule.
+		 * \endinternal
+		 */
+		template< typename InputType, typename RIT, typename CIT, typename NIT >
+		const internal::SptrsvSchedule< NIT > * getSptrsvData(
+			const grb::Matrix< InputType, reference, RIT, CIT, NIT > &A
+		) noexcept {
+			return A.sptrsvSchedule;
+		}
+
+		template<
+			typename InputType, typename RIT, typename CIT, typename NIT,
+			typename NumRangesIt
+		>
+		void allocateSptrsvSchedule(
+			grb::Matrix< InputType, reference, RIT, CIT, NIT > &A,
+			const NIT n, const size_t nThreads, const size_t nSteps,
+			const NumRangesIt nRanges, const NumRangesIt nRanges_end
+		) {
+			if( A.sptrsvSchedule != nullptr ) {
+				throw std::runtime_error( "SptrsvSchedule was already initialised" );
+			}
+			A.sptrsvSchedule = new SptrsvSchedule< NIT >( n, nThreads );
+
+			auto &sptrsv = *(A.sptrsvSchedule);
+
+			sptrsv.supersteps = nSteps;
+			{
+				std::vector< NIT > tmp( nRanges, nRanges_end );
+				sptrsv.nRanges = std::move( tmp );
+			}
+
+			// disable default trivial schedule
+			sptrsv.data[ 0 ] = nullptr;
+			sptrsv.endPositions[ 0 ] = nullptr;
+
+			// spawn exactly one thread per requested schedule entry
+			#pragma omp parallel num_threads( nThreads )
+			{
+				auto localIt = nRanges;
+				const size_t s = omp_get_thread_num();
+				const size_t actualNumThreads = omp_get_num_threads();
+				if( actualNumThreads < nThreads ) {
+					throw std::runtime_error( "Unexpected number of threads" );
+				}
+				std::advance( localIt, s );
+				if( sptrsv.data[ s ] || sptrsv.endPositions[ s ] ) {
+					throw std::runtime_error( "A thread-local schedule already existed" );
+				}
+				sptrsv.alloc( s, *localIt );
+			}
+		}
+
+		/**
+		 * This schedule ingestion method expects the schedule as an iterator pair
+		 * over a container of doubly-nested vectors. Each vector \a v that is
+		 * iterated over corresponds to one superstep. Within a superstep,
+		 * \a v[ s ][ k ] relates to thread \a s: entry \a k describes the
+		 * \f$ \lfloor k/2 \rfloor \f$-th range to be processed by that thread
+		 * during that superstep. Even-numbered \a k hold the lower bounds of the
+		 * ranges, while odd-numbered \a k hold their (inclusive) upper bounds.
+		 *
+		 * \note This is not an ideal way to pass around schedules, as it assumes
+		 *       the container element types are doubly-nested STL vectors; it is
+		 *       hence not a generic approach.
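+		 *
+		 * \par Example
+		 * A sketch of the expected input for two threads and two supersteps; all
+		 * numbers below are hypothetical, and this assumes
+		 * #allocateSptrsvSchedule was called with \a nSteps = 2 and per-thread
+		 * range counts 3 and 1:
+		 * \code
+		 * // superstep 0: thread 0 solves rows [0,4], thread 1 solves rows [5,9];
+		 * // superstep 1: thread 0 solves rows [10,12] and [15,17], thread 1 idles
+		 * std::vector< std::vector< std::vector< size_t > > > bounds = {
+		 *     { { 0, 4 }, { 5, 9 } },
+		 *     { { 10, 12, 15, 17 }, { } }
+		 * };
+		 * internal::setSptrsvSchedule( A, bounds.cbegin(), bounds.cend(), 2 );
+		 * \endcode
+		 * Upper bounds are inclusive, matching the \f$ h = v[ s ][ k ] + 1 \f$
+		 * parsing below.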
+		 */
+		template<
+			typename InputType, typename RIT, typename CIT, typename NIT, typename It
+		>
+		void setSptrsvSchedule(
+			grb::Matrix< InputType, reference, RIT, CIT, NIT > &A,
+			const It &bounds_begin, const It &bounds_end,
+			const size_t nThreads
+		) {
+			if( A.sptrsvSchedule == nullptr ) {
+				throw std::runtime_error( "SptrsvSchedule was not initialised" );
+			}
+			auto &sptrsv = *(A.sptrsvSchedule);
+			if( sptrsv.nThreads != nThreads ) {
+				throw std::runtime_error( "SptrsvSchedule was allocated with different "
+					"nThreads" );
+			}
+
+			// re-sort A, if enabled
+			// note: both sorting modes assume non-pattern matrices whose value and
+			// index types are losslessly convertible to double resp. int
+			sptrsv.is_sorted = true;
+			if( config::tuning::SpTRSV::sortingMode == 2 ) {
+				// CRS first
+				{
+					const auto &crs = internal::getCRS( A );
+					const size_t nThreads = A.m < config::OMP::minLoopSize()
+						? 1
+						: std::min(
+								config::OMP::threads(),
+								A.nz / config::CACHE_LINE_SIZE::value()
+							);
+					#pragma omp parallel for num_threads( nThreads )
+					for( size_t i = 0; i < A.m; ++i ) {
+						std::vector< std::pair< int, double > > pairs;
+						for( size_t k = crs.col_start[ i ]; k < crs.col_start[ i + 1 ]; ++k ) {
+							pairs.push_back( std::make_pair( crs.row_index[ k ], crs.values[ k ] ) );
+						}
+						std::sort( pairs.begin(), pairs.end(),
+							[]( const std::pair< int, double > &left,
+								const std::pair< int, double > &right
+							) {
+								return left.first < right.first;
+							} );
+						auto it = pairs.cbegin();
+						for( size_t k = crs.col_start[ i ]; k < crs.col_start[ i + 1 ]; ++k, ++it ) {
+							assert( it != pairs.cend() );
+							crs.row_index[ k ] = it->first;
+							crs.values[ k ] = it->second;
+						}
+					}
+				}
+				// then CCS
+				{
+					const auto &ccs = internal::getCCS( A );
+					const size_t nThreads = A.n < config::OMP::minLoopSize()
+						? 1
+						: std::min(
+								config::OMP::threads(),
+								A.nz / config::CACHE_LINE_SIZE::value()
+							);
+					#pragma omp parallel for num_threads( nThreads )
+					for( size_t i = 0; i < A.n; ++i ) {
+						std::vector< std::pair< int, double > > pairs;
+						for( size_t k = ccs.col_start[ i ]; k < ccs.col_start[ i + 1 ]; ++k ) {
+							pairs.push_back( std::make_pair( ccs.row_index[ k ], ccs.values[ k ] ) );
+						}
+						std::sort( pairs.begin(), pairs.end(),
+							[]( const std::pair< int, double > &left,
+								const std::pair< int, double > &right
+							) {
+								return left.first < right.first;
+							} );
+						auto it = pairs.cbegin();
+						for( size_t k = ccs.col_start[ i ]; k < ccs.col_start[ i + 1 ]; ++k, ++it ) {
+							assert( it != pairs.cend() );
+							ccs.row_index[ k ] = it->first;
+							ccs.values[ k ] = it->second;
+						}
+					}
+				}
+			} else if( config::tuning::SpTRSV::sortingMode == 1 ) {
+				{
+					const auto &crs = internal::getCRS( A );
+					const size_t nThreads = A.m < config::OMP::minLoopSize()
+						? 1
+						: std::min(
+								config::OMP::threads(),
+								A.nz / config::CACHE_LINE_SIZE::value()
+							);
+					#pragma omp parallel for num_threads( nThreads )
+					for( size_t i = 0; i < A.m; ++i ) {
+						const auto index_it = std::find(
+							crs.row_index + crs.col_start[ i ],
+							crs.row_index + crs.col_start[ i + 1 ],
+							i
+						);
+						// mirror the CCS branch below: a missing diagonal is an error
+						if( index_it == crs.row_index + crs.col_start[ i + 1 ] ) {
+							throw std::runtime_error( "No diagonal element present" );
+						}
+						if( !std::is_void< InputType >::value ) {
+							const size_t index = std::distance( crs.row_index, index_it );
+							std::swap( crs.values[ index ], crs.values[ crs.col_start[ i ] ] );
+						}
+						std::swap( *index_it, crs.row_index[ crs.col_start[ i ] ] );
+					}
+				}
+				{
+					const auto &ccs = internal::getCCS( A );
+					const size_t nThreads = A.n < config::OMP::minLoopSize()
+						? 1
+						: std::min(
+								config::OMP::threads(),
+								A.nz / config::CACHE_LINE_SIZE::value()
+							);
+					#pragma omp parallel for num_threads( nThreads )
+					for( size_t i = 0; i < A.n; ++i ) {
+						const auto start = ccs.row_index + ccs.col_start[ i ];
+						const auto end = ccs.row_index + ccs.col_start[ i + 1 ];
+						const auto index_it = std::find( start, end, i );
+						if( index_it == end ) {
+							throw std::runtime_error( "No diagonal element present" );
+						} else if( index_it != start ) {
+							if( !std::is_void< InputType >::value ) {
+								const size_t index = std::distance( ccs.row_index, index_it );
+								std::swap( ccs.values[ index ], ccs.values[ ccs.col_start[ i ] ] );
+							}
+							std::swap( *index_it, ccs.row_index[ ccs.col_start[ i ] ] );
+						}
+					}
+				}
+			} else {
+				sptrsv.is_sorted = false;
+			}
+
+			sptrsv.is_simple = true;
+			#pragma omp parallel num_threads( nThreads )
+			{
+				bool simple_local = true;
+				It bounds = bounds_begin;
+				const size_t actualNumThreads = omp_get_num_threads();
+				if( actualNumThreads != nThreads ) {
+					throw std::runtime_error( "Unexpected number of threads" );
+				}
+				const size_t s = omp_get_thread_num();
+				// get buffer as an array of NIT, which we will write to in one pass
+				NIT *__restrict__ array = reinterpret_cast< NIT * >(sptrsv.data[ s ]);
+				// get the number of ranges that this function should populate, for
+				// each superstep at this thread
+				NIT *__restrict__ end = reinterpret_cast< NIT * >(sptrsv.endPositions[ s ]);
+				// dynamic assertion: there should be at least one superstep
+				assert( bounds != bounds_end );
+				// start ingestion
+				size_t count = 0;
+				do {
+					assert( bounds->size() == nThreads );
+					size_t nRanges = 0;
+					// get this superstep's and this thread's vector v[s]
+					const auto &v = (*bounds)[s];
+					// go range-by-range
+					for( auto it = v.cbegin(); it != v.cend(); ++it ) {
+						// we have a range, parse it
+						const NIT l = *it++;
+						assert( it != v.cend() );
+						const NIT h = *it + 1;
+						assert( h > l );
+						const NIT n = h - l;
+						if( count >= sptrsv.supersteps ) {
+							throw std::runtime_error( "Too many supersteps" );
+						}
+						// store it
+						assert( nRanges < sptrsv.nRanges[ s ] );
+						*array++ = l;
+						*array++ = n;
+						(void) ++nRanges;
+					}
+					// store the number of ranges in the end array
+					assert( count < sptrsv.supersteps );
+					if( nRanges != 1 ) {
+						simple_local = false;
+					}
+					*end++ = nRanges;
+					// forward to the next superstep
+					(void) ++count;
+					(void) ++bounds;
+				} while( bounds != bounds_end );
+				if( count != sptrsv.supersteps ) {
+					throw std::runtime_error( "Unexpected number of supersteps" );
+				}
+				if( !simple_local ) {
+					#pragma omp critical
+					sptrsv.is_simple = false;
+				}
+			}
+		}
+
 		template<
 			Descriptor descr, bool input_dense, bool output_dense,
@@ -1225,6 +1479,30 @@ namespace grb {
 			const grb::Matrix< InputType, reference, RIT, CIT, NIT > &A
 		) noexcept;
 
+		template< typename InputType, typename RIT, typename CIT, typename NIT >
+		friend const internal::SptrsvSchedule< NIT > * internal::getSptrsvData(
+			const grb::Matrix< InputType, reference, RIT, CIT, NIT > &A
+		) noexcept;
+
+		template<
+			typename InputType, typename RIT, typename CIT, typename NIT,
+			typename NumRangesIt
+		>
+		friend void internal::allocateSptrsvSchedule(
+			grb::Matrix< InputType, reference, RIT, CIT, NIT > &A,
+			const NIT n, const size_t nThreads, const size_t nSteps,
+			const NumRangesIt nRanges, const NumRangesIt nRanges_end
+		);
+
+		template<
+			typename InputType, typename RIT, typename CIT, typename NIT, typename It
+		>
+		friend void internal::setSptrsvSchedule(
+			grb::Matrix< InputType, reference, RIT, CIT, NIT > &A,
+			const It &bounds,
+			const It &bounds_end,
+			const size_t nThreads
+		);
+
 		friend const grb::Matrix< D, reference,
 			ColIndexType, ColIndexType, NonzeroIndexType
@@ -1351,6 +1629,11 @@ namespace grb {
 		 */
 		utils::AutoDeleter< char > _local_deleter[ 6 ];
 
+		/**
+		 * Optimised schedule for sptrsv operations on this matrix.
+		 */
+		internal::SptrsvSchedule< NonzeroIndexType > * sptrsvSchedule;
+
 		/**
 		 * Internal constructor for manual construction of matrices.
 		 *
@@ -1403,7 +1686,7 @@ namespace grb {
 			id( std::numeric_limits< uintptr_t >::max() ), remove_id( false ),
 			m( _m ), n( _n ), cap( _cap ), nz( _offset_array[ _m ] ),
 			coorArr{ nullptr, buf1 }, coorBuf{ nullptr, buf2 },
-			valbuf{ nullptr, buf3 }
+			valbuf{ nullptr, buf3 }, sptrsvSchedule( nullptr )
 		{
 			assert( (_m > 0 && _n > 0) || _column_indices[ 0 ] == 0 );
 			CRS.replace( _values, _column_indices );
@@ -1424,6 +1707,8 @@ namespace grb {
 			const size_t rows, const size_t cols, const size_t cap_in
 		) {
+			// the sptrsv schedule must always be set manually, via an explicitly
+			// requested tuning phase
+			sptrsvSchedule = nullptr;
 #ifdef _DEBUG_REFERENCE_MATRIX
 			std::cerr << "\t in Matrix< reference >::initialize...\n"
 				<< "\t\t matrix size " << rows << " by " << cols << "\n"
@@ -1596,6 +1881,7 @@ namespace grb {
 				_deleter[ i ] = std::move( other._deleter[ i ] );
 				_local_deleter[ i ] = std::move( other._local_deleter[ i ] );
 			}
+			sptrsvSchedule = other.sptrsvSchedule;
 
 			// invalidate other fields
 			for( unsigned int i = 0; i < 2; ++i ) {
@@ -1608,6 +1894,7 @@ namespace grb {
 			other.n = 0;
 			other.cap = 0;
 			other.nz = 0;
+			other.sptrsvSchedule = nullptr;
 		}
 
 		/**
@@ -2228,6 +2515,9 @@ namespace grb {
 #endif
 				internal::reference_mapper.remove( id );
 			}
+			if( sptrsvSchedule ) {
+				delete sptrsvSchedule;
+			}
 		}
 
 		/**
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index 5cd03cf77..049af30e9 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -374,6 +374,12 @@ add_grb_executables( launch_benchmark_frommpi_manual launcherAndBenchmarker.cpp
 	COMPILE_DEFINITIONS NO_LPF_AUTO_INIT
 )
 
+# tests for experimental features
+
+add_grb_executables( dense_sptrsv dense_sptrsv.cpp
+	BACKENDS reference reference_omp
+)
+
 # targets to list and build the test for this category
 get_property( unit_tests_list GLOBAL PROPERTY tests_category_unit )
 add_custom_target( "list_tests_category_unit"
diff --git a/tests/unit/dense_sptrsv.cpp b/tests/unit/dense_sptrsv.cpp
new file mode 100644
index 000000000..c7138fe94
--- /dev/null
+++ b/tests/unit/dense_sptrsv.cpp
@@ -0,0 +1,320 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cassert>
+#include <cstdlib>
+#include <iostream>
+#include <set>
+
+#include <inttypes.h>
+
+#include "graphblas/algorithms/matrix_factory.hpp"
+#include "graphblas/utils/timer.hpp"
+
+#include "graphblas.hpp"
+
+
+using namespace grb;
+
+struct input {
+	size_t n;
+	size_t rep;
+};
+
+struct output {
+	enum grb::RC error_code;
+	grb::utils::TimerResults times;
+};
+
+template< typename T, class Semiring >
+bool checkVectorDenseAndEqualTo(
+	const grb::Vector< T > &x, const T val, struct output &out,
+	const Semiring &semiring
+) {
+	size_t count = 0;
+	bool error = false;
+	std::set< size_t > localSet;
+	const size_t n = grb::size( x );
+	for( const auto &pair : x ) {
+		(void) ++count;
+		localSet.insert( pair.first );
+		if( pair.second != val ) {
+			std::cout << "unexpected entry ( " << pair.first << ", " << pair.second
+				<< " ); expected value " << val << "\n";
+			error = true;
+		}
+	}
+	if( grb::collectives<>::allreduce( count, semiring.getAdditiveOperator() )
+		!= grb::SUCCESS
+	) {
+		std::cerr << "error during computing checksum (I)\n";
+		out.error_code = grb::PANIC;
+		return false;
+	}
+	if( count != n ) {
+		std::cout << "expected " << n << " nonzeroes, got " << count << "\n";
+		error = true;
+	}
+	count = localSet.size();
+	if( grb::collectives<>::allreduce( count, semiring.getAdditiveOperator() )
+		!= grb::SUCCESS
+	) {
+		std::cerr << "error during computing checksum (II)\n";
+		out.error_code = grb::PANIC;
+		return false;
+	}
+	if( count != n ) {
+		std::cout << "expected " << n << " indices, got " << count << "\n";
+		error = true;
+	}
+	if( out.error_code == grb::SUCCESS ) {
+		if( error ) {
+			out.error_code = grb::FAILED;
+			return false;
+		}
+	}
+	return true;
+}
+
+// main test program
+void grbProgram( const struct input &data_in, struct output &out ) {
+	out.error_code = grb::PANIC;
+	grb::utils::Timer timer;
+
+	const size_t s = grb::spmd<>::pid();
+#ifndef NDEBUG
+	assert( s < grb::spmd<>::nprocs() );
+#else
+	(void) s;
+#endif
+
+	// get input n
+	const size_t n = data_in.n;
+
+	// setup
+	grb::Vector< double > x( n );
+	grb::Matrix< double > A = grb::algorithms::matrices< double >::identity( n );
+	Semiring<
+		grb::operators::add< double >, grb::operators::mul< double >,
+		grb::identities::zero, grb::identities::one
+	> semiring;
+	grb::operators::divide< double > division;
+	grb::operators::subtract< double > subtraction;
+
+	out.error_code = grb::set( x, 3.14 );
+	if( out.error_code ) {
+		std::cerr << "Error during test initialisation\n";
+		return;
+	}
+	if( !checkVectorDenseAndEqualTo( x, 3.14, out, semiring ) ) {
+		std::cerr << "NOTE: the aforementioned errors occurred during test "
+			<< "initialisation\n";
+		return;
+	}
+	out.times.preamble = timer.time();
+
+	if( s == 0 ) {
+		std::cout << "Test 1: ";
+	}
+	timer.reset();
+	out.error_code = grb::sptrsv( x, A, false, semiring, subtraction, division );
+	if( out.error_code ) {
+		std::cerr << "test returned error\n";
+	}
+	if( !checkVectorDenseAndEqualTo( x, 3.14, out, semiring ) ) { return; }
+	out.times.postamble += timer.time();
+	timer.reset();
+	for( size_t rep = 0; rep < data_in.rep; ++rep ) {
+		(void) grb::sptrsv( x, A, false, semiring, subtraction, division );
+	}
+	out.times.useful += timer.time();
+
+	assert( out.error_code == grb::SUCCESS );
+	if( s == 0 ) {
+		std::cout << "\b\b 2: ";
+	}
+	timer.reset();
+	out.error_code = grb::set< grb::descriptors::dense >( x, 2.14 );
+	if( out.error_code ) {
+		std::cerr << "Error during test initialisation\n";
+		return;
+	}
+	if( !checkVectorDenseAndEqualTo( x, 2.14, out, semiring ) ) {
+		std::cerr << "NOTE: aforementioned errors occurred during initialisation\n";
+		return;
+	}
+	out.times.preamble += timer.time();
+	timer.reset();
+	out.error_code = grb::sptrsv( x, A, true, semiring, subtraction, division );
+	if( out.error_code ) {
+		std::cerr << "test returned error\n";
+	}
+	if( !checkVectorDenseAndEqualTo( x, 2.14, out, semiring ) ) { return; }
+	out.times.postamble += timer.time();
+	timer.reset();
+	for( size_t rep = 0; rep < data_in.rep; ++rep ) {
+		(void) grb::sptrsv( x, A, true, semiring, subtraction, division );
+	}
+	out.times.useful += timer.time();
+
+	assert( out.error_code == grb::SUCCESS );
+	if( s == 0 ) {
+		std::cout << "\b\b 3: ";
+	}
+	timer.reset();
+	out.error_code = grb::set< grb::descriptors::dense >( x, 3.14 );
+	if( out.error_code ) {
+		std::cerr << "Error during test initialisation\n";
+		return;
+	}
+	if( !checkVectorDenseAndEqualTo( x, 3.14, out, semiring ) ) {
+		std::cerr << "NOTE: aforementioned errors occurred during initialisation\n";
+		return;
+	}
+	out.times.preamble += timer.time();
+	timer.reset();
+	out.error_code = grb::sptrsv< grb::descriptors::dense >( x, A, false, semiring,
+		subtraction, division );
+	if( out.error_code ) {
+		std::cerr << "test returned error\n";
+	}
+	if( !checkVectorDenseAndEqualTo( x, 3.14, out, semiring ) ) { return; }
+	out.times.postamble += timer.time();
+	timer.reset();
+	for( size_t rep = 0; rep < data_in.rep; ++rep ) {
+		(void) grb::sptrsv< grb::descriptors::dense >( x, A, false, semiring,
+			subtraction, division );
+	}
+	out.times.useful += timer.time();
+
+	assert( out.error_code == grb::SUCCESS );
+	if( s == 0 ) {
+		std::cout << "\b\b 4: ";
+	}
+	timer.reset();
+	out.error_code = grb::set< grb::descriptors::dense >( x, 2.14 );
+	if( out.error_code ) {
+		std::cerr << "Error during test initialisation\n";
+		return;
+	}
+	if( !checkVectorDenseAndEqualTo( x, 2.14, out, semiring ) ) {
+		std::cerr << "NOTE: aforementioned errors occurred during initialisation\n";
+		return;
+	}
+	out.times.preamble += timer.time();
+	timer.reset();
+	out.error_code = grb::sptrsv< grb::descriptors::dense >( x, A, true, semiring,
+		subtraction, division );
+	if( out.error_code ) {
+		std::cerr << "test returned error\n";
+	}
+	if( !checkVectorDenseAndEqualTo( x, 2.14, out, semiring ) ) { return; }
+	out.times.postamble += timer.time();
+	timer.reset();
+	for( size_t rep = 0; rep < data_in.rep; ++rep ) {
+		(void) grb::sptrsv< grb::descriptors::dense >( x, A, true, semiring,
+			subtraction, division );
+	}
+	out.times.useful += timer.time();
+
+	std::cout << "OK\n";
+}
+
+// main function will execute in serial or as SPMD
+int main( int argc, char ** argv ) {
+	// sanity check: n is mandatory, inner and outer repetitions are optional
+	if( argc < 2 || argc > 4 ) {
+		std::cout << "Usage: " << argv[ 0 ]
+			<< " <n> (inner repetitions) (outer repetitions)" << std::endl;
+		return 0;
+	}
+	std::cout << "Test executable: " << argv[ 0 ] << std::endl;
+
+	// the input struct
+	struct input in;
+	in.n = atoi( argv[ 1 ] );
+	in.rep = grb::config::BENCHMARKING::inner();
+	size_t outer = grb::config::BENCHMARKING::outer();
+	char * end = NULL;
+	if( argc >= 3 ) {
+		in.rep = strtoumax( argv[ 2 ], &end, 10 );
+		if( argv[ 2 ] == end ) {
+			std::cerr << "Could not parse argument for number of inner "
+				<< "repetitions."
+				<< std::endl;
+			return 25;
+		}
+	}
+	if( argc >= 4 ) {
+		outer = strtoumax( argv[ 3 ], &end, 10 );
+		if( argv[ 3 ] == end ) {
+			std::cerr << "Could not parse argument for number of outer "
+				<< "repetitions." << std::endl;
+			return 30;
+		}
+	}
+
+	std::cout << "Executable called with parameters: problem size " << in.n
+		<< ", inner = " << in.rep << ", outer = " << outer << "." << std::endl;
+
+	// the output struct
+	struct output out;
+
+	// run the program one time to infer the number of inner repetitions
+	if( in.rep == 0 ) {
+		in.rep = 1;
+		grb::Launcher< AUTOMATIC > launcher;
+		const enum grb::RC rc = launcher.exec( &grbProgram, in, out, true );
+		if( rc != SUCCESS ) {
+			std::cerr << "launcher.exec returns with non-SUCCESS error code "
+				<< grb::toString( rc ) << std::endl;
+			return 40;
+		}
+		// set a guesstimate for the number of inner repetitions: a single
+		// experiment should take at least one second
+		in.rep = static_cast< double >( 1000.0 / out.times.useful ) + 1;
+		std::cout << "Auto-selected number of inner repetitions is "
+			<< in.rep << " (at an estimated time of "
+			<< out.times.useful << " ms. of useful work per benchmark).\n";
+	}
+
+	// start benchmarks
+	grb::Benchmarker< AUTOMATIC > benchmarker;
+	const enum grb::RC rc = benchmarker.exec( &grbProgram, in, out, 1, outer, true );
+	if( rc != SUCCESS ) {
+		std::cerr << "benchmarker.exec returns with non-SUCCESS error code "
+			<< grb::toString( rc ) << std::endl;
+		return 50;
+	}
+
+	// done
+	if( out.error_code != SUCCESS ) {
+		std::cout << "Test FAILED\n" << std::endl;
+		std::cerr << std::flush;
+		return out.error_code;
+	}
+	std::cout << "Test OK\n" << std::endl;
+	return 0;
+}
+
diff --git a/tests/unit/unittests.sh b/tests/unit/unittests.sh
index 0b49d2421..e6d3f7644 100755
--- a/tests/unit/unittests.sh
+++ b/tests/unit/unittests.sh
@@ -197,6 +197,19 @@ for MODE in ${MODES}; do
 			grep 'Test OK' ${TEST_OUT_DIR}/mxm_crs_${MODE}_${BACKEND}_${P}_${T}.log || echo "Test FAILED"
 			echo " "
 
+			echo ">>> [x] [ ]   Testing grb::sptrsv (unmasked, dense vectors) on"
+			echo "              small simple matrices, re-entrant"
+			$runner ${TEST_BIN_DIR}/dense_sptrsv_${MODE}_${BACKEND} 10 1000 3 &> ${TEST_OUT_DIR}/dense_sptrsv_${MODE}_${BACKEND}_${T}.log
+			head -1 ${TEST_OUT_DIR}/dense_sptrsv_${MODE}_${BACKEND}_${T}.log
+			grep 'Test OK' ${TEST_OUT_DIR}/dense_sptrsv_${MODE}_${BACKEND}_${T}.log || echo "Test FAILED"
+			echo " "
+
+			echo ">>> [x] [ ]   Testing grb::sptrsv (unmasked, dense vectors) on"
+			echo "              large simple matrices, re-entrant"
+			$runner ${TEST_BIN_DIR}/dense_sptrsv_${MODE}_${BACKEND} 1000000 1 3 &> ${TEST_OUT_DIR}/dense_sptrsv_large_${MODE}_${BACKEND}_${T}.log
+			head -1 ${TEST_OUT_DIR}/dense_sptrsv_large_${MODE}_${BACKEND}_${T}.log
+			grep 'Test OK' ${TEST_OUT_DIR}/dense_sptrsv_large_${MODE}_${BACKEND}_${T}.log || echo "Test FAILED"
+			echo " "
 		fi
 
 		echo "#################################################################"