From bd04f66e0bc38503d2fa3e005098bbb9904104a5 Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Thu, 3 Apr 2025 18:23:52 +0200
Subject: [PATCH 01/55] Add prototype for sptrsv

---
 include/graphblas/base/blas2.hpp | 96 ++++++++++++++++++++++++++++++++
 1 file changed, 96 insertions(+)

diff --git a/include/graphblas/base/blas2.hpp b/include/graphblas/base/blas2.hpp
index 3b5e5df4e..c273ca158 100644
--- a/include/graphblas/base/blas2.hpp
+++ b/include/graphblas/base/blas2.hpp
@@ -337,6 +337,102 @@ namespace grb {
 		return UNSUPPORTED;
 	}
 
+	/**
+	 * Computes \f$ x + y \f$, where \f$ y \f$ is solved from \f$ Ty=b \f$.
+	 *
+	 * Here, \f$ T, b \f$ are given while \f$ T \f$ additionally must be either
+	 * lower- or upper-triangular. The output contribution by \f$ y \f$ furthermore
+	 * may be masked.
+	 *
+	 * @param[in,out] x  On input: a vector with sufficient capacity to store the
+	 *                   union of any pre-existing nonzeroes in \f$ x \f$ plus the
+	 *                   (masked) result of \f$ y \f$ when #grb::EXECUTE is
+	 *                   given.
+	 *                   On output: the result of the computation when
+	 *                   #grb::EXECUTE is given, or a vector with the exact same
+	 *                   contents as on input but with the capacity resized to
+	 *                   ensure it could store the result of the requested
+	 *                   computation if #grb::RESIZE is given.
+	 * @param[in] x_mask The mask that acts on \f$ y \f$. This vector has either
+	 *                   size zero or size equal to that of \a x.
+	 *
+	 * \note If \a x_mask has size zero, it is interpreted as though the operation
+	 *       unmasked.
+	 *
+	 * @param[in] T       The upper- or lower-triangular input matrix. This must be
+	 *                    a square matrix with size equal to that of \a x.
+	 * @param[in] b       The right-hand side input vector. Its size must be equal
+	 *                    to that of \a x.
+	 * @param[in] forward Whether to perform forward substitution (i.e., \f$ T \f$
+	 *                    is lower-triangular), or to perform backward substitution
+	 *                    instead (i.e., \f$ T \f$ is upper-triangular).
+	 *
+	 * \note With the structural information ALP/Dense passes as template
+	 *       information, \a forward would not be required.
+	 *
+	 * @param[in] semiring    The semiring under which to perform the SpTrsv.
+	 * @param[in] subtraction The inverse of the additive operator of \a semiring.
+	 * @param[in] division    The inverse of the multiplicative operator of
+	 *                        \a semiring.
+	 *
+	 * \note That is, this operation in fact requires a field structure and not a
+	 *       semiring. A future extension to ALP may provide such a structure
+	 *       explicitly.
+	 *
+	 * @param[in] phase       The requested phase of the computation. Only
+	 *                        #grb::EXECUTE and #grb::RESIZE are supported.
+	 *
+	 * \todo Expand documentation.
+	 */
+	template<
+		Descriptor descr = descriptors::no_operation,
+		class Semiring,
+		class Subtraction,
+		class Division,
+		typename IOType, typename InputType1, typename InputType2,
+		typename InputType3, typename InputType4,
+		typename Coords, typename RIT, typename CIT, typename NIT,
+		Backend backend
+	>
+	RC sptrsv(
+		Vector< IOType, backend, Coords > &x,
+		const Vector< InputType3, backend, Coords > &x_mask,
+		const Matrix< InputType2, backend, RIT, CIT, NIT > &T,
+		const Vector< InputType1, backend, Coords > &b,
+		const bool forward,
+		const Semiring &semiring = Semiring(),
+		const Subtraction &subtraction = Subtraction(),
+		const Division &division = Division(),
+		const Phase &phase = EXECUTE,
+		const typename std::enable_if<
+			grb::is_semiring< Semiring >::value &&
+			grb::is_operator< Subtraction >::value &&
+			grb::is_operator< Division >::value &&
+			!grb::is_object< IOType >::value &&
+			!grb::is_object< InputType1 >::value &&
+			!grb::is_object< InputType2 >::value &&
+			!grb::is_object< InputType3 >::value &&
+			!grb::is_object< InputType4 >::value,
+		void >::type * const = nullptr
+	) {
+#ifdef _DEBUG
+		std::cerr << "Selected backend does not implement masked grb::sptrsv\n";
+#endif
+#ifndef NDEBUG
+		const bool selected_backend_does_not_support_masked_sptrsv = false;
+		assert( selected_backend_does_not_support_masked_sptrsv );
+#endif
+		(void) x;
+		(void) x_mask;
+		(void) T;
+		(void) b;
+		(void) semiring;
+		(void) subtraction;
+		(void) division;
+		(void) phase;
+		return UNSUPPORTED;
+	}
+
 	/**
 	 * Executes an arbitrary element-wise user-defined function \a f on all
 	 * nonzero elements of a given matrix \a A.

From 7e139b0048abf0c03308c66ec33764dd051e9815 Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Fri, 4 Apr 2025 12:47:23 +0200
Subject: [PATCH 02/55] simplify spec

---
 include/graphblas/base/blas2.hpp | 55 +++++++++++++++-----------------
 1 file changed, 26 insertions(+), 29 deletions(-)

diff --git a/include/graphblas/base/blas2.hpp b/include/graphblas/base/blas2.hpp
index c273ca158..902c83a58 100644
--- a/include/graphblas/base/blas2.hpp
+++ b/include/graphblas/base/blas2.hpp
@@ -338,31 +338,33 @@ namespace grb {
 	}
 
 	/**
-	 * Computes \f$ x + y \f$, where \f$ y \f$ is solved from \f$ Ty=b \f$.
+	 * Computes \f$ x \f$ from \f$ Tx=b \f$.
 	 *
 	 * Here, \f$ T, b \f$ are given while \f$ T \f$ additionally must be either
-	 * lower- or upper-triangular. The output contribution by \f$ y \f$ furthermore
-	 * may be masked.
-	 *
-	 * @param[in,out] x  On input: a vector with sufficient capacity to store the
-	 *                   union of any pre-existing nonzeroes in \f$ x \f$ plus the
-	 *                   (masked) result of \f$ y \f$ when #grb::EXECUTE is
-	 *                   given.
-	 *                   On output: the result of the computation when
-	 *                   #grb::EXECUTE is given, or a vector with the exact same
-	 *                   contents as on input but with the capacity resized to
-	 *                   ensure it could store the result of the requested
-	 *                   computation if #grb::RESIZE is given.
-	 * @param[in] x_mask The mask that acts on \f$ y \f$. This vector has either
+	 * lower- or upper-triangular. The output \f$ x \f$ may furthermore be masked.
+	 *
+	 * @param[in,out] xb On input: the vector b. On output: the vector x.
+	 *                   When #grb::EXECUTE is given this vector must have
+	 *                   sufficient capacity to store both any pre-existing
+	 *                   nonzeroes in \f$ b \f$ plus the nonzeroes in the (masked)
+	 *                   result of \f$ x \f$.
+	 *                   When #grb::RESIZE is given, the vector contents are
+	 *                   unchanged, however its capacity may be enlarged in order
+	 *                   to ensure the above-described condition.
+	 *
+	 * \note On input, \a xb may contain both implicit and explicit zeroes.
+	 *
+	 * \note On output, \a xb may contain both implicit and explicit zeroes, even
+	 *       if on input it did not.
+	 *
+	 * @param[in] mask   The mask that acts on \f$ x \f$. This vector has either
 	 *                   size zero or size equal to that of \a x.
 	 *
-	 * \note If \a x_mask has size zero, it is interpreted as though the operation
+	 * \note If \a mask has size zero, it is interpreted as though the operation
 	 *       unmasked.
 	 *
 	 * @param[in] T       The upper- or lower-triangular input matrix. This must be
-	 *                    a square matrix with size equal to that of \a x.
-	 * @param[in] b       The right-hand side input vector. Its size must be equal
-	 *                    to that of \a x.
+	 *                    a square matrix with size equal to that of \a xb.
 	 * @param[in] forward Whether to perform forward substitution (i.e., \f$ T \f$
 	 *                    is lower-triangular), or to perform backward substitution
 	 *                    instead (i.e., \f$ T \f$ is upper-triangular).
@@ -390,15 +392,13 @@ namespace grb {
 		class Subtraction,
 		class Division,
 		typename IOType, typename InputType1, typename InputType2,
-		typename InputType3, typename InputType4,
 		typename Coords, typename RIT, typename CIT, typename NIT,
 		Backend backend
 	>
 	RC sptrsv(
-		Vector< IOType, backend, Coords > &x,
-		const Vector< InputType3, backend, Coords > &x_mask,
-		const Matrix< InputType2, backend, RIT, CIT, NIT > &T,
-		const Vector< InputType1, backend, Coords > &b,
+		Vector< IOType, backend, Coords > &xb,
+		const Vector< InputType2, backend, Coords > &mask,
+		const Matrix< InputType1, backend, RIT, CIT, NIT > &T,
 		const bool forward,
 		const Semiring &semiring = Semiring(),
 		const Subtraction &subtraction = Subtraction(),
@@ -410,9 +410,7 @@ namespace grb {
 			grb::is_operator< Division >::value &&
 			!grb::is_object< IOType >::value &&
 			!grb::is_object< InputType1 >::value &&
-			!grb::is_object< InputType2 >::value &&
-			!grb::is_object< InputType3 >::value &&
-			!grb::is_object< InputType4 >::value,
+			!grb::is_object< InputType2 >::value,
 		void >::type * const = nullptr
 	) {
 #ifdef _DEBUG
@@ -422,10 +420,9 @@ namespace grb {
 		const bool selected_backend_does_not_support_masked_sptrsv = false;
 		assert( selected_backend_does_not_support_masked_sptrsv );
 #endif
-		(void) x;
-		(void) x_mask;
+		(void) xb;
+		(void) mask;
 		(void) T;
-		(void) b;
 		(void) semiring;
 		(void) subtraction;
 		(void) division;

From 01f7ed05595e95b55a818838c1c8cf40571b5829 Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Fri, 4 Apr 2025 15:37:58 +0200
Subject: [PATCH 03/55] First non-vectorised reference sptrsv implementation,
 not yet tested

---
 include/graphblas/reference/blas2.hpp | 254 ++++++++++++++++++++++++++
 1 file changed, 254 insertions(+)

diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp
index 7a4d3e135..2cfeac82b 100644
--- a/include/graphblas/reference/blas2.hpp
+++ b/include/graphblas/reference/blas2.hpp
@@ -2216,6 +2216,260 @@ namespace grb {
 
 	namespace internal {
 
+#ifndef _H_GRB_REFERENCE_OMP_BLAS2
+		/** Backend-independent sptrsv kernel code. */
+		template<
+			typename IOType, typename InputType1,
+			typename IND, typename NIT,
+			class Semiring, class Subtraction
+		>
+		inline void sptrsv_kernel(
+			const Compressed_Storage< InputType1, IND, NIT > &crs,
+			const size_t i,
+			IOType * const x,
+			IOType &divBy,
+			const Semiring &semiring,
+			const Subtraction &subtraction
+		) {
+			constexpr auto one = Semiring::template getOne< typename Semiring::D2 >();
+			for( size_t k = crs.col_start[ i ]; k < crs.col_start[ i + 1 ]; ++k ) {
+				const typename Semiring::D2 val = crs.template getValue( k, one );
+				const auto &ind = crs.row_index[ k ];
+				if( ind == i ) {
+					divBy = val;
+					continue;
+				}
+				const typename Semiring::D3 tmp;
+				(void) grb::apply( tmp, val, x[ ind ],
+					semiring.getMultiplicativeOperator() );
+				(void) grb::foldl( x[ i ], tmp, subtraction );
+			}
+		}
+#endif
+
+		/** \internal Specialised dense unmasked sptrsv implementation */
+		template<
+			Descriptor descr,
+			class Semiring, class Subtraction, class Division,
+			typename IOType, typename InputType1,
+			typename Coords, typename RIT, typename CIT, typename NIT
+		>
+		RC dense_unmasked_sptrsv(
+			Vector< IOType, reference, Coords > &xb,
+			const Matrix< InputType1, reference, RIT, CIT, NIT > &T,
+			const size_t &n,
+			const bool forward,
+			const Semiring &semiring,
+			const Subtraction &subtraction,
+			const Division &division,
+			const Phase &phase
+		) {
+			// dynamic sanity checks
+			assert( grb::size( xb ) == n );
+			assert( grb::nnz( xb ) == n );
+			assert( grb::nrows( T ) == n );
+			assert( grb::ncols( T ) == n );
+
+			// in dense unmasked, resize is a no-op
+			if( phase == grb::RESIZE ) { return grb::SUCCESS; }
+
+			// only execute and resize are supported
+			assert( phase == grb::EXECUTE );
+
+			// get required data handles
+			IOType * const v_raw = internal::getRaw( xb );
+			size_t new_nz = 0;
+
+			// switch forward or backward solve
+			if( forward ) {
+				const auto &crs = internal::getCRS( T );
+				for( size_t i = 0; i < n; ++i ) {
+					IOType divBy = semiring.template getZero< IOType >();
+					assert( crs.col_start[ i ] <= crs.col_start[ i + 1 ] );
+					sptrsv_kernel( crs, i, v_raw, divBy, semiring, subtraction );
+					(void) grb::foldl( v_raw[ i ], divBy, division );
+				}
+			} else {
+				const auto &ccs = internal::getCCS( T );
+				for( size_t i = n - 1; i < n; --i ) {
+					IOType divBy = semiring.template getZero< IOType >();
+					assert( ccs.col_start[ i ] <= ccs.col_start[ i + 1 ] );
+					sptrsv_kernel( ccs, i, v_raw, divBy, semiring, subtraction );
+					(void) grb::foldl( v_raw[ i ], divBy, division );
+				}
+			}
+
+			// done
+			return grb::SUCCESS;
+		}
+
+		/**
+		 * \internal Implements sparse masked, sparse unmasked, and dense masked
+		 *           sptrsv.
+		 */
+		template<
+			Descriptor descr,
+			bool masked, bool sparse,
+			class Semiring, class Subtraction, class Division,
+			typename IOType, typename InputType1, typename InputType2,
+			typename Coords, typename RIT, typename CIT, typename NIT
+		>
+		RC generic_sptrsv(
+			Vector< IOType, reference, Coords > &xb,
+			const Vector< InputType2, reference, Coords > &mask,
+			const Matrix< InputType1, reference, RIT, CIT, NIT > &T,
+			const size_t &n,
+			const bool forward,
+			const Semiring &semiring,
+			const Subtraction &subtraction,
+			const Division &division,
+			const Phase &phase
+		) {
+			static_assert( masked || sparse, "Internal logic error; please submit a bug "
+				"report" );
+			assert( grb::size( xb ) == n );
+			assert( !masked || grb::size( mask ) == n );
+			assert( grb::nrows( T ) == n );
+			assert( grb::ncols( T ) == n );
+			(void) masked;
+			(void) sparse;
+			(void) semiring;
+			(void) subtraction;
+			(void) division;
+			(void) phase;
+			std::cerr << "Warning: masked sptrsv not yet implemented\n";
+			return grb::UNSUPPORTED;
+		}
+
+	} // end grb::internal
+
+	template<
+		Descriptor descr = descriptors::no_operation,
+		class Semiring, class Subtraction, class Division,
+		typename IOType, typename InputType1,
+		typename Coords, typename RIT, typename CIT, typename NIT
+	>
+	RC sptrsv(
+		Vector< IOType, reference, Coords > &xb,
+		const Matrix< InputType1, reference, RIT, CIT, NIT > &T,
+		const bool forward,
+		const Semiring &semiring = Semiring(),
+		const Subtraction &subtraction = Subtraction(),
+		const Division &division = Division(),
+		const Phase &phase = EXECUTE,
+		const typename std::enable_if<
+			grb::is_semiring< Semiring >::value &&
+			grb::is_operator< Subtraction >::value &&
+			grb::is_operator< Division >::value &&
+			!grb::is_object< IOType >::value &&
+			!grb::is_object< InputType1 >::value,
+		void >::type * const = nullptr
+	) {
+		// check contract
+		constexpr bool dense = descr & descriptors::dense;
+		const size_t n = size( xb );
+		if( grb::nrows( T ) != n ) {
+			std::cerr << "Error, sptrsv (unmasked): matrix should have a number of rows "
+				<< "equal to the size of the input/output vector\n";
+			return grb::ILLEGAL;
+		}
+		if( grb::ncols( T ) != n ) {
+			std::cerr << "Error, sptrsv (unmasked): matrix should have a number of "
+				<< "columns equal to the size of the input/output vector\n";
+			return grb::ILLEGAL;
+		}
+		if( dense && grb::nnz( xb ) < n ) {
+			std::cerr << "Error, sptrsv (unmasked): sparse vector given in conjunction "
+				<< "with a dense descriptor\n";
+			return grb::ILLEGAL;
+		}
+
+		// check trivial
+		if( n == 0 ) { return grb::SUCCESS; }
+
+		// check dense dispatch
+		if( dense || grb::nnz( xb ) == n ) {
+			return internal::dense_unmasked_sptrsv< descr >( xb, T, forward, semiring,
+				subtraction, division, phase );
+		} else {
+			grb::Vector< IOType, reference, Coords > no_mask( 0 );
+			return internal::generic_sptrsv< descr, false, true >( xb, no_mask, T,
+				forward, semiring, subtraction, division, phase );
+		}
+	}
+
+	template<
+		Descriptor descr = descriptors::no_operation,
+		class Semiring,
+		class Subtraction,
+		class Division,
+		typename IOType, typename InputType1, typename InputType2,
+		typename Coords, typename RIT, typename CIT, typename NIT
+	>
+	RC sptrsv(
+		Vector< IOType, reference, Coords > &xb,
+		const Vector< InputType2, reference, Coords > &mask,
+		const Matrix< InputType1, reference, RIT, CIT, NIT > &T,
+		const bool forward,
+		const Semiring &semiring = Semiring(),
+		const Subtraction &subtraction = Subtraction(),
+		const Division &division = Division(),
+		const Phase &phase = EXECUTE,
+		const typename std::enable_if<
+			grb::is_semiring< Semiring >::value &&
+			grb::is_operator< Subtraction >::value &&
+			grb::is_operator< Division >::value &&
+			!grb::is_object< IOType >::value &&
+			!grb::is_object< InputType1 >::value &&
+			!grb::is_object< InputType2 >::value,
+		void >::type * const = nullptr
+	) {
+		// check if can forward
+		if( grb::size( mask ) == 0 ) {
+			return sptrsv< descr >( xb, T, forward, semiring, subtraction, division,
+				phase );
+		}
+
+		// check contract
+		constexpr bool dense = descr & descriptors::dense;
+		const size_t n = size( xb );
+		if( grb::size( mask ) != n ) {
+			std::cerr << "Error, grb::sptrsv (masked): mask and input/output vector "
+				<< "should have the same size\n";
+			return grb::ILLEGAL;
+		}
+		if( grb::nrows( T ) != n ) {
+			std::cerr << "Error, grb::sptrsv (masked): matrix should have a number of "
+				<< "rows equal to the size of the input/output vector\n";
+			return grb::ILLEGAL;
+		}
+		if( grb::ncols( T ) != n ) {
+			std::cerr << "Error, grb::sptrsv (masked): matrix should have a number of "
+				<< "columns equal to the size of the input/output vector\n";
+			return grb::ILLEGAL;
+		}
+		if( dense && grb::nnz( mask ) < n ) {
+			std::cerr << "Error, grb::sptrsv (masked): sparse mask but dense descriptor "
+				<< "was given\n";
+			return grb::ILLEGAL;
+		}
+		if( dense && grb::nnz( xb ) < n ) {
+			std::cerr << "Error, grb::sptrsv (masked): sparse input vector given but "
+				"dense descriptor was given\n";
+			return grb::ILLEGAL;
+		}
+
+		if( dense || (grb::nnz( xb ) == n && grb::nnz( mask ) == n) ) {
+			return internal::generic_sptrsv< descr, true, false >( xb, mask, T, n,
+				forward, semiring, subtraction, division, phase );
+		} else {
+			return internal::generic_sptrsv< descr, true, true >( xb, mask, T, n,
+				forward, semiring, subtraction, division, phase );
+		}
+	}
+
+	namespace internal {
+
 #ifndef _H_GRB_REFERENCE_OMP_BLAS2
 		/**
 		 * A nonzero wrapper for use with grb::eWiseLambda over matrices.

From e8b921e8f3dbbc9c67c90deeec362625aa9fa43e Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Fri, 4 Apr 2025 17:07:24 +0200
Subject: [PATCH 04/55] Add unmasked sptrsv to base spec

---
 include/graphblas/base/blas2.hpp | 46 ++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/include/graphblas/base/blas2.hpp b/include/graphblas/base/blas2.hpp
index 902c83a58..7e3274ea4 100644
--- a/include/graphblas/base/blas2.hpp
+++ b/include/graphblas/base/blas2.hpp
@@ -430,6 +430,52 @@ namespace grb {
 		return UNSUPPORTED;
 	}
 
+	/**
+	 * Computes \f$ x \f$ from \f$ Tx = b \f$, unmasked variant.
+	 *
+	 * \todo Extend documentation
+	 */
+	template<
+		Descriptor descr = descriptors::no_operation,
+		class Semiring,
+		class Subtraction,
+		class Division,
+		typename IOType, typename InputType1,
+		typename Coords, typename RIT, typename CIT, typename NIT,
+		Backend backend
+	>
+	RC sptrsv(
+		Vector< IOType, backend, Coords > &xb,
+		const Matrix< InputType1, backend, RIT, CIT, NIT > &T,
+		const bool forward,
+		const Semiring &semiring = Semiring(),
+		const Subtraction &subtraction = Subtraction(),
+		const Division &division = Division(),
+		const Phase &phase = EXECUTE,
+		const typename std::enable_if<
+			grb::is_semiring< Semiring >::value &&
+			grb::is_operator< Subtraction >::value &&
+			grb::is_operator< Division >::value &&
+			!grb::is_object< IOType >::value &&
+			!grb::is_object< InputType1 >::value,
+		void >::type * const = nullptr
+	) {
+#ifdef _DEBUG
+		std::cerr << "Selected backend does not implement masked grb::sptrsv\n";
+#endif
+#ifndef NDEBUG
+		const bool selected_backend_does_not_support_unmasked_sptrsv = false;
+		assert( selected_backend_does_not_support_unmasked_sptrsv );
+#endif
+		(void) xb;
+		(void) T;
+		(void) semiring;
+		(void) subtraction;
+		(void) division;
+		(void) phase;
+		return UNSUPPORTED;
+	}
+
 	/**
 	 * Executes an arbitrary element-wise user-defined function \a f on all
 	 * nonzero elements of a given matrix \a A.

From f28db62d953ec6f79482deccb171f0a021d2305d Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Fri, 4 Apr 2025 17:07:58 +0200
Subject: [PATCH 05/55] Bugfixes, remove compiler warnings, and some debug
 tracing

---
 include/graphblas/reference/blas2.hpp | 38 ++++++++++++++++++++++-----
 1 file changed, 32 insertions(+), 6 deletions(-)

diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp
index 2cfeac82b..9b60c5d7d 100644
--- a/include/graphblas/reference/blas2.hpp
+++ b/include/graphblas/reference/blas2.hpp
@@ -2231,15 +2231,23 @@ namespace grb {
 			const Semiring &semiring,
 			const Subtraction &subtraction
 		) {
-			constexpr auto one = Semiring::template getOne< typename Semiring::D2 >();
+			constexpr auto one =
+				Semiring::template One< typename Semiring::D2 >::value();
 			for( size_t k = crs.col_start[ i ]; k < crs.col_start[ i + 1 ]; ++k ) {
 				const typename Semiring::D2 val = crs.template getValue( k, one );
 				const auto &ind = crs.row_index[ k ];
-				if( ind == i ) {
+				if( static_cast< size_t >(ind) == i ) {
+ #ifdef _DEBUG
+					std::cout << "\tskipping nonzero at " << i ", " << i << "\n";
+ #endif
 					divBy = val;
 					continue;
 				}
-				const typename Semiring::D3 tmp;
+ #ifdef _DEBUG
+				std::cout << "\t x[ " << i << " ] (" << x[i] << ") -= " << val << "* x[ "
+					<< ind << " ] (" << x[ind] << ")\n";
+ #endif
+				typename Semiring::D3 tmp;
 				(void) grb::apply( tmp, val, x[ ind ],
 					semiring.getMultiplicativeOperator() );
 				(void) grb::foldl( x[ i ], tmp, subtraction );
@@ -2278,7 +2286,6 @@ namespace grb {
 
 			// get required data handles
 			IOType * const v_raw = internal::getRaw( xb );
-			size_t new_nz = 0;
 
 			// switch forward or backward solve
 			if( forward ) {
@@ -2287,6 +2294,10 @@ namespace grb {
 					IOType divBy = semiring.template getZero< IOType >();
 					assert( crs.col_start[ i ] <= crs.col_start[ i + 1 ] );
 					sptrsv_kernel( crs, i, v_raw, divBy, semiring, subtraction );
+#ifdef _DEBUG
+					std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy
+						<< "\n";
+#endif
 					(void) grb::foldl( v_raw[ i ], divBy, division );
 				}
 			} else {
@@ -2295,6 +2306,10 @@ namespace grb {
 					IOType divBy = semiring.template getZero< IOType >();
 					assert( ccs.col_start[ i ] <= ccs.col_start[ i + 1 ] );
 					sptrsv_kernel( ccs, i, v_raw, divBy, semiring, subtraction );
+#ifdef _DEBUG
+					std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy
+						<< "\n";
+#endif
 					(void) grb::foldl( v_raw[ i ], divBy, division );
 				}
 			}
@@ -2325,14 +2340,25 @@ namespace grb {
 			const Division &division,
 			const Phase &phase
 		) {
+			// static checks
 			static_assert( masked || sparse, "Internal logic error; please submit a bug "
 				"report" );
+
+			//dynamic checks
 			assert( grb::size( xb ) == n );
 			assert( !masked || grb::size( mask ) == n );
 			assert( grb::nrows( T ) == n );
 			assert( grb::ncols( T ) == n );
+
 			(void) masked;
 			(void) sparse;
+#ifdef NDEBUG
+			(void) xb;
+			(void) mask;
+			(void) T;
+			(void) n;
+#endif
+			(void) forward;
 			(void) semiring;
 			(void) subtraction;
 			(void) division;
@@ -2389,11 +2415,11 @@ namespace grb {
 
 		// check dense dispatch
 		if( dense || grb::nnz( xb ) == n ) {
-			return internal::dense_unmasked_sptrsv< descr >( xb, T, forward, semiring,
+			return internal::dense_unmasked_sptrsv< descr >( xb, T, n, forward, semiring,
 				subtraction, division, phase );
 		} else {
 			grb::Vector< IOType, reference, Coords > no_mask( 0 );
-			return internal::generic_sptrsv< descr, false, true >( xb, no_mask, T,
+			return internal::generic_sptrsv< descr, false, true >( xb, no_mask, T, n,
 				forward, semiring, subtraction, division, phase );
 		}
 	}

From 6181cc3b27d96bb2f630489d2e3004d36f9c00ca Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Fri, 4 Apr 2025 17:25:28 +0200
Subject: [PATCH 06/55] Add test for new sptrsv function

---
 tests/unit/CMakeLists.txt   |   6 +
 tests/unit/dense_sptrsv.cpp | 320 ++++++++++++++++++++++++++++++++++++
 2 files changed, 326 insertions(+)
 create mode 100644 tests/unit/dense_sptrsv.cpp

diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index 5cd03cf77..049af30e9 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -374,6 +374,12 @@ add_grb_executables( launch_benchmark_frommpi_manual launcherAndBenchmarker.cpp
 	COMPILE_DEFINITIONS NO_LPF_AUTO_INIT
 )
 
+# tests for experimental features
+
+add_grb_executables( dense_sptrsv dense_sptrsv.cpp
+	BACKENDS reference reference_omp
+)
+
 # targets to list and build the test for this category
 get_property( unit_tests_list GLOBAL PROPERTY tests_category_unit )
 add_custom_target( "list_tests_category_unit"
diff --git a/tests/unit/dense_sptrsv.cpp b/tests/unit/dense_sptrsv.cpp
new file mode 100644
index 000000000..c7138fe94
--- /dev/null
+++ b/tests/unit/dense_sptrsv.cpp
@@ -0,0 +1,320 @@
+
+/*
+ *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <map>
+#include <set>
+#include <cmath>
+#include <array>
+#include <cstdio>
+#include <string>
+#include <vector>
+#include <climits>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <sstream>
+#include <algorithm>
+
+#include <inttypes.h>
+
+#include "graphblas/algorithms/matrix_factory.hpp"
+#include "graphblas/utils/timer.hpp"
+
+#include "graphblas.hpp"
+
+
+using namespace grb;
+
+struct input {
+	size_t n;
+	size_t rep;
+};
+
+struct output {
+	enum grb::RC error_code;
+	grb::utils::TimerResults times;
+};
+
+template< typename T, class Semiring >
+bool checkVectorDenseAndEqualTo(
+	const grb::Vector< T > &x, const T val, struct output &out,
+	const Semiring &semiring
+) {
+	size_t count = 0;
+	bool error = false;
+	std::set< size_t > localSet;
+	const size_t n = grb::size( x );
+	for( const auto &pair : x ) {
+		(void) ++count;
+		localSet.insert( pair.first );
+		if( pair.second != val ) {
+			std::cout << "unexpected entry ( " << pair.first << ", " << pair.second
+				<< " ); expected value " << val << "\n";
+			error = true;
+		}
+	}
+	if( grb::collectives<>::allreduce( count, semiring.getAdditiveOperator() )
+		!= grb::SUCCESS
+	) {
+		std::cerr << "error during computing checksum (I)\n";
+		out.error_code = grb::PANIC;
+		return false;
+	}
+	if( count != n ) {
+		std::cout << "expected " << n << " nonzeroes, got " << count << "\n";
+		error = true;
+	}
+	count = localSet.size();
+	if( grb::collectives<>::allreduce( count, semiring.getAdditiveOperator() )
+		!= grb::SUCCESS
+	) {
+		std::cerr << "error during computing checksum (II)\n";
+		out.error_code = grb::PANIC;
+		return false;
+	}
+	if( count != n ) {
+		std::cout << "expected " << n << " indices, got " << count << "\n";
+		error = true;
+	}
+	if( out.error_code == grb::SUCCESS ) {
+		if( error ) {
+			out.error_code = grb::FAILED;
+			return false;
+		}
+	}
+	return true;
+}
+
+// main label propagation algorithm
+void grbProgram( const struct input &data_in, struct output &out ) {
+	out.error_code = grb::PANIC;
+	grb::utils::Timer timer;
+
+	const size_t s = grb::spmd<>::pid();
+#ifndef NDEBUG
+	assert( s < grb::spmd<>::nprocs() );
+#else
+	(void) s;
+#endif
+
+	// get input n and test case
+	const size_t n = data_in.n;
+
+	// setup
+	grb::Vector< double > x( n );
+	grb::Matrix< double > A = grb::algorithms::matrices< double >::identity( n );
+	Semiring<
+		grb::operators::add< double >, grb::operators::mul< double >,
+		grb::identities::zero, grb::identities::one
+	> semiring;
+	grb::operators::divide< double > division;
+	grb::operators::subtract< double > subtraction;
+
+	out.error_code = grb::set( x, 3.14 );
+	if( out.error_code ) {
+		std::cerr << "Error during test initialisation\n";
+		return;
+	}
+	if( !checkVectorDenseAndEqualTo( x, 3.14, out, semiring ) ) {
+		std::cerr << "NOTE: the aforementioned errors occurred during test "
+			<< "initialisation\n";
+		return;
+	}
+	out.times.preamble = timer.time();
+
+	if( s == 0 ) {
+		std::cout << "Test 1: ";
+	}
+	timer.reset();
+	out.error_code = grb::sptrsv( x, A, false, semiring, subtraction, division );
+	if( out.error_code ) {
+		std::cerr << "test returned error\n";
+	}
+	if( !checkVectorDenseAndEqualTo( x, 3.14, out, semiring ) ) { return; }
+	out.times.postamble += timer.time();
+	timer.reset();
+	for( size_t rep = 0; rep < data_in.rep; ++rep ) {
+		(void) grb::sptrsv( x, A, false, semiring, subtraction, division );
+	}
+	out.times.useful += timer.time();
+
+	assert( out.error_code == grb::SUCCESS );
+	if( s == 0 ) {
+		std::cout << "\b\b 2: ";
+	}
+	timer.reset();
+	out.error_code = grb::set< grb::descriptors::dense >( x, 2.14 );
+	if( out.error_code ) {
+		std::cerr << "Error during test initialisation\n";
+		return;
+	}
+	if( !checkVectorDenseAndEqualTo( x, 2.14, out, semiring ) ) {
+		std::cerr << "NOTE: aforementioned errors occurred during initialisation\n";
+		return;
+	}
+	out.times.preamble += timer.time();
+	timer.reset();
+	out.error_code = grb::sptrsv( x, A, true, semiring, subtraction, division );
+	if( out.error_code ) {
+		std::cerr << "test returned error\n";
+	}
+	if( !checkVectorDenseAndEqualTo( x, 2.14, out, semiring ) ) { return; }
+	out.times.postamble += timer.time();
+	timer.reset();
+	for( size_t rep = 0; rep < data_in.rep; ++rep ) {
+		(void) grb::sptrsv( x, A, true, semiring, subtraction, division );
+	}
+	out.times.useful += timer.time();
+
+	assert( out.error_code == grb::SUCCESS );
+	if( s == 0 ) {
+		std::cout << "\b\b 3: ";
+	}
+	timer.reset();
+	out.error_code = grb::set< grb::descriptors::dense >( x, 3.14 );
+	if( out.error_code ) {
+		std::cerr << "Error during test initialisation\n";
+		return;
+	}
+	if( !checkVectorDenseAndEqualTo( x, 3.14, out, semiring ) ) {
+		std::cerr << "NOTE: aforementioned errors occurred during initialisation\n";
+		return;
+	}
+	out.times.preamble += timer.time();
+	timer.reset();
+	out.error_code = grb::sptrsv< grb::descriptors::dense >( x, A, false, semiring,
+		subtraction, division );
+	if( out.error_code ) {
+		std::cerr << "test returned error\n";
+	}
+	if( !checkVectorDenseAndEqualTo( x, 3.14, out, semiring ) ) { return; }
+	out.times.postamble += timer.time();
+	timer.reset();
+	for( size_t rep = 0; rep < data_in.rep; ++rep ) {
+		(void) grb::sptrsv< grb::descriptors::dense >( x, A, false, semiring,
+			subtraction, division );
+	}
+	out.times.useful += timer.time();
+
+	assert( out.error_code == grb::SUCCESS );
+	if( s == 0 ) {
+		std::cout << "\b\b 4: ";
+	}
+	timer.reset();
+	out.error_code = grb::set< grb::descriptors::dense >( x, 2.14 );
+	if( out.error_code ) {
+		std::cerr << "Error during test initialisation\n";
+		return;
+	}
+	if( !checkVectorDenseAndEqualTo( x, 2.14, out, semiring ) ) {
+		std::cerr << "NOTE: aforementioned errors occurred during initialisation\n";
+		return;
+	}
+	out.times.preamble += timer.time();
+	timer.reset();
+	out.error_code = grb::sptrsv< grb::descriptors::dense >( x, A, true, semiring,
+		subtraction, division );
+	if( out.error_code ) {
+		std::cerr << "test returned error\n";
+	}
+	if( !checkVectorDenseAndEqualTo( x, 2.14, out, semiring ) ) { return; }
+	out.times.postamble += timer.time();
+	timer.reset();
+	for( size_t rep = 0; rep < data_in.rep; ++rep ) {
+		(void) grb::sptrsv< grb::descriptors::dense >( x, A, true, semiring,
+			subtraction, division );
+	}
+	out.times.useful += timer.time();
+
+	std::cout << "OK\n";
+}
+
+// main function will execute in serial or as SPMD
+int main( int argc, char ** argv ) {
+	// sanity check
+	if( argc < 3 || argc > 5 ) {
+		std::cout << "Usage: " << argv[ 0 ] << " <problem size> "
+			<< "(inner repititions) (outer repititions)" << std::endl;
+		return 0;
+	}
+	std::cout << "Test executable: " << argv[ 0 ] << std::endl;
+
+	// the input struct
+	struct input in;
+	in.n = atoi( argv[ 1 ] );
+	in.rep = grb::config::BENCHMARKING::inner();
+	size_t outer = grb::config::BENCHMARKING::outer();
+	char * end = NULL;
+	if( argc >= 3 ) {
+		in.rep = strtoumax( argv[ 2 ], &end, 10 );
+		if( argv[ 2 ] == end ) {
+			std::cerr << "Could not parse argument for number of inner "
+				<< "repetitions." << std::endl;
+			return 25;
+		}
+	}
+	if( argc >= 4 ) {
+		outer = strtoumax( argv[ 3 ], &end, 10 );
+		if( argv[ 3 ] == end ) {
+			std::cerr << "Could not parse argument for number of outer "
+				<< "reptitions." << std::endl;
+			return 30;
+		}
+	}
+
+	std::cout << "Executable called with parameters: problem size " << in.n
+		<< ", inner = " << in.rep << ", outer = " << outer << "." << std::endl;
+
+	// the output struct
+	struct output out;
+
+	// run the program one time to infer number of inner repititions
+	if( in.rep == 0 ) {
+		in.rep = 1;
+		grb::Launcher< AUTOMATIC > launcher;
+		const enum grb::RC rc = launcher.exec( &grbProgram, in, out, true );
+		if( rc != SUCCESS ) {
+			std::cerr << "launcher.exec returns with non-SUCCESS error code "
+				<< grb::toString( rc ) << std::endl;
+			return 40;
+		}
+		// set guesstimate for inner repititions: a single experiment should take at least a second
+		in.rep = static_cast< double >( 1000.0 / out.times.useful ) + 1;
+		std::cout << "Auto-selected number of inner repetitions is "
+			<< in.rep << " (at an estimated time of "
+			<< out.times.useful << " ms. of useful work per benchmark).\n";
+	}
+
+	// start benchmarks
+	grb::Benchmarker< AUTOMATIC > benchmarker;
+	const enum grb::RC rc = benchmarker.exec( &grbProgram, in, out, 1, outer, true );
+	if( rc != SUCCESS ) {
+		std::cerr << "launcher.exec returns with non-SUCCESS error code "
+			<< grb::toString( rc ) << std::endl;
+		return 50;
+	}
+
+	// done
+	if( out.error_code != SUCCESS ) {
+		std::cout << "Test FAILED\n" << std::endl;
+		std::cerr << std::flush;
+		return out.error_code;
+	}
+	std::cout << "Test OK\n" << std::endl;
+	return 0;
+}
+

From 8b11500215f3352d12eb109b411fc1ddc7bc7996 Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Fri, 4 Apr 2025 17:26:25 +0200
Subject: [PATCH 07/55] Warn about experimental nature of new primitive

---
 include/graphblas/base/blas2.hpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/include/graphblas/base/blas2.hpp b/include/graphblas/base/blas2.hpp
index 7e3274ea4..2d61d46d4 100644
--- a/include/graphblas/base/blas2.hpp
+++ b/include/graphblas/base/blas2.hpp
@@ -340,6 +340,8 @@ namespace grb {
 	/**
 	 * Computes \f$ x \f$ from \f$ Tx=b \f$.
 	 *
+	 * \warning This is an experimental feature.
+	 *
 	 * Here, \f$ T, b \f$ are given while \f$ T \f$ additionally must be either
 	 * lower- or upper-triangular. The output \f$ x \f$ may furthermore be masked.
 	 *
@@ -433,6 +435,8 @@ namespace grb {
 	/**
 	 * Computes \f$ x \f$ from \f$ Tx = b \f$, unmasked variant.
 	 *
+	 * \warning This is an experimental feature.
+	 *
 	 * \todo Extend documentation
 	 */
 	template<

From 2bb446b7e156d211bc8d7f7367d7623d6bbf0795 Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Fri, 4 Apr 2025 17:48:03 +0200
Subject: [PATCH 08/55] Fix cast error

---
 include/graphblas/reference/blas2.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp
index 9b60c5d7d..9906f7471 100644
--- a/include/graphblas/reference/blas2.hpp
+++ b/include/graphblas/reference/blas2.hpp
@@ -2232,9 +2232,9 @@ namespace grb {
 			const Subtraction &subtraction
 		) {
 			constexpr auto one =
-				Semiring::template One< typename Semiring::D2 >::value();
+				Semiring::template One< typename Semiring::D1 >::value();
 			for( size_t k = crs.col_start[ i ]; k < crs.col_start[ i + 1 ]; ++k ) {
-				const typename Semiring::D2 val = crs.template getValue( k, one );
+				const typename Semiring::D1 val = crs.template getValue( k, one );
 				const auto &ind = crs.row_index[ k ];
 				if( static_cast< size_t >(ind) == i ) {
  #ifdef _DEBUG

From ef2c4e4587a0f64a2c39336261ed977ae2e5b7ab Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Fri, 4 Apr 2025 18:20:11 +0200
Subject: [PATCH 09/55] Simplify code a bit

---
 include/graphblas/reference/blas2.hpp | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp
index 9906f7471..bbaf93d1e 100644
--- a/include/graphblas/reference/blas2.hpp
+++ b/include/graphblas/reference/blas2.hpp
@@ -2233,6 +2233,7 @@ namespace grb {
 		) {
 			constexpr auto one =
 				Semiring::template One< typename Semiring::D1 >::value();
+			// TODO could do some ALP-style vectorisation here
 			for( size_t k = crs.col_start[ i ]; k < crs.col_start[ i + 1 ]; ++k ) {
 				const typename Semiring::D1 val = crs.template getValue( k, one );
 				const auto &ind = crs.row_index[ k ];
@@ -2260,10 +2261,10 @@ namespace grb {
 			Descriptor descr,
 			class Semiring, class Subtraction, class Division,
 			typename IOType, typename InputType1,
-			typename Coords, typename RIT, typename CIT, typename NIT
+			typename RIT, typename CIT, typename NIT
 		>
 		RC dense_unmasked_sptrsv(
-			Vector< IOType, reference, Coords > &xb,
+			IOType *__restrict__ const v_raw,
 			const Matrix< InputType1, reference, RIT, CIT, NIT > &T,
 			const size_t &n,
 			const bool forward,
@@ -2273,8 +2274,6 @@ namespace grb {
 			const Phase &phase
 		) {
 			// dynamic sanity checks
-			assert( grb::size( xb ) == n );
-			assert( grb::nnz( xb ) == n );
 			assert( grb::nrows( T ) == n );
 			assert( grb::ncols( T ) == n );
 
@@ -2284,9 +2283,6 @@ namespace grb {
 			// only execute and resize are supported
 			assert( phase == grb::EXECUTE );
 
-			// get required data handles
-			IOType * const v_raw = internal::getRaw( xb );
-
 			// switch forward or backward solve
 			if( forward ) {
 				const auto &crs = internal::getCRS( T );
@@ -2415,7 +2411,9 @@ namespace grb {
 
 		// check dense dispatch
 		if( dense || grb::nnz( xb ) == n ) {
-			return internal::dense_unmasked_sptrsv< descr >( xb, T, n, forward, semiring,
+			IOType * const xb_p = internal::getRaw( xb );
+			return internal::dense_unmasked_sptrsv< descr >(
+				xb_p, T, n, forward, semiring,
 				subtraction, division, phase );
 		} else {
 			grb::Vector< IOType, reference, Coords > no_mask( 0 );

From ef8992170cb26c1cfdc30769c73bd468b9702905 Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Fri, 4 Apr 2025 18:20:24 +0200
Subject: [PATCH 10/55] Enable new sptrsv unit test

---
 tests/unit/unittests.sh | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/tests/unit/unittests.sh b/tests/unit/unittests.sh
index 0b49d2421..e6d3f7644 100755
--- a/tests/unit/unittests.sh
+++ b/tests/unit/unittests.sh
@@ -197,6 +197,19 @@ for MODE in ${MODES}; do
 					grep 'Test OK' ${TEST_OUT_DIR}/mxm_crs_${MODE}_${BACKEND}_${P}_${T}.log || echo "Test FAILED"
 					echo " "
 
+					echo ">>>      [x]           [ ]       Testing grb::sptrsv (unmasked, dense vectors) on"
+					echo "                                 small simple matrices, re-entrant"
+					$runner ${TEST_BIN_DIR}/dense_sptrsv_${MODE}_${BACKEND} 10 1000 3 &> ${TEST_OUT_DIR}/dense_sptrsv_${MODE}_${BACKEND}_${T}.log
+					head -1 ${TEST_OUT_DIR}/dense_sptrsv_${MODE}_${BACKEND}_${T}.log
+					grep 'Test OK' ${TEST_OUT_DIR}/dense_sptrsv_${MODE}_${BACKEND}_${T}.log || echo "Test FAILED"
+					echo " "
+
+					echo ">>>      [x]           [ ]       Testing grb::sptrsv (unmasked, dense vectors) on"
+					echo "                                 large simple matrices, re-entrant"
+					$runner ${TEST_BIN_DIR}/dense_sptrsv_${MODE}_${BACKEND} 1000000 1 3 &> ${TEST_OUT_DIR}/dense_sptrsv_large_${MODE}_${BACKEND}_${T}.log
+					head -1 ${TEST_OUT_DIR}/dense_sptrsv_large_${MODE}_${BACKEND}_${T}.log
+					grep 'Test OK' ${TEST_OUT_DIR}/dense_sptrsv_large_${MODE}_${BACKEND}_${T}.log || echo "Test FAILED"
+					echo " "
 				fi
 
 				echo "#################################################################"

From e28e1d42c06ea89157b2c268e6cf14e5a4a03ad4 Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Fri, 4 Apr 2025 20:44:55 +0200
Subject: [PATCH 11/55] Add parallel (OpenMP) sptrsv, which may take a schedule
 such as produced by OneStopParallel

---
 .../graphblas/reference/SptrsvSchedule.hpp    | 118 ++++++++++++++++
 include/graphblas/reference/blas2.hpp         | 129 +++++++++++++++---
 include/graphblas/reference/matrix.hpp        |  34 ++++-
 3 files changed, 258 insertions(+), 23 deletions(-)
 create mode 100644 include/graphblas/reference/SptrsvSchedule.hpp

diff --git a/include/graphblas/reference/SptrsvSchedule.hpp b/include/graphblas/reference/SptrsvSchedule.hpp
new file mode 100644
index 000000000..86479c7d4
--- /dev/null
+++ b/include/graphblas/reference/SptrsvSchedule.hpp
@@ -0,0 +1,118 @@
+
+/*
+ *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * @author A. N. Yzelman
+ * @date 4th of April, 2025
+ */
+
+#ifndef _H_GRB_REFERENCE_SPTRSVSCHEDULE
+#define _H_GRB_REFERENCE_SPTRSVSCHEDULE
+
+#include <limits>
+#include <vector>
+
+#include <graphblas/utils/autodeleter.hpp>
+
+
+namespace grb {
+
+	namespace internal {
+
+		template< typename NIT >
+		class SptrsvSchedule {
+
+			private:
+
+				/** One deleter for each chunk of thread-local data. */
+				std::vector< utils::AutoDeleter< char > > _deleters;
+
+				/** Computes and initialises a trivial schedule. */
+				void initTrivial( const NIT n ) {
+					assert( data[ 0 ] == nullptr );
+					const grb::RC rc = utils::alloc(
+						"grb::internal::SptrsvSchedule (default constructor)",
+						"default thread-local data allocation",
+						data[ 0 ], 2 * sizeof(NIT), false, _deleters[ 0 ]
+					);
+					if( rc != grb::SUCCESS ) {
+						throw std::bad_alloc();
+					}
+					NIT * const interpreted = reinterpret_cast< NIT * >(data[0]);
+					interpreted[ 0 ] = 0;
+					interpreted[ 1 ] = n;
+				}
+
+
+			public:
+
+				/** Number of schedule steps. */
+				size_t supersteps;
+
+				/**
+				 * The number of threads the schedule is designed for.
+				 *
+				 * \note Uses int as per OpenMP spec.
+				 */
+				int nThreads;
+
+				/** One data pointer per thread. */
+				std::vector< char * > data;
+
+				/**
+				 * Base constructor.
+				 *
+				 * @param[in] n The size of the linear system.
+				 *
+				 * Initialises the schedule to the trivial always-valid schedule, which is
+				 * a single-thread single-superstep schedule that assigns all work to the
+				 * first (and only) thread.
+				 */
+				SptrsvSchedule( const NIT n ) :
+					_deleters( 1 ), supersteps( 1 ), nThreads( 1 ), data( 1 )
+				{
+					data[ 0 ] = nullptr;
+					initTrivial( n );
+				}
+
+				/**
+				 * Specialised constructor for multiple threads.
+				 *
+				 * @param[in] n The size of the linear system.
+				 * @param[in] T The number of threads a schedule should be created for.
+				 *
+				 * Initialises the schedule to the trivial always-valid schedule, which is
+				 * a single-superstep schedule that assigns all work to the first thread.
+				 */
+				SptrsvSchedule( const NIT n, const size_t T ) :
+					_deleters( T ), supersteps( 0 ), nThreads( T ), data( T )
+				{
+					data[ 0 ] = nullptr;
+					if( T == 0 || T > std::numeric_limits< int >::max() ) {
+						throw std::runtime_error( "Invalid number of threads" );
+					}
+					initTrivial( n );
+				}
+
+		};
+
+	} // end namespace `grb::internal'
+
+} // end namespace `grb'
+
+#endif // _H_GRB_REFERENCE_SPTRSVSCHEDULE
+
diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp
index bbaf93d1e..e79c3a80c 100644
--- a/include/graphblas/reference/blas2.hpp
+++ b/include/graphblas/reference/blas2.hpp
@@ -2256,17 +2256,17 @@ namespace grb {
 		}
 #endif
 
-		/** \internal Specialised dense unmasked sptrsv implementation */
+		/** \internal Specialised dense unmasked sptrsv implementation, sequential */
 		template<
-			Descriptor descr,
+			Descriptor descr, bool maybe_offset,
 			class Semiring, class Subtraction, class Division,
 			typename IOType, typename InputType1,
 			typename RIT, typename CIT, typename NIT
 		>
-		RC dense_unmasked_sptrsv(
+		RC dense_unmasked_sequential_sptrsv(
 			IOType *__restrict__ const v_raw,
 			const Matrix< InputType1, reference, RIT, CIT, NIT > &T,
-			const size_t &n,
+			const size_t offset, const size_t &n,
 			const bool forward,
 			const Semiring &semiring,
 			const Subtraction &subtraction,
@@ -2286,27 +2286,53 @@ namespace grb {
 			// switch forward or backward solve
 			if( forward ) {
 				const auto &crs = internal::getCRS( T );
-				for( size_t i = 0; i < n; ++i ) {
-					IOType divBy = semiring.template getZero< IOType >();
-					assert( crs.col_start[ i ] <= crs.col_start[ i + 1 ] );
-					sptrsv_kernel( crs, i, v_raw, divBy, semiring, subtraction );
+				if( maybe_offset ) {
+					for( size_t i = 0; i < n; ++i ) {
+						IOType divBy = semiring.template getZero< IOType >();
+						assert( crs.col_start[ i ] <= crs.col_start[ i + 1 ] );
+						sptrsv_kernel( crs, i, v_raw, divBy, semiring, subtraction );
 #ifdef _DEBUG
-					std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy
-						<< "\n";
+						std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy
+							<< "\n";
 #endif
-					(void) grb::foldl( v_raw[ i ], divBy, division );
+						(void) grb::foldl( v_raw[ i ], divBy, division );
+					}
+				} else {
+					for( size_t i = offset; i < n; ++i ) {
+						IOType divBy = semiring.template getZero< IOType >();
+						assert( crs.col_start[ i ] <= crs.col_start[ i + 1 ] );
+						sptrsv_kernel( crs, i, v_raw, divBy, semiring, subtraction );
+#ifdef _DEBUG
+						std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy
+							<< "\n";
+#endif
+						(void) grb::foldl( v_raw[ i ], divBy, division );
+					}
 				}
 			} else {
 				const auto &ccs = internal::getCCS( T );
-				for( size_t i = n - 1; i < n; --i ) {
-					IOType divBy = semiring.template getZero< IOType >();
-					assert( ccs.col_start[ i ] <= ccs.col_start[ i + 1 ] );
-					sptrsv_kernel( ccs, i, v_raw, divBy, semiring, subtraction );
+				if( !maybe_offset || offset == 0 ) {
+					for( size_t i = n - 1; i < n; --i ) {
+						IOType divBy = semiring.template getZero< IOType >();
+						assert( ccs.col_start[ i ] <= ccs.col_start[ i + 1 ] );
+						sptrsv_kernel( ccs, i, v_raw, divBy, semiring, subtraction );
 #ifdef _DEBUG
-					std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy
-						<< "\n";
+						std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy
+							<< "\n";
 #endif
-					(void) grb::foldl( v_raw[ i ], divBy, division );
+						(void) grb::foldl( v_raw[ i ], divBy, division );
+					}
+				} else {
+					for( size_t i = n - 1 + offset; i >= offset; --i ) {
+						IOType divBy = semiring.template getZero< IOType >();
+						assert( ccs.col_start[ i ] <= ccs.col_start[ i + 1 ] );
+						sptrsv_kernel( ccs, i, v_raw, divBy, semiring, subtraction );
+#ifdef _DEBUG
+						std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy
+							<< "\n";
+#endif
+						(void) grb::foldl( v_raw[ i ], divBy, division );
+					}
 				}
 			}
 
@@ -2314,6 +2340,63 @@ namespace grb {
 			return grb::SUCCESS;
 		}
 
+#ifdef _H_GRB_REFERENCE_OMP_BLAS2
+		/** \internal Specialised dense unmasked sptrsv implementation, OpenMP */
+		template<
+			Descriptor descr,
+			class Semiring, class Subtraction, class Division,
+			typename IOType, typename InputType1,
+			typename RIT, typename CIT, typename NIT
+		>
+		RC dense_unmasked_omp_sptrsv(
+			IOType *__restrict__ const v_raw,
+			const Matrix< InputType1, reference, RIT, CIT, NIT > &T,
+			const size_t &n,
+			const bool forward,
+			const Semiring &semiring,
+			const Subtraction &subtraction,
+			const Division &division,
+			const Phase &phase
+		) {
+			// dynamic sanity checks
+			assert( grb::nrows( T ) == n );
+			assert( grb::ncols( T ) == n );
+
+			// in dense unmasked, resize is a no-op
+			if( phase == grb::RESIZE ) { return grb::SUCCESS; }
+
+			// get SpTrsv schedule
+			const SptrsvSchedule< NIT > *sptrsvSchedule_p =
+				internal::getSptrsvData( T );
+			const SptrsvSchedule< NIT > &sptrsv = sptrsvSchedule_p == nullptr ?
+				SptrsvSchedule< NIT >( n ) : *sptrsvSchedule_p;
+
+			// only execute and resize are supported
+			assert( phase == grb::EXECUTE );
+
+			// start parallel section
+			grb::RC ret = grb::SUCCESS;
+			#pragma omp parallel num_threads(sptrsv.nThreads)
+			{
+				const int s = omp_get_thread_num();
+				const NIT *__restrict__ data =
+					reinterpret_cast< const NIT * >(sptrsv.data[ s ]);
+				for( size_t i = 0; i < sptrsv.supersteps; ++i ) {
+					const size_t lo = static_cast< size_t >( *data++ );
+					const size_t no = static_cast< size_t >( *data++ );
+					assert( lo < n );
+					assert( no + lo <= n );
+					ret = ret ? ret : dense_unmasked_sequential_sptrsv< descr, true >(
+						v_raw, T, lo, no, forward, semiring, subtraction, division, phase );
+					#pragma omp barrier
+				}
+			}
+
+			// done
+			return ret;
+		}
+#endif
+
 		/**
 		 * \internal Implements sparse masked, sparse unmasked, and dense masked
 		 *           sptrsv.
@@ -2412,9 +2495,13 @@ namespace grb {
 		// check dense dispatch
 		if( dense || grb::nnz( xb ) == n ) {
 			IOType * const xb_p = internal::getRaw( xb );
-			return internal::dense_unmasked_sptrsv< descr >(
-				xb_p, T, n, forward, semiring,
-				subtraction, division, phase );
+#ifndef _H_GRB_REFERENCE_OMP_BLAS2
+			return internal::dense_unmasked_sequential_sptrsv< descr, false >(
+				xb_p, T, 0, n, forward, semiring, subtraction, division, phase );
+#else
+			return internal::dense_unmasked_omp_sptrsv< descr >(
+				xb_p, T, n, forward, semiring, subtraction, division, phase );
+#endif
 		} else {
 			grb::Vector< IOType, reference, Coords > no_mask( 0 );
 			return internal::generic_sptrsv< descr, false, true >( xb, no_mask, T, n,
diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp
index c1b3a365d..dfad4d6f7 100644
--- a/include/graphblas/reference/matrix.hpp
+++ b/include/graphblas/reference/matrix.hpp
@@ -49,6 +49,7 @@
 #include <graphblas/utils/iterators/utils.hpp>
 
 #include <graphblas/reference/init.hpp>
+#include <graphblas/reference/SptrsvSchedule.hpp>
 #include <graphblas/reference/compressed_storage.hpp>
 
 #include "NonzeroWrapper.hpp"
@@ -237,6 +238,18 @@ namespace grb {
 			return A.valbuf[ 1 ];
 		}
 
+		/**
+		 * \internal
+		 * Retrieves an optimised sptrsv-specific schedule.
+		 * \endinternal
+		 */
+		template< typename InputType, typename RIT, typename CIT, typename NIT >
+		const internal::SptrsvSchedule< NIT > * getSptrsvData(
+			const grb::Matrix< InputType, reference, RIT, CIT, NIT > &A
+		) noexcept {
+			return A.sptrsvSchedule;
+		}
+
 		template<
 			Descriptor descr,
 			bool input_dense, bool output_dense,
@@ -1225,6 +1238,11 @@ namespace grb {
 			const grb::Matrix< InputType, reference, RIT, CIT, NIT > &A
 		) noexcept;
 
+		template< typename InputType, typename RIT, typename CIT, typename NIT >
+		friend const internal::SptrsvSchedule< NIT > * internal::getSptrsvData(
+			const grb::Matrix< InputType, reference, RIT, CIT, NIT > &A
+		) noexcept;
+
 		friend const grb::Matrix<
 			D, reference,
 			ColIndexType, ColIndexType, NonzeroIndexType
@@ -1351,13 +1369,19 @@ namespace grb {
 			 */
 			utils::AutoDeleter< char > _local_deleter[ 6 ];
 
+			/**
+			 * Optimised schedule for sptrsv operations on this matrix.
+			 */
+			internal::SptrsvSchedule< NonzeroIndexType > * sptrsvSchedule;
+
 			/**
 			 * Internal constructor for manual construction of matrices.
 			 *
 			 * Should be followed by a manual call to #initialize.
 			 */
 			Matrix() : id( std::numeric_limits< uintptr_t >::max() ),
-				remove_id( false ), m( 0 ), n( 0 ), cap( 0 ), nz( 0 )
+				remove_id( false ), m( 0 ), n( 0 ), cap( 0 ), nz( 0 ),
+				sptrsvSchedule( nullptr )
 			{}
 
 			/**
@@ -1403,7 +1427,7 @@ namespace grb {
 				id( std::numeric_limits< uintptr_t >::max() ), remove_id( false ),
 				m( _m ), n( _n ), cap( _cap ), nz( _offset_array[ _m ] ),
 				coorArr{ nullptr, buf1 }, coorBuf{ nullptr, buf2 },
-				valbuf{ nullptr, buf3 }
+				valbuf{ nullptr, buf3 }, sptrsvSchedule( nullptr )
 			{
 				assert( (_m > 0 && _n > 0) || _column_indices[ 0 ] == 0 );
 				CRS.replace( _values, _column_indices );
@@ -1596,6 +1620,8 @@ namespace grb {
 					_deleter[ i ] = std::move( other._deleter[ i ] );
 					_local_deleter[ i ] = std::move( other._local_deleter[ i ] );
 				}
+				if( sptrsvSchedule ) { delete sptrsvSchedule; }
+				sptrsvSchedule = other.sptrsvSchedule;
 
 				// invalidate other fields
 				for( unsigned int i = 0; i < 2; ++i ) {
@@ -1608,6 +1634,7 @@ namespace grb {
 				other.n = 0;
 				other.cap = 0;
 				other.nz = 0;
+				other.sptrsvSchedule = nullptr;
 			}
 
 			/**
@@ -2228,6 +2255,9 @@ namespace grb {
 #endif
 					internal::reference_mapper.remove( id );
 				}
+				if( sptrsvSchedule ) {
+					delete sptrsvSchedule;
+				}
 			}
 
 			/**

From 20ea13fdae8ebc659ab73238466c8186890c43da Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Fri, 4 Apr 2025 21:00:43 +0200
Subject: [PATCH 12/55] Delete from move constructor should not occur

---
 include/graphblas/reference/matrix.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp
index dfad4d6f7..fc02e001b 100644
--- a/include/graphblas/reference/matrix.hpp
+++ b/include/graphblas/reference/matrix.hpp
@@ -1448,6 +1448,7 @@ namespace grb {
 				const size_t rows, const size_t cols,
 				const size_t cap_in
 			) {
+				sptrsvSchedule = nullptr;
 #ifdef _DEBUG_REFERENCE_MATRIX
 				std::cerr << "\t in Matrix< reference >::initialize...\n"
 					<< "\t\t matrix size " << rows << " by " << cols << "\n"
@@ -1620,7 +1621,6 @@ namespace grb {
 					_deleter[ i ] = std::move( other._deleter[ i ] );
 					_local_deleter[ i ] = std::move( other._local_deleter[ i ] );
 				}
-				if( sptrsvSchedule ) { delete sptrsvSchedule; }
 				sptrsvSchedule = other.sptrsvSchedule;
 
 				// invalidate other fields

From 45b9f46e4730df1042d7b9bf52a7e9a84e57be8a Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Fri, 4 Apr 2025 21:04:30 +0200
Subject: [PATCH 13/55] If we set sptrsvSchedule to default in initialize(...),
 then we do not need to do so in the default constructor

---
 include/graphblas/reference/matrix.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp
index fc02e001b..244d7599d 100644
--- a/include/graphblas/reference/matrix.hpp
+++ b/include/graphblas/reference/matrix.hpp
@@ -1380,8 +1380,7 @@ namespace grb {
 			 * Should be followed by a manual call to #initialize.
 			 */
 			Matrix() : id( std::numeric_limits< uintptr_t >::max() ),
-				remove_id( false ), m( 0 ), n( 0 ), cap( 0 ), nz( 0 ),
-				sptrsvSchedule( nullptr )
+				remove_id( false ), m( 0 ), n( 0 ), cap( 0 ), nz( 0 )
 			{}
 
 			/**
@@ -1448,6 +1447,7 @@ namespace grb {
 				const size_t rows, const size_t cols,
 				const size_t cap_in
 			) {
+				// SpTrsvSchedule should be manually set always by a requested tuning phase
 				sptrsvSchedule = nullptr;
 #ifdef _DEBUG_REFERENCE_MATRIX
 				std::cerr << "\t in Matrix< reference >::initialize...\n"

From 2413a524ca5bae29a317b77b9846476c90774a8d Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Mon, 7 Apr 2025 10:29:39 +0200
Subject: [PATCH 14/55] Coding blind for a sec here, apologies -- will test it
 once I have my machine back

---
 .../graphblas/reference/SptrsvSchedule.hpp    | 32 ++++++++++++++++---
 1 file changed, 27 insertions(+), 5 deletions(-)

diff --git a/include/graphblas/reference/SptrsvSchedule.hpp b/include/graphblas/reference/SptrsvSchedule.hpp
index 86479c7d4..ea7d184ab 100644
--- a/include/graphblas/reference/SptrsvSchedule.hpp
+++ b/include/graphblas/reference/SptrsvSchedule.hpp
@@ -44,19 +44,28 @@ namespace grb {
 				/** Computes and initialises a trivial schedule. */
 				void initTrivial( const NIT n ) {
 					assert( data[ 0 ] == nullptr );
+					data[ 0 ] = &(default_schedule[0]);
+					NIT * const interpreted = reinterpret_cast< NIT * >(data[0]);
+					interpreted[ 0 ] = 0;
+					interpreted[ 1 ] = n;
+				}
+
+				/** Allocates a thread-local chunk of data. */
+				void alloc( const size_t s ) {
+					assert( data[ s ] == nullptr );
 					const grb::RC rc = utils::alloc(
 						"grb::internal::SptrsvSchedule (default constructor)",
 						"default thread-local data allocation",
-						data[ 0 ], 2 * sizeof(NIT), false, _deleters[ 0 ]
+						data[ s ], supersteps * sizeof(NIT), false, _deleters[ s ]
 					);
 					if( rc != grb::SUCCESS ) {
 						throw std::bad_alloc();
 					}
-					NIT * const interpreted = reinterpret_cast< NIT * >(data[0]);
-					interpreted[ 0 ] = 0;
-					interpreted[ 1 ] = n;
 				}
 
+				/** Fixed-size buffer for realising the default schedule. */
+				NIT default_schedule[2];
+
 
 			public:
 
@@ -99,7 +108,7 @@ namespace grb {
 				 * a single-superstep schedule that assigns all work to the first thread.
 				 */
 				SptrsvSchedule( const NIT n, const size_t T ) :
-					_deleters( T ), supersteps( 0 ), nThreads( T ), data( T )
+					_deleters( T ), supersteps( 1 ), nThreads( T ), data( T )
 				{
 					data[ 0 ] = nullptr;
 					if( T == 0 || T > std::numeric_limits< int >::max() ) {
@@ -108,6 +117,19 @@ namespace grb {
 					initTrivial( n );
 				}
 
+				/**
+				 * Base destructor.
+				 *
+				 * Dynamic memory is freed through the #_deleters.
+				 */
+				~SptrsvSchedule() {
+					// sanity checks only
+					assert( nThreads != 0 );
+					if( nThreads == 1 ) {
+						assert( supersteps == 1 );
+					}
+				}
+
 		};
 
 	} // end namespace `grb::internal'

From 32905e538cd5bdc3444933851c4600ffc4e08d2c Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Mon, 7 Apr 2025 12:54:25 +0200
Subject: [PATCH 15/55] Fix compile error and some more defensive programming

---
 include/graphblas/reference/SptrsvSchedule.hpp | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/include/graphblas/reference/SptrsvSchedule.hpp b/include/graphblas/reference/SptrsvSchedule.hpp
index ea7d184ab..06dc431dc 100644
--- a/include/graphblas/reference/SptrsvSchedule.hpp
+++ b/include/graphblas/reference/SptrsvSchedule.hpp
@@ -44,14 +44,22 @@ namespace grb {
 				/** Computes and initialises a trivial schedule. */
 				void initTrivial( const NIT n ) {
 					assert( data[ 0 ] == nullptr );
-					data[ 0 ] = &(default_schedule[0]);
+					data[ 0 ] = reinterpret_cast< char * >( &(default_schedule[0]) );
 					NIT * const interpreted = reinterpret_cast< NIT * >(data[0]);
 					interpreted[ 0 ] = 0;
 					interpreted[ 1 ] = n;
 				}
 
-				/** Allocates a thread-local chunk of data. */
+				/**
+				 * Allocates a thread-local chunk of data.
+				 *
+				 * Must be called from within the thread that will use it(!)
+				 */
 				void alloc( const size_t s ) {
+					assert( s < nThreads );
+					assert( supersteps > 0 );
+					assert( _deleters.size() >= s );
+					assert( data.size() >= s );
 					assert( data[ s ] == nullptr );
 					const grb::RC rc = utils::alloc(
 						"grb::internal::SptrsvSchedule (default constructor)",

From 98206d0a52260d4f8593bf847513073e56e6eb2b Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Mon, 7 Apr 2025 18:26:28 +0200
Subject: [PATCH 16/55] Add internal functions to set an Sptrsv schedule. Next
 up: test the performance of the sptrsv, in particular over multiple NUMA
 domains

---
 .../graphblas/reference/SptrsvSchedule.hpp    | 46 +++++----
 include/graphblas/reference/matrix.hpp        | 97 +++++++++++++++++++
 2 files changed, 119 insertions(+), 24 deletions(-)

diff --git a/include/graphblas/reference/SptrsvSchedule.hpp b/include/graphblas/reference/SptrsvSchedule.hpp
index 06dc431dc..2e02b95f9 100644
--- a/include/graphblas/reference/SptrsvSchedule.hpp
+++ b/include/graphblas/reference/SptrsvSchedule.hpp
@@ -50,27 +50,6 @@ namespace grb {
 					interpreted[ 1 ] = n;
 				}
 
-				/**
-				 * Allocates a thread-local chunk of data.
-				 *
-				 * Must be called from within the thread that will use it(!)
-				 */
-				void alloc( const size_t s ) {
-					assert( s < nThreads );
-					assert( supersteps > 0 );
-					assert( _deleters.size() >= s );
-					assert( data.size() >= s );
-					assert( data[ s ] == nullptr );
-					const grb::RC rc = utils::alloc(
-						"grb::internal::SptrsvSchedule (default constructor)",
-						"default thread-local data allocation",
-						data[ s ], supersteps * sizeof(NIT), false, _deleters[ s ]
-					);
-					if( rc != grb::SUCCESS ) {
-						throw std::bad_alloc();
-					}
-				}
-
 				/** Fixed-size buffer for realising the default schedule. */
 				NIT default_schedule[2];
 
@@ -82,10 +61,8 @@ namespace grb {
 
 				/**
 				 * The number of threads the schedule is designed for.
-				 *
-				 * \note Uses int as per OpenMP spec.
 				 */
-				int nThreads;
+				size_t nThreads;
 
 				/** One data pointer per thread. */
 				std::vector< char * > data;
@@ -138,6 +115,27 @@ namespace grb {
 					}
 				}
 
+				/**
+				 * Allocates a thread-local chunk of data.
+				 *
+				 * Must be called from within the thread that will use it(!)
+				 */
+				void alloc( const size_t s ) {
+					assert( s < nThreads );
+					assert( supersteps > 0 );
+					assert( _deleters.size() >= s );
+					assert( data.size() >= s );
+					assert( data[ s ] == nullptr );
+					const grb::RC rc = utils::alloc(
+						"grb::internal::SptrsvSchedule (default constructor)",
+						"default thread-local data allocation",
+						data[ s ], supersteps * sizeof(NIT), false, _deleters[ s ]
+					);
+					if( rc != grb::SUCCESS ) {
+						throw std::bad_alloc();
+					}
+				}
+
 		};
 
 	} // end namespace `grb::internal'
diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp
index 244d7599d..461cd9975 100644
--- a/include/graphblas/reference/matrix.hpp
+++ b/include/graphblas/reference/matrix.hpp
@@ -250,6 +250,85 @@ namespace grb {
 			return A.sptrsvSchedule;
 		}
 
+		template<
+			typename InputType, typename RIT, typename CIT, typename NIT
+		>
+		void allocateSptrsvSchedule(
+			grb::Matrix< InputType, reference, RIT, CIT, NIT > &A,
+			const size_t nSteps, const size_t nThreads
+		) {
+			if( A.sptrsvSchedule == nullptr ) {
+				throw std::runtime_error( "SptrsvSchedule was not initialised" );
+			}
+			auto &sptrsv = *(A.sptrsvSchedule);
+			if( sptrsv.nThreads != nThreads ) {
+				throw std::runtime_error( "SptrsvSchedule was allocated with different "
+					"nThreads" );
+			}
+			sptrsv.supersteps = nSteps;
+
+			#pragma omp parallel
+			{
+				const size_t actualNumThreads = omp_get_num_threads();
+				if( actualNumThreads != nThreads ) {
+					throw std::runtime_error( "Unexpected number of threads" );
+				}
+				const size_t s = omp_get_thread_num();
+				if( sptrsv.data[ s ] ) {
+					throw std::runtime_error( "A thread-local schedule already existed" );
+				}
+				sptrsv.alloc( s );
+			}
+		}
+
+		template<
+			typename InputType, typename RIT, typename CIT, typename NIT,
+			typename LoIt, typename HiIt
+		>
+		void setSptrsvSchedule(
+			grb::Matrix< InputType, reference, RIT, CIT, NIT > &A,
+			LoIt lo, const LoIt &lo_end, HiIt hi, const HiIt &hi_end,
+			const size_t nThreads
+		) {
+			if( A.sptrsvSchedule == nullptr ) {
+				throw std::runtime_error( "SptrsvSchedule was not initialised" );
+			}
+			auto &sptrsv = *(A.sptrsvSchedule);
+			if( sptrsv.nThreads != nThreads ) {
+				throw std::runtime_error( "SptrsvSchedule was allocated with different "
+					"nThreads" );
+			}
+
+			#pragma omp parallel
+			{
+				const size_t actualNumThreads = omp_get_num_threads();
+				if( actualNumThreads != nThreads ) {
+					throw std::runtime_error( "Unexpected number of threads" );
+				}
+				const size_t s = omp_get_thread_num();
+				NIT *__restrict__ const array = reinterpret_cast< NIT * >(sptrsv.data[ s ]);
+				assert( lo != lo_end );
+				assert( hi != hi_end );
+				size_t count = 0;
+				do {
+					const NIT l = *lo++;
+					const NIT h = *hi++;
+					assert( h >= l );
+					const NIT n = h - l;
+					if( count >= sptrsv.supersteps ) {
+						throw std::runtime_error( "Too many supersteps" );
+					}
+					*array++ = l;
+					*array++ = n;
+					(void) count++;
+					if( lo == lo_end ) { assert( hi == hi_end ); }
+				} while( lo != lo_end && hi != hi_end );
+				if( count != sptrsv.supersteps ) {
+					throw std::runtime_error( "Unexpected number of supersteps" );
+				}
+			}
+		}
+
 		template<
 			Descriptor descr,
 			bool input_dense, bool output_dense,
@@ -1243,6 +1322,24 @@ namespace grb {
 			const grb::Matrix< InputType, reference, RIT, CIT, NIT > &A
 		) noexcept;
 
+		template<
+			typename InputType, typename RIT, typename CIT, typename NIT
+		>
+		friend void internal::allocateSptrsvSchedule(
+			grb::Matrix< InputType, reference, RIT, CIT, NIT > &A,
+			const size_t nSteps, const size_t nThreads
+		);
+
+		template<
+			typename InputType, typename RIT, typename CIT, typename NIT,
+			typename LoIt, typename HiIt
+		>
+		friend void internal::setSptrsvSchedule(
+			grb::Matrix< InputType, reference, RIT, CIT, NIT > &A,
+			LoIt lo, const LoIt &lo_end, HiIt hi, const HiIt &hi_end,
+			const size_t nThreads
+		);
+
 		friend const grb::Matrix<
 			D, reference,
 			ColIndexType, ColIndexType, NonzeroIndexType

From 42773ca5a265bab0a5e182f8dc13c813e17db861 Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Tue, 8 Apr 2025 16:48:33 +0200
Subject: [PATCH 17/55] OSP integration error: input schedule has slightly
 different structure

---
 include/graphblas/reference/matrix.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp
index 461cd9975..72e077eff 100644
--- a/include/graphblas/reference/matrix.hpp
+++ b/include/graphblas/reference/matrix.hpp
@@ -311,8 +311,8 @@ namespace grb {
 				assert( hi != hi_end );
 				size_t count = 0;
 				do {
-					const NIT l = *lo++;
-					const NIT h = *hi++;
+					const NIT l = (*lo++)[s];
+					const NIT h = (*hi++)[s];
 					assert( h >= l );
 					const NIT n = h - l;
 					if( count >= sptrsv.supersteps ) {

From b680b6e5550d770f9bb4e069ffb1e65b0b3ba964 Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Tue, 8 Apr 2025 16:59:21 +0200
Subject: [PATCH 18/55] I still did not get it, but this should now be correct

---
 include/graphblas/reference/matrix.hpp | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp
index 72e077eff..e3898d3d0 100644
--- a/include/graphblas/reference/matrix.hpp
+++ b/include/graphblas/reference/matrix.hpp
@@ -282,12 +282,11 @@ namespace grb {
 		}
 
 		template<
-			typename InputType, typename RIT, typename CIT, typename NIT,
-			typename LoIt, typename HiIt
+			typename InputType, typename RIT, typename CIT, typename NIT, typename It
 		>
 		void setSptrsvSchedule(
 			grb::Matrix< InputType, reference, RIT, CIT, NIT > &A,
-			LoIt lo, const LoIt &lo_end, HiIt hi, const HiIt &hi_end,
+			It bounds, const It &bounds_end,
 			const size_t nThreads
 		) {
 			if( A.sptrsvSchedule == nullptr ) {
@@ -307,12 +306,11 @@ namespace grb {
 				}
 				const size_t s = omp_get_thread_num();
 				NIT *__restrict__ const array = reinterpret_cast< NIT * >(sptrsv.data[ s ]);
-				assert( lo != lo_end );
-				assert( hi != hi_end );
+				assert( bounds != bounds_end );
 				size_t count = 0;
 				do {
-					const NIT l = (*lo++)[s];
-					const NIT h = (*hi++)[s];
+					const NIT l = (*bounds)[s][0];
+					const NIT h = (*bounds)[s][1];
 					assert( h >= l );
 					const NIT n = h - l;
 					if( count >= sptrsv.supersteps ) {
@@ -321,8 +319,8 @@ namespace grb {
 					*array++ = l;
 					*array++ = n;
 					(void) count++;
-					if( lo == lo_end ) { assert( hi == hi_end ); }
-				} while( lo != lo_end && hi != hi_end );
+					(void) bounds++;
+				} while( bounds != bounds_end );
 				if( count != sptrsv.supersteps ) {
 					throw std::runtime_error( "Unexpected number of supersteps" );
 				}

From 6f6cdecaeb35b013b2b381681ccd4cfb05443ac0 Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Tue, 8 Apr 2025 17:05:09 +0200
Subject: [PATCH 19/55] Forgot to update friend signature

---
 include/graphblas/reference/matrix.hpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp
index e3898d3d0..73cad8525 100644
--- a/include/graphblas/reference/matrix.hpp
+++ b/include/graphblas/reference/matrix.hpp
@@ -1329,12 +1329,11 @@ namespace grb {
 		);
 
 		template<
-			typename InputType, typename RIT, typename CIT, typename NIT,
-			typename LoIt, typename HiIt
+			typename InputType, typename RIT, typename CIT, typename NIT, typename It
 		>
 		friend void internal::setSptrsvSchedule(
 			grb::Matrix< InputType, reference, RIT, CIT, NIT > &A,
-			LoIt lo, const LoIt &lo_end, HiIt hi, const HiIt &hi_end,
+			It bounds, const It &bounds_end,
 			const size_t nThreads
 		);
 

From ddf70bf707afa3c3da1bd4d9a2b61fb216e0d59b Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Tue, 8 Apr 2025 17:07:11 +0200
Subject: [PATCH 20/55] Bugfix

---
 include/graphblas/reference/matrix.hpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp
index 73cad8525..7b7c43098 100644
--- a/include/graphblas/reference/matrix.hpp
+++ b/include/graphblas/reference/matrix.hpp
@@ -305,7 +305,8 @@ namespace grb {
 					throw std::runtime_error( "Unexpected number of threads" );
 				}
 				const size_t s = omp_get_thread_num();
-				NIT *__restrict__ const array = reinterpret_cast< NIT * >(sptrsv.data[ s ]);
+				// get buffer as an array of NIT, which we will write to in one pass
+				NIT *__restrict__ array = reinterpret_cast< NIT * >(sptrsv.data[ s ]);
 				assert( bounds != bounds_end );
 				size_t count = 0;
 				do {

From 18e79268b4a4a31943890eb81fcd33c581992976 Mon Sep 17 00:00:00 2001
From: Christos Konstantinos Matzoros
 <christos.konstantinos.matzoros@h-partners.com>
Date: Wed, 9 Apr 2025 19:04:32 +0200
Subject: [PATCH 21/55] Bug Fixes

---
 .../graphblas/reference/SptrsvSchedule.hpp    | 25 ++++++++++++++++---
 include/graphblas/reference/blas2.hpp         |  3 ++-
 include/graphblas/reference/matrix.hpp        | 22 +++++++++-------
 3 files changed, 37 insertions(+), 13 deletions(-)

diff --git a/include/graphblas/reference/SptrsvSchedule.hpp b/include/graphblas/reference/SptrsvSchedule.hpp
index 2e02b95f9..e4d8426b9 100644
--- a/include/graphblas/reference/SptrsvSchedule.hpp
+++ b/include/graphblas/reference/SptrsvSchedule.hpp
@@ -53,6 +53,16 @@ namespace grb {
 				/** Fixed-size buffer for realising the default schedule. */
 				NIT default_schedule[2];
 
+				void moveImpl( SptrsvSchedule && toMove ) {
+					_deleters = std::move( toMove._deleters );
+					default_schedule = std::move( toMove.default_schedule );
+					supersteps = toMove.supersteps;
+					nThreads = toMove.nThreads;
+					data = std::move( toMove.data );
+					toMove.supersteps = 0;
+					toMove.nThreads = 0;
+				}
+
 
 			public:
 
@@ -67,6 +77,15 @@ namespace grb {
 				/** One data pointer per thread. */
 				std::vector< char * > data;
 
+				SptrsvSchedule( SptrsvSchedule && toMove ) {
+					moveImpl( toMove );
+				}
+
+				SptrsvSchedule& operator=( SptrsvSchedule &&toMove ) {
+					moveImpl( toMove );
+					return *this;
+				}
+
 				/**
 				 * Base constructor.
 				 *
@@ -93,9 +112,9 @@ namespace grb {
 				 * a single-superstep schedule that assigns all work to the first thread.
 				 */
 				SptrsvSchedule( const NIT n, const size_t T ) :
-					_deleters( T ), supersteps( 1 ), nThreads( T ), data( T )
+					_deleters( T ), supersteps( 1 ), nThreads( T ), data( T , nullptr )
 				{
-					data[ 0 ] = nullptr;
+					//data[ 0 ] = nullptr;
 					if( T == 0 || T > std::numeric_limits< int >::max() ) {
 						throw std::runtime_error( "Invalid number of threads" );
 					}
@@ -129,7 +148,7 @@ namespace grb {
 					const grb::RC rc = utils::alloc(
 						"grb::internal::SptrsvSchedule (default constructor)",
 						"default thread-local data allocation",
-						data[ s ], supersteps * sizeof(NIT), false, _deleters[ s ]
+						data[ s ], 2 * supersteps * sizeof(NIT), false, _deleters[ s ]
 					);
 					if( rc != grb::SUCCESS ) {
 						throw std::bad_alloc();
diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp
index e79c3a80c..7c7254598 100644
--- a/include/graphblas/reference/blas2.hpp
+++ b/include/graphblas/reference/blas2.hpp
@@ -2368,8 +2368,9 @@ namespace grb {
 			// get SpTrsv schedule
 			const SptrsvSchedule< NIT > *sptrsvSchedule_p =
 				internal::getSptrsvData( T );
+			const SptrsvSchedule< NIT > default_schedule (n);
 			const SptrsvSchedule< NIT > &sptrsv = sptrsvSchedule_p == nullptr ?
-				SptrsvSchedule< NIT >( n ) : *sptrsvSchedule_p;
+				default_schedule : *sptrsvSchedule_p;
 
 			// only execute and resize are supported
 			assert( phase == grb::EXECUTE );
diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp
index 7b7c43098..7be64e203 100644
--- a/include/graphblas/reference/matrix.hpp
+++ b/include/graphblas/reference/matrix.hpp
@@ -255,22 +255,25 @@ namespace grb {
 		>
 		void allocateSptrsvSchedule(
 			grb::Matrix< InputType, reference, RIT, CIT, NIT > &A,
-			const size_t nSteps, const size_t nThreads
+			const size_t nSteps, const NIT n, const size_t nThreads
 		) {
-			if( A.sptrsvSchedule == nullptr ) {
-				throw std::runtime_error( "SptrsvSchedule was not initialised" );
+			if( A.sptrsvSchedule != nullptr ) {
+				throw std::runtime_error( "SptrsvSchedule was already initialised" );
 			}
+			A.sptrsvSchedule = new SptrsvSchedule<NIT>(n, nThreads);
+
 			auto &sptrsv = *(A.sptrsvSchedule);
-			if( sptrsv.nThreads != nThreads ) {
-				throw std::runtime_error( "SptrsvSchedule was allocated with different "
-					"nThreads" );
-			}
+
 			sptrsv.supersteps = nSteps;
 
+			// disable default trivial schedule
+			sptrsv.data[ 0 ] = nullptr;
+
 			#pragma omp parallel
 			{
 				const size_t actualNumThreads = omp_get_num_threads();
-				if( actualNumThreads != nThreads ) {
+
+				if( actualNumThreads < nThreads ) {
 					throw std::runtime_error( "Unexpected number of threads" );
 				}
 				const size_t s = omp_get_thread_num();
@@ -306,6 +309,7 @@ namespace grb {
 				}
 				const size_t s = omp_get_thread_num();
 				// get buffer as an array of NIT, which we will write to in one pass
+				assert( sptrsv.data[ s ] != nullptr );
 				NIT *__restrict__ array = reinterpret_cast< NIT * >(sptrsv.data[ s ]);
 				assert( bounds != bounds_end );
 				size_t count = 0;
@@ -1326,7 +1330,7 @@ namespace grb {
 		>
 		friend void internal::allocateSptrsvSchedule(
 			grb::Matrix< InputType, reference, RIT, CIT, NIT > &A,
-			const size_t nSteps, const size_t nThreads
+			const size_t nSteps,  const NIT n, const size_t nThreads
 		);
 
 		template<

From 6617cada9e39509dbe97bb60611c6d2e08beb3db Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Thu, 10 Apr 2025 12:23:07 +0200
Subject: [PATCH 22/55] There can be multiple ranges per superstep per thread
 that OSP returns -- create space to store how many ranges per superstep per
 thread there are

---
 .../graphblas/reference/SptrsvSchedule.hpp    | 69 ++++++++++++++-----
 include/graphblas/reference/matrix.hpp        | 16 +++--
 2 files changed, 64 insertions(+), 21 deletions(-)

diff --git a/include/graphblas/reference/SptrsvSchedule.hpp b/include/graphblas/reference/SptrsvSchedule.hpp
index e4d8426b9..322cc601e 100644
--- a/include/graphblas/reference/SptrsvSchedule.hpp
+++ b/include/graphblas/reference/SptrsvSchedule.hpp
@@ -41,24 +41,50 @@ namespace grb {
 				/** One deleter for each chunk of thread-local data. */
 				std::vector< utils::AutoDeleter< char > > _deleters;
 
+				/**
+				 * Fixed-size buffer for realising the default schedule.
+				 *
+				 * The first two positions are reserved for the range of the default
+				 * schedule, while the last position is reserved for the end-position of
+				 * the default schedule.
+				 */
+				NIT default_schedule[ 3 ];
+
 				/** Computes and initialises a trivial schedule. */
 				void initTrivial( const NIT n ) {
+					// dynamic sanity check
 					assert( data[ 0 ] == nullptr );
+					// set trivial schedule
+					default_schedule[ 0 ] = 0;
+					default_schedule[ 1 ] = n;
+					default_schedule[ 2 ] = 1;
+					// set pointers to trivial schedule
 					data[ 0 ] = reinterpret_cast< char * >( &(default_schedule[0]) );
-					NIT * const interpreted = reinterpret_cast< NIT * >(data[0]);
-					interpreted[ 0 ] = 0;
-					interpreted[ 1 ] = n;
+					endPostions[ 0 ] = reinterpret_cast< char * >( &(default_schedule[2]) );
+					// verify trivial schedule
+#ifndef NDEBUG
+					{
+						NIT * const interpreted = reinterpret_cast< NIT * >(data[0]);
+						assert( interpreted[ 0 ] == 0 );
+						assert( interpreted[ 1 ] == n );
+					}
+					{
+						NIT * const interpreted = reinterpret_cast< NIT * >(endPositions[0]);
+						assert( *interpreted == 1 );
+					}
+#endif
 				}
 
-				/** Fixed-size buffer for realising the default schedule. */
-				NIT default_schedule[2];
-
-				void moveImpl( SptrsvSchedule && toMove ) {
+				/**
+				 * Implementation of move construction and assignment.
+				 */
+				void moveImpl( SptrsvSchedule &&toMove ) {
 					_deleters = std::move( toMove._deleters );
 					default_schedule = std::move( toMove.default_schedule );
 					supersteps = toMove.supersteps;
 					nThreads = toMove.nThreads;
 					data = std::move( toMove.data );
+					endPositions = std::move( toMove.endPositions );
 					toMove.supersteps = 0;
 					toMove.nThreads = 0;
 				}
@@ -77,13 +103,12 @@ namespace grb {
 				/** One data pointer per thread. */
 				std::vector< char * > data;
 
-				SptrsvSchedule( SptrsvSchedule && toMove ) {
-					moveImpl( toMove );
-				}
+				/** One end-position array per thread. */
+				std::vector< char * > endPostions;
 
-				SptrsvSchedule& operator=( SptrsvSchedule &&toMove ) {
+				/** Move constructor. */
+				SptrsvSchedule( SptrsvSchedule &&toMove ) {
 					moveImpl( toMove );
-					return *this;
 				}
 
 				/**
@@ -96,9 +121,11 @@ namespace grb {
 				 * first (and only) thread.
 				 */
 				SptrsvSchedule( const NIT n ) :
-					_deleters( 1 ), supersteps( 1 ), nThreads( 1 ), data( 1 )
+					_deleters( 1 ), supersteps( 1 ), nThreads( 1 ),
+					data( 1 ), endPositions( 1 )
 				{
 					data[ 0 ] = nullptr;
+					endPositions[ 0 ] = nullptr;
 					initTrivial( n );
 				}
 
@@ -112,9 +139,9 @@ namespace grb {
 				 * a single-superstep schedule that assigns all work to the first thread.
 				 */
 				SptrsvSchedule( const NIT n, const size_t T ) :
-					_deleters( T ), supersteps( 1 ), nThreads( T ), data( T , nullptr )
+					_deleters( 2 * T ), supersteps( 1 ), nThreads( T ),
+					data( T, nullptr ), endPositions( T, nullptr )
 				{
-					//data[ 0 ] = nullptr;
 					if( T == 0 || T > std::numeric_limits< int >::max() ) {
 						throw std::runtime_error( "Invalid number of threads" );
 					}
@@ -134,21 +161,29 @@ namespace grb {
 					}
 				}
 
+				/** Move assignment. */
+				SptrsvSchedule& operator=( SptrsvSchedule &&toMove ) {
+					moveImpl( toMove );
+					return *this;
+				}
+
 				/**
 				 * Allocates a thread-local chunk of data.
 				 *
 				 * Must be called from within the thread that will use it(!)
 				 */
-				void alloc( const size_t s ) {
+				void alloc( const size_t s, const size_t nRanges ) {
 					assert( s < nThreads );
 					assert( supersteps > 0 );
 					assert( _deleters.size() >= s );
 					assert( data.size() >= s );
 					assert( data[ s ] == nullptr );
+					assert( endPositions[ s ] == nullptr );
 					const grb::RC rc = utils::alloc(
 						"grb::internal::SptrsvSchedule (default constructor)",
 						"default thread-local data allocation",
-						data[ s ], 2 * supersteps * sizeof(NIT), false, _deleters[ s ]
+						data[ s ], 2 * nRanges * sizeof(NIT), false, _deleters[ s ],
+						endPositions[ s ], supersteps * sizeof( NIT ), false, _deleters[ 2 * s ]
 					);
 					if( rc != grb::SUCCESS ) {
 						throw std::bad_alloc();
diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp
index 7be64e203..606a19285 100644
--- a/include/graphblas/reference/matrix.hpp
+++ b/include/graphblas/reference/matrix.hpp
@@ -251,11 +251,13 @@ namespace grb {
 		}
 
 		template<
-			typename InputType, typename RIT, typename CIT, typename NIT
+			typename InputType, typename RIT, typename CIT, typename NIT,
+			typename NumRangesIt
 		>
 		void allocateSptrsvSchedule(
 			grb::Matrix< InputType, reference, RIT, CIT, NIT > &A,
-			const size_t nSteps, const NIT n, const size_t nThreads
+			const NIT n, const size_t nThreads, const size_t nSteps,
+			const NumRangesIt nRanges, const NumRangesIt nRanges_end
 		) {
 			if( A.sptrsvSchedule != nullptr ) {
 				throw std::runtime_error( "SptrsvSchedule was already initialised" );
@@ -268,19 +270,25 @@ namespace grb {
 
 			// disable default trivial schedule
 			sptrsv.data[ 0 ] = nullptr;
+			sptrsv.endPositions[ 0 ] = nullptr;
 
 			#pragma omp parallel
 			{
+				auto localIt = nRanges;
+				std::advance( localIt, s );
 				const size_t actualNumThreads = omp_get_num_threads();
 
 				if( actualNumThreads < nThreads ) {
 					throw std::runtime_error( "Unexpected number of threads" );
 				}
+				if( *localIt <= 0 ) {
+					throw std::runtime_error( "Unexpected number of ranges" );
+				}
 				const size_t s = omp_get_thread_num();
-				if( sptrsv.data[ s ] ) {
+				if( sptrsv.data[ s ] || sptrsv.endPositions[ s ] ) {
 					throw std::runtime_error( "A thread-local schedule already existed" );
 				}
-				sptrsv.alloc( s );
+				sptrsv.alloc( s, *localIt );
 			}
 		}
 

From fba4d082269263d2d79848653dcab916ad0f5e9f Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Thu, 10 Apr 2025 12:31:26 +0200
Subject: [PATCH 23/55] Bugfixes in previous commit, apologies

---
 include/graphblas/reference/SptrsvSchedule.hpp | 4 ++--
 include/graphblas/reference/matrix.hpp         | 5 ++---
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/include/graphblas/reference/SptrsvSchedule.hpp b/include/graphblas/reference/SptrsvSchedule.hpp
index 322cc601e..fc9fe8b7b 100644
--- a/include/graphblas/reference/SptrsvSchedule.hpp
+++ b/include/graphblas/reference/SptrsvSchedule.hpp
@@ -60,7 +60,7 @@ namespace grb {
 					default_schedule[ 2 ] = 1;
 					// set pointers to trivial schedule
 					data[ 0 ] = reinterpret_cast< char * >( &(default_schedule[0]) );
-					endPostions[ 0 ] = reinterpret_cast< char * >( &(default_schedule[2]) );
+					endPositions[ 0 ] = reinterpret_cast< char * >( &(default_schedule[2]) );
 					// verify trivial schedule
 #ifndef NDEBUG
 					{
@@ -104,7 +104,7 @@ namespace grb {
 				std::vector< char * > data;
 
 				/** One end-position array per thread. */
-				std::vector< char * > endPostions;
+				std::vector< char * > endPositions;
 
 				/** Move constructor. */
 				SptrsvSchedule( SptrsvSchedule &&toMove ) {
diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp
index 606a19285..72db3fb5e 100644
--- a/include/graphblas/reference/matrix.hpp
+++ b/include/graphblas/reference/matrix.hpp
@@ -275,16 +275,15 @@ namespace grb {
 			#pragma omp parallel
 			{
 				auto localIt = nRanges;
-				std::advance( localIt, s );
+				const size_t s = omp_get_thread_num();
 				const size_t actualNumThreads = omp_get_num_threads();
-
 				if( actualNumThreads < nThreads ) {
 					throw std::runtime_error( "Unexpected number of threads" );
 				}
+				std::advance( localIt, s );
 				if( *localIt <= 0 ) {
 					throw std::runtime_error( "Unexpected number of ranges" );
 				}
-				const size_t s = omp_get_thread_num();
 				if( sptrsv.data[ s ] || sptrsv.endPositions[ s ] ) {
 					throw std::runtime_error( "A thread-local schedule already existed" );
 				}

From 2dfa32919f5f5dc4c9bc9620200f26dd73bfc85f Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Thu, 10 Apr 2025 12:36:43 +0200
Subject: [PATCH 24/55] Update sptrsv implementation to make use of new
 schedule field. Tests OK

---
 include/graphblas/reference/blas2.hpp | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp
index 7c7254598..473c0c8f3 100644
--- a/include/graphblas/reference/blas2.hpp
+++ b/include/graphblas/reference/blas2.hpp
@@ -2382,13 +2382,17 @@ namespace grb {
 				const int s = omp_get_thread_num();
 				const NIT *__restrict__ data =
 					reinterpret_cast< const NIT * >(sptrsv.data[ s ]);
+				const NIT *__restrict__ const end =
+					reinterpret_cast< const NIT * >(sptrsv.endPositions[ s ]);
 				for( size_t i = 0; i < sptrsv.supersteps; ++i ) {
-					const size_t lo = static_cast< size_t >( *data++ );
-					const size_t no = static_cast< size_t >( *data++ );
-					assert( lo < n );
-					assert( no + lo <= n );
-					ret = ret ? ret : dense_unmasked_sequential_sptrsv< descr, true >(
-						v_raw, T, lo, no, forward, semiring, subtraction, division, phase );
+					for( size_t k = 0; k < end[ i ]; ++k ) {
+						const size_t lo = static_cast< size_t >( *data++ );
+						const size_t no = static_cast< size_t >( *data++ );
+						assert( lo < n );
+						assert( no + lo <= n );
+						ret = ret ? ret : dense_unmasked_sequential_sptrsv< descr, true >(
+							v_raw, T, lo, no, forward, semiring, subtraction, division, phase );
+					}
 					#pragma omp barrier
 				}
 			}

From bc6939e1a61d9a4178fc320a7eaeba0f25f7e2dd Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Thu, 10 Apr 2025 13:33:24 +0200
Subject: [PATCH 25/55] Update schedule ingestion accordingly

---
 include/graphblas/reference/matrix.hpp | 54 +++++++++++++++++++++-----
 1 file changed, 44 insertions(+), 10 deletions(-)

diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp
index 72db3fb5e..e126d4f2a 100644
--- a/include/graphblas/reference/matrix.hpp
+++ b/include/graphblas/reference/matrix.hpp
@@ -291,6 +291,19 @@ namespace grb {
 			}
 		}
 
+		/**
+		 * This schedule ingestion method expects the schedule as an iterator-pair
+		 * over a container of doubly-nested vectors. Each such vector v iterated
+		 * over, corresponds to a superstep. For a given superstep's v, v[s][k] has
+		 * that s indicates the thread ID and k indicates the floor( k / 2 )-th range
+		 * to be processed in that superstep and at that thread. Even-numbered k
+		 * indicate lower bounds of the range, while odd k indicate their upper
+		 * bounds.
+		 *
+		 * \note This is not an ideal way to pass around schedules, as it assumes the
+		 *       container element types are doubly-nested STL vectors-- it is hence
+		 *       not a generic approach.
+		 */
 		template<
 			typename InputType, typename RIT, typename CIT, typename NIT, typename It
 		>
@@ -318,20 +331,41 @@ namespace grb {
 				// get buffer as an array of NIT, which we will write to in one pass
 				assert( sptrsv.data[ s ] != nullptr );
 				NIT *__restrict__ array = reinterpret_cast< NIT * >(sptrsv.data[ s ]);
+				// get the number of ranges for a given superstep at this thread that this
+				// function should populate
+				NIT *__restrict__ end = reinterpret_cast< NIT * >(sptrsv.endPositions[ s ]);
+				// dynamic assertion: there should be at least one superstep
 				assert( bounds != bounds_end );
+				// start ingestion
 				size_t count = 0;
 				do {
-					const NIT l = (*bounds)[s][0];
-					const NIT h = (*bounds)[s][1];
-					assert( h >= l );
-					const NIT n = h - l;
-					if( count >= sptrsv.supersteps ) {
-						throw std::runtime_error( "Too many supersteps" );
+					assert( bounds->size() == nThreads );
+					// get this superstep's and this thread's vector v[s]
+					const auto &v = (*bounds)[s];
+					// go range-by-range
+					size_t nRanges = 0;
+					for( auto it = v.cbegin(); it != v.cend(); ++it ) {
+						// we have a range, parse it
+						const NIT l = *it++;
+						assert( it != v.cend() );
+						const NIT h = *it + 1;
+						assert( h >= l );
+						const NIT n = h - l;
+						if( count >= sptrsv.supersteps ) {
+							throw std::runtime_error( "Too many supersteps" );
+						}
+						// store it
+						*array++ = l;
+						*array++ = n;
+						// TODO we do not / cannot double-check nRanges with the data we have
+						//      stored at the moment. This may potentially by fixed (FIXME).
+						(void) ++nRanges;
 					}
-					*array++ = l;
-					*array++ = n;
-					(void) count++;
-					(void) bounds++;
+					// store the number of ranges in the end array
+					*end++ = nRanges;
+					// forward to the next superstep
+					(void) ++count;
+					(void) ++bounds;
 				} while( bounds != bounds_end );
 				if( count != sptrsv.supersteps ) {
 					throw std::runtime_error( "Unexpected number of supersteps" );

From 708959cb9aed4691be7f449e8a0d544c539023cc Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Thu, 10 Apr 2025 13:55:34 +0200
Subject: [PATCH 26/55] Forgot to update friend declaration

---
 include/graphblas/reference/matrix.hpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp
index e126d4f2a..024904b60 100644
--- a/include/graphblas/reference/matrix.hpp
+++ b/include/graphblas/reference/matrix.hpp
@@ -1367,11 +1367,13 @@ namespace grb {
 		) noexcept;
 
 		template<
-			typename InputType, typename RIT, typename CIT, typename NIT
+			typename InputType, typename RIT, typename CIT, typename NIT,
+			typename NumRangesIt
 		>
 		friend void internal::allocateSptrsvSchedule(
 			grb::Matrix< InputType, reference, RIT, CIT, NIT > &A,
-			const size_t nSteps,  const NIT n, const size_t nThreads
+			const NIT n, const size_t nThreads, const size_t nSteps,
+			const NumRangesIt nRanges, const NumRangesIt nRanges_end
 		);
 
 		template<

From bfe58a87ecac2b442a7e13245f804893a060f990 Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Thu, 10 Apr 2025 14:27:33 +0200
Subject: [PATCH 27/55] Should allow for empty ranges

---
 .../graphblas/reference/SptrsvSchedule.hpp    | 22 ++++++++++++++-----
 include/graphblas/reference/blas2.hpp         |  1 +
 include/graphblas/reference/matrix.hpp        |  3 ---
 3 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/include/graphblas/reference/SptrsvSchedule.hpp b/include/graphblas/reference/SptrsvSchedule.hpp
index fc9fe8b7b..6d776ddde 100644
--- a/include/graphblas/reference/SptrsvSchedule.hpp
+++ b/include/graphblas/reference/SptrsvSchedule.hpp
@@ -179,12 +179,22 @@ namespace grb {
 					assert( data.size() >= s );
 					assert( data[ s ] == nullptr );
 					assert( endPositions[ s ] == nullptr );
-					const grb::RC rc = utils::alloc(
-						"grb::internal::SptrsvSchedule (default constructor)",
-						"default thread-local data allocation",
-						data[ s ], 2 * nRanges * sizeof(NIT), false, _deleters[ s ],
-						endPositions[ s ], supersteps * sizeof( NIT ), false, _deleters[ 2 * s ]
-					);
+					grb::RC rc = grb::SUCCESS;
+					if( nRanges > 0 ) {
+						rc = utils::alloc(
+							"grb::internal::SptrsvSchedule (default constructor)",
+							"default thread-local data allocation, variant I",
+							data[ s ], 2 * nRanges * sizeof(NIT), false, _deleters[ s ],
+							endPositions[ s ], supersteps * sizeof( NIT ), false, _deleters[ 2 * s ]
+						);
+					} else {
+						data[ s ] = nullptr;
+						rc = utils::alloc(
+							"grb::internal::SptrsvSchedule (default constructor)",
+							"default thread-local data allocation, variant II",
+							endPositions[ s ], supersteps * sizeof( NIT ), false, _deleters[ 2 * s ]
+						);
+					}
 					if( rc != grb::SUCCESS ) {
 						throw std::bad_alloc();
 					}
diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp
index 473c0c8f3..f6115995d 100644
--- a/include/graphblas/reference/blas2.hpp
+++ b/include/graphblas/reference/blas2.hpp
@@ -2384,6 +2384,7 @@ namespace grb {
 					reinterpret_cast< const NIT * >(sptrsv.data[ s ]);
 				const NIT *__restrict__ const end =
 					reinterpret_cast< const NIT * >(sptrsv.endPositions[ s ]);
+				assert( end != nullptr );
 				for( size_t i = 0; i < sptrsv.supersteps; ++i ) {
 					for( size_t k = 0; k < end[ i ]; ++k ) {
 						const size_t lo = static_cast< size_t >( *data++ );
diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp
index 024904b60..787850993 100644
--- a/include/graphblas/reference/matrix.hpp
+++ b/include/graphblas/reference/matrix.hpp
@@ -281,9 +281,6 @@ namespace grb {
 					throw std::runtime_error( "Unexpected number of threads" );
 				}
 				std::advance( localIt, s );
-				if( *localIt <= 0 ) {
-					throw std::runtime_error( "Unexpected number of ranges" );
-				}
 				if( sptrsv.data[ s ] || sptrsv.endPositions[ s ] ) {
 					throw std::runtime_error( "A thread-local schedule already existed" );
 				}

From 25684b9b597620c6b964620bba019ed433dfa769 Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Thu, 10 Apr 2025 14:35:17 +0200
Subject: [PATCH 28/55] I am not enthused about the exceptions that the
 schedule format appears to have to account for-- it may apparently also occur
 that the number of inner dimensions is not always constant

---
 include/graphblas/reference/matrix.hpp | 40 ++++++++++++++------------
 1 file changed, 21 insertions(+), 19 deletions(-)

diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp
index 787850993..e3c7ac4be 100644
--- a/include/graphblas/reference/matrix.hpp
+++ b/include/graphblas/reference/matrix.hpp
@@ -336,27 +336,29 @@ namespace grb {
 				// start ingestion
 				size_t count = 0;
 				do {
-					assert( bounds->size() == nThreads );
-					// get this superstep's and this thread's vector v[s]
-					const auto &v = (*bounds)[s];
-					// go range-by-range
+					assert( bounds->size() <= nThreads );
 					size_t nRanges = 0;
-					for( auto it = v.cbegin(); it != v.cend(); ++it ) {
-						// we have a range, parse it
-						const NIT l = *it++;
-						assert( it != v.cend() );
-						const NIT h = *it + 1;
-						assert( h >= l );
-						const NIT n = h - l;
-						if( count >= sptrsv.supersteps ) {
-							throw std::runtime_error( "Too many supersteps" );
+					if( bounds->size() >= s ) {
+						// get this superstep's and this thread's vector v[s]
+						const auto &v = (*bounds)[s];
+						// go range-by-range
+						for( auto it = v.cbegin(); it != v.cend(); ++it ) {
+							// we have a range, parse it
+							const NIT l = *it++;
+							assert( it != v.cend() );
+							const NIT h = *it + 1;
+							assert( h >= l );
+							const NIT n = h - l;
+							if( count >= sptrsv.supersteps ) {
+								throw std::runtime_error( "Too many supersteps" );
+							}
+							// store it
+							*array++ = l;
+							*array++ = n;
+							// TODO we do not / cannot double-check nRanges with the data we have
+							//      stored at the moment. This may potentially by fixed (FIXME).
+							(void) ++nRanges;
 						}
-						// store it
-						*array++ = l;
-						*array++ = n;
-						// TODO we do not / cannot double-check nRanges with the data we have
-						//      stored at the moment. This may potentially by fixed (FIXME).
-						(void) ++nRanges;
 					}
 					// store the number of ranges in the end array
 					*end++ = nRanges;

From 834e375ca39c8b8ba19609a2293951581a882b3d Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Thu, 10 Apr 2025 14:42:23 +0200
Subject: [PATCH 29/55] There can be processes without any range

---
 include/graphblas/reference/matrix.hpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp
index e3c7ac4be..6fbe2bb11 100644
--- a/include/graphblas/reference/matrix.hpp
+++ b/include/graphblas/reference/matrix.hpp
@@ -326,7 +326,6 @@ namespace grb {
 				}
 				const size_t s = omp_get_thread_num();
 				// get buffer as an array of NIT, which we will write to in one pass
-				assert( sptrsv.data[ s ] != nullptr );
 				NIT *__restrict__ array = reinterpret_cast< NIT * >(sptrsv.data[ s ]);
 				// get the number of ranges for a given superstep at this thread that this
 				// function should populate

From 471f8477a468983cc3303177d72832785761fab7 Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Thu, 10 Apr 2025 14:59:45 +0200
Subject: [PATCH 30/55] Partial undo of one or two commits ago -- it should not
 happen that there are empty inner vectors in the schedule -- something else
 is off

---
 include/graphblas/reference/matrix.hpp | 40 ++++++++++++--------------
 1 file changed, 19 insertions(+), 21 deletions(-)

diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp
index 6fbe2bb11..36a008fd0 100644
--- a/include/graphblas/reference/matrix.hpp
+++ b/include/graphblas/reference/matrix.hpp
@@ -335,29 +335,27 @@ namespace grb {
 				// start ingestion
 				size_t count = 0;
 				do {
-					assert( bounds->size() <= nThreads );
+					assert( bounds->size() == nThreads );
 					size_t nRanges = 0;
-					if( bounds->size() >= s ) {
-						// get this superstep's and this thread's vector v[s]
-						const auto &v = (*bounds)[s];
-						// go range-by-range
-						for( auto it = v.cbegin(); it != v.cend(); ++it ) {
-							// we have a range, parse it
-							const NIT l = *it++;
-							assert( it != v.cend() );
-							const NIT h = *it + 1;
-							assert( h >= l );
-							const NIT n = h - l;
-							if( count >= sptrsv.supersteps ) {
-								throw std::runtime_error( "Too many supersteps" );
-							}
-							// store it
-							*array++ = l;
-							*array++ = n;
-							// TODO we do not / cannot double-check nRanges with the data we have
-							//      stored at the moment. This may potentially by fixed (FIXME).
-							(void) ++nRanges;
+					// get this superstep's and this thread's vector v[s]
+					const auto &v = (*bounds)[s];
+					// go range-by-range
+					for( auto it = v.cbegin(); it != v.cend(); ++it ) {
+						// we have a range, parse it
+						const NIT l = *it++;
+						assert( it != v.cend() );
+						const NIT h = *it + 1;
+						assert( h >= l );
+						const NIT n = h - l;
+						if( count >= sptrsv.supersteps ) {
+							throw std::runtime_error( "Too many supersteps" );
 						}
+						// store it
+						*array++ = l;
+						*array++ = n;
+						// TODO we do not / cannot double-check nRanges with the data we have
+						//      stored at the moment. This may potentially by fixed (FIXME).
+						(void) ++nRanges;
 					}
 					// store the number of ranges in the end array
 					*end++ = nRanges;

From 0cb629541e96180565c1712f5f7e60e17673fafd Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Thu, 10 Apr 2025 15:03:21 +0200
Subject: [PATCH 31/55] Found the likely culprit!

---
 include/graphblas/reference/matrix.hpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp
index 36a008fd0..ebe7de820 100644
--- a/include/graphblas/reference/matrix.hpp
+++ b/include/graphblas/reference/matrix.hpp
@@ -306,7 +306,7 @@ namespace grb {
 		>
 		void setSptrsvSchedule(
 			grb::Matrix< InputType, reference, RIT, CIT, NIT > &A,
-			It bounds, const It &bounds_end,
+			const It &bounds_begin, const It &bounds_end,
 			const size_t nThreads
 		) {
 			if( A.sptrsvSchedule == nullptr ) {
@@ -320,6 +320,7 @@ namespace grb {
 
 			#pragma omp parallel
 			{
+				It bounds = bounds_begin;
 				const size_t actualNumThreads = omp_get_num_threads();
 				if( actualNumThreads != nThreads ) {
 					throw std::runtime_error( "Unexpected number of threads" );

From 64c64eee5739b01cdfa4069e4644ab72cdc51961 Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Thu, 10 Apr 2025 15:04:57 +0200
Subject: [PATCH 32/55] Also update friend declaration

---
 include/graphblas/reference/matrix.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp
index ebe7de820..06bd4eb6f 100644
--- a/include/graphblas/reference/matrix.hpp
+++ b/include/graphblas/reference/matrix.hpp
@@ -1378,7 +1378,7 @@ namespace grb {
 		>
 		friend void internal::setSptrsvSchedule(
 			grb::Matrix< InputType, reference, RIT, CIT, NIT > &A,
-			It bounds, const It &bounds_end,
+			const It &bounds, const It &bounds_end,
 			const size_t nThreads
 		);
 

From dd17fba5d19d5591bc8b03e9f0814defb879114c Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Thu, 10 Apr 2025 18:38:21 +0200
Subject: [PATCH 33/55] Additional hardening of schedule ingestion logic

---
 .../graphblas/reference/SptrsvSchedule.hpp    | 19 ++++++++++++++-----
 include/graphblas/reference/matrix.hpp        |  8 ++++++--
 2 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/include/graphblas/reference/SptrsvSchedule.hpp b/include/graphblas/reference/SptrsvSchedule.hpp
index 6d776ddde..543961b98 100644
--- a/include/graphblas/reference/SptrsvSchedule.hpp
+++ b/include/graphblas/reference/SptrsvSchedule.hpp
@@ -58,6 +58,7 @@ namespace grb {
 					default_schedule[ 0 ] = 0;
 					default_schedule[ 1 ] = n;
 					default_schedule[ 2 ] = 1;
+					nRanges[ 0 ] = 1;
 					// set pointers to trivial schedule
 					data[ 0 ] = reinterpret_cast< char * >( &(default_schedule[0]) );
 					endPositions[ 0 ] = reinterpret_cast< char * >( &(default_schedule[2]) );
@@ -85,6 +86,7 @@ namespace grb {
 					nThreads = toMove.nThreads;
 					data = std::move( toMove.data );
 					endPositions = std::move( toMove.endPositions );
+					nRanges = std::move( toMove.nRanges );
 					toMove.supersteps = 0;
 					toMove.nThreads = 0;
 				}
@@ -106,6 +108,9 @@ namespace grb {
 				/** One end-position array per thread. */
 				std::vector< char * > endPositions;
 
+				/** The total number of ranges per thread. */
+				std::vector< NIT > nRanges;
+
 				/** Move constructor. */
 				SptrsvSchedule( SptrsvSchedule &&toMove ) {
 					moveImpl( toMove );
@@ -122,7 +127,7 @@ namespace grb {
 				 */
 				SptrsvSchedule( const NIT n ) :
 					_deleters( 1 ), supersteps( 1 ), nThreads( 1 ),
-					data( 1 ), endPositions( 1 )
+					data( 1 ), endPositions( 1 ), nRanges( 1 )
 				{
 					data[ 0 ] = nullptr;
 					endPositions[ 0 ] = nullptr;
@@ -140,7 +145,7 @@ namespace grb {
 				 */
 				SptrsvSchedule( const NIT n, const size_t T ) :
 					_deleters( 2 * T ), supersteps( 1 ), nThreads( T ),
-					data( T, nullptr ), endPositions( T, nullptr )
+					data( T, nullptr ), endPositions( T, nullptr ), nRanges( T, 0 )
 				{
 					if( T == 0 || T > std::numeric_limits< int >::max() ) {
 						throw std::runtime_error( "Invalid number of threads" );
@@ -172,19 +177,23 @@ namespace grb {
 				 *
 				 * Must be called from within the thread that will use it(!)
 				 */
-				void alloc( const size_t s, const size_t nRanges ) {
+				void alloc( const size_t s, const size_t nRanges_in ) {
 					assert( s < nThreads );
 					assert( supersteps > 0 );
 					assert( _deleters.size() >= s );
 					assert( data.size() >= s );
 					assert( data[ s ] == nullptr );
 					assert( endPositions[ s ] == nullptr );
+					if( nRanges[ s ] != nRanges_in ) {
+						throw std::runtime_error( "Given nRanges differs from the one given at "
+							"allocation time" );
+					}
 					grb::RC rc = grb::SUCCESS;
-					if( nRanges > 0 ) {
+					if( nRanges_in > 0 ) {
 						rc = utils::alloc(
 							"grb::internal::SptrsvSchedule (default constructor)",
 							"default thread-local data allocation, variant I",
-							data[ s ], 2 * nRanges * sizeof(NIT), false, _deleters[ s ],
+							data[ s ], 2 * nRanges_in * sizeof(NIT), false, _deleters[ s ],
 							endPositions[ s ], supersteps * sizeof( NIT ), false, _deleters[ 2 * s ]
 						);
 					} else {
diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp
index 06bd4eb6f..4a6958a04 100644
--- a/include/graphblas/reference/matrix.hpp
+++ b/include/graphblas/reference/matrix.hpp
@@ -267,6 +267,10 @@ namespace grb {
 			auto &sptrsv = *(A.sptrsvSchedule);
 
 			sptrsv.supersteps = nSteps;
+			{
+				std::vector< NIT > tmp( nRanges, nRanges_end );
+				sptrsv.nRanges = std::move( tmp );
+			}
 
 			// disable default trivial schedule
 			sptrsv.data[ 0 ] = nullptr;
@@ -354,11 +358,11 @@ namespace grb {
 						// store it
 						*array++ = l;
 						*array++ = n;
-						// TODO we do not / cannot double-check nRanges with the data we have
-						//      stored at the moment. This may potentially by fixed (FIXME).
 						(void) ++nRanges;
+						assert( nRanges < sptrsv.nRanges[ s ] );
 					}
 					// store the number of ranges in the end array
+					assert( count < sptrsv.supersteps );
 					*end++ = nRanges;
 					// forward to the next superstep
 					(void) ++count;

From a5b545488dbdb0c99c5a32303d2143593ec33b44 Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Thu, 10 Apr 2025 18:41:38 +0200
Subject: [PATCH 34/55] Metabugfix

---
 include/graphblas/reference/matrix.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp
index 4a6958a04..282f4c504 100644
--- a/include/graphblas/reference/matrix.hpp
+++ b/include/graphblas/reference/matrix.hpp
@@ -359,7 +359,7 @@ namespace grb {
 						*array++ = l;
 						*array++ = n;
 						(void) ++nRanges;
-						assert( nRanges < sptrsv.nRanges[ s ] );
+						assert( nRanges <= sptrsv.nRanges[ s ] );
 					}
 					// store the number of ranges in the end array
 					assert( count < sptrsv.supersteps );

From f9bf41d7d5ee880c43fd6ef76ff2d00d893bb18e Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Thu, 10 Apr 2025 18:46:38 +0200
Subject: [PATCH 35/55] Silly bug causing difficult-to-track memory errors,
 apologies

---
 include/graphblas/reference/SptrsvSchedule.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/graphblas/reference/SptrsvSchedule.hpp b/include/graphblas/reference/SptrsvSchedule.hpp
index 543961b98..743844e32 100644
--- a/include/graphblas/reference/SptrsvSchedule.hpp
+++ b/include/graphblas/reference/SptrsvSchedule.hpp
@@ -194,14 +194,14 @@ namespace grb {
 							"grb::internal::SptrsvSchedule (default constructor)",
 							"default thread-local data allocation, variant I",
 							data[ s ], 2 * nRanges_in * sizeof(NIT), false, _deleters[ s ],
-							endPositions[ s ], supersteps * sizeof( NIT ), false, _deleters[ 2 * s ]
+							endPositions[ s ], supersteps * sizeof( NIT ), false, _deleters[ s + T ]
 						);
 					} else {
 						data[ s ] = nullptr;
 						rc = utils::alloc(
 							"grb::internal::SptrsvSchedule (default constructor)",
 							"default thread-local data allocation, variant II",
-							endPositions[ s ], supersteps * sizeof( NIT ), false, _deleters[ 2 * s ]
+							endPositions[ s ], supersteps * sizeof( NIT ), false, _deleters[ s + T ]
 						);
 					}
 					if( rc != grb::SUCCESS ) {

From 573c80d8f0a614bd636f84ff6c11aa41ad4f9073 Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Thu, 10 Apr 2025 18:48:13 +0200
Subject: [PATCH 36/55] Fix typo

---
 include/graphblas/reference/SptrsvSchedule.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/graphblas/reference/SptrsvSchedule.hpp b/include/graphblas/reference/SptrsvSchedule.hpp
index 743844e32..214f5ff60 100644
--- a/include/graphblas/reference/SptrsvSchedule.hpp
+++ b/include/graphblas/reference/SptrsvSchedule.hpp
@@ -194,14 +194,14 @@ namespace grb {
 							"grb::internal::SptrsvSchedule (default constructor)",
 							"default thread-local data allocation, variant I",
 							data[ s ], 2 * nRanges_in * sizeof(NIT), false, _deleters[ s ],
-							endPositions[ s ], supersteps * sizeof( NIT ), false, _deleters[ s + T ]
+							endPositions[ s ], supersteps * sizeof( NIT ), false, _deleters[ s + nThreads ]
 						);
 					} else {
 						data[ s ] = nullptr;
 						rc = utils::alloc(
 							"grb::internal::SptrsvSchedule (default constructor)",
 							"default thread-local data allocation, variant II",
-							endPositions[ s ], supersteps * sizeof( NIT ), false, _deleters[ s + T ]
+							endPositions[ s ], supersteps * sizeof( NIT ), false, _deleters[ s + nThreads ]
 						);
 					}
 					if( rc != grb::SUCCESS ) {

From 4f42d4eb0d5ca9bb4b12639d8950b4f85c4fdeeb Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Thu, 10 Apr 2025 18:52:36 +0200
Subject: [PATCH 37/55] Move assertion to before writing to the location the
 assert checks

---
 include/graphblas/reference/matrix.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp
index 282f4c504..f768b2a1a 100644
--- a/include/graphblas/reference/matrix.hpp
+++ b/include/graphblas/reference/matrix.hpp
@@ -356,10 +356,10 @@ namespace grb {
 							throw std::runtime_error( "Too many supersteps" );
 						}
 						// store it
+						assert( nRanges < sptrsv.nRanges[ s ] );
 						*array++ = l;
 						*array++ = n;
 						(void) ++nRanges;
-						assert( nRanges <= sptrsv.nRanges[ s ] );
 					}
 					// store the number of ranges in the end array
 					assert( count < sptrsv.supersteps );

From fc968dc651897c43a7261b3f5d3efc7ee1e60402 Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Fri, 11 Apr 2025 11:14:59 +0200
Subject: [PATCH 38/55] Fix meta-bug

---
 include/graphblas/reference/blas2.hpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp
index f6115995d..943c15bbb 100644
--- a/include/graphblas/reference/blas2.hpp
+++ b/include/graphblas/reference/blas2.hpp
@@ -2358,10 +2358,6 @@ namespace grb {
 			const Division &division,
 			const Phase &phase
 		) {
-			// dynamic sanity checks
-			assert( grb::nrows( T ) == n );
-			assert( grb::ncols( T ) == n );
-
 			// in dense unmasked, resize is a no-op
 			if( phase == grb::RESIZE ) { return grb::SUCCESS; }
 

From 3f5c4c1f8f444e91106ef3ab4cd089493425fc81 Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Fri, 11 Apr 2025 12:14:47 +0200
Subject: [PATCH 39/55] Add back support for simple schedules, as the other OSP
 sptrsv variants uses that

---
 .../graphblas/reference/SptrsvSchedule.hpp    | 11 ++++-
 include/graphblas/reference/blas2.hpp         | 42 ++++++++++++++-----
 include/graphblas/reference/matrix.hpp        |  9 ++++
 3 files changed, 49 insertions(+), 13 deletions(-)

diff --git a/include/graphblas/reference/SptrsvSchedule.hpp b/include/graphblas/reference/SptrsvSchedule.hpp
index 214f5ff60..8811f80cc 100644
--- a/include/graphblas/reference/SptrsvSchedule.hpp
+++ b/include/graphblas/reference/SptrsvSchedule.hpp
@@ -74,6 +74,8 @@ namespace grb {
 						assert( *interpreted == 1 );
 					}
 #endif
+					// note that this is a trivial schedule
+					is_simple = true;
 				}
 
 				/**
@@ -82,11 +84,13 @@ namespace grb {
 				void moveImpl( SptrsvSchedule &&toMove ) {
 					_deleters = std::move( toMove._deleters );
 					default_schedule = std::move( toMove.default_schedule );
+					is_simple = toMove.is_simple;
 					supersteps = toMove.supersteps;
 					nThreads = toMove.nThreads;
 					data = std::move( toMove.data );
 					endPositions = std::move( toMove.endPositions );
 					nRanges = std::move( toMove.nRanges );
+					toMove.is_simple = false;
 					toMove.supersteps = 0;
 					toMove.nThreads = 0;
 				}
@@ -94,6 +98,9 @@ namespace grb {
 
 			public:
 
+				/** Whether the schedule is simple.*/
+				bool is_simple;
+
 				/** Number of schedule steps. */
 				size_t supersteps;
 
@@ -126,7 +133,7 @@ namespace grb {
 				 * first (and only) thread.
 				 */
 				SptrsvSchedule( const NIT n ) :
-					_deleters( 1 ), supersteps( 1 ), nThreads( 1 ),
+					_deleters( 1 ), is_simple( false ), supersteps( 1 ), nThreads( 1 ),
 					data( 1 ), endPositions( 1 ), nRanges( 1 )
 				{
 					data[ 0 ] = nullptr;
@@ -144,7 +151,7 @@ namespace grb {
 				 * a single-superstep schedule that assigns all work to the first thread.
 				 */
 				SptrsvSchedule( const NIT n, const size_t T ) :
-					_deleters( 2 * T ), supersteps( 1 ), nThreads( T ),
+					_deleters( 2 * T ), is_simple( false ), supersteps( 1 ), nThreads( T ),
 					data( T, nullptr ), endPositions( T, nullptr ), nRanges( T, 0 )
 				{
 					if( T == 0 || T > std::numeric_limits< int >::max() ) {
diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp
index 943c15bbb..c431614a6 100644
--- a/include/graphblas/reference/blas2.hpp
+++ b/include/graphblas/reference/blas2.hpp
@@ -2373,24 +2373,44 @@ namespace grb {
 
 			// start parallel section
 			grb::RC ret = grb::SUCCESS;
-			#pragma omp parallel num_threads(sptrsv.nThreads)
-			{
-				const int s = omp_get_thread_num();
-				const NIT *__restrict__ data =
-					reinterpret_cast< const NIT * >(sptrsv.data[ s ]);
-				const NIT *__restrict__ const end =
-					reinterpret_cast< const NIT * >(sptrsv.endPositions[ s ]);
-				assert( end != nullptr );
-				for( size_t i = 0; i < sptrsv.supersteps; ++i ) {
-					for( size_t k = 0; k < end[ i ]; ++k ) {
+
+			if( sptrsv.is_simple ) {
+				#pragma omp parallel num_threads(sptrsv.nThreads)
+				{
+					const int s = omp_get_thread_num();
+					const NIT *__restrict__ data =
+						reinterpret_cast< const NIT * >(sptrsv.data[ s ]);
+					assert( data != nullptr );
+					for( size_t i = 0; i < sptrsv.supersteps; ++i ) {
 						const size_t lo = static_cast< size_t >( *data++ );
 						const size_t no = static_cast< size_t >( *data++ );
 						assert( lo < n );
 						assert( no + lo <= n );
 						ret = ret ? ret : dense_unmasked_sequential_sptrsv< descr, true >(
 							v_raw, T, lo, no, forward, semiring, subtraction, division, phase );
+						#pragma omp barrier
+					}
+				}
+			} else {
+				#pragma omp parallel num_threads(sptrsv.nThreads)
+				{
+					const int s = omp_get_thread_num();
+					const NIT *__restrict__ data =
+						reinterpret_cast< const NIT * >(sptrsv.data[ s ]);
+					const NIT *__restrict__ const end =
+						reinterpret_cast< const NIT * >(sptrsv.endPositions[ s ]);
+					assert( end != nullptr );
+					for( size_t i = 0; i < sptrsv.supersteps; ++i ) {
+						for( size_t k = 0; k < end[ i ]; ++k ) {
+							const size_t lo = static_cast< size_t >( *data++ );
+							const size_t no = static_cast< size_t >( *data++ );
+							assert( lo < n );
+							assert( no + lo <= n );
+							ret = ret ? ret : dense_unmasked_sequential_sptrsv< descr, true >(
+								v_raw, T, lo, no, forward, semiring, subtraction, division, phase );
+						}
+						#pragma omp barrier
 					}
-					#pragma omp barrier
 				}
 			}
 
diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp
index f768b2a1a..00084ca75 100644
--- a/include/graphblas/reference/matrix.hpp
+++ b/include/graphblas/reference/matrix.hpp
@@ -322,8 +322,10 @@ namespace grb {
 					"nThreads" );
 			}
 
+			sptrsv.is_simple = true;
 			#pragma omp parallel
 			{
+				bool simple_local = true;
 				It bounds = bounds_begin;
 				const size_t actualNumThreads = omp_get_num_threads();
 				if( actualNumThreads != nThreads ) {
@@ -363,6 +365,9 @@ namespace grb {
 					}
 					// store the number of ranges in the end array
 					assert( count < sptrsv.supersteps );
+					if( nRanges != 1 ) {
+						simple_local = false;
+					}
 					*end++ = nRanges;
 					// forward to the next superstep
 					(void) ++count;
@@ -371,6 +376,10 @@ namespace grb {
 				if( count != sptrsv.supersteps ) {
 					throw std::runtime_error( "Unexpected number of supersteps" );
 				}
+				if( !simple_local ) {
+					#pragma omp critical
+					sptrsv.is_simple = false;
+				}
 			}
 		}
 

From cd2c9674cd3589638f0f6dcfeb50569e50ea9674 Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Fri, 11 Apr 2025 16:25:47 +0200
Subject: [PATCH 40/55] I rehashed a classic performance bug there, apologies
 -- now fixed!

---
 include/graphblas/reference/blas2.hpp | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp
index c431614a6..b6af9b23f 100644
--- a/include/graphblas/reference/blas2.hpp
+++ b/include/graphblas/reference/blas2.hpp
@@ -2377,6 +2377,7 @@ namespace grb {
 			if( sptrsv.is_simple ) {
 				#pragma omp parallel num_threads(sptrsv.nThreads)
 				{
+					grb::RC local_rc = grb::SUCCESS;
 					const int s = omp_get_thread_num();
 					const NIT *__restrict__ data =
 						reinterpret_cast< const NIT * >(sptrsv.data[ s ]);
@@ -2386,14 +2387,21 @@ namespace grb {
 						const size_t no = static_cast< size_t >( *data++ );
 						assert( lo < n );
 						assert( no + lo <= n );
-						ret = ret ? ret : dense_unmasked_sequential_sptrsv< descr, true >(
-							v_raw, T, lo, no, forward, semiring, subtraction, division, phase );
+						local_rc = local_rc
+							? local_rc
+							: dense_unmasked_sequential_sptrsv< descr, true >(
+								v_raw, T, lo, no, forward, semiring, subtraction, division, phase );
 						#pragma omp barrier
 					}
+					if( local_rc != grb::SUCCESS ) {
+						#pragma omp atomic write
+						ret = local_rc;
+					}
 				}
 			} else {
 				#pragma omp parallel num_threads(sptrsv.nThreads)
 				{
+					grb::RC local_rc = grb::SUCCESS;
 					const int s = omp_get_thread_num();
 					const NIT *__restrict__ data =
 						reinterpret_cast< const NIT * >(sptrsv.data[ s ]);
@@ -2406,11 +2414,17 @@ namespace grb {
 							const size_t no = static_cast< size_t >( *data++ );
 							assert( lo < n );
 							assert( no + lo <= n );
-							ret = ret ? ret : dense_unmasked_sequential_sptrsv< descr, true >(
-								v_raw, T, lo, no, forward, semiring, subtraction, division, phase );
+							local_rc = local_rc
+								? local_rc
+								: dense_unmasked_sequential_sptrsv< descr, true >(
+									v_raw, T, lo, no, forward, semiring, subtraction, division, phase );
 						}
 						#pragma omp barrier
 					}
+					if( local_rc != grb::SUCCESS ) {
+						#pragma omp atomic write
+						ret = local_rc;
+					}
 				}
 			}
 

From 3e0eda86a12ab8edb9f83cdd5960f9f9668818ce Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Fri, 11 Apr 2025 17:21:02 +0200
Subject: [PATCH 41/55] Another tricky performance bug-- and a silly mistake
 once more...

---
 include/graphblas/reference/blas2.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp
index b6af9b23f..eb0f4bf7f 100644
--- a/include/graphblas/reference/blas2.hpp
+++ b/include/graphblas/reference/blas2.hpp
@@ -2286,7 +2286,7 @@ namespace grb {
 			// switch forward or backward solve
 			if( forward ) {
 				const auto &crs = internal::getCRS( T );
-				if( maybe_offset ) {
+				if( !maybe_offset ) {
 					for( size_t i = 0; i < n; ++i ) {
 						IOType divBy = semiring.template getZero< IOType >();
 						assert( crs.col_start[ i ] <= crs.col_start[ i + 1 ] );

From ad392992e1b381163975e05fd363213e38abe9f9 Mon Sep 17 00:00:00 2001
From: tonibohnlein <113011201+tonibohnlein@users.noreply.github.com>
Date: Mon, 14 Apr 2025 11:05:08 +0200
Subject: [PATCH 42/55] fixed bug in blas2.hpp

---
 include/graphblas/reference/blas2.hpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp
index eb0f4bf7f..9935e9151 100644
--- a/include/graphblas/reference/blas2.hpp
+++ b/include/graphblas/reference/blas2.hpp
@@ -2298,7 +2298,8 @@ namespace grb {
 						(void) grb::foldl( v_raw[ i ], divBy, division );
 					}
 				} else {
-					for( size_t i = offset; i < n; ++i ) {
+					for( size_t j = 0; j < n; ++j ) {
+						const size_t i = j + offset;
 						IOType divBy = semiring.template getZero< IOType >();
 						assert( crs.col_start[ i ] <= crs.col_start[ i + 1 ] );
 						sptrsv_kernel( crs, i, v_raw, divBy, semiring, subtraction );

From ba98f0bf0633faac73cc13045a134d5eece17819 Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Mon, 14 Apr 2025 13:00:57 +0200
Subject: [PATCH 43/55] Missing optimisation introduced

---
 include/graphblas/reference/blas2.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp
index 9935e9151..d2c72fe2a 100644
--- a/include/graphblas/reference/blas2.hpp
+++ b/include/graphblas/reference/blas2.hpp
@@ -2286,7 +2286,7 @@ namespace grb {
 			// switch forward or backward solve
 			if( forward ) {
 				const auto &crs = internal::getCRS( T );
-				if( !maybe_offset ) {
+				if( !maybe_offset || offset == 0 ) {
 					for( size_t i = 0; i < n; ++i ) {
 						IOType divBy = semiring.template getZero< IOType >();
 						assert( crs.col_start[ i ] <= crs.col_start[ i + 1 ] );

From 220b6f86b56a9c81d4b2d02e87caba8a550bf2ab Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Mon, 14 Apr 2025 14:36:19 +0200
Subject: [PATCH 44/55] Changes towards supporting sptrsv on sorted matrices

---
 include/graphblas/reference/blas2.hpp | 115 +++++++++++++++++++++-----
 1 file changed, 96 insertions(+), 19 deletions(-)

diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp
index d2c72fe2a..a5781d0aa 100644
--- a/include/graphblas/reference/blas2.hpp
+++ b/include/graphblas/reference/blas2.hpp
@@ -2217,8 +2217,17 @@ namespace grb {
 	namespace internal {
 
 #ifndef _H_GRB_REFERENCE_OMP_BLAS2
-		/** Backend-independent sptrsv kernel code. */
+		/**
+		 * Backend-independent sptrsv kernel code.
+		 *
+		 * @tparam sorted 0: the matrix rows are not sorted,
+		 *                1: the matrix rows have diagonal elements at position 0,
+		 *                2: the matrix rows are sorted.
+		 *
+		 * \note The above description assumes CRS.
+		 */
 		template<
+			int sorted,
 			typename IOType, typename InputType1,
 			typename IND, typename NIT,
 			class Semiring, class Subtraction
@@ -2227,22 +2236,40 @@ namespace grb {
 			const Compressed_Storage< InputType1, IND, NIT > &crs,
 			const size_t i,
 			IOType * const x,
+			const bool forward,
 			IOType &divBy,
 			const Semiring &semiring,
 			const Subtraction &subtraction
 		) {
+			// static assertions
+			static_assert( sorted >= 0 && sorted < 3, "Illegal value for sorted; this "
+				"is an internal error, please submit a bug report" );
 			constexpr auto one =
 				Semiring::template One< typename Semiring::D1 >::value();
+			if( sorted ) {
+				(void) divBy;
+			}
+			assert( crs.col_start[ i + 1 ] > crs.col_start[ i ] );
+			const size_t start = (!sorted) ? crs.col_start[ i ] : (
+				(sorted == 1) ? (crs.col_start[ i ] + 1) : (
+					forward ? crs.col_start[ i ] : (crs.col_start[ i ] + 1) )
+				);
+			const size_t end = (!sorted) ? crs.col_start[ i + 1 ] : (
+				(sorted == 1) ? crs.col_start[ i + 1 ] : (
+					forward ? (crs.col_start[ i + 1 ] - 1) : crs.col_start[ i + 1 ] )
+				);
 			// TODO could do some ALP-style vectorisation here
-			for( size_t k = crs.col_start[ i ]; k < crs.col_start[ i + 1 ]; ++k ) {
+			for( size_t k = start; k < end; ++k ) {
 				const typename Semiring::D1 val = crs.template getValue( k, one );
 				const auto &ind = crs.row_index[ k ];
-				if( static_cast< size_t >(ind) == i ) {
+				if( !sorted ) {
+					if( static_cast< size_t >(ind) == i ) {
  #ifdef _DEBUG
-					std::cout << "\tskipping nonzero at " << i ", " << i << "\n";
+						std::cout << "\tskipping nonzero at " << i ", " << i << "\n";
  #endif
-					divBy = val;
-					continue;
+						divBy = val;
+						continue;
+					}
 				}
  #ifdef _DEBUG
 				std::cout << "\t x[ " << i << " ] (" << x[i] << ") -= " << val << "* x[ "
@@ -2254,11 +2281,27 @@ namespace grb {
 				(void) grb::foldl( x[ i ], tmp, subtraction );
 			}
 		}
-#endif
+
+		template<
+			typename IOType, int sorted,
+			typename VIT, typename RCIT, typename NIT
+		>
+		inline static IOType getDiagonalEntry(
+			const Compressed_Storage< VIT, RCIT, NIT > &storage, const size_t &i,
+			const bool &forward, const IOType &one
+		) {
+			assert( sorted >= 1 && sorted < 3 );
+			assert( storage.col_start[ i + 1 ] > storage.col_start[ i ] );
+			const size_t diag_index = sorted == 1 ? storage.col_start[ i ] : (
+				forward ? (storage.col_start[ i + 1 ] - 1) : storage.col_start[ i ] );
+			assert( storage.col_index[ diag_index ] == i );
+			return storage.template getValue( diag_index, one );
+		}
+#endif // end ifndef _H_GRB_REFERENCE_OMP_BLAS2
 
 		/** \internal Specialised dense unmasked sptrsv implementation, sequential */
 		template<
-			Descriptor descr, bool maybe_offset,
+			Descriptor descr, bool maybe_offset, int sorted,
 			class Semiring, class Subtraction, class Division,
 			typename IOType, typename InputType1,
 			typename RIT, typename CIT, typename NIT
@@ -2273,6 +2316,10 @@ namespace grb {
 			const Division &division,
 			const Phase &phase
 		) {
+			// static sanity checks
+			static_assert( sorted >= 0 && sorted < 3, "Invalid value for sorted; this "
+				"an internal error, please submit a bug report" );
+
 			// dynamic sanity checks
 			assert( grb::nrows( T ) == n );
 			assert( grb::ncols( T ) == n );
@@ -2283,14 +2330,26 @@ namespace grb {
 			// only execute and resize are supported
 			assert( phase == grb::EXECUTE );
 
+			// in case of pattern matrices, get 1 from the semiring
+			const IOType one = semiring.getMultiplicativeMonoid().template
+				getIdentity< IOType >();
+
 			// switch forward or backward solve
 			if( forward ) {
 				const auto &crs = internal::getCRS( T );
 				if( !maybe_offset || offset == 0 ) {
 					for( size_t i = 0; i < n; ++i ) {
-						IOType divBy = semiring.template getZero< IOType >();
+						IOType divBy;
+						if( !sorted ) {
+							// in this case we will auto-detect the diagonal item
+							divBy = semiring.template getZero< IOType >();
+						} else {
+							divBy = getDiagonalEntry< IOType, sorted, InputType1, RIT, NIT >(
+								crs, i, forward, one );
+						}
 						assert( crs.col_start[ i ] <= crs.col_start[ i + 1 ] );
-						sptrsv_kernel( crs, i, v_raw, divBy, semiring, subtraction );
+						sptrsv_kernel< sorted >( crs, i, v_raw, forward, divBy, semiring,
+							subtraction );
 #ifdef _DEBUG
 						std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy
 							<< "\n";
@@ -2300,9 +2359,15 @@ namespace grb {
 				} else {
 					for( size_t j = 0; j < n; ++j ) {
 						const size_t i = j + offset;
-						IOType divBy = semiring.template getZero< IOType >();
+						IOType divBy;
+						if( !sorted ) {
+							divBy = semiring.template getZero< IOType >();
+						} else {
+							divBy = getDiagonalEntry< IOType, sorted >( crs, i, forward, one );
+						}
 						assert( crs.col_start[ i ] <= crs.col_start[ i + 1 ] );
-						sptrsv_kernel( crs, i, v_raw, divBy, semiring, subtraction );
+						sptrsv_kernel< sorted >( crs, i, v_raw, forward, divBy, semiring,
+							subtraction );
 #ifdef _DEBUG
 						std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy
 							<< "\n";
@@ -2314,9 +2379,15 @@ namespace grb {
 				const auto &ccs = internal::getCCS( T );
 				if( !maybe_offset || offset == 0 ) {
 					for( size_t i = n - 1; i < n; --i ) {
-						IOType divBy = semiring.template getZero< IOType >();
+						IOType divBy;
+						if( !sorted ) {
+							divBy = semiring.template getZero< IOType >();
+						} else {
+							divBy = getDiagonalEntry< IOType, sorted >( ccs, i, forward, one );
+						}
 						assert( ccs.col_start[ i ] <= ccs.col_start[ i + 1 ] );
-						sptrsv_kernel( ccs, i, v_raw, divBy, semiring, subtraction );
+						sptrsv_kernel< sorted >( ccs, i, v_raw, forward, divBy, semiring,
+							subtraction );
 #ifdef _DEBUG
 						std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy
 							<< "\n";
@@ -2325,9 +2396,15 @@ namespace grb {
 					}
 				} else {
 					for( size_t i = n - 1 + offset; i >= offset; --i ) {
-						IOType divBy = semiring.template getZero< IOType >();
+						IOType divBy;
+						if( !sorted ) {
+							divBy = semiring.template getZero< IOType >();
+						} else {
+							divBy = getDiagonalEntry< IOType, sorted >( ccs, i, forward, one );
+						}
 						assert( ccs.col_start[ i ] <= ccs.col_start[ i + 1 ] );
-						sptrsv_kernel( ccs, i, v_raw, divBy, semiring, subtraction );
+						sptrsv_kernel< sorted >( ccs, i, v_raw, forward, divBy, semiring,
+							subtraction );
 #ifdef _DEBUG
 						std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy
 							<< "\n";
@@ -2390,7 +2467,7 @@ namespace grb {
 						assert( no + lo <= n );
 						local_rc = local_rc
 							? local_rc
-							: dense_unmasked_sequential_sptrsv< descr, true >(
+							: dense_unmasked_sequential_sptrsv< descr, true, 1 >(
 								v_raw, T, lo, no, forward, semiring, subtraction, division, phase );
 						#pragma omp barrier
 					}
@@ -2417,7 +2494,7 @@ namespace grb {
 							assert( no + lo <= n );
 							local_rc = local_rc
 								? local_rc
-								: dense_unmasked_sequential_sptrsv< descr, true >(
+								: dense_unmasked_sequential_sptrsv< descr, true, 1 >(
 									v_raw, T, lo, no, forward, semiring, subtraction, division, phase );
 						}
 						#pragma omp barrier
@@ -2533,7 +2610,7 @@ namespace grb {
 		if( dense || grb::nnz( xb ) == n ) {
 			IOType * const xb_p = internal::getRaw( xb );
 #ifndef _H_GRB_REFERENCE_OMP_BLAS2
-			return internal::dense_unmasked_sequential_sptrsv< descr, false >(
+			return internal::dense_unmasked_sequential_sptrsv< descr, false, 0 >(
 				xb_p, T, 0, n, forward, semiring, subtraction, division, phase );
 #else
 			return internal::dense_unmasked_omp_sptrsv< descr >(

From abc397bc3d33229a9079d9dad8de18dabc22ec13 Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Mon, 14 Apr 2025 14:43:07 +0200
Subject: [PATCH 45/55] Move index computation into for-loop

---
 include/graphblas/reference/blas2.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp
index 9935e9151..b76786384 100644
--- a/include/graphblas/reference/blas2.hpp
+++ b/include/graphblas/reference/blas2.hpp
@@ -2298,8 +2298,8 @@ namespace grb {
 						(void) grb::foldl( v_raw[ i ], divBy, division );
 					}
 				} else {
-					for( size_t j = 0; j < n; ++j ) {
-						const size_t i = j + offset;
+					const size_t end = n + offset;
+					for( size_t i = offset; i < end; ++i ) {
 						IOType divBy = semiring.template getZero< IOType >();
 						assert( crs.col_start[ i ] <= crs.col_start[ i + 1 ] );
 						sptrsv_kernel( crs, i, v_raw, divBy, semiring, subtraction );

From 4ec4720b29a78d91c697d30a410043948e417736 Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Mon, 14 Apr 2025 17:00:15 +0200
Subject: [PATCH 46/55] Tried and failed vectorisation

---
 include/graphblas/reference/blas2.hpp | 65 ++++++++++++++++++++++++++-
 1 file changed, 64 insertions(+), 1 deletion(-)

diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp
index b76786384..860fc2ffe 100644
--- a/include/graphblas/reference/blas2.hpp
+++ b/include/graphblas/reference/blas2.hpp
@@ -2233,7 +2233,70 @@ namespace grb {
 		) {
 			constexpr auto one =
 				Semiring::template One< typename Semiring::D1 >::value();
-			// TODO could do some ALP-style vectorisation here
+
+			// NOTE: explicit vectorisation seems to cause significant slowdowns.
+			//       disabled but retained for future reference / more successful
+			//       optimisation
+			/*
+			// constants
+			constexpr size_t ind_blocksize =
+				grb::config::SIMD_BLOCKSIZE< IND >::value();
+			constexpr size_t blocksize = ind_blocksize < Semiring::blocksize
+				? ind_blocksize : Semiring::blocksize;
+			// constant bufer
+			IND cur_ind[ blocksize ];
+			// SIMD set
+			for( size_t r = 0; r < blocksize; ++r ) {
+				cur_ind[ r ] = i;
+			}
+			for( ; k + Semiring::blocksize < crs.col_start[ i + 1 ];
+				k += Semiring::blocksize
+			) {
+				typename Semiring::D1 values[ blocksize ];
+				typename Semiring::D3 tmp[ blocksize ];
+				IOType x_simd[ blocksize ];
+				IND indices[ blocksize ];
+				bool match[ blocksize ];
+				// streaming loads
+				for( size_t r = 0; r < blocksize; ++r ) {
+					values[ r ] = crs.template getValue( k + r, one );
+				}
+				for( size_t r = 0; r < blocksize; ++r ) {
+					indices[ r ] = crs.row_index[ k + r ];
+				}
+				// SIMD compute
+				for( size_t r = 0; r < blocksize; ++r ) {
+					match[ r ] = cur_ind[ r ] != indices[ r ];
+				}
+				// gather
+				for( size_t r = 0; r < blocksize; ++r ) {
+					if( match[ r ] ) {
+						x_simd[ r ] = x[ indices[ r ] ];
+					}
+				}
+				// masked SIMD compute
+				for( size_t r = 0; r < blocksize; ++r ) {
+					if( match[ r ] ) {
+						tmp[ r ] = values[ r ] * x_simd[ r ];
+					}
+				}
+				// masked SIMD reduce
+				for( size_t r = 0; r < blocksize; ++r ) {
+					if( match[ r ] ) {
+						reduced += tmp[ r ];
+					}
+				}
+				// diagonal detection; SIMD invert and reduce
+				for( size_t r = 0; r < blocksize; ++r ) {
+					match[ r ] = !match[ r ];
+				}
+				for( size_t r = 0; r < blocksize; ++r ) {
+					if( match[ r ] ) {
+						divBy += values[ r ];
+					}
+				}
+			}*/
+
 			for( size_t k = crs.col_start[ i ]; k < crs.col_start[ i + 1 ]; ++k ) {
 				const typename Semiring::D1 val = crs.template getValue( k, one );
 				const auto &ind = crs.row_index[ k ];

From 19b9473a70c973ada778dc0d4bdf288bc358aa70 Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Mon, 14 Apr 2025 17:09:44 +0200
Subject: [PATCH 47/55] Const-reference passing where possible

---
 include/graphblas/reference/blas2.hpp | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp
index 860fc2ffe..41c6c8da0 100644
--- a/include/graphblas/reference/blas2.hpp
+++ b/include/graphblas/reference/blas2.hpp
@@ -2225,8 +2225,8 @@ namespace grb {
 		>
 		inline void sptrsv_kernel(
 			const Compressed_Storage< InputType1, IND, NIT > &crs,
-			const size_t i,
-			IOType * const x,
+			const size_t &i,
+			IOType *__restrict__ const &x,
 			IOType &divBy,
 			const Semiring &semiring,
 			const Subtraction &subtraction
@@ -2327,10 +2327,10 @@ namespace grb {
 			typename RIT, typename CIT, typename NIT
 		>
 		RC dense_unmasked_sequential_sptrsv(
-			IOType *__restrict__ const v_raw,
+			IOType *__restrict__ const &v_raw,
 			const Matrix< InputType1, reference, RIT, CIT, NIT > &T,
-			const size_t offset, const size_t &n,
-			const bool forward,
+			const size_t &offset, const size_t &n,
+			const bool &forward,
 			const Semiring &semiring,
 			const Subtraction &subtraction,
 			const Division &division,
@@ -2413,10 +2413,10 @@ namespace grb {
 			typename RIT, typename CIT, typename NIT
 		>
 		RC dense_unmasked_omp_sptrsv(
-			IOType *__restrict__ const v_raw,
+			IOType *__restrict__ const &v_raw,
 			const Matrix< InputType1, reference, RIT, CIT, NIT > &T,
 			const size_t &n,
-			const bool forward,
+			const bool &forward,
 			const Semiring &semiring,
 			const Subtraction &subtraction,
 			const Division &division,
@@ -2474,8 +2474,8 @@ namespace grb {
 					assert( end != nullptr );
 					for( size_t i = 0; i < sptrsv.supersteps; ++i ) {
 						for( size_t k = 0; k < end[ i ]; ++k ) {
-							const size_t lo = static_cast< size_t >( *data++ );
-							const size_t no = static_cast< size_t >( *data++ );
+							const size_t &lo = static_cast< size_t >( *data++ );
+							const size_t &no = static_cast< size_t >( *data++ );
 							assert( lo < n );
 							assert( no + lo <= n );
 							local_rc = local_rc

From 90c12b82ece6e1de2fdf0e50930b772adc222dab Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Wed, 16 Apr 2025 13:18:17 +0200
Subject: [PATCH 48/55] Add in CRS column-sorting as a pre-processing / tuning
 step for SpTrsv

---
 include/graphblas/reference/matrix.hpp | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp
index 00084ca75..e4f10a7b4 100644
--- a/include/graphblas/reference/matrix.hpp
+++ b/include/graphblas/reference/matrix.hpp
@@ -322,6 +322,30 @@ namespace grb {
 					"nThreads" );
 			}
 
+			// re-sort A
+			{
+				const auto &crs = internal::getCRS( A );
+				for( size_t i = 0; i < A.m; ++i ) {
+					std::vector< std::pair< int, double > > pairs;
+					for( size_t k = crs.col_start[ i ]; k < crs.col_start[ i + 1 ]; ++k ) {
+						pairs.push_back( std::make_pair( crs.row_index[ k ], crs.values[ k ] ) );
+					}
+					std::sort( pairs.begin(), pairs.end(),
+						[]( const std::pair< int, double > &left, const std::pair< int, double > &right ) {
+							return left.first < right.first;
+						} );
+					auto it = pairs.cbegin();
+					//std::cout << "Row " << i << ": ";
+					for( size_t k = crs.col_start[ i ]; k < crs.col_start[ i + 1 ]; ++k, ++it ) {
+						//std::cout << it->first << ", ";
+						assert( it != pairs.cend() );
+						crs.row_index[ k ] = it->first;
+						crs.values[ k ] = it->second;
+					}
+					//std::cout << std::endl;
+				}
+			}
+
 			sptrsv.is_simple = true;
 			#pragma omp parallel
 			{

From 0a0fe6f52d23334ef646d680b3a46183263b6f39 Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Wed, 16 Apr 2025 13:25:18 +0200
Subject: [PATCH 49/55] Remove debug statements, also sort CCS, and parallelise
 the sorting

---
 include/graphblas/reference/matrix.hpp | 39 +++++++++++++++++++++++---
 1 file changed, 35 insertions(+), 4 deletions(-)

diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp
index e4f10a7b4..27e97cb22 100644
--- a/include/graphblas/reference/matrix.hpp
+++ b/include/graphblas/reference/matrix.hpp
@@ -322,9 +322,16 @@ namespace grb {
 					"nThreads" );
 			}
 
-			// re-sort A
+			// re-sort A, CRS first
 			{
 				const auto &crs = internal::getCRS( A );
+				const size_t nThreads = A.m < config::OMP::minLoopSize()
+					? 1
+					: std::min(
+						config::OMP::threads(),
+						A.nz / config::CACHE_LINE_SIZE::value()
+					);
+				#pragma omp parallel for num_threads( nThreads )
 				for( size_t i = 0; i < A.m; ++i ) {
 					std::vector< std::pair< int, double > > pairs;
 					for( size_t k = crs.col_start[ i ]; k < crs.col_start[ i + 1 ]; ++k ) {
@@ -335,14 +342,38 @@ namespace grb {
 							return left.first < right.first;
 						} );
 					auto it = pairs.cbegin();
-					//std::cout << "Row " << i << ": ";
 					for( size_t k = crs.col_start[ i ]; k < crs.col_start[ i + 1 ]; ++k, ++it ) {
-						//std::cout << it->first << ", ";
 						assert( it != pairs.cend() );
 						crs.row_index[ k ] = it->first;
 						crs.values[ k ] = it->second;
 					}
-					//std::cout << std::endl;
+				}
+			}
+			// then CCS
+			{
+				const auto &ccs = internal::getCCS( A );
+				const size_t nThreads = A.n < config::OMP::minLoopSize()
+					? 1
+					: std::min(
+						config::OMP::threads(),
+						A.nz / config::CACHE_LINE_SIZE::value()
+					);
+				#pragma omp parallel for num_threads( nThreads )
+				for( size_t i = 0; i < A.n; ++i ) {
+					std::vector< std::pair< int, double > > pairs;
+					for( size_t k = ccs.col_start[ i ]; k < ccs.col_start[ i + 1 ]; ++k ) {
+						pairs.push_back( std::make_pair( ccs.row_index[ k ], ccs.values[ k ] ) );
+					}
+					std::sort( pairs.begin(), pairs.end(),
+						[]( const std::pair< int, double > &left, const std::pair< int, double > &right ) {
+							return left.first < right.first;
+						} );
+					auto it = pairs.cbegin();
+					for( size_t k = ccs.col_start[ i ]; k < ccs.col_start[ i + 1 ]; ++k, ++it ) {
+						assert( it != pairs.cend() );
+						ccs.row_index[ k ] = it->first;
+						ccs.values[ k ] = it->second;
+					}
 				}
 			}
 

From 38534246d83233c04a0bf4a05f76bf48dc2da81b Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Wed, 16 Apr 2025 13:28:48 +0200
Subject: [PATCH 50/55] Save testing code, assuming sorted arrays

---
 include/graphblas/reference/blas2.hpp | 35 +++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp
index 41c6c8da0..fe9588d15 100644
--- a/include/graphblas/reference/blas2.hpp
+++ b/include/graphblas/reference/blas2.hpp
@@ -2599,8 +2599,43 @@ namespace grb {
 			return internal::dense_unmasked_sequential_sptrsv< descr, false >(
 				xb_p, T, 0, n, forward, semiring, subtraction, division, phase );
 #else
+			// The below code is only for testing (DBG):
+ #if 0
+			if( phase == grb::RESIZE ) { return grb::SUCCESS; }
+			const auto sptrsv = internal::getSptrsvData( T );
+			#pragma omp parallel num_threads(sptrsv->nThreads)
+			{
+				const auto &crs = internal::getCRS( T );
+				const int *__restrict__ schedule = reinterpret_cast< int * >(
+					sptrsv->data[ omp_get_thread_num() ] );
+				for( size_t step = 0; step < sptrsv->supersteps; ++step ) {
+					const int &lo = *schedule++;
+					const int &no = *schedule++;
+					const int upper_limit = lo + no;
+					for( int row_idx = lo; row_idx < upper_limit; ++row_idx ) {
+						// if not sorted, enable this variant instead:
+						// IOType div = 0;
+						// for( unsigned int k = crs.col_start[ row_idx ]; k < crs.col_start[ row_idx + 1 ]; ++k ) {
+						for( unsigned int k = crs.col_start[ row_idx ]; k < crs.col_start[ row_idx + 1 ] - 1; ++k ) {
+							const int &j = crs.row_index[ k ];
+							const double &value = crs.values[ k ];
+							// if not sorted, enable this if-else:
+							// if( j == row_idx ) {
+							//	div = value;
+							// } else {
+								xb_p[ row_idx ] -= value * xb_p[ j ];
+							//}
+						}
+						xb_p[ row_idx ] /= crs.values[ crs.col_start[ row_idx + 1 ] - 1 ];
+					}
+					#pragma omp barrier
+				}
+			}
+			return grb::SUCCESS;
+ #else
 			return internal::dense_unmasked_omp_sptrsv< descr >(
 				xb_p, T, n, forward, semiring, subtraction, division, phase );
+ #endif
 #endif
 		} else {
 			grb::Vector< IOType, reference, Coords > no_mask( 0 );

From 555bc5f1ad83cb232b4c754199242e8f5de3cbbd Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Thu, 17 Apr 2025 13:05:48 +0200
Subject: [PATCH 51/55] Make sorted-ness configurable

---
 .../graphblas/reference/SptrsvSchedule.hpp    |  14 +-
 include/graphblas/reference/blas2.hpp         |  74 ++++++---
 include/graphblas/reference/config.hpp        |  29 +++-
 include/graphblas/reference/matrix.hpp        | 145 ++++++++++++------
 4 files changed, 192 insertions(+), 70 deletions(-)

diff --git a/include/graphblas/reference/SptrsvSchedule.hpp b/include/graphblas/reference/SptrsvSchedule.hpp
index 8811f80cc..ea25e8229 100644
--- a/include/graphblas/reference/SptrsvSchedule.hpp
+++ b/include/graphblas/reference/SptrsvSchedule.hpp
@@ -76,6 +76,9 @@ namespace grb {
 #endif
 					// note that this is a trivial schedule
 					is_simple = true;
+
+					// the default schedule does not (cannot) sort input matrices
+					is_sorted = false;
 				}
 
 				/**
@@ -84,12 +87,14 @@ namespace grb {
 				void moveImpl( SptrsvSchedule &&toMove ) {
 					_deleters = std::move( toMove._deleters );
 					default_schedule = std::move( toMove.default_schedule );
+					is_sorted = toMove.is_sorted;
 					is_simple = toMove.is_simple;
 					supersteps = toMove.supersteps;
 					nThreads = toMove.nThreads;
 					data = std::move( toMove.data );
 					endPositions = std::move( toMove.endPositions );
 					nRanges = std::move( toMove.nRanges );
+					toMove.is_sorted = true;
 					toMove.is_simple = false;
 					toMove.supersteps = 0;
 					toMove.nThreads = 0;
@@ -98,6 +103,9 @@ namespace grb {
 
 			public:
 
+				/** Whether the matrix is sorted as part of tuning. */
+				bool is_sorted;
+
 				/** Whether the schedule is simple.*/
 				bool is_simple;
 
@@ -133,7 +141,8 @@ namespace grb {
 				 * first (and only) thread.
 				 */
 				SptrsvSchedule( const NIT n ) :
-					_deleters( 1 ), is_simple( false ), supersteps( 1 ), nThreads( 1 ),
+					_deleters( 1 ), is_sorted( true ), is_simple( false ),
+					supersteps( 1 ), nThreads( 1 ),
 					data( 1 ), endPositions( 1 ), nRanges( 1 )
 				{
 					data[ 0 ] = nullptr;
@@ -151,7 +160,8 @@ namespace grb {
 				 * a single-superstep schedule that assigns all work to the first thread.
 				 */
 				SptrsvSchedule( const NIT n, const size_t T ) :
-					_deleters( 2 * T ), is_simple( false ), supersteps( 1 ), nThreads( T ),
+					_deleters( 2 * T ), is_sorted( true ), is_simple( false ),
+					supersteps( 1 ), nThreads( T ),
 					data( T, nullptr ), endPositions( T, nullptr ), nRanges( T, 0 )
 				{
 					if( T == 0 || T > std::numeric_limits< int >::max() ) {
diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp
index 03aad374d..a623b94b4 100644
--- a/include/graphblas/reference/blas2.hpp
+++ b/include/graphblas/reference/blas2.hpp
@@ -2523,15 +2523,31 @@ namespace grb {
 					const NIT *__restrict__ data =
 						reinterpret_cast< const NIT * >(sptrsv.data[ s ]);
 					assert( data != nullptr );
-					for( size_t i = 0; i < sptrsv.supersteps; ++i ) {
-						const size_t lo = static_cast< size_t >( *data++ );
-						const size_t no = static_cast< size_t >( *data++ );
-						assert( lo < n );
-						assert( no + lo <= n );
-						local_rc = local_rc
-							? local_rc
-							: dense_unmasked_sequential_sptrsv< descr, true, 1 >(
-								v_raw, T, lo, no, forward, semiring, subtraction, division, phase );
+					if( sptrsv.is_sorted ) {
+						for( size_t i = 0; i < sptrsv.supersteps; ++i ) {
+							const size_t lo = static_cast< size_t >( *data++ );
+							const size_t no = static_cast< size_t >( *data++ );
+							assert( lo < n );
+							assert( no + lo <= n );
+							local_rc = local_rc ? local_rc : dense_unmasked_sequential_sptrsv<
+									descr, true, config::tuning::SpTRSV::sortingMode
+								>(
+									v_raw, T, lo, no, forward, semiring, subtraction, division, phase
+								);
+						}
+						#pragma omp barrier
+					} else {
+						for( size_t i = 0; i < sptrsv.supersteps; ++i ) {
+							const size_t lo = static_cast< size_t >( *data++ );
+							const size_t no = static_cast< size_t >( *data++ );
+							assert( lo < n );
+							assert( no + lo <= n );
+							local_rc = local_rc ? local_rc : dense_unmasked_sequential_sptrsv<
+								descr, true, 0
+							>(
+								v_raw, T, lo, no, forward, semiring, subtraction, division, phase
+							);
+						}
 						#pragma omp barrier
 					}
 					if( local_rc != grb::SUCCESS ) {
@@ -2549,18 +2565,36 @@ namespace grb {
 					const NIT *__restrict__ const end =
 						reinterpret_cast< const NIT * >(sptrsv.endPositions[ s ]);
 					assert( end != nullptr );
-					for( size_t i = 0; i < sptrsv.supersteps; ++i ) {
-						for( size_t k = 0; k < end[ i ]; ++k ) {
-							const size_t &lo = static_cast< size_t >( *data++ );
-							const size_t &no = static_cast< size_t >( *data++ );
-							assert( lo < n );
-							assert( no + lo <= n );
-							local_rc = local_rc
-								? local_rc
-								: dense_unmasked_sequential_sptrsv< descr, true, 1 >(
-									v_raw, T, lo, no, forward, semiring, subtraction, division, phase );
+					if( sptrsv.is_sorted ) {
+						for( size_t i = 0; i < sptrsv.supersteps; ++i ) {
+							for( size_t k = 0; k < end[ i ]; ++k ) {
+								const size_t &lo = static_cast< size_t >( *data++ );
+								const size_t &no = static_cast< size_t >( *data++ );
+								assert( lo < n );
+								assert( no + lo <= n );
+								local_rc = local_rc ? local_rc : dense_unmasked_sequential_sptrsv<
+										descr, true, config::tuning::SpTRSV::sortingMode
+									>(
+										v_raw, T, lo, no, forward, semiring, subtraction, division, phase
+									);
+							}
+							#pragma omp barrier
+						}
+					} else {
+						for( size_t i = 0; i < sptrsv.supersteps; ++i ) {
+							for( size_t k = 0; k < end[ i ]; ++k ) {
+								const size_t &lo = static_cast< size_t >( *data++ );
+								const size_t &no = static_cast< size_t >( *data++ );
+								assert( lo < n );
+								assert( no + lo <= n );
+								local_rc = local_rc ? local_rc : dense_unmasked_sequential_sptrsv<
+										descr, true, 0
+									>(
+										v_raw, T, lo, no, forward, semiring, subtraction, division, phase
+									);
+							}
+							#pragma omp barrier
 						}
-						#pragma omp barrier
 					}
 					if( local_rc != grb::SUCCESS ) {
 						#pragma omp atomic write
diff --git a/include/graphblas/reference/config.hpp b/include/graphblas/reference/config.hpp
index 82e067ffd..1d9a8ebbf 100644
--- a/include/graphblas/reference/config.hpp
+++ b/include/graphblas/reference/config.hpp
@@ -333,7 +333,34 @@ namespace grb {
 
 		/** @} */
 
-	} // namespace config
+
+		/**
+		 * Collects all optimisations with regards to tuning operations for specific
+		 * inputs, i.e., #grb::PHASE::TUNE.
+		 */
+		namespace tuning {
+
+			/**
+			 * Collects all options for SpTRSV tuning.
+			 */
+			struct SpTRSV {
+
+				/**
+				 * There are three sorting modes implemented:
+				 *  - 0: the matrix rows are not sorted;
+				 *  - 1: the matrix rows have diagonal elements at position 0;
+				 *  - 2: the matrix rows are sorted.
+				 * The above description assumes CRS, however, the chosen sorting mode is
+				 * also applied to the CCS (in which case the above three points apply with
+				 * `rows' substituted by `columns').
+				 */
+				static constexpr int sortingMode = 2;
+
+			};
+
+		} // namespace grb::config::tuning
+
+	} // namespace grb::config
 
 } // namespace grb
 
diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp
index 27e97cb22..6b7cb29ba 100644
--- a/include/graphblas/reference/matrix.hpp
+++ b/include/graphblas/reference/matrix.hpp
@@ -322,59 +322,110 @@ namespace grb {
 					"nThreads" );
 			}
 
-			// re-sort A, CRS first
-			{
-				const auto &crs = internal::getCRS( A );
-				const size_t nThreads = A.m < config::OMP::minLoopSize()
-					? 1
-					: std::min(
-						config::OMP::threads(),
-						A.nz / config::CACHE_LINE_SIZE::value()
-					);
-				#pragma omp parallel for num_threads( nThreads )
-				for( size_t i = 0; i < A.m; ++i ) {
-					std::vector< std::pair< int, double > > pairs;
-					for( size_t k = crs.col_start[ i ]; k < crs.col_start[ i + 1 ]; ++k ) {
-						pairs.push_back( std::make_pair( crs.row_index[ k ], crs.values[ k ] ) );
+			// re-sort A, if enabled
+			sptrsv.is_sorted = true;
+			if( config::tuning::SpTRSV::sortingMode == 2 ) {
+				// CRS first
+				{
+					const auto &crs = internal::getCRS( A );
+					const size_t nThreads = A.m < config::OMP::minLoopSize()
+						? 1
+						: std::min(
+							config::OMP::threads(),
+							A.nz / config::CACHE_LINE_SIZE::value()
+						);
+					#pragma omp parallel for num_threads( nThreads )
+					for( size_t i = 0; i < A.m; ++i ) {
+						std::vector< std::pair< int, double > > pairs;
+						for( size_t k = crs.col_start[ i ]; k < crs.col_start[ i + 1 ]; ++k ) {
+							pairs.push_back( std::make_pair( crs.row_index[ k ], crs.values[ k ] ) );
+						}
+						std::sort( pairs.begin(), pairs.end(),
+							[]( const std::pair< int, double > &left, const std::pair< int, double > &right ) {
+								return left.first < right.first;
+							} );
+						auto it = pairs.cbegin();
+						for( size_t k = crs.col_start[ i ]; k < crs.col_start[ i + 1 ]; ++k, ++it ) {
+							assert( it != pairs.cend() );
+							crs.row_index[ k ] = it->first;
+							crs.values[ k ] = it->second;
+						}
 					}
-					std::sort( pairs.begin(), pairs.end(),
-						[]( const std::pair< int, double > &left, const std::pair< int, double > &right ) {
-							return left.first < right.first;
-						} );
-					auto it = pairs.cbegin();
-					for( size_t k = crs.col_start[ i ]; k < crs.col_start[ i + 1 ]; ++k, ++it ) {
-						assert( it != pairs.cend() );
-						crs.row_index[ k ] = it->first;
-						crs.values[ k ] = it->second;
+				}
+				// then CCS
+				{
+					const auto &ccs = internal::getCCS( A );
+					const size_t nThreads = A.n < config::OMP::minLoopSize()
+						? 1
+						: std::min(
+							config::OMP::threads(),
+							A.nz / config::CACHE_LINE_SIZE::value()
+						);
+					#pragma omp parallel for num_threads( nThreads )
+					for( size_t i = 0; i < A.n; ++i ) {
+						std::vector< std::pair< int, double > > pairs;
+						for( size_t k = ccs.col_start[ i ]; k < ccs.col_start[ i + 1 ]; ++k ) {
+							pairs.push_back( std::make_pair( ccs.row_index[ k ], ccs.values[ k ] ) );
+						}
+						std::sort( pairs.begin(), pairs.end(),
+							[]( const std::pair< int, double > &left, const std::pair< int, double > &right ) {
+								return left.first < right.first;
+							} );
+						auto it = pairs.cbegin();
+						for( size_t k = ccs.col_start[ i ]; k < ccs.col_start[ i + 1 ]; ++k, ++it ) {
+							assert( it != pairs.cend() );
+							ccs.row_index[ k ] = it->first;
+							ccs.values[ k ] = it->second;
+						}
 					}
 				}
-			}
-			// then CCS
-			{
-				const auto &ccs = internal::getCCS( A );
-				const size_t nThreads = A.n < config::OMP::minLoopSize()
-					? 1
-					: std::min(
-						config::OMP::threads(),
-						A.nz / config::CACHE_LINE_SIZE::value()
-					);
-				#pragma omp parallel for num_threads( nThreads )
-				for( size_t i = 0; i < A.n; ++i ) {
-					std::vector< std::pair< int, double > > pairs;
-					for( size_t k = ccs.col_start[ i ]; k < ccs.col_start[ i + 1 ]; ++k ) {
-						pairs.push_back( std::make_pair( ccs.row_index[ k ], ccs.values[ k ] ) );
+			} else if( config::tuning::SpTRSV::sortingMode == 1 ) {
+				{
+					const auto &crs = internal::getCRS( A );
+					const size_t nThreads = A.m < config::OMP::minLoopSize()
+						? 1
+						: std::min(
+							config::OMP::threads(),
+							A.nz / config::CACHE_LINE_SIZE::value()
+						);
+					#pragma omp parallel for num_threads( nThreads )
+					for( size_t i = 0; i < A.m; ++i ) {
+						const auto index_it = std::find(
+							crs.row_index + crs.col_start[ i ],
+							crs.row_index + crs.col_start[ i + 1 ],
+							i
+						);
+						if( !std::is_void< InputType >::value ) {
+							const size_t index = std::distance( crs.row_index, index_it );
+							std::swap( crs.values[ index ], crs.values[ crs.col_start[ i ] ] );
+						}
+						std::swap( *index_it, crs.row_index[ crs.col_start[ i ] ] );
 					}
-					std::sort( pairs.begin(), pairs.end(),
-						[]( const std::pair< int, double > &left, const std::pair< int, double > &right ) {
-							return left.first < right.first;
-						} );
-					auto it = pairs.cbegin();
-					for( size_t k = ccs.col_start[ i ]; k < ccs.col_start[ i + 1 ]; ++k, ++it ) {
-						assert( it != pairs.cend() );
-						ccs.row_index[ k ] = it->first;
-						ccs.values[ k ] = it->second;
+				}
+				{
+					const auto &ccs = internal::getCCS( A );
+					const size_t nThreads = A.n < config::OMP::minLoopSize()
+						? 1
+						: std::min(
+							config::OMP::threads(),
+							A.nz / config::CACHE_LINE_SIZE::value()
+						);
+					#pragma omp parallel for num_threads( nThreads )
+					for( size_t i = 0; i < A.n; ++i ) {
+						const auto index_it = std::find(
+							ccs.row_index + ccs.col_start[ i ],
+							ccs.row_index + ccs.col_start[ i + 1 ],
+							i
+						);
+						if( !std::is_void< InputType >::value ) {
+							const size_t index = std::distance( ccs.row_index, index_it );
+							std::swap( ccs.values[ index ], ccs.values[ ccs.col_start[ i ] ] );
+						}
+						std::swap( *index_it, ccs.row_index[ ccs.col_start[ i ] ] );
 					}
 				}
+			} else {
+				sptrsv.is_sorted = false;
 			}
 
 			sptrsv.is_simple = true;

From e23d8ee9f54897b2454ef9dadc9545f6e9ba5e93 Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Thu, 17 Apr 2025 16:53:11 +0200
Subject: [PATCH 52/55] Make unsorted the default (no sorting overhead

---
 include/graphblas/reference/config.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/graphblas/reference/config.hpp b/include/graphblas/reference/config.hpp
index 1d9a8ebbf..116ed0202 100644
--- a/include/graphblas/reference/config.hpp
+++ b/include/graphblas/reference/config.hpp
@@ -354,7 +354,7 @@ namespace grb {
 				 * also applied to the CCS (in which case the above three points apply with
 				 * `rows' substituted by `columns').
 				 */
-				static constexpr int sortingMode = 2;
+				static constexpr int sortingMode = 0;
 
 			};
 

From fd1ce27c9c79e23afd2b350282902e627ce584f2 Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Thu, 17 Apr 2025 16:53:24 +0200
Subject: [PATCH 53/55] Make forward a template parameter rather than a
 run-time switch

---
 include/graphblas/reference/blas2.hpp | 120 +++++++++++++++++---------
 1 file changed, 77 insertions(+), 43 deletions(-)

diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp
index a623b94b4..a9568ab37 100644
--- a/include/graphblas/reference/blas2.hpp
+++ b/include/graphblas/reference/blas2.hpp
@@ -2224,10 +2224,13 @@ namespace grb {
 		 *                1: the matrix rows have diagonal elements at position 0,
 		 *                2: the matrix rows are sorted.
 		 *
+		 * @tparam forward True: perform forward substitution.
+		 *                 False: perform backward substitution.
+		 *
 		 * \note The above description assumes CRS.
 		 */
 		template<
-			int sorted,
+			int sorted, bool forward,
 			typename IOType, typename InputType1,
 			typename IND, typename NIT,
 			class Semiring, class Subtraction
@@ -2236,7 +2239,6 @@ namespace grb {
 			const Compressed_Storage< InputType1, IND, NIT > &crs,
 			const size_t &i,
 			IOType *__restrict__ const &x,
-			const bool &forward,
 			IOType &divBy,
 			const Semiring &semiring,
 			const Subtraction &subtraction
@@ -2346,25 +2348,25 @@ namespace grb {
 		}
 
 		template<
-			typename IOType, int sorted,
+			typename IOType, int sorted, bool forward,
 			typename VIT, typename RCIT, typename NIT
 		>
 		inline static IOType getDiagonalEntry(
 			const Compressed_Storage< VIT, RCIT, NIT > &storage, const size_t &i,
-			const bool &forward, const IOType &one
+			const IOType &one
 		) {
 			assert( sorted >= 1 && sorted < 3 );
 			assert( storage.col_start[ i + 1 ] > storage.col_start[ i ] );
 			const size_t diag_index = sorted == 1 ? storage.col_start[ i ] : (
 				forward ? (storage.col_start[ i + 1 ] - 1) : storage.col_start[ i ] );
-			assert( storage.col_index[ diag_index ] == i );
+			assert( storage.row_index[ diag_index ] == i );
 			return storage.template getValue( diag_index, one );
 		}
 #endif // end ifndef _H_GRB_REFERENCE_OMP_BLAS2
 
 		/** \internal Specialised dense unmasked sptrsv implementation, sequential */
 		template<
-			Descriptor descr, bool maybe_offset, int sorted,
+			Descriptor descr, bool maybe_offset, int sorted, bool forward,
 			class Semiring, class Subtraction, class Division,
 			typename IOType, typename InputType1,
 			typename RIT, typename CIT, typename NIT
@@ -2373,7 +2375,6 @@ namespace grb {
 			IOType *__restrict__ const &v_raw,
 			const Matrix< InputType1, reference, RIT, CIT, NIT > &T,
 			const size_t &offset, const size_t &n,
-			const bool &forward,
 			const Semiring &semiring,
 			const Subtraction &subtraction,
 			const Division &division,
@@ -2383,10 +2384,6 @@ namespace grb {
 			static_assert( sorted >= 0 && sorted < 3, "Invalid value for sorted; this "
 				"an internal error, please submit a bug report" );
 
-			// dynamic sanity checks
-			assert( grb::nrows( T ) == n );
-			assert( grb::ncols( T ) == n );
-
 			// in dense unmasked, resize is a no-op
 			if( phase == grb::RESIZE ) { return grb::SUCCESS; }
 
@@ -2407,11 +2404,14 @@ namespace grb {
 							// in this case we will auto-detect the diagonal item
 							divBy = semiring.template getZero< IOType >();
 						} else {
-							divBy = getDiagonalEntry< IOType, sorted, InputType1, RIT, NIT >(
-								crs, i, forward, one );
+							divBy = getDiagonalEntry<
+								IOType, sorted, forward, InputType1, RIT, NIT
+							> (
+								crs, i, one
+							);
 						}
 						assert( crs.col_start[ i ] <= crs.col_start[ i + 1 ] );
-						sptrsv_kernel< sorted >( crs, i, v_raw, forward, divBy, semiring,
+						sptrsv_kernel< sorted, forward >( crs, i, v_raw, divBy, semiring,
 							subtraction );
 #ifdef _DEBUG
 						std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy
@@ -2426,10 +2426,10 @@ namespace grb {
 						if( !sorted ) {
 							divBy = semiring.template getZero< IOType >();
 						} else {
-							divBy = getDiagonalEntry< IOType, sorted >( crs, i, forward, one );
+							divBy = getDiagonalEntry< IOType, sorted, forward >( crs, i, one );
 						}
 						assert( crs.col_start[ i ] <= crs.col_start[ i + 1 ] );
-						sptrsv_kernel< sorted >( crs, i, v_raw, forward, divBy, semiring,
+						sptrsv_kernel< sorted, forward >( crs, i, v_raw, divBy, semiring,
 							subtraction );
 #ifdef _DEBUG
 						std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy
@@ -2446,10 +2446,10 @@ namespace grb {
 						if( !sorted ) {
 							divBy = semiring.template getZero< IOType >();
 						} else {
-							divBy = getDiagonalEntry< IOType, sorted >( ccs, i, forward, one );
+							divBy = getDiagonalEntry< IOType, sorted, forward >( ccs, i, one );
 						}
 						assert( ccs.col_start[ i ] <= ccs.col_start[ i + 1 ] );
-						sptrsv_kernel< sorted >( ccs, i, v_raw, forward, divBy, semiring,
+						sptrsv_kernel< sorted, forward >( ccs, i, v_raw, divBy, semiring,
 							subtraction );
 #ifdef _DEBUG
 						std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy
@@ -2463,10 +2463,10 @@ namespace grb {
 						if( !sorted ) {
 							divBy = semiring.template getZero< IOType >();
 						} else {
-							divBy = getDiagonalEntry< IOType, sorted >( ccs, i, forward, one );
+							divBy = getDiagonalEntry< IOType, sorted, forward >( ccs, i, one );
 						}
 						assert( ccs.col_start[ i ] <= ccs.col_start[ i + 1 ] );
-						sptrsv_kernel< sorted >( ccs, i, v_raw, forward, divBy, semiring,
+						sptrsv_kernel< sorted, forward >( ccs, i, v_raw, divBy, semiring,
 							subtraction );
 #ifdef _DEBUG
 						std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy
@@ -2484,7 +2484,7 @@ namespace grb {
 #ifdef _H_GRB_REFERENCE_OMP_BLAS2
 		/** \internal Specialised dense unmasked sptrsv implementation, OpenMP */
 		template<
-			Descriptor descr,
+			Descriptor descr, bool forward,
 			class Semiring, class Subtraction, class Division,
 			typename IOType, typename InputType1,
 			typename RIT, typename CIT, typename NIT
@@ -2493,7 +2493,6 @@ namespace grb {
 			IOType *__restrict__ const &v_raw,
 			const Matrix< InputType1, reference, RIT, CIT, NIT > &T,
 			const size_t &n,
-			const bool &forward,
 			const Semiring &semiring,
 			const Subtraction &subtraction,
 			const Division &division,
@@ -2530,9 +2529,9 @@ namespace grb {
 							assert( lo < n );
 							assert( no + lo <= n );
 							local_rc = local_rc ? local_rc : dense_unmasked_sequential_sptrsv<
-									descr, true, config::tuning::SpTRSV::sortingMode
+									descr, true, config::tuning::SpTRSV::sortingMode, forward
 								>(
-									v_raw, T, lo, no, forward, semiring, subtraction, division, phase
+									v_raw, T, lo, no, semiring, subtraction, division, phase
 								);
 						}
 						#pragma omp barrier
@@ -2543,9 +2542,9 @@ namespace grb {
 							assert( lo < n );
 							assert( no + lo <= n );
 							local_rc = local_rc ? local_rc : dense_unmasked_sequential_sptrsv<
-								descr, true, 0
+								descr, true, 0, forward
 							>(
-								v_raw, T, lo, no, forward, semiring, subtraction, division, phase
+								v_raw, T, lo, no, semiring, subtraction, division, phase
 							);
 						}
 						#pragma omp barrier
@@ -2573,9 +2572,9 @@ namespace grb {
 								assert( lo < n );
 								assert( no + lo <= n );
 								local_rc = local_rc ? local_rc : dense_unmasked_sequential_sptrsv<
-										descr, true, config::tuning::SpTRSV::sortingMode
+										descr, true, config::tuning::SpTRSV::sortingMode, forward
 									>(
-										v_raw, T, lo, no, forward, semiring, subtraction, division, phase
+										v_raw, T, lo, no, semiring, subtraction, division, phase
 									);
 							}
 							#pragma omp barrier
@@ -2588,9 +2587,9 @@ namespace grb {
 								assert( lo < n );
 								assert( no + lo <= n );
 								local_rc = local_rc ? local_rc : dense_unmasked_sequential_sptrsv<
-										descr, true, 0
+										descr, true, 0, forward
 									>(
-										v_raw, T, lo, no, forward, semiring, subtraction, division, phase
+										v_raw, T, lo, no, semiring, subtraction, division, phase
 									);
 							}
 							#pragma omp barrier
@@ -2613,7 +2612,7 @@ namespace grb {
 		 *           sptrsv.
 		 */
 		template<
-			Descriptor descr,
+			Descriptor descr, bool forward,
 			bool masked, bool sparse,
 			class Semiring, class Subtraction, class Division,
 			typename IOType, typename InputType1, typename InputType2,
@@ -2624,7 +2623,6 @@ namespace grb {
 			const Vector< InputType2, reference, Coords > &mask,
 			const Matrix< InputType1, reference, RIT, CIT, NIT > &T,
 			const size_t &n,
-			const bool forward,
 			const Semiring &semiring,
 			const Subtraction &subtraction,
 			const Division &division,
@@ -2640,6 +2638,8 @@ namespace grb {
 			assert( grb::nrows( T ) == n );
 			assert( grb::ncols( T ) == n );
 
+			(void) descr;
+			(void) forward;
 			(void) masked;
 			(void) sparse;
 #ifdef NDEBUG
@@ -2648,7 +2648,6 @@ namespace grb {
 			(void) T;
 			(void) n;
 #endif
-			(void) forward;
 			(void) semiring;
 			(void) subtraction;
 			(void) division;
@@ -2707,8 +2706,13 @@ namespace grb {
 		if( dense || grb::nnz( xb ) == n ) {
 			IOType * const xb_p = internal::getRaw( xb );
 #ifndef _H_GRB_REFERENCE_OMP_BLAS2
-			return internal::dense_unmasked_sequential_sptrsv< descr, false, 0 >(
-				xb_p, T, 0, n, forward, semiring, subtraction, division, phase );
+			if( forward ) {
+				return internal::dense_unmasked_sequential_sptrsv< descr, false, 0, true >(
+					xb_p, T, 0, n, semiring, subtraction, division, phase );
+			} else {
+				return internal::dense_unmasked_sequential_sptrsv< descr, false, 0, false >(
+					xb_p, T, 0, n, semiring, subtraction, division, phase );
+			}
 #else
 			// The below code is only for testing (DBG):
  #if 0
@@ -2744,14 +2748,28 @@ namespace grb {
 			}
 			return grb::SUCCESS;
  #else
-			return internal::dense_unmasked_omp_sptrsv< descr >(
-				xb_p, T, n, forward, semiring, subtraction, division, phase );
+			if( forward ) {
+				return internal::dense_unmasked_omp_sptrsv< descr, true >(
+					xb_p, T, n, semiring, subtraction, division, phase );
+			} else {
+				return internal::dense_unmasked_omp_sptrsv< descr, false >(
+					xb_p, T, n, semiring, subtraction, division, phase );
+			}
  #endif
 #endif
 		} else {
 			grb::Vector< IOType, reference, Coords > no_mask( 0 );
-			return internal::generic_sptrsv< descr, false, true >( xb, no_mask, T, n,
-				forward, semiring, subtraction, division, phase );
+			if( forward ) {
+				return internal::generic_sptrsv< descr, false, true, true >(
+					xb, no_mask, T, n,
+					semiring, subtraction, division, phase
+				);
+			} else {
+				return internal::generic_sptrsv< descr, false, true, false>(
+					xb, no_mask, T, n,
+					semiring, subtraction, division, phase
+				);
+			}
 		}
 	}
 
@@ -2817,11 +2835,27 @@ namespace grb {
 		}
 
 		if( dense || (grb::nnz( xb ) == n && grb::nnz( mask ) == n) ) {
-			return internal::generic_sptrsv< descr, true, false >( xb, mask, T, n,
-				forward, semiring, subtraction, division, phase );
+			if( forward ) {
+				return internal::generic_sptrsv< descr, true, false, true >(
+					xb, mask, T, n,
+					semiring, subtraction, division, phase
+				);
+			} else {
+				return internal::generic_sptrsv< descr, true, false, false >(
+					xb, mask, T, n,
+					semiring, subtraction, division, phase
+				);
+			}
 		} else {
-			return internal::generic_sptrsv< descr, true, true >( xb, mask, T, n,
-				forward, semiring, subtraction, division, phase );
+			if( forward ) {
+				return internal::generic_sptrsv< descr, true, true, true >(
+					xb, mask, T, n,
+					semiring, subtraction, division, phase );
+			} else {
+				return internal::generic_sptrsv< descr, true, true, false >(
+					xb, mask, T, n,
+					semiring, subtraction, division, phase );
+			}
 		}
 	}
 

From a346aa17295a3454033959fd2351d2869367f0b2 Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Thu, 17 Apr 2025 19:09:29 +0200
Subject: [PATCH 54/55] Optimisations and bugfixes

---
 include/graphblas/reference/blas2.hpp | 63 +++++++++++++++------------
 1 file changed, 35 insertions(+), 28 deletions(-)

diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp
index a9568ab37..41fac2d0b 100644
--- a/include/graphblas/reference/blas2.hpp
+++ b/include/graphblas/reference/blas2.hpp
@@ -2368,13 +2368,13 @@ namespace grb {
 		template<
 			Descriptor descr, bool maybe_offset, int sorted, bool forward,
 			class Semiring, class Subtraction, class Division,
-			typename IOType, typename InputType1,
+			typename IOPtrType, typename InputType1,
 			typename RIT, typename CIT, typename NIT
 		>
 		RC dense_unmasked_sequential_sptrsv(
-			IOType *__restrict__ const &v_raw,
+			const IOPtrType &v_raw,
 			const Matrix< InputType1, reference, RIT, CIT, NIT > &T,
-			const size_t &offset, const size_t &n,
+			const NIT &offset, const NIT &n,
 			const Semiring &semiring,
 			const Subtraction &subtraction,
 			const Division &division,
@@ -2384,11 +2384,16 @@ namespace grb {
 			static_assert( sorted >= 0 && sorted < 3, "Invalid value for sorted; this "
 				"an internal error, please submit a bug report" );
 
-			// in dense unmasked, resize is a no-op
-			if( phase == grb::RESIZE ) { return grb::SUCCESS; }
-
 			// only execute and resize are supported
 			assert( phase == grb::EXECUTE );
+#ifdef NDEBUG
+			(void) phase;
+#endif
+
+			// get value type from the pointer
+			typedef typename std::remove_reference<
+				decltype( *std::declval< IOPtrType >() )
+			>::type IOType;
 
 			// in case of pattern matrices, get 1 from the semiring
 			const IOType one = semiring.getMultiplicativeMonoid().template
@@ -2404,17 +2409,13 @@ namespace grb {
 							// in this case we will auto-detect the diagonal item
 							divBy = semiring.template getZero< IOType >();
 						} else {
-							divBy = getDiagonalEntry<
-								IOType, sorted, forward, InputType1, RIT, NIT
-							> (
-								crs, i, one
-							);
+							divBy = getDiagonalEntry< IOType, sorted, forward >( crs, i, one );
 						}
 						assert( crs.col_start[ i ] <= crs.col_start[ i + 1 ] );
 						sptrsv_kernel< sorted, forward >( crs, i, v_raw, divBy, semiring,
 							subtraction );
 #ifdef _DEBUG
-						std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy
+						std::cout << "\trow v_raw[ " << i << " ] = " << v_raw[ i ] << " will be normalised with " << divBy
 							<< "\n";
 #endif
 						(void) grb::foldl( v_raw[ i ], divBy, division );
@@ -2432,7 +2433,7 @@ namespace grb {
 						sptrsv_kernel< sorted, forward >( crs, i, v_raw, divBy, semiring,
 							subtraction );
 #ifdef _DEBUG
-						std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy
+						std::cout << "\trow v_raw[ " << i << " ] = " << v_raw[ i ] << " will be normalised with " << divBy
 							<< "\n";
 #endif
 						(void) grb::foldl( v_raw[ i ], divBy, division );
@@ -2490,7 +2491,7 @@ namespace grb {
 			typename RIT, typename CIT, typename NIT
 		>
 		RC dense_unmasked_omp_sptrsv(
-			IOType *__restrict__ const &v_raw,
+			IOType * const &v_raw,
 			const Matrix< InputType1, reference, RIT, CIT, NIT > &T,
 			const size_t &n,
 			const Semiring &semiring,
@@ -2524,8 +2525,8 @@ namespace grb {
 					assert( data != nullptr );
 					if( sptrsv.is_sorted ) {
 						for( size_t i = 0; i < sptrsv.supersteps; ++i ) {
-							const size_t lo = static_cast< size_t >( *data++ );
-							const size_t no = static_cast< size_t >( *data++ );
+							const auto &lo = *data++;
+							const auto &no = *data++;
 							assert( lo < n );
 							assert( no + lo <= n );
 							local_rc = local_rc ? local_rc : dense_unmasked_sequential_sptrsv<
@@ -2533,12 +2534,12 @@ namespace grb {
 								>(
 									v_raw, T, lo, no, semiring, subtraction, division, phase
 								);
+							#pragma omp barrier
 						}
-						#pragma omp barrier
 					} else {
 						for( size_t i = 0; i < sptrsv.supersteps; ++i ) {
-							const size_t lo = static_cast< size_t >( *data++ );
-							const size_t no = static_cast< size_t >( *data++ );
+							const NIT &lo = *data++;
+							const NIT &no = *data++;
 							assert( lo < n );
 							assert( no + lo <= n );
 							local_rc = local_rc ? local_rc : dense_unmasked_sequential_sptrsv<
@@ -2546,8 +2547,8 @@ namespace grb {
 							>(
 								v_raw, T, lo, no, semiring, subtraction, division, phase
 							);
+							#pragma omp barrier
 						}
-						#pragma omp barrier
 					}
 					if( local_rc != grb::SUCCESS ) {
 						#pragma omp atomic write
@@ -2567,8 +2568,8 @@ namespace grb {
 					if( sptrsv.is_sorted ) {
 						for( size_t i = 0; i < sptrsv.supersteps; ++i ) {
 							for( size_t k = 0; k < end[ i ]; ++k ) {
-								const size_t &lo = static_cast< size_t >( *data++ );
-								const size_t &no = static_cast< size_t >( *data++ );
+								const NIT &lo = *data++;
+								const NIT &no = *data++;
 								assert( lo < n );
 								assert( no + lo <= n );
 								local_rc = local_rc ? local_rc : dense_unmasked_sequential_sptrsv<
@@ -2582,8 +2583,8 @@ namespace grb {
 					} else {
 						for( size_t i = 0; i < sptrsv.supersteps; ++i ) {
 							for( size_t k = 0; k < end[ i ]; ++k ) {
-								const size_t &lo = static_cast< size_t >( *data++ );
-								const size_t &no = static_cast< size_t >( *data++ );
+								const NIT &lo = *data++;
+								const NIT &no = *data++;
 								assert( lo < n );
 								assert( no + lo <= n );
 								local_rc = local_rc ? local_rc : dense_unmasked_sequential_sptrsv<
@@ -2702,21 +2703,26 @@ namespace grb {
 		// check trivial
 		if( n == 0 ) { return grb::SUCCESS; }
 
+		// in the dense variant, resize does nothing
+		if( phase == grb::RESIZE ) { return grb::SUCCESS; }
+
 		// check dense dispatch
 		if( dense || grb::nnz( xb ) == n ) {
-			IOType * const xb_p = internal::getRaw( xb );
 #ifndef _H_GRB_REFERENCE_OMP_BLAS2
+			IOType *__restrict__ const xb_p = internal::getRaw( xb );
+			const NIT lo = 0;
+			const NIT no = n;
 			if( forward ) {
 				return internal::dense_unmasked_sequential_sptrsv< descr, false, 0, true >(
-					xb_p, T, 0, n, semiring, subtraction, division, phase );
+					xb_p, T, lo, no, semiring, subtraction, division, phase );
 			} else {
 				return internal::dense_unmasked_sequential_sptrsv< descr, false, 0, false >(
-					xb_p, T, 0, n, semiring, subtraction, division, phase );
+					xb_p, T, lo, no, semiring, subtraction, division, phase );
 			}
 #else
+			IOType * const xb_p = internal::getRaw( xb );
 			// The below code is only for testing (DBG):
  #if 0
-			if( phase == grb::RESIZE ) { return grb::SUCCESS; }
 			const auto sptrsv = internal::getSptrsvData( T );
 			#pragma omp parallel num_threads(sptrsv->nThreads)
 			{
@@ -2741,6 +2747,7 @@ namespace grb {
 								xb_p[ row_idx ] -= value * xb_p[ j ];
 							//}
 						}
+						#pragma omp critical
 						xb_p[ row_idx ] /= crs.values[ crs.col_start[ row_idx + 1 ] - 1 ];
 					}
 					#pragma omp barrier

From 763b89f6232dc61302f03044691f96b642fd1b4c Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Thu, 17 Apr 2025 19:09:46 +0200
Subject: [PATCH 55/55] More robust implementation of sorting mode 1

---
 include/graphblas/reference/matrix.hpp | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp
index 6b7cb29ba..1b99614ef 100644
--- a/include/graphblas/reference/matrix.hpp
+++ b/include/graphblas/reference/matrix.hpp
@@ -412,16 +412,18 @@ namespace grb {
 						);
 					#pragma omp parallel for num_threads( nThreads )
 					for( size_t i = 0; i < A.n; ++i ) {
-						const auto index_it = std::find(
-							ccs.row_index + ccs.col_start[ i ],
-							ccs.row_index + ccs.col_start[ i + 1 ],
-							i
-						);
-						if( !std::is_void< InputType >::value ) {
-							const size_t index = std::distance( ccs.row_index, index_it );
-							std::swap( ccs.values[ index ], ccs.values[ ccs.col_start[ i ] ] );
+						const auto start = ccs.row_index + ccs.col_start[ i ];
+						const auto end = ccs.row_index + ccs.col_start[ i + 1 ];
+						const auto index_it = std::find( start, end, i );
+						if( index_it == end ) {
+							throw std::runtime_error( "No diagonal element present" );
+						} else if( index_it != start ) {
+							if( !std::is_void< InputType >::value ) {
+								const size_t index = std::distance( ccs.row_index, index_it );
+								std::swap( ccs.values[ index ], ccs.values[ ccs.col_start[ i ] ] );
+							}
+							std::swap( *index_it, ccs.row_index[ ccs.col_start[ i ] ] );
 						}
-						std::swap( *index_it, ccs.row_index[ ccs.col_start[ i ] ] );
 					}
 				}
 			} else {